Row-filtering for other string ops

This commit is contained in:
Colin Dellow 2018-03-17 15:28:51 -04:00
parent 03a20a9432
commit a3af16eb54
4 changed files with 82 additions and 19 deletions

View File

@ -66,22 +66,10 @@ bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint& constraint, s
return !(minStr == maxStr && str == minStr); return !(minStr == maxStr && str == minStr);
case Like: case Like:
{ {
std::string truncated = str; const std::string& likeStringValue = constraint.likeStringValue;
size_t idx = truncated.find_first_of("%"); std::string truncatedMin = minStr.substr(0, likeStringValue.size());
if(idx != std::string::npos) { std::string truncatedMax = maxStr.substr(0, likeStringValue.size());
truncated = truncated.substr(0, idx); return likeStringValue.empty() || (likeStringValue >= truncatedMin && likeStringValue <= truncatedMax);
}
idx = truncated.find_first_of("_");
if(idx != std::string::npos) {
truncated = truncated.substr(0, idx);
}
// This permits more rowgroups than is strictly needed
// since it assumes an implicit wildcard. But it's
// simple to implement, so we'll go with it.
std::string truncatedMin = minStr.substr(0, truncated.size());
std::string truncatedMax = maxStr.substr(0, truncated.size());
return truncated.empty() || (truncated >= truncatedMin && truncated <= truncatedMax);
} }
default: default:
return true; return true;
@ -245,33 +233,87 @@ bool ParquetCursor::currentRowSatisfiesTextFilter(Constraint& constraint) {
return true; return true;
} }
const std::vector<unsigned char>& blob = constraint.blobValue;
parquet::ByteArray* ba = getByteArray(constraint.column); parquet::ByteArray* ba = getByteArray(constraint.column);
switch(constraint.op) { switch(constraint.op) {
case Is: case Is:
case Equal: case Equal:
{
const std::vector<unsigned char>& blob = constraint.blobValue;
if(blob.size() != ba->len) if(blob.size() != ba->len)
return false; return false;
return 0 == memcmp(&blob[0], ba->ptr, ba->len); return 0 == memcmp(&blob[0], ba->ptr, ba->len);
}
case IsNot: case IsNot:
case NotEqual: case NotEqual:
{
const std::vector<unsigned char>& blob = constraint.blobValue;
if(blob.size() != ba->len) if(blob.size() != ba->len)
return true; return true;
return 0 != memcmp(&blob[0], ba->ptr, ba->len); return 0 != memcmp(&blob[0], ba->ptr, ba->len);
}
case GreaterThan: case GreaterThan:
{
const std::vector<unsigned char>& blob = constraint.blobValue;
return std::lexicographical_compare(
&blob[0],
&blob[0] + blob.size(),
ba->ptr,
ba->ptr + ba->len);
}
case GreaterThanOrEqual: case GreaterThanOrEqual:
{
const std::vector<unsigned char>& blob = constraint.blobValue;
bool equal = blob.size() == ba->len && 0 == memcmp(&blob[0], ba->ptr, ba->len);
return equal || std::lexicographical_compare(
&blob[0],
&blob[0] + blob.size(),
ba->ptr,
ba->ptr + ba->len);
}
case LessThan: case LessThan:
{
const std::vector<unsigned char>& blob = constraint.blobValue;
return std::lexicographical_compare(
ba->ptr,
ba->ptr + ba->len,
&blob[0],
&blob[0] + blob.size());
}
case LessThanOrEqual: case LessThanOrEqual:
{
const std::vector<unsigned char>& blob = constraint.blobValue;
bool equal = blob.size() == ba->len && 0 == memcmp(&blob[0], ba->ptr, ba->len);
return equal || std::lexicographical_compare(
ba->ptr,
ba->ptr + ba->len,
&blob[0],
&blob[0] + blob.size());
}
case Like: case Like:
{
const std::string& likeStringValue = constraint.likeStringValue;
if(likeStringValue.size() > ba->len)
return false;
size_t len = ba->len;
if(likeStringValue.size() < len)
len = likeStringValue.size();
return 0 == memcmp(&likeStringValue[0], ba->ptr, len);
}
default: default:
return true; return true;
} }
} }
bool ParquetCursor::currentRowSatisfiesIntegerFilter(Constraint& constraint) { bool ParquetCursor::currentRowSatisfiesIntegerFilter(Constraint& constraint) {

View File

@ -15,6 +15,22 @@ Constraint::Constraint(
this->doubleValue = doubleValue; this->doubleValue = doubleValue;
this->blobValue = blobValue; this->blobValue = blobValue;
if(type == Text) if(type == Text) {
stringValue = std::string((char*)&blobValue[0], blobValue.size()); stringValue = std::string((char*)&blobValue[0], blobValue.size());
if(op == Like) {
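// Keep only the literal prefix of the LIKE pattern, i.e. everything
// before the first '%' or '_' wildcard. Filtering on that prefix
// permits more rowgroups than is strictly needed, since it assumes
// an implicit trailing wildcard. But it's simple to implement, so
// we'll go with it.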
likeStringValue = stringValue;
size_t idx = likeStringValue.find_first_of("%");
if(idx != std::string::npos) {
likeStringValue = likeStringValue.substr(0, idx);
}
idx = likeStringValue.find_first_of("_");
if(idx != std::string::npos) {
likeStringValue = likeStringValue.substr(0, idx);
}
}
}
} }

View File

@ -51,6 +51,9 @@ public:
std::vector<unsigned char> blobValue; std::vector<unsigned char> blobValue;
// Only set when blobValue is set // Only set when blobValue is set
std::string stringValue; std::string stringValue;
// Only set when stringValue is set and op == Like
std::string likeStringValue;
}; };
#endif #endif

View File

@ -0,0 +1,2 @@
select count(*) from no_nulls1 where string_8 <= '003'
4