From a3af16eb546fb61817cb65f82db41034611df568 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sat, 17 Mar 2018 15:28:51 -0400 Subject: [PATCH] Row-filtering for other string ops --- parquet/parquet_cursor.cc | 78 ++++++++++++++++++++++++-------- parquet/parquet_filter.cc | 18 +++++++- parquet/parquet_filter.h | 3 ++ tests/queries/138-string-lte.sql | 2 + 4 files changed, 82 insertions(+), 19 deletions(-) create mode 100644 tests/queries/138-string-lte.sql diff --git a/parquet/parquet_cursor.cc b/parquet/parquet_cursor.cc index a6a1295..dc00773 100644 --- a/parquet/parquet_cursor.cc +++ b/parquet/parquet_cursor.cc @@ -66,22 +66,10 @@ bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint& constraint, s return !(minStr == maxStr && str == minStr); case Like: { - std::string truncated = str; - size_t idx = truncated.find_first_of("%"); - if(idx != std::string::npos) { - truncated = truncated.substr(0, idx); - } - idx = truncated.find_first_of("_"); - if(idx != std::string::npos) { - truncated = truncated.substr(0, idx); - } - - // This permits more rowgroups than is strictly needed - // since it assumes an implicit wildcard. But it's - // simple to implement, so we'll go with it. - std::string truncatedMin = minStr.substr(0, truncated.size()); - std::string truncatedMax = maxStr.substr(0, truncated.size()); - return truncated.empty() || (truncated >= truncatedMin && truncated <= truncatedMax); + const std::string& likeStringValue = constraint.likeStringValue; + std::string truncatedMin = minStr.substr(0, likeStringValue.size()); + std::string truncatedMax = maxStr.substr(0, likeStringValue.size()); + return likeStringValue.empty() || (likeStringValue >= truncatedMin && likeStringValue <= truncatedMax); } default: return true; @@ -245,33 +233,87 @@ bool ParquetCursor::currentRowSatisfiesTextFilter(Constraint& constraint) { return true; } - const std::vector& blob = constraint.blobValue; parquet::ByteArray* ba = getByteArray(constraint.column); switch(constraint.op) { case Is: case Equal: + { + const std::vector& blob = constraint.blobValue; + if(blob.size() != ba->len) return false; return 0 == memcmp(&blob[0], ba->ptr, ba->len); + } case IsNot: case NotEqual: + { + const std::vector& blob = constraint.blobValue; + if(blob.size() != ba->len) return true; return 0 != memcmp(&blob[0], ba->ptr, ba->len); + } case GreaterThan: + { + const std::vector& blob = constraint.blobValue; + + return std::lexicographical_compare( + &blob[0], + &blob[0] + blob.size(), + ba->ptr, + ba->ptr + ba->len); + } case GreaterThanOrEqual: + { + const std::vector& blob = constraint.blobValue; + + bool equal = blob.size() == ba->len && 0 == memcmp(&blob[0], ba->ptr, ba->len); + + return equal || std::lexicographical_compare( + &blob[0], + &blob[0] + blob.size(), + ba->ptr, + ba->ptr + ba->len); + } case LessThan: + { + const std::vector& blob = constraint.blobValue; + + return std::lexicographical_compare( + ba->ptr, + ba->ptr + ba->len, + &blob[0], + &blob[0] + blob.size()); + } case LessThanOrEqual: + { + const std::vector& blob = constraint.blobValue; + bool equal = blob.size() == ba->len && 0 == memcmp(&blob[0], ba->ptr, ba->len); + + return equal || std::lexicographical_compare( + ba->ptr, + ba->ptr + ba->len, + &blob[0], + &blob[0] + blob.size()); + } case Like: + { + const std::string& likeStringValue = constraint.likeStringValue; + if(likeStringValue.size() > ba->len) + return false; + size_t len = ba->len; + if(likeStringValue.size() < len) + len = likeStringValue.size(); + return 0 == memcmp(&likeStringValue[0], ba->ptr, len); + } default: return true; } - } bool ParquetCursor::currentRowSatisfiesIntegerFilter(Constraint& constraint) { diff --git a/parquet/parquet_filter.cc b/parquet/parquet_filter.cc index 765d6ae..18de068 100644 --- a/parquet/parquet_filter.cc +++ b/parquet/parquet_filter.cc @@ -15,6 +15,22 @@ Constraint::Constraint( this->doubleValue = doubleValue; this->blobValue = blobValue; - if(type == Text) + if(type == Text) { stringValue = std::string((char*)&blobValue[0], blobValue.size()); + + if(op == Like) { + // This permits more rowgroups than is strictly needed + // since it assumes an implicit wildcard. But it's + // simple to implement, so we'll go with it. + likeStringValue = stringValue; + size_t idx = likeStringValue.find_first_of("%"); + if(idx != std::string::npos) { + likeStringValue = likeStringValue.substr(0, idx); + } + idx = likeStringValue.find_first_of("_"); + if(idx != std::string::npos) { + likeStringValue = likeStringValue.substr(0, idx); + } + } + } } diff --git a/parquet/parquet_filter.h b/parquet/parquet_filter.h index d042ff4..4afb280 100644 --- a/parquet/parquet_filter.h +++ b/parquet/parquet_filter.h @@ -51,6 +51,9 @@ public: std::vector blobValue; // Only set when blobValue is set std::string stringValue; + + // Only set when stringValue is set and op == Like + std::string likeStringValue; }; #endif diff --git a/tests/queries/138-string-lte.sql b/tests/queries/138-string-lte.sql new file mode 100644 index 0000000..d4c4966 --- /dev/null +++ b/tests/queries/138-string-lte.sql @@ -0,0 +1,2 @@ +select count(*) from no_nulls1 where string_8 <= '003' +4