LIKE row group filtering

~1.7s -> ~1.0s for the census data set on `LIKE 'Dawson %'`
2025-12-14 05:53:28 +00:00 · 2018-03-17 00:11:38 -04:00
parent 753a490687
commit 03a20a9432
1 changed file with 18 additions and 2 deletions
--- a/parquet/parquet_cursor.cc
+++ b/parquet/parquet_cursor.cc
@@ -65,8 +65,24 @@ bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint& constraint, s
      // If min == max == str, we can skip this.
      return !(minStr == maxStr && str == minStr);
    case Like:
-      // TODO: We could do something here where we filter based on the leading characters
-      //       of the target. For now, do nothing.
+    {
+      std::string truncated = str;
+      size_t idx = truncated.find_first_of("%");
+      if(idx != std::string::npos) {
+        truncated = truncated.substr(0, idx);
+      }
+      idx = truncated.find_first_of("_");
+      if(idx != std::string::npos) {
+        truncated = truncated.substr(0, idx);
+      }
+
+      // This admits more row groups than strictly necessary: only the
+      // literal prefix before the first wildcard is compared, as if the
+      // pattern ended in '%'. It's simple to implement, so we'll go with it.
+      std::string truncatedMin = minStr.substr(0, truncated.size());
+      std::string truncatedMax = maxStr.substr(0, truncated.size());
+      return truncated.empty() || (truncated >= truncatedMin && truncated <= truncatedMax);
+    }
    default:
      return true;
  }