LIKE row group filtering

~1.7s -> ~1.0s for the census data set on `LIKE 'Dawson %'`
2025-07-24 18:43:30 +00:00 · 2018-03-17 00:11:38 -04:00 · 2018-03-17 00:11:38 -04:00 · 03a20a9432
commit 03a20a9432
parent 753a490687
1 changed file with 18 additions and 2 deletions
--- a/parquet/parquet_cursor.cc
+++ b/parquet/parquet_cursor.cc
@@ -65,8 +65,24 @@ bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint& constraint, s
      // If min == max == str, we can skip this.
      return !(minStr == maxStr && str == minStr);
    case Like:
-      // TODO: We could do something here where we filter based on the leading characters
+    {
-      //       of the target. For now, do nothing.
+      std::string truncated = str;
      size_t idx = truncated.find_first_of("%");
      if(idx != std::string::npos) {
        truncated = truncated.substr(0, idx);
      }
      idx = truncated.find_first_of("_");
      if(idx != std::string::npos) {
        truncated = truncated.substr(0, idx);
      }
      // This permits more rowgroups than is strictly needed
      // since it assumes an implicit wildcard. But it's
      // simple to implement, so we'll go with it.
      std::string truncatedMin = minStr.substr(0, truncated.size());
      std::string truncatedMax = maxStr.substr(0, truncated.size());
      return truncated.empty() || (truncated >= truncatedMin && truncated <= truncatedMax);
    }
    default:
      return true;
  }