From 03a20a943221130b5002ad0223dd75d561da0282 Mon Sep 17 00:00:00 2001
From: Colin Dellow
Date: Sat, 17 Mar 2018 00:11:38 -0400
Subject: [PATCH] LIKE row group filtering

~1.7s -> ~1.0s for the census data set on `LIKE 'Dawson %'`
---
 parquet/parquet_cursor.cc | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/parquet/parquet_cursor.cc b/parquet/parquet_cursor.cc
index 3db8d3e..a6a1295 100644
--- a/parquet/parquet_cursor.cc
+++ b/parquet/parquet_cursor.cc
@@ -65,8 +65,24 @@ bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint& constraint, s
       // If min == max == str, we can skip this.
       return !(minStr == maxStr && str == minStr);
     case Like:
-      // TODO: We could do something here where we filter based on the leading characters
-      // of the target. For now, do nothing.
+    {
+      std::string truncated = str;
+      size_t idx = truncated.find_first_of("%");
+      if(idx != std::string::npos) {
+        truncated = truncated.substr(0, idx);
+      }
+      idx = truncated.find_first_of("_");
+      if(idx != std::string::npos) {
+        truncated = truncated.substr(0, idx);
+      }
+
+      // This permits more rowgroups than is strictly needed
+      // since it assumes an implicit wildcard. But it's
+      // simple to implement, so we'll go with it.
+      std::string truncatedMin = minStr.substr(0, truncated.size());
+      std::string truncatedMax = maxStr.substr(0, truncated.size());
+      return truncated.empty() || (truncated >= truncatedMin && truncated <= truncatedMax);
+    }
     default:
       return true;
   }