Add row group filtering for LIKE constraints
Query time drops from ~1.7s to ~1.0s on the census data set for `LIKE 'Dawson %'`
This commit is contained in:
parent
753a490687
commit
03a20a9432
|
@ -65,8 +65,24 @@ bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint& constraint, s
|
||||||
// If min == max == str, we can skip this.
|
// If min == max == str, we can skip this.
|
||||||
return !(minStr == maxStr && str == minStr);
|
return !(minStr == maxStr && str == minStr);
|
||||||
case Like:
|
case Like:
|
||||||
// TODO: We could do something here where we filter based on the leading characters
|
{
|
||||||
// of the target. For now, do nothing.
|
std::string truncated = str;
|
||||||
|
size_t idx = truncated.find_first_of("%");
|
||||||
|
if(idx != std::string::npos) {
|
||||||
|
truncated = truncated.substr(0, idx);
|
||||||
|
}
|
||||||
|
idx = truncated.find_first_of("_");
|
||||||
|
if(idx != std::string::npos) {
|
||||||
|
truncated = truncated.substr(0, idx);
|
||||||
|
}
|
||||||
|
|
||||||
|
// This permits more rowgroups than is strictly needed
|
||||||
|
// since it treats the pattern as a literal prefix followed by an implicit wildcard. But it's
|
||||||
|
// simple to implement, so we'll go with it.
|
||||||
|
std::string truncatedMin = minStr.substr(0, truncated.size());
|
||||||
|
std::string truncatedMax = maxStr.substr(0, truncated.size());
|
||||||
|
return truncated.empty() || (truncated >= truncatedMin && truncated <= truncatedMax);
|
||||||
|
}
|
||||||
default:
|
default:
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue