add string == row group filter

For the statscan census data set, filtering on `== 'Dawson Creek'` takes
the query from 980ms to 660ms.

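This modest improvement is expected: the data isn't sorted by that column,
so the target value falls within the min/max range of most row groups and
few of them can be skipped.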

I'll try adding some scaffolding to do filtering at the row level, too.

We could also try unpacking the dictionary and testing the individual
values, although we may want some heuristics to decide whether it's
worth doing -- e.g., if fewer than 10% of the rows have a unique value.

Ideally, this should be a ~1ms query.
This commit is contained in:
Colin Dellow 2018-03-15 20:40:21 -04:00
parent dc431aee20
commit 6648ff5968
3 changed files with 20 additions and 6 deletions

View File

@ -38,16 +38,21 @@ bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint constraint, st
return true;
}
if(constraint.getType() != Text) {
return true;
}
std::string str = constraint.getString();
parquet::ByteArray min = stats->min();
parquet::ByteArray max = stats->max();
std::string minStr((const char*)min.ptr, min.len);
std::string maxStr((const char*)max.ptr, max.len);
printf("min=%s [%d], max=%s [%d]\n", minStr.data(), min.len, maxStr.data(), max.len);
// printf("min=%s [%d], max=%s [%d], target=%s\n", minStr.data(), min.len, maxStr.data(), max.len, str.data());
switch(constraint.getOperator()) {
case Is:
case Equal:
return str >= minStr && str <= maxStr;
case GreaterThan:
case GreaterThanOrEqual:
case LessThan:
@ -77,7 +82,6 @@ bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter(Constraint constraint,
// data, which provides substantial performance benefits.
bool ParquetCursor::currentRowGroupSatisfiesFilter() {
for(unsigned int i = 0; i < constraints.size(); i++) {
ValueType type = constraints[i].getType();
int column = constraints[i].getColumn();
int op = constraints[i].getOperator();
bool rv = true;

View File

@ -14,6 +14,9 @@ Constraint::Constraint(
this->intValue = intValue;
this->doubleValue = doubleValue;
this->blobValue = blobValue;
if(type == Text)
stringValue = std::string((char*)&blobValue[0], blobValue.size());
}
int Constraint::getColumn() {
@ -36,6 +39,10 @@ double Constraint::getDouble() {
return doubleValue;
}
std::vector<unsigned char> Constraint::getBytes() {
const std::vector<unsigned char> Constraint::getBytes() {
return blobValue;
}
std::string Constraint::getString() {
return stringValue;
}

View File

@ -2,6 +2,7 @@
#define PARQUET_FILTER_H
#include <vector>
#include <string>
#include <cstdint>
enum ConstraintOperator {
@ -36,8 +37,9 @@ class Constraint {
int64_t intValue;
double doubleValue;
// Doubles as string value
std::vector<unsigned char> blobValue;
// Only set when blobValue is set
std::string stringValue;
public:
// Kind of a messy constructor function, but it's just for internal use, so whatever.
@ -55,7 +57,8 @@ public:
ValueType getType();
int64_t getInt();
double getDouble();
std::vector<unsigned char> getBytes();
const std::vector<unsigned char> getBytes();
std::string getString();
};
#endif