Add row group filter for string `==` constraints
For the StatsCan census data set, filtering on `== 'Dawson Creek'` takes the query from 980ms to 660ms. This modest gain is expected, since the data isn't sorted by that column: a row group can only be skipped when the target falls outside its min/max range, and with unsorted data that range often still contains the target. I'll try adding some scaffolding to do filtering at the row level, too. We could also try unpacking the dictionary and testing the individual values, although we may want some heuristics to decide whether it's worth doing -- e.g. if < 10% of the rows have a unique value. Ideally, this should be a ~1ms query.
This commit is contained in:
parent
dc431aee20
commit
6648ff5968
|
@ -38,16 +38,21 @@ bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint constraint, st
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if(constraint.getType() != Text) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string str = constraint.getString();
|
||||||
parquet::ByteArray min = stats->min();
|
parquet::ByteArray min = stats->min();
|
||||||
parquet::ByteArray max = stats->max();
|
parquet::ByteArray max = stats->max();
|
||||||
std::string minStr((const char*)min.ptr, min.len);
|
std::string minStr((const char*)min.ptr, min.len);
|
||||||
std::string maxStr((const char*)max.ptr, max.len);
|
std::string maxStr((const char*)max.ptr, max.len);
|
||||||
printf("min=%s [%d], max=%s [%d]\n", minStr.data(), min.len, maxStr.data(), max.len);
|
// printf("min=%s [%d], max=%s [%d], target=%s\n", minStr.data(), min.len, maxStr.data(), max.len, str.data());
|
||||||
|
|
||||||
switch(constraint.getOperator()) {
|
switch(constraint.getOperator()) {
|
||||||
case Is:
|
case Is:
|
||||||
case Equal:
|
case Equal:
|
||||||
|
return str >= minStr && str <= maxStr;
|
||||||
case GreaterThan:
|
case GreaterThan:
|
||||||
case GreaterThanOrEqual:
|
case GreaterThanOrEqual:
|
||||||
case LessThan:
|
case LessThan:
|
||||||
|
@ -77,7 +82,6 @@ bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter(Constraint constraint,
|
||||||
// data, which provides substantial performance benefits.
|
// data, which provides substantial performance benefits.
|
||||||
bool ParquetCursor::currentRowGroupSatisfiesFilter() {
|
bool ParquetCursor::currentRowGroupSatisfiesFilter() {
|
||||||
for(unsigned int i = 0; i < constraints.size(); i++) {
|
for(unsigned int i = 0; i < constraints.size(); i++) {
|
||||||
ValueType type = constraints[i].getType();
|
|
||||||
int column = constraints[i].getColumn();
|
int column = constraints[i].getColumn();
|
||||||
int op = constraints[i].getOperator();
|
int op = constraints[i].getOperator();
|
||||||
bool rv = true;
|
bool rv = true;
|
||||||
|
|
|
@ -14,6 +14,9 @@ Constraint::Constraint(
|
||||||
this->intValue = intValue;
|
this->intValue = intValue;
|
||||||
this->doubleValue = doubleValue;
|
this->doubleValue = doubleValue;
|
||||||
this->blobValue = blobValue;
|
this->blobValue = blobValue;
|
||||||
|
|
||||||
|
if(type == Text)
|
||||||
|
stringValue = std::string((char*)&blobValue[0], blobValue.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
int Constraint::getColumn() {
|
int Constraint::getColumn() {
|
||||||
|
@ -36,6 +39,10 @@ double Constraint::getDouble() {
|
||||||
return doubleValue;
|
return doubleValue;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<unsigned char> Constraint::getBytes() {
|
const std::vector<unsigned char> Constraint::getBytes() {
|
||||||
return blobValue;
|
return blobValue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string Constraint::getString() {
|
||||||
|
return stringValue;
|
||||||
|
}
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
#define PARQUET_FILTER_H
|
#define PARQUET_FILTER_H
|
||||||
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
#include <string>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
|
|
||||||
enum ConstraintOperator {
|
enum ConstraintOperator {
|
||||||
|
@ -36,8 +37,9 @@ class Constraint {
|
||||||
|
|
||||||
int64_t intValue;
|
int64_t intValue;
|
||||||
double doubleValue;
|
double doubleValue;
|
||||||
// Doubles as string value
|
|
||||||
std::vector<unsigned char> blobValue;
|
std::vector<unsigned char> blobValue;
|
||||||
|
// Only set when blobValue is set
|
||||||
|
std::string stringValue;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
// Kind of a messy constructor function, but it's just for internal use, so whatever.
|
// Kind of a messy constructor function, but it's just for internal use, so whatever.
|
||||||
|
@ -55,7 +57,8 @@ public:
|
||||||
ValueType getType();
|
ValueType getType();
|
||||||
int64_t getInt();
|
int64_t getInt();
|
||||||
double getDouble();
|
double getDouble();
|
||||||
std::vector<unsigned char> getBytes();
|
const std::vector<unsigned char> getBytes();
|
||||||
|
std::string getString();
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
Loading…
Reference in New Issue