add row filter for string ==
This gets the census `== 'Dawson Creek'` query down to ~410ms from ~650ms. That still seems much slower than it should be. Am I accidentally doing a copy? Now to go learn how to profile C++ code...
This commit is contained in:
parent
6648ff5968
commit
f7f1ed03d1
|
@ -6,7 +6,7 @@ ParquetCursor::ParquetCursor(ParquetTable* table) {
|
||||||
reset(std::vector<Constraint>());
|
reset(std::vector<Constraint>());
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ParquetCursor::currentRowGroupSatisfiesRowIdFilter(Constraint constraint) {
|
bool ParquetCursor::currentRowGroupSatisfiesRowIdFilter(Constraint& constraint) {
|
||||||
int64_t target = constraint.getInt();
|
int64_t target = constraint.getInt();
|
||||||
switch(constraint.getOperator()) {
|
switch(constraint.getOperator()) {
|
||||||
case IsNull:
|
case IsNull:
|
||||||
|
@ -29,7 +29,7 @@ bool ParquetCursor::currentRowGroupSatisfiesRowIdFilter(Constraint constraint) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint constraint, std::shared_ptr<parquet::RowGroupStatistics> _stats) {
|
bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> _stats) {
|
||||||
std::vector<unsigned char> target = constraint.getBytes();
|
std::vector<unsigned char> target = constraint.getBytes();
|
||||||
parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>* stats =
|
parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>* stats =
|
||||||
(parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>*)_stats.get();
|
(parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>*)_stats.get();
|
||||||
|
@ -66,14 +66,52 @@ bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint constraint, st
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ParquetCursor::currentRowGroupSatisfiesIntegerFilter(Constraint constraint, std::shared_ptr<parquet::RowGroupStatistics> stats) {
|
bool ParquetCursor::currentRowGroupSatisfiesIntegerFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter(Constraint constraint, std::shared_ptr<parquet::RowGroupStatistics> stats) {
|
bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Row-level filter for a constraint on a text (BYTE_ARRAY) column.
//
// Returns false only when the current row definitely fails `constraint`.
// Returns true when the row matches, when the constraint's value is not Text,
// or when the operator is not evaluated here (everything except Is / Equal).
bool ParquetCursor::currentRowSatisfiesTextFilter(Constraint& constraint) {
  if(constraint.getType() != Text) {
    return true;
  }

  // Bind by const reference: this runs once per row, and taking the bytes by
  // value would allocate and copy the constraint's buffer for every row.
  // (A const reference is also valid if getBytes() returns by value.)
  const std::vector<unsigned char>& blob = constraint.getBytes();
  // NOTE(review): assumes getByteArray() yields a valid pointer for the
  // current row; NULL rows appear to be handled by the caller -- confirm.
  parquet::ByteArray* ba = getByteArray(constraint.getColumn());

  switch(constraint.getOperator()) {
    case Is:
    case Equal:
      if(blob.size() != ba->len)
        return false;

      // Two empty values are equal. Returning here also avoids indexing an
      // empty vector and handing memcmp a possibly-null pointer.
      if(blob.empty())
        return true;

      return 0 == memcmp(blob.data(), ba->ptr, blob.size());
    case GreaterThan:
    case GreaterThanOrEqual:
    case LessThan:
    case LessThanOrEqual:
    case IsNot:
    case NotEqual:
    case Like:
      // Not evaluated at row level yet: report "may match".
    default:
      return true;
  }
}
|
||||||
|
|
||||||
|
// Row-level filter for integer constraints. Currently a stub: the constraint
// is not inspected and every row is reported as a possible match.
bool ParquetCursor::currentRowSatisfiesIntegerFilter(Constraint& constraint) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Row-level filter for floating-point constraints. Currently a stub: the
// constraint is not inspected and every row is reported as a possible match.
bool ParquetCursor::currentRowSatisfiesDoubleFilter(Constraint& constraint) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// Return true if it is _possible_ that the current
|
// Return true if it is _possible_ that the current
|
||||||
// rowgroup satisfies the constraints. Only return false
|
// rowgroup satisfies the constraints. Only return false
|
||||||
// if it definitely does not.
|
// if it definitely does not.
|
||||||
|
@ -194,6 +232,19 @@ bool ParquetCursor::currentRowSatisfiesFilter() {
|
||||||
rv = isNull(column);
|
rv = isNull(column);
|
||||||
} else if(op == IsNotNull) {
|
} else if(op == IsNotNull) {
|
||||||
rv = !isNull(column);
|
rv = !isNull(column);
|
||||||
|
} else {
|
||||||
|
parquet::Type::type pqType = types[column];
|
||||||
|
|
||||||
|
if(pqType == parquet::Type::BYTE_ARRAY) {
|
||||||
|
rv = currentRowSatisfiesTextFilter(constraints[i]);
|
||||||
|
} else if(pqType == parquet::Type::INT32 ||
|
||||||
|
pqType == parquet::Type::INT64 ||
|
||||||
|
pqType == parquet::Type::INT96 ||
|
||||||
|
pqType == parquet::Type::BOOLEAN) {
|
||||||
|
rv = currentRowSatisfiesIntegerFilter(constraints[i]);
|
||||||
|
} else if(pqType == parquet::Type::FLOAT || pqType == parquet::Type::DOUBLE) {
|
||||||
|
rv = currentRowSatisfiesDoubleFilter(constraints[i]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if(!rv)
|
if(!rv)
|
||||||
|
|
|
@ -35,10 +35,15 @@ class ParquetCursor {
|
||||||
|
|
||||||
bool currentRowSatisfiesFilter();
|
bool currentRowSatisfiesFilter();
|
||||||
bool currentRowGroupSatisfiesFilter();
|
bool currentRowGroupSatisfiesFilter();
|
||||||
bool currentRowGroupSatisfiesRowIdFilter(Constraint constraint);
|
bool currentRowGroupSatisfiesRowIdFilter(Constraint& constraint);
|
||||||
bool currentRowGroupSatisfiesTextFilter(Constraint constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
|
bool currentRowGroupSatisfiesTextFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
|
||||||
bool currentRowGroupSatisfiesIntegerFilter(Constraint constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
|
bool currentRowGroupSatisfiesIntegerFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
|
||||||
bool currentRowGroupSatisfiesDoubleFilter(Constraint constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
|
bool currentRowGroupSatisfiesDoubleFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
|
||||||
|
|
||||||
|
bool currentRowSatisfiesTextFilter(Constraint& constraint);
|
||||||
|
bool currentRowSatisfiesIntegerFilter(Constraint& constraint);
|
||||||
|
bool currentRowSatisfiesDoubleFilter(Constraint& constraint);
|
||||||
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
ParquetCursor(ParquetTable* table);
|
ParquetCursor(ParquetTable* table);
|
||||||
|
|
|
@ -39,10 +39,10 @@ double Constraint::getDouble() {
|
||||||
return doubleValue;
|
return doubleValue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const std::vector<unsigned char> Constraint::getBytes() {
|
const std::vector<unsigned char>& Constraint::getBytes() {
|
||||||
return blobValue;
|
return blobValue;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string Constraint::getString() {
|
const std::string& Constraint::getString() {
|
||||||
return stringValue;
|
return stringValue;
|
||||||
}
|
}
|
||||||
|
|
|
@ -57,8 +57,8 @@ public:
|
||||||
ValueType getType();
|
ValueType getType();
|
||||||
int64_t getInt();
|
int64_t getInt();
|
||||||
double getDouble();
|
double getDouble();
|
||||||
const std::vector<unsigned char> getBytes();
|
const std::vector<unsigned char>& getBytes();
|
||||||
std::string getString();
|
const std::string& getString();
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
Loading…
Reference in New Issue