add row filter for string ==

This gets the census `== 'Dawson Creek'` query down to ~410ms from
~650ms.

That still seems much slower than it should be. Am I accidentally
doing a copy? Now to go learn how to profile C++ code...
This commit is contained in:
Colin Dellow 2018-03-15 21:37:52 -04:00
parent 6648ff5968
commit f7f1ed03d1
4 changed files with 68 additions and 12 deletions

View File

@ -6,7 +6,7 @@ ParquetCursor::ParquetCursor(ParquetTable* table) {
reset(std::vector<Constraint>()); reset(std::vector<Constraint>());
} }
bool ParquetCursor::currentRowGroupSatisfiesRowIdFilter(Constraint constraint) { bool ParquetCursor::currentRowGroupSatisfiesRowIdFilter(Constraint& constraint) {
int64_t target = constraint.getInt(); int64_t target = constraint.getInt();
switch(constraint.getOperator()) { switch(constraint.getOperator()) {
case IsNull: case IsNull:
@ -29,7 +29,7 @@ bool ParquetCursor::currentRowGroupSatisfiesRowIdFilter(Constraint constraint) {
} }
} }
bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint constraint, std::shared_ptr<parquet::RowGroupStatistics> _stats) { bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> _stats) {
std::vector<unsigned char> target = constraint.getBytes(); std::vector<unsigned char> target = constraint.getBytes();
parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>* stats = parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>* stats =
(parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>*)_stats.get(); (parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>*)_stats.get();
@ -66,14 +66,52 @@ bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint constraint, st
} }
} }
bool ParquetCursor::currentRowGroupSatisfiesIntegerFilter(Constraint constraint, std::shared_ptr<parquet::RowGroupStatistics> stats) { bool ParquetCursor::currentRowGroupSatisfiesIntegerFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats) {
return true; return true;
} }
bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter(Constraint constraint, std::shared_ptr<parquet::RowGroupStatistics> stats) { bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats) {
return true; return true;
} }
// Row-level check of a text constraint against the current row's value in
// the constraint's column.
//
// Returns false only when the row definitely fails the constraint. Returns
// true when the constraint is not text-typed, and for every operator other
// than Is / Equal (those operators are not evaluated here).
bool ParquetCursor::currentRowSatisfiesTextFilter(Constraint& constraint) {
  if(constraint.getType() != Text) {
    return true;
  }

  // Bind by const reference rather than by value: this runs once per row, and
  // copying the constraint's bytes into a fresh vector each time costs a heap
  // allocation plus a memcpy per row. A const reference is valid whether
  // getBytes() returns a reference or a temporary (lifetime extension).
  const std::vector<unsigned char>& blob = constraint.getBytes();
  const parquet::ByteArray* ba = getByteArray(constraint.getColumn());

  switch(constraint.getOperator()) {
    case Is:
    case Equal:
      // Different lengths can never be byte-equal.
      if(blob.size() != ba->len)
        return false;
      // Both values are empty: equal. Returning here avoids indexing into an
      // empty vector and handing memcmp a possibly-null pointer.
      if(blob.empty())
        return true;
      return 0 == memcmp(blob.data(), ba->ptr, blob.size());
    case GreaterThan:
    case GreaterThanOrEqual:
    case LessThan:
    case LessThanOrEqual:
    case IsNot:
    case NotEqual:
    case Like:
    default:
      // Not evaluated at row level; report a possible match.
      return true;
  }
}
// Row-level check for a constraint on an integer-typed column.
// Currently a stub: the constraint is not inspected and every row is
// reported as satisfying it.
bool ParquetCursor::currentRowSatisfiesIntegerFilter(Constraint& constraint) {
  (void)constraint;
  const bool rowMatches = true;
  return rowMatches;
}
// Row-level check for a constraint on a floating-point column.
// Currently a stub: the constraint is not inspected and every row is
// reported as satisfying it.
bool ParquetCursor::currentRowSatisfiesDoubleFilter(Constraint& constraint) {
  (void)constraint;
  const bool rowMatches = true;
  return rowMatches;
}
// Return true if it is _possible_ that the current // Return true if it is _possible_ that the current
// rowgroup satisfies the constraints. Only return false // rowgroup satisfies the constraints. Only return false
// if it definitely does not. // if it definitely does not.
@ -194,6 +232,19 @@ bool ParquetCursor::currentRowSatisfiesFilter() {
rv = isNull(column); rv = isNull(column);
} else if(op == IsNotNull) { } else if(op == IsNotNull) {
rv = !isNull(column); rv = !isNull(column);
} else {
parquet::Type::type pqType = types[column];
if(pqType == parquet::Type::BYTE_ARRAY) {
rv = currentRowSatisfiesTextFilter(constraints[i]);
} else if(pqType == parquet::Type::INT32 ||
pqType == parquet::Type::INT64 ||
pqType == parquet::Type::INT96 ||
pqType == parquet::Type::BOOLEAN) {
rv = currentRowSatisfiesIntegerFilter(constraints[i]);
} else if(pqType == parquet::Type::FLOAT || pqType == parquet::Type::DOUBLE) {
rv = currentRowSatisfiesDoubleFilter(constraints[i]);
}
} }
if(!rv) if(!rv)

View File

@ -35,10 +35,15 @@ class ParquetCursor {
bool currentRowSatisfiesFilter(); bool currentRowSatisfiesFilter();
bool currentRowGroupSatisfiesFilter(); bool currentRowGroupSatisfiesFilter();
bool currentRowGroupSatisfiesRowIdFilter(Constraint constraint); bool currentRowGroupSatisfiesRowIdFilter(Constraint& constraint);
bool currentRowGroupSatisfiesTextFilter(Constraint constraint, std::shared_ptr<parquet::RowGroupStatistics> stats); bool currentRowGroupSatisfiesTextFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
bool currentRowGroupSatisfiesIntegerFilter(Constraint constraint, std::shared_ptr<parquet::RowGroupStatistics> stats); bool currentRowGroupSatisfiesIntegerFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
bool currentRowGroupSatisfiesDoubleFilter(Constraint constraint, std::shared_ptr<parquet::RowGroupStatistics> stats); bool currentRowGroupSatisfiesDoubleFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
bool currentRowSatisfiesTextFilter(Constraint& constraint);
bool currentRowSatisfiesIntegerFilter(Constraint& constraint);
bool currentRowSatisfiesDoubleFilter(Constraint& constraint);
public: public:
ParquetCursor(ParquetTable* table); ParquetCursor(ParquetTable* table);

View File

@ -39,10 +39,10 @@ double Constraint::getDouble() {
return doubleValue; return doubleValue;
} }
const std::vector<unsigned char> Constraint::getBytes() { const std::vector<unsigned char>& Constraint::getBytes() {
return blobValue; return blobValue;
} }
std::string Constraint::getString() { const std::string& Constraint::getString() {
return stringValue; return stringValue;
} }

View File

@ -57,8 +57,8 @@ public:
ValueType getType(); ValueType getType();
int64_t getInt(); int64_t getInt();
double getDouble(); double getDouble();
const std::vector<unsigned char> getBytes(); const std::vector<unsigned char>& getBytes();
std::string getString(); const std::string& getString();
}; };
#endif #endif