diff --git a/parquet/parquet_cursor.cc b/parquet/parquet_cursor.cc index 2dd512d..5d25d48 100644 --- a/parquet/parquet_cursor.cc +++ b/parquet/parquet_cursor.cc @@ -29,6 +29,52 @@ bool ParquetCursor::currentRowGroupSatisfiesRowIdFilter(Constraint constraint) { } } +bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint constraint, std::shared_ptr _stats) { + std::vector target = constraint.getBytes(); + parquet::TypedRowGroupStatistics>* stats = + (parquet::TypedRowGroupStatistics>*)_stats.get(); + + // TODO: parquet-cpp doesn't seem to support stats for UTF8? + // 1) empirically, the following is false on a few parquets I've tested + // 2) https://github.com/apache/parquet-cpp/blob/master/src/parquet/metadata.cc#L116 seems to say UTF8 not supported + // 3) OTOH, https://issues.apache.org/jira/browse/ARROW-1982 seems to say it's supported + if(!stats->HasMinMax()) { + return true; + } + + /* + parquet::ByteArray min = stats->min(); + parquet::ByteArray max = stats->max(); + std::string minStr((const char*)min.ptr, min.len); + std::string maxStr((const char*)max.ptr, max.len); + printf("min=%s [%d], max=%s [%d]\n", minStr.data(), min.len, maxStr.data(), max.len); + */ + + switch(constraint.getOperator()) { + case Is: + case Equal: + + case GreaterThan: + case GreaterThanOrEqual: + case LessThan: + case LessThanOrEqual: + case IsNot: + case NotEqual: + case Like: + + default: + return true; + } +} + +bool ParquetCursor::currentRowGroupSatisfiesIntegerFilter(Constraint constraint, std::shared_ptr stats) { + return true; +} + +bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter(Constraint constraint, std::shared_ptr stats) { + return true; +} + // Return true if it is _possible_ that the current // rowgroup satisfies the constraints. Only return false // if it definitely does not. @@ -37,6 +83,7 @@ bool ParquetCursor::currentRowGroupSatisfiesRowIdFilter(Constraint constraint) { // data, which provides substantial performance benefits. bool ParquetCursor::currentRowGroupSatisfiesFilter() { for(unsigned int i = 0; i < constraints.size(); i++) { + ValueType type = constraints[i].getType(); int column = constraints[i].getColumn(); int op = constraints[i].getOperator(); bool rv = true; @@ -54,6 +101,12 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() { rv = stats->null_count() > 0; } else if(op == IsNotNull) { rv = stats->num_values() > 0; + } else if(type == Text) { + rv = currentRowGroupSatisfiesTextFilter(constraints[i], stats); + } else if(type == Integer) { + rv = currentRowGroupSatisfiesIntegerFilter(constraints[i], stats); + } else if(type == Double) { + rv = currentRowGroupSatisfiesDoubleFilter(constraints[i], stats); } } diff --git a/parquet/parquet_cursor.h b/parquet/parquet_cursor.h index 4289fa6..26c7316 100644 --- a/parquet/parquet_cursor.h +++ b/parquet/parquet_cursor.h @@ -33,9 +33,12 @@ class ParquetCursor { std::vector constraints; - bool currentRowGroupSatisfiesRowIdFilter(Constraint constraint); bool currentRowSatisfiesFilter(); bool currentRowGroupSatisfiesFilter(); + bool currentRowGroupSatisfiesRowIdFilter(Constraint constraint); + bool currentRowGroupSatisfiesTextFilter(Constraint constraint, std::shared_ptr stats); + bool currentRowGroupSatisfiesIntegerFilter(Constraint constraint, std::shared_ptr stats); + bool currentRowGroupSatisfiesDoubleFilter(Constraint constraint, std::shared_ptr stats); public: ParquetCursor(ParquetTable* table); diff --git a/parquet/parquet_filter.h b/parquet/parquet_filter.h index 283cfae..08c073e 100644 --- a/parquet/parquet_filter.h +++ b/parquet/parquet_filter.h @@ -23,7 +23,6 @@ enum ConstraintOperator { enum ValueType { Null, - Boolean, Integer, Double, Blob,