Add stub row group filters for text/int/dbl
Checkpointing to investigate why min/max stats for text aren't present
This commit is contained in:
parent
110e3e3668
commit
769060dbcb
|
@ -29,6 +29,52 @@ bool ParquetCursor::currentRowGroupSatisfiesRowIdFilter(Constraint constraint) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint constraint, std::shared_ptr<parquet::RowGroupStatistics> _stats) {
|
||||||
|
std::vector<unsigned char> target = constraint.getBytes();
|
||||||
|
parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>* stats =
|
||||||
|
(parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>*)_stats.get();
|
||||||
|
|
||||||
|
// TODO: parquet-cpp doesn't seem to support stats for UTF8?
|
||||||
|
// 1) empirically, the following is false on a few parquets I've tested
|
||||||
|
// 2) https://github.com/apache/parquet-cpp/blob/master/src/parquet/metadata.cc#L116 seems to say UTF8 not supported
|
||||||
|
// 3) OTOH, https://issues.apache.org/jira/browse/ARROW-1982 seems to say it's supported
|
||||||
|
if(!stats->HasMinMax()) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
parquet::ByteArray min = stats->min();
|
||||||
|
parquet::ByteArray max = stats->max();
|
||||||
|
std::string minStr((const char*)min.ptr, min.len);
|
||||||
|
std::string maxStr((const char*)max.ptr, max.len);
|
||||||
|
printf("min=%s [%d], max=%s [%d]\n", minStr.data(), min.len, maxStr.data(), max.len);
|
||||||
|
*/
|
||||||
|
|
||||||
|
switch(constraint.getOperator()) {
|
||||||
|
case Is:
|
||||||
|
case Equal:
|
||||||
|
|
||||||
|
case GreaterThan:
|
||||||
|
case GreaterThanOrEqual:
|
||||||
|
case LessThan:
|
||||||
|
case LessThanOrEqual:
|
||||||
|
case IsNot:
|
||||||
|
case NotEqual:
|
||||||
|
case Like:
|
||||||
|
|
||||||
|
default:
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool ParquetCursor::currentRowGroupSatisfiesIntegerFilter(Constraint constraint, std::shared_ptr<parquet::RowGroupStatistics> stats) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter(Constraint constraint, std::shared_ptr<parquet::RowGroupStatistics> stats) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
// Return true if it is _possible_ that the current
|
// Return true if it is _possible_ that the current
|
||||||
// rowgroup satisfies the constraints. Only return false
|
// rowgroup satisfies the constraints. Only return false
|
||||||
// if it definitely does not.
|
// if it definitely does not.
|
||||||
|
@ -37,6 +83,7 @@ bool ParquetCursor::currentRowGroupSatisfiesRowIdFilter(Constraint constraint) {
|
||||||
// data, which provides substantial performance benefits.
|
// data, which provides substantial performance benefits.
|
||||||
bool ParquetCursor::currentRowGroupSatisfiesFilter() {
|
bool ParquetCursor::currentRowGroupSatisfiesFilter() {
|
||||||
for(unsigned int i = 0; i < constraints.size(); i++) {
|
for(unsigned int i = 0; i < constraints.size(); i++) {
|
||||||
|
ValueType type = constraints[i].getType();
|
||||||
int column = constraints[i].getColumn();
|
int column = constraints[i].getColumn();
|
||||||
int op = constraints[i].getOperator();
|
int op = constraints[i].getOperator();
|
||||||
bool rv = true;
|
bool rv = true;
|
||||||
|
@ -54,6 +101,12 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() {
|
||||||
rv = stats->null_count() > 0;
|
rv = stats->null_count() > 0;
|
||||||
} else if(op == IsNotNull) {
|
} else if(op == IsNotNull) {
|
||||||
rv = stats->num_values() > 0;
|
rv = stats->num_values() > 0;
|
||||||
|
} else if(type == Text) {
|
||||||
|
rv = currentRowGroupSatisfiesTextFilter(constraints[i], stats);
|
||||||
|
} else if(type == Integer) {
|
||||||
|
rv = currentRowGroupSatisfiesIntegerFilter(constraints[i], stats);
|
||||||
|
} else if(type == Double) {
|
||||||
|
rv = currentRowGroupSatisfiesDoubleFilter(constraints[i], stats);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -33,9 +33,12 @@ class ParquetCursor {
|
||||||
|
|
||||||
std::vector<Constraint> constraints;
|
std::vector<Constraint> constraints;
|
||||||
|
|
||||||
bool currentRowGroupSatisfiesRowIdFilter(Constraint constraint);
|
|
||||||
bool currentRowSatisfiesFilter();
|
bool currentRowSatisfiesFilter();
|
||||||
bool currentRowGroupSatisfiesFilter();
|
bool currentRowGroupSatisfiesFilter();
|
||||||
|
bool currentRowGroupSatisfiesRowIdFilter(Constraint constraint);
|
||||||
|
bool currentRowGroupSatisfiesTextFilter(Constraint constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
|
||||||
|
bool currentRowGroupSatisfiesIntegerFilter(Constraint constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
|
||||||
|
bool currentRowGroupSatisfiesDoubleFilter(Constraint constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
ParquetCursor(ParquetTable* table);
|
ParquetCursor(ParquetTable* table);
|
||||||
|
|
|
@ -23,7 +23,6 @@ enum ConstraintOperator {
|
||||||
|
|
||||||
enum ValueType {
|
enum ValueType {
|
||||||
Null,
|
Null,
|
||||||
Boolean,
|
|
||||||
Integer,
|
Integer,
|
||||||
Double,
|
Double,
|
||||||
Blob,
|
Blob,
|
||||||
|
|
Loading…
Reference in New Issue