Row group filtering for double/float

This commit is contained in:
Colin Dellow 2018-03-16 16:30:05 -04:00
parent 9c22fd1f57
commit 01e8ffaba7
2 changed files with 88 additions and 5 deletions

View File

@ -33,13 +33,36 @@ sqlite> SELECT * FROM demo;
## Supported features
### Index
### Row group filtering
Only full table scans are supported.
Row group filtering is supported for strings and numerics so long as the SQLite
type matches the Parquet type.
e.g. if you have a column `foo` that is an INT32, this query will skip row groups whose
statistics prove that they do not contain relevant rows:
```
SELECT * FROM tbl WHERE foo = 123;
```
but this query will devolve to a table scan:
```
SELECT * FROM tbl WHERE foo = '123';
```
This is laziness on my part and could be fixed without too much effort.
### Row filtering
For common constraints, the row is checked to see if it satisfies the query's
constraints before returning control to SQLite's virtual machine. This minimizes
the number of allocations performed when many rows are filtered out by
the user's criteria.
### Types
These types are supported:
These Parquet types are supported:
* INT96 timestamps (exposed as milliseconds since the epoch)
* INT8/INT16/INT32/INT64
@ -49,7 +72,7 @@ These types are supported:
* DOUBLE
* Variable- and fixed-length byte arrays
These are not supported:
These are not currently supported:
* UINT8/UINT16/UINT32/UINT64
* DECIMAL

View File

@ -160,8 +160,68 @@ bool ParquetCursor::currentRowGroupSatisfiesIntegerFilter(Constraint& constraint
return true;
}
bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats) {
// Row group filtering for DOUBLE and FLOAT columns.
//
// Compares a double-valued constraint against the row group's min/max
// statistics.
//
// Returns false only when the statistics prove that no row in the group can
// satisfy the constraint. Returns true whenever the group may contain a match
// or the statistics cannot be used: no min/max recorded, the constraint is not
// a Double, min/max are unordered (e.g. NaN), or the operator is not handled.
//
// Throws std::invalid_argument if the column's Parquet type is neither DOUBLE
// nor FLOAT.
bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> _stats) {
  if(!_stats->HasMinMax()) {
    return true;
  }

  // Only constraints carrying a double value are compared here; anything else
  // is answered conservatively.
  if(constraint.type != Double) {
    return true;
  }

  const int column = constraint.column;
  const parquet::Type::type pqType = types[column];

  // Widest possible range; always overwritten below (or we throw).
  // Note: lowest(), not min() -- min() is the smallest *positive* double.
  double min = std::numeric_limits<double>::lowest();
  double max = std::numeric_limits<double>::max();

  if(pqType == parquet::Type::DOUBLE) {
    auto* stats =
      static_cast<parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::DOUBLE>>*>(_stats.get());
    min = stats->min();
    max = stats->max();
  } else if(pqType == parquet::Type::FLOAT) {
    auto* stats =
      static_cast<parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::FLOAT>>*>(_stats.get());
    // float -> double widening is exact, so the bounds are preserved.
    min = stats->min();
    max = stats->max();
  } else {
    // Should be impossible to get here as we should have forbidden this at
    // CREATE time -- maybe file changed underneath us?
    std::ostringstream ss;
    ss << __FILE__ << ":" << __LINE__ << ": currentRowGroupSatisfiesDoubleFilter called on unsupported type: " <<
      parquet::TypeToString(pqType);
    throw std::invalid_argument(ss.str());
  }

  // Every ordered comparison against NaN is false, so a NaN min or max would
  // make the checks below report "no match" and wrongly skip the row group.
  // Treat unordered (or inverted) bounds as unusable and fall back to a scan.
  if(!(min <= max)) {
    return true;
  }

  const double value = constraint.doubleValue;

  switch(constraint.op) {
    case Is:
    case Equal:
      return value >= min && value <= max;
    case GreaterThanOrEqual:
      return max >= value;
    case GreaterThan:
      return max > value;
    case LessThan:
      return min < value;
    case LessThanOrEqual:
      return min <= value;
    case IsNot:
    case NotEqual:
      // If min == max == value, every row equals value, so none can match.
      return !(min == max && value == min);
    case Like:
    default:
      // Operators we cannot reason about from min/max: do not skip.
      return true;
  }
}
bool ParquetCursor::currentRowSatisfiesTextFilter(Constraint& constraint) {