Row group filtering for double/float
This commit is contained in:
parent
9c22fd1f57
commit
01e8ffaba7
31
README.md
31
README.md
|
@ -33,13 +33,36 @@ sqlite> SELECT * FROM demo;
|
||||||
|
|
||||||
## Supported features
|
## Supported features
|
||||||
|
|
||||||
### Index
|
### Row group filtering
|
||||||
|
|
||||||
Only full table scans are supported.
|
Row group filtering is supported for strings and numerics so long as the SQLite
|
||||||
|
type matches the Parquet type.
|
||||||
|
|
||||||
|
e.g. if you have a column `foo` that is an INT32, this query will skip row groups whose
|
||||||
|
statistics prove that they do not contain relevant rows:
|
||||||
|
|
||||||
|
```
|
||||||
|
SELECT * FROM tbl WHERE foo = 123;
|
||||||
|
```
|
||||||
|
|
||||||
|
but this query will devolve to a table scan:
|
||||||
|
|
||||||
|
```
|
||||||
|
SELECT * FROM tbl WHERE foo = '123';
|
||||||
|
```
|
||||||
|
|
||||||
|
This is laziness on my part and could be fixed without too much effort.
|
||||||
|
|
||||||
|
### Row filtering
|
||||||
|
|
||||||
|
For common constraints, the row is checked to see if it satisfies the query's
|
||||||
|
constraints before returning control to SQLite's virtual machine. This minimizes
|
||||||
|
the number of allocations performed when many rows are filtered out by
|
||||||
|
the user's criteria.
|
||||||
|
|
||||||
### Types
|
### Types
|
||||||
|
|
||||||
These types are supported:
|
These Parquet types are supported:
|
||||||
|
|
||||||
* INT96 timestamps (exposed as milliseconds since the epoch)
|
* INT96 timestamps (exposed as milliseconds since the epoch)
|
||||||
* INT8/INT16/INT32/INT64
|
* INT8/INT16/INT32/INT64
|
||||||
|
@ -49,7 +72,7 @@ These types are supported:
|
||||||
* DOUBLE
|
* DOUBLE
|
||||||
* Variable- and fixed-length byte arrays
|
* Variable- and fixed-length byte arrays
|
||||||
|
|
||||||
These are not supported:
|
These are not currently supported:
|
||||||
|
|
||||||
* UINT8/UINT16/UINT32/UINT64
|
* UINT8/UINT16/UINT32/UINT64
|
||||||
* DECIMAL
|
* DECIMAL
|
||||||
|
|
|
@ -160,10 +160,70 @@ bool ParquetCursor::currentRowGroupSatisfiesIntegerFilter(Constraint& constraint
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Decides whether the current row group might contain a row that satisfies a
// floating-point constraint, using the row group's min/max statistics.
//
// constraint: the constraint to test; only constraints whose type is Double
//             are evaluated, anything else is treated as "cannot prune".
// _stats:     statistics for the constrained column in the current row group.
//
// Returns false only when the statistics prove that no row in the group can
// match, and true in every other case (including whenever the statistics are
// missing or unusable), so a true result never guarantees a match.
//
// Throws std::invalid_argument if the column's Parquet type is neither DOUBLE
// nor FLOAT.
bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> _stats) {
  if(!_stats->HasMinMax()) {
    return true;
  }

  if(constraint.type != Double) {
    return true;
  }

  const int column = constraint.column;

  // lowest(), not min(): numeric_limits<double>::min() is the smallest
  // *positive* normal value, which is not a valid lower bound.
  double min = std::numeric_limits<double>::lowest();
  double max = std::numeric_limits<double>::max();
  const parquet::Type::type pqType = types[column];

  if(pqType == parquet::Type::DOUBLE) {
    const auto* stats =
      static_cast<parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::DOUBLE>>*>(_stats.get());

    min = stats->min();
    max = stats->max();
  } else if(pqType == parquet::Type::FLOAT) {
    const auto* stats =
      static_cast<parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::FLOAT>>*>(_stats.get());

    // FLOAT statistics are widened to double for the comparisons below.
    min = stats->min();
    max = stats->max();
  } else {
    // Should be impossible to get here as we should have forbidden this at
    // CREATE time -- maybe file changed underneath us?
    std::ostringstream ss;
    ss << __FILE__ << ":" << __LINE__ << ": currentRowGroupSatisfiesDoubleFilter called on unsupported type: " <<
      parquet::TypeToString(pqType);
    throw std::invalid_argument(ss.str());
  }

  // A NaN bound makes every ordered comparison false, which would wrongly
  // prune the row group. NaN is the only value that compares unequal to
  // itself, so fall back to scanning the group when either bound is NaN.
  if(min != min || max != max) {
    return true;
  }

  const double value = constraint.doubleValue;

  switch(constraint.op) {
    case Is:
    case Equal:
      return value >= min && value <= max;
    case GreaterThanOrEqual:
      return max >= value;
    case GreaterThan:
      return max > value;
    case LessThan:
      return min < value;
    case LessThanOrEqual:
      return min <= value;
    case IsNot:
    case NotEqual:
      // If min == max == value, every row equals value, so we can skip this.
      return !(min == max && value == min);
    case Like:
    default:
      // Operators that cannot be decided from min/max: never prune.
      break;
  }

  return true;
}
|
||||||
|
|
||||||
bool ParquetCursor::currentRowSatisfiesTextFilter(Constraint& constraint) {
|
bool ParquetCursor::currentRowSatisfiesTextFilter(Constraint& constraint) {
|
||||||
if(constraint.type != Text) {
|
if(constraint.type != Text) {
|
||||||
return true;
|
return true;
|
||||||
|
|
Loading…
Reference in New Issue