From 01e8ffaba74523449920fa85d1c0334c6d3039d7 Mon Sep 17 00:00:00 2001
From: Colin Dellow
Date: Fri, 16 Mar 2018 16:30:05 -0400
Subject: [PATCH] Row group filtering for double/float

---
 README.md                 | 31 +++++++++++++++++---
 parquet/parquet_cursor.cc | 62 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 88 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index e4641d4..50b3399 100644
--- a/README.md
+++ b/README.md
@@ -33,13 +33,36 @@ sqlite> SELECT * FROM demo;
 
 ## Supported features
 
-### Index
+### Row group filtering
 
-Only full table scans are supported.
+Row group filtering is supported for strings and numerics so long as the SQLite
+type matches the Parquet type.
+
+e.g. if you have a column `foo` that is an INT32, this query will skip row groups whose
+statistics prove that they do not contain relevant rows:
+
+```
+SELECT * FROM tbl WHERE foo = 123;
+```
+
+but this query will devolve to a table scan:
+
+```
+SELECT * FROM tbl WHERE foo = '123';
+```
+
+This is laziness on my part and could be fixed without too much effort.
+
+### Row filtering
+
+For common constraints, the row is checked to see if it satisfies the query's
+constraints before returning control to SQLite's virtual machine. This minimizes
+the number of allocations performed when many rows are filtered out by
+the user's criteria.
 
### Types -These types are supported: +These Parquet types are supported: * INT96 timestamps (exposed as milliseconds since the epoch) * INT8/INT16/INT32/INT64 @@ -49,7 +72,7 @@ These types are supported: * DOUBLE * Variable- and fixed-length byte arrays -These are not supported: +These are not currently supported: * UINT8/UINT16/UINT32/UINT64 * DECIMAL
diff --git a/parquet/parquet_cursor.cc b/parquet/parquet_cursor.cc
index 11ab67e..3db8d3e 100644
--- a/parquet/parquet_cursor.cc
+++ b/parquet/parquet_cursor.cc
@@ -160,8 +160,68 @@ bool ParquetCursor::currentRowGroupSatisfiesIntegerFilter(Constraint& constraint
   return true;
 }
 
-bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats) {
+// Uses the row group's min/max statistics to decide whether it may contain
+// rows matching a DOUBLE/FLOAT constraint. Returns true (scan the group)
+// unless the statistics prove that no row can match. Throws
+// std::invalid_argument if the column's Parquet type is not DOUBLE or FLOAT.
+bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> _stats) {
+  // Without min/max, or for a non-double constraint value, nothing can be proven.
+  if(!_stats->HasMinMax() || constraint.type != Double) {
+    return true;
+  }
+
+  // lowest(), not min(): min() is the smallest *positive* double.
+  double min = std::numeric_limits<double>::lowest();
+  double max = std::numeric_limits<double>::max();
+  const parquet::Type::type pqType = types[constraint.column];
+
+  if(pqType == parquet::Type::DOUBLE) {
+    auto* stats = static_cast<parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::DOUBLE>>*>(_stats.get());
+    min = stats->min();
+    max = stats->max();
+  } else if(pqType == parquet::Type::FLOAT) {
+    auto* stats = static_cast<parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::FLOAT>>*>(_stats.get());
+    min = stats->min();
+    max = stats->max();
+  } else {
+    // Should be impossible to get here as we should have forbidden this at
+    // CREATE time -- maybe file changed underneath us?
+    std::ostringstream ss;
+    ss << __FILE__ << ":" << __LINE__ << ": currentRowGroupSatisfiesDoubleFilter called on unsupported type: " <<
+      parquet::TypeToString(pqType);
+    throw std::invalid_argument(ss.str());
+  }
+
+  // NaN bounds make every comparison below false, which would wrongly skip
+  // the row group. x != x holds only for NaN; fall back to scanning.
+  if(min != min || max != max) {
+    return true;
+  }
+
+  const double value = constraint.doubleValue;
+
+  switch(constraint.op) {
+    case Is:
+    case Equal:
+      return value >= min && value <= max;
+    case GreaterThanOrEqual:
+      return max >= value;
+    case GreaterThan:
+      return max > value;
+    case LessThan:
+      return min < value;
+    case LessThanOrEqual:
+      return min <= value;
+    case IsNot:
+    case NotEqual:
+      // Skippable only if every row equals value, i.e. min == max == value.
+      return !(min == max && value == min);
+    case Like:
+    default:
+      return true;
+  }
+
   return true;
 }
 
 bool ParquetCursor::currentRowSatisfiesTextFilter(Constraint& constraint) {