From acc15256ec4e68c7beb6fbec6d169ef679dde065 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Mon, 12 Mar 2018 20:42:50 -0400 Subject: [PATCH] Add rowgroup filtering for rowid --- parquet/parquet.cc | 2 +- parquet/parquet_cursor.cc | 47 +++++++++++++++++++++++++++---- parquet/parquet_cursor.h | 9 ++++-- parquet/parquet_filter.cc | 4 +-- parquet/parquet_filter.h | 6 ++-- tests/queries/105-rowid-gte-0.sql | 2 ++ tests/queries/106-rowid-gt-0.sql | 2 ++ tests/queries/107-rowid-gt-0.sql | 2 ++ tests/queries/108-rowid-gte-0.sql | 2 ++ 9 files changed, 61 insertions(+), 15 deletions(-) create mode 100644 tests/queries/105-rowid-gte-0.sql create mode 100644 tests/queries/106-rowid-gt-0.sql create mode 100644 tests/queries/107-rowid-gt-0.sql create mode 100644 tests/queries/108-rowid-gte-0.sql diff --git a/parquet/parquet.cc b/parquet/parquet.cc index a5b25d1..f547d17 100644 --- a/parquet/parquet.cc +++ b/parquet/parquet.cc @@ -396,7 +396,7 @@ static int parquetFilter( ValueType type = Null; bool boolValue = false; - uintptr_t intValue = 0; + int64_t intValue = 0; double doubleValue = 0; std::vector blobValue; int sqliteType = sqlite3_value_type(argv[j]); diff --git a/parquet/parquet_cursor.cc b/parquet/parquet_cursor.cc index 29d8d32..e355298 100644 --- a/parquet/parquet_cursor.cc +++ b/parquet/parquet_cursor.cc @@ -6,6 +6,29 @@ ParquetCursor::ParquetCursor(ParquetTable* table) { reset(std::vector()); } +bool ParquetCursor::currentRowGroupSatisfiesRowIdFilter(Constraint constraint) { + int64_t target = constraint.getInt(); + switch(constraint.getOperator()) { + case IsNull: + return false; + case Is: + case Equal: + return target >= rowId && target < rowId + rowGroupSize; + case GreaterThan: + // rowId > target + return rowId + rowGroupSize > target; + case GreaterThanOrEqual: + // rowId >= target + return rowId + rowGroupSize >= rowId; + case LessThan: + return target > rowId; + case LessThanOrEqual: + return target >= rowId; + default: + return true; + } +} + // Return true if it is _possible_ that the current // rowgroup satisfies the constraints. Only return false // if it definitely does not. @@ -19,9 +42,7 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() { bool rv = true; if(column == -1) { - if(op == IsNull) { - return false; - } + rv = currentRowGroupSatisfiesRowIdFilter(constraints[i]); } else { // printf("column = %d\n", column); // std::unique_ptr md = rowGroupMetadata->ColumnChunk(column); @@ -41,13 +62,18 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() { bool ParquetCursor::nextRowGroup() { start: - if((rowGroupId + 1) >= numRowGroups) + // Ensure that rowId points at the start of this rowGroup (eg, in the case where + // we skipped an entire row group). + rowId = rowGroupStartRowId + rowGroupSize; + + if((rowGroupId + 1) >= numRowGroups) { return false; + } rowGroupStartRowId = rowId; rowGroupId++; rowGroupMetadata = reader->metadata()->RowGroup(rowGroupId); - rowsLeftInRowGroup = rowGroupMetadata->num_rows(); + rowGroupSize = rowsLeftInRowGroup = rowGroupMetadata->num_rows(); rowGroup = reader->RowGroup(rowGroupId); for(unsigned int i = 0; i < scanners.size(); i++) scanners[i] = NULL; @@ -69,6 +95,9 @@ start: colRows[i] = rowId; } + // Increment rowId so currentRowGroupSatisfiesRowIdFilter can access it; + // it'll get decremented by our caller + rowId++; if(!currentRowGroupSatisfiesFilter()) goto start; @@ -106,8 +135,12 @@ start: if(rowsLeftInRowGroup == 0) { if(!nextRowGroup()) { // put rowId over the edge so eof returns true - rowId++; + rowId = numRows + 1; return; + } else { + // After a successful nextRowGroup, rowId is pointing at the current row. Make it + // point before so the rest of the logic works out. + rowId--; } } @@ -395,6 +428,8 @@ void ParquetCursor::reset(std::vector constraints) { reader = parquet::ParquetFileReader::OpenFile(table->file.data()); rowGroupId = -1; + rowGroupSize = 0; + rowGroupStartRowId = -1; // TODO: handle the case where rowgroups have disjoint schemas? // TODO: or at least, fail fast if detected rowsLeftInRowGroup = 0; diff --git a/parquet/parquet_cursor.h b/parquet/parquet_cursor.h index 2ab51a6..4289fa6 100644 --- a/parquet/parquet_cursor.h +++ b/parquet/parquet_cursor.h @@ -17,13 +17,14 @@ class ParquetCursor { std::vector colRows; std::vector colNulls; - std::vector colIntValues; + std::vector colIntValues; std::vector colDoubleValues; std::vector colByteArrayValues; int rowId; int rowGroupId; int rowGroupStartRowId; + int rowGroupSize; int numRows; int numRowGroups; int rowsLeftInRowGroup; @@ -32,11 +33,13 @@ class ParquetCursor { std::vector constraints; + bool currentRowGroupSatisfiesRowIdFilter(Constraint constraint); + bool currentRowSatisfiesFilter(); + bool currentRowGroupSatisfiesFilter(); + public: ParquetCursor(ParquetTable* table); int getRowId(); - bool currentRowSatisfiesFilter(); - bool currentRowGroupSatisfiesFilter(); void next(); void close(); void reset(std::vector constraints); diff --git a/parquet/parquet_filter.cc b/parquet/parquet_filter.cc index 4f4b290..b98eaa9 100644 --- a/parquet/parquet_filter.cc +++ b/parquet/parquet_filter.cc @@ -5,7 +5,7 @@ Constraint::Constraint( ConstraintOperator op, ValueType type, bool boolValue, - uintptr_t intValue, + int64_t intValue, double doubleValue, std::vector blobValue ) { @@ -34,7 +34,7 @@ bool Constraint::getBool() { return boolValue; } -uintptr_t Constraint::getInt() { +int64_t Constraint::getInt() { return intValue; } diff --git a/parquet/parquet_filter.h b/parquet/parquet_filter.h index 11590a7..3f74cc7 100644 --- a/parquet/parquet_filter.h +++ b/parquet/parquet_filter.h @@ -36,7 +36,7 @@ class Constraint { ValueType type; bool boolValue; - uintptr_t intValue; + int64_t intValue; double doubleValue; // Doubles as string value std::vector blobValue; @@ -48,7 +48,7 @@ public: ConstraintOperator op, ValueType type, bool boolValue, - uintptr_t intValue, + int64_t intValue, double doubleValue, std::vector blobValue ); @@ -57,7 +57,7 @@ public: ConstraintOperator getOperator(); ValueType getType(); bool getBool(); - uintptr_t getInt(); + int64_t getInt(); double getDouble(); std::vector getBytes(); }; diff --git a/tests/queries/105-rowid-gte-0.sql b/tests/queries/105-rowid-gte-0.sql new file mode 100644 index 0000000..941d653 --- /dev/null +++ b/tests/queries/105-rowid-gte-0.sql @@ -0,0 +1,2 @@ +select count(*) from no_nulls1 where rowid >= 0 +99 diff --git a/tests/queries/106-rowid-gt-0.sql b/tests/queries/106-rowid-gt-0.sql new file mode 100644 index 0000000..55e8e53 --- /dev/null +++ b/tests/queries/106-rowid-gt-0.sql @@ -0,0 +1,2 @@ +select count(*) from no_nulls1 where rowid > 0 +98 diff --git a/tests/queries/107-rowid-gt-0.sql b/tests/queries/107-rowid-gt-0.sql new file mode 100644 index 0000000..7f1c921 --- /dev/null +++ b/tests/queries/107-rowid-gt-0.sql @@ -0,0 +1,2 @@ +select count(*) from no_nulls2 where rowid > 0 +98 diff --git a/tests/queries/108-rowid-gte-0.sql b/tests/queries/108-rowid-gte-0.sql new file mode 100644 index 0000000..4618b4d --- /dev/null +++ b/tests/queries/108-rowid-gte-0.sql @@ -0,0 +1,2 @@ +select count(*) from no_nulls2 where rowid >= 0 +99