diff --git a/parquet/parquet_cursor.cc b/parquet/parquet_cursor.cc index 0e79ba7..b6bbf53 100644 --- a/parquet/parquet_cursor.cc +++ b/parquet/parquet_cursor.cc @@ -6,8 +6,35 @@ ParquetCursor::ParquetCursor(ParquetTable* table) { reset(std::vector()); } +// Return true if it is _possible_ that the current +// rowgroup satisfies the constraints. Only return false +// if it definitely does not. +// +// This avoids opening rowgroups that can't return useful +// data, which provides substantial performance benefits. +bool ParquetCursor::currentRowGroupSatisfiesFilter() { + for(unsigned int i = 0; i < constraints.size(); i++) { + int column = constraints[i].getColumn(); + int op = constraints[i].getOperator(); + bool rv = true; + +// printf("column = %d\n", column); +// std::unique_ptr md = rowGroupMetadata->ColumnChunk(column); + + if(op == IsNull) { + } else if(op == IsNotNull) { + } + + if(!rv) + return false; + } + + return true; +} + + bool ParquetCursor::nextRowGroup() { - // TODO: skip row groups that cannot satisfy the constraints +start: if((rowGroupId + 1) >= numRowGroups) return false; @@ -36,23 +63,34 @@ bool ParquetCursor::nextRowGroup() { colRows[i] = rowId; } + if(!currentRowGroupSatisfiesFilter()) + goto start; + return true; } // Return true if it is _possible_ that the current // row satisfies the constraints. Only return false // if it definitely does not. +// +// This avoids pointless transitions between the SQLite VM +// and the extension, which can add up on a dataset of tens +// of millions of rows. bool ParquetCursor::currentRowSatisfiesFilter() { for(unsigned int i = 0; i < constraints.size(); i++) { + bool rv = true; int column = constraints[i].getColumn(); ensureColumn(column); int op = constraints[i].getOperator(); if(op == IsNull) { - return isNull(column); + rv = isNull(column); } else if(op == IsNotNull) { - return !isNull(column); + rv = !isNull(column); } + + if(!rv) + return false; } return true; } diff --git a/parquet/parquet_cursor.h b/parquet/parquet_cursor.h index 80bc64a..2ab51a6 100644 --- a/parquet/parquet_cursor.h +++ b/parquet/parquet_cursor.h @@ -36,6 +36,7 @@ public: ParquetCursor(ParquetTable* table); int getRowId(); bool currentRowSatisfiesFilter(); + bool currentRowGroupSatisfiesFilter(); void next(); void close(); void reset(std::vector constraints); diff --git a/tests/queries/050-rowid-gt-none.sql b/tests/queries/050-rowid-gt-none.sql new file mode 100644 index 0000000..6ebb920 --- /dev/null +++ b/tests/queries/050-rowid-gt-none.sql @@ -0,0 +1,2 @@ +select count(*) from no_nulls1 where rowid > 100 +0 diff --git a/tests/queries/051-rowid-gte-none.sql b/tests/queries/051-rowid-gte-none.sql new file mode 100644 index 0000000..00b2eef --- /dev/null +++ b/tests/queries/051-rowid-gte-none.sql @@ -0,0 +1,2 @@ +select count(*) from no_nulls1 where rowid >= 100 +0 diff --git a/tests/queries/052-rowid-lt-none.sql b/tests/queries/052-rowid-lt-none.sql new file mode 100644 index 0000000..0c28165 --- /dev/null +++ b/tests/queries/052-rowid-lt-none.sql @@ -0,0 +1,2 @@ +select count(*) from no_nulls1 where rowid < 0 +0 diff --git a/tests/queries/053-rowid-lte-none.sql b/tests/queries/053-rowid-lte-none.sql new file mode 100644 index 0000000..2545b46 --- /dev/null +++ b/tests/queries/053-rowid-lte-none.sql @@ -0,0 +1,2 @@ +select count(*) from no_nulls1 where rowid < -1 +0 diff --git a/tests/queries/054-rowid-lte-one.sql b/tests/queries/054-rowid-lte-one.sql new file mode 100644 index 0000000..ba6fc8b --- /dev/null +++ b/tests/queries/054-rowid-lte-one.sql @@ -0,0 +1,2 @@ +select count(*) from no_nulls1 where rowid <= 0 +1 diff --git a/tests/queries/055-rowid-lt-one.sql b/tests/queries/055-rowid-lt-one.sql new file mode 100644 index 0000000..e52480d --- /dev/null +++ b/tests/queries/055-rowid-lt-one.sql @@ -0,0 +1,2 @@ +select count(*) from no_nulls1 where rowid < 1 +1 diff --git a/tests/queries/056-rowid-ne-some.sql b/tests/queries/056-rowid-ne-some.sql new file mode 100644 index 0000000..fecfaad --- /dev/null +++ b/tests/queries/056-rowid-ne-some.sql @@ -0,0 +1,2 @@ +select count(*) from no_nulls1 where rowid <> 1 +98 diff --git a/tests/queries/057-rowid-is-null.sql b/tests/queries/057-rowid-is-null.sql new file mode 100644 index 0000000..19d15af --- /dev/null +++ b/tests/queries/057-rowid-is-null.sql @@ -0,0 +1,2 @@ +select count(*) from no_nulls1 where rowid is null +0 diff --git a/tests/queries/058-rowid-is-not-null.sql b/tests/queries/058-rowid-is-not-null.sql new file mode 100644 index 0000000..ad1478c --- /dev/null +++ b/tests/queries/058-rowid-is-not-null.sql @@ -0,0 +1,2 @@ +select count(*) from no_nulls1 where rowid is not null +99