From cbde3c73b601383fff33ce501d0b26047326e93f Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sat, 23 Jun 2018 20:31:03 -0400 Subject: [PATCH] Don't eagerly evaluate constraints ...to avoid decompressing columns when we know from previous columns that the row can't match. Fixes #10 --- parquet/parquet_cursor.cc | 15 +++++++++++---- parquet/parquet_filter.h | 1 + 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/parquet/parquet_cursor.cc b/parquet/parquet_cursor.cc index 35502cd..ee2ad6f 100644 --- a/parquet/parquet_cursor.cc +++ b/parquet/parquet_cursor.cc @@ -630,9 +630,12 @@ start: // a row for(unsigned int i = 0; i < constraints.size(); i++) { if(rowGroupId > 0 && constraints[i].rowGroupId == rowGroupId - 1) { - constraints[i].bitmap.setActualMembership(rowGroupId - 1, constraints[i].hadRows); + if(constraints[i].valid || constraints[i].hadRows) { + constraints[i].bitmap.setActualMembership(rowGroupId - 1, constraints[i].hadRows); + } } constraints[i].hadRows = false; + constraints[i].valid = true; } if(!currentRowGroupSatisfiesFilter()) @@ -652,7 +655,6 @@ start: // and the extension, which can add up on a dataset of tens // of millions of rows. bool ParquetCursor::currentRowSatisfiesFilter() { - bool overallRv = true; for(unsigned int i = 0; i < constraints.size(); i++) { bool rv = true; int column = constraints[i].column; @@ -682,10 +684,15 @@ bool ParquetCursor::currentRowSatisfiesFilter() { // ideally we'd short-circuit if we'd already set this group as visited if(rv) { constraints[i].hadRows = true; + } else { + // When we short circuit, mark the other constraints as not evaluated to avoid persisting incorrect data. + for(unsigned int j = i + 1; j < constraints.size(); j++) { + constraints[j].valid = false; + } + return false; } - overallRv = overallRv && rv; } - return overallRv; + return true; } void ParquetCursor::next() { diff --git a/parquet/parquet_filter.h b/parquet/parquet_filter.h index d9f336e..d350bfb 100644 --- a/parquet/parquet_filter.h +++ b/parquet/parquet_filter.h @@ -117,6 +117,7 @@ public: // that matched this constraint. int rowGroupId; bool hadRows; + bool valid; }; #endif