Scaffolding for row group filters, tests

rowid is special since its column index is -1, so add
explicit tests around it
This commit is contained in:
Colin Dellow 2018-03-11 15:43:40 -04:00
parent 5559a7b563
commit 095b576cc2
11 changed files with 60 additions and 3 deletions

View File

@ -6,8 +6,35 @@ ParquetCursor::ParquetCursor(ParquetTable* table) {
reset(std::vector<Constraint>()); reset(std::vector<Constraint>());
} }
// Row-group level pre-filter.
//
// Reports whether the current row group could _possibly_ contain rows that
// satisfy every constraint. The answer is deliberately conservative: false
// is only returned when some constraint definitely cannot be met by the
// row group, which lets the caller skip row groups that cannot return
// useful data (a substantial performance win).
//
// Scaffolding: no operator is actually evaluated yet, so every row group
// is currently reported as a possible match.
bool ParquetCursor::currentRowGroupSatisfiesFilter() {
  for(unsigned int idx = 0; idx < constraints.size(); idx++) {
    int column = constraints[idx].getColumn();
    int op = constraints[idx].getOperator();

    // Stays true until an operator proves the row group cannot match.
    bool mayMatch = true;

    // Planned metadata lookup, kept disabled from the original sketch:
    // std::unique_ptr<parquet::ColumnChunkMetaData> md = rowGroupMetadata->ColumnChunk(column);
    // NOTE(review): presumably this needs a guard for rowid, whose column
    // index is reported as -1 -- confirm before enabling.
    if(op == IsNull) {
      // Placeholder: IS NULL is not checked at the row group level yet.
    } else if(op == IsNotNull) {
      // Placeholder: IS NOT NULL is not checked at the row group level yet.
    }

    if(mayMatch)
      continue;

    return false;
  }

  // No constraint ruled the row group out.
  return true;
}
bool ParquetCursor::nextRowGroup() { bool ParquetCursor::nextRowGroup() {
// TODO: skip row groups that cannot satisfy the constraints start:
if((rowGroupId + 1) >= numRowGroups) if((rowGroupId + 1) >= numRowGroups)
return false; return false;
@ -36,23 +63,34 @@ bool ParquetCursor::nextRowGroup() {
colRows[i] = rowId; colRows[i] = rowId;
} }
if(!currentRowGroupSatisfiesFilter())
goto start;
return true; return true;
} }
// Return true if it is _possible_ that the current // Return true if it is _possible_ that the current
// row satisfies the constraints. Only return false // row satisfies the constraints. Only return false
// if it definitely does not. // if it definitely does not.
//
// This avoids pointless transitions between the SQLite VM
// and the extension, which can add up on a dataset of tens
// of millions of rows.
bool ParquetCursor::currentRowSatisfiesFilter() { bool ParquetCursor::currentRowSatisfiesFilter() {
for(unsigned int i = 0; i < constraints.size(); i++) { for(unsigned int i = 0; i < constraints.size(); i++) {
bool rv = true;
int column = constraints[i].getColumn(); int column = constraints[i].getColumn();
ensureColumn(column); ensureColumn(column);
int op = constraints[i].getOperator(); int op = constraints[i].getOperator();
if(op == IsNull) { if(op == IsNull) {
return isNull(column); rv = isNull(column);
} else if(op == IsNotNull) { } else if(op == IsNotNull) {
return !isNull(column); rv = !isNull(column);
} }
if(!rv)
return false;
} }
return true; return true;
} }

View File

@ -36,6 +36,7 @@ public:
ParquetCursor(ParquetTable* table); ParquetCursor(ParquetTable* table);
int getRowId(); int getRowId();
bool currentRowSatisfiesFilter(); bool currentRowSatisfiesFilter();
bool currentRowGroupSatisfiesFilter();
void next(); void next();
void close(); void close();
void reset(std::vector<Constraint> constraints); void reset(std::vector<Constraint> constraints);

View File

@ -0,0 +1,2 @@
select count(*) from no_nulls1 where rowid > 100
0

View File

@ -0,0 +1,2 @@
select count(*) from no_nulls1 where rowid >= 100
0

View File

@ -0,0 +1,2 @@
select count(*) from no_nulls1 where rowid < 0
0

View File

@ -0,0 +1,2 @@
select count(*) from no_nulls1 where rowid < -1
0

View File

@ -0,0 +1,2 @@
select count(*) from no_nulls1 where rowid <= 0
1

View File

@ -0,0 +1,2 @@
select count(*) from no_nulls1 where rowid < 1
1

View File

@ -0,0 +1,2 @@
select count(*) from no_nulls1 where rowid <> 1
98

View File

@ -0,0 +1,2 @@
select count(*) from no_nulls1 where rowid is null
0

View File

@ -0,0 +1,2 @@
select count(*) from no_nulls1 where rowid is not null
99