Add rowgroup filtering for rowid

This commit is contained in:
Colin Dellow 2018-03-12 20:42:50 -04:00
parent 1f938a005d
commit acc15256ec
9 changed files with 61 additions and 15 deletions

View File

@ -396,7 +396,7 @@ static int parquetFilter(
ValueType type = Null; ValueType type = Null;
bool boolValue = false; bool boolValue = false;
uintptr_t intValue = 0; int64_t intValue = 0;
double doubleValue = 0; double doubleValue = 0;
std::vector<unsigned char> blobValue; std::vector<unsigned char> blobValue;
int sqliteType = sqlite3_value_type(argv[j]); int sqliteType = sqlite3_value_type(argv[j]);

View File

@ -6,6 +6,29 @@ ParquetCursor::ParquetCursor(ParquetTable* table) {
reset(std::vector<Constraint>()); reset(std::vector<Constraint>());
} }
bool ParquetCursor::currentRowGroupSatisfiesRowIdFilter(Constraint constraint) {
int64_t target = constraint.getInt();
switch(constraint.getOperator()) {
case IsNull:
return false;
case Is:
case Equal:
return target >= rowId && target < rowId + rowGroupSize;
case GreaterThan:
// rowId > target
return rowId + rowGroupSize > target;
case GreaterThanOrEqual:
// rowId >= target
return rowId + rowGroupSize >= rowId;
case LessThan:
return target > rowId;
case LessThanOrEqual:
return target >= rowId;
default:
return true;
}
}
// Return true if it is _possible_ that the current // Return true if it is _possible_ that the current
// rowgroup satisfies the constraints. Only return false // rowgroup satisfies the constraints. Only return false
// if it definitely does not. // if it definitely does not.
@ -19,9 +42,7 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() {
bool rv = true; bool rv = true;
if(column == -1) { if(column == -1) {
if(op == IsNull) { rv = currentRowGroupSatisfiesRowIdFilter(constraints[i]);
return false;
}
} else { } else {
// printf("column = %d\n", column); // printf("column = %d\n", column);
// std::unique_ptr<parquet::ColumnChunkMetaData> md = rowGroupMetadata->ColumnChunk(column); // std::unique_ptr<parquet::ColumnChunkMetaData> md = rowGroupMetadata->ColumnChunk(column);
@ -41,13 +62,18 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() {
bool ParquetCursor::nextRowGroup() { bool ParquetCursor::nextRowGroup() {
start: start:
if((rowGroupId + 1) >= numRowGroups) // Ensure that rowId points at the start of this rowGroup (eg, in the case where
// we skipped an entire row group).
rowId = rowGroupStartRowId + rowGroupSize;
if((rowGroupId + 1) >= numRowGroups) {
return false; return false;
}
rowGroupStartRowId = rowId; rowGroupStartRowId = rowId;
rowGroupId++; rowGroupId++;
rowGroupMetadata = reader->metadata()->RowGroup(rowGroupId); rowGroupMetadata = reader->metadata()->RowGroup(rowGroupId);
rowsLeftInRowGroup = rowGroupMetadata->num_rows(); rowGroupSize = rowsLeftInRowGroup = rowGroupMetadata->num_rows();
rowGroup = reader->RowGroup(rowGroupId); rowGroup = reader->RowGroup(rowGroupId);
for(unsigned int i = 0; i < scanners.size(); i++) for(unsigned int i = 0; i < scanners.size(); i++)
scanners[i] = NULL; scanners[i] = NULL;
@ -69,6 +95,9 @@ start:
colRows[i] = rowId; colRows[i] = rowId;
} }
// Increment rowId so currentRowGroupSatisfiesRowIdFilter can access it;
// it'll get decremented by our caller
rowId++;
if(!currentRowGroupSatisfiesFilter()) if(!currentRowGroupSatisfiesFilter())
goto start; goto start;
@ -106,8 +135,12 @@ start:
if(rowsLeftInRowGroup == 0) { if(rowsLeftInRowGroup == 0) {
if(!nextRowGroup()) { if(!nextRowGroup()) {
// put rowId over the edge so eof returns true // put rowId over the edge so eof returns true
rowId++; rowId = numRows + 1;
return; return;
} else {
// After a successful nextRowGroup, rowId is pointing at the current row. Make it
// point before so the rest of the logic works out.
rowId--;
} }
} }
@ -395,6 +428,8 @@ void ParquetCursor::reset(std::vector<Constraint> constraints) {
reader = parquet::ParquetFileReader::OpenFile(table->file.data()); reader = parquet::ParquetFileReader::OpenFile(table->file.data());
rowGroupId = -1; rowGroupId = -1;
rowGroupSize = 0;
rowGroupStartRowId = -1;
// TODO: handle the case where rowgroups have disjoint schemas? // TODO: handle the case where rowgroups have disjoint schemas?
// TODO: or at least, fail fast if detected // TODO: or at least, fail fast if detected
rowsLeftInRowGroup = 0; rowsLeftInRowGroup = 0;

View File

@ -17,13 +17,14 @@ class ParquetCursor {
std::vector<int> colRows; std::vector<int> colRows;
std::vector<bool> colNulls; std::vector<bool> colNulls;
std::vector<uintptr_t> colIntValues; std::vector<int64_t> colIntValues;
std::vector<double> colDoubleValues; std::vector<double> colDoubleValues;
std::vector<parquet::ByteArray> colByteArrayValues; std::vector<parquet::ByteArray> colByteArrayValues;
int rowId; int rowId;
int rowGroupId; int rowGroupId;
int rowGroupStartRowId; int rowGroupStartRowId;
int rowGroupSize;
int numRows; int numRows;
int numRowGroups; int numRowGroups;
int rowsLeftInRowGroup; int rowsLeftInRowGroup;
@ -32,11 +33,13 @@ class ParquetCursor {
std::vector<Constraint> constraints; std::vector<Constraint> constraints;
bool currentRowGroupSatisfiesRowIdFilter(Constraint constraint);
bool currentRowSatisfiesFilter();
bool currentRowGroupSatisfiesFilter();
public: public:
ParquetCursor(ParquetTable* table); ParquetCursor(ParquetTable* table);
int getRowId(); int getRowId();
bool currentRowSatisfiesFilter();
bool currentRowGroupSatisfiesFilter();
void next(); void next();
void close(); void close();
void reset(std::vector<Constraint> constraints); void reset(std::vector<Constraint> constraints);

View File

@ -5,7 +5,7 @@ Constraint::Constraint(
ConstraintOperator op, ConstraintOperator op,
ValueType type, ValueType type,
bool boolValue, bool boolValue,
uintptr_t intValue, int64_t intValue,
double doubleValue, double doubleValue,
std::vector<unsigned char> blobValue std::vector<unsigned char> blobValue
) { ) {
@ -34,7 +34,7 @@ bool Constraint::getBool() {
return boolValue; return boolValue;
} }
uintptr_t Constraint::getInt() { int64_t Constraint::getInt() {
return intValue; return intValue;
} }

View File

@ -36,7 +36,7 @@ class Constraint {
ValueType type; ValueType type;
bool boolValue; bool boolValue;
uintptr_t intValue; int64_t intValue;
double doubleValue; double doubleValue;
// Doubles as string value // Doubles as string value
std::vector<unsigned char> blobValue; std::vector<unsigned char> blobValue;
@ -48,7 +48,7 @@ public:
ConstraintOperator op, ConstraintOperator op,
ValueType type, ValueType type,
bool boolValue, bool boolValue,
uintptr_t intValue, int64_t intValue,
double doubleValue, double doubleValue,
std::vector<unsigned char> blobValue std::vector<unsigned char> blobValue
); );
@ -57,7 +57,7 @@ public:
ConstraintOperator getOperator(); ConstraintOperator getOperator();
ValueType getType(); ValueType getType();
bool getBool(); bool getBool();
uintptr_t getInt(); int64_t getInt();
double getDouble(); double getDouble();
std::vector<unsigned char> getBytes(); std::vector<unsigned char> getBytes();
}; };

View File

@ -0,0 +1,2 @@
select count(*) from no_nulls1 where rowid >= 0
99

View File

@ -0,0 +1,2 @@
select count(*) from no_nulls1 where rowid > 0
98

View File

@ -0,0 +1,2 @@
select count(*) from no_nulls2 where rowid > 0
98

View File

@ -0,0 +1,2 @@
select count(*) from no_nulls2 where rowid >= 0
99