Add rowgroup filtering for rowid

This commit is contained in:
Colin Dellow 2018-03-12 20:42:50 -04:00
parent 1f938a005d
commit acc15256ec
9 changed files with 61 additions and 15 deletions

View File

@ -396,7 +396,7 @@ static int parquetFilter(
ValueType type = Null;
bool boolValue = false;
uintptr_t intValue = 0;
int64_t intValue = 0;
double doubleValue = 0;
std::vector<unsigned char> blobValue;
int sqliteType = sqlite3_value_type(argv[j]);

View File

@ -6,6 +6,29 @@ ParquetCursor::ParquetCursor(ParquetTable* table) {
reset(std::vector<Constraint>());
}
bool ParquetCursor::currentRowGroupSatisfiesRowIdFilter(Constraint constraint) {
int64_t target = constraint.getInt();
switch(constraint.getOperator()) {
case IsNull:
return false;
case Is:
case Equal:
return target >= rowId && target < rowId + rowGroupSize;
case GreaterThan:
// rowId > target
return rowId + rowGroupSize > target;
case GreaterThanOrEqual:
// rowId >= target
return rowId + rowGroupSize >= rowId;
case LessThan:
return target > rowId;
case LessThanOrEqual:
return target >= rowId;
default:
return true;
}
}
// Return true if it is _possible_ that the current
// rowgroup satisfies the constraints. Only return false
// if it definitely does not.
@ -19,9 +42,7 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() {
bool rv = true;
if(column == -1) {
if(op == IsNull) {
return false;
}
rv = currentRowGroupSatisfiesRowIdFilter(constraints[i]);
} else {
// printf("column = %d\n", column);
// std::unique_ptr<parquet::ColumnChunkMetaData> md = rowGroupMetadata->ColumnChunk(column);
@ -41,13 +62,18 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() {
bool ParquetCursor::nextRowGroup() {
start:
if((rowGroupId + 1) >= numRowGroups)
// Ensure that rowId points at the start of this rowGroup (eg, in the case where
// we skipped an entire row group).
rowId = rowGroupStartRowId + rowGroupSize;
if((rowGroupId + 1) >= numRowGroups) {
return false;
}
rowGroupStartRowId = rowId;
rowGroupId++;
rowGroupMetadata = reader->metadata()->RowGroup(rowGroupId);
rowsLeftInRowGroup = rowGroupMetadata->num_rows();
rowGroupSize = rowsLeftInRowGroup = rowGroupMetadata->num_rows();
rowGroup = reader->RowGroup(rowGroupId);
for(unsigned int i = 0; i < scanners.size(); i++)
scanners[i] = NULL;
@ -69,6 +95,9 @@ start:
colRows[i] = rowId;
}
// Increment rowId so currentRowGroupSatisfiesRowIdFilter can access it;
// it'll get decremented by our caller
rowId++;
if(!currentRowGroupSatisfiesFilter())
goto start;
@ -106,8 +135,12 @@ start:
if(rowsLeftInRowGroup == 0) {
if(!nextRowGroup()) {
// put rowId over the edge so eof returns true
rowId++;
rowId = numRows + 1;
return;
} else {
// After a successful nextRowGroup, rowId is pointing at the current row. Make it
// point before so the rest of the logic works out.
rowId--;
}
}
@ -395,6 +428,8 @@ void ParquetCursor::reset(std::vector<Constraint> constraints) {
reader = parquet::ParquetFileReader::OpenFile(table->file.data());
rowGroupId = -1;
rowGroupSize = 0;
rowGroupStartRowId = -1;
// TODO: handle the case where rowgroups have disjoint schemas?
// TODO: or at least, fail fast if detected
rowsLeftInRowGroup = 0;

View File

@ -17,13 +17,14 @@ class ParquetCursor {
std::vector<int> colRows;
std::vector<bool> colNulls;
std::vector<uintptr_t> colIntValues;
std::vector<int64_t> colIntValues;
std::vector<double> colDoubleValues;
std::vector<parquet::ByteArray> colByteArrayValues;
int rowId;
int rowGroupId;
int rowGroupStartRowId;
int rowGroupSize;
int numRows;
int numRowGroups;
int rowsLeftInRowGroup;
@ -32,11 +33,13 @@ class ParquetCursor {
std::vector<Constraint> constraints;
bool currentRowGroupSatisfiesRowIdFilter(Constraint constraint);
bool currentRowSatisfiesFilter();
bool currentRowGroupSatisfiesFilter();
public:
ParquetCursor(ParquetTable* table);
int getRowId();
bool currentRowSatisfiesFilter();
bool currentRowGroupSatisfiesFilter();
void next();
void close();
void reset(std::vector<Constraint> constraints);

View File

@ -5,7 +5,7 @@ Constraint::Constraint(
ConstraintOperator op,
ValueType type,
bool boolValue,
uintptr_t intValue,
int64_t intValue,
double doubleValue,
std::vector<unsigned char> blobValue
) {
@ -34,7 +34,7 @@ bool Constraint::getBool() {
return boolValue;
}
uintptr_t Constraint::getInt() {
int64_t Constraint::getInt() {
return intValue;
}

View File

@ -36,7 +36,7 @@ class Constraint {
ValueType type;
bool boolValue;
uintptr_t intValue;
int64_t intValue;
double doubleValue;
// Doubles as string value
std::vector<unsigned char> blobValue;
@ -48,7 +48,7 @@ public:
ConstraintOperator op,
ValueType type,
bool boolValue,
uintptr_t intValue,
int64_t intValue,
double doubleValue,
std::vector<unsigned char> blobValue
);
@ -57,7 +57,7 @@ public:
ConstraintOperator getOperator();
ValueType getType();
bool getBool();
uintptr_t getInt();
int64_t getInt();
double getDouble();
std::vector<unsigned char> getBytes();
};

View File

@ -0,0 +1,2 @@
select count(*) from no_nulls1 where rowid >= 0
99

View File

@ -0,0 +1,2 @@
select count(*) from no_nulls1 where rowid > 0
98

View File

@ -0,0 +1,2 @@
select count(*) from no_nulls2 where rowid > 0
98

View File

@ -0,0 +1,2 @@
select count(*) from no_nulls2 where rowid >= 0
99