Add rowgroup filtering for rowid
This commit is contained in:
parent
1f938a005d
commit
acc15256ec
|
@ -396,7 +396,7 @@ static int parquetFilter(
|
|||
|
||||
ValueType type = Null;
|
||||
bool boolValue = false;
|
||||
uintptr_t intValue = 0;
|
||||
int64_t intValue = 0;
|
||||
double doubleValue = 0;
|
||||
std::vector<unsigned char> blobValue;
|
||||
int sqliteType = sqlite3_value_type(argv[j]);
|
||||
|
|
|
@ -6,6 +6,29 @@ ParquetCursor::ParquetCursor(ParquetTable* table) {
|
|||
reset(std::vector<Constraint>());
|
||||
}
|
||||
|
||||
// Decide whether the current row group could contain a row whose rowid
// satisfies `constraint`.
//
// The row group is treated as covering the half-open rowid interval
// [rowId, rowId + rowGroupSize), using the cursor's rowId and rowGroupSize
// members as they stand when this is called.
//
// Returns false only when no rowid in that interval can satisfy the
// constraint; returns true when a match is possible. The lower-bound checks
// are deliberately conservative (they may keep a row group that turns out to
// have no match), which is safe because rows are still filtered one at a time.
// Operators that are not handled here always return true.
bool ParquetCursor::currentRowGroupSatisfiesRowIdFilter(Constraint constraint) {
  const int64_t target = constraint.getInt();
  // Widen before adding so the upper bound is computed in 64 bits.
  const int64_t firstRowId = rowId;
  const int64_t endRowId = firstRowId + rowGroupSize;  // one past the last row

  switch(constraint.getOperator()) {
    case IsNull:
      // A rowid constraint of IS NULL can never match.
      return false;
    case Is:
    case Equal:
      return target >= firstRowId && target < endRowId;
    case GreaterThan:
      // rowId > target
      return endRowId > target;
    case GreaterThanOrEqual:
      // rowId >= target
      // Compare against target: comparing against rowId itself was always
      // true, so this operator never pruned a row group.
      return endRowId >= target;
    case LessThan:
      // rowId < target: the smallest rowid in the group must be below target.
      return target > firstRowId;
    case LessThanOrEqual:
      // rowId <= target
      return target >= firstRowId;
    default:
      // Unknown or unsupported operator: do not prune.
      return true;
  }
}
|
||||
|
||||
// Return true if it is _possible_ that the current
|
||||
// rowgroup satisfies the constraints. Only return false
|
||||
// if it definitely does not.
|
||||
|
@ -19,9 +42,7 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() {
|
|||
bool rv = true;
|
||||
|
||||
if(column == -1) {
|
||||
if(op == IsNull) {
|
||||
return false;
|
||||
}
|
||||
rv = currentRowGroupSatisfiesRowIdFilter(constraints[i]);
|
||||
} else {
|
||||
// printf("column = %d\n", column);
|
||||
// std::unique_ptr<parquet::ColumnChunkMetaData> md = rowGroupMetadata->ColumnChunk(column);
|
||||
|
@ -41,13 +62,18 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() {
|
|||
|
||||
bool ParquetCursor::nextRowGroup() {
|
||||
start:
|
||||
if((rowGroupId + 1) >= numRowGroups)
|
||||
// Ensure that rowId points at the start of this rowGroup (eg, in the case where
|
||||
// we skipped an entire row group).
|
||||
rowId = rowGroupStartRowId + rowGroupSize;
|
||||
|
||||
if((rowGroupId + 1) >= numRowGroups) {
|
||||
return false;
|
||||
}
|
||||
|
||||
rowGroupStartRowId = rowId;
|
||||
rowGroupId++;
|
||||
rowGroupMetadata = reader->metadata()->RowGroup(rowGroupId);
|
||||
rowsLeftInRowGroup = rowGroupMetadata->num_rows();
|
||||
rowGroupSize = rowsLeftInRowGroup = rowGroupMetadata->num_rows();
|
||||
rowGroup = reader->RowGroup(rowGroupId);
|
||||
for(unsigned int i = 0; i < scanners.size(); i++)
|
||||
scanners[i] = NULL;
|
||||
|
@ -69,6 +95,9 @@ start:
|
|||
colRows[i] = rowId;
|
||||
}
|
||||
|
||||
// Increment rowId so currentRowGroupSatisfiesRowIdFilter can access it;
|
||||
// it'll get decremented by our caller
|
||||
rowId++;
|
||||
if(!currentRowGroupSatisfiesFilter())
|
||||
goto start;
|
||||
|
||||
|
@ -106,8 +135,12 @@ start:
|
|||
if(rowsLeftInRowGroup == 0) {
|
||||
if(!nextRowGroup()) {
|
||||
// put rowId over the edge so eof returns true
|
||||
rowId++;
|
||||
rowId = numRows + 1;
|
||||
return;
|
||||
} else {
|
||||
// After a successful nextRowGroup, rowId is pointing at the current row. Make it
|
||||
// point before so the rest of the logic works out.
|
||||
rowId--;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -395,6 +428,8 @@ void ParquetCursor::reset(std::vector<Constraint> constraints) {
|
|||
reader = parquet::ParquetFileReader::OpenFile(table->file.data());
|
||||
|
||||
rowGroupId = -1;
|
||||
rowGroupSize = 0;
|
||||
rowGroupStartRowId = -1;
|
||||
// TODO: handle the case where rowgroups have disjoint schemas?
|
||||
// TODO: or at least, fail fast if detected
|
||||
rowsLeftInRowGroup = 0;
|
||||
|
|
|
@ -17,13 +17,14 @@ class ParquetCursor {
|
|||
|
||||
std::vector<int> colRows;
|
||||
std::vector<bool> colNulls;
|
||||
std::vector<uintptr_t> colIntValues;
|
||||
std::vector<int64_t> colIntValues;
|
||||
std::vector<double> colDoubleValues;
|
||||
std::vector<parquet::ByteArray> colByteArrayValues;
|
||||
|
||||
int rowId;
|
||||
int rowGroupId;
|
||||
int rowGroupStartRowId;
|
||||
int rowGroupSize;
|
||||
int numRows;
|
||||
int numRowGroups;
|
||||
int rowsLeftInRowGroup;
|
||||
|
@ -32,11 +33,13 @@ class ParquetCursor {
|
|||
|
||||
std::vector<Constraint> constraints;
|
||||
|
||||
bool currentRowGroupSatisfiesRowIdFilter(Constraint constraint);
|
||||
bool currentRowSatisfiesFilter();
|
||||
bool currentRowGroupSatisfiesFilter();
|
||||
|
||||
public:
|
||||
ParquetCursor(ParquetTable* table);
|
||||
int getRowId();
|
||||
bool currentRowSatisfiesFilter();
|
||||
bool currentRowGroupSatisfiesFilter();
|
||||
void next();
|
||||
void close();
|
||||
void reset(std::vector<Constraint> constraints);
|
||||
|
|
|
@ -5,7 +5,7 @@ Constraint::Constraint(
|
|||
ConstraintOperator op,
|
||||
ValueType type,
|
||||
bool boolValue,
|
||||
uintptr_t intValue,
|
||||
int64_t intValue,
|
||||
double doubleValue,
|
||||
std::vector<unsigned char> blobValue
|
||||
) {
|
||||
|
@ -34,7 +34,7 @@ bool Constraint::getBool() {
|
|||
return boolValue;
|
||||
}
|
||||
|
||||
uintptr_t Constraint::getInt() {
|
||||
int64_t Constraint::getInt() {
|
||||
return intValue;
|
||||
}
|
||||
|
||||
|
|
|
@ -36,7 +36,7 @@ class Constraint {
|
|||
ValueType type;
|
||||
|
||||
bool boolValue;
|
||||
uintptr_t intValue;
|
||||
int64_t intValue;
|
||||
double doubleValue;
|
||||
// Doubles as string value
|
||||
std::vector<unsigned char> blobValue;
|
||||
|
@ -48,7 +48,7 @@ public:
|
|||
ConstraintOperator op,
|
||||
ValueType type,
|
||||
bool boolValue,
|
||||
uintptr_t intValue,
|
||||
int64_t intValue,
|
||||
double doubleValue,
|
||||
std::vector<unsigned char> blobValue
|
||||
);
|
||||
|
@ -57,7 +57,7 @@ public:
|
|||
ConstraintOperator getOperator();
|
||||
ValueType getType();
|
||||
bool getBool();
|
||||
uintptr_t getInt();
|
||||
int64_t getInt();
|
||||
double getDouble();
|
||||
std::vector<unsigned char> getBytes();
|
||||
};
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
select count(*) from no_nulls1 where rowid >= 0
|
||||
99
|
|
@ -0,0 +1,2 @@
|
|||
select count(*) from no_nulls1 where rowid > 0
|
||||
98
|
|
@ -0,0 +1,2 @@
|
|||
select count(*) from no_nulls2 where rowid > 0
|
||||
98
|
|
@ -0,0 +1,2 @@
|
|||
select count(*) from no_nulls2 where rowid >= 0
|
||||
99
|
Loading…
Reference in New Issue