Add rowgroup filtering for rowid
This commit is contained in:
parent
1f938a005d
commit
acc15256ec
|
@ -396,7 +396,7 @@ static int parquetFilter(
|
||||||
|
|
||||||
ValueType type = Null;
|
ValueType type = Null;
|
||||||
bool boolValue = false;
|
bool boolValue = false;
|
||||||
uintptr_t intValue = 0;
|
int64_t intValue = 0;
|
||||||
double doubleValue = 0;
|
double doubleValue = 0;
|
||||||
std::vector<unsigned char> blobValue;
|
std::vector<unsigned char> blobValue;
|
||||||
int sqliteType = sqlite3_value_type(argv[j]);
|
int sqliteType = sqlite3_value_type(argv[j]);
|
||||||
|
|
|
@ -6,6 +6,29 @@ ParquetCursor::ParquetCursor(ParquetTable* table) {
|
||||||
reset(std::vector<Constraint>());
|
reset(std::vector<Constraint>());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool ParquetCursor::currentRowGroupSatisfiesRowIdFilter(Constraint constraint) {
|
||||||
|
int64_t target = constraint.getInt();
|
||||||
|
switch(constraint.getOperator()) {
|
||||||
|
case IsNull:
|
||||||
|
return false;
|
||||||
|
case Is:
|
||||||
|
case Equal:
|
||||||
|
return target >= rowId && target < rowId + rowGroupSize;
|
||||||
|
case GreaterThan:
|
||||||
|
// rowId > target
|
||||||
|
return rowId + rowGroupSize > target;
|
||||||
|
case GreaterThanOrEqual:
|
||||||
|
// rowId >= target
|
||||||
|
return rowId + rowGroupSize >= rowId;
|
||||||
|
case LessThan:
|
||||||
|
return target > rowId;
|
||||||
|
case LessThanOrEqual:
|
||||||
|
return target >= rowId;
|
||||||
|
default:
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Return true if it is _possible_ that the current
|
// Return true if it is _possible_ that the current
|
||||||
// rowgroup satisfies the constraints. Only return false
|
// rowgroup satisfies the constraints. Only return false
|
||||||
// if it definitely does not.
|
// if it definitely does not.
|
||||||
|
@ -19,9 +42,7 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() {
|
||||||
bool rv = true;
|
bool rv = true;
|
||||||
|
|
||||||
if(column == -1) {
|
if(column == -1) {
|
||||||
if(op == IsNull) {
|
rv = currentRowGroupSatisfiesRowIdFilter(constraints[i]);
|
||||||
return false;
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
// printf("column = %d\n", column);
|
// printf("column = %d\n", column);
|
||||||
// std::unique_ptr<parquet::ColumnChunkMetaData> md = rowGroupMetadata->ColumnChunk(column);
|
// std::unique_ptr<parquet::ColumnChunkMetaData> md = rowGroupMetadata->ColumnChunk(column);
|
||||||
|
@ -41,13 +62,18 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() {
|
||||||
|
|
||||||
bool ParquetCursor::nextRowGroup() {
|
bool ParquetCursor::nextRowGroup() {
|
||||||
start:
|
start:
|
||||||
if((rowGroupId + 1) >= numRowGroups)
|
// Ensure that rowId points at the start of this rowGroup (eg, in the case where
|
||||||
|
// we skipped an entire row group).
|
||||||
|
rowId = rowGroupStartRowId + rowGroupSize;
|
||||||
|
|
||||||
|
if((rowGroupId + 1) >= numRowGroups) {
|
||||||
return false;
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
rowGroupStartRowId = rowId;
|
rowGroupStartRowId = rowId;
|
||||||
rowGroupId++;
|
rowGroupId++;
|
||||||
rowGroupMetadata = reader->metadata()->RowGroup(rowGroupId);
|
rowGroupMetadata = reader->metadata()->RowGroup(rowGroupId);
|
||||||
rowsLeftInRowGroup = rowGroupMetadata->num_rows();
|
rowGroupSize = rowsLeftInRowGroup = rowGroupMetadata->num_rows();
|
||||||
rowGroup = reader->RowGroup(rowGroupId);
|
rowGroup = reader->RowGroup(rowGroupId);
|
||||||
for(unsigned int i = 0; i < scanners.size(); i++)
|
for(unsigned int i = 0; i < scanners.size(); i++)
|
||||||
scanners[i] = NULL;
|
scanners[i] = NULL;
|
||||||
|
@ -69,6 +95,9 @@ start:
|
||||||
colRows[i] = rowId;
|
colRows[i] = rowId;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Increment rowId so currentRowGroupSatisfiesRowIdFilter can access it;
|
||||||
|
// it'll get decremented by our caller
|
||||||
|
rowId++;
|
||||||
if(!currentRowGroupSatisfiesFilter())
|
if(!currentRowGroupSatisfiesFilter())
|
||||||
goto start;
|
goto start;
|
||||||
|
|
||||||
|
@ -106,8 +135,12 @@ start:
|
||||||
if(rowsLeftInRowGroup == 0) {
|
if(rowsLeftInRowGroup == 0) {
|
||||||
if(!nextRowGroup()) {
|
if(!nextRowGroup()) {
|
||||||
// put rowId over the edge so eof returns true
|
// put rowId over the edge so eof returns true
|
||||||
rowId++;
|
rowId = numRows + 1;
|
||||||
return;
|
return;
|
||||||
|
} else {
|
||||||
|
// After a successful nextRowGroup, rowId is pointing at the current row. Make it
|
||||||
|
// point before so the rest of the logic works out.
|
||||||
|
rowId--;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -395,6 +428,8 @@ void ParquetCursor::reset(std::vector<Constraint> constraints) {
|
||||||
reader = parquet::ParquetFileReader::OpenFile(table->file.data());
|
reader = parquet::ParquetFileReader::OpenFile(table->file.data());
|
||||||
|
|
||||||
rowGroupId = -1;
|
rowGroupId = -1;
|
||||||
|
rowGroupSize = 0;
|
||||||
|
rowGroupStartRowId = -1;
|
||||||
// TODO: handle the case where rowgroups have disjoint schemas?
|
// TODO: handle the case where rowgroups have disjoint schemas?
|
||||||
// TODO: or at least, fail fast if detected
|
// TODO: or at least, fail fast if detected
|
||||||
rowsLeftInRowGroup = 0;
|
rowsLeftInRowGroup = 0;
|
||||||
|
|
|
@ -17,13 +17,14 @@ class ParquetCursor {
|
||||||
|
|
||||||
std::vector<int> colRows;
|
std::vector<int> colRows;
|
||||||
std::vector<bool> colNulls;
|
std::vector<bool> colNulls;
|
||||||
std::vector<uintptr_t> colIntValues;
|
std::vector<int64_t> colIntValues;
|
||||||
std::vector<double> colDoubleValues;
|
std::vector<double> colDoubleValues;
|
||||||
std::vector<parquet::ByteArray> colByteArrayValues;
|
std::vector<parquet::ByteArray> colByteArrayValues;
|
||||||
|
|
||||||
int rowId;
|
int rowId;
|
||||||
int rowGroupId;
|
int rowGroupId;
|
||||||
int rowGroupStartRowId;
|
int rowGroupStartRowId;
|
||||||
|
int rowGroupSize;
|
||||||
int numRows;
|
int numRows;
|
||||||
int numRowGroups;
|
int numRowGroups;
|
||||||
int rowsLeftInRowGroup;
|
int rowsLeftInRowGroup;
|
||||||
|
@ -32,11 +33,13 @@ class ParquetCursor {
|
||||||
|
|
||||||
std::vector<Constraint> constraints;
|
std::vector<Constraint> constraints;
|
||||||
|
|
||||||
|
bool currentRowGroupSatisfiesRowIdFilter(Constraint constraint);
|
||||||
|
bool currentRowSatisfiesFilter();
|
||||||
|
bool currentRowGroupSatisfiesFilter();
|
||||||
|
|
||||||
public:
|
public:
|
||||||
ParquetCursor(ParquetTable* table);
|
ParquetCursor(ParquetTable* table);
|
||||||
int getRowId();
|
int getRowId();
|
||||||
bool currentRowSatisfiesFilter();
|
|
||||||
bool currentRowGroupSatisfiesFilter();
|
|
||||||
void next();
|
void next();
|
||||||
void close();
|
void close();
|
||||||
void reset(std::vector<Constraint> constraints);
|
void reset(std::vector<Constraint> constraints);
|
||||||
|
|
|
@ -5,7 +5,7 @@ Constraint::Constraint(
|
||||||
ConstraintOperator op,
|
ConstraintOperator op,
|
||||||
ValueType type,
|
ValueType type,
|
||||||
bool boolValue,
|
bool boolValue,
|
||||||
uintptr_t intValue,
|
int64_t intValue,
|
||||||
double doubleValue,
|
double doubleValue,
|
||||||
std::vector<unsigned char> blobValue
|
std::vector<unsigned char> blobValue
|
||||||
) {
|
) {
|
||||||
|
@ -34,7 +34,7 @@ bool Constraint::getBool() {
|
||||||
return boolValue;
|
return boolValue;
|
||||||
}
|
}
|
||||||
|
|
||||||
uintptr_t Constraint::getInt() {
|
int64_t Constraint::getInt() {
|
||||||
return intValue;
|
return intValue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -36,7 +36,7 @@ class Constraint {
|
||||||
ValueType type;
|
ValueType type;
|
||||||
|
|
||||||
bool boolValue;
|
bool boolValue;
|
||||||
uintptr_t intValue;
|
int64_t intValue;
|
||||||
double doubleValue;
|
double doubleValue;
|
||||||
// Doubles as string value
|
// Doubles as string value
|
||||||
std::vector<unsigned char> blobValue;
|
std::vector<unsigned char> blobValue;
|
||||||
|
@ -48,7 +48,7 @@ public:
|
||||||
ConstraintOperator op,
|
ConstraintOperator op,
|
||||||
ValueType type,
|
ValueType type,
|
||||||
bool boolValue,
|
bool boolValue,
|
||||||
uintptr_t intValue,
|
int64_t intValue,
|
||||||
double doubleValue,
|
double doubleValue,
|
||||||
std::vector<unsigned char> blobValue
|
std::vector<unsigned char> blobValue
|
||||||
);
|
);
|
||||||
|
@ -57,7 +57,7 @@ public:
|
||||||
ConstraintOperator getOperator();
|
ConstraintOperator getOperator();
|
||||||
ValueType getType();
|
ValueType getType();
|
||||||
bool getBool();
|
bool getBool();
|
||||||
uintptr_t getInt();
|
int64_t getInt();
|
||||||
double getDouble();
|
double getDouble();
|
||||||
std::vector<unsigned char> getBytes();
|
std::vector<unsigned char> getBytes();
|
||||||
};
|
};
|
||||||
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
select count(*) from no_nulls1 where rowid >= 0
|
||||||
|
99
|
|
@ -0,0 +1,2 @@
|
||||||
|
select count(*) from no_nulls1 where rowid > 0
|
||||||
|
98
|
|
@ -0,0 +1,2 @@
|
||||||
|
select count(*) from no_nulls2 where rowid > 0
|
||||||
|
98
|
|
@ -0,0 +1,2 @@
|
||||||
|
select count(*) from no_nulls2 where rowid >= 0
|
||||||
|
99
|
Loading…
Reference in New Issue