mirror of
https://github.com/cldellow/sqlite-parquet-vtable.git
synced 2025-02-26 06:39:45 +00:00

This reverts commit cbde3c73b601383fff33ce501d0b26047326e93f.

This regresses:

```
WITH inputs AS (
  SELECT geo_name, CASE WHEN profile_id = 1930 THEN 'total' ELSE 'cyclist' END AS mode, female, male
  FROM census
  WHERE profile_id IN ( '1930', '1935') AND csd_type_name = 'CY' AND geo_name IN ('Victoria', 'Dawson Creek', 'Kitchener')
)
SELECT total.geo_name, cyclist.male, cyclist.female, 100.0 * cyclist.male / total.male, 100.0 * cyclist.female / total.female
FROM inputs AS total JOIN inputs AS cyclist USING (geo_name)
WHERE total.mode = 'total' AND cyclist.mode = 'cyclist';
```

while improving:

```
select count(*) from census where geo_name in ('Dawson Creek', 'Kitchener', 'Victoria') and csd_type_name = 'CY' and profile_id = '1930';
```

which seems like a bad tradeoff.
123 lines
2.9 KiB
C++
123 lines
2.9 KiB
C++
#ifndef PARQUET_FILTER_H
|
|
#define PARQUET_FILTER_H
|
|
|
|
#include <vector>
|
|
#include <string>
|
|
#include <cstdint>
|
|
|
|
// The comparison a Constraint applies to its column (stored in Constraint::op).
//
// NOTE(review): the names look like they mirror the operators SQLite hands to
// a virtual table's xBestIndex (SQLITE_INDEX_CONSTRAINT_*) -- confirm against
// the code that maps SQLite's operator codes onto this enum.
//
// This is an unscoped enum with implicit values (Equal == 0 ... Is == 13).
// Append new operators at the end so existing numeric values do not shift.
enum ConstraintOperator {
  Equal,
  GreaterThan,
  LessThanOrEqual,
  LessThan,
  GreaterThanOrEqual,
  Match,
  Like,
  Glob,
  Regexp,
  NotEqual,
  IsNot,
  IsNotNull,
  IsNull,
  Is
};
|
|
|
|
// The kind of value carried by a Constraint (stored in Constraint::type).
//
// NOTE(review): presumably this selects which of Constraint's value fields
// (intValue / doubleValue / blobValue / stringValue) is meaningful -- confirm
// against the Constraint constructor, which is defined outside this header.
//
// This is an unscoped enum with implicit values (Null == 0 ... Text == 4).
enum ValueType {
  Null,
  Integer,
  Double,
  Blob,
  Text
};
|
|
|
|
// Two bitmaps with one bit per row group, packed eight row groups to a byte
// (row group N lives in byte N / 8, bit N % 8):
//
//   - estimatedMembership: a bit is cleared only when the row group
//     definitely does not have rows.
//   - actualMembership: a bit is cleared only after exhausting all rows of
//     the row group.
//
// A set bit therefore means "may have rows". Keeping both bitmaps lets the
// estimated rowGroupFilter results be compared against the results observed
// when the row group was actually explored.
//
// Row groups beyond the end of a bitmap are treated as "may have rows": reads
// report true and writes grow the bitmap, padding with set bits. This keeps
// the accessors well-defined when the vectors supplied to the two-vector
// constructor (or assigned to the public members) are shorter than the row
// group being addressed.
class RowGroupBitmap {
  typedef std::vector<unsigned char>::size_type ByteIndex;

  // Number of bytes needed to hold one bit for each of `totalRowGroups`.
  // Written as divide-then-round-up so it cannot wrap for very large inputs.
  static ByteIndex bytesFor(unsigned int totalRowGroups) {
    return static_cast<ByteIndex>(totalRowGroups / 8) +
      (totalRowGroups % 8 != 0 ? 1 : 0);
  }

  // Sets (isSet == true) or clears (isSet == false) the bit for `rowGroup`
  // in `membership`, growing the vector first if the bit is out of range.
  void setBit(std::vector<unsigned char>& membership, unsigned int rowGroup, bool isSet) {
    const ByteIndex byte = rowGroup / 8;
    const unsigned int offset = rowGroup % 8;

    if(byte >= membership.size()) {
      // New bytes default to "all row groups match", same as the constructor.
      membership.resize(byte + 1, static_cast<unsigned char>(0xFF));
    }

    const unsigned int mask = 1U << offset;
    const unsigned int current = membership[byte];
    membership[byte] = static_cast<unsigned char>(isSet ? (current | mask) : (current & ~mask));
  }

public:
  // Creates bitmaps covering `totalRowGroups` row groups.
  //
  // Initialize everything to assume that all row groups match.
  // As we discover otherwise, we'll update that assumption.
  RowGroupBitmap(unsigned int totalRowGroups) :
    estimatedMembership(bytesFor(totalRowGroups), static_cast<unsigned char>(0xFF)),
    actualMembership(bytesFor(totalRowGroups), static_cast<unsigned char>(0xFF)) {
  }

  // Creates bitmaps from previously computed membership bytes.
  RowGroupBitmap(
    std::vector<unsigned char> estimatedMembership,
    std::vector<unsigned char> actualMembership) :
    estimatedMembership(estimatedMembership),
    actualMembership(actualMembership) {
  }

  std::vector<unsigned char> estimatedMembership;
  std::vector<unsigned char> actualMembership;

  // Pass false only if definitely does not have rows
  void setEstimatedMembership(unsigned int rowGroup, bool hasRows) {
    setBit(estimatedMembership, rowGroup, hasRows);
  }

  // Pass false only after exhausting all rows
  void setActualMembership(unsigned int rowGroup, bool hadRows) {
    setBit(actualMembership, rowGroup, hadRows);
  }

  // Returns the actual-membership bit for `rowGroup`: false only if it was
  // explicitly cleared via setActualMembership (or in the supplied bytes).
  // A row group past the end of the bitmap has never been cleared, so it
  // reports true.
  bool getActualMembership(unsigned int rowGroup) const {
    const ByteIndex byte = rowGroup / 8;
    if(byte >= actualMembership.size()) {
      return true;
    }

    return ((actualMembership[byte] >> (rowGroup % 8)) & 1U) != 0;
  }
};
|
|
|
|
class Constraint {
|
|
public:
|
|
// Kind of a messy constructor function, but it's just for internal use, so whatever.
|
|
Constraint(
|
|
RowGroupBitmap bitmap,
|
|
int column,
|
|
std::string columnName,
|
|
ConstraintOperator op,
|
|
ValueType type,
|
|
int64_t intValue,
|
|
double doubleValue,
|
|
std::vector<unsigned char> blobValue
|
|
);
|
|
|
|
RowGroupBitmap bitmap;
|
|
int column; // underlying column in the query
|
|
std::string columnName;
|
|
ConstraintOperator op;
|
|
ValueType type;
|
|
|
|
int64_t intValue;
|
|
double doubleValue;
|
|
std::vector<unsigned char> blobValue;
|
|
// Only set when blobValue is set
|
|
std::string stringValue;
|
|
|
|
// Only set when stringValue is set and op == Like
|
|
std::string likeStringValue;
|
|
|
|
// A unique identifier for this constraint, e.g.
|
|
// col0 = 'Dawson Creek'
|
|
std::string describe() const;
|
|
|
|
// This is a temp field used while evaluating if a rowgroup had rows
|
|
// that matched this constraint.
|
|
int rowGroupId;
|
|
bool hadRows;
|
|
};
|
|
|
|
#endif
|