mirror of
https://github.com/cldellow/sqlite-parquet-vtable.git
synced 2025-06-30 16:43:30 +00:00

For the statscan census set filtering on `== 'Dawson Creek'`, the query goes from 980ms to 660ms. This is expected, since the data isn't sorted by that column. I'll try adding some scaffolding to do filtering at the row level, too. We could also try unpacking the dictionary and testing the individual values, although we may want some heuristics to decide whether it's worth doing -- e.g., if < 10% of the rows have a unique value. Ideally, this should be a ~1ms query.
65 lines
1.1 KiB
C++
65 lines
1.1 KiB
C++
#ifndef PARQUET_FILTER_H
|
|
#define PARQUET_FILTER_H
|
|
|
|
#include <vector>
|
|
#include <string>
|
|
#include <cstdint>
|
|
|
|
// Comparison operator carried by a Constraint.
//
// Enumerators use implicit values starting at 0 (Equal == 0 ... Is == 13);
// do not reorder them, since other code may depend on the numeric values.
// NOTE(review): the order looks like it mirrors SQLite's
// SQLITE_INDEX_CONSTRAINT_* list (EQ, GT, LE, LT, GE, MATCH, ...) -- confirm
// against the code that maps SQLite constraints onto this enum.
enum ConstraintOperator {
  Equal,
  GreaterThan,
  LessThanOrEqual,
  LessThan,
  GreaterThanOrEqual,
  Match,
  Like,
  Glob,
  Regexp,
  NotEqual,
  IsNot,
  IsNotNull,
  IsNull,
  Is
};
|
|
|
|
// Kind of value a Constraint compares against.
//
// Enumerators use implicit values starting at 0 (Null == 0 ... Text == 4);
// keep the order stable.
// NOTE(review): presumably these correspond to SQLite's value classes
// (NULL / INTEGER / FLOAT / BLOB / TEXT) -- confirm against the caller.
enum ValueType {
  Null,
  Integer,
  Double,
  Blob,
  Text
};
|
|
|
|
class Constraint {
|
|
int column; // underlying column in the query
|
|
ConstraintOperator op;
|
|
ValueType type;
|
|
|
|
int64_t intValue;
|
|
double doubleValue;
|
|
std::vector<unsigned char> blobValue;
|
|
// Only set when blobValue is set
|
|
std::string stringValue;
|
|
|
|
public:
|
|
// Kind of a messy constructor function, but it's just for internal use, so whatever.
|
|
Constraint(
|
|
int column,
|
|
ConstraintOperator op,
|
|
ValueType type,
|
|
int64_t intValue,
|
|
double doubleValue,
|
|
std::vector<unsigned char> blobValue
|
|
);
|
|
|
|
int getColumn();
|
|
ConstraintOperator getOperator();
|
|
ValueType getType();
|
|
int64_t getInt();
|
|
double getDouble();
|
|
const std::vector<unsigned char> getBytes();
|
|
std::string getString();
|
|
};
|
|
|
|
#endif
|