1
0
mirror of https://github.com/cldellow/sqlite-parquet-vtable.git synced 2025-06-30 16:43:30 +00:00
sqlite-parquet-vtable/parquet/parquet_filter.h
Colin Dellow 6648ff5968 add string == row group filter
For the statscan census set filtering on `== 'Dawson Creek'`, the query
goes from 980ms to 660ms.

This is expected, since the data isn't sorted by that column.

I'll try adding some scaffolding to do filtering at the row level, too.

We could also try unpacking the dictionary and testing the individual
values, although we may want some heuristics to decide whether it's
worth doing -- e.g., if < 10% of the rows have a unique value.

Ideally, this should be like a ~1ms query.
2018-03-15 20:40:21 -04:00

65 lines
1.1 KiB
C++

#ifndef PARQUET_FILTER_H
#define PARQUET_FILTER_H
#include <vector>
#include <string>
#include <cstdint>
// Comparison operator carried by a Constraint (see Constraint::getOperator()).
//
// Enumerators rely on the default implicit numbering (0, 1, 2, ...), so the
// declaration order below is significant for any code that depends on the
// numeric values.
//
// NOTE(review): the names look like they mirror SQLite's
// SQLITE_INDEX_CONSTRAINT_* operators (EQ, GT, LE, LT, GE, MATCH, LIKE, ...)
// -- confirm against the code that constructs Constraint objects.
enum ConstraintOperator {
  Equal,               // 0
  GreaterThan,         // 1
  LessThanOrEqual,     // 2
  LessThan,            // 3
  GreaterThanOrEqual,  // 4
  Match,               // 5
  Like,                // 6
  Glob,                // 7
  Regexp,              // 8
  NotEqual,            // 9
  IsNot,               // 10
  IsNotNull,           // 11
  IsNull,              // 12
  Is                   // 13
};
// Tag describing the kind of value stored in a Constraint
// (see Constraint::getType()).
//
// Enumerators use the default implicit numbering, starting at 0.
//
// NOTE(review): presumably this selects which of Constraint's value fields
// (intValue / doubleValue / blobValue / stringValue) is meaningful -- confirm
// against the Constraint implementation.
enum ValueType {
  Null,     // 0
  Integer,  // 1
  Double,   // 2
  Blob,     // 3
  Text      // 4
};
// A single filter constraint: a column, an operator, and a typed value.
//
// Only the declaration lives in this header; the constructor and accessors
// are defined elsewhere.
class Constraint {
public:
  // Kind of a messy constructor function, but it's just for internal use,
  // so whatever.
  //
  // Takes one argument per stored field except stringValue, which has no
  // corresponding parameter.
  Constraint(
    int column,
    ConstraintOperator op,
    ValueType type,
    int64_t intValue,
    double doubleValue,
    std::vector<unsigned char> blobValue
  );

  // Accessors. Presumably each returns the matching private field below --
  // confirm against the out-of-line definitions.
  //
  // NOTE(review): none of these are const-qualified, so they cannot be called
  // through a const Constraint; fixing that requires changing the definitions
  // in the implementation file as well.
  int getColumn();
  ConstraintOperator getOperator();
  ValueType getType();
  int64_t getInt();
  double getDouble();
  // Returns the byte vector by value (declared const).
  const std::vector<unsigned char> getBytes();
  std::string getString();

private:
  // Data members are kept in their original declaration order, which fixes
  // both their initialization order and their layout.
  int column; // underlying column in the query
  ConstraintOperator op;
  ValueType type;
  int64_t intValue;
  double doubleValue;
  std::vector<unsigned char> blobValue;
  // Only set when blobValue is set
  std::string stringValue;
};
#endif