From 6648ff59686197e239bba88c5de4fbb99cfb5f19 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Thu, 15 Mar 2018 20:40:21 -0400 Subject: [PATCH] add string == row group filter For the statscan census set filtering on `== 'Dawson Creek'`, the query goes from 980ms to 660ms. This is expected, since the data isn't sorted by that column. I'll try adding some scaffolding to do filtering at the row level, too. We could also try unpacking the dictionary and testing the individual values, although we may want some heuristics to decide whether it's worth doing -- eg if < 10% of the rows have a unique value. Ideally, this should be like a ~1ms query. --- parquet/parquet_cursor.cc | 10 +++++++--- parquet/parquet_filter.cc | 9 ++++++++- parquet/parquet_filter.h | 7 +++++-- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/parquet/parquet_cursor.cc b/parquet/parquet_cursor.cc index 5fa66db..e2afbbd 100644 --- a/parquet/parquet_cursor.cc +++ b/parquet/parquet_cursor.cc @@ -38,16 +38,21 @@ bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint constraint, st return true; } + if(constraint.getType() != Text) { + return true; + } + + std::string str = constraint.getString(); parquet::ByteArray min = stats->min(); parquet::ByteArray max = stats->max(); std::string minStr((const char*)min.ptr, min.len); std::string maxStr((const char*)max.ptr, max.len); - printf("min=%s [%d], max=%s [%d]\n", minStr.data(), min.len, maxStr.data(), max.len); +// printf("min=%s [%d], max=%s [%d], target=%s\n", minStr.data(), min.len, maxStr.data(), max.len, str.data()); switch(constraint.getOperator()) { case Is: case Equal: - + return str >= minStr && str <= maxStr; case GreaterThan: case GreaterThanOrEqual: case LessThan: @@ -77,7 +82,6 @@ bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter(Constraint constraint, // data, which provides substantial performance benefits. bool ParquetCursor::currentRowGroupSatisfiesFilter() { for(unsigned int i = 0; i < constraints.size(); i++) { - ValueType type = constraints[i].getType(); int column = constraints[i].getColumn(); int op = constraints[i].getOperator(); bool rv = true; diff --git a/parquet/parquet_filter.cc b/parquet/parquet_filter.cc index ab772cf..503ff4a 100644 --- a/parquet/parquet_filter.cc +++ b/parquet/parquet_filter.cc @@ -14,6 +14,9 @@ Constraint::Constraint( this->intValue = intValue; this->doubleValue = doubleValue; this->blobValue = blobValue; + + if(type == Text) + stringValue = std::string((char*)&blobValue[0], blobValue.size()); } int Constraint::getColumn() { @@ -36,6 +39,10 @@ double Constraint::getDouble() { return doubleValue; } -std::vector Constraint::getBytes() { +const std::vector Constraint::getBytes() { return blobValue; } + +std::string Constraint::getString() { + return stringValue; +} diff --git a/parquet/parquet_filter.h b/parquet/parquet_filter.h index 08c073e..55e6d7d 100644 --- a/parquet/parquet_filter.h +++ b/parquet/parquet_filter.h @@ -2,6 +2,7 @@ #define PARQUET_FILTER_H #include +#include #include enum ConstraintOperator { @@ -36,8 +37,9 @@ class Constraint { int64_t intValue; double doubleValue; - // Doubles as string value std::vector blobValue; + // Only set when blobValue is set + std::string stringValue; public: // Kind of a messy constructor function, but it's just for internal use, so whatever. @@ -55,7 +57,8 @@ public: ValueType getType(); int64_t getInt(); double getDouble(); - std::vector getBytes(); + const std::vector getBytes(); + std::string getString(); }; #endif