Cache clauses -> row group mapping

Create a shadow table. For `stats`, it'd be `_stats_rowgroups`.

It contains three columns:

- the clause (eg `city = 'Dawson Creek'`)
- the initial estimate, as a bitmap of rowgroups based on stats
- the actual observed rowgroups, as a bitmap

This papers over poorly sorted parquet files, at the cost of some disk
space. It makes interactive queries much more natural -- drilldown style
queries are much faster, as they can leverage work done by previous
queries.

eg 'SELECT * FROM stats WHERE city = 'Dawson Creek' and question_id >= 1935 and question_id <= 1940`
takes ~584ms on first run, but 9ms on subsequent runs.

We only create entries when the estimates don't match the actual
results.

Fixes #6
This commit is contained in:
Colin Dellow 2018-03-24 23:51:15 -04:00
parent d2c736f25a
commit d3ab5ff3e7
9 changed files with 397 additions and 63 deletions

View File

@ -72,6 +72,18 @@ constraints before returning control to SQLite's virtual machine. This minimizes
the number of allocations performed when many rows are filtered out by the number of allocations performed when many rows are filtered out by
the user's criteria. the user's criteria.
### Memoized slices
Individual clauses are mapped to the row groups they match.
eg going on row group statistics, which store minimum and maximum values, a clause
like `WHERE city = 'Dawson Creek'` may match 80% of row groups.
In reality, it may only be present in one or two row groups.
This is recorded in a shadow table so future queries that contain that clause
can read only the necessary row groups.
### Types ### Types
These Parquet types are supported: These Parquet types are supported:

View File

@ -17,6 +17,7 @@ SQLITE_EXTENSION_INIT1
#include <stdarg.h> #include <stdarg.h>
#include <ctype.h> #include <ctype.h>
#include <stdio.h> #include <stdio.h>
#include <iomanip>
#include <memory> #include <memory>
@ -32,6 +33,7 @@ static int parquetConnect(sqlite3*, void*, int, const char*const*,
sqlite3_vtab**,char**); sqlite3_vtab**,char**);
static int parquetBestIndex(sqlite3_vtab*,sqlite3_index_info*); static int parquetBestIndex(sqlite3_vtab*,sqlite3_index_info*);
static int parquetDisconnect(sqlite3_vtab*); static int parquetDisconnect(sqlite3_vtab*);
static int parquetDestroy(sqlite3_vtab*);
static int parquetOpen(sqlite3_vtab*, sqlite3_vtab_cursor**); static int parquetOpen(sqlite3_vtab*, sqlite3_vtab_cursor**);
static int parquetClose(sqlite3_vtab_cursor*); static int parquetClose(sqlite3_vtab_cursor*);
static int parquetFilter(sqlite3_vtab_cursor*, int idxNum, const char *idxStr, static int parquetFilter(sqlite3_vtab_cursor*, int idxNum, const char *idxStr,
@ -45,6 +47,7 @@ static int parquetRowid(sqlite3_vtab_cursor*,sqlite3_int64*);
typedef struct sqlite3_vtab_parquet { typedef struct sqlite3_vtab_parquet {
sqlite3_vtab base; /* Base class. Must be first */ sqlite3_vtab base; /* Base class. Must be first */
ParquetTable* table; ParquetTable* table;
sqlite3* db;
} sqlite3_vtab_parquet; } sqlite3_vtab_parquet;
@ -54,6 +57,21 @@ typedef struct sqlite3_vtab_cursor_parquet {
ParquetCursor* cursor; ParquetCursor* cursor;
} sqlite3_vtab_cursor_parquet; } sqlite3_vtab_cursor_parquet;
static int parquetDestroy(sqlite3_vtab *pVtab) {
sqlite3_vtab_parquet *p = (sqlite3_vtab_parquet*)pVtab;
// Clean up our shadow table. This is useful if the user has recreated
// the parquet file, and our mappings would now be invalid.
std::string drop = "DROP TABLE IF EXISTS _";
drop.append(p->table->getTableName());
drop.append("_rowgroups");
int rv = sqlite3_exec(p->db, drop.data(), 0, 0, 0);
if(rv != 0)
return rv;
return SQLITE_OK;
}
/* /*
** This method is the destructor fo a sqlite3_vtab_parquet object. ** This method is the destructor fo a sqlite3_vtab_parquet object.
*/ */
@ -78,6 +96,7 @@ static int parquetConnect(
return SQLITE_ERROR; return SQLITE_ERROR;
} }
std::string tableName = argv[2];
// Remove the delimiting single quotes // Remove the delimiting single quotes
std::string fname = argv[3]; std::string fname = argv[3];
fname = fname.substr(1, fname.length() - 2); fname = fname.substr(1, fname.length() - 2);
@ -87,7 +106,7 @@ static int parquetConnect(
memset(vtab.get(), 0, sizeof(*vtab.get())); memset(vtab.get(), 0, sizeof(*vtab.get()));
try { try {
std::unique_ptr<ParquetTable> table(new ParquetTable(fname)); std::unique_ptr<ParquetTable> table(new ParquetTable(fname, tableName));
std::string create = table->CreateStatement(); std::string create = table->CreateStatement();
int rc = sqlite3_declare_vtab(db, create.data()); int rc = sqlite3_declare_vtab(db, create.data());
@ -95,6 +114,7 @@ static int parquetConnect(
return rc; return rc;
vtab->table = table.release(); vtab->table = table.release();
vtab->db = db;
*ppVtab = (sqlite3_vtab*)vtab.release(); *ppVtab = (sqlite3_vtab*)vtab.release();
return SQLITE_OK; return SQLITE_OK;
} catch (const std::exception& e) { } catch (const std::exception& e) {
@ -119,16 +139,81 @@ static int parquetCreate(
sqlite3_vtab **ppVtab, sqlite3_vtab **ppVtab,
char **pzErr char **pzErr
){ ){
try {
// Create shadow table for storing constraint -> rowid mappings
std::string create = "CREATE TABLE IF NOT EXISTS _";
create.append(argv[2]);
create.append("_rowgroups(clause TEXT, estimate BLOB, actual BLOB)");
int rv = sqlite3_exec(db, create.data(), 0, 0, 0);
if(rv != 0)
return rv;
create = "CREATE UNIQUE INDEX IF NOT EXISTS _";
create.append(argv[2]);
create.append("_index ON _");
create.append(argv[2]);
create.append("_rowgroups(clause)");
rv = sqlite3_exec(db, create.data(), 0, 0, 0);
return parquetConnect(db, pAux, argc, argv, ppVtab, pzErr); return parquetConnect(db, pAux, argc, argv, ppVtab, pzErr);
} catch (std::bad_alloc& ba) {
return SQLITE_NOMEM;
} }
}
std::string quoteBlob(const std::vector<unsigned char>& bytes) {
std::ostringstream ss;
ss << "X'" << std::hex;
for(unsigned int i = 0; i < bytes.size(); i++) {
ss << std::setfill('0') << std::setw(2) << (unsigned int)(unsigned char)bytes[i];
}
ss << "'";
return ss.str();
}
void persistConstraints(sqlite3* db, ParquetCursor* cursor) {
for(unsigned int i = 0; i < cursor->getNumConstraints(); i++) {
const Constraint& constraint = cursor->getConstraint(i);
const std::vector<unsigned char>& estimated = constraint.bitmap.estimatedMembership;
const std::vector<unsigned char>& actual = constraint.bitmap.actualMembership;
if(estimated == actual) {
continue;
}
std::string desc = constraint.describe();
std::string estimatedStr = quoteBlob(estimated);
std::string actualStr = quoteBlob(actual);
// This is only advisory, so ignore failures.
char* sql = sqlite3_mprintf(
"INSERT OR REPLACE INTO _%s_rowgroups(clause, estimate, actual) VALUES ('%q', %s, %s)",
cursor->getTable()->getTableName().c_str(),
desc.c_str(),
estimatedStr.c_str(),
actualStr.c_str());
if(sql == NULL)
return;
sqlite3_exec(db, sql, 0, 0, 0);
sqlite3_free(sql);
}
}
/* /*
** Destructor for a sqlite3_vtab_cursor_parquet. ** Destructor for a sqlite3_vtab_cursor_parquet.
*/ */
static int parquetClose(sqlite3_vtab_cursor *cur){ static int parquetClose(sqlite3_vtab_cursor *cur){
sqlite3_vtab_cursor_parquet* p = (sqlite3_vtab_cursor_parquet*)cur; sqlite3_vtab_cursor_parquet* vtab_cursor_parquet = (sqlite3_vtab_cursor_parquet*)cur;
p->cursor->close(); sqlite3_vtab_parquet* vtab_parquet = (sqlite3_vtab_parquet*)(vtab_cursor_parquet->base.pVtab);
delete p->cursor; ParquetCursor* cursor = vtab_cursor_parquet->cursor;
persistConstraints(vtab_parquet->db, cursor);
vtab_cursor_parquet->cursor->close();
delete vtab_cursor_parquet->cursor;
sqlite3_free(cur); sqlite3_free(cur);
return SQLITE_OK; return SQLITE_OK;
} }
@ -196,7 +281,8 @@ const char* opName(int op) {
*/ */
static int parquetNext(sqlite3_vtab_cursor *cur){ static int parquetNext(sqlite3_vtab_cursor *cur){
try { try {
ParquetCursor* cursor = ((sqlite3_vtab_cursor_parquet*)cur)->cursor; sqlite3_vtab_cursor_parquet* vtab_cursor_parquet = (sqlite3_vtab_cursor_parquet*)cur;
ParquetCursor* cursor = vtab_cursor_parquet->cursor;
cursor->next(); cursor->next();
return SQLITE_OK; return SQLITE_OK;
} catch(std::bad_alloc& ba) { } catch(std::bad_alloc& ba) {
@ -395,6 +481,38 @@ ConstraintOperator constraintOperatorFromSqlite(int op) {
throw std::invalid_argument(ss.str()); throw std::invalid_argument(ss.str());
} }
std::vector<unsigned char> getRowGroupsForClause(sqlite3* db, std::string table, std::string clause) {
std::vector<unsigned char> rv;
std::unique_ptr<char, void(*)(void*)> sql(sqlite3_mprintf(
"SELECT actual FROM _%s_rowgroups WHERE clause = '%q'",
table.c_str(),
clause.c_str()), sqlite3_free);
if(sql.get() == NULL)
return rv;
sqlite3_stmt* pStmt = NULL;
int rc = sqlite3_prepare_v2(db, sql.get(), -1, &pStmt, NULL);
if(rc != 0)
return rv;
rc = sqlite3_step(pStmt);
if(rc == SQLITE_ROW) {
int size = sqlite3_column_bytes(pStmt, 0);
unsigned char* blob = (unsigned char*)sqlite3_column_blob(pStmt, 0);
// TODO: there is a memory leak here if we get a std::bad_alloc while populating rv;
// we fail to free pStmt
for(int i = 0; i < size; i++) {
rv.push_back(blob[i]);
}
}
sqlite3_finalize(pStmt);
return rv;
}
/* /*
** Only a full table scan is supported. So xFilter simply rewinds to ** Only a full table scan is supported. So xFilter simply rewinds to
** the beginning. ** the beginning.
@ -407,7 +525,10 @@ static int parquetFilter(
sqlite3_value **argv sqlite3_value **argv
){ ){
try { try {
ParquetCursor* cursor = ((sqlite3_vtab_cursor_parquet*)cur)->cursor; sqlite3_vtab_cursor_parquet* vtab_cursor_parquet = (sqlite3_vtab_cursor_parquet*)cur;
sqlite3_vtab_parquet* vtab_parquet = (sqlite3_vtab_parquet*)(vtab_cursor_parquet->base.pVtab);
sqlite3* db = vtab_parquet->db;
ParquetCursor* cursor = vtab_cursor_parquet->cursor;
sqlite3_index_info* indexInfo = (sqlite3_index_info*)idxStr; sqlite3_index_info* indexInfo = (sqlite3_index_info*)idxStr;
#ifdef DEBUG #ifdef DEBUG
@ -451,13 +572,40 @@ static int parquetFilter(
type = Null; type = Null;
} }
Constraint constraint( std::string columnName = "rowid";
if(indexInfo->aConstraint[i].iColumn >= 0) {
columnName = cursor->getTable()->columnName(indexInfo->aConstraint[i].iColumn);
}
RowGroupBitmap bitmap = RowGroupBitmap(cursor->getNumRowGroups());
Constraint dummy(
bitmap,
indexInfo->aConstraint[i].iColumn, indexInfo->aConstraint[i].iColumn,
columnName,
constraintOperatorFromSqlite(indexInfo->aConstraint[i].op), constraintOperatorFromSqlite(indexInfo->aConstraint[i].op),
type, type,
intValue, intValue,
doubleValue, doubleValue,
blobValue); blobValue);
std::vector<unsigned char> actual = getRowGroupsForClause(db, cursor->getTable()->getTableName(), dummy.describe());
if(actual.size() > 0) {
// Initialize the estimate to be the actual -- eventually they'll converge
// and we'll stop writing back to the db.
std::vector<unsigned char> estimate = actual;
bitmap = RowGroupBitmap(estimate, actual);
}
Constraint constraint(
bitmap,
indexInfo->aConstraint[i].iColumn,
columnName,
constraintOperatorFromSqlite(indexInfo->aConstraint[i].op),
type,
intValue,
doubleValue,
blobValue);
constraints.push_back(constraint); constraints.push_back(constraint);
j++; j++;
} }
@ -555,7 +703,7 @@ static sqlite3_module ParquetModule = {
parquetConnect, /* xConnect */ parquetConnect, /* xConnect */
parquetBestIndex, /* xBestIndex */ parquetBestIndex, /* xBestIndex */
parquetDisconnect, /* xDisconnect */ parquetDisconnect, /* xDisconnect */
parquetDisconnect, /* xDestroy */ parquetDestroy, /* xDestroy */
parquetOpen, /* xOpen - open a cursor */ parquetOpen, /* xOpen - open a cursor */
parquetClose, /* xClose - close a cursor */ parquetClose, /* xClose - close a cursor */
parquetFilter, /* xFilter - configure scan constraints */ parquetFilter, /* xFilter - configure scan constraints */

View File

@ -1,7 +1,6 @@
#include "parquet_cursor.h" #include "parquet_cursor.h"
ParquetCursor::ParquetCursor(ParquetTable* table) { ParquetCursor::ParquetCursor(ParquetTable* table): table(table) {
this->table = table;
reader = NULL; reader = NULL;
reset(std::vector<Constraint>()); reset(std::vector<Constraint>());
} }
@ -518,6 +517,7 @@ bool ParquetCursor::currentRowSatisfiesDoubleFilter(Constraint& constraint) {
// This avoids opening rowgroups that can't return useful // This avoids opening rowgroups that can't return useful
// data, which provides substantial performance benefits. // data, which provides substantial performance benefits.
bool ParquetCursor::currentRowGroupSatisfiesFilter() { bool ParquetCursor::currentRowGroupSatisfiesFilter() {
bool overallRv = true;
for(unsigned int i = 0; i < constraints.size(); i++) { for(unsigned int i = 0; i < constraints.size(); i++) {
int column = constraints[i].column; int column = constraints[i].column;
int op = constraints[i].op; int op = constraints[i].op;
@ -527,9 +527,7 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() {
rv = currentRowGroupSatisfiesRowIdFilter(constraints[i]); rv = currentRowGroupSatisfiesRowIdFilter(constraints[i]);
} else { } else {
std::unique_ptr<parquet::ColumnChunkMetaData> md = rowGroupMetadata->ColumnChunk(column); std::unique_ptr<parquet::ColumnChunkMetaData> md = rowGroupMetadata->ColumnChunk(column);
if(!md->is_stats_set()) { if(md->is_stats_set()) {
continue;
}
std::shared_ptr<parquet::RowGroupStatistics> stats = md->statistics(); std::shared_ptr<parquet::RowGroupStatistics> stats = md->statistics();
// SQLite is much looser with types than you might expect if you // SQLite is much looser with types than you might expect if you
@ -562,12 +560,19 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() {
} }
} }
} }
if(!rv)
return false;
} }
return true; // and it with the existing actual, which may have come from a previous run
rv = rv && constraints[i].bitmap.getActualMembership(rowGroupId);
if(!rv) {
constraints[i].bitmap.setEstimatedMembership(rowGroupId, rv);
constraints[i].bitmap.setActualMembership(rowGroupId, rv);
}
overallRv = overallRv && rv;
}
// printf("rowGroup %d %s\n", rowGroupId, overallRv ? "may satisfy" : "does not satisfy");
return overallRv;
} }
@ -609,9 +614,22 @@ start:
// Increment rowId so currentRowGroupSatisfiesRowIdFilter can access it; // Increment rowId so currentRowGroupSatisfiesRowIdFilter can access it;
// it'll get decremented by our caller // it'll get decremented by our caller
rowId++; rowId++;
// We're going to scan this row group; reset the expectation of discovering
// a row
for(unsigned int i = 0; i < constraints.size(); i++) {
if(rowGroupId > 0 && constraints[i].rowGroupId == rowGroupId - 1) {
constraints[i].bitmap.setActualMembership(rowGroupId - 1, constraints[i].hadRows);
}
constraints[i].hadRows = false;
}
if(!currentRowGroupSatisfiesFilter()) if(!currentRowGroupSatisfiesFilter())
goto start; goto start;
for(unsigned int i = 0; i < constraints.size(); i++) {
constraints[i].rowGroupId = rowGroupId;
}
return true; return true;
} }
@ -623,6 +641,7 @@ start:
// and the extension, which can add up on a dataset of tens // and the extension, which can add up on a dataset of tens
// of millions of rows. // of millions of rows.
bool ParquetCursor::currentRowSatisfiesFilter() { bool ParquetCursor::currentRowSatisfiesFilter() {
bool overallRv = true;
for(unsigned int i = 0; i < constraints.size(); i++) { for(unsigned int i = 0; i < constraints.size(); i++) {
bool rv = true; bool rv = true;
int column = constraints[i].column; int column = constraints[i].column;
@ -648,13 +667,18 @@ bool ParquetCursor::currentRowSatisfiesFilter() {
} }
} }
if(!rv) // it defaults to false; so only set it if true
return false; // ideally we'd short-circuit if we'd already set this group as visited
if(rv) {
constraints[i].hadRows = true;
} }
return true; overallRv = overallRv && rv;
}
return overallRv;
} }
void ParquetCursor::next() { void ParquetCursor::next() {
// Returns true if we've crossed a row group boundary
start: start:
if(rowsLeftInRowGroup == 0) { if(rowsLeftInRowGroup == 0) {
if(!nextRowGroup()) { if(!nextRowGroup()) {
@ -672,7 +696,6 @@ start:
rowId++; rowId++;
if(!currentRowSatisfiesFilter()) if(!currentRowSatisfiesFilter())
goto start; goto start;
} }
int ParquetCursor::getRowId() { int ParquetCursor::getRowId() {
@ -939,7 +962,7 @@ void ParquetCursor::reset(std::vector<Constraint> constraints) {
// TODO: consider having a long lived handle in ParquetTable that can be borrowed // TODO: consider having a long lived handle in ParquetTable that can be borrowed
// without incurring the cost of opening the file from scratch twice // without incurring the cost of opening the file from scratch twice
reader = parquet::ParquetFileReader::OpenFile( reader = parquet::ParquetFileReader::OpenFile(
table->file.data(), table->getFile().data(),
true, true,
parquet::default_reader_properties(), parquet::default_reader_properties(),
table->getMetadata()); table->getMetadata());
@ -955,4 +978,10 @@ void ParquetCursor::reset(std::vector<Constraint> constraints) {
numRowGroups = reader->metadata()->num_row_groups(); numRowGroups = reader->metadata()->num_row_groups();
} }
ParquetTable* ParquetCursor::getTable() { return table; } ParquetTable* ParquetCursor::getTable() const { return table; }
unsigned int ParquetCursor::getNumRowGroups() const { return numRowGroups; }
unsigned int ParquetCursor::getNumConstraints() const { return constraints.size(); }
const Constraint& ParquetCursor::getConstraint(unsigned int i) const { return constraints[i]; }

View File

@ -56,9 +56,12 @@ public:
void ensureColumn(int col); void ensureColumn(int col);
bool isNull(int col); bool isNull(int col);
unsigned int getNumRowGroups() const;
unsigned int getNumConstraints() const;
const Constraint& getConstraint(unsigned int i) const;
parquet::Type::type getPhysicalType(int col); parquet::Type::type getPhysicalType(int col);
parquet::LogicalType::type getLogicalType(int col); parquet::LogicalType::type getLogicalType(int col);
ParquetTable* getTable(); ParquetTable* getTable() const;
int getInt32(int col); int getInt32(int col);
long getInt64(int col); long getInt64(int col);

View File

@ -1,19 +1,25 @@
#include "parquet_filter.h" #include "parquet_filter.h"
Constraint::Constraint( Constraint::Constraint(
RowGroupBitmap bitmap,
int column, int column,
std::string columnName,
ConstraintOperator op, ConstraintOperator op,
ValueType type, ValueType type,
int64_t intValue, int64_t intValue,
double doubleValue, double doubleValue,
std::vector<unsigned char> blobValue std::vector<unsigned char> blobValue
) { ): bitmap(bitmap),
this->column = column; column(column),
this->op = op; columnName(columnName),
this->type = type; op(op),
this->intValue = intValue; type(type),
this->doubleValue = doubleValue; intValue(intValue),
this->blobValue = blobValue; doubleValue(doubleValue),
blobValue(blobValue),
hadRows(false) {
RowGroupBitmap bm = bitmap;
this->bitmap = bm;
if(type == Text) { if(type == Text) {
stringValue = std::string((char*)&blobValue[0], blobValue.size()); stringValue = std::string((char*)&blobValue[0], blobValue.size());
@ -34,3 +40,72 @@ Constraint::Constraint(
} }
} }
} }
std::string Constraint::describe() const {
std::string rv;
rv.append(columnName);
rv.append(" ");
switch(op) {
case Equal:
rv.append("=");
break;
case GreaterThan:
rv.append(">");
break;
case LessThanOrEqual:
rv.append("<=");
break;
case LessThan:
rv.append("<");
break;
case GreaterThanOrEqual:
rv.append(">=");
break;
case Match:
rv.append("MATCH");
break;
case Like:
rv.append("LIKE");
break;
case Glob:
rv.append("GLOB");
break;
case Regexp:
rv.append("REGEXP");
break;
case NotEqual:
rv.append("<>");
break;
case IsNot:
rv.append("IS NOT");
break;
case IsNotNull:
rv.append("IS NOT NULL");
break;
case IsNull:
rv.append("IS NULL");
break;
case Is:
rv.append("IS");
break;
}
rv.append(" ");
switch(type) {
case Null:
rv.append("NULL");
break;
case Integer:
rv.append(std::to_string(intValue));
break;
case Double:
rv.append(std::to_string(doubleValue));
break;
case Blob:
break;
case Text:
rv.append(stringValue);
break;
}
return rv;
}

View File

@ -30,11 +30,63 @@ enum ValueType {
Text Text
}; };
class RowGroupBitmap {
void setBit(std::vector<unsigned char>& membership, unsigned int rowGroup, bool isSet) {
int byte = rowGroup / 8;
int offset = rowGroup % 8;
unsigned char c = membership[byte];
c &= ~(1UL << offset);
if(isSet) {
c |= 1UL << offset;
}
membership[byte] = c;
}
// Compares estimated rowGroupFilter results against observed results
// when we explored the row group. This lets us cache
public:
RowGroupBitmap(unsigned int totalRowGroups) {
// Initialize everything to assume that all row groups match.
// As we discover otherwise, we'll update that assumption.
for(unsigned int i = 0; i < (totalRowGroups + 7) / 8; i++) {
estimatedMembership.push_back(0xFF);
actualMembership.push_back(0xFF);
}
}
RowGroupBitmap(
std::vector<unsigned char> estimatedMembership,
std::vector<unsigned char> actualMembership) :
estimatedMembership(estimatedMembership),
actualMembership(actualMembership) {
}
std::vector<unsigned char> estimatedMembership;
std::vector<unsigned char> actualMembership;
// Pass false only if definitely does not have rows
void setEstimatedMembership(unsigned int rowGroup, bool hasRows) {
setBit(estimatedMembership, rowGroup, hasRows);
}
// Pass false only after exhausting all rows
void setActualMembership(unsigned int rowGroup, bool hadRows) {
setBit(actualMembership, rowGroup, hadRows);
}
bool getActualMembership(unsigned int rowGroup) {
int byte = rowGroup / 8;
int offset = rowGroup % 8;
return (actualMembership[byte] >> offset) & 1U;
}
};
class Constraint { class Constraint {
public: public:
// Kind of a messy constructor function, but it's just for internal use, so whatever. // Kind of a messy constructor function, but it's just for internal use, so whatever.
Constraint( Constraint(
RowGroupBitmap bitmap,
int column, int column,
std::string columnName,
ConstraintOperator op, ConstraintOperator op,
ValueType type, ValueType type,
int64_t intValue, int64_t intValue,
@ -42,7 +94,9 @@ public:
std::vector<unsigned char> blobValue std::vector<unsigned char> blobValue
); );
RowGroupBitmap bitmap;
int column; // underlying column in the query int column; // underlying column in the query
std::string columnName;
ConstraintOperator op; ConstraintOperator op;
ValueType type; ValueType type;
@ -54,6 +108,15 @@ public:
// Only set when stringValue is set and op == Like // Only set when stringValue is set and op == Like
std::string likeStringValue; std::string likeStringValue;
// A unique identifier for this constraint, e.g.
// col0 = 'Dawson Creek'
std::string describe() const;
// This is a temp field used while evaluating if a rowgroup had rows
// that matched this constraint.
int rowGroupId;
bool hadRows;
}; };
#endif #endif

View File

@ -2,9 +2,7 @@
#include "parquet/api/reader.h" #include "parquet/api/reader.h"
ParquetTable::ParquetTable(std::string file) { ParquetTable::ParquetTable(std::string file, std::string tableName): file(file), tableName(tableName) {
this->file = file;
std::unique_ptr<parquet::ParquetFileReader> reader = parquet::ParquetFileReader::OpenFile(file.data()); std::unique_ptr<parquet::ParquetFileReader> reader = parquet::ParquetFileReader::OpenFile(file.data());
metadata = reader->metadata(); metadata = reader->metadata();
} }
@ -138,3 +136,6 @@ std::string ParquetTable::CreateStatement() {
} }
std::shared_ptr<parquet::FileMetaData> ParquetTable::getMetadata() { return metadata; } std::shared_ptr<parquet::FileMetaData> ParquetTable::getMetadata() { return metadata; }
const std::string& ParquetTable::getFile() { return file; }
const std::string& ParquetTable::getTableName() { return tableName; }

View File

@ -6,16 +6,19 @@
#include "parquet/api/reader.h" #include "parquet/api/reader.h"
class ParquetTable { class ParquetTable {
std::string file;
std::string tableName;
std::vector<std::string> columnNames; std::vector<std::string> columnNames;
std::shared_ptr<parquet::FileMetaData> metadata; std::shared_ptr<parquet::FileMetaData> metadata;
public: public:
ParquetTable(std::string file); ParquetTable(std::string file, std::string tableName);
std::string CreateStatement(); std::string CreateStatement();
std::string file;
std::string columnName(int idx); std::string columnName(int idx);
std::shared_ptr<parquet::FileMetaData> getMetadata(); std::shared_ptr<parquet::FileMetaData> getMetadata();
const std::string& getFile();
const std::string& getTableName();
}; };
#endif #endif

View File

@ -47,7 +47,7 @@ main() {
fi fi
cat "$root"/parquet-generator/*.sql > "$root"/testcase-bootstrap.sql cat "$root"/parquet-generator/*.sql > "$root"/testcase-bootstrap.sql
rm test.db rm -f test.db
"$root"/sqlite/sqlite3 test.db -init "$root"/testcase-bootstrap.sql < /dev/null "$root"/sqlite/sqlite3 test.db -init "$root"/testcase-bootstrap.sql < /dev/null
if [ ! -v NO_DEBUG ] && [ "$(cat testcases.txt | wc -l)" == "1" ]; then if [ ! -v NO_DEBUG ] && [ "$(cat testcases.txt | wc -l)" == "1" ]; then
set -x set -x