From d3ab5ff3e72c439d3bb1a4ea2bd58a15222d2ed8 Mon Sep 17 00:00:00 2001 From: Colin Dellow Date: Sat, 24 Mar 2018 23:51:15 -0400 Subject: [PATCH] Cache clauses -> row group mapping Create a shadow table. For `stats`, it'd be `_stats_rowgroups`. It contains three columns: - the clause (e.g. `city = 'Dawson Creek'`) - the initial estimate, as a bitmap of rowgroups based on stats - the actual observed rowgroups, as a bitmap This papers over poorly sorted parquet files, at the cost of some disk space. It makes interactive queries much more natural -- drilldown style queries are much faster, as they can leverage work done by previous queries. e.g. `SELECT * FROM stats WHERE city = 'Dawson Creek' and question_id >= 1935 and question_id <= 1940` takes ~584ms on first run, but 9ms on subsequent runs. We only create entries when the estimates don't match the actual results. Fixes #6 --- README.md | 12 +++ parquet/parquet.cc | 166 +++++++++++++++++++++++++++++++++++--- parquet/parquet_cursor.cc | 109 ++++++++++++++++--------- parquet/parquet_cursor.h | 5 +- parquet/parquet_filter.cc | 89 ++++++++++++++++++-- parquet/parquet_filter.h | 63 +++++++++++++++ parquet/parquet_table.cc | 7 +- parquet/parquet_table.h | 7 +- tests/test-queries | 2 +- 9 files changed, 397 insertions(+), 63 deletions(-) diff --git a/README.md b/README.md index 96330dc..5c67433 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,18 @@ constraints before returning control to SQLite's virtual machine. This minimizes the number of allocations performed when many rows are filtered out by the user's criteria. +### Memoized slices + +Individual clauses are mapped to the row groups they match. + +e.g. going only by row group statistics, which store minimum and maximum values, a clause +like `WHERE city = 'Dawson Creek'` may appear to match 80% of row groups. + +In reality, the value may only be present in one or two row groups. 
+ +This is recorded in a shadow table so future queries that contain that clause +can read only the necessary row groups. + ### Types These Parquet types are supported: diff --git a/parquet/parquet.cc b/parquet/parquet.cc index b28dd0f..e7a292e 100644 --- a/parquet/parquet.cc +++ b/parquet/parquet.cc @@ -17,6 +17,7 @@ SQLITE_EXTENSION_INIT1 #include #include #include +#include #include @@ -32,6 +33,7 @@ static int parquetConnect(sqlite3*, void*, int, const char*const*, sqlite3_vtab**,char**); static int parquetBestIndex(sqlite3_vtab*,sqlite3_index_info*); static int parquetDisconnect(sqlite3_vtab*); +static int parquetDestroy(sqlite3_vtab*); static int parquetOpen(sqlite3_vtab*, sqlite3_vtab_cursor**); static int parquetClose(sqlite3_vtab_cursor*); static int parquetFilter(sqlite3_vtab_cursor*, int idxNum, const char *idxStr, @@ -45,6 +47,7 @@ static int parquetRowid(sqlite3_vtab_cursor*,sqlite3_int64*); typedef struct sqlite3_vtab_parquet { sqlite3_vtab base; /* Base class. Must be first */ ParquetTable* table; + sqlite3* db; } sqlite3_vtab_parquet; @@ -54,6 +57,21 @@ typedef struct sqlite3_vtab_cursor_parquet { ParquetCursor* cursor; } sqlite3_vtab_cursor_parquet; +static int parquetDestroy(sqlite3_vtab *pVtab) { + sqlite3_vtab_parquet *p = (sqlite3_vtab_parquet*)pVtab; + + // Clean up our shadow table. This is useful if the user has recreated + // the parquet file, and our mappings would now be invalid. + std::string drop = "DROP TABLE IF EXISTS _"; + drop.append(p->table->getTableName()); + drop.append("_rowgroups"); + int rv = sqlite3_exec(p->db, drop.data(), 0, 0, 0); + if(rv != 0) + return rv; + + return SQLITE_OK; +} + /* ** This method is the destructor fo a sqlite3_vtab_parquet object. 
*/ @@ -78,6 +96,7 @@ static int parquetConnect( return SQLITE_ERROR; } + std::string tableName = argv[2]; // Remove the delimiting single quotes std::string fname = argv[3]; fname = fname.substr(1, fname.length() - 2); @@ -87,7 +106,7 @@ static int parquetConnect( memset(vtab.get(), 0, sizeof(*vtab.get())); try { - std::unique_ptr table(new ParquetTable(fname)); + std::unique_ptr table(new ParquetTable(fname, tableName)); std::string create = table->CreateStatement(); int rc = sqlite3_declare_vtab(db, create.data()); @@ -95,6 +114,7 @@ static int parquetConnect( return rc; vtab->table = table.release(); + vtab->db = db; *ppVtab = (sqlite3_vtab*)vtab.release(); return SQLITE_OK; } catch (const std::exception& e) { @@ -119,16 +139,81 @@ static int parquetCreate( sqlite3_vtab **ppVtab, char **pzErr ){ - return parquetConnect(db, pAux, argc, argv, ppVtab, pzErr); + try { + // Create shadow table for storing constraint -> rowid mappings + std::string create = "CREATE TABLE IF NOT EXISTS _"; + create.append(argv[2]); + create.append("_rowgroups(clause TEXT, estimate BLOB, actual BLOB)"); + int rv = sqlite3_exec(db, create.data(), 0, 0, 0); + if(rv != 0) + return rv; + + create = "CREATE UNIQUE INDEX IF NOT EXISTS _"; + create.append(argv[2]); + create.append("_index ON _"); + create.append(argv[2]); + create.append("_rowgroups(clause)"); + rv = sqlite3_exec(db, create.data(), 0, 0, 0); + + return parquetConnect(db, pAux, argc, argv, ppVtab, pzErr); + } catch (std::bad_alloc& ba) { + return SQLITE_NOMEM; + } } +std::string quoteBlob(const std::vector& bytes) { + std::ostringstream ss; + ss << "X'" << std::hex; + for(unsigned int i = 0; i < bytes.size(); i++) { + ss << std::setfill('0') << std::setw(2) << (unsigned int)(unsigned char)bytes[i]; + } + ss << "'"; + + return ss.str(); +} + +void persistConstraints(sqlite3* db, ParquetCursor* cursor) { + for(unsigned int i = 0; i < cursor->getNumConstraints(); i++) { + const Constraint& constraint = cursor->getConstraint(i); + 
const std::vector& estimated = constraint.bitmap.estimatedMembership; + const std::vector& actual = constraint.bitmap.actualMembership; + if(estimated == actual) { + continue; + } + std::string desc = constraint.describe(); + + std::string estimatedStr = quoteBlob(estimated); + std::string actualStr = quoteBlob(actual); + + // This is only advisory, so ignore failures. + char* sql = sqlite3_mprintf( + "INSERT OR REPLACE INTO _%s_rowgroups(clause, estimate, actual) VALUES ('%q', %s, %s)", + cursor->getTable()->getTableName().c_str(), + desc.c_str(), + estimatedStr.c_str(), + actualStr.c_str()); + + + if(sql == NULL) + return; + + sqlite3_exec(db, sql, 0, 0, 0); + sqlite3_free(sql); + } +} + + /* ** Destructor for a sqlite3_vtab_cursor_parquet. */ static int parquetClose(sqlite3_vtab_cursor *cur){ - sqlite3_vtab_cursor_parquet* p = (sqlite3_vtab_cursor_parquet*)cur; - p->cursor->close(); - delete p->cursor; + sqlite3_vtab_cursor_parquet* vtab_cursor_parquet = (sqlite3_vtab_cursor_parquet*)cur; + sqlite3_vtab_parquet* vtab_parquet = (sqlite3_vtab_parquet*)(vtab_cursor_parquet->base.pVtab); + ParquetCursor* cursor = vtab_cursor_parquet->cursor; + persistConstraints(vtab_parquet->db, cursor); + + vtab_cursor_parquet->cursor->close(); + delete vtab_cursor_parquet->cursor; sqlite3_free(cur); return SQLITE_OK; } @@ -196,7 +281,8 @@ const char* opName(int op) { */ static int parquetNext(sqlite3_vtab_cursor *cur){ try { - ParquetCursor* cursor = ((sqlite3_vtab_cursor_parquet*)cur)->cursor; + sqlite3_vtab_cursor_parquet* vtab_cursor_parquet = (sqlite3_vtab_cursor_parquet*)cur; + ParquetCursor* cursor = vtab_cursor_parquet->cursor; cursor->next(); return SQLITE_OK; } catch(std::bad_alloc& ba) { @@ -395,6 +481,38 @@ ConstraintOperator constraintOperatorFromSqlite(int op) { throw std::invalid_argument(ss.str()); } +std::vector getRowGroupsForClause(sqlite3* db, std::string table, std::string clause) { + std::vector rv; + + std::unique_ptr sql(sqlite3_mprintf( + "SELECT actual 
FROM _%s_rowgroups WHERE clause = '%q'", + table.c_str(), + clause.c_str()), sqlite3_free); + + if(sql.get() == NULL) + return rv; + + sqlite3_stmt* pStmt = NULL; + int rc = sqlite3_prepare_v2(db, sql.get(), -1, &pStmt, NULL); + if(rc != 0) + return rv; + + rc = sqlite3_step(pStmt); + if(rc == SQLITE_ROW) { + int size = sqlite3_column_bytes(pStmt, 0); + unsigned char* blob = (unsigned char*)sqlite3_column_blob(pStmt, 0); + // TODO: there is a memory leak here if we get a std::bad_alloc while populating rv; + // we fail to free pStmt + for(int i = 0; i < size; i++) { + rv.push_back(blob[i]); + } + } + + sqlite3_finalize(pStmt); + return rv; +} + + /* ** Only a full table scan is supported. So xFilter simply rewinds to ** the beginning. @@ -407,7 +525,10 @@ static int parquetFilter( sqlite3_value **argv ){ try { - ParquetCursor* cursor = ((sqlite3_vtab_cursor_parquet*)cur)->cursor; + sqlite3_vtab_cursor_parquet* vtab_cursor_parquet = (sqlite3_vtab_cursor_parquet*)cur; + sqlite3_vtab_parquet* vtab_parquet = (sqlite3_vtab_parquet*)(vtab_cursor_parquet->base.pVtab); + sqlite3* db = vtab_parquet->db; + ParquetCursor* cursor = vtab_cursor_parquet->cursor; sqlite3_index_info* indexInfo = (sqlite3_index_info*)idxStr; #ifdef DEBUG @@ -451,13 +572,40 @@ static int parquetFilter( type = Null; } - Constraint constraint( + std::string columnName = "rowid"; + if(indexInfo->aConstraint[i].iColumn >= 0) { + columnName = cursor->getTable()->columnName(indexInfo->aConstraint[i].iColumn); + } + + RowGroupBitmap bitmap = RowGroupBitmap(cursor->getNumRowGroups()); + Constraint dummy( + bitmap, indexInfo->aConstraint[i].iColumn, + columnName, constraintOperatorFromSqlite(indexInfo->aConstraint[i].op), type, intValue, doubleValue, blobValue); + + std::vector actual = getRowGroupsForClause(db, cursor->getTable()->getTableName(), dummy.describe()); + if(actual.size() > 0) { + // Initialize the estimate to be the actual -- eventually they'll converge + // and we'll stop writing back to the 
db. + std::vector estimate = actual; + bitmap = RowGroupBitmap(estimate, actual); + } + + Constraint constraint( + bitmap, + indexInfo->aConstraint[i].iColumn, + columnName, + constraintOperatorFromSqlite(indexInfo->aConstraint[i].op), + type, + intValue, + doubleValue, + blobValue); + constraints.push_back(constraint); j++; } @@ -555,7 +703,7 @@ static sqlite3_module ParquetModule = { parquetConnect, /* xConnect */ parquetBestIndex, /* xBestIndex */ parquetDisconnect, /* xDisconnect */ - parquetDisconnect, /* xDestroy */ + parquetDestroy, /* xDestroy */ parquetOpen, /* xOpen - open a cursor */ parquetClose, /* xClose - close a cursor */ parquetFilter, /* xFilter - configure scan constraints */ diff --git a/parquet/parquet_cursor.cc b/parquet/parquet_cursor.cc index 9dc39df..782784f 100644 --- a/parquet/parquet_cursor.cc +++ b/parquet/parquet_cursor.cc @@ -1,7 +1,6 @@ #include "parquet_cursor.h" -ParquetCursor::ParquetCursor(ParquetTable* table) { - this->table = table; +ParquetCursor::ParquetCursor(ParquetTable* table): table(table) { reader = NULL; reset(std::vector()); } @@ -518,6 +517,7 @@ bool ParquetCursor::currentRowSatisfiesDoubleFilter(Constraint& constraint) { // This avoids opening rowgroups that can't return useful // data, which provides substantial performance benefits. bool ParquetCursor::currentRowGroupSatisfiesFilter() { + bool overallRv = true; for(unsigned int i = 0; i < constraints.size(); i++) { int column = constraints[i].column; int op = constraints[i].op; @@ -527,47 +527,52 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() { rv = currentRowGroupSatisfiesRowIdFilter(constraints[i]); } else { std::unique_ptr md = rowGroupMetadata->ColumnChunk(column); - if(!md->is_stats_set()) { - continue; - } - std::shared_ptr stats = md->statistics(); + if(md->is_stats_set()) { + std::shared_ptr stats = md->statistics(); - // SQLite is much looser with types than you might expect if you - // come from a Postgres background. 
The constraint '30.0' (that is, - // a string containing a floating point number) should be treated - // as equal to a field containing an integer 30. - // - // This means that even if the parquet physical type is integer, - // the constraint type may be a string, so dispatch to the filter - // fn based on the Parquet type. + // SQLite is much looser with types than you might expect if you + // come from a Postgres background. The constraint '30.0' (that is, + // a string containing a floating point number) should be treated + // as equal to a field containing an integer 30. + // + // This means that even if the parquet physical type is integer, + // the constraint type may be a string, so dispatch to the filter + // fn based on the Parquet type. - if(op == IsNull) { - rv = stats->null_count() > 0; - } else if(op == IsNotNull) { - rv = stats->num_values() > 0; - } else { - parquet::Type::type pqType = types[column]; + if(op == IsNull) { + rv = stats->null_count() > 0; + } else if(op == IsNotNull) { + rv = stats->num_values() > 0; + } else { + parquet::Type::type pqType = types[column]; - if(pqType == parquet::Type::BYTE_ARRAY && logicalTypes[column] == parquet::LogicalType::UTF8) { - rv = currentRowGroupSatisfiesTextFilter(constraints[i], stats); - } else if(pqType == parquet::Type::BYTE_ARRAY) { - rv = currentRowGroupSatisfiesBlobFilter(constraints[i], stats); - } else if(pqType == parquet::Type::INT32 || - pqType == parquet::Type::INT64 || - pqType == parquet::Type::INT96 || - pqType == parquet::Type::BOOLEAN) { - rv = currentRowGroupSatisfiesIntegerFilter(constraints[i], stats); - } else if(pqType == parquet::Type::FLOAT || pqType == parquet::Type::DOUBLE) { - rv = currentRowGroupSatisfiesDoubleFilter(constraints[i], stats); + if(pqType == parquet::Type::BYTE_ARRAY && logicalTypes[column] == parquet::LogicalType::UTF8) { + rv = currentRowGroupSatisfiesTextFilter(constraints[i], stats); + } else if(pqType == parquet::Type::BYTE_ARRAY) { + rv = 
currentRowGroupSatisfiesBlobFilter(constraints[i], stats); + } else if(pqType == parquet::Type::INT32 || + pqType == parquet::Type::INT64 || + pqType == parquet::Type::INT96 || + pqType == parquet::Type::BOOLEAN) { + rv = currentRowGroupSatisfiesIntegerFilter(constraints[i], stats); + } else if(pqType == parquet::Type::FLOAT || pqType == parquet::Type::DOUBLE) { + rv = currentRowGroupSatisfiesDoubleFilter(constraints[i], stats); + } } } } - if(!rv) - return false; + // and it with the existing actual, which may have come from a previous run + rv = rv && constraints[i].bitmap.getActualMembership(rowGroupId); + if(!rv) { + constraints[i].bitmap.setEstimatedMembership(rowGroupId, rv); + constraints[i].bitmap.setActualMembership(rowGroupId, rv); + } + overallRv = overallRv && rv; } - return true; +// printf("rowGroup %d %s\n", rowGroupId, overallRv ? "may satisfy" : "does not satisfy"); + return overallRv; } @@ -609,9 +614,22 @@ start: // Increment rowId so currentRowGroupSatisfiesRowIdFilter can access it; // it'll get decremented by our caller rowId++; + + // We're going to scan this row group; reset the expectation of discovering + // a row + for(unsigned int i = 0; i < constraints.size(); i++) { + if(rowGroupId > 0 && constraints[i].rowGroupId == rowGroupId - 1) { + constraints[i].bitmap.setActualMembership(rowGroupId - 1, constraints[i].hadRows); + } + constraints[i].hadRows = false; + } + if(!currentRowGroupSatisfiesFilter()) goto start; + for(unsigned int i = 0; i < constraints.size(); i++) { + constraints[i].rowGroupId = rowGroupId; + } return true; } @@ -623,6 +641,7 @@ start: // and the extension, which can add up on a dataset of tens // of millions of rows. 
bool ParquetCursor::currentRowSatisfiesFilter() { + bool overallRv = true; for(unsigned int i = 0; i < constraints.size(); i++) { bool rv = true; int column = constraints[i].column; @@ -648,13 +667,18 @@ bool ParquetCursor::currentRowSatisfiesFilter() { } } - if(!rv) - return false; + // it defaults to false; so only set it if true + // ideally we'd short-circuit if we'd already set this group as visited + if(rv) { + constraints[i].hadRows = true; + } + overallRv = overallRv && rv; } - return true; + return overallRv; } void ParquetCursor::next() { + // Returns true if we've crossed a row group boundary start: if(rowsLeftInRowGroup == 0) { if(!nextRowGroup()) { @@ -672,7 +696,6 @@ start: rowId++; if(!currentRowSatisfiesFilter()) goto start; - } int ParquetCursor::getRowId() { @@ -939,7 +962,7 @@ void ParquetCursor::reset(std::vector constraints) { // TODO: consider having a long lived handle in ParquetTable that can be borrowed // without incurring the cost of opening the file from scratch twice reader = parquet::ParquetFileReader::OpenFile( - table->file.data(), + table->getFile().data(), true, parquet::default_reader_properties(), table->getMetadata()); @@ -955,4 +978,10 @@ void ParquetCursor::reset(std::vector constraints) { numRowGroups = reader->metadata()->num_row_groups(); } -ParquetTable* ParquetCursor::getTable() { return table; } +ParquetTable* ParquetCursor::getTable() const { return table; } + +unsigned int ParquetCursor::getNumRowGroups() const { return numRowGroups; } +unsigned int ParquetCursor::getNumConstraints() const { return constraints.size(); } +const Constraint& ParquetCursor::getConstraint(unsigned int i) const { return constraints[i]; } + + diff --git a/parquet/parquet_cursor.h b/parquet/parquet_cursor.h index a237ecf..f7d8c2a 100644 --- a/parquet/parquet_cursor.h +++ b/parquet/parquet_cursor.h @@ -56,9 +56,12 @@ public: void ensureColumn(int col); bool isNull(int col); + unsigned int getNumRowGroups() const; + unsigned int 
getNumConstraints() const; + const Constraint& getConstraint(unsigned int i) const; parquet::Type::type getPhysicalType(int col); parquet::LogicalType::type getLogicalType(int col); - ParquetTable* getTable(); + ParquetTable* getTable() const; int getInt32(int col); long getInt64(int col); diff --git a/parquet/parquet_filter.cc b/parquet/parquet_filter.cc index 18de068..7095b65 100644 --- a/parquet/parquet_filter.cc +++ b/parquet/parquet_filter.cc @@ -1,19 +1,25 @@ #include "parquet_filter.h" Constraint::Constraint( + RowGroupBitmap bitmap, int column, + std::string columnName, ConstraintOperator op, ValueType type, int64_t intValue, double doubleValue, std::vector blobValue -) { - this->column = column; - this->op = op; - this->type = type; - this->intValue = intValue; - this->doubleValue = doubleValue; - this->blobValue = blobValue; +): bitmap(bitmap), + column(column), + columnName(columnName), + op(op), + type(type), + intValue(intValue), + doubleValue(doubleValue), + blobValue(blobValue), + hadRows(false) { + RowGroupBitmap bm = bitmap; + this->bitmap = bm; if(type == Text) { stringValue = std::string((char*)&blobValue[0], blobValue.size()); @@ -34,3 +40,72 @@ Constraint::Constraint( } } } + +std::string Constraint::describe() const { + std::string rv; + rv.append(columnName); + rv.append(" "); + switch(op) { + case Equal: + rv.append("="); + break; + case GreaterThan: + rv.append(">"); + break; + case LessThanOrEqual: + rv.append("<="); + break; + case LessThan: + rv.append("<"); + break; + case GreaterThanOrEqual: + rv.append(">="); + break; + case Match: + rv.append("MATCH"); + break; + case Like: + rv.append("LIKE"); + break; + case Glob: + rv.append("GLOB"); + break; + case Regexp: + rv.append("REGEXP"); + break; + case NotEqual: + rv.append("<>"); + break; + case IsNot: + rv.append("IS NOT"); + break; + case IsNotNull: + rv.append("IS NOT NULL"); + break; + case IsNull: + rv.append("IS NULL"); + break; + case Is: + rv.append("IS"); + break; + } + 
rv.append(" "); + + switch(type) { + case Null: + rv.append("NULL"); + break; + case Integer: + rv.append(std::to_string(intValue)); + break; + case Double: + rv.append(std::to_string(doubleValue)); + break; + case Blob: + break; + case Text: + rv.append(stringValue); + break; + } + return rv; +} diff --git a/parquet/parquet_filter.h b/parquet/parquet_filter.h index 4afb280..d9f336e 100644 --- a/parquet/parquet_filter.h +++ b/parquet/parquet_filter.h @@ -30,11 +30,63 @@ enum ValueType { Text }; +class RowGroupBitmap { + void setBit(std::vector& membership, unsigned int rowGroup, bool isSet) { + int byte = rowGroup / 8; + int offset = rowGroup % 8; + unsigned char c = membership[byte]; + c &= ~(1UL << offset); + if(isSet) { + c |= 1UL << offset; + } + membership[byte] = c; + } +// Compares estimated rowGroupFilter results against observed results +// when we explored the row group. This lets us cache +public: + RowGroupBitmap(unsigned int totalRowGroups) { + // Initialize everything to assume that all row groups match. + // As we discover otherwise, we'll update that assumption. 
+ for(unsigned int i = 0; i < (totalRowGroups + 7) / 8; i++) { + estimatedMembership.push_back(0xFF); + actualMembership.push_back(0xFF); + } + } + + RowGroupBitmap( + std::vector estimatedMembership, + std::vector actualMembership) : + estimatedMembership(estimatedMembership), + actualMembership(actualMembership) { + } + + std::vector estimatedMembership; + std::vector actualMembership; + // Pass false only if definitely does not have rows + void setEstimatedMembership(unsigned int rowGroup, bool hasRows) { + setBit(estimatedMembership, rowGroup, hasRows); + } + + // Pass false only after exhausting all rows + void setActualMembership(unsigned int rowGroup, bool hadRows) { + setBit(actualMembership, rowGroup, hadRows); + } + + bool getActualMembership(unsigned int rowGroup) { + int byte = rowGroup / 8; + int offset = rowGroup % 8; + + return (actualMembership[byte] >> offset) & 1U; + } +}; + class Constraint { public: // Kind of a messy constructor function, but it's just for internal use, so whatever. Constraint( + RowGroupBitmap bitmap, int column, + std::string columnName, ConstraintOperator op, ValueType type, int64_t intValue, @@ -42,7 +94,9 @@ public: std::vector blobValue ); + RowGroupBitmap bitmap; int column; // underlying column in the query + std::string columnName; ConstraintOperator op; ValueType type; @@ -54,6 +108,15 @@ public: // Only set when stringValue is set and op == Like std::string likeStringValue; + + // A unique identifier for this constraint, e.g. + // col0 = 'Dawson Creek' + std::string describe() const; + + // This is a temp field used while evaluating if a rowgroup had rows + // that matched this constraint. 
+ int rowGroupId; + bool hadRows; }; #endif diff --git a/parquet/parquet_table.cc b/parquet/parquet_table.cc index c213e2c..dbc6c14 100644 --- a/parquet/parquet_table.cc +++ b/parquet/parquet_table.cc @@ -2,9 +2,7 @@ #include "parquet/api/reader.h" -ParquetTable::ParquetTable(std::string file) { - this->file = file; - +ParquetTable::ParquetTable(std::string file, std::string tableName): file(file), tableName(tableName) { std::unique_ptr reader = parquet::ParquetFileReader::OpenFile(file.data()); metadata = reader->metadata(); } @@ -138,3 +136,6 @@ std::string ParquetTable::CreateStatement() { } std::shared_ptr ParquetTable::getMetadata() { return metadata; } + +const std::string& ParquetTable::getFile() { return file; } +const std::string& ParquetTable::getTableName() { return tableName; } diff --git a/parquet/parquet_table.h b/parquet/parquet_table.h index da284c6..43321f6 100644 --- a/parquet/parquet_table.h +++ b/parquet/parquet_table.h @@ -6,16 +6,19 @@ #include "parquet/api/reader.h" class ParquetTable { + std::string file; + std::string tableName; std::vector columnNames; std::shared_ptr metadata; public: - ParquetTable(std::string file); + ParquetTable(std::string file, std::string tableName); std::string CreateStatement(); - std::string file; std::string columnName(int idx); std::shared_ptr getMetadata(); + const std::string& getFile(); + const std::string& getTableName(); }; #endif diff --git a/tests/test-queries b/tests/test-queries index 3383ef2..f9f4e4f 100755 --- a/tests/test-queries +++ b/tests/test-queries @@ -47,7 +47,7 @@ main() { fi cat "$root"/parquet-generator/*.sql > "$root"/testcase-bootstrap.sql - rm test.db + rm -f test.db "$root"/sqlite/sqlite3 test.db -init "$root"/testcase-bootstrap.sql < /dev/null if [ ! -v NO_DEBUG ] && [ "$(cat testcases.txt | wc -l)" == "1" ]; then set -x