mirror of
				https://github.com/cldellow/sqlite-parquet-vtable.git
				synced 2025-10-27 01:59:56 +00:00 
			
		
		
		
	Cache clauses -> row group mapping
Create a shadow table. For `stats`, it'd be `_stats_rowgroups`. It contains three columns: - the clause (eg `city = 'Dawson Creek'`) - the initial estimate, as a bitmap of rowgroups based on stats - the actual observed rowgroups, as a bitmap This papers over poorly sorted parquet files, at the cost of some disk space. It makes interactive queries much more natural -- drilldown style queries are much faster, as they can leverage work done by previous queries. eg 'SELECT * FROM stats WHERE city = 'Dawson Creek' and question_id >= 1935 and question_id <= 1940` takes ~584ms on first run, but 9ms on subsequent runs. We only create entries when the estimates don't match the actual results. Fixes #6
This commit is contained in:
		
							
								
								
									
										12
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										12
									
								
								README.md
									
									
									
									
									
								
							| @@ -72,6 +72,18 @@ constraints before returning control to SQLite's virtual machine. This minimizes | ||||
| the number of allocations performed when many rows are filtered out by | ||||
| the user's criteria. | ||||
|  | ||||
| ### Memoized slices | ||||
|  | ||||
| Individual clauses are mapped to the row groups they match. | ||||
|  | ||||
| eg going on row group statistics, which store minimum and maximum values, a clause | ||||
| like `WHERE city = 'Dawson Creek'` may match 80% of row groups. | ||||
|  | ||||
| In reality, it may only be present in one or two row groups. | ||||
|  | ||||
| This is recorded in a shadow table so future queries that contain that clause | ||||
| can read only the necessary row groups. | ||||
|  | ||||
| ### Types | ||||
|  | ||||
| These Parquet types are supported: | ||||
|   | ||||
| @@ -17,6 +17,7 @@ SQLITE_EXTENSION_INIT1 | ||||
| #include <stdarg.h> | ||||
| #include <ctype.h> | ||||
| #include <stdio.h> | ||||
| #include <iomanip> | ||||
|  | ||||
| #include <memory> | ||||
|  | ||||
| @@ -32,6 +33,7 @@ static int parquetConnect(sqlite3*, void*, int, const char*const*, | ||||
|                            sqlite3_vtab**,char**); | ||||
| static int parquetBestIndex(sqlite3_vtab*,sqlite3_index_info*); | ||||
| static int parquetDisconnect(sqlite3_vtab*); | ||||
| static int parquetDestroy(sqlite3_vtab*); | ||||
| static int parquetOpen(sqlite3_vtab*, sqlite3_vtab_cursor**); | ||||
| static int parquetClose(sqlite3_vtab_cursor*); | ||||
| static int parquetFilter(sqlite3_vtab_cursor*, int idxNum, const char *idxStr, | ||||
| @@ -45,6 +47,7 @@ static int parquetRowid(sqlite3_vtab_cursor*,sqlite3_int64*); | ||||
| typedef struct sqlite3_vtab_parquet { | ||||
|   sqlite3_vtab base;              /* Base class.  Must be first */ | ||||
|   ParquetTable* table; | ||||
|   sqlite3* db; | ||||
| } sqlite3_vtab_parquet; | ||||
|  | ||||
|  | ||||
| @@ -54,6 +57,21 @@ typedef struct sqlite3_vtab_cursor_parquet { | ||||
|   ParquetCursor* cursor; | ||||
| } sqlite3_vtab_cursor_parquet; | ||||
|  | ||||
| static int parquetDestroy(sqlite3_vtab *pVtab) { | ||||
|   sqlite3_vtab_parquet *p = (sqlite3_vtab_parquet*)pVtab; | ||||
|  | ||||
|   // Clean up our shadow table. This is useful if the user has recreated | ||||
|   // the parquet file, and our mappings would now be invalid. | ||||
|   std::string drop = "DROP TABLE IF EXISTS _"; | ||||
|   drop.append(p->table->getTableName()); | ||||
|   drop.append("_rowgroups"); | ||||
|   int rv = sqlite3_exec(p->db, drop.data(), 0, 0, 0); | ||||
|   if(rv != 0) | ||||
|     return rv; | ||||
|  | ||||
|   return SQLITE_OK; | ||||
| } | ||||
|  | ||||
| /* | ||||
| ** This method is the destructor fo a sqlite3_vtab_parquet object. | ||||
| */ | ||||
| @@ -78,6 +96,7 @@ static int parquetConnect( | ||||
|       return SQLITE_ERROR; | ||||
|     } | ||||
|  | ||||
|     std::string tableName = argv[2]; | ||||
|     // Remove the delimiting single quotes | ||||
|     std::string fname = argv[3]; | ||||
|     fname = fname.substr(1, fname.length() - 2); | ||||
| @@ -87,7 +106,7 @@ static int parquetConnect( | ||||
|     memset(vtab.get(), 0, sizeof(*vtab.get())); | ||||
|  | ||||
|     try { | ||||
|       std::unique_ptr<ParquetTable> table(new ParquetTable(fname)); | ||||
|       std::unique_ptr<ParquetTable> table(new ParquetTable(fname, tableName)); | ||||
|  | ||||
|       std::string create = table->CreateStatement(); | ||||
|       int rc = sqlite3_declare_vtab(db, create.data()); | ||||
| @@ -95,6 +114,7 @@ static int parquetConnect( | ||||
|         return rc; | ||||
|  | ||||
|       vtab->table = table.release(); | ||||
|       vtab->db = db; | ||||
|       *ppVtab = (sqlite3_vtab*)vtab.release(); | ||||
|       return SQLITE_OK; | ||||
|     } catch (const std::exception& e) { | ||||
| @@ -119,16 +139,81 @@ static int parquetCreate( | ||||
|   sqlite3_vtab **ppVtab, | ||||
|   char **pzErr | ||||
| ){ | ||||
|   try { | ||||
|     // Create shadow table for storing constraint -> rowid mappings | ||||
|     std::string create = "CREATE TABLE IF NOT EXISTS _"; | ||||
|     create.append(argv[2]); | ||||
|     create.append("_rowgroups(clause TEXT, estimate BLOB, actual BLOB)"); | ||||
|     int rv = sqlite3_exec(db, create.data(), 0, 0, 0); | ||||
|     if(rv != 0) | ||||
|       return rv; | ||||
|  | ||||
|     create = "CREATE UNIQUE INDEX IF NOT EXISTS _"; | ||||
|     create.append(argv[2]); | ||||
|     create.append("_index ON _"); | ||||
|     create.append(argv[2]); | ||||
|     create.append("_rowgroups(clause)"); | ||||
|     rv = sqlite3_exec(db, create.data(), 0, 0, 0); | ||||
|  | ||||
|     return parquetConnect(db, pAux, argc, argv, ppVtab, pzErr); | ||||
|   } catch (std::bad_alloc& ba) { | ||||
|     return SQLITE_NOMEM; | ||||
|   } | ||||
| } | ||||
|  | ||||
| std::string quoteBlob(const std::vector<unsigned char>& bytes) { | ||||
|   std::ostringstream ss; | ||||
|   ss << "X'" << std::hex; | ||||
|   for(unsigned int i = 0; i < bytes.size(); i++) { | ||||
|     ss << std::setfill('0') << std::setw(2) << (unsigned int)(unsigned char)bytes[i]; | ||||
|   } | ||||
|   ss << "'"; | ||||
|  | ||||
|   return ss.str(); | ||||
| } | ||||
|  | ||||
| void persistConstraints(sqlite3* db, ParquetCursor* cursor) { | ||||
|   for(unsigned int i = 0; i < cursor->getNumConstraints(); i++) { | ||||
|     const Constraint& constraint = cursor->getConstraint(i); | ||||
|     const std::vector<unsigned char>& estimated = constraint.bitmap.estimatedMembership; | ||||
|     const std::vector<unsigned char>& actual = constraint.bitmap.actualMembership; | ||||
|     if(estimated == actual) { | ||||
|       continue; | ||||
|     } | ||||
|     std::string desc = constraint.describe(); | ||||
|  | ||||
|     std::string estimatedStr = quoteBlob(estimated); | ||||
|     std::string actualStr = quoteBlob(actual); | ||||
|  | ||||
|     // This is only advisory, so ignore failures. | ||||
|     char* sql = sqlite3_mprintf( | ||||
|         "INSERT OR REPLACE INTO _%s_rowgroups(clause, estimate, actual) VALUES ('%q', %s, %s)", | ||||
|         cursor->getTable()->getTableName().c_str(), | ||||
|         desc.c_str(), | ||||
|         estimatedStr.c_str(), | ||||
|         actualStr.c_str()); | ||||
|  | ||||
|  | ||||
|     if(sql == NULL) | ||||
|       return; | ||||
|  | ||||
|     sqlite3_exec(db, sql, 0, 0, 0); | ||||
|     sqlite3_free(sql); | ||||
|   } | ||||
| } | ||||
|  | ||||
|  | ||||
| /* | ||||
| ** Destructor for a sqlite3_vtab_cursor_parquet. | ||||
| */ | ||||
| static int parquetClose(sqlite3_vtab_cursor *cur){ | ||||
|   sqlite3_vtab_cursor_parquet* p = (sqlite3_vtab_cursor_parquet*)cur; | ||||
|   p->cursor->close(); | ||||
|   delete p->cursor; | ||||
|   sqlite3_vtab_cursor_parquet* vtab_cursor_parquet = (sqlite3_vtab_cursor_parquet*)cur; | ||||
|   sqlite3_vtab_parquet* vtab_parquet = (sqlite3_vtab_parquet*)(vtab_cursor_parquet->base.pVtab); | ||||
|   ParquetCursor* cursor = vtab_cursor_parquet->cursor; | ||||
|   persistConstraints(vtab_parquet->db, cursor); | ||||
|  | ||||
|   vtab_cursor_parquet->cursor->close(); | ||||
|   delete vtab_cursor_parquet->cursor; | ||||
|   sqlite3_free(cur); | ||||
|   return SQLITE_OK; | ||||
| } | ||||
| @@ -196,7 +281,8 @@ const char* opName(int op) { | ||||
| */ | ||||
| static int parquetNext(sqlite3_vtab_cursor *cur){ | ||||
|   try { | ||||
|     ParquetCursor* cursor = ((sqlite3_vtab_cursor_parquet*)cur)->cursor; | ||||
|     sqlite3_vtab_cursor_parquet* vtab_cursor_parquet = (sqlite3_vtab_cursor_parquet*)cur; | ||||
|     ParquetCursor* cursor = vtab_cursor_parquet->cursor; | ||||
|     cursor->next(); | ||||
|     return SQLITE_OK; | ||||
|   } catch(std::bad_alloc& ba) { | ||||
| @@ -395,6 +481,38 @@ ConstraintOperator constraintOperatorFromSqlite(int op) { | ||||
|   throw std::invalid_argument(ss.str()); | ||||
| } | ||||
|  | ||||
| std::vector<unsigned char> getRowGroupsForClause(sqlite3* db, std::string table, std::string clause) { | ||||
|   std::vector<unsigned char> rv; | ||||
|  | ||||
|   std::unique_ptr<char, void(*)(void*)> sql(sqlite3_mprintf( | ||||
|       "SELECT actual FROM _%s_rowgroups WHERE clause = '%q'", | ||||
|       table.c_str(), | ||||
|       clause.c_str()), sqlite3_free); | ||||
|  | ||||
|   if(sql.get() == NULL) | ||||
|     return rv; | ||||
|  | ||||
|   sqlite3_stmt* pStmt = NULL; | ||||
|   int rc = sqlite3_prepare_v2(db, sql.get(), -1, &pStmt, NULL); | ||||
|   if(rc != 0) | ||||
|     return rv; | ||||
|  | ||||
|   rc = sqlite3_step(pStmt); | ||||
|   if(rc == SQLITE_ROW) { | ||||
|     int size = sqlite3_column_bytes(pStmt, 0); | ||||
|     unsigned char* blob = (unsigned char*)sqlite3_column_blob(pStmt, 0); | ||||
|     // TODO: there is a memory leak here if we get a std::bad_alloc while populating rv; | ||||
|     // we fail to free pStmt | ||||
|     for(int i = 0; i < size; i++) { | ||||
|       rv.push_back(blob[i]); | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   sqlite3_finalize(pStmt); | ||||
|   return rv; | ||||
| } | ||||
|  | ||||
|  | ||||
| /* | ||||
| ** Only a full table scan is supported.  So xFilter simply rewinds to | ||||
| ** the beginning. | ||||
| @@ -407,7 +525,10 @@ static int parquetFilter( | ||||
|   sqlite3_value **argv | ||||
| ){ | ||||
|   try { | ||||
|     ParquetCursor* cursor = ((sqlite3_vtab_cursor_parquet*)cur)->cursor; | ||||
|     sqlite3_vtab_cursor_parquet* vtab_cursor_parquet = (sqlite3_vtab_cursor_parquet*)cur; | ||||
|     sqlite3_vtab_parquet* vtab_parquet = (sqlite3_vtab_parquet*)(vtab_cursor_parquet->base.pVtab); | ||||
|     sqlite3* db = vtab_parquet->db; | ||||
|     ParquetCursor* cursor = vtab_cursor_parquet->cursor; | ||||
|     sqlite3_index_info* indexInfo = (sqlite3_index_info*)idxStr; | ||||
|  | ||||
| #ifdef DEBUG | ||||
| @@ -451,13 +572,40 @@ static int parquetFilter( | ||||
|         type = Null; | ||||
|       } | ||||
|  | ||||
|       Constraint constraint( | ||||
|       std::string columnName = "rowid"; | ||||
|       if(indexInfo->aConstraint[i].iColumn >= 0) { | ||||
|         columnName = cursor->getTable()->columnName(indexInfo->aConstraint[i].iColumn); | ||||
|       } | ||||
|  | ||||
|       RowGroupBitmap bitmap = RowGroupBitmap(cursor->getNumRowGroups()); | ||||
|       Constraint dummy( | ||||
|         bitmap, | ||||
|         indexInfo->aConstraint[i].iColumn, | ||||
|         columnName, | ||||
|         constraintOperatorFromSqlite(indexInfo->aConstraint[i].op), | ||||
|         type, | ||||
|         intValue, | ||||
|         doubleValue, | ||||
|         blobValue); | ||||
|  | ||||
|       std::vector<unsigned char> actual = getRowGroupsForClause(db, cursor->getTable()->getTableName(), dummy.describe()); | ||||
|       if(actual.size() > 0) { | ||||
|         // Initialize the estimate to be the actual -- eventually they'll converge | ||||
|         // and we'll stop writing back to the db. | ||||
|         std::vector<unsigned char> estimate = actual; | ||||
|         bitmap = RowGroupBitmap(estimate, actual); | ||||
|       } | ||||
|  | ||||
|       Constraint constraint( | ||||
|         bitmap, | ||||
|         indexInfo->aConstraint[i].iColumn, | ||||
|         columnName, | ||||
|         constraintOperatorFromSqlite(indexInfo->aConstraint[i].op), | ||||
|         type, | ||||
|         intValue, | ||||
|         doubleValue, | ||||
|         blobValue); | ||||
|  | ||||
|       constraints.push_back(constraint); | ||||
|       j++; | ||||
|     } | ||||
| @@ -555,7 +703,7 @@ static sqlite3_module ParquetModule = { | ||||
|   parquetConnect,           /* xConnect */ | ||||
|   parquetBestIndex,         /* xBestIndex */ | ||||
|   parquetDisconnect,        /* xDisconnect */ | ||||
|   parquetDisconnect,        /* xDestroy */ | ||||
|   parquetDestroy,           /* xDestroy */ | ||||
|   parquetOpen,              /* xOpen - open a cursor */ | ||||
|   parquetClose,             /* xClose - close a cursor */ | ||||
|   parquetFilter,            /* xFilter - configure scan constraints */ | ||||
|   | ||||
| @@ -1,7 +1,6 @@ | ||||
| #include "parquet_cursor.h" | ||||
|  | ||||
| ParquetCursor::ParquetCursor(ParquetTable* table) { | ||||
|   this->table = table; | ||||
| ParquetCursor::ParquetCursor(ParquetTable* table): table(table) { | ||||
|   reader = NULL; | ||||
|   reset(std::vector<Constraint>()); | ||||
| } | ||||
| @@ -518,6 +517,7 @@ bool ParquetCursor::currentRowSatisfiesDoubleFilter(Constraint& constraint) { | ||||
| // This avoids opening rowgroups that can't return useful | ||||
| // data, which provides substantial performance benefits. | ||||
| bool ParquetCursor::currentRowGroupSatisfiesFilter() { | ||||
|   bool overallRv = true; | ||||
|   for(unsigned int i = 0; i < constraints.size(); i++) { | ||||
|     int column = constraints[i].column; | ||||
|     int op = constraints[i].op; | ||||
| @@ -527,9 +527,7 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() { | ||||
|       rv = currentRowGroupSatisfiesRowIdFilter(constraints[i]); | ||||
|     } else { | ||||
|       std::unique_ptr<parquet::ColumnChunkMetaData> md = rowGroupMetadata->ColumnChunk(column); | ||||
|       if(!md->is_stats_set()) { | ||||
|         continue; | ||||
|       } | ||||
|       if(md->is_stats_set()) { | ||||
|         std::shared_ptr<parquet::RowGroupStatistics> stats = md->statistics(); | ||||
|  | ||||
|         // SQLite is much looser with types than you might expect if you | ||||
| @@ -562,12 +560,19 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() { | ||||
|           } | ||||
|         } | ||||
|       } | ||||
|  | ||||
|     if(!rv) | ||||
|       return false; | ||||
|     } | ||||
|  | ||||
|   return true; | ||||
|     // and it with the existing actual, which may have come from a previous run | ||||
|     rv = rv && constraints[i].bitmap.getActualMembership(rowGroupId); | ||||
|     if(!rv) { | ||||
|       constraints[i].bitmap.setEstimatedMembership(rowGroupId, rv); | ||||
|       constraints[i].bitmap.setActualMembership(rowGroupId, rv); | ||||
|     } | ||||
|     overallRv = overallRv && rv; | ||||
|   } | ||||
|  | ||||
| //  printf("rowGroup %d %s\n", rowGroupId, overallRv ? "may satisfy" : "does not satisfy"); | ||||
|   return overallRv; | ||||
| } | ||||
|  | ||||
|  | ||||
| @@ -609,9 +614,22 @@ start: | ||||
|   // Increment rowId so currentRowGroupSatisfiesRowIdFilter can access it; | ||||
|   // it'll get decremented by our caller | ||||
|   rowId++; | ||||
|  | ||||
|   // We're going to scan this row group; reset the expectation of discovering | ||||
|   // a row | ||||
|   for(unsigned int i = 0; i < constraints.size(); i++) { | ||||
|     if(rowGroupId > 0 && constraints[i].rowGroupId == rowGroupId - 1) { | ||||
|       constraints[i].bitmap.setActualMembership(rowGroupId - 1, constraints[i].hadRows); | ||||
|     } | ||||
|     constraints[i].hadRows = false; | ||||
|   } | ||||
|  | ||||
|   if(!currentRowGroupSatisfiesFilter()) | ||||
|     goto start; | ||||
|  | ||||
|   for(unsigned int i = 0; i < constraints.size(); i++) { | ||||
|     constraints[i].rowGroupId = rowGroupId; | ||||
|   } | ||||
|   return true; | ||||
| } | ||||
|  | ||||
| @@ -623,6 +641,7 @@ start: | ||||
| // and the extension, which can add up on a dataset of tens | ||||
| // of millions of rows. | ||||
| bool ParquetCursor::currentRowSatisfiesFilter() { | ||||
|   bool overallRv = true; | ||||
|   for(unsigned int i = 0; i < constraints.size(); i++) { | ||||
|     bool rv = true; | ||||
|     int column = constraints[i].column; | ||||
| @@ -648,13 +667,18 @@ bool ParquetCursor::currentRowSatisfiesFilter() { | ||||
|       } | ||||
|     } | ||||
|  | ||||
|     if(!rv) | ||||
|       return false; | ||||
|     // it defaults to false; so only set it if true | ||||
|     // ideally we'd short-circuit if we'd already set this group as visited | ||||
|     if(rv) { | ||||
|       constraints[i].hadRows = true; | ||||
|     } | ||||
|   return true; | ||||
|     overallRv = overallRv && rv; | ||||
|   } | ||||
|   return overallRv; | ||||
| } | ||||
|  | ||||
| void ParquetCursor::next() { | ||||
|   // Returns true if we've crossed a row group boundary | ||||
| start: | ||||
|   if(rowsLeftInRowGroup == 0) { | ||||
|     if(!nextRowGroup()) { | ||||
| @@ -672,7 +696,6 @@ start: | ||||
|   rowId++; | ||||
|   if(!currentRowSatisfiesFilter()) | ||||
|     goto start; | ||||
|  | ||||
| } | ||||
|  | ||||
| int ParquetCursor::getRowId() { | ||||
| @@ -939,7 +962,7 @@ void ParquetCursor::reset(std::vector<Constraint> constraints) { | ||||
|   // TODO: consider having a long lived handle in ParquetTable that can be borrowed | ||||
|   // without incurring the cost of opening the file from scratch twice | ||||
|   reader = parquet::ParquetFileReader::OpenFile( | ||||
|       table->file.data(), | ||||
|       table->getFile().data(), | ||||
|       true, | ||||
|       parquet::default_reader_properties(), | ||||
|       table->getMetadata()); | ||||
| @@ -955,4 +978,10 @@ void ParquetCursor::reset(std::vector<Constraint> constraints) { | ||||
|   numRowGroups = reader->metadata()->num_row_groups(); | ||||
| } | ||||
|  | ||||
| ParquetTable* ParquetCursor::getTable() { return table; } | ||||
| ParquetTable* ParquetCursor::getTable() const { return table; } | ||||
|  | ||||
| unsigned int ParquetCursor::getNumRowGroups() const { return numRowGroups; } | ||||
| unsigned int ParquetCursor::getNumConstraints() const { return constraints.size(); } | ||||
| const Constraint& ParquetCursor::getConstraint(unsigned int i) const { return constraints[i]; } | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -56,9 +56,12 @@ public: | ||||
|  | ||||
|   void ensureColumn(int col); | ||||
|   bool isNull(int col); | ||||
|   unsigned int getNumRowGroups() const; | ||||
|   unsigned int getNumConstraints() const; | ||||
|   const Constraint& getConstraint(unsigned int i) const; | ||||
|   parquet::Type::type getPhysicalType(int col); | ||||
|   parquet::LogicalType::type getLogicalType(int col); | ||||
|   ParquetTable* getTable(); | ||||
|   ParquetTable* getTable() const; | ||||
|  | ||||
|   int getInt32(int col); | ||||
|   long getInt64(int col); | ||||
|   | ||||
| @@ -1,19 +1,25 @@ | ||||
| #include "parquet_filter.h" | ||||
|  | ||||
| Constraint::Constraint( | ||||
|   RowGroupBitmap bitmap, | ||||
|   int column, | ||||
|   std::string columnName, | ||||
|   ConstraintOperator op, | ||||
|   ValueType type, | ||||
|   int64_t intValue, | ||||
|   double doubleValue, | ||||
|   std::vector<unsigned char> blobValue | ||||
| ) { | ||||
|   this->column = column; | ||||
|   this->op = op; | ||||
|   this->type = type; | ||||
|   this->intValue = intValue; | ||||
|   this->doubleValue = doubleValue; | ||||
|   this->blobValue = blobValue; | ||||
| ): bitmap(bitmap), | ||||
|    column(column), | ||||
|    columnName(columnName), | ||||
|    op(op), | ||||
|    type(type), | ||||
|    intValue(intValue), | ||||
|    doubleValue(doubleValue), | ||||
|    blobValue(blobValue), | ||||
|    hadRows(false) { | ||||
|      RowGroupBitmap bm = bitmap; | ||||
|      this->bitmap = bm; | ||||
|  | ||||
|   if(type == Text) { | ||||
|     stringValue = std::string((char*)&blobValue[0], blobValue.size()); | ||||
| @@ -34,3 +40,72 @@ Constraint::Constraint( | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
| std::string Constraint::describe() const { | ||||
|   std::string rv; | ||||
|   rv.append(columnName); | ||||
|   rv.append(" "); | ||||
|   switch(op) { | ||||
|     case Equal: | ||||
|       rv.append("="); | ||||
|       break; | ||||
|     case GreaterThan: | ||||
|       rv.append(">"); | ||||
|       break; | ||||
|     case LessThanOrEqual: | ||||
|       rv.append("<="); | ||||
|       break; | ||||
|     case LessThan: | ||||
|       rv.append("<"); | ||||
|       break; | ||||
|     case GreaterThanOrEqual: | ||||
|       rv.append(">="); | ||||
|       break; | ||||
|     case Match: | ||||
|       rv.append("MATCH"); | ||||
|       break; | ||||
|     case Like: | ||||
|       rv.append("LIKE"); | ||||
|       break; | ||||
|     case Glob: | ||||
|       rv.append("GLOB"); | ||||
|       break; | ||||
|     case Regexp: | ||||
|       rv.append("REGEXP"); | ||||
|       break; | ||||
|     case NotEqual: | ||||
|       rv.append("<>"); | ||||
|       break; | ||||
|     case IsNot: | ||||
|       rv.append("IS NOT"); | ||||
|       break; | ||||
|     case IsNotNull: | ||||
|       rv.append("IS NOT NULL"); | ||||
|       break; | ||||
|     case IsNull: | ||||
|       rv.append("IS NULL"); | ||||
|       break; | ||||
|     case Is: | ||||
|       rv.append("IS"); | ||||
|       break; | ||||
|   } | ||||
|   rv.append(" "); | ||||
|  | ||||
|   switch(type) { | ||||
|     case Null: | ||||
|       rv.append("NULL"); | ||||
|       break; | ||||
|     case Integer: | ||||
|       rv.append(std::to_string(intValue)); | ||||
|       break; | ||||
|     case Double: | ||||
|       rv.append(std::to_string(doubleValue)); | ||||
|       break; | ||||
|     case Blob: | ||||
|       break; | ||||
|     case Text: | ||||
|       rv.append(stringValue); | ||||
|       break; | ||||
|   } | ||||
|   return rv; | ||||
| } | ||||
|   | ||||
| @@ -30,11 +30,63 @@ enum ValueType { | ||||
|   Text | ||||
| }; | ||||
|  | ||||
| class RowGroupBitmap { | ||||
|   void setBit(std::vector<unsigned char>& membership, unsigned int rowGroup, bool isSet) { | ||||
|     int byte = rowGroup / 8; | ||||
|     int offset = rowGroup % 8; | ||||
|     unsigned char c = membership[byte]; | ||||
|     c &= ~(1UL << offset); | ||||
|     if(isSet) { | ||||
|       c |= 1UL << offset; | ||||
|     } | ||||
|     membership[byte] = c; | ||||
|   } | ||||
| // Compares estimated rowGroupFilter results against observed results | ||||
| // when we explored the row group. This lets us cache  | ||||
| public: | ||||
|   RowGroupBitmap(unsigned int totalRowGroups) { | ||||
|     // Initialize everything to assume that all row groups match. | ||||
|     // As we discover otherwise, we'll update that assumption. | ||||
|     for(unsigned int i = 0; i < (totalRowGroups + 7) / 8; i++) { | ||||
|       estimatedMembership.push_back(0xFF); | ||||
|       actualMembership.push_back(0xFF); | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   RowGroupBitmap( | ||||
|       std::vector<unsigned char> estimatedMembership, | ||||
|       std::vector<unsigned char> actualMembership) : | ||||
|     estimatedMembership(estimatedMembership), | ||||
|     actualMembership(actualMembership) { | ||||
|   } | ||||
|  | ||||
|   std::vector<unsigned char> estimatedMembership; | ||||
|   std::vector<unsigned char> actualMembership; | ||||
|   // Pass false only if definitely does not have rows | ||||
|   void setEstimatedMembership(unsigned int rowGroup, bool hasRows) { | ||||
|     setBit(estimatedMembership, rowGroup, hasRows); | ||||
|   } | ||||
|  | ||||
|   // Pass false only after exhausting all rows | ||||
|   void setActualMembership(unsigned int rowGroup, bool hadRows) { | ||||
|     setBit(actualMembership, rowGroup, hadRows); | ||||
|   } | ||||
|  | ||||
|   bool getActualMembership(unsigned int rowGroup) { | ||||
|     int byte = rowGroup / 8; | ||||
|     int offset = rowGroup % 8; | ||||
|  | ||||
|     return (actualMembership[byte] >> offset) & 1U; | ||||
|   } | ||||
| }; | ||||
|  | ||||
| class Constraint { | ||||
| public: | ||||
|   // Kind of a messy constructor function, but it's just for internal use, so whatever. | ||||
|   Constraint( | ||||
|     RowGroupBitmap bitmap, | ||||
|     int column, | ||||
|     std::string columnName, | ||||
|     ConstraintOperator op, | ||||
|     ValueType type, | ||||
|     int64_t intValue, | ||||
| @@ -42,7 +94,9 @@ public: | ||||
|     std::vector<unsigned char> blobValue | ||||
|   ); | ||||
|  | ||||
|   RowGroupBitmap bitmap; | ||||
|   int column; // underlying column in the query | ||||
|   std::string columnName; | ||||
|   ConstraintOperator op; | ||||
|   ValueType type; | ||||
|  | ||||
| @@ -54,6 +108,15 @@ public: | ||||
|  | ||||
|   // Only set when stringValue is set and op == Like | ||||
|   std::string likeStringValue; | ||||
|  | ||||
|   // A unique identifier for this constraint, e.g. | ||||
|   // col0 = 'Dawson Creek' | ||||
|   std::string describe() const; | ||||
|  | ||||
|   // This is a temp field used while evaluating if a rowgroup had rows | ||||
|   // that matched this constraint. | ||||
|   int rowGroupId; | ||||
|   bool hadRows; | ||||
| }; | ||||
|  | ||||
| #endif | ||||
|   | ||||
| @@ -2,9 +2,7 @@ | ||||
|  | ||||
| #include "parquet/api/reader.h" | ||||
|  | ||||
| ParquetTable::ParquetTable(std::string file) { | ||||
|   this->file = file; | ||||
|  | ||||
| ParquetTable::ParquetTable(std::string file, std::string tableName): file(file), tableName(tableName) { | ||||
|   std::unique_ptr<parquet::ParquetFileReader> reader = parquet::ParquetFileReader::OpenFile(file.data()); | ||||
|   metadata = reader->metadata(); | ||||
| } | ||||
| @@ -138,3 +136,6 @@ std::string ParquetTable::CreateStatement() { | ||||
| } | ||||
|  | ||||
| std::shared_ptr<parquet::FileMetaData> ParquetTable::getMetadata() { return metadata; } | ||||
|  | ||||
| const std::string& ParquetTable::getFile() { return file; } | ||||
| const std::string& ParquetTable::getTableName() { return tableName; } | ||||
|   | ||||
| @@ -6,16 +6,19 @@ | ||||
| #include "parquet/api/reader.h" | ||||
|  | ||||
| class ParquetTable { | ||||
|   std::string file; | ||||
|   std::string tableName; | ||||
|   std::vector<std::string> columnNames; | ||||
|   std::shared_ptr<parquet::FileMetaData> metadata; | ||||
|  | ||||
|  | ||||
| public: | ||||
|   ParquetTable(std::string file); | ||||
|   ParquetTable(std::string file, std::string tableName); | ||||
|   std::string CreateStatement(); | ||||
|   std::string file; | ||||
|   std::string columnName(int idx); | ||||
|   std::shared_ptr<parquet::FileMetaData> getMetadata(); | ||||
|   const std::string& getFile(); | ||||
|   const std::string& getTableName(); | ||||
| }; | ||||
|  | ||||
| #endif | ||||
|   | ||||
| @@ -47,7 +47,7 @@ main() { | ||||
|   fi | ||||
|  | ||||
|   cat "$root"/parquet-generator/*.sql > "$root"/testcase-bootstrap.sql | ||||
|   rm test.db | ||||
|   rm -f test.db | ||||
|   "$root"/sqlite/sqlite3 test.db -init "$root"/testcase-bootstrap.sql < /dev/null | ||||
|   if [ ! -v NO_DEBUG ] && [ "$(cat testcases.txt | wc -l)" == "1" ]; then | ||||
|     set -x | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Colin Dellow
					Colin Dellow