Mirror of https://github.com/cldellow/sqlite-parquet-vtable.git, synced 2025-10-31 02:19:56 +00:00

	Run a formatting pass with clang-format to minimize future git churn
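
The page does not record the exact command used; as a sketch (assuming the sources live under src/ and that the style is supplied by a .clang-format file or a -style flag, neither of which is shown here), a pass like this can be run with:

    clang-format -i src/*.cc src/*.h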
Changed: src/parquet.cc (258)

							| @@ -11,19 +11,19 @@ | |||||||
|  */ |  */ | ||||||
| #include <sqlite3ext.h> | #include <sqlite3ext.h> | ||||||
| SQLITE_EXTENSION_INIT1 | SQLITE_EXTENSION_INIT1 | ||||||
| #include <string.h> |  | ||||||
| #include <stdlib.h> |  | ||||||
| #include <assert.h> | #include <assert.h> | ||||||
| #include <stdarg.h> |  | ||||||
| #include <ctype.h> | #include <ctype.h> | ||||||
| #include <stdio.h> |  | ||||||
| #include <iomanip> | #include <iomanip> | ||||||
| #include <sys/time.h> |  | ||||||
| #include <memory> | #include <memory> | ||||||
|  | #include <stdarg.h> | ||||||
|  | #include <stdio.h> | ||||||
|  | #include <stdlib.h> | ||||||
|  | #include <string.h> | ||||||
|  | #include <sys/time.h> | ||||||
|  |  | ||||||
| #include "parquet_table.h" |  | ||||||
| #include "parquet_cursor.h" | #include "parquet_cursor.h" | ||||||
| #include "parquet_filter.h" | #include "parquet_filter.h" | ||||||
|  | #include "parquet_table.h" | ||||||
|  |  | ||||||
| //#define DEBUG | //#define DEBUG | ||||||
|  |  | ||||||
| @@ -52,7 +52,6 @@ typedef struct sqlite3_vtab_parquet { | |||||||
|   sqlite3 *db; |   sqlite3 *db; | ||||||
| } sqlite3_vtab_parquet; | } sqlite3_vtab_parquet; | ||||||
|  |  | ||||||
|  |  | ||||||
| /* A cursor for the Parquet virtual table */ | /* A cursor for the Parquet virtual table */ | ||||||
| typedef struct sqlite3_vtab_cursor_parquet { | typedef struct sqlite3_vtab_cursor_parquet { | ||||||
|   sqlite3_vtab_cursor base; /* Base class.  Must be first */ |   sqlite3_vtab_cursor base; /* Base class.  Must be first */ | ||||||
| @@ -84,17 +83,13 @@ static int parquetDisconnect(sqlite3_vtab *pVtab){ | |||||||
|   return SQLITE_OK; |   return SQLITE_OK; | ||||||
| } | } | ||||||
|  |  | ||||||
| static int parquetConnect( | static int parquetConnect(sqlite3 *db, void *pAux, int argc, | ||||||
|   sqlite3 *db, |                           const char *const *argv, sqlite3_vtab **ppVtab, | ||||||
|   void *pAux, |                           char **pzErr) { | ||||||
|   int argc, |  | ||||||
|   const char *const*argv, |  | ||||||
|   sqlite3_vtab **ppVtab, |  | ||||||
|   char **pzErr |  | ||||||
| ){ |  | ||||||
|   try { |   try { | ||||||
|     if (argc != 4 || strlen(argv[3]) < 2) { |     if (argc != 4 || strlen(argv[3]) < 2) { | ||||||
|       *pzErr = sqlite3_mprintf("must provide exactly one argument, the path to a parquet file"); |       *pzErr = sqlite3_mprintf( | ||||||
|  |           "must provide exactly one argument, the path to a parquet file"); | ||||||
|       return SQLITE_ERROR; |       return SQLITE_ERROR; | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -134,13 +129,9 @@ static int parquetConnect( | |||||||
| ** The xConnect and xCreate methods do the same thing, but they must be | ** The xConnect and xCreate methods do the same thing, but they must be | ||||||
| ** different so that the virtual table is not an eponymous virtual table. | ** different so that the virtual table is not an eponymous virtual table. | ||||||
| */ | */ | ||||||
| static int parquetCreate( | static int parquetCreate(sqlite3 *db, void *pAux, int argc, | ||||||
|   sqlite3 *db, |                          const char *const *argv, sqlite3_vtab **ppVtab, | ||||||
|   void *pAux, |                          char **pzErr) { | ||||||
|   int argc, const char *const*argv, |  | ||||||
|   sqlite3_vtab **ppVtab, |  | ||||||
|   char **pzErr |  | ||||||
| ){ |  | ||||||
|   try { |   try { | ||||||
|     // Create shadow table for storing constraint -> rowid mappings |     // Create shadow table for storing constraint -> rowid mappings | ||||||
|     std::string create = "CREATE TABLE IF NOT EXISTS _"; |     std::string create = "CREATE TABLE IF NOT EXISTS _"; | ||||||
| @@ -167,7 +158,8 @@ std::string quoteBlob(const std::vector<unsigned char>& bytes) { | |||||||
|   std::ostringstream ss; |   std::ostringstream ss; | ||||||
|   ss << "X'" << std::hex; |   ss << "X'" << std::hex; | ||||||
|   for (unsigned int i = 0; i < bytes.size(); i++) { |   for (unsigned int i = 0; i < bytes.size(); i++) { | ||||||
|     ss << std::setfill('0') << std::setw(2) << (unsigned int)(unsigned char)bytes[i]; |     ss << std::setfill('0') << std::setw(2) | ||||||
|  |        << (unsigned int)(unsigned char)bytes[i]; | ||||||
|   } |   } | ||||||
|   ss << "'"; |   ss << "'"; | ||||||
|  |  | ||||||
| @@ -177,8 +169,10 @@ std::string quoteBlob(const std::vector<unsigned char>& bytes) { | |||||||
| void persistConstraints(sqlite3 *db, ParquetCursor *cursor) { | void persistConstraints(sqlite3 *db, ParquetCursor *cursor) { | ||||||
|   for (unsigned int i = 0; i < cursor->getNumConstraints(); i++) { |   for (unsigned int i = 0; i < cursor->getNumConstraints(); i++) { | ||||||
|     const Constraint &constraint = cursor->getConstraint(i); |     const Constraint &constraint = cursor->getConstraint(i); | ||||||
|     const std::vector<unsigned char>& estimated = constraint.bitmap.estimatedMembership; |     const std::vector<unsigned char> &estimated = | ||||||
|     const std::vector<unsigned char>& actual = constraint.bitmap.actualMembership; |         constraint.bitmap.estimatedMembership; | ||||||
|  |     const std::vector<unsigned char> &actual = | ||||||
|  |         constraint.bitmap.actualMembership; | ||||||
|     if (estimated == actual) { |     if (estimated == actual) { | ||||||
|       continue; |       continue; | ||||||
|     } |     } | ||||||
| @@ -188,13 +182,11 @@ void persistConstraints(sqlite3* db, ParquetCursor* cursor) { | |||||||
|     std::string actualStr = quoteBlob(actual); |     std::string actualStr = quoteBlob(actual); | ||||||
|  |  | ||||||
|     // This is only advisory, so ignore failures. |     // This is only advisory, so ignore failures. | ||||||
|     char* sql = sqlite3_mprintf( |     char *sql = | ||||||
|         "INSERT OR REPLACE INTO _%s_rowgroups(clause, estimate, actual) VALUES ('%q', %s, %s)", |         sqlite3_mprintf("INSERT OR REPLACE INTO _%s_rowgroups(clause, " | ||||||
|  |                         "estimate, actual) VALUES ('%q', %s, %s)", | ||||||
|                         cursor->getTable()->getTableName().c_str(), |                         cursor->getTable()->getTableName().c_str(), | ||||||
|         desc.c_str(), |                         desc.c_str(), estimatedStr.c_str(), actualStr.c_str()); | ||||||
|         estimatedStr.c_str(), |  | ||||||
|         actualStr.c_str()); |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     if (sql == NULL) |     if (sql == NULL) | ||||||
|       return; |       return; | ||||||
| @@ -204,12 +196,12 @@ void persistConstraints(sqlite3* db, ParquetCursor* cursor) { | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| /* | /* | ||||||
| ** Destructor for a sqlite3_vtab_cursor_parquet. | ** Destructor for a sqlite3_vtab_cursor_parquet. | ||||||
| */ | */ | ||||||
| static int parquetClose(sqlite3_vtab_cursor *cur) { | static int parquetClose(sqlite3_vtab_cursor *cur) { | ||||||
|   sqlite3_vtab_cursor_parquet* vtab_cursor_parquet = (sqlite3_vtab_cursor_parquet*)cur; |   sqlite3_vtab_cursor_parquet *vtab_cursor_parquet = | ||||||
|  |       (sqlite3_vtab_cursor_parquet *)cur; | ||||||
|   vtab_cursor_parquet->cursor->close(); |   vtab_cursor_parquet->cursor->close(); | ||||||
|   delete vtab_cursor_parquet->cursor; |   delete vtab_cursor_parquet->cursor; | ||||||
|   sqlite3_free(cur); |   sqlite3_free(cur); | ||||||
| @@ -222,7 +214,8 @@ static int parquetClose(sqlite3_vtab_cursor *cur){ | |||||||
| static int parquetOpen(sqlite3_vtab *p, sqlite3_vtab_cursor **ppCursor) { | static int parquetOpen(sqlite3_vtab *p, sqlite3_vtab_cursor **ppCursor) { | ||||||
|   try { |   try { | ||||||
|     std::unique_ptr<sqlite3_vtab_cursor_parquet, void (*)(void *)> cursor( |     std::unique_ptr<sqlite3_vtab_cursor_parquet, void (*)(void *)> cursor( | ||||||
|         (sqlite3_vtab_cursor_parquet*)sqlite3_malloc(sizeof(sqlite3_vtab_cursor_parquet)), |         (sqlite3_vtab_cursor_parquet *)sqlite3_malloc( | ||||||
|  |             sizeof(sqlite3_vtab_cursor_parquet)), | ||||||
|         sqlite3_free); |         sqlite3_free); | ||||||
|     memset(cursor.get(), 0, sizeof(*cursor.get())); |     memset(cursor.get(), 0, sizeof(*cursor.get())); | ||||||
|  |  | ||||||
| @@ -238,14 +231,14 @@ static int parquetOpen(sqlite3_vtab *p, sqlite3_vtab_cursor **ppCursor){ | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| /* | /* | ||||||
| ** Advance a sqlite3_vtab_cursor_parquet to its next row of input. | ** Advance a sqlite3_vtab_cursor_parquet to its next row of input. | ||||||
| ** Set the EOF marker if we reach the end of input. | ** Set the EOF marker if we reach the end of input. | ||||||
| */ | */ | ||||||
| static int parquetNext(sqlite3_vtab_cursor *cur) { | static int parquetNext(sqlite3_vtab_cursor *cur) { | ||||||
|   try { |   try { | ||||||
|     sqlite3_vtab_cursor_parquet* vtab_cursor_parquet = (sqlite3_vtab_cursor_parquet*)cur; |     sqlite3_vtab_cursor_parquet *vtab_cursor_parquet = | ||||||
|  |         (sqlite3_vtab_cursor_parquet *)cur; | ||||||
|     ParquetCursor *cursor = vtab_cursor_parquet->cursor; |     ParquetCursor *cursor = vtab_cursor_parquet->cursor; | ||||||
|     cursor->next(); |     cursor->next(); | ||||||
|     return SQLITE_OK; |     return SQLITE_OK; | ||||||
| @@ -260,8 +253,8 @@ static int parquetNext(sqlite3_vtab_cursor *cur){ | |||||||
| ** Return values of columns for the row at which the sqlite3_vtab_cursor_parquet | ** Return values of columns for the row at which the sqlite3_vtab_cursor_parquet | ||||||
| ** is currently pointing. | ** is currently pointing. | ||||||
| */ | */ | ||||||
| static int parquetColumn( | static int | ||||||
|   sqlite3_vtab_cursor *cur,   /* The cursor */ | parquetColumn(sqlite3_vtab_cursor *cur, /* The cursor */ | ||||||
|               sqlite3_context *ctx, /* First argument to sqlite3_result_...() */ |               sqlite3_context *ctx, /* First argument to sqlite3_result_...() */ | ||||||
|               int col               /* Which column to return */ |               int col               /* Which column to return */ | ||||||
| ) { | ) { | ||||||
| @@ -274,24 +267,22 @@ static int parquetColumn( | |||||||
|     } else { |     } else { | ||||||
|       switch (cursor->getPhysicalType(col)) { |       switch (cursor->getPhysicalType(col)) { | ||||||
|       case parquet::Type::BOOLEAN: |       case parquet::Type::BOOLEAN: | ||||||
|         case parquet::Type::INT32: |       case parquet::Type::INT32: { | ||||||
|         { |  | ||||||
|         int rv = cursor->getInt32(col); |         int rv = cursor->getInt32(col); | ||||||
|         sqlite3_result_int(ctx, rv); |         sqlite3_result_int(ctx, rv); | ||||||
|         break; |         break; | ||||||
|       } |       } | ||||||
|       case parquet::Type::FLOAT: |       case parquet::Type::FLOAT: | ||||||
|         case parquet::Type::DOUBLE: |       case parquet::Type::DOUBLE: { | ||||||
|         { |  | ||||||
|         double rv = cursor->getDouble(col); |         double rv = cursor->getDouble(col); | ||||||
|         sqlite3_result_double(ctx, rv); |         sqlite3_result_double(ctx, rv); | ||||||
|         break; |         break; | ||||||
|       } |       } | ||||||
|         case parquet::Type::BYTE_ARRAY: |       case parquet::Type::BYTE_ARRAY: { | ||||||
|         { |  | ||||||
|         parquet::ByteArray *rv = cursor->getByteArray(col); |         parquet::ByteArray *rv = cursor->getByteArray(col); | ||||||
|         if (cursor->getLogicalType(col) == parquet::LogicalType::UTF8) { |         if (cursor->getLogicalType(col) == parquet::LogicalType::UTF8) { | ||||||
|             sqlite3_result_text(ctx, (const char*)rv->ptr, rv->len, SQLITE_TRANSIENT); |           sqlite3_result_text(ctx, (const char *)rv->ptr, rv->len, | ||||||
|  |                               SQLITE_TRANSIENT); | ||||||
|         } else { |         } else { | ||||||
|           sqlite3_result_blob(ctx, (void *)rv->ptr, rv->len, SQLITE_TRANSIENT); |           sqlite3_result_blob(ctx, (void *)rv->ptr, rv->len, SQLITE_TRANSIENT); | ||||||
|         } |         } | ||||||
| @@ -300,14 +291,12 @@ static int parquetColumn( | |||||||
|       case parquet::Type::INT96: |       case parquet::Type::INT96: | ||||||
|         // This type exists to store timestamps in nanoseconds due to legacy |         // This type exists to store timestamps in nanoseconds due to legacy | ||||||
|         // reasons. We just interpret it as a timestamp in milliseconds. |         // reasons. We just interpret it as a timestamp in milliseconds. | ||||||
|         case parquet::Type::INT64: |       case parquet::Type::INT64: { | ||||||
|         { |  | ||||||
|         long rv = cursor->getInt64(col); |         long rv = cursor->getInt64(col); | ||||||
|         sqlite3_result_int64(ctx, rv); |         sqlite3_result_int64(ctx, rv); | ||||||
|         break; |         break; | ||||||
|       } |       } | ||||||
|         case parquet::Type::FIXED_LEN_BYTE_ARRAY: |       case parquet::Type::FIXED_LEN_BYTE_ARRAY: { | ||||||
|         { |  | ||||||
|         parquet::ByteArray *rv = cursor->getByteArray(col); |         parquet::ByteArray *rv = cursor->getByteArray(col); | ||||||
|         sqlite3_result_blob(ctx, (void *)rv->ptr, rv->len, SQLITE_TRANSIENT); |         sqlite3_result_blob(ctx, (void *)rv->ptr, rv->len, SQLITE_TRANSIENT); | ||||||
|         break; |         break; | ||||||
| @@ -316,8 +305,9 @@ static int parquetColumn( | |||||||
|         // Should be impossible to get here as we should have forbidden this at |         // Should be impossible to get here as we should have forbidden this at | ||||||
|         // CREATE time -- maybe file changed underneath us? |         // CREATE time -- maybe file changed underneath us? | ||||||
|         std::ostringstream ss; |         std::ostringstream ss; | ||||||
|           ss << __FILE__ << ":" << __LINE__ << ": column " << col << " has unsupported type: " << |         ss << __FILE__ << ":" << __LINE__ << ": column " << col | ||||||
|             parquet::TypeToString(cursor->getPhysicalType(col)); |            << " has unsupported type: " | ||||||
|  |            << parquet::TypeToString(cursor->getPhysicalType(col)); | ||||||
|  |  | ||||||
|         throw std::invalid_argument(ss.str()); |         throw std::invalid_argument(ss.str()); | ||||||
|         break; |         break; | ||||||
| @@ -347,8 +337,10 @@ static int parquetRowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid){ | |||||||
| static int parquetEof(sqlite3_vtab_cursor *cur) { | static int parquetEof(sqlite3_vtab_cursor *cur) { | ||||||
|   ParquetCursor *cursor = ((sqlite3_vtab_cursor_parquet *)cur)->cursor; |   ParquetCursor *cursor = ((sqlite3_vtab_cursor_parquet *)cur)->cursor; | ||||||
|   if (cursor->eof()) { |   if (cursor->eof()) { | ||||||
|     sqlite3_vtab_cursor_parquet* vtab_cursor_parquet = (sqlite3_vtab_cursor_parquet*)cur; |     sqlite3_vtab_cursor_parquet *vtab_cursor_parquet = | ||||||
|     sqlite3_vtab_parquet* vtab_parquet = (sqlite3_vtab_parquet*)(vtab_cursor_parquet->base.pVtab); |         (sqlite3_vtab_cursor_parquet *)cur; | ||||||
|  |     sqlite3_vtab_parquet *vtab_parquet = | ||||||
|  |         (sqlite3_vtab_parquet *)(vtab_cursor_parquet->base.pVtab); | ||||||
|     persistConstraints(vtab_parquet->db, cursor); |     persistConstraints(vtab_parquet->db, cursor); | ||||||
|     return 1; |     return 1; | ||||||
|   } |   } | ||||||
| @@ -391,7 +383,8 @@ const char* opName(int op) { | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| void debugConstraints(sqlite3_index_info *pIdxInfo, ParquetTable *table, int argc, sqlite3_value** argv) { | void debugConstraints(sqlite3_index_info *pIdxInfo, ParquetTable *table, | ||||||
|  |                       int argc, sqlite3_value **argv) { | ||||||
|   printf("debugConstraints, argc=%d\n", argc); |   printf("debugConstraints, argc=%d\n", argc); | ||||||
|   int j = 0; |   int j = 0; | ||||||
|   for (int i = 0; i < pIdxInfo->nConstraint; i++) { |   for (int i = 0; i < pIdxInfo->nConstraint; i++) { | ||||||
| @@ -399,51 +392,44 @@ void debugConstraints(sqlite3_index_info *pIdxInfo, ParquetTable *table, int arg | |||||||
|     if (argv != NULL && pIdxInfo->aConstraint[i].usable) { |     if (argv != NULL && pIdxInfo->aConstraint[i].usable) { | ||||||
|       int type = sqlite3_value_type(argv[j]); |       int type = sqlite3_value_type(argv[j]); | ||||||
|       switch (type) { |       switch (type) { | ||||||
|         case SQLITE_INTEGER: |       case SQLITE_INTEGER: { | ||||||
|         { |  | ||||||
|         sqlite3_int64 rv = sqlite3_value_int64(argv[j]); |         sqlite3_int64 rv = sqlite3_value_int64(argv[j]); | ||||||
|         std::ostringstream ss; |         std::ostringstream ss; | ||||||
|         ss << rv; |         ss << rv; | ||||||
|         valueStr = ss.str(); |         valueStr = ss.str(); | ||||||
|         break; |         break; | ||||||
|       } |       } | ||||||
|         case SQLITE_FLOAT: |       case SQLITE_FLOAT: { | ||||||
|         { |  | ||||||
|         double rv = sqlite3_value_double(argv[j]); |         double rv = sqlite3_value_double(argv[j]); | ||||||
|         std::ostringstream ss; |         std::ostringstream ss; | ||||||
|         ss << rv; |         ss << rv; | ||||||
|         valueStr = ss.str(); |         valueStr = ss.str(); | ||||||
|         break; |         break; | ||||||
|       } |       } | ||||||
|         case SQLITE_TEXT: |       case SQLITE_TEXT: { | ||||||
|         { |  | ||||||
|         const unsigned char *rv = sqlite3_value_text(argv[j]); |         const unsigned char *rv = sqlite3_value_text(argv[j]); | ||||||
|         std::ostringstream ss; |         std::ostringstream ss; | ||||||
|         ss << "'" << rv << "'"; |         ss << "'" << rv << "'"; | ||||||
|         valueStr = ss.str(); |         valueStr = ss.str(); | ||||||
|         break; |         break; | ||||||
|       } |       } | ||||||
|         case SQLITE_BLOB: |       case SQLITE_BLOB: { | ||||||
|         { |  | ||||||
|         int sizeBytes = sqlite3_value_bytes(argv[j]); |         int sizeBytes = sqlite3_value_bytes(argv[j]); | ||||||
|         std::ostringstream ss; |         std::ostringstream ss; | ||||||
|         ss << "'..." << sizeBytes << "-byte blob...'"; |         ss << "'..." << sizeBytes << "-byte blob...'"; | ||||||
|         valueStr = ss.str(); |         valueStr = ss.str(); | ||||||
|         break; |         break; | ||||||
|       } |       } | ||||||
|         case SQLITE_NULL: |       case SQLITE_NULL: { | ||||||
|         { |  | ||||||
|         valueStr = "NULL"; |         valueStr = "NULL"; | ||||||
|         break; |         break; | ||||||
|       } |       } | ||||||
|       } |       } | ||||||
|       j++; |       j++; | ||||||
|     } |     } | ||||||
|     printf("  constraint %d: col %s %s %s, usable %d\n", |     printf("  constraint %d: col %s %s %s, usable %d\n", i, | ||||||
|         i, |  | ||||||
|            table->columnName(pIdxInfo->aConstraint[i].iColumn).data(), |            table->columnName(pIdxInfo->aConstraint[i].iColumn).data(), | ||||||
|         opName(pIdxInfo->aConstraint[i].op), |            opName(pIdxInfo->aConstraint[i].op), valueStr.data(), | ||||||
|         valueStr.data(), |  | ||||||
|            pIdxInfo->aConstraint[i].usable); |            pIdxInfo->aConstraint[i].usable); | ||||||
|   } |   } | ||||||
| } | } | ||||||
| @@ -482,13 +468,14 @@ ConstraintOperator constraintOperatorFromSqlite(int op) { | |||||||
|   throw std::invalid_argument(ss.str()); |   throw std::invalid_argument(ss.str()); | ||||||
| } | } | ||||||
|  |  | ||||||
| std::vector<unsigned char> getRowGroupsForClause(sqlite3* db, std::string table, std::string clause) { | std::vector<unsigned char> getRowGroupsForClause(sqlite3 *db, std::string table, | ||||||
|  |                                                  std::string clause) { | ||||||
|   std::vector<unsigned char> rv; |   std::vector<unsigned char> rv; | ||||||
|  |  | ||||||
|   std::unique_ptr<char, void(*)(void*)> sql(sqlite3_mprintf( |   std::unique_ptr<char, void (*)(void *)> sql( | ||||||
|       "SELECT actual FROM _%s_rowgroups WHERE clause = '%q'", |       sqlite3_mprintf("SELECT actual FROM _%s_rowgroups WHERE clause = '%q'", | ||||||
|       table.c_str(), |                       table.c_str(), clause.c_str()), | ||||||
|       clause.c_str()), sqlite3_free); |       sqlite3_free); | ||||||
|  |  | ||||||
|   if (sql.get() == NULL) |   if (sql.get() == NULL) | ||||||
|     return rv; |     return rv; | ||||||
| @@ -502,8 +489,8 @@ std::vector<unsigned char> getRowGroupsForClause(sqlite3* db, std::string table, | |||||||
|   if (rc == SQLITE_ROW) { |   if (rc == SQLITE_ROW) { | ||||||
|     int size = sqlite3_column_bytes(pStmt, 0); |     int size = sqlite3_column_bytes(pStmt, 0); | ||||||
|     unsigned char *blob = (unsigned char *)sqlite3_column_blob(pStmt, 0); |     unsigned char *blob = (unsigned char *)sqlite3_column_blob(pStmt, 0); | ||||||
|     // TODO: there is a memory leak here if we get a std::bad_alloc while populating rv; |     // TODO: there is a memory leak here if we get a std::bad_alloc while | ||||||
|     // we fail to free pStmt |     // populating rv; we fail to free pStmt | ||||||
|     for (int i = 0; i < size; i++) { |     for (int i = 0; i < size; i++) { | ||||||
|       rv.push_back(blob[i]); |       rv.push_back(blob[i]); | ||||||
|     } |     } | ||||||
| @@ -513,21 +500,17 @@ std::vector<unsigned char> getRowGroupsForClause(sqlite3* db, std::string table, | |||||||
|   return rv; |   return rv; | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| /* | /* | ||||||
| ** Only a full table scan is supported.  So xFilter simply rewinds to | ** Only a full table scan is supported.  So xFilter simply rewinds to | ||||||
| ** the beginning. | ** the beginning. | ||||||
| */ | */ | ||||||
| static int parquetFilter( | static int parquetFilter(sqlite3_vtab_cursor *cur, int idxNum, | ||||||
|   sqlite3_vtab_cursor *cur, |                          const char *idxStr, int argc, sqlite3_value **argv) { | ||||||
|   int idxNum, |  | ||||||
|   const char *idxStr, |  | ||||||
|   int argc, |  | ||||||
|   sqlite3_value **argv |  | ||||||
| ){ |  | ||||||
|   try { |   try { | ||||||
|     sqlite3_vtab_cursor_parquet* vtab_cursor_parquet = (sqlite3_vtab_cursor_parquet*)cur; |     sqlite3_vtab_cursor_parquet *vtab_cursor_parquet = | ||||||
|     sqlite3_vtab_parquet* vtab_parquet = (sqlite3_vtab_parquet*)(vtab_cursor_parquet->base.pVtab); |         (sqlite3_vtab_cursor_parquet *)cur; | ||||||
|  |     sqlite3_vtab_parquet *vtab_parquet = | ||||||
|  |         (sqlite3_vtab_parquet *)(vtab_cursor_parquet->base.pVtab); | ||||||
|     sqlite3 *db = vtab_parquet->db; |     sqlite3 *db = vtab_parquet->db; | ||||||
|     ParquetCursor *cursor = vtab_cursor_parquet->cursor; |     ParquetCursor *cursor = vtab_cursor_parquet->cursor; | ||||||
|     sqlite3_index_info *indexInfo = (sqlite3_index_info *)idxStr; |     sqlite3_index_info *indexInfo = (sqlite3_index_info *)idxStr; | ||||||
| @@ -539,7 +522,8 @@ static int parquetFilter( | |||||||
|         (unsigned long long)(tv.tv_sec) * 1000 + |         (unsigned long long)(tv.tv_sec) * 1000 + | ||||||
|         (unsigned long long)(tv.tv_usec) / 1000; |         (unsigned long long)(tv.tv_usec) / 1000; | ||||||
|  |  | ||||||
|     printf("%llu xFilter: idxNum=%d, idxStr=%lu, argc=%d\n", millisecondsSinceEpoch, idxNum, (long unsigned int)idxStr, argc); |     printf("%llu xFilter: idxNum=%d, idxStr=%lu, argc=%d\n", | ||||||
|  |            millisecondsSinceEpoch, idxNum, (long unsigned int)idxStr, argc); | ||||||
|     debugConstraints(indexInfo, cursor->getTable(), argc, argv); |     debugConstraints(indexInfo, cursor->getTable(), argc, argv); | ||||||
| #endif | #endif | ||||||
|     std::vector<Constraint> constraints; |     std::vector<Constraint> constraints; | ||||||
| @@ -571,7 +555,8 @@ static int parquetFilter( | |||||||
|       } else if (sqliteType == SQLITE_BLOB) { |       } else if (sqliteType == SQLITE_BLOB) { | ||||||
|         type = Blob; |         type = Blob; | ||||||
|         int len = sqlite3_value_bytes(argv[j]); |         int len = sqlite3_value_bytes(argv[j]); | ||||||
|         const unsigned char* ptr = (const unsigned char*)sqlite3_value_blob(argv[j]); |         const unsigned char *ptr = | ||||||
|  |             (const unsigned char *)sqlite3_value_blob(argv[j]); | ||||||
|         for (int k = 0; k < len; k++) { |         for (int k = 0; k < len; k++) { | ||||||
|           blobValue.push_back(ptr[k]); |           blobValue.push_back(ptr[k]); | ||||||
|         } |         } | ||||||
| @@ -581,37 +566,29 @@ static int parquetFilter( | |||||||
|  |  | ||||||
|       std::string columnName = "rowid"; |       std::string columnName = "rowid"; | ||||||
|       if (indexInfo->aConstraint[i].iColumn >= 0) { |       if (indexInfo->aConstraint[i].iColumn >= 0) { | ||||||
|         columnName = cursor->getTable()->columnName(indexInfo->aConstraint[i].iColumn); |         columnName = | ||||||
|  |             cursor->getTable()->columnName(indexInfo->aConstraint[i].iColumn); | ||||||
|       } |       } | ||||||
|  |  | ||||||
|       RowGroupBitmap bitmap = RowGroupBitmap(cursor->getNumRowGroups()); |       RowGroupBitmap bitmap = RowGroupBitmap(cursor->getNumRowGroups()); | ||||||
|       Constraint dummy( |       Constraint dummy( | ||||||
|         bitmap, |           bitmap, indexInfo->aConstraint[i].iColumn, columnName, | ||||||
|         indexInfo->aConstraint[i].iColumn, |           constraintOperatorFromSqlite(indexInfo->aConstraint[i].op), type, | ||||||
|         columnName, |           intValue, doubleValue, blobValue); | ||||||
|         constraintOperatorFromSqlite(indexInfo->aConstraint[i].op), |  | ||||||
|         type, |  | ||||||
|         intValue, |  | ||||||
|         doubleValue, |  | ||||||
|         blobValue); |  | ||||||
|  |  | ||||||
|       std::vector<unsigned char> actual = getRowGroupsForClause(db, cursor->getTable()->getTableName(), dummy.describe()); |       std::vector<unsigned char> actual = getRowGroupsForClause( | ||||||
|  |           db, cursor->getTable()->getTableName(), dummy.describe()); | ||||||
|       if (actual.size() > 0) { |       if (actual.size() > 0) { | ||||||
|         // Initialize the estimate to be the actual -- eventually they'll converge |         // Initialize the estimate to be the actual -- eventually they'll | ||||||
|         // and we'll stop writing back to the db. |         // converge and we'll stop writing back to the db. | ||||||
|         std::vector<unsigned char> estimate = actual; |         std::vector<unsigned char> estimate = actual; | ||||||
|         bitmap = RowGroupBitmap(estimate, actual); |         bitmap = RowGroupBitmap(estimate, actual); | ||||||
|       } |       } | ||||||
|  |  | ||||||
|       Constraint constraint( |       Constraint constraint( | ||||||
|         bitmap, |           bitmap, indexInfo->aConstraint[i].iColumn, columnName, | ||||||
|         indexInfo->aConstraint[i].iColumn, |           constraintOperatorFromSqlite(indexInfo->aConstraint[i].op), type, | ||||||
|         columnName, |           intValue, doubleValue, blobValue); | ||||||
|         constraintOperatorFromSqlite(indexInfo->aConstraint[i].op), |  | ||||||
|         type, |  | ||||||
|         intValue, |  | ||||||
|         doubleValue, |  | ||||||
|         blobValue); |  | ||||||
|  |  | ||||||
|       constraints.push_back(constraint); |       constraints.push_back(constraint); | ||||||
|       j++; |       j++; | ||||||
| @@ -626,15 +603,12 @@ static int parquetFilter( | |||||||
| } | } | ||||||
|  |  | ||||||
| /* | /* | ||||||
| * We'll always indicate to SQLite that we prefer it to use an index so that it will |  * We'll always indicate to SQLite that we prefer it to use an index so that it | ||||||
| * pass additional context to xFilter, which we may or may not use. |  * will pass additional context to xFilter, which we may or may not use. | ||||||
|  * |  * | ||||||
|  * We copy the sqlite3_index_info structure, as is, into idxStr for later use. |  * We copy the sqlite3_index_info structure, as is, into idxStr for later use. | ||||||
|  */ |  */ | ||||||
| static int parquetBestIndex( | static int parquetBestIndex(sqlite3_vtab *tab, sqlite3_index_info *pIdxInfo) { | ||||||
|   sqlite3_vtab *tab, |  | ||||||
|   sqlite3_index_info *pIdxInfo |  | ||||||
| ){ |  | ||||||
|   try { |   try { | ||||||
|  |  | ||||||
| #ifdef DEBUG | #ifdef DEBUG | ||||||
| @@ -644,15 +618,16 @@ static int parquetBestIndex( | |||||||
|         (unsigned long long)(tv.tv_sec) * 1000 + |         (unsigned long long)(tv.tv_sec) * 1000 + | ||||||
|         (unsigned long long)(tv.tv_usec) / 1000; |         (unsigned long long)(tv.tv_usec) / 1000; | ||||||
|  |  | ||||||
|  |  | ||||||
|     ParquetTable *table = ((sqlite3_vtab_parquet *)tab)->table; |     ParquetTable *table = ((sqlite3_vtab_parquet *)tab)->table; | ||||||
|     printf("%llu xBestIndex: nConstraint=%d, nOrderBy=%d\n", millisecondsSinceEpoch, pIdxInfo->nConstraint, pIdxInfo->nOrderBy); |     printf("%llu xBestIndex: nConstraint=%d, nOrderBy=%d\n", | ||||||
|  |            millisecondsSinceEpoch, pIdxInfo->nConstraint, pIdxInfo->nOrderBy); | ||||||
|     debugConstraints(pIdxInfo, table, 0, NULL); |     debugConstraints(pIdxInfo, table, 0, NULL); | ||||||
| #endif | #endif | ||||||
|     // We traverse in rowid ascending order, so if they're asking for it to be ordered like that, |     // We traverse in rowid ascending order, so if they're asking for it to be | ||||||
|     // we can tell SQLite that it's guaranteed. This speeds up some DB viewer utilities that |     // ordered like that, we can tell SQLite that it's guaranteed. This speeds | ||||||
|     // use rowids for pagination. |     // up some DB viewer utilities that use rowids for pagination. | ||||||
|     if(pIdxInfo->nOrderBy == 1 && pIdxInfo->aOrderBy[0].iColumn == -1 && pIdxInfo->aOrderBy[0].desc == 0) |     if (pIdxInfo->nOrderBy == 1 && pIdxInfo->aOrderBy[0].iColumn == -1 && | ||||||
|  |         pIdxInfo->aOrderBy[0].desc == 0) | ||||||
|       pIdxInfo->orderByConsumed = 1; |       pIdxInfo->orderByConsumed = 1; | ||||||
|  |  | ||||||
|     if (pIdxInfo->nConstraint == 0) { |     if (pIdxInfo->nConstraint == 0) { | ||||||
| @@ -672,11 +647,14 @@ static int parquetBestIndex( | |||||||
|       } |       } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     size_t dupeSize = sizeof(sqlite3_index_info) + |     size_t dupeSize = | ||||||
|  |         sizeof(sqlite3_index_info) + | ||||||
|         // pIdxInfo->nConstraint * sizeof(sqlite3_index_constraint) + |         // pIdxInfo->nConstraint * sizeof(sqlite3_index_constraint) + | ||||||
|       pIdxInfo->nConstraint * sizeof(sqlite3_index_info::sqlite3_index_constraint) + |         pIdxInfo->nConstraint * | ||||||
|  |             sizeof(sqlite3_index_info::sqlite3_index_constraint) + | ||||||
|         pIdxInfo->nOrderBy * sizeof(sqlite3_index_info::sqlite3_index_orderby) + |         pIdxInfo->nOrderBy * sizeof(sqlite3_index_info::sqlite3_index_orderby) + | ||||||
|       pIdxInfo->nConstraint * sizeof(sqlite3_index_info::sqlite3_index_constraint_usage); |         pIdxInfo->nConstraint * | ||||||
|  |             sizeof(sqlite3_index_info::sqlite3_index_constraint_usage); | ||||||
|     sqlite3_index_info *dupe = (sqlite3_index_info *)sqlite3_malloc(dupeSize); |     sqlite3_index_info *dupe = (sqlite3_index_info *)sqlite3_malloc(dupeSize); | ||||||
|     pIdxInfo->idxStr = (char *)dupe; |     pIdxInfo->idxStr = (char *)dupe; | ||||||
|     pIdxInfo->needToFreeIdxStr = 1; |     pIdxInfo->needToFreeIdxStr = 1; | ||||||
| @@ -684,15 +662,20 @@ static int parquetBestIndex( | |||||||
|     memset(dupe, 0, dupeSize); |     memset(dupe, 0, dupeSize); | ||||||
|     memcpy(dupe, pIdxInfo, sizeof(sqlite3_index_info)); |     memcpy(dupe, pIdxInfo, sizeof(sqlite3_index_info)); | ||||||
|  |  | ||||||
|     dupe->aConstraint = (sqlite3_index_info::sqlite3_index_constraint*)((char*)dupe + sizeof(sqlite3_index_info)); |     dupe->aConstraint = (sqlite3_index_info::sqlite3_index_constraint | ||||||
|     dupe->aOrderBy = (sqlite3_index_info::sqlite3_index_orderby*)((char*)dupe + |                              *)((char *)dupe + sizeof(sqlite3_index_info)); | ||||||
|         sizeof(sqlite3_index_info) + |     dupe->aOrderBy = | ||||||
|         pIdxInfo->nConstraint * sizeof(sqlite3_index_info::sqlite3_index_constraint)); |         (sqlite3_index_info::sqlite3_index_orderby | ||||||
|     dupe->aConstraintUsage = (sqlite3_index_info::sqlite3_index_constraint_usage*)((char*)dupe + |              *)((char *)dupe + sizeof(sqlite3_index_info) + | ||||||
|         sizeof(sqlite3_index_info) + |                 pIdxInfo->nConstraint * | ||||||
|         pIdxInfo->nConstraint * sizeof(sqlite3_index_info::sqlite3_index_constraint) + |                     sizeof(sqlite3_index_info::sqlite3_index_constraint)); | ||||||
|         pIdxInfo->nOrderBy * sizeof(sqlite3_index_info::sqlite3_index_orderby)); |     dupe->aConstraintUsage = | ||||||
|  |         (sqlite3_index_info::sqlite3_index_constraint_usage | ||||||
|  |              *)((char *)dupe + sizeof(sqlite3_index_info) + | ||||||
|  |                 pIdxInfo->nConstraint * | ||||||
|  |                     sizeof(sqlite3_index_info::sqlite3_index_constraint) + | ||||||
|  |                 pIdxInfo->nOrderBy * | ||||||
|  |                     sizeof(sqlite3_index_info::sqlite3_index_orderby)); | ||||||
|  |  | ||||||
|     for (int i = 0; i < pIdxInfo->nConstraint; i++) { |     for (int i = 0; i < pIdxInfo->nConstraint; i++) { | ||||||
|       dupe->aConstraint[i].iColumn = pIdxInfo->aConstraint[i].iColumn; |       dupe->aConstraint[i].iColumn = pIdxInfo->aConstraint[i].iColumn; | ||||||
| @@ -700,7 +683,8 @@ static int parquetBestIndex( | |||||||
|       dupe->aConstraint[i].usable = pIdxInfo->aConstraint[i].usable; |       dupe->aConstraint[i].usable = pIdxInfo->aConstraint[i].usable; | ||||||
|       dupe->aConstraint[i].iTermOffset = pIdxInfo->aConstraint[i].iTermOffset; |       dupe->aConstraint[i].iTermOffset = pIdxInfo->aConstraint[i].iTermOffset; | ||||||
|  |  | ||||||
|       dupe->aConstraintUsage[i].argvIndex = pIdxInfo->aConstraintUsage[i].argvIndex; |       dupe->aConstraintUsage[i].argvIndex = | ||||||
|  |           pIdxInfo->aConstraintUsage[i].argvIndex; | ||||||
|       dupe->aConstraintUsage[i].omit = pIdxInfo->aConstraintUsage[i].omit; |       dupe->aConstraintUsage[i].omit = pIdxInfo->aConstraintUsage[i].omit; | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -717,7 +701,6 @@ static int parquetBestIndex( | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| static sqlite3_module ParquetModule = { | static sqlite3_module ParquetModule = { | ||||||
|     0,                 /* iVersion */ |     0,                 /* iVersion */ | ||||||
|     parquetCreate,     /* xCreate */ |     parquetCreate,     /* xCreate */ | ||||||
| @@ -747,11 +730,8 @@ static sqlite3_module ParquetModule = { | |||||||
|  * connection. |  * connection. | ||||||
|  */ |  */ | ||||||
| extern "C" { | extern "C" { | ||||||
|   int sqlite3_parquet_init( | int sqlite3_parquet_init(sqlite3 *db, char **pzErrMsg, | ||||||
|     sqlite3 *db,  |                          const sqlite3_api_routines *pApi) { | ||||||
|     char **pzErrMsg,  |  | ||||||
|     const sqlite3_api_routines *pApi |  | ||||||
|   ){ |  | ||||||
|   int rc; |   int rc; | ||||||
|   SQLITE_EXTENSION_INIT2(pApi); |   SQLITE_EXTENSION_INIT2(pApi); | ||||||
|   rc = sqlite3_create_module(db, "parquet", &ParquetModule, 0); |   rc = sqlite3_create_module(db, "parquet", &ParquetModule, 0); | ||||||
|   | |||||||
| @@ -5,7 +5,8 @@ ParquetCursor::ParquetCursor(ParquetTable* table): table(table) { | |||||||
|   reset(std::vector<Constraint>()); |   reset(std::vector<Constraint>()); | ||||||
| } | } | ||||||
|  |  | ||||||
| bool ParquetCursor::currentRowGroupSatisfiesRowIdFilter(Constraint& constraint) { | bool ParquetCursor::currentRowGroupSatisfiesRowIdFilter( | ||||||
|  |     Constraint &constraint) { | ||||||
|   if (constraint.type != Integer) |   if (constraint.type != Integer) | ||||||
|     return true; |     return true; | ||||||
|  |  | ||||||
| @@ -31,7 +32,9 @@ bool ParquetCursor::currentRowGroupSatisfiesRowIdFilter(Constraint& constraint) | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| bool ParquetCursor::currentRowGroupSatisfiesBlobFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> _stats) { | bool ParquetCursor::currentRowGroupSatisfiesBlobFilter( | ||||||
|  |     Constraint &constraint, | ||||||
|  |     std::shared_ptr<parquet::RowGroupStatistics> _stats) { | ||||||
|   if (!_stats->HasMinMax()) { |   if (!_stats->HasMinMax()) { | ||||||
|     return true; |     return true; | ||||||
|   } |   } | ||||||
| @@ -48,8 +51,10 @@ bool ParquetCursor::currentRowGroupSatisfiesBlobFilter(Constraint& constraint, s | |||||||
|   parquet::Type::type pqType = types[constraint.column]; |   parquet::Type::type pqType = types[constraint.column]; | ||||||
|  |  | ||||||
|   if (pqType == parquet::Type::BYTE_ARRAY) { |   if (pqType == parquet::Type::BYTE_ARRAY) { | ||||||
|     parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>* stats = |     parquet::TypedRowGroupStatistics< | ||||||
|       (parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>*)_stats.get(); |         parquet::DataType<parquet::Type::BYTE_ARRAY>> *stats = | ||||||
|  |         (parquet::TypedRowGroupStatistics< | ||||||
|  |             parquet::DataType<parquet::Type::BYTE_ARRAY>> *)_stats.get(); | ||||||
|  |  | ||||||
|     minPtr = stats->min().ptr; |     minPtr = stats->min().ptr; | ||||||
|     minLen = stats->min().len; |     minLen = stats->min().len; | ||||||
| @@ -64,8 +69,9 @@ bool ParquetCursor::currentRowGroupSatisfiesBlobFilter(Constraint& constraint, s | |||||||
|   } else { |   } else { | ||||||
|     // Should be impossible to get here |     // Should be impossible to get here | ||||||
|     std::ostringstream ss; |     std::ostringstream ss; | ||||||
|     ss << __FILE__ << ":" << __LINE__ << ": currentRowGroupSatisfiesBlobFilter called on unsupported type: " << |     ss << __FILE__ << ":" << __LINE__ | ||||||
|       parquet::TypeToString(pqType); |        << ": currentRowGroupSatisfiesBlobFilter called on unsupported type: " | ||||||
|  |        << parquet::TypeToString(pqType); | ||||||
|     throw std::invalid_argument(ss.str()); |     throw std::invalid_argument(ss.str()); | ||||||
|   } |   } | ||||||
|  |  | ||||||
| @@ -73,61 +79,45 @@ bool ParquetCursor::currentRowGroupSatisfiesBlobFilter(Constraint& constraint, s | |||||||
|  |  | ||||||
|   switch (constraint.op) { |   switch (constraint.op) { | ||||||
|   case Is: |   case Is: | ||||||
|     case Equal: |   case Equal: { | ||||||
|     { |     bool minEqual = | ||||||
|       bool minEqual = blob.size() == minLen && memcmp(&blob[0], minPtr, minLen) == 0; |         blob.size() == minLen && memcmp(&blob[0], minPtr, minLen) == 0; | ||||||
|       bool maxEqual = blob.size() == maxLen && memcmp(&blob[0], maxPtr, maxLen) == 0; |     bool maxEqual = | ||||||
|  |         blob.size() == maxLen && memcmp(&blob[0], maxPtr, maxLen) == 0; | ||||||
|  |  | ||||||
|     bool blobGteMinBlob = std::lexicographical_compare( |     bool blobGteMinBlob = std::lexicographical_compare( | ||||||
|           minPtr, |         minPtr, minPtr + minLen, &blob[0], &blob[0] + blob.size()); | ||||||
|           minPtr + minLen, |  | ||||||
|           &blob[0], |  | ||||||
|           &blob[0] + blob.size()); |  | ||||||
|  |  | ||||||
|     bool blobLtMaxBlob = std::lexicographical_compare( |     bool blobLtMaxBlob = std::lexicographical_compare( | ||||||
|           &blob[0], |         &blob[0], &blob[0] + blob.size(), maxPtr, maxPtr + maxLen); | ||||||
|           &blob[0] + blob.size(), |  | ||||||
|           maxPtr, |  | ||||||
|           maxPtr + maxLen); |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     return (minEqual || blobGteMinBlob) && (maxEqual || blobLtMaxBlob); |     return (minEqual || blobGteMinBlob) && (maxEqual || blobLtMaxBlob); | ||||||
|   } |   } | ||||||
|     case GreaterThanOrEqual: |   case GreaterThanOrEqual: { | ||||||
|     { |     bool maxEqual = | ||||||
|       bool maxEqual = blob.size() == maxLen && memcmp(&blob[0], maxPtr, maxLen) == 0; |         blob.size() == maxLen && memcmp(&blob[0], maxPtr, maxLen) == 0; | ||||||
|  |  | ||||||
|       return maxEqual || std::lexicographical_compare( |     return maxEqual || | ||||||
|           &blob[0], |            std::lexicographical_compare(&blob[0], &blob[0] + blob.size(), | ||||||
|           &blob[0] + blob.size(), |                                         maxPtr, maxPtr + maxLen); | ||||||
|           maxPtr, |  | ||||||
|           maxPtr + maxLen); |  | ||||||
|   } |   } | ||||||
|   case GreaterThan: |   case GreaterThan: | ||||||
|       return std::lexicographical_compare( |     return std::lexicographical_compare(&blob[0], &blob[0] + blob.size(), | ||||||
|           &blob[0], |                                         maxPtr, maxPtr + maxLen); | ||||||
|           &blob[0] + blob.size(), |  | ||||||
|           maxPtr, |  | ||||||
|           maxPtr + maxLen); |  | ||||||
|   case LessThan: |   case LessThan: | ||||||
|       return std::lexicographical_compare( |     return std::lexicographical_compare(minPtr, minPtr + minLen, &blob[0], | ||||||
|           minPtr, |  | ||||||
|           minPtr + minLen, |  | ||||||
|           &blob[0], |  | ||||||
|                                         &blob[0] + blob.size()); |                                         &blob[0] + blob.size()); | ||||||
|     case LessThanOrEqual: |   case LessThanOrEqual: { | ||||||
|     { |     bool minEqual = | ||||||
|       bool minEqual = blob.size() == minLen && memcmp(&blob[0], minPtr, minLen) == 0; |         blob.size() == minLen && memcmp(&blob[0], minPtr, minLen) == 0; | ||||||
|       return minEqual || std::lexicographical_compare( |     return minEqual || | ||||||
|           minPtr, |            std::lexicographical_compare(minPtr, minPtr + minLen, &blob[0], | ||||||
|           minPtr + minLen, |  | ||||||
|           &blob[0], |  | ||||||
|                                         &blob[0] + blob.size()); |                                         &blob[0] + blob.size()); | ||||||
|   } |   } | ||||||
|     case NotEqual: |   case NotEqual: { | ||||||
|     { |  | ||||||
|     // If min == max == blob, we can skip this. |     // If min == max == blob, we can skip this. | ||||||
|       bool blobMaxEqual = blob.size() == maxLen && memcmp(&blob[0], maxPtr, maxLen) == 0; |     bool blobMaxEqual = | ||||||
|  |         blob.size() == maxLen && memcmp(&blob[0], maxPtr, maxLen) == 0; | ||||||
|     bool minMaxEqual = minLen == maxLen && memcmp(minPtr, maxPtr, minLen) == 0; |     bool minMaxEqual = minLen == maxLen && memcmp(minPtr, maxPtr, minLen) == 0; | ||||||
|     return !(blobMaxEqual && minMaxEqual); |     return !(blobMaxEqual && minMaxEqual); | ||||||
|   } |   } | ||||||
| @@ -137,9 +127,12 @@ bool ParquetCursor::currentRowGroupSatisfiesBlobFilter(Constraint& constraint, s | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> _stats) { | bool ParquetCursor::currentRowGroupSatisfiesTextFilter( | ||||||
|   parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>* stats = |     Constraint &constraint, | ||||||
|     (parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>*)_stats.get(); |     std::shared_ptr<parquet::RowGroupStatistics> _stats) { | ||||||
|  |   parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>> | ||||||
|  |       *stats = (parquet::TypedRowGroupStatistics< | ||||||
|  |                 parquet::DataType<parquet::Type::BYTE_ARRAY>> *)_stats.get(); | ||||||
|  |  | ||||||
|   if (!stats->HasMinMax()) { |   if (!stats->HasMinMax()) { | ||||||
|     return true; |     return true; | ||||||
| @@ -154,7 +147,8 @@ bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint& constraint, s | |||||||
|   const parquet::ByteArray &max = stats->max(); |   const parquet::ByteArray &max = stats->max(); | ||||||
|   std::string minStr((const char *)min.ptr, min.len); |   std::string minStr((const char *)min.ptr, min.len); | ||||||
|   std::string maxStr((const char *)max.ptr, max.len); |   std::string maxStr((const char *)max.ptr, max.len); | ||||||
| //  printf("min=%s [%d], max=%s [%d], target=%s\n", minStr.data(), min.len, maxStr.data(), max.len, str.data()); |   //  printf("min=%s [%d], max=%s [%d], target=%s\n", minStr.data(), min.len, | ||||||
|  |   //  maxStr.data(), max.len, str.data()); | ||||||
|  |  | ||||||
|   switch (constraint.op) { |   switch (constraint.op) { | ||||||
|   case Is: |   case Is: | ||||||
| @@ -171,12 +165,12 @@ bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint& constraint, s | |||||||
|   case NotEqual: |   case NotEqual: | ||||||
|     // If min == max == str, we can skip this. |     // If min == max == str, we can skip this. | ||||||
|     return !(minStr == maxStr && str == minStr); |     return !(minStr == maxStr && str == minStr); | ||||||
|     case Like: |   case Like: { | ||||||
|     { |  | ||||||
|     const std::string &likeStringValue = constraint.likeStringValue; |     const std::string &likeStringValue = constraint.likeStringValue; | ||||||
|     std::string truncatedMin = minStr.substr(0, likeStringValue.size()); |     std::string truncatedMin = minStr.substr(0, likeStringValue.size()); | ||||||
|     std::string truncatedMax = maxStr.substr(0, likeStringValue.size()); |     std::string truncatedMax = maxStr.substr(0, likeStringValue.size()); | ||||||
|       return likeStringValue.empty() || (likeStringValue >= truncatedMin && likeStringValue <= truncatedMax); |     return likeStringValue.empty() || | ||||||
|  |            (likeStringValue >= truncatedMin && likeStringValue <= truncatedMax); | ||||||
|   } |   } | ||||||
|   case IsNot: |   case IsNot: | ||||||
|   default: |   default: | ||||||
| @@ -195,7 +189,9 @@ int64_t int96toMsSinceEpoch(const parquet::Int96& rv) { | |||||||
|   return nsSinceEpoch; |   return nsSinceEpoch; | ||||||
| } | } | ||||||
|  |  | ||||||
| bool ParquetCursor::currentRowGroupSatisfiesIntegerFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> _stats) { | bool ParquetCursor::currentRowGroupSatisfiesIntegerFilter( | ||||||
|  |     Constraint &constraint, | ||||||
|  |     std::shared_ptr<parquet::RowGroupStatistics> _stats) { | ||||||
|   if (!_stats->HasMinMax()) { |   if (!_stats->HasMinMax()) { | ||||||
|     return true; |     return true; | ||||||
|   } |   } | ||||||
| @@ -211,27 +207,31 @@ bool ParquetCursor::currentRowGroupSatisfiesIntegerFilter(Constraint& constraint | |||||||
|   parquet::Type::type pqType = types[column]; |   parquet::Type::type pqType = types[column]; | ||||||
|  |  | ||||||
|   if (pqType == parquet::Type::INT32) { |   if (pqType == parquet::Type::INT32) { | ||||||
|     parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::INT32>>* stats = |     parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::INT32>> | ||||||
|       (parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::INT32>>*)_stats.get(); |         *stats = (parquet::TypedRowGroupStatistics< | ||||||
|  |                   parquet::DataType<parquet::Type::INT32>> *)_stats.get(); | ||||||
|  |  | ||||||
|     min = stats->min(); |     min = stats->min(); | ||||||
|     max = stats->max(); |     max = stats->max(); | ||||||
|   } else if (pqType == parquet::Type::INT64) { |   } else if (pqType == parquet::Type::INT64) { | ||||||
|     parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::INT64>>* stats = |     parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::INT64>> | ||||||
|       (parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::INT64>>*)_stats.get(); |         *stats = (parquet::TypedRowGroupStatistics< | ||||||
|  |                   parquet::DataType<parquet::Type::INT64>> *)_stats.get(); | ||||||
|  |  | ||||||
|     min = stats->min(); |     min = stats->min(); | ||||||
|     max = stats->max(); |     max = stats->max(); | ||||||
|   } else if (pqType == parquet::Type::INT96) { |   } else if (pqType == parquet::Type::INT96) { | ||||||
|     parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::INT96>>* stats = |     parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::INT96>> | ||||||
|       (parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::INT96>>*)_stats.get(); |         *stats = (parquet::TypedRowGroupStatistics< | ||||||
|  |                   parquet::DataType<parquet::Type::INT96>> *)_stats.get(); | ||||||
|  |  | ||||||
|     min = int96toMsSinceEpoch(stats->min()); |     min = int96toMsSinceEpoch(stats->min()); | ||||||
|     max = int96toMsSinceEpoch(stats->max()); |     max = int96toMsSinceEpoch(stats->max()); | ||||||
|  |  | ||||||
|   } else if (pqType == parquet::Type::BOOLEAN) { |   } else if (pqType == parquet::Type::BOOLEAN) { | ||||||
|     parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BOOLEAN>>* stats = |     parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BOOLEAN>> | ||||||
|       (parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BOOLEAN>>*)_stats.get(); |         *stats = (parquet::TypedRowGroupStatistics< | ||||||
|  |                   parquet::DataType<parquet::Type::BOOLEAN>> *)_stats.get(); | ||||||
|  |  | ||||||
|     min = stats->min(); |     min = stats->min(); | ||||||
|     max = stats->max(); |     max = stats->max(); | ||||||
| @@ -240,13 +240,15 @@ bool ParquetCursor::currentRowGroupSatisfiesIntegerFilter(Constraint& constraint | |||||||
|     // Should be impossible to get here as we should have forbidden this at |     // Should be impossible to get here as we should have forbidden this at | ||||||
|     // CREATE time -- maybe file changed underneath us? |     // CREATE time -- maybe file changed underneath us? | ||||||
|     std::ostringstream ss; |     std::ostringstream ss; | ||||||
|     ss << __FILE__ << ":" << __LINE__ << ": currentRowGroupSatisfiesIntegerFilter called on unsupported type: " << |     ss << __FILE__ << ":" << __LINE__ | ||||||
|       parquet::TypeToString(pqType); |        << ": currentRowGroupSatisfiesIntegerFilter called on unsupported type: " | ||||||
|  |        << parquet::TypeToString(pqType); | ||||||
|     throw std::invalid_argument(ss.str()); |     throw std::invalid_argument(ss.str()); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   const int64_t value = constraint.intValue; |   const int64_t value = constraint.intValue; | ||||||
| //  printf("min=%s [%d], max=%s [%d], target=%s\n", minStr.data(), min.len, maxStr.data(), max.len, str.data()); |   //  printf("min=%s [%d], max=%s [%d], target=%s\n", minStr.data(), min.len, | ||||||
|  |   //  maxStr.data(), max.len, str.data()); | ||||||
|  |  | ||||||
|   switch (constraint.op) { |   switch (constraint.op) { | ||||||
|   case Is: |   case Is: | ||||||
| @@ -272,7 +274,9 @@ bool ParquetCursor::currentRowGroupSatisfiesIntegerFilter(Constraint& constraint | |||||||
|   return true; |   return true; | ||||||
| } | } | ||||||
|  |  | ||||||
| bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> _stats) { | bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter( | ||||||
|  |     Constraint &constraint, | ||||||
|  |     std::shared_ptr<parquet::RowGroupStatistics> _stats) { | ||||||
|   if (!_stats->HasMinMax()) { |   if (!_stats->HasMinMax()) { | ||||||
|     return true; |     return true; | ||||||
|   } |   } | ||||||
| @@ -288,14 +292,16 @@ bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter(Constraint& constraint, | |||||||
|   parquet::Type::type pqType = types[column]; |   parquet::Type::type pqType = types[column]; | ||||||
|  |  | ||||||
|   if (pqType == parquet::Type::DOUBLE) { |   if (pqType == parquet::Type::DOUBLE) { | ||||||
|     parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::DOUBLE>>* stats = |     parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::DOUBLE>> | ||||||
|       (parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::DOUBLE>>*)_stats.get(); |         *stats = (parquet::TypedRowGroupStatistics< | ||||||
|  |                   parquet::DataType<parquet::Type::DOUBLE>> *)_stats.get(); | ||||||
|  |  | ||||||
|     min = stats->min(); |     min = stats->min(); | ||||||
|     max = stats->max(); |     max = stats->max(); | ||||||
|   } else if (pqType == parquet::Type::FLOAT) { |   } else if (pqType == parquet::Type::FLOAT) { | ||||||
|     parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::FLOAT>>* stats = |     parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::FLOAT>> | ||||||
|       (parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::FLOAT>>*)_stats.get(); |         *stats = (parquet::TypedRowGroupStatistics< | ||||||
|  |                   parquet::DataType<parquet::Type::FLOAT>> *)_stats.get(); | ||||||
|  |  | ||||||
|     min = stats->min(); |     min = stats->min(); | ||||||
|     max = stats->max(); |     max = stats->max(); | ||||||
| @@ -303,13 +309,15 @@ bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter(Constraint& constraint, | |||||||
|     // Should be impossible to get here as we should have forbidden this at |     // Should be impossible to get here as we should have forbidden this at | ||||||
|     // CREATE time -- maybe file changed underneath us? |     // CREATE time -- maybe file changed underneath us? | ||||||
|     std::ostringstream ss; |     std::ostringstream ss; | ||||||
|     ss << __FILE__ << ":" << __LINE__ << ": currentRowGroupSatisfiesIntegerFilter called on unsupported type: " << |     ss << __FILE__ << ":" << __LINE__ | ||||||
|       parquet::TypeToString(pqType); |        << ": currentRowGroupSatisfiesIntegerFilter called on unsupported type: " | ||||||
|  |        << parquet::TypeToString(pqType); | ||||||
|     throw std::invalid_argument(ss.str()); |     throw std::invalid_argument(ss.str()); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   const double value = constraint.doubleValue; |   const double value = constraint.doubleValue; | ||||||
| //  printf("min=%s [%d], max=%s [%d], target=%s\n", minStr.data(), min.len, maxStr.data(), max.len, str.data()); |   //  printf("min=%s [%d], max=%s [%d], target=%s\n", minStr.data(), min.len, | ||||||
|  |   //  maxStr.data(), max.len, str.data()); | ||||||
|  |  | ||||||
|   switch (constraint.op) { |   switch (constraint.op) { | ||||||
|   case Is: |   case Is: | ||||||
| @@ -333,7 +341,6 @@ bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter(Constraint& constraint, | |||||||
|   } |   } | ||||||
|  |  | ||||||
|   return true; |   return true; | ||||||
|  |  | ||||||
| } | } | ||||||
|  |  | ||||||
| bool ParquetCursor::currentRowSatisfiesTextFilter(Constraint &constraint) { | bool ParquetCursor::currentRowSatisfiesTextFilter(Constraint &constraint) { | ||||||
| @@ -345,8 +352,7 @@ bool ParquetCursor::currentRowSatisfiesTextFilter(Constraint& constraint) { | |||||||
|  |  | ||||||
|   switch (constraint.op) { |   switch (constraint.op) { | ||||||
|   case Is: |   case Is: | ||||||
|     case Equal: |   case Equal: { | ||||||
|     { |  | ||||||
|     const std::vector<unsigned char> &blob = constraint.blobValue; |     const std::vector<unsigned char> &blob = constraint.blobValue; | ||||||
|  |  | ||||||
|     if (blob.size() != ba->len) |     if (blob.size() != ba->len) | ||||||
| @@ -354,8 +360,7 @@ bool ParquetCursor::currentRowSatisfiesTextFilter(Constraint& constraint) { | |||||||
|  |  | ||||||
|     return 0 == memcmp(&blob[0], ba->ptr, ba->len); |     return 0 == memcmp(&blob[0], ba->ptr, ba->len); | ||||||
|   } |   } | ||||||
|     case NotEqual: |   case NotEqual: { | ||||||
|     { |  | ||||||
|     const std::vector<unsigned char> &blob = constraint.blobValue; |     const std::vector<unsigned char> &blob = constraint.blobValue; | ||||||
|  |  | ||||||
|     if (blob.size() != ba->len) |     if (blob.size() != ba->len) | ||||||
| @@ -363,52 +368,39 @@ bool ParquetCursor::currentRowSatisfiesTextFilter(Constraint& constraint) { | |||||||
|  |  | ||||||
|     return 0 != memcmp(&blob[0], ba->ptr, ba->len); |     return 0 != memcmp(&blob[0], ba->ptr, ba->len); | ||||||
|   } |   } | ||||||
|     case GreaterThan: |   case GreaterThan: { | ||||||
|     { |  | ||||||
|     const std::vector<unsigned char> &blob = constraint.blobValue; |     const std::vector<unsigned char> &blob = constraint.blobValue; | ||||||
|  |  | ||||||
|       return std::lexicographical_compare( |     return std::lexicographical_compare(&blob[0], &blob[0] + blob.size(), | ||||||
|           &blob[0], |                                         ba->ptr, ba->ptr + ba->len); | ||||||
|           &blob[0] + blob.size(), |  | ||||||
|           ba->ptr, |  | ||||||
|           ba->ptr + ba->len); |  | ||||||
|   } |   } | ||||||
|     case GreaterThanOrEqual: |   case GreaterThanOrEqual: { | ||||||
|     { |  | ||||||
|     const std::vector<unsigned char> &blob = constraint.blobValue; |     const std::vector<unsigned char> &blob = constraint.blobValue; | ||||||
|  |  | ||||||
|       bool equal = blob.size() == ba->len && 0 == memcmp(&blob[0], ba->ptr, ba->len); |     bool equal = | ||||||
|  |         blob.size() == ba->len && 0 == memcmp(&blob[0], ba->ptr, ba->len); | ||||||
|  |  | ||||||
|       return equal || std::lexicographical_compare( |     return equal || | ||||||
|           &blob[0], |            std::lexicographical_compare(&blob[0], &blob[0] + blob.size(), | ||||||
|           &blob[0] + blob.size(), |                                         ba->ptr, ba->ptr + ba->len); | ||||||
|           ba->ptr, |  | ||||||
|           ba->ptr + ba->len); |  | ||||||
|   } |   } | ||||||
|     case LessThan: |   case LessThan: { | ||||||
|     { |  | ||||||
|     const std::vector<unsigned char> &blob = constraint.blobValue; |     const std::vector<unsigned char> &blob = constraint.blobValue; | ||||||
|  |  | ||||||
|       return std::lexicographical_compare( |     return std::lexicographical_compare(ba->ptr, ba->ptr + ba->len, &blob[0], | ||||||
|           ba->ptr, |  | ||||||
|           ba->ptr + ba->len, |  | ||||||
|           &blob[0], |  | ||||||
|                                         &blob[0] + blob.size()); |                                         &blob[0] + blob.size()); | ||||||
|   } |   } | ||||||
|     case LessThanOrEqual: |   case LessThanOrEqual: { | ||||||
|     { |  | ||||||
|     const std::vector<unsigned char> &blob = constraint.blobValue; |     const std::vector<unsigned char> &blob = constraint.blobValue; | ||||||
|  |  | ||||||
|       bool equal = blob.size() == ba->len && 0 == memcmp(&blob[0], ba->ptr, ba->len); |     bool equal = | ||||||
|  |         blob.size() == ba->len && 0 == memcmp(&blob[0], ba->ptr, ba->len); | ||||||
|  |  | ||||||
|       return equal || std::lexicographical_compare( |     return equal || | ||||||
|           ba->ptr, |            std::lexicographical_compare(ba->ptr, ba->ptr + ba->len, &blob[0], | ||||||
|           ba->ptr + ba->len, |  | ||||||
|           &blob[0], |  | ||||||
|                                         &blob[0] + blob.size()); |                                         &blob[0] + blob.size()); | ||||||
|   } |   } | ||||||
|     case Like: |   case Like: { | ||||||
|     { |  | ||||||
|     const std::string &likeStringValue = constraint.likeStringValue; |     const std::string &likeStringValue = constraint.likeStringValue; | ||||||
|     if (likeStringValue.size() > ba->len) |     if (likeStringValue.size() > ba->len) | ||||||
|       return false; |       return false; | ||||||
| @@ -441,13 +433,15 @@ bool ParquetCursor::currentRowSatisfiesIntegerFilter(Constraint& constraint) { | |||||||
|  |  | ||||||
|     if (pqType == parquet::Type::INT32 || pqType == parquet::Type::BOOLEAN) { |     if (pqType == parquet::Type::INT32 || pqType == parquet::Type::BOOLEAN) { | ||||||
|       value = getInt32(column); |       value = getInt32(column); | ||||||
|     } else if(pqType == parquet::Type::INT64 || pqType == parquet::Type::INT96) { |     } else if (pqType == parquet::Type::INT64 || | ||||||
|  |                pqType == parquet::Type::INT96) { | ||||||
|       value = getInt64(column); |       value = getInt64(column); | ||||||
|     } else { |     } else { | ||||||
|       // Should be impossible to get here |       // Should be impossible to get here | ||||||
|       std::ostringstream ss; |       std::ostringstream ss; | ||||||
|       ss << __FILE__ << ":" << __LINE__ << ": currentRowSatisfiesIntegerFilter called on unsupported type: " << |       ss << __FILE__ << ":" << __LINE__ | ||||||
|         parquet::TypeToString(pqType); |          << ": currentRowSatisfiesIntegerFilter called on unsupported type: " | ||||||
|  |          << parquet::TypeToString(pqType); | ||||||
|       throw std::invalid_argument(ss.str()); |       throw std::invalid_argument(ss.str()); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
| @@ -509,7 +503,6 @@ bool ParquetCursor::currentRowSatisfiesDoubleFilter(Constraint& constraint) { | |||||||
|   return true; |   return true; | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| // Return true if it is _possible_ that the current | // Return true if it is _possible_ that the current | ||||||
| // rowgroup satisfies the constraints. Only return false | // rowgroup satisfies the constraints. Only return false | ||||||
| // if it definitely does not. | // if it definitely does not. | ||||||
| @@ -525,7 +518,8 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() { | |||||||
|     if (column == -1) { |     if (column == -1) { | ||||||
|       rv = currentRowGroupSatisfiesRowIdFilter(constraints[i]); |       rv = currentRowGroupSatisfiesRowIdFilter(constraints[i]); | ||||||
|     } else { |     } else { | ||||||
|       std::unique_ptr<parquet::ColumnChunkMetaData> md = rowGroupMetadata->ColumnChunk(column); |       std::unique_ptr<parquet::ColumnChunkMetaData> md = | ||||||
|  |           rowGroupMetadata->ColumnChunk(column); | ||||||
|       if (md->is_stats_set()) { |       if (md->is_stats_set()) { | ||||||
|         std::shared_ptr<parquet::RowGroupStatistics> stats = md->statistics(); |         std::shared_ptr<parquet::RowGroupStatistics> stats = md->statistics(); | ||||||
|  |  | ||||||
| @@ -545,7 +539,8 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() { | |||||||
|         } else { |         } else { | ||||||
|           parquet::Type::type pqType = types[column]; |           parquet::Type::type pqType = types[column]; | ||||||
|  |  | ||||||
|           if(pqType == parquet::Type::BYTE_ARRAY && logicalTypes[column] == parquet::LogicalType::UTF8) { |           if (pqType == parquet::Type::BYTE_ARRAY && | ||||||
|  |               logicalTypes[column] == parquet::LogicalType::UTF8) { | ||||||
|             rv = currentRowGroupSatisfiesTextFilter(constraints[i], stats); |             rv = currentRowGroupSatisfiesTextFilter(constraints[i], stats); | ||||||
|           } else if (pqType == parquet::Type::BYTE_ARRAY) { |           } else if (pqType == parquet::Type::BYTE_ARRAY) { | ||||||
|             rv = currentRowGroupSatisfiesBlobFilter(constraints[i], stats); |             rv = currentRowGroupSatisfiesBlobFilter(constraints[i], stats); | ||||||
| @@ -554,7 +549,8 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() { | |||||||
|                      pqType == parquet::Type::INT96 || |                      pqType == parquet::Type::INT96 || | ||||||
|                      pqType == parquet::Type::BOOLEAN) { |                      pqType == parquet::Type::BOOLEAN) { | ||||||
|             rv = currentRowGroupSatisfiesIntegerFilter(constraints[i], stats); |             rv = currentRowGroupSatisfiesIntegerFilter(constraints[i], stats); | ||||||
|           } else if(pqType == parquet::Type::FLOAT || pqType == parquet::Type::DOUBLE) { |           } else if (pqType == parquet::Type::FLOAT || | ||||||
|  |                      pqType == parquet::Type::DOUBLE) { | ||||||
|             rv = currentRowGroupSatisfiesDoubleFilter(constraints[i], stats); |             rv = currentRowGroupSatisfiesDoubleFilter(constraints[i], stats); | ||||||
|           } |           } | ||||||
|         } |         } | ||||||
| @@ -570,15 +566,15 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() { | |||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
| //  printf("rowGroup %d %s\n", rowGroupId, overallRv ? "may satisfy" : "does not satisfy"); |   //  printf("rowGroup %d %s\n", rowGroupId, overallRv ? "may satisfy" : "does | ||||||
|  |   //  not satisfy"); | ||||||
|   return true; |   return true; | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| bool ParquetCursor::nextRowGroup() { | bool ParquetCursor::nextRowGroup() { | ||||||
| start: | start: | ||||||
|   // Ensure that rowId points at the start of this rowGroup (eg, in the case where |   // Ensure that rowId points at the start of this rowGroup (eg, in the case | ||||||
|   // we skipped an entire row group). |   // where we skipped an entire row group). | ||||||
|   rowId = rowGroupStartRowId + rowGroupSize; |   rowId = rowGroupStartRowId + rowGroupSize; | ||||||
|  |  | ||||||
|   if ((rowGroupId + 1) >= numRowGroups) { |   if ((rowGroupId + 1) >= numRowGroups) { | ||||||
| @@ -595,7 +591,6 @@ start: | |||||||
|     colByteArrayValues.push_back(parquet::ByteArray()); |     colByteArrayValues.push_back(parquet::ByteArray()); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |  | ||||||
|   rowGroupStartRowId = rowId; |   rowGroupStartRowId = rowId; | ||||||
|   rowGroupId++; |   rowGroupId++; | ||||||
|   rowGroupMetadata = reader->metadata()->RowGroup(rowGroupId); |   rowGroupMetadata = reader->metadata()->RowGroup(rowGroupId); | ||||||
| @@ -609,10 +604,12 @@ start: | |||||||
|   } |   } | ||||||
|  |  | ||||||
|   while (logicalTypes.size() < (unsigned int)rowGroupMetadata->num_columns()) { |   while (logicalTypes.size() < (unsigned int)rowGroupMetadata->num_columns()) { | ||||||
|     logicalTypes.push_back(rowGroupMetadata->schema()->Column(0)->logical_type()); |     logicalTypes.push_back( | ||||||
|  |         rowGroupMetadata->schema()->Column(0)->logical_type()); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   for(unsigned int i = 0; i < (unsigned int)rowGroupMetadata->num_columns(); i++) { |   for (unsigned int i = 0; i < (unsigned int)rowGroupMetadata->num_columns(); | ||||||
|  |        i++) { | ||||||
|     types[i] = rowGroupMetadata->schema()->Column(i)->physical_type(); |     types[i] = rowGroupMetadata->schema()->Column(i)->physical_type(); | ||||||
|     logicalTypes[i] = rowGroupMetadata->schema()->Column(i)->logical_type(); |     logicalTypes[i] = rowGroupMetadata->schema()->Column(i)->logical_type(); | ||||||
|   } |   } | ||||||
| @@ -629,7 +626,8 @@ start: | |||||||
|   // a row |   // a row | ||||||
|   for (unsigned int i = 0; i < constraints.size(); i++) { |   for (unsigned int i = 0; i < constraints.size(); i++) { | ||||||
|     if (rowGroupId > 0 && constraints[i].rowGroupId == rowGroupId - 1) { |     if (rowGroupId > 0 && constraints[i].rowGroupId == rowGroupId - 1) { | ||||||
|       constraints[i].bitmap.setActualMembership(rowGroupId - 1, constraints[i].hadRows); |       constraints[i].bitmap.setActualMembership(rowGroupId - 1, | ||||||
|  |                                                 constraints[i].hadRows); | ||||||
|     } |     } | ||||||
|     constraints[i].hadRows = false; |     constraints[i].hadRows = false; | ||||||
|   } |   } | ||||||
| @@ -668,12 +666,12 @@ bool ParquetCursor::currentRowSatisfiesFilter() { | |||||||
|         rv = currentRowSatisfiesTextFilter(constraints[i]); |         rv = currentRowSatisfiesTextFilter(constraints[i]); | ||||||
|       } else { |       } else { | ||||||
|         parquet::Type::type pqType = types[column]; |         parquet::Type::type pqType = types[column]; | ||||||
|         if(pqType == parquet::Type::INT32 || |         if (pqType == parquet::Type::INT32 || pqType == parquet::Type::INT64 || | ||||||
|            pqType == parquet::Type::INT64 || |  | ||||||
|             pqType == parquet::Type::INT96 || |             pqType == parquet::Type::INT96 || | ||||||
|             pqType == parquet::Type::BOOLEAN) { |             pqType == parquet::Type::BOOLEAN) { | ||||||
|           rv = currentRowSatisfiesIntegerFilter(constraints[i]); |           rv = currentRowSatisfiesIntegerFilter(constraints[i]); | ||||||
|         } else if(pqType == parquet::Type::FLOAT || pqType == parquet::Type::DOUBLE) { |         } else if (pqType == parquet::Type::FLOAT || | ||||||
|  |                    pqType == parquet::Type::DOUBLE) { | ||||||
|           rv = currentRowSatisfiesDoubleFilter(constraints[i]); |           rv = currentRowSatisfiesDoubleFilter(constraints[i]); | ||||||
|         } |         } | ||||||
|       } |       } | ||||||
| @@ -698,8 +696,8 @@ start: | |||||||
|       rowId = numRows + 1; |       rowId = numRows + 1; | ||||||
|       return; |       return; | ||||||
|     } else { |     } else { | ||||||
|       // After a successful nextRowGroup, rowId is pointing at the current row. Make it |       // After a successful nextRowGroup, rowId is pointing at the current row. | ||||||
|       // point before so the rest of the logic works out. |       // Make it point before so the rest of the logic works out. | ||||||
|       rowId--; |       rowId--; | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
| @@ -710,13 +708,9 @@ start: | |||||||
|     goto start; |     goto start; | ||||||
| } | } | ||||||
|  |  | ||||||
| int ParquetCursor::getRowId() { | int ParquetCursor::getRowId() { return rowId; } | ||||||
|   return rowId; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| bool ParquetCursor::eof() { | bool ParquetCursor::eof() { return rowId > numRows; } | ||||||
|   return rowId > numRows; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| void ParquetCursor::ensureColumn(int col) { | void ParquetCursor::ensureColumn(int col) { | ||||||
|   // -1 signals rowid, which is trivially available |   // -1 signals rowid, which is trivially available | ||||||
| @@ -737,58 +731,53 @@ void ParquetCursor::ensureColumn(int col) { | |||||||
|     bool wasNull = false; |     bool wasNull = false; | ||||||
|     while (colRows[col] + 1 < rowId) { |     while (colRows[col] + 1 < rowId) { | ||||||
|       switch (types[col]) { |       switch (types[col]) { | ||||||
|         case parquet::Type::INT32: |       case parquet::Type::INT32: { | ||||||
|         { |  | ||||||
|         parquet::Int32Scanner *s = (parquet::Int32Scanner *)scanners[col].get(); |         parquet::Int32Scanner *s = (parquet::Int32Scanner *)scanners[col].get(); | ||||||
|         int rv = 0; |         int rv = 0; | ||||||
|         s->NextValue(&rv, &wasNull); |         s->NextValue(&rv, &wasNull); | ||||||
|         break; |         break; | ||||||
|       } |       } | ||||||
|         case parquet::Type::FLOAT: |       case parquet::Type::FLOAT: { | ||||||
|         { |  | ||||||
|         parquet::FloatScanner *s = (parquet::FloatScanner *)scanners[col].get(); |         parquet::FloatScanner *s = (parquet::FloatScanner *)scanners[col].get(); | ||||||
|         float rv = 0; |         float rv = 0; | ||||||
|         s->NextValue(&rv, &wasNull); |         s->NextValue(&rv, &wasNull); | ||||||
|         break; |         break; | ||||||
|       } |       } | ||||||
|         case parquet::Type::DOUBLE: |       case parquet::Type::DOUBLE: { | ||||||
|         { |         parquet::DoubleScanner *s = | ||||||
|           parquet::DoubleScanner* s = (parquet::DoubleScanner*)scanners[col].get(); |             (parquet::DoubleScanner *)scanners[col].get(); | ||||||
|         double rv = 0; |         double rv = 0; | ||||||
|         s->NextValue(&rv, &wasNull); |         s->NextValue(&rv, &wasNull); | ||||||
|         break; |         break; | ||||||
|       } |       } | ||||||
|         case parquet::Type::BYTE_ARRAY: |       case parquet::Type::BYTE_ARRAY: { | ||||||
|         { |         parquet::ByteArrayScanner *s = | ||||||
|           parquet::ByteArrayScanner* s = (parquet::ByteArrayScanner*)scanners[col].get(); |             (parquet::ByteArrayScanner *)scanners[col].get(); | ||||||
|         parquet::ByteArray ba; |         parquet::ByteArray ba; | ||||||
|         s->NextValue(&ba, &wasNull); |         s->NextValue(&ba, &wasNull); | ||||||
|         break; |         break; | ||||||
|       } |       } | ||||||
|         case parquet::Type::INT96: |       case parquet::Type::INT96: { | ||||||
|         { |  | ||||||
|         parquet::Int96Scanner *s = (parquet::Int96Scanner *)scanners[col].get(); |         parquet::Int96Scanner *s = (parquet::Int96Scanner *)scanners[col].get(); | ||||||
|         parquet::Int96 rv; |         parquet::Int96 rv; | ||||||
|         s->NextValue(&rv, &wasNull); |         s->NextValue(&rv, &wasNull); | ||||||
|         break; |         break; | ||||||
|       } |       } | ||||||
|         case parquet::Type::INT64: |       case parquet::Type::INT64: { | ||||||
|         { |  | ||||||
|         parquet::Int64Scanner *s = (parquet::Int64Scanner *)scanners[col].get(); |         parquet::Int64Scanner *s = (parquet::Int64Scanner *)scanners[col].get(); | ||||||
|         long rv = 0; |         long rv = 0; | ||||||
|         s->NextValue(&rv, &wasNull); |         s->NextValue(&rv, &wasNull); | ||||||
|         break; |         break; | ||||||
|       } |       } | ||||||
|         case parquet::Type::BOOLEAN: |       case parquet::Type::BOOLEAN: { | ||||||
|         { |  | ||||||
|         parquet::BoolScanner *s = (parquet::BoolScanner *)scanners[col].get(); |         parquet::BoolScanner *s = (parquet::BoolScanner *)scanners[col].get(); | ||||||
|         bool rv = false; |         bool rv = false; | ||||||
|         s->NextValue(&rv, &wasNull); |         s->NextValue(&rv, &wasNull); | ||||||
|         break; |         break; | ||||||
|       } |       } | ||||||
|         case parquet::Type::FIXED_LEN_BYTE_ARRAY: |       case parquet::Type::FIXED_LEN_BYTE_ARRAY: { | ||||||
|         { |         parquet::FixedLenByteArrayScanner *s = | ||||||
|           parquet::FixedLenByteArrayScanner* s = (parquet::FixedLenByteArrayScanner*)scanners[col].get(); |             (parquet::FixedLenByteArrayScanner *)scanners[col].get(); | ||||||
|         parquet::FixedLenByteArray flba; |         parquet::FixedLenByteArray flba; | ||||||
|         s->NextValue(&flba, &wasNull); |         s->NextValue(&flba, &wasNull); | ||||||
|         break; |         break; | ||||||
| @@ -797,11 +786,10 @@ void ParquetCursor::ensureColumn(int col) { | |||||||
|         // Should be impossible to get here as we should have forbidden this at |         // Should be impossible to get here as we should have forbidden this at | ||||||
|         // CREATE time -- maybe file changed underneath us? |         // CREATE time -- maybe file changed underneath us? | ||||||
|         std::ostringstream ss; |         std::ostringstream ss; | ||||||
|           ss << __FILE__ << ":" << __LINE__ << ": column " << col << " has unsupported type: " << |         ss << __FILE__ << ":" << __LINE__ << ": column " << col | ||||||
|             parquet::TypeToString(types[col]); |            << " has unsupported type: " << parquet::TypeToString(types[col]); | ||||||
|         throw std::invalid_argument(ss.str()); |         throw std::invalid_argument(ss.str()); | ||||||
|         break; |         break; | ||||||
|  |  | ||||||
|       } |       } | ||||||
|       colRows[col]++; |       colRows[col]++; | ||||||
|     } |     } | ||||||
| @@ -811,39 +799,36 @@ void ParquetCursor::ensureColumn(int col) { | |||||||
|  |  | ||||||
|     bool hadValue = false; |     bool hadValue = false; | ||||||
|     switch (types[col]) { |     switch (types[col]) { | ||||||
|       case parquet::Type::INT32: |     case parquet::Type::INT32: { | ||||||
|       { |  | ||||||
|       parquet::Int32Scanner *s = (parquet::Int32Scanner *)scanners[col].get(); |       parquet::Int32Scanner *s = (parquet::Int32Scanner *)scanners[col].get(); | ||||||
|       int rv = 0; |       int rv = 0; | ||||||
|       hadValue = s->NextValue(&rv, &wasNull); |       hadValue = s->NextValue(&rv, &wasNull); | ||||||
|       colIntValues[col] = rv; |       colIntValues[col] = rv; | ||||||
|       break; |       break; | ||||||
|     } |     } | ||||||
|       case parquet::Type::FLOAT: |     case parquet::Type::FLOAT: { | ||||||
|       { |  | ||||||
|       parquet::FloatScanner *s = (parquet::FloatScanner *)scanners[col].get(); |       parquet::FloatScanner *s = (parquet::FloatScanner *)scanners[col].get(); | ||||||
|       float rv = 0; |       float rv = 0; | ||||||
|       hadValue = s->NextValue(&rv, &wasNull); |       hadValue = s->NextValue(&rv, &wasNull); | ||||||
|       colDoubleValues[col] = rv; |       colDoubleValues[col] = rv; | ||||||
|       break; |       break; | ||||||
|     } |     } | ||||||
|       case parquet::Type::DOUBLE: |     case parquet::Type::DOUBLE: { | ||||||
|       { |  | ||||||
|       parquet::DoubleScanner *s = (parquet::DoubleScanner *)scanners[col].get(); |       parquet::DoubleScanner *s = (parquet::DoubleScanner *)scanners[col].get(); | ||||||
|       double rv = 0; |       double rv = 0; | ||||||
|       hadValue = s->NextValue(&rv, &wasNull); |       hadValue = s->NextValue(&rv, &wasNull); | ||||||
|       colDoubleValues[col] = rv; |       colDoubleValues[col] = rv; | ||||||
|       break; |       break; | ||||||
|     } |     } | ||||||
|       case parquet::Type::BYTE_ARRAY: |     case parquet::Type::BYTE_ARRAY: { | ||||||
|       { |       parquet::ByteArrayScanner *s = | ||||||
|         parquet::ByteArrayScanner* s = (parquet::ByteArrayScanner*)scanners[col].get(); |           (parquet::ByteArrayScanner *)scanners[col].get(); | ||||||
|       hadValue = s->NextValue(&colByteArrayValues[col], &wasNull); |       hadValue = s->NextValue(&colByteArrayValues[col], &wasNull); | ||||||
|       break; |       break; | ||||||
|     } |     } | ||||||
|       case parquet::Type::INT96: |     case parquet::Type::INT96: { | ||||||
|       { |       // INT96 tracks a date with nanosecond precision, convert to ms since | ||||||
|         // INT96 tracks a date with nanosecond precision, convert to ms since epoch. |       // epoch. | ||||||
|       // ...see https://github.com/apache/parquet-format/pull/49 for more |       // ...see https://github.com/apache/parquet-format/pull/49 for more | ||||||
|       // |       // | ||||||
|       // First 8 bytes: nanoseconds into the day |       // First 8 bytes: nanoseconds into the day | ||||||
| @@ -856,8 +841,7 @@ void ParquetCursor::ensureColumn(int col) { | |||||||
|       colIntValues[col] = int96toMsSinceEpoch(rv); |       colIntValues[col] = int96toMsSinceEpoch(rv); | ||||||
|       break; |       break; | ||||||
|     } |     } | ||||||
|       case parquet::Type::INT64: |     case parquet::Type::INT64: { | ||||||
|       { |  | ||||||
|       parquet::Int64Scanner *s = (parquet::Int64Scanner *)scanners[col].get(); |       parquet::Int64Scanner *s = (parquet::Int64Scanner *)scanners[col].get(); | ||||||
|       long rv = 0; |       long rv = 0; | ||||||
|       hadValue = s->NextValue(&rv, &wasNull); |       hadValue = s->NextValue(&rv, &wasNull); | ||||||
| @@ -865,30 +849,30 @@ void ParquetCursor::ensureColumn(int col) { | |||||||
|       break; |       break; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|       case parquet::Type::BOOLEAN: |     case parquet::Type::BOOLEAN: { | ||||||
|       { |  | ||||||
|       parquet::BoolScanner *s = (parquet::BoolScanner *)scanners[col].get(); |       parquet::BoolScanner *s = (parquet::BoolScanner *)scanners[col].get(); | ||||||
|       bool rv = false; |       bool rv = false; | ||||||
|       hadValue = s->NextValue(&rv, &wasNull); |       hadValue = s->NextValue(&rv, &wasNull); | ||||||
|       colIntValues[col] = rv ? 1 : 0; |       colIntValues[col] = rv ? 1 : 0; | ||||||
|       break; |       break; | ||||||
|     } |     } | ||||||
|       case parquet::Type::FIXED_LEN_BYTE_ARRAY: |     case parquet::Type::FIXED_LEN_BYTE_ARRAY: { | ||||||
|       { |       parquet::FixedLenByteArrayScanner *s = | ||||||
|         parquet::FixedLenByteArrayScanner* s = (parquet::FixedLenByteArrayScanner*)scanners[col].get(); |           (parquet::FixedLenByteArrayScanner *)scanners[col].get(); | ||||||
|       parquet::FixedLenByteArray flba; |       parquet::FixedLenByteArray flba; | ||||||
|       hadValue = s->NextValue(&flba, &wasNull); |       hadValue = s->NextValue(&flba, &wasNull); | ||||||
|       colByteArrayValues[col].ptr = flba.ptr; |       colByteArrayValues[col].ptr = flba.ptr; | ||||||
|       // TODO: cache this |       // TODO: cache this | ||||||
|         colByteArrayValues[col].len = rowGroupMetadata->schema()->Column(col)->type_length(); |       colByteArrayValues[col].len = | ||||||
|  |           rowGroupMetadata->schema()->Column(col)->type_length(); | ||||||
|       break; |       break; | ||||||
|     } |     } | ||||||
|     default: |     default: | ||||||
|       // Should be impossible to get here as we should have forbidden this at |       // Should be impossible to get here as we should have forbidden this at | ||||||
|       // CREATE time -- maybe file changed underneath us? |       // CREATE time -- maybe file changed underneath us? | ||||||
|       std::ostringstream ss; |       std::ostringstream ss; | ||||||
|         ss << __FILE__ << ":" << __LINE__ << ": column " << col << " has unsupported type: " << |       ss << __FILE__ << ":" << __LINE__ << ": column " << col | ||||||
|           parquet::TypeToString(types[col]); |          << " has unsupported type: " << parquet::TypeToString(types[col]); | ||||||
|       throw std::invalid_argument(ss.str()); |       throw std::invalid_argument(ss.str()); | ||||||
|       break; |       break; | ||||||
|     } |     } | ||||||
| @@ -908,17 +892,11 @@ bool ParquetCursor::isNull(int col) { | |||||||
|   return colNulls[col]; |   return colNulls[col]; | ||||||
| } | } | ||||||
|  |  | ||||||
| int ParquetCursor::getInt32(int col) { | int ParquetCursor::getInt32(int col) { return colIntValues[col]; } | ||||||
|   return colIntValues[col]; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| long ParquetCursor::getInt64(int col) { | long ParquetCursor::getInt64(int col) { return colIntValues[col]; } | ||||||
|   return colIntValues[col]; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| double ParquetCursor::getDouble(int col) { | double ParquetCursor::getDouble(int col) { return colDoubleValues[col]; } | ||||||
|   return colDoubleValues[col]; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| parquet::ByteArray *ParquetCursor::getByteArray(int col) { | parquet::ByteArray *ParquetCursor::getByteArray(int col) { | ||||||
|   return &colByteArrayValues[col]; |   return &colByteArrayValues[col]; | ||||||
| @@ -942,12 +920,10 @@ void ParquetCursor::reset(std::vector<Constraint> constraints) { | |||||||
|   close(); |   close(); | ||||||
|   this->constraints = constraints; |   this->constraints = constraints; | ||||||
|   rowId = 0; |   rowId = 0; | ||||||
|   // TODO: consider having a long lived handle in ParquetTable that can be borrowed |   // TODO: consider having a long lived handle in ParquetTable that can be | ||||||
|   // without incurring the cost of opening the file from scratch twice |   // borrowed without incurring the cost of opening the file from scratch twice | ||||||
|   reader = parquet::ParquetFileReader::OpenFile( |   reader = parquet::ParquetFileReader::OpenFile( | ||||||
|       table->getFile().data(), |       table->getFile().data(), true, parquet::default_reader_properties(), | ||||||
|       true, |  | ||||||
|       parquet::default_reader_properties(), |  | ||||||
|       table->getMetadata()); |       table->getMetadata()); | ||||||
|  |  | ||||||
|   rowGroupId = -1; |   rowGroupId = -1; | ||||||
| @@ -964,7 +940,9 @@ void ParquetCursor::reset(std::vector<Constraint> constraints) { | |||||||
| ParquetTable *ParquetCursor::getTable() const { return table; } | ParquetTable *ParquetCursor::getTable() const { return table; } | ||||||
|  |  | ||||||
| unsigned int ParquetCursor::getNumRowGroups() const { return numRowGroups; } | unsigned int ParquetCursor::getNumRowGroups() const { return numRowGroups; } | ||||||
| unsigned int ParquetCursor::getNumConstraints() const { return constraints.size(); } | unsigned int ParquetCursor::getNumConstraints() const { | ||||||
| const Constraint& ParquetCursor::getConstraint(unsigned int i) const { return constraints[i]; } |   return constraints.size(); | ||||||
|  | } | ||||||
|  | const Constraint &ParquetCursor::getConstraint(unsigned int i) const { | ||||||
|  |   return constraints[i]; | ||||||
|  | } | ||||||
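The ensureColumn hunk above converts Parquet INT96 timestamps via int96toMsSinceEpoch, whose body is elided from this diff. As the inline comment notes, the value packs nanoseconds-into-day in the first 8 bytes and a Julian day number in the last 4. A minimal sketch of that arithmetic follows; the function name int96ToMs, the raw 12-byte input, and the little-endian byte handling are assumptions for illustration, not code from this commit.

#include <cstdint>
#include <cstring>

// Sketch only: convert a Parquet INT96 timestamp to milliseconds since the
// Unix epoch. Bytes 0-7 hold nanoseconds into the day, bytes 8-11 the Julian
// day number (layout per the parquet-format discussion linked above).
int64_t int96ToMs(const uint8_t bytes[12]) {
  int64_t nanosIntoDay = 0;
  uint32_t julianDay = 0;
  std::memcpy(&nanosIntoDay, bytes, 8);  // assumes little-endian storage
  std::memcpy(&julianDay, bytes + 8, 4);
  // 2440588 is the Julian day number of 1970-01-01, the Unix epoch.
  const int64_t daysSinceEpoch = static_cast<int64_t>(julianDay) - 2440588;
  return daysSinceEpoch * 86400000LL + nanosIntoDay / 1000000LL;
}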
|   | |||||||
| @@ -1,9 +1,9 @@ | |||||||
| #ifndef PARQUET_CURSOR_H | #ifndef PARQUET_CURSOR_H | ||||||
| #define PARQUET_CURSOR_H | #define PARQUET_CURSOR_H | ||||||
|  |  | ||||||
|  | #include "parquet/api/reader.h" | ||||||
| #include "parquet_filter.h" | #include "parquet_filter.h" | ||||||
| #include "parquet_table.h" | #include "parquet_table.h" | ||||||
| #include "parquet/api/reader.h" |  | ||||||
|  |  | ||||||
| class ParquetCursor { | class ParquetCursor { | ||||||
|  |  | ||||||
| @@ -36,16 +36,23 @@ class ParquetCursor { | |||||||
|   bool currentRowSatisfiesFilter(); |   bool currentRowSatisfiesFilter(); | ||||||
|   bool currentRowGroupSatisfiesFilter(); |   bool currentRowGroupSatisfiesFilter(); | ||||||
|   bool currentRowGroupSatisfiesRowIdFilter(Constraint &constraint); |   bool currentRowGroupSatisfiesRowIdFilter(Constraint &constraint); | ||||||
|   bool currentRowGroupSatisfiesTextFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats); |   bool currentRowGroupSatisfiesTextFilter( | ||||||
|   bool currentRowGroupSatisfiesBlobFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats); |       Constraint &constraint, | ||||||
|   bool currentRowGroupSatisfiesIntegerFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats); |       std::shared_ptr<parquet::RowGroupStatistics> stats); | ||||||
|   bool currentRowGroupSatisfiesDoubleFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats); |   bool currentRowGroupSatisfiesBlobFilter( | ||||||
|  |       Constraint &constraint, | ||||||
|  |       std::shared_ptr<parquet::RowGroupStatistics> stats); | ||||||
|  |   bool currentRowGroupSatisfiesIntegerFilter( | ||||||
|  |       Constraint &constraint, | ||||||
|  |       std::shared_ptr<parquet::RowGroupStatistics> stats); | ||||||
|  |   bool currentRowGroupSatisfiesDoubleFilter( | ||||||
|  |       Constraint &constraint, | ||||||
|  |       std::shared_ptr<parquet::RowGroupStatistics> stats); | ||||||
|  |  | ||||||
|   bool currentRowSatisfiesTextFilter(Constraint &constraint); |   bool currentRowSatisfiesTextFilter(Constraint &constraint); | ||||||
|   bool currentRowSatisfiesIntegerFilter(Constraint &constraint); |   bool currentRowSatisfiesIntegerFilter(Constraint &constraint); | ||||||
|   bool currentRowSatisfiesDoubleFilter(Constraint &constraint); |   bool currentRowSatisfiesDoubleFilter(Constraint &constraint); | ||||||
|  |  | ||||||
|  |  | ||||||
| public: | public: | ||||||
|   ParquetCursor(ParquetTable *table); |   ParquetCursor(ParquetTable *table); | ||||||
|   int getRowId(); |   int getRowId(); | ||||||
| @@ -70,4 +77,3 @@ public: | |||||||
| }; | }; | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,23 +1,12 @@ | |||||||
| #include "parquet_filter.h" | #include "parquet_filter.h" | ||||||
|  |  | ||||||
| Constraint::Constraint( | Constraint::Constraint(RowGroupBitmap bitmap, int column, | ||||||
|   RowGroupBitmap bitmap, |                        std::string columnName, ConstraintOperator op, | ||||||
|   int column, |                        ValueType type, int64_t intValue, double doubleValue, | ||||||
|   std::string columnName, |                        std::vector<unsigned char> blobValue) | ||||||
|   ConstraintOperator op, |     : bitmap(bitmap), column(column), columnName(columnName), op(op), | ||||||
|   ValueType type, |       type(type), intValue(intValue), doubleValue(doubleValue), | ||||||
|   int64_t intValue, |       blobValue(blobValue), hadRows(false) { | ||||||
|   double doubleValue, |  | ||||||
|   std::vector<unsigned char> blobValue |  | ||||||
| ): bitmap(bitmap), |  | ||||||
|    column(column), |  | ||||||
|    columnName(columnName), |  | ||||||
|    op(op), |  | ||||||
|    type(type), |  | ||||||
|    intValue(intValue), |  | ||||||
|    doubleValue(doubleValue), |  | ||||||
|    blobValue(blobValue), |  | ||||||
|    hadRows(false) { |  | ||||||
|   RowGroupBitmap bm = bitmap; |   RowGroupBitmap bm = bitmap; | ||||||
|   this->bitmap = bm; |   this->bitmap = bm; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,9 +1,9 @@ | |||||||
| #ifndef PARQUET_FILTER_H | #ifndef PARQUET_FILTER_H | ||||||
| #define PARQUET_FILTER_H | #define PARQUET_FILTER_H | ||||||
|  |  | ||||||
| #include <vector> |  | ||||||
| #include <string> |  | ||||||
| #include <cstdint> | #include <cstdint> | ||||||
|  | #include <string> | ||||||
|  | #include <vector> | ||||||
|  |  | ||||||
| enum ConstraintOperator { | enum ConstraintOperator { | ||||||
|   Equal, |   Equal, | ||||||
| @@ -20,16 +20,11 @@ enum ConstraintOperator { | |||||||
|   Is |   Is | ||||||
| }; | }; | ||||||
|  |  | ||||||
| enum ValueType { | enum ValueType { Null, Integer, Double, Blob, Text }; | ||||||
|   Null, |  | ||||||
|   Integer, |  | ||||||
|   Double, |  | ||||||
|   Blob, |  | ||||||
|   Text |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| class RowGroupBitmap { | class RowGroupBitmap { | ||||||
|   void setBit(std::vector<unsigned char>& membership, unsigned int rowGroup, bool isSet) { |   void setBit(std::vector<unsigned char> &membership, unsigned int rowGroup, | ||||||
|  |               bool isSet) { | ||||||
|     int byte = rowGroup / 8; |     int byte = rowGroup / 8; | ||||||
|     int offset = rowGroup % 8; |     int offset = rowGroup % 8; | ||||||
|     unsigned char c = membership[byte]; |     unsigned char c = membership[byte]; | ||||||
| @@ -51,12 +46,10 @@ public: | |||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   RowGroupBitmap( |   RowGroupBitmap(std::vector<unsigned char> estimatedMembership, | ||||||
|       std::vector<unsigned char> estimatedMembership, |                  std::vector<unsigned char> actualMembership) | ||||||
|       std::vector<unsigned char> actualMembership) : |       : estimatedMembership(estimatedMembership), | ||||||
|     estimatedMembership(estimatedMembership), |         actualMembership(actualMembership) {} | ||||||
|     actualMembership(actualMembership) { |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   std::vector<unsigned char> estimatedMembership; |   std::vector<unsigned char> estimatedMembership; | ||||||
|   std::vector<unsigned char> actualMembership; |   std::vector<unsigned char> actualMembership; | ||||||
| @@ -80,17 +73,11 @@ public: | |||||||
|  |  | ||||||
| class Constraint { | class Constraint { | ||||||
| public: | public: | ||||||
|   // Kind of a messy constructor function, but it's just for internal use, so whatever. |   // Kind of a messy constructor function, but it's just for internal use, so | ||||||
|   Constraint( |   // whatever. | ||||||
|     RowGroupBitmap bitmap, |   Constraint(RowGroupBitmap bitmap, int column, std::string columnName, | ||||||
|     int column, |              ConstraintOperator op, ValueType type, int64_t intValue, | ||||||
|     std::string columnName, |              double doubleValue, std::vector<unsigned char> blobValue); | ||||||
|     ConstraintOperator op, |  | ||||||
|     ValueType type, |  | ||||||
|     int64_t intValue, |  | ||||||
|     double doubleValue, |  | ||||||
|     std::vector<unsigned char> blobValue |  | ||||||
|   ); |  | ||||||
|  |  | ||||||
|   RowGroupBitmap bitmap; |   RowGroupBitmap bitmap; | ||||||
|   int column; // underlying column in the query |   int column; // underlying column in the query | ||||||
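RowGroupBitmap above packs one membership bit per row group: setBit addresses byte rowGroup / 8 and bit rowGroup % 8 within the estimated/actual membership vectors. The matching read side is not visible in this diff, so the sketch below only illustrates that addressing scheme; the name getBit and the low-order-bit-first convention are assumptions.

#include <vector>

// Sketch only: read a row group's membership bit using the same byte/offset
// addressing as setBit, assuming bits are stored low-order-first within a byte.
static bool getBit(const std::vector<unsigned char> &membership,
                   unsigned int rowGroup) {
  unsigned int byte = rowGroup / 8;    // which byte holds this row group's bit
  unsigned int offset = rowGroup % 8;  // which bit within that byte
  return (membership[byte] >> offset) & 1;
}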
|   | |||||||
| @@ -2,8 +2,10 @@ | |||||||
|  |  | ||||||
| #include "parquet/api/reader.h" | #include "parquet/api/reader.h" | ||||||
|  |  | ||||||
| ParquetTable::ParquetTable(std::string file, std::string tableName): file(file), tableName(tableName) { | ParquetTable::ParquetTable(std::string file, std::string tableName) | ||||||
|   std::unique_ptr<parquet::ParquetFileReader> reader = parquet::ParquetFileReader::OpenFile(file.data()); |     : file(file), tableName(tableName) { | ||||||
|  |   std::unique_ptr<parquet::ParquetFileReader> reader = | ||||||
|  |       parquet::ParquetFileReader::OpenFile(file.data()); | ||||||
|   metadata = reader->metadata(); |   metadata = reader->metadata(); | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -13,17 +15,12 @@ std::string ParquetTable::columnName(int i) { | |||||||
|   return columnNames[i]; |   return columnNames[i]; | ||||||
| } | } | ||||||
|  |  | ||||||
| unsigned int ParquetTable::getNumColumns() { | unsigned int ParquetTable::getNumColumns() { return columnNames.size(); } | ||||||
|   return columnNames.size(); |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  |  | ||||||
| std::string ParquetTable::CreateStatement() { | std::string ParquetTable::CreateStatement() { | ||||||
|   std::unique_ptr<parquet::ParquetFileReader> reader = parquet::ParquetFileReader::OpenFile( |   std::unique_ptr<parquet::ParquetFileReader> reader = | ||||||
|       file.data(), |       parquet::ParquetFileReader::OpenFile( | ||||||
|       true, |           file.data(), true, parquet::default_reader_properties(), metadata); | ||||||
|       parquet::default_reader_properties(), |  | ||||||
|       metadata); |  | ||||||
|   std::string text("CREATE TABLE x("); |   std::string text("CREATE TABLE x("); | ||||||
|   auto schema = reader->metadata()->schema(); |   auto schema = reader->metadata()->schema(); | ||||||
|  |  | ||||||
| @@ -37,17 +34,20 @@ std::string ParquetTable::CreateStatement() { | |||||||
|  |  | ||||||
|     if (!_col->is_primitive()) { |     if (!_col->is_primitive()) { | ||||||
|       std::ostringstream ss; |       std::ostringstream ss; | ||||||
|       ss << __FILE__ << ":" << __LINE__ << ": column " << i << " has non-primitive type"; |       ss << __FILE__ << ":" << __LINE__ << ": column " << i | ||||||
|  |          << " has non-primitive type"; | ||||||
|       throw std::invalid_argument(ss.str()); |       throw std::invalid_argument(ss.str()); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     if (_col->is_repeated()) { |     if (_col->is_repeated()) { | ||||||
|       std::ostringstream ss; |       std::ostringstream ss; | ||||||
|       ss << __FILE__ << ":" << __LINE__ << ": column " << i << " has non-scalar type"; |       ss << __FILE__ << ":" << __LINE__ << ": column " << i | ||||||
|  |          << " has non-scalar type"; | ||||||
|       throw std::invalid_argument(ss.str()); |       throw std::invalid_argument(ss.str()); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     parquet::schema::PrimitiveNode* col = (parquet::schema::PrimitiveNode*)_col; |     parquet::schema::PrimitiveNode *col = | ||||||
|  |         (parquet::schema::PrimitiveNode *)_col; | ||||||
|  |  | ||||||
|     if (i > 0) |     if (i > 0) | ||||||
|       text += ", "; |       text += ", "; | ||||||
| @@ -125,21 +125,19 @@ std::string ParquetTable::CreateStatement() { | |||||||
|  |  | ||||||
|     if (type.empty()) { |     if (type.empty()) { | ||||||
|       std::ostringstream ss; |       std::ostringstream ss; | ||||||
|       ss << __FILE__ << ":" << __LINE__ << ": column " << i << " has unsupported type: " << |       ss << __FILE__ << ":" << __LINE__ << ": column " << i | ||||||
|         parquet::TypeToString(physical) << "/" << parquet::LogicalTypeToString(logical); |          << " has unsupported type: " << parquet::TypeToString(physical) << "/" | ||||||
|  |          << parquet::LogicalTypeToString(logical); | ||||||
|  |  | ||||||
|       throw std::invalid_argument(ss.str()); |       throw std::invalid_argument(ss.str()); | ||||||
|     } |     } | ||||||
|  |  | ||||||
| #ifdef DEBUG | #ifdef DEBUG | ||||||
|     printf("col %d[name=%s, p=%d:%s, l=%d:%s] is %s\n", |     printf( | ||||||
|         i, |         "col %d[name=%s, p=%d:%s, l=%d:%s] is %s\n", i, col->name().data(), | ||||||
|         col->name().data(), |  | ||||||
|         col->physical_type(), |         col->physical_type(), | ||||||
|         parquet::TypeToString(col->physical_type()).data(), |         parquet::TypeToString(col->physical_type()).data(), col->logical_type(), | ||||||
|         col->logical_type(), |         parquet::LogicalTypeToString(col->logical_type()).data(), type.data()); | ||||||
|         parquet::LogicalTypeToString(col->logical_type()).data(), |  | ||||||
|         type.data()); |  | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|     text += " "; |     text += " "; | ||||||
| @@ -149,7 +147,9 @@ std::string ParquetTable::CreateStatement() { | |||||||
|   return text; |   return text; | ||||||
| } | } | ||||||
|  |  | ||||||
| std::shared_ptr<parquet::FileMetaData> ParquetTable::getMetadata() { return metadata; } | std::shared_ptr<parquet::FileMetaData> ParquetTable::getMetadata() { | ||||||
|  |   return metadata; | ||||||
|  | } | ||||||
|  |  | ||||||
| const std::string &ParquetTable::getFile() { return file; } | const std::string &ParquetTable::getFile() { return file; } | ||||||
| const std::string &ParquetTable::getTableName() { return tableName; } | const std::string &ParquetTable::getTableName() { return tableName; } | ||||||
|   | |||||||
| @@ -1,9 +1,9 @@ | |||||||
| #ifndef PARQUET_TABLE_H | #ifndef PARQUET_TABLE_H | ||||||
| #define PARQUET_TABLE_H | #define PARQUET_TABLE_H | ||||||
|  |  | ||||||
| #include <vector> |  | ||||||
| #include <string> |  | ||||||
| #include "parquet/api/reader.h" | #include "parquet/api/reader.h" | ||||||
|  | #include <string> | ||||||
|  | #include <vector> | ||||||
|  |  | ||||||
| class ParquetTable { | class ParquetTable { | ||||||
|   std::string file; |   std::string file; | ||||||
| @@ -11,7 +11,6 @@ class ParquetTable { | |||||||
|   std::vector<std::string> columnNames; |   std::vector<std::string> columnNames; | ||||||
|   std::shared_ptr<parquet::FileMetaData> metadata; |   std::shared_ptr<parquet::FileMetaData> metadata; | ||||||
|  |  | ||||||
|  |  | ||||||
| public: | public: | ||||||
|   ParquetTable(std::string file, std::string tableName); |   ParquetTable(std::string file, std::string tableName); | ||||||
|   std::string CreateStatement(); |   std::string CreateStatement(); | ||||||
|   | |||||||