1
0
mirror of https://github.com/cldellow/sqlite-parquet-vtable.git synced 2025-06-12 15:17:19 +00:00

Run a formatting pass with clang-format to minimize future git churn

This commit is contained in:
Addie Morrison 2019-12-08 16:08:11 -06:00
parent ae194c69c5
commit 7bc6f91f6f
7 changed files with 1104 additions and 1165 deletions

View File

@ -1,66 +1,65 @@
/* /*
* This file contains the implementation of an SQLite virtual table for * This file contains the implementation of an SQLite virtual table for
* reading Parquet files. * reading Parquet files.
* *
* Usage: * Usage:
* *
* .load ./parquet * .load ./parquet
* CREATE VIRTUAL TABLE demo USING parquet(FILENAME); * CREATE VIRTUAL TABLE demo USING parquet(FILENAME);
* SELECT * FROM demo; * SELECT * FROM demo;
* *
*/ */
#include <sqlite3ext.h> #include <sqlite3ext.h>
SQLITE_EXTENSION_INIT1 SQLITE_EXTENSION_INIT1
#include <string.h>
#include <stdlib.h>
#include <assert.h> #include <assert.h>
#include <stdarg.h>
#include <ctype.h> #include <ctype.h>
#include <stdio.h>
#include <iomanip> #include <iomanip>
#include <sys/time.h>
#include <memory> #include <memory>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include "parquet_table.h"
#include "parquet_cursor.h" #include "parquet_cursor.h"
#include "parquet_filter.h" #include "parquet_filter.h"
#include "parquet_table.h"
//#define DEBUG //#define DEBUG
/* Forward references to the various virtual table methods implemented /* Forward references to the various virtual table methods implemented
* in this file. */ * in this file. */
static int parquetCreate(sqlite3*, void*, int, const char*const*, static int parquetCreate(sqlite3 *, void *, int, const char *const *,
sqlite3_vtab**,char**); sqlite3_vtab **, char **);
static int parquetConnect(sqlite3*, void*, int, const char*const*, static int parquetConnect(sqlite3 *, void *, int, const char *const *,
sqlite3_vtab**,char**); sqlite3_vtab **, char **);
static int parquetBestIndex(sqlite3_vtab*,sqlite3_index_info*); static int parquetBestIndex(sqlite3_vtab *, sqlite3_index_info *);
static int parquetDisconnect(sqlite3_vtab*); static int parquetDisconnect(sqlite3_vtab *);
static int parquetDestroy(sqlite3_vtab*); static int parquetDestroy(sqlite3_vtab *);
static int parquetOpen(sqlite3_vtab*, sqlite3_vtab_cursor**); static int parquetOpen(sqlite3_vtab *, sqlite3_vtab_cursor **);
static int parquetClose(sqlite3_vtab_cursor*); static int parquetClose(sqlite3_vtab_cursor *);
static int parquetFilter(sqlite3_vtab_cursor*, int idxNum, const char *idxStr, static int parquetFilter(sqlite3_vtab_cursor *, int idxNum, const char *idxStr,
int argc, sqlite3_value **argv); int argc, sqlite3_value **argv);
static int parquetNext(sqlite3_vtab_cursor*); static int parquetNext(sqlite3_vtab_cursor *);
static int parquetEof(sqlite3_vtab_cursor*); static int parquetEof(sqlite3_vtab_cursor *);
static int parquetColumn(sqlite3_vtab_cursor*,sqlite3_context*,int); static int parquetColumn(sqlite3_vtab_cursor *, sqlite3_context *, int);
static int parquetRowid(sqlite3_vtab_cursor*,sqlite3_int64*); static int parquetRowid(sqlite3_vtab_cursor *, sqlite3_int64 *);
/* An instance of the Parquet virtual table */ /* An instance of the Parquet virtual table */
typedef struct sqlite3_vtab_parquet { typedef struct sqlite3_vtab_parquet {
sqlite3_vtab base; /* Base class. Must be first */ sqlite3_vtab base; /* Base class. Must be first */
ParquetTable* table; ParquetTable *table;
sqlite3* db; sqlite3 *db;
} sqlite3_vtab_parquet; } sqlite3_vtab_parquet;
/* A cursor for the Parquet virtual table */ /* A cursor for the Parquet virtual table */
typedef struct sqlite3_vtab_cursor_parquet { typedef struct sqlite3_vtab_cursor_parquet {
sqlite3_vtab_cursor base; /* Base class. Must be first */ sqlite3_vtab_cursor base; /* Base class. Must be first */
ParquetCursor* cursor; ParquetCursor *cursor;
} sqlite3_vtab_cursor_parquet; } sqlite3_vtab_cursor_parquet;
static int parquetDestroy(sqlite3_vtab *pVtab) { static int parquetDestroy(sqlite3_vtab *pVtab) {
sqlite3_vtab_parquet *p = (sqlite3_vtab_parquet*)pVtab; sqlite3_vtab_parquet *p = (sqlite3_vtab_parquet *)pVtab;
// Clean up our shadow table. This is useful if the user has recreated // Clean up our shadow table. This is useful if the user has recreated
// the parquet file, and our mappings would now be invalid. // the parquet file, and our mappings would now be invalid.
@ -68,7 +67,7 @@ static int parquetDestroy(sqlite3_vtab *pVtab) {
drop.append(p->table->getTableName()); drop.append(p->table->getTableName());
drop.append("_rowgroups"); drop.append("_rowgroups");
int rv = sqlite3_exec(p->db, drop.data(), 0, 0, 0); int rv = sqlite3_exec(p->db, drop.data(), 0, 0, 0);
if(rv != 0) if (rv != 0)
return rv; return rv;
return SQLITE_OK; return SQLITE_OK;
@ -77,24 +76,20 @@ static int parquetDestroy(sqlite3_vtab *pVtab) {
/* /*
** This method is the destructor fo a sqlite3_vtab_parquet object. ** This method is the destructor fo a sqlite3_vtab_parquet object.
*/ */
static int parquetDisconnect(sqlite3_vtab *pVtab){ static int parquetDisconnect(sqlite3_vtab *pVtab) {
sqlite3_vtab_parquet *p = (sqlite3_vtab_parquet*)pVtab; sqlite3_vtab_parquet *p = (sqlite3_vtab_parquet *)pVtab;
delete p->table; delete p->table;
sqlite3_free(p); sqlite3_free(p);
return SQLITE_OK; return SQLITE_OK;
} }
static int parquetConnect( static int parquetConnect(sqlite3 *db, void *pAux, int argc,
sqlite3 *db, const char *const *argv, sqlite3_vtab **ppVtab,
void *pAux, char **pzErr) {
int argc,
const char *const*argv,
sqlite3_vtab **ppVtab,
char **pzErr
){
try { try {
if(argc != 4 || strlen(argv[3]) < 2) { if (argc != 4 || strlen(argv[3]) < 2) {
*pzErr = sqlite3_mprintf("must provide exactly one argument, the path to a parquet file"); *pzErr = sqlite3_mprintf(
"must provide exactly one argument, the path to a parquet file");
return SQLITE_ERROR; return SQLITE_ERROR;
} }
@ -102,8 +97,8 @@ static int parquetConnect(
// Remove the delimiting single quotes // Remove the delimiting single quotes
std::string fname = argv[3]; std::string fname = argv[3];
fname = fname.substr(1, fname.length() - 2); fname = fname.substr(1, fname.length() - 2);
std::unique_ptr<sqlite3_vtab_parquet, void(*)(void*)> vtab( std::unique_ptr<sqlite3_vtab_parquet, void (*)(void *)> vtab(
(sqlite3_vtab_parquet*)sqlite3_malloc(sizeof(sqlite3_vtab_parquet)), (sqlite3_vtab_parquet *)sqlite3_malloc(sizeof(sqlite3_vtab_parquet)),
sqlite3_free); sqlite3_free);
memset(vtab.get(), 0, sizeof(*vtab.get())); memset(vtab.get(), 0, sizeof(*vtab.get()));
@ -112,20 +107,20 @@ static int parquetConnect(
std::string create = table->CreateStatement(); std::string create = table->CreateStatement();
int rc = sqlite3_declare_vtab(db, create.data()); int rc = sqlite3_declare_vtab(db, create.data());
if(rc) if (rc)
return rc; return rc;
vtab->table = table.release(); vtab->table = table.release();
vtab->db = db; vtab->db = db;
*ppVtab = (sqlite3_vtab*)vtab.release(); *ppVtab = (sqlite3_vtab *)vtab.release();
return SQLITE_OK; return SQLITE_OK;
} catch (const std::exception& e) { } catch (const std::exception &e) {
*pzErr = sqlite3_mprintf(e.what()); *pzErr = sqlite3_mprintf(e.what());
return SQLITE_ERROR; return SQLITE_ERROR;
} }
} catch(std::bad_alloc& ba) { } catch (std::bad_alloc &ba) {
return SQLITE_NOMEM; return SQLITE_NOMEM;
} catch(std::exception& e) { } catch (std::exception &e) {
return SQLITE_ERROR; return SQLITE_ERROR;
} }
} }
@ -134,20 +129,16 @@ static int parquetConnect(
** The xConnect and xCreate methods do the same thing, but they must be ** The xConnect and xCreate methods do the same thing, but they must be
** different so that the virtual table is not an eponymous virtual table. ** different so that the virtual table is not an eponymous virtual table.
*/ */
static int parquetCreate( static int parquetCreate(sqlite3 *db, void *pAux, int argc,
sqlite3 *db, const char *const *argv, sqlite3_vtab **ppVtab,
void *pAux, char **pzErr) {
int argc, const char *const*argv,
sqlite3_vtab **ppVtab,
char **pzErr
){
try { try {
// Create shadow table for storing constraint -> rowid mappings // Create shadow table for storing constraint -> rowid mappings
std::string create = "CREATE TABLE IF NOT EXISTS _"; std::string create = "CREATE TABLE IF NOT EXISTS _";
create.append(argv[2]); create.append(argv[2]);
create.append("_rowgroups(clause TEXT, estimate BLOB, actual BLOB)"); create.append("_rowgroups(clause TEXT, estimate BLOB, actual BLOB)");
int rv = sqlite3_exec(db, create.data(), 0, 0, 0); int rv = sqlite3_exec(db, create.data(), 0, 0, 0);
if(rv != 0) if (rv != 0)
return rv; return rv;
create = "CREATE UNIQUE INDEX IF NOT EXISTS _"; create = "CREATE UNIQUE INDEX IF NOT EXISTS _";
@ -158,28 +149,31 @@ static int parquetCreate(
rv = sqlite3_exec(db, create.data(), 0, 0, 0); rv = sqlite3_exec(db, create.data(), 0, 0, 0);
return parquetConnect(db, pAux, argc, argv, ppVtab, pzErr); return parquetConnect(db, pAux, argc, argv, ppVtab, pzErr);
} catch (std::bad_alloc& ba) { } catch (std::bad_alloc &ba) {
return SQLITE_NOMEM; return SQLITE_NOMEM;
} }
} }
/*
** Render a byte sequence as an SQL blob literal: X'<hex>', two lowercase
** hex digits per byte. An empty input yields X''.
*/
std::string quoteBlob(const std::vector<unsigned char> &bytes) {
  static const char kHexDigits[] = "0123456789abcdef";

  std::string quoted;
  // Two digits per byte plus the X, and the two quote characters.
  quoted.reserve(bytes.size() * 2 + 3);
  quoted += "X'";
  for (unsigned char byte : bytes) {
    quoted += kHexDigits[byte >> 4];
    quoted += kHexDigits[byte & 0x0F];
  }
  quoted += '\'';
  return quoted;
}
void persistConstraints(sqlite3* db, ParquetCursor* cursor) { void persistConstraints(sqlite3 *db, ParquetCursor *cursor) {
for(unsigned int i = 0; i < cursor->getNumConstraints(); i++) { for (unsigned int i = 0; i < cursor->getNumConstraints(); i++) {
const Constraint& constraint = cursor->getConstraint(i); const Constraint &constraint = cursor->getConstraint(i);
const std::vector<unsigned char>& estimated = constraint.bitmap.estimatedMembership; const std::vector<unsigned char> &estimated =
const std::vector<unsigned char>& actual = constraint.bitmap.actualMembership; constraint.bitmap.estimatedMembership;
if(estimated == actual) { const std::vector<unsigned char> &actual =
constraint.bitmap.actualMembership;
if (estimated == actual) {
continue; continue;
} }
std::string desc = constraint.describe(); std::string desc = constraint.describe();
@ -188,15 +182,13 @@ void persistConstraints(sqlite3* db, ParquetCursor* cursor) {
std::string actualStr = quoteBlob(actual); std::string actualStr = quoteBlob(actual);
// This is only advisory, so ignore failures. // This is only advisory, so ignore failures.
char* sql = sqlite3_mprintf( char *sql =
"INSERT OR REPLACE INTO _%s_rowgroups(clause, estimate, actual) VALUES ('%q', %s, %s)", sqlite3_mprintf("INSERT OR REPLACE INTO _%s_rowgroups(clause, "
"estimate, actual) VALUES ('%q', %s, %s)",
cursor->getTable()->getTableName().c_str(), cursor->getTable()->getTableName().c_str(),
desc.c_str(), desc.c_str(), estimatedStr.c_str(), actualStr.c_str());
estimatedStr.c_str(),
actualStr.c_str());
if (sql == NULL)
if(sql == NULL)
return; return;
sqlite3_exec(db, sql, 0, 0, 0); sqlite3_exec(db, sql, 0, 0, 0);
@ -204,12 +196,12 @@ void persistConstraints(sqlite3* db, ParquetCursor* cursor) {
} }
} }
/* /*
** Destructor for a sqlite3_vtab_cursor_parquet. ** Destructor for a sqlite3_vtab_cursor_parquet.
*/ */
static int parquetClose(sqlite3_vtab_cursor *cur){ static int parquetClose(sqlite3_vtab_cursor *cur) {
sqlite3_vtab_cursor_parquet* vtab_cursor_parquet = (sqlite3_vtab_cursor_parquet*)cur; sqlite3_vtab_cursor_parquet *vtab_cursor_parquet =
(sqlite3_vtab_cursor_parquet *)cur;
vtab_cursor_parquet->cursor->close(); vtab_cursor_parquet->cursor->close();
delete vtab_cursor_parquet->cursor; delete vtab_cursor_parquet->cursor;
sqlite3_free(cur); sqlite3_free(cur);
@ -219,39 +211,40 @@ static int parquetClose(sqlite3_vtab_cursor *cur){
/* /*
** Constructor for a new sqlite3_vtab_parquet cursor object. ** Constructor for a new sqlite3_vtab_parquet cursor object.
*/ */
static int parquetOpen(sqlite3_vtab *p, sqlite3_vtab_cursor **ppCursor){ static int parquetOpen(sqlite3_vtab *p, sqlite3_vtab_cursor **ppCursor) {
try { try {
std::unique_ptr<sqlite3_vtab_cursor_parquet, void(*)(void*)> cursor( std::unique_ptr<sqlite3_vtab_cursor_parquet, void (*)(void *)> cursor(
(sqlite3_vtab_cursor_parquet*)sqlite3_malloc(sizeof(sqlite3_vtab_cursor_parquet)), (sqlite3_vtab_cursor_parquet *)sqlite3_malloc(
sizeof(sqlite3_vtab_cursor_parquet)),
sqlite3_free); sqlite3_free);
memset(cursor.get(), 0, sizeof(*cursor.get())); memset(cursor.get(), 0, sizeof(*cursor.get()));
sqlite3_vtab_parquet* pParquet = (sqlite3_vtab_parquet*)p; sqlite3_vtab_parquet *pParquet = (sqlite3_vtab_parquet *)p;
cursor->cursor = new ParquetCursor(pParquet->table); cursor->cursor = new ParquetCursor(pParquet->table);
*ppCursor = (sqlite3_vtab_cursor*)cursor.release(); *ppCursor = (sqlite3_vtab_cursor *)cursor.release();
return SQLITE_OK; return SQLITE_OK;
} catch(std::bad_alloc& ba) { } catch (std::bad_alloc &ba) {
return SQLITE_NOMEM; return SQLITE_NOMEM;
} catch(std::exception& e) { } catch (std::exception &e) {
return SQLITE_ERROR; return SQLITE_ERROR;
} }
} }
/* /*
** Advance a sqlite3_vtab_cursor_parquet to its next row of input. ** Advance a sqlite3_vtab_cursor_parquet to its next row of input.
** Set the EOF marker if we reach the end of input. ** Set the EOF marker if we reach the end of input.
*/ */
static int parquetNext(sqlite3_vtab_cursor *cur){ static int parquetNext(sqlite3_vtab_cursor *cur) {
try { try {
sqlite3_vtab_cursor_parquet* vtab_cursor_parquet = (sqlite3_vtab_cursor_parquet*)cur; sqlite3_vtab_cursor_parquet *vtab_cursor_parquet =
ParquetCursor* cursor = vtab_cursor_parquet->cursor; (sqlite3_vtab_cursor_parquet *)cur;
ParquetCursor *cursor = vtab_cursor_parquet->cursor;
cursor->next(); cursor->next();
return SQLITE_OK; return SQLITE_OK;
} catch(std::bad_alloc& ba) { } catch (std::bad_alloc &ba) {
return SQLITE_NOMEM; return SQLITE_NOMEM;
} catch(std::exception& e) { } catch (std::exception &e) {
return SQLITE_ERROR; return SQLITE_ERROR;
} }
} }
@ -260,73 +253,70 @@ static int parquetNext(sqlite3_vtab_cursor *cur){
** Return values of columns for the row at which the sqlite3_vtab_cursor_parquet ** Return values of columns for the row at which the sqlite3_vtab_cursor_parquet
** is currently pointing. ** is currently pointing.
*/ */
static int parquetColumn( static int
sqlite3_vtab_cursor *cur, /* The cursor */ parquetColumn(sqlite3_vtab_cursor *cur, /* The cursor */
sqlite3_context *ctx, /* First argument to sqlite3_result_...() */ sqlite3_context *ctx, /* First argument to sqlite3_result_...() */
int col /* Which column to return */ int col /* Which column to return */
){ ) {
try { try {
ParquetCursor *cursor = ((sqlite3_vtab_cursor_parquet*)cur)->cursor; ParquetCursor *cursor = ((sqlite3_vtab_cursor_parquet *)cur)->cursor;
cursor->ensureColumn(col); cursor->ensureColumn(col);
if(cursor->isNull(col)) { if (cursor->isNull(col)) {
sqlite3_result_null(ctx); sqlite3_result_null(ctx);
} else { } else {
switch(cursor->getPhysicalType(col)) { switch (cursor->getPhysicalType(col)) {
case parquet::Type::BOOLEAN: case parquet::Type::BOOLEAN:
case parquet::Type::INT32: case parquet::Type::INT32: {
{
int rv = cursor->getInt32(col); int rv = cursor->getInt32(col);
sqlite3_result_int(ctx, rv); sqlite3_result_int(ctx, rv);
break; break;
} }
case parquet::Type::FLOAT: case parquet::Type::FLOAT:
case parquet::Type::DOUBLE: case parquet::Type::DOUBLE: {
{
double rv = cursor->getDouble(col); double rv = cursor->getDouble(col);
sqlite3_result_double(ctx, rv); sqlite3_result_double(ctx, rv);
break; break;
} }
case parquet::Type::BYTE_ARRAY: case parquet::Type::BYTE_ARRAY: {
{ parquet::ByteArray *rv = cursor->getByteArray(col);
parquet::ByteArray* rv = cursor->getByteArray(col); if (cursor->getLogicalType(col) == parquet::LogicalType::UTF8) {
if(cursor->getLogicalType(col) == parquet::LogicalType::UTF8) { sqlite3_result_text(ctx, (const char *)rv->ptr, rv->len,
sqlite3_result_text(ctx, (const char*)rv->ptr, rv->len, SQLITE_TRANSIENT); SQLITE_TRANSIENT);
} else { } else {
sqlite3_result_blob(ctx, (void*)rv->ptr, rv->len, SQLITE_TRANSIENT); sqlite3_result_blob(ctx, (void *)rv->ptr, rv->len, SQLITE_TRANSIENT);
} }
break; break;
} }
case parquet::Type::INT96: case parquet::Type::INT96:
// This type exists to store timestamps in nanoseconds due to legacy // This type exists to store timestamps in nanoseconds due to legacy
// reasons. We just interpret it as a timestamp in milliseconds. // reasons. We just interpret it as a timestamp in milliseconds.
case parquet::Type::INT64: case parquet::Type::INT64: {
{
long rv = cursor->getInt64(col); long rv = cursor->getInt64(col);
sqlite3_result_int64(ctx, rv); sqlite3_result_int64(ctx, rv);
break; break;
} }
case parquet::Type::FIXED_LEN_BYTE_ARRAY: case parquet::Type::FIXED_LEN_BYTE_ARRAY: {
{ parquet::ByteArray *rv = cursor->getByteArray(col);
parquet::ByteArray* rv = cursor->getByteArray(col); sqlite3_result_blob(ctx, (void *)rv->ptr, rv->len, SQLITE_TRANSIENT);
sqlite3_result_blob(ctx, (void*)rv->ptr, rv->len, SQLITE_TRANSIENT);
break; break;
} }
default: default:
// Should be impossible to get here as we should have forbidden this at // Should be impossible to get here as we should have forbidden this at
// CREATE time -- maybe file changed underneath us? // CREATE time -- maybe file changed underneath us?
std::ostringstream ss; std::ostringstream ss;
ss << __FILE__ << ":" << __LINE__ << ": column " << col << " has unsupported type: " << ss << __FILE__ << ":" << __LINE__ << ": column " << col
parquet::TypeToString(cursor->getPhysicalType(col)); << " has unsupported type: "
<< parquet::TypeToString(cursor->getPhysicalType(col));
throw std::invalid_argument(ss.str()); throw std::invalid_argument(ss.str());
break; break;
} }
} }
return SQLITE_OK; return SQLITE_OK;
} catch(std::bad_alloc& ba) { } catch (std::bad_alloc &ba) {
return SQLITE_NOMEM; return SQLITE_NOMEM;
} catch(std::exception& e) { } catch (std::exception &e) {
return SQLITE_ERROR; return SQLITE_ERROR;
} }
} }
@ -334,8 +324,8 @@ static int parquetColumn(
/* /*
** Return the rowid for the current row. ** Return the rowid for the current row.
*/ */
static int parquetRowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid){ static int parquetRowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid) {
ParquetCursor *cursor = ((sqlite3_vtab_cursor_parquet*)cur)->cursor; ParquetCursor *cursor = ((sqlite3_vtab_cursor_parquet *)cur)->cursor;
*pRowid = cursor->getRowId(); *pRowid = cursor->getRowId();
return SQLITE_OK; return SQLITE_OK;
} }
@ -344,11 +334,13 @@ static int parquetRowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid){
** Return TRUE if the cursor has been moved off of the last ** Return TRUE if the cursor has been moved off of the last
** row of output. ** row of output.
*/ */
static int parquetEof(sqlite3_vtab_cursor *cur){ static int parquetEof(sqlite3_vtab_cursor *cur) {
ParquetCursor* cursor = ((sqlite3_vtab_cursor_parquet*)cur)->cursor; ParquetCursor *cursor = ((sqlite3_vtab_cursor_parquet *)cur)->cursor;
if(cursor->eof()) { if (cursor->eof()) {
sqlite3_vtab_cursor_parquet* vtab_cursor_parquet = (sqlite3_vtab_cursor_parquet*)cur; sqlite3_vtab_cursor_parquet *vtab_cursor_parquet =
sqlite3_vtab_parquet* vtab_parquet = (sqlite3_vtab_parquet*)(vtab_cursor_parquet->base.pVtab); (sqlite3_vtab_cursor_parquet *)cur;
sqlite3_vtab_parquet *vtab_parquet =
(sqlite3_vtab_parquet *)(vtab_cursor_parquet->base.pVtab);
persistConstraints(vtab_parquet->db, cursor); persistConstraints(vtab_parquet->db, cursor);
return 1; return 1;
} }
@ -356,8 +348,8 @@ static int parquetEof(sqlite3_vtab_cursor *cur){
} }
#ifdef DEBUG #ifdef DEBUG
const char* opName(int op) { const char *opName(int op) {
switch(op) { switch (op) {
case SQLITE_INDEX_CONSTRAINT_EQ: case SQLITE_INDEX_CONSTRAINT_EQ:
return "="; return "=";
case SQLITE_INDEX_CONSTRAINT_GT: case SQLITE_INDEX_CONSTRAINT_GT:
@ -391,66 +383,60 @@ const char* opName(int op) {
} }
} }
/*
** Debug helper: print every constraint in pIdxInfo to stdout.
**
** pIdxInfo  index info whose aConstraint entries are listed
** table     used to turn a column index into a column name
** argc/argv constraint values, consumed in order by the usable constraints;
**           pass argv == NULL when no values are available, in which case
**           each value is printed as "?"
*/
void debugConstraints(sqlite3_index_info *pIdxInfo, ParquetTable *table,
                      int argc, sqlite3_value **argv) {
  printf("debugConstraints, argc=%d\n", argc);
  int j = 0;
  for (int i = 0; i < pIdxInfo->nConstraint; i++) {
    std::string valueStr = "?";
    if (argv != NULL && pIdxInfo->aConstraint[i].usable) {
      // Never read past the values SQLite actually supplied.
      if (j < argc) {
        switch (sqlite3_value_type(argv[j])) {
        case SQLITE_INTEGER: {
          sqlite3_int64 rv = sqlite3_value_int64(argv[j]);
          std::ostringstream ss;
          ss << rv;
          valueStr = ss.str();
          break;
        }
        case SQLITE_FLOAT: {
          double rv = sqlite3_value_double(argv[j]);
          std::ostringstream ss;
          ss << rv;
          valueStr = ss.str();
          break;
        }
        case SQLITE_TEXT: {
          const unsigned char *rv = sqlite3_value_text(argv[j]);
          std::ostringstream ss;
          // sqlite3_value_text() may return NULL; streaming a NULL pointer
          // is undefined behaviour.
          ss << "'" << (rv != NULL ? (const char *)rv : "") << "'";
          valueStr = ss.str();
          break;
        }
        case SQLITE_BLOB: {
          int sizeBytes = sqlite3_value_bytes(argv[j]);
          std::ostringstream ss;
          ss << "'..." << sizeBytes << "-byte blob...'";
          valueStr = ss.str();
          break;
        }
        case SQLITE_NULL: {
          valueStr = "NULL";
          break;
        }
        }
      }
      j++;
    }

    // A negative iColumn is treated as the rowid, as parquetFilter does.
    std::string columnName = "rowid";
    if (pIdxInfo->aConstraint[i].iColumn >= 0) {
      columnName = table->columnName(pIdxInfo->aConstraint[i].iColumn);
    }
    printf(" constraint %d: col %s %s %s, usable %d\n", i, columnName.data(),
           opName(pIdxInfo->aConstraint[i].op), valueStr.data(),
           pIdxInfo->aConstraint[i].usable);
  }
}
#endif #endif
ConstraintOperator constraintOperatorFromSqlite(int op) { ConstraintOperator constraintOperatorFromSqlite(int op) {
switch(op) { switch (op) {
case SQLITE_INDEX_CONSTRAINT_EQ: case SQLITE_INDEX_CONSTRAINT_EQ:
return Equal; return Equal;
case SQLITE_INDEX_CONSTRAINT_GT: case SQLITE_INDEX_CONSTRAINT_GT:
@ -482,29 +468,30 @@ ConstraintOperator constraintOperatorFromSqlite(int op) {
throw std::invalid_argument(ss.str()); throw std::invalid_argument(ss.str());
} }
std::vector<unsigned char> getRowGroupsForClause(sqlite3* db, std::string table, std::string clause) { std::vector<unsigned char> getRowGroupsForClause(sqlite3 *db, std::string table,
std::string clause) {
std::vector<unsigned char> rv; std::vector<unsigned char> rv;
std::unique_ptr<char, void(*)(void*)> sql(sqlite3_mprintf( std::unique_ptr<char, void (*)(void *)> sql(
"SELECT actual FROM _%s_rowgroups WHERE clause = '%q'", sqlite3_mprintf("SELECT actual FROM _%s_rowgroups WHERE clause = '%q'",
table.c_str(), table.c_str(), clause.c_str()),
clause.c_str()), sqlite3_free); sqlite3_free);
if(sql.get() == NULL) if (sql.get() == NULL)
return rv; return rv;
sqlite3_stmt* pStmt = NULL; sqlite3_stmt *pStmt = NULL;
int rc = sqlite3_prepare_v2(db, sql.get(), -1, &pStmt, NULL); int rc = sqlite3_prepare_v2(db, sql.get(), -1, &pStmt, NULL);
if(rc != 0) if (rc != 0)
return rv; return rv;
rc = sqlite3_step(pStmt); rc = sqlite3_step(pStmt);
if(rc == SQLITE_ROW) { if (rc == SQLITE_ROW) {
int size = sqlite3_column_bytes(pStmt, 0); int size = sqlite3_column_bytes(pStmt, 0);
unsigned char* blob = (unsigned char*)sqlite3_column_blob(pStmt, 0); unsigned char *blob = (unsigned char *)sqlite3_column_blob(pStmt, 0);
// TODO: there is a memory leak here if we get a std::bad_alloc while populating rv; // TODO: there is a memory leak here if we get a std::bad_alloc while
// we fail to free pStmt // populating rv; we fail to free pStmt
for(int i = 0; i < size; i++) { for (int i = 0; i < size; i++) {
rv.push_back(blob[i]); rv.push_back(blob[i]);
} }
} }
@ -513,24 +500,20 @@ std::vector<unsigned char> getRowGroupsForClause(sqlite3* db, std::string table,
return rv; return rv;
} }
/*
** Begin a scan. Builds one Constraint for each usable entry of the
** sqlite3_index_info passed in through idxStr, taking the values from
** argv, then resets the cursor with those constraints and advances to
** the first row.
*/
static int parquetFilter( static int parquetFilter(sqlite3_vtab_cursor *cur, int idxNum,
sqlite3_vtab_cursor *cur, const char *idxStr, int argc, sqlite3_value **argv) {
int idxNum,
const char *idxStr,
int argc,
sqlite3_value **argv
){
try { try {
sqlite3_vtab_cursor_parquet* vtab_cursor_parquet = (sqlite3_vtab_cursor_parquet*)cur; sqlite3_vtab_cursor_parquet *vtab_cursor_parquet =
sqlite3_vtab_parquet* vtab_parquet = (sqlite3_vtab_parquet*)(vtab_cursor_parquet->base.pVtab); (sqlite3_vtab_cursor_parquet *)cur;
sqlite3* db = vtab_parquet->db; sqlite3_vtab_parquet *vtab_parquet =
ParquetCursor* cursor = vtab_cursor_parquet->cursor; (sqlite3_vtab_parquet *)(vtab_cursor_parquet->base.pVtab);
sqlite3_index_info* indexInfo = (sqlite3_index_info*)idxStr; sqlite3 *db = vtab_parquet->db;
ParquetCursor *cursor = vtab_cursor_parquet->cursor;
sqlite3_index_info *indexInfo = (sqlite3_index_info *)idxStr;
#ifdef DEBUG #ifdef DEBUG
struct timeval tv; struct timeval tv;
@ -539,13 +522,14 @@ static int parquetFilter(
(unsigned long long)(tv.tv_sec) * 1000 + (unsigned long long)(tv.tv_sec) * 1000 +
(unsigned long long)(tv.tv_usec) / 1000; (unsigned long long)(tv.tv_usec) / 1000;
printf("%llu xFilter: idxNum=%d, idxStr=%lu, argc=%d\n", millisecondsSinceEpoch, idxNum, (long unsigned int)idxStr, argc); printf("%llu xFilter: idxNum=%d, idxStr=%lu, argc=%d\n",
millisecondsSinceEpoch, idxNum, (long unsigned int)idxStr, argc);
debugConstraints(indexInfo, cursor->getTable(), argc, argv); debugConstraints(indexInfo, cursor->getTable(), argc, argv);
#endif #endif
std::vector<Constraint> constraints; std::vector<Constraint> constraints;
int j = 0; int j = 0;
for(int i = 0; i < indexInfo->nConstraint; i++) { for (int i = 0; i < indexInfo->nConstraint; i++) {
if(!indexInfo->aConstraint[i].usable) { if (!indexInfo->aConstraint[i].usable) {
continue; continue;
} }
@ -555,86 +539,76 @@ static int parquetFilter(
std::vector<unsigned char> blobValue; std::vector<unsigned char> blobValue;
int sqliteType = sqlite3_value_type(argv[j]); int sqliteType = sqlite3_value_type(argv[j]);
if(sqliteType == SQLITE_INTEGER) { if (sqliteType == SQLITE_INTEGER) {
type = Integer; type = Integer;
intValue = sqlite3_value_int64(argv[j]); intValue = sqlite3_value_int64(argv[j]);
} else if(sqliteType == SQLITE_FLOAT) { } else if (sqliteType == SQLITE_FLOAT) {
type = Double; type = Double;
doubleValue = sqlite3_value_double(argv[j]); doubleValue = sqlite3_value_double(argv[j]);
} else if(sqliteType == SQLITE_TEXT) { } else if (sqliteType == SQLITE_TEXT) {
type = Text; type = Text;
int len = sqlite3_value_bytes(argv[j]); int len = sqlite3_value_bytes(argv[j]);
const unsigned char* ptr = sqlite3_value_text(argv[j]); const unsigned char *ptr = sqlite3_value_text(argv[j]);
for(int k = 0; k < len; k++) { for (int k = 0; k < len; k++) {
blobValue.push_back(ptr[k]); blobValue.push_back(ptr[k]);
} }
} else if(sqliteType == SQLITE_BLOB) { } else if (sqliteType == SQLITE_BLOB) {
type = Blob; type = Blob;
int len = sqlite3_value_bytes(argv[j]); int len = sqlite3_value_bytes(argv[j]);
const unsigned char* ptr = (const unsigned char*)sqlite3_value_blob(argv[j]); const unsigned char *ptr =
for(int k = 0; k < len; k++) { (const unsigned char *)sqlite3_value_blob(argv[j]);
for (int k = 0; k < len; k++) {
blobValue.push_back(ptr[k]); blobValue.push_back(ptr[k]);
} }
} else if(sqliteType == SQLITE_NULL) { } else if (sqliteType == SQLITE_NULL) {
type = Null; type = Null;
} }
std::string columnName = "rowid"; std::string columnName = "rowid";
if(indexInfo->aConstraint[i].iColumn >= 0) { if (indexInfo->aConstraint[i].iColumn >= 0) {
columnName = cursor->getTable()->columnName(indexInfo->aConstraint[i].iColumn); columnName =
cursor->getTable()->columnName(indexInfo->aConstraint[i].iColumn);
} }
RowGroupBitmap bitmap = RowGroupBitmap(cursor->getNumRowGroups()); RowGroupBitmap bitmap = RowGroupBitmap(cursor->getNumRowGroups());
Constraint dummy( Constraint dummy(
bitmap, bitmap, indexInfo->aConstraint[i].iColumn, columnName,
indexInfo->aConstraint[i].iColumn, constraintOperatorFromSqlite(indexInfo->aConstraint[i].op), type,
columnName, intValue, doubleValue, blobValue);
constraintOperatorFromSqlite(indexInfo->aConstraint[i].op),
type,
intValue,
doubleValue,
blobValue);
std::vector<unsigned char> actual = getRowGroupsForClause(db, cursor->getTable()->getTableName(), dummy.describe()); std::vector<unsigned char> actual = getRowGroupsForClause(
if(actual.size() > 0) { db, cursor->getTable()->getTableName(), dummy.describe());
// Initialize the estimate to be the actual -- eventually they'll converge if (actual.size() > 0) {
// and we'll stop writing back to the db. // Initialize the estimate to be the actual -- eventually they'll
// converge and we'll stop writing back to the db.
std::vector<unsigned char> estimate = actual; std::vector<unsigned char> estimate = actual;
bitmap = RowGroupBitmap(estimate, actual); bitmap = RowGroupBitmap(estimate, actual);
} }
Constraint constraint( Constraint constraint(
bitmap, bitmap, indexInfo->aConstraint[i].iColumn, columnName,
indexInfo->aConstraint[i].iColumn, constraintOperatorFromSqlite(indexInfo->aConstraint[i].op), type,
columnName, intValue, doubleValue, blobValue);
constraintOperatorFromSqlite(indexInfo->aConstraint[i].op),
type,
intValue,
doubleValue,
blobValue);
constraints.push_back(constraint); constraints.push_back(constraint);
j++; j++;
} }
cursor->reset(constraints); cursor->reset(constraints);
return parquetNext(cur); return parquetNext(cur);
} catch(std::bad_alloc& ba) { } catch (std::bad_alloc &ba) {
return SQLITE_NOMEM; return SQLITE_NOMEM;
} catch(std::exception& e) { } catch (std::exception &e) {
return SQLITE_ERROR; return SQLITE_ERROR;
} }
} }
/* /*
* We'll always indicate to SQLite that we prefer it to use an index so that it will * We'll always indicate to SQLite that we prefer it to use an index so that it
* pass additional context to xFilter, which we may or may not use. * will pass additional context to xFilter, which we may or may not use.
* *
* We copy the sqlite3_index_info structure, as is, into idxStr for later use. * We copy the sqlite3_index_info structure, as is, into idxStr for later use.
*/ */
static int parquetBestIndex( static int parquetBestIndex(sqlite3_vtab *tab, sqlite3_index_info *pIdxInfo) {
sqlite3_vtab *tab,
sqlite3_index_info *pIdxInfo
){
try { try {
#ifdef DEBUG #ifdef DEBUG
@ -644,18 +618,19 @@ static int parquetBestIndex(
(unsigned long long)(tv.tv_sec) * 1000 + (unsigned long long)(tv.tv_sec) * 1000 +
(unsigned long long)(tv.tv_usec) / 1000; (unsigned long long)(tv.tv_usec) / 1000;
ParquetTable *table = ((sqlite3_vtab_parquet *)tab)->table;
ParquetTable* table = ((sqlite3_vtab_parquet*)tab)->table; printf("%llu xBestIndex: nConstraint=%d, nOrderBy=%d\n",
printf("%llu xBestIndex: nConstraint=%d, nOrderBy=%d\n", millisecondsSinceEpoch, pIdxInfo->nConstraint, pIdxInfo->nOrderBy); millisecondsSinceEpoch, pIdxInfo->nConstraint, pIdxInfo->nOrderBy);
debugConstraints(pIdxInfo, table, 0, NULL); debugConstraints(pIdxInfo, table, 0, NULL);
#endif #endif
// We traverse in rowid ascending order, so if they're asking for it to be ordered like that, // We traverse in rowid ascending order, so if they're asking for it to be
// we can tell SQLite that it's guaranteed. This speeds up some DB viewer utilities that // ordered like that, we can tell SQLite that it's guaranteed. This speeds
// use rowids for pagination. // up some DB viewer utilities that use rowids for pagination.
if(pIdxInfo->nOrderBy == 1 && pIdxInfo->aOrderBy[0].iColumn == -1 && pIdxInfo->aOrderBy[0].desc == 0) if (pIdxInfo->nOrderBy == 1 && pIdxInfo->aOrderBy[0].iColumn == -1 &&
pIdxInfo->aOrderBy[0].desc == 0)
pIdxInfo->orderByConsumed = 1; pIdxInfo->orderByConsumed = 1;
if(pIdxInfo->nConstraint == 0) { if (pIdxInfo->nConstraint == 0) {
pIdxInfo->estimatedCost = 1000000000000; pIdxInfo->estimatedCost = 1000000000000;
pIdxInfo->idxNum = 0; pIdxInfo->idxNum = 0;
} else { } else {
@ -663,61 +638,69 @@ static int parquetBestIndex(
pIdxInfo->idxNum = 1; pIdxInfo->idxNum = 1;
int j = 0; int j = 0;
for(int i = 0; i < pIdxInfo->nConstraint; i++) { for (int i = 0; i < pIdxInfo->nConstraint; i++) {
if(pIdxInfo->aConstraint[i].usable) { if (pIdxInfo->aConstraint[i].usable) {
j++; j++;
pIdxInfo->aConstraintUsage[i].argvIndex = j; pIdxInfo->aConstraintUsage[i].argvIndex = j;
// pIdxInfo->aConstraintUsage[i].omit = 1; // pIdxInfo->aConstraintUsage[i].omit = 1;
} }
} }
} }
size_t dupeSize = sizeof(sqlite3_index_info) + size_t dupeSize =
//pIdxInfo->nConstraint * sizeof(sqlite3_index_constraint) + sizeof(sqlite3_index_info) +
pIdxInfo->nConstraint * sizeof(sqlite3_index_info::sqlite3_index_constraint) + // pIdxInfo->nConstraint * sizeof(sqlite3_index_constraint) +
pIdxInfo->nConstraint *
sizeof(sqlite3_index_info::sqlite3_index_constraint) +
pIdxInfo->nOrderBy * sizeof(sqlite3_index_info::sqlite3_index_orderby) + pIdxInfo->nOrderBy * sizeof(sqlite3_index_info::sqlite3_index_orderby) +
pIdxInfo->nConstraint * sizeof(sqlite3_index_info::sqlite3_index_constraint_usage); pIdxInfo->nConstraint *
sqlite3_index_info* dupe = (sqlite3_index_info*)sqlite3_malloc(dupeSize); sizeof(sqlite3_index_info::sqlite3_index_constraint_usage);
pIdxInfo->idxStr = (char*)dupe; sqlite3_index_info *dupe = (sqlite3_index_info *)sqlite3_malloc(dupeSize);
pIdxInfo->idxStr = (char *)dupe;
pIdxInfo->needToFreeIdxStr = 1; pIdxInfo->needToFreeIdxStr = 1;
memset(dupe, 0, dupeSize); memset(dupe, 0, dupeSize);
memcpy(dupe, pIdxInfo, sizeof(sqlite3_index_info)); memcpy(dupe, pIdxInfo, sizeof(sqlite3_index_info));
dupe->aConstraint = (sqlite3_index_info::sqlite3_index_constraint*)((char*)dupe + sizeof(sqlite3_index_info)); dupe->aConstraint = (sqlite3_index_info::sqlite3_index_constraint
dupe->aOrderBy = (sqlite3_index_info::sqlite3_index_orderby*)((char*)dupe + *)((char *)dupe + sizeof(sqlite3_index_info));
sizeof(sqlite3_index_info) + dupe->aOrderBy =
pIdxInfo->nConstraint * sizeof(sqlite3_index_info::sqlite3_index_constraint)); (sqlite3_index_info::sqlite3_index_orderby
dupe->aConstraintUsage = (sqlite3_index_info::sqlite3_index_constraint_usage*)((char*)dupe + *)((char *)dupe + sizeof(sqlite3_index_info) +
sizeof(sqlite3_index_info) + pIdxInfo->nConstraint *
pIdxInfo->nConstraint * sizeof(sqlite3_index_info::sqlite3_index_constraint) + sizeof(sqlite3_index_info::sqlite3_index_constraint));
pIdxInfo->nOrderBy * sizeof(sqlite3_index_info::sqlite3_index_orderby)); dupe->aConstraintUsage =
(sqlite3_index_info::sqlite3_index_constraint_usage
*)((char *)dupe + sizeof(sqlite3_index_info) +
pIdxInfo->nConstraint *
sizeof(sqlite3_index_info::sqlite3_index_constraint) +
pIdxInfo->nOrderBy *
sizeof(sqlite3_index_info::sqlite3_index_orderby));
for (int i = 0; i < pIdxInfo->nConstraint; i++) {
for(int i = 0; i < pIdxInfo->nConstraint; i++) {
dupe->aConstraint[i].iColumn = pIdxInfo->aConstraint[i].iColumn; dupe->aConstraint[i].iColumn = pIdxInfo->aConstraint[i].iColumn;
dupe->aConstraint[i].op = pIdxInfo->aConstraint[i].op; dupe->aConstraint[i].op = pIdxInfo->aConstraint[i].op;
dupe->aConstraint[i].usable = pIdxInfo->aConstraint[i].usable; dupe->aConstraint[i].usable = pIdxInfo->aConstraint[i].usable;
dupe->aConstraint[i].iTermOffset = pIdxInfo->aConstraint[i].iTermOffset; dupe->aConstraint[i].iTermOffset = pIdxInfo->aConstraint[i].iTermOffset;
dupe->aConstraintUsage[i].argvIndex = pIdxInfo->aConstraintUsage[i].argvIndex; dupe->aConstraintUsage[i].argvIndex =
pIdxInfo->aConstraintUsage[i].argvIndex;
dupe->aConstraintUsage[i].omit = pIdxInfo->aConstraintUsage[i].omit; dupe->aConstraintUsage[i].omit = pIdxInfo->aConstraintUsage[i].omit;
} }
for(int i = 0; i < pIdxInfo->nOrderBy; i++) { for (int i = 0; i < pIdxInfo->nOrderBy; i++) {
dupe->aOrderBy[i].iColumn = pIdxInfo->aOrderBy[i].iColumn; dupe->aOrderBy[i].iColumn = pIdxInfo->aOrderBy[i].iColumn;
dupe->aOrderBy[i].desc = pIdxInfo->aOrderBy[i].desc; dupe->aOrderBy[i].desc = pIdxInfo->aOrderBy[i].desc;
} }
return SQLITE_OK; return SQLITE_OK;
} catch(std::bad_alloc& ba) { } catch (std::bad_alloc &ba) {
return SQLITE_NOMEM; return SQLITE_NOMEM;
} catch(std::exception& e) { } catch (std::exception &e) {
return SQLITE_ERROR; return SQLITE_ERROR;
} }
} }
static sqlite3_module ParquetModule = { static sqlite3_module ParquetModule = {
0, /* iVersion */ 0, /* iVersion */
parquetCreate, /* xCreate */ parquetCreate, /* xCreate */
@ -742,19 +725,16 @@ static sqlite3_module ParquetModule = {
}; };
/*
 * This routine is called when the extension is loaded. The new
 * Parquet virtual table module is registered with the calling database
 * connection.
 */
extern "C" { extern "C" {
int sqlite3_parquet_init( int sqlite3_parquet_init(sqlite3 *db, char **pzErrMsg,
sqlite3 *db, const sqlite3_api_routines *pApi) {
char **pzErrMsg,
const sqlite3_api_routines *pApi
){
int rc; int rc;
SQLITE_EXTENSION_INIT2(pApi); SQLITE_EXTENSION_INIT2(pApi);
rc = sqlite3_create_module(db, "parquet", &ParquetModule, 0); rc = sqlite3_create_module(db, "parquet", &ParquetModule, 0);
return rc; return rc;
} }
} }

File diff suppressed because it is too large Load Diff

View File

@ -1,13 +1,13 @@
#ifndef PARQUET_CURSOR_H #ifndef PARQUET_CURSOR_H
#define PARQUET_CURSOR_H #define PARQUET_CURSOR_H
#include "parquet/api/reader.h"
#include "parquet_filter.h" #include "parquet_filter.h"
#include "parquet_table.h" #include "parquet_table.h"
#include "parquet/api/reader.h"
class ParquetCursor { class ParquetCursor {
ParquetTable* table; ParquetTable *table;
std::unique_ptr<parquet::ParquetFileReader> reader; std::unique_ptr<parquet::ParquetFileReader> reader;
std::unique_ptr<parquet::RowGroupMetaData> rowGroupMetadata; std::unique_ptr<parquet::RowGroupMetaData> rowGroupMetadata;
std::shared_ptr<parquet::RowGroupReader> rowGroup; std::shared_ptr<parquet::RowGroupReader> rowGroup;
@ -35,19 +35,26 @@ class ParquetCursor {
bool currentRowSatisfiesFilter(); bool currentRowSatisfiesFilter();
bool currentRowGroupSatisfiesFilter(); bool currentRowGroupSatisfiesFilter();
bool currentRowGroupSatisfiesRowIdFilter(Constraint& constraint); bool currentRowGroupSatisfiesRowIdFilter(Constraint &constraint);
bool currentRowGroupSatisfiesTextFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats); bool currentRowGroupSatisfiesTextFilter(
bool currentRowGroupSatisfiesBlobFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats); Constraint &constraint,
bool currentRowGroupSatisfiesIntegerFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats); std::shared_ptr<parquet::RowGroupStatistics> stats);
bool currentRowGroupSatisfiesDoubleFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats); bool currentRowGroupSatisfiesBlobFilter(
Constraint &constraint,
bool currentRowSatisfiesTextFilter(Constraint& constraint); std::shared_ptr<parquet::RowGroupStatistics> stats);
bool currentRowSatisfiesIntegerFilter(Constraint& constraint); bool currentRowGroupSatisfiesIntegerFilter(
bool currentRowSatisfiesDoubleFilter(Constraint& constraint); Constraint &constraint,
std::shared_ptr<parquet::RowGroupStatistics> stats);
bool currentRowGroupSatisfiesDoubleFilter(
Constraint &constraint,
std::shared_ptr<parquet::RowGroupStatistics> stats);
bool currentRowSatisfiesTextFilter(Constraint &constraint);
bool currentRowSatisfiesIntegerFilter(Constraint &constraint);
bool currentRowSatisfiesDoubleFilter(Constraint &constraint);
public: public:
ParquetCursor(ParquetTable* table); ParquetCursor(ParquetTable *table);
int getRowId(); int getRowId();
void next(); void next();
void close(); void close();
@ -58,16 +65,15 @@ public:
bool isNull(int col); bool isNull(int col);
unsigned int getNumRowGroups() const; unsigned int getNumRowGroups() const;
unsigned int getNumConstraints() const; unsigned int getNumConstraints() const;
const Constraint& getConstraint(unsigned int i) const; const Constraint &getConstraint(unsigned int i) const;
parquet::Type::type getPhysicalType(int col); parquet::Type::type getPhysicalType(int col);
parquet::LogicalType::type getLogicalType(int col); parquet::LogicalType::type getLogicalType(int col);
ParquetTable* getTable() const; ParquetTable *getTable() const;
int getInt32(int col); int getInt32(int col);
long getInt64(int col); long getInt64(int col);
double getDouble(int col); double getDouble(int col);
parquet::ByteArray* getByteArray(int col); parquet::ByteArray *getByteArray(int col);
}; };
#endif #endif

View File

@ -1,40 +1,29 @@
#include "parquet_filter.h" #include "parquet_filter.h"
Constraint::Constraint( Constraint::Constraint(RowGroupBitmap bitmap, int column,
RowGroupBitmap bitmap, std::string columnName, ConstraintOperator op,
int column, ValueType type, int64_t intValue, double doubleValue,
std::string columnName, std::vector<unsigned char> blobValue)
ConstraintOperator op, : bitmap(bitmap), column(column), columnName(columnName), op(op),
ValueType type, type(type), intValue(intValue), doubleValue(doubleValue),
int64_t intValue, blobValue(blobValue), hadRows(false) {
double doubleValue,
std::vector<unsigned char> blobValue
): bitmap(bitmap),
column(column),
columnName(columnName),
op(op),
type(type),
intValue(intValue),
doubleValue(doubleValue),
blobValue(blobValue),
hadRows(false) {
RowGroupBitmap bm = bitmap; RowGroupBitmap bm = bitmap;
this->bitmap = bm; this->bitmap = bm;
if(type == Text) { if (type == Text) {
stringValue = std::string((char*)&blobValue[0], blobValue.size()); stringValue = std::string((char *)&blobValue[0], blobValue.size());
if(op == Like) { if (op == Like) {
// This permits more rowgroups than is strictly needed // This permits more rowgroups than is strictly needed
// since it assumes an implicit wildcard. But it's // since it assumes an implicit wildcard. But it's
// simple to implement, so we'll go with it. // simple to implement, so we'll go with it.
likeStringValue = stringValue; likeStringValue = stringValue;
size_t idx = likeStringValue.find_first_of("%"); size_t idx = likeStringValue.find_first_of("%");
if(idx != std::string::npos) { if (idx != std::string::npos) {
likeStringValue = likeStringValue.substr(0, idx); likeStringValue = likeStringValue.substr(0, idx);
} }
idx = likeStringValue.find_first_of("_"); idx = likeStringValue.find_first_of("_");
if(idx != std::string::npos) { if (idx != std::string::npos) {
likeStringValue = likeStringValue.substr(0, idx); likeStringValue = likeStringValue.substr(0, idx);
} }
} }
@ -45,7 +34,7 @@ std::string Constraint::describe() const {
std::string rv; std::string rv;
rv.append(columnName); rv.append(columnName);
rv.append(" "); rv.append(" ");
switch(op) { switch (op) {
case Equal: case Equal:
rv.append("="); rv.append("=");
break; break;
@ -85,7 +74,7 @@ std::string Constraint::describe() const {
} }
rv.append(" "); rv.append(" ");
switch(type) { switch (type) {
case Null: case Null:
rv.append("NULL"); rv.append("NULL");
break; break;

View File

@ -1,9 +1,9 @@
#ifndef PARQUET_FILTER_H #ifndef PARQUET_FILTER_H
#define PARQUET_FILTER_H #define PARQUET_FILTER_H
#include <vector>
#include <string>
#include <cstdint> #include <cstdint>
#include <string>
#include <vector>
enum ConstraintOperator { enum ConstraintOperator {
Equal, Equal,
@ -20,43 +20,36 @@ enum ConstraintOperator {
Is Is
}; };
// Kind of value a Constraint compares against. Enumerator order (and thus
// the implicit numeric values 0..4) is unchanged.
enum ValueType {
  Null,    // no value; Constraint::describe() renders it as "NULL"
  Integer, // presumably carried in Constraint::intValue -- confirm
  Double,  // presumably carried in Constraint::doubleValue -- confirm
  Blob,    // raw bytes in Constraint::blobValue
  Text     // blobValue bytes reinterpreted as a string by the constructor
};
class RowGroupBitmap { class RowGroupBitmap {
// Sets (isSet == true) or clears (isSet == false) the bit for `rowGroup`
// in `membership`, which packs eight row groups per byte: row group r lives
// in byte r / 8, bit r % 8 (least-significant bit first).
//
// Throws std::out_of_range if `rowGroup` lies beyond the bytes present in
// `membership`; the vector is never resized here.
void setBit(std::vector<unsigned char> &membership, unsigned int rowGroup,
            bool isSet) {
  // Keep the arithmetic unsigned: the index is derived from an unsigned
  // row-group number and must not pass through a signed int.
  const unsigned int byteIndex = rowGroup / 8;
  const unsigned char mask =
      static_cast<unsigned char>(1u << (rowGroup % 8));

  // at() instead of operator[] so a bad row-group number is reported rather
  // than silently writing out of bounds.
  unsigned char &cell = membership.at(byteIndex);
  if (isSet) {
    cell = static_cast<unsigned char>(cell | mask);
  } else {
    cell = static_cast<unsigned char>(cell & ~mask);
  }
}
// Compares estimated rowGroupFilter results against observed results // Compares estimated rowGroupFilter results against observed results
// when we explored the row group. This lets us cache // when we explored the row group. This lets us cache
public: public:
RowGroupBitmap(unsigned int totalRowGroups) { RowGroupBitmap(unsigned int totalRowGroups) {
// Initialize everything to assume that all row groups match. // Initialize everything to assume that all row groups match.
// As we discover otherwise, we'll update that assumption. // As we discover otherwise, we'll update that assumption.
for(unsigned int i = 0; i < (totalRowGroups + 7) / 8; i++) { for (unsigned int i = 0; i < (totalRowGroups + 7) / 8; i++) {
estimatedMembership.push_back(0xFF); estimatedMembership.push_back(0xFF);
actualMembership.push_back(0xFF); actualMembership.push_back(0xFF);
} }
} }
RowGroupBitmap( RowGroupBitmap(std::vector<unsigned char> estimatedMembership,
std::vector<unsigned char> estimatedMembership, std::vector<unsigned char> actualMembership)
std::vector<unsigned char> actualMembership) : : estimatedMembership(estimatedMembership),
estimatedMembership(estimatedMembership), actualMembership(actualMembership) {}
actualMembership(actualMembership) {
}
std::vector<unsigned char> estimatedMembership; std::vector<unsigned char> estimatedMembership;
std::vector<unsigned char> actualMembership; std::vector<unsigned char> actualMembership;
@ -80,17 +73,11 @@ public:
class Constraint { class Constraint {
public: public:
// Kind of a messy constructor function, but it's just for internal use, so whatever. // Kind of a messy constructor function, but it's just for internal use, so
Constraint( // whatever.
RowGroupBitmap bitmap, Constraint(RowGroupBitmap bitmap, int column, std::string columnName,
int column, ConstraintOperator op, ValueType type, int64_t intValue,
std::string columnName, double doubleValue, std::vector<unsigned char> blobValue);
ConstraintOperator op,
ValueType type,
int64_t intValue,
double doubleValue,
std::vector<unsigned char> blobValue
);
RowGroupBitmap bitmap; RowGroupBitmap bitmap;
int column; // underlying column in the query int column; // underlying column in the query

View File

@ -2,61 +2,61 @@
#include "parquet/api/reader.h" #include "parquet/api/reader.h"
// Records the file path and table name, then opens the Parquet file once to
// capture its metadata in `metadata`. The reader itself is not kept; it is
// released as soon as the metadata handle has been taken.
ParquetTable::ParquetTable(std::string file, std::string tableName)
    : file(file), tableName(tableName) {
  metadata = parquet::ParquetFileReader::OpenFile(file.data())->metadata();
}
std::string ParquetTable::columnName(int i) { std::string ParquetTable::columnName(int i) {
if(i == -1) if (i == -1)
return "rowid"; return "rowid";
return columnNames[i]; return columnNames[i];
} }
// Number of entries currently held in `columnNames`, narrowed to
// unsigned int.
unsigned int ParquetTable::getNumColumns() {
  const unsigned int count = columnNames.size();
  return count;
}
std::string ParquetTable::CreateStatement() { std::string ParquetTable::CreateStatement() {
std::unique_ptr<parquet::ParquetFileReader> reader = parquet::ParquetFileReader::OpenFile( std::unique_ptr<parquet::ParquetFileReader> reader =
file.data(), parquet::ParquetFileReader::OpenFile(
true, file.data(), true, parquet::default_reader_properties(), metadata);
parquet::default_reader_properties(),
metadata);
std::string text("CREATE TABLE x("); std::string text("CREATE TABLE x(");
auto schema = reader->metadata()->schema(); auto schema = reader->metadata()->schema();
for(auto i = 0; i < schema->num_columns(); i++) { for (auto i = 0; i < schema->num_columns(); i++) {
auto _col = schema->GetColumnRoot(i); auto _col = schema->GetColumnRoot(i);
columnNames.push_back(_col->name()); columnNames.push_back(_col->name());
} }
for(auto i = 0; i < schema->num_columns(); i++) { for (auto i = 0; i < schema->num_columns(); i++) {
auto _col = schema->GetColumnRoot(i); auto _col = schema->GetColumnRoot(i);
if(!_col->is_primitive()) { if (!_col->is_primitive()) {
std::ostringstream ss; std::ostringstream ss;
ss << __FILE__ << ":" << __LINE__ << ": column " << i << " has non-primitive type"; ss << __FILE__ << ":" << __LINE__ << ": column " << i
<< " has non-primitive type";
throw std::invalid_argument(ss.str()); throw std::invalid_argument(ss.str());
} }
if(_col->is_repeated()) { if (_col->is_repeated()) {
std::ostringstream ss; std::ostringstream ss;
ss << __FILE__ << ":" << __LINE__ << ": column " << i << " has non-scalar type"; ss << __FILE__ << ":" << __LINE__ << ": column " << i
<< " has non-scalar type";
throw std::invalid_argument(ss.str()); throw std::invalid_argument(ss.str());
} }
parquet::schema::PrimitiveNode* col = (parquet::schema::PrimitiveNode*)_col; parquet::schema::PrimitiveNode *col =
(parquet::schema::PrimitiveNode *)_col;
if(i > 0) if (i > 0)
text += ", "; text += ", ";
text += "\""; text += "\"";
// Horrifically inefficient, but easy to understand. // Horrifically inefficient, but easy to understand.
std::string colName = col->name(); std::string colName = col->name();
for(char& c : colName) { for (char &c : colName) {
if(c == '"') if (c == '"')
text += "\"\""; text += "\"\"";
else else
text += c; text += c;
@ -71,7 +71,7 @@ std::string ParquetTable::CreateStatement() {
// whose unsigned ints start getting interpreted as signed. (We could // whose unsigned ints start getting interpreted as signed. (We could
// support this for UINT_8/16/32 -- and for UINT_64 we could throw if // support this for UINT_8/16/32 -- and for UINT_64 we could throw if
// the high bit was set.) // the high bit was set.)
if(logical == parquet::LogicalType::NONE || if (logical == parquet::LogicalType::NONE ||
logical == parquet::LogicalType::UTF8 || logical == parquet::LogicalType::UTF8 ||
logical == parquet::LogicalType::DATE || logical == parquet::LogicalType::DATE ||
logical == parquet::LogicalType::TIME_MILLIS || logical == parquet::LogicalType::TIME_MILLIS ||
@ -82,17 +82,17 @@ std::string ParquetTable::CreateStatement() {
logical == parquet::LogicalType::INT_16 || logical == parquet::LogicalType::INT_16 ||
logical == parquet::LogicalType::INT_32 || logical == parquet::LogicalType::INT_32 ||
logical == parquet::LogicalType::INT_64) { logical == parquet::LogicalType::INT_64) {
switch(physical) { switch (physical) {
case parquet::Type::BOOLEAN: case parquet::Type::BOOLEAN:
type = "TINYINT"; type = "TINYINT";
break; break;
case parquet::Type::INT32: case parquet::Type::INT32:
if(logical == parquet::LogicalType::NONE || if (logical == parquet::LogicalType::NONE ||
logical == parquet::LogicalType::INT_32) { logical == parquet::LogicalType::INT_32) {
type = "INT"; type = "INT";
} else if(logical == parquet::LogicalType::INT_8) { } else if (logical == parquet::LogicalType::INT_8) {
type = "TINYINT"; type = "TINYINT";
} else if(logical == parquet::LogicalType::INT_16) { } else if (logical == parquet::LogicalType::INT_16) {
type = "SMALLINT"; type = "SMALLINT";
} }
break; break;
@ -109,7 +109,7 @@ std::string ParquetTable::CreateStatement() {
type = "DOUBLE"; type = "DOUBLE";
break; break;
case parquet::Type::BYTE_ARRAY: case parquet::Type::BYTE_ARRAY:
if(logical == parquet::LogicalType::UTF8) { if (logical == parquet::LogicalType::UTF8) {
type = "TEXT"; type = "TEXT";
} else { } else {
type = "BLOB"; type = "BLOB";
@ -123,33 +123,33 @@ std::string ParquetTable::CreateStatement() {
} }
} }
if(type.empty()) { if (type.empty()) {
std::ostringstream ss; std::ostringstream ss;
ss << __FILE__ << ":" << __LINE__ << ": column " << i << " has unsupported type: " << ss << __FILE__ << ":" << __LINE__ << ": column " << i
parquet::TypeToString(physical) << "/" << parquet::LogicalTypeToString(logical); << " has unsupported type: " << parquet::TypeToString(physical) << "/"
<< parquet::LogicalTypeToString(logical);
throw std::invalid_argument(ss.str()); throw std::invalid_argument(ss.str());
} }
#ifdef DEBUG #ifdef DEBUG
printf("col %d[name=%s, p=%d:%s, l=%d:%s] is %s\n", printf(
i, "col %d[name=%s, p=%d:%s, l=%d:%s] is %s\n", i, col->name().data(),
col->name().data(),
col->physical_type(), col->physical_type(),
parquet::TypeToString(col->physical_type()).data(), parquet::TypeToString(col->physical_type()).data(), col->logical_type(),
col->logical_type(), parquet::LogicalTypeToString(col->logical_type()).data(), type.data());
parquet::LogicalTypeToString(col->logical_type()).data(),
type.data());
#endif #endif
text += " "; text += " ";
text += type; text += type;
} }
text +=");"; text += ");";
return text; return text;
} }
// Returns a shared handle to the file metadata captured by the constructor.
std::shared_ptr<parquet::FileMetaData> ParquetTable::getMetadata() {
  std::shared_ptr<parquet::FileMetaData> handle = metadata;
  return handle;
}
const std::string& ParquetTable::getFile() { return file; } const std::string &ParquetTable::getFile() { return file; }
const std::string& ParquetTable::getTableName() { return tableName; } const std::string &ParquetTable::getTableName() { return tableName; }

View File

@ -1,9 +1,9 @@
#ifndef PARQUET_TABLE_H #ifndef PARQUET_TABLE_H
#define PARQUET_TABLE_H #define PARQUET_TABLE_H
#include <vector>
#include <string>
#include "parquet/api/reader.h" #include "parquet/api/reader.h"
#include <string>
#include <vector>
class ParquetTable { class ParquetTable {
std::string file; std::string file;
@ -11,15 +11,14 @@ class ParquetTable {
std::vector<std::string> columnNames; std::vector<std::string> columnNames;
std::shared_ptr<parquet::FileMetaData> metadata; std::shared_ptr<parquet::FileMetaData> metadata;
public: public:
ParquetTable(std::string file, std::string tableName); ParquetTable(std::string file, std::string tableName);
std::string CreateStatement(); std::string CreateStatement();
std::string columnName(int idx); std::string columnName(int idx);
unsigned int getNumColumns(); unsigned int getNumColumns();
std::shared_ptr<parquet::FileMetaData> getMetadata(); std::shared_ptr<parquet::FileMetaData> getMetadata();
const std::string& getFile(); const std::string &getFile();
const std::string& getTableName(); const std::string &getTableName();
}; };
#endif #endif