1
0
mirror of https://github.com/cldellow/sqlite-parquet-vtable.git synced 2025-04-03 09:39:47 +00:00

Apply mharju's patch to update to 1.5 Types

This commit is contained in:
Addie Morrison 2019-12-08 16:37:55 -06:00
parent 7bc6f91f6f
commit 342f01eda7
4 changed files with 59 additions and 75 deletions

View File

@ -280,7 +280,7 @@ parquetColumn(sqlite3_vtab_cursor *cur, /* The cursor */
} }
case parquet::Type::BYTE_ARRAY: { case parquet::Type::BYTE_ARRAY: {
parquet::ByteArray *rv = cursor->getByteArray(col); parquet::ByteArray *rv = cursor->getByteArray(col);
if (cursor->getLogicalType(col) == parquet::LogicalType::UTF8) { if (cursor->getLogicalType(col) == parquet::ConvertedType::UTF8) {
sqlite3_result_text(ctx, (const char *)rv->ptr, rv->len, sqlite3_result_text(ctx, (const char *)rv->ptr, rv->len,
SQLITE_TRANSIENT); SQLITE_TRANSIENT);
} else { } else {

View File

@ -33,8 +33,7 @@ bool ParquetCursor::currentRowGroupSatisfiesRowIdFilter(
} }
bool ParquetCursor::currentRowGroupSatisfiesBlobFilter( bool ParquetCursor::currentRowGroupSatisfiesBlobFilter(
Constraint &constraint, Constraint &constraint, std::shared_ptr<parquet::Statistics> _stats) {
std::shared_ptr<parquet::RowGroupStatistics> _stats) {
if (!_stats->HasMinMax()) { if (!_stats->HasMinMax()) {
return true; return true;
} }
@ -51,10 +50,8 @@ bool ParquetCursor::currentRowGroupSatisfiesBlobFilter(
parquet::Type::type pqType = types[constraint.column]; parquet::Type::type pqType = types[constraint.column];
if (pqType == parquet::Type::BYTE_ARRAY) { if (pqType == parquet::Type::BYTE_ARRAY) {
parquet::TypedRowGroupStatistics< parquet::TypedStatistics<parquet::ByteArrayType> *stats =
parquet::DataType<parquet::Type::BYTE_ARRAY>> *stats = (parquet::TypedStatistics<parquet::ByteArrayType> *)_stats.get();
(parquet::TypedRowGroupStatistics<
parquet::DataType<parquet::Type::BYTE_ARRAY>> *)_stats.get();
minPtr = stats->min().ptr; minPtr = stats->min().ptr;
minLen = stats->min().len; minLen = stats->min().len;
@ -128,11 +125,9 @@ bool ParquetCursor::currentRowGroupSatisfiesBlobFilter(
} }
bool ParquetCursor::currentRowGroupSatisfiesTextFilter( bool ParquetCursor::currentRowGroupSatisfiesTextFilter(
Constraint &constraint, Constraint &constraint, std::shared_ptr<parquet::Statistics> _stats) {
std::shared_ptr<parquet::RowGroupStatistics> _stats) { parquet::TypedStatistics<parquet::ByteArrayType> *stats =
parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>> (parquet::TypedStatistics<parquet::ByteArrayType> *)_stats.get();
*stats = (parquet::TypedRowGroupStatistics<
parquet::DataType<parquet::Type::BYTE_ARRAY>> *)_stats.get();
if (!stats->HasMinMax()) { if (!stats->HasMinMax()) {
return true; return true;
@ -190,8 +185,7 @@ int64_t int96toMsSinceEpoch(const parquet::Int96 &rv) {
} }
bool ParquetCursor::currentRowGroupSatisfiesIntegerFilter( bool ParquetCursor::currentRowGroupSatisfiesIntegerFilter(
Constraint &constraint, Constraint &constraint, std::shared_ptr<parquet::Statistics> _stats) {
std::shared_ptr<parquet::RowGroupStatistics> _stats) {
if (!_stats->HasMinMax()) { if (!_stats->HasMinMax()) {
return true; return true;
} }
@ -207,31 +201,27 @@ bool ParquetCursor::currentRowGroupSatisfiesIntegerFilter(
parquet::Type::type pqType = types[column]; parquet::Type::type pqType = types[column];
if (pqType == parquet::Type::INT32) { if (pqType == parquet::Type::INT32) {
parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::INT32>> parquet::TypedStatistics<parquet::Int32Type> *stats =
*stats = (parquet::TypedRowGroupStatistics< (parquet::TypedStatistics<parquet::Int32Type> *)_stats.get();
parquet::DataType<parquet::Type::INT32>> *)_stats.get();
min = stats->min(); min = stats->min();
max = stats->max(); max = stats->max();
} else if (pqType == parquet::Type::INT64) { } else if (pqType == parquet::Type::INT64) {
parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::INT64>> parquet::TypedStatistics<parquet::Int64Type> *stats =
*stats = (parquet::TypedRowGroupStatistics< (parquet::TypedStatistics<parquet::Int64Type> *)_stats.get();
parquet::DataType<parquet::Type::INT64>> *)_stats.get();
min = stats->min(); min = stats->min();
max = stats->max(); max = stats->max();
} else if (pqType == parquet::Type::INT96) { } else if (pqType == parquet::Type::INT96) {
parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::INT96>> parquet::TypedStatistics<parquet::Int96Type> *stats =
*stats = (parquet::TypedRowGroupStatistics< (parquet::TypedStatistics<parquet::Int96Type> *)_stats.get();
parquet::DataType<parquet::Type::INT96>> *)_stats.get();
min = int96toMsSinceEpoch(stats->min()); min = int96toMsSinceEpoch(stats->min());
max = int96toMsSinceEpoch(stats->max()); max = int96toMsSinceEpoch(stats->max());
} else if (pqType == parquet::Type::BOOLEAN) { } else if (pqType == parquet::Type::BOOLEAN) {
parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BOOLEAN>> parquet::TypedStatistics<parquet::BooleanType> *stats =
*stats = (parquet::TypedRowGroupStatistics< (parquet::TypedStatistics<parquet::BooleanType> *)_stats.get();
parquet::DataType<parquet::Type::BOOLEAN>> *)_stats.get();
min = stats->min(); min = stats->min();
max = stats->max(); max = stats->max();
@ -275,8 +265,7 @@ bool ParquetCursor::currentRowGroupSatisfiesIntegerFilter(
} }
bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter( bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter(
Constraint &constraint, Constraint &constraint, std::shared_ptr<parquet::Statistics> _stats) {
std::shared_ptr<parquet::RowGroupStatistics> _stats) {
if (!_stats->HasMinMax()) { if (!_stats->HasMinMax()) {
return true; return true;
} }
@ -292,16 +281,14 @@ bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter(
parquet::Type::type pqType = types[column]; parquet::Type::type pqType = types[column];
if (pqType == parquet::Type::DOUBLE) { if (pqType == parquet::Type::DOUBLE) {
parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::DOUBLE>> parquet::TypedStatistics<parquet::DoubleType> *stats =
*stats = (parquet::TypedRowGroupStatistics< (parquet::TypedStatistics<parquet::DoubleType> *)_stats.get();
parquet::DataType<parquet::Type::DOUBLE>> *)_stats.get();
min = stats->min(); min = stats->min();
max = stats->max(); max = stats->max();
} else if (pqType == parquet::Type::FLOAT) { } else if (pqType == parquet::Type::FLOAT) {
parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::FLOAT>> parquet::TypedStatistics<parquet::FloatType> *stats =
*stats = (parquet::TypedRowGroupStatistics< (parquet::TypedStatistics<parquet::FloatType> *)_stats.get();
parquet::DataType<parquet::Type::FLOAT>> *)_stats.get();
min = stats->min(); min = stats->min();
max = stats->max(); max = stats->max();
@ -521,7 +508,7 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() {
std::unique_ptr<parquet::ColumnChunkMetaData> md = std::unique_ptr<parquet::ColumnChunkMetaData> md =
rowGroupMetadata->ColumnChunk(column); rowGroupMetadata->ColumnChunk(column);
if (md->is_stats_set()) { if (md->is_stats_set()) {
std::shared_ptr<parquet::RowGroupStatistics> stats = md->statistics(); std::shared_ptr<parquet::Statistics> stats = md->statistics();
// SQLite is much looser with types than you might expect if you // SQLite is much looser with types than you might expect if you
// come from a Postgres background. The constraint '30.0' (that is, // come from a Postgres background. The constraint '30.0' (that is,
@ -540,7 +527,7 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() {
parquet::Type::type pqType = types[column]; parquet::Type::type pqType = types[column];
if (pqType == parquet::Type::BYTE_ARRAY && if (pqType == parquet::Type::BYTE_ARRAY &&
logicalTypes[column] == parquet::LogicalType::UTF8) { logicalTypes[column] == parquet::ConvertedType::UTF8) {
rv = currentRowGroupSatisfiesTextFilter(constraints[i], stats); rv = currentRowGroupSatisfiesTextFilter(constraints[i], stats);
} else if (pqType == parquet::Type::BYTE_ARRAY) { } else if (pqType == parquet::Type::BYTE_ARRAY) {
rv = currentRowGroupSatisfiesBlobFilter(constraints[i], stats); rv = currentRowGroupSatisfiesBlobFilter(constraints[i], stats);
@ -605,13 +592,13 @@ start:
while (logicalTypes.size() < (unsigned int)rowGroupMetadata->num_columns()) { while (logicalTypes.size() < (unsigned int)rowGroupMetadata->num_columns()) {
logicalTypes.push_back( logicalTypes.push_back(
rowGroupMetadata->schema()->Column(0)->logical_type()); rowGroupMetadata->schema()->Column(0)->converted_type());
} }
for (unsigned int i = 0; i < (unsigned int)rowGroupMetadata->num_columns(); for (unsigned int i = 0; i < (unsigned int)rowGroupMetadata->num_columns();
i++) { i++) {
types[i] = rowGroupMetadata->schema()->Column(i)->physical_type(); types[i] = rowGroupMetadata->schema()->Column(i)->physical_type();
logicalTypes[i] = rowGroupMetadata->schema()->Column(i)->logical_type(); logicalTypes[i] = rowGroupMetadata->schema()->Column(i)->converted_type();
} }
for (unsigned int i = 0; i < colRows.size(); i++) { for (unsigned int i = 0; i < colRows.size(); i++) {
@ -662,7 +649,7 @@ bool ParquetCursor::currentRowSatisfiesFilter() {
rv = !isNull(column); rv = !isNull(column);
} else { } else {
if (logicalTypes[column] == parquet::LogicalType::UTF8) { if (logicalTypes[column] == parquet::ConvertedType::UTF8) {
rv = currentRowSatisfiesTextFilter(constraints[i]); rv = currentRowSatisfiesTextFilter(constraints[i]);
} else { } else {
parquet::Type::type pqType = types[column]; parquet::Type::type pqType = types[column];
@ -765,7 +752,7 @@ void ParquetCursor::ensureColumn(int col) {
} }
case parquet::Type::INT64: { case parquet::Type::INT64: {
parquet::Int64Scanner *s = (parquet::Int64Scanner *)scanners[col].get(); parquet::Int64Scanner *s = (parquet::Int64Scanner *)scanners[col].get();
long rv = 0; long long rv = 0;
s->NextValue(&rv, &wasNull); s->NextValue(&rv, &wasNull);
break; break;
} }
@ -843,7 +830,7 @@ void ParquetCursor::ensureColumn(int col) {
} }
case parquet::Type::INT64: { case parquet::Type::INT64: {
parquet::Int64Scanner *s = (parquet::Int64Scanner *)scanners[col].get(); parquet::Int64Scanner *s = (parquet::Int64Scanner *)scanners[col].get();
long rv = 0; long long rv = 0;
hadValue = s->NextValue(&rv, &wasNull); hadValue = s->NextValue(&rv, &wasNull);
colIntValues[col] = rv; colIntValues[col] = rv;
break; break;
@ -906,7 +893,7 @@ parquet::Type::type ParquetCursor::getPhysicalType(int col) {
return types[col]; return types[col];
} }
parquet::LogicalType::type ParquetCursor::getLogicalType(int col) { parquet::ConvertedType::type ParquetCursor::getLogicalType(int col) {
return logicalTypes[col]; return logicalTypes[col];
} }

View File

@ -13,7 +13,7 @@ class ParquetCursor {
std::shared_ptr<parquet::RowGroupReader> rowGroup; std::shared_ptr<parquet::RowGroupReader> rowGroup;
std::vector<std::shared_ptr<parquet::Scanner>> scanners; std::vector<std::shared_ptr<parquet::Scanner>> scanners;
std::vector<parquet::Type::type> types; std::vector<parquet::Type::type> types;
std::vector<parquet::LogicalType::type> logicalTypes; std::vector<parquet::ConvertedType::type> logicalTypes;
std::vector<int> colRows; std::vector<int> colRows;
std::vector<bool> colNulls; std::vector<bool> colNulls;
@ -37,17 +37,13 @@ class ParquetCursor {
bool currentRowGroupSatisfiesFilter(); bool currentRowGroupSatisfiesFilter();
bool currentRowGroupSatisfiesRowIdFilter(Constraint &constraint); bool currentRowGroupSatisfiesRowIdFilter(Constraint &constraint);
bool currentRowGroupSatisfiesTextFilter( bool currentRowGroupSatisfiesTextFilter(
Constraint &constraint, Constraint &constraint, std::shared_ptr<parquet::Statistics> stats);
std::shared_ptr<parquet::RowGroupStatistics> stats);
bool currentRowGroupSatisfiesBlobFilter( bool currentRowGroupSatisfiesBlobFilter(
Constraint &constraint, Constraint &constraint, std::shared_ptr<parquet::Statistics> stats);
std::shared_ptr<parquet::RowGroupStatistics> stats);
bool currentRowGroupSatisfiesIntegerFilter( bool currentRowGroupSatisfiesIntegerFilter(
Constraint &constraint, Constraint &constraint, std::shared_ptr<parquet::Statistics> stats);
std::shared_ptr<parquet::RowGroupStatistics> stats);
bool currentRowGroupSatisfiesDoubleFilter( bool currentRowGroupSatisfiesDoubleFilter(
Constraint &constraint, Constraint &constraint, std::shared_ptr<parquet::Statistics> stats);
std::shared_ptr<parquet::RowGroupStatistics> stats);
bool currentRowSatisfiesTextFilter(Constraint &constraint); bool currentRowSatisfiesTextFilter(Constraint &constraint);
bool currentRowSatisfiesIntegerFilter(Constraint &constraint); bool currentRowSatisfiesIntegerFilter(Constraint &constraint);
@ -67,7 +63,7 @@ public:
unsigned int getNumConstraints() const; unsigned int getNumConstraints() const;
const Constraint &getConstraint(unsigned int i) const; const Constraint &getConstraint(unsigned int i) const;
parquet::Type::type getPhysicalType(int col); parquet::Type::type getPhysicalType(int col);
parquet::LogicalType::type getLogicalType(int col); parquet::ConvertedType::type getLogicalType(int col);
ParquetTable *getTable() const; ParquetTable *getTable() const;
int getInt32(int col); int getInt32(int col);

View File

@ -66,33 +66,33 @@ std::string ParquetTable::CreateStatement() {
std::string type; std::string type;
parquet::Type::type physical = col->physical_type(); parquet::Type::type physical = col->physical_type();
parquet::LogicalType::type logical = col->logical_type(); parquet::ConvertedType::type logical = col->converted_type();
// Be explicit about which types we understand so we don't mislead someone // Be explicit about which types we understand so we don't mislead someone
// whose unsigned ints start getting interpreted as signed. (We could // whose unsigned ints start getting interpreted as signed. (We could
// support this for UINT_8/16/32 -- and for UINT_64 we could throw if // support this for UINT_8/16/32 -- and for UINT_64 we could throw if
// the high bit was set.) // the high bit was set.)
if (logical == parquet::LogicalType::NONE || if (logical == parquet::ConvertedType::NONE ||
logical == parquet::LogicalType::UTF8 || logical == parquet::ConvertedType::UTF8 ||
logical == parquet::LogicalType::DATE || logical == parquet::ConvertedType::DATE ||
logical == parquet::LogicalType::TIME_MILLIS || logical == parquet::ConvertedType::TIME_MILLIS ||
logical == parquet::LogicalType::TIMESTAMP_MILLIS || logical == parquet::ConvertedType::TIMESTAMP_MILLIS ||
logical == parquet::LogicalType::TIME_MICROS || logical == parquet::ConvertedType::TIME_MICROS ||
logical == parquet::LogicalType::TIMESTAMP_MICROS || logical == parquet::ConvertedType::TIMESTAMP_MICROS ||
logical == parquet::LogicalType::INT_8 || logical == parquet::ConvertedType::INT_8 ||
logical == parquet::LogicalType::INT_16 || logical == parquet::ConvertedType::INT_16 ||
logical == parquet::LogicalType::INT_32 || logical == parquet::ConvertedType::INT_32 ||
logical == parquet::LogicalType::INT_64) { logical == parquet::ConvertedType::INT_64) {
switch (physical) { switch (physical) {
case parquet::Type::BOOLEAN: case parquet::Type::BOOLEAN:
type = "TINYINT"; type = "TINYINT";
break; break;
case parquet::Type::INT32: case parquet::Type::INT32:
if (logical == parquet::LogicalType::NONE || if (logical == parquet::ConvertedType::NONE ||
logical == parquet::LogicalType::INT_32) { logical == parquet::ConvertedType::INT_32) {
type = "INT"; type = "INT";
} else if (logical == parquet::LogicalType::INT_8) { } else if (logical == parquet::ConvertedType::INT_8) {
type = "TINYINT"; type = "TINYINT";
} else if (logical == parquet::LogicalType::INT_16) { } else if (logical == parquet::ConvertedType::INT_16) {
type = "SMALLINT"; type = "SMALLINT";
} }
break; break;
@ -109,7 +109,7 @@ std::string ParquetTable::CreateStatement() {
type = "DOUBLE"; type = "DOUBLE";
break; break;
case parquet::Type::BYTE_ARRAY: case parquet::Type::BYTE_ARRAY:
if (logical == parquet::LogicalType::UTF8) { if (logical == parquet::ConvertedType::UTF8) {
type = "TEXT"; type = "TEXT";
} else { } else {
type = "BLOB"; type = "BLOB";
@ -127,17 +127,18 @@ std::string ParquetTable::CreateStatement() {
std::ostringstream ss; std::ostringstream ss;
ss << __FILE__ << ":" << __LINE__ << ": column " << i ss << __FILE__ << ":" << __LINE__ << ": column " << i
<< " has unsupported type: " << parquet::TypeToString(physical) << "/" << " has unsupported type: " << parquet::TypeToString(physical) << "/"
<< parquet::LogicalTypeToString(logical); << parquet::ConvertedTypeToString(logical);
throw std::invalid_argument(ss.str()); throw std::invalid_argument(ss.str());
} }
#ifdef DEBUG #ifdef DEBUG
printf( printf("col %d[name=%s, p=%d:%s, l=%d:%s] is %s\n", i, col->name().data(),
"col %d[name=%s, p=%d:%s, l=%d:%s] is %s\n", i, col->name().data(), col->physical_type(),
col->physical_type(), parquet::TypeToString(col->physical_type()).data(),
parquet::TypeToString(col->physical_type()).data(), col->logical_type(), col->logical_type(),
parquet::LogicalTypeToString(col->logical_type()).data(), type.data()); parquet::ConvertedTypeToString(col->logical_type()).data(),
type.data());
#endif #endif
text += " "; text += " ";