From 342f01eda769e962cd155b3da234bc5b14f91a65 Mon Sep 17 00:00:00 2001 From: Addie Morrison Date: Sun, 8 Dec 2019 16:37:55 -0600 Subject: [PATCH] Apply mharju's patch to update to 1.5 Types --- src/parquet.cc | 2 +- src/parquet_cursor.cc | 69 ++++++++++++++++++------------------------- src/parquet_cursor.h | 16 ++++------ src/parquet_table.cc | 47 ++++++++++++++--------------- 4 files changed, 59 insertions(+), 75 deletions(-) diff --git a/src/parquet.cc b/src/parquet.cc index 0443b2e..642bb31 100644 --- a/src/parquet.cc +++ b/src/parquet.cc @@ -280,7 +280,7 @@ parquetColumn(sqlite3_vtab_cursor *cur, /* The cursor */ } case parquet::Type::BYTE_ARRAY: { parquet::ByteArray *rv = cursor->getByteArray(col); - if (cursor->getLogicalType(col) == parquet::LogicalType::UTF8) { + if (cursor->getLogicalType(col) == parquet::ConvertedType::UTF8) { sqlite3_result_text(ctx, (const char *)rv->ptr, rv->len, SQLITE_TRANSIENT); } else { diff --git a/src/parquet_cursor.cc b/src/parquet_cursor.cc index ee5452a..efeeafe 100644 --- a/src/parquet_cursor.cc +++ b/src/parquet_cursor.cc @@ -33,8 +33,7 @@ bool ParquetCursor::currentRowGroupSatisfiesRowIdFilter( } bool ParquetCursor::currentRowGroupSatisfiesBlobFilter( - Constraint &constraint, - std::shared_ptr _stats) { + Constraint &constraint, std::shared_ptr _stats) { if (!_stats->HasMinMax()) { return true; } @@ -51,10 +50,8 @@ bool ParquetCursor::currentRowGroupSatisfiesBlobFilter( parquet::Type::type pqType = types[constraint.column]; if (pqType == parquet::Type::BYTE_ARRAY) { - parquet::TypedRowGroupStatistics< - parquet::DataType> *stats = - (parquet::TypedRowGroupStatistics< - parquet::DataType> *)_stats.get(); + parquet::TypedStatistics *stats = + (parquet::TypedStatistics *)_stats.get(); minPtr = stats->min().ptr; minLen = stats->min().len; @@ -128,11 +125,9 @@ bool ParquetCursor::currentRowGroupSatisfiesBlobFilter( } bool ParquetCursor::currentRowGroupSatisfiesTextFilter( - Constraint &constraint, - std::shared_ptr _stats) { - parquet::TypedRowGroupStatistics> - *stats = (parquet::TypedRowGroupStatistics< - parquet::DataType> *)_stats.get(); + Constraint &constraint, std::shared_ptr _stats) { + parquet::TypedStatistics *stats = + (parquet::TypedStatistics *)_stats.get(); if (!stats->HasMinMax()) { return true; @@ -190,8 +185,7 @@ int64_t int96toMsSinceEpoch(const parquet::Int96 &rv) { } bool ParquetCursor::currentRowGroupSatisfiesIntegerFilter( - Constraint &constraint, - std::shared_ptr _stats) { + Constraint &constraint, std::shared_ptr _stats) { if (!_stats->HasMinMax()) { return true; } @@ -207,31 +201,27 @@ bool ParquetCursor::currentRowGroupSatisfiesIntegerFilter( parquet::Type::type pqType = types[column]; if (pqType == parquet::Type::INT32) { - parquet::TypedRowGroupStatistics> - *stats = (parquet::TypedRowGroupStatistics< - parquet::DataType> *)_stats.get(); + parquet::TypedStatistics *stats = + (parquet::TypedStatistics *)_stats.get(); min = stats->min(); max = stats->max(); } else if (pqType == parquet::Type::INT64) { - parquet::TypedRowGroupStatistics> - *stats = (parquet::TypedRowGroupStatistics< - parquet::DataType> *)_stats.get(); + parquet::TypedStatistics *stats = + (parquet::TypedStatistics *)_stats.get(); min = stats->min(); max = stats->max(); } else if (pqType == parquet::Type::INT96) { - parquet::TypedRowGroupStatistics> - *stats = (parquet::TypedRowGroupStatistics< - parquet::DataType> *)_stats.get(); + parquet::TypedStatistics *stats = + (parquet::TypedStatistics *)_stats.get(); min = int96toMsSinceEpoch(stats->min()); max = int96toMsSinceEpoch(stats->max()); } else if (pqType == parquet::Type::BOOLEAN) { - parquet::TypedRowGroupStatistics> - *stats = (parquet::TypedRowGroupStatistics< - parquet::DataType> *)_stats.get(); + parquet::TypedStatistics *stats = + (parquet::TypedStatistics *)_stats.get(); min = stats->min(); max = stats->max(); @@ -275,8 +265,7 @@ bool ParquetCursor::currentRowGroupSatisfiesIntegerFilter( } bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter( - Constraint &constraint, - std::shared_ptr _stats) { + Constraint &constraint, std::shared_ptr _stats) { if (!_stats->HasMinMax()) { return true; } @@ -292,16 +281,14 @@ bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter( parquet::Type::type pqType = types[column]; if (pqType == parquet::Type::DOUBLE) { - parquet::TypedRowGroupStatistics> - *stats = (parquet::TypedRowGroupStatistics< - parquet::DataType> *)_stats.get(); + parquet::TypedStatistics *stats = + (parquet::TypedStatistics *)_stats.get(); min = stats->min(); max = stats->max(); } else if (pqType == parquet::Type::FLOAT) { - parquet::TypedRowGroupStatistics> - *stats = (parquet::TypedRowGroupStatistics< - parquet::DataType> *)_stats.get(); + parquet::TypedStatistics *stats = + (parquet::TypedStatistics *)_stats.get(); min = stats->min(); max = stats->max(); @@ -521,7 +508,7 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() { std::unique_ptr md = rowGroupMetadata->ColumnChunk(column); if (md->is_stats_set()) { - std::shared_ptr stats = md->statistics(); + std::shared_ptr stats = md->statistics(); // SQLite is much looser with types than you might expect if you // come from a Postgres background. The constraint '30.0' (that is, @@ -540,7 +527,7 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() { parquet::Type::type pqType = types[column]; if (pqType == parquet::Type::BYTE_ARRAY && - logicalTypes[column] == parquet::LogicalType::UTF8) { + logicalTypes[column] == parquet::ConvertedType::UTF8) { rv = currentRowGroupSatisfiesTextFilter(constraints[i], stats); } else if (pqType == parquet::Type::BYTE_ARRAY) { rv = currentRowGroupSatisfiesBlobFilter(constraints[i], stats); @@ -605,13 +592,13 @@ start: while (logicalTypes.size() < (unsigned int)rowGroupMetadata->num_columns()) { logicalTypes.push_back( - rowGroupMetadata->schema()->Column(0)->logical_type()); + rowGroupMetadata->schema()->Column(0)->converted_type()); } for (unsigned int i = 0; i < (unsigned int)rowGroupMetadata->num_columns(); i++) { types[i] = rowGroupMetadata->schema()->Column(i)->physical_type(); - logicalTypes[i] = rowGroupMetadata->schema()->Column(i)->logical_type(); + logicalTypes[i] = rowGroupMetadata->schema()->Column(i)->converted_type(); } for (unsigned int i = 0; i < colRows.size(); i++) { @@ -662,7 +649,7 @@ bool ParquetCursor::currentRowSatisfiesFilter() { rv = !isNull(column); } else { - if (logicalTypes[column] == parquet::LogicalType::UTF8) { + if (logicalTypes[column] == parquet::ConvertedType::UTF8) { rv = currentRowSatisfiesTextFilter(constraints[i]); } else { parquet::Type::type pqType = types[column]; @@ -765,7 +752,7 @@ void ParquetCursor::ensureColumn(int col) { } case parquet::Type::INT64: { parquet::Int64Scanner *s = (parquet::Int64Scanner *)scanners[col].get(); - long rv = 0; + long long rv = 0; s->NextValue(&rv, &wasNull); break; } @@ -843,7 +830,7 @@ void ParquetCursor::ensureColumn(int col) { } case parquet::Type::INT64: { parquet::Int64Scanner *s = (parquet::Int64Scanner *)scanners[col].get(); - long rv = 0; + long long rv = 0; hadValue = s->NextValue(&rv, &wasNull); colIntValues[col] = rv; break; @@ -906,7 +893,7 @@ parquet::Type::type ParquetCursor::getPhysicalType(int col) { return types[col]; } -parquet::LogicalType::type ParquetCursor::getLogicalType(int col) { +parquet::ConvertedType::type ParquetCursor::getLogicalType(int col) { return logicalTypes[col]; } diff --git a/src/parquet_cursor.h b/src/parquet_cursor.h index 8c6edb4..7b46089 100644 --- a/src/parquet_cursor.h +++ b/src/parquet_cursor.h @@ -13,7 +13,7 @@ class ParquetCursor { std::shared_ptr rowGroup; std::vector> scanners; std::vector types; - std::vector logicalTypes; + std::vector logicalTypes; std::vector colRows; std::vector colNulls; @@ -37,17 +37,13 @@ class ParquetCursor { bool currentRowGroupSatisfiesFilter(); bool currentRowGroupSatisfiesRowIdFilter(Constraint &constraint); bool currentRowGroupSatisfiesTextFilter( - Constraint &constraint, - std::shared_ptr stats); + Constraint &constraint, std::shared_ptr stats); bool currentRowGroupSatisfiesBlobFilter( - Constraint &constraint, - std::shared_ptr stats); + Constraint &constraint, std::shared_ptr stats); bool currentRowGroupSatisfiesIntegerFilter( - Constraint &constraint, - std::shared_ptr stats); + Constraint &constraint, std::shared_ptr stats); bool currentRowGroupSatisfiesDoubleFilter( - Constraint &constraint, - std::shared_ptr stats); + Constraint &constraint, std::shared_ptr stats); bool currentRowSatisfiesTextFilter(Constraint &constraint); bool currentRowSatisfiesIntegerFilter(Constraint &constraint); @@ -67,7 +63,7 @@ public: unsigned int getNumConstraints() const; const Constraint &getConstraint(unsigned int i) const; parquet::Type::type getPhysicalType(int col); - parquet::LogicalType::type getLogicalType(int col); + parquet::ConvertedType::type getLogicalType(int col); ParquetTable *getTable() const; int getInt32(int col); diff --git a/src/parquet_table.cc b/src/parquet_table.cc index 5146cfd..e7ed789 100644 --- a/src/parquet_table.cc +++ b/src/parquet_table.cc @@ -66,33 +66,33 @@ std::string ParquetTable::CreateStatement() { std::string type; parquet::Type::type physical = col->physical_type(); - parquet::LogicalType::type logical = col->logical_type(); + parquet::ConvertedType::type logical = col->converted_type(); // Be explicit about which types we understand so we don't mislead someone // whose unsigned ints start getting interpreted as signed. (We could // support this for UINT_8/16/32 -- and for UINT_64 we could throw if // the high bit was set.) - if (logical == parquet::LogicalType::NONE || - logical == parquet::LogicalType::UTF8 || - logical == parquet::LogicalType::DATE || - logical == parquet::LogicalType::TIME_MILLIS || - logical == parquet::LogicalType::TIMESTAMP_MILLIS || - logical == parquet::LogicalType::TIME_MICROS || - logical == parquet::LogicalType::TIMESTAMP_MICROS || - logical == parquet::LogicalType::INT_8 || - logical == parquet::LogicalType::INT_16 || - logical == parquet::LogicalType::INT_32 || - logical == parquet::LogicalType::INT_64) { + if (logical == parquet::ConvertedType::NONE || + logical == parquet::ConvertedType::UTF8 || + logical == parquet::ConvertedType::DATE || + logical == parquet::ConvertedType::TIME_MILLIS || + logical == parquet::ConvertedType::TIMESTAMP_MILLIS || + logical == parquet::ConvertedType::TIME_MICROS || + logical == parquet::ConvertedType::TIMESTAMP_MICROS || + logical == parquet::ConvertedType::INT_8 || + logical == parquet::ConvertedType::INT_16 || + logical == parquet::ConvertedType::INT_32 || + logical == parquet::ConvertedType::INT_64) { switch (physical) { case parquet::Type::BOOLEAN: type = "TINYINT"; break; case parquet::Type::INT32: - if (logical == parquet::LogicalType::NONE || - logical == parquet::LogicalType::INT_32) { + if (logical == parquet::ConvertedType::NONE || + logical == parquet::ConvertedType::INT_32) { type = "INT"; - } else if (logical == parquet::LogicalType::INT_8) { + } else if (logical == parquet::ConvertedType::INT_8) { type = "TINYINT"; - } else if (logical == parquet::LogicalType::INT_16) { + } else if (logical == parquet::ConvertedType::INT_16) { type = "SMALLINT"; } break; @@ -109,7 +109,7 @@ std::string ParquetTable::CreateStatement() { type = "DOUBLE"; break; case parquet::Type::BYTE_ARRAY: - if (logical == parquet::LogicalType::UTF8) { + if (logical == parquet::ConvertedType::UTF8) { type = "TEXT"; } else { type = "BLOB"; @@ -127,17 +127,18 @@ std::string ParquetTable::CreateStatement() { std::ostringstream ss; ss << __FILE__ << ":" << __LINE__ << ": column " << i << " has unsupported type: " << parquet::TypeToString(physical) << "/" - << parquet::LogicalTypeToString(logical); + << parquet::ConvertedTypeToString(logical); throw std::invalid_argument(ss.str()); } #ifdef DEBUG - printf( - "col %d[name=%s, p=%d:%s, l=%d:%s] is %s\n", i, col->name().data(), - col->physical_type(), - parquet::TypeToString(col->physical_type()).data(), col->logical_type(), - parquet::LogicalTypeToString(col->logical_type()).data(), type.data()); + printf("col %d[name=%s, p=%d:%s, l=%d:%s] is %s\n", i, col->name().data(), + col->physical_type(), + parquet::TypeToString(col->physical_type()).data(), + col->logical_type(), + parquet::ConvertedTypeToString(col->logical_type()).data(), + type.data()); #endif text += " ";