diff --git a/build-sqlite b/build-sqlite index 062a8c0..588663c 100755 --- a/build-sqlite +++ b/build-sqlite @@ -1,11 +1,11 @@ #!/bin/bash set -euo pipefail -VERSION=3240000 +VERSION=3330000 fetch_if_needed() { if [ ! -e sqlite ]; then - curl --fail "https://sqlite.org/2018/sqlite-autoconf-${VERSION}.tar.gz" > sqlite.tar.gz + curl --fail "https://sqlite.org/2020/sqlite-autoconf-${VERSION}.tar.gz" > sqlite.tar.gz tar xf sqlite.tar.gz rm sqlite.tar.gz mv sqlite-autoconf-${VERSION} sqlite diff --git a/build/Makefile.linux b/build/Makefile.linux index 656864a..829a9c7 100644 --- a/build/Makefile.linux +++ b/build/Makefile.linux @@ -3,119 +3,39 @@ ROOT:=$(HERE)/../.. VTABLE:=$(ROOT)/parquet SQLITE:=$(ROOT)/sqlite -# Directories -ARROW=$(HERE)/arrow -ARROW_RELEASE=$(ARROW)/cpp/release -BOOST_ROOT=$(ARROW_RELEASE)/boost_ep-prefix/src/boost_ep -BOOST=$(BOOST_ROOT)/stage/lib -BROTLI=$(ARROW_RELEASE)/brotli_ep/src/brotli_ep-install/lib/x86_64-linux-gnu -ICU=$(HERE)/icu -LZ4=$(ARROW_RELEASE)/lz4_ep-prefix/src/lz4_ep/lib -PARQUET_CPP=$(HERE)/parquet-cpp -SNAPPY=$(ARROW_RELEASE)/snappy_ep/src/snappy_ep-install/lib -ZLIB=$(ARROW_RELEASE)/zlib_ep/src/zlib_ep-install/lib -ZSTD=$(ARROW_RELEASE)/zstd_ep-prefix/src/zstd_ep/lib - -# Libraries -# profile_gen, profile_build for PGO -APACHE_BUILD=release - -ARROW_LIB = $(ARROW_RELEASE)/$(APACHE_BUILD)/libarrow.a -BOOST_FILESYSTEM_LIB = $(BOOST)/libboost_filesystem.a -BOOST_REGEX_LIB = $(BOOST)/libboost_regex.a -BOOST_SYSTEM_LIB = $(BOOST)/libboost_system.a -BROTLI_COMMON_LIB = $(BROTLI)/libbrotlicommon.a -BROTLI_DEC_LIB = $(BROTLI)/libbrotlidec.a -BROTLI_ENC_LIB = $(BROTLI)/libbrotlienc.a -ICU_I18N_LIB=$(ICU)/source/lib/libicui18n.a -ICU_UC_LIB=$(ICU)/source/lib/libicuuc.a -ICU_DATA_LIB=$(ICU)/source/lib/libicudata.a -LZ4_LIB = $(LZ4)/liblz4.a -PARQUET_CPP_LIB = $(PARQUET_CPP)/build/$(APACHE_BUILD)/libparquet.a -SNAPPY_LIB = $(SNAPPY)/libsnappy.a -THRIFT_LIB = $(PARQUET_CPP)/thrift_ep/src/thrift_ep-install/lib/libthrift.a -ZLIB_LIB = $(ZLIB)/libz.a -ZSTD_LIB = $(ZSTD)/libzstd.a - # Flags -CC = gcc CXX = g++ OPTIMIZATIONS = -O3 -CPUS:=$(shell nproc) -CFLAGS = -I $(SQLITE) -I $(PARQUET_CPP)/src -I $(ARROW)/cpp/src $(OPTIMIZATIONS) -std=c++11 -Wall -fPIC -g +CFLAGS = -I $(SQLITE) $(OPTIMIZATIONS) -std=c++11 -Wall -fPIC -g +LIBS = -lparquet -lboost_regex -lboost_system -lboost_filesystem \ + -lbrotlienc -lbrotlicommon -lbrotlidec -licui18n -licuuc -licudata \ + -llz4 -lsnappy -lthrift -lz -lzstd -lcrypto -lssl -ALL_LIBS = $(PARQUET_CPP_LIB) $(LZ4_LIB) $(ZSTD_LIB) $(THRIFT_LIB) $(SNAPPY_LIB) $(ARROW_LIB) \ - $(ICU_I18N_LIB) $(ICU_UC_LIB) $(ICU_DATA_LIB) \ - $(BROTLI_ENC_LIB) $(BROTLI_COMMON_LIB) $(BROTLI_DEC_LIB) $(BOOST_REGEX_LIB) $(BOOST_SYSTEM_LIB) $(BOOST_FILESYSTEM_LIB) +LDFLAGS = $(OPTIMIZATIONS) -Wl,--no-whole-archive $(LIBS) -lz -lcrypto -lssl -LDFLAGS = $(OPTIMIZATIONS) \ - -Wl,--whole-archive $(ALL_LIBS) \ - -Wl,--no-whole-archive -lz -lcrypto -lssl OBJ = parquet.o parquet_filter.o parquet_table.o parquet_cursor.o -LIBS = $(ARROW_LIB) $(PARQUET_CPP_LIB) $(ICU_I18N_LIB) PROF = -libparquet.so: $(LIBS) $(OBJ) +libparquet.so: $(OBJ) $(CXX) $(PROF) -shared -o $@ $(OBJ) $(LDFLAGS) -parquet_filter.o: $(VTABLE)/parquet_filter.cc $(VTABLE)/parquet_filter.h $(ARROW) $(PARQUET_CPP) +parquet_filter.o: $(VTABLE)/parquet_filter.cc $(VTABLE)/parquet_filter.h $(CXX) $(PROF) -c -o $@ $< $(CFLAGS) -parquet_cursor.o: $(VTABLE)/parquet_cursor.cc $(VTABLE)/parquet_cursor.h $(VTABLE)/parquet_table.h $(VTABLE)/parquet_filter.h $(ARROW) $(PARQUET_CPP) +parquet_cursor.o: $(VTABLE)/parquet_cursor.cc $(VTABLE)/parquet_cursor.h $(VTABLE)/parquet_table.h $(VTABLE)/parquet_filter.h $(CXX) $(PROF) -c -o $@ $< $(CFLAGS) -parquet_table.o: $(VTABLE)/parquet_table.cc $(VTABLE)/parquet_table.h $(ARROW) $(PARQUET_CPP) +parquet_table.o: $(VTABLE)/parquet_table.cc $(VTABLE)/parquet_table.h $(CXX) $(PROF) -c -o $@ $< $(CFLAGS) -parquet.o: $(VTABLE)/parquet.cc $(VTABLE)/parquet_cursor.h $(VTABLE)/parquet_table.h $(VTABLE)/parquet_filter.h $(ARROW) $(PARQUET_CPP) +parquet.o: $(VTABLE)/parquet.cc $(VTABLE)/parquet_cursor.h $(VTABLE)/parquet_table.h $(VTABLE)/parquet_filter.h $(CXX) $(PROF) -c -o $@ $< $(CFLAGS) -$(ARROW): - rm -rf $(ARROW) - git clone https://github.com/apache/arrow.git $(ARROW) - cd $(ARROW) && git checkout apache-arrow-0.9.0 - mkdir $(ARROW)/cpp/release - cd $(ARROW)/cpp/release && cmake -DCMAKE_BUILD_TYPE=$(APACHE_BUILD) -DARROW_BOOST_VENDORED=ON -DARROW_BOOST_USE_SHARED=OFF -DPARQUET_BUILD_SHARED=OFF .. - touch -d @0 $(ARROW) - -$(ARROW_LIB): $(ARROW) - cd $(ARROW)/cpp/release && make -j$(CPUS) - -# This is pretty gross. I'm sure someone who knows what they're doing could do this more cleanly. -$(ICU_I18N_LIB): - rm -rf $(ICU) - mkdir $(ICU) - cd $(ICU) && wget https://github.com/unicode-org/icu/releases/download/release-$(ICU_VERSION)/icu4c-$(ICU_VERSION_U)-src.tgz - cd $(ICU) && tar xf icu4c-$(ICU_VERSION_U)-src.tgz --strip-components=1 - cd $(ICU)/source && ./configure --enable-static - cd $(ICU)/source && make -j$(CPUS) LIBCFLAGS='-fPIC' LIBCXXFLAGS='-fPIC' - -$(PARQUET_CPP): - rm -rf $(PARQUET_CPP) - git clone https://github.com/apache/parquet-cpp.git $(PARQUET_CPP) - cd $(PARQUET_CPP) && git checkout apache-parquet-cpp-1.4.0 - cd $(PARQUET_CPP) && BOOST_ROOT=$(BOOST_ROOT) BOOST_STATIC_REGEX_LIBRARY=$(BOOST_REGEX_LIB) SNAPPY_STATIC_LIB=$(SNAPPY_LIB) BROTLI_STATIC_LIB_ENC=$(BROTLI_ENC_LIB) BROTLI_STATIC_LIB_DEC=$(BROTLI_DEC_LIB) BROTLI_STATIC_LIB_COMMON=$(BROTLI_COMMON_LIB) ZLIB_STATIC_LIB=$(ZLIB_LIB) LZ4_STATIC_LIB=$(LZ4_LIB) ZSTD_STATIC_LIB=$(ZSTD_LIB) cmake -DCMAKE_BUILD_TYPE=$(APACHE_BUILD) -DPARQUET_MINIMAL_DEPENDENCY=ON -DPARQUET_ARROW_LINKAGE=static -DPARQUET_BOOST_USE_SHARED=OFF -DPARQUET_BUILD_SHARED=OFF . - touch -d @0 $(PARQUET_CPP) - -$(PARQUET_CPP_LIB): $(PARQUET_CPP) $(ARROW_LIB) - cd $(PARQUET_CPP) && make -j$(CPUS) - -.PHONY: clean arrow icu parquet publish_libs +.PHONY: clean parquet clean: rm -f *.o *.so distclean: rm -rf $(SQLITE) $(HERE) - - -arrow: $(ARROW_LIB) - -icu: $(ICU_I18N_LIB) - -parquet: $(PARQUET_CPP_LIB) - -publish_libs: - tar -cJf libs.tar.xz $(ALL_LIBS) $(SQLITE)/sqlite3 - s3cmd put libs.tar.xz s3://cldellow/public/libparquet/$$(lsb_release -s -r)/libs.tar.xz diff --git a/make-linux b/make-linux index 4ff0af4..a626f09 100755 --- a/make-linux +++ b/make-linux @@ -1,31 +1,36 @@ #!/bin/bash set -euo pipefail +apt install -y sudo lsb-release wget + here=$(dirname "${BASH_SOURCE[0]}") here=$(readlink -f "$here") -prebuilt="$here"/build/linux/prebuilt ubuntu="$(lsb_release -s -r)" -libs=(libarrow.a libboost_filesystem.a libboost_regex.a libboost_system.a libbrotlicommon.a libbrotlidec.a \ - libbrotlienc.a libicudata.a libicui18n.a libicuuc.a liblz4.a libparquet.a libsnappy.a libthrift.a libzstd.a) -lib_locs=() setup_directories() { cd "$here" mkdir -p build/linux - mkdir -p "$prebuilt" cp -f build/Makefile.linux build/linux/Makefile cd build/linux } install_prerequisites() { + # install Apache Arrow libs + # NOTE: Pinned to Ubuntu Focal + wget https://apache.bintray.com/arrow/ubuntu/apache-arrow-archive-keyring-latest-focal.deb + sudo apt install -y -V ./apache-arrow-archive-keyring-latest-focal.deb + sudo apt update -y + sudo apt install -y -V libparquet-dev liblz4-dev libzstd-dev libthrift-dev \ + libsnappy-dev libthrift-dev libbrotli-dev libz-dev + # Install prereqs based on https://github.com/apache/parquet-cpp#linux - sudo apt-get install libboost-dev g++ libboost-filesystem-dev \ + sudo apt install -y libboost-dev g++ libboost-filesystem-dev \ libboost-program-options-dev libboost-regex-dev \ libboost-system-dev libboost-test-dev \ libssl-dev libtool bison flex pkg-config libreadline-dev libncurses-dev # Install prereqs based on https://github.com/apache/arrow/tree/master/cpp - sudo apt-get install cmake \ + sudo apt install -y cmake \ libboost-dev \ libboost-filesystem-dev \ libboost-system-dev @@ -48,6 +53,9 @@ set_icu_version() { 18.04) export ICU_VERSION=60-2 ;; + 20.10) + export ICU_VERSION=67-1 + ;; *) echo "unsure what libicu version to use" >&2 exit 1 @@ -56,47 +64,11 @@ set_icu_version() { export ICU_VERSION_U=${ICU_VERSION//-/_} } -add_prebuilt_lib() { - lib_locs+=("$1=$prebuilt/$2.a") -} - -fetch_prebuilt_libs() { - if [ ! -e "$prebuilt"/complete ]; then - ( - cd "$prebuilt" - curl "https://s3.amazonaws.com/cldellow/public/libparquet/$ubuntu/libs.tar.xz" > libs.tar.xz - tar xf libs.tar.xz --xform 's#.*/##' - touch "$prebuilt"/complete - ) - fi - - if [ ! -e "$here"/sqlite/sqlite3 ]; then - ln -s "$prebuilt"/sqlite3 "$here"/sqlite/sqlite3 - fi - - add_prebuilt_lib "PARQUET_CPP_LIB" libparquet - add_prebuilt_lib "LZ4_LIB" liblz4 - add_prebuilt_lib "ZSTD_LIB" libzstd - add_prebuilt_lib "THRIFT_LIB" libthrift - add_prebuilt_lib "SNAPPY_LIB" libsnappy - add_prebuilt_lib "ARROW_LIB" libarrow - add_prebuilt_lib "ICU_I18N_LIB" libicui18n - add_prebuilt_lib "ICU_UC_LIB" libicuuc - add_prebuilt_lib "ICU_DATA_LIB" libicudata - add_prebuilt_lib "BROTLI_ENC_LIB" libbrotlienc - add_prebuilt_lib "BROTLI_COMMON_LIB" libbrotlicommon - add_prebuilt_lib "BROTLI_DEC_LIB" libbrotlidec - add_prebuilt_lib "BOOST_REGEX_LIB" libboost_regex - add_prebuilt_lib "BOOST_SYSTEM_LIB" libboost_system - add_prebuilt_lib "BOOST_FILESYSTEM_LIB" libboost_filesystem - -} - main() { + set_icu_version setup_directories install_prerequisites build_sqlite - set_icu_version if [ -v PREBUILT ]; then fetch_prebuilt_libs diff --git a/parquet/parquet.cc b/parquet/parquet.cc index 9d5857e..e1df8ad 100644 --- a/parquet/parquet.cc +++ b/parquet/parquet.cc @@ -290,7 +290,7 @@ static int parquetColumn( case parquet::Type::BYTE_ARRAY: { parquet::ByteArray* rv = cursor->getByteArray(col); - if(cursor->getLogicalType(col) == parquet::LogicalType::UTF8) { + if(cursor->getConvertedType(col) == parquet::ConvertedType::UTF8) { sqlite3_result_text(ctx, (const char*)rv->ptr, rv->len, SQLITE_TRANSIENT); } else { sqlite3_result_blob(ctx, (void*)rv->ptr, rv->len, SQLITE_TRANSIENT); diff --git a/parquet/parquet_cursor.cc b/parquet/parquet_cursor.cc index e0ab8b6..e1fe8d3 100644 --- a/parquet/parquet_cursor.cc +++ b/parquet/parquet_cursor.cc @@ -31,7 +31,7 @@ bool ParquetCursor::currentRowGroupSatisfiesRowIdFilter(Constraint& constraint) } } -bool ParquetCursor::currentRowGroupSatisfiesBlobFilter(Constraint& constraint, std::shared_ptr _stats) { +bool ParquetCursor::currentRowGroupSatisfiesBlobFilter(Constraint& constraint, std::shared_ptr _stats) { if(!_stats->HasMinMax()) { return true; } @@ -48,8 +48,8 @@ bool ParquetCursor::currentRowGroupSatisfiesBlobFilter(Constraint& constraint, s parquet::Type::type pqType = types[constraint.column]; if(pqType == parquet::Type::BYTE_ARRAY) { - parquet::TypedRowGroupStatistics>* stats = - (parquet::TypedRowGroupStatistics>*)_stats.get(); + parquet::TypedStatistics* stats = + (parquet::TypedStatistics*)_stats.get(); minPtr = stats->min().ptr; minLen = stats->min().len; @@ -137,9 +137,9 @@ bool ParquetCursor::currentRowGroupSatisfiesBlobFilter(Constraint& constraint, s } } -bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint& constraint, std::shared_ptr _stats) { - parquet::TypedRowGroupStatistics>* stats = - (parquet::TypedRowGroupStatistics>*)_stats.get(); +bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint& constraint, std::shared_ptr _stats) { + parquet::TypedStatistics* stats = + (parquet::TypedStatistics*)_stats.get(); if(!stats->HasMinMax()) { return true; @@ -195,7 +195,7 @@ int64_t int96toMsSinceEpoch(const parquet::Int96& rv) { return nsSinceEpoch; } -bool ParquetCursor::currentRowGroupSatisfiesIntegerFilter(Constraint& constraint, std::shared_ptr _stats) { +bool ParquetCursor::currentRowGroupSatisfiesIntegerFilter(Constraint& constraint, std::shared_ptr _stats) { if(!_stats->HasMinMax()) { return true; } @@ -211,27 +211,27 @@ bool ParquetCursor::currentRowGroupSatisfiesIntegerFilter(Constraint& constraint parquet::Type::type pqType = types[column]; if(pqType == parquet::Type::INT32) { - parquet::TypedRowGroupStatistics>* stats = - (parquet::TypedRowGroupStatistics>*)_stats.get(); + parquet::TypedStatistics* stats = + (parquet::TypedStatistics*)_stats.get(); min = stats->min(); max = stats->max(); } else if(pqType == parquet::Type::INT64) { - parquet::TypedRowGroupStatistics>* stats = - (parquet::TypedRowGroupStatistics>*)_stats.get(); + parquet::TypedStatistics* stats = + (parquet::TypedStatistics*)_stats.get(); min = stats->min(); max = stats->max(); } else if(pqType == parquet::Type::INT96) { - parquet::TypedRowGroupStatistics>* stats = - (parquet::TypedRowGroupStatistics>*)_stats.get(); + parquet::TypedStatistics* stats = + (parquet::TypedStatistics*)_stats.get(); min = int96toMsSinceEpoch(stats->min()); max = int96toMsSinceEpoch(stats->max()); } else if(pqType == parquet::Type::BOOLEAN) { - parquet::TypedRowGroupStatistics>* stats = - (parquet::TypedRowGroupStatistics>*)_stats.get(); + parquet::TypedStatistics* stats = + (parquet::TypedStatistics*)_stats.get(); min = stats->min(); max = stats->max(); @@ -272,7 +272,7 @@ bool ParquetCursor::currentRowGroupSatisfiesIntegerFilter(Constraint& constraint return true; } -bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter(Constraint& constraint, std::shared_ptr _stats) { +bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter(Constraint& constraint, std::shared_ptr _stats) { if(!_stats->HasMinMax()) { return true; } @@ -288,14 +288,14 @@ bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter(Constraint& constraint, parquet::Type::type pqType = types[column]; if(pqType == parquet::Type::DOUBLE) { - parquet::TypedRowGroupStatistics>* stats = - (parquet::TypedRowGroupStatistics>*)_stats.get(); + parquet::TypedStatistics* stats = + (parquet::TypedStatistics*)_stats.get(); min = stats->min(); max = stats->max(); } else if(pqType == parquet::Type::FLOAT) { - parquet::TypedRowGroupStatistics>* stats = - (parquet::TypedRowGroupStatistics>*)_stats.get(); + parquet::TypedStatistics* stats = + (parquet::TypedStatistics*)_stats.get(); min = stats->min(); max = stats->max(); @@ -527,7 +527,7 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() { } else { std::unique_ptr md = rowGroupMetadata->ColumnChunk(column); if(md->is_stats_set()) { - std::shared_ptr stats = md->statistics(); + std::shared_ptr stats = md->statistics(); // SQLite is much looser with types than you might expect if you // come from a Postgres background. The constraint '30.0' (that is, @@ -545,7 +545,7 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() { } else { parquet::Type::type pqType = types[column]; - if(pqType == parquet::Type::BYTE_ARRAY && logicalTypes[column] == parquet::LogicalType::UTF8) { + if(pqType == parquet::Type::BYTE_ARRAY && ConvertedTypes[column] == parquet::ConvertedType::UTF8) { rv = currentRowGroupSatisfiesTextFilter(constraints[i], stats); } else if(pqType == parquet::Type::BYTE_ARRAY) { rv = currentRowGroupSatisfiesBlobFilter(constraints[i], stats); @@ -608,13 +608,13 @@ start: types.push_back(rowGroupMetadata->schema()->Column(0)->physical_type()); } - while(logicalTypes.size() < (unsigned int)rowGroupMetadata->num_columns()) { - logicalTypes.push_back(rowGroupMetadata->schema()->Column(0)->logical_type()); + while(ConvertedTypes.size() < (unsigned int)rowGroupMetadata->num_columns()) { + ConvertedTypes.push_back(rowGroupMetadata->schema()->Column(0)->converted_type()); } for(unsigned int i = 0; i < (unsigned int)rowGroupMetadata->num_columns(); i++) { types[i] = rowGroupMetadata->schema()->Column(i)->physical_type(); - logicalTypes[i] = rowGroupMetadata->schema()->Column(i)->logical_type(); + ConvertedTypes[i] = rowGroupMetadata->schema()->Column(i)->converted_type(); } for(unsigned int i = 0; i < colRows.size(); i++) { @@ -664,7 +664,7 @@ bool ParquetCursor::currentRowSatisfiesFilter() { rv = !isNull(column); } else { - if(logicalTypes[column] == parquet::LogicalType::UTF8) { + if(ConvertedTypes[column] == parquet::ConvertedType::UTF8) { rv = currentRowSatisfiesTextFilter(constraints[i]); } else { parquet::Type::type pqType = types[column]; @@ -928,8 +928,8 @@ parquet::Type::type ParquetCursor::getPhysicalType(int col) { return types[col]; } -parquet::LogicalType::type ParquetCursor::getLogicalType(int col) { - return logicalTypes[col]; +parquet::ConvertedType::type ParquetCursor::getConvertedType(int col) { + return ConvertedTypes[col]; } void ParquetCursor::close() { diff --git a/parquet/parquet_cursor.h b/parquet/parquet_cursor.h index f7d8c2a..f6afc40 100644 --- a/parquet/parquet_cursor.h +++ b/parquet/parquet_cursor.h @@ -13,7 +13,7 @@ class ParquetCursor { std::shared_ptr rowGroup; std::vector> scanners; std::vector types; - std::vector logicalTypes; + std::vector ConvertedTypes; std::vector colRows; std::vector colNulls; @@ -36,10 +36,10 @@ class ParquetCursor { bool currentRowSatisfiesFilter(); bool currentRowGroupSatisfiesFilter(); bool currentRowGroupSatisfiesRowIdFilter(Constraint& constraint); - bool currentRowGroupSatisfiesTextFilter(Constraint& constraint, std::shared_ptr stats); - bool currentRowGroupSatisfiesBlobFilter(Constraint& constraint, std::shared_ptr stats); - bool currentRowGroupSatisfiesIntegerFilter(Constraint& constraint, std::shared_ptr stats); - bool currentRowGroupSatisfiesDoubleFilter(Constraint& constraint, std::shared_ptr stats); + bool currentRowGroupSatisfiesTextFilter(Constraint& constraint, std::shared_ptr stats); + bool currentRowGroupSatisfiesBlobFilter(Constraint& constraint, std::shared_ptr stats); + bool currentRowGroupSatisfiesIntegerFilter(Constraint& constraint, std::shared_ptr stats); + bool currentRowGroupSatisfiesDoubleFilter(Constraint& constraint, std::shared_ptr stats); bool currentRowSatisfiesTextFilter(Constraint& constraint); bool currentRowSatisfiesIntegerFilter(Constraint& constraint); @@ -60,7 +60,7 @@ public: unsigned int getNumConstraints() const; const Constraint& getConstraint(unsigned int i) const; parquet::Type::type getPhysicalType(int col); - parquet::LogicalType::type getLogicalType(int col); + parquet::ConvertedType::type getConvertedType(int col); ParquetTable* getTable() const; int getInt32(int col); diff --git a/parquet/parquet_table.cc b/parquet/parquet_table.cc index d796b8a..a65f02b 100644 --- a/parquet/parquet_table.cc +++ b/parquet/parquet_table.cc @@ -66,33 +66,33 @@ std::string ParquetTable::CreateStatement() { std::string type; parquet::Type::type physical = col->physical_type(); - parquet::LogicalType::type logical = col->logical_type(); + parquet::ConvertedType::type converted = col->converted_type(); // Be explicit about which types we understand so we don't mislead someone // whose unsigned ints start getting interpreted as signed. (We could // support this for UINT_8/16/32 -- and for UINT_64 we could throw if // the high bit was set.) - if(logical == parquet::LogicalType::NONE || - logical == parquet::LogicalType::UTF8 || - logical == parquet::LogicalType::DATE || - logical == parquet::LogicalType::TIME_MILLIS || - logical == parquet::LogicalType::TIMESTAMP_MILLIS || - logical == parquet::LogicalType::TIME_MICROS || - logical == parquet::LogicalType::TIMESTAMP_MICROS || - logical == parquet::LogicalType::INT_8 || - logical == parquet::LogicalType::INT_16 || - logical == parquet::LogicalType::INT_32 || - logical == parquet::LogicalType::INT_64) { + if(converted == parquet::ConvertedType::NONE || + converted == parquet::ConvertedType::UTF8 || + converted == parquet::ConvertedType::DATE || + converted == parquet::ConvertedType::TIME_MILLIS || + converted == parquet::ConvertedType::TIMESTAMP_MILLIS || + converted == parquet::ConvertedType::TIME_MICROS || + converted == parquet::ConvertedType::TIMESTAMP_MICROS || + converted == parquet::ConvertedType::INT_8 || + converted == parquet::ConvertedType::INT_16 || + converted == parquet::ConvertedType::INT_32 || + converted == parquet::ConvertedType::INT_64) { switch(physical) { case parquet::Type::BOOLEAN: type = "TINYINT"; break; case parquet::Type::INT32: - if(logical == parquet::LogicalType::NONE || - logical == parquet::LogicalType::INT_32) { + if(converted == parquet::ConvertedType::NONE || + converted == parquet::ConvertedType::INT_32) { type = "INT"; - } else if(logical == parquet::LogicalType::INT_8) { + } else if(converted == parquet::ConvertedType::INT_8) { type = "TINYINT"; - } else if(logical == parquet::LogicalType::INT_16) { + } else if(converted == parquet::ConvertedType::INT_16) { type = "SMALLINT"; } break; @@ -109,7 +109,7 @@ std::string ParquetTable::CreateStatement() { type = "DOUBLE"; break; case parquet::Type::BYTE_ARRAY: - if(logical == parquet::LogicalType::UTF8) { + if(converted == parquet::ConvertedType::UTF8) { type = "TEXT"; } else { type = "BLOB"; @@ -126,7 +126,7 @@ std::string ParquetTable::CreateStatement() { if(type.empty()) { std::ostringstream ss; ss << __FILE__ << ":" << __LINE__ << ": column " << i << " has unsupported type: " << - parquet::TypeToString(physical) << "/" << parquet::LogicalTypeToString(logical); + parquet::TypeToString(physical) << "/" << parquet::ConvertedTypeToString(converted); throw std::invalid_argument(ss.str()); } @@ -137,8 +137,8 @@ std::string ParquetTable::CreateStatement() { col->name().data(), col->physical_type(), parquet::TypeToString(col->physical_type()).data(), - col->logical_type(), - parquet::LogicalTypeToString(col->logical_type()).data(), + col->converted_type(), + parquet::ConvertedTypeToString(col->converted_type()).data(), type.data()); #endif