Switch to Arrow and Ubuntu packages
This commit is contained in:
parent
d44c88ad64
commit
723b3d6ac7
|
@ -3,119 +3,39 @@ ROOT:=$(HERE)/../..
|
|||
VTABLE:=$(ROOT)/parquet
|
||||
SQLITE:=$(ROOT)/sqlite
|
||||
|
||||
# Directories
|
||||
ARROW=$(HERE)/arrow
|
||||
ARROW_RELEASE=$(ARROW)/cpp/release
|
||||
BOOST_ROOT=$(ARROW_RELEASE)/boost_ep-prefix/src/boost_ep
|
||||
BOOST=$(BOOST_ROOT)/stage/lib
|
||||
BROTLI=$(ARROW_RELEASE)/brotli_ep/src/brotli_ep-install/lib/x86_64-linux-gnu
|
||||
ICU=$(HERE)/icu
|
||||
LZ4=$(ARROW_RELEASE)/lz4_ep-prefix/src/lz4_ep/lib
|
||||
PARQUET_CPP=$(HERE)/parquet-cpp
|
||||
SNAPPY=$(ARROW_RELEASE)/snappy_ep/src/snappy_ep-install/lib
|
||||
ZLIB=$(ARROW_RELEASE)/zlib_ep/src/zlib_ep-install/lib
|
||||
ZSTD=$(ARROW_RELEASE)/zstd_ep-prefix/src/zstd_ep/lib
|
||||
|
||||
# Libraries
|
||||
# profile_gen, profile_build for PGO
|
||||
APACHE_BUILD=release
|
||||
|
||||
ARROW_LIB = $(ARROW_RELEASE)/$(APACHE_BUILD)/libarrow.a
|
||||
BOOST_FILESYSTEM_LIB = $(BOOST)/libboost_filesystem.a
|
||||
BOOST_REGEX_LIB = $(BOOST)/libboost_regex.a
|
||||
BOOST_SYSTEM_LIB = $(BOOST)/libboost_system.a
|
||||
BROTLI_COMMON_LIB = $(BROTLI)/libbrotlicommon.a
|
||||
BROTLI_DEC_LIB = $(BROTLI)/libbrotlidec.a
|
||||
BROTLI_ENC_LIB = $(BROTLI)/libbrotlienc.a
|
||||
ICU_I18N_LIB=$(ICU)/source/lib/libicui18n.a
|
||||
ICU_UC_LIB=$(ICU)/source/lib/libicuuc.a
|
||||
ICU_DATA_LIB=$(ICU)/source/lib/libicudata.a
|
||||
LZ4_LIB = $(LZ4)/liblz4.a
|
||||
PARQUET_CPP_LIB = $(PARQUET_CPP)/build/$(APACHE_BUILD)/libparquet.a
|
||||
SNAPPY_LIB = $(SNAPPY)/libsnappy.a
|
||||
THRIFT_LIB = $(PARQUET_CPP)/thrift_ep/src/thrift_ep-install/lib/libthrift.a
|
||||
ZLIB_LIB = $(ZLIB)/libz.a
|
||||
ZSTD_LIB = $(ZSTD)/libzstd.a
|
||||
|
||||
# Flags
|
||||
CC = gcc
|
||||
CXX = g++
|
||||
OPTIMIZATIONS = -O3
|
||||
CPUS:=$(shell nproc)
|
||||
CFLAGS = -I $(SQLITE) -I $(PARQUET_CPP)/src -I $(ARROW)/cpp/src $(OPTIMIZATIONS) -std=c++11 -Wall -fPIC -g
|
||||
CFLAGS = -I $(SQLITE) $(OPTIMIZATIONS) -std=c++11 -Wall -fPIC -g
|
||||
LIBS = -lparquet -lboost_regex -lboost_system -lboost_filesystem \
|
||||
-lbrotlienc -lbrotlicommon -lbrotlidec -licui18n -licuuc -licudata \
|
||||
-llz4 -lsnappy -lthrift -lz -lzstd -lcrypto -lssl
|
||||
|
||||
ALL_LIBS = $(PARQUET_CPP_LIB) $(LZ4_LIB) $(ZSTD_LIB) $(THRIFT_LIB) $(SNAPPY_LIB) $(ARROW_LIB) \
|
||||
$(ICU_I18N_LIB) $(ICU_UC_LIB) $(ICU_DATA_LIB) \
|
||||
$(BROTLI_ENC_LIB) $(BROTLI_COMMON_LIB) $(BROTLI_DEC_LIB) $(BOOST_REGEX_LIB) $(BOOST_SYSTEM_LIB) $(BOOST_FILESYSTEM_LIB)
|
||||
LDFLAGS = $(OPTIMIZATIONS) -Wl,--no-whole-archive $(LIBS) -lz -lcrypto -lssl
|
||||
|
||||
LDFLAGS = $(OPTIMIZATIONS) \
|
||||
-Wl,--whole-archive $(ALL_LIBS) \
|
||||
-Wl,--no-whole-archive -lz -lcrypto -lssl
|
||||
OBJ = parquet.o parquet_filter.o parquet_table.o parquet_cursor.o
|
||||
LIBS = $(ARROW_LIB) $(PARQUET_CPP_LIB) $(ICU_I18N_LIB)
|
||||
|
||||
PROF =
|
||||
|
||||
libparquet.so: $(LIBS) $(OBJ)
|
||||
libparquet.so: $(OBJ)
|
||||
$(CXX) $(PROF) -shared -o $@ $(OBJ) $(LDFLAGS)
|
||||
|
||||
parquet_filter.o: $(VTABLE)/parquet_filter.cc $(VTABLE)/parquet_filter.h $(ARROW) $(PARQUET_CPP)
|
||||
parquet_filter.o: $(VTABLE)/parquet_filter.cc $(VTABLE)/parquet_filter.h
|
||||
$(CXX) $(PROF) -c -o $@ $< $(CFLAGS)
|
||||
|
||||
parquet_cursor.o: $(VTABLE)/parquet_cursor.cc $(VTABLE)/parquet_cursor.h $(VTABLE)/parquet_table.h $(VTABLE)/parquet_filter.h $(ARROW) $(PARQUET_CPP)
|
||||
parquet_cursor.o: $(VTABLE)/parquet_cursor.cc $(VTABLE)/parquet_cursor.h $(VTABLE)/parquet_table.h $(VTABLE)/parquet_filter.h
|
||||
$(CXX) $(PROF) -c -o $@ $< $(CFLAGS)
|
||||
|
||||
parquet_table.o: $(VTABLE)/parquet_table.cc $(VTABLE)/parquet_table.h $(ARROW) $(PARQUET_CPP)
|
||||
parquet_table.o: $(VTABLE)/parquet_table.cc $(VTABLE)/parquet_table.h
|
||||
$(CXX) $(PROF) -c -o $@ $< $(CFLAGS)
|
||||
|
||||
parquet.o: $(VTABLE)/parquet.cc $(VTABLE)/parquet_cursor.h $(VTABLE)/parquet_table.h $(VTABLE)/parquet_filter.h $(ARROW) $(PARQUET_CPP)
|
||||
parquet.o: $(VTABLE)/parquet.cc $(VTABLE)/parquet_cursor.h $(VTABLE)/parquet_table.h $(VTABLE)/parquet_filter.h
|
||||
$(CXX) $(PROF) -c -o $@ $< $(CFLAGS)
|
||||
|
||||
$(ARROW):
|
||||
rm -rf $(ARROW)
|
||||
git clone https://github.com/apache/arrow.git $(ARROW)
|
||||
cd $(ARROW) && git checkout apache-arrow-0.9.0
|
||||
mkdir $(ARROW)/cpp/release
|
||||
cd $(ARROW)/cpp/release && cmake -DCMAKE_BUILD_TYPE=$(APACHE_BUILD) -DARROW_BOOST_VENDORED=ON -DARROW_BOOST_USE_SHARED=OFF -DPARQUET_BUILD_SHARED=OFF ..
|
||||
touch -d @0 $(ARROW)
|
||||
|
||||
$(ARROW_LIB): $(ARROW)
|
||||
cd $(ARROW)/cpp/release && make -j$(CPUS)
|
||||
|
||||
# This is pretty gross. I'm sure someone who knows what they're doing could do this more cleanly.
|
||||
$(ICU_I18N_LIB):
|
||||
rm -rf $(ICU)
|
||||
mkdir $(ICU)
|
||||
cd $(ICU) && wget https://github.com/unicode-org/icu/releases/download/release-$(ICU_VERSION)/icu4c-$(ICU_VERSION_U)-src.tgz
|
||||
cd $(ICU) && tar xf icu4c-$(ICU_VERSION_U)-src.tgz --strip-components=1
|
||||
cd $(ICU)/source && ./configure --enable-static
|
||||
cd $(ICU)/source && make -j$(CPUS) LIBCFLAGS='-fPIC' LIBCXXFLAGS='-fPIC'
|
||||
|
||||
$(PARQUET_CPP):
|
||||
rm -rf $(PARQUET_CPP)
|
||||
git clone https://github.com/apache/parquet-cpp.git $(PARQUET_CPP)
|
||||
cd $(PARQUET_CPP) && git checkout apache-parquet-cpp-1.4.0
|
||||
cd $(PARQUET_CPP) && BOOST_ROOT=$(BOOST_ROOT) BOOST_STATIC_REGEX_LIBRARY=$(BOOST_REGEX_LIB) SNAPPY_STATIC_LIB=$(SNAPPY_LIB) BROTLI_STATIC_LIB_ENC=$(BROTLI_ENC_LIB) BROTLI_STATIC_LIB_DEC=$(BROTLI_DEC_LIB) BROTLI_STATIC_LIB_COMMON=$(BROTLI_COMMON_LIB) ZLIB_STATIC_LIB=$(ZLIB_LIB) LZ4_STATIC_LIB=$(LZ4_LIB) ZSTD_STATIC_LIB=$(ZSTD_LIB) cmake -DCMAKE_BUILD_TYPE=$(APACHE_BUILD) -DPARQUET_MINIMAL_DEPENDENCY=ON -DPARQUET_ARROW_LINKAGE=static -DPARQUET_BOOST_USE_SHARED=OFF -DPARQUET_BUILD_SHARED=OFF .
|
||||
touch -d @0 $(PARQUET_CPP)
|
||||
|
||||
$(PARQUET_CPP_LIB): $(PARQUET_CPP) $(ARROW_LIB)
|
||||
cd $(PARQUET_CPP) && make -j$(CPUS)
|
||||
|
||||
.PHONY: clean arrow icu parquet publish_libs
|
||||
.PHONY: clean parquet
|
||||
|
||||
clean:
|
||||
rm -f *.o *.so
|
||||
|
||||
distclean:
|
||||
rm -rf $(SQLITE) $(HERE)
|
||||
|
||||
|
||||
arrow: $(ARROW_LIB)
|
||||
|
||||
icu: $(ICU_I18N_LIB)
|
||||
|
||||
parquet: $(PARQUET_CPP_LIB)
|
||||
|
||||
publish_libs:
|
||||
tar -cJf libs.tar.xz $(ALL_LIBS) $(SQLITE)/sqlite3
|
||||
s3cmd put libs.tar.xz s3://cldellow/public/libparquet/$$(lsb_release -s -r)/libs.tar.xz
|
||||
|
|
60
make-linux
60
make-linux
|
@ -1,31 +1,36 @@
|
|||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
apt install -y sudo lsb-release wget
|
||||
|
||||
here=$(dirname "${BASH_SOURCE[0]}")
|
||||
here=$(readlink -f "$here")
|
||||
prebuilt="$here"/build/linux/prebuilt
|
||||
ubuntu="$(lsb_release -s -r)"
|
||||
libs=(libarrow.a libboost_filesystem.a libboost_regex.a libboost_system.a libbrotlicommon.a libbrotlidec.a \
|
||||
libbrotlienc.a libicudata.a libicui18n.a libicuuc.a liblz4.a libparquet.a libsnappy.a libthrift.a libzstd.a)
|
||||
lib_locs=()
|
||||
|
||||
setup_directories() {
|
||||
cd "$here"
|
||||
mkdir -p build/linux
|
||||
mkdir -p "$prebuilt"
|
||||
cp -f build/Makefile.linux build/linux/Makefile
|
||||
cd build/linux
|
||||
}
|
||||
|
||||
install_prerequisites() {
|
||||
# install Apache Arrow libs
|
||||
# NOTE: Pinned to Ubuntu Focal
|
||||
wget https://apache.bintray.com/arrow/ubuntu/apache-arrow-archive-keyring-latest-focal.deb
|
||||
sudo apt install -y -V ./apache-arrow-archive-keyring-latest-focal.deb
|
||||
sudo apt update -y
|
||||
sudo apt install -y -V libparquet-dev liblz4-dev libzstd-dev libthrift-dev \
|
||||
libsnappy-dev libthrift-dev libbrotli-dev libz-dev
|
||||
|
||||
# Install prereqs based on https://github.com/apache/parquet-cpp#linux
|
||||
sudo apt-get install libboost-dev g++ libboost-filesystem-dev \
|
||||
sudo apt install -y libboost-dev g++ libboost-filesystem-dev \
|
||||
libboost-program-options-dev libboost-regex-dev \
|
||||
libboost-system-dev libboost-test-dev \
|
||||
libssl-dev libtool bison flex pkg-config libreadline-dev libncurses-dev
|
||||
|
||||
# Install prereqs based on https://github.com/apache/arrow/tree/master/cpp
|
||||
sudo apt-get install cmake \
|
||||
sudo apt install -y cmake \
|
||||
libboost-dev \
|
||||
libboost-filesystem-dev \
|
||||
libboost-system-dev
|
||||
|
@ -48,6 +53,9 @@ set_icu_version() {
|
|||
18.04)
|
||||
export ICU_VERSION=60-2
|
||||
;;
|
||||
20.10)
|
||||
export ICU_VERSION=67-1
|
||||
;;
|
||||
*)
|
||||
echo "unsure what libicu version to use" >&2
|
||||
exit 1
|
||||
|
@ -56,47 +64,11 @@ set_icu_version() {
|
|||
export ICU_VERSION_U=${ICU_VERSION//-/_}
|
||||
}
|
||||
|
||||
add_prebuilt_lib() {
|
||||
lib_locs+=("$1=$prebuilt/$2.a")
|
||||
}
|
||||
|
||||
fetch_prebuilt_libs() {
|
||||
if [ ! -e "$prebuilt"/complete ]; then
|
||||
(
|
||||
cd "$prebuilt"
|
||||
curl "https://s3.amazonaws.com/cldellow/public/libparquet/$ubuntu/libs.tar.xz" > libs.tar.xz
|
||||
tar xf libs.tar.xz --xform 's#.*/##'
|
||||
touch "$prebuilt"/complete
|
||||
)
|
||||
fi
|
||||
|
||||
if [ ! -e "$here"/sqlite/sqlite3 ]; then
|
||||
ln -s "$prebuilt"/sqlite3 "$here"/sqlite/sqlite3
|
||||
fi
|
||||
|
||||
add_prebuilt_lib "PARQUET_CPP_LIB" libparquet
|
||||
add_prebuilt_lib "LZ4_LIB" liblz4
|
||||
add_prebuilt_lib "ZSTD_LIB" libzstd
|
||||
add_prebuilt_lib "THRIFT_LIB" libthrift
|
||||
add_prebuilt_lib "SNAPPY_LIB" libsnappy
|
||||
add_prebuilt_lib "ARROW_LIB" libarrow
|
||||
add_prebuilt_lib "ICU_I18N_LIB" libicui18n
|
||||
add_prebuilt_lib "ICU_UC_LIB" libicuuc
|
||||
add_prebuilt_lib "ICU_DATA_LIB" libicudata
|
||||
add_prebuilt_lib "BROTLI_ENC_LIB" libbrotlienc
|
||||
add_prebuilt_lib "BROTLI_COMMON_LIB" libbrotlicommon
|
||||
add_prebuilt_lib "BROTLI_DEC_LIB" libbrotlidec
|
||||
add_prebuilt_lib "BOOST_REGEX_LIB" libboost_regex
|
||||
add_prebuilt_lib "BOOST_SYSTEM_LIB" libboost_system
|
||||
add_prebuilt_lib "BOOST_FILESYSTEM_LIB" libboost_filesystem
|
||||
|
||||
}
|
||||
|
||||
main() {
|
||||
set_icu_version
|
||||
setup_directories
|
||||
install_prerequisites
|
||||
build_sqlite
|
||||
set_icu_version
|
||||
|
||||
if [ -v PREBUILT ]; then
|
||||
fetch_prebuilt_libs
|
||||
|
|
|
@ -290,7 +290,7 @@ static int parquetColumn(
|
|||
case parquet::Type::BYTE_ARRAY:
|
||||
{
|
||||
parquet::ByteArray* rv = cursor->getByteArray(col);
|
||||
if(cursor->getLogicalType(col) == parquet::LogicalType::UTF8) {
|
||||
if(cursor->getConvertedType(col) == parquet::ConvertedType::UTF8) {
|
||||
sqlite3_result_text(ctx, (const char*)rv->ptr, rv->len, SQLITE_TRANSIENT);
|
||||
} else {
|
||||
sqlite3_result_blob(ctx, (void*)rv->ptr, rv->len, SQLITE_TRANSIENT);
|
||||
|
|
|
@ -31,7 +31,7 @@ bool ParquetCursor::currentRowGroupSatisfiesRowIdFilter(Constraint& constraint)
|
|||
}
|
||||
}
|
||||
|
||||
bool ParquetCursor::currentRowGroupSatisfiesBlobFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> _stats) {
|
||||
bool ParquetCursor::currentRowGroupSatisfiesBlobFilter(Constraint& constraint, std::shared_ptr<parquet::Statistics> _stats) {
|
||||
if(!_stats->HasMinMax()) {
|
||||
return true;
|
||||
}
|
||||
|
@ -48,8 +48,8 @@ bool ParquetCursor::currentRowGroupSatisfiesBlobFilter(Constraint& constraint, s
|
|||
parquet::Type::type pqType = types[constraint.column];
|
||||
|
||||
if(pqType == parquet::Type::BYTE_ARRAY) {
|
||||
parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>* stats =
|
||||
(parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>*)_stats.get();
|
||||
parquet::TypedStatistics<parquet::ByteArrayType>* stats =
|
||||
(parquet::TypedStatistics<parquet::ByteArrayType>*)_stats.get();
|
||||
|
||||
minPtr = stats->min().ptr;
|
||||
minLen = stats->min().len;
|
||||
|
@ -137,9 +137,9 @@ bool ParquetCursor::currentRowGroupSatisfiesBlobFilter(Constraint& constraint, s
|
|||
}
|
||||
}
|
||||
|
||||
bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> _stats) {
|
||||
parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>* stats =
|
||||
(parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>*)_stats.get();
|
||||
bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint& constraint, std::shared_ptr<parquet::Statistics> _stats) {
|
||||
parquet::TypedStatistics<parquet::ByteArrayType>* stats =
|
||||
(parquet::TypedStatistics<parquet::ByteArrayType>*)_stats.get();
|
||||
|
||||
if(!stats->HasMinMax()) {
|
||||
return true;
|
||||
|
@ -195,7 +195,7 @@ int64_t int96toMsSinceEpoch(const parquet::Int96& rv) {
|
|||
return nsSinceEpoch;
|
||||
}
|
||||
|
||||
bool ParquetCursor::currentRowGroupSatisfiesIntegerFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> _stats) {
|
||||
bool ParquetCursor::currentRowGroupSatisfiesIntegerFilter(Constraint& constraint, std::shared_ptr<parquet::Statistics> _stats) {
|
||||
if(!_stats->HasMinMax()) {
|
||||
return true;
|
||||
}
|
||||
|
@ -211,27 +211,27 @@ bool ParquetCursor::currentRowGroupSatisfiesIntegerFilter(Constraint& constraint
|
|||
parquet::Type::type pqType = types[column];
|
||||
|
||||
if(pqType == parquet::Type::INT32) {
|
||||
parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::INT32>>* stats =
|
||||
(parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::INT32>>*)_stats.get();
|
||||
parquet::TypedStatistics<parquet::Int32Type>* stats =
|
||||
(parquet::TypedStatistics<parquet::Int32Type>*)_stats.get();
|
||||
|
||||
min = stats->min();
|
||||
max = stats->max();
|
||||
} else if(pqType == parquet::Type::INT64) {
|
||||
parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::INT64>>* stats =
|
||||
(parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::INT64>>*)_stats.get();
|
||||
parquet::TypedStatistics<parquet::Int64Type>* stats =
|
||||
(parquet::TypedStatistics<parquet::Int64Type>*)_stats.get();
|
||||
|
||||
min = stats->min();
|
||||
max = stats->max();
|
||||
} else if(pqType == parquet::Type::INT96) {
|
||||
parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::INT96>>* stats =
|
||||
(parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::INT96>>*)_stats.get();
|
||||
parquet::TypedStatistics<parquet::Int96Type>* stats =
|
||||
(parquet::TypedStatistics<parquet::Int96Type>*)_stats.get();
|
||||
|
||||
min = int96toMsSinceEpoch(stats->min());
|
||||
max = int96toMsSinceEpoch(stats->max());
|
||||
|
||||
} else if(pqType == parquet::Type::BOOLEAN) {
|
||||
parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BOOLEAN>>* stats =
|
||||
(parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BOOLEAN>>*)_stats.get();
|
||||
parquet::TypedStatistics<parquet::BooleanType>* stats =
|
||||
(parquet::TypedStatistics<parquet::BooleanType>*)_stats.get();
|
||||
|
||||
min = stats->min();
|
||||
max = stats->max();
|
||||
|
@ -272,7 +272,7 @@ bool ParquetCursor::currentRowGroupSatisfiesIntegerFilter(Constraint& constraint
|
|||
return true;
|
||||
}
|
||||
|
||||
bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> _stats) {
|
||||
bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter(Constraint& constraint, std::shared_ptr<parquet::Statistics> _stats) {
|
||||
if(!_stats->HasMinMax()) {
|
||||
return true;
|
||||
}
|
||||
|
@ -288,14 +288,14 @@ bool ParquetCursor::currentRowGroupSatisfiesDoubleFilter(Constraint& constraint,
|
|||
parquet::Type::type pqType = types[column];
|
||||
|
||||
if(pqType == parquet::Type::DOUBLE) {
|
||||
parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::DOUBLE>>* stats =
|
||||
(parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::DOUBLE>>*)_stats.get();
|
||||
parquet::TypedStatistics<parquet::DoubleType>* stats =
|
||||
(parquet::TypedStatistics<parquet::DoubleType>*)_stats.get();
|
||||
|
||||
min = stats->min();
|
||||
max = stats->max();
|
||||
} else if(pqType == parquet::Type::FLOAT) {
|
||||
parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::FLOAT>>* stats =
|
||||
(parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::FLOAT>>*)_stats.get();
|
||||
parquet::TypedStatistics<parquet::FloatType>* stats =
|
||||
(parquet::TypedStatistics<parquet::FloatType>*)_stats.get();
|
||||
|
||||
min = stats->min();
|
||||
max = stats->max();
|
||||
|
@ -527,7 +527,7 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() {
|
|||
} else {
|
||||
std::unique_ptr<parquet::ColumnChunkMetaData> md = rowGroupMetadata->ColumnChunk(column);
|
||||
if(md->is_stats_set()) {
|
||||
std::shared_ptr<parquet::RowGroupStatistics> stats = md->statistics();
|
||||
std::shared_ptr<parquet::Statistics> stats = md->statistics();
|
||||
|
||||
// SQLite is much looser with types than you might expect if you
|
||||
// come from a Postgres background. The constraint '30.0' (that is,
|
||||
|
@ -545,7 +545,7 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() {
|
|||
} else {
|
||||
parquet::Type::type pqType = types[column];
|
||||
|
||||
if(pqType == parquet::Type::BYTE_ARRAY && logicalTypes[column] == parquet::LogicalType::UTF8) {
|
||||
if(pqType == parquet::Type::BYTE_ARRAY && ConvertedTypes[column] == parquet::ConvertedType::UTF8) {
|
||||
rv = currentRowGroupSatisfiesTextFilter(constraints[i], stats);
|
||||
} else if(pqType == parquet::Type::BYTE_ARRAY) {
|
||||
rv = currentRowGroupSatisfiesBlobFilter(constraints[i], stats);
|
||||
|
@ -608,13 +608,13 @@ start:
|
|||
types.push_back(rowGroupMetadata->schema()->Column(0)->physical_type());
|
||||
}
|
||||
|
||||
while(logicalTypes.size() < (unsigned int)rowGroupMetadata->num_columns()) {
|
||||
logicalTypes.push_back(rowGroupMetadata->schema()->Column(0)->logical_type());
|
||||
while(ConvertedTypes.size() < (unsigned int)rowGroupMetadata->num_columns()) {
|
||||
ConvertedTypes.push_back(rowGroupMetadata->schema()->Column(0)->converted_type());
|
||||
}
|
||||
|
||||
for(unsigned int i = 0; i < (unsigned int)rowGroupMetadata->num_columns(); i++) {
|
||||
types[i] = rowGroupMetadata->schema()->Column(i)->physical_type();
|
||||
logicalTypes[i] = rowGroupMetadata->schema()->Column(i)->logical_type();
|
||||
ConvertedTypes[i] = rowGroupMetadata->schema()->Column(i)->converted_type();
|
||||
}
|
||||
|
||||
for(unsigned int i = 0; i < colRows.size(); i++) {
|
||||
|
@ -664,7 +664,7 @@ bool ParquetCursor::currentRowSatisfiesFilter() {
|
|||
rv = !isNull(column);
|
||||
} else {
|
||||
|
||||
if(logicalTypes[column] == parquet::LogicalType::UTF8) {
|
||||
if(ConvertedTypes[column] == parquet::ConvertedType::UTF8) {
|
||||
rv = currentRowSatisfiesTextFilter(constraints[i]);
|
||||
} else {
|
||||
parquet::Type::type pqType = types[column];
|
||||
|
@ -928,8 +928,8 @@ parquet::Type::type ParquetCursor::getPhysicalType(int col) {
|
|||
return types[col];
|
||||
}
|
||||
|
||||
parquet::LogicalType::type ParquetCursor::getLogicalType(int col) {
|
||||
return logicalTypes[col];
|
||||
parquet::ConvertedType::type ParquetCursor::getConvertedType(int col) {
|
||||
return ConvertedTypes[col];
|
||||
}
|
||||
|
||||
void ParquetCursor::close() {
|
||||
|
|
|
@ -13,7 +13,7 @@ class ParquetCursor {
|
|||
std::shared_ptr<parquet::RowGroupReader> rowGroup;
|
||||
std::vector<std::shared_ptr<parquet::Scanner>> scanners;
|
||||
std::vector<parquet::Type::type> types;
|
||||
std::vector<parquet::LogicalType::type> logicalTypes;
|
||||
std::vector<parquet::ConvertedType::type> ConvertedTypes;
|
||||
|
||||
std::vector<int> colRows;
|
||||
std::vector<bool> colNulls;
|
||||
|
@ -36,10 +36,10 @@ class ParquetCursor {
|
|||
bool currentRowSatisfiesFilter();
|
||||
bool currentRowGroupSatisfiesFilter();
|
||||
bool currentRowGroupSatisfiesRowIdFilter(Constraint& constraint);
|
||||
bool currentRowGroupSatisfiesTextFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
|
||||
bool currentRowGroupSatisfiesBlobFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
|
||||
bool currentRowGroupSatisfiesIntegerFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
|
||||
bool currentRowGroupSatisfiesDoubleFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
|
||||
bool currentRowGroupSatisfiesTextFilter(Constraint& constraint, std::shared_ptr<parquet::Statistics> stats);
|
||||
bool currentRowGroupSatisfiesBlobFilter(Constraint& constraint, std::shared_ptr<parquet::Statistics> stats);
|
||||
bool currentRowGroupSatisfiesIntegerFilter(Constraint& constraint, std::shared_ptr<parquet::Statistics> stats);
|
||||
bool currentRowGroupSatisfiesDoubleFilter(Constraint& constraint, std::shared_ptr<parquet::Statistics> stats);
|
||||
|
||||
bool currentRowSatisfiesTextFilter(Constraint& constraint);
|
||||
bool currentRowSatisfiesIntegerFilter(Constraint& constraint);
|
||||
|
@ -60,7 +60,7 @@ public:
|
|||
unsigned int getNumConstraints() const;
|
||||
const Constraint& getConstraint(unsigned int i) const;
|
||||
parquet::Type::type getPhysicalType(int col);
|
||||
parquet::LogicalType::type getLogicalType(int col);
|
||||
parquet::ConvertedType::type getConvertedType(int col);
|
||||
ParquetTable* getTable() const;
|
||||
|
||||
int getInt32(int col);
|
||||
|
|
|
@ -66,33 +66,33 @@ std::string ParquetTable::CreateStatement() {
|
|||
std::string type;
|
||||
|
||||
parquet::Type::type physical = col->physical_type();
|
||||
parquet::LogicalType::type logical = col->logical_type();
|
||||
parquet::ConvertedType::type converted = col->converted_type();
|
||||
// Be explicit about which types we understand so we don't mislead someone
|
||||
// whose unsigned ints start getting interpreted as signed. (We could
|
||||
// support this for UINT_8/16/32 -- and for UINT_64 we could throw if
|
||||
// the high bit was set.)
|
||||
if(logical == parquet::LogicalType::NONE ||
|
||||
logical == parquet::LogicalType::UTF8 ||
|
||||
logical == parquet::LogicalType::DATE ||
|
||||
logical == parquet::LogicalType::TIME_MILLIS ||
|
||||
logical == parquet::LogicalType::TIMESTAMP_MILLIS ||
|
||||
logical == parquet::LogicalType::TIME_MICROS ||
|
||||
logical == parquet::LogicalType::TIMESTAMP_MICROS ||
|
||||
logical == parquet::LogicalType::INT_8 ||
|
||||
logical == parquet::LogicalType::INT_16 ||
|
||||
logical == parquet::LogicalType::INT_32 ||
|
||||
logical == parquet::LogicalType::INT_64) {
|
||||
if(converted == parquet::ConvertedType::NONE ||
|
||||
converted == parquet::ConvertedType::UTF8 ||
|
||||
converted == parquet::ConvertedType::DATE ||
|
||||
converted == parquet::ConvertedType::TIME_MILLIS ||
|
||||
converted == parquet::ConvertedType::TIMESTAMP_MILLIS ||
|
||||
converted == parquet::ConvertedType::TIME_MICROS ||
|
||||
converted == parquet::ConvertedType::TIMESTAMP_MICROS ||
|
||||
converted == parquet::ConvertedType::INT_8 ||
|
||||
converted == parquet::ConvertedType::INT_16 ||
|
||||
converted == parquet::ConvertedType::INT_32 ||
|
||||
converted == parquet::ConvertedType::INT_64) {
|
||||
switch(physical) {
|
||||
case parquet::Type::BOOLEAN:
|
||||
type = "TINYINT";
|
||||
break;
|
||||
case parquet::Type::INT32:
|
||||
if(logical == parquet::LogicalType::NONE ||
|
||||
logical == parquet::LogicalType::INT_32) {
|
||||
if(converted == parquet::ConvertedType::NONE ||
|
||||
converted == parquet::ConvertedType::INT_32) {
|
||||
type = "INT";
|
||||
} else if(logical == parquet::LogicalType::INT_8) {
|
||||
} else if(converted == parquet::ConvertedType::INT_8) {
|
||||
type = "TINYINT";
|
||||
} else if(logical == parquet::LogicalType::INT_16) {
|
||||
} else if(converted == parquet::ConvertedType::INT_16) {
|
||||
type = "SMALLINT";
|
||||
}
|
||||
break;
|
||||
|
@ -109,7 +109,7 @@ std::string ParquetTable::CreateStatement() {
|
|||
type = "DOUBLE";
|
||||
break;
|
||||
case parquet::Type::BYTE_ARRAY:
|
||||
if(logical == parquet::LogicalType::UTF8) {
|
||||
if(converted == parquet::ConvertedType::UTF8) {
|
||||
type = "TEXT";
|
||||
} else {
|
||||
type = "BLOB";
|
||||
|
@ -126,7 +126,7 @@ std::string ParquetTable::CreateStatement() {
|
|||
if(type.empty()) {
|
||||
std::ostringstream ss;
|
||||
ss << __FILE__ << ":" << __LINE__ << ": column " << i << " has unsupported type: " <<
|
||||
parquet::TypeToString(physical) << "/" << parquet::LogicalTypeToString(logical);
|
||||
parquet::TypeToString(physical) << "/" << parquet::ConvertedTypeToString(converted);
|
||||
|
||||
throw std::invalid_argument(ss.str());
|
||||
}
|
||||
|
@ -137,8 +137,8 @@ std::string ParquetTable::CreateStatement() {
|
|||
col->name().data(),
|
||||
col->physical_type(),
|
||||
parquet::TypeToString(col->physical_type()).data(),
|
||||
col->logical_type(),
|
||||
parquet::LogicalTypeToString(col->logical_type()).data(),
|
||||
col->converted_type(),
|
||||
parquet::ConvertedTypeToString(col->converted_type()).data(),
|
||||
type.data());
|
||||
#endif
|
||||
|
||||
|
|
Loading…
Reference in New Issue