diff --git a/parquet/Makefile b/parquet/Makefile index 00deae9..07fa587 100644 --- a/parquet/Makefile +++ b/parquet/Makefile @@ -5,7 +5,7 @@ CC = g++ CFLAGS = -I ../sqlite -O3 -std=c++11 -Wall -fPIC -g PARQUET_LIB = $(PARQUET_CPP)/build/release/libparquet.a THRIFT_LIB = $(PARQUET_CPP)/thrift_ep/src/thrift_ep-install/lib/libthrift.a -ARROW_LIB = $(PARQUET_CPP)/build/release/libarrow.so +ARROW_LIB = /usr/local/lib/libarrow.so BOOST_LIB = /usr/lib/x86_64-linux-gnu/libboost_regex.so LDFLAGS = -O3 $(PARQUET_LIB) $(THRIFT_LIB) $(ARROW_LIB) $(BOOST_LIB) diff --git a/parquet/parquet_cursor.cc b/parquet/parquet_cursor.cc index 5d25d48..8242850 100644 --- a/parquet/parquet_cursor.cc +++ b/parquet/parquet_cursor.cc @@ -34,21 +34,15 @@ bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint constraint, st parquet::TypedRowGroupStatistics>* stats = (parquet::TypedRowGroupStatistics>*)_stats.get(); - // TODO: parquet-cpp doesn't seem to support stats for UTF8? - // 1) empirically, the following is false on a few parquets I've tested - // 2) https://github.com/apache/parquet-cpp/blob/master/src/parquet/metadata.cc#L116 seems to say UTF8 not supported - // 3) OTOH, https://issues.apache.org/jira/browse/ARROW-1982 seems to say it's supported if(!stats->HasMinMax()) { return true; } - /* parquet::ByteArray min = stats->min(); parquet::ByteArray max = stats->max(); std::string minStr((const char*)min.ptr, min.len); std::string maxStr((const char*)max.ptr, max.len); printf("min=%s [%d], max=%s [%d]\n", minStr.data(), min.len, maxStr.data(), max.len); - */ switch(constraint.getOperator()) { case Is: @@ -483,7 +477,11 @@ void ParquetCursor::reset(std::vector constraints) { rowId = -1; // TODO: consider having a long lived handle in ParquetTable that can be borrowed // without incurring the cost of opening the file from scratch twice - reader = parquet::ParquetFileReader::OpenFile(table->file.data()); + reader = parquet::ParquetFileReader::OpenFile( + table->file.data(), + true, + parquet::default_reader_properties(), + table->getMetadata()); rowGroupId = -1; rowGroupSize = 0; diff --git a/parquet/parquet_table.cc b/parquet/parquet_table.cc index 8fe072d..c213e2c 100644 --- a/parquet/parquet_table.cc +++ b/parquet/parquet_table.cc @@ -4,6 +4,9 @@ ParquetTable::ParquetTable(std::string file) { this->file = file; + + std::unique_ptr reader = parquet::ParquetFileReader::OpenFile(file.data()); + metadata = reader->metadata(); } std::string ParquetTable::columnName(int i) { @@ -13,7 +16,11 @@ std::string ParquetTable::columnName(int i) { } std::string ParquetTable::CreateStatement() { - std::unique_ptr reader = parquet::ParquetFileReader::OpenFile(file.data()); + std::unique_ptr reader = parquet::ParquetFileReader::OpenFile( + file.data(), + true, + parquet::default_reader_properties(), + metadata); std::string text("CREATE TABLE x("); auto schema = reader->metadata()->schema(); @@ -129,3 +136,5 @@ std::string ParquetTable::CreateStatement() { text +=");"; return text; } + +std::shared_ptr ParquetTable::getMetadata() { return metadata; } diff --git a/parquet/parquet_table.h b/parquet/parquet_table.h index 60e6918..da284c6 100644 --- a/parquet/parquet_table.h +++ b/parquet/parquet_table.h @@ -3,15 +3,19 @@ #include #include +#include "parquet/api/reader.h" class ParquetTable { std::vector columnNames; + std::shared_ptr metadata; + public: ParquetTable(std::string file); std::string CreateStatement(); std::string file; std::string columnName(int idx); + std::shared_ptr getMetadata(); }; #endif