reuse FileMetaData

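For the statscan dataset, parsing the file metadata takes ~30-40ms,
so read it once in the ParquetTable constructor and stash it away so that
later ParquetFileReader::OpenFile calls can re-use it instead of re-parsing.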
This commit is contained in:
Colin Dellow 2018-03-15 19:57:38 -04:00
parent 769060dbcb
commit 92ba5f94e0
4 changed files with 20 additions and 9 deletions

View File

@ -5,7 +5,7 @@ CC = g++
CFLAGS = -I ../sqlite -O3 -std=c++11 -Wall -fPIC -g CFLAGS = -I ../sqlite -O3 -std=c++11 -Wall -fPIC -g
PARQUET_LIB = $(PARQUET_CPP)/build/release/libparquet.a PARQUET_LIB = $(PARQUET_CPP)/build/release/libparquet.a
THRIFT_LIB = $(PARQUET_CPP)/thrift_ep/src/thrift_ep-install/lib/libthrift.a THRIFT_LIB = $(PARQUET_CPP)/thrift_ep/src/thrift_ep-install/lib/libthrift.a
ARROW_LIB = $(PARQUET_CPP)/build/release/libarrow.so ARROW_LIB = /usr/local/lib/libarrow.so
BOOST_LIB = /usr/lib/x86_64-linux-gnu/libboost_regex.so BOOST_LIB = /usr/lib/x86_64-linux-gnu/libboost_regex.so
LDFLAGS = -O3 $(PARQUET_LIB) $(THRIFT_LIB) $(ARROW_LIB) $(BOOST_LIB) LDFLAGS = -O3 $(PARQUET_LIB) $(THRIFT_LIB) $(ARROW_LIB) $(BOOST_LIB)

View File

@ -34,21 +34,15 @@ bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint constraint, st
parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>* stats = parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>* stats =
(parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>*)_stats.get(); (parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>*)_stats.get();
// TODO: parquet-cpp doesn't seem to support stats for UTF8?
// 1) empirically, the following is false on a few parquets I've tested
// 2) https://github.com/apache/parquet-cpp/blob/master/src/parquet/metadata.cc#L116 seems to say UTF8 not supported
// 3) OTOH, https://issues.apache.org/jira/browse/ARROW-1982 seems to say it's supported
if(!stats->HasMinMax()) { if(!stats->HasMinMax()) {
return true; return true;
} }
/*
parquet::ByteArray min = stats->min(); parquet::ByteArray min = stats->min();
parquet::ByteArray max = stats->max(); parquet::ByteArray max = stats->max();
std::string minStr((const char*)min.ptr, min.len); std::string minStr((const char*)min.ptr, min.len);
std::string maxStr((const char*)max.ptr, max.len); std::string maxStr((const char*)max.ptr, max.len);
printf("min=%s [%d], max=%s [%d]\n", minStr.data(), min.len, maxStr.data(), max.len); printf("min=%s [%d], max=%s [%d]\n", minStr.data(), min.len, maxStr.data(), max.len);
*/
switch(constraint.getOperator()) { switch(constraint.getOperator()) {
case Is: case Is:
@ -483,7 +477,11 @@ void ParquetCursor::reset(std::vector<Constraint> constraints) {
rowId = -1; rowId = -1;
// TODO: consider having a long lived handle in ParquetTable that can be borrowed // TODO: consider having a long lived handle in ParquetTable that can be borrowed
// without incurring the cost of opening the file from scratch twice // without incurring the cost of opening the file from scratch twice
reader = parquet::ParquetFileReader::OpenFile(table->file.data()); reader = parquet::ParquetFileReader::OpenFile(
table->file.data(),
true,
parquet::default_reader_properties(),
table->getMetadata());
rowGroupId = -1; rowGroupId = -1;
rowGroupSize = 0; rowGroupSize = 0;

View File

@ -4,6 +4,9 @@
ParquetTable::ParquetTable(std::string file) { ParquetTable::ParquetTable(std::string file) {
this->file = file; this->file = file;
std::unique_ptr<parquet::ParquetFileReader> reader = parquet::ParquetFileReader::OpenFile(file.data());
metadata = reader->metadata();
} }
std::string ParquetTable::columnName(int i) { std::string ParquetTable::columnName(int i) {
@ -13,7 +16,11 @@ std::string ParquetTable::columnName(int i) {
} }
std::string ParquetTable::CreateStatement() { std::string ParquetTable::CreateStatement() {
std::unique_ptr<parquet::ParquetFileReader> reader = parquet::ParquetFileReader::OpenFile(file.data()); std::unique_ptr<parquet::ParquetFileReader> reader = parquet::ParquetFileReader::OpenFile(
file.data(),
true,
parquet::default_reader_properties(),
metadata);
std::string text("CREATE TABLE x("); std::string text("CREATE TABLE x(");
auto schema = reader->metadata()->schema(); auto schema = reader->metadata()->schema();
@ -129,3 +136,5 @@ std::string ParquetTable::CreateStatement() {
text +=");"; text +=");";
return text; return text;
} }
std::shared_ptr<parquet::FileMetaData> ParquetTable::getMetadata() { return metadata; }

View File

@ -3,15 +3,19 @@
#include <vector> #include <vector>
#include <string> #include <string>
#include "parquet/api/reader.h"
class ParquetTable { class ParquetTable {
std::vector<std::string> columnNames; std::vector<std::string> columnNames;
std::shared_ptr<parquet::FileMetaData> metadata;
public: public:
ParquetTable(std::string file); ParquetTable(std::string file);
std::string CreateStatement(); std::string CreateStatement();
std::string file; std::string file;
std::string columnName(int idx); std::string columnName(int idx);
std::shared_ptr<parquet::FileMetaData> getMetadata();
}; };
#endif #endif