reuse FileMetaData
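For the statscan dataset, parsing the file metadata takes ~30-40 ms each time the file is opened, so parse it once, keep it in ParquetTable, and pass it to later ParquetFileReader::OpenFile calls for re-use.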
This commit is contained in:
parent
769060dbcb
commit
92ba5f94e0
|
@ -5,7 +5,7 @@ CC = g++
|
|||
CFLAGS = -I ../sqlite -O3 -std=c++11 -Wall -fPIC -g
|
||||
PARQUET_LIB = $(PARQUET_CPP)/build/release/libparquet.a
|
||||
THRIFT_LIB = $(PARQUET_CPP)/thrift_ep/src/thrift_ep-install/lib/libthrift.a
|
||||
ARROW_LIB = $(PARQUET_CPP)/build/release/libarrow.so
|
||||
ARROW_LIB = /usr/local/lib/libarrow.so
|
||||
BOOST_LIB = /usr/lib/x86_64-linux-gnu/libboost_regex.so
|
||||
|
||||
LDFLAGS = -O3 $(PARQUET_LIB) $(THRIFT_LIB) $(ARROW_LIB) $(BOOST_LIB)
|
||||
|
|
|
@ -34,21 +34,15 @@ bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint constraint, st
|
|||
parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>* stats =
|
||||
(parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>*)_stats.get();
|
||||
|
||||
// TODO: parquet-cpp doesn't seem to support stats for UTF8?
|
||||
// 1) empirically, the following is false on a few parquets I've tested
|
||||
// 2) https://github.com/apache/parquet-cpp/blob/master/src/parquet/metadata.cc#L116 seems to say UTF8 not supported
|
||||
// 3) OTOH, https://issues.apache.org/jira/browse/ARROW-1982 seems to say it's supported
|
||||
if(!stats->HasMinMax()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
parquet::ByteArray min = stats->min();
|
||||
parquet::ByteArray max = stats->max();
|
||||
std::string minStr((const char*)min.ptr, min.len);
|
||||
std::string maxStr((const char*)max.ptr, max.len);
|
||||
printf("min=%s [%d], max=%s [%d]\n", minStr.data(), min.len, maxStr.data(), max.len);
|
||||
*/
|
||||
|
||||
switch(constraint.getOperator()) {
|
||||
case Is:
|
||||
|
@ -483,7 +477,11 @@ void ParquetCursor::reset(std::vector<Constraint> constraints) {
|
|||
rowId = -1;
|
||||
// TODO: consider having a long lived handle in ParquetTable that can be borrowed
|
||||
// without incurring the cost of opening the file from scratch twice
|
||||
reader = parquet::ParquetFileReader::OpenFile(table->file.data());
|
||||
reader = parquet::ParquetFileReader::OpenFile(
|
||||
table->file.data(),
|
||||
true,
|
||||
parquet::default_reader_properties(),
|
||||
table->getMetadata());
|
||||
|
||||
rowGroupId = -1;
|
||||
rowGroupSize = 0;
|
||||
|
|
|
@ -4,6 +4,9 @@
|
|||
|
||||
// Records the path of the Parquet file and reads its file metadata once,
// keeping the result in `metadata` so it can be handed to readers that are
// opened later.
ParquetTable::ParquetTable(std::string file) {
  this->file = file;

  // The reader is only needed long enough to obtain the metadata; it is
  // released when the constructor returns, while the shared metadata
  // pointer stays alive as a member.
  std::unique_ptr<parquet::ParquetFileReader> metadataSource =
      parquet::ParquetFileReader::OpenFile(this->file.data());
  this->metadata = metadataSource->metadata();
}
|
||||
|
||||
std::string ParquetTable::columnName(int i) {
|
||||
|
@ -13,7 +16,11 @@ std::string ParquetTable::columnName(int i) {
|
|||
}
|
||||
|
||||
std::string ParquetTable::CreateStatement() {
|
||||
std::unique_ptr<parquet::ParquetFileReader> reader = parquet::ParquetFileReader::OpenFile(file.data());
|
||||
std::unique_ptr<parquet::ParquetFileReader> reader = parquet::ParquetFileReader::OpenFile(
|
||||
file.data(),
|
||||
true,
|
||||
parquet::default_reader_properties(),
|
||||
metadata);
|
||||
std::string text("CREATE TABLE x(");
|
||||
auto schema = reader->metadata()->schema();
|
||||
|
||||
|
@ -129,3 +136,5 @@ std::string ParquetTable::CreateStatement() {
|
|||
text +=");";
|
||||
return text;
|
||||
}
|
||||
|
||||
std::shared_ptr<parquet::FileMetaData> ParquetTable::getMetadata() { return metadata; }
|
||||
|
|
|
@ -3,15 +3,19 @@
|
|||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "parquet/api/reader.h"
|
||||
|
||||
class ParquetTable {
|
||||
std::vector<std::string> columnNames;
|
||||
std::shared_ptr<parquet::FileMetaData> metadata;
|
||||
|
||||
|
||||
public:
|
||||
ParquetTable(std::string file);
|
||||
std::string CreateStatement();
|
||||
std::string file;
|
||||
std::string columnName(int idx);
|
||||
std::shared_ptr<parquet::FileMetaData> getMetadata();
|
||||
};
|
||||
|
||||
#endif
|
||||
|
|
Loading…
Reference in New Issue