reuse FileMetaData
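For the statscan dataset, parsing the file metadata takes ~30-40 ms, so parse it once when the ParquetTable is constructed and pass the cached FileMetaData to later ParquetFileReader::OpenFile calls instead of re-parsing it each time.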
This commit is contained in:
parent
769060dbcb
commit
92ba5f94e0
|
@ -5,7 +5,7 @@ CC = g++
|
||||||
CFLAGS = -I ../sqlite -O3 -std=c++11 -Wall -fPIC -g
|
CFLAGS = -I ../sqlite -O3 -std=c++11 -Wall -fPIC -g
|
||||||
PARQUET_LIB = $(PARQUET_CPP)/build/release/libparquet.a
|
PARQUET_LIB = $(PARQUET_CPP)/build/release/libparquet.a
|
||||||
THRIFT_LIB = $(PARQUET_CPP)/thrift_ep/src/thrift_ep-install/lib/libthrift.a
|
THRIFT_LIB = $(PARQUET_CPP)/thrift_ep/src/thrift_ep-install/lib/libthrift.a
|
||||||
ARROW_LIB = $(PARQUET_CPP)/build/release/libarrow.so
|
ARROW_LIB = /usr/local/lib/libarrow.so
|
||||||
BOOST_LIB = /usr/lib/x86_64-linux-gnu/libboost_regex.so
|
BOOST_LIB = /usr/lib/x86_64-linux-gnu/libboost_regex.so
|
||||||
|
|
||||||
LDFLAGS = -O3 $(PARQUET_LIB) $(THRIFT_LIB) $(ARROW_LIB) $(BOOST_LIB)
|
LDFLAGS = -O3 $(PARQUET_LIB) $(THRIFT_LIB) $(ARROW_LIB) $(BOOST_LIB)
|
||||||
|
|
|
@ -34,21 +34,15 @@ bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint constraint, st
|
||||||
parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>* stats =
|
parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>* stats =
|
||||||
(parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>*)_stats.get();
|
(parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>*)_stats.get();
|
||||||
|
|
||||||
// TODO: parquet-cpp doesn't seem to support stats for UTF8?
|
|
||||||
// 1) empirically, the following is false on a few parquets I've tested
|
|
||||||
// 2) https://github.com/apache/parquet-cpp/blob/master/src/parquet/metadata.cc#L116 seems to say UTF8 not supported
|
|
||||||
// 3) OTOH, https://issues.apache.org/jira/browse/ARROW-1982 seems to say it's supported
|
|
||||||
if(!stats->HasMinMax()) {
|
if(!stats->HasMinMax()) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
parquet::ByteArray min = stats->min();
|
parquet::ByteArray min = stats->min();
|
||||||
parquet::ByteArray max = stats->max();
|
parquet::ByteArray max = stats->max();
|
||||||
std::string minStr((const char*)min.ptr, min.len);
|
std::string minStr((const char*)min.ptr, min.len);
|
||||||
std::string maxStr((const char*)max.ptr, max.len);
|
std::string maxStr((const char*)max.ptr, max.len);
|
||||||
printf("min=%s [%d], max=%s [%d]\n", minStr.data(), min.len, maxStr.data(), max.len);
|
printf("min=%s [%d], max=%s [%d]\n", minStr.data(), min.len, maxStr.data(), max.len);
|
||||||
*/
|
|
||||||
|
|
||||||
switch(constraint.getOperator()) {
|
switch(constraint.getOperator()) {
|
||||||
case Is:
|
case Is:
|
||||||
|
@ -483,7 +477,11 @@ void ParquetCursor::reset(std::vector<Constraint> constraints) {
|
||||||
rowId = -1;
|
rowId = -1;
|
||||||
// TODO: consider having a long lived handle in ParquetTable that can be borrowed
|
// TODO: consider having a long lived handle in ParquetTable that can be borrowed
|
||||||
// without incurring the cost of opening the file from scratch twice
|
// without incurring the cost of opening the file from scratch twice
|
||||||
reader = parquet::ParquetFileReader::OpenFile(table->file.data());
|
reader = parquet::ParquetFileReader::OpenFile(
|
||||||
|
table->file.data(),
|
||||||
|
true,
|
||||||
|
parquet::default_reader_properties(),
|
||||||
|
table->getMetadata());
|
||||||
|
|
||||||
rowGroupId = -1;
|
rowGroupId = -1;
|
||||||
rowGroupSize = 0;
|
rowGroupSize = 0;
|
||||||
|
|
|
@ -4,6 +4,9 @@
|
||||||
|
|
||||||
ParquetTable::ParquetTable(std::string file) {
|
ParquetTable::ParquetTable(std::string file) {
|
||||||
this->file = file;
|
this->file = file;
|
||||||
|
|
||||||
|
std::unique_ptr<parquet::ParquetFileReader> reader = parquet::ParquetFileReader::OpenFile(file.data());
|
||||||
|
metadata = reader->metadata();
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string ParquetTable::columnName(int i) {
|
std::string ParquetTable::columnName(int i) {
|
||||||
|
@ -13,7 +16,11 @@ std::string ParquetTable::columnName(int i) {
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string ParquetTable::CreateStatement() {
|
std::string ParquetTable::CreateStatement() {
|
||||||
std::unique_ptr<parquet::ParquetFileReader> reader = parquet::ParquetFileReader::OpenFile(file.data());
|
std::unique_ptr<parquet::ParquetFileReader> reader = parquet::ParquetFileReader::OpenFile(
|
||||||
|
file.data(),
|
||||||
|
true,
|
||||||
|
parquet::default_reader_properties(),
|
||||||
|
metadata);
|
||||||
std::string text("CREATE TABLE x(");
|
std::string text("CREATE TABLE x(");
|
||||||
auto schema = reader->metadata()->schema();
|
auto schema = reader->metadata()->schema();
|
||||||
|
|
||||||
|
@ -129,3 +136,5 @@ std::string ParquetTable::CreateStatement() {
|
||||||
text +=");";
|
text +=");";
|
||||||
return text;
|
return text;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::shared_ptr<parquet::FileMetaData> ParquetTable::getMetadata() { return metadata; }
|
||||||
|
|
|
@ -3,15 +3,19 @@
|
||||||
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include "parquet/api/reader.h"
|
||||||
|
|
||||||
class ParquetTable {
|
class ParquetTable {
|
||||||
std::vector<std::string> columnNames;
|
std::vector<std::string> columnNames;
|
||||||
|
std::shared_ptr<parquet::FileMetaData> metadata;
|
||||||
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
ParquetTable(std::string file);
|
ParquetTable(std::string file);
|
||||||
std::string CreateStatement();
|
std::string CreateStatement();
|
||||||
std::string file;
|
std::string file;
|
||||||
std::string columnName(int idx);
|
std::string columnName(int idx);
|
||||||
|
std::shared_ptr<parquet::FileMetaData> getMetadata();
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
Loading…
Reference in New Issue