1
0
mirror of https://github.com/cldellow/sqlite-parquet-vtable.git synced 2025-09-14 22:39:59 +00:00

reuse FileMetaData

For the statscan dataset, parsing the file metadata takes ~30-40 ms,
so stash it away for future reuse.
This commit is contained in:
Colin Dellow
2018-03-15 19:57:38 -04:00
parent 769060dbcb
commit 92ba5f94e0
4 changed files with 20 additions and 9 deletions

View File

@@ -34,21 +34,15 @@ bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint constraint, st
parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>* stats =
(parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>*)_stats.get();
// TODO: parquet-cpp doesn't seem to support stats for UTF8?
// 1) empirically, the following is false on a few parquets I've tested
// 2) https://github.com/apache/parquet-cpp/blob/master/src/parquet/metadata.cc#L116 seems to say UTF8 not supported
// 3) OTOH, https://issues.apache.org/jira/browse/ARROW-1982 seems to say it's supported
if(!stats->HasMinMax()) {
return true;
}
/*
parquet::ByteArray min = stats->min();
parquet::ByteArray max = stats->max();
std::string minStr((const char*)min.ptr, min.len);
std::string maxStr((const char*)max.ptr, max.len);
printf("min=%s [%d], max=%s [%d]\n", minStr.data(), min.len, maxStr.data(), max.len);
*/
switch(constraint.getOperator()) {
case Is:
@@ -483,7 +477,11 @@ void ParquetCursor::reset(std::vector<Constraint> constraints) {
rowId = -1;
// TODO: consider having a long lived handle in ParquetTable that can be borrowed
// without incurring the cost of opening the file from scratch twice
reader = parquet::ParquetFileReader::OpenFile(table->file.data());
reader = parquet::ParquetFileReader::OpenFile(
table->file.data(),
true,
parquet::default_reader_properties(),
table->getMetadata());
rowGroupId = -1;
rowGroupSize = 0;