mirror of
				https://github.com/cldellow/sqlite-parquet-vtable.git
				synced 2025-11-04 02:39:56 +00:00 
			
		
		
		
	reuse FileMetaData
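For the statscan dataset, parsing the file metadata takes ~30-40 ms, so parse it once when the ParquetTable is constructed, keep it there, and pass it to later ParquetFileReader::OpenFile calls for reuse.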
This commit is contained in:
		@@ -5,7 +5,7 @@ CC = g++
 | 
				
			|||||||
CFLAGS = -I ../sqlite -O3 -std=c++11 -Wall -fPIC -g
 | 
					CFLAGS = -I ../sqlite -O3 -std=c++11 -Wall -fPIC -g
 | 
				
			||||||
PARQUET_LIB = $(PARQUET_CPP)/build/release/libparquet.a
 | 
					PARQUET_LIB = $(PARQUET_CPP)/build/release/libparquet.a
 | 
				
			||||||
THRIFT_LIB = $(PARQUET_CPP)/thrift_ep/src/thrift_ep-install/lib/libthrift.a
 | 
					THRIFT_LIB = $(PARQUET_CPP)/thrift_ep/src/thrift_ep-install/lib/libthrift.a
 | 
				
			||||||
ARROW_LIB = $(PARQUET_CPP)/build/release/libarrow.so
 | 
					ARROW_LIB = /usr/local/lib/libarrow.so
 | 
				
			||||||
BOOST_LIB = /usr/lib/x86_64-linux-gnu/libboost_regex.so
 | 
					BOOST_LIB = /usr/lib/x86_64-linux-gnu/libboost_regex.so
 | 
				
			||||||
 | 
					
 | 
				
			||||||
LDFLAGS = -O3 $(PARQUET_LIB) $(THRIFT_LIB) $(ARROW_LIB) $(BOOST_LIB)
 | 
					LDFLAGS = -O3 $(PARQUET_LIB) $(THRIFT_LIB) $(ARROW_LIB) $(BOOST_LIB)
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -34,21 +34,15 @@ bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint constraint, st
 | 
				
			|||||||
  parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>* stats =
 | 
					  parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>* stats =
 | 
				
			||||||
    (parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>*)_stats.get();
 | 
					    (parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>*)_stats.get();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // TODO: parquet-cpp doesn't seem to support stats for UTF8?
 | 
					 | 
				
			||||||
  // 1) empirically, the following is false on a few parquets I've tested
 | 
					 | 
				
			||||||
  // 2) https://github.com/apache/parquet-cpp/blob/master/src/parquet/metadata.cc#L116 seems to say UTF8 not supported
 | 
					 | 
				
			||||||
  // 3) OTOH, https://issues.apache.org/jira/browse/ARROW-1982 seems to say it's supported
 | 
					 | 
				
			||||||
  if(!stats->HasMinMax()) {
 | 
					  if(!stats->HasMinMax()) {
 | 
				
			||||||
    return true;
 | 
					    return true;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  /*
 | 
					 | 
				
			||||||
  parquet::ByteArray min = stats->min();
 | 
					  parquet::ByteArray min = stats->min();
 | 
				
			||||||
  parquet::ByteArray max = stats->max();
 | 
					  parquet::ByteArray max = stats->max();
 | 
				
			||||||
  std::string minStr((const char*)min.ptr, min.len);
 | 
					  std::string minStr((const char*)min.ptr, min.len);
 | 
				
			||||||
  std::string maxStr((const char*)max.ptr, max.len);
 | 
					  std::string maxStr((const char*)max.ptr, max.len);
 | 
				
			||||||
  printf("min=%s [%d], max=%s [%d]\n", minStr.data(), min.len, maxStr.data(), max.len);
 | 
					  printf("min=%s [%d], max=%s [%d]\n", minStr.data(), min.len, maxStr.data(), max.len);
 | 
				
			||||||
  */
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  switch(constraint.getOperator()) {
 | 
					  switch(constraint.getOperator()) {
 | 
				
			||||||
    case Is:
 | 
					    case Is:
 | 
				
			||||||
@@ -483,7 +477,11 @@ void ParquetCursor::reset(std::vector<Constraint> constraints) {
 | 
				
			|||||||
  rowId = -1;
 | 
					  rowId = -1;
 | 
				
			||||||
  // TODO: consider having a long lived handle in ParquetTable that can be borrowed
 | 
					  // TODO: consider having a long lived handle in ParquetTable that can be borrowed
 | 
				
			||||||
  // without incurring the cost of opening the file from scratch twice
 | 
					  // without incurring the cost of opening the file from scratch twice
 | 
				
			||||||
  reader = parquet::ParquetFileReader::OpenFile(table->file.data());
 | 
					  reader = parquet::ParquetFileReader::OpenFile(
 | 
				
			||||||
 | 
					      table->file.data(),
 | 
				
			||||||
 | 
					      true,
 | 
				
			||||||
 | 
					      parquet::default_reader_properties(),
 | 
				
			||||||
 | 
					      table->getMetadata());
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  rowGroupId = -1;
 | 
					  rowGroupId = -1;
 | 
				
			||||||
  rowGroupSize = 0;
 | 
					  rowGroupSize = 0;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -4,6 +4,9 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
ParquetTable::ParquetTable(std::string file) {
 | 
					ParquetTable::ParquetTable(std::string file) {
 | 
				
			||||||
  this->file = file;
 | 
					  this->file = file;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::unique_ptr<parquet::ParquetFileReader> reader = parquet::ParquetFileReader::OpenFile(file.data());
 | 
				
			||||||
 | 
					  metadata = reader->metadata();
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
std::string ParquetTable::columnName(int i) {
 | 
					std::string ParquetTable::columnName(int i) {
 | 
				
			||||||
@@ -13,7 +16,11 @@ std::string ParquetTable::columnName(int i) {
 | 
				
			|||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
std::string ParquetTable::CreateStatement() {
 | 
					std::string ParquetTable::CreateStatement() {
 | 
				
			||||||
  std::unique_ptr<parquet::ParquetFileReader> reader = parquet::ParquetFileReader::OpenFile(file.data());
 | 
					  std::unique_ptr<parquet::ParquetFileReader> reader = parquet::ParquetFileReader::OpenFile(
 | 
				
			||||||
 | 
					      file.data(),
 | 
				
			||||||
 | 
					      true,
 | 
				
			||||||
 | 
					      parquet::default_reader_properties(),
 | 
				
			||||||
 | 
					      metadata);
 | 
				
			||||||
  std::string text("CREATE TABLE x(");
 | 
					  std::string text("CREATE TABLE x(");
 | 
				
			||||||
  auto schema = reader->metadata()->schema();
 | 
					  auto schema = reader->metadata()->schema();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -129,3 +136,5 @@ std::string ParquetTable::CreateStatement() {
 | 
				
			|||||||
  text +=");";
 | 
					  text +=");";
 | 
				
			||||||
  return text;
 | 
					  return text;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					std::shared_ptr<parquet::FileMetaData> ParquetTable::getMetadata() { return metadata; }
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -3,15 +3,19 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
#include <vector>
 | 
					#include <vector>
 | 
				
			||||||
#include <string>
 | 
					#include <string>
 | 
				
			||||||
 | 
					#include "parquet/api/reader.h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class ParquetTable {
 | 
					class ParquetTable {
 | 
				
			||||||
  std::vector<std::string> columnNames;
 | 
					  std::vector<std::string> columnNames;
 | 
				
			||||||
 | 
					  std::shared_ptr<parquet::FileMetaData> metadata;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
public:
 | 
					public:
 | 
				
			||||||
  ParquetTable(std::string file);
 | 
					  ParquetTable(std::string file);
 | 
				
			||||||
  std::string CreateStatement();
 | 
					  std::string CreateStatement();
 | 
				
			||||||
  std::string file;
 | 
					  std::string file;
 | 
				
			||||||
  std::string columnName(int idx);
 | 
					  std::string columnName(int idx);
 | 
				
			||||||
 | 
					  std::shared_ptr<parquet::FileMetaData> getMetadata();
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user