mirror of
				https://github.com/cldellow/sqlite-parquet-vtable.git
				synced 2025-10-31 02:19:56 +00:00 
			
		
		
		
	Reuse FileMetaData
For the statscan dataset, parsing the file metadata takes ~30-40 ms, so cache it on the ParquetTable for future reuse.
This commit is contained in:
		| @@ -5,7 +5,7 @@ CC = g++ | |||||||
| CFLAGS = -I ../sqlite -O3 -std=c++11 -Wall -fPIC -g | CFLAGS = -I ../sqlite -O3 -std=c++11 -Wall -fPIC -g | ||||||
| PARQUET_LIB = $(PARQUET_CPP)/build/release/libparquet.a | PARQUET_LIB = $(PARQUET_CPP)/build/release/libparquet.a | ||||||
| THRIFT_LIB = $(PARQUET_CPP)/thrift_ep/src/thrift_ep-install/lib/libthrift.a | THRIFT_LIB = $(PARQUET_CPP)/thrift_ep/src/thrift_ep-install/lib/libthrift.a | ||||||
| ARROW_LIB = $(PARQUET_CPP)/build/release/libarrow.so | ARROW_LIB = /usr/local/lib/libarrow.so | ||||||
| BOOST_LIB = /usr/lib/x86_64-linux-gnu/libboost_regex.so | BOOST_LIB = /usr/lib/x86_64-linux-gnu/libboost_regex.so | ||||||
|  |  | ||||||
| LDFLAGS = -O3 $(PARQUET_LIB) $(THRIFT_LIB) $(ARROW_LIB) $(BOOST_LIB) | LDFLAGS = -O3 $(PARQUET_LIB) $(THRIFT_LIB) $(ARROW_LIB) $(BOOST_LIB) | ||||||
|   | |||||||
| @@ -34,21 +34,15 @@ bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint constraint, st | |||||||
|   parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>* stats = |   parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>* stats = | ||||||
|     (parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>*)_stats.get(); |     (parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>*)_stats.get(); | ||||||
|  |  | ||||||
|   // TODO: parquet-cpp doesn't seem to support stats for UTF8? |  | ||||||
|   // 1) empirically, the following is false on a few parquets I've tested |  | ||||||
|   // 2) https://github.com/apache/parquet-cpp/blob/master/src/parquet/metadata.cc#L116 seems to say UTF8 not supported |  | ||||||
|   // 3) OTOH, https://issues.apache.org/jira/browse/ARROW-1982 seems to say it's supported |  | ||||||
|   if(!stats->HasMinMax()) { |   if(!stats->HasMinMax()) { | ||||||
|     return true; |     return true; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   /* |  | ||||||
|   parquet::ByteArray min = stats->min(); |   parquet::ByteArray min = stats->min(); | ||||||
|   parquet::ByteArray max = stats->max(); |   parquet::ByteArray max = stats->max(); | ||||||
|   std::string minStr((const char*)min.ptr, min.len); |   std::string minStr((const char*)min.ptr, min.len); | ||||||
|   std::string maxStr((const char*)max.ptr, max.len); |   std::string maxStr((const char*)max.ptr, max.len); | ||||||
|   printf("min=%s [%d], max=%s [%d]\n", minStr.data(), min.len, maxStr.data(), max.len); |   printf("min=%s [%d], max=%s [%d]\n", minStr.data(), min.len, maxStr.data(), max.len); | ||||||
|   */ |  | ||||||
|  |  | ||||||
|   switch(constraint.getOperator()) { |   switch(constraint.getOperator()) { | ||||||
|     case Is: |     case Is: | ||||||
| @@ -483,7 +477,11 @@ void ParquetCursor::reset(std::vector<Constraint> constraints) { | |||||||
|   rowId = -1; |   rowId = -1; | ||||||
|   // TODO: consider having a long lived handle in ParquetTable that can be borrowed |   // TODO: consider having a long lived handle in ParquetTable that can be borrowed | ||||||
|   // without incurring the cost of opening the file from scratch twice |   // without incurring the cost of opening the file from scratch twice | ||||||
|   reader = parquet::ParquetFileReader::OpenFile(table->file.data()); |   reader = parquet::ParquetFileReader::OpenFile( | ||||||
|  |       table->file.data(), | ||||||
|  |       true, | ||||||
|  |       parquet::default_reader_properties(), | ||||||
|  |       table->getMetadata()); | ||||||
|  |  | ||||||
|   rowGroupId = -1; |   rowGroupId = -1; | ||||||
|   rowGroupSize = 0; |   rowGroupSize = 0; | ||||||
|   | |||||||
| @@ -4,6 +4,9 @@ | |||||||
|  |  | ||||||
| ParquetTable::ParquetTable(std::string file) { | ParquetTable::ParquetTable(std::string file) { | ||||||
|   this->file = file; |   this->file = file; | ||||||
|  |  | ||||||
|  |   std::unique_ptr<parquet::ParquetFileReader> reader = parquet::ParquetFileReader::OpenFile(file.data()); | ||||||
|  |   metadata = reader->metadata(); | ||||||
| } | } | ||||||
|  |  | ||||||
| std::string ParquetTable::columnName(int i) { | std::string ParquetTable::columnName(int i) { | ||||||
| @@ -13,7 +16,11 @@ std::string ParquetTable::columnName(int i) { | |||||||
| } | } | ||||||
|  |  | ||||||
| std::string ParquetTable::CreateStatement() { | std::string ParquetTable::CreateStatement() { | ||||||
|   std::unique_ptr<parquet::ParquetFileReader> reader = parquet::ParquetFileReader::OpenFile(file.data()); |   std::unique_ptr<parquet::ParquetFileReader> reader = parquet::ParquetFileReader::OpenFile( | ||||||
|  |       file.data(), | ||||||
|  |       true, | ||||||
|  |       parquet::default_reader_properties(), | ||||||
|  |       metadata); | ||||||
|   std::string text("CREATE TABLE x("); |   std::string text("CREATE TABLE x("); | ||||||
|   auto schema = reader->metadata()->schema(); |   auto schema = reader->metadata()->schema(); | ||||||
|  |  | ||||||
| @@ -129,3 +136,5 @@ std::string ParquetTable::CreateStatement() { | |||||||
|   text +=");"; |   text +=");"; | ||||||
|   return text; |   return text; | ||||||
| } | } | ||||||
|  |  | ||||||
|  | std::shared_ptr<parquet::FileMetaData> ParquetTable::getMetadata() { return metadata; } | ||||||
|   | |||||||
| @@ -3,15 +3,19 @@ | |||||||
|  |  | ||||||
| #include <vector> | #include <vector> | ||||||
| #include <string> | #include <string> | ||||||
|  | #include "parquet/api/reader.h" | ||||||
|  |  | ||||||
| class ParquetTable { | class ParquetTable { | ||||||
|   std::vector<std::string> columnNames; |   std::vector<std::string> columnNames; | ||||||
|  |   std::shared_ptr<parquet::FileMetaData> metadata; | ||||||
|  |  | ||||||
|  |  | ||||||
| public: | public: | ||||||
|   ParquetTable(std::string file); |   ParquetTable(std::string file); | ||||||
|   std::string CreateStatement(); |   std::string CreateStatement(); | ||||||
|   std::string file; |   std::string file; | ||||||
|   std::string columnName(int idx); |   std::string columnName(int idx); | ||||||
|  |   std::shared_ptr<parquet::FileMetaData> getMetadata(); | ||||||
| }; | }; | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Colin Dellow
					Colin Dellow