// Mirror of https://github.com/cldellow/sqlite-parquet-vtable.git
// Synced 2025-04-03 09:39:47 +00:00
// 156 lines, 4.8 KiB, C++
#include "parquet_table.h"
|
|
|
|
#include "parquet/api/reader.h"
|
|
|
|
// Constructs a table description for the Parquet file at `file`, remembering
// the path and the given `tableName`. The file is opened once here so its
// metadata can be stored in `metadata`; the reader itself is local and is
// released when the constructor returns.
ParquetTable::ParquetTable(std::string file, std::string tableName)
    : file(file), tableName(tableName) {
  // Inside the body `file` names the constructor parameter, not the member.
  auto fileReader = parquet::ParquetFileReader::OpenFile(file.c_str());
  this->metadata = fileReader->metadata();
}
|
|
|
|
std::string ParquetTable::columnName(int i) {
|
|
if (i == -1)
|
|
return "rowid";
|
|
return columnNames[i];
|
|
}
|
|
|
|
// Returns how many column names are currently stored in columnNames
// (filled in by CreateStatement()), narrowed to unsigned int.
unsigned int ParquetTable::getNumColumns() {
  return static_cast<unsigned int>(columnNames.size());
}
|
|
|
|
std::string ParquetTable::CreateStatement() {
|
|
std::unique_ptr<parquet::ParquetFileReader> reader =
|
|
parquet::ParquetFileReader::OpenFile(
|
|
file.data(), true, parquet::default_reader_properties(), metadata);
|
|
std::string text("CREATE TABLE x(");
|
|
auto schema = reader->metadata()->schema();
|
|
|
|
for (auto i = 0; i < schema->num_columns(); i++) {
|
|
auto _col = schema->GetColumnRoot(i);
|
|
columnNames.push_back(_col->name());
|
|
}
|
|
|
|
for (auto i = 0; i < schema->num_columns(); i++) {
|
|
auto _col = schema->GetColumnRoot(i);
|
|
|
|
if (!_col->is_primitive()) {
|
|
std::ostringstream ss;
|
|
ss << __FILE__ << ":" << __LINE__ << ": column " << i
|
|
<< " has non-primitive type";
|
|
throw std::invalid_argument(ss.str());
|
|
}
|
|
|
|
if (_col->is_repeated()) {
|
|
std::ostringstream ss;
|
|
ss << __FILE__ << ":" << __LINE__ << ": column " << i
|
|
<< " has non-scalar type";
|
|
throw std::invalid_argument(ss.str());
|
|
}
|
|
|
|
parquet::schema::PrimitiveNode *col =
|
|
(parquet::schema::PrimitiveNode *)_col;
|
|
|
|
if (i > 0)
|
|
text += ", ";
|
|
|
|
text += "\"";
|
|
// Horrifically inefficient, but easy to understand.
|
|
std::string colName = col->name();
|
|
for (char &c : colName) {
|
|
if (c == '"')
|
|
text += "\"\"";
|
|
else
|
|
text += c;
|
|
}
|
|
text += "\"";
|
|
|
|
std::string type;
|
|
|
|
parquet::Type::type physical = col->physical_type();
|
|
parquet::LogicalType::type logical = col->logical_type();
|
|
// Be explicit about which types we understand so we don't mislead someone
|
|
// whose unsigned ints start getting interpreted as signed. (We could
|
|
// support this for UINT_8/16/32 -- and for UINT_64 we could throw if
|
|
// the high bit was set.)
|
|
if (logical == parquet::LogicalType::NONE ||
|
|
logical == parquet::LogicalType::UTF8 ||
|
|
logical == parquet::LogicalType::DATE ||
|
|
logical == parquet::LogicalType::TIME_MILLIS ||
|
|
logical == parquet::LogicalType::TIMESTAMP_MILLIS ||
|
|
logical == parquet::LogicalType::TIME_MICROS ||
|
|
logical == parquet::LogicalType::TIMESTAMP_MICROS ||
|
|
logical == parquet::LogicalType::INT_8 ||
|
|
logical == parquet::LogicalType::INT_16 ||
|
|
logical == parquet::LogicalType::INT_32 ||
|
|
logical == parquet::LogicalType::INT_64) {
|
|
switch (physical) {
|
|
case parquet::Type::BOOLEAN:
|
|
type = "TINYINT";
|
|
break;
|
|
case parquet::Type::INT32:
|
|
if (logical == parquet::LogicalType::NONE ||
|
|
logical == parquet::LogicalType::INT_32) {
|
|
type = "INT";
|
|
} else if (logical == parquet::LogicalType::INT_8) {
|
|
type = "TINYINT";
|
|
} else if (logical == parquet::LogicalType::INT_16) {
|
|
type = "SMALLINT";
|
|
}
|
|
break;
|
|
case parquet::Type::INT96:
|
|
// INT96 is used for nanosecond precision on timestamps; we truncate
|
|
// to millisecond precision.
|
|
case parquet::Type::INT64:
|
|
type = "BIGINT";
|
|
break;
|
|
case parquet::Type::FLOAT:
|
|
type = "REAL";
|
|
break;
|
|
case parquet::Type::DOUBLE:
|
|
type = "DOUBLE";
|
|
break;
|
|
case parquet::Type::BYTE_ARRAY:
|
|
if (logical == parquet::LogicalType::UTF8) {
|
|
type = "TEXT";
|
|
} else {
|
|
type = "BLOB";
|
|
}
|
|
break;
|
|
case parquet::Type::FIXED_LEN_BYTE_ARRAY:
|
|
type = "BLOB";
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (type.empty()) {
|
|
std::ostringstream ss;
|
|
ss << __FILE__ << ":" << __LINE__ << ": column " << i
|
|
<< " has unsupported type: " << parquet::TypeToString(physical) << "/"
|
|
<< parquet::LogicalTypeToString(logical);
|
|
|
|
throw std::invalid_argument(ss.str());
|
|
}
|
|
|
|
#ifdef DEBUG
|
|
printf(
|
|
"col %d[name=%s, p=%d:%s, l=%d:%s] is %s\n", i, col->name().data(),
|
|
col->physical_type(),
|
|
parquet::TypeToString(col->physical_type()).data(), col->logical_type(),
|
|
parquet::LogicalTypeToString(col->logical_type()).data(), type.data());
|
|
#endif
|
|
|
|
text += " ";
|
|
text += type;
|
|
}
|
|
text += ");";
|
|
return text;
|
|
}
|
|
|
|
std::shared_ptr<parquet::FileMetaData> ParquetTable::getMetadata() {
|
|
return metadata;
|
|
}
|
|
|
|
const std::string &ParquetTable::getFile() { return file; }
|
|
const std::string &ParquetTable::getTableName() { return tableName; }
|