diff --git a/README.md b/README.md index 99c7555..3718571 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,17 @@ -# parquet-vtable +# sqlite-parquet-vtable A SQLite [virtual table](https://sqlite.org/vtab.html) extension to expose Parquet files as SQL tables. +## Caveats + +I'm not a professional C/C++ programmer. These are the caveats I'm aware of, but there are probably others: + +* I don't use `sqlite3_malloc` and `sqlite3_free` for C++ objects + * Maybe this doesn't matter, since portability isn't a goal +* The C (SQLite API implementation) -> C++ interop (to talk to parquet-cpp) probably leaks some C++ exceptions + * Your process may crash due to my error. Sorry! + * I handle the obvious cases like file not found and unsupported Parquet types but I suspect low memory conditions aren't handled gracefully + ## Building 1. Install [`parquet-cpp`](https://github.com/apache/parquet-cpp) @@ -18,3 +28,26 @@ sqlite> create virtual table demo USING parquet('demo.parquet'); sqlite> select * from demo limit 1; ...if all goes well, you'll see data here!... ``` + +## Supported features + +### Index + +Only full table scans are supported. 
+ +### Types + +These types are supported: + +* INT96 timestamps (exposed as milliseconds since the epoch) +* INT8/INT16/INT32/INT64 +* UTF8 strings +* BOOLEAN +* FLOAT +* DOUBLE + +These are not supported: + +* UINT8/UINT16/UINT32/UINT64 +* Fixed length byte arrays, and byte arrays that are not UTF8 strings (including the JSON and BSON subtypes) +* DECIMAL diff --git a/parquet/parquet_table.cc b/parquet/parquet_table.cc index 1f23010..eb76dd9 100644 --- a/parquet/parquet_table.cc +++ b/parquet/parquet_table.cc @@ -35,45 +35,65 @@ std::string ParquetTable::CreateStatement() { text += col->name(); std::string type; - switch(col->physical_type()) { - case parquet::Type::BOOLEAN: - type = "TINYINT"; - break; - case parquet::Type::INT32: - if(col->logical_type() == parquet::LogicalType::NONE) { - type = "INT"; - } else if(col->logical_type() == parquet::LogicalType::INT_8) { + + parquet::Type::type physical = col->physical_type(); + parquet::LogicalType::type logical = col->logical_type(); + // Be explicit about which types we understand so we don't mislead someone + // whose unsigned ints start getting interpreted as signed. (We could + // support this for UINT_8/16/32 -- and for UINT_64 we could throw if + // the high bit was set.) 
+ if(logical == parquet::LogicalType::NONE || + logical == parquet::LogicalType::UTF8 || + logical == parquet::LogicalType::DATE || + logical == parquet::LogicalType::TIME_MILLIS || + logical == parquet::LogicalType::TIMESTAMP_MILLIS || + logical == parquet::LogicalType::TIME_MICROS || + logical == parquet::LogicalType::TIMESTAMP_MICROS || + logical == parquet::LogicalType::INT_8 || + logical == parquet::LogicalType::INT_16 || + logical == parquet::LogicalType::INT_32 || + logical == parquet::LogicalType::INT_64) { + switch(physical) { + case parquet::Type::BOOLEAN: type = "TINYINT"; - } else if(col->logical_type() == parquet::LogicalType::INT_16) { - type = "SMALLINT"; - } - break; - case parquet::Type::INT96: - // INT96 is used for nanosecond precision on timestamps; we truncate - // to millisecond precision. - case parquet::Type::INT64: - type = "BIGINT"; - break; - case parquet::Type::FLOAT: - type = "REAL"; - break; - case parquet::Type::DOUBLE: - type = "DOUBLE"; - break; - case parquet::Type::BYTE_ARRAY: - if(col->logical_type() == parquet::LogicalType::UTF8) { - type = "TEXT"; - } - break; - case parquet::Type::FIXED_LEN_BYTE_ARRAY: - default: - break; + break; + case parquet::Type::INT32: + if(logical == parquet::LogicalType::NONE || + logical == parquet::LogicalType::INT_32) { + type = "INT"; + } else if(logical == parquet::LogicalType::INT_8) { + type = "TINYINT"; + } else if(logical == parquet::LogicalType::INT_16) { + type = "SMALLINT"; + } + break; + case parquet::Type::INT96: + // INT96 is used for nanosecond precision on timestamps; we truncate + // to millisecond precision. 
+ case parquet::Type::INT64: + type = "BIGINT"; + break; + case parquet::Type::FLOAT: + type = "REAL"; + break; + case parquet::Type::DOUBLE: + type = "DOUBLE"; + break; + case parquet::Type::BYTE_ARRAY: + if(logical == parquet::LogicalType::UTF8) { + type = "TEXT"; + } + break; + case parquet::Type::FIXED_LEN_BYTE_ARRAY: + default: + break; + } } if(type.empty()) { std::ostringstream ss; ss << __FILE__ << ":" << __LINE__ << ": column " << i << " has unsupported type: " << - parquet::TypeToString(col->physical_type()) << "/" << parquet::LogicalTypeToString(col->logical_type()); + parquet::TypeToString(physical) << "/" << parquet::LogicalTypeToString(logical); throw std::invalid_argument(ss.str()); }