More defensive, add caveats
This commit is contained in:
parent
eb0b48f867
commit
18f07f4c43
35
README.md
35
README.md
|
@ -1,7 +1,17 @@
|
|||
# parquet-vtable
|
||||
# sqlite-parquet-vtable
|
||||
|
||||
A SQLite [virtual table](https://sqlite.org/vtab.html) extension to expose Parquet files as SQL tables.
|
||||
|
||||
## Caveats
|
||||
|
||||
I'm not a professional C/C++ programmer. These are the caveats I'm aware of, but there are probably others:
|
||||
|
||||
* I don't use `sqlite3_malloc` and `sqlite3_free` for C++ objects
|
||||
* Maybe this doesn't matter, since portability isn't a goal
|
||||
* The C (SQLite API implementation) -> C++ (parquet-cpp) interop layer probably lets some C++ exceptions escape into SQLite's C code
|
||||
* Your process may crash due to my error. Sorry!
|
||||
* I handle the obvious cases, like file not found and unsupported Parquet types, but I suspect low-memory conditions aren't handled gracefully
|
||||
|
||||
## Building
|
||||
|
||||
1. Install [`parquet-cpp`](https://github.com/apache/parquet-cpp)
|
||||
|
@ -18,3 +28,26 @@ sqlite> create virtual table demo USING parquet('demo.parquet');
|
|||
sqlite> select * from demo limit 1;
|
||||
...if all goes well, you'll see data here!...
|
||||
```
|
||||
|
||||
## Supported features
|
||||
|
||||
### Index
|
||||
|
||||
Only full table scans are supported.
|
||||
|
||||
### Types
|
||||
|
||||
These types are supported:
|
||||
|
||||
* INT96 timestamps (exposed as milliseconds since the epoch)
|
||||
* INT8/INT16/INT32/INT64
|
||||
* UTF8 strings
|
||||
* BOOLEAN
|
||||
* FLOAT
|
||||
* DOUBLE
|
||||
|
||||
These are not supported:
|
||||
|
||||
* UINT8/UINT16/UINT32/UINT64
|
||||
* Fixed-length byte arrays, including JSON and BSON subtypes
|
||||
* DECIMAL
|
||||
|
|
|
@ -35,45 +35,65 @@ std::string ParquetTable::CreateStatement() {
|
|||
text += col->name();
|
||||
|
||||
std::string type;
|
||||
switch(col->physical_type()) {
|
||||
case parquet::Type::BOOLEAN:
|
||||
type = "TINYINT";
|
||||
break;
|
||||
case parquet::Type::INT32:
|
||||
if(col->logical_type() == parquet::LogicalType::NONE) {
|
||||
type = "INT";
|
||||
} else if(col->logical_type() == parquet::LogicalType::INT_8) {
|
||||
|
||||
parquet::Type::type physical = col->physical_type();
|
||||
parquet::LogicalType::type logical = col->logical_type();
|
||||
// Be explicit about which types we understand so we don't mislead someone
|
||||
// whose unsigned ints start getting interpreted as signed. (We could
|
||||
// support this for UINT_8/16/32 -- and for UINT_64 we could throw if
|
||||
// the high bit was set.)
|
||||
if(logical == parquet::LogicalType::NONE ||
|
||||
logical == parquet::LogicalType::UTF8 ||
|
||||
logical == parquet::LogicalType::DATE ||
|
||||
logical == parquet::LogicalType::TIME_MILLIS ||
|
||||
logical == parquet::LogicalType::TIMESTAMP_MILLIS ||
|
||||
logical == parquet::LogicalType::TIME_MICROS ||
|
||||
logical == parquet::LogicalType::TIMESTAMP_MICROS ||
|
||||
logical == parquet::LogicalType::INT_8 ||
|
||||
logical == parquet::LogicalType::INT_16 ||
|
||||
logical == parquet::LogicalType::INT_32 ||
|
||||
logical == parquet::LogicalType::INT_64) {
|
||||
switch(physical) {
|
||||
case parquet::Type::BOOLEAN:
|
||||
type = "TINYINT";
|
||||
} else if(col->logical_type() == parquet::LogicalType::INT_16) {
|
||||
type = "SMALLINT";
|
||||
}
|
||||
break;
|
||||
case parquet::Type::INT96:
|
||||
// INT96 is used for nanosecond precision on timestamps; we truncate
|
||||
// to millisecond precision.
|
||||
case parquet::Type::INT64:
|
||||
type = "BIGINT";
|
||||
break;
|
||||
case parquet::Type::FLOAT:
|
||||
type = "REAL";
|
||||
break;
|
||||
case parquet::Type::DOUBLE:
|
||||
type = "DOUBLE";
|
||||
break;
|
||||
case parquet::Type::BYTE_ARRAY:
|
||||
if(col->logical_type() == parquet::LogicalType::UTF8) {
|
||||
type = "TEXT";
|
||||
}
|
||||
break;
|
||||
case parquet::Type::FIXED_LEN_BYTE_ARRAY:
|
||||
default:
|
||||
break;
|
||||
break;
|
||||
case parquet::Type::INT32:
|
||||
if(logical == parquet::LogicalType::NONE ||
|
||||
logical == parquet::LogicalType::INT_32) {
|
||||
type = "INT";
|
||||
} else if(logical == parquet::LogicalType::INT_8) {
|
||||
type = "TINYINT";
|
||||
} else if(logical == parquet::LogicalType::INT_16) {
|
||||
type = "SMALLINT";
|
||||
}
|
||||
break;
|
||||
case parquet::Type::INT96:
|
||||
// INT96 is used for nanosecond precision on timestamps; we truncate
|
||||
// to millisecond precision.
|
||||
case parquet::Type::INT64:
|
||||
type = "BIGINT";
|
||||
break;
|
||||
case parquet::Type::FLOAT:
|
||||
type = "REAL";
|
||||
break;
|
||||
case parquet::Type::DOUBLE:
|
||||
type = "DOUBLE";
|
||||
break;
|
||||
case parquet::Type::BYTE_ARRAY:
|
||||
if(logical == parquet::LogicalType::UTF8) {
|
||||
type = "TEXT";
|
||||
}
|
||||
break;
|
||||
case parquet::Type::FIXED_LEN_BYTE_ARRAY:
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if(type.empty()) {
|
||||
std::ostringstream ss;
|
||||
ss << __FILE__ << ":" << __LINE__ << ": column " << i << " has unsupported type: " <<
|
||||
parquet::TypeToString(col->physical_type()) << "/" << parquet::LogicalTypeToString(col->logical_type());
|
||||
parquet::TypeToString(physical) << "/" << parquet::LogicalTypeToString(logical);
|
||||
|
||||
throw std::invalid_argument(ss.str());
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue