More defensive, add caveats
This commit is contained in:
parent
eb0b48f867
commit
18f07f4c43
35
README.md
35
README.md
|
@ -1,7 +1,17 @@
|
||||||
# parquet-vtable
|
# sqlite-parquet-vtable
|
||||||
|
|
||||||
A SQLite [virtual table](https://sqlite.org/vtab.html) extension to expose Parquet files as SQL tables.
|
A SQLite [virtual table](https://sqlite.org/vtab.html) extension to expose Parquet files as SQL tables.
|
||||||
|
|
||||||
|
## Caveats
|
||||||
|
|
||||||
|
I'm not a professional C/C++ programmer. These are the caveats I'm aware of, but there are probably others:
|
||||||
|
|
||||||
|
* I don't use `sqlite3_malloc` and `sqlite3_free` for C++ objects
|
||||||
|
* Maybe this doesn't matter, since portability isn't a goal
|
||||||
|
* The C (SQLite API implementation) -> C++ interop (to talk to parquet-cpp) probably lets some C++ exceptions escape across the C boundary
|
||||||
|
* Your process may crash due to my error. Sorry!
|
||||||
|
* I handle the obvious cases, like file not found and unsupported Parquet types, but I suspect low-memory conditions aren't handled gracefully
|
||||||
|
|
||||||
## Building
|
## Building
|
||||||
|
|
||||||
1. Install [`parquet-cpp`](https://github.com/apache/parquet-cpp)
|
1. Install [`parquet-cpp`](https://github.com/apache/parquet-cpp)
|
||||||
|
@ -18,3 +28,26 @@ sqlite> create virtual table demo USING parquet('demo.parquet');
|
||||||
sqlite> select * from demo limit 1;
|
sqlite> select * from demo limit 1;
|
||||||
...if all goes well, you'll see data here!...
|
...if all goes well, you'll see data here!...
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Supported features
|
||||||
|
|
||||||
|
### Index
|
||||||
|
|
||||||
|
Only full table scans are supported.
|
||||||
|
|
||||||
|
### Types
|
||||||
|
|
||||||
|
These types are supported:
|
||||||
|
|
||||||
|
* INT96 timestamps (exposed as milliseconds since the epoch)
|
||||||
|
* INT8/INT16/INT32/INT64
|
||||||
|
* UTF8 strings
|
||||||
|
* BOOLEAN
|
||||||
|
* FLOAT
|
||||||
|
* DOUBLE
|
||||||
|
|
||||||
|
These are not supported:
|
||||||
|
|
||||||
|
* UINT8/UINT16/UINT32/UINT64
|
||||||
|
* Fixed-length byte arrays, including JSON and BSON subtypes
|
||||||
|
* DECIMAL
|
||||||
|
|
|
@ -35,45 +35,65 @@ std::string ParquetTable::CreateStatement() {
|
||||||
text += col->name();
|
text += col->name();
|
||||||
|
|
||||||
std::string type;
|
std::string type;
|
||||||
switch(col->physical_type()) {
|
|
||||||
case parquet::Type::BOOLEAN:
|
parquet::Type::type physical = col->physical_type();
|
||||||
type = "TINYINT";
|
parquet::LogicalType::type logical = col->logical_type();
|
||||||
break;
|
// Be explicit about which types we understand so we don't mislead someone
|
||||||
case parquet::Type::INT32:
|
// whose unsigned ints start getting interpreted as signed. (We could
|
||||||
if(col->logical_type() == parquet::LogicalType::NONE) {
|
// support this for UINT_8/16/32 -- and for UINT_64 we could throw if
|
||||||
type = "INT";
|
// the high bit was set.)
|
||||||
} else if(col->logical_type() == parquet::LogicalType::INT_8) {
|
if(logical == parquet::LogicalType::NONE ||
|
||||||
|
logical == parquet::LogicalType::UTF8 ||
|
||||||
|
logical == parquet::LogicalType::DATE ||
|
||||||
|
logical == parquet::LogicalType::TIME_MILLIS ||
|
||||||
|
logical == parquet::LogicalType::TIMESTAMP_MILLIS ||
|
||||||
|
logical == parquet::LogicalType::TIME_MICROS ||
|
||||||
|
logical == parquet::LogicalType::TIMESTAMP_MICROS ||
|
||||||
|
logical == parquet::LogicalType::INT_8 ||
|
||||||
|
logical == parquet::LogicalType::INT_16 ||
|
||||||
|
logical == parquet::LogicalType::INT_32 ||
|
||||||
|
logical == parquet::LogicalType::INT_64) {
|
||||||
|
switch(physical) {
|
||||||
|
case parquet::Type::BOOLEAN:
|
||||||
type = "TINYINT";
|
type = "TINYINT";
|
||||||
} else if(col->logical_type() == parquet::LogicalType::INT_16) {
|
break;
|
||||||
type = "SMALLINT";
|
case parquet::Type::INT32:
|
||||||
}
|
if(logical == parquet::LogicalType::NONE ||
|
||||||
break;
|
logical == parquet::LogicalType::INT_32) {
|
||||||
case parquet::Type::INT96:
|
type = "INT";
|
||||||
// INT96 is used for nanosecond precision on timestamps; we truncate
|
} else if(logical == parquet::LogicalType::INT_8) {
|
||||||
// to millisecond precision.
|
type = "TINYINT";
|
||||||
case parquet::Type::INT64:
|
} else if(logical == parquet::LogicalType::INT_16) {
|
||||||
type = "BIGINT";
|
type = "SMALLINT";
|
||||||
break;
|
}
|
||||||
case parquet::Type::FLOAT:
|
break;
|
||||||
type = "REAL";
|
case parquet::Type::INT96:
|
||||||
break;
|
// INT96 is used for nanosecond precision on timestamps; we truncate
|
||||||
case parquet::Type::DOUBLE:
|
// to millisecond precision.
|
||||||
type = "DOUBLE";
|
case parquet::Type::INT64:
|
||||||
break;
|
type = "BIGINT";
|
||||||
case parquet::Type::BYTE_ARRAY:
|
break;
|
||||||
if(col->logical_type() == parquet::LogicalType::UTF8) {
|
case parquet::Type::FLOAT:
|
||||||
type = "TEXT";
|
type = "REAL";
|
||||||
}
|
break;
|
||||||
break;
|
case parquet::Type::DOUBLE:
|
||||||
case parquet::Type::FIXED_LEN_BYTE_ARRAY:
|
type = "DOUBLE";
|
||||||
default:
|
break;
|
||||||
break;
|
case parquet::Type::BYTE_ARRAY:
|
||||||
|
if(logical == parquet::LogicalType::UTF8) {
|
||||||
|
type = "TEXT";
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case parquet::Type::FIXED_LEN_BYTE_ARRAY:
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if(type.empty()) {
|
if(type.empty()) {
|
||||||
std::ostringstream ss;
|
std::ostringstream ss;
|
||||||
ss << __FILE__ << ":" << __LINE__ << ": column " << i << " has unsupported type: " <<
|
ss << __FILE__ << ":" << __LINE__ << ": column " << i << " has unsupported type: " <<
|
||||||
parquet::TypeToString(col->physical_type()) << "/" << parquet::LogicalTypeToString(col->logical_type());
|
parquet::TypeToString(physical) << "/" << parquet::LogicalTypeToString(logical);
|
||||||
|
|
||||||
throw std::invalid_argument(ss.str());
|
throw std::invalid_argument(ss.str());
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue