More defensive, add caveats

2026-01-09 18:54:13 +00:00 · 2018-03-03 20:26:41 -05:00
parent eb0b48f867
commit 18f07f4c43
2 changed files with 87 additions and 34 deletions
--- a/README.md
+++ b/README.md
@@ -1,7 +1,17 @@
-# parquet-vtable
+# sqlite-parquet-vtable

 A SQLite [virtual table](https://sqlite.org/vtab.html) extension to expose Parquet files as SQL tables.

+## Caveats
+
+I'm not a professional C/C++ programmer. These are the caveats I'm aware of, but there are probably others:
+
+* I don't use `sqlite3_malloc` and `sqlite3_free` for C++ objects
+  * Maybe this doesn't matter, since portability isn't a goal
+* The C (SQLite API implementation) -> C++ interop (to talk to parquet-cpp) probably leaks some C++ exceptions
+  * Your process may crash due to my error. Sorry!
+  * I handle the obvious cases, like file not found and unsupported Parquet types, but I suspect low-memory conditions aren't handled gracefully
+
 ## Building

 1. Install [`parquet-cpp`](https://github.com/apache/parquet-cpp)
@@ -18,3 +28,26 @@ sqlite> create virtual table demo USING parquet('demo.parquet');
 sqlite> select * from demo limit 1;
 ...if all goes well, you'll see data here!...
 ```
+
+## Supported features
+
+### Index
+
+Only full table scans are supported.
+
+### Types
+
+These types are supported:
+
+* INT96 timestamps (exposed as milliseconds since the epoch)
+* INT8/INT16/INT32/INT64
+* UTF8 strings
+* BOOLEAN
+* FLOAT
+* DOUBLE
+
+These are not supported:
+
+* UINT8/UINT16/UINT32/UINT64
+* Fixed-length byte arrays, including JSON and BSON subtypes
+* DECIMAL
--- a/parquet/parquet_table.cc
+++ b/parquet/parquet_table.cc
@@ -35,45 +35,65 @@ std::string ParquetTable::CreateStatement() {
    text += col->name();

    std::string type;
-    switch(col->physical_type()) {
-      case parquet::Type::BOOLEAN:
-        type = "TINYINT";
-        break;
-      case parquet::Type::INT32:
-        if(col->logical_type() == parquet::LogicalType::NONE) {
-          type = "INT";
-        } else if(col->logical_type() == parquet::LogicalType::INT_8) {
+
+    parquet::Type::type physical = col->physical_type();
+    parquet::LogicalType::type logical = col->logical_type();
+    // Be explicit about which types we understand so we don't mislead someone
+    // whose unsigned ints start getting interpreted as signed. (We could
+    // support this for UINT_8/16/32 -- and for UINT_64 we could throw if
+    // the high bit was set.)
+    if(logical == parquet::LogicalType::NONE ||
+        logical == parquet::LogicalType::UTF8 ||
+        logical == parquet::LogicalType::DATE ||
+        logical == parquet::LogicalType::TIME_MILLIS ||
+        logical == parquet::LogicalType::TIMESTAMP_MILLIS ||
+        logical == parquet::LogicalType::TIME_MICROS ||
+        logical == parquet::LogicalType::TIMESTAMP_MICROS ||
+        logical == parquet::LogicalType::INT_8 ||
+        logical == parquet::LogicalType::INT_16 ||
+        logical == parquet::LogicalType::INT_32 ||
+        logical == parquet::LogicalType::INT_64) {
+      switch(physical) {
+        case parquet::Type::BOOLEAN:
          type = "TINYINT";
-        } else if(col->logical_type() == parquet::LogicalType::INT_16) {
-          type = "SMALLINT";
-        }
-        break;
-      case parquet::Type::INT96:
-        // INT96 is used for nanosecond precision on timestamps; we truncate
-        // to millisecond precision.
-      case parquet::Type::INT64:
-        type = "BIGINT";
-        break;
-      case parquet::Type::FLOAT:
-        type = "REAL";
-        break;
-      case parquet::Type::DOUBLE:
-        type = "DOUBLE";
-        break;
-      case parquet::Type::BYTE_ARRAY:
-        if(col->logical_type() == parquet::LogicalType::UTF8) {
-          type = "TEXT";
-        }
-        break;
-      case parquet::Type::FIXED_LEN_BYTE_ARRAY:
-      default:
-        break;
+          break;
+        case parquet::Type::INT32:
+          if(logical == parquet::LogicalType::NONE ||
+              logical == parquet::LogicalType::INT_32) {
+            type = "INT";
+          } else if(logical == parquet::LogicalType::INT_8) {
+            type = "TINYINT";
+          } else if(logical == parquet::LogicalType::INT_16) {
+            type = "SMALLINT";
+          }
+          break;
+        case parquet::Type::INT96:
+          // INT96 is used for nanosecond precision on timestamps; we truncate
+          // to millisecond precision.
+        case parquet::Type::INT64:
+          type = "BIGINT";
+          break;
+        case parquet::Type::FLOAT:
+          type = "REAL";
+          break;
+        case parquet::Type::DOUBLE:
+          type = "DOUBLE";
+          break;
+        case parquet::Type::BYTE_ARRAY:
+          if(logical == parquet::LogicalType::UTF8) {
+            type = "TEXT";
+          }
+          break;
+        case parquet::Type::FIXED_LEN_BYTE_ARRAY:
+        default:
+          break;
+      }
    }

    if(type.empty()) {
      std::ostringstream ss;
      ss << __FILE__ << ":" << __LINE__ << ": column " << i << " has unsupported type: " <<
-        parquet::TypeToString(col->physical_type()) << "/" << parquet::LogicalTypeToString(col->logical_type());
+        parquet::TypeToString(physical) << "/" << parquet::LogicalTypeToString(logical);

      throw std::invalid_argument(ss.str());
    }