1
0
mirror of https://github.com/cldellow/sqlite-parquet-vtable.git synced 2025-04-03 09:39:47 +00:00

Run a formatting pass with clang-format to minimize future git churn

This commit is contained in:
Addie Morrison 2019-12-08 16:08:11 -06:00
parent ae194c69c5
commit 7bc6f91f6f
7 changed files with 1104 additions and 1165 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,13 +1,13 @@
#ifndef PARQUET_CURSOR_H #ifndef PARQUET_CURSOR_H
#define PARQUET_CURSOR_H #define PARQUET_CURSOR_H
#include "parquet/api/reader.h"
#include "parquet_filter.h" #include "parquet_filter.h"
#include "parquet_table.h" #include "parquet_table.h"
#include "parquet/api/reader.h"
class ParquetCursor { class ParquetCursor {
ParquetTable* table; ParquetTable *table;
std::unique_ptr<parquet::ParquetFileReader> reader; std::unique_ptr<parquet::ParquetFileReader> reader;
std::unique_ptr<parquet::RowGroupMetaData> rowGroupMetadata; std::unique_ptr<parquet::RowGroupMetaData> rowGroupMetadata;
std::shared_ptr<parquet::RowGroupReader> rowGroup; std::shared_ptr<parquet::RowGroupReader> rowGroup;
@ -35,19 +35,26 @@ class ParquetCursor {
bool currentRowSatisfiesFilter(); bool currentRowSatisfiesFilter();
bool currentRowGroupSatisfiesFilter(); bool currentRowGroupSatisfiesFilter();
bool currentRowGroupSatisfiesRowIdFilter(Constraint& constraint); bool currentRowGroupSatisfiesRowIdFilter(Constraint &constraint);
bool currentRowGroupSatisfiesTextFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats); bool currentRowGroupSatisfiesTextFilter(
bool currentRowGroupSatisfiesBlobFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats); Constraint &constraint,
bool currentRowGroupSatisfiesIntegerFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats); std::shared_ptr<parquet::RowGroupStatistics> stats);
bool currentRowGroupSatisfiesDoubleFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats); bool currentRowGroupSatisfiesBlobFilter(
Constraint &constraint,
bool currentRowSatisfiesTextFilter(Constraint& constraint); std::shared_ptr<parquet::RowGroupStatistics> stats);
bool currentRowSatisfiesIntegerFilter(Constraint& constraint); bool currentRowGroupSatisfiesIntegerFilter(
bool currentRowSatisfiesDoubleFilter(Constraint& constraint); Constraint &constraint,
std::shared_ptr<parquet::RowGroupStatistics> stats);
bool currentRowGroupSatisfiesDoubleFilter(
Constraint &constraint,
std::shared_ptr<parquet::RowGroupStatistics> stats);
bool currentRowSatisfiesTextFilter(Constraint &constraint);
bool currentRowSatisfiesIntegerFilter(Constraint &constraint);
bool currentRowSatisfiesDoubleFilter(Constraint &constraint);
public: public:
ParquetCursor(ParquetTable* table); ParquetCursor(ParquetTable *table);
int getRowId(); int getRowId();
void next(); void next();
void close(); void close();
@ -58,16 +65,15 @@ public:
bool isNull(int col); bool isNull(int col);
unsigned int getNumRowGroups() const; unsigned int getNumRowGroups() const;
unsigned int getNumConstraints() const; unsigned int getNumConstraints() const;
const Constraint& getConstraint(unsigned int i) const; const Constraint &getConstraint(unsigned int i) const;
parquet::Type::type getPhysicalType(int col); parquet::Type::type getPhysicalType(int col);
parquet::LogicalType::type getLogicalType(int col); parquet::LogicalType::type getLogicalType(int col);
ParquetTable* getTable() const; ParquetTable *getTable() const;
int getInt32(int col); int getInt32(int col);
long getInt64(int col); long getInt64(int col);
double getDouble(int col); double getDouble(int col);
parquet::ByteArray* getByteArray(int col); parquet::ByteArray *getByteArray(int col);
}; };
#endif #endif

View File

@ -1,40 +1,29 @@
#include "parquet_filter.h" #include "parquet_filter.h"
Constraint::Constraint( Constraint::Constraint(RowGroupBitmap bitmap, int column,
RowGroupBitmap bitmap, std::string columnName, ConstraintOperator op,
int column, ValueType type, int64_t intValue, double doubleValue,
std::string columnName, std::vector<unsigned char> blobValue)
ConstraintOperator op, : bitmap(bitmap), column(column), columnName(columnName), op(op),
ValueType type, type(type), intValue(intValue), doubleValue(doubleValue),
int64_t intValue, blobValue(blobValue), hadRows(false) {
double doubleValue, RowGroupBitmap bm = bitmap;
std::vector<unsigned char> blobValue this->bitmap = bm;
): bitmap(bitmap),
column(column),
columnName(columnName),
op(op),
type(type),
intValue(intValue),
doubleValue(doubleValue),
blobValue(blobValue),
hadRows(false) {
RowGroupBitmap bm = bitmap;
this->bitmap = bm;
if(type == Text) { if (type == Text) {
stringValue = std::string((char*)&blobValue[0], blobValue.size()); stringValue = std::string((char *)&blobValue[0], blobValue.size());
if(op == Like) { if (op == Like) {
// This permits more rowgroups than is strictly needed // This permits more rowgroups than is strictly needed
// since it assumes an implicit wildcard. But it's // since it assumes an implicit wildcard. But it's
// simple to implement, so we'll go with it. // simple to implement, so we'll go with it.
likeStringValue = stringValue; likeStringValue = stringValue;
size_t idx = likeStringValue.find_first_of("%"); size_t idx = likeStringValue.find_first_of("%");
if(idx != std::string::npos) { if (idx != std::string::npos) {
likeStringValue = likeStringValue.substr(0, idx); likeStringValue = likeStringValue.substr(0, idx);
} }
idx = likeStringValue.find_first_of("_"); idx = likeStringValue.find_first_of("_");
if(idx != std::string::npos) { if (idx != std::string::npos) {
likeStringValue = likeStringValue.substr(0, idx); likeStringValue = likeStringValue.substr(0, idx);
} }
} }
@ -45,61 +34,61 @@ std::string Constraint::describe() const {
std::string rv; std::string rv;
rv.append(columnName); rv.append(columnName);
rv.append(" "); rv.append(" ");
switch(op) { switch (op) {
case Equal: case Equal:
rv.append("="); rv.append("=");
break; break;
case GreaterThan: case GreaterThan:
rv.append(">"); rv.append(">");
break; break;
case LessThanOrEqual: case LessThanOrEqual:
rv.append("<="); rv.append("<=");
break; break;
case LessThan: case LessThan:
rv.append("<"); rv.append("<");
break; break;
case GreaterThanOrEqual: case GreaterThanOrEqual:
rv.append(">="); rv.append(">=");
break; break;
case Like: case Like:
rv.append("LIKE"); rv.append("LIKE");
break; break;
case Glob: case Glob:
rv.append("GLOB"); rv.append("GLOB");
break; break;
case NotEqual: case NotEqual:
rv.append("<>"); rv.append("<>");
break; break;
case IsNot: case IsNot:
rv.append("IS NOT"); rv.append("IS NOT");
break; break;
case IsNotNull: case IsNotNull:
rv.append("IS NOT NULL"); rv.append("IS NOT NULL");
break; break;
case IsNull: case IsNull:
rv.append("IS NULL"); rv.append("IS NULL");
break; break;
case Is: case Is:
rv.append("IS"); rv.append("IS");
break; break;
} }
rv.append(" "); rv.append(" ");
switch(type) { switch (type) {
case Null: case Null:
rv.append("NULL"); rv.append("NULL");
break; break;
case Integer: case Integer:
rv.append(std::to_string(intValue)); rv.append(std::to_string(intValue));
break; break;
case Double: case Double:
rv.append(std::to_string(doubleValue)); rv.append(std::to_string(doubleValue));
break; break;
case Blob: case Blob:
break; break;
case Text: case Text:
rv.append(stringValue); rv.append(stringValue);
break; break;
} }
return rv; return rv;
} }

View File

@ -1,9 +1,9 @@
#ifndef PARQUET_FILTER_H #ifndef PARQUET_FILTER_H
#define PARQUET_FILTER_H #define PARQUET_FILTER_H
#include <vector>
#include <string>
#include <cstdint> #include <cstdint>
#include <string>
#include <vector>
enum ConstraintOperator { enum ConstraintOperator {
Equal, Equal,
@ -20,43 +20,36 @@ enum ConstraintOperator {
Is Is
}; };
enum ValueType { enum ValueType { Null, Integer, Double, Blob, Text };
Null,
Integer,
Double,
Blob,
Text
};
class RowGroupBitmap { class RowGroupBitmap {
void setBit(std::vector<unsigned char>& membership, unsigned int rowGroup, bool isSet) { void setBit(std::vector<unsigned char> &membership, unsigned int rowGroup,
bool isSet) {
int byte = rowGroup / 8; int byte = rowGroup / 8;
int offset = rowGroup % 8; int offset = rowGroup % 8;
unsigned char c = membership[byte]; unsigned char c = membership[byte];
c &= ~(1UL << offset); c &= ~(1UL << offset);
if(isSet) { if (isSet) {
c |= 1UL << offset; c |= 1UL << offset;
} }
membership[byte] = c; membership[byte] = c;
} }
// Compares estimated rowGroupFilter results against observed results // Compares estimated rowGroupFilter results against observed results
// when we explored the row group. This lets us cache // when we explored the row group. This lets us cache
public: public:
RowGroupBitmap(unsigned int totalRowGroups) { RowGroupBitmap(unsigned int totalRowGroups) {
// Initialize everything to assume that all row groups match. // Initialize everything to assume that all row groups match.
// As we discover otherwise, we'll update that assumption. // As we discover otherwise, we'll update that assumption.
for(unsigned int i = 0; i < (totalRowGroups + 7) / 8; i++) { for (unsigned int i = 0; i < (totalRowGroups + 7) / 8; i++) {
estimatedMembership.push_back(0xFF); estimatedMembership.push_back(0xFF);
actualMembership.push_back(0xFF); actualMembership.push_back(0xFF);
} }
} }
RowGroupBitmap( RowGroupBitmap(std::vector<unsigned char> estimatedMembership,
std::vector<unsigned char> estimatedMembership, std::vector<unsigned char> actualMembership)
std::vector<unsigned char> actualMembership) : : estimatedMembership(estimatedMembership),
estimatedMembership(estimatedMembership), actualMembership(actualMembership) {}
actualMembership(actualMembership) {
}
std::vector<unsigned char> estimatedMembership; std::vector<unsigned char> estimatedMembership;
std::vector<unsigned char> actualMembership; std::vector<unsigned char> actualMembership;
@ -80,17 +73,11 @@ public:
class Constraint { class Constraint {
public: public:
// Kind of a messy constructor function, but it's just for internal use, so whatever. // Kind of a messy constructor function, but it's just for internal use, so
Constraint( // whatever.
RowGroupBitmap bitmap, Constraint(RowGroupBitmap bitmap, int column, std::string columnName,
int column, ConstraintOperator op, ValueType type, int64_t intValue,
std::string columnName, double doubleValue, std::vector<unsigned char> blobValue);
ConstraintOperator op,
ValueType type,
int64_t intValue,
double doubleValue,
std::vector<unsigned char> blobValue
);
RowGroupBitmap bitmap; RowGroupBitmap bitmap;
int column; // underlying column in the query int column; // underlying column in the query

View File

@ -2,61 +2,61 @@
#include "parquet/api/reader.h" #include "parquet/api/reader.h"
ParquetTable::ParquetTable(std::string file, std::string tableName): file(file), tableName(tableName) { ParquetTable::ParquetTable(std::string file, std::string tableName)
std::unique_ptr<parquet::ParquetFileReader> reader = parquet::ParquetFileReader::OpenFile(file.data()); : file(file), tableName(tableName) {
std::unique_ptr<parquet::ParquetFileReader> reader =
parquet::ParquetFileReader::OpenFile(file.data());
metadata = reader->metadata(); metadata = reader->metadata();
} }
std::string ParquetTable::columnName(int i) { std::string ParquetTable::columnName(int i) {
if(i == -1) if (i == -1)
return "rowid"; return "rowid";
return columnNames[i]; return columnNames[i];
} }
unsigned int ParquetTable::getNumColumns() { unsigned int ParquetTable::getNumColumns() { return columnNames.size(); }
return columnNames.size();
}
std::string ParquetTable::CreateStatement() { std::string ParquetTable::CreateStatement() {
std::unique_ptr<parquet::ParquetFileReader> reader = parquet::ParquetFileReader::OpenFile( std::unique_ptr<parquet::ParquetFileReader> reader =
file.data(), parquet::ParquetFileReader::OpenFile(
true, file.data(), true, parquet::default_reader_properties(), metadata);
parquet::default_reader_properties(),
metadata);
std::string text("CREATE TABLE x("); std::string text("CREATE TABLE x(");
auto schema = reader->metadata()->schema(); auto schema = reader->metadata()->schema();
for(auto i = 0; i < schema->num_columns(); i++) { for (auto i = 0; i < schema->num_columns(); i++) {
auto _col = schema->GetColumnRoot(i); auto _col = schema->GetColumnRoot(i);
columnNames.push_back(_col->name()); columnNames.push_back(_col->name());
} }
for(auto i = 0; i < schema->num_columns(); i++) { for (auto i = 0; i < schema->num_columns(); i++) {
auto _col = schema->GetColumnRoot(i); auto _col = schema->GetColumnRoot(i);
if(!_col->is_primitive()) { if (!_col->is_primitive()) {
std::ostringstream ss; std::ostringstream ss;
ss << __FILE__ << ":" << __LINE__ << ": column " << i << " has non-primitive type"; ss << __FILE__ << ":" << __LINE__ << ": column " << i
<< " has non-primitive type";
throw std::invalid_argument(ss.str()); throw std::invalid_argument(ss.str());
} }
if(_col->is_repeated()) { if (_col->is_repeated()) {
std::ostringstream ss; std::ostringstream ss;
ss << __FILE__ << ":" << __LINE__ << ": column " << i << " has non-scalar type"; ss << __FILE__ << ":" << __LINE__ << ": column " << i
<< " has non-scalar type";
throw std::invalid_argument(ss.str()); throw std::invalid_argument(ss.str());
} }
parquet::schema::PrimitiveNode* col = (parquet::schema::PrimitiveNode*)_col; parquet::schema::PrimitiveNode *col =
(parquet::schema::PrimitiveNode *)_col;
if(i > 0) if (i > 0)
text += ", "; text += ", ";
text += "\""; text += "\"";
// Horrifically inefficient, but easy to understand. // Horrifically inefficient, but easy to understand.
std::string colName = col->name(); std::string colName = col->name();
for(char& c : colName) { for (char &c : colName) {
if(c == '"') if (c == '"')
text += "\"\""; text += "\"\"";
else else
text += c; text += c;
@ -71,7 +71,7 @@ std::string ParquetTable::CreateStatement() {
// whose unsigned ints start getting interpreted as signed. (We could // whose unsigned ints start getting interpreted as signed. (We could
// support this for UINT_8/16/32 -- and for UINT_64 we could throw if // support this for UINT_8/16/32 -- and for UINT_64 we could throw if
// the high bit was set.) // the high bit was set.)
if(logical == parquet::LogicalType::NONE || if (logical == parquet::LogicalType::NONE ||
logical == parquet::LogicalType::UTF8 || logical == parquet::LogicalType::UTF8 ||
logical == parquet::LogicalType::DATE || logical == parquet::LogicalType::DATE ||
logical == parquet::LogicalType::TIME_MILLIS || logical == parquet::LogicalType::TIME_MILLIS ||
@ -82,74 +82,74 @@ std::string ParquetTable::CreateStatement() {
logical == parquet::LogicalType::INT_16 || logical == parquet::LogicalType::INT_16 ||
logical == parquet::LogicalType::INT_32 || logical == parquet::LogicalType::INT_32 ||
logical == parquet::LogicalType::INT_64) { logical == parquet::LogicalType::INT_64) {
switch(physical) { switch (physical) {
case parquet::Type::BOOLEAN: case parquet::Type::BOOLEAN:
type = "TINYINT";
break;
case parquet::Type::INT32:
if (logical == parquet::LogicalType::NONE ||
logical == parquet::LogicalType::INT_32) {
type = "INT";
} else if (logical == parquet::LogicalType::INT_8) {
type = "TINYINT"; type = "TINYINT";
break; } else if (logical == parquet::LogicalType::INT_16) {
case parquet::Type::INT32: type = "SMALLINT";
if(logical == parquet::LogicalType::NONE || }
logical == parquet::LogicalType::INT_32) { break;
type = "INT"; case parquet::Type::INT96:
} else if(logical == parquet::LogicalType::INT_8) { // INT96 is used for nanosecond precision on timestamps; we truncate
type = "TINYINT"; // to millisecond precision.
} else if(logical == parquet::LogicalType::INT_16) { case parquet::Type::INT64:
type = "SMALLINT"; type = "BIGINT";
} break;
break; case parquet::Type::FLOAT:
case parquet::Type::INT96: type = "REAL";
// INT96 is used for nanosecond precision on timestamps; we truncate break;
// to millisecond precision. case parquet::Type::DOUBLE:
case parquet::Type::INT64: type = "DOUBLE";
type = "BIGINT"; break;
break; case parquet::Type::BYTE_ARRAY:
case parquet::Type::FLOAT: if (logical == parquet::LogicalType::UTF8) {
type = "REAL"; type = "TEXT";
break; } else {
case parquet::Type::DOUBLE:
type = "DOUBLE";
break;
case parquet::Type::BYTE_ARRAY:
if(logical == parquet::LogicalType::UTF8) {
type = "TEXT";
} else {
type = "BLOB";
}
break;
case parquet::Type::FIXED_LEN_BYTE_ARRAY:
type = "BLOB"; type = "BLOB";
break; }
default: break;
break; case parquet::Type::FIXED_LEN_BYTE_ARRAY:
type = "BLOB";
break;
default:
break;
} }
} }
if(type.empty()) { if (type.empty()) {
std::ostringstream ss; std::ostringstream ss;
ss << __FILE__ << ":" << __LINE__ << ": column " << i << " has unsupported type: " << ss << __FILE__ << ":" << __LINE__ << ": column " << i
parquet::TypeToString(physical) << "/" << parquet::LogicalTypeToString(logical); << " has unsupported type: " << parquet::TypeToString(physical) << "/"
<< parquet::LogicalTypeToString(logical);
throw std::invalid_argument(ss.str()); throw std::invalid_argument(ss.str());
} }
#ifdef DEBUG #ifdef DEBUG
printf("col %d[name=%s, p=%d:%s, l=%d:%s] is %s\n", printf(
i, "col %d[name=%s, p=%d:%s, l=%d:%s] is %s\n", i, col->name().data(),
col->name().data(),
col->physical_type(), col->physical_type(),
parquet::TypeToString(col->physical_type()).data(), parquet::TypeToString(col->physical_type()).data(), col->logical_type(),
col->logical_type(), parquet::LogicalTypeToString(col->logical_type()).data(), type.data());
parquet::LogicalTypeToString(col->logical_type()).data(),
type.data());
#endif #endif
text += " "; text += " ";
text += type; text += type;
} }
text +=");"; text += ");";
return text; return text;
} }
std::shared_ptr<parquet::FileMetaData> ParquetTable::getMetadata() { return metadata; } std::shared_ptr<parquet::FileMetaData> ParquetTable::getMetadata() {
return metadata;
}
const std::string& ParquetTable::getFile() { return file; } const std::string &ParquetTable::getFile() { return file; }
const std::string& ParquetTable::getTableName() { return tableName; } const std::string &ParquetTable::getTableName() { return tableName; }

View File

@ -1,9 +1,9 @@
#ifndef PARQUET_TABLE_H #ifndef PARQUET_TABLE_H
#define PARQUET_TABLE_H #define PARQUET_TABLE_H
#include <vector>
#include <string>
#include "parquet/api/reader.h" #include "parquet/api/reader.h"
#include <string>
#include <vector>
class ParquetTable { class ParquetTable {
std::string file; std::string file;
@ -11,15 +11,14 @@ class ParquetTable {
std::vector<std::string> columnNames; std::vector<std::string> columnNames;
std::shared_ptr<parquet::FileMetaData> metadata; std::shared_ptr<parquet::FileMetaData> metadata;
public: public:
ParquetTable(std::string file, std::string tableName); ParquetTable(std::string file, std::string tableName);
std::string CreateStatement(); std::string CreateStatement();
std::string columnName(int idx); std::string columnName(int idx);
unsigned int getNumColumns(); unsigned int getNumColumns();
std::shared_ptr<parquet::FileMetaData> getMetadata(); std::shared_ptr<parquet::FileMetaData> getMetadata();
const std::string& getFile(); const std::string &getFile();
const std::string& getTableName(); const std::string &getTableName();
}; };
#endif #endif