1
0
mirror of https://github.com/cldellow/sqlite-parquet-vtable.git synced 2025-04-03 09:39:47 +00:00

Run a formatting pass with clang-format to minimize future git churn

This commit is contained in:
Addie Morrison 2019-12-08 16:08:11 -06:00
parent ae194c69c5
commit 7bc6f91f6f
7 changed files with 1104 additions and 1165 deletions

File diff suppressed because it is too large. (Load Diff)

File diff suppressed because it is too large. (Load Diff)

View File

@@ -1,13 +1,13 @@
#ifndef PARQUET_CURSOR_H
#define PARQUET_CURSOR_H
#include "parquet/api/reader.h"
#include "parquet_filter.h"
#include "parquet_table.h"
#include "parquet/api/reader.h"
class ParquetCursor {
ParquetTable* table;
ParquetTable *table;
std::unique_ptr<parquet::ParquetFileReader> reader;
std::unique_ptr<parquet::RowGroupMetaData> rowGroupMetadata;
std::shared_ptr<parquet::RowGroupReader> rowGroup;
@@ -35,19 +35,26 @@ class ParquetCursor {
bool currentRowSatisfiesFilter();
bool currentRowGroupSatisfiesFilter();
bool currentRowGroupSatisfiesRowIdFilter(Constraint& constraint);
bool currentRowGroupSatisfiesTextFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
bool currentRowGroupSatisfiesBlobFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
bool currentRowGroupSatisfiesIntegerFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
bool currentRowGroupSatisfiesDoubleFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
bool currentRowSatisfiesTextFilter(Constraint& constraint);
bool currentRowSatisfiesIntegerFilter(Constraint& constraint);
bool currentRowSatisfiesDoubleFilter(Constraint& constraint);
bool currentRowGroupSatisfiesRowIdFilter(Constraint &constraint);
bool currentRowGroupSatisfiesTextFilter(
Constraint &constraint,
std::shared_ptr<parquet::RowGroupStatistics> stats);
bool currentRowGroupSatisfiesBlobFilter(
Constraint &constraint,
std::shared_ptr<parquet::RowGroupStatistics> stats);
bool currentRowGroupSatisfiesIntegerFilter(
Constraint &constraint,
std::shared_ptr<parquet::RowGroupStatistics> stats);
bool currentRowGroupSatisfiesDoubleFilter(
Constraint &constraint,
std::shared_ptr<parquet::RowGroupStatistics> stats);
bool currentRowSatisfiesTextFilter(Constraint &constraint);
bool currentRowSatisfiesIntegerFilter(Constraint &constraint);
bool currentRowSatisfiesDoubleFilter(Constraint &constraint);
public:
ParquetCursor(ParquetTable* table);
ParquetCursor(ParquetTable *table);
int getRowId();
void next();
void close();
@@ -58,16 +65,15 @@ public:
bool isNull(int col);
unsigned int getNumRowGroups() const;
unsigned int getNumConstraints() const;
const Constraint& getConstraint(unsigned int i) const;
const Constraint &getConstraint(unsigned int i) const;
parquet::Type::type getPhysicalType(int col);
parquet::LogicalType::type getLogicalType(int col);
ParquetTable* getTable() const;
ParquetTable *getTable() const;
int getInt32(int col);
long getInt64(int col);
double getDouble(int col);
parquet::ByteArray* getByteArray(int col);
parquet::ByteArray *getByteArray(int col);
};
#endif

View File

@@ -1,40 +1,29 @@
#include "parquet_filter.h"
Constraint::Constraint(
RowGroupBitmap bitmap,
int column,
std::string columnName,
ConstraintOperator op,
ValueType type,
int64_t intValue,
double doubleValue,
std::vector<unsigned char> blobValue
): bitmap(bitmap),
column(column),
columnName(columnName),
op(op),
type(type),
intValue(intValue),
doubleValue(doubleValue),
blobValue(blobValue),
hadRows(false) {
RowGroupBitmap bm = bitmap;
this->bitmap = bm;
Constraint::Constraint(RowGroupBitmap bitmap, int column,
std::string columnName, ConstraintOperator op,
ValueType type, int64_t intValue, double doubleValue,
std::vector<unsigned char> blobValue)
: bitmap(bitmap), column(column), columnName(columnName), op(op),
type(type), intValue(intValue), doubleValue(doubleValue),
blobValue(blobValue), hadRows(false) {
RowGroupBitmap bm = bitmap;
this->bitmap = bm;
if(type == Text) {
stringValue = std::string((char*)&blobValue[0], blobValue.size());
if (type == Text) {
stringValue = std::string((char *)&blobValue[0], blobValue.size());
if(op == Like) {
if (op == Like) {
// This permits more rowgroups than is strictly needed
// since it assumes an implicit wildcard. But it's
// simple to implement, so we'll go with it.
likeStringValue = stringValue;
size_t idx = likeStringValue.find_first_of("%");
if(idx != std::string::npos) {
if (idx != std::string::npos) {
likeStringValue = likeStringValue.substr(0, idx);
}
idx = likeStringValue.find_first_of("_");
if(idx != std::string::npos) {
if (idx != std::string::npos) {
likeStringValue = likeStringValue.substr(0, idx);
}
}
@@ -45,61 +34,61 @@ std::string Constraint::describe() const {
std::string rv;
rv.append(columnName);
rv.append(" ");
switch(op) {
case Equal:
rv.append("=");
break;
case GreaterThan:
rv.append(">");
break;
case LessThanOrEqual:
rv.append("<=");
break;
case LessThan:
rv.append("<");
break;
case GreaterThanOrEqual:
rv.append(">=");
break;
case Like:
rv.append("LIKE");
break;
case Glob:
rv.append("GLOB");
break;
case NotEqual:
rv.append("<>");
break;
case IsNot:
rv.append("IS NOT");
break;
case IsNotNull:
rv.append("IS NOT NULL");
break;
case IsNull:
rv.append("IS NULL");
break;
case Is:
rv.append("IS");
break;
switch (op) {
case Equal:
rv.append("=");
break;
case GreaterThan:
rv.append(">");
break;
case LessThanOrEqual:
rv.append("<=");
break;
case LessThan:
rv.append("<");
break;
case GreaterThanOrEqual:
rv.append(">=");
break;
case Like:
rv.append("LIKE");
break;
case Glob:
rv.append("GLOB");
break;
case NotEqual:
rv.append("<>");
break;
case IsNot:
rv.append("IS NOT");
break;
case IsNotNull:
rv.append("IS NOT NULL");
break;
case IsNull:
rv.append("IS NULL");
break;
case Is:
rv.append("IS");
break;
}
rv.append(" ");
switch(type) {
case Null:
rv.append("NULL");
break;
case Integer:
rv.append(std::to_string(intValue));
break;
case Double:
rv.append(std::to_string(doubleValue));
break;
case Blob:
break;
case Text:
rv.append(stringValue);
break;
switch (type) {
case Null:
rv.append("NULL");
break;
case Integer:
rv.append(std::to_string(intValue));
break;
case Double:
rv.append(std::to_string(doubleValue));
break;
case Blob:
break;
case Text:
rv.append(stringValue);
break;
}
return rv;
}

View File

@@ -1,9 +1,9 @@
#ifndef PARQUET_FILTER_H
#define PARQUET_FILTER_H
#include <vector>
#include <string>
#include <cstdint>
#include <string>
#include <vector>
enum ConstraintOperator {
Equal,
@@ -20,43 +20,36 @@ enum ConstraintOperator {
Is
};
enum ValueType {
Null,
Integer,
Double,
Blob,
Text
};
enum ValueType { Null, Integer, Double, Blob, Text };
class RowGroupBitmap {
void setBit(std::vector<unsigned char>& membership, unsigned int rowGroup, bool isSet) {
void setBit(std::vector<unsigned char> &membership, unsigned int rowGroup,
bool isSet) {
int byte = rowGroup / 8;
int offset = rowGroup % 8;
unsigned char c = membership[byte];
c &= ~(1UL << offset);
if(isSet) {
if (isSet) {
c |= 1UL << offset;
}
membership[byte] = c;
}
// Compares estimated rowGroupFilter results against observed results
// when we explored the row group. This lets us cache
// Compares estimated rowGroupFilter results against observed results
// when we explored the row group. This lets us cache
public:
RowGroupBitmap(unsigned int totalRowGroups) {
// Initialize everything to assume that all row groups match.
// As we discover otherwise, we'll update that assumption.
for(unsigned int i = 0; i < (totalRowGroups + 7) / 8; i++) {
for (unsigned int i = 0; i < (totalRowGroups + 7) / 8; i++) {
estimatedMembership.push_back(0xFF);
actualMembership.push_back(0xFF);
}
}
RowGroupBitmap(
std::vector<unsigned char> estimatedMembership,
std::vector<unsigned char> actualMembership) :
estimatedMembership(estimatedMembership),
actualMembership(actualMembership) {
}
RowGroupBitmap(std::vector<unsigned char> estimatedMembership,
std::vector<unsigned char> actualMembership)
: estimatedMembership(estimatedMembership),
actualMembership(actualMembership) {}
std::vector<unsigned char> estimatedMembership;
std::vector<unsigned char> actualMembership;
@@ -80,17 +73,11 @@ public:
class Constraint {
public:
// Kind of a messy constructor function, but it's just for internal use, so whatever.
Constraint(
RowGroupBitmap bitmap,
int column,
std::string columnName,
ConstraintOperator op,
ValueType type,
int64_t intValue,
double doubleValue,
std::vector<unsigned char> blobValue
);
// Kind of a messy constructor function, but it's just for internal use, so
// whatever.
Constraint(RowGroupBitmap bitmap, int column, std::string columnName,
ConstraintOperator op, ValueType type, int64_t intValue,
double doubleValue, std::vector<unsigned char> blobValue);
RowGroupBitmap bitmap;
int column; // underlying column in the query

View File

@@ -2,61 +2,61 @@
#include "parquet/api/reader.h"
ParquetTable::ParquetTable(std::string file, std::string tableName): file(file), tableName(tableName) {
std::unique_ptr<parquet::ParquetFileReader> reader = parquet::ParquetFileReader::OpenFile(file.data());
ParquetTable::ParquetTable(std::string file, std::string tableName)
: file(file), tableName(tableName) {
std::unique_ptr<parquet::ParquetFileReader> reader =
parquet::ParquetFileReader::OpenFile(file.data());
metadata = reader->metadata();
}
std::string ParquetTable::columnName(int i) {
if(i == -1)
if (i == -1)
return "rowid";
return columnNames[i];
}
unsigned int ParquetTable::getNumColumns() {
return columnNames.size();
}
unsigned int ParquetTable::getNumColumns() { return columnNames.size(); }
std::string ParquetTable::CreateStatement() {
std::unique_ptr<parquet::ParquetFileReader> reader = parquet::ParquetFileReader::OpenFile(
file.data(),
true,
parquet::default_reader_properties(),
metadata);
std::unique_ptr<parquet::ParquetFileReader> reader =
parquet::ParquetFileReader::OpenFile(
file.data(), true, parquet::default_reader_properties(), metadata);
std::string text("CREATE TABLE x(");
auto schema = reader->metadata()->schema();
for(auto i = 0; i < schema->num_columns(); i++) {
for (auto i = 0; i < schema->num_columns(); i++) {
auto _col = schema->GetColumnRoot(i);
columnNames.push_back(_col->name());
}
for(auto i = 0; i < schema->num_columns(); i++) {
for (auto i = 0; i < schema->num_columns(); i++) {
auto _col = schema->GetColumnRoot(i);
if(!_col->is_primitive()) {
if (!_col->is_primitive()) {
std::ostringstream ss;
ss << __FILE__ << ":" << __LINE__ << ": column " << i << " has non-primitive type";
ss << __FILE__ << ":" << __LINE__ << ": column " << i
<< " has non-primitive type";
throw std::invalid_argument(ss.str());
}
if(_col->is_repeated()) {
if (_col->is_repeated()) {
std::ostringstream ss;
ss << __FILE__ << ":" << __LINE__ << ": column " << i << " has non-scalar type";
ss << __FILE__ << ":" << __LINE__ << ": column " << i
<< " has non-scalar type";
throw std::invalid_argument(ss.str());
}
parquet::schema::PrimitiveNode* col = (parquet::schema::PrimitiveNode*)_col;
parquet::schema::PrimitiveNode *col =
(parquet::schema::PrimitiveNode *)_col;
if(i > 0)
if (i > 0)
text += ", ";
text += "\"";
// Horrifically inefficient, but easy to understand.
std::string colName = col->name();
for(char& c : colName) {
if(c == '"')
for (char &c : colName) {
if (c == '"')
text += "\"\"";
else
text += c;
@@ -71,7 +71,7 @@ std::string ParquetTable::CreateStatement() {
// whose unsigned ints start getting interpreted as signed. (We could
// support this for UINT_8/16/32 -- and for UINT_64 we could throw if
// the high bit was set.)
if(logical == parquet::LogicalType::NONE ||
if (logical == parquet::LogicalType::NONE ||
logical == parquet::LogicalType::UTF8 ||
logical == parquet::LogicalType::DATE ||
logical == parquet::LogicalType::TIME_MILLIS ||
@@ -82,74 +82,74 @@ std::string ParquetTable::CreateStatement() {
logical == parquet::LogicalType::INT_16 ||
logical == parquet::LogicalType::INT_32 ||
logical == parquet::LogicalType::INT_64) {
switch(physical) {
case parquet::Type::BOOLEAN:
switch (physical) {
case parquet::Type::BOOLEAN:
type = "TINYINT";
break;
case parquet::Type::INT32:
if (logical == parquet::LogicalType::NONE ||
logical == parquet::LogicalType::INT_32) {
type = "INT";
} else if (logical == parquet::LogicalType::INT_8) {
type = "TINYINT";
break;
case parquet::Type::INT32:
if(logical == parquet::LogicalType::NONE ||
logical == parquet::LogicalType::INT_32) {
type = "INT";
} else if(logical == parquet::LogicalType::INT_8) {
type = "TINYINT";
} else if(logical == parquet::LogicalType::INT_16) {
type = "SMALLINT";
}
break;
case parquet::Type::INT96:
// INT96 is used for nanosecond precision on timestamps; we truncate
// to millisecond precision.
case parquet::Type::INT64:
type = "BIGINT";
break;
case parquet::Type::FLOAT:
type = "REAL";
break;
case parquet::Type::DOUBLE:
type = "DOUBLE";
break;
case parquet::Type::BYTE_ARRAY:
if(logical == parquet::LogicalType::UTF8) {
type = "TEXT";
} else {
type = "BLOB";
}
break;
case parquet::Type::FIXED_LEN_BYTE_ARRAY:
} else if (logical == parquet::LogicalType::INT_16) {
type = "SMALLINT";
}
break;
case parquet::Type::INT96:
// INT96 is used for nanosecond precision on timestamps; we truncate
// to millisecond precision.
case parquet::Type::INT64:
type = "BIGINT";
break;
case parquet::Type::FLOAT:
type = "REAL";
break;
case parquet::Type::DOUBLE:
type = "DOUBLE";
break;
case parquet::Type::BYTE_ARRAY:
if (logical == parquet::LogicalType::UTF8) {
type = "TEXT";
} else {
type = "BLOB";
break;
default:
break;
}
break;
case parquet::Type::FIXED_LEN_BYTE_ARRAY:
type = "BLOB";
break;
default:
break;
}
}
if(type.empty()) {
if (type.empty()) {
std::ostringstream ss;
ss << __FILE__ << ":" << __LINE__ << ": column " << i << " has unsupported type: " <<
parquet::TypeToString(physical) << "/" << parquet::LogicalTypeToString(logical);
ss << __FILE__ << ":" << __LINE__ << ": column " << i
<< " has unsupported type: " << parquet::TypeToString(physical) << "/"
<< parquet::LogicalTypeToString(logical);
throw std::invalid_argument(ss.str());
}
#ifdef DEBUG
printf("col %d[name=%s, p=%d:%s, l=%d:%s] is %s\n",
i,
col->name().data(),
printf(
"col %d[name=%s, p=%d:%s, l=%d:%s] is %s\n", i, col->name().data(),
col->physical_type(),
parquet::TypeToString(col->physical_type()).data(),
col->logical_type(),
parquet::LogicalTypeToString(col->logical_type()).data(),
type.data());
parquet::TypeToString(col->physical_type()).data(), col->logical_type(),
parquet::LogicalTypeToString(col->logical_type()).data(), type.data());
#endif
text += " ";
text += type;
}
text +=");";
text += ");";
return text;
}
std::shared_ptr<parquet::FileMetaData> ParquetTable::getMetadata() { return metadata; }
std::shared_ptr<parquet::FileMetaData> ParquetTable::getMetadata() {
return metadata;
}
const std::string& ParquetTable::getFile() { return file; }
const std::string& ParquetTable::getTableName() { return tableName; }
const std::string &ParquetTable::getFile() { return file; }
const std::string &ParquetTable::getTableName() { return tableName; }

View File

@@ -1,9 +1,9 @@
#ifndef PARQUET_TABLE_H
#define PARQUET_TABLE_H
#include <vector>
#include <string>
#include "parquet/api/reader.h"
#include <string>
#include <vector>
class ParquetTable {
std::string file;
@@ -11,15 +11,14 @@ class ParquetTable {
std::vector<std::string> columnNames;
std::shared_ptr<parquet::FileMetaData> metadata;
public:
ParquetTable(std::string file, std::string tableName);
std::string CreateStatement();
std::string columnName(int idx);
unsigned int getNumColumns();
std::shared_ptr<parquet::FileMetaData> getMetadata();
const std::string& getFile();
const std::string& getTableName();
const std::string &getFile();
const std::string &getTableName();
};
#endif