Row group filtering for BYTE_ARRAY
This commit is contained in:
parent
7b302a0eb2
commit
1f3ffce560
|
@ -465,9 +465,6 @@ static int parquetBestIndex(
|
||||||
pIdxInfo->aConstraintUsage[i].argvIndex = j;
|
pIdxInfo->aConstraintUsage[i].argvIndex = j;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: consider setting this when querying by rowid? Unclear if that's implied.
|
|
||||||
// pIdxInfo->idxFlags = SQLITE_INDEX_SCAN_UNIQUE;
|
|
||||||
}
|
}
|
||||||
printf("idx %d has cost %f\n", pIdxInfo->idxNum, pIdxInfo->estimatedCost);
|
printf("idx %d has cost %f\n", pIdxInfo->idxNum, pIdxInfo->estimatedCost);
|
||||||
|
|
||||||
|
@ -480,7 +477,6 @@ static int parquetBestIndex(
|
||||||
pIdxInfo->idxStr = (char*)dupe;
|
pIdxInfo->idxStr = (char*)dupe;
|
||||||
pIdxInfo->needToFreeIdxStr = 1;
|
pIdxInfo->needToFreeIdxStr = 1;
|
||||||
|
|
||||||
// TODO: populate argvIndex.
|
|
||||||
memset(dupe, 0, dupeSize);
|
memset(dupe, 0, dupeSize);
|
||||||
memcpy(dupe, pIdxInfo, sizeof(sqlite3_index_info));
|
memcpy(dupe, pIdxInfo, sizeof(sqlite3_index_info));
|
||||||
|
|
||||||
|
|
|
@ -32,6 +32,114 @@ bool ParquetCursor::currentRowGroupSatisfiesRowIdFilter(Constraint& constraint)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Decides whether the current row group could contain a row that satisfies a
// BLOB constraint, based on the column's min/max statistics.
//
// constraint: the constraint to test; only constraints whose type is Blob are
//             evaluated, anything else is treated as "cannot rule out".
// _stats:     statistics for the constrained column in the current row group.
//
// Returns false only when the statistics prove that no row can match; returns
// true when a match is possible or when nothing can be concluded (no min/max,
// non-Blob constraint, FIXED_LEN_BYTE_ARRAY column, unhandled operator).
//
// Throws std::invalid_argument if the column's physical type is neither
// BYTE_ARRAY nor FIXED_LEN_BYTE_ARRAY.
bool ParquetCursor::currentRowGroupSatisfiesBlobFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> _stats) {
  if(!_stats->HasMinMax()) {
    return true;
  }

  if(constraint.type != Blob) {
    return true;
  }

  const unsigned char* minPtr = nullptr;
  const unsigned char* maxPtr = nullptr;
  size_t minLen = 0;
  size_t maxLen = 0;

  parquet::Type::type pqType = types[constraint.column];

  if(pqType == parquet::Type::BYTE_ARRAY) {
    parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>* stats =
      static_cast<parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>*>(_stats.get());

    minPtr = stats->min().ptr;
    minLen = stats->min().len;
    maxPtr = stats->max().ptr;
    maxLen = stats->max().len;
  } else if(pqType == parquet::Type::FIXED_LEN_BYTE_ARRAY) {
    // It seems like parquet-cpp doesn't actually produce stats for FLBA yet, so
    // rather than have untested code here, we'll just short circuit.
    //
    // Once I can get my hands on such a file, it should be easy to add support.
    return true;
  } else {
    // Should be impossible to get here
    std::ostringstream ss;
    ss << __FILE__ << ":" << __LINE__ << ": currentRowGroupSatisfiesBlobFilter called on unsupported type: " <<
      parquet::TypeToString(pqType);
    throw std::invalid_argument(ss.str());
  }

  const std::vector<unsigned char>& blob = constraint.blobValue;

  // data() is valid on an empty vector, whereas &blob[0] is not.
  const unsigned char* blobPtr = blob.data();
  const size_t blobLen = blob.size();

  // Byte-wise equality. std::equal over an empty range never dereferences its
  // arguments, so zero-length values (possibly with null pointers) are safe.
  auto sameBytes = [](const unsigned char* a, size_t aLen, const unsigned char* b, size_t bLen) {
    return aLen == bLen && std::equal(a, a + aLen, b);
  };

  // Strict unsigned lexicographic ordering: true when a sorts before b.
  auto lessBytes = [](const unsigned char* a, size_t aLen, const unsigned char* b, size_t bLen) {
    return std::lexicographical_compare(a, a + aLen, b, b + bLen);
  };

  switch(constraint.op) {
    case Is:
    case Equal:
    {
      // A match is possible only if min <= blob <= max.
      bool minLeBlob = sameBytes(blobPtr, blobLen, minPtr, minLen) ||
        lessBytes(minPtr, minLen, blobPtr, blobLen);
      bool blobLeMax = sameBytes(blobPtr, blobLen, maxPtr, maxLen) ||
        lessBytes(blobPtr, blobLen, maxPtr, maxLen);

      return minLeBlob && blobLeMax;
    }
    case GreaterThanOrEqual:
      // Some value >= blob can exist only if max >= blob.
      return sameBytes(blobPtr, blobLen, maxPtr, maxLen) ||
        lessBytes(blobPtr, blobLen, maxPtr, maxLen);
    case GreaterThan:
      // Some value > blob can exist only if max > blob.
      return lessBytes(blobPtr, blobLen, maxPtr, maxLen);
    case LessThan:
      // Some value < blob can exist only if min < blob.
      return lessBytes(minPtr, minLen, blobPtr, blobLen);
    case LessThanOrEqual:
      // Some value <= blob can exist only if min <= blob.
      return sameBytes(blobPtr, blobLen, minPtr, minLen) ||
        lessBytes(minPtr, minLen, blobPtr, blobLen);
    case IsNot:
    case NotEqual:
    {
      // If min == max == blob, every value described by the stats equals blob,
      // so we can skip this row group.
      // NOTE(review): if the row group also contains NULLs, an IS NOT constraint
      // could presumably still match those rows -- confirm whether NULLs are
      // accounted for elsewhere.
      bool blobMaxEqual = sameBytes(blobPtr, blobLen, maxPtr, maxLen);
      bool minMaxEqual = sameBytes(minPtr, minLen, maxPtr, maxLen);
      return !(blobMaxEqual && minMaxEqual);
    }
    default:
      return true;
  }
}
|
||||||
|
|
||||||
bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> _stats) {
|
bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> _stats) {
|
||||||
parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>* stats =
|
parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>* stats =
|
||||||
(parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>*)_stats.get();
|
(parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>*)_stats.get();
|
||||||
|
@ -442,8 +550,10 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() {
|
||||||
} else {
|
} else {
|
||||||
parquet::Type::type pqType = types[column];
|
parquet::Type::type pqType = types[column];
|
||||||
|
|
||||||
if(pqType == parquet::Type::BYTE_ARRAY) {
|
if(pqType == parquet::Type::BYTE_ARRAY && logicalTypes[column] == parquet::LogicalType::UTF8) {
|
||||||
rv = currentRowGroupSatisfiesTextFilter(constraints[i], stats);
|
rv = currentRowGroupSatisfiesTextFilter(constraints[i], stats);
|
||||||
|
} else if(pqType == parquet::Type::BYTE_ARRAY) {
|
||||||
|
rv = currentRowGroupSatisfiesBlobFilter(constraints[i], stats);
|
||||||
} else if(pqType == parquet::Type::INT32 ||
|
} else if(pqType == parquet::Type::INT32 ||
|
||||||
pqType == parquet::Type::INT64 ||
|
pqType == parquet::Type::INT64 ||
|
||||||
pqType == parquet::Type::INT96 ||
|
pqType == parquet::Type::INT96 ||
|
||||||
|
|
|
@ -37,6 +37,7 @@ class ParquetCursor {
|
||||||
bool currentRowGroupSatisfiesFilter();
|
bool currentRowGroupSatisfiesFilter();
|
||||||
bool currentRowGroupSatisfiesRowIdFilter(Constraint& constraint);
|
bool currentRowGroupSatisfiesRowIdFilter(Constraint& constraint);
|
||||||
bool currentRowGroupSatisfiesTextFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
|
bool currentRowGroupSatisfiesTextFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
|
||||||
|
bool currentRowGroupSatisfiesBlobFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
|
||||||
bool currentRowGroupSatisfiesIntegerFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
|
bool currentRowGroupSatisfiesIntegerFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
|
||||||
bool currentRowGroupSatisfiesDoubleFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
|
bool currentRowGroupSatisfiesDoubleFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue