Row group filtering for BYTE_ARRAY
This commit is contained in:
parent
7b302a0eb2
commit
1f3ffce560
|
@ -465,9 +465,6 @@ static int parquetBestIndex(
|
|||
pIdxInfo->aConstraintUsage[i].argvIndex = j;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: consider setting this when querying by rowid? Unclear if that's implied.
|
||||
// pIdxInfo->idxFlags = SQLITE_INDEX_SCAN_UNIQUE;
|
||||
}
|
||||
printf("idx %d has cost %f\n", pIdxInfo->idxNum, pIdxInfo->estimatedCost);
|
||||
|
||||
|
@ -480,7 +477,6 @@ static int parquetBestIndex(
|
|||
pIdxInfo->idxStr = (char*)dupe;
|
||||
pIdxInfo->needToFreeIdxStr = 1;
|
||||
|
||||
// TODO: populate argvIndex.
|
||||
memset(dupe, 0, dupeSize);
|
||||
memcpy(dupe, pIdxInfo, sizeof(sqlite3_index_info));
|
||||
|
||||
|
|
|
@ -32,6 +32,114 @@ bool ParquetCursor::currentRowGroupSatisfiesRowIdFilter(Constraint& constraint)
|
|||
}
|
||||
}
|
||||
|
||||
/// Decides, from a column's min/max statistics, whether the current row group
/// could contain a value matching a BLOB constraint.
///
/// The answer is conservative: `false` is returned only when the statistics
/// rule out every value in the row group; whenever the statistics are missing,
/// the constraint is not a Blob constraint, the column is FIXED_LEN_BYTE_ARRAY,
/// or the operator is not handled below, the function returns `true`.
///
/// Values are ordered as sequences of unsigned bytes (lexicographically, with a
/// proper prefix sorting before the longer value).
///
/// @param constraint  Constraint to test; reads `type`, `column`, `op` and
///                    `blobValue`.
/// @param _stats      Statistics for the constrained column in the current row
///                    group.
/// @return `false` if the row group can be skipped, `true` otherwise.
/// @throws std::invalid_argument if the column's physical type is neither
///         BYTE_ARRAY nor FIXED_LEN_BYTE_ARRAY.
bool ParquetCursor::currentRowGroupSatisfiesBlobFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> _stats) {
  if(!_stats->HasMinMax()) {
    return true;
  }

  if(constraint.type != Blob) {
    return true;
  }

  const unsigned char* minPtr = nullptr;
  const unsigned char* maxPtr = nullptr;
  size_t minLen = 0;
  size_t maxLen = 0;

  parquet::Type::type pqType = types[constraint.column];

  if(pqType == parquet::Type::BYTE_ARRAY) {
    parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>* stats =
      static_cast<parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>*>(_stats.get());

    minPtr = stats->min().ptr;
    minLen = stats->min().len;
    maxPtr = stats->max().ptr;
    maxLen = stats->max().len;
  } else if(pqType == parquet::Type::FIXED_LEN_BYTE_ARRAY) {
    // It seems like parquet-cpp doesn't actually produce stats for FLBA yet, so
    // rather than have untested code here, we'll just short circuit.
    //
    // Once I can get my hands on such a file, it should be easy to add support.
    return true;
  } else {
    // Should be impossible to get here
    std::ostringstream ss;
    ss << __FILE__ << ":" << __LINE__ << ": currentRowGroupSatisfiesBlobFilter called on unsupported type: " <<
      parquet::TypeToString(pqType);
    throw std::invalid_argument(ss.str());
  }

  const std::vector<unsigned char>& blob = constraint.blobValue;

  // Use data() rather than &blob[0]: indexing an empty vector is undefined
  // behaviour, and an empty blob is a legitimate comparison value.
  const unsigned char* blobBegin = blob.data();
  const unsigned char* blobEnd = blobBegin + blob.size();

  // blob == [ptr, ptr + len)
  auto blobEquals = [&](const unsigned char* ptr, size_t len) {
    return blob.size() == len && std::equal(blobBegin, blobEnd, ptr);
  };

  // blob < [ptr, ptr + len)
  auto blobLessThan = [&](const unsigned char* ptr, size_t len) {
    return std::lexicographical_compare(blobBegin, blobEnd, ptr, ptr + len);
  };

  // blob > [ptr, ptr + len)
  auto blobGreaterThan = [&](const unsigned char* ptr, size_t len) {
    return std::lexicographical_compare(ptr, ptr + len, blobBegin, blobEnd);
  };

  switch(constraint.op) {
    case Is:
    case Equal:
      // A match is possible only if min <= blob <= max.
      return !blobLessThan(minPtr, minLen) && !blobGreaterThan(maxPtr, maxLen);
    case GreaterThanOrEqual:
      // Some value >= blob exists only if max >= blob.
      return !blobGreaterThan(maxPtr, maxLen);
    case GreaterThan:
      // Some value > blob exists only if max > blob.
      return blobLessThan(maxPtr, maxLen);
    case LessThan:
      // Some value < blob exists only if min < blob.
      return blobGreaterThan(minPtr, minLen);
    case LessThanOrEqual:
      // Some value <= blob exists only if min <= blob.
      return !blobLessThan(minPtr, minLen);
    case IsNot:
    case NotEqual:
    {
      // If min == max == blob, every value equals blob and we can skip this.
      bool minMaxEqual = minLen == maxLen && std::equal(minPtr, minPtr + minLen, maxPtr);
      return !(blobEquals(maxPtr, maxLen) && minMaxEqual);
    }
    default:
      return true;
  }
}
|
||||
|
||||
bool ParquetCursor::currentRowGroupSatisfiesTextFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> _stats) {
|
||||
parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>* stats =
|
||||
(parquet::TypedRowGroupStatistics<parquet::DataType<parquet::Type::BYTE_ARRAY>>*)_stats.get();
|
||||
|
@ -442,8 +550,10 @@ bool ParquetCursor::currentRowGroupSatisfiesFilter() {
|
|||
} else {
|
||||
parquet::Type::type pqType = types[column];
|
||||
|
||||
if(pqType == parquet::Type::BYTE_ARRAY) {
|
||||
if(pqType == parquet::Type::BYTE_ARRAY && logicalTypes[column] == parquet::LogicalType::UTF8) {
|
||||
rv = currentRowGroupSatisfiesTextFilter(constraints[i], stats);
|
||||
} else if(pqType == parquet::Type::BYTE_ARRAY) {
|
||||
rv = currentRowGroupSatisfiesBlobFilter(constraints[i], stats);
|
||||
} else if(pqType == parquet::Type::INT32 ||
|
||||
pqType == parquet::Type::INT64 ||
|
||||
pqType == parquet::Type::INT96 ||
|
||||
|
|
|
@ -37,6 +37,7 @@ class ParquetCursor {
|
|||
bool currentRowGroupSatisfiesFilter();
|
||||
bool currentRowGroupSatisfiesRowIdFilter(Constraint& constraint);
|
||||
bool currentRowGroupSatisfiesTextFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
|
||||
bool currentRowGroupSatisfiesBlobFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
|
||||
bool currentRowGroupSatisfiesIntegerFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
|
||||
bool currentRowGroupSatisfiesDoubleFilter(Constraint& constraint, std::shared_ptr<parquet::RowGroupStatistics> stats);
|
||||
|
||||
|
|
Loading…
Reference in New Issue