Rejig parquet generation
- "fixed_size_binary" -> "binary_10" - make null parquet use row groups of size 10: the first row group has no nulls, the 2nd is all nulls, and the 3rd-10th have alternating nulls. This is prep for making a Postgres layer to use as an oracle for generating test cases so that we have good coverage before implementing advanced `xBestIndex` and `xFilter` modes.
This commit is contained in:
parent
56245c1d3d
commit
0d4806ca6f
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -62,6 +62,7 @@ def write_parquet(file_name, rows, types, row_group_size):
|
|||
def name_of(i):
|
||||
name = '{}_{}'.format(types[i], i)
|
||||
name = name.replace('timestamp[ns]', 'ts')
|
||||
name = name.replace('fixed_size_binary[1]', 'binary')
|
||||
return name
|
||||
|
||||
cols = [pa.Column.from_array(name_of(i), fields[i]) for i in range(len(fields))]
|
||||
|
@ -104,9 +105,9 @@ def main():
|
|||
|
||||
for i in range(len(rows)):
|
||||
for j in range(len(rows[i])):
|
||||
if (i + j) % 2 == 0:
|
||||
if (i >= 10 and i <= 19) or (i >= 20 and (i + j) % 2 == 0):
|
||||
rows[i][j] = None
|
||||
write_parquet('100-rows-nulls.parquet', rows, types,row_group_size=100)
|
||||
write_parquet('100-rows-nulls.parquet', rows, types,row_group_size=10)
|
||||
|
||||
write_unsupported_parquets()
|
||||
|
||||
|
|
|
@ -1,24 +1,24 @@
|
|||
.load ./libparquet
|
||||
.headers on
|
||||
select 'creating without enough args';
|
||||
create virtual table noargs using parquet;
|
||||
--select 'creating without enough args';
|
||||
--create virtual table noargs using parquet;
|
||||
|
||||
select 'creating with invalid file';
|
||||
create virtual table nonexistent using parquet('nonexistent');
|
||||
--select 'creating with invalid file';
|
||||
--create virtual table nonexistent using parquet('nonexistent');
|
||||
|
||||
select 'creating others';
|
||||
create virtual table others using parquet('/home/cldellow/src/parquet-vtable/parquet/others.parquet');
|
||||
select * from others limit 1;
|
||||
--select 'creating others';
|
||||
--create virtual table others using parquet('/home/cldellow/src/parquet-vtable/parquet/others.parquet');
|
||||
--select * from others limit 1;
|
||||
|
||||
--select 'creating with valid file';
|
||||
--create virtual table parquet using parquet('/home/cldellow/src/csv2parquet/12m.parquet.snappy');
|
||||
--.tables
|
||||
--.schema parquet
|
||||
--.fullschema
|
||||
--.timer on
|
||||
create virtual table parquet using parquet('/home/cldellow/src/csv2parquet/12m.parquet.snappy');
|
||||
.tables
|
||||
.timer on
|
||||
.echo on
|
||||
--select count(*) from (select * from parquet limit 1);
|
||||
--select rowid,col0 from parquet where rowid > 5 limit 5;
|
||||
--select rowid,col0,col3,col9 from parquet where rowid > 5 limit 5;
|
||||
--select count(*) from parquet limit 1;
|
||||
--select sum(col0) from parquet limit 1;
|
||||
--select * from parquet limit 10;
|
||||
--select sum(length(col3)) from parquet;
|
||||
select * from parquet where (col3 = 'Dawson Creeks') or col9 LIKE '%Bicycqq%' limit 20000;
|
||||
|
|
|
@ -252,6 +252,7 @@ static int parquetFilter(
|
|||
int idxNum, const char *idxStr,
|
||||
int argc, sqlite3_value **argv
|
||||
){
|
||||
printf("xFilter: idxNum=%d\n", idxNum);
|
||||
ParquetCursor* cursor = ((sqlite3_vtab_cursor_parquet*)cur)->cursor;
|
||||
cursor->reset();
|
||||
return parquetNext(cur);
|
||||
|
@ -265,7 +266,67 @@ static int parquetBestIndex(
|
|||
sqlite3_vtab *tab,
|
||||
sqlite3_index_info *pIdxInfo
|
||||
){
|
||||
pIdxInfo->estimatedCost = 1000000;
|
||||
printf("xBestIndex: nConstraint=%d, nOrderBy=%d\n", pIdxInfo->nConstraint, pIdxInfo->nOrderBy);
|
||||
// Duplicate pIdxInfo and stash it in pIdxInfo->idxStr.
|
||||
for(int i = 0; i < pIdxInfo->nConstraint; i++) {
|
||||
printf(" constraint %d: col %d, op %d, usable %d\n",
|
||||
i,
|
||||
pIdxInfo->aConstraint[i].iColumn,
|
||||
pIdxInfo->aConstraint[i].op,
|
||||
pIdxInfo->aConstraint[i].usable);
|
||||
}
|
||||
|
||||
if(true || (pIdxInfo->nConstraint == 0 && pIdxInfo->nOrderBy == 0)) {
|
||||
pIdxInfo->estimatedCost = 1000000000000;
|
||||
pIdxInfo->idxNum = 0;
|
||||
pIdxInfo->estimatedRows = 10000;
|
||||
} else {
|
||||
pIdxInfo->estimatedCost = 1;
|
||||
pIdxInfo->idxNum = 1;
|
||||
pIdxInfo->estimatedRows = 100000;
|
||||
pIdxInfo->aConstraintUsage[0].argvIndex = 1;
|
||||
// pIdxInfo->idxFlags = SQLITE_INDEX_SCAN_UNIQUE;
|
||||
}
|
||||
printf("idx %d has cost %f\n", pIdxInfo->idxNum, pIdxInfo->estimatedCost);
|
||||
|
||||
size_t dupeSize = sizeof(sqlite3_index_info) +
|
||||
//pIdxInfo->nConstraint * sizeof(sqlite3_index_constraint) +
|
||||
pIdxInfo->nConstraint * sizeof(sqlite3_index_info::sqlite3_index_constraint) +
|
||||
pIdxInfo->nOrderBy * sizeof(sqlite3_index_info::sqlite3_index_orderby) +
|
||||
pIdxInfo->nConstraint * sizeof(sqlite3_index_info::sqlite3_index_constraint_usage);
|
||||
sqlite3_index_info* dupe = (sqlite3_index_info*)sqlite3_malloc(dupeSize);
|
||||
pIdxInfo->idxStr = (char*)dupe;
|
||||
pIdxInfo->needToFreeIdxStr = 1;
|
||||
|
||||
// TODO: populate argvIndex.
|
||||
memset(dupe, 0, dupeSize);
|
||||
memcpy(dupe, pIdxInfo, sizeof(sqlite3_index_info));
|
||||
|
||||
dupe->aConstraint = (sqlite3_index_info::sqlite3_index_constraint*)((char*)dupe + sizeof(sqlite3_index_info));
|
||||
dupe->aOrderBy = (sqlite3_index_info::sqlite3_index_orderby*)((char*)dupe +
|
||||
sizeof(sqlite3_index_info) +
|
||||
pIdxInfo->nConstraint * sizeof(sqlite3_index_info::sqlite3_index_constraint));
|
||||
dupe->aConstraintUsage = (sqlite3_index_info::sqlite3_index_constraint_usage*)((char*)dupe +
|
||||
sizeof(sqlite3_index_info) +
|
||||
pIdxInfo->nConstraint * sizeof(sqlite3_index_info::sqlite3_index_constraint) +
|
||||
pIdxInfo->nOrderBy * sizeof(sqlite3_index_info::sqlite3_index_orderby));
|
||||
|
||||
|
||||
for(int i = 0; i < pIdxInfo->nConstraint; i++) {
|
||||
dupe->aConstraint[i].iColumn = pIdxInfo->aConstraint[i].iColumn;
|
||||
dupe->aConstraint[i].op = pIdxInfo->aConstraint[i].op;
|
||||
dupe->aConstraint[i].usable = pIdxInfo->aConstraint[i].usable;
|
||||
dupe->aConstraint[i].iTermOffset = pIdxInfo->aConstraint[i].iTermOffset;
|
||||
|
||||
dupe->aConstraintUsage[i].argvIndex = pIdxInfo->aConstraintUsage[i].argvIndex;
|
||||
dupe->aConstraintUsage[i].omit = pIdxInfo->aConstraintUsage[i].omit;
|
||||
}
|
||||
|
||||
for(int i = 0; i < pIdxInfo->nOrderBy; i++) {
|
||||
dupe->aOrderBy[i].iColumn = pIdxInfo->aOrderBy[i].iColumn;
|
||||
dupe->aOrderBy[i].desc = pIdxInfo->aOrderBy[i].desc;
|
||||
}
|
||||
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
100-rows-nulls.parquet
|
||||
SELECT SUM(CASE WHEN bool_0 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int8_1 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int16_2 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int32_3 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int64_4 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN ts_5 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN double_6 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN string_7 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN string_8 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN binary_9 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN fixed_size_binary IS NULL THEN 1 ELSE 0 END) from test;
|
||||
SELECT SUM(CASE WHEN bool_0 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int8_1 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int16_2 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int32_3 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int64_4 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN ts_5 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN double_6 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN string_7 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN string_8 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN binary_9 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN binary_10 IS NULL THEN 1 ELSE 0 END) from test;
|
||||
50|50|50|50|50|50|50|50|50|50|50
|
||||
|
|
Loading…
Reference in New Issue