Rejig parquet generation
- "fixed_size_binary" -> "binary_10" - make null parquet use rowgroups of size 10: first rowgroup has no nulls, 2nd has all nulls, 3rd-10th have alternating nulls. This is prep for making a Postgres layer to use as an oracle for generating test cases so that we have good coverage before implementing advanced `xBestIndex` and `xFilter` modes.
This commit is contained in:
parent
56245c1d3d
commit
0d4806ca6f
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -62,6 +62,7 @@ def write_parquet(file_name, rows, types, row_group_size):
|
||||||
def name_of(i):
|
def name_of(i):
|
||||||
name = '{}_{}'.format(types[i], i)
|
name = '{}_{}'.format(types[i], i)
|
||||||
name = name.replace('timestamp[ns]', 'ts')
|
name = name.replace('timestamp[ns]', 'ts')
|
||||||
|
name = name.replace('fixed_size_binary[1]', 'binary')
|
||||||
return name
|
return name
|
||||||
|
|
||||||
cols = [pa.Column.from_array(name_of(i), fields[i]) for i in range(len(fields))]
|
cols = [pa.Column.from_array(name_of(i), fields[i]) for i in range(len(fields))]
|
||||||
|
@ -104,9 +105,9 @@ def main():
|
||||||
|
|
||||||
for i in range(len(rows)):
|
for i in range(len(rows)):
|
||||||
for j in range(len(rows[i])):
|
for j in range(len(rows[i])):
|
||||||
if (i + j) % 2 == 0:
|
if (i >= 10 and i <= 19) or (i >= 20 and (i + j) % 2 == 0):
|
||||||
rows[i][j] = None
|
rows[i][j] = None
|
||||||
write_parquet('100-rows-nulls.parquet', rows, types,row_group_size=100)
|
write_parquet('100-rows-nulls.parquet', rows, types,row_group_size=10)
|
||||||
|
|
||||||
write_unsupported_parquets()
|
write_unsupported_parquets()
|
||||||
|
|
||||||
|
|
|
@ -1,24 +1,24 @@
|
||||||
.load ./libparquet
|
.load ./libparquet
|
||||||
.headers on
|
.headers on
|
||||||
select 'creating without enough args';
|
--select 'creating without enough args';
|
||||||
create virtual table noargs using parquet;
|
--create virtual table noargs using parquet;
|
||||||
|
|
||||||
select 'creating with invalid file';
|
--select 'creating with invalid file';
|
||||||
create virtual table nonexistent using parquet('nonexistent');
|
--create virtual table nonexistent using parquet('nonexistent');
|
||||||
|
|
||||||
select 'creating others';
|
--select 'creating others';
|
||||||
create virtual table others using parquet('/home/cldellow/src/parquet-vtable/parquet/others.parquet');
|
--create virtual table others using parquet('/home/cldellow/src/parquet-vtable/parquet/others.parquet');
|
||||||
select * from others limit 1;
|
--select * from others limit 1;
|
||||||
|
|
||||||
--select 'creating with valid file';
|
--select 'creating with valid file';
|
||||||
--create virtual table parquet using parquet('/home/cldellow/src/csv2parquet/12m.parquet.snappy');
|
create virtual table parquet using parquet('/home/cldellow/src/csv2parquet/12m.parquet.snappy');
|
||||||
--.tables
|
.tables
|
||||||
--.schema parquet
|
.timer on
|
||||||
--.fullschema
|
.echo on
|
||||||
--.timer on
|
|
||||||
--select count(*) from (select * from parquet limit 1);
|
--select count(*) from (select * from parquet limit 1);
|
||||||
--select rowid,col0 from parquet where rowid > 5 limit 5;
|
--select rowid,col0,col3,col9 from parquet where rowid > 5 limit 5;
|
||||||
--select count(*) from parquet limit 1;
|
--select count(*) from parquet limit 1;
|
||||||
--select sum(col0) from parquet limit 1;
|
--select sum(col0) from parquet limit 1;
|
||||||
--select * from parquet limit 10;
|
--select * from parquet limit 10;
|
||||||
--select sum(length(col3)) from parquet;
|
--select sum(length(col3)) from parquet;
|
||||||
|
select * from parquet where (col3 = 'Dawson Creeks') or col9 LIKE '%Bicycqq%' limit 20000;
|
||||||
|
|
|
@ -252,6 +252,7 @@ static int parquetFilter(
|
||||||
int idxNum, const char *idxStr,
|
int idxNum, const char *idxStr,
|
||||||
int argc, sqlite3_value **argv
|
int argc, sqlite3_value **argv
|
||||||
){
|
){
|
||||||
|
printf("xFilter: idxNum=%d\n", idxNum);
|
||||||
ParquetCursor* cursor = ((sqlite3_vtab_cursor_parquet*)cur)->cursor;
|
ParquetCursor* cursor = ((sqlite3_vtab_cursor_parquet*)cur)->cursor;
|
||||||
cursor->reset();
|
cursor->reset();
|
||||||
return parquetNext(cur);
|
return parquetNext(cur);
|
||||||
|
@ -265,7 +266,67 @@ static int parquetBestIndex(
|
||||||
sqlite3_vtab *tab,
|
sqlite3_vtab *tab,
|
||||||
sqlite3_index_info *pIdxInfo
|
sqlite3_index_info *pIdxInfo
|
||||||
){
|
){
|
||||||
pIdxInfo->estimatedCost = 1000000;
|
printf("xBestIndex: nConstraint=%d, nOrderBy=%d\n", pIdxInfo->nConstraint, pIdxInfo->nOrderBy);
|
||||||
|
// Duplicate pIdxInfo and stash it in pIdxInfo->idxStr.
|
||||||
|
for(int i = 0; i < pIdxInfo->nConstraint; i++) {
|
||||||
|
printf(" constraint %d: col %d, op %d, usable %d\n",
|
||||||
|
i,
|
||||||
|
pIdxInfo->aConstraint[i].iColumn,
|
||||||
|
pIdxInfo->aConstraint[i].op,
|
||||||
|
pIdxInfo->aConstraint[i].usable);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(true || (pIdxInfo->nConstraint == 0 && pIdxInfo->nOrderBy == 0)) {
|
||||||
|
pIdxInfo->estimatedCost = 1000000000000;
|
||||||
|
pIdxInfo->idxNum = 0;
|
||||||
|
pIdxInfo->estimatedRows = 10000;
|
||||||
|
} else {
|
||||||
|
pIdxInfo->estimatedCost = 1;
|
||||||
|
pIdxInfo->idxNum = 1;
|
||||||
|
pIdxInfo->estimatedRows = 100000;
|
||||||
|
pIdxInfo->aConstraintUsage[0].argvIndex = 1;
|
||||||
|
// pIdxInfo->idxFlags = SQLITE_INDEX_SCAN_UNIQUE;
|
||||||
|
}
|
||||||
|
printf("idx %d has cost %f\n", pIdxInfo->idxNum, pIdxInfo->estimatedCost);
|
||||||
|
|
||||||
|
size_t dupeSize = sizeof(sqlite3_index_info) +
|
||||||
|
//pIdxInfo->nConstraint * sizeof(sqlite3_index_constraint) +
|
||||||
|
pIdxInfo->nConstraint * sizeof(sqlite3_index_info::sqlite3_index_constraint) +
|
||||||
|
pIdxInfo->nOrderBy * sizeof(sqlite3_index_info::sqlite3_index_orderby) +
|
||||||
|
pIdxInfo->nConstraint * sizeof(sqlite3_index_info::sqlite3_index_constraint_usage);
|
||||||
|
sqlite3_index_info* dupe = (sqlite3_index_info*)sqlite3_malloc(dupeSize);
|
||||||
|
pIdxInfo->idxStr = (char*)dupe;
|
||||||
|
pIdxInfo->needToFreeIdxStr = 1;
|
||||||
|
|
||||||
|
// TODO: populate argvIndex.
|
||||||
|
memset(dupe, 0, dupeSize);
|
||||||
|
memcpy(dupe, pIdxInfo, sizeof(sqlite3_index_info));
|
||||||
|
|
||||||
|
dupe->aConstraint = (sqlite3_index_info::sqlite3_index_constraint*)((char*)dupe + sizeof(sqlite3_index_info));
|
||||||
|
dupe->aOrderBy = (sqlite3_index_info::sqlite3_index_orderby*)((char*)dupe +
|
||||||
|
sizeof(sqlite3_index_info) +
|
||||||
|
pIdxInfo->nConstraint * sizeof(sqlite3_index_info::sqlite3_index_constraint));
|
||||||
|
dupe->aConstraintUsage = (sqlite3_index_info::sqlite3_index_constraint_usage*)((char*)dupe +
|
||||||
|
sizeof(sqlite3_index_info) +
|
||||||
|
pIdxInfo->nConstraint * sizeof(sqlite3_index_info::sqlite3_index_constraint) +
|
||||||
|
pIdxInfo->nOrderBy * sizeof(sqlite3_index_info::sqlite3_index_orderby));
|
||||||
|
|
||||||
|
|
||||||
|
for(int i = 0; i < pIdxInfo->nConstraint; i++) {
|
||||||
|
dupe->aConstraint[i].iColumn = pIdxInfo->aConstraint[i].iColumn;
|
||||||
|
dupe->aConstraint[i].op = pIdxInfo->aConstraint[i].op;
|
||||||
|
dupe->aConstraint[i].usable = pIdxInfo->aConstraint[i].usable;
|
||||||
|
dupe->aConstraint[i].iTermOffset = pIdxInfo->aConstraint[i].iTermOffset;
|
||||||
|
|
||||||
|
dupe->aConstraintUsage[i].argvIndex = pIdxInfo->aConstraintUsage[i].argvIndex;
|
||||||
|
dupe->aConstraintUsage[i].omit = pIdxInfo->aConstraintUsage[i].omit;
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int i = 0; i < pIdxInfo->nOrderBy; i++) {
|
||||||
|
dupe->aOrderBy[i].iColumn = pIdxInfo->aOrderBy[i].iColumn;
|
||||||
|
dupe->aOrderBy[i].desc = pIdxInfo->aOrderBy[i].desc;
|
||||||
|
}
|
||||||
|
|
||||||
return SQLITE_OK;
|
return SQLITE_OK;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,3 @@
|
||||||
100-rows-nulls.parquet
|
100-rows-nulls.parquet
|
||||||
SELECT SUM(CASE WHEN bool_0 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int8_1 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int16_2 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int32_3 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int64_4 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN ts_5 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN double_6 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN string_7 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN string_8 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN binary_9 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN fixed_size_binary IS NULL THEN 1 ELSE 0 END) from test;
|
SELECT SUM(CASE WHEN bool_0 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int8_1 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int16_2 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int32_3 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int64_4 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN ts_5 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN double_6 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN string_7 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN string_8 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN binary_9 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN binary_10 IS NULL THEN 1 ELSE 0 END) from test;
|
||||||
50|50|50|50|50|50|50|50|50|50|50
|
50|50|50|50|50|50|50|50|50|50|50
|
||||||
|
|
Loading…
Reference in New Issue