Rejig parquet generation

- "fixed_size_binary" -> "binary_10"
- make null parquet use rowgroups of size 10: the first rowgroup
  has no nulls, the 2nd is all nulls, and the 3rd-10th have
  alternating nulls

This is prep for making a Postgres layer to use as an oracle
for generating test cases so that we have good coverage before
implementing advanced `xBestIndex` and `xFilter` modes.
This commit is contained in:
Colin Dellow 2018-03-06 21:02:26 -05:00
parent 56245c1d3d
commit 0d4806ca6f
7 changed files with 79 additions and 17 deletions

Binary file not shown.

View File

@ -62,6 +62,7 @@ def write_parquet(file_name, rows, types, row_group_size):
def name_of(i): def name_of(i):
name = '{}_{}'.format(types[i], i) name = '{}_{}'.format(types[i], i)
name = name.replace('timestamp[ns]', 'ts') name = name.replace('timestamp[ns]', 'ts')
name = name.replace('fixed_size_binary[1]', 'binary')
return name return name
cols = [pa.Column.from_array(name_of(i), fields[i]) for i in range(len(fields))] cols = [pa.Column.from_array(name_of(i), fields[i]) for i in range(len(fields))]
@ -104,9 +105,9 @@ def main():
for i in range(len(rows)): for i in range(len(rows)):
for j in range(len(rows[i])): for j in range(len(rows[i])):
if (i + j) % 2 == 0: if (i >= 10 and i <= 19) or (i >= 20 and (i + j) % 2 == 0):
rows[i][j] = None rows[i][j] = None
write_parquet('100-rows-nulls.parquet', rows, types,row_group_size=100) write_parquet('100-rows-nulls.parquet', rows, types,row_group_size=10)
write_unsupported_parquets() write_unsupported_parquets()

View File

@ -1,24 +1,24 @@
.load ./libparquet .load ./libparquet
.headers on .headers on
select 'creating without enough args'; --select 'creating without enough args';
create virtual table noargs using parquet; --create virtual table noargs using parquet;
select 'creating with invalid file'; --select 'creating with invalid file';
create virtual table nonexistent using parquet('nonexistent'); --create virtual table nonexistent using parquet('nonexistent');
select 'creating others'; --select 'creating others';
create virtual table others using parquet('/home/cldellow/src/parquet-vtable/parquet/others.parquet'); --create virtual table others using parquet('/home/cldellow/src/parquet-vtable/parquet/others.parquet');
select * from others limit 1; --select * from others limit 1;
--select 'creating with valid file'; --select 'creating with valid file';
--create virtual table parquet using parquet('/home/cldellow/src/csv2parquet/12m.parquet.snappy'); create virtual table parquet using parquet('/home/cldellow/src/csv2parquet/12m.parquet.snappy');
--.tables .tables
--.schema parquet .timer on
--.fullschema .echo on
--.timer on
--select count(*) from (select * from parquet limit 1); --select count(*) from (select * from parquet limit 1);
--select rowid,col0 from parquet where rowid > 5 limit 5; --select rowid,col0,col3,col9 from parquet where rowid > 5 limit 5;
--select count(*) from parquet limit 1; --select count(*) from parquet limit 1;
--select sum(col0) from parquet limit 1; --select sum(col0) from parquet limit 1;
--select * from parquet limit 10; --select * from parquet limit 10;
--select sum(length(col3)) from parquet; --select sum(length(col3)) from parquet;
select * from parquet where (col3 = 'Dawson Creeks') or col9 LIKE '%Bicycqq%' limit 20000;

View File

@ -252,6 +252,7 @@ static int parquetFilter(
int idxNum, const char *idxStr, int idxNum, const char *idxStr,
int argc, sqlite3_value **argv int argc, sqlite3_value **argv
){ ){
printf("xFilter: idxNum=%d\n", idxNum);
ParquetCursor* cursor = ((sqlite3_vtab_cursor_parquet*)cur)->cursor; ParquetCursor* cursor = ((sqlite3_vtab_cursor_parquet*)cur)->cursor;
cursor->reset(); cursor->reset();
return parquetNext(cur); return parquetNext(cur);
@ -265,7 +266,67 @@ static int parquetBestIndex(
sqlite3_vtab *tab, sqlite3_vtab *tab,
sqlite3_index_info *pIdxInfo sqlite3_index_info *pIdxInfo
){ ){
pIdxInfo->estimatedCost = 1000000; printf("xBestIndex: nConstraint=%d, nOrderBy=%d\n", pIdxInfo->nConstraint, pIdxInfo->nOrderBy);
// Duplicate pIdxInfo and stash it in pIdxInfo->idxStr.
for(int i = 0; i < pIdxInfo->nConstraint; i++) {
printf(" constraint %d: col %d, op %d, usable %d\n",
i,
pIdxInfo->aConstraint[i].iColumn,
pIdxInfo->aConstraint[i].op,
pIdxInfo->aConstraint[i].usable);
}
if(true || (pIdxInfo->nConstraint == 0 && pIdxInfo->nOrderBy == 0)) {
pIdxInfo->estimatedCost = 1000000000000;
pIdxInfo->idxNum = 0;
pIdxInfo->estimatedRows = 10000;
} else {
pIdxInfo->estimatedCost = 1;
pIdxInfo->idxNum = 1;
pIdxInfo->estimatedRows = 100000;
pIdxInfo->aConstraintUsage[0].argvIndex = 1;
// pIdxInfo->idxFlags = SQLITE_INDEX_SCAN_UNIQUE;
}
printf("idx %d has cost %f\n", pIdxInfo->idxNum, pIdxInfo->estimatedCost);
size_t dupeSize = sizeof(sqlite3_index_info) +
//pIdxInfo->nConstraint * sizeof(sqlite3_index_constraint) +
pIdxInfo->nConstraint * sizeof(sqlite3_index_info::sqlite3_index_constraint) +
pIdxInfo->nOrderBy * sizeof(sqlite3_index_info::sqlite3_index_orderby) +
pIdxInfo->nConstraint * sizeof(sqlite3_index_info::sqlite3_index_constraint_usage);
sqlite3_index_info* dupe = (sqlite3_index_info*)sqlite3_malloc(dupeSize);
pIdxInfo->idxStr = (char*)dupe;
pIdxInfo->needToFreeIdxStr = 1;
// TODO: populate argvIndex.
memset(dupe, 0, dupeSize);
memcpy(dupe, pIdxInfo, sizeof(sqlite3_index_info));
dupe->aConstraint = (sqlite3_index_info::sqlite3_index_constraint*)((char*)dupe + sizeof(sqlite3_index_info));
dupe->aOrderBy = (sqlite3_index_info::sqlite3_index_orderby*)((char*)dupe +
sizeof(sqlite3_index_info) +
pIdxInfo->nConstraint * sizeof(sqlite3_index_info::sqlite3_index_constraint));
dupe->aConstraintUsage = (sqlite3_index_info::sqlite3_index_constraint_usage*)((char*)dupe +
sizeof(sqlite3_index_info) +
pIdxInfo->nConstraint * sizeof(sqlite3_index_info::sqlite3_index_constraint) +
pIdxInfo->nOrderBy * sizeof(sqlite3_index_info::sqlite3_index_orderby));
for(int i = 0; i < pIdxInfo->nConstraint; i++) {
dupe->aConstraint[i].iColumn = pIdxInfo->aConstraint[i].iColumn;
dupe->aConstraint[i].op = pIdxInfo->aConstraint[i].op;
dupe->aConstraint[i].usable = pIdxInfo->aConstraint[i].usable;
dupe->aConstraint[i].iTermOffset = pIdxInfo->aConstraint[i].iTermOffset;
dupe->aConstraintUsage[i].argvIndex = pIdxInfo->aConstraintUsage[i].argvIndex;
dupe->aConstraintUsage[i].omit = pIdxInfo->aConstraintUsage[i].omit;
}
for(int i = 0; i < pIdxInfo->nOrderBy; i++) {
dupe->aOrderBy[i].iColumn = pIdxInfo->aOrderBy[i].iColumn;
dupe->aOrderBy[i].desc = pIdxInfo->aOrderBy[i].desc;
}
return SQLITE_OK; return SQLITE_OK;
} }

View File

@ -1,3 +1,3 @@
100-rows-nulls.parquet 100-rows-nulls.parquet
SELECT SUM(CASE WHEN bool_0 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int8_1 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int16_2 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int32_3 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int64_4 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN ts_5 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN double_6 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN string_7 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN string_8 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN binary_9 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN fixed_size_binary IS NULL THEN 1 ELSE 0 END) from test; SELECT SUM(CASE WHEN bool_0 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int8_1 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int16_2 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int32_3 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int64_4 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN ts_5 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN double_6 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN string_7 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN string_8 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN binary_9 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN binary_10 IS NULL THEN 1 ELSE 0 END) from test;
50|50|50|50|50|50|50|50|50|50|50 50|50|50|50|50|50|50|50|50|50|50