diff --git a/parquet-generator/100-rows-1.parquet b/parquet-generator/100-rows-1.parquet index 922b888..fe4fa70 100644 Binary files a/parquet-generator/100-rows-1.parquet and b/parquet-generator/100-rows-1.parquet differ diff --git a/parquet-generator/100-rows-10.parquet b/parquet-generator/100-rows-10.parquet index 0a2e019..aff0f9b 100644 Binary files a/parquet-generator/100-rows-10.parquet and b/parquet-generator/100-rows-10.parquet differ diff --git a/parquet-generator/100-rows-nulls.parquet b/parquet-generator/100-rows-nulls.parquet index 7a4fb2f..08c5b78 100644 Binary files a/parquet-generator/100-rows-nulls.parquet and b/parquet-generator/100-rows-nulls.parquet differ diff --git a/parquet-generator/parquets.py b/parquet-generator/parquets.py index af12105..df41da4 100644 --- a/parquet-generator/parquets.py +++ b/parquet-generator/parquets.py @@ -62,6 +62,7 @@ def write_parquet(file_name, rows, types, row_group_size): def name_of(i): name = '{}_{}'.format(types[i], i) name = name.replace('timestamp[ns]', 'ts') + name = name.replace('fixed_size_binary[1]', 'binary') return name cols = [pa.Column.from_array(name_of(i), fields[i]) for i in range(len(fields))] @@ -104,9 +105,9 @@ def main(): for i in range(len(rows)): for j in range(len(rows[i])): - if (i + j) % 2 == 0: + if (i >= 10 and i <= 19) or (i >= 20 and (i + j) % 2 == 0): rows[i][j] = None - write_parquet('100-rows-nulls.parquet', rows, types,row_group_size=100) + write_parquet('100-rows-nulls.parquet', rows, types,row_group_size=10) write_unsupported_parquets() diff --git a/parquet/cmds.txt b/parquet/cmds.txt index 122e932..aa33e47 100644 --- a/parquet/cmds.txt +++ b/parquet/cmds.txt @@ -1,24 +1,24 @@ .load ./libparquet .headers on -select 'creating without enough args'; -create virtual table noargs using parquet; +--select 'creating without enough args'; +--create virtual table noargs using parquet; -select 'creating with invalid file'; -create virtual table nonexistent using parquet('nonexistent'); +--select 'creating with invalid file'; +--create virtual table nonexistent using parquet('nonexistent'); -select 'creating others'; -create virtual table others using parquet('/home/cldellow/src/parquet-vtable/parquet/others.parquet'); -select * from others limit 1; +--select 'creating others'; +--create virtual table others using parquet('/home/cldellow/src/parquet-vtable/parquet/others.parquet'); +--select * from others limit 1; --select 'creating with valid file'; ---create virtual table parquet using parquet('/home/cldellow/src/csv2parquet/12m.parquet.snappy'); ---.tables ---.schema parquet ---.fullschema ---.timer on +create virtual table parquet using parquet('/home/cldellow/src/csv2parquet/12m.parquet.snappy'); +.tables +.timer on +.echo on --select count(*) from (select * from parquet limit 1); ---select rowid,col0 from parquet where rowid > 5 limit 5; +--select rowid,col0,col3,col9 from parquet where rowid > 5 limit 5; --select count(*) from parquet limit 1; --select sum(col0) from parquet limit 1; --select * from parquet limit 10; --select sum(length(col3)) from parquet; +select * from parquet where (col3 = 'Dawson Creeks') or col9 LIKE '%Bicycqq%' limit 20000; diff --git a/parquet/parquet.cc b/parquet/parquet.cc index 7f2cb9e..159dd3b 100644 --- a/parquet/parquet.cc +++ b/parquet/parquet.cc @@ -252,6 +252,7 @@ static int parquetFilter( int idxNum, const char *idxStr, int argc, sqlite3_value **argv ){ + printf("xFilter: idxNum=%d\n", idxNum); ParquetCursor* cursor = ((sqlite3_vtab_cursor_parquet*)cur)->cursor; cursor->reset(); return parquetNext(cur); @@ -265,7 +266,67 @@ static int parquetBestIndex( sqlite3_vtab *tab, sqlite3_index_info *pIdxInfo ){ - pIdxInfo->estimatedCost = 1000000; + printf("xBestIndex: nConstraint=%d, nOrderBy=%d\n", pIdxInfo->nConstraint, pIdxInfo->nOrderBy); + // Duplicate pIdxInfo and stash it in pIdxInfo->idxStr. + for(int i = 0; i < pIdxInfo->nConstraint; i++) { + printf(" constraint %d: col %d, op %d, usable %d\n", + i, + pIdxInfo->aConstraint[i].iColumn, + pIdxInfo->aConstraint[i].op, + pIdxInfo->aConstraint[i].usable); + } + + if(true || (pIdxInfo->nConstraint == 0 && pIdxInfo->nOrderBy == 0)) { + pIdxInfo->estimatedCost = 1000000000000; + pIdxInfo->idxNum = 0; + pIdxInfo->estimatedRows = 10000; + } else { + pIdxInfo->estimatedCost = 1; + pIdxInfo->idxNum = 1; + pIdxInfo->estimatedRows = 100000; + pIdxInfo->aConstraintUsage[0].argvIndex = 1; +// pIdxInfo->idxFlags = SQLITE_INDEX_SCAN_UNIQUE; + } + printf("idx %d has cost %f\n", pIdxInfo->idxNum, pIdxInfo->estimatedCost); + + size_t dupeSize = sizeof(sqlite3_index_info) + + //pIdxInfo->nConstraint * sizeof(sqlite3_index_constraint) + + pIdxInfo->nConstraint * sizeof(sqlite3_index_info::sqlite3_index_constraint) + + pIdxInfo->nOrderBy * sizeof(sqlite3_index_info::sqlite3_index_orderby) + + pIdxInfo->nConstraint * sizeof(sqlite3_index_info::sqlite3_index_constraint_usage); + sqlite3_index_info* dupe = (sqlite3_index_info*)sqlite3_malloc(dupeSize); + pIdxInfo->idxStr = (char*)dupe; + pIdxInfo->needToFreeIdxStr = 1; + + // TODO: populate argvIndex. + memset(dupe, 0, dupeSize); + memcpy(dupe, pIdxInfo, sizeof(sqlite3_index_info)); + + dupe->aConstraint = (sqlite3_index_info::sqlite3_index_constraint*)((char*)dupe + sizeof(sqlite3_index_info)); + dupe->aOrderBy = (sqlite3_index_info::sqlite3_index_orderby*)((char*)dupe + + sizeof(sqlite3_index_info) + + pIdxInfo->nConstraint * sizeof(sqlite3_index_info::sqlite3_index_constraint)); + dupe->aConstraintUsage = (sqlite3_index_info::sqlite3_index_constraint_usage*)((char*)dupe + + sizeof(sqlite3_index_info) + + pIdxInfo->nConstraint * sizeof(sqlite3_index_info::sqlite3_index_constraint) + + pIdxInfo->nOrderBy * sizeof(sqlite3_index_info::sqlite3_index_orderby)); + + + for(int i = 0; i < pIdxInfo->nConstraint; i++) { + dupe->aConstraint[i].iColumn = pIdxInfo->aConstraint[i].iColumn; + dupe->aConstraint[i].op = pIdxInfo->aConstraint[i].op; + dupe->aConstraint[i].usable = pIdxInfo->aConstraint[i].usable; + dupe->aConstraint[i].iTermOffset = pIdxInfo->aConstraint[i].iTermOffset; + + dupe->aConstraintUsage[i].argvIndex = pIdxInfo->aConstraintUsage[i].argvIndex; + dupe->aConstraintUsage[i].omit = pIdxInfo->aConstraintUsage[i].omit; + } + + for(int i = 0; i < pIdxInfo->nOrderBy; i++) { + dupe->aOrderBy[i].iColumn = pIdxInfo->aOrderBy[i].iColumn; + dupe->aOrderBy[i].desc = pIdxInfo->aOrderBy[i].desc; + } + return SQLITE_OK; } diff --git a/tests/queries/008-nulls.sql b/tests/queries/008-nulls.sql index 7f58a56..a113c7d 100644 --- a/tests/queries/008-nulls.sql +++ b/tests/queries/008-nulls.sql @@ -1,3 +1,3 @@ 100-rows-nulls.parquet -SELECT SUM(CASE WHEN bool_0 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int8_1 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int16_2 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int32_3 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int64_4 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN ts_5 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN double_6 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN string_7 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN string_8 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN binary_9 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN fixed_size_binary IS NULL THEN 1 ELSE 0 END) from test; +SELECT SUM(CASE WHEN bool_0 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int8_1 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int16_2 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int32_3 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN int64_4 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN ts_5 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN double_6 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN string_7 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN string_8 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN binary_9 IS NULL THEN 1 ELSE 0 END), SUM(CASE WHEN binary_10 IS NULL THEN 1 ELSE 0 END) from test; 50|50|50|50|50|50|50|50|50|50|50