sqlite-parquet-vtable/parquet/parquet_cursor.cc

#include "parquet_cursor.h"

// Creates a cursor over `table`.
//
// The reader is nulled out first so that the close() call made by reset()
// has nothing to close; reset() is then invoked with an empty constraint list,
// which opens the file and positions the cursor before the first row.
ParquetCursor::ParquetCursor(ParquetTable* table) {
  reader = NULL;
  this->table = table;

  std::vector<Constraint> noConstraints;
  reset(noConstraints);
}

// Decide whether the current row group could contain a row whose rowid
// satisfies `constraint`.
//
// When this is called from nextRowGroup(), rowId has already been advanced to
// the first row of the current row group, so the group covers the half-open
// range [rowId, rowId + rowGroupSize).
//
// Returns false only when no row in that range can match. Operators that are
// not handled explicitly return true so that the row group is still scanned.
bool ParquetCursor::currentRowGroupSatisfiesRowIdFilter(Constraint constraint) {
  int64_t target = constraint.getInt();
  switch(constraint.getOperator()) {
    case IsNull:
      // rowid is never null, so nothing can match
      return false;
    case Is:
    case Equal:
      // target must fall inside [rowId, rowId + rowGroupSize)
      return target >= rowId && target < rowId + rowGroupSize;
    case GreaterThan:
      // rowId > target: some row at the upper end of the group must exceed
      // target. The bound is one row looser than strictly necessary, which is
      // safe because a false positive only costs a scan.
      return rowId + rowGroupSize > target;
    case GreaterThanOrEqual:
      // rowId >= target: compare the upper end of the group against target.
      // (Comparing against rowId itself would always be true and would never
      // prune anything.) Like GreaterThan, this bound is deliberately loose.
      return rowId + rowGroupSize >= target;
    case LessThan:
      // rowId < target: the first row of the group must be below target
      return target > rowId;
    case LessThanOrEqual:
      // rowId <= target: the first row of the group must not exceed target
      return target >= rowId;
    default:
      // Unknown operator: assume the group may match
      return true;
  }
}

// Return true if it is _possible_ that the current
// rowgroup satisfies the constraints. Only return false
// if it definitely does not.
//
// This avoids opening rowgroups that can't return useful
// data, which provides substantial performance benefits.
bool ParquetCursor::currentRowGroupSatisfiesFilter() {
  for(unsigned int i = 0; i < constraints.size(); i++) {
    int column = constraints[i].getColumn();
    int op = constraints[i].getOperator();
    bool rv = true;

    if(column == -1) {
      rv = currentRowGroupSatisfiesRowIdFilter(constraints[i]);
    } else {
  //    printf("column = %d\n", column);
  //    std::unique_ptr<parquet::ColumnChunkMetaData> md = rowGroupMetadata->ColumnChunk(column);

      if(op == IsNull) {
      } else if(op == IsNotNull) {
      }
    }

    if(!rv)
      return false;
  }

  return true;
}


bool ParquetCursor::nextRowGroup() {
start:
  // Ensure that rowId points at the start of this rowGroup (eg, in the case where
  // we skipped an entire row group).
  rowId = rowGroupStartRowId + rowGroupSize;

  if((rowGroupId + 1) >= numRowGroups) {
    return false;
  }

  rowGroupStartRowId = rowId;
  rowGroupId++;
  rowGroupMetadata = reader->metadata()->RowGroup(rowGroupId);
  rowGroupSize = rowsLeftInRowGroup = rowGroupMetadata->num_rows();
  rowGroup = reader->RowGroup(rowGroupId);
  for(unsigned int i = 0; i < scanners.size(); i++)
    scanners[i] = NULL;

  while(types.size() < (unsigned int)rowGroupMetadata->num_columns()) {
    types.push_back(rowGroupMetadata->schema()->Column(0)->physical_type());
  }

  while(logicalTypes.size() < (unsigned int)rowGroupMetadata->num_columns()) {
    logicalTypes.push_back(rowGroupMetadata->schema()->Column(0)->logical_type());
  }

  for(unsigned int i = 0; i < (unsigned int)rowGroupMetadata->num_columns(); i++) {
    types[i] = rowGroupMetadata->schema()->Column(i)->physical_type();
    logicalTypes[i] = rowGroupMetadata->schema()->Column(i)->logical_type();
  }

  for(unsigned int i = 0; i < colRows.size(); i++) {
    colRows[i] = rowId;
  }

  // Increment rowId so currentRowGroupSatisfiesRowIdFilter can access it;
  // it'll get decremented by our caller
  rowId++;
  if(!currentRowGroupSatisfiesFilter())
    goto start;

  return true;
}

// Return true if it is _possible_ that the current
// row satisfies the constraints. Only return false
// if it definitely does not.
//
// This avoids pointless transitions between the SQLite VM
// and the extension, which can add up on a dataset of tens
// of millions of rows.
bool ParquetCursor::currentRowSatisfiesFilter() {
  for(unsigned int i = 0; i < constraints.size(); i++) {
    bool rv = true;
    int column = constraints[i].getColumn();
    ensureColumn(column);
    int op = constraints[i].getOperator();

    if(op == IsNull) {
      rv = isNull(column);
    } else if(op == IsNotNull) {
      rv = !isNull(column);
    }

    if(!rv)
      return false;
  }
  return true;
}

// Advance the cursor to the next row that might satisfy the constraints.
//
// Row groups are opened lazily: when the current one is exhausted the next
// candidate is fetched through nextRowGroup(). Rows rejected by
// currentRowSatisfiesFilter() are stepped over. When no row groups remain,
// rowId is pushed past numRows so that eof() reports true.
void ParquetCursor::next() {
  do {
    // Loop rather than test once: a row group may legitimately contain zero
    // rows, in which case there is nothing to consume from it and we must
    // move on instead of decrementing rowsLeftInRowGroup below zero.
    while(rowsLeftInRowGroup == 0) {
      if(!nextRowGroup()) {
        // put rowId over the edge so eof returns true
        rowId = numRows + 1;
        return;
      }
      // After a successful nextRowGroup, rowId is pointing at the first row of
      // the group. Make it point just before so the increment below lands on
      // that first row.
      rowId--;
    }

    rowsLeftInRowGroup--;
    rowId++;
  } while(!currentRowSatisfiesFilter());
}

int ParquetCursor::getRowId() {
  return rowId;
}

// Returns true once the cursor has moved past the last row, i.e. when rowId
// is no longer below numRows. next() forces this state by setting rowId to
// numRows + 1 when it runs out of row groups.
bool ParquetCursor::eof() {
  return !(rowId < numRows);
}

void ParquetCursor::ensureColumn(int col) {
  // -1 signals rowid, which is trivially available
  if(col == -1)
    return;

  // need to ensure a scanner exists (and skip the # of rows in the rowgroup)
  while((unsigned int)col >= scanners.size()) {
    scanners.push_back(std::shared_ptr<parquet::Scanner>());
    // If it doesn't exist, it's the rowId as of the last nextRowGroup call
    colRows.push_back(rowGroupStartRowId);
    colNulls.push_back(false);
    colIntValues.push_back(0);
    colDoubleValues.push_back(0);
    colByteArrayValues.push_back(parquet::ByteArray());
  }

  if(scanners[col].get() == NULL) {
    std::shared_ptr<parquet::ColumnReader> colReader = rowGroup->Column(col);
    scanners[col] = parquet::Scanner::Make(colReader);
    // TODO: potentially skip rows if rowsLeftInRowGroup != rowGroupMetadata->num_rows()
  }

  // Actually fetch a value, stash data in colRows, colNulls, colValues
  if(colRows[col] != rowId) {
    // We may need to skip some records, eg, a query like
    // SELECT a WHERE b = 10
    // may have read b, but skipped a until b matches the predicate.
    bool wasNull = false;
    while(colRows[col] + 1 < rowId) {
      switch(types[col]) {
        case parquet::Type::INT32:
        {
          parquet::Int32Scanner* s = (parquet::Int32Scanner*)scanners[col].get();
          int rv = 0;
          s->NextValue(&rv, &wasNull);
          break;
        }
        case parquet::Type::FLOAT:
        {
          parquet::FloatScanner* s = (parquet::FloatScanner*)scanners[col].get();
          float rv = 0;
          s->NextValue(&rv, &wasNull);
          break;
        }
        case parquet::Type::DOUBLE:
        {
          parquet::DoubleScanner* s = (parquet::DoubleScanner*)scanners[col].get();
          double rv = 0;
          s->NextValue(&rv, &wasNull);
          break;
        }
        case parquet::Type::BYTE_ARRAY:
        {
          parquet::ByteArrayScanner* s = (parquet::ByteArrayScanner*)scanners[col].get();
          parquet::ByteArray ba;
          s->NextValue(&ba, &wasNull);
          break;
        }
        case parquet::Type::INT96:
        {
          parquet::Int96Scanner* s = (parquet::Int96Scanner*)scanners[col].get();
          parquet::Int96 rv;
          s->NextValue(&rv, &wasNull);
          break;
        }
        case parquet::Type::INT64:
        {
          parquet::Int64Scanner* s = (parquet::Int64Scanner*)scanners[col].get();
          long rv = 0;
          s->NextValue(&rv, &wasNull);
          break;
        }
        case parquet::Type::BOOLEAN:
        {
          parquet::BoolScanner* s = (parquet::BoolScanner*)scanners[col].get();
          bool rv = false;
          s->NextValue(&rv, &wasNull);
          break;
        }
        case parquet::Type::FIXED_LEN_BYTE_ARRAY:
        {
          parquet::FixedLenByteArrayScanner* s = (parquet::FixedLenByteArrayScanner*)scanners[col].get();
          parquet::FixedLenByteArray flba;
          s->NextValue(&flba, &wasNull);
          break;
        }
        default:
          // Should be impossible to get here as we should have forbidden this at
          // CREATE time -- maybe file changed underneath us?
          std::ostringstream ss;
          ss << __FILE__ << ":" << __LINE__ << ": column " << col << " has unsupported type: " <<
            parquet::TypeToString(types[col]);
          throw std::invalid_argument(ss.str());
        break;

      }
      colRows[col]++;
    }

    colRows[col] = rowId;
    wasNull = false;

    switch(types[col]) {
      case parquet::Type::INT32:
      {
        parquet::Int32Scanner* s = (parquet::Int32Scanner*)scanners[col].get();
        int rv = 0;
        if(s->NextValue(&rv, &wasNull)) {
          colIntValues[col] = rv;
        } else {
          throw std::invalid_argument("unexpectedly lacking a next value");
        }
        break;
      }
      case parquet::Type::FLOAT:
      {
        parquet::FloatScanner* s = (parquet::FloatScanner*)scanners[col].get();
        float rv = 0;
        if(s->NextValue(&rv, &wasNull)) {
          colDoubleValues[col] = rv;
        } else {
          throw std::invalid_argument("unexpectedly lacking a next value");
        }
        break;
      }
      case parquet::Type::DOUBLE:
      {
        parquet::DoubleScanner* s = (parquet::DoubleScanner*)scanners[col].get();
        double rv = 0;
        if(s->NextValue(&rv, &wasNull)) {
          colDoubleValues[col] = rv;
        } else {
          throw std::invalid_argument("unexpectedly lacking a next value");
        }
        break;
      }
      case parquet::Type::BYTE_ARRAY:
      {
        parquet::ByteArrayScanner* s = (parquet::ByteArrayScanner*)scanners[col].get();
        if(!s->NextValue(&colByteArrayValues[col], &wasNull)) {
          throw std::invalid_argument("unexpectedly lacking a next value");
        }
        break;
      }
      case parquet::Type::INT96:
      {
        // INT96 tracks a date with nanosecond precision, convert to ms since epoch.
        // ...see https://github.com/apache/parquet-format/pull/49 for more
        //
        // First 8 bytes: nanoseconds into the day
        // Last 4 bytes: Julian day
        // To get nanoseconds since the epoch:
        // (julian_day - 2440588) * (86400 * 1000 * 1000 * 1000) + nanoseconds
        parquet::Int96Scanner* s = (parquet::Int96Scanner*)scanners[col].get();
        parquet::Int96 rv;
        rv.value[0] = 0;
        rv.value[1] = 0;
        rv.value[2] = 0;
        if(s->NextValue(&rv, &wasNull)) {
          __int128 ns = rv.value[0] + ((unsigned long)rv.value[1] << 32);
          __int128 julianDay = rv.value[2];
          __int128 nsSinceEpoch = (julianDay - 2440588);
          nsSinceEpoch *= 86400;
          nsSinceEpoch *= 1000 * 1000 * 1000;
          nsSinceEpoch += ns;
          nsSinceEpoch /= 1000000;

          colIntValues[col] = nsSinceEpoch;
        } else {
          throw std::invalid_argument("unexpectedly lacking a next value");
        }
        break;
      }
      case parquet::Type::INT64:
      {
        parquet::Int64Scanner* s = (parquet::Int64Scanner*)scanners[col].get();
        long rv = 0;
        if(s->NextValue(&rv, &wasNull)) {
          colIntValues[col] = rv;
        } else {
          throw std::invalid_argument("unexpectedly lacking a next value");
        }
        break;
      }

      case parquet::Type::BOOLEAN:
      {
        parquet::BoolScanner* s = (parquet::BoolScanner*)scanners[col].get();
        bool rv = false;
        if(s->NextValue(&rv, &wasNull)) {
          colIntValues[col] = rv ? 1 : 0;
        } else {
          throw std::invalid_argument("unexpectedly lacking a next value");
        }
        break;
      }
      case parquet::Type::FIXED_LEN_BYTE_ARRAY:
      {
        parquet::FixedLenByteArrayScanner* s = (parquet::FixedLenByteArrayScanner*)scanners[col].get();
        parquet::FixedLenByteArray flba;
        if(s->NextValue(&flba, &wasNull)) {
          colByteArrayValues[col].ptr = flba.ptr;
          // TODO: cache this
          colByteArrayValues[col].len = rowGroupMetadata->schema()->Column(col)->type_length();
        } else {
          throw std::invalid_argument("unexpectedly lacking a next value");
        }
        break;
      }
      default:
        // Should be impossible to get here as we should have forbidden this at
        // CREATE time -- maybe file changed underneath us?
        std::ostringstream ss;
        ss << __FILE__ << ":" << __LINE__ << ": column " << col << " has unsupported type: " <<
          parquet::TypeToString(types[col]);
        throw std::invalid_argument(ss.str());
      break;
    }

    colNulls[col] = wasNull;
  }
}

// Reports whether the cached value of column `col` is null.
//
// col == -1 is the rowid pseudo-column, which is never null. For any other
// column the answer is the flag last stored by ensureColumn(col).
bool ParquetCursor::isNull(int col) {
  return col != -1 && colNulls[col];
}

int ParquetCursor::getInt32(int col) {
  return colIntValues[col];
}

long ParquetCursor::getInt64(int col) {
  return colIntValues[col];
}

double ParquetCursor::getDouble(int col) {
  return colDoubleValues[col];
}

// Returns a pointer to the cursor's own cache slot for column `col`.
// ensureColumn(col) fills this slot for BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY
// columns and overwrites it on later reads, so the pointee changes as the
// cursor moves. No bounds or type check is done here.
parquet::ByteArray* ParquetCursor::getByteArray(int col) {
  parquet::ByteArray& cached = colByteArrayValues[col];
  return &cached;
}

// Returns the physical parquet type of column `col`, as recorded from the
// row group schema by nextRowGroup(). No bounds check is done here.
parquet::Type::type ParquetCursor::getPhysicalType(int col) {
  return this->types[col];
}

// Returns the logical parquet type of column `col`, as recorded from the
// row group schema by nextRowGroup(). No bounds check is done here.
parquet::LogicalType::type ParquetCursor::getLogicalType(int col) {
  return this->logicalTypes[col];
}

// Closes the underlying parquet file reader, if one has been opened.
// Does nothing when the reader is still null (e.g. during construction).
void ParquetCursor::close() {
  if(reader == NULL)
    return;

  reader->Close();
}

// Rewinds the cursor and installs a new set of constraints.
//
// Any previously opened reader is closed, the file named by table->file is
// opened afresh, and the position state is set to "before the first row of
// the first row group" so that the next call to next() loads row group 0.
void ParquetCursor::reset(std::vector<Constraint> constraints) {
  // Release the reader from the previous scan, if any
  close();

  this->constraints = constraints;
  rowId = -1;

  // TODO: consider having a long lived handle in ParquetTable that can be borrowed
  // without incurring the cost of opening the file from scratch twice
  reader = parquet::ParquetFileReader::OpenFile(table->file.data());

  // No row group is loaded yet; zero rows left forces next() to fetch one.
  // TODO: handle the case where rowgroups have disjoint schemas?
  // TODO: or at least, fail fast if detected
  rowsLeftInRowGroup = 0;
  rowGroupStartRowId = -1;
  rowGroupSize = 0;
  rowGroupId = -1;

  // File-wide totals used by eof() and nextRowGroup()
  numRowGroups = reader->metadata()->num_row_groups();
  numRows = reader->metadata()->num_rows();
}

// Returns the table this cursor was created for.
ParquetTable* ParquetCursor::getTable() {
  return this->table;
}
Very rough first cut supports int32, double, strings. 2018-03-03 20:44:01 +00:00			`#include "parquet_cursor.h"`

			`ParquetCursor::ParquetCursor(ParquetTable* table) {`
			`this->table = table;`
Add query test framework, fix xFilter 2018-03-05 02:05:26 +00:00			`reader = NULL;`
Scaffolding for in-extension filtering Supports IS NULL and IS NOT NULL checks 2018-03-11 17:58:10 +00:00			`reset(std::vector<Constraint>());`
Very rough first cut supports int32, double, strings. 2018-03-03 20:44:01 +00:00			`}`

Add rowgroup filtering for rowid 2018-03-13 00:42:50 +00:00			`bool ParquetCursor::currentRowGroupSatisfiesRowIdFilter(Constraint constraint) {`
			`int64_t target = constraint.getInt();`
			`switch(constraint.getOperator()) {`
			`case IsNull:`
			`return false;`
			`case Is:`
			`case Equal:`
			`return target >= rowId && target < rowId + rowGroupSize;`
			`case GreaterThan:`
			`// rowId > target`
			`return rowId + rowGroupSize > target;`
			`case GreaterThanOrEqual:`
			`// rowId >= target`
			`return rowId + rowGroupSize >= rowId;`
			`case LessThan:`
			`return target > rowId;`
			`case LessThanOrEqual:`
			`return target >= rowId;`
			`default:`
			`return true;`
			`}`
			`}`

Scaffolding for row group filters, tests rowid is special since its column index is -1, so add explicit tests around it 2018-03-11 19:43:40 +00:00			`// Return true if it is _possible_ that the current`
			`// rowgroup satisfies the constraints. Only return false`
			`// if it definitely does not.`
			`//`
			`// This avoids opening rowgroups that can't return useful`
			`// data, which provides substantial performance benefits.`
			`bool ParquetCursor::currentRowGroupSatisfiesFilter() {`
			`for(unsigned int i = 0; i < constraints.size(); i++) {`
			`int column = constraints[i].getColumn();`
			`int op = constraints[i].getOperator();`
			`bool rv = true;`

More tests cases to deal with affinity I'm not sure how these manifest - whether SQLite retypes them based on column affinity before we see them, or whether they're provided as is. 2018-03-11 23:18:44 +00:00			`if(column == -1) {`
Add rowgroup filtering for rowid 2018-03-13 00:42:50 +00:00			`rv = currentRowGroupSatisfiesRowIdFilter(constraints[i]);`
More tests cases to deal with affinity I'm not sure how these manifest - whether SQLite retypes them based on column affinity before we see them, or whether they're provided as is. 2018-03-11 23:18:44 +00:00			`} else {`
			`// printf("column = %d\n", column);`
			`// std::unique_ptr<parquet::ColumnChunkMetaData> md = rowGroupMetadata->ColumnChunk(column);`

			`if(op == IsNull) {`
			`} else if(op == IsNotNull) {`
			`}`
Scaffolding for row group filters, tests rowid is special since its column index is -1, so add explicit tests around it 2018-03-11 19:43:40 +00:00			`}`

			`if(!rv)`
			`return false;`
			`}`

			`return true;`
			`}`


Don't segfault on full table scan 2018-03-04 22:49:19 +00:00			`bool ParquetCursor::nextRowGroup() {`
Scaffolding for row group filters, tests rowid is special since its column index is -1, so add explicit tests around it 2018-03-11 19:43:40 +00:00			`start:`
Add rowgroup filtering for rowid 2018-03-13 00:42:50 +00:00			`// Ensure that rowId points at the start of this rowGroup (eg, in the case where`
			`// we skipped an entire row group).`
			`rowId = rowGroupStartRowId + rowGroupSize;`

			`if((rowGroupId + 1) >= numRowGroups) {`
Don't segfault on full table scan 2018-03-04 22:49:19 +00:00			`return false;`
Add rowgroup filtering for rowid 2018-03-13 00:42:50 +00:00			`}`
Very rough first cut supports int32, double, strings. 2018-03-03 20:44:01 +00:00
`ensureColumn` catches up when rows are skipped 2018-03-05 03:29:35 +00:00			`rowGroupStartRowId = rowId;`
Very rough first cut supports int32, double, strings. 2018-03-03 20:44:01 +00:00			`rowGroupId++;`
Fix when last rowgroup is not same size as first ...change test data to use 99 rows, so that when we have rowgroup size 10 we exercise this code. 2018-03-11 19:04:38 +00:00			`rowGroupMetadata = reader->metadata()->RowGroup(rowGroupId);`
Add rowgroup filtering for rowid 2018-03-13 00:42:50 +00:00			`rowGroupSize = rowsLeftInRowGroup = rowGroupMetadata->num_rows();`
Very rough first cut supports int32, double, strings. 2018-03-03 20:44:01 +00:00			`rowGroup = reader->RowGroup(rowGroupId);`
			`for(unsigned int i = 0; i < scanners.size(); i++)`
			`scanners[i] = NULL;`

			`while(types.size() < (unsigned int)rowGroupMetadata->num_columns()) {`
			`types.push_back(rowGroupMetadata->schema()->Column(0)->physical_type());`
			`}`

Support BLOBs 2018-03-04 22:20:28 +00:00			`while(logicalTypes.size() < (unsigned int)rowGroupMetadata->num_columns()) {`
			`logicalTypes.push_back(rowGroupMetadata->schema()->Column(0)->logical_type());`
			`}`

Very rough first cut supports int32, double, strings. 2018-03-03 20:44:01 +00:00			`for(unsigned int i = 0; i < (unsigned int)rowGroupMetadata->num_columns(); i++) {`
			`types[i] = rowGroupMetadata->schema()->Column(i)->physical_type();`
Support BLOBs 2018-03-04 22:20:28 +00:00			`logicalTypes[i] = rowGroupMetadata->schema()->Column(i)->logical_type();`
Very rough first cut supports int32, double, strings. 2018-03-03 20:44:01 +00:00			`}`
Don't segfault on full table scan 2018-03-04 22:49:19 +00:00
`ensureColumn` catches up when rows are skipped 2018-03-05 03:29:35 +00:00			`for(unsigned int i = 0; i < colRows.size(); i++) {`
			`colRows[i] = rowId;`
			`}`

Add rowgroup filtering for rowid 2018-03-13 00:42:50 +00:00			`// Increment rowId so currentRowGroupSatisfiesRowIdFilter can access it;`
			`// it'll get decremented by our caller`
			`rowId++;`
Scaffolding for row group filters, tests rowid is special since its column index is -1, so add explicit tests around it 2018-03-11 19:43:40 +00:00			`if(!currentRowGroupSatisfiesFilter())`
			`goto start;`

Don't segfault on full table scan 2018-03-04 22:49:19 +00:00			`return true;`
Very rough first cut supports int32, double, strings. 2018-03-03 20:44:01 +00:00			`}`

Scaffolding for in-extension filtering Supports IS NULL and IS NOT NULL checks 2018-03-11 17:58:10 +00:00			`// Return true if it is _possible_ that the current`
			`// row satisfies the constraints. Only return false`
			`// if it definitely does not.`
Scaffolding for row group filters, tests rowid is special since its column index is -1, so add explicit tests around it 2018-03-11 19:43:40 +00:00			`//`
			`// This avoids pointless transitions between the SQLite VM`
			`// and the extension, which can add up on a dataset of tens`
			`// of millions of rows.`
Scaffolding for in-extension filtering Supports IS NULL and IS NOT NULL checks 2018-03-11 17:58:10 +00:00			`bool ParquetCursor::currentRowSatisfiesFilter() {`
			`for(unsigned int i = 0; i < constraints.size(); i++) {`
Scaffolding for row group filters, tests rowid is special since its column index is -1, so add explicit tests around it 2018-03-11 19:43:40 +00:00			`bool rv = true;`
Scaffolding for in-extension filtering Supports IS NULL and IS NOT NULL checks 2018-03-11 17:58:10 +00:00			`int column = constraints[i].getColumn();`
			`ensureColumn(column);`
			`int op = constraints[i].getOperator();`

			`if(op == IsNull) {`
Scaffolding for row group filters, tests rowid is special since its column index is -1, so add explicit tests around it 2018-03-11 19:43:40 +00:00			`rv = isNull(column);`
Scaffolding for in-extension filtering Supports IS NULL and IS NOT NULL checks 2018-03-11 17:58:10 +00:00			`} else if(op == IsNotNull) {`
Scaffolding for row group filters, tests rowid is special since its column index is -1, so add explicit tests around it 2018-03-11 19:43:40 +00:00			`rv = !isNull(column);`
Scaffolding for in-extension filtering Supports IS NULL and IS NOT NULL checks 2018-03-11 17:58:10 +00:00			`}`
Scaffolding for row group filters, tests rowid is special since its column index is -1, so add explicit tests around it 2018-03-11 19:43:40 +00:00
			`if(!rv)`
			`return false;`
Scaffolding for in-extension filtering Supports IS NULL and IS NOT NULL checks 2018-03-11 17:58:10 +00:00			`}`
			`return true;`
			`}`

Very rough first cut supports int32, double, strings. 2018-03-03 20:44:01 +00:00			`void ParquetCursor::next() {`
Scaffolding for in-extension filtering Supports IS NULL and IS NOT NULL checks 2018-03-11 17:58:10 +00:00			`start:`
Don't segfault on full table scan 2018-03-04 22:49:19 +00:00			`if(rowsLeftInRowGroup == 0) {`
			`if(!nextRowGroup()) {`
			`// put rowId over the edge so eof returns true`
Add rowgroup filtering for rowid 2018-03-13 00:42:50 +00:00			`rowId = numRows + 1;`
Don't segfault on full table scan 2018-03-04 22:49:19 +00:00			`return;`
Add rowgroup filtering for rowid 2018-03-13 00:42:50 +00:00			`} else {`
			`// After a successful nextRowGroup, rowId is pointing at the current row. Make it`
			`// point before so the rest of the logic works out.`
			`rowId--;`
Don't segfault on full table scan 2018-03-04 22:49:19 +00:00			`}`
			`}`

Very rough first cut supports int32, double, strings. 2018-03-03 20:44:01 +00:00			`rowsLeftInRowGroup--;`
			`rowId++;`
Scaffolding for in-extension filtering Supports IS NULL and IS NOT NULL checks 2018-03-11 17:58:10 +00:00			`if(!currentRowSatisfiesFilter())`
			`goto start;`
Very rough first cut supports int32, double, strings. 2018-03-03 20:44:01 +00:00			`}`

			`int ParquetCursor::getRowId() {`
			`return rowId;`
			`}`

			`bool ParquetCursor::eof() {`
			`return rowId >= numRows;`
			`}`

			`void ParquetCursor::ensureColumn(int col) {`
Scaffolding for in-extension filtering Supports IS NULL and IS NOT NULL checks 2018-03-11 17:58:10 +00:00			`// -1 signals rowid, which is trivially available`
			`if(col == -1)`
			`return;`

Very rough first cut supports int32, double, strings. 2018-03-03 20:44:01 +00:00			`// need to ensure a scanner exists (and skip the # of rows in the rowgroup)`
			`while((unsigned int)col >= scanners.size()) {`
			`scanners.push_back(std::shared_ptr<parquet::Scanner>());`
`ensureColumn` catches up when rows are skipped 2018-03-05 03:29:35 +00:00			`// If it doesn't exist, it's the rowId as of the last nextRowGroup call`
			`colRows.push_back(rowGroupStartRowId);`
Very rough first cut supports int32, double, strings. 2018-03-03 20:44:01 +00:00			`colNulls.push_back(false);`
			`colIntValues.push_back(0);`
			`colDoubleValues.push_back(0);`
			`colByteArrayValues.push_back(parquet::ByteArray());`
			`}`

			`if(scanners[col].get() == NULL) {`
			`std::shared_ptr<parquet::ColumnReader> colReader = rowGroup->Column(col);`
			`scanners[col] = parquet::Scanner::Make(colReader);`
			`// TODO: potentially skip rows if rowsLeftInRowGroup != rowGroupMetadata->num_rows()`
			`}`

			`// Actually fetch a value, stash data in colRows, colNulls, colValues`
			`if(colRows[col] != rowId) {`
`ensureColumn` catches up when rows are skipped 2018-03-05 03:29:35 +00:00			`// We may need to skip some records, eg, a query like`
			`// SELECT a WHERE b = 10`
			`// may have read b, but skipped a until b matches the predicate.`
Very rough first cut supports int32, double, strings. 2018-03-03 20:44:01 +00:00			`bool wasNull = false;`
`ensureColumn` catches up when rows are skipped 2018-03-05 03:29:35 +00:00			`while(colRows[col] + 1 < rowId) {`
			`switch(types[col]) {`
			`case parquet::Type::INT32:`
			`{`
			`parquet::Int32Scanner* s = (parquet::Int32Scanner*)scanners[col].get();`
			`int rv = 0;`
			`s->NextValue(&rv, &wasNull);`
			`break;`
			`}`
			`case parquet::Type::FLOAT:`
			`{`
			`parquet::FloatScanner* s = (parquet::FloatScanner*)scanners[col].get();`
			`float rv = 0;`
			`s->NextValue(&rv, &wasNull);`
			`break;`
			`}`
			`case parquet::Type::DOUBLE:`
			`{`
			`parquet::DoubleScanner* s = (parquet::DoubleScanner*)scanners[col].get();`
			`double rv = 0;`
			`s->NextValue(&rv, &wasNull);`
			`break;`
			`}`
			`case parquet::Type::BYTE_ARRAY:`
			`{`
			`parquet::ByteArrayScanner* s = (parquet::ByteArrayScanner*)scanners[col].get();`
			`parquet::ByteArray ba;`
			`s->NextValue(&ba, &wasNull);`
			`break;`
			`}`
			`case parquet::Type::INT96:`
			`{`
			`parquet::Int96Scanner* s = (parquet::Int96Scanner*)scanners[col].get();`
			`parquet::Int96 rv;`
			`s->NextValue(&rv, &wasNull);`
			`break;`
			`}`
			`case parquet::Type::INT64:`
			`{`
			`parquet::Int64Scanner* s = (parquet::Int64Scanner*)scanners[col].get();`
			`long rv = 0;`
			`s->NextValue(&rv, &wasNull);`
			`break;`
			`}`
			`case parquet::Type::BOOLEAN:`
			`{`
			`parquet::BoolScanner* s = (parquet::BoolScanner*)scanners[col].get();`
			`bool rv = false;`
			`s->NextValue(&rv, &wasNull);`
			`break;`
			`}`
			`case parquet::Type::FIXED_LEN_BYTE_ARRAY:`
			`{`
			`parquet::FixedLenByteArrayScanner* s = (parquet::FixedLenByteArrayScanner*)scanners[col].get();`
			`parquet::FixedLenByteArray flba;`
			`s->NextValue(&flba, &wasNull);`
			`break;`
			`}`
			`default:`
			`// Should be impossible to get here as we should have forbidden this at`
			`// CREATE time -- maybe file changed underneath us?`
			`std::ostringstream ss;`
			`ss << __FILE__ << ":" << __LINE__ << ": column " << col << " has unsupported type: " <<`
			`parquet::TypeToString(types[col]);`
			`throw std::invalid_argument(ss.str());`
			`break;`

			`}`
			`colRows[col]++;`
			`}`

			`colRows[col] = rowId;`
			`wasNull = false;`
Very rough first cut supports int32, double, strings. 2018-03-03 20:44:01 +00:00
			`switch(types[col]) {`
			`case parquet::Type::INT32:`
			`{`
			`parquet::Int32Scanner* s = (parquet::Int32Scanner*)scanners[col].get();`
			`int rv = 0;`
			`if(s->NextValue(&rv, &wasNull)) {`
			`colIntValues[col] = rv;`
			`} else {`
			`throw std::invalid_argument("unexpectedly lacking a next value");`
			`}`
Boolean, INT96, INT64 2018-03-04 01:00:50 +00:00			`break;`
Very rough first cut supports int32, double, strings. 2018-03-03 20:44:01 +00:00			`}`
float support 2018-03-04 01:57:09 +00:00			`case parquet::Type::FLOAT:`
			`{`
			`parquet::FloatScanner* s = (parquet::FloatScanner*)scanners[col].get();`
			`float rv = 0;`
			`if(s->NextValue(&rv, &wasNull)) {`
			`colDoubleValues[col] = rv;`
			`} else {`
			`throw std::invalid_argument("unexpectedly lacking a next value");`
			`}`
			`break;`
			`}`
Very rough first cut supports int32, double, strings. 2018-03-03 20:44:01 +00:00			`case parquet::Type::DOUBLE:`
			`{`
			`parquet::DoubleScanner* s = (parquet::DoubleScanner*)scanners[col].get();`
			`double rv = 0;`
			`if(s->NextValue(&rv, &wasNull)) {`
			`colDoubleValues[col] = rv;`
			`} else {`
			`throw std::invalid_argument("unexpectedly lacking a next value");`
			`}`
Boolean, INT96, INT64 2018-03-04 01:00:50 +00:00			`break;`
Very rough first cut supports int32, double, strings. 2018-03-03 20:44:01 +00:00			`}`
			`case parquet::Type::BYTE_ARRAY:`
			`{`
			`parquet::ByteArrayScanner* s = (parquet::ByteArrayScanner*)scanners[col].get();`
			`if(!s->NextValue(&colByteArrayValues[col], &wasNull)) {`
			`throw std::invalid_argument("unexpectedly lacking a next value");`
			`}`
Boolean, INT96, INT64 2018-03-04 01:00:50 +00:00			`break;`
			`}`
			`case parquet::Type::INT96:`
			`{`
			`// INT96 tracks a date with nanosecond precision, convert to ms since epoch.`
			`// ...see https://github.com/apache/parquet-format/pull/49 for more`
			`//`
			`// First 8 bytes: nanoseconds into the day`
			`// Last 4 bytes: Julian day`
			`// To get nanoseconds since the epoch:`
			`// (julian_day - 2440588) * (86400 * 1000 * 1000 * 1000) + nanoseconds`
			`parquet::Int96Scanner* s = (parquet::Int96Scanner*)scanners[col].get();`
			`parquet::Int96 rv;`
			`rv.value[0] = 0;`
			`rv.value[1] = 0;`
			`rv.value[2] = 0;`
			`if(s->NextValue(&rv, &wasNull)) {`
			`__int128 ns = rv.value[0] + ((unsigned long)rv.value[1] << 32);`
			`__int128 julianDay = rv.value[2];`
			`__int128 nsSinceEpoch = (julianDay - 2440588);`
			`nsSinceEpoch *= 86400;`
			`nsSinceEpoch *= 1000 * 1000 * 1000;`
			`nsSinceEpoch += ns;`
			`nsSinceEpoch /= 1000000;`

			`colIntValues[col] = nsSinceEpoch;`
			`} else {`
			`throw std::invalid_argument("unexpectedly lacking a next value");`
			`}`
			`break;`
			`}`
			`case parquet::Type::INT64:`
			`{`
			`parquet::Int64Scanner* s = (parquet::Int64Scanner*)scanners[col].get();`
			`long rv = 0;`
			`if(s->NextValue(&rv, &wasNull)) {`
			`colIntValues[col] = rv;`
			`} else {`
			`throw std::invalid_argument("unexpectedly lacking a next value");`
			`}`
			`break;`
Very rough first cut supports int32, double, strings. 2018-03-03 20:44:01 +00:00			`}`

			`case parquet::Type::BOOLEAN:`
Boolean, INT96, INT64 2018-03-04 01:00:50 +00:00			`{`
			`parquet::BoolScanner* s = (parquet::BoolScanner*)scanners[col].get();`
			`bool rv = false;`
			`if(s->NextValue(&rv, &wasNull)) {`
			`colIntValues[col] = rv ? 1 : 0;`
			`} else {`
			`throw std::invalid_argument("unexpectedly lacking a next value");`
			`}`
			`break;`
			`}`
Very rough first cut supports int32, double, strings. 2018-03-03 20:44:01 +00:00			`case parquet::Type::FIXED_LEN_BYTE_ARRAY:`
Support BLOBs 2018-03-04 22:20:28 +00:00			`{`
			`parquet::FixedLenByteArrayScanner* s = (parquet::FixedLenByteArrayScanner*)scanners[col].get();`
			`parquet::FixedLenByteArray flba;`
			`if(s->NextValue(&flba, &wasNull)) {`
			`colByteArrayValues[col].ptr = flba.ptr;`
			`// TODO: cache this`
			`colByteArrayValues[col].len = rowGroupMetadata->schema()->Column(col)->type_length();`
			`} else {`
			`throw std::invalid_argument("unexpectedly lacking a next value");`
			`}`
			`break;`
			`}`
Very rough first cut supports int32, double, strings. 2018-03-03 20:44:01 +00:00			`default:`
Boolean, INT96, INT64 2018-03-04 01:00:50 +00:00			`// Should be impossible to get here as we should have forbidden this at`
			`// CREATE time -- maybe file changed underneath us?`
			`std::ostringstream ss;`
			`ss << __FILE__ << ":" << __LINE__ << ": column " << col << " has unsupported type: " <<`
			`parquet::TypeToString(types[col]);`
			`throw std::invalid_argument(ss.str());`
Very rough first cut supports int32, double, strings. 2018-03-03 20:44:01 +00:00			`break;`
			`}`

			`colNulls[col] = wasNull;`
			`}`
			`}`

			`bool ParquetCursor::isNull(int col) {`
Scaffolding for in-extension filtering Supports IS NULL and IS NOT NULL checks 2018-03-11 17:58:10 +00:00			`// -1 is rowid, which is trivially non null`
			`if(col == -1)`
			`return false;`

Very rough first cut supports int32, double, strings. 2018-03-03 20:44:01 +00:00			`return colNulls[col];`
			`}`

Boolean, INT96, INT64 2018-03-04 01:00:50 +00:00			`int ParquetCursor::getInt32(int col) {`
			`return colIntValues[col];`
			`}`

			`long ParquetCursor::getInt64(int col) {`
Very rough first cut supports int32, double, strings. 2018-03-03 20:44:01 +00:00			`return colIntValues[col];`
			`}`

			`double ParquetCursor::getDouble(int col) {`
			`return colDoubleValues[col];`
			`}`

			`parquet::ByteArray* ParquetCursor::getByteArray(int col) {`
			`return &colByteArrayValues[col];`
			`}`

			`parquet::Type::type ParquetCursor::getPhysicalType(int col) {`
			`return types[col];`
			`}`
Support BLOBs 2018-03-04 22:20:28 +00:00
			`parquet::LogicalType::type ParquetCursor::getLogicalType(int col) {`
			`return logicalTypes[col];`
			`}`
Add query test framework, fix xFilter 2018-03-05 02:05:26 +00:00
			`void ParquetCursor::close() {`
			`if(reader != NULL) {`
			`reader->Close();`
			`}`
			`}`

Scaffolding for in-extension filtering Supports IS NULL and IS NOT NULL checks 2018-03-11 17:58:10 +00:00			`void ParquetCursor::reset(std::vector<Constraint> constraints) {`
Add query test framework, fix xFilter 2018-03-05 02:05:26 +00:00			`close();`
Scaffolding for in-extension filtering Supports IS NULL and IS NOT NULL checks 2018-03-11 17:58:10 +00:00			`this->constraints = constraints;`
Add query test framework, fix xFilter 2018-03-05 02:05:26 +00:00			`rowId = -1;`
			`// TODO: consider having a long lived handle in ParquetTable that can be borrowed`
			`// without incurring the cost of opening the file from scratch twice`
			`reader = parquet::ParquetFileReader::OpenFile(table->file.data());`

			`rowGroupId = -1;`
Add rowgroup filtering for rowid 2018-03-13 00:42:50 +00:00			`rowGroupSize = 0;`
			`rowGroupStartRowId = -1;`
Add query test framework, fix xFilter 2018-03-05 02:05:26 +00:00			`// TODO: handle the case where rowgroups have disjoint schemas?`
			`// TODO: or at least, fail fast if detected`
			`rowsLeftInRowGroup = 0;`

			`numRows = reader->metadata()->num_rows();`
			`numRowGroups = reader->metadata()->num_row_groups();`
			`}`
Code to pretty print constraints 2018-03-10 15:59:53 +00:00
			`ParquetTable* ParquetCursor::getTable() { return table; }`