diff --git a/parquet-generator/100-rows-1.parquet b/parquet-generator/100-rows-1.parquet new file mode 100644 index 0000000..922b888 Binary files /dev/null and b/parquet-generator/100-rows-1.parquet differ diff --git a/parquet-generator/100-rows-10.parquet b/parquet-generator/100-rows-10.parquet new file mode 100644 index 0000000..0a2e019 Binary files /dev/null and b/parquet-generator/100-rows-10.parquet differ diff --git a/parquet-generator/100-rows-nulls.parquet b/parquet-generator/100-rows-nulls.parquet new file mode 100644 index 0000000..7a4fb2f Binary files /dev/null and b/parquet-generator/100-rows-nulls.parquet differ diff --git a/parquet-generator/Pipfile b/parquet-generator/Pipfile new file mode 100644 index 0000000..a005a41 --- /dev/null +++ b/parquet-generator/Pipfile @@ -0,0 +1,20 @@ +[[source]] + +verify_ssl = true +url = "https://pypi.python.org/simple" +name = "pypi" + + +[requires] + +python_version = "3.5" + + +[packages] + +pyarrow = "*" +pylint = "*" + + +[dev-packages] + diff --git a/parquet-generator/Pipfile.lock b/parquet-generator/Pipfile.lock new file mode 100644 index 0000000..f5074a7 --- /dev/null +++ b/parquet-generator/Pipfile.lock @@ -0,0 +1,152 @@ +{ + "_meta": { + "hash": { + "sha256": "52f669bf06acbda24a4b6aaf000c3eed6946174e833baeffc85802cc63bbe342" + }, + "host-environment-markers": { + "implementation_name": "cpython", + "implementation_version": "3.5.2", + "os_name": "posix", + "platform_machine": "x86_64", + "platform_python_implementation": "CPython", + "platform_release": "4.9.3-040903-generic", + "platform_system": "Linux", + "platform_version": "#201701120631 SMP Thu Jan 12 11:33:59 UTC 2017", + "python_full_version": "3.5.2", + "python_version": "3.5", + "sys_platform": "linux" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.5" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.python.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "astroid": { + "hashes": [ + 
"sha256:db5cfc9af6e0b60cd07c19478fb54021fc20d2d189882fbcbc94fc69a8aecc58", + "sha256:f0a0e386dbca9f93ea9f3ea6f32b37a24720502b7baa9cb17c3976a680d43a06" + ], + "version": "==1.6.1" + }, + "isort": { + "hashes": [ + "sha256:ec9ef8f4a9bc6f71eec99e1806bfa2de401650d996c59330782b89a5555c1497", + "sha256:1153601da39a25b14ddc54955dbbacbb6b2d19135386699e2ad58517953b34af", + "sha256:b9c40e9750f3d77e6e4d441d8b0266cf555e7cdabdcff33c4fd06366ca761ef8" + ], + "version": "==4.3.4" + }, + "lazy-object-proxy": { + "hashes": [ + "sha256:209615b0fe4624d79e50220ce3310ca1a9445fd8e6d3572a896e7f9146bbf019", + "sha256:1b668120716eb7ee21d8a38815e5eb3bb8211117d9a90b0f8e21722c0758cc39", + "sha256:cb924aa3e4a3fb644d0c463cad5bc2572649a6a3f68a7f8e4fbe44aaa6d77e4c", + "sha256:2c1b21b44ac9beb0fc848d3993924147ba45c4ebc24be19825e57aabbe74a99e", + "sha256:320ffd3de9699d3892048baee45ebfbbf9388a7d65d832d7e580243ade426d2b", + "sha256:2df72ab12046a3496a92476020a1a0abf78b2a7db9ff4dc2036b8dd980203ae6", + "sha256:27ea6fd1c02dcc78172a82fc37fcc0992a94e4cecf53cb6d73f11749825bd98b", + "sha256:e5b9e8f6bda48460b7b143c3821b21b452cb3a835e6bbd5dd33aa0c8d3f5137d", + "sha256:7661d401d60d8bf15bb5da39e4dd72f5d764c5aff5a86ef52a042506e3e970ff", + "sha256:61a6cf00dcb1a7f0c773ed4acc509cb636af2d6337a08f362413c76b2b47a8dd", + "sha256:bd6292f565ca46dee4e737ebcc20742e3b5be2b01556dafe169f6c65d088875f", + "sha256:933947e8b4fbe617a51528b09851685138b49d511af0b6c0da2539115d6d4514", + "sha256:d0fc7a286feac9077ec52a927fc9fe8fe2fabab95426722be4c953c9a8bede92", + "sha256:7f3a2d740291f7f2c111d86a1c4851b70fb000a6c8883a59660d95ad57b9df35", + "sha256:5276db7ff62bb7b52f77f1f51ed58850e315154249aceb42e7f4c611f0f847ff", + "sha256:94223d7f060301b3a8c09c9b3bc3294b56b2188e7d8179c762a1cda72c979252", + "sha256:6ae6c4cb59f199d8827c5a07546b2ab7e85d262acaccaacd49b62f53f7c456f7", + "sha256:f460d1ceb0e4a5dcb2a652db0904224f367c9b3c1470d5a7683c0480e582468b", + "sha256:e81ebf6c5ee9684be8f2c87563880f93eedd56dd2b6146d8a725b50b7e5adb0f", + 
"sha256:81304b7d8e9c824d058087dcb89144842c8e0dea6d281c031f59f0acf66963d4", + "sha256:ddc34786490a6e4ec0a855d401034cbd1242ef186c20d79d2166d6a4bd449577", + "sha256:7bd527f36a605c914efca5d3d014170b2cb184723e423d26b1fb2fd9108e264d", + "sha256:ab3ca49afcb47058393b0122428358d2fbe0408cf99f1b58b295cfeb4ed39109", + "sha256:7cb54db3535c8686ea12e9535eb087d32421184eacc6939ef15ef50f83a5e7e2", + "sha256:0ce34342b419bd8f018e6666bfef729aec3edf62345a53b537a4dcc115746a33", + "sha256:e34b155e36fa9da7e1b7c738ed7767fc9491a62ec6af70fe9da4a057759edc2d", + "sha256:50e3b9a464d5d08cc5227413db0d1c4707b6172e4d4d915c1c70e4de0bbff1f5", + "sha256:27bf62cb2b1a2068d443ff7097ee33393f8483b570b475db8ebf7e1cba64f088", + "sha256:eb91be369f945f10d3a49f5f9be8b3d0b93a4c2be8f8a5b83b0571b8123e0a7a" + ], + "version": "==1.3.1" + }, + "mccabe": { + "hashes": [ + "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42", + "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f" + ], + "version": "==0.6.1" + }, + "numpy": { + "hashes": [ + "sha256:e2335d56d2fd9fc4e3a3f2d3148aafec4962682375f429f05c45a64dacf19436", + "sha256:9b762e78739b6e021124adbea07611682db99cd3fca7f3c3a8b98b8f74ea5699", + "sha256:7d4c549e41507db4f04ec7cfab5597de8acf7871b16c9cf64cebcb9d39031ca6", + "sha256:b803306c4c201e7dcda0ce1b9a9c87f61a7c7ce43de2c60c8e56147b76849a1a", + "sha256:2da8dff91d489fea3e20155d41f4cd680de7d01d9a89fdd0ebb1bee6e72d3800", + "sha256:6b8c2daacbbffc83b4a2ba83a61aa3ce60c66340b07b962bd27b6c6bb175bee1", + "sha256:89b9419019c47ec87cf4cfca77d85da4611cc0be636ec87b5290346490b98450", + "sha256:49880b47d7272f902946dd995f346842c95fe275e2deb3082ef0495f0c718a69", + "sha256:3d7ddd5bdfb12ec9668edf1aa49a4a3eddb0db4661b57ea431477eb9a2468894", + "sha256:788e1757f8e409cd805a7cd82993cd9252fa19e334758a4c6eb5a8b334abb084", + "sha256:377def0873bbb1fbdedb14b3275b10a29b1b55619a3f7f775c4e7f9ce2461b9c", + "sha256:9501c9ccd081977ca5579a3ec4009d6baff6bacb04bf07214aade3324734195a", + 
"sha256:a1f5173df8190ef9c6235d260d70ca70c6fb029683ceb66e244c5cc6e335947a", + "sha256:12cf4b27039b88e407ad66894d99a957ef60fea0eeb442026af325add2ab264d", + "sha256:4e2fc841c8c642f7fd44591ef856ca409cedba6aea27928df34004c533839eee", + "sha256:e5ade7a69dccbd99c4fdbb95b6d091d941e62ffa588b0ed8fb0a2854118fef3f", + "sha256:6b1011ffc87d7e2b1b7bcc6dc21bdf177163658746ef778dcd21bf0516b9126c", + "sha256:a8bc80f69570e11967763636db9b24c1e3e3689881d10ae793cec74cf7a627b6", + "sha256:81b9d8f6450e752bd82e7d9618fa053df8db1725747880e76fb09710b57f78d0", + "sha256:e8522cad377cc2ef20fe13aae742cc265172910c98e8a0d6014b1a8d564019e2", + "sha256:a3d5dd437112292c707e54f47141be2f1100221242f07eda7bd8477f3ddc2252", + "sha256:c8000a6cbc5140629be8c038c9c9cdb3a1c85ff90bd4180ec99f0f0c73050b5e", + "sha256:fa0944650d5d3fb95869eaacd8eedbd2d83610c85e271bd9d3495ffa9bc4dc9c" + ], + "version": "==1.14.1" + }, + "pyarrow": { + "hashes": [ + "sha256:e8cc9f6a545d08b888e7b6b4f21f65d9773ef74abfe3823e458a9bffc6889cdf", + "sha256:140c04ca9e2742df00435ac4856b109f05292fbb7e1f0b944976d0407be58997", + "sha256:5c72e2c6c3ac249bed6221d0ac6920571c00af24976627ca45ef91a59490eda8", + "sha256:3994e41cb98e6bfe3227bab76eee2c683c1a1877479154198f40a770fc71f776", + "sha256:6256e6c90478734b8f3e6976cd509614df617f3dd216144b3ea13d474c26220d", + "sha256:3a50f5d1f73bd11e3e14bf71ba24188f6604d39eb475b20a1fc34102df6a89e7", + "sha256:073d9d05f61361565112341cedc9e473b81b9f170a87b9ce3546d47378d2fa49", + "sha256:1faac9c21e57a7f92c9a5971f6414fdcf27f9288f0d3ccdf66751c0899e599a8", + "sha256:c423c577c92a9855d09be7604e79b16598d415e600a0960e8bfacfc816651ef6" + ], + "version": "==0.8.0" + }, + "pylint": { + "hashes": [ + "sha256:156839bedaa798febee72893beef00c650c2e7abafb5586fc7a6a56be7f80412", + "sha256:4fe3b99da7e789545327b75548cee6b511e4faa98afe268130fea1af4b5ec022" + ], + "version": "==1.8.2" + }, + "six": { + "hashes": [ + "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb", + 
"sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9"
+            ],
+            "version": "==1.11.0"
+        },
+        "wrapt": {
+            "hashes": [
+                "sha256:d4d560d479f2c21e1b5443bbd15fe7ec4b37fe7e53d335d3b9b0a7b1226fe3c6"
+            ],
+            "version": "==1.10.11"
+        }
+    },
+    "develop": {}
+}
diff --git a/parquet-generator/README.md b/parquet-generator/README.md
new file mode 100644
index 0000000..0ed4765
--- /dev/null
+++ b/parquet-generator/README.md
@@ -0,0 +1,12 @@
+# parquet-generator
+
+Supporting tools to create Parquet files to validate
+the vtable module.
+
+Run:
+
+```
+pipenv shell
+```
+
+to get an environment with the necessary modules installed.
diff --git a/parquet-generator/parquets.py b/parquet-generator/parquets.py
new file mode 100644
index 0000000..b164e4e
--- /dev/null
+++ b/parquet-generator/parquets.py
@@ -0,0 +1,117 @@
+'''Generate the Parquet files used to validate the vtable module.'''
+from datetime import datetime, timedelta
+import re
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+
+def make_100_rows():
+    '''Create 100 rows with unique values in each field, exercising all the main
+       physical types.
+
+       Returns a list of 100 rows; each row is a list of 11 values in the column
+       order given by `get_100_rows_types`.'''
+    rows = []
+    for i in range(100):
+        rows.append([
+            i % 2 == 0,  # BOOLEAN
+            50 - i,  # INT32/INT8
+            100 * (50 - i),  # INT32/INT16
+            1000 * 1000 * (50 - i),  # INT32/INT32
+            1000 * 1000 * 1000 * (50 - i),  # INT64/INT64
+            datetime(1985, 7, 20) + timedelta(days=i),  # INT96
+            100.0 / (i + 1),  # DOUBLE
+            str(i),  # BYTE_ARRAY/UTF8
+            '{:03}'.format(i),  # BYTE_ARRAY/UTF8
+            bytes(range(i)),  # BYTE_ARRAY: the bytes 0..i-1, so length i
+            bytes([i]),  # FIXED_LENGTH_BYTE_ARRAY: the single byte i
+            # pyarrow does not support float yet :(
+            # 1.0 / (i + 1),  # FLOAT
+        ])
+    return rows
+
+def get_100_rows_types():
+    '''The pyarrow types for the columns in `make_100_rows`, in column order.'''
+    return [
+        pa.bool_(),
+        pa.int8(),
+        pa.int16(),
+        pa.int32(),
+        pa.int64(),
+        pa.timestamp('ns'),
+        pa.float64(),
+        pa.string(),
+        pa.string(),
+        pa.binary(-1),
+        pa.binary(1),
+        # pa.float32(),
+    ]
+
+def write_parquet(file_name, rows, types, row_group_size):
+    '''Write `rows` to the Parquet file `file_name`.
+
+       `rows` is a list of rows and `types` holds one pyarrow type per column;
+       `row_group_size` is passed straight through to `pq.write_table`.'''
+    def name_of(i):
+        '''Name column `i` after its type and index.'''
+        name = '{}_{}'.format(types[i], i)
+        return name.replace('timestamp[ns]', 'ts')
+
+    # pivot to be column major, create arrow structures
+    cols = []
+    for i, col_type in enumerate(types):
+        values = [row[i] for row in rows]
+        field = pa.chunked_array([values], type=col_type)
+        cols.append(pa.Column.from_array(name_of(i), field))
+
+    table = pa.Table.from_arrays(cols)
+    print('Writing {}'.format(file_name))
+    pq.write_table(table,
+                   file_name,
+                   row_group_size=row_group_size,
+                   use_deprecated_int96_timestamps=True)
+
+def write_unsupported_parquets():
+    '''Write one zero-row, single-column Parquet file per unsupported type.'''
+    # Taken from https://arrow.apache.org/docs/python/api.html
+    unsupported = [
+        pa.decimal128(10),
+        pa.null(),
+        pa.uint8(),
+        pa.uint16(),
+        pa.uint32(),
+        pa.uint64(),
+        # pa.float16() <-- not supported by us, but also not by pyarrow
+        # TODO: list_, struct, dict
+    ]
+
+    for arrow_type in unsupported:
+        # Build the file name from the type's string form: anything outside
+        # [0-9a-z.-] becomes a dash, then runs of dashes are collapsed.
+        file_name = 'unsupported-{}.parquet'.format(arrow_type)
+        file_name = re.sub(r'[^0-9a-z.-]', '-', file_name)
+        file_name = re.sub(r'-+', '-', file_name)
+
+        write_parquet(file_name, [], [arrow_type], row_group_size=1)
+
+def main():
+    '''Entrypoint: write every Parquet file this script generates.'''
+    rows = make_100_rows()
+    types = get_100_rows_types()
+
+    write_parquet('100-rows-1.parquet', rows, types, row_group_size=100)
+    write_parquet('100-rows-10.parquet', rows, types, row_group_size=10)
+
+    # Null out the cells whose row index + column index is even, building a
+    # new list so `rows` itself is left untouched.
+    rows_with_nulls = [
+        [None if (i + j) % 2 == 0 else value for j, value in enumerate(row)]
+        for i, row in enumerate(rows)
+    ]
+    write_parquet('100-rows-nulls.parquet', rows_with_nulls, types,
+                  row_group_size=100)
+
+    write_unsupported_parquets()
+
+if __name__ == '__main__':
+    main()
diff --git
a/parquet-generator/unsupported-decimal-10-0-.parquet b/parquet-generator/unsupported-decimal-10-0-.parquet new file mode 100644 index 0000000..b392ef1 Binary files /dev/null and b/parquet-generator/unsupported-decimal-10-0-.parquet differ diff --git a/parquet-generator/unsupported-null.parquet b/parquet-generator/unsupported-null.parquet new file mode 100644 index 0000000..cc73d75 Binary files /dev/null and b/parquet-generator/unsupported-null.parquet differ diff --git a/parquet-generator/unsupported-uint16.parquet b/parquet-generator/unsupported-uint16.parquet new file mode 100644 index 0000000..fa7a4c9 Binary files /dev/null and b/parquet-generator/unsupported-uint16.parquet differ diff --git a/parquet-generator/unsupported-uint32.parquet b/parquet-generator/unsupported-uint32.parquet new file mode 100644 index 0000000..b9dc0b1 Binary files /dev/null and b/parquet-generator/unsupported-uint32.parquet differ diff --git a/parquet-generator/unsupported-uint64.parquet b/parquet-generator/unsupported-uint64.parquet new file mode 100644 index 0000000..6405262 Binary files /dev/null and b/parquet-generator/unsupported-uint64.parquet differ diff --git a/parquet-generator/unsupported-uint8.parquet b/parquet-generator/unsupported-uint8.parquet new file mode 100644 index 0000000..92ca10f Binary files /dev/null and b/parquet-generator/unsupported-uint8.parquet differ