Add tool to generate sample parquets
This commit is contained in:
parent
67b0d96967
commit
681c8a443f
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,20 @@
|
|||
[[source]]
|
||||
|
||||
verify_ssl = true
|
||||
url = "https://pypi.python.org/simple"
|
||||
name = "pypi"
|
||||
|
||||
|
||||
[requires]
|
||||
|
||||
python_version = "3.5"
|
||||
|
||||
|
||||
[packages]
|
||||
|
||||
pyarrow = "*"
|
||||
pylint = "*"
|
||||
|
||||
|
||||
[dev-packages]
|
||||
|
|
@ -0,0 +1,152 @@
|
|||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "52f669bf06acbda24a4b6aaf000c3eed6946174e833baeffc85802cc63bbe342"
|
||||
},
|
||||
"host-environment-markers": {
|
||||
"implementation_name": "cpython",
|
||||
"implementation_version": "3.5.2",
|
||||
"os_name": "posix",
|
||||
"platform_machine": "x86_64",
|
||||
"platform_python_implementation": "CPython",
|
||||
"platform_release": "4.9.3-040903-generic",
|
||||
"platform_system": "Linux",
|
||||
"platform_version": "#201701120631 SMP Thu Jan 12 11:33:59 UTC 2017",
|
||||
"python_full_version": "3.5.2",
|
||||
"python_version": "3.5",
|
||||
"sys_platform": "linux"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
"python_version": "3.5"
|
||||
},
|
||||
"sources": [
|
||||
{
|
||||
"name": "pypi",
|
||||
"url": "https://pypi.python.org/simple",
|
||||
"verify_ssl": true
|
||||
}
|
||||
]
|
||||
},
|
||||
"default": {
|
||||
"astroid": {
|
||||
"hashes": [
|
||||
"sha256:db5cfc9af6e0b60cd07c19478fb54021fc20d2d189882fbcbc94fc69a8aecc58",
|
||||
"sha256:f0a0e386dbca9f93ea9f3ea6f32b37a24720502b7baa9cb17c3976a680d43a06"
|
||||
],
|
||||
"version": "==1.6.1"
|
||||
},
|
||||
"isort": {
|
||||
"hashes": [
|
||||
"sha256:ec9ef8f4a9bc6f71eec99e1806bfa2de401650d996c59330782b89a5555c1497",
|
||||
"sha256:1153601da39a25b14ddc54955dbbacbb6b2d19135386699e2ad58517953b34af",
|
||||
"sha256:b9c40e9750f3d77e6e4d441d8b0266cf555e7cdabdcff33c4fd06366ca761ef8"
|
||||
],
|
||||
"version": "==4.3.4"
|
||||
},
|
||||
"lazy-object-proxy": {
|
||||
"hashes": [
|
||||
"sha256:209615b0fe4624d79e50220ce3310ca1a9445fd8e6d3572a896e7f9146bbf019",
|
||||
"sha256:1b668120716eb7ee21d8a38815e5eb3bb8211117d9a90b0f8e21722c0758cc39",
|
||||
"sha256:cb924aa3e4a3fb644d0c463cad5bc2572649a6a3f68a7f8e4fbe44aaa6d77e4c",
|
||||
"sha256:2c1b21b44ac9beb0fc848d3993924147ba45c4ebc24be19825e57aabbe74a99e",
|
||||
"sha256:320ffd3de9699d3892048baee45ebfbbf9388a7d65d832d7e580243ade426d2b",
|
||||
"sha256:2df72ab12046a3496a92476020a1a0abf78b2a7db9ff4dc2036b8dd980203ae6",
|
||||
"sha256:27ea6fd1c02dcc78172a82fc37fcc0992a94e4cecf53cb6d73f11749825bd98b",
|
||||
"sha256:e5b9e8f6bda48460b7b143c3821b21b452cb3a835e6bbd5dd33aa0c8d3f5137d",
|
||||
"sha256:7661d401d60d8bf15bb5da39e4dd72f5d764c5aff5a86ef52a042506e3e970ff",
|
||||
"sha256:61a6cf00dcb1a7f0c773ed4acc509cb636af2d6337a08f362413c76b2b47a8dd",
|
||||
"sha256:bd6292f565ca46dee4e737ebcc20742e3b5be2b01556dafe169f6c65d088875f",
|
||||
"sha256:933947e8b4fbe617a51528b09851685138b49d511af0b6c0da2539115d6d4514",
|
||||
"sha256:d0fc7a286feac9077ec52a927fc9fe8fe2fabab95426722be4c953c9a8bede92",
|
||||
"sha256:7f3a2d740291f7f2c111d86a1c4851b70fb000a6c8883a59660d95ad57b9df35",
|
||||
"sha256:5276db7ff62bb7b52f77f1f51ed58850e315154249aceb42e7f4c611f0f847ff",
|
||||
"sha256:94223d7f060301b3a8c09c9b3bc3294b56b2188e7d8179c762a1cda72c979252",
|
||||
"sha256:6ae6c4cb59f199d8827c5a07546b2ab7e85d262acaccaacd49b62f53f7c456f7",
|
||||
"sha256:f460d1ceb0e4a5dcb2a652db0904224f367c9b3c1470d5a7683c0480e582468b",
|
||||
"sha256:e81ebf6c5ee9684be8f2c87563880f93eedd56dd2b6146d8a725b50b7e5adb0f",
|
||||
"sha256:81304b7d8e9c824d058087dcb89144842c8e0dea6d281c031f59f0acf66963d4",
|
||||
"sha256:ddc34786490a6e4ec0a855d401034cbd1242ef186c20d79d2166d6a4bd449577",
|
||||
"sha256:7bd527f36a605c914efca5d3d014170b2cb184723e423d26b1fb2fd9108e264d",
|
||||
"sha256:ab3ca49afcb47058393b0122428358d2fbe0408cf99f1b58b295cfeb4ed39109",
|
||||
"sha256:7cb54db3535c8686ea12e9535eb087d32421184eacc6939ef15ef50f83a5e7e2",
|
||||
"sha256:0ce34342b419bd8f018e6666bfef729aec3edf62345a53b537a4dcc115746a33",
|
||||
"sha256:e34b155e36fa9da7e1b7c738ed7767fc9491a62ec6af70fe9da4a057759edc2d",
|
||||
"sha256:50e3b9a464d5d08cc5227413db0d1c4707b6172e4d4d915c1c70e4de0bbff1f5",
|
||||
"sha256:27bf62cb2b1a2068d443ff7097ee33393f8483b570b475db8ebf7e1cba64f088",
|
||||
"sha256:eb91be369f945f10d3a49f5f9be8b3d0b93a4c2be8f8a5b83b0571b8123e0a7a"
|
||||
],
|
||||
"version": "==1.3.1"
|
||||
},
|
||||
"mccabe": {
|
||||
"hashes": [
|
||||
"sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
|
||||
"sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
|
||||
],
|
||||
"version": "==0.6.1"
|
||||
},
|
||||
"numpy": {
|
||||
"hashes": [
|
||||
"sha256:e2335d56d2fd9fc4e3a3f2d3148aafec4962682375f429f05c45a64dacf19436",
|
||||
"sha256:9b762e78739b6e021124adbea07611682db99cd3fca7f3c3a8b98b8f74ea5699",
|
||||
"sha256:7d4c549e41507db4f04ec7cfab5597de8acf7871b16c9cf64cebcb9d39031ca6",
|
||||
"sha256:b803306c4c201e7dcda0ce1b9a9c87f61a7c7ce43de2c60c8e56147b76849a1a",
|
||||
"sha256:2da8dff91d489fea3e20155d41f4cd680de7d01d9a89fdd0ebb1bee6e72d3800",
|
||||
"sha256:6b8c2daacbbffc83b4a2ba83a61aa3ce60c66340b07b962bd27b6c6bb175bee1",
|
||||
"sha256:89b9419019c47ec87cf4cfca77d85da4611cc0be636ec87b5290346490b98450",
|
||||
"sha256:49880b47d7272f902946dd995f346842c95fe275e2deb3082ef0495f0c718a69",
|
||||
"sha256:3d7ddd5bdfb12ec9668edf1aa49a4a3eddb0db4661b57ea431477eb9a2468894",
|
||||
"sha256:788e1757f8e409cd805a7cd82993cd9252fa19e334758a4c6eb5a8b334abb084",
|
||||
"sha256:377def0873bbb1fbdedb14b3275b10a29b1b55619a3f7f775c4e7f9ce2461b9c",
|
||||
"sha256:9501c9ccd081977ca5579a3ec4009d6baff6bacb04bf07214aade3324734195a",
|
||||
"sha256:a1f5173df8190ef9c6235d260d70ca70c6fb029683ceb66e244c5cc6e335947a",
|
||||
"sha256:12cf4b27039b88e407ad66894d99a957ef60fea0eeb442026af325add2ab264d",
|
||||
"sha256:4e2fc841c8c642f7fd44591ef856ca409cedba6aea27928df34004c533839eee",
|
||||
"sha256:e5ade7a69dccbd99c4fdbb95b6d091d941e62ffa588b0ed8fb0a2854118fef3f",
|
||||
"sha256:6b1011ffc87d7e2b1b7bcc6dc21bdf177163658746ef778dcd21bf0516b9126c",
|
||||
"sha256:a8bc80f69570e11967763636db9b24c1e3e3689881d10ae793cec74cf7a627b6",
|
||||
"sha256:81b9d8f6450e752bd82e7d9618fa053df8db1725747880e76fb09710b57f78d0",
|
||||
"sha256:e8522cad377cc2ef20fe13aae742cc265172910c98e8a0d6014b1a8d564019e2",
|
||||
"sha256:a3d5dd437112292c707e54f47141be2f1100221242f07eda7bd8477f3ddc2252",
|
||||
"sha256:c8000a6cbc5140629be8c038c9c9cdb3a1c85ff90bd4180ec99f0f0c73050b5e",
|
||||
"sha256:fa0944650d5d3fb95869eaacd8eedbd2d83610c85e271bd9d3495ffa9bc4dc9c"
|
||||
],
|
||||
"version": "==1.14.1"
|
||||
},
|
||||
"pyarrow": {
|
||||
"hashes": [
|
||||
"sha256:e8cc9f6a545d08b888e7b6b4f21f65d9773ef74abfe3823e458a9bffc6889cdf",
|
||||
"sha256:140c04ca9e2742df00435ac4856b109f05292fbb7e1f0b944976d0407be58997",
|
||||
"sha256:5c72e2c6c3ac249bed6221d0ac6920571c00af24976627ca45ef91a59490eda8",
|
||||
"sha256:3994e41cb98e6bfe3227bab76eee2c683c1a1877479154198f40a770fc71f776",
|
||||
"sha256:6256e6c90478734b8f3e6976cd509614df617f3dd216144b3ea13d474c26220d",
|
||||
"sha256:3a50f5d1f73bd11e3e14bf71ba24188f6604d39eb475b20a1fc34102df6a89e7",
|
||||
"sha256:073d9d05f61361565112341cedc9e473b81b9f170a87b9ce3546d47378d2fa49",
|
||||
"sha256:1faac9c21e57a7f92c9a5971f6414fdcf27f9288f0d3ccdf66751c0899e599a8",
|
||||
"sha256:c423c577c92a9855d09be7604e79b16598d415e600a0960e8bfacfc816651ef6"
|
||||
],
|
||||
"version": "==0.8.0"
|
||||
},
|
||||
"pylint": {
|
||||
"hashes": [
|
||||
"sha256:156839bedaa798febee72893beef00c650c2e7abafb5586fc7a6a56be7f80412",
|
||||
"sha256:4fe3b99da7e789545327b75548cee6b511e4faa98afe268130fea1af4b5ec022"
|
||||
],
|
||||
"version": "==1.8.2"
|
||||
},
|
||||
"six": {
|
||||
"hashes": [
|
||||
"sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb",
|
||||
"sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9"
|
||||
],
|
||||
"version": "==1.11.0"
|
||||
},
|
||||
"wrapt": {
|
||||
"hashes": [
|
||||
"sha256:d4d560d479f2c21e1b5443bbd15fe7ec4b37fe7e53d335d3b9b0a7b1226fe3c6"
|
||||
],
|
||||
"version": "==1.10.11"
|
||||
}
|
||||
},
|
||||
"develop": {}
|
||||
}
|
|
@ -0,0 +1,12 @@
|
|||
# parquet-generator
|
||||
|
||||
Supporting tools to create Parquet files to validate
|
||||
the vtable module.
|
||||
|
||||
Run:
|
||||
|
||||
```
|
||||
pipenv shell
|
||||
```
|
||||
|
||||
to get a shell inside the project's virtualenv. If the dependencies listed in the Pipfile have not been installed yet, run `pipenv install` first.
|
|
@ -0,0 +1,112 @@
|
|||
from datetime import datetime, timedelta
|
||||
import re
|
||||
|
||||
import pyarrow as pa
|
||||
import pyarrow.parquet as pq
|
||||
|
||||
def make_100_rows():
    '''Build the 100-row sample data set.

    Row ``i`` (for ``i`` in 0..99) is a list of eleven values, one per column,
    each derived from ``i``. The column order is the one described by
    `get_100_rows_types`. The trailing comment on each value names the Parquet
    physical/logical type the column is meant to exercise.
    '''
    first_day = datetime(1985, 7, 20)

    def build_row(i):
        # Centre the integers on zero so both signs appear:
        # positive for i < 50, zero at i == 50, negative afterwards.
        offset = 50 - i
        return [
            i % 2 == 0,                     # BOOLEAN
            offset,                         # INT32/INT8
            100 * offset,                   # INT32/INT16
            1000 * 1000 * offset,           # INT32/INT32
            1000 * 1000 * 1000 * offset,    # INT64/INT64
            first_day + timedelta(days=i),  # INT96
            100.0 / (i + 1),                # DOUBLE
            str(i),                         # BYTE_ARRAY/UTF8
            '{:03}'.format(i),              # BYTE_ARRAY/UTF8 (zero-padded)
            bytes(range(i)),                # BYTE_ARRAY: i bytes, 0..i-1
            bytes([i]),                     # FIXED_LENGTH_BYTE_ARRAY: 1 byte
            # No FLOAT column (it would be 1.0 / (i + 1)): the original
            # author's note says pyarrow did not support float yet.
        ]

    return [build_row(i) for i in range(100)]
|
||||
|
||||
def get_100_rows_types():
    '''Return the pyarrow type for each column built by `make_100_rows`.

    The list is positional: entry ``n`` is the type of the ``n``-th value in
    every row.
    '''
    column_types = [
        pa.bool_(),           # i % 2 == 0
        pa.int8(),            # 50 - i
        pa.int16(),           # 100 * (50 - i)
        pa.int32(),           # 1000 * 1000 * (50 - i)
        pa.int64(),           # 1000 * 1000 * 1000 * (50 - i)
        pa.timestamp('ns'),   # 1985-07-20 plus i days
        pa.float64(),         # 100.0 / (i + 1)
        pa.string(),          # str(i)
        pa.string(),          # zero-padded '{:03}'
        pa.binary(-1),        # the BYTE_ARRAY column (i bytes per row)
        pa.binary(1),         # the FIXED_LENGTH_BYTE_ARRAY column (1 byte)
        # pa.float32() is left out, matching the FLOAT column that
        # make_100_rows does not emit.
    ]
    return column_types
|
||||
|
||||
def write_parquet(file_name, rows, types, row_group_size):
    '''Write `rows` to a single Parquet file at `file_name`.

    file_name: path of the Parquet file to create.
    rows: row-major data; each row is indexable and holds one value per entry
        of `types`. May be empty, in which case every column has no values.
    types: pyarrow data types, one per column, in column order.
    row_group_size: forwarded unchanged to `pq.write_table`.

    Each column is named ``<type>_<index>``, with ``timestamp[ns]`` shortened
    to ``ts``. The table is written with
    ``use_deprecated_int96_timestamps=True``.
    '''
    def name_of(index, arrow_type):
        name = '{}_{}'.format(arrow_type, index)
        return name.replace('timestamp[ns]', 'ts')

    # Pivot to column-major: each column becomes a one-chunk chunked array
    # wrapped in a named Column.
    cols = []
    for index, arrow_type in enumerate(types):
        values = [row[index] for row in rows]
        field = pa.chunked_array([values], type=arrow_type)
        cols.append(pa.Column.from_array(name_of(index, arrow_type), field))

    table = pa.Table.from_arrays(cols)
    print('Writing {}'.format(file_name))
    pq.write_table(table,
                   file_name,
                   row_group_size=row_group_size,
                   use_deprecated_int96_timestamps=True)
|
||||
|
||||
def write_unsupported_parquets():
    '''Write one zero-row, single-column Parquet file per unsupported type.

    Each file is named ``unsupported-<type>.parquet``, where ``<type>`` is the
    string form of the pyarrow type. Every character outside ``[0-9a-z.-]`` is
    replaced by ``-`` and runs of ``-`` are collapsed to one.

    NOTE(review): "unsupported" presumably means unsupported by the vtable
    module these files are meant to validate -- confirm.
    '''
    # Taken from https://arrow.apache.org/docs/python/api.html
    unsupported = [
        pa.decimal128(10),
        pa.null(),
        pa.uint8(),
        pa.uint16(),
        pa.uint32(),
        pa.uint64(),
        # pa.float16() <-- not supported by us, but also not by pyarrow
        # TODO: list_, struct, dict
    ]

    # Named arrow_type rather than `type` so the builtin is not shadowed.
    for arrow_type in unsupported:
        file_name = 'unsupported-{}.parquet'.format(arrow_type)
        file_name = re.sub(r'[^0-9a-z.-]', '-', file_name)
        file_name = re.sub(r'--*', '-', file_name)

        write_parquet(file_name, [], [arrow_type], row_group_size=1)
|
||||
|
||||
def main():
    '''Generate every sample Parquet file.

    Writes the 100-row data set with row groups of 100 and of 10 rows, then a
    variant in which every cell whose row index plus column index is even is
    replaced by None, and finally the files for the unsupported types.
    '''
    rows = make_100_rows()
    types = get_100_rows_types()

    write_parquet('100-rows-1.parquet', rows, types, row_group_size=100)
    write_parquet('100-rows-10.parquet', rows, types, row_group_size=10)

    # Checkerboard of nulls, applied in place: the two files above are
    # already written, so the original values are no longer needed.
    for row_index, row in enumerate(rows):
        for col_index in range(len(row)):
            if (row_index + col_index) % 2 == 0:
                row[col_index] = None
    write_parquet('100-rows-nulls.parquet', rows, types, row_group_size=100)

    write_unsupported_parquets()


if __name__ == '__main__':
    main()
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue