diff --git a/.gitignore b/.gitignore index 5868472..6588b75 100644 --- a/.gitignore +++ b/.gitignore @@ -51,3 +51,4 @@ tests/queries /tests/test.db /tests/results.bad_alloc /tests/libfailmalloc +/build/linux diff --git a/README.md b/README.md index 5c41e64..4226d64 100644 --- a/README.md +++ b/README.md @@ -4,15 +4,37 @@ A SQLite [virtual table](https://sqlite.org/vtab.html) extension to expose Parqu This [blog post](https://cldellow.com/2018/06/22/sqlite-parquet-vtable.html) provides some context on why you might use this. -## Download +## Installing + +### Download You can fetch a version built for Ubuntu 16.04 at https://s3.amazonaws.com/cldellow/public/libparquet/libparquet.so.xz +### Building + +``` +./make-linux +``` + +The first run will git clone a bunch of libraries, patch them to be statically linkable and build them. + +Subsequent builds will only build the parquet virtual table extension. + +#### Tests + +Run: + +``` +tests/create-queries-from-templates +tests/test-all +``` + + ## Use ``` $ sqlite/sqlite3 -sqlite> .load parquet/libparquet +sqlite> .load build/linux/libparquet sqlite> CREATE VIRTUAL TABLE demo USING parquet('parquet-generator/99-rows-1.parquet'); sqlite> SELECT * FROM demo; ...if all goes well, you'll see data here!... @@ -21,7 +43,7 @@ sqlite> SELECT * FROM demo; Note: if you get an error like: ``` -sqlite> .load parquet/libparquet -Error: parquet/libparquet.so: wrong ELF class: ELFCLASS64 +sqlite> .load build/linux/libparquet +Error: build/linux/libparquet.so: wrong ELF class: ELFCLASS64 ``` @@ -89,26 +111,3 @@ These are not currently supported: * UINT8/UINT16/UINT32/UINT64 * DECIMAL - -## Building - -If you're a masochist, you can try to build this yourself: - -1. Install [`parquet-cpp`](https://github.com/apache/parquet-cpp) - 1. Master appears to be broken for text row group stats; see https://github.com/cldellow/sqlite-parquet-vtable/issues/5 for which versions to use -2. Run `./build-sqlite` to fetch and build the SQLite dev bits -3. Run `./parquet/make` to build the module - 1. 
You will need to fixup the paths in this file to point at your local parquet-cpp folder. - -You're almost certainly going to regret your life. https://stackoverflow.com/questions/48157198/how-can-i-statically-link-arrow-when-building-parquet-cpp may be useful. - -## Tests - -Run: - -``` -tests/create-queries-from-templates -tests/test-all -``` - - diff --git a/build-sqlite b/build-sqlite index ca88e88..a5176bf 100755 --- a/build-sqlite +++ b/build-sqlite @@ -8,7 +8,7 @@ fetch_if_needed() { curl --fail "https://sqlite.org/2018/sqlite-autoconf-${VERSION}.tar.gz" > sqlite.tar.gz tar xf sqlite.tar.gz rm sqlite.tar.gz - ln -s sqlite-autoconf-${VERSION} sqlite + mv sqlite-autoconf-${VERSION} sqlite fi } diff --git a/build/Makefile.linux b/build/Makefile.linux new file mode 100644 index 0000000..1421451 --- /dev/null +++ b/build/Makefile.linux @@ -0,0 +1,141 @@ +HERE:=$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) +ROOT:=$(HERE)/../.. +VTABLE:=$(ROOT)/parquet +SQLITE:=$(ROOT)/sqlite + +# Directories +ARROW=$(HERE)/arrow +BROTLI=$(HERE)/brotli +LZ4=$(HERE)/lz4 +PARQUET_CPP=$(HERE)/parquet-cpp +SNAPPY=$(HERE)/snappy +ZLIB=$(HERE)/zlib +ZSTD=$(HERE)/zstd + +# Libraries +ARROW_LIB = $(ARROW)/cpp/release/release/libarrow.a +BOOST_FILESYSTEM_LIB = /usr/lib/x86_64-linux-gnu/libboost_filesystem.so +BOOST_LIB = /usr/lib/x86_64-linux-gnu/libboost_regex.so +BOOST_SYSTEM_LIB = /usr/lib/x86_64-linux-gnu/libboost_system.so +BROTLI_COMMON_LIB = $(BROTLI)/out/libbrotlicommon-static.a +BROTLI_DEC_LIB = $(BROTLI)/out/libbrotlidec-static.a +BROTLI_ENC_LIB = $(BROTLI)/out/libbrotlienc-static.a +LZ4_LIB = $(LZ4)/lib/liblz4.a +PARQUET_CPP_LIB = $(PARQUET_CPP)/build/release/libparquet.a +SNAPPY_LIB = $(SNAPPY)/build/libsnappy.a +SQLITE3_LIB = $(SQLITE)/libsqlite3.a +THRIFT_LIB = $(PARQUET_CPP)/thrift_ep/src/thrift_ep-install/lib/libthrift.a +ZLIB_LIB = $(ZLIB)/libz.a +ZSTD_LIB = $(ZSTD)/lib/libzstd.a + +# Flags +CC = g++ +CPUS:=$(shell nproc) +CFLAGS = -I $(SQLITE) -O3 -std=c++11 
-Wall -fPIC -g
+
+LDFLAGS = -O3 \
+          -Wl,--whole-archive $(PARQUET_CPP_LIB) $(LZ4_LIB) $(ZSTD_LIB) $(THRIFT_LIB) $(SNAPPY_LIB) $(ARROW_LIB) \
+          $(BROTLI_ENC_LIB) $(BROTLI_COMMON_LIB) $(BROTLI_DEC_LIB) \
+          -Wl,--no-whole-archive -lz -lcrypto -lssl $(BOOST_LIB) $(BOOST_SYSTEM_LIB) $(BOOST_FILESYSTEM_LIB)
+OBJ = parquet.o parquet_filter.o parquet_table.o parquet_cursor.o
+LIBS = $(ARROW_LIB) \
+       $(BROTLI_COMMON_LIB) \
+       $(BROTLI_DEC_LIB) \
+       $(BROTLI_ENC_LIB) \
+       $(LZ4_LIB) \
+       $(PARQUET_CPP_LIB) \
+       $(SNAPPY_LIB) \
+       $(ZLIB_LIB) \
+       $(ZSTD_LIB)
+
+libparquet.so: $(OBJ) $(LIBS)  # default goal: link the extension from the objects plus the static third-party archives named in LDFLAGS
+	$(CC) -shared -o $@ $(OBJ) $(LDFLAGS)
+
+parquet_filter.o: $(VTABLE)/parquet_filter.cc $(VTABLE)/parquet_filter.h
+	$(CC) -c -o $@ $< $(CFLAGS)
+
+parquet_cursor.o: $(VTABLE)/parquet_cursor.cc $(VTABLE)/parquet_cursor.h $(VTABLE)/parquet_table.h $(VTABLE)/parquet_filter.h
+	$(CC) -c -o $@ $< $(CFLAGS)
+
+parquet_table.o: $(VTABLE)/parquet_table.cc $(VTABLE)/parquet_table.h
+	$(CC) -c -o $@ $< $(CFLAGS)
+
+parquet.o: $(VTABLE)/parquet.cc $(VTABLE)/parquet_cursor.h $(VTABLE)/parquet_table.h $(VTABLE)/parquet_filter.h
+	$(CC) -c -o $@ $< $(CFLAGS)
+
+$(ARROW_LIB):  # fresh clone pinned to the apache-arrow-0.9.0 tag, Release build in cpp/release
+	rm -rf $(ARROW)
+	git clone https://github.com/apache/arrow.git $(ARROW)
+	cd $(ARROW) && git checkout apache-arrow-0.9.0
+	mkdir $(ARROW)/cpp/release
+	cd $(ARROW)/cpp/release && cmake .. -DCMAKE_BUILD_TYPE=Release
+	cd $(ARROW)/cpp/release && make -j$(CPUS) unittest
+
+$(BROTLI_COMMON_LIB):  # NOTE(review): the dec/enc archives listed in LIBS have no rule of their own; presumably this build emits them too - confirm
+	rm -rf $(BROTLI)
+	git clone https://github.com/google/brotli.git $(BROTLI)
+	mkdir $(BROTLI)/out
+	cd $(BROTLI)/out && ../configure-cmake && make -j$(CPUS)
+
+$(LZ4_LIB):  # upstream lib/Makefile is patched to add -fPIC to CFLAGS before building
+	rm -rf $(LZ4)
+	git clone https://github.com/lz4/lz4.git $(LZ4)
+	sed -i 's/^CFLAGS *+=/CFLAGS += -fPIC /' $(LZ4)/lib/Makefile
+	cd $(LZ4) && make -j$(CPUS)
+
+$(PARQUET_CPP_LIB): $(SNAPPY_LIB) $(BROTLI_COMMON_LIB) $(ZLIB_LIB) $(LZ4_LIB) $(ZSTD_LIB)
+	rm -rf $(PARQUET_CPP)
+	git clone https://github.com/apache/parquet-cpp.git $(PARQUET_CPP)
+	cd $(PARQUET_CPP) && git checkout apache-parquet-cpp-1.4.0
+	cd $(PARQUET_CPP) && SNAPPY_STATIC_LIB=$(SNAPPY_LIB) BROTLI_STATIC_LIB_ENC=$(BROTLI_ENC_LIB) BROTLI_STATIC_LIB_DEC=$(BROTLI_DEC_LIB) BROTLI_STATIC_LIB_COMMON=$(BROTLI_COMMON_LIB) ZLIB_STATIC_LIB=$(ZLIB_LIB) LZ4_STATIC_LIB=$(LZ4_LIB) ZSTD_STATIC_LIB=$(ZSTD_LIB) cmake -DCMAKE_BUILD_TYPE=Release -DPARQUET_MINIMAL_DEPENDENCY=ON -DPARQUET_ARROW_LINKAGE=static .
+	cd $(PARQUET_CPP) && make -j$(CPUS)
+
+$(SNAPPY_LIB):  # CMakeLists.txt is patched for position-independent code BEFORE the first cmake run, so configure sees it without relying on a re-run
+	rm -rf $(SNAPPY)
+	git clone https://github.com/google/snappy.git $(SNAPPY)
+	sed -i '3iset(CMAKE_POSITION_INDEPENDENT_CODE ON)' $(SNAPPY)/CMakeLists.txt
+	mkdir $(SNAPPY)/build
+	cd $(SNAPPY)/build && cmake ..
+	cd $(SNAPPY)/build && make -j$(CPUS)
+
+$(SQLITE3_LIB):  # delegates to the repository's build-sqlite script, run from the repo root
+	cd $(ROOT) && ./build-sqlite
+
+$(ZLIB_LIB):  # the Makefile generated by ./configure is patched to add -fPIC before building
+	rm -rf $(ZLIB)
+	git clone https://github.com/madler/zlib.git $(ZLIB)
+	cd $(ZLIB) && ./configure
+	sed -i 's/^CFLAGS=-O3/CFLAGS=-fPIC -O3/' $(ZLIB)/Makefile
+	cd $(ZLIB) && make -j$(CPUS)
+
+$(ZSTD_LIB):  # upstream lib/Makefile is patched to add -fPIC to CFLAGS before building
+	rm -rf $(ZSTD)
+	git clone https://github.com/facebook/zstd.git $(ZSTD)
+	sed -i 's/^CFLAGS *+=/CFLAGS += -fPIC /' $(ZSTD)/lib/Makefile
+	cd $(ZSTD) && make -j$(CPUS)
+
+.PHONY: clean distclean parquet snappy brotli zlib lz4 zstd arrow sqlite  # command-style targets; a same-named file must never satisfy them
+
+clean:
+	rm -f *.o *.so
+
+distclean:
+	rm -rf $(SQLITE) $(HERE)
+
+
+arrow: $(ARROW_LIB)
+
+brotli: $(BROTLI_COMMON_LIB)
+
+lz4: $(LZ4_LIB)
+
+parquet: $(PARQUET_CPP_LIB)
+
+snappy: $(SNAPPY_LIB)
+
+sqlite: $(SQLITE3_LIB)
+
+zlib: $(ZLIB_LIB)
+
+zstd: $(ZSTD_LIB)
+
diff --git a/make-linux b/make-linux
new file mode 100755
index 0000000..fb0d0cd
--- /dev/null
+++ b/make-linux
@@ -0,0 +1,24 @@
+#!/bin/bash
+set -euo pipefail
+
+mkdir -p build/linux
+cp -f build/Makefile.linux build/linux/Makefile
+
+cd build/linux
+
+# Install prereqs based on https://github.com/apache/parquet-cpp#linux
+sudo apt-get install libboost-dev libboost-filesystem-dev \
+  libboost-program-options-dev libboost-regex-dev \
+  libboost-system-dev libboost-test-dev \
+  libssl-dev libtool bison flex pkg-config
+
+# Install prereqs based on https://github.com/apache/arrow/tree/master/cpp
+sudo apt-get install cmake \
+  libboost-dev \
+  libboost-filesystem-dev \
+  libboost-system-dev
+
+if [ ! 
-e ../../sqlite/sqlite3 ]; then + make sqlite +fi +make "$@" diff --git a/parquet/Makefile b/parquet/Makefile deleted file mode 100644 index bf6eb95..0000000 --- a/parquet/Makefile +++ /dev/null @@ -1,43 +0,0 @@ -PARQUET_CPP=~/src/parquet-cpp - -CC = g++ -CFLAGS = -I ../sqlite -O3 -std=c++11 -Wall -fPIC -g -PARQUET_LIB = $(PARQUET_CPP)/build/release/libparquet.a -THRIFT_LIB = $(PARQUET_CPP)/thrift_ep/src/thrift_ep-install/lib/libthrift.a -PARQUET_DEPS = /home/cldellow/src/parquet-deps -LZ4_LIB = $(PARQUET_DEPS)/lz4/lib/liblz4.a -ARROW_LIB = $(PARQUET_CPP)/arrow_ep-prefix/src/arrow_ep-build/release/libarrow.a -SNAPPY_LIB = $(PARQUET_DEPS)/snappy/build/libsnappy.a -ZSTD_LIB = $(PARQUET_DEPS)/zstd/lib/libzstd.a -BOOST_LIB = /usr/lib/x86_64-linux-gnu/libboost_regex.so -BOOST_SYSTEM_LIB = /usr/lib/x86_64-linux-gnu/libboost_system.so -BOOST_FILESYSTEM_LIB = /usr/lib/x86_64-linux-gnu/libboost_filesystem.so -BROTLI_COMMON_LIB = $(PARQUET_DEPS)/brotli/out/libbrotlicommon-static.a -BROTLI_ENC_LIB = $(PARQUET_DEPS)/brotli/out/libbrotlienc-static.a -BROTLI_DEC_LIB = $(PARQUET_DEPS)/brotli/out/libbrotlidec-static.a - -LDFLAGS = -O3 \ - -Wl,--whole-archive $(PARQUET_LIB) $(LZ4_LIB) $(ZSTD_LIB) $(THRIFT_LIB) $(SNAPPY_LIB) $(ARROW_LIB) \ - $(BROTLI_ENC_LIB) $(BROTLI_COMMON_LIB) $(BROTLI_DEC_LIB) \ - -Wl,--no-whole-archive -lz -lcrypto -lssl $(BOOST_LIB) $(BOOST_SYSTEM_LIB) $(BOOST_FILESYSTEM_LIB) -OBJ = parquet.o parquet_filter.o parquet_table.o parquet_cursor.o - -libparquet.so: $(OBJ) - $(CC) -shared -o $@ $^ $(LDFLAGS) - -parquet_filter.o: parquet_filter.cc parquet_filter.h - $(CC) -c -o $@ $< $(CFLAGS) - -parquet_cursor.o: parquet_cursor.cc parquet_cursor.h parquet_table.h parquet_filter.h - $(CC) -c -o $@ $< $(CFLAGS) - -parquet_table.o: parquet_table.cc parquet_table.h - $(CC) -c -o $@ $< $(CFLAGS) - -parquet.o: parquet.cc parquet_cursor.h parquet_table.h parquet_filter.h - $(CC) -c -o $@ $< $(CFLAGS) - -.PHONY: clean - -clean: - rm -f *.o *.so diff --git 
a/tests/test-non-existent b/tests/test-non-existent index 564f267..43dfab1 100755 --- a/tests/test-non-existent +++ b/tests/test-non-existent @@ -6,7 +6,7 @@ set -euo pipefail load_nonexistent() { cat < /dev/null 2> testcase-stderr.txt; then diff --git a/tests/test-unsupported b/tests/test-unsupported index 57d7ec4..a761646 100755 --- a/tests/test-unsupported +++ b/tests/test-unsupported @@ -9,7 +9,7 @@ load_unsupported() { basename=$(basename "$file") cat <