From bb6909d2e2adff92dd0d53233334ea0e14dfa074 Mon Sep 17 00:00:00 2001
From: Tom Forbes
Date: Sun, 6 Aug 2023 15:01:27 +0100
Subject: [PATCH] Use parquet CLI to combine

---
Notes (ignored by `git am`, not part of the commit message):

  * The DuckDB download and the `sql/combine.sql` script are replaced by
    two tools installed with `cargo install parquet -F cli`:
    `parquet-concat` merges input/*.parquet into merged.parquet, then
    `parquet-rewrite` writes output.parquet with zstd compression,
    writer version 2.0, page statistics and bloom filters enabled.
  * The "Merged size" step reports merged.parquet; the "Output size"
    step reports output.parquet, i.e. the file written by "Rewrite".
  * NOTE(review): the removed SQL derived a `repository` column from the
    input file name and ordered rows by `archive_path`; nothing in the
    new steps visibly does either. Confirm that dropping both is
    intended.

 .github/workflows/run.yml | 31 +++++++++++++++++++++++--------
 sql/combine.sql           |  8 --------
 2 files changed, 23 insertions(+), 16 deletions(-)
 delete mode 100644 sql/combine.sql

diff --git a/.github/workflows/run.yml b/.github/workflows/run.yml
index 471cc21d..372dd276 100644
--- a/.github/workflows/run.yml
+++ b/.github/workflows/run.yml
@@ -127,15 +127,17 @@ jobs:
         with:
           name: groups
 
+      - name: Install Rust
+        uses: actions-rs/toolchain@v1
+        with:
+          toolchain: stable
+          override: true
+
+      - run: cargo install parquet -F cli
+
       - name: Download links
         run: cat ${{ matrix.index }} | jq -rc '.[]'
 
-      - name: Setup DuckDB
-        run: |
-          wget https://github.com/duckdb/duckdb/releases/download/v0.8.1/duckdb_cli-linux-amd64.zip -O /tmp/duckdb.zip
-          unzip /tmp/duckdb.zip -d ${{ github.workspace }}
-          chmod +x ${{ github.workspace }}/duckdb
-
       - name: Debug
         run: |
           echo "Links for ${{ matrix.index }}"
@@ -149,9 +151,22 @@ jobs:
       - run: ls -la ${{ github.workspace }}/input/
 
       - name: Combine
-        run: ${{ github.workspace }}/duckdb -echo foo.db < ${{ github.workspace }}/sql/combine.sql
+        run: parquet-concat ${{ github.workspace }}/input/*.parquet ${{ github.workspace }}/merged.parquet
+
+      - name: Merged size
+        run: du -hs ${{ github.workspace }}/merged.parquet
 
-      - run: ls -la ${{ github.workspace }}/*.parquet
+      - name: Rewrite
+        run: |
+          parquet-rewrite --compression=zstd \
+            --input=${{ github.workspace }}/merged.parquet \
+            --output=${{ github.workspace }}/output.parquet \
+            --writer-version=2.0 \
+            --statistics-enabled=page \
+            --bloom-filter-enabled=true
+
+      - name: Output size
+        run: du -hs ${{ github.workspace }}/output.parquet
 
       - name: Upload Assets
         uses: shogo82148/actions-upload-release-asset@v1
diff --git a/sql/combine.sql b/sql/combine.sql
deleted file mode 100644
index e8e65c76..00000000
--- a/sql/combine.sql
+++ /dev/null
@@ -1,8 +0,0 @@
-PRAGMA memory_limit='2GB';
-PRAGMA threads=4;
-SET enable_progress_bar=true;
-CREATE TABLE temp_table AS
-select regexp_extract(filename, '(\d+)\.parquet', 1)::USMALLINT as repository, * exclude (filename)
-FROM read_parquet('input/*.parquet', union_by_name = True, filename = true)
-order by (archive_path);
-COPY temp_table TO 'output.parquet' (FORMAT PARQUET, COMPRESSION zstd);