From 05f8835639f08e15c76b36fd67979a4a87818b0b Mon Sep 17 00:00:00 2001 From: Tom Forbes Date: Wed, 2 Aug 2023 13:17:06 +0100 Subject: [PATCH] Snappy? --- .github/workflows/unique_python_files.yml | 2 +- src/pypi_data/cli.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unique_python_files.yml b/.github/workflows/unique_python_files.yml index 178be256..ca8b9fde 100644 --- a/.github/workflows/unique_python_files.yml +++ b/.github/workflows/unique_python_files.yml @@ -62,7 +62,7 @@ jobs: - name: Ingest run: | mkdir combined/ - find dataset/ -name '*.parquet' | parallel -j 1 --xargs -n4 poetry run pypi-data run-sql ${{ github.workspace }}/sql/unique_python_files.prql --output=parquet --threads=2 combined/{#}.parquet {} + find dataset/ -name '*.parquet' | parallel -j 1 --xargs -n3 poetry run pypi-data run-sql ${{ github.workspace }}/sql/unique_python_files.prql --output=parquet --threads=2 combined/{#}.parquet {} - name: List run: ls combined/ diff --git a/src/pypi_data/cli.py b/src/pypi_data/cli.py index 30c71567..bf756d2a 100644 --- a/src/pypi_data/cli.py +++ b/src/pypi_data/cli.py @@ -226,7 +226,7 @@ def print_thread(): print(f'\n\nper_thread_output {output_sql}\n\n\n') conn.execute(output_sql) else: - sql_obj.to_parquet(str(output_file), compression="zstd") + sql_obj.to_parquet(str(output_file), compression="snappy") else: sql_obj.to_table("temp_table") conn.execute(f'COPY temp_table TO \'{output_file}\' (FORMAT JSON, array TRUE)')