From b751932c376214ab84b35f0835bb0ea32bb054ea Mon Sep 17 00:00:00 2001 From: Tom Forbes Date: Wed, 2 Aug 2023 13:54:17 +0100 Subject: [PATCH] Try this? --- .github/workflows/unique_python_files.yml | 2 +- src/pypi_data/cli.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/unique_python_files.yml b/.github/workflows/unique_python_files.yml index ca8b9fde..0e5ed36e 100644 --- a/.github/workflows/unique_python_files.yml +++ b/.github/workflows/unique_python_files.yml @@ -69,7 +69,7 @@ jobs: - name: Combine run: | - poetry run pypi-data run-sql ${{ github.workspace }}/sql/unique_python_files_combine.prql --output=parquet --memory=3 --threads=2 unique-python-files.parquet combined/*.parquet + poetry run pypi-data run-sql ${{ github.workspace }}/sql/unique_python_files_combine.prql --output=parquet --memory=3 --threads=2 --per-thread-output unique-python-files.parquet combined/*.parquet - name: Gets latest created release info id: latest_release_info diff --git a/src/pypi_data/cli.py b/src/pypi_data/cli.py index ee337b5a..89898d9d 100644 --- a/src/pypi_data/cli.py +++ b/src/pypi_data/cli.py @@ -222,11 +222,11 @@ def print_thread(): sql_obj.insert_into("temp_table") elif output == OutputFormat.PARQUET: if per_thread_output: - output_sql = f'COPY ({sql}) TO \'{output_file}\' (FORMAT PARQUET, COMPRESSION zstd)' + output_sql = f'COPY ({sql}) TO \'{output_file}\' (FORMAT PARQUET, per_thread_output true, COMPRESSION snappy)' print(f'\n\nper_thread_output {output_sql}\n\n\n') conn.execute(output_sql) else: - sql_obj.to_parquet(str(output_file), compression="snappy") + sql_obj.to_parquet(str(output_file), compression="zstd") else: sql_obj.to_table("temp_table") conn.execute(f'COPY temp_table TO \'{output_file}\' (FORMAT JSON, array TRUE)')