
Try per-thread output
orf committed Jul 29, 2023
1 parent fef63cd commit ddf56e3
Showing 2 changed files with 39 additions and 35 deletions.
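In short: the commit adds a --per-thread-output option to pypi-data run-sql, wiring it to DuckDB's PER_THREAD_OUTPUT COPY option, and switches the workflow's Combine step to pass the new flag.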
60 changes: 30 additions & 30 deletions .github/workflows/unique_python_files.yml
@@ -66,36 +66,36 @@ jobs:
  - name: Combine
    run: |
-     poetry run pypi-data run-sql ${{ github.workspace }}/sql/unique_python_files_combine.prql --output=parquet --memory=3 --threads=2 unique-python-files.parquet combined/*.parquet
+     poetry run pypi-data run-sql ${{ github.workspace }}/sql/unique_python_files_combine.prql --per-thread-output --output=parquet --memory=3 --threads=2 unique-python-files.parquet combined/*.parquet
  - name: List
    run: ls combined/

- - name: Gets latest created release info
-   id: latest_release_info
-   uses: jossef/[email protected]
-   env:
-     GITHUB_TOKEN: ${{ github.token }}
-
- - name: Upload Assets
-   id: upload
-   uses: shogo82148/actions-upload-release-asset@v1
-   with:
-     upload_url: ${{ steps.latest_release_info.outputs.upload_url }}
-     asset_name: unique-python-files.parquet
-     asset_path: unique-python-files.parquet
-     overwrite: true
-
- - name: Create download links
-   run: |
-     echo "${{ steps.upload.outputs.browser_download_url }}" > links/unique_python_files.txt
- - uses: EndBug/add-and-commit@v9
-   with:
-     add: 'links/unique_python_files.txt'
-     author_email: "41898282+github-actions[bot]@users.noreply.github.com"
-     author_name: "commit-bot"
-     message: "Add only python links for asset ${{ needs.generate-matrix.outputs.release_id }}"
-     push: true
-     fetch: true
-     pull: '--rebase --autostash'
+ #
+ # - name: Gets latest created release info
+ #   id: latest_release_info
+ #   uses: jossef/[email protected]
+ #   env:
+ #     GITHUB_TOKEN: ${{ github.token }}
+ #
+ # - name: Upload Assets
+ #   id: upload
+ #   uses: shogo82148/actions-upload-release-asset@v1
+ #   with:
+ #     upload_url: ${{ steps.latest_release_info.outputs.upload_url }}
+ #     asset_name: unique-python-files.parquet
+ #     asset_path: unique-python-files.parquet
+ #     overwrite: true
+ #
+ # - name: Create download links
+ #   run: |
+ #     echo "${{ steps.upload.outputs.browser_download_url }}" > links/unique_python_files.txt
+ #
+ # - uses: EndBug/add-and-commit@v9
+ #   with:
+ #     add: 'links/unique_python_files.txt'
+ #     author_email: "41898282+github-actions[bot]@users.noreply.github.com"
+ #     author_name: "commit-bot"
+ #     message: "Add only python links for asset ${{ needs.generate-matrix.outputs.release_id }}"
+ #     push: true
+ #     fetch: true
+ #     pull: '--rebase --autostash'
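(With --per-thread-output, DuckDB writes a directory of Parquet files, one per thread, rather than the single unique-python-files.parquet the upload step expects; presumably that is why the release-upload and link-commit steps are commented out while this approach is tried.)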
14 changes: 9 additions & 5 deletions src/pypi_data/cli.py
Expand Up @@ -110,6 +110,7 @@ def run_sql(
      no_limits: Annotated[bool, typer.Option()] = False,
      profile: Annotated[bool, typer.Option()] = False,
      db: Annotated[Optional[str], typer.Option()] = None,
+     per_thread_output: Annotated[bool, typer.Option()] = False,
  ):
      """
      This whole method is a fucking mess.
@@ -179,19 +180,22 @@ def print_thread():
      t = threading.Thread(target=print_thread, daemon=True)
      t.start()

-     sql = conn.sql(sql)
+     sql_obj = conn.sql(sql)

      if output == OutputFormat.TABLE:
          try:
              conn.table("temp_table")
          except duckdb.CatalogException:
-             sql.to_table("temp_table")
+             sql_obj.to_table("temp_table")
          else:
-             sql.insert_into("temp_table")
+             sql_obj.insert_into("temp_table")
      elif output == OutputFormat.PARQUET:
-         sql.to_parquet(str(output_file), compression="zstd")
+         if per_thread_output:
+             conn.execute(f'COPY ({sql}) TO \'{output_file}\' (FORMAT PARQUET, PER_THREAD_OUTPUT TRUE, COMPRESSION zstd)')
+         else:
+             sql_obj.to_parquet(str(output_file), compression="zstd")
      else:
-         sql.to_table("temp_table")
+         sql_obj.to_table("temp_table")
      # df: pd.DataFrame = sql.to_df()
      # df.set_index("name", inplace=True)
      # df["stat"] = df["stat"].apply(lambda x: json.loads(x))
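For reference, a minimal sketch (not part of this commit) of what the PER_THREAD_OUTPUT branch does, using duckdb directly with a throwaway table (events) and output directory (out_dir); the real command runs the PRQL-compiled query instead:

    import duckdb

    conn = duckdb.connect()
    conn.execute("SET threads TO 2")
    conn.execute("CREATE TABLE events AS SELECT range AS id FROM range(1000000)")

    # PER_THREAD_OUTPUT tells COPY to write one Parquet file per thread into the
    # target directory instead of producing a single Parquet file.
    conn.execute(
        "COPY (SELECT * FROM events) TO 'out_dir' "
        "(FORMAT PARQUET, PER_THREAD_OUTPUT TRUE, COMPRESSION zstd)"
    )

    # The directory reads back as one relation via a glob, so a later combine
    # step can pick the pieces up again.
    print(conn.execute("SELECT count(*) FROM read_parquet('out_dir/*.parquet')").fetchone())

The appeal is that each thread writes its own file, avoiding a single-writer bottleneck, at the cost of ending up with several smaller files instead of one.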
