From c09eaa639ad8c70c0f3437ad796e0842ae4152c3 Mon Sep 17 00:00:00 2001 From: Tom Forbes Date: Wed, 26 Jul 2023 20:53:57 +0100 Subject: [PATCH] print plans no cache? pragma? Try this? Try this? bruh ? try this? false fail fast ok Enable metadata cache Try this fail fast print Install? try this? cat Install https ok Try this Use httpfs checkout Load httpfs Try using prql? --- .github/workflows/run.yml | 143 +++++++++++++++++++++++++++++------ poetry.lock | 136 +++++++++------------------------ pyproject.toml | 5 +- sql/combine.prql | 2 + sql/combine.sql | 4 - sql/stats.prql | 85 +++++++++++++++++++++ sql/stats.sql | 80 ++++++++++---------- sql/unique_python_files.prql | 11 +++ sql/unique_python_files.sql | 9 --- src/pypi_data/cli.py | 52 ++++++++++++- 10 files changed, 344 insertions(+), 183 deletions(-) create mode 100644 sql/combine.prql delete mode 100644 sql/combine.sql create mode 100644 sql/stats.prql create mode 100644 sql/unique_python_files.prql delete mode 100644 sql/unique_python_files.sql diff --git a/.github/workflows/run.yml b/.github/workflows/run.yml index 97722c8a..a6813438 100644 --- a/.github/workflows/run.yml +++ b/.github/workflows/run.yml @@ -94,23 +94,44 @@ jobs: - name: checkout uses: actions/checkout@v3 + - name: Set up python + id: setup-python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + cache: 'pip' + + - name: Install Poetry + uses: snok/install-poetry@v1 + with: + virtualenvs-create: true + virtualenvs-in-project: true + installer-parallel: true + + - name: Load cached venv + id: cached-poetry-dependencies + uses: actions/cache@v3 + with: + path: .venv + key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} + + - name: Install dependencies + if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' + run: poetry install --no-interaction --no-root + + - name: Install root + run: poetry install --only-root + - uses: actions/download-artifact@v3 with: name: groups - - name: Setup DuckDB - run: | - wget https://github.com/duckdb/duckdb/releases/download/v0.8.1/duckdb_cli-linux-amd64.zip -O /tmp/duckdb.zip - unzip /tmp/duckdb.zip -d ${{ github.workspace }} - chmod +x ${{ github.workspace }}/duckdb - - - name: Download - run: | - mkdir input/ - cat ${{ matrix.index }} | jq -rc '.[]' | parallel wget -nv -O input/{#}.parquet {} + - name: Download links + run: cat ${{ matrix.index }} | jq -rc '.[]' - name: Combine - run: ${{ github.workspace }}/duckdb -echo -stats foo.db < ${{ github.workspace }}/sql/combine.sql + run: | + poetry run pypi-data run-sql ${{ github.workspace }}/sql/combine.prql output.parquet $(cat ${{ matrix.index }} | jq -rc '.[]') - name: Upload Assets uses: shogo82148/actions-upload-release-asset@v1 @@ -162,25 +183,40 @@ jobs: needs: [ makepublic, generate-matrix ] runs-on: ubuntu-latest steps: - - name: checkout - uses: actions/checkout@v3 + - name: Set up python + id: setup-python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + cache: 'pip' - - name: Setup DuckDB - run: | - wget https://github.com/duckdb/duckdb/releases/download/v0.8.1/duckdb_cli-linux-amd64.zip -O /tmp/duckdb.zip - unzip /tmp/duckdb.zip -d ${{ github.workspace }} - chmod +x ${{ github.workspace }}/duckdb + - name: Install Poetry + uses: snok/install-poetry@v1 + with: + virtualenvs-create: true + virtualenvs-in-project: true + installer-parallel: true + + - name: Load cached venv + id: cached-poetry-dependencies + uses: actions/cache@v3 + with: + path: .venv + key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} - - name: Setup wget2 - run: sudo apt-get update && sudo apt-get install wget2 + - name: Install dependencies + if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' + run: poetry install --no-interaction --no-root - - name: Download releases - run: | - mkdir data/ - wget2 $(cat download_links.txt) --compression -P data/ + - name: Install root + run: poetry install --only-root + + - name: Download links + run: cat download_links.txt - name: Combine - run: ${{ github.workspace }}/duckdb -echo -stats foo.db < ${{ github.workspace }}/sql/unique_python_files.sql + run: | + poetry run pypi-data run-sql ${{ github.workspace }}/sql/unique_python_files.prql unique-python-files.parquet $(cat download_links.txt) - name: Upload Assets id: upload @@ -202,3 +238,60 @@ jobs: push: true fetch: true pull: '--rebase --autostash' + + generate_stats: + needs: [ makepublic, generate-matrix ] + runs-on: ubuntu-latest + steps: + - name: Set up python + id: setup-python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + cache: 'pip' + + - name: Install Poetry + uses: snok/install-poetry@v1 + with: + virtualenvs-create: true + virtualenvs-in-project: true + installer-parallel: true + + - name: Load cached venv + id: cached-poetry-dependencies + uses: actions/cache@v3 + with: + path: .venv + key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} + + - name: Install dependencies + if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' + run: poetry install --no-interaction --no-root + + - name: Install root + run: poetry install --only-root + + - name: Combine + run: | + poetry run pypi-data run-sql ${{ github.workspace }}/sql/stats.prql stats.parquet $(cat download_links.txt) +# +# - name: Upload Assets +# id: upload +# uses: shogo82148/actions-upload-release-asset@v1 +# with: +# upload_url: ${{ needs.generate-matrix.outputs.upload_url }} +# asset_name: unique-python-files.parquet +# +# - name: Create download links +# run: | +# echo "${{ steps.upload.outputs.browser_download_url }}" > only_python_download_links.txt +# +# - uses: EndBug/add-and-commit@v9 +# with: +# add: 'only_python_download_links.txt' +# author_email: "41898282+github-actions[bot]@users.noreply.github.com" +# author_name: "commit-bot" +# message: "Add only python links for asset ${{ needs.generate-matrix.outputs.release_id }}" +# push: true +# fetch: true +# pull: '--rebase --autostash' diff --git a/poetry.lock b/poetry.lock index e57e731f..1aa09607 100644 --- a/poetry.lock +++ b/poetry.lock @@ -446,13 +446,13 @@ files = [ [[package]] name = "more-itertools" -version = "9.1.0" +version = "10.0.0" description = "More routines for operating on iterables, beyond itertools" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "more-itertools-9.1.0.tar.gz", hash = "sha256:cabaa341ad0389ea83c17a94566a53ae4c9d07349861ecb14dc6d0345cf9ac5d"}, - {file = "more_itertools-9.1.0-py3-none-any.whl", hash = "sha256:d2bc7f02446e86a68911e58ded76d6561eea00cddfb2a91e7019bbb586c799f3"}, + {file = "more-itertools-10.0.0.tar.gz", hash = "sha256:cd65437d7c4b615ab81c0640c0480bc29a550ea032891977681efd28344d51e1"}, + {file = "more_itertools-10.0.0-py3-none-any.whl", hash = "sha256:928d514ffd22b5b0a8fce326d57f423a55d2ff783b093bab217eda71e732330f"}, ] [[package]] @@ -466,40 +466,6 @@ files = [ {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, ] -[[package]] -name = "numpy" -version = "1.25.1" -description = "Fundamental package for array computing in Python" -optional = false -python-versions = ">=3.9" -files = [ - {file = "numpy-1.25.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:77d339465dff3eb33c701430bcb9c325b60354698340229e1dff97745e6b3efa"}, - {file = "numpy-1.25.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d736b75c3f2cb96843a5c7f8d8ccc414768d34b0a75f466c05f3a739b406f10b"}, - {file = "numpy-1.25.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a90725800caeaa160732d6b31f3f843ebd45d6b5f3eec9e8cc287e30f2805bf"}, - {file = "numpy-1.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c6c9261d21e617c6dc5eacba35cb68ec36bb72adcff0dee63f8fbc899362588"}, - {file = "numpy-1.25.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0def91f8af6ec4bb94c370e38c575855bf1d0be8a8fbfba42ef9c073faf2cf19"}, - {file = "numpy-1.25.1-cp310-cp310-win32.whl", hash = "sha256:fd67b306320dcadea700a8f79b9e671e607f8696e98ec255915c0c6d6b818503"}, - {file = "numpy-1.25.1-cp310-cp310-win_amd64.whl", hash = "sha256:c1516db588987450b85595586605742879e50dcce923e8973f79529651545b57"}, - {file = "numpy-1.25.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6b82655dd8efeea69dbf85d00fca40013d7f503212bc5259056244961268b66e"}, - {file = "numpy-1.25.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e8f6049c4878cb16960fbbfb22105e49d13d752d4d8371b55110941fb3b17800"}, - {file = "numpy-1.25.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41a56b70e8139884eccb2f733c2f7378af06c82304959e174f8e7370af112e09"}, - {file = "numpy-1.25.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5154b1a25ec796b1aee12ac1b22f414f94752c5f94832f14d8d6c9ac40bcca6"}, - {file = "numpy-1.25.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:38eb6548bb91c421261b4805dc44def9ca1a6eef6444ce35ad1669c0f1a3fc5d"}, - {file = "numpy-1.25.1-cp311-cp311-win32.whl", hash = "sha256:791f409064d0a69dd20579345d852c59822c6aa087f23b07b1b4e28ff5880fcb"}, - {file = "numpy-1.25.1-cp311-cp311-win_amd64.whl", hash = "sha256:c40571fe966393b212689aa17e32ed905924120737194b5d5c1b20b9ed0fb171"}, - {file = "numpy-1.25.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3d7abcdd85aea3e6cdddb59af2350c7ab1ed764397f8eec97a038ad244d2d105"}, - {file = "numpy-1.25.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1a180429394f81c7933634ae49b37b472d343cccb5bb0c4a575ac8bbc433722f"}, - {file = "numpy-1.25.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d412c1697c3853c6fc3cb9751b4915859c7afe6a277c2bf00acf287d56c4e625"}, - {file = "numpy-1.25.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:20e1266411120a4f16fad8efa8e0454d21d00b8c7cee5b5ccad7565d95eb42dd"}, - {file = "numpy-1.25.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f76aebc3358ade9eacf9bc2bb8ae589863a4f911611694103af05346637df1b7"}, - {file = "numpy-1.25.1-cp39-cp39-win32.whl", hash = "sha256:247d3ffdd7775bdf191f848be8d49100495114c82c2bd134e8d5d075fb386a1c"}, - {file = "numpy-1.25.1-cp39-cp39-win_amd64.whl", hash = "sha256:1d5d3c68e443c90b38fdf8ef40e60e2538a27548b39b12b73132456847f4b631"}, - {file = "numpy-1.25.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:35a9527c977b924042170a0887de727cd84ff179e478481404c5dc66b4170009"}, - {file = "numpy-1.25.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d3fe3dd0506a28493d82dc3cf254be8cd0d26f4008a417385cbf1ae95b54004"}, - {file = "numpy-1.25.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:012097b5b0d00a11070e8f2e261128c44157a8689f7dedcf35576e525893f4fe"}, - {file = "numpy-1.25.1.tar.gz", hash = "sha256:9a3a9f3a61480cc086117b426a8bd86869c213fc4072e606f01c4e4b66eb92bf"}, -] - [[package]] name = "packaging" version = "23.1" @@ -538,76 +504,44 @@ docs = ["furo (>=2023.5.20)", "proselint (>=0.13)", "sphinx (>=7.0.1)", "sphinx- test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest-cov (>=4.1)", "pytest-mock (>=3.10)"] [[package]] -name = "polars" -version = "0.18.8" -description = "Blazingly fast DataFrame library" +name = "prql-python" +version = "0.9.2" +description = "" optional = false -python-versions = ">=3.8" +python-versions = ">=3.7" files = [ - {file = "polars-0.18.8-cp38-abi3-macosx_10_7_x86_64.whl", hash = "sha256:b3c541b91dfc528ea5923b409c7dbe21902eeb6070bd3f4cc32e3bef9aced7df"}, - {file = "polars-0.18.8-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:519212359f45e84fcca9e61d81ec1a1f04da4ad1539de2f0575ea287a33f7fab"}, - {file = "polars-0.18.8-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0c8b91abd684c50cfb9f08311264e19e03a9a56e5fc152c760ff69d6a8998fbf"}, - {file = "polars-0.18.8-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:61d61440b10355a1af1206ee8d9cae002ec2398c710603f5ce2c4b6732af31fc"}, - {file = "polars-0.18.8-cp38-abi3-win_amd64.whl", hash = "sha256:0bc01d496b07093c92be17777c48d5ec6a741ffc723c55a992759c189f22e8b2"}, - {file = "polars-0.18.8.tar.gz", hash = "sha256:283ca1357ef643b366bdfd0da2c9f31f252d1ce9a1b656eda3041cc9f83e0fa9"}, + {file = "prql_python-0.9.2-cp37-abi3-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:0da87163da01646820bc4a460403c667d19e66c7b4c946ce8279d434fefb649e"}, + {file = "prql_python-0.9.2-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d90ade324bf2b5aa1b95de71551bd2ee29fc377f553695d8d9d78fd8575e8a3"}, + {file = "prql_python-0.9.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:550d1832d850b8d40393a89ed62043a1ad924e80be21170fa6a2908134a6e74e"}, + {file = "prql_python-0.9.2-cp37-abi3-win_amd64.whl", hash = "sha256:4598ad722f048a25a29a85aeee8dcb1e046c85af06825cfe90d003d750354583"}, + {file = "prql_python-0.9.2.tar.gz", hash = "sha256:98de2534e29069c356a0a0f2967a7a2f41922771d9a18305ea29e97291852e67"}, ] -[package.dependencies] -pyarrow = {version = ">=7.0.0", optional = true, markers = "extra == \"pyarrow\""} - -[package.extras] -adbc = ["adbc_driver_sqlite"] -all = ["polars[adbc,cloudpickle,connectorx,deltalake,fsspec,matplotlib,numpy,pandas,pyarrow,pydantic,sqlalchemy,timezone,xlsx2csv,xlsxwriter]"] -cloudpickle = ["cloudpickle"] -connectorx = ["connectorx"] -deltalake = ["deltalake (>=0.10.0)"] -fsspec = ["fsspec"] -matplotlib = ["matplotlib"] -numpy = ["numpy (>=1.16.0)"] -pandas = ["pandas", "pyarrow (>=7.0.0)"] -pyarrow = ["pyarrow (>=7.0.0)"] -pydantic = ["pydantic"] -sqlalchemy = ["pandas", "sqlalchemy"] -timezone = ["backports.zoneinfo", "tzdata"] -xlsx2csv = ["xlsx2csv (>=0.8.0)"] -xlsxwriter = ["xlsxwriter"] - -[[package]] -name = "pyarrow" -version = "12.0.1" -description = "Python library for Apache Arrow" +[[package]] +name = "psutil" +version = "5.9.5" +description = "Cross-platform lib for process and system monitoring in Python." optional = false -python-versions = ">=3.7" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ - {file = "pyarrow-12.0.1-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:6d288029a94a9bb5407ceebdd7110ba398a00412c5b0155ee9813a40d246c5df"}, - {file = "pyarrow-12.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:345e1828efdbd9aa4d4de7d5676778aba384a2c3add896d995b23d368e60e5af"}, - {file = "pyarrow-12.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8d6009fdf8986332b2169314da482baed47ac053311c8934ac6651e614deacd6"}, - {file = "pyarrow-12.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d3c4cbbf81e6dd23fe921bc91dc4619ea3b79bc58ef10bce0f49bdafb103daf"}, - {file = "pyarrow-12.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:cdacf515ec276709ac8042c7d9bd5be83b4f5f39c6c037a17a60d7ebfd92c890"}, - {file = "pyarrow-12.0.1-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:749be7fd2ff260683f9cc739cb862fb11be376de965a2a8ccbf2693b098db6c7"}, - {file = "pyarrow-12.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6895b5fb74289d055c43db3af0de6e16b07586c45763cb5e558d38b86a91e3a7"}, - {file = "pyarrow-12.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1887bdae17ec3b4c046fcf19951e71b6a619f39fa674f9881216173566c8f718"}, - {file = "pyarrow-12.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e2c9cb8eeabbadf5fcfc3d1ddea616c7ce893db2ce4dcef0ac13b099ad7ca082"}, - {file = "pyarrow-12.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:ce4aebdf412bd0eeb800d8e47db854f9f9f7e2f5a0220440acf219ddfddd4f63"}, - {file = "pyarrow-12.0.1-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:e0d8730c7f6e893f6db5d5b86eda42c0a130842d101992b581e2138e4d5663d3"}, - {file = "pyarrow-12.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:43364daec02f69fec89d2315f7fbfbeec956e0d991cbbef471681bd77875c40f"}, - {file = "pyarrow-12.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:051f9f5ccf585f12d7de836e50965b3c235542cc896959320d9776ab93f3b33d"}, - {file = "pyarrow-12.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:be2757e9275875d2a9c6e6052ac7957fbbfc7bc7370e4a036a9b893e96fedaba"}, - {file = "pyarrow-12.0.1-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:cf812306d66f40f69e684300f7af5111c11f6e0d89d6b733e05a3de44961529d"}, - {file = "pyarrow-12.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:459a1c0ed2d68671188b2118c63bac91eaef6fc150c77ddd8a583e3c795737bf"}, - {file = "pyarrow-12.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85e705e33eaf666bbe508a16fd5ba27ca061e177916b7a317ba5a51bee43384c"}, - {file = "pyarrow-12.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9120c3eb2b1f6f516a3b7a9714ed860882d9ef98c4b17edcdc91d95b7528db60"}, - {file = "pyarrow-12.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:c780f4dc40460015d80fcd6a6140de80b615349ed68ef9adb653fe351778c9b3"}, - {file = "pyarrow-12.0.1-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:a3c63124fc26bf5f95f508f5d04e1ece8cc23a8b0af2a1e6ab2b1ec3fdc91b24"}, - {file = "pyarrow-12.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b13329f79fa4472324f8d32dc1b1216616d09bd1e77cfb13104dec5463632c36"}, - {file = "pyarrow-12.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb656150d3d12ec1396f6dde542db1675a95c0cc8366d507347b0beed96e87ca"}, - {file = "pyarrow-12.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6251e38470da97a5b2e00de5c6a049149f7b2bd62f12fa5dbb9ac674119ba71a"}, - {file = "pyarrow-12.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:3de26da901216149ce086920547dfff5cd22818c9eab67ebc41e863a5883bac7"}, - {file = "pyarrow-12.0.1.tar.gz", hash = "sha256:cce317fc96e5b71107bf1f9f184d5e54e2bd14bbf3f9a3d62819961f0af86fec"}, + {file = "psutil-5.9.5-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:be8929ce4313f9f8146caad4272f6abb8bf99fc6cf59344a3167ecd74f4f203f"}, + {file = "psutil-5.9.5-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:ab8ed1a1d77c95453db1ae00a3f9c50227ebd955437bcf2a574ba8adbf6a74d5"}, + {file = "psutil-5.9.5-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:4aef137f3345082a3d3232187aeb4ac4ef959ba3d7c10c33dd73763fbc063da4"}, + {file = "psutil-5.9.5-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:ea8518d152174e1249c4f2a1c89e3e6065941df2fa13a1ab45327716a23c2b48"}, + {file = "psutil-5.9.5-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:acf2aef9391710afded549ff602b5887d7a2349831ae4c26be7c807c0a39fac4"}, + {file = "psutil-5.9.5-cp27-none-win32.whl", hash = "sha256:5b9b8cb93f507e8dbaf22af6a2fd0ccbe8244bf30b1baad6b3954e935157ae3f"}, + {file = "psutil-5.9.5-cp27-none-win_amd64.whl", hash = "sha256:8c5f7c5a052d1d567db4ddd231a9d27a74e8e4a9c3f44b1032762bd7b9fdcd42"}, + {file = "psutil-5.9.5-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:3c6f686f4225553615612f6d9bc21f1c0e305f75d7d8454f9b46e901778e7217"}, + {file = "psutil-5.9.5-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7a7dd9997128a0d928ed4fb2c2d57e5102bb6089027939f3b722f3a210f9a8da"}, + {file = "psutil-5.9.5-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89518112647f1276b03ca97b65cc7f64ca587b1eb0278383017c2a0dcc26cbe4"}, + {file = "psutil-5.9.5-cp36-abi3-win32.whl", hash = "sha256:104a5cc0e31baa2bcf67900be36acde157756b9c44017b86b2c049f11957887d"}, + {file = "psutil-5.9.5-cp36-abi3-win_amd64.whl", hash = "sha256:b258c0c1c9d145a1d5ceffab1134441c4c5113b2417fafff7315a917a026c3c9"}, + {file = "psutil-5.9.5-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:c607bb3b57dc779d55e1554846352b4e358c10fff3abf3514a7a6601beebdb30"}, + {file = "psutil-5.9.5.tar.gz", hash = "sha256:5410638e4df39c54d957fc51ce03048acd8e6d60abc0f5107af51e5fb566eb3c"}, ] -[package.dependencies] -numpy = ">=1.16.6" +[package.extras] +test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] [[package]] name = "pycparser" @@ -906,4 +840,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "ffacbfa8fc3d9102b586a8c8a3ab23b9488da2f8d7ef1fae388422074bc72e0f" +content-hash = "0ea03cf7c440b21aa4d831e6690dbc6184a516fa3985a131c17b0a8f74c86f55" diff --git a/pyproject.toml b/pyproject.toml index 55fd525a..4ec8aa14 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,14 +11,15 @@ packages = [ [tool.poetry.dependencies] python = "^3.11" -polars = { extras = ["pyarrow"], version = "^0.18.7" } requests = "^2.31.0" typer = { extras = ["all"], version = "^0.9.0" } pygithub = "^1.59.0" fsspec = "^2023.6.0" duckdb = "^0.8.1" tqdm = "^4.65.0" -more-itertools = "^9.1.0" +more-itertools = "^10.0.0" +prql-python = "^0.9.2" +psutil = "^5.9.5" [tool.poetry.scripts] pypi-data = "pypi_data.cli:app" diff --git a/sql/combine.prql b/sql/combine.prql new file mode 100644 index 00000000..12a64784 --- /dev/null +++ b/sql/combine.prql @@ -0,0 +1,2 @@ +prql target:sql.duckdb +from (read_parquet $1) \ No newline at end of file diff --git a/sql/combine.sql b/sql/combine.sql deleted file mode 100644 index 35c4b4b1..00000000 --- a/sql/combine.sql +++ /dev/null @@ -1,4 +0,0 @@ -PRAGMA memory_limit='2GB'; -PRAGMA threads=4; -CREATE TABLE temp_table AS SELECT * FROM read_parquet('input/*.parquet', union_by_name=True); -COPY temp_table TO 'output.parquet' (FORMAT PARQUET, COMPRESSION zstd); \ No newline at end of file diff --git a/sql/stats.prql b/sql/stats.prql new file mode 100644 index 00000000..6b1e85d9 --- /dev/null +++ b/sql/stats.prql @@ -0,0 +1,85 @@ +prql target:sql.duckdb + +let approx_count_distinct = column -> s"approx_count_distinct({column})" +let regexp_extract = column r -> s"regexp_extract({column}, {r})" +let row_to_json = name -> s"row_to_json({name})" +let relation_to_json = func r -> ( + from s=r + aggregate { + _to_json=s"row_to_json(s)" + } + aggregate { + stat = s"json_group_array({_to_json})", + name = s"'{r}'" + } +) + +let base = ( + from (read_parquet($1)) +) + +let base_with_extension = ( + from base + select { + extension = (regexp_extract path "\\.[0-9a-z]+$"), + lines, + size, + skip_reason + } +) + +let total_stats = ( + from base + aggregate { + total_files = count(s"*"), + unique_files = approx_count_distinct(hash) | as bigint, + total_size = sum(size) | as bigint, + total_lines = sum(lines) | as bigint + } +) + +let extension_stats = ( + from base_with_extension + group {extension} ( + aggregate { + total_files = count(s"*"), + total_lines = sum(lines) | as bigint, + total_size = sum(size) | as bigint + } + ) + sort {-total_files} + take 10 +) + +let binary_extension_stats = ( + from base_with_extension + filter skip_reason == "binary" + group {extension} ( + aggregate { + total_files = count(s"*"), + total_lines = sum(lines) | as bigint, + total_size = sum(size) | as bigint + } + ) + sort {-total_files} + take 10 +) + +let skipped_files_stats = ( + from base_with_extension + filter skip_reason != "binary" + group {extension} ( + aggregate { + total_files = count(s"*"), + total_size = sum(size) | as bigint + } + ) + sort {-total_files} + take 10 +) + + +relation_to_json(total_stats) +append (relation_to_json extension_stats) +append (relation_to_json binary_extension_stats) +append (relation_to_json skipped_files_stats) \ No newline at end of file diff --git a/sql/stats.sql b/sql/stats.sql index 51461f3b..28564094 100644 --- a/sql/stats.sql +++ b/sql/stats.sql @@ -4,46 +4,46 @@ SET threads = 4; -COPY -( -select count(*) as "total_files", - approx_count_distinct(hash)::bigint as "unique_files", sum(size)::bigint as "total_size", sum(lines)::bigint as "total_lines", -from 'data/*.parquet' ) TO 'stats/general_stats.json'; - -COPY -( -select regexp_extract(path, '\.[0-9a-z]+$') as extension, - count() as total, - sum(lines)::bigint as lines, sum(size) ::bigint as size, -from 'data/*.parquet' -group by extension -order by total DESC - limit 10 - ) TO 'stats/top_extensions.json'; - -COPY -( -select regexp_extract(path, '\.[0-9a-z]+$') as extension, - count() as total, - sum(size) ::bigint as size, -from 'data/*.parquet' -where skip_reason = 'binary' -group by extension -order by total DESC - limit 10 - ) TO 'stats/top_binary_extensions.json' (ARRAY TRUE); - -COPY -( -select skip_reason, - count(*) as total, - sum(size) ::bigint as size -from 'data/*.parquet' -where skip_reason != '' -group by skip_reason -order by total DESC - limit 10 - ) TO 'stats/skipped_files.json' (ARRAY TRUE); +-- COPY +-- ( +-- select count(*) as "total_files", +-- approx_count_distinct(hash)::bigint as "unique_files", sum(size)::bigint as "total_size", sum(lines)::bigint as "total_lines", +-- from 'data/*.parquet' ) TO 'stats/general_stats.json'; +-- +-- COPY +-- ( +-- select regexp_extract(path, '\.[0-9a-z]+$') as extension, +-- count() as total, +-- sum(lines)::bigint as lines, sum(size) ::bigint as size, +-- from 'data/*.parquet' +-- group by extension +-- order by total DESC +-- limit 10 +-- ) TO 'stats/top_extensions.json'; +-- +-- COPY +-- ( +-- select regexp_extract(path, '\.[0-9a-z]+$') as extension, +-- count() as total, +-- sum(size) ::bigint as size, +-- from 'data/*.parquet' +-- where skip_reason = 'binary' +-- group by extension +-- order by total DESC +-- limit 10 +-- ) TO 'stats/top_binary_extensions.json' (ARRAY TRUE); +-- +-- COPY +-- ( +-- select skip_reason, +-- count(*) as total, +-- sum(size) ::bigint as size +-- from 'data/*.parquet' +-- where skip_reason != '' +-- group by skip_reason +-- order by total DESC +-- limit 10 +-- ) TO 'stats/skipped_files.json' (ARRAY TRUE); COPY diff --git a/sql/unique_python_files.prql b/sql/unique_python_files.prql new file mode 100644 index 00000000..7e39ce35 --- /dev/null +++ b/sql/unique_python_files.prql @@ -0,0 +1,11 @@ +prql target:sql.duckdb + +let any_value = column -> s"any_value({column})" + +from (read_parquet $1) +filter path ~= "\\.py$" +filter skip_reason == "" +group {hash} ( +aggregate { + any_value(path) +}) diff --git a/sql/unique_python_files.sql b/sql/unique_python_files.sql deleted file mode 100644 index 07d03ebd..00000000 --- a/sql/unique_python_files.sql +++ /dev/null @@ -1,9 +0,0 @@ -SET memory_limit='6GB'; -SET threads=2; -COPY -( -select hash, any_value(path) -from read_parquet('data/*.parquet') -where path LIKE '%.py' and skip_reason = '' -group by 1 -) TO 'unique-python-files.parquet' (FORMAT PARQUET, compression zstd); diff --git a/src/pypi_data/cli.py b/src/pypi_data/cli.py index cb32b881..673c7f84 100644 --- a/src/pypi_data/cli.py +++ b/src/pypi_data/cli.py @@ -1,9 +1,15 @@ +import threading +import time + +import duckdb import json from pathlib import Path -from typing import Annotated, Iterable +from typing import Annotated, Iterable, List, Optional + +import psutil from fsspec.implementations.http_sync import HTTPFileSystem import typer -import polars as pl +import prql_python as prql from github import Github from github import Auth import requests @@ -73,5 +79,47 @@ def group_index_urls(github_token: GithubToken, (output_path / "groups.json").write_text(json.dumps(outputs)) +@app.command() +def run_sql( + prql_file: Annotated[Path, typer.Argument(dir_okay=False, file_okay=True, readable=True)], + output_file: Annotated[Path, typer.Argument(dir_okay=False, file_okay=True, writable=True)], + parameter: Annotated[Optional[List[str]], typer.Argument()] = None +): + options = prql.CompileOptions( + format=True, signature_comment=True, target="sql.duckdb" + ) + + sql = prql.compile(prql_file.read_text(), options=options) + print(sql) + print(f'{parameter=}') + print("\n\n\n") + # x = duckdb.execute(sql, parameters=[parameter] if parameter else []) + # import pprint + # pprint.pprint(x.fetchall()) + duckdb.install_extension("httpfs") + duckdb.load_extension("httpfs") + + def print_thread(): + psutil.cpu_percent() + while True: + time.sleep(1) + memory = psutil.virtual_memory() + cpu = psutil.cpu_percent() + print(f'\n{memory.available=} / {memory=} / {cpu=}\n') + + t = threading.Thread(target=print_thread, daemon=True) + t.start() + duckdb.execute("PRAGMA EXPLAIN_OUTPUT='ALL';") + duckdb.execute(f"EXPLAIN COPY ({sql}) TO '{output_file}' (FORMAT PARQUET, COMPRESSION zstd)", parameters=[parameter] if parameter else []) + for name, plan in duckdb.fetchall(): + print(name) + print(plan) + print("\n\n\n") + duckdb.executemany(f"PRAGMA threads=2; " + f"PRAGMA memory_limit='2GB'; " + f"COPY ({sql}) TO '{output_file}' (FORMAT PARQUET, COMPRESSION zstd)", + parameters=[[parameter]] if parameter else []) + + if __name__ == "__main__": app()