diff --git a/.github/workflows/run.yml b/.github/workflows/run.yml index 97722c8a..a6813438 100644 --- a/.github/workflows/run.yml +++ b/.github/workflows/run.yml @@ -94,23 +94,44 @@ jobs: - name: checkout uses: actions/checkout@v3 + - name: Set up python + id: setup-python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + cache: 'pip' + + - name: Install Poetry + uses: snok/install-poetry@v1 + with: + virtualenvs-create: true + virtualenvs-in-project: true + installer-parallel: true + + - name: Load cached venv + id: cached-poetry-dependencies + uses: actions/cache@v3 + with: + path: .venv + key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} + + - name: Install dependencies + if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' + run: poetry install --no-interaction --no-root + + - name: Install root + run: poetry install --only-root + - uses: actions/download-artifact@v3 with: name: groups - - name: Setup DuckDB - run: | - wget https://github.com/duckdb/duckdb/releases/download/v0.8.1/duckdb_cli-linux-amd64.zip -O /tmp/duckdb.zip - unzip /tmp/duckdb.zip -d ${{ github.workspace }} - chmod +x ${{ github.workspace }}/duckdb - - - name: Download - run: | - mkdir input/ - cat ${{ matrix.index }} | jq -rc '.[]' | parallel wget -nv -O input/{#}.parquet {} + - name: Download links + run: cat ${{ matrix.index }} | jq -rc '.[]' - name: Combine - run: ${{ github.workspace }}/duckdb -echo -stats foo.db < ${{ github.workspace }}/sql/combine.sql + run: | + poetry run pypi-data run-sql ${{ github.workspace }}/sql/combine.prql output.parquet $(cat ${{ matrix.index }} | jq -rc '.[]') - name: Upload Assets uses: shogo82148/actions-upload-release-asset@v1 @@ -162,25 +183,40 @@ jobs: needs: [ makepublic, generate-matrix ] runs-on: ubuntu-latest steps: - - name: checkout - uses: actions/checkout@v3 + - name: Set up python + id: setup-python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + cache: 'pip' - - name: Setup DuckDB - run: | - wget https://github.com/duckdb/duckdb/releases/download/v0.8.1/duckdb_cli-linux-amd64.zip -O /tmp/duckdb.zip - unzip /tmp/duckdb.zip -d ${{ github.workspace }} - chmod +x ${{ github.workspace }}/duckdb + - name: Install Poetry + uses: snok/install-poetry@v1 + with: + virtualenvs-create: true + virtualenvs-in-project: true + installer-parallel: true + + - name: Load cached venv + id: cached-poetry-dependencies + uses: actions/cache@v3 + with: + path: .venv + key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} - - name: Setup wget2 - run: sudo apt-get update && sudo apt-get install wget2 + - name: Install dependencies + if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' + run: poetry install --no-interaction --no-root - - name: Download releases - run: | - mkdir data/ - wget2 $(cat download_links.txt) --compression -P data/ + - name: Install root + run: poetry install --only-root + + - name: Download links + run: cat download_links.txt - name: Combine - run: ${{ github.workspace }}/duckdb -echo -stats foo.db < ${{ github.workspace }}/sql/unique_python_files.sql + run: | + poetry run pypi-data run-sql ${{ github.workspace }}/sql/unique_python_files.prql unique-python-files.parquet $(cat download_links.txt) - name: Upload Assets id: upload @@ -202,3 +238,60 @@ jobs: push: true fetch: true pull: '--rebase --autostash' + + generate_stats: + needs: [ makepublic, generate-matrix ] + runs-on: ubuntu-latest + steps: + - name: Set up python + id: setup-python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + cache: 'pip' + + - name: Install Poetry + uses: snok/install-poetry@v1 + with: + virtualenvs-create: true + virtualenvs-in-project: true + installer-parallel: true + + - name: Load cached venv + id: cached-poetry-dependencies + uses: actions/cache@v3 + with: + path: .venv + key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} + + - name: Install dependencies + if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' + run: poetry install --no-interaction --no-root + + - name: Install root + run: poetry install --only-root + + - name: Combine + run: | + poetry run pypi-data run-sql ${{ github.workspace }}/sql/stats.prql stats.parquet $(cat download_links.txt) +# +# - name: Upload Assets +# id: upload +# uses: shogo82148/actions-upload-release-asset@v1 +# with: +# upload_url: ${{ needs.generate-matrix.outputs.upload_url }} +# asset_name: unique-python-files.parquet +# +# - name: Create download links +# run: | +# echo "${{ steps.upload.outputs.browser_download_url }}" > only_python_download_links.txt +# +# - uses: EndBug/add-and-commit@v9 +# with: +# add: 'only_python_download_links.txt' +# author_email: "41898282+github-actions[bot]@users.noreply.github.com" +# author_name: "commit-bot" +# message: "Add only python links for asset ${{ needs.generate-matrix.outputs.release_id }}" +# push: true +# fetch: true +# pull: '--rebase --autostash' diff --git a/poetry.lock b/poetry.lock index e57e731f..1aa09607 100644 --- a/poetry.lock +++ b/poetry.lock @@ -446,13 +446,13 @@ files = [ [[package]] name = "more-itertools" -version = "9.1.0" +version = "10.0.0" description = "More routines for operating on iterables, beyond itertools" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "more-itertools-9.1.0.tar.gz", hash = "sha256:cabaa341ad0389ea83c17a94566a53ae4c9d07349861ecb14dc6d0345cf9ac5d"}, - {file = "more_itertools-9.1.0-py3-none-any.whl", hash = "sha256:d2bc7f02446e86a68911e58ded76d6561eea00cddfb2a91e7019bbb586c799f3"}, + {file = "more-itertools-10.0.0.tar.gz", hash = "sha256:cd65437d7c4b615ab81c0640c0480bc29a550ea032891977681efd28344d51e1"}, + {file = "more_itertools-10.0.0-py3-none-any.whl", hash = "sha256:928d514ffd22b5b0a8fce326d57f423a55d2ff783b093bab217eda71e732330f"}, ] [[package]] @@ -466,40 +466,6 @@ files = [ {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, ] -[[package]] -name = "numpy" -version = "1.25.1" -description = "Fundamental package for array computing in Python" -optional = false -python-versions = ">=3.9" -files = [ - {file = "numpy-1.25.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:77d339465dff3eb33c701430bcb9c325b60354698340229e1dff97745e6b3efa"}, - {file = "numpy-1.25.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d736b75c3f2cb96843a5c7f8d8ccc414768d34b0a75f466c05f3a739b406f10b"}, - {file = "numpy-1.25.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a90725800caeaa160732d6b31f3f843ebd45d6b5f3eec9e8cc287e30f2805bf"}, - {file = "numpy-1.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c6c9261d21e617c6dc5eacba35cb68ec36bb72adcff0dee63f8fbc899362588"}, - {file = "numpy-1.25.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0def91f8af6ec4bb94c370e38c575855bf1d0be8a8fbfba42ef9c073faf2cf19"}, - {file = "numpy-1.25.1-cp310-cp310-win32.whl", hash = "sha256:fd67b306320dcadea700a8f79b9e671e607f8696e98ec255915c0c6d6b818503"}, - {file = "numpy-1.25.1-cp310-cp310-win_amd64.whl", hash = "sha256:c1516db588987450b85595586605742879e50dcce923e8973f79529651545b57"}, - {file = "numpy-1.25.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6b82655dd8efeea69dbf85d00fca40013d7f503212bc5259056244961268b66e"}, - {file = "numpy-1.25.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e8f6049c4878cb16960fbbfb22105e49d13d752d4d8371b55110941fb3b17800"}, - {file = "numpy-1.25.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41a56b70e8139884eccb2f733c2f7378af06c82304959e174f8e7370af112e09"}, - {file = "numpy-1.25.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5154b1a25ec796b1aee12ac1b22f414f94752c5f94832f14d8d6c9ac40bcca6"}, - {file = "numpy-1.25.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:38eb6548bb91c421261b4805dc44def9ca1a6eef6444ce35ad1669c0f1a3fc5d"}, - {file = "numpy-1.25.1-cp311-cp311-win32.whl", hash = "sha256:791f409064d0a69dd20579345d852c59822c6aa087f23b07b1b4e28ff5880fcb"}, - {file = "numpy-1.25.1-cp311-cp311-win_amd64.whl", hash = "sha256:c40571fe966393b212689aa17e32ed905924120737194b5d5c1b20b9ed0fb171"}, - {file = "numpy-1.25.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3d7abcdd85aea3e6cdddb59af2350c7ab1ed764397f8eec97a038ad244d2d105"}, - {file = "numpy-1.25.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1a180429394f81c7933634ae49b37b472d343cccb5bb0c4a575ac8bbc433722f"}, - {file = "numpy-1.25.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d412c1697c3853c6fc3cb9751b4915859c7afe6a277c2bf00acf287d56c4e625"}, - {file = "numpy-1.25.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:20e1266411120a4f16fad8efa8e0454d21d00b8c7cee5b5ccad7565d95eb42dd"}, - {file = "numpy-1.25.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f76aebc3358ade9eacf9bc2bb8ae589863a4f911611694103af05346637df1b7"}, - {file = "numpy-1.25.1-cp39-cp39-win32.whl", hash = "sha256:247d3ffdd7775bdf191f848be8d49100495114c82c2bd134e8d5d075fb386a1c"}, - {file = "numpy-1.25.1-cp39-cp39-win_amd64.whl", hash = "sha256:1d5d3c68e443c90b38fdf8ef40e60e2538a27548b39b12b73132456847f4b631"}, - {file = "numpy-1.25.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:35a9527c977b924042170a0887de727cd84ff179e478481404c5dc66b4170009"}, - {file = "numpy-1.25.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d3fe3dd0506a28493d82dc3cf254be8cd0d26f4008a417385cbf1ae95b54004"}, - {file = "numpy-1.25.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:012097b5b0d00a11070e8f2e261128c44157a8689f7dedcf35576e525893f4fe"}, - {file = "numpy-1.25.1.tar.gz", hash = "sha256:9a3a9f3a61480cc086117b426a8bd86869c213fc4072e606f01c4e4b66eb92bf"}, -] - [[package]] name = "packaging" version = "23.1" @@ -538,76 +504,44 @@ docs = ["furo (>=2023.5.20)", "proselint (>=0.13)", "sphinx (>=7.0.1)", "sphinx- test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest-cov (>=4.1)", "pytest-mock (>=3.10)"] [[package]] -name = "polars" -version = "0.18.8" -description = "Blazingly fast DataFrame library" +name = "prql-python" +version = "0.9.2" +description = "" optional = false -python-versions = ">=3.8" +python-versions = ">=3.7" files = [ - {file = "polars-0.18.8-cp38-abi3-macosx_10_7_x86_64.whl", hash = "sha256:b3c541b91dfc528ea5923b409c7dbe21902eeb6070bd3f4cc32e3bef9aced7df"}, - {file = "polars-0.18.8-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:519212359f45e84fcca9e61d81ec1a1f04da4ad1539de2f0575ea287a33f7fab"}, - {file = "polars-0.18.8-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0c8b91abd684c50cfb9f08311264e19e03a9a56e5fc152c760ff69d6a8998fbf"}, - {file = "polars-0.18.8-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:61d61440b10355a1af1206ee8d9cae002ec2398c710603f5ce2c4b6732af31fc"}, - {file = "polars-0.18.8-cp38-abi3-win_amd64.whl", hash = "sha256:0bc01d496b07093c92be17777c48d5ec6a741ffc723c55a992759c189f22e8b2"}, - {file = "polars-0.18.8.tar.gz", hash = "sha256:283ca1357ef643b366bdfd0da2c9f31f252d1ce9a1b656eda3041cc9f83e0fa9"}, + {file = "prql_python-0.9.2-cp37-abi3-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:0da87163da01646820bc4a460403c667d19e66c7b4c946ce8279d434fefb649e"}, + {file = "prql_python-0.9.2-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d90ade324bf2b5aa1b95de71551bd2ee29fc377f553695d8d9d78fd8575e8a3"}, + {file = "prql_python-0.9.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:550d1832d850b8d40393a89ed62043a1ad924e80be21170fa6a2908134a6e74e"}, + {file = "prql_python-0.9.2-cp37-abi3-win_amd64.whl", hash = "sha256:4598ad722f048a25a29a85aeee8dcb1e046c85af06825cfe90d003d750354583"}, + {file = "prql_python-0.9.2.tar.gz", hash = "sha256:98de2534e29069c356a0a0f2967a7a2f41922771d9a18305ea29e97291852e67"}, ] -[package.dependencies] -pyarrow = {version = ">=7.0.0", optional = true, markers = "extra == \"pyarrow\""} - -[package.extras] -adbc = ["adbc_driver_sqlite"] -all = ["polars[adbc,cloudpickle,connectorx,deltalake,fsspec,matplotlib,numpy,pandas,pyarrow,pydantic,sqlalchemy,timezone,xlsx2csv,xlsxwriter]"] -cloudpickle = ["cloudpickle"] -connectorx = ["connectorx"] -deltalake = ["deltalake (>=0.10.0)"] -fsspec = ["fsspec"] -matplotlib = ["matplotlib"] -numpy = ["numpy (>=1.16.0)"] -pandas = ["pandas", "pyarrow (>=7.0.0)"] -pyarrow = ["pyarrow (>=7.0.0)"] -pydantic = ["pydantic"] -sqlalchemy = ["pandas", "sqlalchemy"] -timezone = ["backports.zoneinfo", "tzdata"] -xlsx2csv = ["xlsx2csv (>=0.8.0)"] -xlsxwriter = ["xlsxwriter"] - -[[package]] -name = "pyarrow" -version = "12.0.1" -description = "Python library for Apache Arrow" +[[package]] +name = "psutil" +version = "5.9.5" +description = "Cross-platform lib for process and system monitoring in Python." optional = false -python-versions = ">=3.7" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ - {file = "pyarrow-12.0.1-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:6d288029a94a9bb5407ceebdd7110ba398a00412c5b0155ee9813a40d246c5df"}, - {file = "pyarrow-12.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:345e1828efdbd9aa4d4de7d5676778aba384a2c3add896d995b23d368e60e5af"}, - {file = "pyarrow-12.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8d6009fdf8986332b2169314da482baed47ac053311c8934ac6651e614deacd6"}, - {file = "pyarrow-12.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d3c4cbbf81e6dd23fe921bc91dc4619ea3b79bc58ef10bce0f49bdafb103daf"}, - {file = "pyarrow-12.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:cdacf515ec276709ac8042c7d9bd5be83b4f5f39c6c037a17a60d7ebfd92c890"}, - {file = "pyarrow-12.0.1-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:749be7fd2ff260683f9cc739cb862fb11be376de965a2a8ccbf2693b098db6c7"}, - {file = "pyarrow-12.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6895b5fb74289d055c43db3af0de6e16b07586c45763cb5e558d38b86a91e3a7"}, - {file = "pyarrow-12.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1887bdae17ec3b4c046fcf19951e71b6a619f39fa674f9881216173566c8f718"}, - {file = "pyarrow-12.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e2c9cb8eeabbadf5fcfc3d1ddea616c7ce893db2ce4dcef0ac13b099ad7ca082"}, - {file = "pyarrow-12.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:ce4aebdf412bd0eeb800d8e47db854f9f9f7e2f5a0220440acf219ddfddd4f63"}, - {file = "pyarrow-12.0.1-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:e0d8730c7f6e893f6db5d5b86eda42c0a130842d101992b581e2138e4d5663d3"}, - {file = "pyarrow-12.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:43364daec02f69fec89d2315f7fbfbeec956e0d991cbbef471681bd77875c40f"}, - {file = "pyarrow-12.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:051f9f5ccf585f12d7de836e50965b3c235542cc896959320d9776ab93f3b33d"}, - {file = "pyarrow-12.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:be2757e9275875d2a9c6e6052ac7957fbbfc7bc7370e4a036a9b893e96fedaba"}, - {file = "pyarrow-12.0.1-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:cf812306d66f40f69e684300f7af5111c11f6e0d89d6b733e05a3de44961529d"}, - {file = "pyarrow-12.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:459a1c0ed2d68671188b2118c63bac91eaef6fc150c77ddd8a583e3c795737bf"}, - {file = "pyarrow-12.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85e705e33eaf666bbe508a16fd5ba27ca061e177916b7a317ba5a51bee43384c"}, - {file = "pyarrow-12.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9120c3eb2b1f6f516a3b7a9714ed860882d9ef98c4b17edcdc91d95b7528db60"}, - {file = "pyarrow-12.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:c780f4dc40460015d80fcd6a6140de80b615349ed68ef9adb653fe351778c9b3"}, - {file = "pyarrow-12.0.1-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:a3c63124fc26bf5f95f508f5d04e1ece8cc23a8b0af2a1e6ab2b1ec3fdc91b24"}, - {file = "pyarrow-12.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b13329f79fa4472324f8d32dc1b1216616d09bd1e77cfb13104dec5463632c36"}, - {file = "pyarrow-12.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb656150d3d12ec1396f6dde542db1675a95c0cc8366d507347b0beed96e87ca"}, - {file = "pyarrow-12.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6251e38470da97a5b2e00de5c6a049149f7b2bd62f12fa5dbb9ac674119ba71a"}, - {file = "pyarrow-12.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:3de26da901216149ce086920547dfff5cd22818c9eab67ebc41e863a5883bac7"}, - {file = "pyarrow-12.0.1.tar.gz", hash = "sha256:cce317fc96e5b71107bf1f9f184d5e54e2bd14bbf3f9a3d62819961f0af86fec"}, + {file = "psutil-5.9.5-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:be8929ce4313f9f8146caad4272f6abb8bf99fc6cf59344a3167ecd74f4f203f"}, + {file = "psutil-5.9.5-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:ab8ed1a1d77c95453db1ae00a3f9c50227ebd955437bcf2a574ba8adbf6a74d5"}, + {file = "psutil-5.9.5-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:4aef137f3345082a3d3232187aeb4ac4ef959ba3d7c10c33dd73763fbc063da4"}, + {file = "psutil-5.9.5-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:ea8518d152174e1249c4f2a1c89e3e6065941df2fa13a1ab45327716a23c2b48"}, + {file = "psutil-5.9.5-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:acf2aef9391710afded549ff602b5887d7a2349831ae4c26be7c807c0a39fac4"}, + {file = "psutil-5.9.5-cp27-none-win32.whl", hash = "sha256:5b9b8cb93f507e8dbaf22af6a2fd0ccbe8244bf30b1baad6b3954e935157ae3f"}, + {file = "psutil-5.9.5-cp27-none-win_amd64.whl", hash = "sha256:8c5f7c5a052d1d567db4ddd231a9d27a74e8e4a9c3f44b1032762bd7b9fdcd42"}, + {file = "psutil-5.9.5-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:3c6f686f4225553615612f6d9bc21f1c0e305f75d7d8454f9b46e901778e7217"}, + {file = "psutil-5.9.5-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7a7dd9997128a0d928ed4fb2c2d57e5102bb6089027939f3b722f3a210f9a8da"}, + {file = "psutil-5.9.5-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89518112647f1276b03ca97b65cc7f64ca587b1eb0278383017c2a0dcc26cbe4"}, + {file = "psutil-5.9.5-cp36-abi3-win32.whl", hash = "sha256:104a5cc0e31baa2bcf67900be36acde157756b9c44017b86b2c049f11957887d"}, + {file = "psutil-5.9.5-cp36-abi3-win_amd64.whl", hash = "sha256:b258c0c1c9d145a1d5ceffab1134441c4c5113b2417fafff7315a917a026c3c9"}, + {file = "psutil-5.9.5-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:c607bb3b57dc779d55e1554846352b4e358c10fff3abf3514a7a6601beebdb30"}, + {file = "psutil-5.9.5.tar.gz", hash = "sha256:5410638e4df39c54d957fc51ce03048acd8e6d60abc0f5107af51e5fb566eb3c"}, ] -[package.dependencies] -numpy = ">=1.16.6" +[package.extras] +test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] [[package]] name = "pycparser" @@ -906,4 +840,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "ffacbfa8fc3d9102b586a8c8a3ab23b9488da2f8d7ef1fae388422074bc72e0f" +content-hash = "0ea03cf7c440b21aa4d831e6690dbc6184a516fa3985a131c17b0a8f74c86f55" diff --git a/pyproject.toml b/pyproject.toml index 55fd525a..4ec8aa14 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,14 +11,15 @@ packages = [ [tool.poetry.dependencies] python = "^3.11" -polars = { extras = ["pyarrow"], version = "^0.18.7" } requests = "^2.31.0" typer = { extras = ["all"], version = "^0.9.0" } pygithub = "^1.59.0" fsspec = "^2023.6.0" duckdb = "^0.8.1" tqdm = "^4.65.0" -more-itertools = "^9.1.0" +more-itertools = "^10.0.0" +prql-python = "^0.9.2" +psutil = "^5.9.5" [tool.poetry.scripts] pypi-data = "pypi_data.cli:app" diff --git a/sql/combine.prql b/sql/combine.prql new file mode 100644 index 00000000..12a64784 --- /dev/null +++ b/sql/combine.prql @@ -0,0 +1,2 @@ +prql target:sql.duckdb +from (read_parquet $1) \ No newline at end of file diff --git a/sql/combine.sql b/sql/combine.sql deleted file mode 100644 index 35c4b4b1..00000000 --- a/sql/combine.sql +++ /dev/null @@ -1,4 +0,0 @@ -PRAGMA memory_limit='2GB'; -PRAGMA threads=4; -CREATE TABLE temp_table AS SELECT * FROM read_parquet('input/*.parquet', union_by_name=True); -COPY temp_table TO 'output.parquet' (FORMAT PARQUET, COMPRESSION zstd); \ No newline at end of file diff --git a/sql/stats.prql b/sql/stats.prql new file mode 100644 index 00000000..6b1e85d9 --- /dev/null +++ b/sql/stats.prql @@ -0,0 +1,85 @@ +prql target:sql.duckdb + +let approx_count_distinct = column -> s"approx_count_distinct({column})" +let regexp_extract = column r -> s"regexp_extract({column}, {r})" +let row_to_json = name -> s"row_to_json({name})" +let relation_to_json = func r -> ( + from s=r + aggregate { + _to_json=s"row_to_json(s)" + } + aggregate { + stat = s"json_group_array({_to_json})", + name = s"'{r}'" + } +) + +let base = ( + from (read_parquet($1)) +) + +let base_with_extension = ( + from base + select { + extension = (regexp_extract path "\\.[0-9a-z]+$"), + lines, + size, + skip_reason + } +) + +let total_stats = ( + from base + aggregate { + total_files = count(s"*"), + unique_files = approx_count_distinct(hash) | as bigint, + total_size = sum(size) | as bigint, + total_lines = sum(lines) | as bigint + } +) + +let extension_stats = ( + from base_with_extension + group {extension} ( + aggregate { + total_files = count(s"*"), + total_lines = sum(lines) | as bigint, + total_size = sum(size) | as bigint + } + ) + sort {-total_files} + take 10 +) + +let binary_extension_stats = ( + from base_with_extension + filter skip_reason == "binary" + group {extension} ( + aggregate { + total_files = count(s"*"), + total_lines = sum(lines) | as bigint, + total_size = sum(size) | as bigint + } + ) + sort {-total_files} + take 10 +) + +let skipped_files_stats = ( + from base_with_extension + filter skip_reason != "binary" + group {extension} ( + aggregate { + total_files = count(s"*"), + total_size = sum(size) | as bigint + } + ) + sort {-total_files} + take 10 +) + + +relation_to_json(total_stats) +append (relation_to_json extension_stats) +append (relation_to_json binary_extension_stats) +append (relation_to_json skipped_files_stats) \ No newline at end of file diff --git a/sql/stats.sql b/sql/stats.sql index 51461f3b..28564094 100644 --- a/sql/stats.sql +++ b/sql/stats.sql @@ -4,46 +4,46 @@ SET threads = 4; -COPY -( -select count(*) as "total_files", - approx_count_distinct(hash)::bigint as "unique_files", sum(size)::bigint as "total_size", sum(lines)::bigint as "total_lines", -from 'data/*.parquet' ) TO 'stats/general_stats.json'; - -COPY -( -select regexp_extract(path, '\.[0-9a-z]+$') as extension, - count() as total, - sum(lines)::bigint as lines, sum(size) ::bigint as size, -from 'data/*.parquet' -group by extension -order by total DESC - limit 10 - ) TO 'stats/top_extensions.json'; - -COPY -( -select regexp_extract(path, '\.[0-9a-z]+$') as extension, - count() as total, - sum(size) ::bigint as size, -from 'data/*.parquet' -where skip_reason = 'binary' -group by extension -order by total DESC - limit 10 - ) TO 'stats/top_binary_extensions.json' (ARRAY TRUE); - -COPY -( -select skip_reason, - count(*) as total, - sum(size) ::bigint as size -from 'data/*.parquet' -where skip_reason != '' -group by skip_reason -order by total DESC - limit 10 - ) TO 'stats/skipped_files.json' (ARRAY TRUE); +-- COPY +-- ( +-- select count(*) as "total_files", +-- approx_count_distinct(hash)::bigint as "unique_files", sum(size)::bigint as "total_size", sum(lines)::bigint as "total_lines", +-- from 'data/*.parquet' ) TO 'stats/general_stats.json'; +-- +-- COPY +-- ( +-- select regexp_extract(path, '\.[0-9a-z]+$') as extension, +-- count() as total, +-- sum(lines)::bigint as lines, sum(size) ::bigint as size, +-- from 'data/*.parquet' +-- group by extension +-- order by total DESC +-- limit 10 +-- ) TO 'stats/top_extensions.json'; +-- +-- COPY +-- ( +-- select regexp_extract(path, '\.[0-9a-z]+$') as extension, +-- count() as total, +-- sum(size) ::bigint as size, +-- from 'data/*.parquet' +-- where skip_reason = 'binary' +-- group by extension +-- order by total DESC +-- limit 10 +-- ) TO 'stats/top_binary_extensions.json' (ARRAY TRUE); +-- +-- COPY +-- ( +-- select skip_reason, +-- count(*) as total, +-- sum(size) ::bigint as size +-- from 'data/*.parquet' +-- where skip_reason != '' +-- group by skip_reason +-- order by total DESC +-- limit 10 +-- ) TO 'stats/skipped_files.json' (ARRAY TRUE); COPY diff --git a/sql/unique_python_files.prql b/sql/unique_python_files.prql new file mode 100644 index 00000000..7e39ce35 --- /dev/null +++ b/sql/unique_python_files.prql @@ -0,0 +1,11 @@ +prql target:sql.duckdb + +let any_value = column -> s"any_value({column})" + +from (read_parquet $1) +filter path ~= "\\.py$" +filter skip_reason == "" +group {hash} ( +aggregate { + any_value(path) +}) diff --git a/sql/unique_python_files.sql b/sql/unique_python_files.sql deleted file mode 100644 index 07d03ebd..00000000 --- a/sql/unique_python_files.sql +++ /dev/null @@ -1,9 +0,0 @@ -SET memory_limit='6GB'; -SET threads=2; -COPY -( -select hash, any_value(path) -from read_parquet('data/*.parquet') -where path LIKE '%.py' and skip_reason = '' -group by 1 -) TO 'unique-python-files.parquet' (FORMAT PARQUET, compression zstd); diff --git a/src/pypi_data/cli.py b/src/pypi_data/cli.py index cb32b881..673c7f84 100644 --- a/src/pypi_data/cli.py +++ b/src/pypi_data/cli.py @@ -1,9 +1,15 @@ +import threading +import time + +import duckdb import json from pathlib import Path -from typing import Annotated, Iterable +from typing import Annotated, Iterable, List, Optional + +import psutil from fsspec.implementations.http_sync import HTTPFileSystem import typer -import polars as pl +import prql_python as prql from github import Github from github import Auth import requests @@ -73,5 +79,47 @@ def group_index_urls(github_token: GithubToken, (output_path / "groups.json").write_text(json.dumps(outputs)) +@app.command() +def run_sql( + prql_file: Annotated[Path, typer.Argument(dir_okay=False, file_okay=True, readable=True)], + output_file: Annotated[Path, typer.Argument(dir_okay=False, file_okay=True, writable=True)], + parameter: Annotated[Optional[List[str]], typer.Argument()] = None +): + options = prql.CompileOptions( + format=True, signature_comment=True, target="sql.duckdb" + ) + + sql = prql.compile(prql_file.read_text(), options=options) + print(sql) + print(f'{parameter=}') + print("\n\n\n") + # x = duckdb.execute(sql, parameters=[parameter] if parameter else []) + # import pprint + # pprint.pprint(x.fetchall()) + duckdb.install_extension("httpfs") + duckdb.load_extension("httpfs") + + def print_thread(): + psutil.cpu_percent() + while True: + time.sleep(1) + memory = psutil.virtual_memory() + cpu = psutil.cpu_percent() + print(f'\n{memory.available=} / {memory=} / {cpu=}\n') + + t = threading.Thread(target=print_thread, daemon=True) + t.start() + duckdb.execute("PRAGMA EXPLAIN_OUTPUT='ALL';") + duckdb.execute(f"EXPLAIN COPY ({sql}) TO '{output_file}' (FORMAT PARQUET, COMPRESSION zstd)", parameters=[parameter] if parameter else []) + for name, plan in duckdb.fetchall(): + print(name) + print(plan) + print("\n\n\n") + duckdb.executemany(f"PRAGMA threads=2; " + f"PRAGMA memory_limit='2GB'; " + f"COPY ({sql}) TO '{output_file}' (FORMAT PARQUET, COMPRESSION zstd)", + parameters=[[parameter]] if parameter else []) + + if __name__ == "__main__": app()