# Patch summary (free-form preamble placed before the first `diff --git` header).
#
# Files touched:
#   1. .github/workflows/unique_python_files.yml (modified, one hunk at line 59)
#   2. sql/unique_python_files_combine.prql      (new file, 10 lines)
#
# Workflow changes:
#   - The step previously named "Combine" is renamed to "Ingest". Its commands are
#     unchanged: it creates combined/ and runs sql/unique_python_files.prql over the
#     dataset/ parquet files via GNU parallel, writing combined/{#}.parquet outputs.
#   - The step previously named "Complete" is renamed to "Combine". It now runs the
#     new sql/unique_python_files_combine.prql (instead of unique_python_files.prql)
#     over combined/*.parquet, still writing unique-python-files.parquet.
#
# New PRQL query (sql/unique_python_files_combine.prql):
#   - Targets the DuckDB SQL dialect.
#   - Defines an `any_value` helper as an s-string wrapper around `any_value(...)`.
#   - Reads parquet input from parameter $1, keeps the `hash` and `path` columns,
#     groups by `hash`, and aggregates `path` with any_value, i.e. one row per hash.
#
# NOTE(review): the patch body below appears with its line breaks collapsed in this
# copy; it is reproduced unmodified. Confirm against the original commit before applying.
diff --git a/.github/workflows/unique_python_files.yml b/.github/workflows/unique_python_files.yml index 6524406c..7134e95a 100644 --- a/.github/workflows/unique_python_files.yml +++ b/.github/workflows/unique_python_files.yml @@ -59,14 +59,14 @@ jobs: run: | sudo apt-get install parallel - - name: Combine + - name: Ingest run: | mkdir combined/ find dataset/ -name '*.parquet' | parallel -j 1 --xargs -n2 poetry run pypi-data run-sql ${{ github.workspace }}/sql/unique_python_files.prql --output=parquet --threads=2 combined/{#}.parquet {} - - name: Complete + - name: Combine run: | - poetry run pypi-data run-sql ${{ github.workspace }}/sql/unique_python_files.prql --output=parquet --threads=2 unique-python-files.parquet combined/*.parquet + poetry run pypi-data run-sql ${{ github.workspace }}/sql/unique_python_files_combine.prql --output=parquet --threads=2 unique-python-files.parquet combined/*.parquet - name: List run: ls combined/ diff --git a/sql/unique_python_files_combine.prql b/sql/unique_python_files_combine.prql new file mode 100644 index 00000000..ac9baa47 --- /dev/null +++ b/sql/unique_python_files_combine.prql @@ -0,0 +1,10 @@ +prql target:sql.duckdb + +let any_value = column -> s"any_value({column})" + +from (read_parquet $1) +select {hash, path} +group {hash} ( +aggregate { + path = any_value(path) +})