diff --git a/.github/workflows/run.yml b/.github/workflows/run.yml
index 68873391..8d335592 100644
--- a/.github/workflows/run.yml
+++ b/.github/workflows/run.yml
@@ -125,6 +125,7 @@ jobs:
       - name: Download links
         run: cat ${{ matrix.index }} | jq -rc '.[]'
+
       - name: Setup DuckDB
         run: |
           wget https://github.com/duckdb/duckdb/releases/download/v0.8.1/duckdb_cli-linux-amd64.zip -O /tmp/duckdb.zip
@@ -134,7 +135,7 @@
       - name: Download
         run: |
           mkdir input/
-          cat ${{ matrix.index }} | jq -rc '.[]' | parallel wget -nv -O input/{#}.parquet {}
+          cat ${{ matrix.index }} | jq -rc '.[] | [.name, .id] | @tsv' | parallel --colsep '\t' -N 2 wget -nv -O input/{2}.parquet {1}
 
       - name: Combine
         run: ${{ github.workspace }}/duckdb -echo -stats foo.db < ${{ github.workspace }}/sql/combine.sql
diff --git a/sql/combine.sql b/sql/combine.sql
index 35c4b4b1..c6557fc1 100644
--- a/sql/combine.sql
+++ b/sql/combine.sql
@@ -1,4 +1,6 @@
 PRAGMA memory_limit='2GB';
 PRAGMA threads=4;
-CREATE TABLE temp_table AS SELECT * FROM read_parquet('input/*.parquet', union_by_name=True);
+CREATE TABLE temp_table AS
+select regexp_extract(filename, '(\d+)\.parquet', 1)::USMALLINT as repository, * exclude (filename)
+FROM read_parquet('input/*.parquet', union_by_name = True, filename=true);
 COPY temp_table TO 'output.parquet' (FORMAT PARQUET, COMPRESSION zstd);
\ No newline at end of file
diff --git a/src/pypi_data/cli.py b/src/pypi_data/cli.py
index d5e70137..779ef273 100644
--- a/src/pypi_data/cli.py
+++ b/src/pypi_data/cli.py
@@ -51,7 +51,7 @@ def print_git_urls(github_token: GithubToken):
         print(url)
 
 
-def group_by_size(github: Github, target_size: int) -> Iterable[list[str]]:
+def group_by_size(github: Github, target_size: int) -> Iterable[list[dict[str, str | int]]]:
     fs = HTTPFileSystem()
     urls = (u[1] for u in _get_urls(github))
     with ThreadPoolExecutor() as pool:
@@ -60,7 +60,12 @@ def group_by_size(github: Github, target_size: int) -> Iterable[list[str]]:
         names = []
         total_size = 0
         for stat_result in stat_results:
-            names.append(stat_result["name"])
+            name = stat_result["name"]
+            index = int(name.removeprefix('https://github.com/pypi-data/pypi-mirror-').split('/')[0])
+            names.append({
+                "name": stat_result["name"],
+                "id": index
+            })
             total_size += stat_result["size"]
             if total_size >= target_size:
                 yield names
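
A minimal sketch (not part of the diff) of how the pieces fit together after this change: group_by_size() now yields {"name", "id"} entries, the Download step saves each URL to input/<id>.parquet, and combine.sql recovers the id from the filename as the repository column. The example URL suffix and the entry_for helper below are hypothetical; only the pypi-mirror URL prefix and the id-extraction logic come from the code above.

    # Illustrative only: mirrors the id extraction added to group_by_size().
    PREFIX = "https://github.com/pypi-data/pypi-mirror-"

    def entry_for(name: str) -> dict[str, str | int]:
        # ".../pypi-mirror-104/<anything>" -> 104
        repo_id = int(name.removeprefix(PREFIX).split("/")[0])
        return {"name": name, "id": repo_id}

    # Hypothetical asset path; only the prefix is taken from the repository code.
    example = entry_for(PREFIX + "104/releases/download/latest/data.parquet")
    print(example["id"])  # 104
    # The Download step would then save this URL as input/104.parquet, and combine.sql
    # recovers 104 via regexp_extract(filename, '(\d+)\.parquet', 1)::USMALLINT.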