Try including the repository in the output

pypi-data · Aug 1, 2023 · c7454a1
1 parent 9edc96e
commit c7454a1
Show file tree

Hide file tree

Showing 3 changed files with 12 additions and 4 deletions.
diff --git a/.github/workflows/run.yml b/.github/workflows/run.yml
@@ -125,6 +125,7 @@ jobs:
 
       - name: Download links
         run: cat ${{ matrix.index }} | jq -rc '.[]'
+
       - name: Setup DuckDB
         run: |
           wget https://github.com/duckdb/duckdb/releases/download/v0.8.1/duckdb_cli-linux-amd64.zip -O /tmp/duckdb.zip
@@ -134,7 +135,7 @@ jobs:
       - name: Download
         run: |
           mkdir input/
-          cat ${{ matrix.index }} | jq -rc '.[]' | parallel wget -nv -O input/{#}.parquet {}
+          cat ${{ matrix.index }} | jq -rc '.[] | [.name, .id] | @tsv' | parallel --colsep '\t' -N 2 wget -nv -O {2}.parquet {1}
 
       - name: Combine
         run: ${{ github.workspace }}/duckdb -echo -stats foo.db < ${{ github.workspace }}/sql/combine.sql

diff --git a/sql/combine.sql b/sql/combine.sql
@@ -1,4 +1,6 @@
 PRAGMA memory_limit='2GB';
 PRAGMA threads=4;
-CREATE TABLE temp_table AS SELECT * FROM read_parquet('input/*.parquet', union_by_name=True);
+CREATE TABLE temp_table AS
+select regexp_extract(filename, '(\d+)\.parquet', 1)::USMALLINT as repository, * exclude (filename)
+FROM read_parquet('input/*.parquet', union_by_name = True, filename=true);
 COPY temp_table TO 'output.parquet' (FORMAT PARQUET, COMPRESSION zstd);
diff --git a/src/pypi_data/cli.py b/src/pypi_data/cli.py
@@ -51,7 +51,7 @@ def print_git_urls(github_token: GithubToken):
         print(url)
 
 
-def group_by_size(github: Github, target_size: int) -> Iterable[list[str]]:
+def group_by_size(github: Github, target_size: int) -> Iterable[list[tuple[int, str]]]:
     fs = HTTPFileSystem()
     urls = (u[1] for u in _get_urls(github))
     with ThreadPoolExecutor() as pool:
@@ -60,7 +60,12 @@ def group_by_size(github: Github, target_size: int) -> Iterable[list[str]]:
         names = []
         total_size = 0
         for stat_result in stat_results:
-            names.append(stat_result["name"])
+            name = stat_result["name"]
+            index = int(name.removeprefix('https://github.com/pypi-data/pypi-mirror-').split('/')[0])
+            names.append({
+                "name": stat_result["name"],
+                "id": index
+            })
             total_size += stat_result["size"]
             if total_size >= target_size:
                 yield names