From c7454a1baa64a7e6ef06d81452937829f2a495c0 Mon Sep 17 00:00:00 2001 From: Tom Forbes Date: Tue, 1 Aug 2023 17:20:28 +0100 Subject: [PATCH] Try including the repository in the output --- .github/workflows/run.yml | 3 ++- sql/combine.sql | 4 +++- src/pypi_data/cli.py | 9 +++++++-- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/.github/workflows/run.yml b/.github/workflows/run.yml index 68873391..8d335592 100644 --- a/.github/workflows/run.yml +++ b/.github/workflows/run.yml @@ -125,6 +125,7 @@ jobs: - name: Download links run: cat ${{ matrix.index }} | jq -rc '.[]' + - name: Setup DuckDB run: | wget https://github.com/duckdb/duckdb/releases/download/v0.8.1/duckdb_cli-linux-amd64.zip -O /tmp/duckdb.zip @@ -134,7 +135,7 @@ - name: Download run: | mkdir input/ - cat ${{ matrix.index }} | jq -rc '.[]' | parallel wget -nv -O input/{#}.parquet {} + cat ${{ matrix.index }} | jq -rc '.[] | [.name, .id] | @tsv' | parallel --colsep '\t' -N 2 wget -nv -O input/{2}.parquet {1} - name: Combine run: ${{ github.workspace }}/duckdb -echo -stats foo.db < ${{ github.workspace }}/sql/combine.sql diff --git a/sql/combine.sql b/sql/combine.sql index 35c4b4b1..c6557fc1 100644 --- a/sql/combine.sql +++ b/sql/combine.sql @@ -1,4 +1,6 @@ PRAGMA memory_limit='2GB'; PRAGMA threads=4; -CREATE TABLE temp_table AS SELECT * FROM read_parquet('input/*.parquet', union_by_name=True); +CREATE TABLE temp_table AS +select regexp_extract(filename, '(\d+)\.parquet', 1)::USMALLINT as repository, * exclude (filename) +FROM read_parquet('input/*.parquet', union_by_name = True, filename=true); COPY temp_table TO 'output.parquet' (FORMAT PARQUET, COMPRESSION zstd); \ No newline at end of file diff --git a/src/pypi_data/cli.py b/src/pypi_data/cli.py index d5e70137..779ef273 100644 --- a/src/pypi_data/cli.py +++ b/src/pypi_data/cli.py @@ -51,7 +51,7 @@ def print_git_urls(github_token: GithubToken): print(url) -def group_by_size(github: Github, target_size: int) -> Iterable[list[str]]: +def 
group_by_size(github: Github, target_size: int) -> Iterable[list[dict[str, str | int]]]: fs = HTTPFileSystem() urls = (u[1] for u in _get_urls(github)) with ThreadPoolExecutor() as pool: @@ -60,7 +60,12 @@ def group_by_size(github: Github, target_size: int) -> Iterable[list[str]]: names = [] total_size = 0 for stat_result in stat_results: - names.append(stat_result["name"]) + name = stat_result["name"] + index = int(name.removeprefix('https://github.com/pypi-data/pypi-mirror-').split('/')[0]) + names.append({ + "name": stat_result["name"], + "id": index + }) total_size += stat_result["size"] if total_size >= target_size: yield names