Skip to content

Commit

Permalink
Try including the repository in the output
Browse files (browse the repository at this point in the history)
  • Loading branch information
orf committed Aug 1, 2023
1 parent 9edc96e commit c7454a1
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 4 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/run.yml
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ jobs:

- name: Download links
run: cat ${{ matrix.index }} | jq -rc '.[]'

- name: Setup DuckDB
run: |
wget https://github.com/duckdb/duckdb/releases/download/v0.8.1/duckdb_cli-linux-amd64.zip -O /tmp/duckdb.zip
Expand All @@ -134,7 +135,7 @@ jobs:
- name: Download
run: |
mkdir input/
cat ${{ matrix.index }} | jq -rc '.[]' | parallel wget -nv -O input/{#}.parquet {}
cat ${{ matrix.index }} | jq -rc '.[] | [.name, .id] | @tsv' | parallel --colsep '\t' -N 2 wget -nv -O {2}.parquet {1}
- name: Combine
run: ${{ github.workspace }}/duckdb -echo -stats foo.db < ${{ github.workspace }}/sql/combine.sql
Expand Down
4 changes: 3 additions & 1 deletion sql/combine.sql
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
PRAGMA memory_limit='2GB';
PRAGMA threads=4;
CREATE TABLE temp_table AS SELECT * FROM read_parquet('input/*.parquet', union_by_name=True);
CREATE TABLE temp_table AS
select regexp_extract(filename, '(\d+)\.parquet', 1)::USMALLINT as repository, * exclude (filename)
FROM read_parquet('input/*.parquet', union_by_name = True, filename=true);
COPY temp_table TO 'output.parquet' (FORMAT PARQUET, COMPRESSION zstd);
9 changes: 7 additions & 2 deletions src/pypi_data/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def print_git_urls(github_token: GithubToken):
print(url)


def group_by_size(github: Github, target_size: int) -> Iterable[list[str]]:
def group_by_size(github: Github, target_size: int) -> Iterable[list[tuple[int, str]]]:
fs = HTTPFileSystem()
urls = (u[1] for u in _get_urls(github))
with ThreadPoolExecutor() as pool:
Expand All @@ -60,7 +60,12 @@ def group_by_size(github: Github, target_size: int) -> Iterable[list[str]]:
names = []
total_size = 0
for stat_result in stat_results:
names.append(stat_result["name"])
name = stat_result["name"]
index = int(name.removeprefix('https://github.com/pypi-data/pypi-mirror-').split('/')[0])
names.append({
"name": stat_result["name"],
"id": index
})
total_size += stat_result["size"]
if total_size >= target_size:
yield names
Expand Down

0 comments on commit c7454a1

Please sign in to comment.