Build data #583
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Runs on manual dispatch and on a daily schedule (03:00 UTC); it is not triggered by pushes.
name: Build data

# Triggers: manual runs (workflow_dispatch) plus a daily cron at 03:00 UTC.
on:
  workflow_dispatch: {}
  schedule:
    - cron: '0 3 * * *'

# Every run of this workflow joins the single "data" concurrency group.
concurrency:
  group: data
jobs:
  # Job 1: refresh the committed repository URL lists, split the index URLs
  # into groups (one matrix entry per group), and open a draft release whose
  # upload URL and id are handed to the later jobs through job outputs.
  generate-matrix:
    runs-on: ubuntu-latest
    env:
      UV_CACHE_DIR: /tmp/.uv-cache
    outputs:
      matrix: ${{ steps.groups.outputs.matrix }}
      upload_url: ${{ steps.create-release.outputs.upload_url }}
      release_id: ${{ steps.create-release.outputs.id }}
    steps:
      - name: checkout
        uses: actions/checkout@v4

      - name: Set up uv
        # Install latest uv version using the installer
        run: curl -LsSf https://astral.sh/uv/install.sh | sh

      - name: Restore uv cache
        uses: actions/cache@v4
        with:
          path: /tmp/.uv-cache
          key: uv-${{ runner.os }}-${{ hashFiles('uv.lock') }}
          # Fallback prefix; the exact key above is always tried first.
          restore-keys: |
            uv-${{ runner.os }}

      - name: Generate token
        id: generate_token
        uses: pypi-data/github-app-token@v1
        with:
          app_id: ${{ secrets.APP_ID }}
          private_key: ${{ secrets.APP_PRIVATE_KEY }}

      # Regenerate both URL lists (HTTPS and SSH form) into links/.
      - env:
          GITHUB_TOKEN: ${{ steps.generate_token.outputs.token }}
        run: |
          uv run pypi-data print-git-urls > links/repositories.txt
          uv run pypi-data print-git-urls --ssh-urls > links/repositories_ssh.txt

      # Commit and push the regenerated lists back to the repository.
      - uses: EndBug/add-and-commit@v9
        with:
          add: |
            links/repositories.txt
            links/repositories_ssh.txt
          author_email: "41898282+github-actions[bot]@users.noreply.github.com"
          author_name: "commit-bot"
          message: "Add repository URLs"
          push: true
          fetch: true
          pull: '--rebase --autostash'

      # Write the groups into output/ and publish output/groups.json as the
      # `matrix` step output consumed by the `combine` job.
      - id: groups
        env:
          GITHUB_TOKEN: ${{ steps.generate_token.outputs.token }}
        run: |
          mkdir -p output/
          uv run pypi-data group-index-urls output/
          echo "matrix=$(cat output/groups.json)" >> "$GITHUB_OUTPUT"

      - uses: actions/upload-artifact@v4
        with:
          name: groups
          path: output/groups/
          retention-days: 1

      # Despite the step name, the timestamp is exposed as a step output
      # (`tag_name`), not as an environment variable.
      - name: "Set current date as env variable"
        id: version
        run: |
          echo "tag_name=$(date +'%Y-%m-%d-%H-%M')" >> "$GITHUB_OUTPUT"

      # The release stays a draft until the `makepublic` job publishes it.
      - name: Create Release
        id: create-release
        uses: shogo82148/actions-create-release@v1
        with:
          draft: true
          release_name: ${{ steps.version.outputs.tag_name }}
          tag_name: ${{ steps.version.outputs.tag_name }}
          commitish: ${{ github.sha }}

      # Step outputs live under `.outputs`; without it the expression
      # evaluates to an empty string and nothing is printed.
      - name: Release URL
        env:
          RELEASE_URL: ${{ steps.create-release.outputs.html_url }}
        run: echo "$RELEASE_URL"

      - name: Minimize uv cache
        run: uv cache prune --ci

  # Job 2: one matrix job per group. Downloads the group's parquet files,
  # concatenates and re-encodes them, and attaches the result to the draft
  # release.
  combine:
    needs: [generate-matrix]
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        index: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
    env:
      # Hand the matrix value to shell steps through the environment instead
      # of splicing it into the script text, so it can be quoted safely.
      INDEX_FILE: ${{ matrix.index }}
    steps:
      # NOTE(review): `@master` is a mutable ref; consider pinning to a
      # release tag or commit SHA.
      - name: Maximize build space
        uses: easimon/maximize-build-space@master
        with:
          remove-dotnet: 'true'
          remove-android: 'true'
          remove-haskell: 'true'
          remove-codeql: 'true'
          remove-docker-images: 'true'

      - name: checkout
        uses: actions/checkout@v4

      # Fetch the group files uploaded by `generate-matrix`.
      - uses: actions/download-artifact@v4
        with:
          name: groups

      - uses: actions-rust-lang/setup-rust-toolchain@v1
        with:
          cache-on-failure: 'false'

      - name: Install parquet cli from crates.io
        uses: baptiste0928/cargo-install@v3
        with:
          crate: parquet
          features: cli

      # Prints the entries of this group's JSON file; nothing is downloaded
      # in this step.
      - name: Download links
        run: jq -rc '.[]' "$INDEX_FILE"

      - name: Debug
        run: |
          echo "Links for $INDEX_FILE"
          jq -rc '.[] | [.name, .id] | @tsv' "$INDEX_FILE"

      # Each TSV row is "<name>\t<id>": column 1 is passed to wget as the
      # URL, column 2 names the local file.
      - name: Download
        run: |
          mkdir -p input/
          jq -rc '.[] | [.name, .id] | @tsv' "$INDEX_FILE" | parallel --colsep '\t' wget --tries=2 -nv -O input/{2}.parquet {1}

      - run: ls -la ${{ github.workspace }}/input/

      - name: Combine
        run: parquet-concat ${{ github.workspace }}/merged.parquet ${{ github.workspace }}/input/*.parquet

      - name: Merged size
        run: du -hs ${{ github.workspace }}/merged.parquet

      - name: Rewrite
        run: |
          parquet-rewrite \
            --compression=zstd \
            --writer-version=2.0 \
            --dictionary-enabled=true \
            --statistics-enabled=page \
            --bloom-filter-enabled=true \
            --bloom-filter-fpp 0.2 \
            --input=${{ github.workspace }}/merged.parquet \
            --output=${{ github.workspace }}/output.parquet

      - name: Output size
        run: du -hs ${{ github.workspace }}/output.parquet

      - name: Upload Assets
        uses: shogo82148/actions-upload-release-asset@v1
        with:
          upload_url: ${{ needs.generate-matrix.outputs.upload_url }}
          asset_path: ${{ github.workspace }}/output.parquet
          asset_name: index-${{ matrix.index }}.parquet

  # Job 3: publish the draft release once every `combine` job is done, then
  # commit the asset download URLs to links/dataset.txt.
  makepublic:
    needs: [generate-matrix, combine]
    runs-on: ubuntu-latest
    steps:
      - name: checkout
        uses: actions/checkout@v4

      # NOTE(review): this ref was garbled by e-mail obfuscation in the
      # extracted source ("StuYarrow/[email protected]"); it is restored here
      # as publish-release@v1.1.2 -- confirm against the repository history.
      - name: Publish release
        uses: StuYarrow/publish-release@v1.1.2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          id: ${{ needs.generate-matrix.outputs.release_id }}

      - name: Get download links
        id: get_download_links
        uses: actions/github-script@v7
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          script: |
            const response = await github.rest.repos.listReleaseAssets({
              owner: context.repo.owner,
              repo: context.repo.repo,
              release_id: ${{ needs.generate-matrix.outputs.release_id }},
            });
            const assets = response.data;
            // Save the download links to a file
            require('fs').writeFileSync('links/dataset.txt', assets.map(asset => asset.browser_download_url).join('\n'));

      - run: cat links/dataset.txt

      - uses: EndBug/add-and-commit@v9
        with:
          add: 'links/dataset.txt'
          author_email: "41898282+github-actions[bot]@users.noreply.github.com"
          author_name: "commit-bot"
          message: "Add download links for asset ${{ needs.generate-matrix.outputs.release_id }}"
          push: true
          fetch: true
          pull: '--rebase --autostash'