Build data #434
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Triggers: manual dispatch from the Actions tab, plus a daily schedule.
# This workflow does NOT run on push.
on:
  workflow_dispatch:
  schedule:
    # GitHub evaluates cron schedules in UTC: every day at 03:00.
    - cron: "0 3 * * *"

# All runs share the "data" concurrency group, so GitHub will not execute
# two runs of this workflow at the same time.
concurrency: data

name: Build data
jobs:
  # generate-matrix:
  #   1. regenerates links/repositories*.txt and commits them,
  #   2. writes the index groups to output/ and exposes output/groups.json
  #      as the `matrix` job output,
  #   3. creates a *draft* release and exposes its upload URL and id.
  generate-matrix:
    runs-on: ubuntu-latest
    steps:
      - name: checkout
        uses: actions/checkout@v4

      - name: Set up python
        id: setup-python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          cache: 'pip'

      - name: Install Poetry
        uses: snok/install-poetry@v1
        with:
          virtualenvs-create: true
          virtualenvs-in-project: true
          installer-parallel: true

      - name: Load cached venv
        id: cached-poetry-dependencies
        uses: actions/cache@v4
        with:
          path: .venv
          key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}

      # Dependencies are only installed when the cached .venv was not restored.
      - name: Install dependencies
        if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
        run: poetry install --no-interaction --no-root

      - name: Install root
        run: poetry install --only-root

      - name: Generate token
        id: generate_token
        uses: pypi-data/github-app-token@v1
        with:
          app_id: ${{ secrets.APP_ID }}
          private_key: ${{ secrets.APP_PRIVATE_KEY }}

      - name: Write repository URL lists
        env:
          GITHUB_TOKEN: ${{ steps.generate_token.outputs.token }}
        run: |
          poetry run pypi-data print-git-urls > links/repositories.txt
          poetry run pypi-data print-git-urls --ssh-urls > links/repositories_ssh.txt

      - name: Commit repository URL lists
        uses: EndBug/add-and-commit@v9
        with:
          add: |
            links/repositories.txt
            links/repositories_ssh.txt
          author_email: "41898282+github-actions[bot]@users.noreply.github.com"
          author_name: "commit-bot"
          message: "Add repository URLs"
          push: true
          fetch: true
          pull: '--rebase --autostash'

      - name: Group index URLs
        id: groups
        env:
          GITHUB_TOKEN: ${{ steps.generate_token.outputs.token }}
        run: |
          mkdir -p output/
          poetry run pypi-data group-index-urls output/
          # `jq -c` compacts the JSON onto one line: a `key=value` record in
          # $GITHUB_OUTPUT must not span multiple lines.
          echo "matrix=$(jq -c . output/groups.json)" >> "$GITHUB_OUTPUT"

      - name: Upload groups artifact
        uses: actions/upload-artifact@v4
        with:
          name: groups
          path: output/groups/
          retention-days: 1

      # The timestamp doubles as both the release name and the git tag.
      - name: "Set current date as env variable"
        id: version
        run: |
          echo "tag_name=$(date +'%Y-%m-%d-%H-%M')" >> "$GITHUB_OUTPUT"

      # Created as a draft; assets are attached by later jobs.
      - name: Create Release
        id: create-release
        uses: shogo82148/actions-create-release@v1
        with:
          draft: true
          release_name: ${{ steps.version.outputs.tag_name }}
          tag_name: ${{ steps.version.outputs.tag_name }}
          commitish: ${{ github.sha }}

      # Step outputs live under `.outputs`; without it the expression
      # expands to an empty string and nothing is printed.
      - name: Release URL
        env:
          RELEASE_URL: ${{ steps.create-release.outputs.html_url }}
        run: echo "$RELEASE_URL"
    outputs:
      matrix: ${{ steps.groups.outputs.matrix }}
      upload_url: ${{ steps.create-release.outputs.upload_url }}
      release_id: ${{ steps.create-release.outputs.id }}
# Job `combine`: runs once per entry of the generate-matrix `matrix` output.
  # Each entry is used as the path of a JSON file (from the `groups`
  # artifact) holding an array of objects with `.name` (passed to wget as
  # the download URL) and `.id` (used as the local file name). The
  # downloaded parquet files are concatenated, rewritten with zstd
  # compression, and uploaded as one asset of the release.
  combine:
    needs: [generate-matrix]
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        index: ${{fromJson(needs.generate-matrix.outputs.matrix)}}
    env:
      # Handed to the shell as a variable instead of being spliced into the
      # script text, so odd characters in the value cannot alter the command.
      INDEX_FILE: ${{ matrix.index }}
    steps:
      # Pinned to a release tag rather than the mutable `master` branch.
      # NOTE(review): v10 is believed to be the current release - confirm.
      - name: Maximize build space
        uses: easimon/maximize-build-space@v10
        with:
          remove-dotnet: 'true'
          remove-android: 'true'
          remove-haskell: 'true'
          remove-codeql: 'true'
          remove-docker-images: 'true'

      - name: checkout
        uses: actions/checkout@v4

      - uses: actions/download-artifact@v4
        with:
          name: groups

      # Uses the rustup that ships on the runner; replaces the archived
      # actions-rs/toolchain action (toolchain: stable, override: true).
      - name: Install Rust
        run: |
          rustup toolchain install stable --no-self-update
          rustup override set stable

      - name: Install parquet cli from crates.io
        uses: baptiste0928/cargo-install@v3
        with:
          crate: parquet
          features: cli

      - name: Download links
        run: jq -rc '.[]' "$INDEX_FILE"

      - name: Debug
        run: |
          echo "Links for $INDEX_FILE"
          jq -rc '.[] | [.name, .id] | @tsv' "$INDEX_FILE"

      # One wget per TSV row: column 1 is the URL, column 2 the file stem.
      - name: Download
        run: |
          mkdir -p input/
          jq -rc '.[] | [.name, .id] | @tsv' "$INDEX_FILE" | parallel --colsep '\t' wget --tries=2 -nv -O input/{2}.parquet {1}

      - run: ls -la "$GITHUB_WORKSPACE/input/"

      - name: Combine
        run: parquet-concat "$GITHUB_WORKSPACE/merged.parquet" "$GITHUB_WORKSPACE"/input/*.parquet

      - name: Merged size
        run: du -hs "$GITHUB_WORKSPACE/merged.parquet"

      - name: Rewrite
        run: |
          parquet-rewrite \
          --compression=zstd \
          --writer-version=2.0 \
          --dictionary-enabled=true \
          --statistics-enabled=page \
          --bloom-filter-enabled=true \
          --bloom-filter-fpp 0.2 \
          --input="$GITHUB_WORKSPACE/merged.parquet" \
          --output="$GITHUB_WORKSPACE/output.parquet"

      - name: Output size
        run: du -hs "$GITHUB_WORKSPACE/output.parquet"

      - name: Upload Assets
        uses: shogo82148/actions-upload-release-asset@v1
        with:
          upload_url: ${{ needs.generate-matrix.outputs.upload_url }}
          asset_path: ${{ github.workspace }}/output.parquet
          asset_name: index-${{ matrix.index }}.parquet
# Job `makepublic`: runs after every `combine` matrix job has succeeded.
  # Publishes the release created by generate-matrix, writes the download
  # URL of every release asset to links/dataset.txt, and commits that file.
  makepublic:
    needs: [generate-matrix, combine]
    runs-on: ubuntu-latest
    steps:
      - name: checkout
        uses: actions/checkout@v4

      # NOTE(review): the action reference arrived garbled by e-mail
      # obfuscation ("StuYarrow/[email protected]"); reconstructed as
      # publish-release@v1.1.2 - confirm the exact name and version.
      - name: Publish release
        uses: StuYarrow/publish-release@v1.1.2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          id: ${{ needs.generate-matrix.outputs.release_id }}

      - name: Get download links
        id: get_download_links
        uses: actions/github-script@v7
        env:
          RELEASE_ID: ${{ needs.generate-matrix.outputs.release_id }}
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          script: |
            // A single listReleaseAssets call returns only the first page
            // (30 assets by default); paginate so no asset is dropped.
            const assets = await github.paginate(github.rest.repos.listReleaseAssets, {
              owner: context.repo.owner,
              repo: context.repo.repo,
              release_id: Number(process.env.RELEASE_ID),
              per_page: 100,
            });
            // Save the download links to a file
            require('fs').writeFileSync('links/dataset.txt', assets.map(asset => asset.browser_download_url).join('\n'));

      - run: cat links/dataset.txt

      - uses: EndBug/add-and-commit@v9
        with:
          add: 'links/dataset.txt'
          author_email: "41898282+github-actions[bot]@users.noreply.github.com"
          author_name: "commit-bot"
          message: "Add download links for asset ${{ needs.generate-matrix.outputs.release_id }}"
          push: true
          fetch: true
          pull: '--rebase --autostash'