Skip to content

Build data

Build data #435

Workflow file for this run

# Builds the dataset. Triggered manually (workflow_dispatch) and by a daily
# cron schedule at 03:00 (GitHub evaluates cron expressions in UTC); it is
# NOT triggered by pushes.
on:
  workflow_dispatch:
  schedule:
    - cron: "0 3 * * *"
# All runs share the "data" concurrency group, so they do not execute
# simultaneously.
concurrency: data
name: Build data
jobs:
# Job (belongs under the top-level `jobs:` mapping).
# Refreshes the repository URL lists committed under links/, computes the
# matrix of index groups consumed by the `combine` job, uploads the group
# files as an artifact, and opens a draft release for the build outputs.
generate-matrix:
  runs-on: ubuntu-latest
  # Values handed to downstream jobs via `needs.generate-matrix.outputs.*`.
  outputs:
    matrix: ${{ steps.groups.outputs.matrix }}
    upload_url: ${{ steps.create-release.outputs.upload_url }}
    release_id: ${{ steps.create-release.outputs.id }}
  steps:
    - name: checkout
      uses: actions/checkout@v4
    - name: Set up python
      id: setup-python
      uses: actions/setup-python@v5
      with:
        python-version: '3.11'
        cache: 'pip'
    - name: Install Poetry
      uses: snok/install-poetry@v1
      with:
        virtualenvs-create: true
        virtualenvs-in-project: true
        installer-parallel: true
    # The virtualenv lives in .venv (virtualenvs-in-project above), so it can
    # be cached keyed on OS, Python version and the lock file hash.
    - name: Load cached venv
      id: cached-poetry-dependencies
      uses: actions/cache@v4
      with:
        path: .venv
        key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}
    # Third-party dependencies are only installed on a cache miss.
    - name: Install dependencies
      if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
      run: poetry install --no-interaction --no-root
    - name: Install root
      run: poetry install --only-root
    - name: Generate token
      id: generate_token
      uses: pypi-data/github-app-token@v1
      with:
        app_id: ${{ secrets.APP_ID }}
        private_key: ${{ secrets.APP_PRIVATE_KEY }}
    - name: Write repository URL lists
      env:
        GITHUB_TOKEN: ${{ steps.generate_token.outputs.token }}
      run: |
        poetry run pypi-data print-git-urls > links/repositories.txt
        poetry run pypi-data print-git-urls --ssh-urls > links/repositories_ssh.txt
    - name: Commit repository URL lists
      uses: EndBug/add-and-commit@v9
      with:
        add: |
          links/repositories.txt
          links/repositories_ssh.txt
        author_email: "41898282+github-actions[bot]@users.noreply.github.com"
        author_name: "commit-bot"
        message: "Add repository URLs"
        push: true
        fetch: true
        pull: '--rebase --autostash'
    - name: Group index URLs
      id: groups
      env:
        GITHUB_TOKEN: ${{ steps.generate_token.outputs.token }}
      run: |
        mkdir -p output/
        poetry run pypi-data group-index-urls output/
        # Compact the JSON onto a single line: a plain `name=value` entry in
        # $GITHUB_OUTPUT cannot span multiple lines.
        echo "matrix=$(jq -c . output/groups.json)" >> "$GITHUB_OUTPUT"
    - name: Upload group files
      uses: actions/upload-artifact@v4
      with:
        name: groups
        path: output/groups/
        retention-days: 1
    # Despite the step name, the timestamp is published as a step output
    # (steps.version.outputs.tag_name), not as an environment variable.
    - name: "Set current date as env variable"
      id: version
      run: |
        echo "tag_name=$(date +'%Y-%m-%d-%H-%M')" >> "$GITHUB_OUTPUT"
    # The release is created as a draft here; the `makepublic` job is given
    # its id through the `release_id` output.
    - name: Create Release
      id: create-release
      uses: shogo82148/actions-create-release@v1
      with:
        draft: true
        release_name: ${{ steps.version.outputs.tag_name }}
        tag_name: ${{ steps.version.outputs.tag_name }}
        commitish: ${{ github.sha }}
    # Step outputs must be read through `.outputs`; without it the expression
    # evaluates to an empty string.
    - name: Release URL
      run: echo "${{ steps.create-release.outputs.html_url }}"
# Job (belongs under the top-level `jobs:` mapping).
# Runs once per entry of the matrix produced by `generate-matrix`: downloads
# the parquet files listed in that entry's group file, concatenates and
# rewrites them, then uploads the result as a release asset.
combine:
  needs: [generate-matrix]
  runs-on: ubuntu-latest
  strategy:
    # One failing group must not cancel the others.
    fail-fast: false
    matrix:
      index: ${{fromJson(needs.generate-matrix.outputs.matrix)}}
  env:
    # Pass the matrix value to scripts through the environment instead of
    # interpolating it into the shell source, so it is never parsed as code.
    INDEX_FILE: ${{ matrix.index }}
  steps:
    # NOTE(review): `@master` is a mutable ref; consider pinning to a tag or
    # commit SHA.
    - name: Maximize build space
      uses: easimon/maximize-build-space@master
      with:
        remove-dotnet: 'true'
        remove-android: 'true'
        remove-haskell: 'true'
        remove-codeql: 'true'
        remove-docker-images: 'true'
    - name: checkout
      uses: actions/checkout@v4
    # Fetches the group files uploaded by `generate-matrix`.
    - uses: actions/download-artifact@v4
      with:
        name: groups
    # NOTE(review): actions-rs/toolchain appears to be unmaintained — confirm
    # and consider a maintained replacement.
    - name: Install Rust
      uses: actions-rs/toolchain@v1
      with:
        toolchain: stable
        override: true
    - name: Install parquet cli from crates.io
      uses: baptiste0928/cargo-install@v3
      with:
        crate: parquet
        features: cli
    # Only prints the group file's entries; the actual download happens in
    # the "Download" step below.
    - name: Download links
      run: jq -rc '.[]' "$INDEX_FILE"
    - name: Debug
      run: |
        echo "Links for $INDEX_FILE"
        jq -rc '.[] | [.name, .id] | @tsv' "$INDEX_FILE"
    # Each TSV row is `<name>\t<id>`: column 1 is passed to wget as the URL
    # and column 2 names the local file.
    - name: Download
      run: |
        mkdir -p input/
        jq -rc '.[] | [.name, .id] | @tsv' "$INDEX_FILE" | parallel --colsep '\t' wget --tries=2 -nv -O input/{2}.parquet {1}
    - run: ls -la ${{ github.workspace }}/input/
    - name: Combine
      run: parquet-concat ${{ github.workspace }}/merged.parquet ${{ github.workspace }}/input/*.parquet
    - name: Merged size
      run: du -hs ${{ github.workspace }}/merged.parquet
    - name: Rewrite
      run: |
        parquet-rewrite \
          --compression=zstd \
          --writer-version=2.0 \
          --dictionary-enabled=true \
          --statistics-enabled=page \
          --bloom-filter-enabled=true \
          --bloom-filter-fpp 0.2 \
          --input=${{ github.workspace }}/merged.parquet \
          --output=${{ github.workspace }}/output.parquet
    - name: Output size
      run: du -hs ${{ github.workspace }}/output.parquet
    - name: Upload Assets
      uses: shogo82148/actions-upload-release-asset@v1
      with:
        upload_url: ${{ needs.generate-matrix.outputs.upload_url }}
        asset_path: ${{ github.workspace }}/output.parquet
        asset_name: index-${{ matrix.index }}.parquet
# Job (belongs under the top-level `jobs:` mapping).
# Runs after every `combine` matrix job: publishes the draft release created
# by `generate-matrix`, then records the download URL of each release asset
# in links/dataset.txt and commits that file.
makepublic:
  needs: [generate-matrix, combine]
  runs-on: ubuntu-latest
  steps:
    - name: checkout
      uses: actions/checkout@v4
    # NOTE(review): the action ref was garbled in the captured text
    # ("StuYarrow/[email protected]", an e-mail obfuscation artifact). It is
    # restored here as publish-release@v1.1.2 — confirm against the repository.
    - name: Publish release
      uses: StuYarrow/publish-release@v1.1.2
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      with:
        id: ${{ needs.generate-matrix.outputs.release_id }}
    - name: Get download links
      id: get_download_links
      uses: actions/github-script@v7
      with:
        github-token: ${{ secrets.GITHUB_TOKEN }}
        script: |
          // A single listReleaseAssets call returns only the first page of
          // results, so page through them to collect every asset.
          const assets = await github.paginate(github.rest.repos.listReleaseAssets, {
            owner: context.repo.owner,
            repo: context.repo.repo,
            release_id: ${{ needs.generate-matrix.outputs.release_id }},
            per_page: 100,
          });
          // Save the download links to a file, one URL per line
          require('fs').writeFileSync('links/dataset.txt', assets.map(asset => asset.browser_download_url).join('\n'));
    - run: cat links/dataset.txt
    - name: Commit download links
      uses: EndBug/add-and-commit@v9
      with:
        add: 'links/dataset.txt'
        author_email: "41898282+github-actions[bot]@users.noreply.github.com"
        author_name: "commit-bot"
        message: "Add download links for asset ${{ needs.generate-matrix.outputs.release_id }}"
        push: true
        fetch: true
        pull: '--rebase --autostash'