Process Samples Aspera #1122

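# Scheduled batch workflow: pulls a batch of new SRA accessions (downloaded via
# Aspera), runs the Freyja-SRA Nextflow pipeline on a self-hosted runner,
# aggregates the results, publishes them to Google Cloud Storage, and records
# the processed samples back in the repository.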
name: Process Samples Aspera
on:
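  # Hourly from 00:00 to 16:00 UTC on Sundays and Saturdays (day-of-week 0 and 6).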
  schedule:
    - cron: '0 0-16 * * 0,6'
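# Maximum number of new accessions pulled and processed per run; passed to the
# accession-list script and to the pipeline as --num_samples.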
env:
  BATCH_SIZE: 100
jobs:
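  # Lightweight gate job: builds the accession list and exposes run_rest_jobs
  # so the heavier run_samples job only starts when there is new data.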
  setup:
    permissions:
      contents: write
      id-token: write
    runs-on: self-hosted
    outputs:
      run_rest_jobs: ${{ steps.get_accession_list.outputs.run_jobs }}
    steps:
      - name: Checkout main
        uses: actions/checkout@v4
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
      - name: Setup Python
        run: |
          echo ${{ secrets.DPILZ_USR_PWD }} | sudo -S dnf install python3 -y
          echo ${{ secrets.DPILZ_USR_PWD }} | sudo -S dnf install python3-pip -y
          pip3 install pandas numpy pyyaml ffq epiweeks git+https://github.com/outbreak-info/python-outbreak-info.git@new_docs
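      # scripts/get_accession_list.py is expected to write data/accession_list.csv
      # only when unprocessed accessions remain; the file's presence drives the
      # run_jobs output consumed by the job-level run_rest_jobs gate.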
      - name: 'Get accession list'
        id: get_accession_list
        run: |
          python scripts/get_accession_list.py $BATCH_SIZE
          if [[ -f data/accession_list.csv ]]; then
            echo "run_jobs=true" >> "$GITHUB_OUTPUT"
          else
            echo "run_jobs=false" >> "$GITHUB_OUTPUT"
          fi
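  # Main processing job: downloads and processes the batch, aggregates and
  # publishes the results, then marks the samples as processed. Skipped
  # entirely when setup found no new accessions.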
  run_samples:
    needs: [setup]
    if: needs.setup.outputs.run_rest_jobs == 'true'
    permissions:
      contents: write
      id-token: write
    runs-on: self-hosted
    steps:
      - name: Checkout main
        uses: actions/checkout@v4
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
      - name: Setup nextflow
        uses: nf-core/setup-nextflow@v1
      - name: Setup Python
        run: |
          echo ${{ secrets.DPILZ_USR_PWD }} | sudo -S dnf install python3 -y
          echo ${{ secrets.DPILZ_USR_PWD }} | sudo -S dnf install python3-pip -y
          pip3 install pandas numpy pyyaml ffq epiweeks git+https://github.com/outbreak-info/python-outbreak-info.git@new_docs
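      # Keyless Google Cloud authentication via Workload Identity Federation;
      # relies on the id-token: write permission granted to this job.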
      - name: 'Set up Cloud SDK'
        uses: 'google-github-actions/setup-gcloud@v2'
        with:
          version: '>= 363.0.0'
      - id: 'auth'
        name: 'Authenticate with gcloud'
        uses: 'google-github-actions/auth@v2'
        with:
          workload_identity_provider: 'projects/12767718289/locations/global/workloadIdentityPools/github/providers/freyja-sra'
          service_account: '[email protected]'
      - name: 'Get accession list'
        id: get_accession_list
        run: |
          python scripts/get_accession_list.py $BATCH_SIZE
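      # Run the 'aspera' entry workflow of main.nf on the new batch. Backgrounding
      # Nextflow and then wait-ing on it keeps the step's exit status tied to the
      # pipeline process.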
      - name: Run pipeline on new samples
        run: |
          export NXF_ENABLE_VIRTUAL_THREADS=false
          nextflow run main.nf \
            --accession_list data/accession_list.csv \
            --num_samples $BATCH_SIZE \
            -profile docker \
            -entry aspera &
          BG_PID=$!
          wait $BG_PID
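      # Each aggregate_*.py script is assumed to collect this batch's per-sample
      # outputs into a single JSON under outputs/aggregate/ (the paths merged below).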
      - name: Aggregate outputs
        run: |
          python scripts/aggregate_demix.py
          python scripts/aggregate_variants.py
          python scripts/aggregate_metadata.py
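      # Fetch the previously published aggregates from the bucket so the new
      # batch can be merged into them.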
      - id: 'download-aggregated-outputs'
        name: 'Download aggregated outputs'
        run: |
          gcloud storage cp gs://outbreak-ww-data/aggregate/aggregate_demix.json outputs/aggregate/aggregate_demix.json --billing-project=andersen-lab-primary
          gcloud storage cp gs://outbreak-ww-data/aggregate/aggregate_variants.json outputs/aggregate/aggregate_variants.json --billing-project=andersen-lab-primary
          gcloud storage cp gs://outbreak-ww-data/aggregate/aggregate_metadata.json outputs/aggregate/aggregate_metadata.json --billing-project=andersen-lab-primary
      - id: 'concatenate-outputs'
        name: 'Concatenate outputs'
        run: |
          python scripts/concat_agg_files.py
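      # Build the week-level demix summary, presumably binned with the epiweeks
      # package installed above.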
      - id: 'create-demix-by-week'
        name: 'Create demix by week'
        run: |
          python scripts/aggregate_demix_by_week.py
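      # Upload the whole outputs/ tree to the outbreak-ww-data bucket;
      # parent: false keeps the local 'outputs/' directory name out of the object paths.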
      - id: 'upload-outputs'
        name: 'Upload Outputs to Cloud Storage'
        uses: 'google-github-actions/upload-cloud-storage@v2'
        with:
          path: 'outputs/'
          destination: 'outbreak-ww-data/'
          parent: false
          project_id: 'andersen-lab-primary'
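      # scripts/update_sample_status.py is expected to mark this batch as processed
      # in data/all_metadata.csv, which is then committed back to the repository.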
      - name: 'Update processed samples'
        run: |
          python scripts/update_sample_status.py $BATCH_SIZE
      - name: 'Commit and push changes'
        run: |
          git config --local user.name "$GITHUB_ACTOR"
          git config --local user.email "[email protected]"
          git remote set-url origin https://github.com/andersen-lab/Freyja-SRA
          git add data/all_metadata.csv
          git commit -m "Update processed samples"
          git push --force
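      # Self-hosted runners do not get a fresh workspace between runs, so clean up explicitly.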
      - name: 'Clean workspace'
        run: |
          rm -rf ${{ github.workspace }}/*