feat: ONT sample preprocessing #2765
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow "Tests": triggered by pushes to master and by pull requests
# whose base branch matches "*".
# NOTE(review): in GitHub's branch-filter syntax "*" does not match "/",
# so base branches containing a slash are not covered -- confirm intended.
name: Tests
on:
  push:
    branches:
      - master
  pull_request:
    branches:
      - "*"
jobs:
  # Runs on every ref except refs/heads/master and invokes a
  # workflow-cancel action for "main.yml" (judging by the job name, to
  # cancel superseded runs -- confirm against the action's documentation).
  # NOTE(review): the action reference below reads "[email protected]", which
  # looks like an e-mail-obfuscation artifact rather than a real
  # `owner/repo@version` ref -- restore the pinned ref from the repository.
  Cancel-previous-jobs:
    runs-on: ubuntu-latest
    if: github.ref != 'refs/heads/master'
    steps:
      - uses: khan/[email protected]
        with:
          workflows: "main.yml"
        env:
          GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}"
# Job "Formatting": checks out the repository and runs super-linter on it.
  Formatting:
    runs-on: ubuntu-latest
    steps:
      # NOTE(review): super-linter's documentation recommends a full clone
      # (`fetch-depth: 0`) when VALIDATE_ALL_CODEBASE is false, so that the
      # list of changed files can be computed -- confirm whether the default
      # shallow checkout is sufficient here.
      - uses: actions/checkout@v4
      - name: Formatting
        # NOTE(review): "[email protected]" looks like an e-mail-obfuscation
        # artifact; restore the real `super-linter/<repo>@<version>` ref.
        uses: super-linter/[email protected]
        env:
          VALIDATE_ALL_CODEBASE: false
          DEFAULT_BRANCH: master
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # CHANGELOG.md is excluded from linting.
          FILTER_REGEX_EXCLUDE: CHANGELOG.md
          # Linters explicitly switched on for this job.
          VALIDATE_SNAKEMAKE_SNAKEFMT: true
          VALIDATE_PYTHON_BLACK: true
          VALIDATE_MARKDOWN: true
# Job "Linting": runs `snakemake --lint` on workflow/Snakefile from the
# repository root.
  Linting:
    runs-on: ubuntu-latest
    env:
      GISAID_API_TOKEN: ${{ secrets.GISAID_API_TOKEN }}
    steps:
      - uses: actions/checkout@v4
      - name: Lint workflow
        # NOTE(review): "[email protected]" looks like an e-mail-obfuscation
        # artifact; restore the real `snakemake/<repo>@<version>` ref.
        uses: snakemake/[email protected]
        with:
          directory: .
          snakefile: workflow/Snakefile
          # Installs peppy into the "snakemake" environment before linting.
          stagein: mamba install -n snakemake -c conda-forge peppy
          args: "--lint"
  # The pre-commit action currently fails, see:
  # https://github.com/IKIM-Essen/uncovar/actions/runs/4304753941/jobs/7506225198#step:4:115
  # Revisit once a new pre-commit release (> 3.0.0) is out.
  # Pre-Commit:
  #   runs-on: ubuntu-latest
  #   if: github.ref != 'refs/heads/master'
  #   steps:
  #     - uses: actions/checkout@v4
  #     - uses: actions/setup-python@v5
  #     - uses: pre-commit/[email protected]
# Job "Technology-Tests": for every combination of rule, sequencing
# technology and sequencing method, downloads small test inputs, writes
# .tests/config/pep/samples.csv and runs the Snakemake workflow in .tests.
# Runs only after the Formatting and Linting jobs have succeeded.
# NOTE(review): every "[email protected]" action ref in this job looks like an
# e-mail-obfuscation artifact; restore the real `owner/repo@version` refs.
  Technology-Tests:
    runs-on: ubuntu-latest
    env:
      GISAID_API_TOKEN: ${{ secrets.GISAID_API_TOKEN }}
    needs:
      - Formatting
      - Linting
      # - Pre-Commit
    strategy:
      matrix:
        # "all -np" is passed verbatim to snakemake, i.e. target "all" with
        # the extra flags "-np".
        rule: [all, all -np]
        # disable ont actions
        technology: [all, illumina, ont, ion]
        # technology: [all, illumina, ion]
        seq_method: [shotgun, amplicon]
    steps:
      - uses: actions/checkout@v4
      - name: Free Disk Space (Ubuntu)
        uses: jlumbroso/[email protected]
        with:
          # this might remove tools that are actually needed,
          # if set to "true" but frees about 6 GB
          tool-cache: false
          # all of these default to true, but feel free to set to
          # "false" if necessary for your workflow
          android: true
          dotnet: true
          haskell: true
          large-packages: true
          docker-images: false
          swap-storage: true
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      # android - will release about 10 GB if you don't need Android
      # dotnet - will release about 20 GB if you don't need .NET
      - name: Free up some disk space
        run: |
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /usr/share/dotnet
      # NOTE(review): no step in this job defines `id: test-data`, and
      # 'compare_assemblers' is not part of this job's rule matrix, so the
      # conditions of the four "Prepare test data" steps below should
      # effectively reduce to the `matrix.technology` comparison.
      # Each step exports AMPLICON=0 for shotgun and AMPLICON=1 otherwise and
      # writes it into the is_amplicon_data column of samples.csv.
      - name: Prepare test data for all technologies
        if: steps.test-data.outputs.cache-hit != true && (startsWith(matrix.rule, 'all') && matrix.technology == 'all' || matrix.rule == 'compare_assemblers')
        run: |
          if [[ "${{ matrix.seq_method }}" = "shotgun" ]] ; then export AMPLICON=0; else export AMPLICON=1; fi
          mkdir -p .tests/data
          # NOTE(review): B117.2.fastq.gz is fetched from the same "reads.1"
          # URL as B117.1.fastq.gz -- confirm this is intentional.
          curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/B.1.1.7.reads.1.fastq.gz > .tests/data/B117.1.fastq.gz
          curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/B.1.1.7.reads.1.fastq.gz > .tests/data/B117.2.fastq.gz
          curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/ERR5745913.fastq.gz > .tests/data/ion_reads.fastq.gz
          echo sample_name,fq1,fq2,date,is_amplicon_data,technology > .tests/config/pep/samples.csv
          echo illumina-test,data/B117.1.fastq.gz,data/B117.2.fastq.gz,2022-01-01,$AMPLICON,illumina >> .tests/config/pep/samples.csv
          echo ion-test,data/ion_reads.fastq.gz,,2022-01-01,$AMPLICON,ion >> .tests/config/pep/samples.csv
      - name: Prepare test data for Illumina
        if: steps.test-data.outputs.cache-hit != true && (startsWith(matrix.rule, 'all') && matrix.technology == 'illumina' || matrix.rule == 'compare_assemblers')
        run: |
          if [[ "${{ matrix.seq_method }}" = "shotgun" ]] ; then export AMPLICON=0; else export AMPLICON=1; fi
          mkdir -p .tests/data
          # NOTE(review): both mates are fetched from the "reads.1" URL --
          # confirm this is intentional.
          curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/B.1.1.7.reads.1.fastq.gz > .tests/data/B117.1.fastq.gz
          curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/B.1.1.7.reads.1.fastq.gz > .tests/data/B117.2.fastq.gz
          echo sample_name,fq1,fq2,date,is_amplicon_data,technology > .tests/config/pep/samples.csv
          echo illumina-test,data/B117.1.fastq.gz,data/B117.2.fastq.gz,2022-01-01,$AMPLICON,illumina >> .tests/config/pep/samples.csv
      - name: Prepare test data for Oxford Nanopore
        if: steps.test-data.outputs.cache-hit != true && (startsWith(matrix.rule, 'all') && matrix.technology == 'ont' || matrix.rule == 'compare_assemblers')
        run: |
          if [[ "${{ matrix.seq_method }}" = "shotgun" ]] ; then export AMPLICON=0; else export AMPLICON=1; fi
          mkdir -p .tests/data
          curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/ont_reads.fastq.gz > .tests/data/ont_reads.fastq.gz
          echo sample_name,fq1,date,is_amplicon_data,technology > .tests/config/pep/samples.csv
          echo ont-test,data/ont_reads.fastq.gz,2022-01-01,$AMPLICON,ont >> .tests/config/pep/samples.csv
      - name: Prepare test data for Ion Torrent
        if: steps.test-data.outputs.cache-hit != true && (startsWith(matrix.rule, 'all') && matrix.technology == 'ion' || matrix.rule == 'compare_assemblers')
        run: |
          if [[ "${{ matrix.seq_method }}" = "shotgun" ]] ; then export AMPLICON=0; else export AMPLICON=1; fi
          mkdir -p .tests/data
          curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/ERR5745913.fastq.gz > .tests/data/ion_reads.fastq.gz
          echo sample_name,fq1,date,is_amplicon_data,technology > .tests/config/pep/samples.csv
          echo ion-test,data/ion_reads.fastq.gz,2022-01-01,$AMPLICON,ion >> .tests/config/pep/samples.csv
      # NOTE(review): no step in this job defines `id: test-resources`, so
      # this condition should always be true.
      - name: Use smaller reference files for testing
        if: steps.test-resources.outputs.cache-hit != true
        run: |
          # mkdir -p .tests/resources/minikraken-8GB
          # curl -SL https://github.com/thomasbtf/small-kraken-db/raw/master/human_k2db.tar.gz | tar zxvf - -C .tests/resources/minikraken-8GB --strip 1
          mkdir -p .tests/resources/genomes
          curl -SL "https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?id=BA000005.3&db=nuccore&report=fasta" | gzip -c > .tests/resources/genomes/human-genome.fna.gz
      # Writes the strain genome list and fetches the two listed FASTA files
      # from NCBI; `sed '$ d'` drops the last line of each download.
      - name: Simulate GISAID download
        run: |
          mkdir -p .tests/results/benchmarking/tables
          echo -e "resources/genomes/B.1.1.7.fasta\nresources/genomes/B.1.351.fasta" > .tests/results/benchmarking/tables/strain-genomes.txt
          mkdir -p .tests/resources/genomes
          curl "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=MZ314997.1&rettype=fasta" | sed '$ d' > .tests/resources/genomes/B.1.1.7.fasta
          curl "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=MZ314998.1&rettype=fasta" | sed '$ d' > .tests/resources/genomes/B.1.351.fasta
      - name: Test rule ${{ matrix.rule }} on ${{ matrix.technology }} ${{ matrix.seq_method }} data
        uses: snakemake/[email protected]
        with:
          directory: .tests
          snakefile: workflow/Snakefile
          args: "-p --use-conda --show-failed-logs --cores 2 --resources ncbi_api_requests=1 --conda-cleanup-pkgs cache --conda-frontend mamba ${{ matrix.rule }}"
      # The report is only generated for the real run, not for 'all -np'.
      - name: Test report
        uses: snakemake/[email protected]
        if: startsWith(matrix.rule, 'all -np') != true
        with:
          directory: .tests
          snakefile: workflow/Snakefile
          args: "${{ matrix.rule }} --report report.zip"
      # Artifacts are uploaded only for the real run on technology "all".
      # The guard compares against 'all -np', the value actually present in
      # the rule matrix; the previous literal 'all -npr' never matched, so
      # the 'all -np' entry was not skipped.
      - name: Upload report
        uses: actions/upload-artifact@v4
        if: matrix.technology == 'all' && matrix.rule != 'all -np'
        with:
          name: report-rule-${{ matrix.rule }}-${{ matrix.technology }}-${{ matrix.seq_method }}
          path: .tests/results/patient-reports/2022-01-01.zip
      - name: Upload logs
        uses: actions/upload-artifact@v4
        if: matrix.technology == 'all' && matrix.rule != 'all -np'
        with:
          # seq_method is part of the name because upload-artifact@v4 rejects
          # a second upload under an existing artifact name within one run;
          # without it the shotgun and amplicon jobs would collide.
          name: log-rule-${{ matrix.rule }}-technology-${{ matrix.technology }}-${{ matrix.seq_method }}
          path: .tests/logs/
      - name: Change permissions for caching
        run: sudo chmod -R 755 .tests/.snakemake/conda
      - name: Print disk space
        run: sudo df -h
# Job "Benchmarks-Tests": runs one benchmark rule per matrix entry in .tests
# and then inspects the result tables of selected benchmarks, failing the
# job when a check does not hold. Runs only after Formatting and Linting.
# NOTE(review): every "[email protected]" action ref in this job looks like an
# e-mail-obfuscation artifact; restore the real `owner/repo@version` refs.
  Benchmarks-Tests:
    runs-on: ubuntu-latest
    env:
      GISAID_API_TOKEN: ${{ secrets.GISAID_API_TOKEN }}
    needs:
      - Formatting
      - Linting
      # - Pre-Commit
    strategy:
      matrix:
        rule:
          - benchmark_strain_calling
          - benchmark_assembly
          - benchmark_mixtures
          - benchmark_non_sars_cov_2
          - benchmark_reads
          - compare_assemblers
          # - generate_test_cases
    steps:
      - uses: actions/checkout@v4
      - name: Free Disk Space (Ubuntu)
        uses: jlumbroso/[email protected]
        with:
          # this might remove tools that are actually needed,
          # if set to "true" but frees about 6 GB
          tool-cache: false
          # all of these default to true, but feel free to set to
          # "false" if necessary for your workflow
          android: true
          dotnet: true
          haskell: true
          large-packages: true
          docker-images: false
          swap-storage: true
      # - name: Cache conda dependencies
      #   uses: actions/cache@v2
      #   with:
      #     path: |
      #       .tests/.snakemake/conda
      #     key: benchmarks-${{ runner.os }}-${{ matrix.rule }}-${{ matrix.technology }}-${{ matrix.seq_method }}-${{ hashFiles('*.tests/.snakemake/conda/*.yaml') }}
      # TODO caches are currently completely misleading, as they lead to certain files becoming present on disk which might
      # then hide failures that would otherwise be seen.
      # - name: Get date
      #   id: get-date
      #   run: |
      #     echo "::set-output name=date::$(/bin/date -u "+%Y%m%d")"
      #   shell: bash
      # - name: Cache resources
      #   id: test-resources
      #   uses: actions/cache@v2
      #   with:
      #     path: |
      #       .tests/resources/minikraken-8GB
      #       .tests/resources/genomes/human-genome.fna.gz
      #     key: ${{ runner.os }}-test-resources-${{ steps.get-date.outputs.date }}-${{ hashFiles('**.tests/resources**taxo.k2d') }}
      #     restore-keys: |
      #       ${{ runner.os }}-test-resources-${{ steps.get-date.outputs.date }}-
      #       ${{ runner.os }}-test-resources-
      # - name: Cache results
      #   if: startsWith(matrix.rule, 'all')
      #   id: test-results
      #   uses: actions/cache@v2
      #   with:
      #     path: |
      #       .tests/results
      #     key: ${{ runner.os }}-results-${{ steps.get-date.outputs.date }}-${{ hashFiles('**results/2021-02-01/qc/multiqc.html') }}
      #     restore-keys: |
      #       ${{ runner.os }}-results-${{ steps.get-date.outputs.date }}-
      #       ${{ runner.os }}-results-
      # - name: Cache data
      #   if: startsWith(matrix.rule, 'all')
      #   id: test-data
      #   uses: actions/cache@v2
      #   with:
      #     path: |
      #       .tests/data
      #     key: ${{ runner.os }}-test-data-${{ steps.get-date.outputs.date }}-${{ hashFiles('**.tests/data/*.fastq.gz') }}
      #     restore-keys: |
      #       ${{ runner.os }}-test-data-${{ steps.get-date.outputs.date }}-
      #       ${{ runner.os }}-test-data-
      # - name: Cache benchmark data
      #   if: startsWith(matrix.rule, 'all') != true
      #   id: benchmark-data
      #   uses: actions/cache@v2
      #   with:
      #     path: |
      #       .tests/resources/benchmarking
      #     key: ${{ runner.os }}-benchmark-data-${{ steps.get-date.outputs.date }}-${{ hashFiles('**.tests/resources/benchmarking/**/reads.1.fastq.gz') }}
      #     restore-keys: |
      #       ${{ runner.os }}-benchmark-data-${{ steps.get-date.outputs.date }}-
      #       ${{ runner.os }}-benchmark-data-
      # - name: Cache test dependencies
      #   if: startsWith(matrix.rule, 'all')
      #   id: test-dependencies
      #   uses: actions/cache@v2
      #   with:
      #     path: |
      #       .tests/.snakemake/conda
      #     key: ${{ runner.os }}-sars-cov-test-dependencies-${{ steps.get-date.outputs.date }}-${{ hashFiles('*.tests/.snakemake/conda/*.yaml') }}
      #     restore-keys: |
      #       ${{ runner.os }}-sars-cov-test-dependencies-${{ steps.get-date.outputs.date }}-
      #       ${{ runner.os }}-sars-cov-test-dependencies-
      # - name: Cache benchmark dependencies
      #   if: startsWith(matrix.rule, 'all') != true
      #   id: benchmark-dependencies
      #   uses: actions/cache@v2
      #   with:
      #     path: |
      #       .tests/.snakemake/conda
      #     key: ${{ runner.os }}-sars-cov-benchmark-dependencies-${{ steps.get-date.outputs.date }}-${{ hashFiles('*.tests/.snakemake/conda/*.yaml') }}
      #     restore-keys: |
      #       ${{ runner.os }}-sars-cov-benchmark-dependencies-${{ steps.get-date.outputs.date }}-
      #       ${{ runner.os }}-sars-cov-benchmark-dependencies-
      # NOTE(review): 'generate_test_cases' is commented out of the rule
      # matrix above, so this first "Prepare test data" step never runs at
      # present; the second one runs for every matrix entry.
      - name: Prepare test data
        if: matrix.rule == 'generate_test_cases'
        run: |
          mkdir -p .tests/data
          curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/B.1.1.7.reads.1.fastq.gz > .tests/data/B117.1.fastq.gz
          curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/B.1.1.7.reads.1.fastq.gz > .tests/data/B117.2.fastq.gz
          curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/ont_reads.fastq.gz > .tests/data/ont_reads.fastq.gz
          echo sample_name,fq1,fq2,date,is_amplicon_data,technology,test_case > .tests/config/pep/samples.csv
          echo illumina-test,data/B117.1.fastq.gz,data/B117.2.fastq.gz,2022-01-01,1,illumina,case >> .tests/config/pep/samples.csv
          echo ont-test,data/ont_reads.fastq.gz,,2022-01-01,1,ont,case >> .tests/config/pep/samples.csv
      - name: Prepare test data
        if: matrix.rule != 'generate_test_cases'
        run: |
          mkdir -p .tests/data
          # NOTE(review): both mates are fetched from the "reads.1" URL --
          # confirm this is intentional.
          curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/B.1.1.7.reads.1.fastq.gz > .tests/data/B117.1.fastq.gz
          curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/B.1.1.7.reads.1.fastq.gz > .tests/data/B117.2.fastq.gz
          echo sample_name,fq1,fq2,date,is_amplicon_data,technology > .tests/config/pep/samples.csv
          echo illumina-test,data/B117.1.fastq.gz,data/B117.2.fastq.gz,2022-01-01,0,illumina >> .tests/config/pep/samples.csv
      # NOTE(review): the only step with `id: test-resources` is commented
      # out above, so this condition should always be true.
      - name: Use smaller reference files for testing
        if: steps.test-resources.outputs.cache-hit != true
        run: |
          # mkdir -p .tests/resources/minikraken-8GB
          # curl -SL https://github.com/thomasbtf/small-kraken-db/raw/master/human_k2db.tar.gz | tar zxvf - -C .tests/resources/minikraken-8GB --strip 1
          mkdir -p .tests/resources/genomes
          curl -SL "https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?id=BA000005.3&db=nuccore&report=fasta" | gzip -c > .tests/resources/genomes/human-genome.fna.gz
      # Writes the strain genome list and fetches the two listed FASTA files
      # from NCBI; `sed '$ d'` drops the last line of each download.
      - name: Simulate GISAID download
        run: |
          mkdir -p .tests/results/benchmarking/tables
          echo -e "resources/genomes/B.1.1.7.fasta\nresources/genomes/B.1.351.fasta" > .tests/results/benchmarking/tables/strain-genomes.txt
          mkdir -p .tests/resources/genomes
          curl "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=MZ314997.1&rettype=fasta" | sed '$ d' > .tests/resources/genomes/B.1.1.7.fasta
          curl "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=MZ314998.1&rettype=fasta" | sed '$ d' > .tests/resources/genomes/B.1.351.fasta
      - name: Test rule ${{ matrix.rule }}
        uses: snakemake/[email protected]
        with:
          directory: .tests
          snakefile: workflow/Snakefile
          args: "-p --use-conda --show-failed-logs --cores 2 --resources ncbi_api_requests=1 --conda-cleanup-pkgs cache --conda-frontend mamba ${{ matrix.rule }}"
      # No rule in this job's matrix starts with 'all -np', so the report
      # step runs for every matrix entry.
      - name: Test report
        uses: snakemake/[email protected]
        if: startsWith(matrix.rule, 'all -np') != true
        with:
          directory: .tests
          snakefile: workflow/Snakefile
          args: "${{ matrix.rule }} --report report.zip"
      # - name: Upload report
      #   uses: actions/upload-artifact@v4
      #   with:
      #     name: report-rule-${{ matrix.rule }}
      #     path: .tests/results/patient-reports/2022-01-01.zip
      - name: Upload logs
        uses: actions/upload-artifact@v4
        with:
          name: log-rule-${{ matrix.rule }}
          path: .tests/logs/
      # - name: Unit test
      #   args: "--generate-unit-tests"
      # - name: Test workflow (singularity)
      #   args: "--use-conda --use-singularity --show-failed-logs --cores 2 --resources ncbi_api_requests=1 --conda-cleanup-pkgs cache --conda-frontend mamba"
      # - name: Test input changes
      #   args: "--use-conda --show-failed-logs --cores 2 --resources ncbi_api_requests=1 --conda-cleanup-pkgs cache --conda-frontend mamba -R `snakemake --list-input-changes`"
      # - name: Test code changes
      #   args: "--use-conda --show-failed-logs --cores 2 --resources ncbi_api_requests=1 --conda-cleanup-pkgs cache --conda-frontend mamba -R `snakemake --list-code-changes`"
      # - name: Test params changes
      #   args: "--use-conda --show-failed-logs --cores 2 --resources ncbi_api_requests=1 --conda-cleanup-pkgs cache --conda-frontend mamba -R `snakemake --list-params-changes`"
      # Fails when any line after the header of strain-calling.csv contains
      # the word "mismatch".
      - name: Check strain calling benchmark
        if: matrix.rule == 'benchmark_strain_calling'
        run: |
          cat .tests/results/benchmarking/strain-calling.csv
          if tail -n +2 .tests/results/benchmarking/strain-calling.csv | grep -q mismatch
          then
            echo "Strain calling failed in some cases (see above)."
            exit 1
          else
            echo "Strain calling was successful in all cases."
          fi
      # NOTE(review): only the LAST line of the CSV is compared against the
      # threshold, although the failure message speaks of "at least one
      # assembly" -- confirm the file layout makes this sufficient.
      - name: Check pseudoassembly benchmark
        if: matrix.rule == 'benchmark_assembly'
        run: |
          cat .tests/results/benchmarking/assembly/pseudoassembly.csv
          # `[[ a < b ]]` in bash compares strings lexicographically, so the
          # threshold check is done in awk, which compares numerically when
          # the value looks like a number.
          last_line="$(tail -1 .tests/results/benchmarking/assembly/pseudoassembly.csv)"
          if awk -v value="$last_line" 'BEGIN { exit !(value < 0.95) }'
          then
            echo "Pseudoassembly benchmarking failed. There is at least one assembly where the contigs do not cover 95% of the original sequence (see above)."
            exit 1
          else
            echo "Pseudoassembly was successful."
          fi
      - name: Check assembly benchmark
        if: matrix.rule == 'benchmark_assembly'
        run: |
          cat .tests/results/benchmarking/assembly/assembly.csv
          # Numeric comparison via awk; see the pseudoassembly check for why
          # `[[ a < b ]]` is not used.
          last_line="$(tail -1 .tests/results/benchmarking/assembly/assembly.csv)"
          if awk -v value="$last_line" 'BEGIN { exit !(value < 0.8) }'
          then
            echo "Assembly benchmarking failed. There is at least one assembly where the contigs do not cover 80% of the original sequence (see above)."
            exit 1
          else
            echo "Assembly was successful."
          fi
      - name: Print non-sars-cov-2 kallisto calls
        if: matrix.rule == 'benchmark_non_sars_cov_2'
        run: |
          cat .tests/results/benchmarking/tables/strain-calls/non-cov2-*.strains.kallisto.tsv
      # Fails when any line of non-sars-cov-2.csv contains "is sars-cov-2".
      - name: Test non-sars-cov-2 coronaviruses
        if: matrix.rule == 'benchmark_non_sars_cov_2'
        run: |
          cat .tests/results/benchmarking/non-sars-cov-2.csv
          if grep -q 'is sars-cov-2' .tests/results/benchmarking/non-sars-cov-2.csv
          then
            echo "Workflow failed! A non-sars-cov-2 genome was identified as sars-cov-2 (see above)."
            exit 1
          else
            echo "Workflow successfully identified samples as non-sars-cov-2 in all cases."
          fi
      - name: Change permissions for caching
        run: sudo chmod -R 755 .tests/.snakemake/conda
      - name: Print disk space
        run: sudo df -h