From 4051dd2309200ebc5849e2995d8d1e9c11d3fbfb Mon Sep 17 00:00:00 2001 From: boasvdp Date: Fri, 26 Apr 2024 10:36:56 +0200 Subject: [PATCH 1/8] ci: add release-please --- .github/workflows/release.yaml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 .github/workflows/release.yaml diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml new file mode 100644 index 0000000..fff0f70 --- /dev/null +++ b/.github/workflows/release.yaml @@ -0,0 +1,17 @@ +name: Build and release +on: + push: + branches: main + workflow_dispatch: + +jobs: + bump-version: + name: Release version + runs-on: ubuntu-latest + + steps: + - uses: GoogleCloudPlatform/release-please-action@v3 + id: release + with: + release-type: python + package-name: assembly_snptyper \ No newline at end of file From a58377ffbe33e4c37f0b9ef8ed4ca01c89df06ce Mon Sep 17 00:00:00 2001 From: boasvdp Date: Fri, 26 Apr 2024 10:37:07 +0200 Subject: [PATCH 2/8] test: add e2e test --- tests/test_e2e.sh | 34 ++++++++++++++++++++++++++++++++++ tests/test_expected_output.txt | 3 +++ 2 files changed, 37 insertions(+) create mode 100644 tests/test_e2e.sh create mode 100644 tests/test_expected_output.txt diff --git a/tests/test_e2e.sh b/tests/test_e2e.sh new file mode 100644 index 0000000..dd0564c --- /dev/null +++ b/tests/test_e2e.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +set -euxo pipefail + +TEST_DATA_DIR=tests/tmp + +# Download +mkdir -p "$TEST_DATA_DIR" +## M1UK strain from Lynskey et al. +curl --output "$TEST_DATA_DIR"/GCA_034818825.1.zip "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCA_034818825.1/download?include_annotation_type=GENOME_FASTA&hydrated=FULLY_HYDRATED" +## M1global strain from Lynskey et al. +curl --output "$TEST_DATA_DIR"/GCA_034822885.1.zip "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCA_034822885.1/download?include_annotation_type=GENOME_FASTA&hydrated=FULLY_HYDRATED" + +# Unzip +unzip -n "$TEST_DATA_DIR"/GCA_034818825.1.zip -d "$TEST_DATA_DIR" +unzip -n "$TEST_DATA_DIR"/GCA_034822885.1.zip -d "$TEST_DATA_DIR" + +# List genomes +find "$TEST_DATA_DIR" -iname "*.fna" -print > "$TEST_DATA_DIR"/test_genomes.txt + +# Install the script +python -m pip install . + +# Run the script +assembly_snptyper \ + --list_input "$TEST_DATA_DIR"/test_genomes.txt \ + --vcf data/M1UK.vcf \ + --reference "$TEST_DATA_DIR"/MGAS5005.fa \ + -p 2 > "$TEST_DATA_DIR"/test_output.txt + +# Check the output +cmp "$TEST_DATA_DIR"/test_output.txt tests/test_expected_output.txt + +rm -rf "$TEST_DATA_DIR" \ No newline at end of file diff --git a/tests/test_expected_output.txt b/tests/test_expected_output.txt new file mode 100644 index 0000000..efcc73f --- /dev/null +++ b/tests/test_expected_output.txt @@ -0,0 +1,3 @@ +sample matching_variants wt_variants variants_in_scheme variants_missing variants_multiple_cov +GCA_034818825.1_PDT001921168.1_genomic 27 0 27 0 0 +GCA_034822885.1_PDT001920952.1_genomic 0 27 27 0 0 From 830f9c4a2093de60f5b25d7f23a806e5e7455c1b Mon Sep 17 00:00:00 2001 From: boasvdp Date: Fri, 26 Apr 2024 10:37:16 +0200 Subject: [PATCH 3/8] ci: add action for e2e test --- .github/workflows/test.yaml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 .github/workflows/test.yaml diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml new file mode 100644 index 0000000..36ec891 --- /dev/null +++ b/.github/workflows/test.yaml @@ -0,0 +1,27 @@ +name: assembly_snptyper test + +on: [push, pull_request] + +jobs: + build: + runs-on: ${{ matrix.config.os }} + strategy: + fail-fast: false + matrix: + config: + - {os: ubuntu-latest} + name: Testing assembly_snptyper ${{ matrix.config.os }} + + steps: + - uses: actions/checkout@v4 + - name: Install Conda environment with Micromamba + uses: mamba-org/provision-with-micromamba@main + with: + cache-downloads: true + environment-file: env.yaml + - name: Conda list + shell: bash -l {0} + run: conda list + - name: Test M1UK typing end-to-end + shell: bash -l {0} + run: bash tests/test_e2e.sh \ No newline at end of file From 17b6ccac1a1cd6b1ceee57493f181f8f14c435b2 Mon Sep 17 00:00:00 2001 From: boasvdp Date: Fri, 26 Apr 2024 10:40:42 +0200 Subject: [PATCH 4/8] docs: add example to README --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index 90e9754..e1db9ca 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,18 @@ assembly_snptyper --version Note that: - the reference VCF must be based on the reference genome (i.e. identical chromosome names). - the list of input files must be a file list of paths to assemblies. This is to accomodate running the script on large numbers of files +- the reference fasta must be unzipped + +### Example + +Running `assembly_snptyper` for *S. pyogenes* M1UK, with paths to genome assemblies listed in `fastas.txt`: + +``` +assembly_snptyper --vcf data/M1UK.vcf --reference data/MGAS5005.fa --list_input fastas.txt -p 4 --verbose > output.txt +``` + + +## Help message ``` assembly_snptyper --help From 3d3a2c756d0ed29bc94c749b94a3319d68221226 Mon Sep 17 00:00:00 2001 From: boasvdp Date: Fri, 26 Apr 2024 10:41:25 +0200 Subject: [PATCH 5/8] style: black formatting --- assembly_snptyper/main.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/assembly_snptyper/main.py b/assembly_snptyper/main.py index afbaf65..5439f5d 100644 --- a/assembly_snptyper/main.py +++ b/assembly_snptyper/main.py @@ -63,6 +63,7 @@ def check_if_ref_is_ascii(reference): "Reference genome cannot be read as flat text: only unzipped FASTA reference genomes are supported." ) + def convert_vcf_to_bed(vcf, bed_path): """ Convert reference VCF to BED file @@ -264,7 +265,10 @@ def wrapper(args_dict): """ sample = Path(args_dict["input_asm"]).stem result = run_oneliner( - args_dict["bed_path"], args_dict["reference"], args_dict["input_asm"], args_dict["minimap_preset"] + args_dict["bed_path"], + args_dict["reference"], + args_dict["input_asm"], + args_dict["minimap_preset"], ) output = parse_mpileup_output(result, args_dict["vcf"], sample) logging.info(f"Processed {sample}") @@ -404,7 +408,9 @@ def main(): logging.info(f"Created temporary bed file: {bed_path}") logging.info("Starting typing workflow") - run_parallel(bed_path, args.reference, list_input, args.vcf, minimap_preset, args.processes) + run_parallel( + bed_path, args.reference, list_input, args.vcf, minimap_preset, args.processes + ) if __name__ == "__main__": From 3ac5625894838e5a12980a92f79f7a9fcd895dfb Mon Sep 17 00:00:00 2001 From: boasvdp Date: Fri, 26 Apr 2024 10:42:11 +0200 Subject: [PATCH 6/8] ci: fix ref path --- tests/test_e2e.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_e2e.sh b/tests/test_e2e.sh index dd0564c..602d3cf 100644 --- a/tests/test_e2e.sh +++ b/tests/test_e2e.sh @@ -25,7 +25,7 @@ python -m pip install . assembly_snptyper \ --list_input "$TEST_DATA_DIR"/test_genomes.txt \ --vcf data/M1UK.vcf \ - --reference "$TEST_DATA_DIR"/MGAS5005.fa \ + --reference data/MGAS5005.fa \ -p 2 > "$TEST_DATA_DIR"/test_output.txt # Check the output From 5559a8391b42bce5aa7c506ca7cf36781789998a Mon Sep 17 00:00:00 2001 From: boasvdp Date: Fri, 26 Apr 2024 10:43:52 +0200 Subject: [PATCH 7/8] ci: switch micromamba setup --- .github/workflows/test.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 36ec891..0992e96 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -15,9 +15,9 @@ jobs: steps: - uses: actions/checkout@v4 - name: Install Conda environment with Micromamba - uses: mamba-org/provision-with-micromamba@main + uses: mamba-org/setup-micromamba@v1 with: - cache-downloads: true + cache-environment: true environment-file: env.yaml - name: Conda list shell: bash -l {0} From 0be0408f4df041a81aa76cd23d494bff2b66638c Mon Sep 17 00:00:00 2001 From: boasvdp Date: Fri, 26 Apr 2024 10:45:50 +0200 Subject: [PATCH 8/8] test: sort output before comparing --- tests/test_e2e.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_e2e.sh b/tests/test_e2e.sh index 602d3cf..4aa80c0 100644 --- a/tests/test_e2e.sh +++ b/tests/test_e2e.sh @@ -29,6 +29,6 @@ assembly_snptyper \ -p 2 > "$TEST_DATA_DIR"/test_output.txt # Check the output -cmp "$TEST_DATA_DIR"/test_output.txt tests/test_expected_output.txt +cmp <(sort "$TEST_DATA_DIR"/test_output.txt) <(sort tests/test_expected_output.txt) rm -rf "$TEST_DATA_DIR" \ No newline at end of file