boasvdp · boasvdp · Apr 26, 2024 · Apr 26, 2024 · Apr 26, 2024 · Apr 26, 2024
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
@@ -0,0 +1,27 @@
+# Runs release-please on every push to main (or on manual dispatch) to
+# maintain the release pull request and publish tagged releases.
+name: Build and release
+on:
+  push:
+    branches:
+      - main
+  workflow_dispatch:
+
+jobs:
+  bump-version:
+    name: Release version
+    runs-on: ubuntu-latest
+    # release-please opens/updates a release PR and creates tags and GitHub
+    # releases, so the job token needs write access to both scopes.
+    permissions:
+      contents: write
+      pull-requests: write
+
+    steps:
+      # NOTE(review): newer major versions of this action appear to be published
+      # as googleapis/release-please-action - confirm before upgrading past v3.
+      - uses: GoogleCloudPlatform/release-please-action@v3
+        id: release
+        with:
+          release-type: python
+          package-name: assembly_snptyper
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
@@ -0,0 +1,35 @@
+# CI: build the conda environment and run the end-to-end typing test.
+name: assembly_snptyper test
+
+on:
+  - push
+  - pull_request
+
+jobs:
+  build:
+    runs-on: ${{ matrix.config.os }}
+    strategy:
+      # Let every matrix entry finish even if another one fails
+      fail-fast: false
+      matrix:
+        config:
+          - os: ubuntu-latest
+    name: Testing assembly_snptyper ${{ matrix.config.os }}
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install Conda environment with Micromamba
+        uses: mamba-org/setup-micromamba@v1
+        with:
+          cache-environment: true
+          environment-file: env.yaml
+      # Login shells (bash -l) pick up the micromamba shell initialisation,
+      # so the steps below run inside the environment created above.
+      - name: Conda list
+        shell: bash -l {0}
+        # setup-micromamba provides `micromamba`; a `conda` executable only
+        # exists if env.yaml happens to install it.
+        run: micromamba list
+      - name: Test M1UK typing end-to-end
+        shell: bash -l {0}
+        run: bash tests/test_e2e.sh
diff --git a/README.md b/README.md
@@ -32,6 +32,18 @@ assembly_snptyper --version
 Note that: 
 - the reference VCF must be based on the reference genome (i.e. identical chromosome names).
 - the list of input files must be a file list of paths to assemblies. This is to accomodate running the script on large numbers of files
+- the reference FASTA must be unzipped (plain, uncompressed text)
+
+### Example
+
+Running `assembly_snptyper` for *S. pyogenes* M1UK, with paths to genome assemblies listed in `fastas.txt`:
+
+```
+assembly_snptyper --vcf data/M1UK.vcf --reference data/MGAS5005.fa --list_input fastas.txt -p 4 --verbose > output.txt
+```
+
+
+## Help message
 
 ```
 assembly_snptyper --help

diff --git a/assembly_snptyper/main.py b/assembly_snptyper/main.py
@@ -63,6 +63,7 @@ def check_if_ref_is_ascii(reference):
             "Reference genome cannot be read as flat text: only unzipped FASTA reference genomes are supported."
         )
 
+
 def convert_vcf_to_bed(vcf, bed_path):
     """
     Convert reference VCF to BED file
@@ -264,7 +265,10 @@ def wrapper(args_dict):
     """
     sample = Path(args_dict["input_asm"]).stem
     result = run_oneliner(
-        args_dict["bed_path"], args_dict["reference"], args_dict["input_asm"], args_dict["minimap_preset"]
+        args_dict["bed_path"],
+        args_dict["reference"],
+        args_dict["input_asm"],
+        args_dict["minimap_preset"],
     )
     output = parse_mpileup_output(result, args_dict["vcf"], sample)
     logging.info(f"Processed {sample}")
@@ -404,7 +408,9 @@ def main():
     logging.info(f"Created temporary bed file: {bed_path}")
 
     logging.info("Starting typing workflow")
-    run_parallel(bed_path, args.reference, list_input, args.vcf, minimap_preset, args.processes)
+    run_parallel(
+        bed_path, args.reference, list_input, args.vcf, minimap_preset, args.processes
+    )
 
 
 if __name__ == "__main__":

diff --git a/tests/test_e2e.sh b/tests/test_e2e.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+set -euxo pipefail
+
+TEST_DATA_DIR=tests/tmp
+
+# Download 
+mkdir -p "$TEST_DATA_DIR"
+## M1UK strain from Lynskey et al.
+curl --output "$TEST_DATA_DIR"/GCA_034818825.1.zip "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCA_034818825.1/download?include_annotation_type=GENOME_FASTA&hydrated=FULLY_HYDRATED"
+## M1global strain from Lynskey et al.
+curl --output "$TEST_DATA_DIR"/GCA_034822885.1.zip "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCA_034822885.1/download?include_annotation_type=GENOME_FASTA&hydrated=FULLY_HYDRATED"
+
+# Unzip
+unzip -n "$TEST_DATA_DIR"/GCA_034818825.1.zip -d "$TEST_DATA_DIR"
+unzip -n "$TEST_DATA_DIR"/GCA_034822885.1.zip -d "$TEST_DATA_DIR"
+
+# List genomes
+find "$TEST_DATA_DIR" -iname "*.fna" -print > "$TEST_DATA_DIR"/test_genomes.txt
+
+# Install the script
+python -m pip install .
+
+# Run the script
+assembly_snptyper \
+    --list_input "$TEST_DATA_DIR"/test_genomes.txt \
+    --vcf data/M1UK.vcf \
+    --reference data/MGAS5005.fa \
+    -p 2 > "$TEST_DATA_DIR"/test_output.txt
+
+# Check the output
+cmp <(sort "$TEST_DATA_DIR"/test_output.txt) <(sort tests/test_expected_output.txt)
+
+rm -rf "$TEST_DATA_DIR"
diff --git a/tests/test_expected_output.txt b/tests/test_expected_output.txt
@@ -0,0 +1,3 @@
+sample	matching_variants	wt_variants	variants_in_scheme	variants_missing	variants_multiple_cov
+GCA_034818825.1_PDT001921168.1_genomic	27	0	27	0	0
+GCA_034822885.1_PDT001920952.1_genomic	0	27	27	0	0