diff --git a/tests/utils_test.py b/tests/utils_test.py index d4a3060..eb056bd 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -24,13 +24,15 @@ def test_load_single_seq_fasta(): utils.load_single_seq_fasta(infile) -def test_amplicons_json_to_bed(): +def test_amplicons_json_to_bed_and_range(): json_in = os.path.join(data_dir, "amplicons_json_to_bed.json") expect_bed = os.path.join(data_dir, "amplicons_json_to_bed.bed") tmp_out = "tmp.amplicons_json_to_bed.bed" subprocess.check_output(f"rm -f {tmp_out}", shell=True) - utils.amplicons_json_to_bed(json_in, tmp_out) + got_start, got_end = utils.amplicons_json_to_bed_and_range(json_in, tmp_out) assert filecmp.cmp(tmp_out, expect_bed, shallow=False) + assert got_start == 100 + assert got_end == 250 os.unlink(tmp_out) diff --git a/viridian_workflow/one_sample_pipeline.py b/viridian_workflow/one_sample_pipeline.py index 4fde38e..3a8537e 100644 --- a/viridian_workflow/one_sample_pipeline.py +++ b/viridian_workflow/one_sample_pipeline.py @@ -2,7 +2,12 @@ import os from viridian_workflow import minimap, qcovid, sample_reads, varifier -from viridian_workflow.utils import amplicons_json_to_bed, check_file, run_process, rm +from viridian_workflow.utils import ( + amplicons_json_to_bed_and_range, + check_file, + run_process, + rm, +) def check_tech(tech): @@ -57,7 +62,9 @@ def run_one_sample( os.mkdir(outdir) amplicon_bed = os.path.join(outdir, "amplicons.bed") - amplicons_json_to_bed(amplicon_json, amplicon_bed) + amplicons_start, amplicons_end = amplicons_json_to_bed_and_range( + amplicon_json, amplicon_bed + ) if paired: all_reads_bam = minimap.run(outdir, ref_genome, fq1, fq2) else: @@ -91,7 +98,13 @@ def run_one_sample( masked_fasta = qcovid.self_qc(outdir, assembly, self_map) - vcf = varifier.run(varifier_out, ref_genome, masked_fasta) + vcf = varifier.run( + varifier_out, + ref_genome, + masked_fasta, + min_coord=amplicons_start, + max_coord=amplicons_end, + ) check_file(vcf) # clean up intermediate files diff --git a/viridian_workflow/utils.py b/viridian_workflow/utils.py index afbd1f8..c7dbf2e 100644 --- a/viridian_workflow/utils.py +++ b/viridian_workflow/utils.py @@ -49,17 +49,27 @@ def run_process(cmd, ignore_error=False, stdout=None): logging.error(result.stderr) -def amplicons_json_to_bed(infile, outfile): +def amplicons_json_to_bed_and_range(infile, outfile): + """Converts the amplicons JSON file to a BED file, which is used in + various stages of the pipeline. Returns the 0-based inclusive coordinates + of the start of the first amplicon and end of the last amplicon, as + a tuple (start, end)""" with open(infile) as f: data = json.load(f) + start = float("inf") + end = -1 + data_out = [] for amplicon, d in data["amplicons"].items(): data_out.append((amplicon, d["start"], d["end"] + 1)) + start = min(start, d["start"]) + end = max(end, d["end"]) data_out.sort(key=itemgetter(1)) with open(outfile, "w") as f: for t in data_out: print(*t, sep="\t", file=f) + return start, end def load_amplicons_bed_file(infile): diff --git a/viridian_workflow/varifier.py b/viridian_workflow/varifier.py index a3a25dc..2017dd0 100644 --- a/viridian_workflow/varifier.py +++ b/viridian_workflow/varifier.py @@ -6,8 +6,14 @@ from viridian_workflow.utils import run_process, check_file -def run(outdir, ref_genome, assembly): +def run(outdir, ref_genome, assembly, min_coord=None, max_coord=None): vcf = os.path.join(outdir, "04.truth.vcf") - run_process(f"varifier make_truth_vcf {assembly} {ref_genome} {outdir}") + options = ["--global_align"] + if min_coord is not None: + options.append(f"--global_align_min_coord {min_coord + 1}") + if max_coord is not None: + options.append(f"--global_align_max_coord {max_coord + 1}") + options = " ".join(options) + run_process(f"varifier make_truth_vcf {options} {assembly} {ref_genome} {outdir}") check_file(vcf) return vcf