From e069532eb65691e7b9ccdfeef668c832a926c784 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 14 Oct 2021 16:43:51 +0100 Subject: [PATCH 1/4] Add sample_name option and put it in BAM files RG header line --- viridian_workflow/__main__.py | 6 ++++++ viridian_workflow/minimap.py | 16 ++++++++++------ viridian_workflow/one_sample_pipeline.py | 9 +++++---- viridian_workflow/tasks/run_one_sample.py | 1 + 4 files changed, 22 insertions(+), 10 deletions(-) diff --git a/viridian_workflow/__main__.py b/viridian_workflow/__main__.py index 6cef9e7..b7afb54 100644 --- a/viridian_workflow/__main__.py +++ b/viridian_workflow/__main__.py @@ -102,6 +102,12 @@ def main(args=None): help="Target coverage for amplicon depth normalisation [%(default)s]", metavar="INT", ) + subparser_run_one_sample.add_argument( + "--sample_name", + default="sample", + help="Name of sample to put in header of final FASTA, VCF, and BAM files [%(default)s]", + metavar="STRING", + ) subparser_run_one_sample.set_defaults( func=viridian_workflow.tasks.run_one_sample.run diff --git a/viridian_workflow/minimap.py b/viridian_workflow/minimap.py index 3244466..f5419c2 100644 --- a/viridian_workflow/minimap.py +++ b/viridian_workflow/minimap.py @@ -5,17 +5,21 @@ from viridian_workflow.utils import run_process, check_file -def run_se(outdir, ref_genome, fq, prefix=None, threads=1): +def run_se(outdir, ref_genome, fq, prefix=None, threads=1, sample_name="sample"): """map single fastq ONT reads with minimap2 """ bam = os.path.join(outdir, "reference_mapped.bam") if prefix: bam = os.path.join(outdir, f"{prefix}-reference_mapped.bam") - minimap_cmd = ["minimap2", "-t", str(threads), "-ax", "map-ont", ref_genome, fq] + # Note: was getting issues with escaping the tab characters in the -R + # option. In the end got it to work by giving Popen a string instead of + # a list and setting shell=True (using a list and shell=False failed using + # exactly the same minimap_cmd contents). + minimap_cmd = ["minimap2", f"-R '@RG\\tID:1\\tSM:{sample_name}'", "-t", str(threads), "-ax", "map-ont", ref_genome, fq] sort_cmd = ["samtools", "sort", "-o", bam] - map_proc = subprocess.Popen(minimap_cmd, stdout=subprocess.PIPE) + map_proc = subprocess.Popen(" ".join(minimap_cmd), shell=True, stdout=subprocess.PIPE) sort_proc = subprocess.Popen(sort_cmd, stdin=map_proc.stdout) sort_proc.wait() check_file(bam) @@ -25,15 +29,15 @@ def run_se(outdir, ref_genome, fq, prefix=None, threads=1): return bam -def run(outdir, ref_genome, fq1, fq2, prefix=None, threads=1): +def run(outdir, ref_genome, fq1, fq2, prefix=None, threads=1, sample_name="sample"): bam = os.path.join(outdir, "reference_mapped.bam") if prefix: bam = os.path.join(outdir, f"{prefix}-reference_mapped.bam") - minimap_cmd = ["minimap2", "-t", str(threads), "-ax", "sr", ref_genome, fq1, fq2] + minimap_cmd = ["minimap2", f"-R '@RG\\tID:1\\tSM:{sample_name}'", "-t", str(threads), "-ax", "sr", ref_genome, fq1, fq2] sort_cmd = ["samtools", "sort", "-o", bam] - map_proc = subprocess.Popen(minimap_cmd, stdout=subprocess.PIPE) + map_proc = subprocess.Popen(" ".join(minimap_cmd), shell=True, stdout=subprocess.PIPE) sort_proc = subprocess.Popen(sort_cmd, stdin=map_proc.stdout) sort_proc.wait() check_file(bam) diff --git a/viridian_workflow/one_sample_pipeline.py b/viridian_workflow/one_sample_pipeline.py index 3a8537e..0d5d973 100644 --- a/viridian_workflow/one_sample_pipeline.py +++ b/viridian_workflow/one_sample_pipeline.py @@ -49,6 +49,7 @@ def run_one_sample( keep_intermediate=False, keep_bam=False, target_sample_depth=1000, + sample_name="sample", ): check_tech(tech) if tech == "ont": @@ -66,9 +67,9 @@ def run_one_sample( amplicon_json, amplicon_bed ) if paired: - all_reads_bam = minimap.run(outdir, ref_genome, fq1, fq2) + all_reads_bam = minimap.run(outdir, ref_genome, fq1, fq2, sample_name=sample_name) else: - all_reads_bam = minimap.run_se(outdir, ref_genome, fq1) + all_reads_bam = minimap.run_se(outdir, ref_genome, fq1, sample_name=sample_name) sample_outprefix = os.path.join(outdir, "sample") sampler = sample_reads.sample_reads( ref_genome, @@ -91,10 +92,10 @@ def run_one_sample( varifier_out = os.path.join(outdir, "varifier") if paired: self_map = minimap.run( - outdir, assembly, sampler.fq_out1, sampler.fq_out2, prefix="self_qc" + outdir, assembly, sampler.fq_out1, sampler.fq_out2, prefix="self_qc", sample_name=sample_name ) else: - self_map = minimap.run_se(outdir, assembly, sampler.fq_out, prefix="self_qc") + self_map = minimap.run_se(outdir, assembly, sampler.fq_out, prefix="self_qc", sample_name=sample_name) masked_fasta = qcovid.self_qc(outdir, assembly, self_map) diff --git a/viridian_workflow/tasks/run_one_sample.py b/viridian_workflow/tasks/run_one_sample.py index 87eb388..cdf8004 100644 --- a/viridian_workflow/tasks/run_one_sample.py +++ b/viridian_workflow/tasks/run_one_sample.py @@ -23,4 +23,5 @@ def run(options): keep_intermediate=options.debug, keep_bam=options.keep_bam, target_sample_depth=options.target_sample_depth, + sample_name=options.sample_name, ) From 157f055e0096d5a79e5c93de0f5af93a4e64d322 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 14 Oct 2021 16:55:07 +0100 Subject: [PATCH 2/4] Put sample name in final VCF file --- .../set_sample_name_in_vcf_file.expect.vcf | 6 +++++ .../utils/set_sample_name_in_vcf_file.in.vcf | 6 +++++ tests/utils_test.py | 10 ++++++++ viridian_workflow/one_sample_pipeline.py | 23 +++++++++++++++---- viridian_workflow/utils.py | 11 +++++++++ 5 files changed, 51 insertions(+), 5 deletions(-) create mode 100644 tests/data/utils/set_sample_name_in_vcf_file.expect.vcf create mode 100644 tests/data/utils/set_sample_name_in_vcf_file.in.vcf diff --git a/tests/data/utils/set_sample_name_in_vcf_file.expect.vcf b/tests/data/utils/set_sample_name_in_vcf_file.expect.vcf new file mode 100644 index 0000000..18bac5c --- /dev/null +++ b/tests/data/utils/set_sample_name_in_vcf_file.expect.vcf @@ -0,0 +1,6 @@ +##fileformat=VCFv4.2 +##INFO= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NEW_NAME +ref.1 1 . A G 61 . DP=42 GT 1/1 +ref.1 5 . C T 62 . DP=43 GT 0/0 diff --git a/tests/data/utils/set_sample_name_in_vcf_file.in.vcf b/tests/data/utils/set_sample_name_in_vcf_file.in.vcf new file mode 100644 index 0000000..0f90736 --- /dev/null +++ b/tests/data/utils/set_sample_name_in_vcf_file.in.vcf @@ -0,0 +1,6 @@ +##fileformat=VCFv4.2 +##INFO= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT OLD_NAME +ref.1 1 . A G 61 . DP=42 GT 1/1 +ref.1 5 . C T 62 . DP=43 GT 0/0 diff --git a/tests/utils_test.py b/tests/utils_test.py index eb056bd..37db52b 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -44,3 +44,13 @@ def test_load_amplicons_bed_file(): ] infile = os.path.join(data_dir, "load_amplicons_bed_file.bed") assert expect == utils.load_amplicons_bed_file(infile) + + +def test_set_sample_name_in_vcf_file(): + infile = os.path.join(data_dir, "set_sample_name_in_vcf_file.in.vcf") + tmp_out = "tmp.set_sample_name_in_vcf_file.vcf" + subprocess.check_output(f"rm -f {tmp_out}", shell=True) + utils.set_sample_name_in_vcf_file(infile, tmp_out, "NEW_NAME") + expect = os.path.join(data_dir, "set_sample_name_in_vcf_file.expect.vcf") + assert filecmp.cmp(tmp_out, expect, shallow=False) + os.unlink(tmp_out) diff --git a/viridian_workflow/one_sample_pipeline.py b/viridian_workflow/one_sample_pipeline.py index 0d5d973..f727458 100644 --- a/viridian_workflow/one_sample_pipeline.py +++ b/viridian_workflow/one_sample_pipeline.py @@ -7,6 +7,7 @@ check_file, run_process, rm, + set_sample_name_in_vcf_file, ) @@ -67,7 +68,9 @@ def run_one_sample( amplicon_json, amplicon_bed ) if paired: - all_reads_bam = minimap.run(outdir, ref_genome, fq1, fq2, sample_name=sample_name) + all_reads_bam = minimap.run( + outdir, ref_genome, fq1, fq2, sample_name=sample_name + ) else: all_reads_bam = minimap.run_se(outdir, ref_genome, fq1, sample_name=sample_name) sample_outprefix = os.path.join(outdir, "sample") @@ -92,21 +95,31 @@ def run_one_sample( varifier_out = os.path.join(outdir, "varifier") if paired: self_map = minimap.run( - outdir, assembly, sampler.fq_out1, sampler.fq_out2, prefix="self_qc", sample_name=sample_name + outdir, + assembly, + sampler.fq_out1, + sampler.fq_out2, + prefix="self_qc", + sample_name=sample_name, ) else: - self_map = minimap.run_se(outdir, assembly, sampler.fq_out, prefix="self_qc", sample_name=sample_name) + self_map = minimap.run_se( + outdir, assembly, sampler.fq_out, prefix="self_qc", sample_name=sample_name + ) masked_fasta = qcovid.self_qc(outdir, assembly, self_map) - vcf = varifier.run( + varifier_vcf = varifier.run( varifier_out, ref_genome, masked_fasta, min_coord=amplicons_start, max_coord=amplicons_end, ) - check_file(vcf) + check_file(varifier_vcf) + final_vcf = os.path.join(outdir, "variants.vcf") + set_sample_name_in_vcf_file(varifier_vcf, final_vcf, sample_name) + check_file(final_vcf) # clean up intermediate files if not keep_intermediate: diff --git a/viridian_workflow/utils.py b/viridian_workflow/utils.py index c7dbf2e..ec6e5b4 100644 --- a/viridian_workflow/utils.py +++ b/viridian_workflow/utils.py @@ -96,3 +96,14 @@ def load_single_seq_fasta(infile): ref = list(d.values())[0] ref.id = ref.id.split()[0] return ref + + +def set_sample_name_in_vcf_file(infile, outfile, sample_name): + with open(infile) as f_in, open(outfile, "w") as f_out: + for line in f_in: + if line.startswith("#CHROM\tPOS"): + fields = line.rstrip().split("\t") + fields[-1] = sample_name + print(*fields, sep="\t", file=f_out) + else: + print(line, end="", file=f_out) From 2bb8a7a77c5fdbaf078cc68da48abbaae3848be1 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 14 Oct 2021 17:08:07 +0100 Subject: [PATCH 3/4] Put sample name in final FASTA file --- .../data/utils/set_seq_name_in_fasta_file.expect.fasta | 2 ++ tests/data/utils/set_seq_name_in_fasta_file.in.fasta | 2 ++ tests/utils_test.py | 10 ++++++++++ viridian_workflow/one_sample_pipeline.py | 5 ++++- viridian_workflow/utils.py | 9 +++++++++ 5 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 tests/data/utils/set_seq_name_in_fasta_file.expect.fasta create mode 100644 tests/data/utils/set_seq_name_in_fasta_file.in.fasta diff --git a/tests/data/utils/set_seq_name_in_fasta_file.expect.fasta b/tests/data/utils/set_seq_name_in_fasta_file.expect.fasta new file mode 100644 index 0000000..8e42d3a --- /dev/null +++ b/tests/data/utils/set_seq_name_in_fasta_file.expect.fasta @@ -0,0 +1,2 @@ +>NEW_NAME +ACGT diff --git a/tests/data/utils/set_seq_name_in_fasta_file.in.fasta b/tests/data/utils/set_seq_name_in_fasta_file.in.fasta new file mode 100644 index 0000000..7376c7c --- /dev/null +++ b/tests/data/utils/set_seq_name_in_fasta_file.in.fasta @@ -0,0 +1,2 @@ +>OLD_NAME +ACGT diff --git a/tests/utils_test.py b/tests/utils_test.py index 37db52b..6b48d4c 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -54,3 +54,13 @@ def test_set_sample_name_in_vcf_file(): expect = os.path.join(data_dir, "set_sample_name_in_vcf_file.expect.vcf") assert filecmp.cmp(tmp_out, expect, shallow=False) os.unlink(tmp_out) + + +def test_set_seq_name_in_fasta_file(): + infile = os.path.join(data_dir, "set_seq_name_in_fasta_file.in.fasta") + tmp_out = "tmp.set_seq_name_in_fasta_file.fasta" + subprocess.check_output(f"rm -f {tmp_out}", shell=True) + utils.set_seq_name_in_fasta_file(infile, tmp_out, "NEW_NAME") + expect = os.path.join(data_dir, "set_seq_name_in_fasta_file.expect.fasta") + assert filecmp.cmp(tmp_out, expect, shallow=False) + os.unlink(tmp_out) diff --git a/viridian_workflow/one_sample_pipeline.py b/viridian_workflow/one_sample_pipeline.py index f727458..fcdec8f 100644 --- a/viridian_workflow/one_sample_pipeline.py +++ b/viridian_workflow/one_sample_pipeline.py @@ -8,6 +8,7 @@ run_process, rm, set_sample_name_in_vcf_file, + set_seq_name_in_fasta_file, ) @@ -108,11 +109,13 @@ def run_one_sample( ) masked_fasta = qcovid.self_qc(outdir, assembly, self_map) + final_masked_fasta = os.path.join(outdir, "consensus.masked.fa") + set_seq_name_in_fasta_file(masked_fasta, final_masked_fasta, sample_name) varifier_vcf = varifier.run( varifier_out, ref_genome, - masked_fasta, + final_masked_fasta, min_coord=amplicons_start, max_coord=amplicons_end, ) diff --git a/viridian_workflow/utils.py b/viridian_workflow/utils.py index ec6e5b4..f411034 100644 --- a/viridian_workflow/utils.py +++ b/viridian_workflow/utils.py @@ -107,3 +107,12 @@ def set_sample_name_in_vcf_file(infile, outfile, sample_name): print(*fields, sep="\t", file=f_out) else: print(line, end="", file=f_out) + + +def set_seq_name_in_fasta_file(infile, outfile, new_name): + """Changes name in FASTA file. Assumes that there is only one sequence + in the file, raises Exception if it finds >1 sequence""" + seq = load_single_seq_fasta(infile) + seq.id = new_name + with open(outfile, "w") as f: + print(seq, file=f) From 92810472bdeb6d27f4345abf5d253cc1978cd6bf Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 14 Oct 2021 17:11:44 +0100 Subject: [PATCH 4/4] Add description of option --sample_name --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 1fefc51..c8b06e5 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,9 @@ The FASTA and JSON files in those commands can be found in the `data/` directory of this repository. Other options: +* `--sample_name MY_NAME`: use this to change the sample name + (default is "sample") that is put in the final FASTA file, BAM file, and + VCF file. * `--keep_bam`: use this option to keep the BAM file of original input reads mapped to the reference genome. * `--force`: use with caution - it will overwrite the output directory if