Merge pull request #16 from martinghunt/add_sample_name_option

Add sample name option
iqbal-lab-org · Oct 15, 2021 · c2fa4a8 · c2fa4a8
2 parents 7607dbb + 9281047
commit c2fa4a8
Show file tree

Hide file tree

Showing 11 changed files with 100 additions and 13 deletions.
diff --git a/README.md b/README.md
@@ -34,6 +34,9 @@ The FASTA and JSON files in those commands can be found in the `data/`
 directory of this repository.
 
 Other options:
+* `--sample_name MY_NAME`: use this to change the sample name
+  (default is "sample") that is put in the final FASTA file, BAM file, and
+  VCF file.
 * `--keep_bam`: use this option to keep the BAM file of original input reads
   mapped to the reference genome.
 * `--force`: use with caution - it will overwrite the output directory if

diff --git a/tests/data/utils/set_sample_name_in_vcf_file.expect.vcf b/tests/data/utils/set_sample_name_in_vcf_file.expect.vcf
@@ -0,0 +1,6 @@
+##fileformat=VCFv4.2
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Read depth">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NEW_NAME
+ref.1	1	.	A	G	61	.	DP=42	GT	1/1
+ref.1	5	.	C	T	62	.	DP=43	GT	0/0
diff --git a/tests/data/utils/set_sample_name_in_vcf_file.in.vcf b/tests/data/utils/set_sample_name_in_vcf_file.in.vcf
@@ -0,0 +1,6 @@
+##fileformat=VCFv4.2
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Read depth">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	OLD_NAME
+ref.1	1	.	A	G	61	.	DP=42	GT	1/1
+ref.1	5	.	C	T	62	.	DP=43	GT	0/0
diff --git a/tests/data/utils/set_seq_name_in_fasta_file.expect.fasta b/tests/data/utils/set_seq_name_in_fasta_file.expect.fasta
@@ -0,0 +1,2 @@
+>NEW_NAME
+ACGT
diff --git a/tests/data/utils/set_seq_name_in_fasta_file.in.fasta b/tests/data/utils/set_seq_name_in_fasta_file.in.fasta
@@ -0,0 +1,2 @@
+>OLD_NAME
+ACGT
diff --git a/tests/utils_test.py b/tests/utils_test.py
@@ -44,3 +44,23 @@ def test_load_amplicons_bed_file():
     ]
     infile = os.path.join(data_dir, "load_amplicons_bed_file.bed")
     assert expect == utils.load_amplicons_bed_file(infile)
+
+
+def test_set_sample_name_in_vcf_file():
+    infile = os.path.join(data_dir, "set_sample_name_in_vcf_file.in.vcf")
+    tmp_out = "tmp.set_sample_name_in_vcf_file.vcf"
+    subprocess.check_output(f"rm -f {tmp_out}", shell=True)
+    utils.set_sample_name_in_vcf_file(infile, tmp_out, "NEW_NAME")
+    expect = os.path.join(data_dir, "set_sample_name_in_vcf_file.expect.vcf")
+    assert filecmp.cmp(tmp_out, expect, shallow=False)
+    os.unlink(tmp_out)
+
+
+def test_set_seq_name_in_fasta_file():
+    infile = os.path.join(data_dir, "set_seq_name_in_fasta_file.in.fasta")
+    tmp_out = "tmp.set_seq_name_in_fasta_file.fasta"
+    subprocess.check_output(f"rm -f {tmp_out}", shell=True)
+    utils.set_seq_name_in_fasta_file(infile, tmp_out, "NEW_NAME")
+    expect = os.path.join(data_dir, "set_seq_name_in_fasta_file.expect.fasta")
+    assert filecmp.cmp(tmp_out, expect, shallow=False)
+    os.unlink(tmp_out)
diff --git a/viridian_workflow/__main__.py b/viridian_workflow/__main__.py
@@ -102,6 +102,12 @@ def main(args=None):
         help="Target coverage for amplicon depth normalisation [%(default)s]",
         metavar="INT",
     )
+    subparser_run_one_sample.add_argument(
+        "--sample_name",
+        default="sample",
+        help="Name of sample to put in header of final FASTA, VCF, and BAM files [%(default)s]",
+        metavar="STRING",
+    )
 
     subparser_run_one_sample.set_defaults(
         func=viridian_workflow.tasks.run_one_sample.run

diff --git a/viridian_workflow/minimap.py b/viridian_workflow/minimap.py
@@ -5,17 +5,21 @@
 from viridian_workflow.utils import run_process, check_file
 
 
-def run_se(outdir, ref_genome, fq, prefix=None, threads=1):
+def run_se(outdir, ref_genome, fq, prefix=None, threads=1, sample_name="sample"):
     """map single fastq ONT reads with minimap2
     """
     bam = os.path.join(outdir, "reference_mapped.bam")
     if prefix:
         bam = os.path.join(outdir, f"{prefix}-reference_mapped.bam")
 
-    minimap_cmd = ["minimap2", "-t", str(threads), "-ax", "map-ont", ref_genome, fq]
+    # Note: was getting issues with escaping the tab characters in the -R
+    # option. In the end got it to work by giving Popen a string instead of
+    # a list and setting shell=True (using a list and shell=False failed using
+    # exactly the same minimap_cmd contents).
+    minimap_cmd = ["minimap2", f"-R '@RG\\tID:1\\tSM:{sample_name}'", "-t", str(threads), "-ax", "map-ont", ref_genome, fq]
     sort_cmd = ["samtools", "sort", "-o", bam]
 
-    map_proc = subprocess.Popen(minimap_cmd, stdout=subprocess.PIPE)
+    map_proc = subprocess.Popen(" ".join(minimap_cmd), shell=True, stdout=subprocess.PIPE)
     sort_proc = subprocess.Popen(sort_cmd, stdin=map_proc.stdout)
     sort_proc.wait()
     check_file(bam)
@@ -25,15 +29,15 @@ def run_se(outdir, ref_genome, fq, prefix=None, threads=1):
     return bam
 
 
-def run(outdir, ref_genome, fq1, fq2, prefix=None, threads=1):
+def run(outdir, ref_genome, fq1, fq2, prefix=None, threads=1, sample_name="sample"):
     bam = os.path.join(outdir, "reference_mapped.bam")
     if prefix:
         bam = os.path.join(outdir, f"{prefix}-reference_mapped.bam")
 
-    minimap_cmd = ["minimap2", "-t", str(threads), "-ax", "sr", ref_genome, fq1, fq2]
+    minimap_cmd = ["minimap2", f"-R '@RG\\tID:1\\tSM:{sample_name}'", "-t", str(threads), "-ax", "sr", ref_genome, fq1, fq2]
     sort_cmd = ["samtools", "sort", "-o", bam]
 
-    map_proc = subprocess.Popen(minimap_cmd, stdout=subprocess.PIPE)
+    map_proc = subprocess.Popen(" ".join(minimap_cmd), shell=True, stdout=subprocess.PIPE)
     sort_proc = subprocess.Popen(sort_cmd, stdin=map_proc.stdout)
     sort_proc.wait()
     check_file(bam)

diff --git a/viridian_workflow/one_sample_pipeline.py b/viridian_workflow/one_sample_pipeline.py
@@ -7,6 +7,8 @@
     check_file,
     run_process,
     rm,
+    set_sample_name_in_vcf_file,
+    set_seq_name_in_fasta_file,
 )
 
 
@@ -49,6 +51,7 @@ def run_one_sample(
     keep_intermediate=False,
     keep_bam=False,
     target_sample_depth=1000,
+    sample_name="sample",
 ):
     check_tech(tech)
     if tech == "ont":
@@ -66,9 +69,11 @@ def run_one_sample(
         amplicon_json, amplicon_bed
     )
     if paired:
-        all_reads_bam = minimap.run(outdir, ref_genome, fq1, fq2)
+        all_reads_bam = minimap.run(
+            outdir, ref_genome, fq1, fq2, sample_name=sample_name
+        )
     else:
-        all_reads_bam = minimap.run_se(outdir, ref_genome, fq1)
+        all_reads_bam = minimap.run_se(outdir, ref_genome, fq1, sample_name=sample_name)
     sample_outprefix = os.path.join(outdir, "sample")
     sampler = sample_reads.sample_reads(
         ref_genome,
@@ -91,21 +96,33 @@ def run_one_sample(
     varifier_out = os.path.join(outdir, "varifier")
     if paired:
         self_map = minimap.run(
-            outdir, assembly, sampler.fq_out1, sampler.fq_out2, prefix="self_qc"
+            outdir,
+            assembly,
+            sampler.fq_out1,
+            sampler.fq_out2,
+            prefix="self_qc",
+            sample_name=sample_name,
         )
     else:
-        self_map = minimap.run_se(outdir, assembly, sampler.fq_out, prefix="self_qc")
+        self_map = minimap.run_se(
+            outdir, assembly, sampler.fq_out, prefix="self_qc", sample_name=sample_name
+        )
 
     masked_fasta = qcovid.self_qc(outdir, assembly, self_map)
+    final_masked_fasta = os.path.join(outdir, "consensus.masked.fa")
+    set_seq_name_in_fasta_file(masked_fasta, final_masked_fasta, sample_name)
 
-    vcf = varifier.run(
+    varifier_vcf = varifier.run(
         varifier_out,
         ref_genome,
-        masked_fasta,
+        final_masked_fasta,
         min_coord=amplicons_start,
         max_coord=amplicons_end,
     )
-    check_file(vcf)
+    check_file(varifier_vcf)
+    final_vcf = os.path.join(outdir, "variants.vcf")
+    set_sample_name_in_vcf_file(varifier_vcf, final_vcf, sample_name)
+    check_file(final_vcf)
 
     # clean up intermediate files
     if not keep_intermediate:

diff --git a/viridian_workflow/tasks/run_one_sample.py b/viridian_workflow/tasks/run_one_sample.py
@@ -23,4 +23,5 @@ def run(options):
         keep_intermediate=options.debug,
         keep_bam=options.keep_bam,
         target_sample_depth=options.target_sample_depth,
+        sample_name=options.sample_name,
     )
diff --git a/viridian_workflow/utils.py b/viridian_workflow/utils.py
@@ -96,3 +96,23 @@ def load_single_seq_fasta(infile):
     ref = list(d.values())[0]
     ref.id = ref.id.split()[0]
     return ref
+
+
+def set_sample_name_in_vcf_file(infile, outfile, sample_name):
+    with open(infile) as f_in, open(outfile, "w") as f_out:
+        for line in f_in:
+            if line.startswith("#CHROM\tPOS"):
+                fields = line.rstrip().split("\t")
+                fields[-1] = sample_name
+                print(*fields, sep="\t", file=f_out)
+            else:
+                print(line, end="", file=f_out)
+
+
+def set_seq_name_in_fasta_file(infile, outfile, new_name):
+    """Changes name in FASTA file. Assumes that there is only one sequence
+    in the file, raises Exception if it finds >1 sequence"""
+    seq = load_single_seq_fasta(infile)
+    seq.id = new_name
+    with open(outfile, "w") as f:
+        print(seq, file=f)