From e069532eb65691e7b9ccdfeef668c832a926c784 Mon Sep 17 00:00:00 2001
From: Martin Hunt <martin.g.hunt@gmail.com>
Date: Thu, 14 Oct 2021 16:43:51 +0100
Subject: [PATCH 1/4] Add sample_name option and put it in BAM files RG header
 line

---
 viridian_workflow/__main__.py             |  6 ++++++
 viridian_workflow/minimap.py              | 16 ++++++++++------
 viridian_workflow/one_sample_pipeline.py  |  9 +++++----
 viridian_workflow/tasks/run_one_sample.py |  1 +
 4 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/viridian_workflow/__main__.py b/viridian_workflow/__main__.py
index 6cef9e7..b7afb54 100644
--- a/viridian_workflow/__main__.py
+++ b/viridian_workflow/__main__.py
@@ -102,6 +102,12 @@ def main(args=None):
         help="Target coverage for amplicon depth normalisation [%(default)s]",
         metavar="INT",
     )
+    subparser_run_one_sample.add_argument(
+        "--sample_name",
+        default="sample",
+        help="Name of sample to put in header of final FASTA, VCF, and BAM files [%(default)s]",
+        metavar="STRING",
+    )
 
     subparser_run_one_sample.set_defaults(
         func=viridian_workflow.tasks.run_one_sample.run
diff --git a/viridian_workflow/minimap.py b/viridian_workflow/minimap.py
index 3244466..f5419c2 100644
--- a/viridian_workflow/minimap.py
+++ b/viridian_workflow/minimap.py
@@ -5,17 +5,21 @@
 from viridian_workflow.utils import run_process, check_file
 
 
-def run_se(outdir, ref_genome, fq, prefix=None, threads=1):
+def run_se(outdir, ref_genome, fq, prefix=None, threads=1, sample_name="sample"):
     """map single fastq ONT reads with minimap2
     """
     bam = os.path.join(outdir, "reference_mapped.bam")
     if prefix:
         bam = os.path.join(outdir, f"{prefix}-reference_mapped.bam")
 
-    minimap_cmd = ["minimap2", "-t", str(threads), "-ax", "map-ont", ref_genome, fq]
+    # Note: was getting issues with escaping the tab characters in the -R
+    # option. In the end got it to work by giving Popen a string instead of
+    # a list and setting shell=True (using a list and shell=False failed using
+    # exactly the same minimap_cmd contents).
+    minimap_cmd = ["minimap2", f"-R '@RG\\tID:1\\tSM:{sample_name}'", "-t", str(threads), "-ax", "map-ont", ref_genome, fq]
     sort_cmd = ["samtools", "sort", "-o", bam]
 
-    map_proc = subprocess.Popen(minimap_cmd, stdout=subprocess.PIPE)
+    map_proc = subprocess.Popen(" ".join(minimap_cmd), shell=True, stdout=subprocess.PIPE)
     sort_proc = subprocess.Popen(sort_cmd, stdin=map_proc.stdout)
     sort_proc.wait()
     check_file(bam)
@@ -25,15 +29,15 @@ def run_se(outdir, ref_genome, fq, prefix=None, threads=1):
     return bam
 
 
-def run(outdir, ref_genome, fq1, fq2, prefix=None, threads=1):
+def run(outdir, ref_genome, fq1, fq2, prefix=None, threads=1, sample_name="sample"):
     bam = os.path.join(outdir, "reference_mapped.bam")
     if prefix:
         bam = os.path.join(outdir, f"{prefix}-reference_mapped.bam")
 
-    minimap_cmd = ["minimap2", "-t", str(threads), "-ax", "sr", ref_genome, fq1, fq2]
+    minimap_cmd = ["minimap2", f"-R '@RG\\tID:1\\tSM:{sample_name}'", "-t", str(threads), "-ax", "sr", ref_genome, fq1, fq2]
     sort_cmd = ["samtools", "sort", "-o", bam]
 
-    map_proc = subprocess.Popen(minimap_cmd, stdout=subprocess.PIPE)
+    map_proc = subprocess.Popen(" ".join(minimap_cmd), shell=True, stdout=subprocess.PIPE)
     sort_proc = subprocess.Popen(sort_cmd, stdin=map_proc.stdout)
     sort_proc.wait()
     check_file(bam)
diff --git a/viridian_workflow/one_sample_pipeline.py b/viridian_workflow/one_sample_pipeline.py
index 3a8537e..0d5d973 100644
--- a/viridian_workflow/one_sample_pipeline.py
+++ b/viridian_workflow/one_sample_pipeline.py
@@ -49,6 +49,7 @@ def run_one_sample(
     keep_intermediate=False,
     keep_bam=False,
     target_sample_depth=1000,
+    sample_name="sample",
 ):
     check_tech(tech)
     if tech == "ont":
@@ -66,9 +67,9 @@ def run_one_sample(
         amplicon_json, amplicon_bed
     )
     if paired:
-        all_reads_bam = minimap.run(outdir, ref_genome, fq1, fq2)
+        all_reads_bam = minimap.run(outdir, ref_genome, fq1, fq2, sample_name=sample_name)
     else:
-        all_reads_bam = minimap.run_se(outdir, ref_genome, fq1)
+        all_reads_bam = minimap.run_se(outdir, ref_genome, fq1, sample_name=sample_name)
     sample_outprefix = os.path.join(outdir, "sample")
     sampler = sample_reads.sample_reads(
         ref_genome,
@@ -91,10 +92,10 @@ def run_one_sample(
     varifier_out = os.path.join(outdir, "varifier")
     if paired:
         self_map = minimap.run(
-            outdir, assembly, sampler.fq_out1, sampler.fq_out2, prefix="self_qc"
+            outdir, assembly, sampler.fq_out1, sampler.fq_out2, prefix="self_qc", sample_name=sample_name
         )
     else:
-        self_map = minimap.run_se(outdir, assembly, sampler.fq_out, prefix="self_qc")
+        self_map = minimap.run_se(outdir, assembly, sampler.fq_out, prefix="self_qc", sample_name=sample_name)
 
     masked_fasta = qcovid.self_qc(outdir, assembly, self_map)
 
diff --git a/viridian_workflow/tasks/run_one_sample.py b/viridian_workflow/tasks/run_one_sample.py
index 87eb388..cdf8004 100644
--- a/viridian_workflow/tasks/run_one_sample.py
+++ b/viridian_workflow/tasks/run_one_sample.py
@@ -23,4 +23,5 @@ def run(options):
         keep_intermediate=options.debug,
         keep_bam=options.keep_bam,
         target_sample_depth=options.target_sample_depth,
+        sample_name=options.sample_name,
     )

From 157f055e0096d5a79e5c93de0f5af93a4e64d322 Mon Sep 17 00:00:00 2001
From: Martin Hunt <martin.g.hunt@gmail.com>
Date: Thu, 14 Oct 2021 16:55:07 +0100
Subject: [PATCH 2/4] Put sample name in final VCF file

---
 .../set_sample_name_in_vcf_file.expect.vcf    |  6 +++++
 .../utils/set_sample_name_in_vcf_file.in.vcf  |  6 +++++
 tests/utils_test.py                           | 10 ++++++++
 viridian_workflow/one_sample_pipeline.py      | 23 +++++++++++++++----
 viridian_workflow/utils.py                    | 11 +++++++++
 5 files changed, 51 insertions(+), 5 deletions(-)
 create mode 100644 tests/data/utils/set_sample_name_in_vcf_file.expect.vcf
 create mode 100644 tests/data/utils/set_sample_name_in_vcf_file.in.vcf

diff --git a/tests/data/utils/set_sample_name_in_vcf_file.expect.vcf b/tests/data/utils/set_sample_name_in_vcf_file.expect.vcf
new file mode 100644
index 0000000..18bac5c
--- /dev/null
+++ b/tests/data/utils/set_sample_name_in_vcf_file.expect.vcf
@@ -0,0 +1,6 @@
+##fileformat=VCFv4.2
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Read depth">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NEW_NAME
+ref.1	1	.	A	G	61	.	DP=42	GT	1/1
+ref.1	5	.	C	T	62	.	DP=43	GT	0/0
diff --git a/tests/data/utils/set_sample_name_in_vcf_file.in.vcf b/tests/data/utils/set_sample_name_in_vcf_file.in.vcf
new file mode 100644
index 0000000..0f90736
--- /dev/null
+++ b/tests/data/utils/set_sample_name_in_vcf_file.in.vcf
@@ -0,0 +1,6 @@
+##fileformat=VCFv4.2
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Read depth">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	OLD_NAME
+ref.1	1	.	A	G	61	.	DP=42	GT	1/1
+ref.1	5	.	C	T	62	.	DP=43	GT	0/0
diff --git a/tests/utils_test.py b/tests/utils_test.py
index eb056bd..37db52b 100644
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -44,3 +44,13 @@ def test_load_amplicons_bed_file():
     ]
     infile = os.path.join(data_dir, "load_amplicons_bed_file.bed")
     assert expect == utils.load_amplicons_bed_file(infile)
+
+
+def test_set_sample_name_in_vcf_file():
+    infile = os.path.join(data_dir, "set_sample_name_in_vcf_file.in.vcf")
+    tmp_out = "tmp.set_sample_name_in_vcf_file.vcf"
+    subprocess.check_output(f"rm -f {tmp_out}", shell=True)
+    utils.set_sample_name_in_vcf_file(infile, tmp_out, "NEW_NAME")
+    expect = os.path.join(data_dir, "set_sample_name_in_vcf_file.expect.vcf")
+    assert filecmp.cmp(tmp_out, expect, shallow=False)
+    os.unlink(tmp_out)
diff --git a/viridian_workflow/one_sample_pipeline.py b/viridian_workflow/one_sample_pipeline.py
index 0d5d973..f727458 100644
--- a/viridian_workflow/one_sample_pipeline.py
+++ b/viridian_workflow/one_sample_pipeline.py
@@ -7,6 +7,7 @@
     check_file,
     run_process,
     rm,
+    set_sample_name_in_vcf_file,
 )
 
 
@@ -67,7 +68,9 @@ def run_one_sample(
         amplicon_json, amplicon_bed
     )
     if paired:
-        all_reads_bam = minimap.run(outdir, ref_genome, fq1, fq2, sample_name=sample_name)
+        all_reads_bam = minimap.run(
+            outdir, ref_genome, fq1, fq2, sample_name=sample_name
+        )
     else:
         all_reads_bam = minimap.run_se(outdir, ref_genome, fq1, sample_name=sample_name)
     sample_outprefix = os.path.join(outdir, "sample")
@@ -92,21 +95,31 @@ def run_one_sample(
     varifier_out = os.path.join(outdir, "varifier")
     if paired:
         self_map = minimap.run(
-            outdir, assembly, sampler.fq_out1, sampler.fq_out2, prefix="self_qc", sample_name=sample_name
+            outdir,
+            assembly,
+            sampler.fq_out1,
+            sampler.fq_out2,
+            prefix="self_qc",
+            sample_name=sample_name,
         )
     else:
-        self_map = minimap.run_se(outdir, assembly, sampler.fq_out, prefix="self_qc", sample_name=sample_name)
+        self_map = minimap.run_se(
+            outdir, assembly, sampler.fq_out, prefix="self_qc", sample_name=sample_name
+        )
 
     masked_fasta = qcovid.self_qc(outdir, assembly, self_map)
 
-    vcf = varifier.run(
+    varifier_vcf = varifier.run(
         varifier_out,
         ref_genome,
         masked_fasta,
         min_coord=amplicons_start,
         max_coord=amplicons_end,
     )
-    check_file(vcf)
+    check_file(varifier_vcf)
+    final_vcf = os.path.join(outdir, "variants.vcf")
+    set_sample_name_in_vcf_file(varifier_vcf, final_vcf, sample_name)
+    check_file(final_vcf)
 
     # clean up intermediate files
     if not keep_intermediate:
diff --git a/viridian_workflow/utils.py b/viridian_workflow/utils.py
index c7dbf2e..ec6e5b4 100644
--- a/viridian_workflow/utils.py
+++ b/viridian_workflow/utils.py
@@ -96,3 +96,14 @@ def load_single_seq_fasta(infile):
     ref = list(d.values())[0]
     ref.id = ref.id.split()[0]
     return ref
+
+
+def set_sample_name_in_vcf_file(infile, outfile, sample_name):
+    with open(infile) as f_in, open(outfile, "w") as f_out:
+        for line in f_in:
+            if line.startswith("#CHROM\tPOS"):
+                fields = line.rstrip().split("\t")
+                fields[-1] = sample_name
+                print(*fields, sep="\t", file=f_out)
+            else:
+                print(line, end="", file=f_out)

From 2bb8a7a77c5fdbaf078cc68da48abbaae3848be1 Mon Sep 17 00:00:00 2001
From: Martin Hunt <martin.g.hunt@gmail.com>
Date: Thu, 14 Oct 2021 17:08:07 +0100
Subject: [PATCH 3/4] Put sample name in final FASTA file

---
 .../data/utils/set_seq_name_in_fasta_file.expect.fasta |  2 ++
 tests/data/utils/set_seq_name_in_fasta_file.in.fasta   |  2 ++
 tests/utils_test.py                                    | 10 ++++++++++
 viridian_workflow/one_sample_pipeline.py               |  5 ++++-
 viridian_workflow/utils.py                             |  9 +++++++++
 5 files changed, 27 insertions(+), 1 deletion(-)
 create mode 100644 tests/data/utils/set_seq_name_in_fasta_file.expect.fasta
 create mode 100644 tests/data/utils/set_seq_name_in_fasta_file.in.fasta

diff --git a/tests/data/utils/set_seq_name_in_fasta_file.expect.fasta b/tests/data/utils/set_seq_name_in_fasta_file.expect.fasta
new file mode 100644
index 0000000..8e42d3a
--- /dev/null
+++ b/tests/data/utils/set_seq_name_in_fasta_file.expect.fasta
@@ -0,0 +1,2 @@
+>NEW_NAME
+ACGT
diff --git a/tests/data/utils/set_seq_name_in_fasta_file.in.fasta b/tests/data/utils/set_seq_name_in_fasta_file.in.fasta
new file mode 100644
index 0000000..7376c7c
--- /dev/null
+++ b/tests/data/utils/set_seq_name_in_fasta_file.in.fasta
@@ -0,0 +1,2 @@
+>OLD_NAME
+ACGT
diff --git a/tests/utils_test.py b/tests/utils_test.py
index 37db52b..6b48d4c 100644
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -54,3 +54,13 @@ def test_set_sample_name_in_vcf_file():
     expect = os.path.join(data_dir, "set_sample_name_in_vcf_file.expect.vcf")
     assert filecmp.cmp(tmp_out, expect, shallow=False)
     os.unlink(tmp_out)
+
+
+def test_set_seq_name_in_fasta_file():
+    infile = os.path.join(data_dir, "set_seq_name_in_fasta_file.in.fasta")
+    tmp_out = "tmp.set_seq_name_in_fasta_file.fasta"
+    subprocess.check_output(f"rm -f {tmp_out}", shell=True)
+    utils.set_seq_name_in_fasta_file(infile, tmp_out, "NEW_NAME")
+    expect = os.path.join(data_dir, "set_seq_name_in_fasta_file.expect.fasta")
+    assert filecmp.cmp(tmp_out, expect, shallow=False)
+    os.unlink(tmp_out)
diff --git a/viridian_workflow/one_sample_pipeline.py b/viridian_workflow/one_sample_pipeline.py
index f727458..fcdec8f 100644
--- a/viridian_workflow/one_sample_pipeline.py
+++ b/viridian_workflow/one_sample_pipeline.py
@@ -8,6 +8,7 @@
     run_process,
     rm,
     set_sample_name_in_vcf_file,
+    set_seq_name_in_fasta_file,
 )
 
 
@@ -108,11 +109,13 @@ def run_one_sample(
         )
 
     masked_fasta = qcovid.self_qc(outdir, assembly, self_map)
+    final_masked_fasta = os.path.join(outdir, "consensus.masked.fa")
+    set_seq_name_in_fasta_file(masked_fasta, final_masked_fasta, sample_name)
 
     varifier_vcf = varifier.run(
         varifier_out,
         ref_genome,
-        masked_fasta,
+        final_masked_fasta,
         min_coord=amplicons_start,
         max_coord=amplicons_end,
     )
diff --git a/viridian_workflow/utils.py b/viridian_workflow/utils.py
index ec6e5b4..f411034 100644
--- a/viridian_workflow/utils.py
+++ b/viridian_workflow/utils.py
@@ -107,3 +107,12 @@ def set_sample_name_in_vcf_file(infile, outfile, sample_name):
                 print(*fields, sep="\t", file=f_out)
             else:
                 print(line, end="", file=f_out)
+
+
+def set_seq_name_in_fasta_file(infile, outfile, new_name):
+    """Changes name in FASTA file. Assumes that there is only one sequence
+    in the file, raises Exception if it finds >1 sequence"""
+    seq = load_single_seq_fasta(infile)
+    seq.id = new_name
+    with open(outfile, "w") as f:
+        print(seq, file=f)

From 92810472bdeb6d27f4345abf5d253cc1978cd6bf Mon Sep 17 00:00:00 2001
From: Martin Hunt <martin.g.hunt@gmail.com>
Date: Thu, 14 Oct 2021 17:11:44 +0100
Subject: [PATCH 4/4] Add description of option --sample_name

---
 README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/README.md b/README.md
index 1fefc51..c8b06e5 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,9 @@ The FASTA and JSON files in those commands can be found in the `data/`
 directory of this repository.
 
 Other options:
+* `--sample_name MY_NAME`: use this to change the sample name
+  (default is "sample") that is put in the final FASTA file, BAM file, and
+  VCF file.
 * `--keep_bam`: use this option to keep the BAM file of original input reads
   mapped to the reference genome.
 * `--force`: use with caution - it will overwrite the output directory if