From e89c2741d607679c70fefdcd1efe7b51278bc67f Mon Sep 17 00:00:00 2001
From: Jared Simpson
Date: Wed, 28 Apr 2021 10:47:30 -0400
Subject: [PATCH] implement methods to identify variants that disrupt amplicons

---
 workflow/rules/annotation.smk          | 46 +++++++++++++++++++++++++-
 workflow/scripts/primer_snp_depth.py   | 42 +++++++++++++++++++++++
 workflow/scripts/primer_snp_summary.py | 38 +++++++++++++++++++++
 3 files changed, 125 insertions(+), 1 deletion(-)
 create mode 100644 workflow/scripts/primer_snp_depth.py
 create mode 100644 workflow/scripts/primer_snp_summary.py

diff --git a/workflow/rules/annotation.smk b/workflow/rules/annotation.smk
index 6bf1a85..d6a0726 100644
--- a/workflow/rules/annotation.smk
+++ b/workflow/rules/annotation.smk
@@ -41,12 +41,18 @@ def get_recurrent_heatmap_plot(wildcards):
     out = "plots/%s_aa_mutation_heatmap.pdf" % (prefix)
     return out
 
+def get_primer_snp_depth_table(wildcards):
+    prefix = get_run_name()
+    out = "qc_annotation/%s.primer_snp_depth_summary.tsv" % (prefix)
+    return out
+
 #
 # Rules for annotating variants with functional consequence
 #
 rule all_qc_annotation:
     input:
-        get_recurrent_heatmap_plot
+        get_recurrent_heatmap_plot,
+        get_primer_snp_depth_table
 
 rule build_snpeff_db:
     input:
@@ -113,3 +119,41 @@ rule create_recurrent_mutation_heatmap:
     shell:
         "Rscript {params.script} --path qc_annotation --output {output} --threshold {params.threshold}"
 
+rule create_primer_snp_bed:
+    input:
+        vcf=get_vcf_file,
+        primer_bed=get_primer_bed
+    output:
+        "qc_annotation/{sample}.primer_snps.bed"
+    shell:
+        "bedtools intersect -a {input.vcf} -b {input.primer_bed} -wb > {output}"
+
+rule create_primer_snp_depth:
+    input:
+        primer_snps="qc_annotation/{sample}.primer_snps.bed",
+        amplicon_depth="qc_sequencing/{sample}.amplicon_depth.bed"
+    output:
+        "qc_annotation/{sample}.primer_snp_depth.tsv"
+    params:
+        script=srcdir("../scripts/primer_snp_depth.py")
+    shell:
+        "python {params.script} --sample-name {wildcards.sample} --primer-snp-bed {input.primer_snps} --amplicon-depth-tsv {input.amplicon_depth} > {output}"
+
+rule merge_primer_snp_depth:
+    input:
+        expand("qc_annotation/{s}.primer_snp_depth.tsv", s=get_sample_names())
+    output:
+        "qc_annotation/{prefix}.primer_snp_depth_merged.tsv"
+    shell:
+        "cat {input} | awk 'NR == 1 || $0 !~ /primer_name/' > {output}"
+
+rule summarize_primer_snp_depth:
+    input:
+        "qc_annotation/{prefix}.primer_snp_depth_merged.tsv"
+    output:
+        "qc_annotation/{prefix}.primer_snp_depth_summary.tsv"
+    params:
+        script=srcdir("../scripts/primer_snp_summary.py")
+    shell:
+        "python {params.script} --input-tsv {input} > {output}"
+
diff --git a/workflow/scripts/primer_snp_depth.py b/workflow/scripts/primer_snp_depth.py
new file mode 100644
index 0000000..e0b6875
--- /dev/null
+++ b/workflow/scripts/primer_snp_depth.py
@@ -0,0 +1,42 @@
+import argparse
+import sys
+import csv
+import os
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--primer-snp-bed', type=str, required=True)
+parser.add_argument('--amplicon-depth-tsv', type=str, required=True)
+parser.add_argument('--sample-name', type=str, default="none")
+args, files = parser.parse_known_args()
+
+def main():
+    print("\t".join(["sample", "contig", "position", "ref", "alt", "primer_name", "amplicon", "depth"]))
+
+    # read in the depth of each amplicon
+    depth_map = dict()
+    with open(args.amplicon_depth_tsv, 'r') as ifh:
+        reader = csv.DictReader(ifh, delimiter='\t')
+        for record in reader:
+            depth_map[record['amplicon_id']] = record['mean_depth']
+
+    # read in the bed file containing mutations that overlap primers
+    seen = dict()
+    with(open(args.primer_snp_bed, 'r')) as ifh:
+        for line in ifh:
+            fields = line.rstrip().split()
+            assert(len(fields) == 16)
+            primer_name = fields[13]
+            primer_direction_idx = max(primer_name.find("_LEFT"), primer_name.find("_RIGHT"))
+            if primer_direction_idx == -1:
+                sys.stderr.write("Could not parse primer name %s" % (primer_name))
+                sys.exit(1)
+            amplicon_id = primer_name[0:primer_direction_idx]
+
+            # skip duplicate entries when a mutation lands in ALT versions of primers
+            key = ":".join([fields[0], fields[1], fields[3], fields[4], amplicon_id])
+            if key not in seen:
+                print("\t".join([args.sample_name, fields[0], fields[1], fields[3], fields[4], primer_name, amplicon_id, depth_map[amplicon_id]]))
+                seen[key] = 1
+
+if __name__ == '__main__':
+    main()
diff --git a/workflow/scripts/primer_snp_summary.py b/workflow/scripts/primer_snp_summary.py
new file mode 100644
index 0000000..098f3ba
--- /dev/null
+++ b/workflow/scripts/primer_snp_summary.py
@@ -0,0 +1,38 @@
+import argparse
+import numpy
+import sys
+import csv
+import os
+
+from collections import defaultdict
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--input-tsv', type=str, required=True)
+args, files = parser.parse_known_args()
+
+def main():
+    print("\t".join(["contig", "position", "ref", "alt", "primer_name", "amplicon", "num_samples", "mean_depth", "min_depth", "max_depth"]))
+
+    # read in the depth per sample
+    depth_map = defaultdict(list)
+
+    with open(args.input_tsv, 'r') as ifh:
+        reader = csv.DictReader(ifh, delimiter='\t')
+        for record in reader:
+            key = ":".join([record['contig'], record['position'], record['ref'], record['alt'], record['primer_name'], record['amplicon']])
+            depth_map[key].append(float(record['depth']))
+
+
+    for key in depth_map.keys():
+        mean_depth = numpy.mean(depth_map[key])
+        min_depth = numpy.min(depth_map[key])
+        max_depth = numpy.max(depth_map[key])
+        fields = key.split(":")
+        fields.append(str(len(depth_map[key])))
+        fields.append("%.2f" % mean_depth)
+        fields.append("%.2f" % min_depth)
+        fields.append("%.2f" % max_depth)
+        print("\t".join(fields))
+
+if __name__ == '__main__':
+    main()