From e89c2741d607679c70fefdcd1efe7b51278bc67f Mon Sep 17 00:00:00 2001
From: Jared Simpson
Date: Wed, 28 Apr 2021 10:47:30 -0400
Subject: [PATCH] implement methods to identify variants that disrupt amplicons

---
 workflow/rules/annotation.smk          | 46 +++++++++++++++++++++++++-
 workflow/scripts/primer_snp_depth.py   | 42 +++++++++++++++++++++++
 workflow/scripts/primer_snp_summary.py | 38 +++++++++++++++++++++
 3 files changed, 125 insertions(+), 1 deletion(-)
 create mode 100644 workflow/scripts/primer_snp_depth.py
 create mode 100644 workflow/scripts/primer_snp_summary.py

diff --git a/workflow/rules/annotation.smk b/workflow/rules/annotation.smk
index 6bf1a85..d6a0726 100644
--- a/workflow/rules/annotation.smk
+++ b/workflow/rules/annotation.smk
@@ -41,12 +41,18 @@ def get_recurrent_heatmap_plot(wildcards):
     out = "plots/%s_aa_mutation_heatmap.pdf" % (prefix)
     return out
 
+def get_primer_snp_depth_table(wildcards):
+    prefix = get_run_name()
+    out = "qc_annotation/%s.primer_snp_depth_summary.tsv" % (prefix)
+    return out
+
 #
 # Rules for annotating variants with functional consequence
 #
 rule all_qc_annotation:
     input:
-        get_recurrent_heatmap_plot
+        get_recurrent_heatmap_plot,
+        get_primer_snp_depth_table
 
 rule build_snpeff_db:
     input:
@@ -113,3 +119,41 @@ rule create_recurrent_mutation_heatmap:
     shell:
         "Rscript {params.script} --path qc_annotation --output {output} --threshold {params.threshold}"
 
+rule create_primer_snp_bed:
+    input:
+        vcf=get_vcf_file,
+        primer_bed=get_primer_bed
+    output:
+        "qc_annotation/{sample}.primer_snps.bed"
+    shell:
+        "bedtools intersect -a {input.vcf} -b {input.primer_bed} -wb > {output}"
+
+rule create_primer_snp_depth:
+    input:
+        primer_snps="qc_annotation/{sample}.primer_snps.bed",
+        amplicon_depth="qc_sequencing/{sample}.amplicon_depth.bed"
+    output:
+        "qc_annotation/{sample}.primer_snp_depth.tsv"
+    params:
+        script=srcdir("../scripts/primer_snp_depth.py")
+    shell:
+        "python {params.script} --sample-name {wildcards.sample} --primer-snp-bed {input.primer_snps} --amplicon-depth-tsv {input.amplicon_depth} > {output}"
+
+rule merge_primer_snp_depth:
+    input:
+        expand("qc_annotation/{s}.primer_snp_depth.tsv", s=get_sample_names())
+    output:
+        "qc_annotation/{prefix}.primer_snp_depth_merged.tsv"
+    shell:
+        "cat {input} | awk 'NR == 1 || $0 !~ /primer_name/' > {output}"
+
+rule summarize_primer_snp_depth:
+    input:
+        "qc_annotation/{prefix}.primer_snp_depth_merged.tsv"
+    output:
+        "qc_annotation/{prefix}.primer_snp_depth_summary.tsv"
+    params:
+        script=srcdir("../scripts/primer_snp_summary.py")
+    shell:
+        "python {params.script} --input-tsv {input} > {output}"
+
diff --git a/workflow/scripts/primer_snp_depth.py b/workflow/scripts/primer_snp_depth.py
new file mode 100644
index 0000000..e0b6875
--- /dev/null
+++ b/workflow/scripts/primer_snp_depth.py
@@ -0,0 +1,42 @@
+import argparse
+import sys
+import csv
+import os
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--primer-snp-bed', type=str, required=True)
+parser.add_argument('--amplicon-depth-tsv', type=str, required=True)
+parser.add_argument('--sample-name', type=str, default="none")
+args, files = parser.parse_known_args()
+
+def main():
+    print("\t".join(["sample", "contig", "position", "ref", "alt", "primer_name", "amplicon", "depth"]))
+
+    # read in the depth of each amplicon
+    depth_map = dict()
+    with open(args.amplicon_depth_tsv, 'r') as ifh:
+        reader = csv.DictReader(ifh, delimiter='\t')
+        for record in reader:
+            depth_map[record['amplicon_id']] = record['mean_depth']
+
+    # read in the bed file containing mutations that overlap primers
+    seen = dict()
+    with(open(args.primer_snp_bed, 'r')) as ifh:
+        for line in ifh:
+            fields = line.rstrip().split()
+            assert(len(fields) == 16)
+            primer_name = fields[13]
+            primer_direction_idx = max(primer_name.find("_LEFT"), primer_name.find("_RIGHT"))
+            if primer_direction_idx == -1:
+                sys.stderr.write("Could not parse primer name %s" % (primer_name))
+                sys.exit(1)
+            amplicon_id = primer_name[0:primer_direction_idx]
+
+            # skip duplicate entries when a mutation lands in ALT versions of primers
+            key = ":".join([fields[0], fields[1], fields[3], fields[4], amplicon_id])
+            if key not in seen:
+                print("\t".join([args.sample_name, fields[0], fields[1], fields[3], fields[4], primer_name, amplicon_id, depth_map[amplicon_id]]))
+                seen[key] = 1
+
+if __name__ == '__main__':
+    main()
diff --git a/workflow/scripts/primer_snp_summary.py b/workflow/scripts/primer_snp_summary.py
new file mode 100644
index 0000000..098f3ba
--- /dev/null
+++ b/workflow/scripts/primer_snp_summary.py
@@ -0,0 +1,38 @@
+import argparse
+import numpy
+import sys
+import csv
+import os
+
+from collections import defaultdict
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--input-tsv', type=str, required=True)
+args, files = parser.parse_known_args()
+
+def main():
+    print("\t".join(["contig", "position", "ref", "alt", "primer_name", "amplicon", "num_samples", "mean_depth", "min_depth", "max_depth"]))
+
+    # read in the depth per sample
+    depth_map = defaultdict(list)
+
+    with open(args.input_tsv, 'r') as ifh:
+        reader = csv.DictReader(ifh, delimiter='\t')
+        for record in reader:
+            key = ":".join([record['contig'], record['position'], record['ref'], record['alt'], record['primer_name'], record['amplicon']])
+            depth_map[key].append(float(record['depth']))
+
+
+    for key in depth_map.keys():
+        mean_depth = numpy.mean(depth_map[key])
+        min_depth = numpy.min(depth_map[key])
+        max_depth = numpy.max(depth_map[key])
+        fields = key.split(":")
+        fields.append(str(len(depth_map[key])))
+        fields.append("%.2f" % mean_depth)
+        fields.append("%.2f" % min_depth)
+        fields.append("%.2f" % max_depth)
+        print("\t".join(fields))
+
+if __name__ == '__main__':
+    main()