From 95dcf6d7030c6168f5ef93b34cf174e288b78066 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Wed, 13 Jul 2016 21:55:55 -0400 Subject: [PATCH 01/13] add plot_coverage() and align_and_plot_coverage() to read_utils.py add plot_coverage() and align_and_plot_coverage() to read_utils.py, WIP --- read_utils.py | 256 ++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 1 + tools/__init__.py | 2 +- tools/bwa.py | 20 +++- tools/samtools.py | 6 ++ 5 files changed, 280 insertions(+), 5 deletions(-) diff --git a/read_utils.py b/read_utils.py index 7639b4eb1..c05283ac6 100755 --- a/read_utils.py +++ b/read_utils.py @@ -14,8 +14,11 @@ import os import tempfile import shutil +import csv +from collections import OrderedDict from Bio import SeqIO +import matplotlib.pyplot as plt import util.cmd import util.file @@ -1052,6 +1055,259 @@ def parser_bwamem_idxstats(parser=argparse.ArgumentParser()): # ========================= +def parser_plot_coverage_common(parser=argparse.ArgumentParser()): # parser needs add_help=False? + parser.add_argument('in_bam', + help='Input reads, BAM format.') + parser.add_argument('out_plot_file', + help='The generated chart file') + parser.add_argument('--plotFormat', + dest="plot_format", + default="pdf", + type=str, + choices=list(plt.gcf().canvas.get_supported_filetypes().keys()), + help="File format of the coverage plot") + parser.add_argument('--plotStyle', + dest="plot_style", + default="ggplot", + type=str, + choices=plt.style.available, + help="The plot visual style") + parser.add_argument('--plotWidth', + dest="plot_width", + default=800, + type=int, + help="Width of the plot in pixels") + parser.add_argument('--plotHeight', + dest="plot_height", + default=600, + type=int, + help="Width of the plot in pixels") + parser.add_argument('--plotTitle', + dest="plot_title", + default="Coverage Plot", + type=str, + help="The title displayed on the coverage plot") + parser.add_argument('-q', + dest="base_q_threshold", + default=None, + type=int, + help="The minimum base quality threshold") + parser.add_argument('-Q', + dest="mapping_q_threshold", + default=None, + type=int, + help="The minimum mapping quality threshold") + parser.add_argument('-m', + dest="max_coverage_depth", + default=1000000, + type=int, + help="The max coverage depth (default: %(default)s)") + parser.add_argument('-l', + dest="read_length_threshold", + default=None, + type=int, + help="Read length threshold") + parser.add_argument('--outSummary', + dest="out_summary", + default=None, + type=str, + help="Coverage summary TSV file. Default is to write to temp.") + return parser + +def plot_coverage(in_bam, out_plot_file, plot_format, plot_style, plot_width, plot_height, plot_title, base_q_threshold, mapping_q_threshold, max_coverage_depth, read_length_threshold, out_summary=None): + ''' + Generate a coverage plot from an aligned bam file + ''' + + # TODO: remove this: + #coverage_tsv_file = "/Users/tomkinsc/Downloads/plottest/test_multisegment.tsv" + + samtools = tools.samtools.SamtoolsTool() + + # check if in_bam is aligned, if not raise an error + num_mapped_reads = samtools.count(in_bam, opts=["-F", "4"]) + if num_mapped_reads == 0: + raise Exception("""The bam file specified appears to have zero mapped reads. 'plot_coverage' requires an aligned bam file. You can try 'align_and_plot_coverage' if you don't mind a simple bwa alignment. \n File: %s""" % in_bam) + + + if out_summary is None: + coverage_tsv_file = mkstempfname('.summary.tsv') + else: + coverage_tsv_file = out_summary + + bam_aligned = mkstempfname('.aligned.bam') + + if in_bam[-4:] == ".sam": + # convert sam -> bam + samtools.view(["-b"], in_bam, bam_aligned) + elif in_bam[-4:] == ".bam": + shutil.copyfile(in_bam, bam_aligned) + + # call samtools sort + bam_sorted = mkstempfname('.aligned.bam') + samtools.sort(bam_aligned, bam_sorted) + + # call samtools index + samtools.index(bam_sorted) + + # call samtools depth + opts = [] + opts += ['-aa'] # report coverate at "absolutely all" positions + if base_q_threshold: + opts += ["-q", str(base_q_threshold)] + if mapping_q_threshold: + opts += ["-Q", str(mapping_q_threshold)] + if max_coverage_depth: + opts += ["-m", str(max_coverage_depth)] + if read_length_threshold: + opts += ["-l", str(read_length_threshold)] + + samtools.depth(bam_sorted, coverage_tsv_file, opts) + + # ---- create plot based on coverage_tsv_file ---- + + segment_depths = OrderedDict() + domain_max = 0 + with open(coverage_tsv_file, "r") as tabfile: + for row in csv.reader(tabfile, delimiter='\t'): + segment_depths.setdefault(row[0],[]).append(int(row[2])) + domain_max += 1 + + domain_max = 0 + with plt.style.context(plot_style): + + fig = plt.gcf() + DPI = fig.get_dpi() + fig.set_size_inches(float(plot_width)/float(DPI),float(plot_height)/float(DPI)) + + font_size = math.sqrt((plot_width**2)+(plot_height**2))/float(DPI)*1.25 + + ax = plt.subplot() # Defines ax variable by creating an empty plot + + # Set the tick labels font + for label in (ax.get_xticklabels() + ax.get_yticklabels()): + label.set_fontname('Arial') + label.set_fontsize(font_size) + + for segment_num, (segment_name, position_depths) in enumerate(segment_depths.items()): + prior_domain_max = domain_max + domain_max += len(position_depths) + + segment_color = plt.rcParams['axes.color_cycle'][segment_num%len(plt.rcParams['axes.color_cycle'])] + plot = plt.fill_between(range(prior_domain_max, domain_max), position_depths, [0]*len(position_depths), linewidth=0, antialiased=True, color=segment_color) + + plt.title(plot_title, fontsize=font_size*1.2) + plt.xlabel("bp", fontsize=font_size*1.1) + plt.ylabel("read depth", fontsize=font_size*1.1) + plt.tight_layout() + plt.savefig(out_plot_file, format=plot_format, dpi=DPI) #, bbox_inches='tight') + log.info("Coverage plot saved to: " + out_plot_file) + + os.unlink(bam_aligned) + os.unlink(bam_sorted) + # TODO: uncomment when done testing + if not out_summary: + os.unlink(coverage_tsv_file) + + +def parser_plot_coverage(parser=argparse.ArgumentParser()): + parser = parser_plot_coverage_common(parser) + util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None))) + util.cmd.attach_main(parser, plot_coverage, split_args=True) + return parser + +__commands__.append(('plot_coverage', parser_plot_coverage)) + +def align_and_plot_coverage(out_plot_file, plot_format, plot_style, plot_width, plot_height, plot_title, base_q_threshold, mapping_q_threshold, max_coverage_depth, read_length_threshold, out_summary, + in_bam, ref_fasta, out_bam=None, sensitive=False, min_score_to_output=None + ): + ''' + Take reads, align to reference with BWA-MEM, and generate a coverage plot + ''' + if out_bam is None: + bam_aligned = mkstempfname('.aligned.bam') + else: + bam_aligned = out_bam + + ref_indexed = mkstempfname('.reference.fasta') + shutil.copyfile(ref_fasta, ref_indexed) + + bwa = tools.bwa.Bwa() + samtools = tools.samtools.SamtoolsTool() + + bwa.index(ref_indexed) + + bwa_opts = [] + if sensitive: + bwa_opts + "-k 12 -A 1 -B 1 -O 1 -E 1".split() + + map_threshold = min_score_to_output or 30 + + bwa_opts + ["-T", str(map_threshold)] + + aln_sam = mkstempfname('.sam') + aln_sam_filtered = mkstempfname('.filtered.sam') + + bwa.mem(in_bam, ref_indexed, aln_sam, opts=bwa_opts) + + # @haydenm says: + # For some reason (particularly when the --sensitive option is on), bwa + # doesn't listen to its '-T' flag and outputs alignments with score less + # than the '-T 30' threshold. So filter these: + os.system("grep \"^@\" " + aln_sam + " > " + aln_sam_filtered) + os.system("grep \"AS:i:\" " + aln_sam + " | awk -v threshold=" + str(map_threshold) + " '{split($14, subfield, \":\"); if(subfield[3]>=threshold) print $0}' >> " + aln_sam_filtered) + os.unlink(aln_sam) + + # convert sam -> bam + aln_bam_filtered = mkstempfname('.reference.fasta') + samtools.view(["-b"], aln_sam_filtered, aln_bam_filtered) + os.unlink(aln_sam_filtered) + + samtools.sort(aln_bam_filtered, bam_aligned) + os.unlink(aln_bam_filtered) + + samtools.index(bam_aligned) + + + # call plot function + plot_coverage(bam_aligned, out_plot_file, plot_format, plot_style, plot_width, plot_height, plot_title, base_q_threshold, mapping_q_threshold, max_coverage_depth, read_length_threshold, out_summary) + + # remove the output bam, unless it is needed + if out_bam is None: + os.unlink(bam_aligned) + + # remove the files created by bwa index. + # The empty extension causes the original fasta file to be removed + for ext in [".amb",".ann",".bwt",".bwa",".pac",".sa",""]: + file_to_remove = ref_indexed+ext + if os.path.isfile(file_to_remove): + os.unlink( file_to_remove ) + +def parser_align_and_plot_coverage(parser=argparse.ArgumentParser()): + parser = parser_plot_coverage_common(parser) + parser.add_argument('ref_fasta', + default=None, + help='Reference genome, FASTA format.') + parser.add_argument('--outBam', + dest="out_bam", + default=None, + help='Output aligned, indexed BAM file. Default is to write to temp.') + parser.add_argument('--sensitive', + action="store_true", + help="Equivalent to giving bwa: '-k 12 -A 1 -B 1 -O 1 -E 1' ") + parser.add_argument('-T', + dest="min_score_to_output", + default=30, + type=int, + help="The min score to output during alignment (default: %(default)s)") + + util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None))) + util.cmd.attach_main(parser, align_and_plot_coverage, split_args=True) + return parser + +__commands__.append(('align_and_plot_coverage', parser_align_and_plot_coverage)) + +# ========================= def full_parser(): return util.cmd.make_parser(__commands__, __doc__) diff --git a/requirements.txt b/requirements.txt index ea56db617..8110b6f35 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ future>=0.15.2 futures==3.0.3; python_version == '2.6' or python_version=='2.7' pysam==0.8.3 PyYAML==3.11 +matplotlib==1.5.1 \ No newline at end of file diff --git a/tools/__init__.py b/tools/__init__.py index bc27ea07a..6b0a8ca8c 100644 --- a/tools/__init__.py +++ b/tools/__init__.py @@ -285,7 +285,7 @@ def __init__( self.env_path = os.path.dirname(last_path_component) if last_path_component == "bin" else conda_env_path else: # if conda env is an environment name, infer the path _log.debug('Conda env found is specified by name: %s' % conda_env_path) - result = util.misc.run_and_print(["conda", "env", "list", "--json"], loglevel=logging.DEBUG, env=os.environ) + result = util.misc.run_and_print(["conda", "env", "list", "--json"], silent=True, env=os.environ) if result.returncode == 0: command_output = result.stdout.decode("UTF-8") data = json.loads(self._string_from_start_of_json(command_output)) diff --git a/tools/bwa.py b/tools/bwa.py index 56760c23e..6581e5946 100644 --- a/tools/bwa.py +++ b/tools/bwa.py @@ -7,6 +7,8 @@ import os import os.path import subprocess +import shutil + import tools import tools.samtools import util.file @@ -46,18 +48,28 @@ def index(self, inFasta, algorithm=None): cmd.append(inFasta) self.execute('index', cmd) - def mem(self, inReads, refDb, outAlign, threads=None): + def mem(self, inReads, refDb, outAlign, opts=None, threads=None): + opts = [] if not opts else opts + threads = threads or util.misc.available_cpu_count() samtools = tools.samtools.SamtoolsTool() fq1 = util.file.mkstempfname('.1.fastq') fq2 = util.file.mkstempfname('.2.fastq') aln_sam = util.file.mkstempfname('.sam') + aln_sam_sorted = util.file.mkstempfname('sorted.sam') samtools.bam2fq(inReads, fq1, fq2) - self.execute('mem', ['-t', str(threads), refDb, fq1, fq2], stdout=aln_sam) + self.execute('mem', opts + ['-t', str(threads), refDb, fq1, fq2], stdout=aln_sam) os.unlink(fq1) os.unlink(fq2) - samtools.sort(aln_sam, outAlign) + samtools.sort(aln_sam, aln_sam_sorted) os.unlink(aln_sam) - samtools.index(outAlign) + # cannot index sam files; only do so if a bam is desired + if outAlign.endswith(".bam"): + # convert sam -> bam + samtools.view(["-b"], aln_sam_sorted, outAlign) + samtools.index(outAlign) + elif outAlign.endswith(".sam"): + shutil.copyfile(aln_sam_sorted, outAlign) + os.unlink(aln_sam_sorted) diff --git a/tools/samtools.py b/tools/samtools.py index a34fe59dc..51899e70b 100644 --- a/tools/samtools.py +++ b/tools/samtools.py @@ -105,6 +105,12 @@ def faidx(self, inFasta, overwrite=False): #pysam.faidx(inFasta) self.execute('faidx', [inFasta]) + def depth(self, inBam, outFile, options=None): + """ Write a TSV file with coverage depth by position """ + options = options or ["-aa", "-m", "1000000"] + + self.execute('depth', options + [inBam], stdout=outFile) + def idxstats(self, inBam, statsFile): self.execute('idxstats', [inBam], stdout=statsFile) From a1d073f5d8a5a1b051a88823fc3e12d006e86a7c Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Wed, 13 Jul 2016 22:28:04 -0400 Subject: [PATCH 02/13] minor changes to fix matplotlib deprecation warnings --- read_utils.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/read_utils.py b/read_utils.py index c05283ac6..05a2c6097 100755 --- a/read_utils.py +++ b/read_utils.py @@ -1074,12 +1074,12 @@ def parser_plot_coverage_common(parser=argparse.ArgumentParser()): # parser need help="The plot visual style") parser.add_argument('--plotWidth', dest="plot_width", - default=800, + default=1024, type=int, help="Width of the plot in pixels") parser.add_argument('--plotHeight', dest="plot_height", - default=600, + default=768, type=int, help="Width of the plot in pixels") parser.add_argument('--plotTitle', @@ -1186,26 +1186,32 @@ def plot_coverage(in_bam, out_plot_file, plot_format, plot_style, plot_width, pl # Set the tick labels font for label in (ax.get_xticklabels() + ax.get_yticklabels()): - label.set_fontname('Arial') label.set_fontsize(font_size) for segment_num, (segment_name, position_depths) in enumerate(segment_depths.items()): prior_domain_max = domain_max domain_max += len(position_depths) - segment_color = plt.rcParams['axes.color_cycle'][segment_num%len(plt.rcParams['axes.color_cycle'])] - plot = plt.fill_between(range(prior_domain_max, domain_max), position_depths, [0]*len(position_depths), linewidth=0, antialiased=True, color=segment_color) + colors = list(plt.rcParams['axes.prop_cycle'].by_key()['color']) # get the colors for this style + segment_color = colors[segment_num%len(colors)] # pick a color, offset by the segment index + plt.fill_between(range(prior_domain_max, domain_max), position_depths, [0]*len(position_depths), linewidth=0, antialiased=True, color=segment_color) plt.title(plot_title, fontsize=font_size*1.2) plt.xlabel("bp", fontsize=font_size*1.1) plt.ylabel("read depth", fontsize=font_size*1.1) - plt.tight_layout() + + # to squash a backend renderer error on OSX related to tight layout + if plt.get_backend().lower() in ['agg', 'macosx']: + fig.set_tight_layout(True) + else: + fig.tight_layout() + plt.savefig(out_plot_file, format=plot_format, dpi=DPI) #, bbox_inches='tight') log.info("Coverage plot saved to: " + out_plot_file) os.unlink(bam_aligned) os.unlink(bam_sorted) - # TODO: uncomment when done testing + if not out_summary: os.unlink(coverage_tsv_file) From f5e45fda23d9c4e4c425b543fe357c82cf4fd8e2 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Wed, 13 Jul 2016 22:31:25 -0400 Subject: [PATCH 03/13] explicitly set matplotlib backend to hopefully fix Travis $DISPLAY==null issue --- read_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/read_utils.py b/read_utils.py index 05a2c6097..a72f8d6d6 100755 --- a/read_utils.py +++ b/read_utils.py @@ -18,6 +18,8 @@ from collections import OrderedDict from Bio import SeqIO +import matplotlib +matplotlib.use('Agg') import matplotlib.pyplot as plt import util.cmd From db75236ce8aeebb44067af6c6078b8ca70ed6faf Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Wed, 13 Jul 2016 23:18:21 -0400 Subject: [PATCH 04/13] improve parser defaults and help display --- read_utils.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/read_utils.py b/read_utils.py index a72f8d6d6..f198f085d 100755 --- a/read_utils.py +++ b/read_utils.py @@ -1064,31 +1064,33 @@ def parser_plot_coverage_common(parser=argparse.ArgumentParser()): # parser need help='The generated chart file') parser.add_argument('--plotFormat', dest="plot_format", - default="pdf", + default=None, type=str, choices=list(plt.gcf().canvas.get_supported_filetypes().keys()), - help="File format of the coverage plot") + metavar='', + help="File format of the coverage plot. By default it is inferred from the file extension of out_plot_file, but it can be set explicitly via --plotFormat. Valid formats include: " + ", ".join( list(plt.gcf().canvas.get_supported_filetypes().keys())) ) parser.add_argument('--plotStyle', dest="plot_style", default="ggplot", type=str, choices=plt.style.available, - help="The plot visual style") + metavar='', + help="The plot visual style. Valid options: " + ", ".join(plt.style.available) + " (default: %(default)s)") parser.add_argument('--plotWidth', dest="plot_width", default=1024, type=int, - help="Width of the plot in pixels") + help="Width of the plot in pixels (default: %(default)s)") parser.add_argument('--plotHeight', dest="plot_height", default=768, type=int, - help="Width of the plot in pixels") + help="Width of the plot in pixels (default: %(default)s)") parser.add_argument('--plotTitle', dest="plot_title", default="Coverage Plot", type=str, - help="The title displayed on the coverage plot") + help="The title displayed on the coverage plot (default: '%(default)s')") parser.add_argument('-q', dest="base_q_threshold", default=None, From e0d862b32d32c1aa16d736c5f0e04ca3d43696dc Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Wed, 13 Jul 2016 23:42:34 -0400 Subject: [PATCH 05/13] for bwa, change conditional to use bam or cram --- tools/bwa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bwa.py b/tools/bwa.py index 6581e5946..37a870aeb 100644 --- a/tools/bwa.py +++ b/tools/bwa.py @@ -64,7 +64,7 @@ def mem(self, inReads, refDb, outAlign, opts=None, threads=None): samtools.sort(aln_sam, aln_sam_sorted) os.unlink(aln_sam) # cannot index sam files; only do so if a bam is desired - if outAlign.endswith(".bam"): + if outAlign.endswith(".bam") or outAlign.endswith(".cram"): # convert sam -> bam samtools.view(["-b"], aln_sam_sorted, outAlign) samtools.index(outAlign) From 884d8bc12533c3d774a7948ce7a9a12acafba06e Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Wed, 13 Jul 2016 23:42:59 -0400 Subject: [PATCH 06/13] add data display styles of dots, line, and filled to plot_coverage --- read_utils.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/read_utils.py b/read_utils.py index f198f085d..d9fb43a17 100755 --- a/read_utils.py +++ b/read_utils.py @@ -1069,6 +1069,13 @@ def parser_plot_coverage_common(parser=argparse.ArgumentParser()): # parser need choices=list(plt.gcf().canvas.get_supported_filetypes().keys()), metavar='', help="File format of the coverage plot. By default it is inferred from the file extension of out_plot_file, but it can be set explicitly via --plotFormat. Valid formats include: " + ", ".join( list(plt.gcf().canvas.get_supported_filetypes().keys())) ) + parser.add_argument('--plotDataStyle', + dest="plot_data_style", + default="filled", + type=str, + choices=["filled","line","dots"], + metavar='', + help="The plot data display style. Valid options: " + ", ".join(["filled","line","dots"]) + " (default: %(default)s)") parser.add_argument('--plotStyle', dest="plot_style", default="ggplot", @@ -1118,7 +1125,7 @@ def parser_plot_coverage_common(parser=argparse.ArgumentParser()): # parser need help="Coverage summary TSV file. Default is to write to temp.") return parser -def plot_coverage(in_bam, out_plot_file, plot_format, plot_style, plot_width, plot_height, plot_title, base_q_threshold, mapping_q_threshold, max_coverage_depth, read_length_threshold, out_summary=None): +def plot_coverage(in_bam, out_plot_file, plot_format, plot_data_style, plot_style, plot_width, plot_height, plot_title, base_q_threshold, mapping_q_threshold, max_coverage_depth, read_length_threshold, out_summary=None): ''' Generate a coverage plot from an aligned bam file ''' @@ -1141,10 +1148,10 @@ def plot_coverage(in_bam, out_plot_file, plot_format, plot_style, plot_width, pl bam_aligned = mkstempfname('.aligned.bam') - if in_bam[-4:] == ".sam": + if in_bam.endswith(".sam"): # convert sam -> bam samtools.view(["-b"], in_bam, bam_aligned) - elif in_bam[-4:] == ".bam": + elif in_bam.endswith(".bam") or in_bam.endswith(".cram"): shutil.copyfile(in_bam, bam_aligned) # call samtools sort @@ -1198,7 +1205,14 @@ def plot_coverage(in_bam, out_plot_file, plot_format, plot_style, plot_width, pl colors = list(plt.rcParams['axes.prop_cycle'].by_key()['color']) # get the colors for this style segment_color = colors[segment_num%len(colors)] # pick a color, offset by the segment index - plt.fill_between(range(prior_domain_max, domain_max), position_depths, [0]*len(position_depths), linewidth=0, antialiased=True, color=segment_color) + + if plot_data_style == "filled": + plt.fill_between(range(prior_domain_max, domain_max), position_depths, [0]*len(position_depths), linewidth=0, antialiased=True, color=segment_color) + elif plot_data_style == "line": + plt.plot(range(prior_domain_max, domain_max), position_depths, antialiased=True, color=segment_color) + elif plot_data_style == "dots": + plt.plot(range(prior_domain_max, domain_max), position_depths, 'ro', antialiased=True, color=segment_color) + plt.title(plot_title, fontsize=font_size*1.2) plt.xlabel("bp", fontsize=font_size*1.1) @@ -1228,7 +1242,7 @@ def parser_plot_coverage(parser=argparse.ArgumentParser()): __commands__.append(('plot_coverage', parser_plot_coverage)) -def align_and_plot_coverage(out_plot_file, plot_format, plot_style, plot_width, plot_height, plot_title, base_q_threshold, mapping_q_threshold, max_coverage_depth, read_length_threshold, out_summary, +def align_and_plot_coverage(out_plot_file, plot_format, plot_data_style, plot_style, plot_width, plot_height, plot_title, base_q_threshold, mapping_q_threshold, max_coverage_depth, read_length_threshold, out_summary, in_bam, ref_fasta, out_bam=None, sensitive=False, min_score_to_output=None ): ''' @@ -1280,7 +1294,7 @@ def align_and_plot_coverage(out_plot_file, plot_format, plot_style, plot_width, # call plot function - plot_coverage(bam_aligned, out_plot_file, plot_format, plot_style, plot_width, plot_height, plot_title, base_q_threshold, mapping_q_threshold, max_coverage_depth, read_length_threshold, out_summary) + plot_coverage(bam_aligned, out_plot_file, plot_format, plot_data_style, plot_style, plot_width, plot_height, plot_title, base_q_threshold, mapping_q_threshold, max_coverage_depth, read_length_threshold, out_summary) # remove the output bam, unless it is needed if out_bam is None: From b4cc1643d175841dda60481dcdf3451e2396cc01 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Wed, 13 Jul 2016 23:57:02 -0400 Subject: [PATCH 07/13] refactor sam filtering to statement using samtools view --- read_utils.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/read_utils.py b/read_utils.py index d9fb43a17..079f9b33b 100755 --- a/read_utils.py +++ b/read_utils.py @@ -1269,31 +1269,24 @@ def align_and_plot_coverage(out_plot_file, plot_format, plot_data_style, plot_st bwa_opts + ["-T", str(map_threshold)] - aln_sam = mkstempfname('.sam') - aln_sam_filtered = mkstempfname('.filtered.sam') + aln_bam = mkstempfname('.bam') - bwa.mem(in_bam, ref_indexed, aln_sam, opts=bwa_opts) + bwa.mem(in_bam, ref_indexed, aln_bam, opts=bwa_opts) # @haydenm says: # For some reason (particularly when the --sensitive option is on), bwa # doesn't listen to its '-T' flag and outputs alignments with score less # than the '-T 30' threshold. So filter these: - os.system("grep \"^@\" " + aln_sam + " > " + aln_sam_filtered) - os.system("grep \"AS:i:\" " + aln_sam + " | awk -v threshold=" + str(map_threshold) + " '{split($14, subfield, \":\"); if(subfield[3]>=threshold) print $0}' >> " + aln_sam_filtered) - os.unlink(aln_sam) - - # convert sam -> bam - aln_bam_filtered = mkstempfname('.reference.fasta') - samtools.view(["-b"], aln_sam_filtered, aln_bam_filtered) - os.unlink(aln_sam_filtered) + aln_bam_filtered = mkstempfname('.filtered.bam') + samtools.view(["-b", "-h", "-q", str(map_threshold)], aln_bam, aln_bam_filtered) + os.unlink(aln_bam) samtools.sort(aln_bam_filtered, bam_aligned) os.unlink(aln_bam_filtered) samtools.index(bam_aligned) - - # call plot function + # -- call plot function -- plot_coverage(bam_aligned, out_plot_file, plot_format, plot_data_style, plot_style, plot_width, plot_height, plot_title, base_q_threshold, mapping_q_threshold, max_coverage_depth, read_length_threshold, out_summary) # remove the output bam, unless it is needed From 2687cf2e0fdd54ee8a0f29f6bafa3453aae4062e Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Thu, 14 Jul 2016 00:07:33 -0400 Subject: [PATCH 08/13] yapf formatting --- read_utils.py | 821 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 493 insertions(+), 328 deletions(-) diff --git a/read_utils.py b/read_utils.py index 079f9b33b..137b9fc01 100755 --- a/read_utils.py +++ b/read_utils.py @@ -62,9 +62,11 @@ def parser_purge_unmated(parser=argparse.ArgumentParser()): parser.add_argument('inFastq2', help='Input fastq file; 2nd end of paired-end reads.') parser.add_argument('outFastq1', help='Output fastq file; 1st end of paired-end reads.') parser.add_argument('outFastq2', help='Output fastq file; 2nd end of paired-end reads.') - parser.add_argument("--regex", - help="Perl regular expression to parse paired read IDs (default: %(default)s)", - default=r'^@(\S+)/[1|2]$') + parser.add_argument( + "--regex", + help="Perl regular expression to parse paired read IDs (default: %(default)s)", + default=r'^@(\S+)/[1|2]$' + ) util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None))) util.cmd.attach_main(parser, purge_unmated, split_args=True) return parser @@ -131,13 +133,17 @@ def main_index_fasta_samtools(args): def parser_index_fasta_picard(parser=argparse.ArgumentParser()): parser.add_argument('inFasta', help='Input reference genome, FASTA format.') - parser.add_argument('--JVMmemory', - default=tools.picard.CreateSequenceDictionaryTool.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)') - parser.add_argument('--picardOptions', - default=[], - nargs='*', - help='Optional arguments to Picard\'s CreateSequenceDictionary, OPTIONNAME=value ...') + parser.add_argument( + '--JVMmemory', + default=tools.picard.CreateSequenceDictionaryTool.jvmMemDefault, + help='JVM virtual memory size (default: %(default)s)' + ) + parser.add_argument( + '--picardOptions', + default=[], + nargs='*', + help='Optional arguments to Picard\'s CreateSequenceDictionary, OPTIONNAME=value ...' + ) util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None))) util.cmd.attach_main(parser, main_index_fasta_picard) return parser @@ -146,10 +152,10 @@ def parser_index_fasta_picard(parser=argparse.ArgumentParser()): def main_index_fasta_picard(args): '''Create an index file for a reference genome suitable for Picard/GATK.''' tools.picard.CreateSequenceDictionaryTool().execute( - args.inFasta, - overwrite=True, + args.inFasta, overwrite=True, picardOptions=args.picardOptions, - JVMmemory=args.JVMmemory) + JVMmemory=args.JVMmemory + ) return 0 @@ -164,18 +170,24 @@ def parser_mkdup_picard(parser=argparse.ArgumentParser()): parser.add_argument('inBams', help='Input reads, BAM format.', nargs='+') parser.add_argument('outBam', help='Output reads, BAM format.') parser.add_argument('--outMetrics', help='Output metrics file. Default is to dump to a temp file.', default=None) - parser.add_argument("--remove", - help="Instead of marking duplicates, remove them entirely (default: %(default)s)", - default=False, - action="store_true", - dest="remove") - parser.add_argument('--JVMmemory', - default=tools.picard.MarkDuplicatesTool.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)') - parser.add_argument('--picardOptions', - default=[], - nargs='*', - help='Optional arguments to Picard\'s MarkDuplicates, OPTIONNAME=value ...') + parser.add_argument( + "--remove", + help="Instead of marking duplicates, remove them entirely (default: %(default)s)", + default=False, + action="store_true", + dest="remove" + ) + parser.add_argument( + '--JVMmemory', + default=tools.picard.MarkDuplicatesTool.jvmMemDefault, + help='JVM virtual memory size (default: %(default)s)' + ) + parser.add_argument( + '--picardOptions', + default=[], + nargs='*', + help='Optional arguments to Picard\'s MarkDuplicates, OPTIONNAME=value ...' + ) util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None))) util.cmd.attach_main(parser, main_mkdup_picard) return parser @@ -187,11 +199,10 @@ def main_mkdup_picard(args): if args.remove: opts = ['REMOVE_DUPLICATES=true'] + opts tools.picard.MarkDuplicatesTool().execute( - args.inBams, - args.outBam, - args.outMetrics, - picardOptions=opts, - JVMmemory=args.JVMmemory) + args.inBams, args.outBam, + args.outMetrics, picardOptions=opts, + JVMmemory=args.JVMmemory + ) return 0 @@ -205,13 +216,17 @@ def main_mkdup_picard(args): def parser_revert_bam_picard(parser=argparse.ArgumentParser()): parser.add_argument('inBam', help='Input reads, BAM format.') parser.add_argument('outBam', help='Output reads, BAM format.') - parser.add_argument('--JVMmemory', - default=tools.picard.RevertSamTool.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)') - parser.add_argument('--picardOptions', - default=[], - nargs='*', - help='Optional arguments to Picard\'s RevertSam, OPTIONNAME=value ...') + parser.add_argument( + '--JVMmemory', + default=tools.picard.RevertSamTool.jvmMemDefault, + help='JVM virtual memory size (default: %(default)s)' + ) + parser.add_argument( + '--picardOptions', + default=[], + nargs='*', + help='Optional arguments to Picard\'s RevertSam, OPTIONNAME=value ...' + ) util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None))) util.cmd.attach_main(parser, main_revert_bam_picard) return parser @@ -233,13 +248,16 @@ def main_revert_bam_picard(args): def parser_picard(parser=argparse.ArgumentParser()): parser.add_argument('command', help='picard command') - parser.add_argument('--JVMmemory', - default=tools.picard.PicardTools.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)') - parser.add_argument('--picardOptions', - default=[], - nargs='*', - help='Optional arguments to Picard, OPTIONNAME=value ...') + parser.add_argument( + '--JVMmemory', + default=tools.picard.PicardTools.jvmMemDefault, + help='JVM virtual memory size (default: %(default)s)' + ) + parser.add_argument( + '--picardOptions', + default=[], nargs='*', + help='Optional arguments to Picard, OPTIONNAME=value ...' + ) util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None))) util.cmd.attach_main(parser, main_picard) return parser @@ -261,27 +279,37 @@ def main_picard(args): def parser_sort_bam(parser=argparse.ArgumentParser()): parser.add_argument('inBam', help='Input bam file.') parser.add_argument('outBam', help='Output bam file, sorted.') - parser.add_argument('sortOrder', - help='How to sort the reads. [default: %(default)s]', - choices=tools.picard.SortSamTool.valid_sort_orders, - default=tools.picard.SortSamTool.default_sort_order) - parser.add_argument("--index", - help="Index outBam (default: %(default)s)", - default=False, - action="store_true", - dest="index") - parser.add_argument("--md5", - help="MD5 checksum outBam (default: %(default)s)", - default=False, - action="store_true", - dest="md5") - parser.add_argument('--JVMmemory', - default=tools.picard.SortSamTool.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)') - parser.add_argument('--picardOptions', - default=[], - nargs='*', - help='Optional arguments to Picard\'s SortSam, OPTIONNAME=value ...') + parser.add_argument( + 'sortOrder', + help='How to sort the reads. [default: %(default)s]', + choices=tools.picard.SortSamTool.valid_sort_orders, + default=tools.picard.SortSamTool.default_sort_order + ) + parser.add_argument( + "--index", + help="Index outBam (default: %(default)s)", + default=False, + action="store_true", + dest="index" + ) + parser.add_argument( + "--md5", + help="MD5 checksum outBam (default: %(default)s)", + default=False, + action="store_true", + dest="md5" + ) + parser.add_argument( + '--JVMmemory', + default=tools.picard.SortSamTool.jvmMemDefault, + help='JVM virtual memory size (default: %(default)s)' + ) + parser.add_argument( + '--picardOptions', + default=[], + nargs='*', + help='Optional arguments to Picard\'s SortSam, OPTIONNAME=value ...' + ) util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None))) util.cmd.attach_main(parser, main_sort_bam) return parser @@ -295,11 +323,9 @@ def main_sort_bam(args): if args.md5: opts = ['CREATE_MD5_FILE=true'] + opts tools.picard.SortSamTool().execute( - args.inBam, - args.outBam, - args.sortOrder, - picardOptions=opts, - JVMmemory=args.JVMmemory) + args.inBam, args.outBam, args.sortOrder, + picardOptions=opts, JVMmemory=args.JVMmemory + ) return 0 @@ -313,13 +339,17 @@ def main_sort_bam(args): def parser_merge_bams(parser=argparse.ArgumentParser()): parser.add_argument('inBams', help='Input bam files.', nargs='+') parser.add_argument('outBam', help='Output bam file.') - parser.add_argument('--JVMmemory', - default=tools.picard.MergeSamFilesTool.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)') - parser.add_argument('--picardOptions', - default=[], - nargs='*', - help='Optional arguments to Picard\'s MergeSamFiles, OPTIONNAME=value ...') + parser.add_argument( + '--JVMmemory', + default=tools.picard.MergeSamFilesTool.jvmMemDefault, + help='JVM virtual memory size (default: %(default)s)' + ) + parser.add_argument( + '--picardOptions', + default=[], + nargs='*', + help='Optional arguments to Picard\'s MergeSamFiles, OPTIONNAME=value ...' + ) util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None))) util.cmd.attach_main(parser, main_merge_bams) return parser @@ -343,20 +373,26 @@ def parser_filter_bam(parser=argparse.ArgumentParser()): parser.add_argument('inBam', help='Input bam file.') parser.add_argument('readList', help='Input file of read IDs.') parser.add_argument('outBam', help='Output bam file.') - parser.add_argument("--exclude", - help="""If specified, readList is a list of reads to remove from input. + parser.add_argument( + "--exclude", + help="""If specified, readList is a list of reads to remove from input. Default behavior is to treat readList as an inclusion list (all unnamed reads are removed).""", - default=False, - action="store_true", - dest="exclude") - parser.add_argument('--JVMmemory', - default=tools.picard.FilterSamReadsTool.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)') - parser.add_argument('--picardOptions', - default=[], - nargs='*', - help='Optional arguments to Picard\'s FilterSamReads, OPTIONNAME=value ...') + default=False, + action="store_true", + dest="exclude" + ) + parser.add_argument( + '--JVMmemory', + default=tools.picard.FilterSamReadsTool.jvmMemDefault, + help='JVM virtual memory size (default: %(default)s)' + ) + parser.add_argument( + '--picardOptions', + default=[], + nargs='*', + help='Optional arguments to Picard\'s FilterSamReads, OPTIONNAME=value ...' + ) util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None))) util.cmd.attach_main(parser, main_filter_bam) return parser @@ -370,7 +406,8 @@ def main_filter_bam(args): args.readList, args.outBam, picardOptions=args.picardOptions, - JVMmemory=args.JVMmemory) + JVMmemory=args.JVMmemory + ) return 0 @@ -381,18 +418,20 @@ def main_filter_bam(args): # ======================= -def bam_to_fastq(inBam, outFastq1, outFastq2, outHeader=None, - JVMmemory=tools.picard.SamToFastqTool.jvmMemDefault, picardOptions=None): +def bam_to_fastq( + inBam, + outFastq1, + outFastq2, + outHeader=None, + JVMmemory=tools.picard.SamToFastqTool.jvmMemDefault, + picardOptions=None +): ''' Convert a bam file to a pair of fastq paired-end read files and optional text header. ''' picardOptions = picardOptions or [] - tools.picard.SamToFastqTool().execute(inBam, - outFastq1, - outFastq2, - picardOptions=picardOptions, - JVMmemory=JVMmemory) + tools.picard.SamToFastqTool().execute(inBam, outFastq1, outFastq2, picardOptions=picardOptions, JVMmemory=JVMmemory) if outHeader: tools.samtools.SamtoolsTool().dumpHeader(inBam, outHeader) return 0 @@ -403,13 +442,17 @@ def parser_bam_to_fastq(parser=argparse.ArgumentParser()): parser.add_argument('outFastq1', help='Output fastq file; 1st end of paired-end reads.') parser.add_argument('outFastq2', help='Output fastq file; 2nd end of paired-end reads.') parser.add_argument('--outHeader', help='Optional text file name that will receive bam header.', default=None) - parser.add_argument('--JVMmemory', - default=tools.picard.SamToFastqTool.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)') - parser.add_argument('--picardOptions', - default=[], - nargs='*', - help='Optional arguments to Picard\'s SamToFastq, OPTIONNAME=value ...') + parser.add_argument( + '--JVMmemory', + default=tools.picard.SamToFastqTool.jvmMemDefault, + help='JVM virtual memory size (default: %(default)s)' + ) + parser.add_argument( + '--picardOptions', + default=[], + nargs='*', + help='Optional arguments to Picard\'s SamToFastq, OPTIONNAME=value ...' + ) util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None))) util.cmd.attach_main(parser, bam_to_fastq, split_args=True) return parser @@ -422,8 +465,15 @@ def parser_bam_to_fastq(parser=argparse.ArgumentParser()): # ======================= -def fastq_to_bam(inFastq1, inFastq2, outBam, sampleName=None, header=None, - JVMmemory=tools.picard.FastqToSamTool.jvmMemDefault, picardOptions=None): +def fastq_to_bam( + inFastq1, + inFastq2, + outBam, + sampleName=None, + header=None, + JVMmemory=tools.picard.FastqToSamTool.jvmMemDefault, + picardOptions=None +): ''' Convert a pair of fastq paired-end read files and optional text header to a single bam file. ''' @@ -434,7 +484,7 @@ def fastq_to_bam(inFastq1, inFastq2, outBam, sampleName=None, header=None, else: fastqToSamOut = outBam if sampleName is None: - sampleName = 'Dummy' # Will get overwritten by rehead command + sampleName = 'Dummy' # Will get overwritten by rehead command if header: # With the header option, rehead will be called after FastqToSam. # This will invalidate any md5 file, which would be a slow to construct @@ -442,12 +492,11 @@ def fastq_to_bam(inFastq1, inFastq2, outBam, sampleName=None, header=None, if any(opt.lower() == 'CREATE_MD5_FILE=True'.lower() for opt in picardOptions): raise Exception("""CREATE_MD5_FILE is not allowed with '--header.'""") tools.picard.FastqToSamTool().execute( - inFastq1, - inFastq2, - sampleName, - fastqToSamOut, + inFastq1, inFastq2, + sampleName, fastqToSamOut, picardOptions=picardOptions, - JVMmemory=JVMmemory) + JVMmemory=JVMmemory + ) if header: tools.samtools.SamtoolsTool().reheader(fastqToSamOut, header, outBam) @@ -462,15 +511,19 @@ def parser_fastq_to_bam(parser=argparse.ArgumentParser()): headerGroup = parser.add_mutually_exclusive_group(required=True) headerGroup.add_argument('--sampleName', help='Sample name to insert into the read group header.') headerGroup.add_argument('--header', help='Optional text file containing header.') - parser.add_argument('--JVMmemory', - default=tools.picard.FastqToSamTool.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)') - parser.add_argument('--picardOptions', - default=[], - nargs='*', - help='''Optional arguments to Picard\'s FastqToSam, + parser.add_argument( + '--JVMmemory', + default=tools.picard.FastqToSamTool.jvmMemDefault, + help='JVM virtual memory size (default: %(default)s)' + ) + parser.add_argument( + '--picardOptions', + default=[], + nargs='*', + help='''Optional arguments to Picard\'s FastqToSam, OPTIONNAME=value ... Note that header-related options will be - overwritten by HEADER if present.''') + overwritten by HEADER if present.''' + ) util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None))) util.cmd.attach_main(parser, fastq_to_bam, split_args=True) return parser @@ -486,9 +539,15 @@ def parser_fastq_to_bam(parser=argparse.ArgumentParser()): defaultFormat = 'fastq' -def split_reads(inFileName, outPrefix, outSuffix="", - maxReads=None, numChunks=None, - indexLen=defaultIndexLen, fmt=defaultFormat): +def split_reads( + inFileName, + outPrefix, + outSuffix="", + maxReads=None, + numChunks=None, + indexLen=defaultIndexLen, + fmt=defaultFormat +): '''Split fasta or fastq file into chunks of maxReads reads or into numChunks chunks named outPrefix01, outPrefix02, etc. If both maxReads and numChunks are None, use defaultMaxReads. @@ -529,34 +588,45 @@ def split_reads(inFileName, outPrefix, outSuffix="", def parser_split_reads(parser=argparse.ArgumentParser()): parser.add_argument('inFileName', help='Input fastq or fasta file.') - parser.add_argument('outPrefix', - help='Output files will be named ${outPrefix}01${outSuffix}, ${outPrefix}02${outSuffix}...') + parser.add_argument( + 'outPrefix', + help='Output files will be named ${outPrefix}01${outSuffix}, ${outPrefix}02${outSuffix}...' + ) group = parser.add_mutually_exclusive_group(required=False) - group.add_argument('--maxReads', - type=int, - default=None, - help='''Maximum number of reads per chunk (default {:d} if neither - maxReads nor numChunks is specified).'''.format(defaultMaxReads)) - group.add_argument('--numChunks', - type=int, - default=None, - help='Number of output files, if maxReads is not specified.') - parser.add_argument('--indexLen', - type=int, - default=defaultIndexLen, - help='''Number of characters to append to outputPrefix for each + group.add_argument( + '--maxReads', + type=int, + default=None, + help='''Maximum number of reads per chunk (default {:d} if neither + maxReads nor numChunks is specified).'''.format(defaultMaxReads) + ) + group.add_argument( + '--numChunks', + type=int, default=None, + help='Number of output files, if maxReads is not specified.' + ) + parser.add_argument( + '--indexLen', + type=int, + default=defaultIndexLen, + help='''Number of characters to append to outputPrefix for each output file (default %(default)s). - Number of files must not exceed 10^INDEXLEN.''') - parser.add_argument('--format', - dest="fmt", - choices=['fastq', 'fasta'], - default=defaultFormat, - help='Input fastq or fasta file (default: %(default)s).') - parser.add_argument('--outSuffix', - default='', - help='''Output filename suffix (e.g. .fastq or .fastq.gz). + Number of files must not exceed 10^INDEXLEN.''' + ) + parser.add_argument( + '--format', + dest="fmt", + choices=['fastq', 'fasta'], + default=defaultFormat, + help='Input fastq or fasta file (default: %(default)s).' + ) + parser.add_argument( + '--outSuffix', + default='', + help='''Output filename suffix (e.g. .fastq or .fastq.gz). A suffix ending in .gz will cause the output file - to be gzip compressed. Default is no suffix.''') + to be gzip compressed. Default is no suffix.''' + ) util.cmd.attach_main(parser, split_reads, split_args=True) return parser @@ -603,11 +673,12 @@ def split_bam(inBam, outBams): if outBam == outBams[-1]: for line in inf: outf.write(line) - picard.execute("SamFormatConverter", - [ - 'INPUT=' + tmp_sam_reads, 'OUTPUT=' + outBam, 'VERBOSITY=WARNING' - ], - JVMmemory='512m') + picard.execute( + "SamFormatConverter", [ + 'INPUT=' + tmp_sam_reads, 'OUTPUT=' + outBam, 'VERBOSITY=WARNING' + ], + JVMmemory='512m' + ) os.unlink(tmp_sam_reads) os.unlink(bigsam) @@ -622,7 +693,6 @@ def parser_split_bam(parser=argparse.ArgumentParser()): __commands__.append(('split_bam', parser_split_bam)) - # ======================= # *** reheader_bam *** # ======================= @@ -648,14 +718,14 @@ def main_reheader_bam(args): CN broad BI ''' # read mapping file - mapper = dict((a+':'+b, a+':'+c) for a,b,c in util.file.read_tabfile(args.rgMap)) + mapper = dict((a + ':' + b, a + ':' + c) for a, b, c in util.file.read_tabfile(args.rgMap)) # read and convert bam header header_file = mkstempfname('.sam') with open(header_file, 'wt') as outf: for row in tools.samtools.SamtoolsTool().getHeader(args.inBam): if row[0] == '@RG': row = [mapper.get(x, x) for x in row] - outf.write('\t'.join(row)+'\n') + outf.write('\t'.join(row) + '\n') # write new bam with new header tools.samtools.SamtoolsTool().reheader(args.inBam, header_file, args.outBam) os.unlink(header_file) @@ -670,6 +740,8 @@ def parser_reheader_bams(parser=argparse.ArgumentParser()): util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None))) util.cmd.attach_main(parser, main_reheader_bams) return parser + + def main_reheader_bams(args): ''' Copy BAM files while renaming elements of the BAM header. The mapping file specifies which (key, old value, new value) mappings. For @@ -683,8 +755,8 @@ def main_reheader_bams(args): FN in2.bam out2.bam ''' # read mapping file - mapper = dict((a+':'+b, a+':'+c) for a,b,c in util.file.read_tabfile(args.rgMap) if a != 'FN') - files = list((b,c) for a,b,c in util.file.read_tabfile(args.rgMap) if a == 'FN') + mapper = dict((a + ':' + b, a + ':' + c) for a, b, c in util.file.read_tabfile(args.rgMap) if a != 'FN') + files = list((b, c) for a, b, c in util.file.read_tabfile(args.rgMap) if a == 'FN') header_file = mkstempfname('.sam') # read and convert bam headers for inBam, outBam in files: @@ -693,14 +765,15 @@ def main_reheader_bams(args): for row in tools.samtools.SamtoolsTool().getHeader(inBam): if row[0] == '@RG': row = [mapper.get(x, x) for x in row] - outf.write('\t'.join(row)+'\n') + outf.write('\t'.join(row) + '\n') # write new bam with new header tools.samtools.SamtoolsTool().reheader(inBam, header_file, outBam) os.unlink(header_file) return 0 -__commands__.append(('reheader_bams', parser_reheader_bams)) +__commands__.append(('reheader_bams', parser_reheader_bams)) + # ============================ # *** dup_remove_mvicuna *** # ============================ @@ -767,11 +840,13 @@ def rmdup_mvicuna_bam(inBam, outBam, JVMmemory=None): outf.write(line) os.unlink(fn) else: - log.warn("""no reads found in %s, - assuming that's because there's no reads in that read group""", fn) + log.warn( + """no reads found in %s, + assuming that's because there's no reads in that read group""", fn + ) # M-Vicuna DupRm to see what we should keep (append IDs to running file) - if os.path.getsize(infastqs[0])>0 or os.path.getsize(infastqs[1])>0: + if os.path.getsize(infastqs[0]) > 0 or os.path.getsize(infastqs[1]) > 0: mvicuna_fastqs_to_readlist(infastqs[0], infastqs[1], readList) for fn in infastqs: os.unlink(fn) @@ -784,9 +859,11 @@ def rmdup_mvicuna_bam(inBam, outBam, JVMmemory=None): def parser_rmdup_mvicuna_bam(parser=argparse.ArgumentParser()): parser.add_argument('inBam', help='Input reads, BAM format.') parser.add_argument('outBam', help='Output reads, BAM format.') - parser.add_argument('--JVMmemory', - default=tools.picard.FilterSamReadsTool.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)') + parser.add_argument( + '--JVMmemory', + default=tools.picard.FilterSamReadsTool.jvmMemDefault, + help='JVM virtual memory size (default: %(default)s)' + ) util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None))) util.cmd.attach_main(parser, rmdup_mvicuna_bam, split_args=True) return parser @@ -809,7 +886,8 @@ def parser_dup_remove_mvicuna(parser=argparse.ArgumentParser()): def main_dup_remove_mvicuna(args): '''Run mvicuna's duplicate removal operation on paired-end reads.''' tools.mvicuna.MvicunaTool().rmdup( - (args.inFastq1, args.inFastq2), (args.pairedOutFastq1, args.pairedOutFastq2), args.unpairedOutFastq) + (args.inFastq1, args.inFastq2), (args.pairedOutFastq1, args.pairedOutFastq2), args.unpairedOutFastq + ) return 0 @@ -866,12 +944,12 @@ def parser_novoalign(parser=argparse.ArgumentParser()): parser.add_argument('refFasta', help='Reference genome, FASTA format, pre-indexed by Novoindex.') parser.add_argument('outBam', help='Output reads, BAM format (aligned).') parser.add_argument('--options', default='-r Random', help='Novoalign options (default: %(default)s)') - parser.add_argument('--min_qual', - default=0, - help='Filter outBam to minimum mapping quality (default: %(default)s)') - parser.add_argument('--JVMmemory', - default=tools.picard.SortSamTool.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)') + parser.add_argument('--min_qual', default=0, help='Filter outBam to minimum mapping quality (default: %(default)s)') + parser.add_argument( + '--JVMmemory', + default=tools.picard.SortSamTool.jvmMemDefault, + help='JVM virtual memory size (default: %(default)s)' + ) util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None))) util.cmd.attach_main(parser, main_novoalign) return parser @@ -885,7 +963,8 @@ def main_novoalign(args): args.outBam, options=args.options.split(), min_qual=args.min_qual, - JVMmemory=args.JVMmemory) + JVMmemory=args.JVMmemory + ) return 0 @@ -907,15 +986,21 @@ def parser_novoindex(parser=argparse.ArgumentParser()): def parser_gatk_ug(parser=argparse.ArgumentParser()): parser.add_argument('inBam', help='Input reads, BAM format.') parser.add_argument('refFasta', help='Reference genome, FASTA format, pre-indexed by Picard.') - parser.add_argument('outVcf', - help='''Output calls in VCF format. If this filename ends with .gz, - GATK will BGZIP compress the output and produce a Tabix index file as well.''') - parser.add_argument('--options', - default='--min_base_quality_score 15 -ploidy 4', - help='UnifiedGenotyper options (default: %(default)s)') - parser.add_argument('--JVMmemory', - default=tools.gatk.GATKTool.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)') + parser.add_argument( + 'outVcf', + help='''Output calls in VCF format. If this filename ends with .gz, + GATK will BGZIP compress the output and produce a Tabix index file as well.''' + ) + parser.add_argument( + '--options', + default='--min_base_quality_score 15 -ploidy 4', + help='UnifiedGenotyper options (default: %(default)s)' + ) + parser.add_argument( + '--JVMmemory', + default=tools.gatk.GATKTool.jvmMemDefault, + help='JVM virtual memory size (default: %(default)s)' + ) util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None))) util.cmd.attach_main(parser, main_gatk_ug) return parser @@ -923,11 +1008,11 @@ def parser_gatk_ug(parser=argparse.ArgumentParser()): def main_gatk_ug(args): '''Call genotypes using the GATK UnifiedGenotyper.''' - tools.gatk.GATKTool().ug(args.inBam, - args.refFasta, - args.outVcf, - options=args.options.split(), - JVMmemory=args.JVMmemory) + tools.gatk.GATKTool().ug( + args.inBam, args.refFasta, + args.outVcf, options=args.options.split(), + JVMmemory=args.JVMmemory + ) return 0 @@ -938,9 +1023,11 @@ def parser_gatk_realign(parser=argparse.ArgumentParser()): parser.add_argument('inBam', help='Input reads, BAM format, aligned to refFasta.') parser.add_argument('refFasta', help='Reference genome, FASTA format, pre-indexed by Picard.') parser.add_argument('outBam', help='Realigned reads.') - parser.add_argument('--JVMmemory', - default=tools.gatk.GATKTool.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)') + parser.add_argument( + '--JVMmemory', + default=tools.gatk.GATKTool.jvmMemDefault, + help='JVM virtual memory size (default: %(default)s)' + ) util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None))) util.cmd.attach_main(parser, main_gatk_realign) parser.add_argument('--threads', default=1, help='Number of threads (default: %(default)s)') @@ -950,11 +1037,10 @@ def parser_gatk_realign(parser=argparse.ArgumentParser()): def main_gatk_realign(args): '''Local realignment of BAM files with GATK IndelRealigner.''' tools.gatk.GATKTool().local_realign( - args.inBam, - args.refFasta, - args.outBam, - JVMmemory=args.JVMmemory, - threads=args.threads) + args.inBam, args.refFasta, + args.outBam, JVMmemory=args.JVMmemory, + threads=args.threads + ) return 0 @@ -962,8 +1048,15 @@ def main_gatk_realign(args): # ========================= -def align_and_fix(inBam, refFasta, outBamAll=None, outBamFiltered=None, - novoalign_options='', JVMmemory=None, threads=1): + +def align_and_fix( + inBam, refFasta, + outBamAll=None, + outBamFiltered=None, + novoalign_options='', + JVMmemory=None, + threads=1 +): ''' Take reads, align to reference with Novoalign, mark duplicates with Picard, realign indels with GATK, and optionally filter final file to mapped/non-dupe reads. @@ -974,18 +1067,16 @@ def align_and_fix(inBam, refFasta, outBamAll=None, outBamFiltered=None, bam_aligned = mkstempfname('.aligned.bam') tools.novoalign.NovoalignTool().execute( - inBam, - refFasta, - bam_aligned, + inBam, refFasta, bam_aligned, options=novoalign_options.split(), - JVMmemory=JVMmemory) + JVMmemory=JVMmemory + ) bam_mkdup = mkstempfname('.mkdup.bam') tools.picard.MarkDuplicatesTool().execute( - [bam_aligned], - bam_mkdup, - picardOptions=['CREATE_INDEX=true'], - JVMmemory=JVMmemory) + [bam_aligned], bam_mkdup, picardOptions=['CREATE_INDEX=true'], + JVMmemory=JVMmemory + ) os.unlink(bam_aligned) bam_realigned = mkstempfname('.realigned.bam') @@ -1004,14 +1095,18 @@ def align_and_fix(inBam, refFasta, outBamAll=None, outBamFiltered=None, def parser_align_and_fix(parser=argparse.ArgumentParser()): parser.add_argument('inBam', help='Input unaligned reads, BAM format.') parser.add_argument('refFasta', help='Reference genome, FASTA format, pre-indexed by Picard and Novoalign.') - parser.add_argument('--outBamAll', - default=None, - help='''Aligned, sorted, and indexed reads. Unmapped reads are - retained and duplicate reads are marked, not removed.''') - parser.add_argument('--outBamFiltered', - default=None, - help='''Aligned, sorted, and indexed reads. Unmapped reads and - duplicate reads are removed from this file.''') + parser.add_argument( + '--outBamAll', + default=None, + help='''Aligned, sorted, and indexed reads. Unmapped reads are + retained and duplicate reads are marked, not removed.''' + ) + parser.add_argument( + '--outBamFiltered', + default=None, + help='''Aligned, sorted, and indexed reads. Unmapped reads and + duplicate reads are removed from this file.''' + ) parser.add_argument('--novoalign_options', default='-r Random', help='Novoalign options (default: %(default)s)') parser.add_argument('--JVMmemory', default='4g', help='JVM virtual memory size (default: %(default)s)') parser.add_argument('--threads', default=1, help='Number of threads (default: %(default)s)') @@ -1022,9 +1117,9 @@ def parser_align_and_fix(parser=argparse.ArgumentParser()): __commands__.append(('align_and_fix', parser_align_and_fix)) - # ========================= + def bwamem_idxstats(inBam, refFasta, outBam=None, outStats=None): ''' Take reads, align to reference with BWA-MEM and perform samtools idxstats. ''' @@ -1044,6 +1139,7 @@ def bwamem_idxstats(inBam, refFasta, outBam=None, outStats=None): if outBam is None: os.unlink(bam_aligned) + def parser_bwamem_idxstats(parser=argparse.ArgumentParser()): parser.add_argument('inBam', help='Input unaligned reads, BAM format.') parser.add_argument('refFasta', help='Reference genome, FASTA format, pre-indexed by Picard and Novoalign.') @@ -1053,83 +1149,112 @@ def parser_bwamem_idxstats(parser=argparse.ArgumentParser()): util.cmd.attach_main(parser, bwamem_idxstats, split_args=True) return parser + __commands__.append(('bwamem_idxstats', parser_bwamem_idxstats)) # ========================= -def parser_plot_coverage_common(parser=argparse.ArgumentParser()): # parser needs add_help=False? - parser.add_argument('in_bam', - help='Input reads, BAM format.') - parser.add_argument('out_plot_file', - help='The generated chart file') - parser.add_argument('--plotFormat', - dest="plot_format", - default=None, - type=str, - choices=list(plt.gcf().canvas.get_supported_filetypes().keys()), - metavar='', - help="File format of the coverage plot. By default it is inferred from the file extension of out_plot_file, but it can be set explicitly via --plotFormat. Valid formats include: " + ", ".join( list(plt.gcf().canvas.get_supported_filetypes().keys())) ) - parser.add_argument('--plotDataStyle', - dest="plot_data_style", - default="filled", - type=str, - choices=["filled","line","dots"], - metavar='', - help="The plot data display style. Valid options: " + ", ".join(["filled","line","dots"]) + " (default: %(default)s)") - parser.add_argument('--plotStyle', - dest="plot_style", - default="ggplot", - type=str, - choices=plt.style.available, - metavar='', - help="The plot visual style. Valid options: " + ", ".join(plt.style.available) + " (default: %(default)s)") - parser.add_argument('--plotWidth', - dest="plot_width", - default=1024, - type=int, - help="Width of the plot in pixels (default: %(default)s)") - parser.add_argument('--plotHeight', - dest="plot_height", - default=768, - type=int, - help="Width of the plot in pixels (default: %(default)s)") - parser.add_argument('--plotTitle', - dest="plot_title", - default="Coverage Plot", - type=str, - help="The title displayed on the coverage plot (default: '%(default)s')") - parser.add_argument('-q', - dest="base_q_threshold", - default=None, - type=int, - help="The minimum base quality threshold") - parser.add_argument('-Q', - dest="mapping_q_threshold", - default=None, - type=int, - help="The minimum mapping quality threshold") - parser.add_argument('-m', - dest="max_coverage_depth", - default=1000000, - type=int, - help="The max coverage depth (default: %(default)s)") - parser.add_argument('-l', - dest="read_length_threshold", - default=None, - type=int, - help="Read length threshold") - parser.add_argument('--outSummary', - dest="out_summary", - default=None, - type=str, - help="Coverage summary TSV file. Default is to write to temp.") + +def parser_plot_coverage_common(parser=argparse.ArgumentParser()): # parser needs add_help=False? + parser.add_argument('in_bam', help='Input reads, BAM format.') + parser.add_argument('out_plot_file', help='The generated chart file') + parser.add_argument( + '--plotFormat', + dest="plot_format", + default=None, + type=str, + choices=list(plt.gcf().canvas.get_supported_filetypes().keys()), + metavar='', + help="File format of the coverage plot. By default it is inferred from the file extension of out_plot_file, but it can be set explicitly via --plotFormat. Valid formats include: " + + ", ".join(list(plt.gcf().canvas.get_supported_filetypes().keys())) + ) + parser.add_argument( + '--plotDataStyle', + dest="plot_data_style", + default="filled", + type=str, + choices=["filled", "line", "dots"], + metavar='', + help="The plot data display style. Valid options: " + ", ".join(["filled", "line", "dots"]) + + " (default: %(default)s)" + ) + parser.add_argument( + '--plotStyle', + dest="plot_style", + default="ggplot", + type=str, + choices=plt.style.available, + metavar='', + help="The plot visual style. Valid options: " + ", ".join(plt.style.available) + " (default: %(default)s)" + ) + parser.add_argument( + '--plotWidth', + dest="plot_width", + default=1024, + type=int, + help="Width of the plot in pixels (default: %(default)s)" + ) + parser.add_argument( + '--plotHeight', + dest="plot_height", + default=768, + type=int, + help="Width of the plot in pixels (default: %(default)s)" + ) + parser.add_argument( + '--plotTitle', + dest="plot_title", + default="Coverage Plot", + type=str, + help="The title displayed on the coverage plot (default: '%(default)s')" + ) + parser.add_argument( + '-q', dest="base_q_threshold", + default=None, type=int, + help="The minimum base quality threshold" + ) + parser.add_argument( + '-Q', dest="mapping_q_threshold", + default=None, + type=int, help="The minimum mapping quality threshold" + ) + parser.add_argument( + '-m', + dest="max_coverage_depth", + default=1000000, + type=int, + help="The max coverage depth (default: %(default)s)" + ) + parser.add_argument('-l', dest="read_length_threshold", default=None, type=int, help="Read length threshold") + parser.add_argument( + '--outSummary', + dest="out_summary", + default=None, + type=str, + help="Coverage summary TSV file. Default is to write to temp." + ) return parser -def plot_coverage(in_bam, out_plot_file, plot_format, plot_data_style, plot_style, plot_width, plot_height, plot_title, base_q_threshold, mapping_q_threshold, max_coverage_depth, read_length_threshold, out_summary=None): + +def plot_coverage( + in_bam, + out_plot_file, + plot_format, + plot_data_style, + plot_style, + plot_width, + plot_height, + plot_title, + base_q_threshold, + mapping_q_threshold, + max_coverage_depth, + read_length_threshold, + out_summary=None +): ''' Generate a coverage plot from an aligned bam file ''' - + # TODO: remove this: #coverage_tsv_file = "/Users/tomkinsc/Downloads/plottest/test_multisegment.tsv" @@ -1138,8 +1263,10 @@ def plot_coverage(in_bam, out_plot_file, plot_format, plot_data_style, plot_styl # check if in_bam is aligned, if not raise an error num_mapped_reads = samtools.count(in_bam, opts=["-F", "4"]) if num_mapped_reads == 0: - raise Exception("""The bam file specified appears to have zero mapped reads. 'plot_coverage' requires an aligned bam file. You can try 'align_and_plot_coverage' if you don't mind a simple bwa alignment. \n File: %s""" % in_bam) - + raise Exception( + """The bam file specified appears to have zero mapped reads. 'plot_coverage' requires an aligned bam file. You can try 'align_and_plot_coverage' if you don't mind a simple bwa alignment. \n File: %s""" + % in_bam + ) if out_summary is None: coverage_tsv_file = mkstempfname('.summary.tsv') @@ -1157,13 +1284,13 @@ def plot_coverage(in_bam, out_plot_file, plot_format, plot_data_style, plot_styl # call samtools sort bam_sorted = mkstempfname('.aligned.bam') samtools.sort(bam_aligned, bam_sorted) - + # call samtools index samtools.index(bam_sorted) - + # call samtools depth opts = [] - opts += ['-aa'] # report coverate at "absolutely all" positions + opts += ['-aa'] # report coverate at "absolutely all" positions if base_q_threshold: opts += ["-q", str(base_q_threshold)] if mapping_q_threshold: @@ -1181,7 +1308,7 @@ def plot_coverage(in_bam, out_plot_file, plot_format, plot_data_style, plot_styl domain_max = 0 with open(coverage_tsv_file, "r") as tabfile: for row in csv.reader(tabfile, delimiter='\t'): - segment_depths.setdefault(row[0],[]).append(int(row[2])) + segment_depths.setdefault(row[0], []).append(int(row[2])) domain_max += 1 domain_max = 0 @@ -1189,34 +1316,45 @@ def plot_coverage(in_bam, out_plot_file, plot_format, plot_data_style, plot_styl fig = plt.gcf() DPI = fig.get_dpi() - fig.set_size_inches(float(plot_width)/float(DPI),float(plot_height)/float(DPI)) + fig.set_size_inches(float(plot_width) / float(DPI), float(plot_height) / float(DPI)) - font_size = math.sqrt((plot_width**2)+(plot_height**2))/float(DPI)*1.25 + font_size = math.sqrt((plot_width**2) + (plot_height**2)) / float(DPI) * 1.25 - ax = plt.subplot() # Defines ax variable by creating an empty plot + ax = plt.subplot() # Defines ax variable by creating an empty plot # Set the tick labels font for label in (ax.get_xticklabels() + ax.get_yticklabels()): - label.set_fontsize(font_size) + label.set_fontsize(font_size) for segment_num, (segment_name, position_depths) in enumerate(segment_depths.items()): prior_domain_max = domain_max domain_max += len(position_depths) - colors = list(plt.rcParams['axes.prop_cycle'].by_key()['color']) # get the colors for this style - segment_color = colors[segment_num%len(colors)] # pick a color, offset by the segment index - + colors = list(plt.rcParams['axes.prop_cycle'].by_key()['color']) # get the colors for this style + segment_color = colors[segment_num % len(colors)] # pick a color, offset by the segment index + if plot_data_style == "filled": - plt.fill_between(range(prior_domain_max, domain_max), position_depths, [0]*len(position_depths), linewidth=0, antialiased=True, color=segment_color) + plt.fill_between( + range(prior_domain_max, domain_max), + position_depths, [0] * len(position_depths), + linewidth=0, + antialiased=True, + color=segment_color + ) elif plot_data_style == "line": plt.plot(range(prior_domain_max, domain_max), position_depths, antialiased=True, color=segment_color) elif plot_data_style == "dots": - plt.plot(range(prior_domain_max, domain_max), position_depths, 'ro', antialiased=True, color=segment_color) - - - plt.title(plot_title, fontsize=font_size*1.2) - plt.xlabel("bp", fontsize=font_size*1.1) - plt.ylabel("read depth", fontsize=font_size*1.1) + plt.plot( + range(prior_domain_max, domain_max), + position_depths, + 'ro', + antialiased=True, + color=segment_color + ) + + plt.title(plot_title, fontsize=font_size * 1.2) + plt.xlabel("bp", fontsize=font_size * 1.1) + plt.ylabel("read depth", fontsize=font_size * 1.1) # to squash a backend renderer error on OSX related to tight layout if plt.get_backend().lower() in ['agg', 'macosx']: @@ -1224,7 +1362,7 @@ def plot_coverage(in_bam, out_plot_file, plot_format, plot_data_style, plot_styl else: fig.tight_layout() - plt.savefig(out_plot_file, format=plot_format, dpi=DPI) #, bbox_inches='tight') + plt.savefig(out_plot_file, format=plot_format, dpi=DPI) #, bbox_inches='tight') log.info("Coverage plot saved to: " + out_plot_file) os.unlink(bam_aligned) @@ -1232,7 +1370,7 @@ def plot_coverage(in_bam, out_plot_file, plot_format, plot_data_style, plot_styl if not out_summary: os.unlink(coverage_tsv_file) - + def parser_plot_coverage(parser=argparse.ArgumentParser()): parser = parser_plot_coverage_common(parser) @@ -1240,11 +1378,29 @@ def parser_plot_coverage(parser=argparse.ArgumentParser()): util.cmd.attach_main(parser, plot_coverage, split_args=True) return parser + __commands__.append(('plot_coverage', parser_plot_coverage)) -def align_and_plot_coverage(out_plot_file, plot_format, plot_data_style, plot_style, plot_width, plot_height, plot_title, base_q_threshold, mapping_q_threshold, max_coverage_depth, read_length_threshold, out_summary, - in_bam, ref_fasta, out_bam=None, sensitive=False, min_score_to_output=None - ): + +def align_and_plot_coverage( + out_plot_file, + plot_format, + plot_data_style, + plot_style, + plot_width, + plot_height, + plot_title, + base_q_threshold, + mapping_q_threshold, + max_coverage_depth, + read_length_threshold, + out_summary, + in_bam, + ref_fasta, + out_bam=None, + sensitive=False, + min_score_to_output=None +): ''' Take reads, align to reference with BWA-MEM, and generate a coverage plot ''' @@ -1285,9 +1441,12 @@ def align_and_plot_coverage(out_plot_file, plot_format, plot_data_style, plot_st os.unlink(aln_bam_filtered) samtools.index(bam_aligned) - + # -- call plot function -- - plot_coverage(bam_aligned, out_plot_file, plot_format, plot_data_style, plot_style, plot_width, plot_height, plot_title, base_q_threshold, mapping_q_threshold, max_coverage_depth, read_length_threshold, out_summary) + plot_coverage( + bam_aligned, out_plot_file, plot_format, plot_data_style, plot_style, plot_width, plot_height, plot_title, + base_q_threshold, mapping_q_threshold, max_coverage_depth, read_length_threshold, out_summary + ) # remove the output bam, unless it is needed if out_bam is None: @@ -1295,37 +1454,43 @@ def align_and_plot_coverage(out_plot_file, plot_format, plot_data_style, plot_st # remove the files created by bwa index. # The empty extension causes the original fasta file to be removed - for ext in [".amb",".ann",".bwt",".bwa",".pac",".sa",""]: - file_to_remove = ref_indexed+ext + for ext in [".amb", ".ann", ".bwt", ".bwa", ".pac", ".sa", ""]: + file_to_remove = ref_indexed + ext if os.path.isfile(file_to_remove): - os.unlink( file_to_remove ) + os.unlink(file_to_remove) + def parser_align_and_plot_coverage(parser=argparse.ArgumentParser()): parser = parser_plot_coverage_common(parser) - parser.add_argument('ref_fasta', - default=None, - help='Reference genome, FASTA format.') - parser.add_argument('--outBam', - dest="out_bam", - default=None, - help='Output aligned, indexed BAM file. Default is to write to temp.') - parser.add_argument('--sensitive', - action="store_true", - help="Equivalent to giving bwa: '-k 12 -A 1 -B 1 -O 1 -E 1' ") - parser.add_argument('-T', - dest="min_score_to_output", - default=30, - type=int, - help="The min score to output during alignment (default: %(default)s)") + parser.add_argument('ref_fasta', default=None, help='Reference genome, FASTA format.') + parser.add_argument( + '--outBam', + dest="out_bam", + default=None, + help='Output aligned, indexed BAM file. Default is to write to temp.' + ) + parser.add_argument( + '--sensitive', action="store_true", + help="Equivalent to giving bwa: '-k 12 -A 1 -B 1 -O 1 -E 1' " + ) + parser.add_argument( + '-T', + dest="min_score_to_output", + default=30, + type=int, + help="The min score to output during alignment (default: %(default)s)" + ) util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None))) util.cmd.attach_main(parser, align_and_plot_coverage, split_args=True) return parser + __commands__.append(('align_and_plot_coverage', parser_align_and_plot_coverage)) # ========================= + def full_parser(): return util.cmd.make_parser(__commands__, __doc__) From 0f1310e9f89a5f24674d2b0e0fd599505a52b8c6 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Thu, 14 Jul 2016 01:09:15 -0400 Subject: [PATCH 09/13] remove sam->bam conversion prior to samtools.sort call --- read_utils.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/read_utils.py b/read_utils.py index 137b9fc01..fae290110 100755 --- a/read_utils.py +++ b/read_utils.py @@ -1273,17 +1273,9 @@ def plot_coverage( else: coverage_tsv_file = out_summary - bam_aligned = mkstempfname('.aligned.bam') - - if in_bam.endswith(".sam"): - # convert sam -> bam - samtools.view(["-b"], in_bam, bam_aligned) - elif in_bam.endswith(".bam") or in_bam.endswith(".cram"): - shutil.copyfile(in_bam, bam_aligned) - # call samtools sort - bam_sorted = mkstempfname('.aligned.bam') - samtools.sort(bam_aligned, bam_sorted) + bam_sorted = mkstempfname('.sorted.bam') + samtools.sort(in_bam, bam_sorted, args=["-O", "bam"]) # call samtools index samtools.index(bam_sorted) @@ -1301,6 +1293,7 @@ def plot_coverage( opts += ["-l", str(read_length_threshold)] samtools.depth(bam_sorted, coverage_tsv_file, opts) + os.unlink(bam_sorted) # ---- create plot based on coverage_tsv_file ---- @@ -1365,9 +1358,6 @@ def plot_coverage( plt.savefig(out_plot_file, format=plot_format, dpi=DPI) #, bbox_inches='tight') log.info("Coverage plot saved to: " + out_plot_file) - os.unlink(bam_aligned) - os.unlink(bam_sorted) - if not out_summary: os.unlink(coverage_tsv_file) From 7b80bd6408b2f6e4422be234fbdef93fbfda0f88 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Thu, 14 Jul 2016 01:17:40 -0400 Subject: [PATCH 10/13] move plot_coverage() and align_and_plot_coverage() to reports.py move plot_coverage() and align_and_plot_coverage() to reports.py, along with their respective parser functions --- read_utils.py | 332 ------------------------------------------------- reports.py | 333 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 333 insertions(+), 332 deletions(-) diff --git a/read_utils.py b/read_utils.py index fae290110..601b39f66 100755 --- a/read_utils.py +++ b/read_utils.py @@ -14,13 +14,8 @@ import os import tempfile import shutil -import csv -from collections import OrderedDict from Bio import SeqIO -import matplotlib -matplotlib.use('Agg') -import matplotlib.pyplot as plt import util.cmd import util.file @@ -1154,333 +1149,6 @@ def parser_bwamem_idxstats(parser=argparse.ArgumentParser()): # ========================= - -def parser_plot_coverage_common(parser=argparse.ArgumentParser()): # parser needs add_help=False? - parser.add_argument('in_bam', help='Input reads, BAM format.') - parser.add_argument('out_plot_file', help='The generated chart file') - parser.add_argument( - '--plotFormat', - dest="plot_format", - default=None, - type=str, - choices=list(plt.gcf().canvas.get_supported_filetypes().keys()), - metavar='', - help="File format of the coverage plot. By default it is inferred from the file extension of out_plot_file, but it can be set explicitly via --plotFormat. Valid formats include: " - + ", ".join(list(plt.gcf().canvas.get_supported_filetypes().keys())) - ) - parser.add_argument( - '--plotDataStyle', - dest="plot_data_style", - default="filled", - type=str, - choices=["filled", "line", "dots"], - metavar='', - help="The plot data display style. Valid options: " + ", ".join(["filled", "line", "dots"]) + - " (default: %(default)s)" - ) - parser.add_argument( - '--plotStyle', - dest="plot_style", - default="ggplot", - type=str, - choices=plt.style.available, - metavar='', - help="The plot visual style. Valid options: " + ", ".join(plt.style.available) + " (default: %(default)s)" - ) - parser.add_argument( - '--plotWidth', - dest="plot_width", - default=1024, - type=int, - help="Width of the plot in pixels (default: %(default)s)" - ) - parser.add_argument( - '--plotHeight', - dest="plot_height", - default=768, - type=int, - help="Width of the plot in pixels (default: %(default)s)" - ) - parser.add_argument( - '--plotTitle', - dest="plot_title", - default="Coverage Plot", - type=str, - help="The title displayed on the coverage plot (default: '%(default)s')" - ) - parser.add_argument( - '-q', dest="base_q_threshold", - default=None, type=int, - help="The minimum base quality threshold" - ) - parser.add_argument( - '-Q', dest="mapping_q_threshold", - default=None, - type=int, help="The minimum mapping quality threshold" - ) - parser.add_argument( - '-m', - dest="max_coverage_depth", - default=1000000, - type=int, - help="The max coverage depth (default: %(default)s)" - ) - parser.add_argument('-l', dest="read_length_threshold", default=None, type=int, help="Read length threshold") - parser.add_argument( - '--outSummary', - dest="out_summary", - default=None, - type=str, - help="Coverage summary TSV file. Default is to write to temp." - ) - return parser - - -def plot_coverage( - in_bam, - out_plot_file, - plot_format, - plot_data_style, - plot_style, - plot_width, - plot_height, - plot_title, - base_q_threshold, - mapping_q_threshold, - max_coverage_depth, - read_length_threshold, - out_summary=None -): - ''' - Generate a coverage plot from an aligned bam file - ''' - - # TODO: remove this: - #coverage_tsv_file = "/Users/tomkinsc/Downloads/plottest/test_multisegment.tsv" - - samtools = tools.samtools.SamtoolsTool() - - # check if in_bam is aligned, if not raise an error - num_mapped_reads = samtools.count(in_bam, opts=["-F", "4"]) - if num_mapped_reads == 0: - raise Exception( - """The bam file specified appears to have zero mapped reads. 'plot_coverage' requires an aligned bam file. You can try 'align_and_plot_coverage' if you don't mind a simple bwa alignment. \n File: %s""" - % in_bam - ) - - if out_summary is None: - coverage_tsv_file = mkstempfname('.summary.tsv') - else: - coverage_tsv_file = out_summary - - # call samtools sort - bam_sorted = mkstempfname('.sorted.bam') - samtools.sort(in_bam, bam_sorted, args=["-O", "bam"]) - - # call samtools index - samtools.index(bam_sorted) - - # call samtools depth - opts = [] - opts += ['-aa'] # report coverate at "absolutely all" positions - if base_q_threshold: - opts += ["-q", str(base_q_threshold)] - if mapping_q_threshold: - opts += ["-Q", str(mapping_q_threshold)] - if max_coverage_depth: - opts += ["-m", str(max_coverage_depth)] - if read_length_threshold: - opts += ["-l", str(read_length_threshold)] - - samtools.depth(bam_sorted, coverage_tsv_file, opts) - os.unlink(bam_sorted) - - # ---- create plot based on coverage_tsv_file ---- - - segment_depths = OrderedDict() - domain_max = 0 - with open(coverage_tsv_file, "r") as tabfile: - for row in csv.reader(tabfile, delimiter='\t'): - segment_depths.setdefault(row[0], []).append(int(row[2])) - domain_max += 1 - - domain_max = 0 - with plt.style.context(plot_style): - - fig = plt.gcf() - DPI = fig.get_dpi() - fig.set_size_inches(float(plot_width) / float(DPI), float(plot_height) / float(DPI)) - - font_size = math.sqrt((plot_width**2) + (plot_height**2)) / float(DPI) * 1.25 - - ax = plt.subplot() # Defines ax variable by creating an empty plot - - # Set the tick labels font - for label in (ax.get_xticklabels() + ax.get_yticklabels()): - label.set_fontsize(font_size) - - for segment_num, (segment_name, position_depths) in enumerate(segment_depths.items()): - prior_domain_max = domain_max - domain_max += len(position_depths) - - colors = list(plt.rcParams['axes.prop_cycle'].by_key()['color']) # get the colors for this style - segment_color = colors[segment_num % len(colors)] # pick a color, offset by the segment index - - if plot_data_style == "filled": - plt.fill_between( - range(prior_domain_max, domain_max), - position_depths, [0] * len(position_depths), - linewidth=0, - antialiased=True, - color=segment_color - ) - elif plot_data_style == "line": - plt.plot(range(prior_domain_max, domain_max), position_depths, antialiased=True, color=segment_color) - elif plot_data_style == "dots": - plt.plot( - range(prior_domain_max, domain_max), - position_depths, - 'ro', - antialiased=True, - color=segment_color - ) - - plt.title(plot_title, fontsize=font_size * 1.2) - plt.xlabel("bp", fontsize=font_size * 1.1) - plt.ylabel("read depth", fontsize=font_size * 1.1) - - # to squash a backend renderer error on OSX related to tight layout - if plt.get_backend().lower() in ['agg', 'macosx']: - fig.set_tight_layout(True) - else: - fig.tight_layout() - - plt.savefig(out_plot_file, format=plot_format, dpi=DPI) #, bbox_inches='tight') - log.info("Coverage plot saved to: " + out_plot_file) - - if not out_summary: - os.unlink(coverage_tsv_file) - - -def parser_plot_coverage(parser=argparse.ArgumentParser()): - parser = parser_plot_coverage_common(parser) - util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None))) - util.cmd.attach_main(parser, plot_coverage, split_args=True) - return parser - - -__commands__.append(('plot_coverage', parser_plot_coverage)) - - -def align_and_plot_coverage( - out_plot_file, - plot_format, - plot_data_style, - plot_style, - plot_width, - plot_height, - plot_title, - base_q_threshold, - mapping_q_threshold, - max_coverage_depth, - read_length_threshold, - out_summary, - in_bam, - ref_fasta, - out_bam=None, - sensitive=False, - min_score_to_output=None -): - ''' - Take reads, align to reference with BWA-MEM, and generate a coverage plot - ''' - if out_bam is None: - bam_aligned = mkstempfname('.aligned.bam') - else: - bam_aligned = out_bam - - ref_indexed = mkstempfname('.reference.fasta') - shutil.copyfile(ref_fasta, ref_indexed) - - bwa = tools.bwa.Bwa() - samtools = tools.samtools.SamtoolsTool() - - bwa.index(ref_indexed) - - bwa_opts = [] - if sensitive: - bwa_opts + "-k 12 -A 1 -B 1 -O 1 -E 1".split() - - map_threshold = min_score_to_output or 30 - - bwa_opts + ["-T", str(map_threshold)] - - aln_bam = mkstempfname('.bam') - - bwa.mem(in_bam, ref_indexed, aln_bam, opts=bwa_opts) - - # @haydenm says: - # For some reason (particularly when the --sensitive option is on), bwa - # doesn't listen to its '-T' flag and outputs alignments with score less - # than the '-T 30' threshold. So filter these: - aln_bam_filtered = mkstempfname('.filtered.bam') - samtools.view(["-b", "-h", "-q", str(map_threshold)], aln_bam, aln_bam_filtered) - os.unlink(aln_bam) - - samtools.sort(aln_bam_filtered, bam_aligned) - os.unlink(aln_bam_filtered) - - samtools.index(bam_aligned) - - # -- call plot function -- - plot_coverage( - bam_aligned, out_plot_file, plot_format, plot_data_style, plot_style, plot_width, plot_height, plot_title, - base_q_threshold, mapping_q_threshold, max_coverage_depth, read_length_threshold, out_summary - ) - - # remove the output bam, unless it is needed - if out_bam is None: - os.unlink(bam_aligned) - - # remove the files created by bwa index. - # The empty extension causes the original fasta file to be removed - for ext in [".amb", ".ann", ".bwt", ".bwa", ".pac", ".sa", ""]: - file_to_remove = ref_indexed + ext - if os.path.isfile(file_to_remove): - os.unlink(file_to_remove) - - -def parser_align_and_plot_coverage(parser=argparse.ArgumentParser()): - parser = parser_plot_coverage_common(parser) - parser.add_argument('ref_fasta', default=None, help='Reference genome, FASTA format.') - parser.add_argument( - '--outBam', - dest="out_bam", - default=None, - help='Output aligned, indexed BAM file. Default is to write to temp.' - ) - parser.add_argument( - '--sensitive', action="store_true", - help="Equivalent to giving bwa: '-k 12 -A 1 -B 1 -O 1 -E 1' " - ) - parser.add_argument( - '-T', - dest="min_score_to_output", - default=30, - type=int, - help="The min score to output during alignment (default: %(default)s)" - ) - - util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None))) - util.cmd.attach_main(parser, align_and_plot_coverage, split_args=True) - return parser - - -__commands__.append(('align_and_plot_coverage', parser_align_and_plot_coverage)) - -# ========================= - - def full_parser(): return util.cmd.make_parser(__commands__, __doc__) diff --git a/reports.py b/reports.py index 158e2114f..9b3d78527 100755 --- a/reports.py +++ b/reports.py @@ -12,16 +12,21 @@ import time from collections import OrderedDict import csv +import math import pysam import Bio.SeqIO import Bio.AlignIO from Bio.Alphabet.IUPAC import IUPACUnambiguousDNA +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt import util.cmd import util.file import util.misc import tools.samtools +import tools.bwa import assembly import interhost from util.stats import mean, median @@ -373,6 +378,334 @@ def parser_consolidate_spike_count(parser=argparse.ArgumentParser()): __commands__.append(('consolidate_spike_count', parser_consolidate_spike_count)) + +# ========================= + + +def parser_plot_coverage_common(parser=argparse.ArgumentParser()): # parser needs add_help=False? + parser.add_argument('in_bam', help='Input reads, BAM format.') + parser.add_argument('out_plot_file', help='The generated chart file') + parser.add_argument( + '--plotFormat', + dest="plot_format", + default=None, + type=str, + choices=list(plt.gcf().canvas.get_supported_filetypes().keys()), + metavar='', + help="File format of the coverage plot. By default it is inferred from the file extension of out_plot_file, but it can be set explicitly via --plotFormat. Valid formats include: " + + ", ".join(list(plt.gcf().canvas.get_supported_filetypes().keys())) + ) + parser.add_argument( + '--plotDataStyle', + dest="plot_data_style", + default="filled", + type=str, + choices=["filled", "line", "dots"], + metavar='', + help="The plot data display style. Valid options: " + ", ".join(["filled", "line", "dots"]) + + " (default: %(default)s)" + ) + parser.add_argument( + '--plotStyle', + dest="plot_style", + default="ggplot", + type=str, + choices=plt.style.available, + metavar='', + help="The plot visual style. Valid options: " + ", ".join(plt.style.available) + " (default: %(default)s)" + ) + parser.add_argument( + '--plotWidth', + dest="plot_width", + default=1024, + type=int, + help="Width of the plot in pixels (default: %(default)s)" + ) + parser.add_argument( + '--plotHeight', + dest="plot_height", + default=768, + type=int, + help="Width of the plot in pixels (default: %(default)s)" + ) + parser.add_argument( + '--plotTitle', + dest="plot_title", + default="Coverage Plot", + type=str, + help="The title displayed on the coverage plot (default: '%(default)s')" + ) + parser.add_argument( + '-q', dest="base_q_threshold", + default=None, type=int, + help="The minimum base quality threshold" + ) + parser.add_argument( + '-Q', dest="mapping_q_threshold", + default=None, + type=int, help="The minimum mapping quality threshold" + ) + parser.add_argument( + '-m', + dest="max_coverage_depth", + default=1000000, + type=int, + help="The max coverage depth (default: %(default)s)" + ) + parser.add_argument('-l', dest="read_length_threshold", default=None, type=int, help="Read length threshold") + parser.add_argument( + '--outSummary', + dest="out_summary", + default=None, + type=str, + help="Coverage summary TSV file. Default is to write to temp." + ) + return parser + + +def plot_coverage( + in_bam, + out_plot_file, + plot_format, + plot_data_style, + plot_style, + plot_width, + plot_height, + plot_title, + base_q_threshold, + mapping_q_threshold, + max_coverage_depth, + read_length_threshold, + out_summary=None +): + ''' + Generate a coverage plot from an aligned bam file + ''' + + # TODO: remove this: + #coverage_tsv_file = "/Users/tomkinsc/Downloads/plottest/test_multisegment.tsv" + + samtools = tools.samtools.SamtoolsTool() + + # check if in_bam is aligned, if not raise an error + num_mapped_reads = samtools.count(in_bam, opts=["-F", "4"]) + if num_mapped_reads == 0: + raise Exception( + """The bam file specified appears to have zero mapped reads. 'plot_coverage' requires an aligned bam file. You can try 'align_and_plot_coverage' if you don't mind a simple bwa alignment. \n File: %s""" + % in_bam + ) + + if out_summary is None: + coverage_tsv_file = util.file.mkstempfname('.summary.tsv') + else: + coverage_tsv_file = out_summary + + # call samtools sort + bam_sorted = util.file.mkstempfname('.sorted.bam') + samtools.sort(in_bam, bam_sorted, args=["-O", "bam"]) + + # call samtools index + samtools.index(bam_sorted) + + # call samtools depth + opts = [] + opts += ['-aa'] # report coverate at "absolutely all" positions + if base_q_threshold: + opts += ["-q", str(base_q_threshold)] + if mapping_q_threshold: + opts += ["-Q", str(mapping_q_threshold)] + if max_coverage_depth: + opts += ["-m", str(max_coverage_depth)] + if read_length_threshold: + opts += ["-l", str(read_length_threshold)] + + samtools.depth(bam_sorted, coverage_tsv_file, opts) + os.unlink(bam_sorted) + + # ---- create plot based on coverage_tsv_file ---- + + segment_depths = OrderedDict() + domain_max = 0 + with open(coverage_tsv_file, "r") as tabfile: + for row in csv.reader(tabfile, delimiter='\t'): + segment_depths.setdefault(row[0], []).append(int(row[2])) + domain_max += 1 + + domain_max = 0 + with plt.style.context(plot_style): + + fig = plt.gcf() + DPI = fig.get_dpi() + fig.set_size_inches(float(plot_width) / float(DPI), float(plot_height) / float(DPI)) + + font_size = math.sqrt((plot_width**2) + (plot_height**2)) / float(DPI) * 1.25 + + ax = plt.subplot() # Defines ax variable by creating an empty plot + + # Set the tick labels font + for label in (ax.get_xticklabels() + ax.get_yticklabels()): + label.set_fontsize(font_size) + + for segment_num, (segment_name, position_depths) in enumerate(segment_depths.items()): + prior_domain_max = domain_max + domain_max += len(position_depths) + + colors = list(plt.rcParams['axes.prop_cycle'].by_key()['color']) # get the colors for this style + segment_color = colors[segment_num % len(colors)] # pick a color, offset by the segment index + + if plot_data_style == "filled": + plt.fill_between( + range(prior_domain_max, domain_max), + position_depths, [0] * len(position_depths), + linewidth=0, + antialiased=True, + color=segment_color + ) + elif plot_data_style == "line": + plt.plot(range(prior_domain_max, domain_max), position_depths, antialiased=True, color=segment_color) + elif plot_data_style == "dots": + plt.plot( + range(prior_domain_max, domain_max), + position_depths, + 'ro', + antialiased=True, + color=segment_color + ) + + plt.title(plot_title, fontsize=font_size * 1.2) + plt.xlabel("bp", fontsize=font_size * 1.1) + plt.ylabel("read depth", fontsize=font_size * 1.1) + + # to squash a backend renderer error on OSX related to tight layout + if plt.get_backend().lower() in ['agg', 'macosx']: + fig.set_tight_layout(True) + else: + fig.tight_layout() + + plt.savefig(out_plot_file, format=plot_format, dpi=DPI) #, bbox_inches='tight') + log.info("Coverage plot saved to: " + out_plot_file) + + if not out_summary: + os.unlink(coverage_tsv_file) + + +def parser_plot_coverage(parser=argparse.ArgumentParser()): + parser = parser_plot_coverage_common(parser) + util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None))) + util.cmd.attach_main(parser, plot_coverage, split_args=True) + return parser + + +__commands__.append(('plot_coverage', parser_plot_coverage)) + + +def align_and_plot_coverage( + out_plot_file, + plot_format, + plot_data_style, + plot_style, + plot_width, + plot_height, + plot_title, + base_q_threshold, + mapping_q_threshold, + max_coverage_depth, + read_length_threshold, + out_summary, + in_bam, + ref_fasta, + out_bam=None, + sensitive=False, + min_score_to_output=None +): + ''' + Take reads, align to reference with BWA-MEM, and generate a coverage plot + ''' + if out_bam is None: + bam_aligned = util.file.mkstempfname('.aligned.bam') + else: + bam_aligned = out_bam + + ref_indexed = util.file.mkstempfname('.reference.fasta') + shutil.copyfile(ref_fasta, ref_indexed) + + bwa = tools.bwa.Bwa() + samtools = tools.samtools.SamtoolsTool() + + bwa.index(ref_indexed) + + bwa_opts = [] + if sensitive: + bwa_opts + "-k 12 -A 1 -B 1 -O 1 -E 1".split() + + map_threshold = min_score_to_output or 30 + + bwa_opts + ["-T", str(map_threshold)] + + aln_bam = util.file.mkstempfname('.bam') + + bwa.mem(in_bam, ref_indexed, aln_bam, opts=bwa_opts) + + # @haydenm says: + # For some reason (particularly when the --sensitive option is on), bwa + # doesn't listen to its '-T' flag and outputs alignments with score less + # than the '-T 30' threshold. So filter these: + aln_bam_filtered = util.file.mkstempfname('.filtered.bam') + samtools.view(["-b", "-h", "-q", str(map_threshold)], aln_bam, aln_bam_filtered) + os.unlink(aln_bam) + + samtools.sort(aln_bam_filtered, bam_aligned) + os.unlink(aln_bam_filtered) + + samtools.index(bam_aligned) + + # -- call plot function -- + plot_coverage( + bam_aligned, out_plot_file, plot_format, plot_data_style, plot_style, plot_width, plot_height, plot_title, + base_q_threshold, mapping_q_threshold, max_coverage_depth, read_length_threshold, out_summary + ) + + # remove the output bam, unless it is needed + if out_bam is None: + os.unlink(bam_aligned) + + # remove the files created by bwa index. + # The empty extension causes the original fasta file to be removed + for ext in [".amb", ".ann", ".bwt", ".bwa", ".pac", ".sa", ""]: + file_to_remove = ref_indexed + ext + if os.path.isfile(file_to_remove): + os.unlink(file_to_remove) + + +def parser_align_and_plot_coverage(parser=argparse.ArgumentParser()): + parser = parser_plot_coverage_common(parser) + parser.add_argument('ref_fasta', default=None, help='Reference genome, FASTA format.') + parser.add_argument( + '--outBam', + dest="out_bam", + default=None, + help='Output aligned, indexed BAM file. Default is to write to temp.' + ) + parser.add_argument( + '--sensitive', action="store_true", + help="Equivalent to giving bwa: '-k 12 -A 1 -B 1 -O 1 -E 1' " + ) + parser.add_argument( + '-T', + dest="min_score_to_output", + default=30, + type=int, + help="The min score to output during alignment (default: %(default)s)" + ) + + util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None))) + util.cmd.attach_main(parser, align_and_plot_coverage, split_args=True) + return parser + + +__commands__.append(('align_and_plot_coverage', parser_align_and_plot_coverage)) + + # ======================= From a07cd3ef981fd3520185579a416d92d2a17ad380 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Thu, 14 Jul 2016 01:19:57 -0400 Subject: [PATCH 11/13] missed hitting save before last commit; add missing shutil import --- reports.py | 1 + 1 file changed, 1 insertion(+) diff --git a/reports.py b/reports.py index 9b3d78527..99bf159da 100755 --- a/reports.py +++ b/reports.py @@ -13,6 +13,7 @@ from collections import OrderedDict import csv import math +import shutil import pysam import Bio.SeqIO From bfdcccc623d92ef2e7399fda3141b14fa7a2a456 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Thu, 14 Jul 2016 17:41:17 -0400 Subject: [PATCH 12/13] refactor bwa.py to remove the sam->bam conversion added earlier in this branch --- tools/bwa.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/tools/bwa.py b/tools/bwa.py index 37a870aeb..3fe19af20 100644 --- a/tools/bwa.py +++ b/tools/bwa.py @@ -7,7 +7,6 @@ import os import os.path import subprocess -import shutil import tools import tools.samtools @@ -56,20 +55,18 @@ def mem(self, inReads, refDb, outAlign, opts=None, threads=None): fq1 = util.file.mkstempfname('.1.fastq') fq2 = util.file.mkstempfname('.2.fastq') aln_sam = util.file.mkstempfname('.sam') - aln_sam_sorted = util.file.mkstempfname('sorted.sam') samtools.bam2fq(inReads, fq1, fq2) - self.execute('mem', opts + ['-t', str(threads), refDb, fq1, fq2], stdout=aln_sam) + + if '-t' not in opts: + threads = threads or utils.misc.available_cpu_count() + opts.extend( ('-t', str(threads)) ) + + self.execute('mem', opts + [refDb, fq1, fq2], stdout=aln_sam) + os.unlink(fq1) os.unlink(fq2) - samtools.sort(aln_sam, aln_sam_sorted) + samtools.sort(aln_sam, outAlign) os.unlink(aln_sam) - # cannot index sam files; only do so if a bam is desired + # cannot index sam files; only do so if a bam/cram is desired if outAlign.endswith(".bam") or outAlign.endswith(".cram"): - # convert sam -> bam - samtools.view(["-b"], aln_sam_sorted, outAlign) samtools.index(outAlign) - elif outAlign.endswith(".sam"): - shutil.copyfile(aln_sam_sorted, outAlign) - os.unlink(aln_sam_sorted) - - From 0f47ccc6ef1f7c4930a18788edfcfaf571993a4f Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Thu, 14 Jul 2016 18:55:23 -0400 Subject: [PATCH 13/13] add DPI option as parameter for plot_coverage --- reports.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/reports.py b/reports.py index 99bf159da..99ff054b1 100755 --- a/reports.py +++ b/reports.py @@ -429,6 +429,13 @@ def parser_plot_coverage_common(parser=argparse.ArgumentParser()): # parser n type=int, help="Width of the plot in pixels (default: %(default)s)" ) + parser.add_argument( + '--plotDPI', + dest="plot_dpi", + default=plt.gcf().get_dpi(), + type=int, + help="dots per inch for rendered output, more useful for vector modes (default: %(default)s)" + ) parser.add_argument( '--plotTitle', dest="plot_title", @@ -472,6 +479,7 @@ def plot_coverage( plot_style, plot_width, plot_height, + plot_dpi, plot_title, base_q_threshold, mapping_q_threshold, @@ -534,12 +542,11 @@ def plot_coverage( domain_max = 0 with plt.style.context(plot_style): - fig = plt.gcf() - DPI = fig.get_dpi() + DPI = plot_dpi or fig.get_dpi() fig.set_size_inches(float(plot_width) / float(DPI), float(plot_height) / float(DPI)) - font_size = math.sqrt((plot_width**2) + (plot_height**2)) / float(DPI) * 1.25 + font_size = (2.5 * plot_height) / float(DPI) ax = plt.subplot() # Defines ax variable by creating an empty plot @@ -607,6 +614,7 @@ def align_and_plot_coverage( plot_style, plot_width, plot_height, + plot_dpi, plot_title, base_q_threshold, mapping_q_threshold, @@ -662,7 +670,7 @@ def align_and_plot_coverage( # -- call plot function -- plot_coverage( - bam_aligned, out_plot_file, plot_format, plot_data_style, plot_style, plot_width, plot_height, plot_title, + bam_aligned, out_plot_file, plot_format, plot_data_style, plot_style, plot_width, plot_height, plot_dpi, plot_title, base_q_threshold, mapping_q_threshold, max_coverage_depth, read_length_threshold, out_summary )