diff --git a/bio_bits/miseq_report.py b/bio_bits/miseq_report.py new file mode 100755 index 0000000..3182c0c --- /dev/null +++ b/bio_bits/miseq_report.py @@ -0,0 +1,207 @@ +#!/usr/bin/env python + +import gzip +import multiprocessing +import sys +import logging +import itertools + +from Bio.SeqIO.QualityIO import FastqGeneralIterator +from path import Path +import sh +import pandas as pd + +logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger('miseqreport') + +try: + from itertools import imap +except ImportError: + pass + +class MiSeqRun(object): + def __init__(self, runpath=None): + self.runpath = Path(runpath) + self.basecalls = self.runpath / 'Data' / 'Intensities' / 'BaseCalls' + self.samplesheet = self.basecalls / 'SampleSheet.csv' + self.rundate = self.runpath.basename().split('_')[0] + + def get_fastq_gz(self): + return self.basecalls.glob('*_R1_*.fastq.gz') + \ + self.basecalls.glob('*_R2_*.fastq.gz') + + def get_paired_fastq_gz(self): + gzs = zip( + self.basecalls.glob('*_R1_*.fastq.gz'), + self.basecalls.glob('*_R2_*.fastq.gz') + ) + return gzs + + def get_samplenames(self): + samplenames = [] + for line in self.samplesheet.lines(retain=False): + if line.endswith(',,'): + samplenames.append(line.split(',')[0]) + return samplenames + + def sample_from_fq(self, fqpath): + return fqpath.basename().split('_')[0] + + def get_fqs_per_samplename(self): + persample = [] + for pair in self.get_paired_fastq_gz(): + persample.append((self.sample_from_fq(pair[0]), pair)) + return persample + + def all_sample_read_stats(self, threads=None): + ''' + Return [(samplename, sample_read_stats(samplenamepair)), ...] which contains stats for each sample + ''' + pool = multiprocessing.Pool(processes=threads) + return sorted(pool.map(stat, self.get_fqs_per_samplename())) + + def run_stats(self, allstats=None, threads=None): + ''' + CSV output for the run + :param iterable allstats: (samplename, (sample_read_stats(forward), sampleread_stats(reverse)) + ''' + if allstats is None: + allstats = self.all_sample_read_stats(threads=threads) + headers = "SampleName,TotalReads,FReads,FQual,FLen,FBases,RReads,RQual,RLen,RBases" + rowfmt = ','.join(['{{{0}}}'.format(i) for i in range(len(headers.split(',')))]) + report = Path('report.{0}.csv'.format(self.rundate)) + report.write_text(headers + '\n') + for sn, pair in allstats: + f,r = pair + row = (sn,f[0]+r[0]) + f + r + report.write_text(rowfmt.format(*row) + '\n', append=True) + return report + +def run_fastqc(fqpaths, outdir=None, threads=None, tmpdir=None): + ''' + Run fastqc on fqpath + ''' + # Remove old one + op = Path(outdir) + op.rmtree_p() + op.mkdir() + print sh.fastqc(fqpaths, threads=threads, outdir=outdir, dir=tmpdir) + # build fastqc html index + index = op / 'index.html' + html = [''] + for link in sorted(op.glob('*.html')): + html.append('') + html.append(''.format(link.basename())) + html.append('') + html.append('
{0}
') + index.write_lines(html) + +def stat(x): + return (x[0], sample_read_stats(x[1])) + +def sample_read_stats(paired): + ''' + Return (read_stats(forward), read_stats(reverse)) + ''' + return map(read_stats, paired) + +def read_stats(readpath): + ''' + Return (#reads, avgqual, avglen, totalbases) for a fastq.gz read + ''' + reads = 0 + quals, lengths = [],[] + with gzip.open(readpath) as fh: + logger.info('Getting stats for {0}'.format(readpath.basename())) + for title, seq, qual in FastqGeneralIterator(fh): + reads += 1 + lengths.append(len(seq)) + qual = map(lambda q: ord(q)-33, qual) + quals.append(sum(qual)/float(len(qual))) + totalbases = sum(lengths) + return (reads, sum(quals)/float(len(quals)), totalbases/float(len(lengths)), totalbases) + +def html_report(reportpath, fastqcdir): + ''' + reportpath - path to report.csv + fastqcdir - Path.py.Path object representing directory of fastqc outputs + ''' + html_report_pth = reportpath + '.html' + fastqc_htmls = sorted(map(lambda x: (x.basename().split('_')[0],x), fastqcdir.glob('*fastqc.html'))) + paired_htmls = dict(map(lambda x: (x[0], map(lambda j: j[1], x[1])), itertools.groupby(fastqc_htmls, lambda x: x[0]))) + # Colors + defaultcolor = '#FFFFFF' + # Color map for stddev +/- mean + # 1 is the color for 1 stdev above mean + # -1 is color for for 1 stdev below mean + colors = { + 1: '#82E0AA', + 2: '#2ECC71', + 3: '#239B56', + -1: '#C0392B', + -2: '#922B21', + -3: '#641E16' + } + # Read the csv report + df = pd.read_csv(reportpath, index_col=0) + # Build table + html = [''] + # Put headers in table + html.append('') + html.append(''.format(df.index.name)) + html += map(lambda h: ''.format(h), df.keys()) + html.append('') + # Put values in table + for sn, values in df.iterrows(): + #print list(paired_htmls[sn]) + htmls = list(paired_htmls[sn]) + html.append('') + # Samplename col + href = '{1}' + fhref = href.format(htmls[0], 'R1') + rhref = href.format(htmls[1], 'R2') + html.append(''.format(sn,fhref,rhref)) + for col, val in values.iteritems(): + # The following two calculations should be moved out such that + # they are only calculated once per column and then put into a dict + # that can be looked up by colheader + # Current column standard dev + colstd = df[col].std() + # Current column mean + colmean = df[col].mean() + # Creates a new mapping from colors based on stddev/mean + cmap = map(lambda x: (colstd*x[0]+colmean, x[1]), colors.iteritems()) + # Find values above mean where val is greater than stddev + colorv = filter(lambda x: x[0] >= colmean and val >= x[0], cmap) + # If non found, then try looking below mean + if colorv: + color = max(colorv)[1] + else: + color = filter(lambda x: x[0] <= colmean and val <= x[0], cmap) + if color: + color = min(color)[1] + else: + color = defaultcolor + html.append(''.format(color, val)) + html.append('') + html.append('
{0}{0}
{0} ({1},{2}){1}
') + html_report_pth.write_lines(html) + return html_report_pth + +if __name__ == '__main__': + if len(sys.argv) != 5: + print "Usage: report.py " + sys.exit(1) + runpath, fqcoutdir, threads, tmpdir = sys.argv[1:] + threads = int(threads) + run = MiSeqRun(runpath) + logger.info("Creating csv report") + reportpth = run.run_stats(threads=threads) + logger.info("Wrote {0}".format(reportpth)) + logger.info("Creating {0}".format(fqcoutdir)) + logger.debug(str(run.get_fastq_gz())) + run_fastqc(run.get_fastq_gz(), outdir=fqcoutdir, threads=threads, tmpdir=tmpdir) + logger.info("Created {0}".format(fqcoutdir)) + logger.info("Writing html report") + reportpth = html_report(reportpth, Path(fqcoutdir)) + logger.info("Wrote {0}".format(reportpth)) diff --git a/tests/test_miseq_report.py b/tests/test_miseq_report.py new file mode 100644 index 0000000..3fb7a00 --- /dev/null +++ b/tests/test_miseq_report.py @@ -0,0 +1,74 @@ +import unittest +from os.path import dirname, basename, join, abspath +from nose.plugins.attrib import attr + +import report + +run = '/media/VD_Research/TMPDIR/RUNS/160204_M04171_0004_000000000-AJVMN' +samplename = '0126-2010' +_f = run + '/Data/Intensities/BaseCalls/{0}_S30_L001_R1_001.fastq.gz'.format(samplename) +_fr = 644208 +_r = run + '/Data/Intensities/BaseCalls/{0}_S30_L001_R2_001.fastq.gz'.format(samplename) +_rr = _fr + +class TestMiSeqRun(unittest.TestCase): + def setUp(self): + self.inst = report.MiSeqRun(run) + + def test_gets_fastq_gzs(self): + sn = self.inst.get_samplenames() + fqgz = self.inst.get_fastq_gz() + self.assertEqual(2+2*len(sn), len(fqgz)) + + def test_gets_paired_fastq_gzs(self): + samplenames = self.inst.get_samplenames() + fqgz = self.inst.get_fastq_gz() + paired = self.inst.get_paired_fastq_gz() + print paired + self.assertEqual(len(fqgz)/2, len(paired)) + undet = paired[-1:] + paired = paired[:-1] + for forward, reverse in paired: + self.assertIn(basename(forward).split('_')[0], samplenames) + self.assertIn(basename(reverse).split('_')[0], samplenames) + self.assertEqual(basename(undet[0][0]).split('_')[0], 'Undetermined') + self.assertEqual(basename(undet[0][1]).split('_')[0], 'Undetermined') + + def test_gets_samplenames(self): + self.assertTrue(self.inst.get_samplenames() > 2) + + def test_get_fqqs_per_samplename(self): + x = self.inst.get_fqs_per_samplename() + for sn, fr in x: + f,r = fr + self.assertTrue(f.basename().startswith(sn), "{0} does not start with {1}".format(f, sn)) + self.assertTrue(r.basename().startswith(sn), "{0} does not start with {1}".format(r, sn)) + + @attr('slow') + def test_read_stats(self): + r,q,l,b = self.inst.read_stats(_f) + assert _fr == r, "{0} != {1}".format(_fr, r) + + @attr('slow') + def test_sample_read_stats(self): + x = self.inst.sample_read_stats((_f,_r)) + r = x[0][0] + x[1][0] + assert _fr+_rr >= r, "{0} != {1}".format(_fr+_rr, r) + + def test_sets_rundate(self): + self.inst.rundate = '160204' + + @attr('slow') + def test_integration_sample_read_stats(self): + fqs = dict(self.inst.get_fqs_per_samplename()) + x = fqs[samplename] + x = self.inst.sample_read_stats(x) + r = x[0][0] + x[1][0] + assert _fr+_rr >= r, "{0} != {1}".format(_fr+_rr, r) + + def test_run_stats(self): + allstats = [ + ('s1', ((1,1,1,1),(1,1,1,1))), + ('s2', ((2,2,2,2),(2,2,2,2))), + ] + x = self.inst.run_stats(allstats)