diff --git a/bio_bits/miseq_report.py b/bio_bits/miseq_report.py
new file mode 100755
index 0000000..3182c0c
--- /dev/null
+++ b/bio_bits/miseq_report.py
@@ -0,0 +1,207 @@
+#!/usr/bin/env python
+
+import gzip
+import multiprocessing
+import sys
+import logging
+import itertools
+
+from Bio.SeqIO.QualityIO import FastqGeneralIterator
+from path import Path
+import sh
+import pandas as pd
+
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger('miseqreport')
+
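+# itertools.imap only exists on Python 2; the ImportError is ignored on Python 3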
+try:
+ from itertools import imap
+except ImportError:
+ pass
+
+class MiSeqRun(object):
+ def __init__(self, runpath=None):
+ self.runpath = Path(runpath)
+ self.basecalls = self.runpath / 'Data' / 'Intensities' / 'BaseCalls'
+ self.samplesheet = self.basecalls / 'SampleSheet.csv'
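+        # The run folder name begins with the run date (e.g. 160204_M04171_0004_...)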
+ self.rundate = self.runpath.basename().split('_')[0]
+
+ def get_fastq_gz(self):
+ return self.basecalls.glob('*_R1_*.fastq.gz') + \
+ self.basecalls.glob('*_R2_*.fastq.gz')
+
+ def get_paired_fastq_gz(self):
+ gzs = zip(
+ self.basecalls.glob('*_R1_*.fastq.gz'),
+ self.basecalls.glob('*_R2_*.fastq.gz')
+ )
+ return gzs
+
+ def get_samplenames(self):
+ samplenames = []
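+        # Rows ending with ',,' are assumed to be the per-sample data rows of SampleSheet.csv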
+ for line in self.samplesheet.lines(retain=False):
+ if line.endswith(',,'):
+ samplenames.append(line.split(',')[0])
+ return samplenames
+
+ def sample_from_fq(self, fqpath):
+ return fqpath.basename().split('_')[0]
+
+ def get_fqs_per_samplename(self):
+ persample = []
+ for pair in self.get_paired_fastq_gz():
+ persample.append((self.sample_from_fq(pair[0]), pair))
+ return persample
+
+ def all_sample_read_stats(self, threads=None):
+ '''
+        Return [(samplename, (read_stats(forward), read_stats(reverse))), ...], one entry per sample
+ '''
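+        # stat is a module-level helper so multiprocessing.Pool can pickle it for the worker processes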
+ pool = multiprocessing.Pool(processes=threads)
+ return sorted(pool.map(stat, self.get_fqs_per_samplename()))
+
+ def run_stats(self, allstats=None, threads=None):
+ '''
+ CSV output for the run
+        :param iterable allstats: [(samplename, (read_stats(forward), read_stats(reverse))), ...]
+ '''
+ if allstats is None:
+ allstats = self.all_sample_read_stats(threads=threads)
+ headers = "SampleName,TotalReads,FReads,FQual,FLen,FBases,RReads,RQual,RLen,RBases"
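+        # Build a positional format string '{0},{1},...,{9}', one field per header column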
+ rowfmt = ','.join(['{{{0}}}'.format(i) for i in range(len(headers.split(',')))])
+ report = Path('report.{0}.csv'.format(self.rundate))
+ report.write_text(headers + '\n')
+ for sn, pair in allstats:
+ f,r = pair
+ row = (sn,f[0]+r[0]) + f + r
+ report.write_text(rowfmt.format(*row) + '\n', append=True)
+ return report
+
+def run_fastqc(fqpaths, outdir=None, threads=None, tmpdir=None):
+ '''
+    Run fastqc on each file in fqpaths and build an index.html of the reports in outdir
+ '''
+ # Remove old one
+ op = Path(outdir)
+ op.rmtree_p()
+ op.mkdir()
+ print sh.fastqc(fqpaths, threads=threads, outdir=outdir, dir=tmpdir)
+ # build fastqc html index
+ index = op / 'index.html'
+    html = ['<table>']
+    for link in sorted(op.glob('*.html')):
+        html.append('<tr>')
+        html.append('<td><a href="{0}">{0}</a></td>'.format(link.basename()))
+        html.append('</tr>')
+    html.append('</table>')
+ index.write_lines(html)
+
+def stat(x):
+ return (x[0], sample_read_stats(x[1]))
+
+def sample_read_stats(paired):
+ '''
+ Return (read_stats(forward), read_stats(reverse))
+ '''
+ return map(read_stats, paired)
+
+def read_stats(readpath):
+ '''
+ Return (#reads, avgqual, avglen, totalbases) for a fastq.gz read
+ '''
+ reads = 0
+ quals, lengths = [],[]
+ with gzip.open(readpath) as fh:
+        logger.info('Getting stats for {0}'.format(Path(readpath).basename()))
+ for title, seq, qual in FastqGeneralIterator(fh):
+ reads += 1
+ lengths.append(len(seq))
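+            # Decode Phred+33 quality characters and record this read's mean quality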
+ qual = map(lambda q: ord(q)-33, qual)
+ quals.append(sum(qual)/float(len(qual)))
+ totalbases = sum(lengths)
+ return (reads, sum(quals)/float(len(quals)), totalbases/float(len(lengths)), totalbases)
+
+def html_report(reportpath, fastqcdir):
+ '''
+    reportpath - path.Path to the report csv written by MiSeqRun.run_stats
+    fastqcdir - path.Path directory containing the fastqc output html files
+ '''
+ html_report_pth = reportpath + '.html'
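+    # Group the fastqc html files by sample name (filename prefix before the first '_') so R1/R2 link per sample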
+ fastqc_htmls = sorted(map(lambda x: (x.basename().split('_')[0],x), fastqcdir.glob('*fastqc.html')))
+ paired_htmls = dict(map(lambda x: (x[0], map(lambda j: j[1], x[1])), itertools.groupby(fastqc_htmls, lambda x: x[0])))
+ # Colors
+ defaultcolor = '#FFFFFF'
+    # Color map keyed by standard deviations from the mean
+    # 1 is the color for 1 stdev above the mean
+    # -1 is the color for 1 stdev below the mean
+ colors = {
+ 1: '#82E0AA',
+ 2: '#2ECC71',
+ 3: '#239B56',
+ -1: '#C0392B',
+ -2: '#922B21',
+ -3: '#641E16'
+ }
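+    # e.g. a value between mean+2*std and mean+3*std gets colors[2]; values within one stdev of the mean keep defaultcolor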
+ # Read the csv report
+ df = pd.read_csv(reportpath, index_col=0)
+ # Build table
+    html = ['<table>']
+    # Put headers in table
+    html.append('<tr>')
+    html.append('<th>{0}</th>'.format(df.index.name))
+    html += map(lambda h: '<th>{0}</th>'.format(h), df.keys())
+    html.append('</tr>')
+ # Put values in table
+ for sn, values in df.iterrows():
+ #print list(paired_htmls[sn])
+ htmls = list(paired_htmls[sn])
+        html.append('<tr>')
+        # Samplename col
+        href = '<a href="{0}">{1}</a>'
+        fhref = href.format(htmls[0], 'R1')
+        rhref = href.format(htmls[1], 'R2')
+        html.append('<td>{0} ({1},{2})</td>'.format(sn,fhref,rhref))
+ for col, val in values.iteritems():
+ # The following two calculations should be moved out such that
+ # they are only calculated once per column and then put into a dict
+ # that can be looked up by colheader
+ # Current column standard dev
+ colstd = df[col].std()
+ # Current column mean
+ colmean = df[col].mean()
+ # Creates a new mapping from colors based on stddev/mean
+ cmap = map(lambda x: (colstd*x[0]+colmean, x[1]), colors.iteritems())
+                # Find thresholds above the mean that val meets or exceeds
+                colorv = filter(lambda x: x[0] >= colmean and val >= x[0], cmap)
+                # If none found, look below the mean instead
+ if colorv:
+ color = max(colorv)[1]
+ else:
+ color = filter(lambda x: x[0] <= colmean and val <= x[0], cmap)
+ if color:
+ color = min(color)[1]
+ else:
+ color = defaultcolor
+            html.append('<td bgcolor="{0}">{1}</td>'.format(color, val))
+        html.append('</tr>')
+    html.append('</table>')
+ html_report_pth.write_lines(html)
+ return html_report_pth
+
+if __name__ == '__main__':
+ if len(sys.argv) != 5:
+ print "Usage: report.py "
+ sys.exit(1)
+ runpath, fqcoutdir, threads, tmpdir = sys.argv[1:]
+ threads = int(threads)
+ run = MiSeqRun(runpath)
+ logger.info("Creating csv report")
+ reportpth = run.run_stats(threads=threads)
+ logger.info("Wrote {0}".format(reportpth))
+ logger.info("Creating {0}".format(fqcoutdir))
+ logger.debug(str(run.get_fastq_gz()))
+ run_fastqc(run.get_fastq_gz(), outdir=fqcoutdir, threads=threads, tmpdir=tmpdir)
+ logger.info("Created {0}".format(fqcoutdir))
+ logger.info("Writing html report")
+ reportpth = html_report(reportpth, Path(fqcoutdir))
+ logger.info("Wrote {0}".format(reportpth))
diff --git a/tests/test_miseq_report.py b/tests/test_miseq_report.py
new file mode 100644
index 0000000..3fb7a00
--- /dev/null
+++ b/tests/test_miseq_report.py
@@ -0,0 +1,74 @@
+import unittest
+from os.path import dirname, basename, join, abspath
+from nose.plugins.attrib import attr
+
+from bio_bits import miseq_report as report
+
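+# Test fixture: a local MiSeq run directory and the expected R1 read count for one of its samples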
+run = '/media/VD_Research/TMPDIR/RUNS/160204_M04171_0004_000000000-AJVMN'
+samplename = '0126-2010'
+_f = run + '/Data/Intensities/BaseCalls/{0}_S30_L001_R1_001.fastq.gz'.format(samplename)
+_fr = 644208
+_r = run + '/Data/Intensities/BaseCalls/{0}_S30_L001_R2_001.fastq.gz'.format(samplename)
+_rr = _fr
+
+class TestMiSeqRun(unittest.TestCase):
+ def setUp(self):
+ self.inst = report.MiSeqRun(run)
+
+ def test_gets_fastq_gzs(self):
+ sn = self.inst.get_samplenames()
+ fqgz = self.inst.get_fastq_gz()
+ self.assertEqual(2+2*len(sn), len(fqgz))
+
+ def test_gets_paired_fastq_gzs(self):
+ samplenames = self.inst.get_samplenames()
+ fqgz = self.inst.get_fastq_gz()
+ paired = self.inst.get_paired_fastq_gz()
+ print paired
+ self.assertEqual(len(fqgz)/2, len(paired))
+ undet = paired[-1:]
+ paired = paired[:-1]
+ for forward, reverse in paired:
+ self.assertIn(basename(forward).split('_')[0], samplenames)
+ self.assertIn(basename(reverse).split('_')[0], samplenames)
+ self.assertEqual(basename(undet[0][0]).split('_')[0], 'Undetermined')
+ self.assertEqual(basename(undet[0][1]).split('_')[0], 'Undetermined')
+
+ def test_gets_samplenames(self):
+        self.assertTrue(len(self.inst.get_samplenames()) > 2)
+
+    def test_get_fqs_per_samplename(self):
+ x = self.inst.get_fqs_per_samplename()
+ for sn, fr in x:
+ f,r = fr
+ self.assertTrue(f.basename().startswith(sn), "{0} does not start with {1}".format(f, sn))
+ self.assertTrue(r.basename().startswith(sn), "{0} does not start with {1}".format(r, sn))
+
+ @attr('slow')
+ def test_read_stats(self):
+        r,q,l,b = report.read_stats(_f)
+ assert _fr == r, "{0} != {1}".format(_fr, r)
+
+ @attr('slow')
+ def test_sample_read_stats(self):
+        x = report.sample_read_stats((_f,_r))
+ r = x[0][0] + x[1][0]
+ assert _fr+_rr >= r, "{0} != {1}".format(_fr+_rr, r)
+
+ def test_sets_rundate(self):
+        self.assertEqual('160204', self.inst.rundate)
+
+ @attr('slow')
+ def test_integration_sample_read_stats(self):
+ fqs = dict(self.inst.get_fqs_per_samplename())
+ x = fqs[samplename]
+        x = report.sample_read_stats(x)
+ r = x[0][0] + x[1][0]
+ assert _fr+_rr >= r, "{0} != {1}".format(_fr+_rr, r)
+
+ def test_run_stats(self):
+ allstats = [
+ ('s1', ((1,1,1,1),(1,1,1,1))),
+ ('s2', ((2,2,2,2),(2,2,2,2))),
+ ]
+ x = self.inst.run_stats(allstats)