Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

miseq_report that works, but tests are not that great #93

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
207 changes: 207 additions & 0 deletions bio_bits/miseq_report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
#!/usr/bin/env python

import gzip
import multiprocessing
import sys
import logging
import itertools

from Bio.SeqIO.QualityIO import FastqGeneralIterator
from path import Path
import sh
import pandas as pd

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger('miseqreport')

try:
from itertools import imap
except ImportError:
pass

class MiSeqRun(object):
def __init__(self, runpath=None):
self.runpath = Path(runpath)
self.basecalls = self.runpath / 'Data' / 'Intensities' / 'BaseCalls'
self.samplesheet = self.basecalls / 'SampleSheet.csv'
self.rundate = self.runpath.basename().split('_')[0]

def get_fastq_gz(self):
return self.basecalls.glob('*_R1_*.fastq.gz') + \
self.basecalls.glob('*_R2_*.fastq.gz')

def get_paired_fastq_gz(self):
gzs = zip(
self.basecalls.glob('*_R1_*.fastq.gz'),
self.basecalls.glob('*_R2_*.fastq.gz')
)
return gzs

def get_samplenames(self):
samplenames = []
for line in self.samplesheet.lines(retain=False):
if line.endswith(',,'):
samplenames.append(line.split(',')[0])
return samplenames

def sample_from_fq(self, fqpath):
return fqpath.basename().split('_')[0]

def get_fqs_per_samplename(self):
persample = []
for pair in self.get_paired_fastq_gz():
persample.append((self.sample_from_fq(pair[0]), pair))
return persample

def all_sample_read_stats(self, threads=None):
'''
Return [(samplename, sample_read_stats(samplenamepair)), ...] which contains stats for each sample
'''
pool = multiprocessing.Pool(processes=threads)
return sorted(pool.map(stat, self.get_fqs_per_samplename()))

def run_stats(self, allstats=None, threads=None):
'''
CSV output for the run
:param iterable allstats: (samplename, (sample_read_stats(forward), sampleread_stats(reverse))
'''
if allstats is None:
allstats = self.all_sample_read_stats(threads=threads)
headers = "SampleName,TotalReads,FReads,FQual,FLen,FBases,RReads,RQual,RLen,RBases"
rowfmt = ','.join(['{{{0}}}'.format(i) for i in range(len(headers.split(',')))])
report = Path('report.{0}.csv'.format(self.rundate))
report.write_text(headers + '\n')
for sn, pair in allstats:
f,r = pair
row = (sn,f[0]+r[0]) + f + r
report.write_text(rowfmt.format(*row) + '\n', append=True)
return report

def run_fastqc(fqpaths, outdir=None, threads=None, tmpdir=None):
'''
Run fastqc on fqpath
'''
# Remove old one
op = Path(outdir)
op.rmtree_p()
op.mkdir()
print sh.fastqc(fqpaths, threads=threads, outdir=outdir, dir=tmpdir)
# build fastqc html index
index = op / 'index.html'
html = ['<table>']
for link in sorted(op.glob('*.html')):
html.append('<tr>')
html.append('<td><a href="{0}">{0}</a></td>'.format(link.basename()))
html.append('</tr>')
html.append('</table>')
index.write_lines(html)

def stat(x):
return (x[0], sample_read_stats(x[1]))

def sample_read_stats(paired):
'''
Return (read_stats(forward), read_stats(reverse))
'''
return map(read_stats, paired)

def read_stats(readpath):
'''
Return (#reads, avgqual, avglen, totalbases) for a fastq.gz read
'''
reads = 0
quals, lengths = [],[]
with gzip.open(readpath) as fh:
logger.info('Getting stats for {0}'.format(readpath.basename()))
for title, seq, qual in FastqGeneralIterator(fh):
reads += 1
lengths.append(len(seq))
qual = map(lambda q: ord(q)-33, qual)
quals.append(sum(qual)/float(len(qual)))
totalbases = sum(lengths)
return (reads, sum(quals)/float(len(quals)), totalbases/float(len(lengths)), totalbases)

def html_report(reportpath, fastqcdir):
'''
reportpath - path to report.csv
fastqcdir - Path.py.Path object representing directory of fastqc outputs
'''
html_report_pth = reportpath + '.html'
fastqc_htmls = sorted(map(lambda x: (x.basename().split('_')[0],x), fastqcdir.glob('*fastqc.html')))
paired_htmls = dict(map(lambda x: (x[0], map(lambda j: j[1], x[1])), itertools.groupby(fastqc_htmls, lambda x: x[0])))
# Colors
defaultcolor = '#FFFFFF'
# Color map for stddev +/- mean
# 1 is the color for 1 stdev above mean
# -1 is color for for 1 stdev below mean
colors = {
1: '#82E0AA',
2: '#2ECC71',
3: '#239B56',
-1: '#C0392B',
-2: '#922B21',
-3: '#641E16'
}
# Read the csv report
df = pd.read_csv(reportpath, index_col=0)
# Build table
html = ['<table border="1">']
# Put headers in table
html.append('<tr>')
html.append('<td>{0}</td>'.format(df.index.name))
html += map(lambda h: '<td>{0}</td>'.format(h), df.keys())
html.append('</tr>')
# Put values in table
for sn, values in df.iterrows():
#print list(paired_htmls[sn])
htmls = list(paired_htmls[sn])
html.append('<tr>')
# Samplename col
href = '<a href="{0}">{1}</a>'
fhref = href.format(htmls[0], 'R1')
rhref = href.format(htmls[1], 'R2')
html.append('<td>{0} ({1},{2})</td>'.format(sn,fhref,rhref))
for col, val in values.iteritems():
# The following two calculations should be moved out such that
# they are only calculated once per column and then put into a dict
# that can be looked up by colheader
# Current column standard dev
colstd = df[col].std()
# Current column mean
colmean = df[col].mean()
# Creates a new mapping from colors based on stddev/mean
cmap = map(lambda x: (colstd*x[0]+colmean, x[1]), colors.iteritems())
# Find values above mean where val is greater than stddev
colorv = filter(lambda x: x[0] >= colmean and val >= x[0], cmap)
# If non found, then try looking below mean
if colorv:
color = max(colorv)[1]
else:
color = filter(lambda x: x[0] <= colmean and val <= x[0], cmap)
if color:
color = min(color)[1]
else:
color = defaultcolor
html.append('<td bgcolor="{0}">{1}</td>'.format(color, val))
html.append('</tr>')
html.append('</table>')
html_report_pth.write_lines(html)
return html_report_pth

if __name__ == '__main__':
if len(sys.argv) != 5:
print "Usage: report.py <miseqrunpath> <fastqcoutdir> <threads> <tmpdir>"
sys.exit(1)
runpath, fqcoutdir, threads, tmpdir = sys.argv[1:]
threads = int(threads)
run = MiSeqRun(runpath)
logger.info("Creating csv report")
reportpth = run.run_stats(threads=threads)
logger.info("Wrote {0}".format(reportpth))
logger.info("Creating {0}".format(fqcoutdir))
logger.debug(str(run.get_fastq_gz()))
run_fastqc(run.get_fastq_gz(), outdir=fqcoutdir, threads=threads, tmpdir=tmpdir)
logger.info("Created {0}".format(fqcoutdir))
logger.info("Writing html report")
reportpth = html_report(reportpth, Path(fqcoutdir))
logger.info("Wrote {0}".format(reportpth))
74 changes: 74 additions & 0 deletions tests/test_miseq_report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import unittest
from os.path import dirname, basename, join, abspath
from nose.plugins.attrib import attr

import report

run = '/media/VD_Research/TMPDIR/RUNS/160204_M04171_0004_000000000-AJVMN'
samplename = '0126-2010'
_f = run + '/Data/Intensities/BaseCalls/{0}_S30_L001_R1_001.fastq.gz'.format(samplename)
_fr = 644208
_r = run + '/Data/Intensities/BaseCalls/{0}_S30_L001_R2_001.fastq.gz'.format(samplename)
_rr = _fr

class TestMiSeqRun(unittest.TestCase):
def setUp(self):
self.inst = report.MiSeqRun(run)

def test_gets_fastq_gzs(self):
sn = self.inst.get_samplenames()
fqgz = self.inst.get_fastq_gz()
self.assertEqual(2+2*len(sn), len(fqgz))

def test_gets_paired_fastq_gzs(self):
samplenames = self.inst.get_samplenames()
fqgz = self.inst.get_fastq_gz()
paired = self.inst.get_paired_fastq_gz()
print paired
self.assertEqual(len(fqgz)/2, len(paired))
undet = paired[-1:]
paired = paired[:-1]
for forward, reverse in paired:
self.assertIn(basename(forward).split('_')[0], samplenames)
self.assertIn(basename(reverse).split('_')[0], samplenames)
self.assertEqual(basename(undet[0][0]).split('_')[0], 'Undetermined')
self.assertEqual(basename(undet[0][1]).split('_')[0], 'Undetermined')

def test_gets_samplenames(self):
self.assertTrue(self.inst.get_samplenames() > 2)

def test_get_fqqs_per_samplename(self):
x = self.inst.get_fqs_per_samplename()
for sn, fr in x:
f,r = fr
self.assertTrue(f.basename().startswith(sn), "{0} does not start with {1}".format(f, sn))
self.assertTrue(r.basename().startswith(sn), "{0} does not start with {1}".format(r, sn))

@attr('slow')
def test_read_stats(self):
r,q,l,b = self.inst.read_stats(_f)
assert _fr == r, "{0} != {1}".format(_fr, r)

@attr('slow')
def test_sample_read_stats(self):
x = self.inst.sample_read_stats((_f,_r))
r = x[0][0] + x[1][0]
assert _fr+_rr >= r, "{0} != {1}".format(_fr+_rr, r)

def test_sets_rundate(self):
self.inst.rundate = '160204'

@attr('slow')
def test_integration_sample_read_stats(self):
fqs = dict(self.inst.get_fqs_per_samplename())
x = fqs[samplename]
x = self.inst.sample_read_stats(x)
r = x[0][0] + x[1][0]
assert _fr+_rr >= r, "{0} != {1}".format(_fr+_rr, r)

def test_run_stats(self):
allstats = [
('s1', ((1,1,1,1),(1,1,1,1))),
('s2', ((2,2,2,2),(2,2,2,2))),
]
x = self.inst.run_stats(allstats)