diff --git a/experimental/visualize_stutter.py b/experimental/visualize_stutter.py new file mode 100644 index 0000000..47d6971 --- /dev/null +++ b/experimental/visualize_stutter.py @@ -0,0 +1,158 @@ +#!/usr/bin/python +import matplotlib +matplotlib.use('Agg') +import argparse +import StringIO +import numpy as np +import matplotlib.pyplot as plt +import os +import pandas as pd +import urllib, base64 +from scipy.stats.mstats import mquantiles +pd.options.mode.chained_assignment = None + +DESC = """ +This script is run on output of the +allelotype tool run with --command train. It generates an HTML file providing +visualization of PCR stutter noise at STRs. +""" + +NOISE_MODEL_PREFIX = None +OUT_FILE = None +MAX_PERIOD = 6 + +class StepModel(object): + """ + Contains information about step size distributions + """ + def __init__(self, filename): + self.LoadStepModel(filename) + + def LoadStepModel(self, filename): + self.NonUnitStepByPeriod = {} + self.StepSizeByPeriod = {} + f = open(filename, "r") + for i in range(MAX_PERIOD): + line = f.readline() + self.NonUnitStepByPeriod[i+1] = float(line.strip()) + self.ProbIncrease = float(f.readline().split("=")[1].strip()) + for i in range(MAX_PERIOD): + line = f.readline() + items = map(float, line.strip().split()[1:]) + self.StepSizeByPeriod[i+1] = items + f.close() + + def PlotHistogram(self, period): + """ Plot histogram of error sizes""" + colors = ["gray","red","gold","blue","green","purple"] + fig = plt.figure() + fig.set_size_inches((6,3)) + ax = fig.add_subplot(111) + xran = range(-18,19) + ax.bar(xran, self.StepSizeByPeriod[period], align="center", color=colors[period-1], alpha=0.3) + ax.bar([xran[i] for i in xran if xran[i]%period==0], [self.StepSizeByPeriod[period][i] for i in xran if xran[i]%period==0], align="center", color=colors[period-1]) + ax.set_xlabel("Step size", size=15) + ax.set_ylabel("Frequency", size=15) + ax.set_xlim(left=-3*MAX_PERIOD, right=3*MAX_PERIOD) + ax.set_yticklabels(ax.get_yticks(), size=15) + ax.set_xticks(xran) + ax.set_xticklabels(xran, size=12, rotation=90) + imgdata = StringIO.StringIO() + fig.savefig(imgdata, format="png") + imgdata.seek(0) + return urllib.quote(base64.b64encode(imgdata.buf)) + +class StutterProblem(object): + """ Contains info on reads used to build stutter model """ + def __init__(self, filename): + self.LoadProblem(filename) + + def LoadProblem(self, filename): + sp = pd.read_csv(filename, sep=" ", names=["stutter","period","length","gc","score"]) + sp = sp[sp["period"].apply(lambda x: type(x)==str)] # get rid of empty last line + sp.loc[:,"period"] = sp["period"].apply(lambda x: int(x.split(":")[1])) + sp.loc[:,"length"] = sp["length"].apply(lambda x: int(x.split(":")[1])) + sp.loc[:,"gc"] = sp["gc"].apply(lambda x: float(x.split(":")[1])) + sp.loc[:,"score"] = sp["score"].apply(lambda x: float(x.split(":")[1])) + self.sp = sp + + def GetAverageStutterProb(self): + return self.sp[self.sp["stutter"]==1].shape[0]*1.0/self.sp.shape[0] + + def StutterByPeriod(self, period): + tmp = self.sp[self.sp.period==period] + return tmp[tmp.stutter==1].shape[0]*1.0/tmp.shape[0] + + def PlotVariable(self, variable, bins): + colors = ["gray","red","gold","blue","green","purple"] + fig = plt.figure() + fig.set_size_inches((6,3)) + ax = fig.add_subplot(111) + for period in range(1, MAX_PERIOD+1): + sub = self.sp[self.sp.period==period] + # get deciles of the variable of interest + probs = [] + for i in range(len(bins)-1): + l = bins[i] + u = bins[i+1] + tmp = sub[(sub[variable]>=l) & (sub[variable]lobSTR PCR stutter analysis - %s\n"%NOISE_MODEL_PREFIX) + f.write("\n") + + # Write stats from noise model + f.write("

Stutter params

\n") + f.write("

Stutter probability by period

\n") + for i in range(MAX_PERIOD): f.write("Period %s: %s
\n"%(i+1, stutterproblem.StutterByPeriod(i+1))) + f.write("
") + f.write("Average stutter rate: %s
\n"%(stutterproblem.GetAverageStutterProb())) + f.write("Percent of stutter errors that increase the repeat number: %s
\n"%(stepmodel.ProbIncrease)) + + # Stutter err distributions + f.write("

Error size distributions

\n") + for i in range(MAX_PERIOD): + f.write("

Period %s

\n"%(i+1)) + pltdata = stepmodel.PlotHistogram(i+1) + uri = "data:image/png;base64," + pltdata + f.write("\n"%uri) + + # Stutter by feature (track length, GC, purity, by period) + f.write("

Sequence features

\n") + featureBins = {"length": np.arange(10, 60, 10), + "score": [0.5, 0.8, 1], + "gc": np.arange(0.3,0.7,0.1)} + for feature in ["length","gc","score"]: + f.write("

%s

\n"%feature) + pltdata = stutterproblem.PlotVariable(feature, featureBins[feature]) + uri = "data:image/png;base64," + pltdata + f.write("\n"%uri) + + + f.write("\n") + f.close()