diff --git a/TEsmall/__init__.py.bak b/TEsmall/__init__.py.bak
deleted file mode 100644
index a12e1ba..0000000
--- a/TEsmall/__init__.py.bak
+++ /dev/null
@@ -1 +0,0 @@
-from version import __version__
diff --git a/TEsmall/abundance.py.bak b/TEsmall/abundance.py.bak
deleted file mode 100644
index 1c857e7..0000000
--- a/TEsmall/abundance.py.bak
+++ /dev/null
@@ -1,37 +0,0 @@
-import logging
-import re
-from os.path import splitext
-import pandas as pd
-
-def calc_composition(anno_list):
-    for anno in anno_list:
-        logging.info("Calculating read composition...")
-        df = pd.read_table(anno, usecols=["rid", "rlen", "ftype"])
-        df = df.groupby(["rlen", "ftype"]).rid.nunique()
-        df = df.unstack("rlen")
-
-        root = splitext(anno)[0]
-        outfile = "{0}.comp".format(root)
-        df.T.to_csv(outfile, sep="\t", na_rep=0, float_format="%.0f")
-
-def calc_abundance(anno_list):
-    count_dict = {}
-    for anno in anno_list:
-        logging.info("Calculating feature abundances...")
-        df = pd.read_table(anno, usecols=["rid", "ftype", "fid"])
-        rweight = 1/df.groupby("rid").fid.nunique()
-        ftable = df.groupby(["ftype", "fid"]).rid.unique()
-        count_dict[splitext(anno)[0]] = ftable.apply(lambda l: round(sum(map(lambda s: rweight[s], l))))
-
-        count = pd.DataFrame(count_dict).reset_index()
-        #strcount = count[count["ftype"] == "structural_RNA"]
-        #count = count[count["ftype"] != "structural_RNA"]
-        #strcount.to_csv("structural_RNA.count.txt", sep="\t", na_rep=0, float_format="%.0f", index=False)
-        tecount = count[(count["ftype"] == "anti_TE") | (count["ftype"] == "sense_TE")]
-        count = count[(count["ftype"] != "anti_TE") & (count["ftype"] != "sense_TE")]
-        tecount["fid"] = map(lambda s: re.sub(r"_dup\d+$", "", s), tecount["fid"])
-        tecount = tecount.groupby(["ftype", "fid"]).sum().reset_index()
-
-        count = pd.concat([count, tecount])
-        root = splitext(anno)[0]
-        count.to_csv("{0}.count".format(root), sep="\t", na_rep=0, float_format="%.0f", index=False)
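For context on what is being removed: `calc_abundance` above counts features fractionally, weighting every read by the reciprocal of the number of distinct features it overlaps, so a read hitting n features contributes 1/n to each instead of being counted n times. A minimal sketch of that weighting with invented toy data (Python 3 here, while the deleted module is Python 2):

```python
import pandas as pd

# Toy annotation table: read r2 hits two features, so it adds 0.5 to each.
df = pd.DataFrame({
    "rid":   ["r1", "r2", "r2", "r3"],
    "ftype": ["miRNA", "TE", "TE", "exon"],
    "fid":   ["mir-1", "copia", "gypsy", "CG123"],
})

# Weight of each read = 1 / number of distinct features it overlaps.
rweight = 1 / df.groupby("rid").fid.nunique()

# Abundance of a feature = sum of the weights of its assigned reads.
ftable = df.groupby(["ftype", "fid"]).rid.unique()
print(ftable.apply(lambda reads: sum(rweight[r] for r in reads)))
```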
diff --git a/TEsmall/annotation.py.bak b/TEsmall/annotation.py.bak
deleted file mode 100644
index 530afb9..0000000
--- a/TEsmall/annotation.py.bak
+++ /dev/null
@@ -1,56 +0,0 @@
-import argparse
-import glob
-import logging
-import os
-
-import pybedtools
-
-from settings import *
-
-def annotate_reads(genome, order, multi):
-    root = os.path.splitext(os.path.basename(multi))[0]
-    root = os.path.splitext(root)[0]
-    logging.info("Assigning reads to genomic features...")
-    bedfiles = map(lambda s: os.path.join(ANNOTATION.format(genome), "{0}.bed".format(s)), order)
-    #bedfiles = sorted(glob.glob(os.path.join(ANNOTATION.format(genome), "*.bed")))
-    outfname = "{0}.anno".format(root)
-    with open(outfname, "w") as outfile:
-        columns = ["rid", "rchr", "rstart", "rend", "rstrand", "rlen", "ftype", "fid", "fchr", "fstart", "fend", "fstrand", "overlap"]
-        outfile.write("\t".join(columns) + "\n")
-        anno_reads = set()
-        bamfile = multi
-        for bedfile in bedfiles:
-            current_reads = set()
-            f_type = os.path.splitext(os.path.basename(bedfile))[0]
-            bam = pybedtools.BedTool(bamfile)
-            bed = pybedtools.BedTool(bedfile)
-            for line in bam.intersect(bed, wo=True, bed=True, f=0.9, sorted=False, stream=True):
-                r_chrom = line[0]
-                r_start = line[1]
-                r_end = line[2]
-                r_id = line[3]
-                r_strand = line[5]
-                r_len = line[10][:-1]
-                f_chrom = line[12]
-                f_start = line[13]
-                f_end = line[14]
-                f_id = line[15]
-                f_strand = line[17]
-                f_len = line[18]
-                if f_type in ["miRNA", "hairpin"]:
-                    if r_strand == f_strand:
-                        current_reads.add(r_id)
-                        if r_id not in anno_reads:
-                            outfile.write("\t".join([r_id, r_chrom, r_start, r_end, r_strand, r_len, f_type, f_id, f_chrom, f_start, f_end, f_strand, f_len]) + "\n")
-                else:
-                    current_reads.add(r_id)
-                    if r_id not in anno_reads:
-                        if f_type == "TE":
-                            if r_strand == f_strand:
-                                outfile.write("\t".join([r_id, r_chrom, r_start, r_end, r_strand, r_len, "sense_TE", f_id, f_chrom, f_start, f_end, f_strand, f_len]) + "\n")
-                            else:
-                                outfile.write("\t".join([r_id, r_chrom, r_start, r_end, r_strand, r_len, "anti_TE", f_id, f_chrom, f_start, f_end, f_strand, f_len]) + "\n")
-                        else:
-                            outfile.write("\t".join([r_id, r_chrom, r_start, r_end, r_strand, r_len, f_type, f_id, f_chrom, f_start, f_end, f_strand, f_len]) + "\n")
-            anno_reads = anno_reads.union(current_reads)
-    return outfname
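The `annotate_reads` function above assigns each read to the first feature class, in the caller's priority order, that it overlaps by at least 90% of its length (`f=0.9`), with per-class strand rules: miRNA/hairpin hits count only when read and feature strands agree, while TE hits are kept either way but relabeled `sense_TE` or `anti_TE`. A standalone sketch of just those strand rules (the function name is mine, not from the deleted file):

```python
def classify_hit(f_type, r_strand, f_strand):
    """Strand rules mirrored from the deleted annotate_reads()."""
    if f_type in ("miRNA", "hairpin"):
        # miRNA/hairpin hits are annotated only on the matching strand.
        return f_type if r_strand == f_strand else None
    if f_type == "TE":
        # TE hits are always kept, labeled by orientation.
        return "sense_TE" if r_strand == f_strand else "anti_TE"
    return f_type  # structural_RNA, exon, intron, piRNA_cluster

assert classify_hit("miRNA", "+", "-") is None
assert classify_hit("TE", "+", "-") == "anti_TE"
```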
diff --git a/TEsmall/command_line.py.bak b/TEsmall/command_line.py.bak
deleted file mode 100644
index 7cfd22a..0000000
--- a/TEsmall/command_line.py.bak
+++ /dev/null
@@ -1,77 +0,0 @@
-import argparse
-import logging
-import os
-import sys
-
-from abundance import *
-from alignment import *
-from annotation import *
-from settings import *
-from summary import *
-from version import __version__
-
-def main():
-    parser = argparse.ArgumentParser(prog="tesmall")
-    parser.add_argument("-a", "--adapter", metavar="STR",
-        default='TGGAATTCTCGGGTGCCAAGG', help="Sequence of an adapter that was "
-        "ligated to the 3' end. The adapter itself and anything that follows "
-        "is trimmed. (default: %(default)s)")
-    parser.add_argument("-m", "--minlen", metavar="INT", type=int,
-        default=16, help="Discard trimmed reads that are shorter than INT. "
-        "Reads that are too short even before adapter removal are also "
-        "discarded. (default: %(default)d)")
-    parser.add_argument("-M", "--maxlen", metavar="INT", type=int,
-        default=36, help="Discard trimmed reads that are longer than INT. "
-        "Reads that are too long even before adapter removal are also "
-        "discarded. (default: %(default)d)")
-    parser.add_argument("-g", "--genome", metavar="STR", default="hg19",
-        choices=["dm3", "mm9", "hg19", "hg38", "mm10", "dm6"], help="Version of reference genome "
-        "(hg19, mm9, or dm3; default: %(default)s)")
-    parser.add_argument("--maxaln", metavar="INT", type=int, default=100,
-        help="Suppress all alignments for a particular read if more than INT "
-        "reportable alignments exist for it. (default: %(default)s)")
-    parser.add_argument("--mismatch", metavar="INT", type=int, default=0,
-        choices=[0, 1, 2, 3], help="Report alignments with at most INT "
-        "mismatches. (default: %(default)s)")
-    parser.add_argument("-o", "--order", metavar="STR", nargs="+",
-        choices=["structural_RNA", "miRNA", "hairpin", "exon", "TE", "intron",
-        "piRNA_cluster"], default=["structural_RNA", "miRNA", "hairpin", "exon",
-        "TE", "intron", "piRNA_cluster"], help="Annotation priority. (default: structural_RNA miRNA "
-        "hairpin exon TE intron piRNA_cluster)")
-    parser.add_argument("-p", "--parallel", metavar="INT", type=int, default=1,
-        help="Parallel execute by INT CPUs. (default: %(default)s)")
-    parser.add_argument("-f", "--fastq", metavar="STR", nargs="+", help="Input in "
-        "FASTQ format. Compressed input is supported and auto-detected from "
-        "the filename extension (.gz).")
-    parser.add_argument("-l", "--label", metavar="STR", nargs="+",
-        help="Unique label for each sample.")
-    parser.add_argument("--verbose", metavar="INT", type=int, default=2,
-        help="Set verbose level. 0: only show critical message, 1: show additional "
-        "warning message, 2: show process information, 3: show debug messages. DEFAULT:2")
-    parser.add_argument("-v", "--version", action="version", version="%(prog)s {0}".format(__version__))
-    args = parser.parse_args()
-
-    logging.basicConfig(level=(4 - args.verbose) * 10,
-        format="%(asctime)s %(levelname)s %(message)s",
-        stream=sys.stderr, filemode="w")
-
-    get_requirements(args.genome)
-    annofiles = []
-    if not args.label:
-        args.label = map(lambda s: re.sub(r".f(ast)?q(.gz)?$", "", os.path.basename(s)), args.fastq)
-    else:
-        assert len(set(args.label)) == len(args.fastq)
-    for label, fastq in zip(args.label, args.fastq):
-        trimmed_fastq = trim_3p_adapters(fastq, label, args.adapter, args.minlen, args.maxlen)
-        trimmed_fastq = trim_5p_adapters(trimmed_fastq, label, "GTTCAGAGTTCTACAGTCCGACGATC", args.minlen, args.maxlen)
-        btidx = os.path.join(BOWTIE_INDEX.format(args.genome), "genome")
-        rbtidx = os.path.join(BOWTIE_INDEX.format(args.genome), "rDNA")
-        bestone_bam, unfile = map_bestone_reads(trimmed_fastq, rbtidx, 2)
-        #bestone_bam, unfile = map_bestone_reads(fastq, rbtidx, args.mismatch)
-        multi_bam = map_multi_reads(unfile, btidx, args.maxaln, args.mismatch)
-        readinfo = get_read_info(multi_bam)
-        annofiles.append(annotate_reads(args.genome, args.order, multi_bam))
-
-    calc_composition(annofiles)
-    gen_summary(args.label, args.order, args.maxaln)
-    calc_abundance(annofiles)
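One non-obvious detail in the deleted `main()`: the `--verbose` integer becomes a stdlib logging threshold via `level=(4 - args.verbose) * 10`. A tiny standalone sketch of that mapping (note that `--verbose 0` yields level 40, i.e. `ERROR`, one step below `CRITICAL`, despite the help text):

```python
import logging

# Mirrors `level=(4 - args.verbose) * 10` from the deleted command_line.py.bak:
# verbose 0 -> 40 (ERROR), 1 -> 30 (WARNING), 2 -> 20 (INFO), 3 -> 10 (DEBUG)
for verbose in range(4):
    print(verbose, logging.getLevelName((4 - verbose) * 10))
```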
diff --git a/TEsmall/summary.py.bak b/TEsmall/summary.py.bak
deleted file mode 100644
index 8179af9..0000000
--- a/TEsmall/summary.py.bak
+++ /dev/null
@@ -1,412 +0,0 @@
-from __future__ import division
-import datetime
-import argparse
-import logging
-import os
-import string
-from math import pi, cos, sin
-from collections import defaultdict, Counter
-import pandas as pd
-import bokeh
-from bokeh.layouts import column, row, layout
-from bokeh.plotting import figure
-from bokeh.charts import Bar, output_file, save
-from bokeh.charts.operations import blend
-from bokeh.charts.attributes import cat, color
-from bokeh.models import HoverTool, ColumnDataSource
-from bokeh.models.widgets import DataTable, TableColumn, NumberFormatter
-from bokeh.embed import components
-import seaborn as sns
-import pysam
-
-template = """
-[... HTML <head>/nav shell of the "TEsmall Dashboard" report page; the markup did not survive extraction of this copy of the diff ...]
-"""
-
-footer = """
-[... closing HTML/JS of the report page; markup likewise lost. The nav fragments `navh` and `navt` used by gen_summary() below were evidently also defined in this lost region ...]
-"""
-
-def get_read_info(multibam):
-    root = os.path.splitext(os.path.basename(multibam))[0]
-    root = os.path.splitext(root)[0]
-    output = "{0}.rinfo".format(root)
-    with open(output, "w") as outfile:
-        outfile.write("id\tlength\tcount\tmapper\n")
-        fname = multibam
-        rcounter = Counter()
-        bamfile = pysam.AlignmentFile(fname, "rb", check_sq=False)
-        for read in bamfile:
-            rid = read.query_name
-            rlen = read.query_length
-            rcounter.update([(rid, rlen)])
-
-        for rid, rlen in rcounter:
-            num = rcounter[(rid, rlen)]
-            outfile.write("{0}\t{1}\t{2}\t{3}\n".format(rid, rlen, num, "unique" if num == 1 else "multi"))
-    return output
-
-def get_stat(prefix, maxaln):
-    stat = {"Statistics": [], "Number of reads": [], "Proportion": []}
-    with open("{0}.cutadapt1.log".format(prefix)) as infile:
-        for line in infile:
-            if "Total reads processed:" in line:
-                raw_reads = int(line.strip().split()[-1].replace(",", ""))
-                stat["Statistics"].append("Raw reads")
-                stat["Number of reads"].append(raw_reads)
-                stat["Proportion"].append(None)
-                break
-    with open("{0}.rRNA.log".format(prefix)) as infile:
-        for line in infile:
-            if "# reads processed:" in line:
-                trimmed_reads = int(line.strip().split()[-1])
-                stat["Statistics"].append("After trimming adapters")
-                stat["Number of reads"].append(trimmed_reads)
-                stat["Proportion"].append(None)
-    with open("{0}.log".format(prefix)) as infile:
-        for line in infile:
-            if "# reads processed:" in line:
-                rm_reads = int(line.strip().split()[-1])
-                stat["Statistics"].append("After removing rRNAs")
-                stat["Number of reads"].append(rm_reads)
-                stat["Proportion"].append(rm_reads/rm_reads)
-            elif "# reads with at least one reported alignment:" in line:
-                up_reads = int(line.strip().split()[-2])
-                stat["Statistics"].append("Aligned reads (up to {0} alignments)".format(maxaln))
-                stat["Number of reads"].append(up_reads)
-                stat["Proportion"].append(up_reads/rm_reads)
-            elif "# reads with alignments suppressed due to -m:" in line:
-                over_reads = int(line.strip().split()[-2])
-                stat["Statistics"].append("Over {0} alignments".format(maxaln))
-                stat["Number of reads"].append(over_reads)
-                stat["Proportion"].append(over_reads/rm_reads)
-            elif "# reads that failed to align:" in line:
-                un_reads = int(line.strip().split()[-2])
-                stat["Statistics"].append("Unaligned reads".format(maxaln))
-                stat["Number of reads"].append(un_reads)
-                stat["Proportion"].append(un_reads/rm_reads)
-
-    df = pd.read_table("{0}.anno".format(prefix), usecols=["rid"])
-    anno_reads = len(df.rid.unique())
-    stat["Statistics"].append("Annotated reads of aligned reads")
-    stat["Number of reads"].append(anno_reads)
-    stat["Proportion"].append(anno_reads/up_reads)
-
-    return stat
-
-def output_components(prefix, order, maxaln):
-    rinfo = "{0}.rinfo".format(prefix)
-    comp = "{0}.comp".format(prefix)
-
-    def plot_read_dist(rinfo):
-        df = pd.read_table(rinfo)
-        data = df[['length', 'mapper', 'count']].groupby(['length', 'mapper']).count()
-        data = data.apply(lambda s: s/data.sum()*100, axis=1).reset_index()
-        p = Bar(data, plot_width=500, plot_height=400,
-            label='length', values='count', agg='sum',
-            stack=cat(columns='mapper', sort=False), legend='top_right',
-            color=color(columns='mapper', palette=["#f98283", "#a4a4a4"], sort=False),
-            xlabel='read length (nt)', ylabel='proportion (%)',
-            ygrid=False,
-            tooltips=[('length', '@length'), ('mapper', '@mapper'), ('percent', '@height')])
-        p.toolbar.logo = None
-        p.outline_line_color = None
-        return p
-
-    rdist = plot_read_dist(rinfo)
-
-    df = pd.read_table(comp, index_col=0)
-    total = df.sum()
-    total = total*100/total.sum()
-    df = df.apply(lambda s: s*100/s.sum(), axis=1)
-    df = df.reset_index()
-    #ftypes = df.columns[1:].tolist()
-    ftypes = order
-    colors = sns.color_palette("hls", len(ftypes)).as_hex()
-    bar = Bar(df,
-        values=blend(*ftypes, name="ftypes", labels_name="ftype"),
-        x_range=rdist.x_range,
-        y_range=(0, 100),
-        label=cat(columns='rlen', sort=False),
-        stack=cat(columns='ftype', sort=False),
-        xlabel='read length (nt)',
-        ylabel='proportion (%)',
-        legend="top_right",
-        ygrid=False,
-        width=500,
-        height=400,
-        color=color(columns='ftype', palette=colors, sort=False),
-        fill_alpha=1,
-        tooltips=[("length", "@rlen"), ("feature", "@ftype"), ("percent", "@height")])
-    bar.toolbar.logo = None
-    bar.outline_line_color = None
-
-    start_angles = {}
-    end_angles = {}
-    start = 0
-    for ftype in ftypes:
-        end = 2*pi*total[ftype]/100
-        start_angles[ftype] = start
-        end_angles[ftype] = start + end
-        start += end
-
-    colors = dict(zip(ftypes, colors))
-    df = pd.DataFrame(total).reset_index()
-    df.columns = ["ftype", "percent"]
-    df["start"] = df.apply(lambda s: start_angles[s["ftype"]], axis=1)
-    df["end"] = df.apply(lambda s: end_angles[s["ftype"]], axis=1)
-    df["color"] = df.apply(lambda s: colors[s["ftype"]], axis=1)
-    df["x"] = df.apply(lambda s: 1.2*cos((start_angles[s["ftype"]] + end_angles[s["ftype"]])/2), axis=1)
-    df["y"] = df.apply(lambda s: 1.2*sin((start_angles[s["ftype"]] + end_angles[s["ftype"]])/2), axis=1)
-    df["text"] = df.apply(lambda s: "{0:.3f}%".format(s["percent"]), axis=1)
-    source = ColumnDataSource(data=df)
-
-    pie = figure(width=400, height=400, x_range=(-1.4, 1.4), y_range=(-1.4, 1.4))
-    pie.toolbar.logo = None
-    wr = pie.annular_wedge(x=0, y=0, inner_radius=0.5, outer_radius=1,
-        start_angle="start", end_angle="end", fill_color="color",
-        line_color="#ffffff", line_width=0.5, source=source)
-
-    pie.axis.visible = False
-    pie.grid.grid_line_color = None
-    pie.outline_line_color = None
-
-    hover = HoverTool(tooltips=[("feature", "@ftype"), ("percent", "@percent")], renderers=[wr])
-    pie.add_tools(hover)
-
-    text_props = {
-        "source": source,
-        "angle": 0,
-        "color": "black",
-        "text_align": "center",
-        "text_baseline": "middle",
-        "text_font_size": "8pt",
-        "text_font_style": "normal"
-    }
-    pie.text(x="x", y="y", text="text", **text_props)
-
-    empty = figure(width=400, height=400, x_range=(-1.1, 1.1), y_range=(-1.1, 1.1))
-    empty.toolbar.logo = None
-    empty.axis.visible = False
-    empty.grid.grid_line_color = None
-    empty.outline_line_color = None
-    empty.toolbar_location = None
-
-    stat = get_stat(prefix, maxaln)
-    source = ColumnDataSource(data=stat)
-    columns = [
-        TableColumn(field="Statistics", title="Statistics", width=200),
-        TableColumn(field="Number of reads", title="Number of reads", formatter=NumberFormatter(format="0,0"), width=150),
-        TableColumn(field="Proportion", title="Proportion", formatter=NumberFormatter(format="0.000%"), width=100),
-    ]
-    data_table = DataTable(source=source, columns=columns, width=450, row_headers=False)
-
-    script, div = components(layout([[data_table, rdist], [pie, bar]], sizing_mode="scale_width"))
-    return script, div
-
-def get_parser():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-p", "--prefix", nargs="+")
-    parser.add_argument("-o", "--order", nargs="+")
-    parser.add_argument("-m", "--maxaln", type=int)
-    return parser
-
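The donut chart in `output_components` converts each feature class's percentage of annotated reads into a start/end angle pair by accumulating fractions of 2π. The same arithmetic as a standalone helper (names are mine, not from the deleted file):

```python
from math import pi

def wedge_angles(percents):
    """Cumulative percentages -> (start, end) radians, as in output_components()."""
    start, angles = 0.0, []
    for p in percents:
        end = start + 2 * pi * p / 100
        angles.append((start, end))
        start = end
    return angles

print(wedge_angles([50, 30, 20]))  # three wedges covering the full circle
```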
-def gen_summary(prefix, order, maxaln):
-    logging.info("Generating summary report...")
-    idx = order.index("TE")
-    order[idx] = "sense_TE"
-    order.insert(idx, "anti_TE")
-    date = datetime.datetime.now().strftime("%B %d, %Y")
-    with open("report.html", "w") as outfile:
-        outfile.write(template)
-        outfile.write('[... nav item showing the report date {0}; literal HTML lost in extraction ...]'.format(date))
-        outfile.write(navh)
-        for i, p in enumerate(prefix):
-            if i == 0:
-                outfile.write('[... active nav tab linking to sample {0}; HTML lost ...]\n'.format(p))
-            else:
-                outfile.write('[... nav tab linking to sample {0}; HTML lost ...]\n'.format(p))
-        outfile.write(navt)
-        for p in prefix:
-            script, div = output_components(p, order, maxaln)
-            outfile.write('[... opening tags of the per-sample panel; HTML lost ...]\n')
-            outfile.write('[... panel heading for sample {0}; HTML lost ...]\n'.format(p))
-            outfile.write(div + "\n")
-            outfile.write(script + "\n")
-            outfile.write("[... closing tags of the per-sample panel; HTML lost ...]\n")
-        outfile.write(footer)
-
-if __name__ == "__main__":
-    parser = get_parser()
-    args = parser.parse_args()
-    gen_summary(args.prefix, args.order, args.maxaln)
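Because the report's HTML strings did not survive in this copy, here is a minimal sketch of the embedding pattern `output_components`/`gen_summary` rely on: `bokeh.embed.components` renders a plot to a `<script>`/`<div>` pair that gets pasted into a static HTML shell. The shell below is a stand-in for the lost `template`/`footer`, and the snippet targets a current Bokeh release rather than the removed `bokeh.charts` API imported above:

```python
from bokeh.embed import components
from bokeh.plotting import figure
from bokeh.resources import CDN

# Stand-in plot; the real dashboard embedded read-length, composition,
# pie, and statistics-table components per sample.
p = figure(width=300, height=300, title="demo")
p.line([1, 2, 3], [2, 1, 3])

script, div = components(p)  # <script> block plus its target <div>

html = "<html><head>{js}\n{script}</head><body>{div}</body></html>".format(
    js="\n".join('<script src="{0}"></script>'.format(u) for u in CDN.js_files),
    script=script,
    div=div,
)

with open("report.html", "w") as out:
    out.write(html)
```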