# ---- TEsmall/abundance.py.bak ----
import logging
import re
from os.path import splitext

import pandas as pd

# Feature types whose IDs are merged after stripping the "_dup<N>" suffix.
_TE_TYPES = ("anti_TE", "sense_TE")
# Trailing "_dup<N>" suffix removed from TE feature IDs before merging.
_DUP_SUFFIX = re.compile(r"_dup\d+$")


def calc_composition(anno_list):
    """Write a read-length by feature-type composition table per sample.

    For every annotation file in *anno_list*, count the distinct read IDs
    (``rid``) for each (``rlen``, ``ftype``) pair and write the result as a
    tab-separated table to ``<root>.comp``, where ``<root>`` is the
    annotation path without its extension.  Rows are read lengths, columns
    are feature types, and absent combinations are written as ``0``.

    Args:
        anno_list: Iterable of paths to tab-separated annotation files that
            contain at least the columns ``rid``, ``rlen`` and ``ftype``.
    """
    for anno in anno_list:
        logging.info("Calculating read composition...")
        df = pd.read_csv(anno, sep="\t", usecols=["rid", "rlen", "ftype"])
        # Unstacking on ftype directly yields the rlen-by-ftype layout.
        table = df.groupby(["rlen", "ftype"])["rid"].nunique().unstack("ftype")
        outfile = "{0}.comp".format(splitext(anno)[0])
        table.to_csv(outfile, sep="\t", na_rep="0", float_format="%.0f")


def _weighted_counts(anno):
    """Return rounded, weighted read counts per (ftype, fid) for one file."""
    df = pd.read_csv(anno, sep="\t", usecols=["rid", "ftype", "fid"])
    # A read that hits k distinct feature IDs contributes 1/k to each of them.
    weight = 1.0 / df.groupby("rid")["fid"].nunique()
    # Count every read at most once per feature.
    hits = df.drop_duplicates(["ftype", "fid", "rid"])
    per_hit = hits["rid"].map(weight)
    return per_hit.groupby([hits["ftype"], hits["fid"]]).sum().round()


def calc_abundance(anno_list):
    """Write one feature-abundance table covering all annotation files.

    Each annotation file becomes one column (named after the file path
    without its extension) holding the rounded, weighted number of reads per
    (``ftype``, ``fid``).  Rows whose ``ftype`` is ``anti_TE`` or
    ``sense_TE`` have a trailing ``_dup<N>`` removed from ``fid`` and are
    then summed per (``ftype``, ``fid``); these merged rows are appended
    after all other rows.

    The table is written tab-separated to ``<root>.count``, where ``<root>``
    is taken from the LAST file in *anno_list*.  Nothing is written when
    *anno_list* is empty.

    Args:
        anno_list: Iterable of paths to tab-separated annotation files that
            contain at least the columns ``rid``, ``ftype`` and ``fid``.
    """
    count_dict = {}
    root = None
    for anno in anno_list:
        logging.info("Calculating feature abundances...")
        root = splitext(anno)[0]
        count_dict[root] = _weighted_counts(anno)

    if root is None:
        logging.warning("No annotation files given; no abundance table written.")
        return

    count = pd.DataFrame(count_dict).reset_index()
    is_te = count["ftype"].isin(_TE_TYPES)
    # Copy so that rewriting fid does not operate on a slice of ``count``.
    tecount = count[is_te].copy()
    count = count[~is_te]
    # NOTE: counts were already rounded per original fid; merging happens
    # afterwards so the written numbers stay as before.
    tecount["fid"] = [_DUP_SUFFIX.sub("", fid) for fid in tecount["fid"]]
    tecount = tecount.groupby(["ftype", "fid"]).sum().reset_index()

    count = pd.concat([count, tecount])
    count.to_csv("{0}.count".format(root), sep="\t", na_rep="0",
                 float_format="%.0f", index=False)
# ---- TEsmall/annotation.py.bak ----
import argparse
import glob
import logging
import os

import pybedtools

from settings import *


def annotate_reads(genome, order, multi):
    """Assign aligned reads to annotated features in priority order.

    The alignments in *multi* are intersected with one BED file per entry of
    *order*, located at ``ANNOTATION.format(genome)/<entry>.bed``.  The
    intersect call passes ``f=0.9`` as the minimum overlap fraction.  A read
    accepted for an earlier entry is not written again for later entries.

    Strand handling:
      * ``miRNA`` and ``hairpin``: only same-strand overlaps are used.
      * ``TE``: both strands are used; the written feature type is
        ``sense_TE`` (same strand) or ``anti_TE`` (opposite strand).
      * every other type: strand is ignored.

    Args:
        genome: Genome name substituted into the ``ANNOTATION`` path template.
        order: Feature-type names, highest priority first.
        multi: Path to the alignment file handed to pybedtools (the original
            code calls it ``bamfile``).

    Returns:
        Name of the tab-separated annotation file that was written:
        the basename of *multi* with its last two extensions removed, plus
        ``.anno``.
    """
    root = os.path.splitext(os.path.basename(multi))[0]
    root = os.path.splitext(root)[0]
    logging.info("Assigning reads to genomic features...")
    anno_dir = ANNOTATION.format(genome)
    bedfiles = [os.path.join(anno_dir, "{0}.bed".format(name)) for name in order]
    outfname = "{0}.anno".format(root)
    columns = ["rid", "rchr", "rstart", "rend", "rstrand", "rlen", "ftype",
               "fid", "fchr", "fstart", "fend", "fstrand", "overlap"]
    # The alignment file is the same for every feature type: wrap it once.
    bam = pybedtools.BedTool(multi)
    anno_reads = set()
    with open(outfname, "w") as outfile:
        outfile.write("\t".join(columns) + "\n")
        for bedfile in bedfiles:
            f_type = os.path.splitext(os.path.basename(bedfile))[0]
            same_strand_only = f_type in ("miRNA", "hairpin")
            current_reads = set()
            bed = pybedtools.BedTool(bedfile)
            for hit in bam.intersect(bed, wo=True, bed=True, f=0.9,
                                     sorted=False, stream=True):
                # NOTE(review): indices presumably follow a 12-column read
                # record, a 6-column feature record and the overlap size --
                # confirm against the annotation BED layout.
                r_chrom, r_start, r_end, r_id = hit[0], hit[1], hit[2], hit[3]
                r_strand = hit[5]
                r_len = hit[10][:-1]  # drop the last character of field 10
                f_chrom, f_start, f_end, f_id = hit[12], hit[13], hit[14], hit[15]
                f_strand = hit[17]
                overlap = hit[18]  # written under the "overlap" header

                same_strand = r_strand == f_strand
                if same_strand_only and not same_strand:
                    continue
                current_reads.add(r_id)
                if r_id in anno_reads:
                    # Already claimed by a higher-priority feature type.
                    continue
                if f_type == "TE":
                    label = "sense_TE" if same_strand else "anti_TE"
                else:
                    label = f_type
                outfile.write("\t".join([r_id, r_chrom, r_start, r_end,
                                         r_strand, r_len, label, f_id,
                                         f_chrom, f_start, f_end, f_strand,
                                         overlap]) + "\n")
            # Reads only become "claimed" once the whole feature type is done,
            # so one read may be written for several features of one type.
            anno_reads |= current_reads
    return outfname


# ---- TEsmall/command_line.py.bak ----
import argparse
import logging
import os
import re
import sys

from abundance import *
from alignment import *
from annotation import *
from settings import *
from summary import *
from version import __version__


def main():
    """Command-line entry point of ``tesmall``.

    Parses the arguments, configures logging, fetches the requirements for
    the chosen genome and then, for every FASTQ/label pair: trims 3' and 5'
    adapters, maps reads against the rDNA index, maps the unaligned output
    of that step against the genome index and annotates the result.
    Finally the composition tables, the summary and the abundance table are
    generated for all samples.

    Exits through ``parser.error`` when ``--fastq`` is missing or when the
    labels are not unique / do not match the number of FASTQ files.
    """
    five_prime_adapter = "GTTCAGAGTTCTACAGTCCGACGATC"

    parser = argparse.ArgumentParser(prog="tesmall")
    parser.add_argument("-a", "--adapter", metavar="STR",
        default='TGGAATTCTCGGGTGCCAAGG', help="Sequence of an adapter that was "
        "ligated to the 3' end. The adapter itself and anything that follows "
        "is trimmed. (default: %(default)s)")
    parser.add_argument("-m", "--minlen", metavar="INT", type=int,
        default=16, help="Discard trimmed reads that are shorter than INT. "
        "Reads that are too short even before adapter removal are also "
        "discarded. (default: %(default)d)")
    parser.add_argument("-M", "--maxlen", metavar="INT", type=int,
        default=36, help="Discard trimmed reads that are longer than INT. "
        "Reads that are too long even before adapter removal are also "
        "discarded. (default: %(default)d)")
    # The help text is generated from ``choices`` so it cannot go stale.
    parser.add_argument("-g", "--genome", metavar="STR", default="hg19",
        choices=["dm3", "mm9", "hg19", "hg38", "mm10", "dm6"],
        help="Version of reference genome "
        "(%(choices)s; default: %(default)s)")
    parser.add_argument("--maxaln", metavar="INT", type=int, default=100,
        help="Suppress all alignments for a particular read if more than INT "
        "reportable alignments exist for it. (default: %(default)s)")
    parser.add_argument("--mismatch", metavar="INT", type=int, default=0,
        choices=[0, 1, 2, 3], help="Report alignments with at most INT "
        "mismatches. (default: %(default)s)")
    parser.add_argument("-o", "--order", metavar="STR", nargs="+",
        choices=["structural_RNA", "miRNA", "hairpin", "exon", "TE", "intron",
        "piRNA_cluster"], default=["structural_RNA", "miRNA", "hairpin", "exon",
        "TE", "intron", "piRNA_cluster"], help="Annotation priority. (default: structural_RNA miRNA "
        "hairpin exon TE intron piRNA_cluster)")
    # NOTE(review): --parallel is parsed but never read in this function.
    parser.add_argument("-p", "--parallel", metavar="INT", type=int, default=1,
        help="Parallel execute by INT CPUs. (default: %(default)s)")
    parser.add_argument("-f", "--fastq", metavar="STR", nargs="+", help="Input in "
        "FASTQ format. Compressed input is supported and auto-detected from "
        "the filename extension (.gz).")
    parser.add_argument("-l", "--label", metavar="STR", nargs="+",
        help="Unique label for each sample.")
    parser.add_argument("--verbose", metavar="INT", type=int, default=2,
        help="Set verbose level. 0: only show critical message, 1: show additional "
        "warning message, 2: show process information, 3: show debug messages. DEFAULT:2")
    parser.add_argument("-v", "--version", action="version", version="%(prog)s {0}".format(__version__))
    args = parser.parse_args()

    if not args.fastq:
        parser.error("at least one FASTQ file is required (-f/--fastq)")

    # verbose 0..3 maps to logging levels 40 (ERROR) .. 10 (DEBUG).
    logging.basicConfig(level=(4 - args.verbose) * 10,
                        format="%(asctime)s %(levelname)s %(message)s",
                        stream=sys.stderr)

    get_requirements(args.genome)

    if not args.label:
        # Build a real list: the labels are iterated again by gen_summary.
        args.label = [re.sub(r"\.f(ast)?q(\.gz)?$", "", os.path.basename(path))
                      for path in args.fastq]
    if len(set(args.label)) != len(args.fastq):
        parser.error("labels must be unique and match the number of FASTQ files")

    btidx = os.path.join(BOWTIE_INDEX.format(args.genome), "genome")
    rbtidx = os.path.join(BOWTIE_INDEX.format(args.genome), "rDNA")
    annofiles = []
    for label, fastq in zip(args.label, args.fastq):
        trimmed_fastq = trim_3p_adapters(fastq, label, args.adapter, args.minlen, args.maxlen)
        trimmed_fastq = trim_5p_adapters(trimmed_fastq, label, five_prime_adapter, args.minlen, args.maxlen)
        # The rDNA step always allows 2 mismatches, independent of --mismatch.
        bestone_bam, unfile = map_bestone_reads(trimmed_fastq, rbtidx, 2)
        multi_bam = map_multi_reads(unfile, btidx, args.maxaln, args.mismatch)
        # Return value unused; the call is kept in case it has side effects.
        get_read_info(multi_bam)
        annofiles.append(annotate_reads(args.genome, args.order, multi_bam))

    calc_composition(annofiles)
    gen_summary(args.label, args.order, args.maxaln)
    calc_abundance(annofiles)


# ---- TEsmall/summary.py.bak (first imports; the module continues below) ----
# NOTE: summary.py.bak opens with ``from __future__ import division``.  That
# statement is only legal as the first statement of a module, so it is not
# repeated at this position.
import datetime
import argparse
import logging
import os
import string
from math import pi, cos, sin
from collections import defaultdict, Counter
import pandas as pd
import bokeh
from bokeh.layouts import column, row, layout
from bokeh.plotting import figure
from bokeh.charts import Bar, output_file, save
from bokeh.charts.operations import blend
from bokeh.charts.attributes import cat, color
from bokeh.models import HoverTool, ColumnDataSource
-from bokeh.models.widgets import DataTable, TableColumn, NumberFormatter -from bokeh.embed import components -import seaborn as sns -import pysam - -template = """ - - -
- - - - - - - - -