diff --git a/CLASHChimeras.egg-info/PKG-INFO b/CLASHChimeras.egg-info/PKG-INFO new file mode 100644 index 0000000..39f039d --- /dev/null +++ b/CLASHChimeras.egg-info/PKG-INFO @@ -0,0 +1,21 @@ +Metadata-Version: 1.1 +Name: CLASHChimeras +Version: 0.1b3 +Summary: Python package to find chimeras in CRAC/CLASH and HITS-CLIP datasets +Home-page: https://github.com/kashyapchhatbar/CLASHChimeras +Author: Kashyap Chhatbar +Author-email: kashyap.c@ed.ac.uk +License: MIT +Description: UNKNOWN +Keywords: clash chimeras hybrids hits-clip bioinformatics +Platform: UNKNOWN +Classifier: Development Status :: 4 - Beta +Classifier: Topic :: Scientific/Engineering :: Bio-Informatics +Classifier: Topic :: Scientific/Engineering :: Medical Science Apps. +Classifier: License :: OSI Approved :: MIT License +Classifier: Environment :: Console +Classifier: Intended Audience :: Science/Research +Classifier: Intended Audience :: Developers +Classifier: Operating System :: POSIX :: Linux +Classifier: Operating System :: MacOS +Classifier: Programming Language :: Python :: 3 :: Only diff --git a/CLASHChimeras.egg-info/SOURCES.txt b/CLASHChimeras.egg-info/SOURCES.txt new file mode 100644 index 0000000..e23791f --- /dev/null +++ b/CLASHChimeras.egg-info/SOURCES.txt @@ -0,0 +1,24 @@ +LICENSE +MANIFEST.in +README.rst +VERSION +requirements.txt +setup.cfg +setup.py +CLASHChimeras.egg-info/PKG-INFO +CLASHChimeras.egg-info/SOURCES.txt +CLASHChimeras.egg-info/dependency_links.txt +CLASHChimeras.egg-info/requires.txt +CLASHChimeras.egg-info/top_level.txt +clashchimeras/__init__.py +clashchimeras/align.py +clashchimeras/download.py +clashchimeras/find.py +clashchimeras/initialize.py +clashchimeras/log.py +clashchimeras/methods.py +clashchimeras/parsers.py +clashchimeras/runners.py +scripts/align-for-chimeras +scripts/download-for-chimeras +scripts/find-chimeras \ No newline at end of file diff --git a/CLASHChimeras.egg-info/dependency_links.txt b/CLASHChimeras.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/CLASHChimeras.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/CLASHChimeras.egg-info/requires.txt b/CLASHChimeras.egg-info/requires.txt new file mode 100644 index 0000000..35f2e18 --- /dev/null +++ b/CLASHChimeras.egg-info/requires.txt @@ -0,0 +1,12 @@ +alabaster +ansicolors >= 1.0.2 +beautifulsoup4 >= 4.3.2 +biopython >= 1.65 +coloredlogs >= 1.0.1 +ftputil >= 2.2 +pandas >= 0.16 +progress >= 1.2 +pyfaidx >= 0.3.4 +requests >= 2.7.0 +sphinx-argparse +tabulate >= 0.7.5 diff --git a/CLASHChimeras.egg-info/top_level.txt b/CLASHChimeras.egg-info/top_level.txt new file mode 100644 index 0000000..7ee7188 --- /dev/null +++ b/CLASHChimeras.egg-info/top_level.txt @@ -0,0 +1 @@ +clashchimeras diff --git a/build/lib/clashchimeras/__init__.py b/build/lib/clashchimeras/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/build/lib/clashchimeras/align.py b/build/lib/clashchimeras/align.py new file mode 100644 index 0000000..8f2efc9 --- /dev/null +++ b/build/lib/clashchimeras/align.py @@ -0,0 +1,207 @@ +import argparse +import sys + +import clashchimeras.log +from clashchimeras.initialize import Arguments +from clashchimeras.parsers import SAM +from clashchimeras.runners import Bowtie, Tophat + + +class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, + argparse.RawDescriptionHelpFormatter): + pass + + +def parseArguments(): + requiredArgs = getRequiredArgs() + bowtieArgs = getBowtie2Args() + tophatArgs = getTophatArgs() + optionalArgs = getOptionalArgs() + outputArgs = getOutputArgs() + + parser = argparse.ArgumentParser( + parents=[requiredArgs, bowtieArgs, tophatArgs, + outputArgs, optionalArgs], + formatter_class=CustomFormatter, + description='Given a fastq file, this script executes ' + 'bowtie2 and tophat aligners to generate alignment files ' + 'necessary for detecting chimeras in the reads', + usage='\n %(prog)s -i input.fastq -si ' + '/path/to/smallRNA_index -ti /path/to/targetRNA_index -o ' + 'output -r bowtie2 \n %(prog)s -i input.fastq -gi ' + '/path/to/genome_index -tri /path/to/transcriptome_index ' + '-o output -r tophat \n \n \n ' + 'To see detailed help, please run \n %(prog)s -h', + add_help=True) + + return parser + + args = parser.parse_args() + + if args.logLevel == 'DEBUG': + logger = clashchimeras.log.debug_logger('root') + elif args.logLevel == 'WARNING': + logger = clashchimeras.log.warning_logger('root') + elif args.logLevel == 'ERROR': + logger = clashchimeras.log.error_logger('root') + else: + logger = clashchimeras.log.info_logger('root') + + argCheck = Arguments(args, type='align') + argCheck.validateAlign() + + return args + + +def getRequiredArgs(): + parser = argparse.ArgumentParser(add_help=False) + + required = parser.add_argument_group('Input arguments') + + required.add_argument('--input', '-i', + help='Input file containing reads fastq', + metavar='raw reads', + required=True) + + return parser + + +def getBowtie2Args(): + parser = argparse.ArgumentParser(add_help=False) + + bowtieArgs = parser.add_argument_group('Bowtie2 arguments') + + bowtieArgs.add_argument("--smallRNAIndex", "-si", + help="""Provide the smallRNA bowtie2 index (Usually + resides in ~/db/CLASHChimeras or elsewhere if you have + specified in --path -pa during initialize)""") + + bowtieArgs.add_argument("--targetRNAIndex", "-ti", + help="""Provide the targetRNA bowtie2 index (Usually + resides in ~/db/CLASHChimeras or elsewhere if you have + specified in --path -pa during initialize)""") + + return parser + + +def getTophatArgs(): + parser = argparse.ArgumentParser(add_help=False) + + tophatArgs = parser.add_argument_group('Tophat arguments') + + tophatArgs.add_argument("--genomeIndex", "-gi", + help="""Provide the genome bowtie2 index (Usually + resides in ~/db/CLASHChimeras or elsewhere if you have + specified in --path during initialize)""") + + tophatArgs.add_argument("--transcriptomeIndex", "-tri", + help="""Provide the transcriptome index as specified + in tophat --transcriptome-index""") + + return parser + + +def getOutputArgs(): + parser = argparse.ArgumentParser(add_help=False) + + output = parser.add_argument_group('Output') + + output.add_argument("--output", "-o", + help="""The output name without extension (.sam .bam will + be added)""", + metavar='output prefix', + required=True) + + return parser + + +def getOptionalArgs(): + parser = argparse.ArgumentParser(add_help=False) + + optional = parser.add_argument_group('Optional arguments') + + optional.add_argument("--run", "-r", + help="Run the following aligner for raw reads", + default='bowtie2', + choices=['bowtie2', 'tophat']) + + optional.add_argument("--logLevel", "-l", + help="Set logging level", + default='INFO', + choices=['INFO', 'DEBUG', 'WARNING', 'ERROR']) + + optional.add_argument("--gzip", "-gz", action="store_true", + help="Whether your input file is gzipped") + + optional.add_argument("--bowtieExecutable", "-be", + help="""Provide bowtie2 executable if it's not present + in your path""") + + optional.add_argument("--tophatExecutable", "-te", + help="""Provide Tophat executable if it's not present in + your path""") + + optional.add_argument("--preset", "-p", default='sensitive-local', + choices=['very-fast', 'fast', 'sensitive', + 'very-sensitive', 'very-fast-local', 'fast-local', + 'sensitive-local', 'very-sensitive-local'], + help="Provide preset for bowtie2") + + optional.add_argument("--tophatPreset", "-tp", + choices=['very-fast', 'fast', 'sensitive', + 'very-sensitive'], + default='very-sensitive', + help="Provide preset for Tophat") + + optional.add_argument("--mismatch", "-m", type=int, + choices=[0, 1], default=1, + help="""Number of seed mismatches as represented in + bowtie2 as -N""") + + optional.add_argument("--reverseComplement", "-rc", + action="store_true", + help="""Align to reverse complement of reference as + represented in bowtie2 as --norc""") + + optional.add_argument("--unaligned", "-un", + action="store_true", + help="""Whether to keep unaligned reads in the output + sam file. Represented in bowtie2 as --no-unal""") + + optional.add_argument("--threads", "-n", default=1, + help="Specify the number of threads") + + return parser + + +def main(): + + parser = parseArguments() + + args = parser.parse_args() + + if args.logLevel == 'DEBUG': + logger = clashchimeras.log.debug_logger('root') + elif args.logLevel == 'WARNING': + logger = clashchimeras.log.warning_logger('root') + elif args.logLevel == 'ERROR': + logger = clashchimeras.log.error_logger('root') + else: + logger = clashchimeras.log.info_logger('root') + + argCheck = Arguments(args) + argCheck.validateAlign() + + if args.run == 'bowtie2': + b = Bowtie(args=args) + b.run(output='smallRNA') + s = SAM(fileName=b.outputSam) + filteredFasta = s.filterPotentialChimeras(target=b.outputSam) + b.run(output='targetRNA', filtered=filteredFasta) + + if args.run == 'tophat': + t = Tophat(args=args) + t.run() + +if __name__ == "__main__": + main() diff --git a/build/lib/clashchimeras/download.py b/build/lib/clashchimeras/download.py new file mode 100644 index 0000000..ef767b6 --- /dev/null +++ b/build/lib/clashchimeras/download.py @@ -0,0 +1,126 @@ +import argparse +import os +import textwrap + +import clashchimeras.log +from clashchimeras.initialize import Releases, Index + + +class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, + argparse.RawDescriptionHelpFormatter): + pass + + +def parseArguments(): + requiredArgs = getRequiredArgs() + optionalArgs = getOptionalArgs() + outputArgs = getOutputArgs() + + parser = argparse.ArgumentParser( + parents=[requiredArgs, outputArgs, optionalArgs], + formatter_class=CustomFormatter, + description=textwrap.dedent("""\ + Downloads required sequences and create bowtie2 indexes + required for alignment"""), + usage='An example usage is: %(prog)s -gor "H.sapiens" -mor hsa', + add_help=True) + + return parser + + +def getRequiredArgs(): + parser = argparse.ArgumentParser(add_help=False) + + required = parser.add_argument_group('Required arguments') + + required.add_argument("--gencodeOrganism", "-gor", + default="H.sapiens", choices=["H.sapiens", + "M.musculus"], + help="""Select model organism""", + required=True) + + required.add_argument("--mirbaseOrganism", "-mor", + default='hsa', + help="""Select organism to download microRNAs for""", + required=True) + + return parser + + +def getOutputArgs(): + parser = argparse.ArgumentParser(add_help=False) + + output = parser.add_argument_group('Output') + + output.add_argument("--path", "-pa", + help="""Location where all the database files and + indexes will be downloaded""", + default='~/db/CLASHChimeras', + metavar='path') + + return parser + + +def getOptionalArgs(): + parser = argparse.ArgumentParser(add_help=False) + + optional = parser.add_argument_group('Optional arguments') + + optional.add_argument("--logLevel", "-l", + help="Set logging level", + default='INFO', + choices=['INFO', 'DEBUG', 'WARNING', 'ERROR']) + + optional.add_argument("--bowtieExecutable", "-be", + help="""Provide bowtie2 executable if it's not present + in your path""") + + optional.add_argument("--tophatExecutable", "-te", + help="""Provide Tophat executable if it's not present in + your path""") + + optional.add_argument("--miRNA", "-mi", + choices=['mature', 'hairpin'], default='hairpin', + help="Which miRNA sequences to align") + + return parser + + +def main(): + parser = parseArguments() + + args = parser.parse_args() + + if args.logLevel == 'DEBUG': + logger = clashchimeras.log.debug_logger('root') + elif args.logLevel == 'WARNING': + logger = clashchimeras.log.warning_logger('root') + elif args.logLevel == 'ERROR': + logger = clashchimeras.log.error_logger('root') + else: + logger = clashchimeras.log.info_logger('root') + + r = Releases(gencodeOrganism=args.gencodeOrganism, + mirbaseOrganism=args.mirbaseOrganism, + path=os.path.expanduser(args.path)) + + i = Index(root=r.gencodePath) + i.create() + + logger.warn('Please find the indexes created listed below..') + logger.warn('Use them when you run align-for-chimeras') + + for _i in i.created: + logger.warn(_i) + + i = Index(root=r.mirbasePath) + i.create() + + logger.warn('Please find the indexes created listed below..') + logger.warn('Use them when you run align-for-chimeras') + + for _i in i.created: + logger.warn(_i) + +if __name__ == "__main__": + main() diff --git a/build/lib/clashchimeras/find.py b/build/lib/clashchimeras/find.py new file mode 100644 index 0000000..025a0d3 --- /dev/null +++ b/build/lib/clashchimeras/find.py @@ -0,0 +1,218 @@ +import argparse +import textwrap +import sys + +from colors import yellow + +import clashchimeras.log +import clashchimeras.methods +from clashchimeras.parsers import GFF, GTF, Output, SAM +from clashchimeras.initialize import Arguments + + +class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, + argparse.RawDescriptionHelpFormatter): + pass + + +def parseArguments(): + requiredArgs = getRequiredArgs() + optionalArgs = getOptionalArgs() + outputArgs = getOutputArgs() + + parser = argparse.ArgumentParser( + parents=[requiredArgs, outputArgs, optionalArgs], + formatter_class=CustomFormatter, + + usage='An example usage is: %(prog)s -s smallRNA.sam -t ' + 'targetRNA.sam -o output', + + description=textwrap.dedent('''\ + Given two SAM files, this script tries to find chimeras that + are observed between a smallRNA and a targetRNA'''), + + add_help=True) + + return parser + + +def getRequiredArgs(): + parser = argparse.ArgumentParser(add_help=False) + + required = parser.add_argument_group('Required arguments') + + required.add_argument('--smallRNA', '-s', + help='Provide smallRNA alignment SAM file', + metavar='SAM file', + required=True) + required.add_argument('--targetRNA', '-t', + help='Provide targetRNA alignment SAM file', + metavar='SAM file', + required=True) + + required.add_argument("--getGenomicLocationsSmallRNA", "-ggs", + help="Do you want genomic locations for small RNA?", + action="store_true") + + required.add_argument("--getGenomicLocationsTargetRNA", "-ggt", + help="Do you want genomic locations for target RNA?", + action="store_true") + + required.add_argument("--smallRNAAnnotation", "-sa", + action="store", + help="""Provide smallRNA annotation gtf(from Gencode) + or gff3(from Mirbase). Only provide gtf from Gencode or + gff3 from Mirbase. Does not support other gtf files""") + + required.add_argument("--targetRNAAnnotation", "-ta", + action="store", + help="""Provide targetRNA annotation gtf(from Gencode). + Only provide gtf from Gencode. Does not support other + gtf files""") + + return parser + + +def getOutputArgs(): + parser = argparse.ArgumentParser(add_help=False) + + output = parser.add_argument_group('Output') + + output.add_argument("--output", "-o", + help="""The output name without extension (.bed .csv will + be added)""", + metavar='output prefix', + required=True) + + return parser + + +def getOptionalArgs(): + parser = argparse.ArgumentParser(add_help=False) + + optional = parser.add_argument_group('Optional arguments') + + optional.add_argument("--overlap", "-ov", + help="""Maximum overlap to be set between two + molecules when determining chimeras""", + default=4) + + optional.add_argument("--gap", "-ga", + help="""Maximum gap (number of unaligned nucleotides) to + allowed between two molecules within a chimera""", + default=9) + + optional.add_argument("--logLevel", "-l", + help="Set logging level", + default='INFO', + choices=['INFO', 'DEBUG', 'WARNING', 'ERROR']) + + return parser + + +def main(): + """The main function to execute when you run the program as a script + """ + parser = parseArguments() + + args = parser.parse_args() + + if args.logLevel == 'DEBUG': + logger = clashchimeras.log.debug_logger('root') + elif args.logLevel == 'WARNING': + logger = clashchimeras.log.warning_logger('root') + elif args.logLevel == 'ERROR': + logger = clashchimeras.log.error_logger('root') + else: + logger = clashchimeras.log.info_logger('root') + + argCheck = Arguments(args, type='find') + argCheck.validateFind() + + smallRNA = SAM(fileName=args.smallRNA) + smallRNA.read() + + targetRNA = SAM(fileName=args.targetRNA) + targetRNA.read() + + if args.getGenomicLocationsSmallRNA: + + if args.smallRNAAnnotation.endswith('gff3'): + + smallRNAAnnotation = GFF(fileName=args.smallRNAAnnotation) + smallRNAAnnotation.read() + + elif args.smallRNAAnnotation.endswith('gtf'): + smallRNAAnnotation = GTF(fileName=args.smallRNAAnnotation) + if 'tRNAs' in args.smallRNAAnnotation: + smallRNAAnnotation.read(featureType='tRNAscan') + else: + smallRNAAnnotation.read() + + if args.getGenomicLocationsTargetRNA: + if args.targetRNAAnnotation.endswith('gtf'): + targetRNAAnnotation = GTF(fileName=args.targetRNAAnnotation) + targetRNAAnnotation.read() + + output = Output(target=args.output, + smallRNABed=args.getGenomicLocationsSmallRNA, + targetRNABed=args.getGenomicLocationsTargetRNA, + overlap=args.overlap, + gap=args.gap) + + common = set(smallRNA.records.keys()).intersection(targetRNA.records.keys()) + + logger.info('Determining chimeras') + + for index, queryName in enumerate(common, start=1): + smallRNARecord = smallRNA.access(queryName) + targetRNARecord = targetRNA.access(queryName) + + chimeraString = clashchimeras.methods.chimeraOrNot( + smallRNARecord.cigarString, targetRNARecord.cigarString, + overlap=args.overlap, gap=args.gap) + + if chimeraString: + + if args.getGenomicLocationsSmallRNA: + + try: + smallRNALocation = smallRNAAnnotation.coordinates( + smallRNARecord.refId, + start=smallRNARecord.start, + end=smallRNARecord.end) + except IndexError as e: + logger.error(e) + logger.error('Are you sure you have the correct annotation file for ' + 'smallRNA?') + logger.error('Please check --smallRNAAnnotation -sa and ' + 'and make sure they are matching with the alignment ' + 'file {}'.format(args.smallRNA)) + sys.exit() + output.writeSmallRNABed(queryName, smallRNALocation, + targetRNARecord.refId) + + if args.getGenomicLocationsTargetRNA: + try: + targetRNALocation = targetRNAAnnotation.coordinates( + targetRNARecord.refId, + start=targetRNARecord.start, + end=targetRNARecord.end) + except IndexError as e: + logger.error(e.msg) + logger.error('Are you sure you have the correct annotation file for ' + 'targetRNA?') + logger.error('Please check --targetRNAAnnotation -ta and ' + 'and make sure they are matching with the alignment ' + 'file {}'.format(args.targetRNA)) + sys.exit() + + output.writeTargetRNABed(queryName, targetRNALocation, + smallRNARecord.refId) + + output.write(queryName, smallRNARecord, targetRNARecord) + + logger.info('Determining chimeras finished') + +if __name__ == "__main__": + main() diff --git a/build/lib/clashchimeras/initialize.py b/build/lib/clashchimeras/initialize.py new file mode 100644 index 0000000..17e275f --- /dev/null +++ b/build/lib/clashchimeras/initialize.py @@ -0,0 +1,759 @@ +import glob +import gzip +import hashlib +import logging +import os +import re +import subprocess +import sys +import textwrap +import tempfile +from collections import defaultdict + +import ftputil +import pandas as pd +import requests +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord +from colors import green +from bs4 import BeautifulSoup +from tabulate import tabulate + +from clashchimeras.parsers import Fasta + +logger = logging.getLogger('root') + + +class Releases: + + def __init__(self, gencodeOrganism='H.sapiens', mirbaseOrganism='hsa', + path=os.path.join(os.path.expanduser("~"), '.CLASHchimeras')): + organisms = {'H.sapiens': 'http://www.gencodegenes.org/releases/', + 'M.musculus': 'http://www.gencodegenes.org/mouse_releases/'} + shortOrganisms = {'H.sapiens': 'human', + 'M.musculus': 'mouse'} + self.gencodeOrganism = gencodeOrganism + self.mirbaseOrganism = mirbaseOrganism + self.path = path + self.url = organisms[self.gencodeOrganism] + self.gencodeShortOrganism = shortOrganisms[self.gencodeOrganism] + self.mirbaseUrl = 'ftp://mirbase.org/pub/mirbase/CURRENT/README' + self.df = self.openUrl() + self.mirbaseDf = self.openMirbaseReadme() + + self.selectGencodeRelease() + + self.selectMirbaseRelease() + + g = Gencode(release=self.gencodeRelease, + path=self.gencodePath, + ftpDir=self.gencodeFtpDir) + + g.download() + + m = Mirbase(release=self.mirbaseRelease, + path=self.mirbasePath, + ftpDir=self.mirbaseFtpDir, + organism=self.mirbaseOrganism) + + m.download() + + m.process() + + def openUrl(self): + r = requests.get(self.url) + data = r.text + soup = BeautifulSoup(data) + table = soup.find("table") + headings = [th.get_text().strip() for th in table.find("tr").find_all("th")] + dataset = defaultdict(dict) + for index, row in enumerate(table.find_all("tr")[1:]): + for i, j in zip(headings, + (td.get_text().strip() for td in row.find_all("td"))): + dataset[index][i] = j + + return pd.DataFrame(dataset).transpose() + + def openMirbaseReadme(self): + with ftputil.FTPHost('mirbase.org', 'anonymous', 'anonymous') as fH: + fobj = fH.open('pub/mirbase/CURRENT/README') + store = False + dataset = defaultdict(dict) + index = 0 + for line in fobj.readlines(): + if store: + row = line.strip().split() + if len(row) == 3 and row[1][0].isdigit(): + dataset[index]['Version'] = row[0] + dataset[index]['Date'] = row[1] + dataset[index]['Entries'] = row[2] + index += 1 + if 'HISTORY' in line: + store = True + + return pd.DataFrame(dataset).transpose() + + def downloadMirbaseRelease(self, key): + logger.warn('Are you sure??') + decision = input('Y/n: ') + if decision.lower() == 'y' or decision == '': + + self.mirbaseRelease = self.mirbaseDf.loc[int(key)]['Version'] + + os.makedirs(os.path.join(self.path, 'Mirbase', + str(self.mirbaseRelease)), exist_ok=True) + + self.mirbasePath = os.path.join(self.path, 'Mirbase', + str(self.mirbaseRelease)) + + self.mirbaseFtpDir = os.path.join('pub/mirbase', self.mirbaseRelease) + + return True + + def downloadGencodeRelease(self, key): + logger.warn('Are you sure??') + decision = input('Y/n: ') + if decision.lower() == 'y' or decision == '': + + self.gencodeRelease = self.df.loc[int(key)]['GENCODE release'] + + os.makedirs(os.path.join(self.path, 'Gencode', self.gencodeOrganism, + str(self.gencodeRelease)), exist_ok=True) + + self.gencodePath = os.path.join(self.path, 'Gencode', + self.gencodeOrganism, + str(self.gencodeRelease)) + + self.gencodeFtpDir = os.path.join('pub/gencode/Gencode_' + + self.gencodeShortOrganism, + 'release_' + self.gencodeRelease) + + return True + + def selectGencodeRelease(self): + cols = self.df.columns + + logger.info("Select the release that you want \n{}".format( + tabulate(self.df[[cols[2], cols[0], + cols[1], cols[3]]], + headers=['Index', cols[2], cols[0], cols[1], cols[3]], + tablefmt="simple"))) + + logger.warn('Please bare in mind that the automatic download relies on ' + 'regex search which is known to work for Gencode release 17 ' + 'and higher ') + + releaseKeys = self.df.index + + while True: + logger.warn('Please select which Gencode release to use (select index ' + + '[{}]): '.format(min(self.df.index))) + key = input('Index: ') + if key.isdigit() and int(key) in releaseKeys: + logger.warn('This will download the Gencode release {} which ' + 'corresponds to Ensembl release {} and Genome assembly ' + 'version {} freezed on {}'.format( + self.df.loc[int(key)]['GENCODE release'], + self.df.loc[int(key)]['Ensembl release'], + self.df.loc[int(key)]['Genome assembly version'], + self.df.loc[int(key)]['Freeze date *'])) + if self.downloadGencodeRelease(key): + break + + elif key == '': + logger.warn('This will download the latest Gencode release {} which ' + 'corresponds to Ensembl release {} and Genome assembly ' + 'version {} freezed on {}'.format( + self.df.loc[min(self.df.index)]['GENCODE release'], + self.df.loc[min(self.df.index)]['Ensembl release'], + self.df.loc[min(self.df.index)]['Genome assembly ' + 'version'], + self.df.loc[min(self.df.index)]['Freeze date *'])) + if self.downloadGencodeRelease(min(self.df.index)): + break + else: + logger.warn('Not a valid release index:') + + def selectMirbaseRelease(self): + cols = self.mirbaseDf.columns + + logger.info("Select the Mirbase release you want \n{}".format( + tabulate(self.mirbaseDf, headers=['Index', cols[0], cols[1], + cols[2]], tablefmt="simple"))) + + releaseKeys = self.mirbaseDf.index + + while True: + logger.warn('Please select which Mirbase release to use (select index ' + + '[{}]): '.format(max(self.mirbaseDf.index))) + key = input('Index: ') + if key.isdigit() and int(key) in releaseKeys: + logger.warn('This will download miRBase release {} which contains {} ' + 'entries and released on {}'.format( + self.mirbaseDf.loc[int(key)]['Version'], + self.mirbaseDf.loc[int(key)]['Entries'], + self.mirbaseDf.loc[int(key)]['Date'])) + if self.downloadMirbaseRelease(key): + break + + elif key == '': + logger.warn('This will download miRBase release {} which contains {} ' + 'entries and released on {}'.format( + self.mirbaseDf.loc[max(self.mirbaseDf.index)]['Version'], + self.mirbaseDf.loc[max(self.mirbaseDf.index)]['Entries'], + self.mirbaseDf.loc[max(self.mirbaseDf.index)]['Date'])) + if self.downloadMirbaseRelease(max(self.mirbaseDf.index)): + break + + else: + logger.warn('Not a valid release index:') + + +class Arguments: + + def __init__(self, args, type=None): + self.args = args + if type == 'align': + if args.genomeIndex: + self.args.genomeIndex = os.path.expanduser(args.genomeIndex) + if args.transcriptomeIndex: + self.args.transcriptomeIndex = os.path.expanduser( + args.transcriptomeIndex) + if args.input: + self.args.input = os.path.expanduser(args.input) + if args.smallRNAIndex: + self.args.smallRNAIndex = os.path.expanduser(args.smallRNAIndex) + if self.args.targetRNAIndex: + self.args.targetRNAIndex = os.path.expanduser(args.targetRNAIndex) + + if args.output: + self.args.output = os.path.expanduser(args.output) + + if type == 'find': + if args.smallRNAAnnotation: + self.args.smallRNAAnnotation = os.path.expanduser( + args.smallRNAAnnotation) + if args.targetRNAAnnotation: + self.args.targetRNAAnnotation = os.path.expanduser( + args.targetRNAAnnotation) + if args.smallRNA: + self.args.smallRNA = os.path.expanduser(args.smallRNA) + if args.targetRNA: + self.args.targetRNA = os.path.expanduser(args.targetRNA) + + self.exit = False + + def hash(self, file): + sha256 = hashlib.sha256() + with open(file, 'r+b') as f: + while True: + buf = f.read(8192) + if not buf: + break + sha256.update(buf) + return sha256.hexdigest() + + def validateFind(self): + if not os.path.exists(self.args.smallRNA): + logger.error('{} not found...'.format(self.args.smallRNA)) + self.exit = True + if not os.path.exists(self.args.targetRNA): + logger.error('{} not found...'.format(self.args.smallRNA)) + self.exit = True + if self.args.getGenomicLocationsSmallRNA: + if not self.args.smallRNAAnnotation: + logger.error('--smallRNAAnnotation -sa not specified. If you want ' + 'genomic locations for smallRNA, please enable ' + '--getGenomicLocationsSmallRNA -ggs and specify ' + 'annotation file using --smallRNAAnnotation -sa') + self.exit = True + else: + if not os.path.exists(self.args.smallRNAAnnotation): + logger.error('{} not found'.format(self.args.smallRNAAnnotation)) + self.exit = True + if self.args.getGenomicLocationsTargetRNA: + if not self.args.targetRNAAnnotation: + logger.error('--targetRNAAnnotation -ta not specified. If you want ' + 'genomic locations for targetRNA, please enable ' + '--getGenomicLocationsTargetRNA -ggt and specify ' + 'annotation file using --targetRNAAnnotation -ta') + self.exit = True + else: + if not os.path.exists(self.args.targetRNAAnnotation): + logger.error('{} not found...'.format(self.args.targetRNAAnnotation)) + self.exit = True + if self.hash(self.args.smallRNA) == self.hash(self.args.targetRNA): + logger.error('CLASHChimeras does not detect chimeras between the same ' + 'RNA type yet... Please hang in there, we are planning it ' + 'in the feature') + self.exit = True + + if self.exit: + logger.error('Exiting because of the above errors...') + sys.exit() + elif self.hash(self.args.smallRNA) == self.hash(self.args.targetRNA): + logger.error('CLASHChimeras does not detect chimeras between the same ' + 'RNA type yet... Please hang in there, we are planning it ' + 'in the feature') + sys.exit() + + def validateAlign(self): + if not os.path.exists(self.args.input): + logger.error('{} not found...'.format(self.args.input)) + self.exit = True + if self.args.run == 'bowtie2' and (not self.args.smallRNAIndex or + not self.args.targetRNAIndex): + logger.error('{}'.format('Please specify --smallRNAIndex -si and ' + + '--targetRNAIndex -ti properly...')) + self.exit = True + if self.args.smallRNAIndex: + bts = glob.glob(self.args.smallRNAIndex + '*.bt2') + if not len(bts) >= 6: + logger.error('Something wrong with the {}'.format( + self.args.smallRNAIndex)) + logger.error("Can't find all the .bt2 files for that index") + self.exit = True + if self.args.targetRNAIndex: + bts = glob.glob(self.args.targetRNAIndex + '*.bt2') + if not len(bts) >= 6: + logger.error('Something wrong with the {}'.format( + self.args.targetRNAIndex)) + logger.error("Can't find all the .bt2 files for that index") + self.exit = True + if self.args.genomeIndex: + bts = glob.glob(self.args.genomeIndex.strip() + '*.bt2') + if not len(bts) >= 6: + logger.error('Something wrong with the {}'.format( + self.args.genomeIndex)) + logger.error("Can't find all the .bt2 files for that index") + self.exit = True + if self.args.run == 'tophat' and not self.args.genomeIndex: + logger.error('Please provide genome index if you want to use tophat..') + self.exit = True + if self.exit: + sys.exit() + + +class Gencode: + """This class makes sure the required files (sequences, annotation etc.) are + present and in working order. It generates sha256 checksums and autodownloads + the required files from Gencode Genes. + """ + + def __init__(self, release=None, user="anonymous", password=None, + path=None, ftpDir=None): + + self.downloaded = [] + self.release = release + self.reList = ['^GRC.+\.genome\.fa', + '^gencode.+chr_patch_hapl_scaff\.annotation\.gtf', + '^gencode.+pc_transcripts\.fa', + '^gencode.+tRNAs\.gtf', + '^gencode.+lncRNA_transcripts\.fa'] + + self.host = "ftp.sanger.ac.uk" + self.user = user + self.path = path + + self.files = [] + self.otherFiles = {} + biotypes = ['snRNA', 'snoRNA', 'misc_RNA', 'tRNA'] + for b in biotypes: + self.otherFiles[b] = 'gencode.v{}.{}_transcripts.fa'.format(self.release, + b) + if self.user == "anonymous": + self.password = "anonymous" + else: + self.password = password + + ftp_host = ftputil.FTPHost(self.host, self.user, self.password) + + self.dir = ftpDir + fileList = ftp_host.listdir(self.dir) + for file in fileList: + for rEx in self.reList: + if re.match(rEx, file): + if not 'primary_assembly' in file: + self.files.append(file) + ftp_host.close() + _files = self.files.copy() + _otherFiles = self.otherFiles.copy() + + for file in _files: + logger.debug('Checking %s' % file) + _file = file.rpartition(".")[0] + if os.path.exists(os.path.join(self.path, _file + '.sha256sum')) and \ + os.path.exists(os.path.join(self.path, _file)): + with open(os.path.join(self.path, _file + '.sha256sum')) as f: + s = f.readline().rstrip() + _s = self.hash(os.path.join(self.path, _file)) + if s == _s: + logger.info("%s is present and verified" % _file) + self.files.remove(file) + else: + logger.warn('%s is downloaded but doesnt match the sha256sum' % _file) + logger.error('Will be downloaded again') + else: + logger.warn('%s will be downloaded' % _file) + + for biotype, file in _otherFiles.items(): + logger.debug('Checking %s' % file) + _file = file + if os.path.exists(os.path.join(self.path, _file + '.sha256sum')) and \ + os.path.exists(os.path.join(self.path, _file)): + with open(os.path.join(self.path, _file + '.sha256sum')) as f: + s = f.readline().rstrip() + _s = self.hash(os.path.join(self.path, _file)) + if s == _s: + logger.info("%s is present and verified" % file) + self.otherFiles.pop(biotype) + else: + logger.warn('%s is generated but doesnt match the sha256sum' % file) + logger.error('Will be generated again') + else: + logger.warn('%s will be generated' % file) + + def download(self): + + if len(self.files) == 0: + logger.info('Gencode files are downloaded and checksum verified...') + else: + with ftputil.FTPHost(self.host, self.user, self.password) as fH: + for file in self.files: + logger.info('Downloading %s from ftp.sanger.ac.uk' % file) + fH.download(self.dir + '/' + file, os.path.join(self.path, file), + callback=None) + self.downloaded.append(file) + _file = file.rpartition(".")[0] + p = subprocess.Popen(['gunzip', os.path.join(self.path, file)]) + p.communicate() + sha256sum = self.hash(os.path.join(self.path, _file)) + with open(os.path.join(self.path, _file + '.sha256sum'), 'w') as wH: + print(sha256sum, file=wH) + logger.info('Downloading, extraction and hashing of %s finished' % + file) + + gtfFiles = glob.glob(os.path.join(self.path, "*.annotation.gtf")) + + if len(gtfFiles) == 0: + logger.warn('This release does not contain annotation file or they are ' + 'not grabbed by regex. Please download it manually from {' + '}'.format(self.dir)) + gtfFile = None + elif len(gtfFiles) == 2: + gtfFiles.sort() + gtfFile = gtfFiles[1] + elif len(gtfFiles) == 1: + gtfFile = gtfFiles[0] + + genomeFiles = glob.glob(os.path.join(self.path, "*.genome.fa")) + + if len(genomeFiles) == 0: + logger.warn('This release does not contain genome file or they are not ' + 'grabbed by regex. Please download it manually from {' + '}'.format(self.dir)) + genomeFile = None + else: + genomeFile = genomeFiles[0] + + tRNAgtfFiles = glob.glob(os.path.join(self.path, "*.tRNAs.gtf")) + + if len(tRNAgtfFiles) == 0: + logger.warn(('This release does not contain tRNA annotation file or they ' + 'are not grabbed by regex. Please download it manually from {' + '}'.format(self.dir))) + tRNAgtfFile = None + else: + tRNAgtfFile = tRNAgtfFiles[0] + + if len(self.otherFiles) == 0: + logger.info('Other fasta files are generated and checksum verified...') + else: + for biotype, file in self.otherFiles.items(): + if biotype == 'tRNA' and (tRNAgtfFile and genomeFile): + fasta = Fasta(genome=genomeFile, gtf=tRNAgtfFile) + fasta.getBiotype(biotype='tRNA', output=os.path.join(self.path, + file)) + sha256sum = self.hash(os.path.join(self.path, file)) + with open(os.path.join(self.path, file + '.sha256sum'), 'w') as wH: + print(sha256sum, file=wH) + logger.info('Extraction and hashing of %s finished' % file) + elif gtfFile and genomeFile: + fasta = Fasta(genome=genomeFile, gtf=gtfFile) + fasta.getBiotype(biotype=biotype, output=os.path.join(self.path, + file)) + sha256sum = self.hash(os.path.join(self.path, file)) + with open(os.path.join(self.path, file + '.sha256sum'), 'w') as wH: + print(sha256sum, file=wH) + logger.info('Extraction and hashing of %s finished' % file) + + def hash(self, file): + sha256 = hashlib.sha256() + with open(file, 'r+b') as f: + while True: + buf = f.read(8192) + if not buf: + break + sha256.update(buf) + return sha256.hexdigest() + + +class Mirbase: + """This class makes sure that requried files (sequences, annotations etc.) + from Mirbase are downloaded. It generates sha256 checksums and autodownloads + the required files + """ + + def __init__(self, release=None, user="anonymous", password=None, + path=os.path.join(os.path.expanduser("~"), '.CLASHchimeras'), + organism='hsa', + ftpDir=None): + self.reList = ['^hairpin\.fa\.gz', '^mature\.fa\.gz'] + organisms = {} + with ftputil.FTPHost('mirbase.org', 'anonymous', 'anonymous') as fH: + tH, tP = tempfile.mkstemp() + fH.download(os.path.join('pub/mirbase', release, 'organisms.txt.gz'), tP) + for i in gzip.open(tP): + ii = i.decode('utf-8').strip() + if not ii.startswith('#'): + organisms[ii.split("\t")[0]] = ii.split("\t")[2] + + self.shortOrganism = organism + self.organism = organisms.get(self.shortOrganism, None) + if not self.organism: + logger.error('{} is not a valid organism name in Mirbase'.format( + self.shortOrganism)) + sys.exit() + self.host = 'mirbase.org' + self.release = release + self.user = user + self.path = path + self.files = [] + + if self.user == "anonymous": + self.password = "anonymous" + else: + self.password = password + + ftp_host = ftputil.FTPHost(self.host, self.user, self.password) + + self.dir = ftpDir + + fileList = ftp_host.listdir(self.dir) + + self.files.append('genomes/' + self.shortOrganism + '.gff3') + for file in fileList: + for rEx in self.reList: + if re.match(rEx, file): + self.files.append(file) + ftp_host.close() + _files = self.files.copy() + + for file in _files: + logger.debug('Checking %s' % file) + if file.endswith('gz'): + _file = file.rpartition(".")[0] + elif '/' in file: + _file = file.rpartition("/")[2] + if os.path.exists(os.path.join(self.path, _file + '.sha256sum')) and \ + os.path.exists(os.path.join(self.path, _file)): + with open(os.path.join(self.path, _file + '.sha256sum')) as f: + s = f.readline().rstrip() + _s = self.hash(os.path.join(self.path, _file)) + if s == _s: + logger.info("%s is present and verified" % file) + self.files.remove(file) + else: + logger.warn('%s is downloaded but doesnt match the sha256sum' % file) + logger.error('Must run download method') + else: + logger.warn('%s will be downloaded when you run download method' % file) + + def download(self): + + if len(self.files) == 0: + logger.debug('Mirbase files are downloaded and checksum verified...') + else: + with ftputil.FTPHost(self.host, self.user, self.password) as fH: + for file in self.files: + logger.info('Downloading %s from mirbase.org' % file) + if "/" in file: + _file = file.rpartition("/")[2] + else: + _file = file + fH.download(self.dir + '/' + file, os.path.join(self.path, _file), + callback=None) + if _file.endswith('.gz'): + __file = _file.rpartition(".")[0] + p = subprocess.Popen(['gunzip', os.path.join(self.path, __file)]) + p.communicate() + else: + __file = _file + sha256sum = self.hash(os.path.join(self.path, __file)) + with open(os.path.join(self.path, __file + '.sha256sum'), 'w') as wH: + print(sha256sum, file=wH) + logger.debug('Downloading, extraction and hashing of %s finished' % + file) + self.files = [] + + def hash(self, file): + sha256 = hashlib.sha256() + with open(file, 'r+b') as f: + while True: + buf = f.read(8192) + if not buf: + break + sha256.update(buf) + return sha256.hexdigest() + + def process(self): + files = ['hairpin.fa', 'mature.fa'] + for file in files: + temp = [] + if os.path.exists(os.path.join(self.path, self.shortOrganism + '-' + file)) \ + and os.path.exists(os.path.join(self.path, + self.shortOrganism + '-' + file + '.sha256sum')): + with open(os.path.join(self.path, file + '.sha256sum')) as f: + s = f.readline().rstrip() + _s = self.hash(os.path.join(self.path, file)) + if s == _s: + logger.info("%s is present and verified" % file) + else: + logger.warn('%s is downloaded but doesnt match the sha256sum' % file) + else: + logger.debug('Extrating %s sequences from %s' % (self.organism, file)) + with open(os.path.join(self.path, file)) as iH: + for rec in SeqIO.parse(iH, 'fasta'): + if self.organism in rec.description: + _temp_rec = SeqRecord(id=rec.id, description=rec.description, + seq=rec.seq.back_transcribe()) + temp.append(_temp_rec) + SeqIO.write(temp, os.path.join(self.path, self.shortOrganism + '-' + file), + 'fasta') + with open(os.path.join(self.path, + self.shortOrganism + '-' + file + '.sha256sum'), 'w') as wH: + print(self.hash(os.path.join(self.path, + self.shortOrganism + '-' + file + '.sha256sum')), file=wH) + + +class Index: + + def __init__(self, root=os.path.join(os.path.expanduser("~"), + '.CLASHchimeras'), bowtieExecutable=None, tophatExecutable=None): + + self.created = [] + + if bowtieExecutable is not None: + logger.info('Bowtie2 path {} provided and will be used' + .format(bowtieExecutable)) + self.bowtieExecutable = bowtieExecutable + else: + try: + bowtieVersion = float(subprocess.check_output(['bowtie2', '--version'] + ).decode('utf-8').split("\n")[0].split()[-1].rpartition(".")[0]) + fullBowtieVersion = subprocess.check_output(['bowtie2', '--version'] + ).decode('utf-8').split("\n")[0].split()[-1] + if bowtieVersion < 2.2: + logger.warn("Please update bowtie2, you can download it from") + logger.warn("http://sourceforge.net/projects/bowtie-bio/files/bowtie2/") + else: + logger.info("Bowtie2 version {} found and will be used" + .format(fullBowtieVersion)) + self.bowtieExecutable = 'bowtie2' + except OSError as e: + if e.errno == os.errno.ENOENT: + logger.error(textwrap.fill(""" + Can't find Bowtie2 in your path. Please install it from + http://sourceforge.net/projects/bowtie-bio/files/bowtie2/ + """)) + else: + logger.error('Something wrong with your bowtie2 command') + + if tophatExecutable is not None: + logger.info('Tophat2 path {} provided and will be used' + .format(tophatExecutable)) + self.tophatExecutable = tophatExecutable + else: + try: + tophatVersion = float(subprocess.check_output(['tophat', + '--version']).decode('utf-8').split(" v")[-1].rpartition(".")[0]) + fullTophatVersion = subprocess.check_output(['tophat', + '--version']).decode('utf-8').split(" v")[-1].rstrip() + if tophatVersion < 2: + logger.warn("Please update tophat, you can download it from") + logger.warn("http://ccb.jhu.edu/software/tophat/index.shtml") + else: + logger.info("Tophat version {} found and will be used" + .format(fullTophatVersion)) + self.tophatExecutable = 'tophat' + except OSError as e: + if e.errno == os.errno.ENOENT: + logger.error("Can't find Tophat in your path. Please install it from") + logger.error("http://ccb.jhu.edu/software/tophat/index.shtml") + else: + logger.error('Something wrong with your tophat command') + + self.root = root + + def create(self): + + stdoutHandle = open(os.path.join(self.root, 'CLASHChimeras-Index.stdout'), + 'w') + stderrHandle = open(os.path.join(self.root, 'CLASHChimeras-Index.stderr'), + 'w') + indexCreated = False + for fa in glob.glob(os.path.join(self.root, '*.fa')): + indexName = fa.rpartition(".fa")[0] + self.created.append('{} - {}'.format(indexName, green('Bowtie2', + style='bold'))) + if 'genome.fa' in fa: + genomeIndex = fa.rpartition(".")[0] + if len(glob.glob(os.path.join(self.root, indexName + '*.bt2'))) == 6: + logger.info('Bowtie2 index for {} found'.format(fa)) + else: + try: + logger.info('Generating bowtie2 index for {}'.format(fa)) + indexCreated = True + p = subprocess.Popen([self.bowtieExecutable + '-build', fa, indexName], + stdout=stdoutHandle, stderr=stderrHandle) + p.communicate() + logger.info('Bowtie2 index for {} is generated'.format(fa)) + except OSError as e: + if e.errno == os.errno.ENOENT: + logger.error("Can't find bowtie2-build in your path...") + logger.error("Please make sure bowtie2 is in your path and") + logger.error("bowtie2-build can run from your shell") + else: + logger.error("Something wrong with your bowtie2-build command") + + if indexCreated: + logger.warn('The stdout and stderr for bowtie2-build are stored in') + logger.warn('CLASHChimeras-Index.stdout and CLASHChimeras-Index.stderr') + + for gtf in glob.glob(os.path.join(self.root, '*.annotation.gtf')): + indexName = gtf.rpartition("/")[-1].rpartition(".")[0] + self.created.append('{} - {}'.format(gtf.rpartition(".")[0], green( + 'Tophat2 Transcriptome', style='bold'))) + if len(glob.glob(os.path.join(self.root, indexName + '*.bt2'))) == 6: + logger.info('Tophat2 transcriptome index found for {}'.format(gtf)) + else: + try: + indexCreated = True + logger.info('Generating tophat transcriptome index for {}' + .format(gtf)) + p = subprocess.Popen([self.tophatExecutable, '-G', gtf, + '--transcriptome-index', os.path.join( + self.root, indexName), + genomeIndex], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + p.communicate() + logger.info('Tophat2 transcriptome index generated for {}' + .format(gtf)) + except OSError as e: + if e.errno == os.errno.ENOENT: + logger.error("Can't find tophat in your path...") + logger.error("Please make sure tophat is in your path") + else: + logger.error("Something wrong with your tophat command") + + stdoutHandle.close() + stderrHandle.close() diff --git a/build/lib/clashchimeras/log.py b/build/lib/clashchimeras/log.py new file mode 100644 index 0000000..2a6b0ed --- /dev/null +++ b/build/lib/clashchimeras/log.py @@ -0,0 +1,30 @@ +import logging +import coloredlogs + + +def debug_logger(name): + logger = logging.getLogger(name) + coloredlogs.install(level=logging.DEBUG, show_name=False, + show_hostname=True) + return logger + + +def info_logger(name): + logger = logging.getLogger(name) + coloredlogs.install(level=logging.INFO, show_name=False, + show_hostname=True) + return logger + + +def warning_logger(name): + logger = logging.getLogger(name) + coloredlogs.install(level=logging.WARNING, show_name=False, + show_hostname=True) + return logger + + +def error_logger(name): + logger = logging.getLogger(name) + coloredlogs.install(level=logging.ERROR, show_name=False, + show_hostname=True) + return logger diff --git a/build/lib/clashchimeras/methods.py b/build/lib/clashchimeras/methods.py new file mode 100644 index 0000000..c52d45e --- /dev/null +++ b/build/lib/clashchimeras/methods.py @@ -0,0 +1,85 @@ +import re + +from itertools import repeat + + +def convertCigar(cigarString): + cigarAttributes = re.findall("\D", cigarString) + cigarNumbers = re.findall("\d+", cigarString) + convertedCigarString = "" + for index, attribute in enumerate(cigarAttributes): + for i in repeat(attribute, int(cigarNumbers[index])): + convertedCigarString += i + return convertedCigarString + + +def chimeraOrNot(bitOne, bitTwo, overlap=4, gap=9): + combinedCigar = "" + for i, j in zip(bitOne, bitTwo): + if i == "M" and j == "M": + combinedCigar += "=" + elif i == "S" and j == "S": + combinedCigar += "#" + elif i == "I" and j == "D": + combinedCigar += "-" + elif i == "D" and j == "I": + combinedCigar += "-" + elif i == "M" and j != "M": + combinedCigar += "{" + elif i != "M" and j == "M": + combinedCigar += "}" + elif i != "M" and j == "D": + combinedCigar += "-" + elif i == "D" and j != "M": + combinedCigar += "-" + elif i == "I" and j != "M": + combinedCigar += "+" + elif i != "M" and j == "I": + combinedCigar += "+" + else: + combinedCigar += "*" + numberOfMs = len(list(filter(lambda x: x == "=", combinedCigar))) + curlyStartStart = combinedCigar.find("{") + curlyStartEnd = combinedCigar.rfind("{") + curlyEndStart = combinedCigar.find("}") + curlyEndEnd = combinedCigar.rfind("}") + listCurlies = sorted([[curlyStartStart, curlyStartEnd], [curlyEndStart, + curlyEndEnd]]) + matches = combinedCigar.count("{") + combinedCigar.count("}") + \ + combinedCigar.count("=") + combinedCigar.count("-") + \ + combinedCigar.count("+") + if numberOfMs <= overlap and abs(listCurlies[0][1] - listCurlies[1][0]) \ + <= gap and matches >= (0.75 * len(combinedCigar)): + return combinedCigar + else: + return None + + +def findRegion(record_data): + if 'ENS' in record_data.refId and '|' in record_data.refId: + reference_id = record_data.refId + reference_start = int(record_data.start) + match_length = record_data.matchLength + regions = reference_id.split("|")[7:-1] + match_start = reference_start + match_end = reference_start + match_length + mixed = [] + + for region in regions: + region_name = region.split(":")[0] + region_start = int(region.split(":")[-1].split("-")[0]) + region_end = int(region.split(":")[-1].split("-")[-1]) + if match_start >= region_start and match_end <= region_end: + mixed.append(region_name) + break + if match_start >= region_start and match_start <= region_end: + mixed.append(region_name) + if match_end >= region_start and match_end <= region_end: + mixed.append(region_name) + if match_start <= region_start and match_end >= region_end: + mixed.append(region_name) + return '{}|{}|{}'.format(reference_id.split("|")[0], + reference_id.split("|")[5], + "-".join(mixed)) + else: + return record_data.refId diff --git a/build/lib/clashchimeras/parsers.py b/build/lib/clashchimeras/parsers.py new file mode 100644 index 0000000..989c662 --- /dev/null +++ b/build/lib/clashchimeras/parsers.py @@ -0,0 +1,803 @@ +import csv +import gzip +import logging +import mmap +import os +import sys +import textwrap +from collections import Counter, defaultdict +from itertools import groupby +from operator import itemgetter + +import pandas as pd +import pyfaidx +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord +from Bio.Seq import Seq + +import clashchimeras + +logger = logging.getLogger('root') + + +class GFF: + """GFF file parser for mirbase gff3 file + + This class uses memory-mapped file object to read a mirbase gff3 file. It + contains methods to read, process a gff3 file and return genomic coordinates + + Attributes: + fileName: A mirbase gff3 file path + """ + + def __init__(self, fileName=None): + self.features = {} + self.fileName = fileName + + def read(self, featureType='miRNA_primary_transcript'): + """Reads gff3 file provided during class initialization + + Stores the byte positions of every feature in a dict object named + self.features + + Keyword Args: + featureType: Feature type of a gff3 record, the third element of every + record in the file. Please change this if you want to store mature + form of microRNA, by default it uses primary transcript + (default 'miRNA_primary_transcript') + """ + logger.info('Reading %s' % self.fileName) + self.fileHandle = open(self.fileName, 'r+b') + bytePosition = self.fileHandle.tell() + for line in self.fileHandle: + row = line.decode('utf-8').rstrip().split("\t") + if not row[0].startswith("#") and row[2] == featureType: + attributes = row[-1].split(";") + for attribute in attributes: + if attribute.startswith('Name'): + mirbase_name = attribute.split("=")[-1] + self.features[mirbase_name] = bytePosition + bytePosition = self.fileHandle.tell() + self.fileHandle.close() + logger.debug('Reading %s finished' % self.fileName) + + def process(self, name): + """A method to return a Record object providing genomic information + + Args: + name: A valid miRNA_primary_transcript name + + Returns: + An object Record containing scaffold, start, end, strand, mirbase_id and + mirbase_name as its variables for access + """ + self.fileHandle = open(self.fileName, 'r+b') + self.mm = mmap.mmap(self.fileHandle.fileno(), 0) + self.mm.seek(self.features[name]) + row = self.mm.readline().decode('utf-8').rstrip().split("\t") + attributes = row[-1].split(";") + for attribute in attributes: + if attribute.startswith("ID"): + _id = attribute.split("=")[-1] + elif attribute.startswith("Name"): + _name = attribute.split("=")[-1] + record = Record(scaffold=row[0], start=int(row[3]), end=int(row[4]), + strand=row[6], mirbase_id=_id, mirbase_name=_name) + self.fileHandle.close() + return record + + def coordinates(self, name, start=None, end=None): + """A method to return a bed record containing genomic coordinates for the + aligned segment + + Keyword Args: + start: The alignment start position of the cDNA molecule or the relative + start of the particular molecule + end: The alignment end position in the cDNA molecule or the relative end + of the particular molecule + + Args: + name: A valid miRNA_primary_transcript name + + Returns: + A tuple of strings containing elements for a bed record + """ + record = self.process(name) + if not start and not end: + start = 1 + end = record.end - record.start + 1 + positions = {} + match_positions = [] + + if record.strand == '+': + _start = 1 + for relative, actual in enumerate(range(record.start - 1, record.end), + start=_start): + positions[relative] = actual + for pos in range(start, end + 1): + match_positions.append(positions[pos]) + return [(record.scaffold, min(match_positions), max(match_positions) + 1, + record.mirbase_name, 0, record.strand)] + + elif record.strand == '-': + _start = 1 + for relative, actual in enumerate(reversed(range(record.start - 1, + record.end)), start=_start): + positions[relative] = actual + for pos in range(start, end + 1): + match_positions.append(positions[pos]) + return [(record.scaffold, min(match_positions), max(match_positions) + 1, + record.mirbase_name, 0, record.strand)] + + +class GTF: + """GTF file parser for gencode gtf file + + This class uses memory-mapped file object to read a gencode gtf file. It + contains methods to read, process a gtf file and return genomic coordinates + + Attributes: + fileName: A gencode gtf file path + """ + + def __init__(self, fileName=None): + self.features = defaultdict(list) + self.biotypeFeatures = defaultdict(list) + self.geneFeatures = defaultdict(list) + self.fileName = fileName + self.geneIds = {} + + def readBiotype(self, featureType='exon', biotype=None): + logger.info('Reading %s' % self.fileName) + self.fileHandle = open(self.fileName, 'r+b') + for line in self.fileHandle: + row = line.decode('utf-8').rstrip().split("\t") + + if not row[0].startswith("#") and row[2] == featureType: + attributes = row[-1].split("; ") + + havana_transcript = '-' + havana_gene = '-' + exon_number = '0' + + for attribute in attributes: + + if attribute.startswith("transcript_id"): + transcript_id = attribute.split(" ")[-1][1:-1] + + elif attribute.startswith("transcript_type"): + transcript_type = attribute.split(" ")[-1][1:-1] + + elif attribute.startswith("exon_number"): + exon_number = int(attribute.split(" ")[-1]) + + elif attribute.startswith("havana_gene"): + havana_gene = attribute.split(" ")[-1][1:-1] + + elif attribute.startswith("havana_transcript"): + havana_transcript = attribute.split(" ")[-1][1:-2] + + elif attribute.startswith("gene_id"): + gene_id = attribute.split(" ")[-1][1:-1] + + elif attribute.startswith("gene_name"): + gene_name = attribute.split(" ")[-1][1:-1] + + elif attribute.startswith("transcript_name"): + transcript_name = attribute.split(" ")[-1][1:-1] + + if biotype == 'tRNA': + if transcript_type == "tRNAscan": + self.biotypeFeatures[transcript_id].append((exon_number, row[0], + int(row[3]), int(row[4]), + row[6], gene_id, + havana_gene, + havana_transcript, + transcript_name, + gene_name)) + else: + if transcript_type == biotype: + self.biotypeFeatures[transcript_id].append((exon_number, row[0], + int(row[3]), int(row[4]), + row[6], gene_id, + havana_gene, + havana_transcript, + transcript_name, + gene_name)) + + self.fileHandle.close() + + def read(self, featureType='exon'): + """Reads gtf file provided during class initialization + + Stores the byte positions of every feature in a defaultdict(list) object + named self.features + + Keyword Args: + featureType: Feature type of a gtf record, the third element of every + record in the file. Please change this if you want to get specific + records (e.g. 'UTR') (default 'exon') + """ + + logger.info('Reading %s' % self.fileName) + self.fileHandle = open(self.fileName, 'r+b') + bytePosition = self.fileHandle.tell() + for line in self.fileHandle: + row = line.decode('utf-8').rstrip().split("\t") + if not row[0].startswith("#") and row[2] == featureType: + attributes = row[-1].split("; ") + for attribute in attributes: + if attribute.startswith("transcript_id"): + transcript_id = attribute.split(" ")[-1][1:-1] + self.features[transcript_id].append(bytePosition) + self.geneIds[transcript_id] = gene_id + if attribute.startswith("gene_id"): + gene_id = attribute.split(" ")[-1][1:-1] + bytePosition = self.fileHandle.tell() + self.fileHandle.close() + + self.fileHandle = open(self.fileName, 'r+b') + bytePosition = self.fileHandle.tell() + for line in self.fileHandle: + row = line.decode('utf-8').rstrip().split("\t") + if not row[0].startswith("#") and row[2] == featureType: + attributes = row[-1].split("; ") + for attribute in attributes: + if attribute.startswith("gene_id"): + gene_id = attribute.split(" ")[-1][1:-1] + self.geneFeatures[gene_id].append(bytePosition) + bytePosition = self.fileHandle.tell() + self.fileHandle.close() + + logger.debug('Reading %s finished' % self.fileName) + + def process(self, name): + """A method to return a Record object providing genomic information + + Args: + name: A valid gencode transcript_id + + Returns: + An object Record containing scaffold, start, end, strand, mirbase_id and + mirbase_name as its variables for access + """ + self.fileHandle = open(self.fileName, 'r+b') + self.mm = mmap.mmap(self.fileHandle.fileno(), 0) + positions = self.features[name] + for position in positions: + self.mm.seek(position) + row = self.mm.readline().decode('utf-8').rstrip().split("\t") + attributes = row[-1].split("; ") + _eid = '-' + _enb = '0' + for attribute in attributes: + if attribute.startswith("transcript_type"): + _tt = attribute.split(" ")[-1][1:-1] + elif attribute.startswith("transcript_id"): + _tid = attribute.split(" ")[-1][1:-1] + elif attribute.startswith("exon_id"): + _eid = attribute.split(" ")[-1][1:-1] + elif attribute.startswith("exon_number"): + _enb = int(attribute.split(" ")[-1]) + elif attribute.startswith("gene_name"): + _gn = attribute.split(" ")[-1][1:-1] + record = Record(scaffold=row[0], start=int(row[3]), end=int(row[4]), + strand=row[6], transcript_type=_tt, transcript_id=_tid, exon_id=_eid, + exon_number=_enb, gene_name=_gn) + yield record + self.fileHandle.close() + + def geneExonicRegions(self, df): + """Given a DataFrame with the exon coordinates from Gencode for a single + gene, return the total number of coding regions in that gene. + """ + scaffold = df.iloc[0].scaffold + strand = df.iloc[0].strand + gene_type = df.iloc[0].gene_type + gene_id = df.iloc[0].gene_id + gene_name = df.iloc[0].gene_name + start = df.start.min() + end = df.end.max() + bp = [False] * (end - start + 1) + for i in range(df.shape[0]): + s = df.iloc[i]['start'] - start + e = df.iloc[i]['end'] - start + 1 + bp[s:e] = [True] * (e - s) + regions = list(range(start, end + 1)) + groups = [] + + for i, j in groupby(bp): + groups.append((i, len(list(j)))) + e_start = 0 + + for i in groups: + e_end = e_start + i[1] + if i[0]: + record = Record(scaffold=scaffold, start=regions[e_start], + end=regions[e_end - 1], gene_type=gene_type, gene_id=gene_id, + gene_name=gene_name, strand=strand) + yield record + e_start += i[1] + + def geneProcess(self, name): + """A method to return a Record object providing genomic information + + Args: + name: A valid gencode gene_id + + Returns: + An object Record containing scaffold, start, end, strand, mirbase_id and + mirbase_name as its variables for access + """ + self.fileHandle = open(self.fileName, 'r+b') + self.mm = mmap.mmap(self.fileHandle.fileno(), 0) + positions = self.geneFeatures[name] + exons = [] + for position in positions: + self.mm.seek(position) + row = self.mm.readline().decode('utf-8').rstrip().split("\t") + attributes = row[-1].split("; ") + for attribute in attributes: + if attribute.startswith("gene_type"): + _gt = attribute.split(" ")[-1][1:-1] + elif attribute.startswith("gene_id"): + _gid = attribute.split(" ")[-1][1:-1] + elif attribute.startswith("gene_name"): + _gn = attribute.split(" ")[-1][1:-1] + exons.append((row[0], int(row[3]), int(row[4]), row[6], _gt, _gid, _gn)) + self.fileHandle.close() + exons_df = pd.DataFrame(exons, columns=['scaffold', 'start', 'end', + 'strand', 'gene_type', 'gene_id', 'gene_name']) + + for record in self.geneExonicRegions(exons_df): + yield record + + def coordinates(self, name, start=None, end=None): + """A generator to return a bed record containing genomic coordinates for the + aligned segment + + Keyword Args: + start: The alignment start position of the cDNA molecule or the relative + start of the particular molecule + end: The alignment end position in the cDNA molecule or the relative end + of the particular molecule + + Args: + name: A valid miRNA_primary_transcript name + + Returns: + A list of tuple(s) of strings containing elements for a bed record. There + may be more than one because of alternate splicing. + """ + if "|" in name: + self.name = name.split("|")[0] + else: + self.name = name + positions = {} + match_positions = [] + records = [] + segments = [] + result_segments = [] + for record in self.process(self.name): + records.append(record) + records.sort(key=lambda x: int(x.exon_number)) + + if records[0].strand == '+': + _start = 1 + for record in records: + for relative, actual in enumerate(range(record.start, record.end + 1), + start=_start): + positions[relative] = actual + _start = relative + 1 + for pos in range(start, end): + match_positions.append(positions[pos]) + for key, group in groupby(enumerate(match_positions), + lambda x: x[0] - x[-1]): + segment = list(map(itemgetter(1), group)) + segments.append([segment[0], segment[-1]]) + for segment in segments: + for record in records: + if segment[0] >= record.start and segment[1] <= record.end: + result_segments.append((record.scaffold, segment[0], segment[1], + record.transcript_id + '|' + record.gene_name, 0, record.strand)) + + elif records[0].strand == '-': + _start = 1 + for record in records: + for relative, actual in enumerate(reversed(range(record.start, + record.end + 1)), start=_start): + positions[relative] = actual + _start = relative + 1 + for pos in range(start, end): + match_positions.append(positions[pos]) + for key, group in groupby(enumerate(reversed(match_positions)), + lambda x: x[0] - x[-1]): + segment = list(map(itemgetter(1), group)) + segments.append([segment[0], segment[-1]]) + for segment in segments: + for record in records: + if segment[0] >= record.start and segment[1] <= record.end: + result_segments.append((record.scaffold, segment[0], segment[1], + record.transcript_id + '|' + record.gene_name, 0, record.strand)) + + if len(result_segments) == 0: + logger.debug('%s, %s, %s' % (name, start, end)) + logger.debug('%s' % str(segments)) + for r in records: + logger.debug('%s %s %s %s' % (r.scaffold, r.strand, + r.start, r.end)) + + return result_segments + + +class SAM: + """SAM file parser for parsing bowtie2 generated files + + This class uses memory-mapped file object to read a sam file + + Attributes: + fileName: A sam file path + """ + + def __init__(self, fileName=None): + self.fileName = fileName + self.records = {} + + def read(self, flag=0): + """Reads sam file provided during class initialization + + Stores the byte position of every record based on the keyword arg flag + provided, to a dict object named self.records + + Keyword Args: + flag: The SAM alignment flag for a record. For default, it uses the + primary alignment for every record and ignores secondary alignments + (default 0) + """ + logger.info('Reading %s' % self.fileName) + self.fileHandle = open(self.fileName, 'r+b') + bytePosition = self.fileHandle.tell() + for line in self.fileHandle: + read = line.decode('utf-8').split("\t") + if not read[0].startswith("@") and read[1] == str(flag): + self.records[read[0]] = bytePosition + bytePosition = self.fileHandle.tell() + self.fileHandle.close() + logger.debug('Reading %s finished' % self.fileName) + + def access(self, queryName): + """Provides random access of a record from the sam file + + Args: + queryName: The query name of the read from the sam file + + Returns: + A list generated after splitting the record line from sam file + """ + self.fileHandle = open(self.fileName, 'r+b') + self.mm = mmap.mmap(self.fileHandle.fileno(), 0) + self.mm.seek(self.records[queryName]) + row = self.mm.readline().decode('utf-8').rstrip().split("\t") + self.fileHandle.close() + + return self.pretty(row) + + def filterPotentialChimeras(self, min_length=30, flag=0, target=None): + """Generated a filtered fasta file from a sam file + + This filtered fasta file contains reads that can be potentially chimeras. + The criteria for filtering is based on the minimum length + + Keyword Args: + min_length: To be selected as a potential chimera, this is the minimum + read length (default 30) + flag: The SAM alignment flag describing the type of alignment (default 0) + target: The prefix for output file + """ + logger.debug('Filtering {} for potential chimeras'.format(target)) + target = '{}.filter.fasta'.format(target.rpartition(".")[0]) + if os.path.exists(target): + logger.info('Skipping filtering for {}'.format(target)) + else: + with open(target, 'w') as oH: + with open(self.fileName) as iH: + for row in csv.reader(iH, delimiter="\t"): + if not row[0].startswith('@') and row[1] == str(flag): + if len(row[9]) >= 30: + print(textwrap.fill('>%s' % row[0], width=80), file=oH) + print(textwrap.fill('%s' % row[9], width=80), file=oH) + logger.debug('Filtering finished') + return target + + def pretty(self, row): + refId = row[2] + start = int(row[3]) + for i in row[10:]: + if i.startswith('MD'): + mismatchInfo = i + sequence = row[9] + cigar = row[5] + cigarString = clashchimeras.methods.convertCigar(row[5]) + matchLength = cigarString.count("M") + cigarString.count("D") + end = start + matchLength - 1 + record = Record(refId=refId, start=start, mismatchInfo=mismatchInfo, + sequence=sequence, cigarString=cigarString, matchLength=matchLength, + cigar=cigar, end=end) + return record + + +class Output: + """Contains methods for writing output files + + This class is used to generate every kind of output generated by this + package which includes plain text, ansi colored text and bed file + + Attributes: + target: A prefix for output file which will be automatically followed by + extension (default 'wip') + overlap: Minimum overlap to be set between two molecules when determining + chimera (default 4) + gap: Maximum gap (number of unknown nucleotides) to be allowed between + two molecules within a chimera (default 9) + """ + + def __init__(self, + target=None, + smallRNABed=False, + targetRNABed=False, + overlap=4, + gap=9): + self.target = target + self.overlap = overlap + self.gap = gap + + if smallRNABed: + self.smallRNABedHandle = open('{}.smallRNA.bed'.format(self.target), 'w') + print('# BED locations of smallRNA part of the identified chimera', + file=self.smallRNABedHandle) + self.smallRNABedCSV = csv.writer(self.smallRNABedHandle, delimiter="\t") + self.smallRNABedCSV.writerow( + ['# The name field represents the following:']) + self.smallRNABedCSV.writerow( + ['# E.g. 201980-1-48|hsa-mir-100==PAPSS1']) + self.smallRNABedCSV.writerow( + ['# 201980-1-48 is the fasta identifier']) + self.smallRNABedCSV.writerow( + ["# 201980 is the unique identifier"]) + self.smallRNABedCSV.writerow( + ["# 1 is the number of times that sequence was observed in raw " + "fastq "]) + self.smallRNABedCSV.writerow( + ["# 48 is the length of the sequence"]) + self.smallRNABedCSV.writerow( + ['# hsa-mir-100 represents the smallRNA transcript']) + self.smallRNABedCSV.writerow( + ['# PAPSS1 represents the gene symbol for targetRNA transcript ' + 'transcript ']) + if targetRNABed: + self.targetRNABedHandle = open('{}.targetRNA.bed'.format(self.target), + 'w') + self.targetRNABedCSV = csv.writer(self.targetRNABedHandle, delimiter="\t") + self.targetRNABedCSV.writerow( + ['# The name field represents the following:']) + self.targetRNABedCSV.writerow( + ['# E.g. 136019-1-48|ENST00000375759.6|SPEN==hsa-mir-103a-2']) + self.targetRNABedCSV.writerow( + ['# 136019-1-48 is the fasta identifier']) + self.targetRNABedCSV.writerow( + ["# 136019 is the unique identifier"]) + self.targetRNABedCSV.writerow( + ["# 1 is the number of times that sequence was observed in raw " + "fastq "]) + self.targetRNABedCSV.writerow( + ["# 48 is the length of the sequence"]) + self.targetRNABedCSV.writerow( + ["# ENST00000375759.6 is the targetRNA transcript identifier"]) + self.targetRNABedCSV.writerow( + ['# SPEN is the gene symbol for for targetRNA transcript ' + 'ENST00000375759.6']) + self.targetRNABedCSV.writerow( + ['# hsa-mir-103a-2 represents the smallRNA transcript ']) + + self.hybWriter = open('%s.chimeras.tsv' % self.target, 'w') + self.hybComments() + + def hybComments(self): + print("# fasta Identifier: The identifier in .unique.fasta. ", + "#\tE.g. 123456-3-68 ", + "#\t123456 is the unique identifier", + "#\t3 is the number of times that sequence was observed in raw " + "fastq ", + "#\t68 is the length of the sequence", sep="\n", file=self.hybWriter) + + print("# smallRNA: The cDNA ID of the type of RNA labelled as smallRNA in " + "the analysis", + "#\tE.g. hsa-let-7b (miRBase identifier)", + "#\tE.g. ENST00000619178.1|SNORD3D| (Gencode snoRNA identifier)", + sep="\n", file=self.hybWriter) + + print("# smallRNA_start: cDNA alignment start position of the smallRNA " + "part of the chimera", file=self.hybWriter) + + print("# smallRNA_MDtag: Showing the MD tag from the smallRNA SAM " + "alignment for the chimera", + "#\tSAM file format specification", + "#\thttp://samtools.github.io/hts-specs/SAMv1.pdf", + "#\tMD Z String for mismatching positions.Regex:[0-9]+(([" + "A-Z]|\^[A-Z]+)[0-9]+)*9", sep="\n", file=self.hybWriter) + + print('# smallRNA_cigar: Cigar string from the smallRNA SAM alignment for ' + 'the chimera', + "#\tSAM file format specification", + "#\thttp://samtools.github.io/hts-specs/SAMv1.pdf", + '#\tSee CIGAR in the file', sep="\n", file=self.hybWriter) + + print('# arbitrary_chimera: The chimera representation indicating what ' + 'part of the sequence represents smallRNA and targetRNA', + '#\t{ is representing a match with smallRNA', + '#\t} is representing a match with targetRNA', + '#\t# is representing unaligned sequences (identified as --gap -ga)', + '#\t- is representing a deletion (D in cigar string)', + '#\t+ is representing a deletion (I in cigar string)', + '#\tE.g {{{{{{{{-{{{{{{{{{{{{{##}}}}}}}}}}+}}}}}}}}}}}}}}}}}}}}}}' + '#\tE.g The first 22 nucleotides are aligning to smallRNA cDNA', + '#\tE.g The last 33 nucleotides are aligning to targetRNA cDNA', + sep="\n", file=self.hybWriter) + + print('# read_sequence: The actual sequence that is appeared in raw ' + 'reads', file=self.hybWriter) + + print("# targetRNA: The cDNA ID of the type of RNA labelled as targetRNA " + "in " + "the analysis", + "#\tE.g. hsa-let-7b (miRBase identifier)", + "#\tE.g. ENST00000619178.1|SNORD3D| (Gencode snoRNA identifier)", + sep="\n", file=self.hybWriter) + + print("# targetRNA_start: cDNA alignment start position of the targetRNA " + "part of the chimera", file=self.hybWriter) + + print("# targetRNA_MDtag: Showing the MD tag from the targetRNA SAM " + "alignment for the chimera", + "#\tSAM file format specification", + "#\thttp://samtools.github.io/hts-specs/SAMv1.pdf", + "#\tMD Z String for mismatching positions.Regex:[0-9]+(([" + "A-Z]|\^[A-Z]+)[0-9]+)*9", sep="\n", file=self.hybWriter) + + print('# targetRNA_cigar: Cigar string from the targetRNA SAM alignment ' + 'for ' + 'the chimera', + "#\tSAM file format specification", + "#\thttp://samtools.github.io/hts-specs/SAMv1.pdf", + '#\tSee CIGAR in the file', sep="\n", file=self.hybWriter) + + print("# fasta_Identifier", "smallRNA", "smallRNA_start", "smallRNA_MDtag", + "smallRNA_cigar", "arbitrary_chimera", "read_sequence", "targetRNA", + "targetRNA_start", "targetRNA_MDtag", "targetRNA_cigar", sep="\t", + file=self.hybWriter) + + def writeTargetRNABed(self, query, targetRNASegments, smallRNA): + if "ENS" in smallRNA and "|" in smallRNA: + _smallRNA = smallRNA.split("|")[5] + else: + _smallRNA = smallRNA + for segment in targetRNASegments: + _segment = list(segment) + _segment[3] = query + "|" + _segment[3] + "==" + _smallRNA + self.targetRNABedCSV.writerow(_segment) + + def writeSmallRNABed(self, query, smallRNASegments, targetRNA): + if "ENS" in targetRNA and "|" in targetRNA: + _targetRNA = targetRNA.split("|")[5] + else: + _targetRNA = targetRNA + for segment in smallRNASegments: + _segment = list(segment) + _segment[3] = query + "|" + _segment[3] + "==" + _targetRNA + self.smallRNABedCSV.writerow(_segment) + + def write(self, queryName, smallRNA, targetRNA): + chimeraString = clashchimeras.methods.chimeraOrNot(smallRNA.cigarString, + targetRNA.cigarString, overlap=self.overlap, gap=self.gap) + smallRNARegion = clashchimeras.methods.findRegion(smallRNA) + targetRNARegion = clashchimeras.methods.findRegion(targetRNA) + print(queryName, smallRNARegion, smallRNA.start, smallRNA.mismatchInfo, + smallRNA.cigar, chimeraString, smallRNA.sequence, + targetRNARegion, targetRNA.start, + targetRNA.mismatchInfo, targetRNA.cigar, sep="\t", file=self.hybWriter) + + def __del__(self): + self.hybWriter.close() + + +class Fasta: + + def __init__(self, genome=None, gtf=None): + self.genome = genome + self.gtf = gtf + self.faidx = pyfaidx.Fasta(self.genome) + + def getBiotype(self, output=None, biotype=None): + self.sequences = [] + g = GTF(fileName=self.gtf) + if biotype == 'tRNA': + g.readBiotype(biotype=biotype, featureType='tRNAscan') + else: + g.readBiotype(biotype=biotype) + for transcript_id, exons in g.biotypeFeatures.items(): + temp_seq = '' + exons.sort(key=itemgetter(0)) + for exon in exons: + if exon[4] == '-': + temp_seq += (-self.faidx[exon[1]][exon[2] - 1:exon[3]]).seq + elif exon[4] == '+': + temp_seq += self.faidx[exon[1]][exon[2] - 1:exon[3]].seq + + _id = '{}|{}|{}|{}|{}|{}|{}'.format(transcript_id, + exons[0][5], + exons[0][6], + exons[0][7], + exons[0][8], + exons[0][9], + len(temp_seq)) + + temp_rec = SeqRecord(seq=Seq(temp_seq), id=_id, + description='') + + self.sequences.append(temp_rec) + + if not output: + logger.error('Please provide output file..') + sys.exit() + else: + logger.info('Writing {}'.format(output)) + SeqIO.write(self.sequences, output, 'fasta') + + +class Fastq: + + def __init__(self, fileName=None, compressed=False): + self.fileName = fileName + self.compressed = compressed + self.n = 4 + self.sequences = Counter() + self.uniqueOutput = fileName.rpartition(".")[0] + '.unique.fasta' + + def recordIterator(self): + record = [] + record_length = 0 + for line in self.fileHandle: + if record_length == self.n: + yield record + record_length = 0 + record = [] + record.append(line.decode().rstrip()) + record_length += 1 + yield record + + def createUnique(self): + if self.compressed: + self.fileHandle = gzip.open(self.fileName, 'rb') + else: + self.fileHandle = open(self.fileName, 'rb') + logger.info('Reading {}'.format(self.fileName)) + for record in self.recordIterator(): + self.sequences[record[1]] += 1 + logger.info('Writing {}'.format(self.uniqueOutput)) + with open(self.uniqueOutput, 'w') as wH: + for index, (sequence, counts) in enumerate(sorted(self.sequences.items(), + key=itemgetter(1), reverse=True), start=1): + print('>{}-{}-{}'.format(index, counts, len(sequence)), file=wH) + print(textwrap.fill(sequence, width=80), file=wH) + logger.debug('Finished writing {}'.format(self.uniqueOutput)) + self.fileHandle.close() + + +class Record: + """A custom object (preferred over dict) for easy access using variables + + It's a dependency for GTF and GFF classes + """ + + def __init__(self, **kwds): + self.__dict__.update(kwds) diff --git a/build/lib/clashchimeras/runners.py b/build/lib/clashchimeras/runners.py new file mode 100644 index 0000000..9f0a033 --- /dev/null +++ b/build/lib/clashchimeras/runners.py @@ -0,0 +1,179 @@ +import hashlib +import logging +import os +import subprocess + +from clashchimeras.parsers import Fastq + +logger = logging.getLogger('root') + + +class Bowtie: + """Class for calling bowtie2 aligner + """ + + def __init__(self, args=None): + self.parameters = [] + self.inputFile = args.input + self.compressed = args.gzip + self.bowtieExecutable = args.bowtieExecutable + self.uniqueInput = self.inputFile.rpartition(".")[0] + '.unique.fasta' + if os.path.exists(self.uniqueInput) and os.path.exists( + self.uniqueInput + '.sha256sum'): + with open(self.uniqueInput + '.sha256sum') as f: + s = f.readline().rstrip() + _s = self.hash(self.uniqueInput) + if s == _s: + logger.info( + '{} is present and verified'.format(self.uniqueInput)) + else: + logger.warn('{} is present but checksum verification ' + 'failed'.format(self.uniqueInput)) + logger.warn( + '{} will be generated again'.format(self.uniqueInput)) + else: + _f = Fastq(fileName=self.inputFile, compressed=self.compressed) + _f.createUnique() + sha256sum = self.hash(self.uniqueInput) + with open(self.uniqueInput + '.sha256sum', 'w') as wH: + print(sha256sum, file=wH) + if not args.reverseComplement: + self.parameters.append('--norc') + self.parameters.append('-N') + self.parameters.append(str(args.mismatch)) + if not args.unaligned: + self.parameters.append('--no-unal') + self.parameters.append('-p') + self.parameters.append(str(args.threads)) + self.parameters.append('--' + args.preset) + self.smallRNAIndex = args.smallRNAIndex + self.targetRNAIndex = args.targetRNAIndex + self.output = args.output + self.runAligner = True + + def hash(self, file): + sha256 = hashlib.sha256() + with open(file, 'r+b') as f: + while True: + buf = f.read(8192) + if not buf: + break + sha256.update(buf) + return sha256.hexdigest() + + def run(self, output=None, filtered=None): + self.bowtie2 = [] + if output == 'smallRNA': + self.outputSam = '{}.{}.sam'.format(self.output, 'smallRNA') + elif output == 'targetRNA': + self.outputSam = '{}.{}.sam'.format(self.output, 'targetRNA') + if not self.bowtieExecutable: + self.bowtie2.append('bowtie2') + else: + self.bowtie2.append(self.bowtieExecutable) + self.bowtie2.append('-f') + if filtered: + self.bowtie2.append(filtered) + else: + self.bowtie2.append(self.uniqueInput) + self.bowtie2.append('-x') + if output == 'smallRNA': + self.bowtie2.append('"' + self.smallRNAIndex + '"') + elif output == 'targetRNA': + self.bowtie2.append('"' + self.targetRNAIndex + '"') + self.bowtie2 = self.bowtie2 + self.parameters + self.bowtie2.append('-S') + self.bowtie2.append(self.outputSam) + + if os.path.exists(self.outputSam) and os.path.exists( + self.outputSam + '.sha256sum'): + with open(self.outputSam + '.sha256sum') as f: + s = f.readline().rstrip() + _s = self.hash(self.outputSam) + if s == _s: + logger.info( + '{} is present and verified'.format(self.outputSam)) + self.runAligner = False + else: + logger.warn('{} is present but checksum verification ' + 'failed'.format(self.outputSam)) + logger.warn('{} will be generated again') + self.runAligner = True + + if self.runAligner: + logger.info( + 'Executing bowtie2 and generating {}'.format(self.outputSam)) + logger.info('The complete command for bowtie2 is \n{}'.format( + " ".join(self.bowtie2))) + bowtie2Process = subprocess.Popen(self.bowtie2, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + bowtie2Stdout, bowtie2Stderr = bowtie2Process.communicate() + logger.warn(bowtie2Stdout.decode('utf-8')) + logger.warn(bowtie2Stderr.decode('utf-8')) + if os.path.exists(self.outputSam): + logger.info('Bowtie2 execution finished and {} ' + 'generated'.format(self.outputSam)) + sha256sum = self.hash(self.outputSam) + with open(self.outputSam + '.sha256sum', 'w') as wH: + print(sha256sum, file=wH) + else: + logger.error('Something went wrong with your bowtie2 command.') + logger.error( + 'Please check your bowtie2 stderr and report a bug....') + + +class Tophat: + + def __init__(self, args=None): + self.inputFile = args.input + if args.genomeIndex: + args.genomeIndex = os.path.expanduser(args.genomeIndex) + if args.transcriptomeIndex: + args.transcriptomeIndex = os.path.expanduser( + args.transcriptomeIndex) + if args.output: + args.output = os.path.expanduser(args.output) + if args.input: + args.input = os.path.expanduser(args.input) + + self.tophatOutput = args.output + self.genomeIndex = args.genomeIndex + + self.tophatRun = [] + if args.tophatExecutable is not None: + self.tophatExecutable = args.tophatExecutable + else: + self.tophatExecutable = 'tophat' + self.tophatRun.append(self.tophatExecutable) + self.tophatRun.append('--b2-' + args.tophatPreset) + self.tophatRun.append('--b2-N') + self.tophatRun.append(str(args.mismatch)) + + if args.transcriptomeIndex: + self.tophatRun.append('--transcriptome-index=' + + args.transcriptomeIndex.strip()) + else: + logger.warn('Transcriptome index is not provided for tophat') + self.tophatRun.append('-p') + self.tophatRun.append(str(args.threads)) + self.tophatRun.append('-o') + self.tophatRun.append(self.tophatOutput) + self.tophatRun.append(self.genomeIndex) + self.tophatRun.append(self.inputFile) + + def run(self): + logger.info('Executing Tophat2 for {}'.format(self.inputFile)) + logger.info('The complete command for tophat is \n{}'.format(" ".join( + self.tophatRun))) + logger.info('The stdout and stderr for tophat run will be ' + 'dumped after its finished') + tophatProcess = subprocess.Popen(self.tophatRun, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + tophatStderr, tophatStdout = tophatProcess.communicate() + logger.warn('Tophat stderr') + logger.info(tophatStderr.decode("utf-8")) + logger.warn('Tophat stdout') + logger.info(tophatStdout.decode("utf-8")) + logger.info('Execution of Tophat2 finished') diff --git a/build/scripts-3.5/align-for-chimeras b/build/scripts-3.5/align-for-chimeras new file mode 100755 index 0000000..d3291f8 --- /dev/null +++ b/build/scripts-3.5/align-for-chimeras @@ -0,0 +1,5 @@ +#!/usr/bin/python3 + +import clashchimeras.align + +clashchimeras.align.main() diff --git a/build/scripts-3.5/download-for-chimeras b/build/scripts-3.5/download-for-chimeras new file mode 100755 index 0000000..1dfe575 --- /dev/null +++ b/build/scripts-3.5/download-for-chimeras @@ -0,0 +1,5 @@ +#!/usr/bin/python3 + +import clashchimeras.download + +clashchimeras.download.main() diff --git a/build/scripts-3.5/find-chimeras b/build/scripts-3.5/find-chimeras new file mode 100755 index 0000000..53e0116 --- /dev/null +++ b/build/scripts-3.5/find-chimeras @@ -0,0 +1,5 @@ +#!/usr/bin/python3 + +import clashchimeras.find + +clashchimeras.find.main() diff --git a/clashchimeras/align.py b/clashchimeras/align.py index addf1a0..8f2efc9 100644 --- a/clashchimeras/align.py +++ b/clashchimeras/align.py @@ -6,199 +6,202 @@ from clashchimeras.parsers import SAM from clashchimeras.runners import Bowtie, Tophat + class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): - pass + pass + def parseArguments(): - requiredArgs = getRequiredArgs() - bowtieArgs = getBowtie2Args() - tophatArgs = getTophatArgs() - optionalArgs = getOptionalArgs() - outputArgs = getOutputArgs() - - parser = argparse.ArgumentParser( - parents=[requiredArgs, bowtieArgs, tophatArgs, - outputArgs, optionalArgs], - formatter_class=CustomFormatter, - description='Given a fastq file, this script executes ' - 'bowtie2 and tophat aligners to generate alignment files ' - 'necessary for detecting chimeras in the reads', - usage='\n %(prog)s -i input.fastq -si ' - '/path/to/smallRNA_index -ti /path/to/targetRNA_index -o ' - 'output -r bowtie2 \n %(prog)s -i input.fastq -gi ' - '/path/to/genome_index -tri /path/to/transcriptome_index ' - '-o output -r tophat \n \n \n ' - 'To see detailed help, please run \n %(prog)s -h', - add_help=True) - - return parser - - args = parser.parse_args() - - if args.logLevel == 'DEBUG': - logger = clashchimeras.log.debug_logger('root') - elif args.logLevel == 'WARNING': - logger = clashchimeras.log.warning_logger('root') - elif args.logLevel == 'ERROR': - logger = clashchimeras.log.error_logger('root') - else: - logger = clashchimeras.log.info_logger('root') - - argCheck = Arguments(args, type='align') - argCheck.validateAlign() - - return args + requiredArgs = getRequiredArgs() + bowtieArgs = getBowtie2Args() + tophatArgs = getTophatArgs() + optionalArgs = getOptionalArgs() + outputArgs = getOutputArgs() + + parser = argparse.ArgumentParser( + parents=[requiredArgs, bowtieArgs, tophatArgs, + outputArgs, optionalArgs], + formatter_class=CustomFormatter, + description='Given a fastq file, this script executes ' + 'bowtie2 and tophat aligners to generate alignment files ' + 'necessary for detecting chimeras in the reads', + usage='\n %(prog)s -i input.fastq -si ' + '/path/to/smallRNA_index -ti /path/to/targetRNA_index -o ' + 'output -r bowtie2 \n %(prog)s -i input.fastq -gi ' + '/path/to/genome_index -tri /path/to/transcriptome_index ' + '-o output -r tophat \n \n \n ' + 'To see detailed help, please run \n %(prog)s -h', + add_help=True) + + return parser + + args = parser.parse_args() + + if args.logLevel == 'DEBUG': + logger = clashchimeras.log.debug_logger('root') + elif args.logLevel == 'WARNING': + logger = clashchimeras.log.warning_logger('root') + elif args.logLevel == 'ERROR': + logger = clashchimeras.log.error_logger('root') + else: + logger = clashchimeras.log.info_logger('root') + + argCheck = Arguments(args, type='align') + argCheck.validateAlign() + + return args + def getRequiredArgs(): - parser = argparse.ArgumentParser(add_help=False) + parser = argparse.ArgumentParser(add_help=False) - required = parser.add_argument_group('Input arguments') + required = parser.add_argument_group('Input arguments') - required.add_argument('--input', '-i', - help='Input file containing reads fastq', - metavar='raw reads', - required=True) + required.add_argument('--input', '-i', + help='Input file containing reads fastq', + metavar='raw reads', + required=True) - return parser + return parser -def getBowtie2Args(): - parser = argparse.ArgumentParser(add_help=False) - bowtieArgs = parser.add_argument_group('Bowtie2 arguments') +def getBowtie2Args(): + parser = argparse.ArgumentParser(add_help=False) + bowtieArgs = parser.add_argument_group('Bowtie2 arguments') - bowtieArgs.add_argument("--smallRNAIndex", "-si", - help="""Provide the smallRNA bowtie2 index (Usually + bowtieArgs.add_argument("--smallRNAIndex", "-si", + help="""Provide the smallRNA bowtie2 index (Usually resides in ~/db/CLASHChimeras or elsewhere if you have specified in --path -pa during initialize)""") - bowtieArgs.add_argument("--targetRNAIndex", "-ti", - help="""Provide the targetRNA bowtie2 index (Usually + bowtieArgs.add_argument("--targetRNAIndex", "-ti", + help="""Provide the targetRNA bowtie2 index (Usually resides in ~/db/CLASHChimeras or elsewhere if you have specified in --path -pa during initialize)""") - return parser + return parser + def getTophatArgs(): - parser = argparse.ArgumentParser(add_help=False) + parser = argparse.ArgumentParser(add_help=False) - tophatArgs = parser.add_argument_group('Tophat arguments') + tophatArgs = parser.add_argument_group('Tophat arguments') - tophatArgs.add_argument("--genomeIndex", "-gi", - help="""Provide the genome bowtie2 index (Usually + tophatArgs.add_argument("--genomeIndex", "-gi", + help="""Provide the genome bowtie2 index (Usually resides in ~/db/CLASHChimeras or elsewhere if you have specified in --path during initialize)""") - tophatArgs.add_argument("--transcriptomeIndex", "-tri", - help="""Provide the transcriptome index as specified + tophatArgs.add_argument("--transcriptomeIndex", "-tri", + help="""Provide the transcriptome index as specified in tophat --transcriptome-index""") - return parser + return parser def getOutputArgs(): - parser = argparse.ArgumentParser(add_help=False) + parser = argparse.ArgumentParser(add_help=False) - output = parser.add_argument_group('Output') + output = parser.add_argument_group('Output') - output.add_argument("--output", "-o", - help="""The output name without extension (.sam .bam will + output.add_argument("--output", "-o", + help="""The output name without extension (.sam .bam will be added)""", - metavar='output prefix', - required=True) + metavar='output prefix', + required=True) - return parser + return parser def getOptionalArgs(): - parser = argparse.ArgumentParser(add_help=False) + parser = argparse.ArgumentParser(add_help=False) - optional = parser.add_argument_group('Optional arguments') + optional = parser.add_argument_group('Optional arguments') - optional.add_argument("--run", "-r", - help="Run the following aligner for raw reads", - default='bowtie2', - choices=['bowtie2', 'tophat']) + optional.add_argument("--run", "-r", + help="Run the following aligner for raw reads", + default='bowtie2', + choices=['bowtie2', 'tophat']) - optional.add_argument("--logLevel", "-l", - help="Set logging level", - default='INFO', - choices=['INFO', 'DEBUG', 'WARNING', 'ERROR']) + optional.add_argument("--logLevel", "-l", + help="Set logging level", + default='INFO', + choices=['INFO', 'DEBUG', 'WARNING', 'ERROR']) - optional.add_argument("--gzip", "-gz", action="store_true", - help="Whether your input file is gzipped") + optional.add_argument("--gzip", "-gz", action="store_true", + help="Whether your input file is gzipped") - optional.add_argument("--bowtieExecutable", "-be", - help="""Provide bowtie2 executable if it's not present + optional.add_argument("--bowtieExecutable", "-be", + help="""Provide bowtie2 executable if it's not present in your path""") - optional.add_argument("--tophatExecutable", "-te", - help="""Provide Tophat executable if it's not present in + optional.add_argument("--tophatExecutable", "-te", + help="""Provide Tophat executable if it's not present in your path""") - optional.add_argument("--preset", "-p", default='sensitive-local', - choices=['very-fast', 'fast', 'sensitive', - 'very-sensitive', 'very-fast-local', 'fast-local', - 'sensitive-local', 'very-sensitive-local'], - help="Provide preset for bowtie2") - - optional.add_argument("--tophatPreset", "-tp", - choices=['very-fast', 'fast', 'sensitive', - 'very-sensitive'], - default='very-sensitive', - help="Provide preset for Tophat") - - optional.add_argument("--mismatch", "-m", type=int, - choices=[0, 1], default=1, - help="""Number of seed mismatches as represented in + optional.add_argument("--preset", "-p", default='sensitive-local', + choices=['very-fast', 'fast', 'sensitive', + 'very-sensitive', 'very-fast-local', 'fast-local', + 'sensitive-local', 'very-sensitive-local'], + help="Provide preset for bowtie2") + + optional.add_argument("--tophatPreset", "-tp", + choices=['very-fast', 'fast', 'sensitive', + 'very-sensitive'], + default='very-sensitive', + help="Provide preset for Tophat") + + optional.add_argument("--mismatch", "-m", type=int, + choices=[0, 1], default=1, + help="""Number of seed mismatches as represented in bowtie2 as -N""") - optional.add_argument("--reverseComplement", "-rc", - action="store_true", - help="""Align to reverse complement of reference as + optional.add_argument("--reverseComplement", "-rc", + action="store_true", + help="""Align to reverse complement of reference as represented in bowtie2 as --norc""") - optional.add_argument("--unaligned", "-un", - action="store_true", - help="""Whether to keep unaligned reads in the output + optional.add_argument("--unaligned", "-un", + action="store_true", + help="""Whether to keep unaligned reads in the output sam file. Represented in bowtie2 as --no-unal""") - optional.add_argument("--threads", "-n", default=1, - help="Specify the number of threads") + optional.add_argument("--threads", "-n", default=1, + help="Specify the number of threads") - return parser + return parser def main(): - parser = parseArguments() - - args = parser.parse_args() + parser = parseArguments() + args = parser.parse_args() - if args.logLevel == 'DEBUG': - logger = clashchimeras.log.debug_logger('root') - elif args.logLevel == 'WARNING': - logger = clashchimeras.log.warning_logger('root') - elif args.logLevel == 'ERROR': - logger = clashchimeras.log.error_logger('root') - else: - logger = clashchimeras.log.info_logger('root') + if args.logLevel == 'DEBUG': + logger = clashchimeras.log.debug_logger('root') + elif args.logLevel == 'WARNING': + logger = clashchimeras.log.warning_logger('root') + elif args.logLevel == 'ERROR': + logger = clashchimeras.log.error_logger('root') + else: + logger = clashchimeras.log.info_logger('root') - argCheck = Arguments(args) - argCheck.validateAlign() + argCheck = Arguments(args) + argCheck.validateAlign() - if args.run == 'bowtie2': - b = Bowtie(args=args) - b.run(output='smallRNA') - s = SAM(fileName=b.outputSam) - filteredFasta = s.filterPotentialChimeras(target=b.outputSam) - b.run(output='targetRNA', filtered=filteredFasta) + if args.run == 'bowtie2': + b = Bowtie(args=args) + b.run(output='smallRNA') + s = SAM(fileName=b.outputSam) + filteredFasta = s.filterPotentialChimeras(target=b.outputSam) + b.run(output='targetRNA', filtered=filteredFasta) - if args.run == 'tophat' : - t = Tophat(args=args) - t.run() + if args.run == 'tophat': + t = Tophat(args=args) + t.run() if __name__ == "__main__": - main() + main() diff --git a/clashchimeras/download.py b/clashchimeras/download.py index ddb9b64..ef767b6 100644 --- a/clashchimeras/download.py +++ b/clashchimeras/download.py @@ -5,117 +5,122 @@ import clashchimeras.log from clashchimeras.initialize import Releases, Index + class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): - pass + pass + def parseArguments(): - requiredArgs = getRequiredArgs() - optionalArgs = getOptionalArgs() - outputArgs = getOutputArgs() - - parser = argparse.ArgumentParser( - parents=[requiredArgs, outputArgs, optionalArgs], - formatter_class=CustomFormatter, - description=textwrap.dedent("""\ + requiredArgs = getRequiredArgs() + optionalArgs = getOptionalArgs() + outputArgs = getOutputArgs() + + parser = argparse.ArgumentParser( + parents=[requiredArgs, outputArgs, optionalArgs], + formatter_class=CustomFormatter, + description=textwrap.dedent("""\ Downloads required sequences and create bowtie2 indexes required for alignment"""), - usage='An example usage is: %(prog)s -gor "H.sapiens" -mor hsa', - add_help=True) + usage='An example usage is: %(prog)s -gor "H.sapiens" -mor hsa', + add_help=True) + + return parser - return parser def getRequiredArgs(): - parser = argparse.ArgumentParser(add_help=False) + parser = argparse.ArgumentParser(add_help=False) - required = parser.add_argument_group('Required arguments') + required = parser.add_argument_group('Required arguments') - required.add_argument("--gencodeOrganism", "-gor", - default="H.sapiens", choices=["H.sapiens", - "M.musculus"], - help="""Select model organism""", - required=True) + required.add_argument("--gencodeOrganism", "-gor", + default="H.sapiens", choices=["H.sapiens", + "M.musculus"], + help="""Select model organism""", + required=True) - required.add_argument("--mirbaseOrganism", "-mor", - default='hsa', - help="""Select organism to download microRNAs for""", - required=True) + required.add_argument("--mirbaseOrganism", "-mor", + default='hsa', + help="""Select organism to download microRNAs for""", + required=True) + + return parser - return parser def getOutputArgs(): - parser = argparse.ArgumentParser(add_help=False) + parser = argparse.ArgumentParser(add_help=False) - output = parser.add_argument_group('Output') + output = parser.add_argument_group('Output') - output.add_argument("--path", "-pa", - help="""Location where all the database files and + output.add_argument("--path", "-pa", + help="""Location where all the database files and indexes will be downloaded""", - default='~/db/CLASHChimeras', - metavar='path') + default='~/db/CLASHChimeras', + metavar='path') + + return parser - return parser def getOptionalArgs(): - parser = argparse.ArgumentParser(add_help=False) + parser = argparse.ArgumentParser(add_help=False) - optional = parser.add_argument_group('Optional arguments') + optional = parser.add_argument_group('Optional arguments') - optional.add_argument("--logLevel", "-l", - help="Set logging level", - default='INFO', - choices=['INFO', 'DEBUG', 'WARNING', 'ERROR']) + optional.add_argument("--logLevel", "-l", + help="Set logging level", + default='INFO', + choices=['INFO', 'DEBUG', 'WARNING', 'ERROR']) - optional.add_argument("--bowtieExecutable", "-be", - help="""Provide bowtie2 executable if it's not present + optional.add_argument("--bowtieExecutable", "-be", + help="""Provide bowtie2 executable if it's not present in your path""") - optional.add_argument("--tophatExecutable", "-te", - help="""Provide Tophat executable if it's not present in + optional.add_argument("--tophatExecutable", "-te", + help="""Provide Tophat executable if it's not present in your path""") - optional.add_argument("--miRNA", "-mi", - choices=['mature', 'hairpin'], default='hairpin', - help="Which miRNA sequences to align") + optional.add_argument("--miRNA", "-mi", + choices=['mature', 'hairpin'], default='hairpin', + help="Which miRNA sequences to align") - return parser + return parser def main(): - parser = parseArguments() + parser = parseArguments() - args = parser.parse_args() + args = parser.parse_args() - if args.logLevel == 'DEBUG': - logger = clashchimeras.log.debug_logger('root') - elif args.logLevel == 'WARNING': - logger = clashchimeras.log.warning_logger('root') - elif args.logLevel == 'ERROR': - logger = clashchimeras.log.error_logger('root') - else: - logger = clashchimeras.log.info_logger('root') + if args.logLevel == 'DEBUG': + logger = clashchimeras.log.debug_logger('root') + elif args.logLevel == 'WARNING': + logger = clashchimeras.log.warning_logger('root') + elif args.logLevel == 'ERROR': + logger = clashchimeras.log.error_logger('root') + else: + logger = clashchimeras.log.info_logger('root') - r = Releases(gencodeOrganism=args.gencodeOrganism, - mirbaseOrganism=args.mirbaseOrganism, - path=os.path.expanduser(args.path)) + r = Releases(gencodeOrganism=args.gencodeOrganism, + mirbaseOrganism=args.mirbaseOrganism, + path=os.path.expanduser(args.path)) - i = Index(root=r.gencodePath) - i.create() + i = Index(root=r.gencodePath) + i.create() - logger.warn('Please find the indexes created listed below..') - logger.warn('Use them when you run align-for-chimeras') + logger.warn('Please find the indexes created listed below..') + logger.warn('Use them when you run align-for-chimeras') - for _i in i.created: - logger.warn(_i) + for _i in i.created: + logger.warn(_i) - i = Index(root=r.mirbasePath) - i.create() + i = Index(root=r.mirbasePath) + i.create() - logger.warn('Please find the indexes created listed below..') - logger.warn('Use them when you run align-for-chimeras') + logger.warn('Please find the indexes created listed below..') + logger.warn('Use them when you run align-for-chimeras') - for _i in i.created: - logger.warn(_i) + for _i in i.created: + logger.warn(_i) if __name__ == "__main__": - main() + main() diff --git a/clashchimeras/find.py b/clashchimeras/find.py index f5a4302..025a0d3 100644 --- a/clashchimeras/find.py +++ b/clashchimeras/find.py @@ -3,223 +3,216 @@ import sys from colors import yellow -from progress.bar import Bar import clashchimeras.log import clashchimeras.methods from clashchimeras.parsers import GFF, GTF, Output, SAM from clashchimeras.initialize import Arguments + class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): - pass + pass + def parseArguments(): - requiredArgs = getRequiredArgs() - optionalArgs = getOptionalArgs() - outputArgs = getOutputArgs() + requiredArgs = getRequiredArgs() + optionalArgs = getOptionalArgs() + outputArgs = getOutputArgs() - parser = argparse.ArgumentParser( - parents=[requiredArgs, outputArgs, optionalArgs], - formatter_class=CustomFormatter, + parser = argparse.ArgumentParser( + parents=[requiredArgs, outputArgs, optionalArgs], + formatter_class=CustomFormatter, - usage='An example usage is: %(prog)s -s smallRNA.sam -t ' - 'targetRNA.sam -o output', + usage='An example usage is: %(prog)s -s smallRNA.sam -t ' + 'targetRNA.sam -o output', - description=textwrap.dedent('''\ + description=textwrap.dedent('''\ Given two SAM files, this script tries to find chimeras that are observed between a smallRNA and a targetRNA'''), - add_help=True) + add_help=True) - return parser - -def getRequiredArgs(): - parser = argparse.ArgumentParser(add_help=False) + return parser - required = parser.add_argument_group('Required arguments') - - required.add_argument('--smallRNA', '-s', - help='Provide smallRNA alignment SAM file', - metavar='SAM file', - required=True) - required.add_argument('--targetRNA', '-t', - help='Provide targetRNA alignment SAM file', - metavar='SAM file', - required=True) - required.add_argument("--getGenomicLocationsSmallRNA", "-ggs", - help="Do you want genomic locations for small RNA?", - action="store_true") - - required.add_argument("--getGenomicLocationsTargetRNA", "-ggt", - help="Do you want genomic locations for target RNA?", - action="store_true") - - required.add_argument("--smallRNAAnnotation", "-sa", - action="store", - help="""Provide smallRNA annotation gtf(from Gencode) +def getRequiredArgs(): + parser = argparse.ArgumentParser(add_help=False) + + required = parser.add_argument_group('Required arguments') + + required.add_argument('--smallRNA', '-s', + help='Provide smallRNA alignment SAM file', + metavar='SAM file', + required=True) + required.add_argument('--targetRNA', '-t', + help='Provide targetRNA alignment SAM file', + metavar='SAM file', + required=True) + + required.add_argument("--getGenomicLocationsSmallRNA", "-ggs", + help="Do you want genomic locations for small RNA?", + action="store_true") + + required.add_argument("--getGenomicLocationsTargetRNA", "-ggt", + help="Do you want genomic locations for target RNA?", + action="store_true") + + required.add_argument("--smallRNAAnnotation", "-sa", + action="store", + help="""Provide smallRNA annotation gtf(from Gencode) or gff3(from Mirbase). Only provide gtf from Gencode or gff3 from Mirbase. Does not support other gtf files""") - required.add_argument("--targetRNAAnnotation", "-ta", - action="store", - help="""Provide targetRNA annotation gtf(from Gencode). + required.add_argument("--targetRNAAnnotation", "-ta", + action="store", + help="""Provide targetRNA annotation gtf(from Gencode). Only provide gtf from Gencode. Does not support other gtf files""") - return parser + return parser + def getOutputArgs(): - parser = argparse.ArgumentParser(add_help=False) + parser = argparse.ArgumentParser(add_help=False) - output = parser.add_argument_group('Output') + output = parser.add_argument_group('Output') - output.add_argument("--output", "-o", - help="""The output name without extension (.bed .csv will + output.add_argument("--output", "-o", + help="""The output name without extension (.bed .csv will be added)""", - metavar='output prefix', - required=True) + metavar='output prefix', + required=True) + + return parser - return parser def getOptionalArgs(): - parser = argparse.ArgumentParser(add_help=False) + parser = argparse.ArgumentParser(add_help=False) - optional = parser.add_argument_group('Optional arguments') + optional = parser.add_argument_group('Optional arguments') - optional.add_argument("--overlap", "-ov", - help="""Maximum overlap to be set between two + optional.add_argument("--overlap", "-ov", + help="""Maximum overlap to be set between two molecules when determining chimeras""", - default=4) + default=4) - optional.add_argument("--gap", "-ga", - help="""Maximum gap (number of unaligned nucleotides) to + optional.add_argument("--gap", "-ga", + help="""Maximum gap (number of unaligned nucleotides) to allowed between two molecules within a chimera""", - default=9) + default=9) - optional.add_argument("--logLevel", "-l", - help="Set logging level", - default='INFO', - choices=['INFO', 'DEBUG', 'WARNING', 'ERROR']) + optional.add_argument("--logLevel", "-l", + help="Set logging level", + default='INFO', + choices=['INFO', 'DEBUG', 'WARNING', 'ERROR']) - return parser + return parser def main(): - """The main function to execute when you run the program as a script - """ - parser = parseArguments() - - args = parser.parse_args() - - if args.logLevel == 'DEBUG': - logger = clashchimeras.log.debug_logger('root') - elif args.logLevel == 'WARNING': - logger = clashchimeras.log.warning_logger('root') - elif args.logLevel == 'ERROR': - logger = clashchimeras.log.error_logger('root') - else: - logger = clashchimeras.log.info_logger('root') - - argCheck = Arguments(args, type='find') - argCheck.validateFind() - - smallRNA = SAM(fileName=args.smallRNA) - smallRNA.read() - - targetRNA = SAM(fileName=args.targetRNA) - targetRNA.read() - - if args.getGenomicLocationsSmallRNA: - - if args.smallRNAAnnotation.endswith('gff3'): - - smallRNAAnnotation = GFF(fileName=args.smallRNAAnnotation) - smallRNAAnnotation.read() - - elif args.smallRNAAnnotation.endswith('gtf'): - smallRNAAnnotation = GTF(fileName=args.smallRNAAnnotation) - if 'tRNAs' in args.smallRNAAnnotation: - smallRNAAnnotation.read(featureType='tRNAscan') - else: - smallRNAAnnotation.read() - - if args.getGenomicLocationsTargetRNA: - if args.targetRNAAnnotation.endswith('gtf'): - targetRNAAnnotation = GTF(fileName=args.targetRNAAnnotation) - targetRNAAnnotation.read() - - output = Output(target=args.output, - smallRNABed=args.getGenomicLocationsSmallRNA, - targetRNABed=args.getGenomicLocationsTargetRNA, - overlap=args.overlap, - gap=args.gap) - - common = set(smallRNA.records.keys()).intersection(targetRNA.records.keys()) - - logger.info('Determining chimeras') - - updateInterval = int(len(common)/100) - - if updateInterval == 0: - updateInterval = 1 - pbar = Bar(width=80, max=100, fill=yellow(u"â–ˆ"), - suffix="%(percent)d%% - %(eta)ds") - - for index, queryName in enumerate(common, start=1): - smallRNARecord = smallRNA.access(queryName) - targetRNARecord = targetRNA.access(queryName) - - chimeraString = clashchimeras.methods.chimeraOrNot( - smallRNARecord.cigarString, targetRNARecord.cigarString, - overlap=args.overlap, gap=args.gap) - - if chimeraString: - - if args.getGenomicLocationsSmallRNA: - - try: - smallRNALocation = smallRNAAnnotation.coordinates( - smallRNARecord.refId, - start=smallRNARecord.start, - end=smallRNARecord.end) - except IndexError as e: - logger.error(e) - logger.error('Are you sure you have the correct annotation file for ' - 'smallRNA?') - logger.error('Please check --smallRNAAnnotation -sa and ' - 'and make sure they are matching with the alignment ' - 'file {}'.format(args.smallRNA)) - sys.exit() - output.writeSmallRNABed(queryName, smallRNALocation, - targetRNARecord.refId) - - if args.getGenomicLocationsTargetRNA: - try: - targetRNALocation = targetRNAAnnotation.coordinates( - targetRNARecord.refId, - start=targetRNARecord.start, - end=targetRNARecord.end) - except IndexError as e: - logger.error(e.msg) - logger.error('Are you sure you have the correct annotation file for ' - 'targetRNA?') - logger.error('Please check --targetRNAAnnotation -ta and ' - 'and make sure they are matching with the alignment ' - 'file {}'.format(args.targetRNA)) - sys.exit() - - output.writeTargetRNABed(queryName, targetRNALocation, - smallRNARecord.refId) - - output.write(queryName, smallRNARecord, targetRNARecord) - - - if index % updateInterval == 0: - pbar.next() - pbar.finish() - logger.info('Determining chimeras finished') + """The main function to execute when you run the program as a script + """ + parser = parseArguments() + + args = parser.parse_args() + + if args.logLevel == 'DEBUG': + logger = clashchimeras.log.debug_logger('root') + elif args.logLevel == 'WARNING': + logger = clashchimeras.log.warning_logger('root') + elif args.logLevel == 'ERROR': + logger = clashchimeras.log.error_logger('root') + else: + logger = clashchimeras.log.info_logger('root') + + argCheck = Arguments(args, type='find') + argCheck.validateFind() + + smallRNA = SAM(fileName=args.smallRNA) + smallRNA.read() + + targetRNA = SAM(fileName=args.targetRNA) + targetRNA.read() + + if args.getGenomicLocationsSmallRNA: + + if args.smallRNAAnnotation.endswith('gff3'): + + smallRNAAnnotation = GFF(fileName=args.smallRNAAnnotation) + smallRNAAnnotation.read() + + elif args.smallRNAAnnotation.endswith('gtf'): + smallRNAAnnotation = GTF(fileName=args.smallRNAAnnotation) + if 'tRNAs' in args.smallRNAAnnotation: + smallRNAAnnotation.read(featureType='tRNAscan') + else: + smallRNAAnnotation.read() + + if args.getGenomicLocationsTargetRNA: + if args.targetRNAAnnotation.endswith('gtf'): + targetRNAAnnotation = GTF(fileName=args.targetRNAAnnotation) + targetRNAAnnotation.read() + + output = Output(target=args.output, + smallRNABed=args.getGenomicLocationsSmallRNA, + targetRNABed=args.getGenomicLocationsTargetRNA, + overlap=args.overlap, + gap=args.gap) + + common = set(smallRNA.records.keys()).intersection(targetRNA.records.keys()) + + logger.info('Determining chimeras') + + for index, queryName in enumerate(common, start=1): + smallRNARecord = smallRNA.access(queryName) + targetRNARecord = targetRNA.access(queryName) + + chimeraString = clashchimeras.methods.chimeraOrNot( + smallRNARecord.cigarString, targetRNARecord.cigarString, + overlap=args.overlap, gap=args.gap) + + if chimeraString: + + if args.getGenomicLocationsSmallRNA: + + try: + smallRNALocation = smallRNAAnnotation.coordinates( + smallRNARecord.refId, + start=smallRNARecord.start, + end=smallRNARecord.end) + except IndexError as e: + logger.error(e) + logger.error('Are you sure you have the correct annotation file for ' + 'smallRNA?') + logger.error('Please check --smallRNAAnnotation -sa and ' + 'and make sure they are matching with the alignment ' + 'file {}'.format(args.smallRNA)) + sys.exit() + output.writeSmallRNABed(queryName, smallRNALocation, + targetRNARecord.refId) + + if args.getGenomicLocationsTargetRNA: + try: + targetRNALocation = targetRNAAnnotation.coordinates( + targetRNARecord.refId, + start=targetRNARecord.start, + end=targetRNARecord.end) + except IndexError as e: + logger.error(e.msg) + logger.error('Are you sure you have the correct annotation file for ' + 'targetRNA?') + logger.error('Please check --targetRNAAnnotation -ta and ' + 'and make sure they are matching with the alignment ' + 'file {}'.format(args.targetRNA)) + sys.exit() + + output.writeTargetRNABed(queryName, targetRNALocation, + smallRNARecord.refId) + + output.write(queryName, smallRNARecord, targetRNARecord) + + logger.info('Determining chimeras finished') if __name__ == "__main__": - main() + main() diff --git a/clashchimeras/initialize.py b/clashchimeras/initialize.py index d25c58e..17e275f 100644 --- a/clashchimeras/initialize.py +++ b/clashchimeras/initialize.py @@ -23,729 +23,737 @@ logger = logging.getLogger('root') + class Releases: - def __init__(self, gencodeOrganism='H.sapiens', mirbaseOrganism='hsa', - path=os.path.join(os.path.expanduser("~"),'.CLASHchimeras')): - organisms = {'H.sapiens': 'http://www.gencodegenes.org/releases/', - 'M.musculus': 'http://www.gencodegenes.org/mouse_releases/'} - shortOrganisms = {'H.sapiens': 'human', - 'M.musculus': 'mouse'} - self.gencodeOrganism = gencodeOrganism - self.mirbaseOrganism = mirbaseOrganism - self.path = path - self.url = organisms[self.gencodeOrganism] - self.gencodeShortOrganism = shortOrganisms[self.gencodeOrganism] - self.mirbaseUrl = 'ftp://mirbase.org/pub/mirbase/CURRENT/README' - self.df = self.openUrl() - self.mirbaseDf = self.openMirbaseReadme() - - self.selectGencodeRelease() - - self.selectMirbaseRelease() - - g = Gencode(release=self.gencodeRelease, - path=self.gencodePath, - ftpDir=self.gencodeFtpDir) - - g.download() - - m = Mirbase(release=self.mirbaseRelease, - path=self.mirbasePath, - ftpDir=self.mirbaseFtpDir, - organism=self.mirbaseOrganism) - - m.download() - - m.process() - - def openUrl(self): - r = requests.get(self.url) - data = r.text - soup = BeautifulSoup(data) - table = soup.find("table") - headings = [th.get_text().strip() for th in table.find("tr").find_all("th")] - dataset = defaultdict(dict) - for index, row in enumerate(table.find_all("tr")[1:]): - for i, j in zip(headings, - (td.get_text().strip() for td in row.find_all("td"))): - dataset[index][i] = j - - return pd.DataFrame(dataset).transpose() - - def openMirbaseReadme(self): - with ftputil.FTPHost('mirbase.org', 'anonymous', 'anonymous') as fH: - fobj = fH.open('pub/mirbase/CURRENT/README') - store = False - dataset = defaultdict(dict) - index = 0 - for line in fobj.readlines(): - if store: - row = line.strip().split() - if len(row) == 3 and row[1][0].isdigit(): - dataset[index]['Version'] = row[0] - dataset[index]['Date'] = row[1] - dataset[index]['Entries'] = row[2] - index += 1 - if 'HISTORY' in line: - store = True - - return pd.DataFrame(dataset).transpose() - - def downloadMirbaseRelease(self, key): - logger.warn('Are you sure??') - decision = input('Y/n: ') - if decision.lower() == 'y' or decision == '': - - self.mirbaseRelease = self.mirbaseDf.loc[int(key)]['Version'] - - os.makedirs(os.path.join(self.path, 'Mirbase', - str(self.mirbaseRelease)), exist_ok=True) - - self.mirbasePath = os.path.join(self.path, 'Mirbase', - str(self.mirbaseRelease)) - - self.mirbaseFtpDir = os.path.join('pub/mirbase', self.mirbaseRelease) - - return True - - def downloadGencodeRelease(self, key): - logger.warn('Are you sure??') - decision = input('Y/n: ') - if decision.lower() == 'y' or decision == '': - - self.gencodeRelease = self.df.loc[int(key)]['GENCODE release'] - - os.makedirs(os.path.join(self.path, 'Gencode', self.gencodeOrganism, - str(self.gencodeRelease)), exist_ok=True) - - self.gencodePath = os.path.join(self.path, 'Gencode', - self.gencodeOrganism, - str(self.gencodeRelease)) - - self.gencodeFtpDir = os.path.join('pub/gencode/Gencode_'+\ - self.gencodeShortOrganism, - 'release_'+self.gencodeRelease) - - return True - - def selectGencodeRelease(self): - cols = self.df.columns - - logger.info("Select the release that you want \n{}".format( - tabulate(self.df[[cols[2], cols[0], - cols[1], cols[3]]], - headers=['Index', cols[2], cols[0], cols[1], cols[3]], - tablefmt="simple"))) - - logger.warn('Please bare in mind that the automatic download relies on ' - 'regex search which is known to work for Gencode release 17 ' - 'and higher ') - - releaseKeys = self.df.index - - while True: - logger.warn('Please select which Gencode release to use (select index '+\ - '[{}]): '.format(min(self.df.index))) - key = input('Index: ') - if key.isdigit() and int(key) in releaseKeys: - logger.warn('This will download the Gencode release {} which ' - 'corresponds to Ensembl release {} and Genome assembly ' - 'version {} freezed on {}'.format( - self.df.loc[int(key)]['GENCODE release'], - self.df.loc[int(key)]['Ensembl release'], - self.df.loc[int(key)]['Genome assembly version'], - self.df.loc[int(key)]['Freeze date *'])) - if self.downloadGencodeRelease(key): - break - - elif key == '': - logger.warn('This will download the latest Gencode release {} which ' - 'corresponds to Ensembl release {} and Genome assembly ' - 'version {} freezed on {}'.format( - self.df.loc[min(self.df.index)]['GENCODE release'], - self.df.loc[min(self.df.index)]['Ensembl release'], - self.df.loc[min(self.df.index)]['Genome assembly ' - 'version'], - self.df.loc[min(self.df.index)]['Freeze date *'])) - if self.downloadGencodeRelease(min(self.df.index)): - break - else: - logger.warn('Not a valid release index:') - - def selectMirbaseRelease(self): - cols = self.mirbaseDf.columns - - logger.info("Select the Mirbase release you want \n{}".format( - tabulate(self.mirbaseDf, headers=['Index', cols[0], cols[1], - cols[2]], tablefmt="simple"))) - - releaseKeys = self.mirbaseDf.index - - while True: - logger.warn('Please select which Mirbase release to use (select index '+\ - '[{}]): '.format(max(self.mirbaseDf.index))) - key = input('Index: ') - if key.isdigit() and int(key) in releaseKeys: - logger.warn('This will download miRBase release {} which contains {} ' - 'entries and released on {}'.format( - self.mirbaseDf.loc[int(key)]['Version'], - self.mirbaseDf.loc[int(key)]['Entries'], - self.mirbaseDf.loc[int(key)]['Date'])) - if self.downloadMirbaseRelease(key): - break - - elif key == '': - logger.warn('This will download miRBase release {} which contains {} ' - 'entries and released on {}'.format( - self.mirbaseDf.loc[max(self.mirbaseDf.index)]['Version'], - self.mirbaseDf.loc[max(self.mirbaseDf.index)]['Entries'], - self.mirbaseDf.loc[max(self.mirbaseDf.index)]['Date'])) - if self.downloadMirbaseRelease(max(self.mirbaseDf.index)): - break - - else: - logger.warn('Not a valid release index:') + + def __init__(self, gencodeOrganism='H.sapiens', mirbaseOrganism='hsa', + path=os.path.join(os.path.expanduser("~"), '.CLASHchimeras')): + organisms = {'H.sapiens': 'http://www.gencodegenes.org/releases/', + 'M.musculus': 'http://www.gencodegenes.org/mouse_releases/'} + shortOrganisms = {'H.sapiens': 'human', + 'M.musculus': 'mouse'} + self.gencodeOrganism = gencodeOrganism + self.mirbaseOrganism = mirbaseOrganism + self.path = path + self.url = organisms[self.gencodeOrganism] + self.gencodeShortOrganism = shortOrganisms[self.gencodeOrganism] + self.mirbaseUrl = 'ftp://mirbase.org/pub/mirbase/CURRENT/README' + self.df = self.openUrl() + self.mirbaseDf = self.openMirbaseReadme() + + self.selectGencodeRelease() + + self.selectMirbaseRelease() + + g = Gencode(release=self.gencodeRelease, + path=self.gencodePath, + ftpDir=self.gencodeFtpDir) + + g.download() + + m = Mirbase(release=self.mirbaseRelease, + path=self.mirbasePath, + ftpDir=self.mirbaseFtpDir, + organism=self.mirbaseOrganism) + + m.download() + + m.process() + + def openUrl(self): + r = requests.get(self.url) + data = r.text + soup = BeautifulSoup(data) + table = soup.find("table") + headings = [th.get_text().strip() for th in table.find("tr").find_all("th")] + dataset = defaultdict(dict) + for index, row in enumerate(table.find_all("tr")[1:]): + for i, j in zip(headings, + (td.get_text().strip() for td in row.find_all("td"))): + dataset[index][i] = j + + return pd.DataFrame(dataset).transpose() + + def openMirbaseReadme(self): + with ftputil.FTPHost('mirbase.org', 'anonymous', 'anonymous') as fH: + fobj = fH.open('pub/mirbase/CURRENT/README') + store = False + dataset = defaultdict(dict) + index = 0 + for line in fobj.readlines(): + if store: + row = line.strip().split() + if len(row) == 3 and row[1][0].isdigit(): + dataset[index]['Version'] = row[0] + dataset[index]['Date'] = row[1] + dataset[index]['Entries'] = row[2] + index += 1 + if 'HISTORY' in line: + store = True + + return pd.DataFrame(dataset).transpose() + + def downloadMirbaseRelease(self, key): + logger.warn('Are you sure??') + decision = input('Y/n: ') + if decision.lower() == 'y' or decision == '': + + self.mirbaseRelease = self.mirbaseDf.loc[int(key)]['Version'] + + os.makedirs(os.path.join(self.path, 'Mirbase', + str(self.mirbaseRelease)), exist_ok=True) + + self.mirbasePath = os.path.join(self.path, 'Mirbase', + str(self.mirbaseRelease)) + + self.mirbaseFtpDir = os.path.join('pub/mirbase', self.mirbaseRelease) + + return True + + def downloadGencodeRelease(self, key): + logger.warn('Are you sure??') + decision = input('Y/n: ') + if decision.lower() == 'y' or decision == '': + + self.gencodeRelease = self.df.loc[int(key)]['GENCODE release'] + + os.makedirs(os.path.join(self.path, 'Gencode', self.gencodeOrganism, + str(self.gencodeRelease)), exist_ok=True) + + self.gencodePath = os.path.join(self.path, 'Gencode', + self.gencodeOrganism, + str(self.gencodeRelease)) + + self.gencodeFtpDir = os.path.join('pub/gencode/Gencode_' + + self.gencodeShortOrganism, + 'release_' + self.gencodeRelease) + + return True + + def selectGencodeRelease(self): + cols = self.df.columns + + logger.info("Select the release that you want \n{}".format( + tabulate(self.df[[cols[2], cols[0], + cols[1], cols[3]]], + headers=['Index', cols[2], cols[0], cols[1], cols[3]], + tablefmt="simple"))) + + logger.warn('Please bare in mind that the automatic download relies on ' + 'regex search which is known to work for Gencode release 17 ' + 'and higher ') + + releaseKeys = self.df.index + + while True: + logger.warn('Please select which Gencode release to use (select index ' + + '[{}]): '.format(min(self.df.index))) + key = input('Index: ') + if key.isdigit() and int(key) in releaseKeys: + logger.warn('This will download the Gencode release {} which ' + 'corresponds to Ensembl release {} and Genome assembly ' + 'version {} freezed on {}'.format( + self.df.loc[int(key)]['GENCODE release'], + self.df.loc[int(key)]['Ensembl release'], + self.df.loc[int(key)]['Genome assembly version'], + self.df.loc[int(key)]['Freeze date *'])) + if self.downloadGencodeRelease(key): + break + + elif key == '': + logger.warn('This will download the latest Gencode release {} which ' + 'corresponds to Ensembl release {} and Genome assembly ' + 'version {} freezed on {}'.format( + self.df.loc[min(self.df.index)]['GENCODE release'], + self.df.loc[min(self.df.index)]['Ensembl release'], + self.df.loc[min(self.df.index)]['Genome assembly ' + 'version'], + self.df.loc[min(self.df.index)]['Freeze date *'])) + if self.downloadGencodeRelease(min(self.df.index)): + break + else: + logger.warn('Not a valid release index:') + + def selectMirbaseRelease(self): + cols = self.mirbaseDf.columns + + logger.info("Select the Mirbase release you want \n{}".format( + tabulate(self.mirbaseDf, headers=['Index', cols[0], cols[1], + cols[2]], tablefmt="simple"))) + + releaseKeys = self.mirbaseDf.index + + while True: + logger.warn('Please select which Mirbase release to use (select index ' + + '[{}]): '.format(max(self.mirbaseDf.index))) + key = input('Index: ') + if key.isdigit() and int(key) in releaseKeys: + logger.warn('This will download miRBase release {} which contains {} ' + 'entries and released on {}'.format( + self.mirbaseDf.loc[int(key)]['Version'], + self.mirbaseDf.loc[int(key)]['Entries'], + self.mirbaseDf.loc[int(key)]['Date'])) + if self.downloadMirbaseRelease(key): + break + + elif key == '': + logger.warn('This will download miRBase release {} which contains {} ' + 'entries and released on {}'.format( + self.mirbaseDf.loc[max(self.mirbaseDf.index)]['Version'], + self.mirbaseDf.loc[max(self.mirbaseDf.index)]['Entries'], + self.mirbaseDf.loc[max(self.mirbaseDf.index)]['Date'])) + if self.downloadMirbaseRelease(max(self.mirbaseDf.index)): + break + + else: + logger.warn('Not a valid release index:') + class Arguments: - def __init__(self, args, type=None): - self.args = args - if type == 'align': - if args.genomeIndex: - self.args.genomeIndex = os.path.expanduser(args.genomeIndex) - if args.transcriptomeIndex: - self.args.transcriptomeIndex = os.path.expanduser( - args.transcriptomeIndex) - if args.input: - self.args.input = os.path.expanduser(args.input) - if args.smallRNAIndex: - self.args.smallRNAIndex = os.path.expanduser(args.smallRNAIndex) - if self.args.targetRNAIndex: - self.args.targetRNAIndex = os.path.expanduser(args.targetRNAIndex) - - if args.output: - self.args.output = os.path.expanduser(args.output) - - if type == 'find': - if args.smallRNAAnnotation: - self.args.smallRNAAnnotation = os.path.expanduser( - args.smallRNAAnnotation) - if args.targetRNAAnnotation: - self.args.targetRNAAnnotation = os.path.expanduser( - args.targetRNAAnnotation) - if args.smallRNA: - self.args.smallRNA = os.path.expanduser(args.smallRNA) - if args.targetRNA: - self.args.targetRNA = os.path.expanduser(args.targetRNA) - - self.exit = False - - def hash(self, file): - sha256 = hashlib.sha256() - with open(file, 'r+b') as f: - while True: - buf = f.read(8192) - if not buf: - break - sha256.update(buf) - return sha256.hexdigest() - - def validateFind(self): - if not os.path.exists(self.args.smallRNA): - logger.error('{} not found...'.format(self.args.smallRNA)) - self.exit = True - if not os.path.exists(self.args.targetRNA): - logger.error('{} not found...'.format(self.args.smallRNA)) - self.exit = True - if self.args.getGenomicLocationsSmallRNA: - if not self.args.smallRNAAnnotation: - logger.error('--smallRNAAnnotation -sa not specified. If you want ' - 'genomic locations for smallRNA, please enable ' - '--getGenomicLocationsSmallRNA -ggs and specify ' - 'annotation file using --smallRNAAnnotation -sa') - self.exit = True - else: - if not os.path.exists(self.args.smallRNAAnnotation): - logger.error('{} not found'.format(self.args.smallRNAAnnotation)) - self.exit = True - if self.args.getGenomicLocationsTargetRNA: - if not self.args.targetRNAAnnotation: - logger.error('--targetRNAAnnotation -ta not specified. If you want ' - 'genomic locations for targetRNA, please enable ' - '--getGenomicLocationsTargetRNA -ggt and specify ' - 'annotation file using --targetRNAAnnotation -ta') - self.exit = True - else: - if not os.path.exists(self.args.targetRNAAnnotation): - logger.error('{} not found...'.format(self.args.targetRNAAnnotation)) - self.exit = True - if self.hash(self.args.smallRNA) == self.hash(self.args.targetRNA): - logger.error('CLASHChimeras does not detect chimeras between the same ' - 'RNA type yet... Please hang in there, we are planning it ' - 'in the feature') - self.exit = True - - if self.exit: - logger.error('Exiting because of the above errors...') - sys.exit() - elif self.hash(self.args.smallRNA) == self.hash(self.args.targetRNA): - logger.error('CLASHChimeras does not detect chimeras between the same ' - 'RNA type yet... Please hang in there, we are planning it ' - 'in the feature') - sys.exit() - - def validateAlign(self): - if not os.path.exists(self.args.input): - logger.error('{} not found...'.format(self.args.input)) - self.exit = True - if self.args.run == 'bowtie2' and (not self.args.smallRNAIndex or \ - not self.args.targetRNAIndex): - logger.error('{}'.format('Please specify --smallRNAIndex -si and '+\ - '--targetRNAIndex -ti properly...')) - self.exit = True - if self.args.smallRNAIndex: - bts = glob.glob(self.args.smallRNAIndex+'*.bt2') - if not len(bts) >= 6: - logger.error('Something wrong with the {}'.format( - self.args.smallRNAIndex)) - logger.error("Can't find all the .bt2 files for that index") - self.exit=True - if self.args.targetRNAIndex: - bts = glob.glob(self.args.targetRNAIndex+'*.bt2') - if not len(bts) >= 6: - logger.error('Something wrong with the {}'.format( - self.args.targetRNAIndex)) - logger.error("Can't find all the .bt2 files for that index") - self.exit=True - if self.args.genomeIndex: - bts = glob.glob(self.args.genomeIndex.strip()+'*.bt2') - if not len(bts) >= 6: - logger.error('Something wrong with the {}'.format( - self.args.genomeIndex)) - logger.error("Can't find all the .bt2 files for that index") - self.exit=True - if self.args.run == 'tophat' and not self.args.genomeIndex: - logger.error('Please provide genome index if you want to use tophat..') - self.exit = True - if self.exit: - sys.exit() + + def __init__(self, args, type=None): + self.args = args + if type == 'align': + if args.genomeIndex: + self.args.genomeIndex = os.path.expanduser(args.genomeIndex) + if args.transcriptomeIndex: + self.args.transcriptomeIndex = os.path.expanduser( + args.transcriptomeIndex) + if args.input: + self.args.input = os.path.expanduser(args.input) + if args.smallRNAIndex: + self.args.smallRNAIndex = os.path.expanduser(args.smallRNAIndex) + if self.args.targetRNAIndex: + self.args.targetRNAIndex = os.path.expanduser(args.targetRNAIndex) + + if args.output: + self.args.output = os.path.expanduser(args.output) + + if type == 'find': + if args.smallRNAAnnotation: + self.args.smallRNAAnnotation = os.path.expanduser( + args.smallRNAAnnotation) + if args.targetRNAAnnotation: + self.args.targetRNAAnnotation = os.path.expanduser( + args.targetRNAAnnotation) + if args.smallRNA: + self.args.smallRNA = os.path.expanduser(args.smallRNA) + if args.targetRNA: + self.args.targetRNA = os.path.expanduser(args.targetRNA) + + self.exit = False + + def hash(self, file): + sha256 = hashlib.sha256() + with open(file, 'r+b') as f: + while True: + buf = f.read(8192) + if not buf: + break + sha256.update(buf) + return sha256.hexdigest() + + def validateFind(self): + if not os.path.exists(self.args.smallRNA): + logger.error('{} not found...'.format(self.args.smallRNA)) + self.exit = True + if not os.path.exists(self.args.targetRNA): + logger.error('{} not found...'.format(self.args.smallRNA)) + self.exit = True + if self.args.getGenomicLocationsSmallRNA: + if not self.args.smallRNAAnnotation: + logger.error('--smallRNAAnnotation -sa not specified. If you want ' + 'genomic locations for smallRNA, please enable ' + '--getGenomicLocationsSmallRNA -ggs and specify ' + 'annotation file using --smallRNAAnnotation -sa') + self.exit = True + else: + if not os.path.exists(self.args.smallRNAAnnotation): + logger.error('{} not found'.format(self.args.smallRNAAnnotation)) + self.exit = True + if self.args.getGenomicLocationsTargetRNA: + if not self.args.targetRNAAnnotation: + logger.error('--targetRNAAnnotation -ta not specified. If you want ' + 'genomic locations for targetRNA, please enable ' + '--getGenomicLocationsTargetRNA -ggt and specify ' + 'annotation file using --targetRNAAnnotation -ta') + self.exit = True + else: + if not os.path.exists(self.args.targetRNAAnnotation): + logger.error('{} not found...'.format(self.args.targetRNAAnnotation)) + self.exit = True + if self.hash(self.args.smallRNA) == self.hash(self.args.targetRNA): + logger.error('CLASHChimeras does not detect chimeras between the same ' + 'RNA type yet... Please hang in there, we are planning it ' + 'in the feature') + self.exit = True + + if self.exit: + logger.error('Exiting because of the above errors...') + sys.exit() + elif self.hash(self.args.smallRNA) == self.hash(self.args.targetRNA): + logger.error('CLASHChimeras does not detect chimeras between the same ' + 'RNA type yet... Please hang in there, we are planning it ' + 'in the feature') + sys.exit() + + def validateAlign(self): + if not os.path.exists(self.args.input): + logger.error('{} not found...'.format(self.args.input)) + self.exit = True + if self.args.run == 'bowtie2' and (not self.args.smallRNAIndex or + not self.args.targetRNAIndex): + logger.error('{}'.format('Please specify --smallRNAIndex -si and ' + + '--targetRNAIndex -ti properly...')) + self.exit = True + if self.args.smallRNAIndex: + bts = glob.glob(self.args.smallRNAIndex + '*.bt2') + if not len(bts) >= 6: + logger.error('Something wrong with the {}'.format( + self.args.smallRNAIndex)) + logger.error("Can't find all the .bt2 files for that index") + self.exit = True + if self.args.targetRNAIndex: + bts = glob.glob(self.args.targetRNAIndex + '*.bt2') + if not len(bts) >= 6: + logger.error('Something wrong with the {}'.format( + self.args.targetRNAIndex)) + logger.error("Can't find all the .bt2 files for that index") + self.exit = True + if self.args.genomeIndex: + bts = glob.glob(self.args.genomeIndex.strip() + '*.bt2') + if not len(bts) >= 6: + logger.error('Something wrong with the {}'.format( + self.args.genomeIndex)) + logger.error("Can't find all the .bt2 files for that index") + self.exit = True + if self.args.run == 'tophat' and not self.args.genomeIndex: + logger.error('Please provide genome index if you want to use tophat..') + self.exit = True + if self.exit: + sys.exit() + class Gencode: - """This class makes sure the required files (sequences, annotation etc.) are - present and in working order. It generates sha256 checksums and autodownloads - the required files from Gencode Genes. - """ - def __init__(self, release=None, user="anonymous", password=None, - path=None, ftpDir=None): - - self.downloaded = [] - self.release = release - self.reList = ['^GRC.+\.genome\.fa', - '^gencode.+chr_patch_hapl_scaff\.annotation\.gtf', - '^gencode.+pc_transcripts\.fa', - '^gencode.+tRNAs\.gtf', - '^gencode.+lncRNA_transcripts\.fa'] - - self.host = "ftp.sanger.ac.uk" - self.user = user - self.path = path - - self.files = [] - self.otherFiles = {} - biotypes = ['snRNA', 'snoRNA', 'misc_RNA', 'tRNA'] - for b in biotypes: - self.otherFiles[b] = 'gencode.v{}.{}_transcripts.fa'.format(self.release, - b) - if self.user == "anonymous": self.password = "anonymous" - else: self.password = password - - ftp_host = ftputil.FTPHost(self.host, self.user, self.password) - - self.dir = ftpDir - fileList = ftp_host.listdir(self.dir) - for file in fileList: - for rEx in self.reList: - if re.match(rEx, file): - if not 'primary_assembly' in file: - self.files.append(file) - ftp_host.close() - _files = self.files.copy() - _otherFiles = self.otherFiles.copy() - - for file in _files: - logger.debug('Checking %s' % file) - _file = file.rpartition(".")[0] - if os.path.exists(os.path.join(self.path, _file+'.sha256sum')) and \ - os.path.exists(os.path.join(self.path, _file)): - with open(os.path.join(self.path, _file+'.sha256sum')) as f: - s = f.readline().rstrip() - _s = self.hash(os.path.join(self.path, _file)) - if s == _s: - logger.info("%s is present and verified" % _file) - self.files.remove(file) + """This class makes sure the required files (sequences, annotation etc.) are + present and in working order. It generates sha256 checksums and autodownloads + the required files from Gencode Genes. + """ + + def __init__(self, release=None, user="anonymous", password=None, + path=None, ftpDir=None): + + self.downloaded = [] + self.release = release + self.reList = ['^GRC.+\.genome\.fa', + '^gencode.+chr_patch_hapl_scaff\.annotation\.gtf', + '^gencode.+pc_transcripts\.fa', + '^gencode.+tRNAs\.gtf', + '^gencode.+lncRNA_transcripts\.fa'] + + self.host = "ftp.sanger.ac.uk" + self.user = user + self.path = path + + self.files = [] + self.otherFiles = {} + biotypes = ['snRNA', 'snoRNA', 'misc_RNA', 'tRNA'] + for b in biotypes: + self.otherFiles[b] = 'gencode.v{}.{}_transcripts.fa'.format(self.release, + b) + if self.user == "anonymous": + self.password = "anonymous" + else: + self.password = password + + ftp_host = ftputil.FTPHost(self.host, self.user, self.password) + + self.dir = ftpDir + fileList = ftp_host.listdir(self.dir) + for file in fileList: + for rEx in self.reList: + if re.match(rEx, file): + if not 'primary_assembly' in file: + self.files.append(file) + ftp_host.close() + _files = self.files.copy() + _otherFiles = self.otherFiles.copy() + + for file in _files: + logger.debug('Checking %s' % file) + _file = file.rpartition(".")[0] + if os.path.exists(os.path.join(self.path, _file + '.sha256sum')) and \ + os.path.exists(os.path.join(self.path, _file)): + with open(os.path.join(self.path, _file + '.sha256sum')) as f: + s = f.readline().rstrip() + _s = self.hash(os.path.join(self.path, _file)) + if s == _s: + logger.info("%s is present and verified" % _file) + self.files.remove(file) + else: + logger.warn('%s is downloaded but doesnt match the sha256sum' % _file) + logger.error('Will be downloaded again') + else: + logger.warn('%s will be downloaded' % _file) + + for biotype, file in _otherFiles.items(): + logger.debug('Checking %s' % file) + _file = file + if os.path.exists(os.path.join(self.path, _file + '.sha256sum')) and \ + os.path.exists(os.path.join(self.path, _file)): + with open(os.path.join(self.path, _file + '.sha256sum')) as f: + s = f.readline().rstrip() + _s = self.hash(os.path.join(self.path, _file)) + if s == _s: + logger.info("%s is present and verified" % file) + self.otherFiles.pop(biotype) + else: + logger.warn('%s is generated but doesnt match the sha256sum' % file) + logger.error('Will be generated again') + else: + logger.warn('%s will be generated' % file) + + def download(self): + + if len(self.files) == 0: + logger.info('Gencode files are downloaded and checksum verified...') + else: + with ftputil.FTPHost(self.host, self.user, self.password) as fH: + for file in self.files: + logger.info('Downloading %s from ftp.sanger.ac.uk' % file) + fH.download(self.dir + '/' + file, os.path.join(self.path, file), + callback=None) + self.downloaded.append(file) + _file = file.rpartition(".")[0] + p = subprocess.Popen(['gunzip', os.path.join(self.path, file)]) + p.communicate() + sha256sum = self.hash(os.path.join(self.path, _file)) + with open(os.path.join(self.path, _file + '.sha256sum'), 'w') as wH: + print(sha256sum, file=wH) + logger.info('Downloading, extraction and hashing of %s finished' % + file) + + gtfFiles = glob.glob(os.path.join(self.path, "*.annotation.gtf")) + + if len(gtfFiles) == 0: + logger.warn('This release does not contain annotation file or they are ' + 'not grabbed by regex. Please download it manually from {' + '}'.format(self.dir)) + gtfFile = None + elif len(gtfFiles) == 2: + gtfFiles.sort() + gtfFile = gtfFiles[1] + elif len(gtfFiles) == 1: + gtfFile = gtfFiles[0] + + genomeFiles = glob.glob(os.path.join(self.path, "*.genome.fa")) + + if len(genomeFiles) == 0: + logger.warn('This release does not contain genome file or they are not ' + 'grabbed by regex. Please download it manually from {' + '}'.format(self.dir)) + genomeFile = None + else: + genomeFile = genomeFiles[0] + + tRNAgtfFiles = glob.glob(os.path.join(self.path, "*.tRNAs.gtf")) + + if len(tRNAgtfFiles) == 0: + logger.warn(('This release does not contain tRNA annotation file or they ' + 'are not grabbed by regex. Please download it manually from {' + '}'.format(self.dir))) + tRNAgtfFile = None else: - logger.warn('%s is downloaded but doesnt match the sha256sum' % _file) - logger.error('Will be downloaded again') - else: - logger.warn('%s will be downloaded' % _file) - - for biotype, file in _otherFiles.items(): - logger.debug('Checking %s' % file) - _file = file - if os.path.exists(os.path.join(self.path, _file+'.sha256sum')) and \ - os.path.exists(os.path.join(self.path, _file)): - with open(os.path.join(self.path, _file+'.sha256sum')) as f: - s = f.readline().rstrip() - _s = self.hash(os.path.join(self.path, _file)) - if s == _s: - logger.info("%s is present and verified" % file) - self.otherFiles.pop(biotype) + tRNAgtfFile = tRNAgtfFiles[0] + + if len(self.otherFiles) == 0: + logger.info('Other fasta files are generated and checksum verified...') else: - logger.warn('%s is generated but doesnt match the sha256sum' % file) - logger.error('Will be generated again') - else: - logger.warn('%s will be generated' % file) - - - def download(self): - - if len(self.files) == 0: - logger.info('Gencode files are downloaded and checksum verified...') - else: - with ftputil.FTPHost(self.host, self.user, self.password) as fH: - for file in self.files: - logger.info('Downloading %s from ftp.sanger.ac.uk' % file) - fH.download(self.dir+'/'+file, os.path.join(self.path, file), - callback=None) - self.downloaded.append(file) - _file = file.rpartition(".")[0] - p = subprocess.Popen(['gunzip', os.path.join(self.path, file)]) - p.communicate() - sha256sum = self.hash(os.path.join(self.path, _file)) - with open(os.path.join(self.path, _file+'.sha256sum'), 'w') as wH: - print(sha256sum, file=wH) - logger.info('Downloading, extraction and hashing of %s finished' % \ - file) - - gtfFiles = glob.glob(os.path.join(self.path, "*.annotation.gtf")) - - if len(gtfFiles) == 0: - logger.warn('This release does not contain annotation file or they are ' - 'not grabbed by regex. Please download it manually from {' - '}'.format(self.dir)) - gtfFile = None - elif len(gtfFiles) == 2: - gtfFiles.sort() - gtfFile = gtfFiles[1] - elif len(gtfFiles) == 1: - gtfFile = gtfFiles[0] - - genomeFiles = glob.glob(os.path.join(self.path, "*.genome.fa")) - - if len(genomeFiles) == 0: - logger.warn('This release does not contain genome file or they are not ' - 'grabbed by regex. Please download it manually from {' - '}'.format(self.dir)) - genomeFile = None - else: - genomeFile = genomeFiles[0] - - - tRNAgtfFiles = glob.glob(os.path.join(self.path, "*.tRNAs.gtf")) - - if len(tRNAgtfFiles) == 0: - logger.warn(('This release does not contain tRNA annotation file or they ' - 'are not grabbed by regex. Please download it manually from {' - '}'.format(self.dir))) - tRNAgtfFile = None - else: - tRNAgtfFile = tRNAgtfFiles[0] - - - if len(self.otherFiles) == 0: - logger.info('Other fasta files are generated and checksum verified...') - else: - for biotype, file in self.otherFiles.items(): - if biotype == 'tRNA' and (tRNAgtfFile and genomeFile): - fasta = Fasta(genome=genomeFile, gtf=tRNAgtfFile) - fasta.getBiotype(biotype='tRNA', output=os.path.join(self.path, - file)) - sha256sum = self.hash(os.path.join(self.path, file)) - with open(os.path.join(self.path, file+'.sha256sum'), 'w') as wH: - print(sha256sum, file=wH) - logger.info('Extraction and hashing of %s finished' % file) - elif gtfFile and genomeFile: - fasta = Fasta(genome=genomeFile, gtf=gtfFile) - fasta.getBiotype(biotype=biotype, output=os.path.join(self.path, - file)) - sha256sum = self.hash(os.path.join(self.path, file)) - with open(os.path.join(self.path, file+'.sha256sum'), 'w') as wH: - print(sha256sum, file=wH) - logger.info('Extraction and hashing of %s finished' % file) - - - def hash(self, file): - sha256 = hashlib.sha256() - with open(file, 'r+b') as f: - while True: - buf = f.read(8192) - if not buf: - break - sha256.update(buf) - return sha256.hexdigest() + for biotype, file in self.otherFiles.items(): + if biotype == 'tRNA' and (tRNAgtfFile and genomeFile): + fasta = Fasta(genome=genomeFile, gtf=tRNAgtfFile) + fasta.getBiotype(biotype='tRNA', output=os.path.join(self.path, + file)) + sha256sum = self.hash(os.path.join(self.path, file)) + with open(os.path.join(self.path, file + '.sha256sum'), 'w') as wH: + print(sha256sum, file=wH) + logger.info('Extraction and hashing of %s finished' % file) + elif gtfFile and genomeFile: + fasta = Fasta(genome=genomeFile, gtf=gtfFile) + fasta.getBiotype(biotype=biotype, output=os.path.join(self.path, + file)) + sha256sum = self.hash(os.path.join(self.path, file)) + with open(os.path.join(self.path, file + '.sha256sum'), 'w') as wH: + print(sha256sum, file=wH) + logger.info('Extraction and hashing of %s finished' % file) + + def hash(self, file): + sha256 = hashlib.sha256() + with open(file, 'r+b') as f: + while True: + buf = f.read(8192) + if not buf: + break + sha256.update(buf) + return sha256.hexdigest() + class Mirbase: - """This class makes sure that requried files (sequences, annotations etc.) - from Mirbase are downloaded. It generates sha256 checksums and autodownloads - the required files - """ - - def __init__(self, release=None, user="anonymous", password=None, - path=os.path.join(os.path.expanduser("~"),'.CLASHchimeras'), - organism='hsa', - ftpDir=None): - self.reList = ['^hairpin\.fa\.gz', '^mature\.fa\.gz'] - organisms = {} - with ftputil.FTPHost('mirbase.org', 'anonymous', 'anonymous') as fH: - tH, tP = tempfile.mkstemp() - fH.download(os.path.join('pub/mirbase', release, 'organisms.txt.gz'), tP) - for i in gzip.open(tP): - ii = i.decode('utf-8').strip() - if not ii.startswith('#'): - organisms[ii.split("\t")[0]] = ii.split("\t")[2] - - self.shortOrganism = organism - self.organism = organisms.get(self.shortOrganism, None) - if not self.organism: - logger.error('{} is not a valid organism name in Mirbase'.format( - self.shortOrganism)) - sys.exit() - self.host = 'mirbase.org' - self.release = release - self.user = user - self.path = path - self.files = [] - - if self.user == "anonymous": self.password = "anonymous" - else: self.password = password - - ftp_host = ftputil.FTPHost(self.host, self.user, self.password) - - self.dir = ftpDir - - fileList = ftp_host.listdir(self.dir) - - self.files.append('genomes/'+self.shortOrganism+'.gff3') - for file in fileList: - for rEx in self.reList: - if re.match(rEx, file): - self.files.append(file) - ftp_host.close() - _files = self.files.copy() - - for file in _files: - logger.debug('Checking %s' % file) - if file.endswith('gz'): - _file = file.rpartition(".")[0] - elif '/' in file: - _file = file.rpartition("/")[2] - if os.path.exists(os.path.join(self.path, _file+'.sha256sum')) and \ - os.path.exists(os.path.join(self.path, _file)): - with open(os.path.join(self.path, _file+'.sha256sum')) as f: - s = f.readline().rstrip() - _s = self.hash(os.path.join(self.path, _file)) - if s == _s: - logger.info("%s is present and verified" % file) - self.files.remove(file) + """This class makes sure that requried files (sequences, annotations etc.) + from Mirbase are downloaded. It generates sha256 checksums and autodownloads + the required files + """ + + def __init__(self, release=None, user="anonymous", password=None, + path=os.path.join(os.path.expanduser("~"), '.CLASHchimeras'), + organism='hsa', + ftpDir=None): + self.reList = ['^hairpin\.fa\.gz', '^mature\.fa\.gz'] + organisms = {} + with ftputil.FTPHost('mirbase.org', 'anonymous', 'anonymous') as fH: + tH, tP = tempfile.mkstemp() + fH.download(os.path.join('pub/mirbase', release, 'organisms.txt.gz'), tP) + for i in gzip.open(tP): + ii = i.decode('utf-8').strip() + if not ii.startswith('#'): + organisms[ii.split("\t")[0]] = ii.split("\t")[2] + + self.shortOrganism = organism + self.organism = organisms.get(self.shortOrganism, None) + if not self.organism: + logger.error('{} is not a valid organism name in Mirbase'.format( + self.shortOrganism)) + sys.exit() + self.host = 'mirbase.org' + self.release = release + self.user = user + self.path = path + self.files = [] + + if self.user == "anonymous": + self.password = "anonymous" else: - logger.warn('%s is downloaded but doesnt match the sha256sum' % file) - logger.error('Must run download method') - else: - logger.warn('%s will be downloaded when you run download method' % file) - - - def download(self): - - if len(self.files) == 0: - logger.debug('Mirbase files are downloaded and checksum verified...') - else: - with ftputil.FTPHost(self.host, self.user, self.password) as fH: - for file in self.files: - logger.info('Downloading %s from mirbase.org' % file) - if "/" in file: - _file = file.rpartition("/")[2] - else: - _file = file - fH.download(self.dir+'/'+file, os.path.join(self.path, _file), - callback=None) - if _file.endswith('.gz'): - __file = _file.rpartition(".")[0] - p = subprocess.Popen(['gunzip', os.path.join(self.path, __file)]) - p.communicate() - else: - __file = _file - sha256sum = self.hash(os.path.join(self.path, __file)) - with open(os.path.join(self.path, __file+'.sha256sum'), 'w') as wH: - print(sha256sum, file=wH) - logger.debug('Downloading, extraction and hashing of %s finished' % \ - file) - self.files = [] - - def hash(self, file): - sha256 = hashlib.sha256() - with open(file, 'r+b') as f: - while True: - buf = f.read(8192) - if not buf: - break - sha256.update(buf) - return sha256.hexdigest() - - def process(self): - files = ['hairpin.fa', 'mature.fa'] - for file in files: - temp = [] - if os.path.exists(os.path.join(self.path, self.shortOrganism+'-'+file)) \ - and os.path.exists(os.path.join(self.path, - self.shortOrganism+'-'+file+'.sha256sum')): - with open(os.path.join(self.path, file+'.sha256sum')) as f: - s = f.readline().rstrip() - _s = self.hash(os.path.join(self.path, file)) - if s == _s: - logger.info("%s is present and verified" % file) + self.password = password + + ftp_host = ftputil.FTPHost(self.host, self.user, self.password) + + self.dir = ftpDir + + fileList = ftp_host.listdir(self.dir) + + self.files.append('genomes/' + self.shortOrganism + '.gff3') + for file in fileList: + for rEx in self.reList: + if re.match(rEx, file): + self.files.append(file) + ftp_host.close() + _files = self.files.copy() + + for file in _files: + logger.debug('Checking %s' % file) + if file.endswith('gz'): + _file = file.rpartition(".")[0] + elif '/' in file: + _file = file.rpartition("/")[2] + if os.path.exists(os.path.join(self.path, _file + '.sha256sum')) and \ + os.path.exists(os.path.join(self.path, _file)): + with open(os.path.join(self.path, _file + '.sha256sum')) as f: + s = f.readline().rstrip() + _s = self.hash(os.path.join(self.path, _file)) + if s == _s: + logger.info("%s is present and verified" % file) + self.files.remove(file) + else: + logger.warn('%s is downloaded but doesnt match the sha256sum' % file) + logger.error('Must run download method') + else: + logger.warn('%s will be downloaded when you run download method' % file) + + def download(self): + + if len(self.files) == 0: + logger.debug('Mirbase files are downloaded and checksum verified...') else: - logger.warn('%s is downloaded but doesnt match the sha256sum' % file) - else: - logger.debug('Extrating %s sequences from %s' % (self.organism, file)) - with open(os.path.join(self.path, file)) as iH: - for rec in SeqIO.parse(iH, 'fasta'): - if self.organism in rec.description: - _temp_rec = SeqRecord(id=rec.id, description=rec.description, - seq=rec.seq.back_transcribe()) - temp.append(_temp_rec) - SeqIO.write(temp, os.path.join(self.path, self.shortOrganism+'-'+file), - 'fasta') - with open(os.path.join(self.path, - self.shortOrganism+'-'+file+'.sha256sum'), 'w') as wH: - print(self.hash(os.path.join(self.path, - self.shortOrganism+'-'+file+'.sha256sum')), file=wH) + with ftputil.FTPHost(self.host, self.user, self.password) as fH: + for file in self.files: + logger.info('Downloading %s from mirbase.org' % file) + if "/" in file: + _file = file.rpartition("/")[2] + else: + _file = file + fH.download(self.dir + '/' + file, os.path.join(self.path, _file), + callback=None) + if _file.endswith('.gz'): + __file = _file.rpartition(".")[0] + p = subprocess.Popen(['gunzip', os.path.join(self.path, __file)]) + p.communicate() + else: + __file = _file + sha256sum = self.hash(os.path.join(self.path, __file)) + with open(os.path.join(self.path, __file + '.sha256sum'), 'w') as wH: + print(sha256sum, file=wH) + logger.debug('Downloading, extraction and hashing of %s finished' % + file) + self.files = [] + + def hash(self, file): + sha256 = hashlib.sha256() + with open(file, 'r+b') as f: + while True: + buf = f.read(8192) + if not buf: + break + sha256.update(buf) + return sha256.hexdigest() + + def process(self): + files = ['hairpin.fa', 'mature.fa'] + for file in files: + temp = [] + if os.path.exists(os.path.join(self.path, self.shortOrganism + '-' + file)) \ + and os.path.exists(os.path.join(self.path, + self.shortOrganism + '-' + file + '.sha256sum')): + with open(os.path.join(self.path, file + '.sha256sum')) as f: + s = f.readline().rstrip() + _s = self.hash(os.path.join(self.path, file)) + if s == _s: + logger.info("%s is present and verified" % file) + else: + logger.warn('%s is downloaded but doesnt match the sha256sum' % file) + else: + logger.debug('Extrating %s sequences from %s' % (self.organism, file)) + with open(os.path.join(self.path, file)) as iH: + for rec in SeqIO.parse(iH, 'fasta'): + if self.organism in rec.description: + _temp_rec = SeqRecord(id=rec.id, description=rec.description, + seq=rec.seq.back_transcribe()) + temp.append(_temp_rec) + SeqIO.write(temp, os.path.join(self.path, self.shortOrganism + '-' + file), + 'fasta') + with open(os.path.join(self.path, + self.shortOrganism + '-' + file + '.sha256sum'), 'w') as wH: + print(self.hash(os.path.join(self.path, + self.shortOrganism + '-' + file + '.sha256sum')), file=wH) + class Index: - def __init__(self, root=os.path.join(os.path.expanduser("~"), - '.CLASHchimeras'), bowtieExecutable=None, tophatExecutable=None): - - self.created = [] - - if bowtieExecutable is not None: - logger.info('Bowtie2 path {} provided and will be used'\ - .format(bowtieExecutable)) - self.bowtieExecutable = bowtieExecutable - else: - try: - bowtieVersion = float(subprocess.check_output(['bowtie2', '--version'] - ).decode('utf-8').split("\n")[0].split()[-1].rpartition(".")[0]) - fullBowtieVersion = subprocess.check_output(['bowtie2', '--version'] - ).decode('utf-8').split("\n")[0].split()[-1] - if bowtieVersion < 2.2: - logger.warn("Please update bowtie2, you can download it from") - logger.warn("http://sourceforge.net/projects/bowtie-bio/files/bowtie2/") + def __init__(self, root=os.path.join(os.path.expanduser("~"), + '.CLASHchimeras'), bowtieExecutable=None, tophatExecutable=None): + + self.created = [] + + if bowtieExecutable is not None: + logger.info('Bowtie2 path {} provided and will be used' + .format(bowtieExecutable)) + self.bowtieExecutable = bowtieExecutable else: - logger.info("Bowtie2 version {} found and will be used"\ - .format(fullBowtieVersion)) - self.bowtieExecutable = 'bowtie2' - except OSError as e: - if e.errno == os.errno.ENOENT: - logger.error(textwrap.fill(""" + try: + bowtieVersion = float(subprocess.check_output(['bowtie2', '--version'] + ).decode('utf-8').split("\n")[0].split()[-1].rpartition(".")[0]) + fullBowtieVersion = subprocess.check_output(['bowtie2', '--version'] + ).decode('utf-8').split("\n")[0].split()[-1] + if bowtieVersion < 2.2: + logger.warn("Please update bowtie2, you can download it from") + logger.warn("http://sourceforge.net/projects/bowtie-bio/files/bowtie2/") + else: + logger.info("Bowtie2 version {} found and will be used" + .format(fullBowtieVersion)) + self.bowtieExecutable = 'bowtie2' + except OSError as e: + if e.errno == os.errno.ENOENT: + logger.error(textwrap.fill(""" Can't find Bowtie2 in your path. Please install it from http://sourceforge.net/projects/bowtie-bio/files/bowtie2/ """)) + else: + logger.error('Something wrong with your bowtie2 command') + + if tophatExecutable is not None: + logger.info('Tophat2 path {} provided and will be used' + .format(tophatExecutable)) + self.tophatExecutable = tophatExecutable else: - logger.error('Something wrong with your bowtie2 command') - - if tophatExecutable is not None: - logger.info('Tophat2 path {} provided and will be used'\ - .format(tophatExecutable)) - self.tophatExecutable = tophatExecutable - else: - try: - tophatVersion = float(subprocess.check_output(['tophat', - '--version']).decode('utf-8').split(" v")[-1].rpartition(".")[0]) - fullTophatVersion = subprocess.check_output(['tophat', - '--version']).decode('utf-8').split(" v")[-1].rstrip() - if tophatVersion < 2: - logger.warn("Please update tophat, you can download it from") - logger.warn("http://ccb.jhu.edu/software/tophat/index.shtml") - else: - logger.info("Tophat version {} found and will be used"\ - .format(fullTophatVersion)) - self.tophatExecutable = 'tophat' - except OSError as e: - if e.errno == os.errno.ENOENT: - logger.error("Can't find Tophat in your path. Please install it from") - logger.error("http://ccb.jhu.edu/software/tophat/index.shtml") - else: - logger.error('Something wrong with your tophat command') - - self.root = root - - def create(self): - - stdoutHandle = open(os.path.join(self.root, 'CLASHChimeras-Index.stdout'), - 'w') - stderrHandle = open(os.path.join(self.root, 'CLASHChimeras-Index.stderr'), - 'w') - indexCreated = False - for fa in glob.glob(os.path.join(self.root, '*.fa')): - indexName = fa.rpartition(".fa")[0] - self.created.append('{} - {}'.format(indexName, green('Bowtie2', - style='bold'))) - if 'genome.fa' in fa: - genomeIndex = fa.rpartition(".")[0] - if len(glob.glob(os.path.join(self.root, indexName+'*.bt2'))) == 6: - logger.info('Bowtie2 index for {} found'.format(fa)) - else: - try: - logger.info('Generating bowtie2 index for {}'.format(fa)) - indexCreated = True - p = subprocess.Popen([self.bowtieExecutable+'-build', fa, indexName], - stdout=stdoutHandle, stderr=stderrHandle) - p.communicate() - logger.info('Bowtie2 index for {} is generated'.format(fa)) - except OSError as e: - if e.errno == os.errno.ENOENT: - logger.error("Can't find bowtie2-build in your path...") - logger.error("Please make sure bowtie2 is in your path and") - logger.error("bowtie2-build can run from your shell") - else: - logger.error("Something wrong with your bowtie2-build command") - - if indexCreated: - logger.warn('The stdout and stderr for bowtie2-build are stored in') - logger.warn('CLASHChimeras-Index.stdout and CLASHChimeras-Index.stderr') - - for gtf in glob.glob(os.path.join(self.root, '*.annotation.gtf')): - indexName = gtf.rpartition("/")[-1].rpartition(".")[0] - self.created.append('{} - {}'.format(gtf.rpartition(".")[0], green( - 'Tophat2 Transcriptome', style='bold'))) - if len(glob.glob(os.path.join(self.root, indexName+'*.bt2'))) == 6: - logger.info('Tophat2 transcriptome index found for {}'.format(gtf)) - else: - try: - indexCreated = True - logger.info('Generating tophat transcriptome index for {}'\ - .format(gtf)) - p = subprocess.Popen([self.tophatExecutable, '-G', gtf, - '--transcriptome-index', os.path.join(self.root, indexName), - genomeIndex], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - p.communicate() - logger.info('Tophat2 transcriptome index generated for {}'\ - .format(gtf)) - except OSError as e: - if e.errno == os.errno.ENOENT: - logger.error("Can't find tophat in your path...") - logger.error("Please make sure tophat is in your path") - else: - logger.error("Something wrong with your tophat command") - - stdoutHandle.close() - stderrHandle.close() + try: + tophatVersion = float(subprocess.check_output(['tophat', + '--version']).decode('utf-8').split(" v")[-1].rpartition(".")[0]) + fullTophatVersion = subprocess.check_output(['tophat', + '--version']).decode('utf-8').split(" v")[-1].rstrip() + if tophatVersion < 2: + logger.warn("Please update tophat, you can download it from") + logger.warn("http://ccb.jhu.edu/software/tophat/index.shtml") + else: + logger.info("Tophat version {} found and will be used" + .format(fullTophatVersion)) + self.tophatExecutable = 'tophat' + except OSError as e: + if e.errno == os.errno.ENOENT: + logger.error("Can't find Tophat in your path. Please install it from") + logger.error("http://ccb.jhu.edu/software/tophat/index.shtml") + else: + logger.error('Something wrong with your tophat command') + + self.root = root + + def create(self): + + stdoutHandle = open(os.path.join(self.root, 'CLASHChimeras-Index.stdout'), + 'w') + stderrHandle = open(os.path.join(self.root, 'CLASHChimeras-Index.stderr'), + 'w') + indexCreated = False + for fa in glob.glob(os.path.join(self.root, '*.fa')): + indexName = fa.rpartition(".fa")[0] + self.created.append('{} - {}'.format(indexName, green('Bowtie2', + style='bold'))) + if 'genome.fa' in fa: + genomeIndex = fa.rpartition(".")[0] + if len(glob.glob(os.path.join(self.root, indexName + '*.bt2'))) == 6: + logger.info('Bowtie2 index for {} found'.format(fa)) + else: + try: + logger.info('Generating bowtie2 index for {}'.format(fa)) + indexCreated = True + p = subprocess.Popen([self.bowtieExecutable + '-build', fa, indexName], + stdout=stdoutHandle, stderr=stderrHandle) + p.communicate() + logger.info('Bowtie2 index for {} is generated'.format(fa)) + except OSError as e: + if e.errno == os.errno.ENOENT: + logger.error("Can't find bowtie2-build in your path...") + logger.error("Please make sure bowtie2 is in your path and") + logger.error("bowtie2-build can run from your shell") + else: + logger.error("Something wrong with your bowtie2-build command") + + if indexCreated: + logger.warn('The stdout and stderr for bowtie2-build are stored in') + logger.warn('CLASHChimeras-Index.stdout and CLASHChimeras-Index.stderr') + + for gtf in glob.glob(os.path.join(self.root, '*.annotation.gtf')): + indexName = gtf.rpartition("/")[-1].rpartition(".")[0] + self.created.append('{} - {}'.format(gtf.rpartition(".")[0], green( + 'Tophat2 Transcriptome', style='bold'))) + if len(glob.glob(os.path.join(self.root, indexName + '*.bt2'))) == 6: + logger.info('Tophat2 transcriptome index found for {}'.format(gtf)) + else: + try: + indexCreated = True + logger.info('Generating tophat transcriptome index for {}' + .format(gtf)) + p = subprocess.Popen([self.tophatExecutable, '-G', gtf, + '--transcriptome-index', os.path.join( + self.root, indexName), + genomeIndex], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + p.communicate() + logger.info('Tophat2 transcriptome index generated for {}' + .format(gtf)) + except OSError as e: + if e.errno == os.errno.ENOENT: + logger.error("Can't find tophat in your path...") + logger.error("Please make sure tophat is in your path") + else: + logger.error("Something wrong with your tophat command") + + stdoutHandle.close() + stderrHandle.close() diff --git a/clashchimeras/log.py b/clashchimeras/log.py index e4e24d7..2a6b0ed 100644 --- a/clashchimeras/log.py +++ b/clashchimeras/log.py @@ -1,26 +1,30 @@ import logging import coloredlogs + def debug_logger(name): - logger = logging.getLogger(name) - coloredlogs.install(level=logging.DEBUG, show_name=False, - show_hostname=True) - return logger + logger = logging.getLogger(name) + coloredlogs.install(level=logging.DEBUG, show_name=False, + show_hostname=True) + return logger + def info_logger(name): - logger = logging.getLogger(name) - coloredlogs.install(level=logging.INFO, show_name=False, - show_hostname=True) - return logger + logger = logging.getLogger(name) + coloredlogs.install(level=logging.INFO, show_name=False, + show_hostname=True) + return logger + def warning_logger(name): - logger = logging.getLogger(name) - coloredlogs.install(level=logging.WARNING, show_name=False, - show_hostname=True) - return logger + logger = logging.getLogger(name) + coloredlogs.install(level=logging.WARNING, show_name=False, + show_hostname=True) + return logger + def error_logger(name): - logger = logging.getLogger(name) - coloredlogs.install(level=logging.ERROR, show_name=False, - show_hostname=True) - return logger + logger = logging.getLogger(name) + coloredlogs.install(level=logging.ERROR, show_name=False, + show_hostname=True) + return logger diff --git a/clashchimeras/methods.py b/clashchimeras/methods.py index 78eb739..c52d45e 100644 --- a/clashchimeras/methods.py +++ b/clashchimeras/methods.py @@ -2,70 +2,84 @@ from itertools import repeat + def convertCigar(cigarString): - cigarAttributes = re.findall("\D", cigarString) - cigarNumbers = re.findall("\d+", cigarString) - convertedCigarString = "" - for index, attribute in enumerate(cigarAttributes): - for i in repeat(attribute, int(cigarNumbers[index])): - convertedCigarString += i - return convertedCigarString + cigarAttributes = re.findall("\D", cigarString) + cigarNumbers = re.findall("\d+", cigarString) + convertedCigarString = "" + for index, attribute in enumerate(cigarAttributes): + for i in repeat(attribute, int(cigarNumbers[index])): + convertedCigarString += i + return convertedCigarString + def chimeraOrNot(bitOne, bitTwo, overlap=4, gap=9): - combinedCigar = "" - for i, j in zip(bitOne, bitTwo): - if i == "M" and j == "M": combinedCigar += "=" - elif i == "S" and j == "S": combinedCigar += "#" - elif i == "I" and j == "D": combinedCigar += "-" - elif i == "D" and j == "I": combinedCigar += "-" - elif i == "M" and j != "M": combinedCigar += "{" - elif i != "M" and j == "M": combinedCigar += "}" - elif i != "M" and j == "D": combinedCigar += "-" - elif i == "D" and j != "M": combinedCigar += "-" - elif i == "I" and j != "M": combinedCigar += "+" - elif i != "M" and j == "I": combinedCigar += "+" - else: combinedCigar += "*" - numberOfMs = len(list(filter(lambda x: x == "=", combinedCigar))) - curlyStartStart = combinedCigar.find("{") - curlyStartEnd = combinedCigar.rfind("{") - curlyEndStart = combinedCigar.find("}") - curlyEndEnd = combinedCigar.rfind("}") - listCurlies = sorted([[curlyStartStart, curlyStartEnd], [curlyEndStart, - curlyEndEnd]]) - matches = combinedCigar.count("{") + combinedCigar.count("}") + \ - combinedCigar.count("=") + combinedCigar.count("-") + \ - combinedCigar.count("+") - if numberOfMs <= overlap and abs(listCurlies[0][1] - listCurlies[1][0]) \ - <= gap and matches >= (0.75 * len(combinedCigar)): - return combinedCigar - else: - return None + combinedCigar = "" + for i, j in zip(bitOne, bitTwo): + if i == "M" and j == "M": + combinedCigar += "=" + elif i == "S" and j == "S": + combinedCigar += "#" + elif i == "I" and j == "D": + combinedCigar += "-" + elif i == "D" and j == "I": + combinedCigar += "-" + elif i == "M" and j != "M": + combinedCigar += "{" + elif i != "M" and j == "M": + combinedCigar += "}" + elif i != "M" and j == "D": + combinedCigar += "-" + elif i == "D" and j != "M": + combinedCigar += "-" + elif i == "I" and j != "M": + combinedCigar += "+" + elif i != "M" and j == "I": + combinedCigar += "+" + else: + combinedCigar += "*" + numberOfMs = len(list(filter(lambda x: x == "=", combinedCigar))) + curlyStartStart = combinedCigar.find("{") + curlyStartEnd = combinedCigar.rfind("{") + curlyEndStart = combinedCigar.find("}") + curlyEndEnd = combinedCigar.rfind("}") + listCurlies = sorted([[curlyStartStart, curlyStartEnd], [curlyEndStart, + curlyEndEnd]]) + matches = combinedCigar.count("{") + combinedCigar.count("}") + \ + combinedCigar.count("=") + combinedCigar.count("-") + \ + combinedCigar.count("+") + if numberOfMs <= overlap and abs(listCurlies[0][1] - listCurlies[1][0]) \ + <= gap and matches >= (0.75 * len(combinedCigar)): + return combinedCigar + else: + return None + def findRegion(record_data): - if 'ENS' in record_data.refId and '|' in record_data.refId: - reference_id = record_data.refId - reference_start = int(record_data.start) - match_length = record_data.matchLength - regions = reference_id.split("|")[7:-1] - match_start = reference_start - match_end = reference_start + match_length - mixed = [] + if 'ENS' in record_data.refId and '|' in record_data.refId: + reference_id = record_data.refId + reference_start = int(record_data.start) + match_length = record_data.matchLength + regions = reference_id.split("|")[7:-1] + match_start = reference_start + match_end = reference_start + match_length + mixed = [] - for region in regions: - region_name = region.split(":")[0] - region_start = int(region.split(":")[-1].split("-")[0]) - region_end = int(region.split(":")[-1].split("-")[-1]) - if match_start >= region_start and match_end <= region_end: - mixed.append(region_name) - break - if match_start >= region_start and match_start <= region_end: - mixed.append(region_name) - if match_end >= region_start and match_end <= region_end: - mixed.append(region_name) - if match_start <= region_start and match_end >= region_end: - mixed.append(region_name) - return '{}|{}|{}'.format(reference_id.split("|")[0], - reference_id.split("|")[5], - "-".join(mixed)) - else: - return record_data.refId + for region in regions: + region_name = region.split(":")[0] + region_start = int(region.split(":")[-1].split("-")[0]) + region_end = int(region.split(":")[-1].split("-")[-1]) + if match_start >= region_start and match_end <= region_end: + mixed.append(region_name) + break + if match_start >= region_start and match_start <= region_end: + mixed.append(region_name) + if match_end >= region_start and match_end <= region_end: + mixed.append(region_name) + if match_start <= region_start and match_end >= region_end: + mixed.append(region_name) + return '{}|{}|{}'.format(reference_id.split("|")[0], + reference_id.split("|")[5], + "-".join(mixed)) + else: + return record_data.refId diff --git a/clashchimeras/parsers.py b/clashchimeras/parsers.py index 21eff2e..989c662 100644 --- a/clashchimeras/parsers.py +++ b/clashchimeras/parsers.py @@ -19,779 +19,785 @@ logger = logging.getLogger('root') -class GFF: - """GFF file parser for mirbase gff3 file - - This class uses memory-mapped file object to read a mirbase gff3 file. It - contains methods to read, process a gff3 file and return genomic coordinates - - Attributes: - fileName: A mirbase gff3 file path - """ - - def __init__(self, fileName=None): - self.features = {} - self.fileName = fileName - def read(self, featureType='miRNA_primary_transcript'): - """Reads gff3 file provided during class initialization +class GFF: + """GFF file parser for mirbase gff3 file - Stores the byte positions of every feature in a dict object named - self.features + This class uses memory-mapped file object to read a mirbase gff3 file. It + contains methods to read, process a gff3 file and return genomic coordinates - Keyword Args: - featureType: Feature type of a gff3 record, the third element of every - record in the file. Please change this if you want to store mature - form of microRNA, by default it uses primary transcript - (default 'miRNA_primary_transcript') + Attributes: + fileName: A mirbase gff3 file path """ - logger.info('Reading %s' % self.fileName) - self.fileHandle = open(self.fileName, 'r+b') - bytePosition = self.fileHandle.tell() - for line in self.fileHandle: - row = line.decode('utf-8').rstrip().split("\t") - if not row[0].startswith("#") and row[2] == featureType: + + def __init__(self, fileName=None): + self.features = {} + self.fileName = fileName + + def read(self, featureType='miRNA_primary_transcript'): + """Reads gff3 file provided during class initialization + + Stores the byte positions of every feature in a dict object named + self.features + + Keyword Args: + featureType: Feature type of a gff3 record, the third element of every + record in the file. Please change this if you want to store mature + form of microRNA, by default it uses primary transcript + (default 'miRNA_primary_transcript') + """ + logger.info('Reading %s' % self.fileName) + self.fileHandle = open(self.fileName, 'r+b') + bytePosition = self.fileHandle.tell() + for line in self.fileHandle: + row = line.decode('utf-8').rstrip().split("\t") + if not row[0].startswith("#") and row[2] == featureType: + attributes = row[-1].split(";") + for attribute in attributes: + if attribute.startswith('Name'): + mirbase_name = attribute.split("=")[-1] + self.features[mirbase_name] = bytePosition + bytePosition = self.fileHandle.tell() + self.fileHandle.close() + logger.debug('Reading %s finished' % self.fileName) + + def process(self, name): + """A method to return a Record object providing genomic information + + Args: + name: A valid miRNA_primary_transcript name + + Returns: + An object Record containing scaffold, start, end, strand, mirbase_id and + mirbase_name as its variables for access + """ + self.fileHandle = open(self.fileName, 'r+b') + self.mm = mmap.mmap(self.fileHandle.fileno(), 0) + self.mm.seek(self.features[name]) + row = self.mm.readline().decode('utf-8').rstrip().split("\t") attributes = row[-1].split(";") for attribute in attributes: - if attribute.startswith('Name'): - mirbase_name = attribute.split("=")[-1] - self.features[mirbase_name] = bytePosition - bytePosition = self.fileHandle.tell() - self.fileHandle.close() - logger.debug('Reading %s finished' % self.fileName) - - def process(self, name): - """A method to return a Record object providing genomic information - - Args: - name: A valid miRNA_primary_transcript name - - Returns: - An object Record containing scaffold, start, end, strand, mirbase_id and - mirbase_name as its variables for access - """ - self.fileHandle = open(self.fileName, 'r+b') - self.mm = mmap.mmap(self.fileHandle.fileno(), 0) - self.mm.seek(self.features[name]) - row = self.mm.readline().decode('utf-8').rstrip().split("\t") - attributes = row[-1].split(";") - for attribute in attributes: - if attribute.startswith("ID"): - _id = attribute.split("=")[-1] - elif attribute.startswith("Name"): - _name = attribute.split("=")[-1] - record = Record(scaffold=row[0], start=int(row[3]), end=int(row[4]), - strand=row[6], mirbase_id=_id, mirbase_name=_name) - self.fileHandle.close() - return record - - - def coordinates(self, name, start=None, end=None): - """A method to return a bed record containing genomic coordinates for the - aligned segment - - Keyword Args: - start: The alignment start position of the cDNA molecule or the relative - start of the particular molecule - end: The alignment end position in the cDNA molecule or the relative end - of the particular molecule - - Args: - name: A valid miRNA_primary_transcript name - - Returns: - A tuple of strings containing elements for a bed record - """ - record = self.process(name) - if not start and not end: - start = 1 - end = record.end - record.start + 1 - positions = {} - match_positions = [] - - if record.strand == '+': - _start = 1 - for relative, actual in enumerate(range(record.start-1, record.end), \ - start=_start): - positions[relative] = actual - for pos in range(start, end+1): - match_positions.append(positions[pos]) - return [(record.scaffold, min(match_positions), max(match_positions)+1, - record.mirbase_name, 0, record.strand)] - - elif record.strand == '-': - _start = 1 - for relative, actual in enumerate(reversed(range(record.start-1, \ - record.end)), start=_start): - positions[relative] = actual - for pos in range(start, end+1): - match_positions.append(positions[pos]) - return [(record.scaffold, min(match_positions), max(match_positions)+1, - record.mirbase_name, 0, record.strand)] - + if attribute.startswith("ID"): + _id = attribute.split("=")[-1] + elif attribute.startswith("Name"): + _name = attribute.split("=")[-1] + record = Record(scaffold=row[0], start=int(row[3]), end=int(row[4]), + strand=row[6], mirbase_id=_id, mirbase_name=_name) + self.fileHandle.close() + return record + + def coordinates(self, name, start=None, end=None): + """A method to return a bed record containing genomic coordinates for the + aligned segment + + Keyword Args: + start: The alignment start position of the cDNA molecule or the relative + start of the particular molecule + end: The alignment end position in the cDNA molecule or the relative end + of the particular molecule + + Args: + name: A valid miRNA_primary_transcript name + + Returns: + A tuple of strings containing elements for a bed record + """ + record = self.process(name) + if not start and not end: + start = 1 + end = record.end - record.start + 1 + positions = {} + match_positions = [] + + if record.strand == '+': + _start = 1 + for relative, actual in enumerate(range(record.start - 1, record.end), + start=_start): + positions[relative] = actual + for pos in range(start, end + 1): + match_positions.append(positions[pos]) + return [(record.scaffold, min(match_positions), max(match_positions) + 1, + record.mirbase_name, 0, record.strand)] + + elif record.strand == '-': + _start = 1 + for relative, actual in enumerate(reversed(range(record.start - 1, + record.end)), start=_start): + positions[relative] = actual + for pos in range(start, end + 1): + match_positions.append(positions[pos]) + return [(record.scaffold, min(match_positions), max(match_positions) + 1, + record.mirbase_name, 0, record.strand)] class GTF: - """GTF file parser for gencode gtf file - - This class uses memory-mapped file object to read a gencode gtf file. It - contains methods to read, process a gtf file and return genomic coordinates + """GTF file parser for gencode gtf file - Attributes: - fileName: A gencode gtf file path - """ + This class uses memory-mapped file object to read a gencode gtf file. It + contains methods to read, process a gtf file and return genomic coordinates - def __init__(self, fileName=None): - self.features = defaultdict(list) - self.biotypeFeatures = defaultdict(list) - self.geneFeatures = defaultdict(list) - self.fileName = fileName - self.geneIds = {} - - def readBiotype(self, featureType='exon', biotype=None): - logger.info('Reading %s' % self.fileName) - self.fileHandle = open(self.fileName, 'r+b') - for line in self.fileHandle: - row = line.decode('utf-8').rstrip().split("\t") + Attributes: + fileName: A gencode gtf file path + """ - if not row[0].startswith("#") and row[2] == featureType: - attributes = row[-1].split("; ") + def __init__(self, fileName=None): + self.features = defaultdict(list) + self.biotypeFeatures = defaultdict(list) + self.geneFeatures = defaultdict(list) + self.fileName = fileName + self.geneIds = {} + + def readBiotype(self, featureType='exon', biotype=None): + logger.info('Reading %s' % self.fileName) + self.fileHandle = open(self.fileName, 'r+b') + for line in self.fileHandle: + row = line.decode('utf-8').rstrip().split("\t") + + if not row[0].startswith("#") and row[2] == featureType: + attributes = row[-1].split("; ") + + havana_transcript = '-' + havana_gene = '-' + exon_number = '0' + + for attribute in attributes: + + if attribute.startswith("transcript_id"): + transcript_id = attribute.split(" ")[-1][1:-1] + + elif attribute.startswith("transcript_type"): + transcript_type = attribute.split(" ")[-1][1:-1] + + elif attribute.startswith("exon_number"): + exon_number = int(attribute.split(" ")[-1]) + + elif attribute.startswith("havana_gene"): + havana_gene = attribute.split(" ")[-1][1:-1] + + elif attribute.startswith("havana_transcript"): + havana_transcript = attribute.split(" ")[-1][1:-2] + + elif attribute.startswith("gene_id"): + gene_id = attribute.split(" ")[-1][1:-1] + + elif attribute.startswith("gene_name"): + gene_name = attribute.split(" ")[-1][1:-1] + + elif attribute.startswith("transcript_name"): + transcript_name = attribute.split(" ")[-1][1:-1] + + if biotype == 'tRNA': + if transcript_type == "tRNAscan": + self.biotypeFeatures[transcript_id].append((exon_number, row[0], + int(row[3]), int(row[4]), + row[6], gene_id, + havana_gene, + havana_transcript, + transcript_name, + gene_name)) + else: + if transcript_type == biotype: + self.biotypeFeatures[transcript_id].append((exon_number, row[0], + int(row[3]), int(row[4]), + row[6], gene_id, + havana_gene, + havana_transcript, + transcript_name, + gene_name)) + + self.fileHandle.close() + + def read(self, featureType='exon'): + """Reads gtf file provided during class initialization + + Stores the byte positions of every feature in a defaultdict(list) object + named self.features + + Keyword Args: + featureType: Feature type of a gtf record, the third element of every + record in the file. Please change this if you want to get specific + records (e.g. 'UTR') (default 'exon') + """ + + logger.info('Reading %s' % self.fileName) + self.fileHandle = open(self.fileName, 'r+b') + bytePosition = self.fileHandle.tell() + for line in self.fileHandle: + row = line.decode('utf-8').rstrip().split("\t") + if not row[0].startswith("#") and row[2] == featureType: + attributes = row[-1].split("; ") + for attribute in attributes: + if attribute.startswith("transcript_id"): + transcript_id = attribute.split(" ")[-1][1:-1] + self.features[transcript_id].append(bytePosition) + self.geneIds[transcript_id] = gene_id + if attribute.startswith("gene_id"): + gene_id = attribute.split(" ")[-1][1:-1] + bytePosition = self.fileHandle.tell() + self.fileHandle.close() + + self.fileHandle = open(self.fileName, 'r+b') + bytePosition = self.fileHandle.tell() + for line in self.fileHandle: + row = line.decode('utf-8').rstrip().split("\t") + if not row[0].startswith("#") and row[2] == featureType: + attributes = row[-1].split("; ") + for attribute in attributes: + if attribute.startswith("gene_id"): + gene_id = attribute.split(" ")[-1][1:-1] + self.geneFeatures[gene_id].append(bytePosition) + bytePosition = self.fileHandle.tell() + self.fileHandle.close() + + logger.debug('Reading %s finished' % self.fileName) + + def process(self, name): + """A method to return a Record object providing genomic information + + Args: + name: A valid gencode transcript_id + + Returns: + An object Record containing scaffold, start, end, strand, mirbase_id and + mirbase_name as its variables for access + """ + self.fileHandle = open(self.fileName, 'r+b') + self.mm = mmap.mmap(self.fileHandle.fileno(), 0) + positions = self.features[name] + for position in positions: + self.mm.seek(position) + row = self.mm.readline().decode('utf-8').rstrip().split("\t") + attributes = row[-1].split("; ") + _eid = '-' + _enb = '0' + for attribute in attributes: + if attribute.startswith("transcript_type"): + _tt = attribute.split(" ")[-1][1:-1] + elif attribute.startswith("transcript_id"): + _tid = attribute.split(" ")[-1][1:-1] + elif attribute.startswith("exon_id"): + _eid = attribute.split(" ")[-1][1:-1] + elif attribute.startswith("exon_number"): + _enb = int(attribute.split(" ")[-1]) + elif attribute.startswith("gene_name"): + _gn = attribute.split(" ")[-1][1:-1] + record = Record(scaffold=row[0], start=int(row[3]), end=int(row[4]), + strand=row[6], transcript_type=_tt, transcript_id=_tid, exon_id=_eid, + exon_number=_enb, gene_name=_gn) + yield record + self.fileHandle.close() + + def geneExonicRegions(self, df): + """Given a DataFrame with the exon coordinates from Gencode for a single + gene, return the total number of coding regions in that gene. + """ + scaffold = df.iloc[0].scaffold + strand = df.iloc[0].strand + gene_type = df.iloc[0].gene_type + gene_id = df.iloc[0].gene_id + gene_name = df.iloc[0].gene_name + start = df.start.min() + end = df.end.max() + bp = [False] * (end - start + 1) + for i in range(df.shape[0]): + s = df.iloc[i]['start'] - start + e = df.iloc[i]['end'] - start + 1 + bp[s:e] = [True] * (e - s) + regions = list(range(start, end + 1)) + groups = [] + + for i, j in groupby(bp): + groups.append((i, len(list(j)))) + e_start = 0 + + for i in groups: + e_end = e_start + i[1] + if i[0]: + record = Record(scaffold=scaffold, start=regions[e_start], + end=regions[e_end - 1], gene_type=gene_type, gene_id=gene_id, + gene_name=gene_name, strand=strand) + yield record + e_start += i[1] + + def geneProcess(self, name): + """A method to return a Record object providing genomic information + + Args: + name: A valid gencode gene_id + + Returns: + An object Record containing scaffold, start, end, strand, mirbase_id and + mirbase_name as its variables for access + """ + self.fileHandle = open(self.fileName, 'r+b') + self.mm = mmap.mmap(self.fileHandle.fileno(), 0) + positions = self.geneFeatures[name] + exons = [] + for position in positions: + self.mm.seek(position) + row = self.mm.readline().decode('utf-8').rstrip().split("\t") + attributes = row[-1].split("; ") + for attribute in attributes: + if attribute.startswith("gene_type"): + _gt = attribute.split(" ")[-1][1:-1] + elif attribute.startswith("gene_id"): + _gid = attribute.split(" ")[-1][1:-1] + elif attribute.startswith("gene_name"): + _gn = attribute.split(" ")[-1][1:-1] + exons.append((row[0], int(row[3]), int(row[4]), row[6], _gt, _gid, _gn)) + self.fileHandle.close() + exons_df = pd.DataFrame(exons, columns=['scaffold', 'start', 'end', + 'strand', 'gene_type', 'gene_id', 'gene_name']) + + for record in self.geneExonicRegions(exons_df): + yield record + + def coordinates(self, name, start=None, end=None): + """A generator to return a bed record containing genomic coordinates for the + aligned segment + + Keyword Args: + start: The alignment start position of the cDNA molecule or the relative + start of the particular molecule + end: The alignment end position in the cDNA molecule or the relative end + of the particular molecule + + Args: + name: A valid miRNA_primary_transcript name + + Returns: + A list of tuple(s) of strings containing elements for a bed record. There + may be more than one because of alternate splicing. + """ + if "|" in name: + self.name = name.split("|")[0] + else: + self.name = name + positions = {} + match_positions = [] + records = [] + segments = [] + result_segments = [] + for record in self.process(self.name): + records.append(record) + records.sort(key=lambda x: int(x.exon_number)) + + if records[0].strand == '+': + _start = 1 + for record in records: + for relative, actual in enumerate(range(record.start, record.end + 1), + start=_start): + positions[relative] = actual + _start = relative + 1 + for pos in range(start, end): + match_positions.append(positions[pos]) + for key, group in groupby(enumerate(match_positions), + lambda x: x[0] - x[-1]): + segment = list(map(itemgetter(1), group)) + segments.append([segment[0], segment[-1]]) + for segment in segments: + for record in records: + if segment[0] >= record.start and segment[1] <= record.end: + result_segments.append((record.scaffold, segment[0], segment[1], + record.transcript_id + '|' + record.gene_name, 0, record.strand)) + + elif records[0].strand == '-': + _start = 1 + for record in records: + for relative, actual in enumerate(reversed(range(record.start, + record.end + 1)), start=_start): + positions[relative] = actual + _start = relative + 1 + for pos in range(start, end): + match_positions.append(positions[pos]) + for key, group in groupby(enumerate(reversed(match_positions)), + lambda x: x[0] - x[-1]): + segment = list(map(itemgetter(1), group)) + segments.append([segment[0], segment[-1]]) + for segment in segments: + for record in records: + if segment[0] >= record.start and segment[1] <= record.end: + result_segments.append((record.scaffold, segment[0], segment[1], + record.transcript_id + '|' + record.gene_name, 0, record.strand)) + + if len(result_segments) == 0: + logger.debug('%s, %s, %s' % (name, start, end)) + logger.debug('%s' % str(segments)) + for r in records: + logger.debug('%s %s %s %s' % (r.scaffold, r.strand, + r.start, r.end)) + + return result_segments - havana_transcript = '-' - havana_gene = '-' - exon_number = '0' - for attribute in attributes: +class SAM: + """SAM file parser for parsing bowtie2 generated files - if attribute.startswith("transcript_id"): - transcript_id = attribute.split(" ")[-1][1:-1] + This class uses memory-mapped file object to read a sam file - elif attribute.startswith("transcript_type"): - transcript_type = attribute.split(" ")[-1][1:-1] + Attributes: + fileName: A sam file path + """ - elif attribute.startswith("exon_number"): - exon_number = int(attribute.split(" ")[-1]) + def __init__(self, fileName=None): + self.fileName = fileName + self.records = {} + + def read(self, flag=0): + """Reads sam file provided during class initialization + + Stores the byte position of every record based on the keyword arg flag + provided, to a dict object named self.records + + Keyword Args: + flag: The SAM alignment flag for a record. For default, it uses the + primary alignment for every record and ignores secondary alignments + (default 0) + """ + logger.info('Reading %s' % self.fileName) + self.fileHandle = open(self.fileName, 'r+b') + bytePosition = self.fileHandle.tell() + for line in self.fileHandle: + read = line.decode('utf-8').split("\t") + if not read[0].startswith("@") and read[1] == str(flag): + self.records[read[0]] = bytePosition + bytePosition = self.fileHandle.tell() + self.fileHandle.close() + logger.debug('Reading %s finished' % self.fileName) + + def access(self, queryName): + """Provides random access of a record from the sam file + + Args: + queryName: The query name of the read from the sam file + + Returns: + A list generated after splitting the record line from sam file + """ + self.fileHandle = open(self.fileName, 'r+b') + self.mm = mmap.mmap(self.fileHandle.fileno(), 0) + self.mm.seek(self.records[queryName]) + row = self.mm.readline().decode('utf-8').rstrip().split("\t") + self.fileHandle.close() + + return self.pretty(row) + + def filterPotentialChimeras(self, min_length=30, flag=0, target=None): + """Generated a filtered fasta file from a sam file + + This filtered fasta file contains reads that can be potentially chimeras. + The criteria for filtering is based on the minimum length + + Keyword Args: + min_length: To be selected as a potential chimera, this is the minimum + read length (default 30) + flag: The SAM alignment flag describing the type of alignment (default 0) + target: The prefix for output file + """ + logger.debug('Filtering {} for potential chimeras'.format(target)) + target = '{}.filter.fasta'.format(target.rpartition(".")[0]) + if os.path.exists(target): + logger.info('Skipping filtering for {}'.format(target)) + else: + with open(target, 'w') as oH: + with open(self.fileName) as iH: + for row in csv.reader(iH, delimiter="\t"): + if not row[0].startswith('@') and row[1] == str(flag): + if len(row[9]) >= 30: + print(textwrap.fill('>%s' % row[0], width=80), file=oH) + print(textwrap.fill('%s' % row[9], width=80), file=oH) + logger.debug('Filtering finished') + return target + + def pretty(self, row): + refId = row[2] + start = int(row[3]) + for i in row[10:]: + if i.startswith('MD'): + mismatchInfo = i + sequence = row[9] + cigar = row[5] + cigarString = clashchimeras.methods.convertCigar(row[5]) + matchLength = cigarString.count("M") + cigarString.count("D") + end = start + matchLength - 1 + record = Record(refId=refId, start=start, mismatchInfo=mismatchInfo, + sequence=sequence, cigarString=cigarString, matchLength=matchLength, + cigar=cigar, end=end) + return record - elif attribute.startswith("havana_gene"): - havana_gene = attribute.split(" ")[-1][1:-1] - elif attribute.startswith("havana_transcript"): - havana_transcript = attribute.split(" ")[-1][1:-2] +class Output: + """Contains methods for writing output files + + This class is used to generate every kind of output generated by this + package which includes plain text, ansi colored text and bed file + + Attributes: + target: A prefix for output file which will be automatically followed by + extension (default 'wip') + overlap: Minimum overlap to be set between two molecules when determining + chimera (default 4) + gap: Maximum gap (number of unknown nucleotides) to be allowed between + two molecules within a chimera (default 9) + """ - elif attribute.startswith("gene_id"): - gene_id = attribute.split(" ")[-1][1:-1] + def __init__(self, + target=None, + smallRNABed=False, + targetRNABed=False, + overlap=4, + gap=9): + self.target = target + self.overlap = overlap + self.gap = gap + + if smallRNABed: + self.smallRNABedHandle = open('{}.smallRNA.bed'.format(self.target), 'w') + print('# BED locations of smallRNA part of the identified chimera', + file=self.smallRNABedHandle) + self.smallRNABedCSV = csv.writer(self.smallRNABedHandle, delimiter="\t") + self.smallRNABedCSV.writerow( + ['# The name field represents the following:']) + self.smallRNABedCSV.writerow( + ['# E.g. 201980-1-48|hsa-mir-100==PAPSS1']) + self.smallRNABedCSV.writerow( + ['# 201980-1-48 is the fasta identifier']) + self.smallRNABedCSV.writerow( + ["# 201980 is the unique identifier"]) + self.smallRNABedCSV.writerow( + ["# 1 is the number of times that sequence was observed in raw " + "fastq "]) + self.smallRNABedCSV.writerow( + ["# 48 is the length of the sequence"]) + self.smallRNABedCSV.writerow( + ['# hsa-mir-100 represents the smallRNA transcript']) + self.smallRNABedCSV.writerow( + ['# PAPSS1 represents the gene symbol for targetRNA transcript ' + 'transcript ']) + if targetRNABed: + self.targetRNABedHandle = open('{}.targetRNA.bed'.format(self.target), + 'w') + self.targetRNABedCSV = csv.writer(self.targetRNABedHandle, delimiter="\t") + self.targetRNABedCSV.writerow( + ['# The name field represents the following:']) + self.targetRNABedCSV.writerow( + ['# E.g. 136019-1-48|ENST00000375759.6|SPEN==hsa-mir-103a-2']) + self.targetRNABedCSV.writerow( + ['# 136019-1-48 is the fasta identifier']) + self.targetRNABedCSV.writerow( + ["# 136019 is the unique identifier"]) + self.targetRNABedCSV.writerow( + ["# 1 is the number of times that sequence was observed in raw " + "fastq "]) + self.targetRNABedCSV.writerow( + ["# 48 is the length of the sequence"]) + self.targetRNABedCSV.writerow( + ["# ENST00000375759.6 is the targetRNA transcript identifier"]) + self.targetRNABedCSV.writerow( + ['# SPEN is the gene symbol for for targetRNA transcript ' + 'ENST00000375759.6']) + self.targetRNABedCSV.writerow( + ['# hsa-mir-103a-2 represents the smallRNA transcript ']) + + self.hybWriter = open('%s.chimeras.tsv' % self.target, 'w') + self.hybComments() + + def hybComments(self): + print("# fasta Identifier: The identifier in .unique.fasta. ", + "#\tE.g. 123456-3-68 ", + "#\t123456 is the unique identifier", + "#\t3 is the number of times that sequence was observed in raw " + "fastq ", + "#\t68 is the length of the sequence", sep="\n", file=self.hybWriter) + + print("# smallRNA: The cDNA ID of the type of RNA labelled as smallRNA in " + "the analysis", + "#\tE.g. hsa-let-7b (miRBase identifier)", + "#\tE.g. ENST00000619178.1|SNORD3D| (Gencode snoRNA identifier)", + sep="\n", file=self.hybWriter) + + print("# smallRNA_start: cDNA alignment start position of the smallRNA " + "part of the chimera", file=self.hybWriter) + + print("# smallRNA_MDtag: Showing the MD tag from the smallRNA SAM " + "alignment for the chimera", + "#\tSAM file format specification", + "#\thttp://samtools.github.io/hts-specs/SAMv1.pdf", + "#\tMD Z String for mismatching positions.Regex:[0-9]+(([" + "A-Z]|\^[A-Z]+)[0-9]+)*9", sep="\n", file=self.hybWriter) + + print('# smallRNA_cigar: Cigar string from the smallRNA SAM alignment for ' + 'the chimera', + "#\tSAM file format specification", + "#\thttp://samtools.github.io/hts-specs/SAMv1.pdf", + '#\tSee CIGAR in the file', sep="\n", file=self.hybWriter) + + print('# arbitrary_chimera: The chimera representation indicating what ' + 'part of the sequence represents smallRNA and targetRNA', + '#\t{ is representing a match with smallRNA', + '#\t} is representing a match with targetRNA', + '#\t# is representing unaligned sequences (identified as --gap -ga)', + '#\t- is representing a deletion (D in cigar string)', + '#\t+ is representing a deletion (I in cigar string)', + '#\tE.g {{{{{{{{-{{{{{{{{{{{{{##}}}}}}}}}}+}}}}}}}}}}}}}}}}}}}}}}' + '#\tE.g The first 22 nucleotides are aligning to smallRNA cDNA', + '#\tE.g The last 33 nucleotides are aligning to targetRNA cDNA', + sep="\n", file=self.hybWriter) + + print('# read_sequence: The actual sequence that is appeared in raw ' + 'reads', file=self.hybWriter) + + print("# targetRNA: The cDNA ID of the type of RNA labelled as targetRNA " + "in " + "the analysis", + "#\tE.g. hsa-let-7b (miRBase identifier)", + "#\tE.g. ENST00000619178.1|SNORD3D| (Gencode snoRNA identifier)", + sep="\n", file=self.hybWriter) + + print("# targetRNA_start: cDNA alignment start position of the targetRNA " + "part of the chimera", file=self.hybWriter) + + print("# targetRNA_MDtag: Showing the MD tag from the targetRNA SAM " + "alignment for the chimera", + "#\tSAM file format specification", + "#\thttp://samtools.github.io/hts-specs/SAMv1.pdf", + "#\tMD Z String for mismatching positions.Regex:[0-9]+(([" + "A-Z]|\^[A-Z]+)[0-9]+)*9", sep="\n", file=self.hybWriter) + + print('# targetRNA_cigar: Cigar string from the targetRNA SAM alignment ' + 'for ' + 'the chimera', + "#\tSAM file format specification", + "#\thttp://samtools.github.io/hts-specs/SAMv1.pdf", + '#\tSee CIGAR in the file', sep="\n", file=self.hybWriter) + + print("# fasta_Identifier", "smallRNA", "smallRNA_start", "smallRNA_MDtag", + "smallRNA_cigar", "arbitrary_chimera", "read_sequence", "targetRNA", + "targetRNA_start", "targetRNA_MDtag", "targetRNA_cigar", sep="\t", + file=self.hybWriter) + + def writeTargetRNABed(self, query, targetRNASegments, smallRNA): + if "ENS" in smallRNA and "|" in smallRNA: + _smallRNA = smallRNA.split("|")[5] + else: + _smallRNA = smallRNA + for segment in targetRNASegments: + _segment = list(segment) + _segment[3] = query + "|" + _segment[3] + "==" + _smallRNA + self.targetRNABedCSV.writerow(_segment) + + def writeSmallRNABed(self, query, smallRNASegments, targetRNA): + if "ENS" in targetRNA and "|" in targetRNA: + _targetRNA = targetRNA.split("|")[5] + else: + _targetRNA = targetRNA + for segment in smallRNASegments: + _segment = list(segment) + _segment[3] = query + "|" + _segment[3] + "==" + _targetRNA + self.smallRNABedCSV.writerow(_segment) + + def write(self, queryName, smallRNA, targetRNA): + chimeraString = clashchimeras.methods.chimeraOrNot(smallRNA.cigarString, + targetRNA.cigarString, overlap=self.overlap, gap=self.gap) + smallRNARegion = clashchimeras.methods.findRegion(smallRNA) + targetRNARegion = clashchimeras.methods.findRegion(targetRNA) + print(queryName, smallRNARegion, smallRNA.start, smallRNA.mismatchInfo, + smallRNA.cigar, chimeraString, smallRNA.sequence, + targetRNARegion, targetRNA.start, + targetRNA.mismatchInfo, targetRNA.cigar, sep="\t", file=self.hybWriter) + + def __del__(self): + self.hybWriter.close() - elif attribute.startswith("gene_name"): - gene_name = attribute.split(" ")[-1][1:-1] - elif attribute.startswith("transcript_name"): - transcript_name = attribute.split(" ")[-1][1:-1] +class Fasta: + def __init__(self, genome=None, gtf=None): + self.genome = genome + self.gtf = gtf + self.faidx = pyfaidx.Fasta(self.genome) + def getBiotype(self, output=None, biotype=None): + self.sequences = [] + g = GTF(fileName=self.gtf) if biotype == 'tRNA': - if transcript_type == "tRNAscan": - self.biotypeFeatures[transcript_id].append((exon_number, row[0], - int(row[3]), int(row[4]), - row[6], gene_id, - havana_gene, - havana_transcript, - transcript_name, - gene_name)) + g.readBiotype(biotype=biotype, featureType='tRNAscan') else: - if transcript_type == biotype: - self.biotypeFeatures[transcript_id].append((exon_number, row[0], - int(row[3]), int(row[4]), - row[6], gene_id, - havana_gene, - havana_transcript, - transcript_name, - gene_name)) - - self.fileHandle.close() - - def read(self, featureType='exon'): - """Reads gtf file provided during class initialization - - Stores the byte positions of every feature in a defaultdict(list) object - named self.features - - Keyword Args: - featureType: Feature type of a gtf record, the third element of every - record in the file. Please change this if you want to get specific - records (e.g. 'UTR') (default 'exon') - """ - - logger.info('Reading %s' % self.fileName) - self.fileHandle = open(self.fileName, 'r+b') - bytePosition = self.fileHandle.tell() - for line in self.fileHandle: - row = line.decode('utf-8').rstrip().split("\t") - if not row[0].startswith("#") and row[2] == featureType: - attributes = row[-1].split("; ") - for attribute in attributes: - if attribute.startswith("transcript_id"): - transcript_id = attribute.split(" ")[-1][1:-1] - self.features[transcript_id].append(bytePosition) - self.geneIds[transcript_id] = gene_id - if attribute.startswith("gene_id"): - gene_id = attribute.split(" ")[-1][1:-1] - bytePosition = self.fileHandle.tell() - self.fileHandle.close() - - self.fileHandle = open(self.fileName, 'r+b') - bytePosition = self.fileHandle.tell() - for line in self.fileHandle: - row = line.decode('utf-8').rstrip().split("\t") - if not row[0].startswith("#") and row[2] == featureType: - attributes = row[-1].split("; ") - for attribute in attributes: - if attribute.startswith("gene_id"): - gene_id = attribute.split(" ")[-1][1:-1] - self.geneFeatures[gene_id].append(bytePosition) - bytePosition = self.fileHandle.tell() - self.fileHandle.close() + g.readBiotype(biotype=biotype) + for transcript_id, exons in g.biotypeFeatures.items(): + temp_seq = '' + exons.sort(key=itemgetter(0)) + for exon in exons: + if exon[4] == '-': + temp_seq += (-self.faidx[exon[1]][exon[2] - 1:exon[3]]).seq + elif exon[4] == '+': + temp_seq += self.faidx[exon[1]][exon[2] - 1:exon[3]].seq + + _id = '{}|{}|{}|{}|{}|{}|{}'.format(transcript_id, + exons[0][5], + exons[0][6], + exons[0][7], + exons[0][8], + exons[0][9], + len(temp_seq)) + + temp_rec = SeqRecord(seq=Seq(temp_seq), id=_id, + description='') + + self.sequences.append(temp_rec) + + if not output: + logger.error('Please provide output file..') + sys.exit() + else: + logger.info('Writing {}'.format(output)) + SeqIO.write(self.sequences, output, 'fasta') - logger.debug('Reading %s finished' % self.fileName) - def process(self, name): - """A method to return a Record object providing genomic information +class Fastq: - Args: - name: A valid gencode transcript_id + def __init__(self, fileName=None, compressed=False): + self.fileName = fileName + self.compressed = compressed + self.n = 4 + self.sequences = Counter() + self.uniqueOutput = fileName.rpartition(".")[0] + '.unique.fasta' - Returns: - An object Record containing scaffold, start, end, strand, mirbase_id and - mirbase_name as its variables for access - """ - self.fileHandle = open(self.fileName, 'r+b') - self.mm = mmap.mmap(self.fileHandle.fileno(), 0) - positions = self.features[name] - for position in positions: - self.mm.seek(position) - row = self.mm.readline().decode('utf-8').rstrip().split("\t") - attributes = row[-1].split("; ") - _eid = '-' - _enb = '0' - for attribute in attributes: - if attribute.startswith("transcript_type"): - _tt = attribute.split(" ")[-1][1:-1] - elif attribute.startswith("transcript_id"): - _tid = attribute.split(" ")[-1][1:-1] - elif attribute.startswith("exon_id"): - _eid = attribute.split(" ")[-1][1:-1] - elif attribute.startswith("exon_number"): - _enb = int(attribute.split(" ")[-1]) - elif attribute.startswith("gene_name"): - _gn = attribute.split(" ")[-1][1:-1] - record = Record(scaffold=row[0], start=int(row[3]), end=int(row[4]), - strand=row[6], transcript_type=_tt, transcript_id=_tid, exon_id=_eid, - exon_number=_enb, gene_name=_gn) - yield record - self.fileHandle.close() - - def geneExonicRegions(self, df): - """Given a DataFrame with the exon coordinates from Gencode for a single - gene, return the total number of coding regions in that gene. - """ - scaffold = df.iloc[0].scaffold - strand = df.iloc[0].strand - gene_type = df.iloc[0].gene_type - gene_id = df.iloc[0].gene_id - gene_name = df.iloc[0].gene_name - start = df.start.min() - end = df.end.max() - bp = [False] * (end - start + 1) - for i in range(df.shape[0]): - s = df.iloc[i]['start'] - start - e = df.iloc[i]['end'] - start + 1 - bp[s:e] = [True] * (e - s) - regions = list(range(start, end+1)) - groups = [] - - for i, j in groupby(bp): - groups.append((i, len(list(j)))) - e_start = 0 - - for i in groups: - e_end = e_start + i[1] - if i[0]: - record = Record(scaffold=scaffold, start=regions[e_start], - end=regions[e_end-1], gene_type=gene_type, gene_id=gene_id, - gene_name=gene_name, strand=strand) + def recordIterator(self): + record = [] + record_length = 0 + for line in self.fileHandle: + if record_length == self.n: + yield record + record_length = 0 + record = [] + record.append(line.decode().rstrip()) + record_length += 1 yield record - e_start += i[1] - - def geneProcess(self, name): - """A method to return a Record object providing genomic information - - Args: - name: A valid gencode gene_id - - Returns: - An object Record containing scaffold, start, end, strand, mirbase_id and - mirbase_name as its variables for access - """ - self.fileHandle = open(self.fileName, 'r+b') - self.mm = mmap.mmap(self.fileHandle.fileno(), 0) - positions = self.geneFeatures[name] - exons = [] - for position in positions: - self.mm.seek(position) - row = self.mm.readline().decode('utf-8').rstrip().split("\t") - attributes = row[-1].split("; ") - for attribute in attributes: - if attribute.startswith("gene_type"): - _gt = attribute.split(" ")[-1][1:-1] - elif attribute.startswith("gene_id"): - _gid = attribute.split(" ")[-1][1:-1] - elif attribute.startswith("gene_name"): - _gn = attribute.split(" ")[-1][1:-1] - exons.append((row[0], int(row[3]), int(row[4]), row[6], _gt, _gid, _gn)) - self.fileHandle.close() - exons_df = pd.DataFrame(exons, columns = ['scaffold', 'start', 'end', - 'strand', 'gene_type', 'gene_id', 'gene_name']) - - for record in self.geneExonicRegions(exons_df): - yield record - - def coordinates(self, name, start=None, end=None): - """A generator to return a bed record containing genomic coordinates for the - aligned segment - - Keyword Args: - start: The alignment start position of the cDNA molecule or the relative - start of the particular molecule - end: The alignment end position in the cDNA molecule or the relative end - of the particular molecule - - Args: - name: A valid miRNA_primary_transcript name - - Returns: - A list of tuple(s) of strings containing elements for a bed record. There - may be more than one because of alternate splicing. - """ - if "|" in name: - self.name = name.split("|")[0] - else: - self.name = name - positions = {} - match_positions = [] - records = [] - segments = [] - result_segments = [] - for record in self.process(self.name): - records.append(record) - records.sort(key=lambda x: int(x.exon_number)) - - if records[0].strand == '+': - _start = 1 - for record in records: - for relative, actual in enumerate(range(record.start, record.end+1),\ - start=_start): - positions[relative] = actual - _start = relative + 1 - for pos in range(start, end): - match_positions.append(positions[pos]) - for key, group in groupby(enumerate(match_positions), - lambda x: x[0] - x[-1]): - segment = list(map(itemgetter(1), group)) - segments.append([segment[0], segment[-1]]) - for segment in segments: - for record in records: - if segment[0] >= record.start and segment[1] <= record.end: - result_segments.append((record.scaffold, segment[0], segment[1], - record.transcript_id + '|' + record.gene_name, 0, record.strand)) - - elif records[0].strand == '-': - _start = 1 - for record in records: - for relative, actual in enumerate(reversed(range(record.start, \ - record.end+1)), start=_start): - positions[relative] = actual - _start = relative + 1 - for pos in range(start, end): - match_positions.append(positions[pos]) - for key, group in groupby(enumerate(reversed(match_positions)), - lambda x: x[0] - x[-1]): - segment = list(map(itemgetter(1), group)) - segments.append([segment[0], segment[-1]]) - for segment in segments: - for record in records: - if segment[0] >= record.start and segment[1] <= record.end: - result_segments.append((record.scaffold, segment[0], segment[1], - record.transcript_id + '|' + record.gene_name, 0, record.strand)) - - if len(result_segments) == 0: - logger.debug('%s, %s, %s' % (name, start, end)) - logger.debug('%s' % str(segments)) - for r in records: - logger.debug('%s %s %s %s' % (r.scaffold, r.strand, - r.start, r.end)) - - return result_segments - -class SAM: - """SAM file parser for parsing bowtie2 generated files - - This class uses memory-mapped file object to read a sam file - - Attributes: - fileName: A sam file path - """ - - def __init__(self, fileName=None): - self.fileName = fileName - self.records = {} - - def read(self, flag=0): - """Reads sam file provided during class initialization - - Stores the byte position of every record based on the keyword arg flag - provided, to a dict object named self.records - - Keyword Args: - flag: The SAM alignment flag for a record. For default, it uses the - primary alignment for every record and ignores secondary alignments - (default 0) - """ - logger.info('Reading %s' % self.fileName) - self.fileHandle = open(self.fileName, 'r+b') - bytePosition = self.fileHandle.tell() - for line in self.fileHandle: - read = line.decode('utf-8').split("\t") - if not read[0].startswith("@") and read[1] == str(flag): - self.records[read[0]] = bytePosition - bytePosition = self.fileHandle.tell() - self.fileHandle.close() - logger.debug('Reading %s finished' % self.fileName) - - def access(self, queryName): - """Provides random access of a record from the sam file - - Args: - queryName: The query name of the read from the sam file - - Returns: - A list generated after splitting the record line from sam file - """ - self.fileHandle = open(self.fileName, 'r+b') - self.mm = mmap.mmap(self.fileHandle.fileno(), 0) - self.mm.seek(self.records[queryName]) - row = self.mm.readline().decode('utf-8').rstrip().split("\t") - self.fileHandle.close() - return self.pretty(row) + def createUnique(self): + if self.compressed: + self.fileHandle = gzip.open(self.fileName, 'rb') + else: + self.fileHandle = open(self.fileName, 'rb') + logger.info('Reading {}'.format(self.fileName)) + for record in self.recordIterator(): + self.sequences[record[1]] += 1 + logger.info('Writing {}'.format(self.uniqueOutput)) + with open(self.uniqueOutput, 'w') as wH: + for index, (sequence, counts) in enumerate(sorted(self.sequences.items(), + key=itemgetter(1), reverse=True), start=1): + print('>{}-{}-{}'.format(index, counts, len(sequence)), file=wH) + print(textwrap.fill(sequence, width=80), file=wH) + logger.debug('Finished writing {}'.format(self.uniqueOutput)) + self.fileHandle.close() - def filterPotentialChimeras(self, min_length=30, flag=0, target=None): - """Generated a filtered fasta file from a sam file - This filtered fasta file contains reads that can be potentially chimeras. - The criteria for filtering is based on the minimum length +class Record: + """A custom object (preferred over dict) for easy access using variables - Keyword Args: - min_length: To be selected as a potential chimera, this is the minimum - read length (default 30) - flag: The SAM alignment flag describing the type of alignment (default 0) - target: The prefix for output file + It's a dependency for GTF and GFF classes """ - logger.debug('Filtering {} for potential chimeras'.format(target)) - target = '{}.filter.fasta'.format(target.rpartition(".")[0]) - if os.path.exists(target): - logger.info('Skipping filtering for {}'.format(target)) - else: - with open(target, 'w') as oH: - with open(self.fileName) as iH: - for row in csv.reader(iH, delimiter="\t"): - if not row[0].startswith('@') and row[1] == str(flag): - if len(row[9]) >= 30: - print(textwrap.fill('>%s' % row[0], width=80), file=oH) - print(textwrap.fill('%s' % row[9], width=80), file=oH) - logger.debug('Filtering finished') - return target - - def pretty(self, row): - refId = row[2] - start = int(row[3]) - for i in row[10:]: - if i.startswith('MD'): mismatchInfo = i - sequence = row[9] - cigar = row[5] - cigarString = clashchimeras.methods.convertCigar(row[5]) - matchLength = cigarString.count("M") + cigarString.count("D") - end = start + matchLength - 1 - record = Record(refId=refId, start=start, mismatchInfo=mismatchInfo, - sequence=sequence, cigarString=cigarString, matchLength=matchLength, - cigar=cigar, end=end) - return record - -class Output: - """Contains methods for writing output files - - This class is used to generate every kind of output generated by this - package which includes plain text, ansi colored text and bed file - - Attributes: - target: A prefix for output file which will be automatically followed by - extension (default 'wip') - overlap: Minimum overlap to be set between two molecules when determining - chimera (default 4) - gap: Maximum gap (number of unknown nucleotides) to be allowed between - two molecules within a chimera (default 9) - """ - - def __init__(self, - target=None, - smallRNABed=False, - targetRNABed=False, - overlap=4, - gap=9): - self.target = target - self.overlap = overlap - self.gap = gap - - if smallRNABed: - self.smallRNABedHandle = open('{}.smallRNA.bed'.format(self.target), 'w') - print('# BED locations of smallRNA part of the identified chimera', - file=self.smallRNABedHandle) - self.smallRNABedCSV = csv.writer(self.smallRNABedHandle, delimiter="\t") - self.smallRNABedCSV.writerow( - ['# The name field represents the following:']) - self.smallRNABedCSV.writerow( - ['# E.g. 201980-1-48|hsa-mir-100==PAPSS1']) - self.smallRNABedCSV.writerow( - ['# 201980-1-48 is the fasta identifier']) - self.smallRNABedCSV.writerow( - ["# 201980 is the unique identifier"]) - self.smallRNABedCSV.writerow( - ["# 1 is the number of times that sequence was observed in raw " - "fastq "]) - self.smallRNABedCSV.writerow( - ["# 48 is the length of the sequence"]) - self.smallRNABedCSV.writerow( - ['# hsa-mir-100 represents the smallRNA transcript']) - self.smallRNABedCSV.writerow( - ['# PAPSS1 represents the gene symbol for targetRNA transcript ' - 'transcript ']) - if targetRNABed: - self.targetRNABedHandle = open('{}.targetRNA.bed'.format(self.target), - 'w') - self.targetRNABedCSV = csv.writer(self.targetRNABedHandle, delimiter="\t") - self.targetRNABedCSV.writerow( - ['# The name field represents the following:']) - self.targetRNABedCSV.writerow( - ['# E.g. 136019-1-48|ENST00000375759.6|SPEN==hsa-mir-103a-2']) - self.targetRNABedCSV.writerow( - ['# 136019-1-48 is the fasta identifier']) - self.targetRNABedCSV.writerow( - ["# 136019 is the unique identifier"]) - self.targetRNABedCSV.writerow( - ["# 1 is the number of times that sequence was observed in raw " - "fastq "]) - self.targetRNABedCSV.writerow( - ["# 48 is the length of the sequence"]) - self.targetRNABedCSV.writerow( - ["# ENST00000375759.6 is the targetRNA transcript identifier"]) - self.targetRNABedCSV.writerow( - ['# SPEN is the gene symbol for for targetRNA transcript ' - 'ENST00000375759.6']) - self.targetRNABedCSV.writerow( - ['# hsa-mir-103a-2 represents the smallRNA transcript ']) - - self.hybWriter = open('%s.chimeras.tsv' % self.target, 'w') - self.hybComments() - - def hybComments(self): - print("# fasta Identifier: The identifier in .unique.fasta. ", - "#\tE.g. 123456-3-68 ", - "#\t123456 is the unique identifier", - "#\t3 is the number of times that sequence was observed in raw " - "fastq ", - "#\t68 is the length of the sequence", sep="\n", file=self.hybWriter) - - print("# smallRNA: The cDNA ID of the type of RNA labelled as smallRNA in " - "the analysis", - "#\tE.g. hsa-let-7b (miRBase identifier)", - "#\tE.g. ENST00000619178.1|SNORD3D| (Gencode snoRNA identifier)", - sep="\n", file=self.hybWriter) - - print("# smallRNA_start: cDNA alignment start position of the smallRNA " - "part of the chimera", file=self.hybWriter) - - print("# smallRNA_MDtag: Showing the MD tag from the smallRNA SAM " - "alignment for the chimera", - "#\tSAM file format specification", - "#\thttp://samtools.github.io/hts-specs/SAMv1.pdf", - "#\tMD Z String for mismatching positions.Regex:[0-9]+(([" - "A-Z]|\^[A-Z]+)[0-9]+)*9", sep="\n", file=self.hybWriter) - - print('# smallRNA_cigar: Cigar string from the smallRNA SAM alignment for ' - 'the chimera', - "#\tSAM file format specification", - "#\thttp://samtools.github.io/hts-specs/SAMv1.pdf", - '#\tSee CIGAR in the file', sep="\n", file=self.hybWriter) - - print('# arbitrary_chimera: The chimera representation indicating what ' - 'part of the sequence represents smallRNA and targetRNA', - '#\t{ is representing a match with smallRNA', - '#\t} is representing a match with targetRNA', - '#\t# is representing unaligned sequences (identified as --gap -ga)', - '#\t- is representing a deletion (D in cigar string)', - '#\t+ is representing a deletion (I in cigar string)', - '#\tE.g {{{{{{{{-{{{{{{{{{{{{{##}}}}}}}}}}+}}}}}}}}}}}}}}}}}}}}}}' - '#\tE.g The first 22 nucleotides are aligning to smallRNA cDNA', - '#\tE.g The last 33 nucleotides are aligning to targetRNA cDNA', - sep="\n", file=self.hybWriter) - - print('# read_sequence: The actual sequence that is appeared in raw ' - 'reads', file=self.hybWriter) - - print("# targetRNA: The cDNA ID of the type of RNA labelled as targetRNA " - "in " - "the analysis", - "#\tE.g. hsa-let-7b (miRBase identifier)", - "#\tE.g. ENST00000619178.1|SNORD3D| (Gencode snoRNA identifier)", - sep="\n", file=self.hybWriter) - - print("# targetRNA_start: cDNA alignment start position of the targetRNA " - "part of the chimera", file=self.hybWriter) - - print("# targetRNA_MDtag: Showing the MD tag from the targetRNA SAM " - "alignment for the chimera", - "#\tSAM file format specification", - "#\thttp://samtools.github.io/hts-specs/SAMv1.pdf", - "#\tMD Z String for mismatching positions.Regex:[0-9]+(([" - "A-Z]|\^[A-Z]+)[0-9]+)*9", sep="\n", file=self.hybWriter) - - print('# targetRNA_cigar: Cigar string from the targetRNA SAM alignment ' - 'for ' - 'the chimera', - "#\tSAM file format specification", - "#\thttp://samtools.github.io/hts-specs/SAMv1.pdf", - '#\tSee CIGAR in the file', sep="\n", file=self.hybWriter) - - print("# fasta_Identifier", "smallRNA", "smallRNA_start", "smallRNA_MDtag", - "smallRNA_cigar", "arbitrary_chimera", "read_sequence", "targetRNA", - "targetRNA_start", "targetRNA_MDtag", "targetRNA_cigar", sep="\t", - file=self.hybWriter) - - def writeTargetRNABed(self, query, targetRNASegments, smallRNA): - if "ENS" in smallRNA and "|" in smallRNA: - _smallRNA = smallRNA.split("|")[5] - else: - _smallRNA = smallRNA - for segment in targetRNASegments: - _segment = list(segment) - _segment[3] = query + "|" + _segment[3] + "==" + _smallRNA - self.targetRNABedCSV.writerow(_segment) - - def writeSmallRNABed(self, query, smallRNASegments, targetRNA): - if "ENS" in targetRNA and "|" in targetRNA: - _targetRNA = targetRNA.split("|")[5] - else: - _targetRNA = targetRNA - for segment in smallRNASegments: - _segment = list(segment) - _segment[3] = query + "|" + _segment[3] + "==" + _targetRNA - self.smallRNABedCSV.writerow(_segment) - - def write(self, queryName, smallRNA, targetRNA): - chimeraString = clashchimeras.methods.chimeraOrNot(smallRNA.cigarString, - targetRNA.cigarString, overlap=self.overlap, gap=self.gap) - smallRNARegion = clashchimeras.methods.findRegion(smallRNA) - targetRNARegion = clashchimeras.methods.findRegion(targetRNA) - print(queryName, smallRNARegion, smallRNA.start, smallRNA.mismatchInfo, - smallRNA.cigar, chimeraString, smallRNA.sequence, - targetRNARegion, targetRNA.start, - targetRNA.mismatchInfo, targetRNA.cigar, sep="\t", file=self.hybWriter) - - def __del__(self): - self.hybWriter.close() - -class Fasta: - def __init__(self, genome=None, gtf=None): - self.genome = genome - self.gtf = gtf - self.faidx = pyfaidx.Fasta(self.genome) - - def getBiotype(self, output=None, biotype=None): - self.sequences = [] - g = GTF(fileName=self.gtf) - if biotype == 'tRNA': - g.readBiotype(biotype=biotype, featureType='tRNAscan') - else: - g.readBiotype(biotype=biotype) - for transcript_id, exons in g.biotypeFeatures.items(): - temp_seq = '' - exons.sort(key=itemgetter(0)) - for exon in exons: - if exon[4] == '-': - temp_seq += (-self.faidx[exon[1]][exon[2]-1:exon[3]]).seq - elif exon[4] == '+': - temp_seq += self.faidx[exon[1]][exon[2]-1:exon[3]].seq - - _id = '{}|{}|{}|{}|{}|{}|{}'.format(transcript_id, - exons[0][5], - exons[0][6], - exons[0][7], - exons[0][8], - exons[0][9], - len(temp_seq)) - - temp_rec = SeqRecord(seq=Seq(temp_seq), id=_id, - description='') - - self.sequences.append(temp_rec) - - if not output: - logger.error('Please provide output file..') - sys.exit() - else: - logger.info('Writing {}'.format(output)) - SeqIO.write(self.sequences, output, 'fasta') - - -class Fastq: - def __init__(self, fileName=None, compressed=False): - self.fileName = fileName - self.compressed = compressed - self.n = 4 - self.sequences = Counter() - self.uniqueOutput = fileName.rpartition(".")[0] + '.unique.fasta' - - def recordIterator(self): - record = [] - record_length = 0 - for line in self.fileHandle: - if record_length == self.n: - yield record - record_length = 0 - record = [] - record.append(line.decode().rstrip()) - record_length += 1 - yield record - - def createUnique(self): - if self.compressed: - self.fileHandle = gzip.open(self.fileName, 'rb') - else: - self.fileHandle = open(self.fileName, 'rb') - logger.info('Reading {}'.format(self.fileName)) - for record in self.recordIterator(): - self.sequences[record[1]] += 1 - logger.info('Writing {}'.format(self.uniqueOutput)) - with open(self.uniqueOutput, 'w') as wH: - for index, (sequence, counts) in enumerate(sorted(self.sequences.items(), - key=itemgetter(1), reverse=True), start=1): - print('>{}-{}-{}'.format(index, counts, len(sequence)), file=wH) - print(textwrap.fill(sequence, width=80), file=wH) - logger.debug('Finished writing {}'.format(self.uniqueOutput)) - self.fileHandle.close() - -class Record: - """A custom object (preferred over dict) for easy access using variables - It's a dependency for GTF and GFF classes - """ - def __init__(self, **kwds): - self.__dict__.update(kwds) + def __init__(self, **kwds): + self.__dict__.update(kwds) diff --git a/clashchimeras/runners.py b/clashchimeras/runners.py index 4ef0cb1..9f0a033 100644 --- a/clashchimeras/runners.py +++ b/clashchimeras/runners.py @@ -1,4 +1,3 @@ -import glob import hashlib import logging import os @@ -8,164 +7,173 @@ logger = logging.getLogger('root') + class Bowtie: - """Class for calling bowtie2 aligner - """ - - def __init__(self, args=None): - self.parameters = [] - self.inputFile = args.input - self.compressed = args.gzip - self.bowtieExecutable = args.bowtieExecutable - self.uniqueInput = self.inputFile.rpartition(".")[0] + '.unique.fasta' - if os.path.exists(self.uniqueInput) and os.path.exists( - self.uniqueInput+'.sha256sum'): - with open(self.uniqueInput+'.sha256sum') as f: - s = f.readline().rstrip() - _s = self.hash(self.uniqueInput) - if s == _s: - logger.info('{} is present and verified'.format(self.uniqueInput)) - else: - logger.warn('{} is present but checksum verification failed'.format( - self.uniqueInput)) - logger.warn('{} will be generated again'.format(self.uniqueInput)) - else: - _f = Fastq(fileName=self.inputFile, compressed=self.compressed) - _f.createUnique() - sha256sum = self.hash(self.uniqueInput) - with open(self.uniqueInput+'.sha256sum', 'w') as wH: - print(sha256sum, file=wH) - if not args.reverseComplement: - self.parameters.append('--norc') - self.parameters.append('-N') - self.parameters.append(str(args.mismatch)) - if not args.unaligned: - self.parameters.append('--no-unal') - self.parameters.append('-p') - self.parameters.append(str(args.threads)) - self.parameters.append('--'+args.preset) - self.smallRNAIndex = args.smallRNAIndex - self.targetRNAIndex = args.targetRNAIndex - self.output = args.output - self.runAligner = True - - def hash(self, file): - sha256 = hashlib.sha256() - with open(file, 'r+b') as f: - while True: - buf = f.read(8192) - if not buf: - break - sha256.update(buf) - return sha256.hexdigest() - - def run(self, output=None, filtered=None): - self.bowtie2 = [] - if output=='smallRNA': - self.outputSam = '{}.{}.sam'.format(self.output, 'smallRNA') - elif output=='targetRNA': - self.outputSam = '{}.{}.sam'.format(self.output, 'targetRNA') - if not self.bowtieExecutable: - self.bowtie2.append('bowtie2') - else: - self.bowtie2.append(self.bowtieExecutable) - self.bowtie2.append('-f') - if filtered: - self.bowtie2.append(filtered) - else: - self.bowtie2.append(self.uniqueInput) - self.bowtie2.append('-x') - if output == 'smallRNA': - self.bowtie2.append('"'+self.smallRNAIndex+'"') - elif output == 'targetRNA': - self.bowtie2.append('"'+self.targetRNAIndex+'"') - self.bowtie2 = self.bowtie2 + self.parameters - self.bowtie2.append('-S') - self.bowtie2.append(self.outputSam) - - if os.path.exists(self.outputSam) and os.path.exists( - self.outputSam+'.sha256sum'): - with open(self.outputSam+'.sha256sum') as f: - s = f.readline().rstrip() - _s = self.hash(self.outputSam) - if s == _s: - logger.info('{} is present and verified'.format(self.outputSam)) - self.runAligner = False - else: - logger.warn('{} is present but checksum verification failed'.format( - self.outputSam)) - logger.warn('{} will be generated again') + """Class for calling bowtie2 aligner + """ + + def __init__(self, args=None): + self.parameters = [] + self.inputFile = args.input + self.compressed = args.gzip + self.bowtieExecutable = args.bowtieExecutable + self.uniqueInput = self.inputFile.rpartition(".")[0] + '.unique.fasta' + if os.path.exists(self.uniqueInput) and os.path.exists( + self.uniqueInput + '.sha256sum'): + with open(self.uniqueInput + '.sha256sum') as f: + s = f.readline().rstrip() + _s = self.hash(self.uniqueInput) + if s == _s: + logger.info( + '{} is present and verified'.format(self.uniqueInput)) + else: + logger.warn('{} is present but checksum verification ' + 'failed'.format(self.uniqueInput)) + logger.warn( + '{} will be generated again'.format(self.uniqueInput)) + else: + _f = Fastq(fileName=self.inputFile, compressed=self.compressed) + _f.createUnique() + sha256sum = self.hash(self.uniqueInput) + with open(self.uniqueInput + '.sha256sum', 'w') as wH: + print(sha256sum, file=wH) + if not args.reverseComplement: + self.parameters.append('--norc') + self.parameters.append('-N') + self.parameters.append(str(args.mismatch)) + if not args.unaligned: + self.parameters.append('--no-unal') + self.parameters.append('-p') + self.parameters.append(str(args.threads)) + self.parameters.append('--' + args.preset) + self.smallRNAIndex = args.smallRNAIndex + self.targetRNAIndex = args.targetRNAIndex + self.output = args.output self.runAligner = True - if self.runAligner: - logger.info('Executing bowtie2 and generating {}'.format(self.outputSam)) - logger.info('The complete command for bowtie2 is \n{}'.format( - " ".join(self.bowtie2))) - bowtie2Process = subprocess.Popen(self.bowtie2, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - bowtie2Stdout, bowtie2Stderr = bowtie2Process.communicate() - logger.warn(bowtie2Stdout.decode('utf-8')) - logger.warn(bowtie2Stderr.decode('utf-8')) - if os.path.exists(self.outputSam): - logger.info('Bowtie2 execution finished and {} generated'.format( - self.outputSam)) - sha256sum = self.hash(self.outputSam) - with open(self.outputSam+'.sha256sum', 'w') as wH: - print(sha256sum, file=wH) - else: - logger.error('Something went wrong with your bowtie2 command.') - logger.error('Please check your bowtie2 stderr and report a bug....') + def hash(self, file): + sha256 = hashlib.sha256() + with open(file, 'r+b') as f: + while True: + buf = f.read(8192) + if not buf: + break + sha256.update(buf) + return sha256.hexdigest() + + def run(self, output=None, filtered=None): + self.bowtie2 = [] + if output == 'smallRNA': + self.outputSam = '{}.{}.sam'.format(self.output, 'smallRNA') + elif output == 'targetRNA': + self.outputSam = '{}.{}.sam'.format(self.output, 'targetRNA') + if not self.bowtieExecutable: + self.bowtie2.append('bowtie2') + else: + self.bowtie2.append(self.bowtieExecutable) + self.bowtie2.append('-f') + if filtered: + self.bowtie2.append(filtered) + else: + self.bowtie2.append(self.uniqueInput) + self.bowtie2.append('-x') + if output == 'smallRNA': + self.bowtie2.append('"' + self.smallRNAIndex + '"') + elif output == 'targetRNA': + self.bowtie2.append('"' + self.targetRNAIndex + '"') + self.bowtie2 = self.bowtie2 + self.parameters + self.bowtie2.append('-S') + self.bowtie2.append(self.outputSam) + + if os.path.exists(self.outputSam) and os.path.exists( + self.outputSam + '.sha256sum'): + with open(self.outputSam + '.sha256sum') as f: + s = f.readline().rstrip() + _s = self.hash(self.outputSam) + if s == _s: + logger.info( + '{} is present and verified'.format(self.outputSam)) + self.runAligner = False + else: + logger.warn('{} is present but checksum verification ' + 'failed'.format(self.outputSam)) + logger.warn('{} will be generated again') + self.runAligner = True + + if self.runAligner: + logger.info( + 'Executing bowtie2 and generating {}'.format(self.outputSam)) + logger.info('The complete command for bowtie2 is \n{}'.format( + " ".join(self.bowtie2))) + bowtie2Process = subprocess.Popen(self.bowtie2, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + bowtie2Stdout, bowtie2Stderr = bowtie2Process.communicate() + logger.warn(bowtie2Stdout.decode('utf-8')) + logger.warn(bowtie2Stderr.decode('utf-8')) + if os.path.exists(self.outputSam): + logger.info('Bowtie2 execution finished and {} ' + 'generated'.format(self.outputSam)) + sha256sum = self.hash(self.outputSam) + with open(self.outputSam + '.sha256sum', 'w') as wH: + print(sha256sum, file=wH) + else: + logger.error('Something went wrong with your bowtie2 command.') + logger.error( + 'Please check your bowtie2 stderr and report a bug....') class Tophat: - def __init__(self, args=None): - self.inputFile = args.input - if args.genomeIndex: - args.genomeIndex = os.path.expanduser(args.genomeIndex) - if args.transcriptomeIndex: - args.transcriptomeIndex = os.path.expanduser( - args.transcriptomeIndex) - if args.output: - args.output = os.path.expanduser(args.output) - if args.input: - args.input = os.path.expanduser(args.input) - - self.tophatOutput = args.output - self.genomeIndex = args.genomeIndex - - self.tophatRun = [] - if args.tophatExecutable is not None: - self.tophatExecutable = args.tophatExecutable - else: - self.tophatExecutable = 'tophat' - self.tophatRun.append(self.tophatExecutable) - self.tophatRun.append('--b2-'+args.tophatPreset) - self.tophatRun.append('--b2-N') - self.tophatRun.append(str(args.mismatch)) - - if args.transcriptomeIndex: - self.tophatRun.append('--transcriptome-index='+args.transcriptomeIndex - .strip()) - else: - logger.warn('Transcriptome index is not provided for tophat') - self.tophatRun.append('-p') - self.tophatRun.append(str(args.threads)) - self.tophatRun.append('-o') - self.tophatRun.append(self.tophatOutput) - self.tophatRun.append(self.genomeIndex) - self.tophatRun.append(self.inputFile) - - def run(self): - logger.info('Executing Tophat2 for {}'.format(self.inputFile)) - logger.info('The complete command for tophat is \n{}'.format(" ".join( - self.tophatRun))) - logger.info('The stdout and stderr for tophat run will be dumped after ' - 'its finished') - tophatProcess = subprocess.Popen(self.tophatRun, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - tophatStderr, tophatStdout = tophatProcess.communicate() - logger.warn('Tophat stderr') - logger.info(tophatStderr.decode("utf-8")) - logger.warn('Tophat stdout') - logger.info(tophatStdout.decode("utf-8")) - logger.info('Execution of Tophat2 finished') + + def __init__(self, args=None): + self.inputFile = args.input + if args.genomeIndex: + args.genomeIndex = os.path.expanduser(args.genomeIndex) + if args.transcriptomeIndex: + args.transcriptomeIndex = os.path.expanduser( + args.transcriptomeIndex) + if args.output: + args.output = os.path.expanduser(args.output) + if args.input: + args.input = os.path.expanduser(args.input) + + self.tophatOutput = args.output + self.genomeIndex = args.genomeIndex + + self.tophatRun = [] + if args.tophatExecutable is not None: + self.tophatExecutable = args.tophatExecutable + else: + self.tophatExecutable = 'tophat' + self.tophatRun.append(self.tophatExecutable) + self.tophatRun.append('--b2-' + args.tophatPreset) + self.tophatRun.append('--b2-N') + self.tophatRun.append(str(args.mismatch)) + + if args.transcriptomeIndex: + self.tophatRun.append('--transcriptome-index=' + + args.transcriptomeIndex.strip()) + else: + logger.warn('Transcriptome index is not provided for tophat') + self.tophatRun.append('-p') + self.tophatRun.append(str(args.threads)) + self.tophatRun.append('-o') + self.tophatRun.append(self.tophatOutput) + self.tophatRun.append(self.genomeIndex) + self.tophatRun.append(self.inputFile) + + def run(self): + logger.info('Executing Tophat2 for {}'.format(self.inputFile)) + logger.info('The complete command for tophat is \n{}'.format(" ".join( + self.tophatRun))) + logger.info('The stdout and stderr for tophat run will be ' + 'dumped after its finished') + tophatProcess = subprocess.Popen(self.tophatRun, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + tophatStderr, tophatStdout = tophatProcess.communicate() + logger.warn('Tophat stderr') + logger.info(tophatStderr.decode("utf-8")) + logger.warn('Tophat stdout') + logger.info(tophatStdout.decode("utf-8")) + logger.info('Execution of Tophat2 finished') diff --git a/dist/CLASHChimeras-0.1b3-py3.5.egg b/dist/CLASHChimeras-0.1b3-py3.5.egg new file mode 100644 index 0000000..b232367 Binary files /dev/null and b/dist/CLASHChimeras-0.1b3-py3.5.egg differ diff --git a/setup.py b/setup.py index cd44dfb..813c429 100644 --- a/setup.py +++ b/setup.py @@ -1,59 +1,61 @@ +import logging import os import sys -import logging from setuptools import setup logging.basicConfig(level=logging.DEBUG, - format='[%(asctime)s] - %(name)s - %(levelname)s - %(message)s' -) + format='[%(asctime)s] - %(name)s - %(levelname)s - ' + '%(message)s') logger = logging.getLogger('CLASHChimeras') major, minor = sys.version_info.major, sys.version_info.minor if major == 3 and minor < 4: - logger.warn('This package is tested with Python 3.4') - logger.warn('However it should work with Python version 3.x {}'.format( - ' but should you face any problems, please install Python >= 3.4' - )) + logger.warn('This package is tested with Python 3.4') + logger.warn('However it should work with Python version 3.x {}'.format( + ' but should you face any problems, please install Python >= 3.4' + )) elif major < 3: - logger.error('Please install Python version >= 3 to use this package') - sys.exit(2) + logger.error('Please install Python version >= 3 to use this package') + sys.exit(2) -requirements = open(os.path.join(os.path.dirname(__file__), 'requirements.txt')).readlines() -version = open(os.path.join(os.path.dirname(__file__), 'VERSION')).readline().rstrip() +requirements = open(os.path.join(os.path.dirname( + __file__), 'requirements.txt')).readlines() +version = open(os.path.join(os.path.dirname( + __file__), 'VERSION')).readline().rstrip() setup( - name = "CLASHChimeras", - version = version, - author = "Kashyap Chhatbar", - author_email = "kashyap.c@ed.ac.uk", - description = "Python package to find chimeras in CRAC/CLASH and" - " HITS-CLIP datasets", - install_requires=requirements, - url = 'https://github.com/kashyapchhatbar/CLASHChimeras', + name="CLASHChimeras", + version=version, + author="Kashyap Chhatbar", + author_email="kashyap.c@ed.ac.uk", + description="Python package to find chimeras in CRAC/CLASH and" + " HITS-CLIP datasets", + install_requires=requirements, + url='https://github.com/kashyapchhatbar/CLASHChimeras', - classifiers=['Development Status :: 4 - Beta', + classifiers=['Development Status :: 4 - Beta', - 'Topic :: Scientific/Engineering :: Bio-Informatics', - 'Topic :: Scientific/Engineering :: Medical Science Apps.', + 'Topic :: Scientific/Engineering :: Bio-Informatics', + 'Topic :: Scientific/Engineering :: Medical Science Apps.', - 'License :: OSI Approved :: MIT License', + 'License :: OSI Approved :: MIT License', - 'Environment :: Console', - 'Intended Audience :: Science/Research', - 'Intended Audience :: Developers', + 'Environment :: Console', + 'Intended Audience :: Science/Research', + 'Intended Audience :: Developers', - 'Operating System :: POSIX :: Linux', - 'Operating System :: MacOS', + 'Operating System :: POSIX :: Linux', + 'Operating System :: MacOS', - 'Programming Language :: Python :: 3 :: Only', - ], + 'Programming Language :: Python :: 3 :: Only', + ], - keywords='clash chimeras hybrids hits-clip bioinformatics', + keywords='clash chimeras hybrids hits-clip bioinformatics', - license='MIT', + license='MIT', - py_modules = ['clashchimeras.align', + py_modules=['clashchimeras.align', 'clashchimeras.download', 'clashchimeras.find', 'clashchimeras.initialize', @@ -62,7 +64,7 @@ 'clashchimeras.parsers', 'clashchimeras.runners'], - scripts = ['scripts/align-for-chimeras', + scripts=['scripts/align-for-chimeras', 'scripts/download-for-chimeras', 'scripts/find-chimeras']