From ddb645561dcb5a4f3a28a13e9bac468924e1696c Mon Sep 17 00:00:00 2001 From: Hasan Celik Date: Mon, 28 Jan 2019 15:10:18 +0100 Subject: [PATCH] bug fix #139 and refactoring --- MMSplice/deltaLogitPSI/dataloader.py | 161 +----------------- MMSplice/deltaLogitPSI/dataloader.yaml | 9 +- MMSplice/deltaLogitPSI/model.py | 12 +- MMSplice/deltaLogitPSI/model.yaml | 1 + MMSplice/modularPredictions/dataloader.py | 162 ++---------------- MMSplice/modularPredictions/dataloader.yaml | 9 +- MMSplice/modularPredictions/model.py | 9 +- MMSplice/modularPredictions/model.yaml | 1 + MMSplice/modules/acceptor/dataloader.py | 66 ++++---- MMSplice/modules/donor/dataloader.py | 57 +++---- MMSplice/modules/exon_3prime/dataloader.py | 66 ++++---- MMSplice/modules/exon_5prime/dataloader.py | 66 ++++---- MMSplice/modules/intron_3prime/dataloader.py | 66 ++++---- MMSplice/modules/intron_5prime/dataloader.py | 66 ++++---- MMSplice/pathogenicity/dataloader.py | 163 ++----------------- MMSplice/pathogenicity/dataloader.yaml | 9 +- MMSplice/pathogenicity/model.py | 18 +- MMSplice/pathogenicity/model.yaml | 1 + MMSplice/splicingEfficiency/dataloader.py | 162 ++---------------- MMSplice/splicingEfficiency/dataloader.yaml | 9 +- MMSplice/splicingEfficiency/model.py | 17 +- MMSplice/splicingEfficiency/model.yaml | 1 + 22 files changed, 272 insertions(+), 859 deletions(-) diff --git a/MMSplice/deltaLogitPSI/dataloader.py b/MMSplice/deltaLogitPSI/dataloader.py index f2667861b..5697d00a4 100644 --- a/MMSplice/deltaLogitPSI/dataloader.py +++ b/MMSplice/deltaLogitPSI/dataloader.py @@ -1,14 +1,5 @@ -import numpy as np -from kipoi.data import SampleIterator - -import pickle -from pyfaidx import Fasta -from cyvcf2 import VCF -from concise.preprocessing import encodeDNA -import warnings - from mmsplice import MMSplice -from mmsplice.vcf_dataloader import GenerateExonIntervalTree, VariantInterval, get_var_side +from mmsplice.vcf_dataloader import SplicingVCFDataloader as BaseSplicingVCFDataloader model = MMSplice( exon_cut_l=0, @@ -20,148 +11,12 @@ donor_exon_len=5, donor_intron_len=13) -class SplicingVCFDataloader(SampleIterator): - """ - Load genome annotation (gtf) file along with a vcf file, return wt sequence and mut sequence. - Args: - gtf: gtf file or pickled gtf IntervalTree. - fasta_file: file path; Genome sequence - vcf_file: file path; vcf file with variants to score - """ - - def __init__(self, - gtf_file, - fasta_file, - vcf_file, - split_seq=False, - encode=True, - exon_cut_l=0, - exon_cut_r=0, - acceptor_intron_cut=6, - donor_intron_cut=6, - acceptor_intron_len=50, - acceptor_exon_len=3, - donor_exon_len=5, - donor_intron_len=13, - **kwargs - ): - try: - with open(gtf, 'rb') as f: - self.exons = pickle.load(f) - except: - self.exons = GenerateExonIntervalTree(gtf_file, **kwargs) - import six - if isinstance(fasta_file, six.string_types): - fasta = Fasta(fasta_file, as_raw=False) - self.fasta = fasta - self.ssGenerator = self.spliceSiteGenerator(vcf_file, self.exons) - - self.encode = encode - self.split_seq = split_seq - self.exon_cut_l = exon_cut_l - self.exon_cut_r = exon_cut_r - self.acceptor_intron_cut = acceptor_intron_cut - self.donor_intron_cut = donor_intron_cut - self.acceptor_intron_len = acceptor_intron_len - self.acceptor_exon_len = acceptor_exon_len - self.donor_exon_len = donor_exon_len - self.donor_intron_len = donor_intron_len - - @staticmethod - def spliceSiteGenerator(vcf_file, exonTree, variant_filter=True): - variants = VCF(vcf_file) - for var in variants: - if variant_filter and var.FILTER: - next - iv = VariantInterval.from_Variant(var) - - matches = map(lambda x: x.interval, - exonTree.intersect(iv, ignore_strand=True)) - - for match in matches: - side = get_var_side(( - var.POS, - var.REF, - var.ALT, - match.Exon_Start, - match.Exon_End, - match.strand - )) - var = iv.to_Variant(match.strand, side) # to my Variant class - yield match, var - - def __iter__(self): - return self +class SplicingVCFDataloader(BaseSplicingVCFDataloader): def __next__(self): - ss, var = next(self.ssGenerator) - out = {} - x = {} - x['inputs'] = {} - x['inputs_mut'] = {} - seq = ss.get_seq(self.fasta).upper() - mut_seq = ss.get_mut_seq(self.fasta, var).upper() - if self.split_seq: - seq = self.split(seq, ss.overhang) - mut_seq = self.split(mut_seq, ss.overhang) - x['inputs']['seq'] = seq - x['inputs_mut']['seq'] = mut_seq - x['inputs']['intronl_len'] = ss.overhang[0] - x['inputs']['intronr_len'] = ss.overhang[1] - x['inputs_mut']['intronl_len'] = ss.overhang[0] - x['inputs_mut']['intronr_len'] = ss.overhang[1] - - out['inputs'] = (model.predict(x['inputs_mut']) - model.predict(x['inputs'])).values - - out['metadata'] = {} - out['metadata']['ranges'] = ss.grange - out['metadata']['variant'] = var.to_dict - out['metadata']['ExonInterval'] = ss.to_dict # so that np collate will work - out['metadata']['annotation'] = str(ss) - return out - - def batch_predict_iter(self, **kwargs): - """Returns samples directly useful for prediction x["inputs"] - Args: - **kwargs: Arguments passed to self.batch_iter(**kwargs) - """ - return (x for x in self.batch_iter(**kwargs)) - - def split(self, x, overhang): - ''' x: a sequence to split - ''' - intronl_len, intronr_len = overhang - lackl = self.acceptor_intron_len - intronl_len # need to pad N if left seq not enough long - if lackl >= 0: - x = "N"*(lackl+1) + x - intronl_len += lackl+1 - lackr = self.donor_intron_len - intronr_len - if lackr >= 0: - x = x + "N"*(lackr+1) - intronr_len += lackr + 1 - acceptor_intron = x[:intronl_len-self.acceptor_intron_cut] - acceptor = x[(intronl_len-self.acceptor_intron_len) : (intronl_len+self.acceptor_exon_len)] - exon = x[(intronl_len+self.exon_cut_l) : (-intronr_len-self.exon_cut_r)] - donor = x[(-intronr_len-self.donor_exon_len) : (-intronr_len+self.donor_intron_len)] - donor_intron = x[-intronr_len+self.donor_intron_cut:] - if donor[self.donor_exon_len:self.donor_exon_len+2] != "GT": - warnings.warn("None GT donor", UserWarning) - if acceptor[self.acceptor_intron_len-2:self.acceptor_intron_len] != "AG": - warnings.warn("None AG donor", UserWarning) - - if self.encode: - return { - "acceptor_intron": encodeDNA([acceptor_intron]), - "acceptor": encodeDNA([acceptor]), - "exon": encodeDNA([exon]), - "donor": encodeDNA([donor]), - "donor_intron": encodeDNA([donor_intron]) - } - else: - return { - "acceptor_intron": acceptor_intron, - "acceptor": acceptor, - "exon": exon, - "donor": donor, - "donor_intron": donor_intron - } + super_out = super().__next__() + return { + 'inputs': (model.predict(super_out['inputs_mut']) - + model.predict(super_out['inputs'])).values, + 'metadata': super_out['metadata'] + } diff --git a/MMSplice/deltaLogitPSI/dataloader.yaml b/MMSplice/deltaLogitPSI/dataloader.yaml index dc0d09674..bf7f938d8 100644 --- a/MMSplice/deltaLogitPSI/dataloader.yaml +++ b/MMSplice/deltaLogitPSI/dataloader.yaml @@ -1,7 +1,7 @@ type: SampleIterator defined_as: dataloader.SplicingVCFDataloader args: - gtf_file: + gtf: doc: path to the GTF file required by the models (Ensemble) example: url: https://sandbox.zenodo.org/record/248604/files/test.gtf?download=1 @@ -19,6 +19,9 @@ args: split_seq: doc: Whether split the sequence in dataloader optional: True + variant_filter: + doc: If set True (default), variants with `FILTER` field other than `PASS` will be filtered out. + optional: True encode: doc: If split the sequence, whether one hot encoding optional: True @@ -73,7 +76,7 @@ dependencies: - python=3.5 pip: - kipoi - - mmsplice + - mmsplice>=0.2.7 output_schema: inputs: shape: (5, ) @@ -157,4 +160,4 @@ output_schema: doc: genomic end position of the retrieved sequence annotation: type: str - doc: retrieved sequence name \ No newline at end of file + doc: retrieved sequence name diff --git a/MMSplice/deltaLogitPSI/model.py b/MMSplice/deltaLogitPSI/model.py index 7ef128612..698ade5bf 100644 --- a/MMSplice/deltaLogitPSI/model.py +++ b/MMSplice/deltaLogitPSI/model.py @@ -1,16 +1,10 @@ from kipoi.model import BaseModel -import numpy as np from mmsplice import LINEAR_MODEL from mmsplice.utils.postproc import transform -# Model to predict delta logit PSI + class MMSpliceModel(BaseModel): - - def __init__(self): + '''Model to predict delta logit PSI''' - self.model = LINEAR_MODEL - def predict_on_batch(self, inputs): - X = transform(inputs, False) - pred = self.model.predict(X) - return pred \ No newline at end of file + return LINEAR_MODEL.predict(transform(inputs, False)) diff --git a/MMSplice/deltaLogitPSI/model.yaml b/MMSplice/deltaLogitPSI/model.yaml index c69744f52..80e6b6cfd 100644 --- a/MMSplice/deltaLogitPSI/model.yaml +++ b/MMSplice/deltaLogitPSI/model.yaml @@ -18,6 +18,7 @@ dependencies: - numpy pip: - scikit-learn + - mmsplice>=0.2.7 schema: inputs: shape: (5, ) diff --git a/MMSplice/modularPredictions/dataloader.py b/MMSplice/modularPredictions/dataloader.py index e54f73c4e..eda944f33 100644 --- a/MMSplice/modularPredictions/dataloader.py +++ b/MMSplice/modularPredictions/dataloader.py @@ -1,14 +1,6 @@ import numpy as np -from kipoi.data import SampleIterator - -import pickle -from pyfaidx import Fasta -from cyvcf2 import VCF -from concise.preprocessing import encodeDNA -import warnings - from mmsplice import MMSplice -from mmsplice.vcf_dataloader import GenerateExonIntervalTree, VariantInterval, get_var_side +from mmsplice.vcf_dataloader import SplicingVCFDataloader as BaseSplicingVCFDataloader model = MMSplice( exon_cut_l=0, @@ -20,148 +12,14 @@ donor_exon_len=5, donor_intron_len=13) -class SplicingVCFDataloader(SampleIterator): - """ - Load genome annotation (gtf) file along with a vcf file, return wt sequence and mut sequence. - Args: - gtf: gtf file or pickled gtf IntervalTree. - fasta_file: file path; Genome sequence - vcf_file: file path; vcf file with variants to score - """ - - def __init__(self, - gtf_file, - fasta_file, - vcf_file, - split_seq=False, - encode=True, - exon_cut_l=0, - exon_cut_r=0, - acceptor_intron_cut=6, - donor_intron_cut=6, - acceptor_intron_len=50, - acceptor_exon_len=3, - donor_exon_len=5, - donor_intron_len=13, - **kwargs - ): - try: - with open(gtf, 'rb') as f: - self.exons = pickle.load(f) - except: - self.exons = GenerateExonIntervalTree(gtf_file, **kwargs) - import six - if isinstance(fasta_file, six.string_types): - fasta = Fasta(fasta_file, as_raw=False) - self.fasta = fasta - self.ssGenerator = self.spliceSiteGenerator(vcf_file, self.exons) - - self.encode = encode - self.split_seq = split_seq - self.exon_cut_l = exon_cut_l - self.exon_cut_r = exon_cut_r - self.acceptor_intron_cut = acceptor_intron_cut - self.donor_intron_cut = donor_intron_cut - self.acceptor_intron_len = acceptor_intron_len - self.acceptor_exon_len = acceptor_exon_len - self.donor_exon_len = donor_exon_len - self.donor_intron_len = donor_intron_len - - @staticmethod - def spliceSiteGenerator(vcf_file, exonTree, variant_filter=True): - variants = VCF(vcf_file) - for var in variants: - if variant_filter and var.FILTER: - next - iv = VariantInterval.from_Variant(var) - - matches = map(lambda x: x.interval, - exonTree.intersect(iv, ignore_strand=True)) - - for match in matches: - side = get_var_side(( - var.POS, - var.REF, - var.ALT, - match.Exon_Start, - match.Exon_End, - match.strand - )) - var = iv.to_Variant(match.strand, side) # to my Variant class - yield match, var - - def __iter__(self): - return self +class SplicingVCFDataloader(BaseSplicingVCFDataloader): def __next__(self): - ss, var = next(self.ssGenerator) - out = {} - x = {} - x['inputs'] = {} - x['inputs_mut'] = {} - seq = ss.get_seq(self.fasta).upper() - mut_seq = ss.get_mut_seq(self.fasta, var).upper() - if self.split_seq: - seq = self.split(seq, ss.overhang) - mut_seq = self.split(mut_seq, ss.overhang) - x['inputs']['seq'] = seq - x['inputs_mut']['seq'] = mut_seq - x['inputs']['intronl_len'] = ss.overhang[0] - x['inputs']['intronr_len'] = ss.overhang[1] - x['inputs_mut']['intronl_len'] = ss.overhang[0] - x['inputs_mut']['intronr_len'] = ss.overhang[1] - - out['inputs'] = np.concatenate([model.predict(x['inputs_mut']).values, model.predict(x['inputs']).values]) - - out['metadata'] = {} - out['metadata']['ranges'] = ss.grange - out['metadata']['variant'] = var.to_dict - out['metadata']['ExonInterval'] = ss.to_dict # so that np collate will work - out['metadata']['annotation'] = str(ss) - return out - - def batch_predict_iter(self, **kwargs): - """Returns samples directly useful for prediction x["inputs"] - Args: - **kwargs: Arguments passed to self.batch_iter(**kwargs) - """ - return (x for x in self.batch_iter(**kwargs)) - - def split(self, x, overhang): - ''' x: a sequence to split - ''' - intronl_len, intronr_len = overhang - lackl = self.acceptor_intron_len - intronl_len # need to pad N if left seq not enough long - if lackl >= 0: - x = "N"*(lackl+1) + x - intronl_len += lackl+1 - lackr = self.donor_intron_len - intronr_len - if lackr >= 0: - x = x + "N"*(lackr+1) - intronr_len += lackr + 1 - acceptor_intron = x[:intronl_len-self.acceptor_intron_cut] - acceptor = x[(intronl_len-self.acceptor_intron_len) : (intronl_len+self.acceptor_exon_len)] - exon = x[(intronl_len+self.exon_cut_l) : (-intronr_len-self.exon_cut_r)] - donor = x[(-intronr_len-self.donor_exon_len) : (-intronr_len+self.donor_intron_len)] - donor_intron = x[-intronr_len+self.donor_intron_cut:] - if donor[self.donor_exon_len:self.donor_exon_len+2] != "GT": - warnings.warn("None GT donor", UserWarning) - if acceptor[self.acceptor_intron_len-2:self.acceptor_intron_len] != "AG": - warnings.warn("None AG donor", UserWarning) - - if self.encode: - return { - "acceptor_intron": encodeDNA([acceptor_intron]), - "acceptor": encodeDNA([acceptor]), - "exon": encodeDNA([exon]), - "donor": encodeDNA([donor]), - "donor_intron": encodeDNA([donor_intron]) - } - else: - return { - "acceptor_intron": acceptor_intron, - "acceptor": acceptor, - "exon": exon, - "donor": donor, - "donor_intron": donor_intron - } + super_out = super().__next__() + return { + 'inputs': np.concatenate([ + model.predict(super_out['inputs_mut']).values, + model.predict(super_out['inputs']).values + ]), + 'metadata': super_out['metadata'] + } diff --git a/MMSplice/modularPredictions/dataloader.yaml b/MMSplice/modularPredictions/dataloader.yaml index 5405eeca5..5bb51792c 100644 --- a/MMSplice/modularPredictions/dataloader.yaml +++ b/MMSplice/modularPredictions/dataloader.yaml @@ -1,7 +1,7 @@ type: SampleIterator defined_as: dataloader.SplicingVCFDataloader args: - gtf_file: + gtf: doc: path to the GTF file required by the models (Ensemble) example: url: https://sandbox.zenodo.org/record/248604/files/test.gtf?download=1 @@ -19,6 +19,9 @@ args: split_seq: doc: Whether split the sequence in dataloader optional: True + variant_filter: + doc: If set True (default), variants with `FILTER` field other than `PASS` will be filtered out. + optional: True encode: doc: If split the sequence, whether one hot encoding optional: True @@ -73,7 +76,7 @@ dependencies: - python=3.5 pip: - kipoi - - mmsplice + - mmsplice>=0.2.7 output_schema: inputs: shape: (10, ) @@ -157,4 +160,4 @@ output_schema: doc: genomic end position of the retrieved sequence annotation: type: str - doc: retrieved sequence name \ No newline at end of file + doc: retrieved sequence name diff --git a/MMSplice/modularPredictions/model.py b/MMSplice/modularPredictions/model.py index 7e61b0103..a9bebc561 100644 --- a/MMSplice/modularPredictions/model.py +++ b/MMSplice/modularPredictions/model.py @@ -1,9 +1,8 @@ from kipoi.model import BaseModel -import sklearn -import numpy as np -# Directly give the modular predictions + class MMSpliceModel(BaseModel): - + '''Directly give the modular predictions''' + def predict_on_batch(self, inputs): - return inputs \ No newline at end of file + return inputs diff --git a/MMSplice/modularPredictions/model.yaml b/MMSplice/modularPredictions/model.yaml index fbcbf3501..3da91334b 100644 --- a/MMSplice/modularPredictions/model.yaml +++ b/MMSplice/modularPredictions/model.yaml @@ -18,6 +18,7 @@ dependencies: - numpy pip: - scikit-learn + - mmsplice>=0.2.7 schema: inputs: shape: (10, ) diff --git a/MMSplice/modules/acceptor/dataloader.py b/MMSplice/modules/acceptor/dataloader.py index 5c477b5cc..72516e89d 100644 --- a/MMSplice/modules/acceptor/dataloader.py +++ b/MMSplice/modules/acceptor/dataloader.py @@ -1,16 +1,13 @@ """Dataloader """ -import numpy as np -from kipoi.data import SampleIterator import pickle -from pyfaidx import Fasta +import warnings import gffutils +from pyfaidx import Fasta from concise.preprocessing import encodeDNA -import warnings +from kipoi.data import SampleIterator from kipoi.metadata import GenomicRanges - -from mmsplice import MMSplice from mmsplice.vcf_dataloader import ExonInterval @@ -40,15 +37,15 @@ def __init__(self, **kwargs ): try: - with open(gtf, 'rb') as f: + with open(gtf_file, 'rb') as f: self.exons = pickle.load(f) - except: + except (FileNotFoundError, pickle.UnpicklingError): self.exonGenerator = self.GenerateExons(gtf_file, **kwargs) import six if isinstance(fasta_file, six.string_types): fasta = Fasta(fasta_file, as_raw=False) self.fasta = fasta - + self.encode = encode self.split_seq = split_seq self.exon_cut_l = exon_cut_l @@ -63,34 +60,34 @@ def __init__(self, @staticmethod def GenerateExons( - gtf_file, - overhang=(100, 100), - gtf_db_path=":memory:", - disable_infer_transcripts = True, - disable_infer_genes = True, - firstLastNoExtend=True): + gtf_file, + overhang=(100, 100), + gtf_db_path=":memory:", + disable_infer_transcripts=True, + disable_infer_genes=True, + firstLastNoExtend=True): ''' Generate EexonInterval objects from gtf file ''' try: gtf_db = gffutils.interface.FeatureDB(gtf_db_path) except: gtf_db = gffutils.create_db(gtf_file, - gtf_db_path, - disable_infer_transcripts = disable_infer_transcripts, - disable_infer_genes = disable_infer_genes) - genes=gtf_db.features_of_type('gene') + gtf_db_path, + disable_infer_transcripts=disable_infer_transcripts, + disable_infer_genes=disable_infer_genes) + genes = gtf_db.features_of_type('gene') default_overhang = overhang for gene in genes: - for exon in gtf_db.children(gene, featuretype = 'exon'): - isLast = False # track whether is last exon + for exon in gtf_db.children(gene, featuretype='exon'): + isLast = False # track whether is last exon if firstLastNoExtend: if (gene.strand == "+" and exon.end == gene.end) or (gene.strand == "-" and exon.start == gene.start): - overhang = (overhang[0],0) + overhang = (overhang[0], 0) isLast = True elif (gene.strand == "+" and exon.start == gene.start) or (gene.strand == "-" and exon.end == gene.end): - #int(exon.attributes['exon_number'][0]) == 1: - overhang = (0,overhang[1]) - iv=ExonInterval.from_Feature(exon, overhang) + # int(exon.attributes['exon_number'][0]) == 1: + overhang = (0, overhang[1]) + iv = ExonInterval.from_Feature(exon, overhang) iv.isLast = isLast overhang = default_overhang yield iv @@ -106,29 +103,30 @@ def __next__(self): if self.split_seq: seq = self.split(seq, ss.overhang)['acceptor'][0] out['inputs']['ss'] = seq - + out['metadata'] = {} out['metadata']['ranges'] = GenomicRanges( - ss.chrom, + ss.chrom, ss.Exon_Start, ss.Exon_End, ss.transcript_id, ss.strand) return out - + def batch_predict_iter(self, **kwargs): """Returns samples directly useful for prediction x["inputs"] Args: **kwargs: Arguments passed to self.batch_iter(**kwargs) """ return (x for x in self.batch_iter(**kwargs)) - + def split(self, x, overhang): ''' x: a sequence to split ''' intronl_len, intronr_len = overhang - lackl = self.acceptor_intron_len - intronl_len # need to pad N if left seq not enough long + # need to pad N if left seq not enough long + lackl = self.acceptor_intron_len - intronl_len if lackl >= 0: x = "N"*(lackl+1) + x intronl_len += lackl+1 @@ -137,16 +135,16 @@ def split(self, x, overhang): x = x + "N"*(lackr+1) intronr_len += lackr + 1 acceptor_intron = x[:intronl_len-self.acceptor_intron_cut] - acceptor = x[(intronl_len-self.acceptor_intron_len) : (intronl_len+self.acceptor_exon_len)] - exon = x[(intronl_len+self.exon_cut_l) : (-intronr_len-self.exon_cut_r)] - donor = x[(-intronr_len-self.donor_exon_len) : (-intronr_len+self.donor_intron_len)] + acceptor = x[(intronl_len-self.acceptor_intron_len) : (intronl_len+self.acceptor_exon_len)] + exon = x[(intronl_len+self.exon_cut_l): (-intronr_len-self.exon_cut_r)] + donor = x[(-intronr_len-self.donor_exon_len) : (-intronr_len+self.donor_intron_len)] donor_intron = x[-intronr_len+self.donor_intron_cut:] if donor[self.donor_exon_len:self.donor_exon_len+2] != "GT": warnings.warn("None GT donor", UserWarning) if acceptor[self.acceptor_intron_len-2:self.acceptor_intron_len] != "AG": warnings.warn("None AG donor", UserWarning) - if self.encode: + if self.encode: return { "acceptor_intron": encodeDNA([acceptor_intron]), "acceptor": encodeDNA([acceptor], maxlen=53), diff --git a/MMSplice/modules/donor/dataloader.py b/MMSplice/modules/donor/dataloader.py index a25b09ad3..71022d878 100644 --- a/MMSplice/modules/donor/dataloader.py +++ b/MMSplice/modules/donor/dataloader.py @@ -40,15 +40,15 @@ def __init__(self, **kwargs ): try: - with open(gtf, 'rb') as f: + with open(gtf_file, 'rb') as f: self.exons = pickle.load(f) - except: + except (FileNotFoundError, pickle.UnpicklingError): self.exonGenerator = self.GenerateExons(gtf_file, **kwargs) import six if isinstance(fasta_file, six.string_types): fasta = Fasta(fasta_file, as_raw=False) self.fasta = fasta - + self.encode = encode self.split_seq = split_seq self.exon_cut_l = exon_cut_l @@ -63,34 +63,34 @@ def __init__(self, @staticmethod def GenerateExons( - gtf_file, - overhang=(100, 100), - gtf_db_path=":memory:", - disable_infer_transcripts = True, - disable_infer_genes = True, - firstLastNoExtend=True): + gtf_file, + overhang=(100, 100), + gtf_db_path=":memory:", + disable_infer_transcripts=True, + disable_infer_genes=True, + firstLastNoExtend=True): ''' Generate EexonInterval objects from gtf file ''' try: gtf_db = gffutils.interface.FeatureDB(gtf_db_path) except: gtf_db = gffutils.create_db(gtf_file, - gtf_db_path, - disable_infer_transcripts = disable_infer_transcripts, - disable_infer_genes = disable_infer_genes) - genes=gtf_db.features_of_type('gene') + gtf_db_path, + disable_infer_transcripts=disable_infer_transcripts, + disable_infer_genes=disable_infer_genes) + genes = gtf_db.features_of_type('gene') default_overhang = overhang for gene in genes: - for exon in gtf_db.children(gene, featuretype = 'exon'): - isLast = False # track whether is last exon + for exon in gtf_db.children(gene, featuretype='exon'): + isLast = False # track whether is last exon if firstLastNoExtend: if (gene.strand == "+" and exon.end == gene.end) or (gene.strand == "-" and exon.start == gene.start): - overhang = (overhang[0],0) + overhang = (overhang[0], 0) isLast = True elif (gene.strand == "+" and exon.start == gene.start) or (gene.strand == "-" and exon.end == gene.end): - #int(exon.attributes['exon_number'][0]) == 1: - overhang = (0,overhang[1]) - iv=ExonInterval.from_Feature(exon, overhang) + # int(exon.attributes['exon_number'][0]) == 1: + overhang = (0, overhang[1]) + iv = ExonInterval.from_Feature(exon, overhang) iv.isLast = isLast overhang = default_overhang yield iv @@ -106,29 +106,30 @@ def __next__(self): if self.split_seq: seq = self.split(seq, ss.overhang)['donor'][0] out['inputs']['ss'] = seq - + out['metadata'] = {} out['metadata']['ranges'] = GenomicRanges( - ss.chrom, + ss.chrom, ss.Exon_Start, ss.Exon_End, ss.transcript_id, ss.strand) return out - + def batch_predict_iter(self, **kwargs): """Returns samples directly useful for prediction x["inputs"] Args: **kwargs: Arguments passed to self.batch_iter(**kwargs) """ return (x for x in self.batch_iter(**kwargs)) - + def split(self, x, overhang): ''' x: a sequence to split ''' intronl_len, intronr_len = overhang - lackl = self.acceptor_intron_len - intronl_len # need to pad N if left seq not enough long + # need to pad N if left seq not enough long + lackl = self.acceptor_intron_len - intronl_len if lackl >= 0: x = "N"*(lackl+1) + x intronl_len += lackl+1 @@ -137,16 +138,16 @@ def split(self, x, overhang): x = x + "N"*(lackr+1) intronr_len += lackr + 1 acceptor_intron = x[:intronl_len-self.acceptor_intron_cut] - acceptor = x[(intronl_len-self.acceptor_intron_len) : (intronl_len+self.acceptor_exon_len)] - exon = x[(intronl_len+self.exon_cut_l) : (-intronr_len-self.exon_cut_r)] - donor = x[(-intronr_len-self.donor_exon_len) : (-intronr_len+self.donor_intron_len)] + acceptor = x[(intronl_len-self.acceptor_intron_len) : (intronl_len+self.acceptor_exon_len)] + exon = x[(intronl_len+self.exon_cut_l): (-intronr_len-self.exon_cut_r)] + donor = x[(-intronr_len-self.donor_exon_len) : (-intronr_len+self.donor_intron_len)] donor_intron = x[-intronr_len+self.donor_intron_cut:] if donor[self.donor_exon_len:self.donor_exon_len+2] != "GT": warnings.warn("None GT donor", UserWarning) if acceptor[self.acceptor_intron_len-2:self.acceptor_intron_len] != "AG": warnings.warn("None AG donor", UserWarning) - if self.encode: + if self.encode: return { "acceptor_intron": encodeDNA([acceptor_intron]), "acceptor": encodeDNA([acceptor]), diff --git a/MMSplice/modules/exon_3prime/dataloader.py b/MMSplice/modules/exon_3prime/dataloader.py index d660ab6fc..d34fc39a7 100644 --- a/MMSplice/modules/exon_3prime/dataloader.py +++ b/MMSplice/modules/exon_3prime/dataloader.py @@ -1,16 +1,13 @@ """Dataloader """ -import numpy as np -from kipoi.data import SampleIterator import pickle -from pyfaidx import Fasta +import warnings import gffutils +from pyfaidx import Fasta from concise.preprocessing import encodeDNA -import warnings from kipoi.metadata import GenomicRanges - -from mmsplice import MMSplice +from kipoi.data import SampleIterator from mmsplice.vcf_dataloader import ExonInterval @@ -40,15 +37,15 @@ def __init__(self, **kwargs ): try: - with open(gtf, 'rb') as f: + with open(gtf_file, 'rb') as f: self.exons = pickle.load(f) - except: + except (FileNotFoundError, pickle.UnpicklingError): self.exonGenerator = self.GenerateExons(gtf_file, **kwargs) import six if isinstance(fasta_file, six.string_types): fasta = Fasta(fasta_file, as_raw=False) self.fasta = fasta - + self.encode = encode self.split_seq = split_seq self.exon_cut_l = exon_cut_l @@ -63,34 +60,34 @@ def __init__(self, @staticmethod def GenerateExons( - gtf_file, - overhang=(100, 100), - gtf_db_path=":memory:", - disable_infer_transcripts = True, - disable_infer_genes = True, - firstLastNoExtend=True): + gtf_file, + overhang=(100, 100), + gtf_db_path=":memory:", + disable_infer_transcripts=True, + disable_infer_genes=True, + firstLastNoExtend=True): ''' Generate EexonInterval objects from gtf file ''' try: gtf_db = gffutils.interface.FeatureDB(gtf_db_path) except: gtf_db = gffutils.create_db(gtf_file, - gtf_db_path, - disable_infer_transcripts = disable_infer_transcripts, - disable_infer_genes = disable_infer_genes) - genes=gtf_db.features_of_type('gene') + gtf_db_path, + disable_infer_transcripts=disable_infer_transcripts, + disable_infer_genes=disable_infer_genes) + genes = gtf_db.features_of_type('gene') default_overhang = overhang for gene in genes: - for exon in gtf_db.children(gene, featuretype = 'exon'): - isLast = False # track whether is last exon + for exon in gtf_db.children(gene, featuretype='exon'): + isLast = False # track whether is last exon if firstLastNoExtend: if (gene.strand == "+" and exon.end == gene.end) or (gene.strand == "-" and exon.start == gene.start): - overhang = (overhang[0],0) + overhang = (overhang[0], 0) isLast = True elif (gene.strand == "+" and exon.start == gene.start) or (gene.strand == "-" and exon.end == gene.end): - #int(exon.attributes['exon_number'][0]) == 1: - overhang = (0,overhang[1]) - iv=ExonInterval.from_Feature(exon, overhang) + # int(exon.attributes['exon_number'][0]) == 1: + overhang = (0, overhang[1]) + iv = ExonInterval.from_Feature(exon, overhang) iv.isLast = isLast overhang = default_overhang yield iv @@ -106,29 +103,30 @@ def __next__(self): if self.split_seq: seq = self.split(seq, ss.overhang)['exon'][0] out['inputs']['input_4'] = seq - + out['metadata'] = {} out['metadata']['ranges'] = GenomicRanges( - ss.chrom, + ss.chrom, ss.Exon_Start, ss.Exon_End, ss.transcript_id, ss.strand) return out - + def batch_predict_iter(self, **kwargs): """Returns samples directly useful for prediction x["inputs"] Args: **kwargs: Arguments passed to self.batch_iter(**kwargs) """ return (x for x in self.batch_iter(**kwargs)) - + def split(self, x, overhang): ''' x: a sequence to split ''' intronl_len, intronr_len = overhang - lackl = self.acceptor_intron_len - intronl_len # need to pad N if left seq not enough long + # need to pad N if left seq not enough long + lackl = self.acceptor_intron_len - intronl_len if lackl >= 0: x = "N"*(lackl+1) + x intronl_len += lackl+1 @@ -137,16 +135,16 @@ def split(self, x, overhang): x = x + "N"*(lackr+1) intronr_len += lackr + 1 acceptor_intron = x[:intronl_len-self.acceptor_intron_cut] - acceptor = x[(intronl_len-self.acceptor_intron_len) : (intronl_len+self.acceptor_exon_len)] - exon = x[(intronl_len+self.exon_cut_l) : (-intronr_len-self.exon_cut_r)] - donor = x[(-intronr_len-self.donor_exon_len) : (-intronr_len+self.donor_intron_len)] + acceptor = x[(intronl_len-self.acceptor_intron_len) : (intronl_len+self.acceptor_exon_len)] + exon = x[(intronl_len+self.exon_cut_l): (-intronr_len-self.exon_cut_r)] + donor = x[(-intronr_len-self.donor_exon_len) : (-intronr_len+self.donor_intron_len)] donor_intron = x[-intronr_len+self.donor_intron_cut:] if donor[self.donor_exon_len:self.donor_exon_len+2] != "GT": warnings.warn("None GT donor", UserWarning) if acceptor[self.acceptor_intron_len-2:self.acceptor_intron_len] != "AG": warnings.warn("None AG donor", UserWarning) - if self.encode: + if self.encode: return { "acceptor_intron": encodeDNA([acceptor_intron]), "acceptor": encodeDNA([acceptor]), diff --git a/MMSplice/modules/exon_5prime/dataloader.py b/MMSplice/modules/exon_5prime/dataloader.py index c1859709d..618673e4d 100644 --- a/MMSplice/modules/exon_5prime/dataloader.py +++ b/MMSplice/modules/exon_5prime/dataloader.py @@ -1,16 +1,13 @@ """Dataloader """ -import numpy as np -from kipoi.data import SampleIterator import pickle -from pyfaidx import Fasta +import warnings import gffutils +from pyfaidx import Fasta from concise.preprocessing import encodeDNA -import warnings +from kipoi.data import SampleIterator from kipoi.metadata import GenomicRanges - -from mmsplice import MMSplice from mmsplice.vcf_dataloader import ExonInterval @@ -40,15 +37,15 @@ def __init__(self, **kwargs ): try: - with open(gtf, 'rb') as f: + with open(gtf_file, 'rb') as f: self.exons = pickle.load(f) - except: + except (FileNotFoundError, pickle.UnpicklingError): self.exonGenerator = self.GenerateExons(gtf_file, **kwargs) import six if isinstance(fasta_file, six.string_types): fasta = Fasta(fasta_file, as_raw=False) self.fasta = fasta - + self.encode = encode self.split_seq = split_seq self.exon_cut_l = exon_cut_l @@ -63,34 +60,34 @@ def __init__(self, @staticmethod def GenerateExons( - gtf_file, - overhang=(100, 100), - gtf_db_path=":memory:", - disable_infer_transcripts = True, - disable_infer_genes = True, - firstLastNoExtend=True): + gtf_file, + overhang=(100, 100), + gtf_db_path=":memory:", + disable_infer_transcripts=True, + disable_infer_genes=True, + firstLastNoExtend=True): ''' Generate EexonInterval objects from gtf file ''' try: gtf_db = gffutils.interface.FeatureDB(gtf_db_path) except: gtf_db = gffutils.create_db(gtf_file, - gtf_db_path, - disable_infer_transcripts = disable_infer_transcripts, - disable_infer_genes = disable_infer_genes) - genes=gtf_db.features_of_type('gene') + gtf_db_path, + disable_infer_transcripts=disable_infer_transcripts, + disable_infer_genes=disable_infer_genes) + genes = gtf_db.features_of_type('gene') default_overhang = overhang for gene in genes: - for exon in gtf_db.children(gene, featuretype = 'exon'): - isLast = False # track whether is last exon + for exon in gtf_db.children(gene, featuretype='exon'): + isLast = False # track whether is last exon if firstLastNoExtend: if (gene.strand == "+" and exon.end == gene.end) or (gene.strand == "-" and exon.start == gene.start): - overhang = (overhang[0],0) + overhang = (overhang[0], 0) isLast = True elif (gene.strand == "+" and exon.start == gene.start) or (gene.strand == "-" and exon.end == gene.end): - #int(exon.attributes['exon_number'][0]) == 1: - overhang = (0,overhang[1]) - iv=ExonInterval.from_Feature(exon, overhang) + # int(exon.attributes['exon_number'][0]) == 1: + overhang = (0, overhang[1]) + iv = ExonInterval.from_Feature(exon, overhang) iv.isLast = isLast overhang = default_overhang yield iv @@ -106,29 +103,30 @@ def __next__(self): if self.split_seq: seq = self.split(seq, ss.overhang)['exon'][0] out['inputs']['input_3'] = seq - + out['metadata'] = {} out['metadata']['ranges'] = GenomicRanges( - ss.chrom, + ss.chrom, ss.Exon_Start, ss.Exon_End, ss.transcript_id, ss.strand) return out - + def batch_predict_iter(self, **kwargs): """Returns samples directly useful for prediction x["inputs"] Args: **kwargs: Arguments passed to self.batch_iter(**kwargs) """ return (x for x in self.batch_iter(**kwargs)) - + def split(self, x, overhang): ''' x: a sequence to split ''' intronl_len, intronr_len = overhang - lackl = self.acceptor_intron_len - intronl_len # need to pad N if left seq not enough long + # need to pad N if left seq not enough long + lackl = self.acceptor_intron_len - intronl_len if lackl >= 0: x = "N"*(lackl+1) + x intronl_len += lackl+1 @@ -137,16 +135,16 @@ def split(self, x, overhang): x = x + "N"*(lackr+1) intronr_len += lackr + 1 acceptor_intron = x[:intronl_len-self.acceptor_intron_cut] - acceptor = x[(intronl_len-self.acceptor_intron_len) : (intronl_len+self.acceptor_exon_len)] - exon = x[(intronl_len+self.exon_cut_l) : (-intronr_len-self.exon_cut_r)] - donor = x[(-intronr_len-self.donor_exon_len) : (-intronr_len+self.donor_intron_len)] + acceptor = x[(intronl_len-self.acceptor_intron_len) : (intronl_len+self.acceptor_exon_len)] + exon = x[(intronl_len+self.exon_cut_l): (-intronr_len-self.exon_cut_r)] + donor = x[(-intronr_len-self.donor_exon_len) : (-intronr_len+self.donor_intron_len)] donor_intron = x[-intronr_len+self.donor_intron_cut:] if donor[self.donor_exon_len:self.donor_exon_len+2] != "GT": warnings.warn("None GT donor", UserWarning) if acceptor[self.acceptor_intron_len-2:self.acceptor_intron_len] != "AG": warnings.warn("None AG donor", UserWarning) - if self.encode: + if self.encode: return { "acceptor_intron": encodeDNA([acceptor_intron]), "acceptor": encodeDNA([acceptor]), diff --git a/MMSplice/modules/intron_3prime/dataloader.py b/MMSplice/modules/intron_3prime/dataloader.py index 31ffd35c6..514b14d87 100644 --- a/MMSplice/modules/intron_3prime/dataloader.py +++ b/MMSplice/modules/intron_3prime/dataloader.py @@ -1,16 +1,13 @@ """Dataloader """ -import numpy as np -from kipoi.data import SampleIterator import pickle -from pyfaidx import Fasta +import warnings import gffutils +from pyfaidx import Fasta from concise.preprocessing import encodeDNA -import warnings from kipoi.metadata import GenomicRanges - -from mmsplice import MMSplice +from kipoi.data import SampleIterator from mmsplice.vcf_dataloader import ExonInterval @@ -40,15 +37,15 @@ def __init__(self, **kwargs ): try: - with open(gtf, 'rb') as f: + with open(gtf_file, 'rb') as f: self.exons = pickle.load(f) - except: + except (FileNotFoundError, pickle.UnpicklingError): self.exonGenerator = self.GenerateExons(gtf_file, **kwargs) import six if isinstance(fasta_file, six.string_types): fasta = Fasta(fasta_file, as_raw=False) self.fasta = fasta - + self.encode = encode self.split_seq = split_seq self.exon_cut_l = exon_cut_l @@ -63,34 +60,34 @@ def __init__(self, @staticmethod def GenerateExons( - gtf_file, - overhang=(100, 100), - gtf_db_path=":memory:", - disable_infer_transcripts = True, - disable_infer_genes = True, - firstLastNoExtend=True): + gtf_file, + overhang=(100, 100), + gtf_db_path=":memory:", + disable_infer_transcripts=True, + disable_infer_genes=True, + firstLastNoExtend=True): ''' Generate EexonInterval objects from gtf file ''' try: gtf_db = gffutils.interface.FeatureDB(gtf_db_path) except: gtf_db = gffutils.create_db(gtf_file, - gtf_db_path, - disable_infer_transcripts = disable_infer_transcripts, - disable_infer_genes = disable_infer_genes) - genes=gtf_db.features_of_type('gene') + gtf_db_path, + disable_infer_transcripts=disable_infer_transcripts, + disable_infer_genes=disable_infer_genes) + genes = gtf_db.features_of_type('gene') default_overhang = overhang for gene in genes: - for exon in gtf_db.children(gene, featuretype = 'exon'): - isLast = False # track whether is last exon + for exon in gtf_db.children(gene, featuretype='exon'): + isLast = False # track whether is last exon if firstLastNoExtend: if (gene.strand == "+" and exon.end == gene.end) or (gene.strand == "-" and exon.start == gene.start): - overhang = (overhang[0],0) + overhang = (overhang[0], 0) isLast = True elif (gene.strand == "+" and exon.start == gene.start) or (gene.strand == "-" and exon.end == gene.end): - #int(exon.attributes['exon_number'][0]) == 1: - overhang = (0,overhang[1]) - iv=ExonInterval.from_Feature(exon, overhang) + # int(exon.attributes['exon_number'][0]) == 1: + overhang = (0, overhang[1]) + iv = ExonInterval.from_Feature(exon, overhang) iv.isLast = isLast overhang = default_overhang yield iv @@ -106,29 +103,30 @@ def __next__(self): if self.split_seq: seq = self.split(seq, ss.overhang)['acceptor_intron'][0] out['inputs']['input_5'] = seq - + out['metadata'] = {} out['metadata']['ranges'] = GenomicRanges( - ss.chrom, + ss.chrom, ss.Exon_Start, ss.Exon_End, ss.transcript_id, ss.strand) return out - + def batch_predict_iter(self, **kwargs): """Returns samples directly useful for prediction x["inputs"] Args: **kwargs: Arguments passed to self.batch_iter(**kwargs) """ return (x for x in self.batch_iter(**kwargs)) - + def split(self, x, overhang): ''' x: a sequence to split ''' intronl_len, intronr_len = overhang - lackl = self.acceptor_intron_len - intronl_len # need to pad N if left seq not enough long + # need to pad N if left seq not enough long + lackl = self.acceptor_intron_len - intronl_len if lackl >= 0: x = "N"*(lackl+1) + x intronl_len += lackl+1 @@ -137,16 +135,16 @@ def split(self, x, overhang): x = x + "N"*(lackr+1) intronr_len += lackr + 1 acceptor_intron = x[:intronl_len-self.acceptor_intron_cut] - acceptor = x[(intronl_len-self.acceptor_intron_len) : (intronl_len+self.acceptor_exon_len)] - exon = x[(intronl_len+self.exon_cut_l) : (-intronr_len-self.exon_cut_r)] - donor = x[(-intronr_len-self.donor_exon_len) : (-intronr_len+self.donor_intron_len)] + acceptor = x[(intronl_len-self.acceptor_intron_len) : (intronl_len+self.acceptor_exon_len)] + exon = x[(intronl_len+self.exon_cut_l): (-intronr_len-self.exon_cut_r)] + donor = x[(-intronr_len-self.donor_exon_len) : (-intronr_len+self.donor_intron_len)] donor_intron = x[-intronr_len+self.donor_intron_cut:] if donor[self.donor_exon_len:self.donor_exon_len+2] != "GT": warnings.warn("None GT donor", UserWarning) if acceptor[self.acceptor_intron_len-2:self.acceptor_intron_len] != "AG": warnings.warn("None AG donor", UserWarning) - if self.encode: + if self.encode: return { "acceptor_intron": encodeDNA([acceptor_intron], maxlen=94), "acceptor": encodeDNA([acceptor]), diff --git a/MMSplice/modules/intron_5prime/dataloader.py b/MMSplice/modules/intron_5prime/dataloader.py index 8595a327c..20fea55e5 100644 --- a/MMSplice/modules/intron_5prime/dataloader.py +++ b/MMSplice/modules/intron_5prime/dataloader.py @@ -1,16 +1,13 @@ """Dataloader """ -import numpy as np -from kipoi.data import SampleIterator import pickle -from pyfaidx import Fasta +import warnings import gffutils +from pyfaidx import Fasta from concise.preprocessing import encodeDNA -import warnings +from kipoi.data import SampleIterator from kipoi.metadata import GenomicRanges - -from mmsplice import MMSplice from mmsplice.vcf_dataloader import ExonInterval @@ -40,15 +37,15 @@ def __init__(self, **kwargs ): try: - with open(gtf, 'rb') as f: + with open(gtf_file, 'rb') as f: self.exons = pickle.load(f) - except: + except (FileNotFoundError, pickle.UnpicklingError): self.exonGenerator = self.GenerateExons(gtf_file, **kwargs) import six if isinstance(fasta_file, six.string_types): fasta = Fasta(fasta_file, as_raw=False) self.fasta = fasta - + self.encode = encode self.split_seq = split_seq self.exon_cut_l = exon_cut_l @@ -63,34 +60,34 @@ def __init__(self, @staticmethod def GenerateExons( - gtf_file, - overhang=(100, 100), - gtf_db_path=":memory:", - disable_infer_transcripts = True, - disable_infer_genes = True, - firstLastNoExtend=True): + gtf_file, + overhang=(100, 100), + gtf_db_path=":memory:", + disable_infer_transcripts=True, + disable_infer_genes=True, + firstLastNoExtend=True): ''' Generate EexonInterval objects from gtf file ''' try: gtf_db = gffutils.interface.FeatureDB(gtf_db_path) except: gtf_db = gffutils.create_db(gtf_file, - gtf_db_path, - disable_infer_transcripts = disable_infer_transcripts, - disable_infer_genes = disable_infer_genes) - genes=gtf_db.features_of_type('gene') + gtf_db_path, + disable_infer_transcripts=disable_infer_transcripts, + disable_infer_genes=disable_infer_genes) + genes = gtf_db.features_of_type('gene') default_overhang = overhang for gene in genes: - for exon in gtf_db.children(gene, featuretype = 'exon'): - isLast = False # track whether is last exon + for exon in gtf_db.children(gene, featuretype='exon'): + isLast = False # track whether is last exon if firstLastNoExtend: if (gene.strand == "+" and exon.end == gene.end) or (gene.strand == "-" and exon.start == gene.start): - overhang = (overhang[0],0) + overhang = (overhang[0], 0) isLast = True elif (gene.strand == "+" and exon.start == gene.start) or (gene.strand == "-" and exon.end == gene.end): - #int(exon.attributes['exon_number'][0]) == 1: - overhang = (0,overhang[1]) - iv=ExonInterval.from_Feature(exon, overhang) + # int(exon.attributes['exon_number'][0]) == 1: + overhang = (0, overhang[1]) + iv = ExonInterval.from_Feature(exon, overhang) iv.isLast = isLast overhang = default_overhang yield iv @@ -106,29 +103,30 @@ def __next__(self): if self.split_seq: seq = self.split(seq, ss.overhang)['donor_intron'][0] out['inputs']['input_15'] = seq - + out['metadata'] = {} out['metadata']['ranges'] = GenomicRanges( - ss.chrom, + ss.chrom, ss.Exon_Start, ss.Exon_End, ss.transcript_id, ss.strand) return out - + def batch_predict_iter(self, **kwargs): """Returns samples directly useful for prediction x["inputs"] Args: **kwargs: Arguments passed to self.batch_iter(**kwargs) """ return (x for x in self.batch_iter(**kwargs)) - + def split(self, x, overhang): ''' x: a sequence to split ''' intronl_len, intronr_len = overhang - lackl = self.acceptor_intron_len - intronl_len # need to pad N if left seq not enough long + # need to pad N if left seq not enough long + lackl = self.acceptor_intron_len - intronl_len if lackl >= 0: x = "N"*(lackl+1) + x intronl_len += lackl+1 @@ -137,16 +135,16 @@ def split(self, x, overhang): x = x + "N"*(lackr+1) intronr_len += lackr + 1 acceptor_intron = x[:intronl_len-self.acceptor_intron_cut] - acceptor = x[(intronl_len-self.acceptor_intron_len) : (intronl_len+self.acceptor_exon_len)] - exon = x[(intronl_len+self.exon_cut_l) : (-intronr_len-self.exon_cut_r)] - donor = x[(-intronr_len-self.donor_exon_len) : (-intronr_len+self.donor_intron_len)] + acceptor = x[(intronl_len-self.acceptor_intron_len) : (intronl_len+self.acceptor_exon_len)] + exon = x[(intronl_len+self.exon_cut_l): (-intronr_len-self.exon_cut_r)] + donor = x[(-intronr_len-self.donor_exon_len) : (-intronr_len+self.donor_intron_len)] donor_intron = x[-intronr_len+self.donor_intron_cut:] if donor[self.donor_exon_len:self.donor_exon_len+2] != "GT": warnings.warn("None GT donor", UserWarning) if acceptor[self.acceptor_intron_len-2:self.acceptor_intron_len] != "AG": warnings.warn("None AG donor", UserWarning) - if self.encode: + if self.encode: return { "acceptor_intron": encodeDNA([acceptor_intron]), "acceptor": encodeDNA([acceptor]), diff --git a/MMSplice/pathogenicity/dataloader.py b/MMSplice/pathogenicity/dataloader.py index c961c086e..eda944f33 100644 --- a/MMSplice/pathogenicity/dataloader.py +++ b/MMSplice/pathogenicity/dataloader.py @@ -1,14 +1,6 @@ import numpy as np -from kipoi.data import SampleIterator - -import pickle -from pyfaidx import Fasta -from cyvcf2 import VCF -from concise.preprocessing import encodeDNA -import warnings - from mmsplice import MMSplice -from mmsplice.vcf_dataloader import GenerateExonIntervalTree, VariantInterval, get_var_side +from mmsplice.vcf_dataloader import SplicingVCFDataloader as BaseSplicingVCFDataloader model = MMSplice( exon_cut_l=0, @@ -20,149 +12,14 @@ donor_exon_len=5, donor_intron_len=13) -class SplicingVCFDataloader(SampleIterator): - """ - Load genome annotation (gtf) file along with a vcf file, return wt sequence and mut sequence. - Args: - gtf: gtf file or pickled gtf IntervalTree. - fasta_file: file path; Genome sequence - vcf_file: file path; vcf file with variants to score - """ - - def __init__(self, - gtf_file, - fasta_file, - vcf_file, - split_seq=False, - encode=True, - exon_cut_l=0, - exon_cut_r=0, - acceptor_intron_cut=6, - donor_intron_cut=6, - acceptor_intron_len=50, - acceptor_exon_len=3, - donor_exon_len=5, - donor_intron_len=13, - **kwargs - ): - try: - with open(gtf, 'rb') as f: - self.exons = pickle.load(f) - except: - self.exons = GenerateExonIntervalTree(gtf_file, **kwargs) - import six - if isinstance(fasta_file, six.string_types): - fasta = Fasta(fasta_file, as_raw=False) - self.fasta = fasta - self.ssGenerator = self.spliceSiteGenerator(vcf_file, self.exons) - - self.encode = encode - self.split_seq = split_seq - self.exon_cut_l = exon_cut_l - self.exon_cut_r = exon_cut_r - self.acceptor_intron_cut = acceptor_intron_cut - self.donor_intron_cut = donor_intron_cut - self.acceptor_intron_len = acceptor_intron_len - self.acceptor_exon_len = acceptor_exon_len - self.donor_exon_len = donor_exon_len - self.donor_intron_len = donor_intron_len - - @staticmethod - def spliceSiteGenerator(vcf_file, exonTree, variant_filter=True): - variants = VCF(vcf_file) - for var in variants: - if variant_filter and var.FILTER: - next - iv = VariantInterval.from_Variant(var) - - matches = map(lambda x: x.interval, - exonTree.intersect(iv, ignore_strand=True)) - - for match in matches: - side = get_var_side(( - var.POS, - var.REF, - var.ALT, - match.Exon_Start, - match.Exon_End, - match.strand - )) - var = iv.to_Variant(match.strand, side) # to my Variant class - yield match, var - - - def __iter__(self): - return self +class SplicingVCFDataloader(BaseSplicingVCFDataloader): def __next__(self): - ss, var = next(self.ssGenerator) - out = {} - x = {} - x['inputs'] = {} - x['inputs_mut'] = {} - seq = ss.get_seq(self.fasta).upper() - mut_seq = ss.get_mut_seq(self.fasta, var).upper() - if self.split_seq: - seq = self.split(seq, ss.overhang) - mut_seq = self.split(mut_seq, ss.overhang) - x['inputs']['seq'] = seq - x['inputs_mut']['seq'] = mut_seq - x['inputs']['intronl_len'] = ss.overhang[0] - x['inputs']['intronr_len'] = ss.overhang[1] - x['inputs_mut']['intronl_len'] = ss.overhang[0] - x['inputs_mut']['intronr_len'] = ss.overhang[1] - - out['inputs'] = np.concatenate([model.predict(x['inputs_mut']).values, model.predict(x['inputs']).values]) - - out['metadata'] = {} - out['metadata']['ranges'] = ss.grange - out['metadata']['variant'] = var.to_dict - out['metadata']['ExonInterval'] = ss.to_dict # so that np collate will work - out['metadata']['annotation'] = str(ss) - return out - - def batch_predict_iter(self, **kwargs): - """Returns samples directly useful for prediction x["inputs"] - Args: - **kwargs: Arguments passed to self.batch_iter(**kwargs) - """ - return (x for x in self.batch_iter(**kwargs)) - - def split(self, x, overhang): - ''' x: a sequence to split - ''' - intronl_len, intronr_len = overhang - lackl = self.acceptor_intron_len - intronl_len # need to pad N if left seq not enough long - if lackl >= 0: - x = "N"*(lackl+1) + x - intronl_len += lackl+1 - lackr = self.donor_intron_len - intronr_len - if lackr >= 0: - x = x + "N"*(lackr+1) - intronr_len += lackr + 1 - acceptor_intron = x[:intronl_len-self.acceptor_intron_cut] - acceptor = x[(intronl_len-self.acceptor_intron_len) : (intronl_len+self.acceptor_exon_len)] - exon = x[(intronl_len+self.exon_cut_l) : (-intronr_len-self.exon_cut_r)] - donor = x[(-intronr_len-self.donor_exon_len) : (-intronr_len+self.donor_intron_len)] - donor_intron = x[-intronr_len+self.donor_intron_cut:] - if donor[self.donor_exon_len:self.donor_exon_len+2] != "GT": - warnings.warn("None GT donor", UserWarning) - if acceptor[self.acceptor_intron_len-2:self.acceptor_intron_len] != "AG": - warnings.warn("None AG donor", UserWarning) - - if self.encode: - return { - "acceptor_intron": encodeDNA([acceptor_intron]), - "acceptor": encodeDNA([acceptor]), - "exon": encodeDNA([exon]), - "donor": encodeDNA([donor]), - "donor_intron": encodeDNA([donor_intron]) - } - else: - return { - "acceptor_intron": acceptor_intron, - "acceptor": acceptor, - "exon": exon, - "donor": donor, - "donor_intron": donor_intron - } + super_out = super().__next__() + return { + 'inputs': np.concatenate([ + model.predict(super_out['inputs_mut']).values, + model.predict(super_out['inputs']).values + ]), + 'metadata': super_out['metadata'] + } diff --git a/MMSplice/pathogenicity/dataloader.yaml b/MMSplice/pathogenicity/dataloader.yaml index 28435add5..96a94bc12 100644 --- a/MMSplice/pathogenicity/dataloader.yaml +++ b/MMSplice/pathogenicity/dataloader.yaml @@ -1,7 +1,7 @@ type: SampleIterator defined_as: dataloader.py::SplicingVCFDataloader args: - gtf_file: + gtf: doc: path to the GTF file required by the models (Ensemble) example: url: https://sandbox.zenodo.org/record/248604/files/test.gtf?download=1 @@ -19,6 +19,9 @@ args: split_seq: doc: Whether split the sequence in dataloader optional: True + variant_filter: + doc: If set True (default), variants with `FILTER` field other than `PASS` will be filtered out. + optional: True encode: doc: If split the sequence, whether one hot encoding optional: True @@ -73,7 +76,7 @@ dependencies: - python=3.5 pip: - kipoi - - mmsplice + - mmsplice>=0.2.7 output_schema: inputs: shape: (10, ) @@ -157,4 +160,4 @@ output_schema: doc: genomic end position of the retrieved sequence annotation: type: str - doc: retrieved sequence name \ No newline at end of file + doc: retrieved sequence name diff --git a/MMSplice/pathogenicity/model.py b/MMSplice/pathogenicity/model.py index 86b95e05a..0bc51f721 100644 --- a/MMSplice/pathogenicity/model.py +++ b/MMSplice/pathogenicity/model.py @@ -1,21 +1,15 @@ from kipoi.model import BaseModel -import sklearn import numpy as np from mmsplice import LOGISTIC_MODEL from mmsplice.utils.postproc import transform -# Model to predict delta logit PSI class MMSpliceModel(BaseModel): - - def __init__(self): + '''Model to predict delta logit PSI''' - self.model = LOGISTIC_MODEL - def predict_on_batch(self, inputs): - # inputs shape (,10), corresponding to 5 module predictions of mut and wt - X = inputs[:,:5] - inputs[:,-5:] - X = transform(X, True) - X = np.concatenate([inputs, X[:,-3:]], axis=-1) - pred = self.model.predict_proba(X) - return pred \ No newline at end of file + '''inputs shape (,10), corresponding to 5 module predictions of mut and wt''' + X_alt, X_ref = inputs[:, 5:], inputs[:, :-5] + X = transform(X_alt - X_ref, True) + X = np.concatenate([X_ref, X_alt, X[:, -3:]], axis=-1) + return LOGISTIC_MODEL.predict_proba(X) diff --git a/MMSplice/pathogenicity/model.yaml b/MMSplice/pathogenicity/model.yaml index ec038574b..53a6478f0 100644 --- a/MMSplice/pathogenicity/model.yaml +++ b/MMSplice/pathogenicity/model.yaml @@ -19,6 +19,7 @@ dependencies: - numpy pip: - scikit-learn + - mmsplice>=0.2.7 schema: inputs: shape: (10, ) diff --git a/MMSplice/splicingEfficiency/dataloader.py b/MMSplice/splicingEfficiency/dataloader.py index e54f73c4e..eda944f33 100644 --- a/MMSplice/splicingEfficiency/dataloader.py +++ b/MMSplice/splicingEfficiency/dataloader.py @@ -1,14 +1,6 @@ import numpy as np -from kipoi.data import SampleIterator - -import pickle -from pyfaidx import Fasta -from cyvcf2 import VCF -from concise.preprocessing import encodeDNA -import warnings - from mmsplice import MMSplice -from mmsplice.vcf_dataloader import GenerateExonIntervalTree, VariantInterval, get_var_side +from mmsplice.vcf_dataloader import SplicingVCFDataloader as BaseSplicingVCFDataloader model = MMSplice( exon_cut_l=0, @@ -20,148 +12,14 @@ donor_exon_len=5, donor_intron_len=13) -class SplicingVCFDataloader(SampleIterator): - """ - Load genome annotation (gtf) file along with a vcf file, return wt sequence and mut sequence. - Args: - gtf: gtf file or pickled gtf IntervalTree. - fasta_file: file path; Genome sequence - vcf_file: file path; vcf file with variants to score - """ - - def __init__(self, - gtf_file, - fasta_file, - vcf_file, - split_seq=False, - encode=True, - exon_cut_l=0, - exon_cut_r=0, - acceptor_intron_cut=6, - donor_intron_cut=6, - acceptor_intron_len=50, - acceptor_exon_len=3, - donor_exon_len=5, - donor_intron_len=13, - **kwargs - ): - try: - with open(gtf, 'rb') as f: - self.exons = pickle.load(f) - except: - self.exons = GenerateExonIntervalTree(gtf_file, **kwargs) - import six - if isinstance(fasta_file, six.string_types): - fasta = Fasta(fasta_file, as_raw=False) - self.fasta = fasta - self.ssGenerator = self.spliceSiteGenerator(vcf_file, self.exons) - - self.encode = encode - self.split_seq = split_seq - self.exon_cut_l = exon_cut_l - self.exon_cut_r = exon_cut_r - self.acceptor_intron_cut = acceptor_intron_cut - self.donor_intron_cut = donor_intron_cut - self.acceptor_intron_len = acceptor_intron_len - self.acceptor_exon_len = acceptor_exon_len - self.donor_exon_len = donor_exon_len - self.donor_intron_len = donor_intron_len - - @staticmethod - def spliceSiteGenerator(vcf_file, exonTree, variant_filter=True): - variants = VCF(vcf_file) - for var in variants: - if variant_filter and var.FILTER: - next - iv = VariantInterval.from_Variant(var) - - matches = map(lambda x: x.interval, - exonTree.intersect(iv, ignore_strand=True)) - - for match in matches: - side = get_var_side(( - var.POS, - var.REF, - var.ALT, - match.Exon_Start, - match.Exon_End, - match.strand - )) - var = iv.to_Variant(match.strand, side) # to my Variant class - yield match, var - - def __iter__(self): - return self +class SplicingVCFDataloader(BaseSplicingVCFDataloader): def __next__(self): - ss, var = next(self.ssGenerator) - out = {} - x = {} - x['inputs'] = {} - x['inputs_mut'] = {} - seq = ss.get_seq(self.fasta).upper() - mut_seq = ss.get_mut_seq(self.fasta, var).upper() - if self.split_seq: - seq = self.split(seq, ss.overhang) - mut_seq = self.split(mut_seq, ss.overhang) - x['inputs']['seq'] = seq - x['inputs_mut']['seq'] = mut_seq - x['inputs']['intronl_len'] = ss.overhang[0] - x['inputs']['intronr_len'] = ss.overhang[1] - x['inputs_mut']['intronl_len'] = ss.overhang[0] - x['inputs_mut']['intronr_len'] = ss.overhang[1] - - out['inputs'] = np.concatenate([model.predict(x['inputs_mut']).values, model.predict(x['inputs']).values]) - - out['metadata'] = {} - out['metadata']['ranges'] = ss.grange - out['metadata']['variant'] = var.to_dict - out['metadata']['ExonInterval'] = ss.to_dict # so that np collate will work - out['metadata']['annotation'] = str(ss) - return out - - def batch_predict_iter(self, **kwargs): - """Returns samples directly useful for prediction x["inputs"] - Args: - **kwargs: Arguments passed to self.batch_iter(**kwargs) - """ - return (x for x in self.batch_iter(**kwargs)) - - def split(self, x, overhang): - ''' x: a sequence to split - ''' - intronl_len, intronr_len = overhang - lackl = self.acceptor_intron_len - intronl_len # need to pad N if left seq not enough long - if lackl >= 0: - x = "N"*(lackl+1) + x - intronl_len += lackl+1 - lackr = self.donor_intron_len - intronr_len - if lackr >= 0: - x = x + "N"*(lackr+1) - intronr_len += lackr + 1 - acceptor_intron = x[:intronl_len-self.acceptor_intron_cut] - acceptor = x[(intronl_len-self.acceptor_intron_len) : (intronl_len+self.acceptor_exon_len)] - exon = x[(intronl_len+self.exon_cut_l) : (-intronr_len-self.exon_cut_r)] - donor = x[(-intronr_len-self.donor_exon_len) : (-intronr_len+self.donor_intron_len)] - donor_intron = x[-intronr_len+self.donor_intron_cut:] - if donor[self.donor_exon_len:self.donor_exon_len+2] != "GT": - warnings.warn("None GT donor", UserWarning) - if acceptor[self.acceptor_intron_len-2:self.acceptor_intron_len] != "AG": - warnings.warn("None AG donor", UserWarning) - - if self.encode: - return { - "acceptor_intron": encodeDNA([acceptor_intron]), - "acceptor": encodeDNA([acceptor]), - "exon": encodeDNA([exon]), - "donor": encodeDNA([donor]), - "donor_intron": encodeDNA([donor_intron]) - } - else: - return { - "acceptor_intron": acceptor_intron, - "acceptor": acceptor, - "exon": exon, - "donor": donor, - "donor_intron": donor_intron - } + super_out = super().__next__() + return { + 'inputs': np.concatenate([ + model.predict(super_out['inputs_mut']).values, + model.predict(super_out['inputs']).values + ]), + 'metadata': super_out['metadata'] + } diff --git a/MMSplice/splicingEfficiency/dataloader.yaml b/MMSplice/splicingEfficiency/dataloader.yaml index 4175e22b0..ba9f94228 100644 --- a/MMSplice/splicingEfficiency/dataloader.yaml +++ b/MMSplice/splicingEfficiency/dataloader.yaml @@ -1,7 +1,7 @@ type: SampleIterator defined_as: dataloader.py::SplicingVCFDataloader args: - gtf_file: + gtf: doc: path to the GTF file required by the models (Ensemble) example: url: https://sandbox.zenodo.org/record/248604/files/test.gtf?download=1 @@ -19,6 +19,9 @@ args: split_seq: doc: Whether split the sequence in dataloader optional: True + variant_filter: + doc: If set True (default), variants with `FILTER` field other than `PASS` will be filtered out. + optional: True encode: doc: If split the sequence, whether one hot encoding optional: True @@ -73,7 +76,7 @@ dependencies: - python=3.5 pip: - kipoi - - mmsplice + - mmsplice>=0.2.7 output_schema: inputs: shape: (10, ) @@ -157,4 +160,4 @@ output_schema: doc: genomic end position of the retrieved sequence annotation: type: str - doc: retrieved sequence name \ No newline at end of file + doc: retrieved sequence name diff --git a/MMSplice/splicingEfficiency/model.py b/MMSplice/splicingEfficiency/model.py index b12f66244..423bd493f 100644 --- a/MMSplice/splicingEfficiency/model.py +++ b/MMSplice/splicingEfficiency/model.py @@ -1,21 +1,12 @@ from kipoi.model import BaseModel -import sklearn -import numpy as np from mmsplice import EFFICIENCY_MODEL from mmsplice.utils.postproc import transform -# Model to predict delta logit PSI class MMSpliceModel(BaseModel): - - def __init__(self): + '''Model to predict delta logit PSI''' - self.model = EFFICIENCY_MODEL - def predict_on_batch(self, inputs): - # inputs shape (,10), corresponding to 5 module predictions of mut and wt - X = inputs[:,:5] - inputs[:,-5:] - X = transform(X, True) - X = X[:,[1,2,3,5]] - pred = self.model.predict(X) - return pred \ No newline at end of file + '''inputs shape (,10), corresponding to 5 module predictions of mut and wt''' + X = transform(inputs[:, :5] - inputs[:, -5:], True)[:, [1, 2, 3, 5]] + return EFFICIENCY_MODEL.predict(X) diff --git a/MMSplice/splicingEfficiency/model.yaml b/MMSplice/splicingEfficiency/model.yaml index 923191e0e..acd53b1a7 100644 --- a/MMSplice/splicingEfficiency/model.yaml +++ b/MMSplice/splicingEfficiency/model.yaml @@ -18,6 +18,7 @@ dependencies: - numpy pip: - scikit-learn + - mmsplice>=0.2.7 schema: inputs: shape: (10, )