Merge pull request #18 from MayroseLab/v1.2

v1.2
MayroseLab · Nov 2, 2022 · ff744ba · ff744ba
2 parents 9e1ef68 + 9da5405
commit ff744ba
Show file tree

Hide file tree

Showing 60 changed files with 4,269 additions and 1,500 deletions.
diff --git a/EVM_annotation/EVM_annotation.snakefile b/EVM_annotation/EVM_annotation.snakefile
diff --git a/EVM_annotation/add_AED_to_gff3.py b/EVM_annotation/add_AED_to_gff3.py
@@ -0,0 +1,192 @@
+"""
+This script calculates Annotation Edit Distances
+(AED) and weighted AED (wAED) for mRNA features
+predicted by EVidencdModeler. These scores can
+be used as per-gene quality measures, based on
+the support of evidence for gene models.
+The idea of AED is further explain in
+Holt, C., & Yandell, M. (2011).
+The output of the script is a GFF3 file, identical
+to the input GFF3, except all mRNA features have
+two additional attributes: AED (raw distance), and
+wAED (considering the weight assigned to each
+source of evidence). Use:
+python add_AED_to_gff3.py -h
+for usage instructions.
+Requirements: python3, pandas, gffutils, intervaltree
+"""
+
+# IMPORTS
+import gffutils
+import os
+import sys
+import pandas as pd
+import argparse
+from intervaltree import Interval, IntervalTree
+
+# FUNCTIONS
+
+def create_gff_db(gff_path, force=False):
+    print('Reading GFF %s' % gff_path)
+    db_path = gff_path + '.db'
+    if os.path.isfile(db_path) and not force:
+        print('Using existing DB %s' % db_path)
+    else:
+        gff_db = gffutils.create_db(gff_path, dbfn=db_path, force=True, keep_order=True, merge_strategy='create_unique', verbose=True)
+    return gffutils.FeatureDB(db_path, keep_order=True)
+
+def gff_to_intervals(gff_path, feature_types=None, merge_overlaps=False):
+    """
+    Reads a GFF3 file and creates a data structure:
+    {chr1: IntervalTree(), chr2: IntervalTree(),...}
+    if feature_types (iterable) is given, only
+    include features of the given types.
+    """
+    print('Reading GFF %s' % gff_path)
+    res = {}
+    with open(gff_path) as f:
+        for line in f:
+            line = line.strip()
+            if not line or line.startswith('#'):
+                continue
+            fields = line.split('\t')
+            chrom = fields[0]
+            source = fields[1]
+            ftype = fields[2]
+            start = int(fields[3])
+            end = int(fields[4])
+            if source not in feature_types or (feature_types is not None and ftype not in feature_types[source]):
+                continue
+            if start == end:	# not allowing empty intervals
+                continue
+            if chrom not in res:
+                res[chrom] = IntervalTree()
+            res[chrom].add(Interval(start, end, source))
+    if merge_overlaps:
+        for chrom in res:
+            res[chrom].merge_overlaps(data_reducer=lambda x, y: x)
+    return res
+
+def merge_gff_iv_trees(gff_ds_list):
+    """
+    Takes a list of data structures created
+    by gff_to_intervals and merges them into
+    one structure of the same type.
+    """
+    res = {}
+    for ds in gff_ds_list:
+        for chrom in ds:
+            if chrom not in res:
+                res[chrom] = IntervalTree()
+            res[chrom].update(ds[chrom])
+    return res
+
+def overlap_length(range1, range2):
+    """
+    Compute the length of overlap
+    between two ranges given as (start,end)
+    """
+    return min(range1[1], range2[1]) - max(range1[0], range2[0])
+
+def main():
+    # command line arguments
+    DESC = "Add Annotation Edit Distance (AED) to EvidenceModeler GFF3"
+    USAGE = "python %s -h (display full usage doc)" % sys.argv[0]
+    parser = argparse.ArgumentParser(description=DESC, usage=USAGE)
+    parser.add_argument('-g', '--gff', help='EVM output GFF3', required=True)
+    parser.add_argument('-w', '--weights', help='EVM weights TSV', required=True)
+    parser.add_argument('-l', '--feature_types', help='A TSV file denoting which feature types to use as evidence for each source', required=True)
+    parser.add_argument('-t', '--transcript', help='Transcript evidence GFF3', default=None)
+    parser.add_argument('-p', '--protein', help='Protein evidence GFF3', default=None)
+    parser.add_argument('-a', '--prediction', help='Gene predictions GFF3', default=None)
+    parser.add_argument('-f', '--force', help='Force override existng gff DBs', action='store_true', default=False)
+    parser.add_argument('-i', '--ignore_ab_initio', help='Ab-initio predictions will not be considered when calculating AED', action='store_true', default=False)
+    parser.add_argument('-o', '--out_gff', help='Output GFF with AEDs', required=True)
+    args = parser.parse_args()
+
+    # Read weights TSV
+    weights_df = pd.read_csv(args.weights, sep='\t', names=['class','source','weight'], index_col='source')
+    # Read feature types TSV
+    ftypes_df = pd.read_csv(args.feature_types, sep='\t', names=['source','feature_types'], index_col='source', converters={'feature_types': lambda x: set(x.split(','))})
+    assert set(ftypes_df.index) == set(weights_df.index), "Must list the same sources in the weights and in the feature types files"
+    weights_df = weights_df.join(ftypes_df)
+    if args.ignore_ab_initio:
+        weights_df = weights_df.query('`class` != "ABINITIO_PREDICTION"')
+    weights_df['relative_weight'] = weights_df['weight'] / sum(weights_df['weight'])
+
+    # Read EVM GFF
+    evm_gff = create_gff_db(args.gff, args.force)
+
+    # Read ab-initio and evidence GFFs
+    evidence = []
+    for gff in [args.transcript, args.protein, args.prediction]:
+        if not gff:
+            continue
+        evidence.append(gff_to_intervals(gff, feature_types=weights_df['feature_types'], merge_overlaps=True))
+    all_evidence = merge_gff_iv_trees(evidence)
+
+    # Assign AEDs
+    print('Calculating AEDs...')
+    with open(args.out_gff,'w') as fo:
+        print('##gff-version 3', file=fo)
+        for evm_feat in evm_gff.all_features():
+            if evm_feat.featuretype != 'mRNA':
+                print(str(evm_feat), file=fo)
+                continue
+            mrna = evm_feat
+            chrom = mrna.seqid
+            mrna_sup = {}
+            mrna_cov = 0
+            sup_len = 0
+            tot_exon_len = 0
+            for exon in evm_gff.children(mrna, featuretype='exon'):
+                tot_exon_len += exon.end - exon.start
+                if chrom in all_evidence:
+                    sup_features = all_evidence[chrom][exon.start:exon.end]
+                else:
+                    sup_features = []
+                # for non-weighted AED
+                sup_features_tree = IntervalTree(sup_features)
+                sup_features_tree.merge_overlaps()
+                for supp_feature in sup_features_tree:
+                    mrna_cov += overlap_length((exon.start,exon.end), (supp_feature.begin,supp_feature.end))
+                    sup_len += supp_feature.end - supp_feature.begin
+                # for weighted AED
+                for supp_feature in sup_features:
+                    overlap_len = overlap_length((exon.start,exon.end), (supp_feature.begin,supp_feature.end))
+                    source = supp_feature.data
+                    if source not in mrna_sup:
+                        mrna_sup[source] = [0,0]
+                    mrna_sup[source][0] += overlap_len
+                    mrna_sup[source][1] += supp_feature.end - supp_feature.begin
+
+            # calculate non-weighted AED
+            if sup_len == 0:
+                mrna_aed = 1
+            else:
+                nw_sensitivity = mrna_cov / tot_exon_len
+                nw_specificity = mrna_cov / sup_len
+                nw_congruence = (nw_sensitivity + nw_specificity)/2
+                mrna_aed = 1 - nw_congruence
+            mrna_aed = ("%.2f" % mrna_aed)
+            mrna['AED'] = [mrna_aed]
+            # calculate weighted AED
+            mrna_waed = 0
+            for source in weights_df.index:
+                if source in mrna_sup:
+                    sensitivity = mrna_sup[source][0] / tot_exon_len
+                    specificity = mrna_sup[source][0] / mrna_sup[source][1]
+                    congruence = (sensitivity + specificity)/2
+                    aed = 1 - congruence
+                else:
+                   aed = 1
+                aed_weighted = aed * weights_df.loc[source]['relative_weight']
+                mrna_waed += aed_weighted
+            mrna_waed = ("%.2f" % mrna_waed)
+            mrna['wAED'] = [mrna_waed]
+
+            print(str(mrna), file=fo)
+
+# MAIN
+if __name__ == "__main__":
+    main()
diff --git a/EVM_annotation/alignAssembly.config.template b/EVM_annotation/alignAssembly.config.template
@@ -0,0 +1,19 @@
+## templated variables to be replaced exist as <__var_name__>
+
+# Pathname of an SQLite database
+# If the environment variable DSN_DRIVER=mysql then it is the name of a MySQL database
+DATABASE=<DBPATH>
+
+
+#######################################################
+# Parameters to specify to specific scripts in pipeline
+# create a key = "script_name" + ":" + "parameter" 
+# assign a value as done above.
+
+#script validate_alignments_in_db.dbi
+validate_alignments_in_db.dbi:--MIN_PERCENT_ALIGNED=75
+validate_alignments_in_db.dbi:--MIN_AVG_PER_ID=95
+validate_alignments_in_db.dbi:--NUM_BP_PERFECT_SPLICE_BOUNDARY=0
+
+#script subcluster_builder.dbi
+subcluster_builder.dbi:-m=50
diff --git a/EVM_annotation/conf_template_EVM.yml b/EVM_annotation/conf_template_EVM.yml
@@ -0,0 +1,50 @@
+# NOTE: if running the EVM pipeline as part of Panoramic, DO NOT modify values wrapped in <>
+name: EVM-annotation
+
+# input/output
+input_genome: <INPUT_GENOME>
+sample_name: <SAMPLE_NAME>
+out_dir: <OUT_DIR>
+
+# Repeat masking
+mask_genome: 1  # 1 - run EDTA, 0 - don't run
+reference_cds: <REFERENCE_CDS>
+
+# Liftover
+reference_liftover: <REFERENCE_LIFTOVER>   # 1- perform reference genes liftover, 0 - skip
+reference_gff: <REFERENCE_GFF>
+reference_fasta: <REFERENCE_FASTA>
+liftover_weight:
+
+# Ab-initio prediction
+augustus_species: 
+glimmerhmm_species: 
+snap_species: 
+ab-initio_weight: 
+
+# Transcript evidence
+transcripts_fasta: <TRANSCRIPTS_FASTA>
+transcripts_weight: 
+
+# Protein evidence
+proteins_fasta: <PROTEINS_FASTA>
+proteins_weight:
+
+# EVM partitioning
+segment_size: 5000000
+overlap_size: 20000
+
+# Split chimeras
+split_chimeras: 1       # 1 - run chimeraBuster, 0 -don't run
+chimeraBuster_dir:
+
+# Annotation filtration
+min_protein: 50
+max_AED: 0.99
+
+# Environment
+queue: <QUEUE>
+priority: <PRIORITY>
+ppn: <PPN>
+max_ram: <MAX_RAM>
+max_jobs: <MAX_JOBS>
diff --git a/EVM_annotation/create_EVM_partitions_list.py b/EVM_annotation/create_EVM_partitions_list.py
@@ -0,0 +1,31 @@
+"""
+Create the partitions list file
+for EVM. Each line has the structure:
+chromosome	chromosome_dir	Y	partition_dir (chromosome_dir/chromosome_start-end)
+"""
+
+import sys
+from Bio import SeqIO
+
+genome_fasta = sys.argv[1]
+segment_size = int(sys.argv[2])
+overlap_size = int(sys.argv[3])
+out_dir = sys.argv[4]
+partitions_out = sys.argv[5]
+
+# get chromosome lengths
+chr_lens = {}
+for rec in SeqIO.parse(genome_fasta, 'fasta'):
+  chr_lens[rec.id] = len(rec.seq)
+
+# compute partitions
+with open(partitions_out, 'w') as fo:
+  for chrom in sorted(chr_lens.keys()):
+    chrom_dir = out_dir + '/' + chrom
+    if chr_lens[chrom] <= overlap_size:
+        partitions = ['%s-%s' %(1,chr_lens[chrom])]
+    else:
+        partitions = ["%s-%s" %(start, min(start+segment_size-1, chr_lens[chrom])) for start in range(1, chr_lens[chrom], segment_size - overlap_size) if (min(start+segment_size-1, chr_lens[chrom]) - start) > overlap_size]
+    for p in partitions:
+      partition_dir = chrom_dir + '/' + "%s_%s" %(chrom, p)
+      print('\t'.join([chrom, chrom_dir, 'Y', partition_dir]), file=fo)
diff --git a/EVM_annotation/feature_types.tsv b/EVM_annotation/feature_types.tsv
@@ -0,0 +1,6 @@
+Augustus	exon
+GlimmerHMM	exon
+SNAP	exon
+assembler-pasa_db.sqlite	cDNA_match
+genomeThreader	exon
+Liftoff	exon
diff --git a/EVM_annotation/filter_gff.py b/EVM_annotation/filter_gff.py
@@ -0,0 +1,28 @@
+import sys
+import gffutils
+
+in_gff = sys.argv[1]
+max_aed = float(sys.argv[2])
+min_prot = int(sys.argv[3])
+
+# create GFF DB
+gff_db_path = in_gff + '.db'
+gff_db = gffutils.create_db(in_gff, dbfn=gff_db_path, force=True, merge_strategy='create_unique')
+gff_db = gffutils.FeatureDB(gff_db_path)
+
+print('##gff-version 3')
+for mrna in gff_db.features_of_type('mRNA', order_by='start'):
+  # if AED or wAED > max_aed - discard
+  if float(mrna['AED'][0]) > max_aed or float(mrna['wAED'][0]) > max_aed:
+    continue
+  # calculate protein length
+  prot_len = sum([cds.end - cds.start + 1 for cds in gff_db.children(mrna, featuretype='CDS')])/3
+  # if protein length < min_prot - discard
+  if prot_len < min_prot:
+    continue
+  # if mRNA passed filtration, find parent gene and print all mRNA children
+  parent_gene = next(gff_db.parents(mrna, featuretype='gene'))
+  print(str(parent_gene))
+  print(str(mrna))
+  for cf in gff_db.children(mrna):
+    print(str(cf))