Skip to content

Commit

Permalink
Merge pull request #18 from MayroseLab/v1.2
Browse files Browse the repository at this point in the history
v1.2
  • Loading branch information
soungalo authored Nov 2, 2022
2 parents 9e1ef68 + 9da5405 commit ff744ba
Show file tree
Hide file tree
Showing 60 changed files with 4,269 additions and 1,500 deletions.
899 changes: 899 additions & 0 deletions EVM_annotation/EVM_annotation.snakefile

Large diffs are not rendered by default.

192 changes: 192 additions & 0 deletions EVM_annotation/add_AED_to_gff3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
"""
This script calculates Annotation Edit Distances
(AED) and weighted AED (wAED) for mRNA features
predicted by EVidencdModeler. These scores can
be used as per-gene quality measures, based on
the support of evidence for gene models.
The idea of AED is further explain in
Holt, C., & Yandell, M. (2011).
The output of the script is a GFF3 file, identical
to the input GFF3, except all mRNA features have
two additional attributes: AED (raw distance), and
wAED (considering the weight assigned to each
source of evidence). Use:
python add_AED_to_gff3.py -h
for usage instructions.
Requirements: python3, pandas, gffutils, intervaltree
"""

# IMPORTS
import gffutils
import os
import sys
import pandas as pd
import argparse
from intervaltree import Interval, IntervalTree

# FUNCTIONS

def create_gff_db(gff_path, force=False):
print('Reading GFF %s' % gff_path)
db_path = gff_path + '.db'
if os.path.isfile(db_path) and not force:
print('Using existing DB %s' % db_path)
else:
gff_db = gffutils.create_db(gff_path, dbfn=db_path, force=True, keep_order=True, merge_strategy='create_unique', verbose=True)
return gffutils.FeatureDB(db_path, keep_order=True)

def gff_to_intervals(gff_path, feature_types=None, merge_overlaps=False):
"""
Reads a GFF3 file and creates a data structure:
{chr1: IntervalTree(), chr2: IntervalTree(),...}
if feature_types (iterable) is given, only
include features of the given types.
"""
print('Reading GFF %s' % gff_path)
res = {}
with open(gff_path) as f:
for line in f:
line = line.strip()
if not line or line.startswith('#'):
continue
fields = line.split('\t')
chrom = fields[0]
source = fields[1]
ftype = fields[2]
start = int(fields[3])
end = int(fields[4])
if source not in feature_types or (feature_types is not None and ftype not in feature_types[source]):
continue
if start == end: # not allowing empty intervals
continue
if chrom not in res:
res[chrom] = IntervalTree()
res[chrom].add(Interval(start, end, source))
if merge_overlaps:
for chrom in res:
res[chrom].merge_overlaps(data_reducer=lambda x, y: x)
return res

def merge_gff_iv_trees(gff_ds_list):
"""
Takes a list of data structures created
by gff_to_intervals and merges them into
one structure of the same type.
"""
res = {}
for ds in gff_ds_list:
for chrom in ds:
if chrom not in res:
res[chrom] = IntervalTree()
res[chrom].update(ds[chrom])
return res

def overlap_length(range1, range2):
"""
Compute the length of overlap
between two ranges given as (start,end)
"""
return min(range1[1], range2[1]) - max(range1[0], range2[0])

def main():
# command line arguments
DESC = "Add Annotation Edit Distance (AED) to EvidenceModeler GFF3"
USAGE = "python %s -h (display full usage doc)" % sys.argv[0]
parser = argparse.ArgumentParser(description=DESC, usage=USAGE)
parser.add_argument('-g', '--gff', help='EVM output GFF3', required=True)
parser.add_argument('-w', '--weights', help='EVM weights TSV', required=True)
parser.add_argument('-l', '--feature_types', help='A TSV file denoting which feature types to use as evidence for each source', required=True)
parser.add_argument('-t', '--transcript', help='Transcript evidence GFF3', default=None)
parser.add_argument('-p', '--protein', help='Protein evidence GFF3', default=None)
parser.add_argument('-a', '--prediction', help='Gene predictions GFF3', default=None)
parser.add_argument('-f', '--force', help='Force override existng gff DBs', action='store_true', default=False)
parser.add_argument('-i', '--ignore_ab_initio', help='Ab-initio predictions will not be considered when calculating AED', action='store_true', default=False)
parser.add_argument('-o', '--out_gff', help='Output GFF with AEDs', required=True)
args = parser.parse_args()

# Read weights TSV
weights_df = pd.read_csv(args.weights, sep='\t', names=['class','source','weight'], index_col='source')
# Read feature types TSV
ftypes_df = pd.read_csv(args.feature_types, sep='\t', names=['source','feature_types'], index_col='source', converters={'feature_types': lambda x: set(x.split(','))})
assert set(ftypes_df.index) == set(weights_df.index), "Must list the same sources in the weights and in the feature types files"
weights_df = weights_df.join(ftypes_df)
if args.ignore_ab_initio:
weights_df = weights_df.query('`class` != "ABINITIO_PREDICTION"')
weights_df['relative_weight'] = weights_df['weight'] / sum(weights_df['weight'])

# Read EVM GFF
evm_gff = create_gff_db(args.gff, args.force)

# Read ab-initio and evidence GFFs
evidence = []
for gff in [args.transcript, args.protein, args.prediction]:
if not gff:
continue
evidence.append(gff_to_intervals(gff, feature_types=weights_df['feature_types'], merge_overlaps=True))
all_evidence = merge_gff_iv_trees(evidence)

# Assign AEDs
print('Calculating AEDs...')
with open(args.out_gff,'w') as fo:
print('##gff-version 3', file=fo)
for evm_feat in evm_gff.all_features():
if evm_feat.featuretype != 'mRNA':
print(str(evm_feat), file=fo)
continue
mrna = evm_feat
chrom = mrna.seqid
mrna_sup = {}
mrna_cov = 0
sup_len = 0
tot_exon_len = 0
for exon in evm_gff.children(mrna, featuretype='exon'):
tot_exon_len += exon.end - exon.start
if chrom in all_evidence:
sup_features = all_evidence[chrom][exon.start:exon.end]
else:
sup_features = []
# for non-weighted AED
sup_features_tree = IntervalTree(sup_features)
sup_features_tree.merge_overlaps()
for supp_feature in sup_features_tree:
mrna_cov += overlap_length((exon.start,exon.end), (supp_feature.begin,supp_feature.end))
sup_len += supp_feature.end - supp_feature.begin
# for weighted AED
for supp_feature in sup_features:
overlap_len = overlap_length((exon.start,exon.end), (supp_feature.begin,supp_feature.end))
source = supp_feature.data
if source not in mrna_sup:
mrna_sup[source] = [0,0]
mrna_sup[source][0] += overlap_len
mrna_sup[source][1] += supp_feature.end - supp_feature.begin

# calculate non-weighted AED
if sup_len == 0:
mrna_aed = 1
else:
nw_sensitivity = mrna_cov / tot_exon_len
nw_specificity = mrna_cov / sup_len
nw_congruence = (nw_sensitivity + nw_specificity)/2
mrna_aed = 1 - nw_congruence
mrna_aed = ("%.2f" % mrna_aed)
mrna['AED'] = [mrna_aed]
# calculate weighted AED
mrna_waed = 0
for source in weights_df.index:
if source in mrna_sup:
sensitivity = mrna_sup[source][0] / tot_exon_len
specificity = mrna_sup[source][0] / mrna_sup[source][1]
congruence = (sensitivity + specificity)/2
aed = 1 - congruence
else:
aed = 1
aed_weighted = aed * weights_df.loc[source]['relative_weight']
mrna_waed += aed_weighted
mrna_waed = ("%.2f" % mrna_waed)
mrna['wAED'] = [mrna_waed]

print(str(mrna), file=fo)

# MAIN
if __name__ == "__main__":
main()
19 changes: 19 additions & 0 deletions EVM_annotation/alignAssembly.config.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
## templated variables to be replaced exist as <__var_name__>

# Pathname of an SQLite database
# If the environment variable DSN_DRIVER=mysql then it is the name of a MySQL database
DATABASE=<DBPATH>


#######################################################
# Parameters to specify to specific scripts in pipeline
# create a key = "script_name" + ":" + "parameter"
# assign a value as done above.

#script validate_alignments_in_db.dbi
validate_alignments_in_db.dbi:--MIN_PERCENT_ALIGNED=75
validate_alignments_in_db.dbi:--MIN_AVG_PER_ID=95
validate_alignments_in_db.dbi:--NUM_BP_PERFECT_SPLICE_BOUNDARY=0

#script subcluster_builder.dbi
subcluster_builder.dbi:-m=50
50 changes: 50 additions & 0 deletions EVM_annotation/conf_template_EVM.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# NOTE: if running the EVM pipeline as part of Panoramic, DO NOT modify values wrapped in <>
name: EVM-annotation

# input/output
input_genome: <INPUT_GENOME>
sample_name: <SAMPLE_NAME>
out_dir: <OUT_DIR>

# Repeat masking
mask_genome: 1 # 1 - run EDTA, 0 - don't run
reference_cds: <REFERENCE_CDS>

# Liftover
reference_liftover: <REFERENCE_LIFTOVER> # 1- perform reference genes liftover, 0 - skip
reference_gff: <REFERENCE_GFF>
reference_fasta: <REFERENCE_FASTA>
liftover_weight:

# Ab-initio prediction
augustus_species:
glimmerhmm_species:
snap_species:
ab-initio_weight:

# Transcript evidence
transcripts_fasta: <TRANSCRIPTS_FASTA>
transcripts_weight:

# Protein evidence
proteins_fasta: <PROTEINS_FASTA>
proteins_weight:

# EVM partitioning
segment_size: 5000000
overlap_size: 20000

# Split chimeras
split_chimeras: 1 # 1 - run chimeraBuster, 0 -don't run
chimeraBuster_dir:

# Annotation filtration
min_protein: 50
max_AED: 0.99

# Environment
queue: <QUEUE>
priority: <PRIORITY>
ppn: <PPN>
max_ram: <MAX_RAM>
max_jobs: <MAX_JOBS>
31 changes: 31 additions & 0 deletions EVM_annotation/create_EVM_partitions_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""
Create the partitions list file
for EVM. Each line has the structure:
chromosome chromosome_dir Y partition_dir (chromosome_dir/chromosome_start-end)
"""

import sys
from Bio import SeqIO

genome_fasta = sys.argv[1]
segment_size = int(sys.argv[2])
overlap_size = int(sys.argv[3])
out_dir = sys.argv[4]
partitions_out = sys.argv[5]

# get chromosome lengths
chr_lens = {}
for rec in SeqIO.parse(genome_fasta, 'fasta'):
chr_lens[rec.id] = len(rec.seq)

# compute partitions
with open(partitions_out, 'w') as fo:
for chrom in sorted(chr_lens.keys()):
chrom_dir = out_dir + '/' + chrom
if chr_lens[chrom] <= overlap_size:
partitions = ['%s-%s' %(1,chr_lens[chrom])]
else:
partitions = ["%s-%s" %(start, min(start+segment_size-1, chr_lens[chrom])) for start in range(1, chr_lens[chrom], segment_size - overlap_size) if (min(start+segment_size-1, chr_lens[chrom]) - start) > overlap_size]
for p in partitions:
partition_dir = chrom_dir + '/' + "%s_%s" %(chrom, p)
print('\t'.join([chrom, chrom_dir, 'Y', partition_dir]), file=fo)
6 changes: 6 additions & 0 deletions EVM_annotation/feature_types.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Augustus exon
GlimmerHMM exon
SNAP exon
assembler-pasa_db.sqlite cDNA_match
genomeThreader exon
Liftoff exon
28 changes: 28 additions & 0 deletions EVM_annotation/filter_gff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import sys
import gffutils

in_gff = sys.argv[1]
max_aed = float(sys.argv[2])
min_prot = int(sys.argv[3])

# create GFF DB
gff_db_path = in_gff + '.db'
gff_db = gffutils.create_db(in_gff, dbfn=gff_db_path, force=True, merge_strategy='create_unique')
gff_db = gffutils.FeatureDB(gff_db_path)

print('##gff-version 3')
for mrna in gff_db.features_of_type('mRNA', order_by='start'):
# if AED or wAED > max_aed - discard
if float(mrna['AED'][0]) > max_aed or float(mrna['wAED'][0]) > max_aed:
continue
# calculate protein length
prot_len = sum([cds.end - cds.start + 1 for cds in gff_db.children(mrna, featuretype='CDS')])/3
# if protein length < min_prot - discard
if prot_len < min_prot:
continue
# if mRNA passed filtration, find parent gene and print all mRNA children
parent_gene = next(gff_db.parents(mrna, featuretype='gene'))
print(str(parent_gene))
print(str(mrna))
for cf in gff_db.children(mrna):
print(str(cf))
Loading

0 comments on commit ff744ba

Please sign in to comment.