forked from emreg00/toolbox
-
Notifications
You must be signed in to change notification settings - Fork 0
/
GOGOAParser.py
85 lines (70 loc) · 2.45 KB
/
GOGOAParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import sys
import re
class GOGOAParser(object):
    """
    GO GOA Parser Class

    Reads a tab-separated GO annotation file (optionally gzip-compressed) and
    groups the annotated gene products by GO term identifier.
    """

    name = "go_goa"
    description = "This program fills up tables in database biana related with gene ontology gene/protein annotations"

    # Maps the database name found in the first column of the annotation file
    # to the identifier type name used by this parser.
    # NOTE: "HINV" and "VEGA" map to the *string* 'None', not the None object.
    db_name_to_biana_name = {
        "UniProtKB/Swiss-Prot": 'UniprotAccession',
        "UniProtKB/TrEMBL": 'UniprotAccession',
        "UniProtKB": 'UniprotAccession',
        "ENSEMBL": 'ENSEMBL',
        "HINV": 'None',
        "TAIR": 'TAIR',
        "RefSeq": 'RefSeq',
        "VEGA": 'None',
        "PDB": 'PDB',
        'MGI': 'MGI',
    }

    # Prefix carried by every entry of the taxon column (e.g. "taxon:9606").
    _TAXON_PREFIX = "taxon:"

    def __init__(self, filename):
        """Remember the path of the annotation file; nothing is read yet."""
        self.file_name = filename

    def _open(self):
        """Open the annotation file for reading text, decompressing if gzipped."""
        if self.file_name.endswith((".gz", ".gz?rev=HEAD")):
            import gzip
            # Text mode is required: the default binary mode yields bytes,
            # which cannot be compared or split with str arguments.
            return gzip.open(self.file_name, "rt")
        return open(self.file_name)

    @classmethod
    def _parse_taxon(cls, taxon):
        """Convert one taxon entry such as "taxon:9606" to the integer 9606."""
        # Remove the literal prefix only; str.lstrip would strip a character
        # set rather than a prefix.
        if taxon.startswith(cls._TAXON_PREFIX):
            taxon = taxon[len(cls._TAXON_PREFIX):]
        return int(taxon)

    def parse(self, exclude_evidences=None, id_type="genesymbol"):
        """
        Parse the annotation file and group gene products by GO term.

        Skipped lines: comment lines starting with "!", lines with fewer than
        13 tab-separated columns, annotations whose evidence code is listed in
        exclude_evidences, and annotations qualified with "NOT".

        exclude_evidences: optional container of evidence codes to leave out,
        for instance
        # EXP: Inferred from Experiment
        # IDA: Inferred from Direct Assay
        # IPI: Inferred from Physical Interaction
        # IMP: Inferred from Mutant Phenotype
        # IGI: Inferred from Genetic Interaction
        # IEP: Inferred from Expression Pattern
        id_type: genesymbol | uniprot
            genesymbol: collect the value of the third column
            uniprot: collect the value of the second column; every processed
            line must then come from a UniProt database

        Returns a dictionary mapping each GO id to a set of
        (identifier, taxonomy id) tuples, the taxonomy id being an int.

        Raises ValueError if id_type is not recognized, if id_type is
        "uniprot" and a processed line does not come from a UniProt database,
        or if a taxon entry is not numeric. These checks run per processed
        line, so a file without any such line yields an empty dictionary.
        """
        go_id_to_genes = {}
        # The context manager closes the file even when a line raises.
        with self._open() as fd:
            for line in fd:
                if line.startswith("!"):
                    continue
                words = line.rstrip('\n').split("\t")
                if len(words) < 13:
                    continue
                # Columns: db name, db id, gene name, qualifier, GO id,
                # evidence reference, evidence type, evidence id, aspect,
                # description, synonym, type, taxon id
                # (then date, curator, extension, form id -- unused).
                db_name, db_id, gene_name, qualifier, go_id = words[:5]
                evidence_type = words[6]
                taxon_id = words[12]
                if exclude_evidences is not None and evidence_type in exclude_evidences:
                    continue
                # Possible other qualifiers: colocalizes_with, contributes_to
                if "NOT" in qualifier.split('|'):
                    continue
                # Unrecognized database names become None.
                db_name = self.db_name_to_biana_name.get(db_name)
                tax_ids = [self._parse_taxon(taxon) for taxon in taxon_id.split('|')]
                if id_type == "uniprot":
                    if db_name != "UniprotAccession":
                        raise ValueError("Uknown db_name %s" % db_name)
                    identifier = db_id
                elif id_type == "genesymbol":
                    identifier = gene_name
                else:
                    raise ValueError("Unknown id_type %s" % id_type)
                genes = go_id_to_genes.setdefault(go_id, set())
                for tax_id in tax_ids:
                    genes.add((identifier, tax_id))
        return go_id_to_genes