-
Notifications
You must be signed in to change notification settings - Fork 0
/
parser.py
123 lines (109 loc) · 4.27 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import os
import re
import requests
from collections import defaultdict
from biothings.utils.dataload import dict_sweep, value_convert_to_number, unlist, open_anyfile
class DictQuery(dict):
"""Parse nested dictionary
"""
def get(self, path, default=None):
"""Extract value from dictionary based on path
"""
keys = path.split("/")
val = None
for key in keys:
if val:
if isinstance(val, list):
val = [v.get(key, default) if v else None for v in val]
else:
val = val.get(key, default)
else:
val = dict.get(self, key, default)
if not val:
break
return val
def batch_query_myvariant_id_from_clingen(hgvs_ids, assembly):
"""Query ClinGen to get myvariant IDs for all input non genomic hgvs IDs
Keyword arguments:
hgvs_ids -- list of non genomic hgvs IDs
assembly -- genomic assembly, either hg19 or hg38
"""
def parse_myvariant_ids(doc, assembly):
"""Parse the results from clingen to retrieve myvariant id
Keyword arguments:
doc -- doc retrieved from clingen
"""
ASSEMBLY_MAPPING = {
"hg19": "MyVariantInfo_hg19",
"hg38": "MyVariantInfo_hg38"
}
extract_path = 'externalRecords/' + ASSEMBLY_MAPPING[assembly]
res = DictQuery(doc).get(extract_path)
if res:
return [_doc['id'] for _doc in res if _doc]
else:
return []
hgvs_dict = {}
hgvs_ids = list(set(hgvs_ids))
print('total hgvs ids to process is: {}'.format(len(hgvs_ids)))
for i in range(0, len(hgvs_ids), 1000):
print('currently processing {}th variant'.format(i))
if i + 1000 <= len(hgvs_ids):
batch = hgvs_ids[i: i + 1000]
else:
batch = hgvs_ids[i:]
data = '\n'.join(batch)
res = requests.post('http://reg.genome.network/alleles?file=hgvs',
data=data,
headers={'content-type': "text/plain" }).json()
# loop through clingen results and input hgvs id in parallel
# construct a mapping dictionary with key as input hgvs id
# and value as myvariant hgvs id
for _doc, _id in zip(res, batch):
hgvs_dict[_id] = parse_myvariant_ids(_doc, assembly)
return hgvs_dict
def _map_line_to_json(fields):
"""Mapping each lines in csv file into JSON doc
"""
one_snp_json = {
"gene": fields[1],
"variant_id": fields[2],
"exon": fields[3],
"egl_variant": fields[4],
"egl_protein": fields[5],
"egl_classification": fields[6],
"egl_classification_date": fields[7],
"hgvs": fields[8].split(" | ")
}
return unlist(dict_sweep(value_convert_to_number(one_snp_json), vals=[""]))
def load_data(data_folder, assembly="hg19"):
"""Load data from EMV csv file into list of JSON docs
"""
input_file = os.path.join(data_folder, "EmVClass.2018-Q2.csv")
assert os.path.exists(input_file), "Can't find input file '%s'" % input_file
with open_anyfile(input_file) as in_f:
lines = set(list(in_f))
lines = [_doc.strip().split(',') for _doc in lines]
print(list(lines)[0])
results = defaultdict(list)
# mapping non genomic hgvs ids to genomic hgvs ids used in MyVariant
hgvs_ids = [_item[4] for _item in lines]
#print(hgvs_ids)
hgvs_mapping_dict = batch_query_myvariant_id_from_clingen(hgvs_ids, assembly)
# loop through csv doc to convert into json docs
for row in lines:
# structure the content of emv docs
variant = _map_line_to_json(row)
# fetch corresponding genomic hgvs ids
mapped_ids = hgvs_mapping_dict[row[4]]
# could be one non-genomic hgvs id mapping to mulitple genomic ones
if mapped_ids:
for _id in mapped_ids:
results[_id].append(variant)
for k, v in results.items():
if len(v) == 1:
doc = {'_id': k, 'emv': v[0]}
else:
doc = {'_id': k, 'emv': [_doc for _doc in v]}
print('case of multi hits', doc)
yield doc