forked from nmdp-bioinformatics/liftoverGL
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathliftovergl.py
executable file
·312 lines (273 loc) · 10.5 KB
/
liftovergl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
#!/usr/bin/env python3
"""
pyliftgl.py
copyright
Bob Milius, PhD - 31 March 2017
National Marrow Donor Program/Be The Match
Lifts over a GL String from one version of the IMGT/HLA database to another.
usage:
liftovergl.py [-h] [-g GLSTRING | -u URI | -j JSONFILE] [-s SOURCE] [-t TARGET]
optional arguments:
-h, --help show this help message and exit
-g GLSTRING, --glstring GLSTRING GL String to be converted
-u URI, --uri URI GL Service URI of GL String
-f JSONFILE, --jsonfile JSONFILE input file containing JSON
-s SOURCE, --source SOURCE Source IMGT/HLA version, e.g., '3.0.0'
-t TARGET, --target TARGET Target IMGT/HLA version, e.g. '3.25.0'
This program uses AllelelistGgroups_history.txt file that was created by
merging data from the Allelellist_history.txt, hla_ambigs.xml, and
hla_nom_g.txt files that can be obtained from
https://github.com/ANHIG/IMGTHLA
Alleles and G-groups are converted to accession #s, and then mapped to
directly to alleles in the target version. If alleles do not exist in the
target, then they are dropped from the target GL String. Alleles are not
expanded to include new alleles. eg., the following alleles exist for
3.20.0 and 3.24.0:
3.20.0
HLA00053 HLA-A*24:03:01
3.24.0
HLA00053 HLA-A*24:03:01:01
HLA14804 HLA-A*24:03:01:02
When HLA-A*24:03:01 is converted from 3.20.0 to 3.25.0, it gets assigned to
HLA-A*24:03:01:01
and not
HLA-A*24:03:01:01/HLA-A*24:03:01:02
See the README.rst file for more information.
"""
import argparse
import json
import os
import pandas as pd
import re
import requests
import sys
def read_history():
"""
reads the AllelelistGgroups_history.txt file that was downloaded from
https://github.com/ANHIG/IMGTHLA
into a pandas dataframe.
"""
history_file = os.environ["IMGTHLA"] + "/AllelelistGgroups_history.txt"
try:
history = pd.read_csv(history_file,
sep="\t", header=0, index_col=0, dtype=str)
return history
except:
print("could not read {}".format(history_file))
sys.exit()
class Ret:
def __init__(self, retstring, errorflag):
self.retstring = retstring
self.errorflag = errorflag
def err(self, retstring):
self.retstring = retstring
self.errorflag = 1
def mk_glids(glstring, version, history):
"""
takes a GL String containing allele names for a version of
the IMGT/HLA database, and returns a GL String with HLA_IDs
substituted for the allele names
"""
# the history file strips the '.' from the version, and pads the
# middle field to two spaces, so '3.1.0' becomes '3010'
v = version.split(".")
vers = "".join([v[0], v[1].zfill(2), v[-1]])
for allele in get_alleles(glstring):
hla_id = history[history[vers] == allele[4:]].index.tolist()
if len(hla_id) == 1:
glstring = glstring.replace(allele, hla_id[0])
elif len(hla_id) == 0:
return Ret("{} does not exist in IMGT/HLA ver {}".format(allele, version),1)
else:
return Ret("{} has more than one id: {}".format(allele, hla_id),1)
return Ret(glstring,0)
def mk_target(gl_ids, target, history):
"""
takes a GL String made up of HLA IDs, and converts it to
one containing allele names for a specific IMGT/HLA version
"""
# the history file strips the '.' from the version, and pads the
# middle field to two spaces, so '3.1.0' becomes '3010'
t = target.split(".")
targ = "".join([t[0], t[1].zfill(2), t[-1]])
target_gl = gl_ids
for hla_id in get_alleles(gl_ids):
new_allele = "HLA-" + str(history[targ][hla_id])
target_gl = target_gl.replace(hla_id, new_allele)
target_gl = gl_clean(target_gl)
return target_gl
def gl_clean(glstring):
"""
takes a GL String that has just been lifted to another version,
and removes deleted alleles, and cleans up resulting
delimiters left behind
"""
# remove non-existant alleles
glstring = glstring.replace("HLA-nan", "")
# after deleting the above, delimiters may be packed together. These have
# to removed when they are at the beginning or end of the GL String, or
# resolved to the highest precedence when they are in the middle of the
# GL String
# ---
# trailing delimiters
glstring = re.sub(r'[/~+|^]+$', '', glstring)
# leading delimiters
glstring = re.sub(r'^[/~+|^]+', '', glstring)
# the rest in order of precedence
glstring = re.sub(r'[/~+|^]*\^[/~+|^]*', '^', glstring)
glstring = re.sub(r'[/~+|]*\|[/~+|]*', '|', glstring)
glstring = re.sub(r'[/~+]*\+[/~+]*', '+', glstring)
glstring = re.sub(r'[/~]*\~[/+]*', '~', glstring)
glstring = re.sub(r'\/+', '/', glstring)
return glstring
def get_alleles(glstring):
"""
Takes a GL String, and returns a set containing all the alleles
"""
alleles = set()
print ("get_alleles:", glstring)
for allele in re.split(r'[/~+|^]', glstring):
alleles.add(allele)
return alleles
def get_resource(glstring):
"""
takes a GL String, and returns the resource type based on which
delimiters are present
"""
if "^" in glstring:
return "multilocus-unphased-genotype"
elif "|" in glstring:
return "genotype-list"
elif "+" in glstring:
return "genotype"
elif "~" in glstring:
return "haplotype"
elif "/" in glstring:
return "allele-list"
else:
return "allele"
def post_gl(glstring, version, resource):
"""
takes a GL String and version, and posts it to the gl.nmdp.org
and returns a dictionary containing the status code, the
response text, and the response location if successful
"""
# url = 'http://gl.nmdp.org/imgt-hla/' + version + "/" + resource
url = 'http://gl.nmdp.org/imgt-hla/{}/{}'.format(version, resource)
headers = {'content-type': 'plain/text'}
response = requests.post(url, data=glstring, headers=headers)
if response.status_code == 201:
return {'status_code': response.status_code,
'text': response.text,
'location': response.headers['location']}
else:
return {'status_code': response.status_code,
'text': response.text}
def from_source_uri(source_uri):
"""
extracts the source IMGT/HLA version and resource from the source_uri,
then POSTs the uri to retrieve the glstring,
and returns the glstring, source, and resource in a tuple
"""
uri_fields = list(filter(None, source_uri.split('/')))
source = uri_fields[-3]
resource = uri_fields[-2]
response = requests.get(source_uri)
if response.status_code == 200:
glstring = response.text
return (glstring, source, resource)
else:
print("status_code = {}".format(response.status_code))
print("text = {}".format(response.text))
sys.exit()
def build_output(s_response, t_response):
"""
creates the dictionary which will be the output for the program
"""
output = {
'sourceGl': s_response['text'],
'sourceUri': s_response['location'],
'targetGl': t_response['text'],
'targetUri': t_response['location'],
}
return output
def main():
parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group()
group.add_argument("-g", "--glstring",
help="GL String to be converted",
type=str)
group.add_argument("-u", "--uri",
help="GL Service URI of GL String",
type=str)
group.add_argument("-f", "--jsonfile",
help="input file containing JSON",
type=str)
parser.add_argument("-s", "--source",
help="Source IMGT/HLA version, e.g., '3.0.0'",
type=str)
parser.add_argument("-t", "--target",
help="Target IMGT/HLA version, e.g. '3.25.0'",
type=str)
args = parser.parse_args()
args = parser.parse_args()
if (args.glstring is None) and (args.uri is None) and (args.jsonfile is None):
parser.error("at least one of -g/--glstring or -u/--uri required or -f/--jsonfile")
if (args.uri) and (args.target is None):
parser.error("If you specify URI, you need also need to specify "
"a -t/--target")
if (args.glstring) and ((args.target is None) or (args.source is None)):
parser.error("If you specify GLSTRING, you need to also specify "
"both -s/--source and -t/--target")
if (args.uri) and (args.source):
print("Warning: source will be obtained from the URI; your specified "
"-s/--source will be ignored")
source_uri = args.uri
glstring = args.glstring
jsonfile = args.jsonfile
source = args.source
target = args.target
# handle input options
if jsonfile:
# same json format used by gl.nmdp.org/liftover
with open(jsonfile, 'r') as jf:
data = json.load(jf)
target = list(filter(None, data['targetNamespace'].split("/")))[-1]
source_uri = data['sourceUri']
glstring, source, resource = from_source_uri(source_uri)
elif glstring:
# get source and target from command line
resource = get_resource(glstring)
elif source_uri:
# get target from command line
glstring, source, resource = from_source_uri(source_uri)
history = read_history()
# print("glstring =", glstring)
# print("source =", source)
# print("target =", target)
r = mk_glids(glstring, source, history)
if r.errorflag:
print (r.retstring)
sys.exit()
gl_ids = r.retstring
# print("IDs =", gl_ids, "\n")
target_gl = mk_target(gl_ids, target, history)
if not target_gl:
print("empty target GL String, all alleles dropped")
sys.exit()
# print("target GL = {}\n".format(target_gl))
s_response = post_gl(glstring, source, resource)
# print("source location = {}\n".format(s_response['location']))
t_response = post_gl(target_gl, target, resource)
outputd= {
'sourceGl': s_response['text'],
'sourceUri': s_response['location'],
'targetGl': t_response['text'],
'targetUri': t_response['location'],
}
output = json.dumps(outputd, sort_keys=True, indent=4)
# output = json.dumps(build_output(s_response, t_response),
# sort_keys=True, indent=4)
print(output)
if __name__ == "__main__":
main()