-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheckexperiments.py
332 lines (306 loc) · 13.2 KB
/
checkexperiments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
"""\
Run status check on experiments.
Example.
%(prog)s --username ACCESS_KEY_ID --password SECRET_ACCESS_KEY \\
--out experiments_statuses.log https://www.encodeproject.org
"""
import datetime
import json
import sys
import copy
import os.path
import subprocess
from urllib.parse import urljoin
import requests
from slackclient import SlackClient
# The module docstring doubles as the epilog of the command-line help text
# (it is passed to argparse as ``epilog`` in main()).
EPILOG = __doc__
def run(out, err, url, username, password, search_query, accessions_list=None, bot_token=None, dry_run=False):
    """Check 'in progress' experiments and mark the qualifying ones 'submitted'.

    An experiment is promoted when all of the following hold:

    * its assay/award combination is covered by the read-depth table below;
    * every non-deleted replicate has at least one fastq file (not in an
      upload-error status) carrying a read count;
    * every replicate (biological replicate for DNase-seq / ChIP-seq style
      assays) reaches the minimal read depth for the assay;
    * the experiment page reports no ``ERROR`` audits.

    Args:
        out: Writable text file that receives the run banner and one
            tab-separated line per status change.  Closed at the end of a
            completed run.
        err: Writable text file that receives the run banner and one
            tab-separated line per excluded or failing experiment.  Closed
            at the end of a completed run.
        url: Base URL of the server to query and patch.
        username: HTTP basic-auth user name (access key id).
        password: HTTP basic-auth password (secret access key).
        search_query: Query-string fragment appended to the experiment
            search; ignored when ``accessions_list`` is given.
        accessions_list: Optional path to a text file with one experiment
            accession per line.  When truthy it replaces the search query.
        bot_token: Optional Slack bot token.  When set, the start/finish
            banners and both log files are posted to ``#bot-reporting``.
        dry_run: When true, every check is performed and reported but no
            PATCH request is sent to the server.

    Returns:
        None.  If a search request fails, the failure is written to ``err``
        and the function returns early without closing ``out``/``err``.
    """
    session = requests.Session()
    session.auth = (username, password)
    session.headers['Accept'] = 'application/json'

    dr = "-- Dry Run" if dry_run else ""
    version = '0.12'

    # The host name is only used for the banner, so any failure to obtain it
    # (non-zero exit status or missing ``hostname`` binary) is not fatal.
    try:
        ip_output = subprocess.check_output(
            ['hostname'], stderr=subprocess.STDOUT).strip()
        ip = ip_output.decode(errors='replace').rstrip('\n')
    except (subprocess.CalledProcessError, OSError):
        ip = ''

    initiating_run = 'STARTING Checkexperiments version ' + \
        '{} ({}) ({}): {} on {} at {}'.format(
            version, url, search_query, dr, ip, datetime.datetime.now())
    out.write(initiating_run + '\nAward\tAccession\tcurrent status -> new status\tsubmitted date\n')
    out.flush()
    err.write(initiating_run + '\nAward\tAccession\terror message\n')
    err.flush()

    if bot_token:
        sc = SlackClient(bot_token)
        sc.api_call(
            "chat.postMessage",
            channel="#bot-reporting",
            text=initiating_run,
            as_user=True
        )

    # we don't have at the moment minimal requirements for ChIA and HiC
    # that being the reason for specifying at least one read in each replicate
    minimal_read_depth_requirements = {
        'DNase-seq': 20000000,
        'genetic modification followed by DNase-seq': 20000000,
        'ChIP-seq': 20000000,
        'RAMPAGE': 10000000,
        'shRNA knockdown followed by RNA-seq': 10000000,
        'siRNA knockdown followed by RNA-seq': 10000000,
        'single cell isolation followed by RNA-seq': 10000000,
        'CRISPR genome editing followed by RNA-seq': 10000000,
        'modENCODE-chip': 500000,
        'ChIA-PET': 1,
        'HiC': 1
    }
    # Assays whose read depth is evaluated per biological replicate rather
    # than per replicate object.
    bio_rep_assays = (
        'DNase-seq',
        'genetic modification followed by DNase-seq',
        'ChIP-seq',
    )
    # fastq files in one of these statuses are ignored entirely.
    erroneous_status = ['uploading', 'content error', 'upload failed']

    graph = []
    if accessions_list:
        # checkexperiments using a file with a list of experiment accessions
        accessions = []
        if os.path.isfile(accessions_list):
            with open(accessions_list) as accessions_file:
                accessions = [line.strip() for line in accessions_file]
        for acc in accessions:
            if not acc:
                # Blank lines would turn into an empty ``accession=`` filter.
                continue
            r = session.get(
                urljoin(url, '/search/?field=@id&frame=object&limit=all&type=Experiment&accession=' + acc))
            try:
                r.raise_for_status()
            except requests.HTTPError as e:
                err.write('Search for accession {} failed: {}\n'.format(acc, e))
                err.flush()
                return
            graph.extend(r.json()['@graph'])
    else:
        # checkexperiments using a query
        r = session.get(
            urljoin(
                url,
                '/search/?type=Experiment'
                '&format=json&frame=object&limit=all&' + search_query))
        try:
            r.raise_for_status()
        except requests.HTTPError as e:
            err.write('Experiment search failed: {}\n'.format(e))
            err.flush()
            return
        graph = r.json()['@graph']

    for ex in graph:
        if ex['status'] != 'in progress':
            continue

        assay_term_name = ex.get('assay_term_name')
        exp_accession = ex.get('accession')
        award_request = session.get(urljoin(
            url,
            ex.get('award') + '?frame=object&format=json'))
        award_obj = award_request.json()
        award_rfa = award_obj.get('rfa')
        award_name = award_obj.get('name')

        # excluding all modERN, ENCORE experiments
        # and non ChIP modENCODE experiments from screening
        if (
                (assay_term_name not in minimal_read_depth_requirements) or
                (award_rfa == 'modERN') or (award_name == 'U41HG009889') or
                (award_rfa == 'modENCODE' and assay_term_name != 'ChIP-seq')):
            err.write(
                '{}\t{}\t{}\texcluded from automatic screening\n'.format(
                    award_rfa,
                    assay_term_name,
                    exp_accession)
            )
            err.flush()
            continue

        # Collect per-replicate read counts and the creation dates of the
        # fastq files that contribute to them.
        try:
            replicates = ex.get('replicates')
            if not replicates:
                continue

            replicates_set = set()
            submitted_replicates = set()
            replicates_reads = {}
            bio_rep_reads = {}
            replicates_bio_index = {}
            for replicate in replicates:
                replicate_request = session.get(urljoin(
                    url,
                    replicate + '?frame=object&format=json'))
                replicate_obj = replicate_request.json()
                if replicate_obj.get('status') not in ['deleted']:
                    replicate_id = replicate_obj.get('@id')
                    replicates_set.add(replicate_id)
                    replicates_reads[replicate_id] = 0
                    replicates_bio_index[replicate_id] = replicate_obj.get('biological_replicate_number')
                    bio_rep_reads[replicates_bio_index[replicate_id]] = 0

            exp_files = ex.get('files')
            if not exp_files:
                continue

            dates = []
            for file_acc in exp_files:
                file_request = session.get(urljoin(
                    url,
                    file_acc + '?frame=object&format=json'))
                file_obj = file_request.json()
                if file_obj.get('file_format') != 'fastq' or \
                        file_obj.get('status') in erroneous_status:
                    continue
                replicate_id = file_obj.get('replicate')
                read_count = file_obj.get('read_count')
                if not (read_count and replicate_id):
                    continue
                submitted_replicates.add(replicate_id)
                if replicate_id in replicates_reads:
                    if file_obj.get('run_type') == 'paired-ended':
                        # Halve the count for paired-ended files.  The
                        # original line used ``==`` here, which compared
                        # and discarded the result instead of assigning.
                        read_count = read_count // 2
                    replicates_reads[replicate_id] += read_count
                    bio_rep_reads[replicates_bio_index[replicate_id]] += read_count
                file_date = datetime.datetime.strptime(
                    file_obj['date_created'][:10], "%Y-%m-%d")
                dates.append(file_date)
        except requests.exceptions.RequestException as e:
            print(e)
            continue

        # Every non-deleted replicate must have contributed at least one
        # usable fastq file; otherwise the experiment is left alone.
        if not replicates_set or replicates_set - submitted_replicates:
            continue

        key = assay_term_name
        if award_rfa == 'modENCODE':
            key = 'modENCODE-chip'
        if assay_term_name in bio_rep_assays:
            replicates_reads = bio_rep_reads

        submitted_flag = True
        for rep in replicates_reads:
            if replicates_reads[rep] < minimal_read_depth_requirements[key]:
                # low read depth in replicate + details
                submitted_flag = False
                err.write(
                    '{}\t{}\t{}\t{}\treads_count={}\texpected count={}\n'.format(
                        award_rfa,
                        assay_term_name,
                        exp_accession,
                        rep,
                        replicates_reads[rep],
                        minimal_read_depth_requirements[key])
                )
                err.flush()
                break
        if not submitted_flag:
            continue

        # Fetch the full page frame, which carries the audit results.
        pass_audit = True
        try:
            audit_request = session.get(urljoin(
                url,
                '/' + exp_accession + '?frame=page&format=json'))
            try:
                audit_obj = audit_request.json().get('audit')
            except ValueError as e:
                err.write('{}\t{}\t{}\tValueError: {}\n'.format(
                    award_rfa,
                    assay_term_name,
                    exp_accession, e)
                )
                err.flush()
                # The audits could not be read, so the experiment must not
                # be promoted on the strength of an unverified audit state.
                continue
            else:
                # ``audit`` may be absent from the page; treat that the same
                # as "no ERROR audits" instead of failing on None.
                if audit_obj and audit_obj.get("ERROR"):
                    pass_audit = False
        except requests.exceptions.RequestException as e:
            print(e)
            continue

        if not pass_audit:
            err.write(
                '{}\t{}\t{}\taudit errors\n'.format(
                    award_rfa,
                    assay_term_name,
                    exp_accession)
            )
            err.flush()
            continue

        # The submission date is the newest creation date among the fastq
        # files that were counted above.
        submission_date = max(dates).strftime("%Y-%m-%d")
        if not dry_run:
            item_url = urljoin(url, exp_accession)
            data = {
                "status": "submitted",
                "date_submitted": submission_date
            }
            r = session.patch(
                item_url,
                data=json.dumps(data),
                headers={
                    'content-type': 'application/json',
                    'accept': 'application/json'
                },
            )
            if not r.ok:
                print('{} {}\n{}'.format(r.status_code, r.reason, r.text))
                continue
        # In a dry run the line reports the change that would have been
        # made; the run banner already marks the log as a dry run.
        out.write(
            '{}\t{}\t{}\t{}\t-> submitted\t{}\n'.format(
                award_rfa,
                assay_term_name,
                exp_accession,
                ex['status'],
                submission_date)
        )
        out.flush()

    finishing_run = 'FINISHED Checkexperiments at {}'.format(datetime.datetime.now())
    out.write(finishing_run + '\n')
    out.flush()
    # Remember the file names before closing so the logs can be re-read and
    # uploaded to Slack below.
    output_filename = out.name
    out.close()
    error_filename = err.name
    err.close()

    if bot_token:
        with open(output_filename, 'r') as output_file:
            sc.api_call("files.upload",
                        title=output_filename,
                        channels='#bot-reporting',
                        content=output_file.read(),
                        as_user=True)
        with open(error_filename, 'r') as output_file:
            sc.api_call("files.upload",
                        title=error_filename,
                        channels='#bot-reporting',
                        content=output_file.read(),
                        as_user=True)
        sc.api_call(
            "chat.postMessage",
            channel="#bot-reporting",
            text=finishing_run,
            as_user=True
        )
def main():
    """Command-line entry point: parse the options and forward them to run().

    The option names map one-to-one onto the keyword parameters of run(),
    so the parsed namespace is passed through unchanged.
    """
    import argparse

    # (flags, keyword arguments) for every optional argument, in the order
    # they should appear in the help output.
    option_table = [
        (('--username', '-u'),
         {'default': '', 'help': "HTTP username (access_key_id)"}),
        (('--bot-token',),
         {'default': '', 'help': "Slack bot token"}),
        (('--password', '-p'),
         {'default': '', 'help': "HTTP password (secret_access_key)"}),
        (('--out', '-o'),
         {'type': argparse.FileType('w'), 'default': sys.stdout,
          'help': "file to write json lines of results with or without errors"}),
        (('--err', '-e'),
         {'type': argparse.FileType('w'), 'default': sys.stderr,
          'help': "file to write json lines of results with errors"}),
        (('--dry-run',),
         {'action': 'store_true', 'help': "Don't update status, just check"}),
        (('--search-query',),
         {'default': 'status=in progress',
          'help': "override the experiment search query, e.g. 'accession=ENCSR000ABC'"}),
        (('--accessions-list',),
         {'default': '', 'help': "list of experiment accessions to check"}),
    ]

    cli = argparse.ArgumentParser(
        description="Update experiments status",
        epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)
    # The server URL is the single positional argument.
    cli.add_argument('url', help="server to post to")

    options = cli.parse_args()
    run(**vars(options))
# Invoke the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()