#!/usr/bin/env python3
#############################################################################
# Copyright 2020 Simon Andrews
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
############################################################################
import sys
import subprocess
import os
import csv
import argparse
import glob
import urllib.request
import re
import time
import threading
import select
from ftplib import FTP
VERSION = "3.11"
# These are the symbols we're going to need to remove
# from any proposed file names
invalid_characters = [" ","/",">","<","|","\"","?","*","(",")","\\","[","]","{","}","!","&","#"]
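# Overview (added for orientation): samples come from an SRA run table, a plain
# list of accessions, or a single bare accession. Each run is fetched from ENA
# first (FTP by default, or wget/axel), falling back to NCBI's fasterq-dump if
# the ENA download fails. Both routes retry up to --retries times.
# Example invocations (SRR1234567 is an illustrative accession, not a real dataset):
#
#   sradownloader SraRunTable.txt --outdir fastq --threads 4
#   sradownloader SRR1234567 --nogeo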
def download_sample_ena(sample, options):
    if not options.quiet:
        print(f"[ENA] Downloading {sample['accession']} into {sample['file_base']}", flush=True)
        if options.wget:
            print("Downloading with wget", flush=True)
        elif options.axel:
            print(f"Downloading with axel using {options.axel_num_connections} connections", flush=True)
        else:
            print("Downloading with FTP", flush=True)

    # Get the FTP locations for the sample
    ena_rest_url = f"https://www.ebi.ac.uk/ena/portal/api/filereport?accession={sample['accession']}&result=read_run&fields=run_accession,fastq_ftp"

    with urllib.request.urlopen(ena_rest_url) as response:
        if response.getcode() != 200:
            raise IOError(f"[ENA] Got error code {response.getcode()} from ENA REST for accession {sample['accession']}")

        url_list = []
        found_accession_line = False

        for line in response:
            line = line.decode("UTF-8").strip()
            if not options.quiet:
                print(line)

            if line.startswith("SRR") or line.startswith("ERR"):
                found_accession_line = True
                sections = line.split("\t")
                if len(sections) != 2:
                    raise IOError(f"[ENA] No ENA fastq files found for accession {sample['accession']}")

                url_list = sections[1].split(";")
                break

        if not found_accession_line:
            raise IOError(f"[ENA] Found no accession in response from ENA REST for accession {sample['accession']}")
    for url in url_list:
        # We need to extract the various sections from the URL. This will be the server,
        # the folder and the file
        url_sections = url.split("/")
        server = url_sections[0]
        file = url_sections[-1]
        folder = "/".join(url_sections[1:-1])
        #print(f"URL is {url} server={server} folder={folder} file={file}")

        # Work out the output file name. We're assuming there's never
        # going to be more than 2 files per sample
        if url.endswith("_2.fastq.gz"):
            outfile = sample['file_base'] + "_2.fastq.gz"
        else:
            outfile = sample['file_base'] + "_1.fastq.gz"

        if options.outdir != ".":
            outfile = f"{options.outdir}/{outfile}"

        # Check if the output file already exists. If it does then don't overwrite
        # unless --force has been specified
        if os.path.exists(outfile) and os.path.getsize(outfile) > 0:
            if not options.force:
                if not options.quiet:
                    print(f"Skipping {url} as outfile {outfile} exists (override with --force)", flush=True)
                continue

        attempt_number = 1

        while (attempt_number <= options.retries):
            if not options.quiet:
                print(f"[ENA Attempt {attempt_number}] Downloading {url} into {outfile}", flush=True)

            # For subsequent retries we'll add an increasing delay so we
            # can try to wait past temporary glitches
            sleep_time = attempt_number**2
            if sleep_time > 30:
                sleep_time = 30

            if attempt_number > 1:
                print(f"Sleeping for {sleep_time} seconds before retrying")
                time.sleep(sleep_time)
            try:
                if options.ftp and not (options.wget or options.axel):
                    ftp = FTP(server)
                    ftp.login()
                    ftp.cwd(folder)

                    with open(outfile, 'wb') as ftp_out:
                        sock = ftp.transfercmd('RETR ' + file)
                        sock.setblocking(0)

                        # Stream the data connection in a background thread.
                        # select() gives a 30 second timeout so a stalled
                        # transfer raises an error instead of hanging forever.
                        def background():
                            while True:
                                ready = select.select([sock], [], [], 30)
                                if ready[0]:
                                    block = sock.recv(1024*1024)
                                    if not block:
                                        break
                                    ftp_out.write(block)
                                else:
                                    raise Exception("No data before timeout")
                            sock.close()

                        t = threading.Thread(target=background)
                        t.start()

                        # Keep the control connection alive with NOOPs while
                        # the background transfer is running
                        while t.is_alive():
                            t.join(60)
                            ftp.voidcmd('NOOP')

                        ftp.quit()

                elif options.wget:
                    url = "http://" + url if not url.startswith("http://") else url
                    wget_cmd = ["wget", "-O", outfile, "-q", url]
                    result = subprocess.run(wget_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                    if result.returncode != 0:
                        raise Exception(f"wget failed: {result.stderr.decode('utf-8')}")

                elif options.axel:
                    axel_cmd = ["axel", "-q", "-n", str(getattr(options, 'axel_num_connections', 5)), "-o", outfile, url]
                    result = subprocess.run(axel_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                    if result.returncode != 0:
                        raise Exception(f"Axel failed: {result.stderr.decode('utf-8')}")

                # Check for zero-sized file
                if os.stat(outfile).st_size == 0:
                    os.unlink(outfile)
                    raise IOError("Received zero sized file")

            except Exception as ex:
                print(ex)
                attempt_number += 1
                if attempt_number > options.retries:
                    raise IOError(f"[ENA] Download repeatedly failed for {sample['accession']} - giving up")
                else:
                    print(f"[ENA] Download failed for {sample['accession']} - going around for another try", flush=True)
                continue

            # It worked
            break
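# Fallback route: shell out to sratoolkit's fasterq-dump to fetch the run from
# NCBI, retrying on failure, then gzip the resulting fastq files.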
def download_sample_ncbi(sample, options):
    if not options.quiet:
        print(f"[NCBI] Downloading {sample['accession']} into {sample['file_base']}", flush=True)

    # We add the --include-technical because 10X runs mark the essential barcode
    # reads as technical and don't download them otherwise.
    command_options = [options.fqdump, "--split-files", "--include-technical", "--threads", options.threads, "--temp", options.outdir, "--outfile", sample["file_base"]+".fastq"]

    # If they're attached to a terminal and they're not being quiet then we'll show progress
    if not options.quiet:
        if sys.stdin.isatty():
            command_options.append("--progress")

    # If they're not using the current directory as output we'll say where it should go
    if options.outdir != ".":
        command_options.append("--outdir")
        command_options.append(options.outdir)

    # Finally add the accession
    command_options.append(sample['accession'])

    command_options = [str(x) for x in command_options]

    attempt_number = 1

    while (attempt_number <= options.retries):
        if not options.quiet:
            print(f"[NCBI Attempt {attempt_number}] Running: "+" ".join(command_options), flush=True)

        result = subprocess.run(command_options, check=False)

        if result.returncode != 0:
            if attempt_number == options.retries:
                attempt_number += 1
                raise IOError(f"[NCBI] Sorry SRAtoolkit just isn't having it, giving up on {sample['accession']}")
            else:
                print(f"[NCBI] SRAtoolkit failed us - going around for another try", flush=True)
                attempt_number += 1
                continue
        else:
            # Amazingly it worked
            break

    if attempt_number > options.retries:
        # This failed so don't try to do anything else with it
        return

    # Now find the files which were created and compress them
    downloaded_files = glob.glob(options.outdir+"/"+sample['file_base']+"*.fastq")

    if len(downloaded_files) == 0:
        raise IOError(f"[NCBI] Got no files for accession {sample['accession']}")

    for file in downloaded_files:
        if not options.quiet:
            print("Compressing "+file, flush=True)
        subprocess.run(["gzip", "-4", file], check=True)
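# Ask the SRA text view for the run's "Title:" line and turn it into a
# filesystem-friendly sample name; returns "" if no title is found.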
def get_geo_name(srr_number, options):
    sample_name = ""

    if not options.quiet:
        print(f"Trying to get name for {srr_number} from GEO", flush=True)

    with urllib.request.urlopen(f"https://www.ncbi.nlm.nih.gov/sra/?term={srr_number}&format=text") as response:
        for line in response:
            line = line.decode("UTF-8")
            if line.startswith("Title:"):
                line = line.strip()
                geosections = re.split("[:; ,/]+", line)
                sample_name = "_".join(geosections[1:])
                break

    return sample_name
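# Build the list of samples to download. Each sample is a dict with an
# 'accession' and a 'file_base' (the accession plus an optional cleaned-up name).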
def read_samples(options):
    if not options.quiet:
        print(f"Reading samples from {options.runtable}", flush=True)

    sample_file = options.runtable

    samples = []
    headers = []
    name_index = 0

    # The sample_file might be one of three things
    # 1. An SRA RunTable file
    # 2. A simple file of SRR/ERR accessions
    # 3. An SRR/ERR accession (not a file name)
    #
    # We need to account for all of these

    try:
        with open(sample_file) as fh:
            csv_fh = csv.reader(fh)

            name_index = -1

            for sections in csv_fh:
                # Skip blank lines
                if len(sections) == 0:
                    continue

                sections[0] = sections[0].strip()

                # The first line should be the headers. We'll check for the name
                # so we don't inadvertently skip a data line if the header is missing
                if len(headers) == 0 and sections[0] == "Run":
                    headers = sections
                    if "source_name" in headers:
                        name_index = headers.index("source_name")
                    continue

                sample_name = ""

                # Check to see if there's another set of headers in the middle of the file
                if sections[0] == "Run":
                    continue

                # See if we need to look up a nice name from GEO (SRA)
                if not options.nogeo:
                    sample_name = get_geo_name(sections[0], options)

                if sample_name == "":
                    if name_index >= 0:
                        sample_name = sections[name_index].strip()

                for bad_char in invalid_characters:
                    sample_name = sample_name.replace(bad_char, "_")

                # Add a leading underscore to separate it from the accession
                sample_name = "_"+sample_name

                # Clean up any runs of underscores
                while "__" in sample_name:
                    sample_name = sample_name.replace("__", "_")

                if not options.quiet:
                    print(f"Found sample {sections[0]} with basename {sections[0]}{sample_name}", flush=True)

                sample = {
                    "accession": sections[0],
                    "file_base": sections[0] + sample_name
                }

                samples.append(sample)

    except FileNotFoundError as err:
        # This could be a bare accession
        if sample_file.startswith("SRR") or sample_file.startswith("ERR"):
            sample_name = ""
            if not options.nogeo:
                sample_name = get_geo_name(sample_file, options)

            if len(sample_name) > 0:
                sample_name = "_"+sample_name

            samples.append({
                "accession": sample_file,
                "file_base": sample_file + sample_name
            })
        else:
            raise err

    return samples
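# Parse the command line, then probe for fasterq-dump and gzip; if either is
# missing the NCBI route is disabled, and if both routes are unavailable we
# exit with an error. Also creates the output directory if needed.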
def read_options():
    parser = argparse.ArgumentParser(description="Download data from the SRA")

    parser.add_argument('--quiet', dest="quiet", action='store_true', default=False, help="Suppress all but essential messages")
    parser.add_argument('--version', action='version', version=f"SRA downloader v{VERSION}")
    parser.add_argument('--outdir', type=str, help="Folder to save data to (default .)", default=".")
    parser.add_argument('--threads', type=int, help="Number of threads (default 1)", default=1)
    parser.add_argument('--retries', type=int, help="Number of times we'll retry a download before giving up (default 5)", default=5)
    parser.add_argument('--force', dest="force", action='store_true', default=False, help="Overwrite output files even if they exist")
    parser.add_argument('--fqdump', type=str, help="Path to the fastq dump program (default fasterq-dump)", default="fasterq-dump")
    parser.add_argument('--axel-connections', dest="axel_num_connections", type=int, help="Number of connections to use with axel (default 5)", default=5)
    parser.add_argument('--ftp', dest="ftp", action='store_true', help="Use FTP to download from ENA (default)", default=True)
    parser.add_argument('--wget', dest="wget", action='store_true', help="Use wget to download from ENA")
    parser.add_argument('--axel', dest="axel", action='store_true', help="Use axel to download from ENA")
    parser.add_argument('--nogeo', dest="nogeo", action='store_true', help="Disable sample name lookup from GEO")
    parser.add_argument('--noena', dest="noena", action='store_true', help="Don't try downloading from ENA")
    parser.add_argument('--noncbi', dest="noncbi", action='store_true', help="Don't try downloading from NCBI")
    parser.add_argument('runtable', type=str, help="The SraRunTable.txt file from the SRA run selector")

    options = parser.parse_args()

    if not options.noncbi:
        # Can we find fasterq-dump
        if not options.quiet:
            print("Testing for fasterq-dump at "+options.fqdump, flush=True)

        try:
            subprocess.run([options.fqdump, "--version"], check=True, stdout=subprocess.DEVNULL)
            if not options.quiet:
                print("Found fasterq-dump at "+options.fqdump, flush=True)
        except Exception:
            print("WARNING: Couldn't find fasterq-dump at "+options.fqdump+". Please ensure that sratoolkit is downloaded and that you've run vdb-config if you want to download from NCBI", file=sys.stderr, flush=True)
            options.noncbi = True

        # Can we find gzip
        if not options.quiet:
            print("Testing for gzip in the path", flush=True)

        try:
            subprocess.run(["gzip", "--version"], check=True, stdout=subprocess.DEVNULL)
            if not options.quiet:
                print("Found gzip", flush=True)
        except Exception:
            print("WARNING: Couldn't find gzip in the path - can't download from NCBI without this.", file=sys.stderr, flush=True)
            options.noncbi = True

    if options.noncbi and options.noena:
        print("ERROR: Both NCBI and ENA download options are unavailable. Giving up", file=sys.stderr, flush=True)
        sys.exit(1)

    # Create the output directory if it's not there already
    if not os.path.isdir(options.outdir):
        os.makedirs(options.outdir)

    return options
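# Main driver: try ENA for each sample, fall back to NCBI on failure, and
# report a per-accession SUCCEEDED/FAILED summary at the end.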
def main():
    options = read_options()
    samples = read_samples(options)

    results = {}

    for sample in samples:
        try:
            if options.noena:
                raise Exception("ENA disabled with --noena, not trying that.")
            download_sample_ena(sample, options)
            results[sample['accession']] = "SUCCEEDED"
        except Exception as ex:
            if options.noncbi:
                print(f"WARNING: Failed to download via ENA: {ex}. Can't try NCBI - giving up - sorry.", file=sys.stderr, flush=True)
                results[sample['accession']] = "FAILED"
            else:
                print(f"WARNING: Failed to download via ENA: {ex} trying NCBI instead", file=sys.stderr, flush=True)
                try:
                    download_sample_ncbi(sample, options)
                    results[sample['accession']] = "SUCCEEDED"
                except Exception as ex2:
                    print(f"WARNING: Failed to download via NCBI: {ex2}. Giving up - sorry.", file=sys.stderr, flush=True)
                    results[sample['accession']] = "FAILED"

    if not options.quiet:
        print("\nAll done!\n\nRESULTS\n-------", flush=True)

    for accession in sorted(results.keys()):
        print(f"{accession}:\t{results[accession]}", flush=True)


if __name__ == "__main__":
    main()