JGI_DL.py
#!/usr/bin/env python
"""Downloads soybean expression data from JGI."""
# Import python modules here
from xml.etree import ElementTree
import os
import subprocess
import tempfile
import getpass
import hashlib
import sys

def calculate_md5(fname, blocksize=8192):
    """A helper function to calculate MD5 sums to make sure we grab files
    without corruption."""
    # Start the md5 digest object. The blocksize just controls how much of
    # the file is read into memory at a time.
    md5 = hashlib.md5()
    # Next we read in the file, blocksize bytes at a time, and feed it to md5.
    # We have to use the binary read option here.
    with open(fname, 'rb') as f:
        while True:
            chunk = f.read(blocksize)
            # If this is empty, then we have reached the end of the file
            if not chunk:
                break
            # Tack the chunk onto the md5 object
            md5.update(chunk)
    # Then, return the hexadecimal hash (this is the one that is transmitted
    # as plain text)
    return md5.hexdigest()
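
# The hex digest returned above is compared against the 'md5' attribute that
# JGI reports in its directory XML (see download_files() below). For example,
# with hypothetical file and variable names:
#   calculate_md5('GlymaxAtlaPlate2.anqrpt.fastq.gz') == md5_from_xml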

def jgi_login(u, p):
    """A function to log in to JGI and return a cookie file for use in XML and
    file downloading."""
    # Define a cookie file
    cookie_file = tempfile.NamedTemporaryFile(
        mode='w+t',
        prefix='JMM_JGI_Cookie_',
        suffix='.txt',
        delete=False)
    # Log in to JGI
    cmd = [
        'curl',
        'https://signon.jgi.doe.gov/signon/create',
        '--data-urlencode',
        'login=' + u,
        '--data-urlencode',
        'password=' + p,
        '-c',
        cookie_file.name
    ]
    # Open a write handle to /dev/null; we send cURL output there.
    dnull = open(os.devnull, 'w')
    # Then we execute the command. Name the process object 'proc' so that it
    # does not shadow the password argument 'p'.
    proc = subprocess.Popen(
        cmd,
        shell=False,
        stdout=dnull,
        stdin=subprocess.PIPE
    )
    out, err = proc.communicate()
    # Close the output handle to /dev/null
    dnull.close()
    return cookie_file
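
# Note: the cookie file above is created with delete=False so that the later
# cURL calls in parse_xml() and download_files() can read it by name; main()
# removes it once all downloads finish.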

def parse_xml(x, cookie, flt):
    """Uses the cookie generated by cURL to fetch an XML, and applies a file
    name filter to only select certain files for download."""
    # Create another temporary named file for the XML output
    xml_out = tempfile.NamedTemporaryFile(
        mode='w+t',
        prefix='JMM_JGI_XML_',
        suffix='.xml',
        delete=False)
    # Use cURL to download the XML, passing the cookies we generated
    # earlier to authenticate.
    cmd = [
        'curl',
        x,
        '-b',
        cookie.name,
        '-o',
        xml_out.name
    ]
    # Execute the command
    p = subprocess.Popen(
        cmd,
        shell=False,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    out, err = p.communicate()
    # Then, read the XML back from the file
    xml = xml_out.read()
    # Convert it to an element tree for easy access
    xml_tree = ElementTree.fromstring(xml)
    # Get all files, and start empty lists to house the URLs and their
    # checksums
    urls = []
    md5s = []
    for elem in xml_tree.findall('.//file'):
        # If the URL contains the filter string, then save it along with its
        # MD5 checksum. The '' default guards against <file> elements that
        # carry no url attribute.
        if flt in elem.attrib.get('url', ''):
            url = elem.attrib.get('url')
            md5 = elem.attrib.get('md5')
            urls.append(url)
            md5s.append(md5)
    # Close and delete the XML temp file
    xml_out.close()
    os.remove(xml_out.name)
    return zip(urls, md5s)
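
# For reference, parse_xml() above assumes the directory XML contains <file>
# elements with at least 'url' and 'md5' attributes, roughly of the form
# (illustrative sketch, not an actual JGI record):
#   <file url="/..." md5="..." />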

def download_files(url_list, cookie):
    """Download a list of URLs, using a cookie file."""
    # The download base URL
    dl_base = 'http://genome.jgi.doe.gov'
    # For each URL...
    for u, m in url_list:
        # A flag for if we have successfully gotten the file. If the file
        # passes the MD5 check, we'll flip this to True.
        pass_check = False
        while not pass_check:
            # The local name to save it as
            local_name = u.split('/')[-1]
            # And build the command to download it
            cmd = [
                'curl',
                dl_base + u,
                '-b',
                cookie.name,
                '-o',
                local_name
            ]
            # Then download it
            sys.stderr.write('Downloading ' + u + ' ...')
            p = subprocess.Popen(
                cmd,
                shell=False,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE
            )
            out, err = p.communicate()
            sys.stderr.write(' Done!\n')
            # Check the MD5
            sys.stderr.write('Checking MD5 of ' + local_name + ' ...')
            calc_md5 = calculate_md5(local_name)
            if m == calc_md5:
                sys.stderr.write(' Pass!\n')
                pass_check = True
            else:
                sys.stderr.write(' Failed integrity check, re-downloading.\n')
                os.remove(local_name)
    return
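
# Notes on download_files(): the 'url' values taken from the XML are paths
# relative to dl_base, which is why the two are concatenated above, and a file
# that repeatedly fails its MD5 check will be re-downloaded until it passes
# (there is no retry cap).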

def main():
    """The main function. Will ask for username and password to download from
    JGI. Also define the URLs to the XML file trees from JGI."""
    # Get the user name and password
    username = raw_input('Username for JGI: ')
    password = getpass.getpass('Password for JGI: ')
    # The three XML files we want to download
    xml_files = [
        'http://genome.jgi.doe.gov/ext-api/downloads/get-directory?organism=GlymaxAtlaPlate2',
        'http://genome.jgi.doe.gov/ext-api/downloads/get-directory?organism=GlymaxAtlaPlate3',
        'http://genome.jgi.doe.gov/ext-api/downloads/get-directory?organism=GlymaxAtlaPlate6'
    ]
    # The filename filter we apply is that the filename must have 'anqrpt' in
    # it. This filters down to the 'QC Filtered Raw Data' set.
    fname_flt = 'anqrpt'
    # Then, we log on and save the cookie file
    cookie_file = jgi_login(username, password)
    # Start saving the URLs to fetch
    to_fetch = []
    for x in xml_files:
        to_fetch += parse_xml(x, cookie_file, fname_flt)
    # Then, we'll download them all
    download_files(to_fetch, cookie_file)
    # Clean up the cookie file
    os.remove(cookie_file.name)
    return

# Work!
if __name__ == '__main__':
    main()