This repository has been archived by the owner on Sep 9, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 18
/
gen_info.py
47 lines (42 loc) · 1.53 KB
/
gen_info.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from collections import OrderedDict
from glob import glob
import json
from os import chdir, getcwd
from os.path import basename, getsize, isfile, join, normpath
from sys import argv, stderr
from urllib.request import urlretrieve
# Base URL under which mirrored data files are published.
# Set the base url without a trailing slash!
URL = 'https://datasets.biolab.si'

# Suffix of the JSON metadata files that sit next to each data file.
INFO_SUFFIX = '.info'


def _mirror_data_file(d, root, filename, base_url):
    """Download the data file described by *d* if it is not served from us.

    A download is attempted when ``d['url']`` is non-empty and either does
    not start with *base_url* or names a file whose basename differs from
    that of *filename*.  On success ``d['url']`` is pointed at
    ``base_url/normpath(root)/filename`` and ``d['size']`` is set to the
    size of the downloaded file.

    The download is best effort: any ordinary error is reported on stderr
    and the metadata is left untouched.  ``KeyboardInterrupt`` and
    ``SystemExit`` are deliberately *not* swallowed.

    Returns True if *d* was modified, False otherwise.
    """
    location = d.get('url', '')
    if not location:
        return False
    if (location.startswith(base_url)
            and basename(location) == basename(filename)):
        return False
    try:
        urlretrieve(location, filename)
        size = getsize(filename)
    except Exception as exc:
        print('failed to get file', filename, location, exc, file=stderr)
        return False
    # Only touch the metadata once everything that can fail has succeeded,
    # so a failure never leaves a half-updated record behind.
    d['url'] = '{}/{}/{}'.format(base_url, normpath(root), filename)
    d['size'] = size
    return True


def process_info_file(root, infof, base_url=URL):
    """Load one ``.info`` file, normalise it, and write it back if changed.

    *infof* is interpreted relative to the current working directory.  The
    data file name is *infof* with the ``.info`` suffix removed.  Two
    normalisations are applied: the data file is mirrored when its URL
    points elsewhere, and a ``references`` value given as a single string
    is wrapped in a list.

    Returns a ``(filename, metadata)`` tuple, where *metadata* is an
    ``OrderedDict`` preserving the key order found in the file.
    """
    with open(infof, 'r', encoding='utf-8') as f:
        d = json.load(f, object_pairs_hook=OrderedDict)
    filename = infof[:-len(INFO_SUFFIX)]

    changed = _mirror_data_file(d, root, filename, base_url)

    ref = d.get('references')
    if isinstance(ref, str):
        d['references'] = [ref]
        changed = True

    if changed:
        with open(infof, 'w', encoding='utf-8') as f:
            json.dump(d, f, indent=4)
    return filename, d


def collect_info(roots, base_url=URL):
    """Gather the metadata of every ``.info`` file found below *roots*.

    Each root is resolved against the working directory at call time and
    searched recursively.  The working directory is changed while a root is
    processed (file names in the result are relative to their root) and is
    restored before returning, even on error.

    Returns a list of ``[file_path, metadata]`` entries sorted by
    ``file_path``, where ``file_path`` is ``[root, filename]`` or just
    ``[filename]`` when the root is an empty string.
    """
    start_dir = getcwd()
    info = []
    try:
        for root in roots:
            chdir(join(start_dir, root))
            for infof in glob('**/*.info', recursive=True):
                filename, d = process_info_file(root, infof, base_url)
                file_path = [root, filename] if root else [filename]
                info.append([file_path, d])
    finally:
        chdir(start_dir)
    # Sort on the path only; the metadata dicts are not orderable.
    info.sort(key=lambda entry: entry[0])
    return info


def main(args=None):
    """Print the collected metadata for the given roots as JSON on stdout.

    *args* defaults to the command-line arguments after the script name.
    """
    roots = argv[1:] if args is None else args
    print(json.dumps(collect_info(roots), indent=4))


if __name__ == '__main__':
    main()