-
Notifications
You must be signed in to change notification settings - Fork 2
/
osf-biohackrxiv-fetch.rb
executable file
·151 lines (124 loc) · 3.43 KB
/
osf-biohackrxiv-fetch.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#! /usr/bin/env ruby
#
# Fetch metadata from OSF using the API - essentially combines an
# existing papers.yaml file with metadata and writes it out. Make sure
# not to overwrite ./etc/papers.yaml - but use a 2-step approach
#
# Also make sure the IDs have no trailing slash.
require 'yaml'
require 'json'
require 'pp'
require 'uri'
require 'net/http'
require 'fileutils'
require 'optparse'
USAGE =<<EOU
Fetch metadata from OSF
EOU
$stderr.print "BioHackrXiv fetch OSF metadata...\n"
options = {show_help: false}
# options[:show_help] = true if ARGV.size == 0
opts = OptionParser.new do |o|
o.banner = "Usage: #{File.basename($0)} [options]\n\n"
o.on("--no-fetch", "Reuse fetched data") do |v|
options[:no_fetch] = true
end
o.on("--debug", "Show debug messages") do |v|
options[:debug] = true
end
o.separator ""
o.on_tail('-h', '--help', 'Display this help and exit') do
options[:show_help] = true
end
end
begin
opts.parse!(ARGV)
if options[:show_help]
print opts
print USAGE
end
rescue OptionParser::InvalidOption => e
options[:invalid_argument] = e.message
end
$stderr.print options if options[:debug]
events = YAML.load_file('etc/events.yaml')['events']
papers = YAML.load_file('etc/papers.yaml')['papers']
# Fetch list of publications
# curl https://api.osf.io/v2/providers/preprints/biohackrxiv/preprints/ |jq -C "." |less -R
OUTDIR = "./papers/OSF"
FileUtils.mkdir_p(OUTDIR)
url = "https://api.osf.io/v2/providers/preprints/biohackrxiv/preprints/"
papers2 = []
num = 0
while url
num += 1
fn = OUTDIR+"/page#{num}.json"
if not options[:no_fetch]
$stderr.print "Fetching '#{url}'...\n"
uri = URI.parse(url)
data = Net::HTTP.get(uri)
File.open(fn,"w") { |f|
f.write(data)
}
else
data = File.read(fn)
end
$stderr.print "Processed #{fn}\n"
osf_ps = JSON.parse(data)
list =
osf_ps['data'].map do | o |
attr = o['attributes']
rec = {}
if attr['is_published'] and not attr['is_preprint_orphan']
date = attr['date_published'][0..9]
rec = { 'id' => "https://biohackrxiv.org/"+o['id'],
'title' => attr['title'],
'date_published' => date
}
end
rec
end
papers2 += list
url = osf_ps['links']['next']
end
p2 = papers2.flatten
# now combine p2 and the original papers
# First create a hash of all OSF entries in p2
h = {}
p2.each do | v |
h[v["id"]] = v
end
# now 'h' contains all OSF records indexed by "id"
# run through the existing entries and combine
papers.each do | v |
id = v["id"]
#p [id,h[id],v]
if h[id]
h[id] = h[id].merge(v)
#p [id,h[id]]
#exit 1 if id == "https://biohackrxiv.org/km9ux/"
end
#p v
#exit 1 if id == "https://biohackrxiv.org/km9ux/"
end
# In the next step we fetch the individual papers and store them
# Fetch info on a paper https://api.osf.io/v2/preprints/wu9et/
h.each { |id,v|
url = "https://api.osf.io/v2/preprints/"+File.basename(id)+"/"
fn = OUTDIR+"/"+File.basename(id)+".json"
if not options[:no_fetch]
$stderr.print "Fetching '#{url}'...\n"
print `curl -s "#{url}" > #{fn}`
end
raise "Can not find #{fn}. Run a fetch!" if not File.exist?(fn)
pjson = JSON.parse(File.read(fn))
attr = pjson['data']['attributes']
v['doi'] = pjson['data']['links']['preprint_doi']
}
# remap to array
a = []
h.sort_by { |k,v| v["date_published"] }.each { |k,v|
v["id"].chomp('/')
a.push v
}
print({"papers" => a.reverse}.to_yaml)