-
Notifications
You must be signed in to change notification settings - Fork 0
/
query_crossref_with_reference_text.py
159 lines (134 loc) · 5.48 KB
/
query_crossref_with_reference_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 7 07:21:33 2016
@author: ShebleAdmin
query crossref with a list of cites that are just dumped into query with minimal parsing
should do this as a session with Requests, I think
used to deal with a file with references that were all in a jumble wrt format, etc
scores from crossref seem to work pretty well as estimate of likelihood of match,
even with data that is somewhat rough
"""
import re
import requests
import json
import pandas as pd
try:
from StringIO import StringIO
except ImportError:
from io import StringIO
#### Set fp variable to your folder with file to be processed ####
fp = "/<path_to_file_with_jumble of references>/"
examples = fp + "publist_stacked_nonMacWin_zapGrem_reduced.csv"
def build_query_url(citation):
    """Build a Crossref ``/works`` query URL from a raw citation string.

    The citation is reduced to a loose bag of search terms and wrapped in
    double quotes as the value of the ``query`` parameter.

    Args:
        citation: Free-text reference string.

    Returns:
        The query URL as a string.  ``&rows=1`` limits the response to the
        first (best-scoring) result; ``&rows=0`` would instead return only a
        summary of the search results.
    """
    query_url = "http://api.crossref.org/works?query="
    # Keep only whitespace, digits, ASCII letters, '-' and '/'; every other
    # character (punctuation, accented letters, ...) becomes a space.
    citation = re.sub(r'[^\s\da-zA-Z-/]', ' ', citation)
    # Drop very short non-digit fragments that sit between whitespace
    # (for example isolated initials left over after punctuation removal).
    citation = re.sub(r'\s\D{1,2}\s{0,1}\D{0,1}\s', ' ', citation)
    citation = re.sub(r'\s{2,10}', ' ', citation)
    # Remove digit-hyphen-digit joins such as the middle of a page range.
    citation = re.sub(r'\d-\d', ' ', citation)
    # str.strip returns a new string; the result has to be kept, otherwise
    # leading/trailing whitespace ends up as stray '+' terms in the query.
    citation = citation.strip()
    citation = re.sub(r'\s', '+', citation)
    return query_url + '"' + citation + '"' + '&rows=1'
# NOTE: this version of construct_author works best at the moment.
# TODO: handle the affiliation data differently (there was essentially none in my data set) and get rid of the str() calls.
def construct_author(author_item):
    """Render a list of author records as one '; '-separated string.

    Each author is formatted as ``Family, Given (affiliation)``, or as
    ``Family (affiliation)`` when the record has no ``given`` key.  The
    affiliation value is rendered with ``str()``, so an empty affiliation
    list shows up as ``[]``.

    Args:
        author_item: Iterable of dicts, normally carrying the keys
            ``family``, ``given`` (optional) and ``affiliation``.

    Returns:
        The combined author string; an empty string for an empty input.
    """
    rendered = []
    for item in author_item:
        # Records without 'family' would otherwise raise KeyError.  Fall back
        # to 'name' (presumably what Crossref uses for organisational
        # authors - confirm against the API docs), then to the 'NA' marker.
        family = item.get('family', item.get('name', u'NA'))
        # A missing affiliation is treated like an empty affiliation list.
        affiliation = item.get('affiliation', [])
        name = str(family)
        if 'given' in item:
            name = name + ', ' + str(item['given'])
        rendered.append(name + ' (' + str(affiliation) + ')')
    return '; '.join(rendered)
def construct_title(title_item):
    """Join a list of title strings into one '; '-separated string.

    Args:
        title_item: Sequence of title strings.

    Returns:
        The titles joined with ``'; '``; an empty string for an empty input.
    """
    # str.join replaces the hand-rolled counter loop and its repeated string
    # concatenation; the output is the same.
    return '; '.join(title_item)
def construct_subject(subject_item):
    """Join a list of subject strings into one '; '-separated string.

    Args:
        subject_item: Sequence of subject strings.

    Returns:
        The subjects joined with ``'; '``; an empty string for an empty
        input.
    """
    # str.join replaces the hand-rolled counter loop and its repeated string
    # concatenation; the output is the same.
    return '; '.join(subject_item)
def extract_json_fields(data):
    """Flatten the first item of a parsed Crossref response into a row.

    Args:
        data: Parsed JSON response (a dict) with the result list under
            ``data['message']['items']``.

    Returns:
        A list of 11 values in this order: subject, author, year, title,
        container title, volume, issue, page, DOI, alternative id, score.
        Any field that is absent from the item is reported as ``'NA'``.
        When the response holds no items at all, every field is ``'NA'``,
        so the caller still gets one row per query.
    """
    na = u'NA'
    items = (data.get('message') or {}).get('items') or []
    if not items:
        return [na] * 11
    # Only the top hit is used.
    item = items[0]

    def first_of(key):
        # First element of a list-valued field, or 'NA' if missing/empty.
        values = item.get(key) or []
        return values[0] if values else na

    # 'issued' may be missing, or its date-parts may be empty or hold None.
    try:
        year = item['issued']['date-parts'][0][0]
    except (KeyError, IndexError, TypeError):
        year = None
    year = na if year is None else str(year)

    subject = construct_subject(item['subject']) if 'subject' in item else na
    author = construct_author(item['author']) if 'author' in item else na
    container_title = (
        construct_title(item['container-title'])
        if 'container-title' in item else na
    )

    return [
        subject,
        author,
        year,
        first_of('title'),
        container_title,
        item.get('volume', na),
        item.get('issue', na),
        item.get('page', na),
        item.get('DOI', na),
        first_of('alternative-id'),
        item.get('score', na),
    ]
# Load the citation strings.  read_csv already raises on malformed rows by
# default, so the error_bad_lines / warn_bad_lines flags (removed in
# pandas 2.0) are not needed.
df = pd.read_csv(examples, sep=',', header=0, quotechar='"')
citations = df['citestring'].tolist()

ref_columns = ['subject', 'author', 'year', 'title', 'journal', 'volume',
               'issue', 'page', 'doi', 'alternative_id', 'score']
request_timeout = 30  # seconds; without a timeout a stalled request hangs forever

rows = []
errors = []
# One session for all queries so the connection to Crossref is reused.
with requests.Session() as session:
    for position, item in enumerate(citations):
        try:
            response = session.get(build_query_url(item),
                                   timeout=request_timeout)
            response.raise_for_status()
            data_extract = extract_json_fields(json.loads(response.text))
        except (requests.RequestException, ValueError, KeyError,
                IndexError, TypeError) as exc:
            # Network/HTTP failure, a non-JSON body, or a response without
            # the expected fields.  Record it and emit a placeholder row so
            # the results stay aligned row-for-row with df for the concat.
            errors.append((position, item, repr(exc)))
            print('crossref lookup failed for row %d: %r' % (position, exc))
            data_extract = [u'NA'] * len(ref_columns)
        rows.append(data_extract)

# Passing columns= here (rather than assigning .columns afterwards) also
# works when there are no rows at all.
df_refs = pd.DataFrame(rows, columns=ref_columns)
df_refs.head()  # only displays something in an interactive session
df_out = pd.concat([df, df_refs], axis=1)
df_out[['citestring', 'author', 'title']].head()  # interactive preview only
df_out.to_csv('final/df_out_all_items_from_crossref_v3.csv', sep='\t', header=True)