-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdoi_orcid_openalex_matching.py
119 lines (105 loc) · 3.49 KB
/
doi_orcid_openalex_matching.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import requests
import csv
import pandas as pd
import time
# Base URL of the OpenAlex REST API; endpoint paths are appended to it.
OPENALEX_API_URL = "https://api.openalex.org"
# Path of the CSV file read as input (loaded into `df` below).
INPUT_CSV = "input.csv"
# Path of the CSV file the script writes its results to.
OUTPUT_CSV = "output.csv"
# Pause taken before each OpenAlex request; handed to time.sleep, so in seconds.
SLEEP_TIME = 1
# The whole input file is loaded up front, as soon as this module is executed.
df = pd.read_csv(INPUT_CSV)
# Takes a CSV of author names and the DOIs of works they are associated with,
# queries OpenAlex for the author records attached to each DOI, compares the
# names from the input CSV with the retrieved author display names, and assigns
# a score based on name similarity. The score and ORCID of the best-scoring
# author are written to the output CSV alongside the original row.
# SLEEP_TIME is the delay, in seconds, before each API request to OpenAlex.
def write_header_row():
    """Create (or truncate) OUTPUT_CSV and write the column header line.

    The header consists of the input columns that are copied through, followed
    by the two result columns ``best_match_name_score`` and
    ``best_match_orcid``.
    """
    column_names = (
        "college_name",
        "department_name",
        "last_name",
        "first_name",
        "middle_name",
        "research_heading",
        "heading_type",
        "contribution_year",
        "title",
        "authors",
        "publication_name",
        "additional_details",
        "url",
        "school_code",
        "report_code",
        "category",
        "gw_id",
        "doi",
        "best_match_name_score",
        "best_match_orcid",
    )
    # Mode "w" discards any previous output, so every run starts from a clean file.
    with open(OUTPUT_CSV, "w", newline="") as out_file:
        csv.writer(out_file).writerow(column_names)
def get_authors_from_open_alex_by_doi(doi, timeout=30):
    """Return the authors OpenAlex lists for the work identified by *doi*.

    Each item of the returned list is a dict with the keys ``"orcid"`` and
    ``"display_name"``, taken from the ``author`` object of each entry in the
    work's ``authorships``. A missing ORCID is reported as ``None``; a missing
    display name is reported as ``""`` so callers can always treat it as text.

    An empty list is returned when the DOI is missing, when the request fails
    or times out, when the response status is not 200, or when the response
    body cannot be used as a JSON object.

    Args:
        doi: DOI of the work. ``None``, NaN and blank values are treated as
            missing, and no request is made for them.
        timeout: Seconds to wait for OpenAlex before giving up on the request.

    Returns:
        A list of ``{"orcid": ..., "display_name": ...}`` dicts (possibly empty).
    """
    # An empty CSV cell arrives from pandas as float NaN, the only value that
    # is not equal to itself. Without this guard it would be sent as "doi:nan".
    if doi is None or (isinstance(doi, float) and doi != doi):
        return []
    doi = str(doi).strip()
    if not doi:
        return []

    # Throttle: pause before every request that is actually sent.
    time.sleep(SLEEP_TIME)
    try:
        response = requests.get(
            f"{OPENALEX_API_URL}/works/doi:{doi}", timeout=timeout
        )
        if response.status_code != 200:
            return []
        data = response.json()
    except (requests.RequestException, ValueError):
        # A network failure or unparsable body for one DOI must not abort the
        # whole run; it is reported the same way as "work not found".
        return []

    authorships = data.get("authorships") if isinstance(data, dict) else None
    author_list = []
    for authorship in authorships or []:
        author = authorship.get("author") or {}
        author_list.append(
            {
                "orcid": author.get("orcid"),
                "display_name": author.get("display_name") or "",
            }
        )
    return author_list
def name_similarity_score(display_name, first_name, last_name, middle_name):
    """Score how well a person's name parts match an author display name.

    The display name and each name part are lower-cased with ``casefold`` and
    split on whitespace. A name part matches when it has at least one word and
    every one of its words occurs among the display name's words, so
    multi-word parts such as "Van Der Berg" can match. The score is the sum of
    the weights of the matching parts: last name 50, first name 30, middle
    name 10 (maximum 90).

    Args:
        display_name: Author name to compare against. A non-string value
            (for example ``None``) yields a score of 0.
        first_name: First name, or ``None``/NaN/blank when unknown.
        last_name: Last name, or ``None``/NaN/blank when unknown.
        middle_name: Middle name, or ``None``/NaN/blank when unknown.

    Returns:
        An integer between 0 and 90.
    """

    def _words(value):
        # None and NaN (how pandas represents an empty CSV cell) are not
        # strings and therefore contribute no words.
        if not isinstance(value, str):
            return []
        # split() without an argument ignores repeated and surrounding
        # whitespace, so it never produces empty tokens.
        return value.casefold().split()

    display_words = set(_words(display_name))
    score = 0
    for name_part, weight in ((last_name, 50), (first_name, 30), (middle_name, 10)):
        part_words = _words(name_part)
        if part_words and all(word in display_words for word in part_words):
            score += weight
    return score
# Start the output file fresh with only the header line; the data rows are
# appended below, one per input record.
write_header_row()

# Input columns copied to the output unchanged, in output order.
passthrough_columns = (
    "college_name",
    "department_name",
    "last_name",
    "first_name",
    "middle_name",
    "research_heading",
    "heading_type",
    "contribution_year",
    "title",
    "authors",
    "publication_name",
    "additional_details",
    "url",
    "school_code",
    "report_code",
    "category",
    "gw_id",
    "doi",
)

with open(OUTPUT_CSV, "a", newline="") as out_file:
    row_writer = csv.writer(out_file)
    for _, record in df.iterrows():
        candidates = get_authors_from_open_alex_by_doi(record.doi)
        if not candidates:
            # Nothing came back for this DOI: emit a placeholder result.
            top_candidate = {"name_score": 0, "orcid": "Not found"}
        else:
            # Score every returned author against this record's name parts.
            for candidate in candidates:
                candidate["name_score"] = name_similarity_score(
                    candidate["display_name"],
                    record.first_name,
                    record.last_name,
                    record.middle_name,
                )
            # On a tie, max() keeps the earliest author in the returned order.
            top_candidate = max(candidates, key=lambda entry: entry["name_score"])

        out_values = [getattr(record, column) for column in passthrough_columns]
        out_values.append(top_candidate["name_score"])
        out_values.append(top_candidate["orcid"])
        row_writer.writerow(out_values)