-
Notifications
You must be signed in to change notification settings - Fork 0
/
abbreviate_journals.py
157 lines (137 loc) · 5.23 KB
/
abbreviate_journals.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# Author: James Flamino (2021)
import string
import spacy
import re
import sys
import yaml
# spaCy pipeline used to part-of-speech tag journal titles.
nlp = spacy.load('en_core_web_sm')

# Accumulators filled during a run and reported at the end.
missing_abbrevs = []   # capitalised words that have no entry in abbrev_map
missing_journals = []  # journal names left untouched because they contain punctuation

# Load the word -> abbreviation map. The path is relative to the current
# working directory. UTF-8 is requested explicitly so the result does not
# depend on the platform's default encoding.
with open("abbreviations/abbrevs.yml", "r", encoding="utf-8") as stream:
    try:
        # safe_load returns None for an empty document; fall back to an empty
        # map so later membership tests do not fail.
        abbrev_map = yaml.safe_load(stream) or {}
    except yaml.YAMLError as exc:
        print(exc, file=sys.stderr)
        # Exit with a non-zero status so callers can detect the failure.
        sys.exit(1)
def contains_number(value: str) -> bool:
    """Return True if *value* contains at least one digit character.

    Digits are detected with ``str.isdigit`` on each character, so an empty
    string yields False.
    """
    return any(character.isdigit() for character in value)
def abbreviate_journal(res):
    """Abbreviate a journal name word by word using ``abbrev_map``.

    Returns:
        1 if the name is one of the exempt titles ('plos one', 'science',
        'nature', or anything containing 'arxiv') and must be left as is;
        0 if the name contains one of the rejected punctuation characters;
        otherwise the abbreviated name as a string.

    Words of two or more characters that are not in ``abbrev_map`` and contain
    no digit are appended (capitalised) to the module-level
    ``missing_abbrevs`` list.
    """
    lowered = res.lower()
    if lowered in ('plos one', 'science', 'nature') or 'arxiv' in lowered:
        return 1

    if any(ch in '!"#$%\'()*,-./;<=>?@[]^_`{|}~' for ch in res):
        return 0

    if lowered == 'proceedings of the national academy of sciences':
        return 'Proc. Natl. Acad. Sci. U.S.A.'

    # Strip whatever non-word, non-space characters are left before tagging.
    cleaned = re.sub(r'[^\w\s]', '', res)
    cleaned = cleaned.replace('&', '').replace('\\', '')

    # Drop adpositions, determiners, coordinating conjunctions and pronouns.
    skipped_pos = ('ADP', 'DET', 'CCONJ', 'PRON')
    words = [tok.text for tok in nlp(cleaned) if tok.pos_ not in skipped_pos]

    pieces = []
    for word in words:
        if len(word) <= 1:
            # Single characters are kept only when alphanumeric.
            if word.isalnum():
                pieces.append(word)
            continue

        key = word.capitalize()
        if key in abbrev_map:
            pieces.append(abbrev_map[key])
            continue

        # No abbreviation known: keep the word and record it unless it
        # contains a digit.
        if not contains_number(word):
            missing_abbrevs.append(key)
        pieces.append(word)

    return ' '.join(pieces).strip()
def _abbreviated_path(fname):
    """Return *fname* with ``_abbrev`` inserted before its extension."""
    import os  # local import: the module's import block does not provide os

    # splitext only looks at the final path component, so dots in directory
    # names and files without an extension are handled correctly.
    root, ext = os.path.splitext(fname)
    return root + '_abbrev' + ext


def _extract_field_value(line):
    """Return the value of a ``journal``/``booktitle`` field line, or None.

    A brace-delimited value is used when the line contains both ``{`` and
    ``}``; otherwise a double-quoted value is tried. None means no delimited
    value could be found.
    """
    if '{' in line and '}' in line:
        braced = re.search(r'\{.*?\}', line)
        if braced is None:
            return None
        return braced.group(0).replace('{', '').replace('}', '')
    quoted = re.search(r'"([^"]*)"', line)
    return quoted.group(1) if quoted else None


def _rewrite_line(line):
    """Return the output text (newline included) for one stripped input line.

    Journal names that cannot be abbreviated because of punctuation are
    appended to the module-level ``missing_journals`` list.
    """
    # An entry written on a single line, with title and journal together.
    if 'title' in line and 'journal' in line:
        inline = re.search(r'journal=\{.*?\}', line)
        # If no journal={...} is present (e.g. a title that merely contains
        # the word "journal"), fall through to the generic handling below
        # instead of failing.
        if inline is not None:
            res = inline.group(0).replace('journal={', '').replace('}', '')
            abbrev_res = abbreviate_journal(res)
            if abbrev_res == 0:
                missing_journals.append(res)
            elif abbrev_res != 1:
                line = line.replace(res, abbrev_res)
            return line + '\n'

    # A journal/booktitle field on a line of its own.
    if line.startswith(('journal', 'booktitle')):
        res = _extract_field_value(line)
        if res is None:
            # Nothing delimited to abbreviate; emit the field unchanged rather
            # than reusing a value from an earlier line.
            return ' ' + line + '\n'
        abbrev_res = abbreviate_journal(res)
        if abbrev_res == 0:
            missing_journals.append(res)
            return ' ' + line + '\n'
        if abbrev_res == 1:
            return ' ' + line + '\n'
        field = 'journal' if line.startswith('journal') else 'booktitle'
        return ' ' + field + '={' + abbrev_res + '},\n'

    # Any other line: entry delimiters are written flush left, the rest with
    # a leading space.
    if not line:
        return '\n'
    if line[0] == '@' or line[0] == '}':
        return line + '\n'
    return ' ' + line + '\n'


def _looks_already_abbreviated(journal):
    """Return True if *journal* has several periods and no other punctuation."""
    if journal.count('.') <= 1:
        return False
    remainder = journal.replace('.', '')
    return not any(ch in string.punctuation for ch in remainder)


def main(argv=None):
    """Abbreviate journal names in a BibTeX file and report what was missed.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first argument is the input file name.

    The result is written next to the input as ``<name>_abbrev<ext>``.
    Exits with a non-zero status when no input file is given.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        # A string argument to sys.exit is printed to stderr and yields
        # exit status 1.
        sys.exit('ERROR: This program requires one command line argument: fname')
    fname = args[0]

    oname = _abbreviated_path(fname)
    print('\033[1mSaving abbreviated file to:\033[0m', oname)

    # Open the input first so a missing input does not leave an empty output
    # file behind; both handles are closed even if processing fails.
    with open(fname, 'r') as ins, open(oname, 'w') as out:
        for raw_line in ins:
            out.write(_rewrite_line(raw_line.strip()))

    print('===================================================')
    print('\033[1mWords that do not have abbreviations (update YAML):\033[0m')
    # Sorted so the report is stable from run to run.
    for item in sorted(set(missing_abbrevs)):
        print(item)
    print('===================================================')
    print('\033[1mJournals that could not be properly abbreviated:\033[0m')
    for item in sorted(set(missing_journals)):
        if _looks_already_abbreviated(item):
            print(item, '\033[1m(This journal may already be abbreviated.)\033[0m')
        else:
            print(item)
    print('===================================================')


if __name__ == '__main__':
    main()