forked from NanoExplorer/AstroRef
-
Notifications
You must be signed in to change notification settings - Fork 0
/
management.py
505 lines (455 loc) · 23.1 KB
/
management.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
# This file handles the logic of managing a bunch of papers. It has the refresh functions and
# storage functions that will be used by the program.
import os
import json
import bibtexparser
from libraries import LibrariesQuery, LibraryQuery
from ads import ExportQuery
from bigquery import BigQuery
import calendar
from errors import APILimitError, ADSMalfunctionError
# BibTeX @string definitions for the lower-case full month names. This text is
# prepended to every BibTeX string handed to bibtexparser.loads() in this file.
# "may" is left out on purpose: its abbreviation is the same as the month name.
test_strings = "\n" + "".join(
    "@string{%s = {%s}}\n" % (month.lower(), month)
    for month in (
        "January", "February", "March", "April", "June", "July",
        "August", "September", "October", "November", "December",
    )
)
# other_test_strings = """@string{January = {January}}
# @string{February = {February}}
# @string{March = {March}}
# @string{April = {April}}
# @string{May = {May}}
# @string{June = {June}}
# @string{July = {July}}
# @string{August = {August}}
# @string{September = {September}}
# @string{October = {October}}
# @string{November = {November}}
# @string{December = {December}}"""
def BT_PARSER():
    """Build and return a brand-new ``BibTexParser`` with ``common_strings=True``.

    Every call constructs a fresh parser object; nothing is cached or shared
    between calls.
    """
    # A hand-written table of month @string definitions (lower-case and
    # capitalised full month names) used to be pushed into
    # bib_database.strings here; that step is disabled and only the parser's
    # built-in common strings are enabled.
    return bibtexparser.bparser.BibTexParser(common_strings=True)
#this is a full-on function because I need a new one every time, otherwise things will go horribly wrong.
#And isn't it great that I have to define my own month strings otherwise the program will crash when I
#inevitably download a file where `month=January` and not month=`jan`? But then why does it save the
#bibtex file so that `month={January}` when `month=January` crashes? Why can't it just import the month
#as a string? I'd rather worry about that than have my program crash for a reason I can't divine by myself.
#Don't you agree that the bibtexparser library v1.0.1 is waay better than the old version? /sarcasm
# Suffix letters 'a'..'z', appended to citation IDs that would otherwise collide.
LETTERS = [chr(code) for code in range(ord('a'), ord('z') + 1)]
# Month name -> month number, taken from calendar.month_name (whose entry 0 is
# the empty string, so '' maps to 0).
MONTHS = dict(zip(calendar.month_name, range(len(calendar.month_name))))
# Abbreviation (SHORT_MONTHS) and number -> name (INV_MONTHS) tables were used
# with old versions of bibtexparser. They are only needed if you want to clean
# up bib files where some months are stored as month = {jan} instead of
# month = {January}.
class Bibliography():
    """In-memory model of the user's ADS libraries and the papers in them.

    Instance state:
      libInfo     -- {libraryID: {'date': ..., 'num': ..., 'name': ...}}
      libPapers   -- {libraryID: [bibcodes]}
      bibDatabase -- {bibcode: bibtex entry dict}
      idCodes     -- {citation ID: bibcode}
      qRemaining  -- lowest "queries remaining" value seen so far
      exhausted   -- set to True once qRemaining is 1 or less
      firstRun    -- True when no complete on-disk cache was found at start-up

    The state is cached in 'libraryInfo.json', 'libraryPapers.json' and
    'master.bib', all resolved relative to the current working directory.
    """

    _CACHE_FILES = ('libraryInfo.json', 'libraryPapers.json', 'master.bib')

    def __init__(self):
        """Load the cached state, or start empty if any cache file is missing.

        The user can choose to populate an empty state later with adsRefresh().
        """
        self.qRemaining = 1000
        self.exhausted = False
        if self._cacheFilesExist():
            print('loading cached data...')
            self.loadFiles()
            self.firstRun = False
        else:
            # If any file is missing everything is re-fetched from ADS.
            # (Technically libraryPapers could be rebuilt from master.bib, but
            # this is easier.)
            print('performing first-time setup')
            self._resetState()
            self.firstRun = True

    def _cacheFilesExist(self):
        """Return True only when all three cache files are present."""
        return all(os.path.isfile(name) for name in self._CACHE_FILES)

    def _resetState(self):
        """Replace the library/paper state with empty containers."""
        self.libInfo = {}
        self.libPapers = {}
        self.bibDatabase = {}
        self.idCodes = {}

    def _reloadOrReset(self):
        """Throw away in-memory changes: reload the cache, or go back to empty."""
        if self._cacheFilesExist():
            self.loadFiles()
        else:
            self._resetState()

    def loadFiles(self):
        """Read libInfo, libPapers, bibDatabase and idCodes from the cache files."""
        with open('libraryInfo.json', 'r') as infofile:
            self.libInfo = json.load(infofile)
        with open('libraryPapers.json', 'r') as papersfile:
            self.libPapers = json.load(papersfile)
        with open('master.bib', 'r') as bibfile:
            bibfile_str = bibfile.read()
        self.bibDatabase, self.idCodes = bibtexDict(bibfile_str)

    def saveFiles(self):
        """Write libInfo, libPapers and bibDatabase to the cache files."""
        with open('libraryInfo.json', 'w') as infofile:
            json.dump(self.libInfo, infofile, sort_keys=True,
                      indent=4, separators=(',', ': '))
        with open('libraryPapers.json', 'w') as papersfile:
            json.dump(self.libPapers, papersfile, sort_keys=True,
                      indent=4, separators=(',', ': '))
        with open('master.bib', 'w') as bibfile:
            bibfile.write(unBibtexDict(self.bibDatabase))

    def adsRefresh(self):
        """Perform a full ADS refresh and save the result.

        Checks the list of libraries, downloads the libraries that changed,
        then downloads info on all the papers that were added.

        Returns:
            (newPapers, idConflicts): the de-duplicated list of newly added
            bibcodes, and whatever downloadPaperInfo() returned (a dict mapping
            old citation IDs to new ones) or ``[]`` when nothing was downloaded.

        Raises:
            APILimitError: if the query quota is already exhausted, or runs
                out part-way through the refresh.
            Any exception raised by the underlying queries is re-raised after
            the in-memory state has been rolled back.
        """
        if self.exhausted:
            raise APILimitError()
        try:
            toDownload = self.librariesRefresh()
            newPapers = []
            for lid in toDownload:
                added = self.libraryRefresh(lid)
                if added is None:
                    # libraryRefresh() returns None once the quota is used up.
                    raise APILimitError()
                newPapers += added
                print("refreshed library {}".format(lid))
            if len(newPapers) > 0:
                idConflicts = self.downloadPaperInfo(list(set(newPapers)))
                if idConflicts is None:
                    # Same signal from downloadPaperInfo(): nothing was fetched.
                    raise APILimitError()
                print(newPapers)
            else:
                idConflicts = []
        except BaseException:
            # If libInfo was updated and a later step failed, a second refresh
            # would compare ADS against the already-modified libInfo, find no
            # new papers, and save that state, hiding some modifications.
            # So roll back to what is on disk. On a first run there is nothing
            # on disk, in which case go back to the empty state instead of
            # letting a FileNotFoundError mask the real failure.
            self._reloadOrReset()
            raise
        self.updateLibrariesInBibtex()
        self.saveFiles()
        return list(set(newPapers)), idConflicts

    def librariesRefresh(self):
        """Download the list of libraries and record their metadata in libInfo.

        Returns:
            The IDs of libraries that are new, or whose modification date or
            document count differs from the stored values. Returns None
            (without querying) when the quota is exhausted.
        """
        if self.exhausted:
            return
        q = LibrariesQuery()
        libs = q.execute().libraries
        self.updateRateLimits(q)
        toDownload = []
        for l in libs:
            lid = l['id']
            ldate = l['date_last_modified']
            lnum = l['num_documents']
            known = self.libInfo.get(lid)
            if known is None or known['date'] != ldate or known['num'] != lnum:
                toDownload.append(lid)
            self.libInfo[lid] = {'date': ldate,
                                 'num': lnum,
                                 'name': l['name']}
        self.lastLibsResponse = q
        return toDownload

    def libraryRefresh(self, lid):
        """Download one library and work out which of its papers are new.

        Args:
            lid: the unique ID of the library.

        Returns:
            The bibcodes in the library that were neither in the stored copy
            of this library nor already in bibDatabase, in library order.
            Returns None (without querying) when the quota is exhausted.
        """
        if self.exhausted:
            return
        q = LibraryQuery(lid, self.libInfo[lid]['num'])
        lib = q.execute()
        self.updateRateLimits(q)
        current = set(lib.bibcodes)
        previous = self.libPapers.get(lid, [])
        for bibcode in previous:
            if bibcode not in current:
                print("Paper {} must have been removed from library.".format(bibcode))
        # Both filters are needed: a paper can be in bibDatabase without being
        # in this library, and a paper added to two libraries at once is in
        # libPapers before it reaches bibDatabase.
        alreadyListed = set(previous)
        newPapers = [bibcode for bibcode in lib.bibcodes
                     if bibcode not in alreadyListed
                     and bibcode not in self.bibDatabase]
        self.libPapers[lid] = lib.bibcodes
        self.libInfo[lid] = {'date': lib.metadata['date_last_modified'],
                             'num': lib.metadata['num_documents'],
                             'name': lib.metadata['name']}
        self.lastLibResponse = q
        return newPapers

    def updateRateLimits(self, q):
        """Read 'X-RateLimit-Remaining' from a query's response headers."""
        try:
            thisQLeft = int(q.response.response.headers['X-RateLimit-Remaining'])
        except KeyError:
            print("I think they removed rate limits from some of the api endpoints?")
        else:
            self._updateRateLimits(thisQLeft)

    def _updateRateLimits(self, thisQLeft):
        """Lower qRemaining to thisQLeft if smaller; flag exhaustion at <= 1."""
        if thisQLeft < self.qRemaining:
            self.qRemaining = thisQLeft
        if self.qRemaining <= 1:
            self.exhausted = True
        print("queries remaining: {}".format(self.qRemaining))

    @staticmethod
    def _monthNumber(entry):
        """Return the month number of a bibtex entry, or 13 if it has none.

        A month string that is not a key of MONTHS also ranks as 13 instead of
        raising KeyError, so an unusual month only affects ordering.
        """
        return MONTHS.get(entry.get('month'), 13)

    def downloadPaperInfo(self, papers):
        """Download bibtex (with abstracts) for papers and add them to bibDatabase.

        Each new entry keeps its ADS bibcode under 'bibcode' and receives a
        citation ID of the form <FirstAuthor><year>. When that ID is already
        taken, the papers sharing it get letter suffixes ('a', 'b', ...)
        ordered by month, and existing papers are renamed as needed.

        Args:
            papers: a list of ADS bibcode strings.

        Returns:
            A dict mapping each old citation ID that had to change to its new
            ID. Returns None (without querying) when the quota is exhausted.

        Raises:
            ADSMalfunctionError: if the number of entries returned differs
                from the number of bibcodes requested.
        """
        if self.exhausted:
            return
        namesChanged = {}  # maps old name to new name
        eq = ExportQuery(papers)
        eq.format = "bibtexabs"
        bibtex = eq.execute()
        rate = eq.response.get_ratelimits()['remaining']
        if rate:
            self._updateRateLimits(int(rate))
        else:
            print("Unable to get rate limits")
        newBibDatabase = bibtexparser.loads(test_strings+bibtex, parser=BT_PARSER()).entries
        self.lastBibResponse = eq
        if len(newBibDatabase) != len(papers):
            print("***DUMPING PAPERS***")
            print(papers)
            print("***DUMPING BIBTEX***")
            print(bibtex)
            print("***FINISHED***")
            raise ADSMalfunctionError
        for paper in newBibDatabase:
            # Keep the bibcode: it is only stored under 'ID', which is about
            # to be replaced by the citation ID.
            paper['bibcode'] = paper['ID']
            try:
                paper['abstract'] = paper['abstract'][1:-1]
            except KeyError:
                print("Paper {} does not have an abstract.".format(paper['bibcode']))
            paper['title'] = paper['title'].strip('{}')
            firstAuthor = self.getFirstAuthor(paper['author'])
            pid = firstAuthor + paper['year']
            if pid in self.idCodes:
                # Exactly one paper holds the bare ID; both get a letter.
                other = self.bibDatabase[self.idCodes[pid]]
                assert other['bibcode'] != paper['bibcode'], "SOMEHOW the same paper got in twice.(1)"
                otherMonth = self._monthNumber(other)
                paperMonth = self._monthNumber(paper)
                if otherMonth > paperMonth:
                    # The new paper is older, so it becomes 'a'.
                    paper['ID'] = pid+'a'
                    other['ID'] = pid+'b'
                    namesChanged[pid] = pid+'b'
                    # Move the existing paper from the old ID to the new one.
                    self.idCodes[pid+'b'] = self.idCodes.pop(pid)
                    self.idCodes[pid+'a'] = paper['bibcode']
                    self.bibDatabase[paper['bibcode']] = paper
                else:
                    # Same thing with 'a' and 'b' swapped. If the months are
                    # equal the lettering is arbitrary.
                    paper['ID'] = pid+'b'
                    other['ID'] = pid+'a'
                    namesChanged[pid] = pid+'a'
                    self.idCodes[pid+'a'] = self.idCodes.pop(pid)
                    self.idCodes[pid+'b'] = paper['bibcode']
                    self.bibDatabase[paper['bibcode']] = paper
            elif pid+'a' in self.idCodes:
                # Several lettered papers already exist: collect them in
                # letter order, then insert the new one by month.
                sameAuthorYearPapers = []
                flag = False
                # Default covers the case where every letter is taken, so the
                # "more than 26 papers" assertion below reports it.
                numPapers = len(LETTERS)
                for i, char in enumerate(LETTERS):
                    if pid+char not in self.idCodes and not flag:
                        numPapers = i
                        flag = True
                    elif not flag:
                        sameAuthorYearPapers.append(self.bibDatabase[self.idCodes[pid+char]])
                    elif flag:
                        assert pid+char not in self.idCodes, "SOMEHOW letters are not consecutive"
                assert len(sameAuthorYearPapers) == numPapers, "The collision handler broke"
                assert numPapers < 26, "This program cannot deal with more than 26 papers published by the same first author in the same year."
                # Walk from the last letter backwards, bumping each paper one
                # letter up until the slot for the new paper is found.
                for i, other in enumerate(reversed(sameAuthorYearPapers)):
                    assert other['bibcode'] != paper['bibcode'], "SOMEHOW the same paper got in twice.(2)"
                    j = len(sameAuthorYearPapers) - i - 1
                    otherMonth = self._monthNumber(other)
                    paperMonth = self._monthNumber(paper)
                    if paperMonth > otherMonth:
                        # The paper needs to be inserted after the current one.
                        paper['ID'] = pid+LETTERS[j+1]
                        self.bibDatabase[paper['bibcode']] = paper
                        self.idCodes[paper['ID']] = paper['bibcode']
                        break
                    elif j == 0:
                        # Insert the paper at 'a' and bump the current 'a' to 'b'.
                        oldid = other['ID']
                        other['ID'] = pid+LETTERS[j+1]
                        self.idCodes[other['ID']] = self.idCodes.pop(oldid)
                        paper['ID'] = pid+LETTERS[j]
                        self.bibDatabase[paper['bibcode']] = paper
                        self.idCodes[paper['ID']] = paper['bibcode']
                        namesChanged[oldid] = other['ID']
                    else:
                        # Bump the current 'x' to 'y'.
                        oldid = other['ID']
                        other['ID'] = pid+LETTERS[j+1]
                        self.idCodes[other['ID']] = self.idCodes.pop(oldid)
                        namesChanged[oldid] = other['ID']
            else:
                # No collision: add the paper under the plain ID.
                paper['ID'] = pid
                self.bibDatabase[paper['bibcode']] = paper
                self.idCodes[pid] = paper['bibcode']
        return namesChanged

    def getFirstAuthor(self, authors):
        """Return the first author's last name reduced to plain ASCII letters.

        Newlines are treated as spaces, the author string is split on ' and ',
        and the part of the first author before the first ',' is taken as the
        last name. removeSpecialCharacters() then drops the surrounding braces,
        any spaces and all other non-letter characters, so the name does not
        have to be wrapped in braces.
        """
        firstEntry = authors.replace('\n', ' ').split(' and ')[0].strip()
        lastName = firstEntry.split(',')[0]
        return removeSpecialCharacters(lastName)

    def updateLibrariesInBibtex(self):
        """Record, on every paper listed in libPapers, the libraries holding it.

        The 'libraries' field of an entry is a comma-separated list of library
        IDs; a library is appended only if it is not listed yet.
        """
        for library, bibcodes in self.libPapers.items():
            for bibcode in bibcodes:
                entry = self.bibDatabase[bibcode]
                if 'libraries' not in entry:
                    entry['libraries'] = library
                elif library not in entry['libraries'].split(','):
                    entry['libraries'] += ',' + library

    def whatPapersNeedPdfs(self):
        """Return the entries of all papers for which needsPdf() is true."""
        return [paper for pid, paper in self.bibDatabase.items() if self.needsPdf(pid)]

    def needsPdf(self, pid):
        """Return True if the paper has no 'pdf' field and is not marked as skipped."""
        paper = self.bibDatabase[pid]
        return ('pdf' not in paper) and (('skip' not in paper) or paper['skip'] == 'false')

    def setSkipPdf(self, pid, toSkip=True):
        """Set the paper's 'skip' field to the string 'true' or 'false'."""
        self.bibDatabase[pid]['skip'] = 'true' if toSkip else 'false'

    def hasPdf(self, pid):
        """Return True if the paper has a 'pdf' field."""
        return 'pdf' in self.bibDatabase[pid]

    def setPdfs(self, pid, pdfloc):
        """Add pdf locations to papers (this appends rather than replaces).

        Accepts strings or equally long lists of strings as pid and pdfloc.
        Leading/trailing commas are stripped from each location, and locations
        are stored comma-separated in the paper's 'pdf' field.
        """
        pids = [pid] if isinstance(pid, str) else pid
        locations = [pdfloc] if isinstance(pdfloc, str) else pdfloc
        assert len(locations) == len(pids), "pid and pdfloc must have same number of elements"
        for onePid, location in zip(pids, locations):
            paper = self.bibDatabase[onePid]
            location = location.strip(',')
            if 'pdf' in paper:
                paper['pdf'] = paper['pdf'] + ',' + location
            else:
                paper['pdf'] = location

    def setDefaultPdf(self, pid, index):
        """Remember which entry of the paper's pdf list is the default one.

        The index is stored as a string, like every other field of the entry;
        getDefaultPdf() converts it back with int().
        """
        self.bibDatabase[pid]['default_pdf'] = str(index)

    def getDefaultPdf(self, pid):
        """Return the paper's default pdf location, or None if it has no pdf.

        Falls back to the first location when 'default_pdf' is missing, is not
        an integer, or points outside the list of locations.
        """
        paper = self.bibDatabase[pid]
        if 'pdf' not in paper:
            return
        locations = paper['pdf'].split(',')
        try:
            return locations[int(paper['default_pdf'])]
        except (KeyError, ValueError, IndexError):
            return locations[0]
def bibtexDict(btexstr):
    """Wrapper for the bibtexparser library: parse BibTeX text into lookup dicts.

    ``test_strings`` is prepended to *btexstr* and the result is parsed with a
    fresh parser from ``BT_PARSER()``.

    Returns:
        (data, idc) where ``data`` maps each entry's 'bibcode' field to the
        entry dict and ``idc`` maps each entry's 'ID' to its 'bibcode'.
    """
    entries = bibtexparser.loads(test_strings + btexstr, parser=BT_PARSER()).entries
    by_bibcode = {entry['bibcode']: entry for entry in entries}
    bibcode_by_id = {entry['ID']: entry['bibcode'] for entry in entries}
    # A disabled step used to rewrite abbreviated months (month = {jan}) to the
    # long form (month = {January}) here, for .bib files written by old
    # versions of this program with the old version of bibtexparser.
    return by_bibcode, bibcode_by_id
def unBibtexDict(btexdict):
    """Serialise the values of *btexdict* (entry dicts) to a BibTeX string.

    The entries are placed, in the dict's iteration order, into a new
    ``BibDatabase`` which is then dumped with ``bibtexparser.dumps``.
    """
    database = bibtexparser.bibdatabase.BibDatabase()
    database.entries = list(btexdict.values())
    return bibtexparser.dumps(database)
def removeSpecialCharacters(author):
    """Return *author* with every character other than a-z and A-Z removed.

    This is how accent markup such as {\\'a} or {\\'{\\i}} gets reduced to the
    bare letter ('a', 'i'): braces, backslashes, quotes, spaces, digits and
    all other non-ASCII-letter characters are simply dropped.
    """
    # NOTE(review): the LaTeX rules for what may appear in a bibtex ID have
    # not been checked; keeping only plain letters is the conservative choice.
    return "".join(
        ch for ch in author
        if ('a' <= ch <= 'z') or ('A' <= ch <= 'Z')
    )
def notSpecial(char):
    """Return True if the single character *char* is an ASCII letter (a-z or A-Z)."""
    code = ord(char)
    if ord('a') <= code <= ord('z'):
        return True
    return ord('A') <= code <= ord('Z')