-
Notifications
You must be signed in to change notification settings - Fork 115
/
bookfind.py
executable file
·603 lines (496 loc) · 21.4 KB
/
bookfind.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
#!/usr/bin/env python
# Search books by author and, optionally, date (looking for recent books).
# Copyright 2019 by Akkana Peck, share and enjoy under the GPLv2 or later.
# Can use any of three APIs:
# OpenLibrary (the default), Python isbnlib (-L) or Goodreads (-G).
# Goodreads requires an API key: create a file
#     ~/.config/newreads/goodreads.keys
# containing one line:
#     key YOUR_KEY_HERE
# OpenLibrary and isbnlib don't require a key.
######################################################################
# Notes on other possible APIs:
#
# The worldcat API initially looked promising:
# http://web.archive.org/web/20100616012651/http://worldcat.org/devnet/wiki/BasicAPIDetails
# But the WorldCat search API implies that you have to be a library
# to get an API key:
# https://www.oclc.org/developer/develop/web-services/worldcat-search-api.en.html
#
# https://isbndb.com/apidocs looks interesting, but isn't free.
#
# Or maybe scrape the Library of Congress search pages;
# the LOC has APIs for seemingly everything *except* books, weirdly.
# Typical LOC search page URL by author:
# https://catalog.loc.gov/vwebv/search?searchArg=connie+willis&searchCode=GKEY%5E*&searchType=1&limitTo=none&fromYear=&toYear=&limitTo=LOCA%3Dall&limitTo=PLAC%3Dall&limitTo=TYPE%3Dam&limitTo=LANG%3DENG&recCount=100&filter=Y
# They also offer MARC record downloads, but those are only free up to 2013;
# anything more recent requires buying a license for MARC downloads
# for something like $7k.
#
# Other sites that could be scraped:
# https://www.bookseriesinorder.com/connie-willis/
# https://www.fictiondb.com/search/searchresults.htm?srchtxt=robin+sloan
# Don't request any method more often than once a second
##### end API notes ##################################################
import argparse
import requests
import json
import time
import sys, os
class Book:
    """One book record, as gathered from any of the lookup APIs.

    Books are sortable by publication date (year, then month).  Books
    with the same date fall back to comparing ISBN, then Goodreads id.
    """

    # Index 0 means "month unknown".
    monthnames = [ '?', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec' ]

    def __init__(self, ISBN, title, authorlist, description,
                 pub_year, pub_month, goodreads_id=0):
        """Create a book.

        ISBN: the ISBN-13, or a false value if unknown.
        title: the title string.
        authorlist: list of author names; a single name given as a
            plain string is accepted and treated as a one-item list.
        description: description text (may be empty or None).
        pub_year, pub_month: anything int() understands; values that
            can't be parsed (or a month outside 1-12) are stored as 0,
            meaning "unknown".
        goodreads_id: numeric Goodreads id, 0 if unknown.
        """
        self.ISBN13 = ISBN
        self.title = title

        # A bare string would otherwise be joined character by character
        # in __repr__.
        if isinstance(authorlist, str):
            authorlist = [authorlist]
        self.authors = authorlist or []

        self.desc = description

        self.pub_year = self._to_int(pub_year)

        month = self._to_int(pub_month)
        # An out-of-range month can't be looked up in monthnames.
        if not 0 <= month < len(Book.monthnames):
            month = 0
        self.pub_month = month

        self.goodreads_id = self._to_int(goodreads_id)

        # Things that can be filled in later, which may or may not
        # be available from any given call:
        self.language = None

    @staticmethod
    def _to_int(value):
        """Return int(value), or 0 if value can't be converted."""
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return 0

    # Books need to be sortable by publication date
    def __lt__(self, other):
        if not isinstance(other, Book):
            return NotImplemented

        mine = (self.pub_year, self.pub_month)
        theirs = (other.pub_year, other.pub_month)
        if mine != theirs:
            return mine < theirs

        # Same publication date: break the tie on whichever identifier
        # both books have.
        if self.ISBN13 and other.ISBN13:
            return self.ISBN13 < other.ISBN13
        if self.goodreads_id and other.goodreads_id:
            return self.goodreads_id < other.goodreads_id
        return False

    def __repr__(self):
        retstr = '%s, by %s' % (self.title, ','.join(self.authors))
        if self.pub_month:
            retstr += ' (%s %d)' % (Book.monthnames[self.pub_month],
                                    self.pub_year)
        elif self.pub_year:
            retstr += ' %d' % self.pub_year

        if self.ISBN13:
            retstr += ' (ISBN %s)' % self.ISBN13
        if self.goodreads_id:
            retstr += ' (Goodreads %d)' % self.goodreads_id

        return retstr
# The new OpenLibrary API is much harder to use, and many entries
# lack important data like publication language, which makes it
# not really usable.
from pprint import pprint
class NewOpenLibraryAPI:
    """Experimental client for the newer OpenLibrary API.

    https://openlibrary.org/developers/api
    replacing older API (which, sigh, was easier to use).
    """

    def __init__(self):
        self.debug = True

    def book_by_ISBN(self, isbn):
        """Look up a single book by ISBN.

        This class does not implement a books_by_query method itself,
        so the lookup only works if one has been supplied (e.g. by a
        subclass); otherwise NotImplementedError is raised.
        """
        query = getattr(self, "books_by_query", None)
        if query is None:
            raise NotImplementedError(
                "ISBN lookup isn't implemented for the new OpenLibrary API")
        return query("isbn=" + isbn)

    def book_by_id(self, bookid):
        """Look up a single book by id."""
        # No special OpenLibrary key, use the isbn
        return self.book_by_ISBN(bookid)

    @staticmethod
    def _edition_year(edition):
        """Return the publication year of an edition record, or None.

        edition["publish_date"] is free-form text: it may be a plain
        year, or something longer containing a 4-digit year.
        """
        datestr = str(edition.get("publish_date", "")).strip()
        try:
            return int(datestr)
        except ValueError:
            pass
        for sep in ',-/.':
            datestr = datestr.replace(sep, ' ')
        for token in datestr.split():
            if len(token) == 4 and token.isdigit():
                return int(token)
        return None

    def _first_english_edition(self, editions):
        """Return (edition, year) for the earliest dated English edition
        in the given list of edition records, or (None, None).
        """
        first_ed = None
        first_year = None
        for edition in editions:
            languages = edition.get("languages")
            if not languages:
                # Many entries lack a language; skip them rather than
                # giving up on the whole search.
                if self.debug:
                    print("No languages, skipping edition",
                          edition.get("key"))
                continue
            if not any(lang.get("key") == "/languages/eng"
                       for lang in languages):
                continue

            # It's in English.
            year = self._edition_year(edition)
            if year is None:
                continue
            if first_year is None or year < first_year:
                first_ed = edition
                first_year = year
        return first_ed, first_year

    def books_by_author(self, authorname):
        '''Find books by the first author matching the given name.
           authorname is a string like "Connie Willis")
           Order of names probably doesn't matter.
           Returns two lists: booklist, anthologies.
           booklist holds Book objects, newest first;
           anthologies is always empty for this API.
        '''
        r = requests.get("https://openlibrary.org/search/authors.json?q="
                         + authorname.replace(' ', '+'))
        docs = r.json().get('docs')
        if not docs:
            print("No authors found matching '%s'" % authorname)
            return [], []
        authorkey = docs[0]['key']

        r = requests.get("https://openlibrary.org/authors/%s/works.json"
                         % authorkey)
        works = r.json()["entries"]

        booklist = []
        for work in works:
            # But the "work" entry doesn't include publication date!
            # So each book has to fetch the list of all editions
            # to get that.
            # Strip any leading slash from the key so it doesn't
            # produce a double slash in the URL.
            r = requests.get("https://openlibrary.org/%s/editions.json"
                             % work["key"].lstrip('/'))
            editions = r.json()

            first_ed, first_pubdate = \
                self._first_english_edition(editions.get('entries', []))

            # Hopefully by now first_ed is populated.
            # If not, skip to the next work.
            if not first_ed:
                print("Couldn't parse any editions:", editions)
                continue

            # isbn_13 is a list, and may be missing altogether.
            isbns = first_ed.get("isbn_13") or []
            book = Book(isbns[0] if isbns else None,
                        first_ed.get("title", work.get("title", "")),
                        [authorname],
                        None,     # No description
                        first_pubdate,
                        0,        # openlibrary doesn't have publish month
                        0)
            booklist.append(book)

        booklist.sort(reverse=True)
        return booklist, []
class OpenLibraryAPI:
    """Client for the OpenLibrary search.json endpoint."""

    def __init__(self):
        self.debug = True

    def book_by_ISBN(self, isbn):
        """Return the Book matching this ISBN, or None if nothing matched
        or the query failed.
        """
        result = self.books_by_query("isbn=" + isbn)
        if not result:
            return None
        booklist = result[0]
        return booklist[0] if booklist else None

    def book_by_id(self, bookid):
        """Return the Book with this id, or None."""
        # No special OpenLibrary key, use the isbn
        return self.book_by_ISBN(bookid)

    def books_by_author(self, authorname):
        '''Find books by all authors matching the given name.
           authorname is a string like "Connie Willis")
           Order of names probably doesn't matter.
           Returns two lists: booklist, anthologies.
           booklist holds Book objects, newest first;
           anthologies is always empty for this API.
           If the query fails, both lists are empty.
        '''
        result = self.books_by_query("author="
                                     + authorname.replace(' ', '+'))
        if not result:
            return [], []
        return result

    def books_by_query(self, query):
        """Run a search.json query and turn the results into Books.

        query is the URL query string, e.g. "author=Connie+Willis".
        Returns (booklist, []) with booklist sorted newest first.
        On a non-200 response, raises RuntimeError if self.debug is set,
        otherwise returns None.
        Raises RuntimeError if the answer has no 'docs' list.
        """
        url = "https://openlibrary.org/search.json?" + query
        response = requests.get(url)
        if response.status_code != 200:
            if self.debug:
                raise RuntimeError("Bad status %d on %s"
                                   % (response.status_code, url))
            return None

        results = json.loads(response.text)
        if 'docs' not in results:
            raise RuntimeError("Badly formatted answer")

        booklist = []
        for doc in results["docs"]:
            # Converting a record is best-effort: a record that can't be
            # handled is reported and skipped, it doesn't end the search.
            try:
                if "language" in doc and 'eng' not in doc['language']:
                    continue
                if "isbn" not in doc:
                    continue
                if len(doc["author_name"]) > 3:
                    # That many authors is too many
                    continue

                # There's also "publish_date" which is a list
                # of strings like '1995' and 'January 1, 1996'
                booklist.append(Book(doc["isbn"][0],
                                     doc["title"],
                                     doc["author_name"],
                                     None,     # No description
                                     doc.get("first_publish_year"),
                                     0,  # openlibrary doesn't have
                                         # publish month
                                     0))
            except Exception as e:
                print(">>> EXCEPTION:", e)
                from pprint import pprint
                pprint(doc)
                print()

        booklist.sort(reverse=True)
        return booklist, []
class ISBNlibAPI:
    """Look books up with the Python isbnlib module.

    isbnlib itself is imported by the script's top level, so it must
    have been imported before any of these methods is called.
    """

    def __init__(self):
        self.debug = False

    @staticmethod
    def _book_from_meta(meta, isbn=None):
        """Build a Book from an isbnlib metadata dictionary.
        isbn, if given, is used in preference to the record's own ISBN-13.
        Missing fields are treated as unknown rather than raising KeyError.
        """
        return Book(isbn or meta.get('ISBN-13'),
                    meta.get('Title', ''),
                    meta.get('Authors') or [],
                    '',
                    meta.get('Year'),
                    0)

    def book_by_ISBN(self, isbn):
        """Return the Book for this ISBN, or None if isbnlib has
        no metadata for it.
        """
        meta = isbnlib.meta(isbn)
        if not meta:
            return None
        return self._book_from_meta(meta, isbn)

    def book_by_id(self, bookid):
        """Return the Book for this id, which is taken to be an ISBN."""
        return self.book_by_ISBN(bookid)

    def books_by_author(self, authorname):
        '''Find books by all authors matching the given name.
           authorname is a string like "Connie Willis")
           Order of names probably doesn't matter.
           Returns two lists: booklist, anthologies.
           booklist holds Book objects, newest first;
           anthologies is always empty for this API.
        '''
        booklist = [self._book_from_meta(meta)
                    for meta in (isbnlib.goom(authorname) or [])]
        booklist.sort(reverse=True)
        return booklist, []
class GoodreadsAPI:
    """Client for the Goodreads XML API.

    Needs a developer key, read from ~/.config/newreads/goodreads.keys,
    a file of "name value" lines which must include a line
        key YOUR_KEY_HERE
    Blank lines and lines starting with # are ignored.
    """

    def __init__(self):
        """Read the API keys.
        Raises RuntimeError if the key file has no 'key' entry
        (and whatever open() raises if the file can't be read).
        """
        # Read keys
        self.keys = {}
        keyfilename = "~/.config/newreads/goodreads.keys"
        with open(os.path.expanduser(keyfilename)) as keyfile:
            for line in keyfile:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                parts = line.split()
                if len(parts) < 2:
                    # A name with no value: nothing to store
                    continue
                self.keys[parts[0]] = parts[1]

        if 'key' not in self.keys:
            raise RuntimeError("No key found in " + keyfilename)

        # The "secret" goodreads key is for writing back to their database.
        # This app doesn't do that anyway, but just in case it ever might:
        # if 'secret' not in self.keys:
        #     print("API key is there, but won't be able to write")

        self.debug = False

    @staticmethod
    def _text(tag, name, default=''):
        """Return the text of the first <name> element inside tag,
        or default if tag is None or has no such element.
        """
        child = tag.find(name) if tag is not None else None
        if child is None:
            return default
        return child.text

    @staticmethod
    def _int(value, default=0):
        """Return int(value), or default if it can't be converted."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return default

    def book_from_url(self, url):
        """Fetch a Goodreads book page (XML) and return it as a Book.

        Returns None if the response has no title.
        On a non-200 response, raises RuntimeError if self.debug is set,
        otherwise returns None.
        """
        if self.debug:
            print("url", url)
        r = requests.get(url)
        if r.status_code != 200:
            if self.debug:
                raise RuntimeError("Bad status %d on %s"
                                   % (r.status_code, url))
            return None

        soup = BeautifulSoup(r.text, 'lxml-xml')

        if soup.find('title') is None:
            # Without a title there's nothing usable in the response
            return None

        authors = []
        authorstag = soup.find('authors')
        if authorstag is not None:
            authors = [self._text(authortag, 'name')
                       for authortag in authorstag.find_all('author')]

        # The pages by goodreads ID seldom have pub year or month,
        # but the ISBN searches sometimes do. Doesn't hurt to try:
        # anything missing or unparseable becomes 0.
        return Book(self._text(soup, 'isbn13', default=0),
                    self._text(soup, 'title'),
                    authors,
                    self._text(soup, 'description'),
                    self._int(self._text(soup, 'original_publication_year')),
                    self._int(self._text(soup,
                                         'original_publication_month')),
                    goodreads_id=self._int(self._text(soup, 'id')))

    def book_by_ISBN(self, bookid):
        """Look up a book by ISBN; see book_from_url for the result."""
        if self.debug:
            print("Looking up book by isbn", bookid)
        # The reviews page includes book description and language
        url = 'https://www.goodreads.com/book/isbn/%s?key=%s' \
            % (bookid, self.keys['key'])
        return self.book_from_url(url)

    def book_by_id(self, bookid):
        """Look up a book by Goodreads id; see book_from_url."""
        # The reviews page includes book description and language
        url = 'https://www.goodreads.com/book/show/%s.xml?key=%s' \
            % (bookid, self.keys['key'])
        return self.book_from_url(url)

    def _books_for_author_id(self, author_id):
        """Page through one author's book list.

        Returns two unsorted lists of Books: those that list author_id
        among their authors, and those that don't.
        Raises RuntimeError on a non-200 response.
        """
        booklist = []
        anthologies = []

        page = 1
        while True:
            url = "https://www.goodreads.com/author/list/%s" \
                "?format=xml&key=%s&page=%d" \
                % (author_id, self.keys['key'], page)
            if self.debug:
                print("url", url)
            # Progress indicator, since paging can take a while
            print('%d... ' % page, end='', file=sys.stderr)
            sys.stderr.flush()

            r = requests.get(url)
            if r.status_code != 200:
                raise RuntimeError("Bad status %d on %s" % (r.status_code,
                                                            url))
            soup = BeautifulSoup(r.text, 'lxml-xml')

            bookstag = soup.find('books')
            if bookstag is None:
                print("Eek, author", author_id, "doesn't have any books")
                break

            for booktag in bookstag.find_all('book'):
                # isbn isn't reliably available from author search pages,
                # so books are identified by their Goodreads id.
                book_id = self._int(self._text(booktag, 'id'))

                # See if this is really a book authored by this author.
                # Goodreads inexplicably gives huge long lists that
                # include lots of books this author had nothing to do with.
                # Unfortunately, we can't just quit at that point;
                # valid books aren't necessarily listed before bogus books.
                authorlist = []
                isauthor = False
                for auth in booktag.find_all('author'):
                    authorlist.append(self._text(auth, 'name'))
                    if self._text(auth, 'id') == author_id:
                        isauthor = True

                book = Book(None,
                            self._text(booktag, 'title'),
                            authorlist,
                            self._text(booktag, 'description'),
                            # 1099 stands in for an unknown year
                            self._int(self._text(booktag,
                                                 'publication_year'),
                                      default=1099),
                            self._int(self._text(booktag,
                                                 'publication_month')),
                            goodreads_id=book_id)
                if isauthor:
                    booklist.append(book)
                else:
                    anthologies.append(book)

            # Stop at the last page, or if the paging information
            # is missing so there's no way to tell.
            end = self._int(bookstag.get('end'), default=None)
            total = self._int(bookstag.get('total'), default=None)
            if end is None or total is None or end >= total:
                break

            page += 1

        return booklist, anthologies

    def books_by_author(self, authorname):
        '''Find books by all authors matching the given name.
           authorname is a string like "Connie Willis")
           Order of names probably doesn't matter.
           Returns two lists: booklist, anthologies.
           booklist holds Books that list a matching author;
           anthologies holds the other Books Goodreads returned.
           Both are sorted newest first, and both are empty
           if no author matched.
           Raises RuntimeError on a non-200 response.
        '''
        # Url encode, either with %20 or with +, don't care.
        quoted = requests.utils.requote_uri(authorname)

        url = "https://www.goodreads.com/api/author_url/%s?key=%s" \
            % (quoted, self.keys['key'])
        if self.debug:
            print("URL", url)
        r = requests.get(url)
        if r.status_code != 200:
            raise RuntimeError("Bad status %d on %s" % (r.status_code, url))

        # This returns XML that includes <author id="NNNNN">
        soup = BeautifulSoup(r.text, 'lxml-xml')
        author_tags = soup.find_all('author')
        if not author_tags:
            print("No authors found matching '%s'" % authorname)
            return [], []

        booklist = []
        anthologies = []
        for authortag in author_tags:
            author_id = authortag.get('id')
            if not author_id:
                raise RuntimeError("No id in author tag: %s" % authortag)

            written, others = self._books_for_author_id(author_id)
            booklist.extend(written)
            anthologies.extend(others)

            # The Goodreads API has some sort of once-a-second limit,
            # but they're unclear exactly how that works,
            # which calls are limited. Just to be safe, it doesn't hurt
            # to wait a second between arguments.
            time.sleep(1)

        booklist.sort(reverse=True)
        anthologies.sort(reverse=True)
        return booklist, anthologies
def lookup_books(args):
    """Look up and print each author or book id given on the command line.

    args is the argparse namespace built by the script's top level;
    this reads author_or_id, ISBN, anthologies, year and desc from it.
    The lookups go through the module-level variable api, which must
    have been set before this is called.
    """
    def recent_enough(book):
        """True unless a -y year was given and the book is older."""
        return not args.year or book.pub_year >= args.year

    for val in args.author_or_id:
        val = val.strip()

        # If the argument is all digits, it's presumably an ID,
        # either an ISBN13 or a Goodreads ID.
        if val.isdigit():
            if args.ISBN or len(val) == 13:
                idtype = "ISBN"
                book = api.book_by_ISBN(val)
            else:
                idtype = "ID"
                book = api.book_by_id(val)

            if not book:
                print("No book with " + idtype, val)
                continue

            print(book)
            if book.desc:
                print(book.desc)
            continue

        # Look up by author's name.
        # A lookup that found nothing may give back None, or a pair
        # of Nones, rather than empty lists; treat those as empty.
        found = api.books_by_author(val)
        booklist, anthologies = found if found else (None, None)

        if args.anthologies:
            for book in anthologies or []:
                if recent_enough(book):
                    print(book)

        for book in booklist or []:
            if recent_enough(book):
                print(book)
                if args.desc:
                    print(book.desc)
                    print()
if __name__ == '__main__':
    # The command-line switches, as (flag, add_argument keywords) pairs.
    switches = [
        ('-a', dict(action="store_true", dest="anthologies",
                    help='Include anthologies that include this author')),
        ('-y', dict(action="store", dest="year", type=int,
                    help='year: Show only books from this year or later')),
        ('-d', dict(action="store_true", dest="desc",
                    help='description: show Goodreads descriptions')),
        ('-L', dict(action="store_true", dest="ISBNlib",
                    help='Use the ISBNlib Python library (incomplete)')),
        ('-G', dict(action="store_true", dest="Goodreads",
                    help='Use the Goodreads API (requires API key, going away)')),
        ('-I', dict(action="store_true", dest="ISBN",
                    help='Consider numbers as ISBN13 even if using the Goodreads API')),
        ('-D', dict(action="store_true", dest="debug",
                    help='Show debugging information, like URLs used')),
    ]

    parser = argparse.ArgumentParser()
    for flag, keywords in switches:
        parser.add_argument(flag, **keywords)
    parser.add_argument('author_or_id', nargs='+',
                        help="Authors or Goodreads numerical IDs")
    args = parser.parse_args(sys.argv[1:])

    # Choose the backend: Goodreads if asked for, else ISBNlib if
    # asked for, else OpenLibrary.
    # Each backend's imports have to happen here at the top level,
    # not in a function like lookup_books, so that the API classes
    # can see the imported names.
    if args.Goodreads:
        if args.debug:
            print("Using Goodreads API")
        import requests
        from bs4 import BeautifulSoup
        api = GoodreadsAPI()

    elif args.ISBNlib:
        if args.debug:
            print("Using Python ISBNlib")
        import isbnlib
        api = ISBNlibAPI()

    else:
        if args.debug:
            print("Using OpenLibrary API")
        api = OpenLibraryAPI()

    # -D switches debugging on; without it each backend
    # keeps its own default.
    if args.debug:
        api.debug = True

    try:
        lookup_books(args)
    except KeyboardInterrupt:
        print("Interrupt")