This repository has been archived by the owner on Jul 1, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
doi-url-download.py
61 lines (55 loc) · 1.89 KB
/
doi-url-download.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/python
# -*- coding: utf-8 -*-
""" Bot to download a list of PDFs from given URLs in Selenium. """
#
# (C) Federico Leva, 2018
#
# Distributed under the terms of the MIT license.
#
from xvfbwrapper import Xvfb
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import selenium.common.exceptions
from time import sleep
import random
# Xvfb (virtual X display) wrapper object. It is only instantiated here:
# nothing in the visible code calls start() on it, and the
# "with Xvfb()" line in main() is commented out.
vdisplay = Xvfb()
def getDriver():
    """Create the Chrome WebDriver used for the downloads.

    The browser profile is set up so that PDFs are handed to the download
    manager rather than opened in a tab, and so that no save dialog is
    requested.

    Returns:
        The ``webdriver.Chrome`` instance built from these preferences.
    """
    # Background on the preferences used below:
    # https://stackoverflow.com/a/47075896
    # http://stackoverflow.com/questions/12698843/ddg#12698844
    download_prefs = {}
    download_prefs["download.prompt_for_download"] = False
    download_prefs["download.directory_upgrade"] = True
    download_prefs["plugins.always_open_pdf_externally"] = True
    # Deliberately left unset (kept here as disabled alternatives):
    #   "download.default_directory": "~"
    #   "plugins.plugins_disabled": ["Chrome PDF Viewer"]

    opts = Options()
    opts.add_experimental_option("prefs", download_prefs)
    return webdriver.Chrome(chrome_options=opts)
def downloadUrl(driver, url=None):
    """Open *url* in the browser and click its PDF download icon.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address of the page to visit. ``None`` or an empty string is
            skipped with a warning instead of being sent to the browser.

    Returns:
        None. Failures while loading the page or locating the icon are
        reported on stdout and not raised, so a caller looping over many
        URLs keeps going.
    """
    if not url:
        # Covers the default argument and blank lines in the input list,
        # which would otherwise be passed to driver.get() as-is.
        print("WARNING: Empty URL, skipping")
        return

    try:
        driver.get(url)
        # Randomized pause of 200-900 seconds between loading the page and
        # looking for the download icon.
        sleep(random.randint(200, 900))
        # Disabled alternative selector:
        #   .stats-document-lh-action-downloadPdf_2
        elem = driver.find_element_by_css_selector(".icon-pdf-download")
    except selenium.common.exceptions.NoSuchElementException:
        print("WARNING: Could not click on %s" % url)
        return
    except selenium.common.exceptions.TimeoutException:
        print("ERROR: Selenium timeout")
        return
    except Exception:
        # Exception rather than a bare except, so that KeyboardInterrupt and
        # SystemExit still stop the run instead of being logged as an
        # "unknown error" and followed by another sleep.
        print("ERROR: Unknown error when downloading")
        sleep(random.randint(30, 180))
        return

    # The action chain is only needed once an element has been found.
    webdriver.ActionChains(driver).click(elem).perform()
def main(argv=None):
    """Feed every line of ``urls.txt`` to ``downloadUrl`` in one browser session.

    Args:
        argv: Accepted for a conventional entry-point signature; not read.
    """
    # Headless wrapping is currently switched off:
    # with Xvfb() as xvfb:
    with open('urls.txt', 'rb') as url_file:
        browser = getDriver()
        # Fixed five-minute pause after the browser starts and before the
        # first URL is requested.
        # NOTE(review): presumably leaves time for manual setup in the
        # browser window (e.g. logging in) -- confirm.
        sleep(300)
        for raw_line in url_file.readlines():
            # Each line still carries its trailing newline; strip it first.
            target = raw_line.strip()
            downloadUrl(browser, target)


# Only run the downloader when executed as a script, not when imported.
if __name__ == "__main__":
    main()