From fe7de05ec3704e1a6108a75160aef32930d0405a Mon Sep 17 00:00:00 2001
From: Mike Stringer
Date: Fri, 12 Jul 2013 11:04:39 -0500
Subject: [PATCH 1/8] Added the process_spider_exception method to the
 middleware, to release the lock and schedule the next request when an
 exception occurs

---
 scrapy_webdriver/middlewares.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/scrapy_webdriver/middlewares.py b/scrapy_webdriver/middlewares.py
index 052c199..ca34b50 100644
--- a/scrapy_webdriver/middlewares.py
+++ b/scrapy_webdriver/middlewares.py
@@ -57,3 +57,21 @@ def _process_requests(self, items_or_requests, start=False):
             if request is WebdriverRequest.WAITING:
                 continue  # Request has been enqueued, so drop it.
             yield request
+
+    def process_spider_exception(self, response, exception, spider):
+        """If there is an exception while parsing, feed the scrapy
+        scheduler with the next request from the queue in the
+        webdriver manager.
+        """
+        if isinstance(response.request, WebdriverRequest):
+
+            # release the lock that was acquired for this URL
+            self.manager.release(response.request.url)
+
+            # get the next request
+            next_request = self.manager.acquire_next()
+
+            # only schedule if the queue isn't empty
+            if next_request is not WebdriverRequest.WAITING:
+                scheduler = self.manager.crawler.engine.slots[spider].scheduler
+                scheduler.enqueue_request(next_request)

From 45270663e4c89415ef40928efec8594006219fa3 Mon Sep 17 00:00:00 2001
From: Mike Stringer
Date: Mon, 15 Jul 2013 15:52:06 -0500
Subject: [PATCH 2/8] Added a setting WEBDRIVER_TIMEOUT that sets the page
 load timeout on the browser

---
 scrapy_webdriver/manager.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/scrapy_webdriver/manager.py b/scrapy_webdriver/manager.py
index ec496dc..3436031 100644
--- a/scrapy_webdriver/manager.py
+++ b/scrapy_webdriver/manager.py
@@ -19,6 +19,7 @@ def __init__(self, crawler):
         self._browser = crawler.settings.get('WEBDRIVER_BROWSER', None)
         self._user_agent = crawler.settings.get('USER_AGENT', None)
         self._options = crawler.settings.get('WEBDRIVER_OPTIONS', dict())
+        self._timeout = crawler.settings.get('WEBDRIVER_TIMEOUT', None)
         self._webdriver = None
         if isinstance(self._browser, basestring):
             if '.' in self._browser:
@@ -52,6 +53,8 @@ def webdriver(self):
                 options[cap_attr] = self._desired_capabilities
             self._webdriver = self._browser(**options)
             self.crawler.signals.connect(self._cleanup, signal=engine_stopped)
+            if self._timeout:
+                self._webdriver.set_page_load_timeout(self._timeout)
         return self._webdriver
 
     def acquire(self, request):

From 2f2ce00a0525d4054ad31338ea13a04ede83f4fd Mon Sep 17 00:00:00 2001
From: Mike Stringer
Date: Thu, 1 Aug 2013 19:58:39 -0500
Subject: [PATCH 3/8] Added an optional timeout on the webdriver.get call in
 the download handler, to handle the situation where webdriver.get hangs
 forever.

---
 scrapy_webdriver/download.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/scrapy_webdriver/download.py b/scrapy_webdriver/download.py
index bf31fbb..5a0105f 100644
--- a/scrapy_webdriver/download.py
+++ b/scrapy_webdriver/download.py
@@ -1,3 +1,5 @@
+import signal
+
 from scrapy import log
 from scrapy.utils.decorator import inthread
 from scrapy.utils.misc import load_object
@@ -6,6 +8,8 @@
 
 FALLBACK_HANDLER = 'scrapy.core.downloader.handlers.http.HttpDownloadHandler'
 
+class WebdriverTimeout(Exception):
+    pass
 
 class WebdriverDownloadHandler(object):
    """This download handler uses webdriver, deferred in a thread.
@@ -15,8 +19,14 @@ class WebdriverDownloadHandler(object):
     """
     def __init__(self, settings):
         self._enabled = settings.get('WEBDRIVER_BROWSER') is not None
+        self._timeout = settings.get('WEBDRIVER_TIMEOUT')
         self._fallback_handler = load_object(FALLBACK_HANDLER)(settings)
 
+        # set the signal handler for the SIGALRM event
+        def handler(signum, frame):
+            raise WebdriverTimeout("'webdriver.get' took more than %s seconds." % self._timeout)
+        signal.signal(signal.SIGALRM, handler)
+
     def download_request(self, request, spider):
         """Return the result of the right download method for the request."""
         if self._enabled and isinstance(request, WebdriverRequest):
@@ -32,7 +42,18 @@ def download_request(self, request, spider):
     def _download_request(self, request, spider):
         """Download a request URL using webdriver."""
         log.msg('Downloading %s with webdriver' % request.url, level=log.DEBUG)
+
+        # set a countdown timer for the webdriver.get
+        if self._timeout:
+            signal.alarm(self._timeout)
+
+        # make the request
         request.manager.webdriver.get(request.url)
+
+        # if the get finishes, defuse the bomb
+        if self._timeout:
+            signal.alarm(0)
+
         return WebdriverResponse(request.url, request.manager.webdriver)
 
     @inthread

From c914b75821b713dd3537dedfeb4b4d4a369f5ecb Mon Sep 17 00:00:00 2001
From: Mike Stringer
Date: Fri, 2 Aug 2013 11:21:27 -0500
Subject: [PATCH 4/8] Switched the timeout error handler into download_request
 so that the URL can be included in the error message, and also try to stop
 execution of any hung scripts on the page with window.stop()

---
 scrapy_webdriver/download.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/scrapy_webdriver/download.py b/scrapy_webdriver/download.py
index 5a0105f..9c9fe20 100644
--- a/scrapy_webdriver/download.py
+++ b/scrapy_webdriver/download.py
@@ -3,6 +3,7 @@
 from scrapy import log
 from scrapy.utils.decorator import inthread
 from scrapy.utils.misc import load_object
+from scrapy.exceptions import IgnoreRequest
 
 from .http import WebdriverActionRequest, WebdriverRequest, WebdriverResponse
 
@@ -22,14 +23,19 @@ def __init__(self, settings):
         self._timeout = settings.get('WEBDRIVER_TIMEOUT')
         self._fallback_handler = load_object(FALLBACK_HANDLER)(settings)
 
-        # set the signal handler for the SIGALRM event
-        def handler(signum, frame):
-            raise WebdriverTimeout("'webdriver.get' took more than %s seconds." % self._timeout)
-        signal.signal(signal.SIGALRM, handler)
-
     def download_request(self, request, spider):
         """Return the result of the right download method for the request."""
         if self._enabled and isinstance(request, WebdriverRequest):
+
+            # set the signal handler for the SIGALRM event
+            def handler(signum, frame):
+                # stop the webdriver from executing anything
+                request.manager.webdriver.execute_script("window.stop()")
+                raise WebdriverTimeout("'webdriver.get' for '%s' took more than %s seconds." % (request.url, self._timeout))
+
+            # raise WebdriverTimeout("'webdriver.get' took more than %s seconds." % self._timeout)
+            signal.signal(signal.SIGALRM, handler)
+
             if isinstance(request, WebdriverActionRequest):
                 download = self._do_action_request
             else:

From cf290d72337d1350f3c891e392e5bd14de5a1f8c Mon Sep 17 00:00:00 2001
From: Mike Stringer
Date: Thu, 8 Aug 2013 01:51:26 -0500
Subject: [PATCH 5/8] Added exception handling around the webdriver.get call
 in the download handler so that download exceptions can be caught later on.
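
When the get fails, the exception instance (given an empty page_source) is
attached to the WebdriverResponse in place of the webdriver, so downstream
code can inspect it. A minimal sketch of how a consumer might guard against
such a failed response (hypothetical spider method, not part of this patch):

    def parse(self, response):
        # response.webdriver holds the exception instance if the get failed
        if isinstance(response.webdriver, Exception):
            self.log('Skipping %s after webdriver error: %s'
                     % (response.url, response.webdriver))
            return
        # ... normal parsing against the live webdriver goes here ...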
---
 scrapy_webdriver/download.py | 42 ++++++++++++++++++++++++++----------
 1 file changed, 31 insertions(+), 11 deletions(-)

diff --git a/scrapy_webdriver/download.py b/scrapy_webdriver/download.py
index 9c9fe20..0fe52bf 100644
--- a/scrapy_webdriver/download.py
+++ b/scrapy_webdriver/download.py
@@ -29,11 +29,23 @@ def download_request(self, request, spider):
 
             # set the signal handler for the SIGALRM event
             def handler(signum, frame):
-                # stop the webdriver from executing anything
-                request.manager.webdriver.execute_script("window.stop()")
-                raise WebdriverTimeout("'webdriver.get' for '%s' took more than %s seconds." % (request.url, self._timeout))
 
-                # raise WebdriverTimeout("'webdriver.get' took more than %s seconds." % self._timeout)
+                # kill the selenium webdriver process (with SIGTERM,
+                # so that it kills both the primary process and the
+                # process that gets spawned)
+                request.manager.webdriver.service.process.send_signal(signal.SIGTERM)
+
+                # set the defunct _webdriver attribute back to
+                # original value of None, so that the next time it is
+                # accessed it is recreated.
+                request.manager._webdriver = None
+
+                # log an informative warning message
+                msg = "'webdriver.get' for '%s' took more than %s seconds." % \
+                    (request.url, self._timeout)
+                spider.log(msg, level=log.WARNING)
+
+            # bind the handler
             signal.signal(signal.SIGALRM, handler)
 
             if isinstance(request, WebdriverActionRequest):
@@ -53,15 +65,23 @@ def _download_request(self, request, spider):
         if self._timeout:
             signal.alarm(self._timeout)
 
-        # make the request
-        request.manager.webdriver.get(request.url)
+        # make the get request
+        try:
+            request.manager.webdriver.get(request.url)
 
-        # if the get finishes, defuse the bomb
-        if self._timeout:
-            signal.alarm(0)
-
-        return WebdriverResponse(request.url, request.manager.webdriver)
+        # if the get fails for any reason, set the webdriver attribute of the
+        # response to the exception that occurred
+        except Exception, exception:
+            exception.page_source = ''
+            return WebdriverResponse(request.url, exception)
 
+        # if the get finishes, defuse the bomb and return a response with the
+        # webdriver attached
+        else:
+            if self._timeout:
+                signal.alarm(0)
+            return WebdriverResponse(request.url, request.manager.webdriver)
+
     @inthread
     def _do_action_request(self, request, spider):
         """Perform an action on a previously webdriver-loaded page."""

From 5bf0f78705cdfaa10e168ab2b4b2ace13727e5d4 Mon Sep 17 00:00:00 2001
From: Mike Stringer
Date: Thu, 8 Aug 2013 01:52:12 -0500
Subject: [PATCH 6/8] Added downloader middleware that outputs error logs to
 alert that a webdriver GET request was not successful.
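
Scrapy only runs middleware that is listed in the settings, so a project
would enable this class explicitly, roughly as follows (the priority value
700 is an arbitrary example, not something this patch mandates):

    DOWNLOADER_MIDDLEWARES = {
        'scrapy_webdriver.middlewares.WebdriverDownloaderMiddleware': 700,
    }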
---
 scrapy_webdriver/middlewares.py | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/scrapy_webdriver/middlewares.py b/scrapy_webdriver/middlewares.py
index ca34b50..c212d14 100644
--- a/scrapy_webdriver/middlewares.py
+++ b/scrapy_webdriver/middlewares.py
@@ -1,9 +1,9 @@
 from scrapy.exceptions import IgnoreRequest, NotConfigured
+from scrapy import log
 
-from .http import WebdriverActionRequest, WebdriverRequest
+from .http import WebdriverActionRequest, WebdriverRequest, WebdriverResponse
 from .manager import WebdriverManager
 
-
 class WebdriverSpiderMiddleware(object):
     """This middleware coordinates concurrent webdriver access attempts."""
     def __init__(self, crawler):
@@ -75,3 +75,22 @@ def process_spider_exception(self, response, exception, spider):
             if next_request is not WebdriverRequest.WAITING:
                 scheduler = self.manager.crawler.engine.slots[spider].scheduler
                 scheduler.enqueue_request(next_request)
+
+
+class WebdriverDownloaderMiddleware(object):
+    """This middleware handles webdriver.get failures."""
+
+    def process_response(self, request, response, spider):
+
+        # if there is a downloading error in the WebdriverResponse,
+        # make a nice error message
+        if isinstance(response, WebdriverResponse):
+            if isinstance(response.webdriver, Exception):
+                msg = 'Error while downloading %s with webdriver (%s)' % \
+                    (request.url, response.webdriver)
+                spider.log(msg, level=log.ERROR)
+
+        # but always still return the response. When there are errors,
+        # parse methods will probably fail.
+        return response
+

From 991c3a19540c68ac69e565961124b900289fd06f Mon Sep 17 00:00:00 2001
From: Mike Stringer
Date: Thu, 8 Aug 2013 14:16:38 -0500
Subject: [PATCH 7/8] More testing of the hang timeout handling in download.py

---
 scrapy_webdriver/download.py | 64 +++++++++++++++++++++++++-----------
 1 file changed, 45 insertions(+), 19 deletions(-)

diff --git a/scrapy_webdriver/download.py b/scrapy_webdriver/download.py
index 0fe52bf..5a73541 100644
--- a/scrapy_webdriver/download.py
+++ b/scrapy_webdriver/download.py
@@ -21,6 +21,7 @@ class WebdriverDownloadHandler(object):
     def __init__(self, settings):
         self._enabled = settings.get('WEBDRIVER_BROWSER') is not None
         self._timeout = settings.get('WEBDRIVER_TIMEOUT')
+        self._hang_timeout = settings.get('WEBDRIVER_HANG_TIMEOUT', None)
         self._fallback_handler = load_object(FALLBACK_HANDLER)(settings)
 
     def download_request(self, request, spider):
@@ -28,25 +29,27 @@ def download_request(self, request, spider):
         if self._enabled and isinstance(request, WebdriverRequest):
 
             # set the signal handler for the SIGALRM event
-            def handler(signum, frame):
+            if self._hang_timeout:
 
-                # kill the selenium webdriver process (with SIGTERM,
-                # so that it kills both the primary process and the
-                # process that gets spawned)
-                request.manager.webdriver.service.process.send_signal(signal.SIGTERM)
+                def alarm_handler(signum, frame):
 
-                # set the defunct _webdriver attribute back to
-                # original value of None, so that the next time it is
-                # accessed it is recreated.
-                request.manager._webdriver = None
+                    # kill the selenium webdriver process (with SIGTERM,
+                    # so that it kills both the primary process and the
+                    # process that gets spawned)
+                    request.manager.webdriver.service.process.send_signal(signal.SIGTERM)
 
-                # log an informative warning message
-                msg = "'webdriver.get' for '%s' took more than %s seconds." % \
-                    (request.url, self._timeout)
-                spider.log(msg, level=log.WARNING)
+                    # set the defunct _webdriver attribute back to
+                    # original value of None, so that the next time it is
+                    # accessed it is recreated.
+                    request.manager._webdriver = None
 
-            # bind the handler
-            signal.signal(signal.SIGALRM, handler)
+                    # log an informative warning message
+                    msg = "WebDriver.get for '%s' took more than WEBDRIVER_HANG_TIMEOUT (%ss)" % \
+                        (request.url, self._hang_timeout)
+                    spider.log(msg, level=log.INFO)
+
+                # bind the handler
+                signal.signal(signal.SIGALRM, alarm_handler)
 
             if isinstance(request, WebdriverActionRequest):
                 download = self._do_action_request
@@ -59,11 +62,11 @@
     @inthread
     def _download_request(self, request, spider):
         """Download a request URL using webdriver."""
-        log.msg('Downloading %s with webdriver' % request.url, level=log.DEBUG)
+        spider.log('Downloading %s with webdriver' % request.url, level=log.DEBUG)
 
         # set a countdown timer for the webdriver.get
-        if self._timeout:
-            signal.alarm(self._timeout)
+        if self._hang_timeout:
+            signal.alarm(self._hang_timeout)
 
         # make the get request
         try:
@@ -72,14 +75,37 @@ def _download_request(self, request, spider):
         # if the get fails for any reason, set the webdriver attribute of the
         # response to the exception that occurred
         except Exception, exception:
+
+            # since it's already failed, don't try to raise alarm anymore (this has no effect if the failure was due to the alarm)
+            if self._hang_timeout:
+                spider.log('setting alarm to 0 on FAILURE', level=log.DEBUG)
+                spider.log('FAIL: ' + str(request.manager._webdriver), level=log.DEBUG)
+                signal.alarm(0)
+
+            # set page_source to blank so that WebdriverResponse doesn't complain
             exception.page_source = ''
+
+            # log a nice error message
+            msg = 'Error while downloading %s with webdriver (%s)' % \
+                (request.url, exception)
+            spider.log(msg, level=log.ERROR)
+
+            # since manager.webdriver is a @property, this will recreate connection
+            webdriver = request.manager.webdriver
+            spider.log('FAIL 2. THIS SHOULD BE WEBDRIVER: ' + str(request.manager._webdriver), level=log.DEBUG)
             return WebdriverResponse(request.url, exception)
 
         # if the get finishes, defuse the bomb and return a response with the
         # webdriver attached
         else:
-            if self._timeout:
+
+            # since it succeeded, don't raise any alarm
+            if self._hang_timeout:
+                spider.log('setting alarm to 0 on SUCCESS', level=log.DEBUG)
+                spider.log('YEAH: ' + str(request.manager._webdriver), level=log.DEBUG)
                 signal.alarm(0)
+
+            # return the correct response
             return WebdriverResponse(request.url, request.manager.webdriver)
 
     @inthread

From b07e405ba03635361b1dca2ef05cf2df5af370f8 Mon Sep 17 00:00:00 2001
From: Mike Stringer
Date: Thu, 8 Aug 2013 14:17:23 -0500
Subject: [PATCH 8/8] Using dont_filter=True on WebdriverRequest in
 middlewares on exception

---
 scrapy_webdriver/middlewares.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scrapy_webdriver/middlewares.py b/scrapy_webdriver/middlewares.py
index c212d14..8581b02 100644
--- a/scrapy_webdriver/middlewares.py
+++ b/scrapy_webdriver/middlewares.py
@@ -74,7 +74,7 @@ def process_spider_exception(self, response, exception, spider):
             # only schedule if the queue isn't empty
             if next_request is not WebdriverRequest.WAITING:
                 scheduler = self.manager.crawler.engine.slots[spider].scheduler
-                scheduler.enqueue_request(next_request)
+                scheduler.enqueue_request(next_request.replace(dont_filter=True))
 
 
 class WebdriverDownloaderMiddleware(object):
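
--
Note on [PATCH 8/8]: the scheduler's duplicate filter already saw
next_request when it was first scheduled, so re-enqueueing it unchanged
risks having it silently dropped as a duplicate. Request.replace() returns
a copy of the request with the given attributes overridden, and
dont_filter=True makes the scheduler skip the duplicate filter for that
copy. A minimal illustration (sketch, not part of the patch):

    retry = next_request.replace(dont_filter=True)  # copy that bypasses the dupefilter
    scheduler.enqueue_request(retry)                # retried instead of filtered out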