Skip to content
This repository has been archived by the owner on Sep 28, 2022. It is now read-only.

Add the process_spider_exception method to the middleware (#9)

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
79 changes: 76 additions & 3 deletions scrapy_webdriver/download.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
import signal

from scrapy import log
from scrapy.utils.decorator import inthread
from scrapy.utils.misc import load_object
from scrapy.exceptions import IgnoreRequest

from .http import WebdriverActionRequest, WebdriverRequest, WebdriverResponse

FALLBACK_HANDLER = 'scrapy.core.downloader.handlers.http.HttpDownloadHandler'

class WebdriverTimeout(Exception):
    """Raised when a webdriver page load exceeds the configured timeout."""

class WebdriverDownloadHandler(object):
"""This download handler uses webdriver, deferred in a thread.
Expand All @@ -15,11 +20,37 @@ class WebdriverDownloadHandler(object):
"""
def __init__(self, settings):
    """Read the webdriver-related settings and build the fallback handler.

    The handler only activates when WEBDRIVER_BROWSER is configured;
    otherwise non-webdriver requests fall through to the stock HTTP handler.
    """
    get = settings.get
    self._enabled = get('WEBDRIVER_BROWSER') is not None
    self._timeout = get('WEBDRIVER_TIMEOUT')
    self._hang_timeout = get('WEBDRIVER_HANG_TIMEOUT', None)
    # plain HTTP handler used for requests webdriver should not serve
    self._fallback_handler = load_object(FALLBACK_HANDLER)(settings)

def download_request(self, request, spider):
"""Return the result of the right download method for the request."""
if self._enabled and isinstance(request, WebdriverRequest):

# set the signal handler for the SIGALRM event
if self._hang_timeout:

def alarm_handler(signum, frame):

# kill the selenium webdriver process (with SIGTERM,
# so that it kills both the primary process and the
# process that gets spawned)
request.manager.webdriver.service.process.send_signal(signal.SIGTERM)

# set the defunct _webdriver attribute back to
# original value of None, so that the next time it is
# accessed it is recreated.
request.manager._webdriver = None

# log an informative warning message
msg = "WebDriver.get for '%s' took more than WEBDRIVER_HANG_TIMEOUT (%ss)" % \
(request.url, self._hang_timeout)
spider.log(msg, level=log.INFO)

# bind the handler
signal.signal(signal.SIGALRM, alarm_handler)

if isinstance(request, WebdriverActionRequest):
download = self._do_action_request
else:
Expand All @@ -31,10 +62,52 @@ def download_request(self, request, spider):
@inthread
def _download_request(self, request, spider):
    """Download a request URL using webdriver, in a downloader thread.

    If WEBDRIVER_HANG_TIMEOUT is set, a SIGALRM countdown (armed here,
    handler installed in download_request) kills a hung browser process.
    On any failure the exception itself is attached to the returned
    WebdriverResponse in place of the webdriver, with a blank page_source
    so the response object can still be constructed.
    """
    spider.log('Downloading %s with webdriver' % request.url, level=log.DEBUG)

    # arm the watchdog before the potentially-hanging get()
    # NOTE(review): signal.alarm from a non-main thread is unreliable on
    # some platforms — confirm @inthread keeps this on a thread where
    # SIGALRM delivery works as intended.
    if self._hang_timeout:
        signal.alarm(self._hang_timeout)

    try:
        request.manager.webdriver.get(request.url)
    except Exception as exception:
        # defuse the alarm; harmless no-op when the alarm itself fired
        if self._hang_timeout:
            signal.alarm(0)

        # give WebdriverResponse a minimal page_source so it doesn't complain
        exception.page_source = '<html><head></head><body></body></html>'

        msg = 'Error while downloading %s with webdriver (%s)' % \
            (request.url, exception)
        spider.log(msg, level=log.ERROR)

        # manager.webdriver is a @property; accessing it recreates the
        # connection that the alarm handler may have torn down
        request.manager.webdriver
        return WebdriverResponse(request.url, exception)
    else:
        # success: cancel the pending alarm and hand back the live driver
        if self._hang_timeout:
            signal.alarm(0)
        return WebdriverResponse(request.url, request.manager.webdriver)

@inthread
def _do_action_request(self, request, spider):
"""Perform an action on a previously webdriver-loaded page."""
Expand Down
3 changes: 3 additions & 0 deletions scrapy_webdriver/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def __init__(self, crawler):
self._browser = crawler.settings.get('WEBDRIVER_BROWSER', None)
self._user_agent = crawler.settings.get('USER_AGENT', None)
self._options = crawler.settings.get('WEBDRIVER_OPTIONS', dict())
self._timeout = crawler.settings.get('WEBDRIVER_TIMEOUT', None)
self._webdriver = None
if isinstance(self._browser, basestring):
if '.' in self._browser:
Expand Down Expand Up @@ -52,6 +53,8 @@ def webdriver(self):
options[cap_attr] = self._desired_capabilities
self._webdriver = self._browser(**options)
self.crawler.signals.connect(self._cleanup, signal=engine_stopped)
if self._timeout:
self._webdriver.set_page_load_timeout(self._timeout)
return self._webdriver

def acquire(self, request):
Expand Down
41 changes: 39 additions & 2 deletions scrapy_webdriver/middlewares.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy import log

from .http import WebdriverActionRequest, WebdriverRequest
from .http import WebdriverActionRequest, WebdriverRequest, WebdriverResponse
from .manager import WebdriverManager


class WebdriverSpiderMiddleware(object):
"""This middleware coordinates concurrent webdriver access attempts."""
def __init__(self, crawler):
Expand Down Expand Up @@ -57,3 +57,40 @@ def _process_requests(self, items_or_requests, start=False):
if request is WebdriverRequest.WAITING:
continue # Request has been enqueued, so drop it.
yield request

def process_spider_exception(self, response, exception, spider):
    """Keep the webdriver queue moving after a parse failure.

    Releases the page lock held for the failed WebdriverRequest and, if
    another webdriver request is waiting, enqueues it with the spider's
    scheduler so the crawl does not stall.
    """
    if not isinstance(response.request, WebdriverRequest):
        return

    # free the slot that was acquired for this URL
    self.manager.release(response.request.url)

    # pull the next pending request, if any, and hand it to the scheduler
    queued = self.manager.acquire_next()
    if queued is not WebdriverRequest.WAITING:
        scheduler = self.manager.crawler.engine.slots[spider].scheduler
        scheduler.enqueue_request(queued.replace(dont_filter=True))


class WebdriverDownloaderMiddleware(object):
    """Surfaces webdriver.get failures carried on WebdriverResponse objects."""

    def process_response(self, request, response, spider):
        """Log an error for a failed webdriver download, then pass the
        response through unchanged (parse callbacks may still fail on it).
        """
        download_failed = (isinstance(response, WebdriverResponse)
                           and isinstance(response.webdriver, Exception))
        if download_failed:
            msg = 'Error while downloading %s with webdriver (%s)' % \
                (request.url, response.webdriver)
            spider.log(msg, level=log.ERROR)
        return response