From fe7de05ec3704e1a6108a75160aef32930d0405a Mon Sep 17 00:00:00 2001
From: Mike Stringer
Date: Fri, 12 Jul 2013 11:04:39 -0500
Subject: [PATCH 1/8] Added the process_spider_exception method to the
 middleware, to release the lock and schedule the next request when an
 exception occurs

---
 scrapy_webdriver/middlewares.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/scrapy_webdriver/middlewares.py b/scrapy_webdriver/middlewares.py
index 052c199..ca34b50 100644
--- a/scrapy_webdriver/middlewares.py
+++ b/scrapy_webdriver/middlewares.py
@@ -57,3 +57,21 @@ def _process_requests(self, items_or_requests, start=False):
             if request is WebdriverRequest.WAITING:
                 continue  # Request has been enqueued, so drop it.
             yield request
+
+    def process_spider_exception(self, response, exception, spider):
+        """If there is an exception while parsing, feed the scrapy
+        scheduler with the next request from the queue in the
+        webdriver manager.
+        """
+        if isinstance(response.request, WebdriverRequest):
+
+            # release the lock that was acquired for this URL
+            self.manager.release(response.request.url)
+
+            # get the next request
+            next_request = self.manager.acquire_next()
+
+            # only schedule if the queue isn't empty
+            if next_request is not WebdriverRequest.WAITING:
+                scheduler = self.manager.crawler.engine.slots[spider].scheduler
+                scheduler.enqueue_request(next_request)

From 45270663e4c89415ef40928efec8594006219fa3 Mon Sep 17 00:00:00 2001
From: Mike Stringer
Date: Mon, 15 Jul 2013 15:52:06 -0500
Subject: [PATCH 2/8] Added a setting WEBDRIVER_TIMEOUT that sets the page
 load timeout on the browser

---
 scrapy_webdriver/manager.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/scrapy_webdriver/manager.py b/scrapy_webdriver/manager.py
index ec496dc..3436031 100644
--- a/scrapy_webdriver/manager.py
+++ b/scrapy_webdriver/manager.py
@@ -19,6 +19,7 @@ def __init__(self, crawler):
         self._browser = crawler.settings.get('WEBDRIVER_BROWSER', None)
         self._user_agent = crawler.settings.get('USER_AGENT', None)
         self._options = crawler.settings.get('WEBDRIVER_OPTIONS', dict())
+        self._timeout = crawler.settings.get('WEBDRIVER_TIMEOUT', None)
         self._webdriver = None
         if isinstance(self._browser, basestring):
             if '.' in self._browser:
@@ -52,6 +53,8 @@ def webdriver(self):
                 options[cap_attr] = self._desired_capabilities
             self._webdriver = self._browser(**options)
             self.crawler.signals.connect(self._cleanup, signal=engine_stopped)
+            if self._timeout:
+                self._webdriver.set_page_load_timeout(self._timeout)
         return self._webdriver
 
     def acquire(self, request):

From 2f2ce00a0525d4054ad31338ea13a04ede83f4fd Mon Sep 17 00:00:00 2001
From: Mike Stringer
Date: Thu, 1 Aug 2013 19:58:39 -0500
Subject: [PATCH 3/8] Added an optional timeout on the webdriver.get call in
 the download handler, to handle the situation where webdriver.get hangs
 forever.

---
 scrapy_webdriver/download.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/scrapy_webdriver/download.py b/scrapy_webdriver/download.py
index bf31fbb..5a0105f 100644
--- a/scrapy_webdriver/download.py
+++ b/scrapy_webdriver/download.py
@@ -1,3 +1,5 @@
+import signal
+
 from scrapy import log
 from scrapy.utils.decorator import inthread
 from scrapy.utils.misc import load_object
@@ -6,6 +8,8 @@
 
 FALLBACK_HANDLER = 'scrapy.core.downloader.handlers.http.HttpDownloadHandler'
 
+class WebdriverTimeout(Exception):
+    pass
 
 class WebdriverDownloadHandler(object):
    """This download handler uses webdriver, deferred in a thread.
@@ -15,8 +19,14 @@ class WebdriverDownloadHandler(object):
     """
     def __init__(self, settings):
         self._enabled = settings.get('WEBDRIVER_BROWSER') is not None
+        self._timeout = settings.get('WEBDRIVER_TIMEOUT')
         self._fallback_handler = load_object(FALLBACK_HANDLER)(settings)
 
+        # set the signal handler for the SIGALRM event
+        def handler(signum, frame):
+            raise WebdriverTimeout("'webdriver.get' took more than %s seconds." % self._timeout)
+        signal.signal(signal.SIGALRM, handler)
+
     def download_request(self, request, spider):
         """Return the result of the right download method for the request."""
         if self._enabled and isinstance(request, WebdriverRequest):
@@ -32,7 +42,18 @@ def download_request(self, request, spider):
     def _download_request(self, request, spider):
         """Download a request URL using webdriver."""
         log.msg('Downloading %s with webdriver' % request.url, level=log.DEBUG)
+
+        # set a countdown timer for the webdriver.get
+        if self._timeout:
+            signal.alarm(self._timeout)
+
+        # make the request
         request.manager.webdriver.get(request.url)
+
+        # if the get finishes, defuse the bomb
+        if self._timeout:
+            signal.alarm(0)
+
         return WebdriverResponse(request.url, request.manager.webdriver)
 
     @inthread

From c914b75821b713dd3537dedfeb4b4d4a369f5ecb Mon Sep 17 00:00:00 2001
From: Mike Stringer
Date: Fri, 2 Aug 2013 11:21:27 -0500
Subject: [PATCH 4/8] Switched the timeout error handler into download_request
 so that the URL can be included in the error message, and also try to stop
 execution of any hung scripts on the page with window.stop()

---
 scrapy_webdriver/download.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/scrapy_webdriver/download.py b/scrapy_webdriver/download.py
index 5a0105f..9c9fe20 100644
--- a/scrapy_webdriver/download.py
+++ b/scrapy_webdriver/download.py
@@ -3,6 +3,7 @@
 from scrapy import log
 from scrapy.utils.decorator import inthread
 from scrapy.utils.misc import load_object
+from scrapy.exceptions import IgnoreRequest
 
 from .http import WebdriverActionRequest, WebdriverRequest, WebdriverResponse
 
@@ -22,14 +23,19 @@ def __init__(self, settings):
         self._timeout = settings.get('WEBDRIVER_TIMEOUT')
         self._fallback_handler = load_object(FALLBACK_HANDLER)(settings)
 
-        # set the signal handler for the SIGALRM event
-        def handler(signum, frame):
-            raise WebdriverTimeout("'webdriver.get' took more than %s seconds." % self._timeout)
-        signal.signal(signal.SIGALRM, handler)
-
     def download_request(self, request, spider):
         """Return the result of the right download method for the request."""
         if self._enabled and isinstance(request, WebdriverRequest):
+
+            # set the signal handler for the SIGALRM event
+            def handler(signum, frame):
+                # stop the webdriver from executing anything
+                request.manager.webdriver.execute_script("window.stop()")
+                raise WebdriverTimeout("'webdriver.get' for '%s' took more than %s seconds." % (request.url, self._timeout))
+
+            # raise WebdriverTimeout("'webdriver.get' took more than %s seconds." % self._timeout)
+            signal.signal(signal.SIGALRM, handler)
+
             if isinstance(request, WebdriverActionRequest):
                 download = self._do_action_request
             else:

From cf290d72337d1350f3c891e392e5bd14de5a1f8c Mon Sep 17 00:00:00 2001
From: Mike Stringer
Date: Thu, 8 Aug 2013 01:51:26 -0500
Subject: [PATCH 5/8] Added exception handling around the webdriver.get call
 in the download handler so that download exceptions can be caught later on.
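
When the get fails, the exception instance (given an empty page_source) is
attached to the WebdriverResponse in place of the webdriver, so downstream
code can inspect it. A minimal sketch of how a consumer might guard against
such a failed response (hypothetical spider method, not part of this patch):

    def parse(self, response):
        # response.webdriver holds the exception instance if the get failed
        if isinstance(response.webdriver, Exception):
            self.log('Skipping %s after webdriver error: %s'
                     % (response.url, response.webdriver))
            return
        # ... normal parsing against the live webdriver goes here ...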
---
 scrapy_webdriver/download.py | 42 ++++++++++++++++++++++++++----------
 1 file changed, 31 insertions(+), 11 deletions(-)

diff --git a/scrapy_webdriver/download.py b/scrapy_webdriver/download.py
index 9c9fe20..0fe52bf 100644
--- a/scrapy_webdriver/download.py
+++ b/scrapy_webdriver/download.py
@@ -29,11 +29,23 @@ def download_request(self, request, spider):
 
             # set the signal handler for the SIGALRM event
             def handler(signum, frame):
-                # stop the webdriver from executing anything
-                request.manager.webdriver.execute_script("window.stop()")
-                raise WebdriverTimeout("'webdriver.get' for '%s' took more than %s seconds." % (request.url, self._timeout))
 
-                # raise WebdriverTimeout("'webdriver.get' took more than %s seconds." % self._timeout)
+                # kill the selenium webdriver process (with SIGTERM,
+                # so that it kills both the primary process and the
+                # process that gets spawned)
+                request.manager.webdriver.service.process.send_signal(signal.SIGTERM)
+
+                # set the defunct _webdriver attribute back to
+                # original value of None, so that the next time it is
+                # accessed it is recreated.
+                request.manager._webdriver = None
+
+                # log an informative warning message
+                msg = "'webdriver.get' for '%s' took more than %s seconds." % \
+                    (request.url, self._timeout)
+                spider.log(msg, level=log.WARNING)
+
+            # bind the handler
             signal.signal(signal.SIGALRM, handler)
 
             if isinstance(request, WebdriverActionRequest):
@@ -53,15 +65,23 @@ def _download_request(self, request, spider):
         if self._timeout:
             signal.alarm(self._timeout)
 
-        # make the request
-        request.manager.webdriver.get(request.url)
+        # make the get request
+        try:
+            request.manager.webdriver.get(request.url)
 
-        # if the get finishes, defuse the bomb
-        if self._timeout:
-            signal.alarm(0)
-
-        return WebdriverResponse(request.url, request.manager.webdriver)
+        # if the get fails for any reason, set the webdriver attribute of the
+        # response to the exception that occurred
+        except Exception, exception:
+            exception.page_source = ''
+            return WebdriverResponse(request.url, exception)
 
+        # if the get finishes, defuse the bomb and return a response with the
+        # webdriver attached
+        else:
+            if self._timeout:
+                signal.alarm(0)
+            return WebdriverResponse(request.url, request.manager.webdriver)
+
     @inthread
     def _do_action_request(self, request, spider):
         """Perform an action on a previously webdriver-loaded page."""

From 5bf0f78705cdfaa10e168ab2b4b2ace13727e5d4 Mon Sep 17 00:00:00 2001
From: Mike Stringer
Date: Thu, 8 Aug 2013 01:52:12 -0500
Subject: [PATCH 6/8] Added downloader middleware that outputs error logs to
 alert that a webdriver GET request was not successful.
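
Scrapy only runs middleware that is listed in the settings, so a project
would enable this class explicitly, roughly as follows (the priority value
700 is an arbitrary example, not something this patch mandates):

    DOWNLOADER_MIDDLEWARES = {
        'scrapy_webdriver.middlewares.WebdriverDownloaderMiddleware': 700,
    }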
---
 scrapy_webdriver/middlewares.py | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/scrapy_webdriver/middlewares.py b/scrapy_webdriver/middlewares.py
index ca34b50..c212d14 100644
--- a/scrapy_webdriver/middlewares.py
+++ b/scrapy_webdriver/middlewares.py
@@ -1,9 +1,9 @@
 from scrapy.exceptions import IgnoreRequest, NotConfigured
+from scrapy import log
 
-from .http import WebdriverActionRequest, WebdriverRequest
+from .http import WebdriverActionRequest, WebdriverRequest, WebdriverResponse
 from .manager import WebdriverManager
 
-
 class WebdriverSpiderMiddleware(object):
     """This middleware coordinates concurrent webdriver access attempts."""
     def __init__(self, crawler):
@@ -75,3 +75,22 @@ def process_spider_exception(self, response, exception, spider):
             if next_request is not WebdriverRequest.WAITING:
                 scheduler = self.manager.crawler.engine.slots[spider].scheduler
                 scheduler.enqueue_request(next_request)
+
+
+class WebdriverDownloaderMiddleware(object):
+    """This middleware handles webdriver.get failures."""
+
+    def process_response(self, request, response, spider):
+
+        # if there is a downloading error in the WebdriverResponse,
+        # make a nice error message
+        if isinstance(response, WebdriverResponse):
+            if isinstance(response.webdriver, Exception):
+                msg = 'Error while downloading %s with webdriver (%s)' % \
+                    (request.url, response.webdriver)
+                spider.log(msg, level=log.ERROR)
+
+        # but always still return the response. When there are errors,
+        # parse methods will probably fail.
+        return response
+

From 991c3a19540c68ac69e565961124b900289fd06f Mon Sep 17 00:00:00 2001
From: Mike Stringer
Date: Thu, 8 Aug 2013 14:16:38 -0500
Subject: [PATCH 7/8] More testing of the hang timeout handling in download.py

---
 scrapy_webdriver/download.py | 64 +++++++++++++++++++++++++-----------
 1 file changed, 45 insertions(+), 19 deletions(-)

diff --git a/scrapy_webdriver/download.py b/scrapy_webdriver/download.py
index 0fe52bf..5a73541 100644
--- a/scrapy_webdriver/download.py
+++ b/scrapy_webdriver/download.py
@@ -21,6 +21,7 @@ class WebdriverDownloadHandler(object):
     def __init__(self, settings):
         self._enabled = settings.get('WEBDRIVER_BROWSER') is not None
         self._timeout = settings.get('WEBDRIVER_TIMEOUT')
+        self._hang_timeout = settings.get('WEBDRIVER_HANG_TIMEOUT', None)
         self._fallback_handler = load_object(FALLBACK_HANDLER)(settings)
 
     def download_request(self, request, spider):
@@ -28,25 +29,27 @@ def download_request(self, request, spider):
         if self._enabled and isinstance(request, WebdriverRequest):
 
             # set the signal handler for the SIGALRM event
-            def handler(signum, frame):
+            if self._hang_timeout:
 
-                # kill the selenium webdriver process (with SIGTERM,
-                # so that it kills both the primary process and the
-                # process that gets spawned)
-                request.manager.webdriver.service.process.send_signal(signal.SIGTERM)
+                def alarm_handler(signum, frame):
 
-                # set the defunct _webdriver attribute back to
-                # original value of None, so that the next time it is
-                # accessed it is recreated.
-                request.manager._webdriver = None
+                    # kill the selenium webdriver process (with SIGTERM,
+                    # so that it kills both the primary process and the
+                    # process that gets spawned)
+                    request.manager.webdriver.service.process.send_signal(signal.SIGTERM)
 
-                # log an informative warning message
-                msg = "'webdriver.get' for '%s' took more than %s seconds." % \
-                    (request.url, self._timeout)
-                spider.log(msg, level=log.WARNING)
+                    # set the defunct _webdriver attribute back to
+                    # original value of None, so that the next time it is
+                    # accessed it is recreated.
+                    request.manager._webdriver = None
 
-            # bind the handler
-            signal.signal(signal.SIGALRM, handler)
+                    # log an informative warning message
+                    msg = "WebDriver.get for '%s' took more than WEBDRIVER_HANG_TIMEOUT (%ss)" % \
+                        (request.url, self._hang_timeout)
+                    spider.log(msg, level=log.INFO)
+
+                # bind the handler
+                signal.signal(signal.SIGALRM, alarm_handler)
 
             if isinstance(request, WebdriverActionRequest):
                 download = self._do_action_request
@@ -59,11 +62,11 @@
     @inthread
     def _download_request(self, request, spider):
         """Download a request URL using webdriver."""
-        log.msg('Downloading %s with webdriver' % request.url, level=log.DEBUG)
+        spider.log('Downloading %s with webdriver' % request.url, level=log.DEBUG)
 
         # set a countdown timer for the webdriver.get
-        if self._timeout:
-            signal.alarm(self._timeout)
+        if self._hang_timeout:
+            signal.alarm(self._hang_timeout)
 
         # make the get request
         try:
@@ -72,14 +75,37 @@ def _download_request(self, request, spider):
         # if the get fails for any reason, set the webdriver attribute of the
         # response to the exception that occurred
         except Exception, exception:
+
+            # since it's already failed, don't try to raise alarm anymore (this has no effect if the failure was due to the alarm)
+            if self._hang_timeout:
+                spider.log('setting alarm to 0 on FAILURE', level=log.DEBUG)
+                spider.log('FAIL: ' + str(request.manager._webdriver), level=log.DEBUG)
+                signal.alarm(0)
+
+            # set page_source to blank so that WebdriverResponse doesn't complain
             exception.page_source = ''
+
+            # log a nice error message
+            msg = 'Error while downloading %s with webdriver (%s)' % \
+                (request.url, exception)
+            spider.log(msg, level=log.ERROR)
+
+            # since manager.webdriver is a @property, this will recreate connection
+            webdriver = request.manager.webdriver
+            spider.log('FAIL 2. THIS SHOULD BE WEBDRIVER: ' + str(request.manager._webdriver), level=log.DEBUG)
             return WebdriverResponse(request.url, exception)
 
         # if the get finishes, defuse the bomb and return a response with the
         # webdriver attached
         else:
-            if self._timeout:
+
+            # since it succeeded, don't raise any alarm
+            if self._hang_timeout:
+                spider.log('setting alarm to 0 on SUCCESS', level=log.DEBUG)
+                spider.log('YEAH: ' + str(request.manager._webdriver), level=log.DEBUG)
                 signal.alarm(0)
+
+            # return the correct response
             return WebdriverResponse(request.url, request.manager.webdriver)
 
     @inthread

From b07e405ba03635361b1dca2ef05cf2df5af370f8 Mon Sep 17 00:00:00 2001
From: Mike Stringer
Date: Thu, 8 Aug 2013 14:17:23 -0500
Subject: [PATCH 8/8] Using dont_filter=True on WebdriverRequest in
 middlewares on exception

---
 scrapy_webdriver/middlewares.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scrapy_webdriver/middlewares.py b/scrapy_webdriver/middlewares.py
index c212d14..8581b02 100644
--- a/scrapy_webdriver/middlewares.py
+++ b/scrapy_webdriver/middlewares.py
@@ -74,7 +74,7 @@ def process_spider_exception(self, response, exception, spider):
             # only schedule if the queue isn't empty
             if next_request is not WebdriverRequest.WAITING:
                 scheduler = self.manager.crawler.engine.slots[spider].scheduler
-                scheduler.enqueue_request(next_request)
+                scheduler.enqueue_request(next_request.replace(dont_filter=True))
 
 
 class WebdriverDownloaderMiddleware(object):
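
--
Note on [PATCH 8/8]: the scheduler's duplicate filter already saw
next_request when it was first scheduled, so re-enqueueing it unchanged
risks having it silently dropped as a duplicate. Request.replace() returns
a copy of the request with the given attributes overridden, and
dont_filter=True makes the scheduler skip the duplicate filter for that
copy. A minimal illustration (sketch, not part of the patch):

    retry = next_request.replace(dont_filter=True)  # copy that bypasses the dupefilter
    scheduler.enqueue_request(retry)                # retried instead of filtered out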