Added optional source url restrictions to Rule #1

Open · wants to merge 3 commits into base: master
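This patch adds two optional arguments to Rule: allow_sources and deny_sources. Each accepts one or more regular expressions (strings or precompiled patterns), and CrawlSpider._requests_to_follow() now checks the URL of the response being parsed against them, skipping the rule entirely when the source page is filtered out. A minimal usage sketch follows; the spider name, URLs, and patterns are illustrative only, not part of the patch:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ExampleSpider(CrawlSpider):
    # Hypothetical spider showing the new Rule arguments.
    name = 'example'
    allowed_domains = ['example.org']
    start_urls = ['http://example.org/listing/']

    rules = (
        # Extract article links, but only when the current page is a listing page.
        Rule(LinkExtractor(allow=r'/article/'),
             allow_sources=r'example\.org/listing/',
             callback='parse_article'),
        # Follow pagination links from any page except the archive.
        Rule(LinkExtractor(allow=r'/page/\d+'),
             deny_sources=r'example\.org/archive/',
             follow=True),
    )

    def parse_article(self, response):
        yield {'url': response.url}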
20 changes: 18 additions & 2 deletions scrapy/spiders/crawl.py
@@ -9,8 +9,10 @@
 import six
 
 from scrapy.http import Request, HtmlResponse
-from scrapy.utils.spider import iterate_spider_output
+from scrapy.linkextractors import _re_type, _matches
 from scrapy.spiders import Spider
+from scrapy.utils.misc import arg_to_iter
+from scrapy.utils.spider import iterate_spider_output
 
 
 def identity(x):
@@ -19,8 +21,13 @@ def identity(x):
 
 class Rule(object):
 
-    def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None, process_links=None, process_request=identity):
+    def __init__(self, link_extractor, allow_sources=(), deny_sources=(), callback=None,
+                 cb_kwargs=None, follow=None, process_links=None, process_request=identity):
         self.link_extractor = link_extractor
+        self.allow_res = [x if isinstance(x, _re_type) else re.compile(x)
+                          for x in arg_to_iter(allow_sources)]
+        self.deny_res = [x if isinstance(x, _re_type) else re.compile(x)
+                         for x in arg_to_iter(deny_sources)]
         self.callback = callback
         self.cb_kwargs = cb_kwargs or {}
         self.process_links = process_links
@@ -29,6 +36,13 @@ def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None, p
             self.follow = False if callback else True
         else:
             self.follow = follow
+
+    def source_allowed(self, url):
+        if self.allow_res and not _matches(url, self.allow_res):
+            return False
+        if self.deny_res and _matches(url, self.deny_res):
+            return False
+        return True
 
 
 class CrawlSpider(Spider):
@@ -53,6 +67,8 @@ def _requests_to_follow(self, response):
             return
         seen = set()
         for n, rule in enumerate(self._rules):
+            if not rule.source_allowed(response.url):
+                continue
             links = [lnk for lnk in rule.link_extractor.extract_links(response)
                      if lnk not in seen]
             if links and rule.process_links:
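Taken together, the two checks give source_allowed() these semantics: an empty allow_sources admits every source URL, a non-empty one admits only URLs matching at least one pattern, and deny_sources is checked second, so a deny match wins even when an allow pattern also matches. A short illustrative snippet (the URLs and patterns are made up for the example):

import re

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

rule = Rule(LinkExtractor(),
            allow_sources=r'example\.org/docs/',
            deny_sources=r'example\.org/docs/private/')

assert rule.source_allowed('http://example.org/docs/index.html')          # matches allow
assert not rule.source_allowed('http://example.org/blog/post.html')       # fails allow
assert not rule.source_allowed('http://example.org/docs/private/a.html')  # deny wins

# Precompiled patterns are accepted too, via the isinstance(x, _re_type) branch.
pdf_rule = Rule(LinkExtractor(), deny_sources=re.compile(r'\.pdf$'))
assert not pdf_rule.source_allowed('http://example.org/report.pdf')
assert pdf_rule.source_allowed('http://example.org/report.html')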
35 changes: 35 additions & 0 deletions tests/test_spider.py
@@ -289,6 +289,41 @@ def test_follow_links_attribute_deprecated_population(self):
         self.assertTrue(hasattr(spider, '_follow_links'))
         self.assertFalse(spider._follow_links)
 
+    def test_allow_deny_sources_filter(self):
+        response_pass_url = HtmlResponse("http://example.org/pass_page/index.html",
+                                         body=self.test_body)
+        response_catch_url = HtmlResponse("http://example.org/catch_page/index.html",
+                                          body=self.test_body)
+
+        class _CrawlSpider(self.spider_class):
+            import re
+            name = "test"
+            allowed_domains = ['example.org']
+            rules = ()
+
+        spider = _CrawlSpider()
+        # Source url matches allow_sources
+        spider._rules = (Rule(LinkExtractor(), allow_sources=r'example.org/pass_page'),)
+        output = list(spider._requests_to_follow(response_pass_url))
+        self.assertEqual(len(output), 3)
+        # Source url does not match allow_sources
+        spider._rules = (Rule(LinkExtractor(), allow_sources=r'example.org/pass_page'),)
+        output = list(spider._requests_to_follow(response_catch_url))
+        self.assertEqual(len(output), 0)
+        # Source url does not match deny_sources
+        spider._rules = (Rule(LinkExtractor(), deny_sources=r'example.org/catch_page'),)
+        output = list(spider._requests_to_follow(response_pass_url))
+        self.assertEqual(len(output), 3)
+        # Source url matches deny_sources
+        spider._rules = (Rule(LinkExtractor(), deny_sources=r'example.org/catch_page'),)
+        output = list(spider._requests_to_follow(response_catch_url))
+        self.assertEqual(len(output), 0)
+        # Source url matches allow_sources and does not match deny_sources
+        spider._rules = (Rule(LinkExtractor(), allow_sources=r'example.org/pass_page',
+                              deny_sources=r'example.org/catch_page'),)
+        output = list(spider._requests_to_follow(response_pass_url))
+        self.assertEqual(len(output), 3)
 
 
 class SitemapSpiderTest(SpiderTest):