Improve follow/follow_all (#41)

* Follow/follow_all * Documentation * Formatter and linter * Priority fix * Comments * Example and fix * Docstring * Docstring * ban any action except GoTo * fix page_id = None * Add Compose to except * Fix action validation * Fix action validation * Response's state is saved now
ispras · Oct 23, 2024 · f41d5fa · f41d5fa
1 parent 98704eb
commit f41d5fa
Show file tree

Hide file tree

Showing 4 changed files with 129 additions and 23 deletions.
diff --git a/README.md b/README.md
@@ -23,29 +23,18 @@ DOWNLOADER_MIDDLEWARES = {
     'scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware': 1042
 }
 
-PUPPETEER_SERVICE_URL = 'http://localhost:3000'
+PUPPETEER_SERVICE_URL = "http://localhost:3000"  # Not necessary in other execution methods
 
 # To change the execution method, you must add the corresponding setting:
 EXECUTION_METHOD = "Puppeteer"
 ```
 Available methods: `Puppeteer`, `Pyppeteer`, `Playwright`
 
-The `Pyppeteer` and `Playwright` methods do not require a running service. They use the pyppeteer and playwright libraries for Python to interact with the browser. Actions such as `CustomJsAction`, `RecaptchaSolver`, and `Har` are not available when using these methods.
+`Pyppeteer` and `Playwright` methods do not require a running service.
+They use the pyppeteer and playwright libraries for Python to interact with the browser.
+Actions such as `CustomJsAction`, `RecaptchaSolver`, and `Har` are not available when using these methods.
 
-To use the `Pyppeteer` or `Playwright` methods you need to install Chromium. 
-
-
-## Configuration
-
-You should have [scrapy-puppeteer-service](https://github.com/ispras/scrapy-puppeteer-service) started.
-Then add its URL to `settings.py` and enable puppeteer downloader middleware:
-```python
-DOWNLOADER_MIDDLEWARES = {
-    'scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware': 1042
-}
-
-PUPPETEER_SERVICE_URL = 'http://localhost:3000'
-``` 
+To use `Pyppeteer` or `Playwright` methods you need to install Chromium.
 
 ## Basic usage
 
@@ -130,6 +119,8 @@ class MySpider(scrapy.Spider):
             )
 ```
 
+You may also use `follow_all` method to continue interacting.
+
 On your first request the service will create a new incognito browser context and a new page in it.
 Their ids will be returned in the response object as `context_id` and `page_id` attributes.
 Following such response means passing context and page ids to next request.

diff --git a/examples/spiders/follow.py b/examples/spiders/follow.py
@@ -0,0 +1,49 @@
+from scrapy import Spider
+from scrapy.http import Response
+
+from scrapypuppeteer import GoTo, PuppeteerRequest, PuppeteerResponse
+
+
+class FollowSpider(Spider):
+    """
+    Example spider showing `PuppeteerResponse.follow` / `follow_all`.
+
+    Opens every start URL with a `GoTo` action, then follows the links
+    matched by ``div.quote span a`` on the loaded page.
+    """
+
+    name = "follow"
+
+    start_urls = ["http://quotes.toscrape.com/page/1/"]
+
+    def start_requests(self):
+        """Yield one `PuppeteerRequest` per start URL, keeping the page open."""
+        for url in self.start_urls:
+            yield PuppeteerRequest(
+                GoTo(url),
+                close_page=False,  # the page is not closed after this request
+                callback=self.goto_about,
+                errback=self.errback,
+            )
+
+    def goto_about(self, response: PuppeteerResponse):
+        """
+        Follow the links selected by ``div.quote span a``.
+
+        Three equivalent ways are shown; only the `follow_all` call with
+        an explicit selector list is active, the others are kept as
+        commented-out illustrations.
+        """
+        # Single link via `follow`:
+        # yield response.follow(
+        #     response.css("div.quote span a")[0],
+        #     callback=self.parse,
+        #     errback=self.errback,
+        #     close_page=False,
+        # )
+
+        # Or:
+        yield from response.follow_all(
+            response.css("div.quote span a"),
+            callback=self.parse,
+            errback=self.errback,
+            close_page=True,
+        )
+
+        # Or:
+        # yield from response.follow_all(
+        #     css="div.quote span a",
+        #     callback=self.parse,
+        #     errback=self.errback,
+        #     close_page=False,
+        # )
+
+    def parse(self, response: Response, **kwargs):
+        """Log the last ``/``-separated segment of the response URL."""
+        self.log(response.url.split("/")[-1])
+
+    def errback(self, failure):
+        """Log the failure passed by Scrapy when a request errors out."""
+        self.log(failure)
diff --git a/scrapypuppeteer/request.py b/scrapypuppeteer/request.py
@@ -92,7 +92,9 @@ def __init__(
             if isinstance(action.actions[0], GoTo):
                 url = action.actions[0].url
         elif not isinstance(action, PuppeteerServiceAction):
-            raise ValueError("Undefined browser action")
+            raise TypeError(
+                f"Undefined browser action: `{type(action)}`. `Expected PuppeteerServiceAction`"
+            )
         if url is None:
             raise ValueError(
                 "Request is not a goto-containing request and does not follow a response"

diff --git a/scrapypuppeteer/response.py b/scrapypuppeteer/response.py
@@ -1,11 +1,14 @@
 import warnings
-from typing import Tuple, Union
+from typing import Generator, Tuple, Union
 
+import parsel
 from scrapy.exceptions import ScrapyDeprecationWarning
-from scrapy.http import TextResponse
+from scrapy.http import HtmlResponse, TextResponse
+from scrapy.http.response.text import _url_from_selector
+from scrapy.link import Link
 
 from scrapypuppeteer import PuppeteerRequest
-from scrapypuppeteer.actions import GoTo, PuppeteerServiceAction
+from scrapypuppeteer.actions import Compose, GoTo, PuppeteerServiceAction
 
 
 class PuppeteerResponse(TextResponse):
@@ -38,7 +41,7 @@ def __init__(
 
     def follow(
         self,
-        action: Union[str, PuppeteerServiceAction],
+        action: Union[str, parsel.Selector, Link, PuppeteerServiceAction],
         close_page=True,
         accumulate_meta: bool = False,
         **kwargs,
@@ -55,6 +58,10 @@ def follow(
         page_id = None if self.puppeteer_request.close_page else self.page_id
         if isinstance(action, str):
             action = self.urljoin(action)
+        elif isinstance(action, parsel.Selector):
+            action = self.urljoin(_url_from_selector(action))
+        elif isinstance(action, Link):
+            action = self.urljoin(action.url)
         elif isinstance(action, GoTo):
             action.url = self.urljoin(action.url)
         else:
@@ -70,14 +77,71 @@ def follow(
             **kwargs,
         )
 
+    def follow_all(
+        self,
+        actions=None,
+        close_page: bool = True,
+        accumulate_meta: bool = False,
+        css=None,
+        xpath=None,
+        **kwargs,
+    ) -> Generator[PuppeteerRequest, None, None]:
+        """
+        Execute actions in the same context but in other browser pages.
+        Exactly one of `actions`, `css`, or `xpath` must be specified.
+        Note that the original page from which the method was called remains unaffected.

-class PuppeteerHtmlResponse(PuppeteerResponse):
+        :param actions: iterable of PuppeteerActions or selectors
+        :param close_page: whether to close page after request completion
+        :param accumulate_meta: whether to accumulate meta from response
+        :param css: selector
+        :param xpath: selector
+        :return: Iterable[PuppeteerRequest]
+        :raises ValueError: if not exactly one of `actions`, `css`, `xpath` is given
+        :raises TypeError: if `actions` contains a PuppeteerServiceAction that is
+            neither a GoTo nor a Compose whose first action is a GoTo
+
+        This is a generator function, so the checks above run when the
+        result is first iterated, not when the method is called.
+        """
+
+        arguments = [x for x in (actions, css, xpath) if x is not None]
+        if len(arguments) != 1:
+            raise ValueError(
+                "Please supply exactly one of the following arguments: actions, css, xpath"
+            )
+        # Exactly one argument is not None here; compare against None rather
+        # than truthiness so an empty selector string is not silently ignored.
+        if css is not None:
+            actions = self.css(css)
+        elif xpath is not None:
+            actions = self.xpath(xpath)
+        else:
+            # Materialize first: the validation pass below would otherwise
+            # exhaust a one-shot iterable (e.g. a generator) and the request
+            # loop would then yield nothing.
+            actions = list(actions)
+            # Ban any PuppeteerAction except GoTo and GoTo-like Compose
+            for action in actions:
+                if isinstance(action, PuppeteerServiceAction):
+                    if isinstance(action, Compose):
+                        action = action.actions[0]
+                    if not isinstance(action, GoTo):
+                        raise TypeError(f"Expected GoTo, got {type(action)}")
+
+        page_id = self.page_id
+        for action in actions:
+            self.page_id = None  # Substitution of page_id in order to create new page
+            try:
+                next_request = self.follow(
+                    action,
+                    close_page=close_page,
+                    accumulate_meta=accumulate_meta,
+                    **kwargs,
+                )
+            finally:  # To save the original state of response
+                self.page_id = page_id
+            yield next_request
+
+
+class PuppeteerHtmlResponse(PuppeteerResponse, HtmlResponse):
     """
     scrapy.TextResponse capturing state of a page in browser.
     Additionally, exposes received html and cookies via corresponding attributes.
     """
 
-    attributes: Tuple[str, ...] = PuppeteerResponse.attributes + ("html", "cookies")
+    attributes: Tuple[str, ...] = tuple(
+        set(PuppeteerResponse.attributes + HtmlResponse.attributes)
+    ) + ("html", "cookies")
     """
         A tuple of :class:`str` objects containing the name of all public
         attributes of the class that are also keyword parameters of the