Improve follow/follow_all (#41)
* Follow/follow_all

* Documentation

* Formatter and linter

* Priority fix

* Comments

* Example and fix

* Docstring

* Docstring

* ban any action except GoTo

* fix page_id = None

* Add Compose to except

* Fix action validation

* Fix action validation

* Response's state is saved now
MatthewZMSU authored Oct 23, 2024
1 parent 98704eb commit f41d5fa
Showing 4 changed files with 129 additions and 23 deletions.
23 changes: 7 additions & 16 deletions README.md
@@ -23,29 +23,18 @@ DOWNLOADER_MIDDLEWARES = {
'scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware': 1042
}

PUPPETEER_SERVICE_URL = 'http://localhost:3000'
PUPPETEER_SERVICE_URL = "http://localhost:3000" # Not necessary in other execution methods

# To change the execution method, you must add the corresponding setting:
EXECUTION_METHOD = "Puppeteer"
```
Available methods: `Puppeteer`, `Pyppeteer`, `Playwright`
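For example, a minimal `settings.py` sketch for running without the service (the service URL is only needed for the `Puppeteer` method, per the comment above):

```python
# settings.py sketch: execute actions locally via Playwright;
# PUPPETEER_SERVICE_URL is not needed for this method.
EXECUTION_METHOD = "Playwright"
```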

The `Pyppeteer` and `Playwright` methods do not require a running service. They use the pyppeteer and playwright libraries for Python to interact with the browser. Actions such as `CustomJsAction`, `RecaptchaSolver`, and `Har` are not available when using these methods.
`Pyppeteer` and `Playwright` methods do not require a running service.
They use the pyppeteer and playwright libraries for Python to interact with the browser.
Actions such as `CustomJsAction`, `RecaptchaSolver`, and `Har` are not available when using these methods.

To use the `Pyppeteer` or `Playwright` methods you need to install Chromium.


## Configuration

You should have [scrapy-puppeteer-service](https://github.com/ispras/scrapy-puppeteer-service) started.
Then add its URL to `settings.py` and enable puppeteer downloader middleware:
```python
DOWNLOADER_MIDDLEWARES = {
'scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware': 1042
}

PUPPETEER_SERVICE_URL = 'http://localhost:3000'
```
To use `Pyppeteer` or `Playwright` methods you need to install Chromium.

## Basic usage

@@ -130,6 +119,8 @@ class MySpider(scrapy.Spider):
)
```

You may also use the `follow_all` method to continue interacting.

On your first request, the service will create a new incognito browser context and a new page in it.
Their ids will be returned in the response object as `context_id` and `page_id` attributes.
Following such a response means passing the context and page ids on to the next request.
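For example, a minimal sketch of such a chain (the callback name and URL are illustrative, and `GoTo` is assumed to be imported from `scrapypuppeteer.actions`):

```python
def parse_main(self, response):
    # A sketch: follow() reuses the response's browser context; relative
    # URLs inside GoTo are resolved against response.url.
    yield response.follow(
        GoTo("/tag/books/"),      # illustrative URL
        callback=self.parse_tag,  # illustrative callback
        close_page=False,         # keep the resulting page open for further actions
    )
```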
49 changes: 49 additions & 0 deletions examples/spiders/follow.py
@@ -0,0 +1,49 @@
from scrapy import Spider
from scrapy.http import Response

from scrapypuppeteer import GoTo, PuppeteerRequest, PuppeteerResponse


class FollowSpider(Spider):
name = "follow"

start_urls = ["http://quotes.toscrape.com/page/1/"]

def start_requests(self):
for url in self.start_urls:
yield PuppeteerRequest(
GoTo(url),
close_page=False,
callback=self.goto_about,
errback=self.errback,
)

def goto_about(self, response: PuppeteerResponse):
# yield response.follow(
# response.css("div.quote span a")[0],
# callback=self.parse,
# errback=self.errback,
# close_page=False,
# )

# Or:
yield from response.follow_all(
response.css("div.quote span a"),
callback=self.parse,
errback=self.errback,
close_page=True,
)

# Or:
# yield from response.follow_all(
# css="div.quote span a",
# callback=self.parse,
# errback=self.errback,
# close_page=False,
# )

def parse(self, response: Response, **kwargs):
self.log(response.url.split("/")[-1])

def errback(self, failure):
self.log(failure)
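The spider above follows selectors; `follow_all` also accepts an iterable of explicit `GoTo` actions. A sketch of an extra callback in the same spider (URLs are illustrative):

```python
    def parse_listing(self, response: PuppeteerResponse):
        # Fan out to explicit URLs: each generated request opens its own
        # new page in the same browser context.
        yield from response.follow_all(
            actions=[
                GoTo("http://quotes.toscrape.com/page/2/"),
                GoTo("http://quotes.toscrape.com/page/3/"),
            ],
            callback=self.parse,
            errback=self.errback,
            close_page=True,
        )
```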
4 changes: 3 additions & 1 deletion scrapypuppeteer/request.py
@@ -92,7 +92,9 @@ def __init__(
if isinstance(action.actions[0], GoTo):
url = action.actions[0].url
elif not isinstance(action, PuppeteerServiceAction):
raise ValueError("Undefined browser action")
raise TypeError(
f"Undefined browser action: `{type(action)}`. `Expected PuppeteerServiceAction`"
)
if url is None:
raise ValueError(
"Request is not a goto-containing request and does not follow a response"
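With the stricter check, constructing a request from anything that is not a URL string or a `PuppeteerServiceAction` should now fail fast with a `TypeError` instead of a `ValueError`; a sketch:

```python
from scrapypuppeteer import PuppeteerRequest

# A sketch: a dict is neither a URL string nor a PuppeteerServiceAction,
# so construction is expected to raise TypeError under the new validation.
try:
    PuppeteerRequest({"url": "http://example.com"})
except TypeError as exc:
    print(exc)
```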
76 changes: 70 additions & 6 deletions scrapypuppeteer/response.py
@@ -1,11 +1,14 @@
import warnings
from typing import Tuple, Union
from typing import Generator, Tuple, Union

import parsel
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import TextResponse
from scrapy.http import HtmlResponse, TextResponse
from scrapy.http.response.text import _url_from_selector
from scrapy.link import Link

from scrapypuppeteer import PuppeteerRequest
from scrapypuppeteer.actions import GoTo, PuppeteerServiceAction
from scrapypuppeteer.actions import Compose, GoTo, PuppeteerServiceAction


class PuppeteerResponse(TextResponse):
@@ -38,7 +41,7 @@ def __init__(

def follow(
self,
action: Union[str, PuppeteerServiceAction],
action: Union[str, parsel.Selector, Link, PuppeteerServiceAction],
close_page=True,
accumulate_meta: bool = False,
**kwargs,
@@ -55,6 +58,10 @@
page_id = None if self.puppeteer_request.close_page else self.page_id
if isinstance(action, str):
action = self.urljoin(action)
elif isinstance(action, parsel.Selector):
action = self.urljoin(_url_from_selector(action))
elif isinstance(action, Link):
action = self.urljoin(action.url)
elif isinstance(action, GoTo):
action.url = self.urljoin(action.url)
else:
@@ -70,14 +77,71 @@
**kwargs,
)

def follow_all(
self,
actions=None,
close_page: bool = True,
accumulate_meta: bool = False,
css=None,
xpath=None,
**kwargs,
) -> Generator[PuppeteerRequest, None, None]:
"""
Execute actions in the same context but in other browser pages.
Only one of `actions`, `css`, or `xpath` must be specified.`
Note that original page from which the method was called lasts unaffected.
class PuppeteerHtmlResponse(PuppeteerResponse):
:param actions: iterable of PuppeteerActions or selectors
:param close_page: whether to close page after request completion
:param accumulate_meta: whether to accumulate meta from response
:param css: selector
:param xpath: selector
:return: Iterable[PuppeteerRequest]
"""

arguments = [x for x in (actions, css, xpath) if x is not None]
if len(arguments) != 1:
raise ValueError(
"Please supply exactly one of the following arguments: actions, css, xpath"
)
if not actions:
if css:
actions = self.css(css)
if xpath:
actions = self.xpath(xpath)
else:
# Ban any PuppeteerAction except GoTo and GoTo-like Compose
for action in actions:
if isinstance(action, PuppeteerServiceAction):
if isinstance(action, Compose):
action = action.actions[0]
if not isinstance(action, GoTo):
raise TypeError(f"Expected GoTo, got {type(action)}")

page_id = self.page_id
for action in actions:
self.page_id = None  # Clear page_id so a new page is created for this request
try:
next_request = self.follow(
action,
close_page=close_page,
accumulate_meta=accumulate_meta,
**kwargs,
)
finally:  # Restore the original state of the response
self.page_id = page_id
yield next_request


class PuppeteerHtmlResponse(PuppeteerResponse):
class PuppeteerHtmlResponse(PuppeteerResponse, HtmlResponse):
"""
scrapy.TextResponse capturing state of a page in browser.
Additionally, exposes received html and cookies via corresponding attributes.
"""

attributes: Tuple[str, ...] = PuppeteerResponse.attributes + ("html", "cookies")
attributes: Tuple[str, ...] = tuple(
set(PuppeteerResponse.attributes + HtmlResponse.attributes)
) + ("html", "cookies")
"""
A tuple of :class:`str` objects containing the name of all public
attributes of the class that are also keyword parameters of the
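A note on the validation in `follow_all`: a `Compose` is admitted only when its first inner action is a `GoTo`; the remaining actions are not inspected and run on the newly opened page. A sketch of a spider callback (the `Click` action and its selector are illustrative assumptions):

```python
from scrapypuppeteer.actions import Click, Compose, GoTo

def parse_page(self, response):
    # Each Compose must begin with GoTo to pass follow_all's check.
    yield from response.follow_all(
        actions=[
            Compose(
                GoTo("http://quotes.toscrape.com/page/2/"),
                Click("li.next > a"),  # illustrative follow-up action
            ),
        ],
        callback=self.parse,
        errback=self.errback,
    )
```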
