From ca2ddd90c3694129f1ce6c9ecfcd99ad82402055 Mon Sep 17 00:00:00 2001
From: Matthew
Date: Tue, 2 Jul 2024 11:17:47 +0300
Subject: [PATCH] Fix: retrying context closing (#28)

* Base for retrying
* Request-Response in puppeteer terms.
* Closing contexts after retry
* Typos and readme
* fix: missing cntx_id
* Proper merging from other branch
* Readme and extra import
* Version update
* Defaults
* Errback and CloseContextRequest fix
* Typo
* log warning if close context response status is not 200

---------

Co-authored-by: Max Varlamov
---
 README.md                     |  3 ++
 scrapypuppeteer/__init__.py   | 10 ++++--
 scrapypuppeteer/middleware.py | 62 ++++++++++++++++++++++++-----------
 scrapypuppeteer/request.py    | 38 ++++++++++++++++++++-
 scrapypuppeteer/response.py   |  3 +-
 setup.py                      |  2 +-
 6 files changed, 93 insertions(+), 25 deletions(-)

diff --git a/README.md b/README.md
index 7a7a350..0472135 100644
--- a/README.md
+++ b/README.md
@@ -102,6 +102,9 @@ By default, only cookies are sent.
 You would also like to send meta with your request. By default, you are not allowed to do this
 in order to sustain backward compatibility. You can change this behaviour by setting `PUPPETEER_INCLUDE_META` to True.
 
+Once your spider has finished crawling, the service middleware will close all used browser contexts with
+`scrapypuppeteer.CloseContextRequest`. It accepts a list of all browser contexts to be closed.
+
 ## Automatic recaptcha solving
 
 Enable PuppeteerRecaptchaDownloaderMiddleware to automatically solve recaptcha during scraping. We do not recommend
diff --git a/scrapypuppeteer/__init__.py b/scrapypuppeteer/__init__.py
index fa2aab5..4507597 100644
--- a/scrapypuppeteer/__init__.py
+++ b/scrapypuppeteer/__init__.py
@@ -1,2 +1,8 @@
-from .request import PuppeteerRequest
-from .response import *
+from .request import PuppeteerRequest, CloseContextRequest
+from .response import (
+    PuppeteerResponse,
+    PuppeteerHtmlResponse,
+    PuppeteerScreenshotResponse,
+    PuppeteerRecaptchaSolverResponse,
+    PuppeteerJsonResponse,
+)
diff --git a/scrapypuppeteer/middleware.py b/scrapypuppeteer/middleware.py
index 6afcee3..2875676 100644
--- a/scrapypuppeteer/middleware.py
+++ b/scrapypuppeteer/middleware.py
@@ -4,10 +4,12 @@
 from typing import List, Union
 from urllib.parse import urlencode, urljoin
 
-from scrapy import Request, signals
+from scrapy import signals
 from scrapy.crawler import Crawler
-from scrapy.exceptions import IgnoreRequest, NotConfigured
-from scrapy.http import Headers, TextResponse
+from scrapy.exceptions import IgnoreRequest, NotConfigured, DontCloseSpider
+from scrapy.http import Headers, TextResponse, Response
+from scrapy.utils.log import failure_to_exc_info
+from twisted.python.failure import Failure
 
 from scrapypuppeteer.actions import (
     Click,
@@ -26,7 +28,7 @@
     PuppeteerRecaptchaSolverResponse,
     PuppeteerJsonResponse,
 )
-from scrapypuppeteer.request import ActionRequest, PuppeteerRequest
+from scrapypuppeteer.request import ActionRequest, PuppeteerRequest, CloseContextRequest
 
 
 class PuppeteerServiceDownloaderMiddleware:
@@ -34,8 +36,8 @@ class PuppeteerServiceDownloaderMiddleware:
     This downloader middleware converts PuppeteerRequest instances to
     Puppeteer service API requests and then converts its responses to
     PuppeteerResponse instances. Additionally, it tracks all browser contexts
-    that spider uses and performs cleanup request to service once
-    is closed.
+    that the spider uses and performs a cleanup request to the service right
+    before the spider is closed.
 
Additionally, the middleware uses these meta-keys, do not use them, because their changing could possibly (almost probably) break determined behaviour: @@ -49,7 +51,7 @@ class PuppeteerServiceDownloaderMiddleware: PUPPETEER_INCLUDE_HEADERS (bool|list[str]) Determines which request headers will be sent to remote site by puppeteer service. Either True (all headers), False (no headers) or list of header names. - May be overriden per request. + May be overridden per request. By default, only cookies are sent. PUPPETEER_INCLUDE_META (bool) @@ -92,14 +94,24 @@ def from_crawler(cls, crawler): include_meta = crawler.settings.getbool(cls.SERVICE_META_SETTING, False) middleware = cls(crawler, service_url, include_headers, include_meta) crawler.signals.connect( - middleware.close_used_contexts, signal=signals.spider_closed + middleware.close_used_contexts, signal=signals.spider_idle ) return middleware def process_request(self, request, spider): - if not isinstance(request, PuppeteerRequest): - return + if isinstance(request, CloseContextRequest): + return self.process_close_context_request(request) + if isinstance(request, PuppeteerRequest): + return self.process_puppeteer_request(request) + + def process_close_context_request(self, request: CloseContextRequest): + if not request.is_valid_url: + return request.replace( + url=urljoin(self.service_base_url, "/close_context"), + ) + + def process_puppeteer_request(self, request: PuppeteerRequest): action = request.action service_url = urljoin(self.service_base_url, action.endpoint) service_params = self._encode_service_params(request) @@ -176,8 +188,6 @@ def process_response(self, request, response, spider): return response.replace(request=request) response_data = json.loads(response.text) - response_cls = self._get_response_class(puppeteer_request.action) - if response.status != 200: reason = response_data.pop("error", f"undefined, status {response.status}") self.service_logger.warning( @@ -188,6 +198,8 @@ def process_response(self, request, response, spider): self.used_contexts[id(spider)].add(context_id) return response + response_cls = self._get_response_class(puppeteer_request.action) + return self._form_response( response_cls, response_data, @@ -225,16 +237,28 @@ def _get_response_class(request_action): return PuppeteerJsonResponse def close_used_contexts(self, spider): - contexts = list(self.used_contexts[id(spider)]) + contexts = list(self.used_contexts.pop(id(spider), set())) if contexts: - request = Request( - urljoin(self.service_base_url, "/close_context"), - method="POST", - headers=Headers({"Content-Type": "application/json"}), + request = CloseContextRequest( + contexts, meta={"proxy": None}, - body=json.dumps(contexts), ) - return self.crawler.engine.downloader.fetch(request, None) + + def handle_close_contexts_result(result): + if isinstance(result, Response): + if result.status == 200: + self.service_logger.debug(f"Successfully closed {len(request.contexts)} " + f"contexts with request {result.request}") + else: + self.service_logger.warning(f"Could not close contexts: {result.text}") + elif isinstance(result, Failure): + self.service_logger.warning(f"Could not close contexts: {result.value}", + exc_info=failure_to_exc_info(result)) + + dfd = self.crawler.engine.download(request) + dfd.addBoth(handle_close_contexts_result) + + raise DontCloseSpider() class PuppeteerRecaptchaDownloaderMiddleware: diff --git a/scrapypuppeteer/request.py b/scrapypuppeteer/request.py index 4ffc477..a3d55a7 100644 --- a/scrapypuppeteer/request.py +++ 
b/scrapypuppeteer/request.py
@@ -1,6 +1,7 @@
+import json
 from typing import Tuple, List, Union
 
-from scrapy.http import Request
+from scrapy.http import Request, Headers
 
 from scrapypuppeteer.actions import GoTo, PuppeteerServiceAction
 
@@ -94,3 +95,38 @@ def __init__(
         self.page_id = page_id
         self.close_page = close_page
         self.include_headers = include_headers
+
+
+class CloseContextRequest(Request):
+    """
+    This request is used to close the browser contexts.
+
+    The response for this request is a regular scrapy HtmlResponse.
+    """
+
+    attributes: Tuple[str, ...] = Request.attributes + ("contexts",)
+
+    def __init__(self, contexts: List, **kwargs):
+        """
+        :param contexts: list of puppeteer contexts to close.
+
+        :param kwargs: arguments of scrapy.Request.
+        """
+        self.contexts = contexts
+        self.is_valid_url = False
+
+        if "url" in kwargs:
+            self.is_valid_url = True
+        url = kwargs.pop("url", "://")  # Incorrect url. To be replaced in middleware
+
+        kwargs["method"] = "POST"
+        kwargs["headers"] = Headers({"Content-Type": "application/json"})
+        kwargs["body"] = json.dumps(self.contexts)
+
+        super().__init__(url, **kwargs)
+
+    def __repr__(self):
+        return f"<CLOSE CONTEXT {self.url}>"
+
+    def __str__(self):
+        return self.__repr__()
diff --git a/scrapypuppeteer/response.py b/scrapypuppeteer/response.py
index efdef1e..08536b7 100644
--- a/scrapypuppeteer/response.py
+++ b/scrapypuppeteer/response.py
@@ -1,3 +1,4 @@
+import warnings
 from typing import Tuple, Union
 
 from scrapy.exceptions import ScrapyDeprecationWarning
@@ -6,8 +7,6 @@
 from scrapypuppeteer import PuppeteerRequest
 from scrapypuppeteer.actions import GoTo, PuppeteerServiceAction
 
-import warnings
-
 
 class PuppeteerResponse(TextResponse):
     attributes: Tuple[str, ...] = TextResponse.attributes + (
diff --git a/setup.py b/setup.py
index 8502e3c..4e8087b 100644
--- a/setup.py
+++ b/setup.py
@@ -7,7 +7,7 @@
 
 setup(
     name="scrapy-puppeteer-client",
-    version="0.2.0",
+    version="0.3.0",
     description="A library to use Puppeteer-managed browser in Scrapy spiders",
     long_description=long_description,
     long_description_content_type="text/markdown",
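
Editor's note: to make the new request type concrete, below is a minimal sketch (not part of the patch) of a spider that navigates via the service and then closes its browser context explicitly. The spider, URL, and callback names are illustrative; `response.context_id` is the context identifier carried by `PuppeteerResponse`, and the placeholder URL in `CloseContextRequest` is rewritten to the service's `/close_context` endpoint by `process_close_context_request`. Explicit closing is optional: once the spider goes idle, `close_used_contexts` builds the same `CloseContextRequest` itself and raises `DontCloseSpider` so the engine stays alive until the close call completes.

```python
# Illustrative sketch, not part of the patch. Assumes the service middleware
# is enabled and PUPPETEER_SERVICE_URL points at a running
# scrapy-puppeteer-service instance; spider and callback names are made up.
from scrapy import Spider

from scrapypuppeteer import CloseContextRequest, PuppeteerRequest
from scrapypuppeteer.actions import GoTo


class ExampleSpider(Spider):
    name = "example"

    def start_requests(self):
        # A normal service-routed request; the middleware records the
        # browser context id returned by the service.
        yield PuppeteerRequest(GoTo("https://example.com"), callback=self.parse)

    def parse(self, response):
        # ... extract data from the PuppeteerResponse here ...

        # Close this response's browser context explicitly. __init__ has
        # already serialized the context list into a JSON POST body, and the
        # middleware fills in the /close_context URL.
        yield CloseContextRequest(
            [response.context_id],
            callback=self.on_contexts_closed,
            errback=self.on_close_failed,
        )

    def on_contexts_closed(self, response):
        # The service answers with a plain HTTP response, not a PuppeteerResponse.
        self.logger.info("close_context returned HTTP %d", response.status)

    def on_close_failed(self, failure):
        self.logger.warning("Could not close contexts: %r", failure.value)
```

The idle-time path added by this patch behaves the same way, with the success/failure logging handled by `handle_close_contexts_result` instead of spider callbacks.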