From 9bd72a89313e12655139f89da3b6f0b0fb4763be Mon Sep 17 00:00:00 2001 From: Matthew Date: Mon, 1 Jul 2024 12:18:12 +0300 Subject: [PATCH] Logging errors from service (#29) * added fail reason to logging in the service middleware * Formatting and import fix * Print -> self.log * Now logging level is warning --- examples/spiders/auto_recaptcha.py | 14 +++++++------- examples/spiders/manual_recaptcha.py | 14 +++++++------- scrapypuppeteer/middleware.py | 6 ++++++ 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/examples/spiders/auto_recaptcha.py b/examples/spiders/auto_recaptcha.py index 7d45478..2d3fef4 100644 --- a/examples/spiders/auto_recaptcha.py +++ b/examples/spiders/auto_recaptcha.py @@ -1,6 +1,6 @@ -import base64 - +import logging import scrapy +import base64 from twisted.python.failure import Failure from scrapypuppeteer import PuppeteerRequest @@ -37,7 +37,7 @@ def start_requests(self): ) def parse_html(self, response: PuppeteerResponse, **kwargs): - with open(f"recaptcha_page.html", "wb") as f: + with open("recaptcha_page.html", "wb") as f: f.write(response.body) action = Screenshot( options={ @@ -48,13 +48,13 @@ def parse_html(self, response: PuppeteerResponse, **kwargs): action, callback=self.make_screenshot, errback=self.error, close_page=True ) - def make_screenshot(self, response: PuppeteerScreenshotResponse, **kwargs): + @staticmethod + def make_screenshot(response: PuppeteerScreenshotResponse, **kwargs): data = ( response.screenshot ) # Note that data is string containing bytes, don't forget to decode them! with open("imageToSave.png", "wb") as fh: fh.write(base64.b64decode(data)) - @staticmethod - def error(failure: Failure): - print(f"We are in error function!") + def error(self, failure: Failure): + self.log("We are in error function!", level=logging.WARNING) diff --git a/examples/spiders/manual_recaptcha.py b/examples/spiders/manual_recaptcha.py index ba2e66f..dc2a8b3 100644 --- a/examples/spiders/manual_recaptcha.py +++ b/examples/spiders/manual_recaptcha.py @@ -1,12 +1,12 @@ +import logging import scrapy +import base64 from twisted.python.failure import Failure from scrapypuppeteer import PuppeteerRequest from scrapypuppeteer.actions import GoTo, RecaptchaSolver, Click, Screenshot from scrapypuppeteer.response import PuppeteerResponse, PuppeteerScreenshotResponse -import base64 - class ManualRecaptchaSpider(scrapy.Spider): name = "manual_recaptcha" @@ -42,7 +42,7 @@ def submit_recaptcha(self, response, **kwargs): ) def parse_html(self, response: PuppeteerResponse, **kwargs): - with open(f"recaptcha_page.html", "wb") as f: + with open("recaptcha_page.html", "wb") as f: f.write(response.body) action = Screenshot( options={ @@ -53,13 +53,13 @@ def parse_html(self, response: PuppeteerResponse, **kwargs): action, callback=self.make_screenshot, errback=self.error, close_page=True ) - def make_screenshot(self, response: PuppeteerScreenshotResponse, **kwargs): + @staticmethod + def make_screenshot(response: PuppeteerScreenshotResponse, **kwargs): data = ( response.screenshot ) # Note that data is string containing bytes, don't forget to decode them! with open("imageToSave.png", "wb") as fh: fh.write(base64.b64decode(data)) - @staticmethod - def error(failure: Failure): - print(f"We are in error function!") + def error(self, failure: Failure): + self.log("We are in error function!", level=logging.WARNING) diff --git a/scrapypuppeteer/middleware.py b/scrapypuppeteer/middleware.py index 8e1caaf..6afcee3 100644 --- a/scrapypuppeteer/middleware.py +++ b/scrapypuppeteer/middleware.py @@ -62,6 +62,8 @@ class PuppeteerServiceDownloaderMiddleware: SERVICE_META_SETTING = "PUPPETEER_INCLUDE_META" DEFAULT_INCLUDE_HEADERS = ["Cookie"] # TODO send them separately + service_logger = logging.getLogger(__name__) + def __init__( self, crawler: Crawler, @@ -177,6 +179,10 @@ def process_response(self, request, response, spider): response_cls = self._get_response_class(puppeteer_request.action) if response.status != 200: + reason = response_data.pop("error", f"undefined, status {response.status}") + self.service_logger.warning( + f"Request {request} is not succeeded. Reason: {reason}" + ) context_id = response_data.get("contextId") if context_id: self.used_contexts[id(spider)].add(context_id)