From 71567e6398b4ef5b976285d19cd170016846f3bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 26 Aug 2024 22:55:22 +0200 Subject: [PATCH 1/9] Switch auto field stats to an item pipeline --- docs/reference/settings.rst | 41 +- docs/setup.rst | 13 + scrapy_zyte_api/_poet_item_pipelines.py | 90 +++ scrapy_zyte_api/addon.py | 2 + scrapy_zyte_api/poet.py | 1 + scrapy_zyte_api/providers.py | 35 +- setup.cfg | 2 + setup.py | 3 +- tests/__init__.py | 4 + tests/test_addon.py | 7 + tests/test_api_requests.py | 3 +- tests/test_auto_field_stats.py | 782 ++++++++++++++++++++++++ tests/test_providers.py | 594 +----------------- 13 files changed, 943 insertions(+), 634 deletions(-) create mode 100644 scrapy_zyte_api/_poet_item_pipelines.py create mode 100644 scrapy_zyte_api/poet.py create mode 100644 tests/test_auto_field_stats.py diff --git a/docs/reference/settings.rst b/docs/reference/settings.rst index 563052f9..824d28cf 100644 --- a/docs/reference/settings.rst +++ b/docs/reference/settings.rst @@ -13,13 +13,11 @@ ZYTE_API_AUTO_FIELD_STATS Default: ``False`` -Enables stats that indicate which requested fields :ref:`obtained through -scrapy-poet integration ` come directly from -:ref:`zapi-extract`. +Enables stats that indicate which fields from yielded items come from +:ref:`zapi-extract` when using :ref:`scrapy-poet integration `. -If for any request no page object class is used to override -:ref:`zapi-extract` fields for a given item type, the following stat is -set: +If for any combination of item type and URL there is no registered page object +class, the following stat is set: .. code-block:: python @@ -28,8 +26,9 @@ set: .. note:: A literal ``(all fields)`` string is used as value, not a list with all fields. 
-If for any request a custom page object class is used to override some -:ref:`zapi-extract` fields, the following stat is set: +When a page object class is registered for a given combination of item type and +URL, and that page object class overrides some fields, the following stat is +set: .. code-block:: python @@ -40,6 +39,32 @@ If for any request a custom page object class is used to override some .. note:: :func:`zyte_common_items.fields.is_auto_field` is used to determine whether a field has been overridden or not. +Item URLs are read from the ``url`` field by default. Use +:setting:`ZYTE_API_AUTO_FIELD_URL_FIELDS` to configure a different field for +any given item type. + +.. setting:: ZYTE_API_AUTO_FIELD_URL_FIELDS + +ZYTE_API_AUTO_FIELD_URL_FIELDS +============================== + +Default: ``{}`` + +Dictionary where keys are item types or their import paths, and values are +strings with the names of the fields in those item types that indicate the +source URL of the item. + +For example: + +.. code-block:: python + :caption: settings.py + + ZYTE_API_AUTO_FIELD_URL_FIELDS = { + "my_project.items.CustomItem": "custom_url_field", + } + +If a URL field is not specified for an item, ``url`` is used by default. + .. setting:: ZYTE_API_AUTOMAP_PARAMS ZYTE_API_AUTOMAP_PARAMS diff --git a/docs/setup.rst b/docs/setup.rst index 58f9187f..21243cf1 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -120,6 +120,8 @@ spider to use Zyte API for all requests, set the following setting as well: ZYTE_API_TRANSPARENT_MODE = True +.. 
_scrapy-poet-manual-setup: + For :ref:`scrapy-poet integration `, add the following provider to the ``SCRAPY_POET_PROVIDERS`` setting: @@ -150,6 +152,17 @@ middleware to the :setting:`DOWNLOADER_MIDDLEWARES "scrapy_zyte_api.ScrapyZyteAPISessionDownloaderMiddleware": 667, } +For :setting:`ZYTE_API_AUTO_FIELD_STATS` support, first :ref:`enable +scrapy-poet integration `, and then add the following +item pipeline to the :setting:`ITEM_PIPELINES ` setting: + +.. code-block:: python + :caption: settings.py + + ITEM_PIPELINES = { + "scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline": 0, + } + .. _reactor-change: diff --git a/scrapy_zyte_api/_poet_item_pipelines.py b/scrapy_zyte_api/_poet_item_pipelines.py new file mode 100644 index 00000000..6195a688 --- /dev/null +++ b/scrapy_zyte_api/_poet_item_pipelines.py @@ -0,0 +1,90 @@ +from logging import getLogger +from typing import Any + +from itemadapter import ItemAdapter +from scrapy import Spider +from scrapy.crawler import Crawler +from scrapy.exceptions import NotConfigured +from scrapy.utils.misc import load_object +from scrapy_poet.downloadermiddlewares import InjectionMiddleware +from web_poet.fields import get_fields_dict +from web_poet.utils import get_fq_class_name +from zyte_common_items.fields import is_auto_field + +logger = getLogger(__name__) + + +class ScrapyZyteAPIPoetItemPipeline: + + @classmethod + def from_crawler(cls, crawler): + return cls(crawler) + + def __init__(self, crawler: Crawler): + if not crawler.settings.getbool("ZYTE_API_AUTO_FIELD_STATS", False): + raise NotConfigured + + raw_url_fields = crawler.settings.getdict("ZYTE_API_AUTO_FIELD_URL_FIELDS", {}) + self._url_fields = {load_object(k): v for k, v in raw_url_fields.items()} + self._seen = set() + self._crawler = crawler + self._stats = crawler.stats + self._cls_without_url = set() + + def open_spider(self, spider): + for component in self._crawler.engine.downloader.middleware.middlewares: + if isinstance(component, InjectionMiddleware): + 
self._registry = component.injector.registry + return + raise RuntimeError( + "Could not find " + "scrapy_poet.downloadermiddlewares.InjectionMiddleware among " + "downloader middlewares. scrapy-poet may be misconfigured." + ) + + def process_item(self, item: Any, spider: Spider): + cls = item.__class__ + + url_field = self._url_fields.get(cls, "url") + adapter = ItemAdapter(item) + url = adapter.get(url_field, None) + if not url: + if cls not in self._cls_without_url: + self._cls_without_url.add(cls) + logger.warning( + f"An item of type {cls} was missing a non-empty URL in " + f"its {url_field!r} field. An item URL is necessary to " + f"determine the page object that was used to generate " + f"that item, and hence print the auto field stats that " + f"you requested by enabling the ZYTE_API_AUTO_FIELD_STATS " + f"setting. If {url_field!r} is the wrong URL field for " + f"that item type, use the ZYTE_API_AUTO_FIELD_URL_FIELDS " + f"setting to set a different field." + ) + return + + page_cls = self._registry.page_cls_for_item(url, cls) + cls = page_cls or cls + + if cls in self._seen: + return + self._seen.add(cls) + + if not page_cls: + field_list = "(all fields)" + else: + cls = page_cls + auto_fields = set() + missing_fields = False + for field_name in get_fields_dict(cls): + if is_auto_field(cls, field_name): # type: ignore[arg-type] + auto_fields.add(field_name) + else: + missing_fields = True + if missing_fields: + field_list = " ".join(sorted(auto_fields)) + else: + field_list = "(all fields)" + + cls_fqn = get_fq_class_name(cls) + self._stats.set_value(f"scrapy-zyte-api/auto_fields/{cls_fqn}", field_list) diff --git a/scrapy_zyte_api/addon.py b/scrapy_zyte_api/addon.py index 7cf90257..ca3f1601 100644 --- a/scrapy_zyte_api/addon.py +++ b/scrapy_zyte_api/addon.py @@ -111,9 +111,11 @@ def update_settings(self, settings: BaseSettings) -> None: except ImportError: pass else: + from scrapy_zyte_api.poet import ScrapyZyteAPIPoetItemPipeline from 
scrapy_zyte_api.providers import ZyteApiProvider _setdefault(settings, "DOWNLOADER_MIDDLEWARES", InjectionMiddleware, 543) + _setdefault(settings, "ITEM_PIPELINES", ScrapyZyteAPIPoetItemPipeline, 0) _setdefault(settings, "SCRAPY_POET_PROVIDERS", ZyteApiProvider, 1100) if settings.getbool("ZYTE_API_SESSION_ENABLED", False): diff --git a/scrapy_zyte_api/poet.py b/scrapy_zyte_api/poet.py new file mode 100644 index 00000000..bca7fb15 --- /dev/null +++ b/scrapy_zyte_api/poet.py @@ -0,0 +1 @@ +from ._poet_item_pipelines import ScrapyZyteAPIPoetItemPipeline diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py index 4042775c..f73204c8 100644 --- a/scrapy_zyte_api/providers.py +++ b/scrapy_zyte_api/providers.py @@ -1,4 +1,4 @@ -from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Type, cast +from typing import Any, Callable, Dict, List, Optional, Sequence, Set from andi.typeutils import is_typing_annotated, strip_annotated from scrapy import Request @@ -13,8 +13,6 @@ HttpResponseHeaders, ) from web_poet.annotated import AnnotatedInstance -from web_poet.fields import get_fields_dict -from web_poet.utils import get_fq_class_name from zyte_common_items import ( Article, ArticleList, @@ -32,7 +30,6 @@ ProductList, ProductNavigation, ) -from zyte_common_items.fields import is_auto_field from scrapy_zyte_api import Actions, ExtractFrom, Geolocation, Screenshot from scrapy_zyte_api._annotations import _ActionResult @@ -84,38 +81,9 @@ class ZyteApiProvider(PageObjectInputProvider): Screenshot, } - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._should_track_auto_fields = None - self._tracked_auto_fields = set() - def is_provided(self, type_: Callable) -> bool: return super().is_provided(strip_annotated(type_)) - def _track_auto_fields(self, crawler: Crawler, request: Request, cls: Type): - if cls not in _ITEM_KEYWORDS: - return - if self._should_track_auto_fields is None: - self._should_track_auto_fields = 
crawler.settings.getbool( - "ZYTE_API_AUTO_FIELD_STATS", False - ) - if self._should_track_auto_fields is False: - return - cls = self.injector.registry.page_cls_for_item(request.url, cls) or cls - if cls in self._tracked_auto_fields: - return - self._tracked_auto_fields.add(cls) - if cls in _ITEM_KEYWORDS: - field_list = "(all fields)" - else: - auto_fields = set() - for field_name in get_fields_dict(cls): - if is_auto_field(cls, field_name): # type: ignore[arg-type] - auto_fields.add(field_name) - field_list = " ".join(sorted(auto_fields)) - cls_fqn = get_fq_class_name(cls) - crawler.stats.set_value(f"scrapy-zyte-api/auto_fields/{cls_fqn}", field_list) - async def __call__( # noqa: C901 self, to_provide: Set[Callable], request: Request, crawler: Crawler ) -> Sequence[Any]: @@ -125,7 +93,6 @@ async def __call__( # noqa: C901 http_response = None screenshot_requested = Screenshot in to_provide for cls in list(to_provide): - self._track_auto_fields(crawler, request, cast(type, cls)) item = self.injector.weak_cache.get(request, {}).get(cls) if item: results.append(item) diff --git a/setup.cfg b/setup.cfg index bcbd4ab7..ceacfb5d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,5 +4,7 @@ max-line-length = 88 max-complexity = 18 select = B,C,E,F,W,T4 per-file-ignores = + tests/test_auto_field_stats.py: E402 tests/test_providers.py: E402 scrapy_zyte_api/__init__.py: F401 + scrapy_zyte_api/poet.py: F401 diff --git a/setup.py b/setup.py index ac2de981..95fba768 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,8 @@ def get_version(): "andi>=0.6.0", "scrapy-poet>=0.22.3", "web-poet>=0.17.0", - "zyte-common-items>=0.20.0", + # "zyte-common-items>=0.20.0", + "zyte-common-items @ git+https://github.com/Gallaecio/zyte-common-items.git@auto-fields", ] }, classifiers=[ diff --git a/tests/__init__.py b/tests/__init__.py index 55a5b470..0034705d 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -42,6 +42,9 @@ SETTINGS["SCRAPY_POET_PROVIDERS"] = { 
"scrapy_zyte_api.providers.ZyteApiProvider": 1100 } + SETTINGS["ITEM_PIPELINES"] = { + "scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline": 0 + } SETTINGS_ADDON: SETTINGS_T = { "ADDONS": { Addon: 500, @@ -108,6 +111,7 @@ def serialize_settings(settings): del result[setting] for setting in ( "DOWNLOADER_MIDDLEWARES", + "ITEM_PIPELINES", "SCRAPY_POET_PROVIDERS", "SPIDER_MIDDLEWARES", ): diff --git a/tests/test_addon.py b/tests/test_addon.py index a69d7030..605a06a8 100644 --- a/tests/test_addon.py +++ b/tests/test_addon.py @@ -24,8 +24,10 @@ POET = False InjectionMiddleware = None ZyteApiProvider: Optional[Type] = None + ScrapyZyteAPIPoetItemPipeline: Optional[Type] = None else: POET = True + from scrapy_zyte_api.poet import ScrapyZyteAPIPoetItemPipeline from scrapy_zyte_api.providers import ZyteApiProvider _crawler = get_crawler() @@ -120,6 +122,7 @@ def _test_setting_changes(initial_settings, expected_settings): # Test separately settings that copy_to_dict messes up. for setting in ( "DOWNLOADER_MIDDLEWARES", + "ITEM_PIPELINES", "SCRAPY_POET_PROVIDERS", "SPIDER_MIDDLEWARES", ): @@ -145,6 +148,7 @@ def _test_setting_changes(initial_settings, expected_settings): "http": "scrapy_zyte_api.handler.ScrapyZyteAPIHTTPDownloadHandler", "https": "scrapy_zyte_api.handler.ScrapyZyteAPIHTTPSDownloadHandler", }, + "ITEM_PIPELINES": {}, "REQUEST_FINGERPRINTER_CLASS": "scrapy_zyte_api.ScrapyZyteAPIRequestFingerprinter", "SPIDER_MIDDLEWARES": { ScrapyZyteAPISpiderMiddleware: 100, @@ -230,6 +234,9 @@ def test_no_poet_setting_changes(initial_settings, expected_settings): ScrapyZyteAPISessionDownloaderMiddleware: 667, InjectionMiddleware: 543, }, + "ITEM_PIPELINES": { + ScrapyZyteAPIPoetItemPipeline: 0, + }, "SCRAPY_POET_PROVIDERS": { ZyteApiProvider: 1100, }, diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index b9ca96fc..c02101f3 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -650,6 +650,7 @@ async def 
test_default_params_immutability(setting_key, meta_key, setting, meta) async def _test_automap( settings, request_kwargs, meta, expected, warnings, caplog, cookie_jar=None ): + caplog.clear() request = Request(url="https://example.com", **request_kwargs) request.meta["zyte_api_automap"] = meta settings = {**settings, "ZYTE_API_TRANSPARENT_MODE": True} @@ -694,7 +695,7 @@ async def _test_automap( for warning in warnings: assert warning in caplog.text else: - assert not caplog.records + assert not caplog.records, caplog.records[0].args @pytest.mark.parametrize( diff --git a/tests/test_auto_field_stats.py b/tests/test_auto_field_stats.py new file mode 100644 index 00000000..184c4bed --- /dev/null +++ b/tests/test_auto_field_stats.py @@ -0,0 +1,782 @@ +from collections import defaultdict +from typing import Optional + +import pytest + +pytest.importorskip("scrapy_poet") + +import attrs +from pytest_twisted import ensureDeferred +from scrapy import Request, Spider +from scrapy_poet import DummyResponse +from scrapy_poet.utils.testing import HtmlResource, crawl_single_item +from web_poet import ItemPage, default_registry, field, handle_urls +from zyte_common_items import AutoProductPage, BaseProductPage, Item, Product +from zyte_common_items.fields import auto_field + +from scrapy_zyte_api.providers import ZyteApiProvider + +from .test_providers import create_scrapy_settings + + +@ensureDeferred +async def test_not_enabled(mockserver): + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + def parse(self, response: DummyResponse, product: Product): + yield product + + settings = create_scrapy_settings() + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ITEM_PIPELINES"]["scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline"] = 0 + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + _, _, crawler = await crawl_single_item(TestSpider, HtmlResource, settings) + + 
auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == {} + + +@ensureDeferred +async def test_no_override(mockserver): + """When requesting an item directly from Zyte API, without an override to + change fields, stats reflect the entire list of item fields.""" + + from scrapy.statscollectors import MemoryStatsCollector + + duplicate_stat_calls: defaultdict[str, int] = defaultdict(int) + + class OnlyOnceStatsCollector(MemoryStatsCollector): + + def track_duplicate_stat_calls(self, key): + if key.startswith("scrapy-zyte-api/auto_fields/") and key in self._stats: + duplicate_stat_calls[key] += 1 + + def set_value(self, key, value, spider=None): + self.track_duplicate_stat_calls(key) + super().set_value(key, value, spider) + + def inc_value(self, key, count=1, start=1, spider=None): + self.track_duplicate_stat_calls(key) + super().inc_value(key, count, start, spider) + + def max_value(self, key, value, spider=None): + self.track_duplicate_stat_calls(key) + super().max_value(key, value, spider) + + def min_value(self, key, value, spider=None): + self.track_duplicate_stat_calls(key) + super().min_value(key, value, spider) + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + for url in ( + mockserver.urljoin("/products/a"), + mockserver.urljoin("/products/b"), + ): + yield Request(url, callback=self.parse) + + def parse(self, response: DummyResponse, product: Product): + yield product + + settings = create_scrapy_settings() + settings["STATS_CLASS"] = OnlyOnceStatsCollector + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ITEM_PIPELINES"]["scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline"] = 0 + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + _, _, crawler = await crawl_single_item(TestSpider, HtmlResource, settings) + + auto_field_stats = { + k: v + 
for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == { + "scrapy-zyte-api/auto_fields/zyte_common_items.items.product.Product": ( + "(all fields)" + ), + } + assert all(value == 0 for value in duplicate_stat_calls.values()) + + +@ensureDeferred +async def test_partial_override(mockserver): + """When requesting an item and having an Auto…Page subclass to change + fields, stats reflect the list of item fields not defined in the + subclass. Defined field methods are not listed, even if they return the + original item field, directly or as a fallback.""" + + class MyProductPage(AutoProductPage): + + @field + def brand(self): + return "foo" + + @field + def name(self): + return self.product.name + + handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + def parse(self, response: DummyResponse, product: Product): + yield product + + settings = create_scrapy_settings() + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ITEM_PIPELINES"]["scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline"] = 0 + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + _, _, crawler = await crawl_single_item( + TestSpider, HtmlResource, settings, port=mockserver.port + ) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == { + "scrapy-zyte-api/auto_fields/tests.test_auto_field_stats.test_partial_override..MyProductPage": ( + "additionalProperties aggregateRating availability breadcrumbs " + "canonicalUrl color currency currencyRaw description descriptionHtml " + "features gtin images mainImage metadata mpn price productId " + "regularPrice size sku style url variants" + ), + } + + # Reset rules 
+ default_registry.__init__() # type: ignore[misc] + + +@ensureDeferred +async def test_full_override(mockserver): + """When requesting an item and having an Auto…Page subclass to change + all fields, stats reflect the list of non-overridden item fields as an empty + string.""" + + # Copy-paste of fields from the AutoProductPage implementation, with type + # hints removed. + class MyProductPage(AutoProductPage): + + @field + def additionalProperties(self): + return self.product.additionalProperties + + @field + def aggregateRating(self): + return self.product.aggregateRating + + @field + def availability(self): + return self.product.availability + + @field + def brand(self): + return self.product.brand + + @field + def breadcrumbs(self): + return self.product.breadcrumbs + + @field + def canonicalUrl(self): + return self.product.canonicalUrl + + @field + def color(self): + return self.product.color + + @field + def currency(self): + return self.product.currency + + @field + def currencyRaw(self): + return self.product.currencyRaw + + @field + def description(self): + return self.product.description + + @field + def descriptionHtml(self): + return self.product.descriptionHtml + + @field + def features(self): + return self.product.features + + @field + def gtin(self): + return self.product.gtin + + @field + def images(self): + return self.product.images + + @field + def mainImage(self): + return self.product.mainImage + + @field + def metadata(self): + return self.product.metadata + + @field + def mpn(self): + return self.product.mpn + + @field + def name(self): + return self.product.name + + @field + def price(self): + return self.product.price + + @field + def productId(self): + return self.product.productId + + @field + def regularPrice(self): + return self.product.regularPrice + + @field + def size(self): + return self.product.size + + @field + def sku(self): + return self.product.sku + + @field + def style(self): + return self.product.style + + @field + def 
url(self): + return self.product.url + + @field + def variants(self): + return self.product.variants + + handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + def parse(self, response: DummyResponse, product: Product): + yield product + + settings = create_scrapy_settings() + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ITEM_PIPELINES"]["scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline"] = 0 + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + _, _, crawler = await crawl_single_item( + TestSpider, HtmlResource, settings, port=mockserver.port + ) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == { + "scrapy-zyte-api/auto_fields/tests.test_auto_field_stats.test_full_override..MyProductPage": "", + } + + # Reset rules + default_registry.__init__() # type: ignore[misc] + + +@ensureDeferred +async def test_callback_override(mockserver): + """Fields overridden in callbacks, instead of using a page object, are not + taken into account.""" + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + def parse(self, response: DummyResponse, product: Product): + product.name = "foo" + yield product + + settings = create_scrapy_settings() + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ITEM_PIPELINES"]["scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline"] = 0 + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + _, _, crawler = await crawl_single_item( + TestSpider, HtmlResource, settings, port=mockserver.port + ) + + auto_field_stats = { + k: v + for k, v in 
crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == { + "scrapy-zyte-api/auto_fields/zyte_common_items.items.product.Product": ( + "(all fields)" + ), + } + + # Reset rules + default_registry.__init__() # type: ignore[misc] + + +@ensureDeferred +async def test_item_page_override(mockserver): + """The stat accounts for the configured page for a given item, so if you + request that page directly, things work the same as if you request the item + itself.""" + + class MyProductPage(AutoProductPage): + + @field + def brand(self): + return "foo" + + @field + def name(self): + return self.product.name + + handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + async def parse(self, response: DummyResponse, page: MyProductPage): + yield await page.to_item() + + settings = create_scrapy_settings() + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ITEM_PIPELINES"]["scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline"] = 0 + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + _, _, crawler = await crawl_single_item( + TestSpider, HtmlResource, settings, port=mockserver.port + ) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == { + "scrapy-zyte-api/auto_fields/tests.test_auto_field_stats.test_item_page_override..MyProductPage": ( + "additionalProperties aggregateRating availability breadcrumbs " + "canonicalUrl color currency currencyRaw description descriptionHtml " + "features gtin images mainImage metadata mpn price productId " + "regularPrice size sku style url variants" + ), + } + + # Reset rules + default_registry.__init__() # type: ignore[misc] + + +@ensureDeferred +async def 
test_alt_page_override(mockserver): + """The stat does not account for alternative pages, so if you request a + page that provides an item, the page that counts for stats is the + configured page for that item, not the actual page requested.""" + + class MyProductPage(AutoProductPage): + + @field + def brand(self): + return "foo" + + @field + def name(self): + return self.product.name + + handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) + + class AltProductPage(AutoProductPage): + + @field + def sku(self): + return "foo" + + @field + def currencyRaw(self): + return self.product.currencyRaw + + handle_urls(f"{mockserver.host}:{mockserver.port}", priority=0)(AltProductPage) + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + async def parse(self, response: DummyResponse, page: AltProductPage): + yield await page.to_item() + + settings = create_scrapy_settings() + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ITEM_PIPELINES"]["scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline"] = 0 + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + _, _, crawler = await crawl_single_item( + TestSpider, HtmlResource, settings, port=mockserver.port + ) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == { + "scrapy-zyte-api/auto_fields/tests.test_auto_field_stats.test_alt_page_override..MyProductPage": ( + "additionalProperties aggregateRating availability breadcrumbs " + "canonicalUrl color currency currencyRaw description descriptionHtml " + "features gtin images mainImage metadata mpn price productId " + "regularPrice size sku style url variants" + ), + } + + # Reset rules + default_registry.__init__() # type: ignore[misc] + + +@ensureDeferred +async def test_non_auto_override(mockserver): + 
"""If instead of using an Auto…Page class you use a custom class, all + fields are assumed to be overridden.""" + + @attrs.define + class MyProductPage(BaseProductPage): + product: Product + + @field + def additionalProperties(self): + return self.product.additionalProperties + + handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + def parse(self, response: DummyResponse, product: Product): + yield product + + settings = create_scrapy_settings() + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ITEM_PIPELINES"]["scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline"] = 0 + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + _, _, crawler = await crawl_single_item( + TestSpider, HtmlResource, settings, port=mockserver.port + ) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == { + "scrapy-zyte-api/auto_fields/tests.test_auto_field_stats.test_non_auto_override..MyProductPage": "", + } + + # Reset rules + default_registry.__init__() # type: ignore[misc] + + +@ensureDeferred +async def test_auto_field_decorator(mockserver): + """Using @auto_field forces a field to not be considered overridden.""" + + @attrs.define + class MyProductPage(BaseProductPage): + product: Product + + @auto_field + def additionalProperties(self): + return self.product.additionalProperties + + handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + def parse(self, response: DummyResponse, product: Product): + yield product + + settings = create_scrapy_settings() + settings["SCRAPY_POET_PROVIDERS"] = 
{ZyteApiProvider: 0} + settings["ITEM_PIPELINES"]["scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline"] = 0 + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + _, _, crawler = await crawl_single_item( + TestSpider, HtmlResource, settings, port=mockserver.port + ) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == { + "scrapy-zyte-api/auto_fields/tests.test_auto_field_stats.test_auto_field_decorator..MyProductPage": "additionalProperties", + } + + # Reset rules + default_registry.__init__() # type: ignore[misc] + + +@ensureDeferred +async def test_auto_field_meta(mockserver): + """Using @field(meta={"auto_field": True}) has the same effect as using + @auto_field.""" + + @attrs.define + class MyProductPage(BaseProductPage): + product: Product + + @field(meta={"auto_field": True}) + def additionalProperties(self): + return self.product.additionalProperties + + handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + def parse(self, response: DummyResponse, product: Product): + yield product + + settings = create_scrapy_settings() + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ITEM_PIPELINES"]["scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline"] = 0 + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + _, _, crawler = await crawl_single_item( + TestSpider, HtmlResource, settings, port=mockserver.port + ) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == { + "scrapy-zyte-api/auto_fields/tests.test_auto_field_stats.test_auto_field_meta..MyProductPage": "additionalProperties", + } 
+ + # Reset rules + default_registry.__init__() # type: ignore[misc] + + +@ensureDeferred +async def test_custom_item(mockserver): + server = f"{mockserver.host}:{mockserver.port}" + url = f"http://{server}/" + + @attrs.define + class CustomProduct(Item): + url: str + product_title: Optional[str] = None + + @attrs.define + class MyProductPage(ItemPage[CustomProduct]): + product: Product + + @field + def url(self) -> str: + return url + + @field(meta={"auto_field": True}) + def product_title(self) -> Optional[str]: + return self.product.name + + handle_urls(server)(MyProductPage) + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + def parse(self, response: DummyResponse, product: CustomProduct): + yield product + + settings = create_scrapy_settings() + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ITEM_PIPELINES"]["scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline"] = 0 + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + _, _, crawler = await crawl_single_item( + TestSpider, HtmlResource, settings, port=mockserver.port + ) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == { + "scrapy-zyte-api/auto_fields/tests.test_auto_field_stats.test_custom_item..MyProductPage": "product_title", + } + + # Reset rules + default_registry.__init__() # type: ignore[misc] + + +@ensureDeferred +async def test_custom_item_missing_url(mockserver, caplog): + @attrs.define + class CustomProduct(Item): + weight: Optional[float] = None + product_title: Optional[str] = None + + @attrs.define + class MyProductPage(ItemPage[CustomProduct]): + product: Product + + @field + def weight(self) -> Optional[float]: + return None + + @field(meta={"auto_field": True}) + def product_title(self) -> Optional[str]: + return 
self.product.name + + handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + def parse(self, response: DummyResponse, product: CustomProduct): + yield product + + settings = create_scrapy_settings() + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ITEM_PIPELINES"]["scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline"] = 0 + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + caplog.clear() + _, _, crawler = await crawl_single_item( + TestSpider, HtmlResource, settings, port=mockserver.port + ) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == {} + + assert len(caplog.records) == 1 + assert "was missing a non-empty URL" in caplog.records[0].msg + + # Reset rules + default_registry.__init__() # type: ignore[misc] + + +@ensureDeferred +async def test_custom_item_custom_url_field(mockserver): + @attrs.define + class CustomProduct(Item): + product_url: str + product_title: Optional[str] = None + + @attrs.define + class MyProductPage(ItemPage[CustomProduct]): + product: Product + + @field(meta={"auto_field": True}) + def product_url(self) -> str: + return self.product.url + + @field(meta={"auto_field": True}) + def product_title(self) -> Optional[str]: + return self.product.name + + handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + yield Request(self.url, callback=self.parse) + + def parse(self, response: DummyResponse, product: CustomProduct): + yield product + + settings = create_scrapy_settings() + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + 
settings["ITEM_PIPELINES"]["scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline"] = 0 + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_AUTO_FIELD_URL_FIELDS"] = {CustomProduct: "product_url"} + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + _, _, crawler = await crawl_single_item( + TestSpider, HtmlResource, settings, port=mockserver.port + ) + + auto_field_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/auto_fields") + } + assert auto_field_stats == { + "scrapy-zyte-api/auto_fields/tests.test_auto_field_stats.test_custom_item_custom_url_field..MyProductPage": "(all fields)", + } + + # Reset rules + default_registry.__init__() # type: ignore[misc] diff --git a/tests/test_providers.py b/tests/test_providers.py index 74dd17cc..452d54a4 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -1,5 +1,4 @@ import sys -from collections import defaultdict import pytest @@ -19,13 +18,11 @@ BrowserResponse, HttpResponse, ItemPage, - default_registry, field, handle_urls, ) from web_poet.pages import get_item_cls -from zyte_common_items import AutoProductPage, BasePage, BaseProductPage, Product -from zyte_common_items.fields import auto_field +from zyte_common_items import BasePage, Product from scrapy_zyte_api import Actions, ExtractFrom, Geolocation, Screenshot, actions from scrapy_zyte_api.handler import ScrapyZyteAPIDownloadHandler @@ -40,7 +37,9 @@ def create_scrapy_settings(): settings = _create_scrapy_settings() for setting, value in SETTINGS.items(): - if setting.endswith("_MIDDLEWARES") and settings[setting]: + if ( + setting.endswith("_MIDDLEWARES") or setting == "ITEM_PIPELINES" + ) and settings[setting]: settings[setting].update(value) else: settings[setting] = value @@ -1023,588 +1022,3 @@ def parse_(self, response: DummyResponse, page: ActionProductPage): # type: ign def test_auto_pages_set(): assert set(_ITEM_KEYWORDS) == {get_item_cls(cls) for cls in _AUTO_PAGES} # type: 
ignore[call-overload] - - -@ensureDeferred -async def test_auto_field_stats_not_enabled(mockserver): - class TestSpider(Spider): - name = "test_spider" - url: str - - def start_requests(self): - yield Request(self.url, callback=self.parse) - - def parse(self, response: DummyResponse, product: Product): - pass - - settings = create_scrapy_settings() - settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} - settings["ZYTE_API_URL"] = mockserver.urljoin("/") - _, _, crawler = await crawl_single_item(TestSpider, HtmlResource, settings) - - auto_field_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/auto_fields") - } - assert auto_field_stats == {} - - -@ensureDeferred -async def test_auto_field_stats_no_override(mockserver): - """When requesting an item directly from Zyte API, without an override to - change fields, stats reflect the entire list of item fields.""" - - from scrapy.statscollectors import MemoryStatsCollector - - duplicate_stat_calls: defaultdict[str, int] = defaultdict(int) - - class OnlyOnceStatsCollector(MemoryStatsCollector): - - def track_duplicate_stat_calls(self, key): - if key.startswith("scrapy-zyte-api/auto_fields/") and key in self._stats: - duplicate_stat_calls[key] += 1 - - def set_value(self, key, value, spider=None): - self.track_duplicate_stat_calls(key) - super().set_value(key, value, spider) - - def inc_value(self, key, count=1, start=1, spider=None): - self.track_duplicate_stat_calls(key) - super().inc_value(key, count, start, spider) - - def max_value(self, key, value, spider=None): - self.track_duplicate_stat_calls(key) - super().max_value(key, value, spider) - - def min_value(self, key, value, spider=None): - self.track_duplicate_stat_calls(key) - super().min_value(key, value, spider) - - class TestSpider(Spider): - name = "test_spider" - url: str - - def start_requests(self): - for url in ("data:,a", "data:,b"): - yield Request(url, callback=self.parse) - - def parse(self, 
response: DummyResponse, product: Product): - pass - - settings = create_scrapy_settings() - settings["STATS_CLASS"] = OnlyOnceStatsCollector - settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} - settings["ZYTE_API_AUTO_FIELD_STATS"] = True - settings["ZYTE_API_URL"] = mockserver.urljoin("/") - _, _, crawler = await crawl_single_item(TestSpider, HtmlResource, settings) - - auto_field_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/auto_fields") - } - assert auto_field_stats == { - "scrapy-zyte-api/auto_fields/zyte_common_items.items.product.Product": ( - "(all fields)" - ), - } - assert all(value == 0 for value in duplicate_stat_calls.values()) - - -@ensureDeferred -async def test_auto_field_stats_partial_override(mockserver): - """When requesting an item and having an Auto…Page subclass to change - fields, stats reflect the list of item fields not defined in the - subclass. Defined field methods are not listed, even if they return the - original item field, directly or as a fallback.""" - - class MyProductPage(AutoProductPage): - - @field - def brand(self): - return "foo" - - @field - def name(self): - return self.product.name - - handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) - - class TestSpider(Spider): - name = "test_spider" - url: str - - def start_requests(self): - yield Request(self.url, callback=self.parse) - - def parse(self, response: DummyResponse, product: Product): - pass - - settings = create_scrapy_settings() - settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} - settings["ZYTE_API_AUTO_FIELD_STATS"] = True - settings["ZYTE_API_URL"] = mockserver.urljoin("/") - _, _, crawler = await crawl_single_item( - TestSpider, HtmlResource, settings, port=mockserver.port - ) - - auto_field_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/auto_fields") - } - assert auto_field_stats == { - 
"scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stats_partial_override..MyProductPage": ( - "additionalProperties aggregateRating availability breadcrumbs " - "canonicalUrl color currency currencyRaw description descriptionHtml " - "features gtin images mainImage metadata mpn price productId " - "regularPrice size sku style url variants" - ), - } - - # Reset rules - default_registry.__init__() # type: ignore[misc] - - -@ensureDeferred -async def test_auto_field_stats_full_override(mockserver): - """When requesting an item and having an Auto…Page subclass to change - all fields, stats reflect the list of non-overriden item fields as an empty - string.""" - - # Copy-paste of fields from the AutoProductPage implementation, with type - # hints removed. - class MyProductPage(AutoProductPage): - - @field - def additionalProperties(self): - return self.product.additionalProperties - - @field - def aggregateRating(self): - return self.product.aggregateRating - - @field - def availability(self): - return self.product.availability - - @field - def brand(self): - return self.product.brand - - @field - def breadcrumbs(self): - return self.product.breadcrumbs - - @field - def canonicalUrl(self): - return self.product.canonicalUrl - - @field - def color(self): - return self.product.color - - @field - def currency(self): - return self.product.currency - - @field - def currencyRaw(self): - return self.product.currencyRaw - - @field - def description(self): - return self.product.description - - @field - def descriptionHtml(self): - return self.product.descriptionHtml - - @field - def features(self): - return self.product.features - - @field - def gtin(self): - return self.product.gtin - - @field - def images(self): - return self.product.images - - @field - def mainImage(self): - return self.product.mainImage - - @field - def metadata(self): - return self.product.metadata - - @field - def mpn(self): - return self.product.mpn - - @field - def name(self): - return 
self.product.name - - @field - def price(self): - return self.product.price - - @field - def productId(self): - return self.product.productId - - @field - def regularPrice(self): - return self.product.regularPrice - - @field - def size(self): - return self.product.size - - @field - def sku(self): - return self.product.sku - - @field - def style(self): - return self.product.style - - @field - def url(self): - return self.product.url - - @field - def variants(self): - return self.product.variants - - handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) - - class TestSpider(Spider): - name = "test_spider" - url: str - - def start_requests(self): - yield Request(self.url, callback=self.parse) - - def parse(self, response: DummyResponse, product: Product): - pass - - settings = create_scrapy_settings() - settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} - settings["ZYTE_API_AUTO_FIELD_STATS"] = True - settings["ZYTE_API_URL"] = mockserver.urljoin("/") - _, _, crawler = await crawl_single_item( - TestSpider, HtmlResource, settings, port=mockserver.port - ) - - auto_field_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/auto_fields") - } - assert auto_field_stats == { - "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stats_full_override..MyProductPage": "", - } - - # Reset rules - default_registry.__init__() # type: ignore[misc] - - -@ensureDeferred -async def test_auto_field_stats_callback_override(mockserver): - """Fields overridden in callbacks, instead of using a page object, are not - taken into account.""" - - class TestSpider(Spider): - name = "test_spider" - url: str - - def start_requests(self): - yield Request(self.url, callback=self.parse) - - def parse(self, response: DummyResponse, product: Product): - product.name = "foo" - yield product - - settings = create_scrapy_settings() - settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} - 
settings["ZYTE_API_AUTO_FIELD_STATS"] = True - settings["ZYTE_API_URL"] = mockserver.urljoin("/") - _, _, crawler = await crawl_single_item( - TestSpider, HtmlResource, settings, port=mockserver.port - ) - - auto_field_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/auto_fields") - } - assert auto_field_stats == { - "scrapy-zyte-api/auto_fields/zyte_common_items.items.product.Product": ( - "(all fields)" - ), - } - - # Reset rules - default_registry.__init__() # type: ignore[misc] - - -@ensureDeferred -async def test_auto_field_stats_item_page_override(mockserver): - """The stat accounts for the configured page for a given item, so if you - request that page directly, things work the same as if you request the item - itself.""" - - class MyProductPage(AutoProductPage): - - @field - def brand(self): - return "foo" - - @field - def name(self): - return self.product.name - - handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) - - class TestSpider(Spider): - name = "test_spider" - url: str - - def start_requests(self): - yield Request(self.url, callback=self.parse) - - def parse(self, response: DummyResponse, page: MyProductPage): - pass - - settings = create_scrapy_settings() - settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} - settings["ZYTE_API_AUTO_FIELD_STATS"] = True - settings["ZYTE_API_URL"] = mockserver.urljoin("/") - _, _, crawler = await crawl_single_item( - TestSpider, HtmlResource, settings, port=mockserver.port - ) - - auto_field_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/auto_fields") - } - assert auto_field_stats == { - "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stats_item_page_override..MyProductPage": ( - "additionalProperties aggregateRating availability breadcrumbs " - "canonicalUrl color currency currencyRaw description descriptionHtml " - "features gtin images mainImage metadata mpn price 
productId " - "regularPrice size sku style url variants" - ), - } - - # Reset rules - default_registry.__init__() # type: ignore[misc] - - -@ensureDeferred -async def test_auto_field_stats_alt_page_override(mockserver): - """The stat does not account for alternatives pages, so if you request a - page that provides an item, the page that counts for stats is the - configured page for that item, not the actual page requested.""" - - class MyProductPage(AutoProductPage): - - @field - def brand(self): - return "foo" - - @field - def name(self): - return self.product.name - - handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) - - class AltProductPage(AutoProductPage): - - @field - def sku(self): - return "foo" - - @field - def currencyRaw(self): - return self.product.currencyRaw - - handle_urls(f"{mockserver.host}:{mockserver.port}", priority=0)(AltProductPage) - - class TestSpider(Spider): - name = "test_spider" - url: str - - def start_requests(self): - yield Request(self.url, callback=self.parse) - - def parse(self, response: DummyResponse, page: AltProductPage): - pass - - settings = create_scrapy_settings() - settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} - settings["ZYTE_API_AUTO_FIELD_STATS"] = True - settings["ZYTE_API_URL"] = mockserver.urljoin("/") - _, _, crawler = await crawl_single_item( - TestSpider, HtmlResource, settings, port=mockserver.port - ) - - auto_field_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/auto_fields") - } - assert auto_field_stats == { - "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stats_alt_page_override..MyProductPage": ( - "additionalProperties aggregateRating availability breadcrumbs " - "canonicalUrl color currency currencyRaw description descriptionHtml " - "features gtin images mainImage metadata mpn price productId " - "regularPrice size sku style url variants" - ), - } - - # Reset rules - default_registry.__init__() # type: 
ignore[misc] - - -@ensureDeferred -async def test_auto_field_stats_non_auto_override(mockserver): - """If instead of using an Auto…Page class you use a custom class, all - fields are assumed to be overridden.""" - - @attrs.define - class MyProductPage(BaseProductPage): - product: Product - - @field - def additionalProperties(self): - return self.product.additionalProperties - - handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) - - class TestSpider(Spider): - name = "test_spider" - url: str - - def start_requests(self): - yield Request(self.url, callback=self.parse) - - def parse(self, response: DummyResponse, product: Product): - pass - - settings = create_scrapy_settings() - settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} - settings["ZYTE_API_AUTO_FIELD_STATS"] = True - settings["ZYTE_API_URL"] = mockserver.urljoin("/") - _, _, crawler = await crawl_single_item( - TestSpider, HtmlResource, settings, port=mockserver.port - ) - - auto_field_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/auto_fields") - } - assert auto_field_stats == { - "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stats_non_auto_override..MyProductPage": "", - } - - # Reset rules - default_registry.__init__() # type: ignore[misc] - - -@ensureDeferred -async def test_auto_field_stats_auto_field_decorator(mockserver): - """Using @auto_field forces a field to not be considered overridden.""" - - @attrs.define - class MyProductPage(BaseProductPage): - product: Product - - @auto_field - def additionalProperties(self): - return self.product.additionalProperties - - handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) - - class TestSpider(Spider): - name = "test_spider" - url: str - - def start_requests(self): - yield Request(self.url, callback=self.parse) - - def parse(self, response: DummyResponse, product: Product): - pass - - settings = create_scrapy_settings() - 
settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} - settings["ZYTE_API_AUTO_FIELD_STATS"] = True - settings["ZYTE_API_URL"] = mockserver.urljoin("/") - _, _, crawler = await crawl_single_item( - TestSpider, HtmlResource, settings, port=mockserver.port - ) - - auto_field_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/auto_fields") - } - assert auto_field_stats == { - "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stats_auto_field_decorator..MyProductPage": "additionalProperties", - } - - # Reset rules - default_registry.__init__() # type: ignore[misc] - - -@ensureDeferred -async def test_auto_field_stats_auto_field_meta(mockserver): - """Using @field(meta={"auto_field": True}) has the same effect as using - @auto_field.""" - - @attrs.define - class MyProductPage(BaseProductPage): - product: Product - - @field(meta={"auto_field": True}) - def additionalProperties(self): - return self.product.additionalProperties - - handle_urls(f"{mockserver.host}:{mockserver.port}")(MyProductPage) - - class TestSpider(Spider): - name = "test_spider" - url: str - - def start_requests(self): - yield Request(self.url, callback=self.parse) - - def parse(self, response: DummyResponse, product: Product): - pass - - settings = create_scrapy_settings() - settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} - settings["ZYTE_API_AUTO_FIELD_STATS"] = True - settings["ZYTE_API_URL"] = mockserver.urljoin("/") - _, _, crawler = await crawl_single_item( - TestSpider, HtmlResource, settings, port=mockserver.port - ) - - auto_field_stats = { - k: v - for k, v in crawler.stats.get_stats().items() - if k.startswith("scrapy-zyte-api/auto_fields") - } - assert auto_field_stats == { - "scrapy-zyte-api/auto_fields/tests.test_providers.test_auto_field_stats_auto_field_meta..MyProductPage": "additionalProperties", - } - - # Reset rules - default_registry.__init__() # type: ignore[misc] From 
e1eaa042e467031290a38515d8270dd073da47e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 26 Aug 2024 23:01:43 +0200 Subject: [PATCH 2/9] Add types --- scrapy_zyte_api/_poet_item_pipelines.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scrapy_zyte_api/_poet_item_pipelines.py b/scrapy_zyte_api/_poet_item_pipelines.py index 6195a688..695dc163 100644 --- a/scrapy_zyte_api/_poet_item_pipelines.py +++ b/scrapy_zyte_api/_poet_item_pipelines.py @@ -1,5 +1,5 @@ from logging import getLogger -from typing import Any +from typing import Any, Set, Type from itemadapter import ItemAdapter from scrapy import Spider @@ -26,10 +26,10 @@ def __init__(self, crawler: Crawler): raw_url_fields = crawler.settings.getdict("ZYTE_API_AUTO_FIELD_URL_FIELDS", {}) self._url_fields = {load_object(k): v for k, v in raw_url_fields.items()} - self._seen = set() + self._seen: Set[Type] = set() self._crawler = crawler self._stats = crawler.stats - self._cls_without_url = set() + self._cls_without_url: Set[Type] = set() def open_spider(self, spider): for component in self._crawler.engine.downloader.middleware.middlewares: From 193501275480fbd072a6c5978292f1899971a169 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 26 Aug 2024 23:04:03 +0200 Subject: [PATCH 3/9] Remove unnecessary line --- scrapy_zyte_api/_poet_item_pipelines.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scrapy_zyte_api/_poet_item_pipelines.py b/scrapy_zyte_api/_poet_item_pipelines.py index 695dc163..ad062b05 100644 --- a/scrapy_zyte_api/_poet_item_pipelines.py +++ b/scrapy_zyte_api/_poet_item_pipelines.py @@ -73,7 +73,6 @@ def process_item(self, item: Any, spider: Spider): if not page_cls: field_list = "(all fields)" else: - cls = page_cls auto_fields = set() missing_fields = False for field_name in get_fields_dict(cls): From 619f87c457f43d7117c27291760bc95564b8ee8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 27 Aug 
2024 10:59:00 +0200 Subject: [PATCH 4/9] Require the latest zyte-common-items in provider-pinned --- tox.ini | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 086d7971..0b9f55aa 100644 --- a/tox.ini +++ b/tox.ini @@ -90,7 +90,8 @@ deps = andi==0.6.0 scrapy-poet==0.22.3 web-poet==0.17.0 - zyte-common-items==0.20.0 + # zyte-common-items==0.20.0 + zyte-common-items @ git+https://github.com/Gallaecio/zyte-common-items.git@auto-fields [testenv:pinned-extra] basepython=python3.8 From afe00e8b941b29c680f3e4ed635ade351fa3470d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 27 Aug 2024 12:26:14 +0200 Subject: [PATCH 5/9] Require zyte-common-items 0.21.0 --- setup.py | 3 +-- tox.ini | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 95fba768..d35a80e1 100644 --- a/setup.py +++ b/setup.py @@ -33,8 +33,7 @@ def get_version(): "andi>=0.6.0", "scrapy-poet>=0.22.3", "web-poet>=0.17.0", - # "zyte-common-items>=0.20.0", - "zyte-common-items @ git+https://github.com/Gallaecio/zyte-common-items.git@auto-fields", + "zyte-common-items>=0.21.0", ] }, classifiers=[ diff --git a/tox.ini b/tox.ini index 0b9f55aa..254cce08 100644 --- a/tox.ini +++ b/tox.ini @@ -90,8 +90,7 @@ deps = andi==0.6.0 scrapy-poet==0.22.3 web-poet==0.17.0 - # zyte-common-items==0.20.0 - zyte-common-items @ git+https://github.com/Gallaecio/zyte-common-items.git@auto-fields + zyte-common-items==0.21.0 [testenv:pinned-extra] basepython=python3.8 From 52c87173b0587dd5c3ae997738a3cb69e4dc0539 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 27 Aug 2024 12:34:36 +0200 Subject: [PATCH 6/9] Test missing InjectionMiddleware --- scrapy_zyte_api/_poet_item_pipelines.py | 7 +++---- tests/test_auto_field_stats.py | 27 +++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/scrapy_zyte_api/_poet_item_pipelines.py b/scrapy_zyte_api/_poet_item_pipelines.py index 
ad062b05..445582a7 100644 --- a/scrapy_zyte_api/_poet_item_pipelines.py +++ b/scrapy_zyte_api/_poet_item_pipelines.py @@ -6,7 +6,7 @@ from scrapy.crawler import Crawler from scrapy.exceptions import NotConfigured from scrapy.utils.misc import load_object -from scrapy_poet.downloadermiddlewares import InjectionMiddleware +from scrapy_poet import InjectionMiddleware from web_poet.fields import get_fields_dict from web_poet.utils import get_fq_class_name from zyte_common_items.fields import is_auto_field @@ -37,9 +37,8 @@ def open_spider(self, spider): self._registry = component.injector.registry return raise RuntimeError( - "Could not find " - "scrapy_poet.downloadermiddlewares.InjectionMiddleware among " - "downloader middlewares. scrapy-poet may be misconfigured." + "Could not find scrapy_poet.InjectionMiddleware among downloader " + "middlewares. scrapy-poet may be misconfigured." ) def process_item(self, item: Any, spider: Spider): diff --git a/tests/test_auto_field_stats.py b/tests/test_auto_field_stats.py index 184c4bed..c9181989 100644 --- a/tests/test_auto_field_stats.py +++ b/tests/test_auto_field_stats.py @@ -780,3 +780,30 @@ def parse(self, response: DummyResponse, product: CustomProduct): # Reset rules default_registry.__init__() # type: ignore[misc] + + +@ensureDeferred +async def test_missing_injection_middleware(mockserver): + + class TestSpider(Spider): + name = "test_spider" + url: str + + def start_requests(self): + for url in ( + mockserver.urljoin("/"), + mockserver.urljoin("/products/b"), + ): + yield Request(url, callback=self.parse) + + def parse(self, response: DummyResponse, product: Product): + yield product + + settings = create_scrapy_settings() + del settings["DOWNLOADER_MIDDLEWARES"]["scrapy_poet.InjectionMiddleware"] + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + settings["ITEM_PIPELINES"]["scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline"] = 0 + settings["ZYTE_API_AUTO_FIELD_STATS"] = True + settings["ZYTE_API_URL"] = 
mockserver.urljoin("/") + with pytest.raises(RuntimeError): + _, _, crawler = await crawl_single_item(TestSpider, HtmlResource, settings) From bd49a107b08bd0b316fa4ca53d559e642b2f8b39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 30 Aug 2024 10:11:35 +0200 Subject: [PATCH 7/9] =?UTF-8?q?ScrapyZyteAPIPoetItemPipeline=20=E2=86=92?= =?UTF-8?q?=20ScrapyZyteAPIAutoFieldStatsItemPipeline?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/setup.rst | 2 +- scrapy_zyte_api/_poet_item_pipelines.py | 2 +- scrapy_zyte_api/addon.py | 6 ++- scrapy_zyte_api/poet.py | 2 +- tests/__init__.py | 2 +- tests/test_addon.py | 6 +-- tests/test_auto_field_stats.py | 56 ++++++++++++++++++------- 7 files changed, 53 insertions(+), 23 deletions(-) diff --git a/docs/setup.rst b/docs/setup.rst index 21243cf1..2d6db2ff 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -160,7 +160,7 @@ item pipeline to the :setting:`ITEM_PIPELINES ` setting: :caption: settings.py ITEM_PIPELINES = { - "scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline": 0, + "scrapy_zyte_api.poet.ScrapyZyteAPIAutoFieldStatsItemPipeline": 0, } diff --git a/scrapy_zyte_api/_poet_item_pipelines.py b/scrapy_zyte_api/_poet_item_pipelines.py index 445582a7..0e5b7ab8 100644 --- a/scrapy_zyte_api/_poet_item_pipelines.py +++ b/scrapy_zyte_api/_poet_item_pipelines.py @@ -14,7 +14,7 @@ logger = getLogger(__name__) -class ScrapyZyteAPIPoetItemPipeline: +class ScrapyZyteAPIAutoFieldStatsItemPipeline: @classmethod def from_crawler(cls, crawler): diff --git a/scrapy_zyte_api/addon.py b/scrapy_zyte_api/addon.py index ca3f1601..4b745d3d 100644 --- a/scrapy_zyte_api/addon.py +++ b/scrapy_zyte_api/addon.py @@ -111,11 +111,13 @@ def update_settings(self, settings: BaseSettings) -> None: except ImportError: pass else: - from scrapy_zyte_api.poet import ScrapyZyteAPIPoetItemPipeline + from scrapy_zyte_api.poet import ScrapyZyteAPIAutoFieldStatsItemPipeline from 
scrapy_zyte_api.providers import ZyteApiProvider _setdefault(settings, "DOWNLOADER_MIDDLEWARES", InjectionMiddleware, 543) - _setdefault(settings, "ITEM_PIPELINES", ScrapyZyteAPIPoetItemPipeline, 0) + _setdefault( + settings, "ITEM_PIPELINES", ScrapyZyteAPIAutoFieldStatsItemPipeline, 0 + ) _setdefault(settings, "SCRAPY_POET_PROVIDERS", ZyteApiProvider, 1100) if settings.getbool("ZYTE_API_SESSION_ENABLED", False): diff --git a/scrapy_zyte_api/poet.py b/scrapy_zyte_api/poet.py index bca7fb15..259226f4 100644 --- a/scrapy_zyte_api/poet.py +++ b/scrapy_zyte_api/poet.py @@ -1 +1 @@ -from ._poet_item_pipelines import ScrapyZyteAPIPoetItemPipeline +from ._poet_item_pipelines import ScrapyZyteAPIAutoFieldStatsItemPipeline diff --git a/tests/__init__.py b/tests/__init__.py index 0034705d..bf6196ea 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -43,7 +43,7 @@ "scrapy_zyte_api.providers.ZyteApiProvider": 1100 } SETTINGS["ITEM_PIPELINES"] = { - "scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline": 0 + "scrapy_zyte_api.poet.ScrapyZyteAPIAutoFieldStatsItemPipeline": 0 } SETTINGS_ADDON: SETTINGS_T = { "ADDONS": { diff --git a/tests/test_addon.py b/tests/test_addon.py index 605a06a8..51e0a96b 100644 --- a/tests/test_addon.py +++ b/tests/test_addon.py @@ -24,10 +24,10 @@ POET = False InjectionMiddleware = None ZyteApiProvider: Optional[Type] = None - ScrapyZyteAPIPoetItemPipeline: Optional[Type] = None + ScrapyZyteAPIAutoFieldStatsItemPipeline: Optional[Type] = None else: POET = True - from scrapy_zyte_api.poet import ScrapyZyteAPIPoetItemPipeline + from scrapy_zyte_api.poet import ScrapyZyteAPIAutoFieldStatsItemPipeline from scrapy_zyte_api.providers import ZyteApiProvider _crawler = get_crawler() @@ -235,7 +235,7 @@ def test_no_poet_setting_changes(initial_settings, expected_settings): InjectionMiddleware: 543, }, "ITEM_PIPELINES": { - ScrapyZyteAPIPoetItemPipeline: 0, + ScrapyZyteAPIAutoFieldStatsItemPipeline: 0, }, "SCRAPY_POET_PROVIDERS": { ZyteApiProvider: 1100, 
diff --git a/tests/test_auto_field_stats.py b/tests/test_auto_field_stats.py index c9181989..87664c14 100644 --- a/tests/test_auto_field_stats.py +++ b/tests/test_auto_field_stats.py @@ -33,7 +33,9 @@ def parse(self, response: DummyResponse, product: Product): settings = create_scrapy_settings() settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} - settings["ITEM_PIPELINES"]["scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline"] = 0 + settings["ITEM_PIPELINES"][ + "scrapy_zyte_api.poet.ScrapyZyteAPIAutoFieldStatsItemPipeline" + ] = 0 settings["ZYTE_API_URL"] = mockserver.urljoin("/") _, _, crawler = await crawl_single_item(TestSpider, HtmlResource, settings) @@ -93,7 +95,9 @@ def parse(self, response: DummyResponse, product: Product): settings = create_scrapy_settings() settings["STATS_CLASS"] = OnlyOnceStatsCollector settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} - settings["ITEM_PIPELINES"]["scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline"] = 0 + settings["ITEM_PIPELINES"][ + "scrapy_zyte_api.poet.ScrapyZyteAPIAutoFieldStatsItemPipeline" + ] = 0 settings["ZYTE_API_AUTO_FIELD_STATS"] = True settings["ZYTE_API_URL"] = mockserver.urljoin("/") _, _, crawler = await crawl_single_item(TestSpider, HtmlResource, settings) @@ -142,7 +146,9 @@ def parse(self, response: DummyResponse, product: Product): settings = create_scrapy_settings() settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} - settings["ITEM_PIPELINES"]["scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline"] = 0 + settings["ITEM_PIPELINES"][ + "scrapy_zyte_api.poet.ScrapyZyteAPIAutoFieldStatsItemPipeline" + ] = 0 settings["ZYTE_API_AUTO_FIELD_STATS"] = True settings["ZYTE_API_URL"] = mockserver.urljoin("/") _, _, crawler = await crawl_single_item( @@ -295,7 +301,9 @@ def parse(self, response: DummyResponse, product: Product): settings = create_scrapy_settings() settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} - 
settings["ITEM_PIPELINES"]["scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline"] = 0 + settings["ITEM_PIPELINES"][ + "scrapy_zyte_api.poet.ScrapyZyteAPIAutoFieldStatsItemPipeline" + ] = 0 settings["ZYTE_API_AUTO_FIELD_STATS"] = True settings["ZYTE_API_URL"] = mockserver.urljoin("/") _, _, crawler = await crawl_single_item( @@ -333,7 +341,9 @@ def parse(self, response: DummyResponse, product: Product): settings = create_scrapy_settings() settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} - settings["ITEM_PIPELINES"]["scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline"] = 0 + settings["ITEM_PIPELINES"][ + "scrapy_zyte_api.poet.ScrapyZyteAPIAutoFieldStatsItemPipeline" + ] = 0 settings["ZYTE_API_AUTO_FIELD_STATS"] = True settings["ZYTE_API_URL"] = mockserver.urljoin("/") _, _, crawler = await crawl_single_item( @@ -385,7 +395,9 @@ async def parse(self, response: DummyResponse, page: MyProductPage): settings = create_scrapy_settings() settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} - settings["ITEM_PIPELINES"]["scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline"] = 0 + settings["ITEM_PIPELINES"][ + "scrapy_zyte_api.poet.ScrapyZyteAPIAutoFieldStatsItemPipeline" + ] = 0 settings["ZYTE_API_AUTO_FIELD_STATS"] = True settings["ZYTE_API_URL"] = mockserver.urljoin("/") _, _, crawler = await crawl_single_item( @@ -452,7 +464,9 @@ async def parse(self, response: DummyResponse, page: AltProductPage): settings = create_scrapy_settings() settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} - settings["ITEM_PIPELINES"]["scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline"] = 0 + settings["ITEM_PIPELINES"][ + "scrapy_zyte_api.poet.ScrapyZyteAPIAutoFieldStatsItemPipeline" + ] = 0 settings["ZYTE_API_AUTO_FIELD_STATS"] = True settings["ZYTE_API_URL"] = mockserver.urljoin("/") _, _, crawler = await crawl_single_item( @@ -504,7 +518,9 @@ def parse(self, response: DummyResponse, product: Product): settings = create_scrapy_settings() settings["SCRAPY_POET_PROVIDERS"] = 
{ZyteApiProvider: 0} - settings["ITEM_PIPELINES"]["scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline"] = 0 + settings["ITEM_PIPELINES"][ + "scrapy_zyte_api.poet.ScrapyZyteAPIAutoFieldStatsItemPipeline" + ] = 0 settings["ZYTE_API_AUTO_FIELD_STATS"] = True settings["ZYTE_API_URL"] = mockserver.urljoin("/") _, _, crawler = await crawl_single_item( @@ -550,7 +566,9 @@ def parse(self, response: DummyResponse, product: Product): settings = create_scrapy_settings() settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} - settings["ITEM_PIPELINES"]["scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline"] = 0 + settings["ITEM_PIPELINES"][ + "scrapy_zyte_api.poet.ScrapyZyteAPIAutoFieldStatsItemPipeline" + ] = 0 settings["ZYTE_API_AUTO_FIELD_STATS"] = True settings["ZYTE_API_URL"] = mockserver.urljoin("/") _, _, crawler = await crawl_single_item( @@ -597,7 +615,9 @@ def parse(self, response: DummyResponse, product: Product): settings = create_scrapy_settings() settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} - settings["ITEM_PIPELINES"]["scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline"] = 0 + settings["ITEM_PIPELINES"][ + "scrapy_zyte_api.poet.ScrapyZyteAPIAutoFieldStatsItemPipeline" + ] = 0 settings["ZYTE_API_AUTO_FIELD_STATS"] = True settings["ZYTE_API_URL"] = mockserver.urljoin("/") _, _, crawler = await crawl_single_item( @@ -653,7 +673,9 @@ def parse(self, response: DummyResponse, product: CustomProduct): settings = create_scrapy_settings() settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} - settings["ITEM_PIPELINES"]["scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline"] = 0 + settings["ITEM_PIPELINES"][ + "scrapy_zyte_api.poet.ScrapyZyteAPIAutoFieldStatsItemPipeline" + ] = 0 settings["ZYTE_API_AUTO_FIELD_STATS"] = True settings["ZYTE_API_URL"] = mockserver.urljoin("/") _, _, crawler = await crawl_single_item( @@ -706,7 +728,9 @@ def parse(self, response: DummyResponse, product: CustomProduct): settings = create_scrapy_settings() 
settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} - settings["ITEM_PIPELINES"]["scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline"] = 0 + settings["ITEM_PIPELINES"][ + "scrapy_zyte_api.poet.ScrapyZyteAPIAutoFieldStatsItemPipeline" + ] = 0 settings["ZYTE_API_AUTO_FIELD_STATS"] = True settings["ZYTE_API_URL"] = mockserver.urljoin("/") caplog.clear() @@ -761,7 +785,9 @@ def parse(self, response: DummyResponse, product: CustomProduct): settings = create_scrapy_settings() settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} - settings["ITEM_PIPELINES"]["scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline"] = 0 + settings["ITEM_PIPELINES"][ + "scrapy_zyte_api.poet.ScrapyZyteAPIAutoFieldStatsItemPipeline" + ] = 0 settings["ZYTE_API_AUTO_FIELD_STATS"] = True settings["ZYTE_API_AUTO_FIELD_URL_FIELDS"] = {CustomProduct: "product_url"} settings["ZYTE_API_URL"] = mockserver.urljoin("/") @@ -802,7 +828,9 @@ def parse(self, response: DummyResponse, product: Product): settings = create_scrapy_settings() del settings["DOWNLOADER_MIDDLEWARES"]["scrapy_poet.InjectionMiddleware"] settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} - settings["ITEM_PIPELINES"]["scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline"] = 0 + settings["ITEM_PIPELINES"][ + "scrapy_zyte_api.poet.ScrapyZyteAPIAutoFieldStatsItemPipeline" + ] = 0 settings["ZYTE_API_AUTO_FIELD_STATS"] = True settings["ZYTE_API_URL"] = mockserver.urljoin("/") with pytest.raises(RuntimeError): From a89f3b72d3e6e28565292c11afed6c8d74c891ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 30 Aug 2024 10:15:31 +0200 Subject: [PATCH 8/9] Improve code readability --- scrapy_zyte_api/_poet_item_pipelines.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/scrapy_zyte_api/_poet_item_pipelines.py b/scrapy_zyte_api/_poet_item_pipelines.py index 0e5b7ab8..975b31d2 100644 --- a/scrapy_zyte_api/_poet_item_pipelines.py +++ 
b/scrapy_zyte_api/_poet_item_pipelines.py @@ -29,7 +29,7 @@ def __init__(self, crawler: Crawler): self._seen: Set[Type] = set() self._crawler = crawler self._stats = crawler.stats - self._cls_without_url: Set[Type] = set() + self._item_cls_without_url: Set[Type] = set() def open_spider(self, spider): for component in self._crawler.engine.downloader.middleware.middlewares: @@ -42,17 +42,17 @@ def open_spider(self, spider): ) def process_item(self, item: Any, spider: Spider): - cls = item.__class__ + item_cls = item.__class__ - url_field = self._url_fields.get(cls, "url") + url_field = self._url_fields.get(item_cls, "url") adapter = ItemAdapter(item) url = adapter.get(url_field, None) if not url: - if cls not in self._cls_without_url: - self._cls_without_url.add(cls) + if item_cls not in self._item_cls_without_url: + self._item_cls_without_url.add(item_cls) logger.warning( - f"An item of type {cls} was missing a non-empty URL in " - f"its {url_field!r} field. An item URL is necessary to " + f"An item of type {item_cls} was missing a non-empty URL " + f"in its {url_field!r} field. 
An item URL is necessary to " f"determine the page object that was used to generate " f"that item, and hence print the auto field stats that " f"you requested by enabling the ZYTE_API_AUTO_FIELD_STATS " @@ -62,9 +62,9 @@ def process_item(self, item: Any, spider: Spider): ) return - page_cls = self._registry.page_cls_for_item(url, cls) - cls = page_cls or cls + page_cls = self._registry.page_cls_for_item(url, item_cls) + cls = page_cls or item_cls if cls in self._seen: return self._seen.add(cls) @@ -74,8 +74,8 @@ def process_item(self, item: Any, spider: Spider): else: auto_fields = set() missing_fields = False - for field_name in get_fields_dict(cls): - if is_auto_field(cls, field_name): # type: ignore[arg-type] + for field_name in get_fields_dict(page_cls): + if is_auto_field(page_cls, field_name): # type: ignore[arg-type] auto_fields.add(field_name) else: missing_fields = True From 959bbdf2542d8c1e2074f1edd83458824aa86534 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 26 Sep 2024 09:46:11 +0200 Subject: [PATCH 9/9] Add missing return values to process_item --- scrapy_zyte_api/_poet_item_pipelines.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scrapy_zyte_api/_poet_item_pipelines.py b/scrapy_zyte_api/_poet_item_pipelines.py index 975b31d2..158356c0 100644 --- a/scrapy_zyte_api/_poet_item_pipelines.py +++ b/scrapy_zyte_api/_poet_item_pipelines.py @@ -60,13 +60,13 @@ def process_item(self, item: Any, spider: Spider): f"that item type, use the ZYTE_API_AUTO_FIELD_URL_FIELDS " f"setting to set a different field." ) - return + return item page_cls = self._registry.page_cls_for_item(url, item_cls) cls = page_cls or item_cls if cls in self._seen: - return + return item self._seen.add(cls) if not page_cls: @@ -86,3 +86,4 @@ def process_item(self, item: Any, spider: Spider): cls_fqn = get_fq_class_name(cls) self._stats.set_value(f"scrapy-zyte-api/auto_fields/{cls_fqn}", field_list) + return item