From ab5fdd01de2e098165a5c1748380b7c13133c653 Mon Sep 17 00:00:00 2001 From: Phil Budne Date: Wed, 9 Oct 2024 22:36:03 -0400 Subject: [PATCH 1/5] Add additional mypy settings, quiet new and existing errors --- indexer/scripts/elastic-conf.py | 1 + indexer/workers/fetcher/BlacklistRedirectMiddleware.py | 2 +- indexer/workers/fetcher/batch_spider.py | 2 +- indexer/workers/fetcher/fetch_worker.py | 1 + pyproject.toml | 9 +++++++++ 5 files changed, 13 insertions(+), 2 deletions(-) diff --git a/indexer/scripts/elastic-conf.py b/indexer/scripts/elastic-conf.py index 77ea617f..2e0cb650 100644 --- a/indexer/scripts/elastic-conf.py +++ b/indexer/scripts/elastic-conf.py @@ -152,6 +152,7 @@ def register_repository(self, es: Elasticsearch) -> bool: if self.es_snapshot_s3_endpoint: settings["endpoint"] = self.es_snapshot_s3_endpoint else: # repo-type=fs + assert self.es_snapshot_fs_location settings = {"location": self.es_snapshot_fs_location} try: diff --git a/indexer/workers/fetcher/BlacklistRedirectMiddleware.py b/indexer/workers/fetcher/BlacklistRedirectMiddleware.py index fce69b89..aca09681 100644 --- a/indexer/workers/fetcher/BlacklistRedirectMiddleware.py +++ b/indexer/workers/fetcher/BlacklistRedirectMiddleware.py @@ -11,7 +11,7 @@ from w3lib.url import safe_url_string -class BlacklistRedirectMiddleware(BaseRedirectMiddleware): # type: ignore[no-any-unimported] +class BlacklistRedirectMiddleware(BaseRedirectMiddleware): # type: ignore[no-any-unimported,misc] """ Handle redirection of requests based on response status and meta-refresh html tag. Very Minor Edit of scrapy's builtin RedirectMiddleware diff --git a/indexer/workers/fetcher/batch_spider.py b/indexer/workers/fetcher/batch_spider.py index 13d840b8..2f4e1318 100644 --- a/indexer/workers/fetcher/batch_spider.py +++ b/indexer/workers/fetcher/batch_spider.py @@ -11,7 +11,7 @@ logger = logging.getLogger(__name__) -class BatchSpider(scrapy.Spider): # type: ignore[no-any-unimported] +class BatchSpider(scrapy.Spider): # type: ignore[no-any-unimported,misc] """ This spider is given a batch_index, loads the corresponding batchfile from the disk, then fetches the urls in that batch's stories, and then saves the corresponding html and http_metadata diff --git a/indexer/workers/fetcher/fetch_worker.py b/indexer/workers/fetcher/fetch_worker.py index 17fef3a2..af498a1c 100644 --- a/indexer/workers/fetcher/fetch_worker.py +++ b/indexer/workers/fetcher/fetch_worker.py @@ -145,6 +145,7 @@ def scrapy_cb(self, story: BaseStory) -> None: if not hasattr(self, "sender"): self.qconnect() self.sender = self.story_sender() + assert self.sender self.sender.send_story(story) status_label = "success" diff --git a/pyproject.toml b/pyproject.toml index 83b54f76..16100621 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,6 +54,15 @@ warn_return_any = true warn_unused_ignores = true show_error_codes = true +# from https://quantlane.com/blog/type-checking-large-codebase/ +# in the order they appear: +disallow_untyped_calls = true +disallow_untyped_decorators = true +#disallow_any_generics = true +disallow_subclassing_any = true +warn_redundant_casts = true +warn_unused_configs = true + # look for local stubs to PyPI packages in top level stubs directory mypy_path = "stubs" From 8018b967d10af8c073993ccf10e18730e5ef468e Mon Sep 17 00:00:00 2001 From: Phil Budne Date: Wed, 9 Oct 2024 23:08:39 -0400 Subject: [PATCH 2/5] enable mypy "disallow_any_generics", with associated additional specificities --- indexer/elastic.py | 2 +- indexer/pipeline.py | 7 +++---- indexer/scripts/qutil.py | 9 ++++++--- indexer/story.py | 22 ++++++++++++---------- indexer/tests/test_story.py | 2 +- indexer/worker.py | 2 +- pyproject.toml | 2 +- 7 files changed, 25 insertions(+), 21 deletions(-) diff --git a/indexer/elastic.py b/indexer/elastic.py index 0d053d10..784538e0 100644 --- a/indexer/elastic.py +++ b/indexer/elastic.py @@ -75,7 +75,7 @@ def process_args(self) -> None: logger.fatal("need --elasticsearch_config_dir or ELASTICSEARCH_CONFIG_DIR") sys.exit(1) - def _load_template(self, name: str) -> dict | Any: + def _load_template(self, name: str) -> dict[str, Any] | Any: """ Load a JSON file from the Elasticsearch configuration directory. Args: diff --git a/indexer/pipeline.py b/indexer/pipeline.py index a53388e2..d0dc7bc4 100644 --- a/indexer/pipeline.py +++ b/indexer/pipeline.py @@ -14,7 +14,7 @@ import argparse import logging import sys -from typing import Callable, Dict, List, Union, cast +from typing import Any, Callable, Dict, List, Union, cast # PyPI from pika.exchange_type import ExchangeType @@ -121,8 +121,7 @@ def __init__(self, pipeline: "Pipeline", name: str): Outputs = List[Union[Worker, Consumer]] -# CommandMethod = Callable[["Pipeline"],None] -CommandMethod = Callable +CommandMethod = Callable[["Pipeline"], None] def command(func: CommandMethod) -> CommandMethod: @@ -367,7 +366,7 @@ def get_command_func(self, cmd: str) -> Callable[[], None]: assert callable(meth) return cast(Callable[[], None], meth) - def get_definitions(self) -> Dict: + def get_definitions(self) -> Dict[str, Any]: api = self.admin_api() defns = api.get_definitions() assert isinstance(defns, dict) diff --git a/indexer/scripts/qutil.py b/indexer/scripts/qutil.py index c9928a18..15ec8423 100644 --- a/indexer/scripts/qutil.py +++ b/indexer/scripts/qutil.py @@ -29,8 +29,7 @@ COMMANDS: List[str] = [] -# CommandMethod = Callable[["Pipeline"],None] -CommandMethod = Callable +CommandMethod = Callable[["QUtil"], None] def command(func: CommandMethod) -> CommandMethod: @@ -222,7 +221,11 @@ def get_queue(self) -> str: sys.exit(1) return str(queue) - def dump_msgs(self, writer: Callable, flush: Callable) -> None: + def dump_msgs( + self, + writer: Callable[[bytes, int, BasicProperties], None], + flush: Callable[[], None], + ) -> None: """ utility to read messages from queue & call writer """ diff --git a/indexer/story.py b/indexer/story.py index 6830e45c..fd945851 100644 --- a/indexer/story.py +++ b/indexer/story.py @@ -24,7 +24,9 @@ def uuid_by_link(link: str) -> str: # enforces a specific naming pattern within this object, for concision and extensibility in the exit cb # https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case -def class_to_member_name(original_class: Callable, private: bool = True) -> str: +def class_to_member_name( + original_class: type["StoryData"], private: bool = True +) -> str: name = original_class.__name__ name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name) if private: @@ -41,19 +43,19 @@ class StoryData: """ dirty: bool = False - exit_cb: Callable = field(repr=False) + exit_cb: Callable[["StoryData"], None] = field(repr=False) frozen: bool = field(default=False, repr=False) CONTENT_TYPE: str = field(default="json", repr=False) MEMBER_NAME: str = field(default="", repr=False) - internals: tuple = field( - default=( + internals: list[str] = field( + default=[ "internals", "dirty", "frozen", "exit_cb", "CONTENT_TYPE", "MEMBER_NAME", - ), + ], repr=False, ) @@ -87,8 +89,8 @@ def __exit__(self, type: Any, value: Any, traceback: Any) -> None: self.frozen = True self.exit_cb(self) - def as_dict(self) -> dict: - output: dict = {} + def as_dict(self) -> dict[str, Any]: + output: dict[str, Any] = {} for key in fields(self): if key.name not in self.internals: output[key.name] = getattr(self, key.name) @@ -96,7 +98,7 @@ def as_dict(self) -> dict: return output # As a convenience for loading in values from a storage interface. - def load_dict(self, load_dict: dict) -> None: + def load_dict(self, load_dict: dict[str, Any]) -> None: field_names: list[str] = [f.name for f in fields(self)] for key, value in load_dict.items(): if key in field_names: @@ -319,7 +321,7 @@ def __init__(self, directory: Optional[str] = None): # Using the dict interface to story_data so as to avoid typing issues. def init_storage(self, story_data: StoryData) -> None: - data_dict: dict = story_data.as_dict() + data_dict: dict[str, Any] = story_data.as_dict() fetch_date = data_dict["fetch_date"] if fetch_date is None: @@ -421,7 +423,7 @@ def __init__(self) -> None: "DiskStory": DiskStory, } - def __call__(self, *args: List, **kwargs: Dict[Any, Any]) -> BaseStory: + def __call__(self, *args: List[Any], **kwargs: Dict[Any, Any]) -> BaseStory: instance = self.classes[self.iface](*args, **kwargs) assert isinstance(instance, BaseStory) return instance diff --git a/indexer/tests/test_story.py b/indexer/tests/test_story.py index 8d02b5c3..507057d1 100644 --- a/indexer/tests/test_story.py +++ b/indexer/tests/test_story.py @@ -128,7 +128,7 @@ def set_env(self) -> None: # We want this to be cmdline toggleable probably. @pytest.fixture(scope="class", autouse=True) - def teardown_test_datadir(self, request: Any) -> Generator: + def teardown_test_datadir(self, request: Any) -> Generator[None, None, None]: yield shutil.rmtree(TEST_DATA_DIR) diff --git a/indexer/worker.py b/indexer/worker.py index 1dd3dc28..900b3b59 100644 --- a/indexer/worker.py +++ b/indexer/worker.py @@ -751,7 +751,7 @@ def _pika_ack_and_commit(self, im: InputMessage, multiple: bool = False) -> None # AFTER basic_ack! chan.tx_commit() # commit sent messages and ack atomically! - def _exc_headers(self, e: Exception) -> Dict: + def _exc_headers(self, e: Exception) -> Dict[str, str]: """ return dict of headers to add to a message after an exception was caught diff --git a/pyproject.toml b/pyproject.toml index 16100621..920069eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,7 +58,7 @@ show_error_codes = true # in the order they appear: disallow_untyped_calls = true disallow_untyped_decorators = true -#disallow_any_generics = true +disallow_any_generics = true disallow_subclassing_any = true warn_redundant_casts = true warn_unused_configs = true From 68d4f4635b731f4215d97ef2f52e6eb36053d14c Mon Sep 17 00:00:00 2001 From: Phil Budne Date: Thu, 10 Oct 2024 00:05:58 -0400 Subject: [PATCH 3/5] add stubs/rabbitmq_admin.pyi and adjust usage --- indexer/pipeline.py | 19 ++++++++++--------- indexer/scripts/rabbitmq-stats.py | 2 +- indexer/worker.py | 2 +- stubs/rabbitmq_admin.pyi | 18 ++++++++++++++++++ 4 files changed, 30 insertions(+), 11 deletions(-) create mode 100644 stubs/rabbitmq_admin.pyi diff --git a/indexer/pipeline.py b/indexer/pipeline.py index d0dc7bc4..3e864e33 100644 --- a/indexer/pipeline.py +++ b/indexer/pipeline.py @@ -332,9 +332,10 @@ def qlen(self) -> None: @command def show(self) -> None: """show queues, exchanges, bindings""" - defns = self.get_definitions() - for what in ("queues", "exchanges", "bindings"): - things = defns[what] + api = self.admin_api() + defns = api.get_definitions() + + def dump_things(what: str, things: list[dict[str, Any]]) -> None: print("") if things: print(what) @@ -343,6 +344,12 @@ def show(self) -> None: else: print(f"no {what}") + # stubs/rabbitmq_admin.pyi defines get_definitions() + # as returning a TypedDict, so need to index with constant: + dump_things("queues", defns["queues"]) + dump_things("exchanges", defns["exchanges"]) + dump_things("bindings", defns["bindings"]) + @command def test(self) -> None: """test if queues configured""" @@ -366,12 +373,6 @@ def get_command_func(self, cmd: str) -> Callable[[], None]: assert callable(meth) return cast(Callable[[], None], meth) - def get_definitions(self) -> Dict[str, Any]: - api = self.admin_api() - defns = api.get_definitions() - assert isinstance(defns, dict) - return defns - class MyPipeline(Pipeline): PIPE_TYPES = ["batch-fetcher", "queue-fetcher", "historical", "archive", "csv"] diff --git a/indexer/scripts/rabbitmq-stats.py b/indexer/scripts/rabbitmq-stats.py index f325b75e..ea77fc24 100644 --- a/indexer/scripts/rabbitmq-stats.py +++ b/indexer/scripts/rabbitmq-stats.py @@ -79,7 +79,7 @@ def main_loop(self) -> None: nodes = api.list_nodes() for node in nodes: # List # leading part is cluster name, split and give second part?? - name = node.get("name").replace("@", "-") + name = node.get("name", "").replace("@", "-") self.g(node, "fd_used", "nodes", "fds", "name", name) self.g(node, "mem_used", "nodes", "memory", "name", name) self.g(node, "sockets_used", "nodes", "sockets", "name", name) diff --git a/indexer/worker.py b/indexer/worker.py index 900b3b59..0475a1e9 100644 --- a/indexer/worker.py +++ b/indexer/worker.py @@ -523,7 +523,7 @@ def sender() -> None: dest = routing_key # using default exchange self.incr("sent-msgs", labels=[("dest", dest)]) - def admin_api(self) -> rabbitmq_admin.AdminAPI: # type: ignore[no-any-unimported] + def admin_api(self) -> rabbitmq_admin.AdminAPI: args = self.args assert args diff --git a/stubs/rabbitmq_admin.pyi b/stubs/rabbitmq_admin.pyi new file mode 100644 index 00000000..470db8f0 --- /dev/null +++ b/stubs/rabbitmq_admin.pyi @@ -0,0 +1,18 @@ +# only what we use... + +import typing as _t + +_JSON = dict[str, _t.Any] +_LIST_JSON = list[_JSON] + +class _GetDefinitionsReturn(_t.TypedDict): + bindings: _LIST_JSON + exchanges: _LIST_JSON + queues: _LIST_JSON + +class AdminAPI: + def __init__(self, url: str, auth: tuple[str, str]): ... + def _api_get(self, url: str) -> _t.Any: ... + def get_definitions(self) -> _GetDefinitionsReturn: ... + def list_exchanges(self) -> _LIST_JSON: ... + def list_nodes(self) -> _LIST_JSON: ... From 1a9fa648aef8946931467bd4f14959d005326fdf Mon Sep 17 00:00:00 2001 From: Phil Budne Date: Thu, 10 Oct 2024 00:17:05 -0400 Subject: [PATCH 4/5] fix generics complaints in batch fetcher files --- indexer/workers/fetcher/batch_spider.py | 8 ++++++-- indexer/workers/fetcher/rss_utils.py | 6 +++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/indexer/workers/fetcher/batch_spider.py b/indexer/workers/fetcher/batch_spider.py index 2f4e1318..0840124b 100644 --- a/indexer/workers/fetcher/batch_spider.py +++ b/indexer/workers/fetcher/batch_spider.py @@ -40,14 +40,18 @@ class BatchSpider(scrapy.Spider): # type: ignore[no-any-unimported,misc] } def __init__( - self, batch: List[BaseStory], cb: Callable, *args: List, **kwargs: Dict + self, + batch: List[BaseStory], + cb: Callable[[BaseStory], None], + *args: List[Any], + **kwargs: Dict[str, Any] ) -> None: super(BatchSpider, self).__init__(*args, **kwargs) self.batch = batch self.cb = cb - def start_requests(self) -> Generator: + def start_requests(self) -> Generator[scrapy.Request, None, None]: # type: ignore[no-any-unimported] for story in self.batch: url = story.rss_entry().link diff --git a/indexer/workers/fetcher/rss_utils.py b/indexer/workers/fetcher/rss_utils.py index 6292235a..7e7d04b7 100644 --- a/indexer/workers/fetcher/rss_utils.py +++ b/indexer/workers/fetcher/rss_utils.py @@ -109,7 +109,7 @@ def src_attr_int(attr_name: str) -> Optional[int]: def batch_rss( source_list: List[RSSEntry], num_batches: int = 20, max_domain_size: int = 10000 -) -> tuple[List, Dict]: +) -> tuple[List[List[RSSEntry]], Dict[str, int]]: """ Greedily pack source_list into num_batches batches, keeping each domain together. """ @@ -127,8 +127,8 @@ def batch_rss( [(k, v) for k, v in agg.items()], key=lambda x: x[1] ) - batches: List = [[] for i in range(num_batches)] - batch_map: Dict = {} + batches: List[List[RSSEntry]] = [[] for i in range(num_batches)] + batch_map: Dict[str, int] = {} while len(domains_sorted) > 0: domain = domains_sorted.pop() From 77d70eb7ea76c56b03c7831aea03c89bd3b5c32c Mon Sep 17 00:00:00 2001 From: Phil Budne Date: Thu, 10 Oct 2024 00:50:12 -0400 Subject: [PATCH 5/5] indexer/story.py: fix internals with variadic tuple? --- indexer/story.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/indexer/story.py b/indexer/story.py index fd945851..4f7cf125 100644 --- a/indexer/story.py +++ b/indexer/story.py @@ -47,15 +47,15 @@ class StoryData: frozen: bool = field(default=False, repr=False) CONTENT_TYPE: str = field(default="json", repr=False) MEMBER_NAME: str = field(default="", repr=False) - internals: list[str] = field( - default=[ + internals: tuple[str, ...] = field( + default=( "internals", "dirty", "frozen", "exit_cb", "CONTENT_TYPE", "MEMBER_NAME", - ], + ), repr=False, )