ruff: update config and fix errors

karlicoss · Aug 30, 2024 · 9bcea3f · 9bcea3f
1 parent 1773802
commit 9bcea3f
Show file tree

Hide file tree

Showing 55 changed files with 399 additions and 263 deletions.
diff --git a/ruff.toml b/ruff.toml
@@ -1,4 +1,50 @@
+target-version = "py38"  # NOTE: inferred from pyproject.toml if present
+
+lint.extend-select = [
+    "F",    # flakes rules -- default, but extend just in case
+    "E",    # pycodestyle  -- default, but extend just in case
+    "W",    # various warnings
+
+    "B",    # 'bugbear' set -- various possible bugs
+    "C4",   # flake8-comprehensions -- unnecessary list/map/dict calls
+    "COM",  # trailing commas
+    "EXE",  # various checks wrt executable files
+    # "I",    # sort imports
+    "ICN",  # various import conventions
+    "FBT",  # detect use of boolean arguments
+    "FURB", # various rules
+    "PERF", # various potential performance speedups
+    "PD",   # pandas rules
+    "PIE",  # 'misc' lints
+    "PLC",  # pylint convention rules
+    "PLR",  # pylint refactor rules
+    "PLW",  # pylint warnings
+    "PT",   # pytest stuff
+    "PYI",  # various type hinting rules
+    "RET",  # early returns
+    "RUF",  # various ruff-specific rules
+    "TID",  # various imports suggestions
+    "TRY",  # various exception handling rules
+    "UP",   # detect deprecated python stdlib stuff
+    "FA",   # suggest using from __future__ import annotations
+    "PTH",  # pathlib migration
+    "ARG",  # unused argument checks
+    "A",    # builtin shadowing
+    # "EM",  # TODO hmm could be helpful to prevent duplicate err msg in traceback.. but kinda annoying
+
+    # "ALL", # uncomment this to check for new rules!
+]
+
 lint.ignore = [
+    "D",     # annoying nags about docstrings
+    "N",     # pep naming
+    "TCH",   # type checking rules, mostly just suggests moving imports under TYPE_CHECKING
+    "S",     # bandit (security checks) -- tends to be not very useful, lots of nitpicks
+    "DTZ",   # datetimes checks -- complaining about missing tz and mostly false positives
+    "FIX",   # complains about fixmes/todos -- annoying
+    "TD",    # complains about todo formatting -- too annoying
+    "ANN",   # missing type annotations? seems way to strict though
+
 ### too opinionated style checks
     "E501",  # too long lines
     "E702",  # Multiple statements on one line (semicolon)
@@ -17,9 +63,81 @@ lint.ignore = [
     "E402",  # Module level import not at top of file
 
 ### maybe consider these soon
-# sometimes it's useful to give a variable a name even if we don't use it as a documentation
-# on the other hand, often is a sign of error
+    # sometimes it's useful to give a variable a name even if we don't use it as a documentation
+    # on the other hand, often is a sign of error
     "F841",  # Local variable `count` is assigned to but never used
-    "F401",  # imported but unused
 ###
+
+    "RUF100",  # unused noqa -- handle later
+    "RUF012",  # mutable class attrs should be annotated with ClassVar... ugh pretty annoying for user configs
+
+### these are just nitpicky, we usually know better
+    "PLR0911",  # too many return statements
+    "PLR0912",  # too many branches
+    "PLR0913",  # too many function arguments
+    "PLR0915",  # too many statements
+    "PLR1714",  # consider merging multiple comparisons
+    "PLR2044",  # line with empty comment
+    "PLR5501",  # use elif instead of else if
+    "PLR2004",  # magic value in comparison -- super annoying in tests
+###
+    "PLR0402",  # import X.Y as Y -- TODO maybe consider enabling it, but double check
+
+    "B009",  # calling gettattr with constant attribute -- this is useful to convince mypy
+    "B010",  # same as above, but setattr
+    "B011",  # complains about assert False
+    "B017",  # pytest.raises(Exception)
+    "B023",  # seems to result in false positives?
+    "B028",  # suggest using explicit stacklevel? TODO double check later, but not sure it's useful
+
+    # complains about useless pass, but has sort of a false positive if the function has a docstring?
+    # this is common for click entrypoints (e.g. in __main__), so disable
+    "PIE790",
+
+    # a bit too annoying, offers to convert for loops to list comprehension
+    # , which may heart readability
+    "PERF401",
+
+    # suggests no using exception in for loops
+    # we do use this technique a lot, plus in 3.11 happy path exception handling is "zero-cost"
+    "PERF203",
+
+    "RET504", # unnecessary assignment before returning -- that can be useful for readability
+    "RET505", # unnecessary else after return -- can hurt readability
+
+    "PLW0603",  # global variable update.. we usually know why we are doing this
+    "PLW2901",  # for loop variable overwritten, usually this is intentional
+
+    "PT004",  # deprecated rule, will be removed later
+    "PT011",  # pytest raises should is too broad
+    "PT012",  # pytest raises should contain a single statement
+
+    "COM812",  # trailing comma missing -- mostly just being annoying with long multiline strings
+
+    "PD901",   # generic variable name df
+
+    "TRY003",  # suggests defining exception messages in exception class -- kinda annoying
+    "TRY004",  # prefer TypeError -- don't see the point
+    "TRY201",  # raise without specifying exception name -- sometimes hurts readability
+    "TRY400",  # TODO double check this, might be useful
+    "TRY401",  # redundant exception in logging.exception call? TODO double check, might result in excessive logging
+
+    "PGH",  # TODO force error code in mypy instead
+
+    "TID252",  # Prefer absolute imports over relative imports from parent modules
+
+    "UP038",  # suggests using | (union) in isisntance checks.. but it results in slower code
+
+    ## too annoying
+    "T20",     # just complains about prints and pprints
+    "Q",       # flake quotes, too annoying
+    "C90",     # some complexity checking
+    "G004",    # logging statement uses f string
+    "ERA001",  # commented out code
+    "SLF001",  # private member accessed
+    "BLE001",  # do not catch 'blind' Exception
+    "INP001",  # complains about implicit namespace packages
+    "SIM",     # some if statements crap
+    "RSE102",  # complains about missing parens in exceptions
+    ##
 ]
diff --git a/setup.py b/setup.py
@@ -29,6 +29,7 @@ def main() -> None:
 
         install_requires=[
             'more-itertools',
+            'typing-extensions',
             'click'        , # nicer cli
             'plumbum'      , # nicer command composition/piping
             'python-magic' , # better mimetype detection (could be optional?)

diff --git a/src/bleanser/core/__init__.py b/src/bleanser/core/__init__.py
@@ -1,2 +1,5 @@
 from .common import logger
-from .main import main
+
+__all__ = [
+    'logger',
+]
diff --git a/src/bleanser/core/common.py b/src/bleanser/core/common.py
@@ -1,11 +1,11 @@
+from __future__ import annotations
+
 from dataclasses import dataclass
 from pathlib import Path
-from typing import NamedTuple, Sequence, Union, List
+from typing import TYPE_CHECKING, Sequence, Union
 
-from .utils import assert_never
 from .ext.logging import LazyLogger
 
-
 logger = LazyLogger(__name__, level='debug')
 
 
@@ -61,11 +61,12 @@ class Keep(Instruction):
 
 ### helper to define paramertized tests in function's body
 from .utils import under_pytest
-if under_pytest:
+
+if TYPE_CHECKING or under_pytest:
     import pytest
     parametrize = pytest.mark.parametrize
 else:
-    parametrize = lambda *args,**kwargs: (lambda f: f)  # type: ignore
+    parametrize = lambda *_args, **_kwargs: (lambda f: f)
 ###
 
 
@@ -100,7 +101,7 @@ def divide_by_size(*, buckets: int, paths: Sequence[Path]) -> Sequence[Sequence[
     with_size = [(p, p.stat().st_size) for p in paths]
     bucket_size = sum(sz for _, sz in with_size) / buckets
 
-    group: List[Path] = []
+    group: list[Path] = []
     group_size = 0
 
     def dump() -> None:

diff --git a/src/bleanser/core/compat.py b/src/bleanser/core/compat.py
@@ -0,0 +1,6 @@
+import sys
+
+if sys.version_info[:2] >= (3, 11):
+    from typing import Never, assert_never, assert_type, Self  # noqa: F401
+else:
+    from typing_extensions import Never, assert_never, assert_type, Self  # noqa: F401
diff --git a/src/bleanser/core/ext/dummy_executor.py b/src/bleanser/core/ext/dummy_executor.py
@@ -1,8 +1,10 @@
+from __future__ import annotations
+
 # https://stackoverflow.com/a/10436851/706389
-from typing import Any, Optional
+from typing import Any
 from concurrent.futures import Future, Executor
 class DummyExecutor(Executor):
-    def __init__(self, max_workers: Optional[int]=1) -> None:
+    def __init__(self, max_workers: int | None=1) -> None:
         self._shutdown = False
         self._max_workers = max_workers
 
@@ -22,5 +24,5 @@ def submit(self, fn, *args, **kwargs):  # type: ignore[override,unused-ignore]
 
         return f
 
-    def shutdown(self, wait: bool=True, **kwargs) -> None:
+    def shutdown(self, wait: bool = True, **kwargs) -> None:  # noqa: FBT001,FBT002,ARG002
         self._shutdown = True
diff --git a/src/bleanser/core/ext/logging.py b/src/bleanser/core/ext/logging.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 '''
 Default logger is a bit meh, see 'test'/run this file for a demo
 TODO name 'klogging' to avoid possible conflict with default 'logging' module

diff --git a/src/bleanser/core/ext/sqlite_dumben.py b/src/bleanser/core/ext/sqlite_dumben.py
@@ -7,18 +7,19 @@
 # - constraints
 # this is useful if you want to mess/cleanup the database, but don't want to trip over constraints/triggers
 # NOTE: handling everything as bytes since not sure I wanna mess with encoding here (esp. row data encoding)
+from __future__ import annotations
 
 import hashlib
 import os
-from pathlib import Path
-from tempfile import TemporaryDirectory, TemporaryFile
 import re
 import shutil
 import sqlite3
 import subprocess
-from subprocess import check_output, check_call, DEVNULL
 import sys
-from typing import List, Dict, Optional
+from pathlib import Path
+from subprocess import DEVNULL, check_call, check_output
+from tempfile import TemporaryDirectory
+from typing import Dict
 
 
 Tables = Dict[str, Dict[str, str]]
@@ -35,7 +36,7 @@ def _get_tables(db: Path) -> Tables:
             tables.append(table)
 
         for table in tables:
-            schema: Dict[str, str] = {}
+            schema: dict[str, str] = {}
             for row in conn.execute(f'PRAGMA table_info({table})'):
                 col   = row[1]
                 type_ = row[2]
@@ -45,7 +46,7 @@ def _get_tables(db: Path) -> Tables:
 
 
 def _sqlite(*cmd):
-    return ['sqlite3', '-bail'] + list(cmd)
+    return ['sqlite3', '-bail', *cmd]
 
 
 def _dumben_db(output_db: Path) -> None:
@@ -108,7 +109,7 @@ def _dumben_db(output_db: Path) -> None:
     subprocess.check_call(_sqlite(output_db, 'PRAGMA integrity_check;'), stdout=DEVNULL)
 
 
-def run(*, db: Path, output: Optional[Path], output_as_db: bool) -> None:
+def run(*, db: Path, output: Path | None, output_as_db: bool) -> None:
     if output is not None:
         assert not output.exists(), output
 
@@ -118,7 +119,7 @@ def run(*, db: Path, output: Optional[Path], output_as_db: bool) -> None:
     if output_as_db:
         assert output is not None
 
-        dumben_cache: Optional[Path] = None
+        dumben_cache: Path | None = None
         _DUMBEN_CACHE_BASE = os.environ.get('SQLITE_DUMBEN_USE_CACHE')
         if _DUMBEN_CACHE_BASE is not None:
             DUMBEN_CACHE_BASE = Path(_DUMBEN_CACHE_BASE)

diff --git a/src/bleanser/core/main.py b/src/bleanser/core/main.py
@@ -1,18 +1,25 @@
 # not to confuse with __main__.py... meh
+from __future__ import annotations
+
+import os
 from glob import glob as do_glob
 from pathlib import Path
-import os
-from typing import Optional, List, Type, cast
-
-from .common import logger, Dry, Move, Remove, Mode
-from .processor import BaseNormaliser, compute_instructions, apply_instructions, bleanser_tmp_directory
+from typing import cast
 
 import click
 
+from .common import Dry, Mode, Move, Remove, logger
+from .processor import (
+    BaseNormaliser,
+    apply_instructions,
+    bleanser_tmp_directory,
+    compute_instructions,
+)
+
 
 # TODO use context and default_map
 # https://click.palletsprojects.com/en/7.x/commands/#overriding-defaults
-def main(*, Normaliser: Type[BaseNormaliser]) -> None:
+def main(*, Normaliser: type[BaseNormaliser]) -> None:
     # meh.. by default the width is stupid, like 80 chars
     context_settings = {
         'max_content_width': 120,
@@ -34,7 +41,7 @@ def call_main() -> None:
     @click.option  ('--difftool'     , type=str                                      , help='Custom difftool to use')
     @click.option  ('--from', 'from_', type=int    , default=None)
     @click.option  ('--to'           , type=int    , default=None                    , help='non-inclusive, i.e. [from, to)')
-    def diff(path1: str, path2: Path, *, glob: bool, from_: Optional[int], to: Optional[int], vim: bool, difftool: str) -> None:
+    def diff(path1: str, path2: Path, *, glob: bool, from_: int | None, to: int | None, vim: bool, difftool: str) -> None:
         path1_: Path
         if glob:
             assert path2 is cast(Path, _DEFAULT), path2
@@ -44,7 +51,8 @@ def diff(path1: str, path2: Path, *, glob: bool, from_: Optional[int], to: Optio
             paths = _get_paths(path=path1, from_=from_, to=to, glob=glob)
         else:
             assert cast(str, path2) is not _DEFAULT
-            assert from_ is None and to is None  # just for sanity
+            assert from_ is None
+            assert to is None
             path1_ = Path(path1)
             path2 = Path(path2)
             paths = [path1_, path2]
@@ -63,7 +71,7 @@ def diff(path1: str, path2: Path, *, glob: bool, from_: Optional[int], to: Optio
     @call_main.command(name='normalised', short_help='normalise file and dump to stdout')
     @click.argument('path', type=Path)
     @click.option('--stdout', is_flag=True, help='print normalised files to stdout instead of printing the path to it')
-    def normalised(path: Path, stdout: bool) -> None:
+    def normalised(*, path: Path, stdout: bool) -> None:
         with bleanser_tmp_directory() as base_tmp_dir:
             n = Normaliser(original=path, base_tmp_dir=base_tmp_dir)
             with n.do_normalise() as cleaned:
@@ -94,8 +102,8 @@ def normalised(path: Path, stdout: bool) -> None:
     ##
     @click.option  ('--multiway'       , is_flag=True, default=None                , help='force "multiway" cleanup')
     @click.option  ('--prune-dominated', is_flag=True, default=None)
-    def prune(path: str, sort_by: str, glob: bool, dry: bool, move: Optional[Path], remove: bool, threads: Optional[int], from_: Optional[int], to: Optional[int], multiway: Optional[bool], prune_dominated: Optional[bool], yes: bool) -> None:
-        modes: List[Mode] = []
+    def prune(*, path: str, sort_by: str, glob: bool, dry: bool, move: Path | None, remove: bool, threads: int | None, from_: int | None, to: int | None, multiway: bool | None, prune_dominated: bool | None, yes: bool) -> None:
+        modes: list[Mode] = []
         if dry is True:
             modes.append(Dry())
         if move is not None:
@@ -128,18 +136,18 @@ def prune(path: str, sort_by: str, glob: bool, dry: bool, move: Optional[Path],
     call_main()
 
 
-def _get_paths(*, path: str, from_: Optional[int], to: Optional[int], sort_by: str = "name", glob: bool=False) -> List[Path]:
+def _get_paths(*, path: str, from_: int | None, to: int | None, sort_by: str = "name", glob: bool=False) -> list[Path]:
     if not glob:
         pp = Path(path)
         assert pp.is_dir(), pp
         path = str(pp) + os.sep + '**'
-    paths = [Path(p) for p in do_glob(path, recursive=True)]
+    paths = [Path(p) for p in do_glob(path, recursive=True)]  # noqa: PTH207
     paths = [p for p in paths if p.is_file()]
     if sort_by == "name":
         # assumes sort order is same as date order? guess it's reasonable
-        paths = list(sorted(paths))
+        paths = sorted(paths)
     else:
-        paths = list(sorted(paths, key=lambda s: s.stat().st_size))
+        paths = sorted(paths, key=lambda s: s.stat().st_size)
 
     if from_ is None:
         from_ = 0

diff --git a/src/bleanser/core/modules/extract.py b/src/bleanser/core/modules/extract.py
@@ -1,9 +1,13 @@
 from contextlib import contextmanager
 from pathlib import Path
-from typing import Iterator, Any
-
-
-from bleanser.core.processor import BaseNormaliser, unique_file_in_tempdir, sort_file, Normalised
+from typing import Any, Iterator
+
+from bleanser.core.processor import (
+    BaseNormaliser,
+    Normalised,
+    sort_file,
+    unique_file_in_tempdir,
+)
 
 
 class ExtractObjectsNormaliser(BaseNormaliser):