From b3034fa518ef2db14e94d294a21bf7ae7fe812b0 Mon Sep 17 00:00:00 2001
From: Niels Thykier <niels@thykier.net>
Date: Fri, 17 May 2024 13:41:23 +0000
Subject: [PATCH] Speed up `codespell:ignore` check by skipping the regex in
 most cases

The changes to provide a public API had some performance related costs
of about 1% runtime. There is no trivial way to offset this any
further without undermining the API we are building. However, we can
pull performance-related shenanigans to compensate for the cost
introduced.

The codespell codebase unsurprisingly spends a vast majority of its
runtime in various regex related code such as `search` and `finditer`.

The best way to optimize runtime spent in regexes is to not do a regex
in the first place, since the regex engine has a rather steep overhead
over regular string primitives (that is the cost of flexibility). If
the regex rarely matches and there is a very easy static substring
that can be used to rule out the match, then you can speed up the code
by using `substring in string` as a conditional to skip the
regex. This is assuming the regex is used enough for the performance
to matter.

An obvious choice here falls on the `codespell:ignore` regex, because
it has a very distinctive substring in the form of `codespell:ignore`,
which will rule out almost all lines that will not match.

With this little trick, runtime goes from ~5.6s to ~4.9s on the corpus
mentioned in #3419.
---
 codespell_lib/spellchecker.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/codespell_lib/spellchecker.py b/codespell_lib/spellchecker.py
index f1ad6885b6c..ac43074798f 100644
--- a/codespell_lib/spellchecker.py
+++ b/codespell_lib/spellchecker.py
@@ -109,7 +109,10 @@
 
 _builtin_default_as_tuple = tuple(_builtin_default.split(","))
 
-_inline_ignore_regex = re.compile(r"[^\w\s]\s?codespell:ignore\b(\s+(?P<words>[\w,]*))?")
+_codespell_ignore_tag = "codespell:ignore"
+_inline_ignore_regex = re.compile(
+    rf"[^\w\s]\s?{_codespell_ignore_tag}\b(\s+(?P<words>[\w,]*))?"
+)
 
 
 class UnknownBuiltinDictionaryError(ValueError):
@@ -177,6 +180,8 @@ def __init__(self) -> None:
         self.ignore_words_cased: Container[str] = frozenset()
 
     def _parse_inline_ignore(self, line: str) -> Optional[FrozenSet[str]]:
+        if _codespell_ignore_tag not in line:
+            return frozenset()
         inline_ignore_match = _inline_ignore_regex.search(line)
         if inline_ignore_match:
             words = frozenset(