From 8d5b1176c8340412ca1a7c0ad104dd29077a4905 Mon Sep 17 00:00:00 2001
From: Torben <59419684+entorb@users.noreply.github.com>
Date: Sun, 24 Nov 2024 06:41:54 +0100
Subject: [PATCH] check_chapters V2

---
 scripts/check_chapters.py      |  88 +++++++++++-----------
 scripts/check_chapters_test.py | 133 ++++++++++++++++++++++++---------
 2 files changed, 138 insertions(+), 83 deletions(-)

diff --git a/scripts/check_chapters.py b/scripts/check_chapters.py
index fc914219..e0e5707c 100755
--- a/scripts/check_chapters.py
+++ b/scripts/check_chapters.py
@@ -186,14 +186,18 @@ def fix_ellipsis(s: str) -> str:
 
     if settings["lang"] != "DE":
         # after punctuation: add space
-        s = re.sub(r"(?<=[\.\?!:,;])…", r" …", s)
+        s = re.sub(r"(?<=[\.\?!:,;])…", " …", s)
 
-    # new rule for German
+    # new rule for German (SYNC with fix_hyphens)
     if settings["lang"] == "DE":
         # before: add space if not at start of line or quote
-        s = re.sub(r"(?<![ „‚\n^])…", r" …", s)
+        s = re.sub(r"(?<=[^ „‚\(\{\n^])…", " …", s)
+
         # after: add space if not followed by punctuation
-        s = re.sub(r"…(?![ \.\?!:,;“‘\n$])", r"… ", s)
+        s = re.sub(r"…(?=[^ \.\?\)\}!:,;“‘\n$])", "… ", s)
+
+        # after: …“Text -> …“ Text
+        s = re.sub(r"…“(?=[^\s])", r"…“ ", s)
 
     return s
 
@@ -309,12 +313,6 @@ def fix_quotations(s: str) -> str:  # noqa: C901, PLR0912, PLR0915
     if settings["lang"] == "DE":
         s = re.sub(r'(^|\s)"((\\|\w).*?)"', r"\1„\2“", s)
 
-    # add space between … and “
-    # if settings["lang"] == "EN":
-    #     s = re.sub(r"…“", r"… “", s)
-    if settings["lang"] == "DE":
-        s = re.sub(r"…„", r"… „", s)
-
     # space at opening "
     if settings["lang"] == "EN":
         s = re.sub(r"“ +", r"“", s)
@@ -329,9 +327,10 @@ def fix_quotations(s: str) -> str:  # noqa: C901, PLR0912, PLR0915
 
     # space between "…" and "“"
     # if settings["lang"] == "EN":
-    #     s = re.sub(r"…„", r"… “", s)     # rrthomas voted againt it
+    #     s = re.sub(r"…„", r"… “", s)
+    #     # rrthomas voted against it
     if settings["lang"] == "DE":
-        s = re.sub(r"…„", r"… „", s)
+        s = re.sub("…„", "… „", s)
 
     # ” } -> ”}
     if settings["lang"] == "EN":
@@ -364,7 +363,7 @@ def fix_quotations(s: str) -> str:  # noqa: C901, PLR0912, PLR0915
     if settings["lang"] == "DE":
         # not, this is wrong, it is correct to have „...“,
         # s = re.sub(r"(?<![\.,!\?;])(?<![\.,!\?;]\})“,", r",“", s)
-        s = re.sub(r"(?<![\.,!\?;]),“", r"“,", s)
+        s = re.sub(r"(?<![\.,!\?;]),“", "“,", s)
 
     # nested single quote + emph
     if settings["lang"] == "EN":
@@ -435,13 +434,15 @@ def fix_emph(s: str) -> str:
 
 
 def fix_hyphens(s: str) -> str:
+    # fix simple dash to em dash
     # --- -> em dash —
     s = s.replace("---", "—")
+    # -- -> em dash —
     s = s.replace("--", "—")
     # hyphens: (space-hyphen-space) should be "—" (em dash).
     # trim space around em-dash
     s = s.replace(" — ", "—")
-    # shorter dash as well
+    # en dash (–) as well
     s = s.replace(" – ", "—")
     # NOT for '— ' as in ', no— “I'
     # s = re.sub(r"— ", r"—", s)
@@ -450,49 +451,41 @@ def fix_hyphens(s: str) -> str:
     # remove space before — followed by punctuation
     s = re.sub(r" —([,\.!\?;])", r"—\1", s)
 
-    # - at start of line
-    s = re.sub(r"^[\-—] *", r"—", s)
-    # if settings["lang"] == "EN":
-    #     s = re.sub(r" [\-—]$", r"—", s) # rrthomas voted againt it
-    if settings["lang"] == "DE":
-        # end of line
-        s = re.sub(r" [\-—]$", r"—", s)
-    # - at end of emph
-    s = re.sub(r"(\s*)\-\}", r"—}\1", s)
-    # at start of quote
-    # if settings["lang"] == "EN":
-    #     s = re.sub(r"—“", r"— “", s) # rrthomas voted againt it
-    if settings["lang"] == "DE":
-        s = re.sub(r"\s*—„", r"— „", s)
-        s = re.sub(r"„\s*—\s*", r"„—", s)
+    # an en dash is used between numbers:
+    # 2-4 -> 2–4 (hyphen replaced by en dash)
+    s = re.sub(r"(\d)\-(?=\d)", r"\1–", s)
 
-    # at end of quote
+    # fix spaces around —
     if settings["lang"] == "EN":
+        # - at start of line
+        s = re.sub(r"^[\-—] *", r"—", s)
+        # if settings["lang"] == "EN":
+        #     s = re.sub(r" [\-—]$", r"—", s) # rrthomas voted against it
+        # - at end of emph
+        s = re.sub(r"(\s*)\-\}", r"—}\1", s)
+        # at start of quote
+        # if settings["lang"] == "EN":
+        #     s = re.sub(r"—“", r"— “", s) # rrthomas voted against it
+
+        # at end of quote
         s = re.sub(r"(\s*)\-”", r"—”\1", s)
-    if settings["lang"] == "DE":
-        s = re.sub(r"(\s*)\-“", r"—“\1", s)
 
-    # space-hyphen-quotation end
-    if settings["lang"] == "EN":
+        # space-hyphen-quotation end
         s = re.sub(r"\s+(—”)", r"\1", s)
-    if settings["lang"] == "DE":
-        s = re.sub(r"\s+(—“)", r"\1", s)
-
-    # there is a shorter dash as well:
-    # 2-4 -> 2–4 using mid length hyphen
-    s = re.sub(r"(\d)\-(?=\d)", r"\1–", s)
-    # NOT: mid-length dash ->  em dash (caution: false positives!)
-    # s = s.replace("–", "—")
 
-    # new rule for German
+    # new rule for German (SYNC with fix_ellipsis)
     if settings["lang"] == "DE":
         # remove all spaces around hyphens
-        s = re.sub(r" *— *", r"—", s)
+        s = re.sub(r" *— *", "—", s)
 
         # before: add space if not at start of line or quote
-        s = re.sub(r"(?<![ „‚\n^])—", r" —", s)
+        s = re.sub(r"(?<=[^ „‚\(\{\n^])—", " —", s)
+
         # after: add space if not followed by punctuation
-        s = re.sub(r"—(?![ \.\?!:,;“‘\n$])", r"— ", s)
+        s = re.sub(r"—(?=[^ \.\?\)\}!:,;“‘\n$])", "— ", s)
+
+        # after: —“Text -> —“ Text
+        s = re.sub(r"—“(?=[^\s])", r"—“ ", s)
 
     return s
 
@@ -610,6 +603,9 @@ def fix_spell(s: str) -> str:
 
     list_of_chapter_files = get_list_of_chapter_files()
 
+    # for debugging: restrict the run to a single file
+    # list_of_chapter_files = (Path("chapters/hpmor-chapter-021.tex"),)
+
     # V2: using multiprocessing
     # prepare
     num_processes = min(cpu_count(), len(list_of_chapter_files))
diff --git a/scripts/check_chapters_test.py b/scripts/check_chapters_test.py
index b366107b..21d5e7e8 100644
--- a/scripts/check_chapters_test.py
+++ b/scripts/check_chapters_test.py
@@ -15,6 +15,7 @@
     fix_MrMrs,
     fix_numbers,
     fix_punctuation,
+    fix_quotations,
     fix_spaces,
     fix_spell,
 )
@@ -74,6 +75,7 @@ def test_fix_ellipsis(lang: str) -> None:
                 ("foo, …“", "foo, …“"),
                 ("foo,…“", "foo, …“"),
                 ("foo …! bar", "foo …! bar"),
+                ("\\emph{…ihm", "\\emph{… ihm"),
             ]
         )
 
@@ -86,20 +88,20 @@ def test_fix_emph(lang: str) -> None:
     pairs = [
         (r"That’s not \emph{true!}", r"That’s not \emph{true}!"),
         (r"she got \emph{magic,} can you", r"she got \emph{magic}, can you"),
-        ("asdf", "asdf"),
+        ("foo", "foo"),
     ]
     if lang == "EN":
         pairs.extend(
             [
                 (r"briefly. \emph{Hopeless.} Both", r"briefly. \emph{Hopeless.} Both"),
-                ("asdf", "asdf"),
+                ("foo", "foo"),
             ]
         )
     elif lang == "DE":
         pairs.extend(
             [
                 (r"briefly. \emph{Hopeless.} Both", r"briefly. \emph{Hopeless}. Both"),
-                ("asdf", "asdf"),
+                ("foo", "foo"),
             ]
         )
     checkit(fix_emph, pairs)
@@ -118,22 +120,36 @@ def test_fix_hyphens(lang: str) -> None:
                 (" —.", "—."),
                 (" —!", "—!"),
                 (" —?", "—?"),
-                ("— asdf", "—asdf"),
-                ("- asdf", "—asdf"),
-                ("-asdf", "—asdf"),
+                ("— foo", "—foo"),
+                ("- foo", "—foo"),
+                ("-foo", "—foo"),
             )
         )
     if lang == "DE":
         pairs.extend(
             (
-                ("Text — Text", "Text — Text"),
-                ("Text—„", "Text — „"),
-                ("Text —„", "Text — „"),
-                ("Text „ —Quote", "Text „— Quote"),
-                ("Text „ — Quote", "Text „— Quote"),
-                ("Text—„— Quote", "Text — „— Quote"),
-                ("Text -“asdf", "Text —“ asdf"),
-                ("Text —“", "Text —“"),
+                ("foo - bar", "foo — bar"),
+                ("foo -- bar", "foo — bar"),
+                ("foo --- bar", "foo — bar"),
+                ("foo—bar", "foo — bar"),
+                ("foo — bar", "foo — bar"),
+                ("foo – bar", "foo — bar"),  # mid dash
+                # quote start
+                ("foo—„", "foo — „"),
+                ("foo—‚", "foo — ‚"),
+                ("foo —„", "foo — „"),
+                ("foo „ —quote", "foo „— quote"),
+                ("foo „ — quote", "foo „— quote"),
+                ("foo—„— quote", "foo — „— quote"),
+                # quote end
+                ("quote —“foo", "quote —“ foo"),
+                ("foo —“", "foo —“"),
+                # emph
+                ("\\emph{foo—}", "\\emph{foo —}"),
+                ("\\emph{foo —}", "\\emph{foo —}"),
+                ("\\emph{foo—} bar", "\\emph{foo —} bar"),
+                ("foo—\\emph{bar}", "foo — \\emph{bar}"),
+                ("\\emph{—ihm", "\\emph{— ihm"),
             )
         )
     checkit(fix_hyphens, pairs)
@@ -145,7 +161,7 @@ def test_fix_latex(lang: str) -> None:
     pairs = [
         ("begin at new line\\begin{em}", "begin at new line\n\\begin{em}"),
         ("end at new line\\end{em}", "end at new line\n\\end{em}"),
-        ("new line after \\\\ asdf", "new line after \\\\\nasdf"),
+        ("new line after \\\\ foo", "new line after \\\\\nfoo"),
         ("no new line after \\\\", "no new line after \\\\"),
     ]
     checkit(fix_latex, pairs)
@@ -189,24 +205,10 @@ def test_fix_numbers(lang: str) -> None:
     settings["lang"] = lang
     pairs = [
         ("Es ist 12:23 Uhr.", "Es ist 12:23~Uhr."),
-        ("asdf", "asdf"),
     ]
     checkit(fix_numbers, pairs)
 
 
-@pytest.mark.parametrize("lang", ["EN", "DE"])
-def test_fix_punctuation(lang: str) -> None:
-    settings["lang"] = lang
-    pairs = [
-        ("!!", "!"),
-        ("??", "?"),
-        ("! !", "!"),
-        ("..", "."),
-        (",,", ","),
-    ]
-    checkit(fix_punctuation, pairs)
-
-
 @pytest.mark.parametrize("lang", ["EN", "DE"])
 def test_fix_spaces(lang: str) -> None:
     settings["lang"] = lang
@@ -220,16 +222,71 @@ def test_fix_spaces(lang: str) -> None:
     checkit(fix_spaces, pairs)
 
 
+@pytest.mark.parametrize("lang", ["EN", "DE"])
+def test_fix_punctuation(lang: str) -> None:
+    settings["lang"] = lang
+    pairs = [
+        ("foo,, bar", "foo, bar"),
+        ("foo.. bar", "foo. bar"),
+        ("foo!! bar", "foo! bar"),
+        ("foo?? bar", "foo? bar"),
+        ("foo:: bar", "foo: bar"),
+        ("foo;; bar", "foo; bar"),
+    ]
+    checkit(fix_punctuation, pairs)
+
+
+@pytest.mark.parametrize("lang", ["EN", "DE"])
+def test_fix_quotations(lang: str) -> None:
+    settings["lang"] = lang
+    if settings["lang"] == "EN":
+        pairs = [
+            ('"foo"', "“foo”"),
+            ("'foo'", "‘foo’"),
+            (' "foo bar"', " “foo bar”"),
+            # space at opening "
+            ("“ foo ”", "“foo”"),
+            ("\\emph{foo} ” bar", "\\emph{foo}” bar"),
+            ("\\heading{“foo ”} bar", "\\heading{“foo”} bar"),
+            ("\\emph{“foo”} bar", "“\\emph{foo}” bar"),
+            ("\\emph{“ foo ”} bar", "“\\emph{foo}” bar"),
+            ("\\emph{foo ”} bar", "\\emph{foo}” bar"),
+            ("‘\\emph{foo}’", "‘foo’"),
+        ]
+    if settings["lang"] == "DE":
+        pairs = [
+            ('"foo"', "„foo“"),
+            ("“foo”", "„foo“"),
+            ("»foo«", "„foo“"),
+            ("'foo'", "‚foo‘"),
+            ("’foo‘", "‚foo‘"),
+            (' "foo bar"', " „foo bar“"),
+            ("…„", "… „"),
+            ("„ foo “", "„foo“"),
+            ("\\heading{„foo “} bar", "\\heading{„foo“} bar"),
+            ("\\emph{„foo“} bar", "„\\emph{foo}“ bar"),
+            ("\\emph{„ foo “} bar", "„\\emph{foo}“ bar"),
+            ("\\emph{foo “} bar", "\\emph{foo}“ bar"),
+            ("foo,“ bar", "foo“, bar"),
+            ("‚\\emph{foo}‘", "‚foo‘"),
+            ("„foo,“", "„foo“,"),
+            ("„foo“bar", "„foo“ bar"),
+            # EN closing
+            ("„foo”", "„foo“"),
+        ]
+    checkit(fix_quotations, pairs)
+
+
 @pytest.mark.parametrize("lang", ["DE"])
 def test_fix_spell(lang: str) -> None:
     settings["lang"] = lang
     pairs = [
-        (r"‚Lumos‘", r"\spell{Lumos}"),
-        (r"„Lumos“", r"\spell{Lumos}"),
-        (r"„\emph{Lumos}“", r"\spell{Lumos}"),
-        (r"\emph{„Lumos“}", r"\spell{Lumos}"),
-        (r"\emph{Lumos!}", r"\spell{Lumos}"),
-        (r"„\spell{Lumos}“", r"\spell{Lumos}"),
+        ("‚Lumos‘", "\\spell{Lumos}"),
+        ("„Lumos“", "\\spell{Lumos}"),
+        ("„\\emph{Lumos}“", "\\spell{Lumos}"),
+        ("\\emph{„Lumos“}", "\\spell{Lumos}"),
+        ("\\emph{Lumos!}", "\\spell{Lumos}"),
+        ("„\\spell{Lumos}“", "\\spell{Lumos}"),
     ]
     checkit(fix_spell, pairs)
 
@@ -237,9 +294,11 @@ def test_fix_spell(lang: str) -> None:
 def checkit(fct: Callable, pairs: list[tuple[str, str]]) -> None:
     for text, expected_output in pairs:
         # test of isolated function
-        assert fct(text) == expected_output, f"'{fct(text)}' != '{expected_output}'"
+        assert (
+            fct(text) == expected_output
+        ), f"'{text}' -> '{fct(text)}' != '{expected_output}'"
 
         # test in complete fix_line context
         assert (
             fix_line(text) == expected_output
-        ), f"'{fix_line(text)}' != '{expected_output}' (fix_line)"
+        ), f"'{text}' -> '{fix_line(text)}' != '{expected_output}' (fix_line)"