From 8d5b1176c8340412ca1a7c0ad104dd29077a4905 Mon Sep 17 00:00:00 2001 From: Torben <59419684+entorb@users.noreply.github.com> Date: Sun, 24 Nov 2024 06:41:54 +0100 Subject: [PATCH] check_chapters V2 --- scripts/check_chapters.py | 88 +++++++++++----------- scripts/check_chapters_test.py | 133 ++++++++++++++++++++++++--------- 2 files changed, 138 insertions(+), 83 deletions(-) diff --git a/scripts/check_chapters.py b/scripts/check_chapters.py index fc914219..e0e5707c 100755 --- a/scripts/check_chapters.py +++ b/scripts/check_chapters.py @@ -186,14 +186,18 @@ def fix_ellipsis(s: str) -> str: if settings["lang"] != "DE": # after punctuation: add space - s = re.sub(r"(?<=[\.\?!:,;])…", r" …", s) + s = re.sub(r"(?<=[\.\?!:,;])…", " …", s) - # new rule for German + # new rule for German (SYNC with fix_hyphens) if settings["lang"] == "DE": # before: add space if not at start of line or quote - s = re.sub(r"(? …“ Text + s = re.sub(r"…“(?=[^\s])", r"…“ ", s) return s @@ -309,12 +313,6 @@ def fix_quotations(s: str) -> str: # noqa: C901, PLR0912, PLR0915 if settings["lang"] == "DE": s = re.sub(r'(^|\s)"((\\|\w).*?)"', r"\1„\2“", s) - # add space between … and “ - # if settings["lang"] == "EN": - # s = re.sub(r"…“", r"… “", s) - if settings["lang"] == "DE": - s = re.sub(r"…„", r"… „", s) - # space at opening " if settings["lang"] == "EN": s = re.sub(r"“ +", r"“", s) @@ -329,9 +327,10 @@ def fix_quotations(s: str) -> str: # noqa: C901, PLR0912, PLR0915 # space between "…" and "“" # if settings["lang"] == "EN": - # s = re.sub(r"…„", r"… “", s) # rrthomas voted againt it + # s = re.sub(r"…„", r"… “", s) + # # rrthomas voted againt it if settings["lang"] == "DE": - s = re.sub(r"…„", r"… „", s) + s = re.sub("…„", "… „", s) # ” } -> ”} if settings["lang"] == "EN": @@ -364,7 +363,7 @@ def fix_quotations(s: str) -> str: # noqa: C901, PLR0912, PLR0915 if settings["lang"] == "DE": # not, this is wrong, it is correct to have „...“, # s = re.sub(r"(? str: def fix_hyphens(s: str) -> str: + # fix simple dash to em dash # --- -> em dash — s = s.replace("---", "—") + # -- -> em dash — s = s.replace("--", "—") # hyphens: (space-hyphen-space) should be "—" (em dash). # trim space around em-dash s = s.replace(" — ", "—") - # shorter dash as well + # mid dash as well s = s.replace(" – ", "—") # NOT for '— ' as in ', no— “I' # s = re.sub(r"— ", r"—", s) @@ -450,49 +451,41 @@ def fix_hyphens(s: str) -> str: # remove space before — followed by punctuation s = re.sub(r" —([,\.!\?;])", r"—\1", s) - # - at start of line - s = re.sub(r"^[\-—] *", r"—", s) - # if settings["lang"] == "EN": - # s = re.sub(r" [\-—]$", r"—", s) # rrthomas voted againt it - if settings["lang"] == "DE": - # end of line - s = re.sub(r" [\-—]$", r"—", s) - # - at end of emph - s = re.sub(r"(\s*)\-\}", r"—}\1", s) - # at start of quote - # if settings["lang"] == "EN": - # s = re.sub(r"—“", r"— “", s) # rrthomas voted againt it - if settings["lang"] == "DE": - s = re.sub(r"\s*—„", r"— „", s) - s = re.sub(r"„\s*—\s*", r"„—", s) + # mid dash is used between numbers: + # 2-4 -> 2–4 using mid length hyphen + s = re.sub(r"(\d)\-(?=\d)", r"\1–", s) - # at end of quote + # fix spaces around — if settings["lang"] == "EN": + # - at start of line + s = re.sub(r"^[\-—] *", r"—", s) + # if settings["lang"] == "EN": + # s = re.sub(r" [\-—]$", r"—", s) # rrthomas voted againt it + # - at end of emph + s = re.sub(r"(\s*)\-\}", r"—}\1", s) + # at start of quote + # if settings["lang"] == "EN": + # s = re.sub(r"—“", r"— “", s) # rrthomas voted againt it + + # at end of quote s = re.sub(r"(\s*)\-”", r"—”\1", s) - if settings["lang"] == "DE": - s = re.sub(r"(\s*)\-“", r"—“\1", s) - # space-hyphen-quotation end - if settings["lang"] == "EN": + # space-hyphen-quotation end s = re.sub(r"\s+(—”)", r"\1", s) - if settings["lang"] == "DE": - s = re.sub(r"\s+(—“)", r"\1", s) - - # there is a shorter dash as well: - # 2-4 -> 2–4 using mid length hyphen - s = re.sub(r"(\d)\-(?=\d)", r"\1–", s) - # NOT: mid-length dash -> em dash (caution: false positives!) - # s = s.replace("–", "—") - # new rule for German + # new rule for German (SYNC with fix_ellipsis) if settings["lang"] == "DE": # remove all spaces around hyphens - s = re.sub(r" *— *", r"—", s) + s = re.sub(r" *— *", "—", s) # before: add space if not at start of line or quote - s = re.sub(r"(? —“ Text + s = re.sub(r"—“(?=[^\s])", r"—“ ", s) return s @@ -610,6 +603,9 @@ def fix_spell(s: str) -> str: list_of_chapter_files = get_list_of_chapter_files() + # reduce to debugging just one file + # list_of_chapter_files = (Path("chapters/hpmor-chapter-021.tex"),) + # V2: using multiprocessing # prepare num_processes = min(cpu_count(), len(list_of_chapter_files)) diff --git a/scripts/check_chapters_test.py b/scripts/check_chapters_test.py index b366107b..21d5e7e8 100644 --- a/scripts/check_chapters_test.py +++ b/scripts/check_chapters_test.py @@ -15,6 +15,7 @@ fix_MrMrs, fix_numbers, fix_punctuation, + fix_quotations, fix_spaces, fix_spell, ) @@ -74,6 +75,7 @@ def test_fix_ellipsis(lang: str) -> None: ("foo, …“", "foo, …“"), ("foo,…“", "foo, …“"), ("foo …! bar", "foo …! bar"), + ("\\emph{…ihm", "\\emph{… ihm"), ] ) @@ -86,20 +88,20 @@ def test_fix_emph(lang: str) -> None: pairs = [ (r"That’s not \emph{true!}", r"That’s not \emph{true}!"), (r"she got \emph{magic,} can you", r"she got \emph{magic}, can you"), - ("asdf", "asdf"), + ("foo", "foo"), ] if lang == "EN": pairs.extend( [ (r"briefly. \emph{Hopeless.} Both", r"briefly. \emph{Hopeless.} Both"), - ("asdf", "asdf"), + ("foo", "foo"), ] ) elif lang == "DE": pairs.extend( [ (r"briefly. \emph{Hopeless.} Both", r"briefly. \emph{Hopeless}. Both"), - ("asdf", "asdf"), + ("foo", "foo"), ] ) checkit(fix_emph, pairs) @@ -118,22 +120,36 @@ def test_fix_hyphens(lang: str) -> None: (" —.", "—."), (" —!", "—!"), (" —?", "—?"), - ("— asdf", "—asdf"), - ("- asdf", "—asdf"), - ("-asdf", "—asdf"), + ("— foo", "—foo"), + ("- foo", "—foo"), + ("-foo", "—foo"), ) ) if lang == "DE": pairs.extend( ( - ("Text — Text", "Text — Text"), - ("Text—„", "Text — „"), - ("Text —„", "Text — „"), - ("Text „ —Quote", "Text „— Quote"), - ("Text „ — Quote", "Text „— Quote"), - ("Text—„— Quote", "Text — „— Quote"), - ("Text -“asdf", "Text —“ asdf"), - ("Text —“", "Text —“"), + ("foo - bar", "foo — bar"), + ("foo -- bar", "foo — bar"), + ("foo --- bar", "foo — bar"), + ("foo—bar", "foo — bar"), + ("foo — bar", "foo — bar"), + ("foo – bar", "foo — bar"), # mid dash + # quote start + ("foo—„", "foo — „"), + ("foo—‚", "foo — ‚"), + ("foo —„", "foo — „"), + ("foo „ —quote", "foo „— quote"), + ("foo „ — quote", "foo „— quote"), + ("foo—„— quote", "foo — „— quote"), + # quote end + ("quote —“foo", "quote —“ foo"), + ("foo —“", "foo —“"), + # emph + ("\\emph{foo—}", "\\emph{foo —}"), + ("\\emph{foo —}", "\\emph{foo —}"), + ("\\emph{foo—} bar", "\\emph{foo —} bar"), + ("foo—\\emph{bar}", "foo — \\emph{bar}"), + ("\\emph{—ihm", "\\emph{— ihm"), ) ) checkit(fix_hyphens, pairs) @@ -145,7 +161,7 @@ def test_fix_latex(lang: str) -> None: pairs = [ ("begin at new line\\begin{em}", "begin at new line\n\\begin{em}"), ("end at new line\\end{em}", "end at new line\n\\end{em}"), - ("new line after \\\\ asdf", "new line after \\\\\nasdf"), + ("new line after \\\\ foo", "new line after \\\\\nfoo"), ("no new line after \\\\", "no new line after \\\\"), ] checkit(fix_latex, pairs) @@ -189,24 +205,10 @@ def test_fix_numbers(lang: str) -> None: settings["lang"] = lang pairs = [ ("Es ist 12:23 Uhr.", "Es ist 12:23~Uhr."), - ("asdf", "asdf"), ] checkit(fix_numbers, pairs) -@pytest.mark.parametrize("lang", ["EN", "DE"]) -def test_fix_punctuation(lang: str) -> None: - settings["lang"] = lang - pairs = [ - ("!!", "!"), - ("??", "?"), - ("! !", "!"), - ("..", "."), - (",,", ","), - ] - checkit(fix_punctuation, pairs) - - @pytest.mark.parametrize("lang", ["EN", "DE"]) def test_fix_spaces(lang: str) -> None: settings["lang"] = lang @@ -220,16 +222,71 @@ def test_fix_spaces(lang: str) -> None: checkit(fix_spaces, pairs) +@pytest.mark.parametrize("lang", ["EN", "DE"]) +def test_fix_punctuation(lang: str) -> None: + settings["lang"] = lang + pairs = [ + ("foo,, bar", "foo, bar"), + ("foo.. bar", "foo. bar"), + ("foo!! bar", "foo! bar"), + ("foo?? bar", "foo? bar"), + ("foo:: bar", "foo: bar"), + ("foo;; bar", "foo; bar"), + ] + checkit(fix_punctuation, pairs) + + +@pytest.mark.parametrize("lang", ["EN", "DE"]) +def test_fix_quotations(lang: str) -> None: + settings["lang"] = lang + if settings["lang"] == "EN": + pairs = [ + ('"foo"', "“foo”"), + ("'foo'", "‘foo’"), + (' "foo bar"', " “foo bar”"), + # space at opening " + ("“ foo ”", "“foo”"), + ("\\emph{foo} ” bar", "\\emph{foo}” bar"), + ("\\heading{“foo ”} bar", "\\heading{“foo”} bar"), + ("\\emph{“foo”} bar", "“\\emph{foo}” bar"), + ("\\emph{“ foo ”} bar", "“\\emph{foo}” bar"), + ("\\emph{foo ”} bar", "\\emph{foo}” bar"), + ("‘\\emph{foo}’", "‘foo’"), + ] + if settings["lang"] == "DE": + pairs = [ + ('"foo"', "„foo“"), + ("“foo”", "„foo“"), + ("»foo«", "„foo“"), + ("'foo'", "‚foo‘"), + ("’foo‘", "‚foo‘"), + (' "foo bar"', " „foo bar“"), + ("…„", "… „"), + ("„ foo “", "„foo“"), + ("\\heading{„foo “} bar", "\\heading{„foo“} bar"), + ("\\emph{„foo“} bar", "„\\emph{foo}“ bar"), + ("\\emph{„ foo “} bar", "„\\emph{foo}“ bar"), + ("\\emph{foo “} bar", "\\emph{foo}“ bar"), + ("foo,“ bar", "foo“, bar"), + ("‚\\emph{foo}‘", "‚foo‘"), + ("„foo,“", "„foo“,"), + ("„foo“bar", "„foo“ bar"), + # EN closing + ("„foo”", "„foo“"), + ] + checkit(fix_quotations, pairs) + + @pytest.mark.parametrize("lang", ["DE"]) def test_fix_spell(lang: str) -> None: settings["lang"] = lang pairs = [ - (r"‚Lumos‘", r"\spell{Lumos}"), - (r"„Lumos“", r"\spell{Lumos}"), - (r"„\emph{Lumos}“", r"\spell{Lumos}"), - (r"\emph{„Lumos“}", r"\spell{Lumos}"), - (r"\emph{Lumos!}", r"\spell{Lumos}"), - (r"„\spell{Lumos}“", r"\spell{Lumos}"), + ("‚Lumos‘", "\\spell{Lumos}"), + ("„Lumos“", "\\spell{Lumos}"), + ("„\\emph{Lumos}“", "\\spell{Lumos}"), + ("\\emph{„Lumos“}", "\\spell{Lumos}"), + ("\\emph{Lumos!}", "\\spell{Lumos}"), + ("„\\spell{Lumos}“", "\\spell{Lumos}"), ] checkit(fix_spell, pairs) @@ -237,9 +294,11 @@ def test_fix_spell(lang: str) -> None: def checkit(fct: Callable, pairs: list[tuple[str, str]]) -> None: for text, expected_output in pairs: # test of isolated function - assert fct(text) == expected_output, f"'{fct(text)}' != '{expected_output}'" + assert ( + fct(text) == expected_output + ), f"'{text}' -> '{fct(text)}' != '{expected_output}'" # test in complete fix_line context assert ( fix_line(text) == expected_output - ), f"'{fix_line(text)}' != '{expected_output}' (fix_line)" + ), f"'{text}' -> '{fix_line(text)}' != '{expected_output}' (fix_line)"