From b8d29b8c3d95dea7a6eaf662540e5a08a98b36db Mon Sep 17 00:00:00 2001 From: Torben <59419684+entorb@users.noreply.github.com> Date: Sat, 23 Nov 2024 09:07:46 +0100 Subject: [PATCH] check_chapters --- scripts/check_chapters.py | 92 ++++++++++------- scripts/check_chapters_test.py | 184 ++++++++++++++++++++++++--------- 2 files changed, 188 insertions(+), 88 deletions(-) diff --git a/scripts/check_chapters.py b/scripts/check_chapters.py index bf323664..e0e5707c 100755 --- a/scripts/check_chapters.py +++ b/scripts/check_chapters.py @@ -184,8 +184,21 @@ def fix_ellipsis(s: str) -> str: # remove all spaces around ellipsis s = re.sub(r" *… *", r"…", s) - # after punctuation: add space - s = re.sub(r"(?<=[\.\?!:,;])…", r" …", s) + if settings["lang"] != "DE": + # after punctuation: add space + s = re.sub(r"(?<=[\.\?!:,;])…", " …", s) + + # new rule for German (SYNC with fix_hyphens) + if settings["lang"] == "DE": + # before: add space if not at start of line or quote + s = re.sub(r"(?<=[^ „‚\(\{\n^])…", " …", s) + + # after: add space if not followed by punctuation + s = re.sub(r"…(?=[^ \.\?\)\}!:,;“‘\n$])", "… ", s) + + # after: …“Text -> …“ Text + s = re.sub(r"…“(?=[^\s])", r"…“ ", s) + return s @@ -300,12 +313,6 @@ def fix_quotations(s: str) -> str: # noqa: C901, PLR0912, PLR0915 if settings["lang"] == "DE": s = re.sub(r'(^|\s)"((\\|\w).*?)"', r"\1„\2“", s) - # add space between … and “ - # if settings["lang"] == "EN": - # s = re.sub(r"…“", r"… “", s) - if settings["lang"] == "DE": - s = re.sub(r"…„", r"… „", s) - # space at opening " if settings["lang"] == "EN": s = re.sub(r"“ +", r"“", s) @@ -320,9 +327,10 @@ def fix_quotations(s: str) -> str: # noqa: C901, PLR0912, PLR0915 # space between "…" and "“" # if settings["lang"] == "EN": - # s = re.sub(r"…„", r"… “", s) # rrthomas voted againt it + # s = re.sub(r"…„", r"… “", s) + # # rrthomas voted againt it if settings["lang"] == "DE": - s = re.sub(r"…„", r"… „", s) + s = re.sub("…„", "… „", s) # ” } -> ”} if settings["lang"] == "EN": @@ -355,7 +363,7 @@ def fix_quotations(s: str) -> str: # noqa: C901, PLR0912, PLR0915 if settings["lang"] == "DE": # not, this is wrong, it is correct to have „...“, # s = re.sub(r"(? str: def fix_hyphens(s: str) -> str: + # fix simple dash to em dash # --- -> em dash — s = s.replace("---", "—") + # -- -> em dash — s = s.replace("--", "—") # hyphens: (space-hyphen-space) should be "—" (em dash). # trim space around em-dash s = s.replace(" — ", "—") - # shorter dash as well + # mid dash as well s = s.replace(" – ", "—") # NOT for '— ' as in ', no— “I' # s = re.sub(r"— ", r"—", s) @@ -441,39 +451,42 @@ def fix_hyphens(s: str) -> str: # remove space before — followed by punctuation s = re.sub(r" —([,\.!\?;])", r"—\1", s) - # - at start of line - s = re.sub(r"^[\-—] *", r"—", s) - # if settings["lang"] == "EN": - # s = re.sub(r" [\-—]$", r"—", s) # rrthomas voted againt it - if settings["lang"] == "DE": - # end of line - s = re.sub(r" [\-—]$", r"—", s) - # - at end of emph - s = re.sub(r"(\s*)\-\}", r"—}\1", s) - # at start of quote - # if settings["lang"] == "EN": - # s = re.sub(r"—“", r"— “", s) # rrthomas voted againt it - if settings["lang"] == "DE": - s = re.sub(r"\s*—„", r"— „", s) - s = re.sub(r"„\s*—\s*", r"„—", s) + # mid dash is used between numbers: + # 2-4 -> 2–4 using mid length hyphen + s = re.sub(r"(\d)\-(?=\d)", r"\1–", s) - # at end of quote + # fix spaces around — if settings["lang"] == "EN": + # - at start of line + s = re.sub(r"^[\-—] *", r"—", s) + # if settings["lang"] == "EN": + # s = re.sub(r" [\-—]$", r"—", s) # rrthomas voted againt it + # - at end of emph + s = re.sub(r"(\s*)\-\}", r"—}\1", s) + # at start of quote + # if settings["lang"] == "EN": + # s = re.sub(r"—“", r"— “", s) # rrthomas voted againt it + + # at end of quote s = re.sub(r"(\s*)\-”", r"—”\1", s) - if settings["lang"] == "DE": - s = re.sub(r"(\s*)\-“", r"—“\1", s) - # space-hyphen-quotation end - if settings["lang"] == "EN": + # space-hyphen-quotation end s = re.sub(r"\s+(—”)", r"\1", s) + + # new rule for German (SYNC with fix_ellipsis) if settings["lang"] == "DE": - s = re.sub(r"\s+(—“)", r"\1", s) + # remove all spaces around hyphens + s = re.sub(r" *— *", "—", s) + + # before: add space if not at start of line or quote + s = re.sub(r"(?<=[^ „‚\(\{\n^])—", " —", s) + + # after: add space if not followed by punctuation + s = re.sub(r"—(?=[^ \.\?\)\}!:,;“‘\n$])", "— ", s) + + # after: —“Text -> —“ Text + s = re.sub(r"—“(?=[^\s])", r"—“ ", s) - # there is a shorter dash as well: - # 2-4 -> 2–4 using mid length hyphen - s = re.sub(r"(\d)\-(?=\d)", r"\1–", s) - # NOT: mid-length dash -> em dash (caution: false positives!) - # s = s.replace("–", "—") return s @@ -590,6 +603,9 @@ def fix_spell(s: str) -> str: list_of_chapter_files = get_list_of_chapter_files() + # reduce to debugging just one file + # list_of_chapter_files = (Path("chapters/hpmor-chapter-021.tex"),) + # V2: using multiprocessing # prepare num_processes = min(cpu_count(), len(list_of_chapter_files)) diff --git a/scripts/check_chapters_test.py b/scripts/check_chapters_test.py index 9dfcf444..21d5e7e8 100644 --- a/scripts/check_chapters_test.py +++ b/scripts/check_chapters_test.py @@ -15,6 +15,7 @@ fix_MrMrs, fix_numbers, fix_punctuation, + fix_quotations, fix_spaces, fix_spell, ) @@ -50,13 +51,34 @@ def test_fix_common_typos(lang: str) -> None: @pytest.mark.parametrize("lang", ["EN", "DE"]) def test_fix_ellipsis(lang: str) -> None: settings["lang"] = lang - pairs = [ - ("foo…bar", "foo…bar"), - ("foo … bar", "foo…bar"), - ("foo… bar", "foo…bar"), - ("foo …bar", "foo…bar"), - ("foo, …", "foo, …"), - ] + pairs = [] + if lang != "DE": + pairs.extend( + [ + ("foo...bar", "foo…bar"), + ("foo…bar", "foo…bar"), + ("foo … bar", "foo…bar"), + ("foo… bar", "foo…bar"), + ("foo …bar", "foo…bar"), + ("foo, …", "foo, …"), + ("foo …! bar", "foo…! bar"), + ] + ) + if lang == "DE": + pairs.extend( + [ + ("foo...bar", "foo … bar"), + ("foo…bar", "foo … bar"), + ("foo … bar", "foo … bar"), + ("foo… bar", "foo … bar"), + ("foo …bar", "foo … bar"), + ("foo, …“", "foo, …“"), + ("foo,…“", "foo, …“"), + ("foo …! bar", "foo …! bar"), + ("\\emph{…ihm", "\\emph{… ihm"), + ] + ) + checkit(fix_ellipsis, pairs) @@ -66,20 +88,20 @@ def test_fix_emph(lang: str) -> None: pairs = [ (r"That’s not \emph{true!}", r"That’s not \emph{true}!"), (r"she got \emph{magic,} can you", r"she got \emph{magic}, can you"), - ("asdf", "asdf"), + ("foo", "foo"), ] if lang == "EN": pairs.extend( [ (r"briefly. \emph{Hopeless.} Both", r"briefly. \emph{Hopeless.} Both"), - ("asdf", "asdf"), + ("foo", "foo"), ] ) elif lang == "DE": pairs.extend( [ (r"briefly. \emph{Hopeless.} Both", r"briefly. \emph{Hopeless}. Both"), - ("asdf", "asdf"), + ("foo", "foo"), ] ) checkit(fix_emph, pairs) @@ -90,26 +112,45 @@ def test_fix_hyphens(lang: str) -> None: settings["lang"] = lang pairs = [ ("2-3-4", "2–3–4"), - (" —,", "—,"), - (" —.", "—."), - (" —!", "—!"), - (" —?", "—?"), - ("— asdf", "—asdf"), - ("- asdf", "—asdf"), - ("-asdf", "—asdf"), ] + if lang != "DE": + pairs.extend( + ( + (" —,", "—,"), + (" —.", "—."), + (" —!", "—!"), + (" —?", "—?"), + ("— foo", "—foo"), + ("- foo", "—foo"), + ("-foo", "—foo"), + ) + ) if lang == "DE": pairs.extend( - [ - ("Text —", "Text—"), - ("Text—„", "Text— „"), - ("Text —„", "Text— „"), - ("Text „ —Quote", "Text „—Quote"), - ("Text „ — Quote", "Text „—Quote"), - ("Text—„— Quote", "Text— „—Quote"), - ("Text -“asdf", "Text—“ asdf"), - ("Text —“", "Text—“"), - ] + ( + ("foo - bar", "foo — bar"), + ("foo -- bar", "foo — bar"), + ("foo --- bar", "foo — bar"), + ("foo—bar", "foo — bar"), + ("foo — bar", "foo — bar"), + ("foo – bar", "foo — bar"), # mid dash + # quote start + ("foo—„", "foo — „"), + ("foo—‚", "foo — ‚"), + ("foo —„", "foo — „"), + ("foo „ —quote", "foo „— quote"), + ("foo „ — quote", "foo „— quote"), + ("foo—„— quote", "foo — „— quote"), + # quote end + ("quote —“foo", "quote —“ foo"), + ("foo —“", "foo —“"), + # emph + ("\\emph{foo—}", "\\emph{foo —}"), + ("\\emph{foo —}", "\\emph{foo —}"), + ("\\emph{foo—} bar", "\\emph{foo —} bar"), + ("foo—\\emph{bar}", "foo — \\emph{bar}"), + ("\\emph{—ihm", "\\emph{— ihm"), + ) ) checkit(fix_hyphens, pairs) @@ -120,7 +161,7 @@ def test_fix_latex(lang: str) -> None: pairs = [ ("begin at new line\\begin{em}", "begin at new line\n\\begin{em}"), ("end at new line\\end{em}", "end at new line\n\\end{em}"), - ("new line after \\\\ asdf", "new line after \\\\\nasdf"), + ("new line after \\\\ foo", "new line after \\\\\nfoo"), ("no new line after \\\\", "no new line after \\\\"), ] checkit(fix_latex, pairs) @@ -164,24 +205,10 @@ def test_fix_numbers(lang: str) -> None: settings["lang"] = lang pairs = [ ("Es ist 12:23 Uhr.", "Es ist 12:23~Uhr."), - ("asdf", "asdf"), ] checkit(fix_numbers, pairs) -@pytest.mark.parametrize("lang", ["EN", "DE"]) -def test_fix_punctuation(lang: str) -> None: - settings["lang"] = lang - pairs = [ - ("!!", "!"), - ("??", "?"), - ("! !", "!"), - ("..", "."), - (",,", ","), - ] - checkit(fix_punctuation, pairs) - - @pytest.mark.parametrize("lang", ["EN", "DE"]) def test_fix_spaces(lang: str) -> None: settings["lang"] = lang @@ -195,16 +222,71 @@ def test_fix_spaces(lang: str) -> None: checkit(fix_spaces, pairs) +@pytest.mark.parametrize("lang", ["EN", "DE"]) +def test_fix_punctuation(lang: str) -> None: + settings["lang"] = lang + pairs = [ + ("foo,, bar", "foo, bar"), + ("foo.. bar", "foo. bar"), + ("foo!! bar", "foo! bar"), + ("foo?? bar", "foo? bar"), + ("foo:: bar", "foo: bar"), + ("foo;; bar", "foo; bar"), + ] + checkit(fix_punctuation, pairs) + + +@pytest.mark.parametrize("lang", ["EN", "DE"]) +def test_fix_quotations(lang: str) -> None: + settings["lang"] = lang + if settings["lang"] == "EN": + pairs = [ + ('"foo"', "“foo”"), + ("'foo'", "‘foo’"), + (' "foo bar"', " “foo bar”"), + # space at opening " + ("“ foo ”", "“foo”"), + ("\\emph{foo} ” bar", "\\emph{foo}” bar"), + ("\\heading{“foo ”} bar", "\\heading{“foo”} bar"), + ("\\emph{“foo”} bar", "“\\emph{foo}” bar"), + ("\\emph{“ foo ”} bar", "“\\emph{foo}” bar"), + ("\\emph{foo ”} bar", "\\emph{foo}” bar"), + ("‘\\emph{foo}’", "‘foo’"), + ] + if settings["lang"] == "DE": + pairs = [ + ('"foo"', "„foo“"), + ("“foo”", "„foo“"), + ("»foo«", "„foo“"), + ("'foo'", "‚foo‘"), + ("’foo‘", "‚foo‘"), + (' "foo bar"', " „foo bar“"), + ("…„", "… „"), + ("„ foo “", "„foo“"), + ("\\heading{„foo “} bar", "\\heading{„foo“} bar"), + ("\\emph{„foo“} bar", "„\\emph{foo}“ bar"), + ("\\emph{„ foo “} bar", "„\\emph{foo}“ bar"), + ("\\emph{foo “} bar", "\\emph{foo}“ bar"), + ("foo,“ bar", "foo“, bar"), + ("‚\\emph{foo}‘", "‚foo‘"), + ("„foo,“", "„foo“,"), + ("„foo“bar", "„foo“ bar"), + # EN closing + ("„foo”", "„foo“"), + ] + checkit(fix_quotations, pairs) + + @pytest.mark.parametrize("lang", ["DE"]) def test_fix_spell(lang: str) -> None: settings["lang"] = lang pairs = [ - (r"‚Lumos‘", r"\spell{Lumos}"), - (r"„Lumos“", r"\spell{Lumos}"), - (r"„\emph{Lumos}“", r"\spell{Lumos}"), - (r"\emph{„Lumos“}", r"\spell{Lumos}"), - (r"\emph{Lumos!}", r"\spell{Lumos}"), - (r"„\spell{Lumos}“", r"\spell{Lumos}"), + ("‚Lumos‘", "\\spell{Lumos}"), + ("„Lumos“", "\\spell{Lumos}"), + ("„\\emph{Lumos}“", "\\spell{Lumos}"), + ("\\emph{„Lumos“}", "\\spell{Lumos}"), + ("\\emph{Lumos!}", "\\spell{Lumos}"), + ("„\\spell{Lumos}“", "\\spell{Lumos}"), ] checkit(fix_spell, pairs) @@ -212,9 +294,11 @@ def test_fix_spell(lang: str) -> None: def checkit(fct: Callable, pairs: list[tuple[str, str]]) -> None: for text, expected_output in pairs: # test of isolated function - assert fct(text) == expected_output, f"'{fct(text)}' != '{expected_output}'" + assert ( + fct(text) == expected_output + ), f"'{text}' -> '{fct(text)}' != '{expected_output}'" # test in complete fix_line context assert ( fix_line(text) == expected_output - ), f"'{fix_line(text)}' != '{expected_output}'" + ), f"'{text}' -> '{fix_line(text)}' != '{expected_output}' (fix_line)"