Skip to content

Commit

Permalink
check_chapters V2
Browse files Browse the repository at this point in the history
  • Loading branch information
entorb committed Nov 24, 2024
1 parent a3def26 commit 8d5b117
Show file tree
Hide file tree
Showing 2 changed files with 138 additions and 83 deletions.
88 changes: 42 additions & 46 deletions scripts/check_chapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,14 +186,18 @@ def fix_ellipsis(s: str) -> str:

if settings["lang"] != "DE":
# after punctuation: add space
s = re.sub(r"(?<=[\.\?!:,;])…", r" …", s)
s = re.sub(r"(?<=[\.\?!:,;])…", " …", s)

# new rule for German
# new rule for German (SYNC with fix_hyphens)
if settings["lang"] == "DE":
# before: add space if not at start of line or quote
s = re.sub(r"(?<![ „‚\n^])…", r" …", s)
s = re.sub(r"(?<=[^ „‚\(\{\n^])…", " …", s)

# after: add space if not followed by punctuation
s = re.sub(r"…(?![ \.\?!:,;“‘\n$])", r"… ", s)
s = re.sub(r"…(?=[^ \.\?\)\}!:,;“‘\n$])", "… ", s)

# after: …“Text -> …“ Text
s = re.sub(r"…“(?=[^\s])", r"…“ ", s)

return s

Expand Down Expand Up @@ -309,12 +313,6 @@ def fix_quotations(s: str) -> str: # noqa: C901, PLR0912, PLR0915
if settings["lang"] == "DE":
s = re.sub(r'(^|\s)"((\\|\w).*?)"', r"\1„\2“", s)

# add space between … and “
# if settings["lang"] == "EN":
# s = re.sub(r"…“", r"… “", s)
if settings["lang"] == "DE":
s = re.sub(r"…„", r"… „", s)

# space at opening "
if settings["lang"] == "EN":
s = re.sub(r"“ +", r"“", s)
Expand All @@ -329,9 +327,10 @@ def fix_quotations(s: str) -> str: # noqa: C901, PLR0912, PLR0915

# space between "…" and "“"
# if settings["lang"] == "EN":
# s = re.sub(r"…„", r"… “", s) # rrthomas voted against it
# s = re.sub(r"…„", r"… “", s)
# # rrthomas voted against it
if settings["lang"] == "DE":
s = re.sub(r"…„", r"… „", s)
s = re.sub("…„", "… „", s)

# ” } -> ”}
if settings["lang"] == "EN":
Expand Down Expand Up @@ -364,7 +363,7 @@ def fix_quotations(s: str) -> str: # noqa: C901, PLR0912, PLR0915
if settings["lang"] == "DE":
# no, this is wrong: it is correct to have „...“,
# s = re.sub(r"(?<![\.,!\?;])(?<![\.,!\?;]\})“,", r",“", s)
s = re.sub(r"(?<![\.,!\?;]),“", r"“,", s)
s = re.sub(r"(?<![\.,!\?;]),“", "“,", s)

# nested single quote + emph
if settings["lang"] == "EN":
Expand Down Expand Up @@ -435,13 +434,15 @@ def fix_emph(s: str) -> str:


def fix_hyphens(s: str) -> str:
# fix simple dash to em dash
# --- -> em dash —
s = s.replace("---", "—")
# -- -> em dash —
s = s.replace("--", "—")
# hyphens: (space-hyphen-space) should be "—" (em dash).
# trim space around em-dash
s = s.replace(" — ", "—")
# shorter dash as well
# mid dash as well
s = s.replace(" – ", "—")
# NOT for '— ' as in ', no— “I'
# s = re.sub(r"— ", r"—", s)
Expand All @@ -450,49 +451,41 @@ def fix_hyphens(s: str) -> str:
# remove space before — followed by punctuation
s = re.sub(r" —([,\.!\?;])", r"—\1", s)

# - at start of line
s = re.sub(r"^[\-—] *", r"—", s)
# if settings["lang"] == "EN":
# s = re.sub(r" [\-—]$", r"—", s) # rrthomas voted against it
if settings["lang"] == "DE":
# end of line
s = re.sub(r" [\-—]$", r"—", s)
# - at end of emph
s = re.sub(r"(\s*)\-\}", r"—}\1", s)
# at start of quote
# if settings["lang"] == "EN":
# s = re.sub(r"—“", r"— “", s) # rrthomas voted against it
if settings["lang"] == "DE":
s = re.sub(r"\s*—„", r"— „", s)
s = re.sub(r"„\s*—\s*", r"„—", s)
# mid dash is used between numbers:
# 2-4 -> 2–4 using mid length hyphen
s = re.sub(r"(\d)\-(?=\d)", r"\1–", s)

# at end of quote
# fix spaces around —
if settings["lang"] == "EN":
# - at start of line
s = re.sub(r"^[\-—] *", r"—", s)
# if settings["lang"] == "EN":
# s = re.sub(r" [\-—]$", r"—", s) # rrthomas voted against it
# - at end of emph
s = re.sub(r"(\s*)\-\}", r"—}\1", s)
# at start of quote
# if settings["lang"] == "EN":
# s = re.sub(r"—“", r"— “", s) # rrthomas voted against it

# at end of quote
s = re.sub(r"(\s*)\-”", r"—”\1", s)
if settings["lang"] == "DE":
s = re.sub(r"(\s*)\-“", r"—“\1", s)

# space-hyphen-quotation end
if settings["lang"] == "EN":
# space-hyphen-quotation end
s = re.sub(r"\s+(—”)", r"\1", s)
if settings["lang"] == "DE":
s = re.sub(r"\s+(—“)", r"\1", s)

# there is a shorter dash as well:
# 2-4 -> 2–4 using mid length hyphen
s = re.sub(r"(\d)\-(?=\d)", r"\1–", s)
# NOT: mid-length dash -> em dash (caution: false positives!)
# s = s.replace("–", "—")

# new rule for German
# new rule for German (SYNC with fix_ellipsis)
if settings["lang"] == "DE":
# remove all spaces around hyphens
s = re.sub(r" *— *", r"—", s)
s = re.sub(r" *— *", "—", s)

# before: add space if not at start of line or quote
s = re.sub(r"(?<![ „‚\n^])—", r" —", s)
s = re.sub(r"(?<=[^ „‚\(\{\n^])—", " —", s)

# after: add space if not followed by punctuation
s = re.sub(r"—(?![ \.\?!:,;“‘\n$])", r"— ", s)
s = re.sub(r"—(?=[^ \.\?\)\}!:,;“‘\n$])", "— ", s)

# after: —“Text -> —“ Text
s = re.sub(r"—“(?=[^\s])", r"—“ ", s)

return s

Expand Down Expand Up @@ -610,6 +603,9 @@ def fix_spell(s: str) -> str:

list_of_chapter_files = get_list_of_chapter_files()

# reduce to debugging just one file
# list_of_chapter_files = (Path("chapters/hpmor-chapter-021.tex"),)

# V2: using multiprocessing
# prepare
num_processes = min(cpu_count(), len(list_of_chapter_files))
Expand Down
133 changes: 96 additions & 37 deletions scripts/check_chapters_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
fix_MrMrs,
fix_numbers,
fix_punctuation,
fix_quotations,
fix_spaces,
fix_spell,
)
Expand Down Expand Up @@ -74,6 +75,7 @@ def test_fix_ellipsis(lang: str) -> None:
("foo, …“", "foo, …“"),
("foo,…“", "foo, …“"),
("foo …! bar", "foo …! bar"),
("\\emph{…ihm", "\\emph{… ihm"),
]
)

Expand All @@ -86,20 +88,20 @@ def test_fix_emph(lang: str) -> None:
pairs = [
(r"That’s not \emph{true!}", r"That’s not \emph{true}!"),
(r"she got \emph{magic,} can you", r"she got \emph{magic}, can you"),
("asdf", "asdf"),
("foo", "foo"),
]
if lang == "EN":
pairs.extend(
[
(r"briefly. \emph{Hopeless.} Both", r"briefly. \emph{Hopeless.} Both"),
("asdf", "asdf"),
("foo", "foo"),
]
)
elif lang == "DE":
pairs.extend(
[
(r"briefly. \emph{Hopeless.} Both", r"briefly. \emph{Hopeless}. Both"),
("asdf", "asdf"),
("foo", "foo"),
]
)
checkit(fix_emph, pairs)
Expand All @@ -118,22 +120,36 @@ def test_fix_hyphens(lang: str) -> None:
(" —.", "—."),
(" —!", "—!"),
(" —?", "—?"),
("— asdf", "—asdf"),
("- asdf", "—asdf"),
("-asdf", "—asdf"),
("— foo", "—foo"),
("- foo", "—foo"),
("-foo", "—foo"),
)
)
if lang == "DE":
pairs.extend(
(
("Text — Text", "Text — Text"),
("Text—„", "Text — „"),
("Text —„", "Text — „"),
("Text „ —Quote", "Text „— Quote"),
("Text „ — Quote", "Text „— Quote"),
("Text—„— Quote", "Text — „— Quote"),
("Text -“asdf", "Text —“ asdf"),
("Text —“", "Text —“"),
("foo - bar", "foo — bar"),
("foo -- bar", "foo — bar"),
("foo --- bar", "foo — bar"),
("foo—bar", "foo — bar"),
("foo — bar", "foo — bar"),
("foo – bar", "foo — bar"), # mid dash
# quote start
("foo—„", "foo — „"),
("foo—‚", "foo — ‚"),
("foo —„", "foo — „"),
("foo „ —quote", "foo „— quote"),
("foo „ — quote", "foo „— quote"),
("foo—„— quote", "foo — „— quote"),
# quote end
("quote —“foo", "quote —“ foo"),
("foo —“", "foo —“"),
# emph
("\\emph{foo—}", "\\emph{foo —}"),
("\\emph{foo —}", "\\emph{foo —}"),
("\\emph{foo—} bar", "\\emph{foo —} bar"),
("foo—\\emph{bar}", "foo — \\emph{bar}"),
("\\emph{—ihm", "\\emph{— ihm"),
)
)
checkit(fix_hyphens, pairs)
Expand All @@ -145,7 +161,7 @@ def test_fix_latex(lang: str) -> None:
pairs = [
("begin at new line\\begin{em}", "begin at new line\n\\begin{em}"),
("end at new line\\end{em}", "end at new line\n\\end{em}"),
("new line after \\\\ asdf", "new line after \\\\\nasdf"),
("new line after \\\\ foo", "new line after \\\\\nfoo"),
("no new line after \\\\", "no new line after \\\\"),
]
checkit(fix_latex, pairs)
Expand Down Expand Up @@ -189,24 +205,10 @@ def test_fix_numbers(lang: str) -> None:
settings["lang"] = lang
pairs = [
("Es ist 12:23 Uhr.", "Es ist 12:23~Uhr."),
("asdf", "asdf"),
]
checkit(fix_numbers, pairs)


@pytest.mark.parametrize("lang", ["EN", "DE"])
def test_fix_punctuation(lang: str) -> None:
settings["lang"] = lang
pairs = [
("!!", "!"),
("??", "?"),
("! !", "!"),
("..", "."),
(",,", ","),
]
checkit(fix_punctuation, pairs)


@pytest.mark.parametrize("lang", ["EN", "DE"])
def test_fix_spaces(lang: str) -> None:
settings["lang"] = lang
Expand All @@ -220,26 +222,83 @@ def test_fix_spaces(lang: str) -> None:
checkit(fix_spaces, pairs)


@pytest.mark.parametrize("lang", ["EN", "DE"])
def test_fix_punctuation(lang: str) -> None:
    """Check fix_punctuation on doubled punctuation marks for each language.

    Every case has the shape ``foo<mark><mark> bar`` and is expected to come
    back as ``foo<mark> bar``.
    """
    settings["lang"] = lang
    # same order as the hand-written list: , . ! ? : ;
    cases = [(f"foo{mark}{mark} bar", f"foo{mark} bar") for mark in ",.!?:;"]
    checkit(fix_punctuation, cases)


@pytest.mark.parametrize("lang", ["EN", "DE"])
def test_fix_quotations(lang: str) -> None:
    """Check fix_quotations against language-specific (input, expected) pairs.

    The active language is written to ``settings["lang"]`` before the cases
    for that language are selected and handed to ``checkit``.
    """
    settings["lang"] = lang
    if lang == "EN":
        cases = [
            ('"foo"', "“foo”"),
            ("'foo'", "‘foo’"),
            (' "foo bar"', " “foo bar”"),
            # space at opening "
            ("“ foo ”", "“foo”"),
            ("\\emph{foo} ” bar", "\\emph{foo}” bar"),
            ("\\heading{“foo ”} bar", "\\heading{“foo”} bar"),
            ("\\emph{“foo”} bar", "“\\emph{foo}” bar"),
            ("\\emph{“ foo ”} bar", "“\\emph{foo}” bar"),
            ("\\emph{foo ”} bar", "\\emph{foo}” bar"),
            ("‘\\emph{foo}’", "‘foo’"),
        ]
    elif lang == "DE":
        cases = [
            ('"foo"', "„foo“"),
            ("“foo”", "„foo“"),
            ("»foo«", "„foo“"),
            ("'foo'", "‚foo‘"),
            ("’foo‘", "‚foo‘"),
            (' "foo bar"', " „foo bar“"),
            ("…„", "… „"),
            ("„ foo “", "„foo“"),
            ("\\heading{„foo “} bar", "\\heading{„foo“} bar"),
            ("\\emph{„foo“} bar", "„\\emph{foo}“ bar"),
            ("\\emph{„ foo “} bar", "„\\emph{foo}“ bar"),
            ("\\emph{foo “} bar", "\\emph{foo}“ bar"),
            ("foo,“ bar", "foo“, bar"),
            ("‚\\emph{foo}‘", "‚foo‘"),
            ("„foo,“", "„foo“,"),
            ("„foo“bar", "„foo“ bar"),
            # EN closing
            ("„foo”", "„foo“"),
        ]
    checkit(fix_quotations, cases)


@pytest.mark.parametrize("lang", ["DE"])
def test_fix_spell(lang: str) -> None:
settings["lang"] = lang
pairs = [
(r"‚Lumos‘", r"\spell{Lumos}"),
(r"„Lumos“", r"\spell{Lumos}"),
(r"„\emph{Lumos}“", r"\spell{Lumos}"),
(r"\emph{„Lumos“}", r"\spell{Lumos}"),
(r"\emph{Lumos!}", r"\spell{Lumos}"),
(r"„\spell{Lumos}“", r"\spell{Lumos}"),
("‚Lumos‘", "\\spell{Lumos}"),
("„Lumos“", "\\spell{Lumos}"),
("„\\emph{Lumos}“", "\\spell{Lumos}"),
("\\emph{„Lumos“}", "\\spell{Lumos}"),
("\\emph{Lumos!}", "\\spell{Lumos}"),
("„\\spell{Lumos}“", "\\spell{Lumos}"),
]
checkit(fix_spell, pairs)


def checkit(fct: Callable, pairs: list[tuple[str, str]]) -> None:
for text, expected_output in pairs:
# test of isolated function
assert fct(text) == expected_output, f"'{fct(text)}' != '{expected_output}'"
assert (
fct(text) == expected_output
), f"'{text}' -> '{fct(text)}' != '{expected_output}'"

# test in complete fix_line context
assert (
fix_line(text) == expected_output
), f"'{fix_line(text)}' != '{expected_output}' (fix_line)"
), f"'{text}' -> '{fix_line(text)}' != '{expected_output}' (fix_line)"

0 comments on commit 8d5b117

Please sign in to comment.