From c4c357b4ba0a8a8c1720dd77809357c3041c1cf1 Mon Sep 17 00:00:00 2001 From: Robert Tonsing Date: Sun, 27 Nov 2022 22:41:48 -0600 Subject: [PATCH] Add ppcomp option to suppress word join (U+2060) (#18) Fixes issue #17 --- ppcomp/ppcomp.py | 10 ++++++++-- tests/fossilplants1.html | 4 ++-- tests/test_ppcomp.py | 8 ++++++++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/ppcomp/ppcomp.py b/ppcomp/ppcomp.py index cd8e7cf..e7c368f 100644 --- a/ppcomp/ppcomp.py +++ b/ppcomp/ppcomp.py @@ -586,13 +586,16 @@ def remove_nbspaces(self): """Remove non-breakable spaces between numbers. For instance, a text file could have 250000, and the html could have 250 000. """ - # Todo:  ,  ,  ? if self.args.suppress_nbsp_num: self.text = re.sub(r"(\d)\u00A0(\d)", r"\1\2", self.text) + def remove_wordjoin(self): + """Remove word join (NoBreak) (U+2060).""" + if self.args.suppress_word_join: + self.text = re.sub(r"\u2060", r"", self.text) + def remove_soft_hyphen(self): """Suppress shy (soft hyphen)""" - # Todo: ­, ­? 
self.text = re.sub(r"\u00AD", r"", self.text) def cleanup(self): @@ -617,6 +620,7 @@ def cleanup(self): self.remove_nbspaces() self.remove_soft_hyphen() + self.remove_wordjoin() @staticmethod def _text_transform(val, errors: list): @@ -1121,6 +1125,8 @@ def main(): help="HTML: do not use default transformation CSS") parser.add_argument('--suppress-nbsp-num', action='store_true', default=False, help="HTML: Suppress non-breakable spaces between numbers") + parser.add_argument('--suppress-word-join', action='store_true', default=False, + help="HTML: Suppress word join (NoBreak) (U+2060)") parser.add_argument('--ignore-0-space', action='store_true', default=False, help='HTML: suppress zero width space (U+200b)') parser.add_argument('--css-greek-title-plus', action='store_true', default=False, diff --git a/tests/fossilplants1.html b/tests/fossilplants1.html index b4b31b0..b029c13 100644 --- a/tests/fossilplants1.html +++ b/tests/fossilplants1.html @@ -5419,7 +5419,7 @@

CHAPTER V.

98

-

Endless examples might be quoted illustrating the absolute futility, +

Endless examples might be quoted⁠ illustrating the absolute futility, in many cases, of relying on external features even for the purpose of class distinction. An acquaintance with the general habit and appearance of only the better known members of a family, frequently @@ -5476,7 +5476,7 @@

CHAPTER V.

correspond in the character of their venation to Monocotyledonous leaves. Eryngium montanum Coult., E. Lassauxi Dcne., and other species of this genus of Umbelliferæ agree closely with - such a plant as Pandanus or other Monocotyledons; similarly + such a plant as Pandanus or other Monocotyledons⁠; similarly the long linear leaves of Richea dracophylla, R. Br., one of the Ericaceæ, are identical in form with many monocotyledonous leaves. Instances might also be quoted of monocotyledonous leaves, diff --git a/tests/test_ppcomp.py b/tests/test_ppcomp.py index af1fbda..59995d2 100644 --- a/tests/test_ppcomp.py +++ b/tests/test_ppcomp.py @@ -242,6 +242,12 @@ def test_remove_nbspaces(): assert not re.search(r"(\d)\u00A0(\d)", html_file.text) assert 0 <= html_file.text.find("2885") +def test_suppress_word_join(): + args = myargs + ['--suppress-word-join'] + html_file = PgdpFileHtml(load_args(args)) + html_file.load('fossilplants1.html') + html_file.remove_wordjoin() + assert not re.search(r"\u2060", html_file.text) def test_remove_soft_hyphen(): html_file = PgdpFileHtml(load_args(myargs)) @@ -401,6 +407,8 @@ def load_args(myargs): help="HTML: do not use default transformation CSS") parser.add_argument('--suppress-nbsp-num', action='store_true', default=False, help="HTML: Suppress non-breakable spaces between numbers") + parser.add_argument('--suppress-word-join', action='store_true', default=False, + help="HTML: Suppress word join (NoBreak) (U+2060)") parser.add_argument('--ignore-0-space', action='store_true', default=False, help='HTML: suppress zero width space (U+200b)') parser.add_argument('--css-greek-title-plus', action='store_true', default=False,