Skip to content

Commit

Permalink
Add ppcomp option to suppress word join (U+2060) (#18)
Browse files Browse the repository at this point in the history
Fixes issue #17
  • Loading branch information
rtonsing authored Nov 28, 2022
1 parent fdde140 commit c4c357b
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 4 deletions.
10 changes: 8 additions & 2 deletions ppcomp/ppcomp.py
Original file line number Diff line number Diff line change
Expand Up @@ -586,13 +586,16 @@ def remove_nbspaces(self):
"""Remove non-breakable spaces between numbers. For instance, a
text file could have 250000, and the html could have 250 000.
"""
# Todo:  ,  ,  ?
if self.args.suppress_nbsp_num:
self.text = re.sub(r"(\d)\u00A0(\d)", r"\1\2", self.text)

def remove_wordjoin(self):
    """Strip word joiner characters (U+2060) from ``self.text``.

    Nothing happens unless ``self.args.suppress_word_join`` is truthy;
    in that case every U+2060 in ``self.text`` is deleted and the result
    is stored back on ``self.text``.
    """
    # Opt-in behaviour: leave the text untouched when the flag is off.
    if not self.args.suppress_word_join:
        return
    without_joiners = re.sub(r"\u2060", r"", self.text)
    self.text = without_joiners

def remove_soft_hyphen(self):
    """Delete every soft hyphen (U+00AD) from ``self.text``.

    Unlike the other removal helpers this one is unconditional: it does
    not consult any command-line flag.
    """
    # TODO (carried over from the original note): decide whether other
    # spellings of the soft hyphen should be stripped as well. The
    # original examples are not legible in this view -- confirm.
    cleaned = re.sub(r"\u00AD", r"", self.text)
    self.text = cleaned

def cleanup(self):
Expand All @@ -617,6 +620,7 @@ def cleanup(self):

self.remove_nbspaces()
self.remove_soft_hyphen()
self.remove_wordjoin()

@staticmethod
def _text_transform(val, errors: list):
Expand Down Expand Up @@ -1121,6 +1125,8 @@ def main():
help="HTML: do not use default transformation CSS")
parser.add_argument('--suppress-nbsp-num', action='store_true', default=False,
help="HTML: Suppress non-breakable spaces between numbers")
parser.add_argument('--suppress-word-join', action='store_true', default=False,
help="HTML: Suppress word join (NoBreak) (U+2060)")
parser.add_argument('--ignore-0-space', action='store_true', default=False,
help='HTML: suppress zero width space (U+200b)')
parser.add_argument('--css-greek-title-plus', action='store_true', default=False,
Expand Down
4 changes: 2 additions & 2 deletions tests/fossilplants1.html
Original file line number Diff line number Diff line change
Expand Up @@ -5419,7 +5419,7 @@ <h2 class="nobreak" id="CHAPTER_V">CHAPTER V.</h2>

<p><span class="pagenum" id="Page_98">98</span></p>

<p>Endless examples might be quoted illustrating the absolute futility,
<p>Endless examples might be quoted&#8288; illustrating the absolute futility,
in many cases, of relying on external features even for the purpose
of class distinction. An acquaintance with the general habit and
appearance of only the better known members of a family, frequently
Expand Down Expand Up @@ -5476,7 +5476,7 @@ <h2 class="nobreak" id="CHAPTER_V">CHAPTER V.</h2>
correspond in the character of their venation to Monocotyledonous
leaves. <i>Eryngium montanum</i> Coult., <i>E. Lassauxi</i> Dcne., and
other species of this genus of <i>Umbelliferæ</i> agree closely with
such a plant as <i>Pandanus</i> or other Monocotyledons; similarly
such a plant as <i>Pandanus</i> or other Monocotyledons&#8288;; similarly
the long linear leaves of <i>Richea dracophylla</i>, R. Br., one
of the Ericaceæ, are identical in form with many monocotyledonous
leaves. Instances might also be quoted of monocotyledonous leaves,
Expand Down
8 changes: 8 additions & 0 deletions tests/test_ppcomp.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,12 @@ def test_remove_nbspaces():
assert not re.search(r"(\d)\u00A0(\d)", html_file.text)
assert 0 <= html_file.text.find("2885")

def test_suppress_word_join():
    """Check that ``--suppress-word-join`` removes U+2060 from the HTML text.

    The fixture ``fossilplants1.html`` carries word joiners written as the
    numeric entity ``&#8288;``. Whether ``load()`` has already decoded them
    into U+2060 by this point is not visible here -- TODO confirm, so the
    assertion is known to be meaningful rather than vacuous.
    """
    args = myargs + ['--suppress-word-join']
    html_file = PgdpFileHtml(load_args(args))
    html_file.load('fossilplants1.html')
    # Run the word-join remover itself so the flag under test is exercised.
    html_file.remove_wordjoin()
    assert not re.search(r"\u2060", html_file.text)

def test_remove_soft_hyphen():
html_file = PgdpFileHtml(load_args(myargs))
Expand Down Expand Up @@ -401,6 +407,8 @@ def load_args(myargs):
help="HTML: do not use default transformation CSS")
parser.add_argument('--suppress-nbsp-num', action='store_true', default=False,
help="HTML: Suppress non-breakable spaces between numbers")
parser.add_argument('--suppress-word-join', action='store_true', default=False,
help="HTML: Suppress word join (NoBreak) (U+2060)")
parser.add_argument('--ignore-0-space', action='store_true', default=False,
help='HTML: suppress zero width space (U+200b)')
parser.add_argument('--css-greek-title-plus', action='store_true', default=False,
Expand Down

0 comments on commit c4c357b

Please sign in to comment.