Skip to content

Commit

Permalink
Remove tabs, newlines and carriage returns in fix
Browse files Browse the repository at this point in the history
Prevents generating newlines or wrong numbers of columns when not using
normalization.

The replacements of these escaped entities are no longer required.
  • Loading branch information
ZJaume committed Jan 27, 2023
1 parent 7ebd8cf commit a44b2c2
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 20 deletions.
22 changes: 2 additions & 20 deletions src/bifixer/restorative_cleaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
quotesRegex = regex.compile("(?P<start>[[:alpha:]])\'\'(?P<end>(s|S|t|T|m|M|d|D|re|RE|ll|LL|ve|VE|em|EM)\W)")
collapse_spaced_entities = regex.compile('([&][ ]*[#][ ]*)([0-9]{2,6})([ ]*[;])')
html_tags_regex = re.compile('<.*?>')
remove_tabs_endlines = str.maketrans({k:' ' for k in '\r\n\t'})

#https://en.wikipedia.org/wiki/CJK_Symbols_and_Punctuation
cjk_langs = [
Expand Down Expand Up @@ -82,25 +83,6 @@ def getCharsReplacements(lang):

# Annoying characters, common for all languages
chars = {
'\u2028': ' ', # line separators (\n)
'&#10;': " ", # \n
'&#9;': " ", # \t
'&#10': " ", # \n
'&#9': " ", # \t
'&Tab;': " ",
'\t': " ", # when normalization is disabled and &amp;Tab; is unescaped by ftfy, creating poisonous \t
'\n': "",
'\u000C' : " ", # \v vertical tab
'\u000D' : " ", # \f form feed
'&#xa': "",
'&#xA': "",
'&NewLine;': " ",

'\u000D': "", # carriage returns (\r)
'&#13;': " ",
'&#xd;': " ",
'&#xD;': " ",

# unicode ligatures
'\uFB00': 'ff',
'\uFB01': 'fi',
Expand Down Expand Up @@ -748,7 +730,7 @@ def fix(text, lang, chars_rep, chars_pattern):

replaced_text = chars_pattern.sub(replace_chars, ftfy_fixed_text)

return html.unescape(replaced_text)
return html.unescape(replaced_text).translate(remove_tabs_endlines)

def normalize(text, lang, punct_rep, punct_pattern):
normalized_text = text
Expand Down
5 changes: 5 additions & 0 deletions tests/test_bifixer.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,11 @@ def test_html_entities(self):
fixed_1 = restorative_cleaning.fix(text_1, "es", self.chars_es, self.charsRe_es)
assert fixed_1 == correct

correct_2 = "This is a very triccky sentence "
text_2 = "This&#9;is &amp;#13a very&#9triccky&amp;NewLine;\n sentence&amp;#13;&amp;Tab;"
fixed_2 = restorative_cleaning.fix(text_2, "en", self.chars_en, self.charsRe_en)
assert fixed_2 == correct_2

def test_punct(self):
text_1 = " Did I pass the acid test ? "
correct = "Did I pass the acid test?"
Expand Down

0 comments on commit a44b2c2

Please sign in to comment.