diff --git a/src/bifixer/restorative_cleaning.py b/src/bifixer/restorative_cleaning.py index 6077393..a2d91a2 100644 --- a/src/bifixer/restorative_cleaning.py +++ b/src/bifixer/restorative_cleaning.py @@ -15,6 +15,7 @@ quotesRegex = regex.compile("(?P[[:alpha:]])\'\'(?P(s|S|t|T|m|M|d|D|re|RE|ll|LL|ve|VE|em|EM)\W)") collapse_spaced_entities = regex.compile('([&][ ]*[#][ ]*)([0-9]{2,6})([ ]*[;])') html_tags_regex = re.compile('<.*?>') +remove_tabs_endlines = str.maketrans({k:' ' for k in '\r\n\t'}) #https://en.wikipedia.org/wiki/CJK_Symbols_and_Punctuation cjk_langs = [ @@ -82,25 +83,6 @@ def getCharsReplacements(lang): # Annoying characters, common for all languages chars = { - '\u2028': ' ', # line separators (\n) - ' ': " ", # \n - ' ': " ", # \t - ' ': " ", # \n - ' ': " ", # \t - ' ': " ", - '\t': " ", # when normalization is disabled and &Tab; is unescaped by ftfy, creating poisonous \t - '\n': "", - '\u000C' : " ", # \v vertical tab - '\u000D' : " ", # \f form feed - ' ': "", - ' ': "", - ' ': " ", - - '\u000D': "", # carriage returns (\r) - ' ': " ", - ' ': " ", - ' ': " ", - # unicode ligatures '\uFB00': 'ff', '\uFB01': 'fi', @@ -748,7 +730,7 @@ def fix(text, lang, chars_rep, chars_pattern): replaced_text = chars_pattern.sub(replace_chars, ftfy_fixed_text) - return html.unescape(replaced_text) + return html.unescape(replaced_text).translate(remove_tabs_endlines) def normalize(text, lang, punct_rep, punct_pattern): normalized_text = text diff --git a/tests/test_bifixer.py b/tests/test_bifixer.py index 7f37bad..ac58ddc 100644 --- a/tests/test_bifixer.py +++ b/tests/test_bifixer.py @@ -120,6 +120,11 @@ def test_html_entities(self): fixed_1 = restorative_cleaning.fix(text_1, "es", self.chars_es, self.charsRe_es) assert fixed_1 == correct + correct_2 = "This is a very triccky sentence " + text_2 = "This is &#13a very triccky&NewLine;\n sentence&#13;&Tab;" + fixed_2 = restorative_cleaning.fix(text_2, "en", self.chars_en, self.charsRe_en) + assert fixed_2 == correct_2 + def test_punct(self): text_1 = " Did I pass the acid test ? " correct = "Did I pass the acid test?"