Skip to content

Commit

Permalink
support ignoring characters
Browse files Browse the repository at this point in the history
  • Loading branch information
mollerhoj committed Nov 26, 2019
1 parent 632af82 commit 22cb29f
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 5 deletions.
18 changes: 18 additions & 0 deletions tests/test_unidecode.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,24 @@ def test_ascii(self):

wlog.stop()


def test_ignore(self):
    """Characters listed in ``ignore`` must pass through untransliterated."""
    wlog = WarningLogger()
    wlog.start("should be ignored")

    try:
        # The lower-case letters are in the ignore set and must come back
        # unchanged; the upper-case ones are not, so they are transliterated.
        r = self.unidecode(u'æøåÆØÅ', ignore=u'æøå')
        self.assertEqual(r, u'æøåAEOA')

        # The result must be the native text type of the running interpreter.
        if sys.version_info[0] >= 3:
            self.assertEqual(type(r), str)
        else:
            self.assertEqual(type(r), unicode)

        # unicode objects shouldn't raise warnings
        self.assertEqual(0, len(wlog.log))
    finally:
        # Run stop() even when an assertion above fails, so a failing test
        # cannot skip it.
        wlog.stop()

def test_bmp(self):
for n in range(0,0x10000):
# skip over surrogate pairs, which throw a warning
Expand Down
14 changes: 9 additions & 5 deletions unidecode/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def _warn_if_not_unicode(string):
RuntimeWarning, 2)


def unidecode_expect_ascii(string):
def unidecode_expect_ascii(string, ignore=u''):
"""Transliterate an Unicode object into an ASCII string
>>> unidecode(u"\u5317\u4EB0")
Expand All @@ -47,30 +47,34 @@ def unidecode_expect_ascii(string):
try:
bytestring = string.encode('ASCII')
except UnicodeEncodeError:
return _unidecode(string)
return _unidecode(string, ignore)
if version_info[0] >= 3:
return string
else:
return bytestring

def unidecode_expect_nonascii(string, ignore=u''):
    """Transliterate a Unicode object into an ASCII string.

    The input is always handed to ``_unidecode``; no attempt is made to
    encode it as plain ASCII first.

    :param string: text to transliterate. ``_warn_if_not_unicode`` is called
        on it before conversion.
    :param ignore: characters that are copied to the output unchanged instead
        of being transliterated. Defaults to the empty string, i.e. nothing
        is ignored.
    :returns: the value produced by ``_unidecode(string, ignore)``.

    >>> unidecode_expect_nonascii(u"\u5317\u4EB0")
    "Bei Jing "
    """

    _warn_if_not_unicode(string)
    return _unidecode(string, ignore)

# Default entry point: ``unidecode`` is an alias for unidecode_expect_ascii.
unidecode = unidecode_expect_ascii

def _unidecode(string):
def _unidecode(string, ignore=u''):
retval = []

for char in string:
codepoint = ord(char)

if char in ignore:
retval.append(char)
continue

if codepoint < 0x80: # Basic ASCII
retval.append(str(char))
continue
Expand Down

0 comments on commit 22cb29f

Please sign in to comment.