From 8bb434f020e9cad80ee00007b29a425fd1d4f480 Mon Sep 17 00:00:00 2001 From: Andrew Kamau Date: Sat, 18 Feb 2017 22:39:32 +0300 Subject: [PATCH] expose decompose_nfkd() --- normality/__init__.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/normality/__init__.py b/normality/__init__.py index cdf5e72..17d85ef 100644 --- a/normality/__init__.py +++ b/normality/__init__.py @@ -1,6 +1,6 @@ import six -from normality.cleaning import collapse_spaces, category_replace +from normality.cleaning import collapse_spaces, category_replace, decompose_nfkd from normality.constants import UNICODE_CATEGORIES from normality.transliteration import latinize_text, ascii_text from normality.encoding import guess_encoding # noqa @@ -8,7 +8,7 @@ WS = ' ' -def normalize(text, lowercase=True, collapse=True, latinize=False, ascii=False, +def normalize(text, lowercase=True, collapse=True, latinize=False, ascii=False, decompose=False, replace_categories=UNICODE_CATEGORIES): """The main normalization function for text. @@ -38,6 +38,9 @@ def normalize(text, lowercase=True, collapse=True, latinize=False, ascii=False, # Yeah I made a Python package for this. text = text.lower() + if decompose: + text = decompose_nfkd(text) + if ascii: # A stricter form of transliteration that leaves only ASCII # characters.