Merge pull request #97 from robinwhittleton/nbsp-support

Add an option to preserve the input space characters
ppannuto · Apr 6, 2024 · 42f43e2
2 parents 418c57c + 027dd21
commit 42f43e2
Show file tree

Hide file tree

Showing 2 changed files with 30 additions and 4 deletions.
diff --git a/titlecase/__init__.py b/titlecase/__init__.py
@@ -77,14 +77,18 @@ def set_small_word_list(small=SMALL):
     SUBPHRASE = regex.compile(r'([:.;?!][ ])(%s)' % small)
 
 
-def titlecase(text, callback=None, small_first_last=True, preserve_blank_lines=False):
+def titlecase(text, callback=None, small_first_last=True, preserve_blank_lines=False, normalise_space_characters=False):
     """
     :param text: Titlecases input text
     :param callback: Callback function that returns the titlecase version of a specific word
     :param small_first_last: Capitalize small words (e.g. 'A') at the beginning; disabled when recursing
+    :param preserve_blank_lines: Preserve blank lines in the output
+    :param normalise_space_characters: Convert all original spaces to normal space characters
     :type text: str
     :type callback: function
     :type small_first_last: bool
+    :type preserve_blank_lines: bool
+    :type normalise_space_characters: bool
 
     This filter changes all words to Title Caps, and attempts to be clever
     about *un*capitalizing SMALL words like a/an/the in the input.
@@ -100,7 +104,9 @@ def titlecase(text, callback=None, small_first_last=True, preserve_blank_lines=F
     processed = []
     for line in lines:
         all_caps = line.upper() == line
-        words = regex.split('[\t ]', line)
+        split_line = regex.split(r'(\s)', line)
+        words = split_line[::2]
+        spaces = split_line[1::2]
         tc_line = []
         for word in words:
             if callback:
@@ -188,7 +194,13 @@ def titlecase(text, callback=None, small_first_last=True, preserve_blank_lines=F
                     lambda m: m.group(0).capitalize(), tc_line[-1]
                 )
 
-        result = " ".join(tc_line)
+        if normalise_space_characters:
+            result = " ".join(tc_line)
+        else:
+            line_to_be_joined = tc_line + spaces
+            line_to_be_joined[::2] = tc_line
+            line_to_be_joined[1::2] = spaces
+            result = "".join(line_to_be_joined)
 
         result = SUBPHRASE.sub(lambda m: '%s%s' % (
             m.group(1),

diff --git a/titlecase/tests.py b/titlecase/tests.py
@@ -12,7 +12,7 @@
 from titlecase import titlecase, set_small_word_list, create_wordlist_filter_from_file
 
 
-# (executed by `test_input_output` below)
+# (executed by `test_specific_string` below)
 TEST_DATA = (
     (
         "",
@@ -307,6 +307,10 @@
         "Mr mr Mrs Ms Mss Dr dr , Mr. and Mrs. Person",
         "Mr Mr Mrs Ms MSS Dr Dr , Mr. And Mrs. Person",
     ),
+    (
+        "a mix of\tdifferent\u200aspace\u2006characters",
+        "A Mix of\tDifferent\u200aSpace\u2006Characters",
+    ),
 )
 
 
@@ -429,6 +433,16 @@ def test_complex_blanks(self):
         self.assertEqual(titlecase(s, preserve_blank_lines=True),
                 '\n\nLeading Blank\n\n\nMulti-Blank\n\n\n\n\nTrailing Blank\n\n')
 
+class TestNormaliseSpaceCharacters(unittest.TestCase):
+    def test_tabs(self):
+        s = 'text\twith\ttabs'
+        self.assertEqual(titlecase(s), 'Text\tWith\tTabs')
+        self.assertEqual(titlecase(s, normalise_space_characters=True), 'Text With Tabs')
+
+    def test_nbsps(self):
+        s = 'text with nonbreaking spaces'
+        self.assertEqual(titlecase(s), 'Text With Nonbreaking Spaces')
+        self.assertEqual(titlecase(s, normalise_space_characters=True), 'Text With Nonbreaking Spaces')
 
 if __name__ == '__main__':
     unittest.main()