diff --git a/nltk/test/tokenize.doctest b/nltk/test/tokenize.doctest
index 16979c5a8b..81ebb3b349 100644
--- a/nltk/test/tokenize.doctest
+++ b/nltk/test/tokenize.doctest
@@ -40,6 +40,19 @@ Some test strings.
     >>> word_tokenize(s10)
     ['There', 'were', '300,000', ',', 'but', 'that', 'was', "n't", 'enough', '.']
 
+
+Testing improvements made to the TreebankWordTokenizer
+
+    >>> sx1 = u'\xabNow that I can do.\xbb'
+    >>> expected = [u'\xab', u'Now', u'that', u'I', u'can', u'do', u'.', u'\xbb']
+    >>> word_tokenize(sx1) == expected
+    True
+    >>> sx2 = u'The unicode 201C and 201D \u201cLEFT(RIGHT) DOUBLE QUOTATION MARK\u201d is also OPEN_PUNCT and CLOSE_PUNCT.'
+    >>> expected = [u'The', u'unicode', u'201C', u'and', u'201D', u'\u201c', u'LEFT', u'(', u'RIGHT', u')', u'DOUBLE', u'QUOTATION', u'MARK', u'\u201d', u'is', u'also', u'OPEN_PUNCT', u'and', u'CLOSE_PUNCT', u'.']
+    >>> word_tokenize(sx2) == expected
+    True
+
+
 Sentence tokenization in word_tokenize:
 
     >>> s11 = "I called Dr. Jones. I called Dr. Jones."
diff --git a/nltk/tokenize/__init__.py b/nltk/tokenize/__init__.py
index 3eed916179..5503ff6de1 100644
--- a/nltk/tokenize/__init__.py
+++ b/nltk/tokenize/__init__.py
@@ -4,6 +4,7 @@
 # Copyright (C) 2001-2017 NLTK Project
 # Author: Edward Loper
 #         Steven Bird (minor additions)
+# Contributors: matthewmc, clouds56
 # URL:
 # For license information, see LICENSE.TXT
 
@@ -59,6 +60,8 @@ For further information, please see Chapter 3 of the NLTK book.
 """
 
+import re
+
 from nltk.data import load
 from nltk.tokenize.casual import (TweetTokenizer, casual_tokenize)
 from nltk.tokenize.mwe import MWETokenizer
@@ -94,12 +97,26 @@ def sent_tokenize(text, language='english'):
     return tokenizer.tokenize(text)
 
 # Standard word tokenizer.
-_treebank_word_tokenize = TreebankWordTokenizer().tokenize
+_treebank_word_tokenizer = TreebankWordTokenizer()
+
+# See discussion on https://github.com/nltk/nltk/pull/1437
+# The following adds splits to TreebankWordTokenizer for:
+# - chevron quotes u'\xab' and u'\xbb'
+# - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'
+
+improved_open_quote_regex = re.compile(u'([«“‘])', re.U)
+improved_close_quote_regex = re.compile(u'([»”’])', re.U)
+improved_punct_regex = re.compile(r'([^\.])(\.)([\]\)}>"\'' u'»”’' r']*)\s*$', re.U)
+_treebank_word_tokenizer.STARTING_QUOTES.insert(0, (improved_open_quote_regex, r' \1 '))
+_treebank_word_tokenizer.ENDING_QUOTES.insert(0, (improved_close_quote_regex, r' \1 '))
+_treebank_word_tokenizer.PUNCTUATION.insert(0, (improved_punct_regex, r'\1 \2 \3 '))
+
+
 
 def word_tokenize(text, language='english'):
     """
     Return a tokenized copy of *text*,
     using NLTK's recommended word tokenizer
-    (currently :class:`.TreebankWordTokenizer`
+    (currently an improved :class:`.TreebankWordTokenizer`
     along with :class:`.PunktSentenceTokenizer`
     for the specified language).
@@ -107,5 +124,4 @@ def word_tokenize(text, language='english'):
     :param language: the model name in the Punkt corpus
     """
     return [token for sent in sent_tokenize(text, language)
-            for token in _treebank_word_tokenize(sent)]
-
+            for token in _treebank_word_tokenizer.tokenize(sent)]
diff --git a/nltk/tokenize/treebank.py b/nltk/tokenize/treebank.py
index 3f960d4fa5..83ad2c3a2d 100644
--- a/nltk/tokenize/treebank.py
+++ b/nltk/tokenize/treebank.py
@@ -93,7 +93,7 @@ class TreebankWordTokenizer(TokenizerI):
     CONTRACTIONS4 = [re.compile(r"(?i)\b(whad)(dd)(ya)\b"),
                      re.compile(r"(?i)\b(wha)(t)(cha)\b")]
 
-    def tokenize(self, text):
+    def tokenize(self, text, return_str=False):
         for regexp, substitution in self.STARTING_QUOTES:
             text = regexp.sub(substitution, text)
 
@@ -119,6 +119,4 @@ def tokenize(self, text):
         # for regexp in self.CONTRACTIONS4:
         #     text = regexp.sub(r' \1 \2 \3 ', text)
 
-        return text.split()
-
-
+        return text if return_str else text.split()
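
Not part of the patch, but two notes for reviewers. First, the improved_punct_regex rule is needed in addition to the two quote rules: the stock end-of-string period pattern does not list u'»', u'”' or u'’' in its closing-punctuation class, so without it a sentence-final '.' would stay attached to the preceding word inside a closing quote. Second, the return_str flag added in treebank.py is not exercised by the new doctests. A minimal sketch of what that flag does, assuming the patched TreebankWordTokenizer is installed (the example string is illustrative only):

    from nltk.tokenize.treebank import TreebankWordTokenizer

    tokenizer = TreebankWordTokenizer()
    s = u"Good muffins cost $3.88 in New York."

    # Default behaviour is unchanged: a list of tokens.
    print(tokenizer.tokenize(s))
    # -> ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.']

    # With return_str=True the regex-normalised text is returned as a single
    # string (i.e. before the final .split()), with tokens separated by whitespace.
    print(tokenizer.tokenize(s, return_str=True))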