Merge pull request nltk#1682 from alvations/develop
Added support for additional quotes to TreebankWordTokenizer
stevenbird authored Apr 13, 2017
2 parents 887af8f + 329e517 commit b7c2aff
Showing 3 changed files with 35 additions and 8 deletions.
13 changes: 13 additions & 0 deletions nltk/test/tokenize.doctest
@@ -40,6 +40,19 @@ Some test strings.
>>> word_tokenize(s10)
['There', 'were', '300,000', ',', 'but', 'that', 'was', "n't", 'enough', '.']


Testing the improvement made to the TreebankWordTokenizer:

>>> sx1 = u'\xabNow that I can do.\xbb'
>>> expected = [u'\xab', u'Now', u'that', u'I', u'can', u'do', u'.', u'\xbb']
>>> word_tokenize(sx1) == expected
True
>>> sx2 = u'The unicode 201C and 201D \u201cLEFT(RIGHT) DOUBLE QUOTATION MARK\u201d is also OPEN_PUNCT and CLOSE_PUNCT.'
>>> expected = [u'The', u'unicode', u'201C', u'and', u'201D', u'\u201c', u'LEFT', u'(', u'RIGHT', u')', u'DOUBLE', u'QUOTATION', u'MARK', u'\u201d', u'is', u'also', u'OPEN_PUNCT', u'and', u'CLOSE_PUNCT', u'.']
>>> word_tokenize(sx2) == expected
True


Sentence tokenization in word_tokenize:

>>> s11 = "I called Dr. Jones. I called Dr. Jones."
24 changes: 20 additions & 4 deletions nltk/tokenize/__init__.py
@@ -4,6 +4,7 @@
# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <[email protected]>
# Steven Bird <[email protected]> (minor additions)
# Contributors: matthewmc, clouds56
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

@@ -59,6 +60,8 @@
For further information, please see Chapter 3 of the NLTK book.
"""

import re

from nltk.data import load
from nltk.tokenize.casual import (TweetTokenizer, casual_tokenize)
from nltk.tokenize.mwe import MWETokenizer
@@ -94,18 +97,31 @@ def sent_tokenize(text, language='english'):
return tokenizer.tokenize(text)

# Standard word tokenizer.
-_treebank_word_tokenize = TreebankWordTokenizer().tokenize
+_treebank_word_tokenizer = TreebankWordTokenizer()

# See discussion on https://github.com/nltk/nltk/pull/1437
# Adding to TreebankWordTokenizer, the splits on
# - chevron quotes u'\xab' and u'\xbb'.
# - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'

improved_open_quote_regex = re.compile(u'([«“‘])', re.U)
improved_close_quote_regex = re.compile(u'([»”’])', re.U)
improved_punct_regex = re.compile(r'([^\.])(\.)([\]\)}>"\'' u'»”’' r']*)\s*$', re.U)
_treebank_word_tokenizer.STARTING_QUOTES.insert(0, (improved_open_quote_regex, r' \1 '))
_treebank_word_tokenizer.ENDING_QUOTES.insert(0, (improved_close_quote_regex, r' \1 '))
_treebank_word_tokenizer.PUNCTUATION.insert(0, (improved_punct_regex, r'\1 \2 \3 '))


def word_tokenize(text, language='english'):
"""
Return a tokenized copy of *text*,
using NLTK's recommended word tokenizer
-    (currently :class:`.TreebankWordTokenizer`
+    (currently an improved :class:`.TreebankWordTokenizer`
along with :class:`.PunktSentenceTokenizer`
for the specified language).
:param text: text to split into words
:param language: the model name in the Punkt corpus
"""
return [token for sent in sent_tokenize(text, language)
-            for token in _treebank_word_tokenize(sent)]
+            for token in _treebank_word_tokenizer.tokenize(sent)]
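
For quick reference, a minimal sketch of how the patched word_tokenize behaves end to end, assuming this commit is applied (the first expected output mirrors the doctest above; the second follows from the new open/close quote regexes):

>>> from nltk import word_tokenize
>>> word_tokenize(u'\xabNow that I can do.\xbb')
[u'\xab', u'Now', u'that', u'I', u'can', u'do', u'.', u'\xbb']
>>> word_tokenize(u'\u201cquoted\u201d text')
[u'\u201c', u'quoted', u'\u201d', u'text']
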
6 changes: 2 additions & 4 deletions nltk/tokenize/treebank.py
@@ -93,7 +93,7 @@ class TreebankWordTokenizer(TokenizerI):
CONTRACTIONS4 = [re.compile(r"(?i)\b(whad)(dd)(ya)\b"),
re.compile(r"(?i)\b(wha)(t)(cha)\b")]

-    def tokenize(self, text):
+    def tokenize(self, text, return_str=False):
for regexp, substitution in self.STARTING_QUOTES:
text = regexp.sub(substitution, text)

@@ -119,6 +119,4 @@ def tokenize(self, text):
# for regexp in self.CONTRACTIONS4:
# text = regexp.sub(r' \1 \2 \3 ', text)

-        return text.split()
+        return text if return_str else text.split()
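
A small sketch of the new return_str flag, which follows directly from the changed return statement: the default token-list behavior is unchanged, and with return_str=True the tokenizer returns the space-normalized string just before the final split, so splitting it on whitespace recovers the same tokens (the exact inter-token spacing is an implementation detail and is not shown literally here):

>>> from nltk.tokenize import TreebankWordTokenizer
>>> t = TreebankWordTokenizer()
>>> t.tokenize("Hello, world.")
['Hello', ',', 'world', '.']
>>> s = t.tokenize("Hello, world.", return_str=True)
>>> s.split() == t.tokenize("Hello, world.")
True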
