-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request nltk#1682 from alvations/develop
Added support for additional quotes to TreebankWordTokenizer
- Loading branch information
Showing
3 changed files
with
35 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,7 @@ | |
# Copyright (C) 2001-2017 NLTK Project | ||
# Author: Edward Loper <[email protected]> | ||
# Steven Bird <[email protected]> (minor additions) | ||
# Contributors: matthewmc, clouds56 | ||
# URL: <http://nltk.org/> | ||
# For license information, see LICENSE.TXT | ||
|
||
|
@@ -59,6 +60,8 @@ | |
For further information, please see Chapter 3 of the NLTK book. | ||
""" | ||
|
||
import re | ||
|
||
from nltk.data import load | ||
from nltk.tokenize.casual import (TweetTokenizer, casual_tokenize) | ||
from nltk.tokenize.mwe import MWETokenizer | ||
|
@@ -94,18 +97,31 @@ def sent_tokenize(text, language='english'): | |
return tokenizer.tokenize(text) | ||
|
||
# Standard word tokenizer. | ||
_treebank_word_tokenize = TreebankWordTokenizer().tokenize | ||
_treebank_word_tokenizer = TreebankWordTokenizer() | ||
|
||
# See discussion on https://github.com/nltk/nltk/pull/1437 | ||
# Adding to TreebankWordTokenizer, the splits on | ||
# - chevron quotes u'\xab' and u'\xbb'. | ||
# - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d' | ||
|
||
improved_open_quote_regex = re.compile(u'([«“‘])', re.U) | ||
improved_close_quote_regex = re.compile(u'([»”’])', re.U) | ||
improved_punct_regex = re.compile(r'([^\.])(\.)([\]\)}>"\'' u'»”’' r']*)\s*$', re.U) | ||
_treebank_word_tokenizer.STARTING_QUOTES.insert(0, (improved_open_quote_regex, r' \1 ')) | ||
_treebank_word_tokenizer.ENDING_QUOTES.insert(0, (improved_close_quote_regex, r' \1 ')) | ||
_treebank_word_tokenizer.PUNCTUATION.insert(0, (improved_punct_regex, r'\1 \2 \3 ')) | ||
|
||
|
||
def word_tokenize(text, language='english'): | ||
""" | ||
Return a tokenized copy of *text*, | ||
using NLTK's recommended word tokenizer | ||
(currently :class:`.TreebankWordTokenizer` | ||
(currently an improved :class:`.TreebankWordTokenizer` | ||
along with :class:`.PunktSentenceTokenizer` | ||
for the specified language). | ||
:param text: text to split into words | ||
:param language: the model name in the Punkt corpus | ||
""" | ||
return [token for sent in sent_tokenize(text, language) | ||
for token in _treebank_word_tokenize(sent)] | ||
|
||
for token in _treebank_word_tokenizer.tokenize(sent)] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters