Skip to content

Commit

Permalink
Merge pull request nltk#1598 from chmeyer/fix/CHUNK_TAG_PATTERN
Browse files Browse the repository at this point in the history
Fix/chunk tag pattern
  • Loading branch information
stevenbird authored May 16, 2017
2 parents 837ec2a + daa324a commit 2a95665
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 1 deletion.
2 changes: 1 addition & 1 deletion nltk/chunk/regexp.py
Original file line number Diff line number Diff line change
Expand Up @@ -831,7 +831,7 @@ def __repr__(self):
# this should probably be made more strict than it is -- e.g., it
# currently accepts 'foo'.
CHUNK_TAG_PATTERN = re.compile(r'^((%s|<%s>)*)$' %
- ('[^\{\}<>]+',
+ ('([^\{\}<>]|\{\d+,?\d*\}|\{\d*,?\d+\})+',
'[^\{\}<>]+'))

def tag_pattern2re_pattern(tag_pattern):
Expand Down
49 changes: 49 additions & 0 deletions nltk/test/unit/test_chunk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
import unittest

from nltk import RegexpParser

class TestChunkRule(unittest.TestCase):
    """Regression tests for chunk rules parsed by ``RegexpParser``."""

    def test_tag_pattern2re_pattern_quantifier(self):
        """Test for bug https://github.com/nltk/nltk/issues/1597

        Ensures that curly bracket quantifiers can be used inside a chunk rule.
        This type of quantifier has been used for the supplementary example
        in http://www.nltk.org/book/ch07.html#exploring-text-corpora.
        """
        # Tagged sentence; source: brown corpus.
        sent = [
            ('The', 'AT'),
            ('September-October', 'NP'),
            ('term', 'NN'),
            ('jury', 'NN'),
            ('had', 'HVD'),
            ('been', 'BEN'),
            ('charged', 'VBN'),
            ('by', 'IN'),
            ('Fulton', 'NP-TL'),
            ('Superior', 'JJ-TL'),
            ('Court', 'NN-TL'),
            ('Judge', 'NN-TL'),
            ('Durwood', 'NP'),
            ('Pye', 'NP'),
            ('to', 'TO'),
            ('investigate', 'VB'),
            ('reports', 'NNS'),
            ('of', 'IN'),
            ('possible', 'JJ'),
            ('``', '``'),
            ('irregularities', 'NNS'),
            ("''", "''"),
            ('in', 'IN'),
            ('the', 'AT'),
            ('hard-fought', 'JJ'),
            ('primary', 'NN'),
            ('which', 'WDT'),
            ('was', 'BEDZ'),
            ('won', 'VBN'),
            ('by', 'IN'),
            ('Mayor-nominate', 'NN-TL'),
            ('Ivan', 'NP'),
            ('Allen', 'NP'),
            ('Jr.', 'NP'),
            ('.', '.'),
        ]

        # ``{4,}`` is the quantifier under test: a chunk needs at least four
        # consecutive tags matching ``<N.*>``, so the three-token run
        # "September-October term jury" must stay outside any chunk.
        cp = RegexpParser('CHUNK: {<N.*>{4,}}')
        tree = cp.parse(sent)

        # The lines below are part of the string literal, not code layout:
        # each child of the root is expected on its own line, indented by
        # exactly two spaces, so they must not be re-indented with the method.
        expected = """(S
  The/AT
  September-October/NP
  term/NN
  jury/NN
  had/HVD
  been/BEN
  charged/VBN
  by/IN
  Fulton/NP-TL
  Superior/JJ-TL
  (CHUNK Court/NN-TL Judge/NN-TL Durwood/NP Pye/NP)
  to/TO
  investigate/VB
  reports/NNS
  of/IN
  possible/JJ
  ``/``
  irregularities/NNS
  ''/''
  in/IN
  the/AT
  hard-fought/JJ
  primary/NN
  which/WDT
  was/BEDZ
  won/VBN
  by/IN
  (CHUNK Mayor-nominate/NN-TL Ivan/NP Allen/NP Jr./NP)
  ./.)"""

        # assertEqual (rather than a bare assert) still runs under ``-O`` and
        # reports a readable diff when the trees differ.
        self.assertEqual(tree.pformat(), expected)

0 comments on commit 2a95665

Please sign in to comment.