From 6a42d70a9f74478ed9d650d5b96a385ea84213b7 Mon Sep 17 00:00:00 2001 From: "Christian M. Meyer" Date: Thu, 19 Jan 2017 16:29:45 +0100 Subject: [PATCH 1/3] Fix 1597. Allow for curly bracket quantifiers in nltk.chunk.regexp.CHUNK_TAG_PATTERN. --- nltk/test/unit/test_chunk.py | 49 ++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 nltk/test/unit/test_chunk.py diff --git a/nltk/test/unit/test_chunk.py b/nltk/test/unit/test_chunk.py new file mode 100644 index 0000000000..892ce0c9b7 --- /dev/null +++ b/nltk/test/unit/test_chunk.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import, unicode_literals +import unittest + +from nltk import RegexpParser + +class TestChunkRule(unittest.TestCase): + + def test_tag_pattern2re_pattern_quantifier(self): + """Test for bug https://github.com/nltk/nltk/issues/1597 + + Ensures that curly bracket quantifiers can be used inside a chunk rule. + This type of quantifier has been used for the supplementary example + in http://www.nltk.org/book/ch07.html#exploring-text-corpora. 
+ """ + sent = [('The', 'AT'), ('September-October', 'NP'), ('term', 'NN'), ('jury', 'NN'), ('had', 'HVD'), ('been', 'BEN'), ('charged', 'VBN'), ('by', 'IN'), ('Fulton', 'NP-TL'), ('Superior', 'JJ-TL'), ('Court', 'NN-TL'), ('Judge', 'NN-TL'), ('Durwood', 'NP'), ('Pye', 'NP'), ('to', 'TO'), ('investigate', 'VB'), ('reports', 'NNS'), ('of', 'IN'), ('possible', 'JJ'), ('``', '``'), ('irregularities', 'NNS'), ("''", "''"), ('in', 'IN'), ('the', 'AT'), ('hard-fought', 'JJ'), ('primary', 'NN'), ('which', 'WDT'), ('was', 'BEDZ'), ('won', 'VBN'), ('by', 'IN'), ('Mayor-nominate', 'NN-TL'), ('Ivan', 'NP'), ('Allen', 'NP'), ('Jr.', 'NP'), ('.', '.')] # source: brown corpus + cp = RegexpParser('CHUNK: {<N.*>{4,}}') + tree = cp.parse(sent) + assert tree.pformat() == """(S + The/AT + September-October/NP + term/NN + jury/NN + had/HVD + been/BEN + charged/VBN + by/IN + Fulton/NP-TL + Superior/JJ-TL + (CHUNK Court/NN-TL Judge/NN-TL Durwood/NP Pye/NP) + to/TO + investigate/VB + reports/NNS + of/IN + possible/JJ + ``/`` + irregularities/NNS + ''/'' + in/IN + the/AT + hard-fought/JJ + primary/NN + which/WDT + was/BEDZ + won/VBN + by/IN + (CHUNK Mayor-nominate/NN-TL Ivan/NP Allen/NP Jr./NP) + ./.)""" + From a24587bd627be9fe9e8968bd579ba5efe957fc4f Mon Sep 17 00:00:00 2001 From: "Christian M. Meyer" Date: Thu, 19 Jan 2017 16:37:03 +0100 Subject: [PATCH 2/3] Fix 1597. Allow for curly bracket quantifiers in nltk.chunk.regexp.CHUNK_TAG_PATTERN. --- nltk/chunk/regexp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nltk/chunk/regexp.py b/nltk/chunk/regexp.py index a30810baa2..ad485a6d74 100644 --- a/nltk/chunk/regexp.py +++ b/nltk/chunk/regexp.py @@ -829,7 +829,7 @@ def __repr__(self): # this should probably be made more strict than it is -- e.g., it # currently accepts 'foo'. 
CHUNK_TAG_PATTERN = re.compile(r'^((%s|<%s>)*)$' % - ('[^\{\}<>]+', + ('[^<>]+', '[^\{\}<>]+')) def tag_pattern2re_pattern(tag_pattern): From daa324a059f3311a37b7df6d038dff189f5757e6 Mon Sep 17 00:00:00 2001 From: "Christian M. Meyer" Date: Thu, 19 Jan 2017 17:31:50 +0100 Subject: [PATCH 3/3] Issue #1597: Disallow non quantifier constructions in CHUNK_TAG_PATTERN regex. --- nltk/chunk/regexp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nltk/chunk/regexp.py b/nltk/chunk/regexp.py index ad485a6d74..9b47292fcb 100644 --- a/nltk/chunk/regexp.py +++ b/nltk/chunk/regexp.py @@ -829,7 +829,7 @@ def __repr__(self): # this should probably be made more strict than it is -- e.g., it # currently accepts 'foo'. CHUNK_TAG_PATTERN = re.compile(r'^((%s|<%s>)*)$' % - ('[^<>]+', + ('([^\{\}<>]|\{\d+,?\d*\}|\{\d*,?\d+\})+', '[^\{\}<>]+')) def tag_pattern2re_pattern(tag_pattern):