From fa93dbb885526a4c879114280e895d6d31919955 Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Tue, 12 Jan 2016 23:04:15 +0000 Subject: [PATCH 01/32] Initial support of Stanford CoreNLP server API. --- nltk/parse/stanford.py | 212 ++++++++++++++++++++++++++++------------- setup.py | 3 + 2 files changed, 150 insertions(+), 65 deletions(-) diff --git a/nltk/parse/stanford.py b/nltk/parse/stanford.py index 60b349d5ec..5ae3ce1d6b 100644 --- a/nltk/parse/stanford.py +++ b/nltk/parse/stanford.py @@ -12,12 +12,13 @@ import tempfile import os import re -import warnings +import json from subprocess import PIPE -from io import StringIO + +import requests from nltk import compat -from nltk.internals import find_jar, find_jar_iter, config_java, java, _java_options +from nltk.internals import find_jar_iter, config_java, java, _java_options from nltk.parse.api import ParserI from nltk.parse.dependencygraph import DependencyGraph @@ -25,6 +26,7 @@ _stanford_url = 'http://nlp.stanford.edu/software/lex-parser.shtml' + class GenericStanfordParser(ParserI): """Interface to the Stanford Parser""" @@ -40,6 +42,8 @@ def __init__(self, path_to_jar=None, path_to_models_jar=None, encoding='utf8', verbose=False, java_options='-mx1000m', corenlp_options=''): + return + # find the most recent code and model jar stanford_jar = max( find_jar_iter( @@ -92,11 +96,13 @@ def _parse_trees_output(self, output_): return iter(res) def parse_sents(self, sentences, verbose=False): - """ - Use StanfordParser to parse multiple sentences. Takes multiple sentences as a - list where each sentence is a list of words. - Each sentence will be automatically tagged with this StanfordParser instance's - tagger. + """Parse multiple sentences. + + + Takes multiple sentences as a list where each sentence is a list of + words. Each sentence will be automatically tagged with this + StanfordParser instance's tagger. + If whitespaces exists inside a token, then the token will be treated as separate tokens. @@ -104,16 +110,10 @@ def parse_sents(self, sentences, verbose=False): :type sentences: list(list(str)) :rtype: iter(iter(Tree)) """ - cmd = [ - self._MAIN_CLASS, - '-model', self.model_path, - '-sentences', 'newline', - '-outputFormat', self._OUTPUT_FORMAT, - '-tokenized', - '-escaper', 'edu.stanford.nlp.process.PTBEscapingProcessor', - ] - return self._parse_trees_output(self._execute( - cmd, '\n'.join(' '.join(sentence) for sentence in sentences), verbose)) + + sentences = [' '.join(words) for words in sentences] + + return self.raw_parse_sents(sentences, verbose=False, tokenize_whitespace=True) def raw_parse(self, sentence, verbose=False): """ @@ -127,23 +127,45 @@ def raw_parse(self, sentence, verbose=False): """ return next(self.raw_parse_sents([sentence], verbose)) - def raw_parse_sents(self, sentences, verbose=False): - """ - Use StanfordParser to parse multiple sentences. Takes multiple sentences as a - list of strings. - Each sentence will be automatically tokenized and tagged by the Stanford Parser. + def raw_parse_sents(self, sentences, verbose=False, tokenize_whitespace=False): + """Use StanfordParser to parse multiple sentences. - :param sentences: Input sentences to parse + Takes multiple sentences as a list of strings. Each sentence will be + automatically tokenized and tagged by the Stanford Parser. + + :param sentences: Input sentences to parse. 
:type sentences: list(str) :rtype: iter(iter(Tree)) """ - cmd = [ - self._MAIN_CLASS, - '-model', self.model_path, - '-sentences', 'newline', - '-outputFormat', self._OUTPUT_FORMAT, - ] - return self._parse_trees_output(self._execute(cmd, '\n'.join(sentences), verbose)) + session = requests.Session() + + for sentence in sentences: + + properties = { + 'annotators': 'tokenize,pos,parse', + 'outputFormat': 'json', + 'tokenize.options': 'normalizeParentheses=true', + } + + if tokenize_whitespace: + properties['tokenize.whitespace'] = 'true' + + response = session.post( + 'http://localhost:9000', + params={ + 'properties': json.dumps(properties), + }, + data=sentence, + ) + + response.raise_for_status() + + parsed_data = response.json() + assert len(parsed_data['sentences']) == 1 + + tree = Tree.fromstring(parsed_data['sentences'][0]['parse']) + + yield iter([tree]) def tagged_parse(self, sentence, verbose=False): """ @@ -222,37 +244,99 @@ def _execute(self, cmd, input_, verbose=False): return stdout + class StanfordParser(GenericStanfordParser): """ - >>> parser=StanfordParser( - ... model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz" + >>> parser=StanfordParser() + + >>> next( + ... parser.raw_parse('the quick brown fox jumps over the lazy dog') + ... ).pretty_print() # doctest: +NORMALIZE_WHITESPACE + ROOT + | + S + _______________|_________ + | VP + | _________|___ + | | PP + | | ________|___ + NP | | NP + ____|__________ | | _______|____ + DT JJ JJ NN VBZ IN DT JJ NN + | | | | | | | | | + the quick brown fox jumps over the lazy dog + + >>> (parse_fox, ), (parse_wolf, ) = parser.raw_parse_sents( + ... [ + ... 'the quick brown fox jumps over the lazy dog', + ... 'the quick grey wolf jumps over the lazy fox', + ... ] ... ) - - >>> list(parser.raw_parse("the quick brown fox jumps over the lazy dog")) # doctest: +NORMALIZE_WHITESPACE - [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']), - Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), - Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])])] - - >>> sum([list(dep_graphs) for dep_graphs in parser.raw_parse_sents(( - ... "the quick brown fox jumps over the lazy dog", - ... "the quick grey wolf jumps over the lazy fox" - ... ))], []) # doctest: +NORMALIZE_WHITESPACE - [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']), - Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), - Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])]), Tree('ROOT', [Tree('NP', - [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['grey']), Tree('NN', ['wolf'])]), Tree('NP', - [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']), - Tree('JJ', ['lazy']), Tree('NN', ['fox'])])])])])])] - - >>> sum([list(dep_graphs) for dep_graphs in parser.parse_sents(( - ... "I 'm a dog".split(), - ... "This is my friends ' cat ( the tabby )".split(), - ... 
))], []) # doctest: +NORMALIZE_WHITESPACE - [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('PRP', ['I'])]), Tree('VP', [Tree('VBP', ["'m"]), - Tree('NP', [Tree('DT', ['a']), Tree('NN', ['dog'])])])])]), Tree('ROOT', [Tree('S', [Tree('NP', - [Tree('DT', ['This'])]), Tree('VP', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('NP', [Tree('PRP$', ['my']), - Tree('NNS', ['friends']), Tree('POS', ["'"])]), Tree('NN', ['cat'])]), Tree('PRN', [Tree('-LRB-', ['-LRB-']), - Tree('NP', [Tree('DT', ['the']), Tree('NN', ['tabby'])]), Tree('-RRB-', ['-RRB-'])])])])])])] + >>> parse_fox.pretty_print() # doctest: +NORMALIZE_WHITESPACE + ROOT + | + S + _______________|_________ + | VP + | _________|___ + | | PP + | | ________|___ + NP | | NP + ____|__________ | | _______|____ + DT JJ JJ NN VBZ IN DT JJ NN + | | | | | | | | | + the quick brown fox jumps over the lazy dog + + >>> parse_wolf.pretty_print() # doctest: +NORMALIZE_WHITESPACE + ROOT + | + S + _______________|_________ + | VP + | _________|___ + | | PP + | | ________|___ + NP | | NP + ____|_________ | | _______|____ + DT JJ JJ NN VBZ IN DT JJ NN + | | | | | | | | | + the quick grey wolf jumps over the lazy fox + + >>> (parse_dog, ), (parse_friends, ) = parser.parse_sents( + ... [ + ... "I 'm a dog".split(), + ... "This is my friends ' cat ( the tabby )".split(), + ... ] + ... ) + >>> parse_dog.pretty_print() # doctest: +NORMALIZE_WHITESPACE + ROOT + | + S + _______|____ + | VP + | ________|___ + NP | NP + | | ___|___ + PRP VBP DT NN + | | | | + I 'm a dog + + >>> parse_friends.pretty_print() # doctest: +NORMALIZE_WHITESPACE + ROOT + | + SINV + ________________|________________ + S | + ________|______ | + | VP | + | __________|_____ | + | | NP VP + | | _____|________ _______|____ + NP | NP | VBD NP + | | ______|_________ | | ________|____ + DT VBZ PRP$ NNS POS NN DT JJ NN + | | | | | | | | | | + This is my friends ' cat ... the tabby ... >>> sum([list(dep_graphs) for dep_graphs in parser.tagged_parse_sents(( ... ( @@ -393,10 +477,8 @@ def _make_tree(self, result): def setup_module(module): from nose import SkipTest - try: - StanfordParser( - model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz' + if not requests.get('http://localhost:9000').ok: + raise SkipTest( + 'Doctests from nltk.parse.stanford are skipped because ' + 'one of the CoreNLP server is not available.' ) - StanfordNeuralDependencyParser() - except LookupError: - raise SkipTest('doctests from nltk.parse.stanford are skipped because one of the stanford parser or CoreNLP jars doesn\'t exist') diff --git a/setup.py b/setup.py index 59d5436659..94da254c7f 100644 --- a/setup.py +++ b/setup.py @@ -70,6 +70,9 @@ ], package_data = {'nltk': ['test/*.doctest', 'VERSION', 'sentiment/vader_lexicon.txt']}, # install_requires = ['six>=1.9.0'], + extras_require={ + 'stanford-corenl': ['requests'], + }, packages = find_packages(), zip_safe=False, # since normal files will be present too? 
) From 5dd7841b0f184b6b3b3bc6bf5d3a13a216df71f5 Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Tue, 12 Jan 2016 23:19:18 +0000 Subject: [PATCH 02/32] Typos --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 94da254c7f..2f42907669 100644 --- a/setup.py +++ b/setup.py @@ -71,7 +71,7 @@ package_data = {'nltk': ['test/*.doctest', 'VERSION', 'sentiment/vader_lexicon.txt']}, # install_requires = ['six>=1.9.0'], extras_require={ - 'stanford-corenl': ['requests'], + 'stanford-corenlp': ['requests'], }, packages = find_packages(), zip_safe=False, # since normal files will be present too? From c1be4868574749ffee81cc3286de3f1edc211afa Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Wed, 13 Jan 2016 13:03:04 +0000 Subject: [PATCH 03/32] Dependency parser. --- nltk/parse/stanford.py | 206 ++++++++++++++++++++++++----------------- 1 file changed, 122 insertions(+), 84 deletions(-) diff --git a/nltk/parse/stanford.py b/nltk/parse/stanford.py index 5ae3ce1d6b..d9e76bb860 100644 --- a/nltk/parse/stanford.py +++ b/nltk/parse/stanford.py @@ -142,7 +142,9 @@ def raw_parse_sents(self, sentences, verbose=False, tokenize_whitespace=False): for sentence in sentences: properties = { - 'annotators': 'tokenize,pos,parse', + 'annotators': 'tokenize,pos,lemma,{parser_annotator}'.format( + parser_annotator=self.parser_annotator, + ), 'outputFormat': 'json', 'tokenize.options': 'normalizeParentheses=true', } @@ -163,7 +165,7 @@ def raw_parse_sents(self, sentences, verbose=False, tokenize_whitespace=False): parsed_data = response.json() assert len(parsed_data['sentences']) == 1 - tree = Tree.fromstring(parsed_data['sentences'][0]['parse']) + tree = self.make_tree(parsed_data['sentences'][0]) yield iter([tree]) @@ -232,7 +234,7 @@ def _execute(self, cmd, input_, verbose=False): cmd.append(input_file.name) stdout, stderr = java(cmd, classpath=self._classpath, stdout=PIPE, stderr=PIPE) - + stdout = stdout.replace(b'\xc2\xa0',b' ') stdout = stdout.replace(b'\xa0',b' ') stdout = stdout.decode(encoding) @@ -351,46 +353,107 @@ class StanfordParser(GenericStanfordParser): ... ("dog", "NN"), ... (".", "."), ... ), - ... ))],[]) # doctest: +NORMALIZE_WHITESPACE + ... ))],[]) # doctest: +NORMALIZE_WHITESPACE [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('DT', ['The']), Tree('JJ', ['quick']), Tree('JJ', ['brown']), Tree('NN', ['fox'])]), Tree('VP', [Tree('VBD', ['jumped']), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])]), Tree('.', ['.'])])])] """ _OUTPUT_FORMAT = 'penn' + parser_annotator = 'parse' - def _make_tree(self, result): - return Tree.fromstring(result) + def make_tree(self, result): + return Tree.fromstring(result['parse']) class StanfordDependencyParser(GenericStanfordParser): """ - >>> dep_parser=StanfordDependencyParser( - ... model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz" - ... 
) - - >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE - [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])])] - - >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE - [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')), - ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')), - ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')), - ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]] + >>> dep_parser=StanfordDependencyParser() - >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents(( - ... "The quick brown fox jumps over the lazy dog.", - ... "The quick grey wolf jumps over the lazy fox." - ... ))], []) # doctest: +NORMALIZE_WHITESPACE - [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])]), - Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']), Tree('fox', ['over', 'the', 'lazy'])])] - - >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents(( - ... "I 'm a dog".split(), - ... "This is my friends ' cat ( the tabby )".split(), - ... ))], []) # doctest: +NORMALIZE_WHITESPACE - [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends', ['my', "'"]), Tree('tabby', ['the'])])] + >>> parse, = dep_parser.raw_parse( + ... 'The quick brown fox jumps over the lazy dog.' + ... ) + >>> print(parse.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE + The DT 4 det + quick JJ 4 amod + brown JJ 4 amod + fox NN 5 nsubj + jumps VBZ 0 ROOT + over IN 9 case + the DT 9 det + lazy JJ 9 amod + dog NN 5 nmod + . . 5 punct + + >>> print(parse.tree()) # doctest: +NORMALIZE_WHITESPACE + (jumps (fox The quick brown) (dog over the lazy) .) + + >>> for governor, dep, dependent in parse.triples(): + ... print(governor, dep, dependent) # doctest: +NORMALIZE_WHITESPACE + ('jumps', 'VBZ') nsubj ('fox', 'NN') + ('fox', 'NN') det ('The', 'DT') + ('fox', 'NN') amod ('quick', 'JJ') + ('fox', 'NN') amod ('brown', 'JJ') + ('jumps', 'VBZ') nmod ('dog', 'NN') + ('dog', 'NN') case ('over', 'IN') + ('dog', 'NN') det ('the', 'DT') + ('dog', 'NN') amod ('lazy', 'JJ') + ('jumps', 'VBZ') punct ('.', '.') + + >>> (parse_fox, ), (parse_dog, ) = dep_parser.raw_parse_sents( + ... [ + ... 'The quick brown fox jumps over the lazy dog.', + ... 'The quick grey wolf jumps over the lazy fox.', + ... ] + ... ) + >>> print(parse_fox.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE + The DT 4 det + quick JJ 4 amod + brown JJ 4 amod + fox NN 5 nsubj + jumps VBZ 0 ROOT + over IN 9 case + the DT 9 det + lazy JJ 9 amod + dog NN 5 nmod + . . 5 punct + + >>> print(parse_dog.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE + The DT 4 det + quick JJ 4 amod + grey JJ 4 amod + wolf NN 5 nsubj + jumps VBZ 0 ROOT + over IN 9 case + the DT 9 det + lazy JJ 9 amod + fox NN 5 nmod + . . 5 punct + + >>> (parse_dog, ), (parse_friends, ) = dep_parser.parse_sents( + ... [ + ... "I 'm a dog".split(), + ... "This is my friends ' cat ( the tabby )".split(), + ... ] + ... 
) + >>> print(parse_dog.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE + I PRP 4 nsubj + 'm VBP 4 cop + a DT 4 det + dog NN 0 ROOT + + >>> print(parse_friends.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE + This DT 6 nsubj + is VBZ 6 cop + my PRP$ 4 nmod:poss + friends NNS 6 nmod:poss + ' POS 4 case + cat NN 0 ROOT + ( VBD 6 acl + the DT 10 det + tabby JJ 10 amod + ) NN 7 dobj >>> sum([[list(parse.triples()) for parse in dep_graphs] for dep_graphs in dep_parser.tagged_parse_sents(( ... ( @@ -414,65 +477,40 @@ class StanfordDependencyParser(GenericStanfordParser): """ _OUTPUT_FORMAT = 'conll2007' + parser_annotator = 'depparse' - def _make_tree(self, result): - return DependencyGraph(result, top_relation_label='root') - - -class StanfordNeuralDependencyParser(GenericStanfordParser): - ''' - >>> from nltk.parse.stanford import StanfordNeuralDependencyParser - >>> dep_parser=StanfordNeuralDependencyParser() - - >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE - [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])])] + def make_tree(self, result): - >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE - [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')), - ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')), - ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')), - ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]] + return DependencyGraph( + ( + ' '.join(items) # NLTK expects an iterable of strings... + for n, *items in sorted(transform(result)) + ), + ) - >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents(( - ... "The quick brown fox jumps over the lazy dog.", - ... "The quick grey wolf jumps over the lazy fox." - ... ))], []) # doctest: +NORMALIZE_WHITESPACE - [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])]), - Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']), Tree('fox', ['over', 'the', 'lazy'])])] - - >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents(( - ... "I 'm a dog".split(), - ... "This is my friends ' cat ( the tabby )".split(), - ... ))], []) # doctest: +NORMALIZE_WHITESPACE - [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends', ['my', "'"]), Tree('tabby', ['the'])])] - ''' - - _OUTPUT_FORMAT = 'conll' - _MAIN_CLASS = 'edu.stanford.nlp.pipeline.StanfordCoreNLP' - _JAR = r'stanford-corenlp-(\d+)(\.(\d+))+\.jar' - _MODEL_JAR_PATTERN = r'stanford-corenlp-(\d+)(\.(\d+))+-models\.jar' - _USE_STDIN = True - _DOUBLE_SPACED_OUTPUT = True - - def __init__(self, *args, **kwargs): - super(StanfordNeuralDependencyParser, self).__init__(*args, **kwargs) - self.corenlp_options += '-annotators tokenize,ssplit,pos,depparse' - def tagged_parse_sents(self, sentences, verbose=False): - ''' - Currently unimplemented because the neural dependency parser (and - the StanfordCoreNLP pipeline class) doesn't support passing in pre- - tagged tokens. - ''' - raise NotImplementedError( - 'tagged_parse[_sents] is not supported by ' - 'StanfordNeuralDependencyParser; use ' - 'parse[_sents] or raw_parse[_sents] instead.' 
+def transform(sentence): + for dependency in sentence['basic-dependencies']: + + dependent_index = dependency['dependent'] + token = sentence['tokens'][dependent_index - 1] + + # Return values we don't know as '_'. Also, consider tag and ctag to be + # equal. + yield ( + dependent_index, + '_', + token['word'], + token['lemma'], + token['pos'], + token['pos'], + '_', + str(dependency['governor']), + dependency['dep'], + '_', + '_', ) - def _make_tree(self, result): - return DependencyGraph(result, top_relation_label='ROOT') - def setup_module(module): from nose import SkipTest @@ -480,5 +518,5 @@ def setup_module(module): if not requests.get('http://localhost:9000').ok: raise SkipTest( 'Doctests from nltk.parse.stanford are skipped because ' - 'one of the CoreNLP server is not available.' + 'the CoreNLP server is not available.' ) From f87126cf14d3acb3d5a2bb20bfb81e5824f95ab1 Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Wed, 13 Jan 2016 13:58:37 +0000 Subject: [PATCH 04/32] parse_text() --- nltk/parse/stanford.py | 87 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 79 insertions(+), 8 deletions(-) diff --git a/nltk/parse/stanford.py b/nltk/parse/stanford.py index d9e76bb860..c0dfaaf908 100644 --- a/nltk/parse/stanford.py +++ b/nltk/parse/stanford.py @@ -127,7 +127,13 @@ def raw_parse(self, sentence, verbose=False): """ return next(self.raw_parse_sents([sentence], verbose)) - def raw_parse_sents(self, sentences, verbose=False, tokenize_whitespace=False): + def raw_parse_sents( + self, + sentences, + verbose=False, + tokenize_whitespace=False, + sentence_split=False, + ): """Use StanfordParser to parse multiple sentences. Takes multiple sentences as a list of strings. Each sentence will be @@ -136,16 +142,19 @@ def raw_parse_sents(self, sentences, verbose=False, tokenize_whitespace=False): :param sentences: Input sentences to parse. :type sentences: list(str) :rtype: iter(iter(Tree)) + """ session = requests.Session() for sentence in sentences: properties = { - 'annotators': 'tokenize,pos,lemma,{parser_annotator}'.format( - parser_annotator=self.parser_annotator, + 'annotators': ','.join( + ['tokenize', 'pos', 'lemma', self.parser_annotator] + + (['ssplit'] if sentence_split else []) ), 'outputFormat': 'json', + # TODO: Does it work? 'tokenize.options': 'normalizeParentheses=true', } @@ -158,16 +167,22 @@ def raw_parse_sents(self, sentences, verbose=False, tokenize_whitespace=False): 'properties': json.dumps(properties), }, data=sentence, - ) + ) response.raise_for_status() parsed_data = response.json() - assert len(parsed_data['sentences']) == 1 - tree = self.make_tree(parsed_data['sentences'][0]) + if not sentence_split: + assert len(parsed_data['sentences']) == 1 - yield iter([tree]) + # TODO: Originally, we can return several parsers for a sentence. + # * We always return one wrapped in an iterator. + # * When text is split to sentences we return one parse per + # sentence, not an iterable of parses per sentence. + for parse in parsed_data['sentences']: + tree = self.make_tree(parse) + yield iter([tree]) def tagged_parse(self, sentence, verbose=False): """ @@ -246,6 +261,17 @@ def _execute(self, cmd, input_, verbose=False): return stdout + def parse_text(self, text): + """Parse a piece of text. + + The text might contain several sentences which will be split by CoreNLP. + + :param str text: text to be split. + :returns: an iterable of syntactic structures. # TODO: should it be an iterable of iterables. 
+ + """ + return self.raw_parse_sents([text], sentence_split=True) + class StanfordParser(GenericStanfordParser): """ @@ -274,6 +300,7 @@ class StanfordParser(GenericStanfordParser): ... 'the quick grey wolf jumps over the lazy fox', ... ] ... ) + >>> parse_fox.pretty_print() # doctest: +NORMALIZE_WHITESPACE ROOT | @@ -310,6 +337,7 @@ class StanfordParser(GenericStanfordParser): ... "This is my friends ' cat ( the tabby )".split(), ... ] ... ) + >>> parse_dog.pretty_print() # doctest: +NORMALIZE_WHITESPACE ROOT | @@ -340,6 +368,34 @@ class StanfordParser(GenericStanfordParser): | | | | | | | | | | This is my friends ' cat ... the tabby ... + >>> (parse_john, ), (parse_mary, ) = parser.parse_text( + ... 'John loves Mary. Mary walks.' + ... ) + + >>> parse_john.pretty_print() # doctest: +NORMALIZE_WHITESPACE + ROOT + | + S + _____|_____________ + | VP | + | ____|___ | + NP | NP | + | | | | + NNP VBZ NNP . + | | | | + John loves Mary . + + >>> parse_mary.pretty_print() # doctest: +NORMALIZE_WHITESPACE + ROOT + | + S + _____|____ + NP VP | + | | | + NNP VBZ . + | | | + Mary walks . + >>> sum([list(dep_graphs) for dep_graphs in parser.tagged_parse_sents(( ... ( ... ("The", "DT"), @@ -369,7 +425,7 @@ def make_tree(self, result): class StanfordDependencyParser(GenericStanfordParser): """ - >>> dep_parser=StanfordDependencyParser() + >>> dep_parser = StanfordDependencyParser() >>> parse, = dep_parser.raw_parse( ... 'The quick brown fox jumps over the lazy dog.' @@ -455,6 +511,21 @@ class StanfordDependencyParser(GenericStanfordParser): tabby JJ 10 amod ) NN 7 dobj + >>> (parse_john, ), (parse_mary, ) = dep_parser.parse_text( + ... 'John loves Mary. Mary walks.' + ... ) + + >>> print(parse_john.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE + John NNP 2 nsubj + loves VBZ 0 ROOT + Mary NNP 2 dobj + . . 2 punct + + >>> print(parse_mary.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE + Mary NNP 2 nsubj + walks VBZ 0 ROOT + . . 2 punct + >>> sum([[list(parse.triples()) for parse in dep_graphs] for dep_graphs in dep_parser.tagged_parse_sents(( ... ( ... ("The", "DT"), From 79b5456f429d18cffc7715c0dec5e3b28c468904 Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Wed, 13 Jan 2016 22:56:06 +0000 Subject: [PATCH 05/32] Fixing special cases found in the MS paraphrase corpus. --- nltk/parse/stanford.py | 46 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/nltk/parse/stanford.py b/nltk/parse/stanford.py index c0dfaaf908..c46b9cf2c5 100644 --- a/nltk/parse/stanford.py +++ b/nltk/parse/stanford.py @@ -149,13 +149,13 @@ def raw_parse_sents( for sentence in sentences: properties = { - 'annotators': ','.join( - ['tokenize', 'pos', 'lemma', self.parser_annotator] + - (['ssplit'] if sentence_split else []) + 'annotators': 'tokenize,pos,lemma,ssplit,{parser_annotator}'.format( + parser_annotator=self.parser_annotator, ), 'outputFormat': 'json', + 'ssplit.isOneSentence': 'false' if sentence_split else 'true', # TODO: Does it work? - 'tokenize.options': 'normalizeParentheses=true', + 'tokenize.options': 'normalizeParentheses=true,normalizeOtherBrackets=true', } if tokenize_whitespace: @@ -166,7 +166,7 @@ def raw_parse_sents( params={ 'properties': json.dumps(properties), }, - data=sentence, + data=sentence.encode('utf-8'), ) response.raise_for_status() @@ -396,6 +396,26 @@ class StanfordParser(GenericStanfordParser): | | | Mary walks . + Special cases + ------------- + + >>> next( + ... parser.raw_parse( + ... 
'NASIRIYA, Iraq—Iraqi doctors who treated former prisoner of war ' + ... 'Jessica Lynch have angrily dismissed claims made in her biography ' + ... 'that she was raped by her Iraqi captors.' + ... ) + ... ).height() + 17 + + >>> next( + ... parser.raw_parse( + ... "The broader Standard & Poor's 500 Index <.SPX> was 0.46 points lower, or " + ... '0.05 percent, at 997.02.' + ... ) + ... ).height() + 10 + >>> sum([list(dep_graphs) for dep_graphs in parser.tagged_parse_sents(( ... ( ... ("The", "DT"), @@ -526,6 +546,21 @@ class StanfordDependencyParser(GenericStanfordParser): walks VBZ 0 ROOT . . 2 punct + Special cases + ------------- + + Non-breaking space inside of a token. + + >>> len( + ... next( + ... dep_parser.raw_parse( + ... 'Anhalt said children typically treat a 20-ounce soda bottle as one ' + ... 'serving, while it actually contains 2 1/2 servings.' + ... ) + ... ).nodes + ... ) + 21 + >>> sum([[list(parse.triples()) for parse in dep_graphs] for dep_graphs in dep_parser.tagged_parse_sents(( ... ( ... ("The", "DT"), @@ -557,6 +592,7 @@ def make_tree(self, result): ' '.join(items) # NLTK expects an iterable of strings... for n, *items in sorted(transform(result)) ), + cell_separator=' ', # To make sure that a non-breaking space is kept inside of a token. ) From 52bde24d00b11a2c73450e67dc88618c8e52af82 Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Fri, 15 Jan 2016 15:00:29 +0000 Subject: [PATCH 06/32] Moving the api call code to a separate method. --- nltk/parse/stanford.py | 206 ++++++++++++++++++++--------------------- 1 file changed, 100 insertions(+), 106 deletions(-) diff --git a/nltk/parse/stanford.py b/nltk/parse/stanford.py index c46b9cf2c5..8c42e8419a 100644 --- a/nltk/parse/stanford.py +++ b/nltk/parse/stanford.py @@ -37,10 +37,12 @@ class GenericStanfordParser(ParserI): _USE_STDIN = False _DOUBLE_SPACED_OUTPUT = False - def __init__(self, path_to_jar=None, path_to_models_jar=None, - model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz', - encoding='utf8', verbose=False, - java_options='-mx1000m', corenlp_options=''): + def __init__(self, url='http://localhost:9000', encoding='utf8'): + + self.url = url + self.encoding = encoding + + self.session = requests.Session() return @@ -95,27 +97,25 @@ def _parse_trees_output(self, output_): blank = False return iter(res) - def parse_sents(self, sentences, verbose=False): + def parse_sents(self, sentences, *args, **kwargs): """Parse multiple sentences. - Takes multiple sentences as a list where each sentence is a list of words. Each sentence will be automatically tagged with this StanfordParser instance's tagger. - If whitespaces exists inside a token, then the token will be treated as - separate tokens. + If a whitespace exists inside a token, then the token will be treated as + several tokens. :param sentences: Input sentences to parse :type sentences: list(list(str)) :rtype: iter(iter(Tree)) """ - sentences = [' '.join(words) for words in sentences] - - return self.raw_parse_sents(sentences, verbose=False, tokenize_whitespace=True) + sentences = (' '.join(words) for words in sentences) + return self.raw_parse_sents(sentences, *args, **kwargs) - def raw_parse(self, sentence, verbose=False): + def raw_parse(self, sentence, properties=None, *args, **kwargs): """ Use StanfordParser to parse a sentence. 
Takes a sentence as a string; before parsing, it will be automatically tokenized and tagged by @@ -125,14 +125,56 @@ def raw_parse(self, sentence, verbose=False): :type sentence: str :rtype: iter(Tree) """ - return next(self.raw_parse_sents([sentence], verbose)) + if properties is None: + properties = {} + + default_properties = { + 'tokenize.whitespace': 'false', + } + + default_properties.update(properties) + + return next( + self.raw_parse_sents( + [sentence], + properties=default_properties, + *args, + **kwargs + ) + ) + + def api_call(self, data, properties=None): + if properties is None: + properties = {} + + default_properties = { + 'outputFormat': 'json', + 'annotators': 'tokenize,pos,lemma,ssplit,{parser_annotator}'.format( + parser_annotator=self.parser_annotator, + ), + } + + default_properties.update(properties) + + response = self.session.post( + self.url, + params={ + 'properties': json.dumps(default_properties), + }, + data=data.encode(self.encoding), + ) + + response.raise_for_status() + + return response.json() def raw_parse_sents( self, sentences, verbose=False, - tokenize_whitespace=False, - sentence_split=False, + properties=None, + *args, + **kwargs ): """Use StanfordParser to parse multiple sentences. @@ -144,42 +186,20 @@ def raw_parse_sents( :rtype: iter(iter(Tree)) """ - session = requests.Session() + if properties is None: + properties = {} - for sentence in sentences: - - properties = { - 'annotators': 'tokenize,pos,lemma,ssplit,{parser_annotator}'.format( - parser_annotator=self.parser_annotator, - ), - 'outputFormat': 'json', - 'ssplit.isOneSentence': 'false' if sentence_split else 'true', - # TODO: Does it work? - 'tokenize.options': 'normalizeParentheses=true,normalizeOtherBrackets=true', - } - - if tokenize_whitespace: - properties['tokenize.whitespace'] = 'true' - - response = session.post( - 'http://localhost:9000', - params={ - 'properties': json.dumps(properties), - }, - data=sentence.encode('utf-8'), - ) + default_properties = { + 'ssplit.isOneSentence': 'true', + } - response.raise_for_status() + default_properties.update(properties) - parsed_data = response.json() + for sentence in sentences: + parsed_data = self.api_call(sentence, properties=default_properties) - if not sentence_split: - assert len(parsed_data['sentences']) == 1 + assert len(parsed_data['sentences']) == 1 - # TODO: Originally, we can return several parsers for a sentence. - # * We always return one wrapped in an iterator. - # * When text is split to sentences we return one parse per - # sentence, not an iterable of parses per sentence. for parse in parsed_data['sentences']: tree = self.make_tree(parse) yield iter([tree]) @@ -261,7 +281,7 @@ def _execute(self, cmd, input_, verbose=False): return stdout - def parse_text(self, text): + def parse_text(self, text, properties=None, *args, **kwargs): """Parse a piece of text. The text might contain several sentences which will be split by CoreNLP. @@ -270,7 +290,17 @@ def parse_text(self, text): :returns: an iterable of syntactic structures. # TODO: should it be an iterable of iterables. 
""" - return self.raw_parse_sents([text], sentence_split=True) + if properties is None: + properties is {} + + default_properties = { + '' + } + + parsed_data = self.api_call(text, properties=properties, *args, **kwargs) + + for parse in parsed_data['sentences']: + yield self.make_tree(parse) class StanfordParser(GenericStanfordParser): @@ -352,23 +382,23 @@ class StanfordParser(GenericStanfordParser): I 'm a dog >>> parse_friends.pretty_print() # doctest: +NORMALIZE_WHITESPACE - ROOT - | - SINV - ________________|________________ - S | - ________|______ | - | VP | - | __________|_____ | - | | NP VP - | | _____|________ _______|____ - NP | NP | VBD NP - | | ______|_________ | | ________|____ - DT VBZ PRP$ NNS POS NN DT JJ NN - | | | | | | | | | | - This is my friends ' cat ... the tabby ... - - >>> (parse_john, ), (parse_mary, ) = parser.parse_text( + ROOT + | + S + ____|___________ + | VP + | ___________|_____________ + | | NP + | | _______|_________ + | | NP PRN + | | _____|_______ ____|______________ + NP | NP | | NP | + | | ______|_________ | | ___|____ | + DT VBZ PRP$ NNS POS NN -LRB- DT NN -RRB- + | | | | | | | | | | + This is my friends ' cat -LRB- the tabby -RRB- + + >>> parse_john, parse_mary, = parser.parse_text( ... 'John loves Mary. Mary walks.' ... ) @@ -416,23 +446,6 @@ class StanfordParser(GenericStanfordParser): ... ).height() 10 - >>> sum([list(dep_graphs) for dep_graphs in parser.tagged_parse_sents(( - ... ( - ... ("The", "DT"), - ... ("quick", "JJ"), - ... ("brown", "JJ"), - ... ("fox", "NN"), - ... ("jumped", "VBD"), - ... ("over", "IN"), - ... ("the", "DT"), - ... ("lazy", "JJ"), - ... ("dog", "NN"), - ... (".", "."), - ... ), - ... ))],[]) # doctest: +NORMALIZE_WHITESPACE - [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('DT', ['The']), Tree('JJ', ['quick']), Tree('JJ', ['brown']), - Tree('NN', ['fox'])]), Tree('VP', [Tree('VBD', ['jumped']), Tree('PP', [Tree('IN', ['over']), Tree('NP', - [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])]), Tree('.', ['.'])])])] """ _OUTPUT_FORMAT = 'penn' @@ -526,12 +539,12 @@ class StanfordDependencyParser(GenericStanfordParser): friends NNS 6 nmod:poss ' POS 4 case cat NN 0 ROOT - ( VBD 6 acl - the DT 10 det - tabby JJ 10 amod - ) NN 7 dobj + -LRB- -LRB- 9 punct + the DT 9 det + tabby NN 6 appos + -RRB- -RRB- 9 punct - >>> (parse_john, ), (parse_mary, ) = dep_parser.parse_text( + >>> parse_john, parse_mary, = dep_parser.parse_text( ... 'John loves Mary. Mary walks.' ... ) @@ -561,25 +574,6 @@ class StanfordDependencyParser(GenericStanfordParser): ... ) 21 - >>> sum([[list(parse.triples()) for parse in dep_graphs] for dep_graphs in dep_parser.tagged_parse_sents(( - ... ( - ... ("The", "DT"), - ... ("quick", "JJ"), - ... ("brown", "JJ"), - ... ("fox", "NN"), - ... ("jumped", "VBD"), - ... ("over", "IN"), - ... ("the", "DT"), - ... ("lazy", "JJ"), - ... ("dog", "NN"), - ... (".", "."), - ... ), - ... 
))],[]) # doctest: +NORMALIZE_WHITESPACE - [[((u'jumped', u'VBD'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')), - ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')), - ((u'jumped', u'VBD'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')), - ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]] - """ _OUTPUT_FORMAT = 'conll2007' From 0b2e57a1455b026f975fb3f691f844a59b21cbbd Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Sat, 16 Jan 2016 16:44:43 +0000 Subject: [PATCH 07/32] Playing with properties. --- nltk/parse/stanford.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/nltk/parse/stanford.py b/nltk/parse/stanford.py index 8c42e8419a..f0535ef0c6 100644 --- a/nltk/parse/stanford.py +++ b/nltk/parse/stanford.py @@ -125,14 +125,11 @@ def raw_parse(self, sentence, properties=None, *args, **kwargs): :type sentence: str :rtype: iter(Tree) """ - if properties is None: - properties = {} - default_properties = { 'tokenize.whitespace': 'false', } - default_properties.update(properties) + default_properties.update(properties or {}) return next( self.raw_parse_sents( @@ -144,9 +141,6 @@ def raw_parse(self, sentence, properties=None, *args, **kwargs): ) def api_call(self, data, properties=None): - if properties is None: - properties = {} - default_properties = { 'outputFormat': 'json', 'annotators': 'tokenize,pos,lemma,ssplit,{parser_annotator}'.format( @@ -154,7 +148,7 @@ def api_call(self, data, properties=None): ), } - default_properties.update(properties) + default_properties.update(properties or {}) response = self.session.post( self.url, @@ -186,14 +180,11 @@ def raw_parse_sents( :rtype: iter(iter(Tree)) """ - if properties is None: - properties = {} - default_properties = { 'ssplit.isOneSentence': 'true', } - default_properties.update(properties) + default_properties.update(properties or {}) for sentence in sentences: parsed_data = self.api_call(sentence, properties=default_properties) From a33580aeb86a640c19b57d4bbd479b2524caf759 Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Sat, 16 Jan 2016 16:45:45 +0000 Subject: [PATCH 08/32] StanfordParser is also a tokeniser. --- nltk/parse/stanford.py | 34 ++++++++++++++- nltk/tokenize/stanford.py | 92 ++++++++------------------------------- 2 files changed, 51 insertions(+), 75 deletions(-) diff --git a/nltk/parse/stanford.py b/nltk/parse/stanford.py index f0535ef0c6..c1628bc925 100644 --- a/nltk/parse/stanford.py +++ b/nltk/parse/stanford.py @@ -21,13 +21,14 @@ from nltk.internals import find_jar_iter, config_java, java, _java_options from nltk.parse.api import ParserI +from nltk.tokenize.api import TokenizerI from nltk.parse.dependencygraph import DependencyGraph from nltk.tree import Tree _stanford_url = 'http://nlp.stanford.edu/software/lex-parser.shtml' -class GenericStanfordParser(ParserI): +class GenericStanfordParser(ParserI, TokenizerI): """Interface to the Stanford Parser""" _MODEL_JAR_PATTERN = r'stanford-parser-(\d+)(\.(\d+))+-models\.jar' @@ -293,6 +294,37 @@ def parse_text(self, text, properties=None, *args, **kwargs): for parse in parsed_data['sentences']: yield self.make_tree(parse) + def tokenize(self, text, properties=None): + """Tokenize a string of text. + + >>> parser = StanfordParser() + + >>> text = 'Good muffins cost $3.88\\nin New York. Please buy me\\ntwo of them.\\nThanks.' 
+ >>> list(parser.tokenize(text)) + ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] + >>> s = "The colour of the wall is blue." + + >>> list( + ... parser.tokenize( + ... 'The colour of the wall is blue.', + ... properties={'tokenize.options': 'americanize=true'}, + ... ) + ... ) + ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.'] + + """ + default_properties = { + 'annotators': 'tokenize,ssplit', + } + + default_properties.update(properties or {}) + + result = self.api_call(text, properties=default_properties) + + for sentence in result['sentences']: + for token in sentence['tokens']: + yield token['originalText'] + class StanfordParser(GenericStanfordParser): """ diff --git a/nltk/tokenize/stanford.py b/nltk/tokenize/stanford.py index 74a18e5633..d95eb2786f 100644 --- a/nltk/tokenize/stanford.py +++ b/nltk/tokenize/stanford.py @@ -8,22 +8,13 @@ # For license information, see LICENSE.TXT from __future__ import unicode_literals, print_function - -import tempfile -import os -import json -from subprocess import PIPE - -from nltk import compat -from nltk.internals import find_jar, config_java, java, _java_options +import warnings from nltk.tokenize.api import TokenizerI -_stanford_url = 'http://nlp.stanford.edu/software/tokenizer.shtml' class StanfordTokenizer(TokenizerI): - r""" - Interface to the Stanford Tokenizer + r"""Interface to the Stanford Tokenizer. >>> from nltk.tokenize import StanfordTokenizer >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks." @@ -32,78 +23,31 @@ class StanfordTokenizer(TokenizerI): >>> s = "The colour of the wall is blue." >>> StanfordTokenizer(options={"americanize": True}).tokenize(s) ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.'] - """ - _JAR = 'stanford-postagger.jar' + """ - def __init__(self, path_to_jar=None, encoding='utf8', options=None, verbose=False, java_options='-mx1000m'): - self._stanford_jar = find_jar( - self._JAR, path_to_jar, - env_vars=('STANFORD_POSTAGGER',), - searchpath=(), url=_stanford_url, - verbose=verbose + def __init__(self, options=None, *args, **kwargs): + warnings.warn( + 'StanfordTokenizer is deprecated, use nltk.parse.stanford.StanfordParser instead.', + DeprecationWarning, ) - self._encoding = encoding - self.java_options = java_options - - options = {} if options is None else options - self._options_cmd = ','.join('{0}={1}'.format(key, val) for key, val in options.items()) + self.options = options or {} - @staticmethod - def _parse_tokenized_output(s): - return s.splitlines() + from nltk.parse.stanford import StanfordParser + self.parser = StanfordParser(*args, **kwargs) def tokenize(self, s): - """ - Use stanford tokenizer's PTBTokenizer to tokenize multiple sentences. - """ - cmd = [ - 'edu.stanford.nlp.process.PTBTokenizer', - ] - return self._parse_tokenized_output(self._execute(cmd, s)) - - def _execute(self, cmd, input_, verbose=False): - encoding = self._encoding - cmd.extend(['-charset', encoding]) - _options_cmd = self._options_cmd - if _options_cmd: - cmd.extend(['-options', self._options_cmd]) - - default_options = ' '.join(_java_options) - - # Configure java. - config_java(options=self.java_options, verbose=verbose) - - # Windows is incompatible with NamedTemporaryFile() without passing in delete=False. 
- with tempfile.NamedTemporaryFile(mode='wb', delete=False) as input_file: - # Write the actual sentences to the temporary input file - if isinstance(input_, compat.text_type) and encoding: - input_ = input_.encode(encoding) - input_file.write(input_) - input_file.flush() - - cmd.append(input_file.name) - - # Run the tagger and get the output. - stdout, stderr = java(cmd, classpath=self._stanford_jar, - stdout=PIPE, stderr=PIPE) - stdout = stdout.decode(encoding) - - os.unlink(input_file.name) - - # Return java configurations to their default values. - config_java(options=default_options, verbose=False) - - return stdout + properties = {'tokenize.options': 'americanize=true'} if self.options.get('americanize', False) else {} + return list(self.parser.tokenize(s, properties=properties)) def setup_module(module): from nose import SkipTest + import requests - try: - StanfordTokenizer() - except LookupError: - raise SkipTest('doctests from nltk.tokenize.stanford are skipped because the stanford postagger jar doesn\'t exist') - - + if not requests.get('http://localhost:9000').ok: + raise SkipTest( + 'Doctests from nltk.parse.stanford are skipped because ' + 'the CoreNLP server is not available.' + ) From f70fa55a10f7a6830c85760f94d4d70a3cae0478 Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Sat, 16 Jan 2016 17:14:51 +0000 Subject: [PATCH 09/32] A minor update --- nltk/tokenize/stanford.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nltk/tokenize/stanford.py b/nltk/tokenize/stanford.py index d95eb2786f..3815279f5d 100644 --- a/nltk/tokenize/stanford.py +++ b/nltk/tokenize/stanford.py @@ -48,6 +48,6 @@ def setup_module(module): if not requests.get('http://localhost:9000').ok: raise SkipTest( - 'Doctests from nltk.parse.stanford are skipped because ' + 'Doctests from nltk.tokenize.stanford are skipped because ' 'the CoreNLP server is not available.' ) From aaf90e3741f99638903f0b5fb84d11df560bbdc0 Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Sun, 17 Jan 2016 12:47:39 +0000 Subject: [PATCH 10/32] CoreNLP server. 
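
This commit introduces a CoreNLPServer class that locates the CoreNLP
jars, launches the Java server process, and exposes start() and stop()
together with context-manager support; the parser and tokenizer classes
become thin HTTP clients of that server, so a single long-lived JVM
serves many requests instead of a fresh subprocess and temporary file
per call. A minimal usage sketch (assuming the CoreNLP jars are
discoverable, e.g. through the STANFORD_CORENLP environment variable,
and that the hard-coded port 9000 is free):

    from nltk.parse.stanford import CoreNLPServer, StanfordParser

    # The Java process starts on __enter__ and is terminated on __exit__.
    with CoreNLPServer() as server:
        parser = StanfordParser(url=server.url)
        tree = next(parser.raw_parse('The quick brown fox jumps.'))
        tree.pretty_print()
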
--- nltk/parse/stanford.py | 237 +++++++++++++++++++++----------------- nltk/tokenize/stanford.py | 18 ++- 2 files changed, 142 insertions(+), 113 deletions(-) diff --git a/nltk/parse/stanford.py b/nltk/parse/stanford.py index c1628bc925..330464841e 100644 --- a/nltk/parse/stanford.py +++ b/nltk/parse/stanford.py @@ -13,6 +13,7 @@ import os import re import json +import time from subprocess import PIPE import requests @@ -28,75 +29,138 @@ _stanford_url = 'http://nlp.stanford.edu/software/lex-parser.shtml' -class GenericStanfordParser(ParserI, TokenizerI): - """Interface to the Stanford Parser""" - - _MODEL_JAR_PATTERN = r'stanford-parser-(\d+)(\.(\d+))+-models\.jar' - _JAR = r'stanford-parser\.jar' - _MAIN_CLASS = 'edu.stanford.nlp.parser.lexparser.LexicalizedParser' - - _USE_STDIN = False - _DOUBLE_SPACED_OUTPUT = False +class CoreNLPServerError(EnvironmentError): + """Exceptions assciated with the Core NLP server.""" - def __init__(self, url='http://localhost:9000', encoding='utf8'): - - self.url = url - self.encoding = encoding +class CoreNLPServer(object): - self.session = requests.Session() + _MODEL_JAR_PATTERN = r'stanford-corenlp-(\d+)\.(\d+)\.(\d+)-models\.jar' + _JAR = r'stanford-corenlp-(\d+)\.(\d+)\.(\d+)\.jar' - return + other_jars = ( + 'ejml-0.23.jar', + 'javax.json.jar', + 'joda-time.jar', + 'jollyday.jar', + 'protobuf.jar', + 'slf4j-api.jar', + 'slf4j-simple.jar', + 'xom.jar', + ) + def __init__( + self, path_to_jar=None, path_to_models_jar=None, verbose=False, + java_options='-mx4g', corenlp_options='' + ): # find the most recent code and model jar stanford_jar = max( find_jar_iter( - self._JAR, path_to_jar, + self._JAR, + path_to_jar, env_vars=('STANFORD_PARSER', 'STANFORD_CORENLP'), - searchpath=(), url=_stanford_url, - verbose=verbose, is_regex=True + searchpath=(), + url=_stanford_url, + verbose=verbose, + is_regex=True, ), key=lambda model_name: re.match(self._JAR, model_name) ) - model_jar=max( + # TODO: take a free random port. 
+ self.url = 'http://localhost:9000' + + model_jar = max( find_jar_iter( - self._MODEL_JAR_PATTERN, path_to_models_jar, + self._MODEL_JAR_PATTERN, + path_to_models_jar, env_vars=('STANFORD_MODELS', 'STANFORD_CORENLP'), - searchpath=(), url=_stanford_url, - verbose=verbose, is_regex=True + searchpath=(), + url=_stanford_url, + verbose=verbose, + is_regex=True, ), key=lambda model_name: re.match(self._MODEL_JAR_PATTERN, model_name) ) - self._classpath = (stanford_jar, model_jar) + self.other_jars = tuple( + next( + find_jar_iter( + jar, + None, + searchpath=(os.path.dirname(stanford_jar), ), + verbose=verbose, + is_regex=False, + ) + ) + for jar in self.other_jars + ) + + self.verbose = verbose + + self._classpath = (stanford_jar, model_jar) + self.other_jars - self.model_path = model_path - self._encoding = encoding self.corenlp_options = corenlp_options self.java_options = java_options - def _parse_trees_output(self, output_): - res = [] - cur_lines = [] - cur_trees = [] - blank = False - for line in output_.splitlines(False): - if line == '': - if blank: - res.append(iter(cur_trees)) - cur_trees = [] - blank = False - elif self._DOUBLE_SPACED_OUTPUT: - cur_trees.append(self._make_tree('\n'.join(cur_lines))) - cur_lines = [] - blank = True - else: - res.append(iter([self._make_tree('\n'.join(cur_lines))])) - cur_lines = [] - else: - cur_lines.append(line) - blank = False - return iter(res) + def start(self): + cmd = ['edu.stanford.nlp.pipeline.StanfordCoreNLPServer'] + + if self.corenlp_options: + cmd.append(self.corenlp_options) + + # Configure java. + default_options = ' '.join(_java_options) + config_java(options=self.java_options, verbose=self.verbose) + + try: + # TODO: it's probably a bad idea to pipe stdout, as it will + # accumulate when lots of text is being parsed. + self.popen = java( + cmd, + classpath=self._classpath, + blocking=False, + stderr='pipe', + ) + finally: + # Return java configurations to their default values. + config_java(options=default_options, verbose=self.verbose) + + # Check that the server is istill running. + # TODO: is there a better way of checking whether a server is ready to + # accept connections? + time.sleep(5) + returncode = self.popen.poll() + if returncode is not None: + _, stderrdata = self.popen.communicate() + raise CoreNLPServerError( + returncode, + 'Could not start the server. ' + 'The error was: {}'.format(stderrdata.decode('ascii')) + ) + + def stop(self): + self.popen.terminate() + self.popen.wait() + + def __enter__(self): + self.start() + + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.stop() + return False + + +class GenericStanfordParser(ParserI, TokenizerI): + """Interface to the Stanford Parser""" + + def __init__(self, url='http://localhost:9000', encoding='utf8'): + + self.url = url + self.encoding = encoding + + self.session = requests.Session() def parse_sents(self, sentences, *args, **kwargs): """Parse multiple sentences. @@ -233,63 +297,16 @@ def tagged_parse_sents(self, sentences, verbose=False): return self._parse_trees_output(self._execute( cmd, '\n'.join(' '.join(tag_separator.join(tagged) for tagged in sentence) for sentence in sentences), verbose)) - def _execute(self, cmd, input_, verbose=False): - encoding = self._encoding - cmd.extend(['-encoding', encoding]) - if self.corenlp_options: - cmd.append(self.corenlp_options) - - default_options = ' '.join(_java_options) - - # Configure java. 
- config_java(options=self.java_options, verbose=verbose) - - # Windows is incompatible with NamedTemporaryFile() without passing in delete=False. - with tempfile.NamedTemporaryFile(mode='wb', delete=False) as input_file: - # Write the actual sentences to the temporary input file - if isinstance(input_, compat.text_type) and encoding: - input_ = input_.encode(encoding) - input_file.write(input_) - input_file.flush() - - # Run the tagger and get the output. - if self._USE_STDIN: - input_file.seek(0) - stdout, stderr = java(cmd, classpath=self._classpath, - stdin=input_file, stdout=PIPE, stderr=PIPE) - else: - cmd.append(input_file.name) - stdout, stderr = java(cmd, classpath=self._classpath, - stdout=PIPE, stderr=PIPE) - - stdout = stdout.replace(b'\xc2\xa0',b' ') - stdout = stdout.replace(b'\xa0',b' ') - stdout = stdout.decode(encoding) - - os.unlink(input_file.name) - - # Return java configurations to their default values. - config_java(options=default_options, verbose=False) - - return stdout - - def parse_text(self, text, properties=None, *args, **kwargs): + def parse_text(self, text, *args, **kwargs): """Parse a piece of text. The text might contain several sentences which will be split by CoreNLP. :param str text: text to be split. - :returns: an iterable of syntactic structures. # TODO: should it be an iterable of iterables. + :returns: an iterable of syntactic structures. # TODO: should it be an iterable of iterables? """ - if properties is None: - properties is {} - - default_properties = { - '' - } - - parsed_data = self.api_call(text, properties=properties, *args, **kwargs) + parsed_data = self.api_call(text, *args, **kwargs) for parse in parsed_data['sentences']: yield self.make_tree(parse) @@ -297,17 +314,17 @@ def parse_text(self, text, properties=None, *args, **kwargs): def tokenize(self, text, properties=None): """Tokenize a string of text. - >>> parser = StanfordParser() + >>> parser = StanfordParser(url='http://localhost:9000') >>> text = 'Good muffins cost $3.88\\nin New York. Please buy me\\ntwo of them.\\nThanks.' >>> list(parser.tokenize(text)) ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] - >>> s = "The colour of the wall is blue." + >>> s = "The colour of the wall is blue." >>> list( ... parser.tokenize( ... 'The colour of the wall is blue.', - ... properties={'tokenize.options': 'americanize=true'}, + ... properties={'tokenize.options': 'americanize=true'}, ... ) ... ) ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.'] @@ -328,7 +345,7 @@ def tokenize(self, text, properties=None): class StanfordParser(GenericStanfordParser): """ - >>> parser=StanfordParser() + >>> parser = StanfordParser(url='http://localhost:9000') >>> next( ... parser.raw_parse('the quick brown fox jumps over the lazy dog') @@ -481,7 +498,7 @@ def make_tree(self, result): class StanfordDependencyParser(GenericStanfordParser): """ - >>> dep_parser = StanfordDependencyParser() + >>> dep_parser = StanfordDependencyParser(url='http://localhost:9000') >>> parse, = dep_parser.raw_parse( ... 'The quick brown fox jumps over the lazy dog.' @@ -639,8 +656,14 @@ def transform(sentence): def setup_module(module): from nose import SkipTest - if not requests.get('http://localhost:9000').ok: - raise SkipTest( - 'Doctests from nltk.parse.stanford are skipped because ' - 'the CoreNLP server is not available.' 
- ) + global server + server = CoreNLPServer() + + try: + server.start() + except CoreNLPServerError as e: + raise SkipTest('Skiping CoreNLP tests because the server could not be started. {}'.format(e.strerror)) + + +def teardown_module(module): + server.stop() diff --git a/nltk/tokenize/stanford.py b/nltk/tokenize/stanford.py index 3815279f5d..0fc1d41ac7 100644 --- a/nltk/tokenize/stanford.py +++ b/nltk/tokenize/stanford.py @@ -43,11 +43,17 @@ def tokenize(self, s): def setup_module(module): + from nltk.parse.stanford import CoreNLPServer, CoreNLPServerError from nose import SkipTest - import requests - if not requests.get('http://localhost:9000').ok: - raise SkipTest( - 'Doctests from nltk.tokenize.stanford are skipped because ' - 'the CoreNLP server is not available.' - ) + global server + server = CoreNLPServer() + + try: + server.start() + except CoreNLPServerError as e: + raise SkipTest('Skiping CoreNLP tests because the server could not be started. {}'.format(e.strerror)) + + +def teardown_module(module): + server.stop() From 75ce6cd05d4ee0e6b2c1005a6c17a0fbcc9f22a5 Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Tue, 19 Jan 2016 13:21:15 +0000 Subject: [PATCH 11/32] Options are iterables of strings --- nltk/parse/stanford.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nltk/parse/stanford.py b/nltk/parse/stanford.py index 330464841e..1664a0b1cf 100644 --- a/nltk/parse/stanford.py +++ b/nltk/parse/stanford.py @@ -50,7 +50,7 @@ class CoreNLPServer(object): def __init__( self, path_to_jar=None, path_to_models_jar=None, verbose=False, - java_options='-mx4g', corenlp_options='' + java_options=None, corenlp_options=None ): # find the most recent code and model jar stanford_jar = max( @@ -99,14 +99,14 @@ def __init__( self._classpath = (stanford_jar, model_jar) + self.other_jars - self.corenlp_options = corenlp_options - self.java_options = java_options + self.corenlp_options = corenlp_options or [] + self.java_options = java_options or ['-mx4g'] def start(self): cmd = ['edu.stanford.nlp.pipeline.StanfordCoreNLPServer'] if self.corenlp_options: - cmd.append(self.corenlp_options) + cmd.extend(self.corenlp_options) # Configure java. default_options = ' '.join(_java_options) From 9118408ebb6b3dbd92f0dbc44696e0d22c759e23 Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Wed, 2 Mar 2016 11:38:39 +0000 Subject: [PATCH 12/32] Pep8 --- nltk/parse/stanford.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nltk/parse/stanford.py b/nltk/parse/stanford.py index 1664a0b1cf..bc00a8fb15 100644 --- a/nltk/parse/stanford.py +++ b/nltk/parse/stanford.py @@ -9,16 +9,13 @@ from __future__ import unicode_literals -import tempfile import os import re import json import time -from subprocess import PIPE import requests -from nltk import compat from nltk.internals import find_jar_iter, config_java, java, _java_options from nltk.parse.api import ParserI @@ -32,6 +29,7 @@ class CoreNLPServerError(EnvironmentError): """Exceptions assciated with the Core NLP server.""" + class CoreNLPServer(object): _MODEL_JAR_PATTERN = r'stanford-corenlp-(\d+)\.(\d+)\.(\d+)-models\.jar' From ffdfbdd9bf3c8207f206f828681150f2b7dfe48f Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Wed, 2 Mar 2016 11:43:30 +0000 Subject: [PATCH 13/32] Find all jars that are in the same folder with sanford_jar. 
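
Rather than resolving a fixed list of dependency jars one by one, build
the classpath from everything that sits next to the main CoreNLP jar,
using the find_jars_within_path helper from nltk.internals. This should
keep the classpath correct when the CoreNLP distribution adds or
renames bundled jars. A sketch of the idea (the install path below is
hypothetical):

    import os
    from nltk.internals import find_jars_within_path

    stanford_jar = '/opt/corenlp/stanford-corenlp-3.6.0.jar'
    # Every jar found under the directory of the main jar joins the classpath.
    other_jars = tuple(find_jars_within_path(os.path.dirname(stanford_jar)))
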
--- nltk/parse/stanford.py | 26 ++------------------------ 1 file changed, 2 insertions(+), 24 deletions(-) diff --git a/nltk/parse/stanford.py b/nltk/parse/stanford.py index bc00a8fb15..28428a5f20 100644 --- a/nltk/parse/stanford.py +++ b/nltk/parse/stanford.py @@ -16,7 +16,7 @@ import requests -from nltk.internals import find_jar_iter, config_java, java, _java_options +from nltk.internals import find_jar_iter, config_java, java, _java_options, find_jars_within_path from nltk.parse.api import ParserI from nltk.tokenize.api import TokenizerI @@ -35,17 +35,6 @@ class CoreNLPServer(object): _MODEL_JAR_PATTERN = r'stanford-corenlp-(\d+)\.(\d+)\.(\d+)-models\.jar' _JAR = r'stanford-corenlp-(\d+)\.(\d+)\.(\d+)\.jar' - other_jars = ( - 'ejml-0.23.jar', - 'javax.json.jar', - 'joda-time.jar', - 'jollyday.jar', - 'protobuf.jar', - 'slf4j-api.jar', - 'slf4j-simple.jar', - 'xom.jar', - ) - def __init__( self, path_to_jar=None, path_to_models_jar=None, verbose=False, java_options=None, corenlp_options=None @@ -80,18 +69,7 @@ def __init__( key=lambda model_name: re.match(self._MODEL_JAR_PATTERN, model_name) ) - self.other_jars = tuple( - next( - find_jar_iter( - jar, - None, - searchpath=(os.path.dirname(stanford_jar), ), - verbose=verbose, - is_regex=False, - ) - ) - for jar in self.other_jars - ) + self.other_jars = tuple(find_jars_within_path(os.path.dirname(stanford_jar))) self.verbose = verbose From c3057a9665aff40d99ec4ad6f3cd0624a42e63cb Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Sat, 2 Jul 2016 17:06:03 +0100 Subject: [PATCH 14/32] =?UTF-8?q?CoreNLPServer=20tries=20to=20use=20port?= =?UTF-8?q?=209000=20by=20default.=20If=20it=E2=80=99s=20taken=20than=20a?= =?UTF-8?q?=20random=20port=20is=20used.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- nltk/parse/stanford.py | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/nltk/parse/stanford.py b/nltk/parse/stanford.py index 28428a5f20..c708594daa 100644 --- a/nltk/parse/stanford.py +++ b/nltk/parse/stanford.py @@ -13,6 +13,7 @@ import re import json import time +import socket import requests @@ -30,6 +31,16 @@ class CoreNLPServerError(EnvironmentError): """Exceptions assciated with the Core NLP server.""" +def try_port(port=0): + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.bind(('', port)) + + p = sock.getsockname()[1] + sock.close() + + return p + + class CoreNLPServer(object): _MODEL_JAR_PATTERN = r'stanford-corenlp-(\d+)\.(\d+)\.(\d+)-models\.jar' @@ -37,7 +48,7 @@ class CoreNLPServer(object): def __init__( self, path_to_jar=None, path_to_models_jar=None, verbose=False, - java_options=None, corenlp_options=None + java_options=None, corenlp_options=None, port=None, ): # find the most recent code and model jar stanford_jar = max( @@ -53,8 +64,20 @@ def __init__( key=lambda model_name: re.match(self._JAR, model_name) ) - # TODO: take a free random port. 
- self.url = 'http://localhost:9000' + if port is None: + try: + port = try_port(9000) + except socket.error: + port = try_port() + + if corenlp_options: + corenlp_options.append(str(port)) + else: + corenlp_options = [str(port)] + else: + try_port(port) + + self.url = 'http://localhost:{}'.format(port) model_jar = max( find_jar_iter( @@ -633,12 +656,16 @@ def setup_module(module): from nose import SkipTest global server - server = CoreNLPServer() + server = CoreNLPServer(port=9000) try: server.start() except CoreNLPServerError as e: - raise SkipTest('Skiping CoreNLP tests because the server could not be started. {}'.format(e.strerror)) + raise SkipTest( + 'Skiping CoreNLP tests because the server could not be started. ' + 'Make sure that the 9000 port is free. ' + '{}'.format(e.strerror) + ) def teardown_module(module): From 82fc351b72f2f02b198ae33b09b578d661657956 Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Sun, 10 Jul 2016 10:24:39 -0400 Subject: [PATCH 15/32] Java memory limit is set to 1GB. --- nltk/parse/stanford.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nltk/parse/stanford.py b/nltk/parse/stanford.py index 739c7dbaed..1d60546071 100644 --- a/nltk/parse/stanford.py +++ b/nltk/parse/stanford.py @@ -99,7 +99,7 @@ def __init__( self._classpath = (stanford_jar, model_jar) + self.other_jars self.corenlp_options = corenlp_options or [] - self.java_options = java_options or ['-mx4g'] + self.java_options = java_options or ['-mx1g'] def start(self): cmd = ['edu.stanford.nlp.pipeline.StanfordCoreNLPServer'] From ff1204d745c504d58a9b1775b836024408db72b2 Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Sun, 10 Jul 2016 10:24:56 -0400 Subject: [PATCH 16/32] Typos --- nltk/parse/stanford.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nltk/parse/stanford.py b/nltk/parse/stanford.py index 1d60546071..5e3e634f5a 100644 --- a/nltk/parse/stanford.py +++ b/nltk/parse/stanford.py @@ -28,7 +28,7 @@ class CoreNLPServerError(EnvironmentError): - """Exceptions assciated with the Core NLP server.""" + """Exceptions associated with the Core NLP server.""" def try_port(port=0): @@ -662,7 +662,7 @@ def setup_module(module): server.start() except CoreNLPServerError as e: raise SkipTest( - 'Skiping CoreNLP tests because the server could not be started. ' + 'Skipping CoreNLP tests because the server could not be started. ' 'Make sure that the 9000 port is free. ' '{}'.format(e.strerror) ) From 252e7cfe410840cd2b48fcc79dea9ac2f668c9c5 Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Sun, 10 Jul 2016 10:36:38 -0400 Subject: [PATCH 17/32] Typos and CoreNLPServer port is set to 9000 in tests. --- nltk/tokenize/stanford.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nltk/tokenize/stanford.py b/nltk/tokenize/stanford.py index 283a026622..05980c62ee 100644 --- a/nltk/tokenize/stanford.py +++ b/nltk/tokenize/stanford.py @@ -47,12 +47,12 @@ def setup_module(module): from nose import SkipTest global server - server = CoreNLPServer() + server = CoreNLPServer(port=9000) try: server.start() except CoreNLPServerError as e: - raise SkipTest('Skiping CoreNLP tests because the server could not be started. {}'.format(e.strerror)) + raise SkipTest('Skipping CoreNLP tests because the server could not be started. {}'.format(e.strerror)) def teardown_module(module): From 52a5bf0fc0ebee42f6b114e682f0577732c79d78 Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Sun, 10 Jul 2016 10:53:03 -0400 Subject: [PATCH 18/32] More tests. 
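
The new doctest feeds the dependency parser a sentence that ends in a
phone-number-like digit sequence. The expected node count is 10 -- nine
tokens plus the artificial root -- which means '01 111 555' has to survive
as a single token, much like the existing non-breaking-space special case.
In doctest-free form:

    parse, = dep_parser.raw_parse('This is not going to crash: 01 111 555.')
    assert len(parse.nodes) == 10  # 9 tokens + the root node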
--- nltk/parse/stanford.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/nltk/parse/stanford.py b/nltk/parse/stanford.py index 5e3e634f5a..84861fba58 100644 --- a/nltk/parse/stanford.py +++ b/nltk/parse/stanford.py @@ -613,6 +613,15 @@ class StanfordDependencyParser(GenericStanfordParser): ... ) 21 + Phone numbers. + + >>> len( + ... next( + ... dep_parser.raw_parse('This is not going to crash: 01 111 555.') + ... ).nodes + ... ) + 10 + """ _OUTPUT_FORMAT = 'conll2007' From 01a51bcbcc1adaa13c7416d04179f6bd096ca67e Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Thu, 29 Dec 2016 17:34:32 -0500 Subject: [PATCH 19/32] Revert StanfordParser. --- nltk/parse/stanford.py | 819 +++++++++++++------------------------- nltk/tokenize/stanford.py | 96 ++++- 2 files changed, 347 insertions(+), 568 deletions(-) diff --git a/nltk/parse/stanford.py b/nltk/parse/stanford.py index 84861fba58..e7bdfe013e 100644 --- a/nltk/parse/stanford.py +++ b/nltk/parse/stanford.py @@ -9,177 +9,117 @@ from __future__ import unicode_literals +import tempfile import os import re -import json -import time -import socket +import warnings +from subprocess import PIPE +from io import StringIO -import requests - -from nltk.internals import find_jar_iter, config_java, java, _java_options, find_jars_within_path +from nltk import compat +from nltk.internals import find_jar, find_jar_iter, config_java, java, _java_options, find_jars_within_path from nltk.parse.api import ParserI -from nltk.tokenize.api import TokenizerI from nltk.parse.dependencygraph import DependencyGraph from nltk.tree import Tree _stanford_url = 'http://nlp.stanford.edu/software/lex-parser.shtml' +class GenericStanfordParser(ParserI): + """Interface to the Stanford Parser""" -class CoreNLPServerError(EnvironmentError): - """Exceptions associated with the Core NLP server.""" - - -def try_port(port=0): - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.bind(('', port)) - - p = sock.getsockname()[1] - sock.close() - - return p - + _MODEL_JAR_PATTERN = r'stanford-parser-(\d+)(\.(\d+))+-models\.jar' + _JAR = r'stanford-parser\.jar' + _MAIN_CLASS = 'edu.stanford.nlp.parser.lexparser.LexicalizedParser' -class CoreNLPServer(object): + _USE_STDIN = False + _DOUBLE_SPACED_OUTPUT = False - _MODEL_JAR_PATTERN = r'stanford-corenlp-(\d+)\.(\d+)\.(\d+)-models\.jar' - _JAR = r'stanford-corenlp-(\d+)\.(\d+)\.(\d+)\.jar' + def __init__(self, path_to_jar=None, path_to_models_jar=None, + model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz', + encoding='utf8', verbose=False, + java_options='-mx1000m', corenlp_options=''): - def __init__( - self, path_to_jar=None, path_to_models_jar=None, verbose=False, - java_options=None, corenlp_options=None, port=None, - ): # find the most recent code and model jar stanford_jar = max( find_jar_iter( - self._JAR, - path_to_jar, + self._JAR, path_to_jar, env_vars=('STANFORD_PARSER', 'STANFORD_CORENLP'), - searchpath=(), - url=_stanford_url, - verbose=verbose, - is_regex=True, + searchpath=(), url=_stanford_url, + verbose=verbose, is_regex=True ), key=lambda model_name: re.match(self._JAR, model_name) ) - if port is None: - try: - port = try_port(9000) - except socket.error: - port = try_port() - - if corenlp_options: - corenlp_options.append(str(port)) - else: - corenlp_options = [str(port)] - else: - try_port(port) - - self.url = 'http://localhost:{}'.format(port) - - model_jar = max( + model_jar=max( find_jar_iter( - self._MODEL_JAR_PATTERN, - path_to_models_jar, + self._MODEL_JAR_PATTERN, 
path_to_models_jar, env_vars=('STANFORD_MODELS', 'STANFORD_CORENLP'), - searchpath=(), - url=_stanford_url, - verbose=verbose, - is_regex=True, + searchpath=(), url=_stanford_url, + verbose=verbose, is_regex=True ), key=lambda model_name: re.match(self._MODEL_JAR_PATTERN, model_name) ) - self.other_jars = tuple(find_jars_within_path(os.path.dirname(stanford_jar))) - - self.verbose = verbose - - self._classpath = (stanford_jar, model_jar) + self.other_jars - - self.corenlp_options = corenlp_options or [] - self.java_options = java_options or ['-mx1g'] - - def start(self): - cmd = ['edu.stanford.nlp.pipeline.StanfordCoreNLPServer'] - - if self.corenlp_options: - cmd.extend(self.corenlp_options) - - # Configure java. - default_options = ' '.join(_java_options) - config_java(options=self.java_options, verbose=self.verbose) - - try: - # TODO: it's probably a bad idea to pipe stdout, as it will - # accumulate when lots of text is being parsed. - self.popen = java( - cmd, - classpath=self._classpath, - blocking=False, - stderr='pipe', - ) - finally: - # Return java configurations to their default values. - config_java(options=default_options, verbose=self.verbose) - - # Check that the server is istill running. - # TODO: is there a better way of checking whether a server is ready to - # accept connections? - time.sleep(5) - returncode = self.popen.poll() - if returncode is not None: - _, stderrdata = self.popen.communicate() - raise CoreNLPServerError( - returncode, - 'Could not start the server. ' - 'The error was: {}'.format(stderrdata.decode('ascii')) - ) - - def stop(self): - self.popen.terminate() - self.popen.wait() - - def __enter__(self): - self.start() - - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.stop() - return False - - -class GenericStanfordParser(ParserI, TokenizerI): - """Interface to the Stanford Parser""" - - def __init__(self, url='http://localhost:9000', encoding='utf8'): - - self.url = url - self.encoding = encoding - - self.session = requests.Session() - - def parse_sents(self, sentences, *args, **kwargs): - """Parse multiple sentences. - - Takes multiple sentences as a list where each sentence is a list of - words. Each sentence will be automatically tagged with this - StanfordParser instance's tagger. + #self._classpath = (stanford_jar, model_jar) + + # Adding logging jar files to classpath + stanford_dir = os.path.split(stanford_jar)[0] + self._classpath = tuple([model_jar] + find_jars_within_path(stanford_dir)) + + self.model_path = model_path + self._encoding = encoding + self.corenlp_options = corenlp_options + self.java_options = java_options + + def _parse_trees_output(self, output_): + res = [] + cur_lines = [] + cur_trees = [] + blank = False + for line in output_.splitlines(False): + if line == '': + if blank: + res.append(iter(cur_trees)) + cur_trees = [] + blank = False + elif self._DOUBLE_SPACED_OUTPUT: + cur_trees.append(self._make_tree('\n'.join(cur_lines))) + cur_lines = [] + blank = True + else: + res.append(iter([self._make_tree('\n'.join(cur_lines))])) + cur_lines = [] + else: + cur_lines.append(line) + blank = False + return iter(res) - If a whitespace exists inside a token, then the token will be treated as - several tokens. + def parse_sents(self, sentences, verbose=False): + """ + Use StanfordParser to parse multiple sentences. Takes multiple sentences as a + list where each sentence is a list of words. + Each sentence will be automatically tagged with this StanfordParser instance's + tagger. 
+ If whitespaces exists inside a token, then the token will be treated as + separate tokens. :param sentences: Input sentences to parse :type sentences: list(list(str)) :rtype: iter(iter(Tree)) """ + cmd = [ + self._MAIN_CLASS, + '-model', self.model_path, + '-sentences', 'newline', + '-outputFormat', self._OUTPUT_FORMAT, + '-tokenized', + '-escaper', 'edu.stanford.nlp.process.PTBEscapingProcessor', + ] + return self._parse_trees_output(self._execute( + cmd, '\n'.join(' '.join(sentence) for sentence in sentences), verbose)) - sentences = (' '.join(words) for words in sentences) - return self.raw_parse_sents(sentences, *args, **kwargs) - - def raw_parse(self, sentence, properties=None, *args, **kwargs): + def raw_parse(self, sentence, verbose=False): """ Use StanfordParser to parse a sentence. Takes a sentence as a string; before parsing, it will be automatically tokenized and tagged by @@ -189,75 +129,25 @@ def raw_parse(self, sentence, properties=None, *args, **kwargs): :type sentence: str :rtype: iter(Tree) """ - default_properties = { - 'tokenize.whitespace': 'false', - } - - default_properties.update(properties or {}) - - return next( - self.raw_parse_sents( - [sentence], - properties=default_properties, - *args, - **kwargs - ) - ) - - def api_call(self, data, properties=None): - default_properties = { - 'outputFormat': 'json', - 'annotators': 'tokenize,pos,lemma,ssplit,{parser_annotator}'.format( - parser_annotator=self.parser_annotator, - ), - } - - default_properties.update(properties or {}) + return next(self.raw_parse_sents([sentence], verbose)) - response = self.session.post( - self.url, - params={ - 'properties': json.dumps(default_properties), - }, - data=data.encode(self.encoding), - ) - - response.raise_for_status() - - return response.json() - - def raw_parse_sents( - self, - sentences, - verbose=False, - properties=None, - *args, - **kwargs - ): - """Use StanfordParser to parse multiple sentences. - - Takes multiple sentences as a list of strings. Each sentence will be - automatically tokenized and tagged by the Stanford Parser. + def raw_parse_sents(self, sentences, verbose=False): + """ + Use StanfordParser to parse multiple sentences. Takes multiple sentences as a + list of strings. + Each sentence will be automatically tokenized and tagged by the Stanford Parser. - :param sentences: Input sentences to parse. + :param sentences: Input sentences to parse :type sentences: list(str) :rtype: iter(iter(Tree)) - """ - default_properties = { - 'ssplit.isOneSentence': 'true', - } - - default_properties.update(properties or {}) - - for sentence in sentences: - parsed_data = self.api_call(sentence, properties=default_properties) - - assert len(parsed_data['sentences']) == 1 - - for parse in parsed_data['sentences']: - tree = self.make_tree(parse) - yield iter([tree]) + cmd = [ + self._MAIN_CLASS, + '-model', self.model_path, + '-sentences', 'newline', + '-outputFormat', self._OUTPUT_FORMAT, + ] + return self._parse_trees_output(self._execute(cmd, '\n'.join(sentences), verbose)) def tagged_parse(self, sentence, verbose=False): """ @@ -296,386 +186,221 @@ def tagged_parse_sents(self, sentences, verbose=False): return self._parse_trees_output(self._execute( cmd, '\n'.join(' '.join(tag_separator.join(tagged) for tagged in sentence) for sentence in sentences), verbose)) - def parse_text(self, text, *args, **kwargs): - """Parse a piece of text. - - The text might contain several sentences which will be split by CoreNLP. - - :param str text: text to be split. 
- :returns: an iterable of syntactic structures. # TODO: should it be an iterable of iterables? - - """ - parsed_data = self.api_call(text, *args, **kwargs) - - for parse in parsed_data['sentences']: - yield self.make_tree(parse) - - def tokenize(self, text, properties=None): - """Tokenize a string of text. - - >>> parser = StanfordParser(url='http://localhost:9000') - - >>> text = 'Good muffins cost $3.88\\nin New York. Please buy me\\ntwo of them.\\nThanks.' - >>> list(parser.tokenize(text)) - ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] - - >>> s = "The colour of the wall is blue." - >>> list( - ... parser.tokenize( - ... 'The colour of the wall is blue.', - ... properties={'tokenize.options': 'americanize=true'}, - ... ) - ... ) - ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.'] + def _execute(self, cmd, input_, verbose=False): + encoding = self._encoding + cmd.extend(['-encoding', encoding]) + if self.corenlp_options: + cmd.append(self.corenlp_options) - """ - default_properties = { - 'annotators': 'tokenize,ssplit', - } + default_options = ' '.join(_java_options) - default_properties.update(properties or {}) + # Configure java. + config_java(options=self.java_options, verbose=verbose) + + # Windows is incompatible with NamedTemporaryFile() without passing in delete=False. + with tempfile.NamedTemporaryFile(mode='wb', delete=False) as input_file: + # Write the actual sentences to the temporary input file + if isinstance(input_, compat.text_type) and encoding: + input_ = input_.encode(encoding) + input_file.write(input_) + input_file.flush() + + # Run the tagger and get the output. + if self._USE_STDIN: + input_file.seek(0) + stdout, stderr = java(cmd, classpath=self._classpath, + stdin=input_file, stdout=PIPE, stderr=PIPE) + else: + cmd.append(input_file.name) + stdout, stderr = java(cmd, classpath=self._classpath, + stdout=PIPE, stderr=PIPE) + + stdout = stdout.replace(b'\xc2\xa0',b' ') + stdout = stdout.replace(b'\xa0',b' ') + stdout = stdout.decode(encoding) - result = self.api_call(text, properties=default_properties) + os.unlink(input_file.name) - for sentence in result['sentences']: - for token in sentence['tokens']: - yield token['originalText'] + # Return java configurations to their default values. + config_java(options=default_options, verbose=False) + return stdout class StanfordParser(GenericStanfordParser): """ - >>> parser = StanfordParser(url='http://localhost:9000') - - >>> next( - ... parser.raw_parse('the quick brown fox jumps over the lazy dog') - ... ).pretty_print() # doctest: +NORMALIZE_WHITESPACE - ROOT - | - S - _______________|_________ - | VP - | _________|___ - | | PP - | | ________|___ - NP | | NP - ____|__________ | | _______|____ - DT JJ JJ NN VBZ IN DT JJ NN - | | | | | | | | | - the quick brown fox jumps over the lazy dog - - >>> (parse_fox, ), (parse_wolf, ) = parser.raw_parse_sents( - ... [ - ... 'the quick brown fox jumps over the lazy dog', - ... 'the quick grey wolf jumps over the lazy fox', - ... ] - ... 
) - - >>> parse_fox.pretty_print() # doctest: +NORMALIZE_WHITESPACE - ROOT - | - S - _______________|_________ - | VP - | _________|___ - | | PP - | | ________|___ - NP | | NP - ____|__________ | | _______|____ - DT JJ JJ NN VBZ IN DT JJ NN - | | | | | | | | | - the quick brown fox jumps over the lazy dog - - >>> parse_wolf.pretty_print() # doctest: +NORMALIZE_WHITESPACE - ROOT - | - S - _______________|_________ - | VP - | _________|___ - | | PP - | | ________|___ - NP | | NP - ____|_________ | | _______|____ - DT JJ JJ NN VBZ IN DT JJ NN - | | | | | | | | | - the quick grey wolf jumps over the lazy fox - - >>> (parse_dog, ), (parse_friends, ) = parser.parse_sents( - ... [ - ... "I 'm a dog".split(), - ... "This is my friends ' cat ( the tabby )".split(), - ... ] - ... ) - - >>> parse_dog.pretty_print() # doctest: +NORMALIZE_WHITESPACE - ROOT - | - S - _______|____ - | VP - | ________|___ - NP | NP - | | ___|___ - PRP VBP DT NN - | | | | - I 'm a dog - - >>> parse_friends.pretty_print() # doctest: +NORMALIZE_WHITESPACE - ROOT - | - S - ____|___________ - | VP - | ___________|_____________ - | | NP - | | _______|_________ - | | NP PRN - | | _____|_______ ____|______________ - NP | NP | | NP | - | | ______|_________ | | ___|____ | - DT VBZ PRP$ NNS POS NN -LRB- DT NN -RRB- - | | | | | | | | | | - This is my friends ' cat -LRB- the tabby -RRB- - - >>> parse_john, parse_mary, = parser.parse_text( - ... 'John loves Mary. Mary walks.' + >>> parser=StanfordParser( + ... model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz" ... ) - >>> parse_john.pretty_print() # doctest: +NORMALIZE_WHITESPACE - ROOT - | - S - _____|_____________ - | VP | - | ____|___ | - NP | NP | - | | | | - NNP VBZ NNP . - | | | | - John loves Mary . - - >>> parse_mary.pretty_print() # doctest: +NORMALIZE_WHITESPACE - ROOT - | - S - _____|____ - NP VP | - | | | - NNP VBZ . - | | | - Mary walks . - - Special cases - ------------- - - >>> next( - ... parser.raw_parse( - ... 'NASIRIYA, Iraq—Iraqi doctors who treated former prisoner of war ' - ... 'Jessica Lynch have angrily dismissed claims made in her biography ' - ... 'that she was raped by her Iraqi captors.' - ... ) - ... ).height() - 17 - - >>> next( - ... parser.raw_parse( - ... "The broader Standard & Poor's 500 Index <.SPX> was 0.46 points lower, or " - ... '0.05 percent, at 997.02.' - ... ) - ... ).height() - 10 - + >>> list(parser.raw_parse("the quick brown fox jumps over the lazy dog")) # doctest: +NORMALIZE_WHITESPACE + [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']), + Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), + Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])])] + + >>> sum([list(dep_graphs) for dep_graphs in parser.raw_parse_sents(( + ... "the quick brown fox jumps over the lazy dog", + ... "the quick grey wolf jumps over the lazy fox" + ... 
))], []) # doctest: +NORMALIZE_WHITESPACE + [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']), + Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), + Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])]), Tree('ROOT', [Tree('NP', + [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['grey']), Tree('NN', ['wolf'])]), Tree('NP', + [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']), + Tree('JJ', ['lazy']), Tree('NN', ['fox'])])])])])])] + + >>> sum([list(dep_graphs) for dep_graphs in parser.parse_sents(( + ... "I 'm a dog".split(), + ... "This is my friends ' cat ( the tabby )".split(), + ... ))], []) # doctest: +NORMALIZE_WHITESPACE + [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('PRP', ['I'])]), Tree('VP', [Tree('VBP', ["'m"]), + Tree('NP', [Tree('DT', ['a']), Tree('NN', ['dog'])])])])]), Tree('ROOT', [Tree('S', [Tree('NP', + [Tree('DT', ['This'])]), Tree('VP', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('NP', [Tree('PRP$', ['my']), + Tree('NNS', ['friends']), Tree('POS', ["'"])]), Tree('NN', ['cat'])]), Tree('PRN', [Tree('-LRB-', ['-LRB-']), + Tree('NP', [Tree('DT', ['the']), Tree('NN', ['tabby'])]), Tree('-RRB-', ['-RRB-'])])])])])])] + + >>> sum([list(dep_graphs) for dep_graphs in parser.tagged_parse_sents(( + ... ( + ... ("The", "DT"), + ... ("quick", "JJ"), + ... ("brown", "JJ"), + ... ("fox", "NN"), + ... ("jumped", "VBD"), + ... ("over", "IN"), + ... ("the", "DT"), + ... ("lazy", "JJ"), + ... ("dog", "NN"), + ... (".", "."), + ... ), + ... ))],[]) # doctest: +NORMALIZE_WHITESPACE + [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('DT', ['The']), Tree('JJ', ['quick']), Tree('JJ', ['brown']), + Tree('NN', ['fox'])]), Tree('VP', [Tree('VBD', ['jumped']), Tree('PP', [Tree('IN', ['over']), Tree('NP', + [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])]), Tree('.', ['.'])])])] """ _OUTPUT_FORMAT = 'penn' - parser_annotator = 'parse' - def make_tree(self, result): - return Tree.fromstring(result['parse']) + def _make_tree(self, result): + return Tree.fromstring(result) class StanfordDependencyParser(GenericStanfordParser): """ - >>> dep_parser = StanfordDependencyParser(url='http://localhost:9000') - - >>> parse, = dep_parser.raw_parse( - ... 'The quick brown fox jumps over the lazy dog.' - ... ) - >>> print(parse.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE - The DT 4 det - quick JJ 4 amod - brown JJ 4 amod - fox NN 5 nsubj - jumps VBZ 0 ROOT - over IN 9 case - the DT 9 det - lazy JJ 9 amod - dog NN 5 nmod - . . 5 punct - - >>> print(parse.tree()) # doctest: +NORMALIZE_WHITESPACE - (jumps (fox The quick brown) (dog over the lazy) .) - - >>> for governor, dep, dependent in parse.triples(): - ... print(governor, dep, dependent) # doctest: +NORMALIZE_WHITESPACE - ('jumps', 'VBZ') nsubj ('fox', 'NN') - ('fox', 'NN') det ('The', 'DT') - ('fox', 'NN') amod ('quick', 'JJ') - ('fox', 'NN') amod ('brown', 'JJ') - ('jumps', 'VBZ') nmod ('dog', 'NN') - ('dog', 'NN') case ('over', 'IN') - ('dog', 'NN') det ('the', 'DT') - ('dog', 'NN') amod ('lazy', 'JJ') - ('jumps', 'VBZ') punct ('.', '.') - - >>> (parse_fox, ), (parse_dog, ) = dep_parser.raw_parse_sents( - ... [ - ... 'The quick brown fox jumps over the lazy dog.', - ... 'The quick grey wolf jumps over the lazy fox.', - ... ] - ... 
) - >>> print(parse_fox.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE - The DT 4 det - quick JJ 4 amod - brown JJ 4 amod - fox NN 5 nsubj - jumps VBZ 0 ROOT - over IN 9 case - the DT 9 det - lazy JJ 9 amod - dog NN 5 nmod - . . 5 punct - - >>> print(parse_dog.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE - The DT 4 det - quick JJ 4 amod - grey JJ 4 amod - wolf NN 5 nsubj - jumps VBZ 0 ROOT - over IN 9 case - the DT 9 det - lazy JJ 9 amod - fox NN 5 nmod - . . 5 punct - - >>> (parse_dog, ), (parse_friends, ) = dep_parser.parse_sents( - ... [ - ... "I 'm a dog".split(), - ... "This is my friends ' cat ( the tabby )".split(), - ... ] - ... ) - >>> print(parse_dog.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE - I PRP 4 nsubj - 'm VBP 4 cop - a DT 4 det - dog NN 0 ROOT - - >>> print(parse_friends.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE - This DT 6 nsubj - is VBZ 6 cop - my PRP$ 4 nmod:poss - friends NNS 6 nmod:poss - ' POS 4 case - cat NN 0 ROOT - -LRB- -LRB- 9 punct - the DT 9 det - tabby NN 6 appos - -RRB- -RRB- 9 punct - - >>> parse_john, parse_mary, = dep_parser.parse_text( - ... 'John loves Mary. Mary walks.' + >>> dep_parser=StanfordDependencyParser( + ... model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz" ... ) - >>> print(parse_john.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE - John NNP 2 nsubj - loves VBZ 0 ROOT - Mary NNP 2 dobj - . . 2 punct - - >>> print(parse_mary.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE - Mary NNP 2 nsubj - walks VBZ 0 ROOT - . . 2 punct - - Special cases - ------------- - - Non-breaking space inside of a token. - - >>> len( - ... next( - ... dep_parser.raw_parse( - ... 'Anhalt said children typically treat a 20-ounce soda bottle as one ' - ... 'serving, while it actually contains 2 1/2 servings.' - ... ) - ... ).nodes - ... ) - 21 - - Phone numbers. - - >>> len( - ... next( - ... dep_parser.raw_parse('This is not going to crash: 01 111 555.') - ... ).nodes - ... ) - 10 + >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE + [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])])] + + >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE + [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')), + ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')), + ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')), + ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]] + + >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents(( + ... "The quick brown fox jumps over the lazy dog.", + ... "The quick grey wolf jumps over the lazy fox." + ... ))], []) # doctest: +NORMALIZE_WHITESPACE + [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])]), + Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']), Tree('fox', ['over', 'the', 'lazy'])])] + + >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents(( + ... "I 'm a dog".split(), + ... "This is my friends ' cat ( the tabby )".split(), + ... 
))], []) # doctest: +NORMALIZE_WHITESPACE + [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends', ['my', "'"]), Tree('tabby', ['the'])])] + + >>> sum([[list(parse.triples()) for parse in dep_graphs] for dep_graphs in dep_parser.tagged_parse_sents(( + ... ( + ... ("The", "DT"), + ... ("quick", "JJ"), + ... ("brown", "JJ"), + ... ("fox", "NN"), + ... ("jumped", "VBD"), + ... ("over", "IN"), + ... ("the", "DT"), + ... ("lazy", "JJ"), + ... ("dog", "NN"), + ... (".", "."), + ... ), + ... ))],[]) # doctest: +NORMALIZE_WHITESPACE + [[((u'jumped', u'VBD'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')), + ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')), + ((u'jumped', u'VBD'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')), + ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]] """ _OUTPUT_FORMAT = 'conll2007' - parser_annotator = 'depparse' - def make_tree(self, result): + def _make_tree(self, result): + return DependencyGraph(result, top_relation_label='root') + + +class StanfordNeuralDependencyParser(GenericStanfordParser): + ''' + >>> from nltk.parse.stanford import StanfordNeuralDependencyParser + >>> dep_parser=StanfordNeuralDependencyParser() + + >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE + [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])])] + + >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE + [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')), + ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')), + ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')), + ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]] + + >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents(( + ... "The quick brown fox jumps over the lazy dog.", + ... "The quick grey wolf jumps over the lazy fox." + ... ))], []) # doctest: +NORMALIZE_WHITESPACE + [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])]), + Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']), Tree('fox', ['over', 'the', 'lazy'])])] + + >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents(( + ... "I 'm a dog".split(), + ... "This is my friends ' cat ( the tabby )".split(), + ... ))], []) # doctest: +NORMALIZE_WHITESPACE + [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends', ['my', "'"]), Tree('tabby', ['the'])])] + ''' + + _OUTPUT_FORMAT = 'conll' + _MAIN_CLASS = 'edu.stanford.nlp.pipeline.StanfordCoreNLP' + _JAR = r'stanford-corenlp-(\d+)(\.(\d+))+\.jar' + _MODEL_JAR_PATTERN = r'stanford-corenlp-(\d+)(\.(\d+))+-models\.jar' + _USE_STDIN = True + _DOUBLE_SPACED_OUTPUT = True + + def __init__(self, *args, **kwargs): + super(StanfordNeuralDependencyParser, self).__init__(*args, **kwargs) + self.corenlp_options += '-annotators tokenize,ssplit,pos,depparse' - return DependencyGraph( - ( - ' '.join(items) # NLTK expects an iterable of strings... - for n, *items in sorted(transform(result)) - ), - cell_separator=' ', # To make sure that a non-breaking space is kept inside of a token. 
+ def tagged_parse_sents(self, sentences, verbose=False): + ''' + Currently unimplemented because the neural dependency parser (and + the StanfordCoreNLP pipeline class) doesn't support passing in pre- + tagged tokens. + ''' + raise NotImplementedError( + 'tagged_parse[_sents] is not supported by ' + 'StanfordNeuralDependencyParser; use ' + 'parse[_sents] or raw_parse[_sents] instead.' ) - -def transform(sentence): - for dependency in sentence['basic-dependencies']: - - dependent_index = dependency['dependent'] - token = sentence['tokens'][dependent_index - 1] - - # Return values we don't know as '_'. Also, consider tag and ctag to be - # equal. - yield ( - dependent_index, - '_', - token['word'], - token['lemma'], - token['pos'], - token['pos'], - '_', - str(dependency['governor']), - dependency['dep'], - '_', - '_', - ) + def _make_tree(self, result): + return DependencyGraph(result, top_relation_label='ROOT') def setup_module(module): from nose import SkipTest - global server - server = CoreNLPServer(port=9000) - try: - server.start() - except CoreNLPServerError as e: - raise SkipTest( - 'Skipping CoreNLP tests because the server could not be started. ' - 'Make sure that the 9000 port is free. ' - '{}'.format(e.strerror) + StanfordParser( + model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz' ) - - -def teardown_module(module): - server.stop() + StanfordNeuralDependencyParser() + except LookupError: + raise SkipTest('doctests from nltk.parse.stanford are skipped because one of the stanford parser or CoreNLP jars doesn\'t exist') diff --git a/nltk/tokenize/stanford.py b/nltk/tokenize/stanford.py index 05980c62ee..e214d2139a 100644 --- a/nltk/tokenize/stanford.py +++ b/nltk/tokenize/stanford.py @@ -8,13 +8,22 @@ # For license information, see LICENSE.TXT from __future__ import unicode_literals, print_function -import warnings + +import tempfile +import os +import json +from subprocess import PIPE + +from nltk import compat +from nltk.internals import find_jar, config_java, java, _java_options, find_jars_within_path from nltk.tokenize.api import TokenizerI +_stanford_url = 'http://nlp.stanford.edu/software/tokenizer.shtml' class StanfordTokenizer(TokenizerI): - r"""Interface to the Stanford Tokenizer. + r""" + Interface to the Stanford Tokenizer >>> from nltk.tokenize import StanfordTokenizer >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks." @@ -23,37 +32,82 @@ class StanfordTokenizer(TokenizerI): >>> s = "The colour of the wall is blue." 
>>> StanfordTokenizer(options={"americanize": True}).tokenize(s) ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.'] - """ - def __init__(self, options=None, *args, **kwargs): - warnings.warn( - 'StanfordTokenizer is deprecated, use nltk.parse.stanford.StanfordParser instead.', - DeprecationWarning, + _JAR = 'stanford-postagger.jar' + + def __init__(self, path_to_jar=None, encoding='utf8', options=None, verbose=False, java_options='-mx1000m'): + self._stanford_jar = find_jar( + self._JAR, path_to_jar, + env_vars=('STANFORD_POSTAGGER',), + searchpath=(), url=_stanford_url, + verbose=verbose ) + + # Adding logging jar files to classpath + stanford_dir = os.path.split(self._stanford_jar)[0] + self._stanford_jar = tuple(find_jars_within_path(stanford_dir)) + + self._encoding = encoding + self.java_options = java_options - self.options = options or {} + options = {} if options is None else options + self._options_cmd = ','.join('{0}={1}'.format(key, val) for key, val in options.items()) - from nltk.parse.stanford import StanfordParser - self.parser = StanfordParser(*args, **kwargs) + @staticmethod + def _parse_tokenized_output(s): + return s.splitlines() def tokenize(self, s): - properties = {'tokenize.options': 'americanize=true'} if self.options.get('americanize', False) else {} - return list(self.parser.tokenize(s, properties=properties)) + """ + Use stanford tokenizer's PTBTokenizer to tokenize multiple sentences. + """ + cmd = [ + 'edu.stanford.nlp.process.PTBTokenizer', + ] + return self._parse_tokenized_output(self._execute(cmd, s)) + + def _execute(self, cmd, input_, verbose=False): + encoding = self._encoding + cmd.extend(['-charset', encoding]) + _options_cmd = self._options_cmd + if _options_cmd: + cmd.extend(['-options', self._options_cmd]) + + default_options = ' '.join(_java_options) + + # Configure java. + config_java(options=self.java_options, verbose=verbose) + + # Windows is incompatible with NamedTemporaryFile() without passing in delete=False. + with tempfile.NamedTemporaryFile(mode='wb', delete=False) as input_file: + # Write the actual sentences to the temporary input file + if isinstance(input_, compat.text_type) and encoding: + input_ = input_.encode(encoding) + input_file.write(input_) + input_file.flush() + + cmd.append(input_file.name) + + # Run the tagger and get the output. + stdout, stderr = java(cmd, classpath=self._stanford_jar, + stdout=PIPE, stderr=PIPE) + stdout = stdout.decode(encoding) + + os.unlink(input_file.name) + + # Return java configurations to their default values. + config_java(options=default_options, verbose=False) + + return stdout def setup_module(module): - from nltk.parse.stanford import CoreNLPServer, CoreNLPServerError from nose import SkipTest - global server - server = CoreNLPServer(port=9000) - try: - server.start() - except CoreNLPServerError as e: - raise SkipTest('Skipping CoreNLP tests because the server could not be started. {}'.format(e.strerror)) + StanfordTokenizer() + except LookupError: + raise SkipTest('doctests from nltk.tokenize.stanford are skipped because the stanford postagger jar doesn\'t exist') -def teardown_module(module): - server.stop() From 861a670e932eff6e594782eb1c5c9437f157238f Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Thu, 29 Dec 2016 17:39:46 -0500 Subject: [PATCH 20/32] Adding corenlp. 
--- nltk/parse/corenlp.py | 681 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 681 insertions(+) create mode 100644 nltk/parse/corenlp.py diff --git a/nltk/parse/corenlp.py b/nltk/parse/corenlp.py new file mode 100644 index 0000000000..652bbf55ce --- /dev/null +++ b/nltk/parse/corenlp.py @@ -0,0 +1,681 @@ +# -*- coding: utf-8 -*- +# Natural Language Toolkit: Interface to the CoreNLP REST API. +# +# Copyright (C) 2001-2016 NLTK Project +# Author: Steven Xu +# +# URL: +# For license information, see LICENSE.TXT + +from __future__ import unicode_literals + +import os +import re +import json +import time +import socket + +import requests + +from nltk.internals import find_jar_iter, config_java, java, _java_options, find_jars_within_path + +from nltk.parse.api import ParserI +from nltk.tokenize.api import TokenizerI +from nltk.parse.dependencygraph import DependencyGraph +from nltk.tree import Tree + +_stanford_url = 'http://nlp.stanford.edu/software/lex-parser.shtml' + + +class CoreNLPServerError(EnvironmentError): + """Exceptions associated with the Core NLP server.""" + + +def try_port(port=0): + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.bind(('', port)) + + p = sock.getsockname()[1] + sock.close() + + return p + + +class CoreNLPServer(object): + + _MODEL_JAR_PATTERN = r'stanford-corenlp-(\d+)\.(\d+)\.(\d+)-models\.jar' + _JAR = r'stanford-corenlp-(\d+)\.(\d+)\.(\d+)\.jar' + + def __init__( + self, path_to_jar=None, path_to_models_jar=None, verbose=False, + java_options=None, corenlp_options=None, port=None, + ): + # find the most recent code and model jar + stanford_jar = max( + find_jar_iter( + self._JAR, + path_to_jar, + env_vars=('STANFORD_PARSER', 'STANFORD_CORENLP'), + searchpath=(), + url=_stanford_url, + verbose=verbose, + is_regex=True, + ), + key=lambda model_name: re.match(self._JAR, model_name) + ) + + if port is None: + try: + port = try_port(9000) + except socket.error: + port = try_port() + + if corenlp_options: + corenlp_options.append(str(port)) + else: + corenlp_options = [str(port)] + else: + try_port(port) + + self.url = 'http://localhost:{}'.format(port) + + model_jar = max( + find_jar_iter( + self._MODEL_JAR_PATTERN, + path_to_models_jar, + env_vars=('STANFORD_MODELS', 'STANFORD_CORENLP'), + searchpath=(), + url=_stanford_url, + verbose=verbose, + is_regex=True, + ), + key=lambda model_name: re.match(self._MODEL_JAR_PATTERN, model_name) + ) + + self.other_jars = tuple(find_jars_within_path(os.path.dirname(stanford_jar))) + + self.verbose = verbose + + self._classpath = (stanford_jar, model_jar) + self.other_jars + + self.corenlp_options = corenlp_options or [] + self.java_options = java_options or ['-mx1g'] + + def start(self): + cmd = ['edu.stanford.nlp.pipeline.StanfordCoreNLPServer'] + + if self.corenlp_options: + cmd.extend(self.corenlp_options) + + # Configure java. + default_options = ' '.join(_java_options) + config_java(options=self.java_options, verbose=self.verbose) + + try: + # TODO: it's probably a bad idea to pipe stdout, as it will + # accumulate when lots of text is being parsed. + self.popen = java( + cmd, + classpath=self._classpath, + blocking=False, + stderr='pipe', + ) + finally: + # Return java configurations to their default values. + config_java(options=default_options, verbose=self.verbose) + + # Check that the server is istill running. + # TODO: is there a better way of checking whether a server is ready to + # accept connections? 
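+        # One option (a sketch only, not what this module does) would be to
+        # poll the HTTP endpoint until it answers instead of sleeping for a
+        # fixed interval:
+        #
+        #     for _ in range(20):
+        #         try:
+        #             requests.get(self.url)
+        #         except requests.exceptions.ConnectionError:
+        #             time.sleep(0.5)
+        #         else:
+        #             break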
+ time.sleep(5) + returncode = self.popen.poll() + if returncode is not None: + _, stderrdata = self.popen.communicate() + raise CoreNLPServerError( + returncode, + 'Could not start the server. ' + 'The error was: {}'.format(stderrdata.decode('ascii')) + ) + + def stop(self): + self.popen.terminate() + self.popen.wait() + + def __enter__(self): + self.start() + + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.stop() + return False + + +class GenericCoreNLPParser(ParserI, TokenizerI): + """Interface to the CoreNLP Parser""" + + def __init__(self, url='http://localhost:9000', encoding='utf8'): + + self.url = url + self.encoding = encoding + + self.session = requests.Session() + + def parse_sents(self, sentences, *args, **kwargs): + """Parse multiple sentences. + + Takes multiple sentences as a list where each sentence is a list of + words. Each sentence will be automatically tagged with this + CoreNLPParser instance's tagger. + + If a whitespace exists inside a token, then the token will be treated as + several tokens. + + :param sentences: Input sentences to parse + :type sentences: list(list(str)) + :rtype: iter(iter(Tree)) + """ + + sentences = (' '.join(words) for words in sentences) + return self.raw_parse_sents(sentences, *args, **kwargs) + + def raw_parse(self, sentence, properties=None, *args, **kwargs): + """Parse a sentence. + + Takes a sentence as a string; before parsing, it will be automatically + tokenized and tagged by the CoreNLP Parser. + + :param sentence: Input sentence to parse + :type sentence: str + :rtype: iter(Tree) + """ + default_properties = { + 'tokenize.whitespace': 'false', + } + + default_properties.update(properties or {}) + + return next( + self.raw_parse_sents( + [sentence], + properties=default_properties, + *args, + **kwargs + ) + ) + + def api_call(self, data, properties=None): + default_properties = { + 'outputFormat': 'json', + 'annotators': 'tokenize,pos,lemma,ssplit,{parser_annotator}'.format( + parser_annotator=self.parser_annotator, + ), + } + + default_properties.update(properties or {}) + + response = self.session.post( + self.url, + params={ + 'properties': json.dumps(default_properties), + }, + data=data.encode(self.encoding), + ) + + response.raise_for_status() + + return response.json() + + def raw_parse_sents( + self, + sentences, + verbose=False, + properties=None, + *args, + **kwargs + ): + """Parse multiple sentences. + + Takes multiple sentences as a list of strings. Each sentence will be + automatically tokenized and tagged. + + :param sentences: Input sentences to parse. + :type sentences: list(str) + :rtype: iter(iter(Tree)) + + """ + default_properties = { + 'ssplit.isOneSentence': 'true', + } + + default_properties.update(properties or {}) + + for sentence in sentences: + parsed_data = self.api_call(sentence, properties=default_properties) + + assert len(parsed_data['sentences']) == 1 + + for parse in parsed_data['sentences']: + tree = self.make_tree(parse) + yield iter([tree]) + + def tagged_parse(self, sentence, verbose=False): + """Parse a sentence. + + Takes a sentence as a list of (word, tag) tuples; the sentence must have + already been tokenized and tagged. + + :param sentence: Input sentence to parse + :type sentence: list(tuple(str, str)) + :rtype: iter(Tree) + """ + return next(self.tagged_parse_sents([sentence], verbose)) + + def tagged_parse_sents(self, sentences, verbose=False): + """Parse multiple sentences. + + Takes multiple sentences where each sentence is a list of (word, tag) + tuples. 
The sentences must have already been tokenized and tagged. + + :param sentences: Input sentences to parse + :type sentences: list(list(tuple(str, str))) + :rtype: iter(iter(Tree)) + """ + tag_separator = '/' + cmd = [ + self._MAIN_CLASS, + '-model', self.model_path, + '-sentences', 'newline', + '-outputFormat', self._OUTPUT_FORMAT, + '-tokenized', + '-tagSeparator', tag_separator, + '-tokenizerFactory', 'edu.stanford.nlp.process.WhitespaceTokenizer', + '-tokenizerMethod', 'newCoreLabelTokenizerFactory', + ] + # We don't need to escape slashes as "splitting is done on the last instance of the character in the token" + return self._parse_trees_output(self._execute( + cmd, '\n'.join(' '.join(tag_separator.join(tagged) for tagged in sentence) for sentence in sentences), verbose)) + + def parse_text(self, text, *args, **kwargs): + """Parse a piece of text. + + The text might contain several sentences which will be split by CoreNLP. + + :param str text: text to be split. + :returns: an iterable of syntactic structures. # TODO: should it be an iterable of iterables? + + """ + parsed_data = self.api_call(text, *args, **kwargs) + + for parse in parsed_data['sentences']: + yield self.make_tree(parse) + + def tokenize(self, text, properties=None): + """Tokenize a string of text. + + >>> parser = CoreNLPParser(url='http://localhost:9000') + + >>> text = 'Good muffins cost $3.88\\nin New York. Please buy me\\ntwo of them.\\nThanks.' + >>> list(parser.tokenize(text)) + ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] + + >>> s = "The colour of the wall is blue." + >>> list( + ... parser.tokenize( + ... 'The colour of the wall is blue.', + ... properties={'tokenize.options': 'americanize=true'}, + ... ) + ... ) + ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.'] + + """ + default_properties = { + 'annotators': 'tokenize,ssplit', + } + + default_properties.update(properties or {}) + + result = self.api_call(text, properties=default_properties) + + for sentence in result['sentences']: + for token in sentence['tokens']: + yield token['originalText'] + + +class CoreNLPParser(GenericCoreNLPParser): + """ + >>> parser = CoreNLPParser(url='http://localhost:9000') + + >>> next( + ... parser.raw_parse('the quick brown fox jumps over the lazy dog') + ... ).pretty_print() # doctest: +NORMALIZE_WHITESPACE + ROOT + | + S + _______________|_________ + | VP + | _________|___ + | | PP + | | ________|___ + NP | | NP + ____|__________ | | _______|____ + DT JJ JJ NN VBZ IN DT JJ NN + | | | | | | | | | + the quick brown fox jumps over the lazy dog + + >>> (parse_fox, ), (parse_wolf, ) = parser.raw_parse_sents( + ... [ + ... 'the quick brown fox jumps over the lazy dog', + ... 'the quick grey wolf jumps over the lazy fox', + ... ] + ... ) + + >>> parse_fox.pretty_print() # doctest: +NORMALIZE_WHITESPACE + ROOT + | + S + _______________|_________ + | VP + | _________|___ + | | PP + | | ________|___ + NP | | NP + ____|__________ | | _______|____ + DT JJ JJ NN VBZ IN DT JJ NN + | | | | | | | | | + the quick brown fox jumps over the lazy dog + + >>> parse_wolf.pretty_print() # doctest: +NORMALIZE_WHITESPACE + ROOT + | + S + _______________|_________ + | VP + | _________|___ + | | PP + | | ________|___ + NP | | NP + ____|_________ | | _______|____ + DT JJ JJ NN VBZ IN DT JJ NN + | | | | | | | | | + the quick grey wolf jumps over the lazy fox + + >>> (parse_dog, ), (parse_friends, ) = parser.parse_sents( + ... [ + ... 
"I 'm a dog".split(), + ... "This is my friends ' cat ( the tabby )".split(), + ... ] + ... ) + + >>> parse_dog.pretty_print() # doctest: +NORMALIZE_WHITESPACE + ROOT + | + S + _______|____ + | VP + | ________|___ + NP | NP + | | ___|___ + PRP VBP DT NN + | | | | + I 'm a dog + + >>> parse_friends.pretty_print() # doctest: +NORMALIZE_WHITESPACE + ROOT + | + S + ____|___________ + | VP + | ___________|_____________ + | | NP + | | _______|_________ + | | NP PRN + | | _____|_______ ____|______________ + NP | NP | | NP | + | | ______|_________ | | ___|____ | + DT VBZ PRP$ NNS POS NN -LRB- DT NN -RRB- + | | | | | | | | | | + This is my friends ' cat -LRB- the tabby -RRB- + + >>> parse_john, parse_mary, = parser.parse_text( + ... 'John loves Mary. Mary walks.' + ... ) + + >>> parse_john.pretty_print() # doctest: +NORMALIZE_WHITESPACE + ROOT + | + S + _____|_____________ + | VP | + | ____|___ | + NP | NP | + | | | | + NNP VBZ NNP . + | | | | + John loves Mary . + + >>> parse_mary.pretty_print() # doctest: +NORMALIZE_WHITESPACE + ROOT + | + S + _____|____ + NP VP | + | | | + NNP VBZ . + | | | + Mary walks . + + Special cases + ------------- + + >>> next( + ... parser.raw_parse( + ... 'NASIRIYA, Iraq—Iraqi doctors who treated former prisoner of war ' + ... 'Jessica Lynch have angrily dismissed claims made in her biography ' + ... 'that she was raped by her Iraqi captors.' + ... ) + ... ).height() + 17 + + >>> next( + ... parser.raw_parse( + ... "The broader Standard & Poor's 500 Index <.SPX> was 0.46 points lower, or " + ... '0.05 percent, at 997.02.' + ... ) + ... ).height() + 10 + + """ + + _OUTPUT_FORMAT = 'penn' + parser_annotator = 'parse' + + def make_tree(self, result): + return Tree.fromstring(result['parse']) + + +class CoreNLPDependencyParser(GenericCoreNLPParser): + + """ + >>> dep_parser = CoreNLPDependencyParser(url='http://localhost:9000') + + >>> parse, = dep_parser.raw_parse( + ... 'The quick brown fox jumps over the lazy dog.' + ... ) + >>> print(parse.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE + The DT 4 det + quick JJ 4 amod + brown JJ 4 amod + fox NN 5 nsubj + jumps VBZ 0 ROOT + over IN 9 case + the DT 9 det + lazy JJ 9 amod + dog NN 5 nmod + . . 5 punct + + >>> print(parse.tree()) # doctest: +NORMALIZE_WHITESPACE + (jumps (fox The quick brown) (dog over the lazy) .) + + >>> for governor, dep, dependent in parse.triples(): + ... print(governor, dep, dependent) # doctest: +NORMALIZE_WHITESPACE + ('jumps', 'VBZ') nsubj ('fox', 'NN') + ('fox', 'NN') det ('The', 'DT') + ('fox', 'NN') amod ('quick', 'JJ') + ('fox', 'NN') amod ('brown', 'JJ') + ('jumps', 'VBZ') nmod ('dog', 'NN') + ('dog', 'NN') case ('over', 'IN') + ('dog', 'NN') det ('the', 'DT') + ('dog', 'NN') amod ('lazy', 'JJ') + ('jumps', 'VBZ') punct ('.', '.') + + >>> (parse_fox, ), (parse_dog, ) = dep_parser.raw_parse_sents( + ... [ + ... 'The quick brown fox jumps over the lazy dog.', + ... 'The quick grey wolf jumps over the lazy fox.', + ... ] + ... ) + >>> print(parse_fox.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE + The DT 4 det + quick JJ 4 amod + brown JJ 4 amod + fox NN 5 nsubj + jumps VBZ 0 ROOT + over IN 9 case + the DT 9 det + lazy JJ 9 amod + dog NN 5 nmod + . . 5 punct + + >>> print(parse_dog.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE + The DT 4 det + quick JJ 4 amod + grey JJ 4 amod + wolf NN 5 nsubj + jumps VBZ 0 ROOT + over IN 9 case + the DT 9 det + lazy JJ 9 amod + fox NN 5 nmod + . . 5 punct + + >>> (parse_dog, ), (parse_friends, ) = dep_parser.parse_sents( + ... [ + ... "I 'm a dog".split(), + ... 
"This is my friends ' cat ( the tabby )".split(), + ... ] + ... ) + >>> print(parse_dog.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE + I PRP 4 nsubj + 'm VBP 4 cop + a DT 4 det + dog NN 0 ROOT + + >>> print(parse_friends.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE + This DT 6 nsubj + is VBZ 6 cop + my PRP$ 4 nmod:poss + friends NNS 6 nmod:poss + ' POS 4 case + cat NN 0 ROOT + -LRB- -LRB- 9 punct + the DT 9 det + tabby NN 6 appos + -RRB- -RRB- 9 punct + + >>> parse_john, parse_mary, = dep_parser.parse_text( + ... 'John loves Mary. Mary walks.' + ... ) + + >>> print(parse_john.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE + John NNP 2 nsubj + loves VBZ 0 ROOT + Mary NNP 2 dobj + . . 2 punct + + >>> print(parse_mary.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE + Mary NNP 2 nsubj + walks VBZ 0 ROOT + . . 2 punct + + Special cases + ------------- + + Non-breaking space inside of a token. + + >>> len( + ... next( + ... dep_parser.raw_parse( + ... 'Anhalt said children typically treat a 20-ounce soda bottle as one ' + ... 'serving, while it actually contains 2 1/2 servings.' + ... ) + ... ).nodes + ... ) + 21 + + Phone numbers. + + >>> len( + ... next( + ... dep_parser.raw_parse('This is not going to crash: 01 111 555.') + ... ).nodes + ... ) + 10 + + """ + + _OUTPUT_FORMAT = 'conll2007' + parser_annotator = 'depparse' + + def make_tree(self, result): + + return DependencyGraph( + ( + ' '.join(items) # NLTK expects an iterable of strings... + for n, *items in sorted(transform(result)) + ), + cell_separator=' ', # To make sure that a non-breaking space is kept inside of a token. + ) + + +def transform(sentence): + for dependency in sentence['basic-dependencies']: + + dependent_index = dependency['dependent'] + token = sentence['tokens'][dependent_index - 1] + + # Return values we don't know as '_'. Also, consider tag and ctag to be + # equal. + yield ( + dependent_index, + '_', + token['word'], + token['lemma'], + token['pos'], + token['pos'], + '_', + str(dependency['governor']), + dependency['dep'], + '_', + '_', + ) + + +def setup_module(module): + from nose import SkipTest + + global server + server = CoreNLPServer(port=9000) + + try: + server.start() + except CoreNLPServerError as e: + raise SkipTest( + 'Skipping CoreNLP tests because the server could not be started. ' + 'Make sure that the 9000 port is free. ' + '{}'.format(e.strerror) + ) + + +def teardown_module(module): + server.stop() From c2ef50b40dac5be36637e8563787352c3e63787c Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Thu, 29 Dec 2016 17:58:55 -0500 Subject: [PATCH 21/32] Avoid a test error when the corenlp jar is not found. 
--- nltk/parse/corenlp.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nltk/parse/corenlp.py b/nltk/parse/corenlp.py index 652bbf55ce..507f374f91 100644 --- a/nltk/parse/corenlp.py +++ b/nltk/parse/corenlp.py @@ -665,7 +665,10 @@ def setup_module(module): from nose import SkipTest global server - server = CoreNLPServer(port=9000) + try: + server = CoreNLPServer(port=9000) + except LookupError as e: + raise SkipTest('Could not instantiate CoreNLPServer.') try: server.start() From 03f80bb0d25b3c5b501f35df2ad611e80e749cc8 Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Mon, 16 Jan 2017 17:53:08 -0500 Subject: [PATCH 22/32] Cleanup --- nltk/parse/corenlp.py | 43 +++---------------------------------------- 1 file changed, 3 insertions(+), 40 deletions(-) diff --git a/nltk/parse/corenlp.py b/nltk/parse/corenlp.py index 507f374f91..b9333f79a3 100644 --- a/nltk/parse/corenlp.py +++ b/nltk/parse/corenlp.py @@ -2,7 +2,7 @@ # Natural Language Toolkit: Interface to the CoreNLP REST API. # # Copyright (C) 2001-2016 NLTK Project -# Author: Steven Xu +# Author: Dmitrijs Milajevs # # URL: # For license information, see LICENSE.TXT @@ -24,7 +24,7 @@ from nltk.parse.dependencygraph import DependencyGraph from nltk.tree import Tree -_stanford_url = 'http://nlp.stanford.edu/software/lex-parser.shtml' +_stanford_url = 'http://stanfordnlp.github.io/CoreNLP/' class CoreNLPServerError(EnvironmentError): @@ -152,7 +152,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): class GenericCoreNLPParser(ParserI, TokenizerI): - """Interface to the CoreNLP Parser""" + """Interface to the CoreNLP Parser.""" def __init__(self, url='http://localhost:9000', encoding='utf8'): @@ -259,43 +259,6 @@ def raw_parse_sents( tree = self.make_tree(parse) yield iter([tree]) - def tagged_parse(self, sentence, verbose=False): - """Parse a sentence. - - Takes a sentence as a list of (word, tag) tuples; the sentence must have - already been tokenized and tagged. - - :param sentence: Input sentence to parse - :type sentence: list(tuple(str, str)) - :rtype: iter(Tree) - """ - return next(self.tagged_parse_sents([sentence], verbose)) - - def tagged_parse_sents(self, sentences, verbose=False): - """Parse multiple sentences. - - Takes multiple sentences where each sentence is a list of (word, tag) - tuples. The sentences must have already been tokenized and tagged. - - :param sentences: Input sentences to parse - :type sentences: list(list(tuple(str, str))) - :rtype: iter(iter(Tree)) - """ - tag_separator = '/' - cmd = [ - self._MAIN_CLASS, - '-model', self.model_path, - '-sentences', 'newline', - '-outputFormat', self._OUTPUT_FORMAT, - '-tokenized', - '-tagSeparator', tag_separator, - '-tokenizerFactory', 'edu.stanford.nlp.process.WhitespaceTokenizer', - '-tokenizerMethod', 'newCoreLabelTokenizerFactory', - ] - # We don't need to escape slashes as "splitting is done on the last instance of the character in the token" - return self._parse_trees_output(self._execute( - cmd, '\n'.join(' '.join(tag_separator.join(tagged) for tagged in sentence) for sentence in sentences), verbose)) - def parse_text(self, text, *args, **kwargs): """Parse a piece of text. From 6f3c7b85655673d9696fa812f9911c15c4de0342 Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Tue, 24 Jan 2017 18:58:41 -0500 Subject: [PATCH 23/32] Python 2 support. 
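
Extended iterable unpacking ('for n, *items in ...') is a syntax error under
Python 2, so slice the sorted tuples instead:

    ' '.join(n_items[1:]) for n_items in sorted(transform(result))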
--- nltk/parse/corenlp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nltk/parse/corenlp.py b/nltk/parse/corenlp.py index b9333f79a3..71201e08f2 100644 --- a/nltk/parse/corenlp.py +++ b/nltk/parse/corenlp.py @@ -594,8 +594,8 @@ def make_tree(self, result): return DependencyGraph( ( - ' '.join(items) # NLTK expects an iterable of strings... - for n, *items in sorted(transform(result)) + ' '.join(n_items[1:]) # NLTK expects an iterable of strings... + for n_items in sorted(transform(result)) ), cell_separator=' ', # To make sure that a non-breaking space is kept inside of a token. ) From da7609d38554497413dbcc17b72221c3e0653c88 Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Sat, 1 Apr 2017 15:15:11 -0400 Subject: [PATCH 24/32] Updating tests --- nltk/parse/corenlp.py | 72 +++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/nltk/parse/corenlp.py b/nltk/parse/corenlp.py index 71201e08f2..08dd40494d 100644 --- a/nltk/parse/corenlp.py +++ b/nltk/parse/corenlp.py @@ -310,26 +310,26 @@ class CoreNLPParser(GenericCoreNLPParser): >>> parser = CoreNLPParser(url='http://localhost:9000') >>> next( - ... parser.raw_parse('the quick brown fox jumps over the lazy dog') + ... parser.raw_parse('The quick brown fox jumps over the lazy dog.') ... ).pretty_print() # doctest: +NORMALIZE_WHITESPACE ROOT | S - _______________|_________ - | VP - | _________|___ - | | PP - | | ________|___ - NP | | NP - ____|__________ | | _______|____ - DT JJ JJ NN VBZ IN DT JJ NN - | | | | | | | | | - the quick brown fox jumps over the lazy dog + _______________|__________________________ + | VP | + | _________|___ | + | | PP | + | | ________|___ | + NP | | NP | + ____|__________ | | _______|____ | + DT JJ JJ NN VBZ IN DT JJ NN . + | | | | | | | | | | + The quick brown fox jumps over the lazy dog . >>> (parse_fox, ), (parse_wolf, ) = parser.raw_parse_sents( ... [ - ... 'the quick brown fox jumps over the lazy dog', - ... 'the quick grey wolf jumps over the lazy fox', + ... 'The quick brown fox jumps over the lazy dog.', + ... 'The quick grey wolf jumps over the lazy fox.', ... ] ... ) @@ -337,31 +337,31 @@ class CoreNLPParser(GenericCoreNLPParser): ROOT | S - _______________|_________ - | VP - | _________|___ - | | PP - | | ________|___ - NP | | NP - ____|__________ | | _______|____ - DT JJ JJ NN VBZ IN DT JJ NN - | | | | | | | | | - the quick brown fox jumps over the lazy dog + _______________|__________________________ + | VP | + | _________|___ | + | | PP | + | | ________|___ | + NP | | NP | + ____|__________ | | _______|____ | + DT JJ JJ NN VBZ IN DT JJ NN . + | | | | | | | | | | + The quick brown fox jumps over the lazy dog . >>> parse_wolf.pretty_print() # doctest: +NORMALIZE_WHITESPACE ROOT | S - _______________|_________ - | VP - | _________|___ - | | PP - | | ________|___ - NP | | NP - ____|_________ | | _______|____ - DT JJ JJ NN VBZ IN DT JJ NN - | | | | | | | | | - the quick grey wolf jumps over the lazy fox + _______________|__________________________ + | VP | + | _________|___ | + | | PP | + | | ________|___ | + NP | | NP | + ____|_________ | | _______|____ | + DT JJ JJ NN VBZ IN DT JJ NN . + | | | | | | | | | | + The quick grey wolf jumps over the lazy fox . >>> (parse_dog, ), (parse_friends, ) = parser.parse_sents( ... [ @@ -438,7 +438,7 @@ class CoreNLPParser(GenericCoreNLPParser): ... 'that she was raped by her Iraqi captors.' ... ) ... ).height() - 17 + 20 >>> next( ... 
parser.raw_parse( @@ -446,7 +446,7 @@ class CoreNLPParser(GenericCoreNLPParser): ... '0.05 percent, at 997.02.' ... ) ... ).height() - 10 + 9 """ @@ -458,8 +458,8 @@ def make_tree(self, result): class CoreNLPDependencyParser(GenericCoreNLPParser): + """Dependency parser. - """ >>> dep_parser = CoreNLPDependencyParser(url='http://localhost:9000') >>> parse, = dep_parser.raw_parse( From 02c154a74aa087dc902a08f7f138413b0981f7b5 Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Sat, 1 Apr 2017 15:16:04 -0400 Subject: [PATCH 25/32] Formatting. --- nltk/parse/corenlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nltk/parse/corenlp.py b/nltk/parse/corenlp.py index 08dd40494d..0bdd0a297f 100644 --- a/nltk/parse/corenlp.py +++ b/nltk/parse/corenlp.py @@ -190,7 +190,7 @@ def raw_parse(self, sentence, properties=None, *args, **kwargs): :rtype: iter(Tree) """ default_properties = { - 'tokenize.whitespace': 'false', + 'tokenize.whitespace': 'false', } default_properties.update(properties or {}) From c4a1e2ac0af102bc6f5f7f27664aa622ab6018f0 Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Sat, 1 Apr 2017 15:17:12 -0400 Subject: [PATCH 26/32] Look for the CORENLP env. variable. --- nltk/parse/corenlp.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/nltk/parse/corenlp.py b/nltk/parse/corenlp.py index 0bdd0a297f..907e3ae71c 100644 --- a/nltk/parse/corenlp.py +++ b/nltk/parse/corenlp.py @@ -50,17 +50,20 @@ def __init__( self, path_to_jar=None, path_to_models_jar=None, verbose=False, java_options=None, corenlp_options=None, port=None, ): + + jars = list(find_jar_iter( + self._JAR, + path_to_jar, + env_vars=('CORENLP', ), + searchpath=(), + url=_stanford_url, + verbose=verbose, + is_regex=True, + )) + # find the most recent code and model jar stanford_jar = max( - find_jar_iter( - self._JAR, - path_to_jar, - env_vars=('STANFORD_PARSER', 'STANFORD_CORENLP'), - searchpath=(), - url=_stanford_url, - verbose=verbose, - is_regex=True, - ), + jars, key=lambda model_name: re.match(self._JAR, model_name) ) @@ -83,7 +86,7 @@ def __init__( find_jar_iter( self._MODEL_JAR_PATTERN, path_to_models_jar, - env_vars=('STANFORD_MODELS', 'STANFORD_CORENLP'), + env_vars=('CORENLP_MODELS', ), searchpath=(), url=_stanford_url, verbose=verbose, From 1b64c3f7f46deb516fe593284435ced9dcd2d6a4 Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Sat, 1 Apr 2017 16:30:53 -0400 Subject: [PATCH 27/32] Let the CoreNLP server use more memory and pipe stderr in the CoreNLP interface. Also, update the key for basic dependencies. 
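Newer CoreNLP server releases return the dependency parse under the camel-cased `basicDependencies` key rather than the old `basic-dependencies`, so transform() has to read the new key; piping stderr also lets start() surface the server's own error output when a launch fails. A rough sketch of the key lookup against an illustrative response fragment (the payload below is hand-written, not captured from a running server, and real responses carry many more fields per token):

    sentence = {
        'basicDependencies': [
            {'dep': 'ROOT', 'governor': 0, 'dependent': 2},
            {'dep': 'det', 'governor': 2, 'dependent': 1},
        ],
        'tokens': [
            {'index': 1, 'word': 'the', 'pos': 'DT'},
            {'index': 2, 'word': 'dog', 'pos': 'NN'},
        ],
    }

    for dependency in sentence['basicDependencies']:
        # CoreNLP token indices are 1-based.
        token = sentence['tokens'][dependency['dependent'] - 1]
        print(token['word'], dependency['dep'])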
--- nltk/parse/corenlp.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/nltk/parse/corenlp.py b/nltk/parse/corenlp.py index 907e3ae71c..472c598bcf 100644 --- a/nltk/parse/corenlp.py +++ b/nltk/parse/corenlp.py @@ -102,7 +102,7 @@ def __init__( self._classpath = (stanford_jar, model_jar) + self.other_jars self.corenlp_options = corenlp_options or [] - self.java_options = java_options or ['-mx1g'] + self.java_options = java_options or ['-mx2g'] def start(self): cmd = ['edu.stanford.nlp.pipeline.StanfordCoreNLPServer'] @@ -121,6 +121,7 @@ def start(self): cmd, classpath=self._classpath, blocking=False, + stdout='pipe', stderr='pipe', ) finally: @@ -195,7 +196,6 @@ def raw_parse(self, sentence, properties=None, *args, **kwargs): default_properties = { 'tokenize.whitespace': 'false', } - default_properties.update(properties or {}) return next( @@ -605,13 +605,13 @@ def make_tree(self, result): def transform(sentence): - for dependency in sentence['basic-dependencies']: + for dependency in sentence['basicDependencies']: dependent_index = dependency['dependent'] token = sentence['tokens'][dependent_index - 1] - # Return values we don't know as '_'. Also, consider tag and ctag to be - # equal. + # Return values that we don't know as '_'. Also, consider tag and ctag + # to be equal. yield ( dependent_index, '_', From 4743f714c344110d1b59ec2d6ca194bfec46aaf5 Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Sat, 1 Apr 2017 19:17:20 -0400 Subject: [PATCH 28/32] Add timeout to the api call. --- nltk/parse/corenlp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nltk/parse/corenlp.py b/nltk/parse/corenlp.py index 472c598bcf..be2aefdfcd 100644 --- a/nltk/parse/corenlp.py +++ b/nltk/parse/corenlp.py @@ -223,6 +223,7 @@ def api_call(self, data, properties=None): 'properties': json.dumps(default_properties), }, data=data.encode(self.encoding), + timeout=60, ) response.raise_for_status() From d8b34c05431be0c088e5c6c76833436c1d3ec62b Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Fri, 7 Apr 2017 16:32:58 -0400 Subject: [PATCH 29/32] Don't look for extra jars. --- nltk/parse/corenlp.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nltk/parse/corenlp.py b/nltk/parse/corenlp.py index be2aefdfcd..b59d2ae5cb 100644 --- a/nltk/parse/corenlp.py +++ b/nltk/parse/corenlp.py @@ -95,11 +95,9 @@ def __init__( key=lambda model_name: re.match(self._MODEL_JAR_PATTERN, model_name) ) - self.other_jars = tuple(find_jars_within_path(os.path.dirname(stanford_jar))) - self.verbose = verbose - self._classpath = (stanford_jar, model_jar) + self.other_jars + self._classpath = stanford_jar, model_jar self.corenlp_options = corenlp_options or [] self.java_options = java_options or ['-mx2g'] From afe0dfd232cad4c5a25be4cec7fe69ea4a9f04b8 Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Fri, 7 Apr 2017 16:39:42 -0400 Subject: [PATCH 30/32] import CoreNLP parsers in nltk.parse --- nltk/parse/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nltk/parse/__init__.py b/nltk/parse/__init__.py index dba46c595c..66441de612 100644 --- a/nltk/parse/__init__.py +++ b/nltk/parse/__init__.py @@ -78,3 +78,4 @@ from nltk.parse.evaluate import DependencyEvaluator from nltk.parse.transitionparser import TransitionParser from nltk.parse.bllip import BllipParser +from nltk.parse.corenlp import CoreNLPParser, CoreNLPDependencyParser From 
b6afcd0fb3abf9ccb2bff16276de71f3861b0c08 Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Wed, 10 May 2017 21:35:17 -0400 Subject: [PATCH 31/32] A basic server status check. --- nltk/parse/corenlp.py | 43 +++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/nltk/parse/corenlp.py b/nltk/parse/corenlp.py index b59d2ae5cb..65fec165f0 100644 --- a/nltk/parse/corenlp.py +++ b/nltk/parse/corenlp.py @@ -9,7 +9,6 @@ from __future__ import unicode_literals -import os import re import json import time @@ -17,7 +16,7 @@ import requests -from nltk.internals import find_jar_iter, config_java, java, _java_options, find_jars_within_path +from nltk.internals import find_jar_iter, config_java, java, _java_options from nltk.parse.api import ParserI from nltk.tokenize.api import TokenizerI @@ -51,6 +50,11 @@ def __init__( java_options=None, corenlp_options=None, port=None, ): + if corenlp_options is None: + corenlp_options = [ + '-preload', 'tokenize,ssplit,pos,lemma,parse,depparse', + ] + jars = list(find_jar_iter( self._JAR, path_to_jar, @@ -72,11 +76,7 @@ def __init__( port = try_port(9000) except socket.error: port = try_port() - - if corenlp_options: corenlp_options.append(str(port)) - else: - corenlp_options = [str(port)] else: try_port(port) @@ -99,7 +99,7 @@ def __init__( self._classpath = stanford_jar, model_jar - self.corenlp_options = corenlp_options or [] + self.corenlp_options = corenlp_options self.java_options = java_options or ['-mx2g'] def start(self): @@ -127,9 +127,19 @@ def start(self): config_java(options=default_options, verbose=self.verbose) # Check that the server is still running. - # TODO: is there a better way of checking whether a server is ready to - # accept connections? - time.sleep(5) + for i in range(30): + try: + response = requests.get(requests.compat.urljoin(self.url, 'live')) + except requests.exceptions.ConnectionError: + time.sleep(1) + else: + if response.ok: + break + else: + raise CoreNLPServerError( + 'Could not connect to the server.' + ) + returncode = self.popen.poll() if returncode is not None: _, stderrdata = self.popen.communicate() @@ -139,6 +149,19 @@ def start(self): 'The error was: {}'.format(stderrdata.decode('ascii')) ) + for i in range(60): + try: + response = requests.get(requests.compat.urljoin(self.url, 'ready')) + except requests.exceptions.ConnectionError: + time.sleep(1) + else: + if response.ok: + break + else: + raise CoreNLPServerError( + 'The server is not ready.' + ) + def stop(self): self.popen.terminate() self.popen.wait() From 27cf680dbb741108bd749992d1207c0c27b88a97 Mon Sep 17 00:00:00 2001 From: Dmitrijs Milajevs Date: Wed, 10 May 2017 21:40:20 -0400 Subject: [PATCH 32/32] It's a good idea to check that the server process is running before doing HTTP requests. --- nltk/parse/corenlp.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/nltk/parse/corenlp.py b/nltk/parse/corenlp.py index 65fec165f0..38e406cd20 100644 --- a/nltk/parse/corenlp.py +++ b/nltk/parse/corenlp.py @@ -127,6 +127,15 @@ def start(self): config_java(options=default_options, verbose=self.verbose) # Check that the server is still running. 
+ returncode = self.popen.poll() + if returncode is not None: + _, stderrdata = self.popen.communicate() + raise CoreNLPServerError( + returncode, + 'Could not start the server. ' + 'The error was: {}'.format(stderrdata.decode('ascii')) + ) + for i in range(30): try: response = requests.get(requests.compat.urljoin(self.url, 'live')) @@ -140,15 +149,6 @@ def start(self): 'Could not connect to the server.' ) - returncode = self.popen.poll() - if returncode is not None: - _, stderrdata = self.popen.communicate() - raise CoreNLPServerError( - returncode, - 'Could not start the server. ' - 'The error was: {}'.format(stderrdata.decode('ascii')) - ) - for i in range(60): try: response = requests.get(requests.compat.urljoin(self.url, 'ready'))