forked from explosion/spaCy
-
Notifications
You must be signed in to change notification settings - Fork 1
/
test_issue1001-1500.py
159 lines (134 loc) · 4.7 KB
/
test_issue1001-1500.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# coding: utf-8
from __future__ import unicode_literals
import pytest
import re
from spacy.tokens import Doc
from spacy.vocab import Vocab
from spacy.lang.en import English
from spacy.lang.lex_attrs import LEX_ATTRS
from spacy.matcher import Matcher
from spacy.tokenizer import Tokenizer
from spacy.lemmatizer import Lemmatizer
from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part
@pytest.mark.xfail(
    reason="g is split of as a unit, as the suffix regular expression can not look back further (variable-width)"
)
def test_issue1235():
    """Test that "g" is not split off when preceded by a letter and a number.

    "e2g" is expected to stay a single token, while "2g" and "52g" are each
    expected to split into the number and a separate "g".
    """
    nlp = English()
    tokens = [token.text for token in nlp("e2g 2g 52g")]
    # One comparison covers both the token count (5) and every token text.
    assert tokens == ["e2g", "2", "g", "52", "g"]
def test_issue1242():
    """Test that an empty string yields an empty doc, directly and via nlp.pipe."""
    nlp = English()
    assert len(nlp("")) == 0
    # Same check through the batch interface, mixing empty and non-empty text.
    piped_lengths = [len(doc) for doc in nlp.pipe(["", "hello"])]
    assert piped_lengths[0] == 0
    assert piped_lengths[1] == 1
def test_issue1250():
    """Test cached special cases.

    The same text is tokenized twice and must produce the special-case lemma
    both times.
    """
    special_case = [{ORTH: "reimbur", LEMMA: "reimburse", POS: "VERB"}]
    nlp = English()
    nlp.tokenizer.add_special_case("reimbur", special_case)
    expected = ["reimburse", ",", "reimburse", "..."]
    # Second pass repeats the identical input; presumably it is served from
    # the tokenizer's cache, which is what this regression test targets.
    for _ in range(2):
        observed = [token.lemma_ for token in nlp("reimbur, reimbur...")]
        assert observed == expected
def test_issue1257():
    """Test that tokens compare correctly.

    The first tokens of two separately constructed docs must not be equal,
    even though both have the text "a".
    """
    left_doc = Doc(Vocab(), words=["a", "b", "c"])
    right_doc = Doc(Vocab(), words=["a", "c", "e"])
    left_token = left_doc[0]
    right_token = right_doc[0]
    # Check both operators explicitly rather than assuming one implies the other.
    assert left_token != right_token
    assert not left_token == right_token
def test_issue1375():
    """Test that token.nbor() raises IndexError for out-of-bounds access.

    In-bounds neighbours (one step left or right of the middle token) must
    still resolve to the adjacent token.
    """
    doc = Doc(Vocab(), words=["0", "1", "2"])
    # The call itself must raise, so it is not wrapped in `assert`: an assert
    # here would never be evaluated on success and would only obscure the
    # failure mode if nbor() stopped raising.
    with pytest.raises(IndexError):
        doc[0].nbor(-1)
    assert doc[1].nbor(-1).text == "0"
    with pytest.raises(IndexError):
        doc[2].nbor(1)
    assert doc[1].nbor(1).text == "2"
def test_issue1387():
    """Test that "coping" tagged as VBG is lemmatized to "cope"."""
    lemmatizer = Lemmatizer(
        {"verb": ("cope", "cop")},  # index
        {"verb": {"coping": ("cope",)}},  # exceptions
        {"verb": [["ing", ""]]},  # suffix rules
    )
    vocab = Vocab(
        lemmatizer=lemmatizer,
        tag_map={"VBG": {POS: VERB, VerbForm_part: True}},
    )
    doc = Doc(vocab, words=["coping"])
    # The tag is set before the lemma is read, as in the reported scenario.
    doc[0].tag_ = "VBG"
    assert doc[0].text == "coping"
    assert doc[0].lemma_ == "cope"
def test_issue1434():
    """Test matches occur when optional element at end of short doc."""
    vocab = Vocab(lex_attr_getters=LEX_ATTRS)
    # One doc where the optional token is present, one where the doc ends
    # right after the mandatory token.
    docs = [Doc(vocab, words=words) for words in (["Hello", "World"], ["Hello"])]
    matcher = Matcher(vocab)
    matcher.add("MyMatcher", None, [{"ORTH": "Hello"}, {"IS_ALPHA": True, "OP": "?"}])
    for doc in docs:
        assert matcher(doc)
@pytest.mark.parametrize(
    "string,start,end",
    [
        ("a", 0, 1),
        ("a b", 0, 2),
        ("a c", 0, 1),
        ("a b c", 0, 2),
        ("a b b c", 0, 3),
        ("a b b", 0, 3),
    ],
)
def test_issue1450(string, start, end):
    """Test matcher works when patterns end with * operator.

    string: Whitespace-separated words used to build the doc.
    start: Expected start offset of the last match, or None if no match
        is expected.
    end: Expected end offset of the last match, or None if no match is
        expected.
    """
    pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
    matcher = Matcher(Vocab())
    matcher.add("TSTEND", None, pattern)
    doc = Doc(Vocab(), words=string.split())
    matches = matcher(doc)
    if start is None or end is None:
        assert matches == []
        # Nothing left to check; indexing matches[-1] below would raise
        # IndexError on the empty list.
        return
    # Each match is indexed as (id, start, end); only the last one is checked.
    assert matches[-1][1] == start
    assert matches[-1][2] == end
def test_issue1488():
    """Test that a Tokenizer built from custom regexes yields no empty tokens."""
    prefix_re = re.compile(r"""[\[\("']""")
    suffix_re = re.compile(r"""[\]\)"']""")
    infix_re = re.compile(r"""[-~\.]""")
    simple_url_re = re.compile(r"""^https?://""")

    nlp = English()
    # Replace the default tokenizer with one driven only by the patterns above
    # and an empty special-case table.
    nlp.tokenizer = Tokenizer(
        nlp.vocab,
        {},
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        token_match=simple_url_re.match,
    )
    assert all(token.text for token in nlp("This is a test."))
def test_issue1494():
    """Test tokenization when the only custom rule is the infix pattern [^a-z]."""
    infix_re = re.compile(r"""[^a-z]""")
    nlp = English()
    # No special cases, prefixes or suffixes: only the infix finder is supplied.
    nlp.tokenizer = Tokenizer(nlp.vocab, {}, infix_finditer=infix_re.finditer)

    expectations = [
        ("token 123test", ["token", "1", "2", "3", "test"]),
        ("token 1test", ["token", "1test"]),
        ("hello...test", ["hello", ".", ".", ".", "test"]),
    ]
    for text, expected_tokens in expectations:
        produced = [token.text for token in nlp(text)]
        assert produced == expected_tokens