-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #84 from ArneBinder/nltk_sentence_splitter
add `NltkSentenceSplitter`
- Loading branch information
Showing
5 changed files
with
86 additions
and
1 deletion.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
from __future__ import annotations | ||
|
||
import logging | ||
import ssl | ||
from typing import TypeVar | ||
|
||
import nltk | ||
from pytorch_ie.annotations import LabeledSpan | ||
from pytorch_ie.documents import TextDocumentWithLabeledPartitions | ||
|
||
# Module-level logger, used to warn when a partition layer is overwritten.
logger = logging.getLogger(__name__)


# Document type handled by this module: any text document that carries a
# labeled-partitions annotation layer.
D = TypeVar("D", bound=TextDocumentWithLabeledPartitions)
|
||
|
||
class NltkSentenceSplitter:
    """Add sentence partitions to a TextDocumentWithLabeledPartitions document.

    The document text is segmented into sentences with the NLTK Punkt tokenizer, see
    https://www.nltk.org/api/nltk.tokenize.html#nltk.tokenize.punkt.PunktSentenceTokenizer
    for more information.

    Args:
        partition_layer_name: The name of the partition layer to add the sentence partitions to.
            This layer must be an AnnotationLayer of LabeledSpan annotations.
        text_field_name: The name of the text field in the document to split into sentences.
        sentencizer_url: The URL to the NLTK Punkt tokenizer model.
    """

    def __init__(
        self,
        partition_layer_name: str = "labeled_partitions",
        text_field_name: str = "text",
        sentencizer_url: str = "tokenizers/punkt/PY3/english.pickle",
    ):
        self.partition_layer_name = partition_layer_name
        self.text_field_name = text_field_name
        # make sure the NLTK Punkt tokenizer model is available locally, then load it
        nltk.download("punkt")
        self.sentencizer = nltk.data.load(sentencizer_url)

    def __call__(self, document: D) -> None:
        target_layer = document[self.partition_layer_name]
        if len(target_layer) > 0:
            logger.warning(
                f"Layer {self.partition_layer_name} in document {document.id} is not empty. "
                f"Clearing it before adding new sentence partitions."
            )
            target_layer.clear()

        document_text: str = getattr(document, self.text_field_name)
        # span_tokenize yields (start, end) character offsets into the text
        new_partitions = [
            LabeledSpan(start=begin, end=stop, label="sentence")
            for begin, stop in self.sentencizer.span_tokenize(document_text)
        ]
        target_layer.extend(new_partitions)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
from pytorch_ie.annotations import LabeledSpan | ||
from pytorch_ie.documents import TextDocumentWithLabeledPartitions | ||
|
||
from pie_modules.document.processing import NltkSentenceSplitter | ||
|
||
|
||
def test_nltk_sentence_splitter(caplog):
    """End-to-end check of NltkSentenceSplitter: warning on a non-empty layer,
    then the layer contains exactly the two sentences of the text."""
    doc = TextDocumentWithLabeledPartitions(
        text="This is a test sentence. This is another one.", id="test_doc"
    )
    # pre-populate the partition layer so the splitter emits its warning (see below)
    doc.labeled_partitions.append(LabeledSpan(start=0, end=len(doc.text), label="text"))
    caplog.clear()

    splitter = NltkSentenceSplitter()
    splitter(doc)

    # exactly one warning about the non-empty layer was logged
    logged_messages = [record.message for record in caplog.records]
    assert logged_messages == [
        "Layer labeled_partitions in document test_doc is not empty. "
        "Clearing it before adding new sentence partitions."
    ]

    # the dummy partition was replaced by the two detected sentences
    assert [str(span) for span in doc.labeled_partitions] == [
        "This is a test sentence.",
        "This is another one.",
    ]