Skip to content

Commit

Permalink
Merge pull request #84 from ArneBinder/nltk_sentence_splitter
Browse files Browse the repository at this point in the history
add `NltkSentenceSplitter`
  • Loading branch information
ArneBinder authored Apr 22, 2024
2 parents 0f8296b + 079f29f commit 25ea281
Show file tree
Hide file tree
Showing 5 changed files with 86 additions and 1 deletion.
2 changes: 1 addition & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ pytorch-crf = ">=0.7.2"
networkx = "^3.0.0"
# because of BartModelWithDecoderPositionIds
transformers = "^4.35.0"
# for NltkSentenceSplitter
nltk = "^3.8.1"

[tool.poetry.group.dev.dependencies]
torch = {version = "^2.1.0+cpu", source = "pytorch"}
Expand Down
1 change: 1 addition & 0 deletions src/pie_modules/document/processing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from .merge_spans_via_relation import SpansViaRelationMerger
from .regex_partitioner import RegexPartitioner
from .relation_argument_sorter import RelationArgumentSorter
from .sentence_splitter import NltkSentenceSplitter
from .text_span_trimmer import TextSpanTrimmer
from .tokenization import (
text_based_document_to_token_based,
Expand Down
55 changes: 55 additions & 0 deletions src/pie_modules/document/processing/sentence_splitter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from __future__ import annotations

import logging
import ssl
from typing import TypeVar

import nltk
from pytorch_ie.annotations import LabeledSpan
from pytorch_ie.documents import TextDocumentWithLabeledPartitions

logger = logging.getLogger(__name__)


D = TypeVar("D", bound=TextDocumentWithLabeledPartitions)


class NltkSentenceSplitter:
    """A document processor that adds sentence partitions to a
    TextDocumentWithLabeledPartitions document.

    It uses the NLTK Punkt tokenizer to split the text of the document into sentences. See
    https://www.nltk.org/api/nltk.tokenize.html#nltk.tokenize.punkt.PunktSentenceTokenizer
    for more information.

    Args:
        partition_layer_name: The name of the partition layer to add the sentence partitions to.
            This layer must be an AnnotationLayer of LabeledSpan annotations.
        text_field_name: The name of the text field in the document to split into sentences.
        sentencizer_url: The NLTK resource path (or URL) of the Punkt sentence tokenizer model
            to load via ``nltk.data.load``.
    """

    def __init__(
        self,
        partition_layer_name: str = "labeled_partitions",
        text_field_name: str = "text",
        sentencizer_url: str = "tokenizers/punkt/PY3/english.pickle",
    ):
        self.partition_layer_name = partition_layer_name
        self.text_field_name = text_field_name
        # Ensure the NLTK Punkt model is available. quiet=True suppresses the noisy
        # download/status output that would otherwise be printed on every instantiation;
        # the call is a no-op if the model has already been downloaded.
        nltk.download("punkt", quiet=True)
        self.sentencizer = nltk.data.load(sentencizer_url)

    def __call__(self, document: D) -> None:
        """Split the document's text into sentences and store them as LabeledSpan
        partitions in the configured partition layer (in place).

        If the target layer already contains annotations, it is cleared first and a
        warning is emitted.
        """
        partition_layer = document[self.partition_layer_name]
        if len(partition_layer) > 0:
            # Lazy %-style args: the message is only interpolated if the record is emitted.
            logger.warning(
                "Layer %s in document %s is not empty. "
                "Clearing it before adding new sentence partitions.",
                self.partition_layer_name,
                document.id,
            )
            partition_layer.clear()

        text: str = getattr(document, self.text_field_name)
        # span_tokenize yields (start, end) character offsets, which map directly
        # onto LabeledSpan boundaries.
        sentence_spans = self.sentencizer.span_tokenize(text)
        partition_layer.extend(
            LabeledSpan(start=start, end=end, label="sentence") for start, end in sentence_spans
        )
27 changes: 27 additions & 0 deletions tests/document/processing/test_sentence_splitter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from pytorch_ie.annotations import LabeledSpan
from pytorch_ie.documents import TextDocumentWithLabeledPartitions

from pie_modules.document.processing import NltkSentenceSplitter


def test_nltk_sentence_splitter(caplog):
    document = TextDocumentWithLabeledPartitions(
        text="This is a test sentence. This is another one.", id="test_doc"
    )
    # Pre-populate the partition layer so that calling the splitter emits the
    # "layer is not empty" warning checked below.
    document.labeled_partitions.append(
        LabeledSpan(start=0, end=len(document.text), label="text")
    )
    caplog.clear()

    splitter = NltkSentenceSplitter()
    splitter(document)

    # Exactly one warning about clearing the non-empty layer should be logged.
    assert len(caplog.records) == 1
    expected_message = (
        "Layer labeled_partitions in document test_doc is not empty. "
        "Clearing it before adding new sentence partitions."
    )
    assert caplog.records[0].message == expected_message

    # The dummy partition was replaced by one span per sentence.
    sentences = [str(span) for span in document.labeled_partitions]
    assert sentences == ["This is a test sentence.", "This is another one."]

0 comments on commit 25ea281

Please sign in to comment.