-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #84 from ArneBinder/nltk_sentence_splitter
add `NltkSentenceSplitter`
- Loading branch information
Showing
5 changed files
with
86 additions
and
1 deletion.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
from __future__ import annotations | ||
|
||
import logging | ||
import ssl | ||
from typing import TypeVar | ||
|
||
import nltk | ||
from pytorch_ie.annotations import LabeledSpan | ||
from pytorch_ie.documents import TextDocumentWithLabeledPartitions | ||
|
||
# Module-level logger, used to warn when a partition layer is overwritten.
logger = logging.getLogger(__name__)


# Document type handled by this module: any text document that carries a
# labeled-partitions annotation layer.
D = TypeVar("D", bound=TextDocumentWithLabeledPartitions)
|
||
|
||
class NltkSentenceSplitter:
    """Add sentence partitions to a TextDocumentWithLabeledPartitions document.

    The document text is segmented into sentences with the NLTK Punkt tokenizer, see
    https://www.nltk.org/api/nltk.tokenize.html#nltk.tokenize.punkt.PunktSentenceTokenizer
    for more information.

    Args:
        partition_layer_name: The name of the partition layer to add the sentence partitions to.
            This layer must be an AnnotationLayer of LabeledSpan annotations.
        text_field_name: The name of the text field in the document to split into sentences.
        sentencizer_url: The URL to the NLTK Punkt tokenizer model.
    """

    def __init__(
        self,
        partition_layer_name: str = "labeled_partitions",
        text_field_name: str = "text",
        sentencizer_url: str = "tokenizers/punkt/PY3/english.pickle",
    ):
        self.partition_layer_name = partition_layer_name
        self.text_field_name = text_field_name
        # make sure the NLTK Punkt tokenizer model is available locally, then load it
        nltk.download("punkt")
        self.sentencizer = nltk.data.load(sentencizer_url)

    def __call__(self, document: D) -> None:
        target_layer = document[self.partition_layer_name]
        if len(target_layer) > 0:
            logger.warning(
                f"Layer {self.partition_layer_name} in document {document.id} is not empty. "
                f"Clearing it before adding new sentence partitions."
            )
            target_layer.clear()

        document_text: str = getattr(document, self.text_field_name)
        # span_tokenize yields (start, end) character offsets into the text
        new_partitions = [
            LabeledSpan(start=begin, end=stop, label="sentence")
            for begin, stop in self.sentencizer.span_tokenize(document_text)
        ]
        target_layer.extend(new_partitions)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
from pytorch_ie.annotations import LabeledSpan | ||
from pytorch_ie.documents import TextDocumentWithLabeledPartitions | ||
|
||
from pie_modules.document.processing import NltkSentenceSplitter | ||
|
||
|
||
def test_nltk_sentence_splitter(caplog):
    """End-to-end check of NltkSentenceSplitter: warning on a non-empty layer,
    then the layer contains exactly the two sentences of the text."""
    doc = TextDocumentWithLabeledPartitions(
        text="This is a test sentence. This is another one.", id="test_doc"
    )
    # pre-populate the partition layer so the splitter emits its warning (see below)
    doc.labeled_partitions.append(LabeledSpan(start=0, end=len(doc.text), label="text"))
    caplog.clear()

    splitter = NltkSentenceSplitter()
    splitter(doc)

    # exactly one warning about the non-empty layer was logged
    logged_messages = [record.message for record in caplog.records]
    assert logged_messages == [
        "Layer labeled_partitions in document test_doc is not empty. "
        "Clearing it before adding new sentence partitions."
    ]

    # the dummy partition was replaced by the two detected sentences
    assert [str(span) for span in doc.labeled_partitions] == [
        "This is a test sentence.",
        "This is another one.",
    ]