implement Dataset.from_documents (#136)
* implement Dataset.from_documents

* Implemented test for dataset creation through from_documents() method

* Enhanced test for from_documents() method to cover ValueError

* Implemented from_documents() method for IterableDataset

* Improved test for the from_documents() method

* Parametrized the test

* Include testing of dataset creation with converters

* Compare full document dicts instead of just the id

* IterableDataset.from_documents requires a generator as input

* add documentation

* fix documentation

---------

Co-authored-by: Kai Carhuallanqui <[email protected]>
ArneBinder and kai-car authored Aug 20, 2024
1 parent b5f08f4 commit 8605432
Showing 2 changed files with 122 additions and 0 deletions.
74 changes: 74 additions & 0 deletions src/pie_datasets/core/dataset.py
@@ -292,6 +292,36 @@ def from_hf_dataset(
        )
        return document_dataset

    @classmethod
    def from_documents(
        cls,
        documents: List[Document],
        document_converters: Optional[DocumentConvertersType] = None,
        **dataset_kwargs,
    ) -> "Dataset":
        """Create a Dataset from a list of documents. It wraps the Huggingface
        datasets.Dataset.from_list method; see its documentation for more details.

        Args:
            documents (List[Document]): A list of documents.
            document_converters (Optional[DocumentConvertersType], optional): A dictionary of
                document converters. Defaults to None.
            **dataset_kwargs: Additional arguments for the Huggingface dataset creation.

        Returns:
            Dataset: The created dataset.
        """

        if len(documents) == 0:
            raise ValueError("No documents to create dataset from")
        document_type = type(documents[0])
        data = [doc.asdict() for doc in documents]
        hf_dataset = datasets.Dataset.from_list(mapping=data, **dataset_kwargs)
        dataset = cls.from_hf_dataset(
            hf_dataset, document_type=document_type, document_converters=document_converters
        )
        return dataset

    def apply_hf_func(self, func, **kwargs) -> "Dataset":
        return Dataset.from_hf_dataset(
            func(self, **kwargs),
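
For illustration, a minimal usage sketch of the new classmethod (hypothetical, not part of the diff; it assumes pie_datasets and pytorch_ie are installed and uses pytorch_ie's TextBasedDocument as the document type):

# Hypothetical usage sketch, not part of this commit.
from pytorch_ie.documents import TextBasedDocument

from pie_datasets import Dataset

# from_documents infers the document type from the first list entry,
# so all documents should share one type.
docs = [
    TextBasedDocument(text="Hello world.", id="doc-1"),
    TextBasedDocument(text="Goodbye.", id="doc-2"),
]
dataset = Dataset.from_documents(docs)

assert dataset.document_type is TextBasedDocument
assert len(dataset) == 2
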
@@ -470,6 +500,50 @@ def from_hf_dataset(
        )
        return dataset

    @classmethod
    def from_documents(
        cls,
        documents: Callable,
        document_converters: Optional[DocumentConvertersType] = None,
        **dataset_kwargs,
    ) -> "IterableDataset":
        """Create an IterableDataset from a generator function that yields documents. It wraps
        the Huggingface datasets.IterableDataset.from_generator method; see its documentation
        for more details.

        Args:
            documents (Callable): A generator function that yields documents.
            document_converters (Optional[DocumentConvertersType], optional): A dictionary of
                document converters. Defaults to None.
            **dataset_kwargs: Additional arguments for the Huggingface dataset creation.

        Returns:
            IterableDataset: The created iterable dataset.
        """

        # get the first document to infer the document type
        try:
            gen_kwargs = dataset_kwargs.get("gen_kwargs", {})
            first_doc = next(documents(**gen_kwargs))
        except StopIteration:
            raise ValueError("No documents to create dataset from")
        document_type = type(first_doc)

        # wrap the generator to yield dictionaries
        def wrapped_documents_generator(**kwargs):
            for doc in documents(**kwargs):
                yield doc.asdict()

        hf_dataset = datasets.IterableDataset.from_generator(
            wrapped_documents_generator, **dataset_kwargs
        )
        dataset = cls.from_hf_dataset(
            hf_dataset,
            document_type=document_type,
            document_converters=document_converters,
        )
        return dataset

    def __iter__(self):
        for example in iter(super().__iter__()):
            yield self.document_type.fromdict(example)
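A corresponding sketch for the iterable variant (again hypothetical): the input is a generator function, and any generator arguments travel through dataset_kwargs as gen_kwargs, which from_documents forwards both to the type-inference call and to datasets.IterableDataset.from_generator.

# Hypothetical usage sketch, not part of this commit.
from pytorch_ie.documents import TextBasedDocument

from pie_datasets import IterableDataset


def generate_docs(n: int):
    # a generator function, as required by IterableDataset.from_documents
    for i in range(n):
        yield TextBasedDocument(text=f"document {i}", id=str(i))


# gen_kwargs are forwarded to generate_docs
dataset = IterableDataset.from_documents(generate_docs, gen_kwargs={"n": 3})

for doc in dataset:
    print(doc.id, doc.text)
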
48 changes: 48 additions & 0 deletions tests/unit/core/test_dataset.py
@@ -6,6 +6,7 @@
import numpy
import pytest
import torch
from pytorch_ie import Document
from pytorch_ie.annotations import BinaryRelation, Label, LabeledSpan, Span
from pytorch_ie.core import AnnotationList, annotation_field
from pytorch_ie.core.taskmodule import (
@@ -431,3 +432,50 @@ def test_dataset_with_taskmodule(

    for document in train_dataset:
        assert not document["entities"].predictions


@pytest.mark.parametrize("as_iterable_dataset", [False, True])
def test_pie_dataset_from_documents(documents, as_iterable_dataset):
    if as_iterable_dataset:
        dataset_class = IterableDataset

        # make generators from the list
        def _documents():
            yield from documents

        def _empty_docs():
            return iter([])

    else:
        dataset_class = Dataset
        _documents = documents
        _empty_docs = list[Document]()

    dataset_from_documents = dataset_class.from_documents(_documents)

    assert isinstance(dataset_from_documents, dataset_class)

    assert all(isinstance(doc, TextBasedDocument) for doc in dataset_from_documents)
    assert all(
        doc1.asdict() == doc2.asdict() for doc1, doc2 in zip(documents, dataset_from_documents)
    )
    assert hasattr(dataset_from_documents, "document_type")

    # Test dataset creation with a document converter
    dataset_from_documents_with_converter = dataset_class.from_documents(
        _documents, document_converters={TestDocumentWithLabel: convert_to_document_with_label}
    )

    assert isinstance(dataset_from_documents_with_converter, dataset_class)

    assert len(dataset_from_documents_with_converter.document_converters) == 1
    assert TestDocumentWithLabel in dataset_from_documents_with_converter.document_converters
    assert (
        dataset_from_documents_with_converter.document_converters[TestDocumentWithLabel]
        == convert_to_document_with_label
    )

    # Test dataset creation with an empty list / generator
    with pytest.raises(ValueError) as excinfo:
        dataset_class.from_documents(_empty_docs)
    assert str(excinfo.value) == "No documents to create dataset from"