separate test workflow for datasets #117

Merged · 9 commits · Apr 15, 2024
12 changes: 10 additions & 2 deletions .github/workflows/test.yaml
@@ -4,8 +4,16 @@ name: Tests
on:
push:
branches: [main]
+    paths-ignore:
+      - "dataset_builders/**"
+      - "tests/dataset_builders/**"
+      - "tests/fixtures/dataset_builders/**"
pull_request:
branches: [main, "release/*"]
+    paths-ignore:
+      - "dataset_builders/**"
+      - "tests/dataset_builders/**"
+      - "tests/fixtures/dataset_builders/**"

jobs:
tests:
@@ -17,7 +25,7 @@ jobs:
os: ["ubuntu-latest"]
python-version: ["3.9"]

-    timeout-minutes: 30
+    timeout-minutes: 10

steps:
#----------------------------------------------
@@ -70,7 +78,7 @@ jobs:
- name: Run tests with coverage
run: |
source .venv/bin/activate
-          pytest -k "not slow" --cov --cov-report term-missing --cov-report xml:coverage.xml
+          pytest --ignore=tests/dataset_builders -k "not slow" --cov=src --cov-report term-missing --cov-report xml:coverage.xml
- name: Upload coverage reports to Codecov
uses: codecov/codecov-action@v3
with:
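With these filters, pushes and pull requests that only touch dataset-builder paths no longer trigger the main workflow, which also excludes `tests/dataset_builders` from collection, narrows coverage to `src`, and drops its timeout to 10 minutes. A minimal sketch of the `paths-ignore` semantics (hypothetical helper; GitHub's real glob matching is richer than the prefix check used here):

```python
# Hypothetical sketch of the paths-ignore rules above: the workflow still
# runs as long as at least one changed file is NOT under an ignored path.
IGNORED_PREFIXES = (
    "dataset_builders/",
    "tests/dataset_builders/",
    "tests/fixtures/dataset_builders/",
)

def triggers_main_tests(changed_files: list[str]) -> bool:
    return any(not path.startswith(IGNORED_PREFIXES) for path in changed_files)

assert triggers_main_tests(["src/pie_datasets/builders/brat.py"])
assert not triggers_main_tests(["tests/dataset_builders/test_brat.py"])
```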
91 changes: 91 additions & 0 deletions .github/workflows/test_datasets.yaml
@@ -0,0 +1,91 @@

name: Test Datasets

on:
push:
branches: [main]
paths:
- "src/dataset_builders/**"
- "data/datasets/**"
- "tests/dataset_builders/**"
- "tests/fixtures/dataset_builders/**"
- ".github/workflows/test_datasets.yaml"
pull_request:
branches: [main, "release/*"]
paths:
- "src/dataset_builders/**"
- "data/datasets/**"
- "tests/dataset_builders/**"
- "tests/fixtures/dataset_builders/**"
- ".github/workflows/test_datasets.yaml"

jobs:
tests:
runs-on: ${{ matrix.os }}

strategy:
fail-fast: false
matrix:
os: ["ubuntu-latest"]
python-version: ["3.9"]

timeout-minutes: 30

steps:
#----------------------------------------------
# check-out repo and set-up python
#----------------------------------------------
- name: Check out repository
uses: actions/checkout@v3
- name: Set up python
id: setup-python
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}

#----------------------------------------------
# ----- install & configure poetry -----
#----------------------------------------------
- name: Install Poetry
uses: snok/install-poetry@v1
with:
virtualenvs-create: true
virtualenvs-in-project: true
installer-parallel: true

#----------------------------------------------
# load cached venv if cache exists
#----------------------------------------------
- name: Load cached venv
id: cached-poetry-dependencies
uses: actions/cache@v3
with:
path: .venv
key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}

#----------------------------------------------
# install dependencies if cache does not exist
#----------------------------------------------
- name: Install dependencies
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
run: poetry install --no-interaction --no-root

#----------------------------------------------
# install your root project, if required
#----------------------------------------------
- name: Install project
run: poetry install --no-interaction

#----------------------------------------------
# run test suite and upload coverage data
#----------------------------------------------
- name: Run tests with coverage
run: |
source .venv/bin/activate
pytest tests/dataset_builders -k "not slow" --cov=dataset_builders --cov-report term-missing --cov-report xml:coverage_datasets.xml
- name: Upload coverage reports to Codecov
uses: codecov/codecov-action@v3
with:
files: ./coverage_datasets.xml
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
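This new workflow is the complement of the main one: the paths the main workflow ignores are the only ones that trigger it, it keeps the 30-minute timeout, runs only `tests/dataset_builders`, and uploads a separate coverage report. Both workflows deselect slow tests with `-k "not slow"`; a short hypothetical illustration of how that splits a suite (pytest's `-k` expression matches test names and marker keywords):

```python
# Hypothetical test names: any test whose name (or markers) contains
# "slow" is deselected by pytest -k "not slow" in both workflows.
def test_slow_full_dataset_download():
    ...  # skipped in CI


def test_fixture_roundtrip():
    ...  # runs in CI
```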
6 changes: 6 additions & 0 deletions src/pie_datasets/builders/brat.py
@@ -5,6 +5,7 @@

import datasets
from pie_modules.annotations import BinaryRelation, LabeledMultiSpan, LabeledSpan
+from pytorch_ie import Document
from pytorch_ie.core import Annotation, AnnotationList, annotation_field
from pytorch_ie.documents import TextBasedDocument

@@ -305,3 +306,8 @@ def _generate_document(self, example, **kwargs):
return example_to_document(
example, merge_fragmented_spans=self.config.merge_fragmented_spans
)

+    def _generate_example(self, document: Document, **kwargs) -> Dict[str, Any]:
+        if not isinstance(document, (BratDocument, BratDocumentWithMergedSpans)):
+            raise TypeError(f"document type {type(document)} is not supported")
+        return document_to_example(document)
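The new `_generate_example` is the inverse of `_generate_document`, which the round-trip test below exercises. A minimal usage sketch (construction mirrors the test fixture below; the row is the first entry of `HF_EXAMPLES` from the new test file):

```python
# Minimal round-trip sketch for the new inverse method.
from pie_datasets.builders.brat import BratBuilder

hf_example = {
    "context": "Jane lives in Berlin.\n",
    "file_name": "1",
    "spans": {
        "id": ["T1", "T2"],
        "type": ["person", "city"],
        "locations": [{"start": [0], "end": [4]}, {"start": [14], "end": [20]}],
        "text": ["Jane", "Berlin"],
    },
    "relations": {"id": [], "type": [], "arguments": []},
    "equivalence_relations": {"type": [], "targets": []},
    "events": {"id": [], "type": [], "trigger": [], "arguments": []},
    "attributions": {"id": [], "type": [], "target": [], "value": []},
    "normalizations": {"id": [], "type": [], "target": [], "resource_id": [], "entity_id": []},
    "notes": {"id": [], "type": [], "target": [], "note": []},
}

builder = BratBuilder(name="merge_fragmented_spans")
doc = builder._generate_document(example=hf_example)  # HF row -> document
assert builder._generate_example(document=doc) == hf_example  # and back
```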
Empty file added tests/unit/builder/__init__.py
198 changes: 198 additions & 0 deletions tests/unit/builder/test_brat_builder.py
@@ -0,0 +1,198 @@
from typing import Any

import pytest
from pie_modules.annotations import BinaryRelation, LabeledMultiSpan, LabeledSpan
from pytorch_ie import Annotation
from pytorch_ie.documents import TextBasedDocument

from pie_datasets.builders.brat import BratAttribute, BratBuilder

HF_EXAMPLES = [
{
"context": "Jane lives in Berlin.\n",
"file_name": "1",
"spans": {
"id": ["T1", "T2"],
"type": ["person", "city"],
"locations": [{"start": [0], "end": [4]}, {"start": [14], "end": [20]}],
"text": ["Jane", "Berlin"],
},
"relations": {"id": [], "type": [], "arguments": []},
"equivalence_relations": {"type": [], "targets": []},
"events": {"id": [], "type": [], "trigger": [], "arguments": []},
"attributions": {"id": [], "type": [], "target": [], "value": []},
"normalizations": {
"id": [],
"type": [],
"target": [],
"resource_id": [],
"entity_id": [],
},
"notes": {"id": [], "type": [], "target": [], "note": []},
},
{
"context": "Seattle is a rainy city. Jenny Durkan is the city's mayor.\n",
"file_name": "2",
"spans": {
"id": ["T1", "T2"],
"type": ["city", "person"],
"locations": [{"start": [0], "end": [7]}, {"start": [25], "end": [37]}],
"text": ["Seattle", "Jenny Durkan"],
},
"relations": {
"id": ["R1"],
"type": ["mayor_of"],
"arguments": [{"type": ["Arg1", "Arg2"], "target": ["T2", "T1"]}],
},
"equivalence_relations": {"type": [], "targets": []},
"events": {"id": [], "type": [], "trigger": [], "arguments": []},
"attributions": {
"id": ["A1", "A2"],
"type": ["factuality", "statement"],
"target": ["T1", "R1"],
"value": ["actual", "true"],
},
"normalizations": {
"id": [],
"type": [],
"target": [],
"resource_id": [],
"entity_id": [],
},
"notes": {"id": [], "type": [], "target": [], "note": []},
},
]


def resolve_annotation(annotation: Annotation) -> Any:
if annotation.target is None:
return None
if isinstance(annotation, LabeledMultiSpan):
return (
[annotation.target[start:end] for start, end in annotation.slices],
annotation.label,
)
elif isinstance(annotation, LabeledSpan):
return (annotation.target[annotation.start : annotation.end], annotation.label)
elif isinstance(annotation, BinaryRelation):
return (
resolve_annotation(annotation.head),
annotation.label,
resolve_annotation(annotation.tail),
)
elif isinstance(annotation, BratAttribute):
result = (resolve_annotation(annotation.annotation), annotation.label)
if annotation.value is not None:
return result + (annotation.value,)
else:
return result
else:
raise TypeError(f"Unknown annotation type: {type(annotation)}")


@pytest.fixture(scope="module", params=BratBuilder.BUILDER_CONFIGS)
def config_name(request) -> str:
return request.param.name


def test_config_names(config_name):
assert config_name in ["default", "merge_fragmented_spans"]


@pytest.fixture(scope="module")
def builder(config_name: str) -> BratBuilder:
return BratBuilder(name=config_name)


def test_builder(builder):
assert builder is not None


@pytest.fixture(scope="module", params=HF_EXAMPLES)
def hf_example(request) -> dict:
return request.param


def test_generate_document(builder, hf_example):
kwargs = dict()
generated_document = builder._generate_document(example=hf_example, **kwargs)
resolved_spans = [resolve_annotation(annotation=span) for span in generated_document.spans]
resolved_relations = [
resolve_annotation(relation) for relation in generated_document.relations
]
if hf_example == HF_EXAMPLES[0]:
assert len(generated_document.spans) == 2
assert len(generated_document.relations) == 0
assert len(generated_document.span_attributes) == 0
assert len(generated_document.relation_attributes) == 0

if builder.config.name == "default":
assert resolved_spans[0] == (["Jane"], "person")
assert resolved_spans[1] == (["Berlin"], "city")
elif builder.config.name == "merge_fragmented_spans":
assert resolved_spans[0] == ("Jane", "person")
assert resolved_spans[1] == ("Berlin", "city")
else:
            raise ValueError(f"Unknown builder variant: {builder.config.name}")

elif hf_example == HF_EXAMPLES[1]:
assert len(generated_document.spans) == 2
assert len(generated_document.relations) == 1
assert len(generated_document.span_attributes) == 1
assert len(generated_document.relation_attributes) == 1

resolved_span_attributes = [
resolve_annotation(attribute) for attribute in generated_document.span_attributes
]
resolved_relation_attributes = [
resolve_annotation(attribute) for attribute in generated_document.relation_attributes
]

if builder.config.name == "default":
assert resolved_spans[0] == (["Seattle"], "city")
assert resolved_spans[1] == (["Jenny Durkan"], "person")
assert resolved_relations[0] == (
(["Jenny Durkan"], "person"),
"mayor_of",
(["Seattle"], "city"),
)
assert resolved_span_attributes[0] == ((["Seattle"], "city"), "factuality", "actual")
assert resolved_relation_attributes[0] == (
((["Jenny Durkan"], "person"), "mayor_of", (["Seattle"], "city")),
"statement",
"true",
)
elif builder.config.name == "merge_fragmented_spans":
assert resolved_spans[0] == ("Seattle", "city")
assert resolved_spans[1] == ("Jenny Durkan", "person")
assert resolved_relations[0] == (
("Jenny Durkan", "person"),
"mayor_of",
("Seattle", "city"),
)
assert resolved_span_attributes[0] == (("Seattle", "city"), "factuality", "actual")
assert resolved_relation_attributes[0] == (
(("Jenny Durkan", "person"), "mayor_of", ("Seattle", "city")),
"statement",
"true",
)
else:
            raise ValueError(f"Unknown builder variant: {builder.config.name}")
else:
raise ValueError(f"Unknown sample: {hf_example}")


def test_example_to_document_and_back_all(builder):
for hf_example in HF_EXAMPLES:
doc = builder._generate_document(hf_example)
assert isinstance(doc, builder.document_type)
hf_example_back = builder._generate_example(doc)
assert hf_example == hf_example_back


def test_document_to_example_wrong_type(builder):
doc = TextBasedDocument(text="Hello, world!")

with pytest.raises(TypeError) as exc_info:
builder._generate_example(doc)
assert str(exc_info.value) == f"document type {type(doc)} is not supported"
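To run just the new test module locally, the equivalent of `pytest tests/unit/builder/test_brat_builder.py` can also be invoked programmatically:

```python
import pytest

# Run only the new builder tests; exits non-zero on failure.
raise SystemExit(pytest.main(["tests/unit/builder/test_brat_builder.py", "-q"]))
```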