
Commit

Update dataset package
jesper-friis committed Oct 25, 2024
1 parent eb28b50 commit f2900ab
Showing 3 changed files with 34 additions and 66 deletions.
48 changes: 0 additions & 48 deletions tests/input/datasets.yaml

This file was deleted.

1 change: 0 additions & 1 deletion tests/input/semdata.yaml
@@ -34,7 +34,6 @@ datasets:
downloadURL: https://github.com/EMMC-ASBL/tripper/raw/refs/heads/dataset/tests/input/77600-23-001_5kV_400x_m001.tif
mediaType: image/tiff
parser: parser:sem_hitachi
-generator: gen:sem_hitachi

- "@id": semdata:SEM_cement_batch2/77600-23-001
"@type": sem:SEMImageSeries
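For orientation, the entry above comes from the YAML data-documentation format that `read_datadoc()` parses and `save_datadoc()` stores. Below is a minimal sketch, not taken from this commit, of how such an entry might look as a Python dict and be pushed to a triplestore with `save_dict()`; the import path, the `"dataset"` resource-type argument, the nesting of the distribution keywords, and the namespace URLs are all assumptions.

```python
from tripper import Triplestore
from tripper.dataset import save_dict  # import path assumed

ts = Triplestore(backend="rdflib")

# One dataset entry mirroring the YAML above (nesting under "distribution" assumed).
dataset = {
    "@id": "semdata:SEM_cement_batch2/77600-23-001",
    "@type": "sem:SEMImageSeries",
    "distribution": {
        "downloadURL": (
            "https://github.com/EMMC-ASBL/tripper/raw/refs/heads/dataset/"
            "tests/input/77600-23-001_5kV_400x_m001.tif"
        ),
        "mediaType": "image/tiff",
        "parser": "parser:sem_hitachi",
    },
}

# Namespace bindings for the compact IRIs above (illustrative values).
prefixes = {
    "semdata": "https://example.com/semdata#",
    "sem": "https://example.com/sem#",
    "parser": "https://example.com/parser#",
}

save_dict(ts, "dataset", dataset, prefixes=prefixes)
```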
51 changes: 34 additions & 17 deletions tripper/dataset/dataset.py
@@ -1,25 +1,34 @@
"""Module for documenting datasets with Tripper.
-The dataset documentation follows the DCAT structure and is exposed as
-Python dicts with attribute access in this module. This dict
-structure is used by the functions:
-- `read_datadoc()`: Read documentation from YAML file and return it as dict.
-- `save_dict()`: Save dict documentation to the triplestore.
-- `load_dict()`: Load dict documentation from the triplestore.
-YAML documentation can also be stored directly to the triplestore with
-- `save_datadoc()`: Save documentation from YAML file to the triplestore.
+The dataset documentation follows the [DCAT] structure and is exposed
+as Python dicts with attribute access in this module. The semantic
+meaning of the keywords in this dict is defined by a [JSON-LD context].
-For accessing and storing actual data, the following functions can be used:
+High-level functions for accessing and storing actual data:
- `load()`: Load documented dataset from its source.
- `save()`: Save documented dataset to a data resource.
-For searching the triplestore:
+High-level function for populating the triplestore from YAML documentation:
+- `save_datadoc()`: Save documentation from YAML file to the triplestore.
+Functions for searching the triplestore:
- `list_dataset_iris()`: Get IRIs of matching datasets.
-For interaction with OTEAPI:
+Functions for working with the dict-representation:
+- `read_datadoc()`: Read documentation from YAML file and return it as dict.
+- `save_dict()`: Save dict documentation to the triplestore.
+- `load_dict()`: Load dict documentation from the triplestore.
+Functions for interaction with OTEAPI:
- `get_partial_pipeline()`: Returns an OTELib partial pipeline.
+---
+__TODO__: Update the URL to the JSON-LD context when merged to master
+[DCAT]: https://www.w3.org/TR/vocab-dcat-3/
+[JSON-LD context]: https://raw.githubusercontent.com/EMMC-ASBL/tripper/refs/heads/dataset/tripper/context/0.2/context.json
"""

# pylint: disable=invalid-name,redefined-builtin,import-outside-toplevel
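The reorganized function overview above can be read as a small workflow: document, search, load. Here is a hedged end-to-end sketch based only on those one-line summaries; the import path, the exact signatures of `save_datadoc()`, `list_dataset_iris()`, `load_dict()` and `load()`, and the backend name are assumptions, not taken from this commit.

```python
from tripper import Triplestore
from tripper.dataset import (  # import path assumed
    list_dataset_iris,
    load,
    load_dict,
    save_datadoc,
)

ts = Triplestore(backend="rdflib")

# Populate the triplestore from a YAML data-documentation file
# (assumed signature: triplestore + path to the YAML file).
save_datadoc(ts, "tests/input/semdata.yaml")

# Search the triplestore for documented datasets and inspect each one.
for iri in list_dataset_iris(ts):  # search criteria assumed optional
    doc = load_dict(ts, iri)       # dict documentation of the dataset
    data = load(ts, iri)           # actual bytes fetched from the documented source
    print(iri, len(data), doc.get("@type"))
```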
@@ -107,20 +116,23 @@ def save(
generator: "Optional[str]" = None,
prefixes: "Optional[dict]" = None,
use_sparql: "Optional[bool]" = None,
-) -> None:
-"""Saves a documented dataset to a data resource.
+) -> str:
+"""Saves data to a data resource and documents it in the triplestore.
Arguments:
ts: Triplestore to load data from.
-data: Bytes representation of the dataset to save.
+data: Bytes representation of the data to save.
class_iri: IRI of a class in the ontology (e.g. an `emmo:DataSet`
subclass) that describes the dataset that is saved.
Is used to select the `distribution` if that is not given.
If `distribution` is also given, a
`dcat:distribution value <distribution>` restriction will be
added to `class_iri`.
-dataset: IRI of dataset for the data to be saved.
-Or a dict with additional documentation of the dataset.
+dataset: Either the IRI of the dataset individual standing for
+the data to be saved, or a dict with additional
+documentation of the dataset.
+If the dataset already exists, a new distribution will be added
+to it. Otherwise a new random blank node IRI will be created.
distribution: IRI of distribution for the data to be saved.
Or a dict with additional documentation of the distribution,
like media type, parsers, generators etc...
@@ -131,6 +143,9 @@ def save(
use_sparql: Whether to access the triplestore with SPARQL.
Defaults to `ts.prefer_sparql`.
+Returns:
+IRI of the dataset.
"""
# pylint: disable=too-many-locals,too-many-branches,too-many-statements
# Use the Protocol plugin system from DLite. Should we move it to tripper?
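Based on the updated signature and the Arguments/Returns sections above, a call to `save()` might look like the sketch below. This is not code from the commit: the import path, the exact keyword handling, and all IRIs and namespace URLs are placeholders or assumptions.

```python
from tripper import Triplestore
from tripper.dataset import save  # import path assumed

ts = Triplestore(backend="rdflib")

with open("77600-23-001_5kV_400x_m001.tif", "rb") as f:
    data = f.read()

# `dataset` and `distribution` may be given as IRIs or as dicts with extra
# documentation (media type, parser, generator, ...); values here are illustrative.
iri = save(
    ts,
    data,
    dataset={
        "@id": "semdata:SEM_cement_batch2/77600-23-001",
        "@type": "sem:SEMImageSeries",
    },
    distribution={
        "downloadURL": "file:///tmp/77600-23-001_5kV_400x_m001.tif",
        "mediaType": "image/tiff",
        "generator": "gen:sem_hitachi",
    },
    prefixes={
        "semdata": "https://example.com/semdata#",  # illustrative namespaces
        "sem": "https://example.com/sem#",
        "gen": "https://example.com/gen#",
    },
)
print(iri)  # save() now returns the IRI of the dataset individual
```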
@@ -245,6 +260,8 @@ def save(
elif save_distribution:
save_dict(ts, "distribution", distribution, prefixes=prefixes)

+return dataset["@id"]


def load(
ts: Triplestore,
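The new return value also makes it easy to round-trip data: the IRI returned by `save()` can be handed straight to `load()`, whose definition starts just above. A hedged sketch, with signatures assumed from the module docstring summaries and a placeholder payload:

```python
from tripper import Triplestore
from tripper.dataset import load, save  # import path assumed

ts = Triplestore(backend="rdflib")
data = b"...raw TIFF bytes..."  # placeholder payload

iri = save(ts, data, distribution={"mediaType": "image/tiff"})  # now returns the dataset IRI
reloaded = load(ts, iri)  # assumed signature: (triplestore, dataset IRI) -> bytes
assert reloaded == data
```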
