diff --git a/tests/input/datasets.yaml b/tests/input/datasets.yaml
deleted file mode 100644
index badd922..0000000
--- a/tests/input/datasets.yaml
+++ /dev/null
@@ -1,48 +0,0 @@
----
-"@context": https://raw.githubusercontent.com/EMMC-ASBL/oteapi-dlite/refs/heads/rdf-serialisation/oteapi_dlite/context/0.2/context.json
-
-# This extends the list of prefixes that are already defined in the context
-prefixes:
-  sem: "https://w3id.com/emmo/domain/sem/0.1#"
-  semdata: "http://sintef.no/data/matchmaker/SEM/"
-  dm: "http://onto-ns.com/meta/characterisation/0.1/SEMImage#"
-
-# List of documented datasets
-datasets:
-    # unique ID of the dataset
-  - "@id": "semdata:sample3/pos1_01_grid_200x"
-    # Dataset type as defined in the SEM domain ontology
-    "@type": "https://w3id.com/emmo/domain/sem/0.1#"
-    title: SEM image of cement
-    description: Back-scattered SEM image of cement sample 3, polished with 1 µm diamond compound.
-    creator: Sigurd Wenner
-    contactPoint: "Sigurd Wenner "
-
-    datamodel: http://onto-ns.com/meta/characterisation/0.1/SEMImage
-    mappingURL: https://raw.githubusercontent.com/HEU-MatCHMaker/DataDocumentation/refs/heads/master/SEM/datamodels/SEMImage.ttl
-
-    # Contextual documentation of the dataset
-    statements:
-      - ["@id", "sem:fromSample", "semdata:sample3"]
-      - ["@id", "emmo:isDescriptionOf", "semdata:concrete1"]
-
-    # A dataset can have several distributions, hence a list
-    distribution:
-      - downloadURL: https://github.com/HEU-MatCHMaker/DataDocumentation/raw/refs/heads/master/SEM/example_data/(ThermoFisher)%20pos1_01_grid_200x.tif
-        mediaType: image/tiff
-        parser:
-          parserType: application/vnd.dlite-parse
-          configuration:
-            driver: image
-            options: "plugin=tiffile"
-
-    # List of consumers of this dataset
-    datasink:
-        # This simple consumer wants the data in png format
-      - storeURL: mydata.png
-        mediaType: image/png
-        generator:
-          functionType: application/vnd.dlite-generate
-          configuration:
-            driver: image
-            options: format=png
diff --git a/tests/input/semdata.yaml b/tests/input/semdata.yaml
index ec2a839..e640d93 100644
--- a/tests/input/semdata.yaml
+++ b/tests/input/semdata.yaml
@@ -34,7 +34,6 @@ datasets:
       downloadURL: https://github.com/EMMC-ASBL/tripper/raw/refs/heads/dataset/tests/input/77600-23-001_5kV_400x_m001.tif
       mediaType: image/tiff
       parser: parser:sem_hitachi
-      generator: gen:sem_hitachi
 
   - "@id": semdata:SEM_cement_batch2/77600-23-001
     "@type": sem:SEMImageSeries
diff --git a/tripper/dataset/dataset.py b/tripper/dataset/dataset.py
index f321c03..7682633 100644
--- a/tripper/dataset/dataset.py
+++ b/tripper/dataset/dataset.py
@@ -1,25 +1,34 @@
 """Module for documenting datasets with Tripper.
 
-The dataset documentation follows the DCAT structure and is exposed as
-Python dicts with attribute access in this module. This dict
-structure is used by the functions:
-  - `read_datadoc()`: Read documentation from YAML file and return it as dict.
-  - `save_dict()`: Save dict documentation to the triplestore.
-  - `load_dict()`: Load dict documentation from the triplestore.
-
-YAML documentation can also be stored directly to the triplestore with
-  - `save_datadoc()`: Save documentation from YAML file to the triplestore.
+The dataset documentation follows the [DCAT] structure and is exposed
+as Python dicts with attribute access in this module. The semantic
+meaning of the keywords in this dict is defined by a [JSON-LD context].
 
-For accessing and storing actual data, the following functions can be used:
+High-level functions for accessing and storing actual data:
   - `load()`: Load documented dataset from its source.
   - `save()`: Save documented dataset to a data resource.
 
-For searching the triplestore:
+High-level function for populating the triplestore from YAML documentation:
+  - `save_datadoc()`: Save documentation from YAML file to the triplestore.
+
+Functions for searching the triplestore:
   - `list_dataset_iris()`: Get IRIs of matching datasets.
 
-For interaction with OTEAPI:
+Functions for working with the dict-representation:
+  - `read_datadoc()`: Read documentation from YAML file and return it as dict.
+  - `save_dict()`: Save dict documentation to the triplestore.
+  - `load_dict()`: Load dict documentation from the triplestore.
+
+Functions for interaction with OTEAPI:
   - `get_partial_pipeline()`: Returns a OTELib partial pipeline.
 
+---
+
+__TODO__: Update the URL to the JSON-LD context when merged to master
+
+[DCAT]: https://www.w3.org/TR/vocab-dcat-3/
+[JSON-LD context]: https://raw.githubusercontent.com/EMMC-ASBL/tripper/refs/heads/dataset/tripper/context/0.2/context.json
+
 """
 
 # pylint: disable=invalid-name,redefined-builtin,import-outside-toplevel
@@ -107,20 +116,23 @@ def save(
     generator: "Optional[str]" = None,
     prefixes: "Optional[dict]" = None,
     use_sparql: "Optional[bool]" = None,
-) -> None:
-    """Saves a documented dataset to a data resource.
+) -> str:
+    """Saves data to a data resource and documents it in the triplestore.
 
     Arguments:
         ts: Triplestore to load data from.
-        data: Bytes representation of the dataset to save.
+        data: Bytes representation of the data to save.
         class_iri: IRI of a class in the ontology (e.g. an `emmo:DataSet`
             subclass) that describes the dataset that is saved.
            Is used to select the `distribution` if that is not given.
            If `distribution` is also given, a `dcat:distribution value `
            restriction will be added to `class_iri`
-        dataset: IRI of dataset for the data to be saved.
-            Or a dict with additional documentation of the dataset.
+        dataset: Either the IRI of the dataset individual standing for
+            the data to be saved or a dict with additional
+            documentation of the dataset.
+            If the dataset already exists, a new distribution will be added
+            to it. Otherwise, a new random blank node IRI will be created.
         distribution: IRI of distribution for the data to be saved.
            Or a dict additional documentation of the distribution,
            like media type, parsers, generators etc...
@@ -131,6 +143,9 @@ def save(
         use_sparql: Whether to access the triplestore with SPARQL.
            Defaults to `ts.prefer_sparql`.
 
+    Returns:
+        IRI of the dataset.
+
     """
     # pylint: disable=too-many-locals,too-many-branches,too-many-statements
     # Use the Protocol plugin system from DLite. Should we move it to tripper?
@@ -245,6 +260,8 @@ def save(
     elif save_distribution:
         save_dict(ts, "distribution", distribution, prefixes=prefixes)
 
+    return dataset["@id"]
+
 
 def load(
     ts: Triplestore,
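
The reorganised module docstring groups the public API into high-level and lower-level functions. A typical round trip with the high-level functions might look like the sketch below; the import path `tripper.dataset`, the `load(ts, iri)` argument order and the acceptance of a prefixed IRI are assumptions not confirmed by this diff, and the YAML file is the one from tests/input/semdata.yaml.

# Hedged sketch of the documented workflow: populate the triplestore from a
# YAML data-documentation file, then load the bytes of a documented dataset.
from tripper import Triplestore
from tripper.dataset import load, save_datadoc  # import path assumed

ts = Triplestore(backend="rdflib")

# Store the YAML documentation (datasets, distributions, prefixes, ...)
# as triples in the triplestore.
save_datadoc(ts, "tests/input/semdata.yaml")

# Load the actual data of one of the documented datasets from its source;
# the IRI below is taken from the semdata.yaml hunk in this diff.
data = load(ts, "semdata:SEM_cement_batch2/77600-23-001")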
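
Since `save()` now returns the IRI of the dataset individual (the new `return dataset["@id"]`), callers can capture that IRI for later queries. A minimal sketch, assuming the positional order `(ts, data, ...)`, the import path `tripper.dataset`, and that the keywords of the `distribution` dict mirror those used in the YAML examples above; the file names are hypothetical.

# Hypothetical sketch: save raw bytes and capture the returned dataset IRI.
from tripper import Triplestore
from tripper.dataset import save  # import path assumed

ts = Triplestore(backend="rdflib")

with open("mydata.png", "rb") as f:  # hypothetical local file
    data = f.read()

dataset_iri = save(
    ts,
    data,
    distribution={  # keywords assumed to mirror the YAML examples
        "downloadURL": "file:///tmp/mydata.png",
        "mediaType": "image/png",
    },
)
print(dataset_iri)  # a random blank node IRI if no existing dataset was given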