Skip to content

Commit

Permalink
fix #37
Browse files Browse the repository at this point in the history
  • Loading branch information
Binh Vu committed Nov 9, 2023
1 parent 2aa2e15 commit 1194040
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 5 deletions.
2 changes: 2 additions & 0 deletions sand/container.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import annotations

import os
from contextlib import contextmanager
from pathlib import Path
Expand Down
52 changes: 51 additions & 1 deletion sand/extensions/export/drepr/main.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from collections import defaultdict
from io import BytesIO, StringIO
from typing import List, Set

import orjson
Expand All @@ -19,6 +21,8 @@
Resource,
ResourceType,
)
from kgdata.dbpedia.datasets.ontology_dump import aggregated_triples
from rdflib import RDF, Graph, URIRef
from slugify import slugify
from sm.misc.funcs import assert_not_null

Expand Down Expand Up @@ -110,7 +114,7 @@ def export_data(
output=MemoryOutput(output_format),
debug=False,
)
return content
return self.post_processing(sm, content, output_format)

def export_drepr_model(self, table: Table, sm: O.SemanticModel) -> DRepr:
"""Create a D-REPR model of the dataset."""
Expand Down Expand Up @@ -211,3 +215,49 @@ def export_drepr_model(self, table: Table, sm: O.SemanticModel) -> DRepr:
],
sm=dsm,
)

def post_processing(
    self, sm: O.SemanticModel, ttldata: str, output_format: OutputFormat
) -> str:
    """Patch the D-REPR output with triples D-REPR does not generate yet.

    Known gap handled here:

    1. D-REPR doesn't generate relationships for literals that have
       outgoing edges to class nodes. For every such literal node, one
       triple ``(literal value, edge predicate, resource)`` is added for
       each resource in the exported data whose ``rdf:type`` is the class
       of the edge's target node.

    Args:
        sm: The semantic model the data was exported with.
        ttldata: The serialized output produced by D-REPR.
        output_format: The format of ``ttldata``.

    Returns:
        ``ttldata`` unchanged when the semantic model has no literal node
        with outgoing edges; otherwise the patched graph re-serialized as
        Turtle.

    Raises:
        AssertionError: If patching is needed but ``output_format`` is not
            TTL, or if a literal node points to a non-class node.
    """
    outliterals = [
        node
        for node in sm.iter_nodes()
        if isinstance(node, O.LiteralNode) and sm.out_degree(node.id) > 0
    ]
    if not outliterals:
        # Nothing to patch: hand the data back untouched, whatever its format.
        return ttldata

    assert output_format == OutputFormat.TTL, "Only support TTL output format"

    g = Graph()
    # State the format explicitly: an in-memory source has no file name to
    # guess the format from, and the fallback differs between rdflib versions.
    g.parse(data=ttldata, format="turtle")

    # Group the triples by subject so each subject becomes one resource.
    source2triples = defaultdict(list)
    for s, p, o in g:
        source2triples[s].append((s, p, o))

    # Index resource ids by every rdf:type they declare. Subjects without
    # an rdf:type are simply absent from the index instead of raising
    # KeyError, and the index is built once rather than rescanning all
    # resources for every outgoing edge.
    rdf_type = str(RDF.type)
    class2resources = defaultdict(list)
    for item in source2triples.items():
        resource = aggregated_triples(item)
        for resource_type in resource.props.get(rdf_type, ()):
            class2resources[str(resource_type)].append(resource.id)

    # Collect first, add afterwards, so the graph is not modified while the
    # new triples are still being derived from it.
    new_triples = []
    for node in outliterals:
        node_value = URIRef(node.value)
        for edge in sm.out_edges(node.id):
            target_node = sm.get_node(edge.target)
            assert isinstance(target_node, O.ClassNode)
            predicate = URIRef(edge.abs_uri)
            for resource_id in class2resources.get(target_node.abs_uri, ()):
                new_triples.append((node_value, predicate, URIRef(resource_id)))

    for triple in new_triples:
        g.add(triple)

    # Serialize into a byte buffer and decode, which yields a str
    # regardless of what Graph.serialize itself returns.
    file = BytesIO()
    g.serialize(file, format="turtle")
    return file.getvalue().decode()
8 changes: 4 additions & 4 deletions sand/models/ontology.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,17 +56,17 @@ def readable_label(self):
@inject
def get_default_properties(cfg: AppConfig = Provide["appcfg"]):
mapping = {
"rdfs:label": OntProperty(
id="rdfs:label",
str(RDFS.label): OntProperty(
id=str(RDFS.label),
uri=str(RDFS.label),
label="rdfs:label",
aliases=[],
datatype="string",
description="Provides a human-readable version of a resource's name.",
parents=[],
),
"rdf:type": OntProperty(
id="rdf:type",
str(RDF.type): OntProperty(
id=str(RDF.type),
uri=str(RDF.type),
label="rdf:type",
aliases=[],
Expand Down

0 comments on commit 1194040

Please sign in to comment.