From 2fc8fee10df55c69b42b496ae9b82580509e92d9 Mon Sep 17 00:00:00 2001 From: Ostrzyciel Date: Tue, 4 Jun 2024 09:24:08 +0200 Subject: [PATCH] Implement unique S/P/O/G counts in dataset stats Issue: https://github.com/RiverBench/RiverBench/issues/89 --- src/main/scala/util/StatCounter.scala | 5 ++++ src/main/scala/util/StatCounterSuite.scala | 34 +++++++++++++--------- 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/src/main/scala/util/StatCounter.scala b/src/main/scala/util/StatCounter.scala index 688d131..5ce342f 100644 --- a/src/main/scala/util/StatCounter.scala +++ b/src/main/scala/util/StatCounter.scala @@ -3,6 +3,7 @@ package util import com.google.common.hash.{BloomFilter, Funnel, PrimitiveSink} import org.apache.jena.datatypes.xsd.XSDDatatype.* +import org.apache.jena.graph.Node import org.apache.jena.rdf.model.Resource //noinspection UnstableApiUsage @@ -19,6 +20,10 @@ object StatCounter: implicit val stringFunnel: Funnel[String] = (from: String, into: PrimitiveSink) => into.putBytes(from.getBytes) + + // Add to the node's funnel both its hashcode and string repr. bytes to avoid collisions... just in case + implicit val nodeFunnel: Funnel[Node] = + (from: Node, into: PrimitiveSink) => into.putInt(from.hashCode()).putBytes(from.toString.getBytes) class LightStatCounter[T]: import StatCounter.* diff --git a/src/main/scala/util/StatCounterSuite.scala b/src/main/scala/util/StatCounterSuite.scala index 13a59a4..4e638dc 100644 --- a/src/main/scala/util/StatCounterSuite.scala +++ b/src/main/scala/util/StatCounterSuite.scala @@ -2,7 +2,7 @@ package io.github.riverbench.ci_worker package util import org.apache.jena.datatypes.xsd.XSDDatatype.* -import org.apache.jena.graph.{Node, Triple} +import org.apache.jena.graph.{Node, NodeFactory, Node_URI, Triple} import org.apache.jena.rdf.model.{Model, Resource} import org.apache.jena.sparql.core.DatasetGraph import org.apache.jena.vocabulary.RDF @@ -61,6 +61,9 @@ object StatCounterSuite: */ class StatCounterSuite(val size: Long): import StatCounter.* + + // Hack. Jena usually represents the default graph node as null, which is not great for us here. + private val DEFAULT_GRAPH = NodeFactory.createBlankNode("DEFAULT GRAPH") // A bad heuristic: 10x the size of the stream is assumed to be the number of elements in the bloom filters private val cIris = new StatCounter[String](10 * size) @@ -72,20 +75,23 @@ class StatCounterSuite(val size: Long): private val cBlankNodes = new LightStatCounter[String]() private val cQuotedTriples = new LightStatCounter[String]() - private val cSubjects = new LightStatCounter[String]() - private val cPredicates = new LightStatCounter[String]() - private val cObjects = new LightStatCounter[String]() - private val cGraphs = new LightStatCounter[String]() + private val cSubjects = new StatCounter[Node](10 * size) + private val cPredicates = new StatCounter[Node](10 * size) + private val cObjects = new StatCounter[Node](10 * size) + private val cGraphs = new StatCounter[Node](10 * size) private val cStatements = new LightStatCounter[String]() def add(ds: DatasetGraph): Unit = - cGraphs.lightAdd( - ds.listGraphNodes().asScala.size + (if ds.getDefaultGraph.isEmpty then 0 else 1) - ) - val subjects = mutable.Set[String]() - val predicates = mutable.Set[String]() - val objects = mutable.Set[String]() + + if ds.getDefaultGraph.isEmpty then + cGraphs.add(ds.listGraphNodes().asScala.toSeq) + else + cGraphs.add(ds.listGraphNodes().asScala.toSeq :+ DEFAULT_GRAPH) + + val subjects = mutable.Set[Node]() + val predicates = mutable.Set[Node]() + val objects = mutable.Set[Node]() val iris = mutable.Set[String]() val blankNodes = mutable.Set[String]() val literals = mutable.Set[String]() @@ -105,9 +111,9 @@ class StatCounterSuite(val size: Long): stCount += 1 getTriples(t.asTriple) }).flatMap(t => { - subjects += t.getSubject.toString(false) - predicates += t.getPredicate.toString(false) - objects += t.getObject.toString(false) + subjects += t.getSubject + predicates += t.getPredicate + objects += t.getObject t.getSubject :: t.getPredicate :: t.getObject :: Nil }) ++ ds.listGraphNodes().asScala