Skip to content

Commit

Permalink
Implement unique S/P/O/G counts in dataset stats
Browse files Browse the repository at this point in the history
  • Loading branch information
Ostrzyciel committed Jun 4, 2024
1 parent 9c5ecab commit 2fc8fee
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 14 deletions.
5 changes: 5 additions & 0 deletions src/main/scala/util/StatCounter.scala
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package util

import com.google.common.hash.{BloomFilter, Funnel, PrimitiveSink}
import org.apache.jena.datatypes.xsd.XSDDatatype.*
import org.apache.jena.graph.Node
import org.apache.jena.rdf.model.Resource

//noinspection UnstableApiUsage
Expand All @@ -19,6 +20,10 @@ object StatCounter:

// Funnel feeding a string's bytes into a hash sink.
// The charset is pinned to UTF-8: a bare getBytes uses the JVM's platform default charset,
// which would make the funnelled bytes depend on the machine the code runs on.
implicit val stringFunnel: Funnel[String] =
  (from: String, into: PrimitiveSink) =>
    into.putBytes(from.getBytes(java.nio.charset.StandardCharsets.UTF_8))

// Funnel for RDF nodes. Both the node's hash code and the bytes of its string representation
// are fed into the sink, to reduce the chance of collisions (a belt-and-braces measure).
// The charset is pinned to UTF-8: a bare getBytes uses the JVM's platform default charset,
// which would make the funnelled bytes depend on the machine the code runs on.
implicit val nodeFunnel: Funnel[Node] =
  (from: Node, into: PrimitiveSink) =>
    into
      .putInt(from.hashCode())
      .putBytes(from.toString.getBytes(java.nio.charset.StandardCharsets.UTF_8))

class LightStatCounter[T]:
import StatCounter.*
Expand Down
34 changes: 20 additions & 14 deletions src/main/scala/util/StatCounterSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package io.github.riverbench.ci_worker
package util

import org.apache.jena.datatypes.xsd.XSDDatatype.*
import org.apache.jena.graph.{Node, Triple}
import org.apache.jena.graph.{Node, NodeFactory, Node_URI, Triple}
import org.apache.jena.rdf.model.{Model, Resource}
import org.apache.jena.sparql.core.DatasetGraph
import org.apache.jena.vocabulary.RDF
Expand Down Expand Up @@ -61,6 +61,9 @@ object StatCounterSuite:
*/
class StatCounterSuite(val size: Long):
import StatCounter.*

// Workaround: Jena usually represents the default graph's node as null, which is awkward to
// handle here. This placeholder blank node is used in its place: it is the node that gets
// added to the graph counter when a dataset's default graph is non-empty.
private val DEFAULT_GRAPH = NodeFactory.createBlankNode("DEFAULT GRAPH")

// A bad heuristic: 10x the size of the stream is assumed to be the number of elements in the bloom filters
private val cIris = new StatCounter[String](10 * size)
Expand All @@ -72,20 +75,23 @@ class StatCounterSuite(val size: Long):
private val cBlankNodes = new LightStatCounter[String]()
private val cQuotedTriples = new LightStatCounter[String]()

private val cSubjects = new LightStatCounter[String]()
private val cPredicates = new LightStatCounter[String]()
private val cObjects = new LightStatCounter[String]()
private val cGraphs = new LightStatCounter[String]()
private val cSubjects = new StatCounter[Node](10 * size)
private val cPredicates = new StatCounter[Node](10 * size)
private val cObjects = new StatCounter[Node](10 * size)
private val cGraphs = new StatCounter[Node](10 * size)

private val cStatements = new LightStatCounter[String]()

def add(ds: DatasetGraph): Unit =
cGraphs.lightAdd(
ds.listGraphNodes().asScala.size + (if ds.getDefaultGraph.isEmpty then 0 else 1)
)
val subjects = mutable.Set[String]()
val predicates = mutable.Set[String]()
val objects = mutable.Set[String]()

if ds.getDefaultGraph.isEmpty then
cGraphs.add(ds.listGraphNodes().asScala.toSeq)
else
cGraphs.add(ds.listGraphNodes().asScala.toSeq :+ DEFAULT_GRAPH)

val subjects = mutable.Set[Node]()
val predicates = mutable.Set[Node]()
val objects = mutable.Set[Node]()
val iris = mutable.Set[String]()
val blankNodes = mutable.Set[String]()
val literals = mutable.Set[String]()
Expand All @@ -105,9 +111,9 @@ class StatCounterSuite(val size: Long):
stCount += 1
getTriples(t.asTriple)
}).flatMap(t => {
subjects += t.getSubject.toString(false)
predicates += t.getPredicate.toString(false)
objects += t.getObject.toString(false)
subjects += t.getSubject
predicates += t.getPredicate
objects += t.getObject

t.getSubject :: t.getPredicate :: t.getObject :: Nil
}) ++ ds.listGraphNodes().asScala
Expand Down

0 comments on commit 2fc8fee

Please sign in to comment.