Skip to content

Commit

Permalink
Add datatype count statistics
Browse files Browse the repository at this point in the history
  • Loading branch information
Ostrzyciel committed Sep 11, 2024
1 parent b9481c7 commit 6f75b58
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 5 deletions.
16 changes: 15 additions & 1 deletion src/main/scala/util/StatCounter.scala
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ class StatCounter[T : Funnel](size: Long) extends LightStatCounter[T]:
/** Registers a batch of values: every value is put into the bloom filter, and the
  * number of distinct values in this batch is passed on to `lightAdd`.
  *
  * @param values the batch to record; duplicates within the batch are counted once
  */
override def add(values: Seq[T]): Unit =
  // the bloom filter is thread-safe
  for value <- values do bloomFilter.put(value)
  // but the counter is not
  val distinctInBatch = values.distinct.size
  lightAdd(distinctInBatch)

Expand All @@ -73,3 +72,18 @@ class StatCounter[T : Funnel](size: Long) extends LightStatCounter[T]:

/** Returns the parent counter's result with `uniqueCount` filled in from the bloom
  * filter's approximate element count.
  */
override def result: Result =
  val base = super.result
  base.copy(uniqueCount = Some(bloomFilter.approximateElementCount))

/** Stat counter that remembers every value it has seen in a hash set instead of a
  * bloom filter, and reports the size of that set as the unique count.
  *
  * NOTE(review): the backing mutable set is not synchronized in this class — confirm
  * that callers never add to the same instance concurrently.
  *
  * @tparam T type of the counted values
  */
class PreciseStatCounter[T] extends LightStatCounter[T]:
  import scala.collection.mutable

  // Every value ever passed to `add` / `addUnique`.
  private val seen: mutable.HashSet[T] = mutable.HashSet.empty[T]

  /** Records a batch that may contain duplicates.
    *
    * @param values the batch; only its distinct size is forwarded to `lightAdd`
    */
  override def add(values: Seq[T]): Unit =
    seen.addAll(values)
    val distinctInBatch = values.distinct.size
    lightAdd(distinctInBatch)

  /** Records a batch whose size is forwarded to `lightAdd` as-is (no deduplication
    * of the batch is performed here).
    *
    * @param values the batch to record
    */
  override def addUnique(values: Iterable[T]): Unit =
    seen.addAll(values)
    val batchSize = values.size
    lightAdd(batchSize)

  /** Returns the parent counter's result with `uniqueCount` set to the number of
    * elements currently held in the set.
    */
  override def result: StatCounter.Result =
    val base = super.result
    base.copy(uniqueCount = Some(seen.size.toLong))
13 changes: 9 additions & 4 deletions src/main/scala/util/StatCounterSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ import scala.jdk.CollectionConverters.*
object StatCounterSuite:
case class Result(iris: StatCounter.Result, blankNodes: StatCounter.Result, literals: StatCounter.Result,
plainLiterals: StatCounter.Result, dtLiterals: StatCounter.Result,
langLiterals: StatCounter.Result, controlChars: StatCounter.Result,
quotedTriples: StatCounter.Result,
langLiterals: StatCounter.Result, datatypes: StatCounter.Result,
controlChars: StatCounter.Result, quotedTriples: StatCounter.Result,
subjects: StatCounter.Result, predicates: StatCounter.Result,
objects: StatCounter.Result, graphs: StatCounter.Result,
statements: StatCounter.Result):
Expand All @@ -27,6 +27,7 @@ object StatCounterSuite:
"SimpleLiteralCountStatistics" -> plainLiterals,
"DatatypeLiteralCountStatistics" -> dtLiterals,
"LanguageLiteralCountStatistics" -> langLiterals,
"DatatypeCountStatistics" -> datatypes,
"AsciiControlCharacterCountStatistics" -> controlChars,
"QuotedTripleCountStatistics" -> quotedTriples,
"SubjectCountStatistics" -> subjects,
Expand Down Expand Up @@ -73,6 +74,7 @@ class StatCounterSuite(val size: Long):
private val cPlainLiterals = new StatCounter[String](10 * size)
private val cDtLiterals = new StatCounter[String](10 * size)
private val cLangLiterals = new StatCounter[String](10 * size)
private val cDatatypes = new PreciseStatCounter[String]

private val cAsciiControlChars = LightStatCounter[Char]()
private val cBlankNodes = new LightStatCounter[String]()
Expand Down Expand Up @@ -100,6 +102,7 @@ class StatCounterSuite(val size: Long):
val simpleLiterals = mutable.Set[String]()
val dtLiterals = mutable.Set[String]()
val langLiterals = mutable.Set[String]()
val datatypes = mutable.Set[String]()
var controlCharCount = 0
var quotedTripleCount = 0
var stCount = 0
Expand Down Expand Up @@ -137,6 +140,7 @@ class StatCounterSuite(val size: Long):
simpleLiterals += n.getLiteralLexicalForm
else if n.getLiteralDatatypeURI != null then
dtLiterals += lit
datatypes += n.getLiteralDatatypeURI
else
simpleLiterals += n.getLiteralLexicalForm
else if n.isNodeTriple then
Expand All @@ -150,6 +154,7 @@ class StatCounterSuite(val size: Long):
cPlainLiterals.addUnique(simpleLiterals)
cDtLiterals.addUnique(dtLiterals)
cLangLiterals.addUnique(langLiterals)
cDatatypes.addUnique(datatypes)
cAsciiControlChars.lightAdd(controlCharCount)

cQuotedTriples.lightAdd(quotedTripleCount)
Expand All @@ -165,5 +170,5 @@ class StatCounterSuite(val size: Long):

def result: StatCounterSuite.Result =
StatCounterSuite.Result(cIris.result, cBlankNodes.result, cLiterals.result, cPlainLiterals.result,
cDtLiterals.result, cLangLiterals.result, cAsciiControlChars.result, cQuotedTriples.result, cSubjects.result,
cPredicates.result, cObjects.result, cGraphs.result, cStatements.result)
cDtLiterals.result, cLangLiterals.result, cDatatypes.result, cAsciiControlChars.result, cQuotedTriples.result,
cSubjects.result, cPredicates.result, cObjects.result, cGraphs.result, cStatements.result)

0 comments on commit 6f75b58

Please sign in to comment.