Skip to content

RDF notebook

Gezim Sejdiu edited this page Jun 13, 2018 · 3 revisions

SANSA uses the RDF data model for representing graphs consisting of triples with subject, predicate and object. RDF datasets may contain multiple RDF graphs and record information about each graph, allowing any of the upper layers of SANSA (Querying and ML) to make queries that involve information from more than one graph. Instead of dealing with RDF datasets directly, SANSA converts the target RDF datasets into an RDD of triples. We name such an RDD the main dataset. The main dataset is based on an RDD data structure, which is a basic building block of the Spark framework. RDDs are in-memory collections of records that can be operated on in parallel on large clusters.

Triple reader example

import net.sansa_stack.rdf.spark.io._
import org.apache.jena.riot.Lang

// Location of the N-Triples file on HDFS and its RDF serialization format.
val input = "hdfs://namenode:8020/data/rdf.nt"
val lang = Lang.NTRIPLES

// Load the file into an RDD[Triple] — the "main dataset" all other layers build on.
val triples = spark.rdf(lang)(input)

// Show a small sample to confirm the data loaded as expected.
triples.take(5).foreach(println)

Triple Ops example

import org.apache.jena.riot.Lang
import net.sansa_stack.rdf.spark.io._
import net.sansa_stack.rdf.spark.model._
import org.apache.jena.graph.NodeFactory

val input = "hdfs://namenode:8020/data/rdf.nt"

val lang = Lang.NTRIPLES
val triples = spark.rdf(lang)(input)

// Triples filtered by subject ( "http://dbpedia.org/resource/Charles_Dickens" )
println("All triples related to Dickens:\n" + triples.find(Some(NodeFactory.createURI("http://dbpedia.org/resource/Charles_Dickens")), None, None).collect().mkString("\n"))

// Triples filtered by predicate ( "http://dbpedia.org/ontology/influenced" )
println("All triples for predicate influenced:\n" + triples.find(None, Some(NodeFactory.createURI("http://dbpedia.org/ontology/influenced")), None).collect().mkString("\n"))

// Triples filtered by object ( <http://dbpedia.org/resource/Henry_James> )
println("All triples influenced by Henry_James:\n" + triples.find(None, None, Some(NodeFactory.createURI("http://dbpedia.org/resource/Henry_James"))).collect().mkString("\n"))

// Basic cardinality statistics over the dataset.
println("Number of triples: " + triples.distinct.count())
println("Number of subjects: " + triples.getSubjects.distinct.count())
println("Number of predicates: " + triples.getPredicates.distinct.count())
println("Number of objects: " + triples.getObjects.distinct.count())

val subjects = triples.filterSubjects(_.isURI()).collect.mkString("\n")

// Fixed: the original used `_.isVariable()`, which always yields an empty result
// for parsed N-Triples data (predicates in serialized RDF are URIs; variables
// cannot occur). Filtering for URI predicates is the intended demonstration.
val predicates = triples.filterPredicates(_.isURI()).collect.mkString("\n")
val objects = triples.filterObjects(_.isLiteral()).collect.mkString("\n")

triples.getTriples().take(5).foreach(println(_))

PageRank of resources example

import org.apache.spark.graphx.Graph
import net.sansa_stack.rdf.spark.io._
import net.sansa_stack.rdf.spark.model.graph._
import org.apache.jena.riot.Lang

val input = "hdfs://namenode:8020/data/rdf.nt"
val lang = Lang.NTRIPLES
val triples = spark.rdf(lang)(input)

// Build a GraphX property graph from the triples and run PageRank until the
// ranks converge within the given tolerance (0.00001).
val graph = triples.asGraph()
val pagerank = graph.pageRank(0.00001).vertices

// Join ranks back to the vertex values (RDF nodes) and sort by rank, highest
// first. Fixed: the original used `sortBy(50 - _._1)` — a magic-number trick
// for descending order; negating the key expresses the same ordering clearly.
val report = pagerank.join(graph.vertices)
  .map({ case (k, (r, v)) => (r, v, k) })
  .sortBy(-_._1)

// Render the top 50 resources as a Zeppelin %table: "resource<TAB>rank".
val rankedreport = report.map(f => f._2 + "\t" + f._1 )
println("%table resource\t rank\n " + rankedreport.take(50).mkString("\n"))

RDF statistics example

import net.sansa_stack.rdf.spark.io._
import org.apache.jena.riot.Lang
import net.sansa_stack.rdf.spark.stats._

val input = "hdfs://namenode:8020/data/rdf.nt"
val lang = Lang.NTRIPLES

val triples = spark.rdf(lang)(input)

// Count how often each property is used across the dataset.
val propertyDist = triples.statsPropertyUsage()

// Keep the 100 most frequent properties and format each row
// as "<localName><TAB><count>" for tabular display.
val sortedValues = propertyDist
  .sortBy({ case (_, count) => count }, false)
  .take(100)
  .map { case (property, count) => property.getLocalName + "\t" + count }

println("%table Property Distribution\tFrequency\n " + sortedValues.mkString("\n"))

RDF Quality Assessment Example

import org.apache.jena.riot.Lang
import net.sansa_stack.rdf.spark.qualityassessment._
import net.sansa_stack.rdf.spark.io._

val input = "hdfs://namenode:8020/data/rdf.nt"
val lang = Lang.NTRIPLES
val triples = spark.rdf(lang)(input)

// Compute quality assessment metrics over the dataset.
// Completeness metrics:
val completeness_schema = triples.assessSchemaCompleteness()
val completeness_interlinking = triples.assessInterlinkingCompleteness()
val completeness_property = triples.assessPropertyCompleteness()

// Syntactic-validity metrics:
val syntacticvalidity_literalnumeric = triples.assessLiteralNumericRangeChecker()
val syntacticvalidity_XSDDatatypeCompatibleLiterals = triples.assessXSDDatatypeCompatibleLiterals()

// Availability metric:
val availability_DereferenceableUris = triples.assessDereferenceableUris()

// Relevancy metrics:
val relevancy_CoverageDetail = triples.assessCoverageDetail()
val relevancy_CoverageScope = triples.assessCoverageScope()
val relevancy_AmountOfTriples = triples.assessAmountOfTriples()

// Performance and understandability metrics:
val performance_NoHashURIs = triples.assessNoHashUris()
val understandability_LabeledResources = triples.assessLabeledResources()

// Render one row per metric as a Zeppelin %table.
// Fixed: removed stray trailing spaces that the original left after some
// interpolated values, which leaked into the rendered table cells.
val AssessQualityStr = s"""%table
metric\tvalue
completeness_schema\t$completeness_schema
completeness_interlinking\t$completeness_interlinking
completeness_property\t$completeness_property
syntacticvalidity_literalnumeric\t$syntacticvalidity_literalnumeric
syntacticvalidity_XSDDatatypeCompatibleLiterals\t$syntacticvalidity_XSDDatatypeCompatibleLiterals
availability_DereferenceableUris\t$availability_DereferenceableUris
relevancy_CoverageDetail\t$relevancy_CoverageDetail
relevancy_CoverageScope\t$relevancy_CoverageScope
relevancy_AmountOfTriples\t$relevancy_AmountOfTriples
performance_NoHashURIs\t$performance_NoHashURIs
understandability_LabeledResources\t$understandability_LabeledResources
"""

z.show(AssessQualityStr)