RDF notebook
SANSA uses the RDF data model for representing graphs consisting of triples with subject, predicate, and object. RDF datasets may contain multiple RDF graphs and record information about each graph, allowing any of the upper layers of SANSA (Querying and ML) to execute queries that involve information from more than one graph. Rather than dealing with RDF datasets directly, SANSA converts the target datasets into an RDD of triples, which we call the main dataset. The main dataset is based on an RDD, a basic building block of the Spark framework: an in-memory collection of records that can be operated on in parallel across large clusters.
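The snippet below reads an N-Triples file into the main dataset and prints the first few triples. The HDFS path is only an example; point it at your own data: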
```scala
import net.sansa_stack.rdf.spark.io._
import org.apache.jena.riot.Lang

// Example HDFS path; replace with the location of your N-Triples file.
val input = "hdfs://namenode:8020/data/rdf.nt"
val lang = Lang.NTRIPLES

// Read the file into the main dataset: an RDD[org.apache.jena.graph.Triple].
val triples = spark.rdf(lang)(input)

triples.take(5).foreach(println)
```
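Once loaded, the main dataset supports triple-pattern matching and filtering via net.sansa_stack.rdf.spark.model. The following example selects triples by subject, predicate, and object, computes distinct counts, and filters triples by the kind of RDF term in each position: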
```scala
import org.apache.jena.riot.Lang
import net.sansa_stack.rdf.spark.io._
import net.sansa_stack.rdf.spark.model._
import org.apache.jena.graph.NodeFactory

val input = "hdfs://namenode:8020/data/rdf.nt"
val lang = Lang.NTRIPLES
val triples = spark.rdf(lang)(input)

// Triples filtered by subject <http://dbpedia.org/resource/Charles_Dickens>
println("All triples related to Dickens:\n" + triples.find(Some(NodeFactory.createURI("http://dbpedia.org/resource/Charles_Dickens")), None, None).collect().mkString("\n"))

// Triples filtered by predicate <http://dbpedia.org/ontology/influenced>
println("All triples for predicate influenced:\n" + triples.find(None, Some(NodeFactory.createURI("http://dbpedia.org/ontology/influenced")), None).collect().mkString("\n"))

// Triples filtered by object <http://dbpedia.org/resource/Henry_James>
println("All triples influenced by Henry_James:\n" + triples.find(None, None, Some(NodeFactory.createURI("http://dbpedia.org/resource/Henry_James"))).collect().mkString("\n"))

// Basic counts over the dataset.
println("Number of triples: " + triples.distinct.count())
println("Number of subjects: " + triples.getSubjects.distinct.count())
println("Number of predicates: " + triples.getPredicates.distinct.count())
println("Number of objects: " + triples.getObjects.distinct.count())

// Filter triples by the kind of term in a given position. Predicates in parsed
// N-Triples are always URIs, so we filter predicates for URIs here (filtering
// for variables would always yield an empty result).
val subjects = triples.filterSubjects(_.isURI()).collect.mkString("\n")
val predicates = triples.filterPredicates(_.isURI()).collect.mkString("\n")
val objects = triples.filterObjects(_.isLiteral()).collect.mkString("\n")

// Print a sample of the triples.
triples.getTriples().take(5).foreach(println)
```
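The io package can also write a (possibly filtered) main dataset back to storage. Below is a minimal sketch, assuming the saveAsNTriplesFile writer from net.sansa_stack.rdf.spark.io; the output path is hypothetical:

```scala
import net.sansa_stack.rdf.spark.io._
import org.apache.jena.graph.NodeFactory

// Reuse the subject pattern from above to keep only the Dickens triples.
val dickens = triples.find(Some(NodeFactory.createURI("http://dbpedia.org/resource/Charles_Dickens")), None, None)

// Hypothetical output path; the writer serializes the RDD back to N-Triples.
dickens.saveAsNTriplesFile("hdfs://namenode:8020/data/out/dickens.nt")
```

The main dataset can also be treated as a property graph. The next snippet converts the triples into a GraphX graph, runs PageRank until it converges, and renders the 50 highest-ranked resources as a Zeppelin table: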
```scala
import org.apache.spark.graphx.Graph
import net.sansa_stack.rdf.spark.io._
import net.sansa_stack.rdf.spark.model.graph._
import org.apache.jena.riot.Lang

val input = "hdfs://namenode:8020/data/rdf.nt"
val lang = Lang.NTRIPLES
val triples = spark.rdf(lang)(input)

// Convert the main dataset into a GraphX property graph.
val graph = triples.asGraph()

// Run PageRank until the ranks converge within the given tolerance.
val pagerank = graph.pageRank(0.00001).vertices

// Join the ranks back to the RDF nodes and sort by rank, highest first.
val report = pagerank.join(graph.vertices)
  .map { case (k, (r, v)) => (r, v, k) }
  .sortBy(_._1, ascending = false)

val rankedreport = report.map(f => f._2 + "\t" + f._1)
println("%table resource\trank\n" + rankedreport.take(50).mkString("\n"))
```
```scala
import net.sansa_stack.rdf.spark.io._
import org.apache.jena.riot.Lang
import net.sansa_stack.rdf.spark.stats._

val input = "hdfs://namenode:8020/data/rdf.nt"
val lang = Lang.NTRIPLES
val triples = spark.rdf(lang)(input)

// Count how often each property occurs in the dataset.
val propertyDist = triples.statsPropertyUsage()

// Take the 100 most frequent properties, highest frequency first.
val sortedValues = propertyDist.sortBy(_._2, ascending = false).take(100)
  .map(f => f._1.getLocalName + "\t" + f._2)

println("%table Property Distribution\tFrequency\n" + sortedValues.mkString("\n"))
```
```scala
import org.apache.jena.riot.Lang
import net.sansa_stack.rdf.spark.qualityassessment._
import net.sansa_stack.rdf.spark.io._

val input = "hdfs://namenode:8020/data/rdf.nt"
val lang = Lang.NTRIPLES
val triples = spark.rdf(lang)(input)

// Compute the quality assessment metrics.
val completeness_schema = triples.assessSchemaCompleteness()
val completeness_interlinking = triples.assessInterlinkingCompleteness()
val completeness_property = triples.assessPropertyCompleteness()
val syntacticvalidity_literalnumeric = triples.assessLiteralNumericRangeChecker()
val syntacticvalidity_XSDDatatypeCompatibleLiterals = triples.assessXSDDatatypeCompatibleLiterals()
val availability_DereferenceableUris = triples.assessDereferenceableUris()
val relevancy_CoverageDetail = triples.assessCoverageDetail()
val relevancy_CoverageScope = triples.assessCoverageScope()
val relevancy_AmountOfTriples = triples.assessAmountOfTriples()
val performance_NoHashURIs = triples.assessNoHashUris()
val understandability_LabeledResources = triples.assessLabeledResources()

// Render all metrics as a Zeppelin table.
val AssessQualityStr = s"""%table
metric\tvalue
completeness_schema\t$completeness_schema
completeness_interlinking\t$completeness_interlinking
completeness_property\t$completeness_property
syntacticvalidity_literalnumeric\t$syntacticvalidity_literalnumeric
syntacticvalidity_XSDDatatypeCompatibleLiterals\t$syntacticvalidity_XSDDatatypeCompatibleLiterals
availability_DereferenceableUris\t$availability_DereferenceableUris
relevancy_CoverageDetail\t$relevancy_CoverageDetail
relevancy_CoverageScope\t$relevancy_CoverageScope
relevancy_AmountOfTriples\t$relevancy_AmountOfTriples
performance_NoHashURIs\t$performance_NoHashURIs
understandability_LabeledResources\t$understandability_LabeledResources
"""

z.show(AssessQualityStr)
```
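Note that the %table prefix and z.show(...) are Apache Zeppelin display hooks; when running these snippets outside Zeppelin, replace them with plain println calls.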