-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
61d965a
commit 791e1c8
Showing
17 changed files
with
302 additions
and
246 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,4 +3,6 @@ | |
Cargo.lock | ||
benches/*/*.nt | ||
!resources/root.zarr | ||
.vscode | ||
.vscode | ||
heaptrack.* | ||
tests/out |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,72 +1,106 @@ | ||
use oxigraph::io::GraphFormat; | ||
use oxigraph::io::GraphParser; | ||
use std::collections::HashMap; | ||
use rio_api::model::Triple; | ||
use rio_api::parser::TriplesParser; | ||
use std::collections::HashSet; | ||
use std::fs::File; | ||
use std::io::BufReader; | ||
|
||
use crate::dictionary::Dictionary; | ||
use crate::error::ParserError; | ||
|
||
pub type Graph = HashMap<String, Vec<(String, String)>>; | ||
use self::ntriples::NTriples; | ||
use self::rdf_xml::RdfXml; | ||
use self::turtle::Turtle; | ||
|
||
pub struct RdfParser { | ||
path: String, | ||
format: GraphFormat, | ||
} | ||
mod ntriples; | ||
mod rdf_xml; | ||
mod turtle; | ||
|
||
impl RdfParser { | ||
pub fn new(path: &str) -> Result<Self, String> { | ||
match path.split('.').last() { | ||
Some("nt") => Ok(RdfParser { | ||
path: path.to_string(), | ||
format: GraphFormat::NTriples, | ||
}), | ||
Some("ttl") => Ok(RdfParser { | ||
path: path.to_string(), | ||
format: GraphFormat::Turtle, | ||
}), | ||
Some("rdf") => Ok(RdfParser { | ||
path: path.to_string(), | ||
format: GraphFormat::RdfXml, | ||
}), | ||
_ => Err(String::from("Not supported format for loading the dump")), | ||
} | ||
} | ||
pub type RdfParserResult = Result<(Graph, Dictionary), ParserError>; | ||
pub type Graph = Vec<Vec<(u32, u32)>>; | ||
|
||
pub fn parse(&self) -> Result<(Graph, Dictionary), String> { | ||
let mut graph = Graph::new(); | ||
trait Backend<T: TriplesParser, E: From<<T>::Error>> { | ||
fn parse(path: &str) -> RdfParserResult { | ||
// We create as many HashSets as fields we will be storing; that is, one | ||
// for the subjects, another for the predicates, and one for the objects. | ||
// The idea is that we will create a Dictionary matching every Term to | ||
// an integer value; thus, we will be able to store the Triples in a | ||
// more efficient manner | ||
let mut subjects = HashSet::new(); | ||
let mut predicates = HashSet::new(); | ||
let mut objects = HashSet::new(); | ||
|
||
let reader = BufReader::new(match File::open(self.path.clone()) { | ||
if let Err(err) = Self::parser_fn(path, &mut |triple: Triple| { | ||
{ | ||
subjects.insert(triple.subject.to_string()); | ||
predicates.insert(triple.predicate.to_string()); | ||
objects.insert(triple.object.to_string()); | ||
}; | ||
Ok(()) | ||
} as Result<(), E>) | ||
{ | ||
return Err(ParserError::Dictionary(err)); | ||
} | ||
|
||
let mut graph = vec![Vec::new(); subjects.len()]; | ||
let dictionary = Dictionary::from_set_terms(subjects, predicates, objects); | ||
|
||
if let Err(err) = Self::parser_fn(path, &mut |triple: Triple| { | ||
{ | ||
let sidx = dictionary.get_subject_idx_unchecked(&triple.subject.to_string()); | ||
let pidx = dictionary.get_predicate_idx_unchecked(&triple.predicate.to_string()); | ||
let oidx = dictionary.get_object_idx_unchecked(&triple.object.to_string()); | ||
graph | ||
.get_mut(sidx) | ||
.unwrap() | ||
.push((pidx as u32, oidx as u32)) | ||
}; | ||
Ok(()) | ||
} as Result<(), E>) | ||
{ | ||
return Err(ParserError::Graph(err)); | ||
} | ||
|
||
Ok((graph, dictionary)) | ||
} | ||
|
||
fn parser_fn( | ||
path: &str, | ||
on_triple: &mut impl FnMut(Triple<'_>) -> Result<(), E>, | ||
) -> Result<(), String> { | ||
// We open a reader for the file that is requested to be read. The idea | ||
// is that we will iterate over the triples stored in a certain file | ||
let reader = BufReader::new(match File::open(path) { | ||
Ok(file) => file, | ||
Err(_) => return Err(String::from("Cannot open the file")), | ||
}); | ||
|
||
let triples = match GraphParser::from_format(self.format).read_triples(reader) { | ||
Ok(iter) => iter, | ||
Err(_) => return Err(String::from("Error parsing the graph")), | ||
}; | ||
|
||
for triple in triples.flatten() { | ||
subjects.insert(triple.subject.to_owned().to_string()); | ||
predicates.insert(triple.predicate.to_owned().to_string()); | ||
objects.insert(triple.object.to_owned().to_string()); | ||
|
||
if let Some(value) = graph.get_mut(&triple.subject.to_string()) { | ||
value.push((triple.predicate.to_string(), triple.object.to_string())); | ||
} else { | ||
graph.insert( | ||
triple.subject.to_string(), | ||
vec![(triple.predicate.to_string(), triple.object.to_string())], | ||
); | ||
// We create a parser that will be in charge of reading the file retrieving | ||
// the triples that are stored in the provided file | ||
let mut parser = Self::concrete_parser(reader); | ||
|
||
while !parser.is_end() { | ||
if parser.parse_step(on_triple).is_err() { | ||
// We skip the line if it is not a valid triple | ||
continue; | ||
} | ||
} | ||
|
||
Ok(( | ||
graph, | ||
Dictionary::from_set_terms(subjects, predicates, objects), | ||
)) | ||
Ok(()) | ||
} | ||
|
||
fn concrete_parser(reader: BufReader<File>) -> T; | ||
} | ||
|
||
pub struct RdfParser; | ||
|
||
impl RdfParser { | ||
pub fn parse(path: &str) -> RdfParserResult { | ||
match path.split('.').last() { | ||
Some("nt") => NTriples::parse(path), | ||
Some("ttl") => Turtle::parse(path), | ||
Some("rdf") => RdfXml::parse(path), | ||
Some(format) => Err(ParserError::NotSupportedFormat(format.to_string())), | ||
None => Err(ParserError::NoFormatProvided), | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
use rio_turtle::NTriplesParser; | ||
use rio_turtle::TurtleError; | ||
use std::fs::File; | ||
use std::io::BufReader; | ||
|
||
use super::Backend; | ||
|
||
type NTriplesFileParser = NTriplesParser<BufReader<File>>; | ||
|
||
pub struct NTriples; | ||
|
||
impl Backend<NTriplesFileParser, TurtleError> for NTriples { | ||
fn concrete_parser(reader: BufReader<File>) -> NTriplesFileParser { | ||
NTriplesParser::new(reader) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
use rio_xml::RdfXmlError; | ||
use rio_xml::RdfXmlParser; | ||
use std::fs::File; | ||
use std::io::BufReader; | ||
|
||
use super::Backend; | ||
|
||
type RdfXmlFileParser = RdfXmlParser<BufReader<File>>; | ||
|
||
pub struct RdfXml; | ||
|
||
impl Backend<RdfXmlFileParser, RdfXmlError> for RdfXml { | ||
fn concrete_parser(reader: BufReader<File>) -> RdfXmlFileParser { | ||
RdfXmlParser::new(reader, None) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
use rio_turtle::TurtleError; | ||
use rio_turtle::TurtleParser; | ||
use std::fs::File; | ||
use std::io::BufReader; | ||
|
||
use super::Backend; | ||
|
||
type TurtleFileParser = TurtleParser<BufReader<File>>; | ||
|
||
pub struct Turtle; | ||
|
||
impl Backend<TurtleFileParser, TurtleError> for Turtle { | ||
fn concrete_parser(reader: BufReader<File>) -> TurtleFileParser { | ||
TurtleParser::new(reader, None) | ||
} | ||
} |
Oops, something went wrong.