Skip to content

Commit

Permalink
NQuadsParser added and few fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
zozlak committed Jan 30, 2021
1 parent 94c6562 commit 9108cd0
Show file tree
Hide file tree
Showing 6 changed files with 342 additions and 34 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
/vendor
/composer.lock
/.phpunit.result.cache
/nbproject
210 changes: 210 additions & 0 deletions src/quickRdfIo/NQuadsParser.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
<?php

/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/

namespace quickRdfIo;

use Generator;
use rdfInterface\QuadIterator as iQuadIterator;
use rdfInterface\Parser as iParser;
use rdfInterface\Quad as iQuad;
use quickRdf\DataFactory as DF;

/**
 * Parses only n-quads and n-triples but does it fast (thanks to parsing in chunks
 * and extensive use of regular expressions).
 *
 * The object is both the parser and the iterator over the parsed quads:
 * `parse()`/`parseStream()` return `$this` and the actual parsing is performed
 * lazily while the iterator is being traversed.
 *
 * @author zozlak
 */
class NQuadsParser implements iParser, iQuadIterator {

    const EOL               = '[\x0D\x0A]+';
    const UCHAR             = '\\\\u[0-9A-Fa-f]{4}|\\\\U[0-9A-Fa-f]{8}';
    const COMMENT           = '\s*(?:#[^\x0D\x0A]*)?';
    const LANGTAG_STRICT    = '@([a-zA-Z]+(?:-[a-zA-Z0-9]+)*)';
    const LANGTAG           = '@([-a-zA-Z0-9]+)';
    const IRIREF_STRICT     = '<((?:[^\x{00}-\x{20}<>"{}|^`\\\\]|\\\\u[0-9A-Fa-f]{4}|\\\\U[0-9A-Fa-f]{8})*)>';
    const IRIREF            = '<([^>]+)>';
    const BLANKNODE1_STRICT = '_:';
    const BLANKNODE2_STRICT = '[0-9_:A-Za-z\x{00C0}-\x{00D6}\x{00D8}-\x{00F6}\x{00F8}-\x{02FF}\x{0370}-\x{037D}\x{037F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}]';
    const BLANKNODE3_STRICT = '[-0-9_:A-Za-z\x{00B7}\x{00C0}-\x{00D6}\x{00D8}-\x{00F6}\x{00F8}-\x{02FF}\x{0300}-\x{037D}\x{037F}-\x{1FFF}\x{200C}-\x{200D}\x{203F}-\x{2040}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}.]';
    const BLANKNODE4_STRICT = '[-0-9_:A-Za-z\x{00B7}\x{00C0}-\x{00D6}\x{00D8}-\x{00F6}\x{00F8}-\x{02FF}\x{0300}-\x{037D}\x{037F}-\x{1FFF}\x{200C}-\x{200D}\x{203F}-\x{2040}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}]';
    const BLANKNODE         = '(_:[^ ]+)';
    const LITERAL_STRICT    = '"((?:[^\x{22}\x{5C}\x{0A}\x{0D}]|\\\\[tbnrf"\'\\\\]|\\\\u[0-9A-Fa-f]{4}|\\\\U[0-9A-Fa-f]{8})*)"';
    // Simplified literal: anything but a quote or a backslash, or a backslash
    // followed by any character (so that an escaped quote doesn't end the literal).
    const LITERAL           = '"((?:[^"\\\\]|\\\\.)*)"';

    /**
     * Matches a single ECHAR or UCHAR escape sequence inside a literal.
     * The first capture group holds the sequence without the leading backslash.
     */
    private const ESCAPE_REGEXP = '%\\\\([tbnrf"\'\\\\]|u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})%';

    /**
     * Characters denoted by the single-character (ECHAR) escape sequences.
     */
    private const ECHARS = [
        't'  => "\t",
        'b'  => "\x08",
        'n'  => "\n",
        'r'  => "\r",
        'f'  => "\f",
        '"'  => '"',
        "'"  => "'",
        '\\' => '\\',
    ];

    use TmpStreamTrait;

    /**
     * Stream being parsed (set by parseStream()).
     *
     * @var resource
     */
    private $input;

    /**
     * Number of bytes read from the input in one go.
     */
    private int $chunkSize;

    /**
     * Regular expression matching a single line of input (either an empty/comment
     * line or a triple/quad). Capture groups: 1 - subject IRI, 2 - subject blank node,
     * 3 - predicate IRI, 4 - object IRI, 5 - object blank node, 6 - literal value,
     * 7 - literal datatype IRI, 8 - literal language tag and (only in the n-quads mode)
     * 9 - graph IRI, 10 - graph blank node.
     */
    private string $regexp;

    /**
     * Generator providing quads for the Iterator interface methods.
     * It's (re)created by rewind().
     *
     * @var Generator<iQuad>
     */
    private Generator $quads;

    /**
     * Creates the parser.
     *
     * Parser can work in four different modes according to `$strict` and `$ntriples`
     * parameter values.
     *
     * When `$strict = true` regular expressions following strictly n-triples/n-quads
     * formal definition are used (see https://www.w3.org/TR/n-quads/#sec-grammar and
     * https://www.w3.org/TR/n-triples/#n-triples-grammar). When `$strict = false`
     * simplified regular expressions are used. Simplified variants provide a little
     * faster parsing and are (much) easier to debug. Data which are valid according
     * to the strict syntax are meant to be properly parsed in the simplified mode,
     * therefore unless you need to check the input is 100% correct RDF, you may
     * just stick to the simplified mode.
     *
     * When `$ntriples = true` a simplified regular expression is used which doesn't
     * match the optional graph IRI. It provides a little faster parsing but can deal
     * only with n-triples input.
     *
     * @param bool $strict should strict RDF syntax be enforced?
     * @param bool $ntriples should parsing be done in n-triples only mode?
     * @param int $chunkSize parsing chunk size. Default value should be just fine.
     * @throws RdfIoException when the chunk size isn't a positive number
     */
    public function __construct(bool $strict = false, bool $ntriples = false,
                                int $chunkSize = 8192) {
        if ($chunkSize < 1) {
            // fread() can't read a non-positive number of bytes
            throw new RdfIoException("Chunk size has to be a positive integer");
        }
        $eol     = self::EOL;
        $comment = self::COMMENT;
        if ($strict) {
            $iri     = self::IRIREF_STRICT;
            $blank   = '(' . self::BLANKNODE1_STRICT . self::BLANKNODE2_STRICT . '(?:' . self::BLANKNODE3_STRICT . '*' . self::BLANKNODE4_STRICT . ')?)';
            $lang    = self::LANGTAG_STRICT;
            $literal = self::LITERAL_STRICT;
            $flags   = 'u';
        } else {
            $iri     = self::IRIREF;
            $blank   = self::BLANKNODE;
            $lang    = self::LANGTAG;
            $literal = self::LITERAL;
            $flags   = '';
        }
        $graph           = $ntriples ? '' : "(?:\\s*(?:$iri|$blank))?";
        // The datatype marker has to be written as \^\^ - an unescaped ^ is
        // a start-of-subject assertion and would never match after a literal.
        $this->regexp    = "%\\G$comment$eol|\\G\\s*(?:$iri|$blank)\\s*$iri\\s*(?:$iri|$blank|$literal(?:\\^\\^$iri|$lang)?)$graph\\s*\\.$comment$eol%$flags";
        $this->chunkSize = $chunkSize;
    }

    /**
     * Closes the temporary stream created by parse() (if there is one).
     */
    public function __destruct() {
        $this->closeTmpStream();
    }

    /**
     * Sets the stream to be parsed. No data are read until the returned
     * iterator is rewound and traversed.
     *
     * @param resource $input
     * @return iQuadIterator the parser itself
     * @throws RdfIoException when `$input` is not a resource
     */
    public function parseStream($input): iQuadIterator {
        if (!is_resource($input)) {
            throw new RdfIoException("Input has to be a resource");
        }

        $this->input = $input;
        return $this;
    }

    /**
     * Returns the quad the iterator currently points to.
     */
    public function current(): iQuad {
        return $this->quads->current();
    }

    /**
     * Returns the key of the current quad as reported by the underlying generator.
     */
    public function key() {
        return $this->quads->key();
    }

    /**
     * Moves the iterator to the next quad.
     */
    public function next(): void {
        $this->quads->next();
    }

    /**
     * Moves the input stream back to its beginning (only if it isn't already
     * there) and starts parsing anew.
     *
     * @throws RdfIoException when the input stream can't be rewound
     */
    public function rewind(): void {
        if (ftell($this->input) !== 0) {
            $ret = rewind($this->input);
            if ($ret !== true) {
                throw new RdfIoException("Can't seek in the input stream");
            }
        }
        $this->quads = $this->quadGenerator();
    }

    /**
     * Tells whether the iterator points to a quad.
     */
    public function valid(): bool {
        return $this->quads->valid();
    }

    /**
     * Reads the input chunk by chunk and yields the parsed quads.
     *
     * Only complete lines (the part of the buffer up to the last end-of-line
     * character) are passed to the regular expression. The unfinished remainder
     * is kept for the next chunk so that the regular expression never sees a line
     * (or a multibyte character) cut in half by a chunk boundary.
     *
     * @return Generator<iQuad>
     * @throws RdfIoException
     */
    private function quadGenerator(): Generator {
        $buffer = '';
        $line   = 1;
        while (!feof($this->input)) {
            $chunk = fread($this->input, $this->chunkSize);
            if ($chunk === false) {
                throw new RdfIoException("Can't read from the input stream");
            }
            $buffer .= $chunk;
            $lf     = strrpos($buffer, "\n");
            $cr     = strrpos($buffer, "\r");
            if ($lf === false && $cr === false) {
                // no complete line in the buffer yet
                continue;
            }
            $end = max((int) $lf, (int) $cr) + 1;
            // re-yield instead of `yield from` to keep the keys sequential
            foreach ($this->parseLines(substr($buffer, 0, $end), $line) as $quad) {
                yield $quad;
            }
            $buffer = substr($buffer, $end);
        }
        if ($buffer !== '') {
            // The last line lacks the end-of-line required by the regular expression.
            foreach ($this->parseLines($buffer . "\n", $line) as $quad) {
                yield $quad;
            }
        }
    }

    /**
     * Parses a string consisting of complete lines only and yields quads found in it.
     *
     * @param string $data data to be parsed; has to end with an end-of-line character
     * @param int $line counter used in error messages; increased by one for every
     *   successful match of the line regular expression
     * @return Generator<iQuad>
     * @throws RdfIoException when a line can't be parsed
     */
    private function parseLines(string $data, int &$line): Generator {
        $matches = null;
        $pos     = 0;
        $length  = strlen($data);
        // every successful match consumes at least the end-of-line character
        // so the loop always makes progress
        while ($pos < $length) {
            $ret = preg_match($this->regexp, $data, $matches, PREG_UNMATCHED_AS_NULL, $pos);
            if (!$ret) {
                $rest = substr($data, $pos);
                throw new RdfIoException("Can't parse line $line: " . substr($rest, 0, strcspn($rest, "\r\n")));
            }
            $pos += strlen($matches[0]);
            // the predicate group is set only when a triple/quad (and not an
            // empty or comment line) has been matched
            if (($matches[3] ?? null) !== null) {
                yield $this->makeQuad($matches);
            }
            $line++;
        }
    }

    /**
     * Converts regex matches array into a Quad.
     *
     * @param array<?string> $matches
     * @return iQuad
     */
    private function makeQuad(array &$matches): iQuad {
        $sbj  = $matches[1] !== null ? DF::namedNode($matches[1]) : DF::blankNode($matches[2]);
        $pred = DF::namedNode($matches[3] ?? '');
        if ($matches[4] !== null) {
            $obj = DF::namedNode($matches[4]);
        } elseif ($matches[5] !== null) {
            $obj = DF::blankNode($matches[5]);
        } else {
            $obj = DF::literal($this->unescape($matches[6] ?? ''), $matches[8], $matches[7]);
        }
        // Graph groups don't exist in the n-triples mode and are null when a statement
        // has no graph - in both cases the quad must be created without a graph.
        $graph = null;
        if (($matches[9] ?? null) !== null) {
            $graph = DF::namedNode($matches[9]);
        } elseif (($matches[10] ?? null) !== null) {
            $graph = DF::blankNode($matches[10]);
        }
        return DF::quad($sbj, $pred, $obj, $graph);
    }

    /**
     * Replaces ECHAR (`\t`, `\n`, `\"`, `\\`, etc.) and UCHAR (`\uXXXX`,
     * `\UXXXXXXXX`) escape sequences of a literal with characters they denote.
     *
     * All sequences are decoded in a single left-to-right pass so an escaped
     * backslash followed by e.g. `u0041` is not mistaken for a unicode escape.
     * Sequences which can't be decoded are left untouched.
     *
     * @param string $value
     * @return string
     */
    private function unescape(string $value): string {
        if (strpos($value, '\\') === false) {
            return $value;
        }
        $unescaped = preg_replace_callback(
            self::ESCAPE_REGEXP,
            static function (array $m): string {
                $sequence = $m[1];
                if (strlen($sequence) === 1) {
                    return self::ECHARS[$sequence];
                }
                $char = mb_chr((int) hexdec(substr($sequence, 1)), 'UTF-8');
                return $char !== false ? $char : $m[0];
            },
            $value
        );
        return $unescaped ?? $value;
    }
}
62 changes: 62 additions & 0 deletions src/quickRdfIo/TmpStreamTrait.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
<?php

/*
* The MIT License
*
* Copyright 2021 zozlak.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/

namespace quickRdfIo;

use rdfInterface\QuadIterator as iQuadIterator;

/**
 * Provides the string-parsing entry point for stream-based parsers by copying
 * the string into a temporary in-memory stream.
 *
 * The trait calls `$this->parseStream()`, which it does not define itself -
 * it is expected to be provided by the class using the trait.
 *
 * @author zozlak
 */
trait TmpStreamTrait {

    /**
     * Temporary stream created by the last parse() call (null when there is none).
     *
     * @var resource|null
     */
    private $tmpStream;

    /**
     * Parses RDF data given as a string.
     *
     * The stream created by a previous call (if any) is closed first. Then the
     * input is written into a fresh `php://memory` stream which is rewound and
     * handed over to parseStream().
     *
     * @param string $input
     * @return iQuadIterator
     * @throws RdfIoException when the in-memory stream can't be opened
     */
    public function parse(string $input): iQuadIterator {
        $this->closeTmpStream();

        $stream = fopen('php://memory', 'r+');
        if ($stream === false) {
            throw new RdfIoException('Failed to convert input to stream');
        }

        fwrite($stream, $input);
        rewind($stream);
        $this->tmpStream = $stream;

        return $this->parseStream($stream);
    }

    /**
     * Closes the temporary stream and forgets it. Does nothing when there is
     * no open temporary stream.
     */
    private function closeTmpStream(): void {
        if (!is_resource($this->tmpStream)) {
            return;
        }
        fclose($this->tmpStream);
        $this->tmpStream = null;
    }
}
35 changes: 7 additions & 28 deletions src/quickRdfIo/TriGParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@
*/
class TriGParser implements iParser, iQuadIterator {

use TmpStreamTrait;

private const CHUNK_SIZE = 8192;

/**
Expand Down Expand Up @@ -68,12 +70,6 @@ class TriGParser implements iParser, iQuadIterator {
private ArrayIterator $quadsBuffer;
private int $n;

/**
*
* @var resource|null
*/
private $tmpStream;

/**
*
* @var callable|null
Expand All @@ -96,18 +92,6 @@ public function __destruct() {
$this->closeTmpStream();
}

public function parse(string $input): iQuadIterator {
$this->closeTmpStream();
$tmp = fopen('php://memory', 'r+');
if ($tmp === false) {
throw new RdfIoException('Failed to convert input to stream');
}
$this->tmpStream = $tmp;
fwrite($this->tmpStream, $input);
rewind($this->tmpStream);
return $this->parseStream($this->tmpStream);
}

public function parseStream($input): iQuadIterator {
if (!is_resource($input)) {
throw new RdfIoException("Input has to be a resource");
Expand Down Expand Up @@ -168,21 +152,16 @@ public function next(): void {
}

public function rewind(): void {
$ret = rewind($this->input);
if ($ret !== true) {
throw new RdfIoException("Can't seek in the input stream");
if (ftell($this->input) !== 0) {
$ret = rewind($this->input);
if ($ret !== true) {
throw new RdfIoException("Can't seek in the input stream");
}
}
$this->next();
}

public function valid(): bool {
return $this->quadsBuffer->valid();
}

private function closeTmpStream(): void {
if (is_resource($this->tmpStream)) {
fclose($this->tmpStream);
$this->tmpStream = null;
}
}
}
Loading

0 comments on commit 9108cd0

Please sign in to comment.