diff --git a/.gitignore b/.gitignore
index 71d9090..252ad40 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@
 /vendor
 /composer.lock
 /.phpunit.result.cache
+/nbproject
diff --git a/src/quickRdfIo/NQuadsParser.php b/src/quickRdfIo/NQuadsParser.php
new file mode 100644
index 0000000..339dfb9
--- /dev/null
+++ b/src/quickRdfIo/NQuadsParser.php
@@ -0,0 +1,247 @@
+"{}|^`\\\\]|\\\\u[0-9A-Fa-f]{4}|\\\\U[0-9A-Fa-f]{8})*)>';
+    const IRIREF = '<([^>]+)>';
+    const BLANKNODE1_STRICT = '_:';
+    const BLANKNODE2_STRICT = '[0-9_:A-Za-z\x{00C0}-\x{00D6}\x{00D8}-\x{00F6}\x{00F8}-\x{02FF}\x{0370}-\x{037D}\x{037F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}]';
+    const BLANKNODE3_STRICT = '[-0-9_:A-Za-z\x{00B7}\x{00C0}-\x{00D6}\x{00D8}-\x{00F6}\x{00F8}-\x{02FF}\x{0300}-\x{037D}\x{037F}-\x{1FFF}\x{200C}-\x{200D}\x{203F}-\x{2040}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}.]';
+    const BLANKNODE4_STRICT = '[-0-9_:A-Za-z\x{00B7}\x{00C0}-\x{00D6}\x{00D8}-\x{00F6}\x{00F8}-\x{02FF}\x{0300}-\x{037D}\x{037F}-\x{1FFF}\x{200C}-\x{200D}\x{203F}-\x{2040}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}]';
+    const BLANKNODE = '(_:[^ ]+)';
+    const LITERAL_STRICT = '"((?:[^\x{22}\x{5C}\x{0A}\x{0D}]|\\\\[tbnrf"\'\\\\]|\\\\u[0-9A-Fa-f]{4}|\\\\U[0-9A-Fa-f]{8})*)"';
+    const LITERAL = '"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)"';
+    use TmpStreamTrait;
+
+    /**
+     *
+     * @var resource
+     */
+    private $input;
+    private int $chunkSize;
+    private string $regexp;
+
+    /**
+     *
+     * @var Generator
+     */
+    private Generator $quads;
+
+    /**
+     * Creates the parser.
+     *
+     * Parser can work in four different modes according to `$strict` and `$ntriples`
+     * parameter values.
+     *
+     * When `$strict = true` regular expressions following strictly n-triples/n-quads
+     * formal definition are used (see https://www.w3.org/TR/n-quads/#sec-grammar and
+     * https://www.w3.org/TR/n-triples/#n-triples-grammar). When `$strict = false`
+     * simplified regular expressions are used. Simplified variants provide a little
+     * faster parsing and are (much) easier to debug. All data which are valid according
+     * to the strict syntax can be properly parsed in the simplified mode, therefore
+     * unless you need to check the input is 100% correct RDF, you may just stick to
+     * simplified mode.
+     *
+     * When `$ntriples = true` a simplified regular expression is used which doesn't
+     * match the optional graph IRI. It provides a little faster parsing but can deal
+     * only with n-triples input.
+     *
+     * @param bool $strict should strict RDF syntax be enforced?
+     * @param bool $ntriples should parsing be done in n-triples only mode?
+     * @param int $chunkSize parsing chunk size. Default value should be just fine.
+     */
+    public function __construct(bool $strict = false, bool $ntriples = false,
+                                int $chunkSize = 8192) {
+        $eol = self::EOL;
+        $comment = self::COMMENT;
+        if ($strict) {
+            $iri = self::IRIREF_STRICT;
+            $blank = '(' . self::BLANKNODE1_STRICT . self::BLANKNODE2_STRICT . '(?:' . self::BLANKNODE3_STRICT . '*' . self::BLANKNODE4_STRICT . ')?)';
+            $lang = self::LANGTAG_STRICT;
+            $literal = self::LITERAL_STRICT;
+            $flags = 'u';
+        } else {
+            $iri = self::IRIREF;
+            $blank = self::BLANKNODE;
+            $lang = self::LANGTAG;
+            $literal = self::LITERAL;
+            $flags = '';
+        }
+        $graph = $ntriples ? '' : "(?:\\s*(?:$iri|$blank))?";
+        // The datatype separator has to be escaped - a bare `^` is a start-of-subject
+        // anchor in PCRE, which would prevent typed literals from ever matching.
+        $this->regexp = "%\\G$comment$eol|\\G\\s*(?:$iri|$blank)\\s*$iri\\s*(?:$iri|$blank|$literal(?:\\^\\^$iri|$lang)?)$graph\\s*\\.$comment$eol%$flags";
+        $this->chunkSize = $chunkSize;
+    }
+
+    public function __destruct() {
+        $this->closeTmpStream();
+    }
+
+    public function parseStream($input): iQuadIterator {
+        if (!is_resource($input)) {
+            throw new RdfIoException("Input has to be a resource");
+        }
+
+        $this->input = $input;
+        return $this;
+    }
+
+    public function current(): iQuad {
+        return $this->quads->current();
+    }
+
+    public function key() {
+        return $this->quads->key();
+    }
+
+    public function next(): void {
+        $this->quads->next();
+    }
+
+    public function rewind(): void {
+        // skip the seek when the stream is already at its beginning
+        if (ftell($this->input) !== 0) {
+            $ret = rewind($this->input);
+            if ($ret !== true) {
+                throw new RdfIoException("Can't seek in the input stream");
+            }
+        }
+        $this->quads = $this->quadGenerator();
+    }
+
+    public function valid(): bool {
+        return $this->quads->valid();
+    }
+
+    /**
+     * Lazily parses the input stream chunk by chunk, yielding one quad per
+     * successfully matched statement.
+     *
+     * @return Generator
+     * @throws RdfIoException
+     */
+    private function quadGenerator(): Generator {
+        $matches = null;
+        $buffer = '';
+        $line = 1;
+        while (!feof($this->input)) {
+            $buffer .= fread($this->input, $this->chunkSize);
+            $bufferPos = 0;
+            do {
+                $ret = preg_match($this->regexp, $buffer, $matches, PREG_UNMATCHED_AS_NULL, $bufferPos);
+                if ($ret) {
+                    $bufferPos += strlen($matches[0]);
+                    if ($matches[3] !== null) {
+                        yield $this->makeQuad($matches);
+                    }
+                    $line++;
+                }
+            } while ($ret);
+            // cast guards against substr() returning false for an exhausted buffer on PHP 7
+            $buffer = (string) substr($buffer, $bufferPos);
+            // Once per chunk check for parsing errors. Otherwise a parsing error would cause
+            // accumulation of the whole input in the buffer
+            $p1 = strpos($buffer, "\n");
+            $p2 = strpos($buffer, "\r");
+            if ($p1 !== false || $p2 !== false) {
+                $p = min($p1 !== false ? $p1 : PHP_INT_MAX, $p2 !== false ? $p2 : PHP_INT_MAX);
+                throw new RdfIoException("Can't parse line $line: " . substr($buffer, 0, $p));
+            }
+        }
+        // The buffer has already been trimmed to its unparsed tail, so matching has to
+        // start at offset 0 (the in-loop offset refers to the untrimmed buffer).
+        $ret = preg_match($this->regexp, $buffer, $matches, PREG_UNMATCHED_AS_NULL, 0);
+        if ($ret && $matches[3] !== null) {
+            yield $this->makeQuad($matches);
+        }
+    }
+
+    /**
+     * Converts regex matches array into a Quad.
+     *
+     * Capture groups: 1/2 - subject IRI/blank node, 3 - predicate IRI,
+     * 4/5 - object IRI/blank node, 6 - literal value, 7 - literal datatype IRI,
+     * 8 - literal language tag, 9/10 - graph IRI/blank node (not captured in
+     * the n-triples mode).
+     *
+     * @param array $matches
+     * @return iQuad
+     */
+    private function makeQuad(array &$matches): iQuad {
+        $sbj = $matches[1] !== null ? DF::namedNode($matches[1]) : DF::blankNode($matches[2]);
+        $pred = DF::namedNode($matches[3] ?? '');
+        if ($matches[4] !== null) {
+            $obj = DF::namedNode($matches[4]);
+        } elseif ($matches[5] !== null) {
+            $obj = DF::blankNode($matches[5]);
+        } else {
+            $value = $this->unescapeLiteral($matches[6] ?? '');
+            $obj = DF::literal($value, $matches[8] ?? null, $matches[7] ?? null);
+        }
+        // With PREG_UNMATCHED_AS_NULL the graph groups may exist but be null when the
+        // statement has no graph label - such a quad belongs to the default graph.
+        $graph = null;
+        if (($matches[9] ?? null) !== null) {
+            $graph = DF::namedNode($matches[9]);
+        } elseif (($matches[10] ?? null) !== null) {
+            $graph = DF::blankNode($matches[10]);
+        }
+        return DF::quad($sbj, $pred, $obj, $graph);
+    }
+
+    /**
+     * Decodes N-Triples/N-Quads string escapes (ECHAR and UCHAR) in a literal value.
+     *
+     * All escapes are resolved in a single left-to-right pass so that an escaped
+     * backslash followed by `u`/`U` is not mistaken for a unicode escape.
+     *
+     * @param string $value raw literal value as captured by the parsing regexp
+     * @return string
+     */
+    private function unescapeLiteral(string $value): string {
+        if (strpos($value, '\\') === false) {
+            return $value;
+        }
+        $echars = [
+            't' => "\t", 'b' => "\x08", 'n' => "\n", 'r' => "\r", 'f' => "\f",
+            '"' => '"', "'" => "'", '\\' => '\\',
+        ];
+        $unescaped = preg_replace_callback(
+            '%\\\\(?:u([0-9A-Fa-f]{4})|U([0-9A-Fa-f]{8})|([tbnrf"\'\\\\]))%',
+            static function (array $m) use ($echars): string {
+                if (isset($m[3])) {
+                    return $echars[$m[3]];
+                }
+                return (string) mb_chr((int) hexdec($m[2] ?? $m[1]), 'UTF-8');
+            },
+            $value
+        );
+        return $unescaped ?? $value;
+    }
+}
diff --git a/src/quickRdfIo/TmpStreamTrait.php b/src/quickRdfIo/TmpStreamTrait.php
new file mode 100644
index 0000000..c3cb41d
--- /dev/null
+++ b/src/quickRdfIo/TmpStreamTrait.php
@@ -0,0 +1,62 @@
+closeTmpStream();
+        $tmp = fopen('php://memory', 'r+');
+        if ($tmp === false) {
+            throw new RdfIoException('Failed to convert input to stream');
+        }
+        $this->tmpStream = $tmp;
+        fwrite($this->tmpStream, $input);
+        rewind($this->tmpStream);
+        return $this->parseStream($this->tmpStream);
+    }
+
+    private function closeTmpStream(): void {
+        if (is_resource($this->tmpStream)) {
+            fclose($this->tmpStream);
+            $this->tmpStream = null;
+        }
+    }
+}
diff --git a/src/quickRdfIo/TriGParser.php b/src/quickRdfIo/TriGParser.php
index 1d627d4..35e81cc 100644
--- a/src/quickRdfIo/TriGParser.php
+++ b/src/quickRdfIo/TriGParser.php
@@ -41,6 +41,8 @@
  */
 class TriGParser implements iParser, iQuadIterator {
 
+    use TmpStreamTrait;
+
     private const CHUNK_SIZE = 8192;
 
     /**
@@ -68,12 +70,6 @@ class TriGParser implements iParser, iQuadIterator {
     private ArrayIterator $quadsBuffer;
     private int $n;
 
-    /**
-     *
-     * @var resource|null
-     */
-    private $tmpStream;
-
     /**
      *
      * @var callable|null
@@ -96,18 +92,6 @@ public function __destruct() {
         $this->closeTmpStream();
     }
 
-    public function parse(string $input): iQuadIterator {
-        $this->closeTmpStream();
-        $tmp = fopen('php://memory', 'r+');
-        if ($tmp === false) {
-            throw new RdfIoException('Failed to convert input to stream');
-        }
-        $this->tmpStream = $tmp;
-        fwrite($this->tmpStream, $input);
-        rewind($this->tmpStream);
-        return $this->parseStream($this->tmpStream);
-    }
-
     public function parseStream($input): iQuadIterator {
         if (!is_resource($input)) {
             throw new RdfIoException("Input has to be a resource");
@@ -168,9 +152,11 @@ public function next(): void {
     }
 
     public function rewind(): void {
-        $ret = rewind($this->input);
-        if ($ret !== true) {
-            throw new RdfIoException("Can't seek in the input stream");
+        if (ftell($this->input) !== 0) {
+            $ret = rewind($this->input);
+            if ($ret !== true) {
+                throw new RdfIoException("Can't seek in the input stream");
+            }
         }
         $this->next();
     }
@@ -178,11 +164,4 @@ public function rewind(): void {
     public function valid(): bool {
         return $this->quadsBuffer->valid();
     }
-
-    private function closeTmpStream(): void {
-        if (is_resource($this->tmpStream)) {
-            fclose($this->tmpStream);
-            $this->tmpStream = null;
-        }
-    }
 }
diff --git a/tests/NQuadsParserTest.php b/tests/NQuadsParserTest.php
new file mode 100644
index 0000000..ef4dbd6
--- /dev/null
+++ b/tests/NQuadsParserTest.php
@@ -0,0 +1,58 @@
+parseStream($stream) as $i) {
+                $n++;
+                if ($N < 0 && $tmpl->equals($i)) {
+                    $N = (int) (string) $i->getObject()->getValue();
+                }
+            }
+            fclose($stream);
+        }
+        $this->assertEquals($N, $n);
+    }
+}
diff --git a/tests/TriGParserTest.php b/tests/TriGParserTest.php
index 7534f96..18ae5d2 100644
--- a/tests/TriGParserTest.php
+++ b/tests/TriGParserTest.php
@@ -33,19 +33,17 @@
  *
  * @author zozlak
  */
-class TriGParserTest extends \PHPUnit\Framework\TestCase
-{
+class TriGParserTest extends \PHPUnit\Framework\TestCase {
 
-    public function testBig(): void
-    {
+    public function testBig(): void {
         $parser = new TriGParser();
         $n = 0;
         $N = -1;
         $stream = fopen(__DIR__ . '/puzzle4d_100k.ntriples', 'r');
         if ($stream) {
             $tmpl = DF::quadTemplate(
-                DF::namedNode('https://technical#subject'),
-                DF::namedNode('https://technical#tripleCount')
+                    DF::namedNode('https://technical#subject'),
+                    DF::namedNode('https://technical#tripleCount')
             );
             foreach ($parser->parseStream($stream) as $i) {
                 $n++;