Skip to content

Commit

Permalink
TriGParser: parse in valid UTF-8 chunks (closes #4)
Browse files Browse the repository at this point in the history
  • Loading branch information
zozlak committed Feb 15, 2024
1 parent 42343e3 commit 4075580
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 7 deletions.
23 changes: 21 additions & 2 deletions src/quickRdfIo/TriGParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class TriGParser implements iParser, iQuadIterator {

use TmpStreamParserTrait;

private const CHUNK_SIZE = 100; // if it's too big, the regexp used by the hardf to match the prefixed URIs may not match
const CHUNK_SIZE = 8192;

private iDataFactory $dataFactory;

Expand All @@ -55,6 +55,7 @@ class TriGParser implements iParser, iQuadIterator {
private array $options;
private Parser $parser;
private StreamInterface $input;
private string $chunk;

/**
*
Expand Down Expand Up @@ -105,6 +106,7 @@ public function parseStream($input): iQuadIterator {

$this->input = $input;
$this->n = -1;
$this->chunk = '';
$this->quadsBuffer = new ArrayIterator();
$this->parser = new Parser($this->options, null, $this->prefixCallback);
return $this;
Expand Down Expand Up @@ -149,7 +151,24 @@ public function next(): void {
}
});
// Feed the parser with chunks that never end in the middle of a multi-byte
// UTF-8 character. Bytes belonging to an incomplete trailing character are
// kept in $this->chunk and prepended to the next read.
while (!$this->input->eof() && $this->quadsBuffer->count() === 0) {
    $this->chunk .= $this->input->read(self::CHUNK_SIZE);
    $len         = strlen($this->chunk);
    if ($len === 0) {
        // nothing was read (e.g. an empty read just before EOF is reported)
        continue;
    }
    if ($this->input->eof()) {
        // no more data will arrive - flush everything that is buffered,
        // otherwise the held-back bytes would never reach the parser
        $this->parser->parseChunk($this->chunk);
        $this->chunk = '';
        continue;
    }

    // Walk back over UTF-8 continuation bytes (10xxxxxx) to the byte which
    // starts the last character. A UTF-8 character is at most 4 bytes long,
    // so never look further back than that (also guards against running
    // past the beginning of the buffer on invalid input).
    $p   = $len - 1;
    $min = max(0, $len - 4);
    while ($p > $min && (ord($this->chunk[$p]) & 0xC0) === 0x80) {
        $p--;
    }

    // Derive the expected length of the last character from its lead byte.
    $lead = ord($this->chunk[$p]);
    if (($lead & 0xE0) === 0xC0) {
        $need = 2;
    } elseif (($lead & 0xF0) === 0xE0) {
        $need = 3;
    } elseif (($lead & 0xF8) === 0xF0) {
        $need = 4;
    } else {
        // single-byte character or an invalid sequence - nothing to hold back
        $need = 1;
    }

    // Hold back the last character only if some of its bytes are still missing.
    $cut = ($len - $p) < $need ? $p : $len;
    if ($cut > 0) {
        $this->parser->parseChunk(substr($this->chunk, 0, $cut));
    }
    $this->chunk = (string) substr($this->chunk, $cut);
}
if ($this->input->eof()) {
$this->parser->end();
Expand Down
22 changes: 17 additions & 5 deletions tests/TriGParserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -82,21 +82,33 @@ public function testRepeat(): void {
$t2 = iterator_to_array($parser->parse($input));
$this->assertEquals($t1, $t2);
}

/**
 * Parses the same N-Triples file with the TriGParser and the NQuadsParser
 * and checks that both produce equal datasets.
 *
 * The test is reported as skipped when the fixture file can not be opened.
 */
public function testMatchesNQuadsSerializer(): void {
    $path = __DIR__ . '/files/puzzle4d_100k.nt';
    // is_readable() first, so a missing fixture does not raise an fopen() warning
    $stream = is_readable($path) ? fopen($path, 'r') : false;
    if ($stream === false) {
        $this->markTestSkipped("Fixture $path is not available");
    }

    try {
        $trig = new TriGParser(new DF());
        $d1   = new Dataset();
        $d1->add($trig->parseStream($stream));

        // rewind so the second parser reads the very same data
        fseek($stream, 0);
        $quads = new NQuadsParser(new DF());
        $d2    = new Dataset();
        $d2->add($quads->parseStream($stream));

        $this->assertEquals(count($d1), count($d2));
        $this->assertTrue($d1->equals($d2));
    } finally {
        // NOTE(review): guarded because it is not visible here whether the parsers close the stream
        if (is_resource($stream)) {
            fclose($stream);
        }
    }
}

/**
 * Regression test for parsing input containing multi-byte UTF-8 characters.
 *
 * Parses the issue4.ttl fixture from an in-memory string and checks the
 * number of triples read.
 *
 * See https://github.com/sweetrdf/quickRdfIo/issues/4
 * @return void
 */
public function testUtfChunk(): void {
    $parser  = new TriGParser(new DF());
    $iter    = $parser->parse(file_get_contents(__DIR__ . '/files/issue4.ttl'));
    $triples = iterator_to_array($iter);
    $this->assertCount(148, $triples);
}
}

0 comments on commit 4075580

Please sign in to comment.