diff --git a/src/quickRdfIo/JsonLdParser.php b/src/quickRdfIo/JsonLdParser.php
index 4947ee6..5a9a69e 100644
--- a/src/quickRdfIo/JsonLdParser.php
+++ b/src/quickRdfIo/JsonLdParser.php
@@ -114,6 +114,10 @@ public function next(): void {
     }
 
     public function parse(string $input): iQuadIterator {
+        // drop a leading UTF-8 byte order mark (EF BB BF) before the input is handed to JsonLD::toRdf()
+        if (substr($input, 0, 3) === "\xEF\xBB\xBF") {
+            $input = substr($input, 3);
+        }
         $this->quads = JsonLD::toRdf($input, ['base' => $this->baseUri]);
         return $this;
     }
diff --git a/src/quickRdfIo/NQuadsParser.php b/src/quickRdfIo/NQuadsParser.php
index a91db6d..0b61316 100644
--- a/src/quickRdfIo/NQuadsParser.php
+++ b/src/quickRdfIo/NQuadsParser.php
@@ -43,6 +43,9 @@
  */
 class NQuadsParser implements iParser, iQuadIterator {
 
+    use TmpStreamParserTrait;
+    use StreamSkipBomTrait;
+
     const MODE_TRIPLES = 1;
     const MODE_QUADS = 2;
     const MODE_TRIPLES_STAR = 3;
@@ -66,7 +69,6 @@ class NQuadsParser implements iParser, iQuadIterator {
     const STAR_START = '%\\G\s*<<%';
     const STAR_END = '%\\G\s*>>%';
     const READ_BUF_SIZE = 8096;
-    use TmpStreamParserTrait;
 
     /**
      * See https://www.w3.org/TR/n-quads/#grammar-production-ECHAR
@@ -233,6 +235,7 @@ public function rewind(): void {
         if ($this->input->tell() !== 0) {
             $this->input->rewind();
         }
+        $this->skipBom($this->input);
         if ($this->mode === self::MODE_TRIPLES || $this->mode === self::MODE_QUADS) {
             $this->quads = $this->quadGenerator();
         } else {
diff --git a/src/quickRdfIo/TriGParser.php b/src/quickRdfIo/TriGParser.php
index 5b3b637..280aeeb 100644
--- a/src/quickRdfIo/TriGParser.php
+++ b/src/quickRdfIo/TriGParser.php
@@ -43,6 +43,7 @@
 class TriGParser implements iParser, iQuadIterator {
 
     use TmpStreamParserTrait;
+    use StreamSkipBomTrait;
 
     const CHUNK_SIZE = 8192;
 
@@ -181,6 +182,7 @@ public function rewind(): void {
         if ($this->input->tell() !== 0) {
             $this->input->rewind();
         }
+        $this->skipBom($this->input);
         $this->next();
     }
 
diff --git a/tests/JsonLdTest.php b/tests/JsonLdTest.php
index 52b5fcf..e66af7a 100644
--- a/tests/JsonLdTest.php
+++ b/tests/JsonLdTest.php
@@ -26,7 +26,7 @@
 
 namespace quickRdfIo;
 
-use quickRdf\DataFactory;
+use quickRdf\DataFactory as DF;
 use quickRdf\Dataset;
 
 /**
@@ -42,7 +42,7 @@ class JsonLdTest extends \PHPUnit\Framework\TestCase {
     private JsonLdSerializer $serializer;
 
     public function setUp(): void {
-        $this->df = new DataFactory();
+        $this->df = new DF();
         $this->refParser = new NQuadsParser($this->df, false, NQuadsParser::MODE_QUADS);
         $this->parser = new JsonLdParser($this->df);
         $this->serializer = new JsonLdSerializer(null);
@@ -71,4 +71,23 @@ public function testBig(): void {
         $this->assertEquals($ref->count(), $dataset->count());
         $this->assertTrue($ref->equals($dataset));
     }
+
+    /**
+     * A leading UTF-8 BOM must be skipped by the JSON-LD parser, see
+     * https://github.com/sweetrdf/quickRdfIo/issues/10
+     */
+    public function testBom(): void {
+        $ref = new Dataset();
+        $quad = DF::quad(DF::namedNode('http://foo'), DF::namedNode('http://bar'), DF::namedNode('http://baz'));
+        $ref->add($quad);
+        $output = tmpfile();
+        fwrite($output, "\xEF\xBB\xBF");
+        $this->serializer->serializeStream($output, $ref);
+
+        fseek($output, 0);
+        $dataset = new Dataset();
+        $dataset->add($this->parser->parseStream($output));
+        $this->assertCount(1, $dataset);
+        $this->assertTrue($quad->equals($dataset[0]));
+    }
 }
diff --git a/tests/NQuadsParserTest.php b/tests/NQuadsParserTest.php
index 98adb0c..49c50d4 100644
--- a/tests/NQuadsParserTest.php
+++ b/tests/NQuadsParserTest.php
@@ -264,4 +264,33 @@ public function testIssue7(): void {
             $this->assertCount(2, $dataset);
         }
     }
+
+    /**
+     * Inputs with a non-UTF-8 BOM must be rejected and a leading UTF-8 BOM must be skipped, see
+     * https://github.com/sweetrdf/quickRdfIo/issues/10
+     */
+    public function testBom(): void {
+        $df = new DF();
+        $parser = new NQuadsParser($df);
+        $inputs = [
+            'issue10_utf16be.nq' => "UTF-16 BE",
+            'issue10_utf32le.nq' => "UTF-32 LE",
+            'issue10_utf7.nq' => "UTF-7",
+        ];
+        foreach ($inputs as $file => $enc) {
+            try {
+                // skipBom() is invoked from rewind(), so consume the iterator to be sure the check has run
+                iterator_to_array($parser->parseStream(fopen(__DIR__ . '/files/' . $file, 'r')), false);
+                $this->fail("Expected RdfIoException for $enc input");
+            } catch (RdfIoException $ex) {
+                $this->assertEquals("Input stream has wrong encoding $enc", $ex->getMessage());
+            }
+        }
+
+        $dataset = new \quickRdf\Dataset();
+        $dataset->add($parser->parseStream(fopen(__DIR__ . '/files/issue10_utf8.nq', 'r')));
+        $this->assertCount(1, $dataset);
+        $q = $df->quad(DF::namedNode('http://foo'), DF::namedNode('http://bar'), DF::namedNode('http://baz'));
+        $this->assertTrue($q->equals($dataset[0]));
+    }
 }
diff --git a/tests/TriGParserTest.php b/tests/TriGParserTest.php
index 2e4ade8..eccd7d2 100644
--- a/tests/TriGParserTest.php
+++ b/tests/TriGParserTest.php
@@ -110,4 +110,33 @@ public function testUtfChunk(): void {
         $triples = iterator_to_array($iter);
         $this->assertCount(148, $triples);
     }
+
+    /**
+     * Inputs with a non-UTF-8 BOM must be rejected and a leading UTF-8 BOM must be skipped, see
+     * https://github.com/sweetrdf/quickRdfIo/issues/10
+     */
+    public function testBom(): void {
+        $df = new DF();
+        $parser = new TriGParser($df);
+        $inputs = [
+            'issue10_utf16be.nq' => "UTF-16 BE",
+            'issue10_utf32le.nq' => "UTF-32 LE",
+            'issue10_utf7.nq' => "UTF-7",
+        ];
+        foreach ($inputs as $file => $enc) {
+            try {
+                // skipBom() is invoked from rewind(), so consume the iterator to be sure the check has run
+                iterator_to_array($parser->parseStream(fopen(__DIR__ . '/files/' . $file, 'r')), false);
+                $this->fail("Expected RdfIoException for $enc input");
+            } catch (RdfIoException $ex) {
+                $this->assertEquals("Input stream has wrong encoding $enc", $ex->getMessage());
+            }
+        }
+
+        $dataset = new \quickRdf\Dataset();
+        $dataset->add($parser->parseStream(fopen(__DIR__ . '/files/issue10_utf8.nq', 'r')));
+        $this->assertCount(1, $dataset);
+        $q = $df->quad(DF::namedNode('http://foo'), DF::namedNode('http://bar'), DF::namedNode('http://baz'));
+        $this->assertTrue($q->equals($dataset[0]));
+    }
 }