diff --git a/modules/entity_to_text_tika/src/Commands/OcrWarmupCommand.php b/modules/entity_to_text_tika/src/Commands/OcrWarmupCommand.php index adfff56..c2be24c 100644 --- a/modules/entity_to_text_tika/src/Commands/OcrWarmupCommand.php +++ b/modules/entity_to_text_tika/src/Commands/OcrWarmupCommand.php @@ -82,6 +82,11 @@ public function __construct(Connection $connection, EntityTypeManagerInterface $ * The maximum file size in bytes a document can be to be processed. * This is useful to avoid processing large files. * [defaults: NULL]. + * @option save-empty-ocr + * Save an empty OCR file when the file is not processable. + * Some files may not be processable by Tika (too large, corrupted, ...) + * enabling this option may avoid processing the same file over and over. + * [defaults: FALSE]. * @option stop-on-failure * Stop processing on first failed (Ex. Tika's down). * [defaults: FALSE]. @@ -118,6 +123,7 @@ public function warmup( ], 'filesize-threshold' => NULL, 'stop-on-failure' => FALSE, + 'save-empty-ocr' => FALSE, 'force' => FALSE, 'no-progress' => FALSE, 'dry-run' => FALSE, @@ -126,6 +132,7 @@ public function warmup( $fid = $options['fid']; $filemime = (array) $options['filemime']; $filesize_threshold = $options['filesize-threshold']; + $save_empty_ocr = (bool) $options['save-empty-ocr']; $stop_on_failure = (bool) $options['stop-on-failure']; $force = (bool) $options['force']; $dry_run = (bool) $options['dry-run']; @@ -188,7 +195,10 @@ public function warmup( // When the OCR'ed file is not available, then run Tika over it // and store it for the next run. 
$body = $this->fileToText->fromFileToText($file, 'eng+fra'); - $this->localFileStorage->save($file, $body, 'eng+fra'); + + if ($body !== '' || $save_empty_ocr) { + $this->localFileStorage->save($file, $body, 'eng+fra'); + } } $progressbar_objects->advance(); diff --git a/modules/entity_to_text_tika/tests/src/Unit/OcrWarmupCommandTest.php b/modules/entity_to_text_tika/tests/src/Unit/OcrWarmupCommandTest.php index c3fc54f..8772ecc 100644 --- a/modules/entity_to_text_tika/tests/src/Unit/OcrWarmupCommandTest.php +++ b/modules/entity_to_text_tika/tests/src/Unit/OcrWarmupCommandTest.php @@ -242,6 +242,7 @@ public function testWarmupDryrun(): void { 'application/pdf', ], 'filesize-threshold' => NULL, + 'save-empty-ocr' => FALSE, 'stop-on-failure' => FALSE, 'force' => FALSE, 'no-progress' => FALSE, @@ -334,6 +335,7 @@ public function testWarmupForce(): void { 'application/pdf', ], 'filesize-threshold' => NULL, + 'save-empty-ocr' => FALSE, 'stop-on-failure' => FALSE, 'force' => TRUE, 'no-progress' => FALSE, @@ -405,6 +407,184 @@ public function testWarmupFid(): void { 'application/pdf', ], 'filesize-threshold' => NULL, + 'save-empty-ocr' => FALSE, + 'stop-on-failure' => FALSE, + 'force' => FALSE, + 'no-progress' => FALSE, + 'dry-run' => FALSE, + ]); + } + + /** + * @covers ::warmup + */ + public function testWarmupSaveEmptyOcr(): void { + $query = $this->createMock(QueryInterface::class); + $query->expects($this->once()) + ->method('accessCheck') + ->with(FALSE); + $query->expects($this->once()) + ->method('condition') + ->with('filemime', [ + 'application/pdf', + ]); + $query->expects($this->once()) + ->method('count') + ->willReturnSelf(); + $query->expects($this->exactly(2)) + ->method('execute') + ->willReturnOnConsecutiveCalls( + // The first call is the count query. + 2, + // The second call is the actual query with file IDs. 
+ [200, 2039], + ); + $query->expects($this->once()) + ->method('range') + ->with(0, 100); + + $this->fileStorage->expects(self::once()) + ->method('getQuery') + ->willReturn($query); + + // Create a test file object. + $file200 = $this->createMock(File::class); + $file200->expects(self::once()) + ->method('getFileUri') + ->willReturn('public://file/test.txt'); + $file200->expects(self::once()) + ->method('id') + ->willReturn(200); + + // Create a test file object. + $file2039 = $this->createMock(File::class); + $file2039->expects(self::once()) + ->method('getFileUri') + ->willReturn('public://file/foo.pdf'); + $file2039->expects(self::once()) + ->method('id') + ->willReturn(2039); + + $this->fileStorage->expects($this->exactly(2)) + ->method('load') + ->withConsecutive( + [200], + [2039], + ) + ->willReturnOnConsecutiveCalls($file200, $file2039); + + $this->localFileStorage->expects($this->exactly(2)) + ->method('load') + ->withConsecutive( + [$file200, 'eng+fra'], + [$file2039, 'eng+fra'], + ) + ->willReturnOnConsecutiveCalls('lorem ipsum', NULL); + + $this->fileToText->expects($this->once()) + ->method('fromFileToText') + ->with($file2039, 'eng+fra') + ->willReturn(''); + + $this->localFileStorage->expects($this->once()) + ->method('save') + ->with($file2039, '', 'eng+fra'); + + $this->warmupCommand->warmup([ + 'fid' => NULL, + 'filemime' => [ + 'application/pdf', + ], + 'filesize-threshold' => NULL, + 'save-empty-ocr' => TRUE, + 'stop-on-failure' => FALSE, + 'force' => FALSE, + 'no-progress' => FALSE, + 'dry-run' => FALSE, + ]); + } + + /** + * @covers ::warmup + */ + public function testWarmupNoSaveEmptyOcr(): void { + $query = $this->createMock(QueryInterface::class); + $query->expects($this->once()) + ->method('accessCheck') + ->with(FALSE); + $query->expects($this->once()) + ->method('condition') + ->with('filemime', [ + 'application/pdf', + ]); + $query->expects($this->once()) + ->method('count') + ->willReturnSelf(); + $query->expects($this->exactly(2)) + 
+ ->method('execute') + ->willReturnOnConsecutiveCalls( + // The first call is the count query. + 2, + // The second call is the actual query with file IDs. + [200, 2039], + ); + $query->expects($this->once()) + ->method('range') + ->with(0, 100); + + $this->fileStorage->expects(self::once()) + ->method('getQuery') + ->willReturn($query); + + // Create a test file object. + $file200 = $this->createMock(File::class); + $file200->expects(self::once()) + ->method('getFileUri') + ->willReturn('public://file/test.txt'); + $file200->expects(self::once()) + ->method('id') + ->willReturn(200); + + // Create a test file object. + $file2039 = $this->createMock(File::class); + $file2039->expects(self::once()) + ->method('getFileUri') + ->willReturn('public://file/foo.pdf'); + $file2039->expects(self::once()) + ->method('id') + ->willReturn(2039); + + $this->fileStorage->expects($this->exactly(2)) + ->method('load') + ->withConsecutive( + [200], + [2039], + ) + ->willReturnOnConsecutiveCalls($file200, $file2039); + + $this->localFileStorage->expects($this->exactly(2)) + ->method('load') + ->withConsecutive( + [$file200, 'eng+fra'], + [$file2039, 'eng+fra'], + ) + ->willReturnOnConsecutiveCalls('lorem ipsum', NULL); + + $this->fileToText->expects($this->once()) + ->method('fromFileToText') + ->with($file2039, 'eng+fra') + ->willReturn(''); + + $this->localFileStorage->expects($this->never()) + ->method('save'); + + $this->warmupCommand->warmup([ + 'fid' => NULL, + 'filemime' => [ + 'application/pdf', + ], + 'filesize-threshold' => NULL, + 'save-empty-ocr' => FALSE, 'stop-on-failure' => FALSE, 'force' => FALSE, 'no-progress' => FALSE,