Skip to content

Commit

Permalink
add filesize threshold capability on tika warmup caching ocr file
Browse files Browse the repository at this point in the history
  • Loading branch information
WengerK committed Apr 25, 2024
1 parent f96a5f5 commit bbec8df
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 1 deletion.
16 changes: 16 additions & 0 deletions modules/entity_to_text_tika/src/Commands/OcrWarmupCommand.php
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,10 @@ public function __construct(EntityTypeManagerInterface $entity_type_manager, Fil
* 'application/vnd.ms-excel',
* 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
* ].
* @option filesize-threshold
* The maximum size, in bytes, of a file to be processed. Files larger
* than this threshold are skipped, which avoids processing large files.
* [defaults: NULL].
* @option stop-on-failure
* Stop processing on the first failure (e.g. Tika is down).
* [defaults: FALSE].
Expand All @@ -90,6 +94,10 @@ public function __construct(EntityTypeManagerInterface $entity_type_manager, Fil
* Warmup all files even if they have already been processed.
* @usage drush e2t:t:w --fid=2
* Warmup the file with FID 2.
* @usage drush e2t:t:w --filemime=application/pdf
* Warmup all PDF files.
* @usage drush e2t:t:w --filesize-threshold=1000000
* Warmup only files whose size does not exceed 1 MB (1,000,000 bytes).
*/
public function warmup(
array $options = [
Expand All @@ -99,6 +107,7 @@ public function warmup(
'application/msword', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'application/vnd.ms-excel', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
],
'filesize-threshold' => NULL,
'stop-on-failure' => FALSE,
'force' => FALSE,
'no-progress' => FALSE,
Expand All @@ -107,6 +116,7 @@ public function warmup(
): void {
$fid = $options['fid'];
$filemime = (array) $options['filemime'];
$filesiz_threshold = $options['filesize-threshold'];
$stop_on_failure = (bool) $options['stop-on-failure'];
$force = (bool) $options['force'];
$dry_run = (bool) $options['dry-run'];
Expand Down Expand Up @@ -145,6 +155,12 @@ public function warmup(

$this->output()->writeln(sprintf('Processing file (%s) "%s".', $file->id(), $file->getFileUri()), OutputInterface::VERBOSITY_VERBOSE);

if ($filesiz_threshold && $file->getSize() > $filesiz_threshold) {
$this->output()->writeln(sprintf('File (%s) "%s" is too large to be processed (%d bytes).', $file->id(), $file->getFileUri(), $file->getSize()), OutputInterface::VERBOSITY_VERBOSE);
$progressbar_objects->advance();
continue;
}

if ($dry_run) {
$progressbar_objects->advance();
continue;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,7 @@ public function testWarmupDryrun(): void {
'filemime' => [
'application/pdf',
],
'filesiz_threshold' => NULL,
'stop-on-failure' => FALSE,
'force' => FALSE,
'no-progress' => FALSE,
Expand Down Expand Up @@ -325,6 +326,7 @@ public function testWarmupForce(): void {
'filemime' => [
'application/pdf',
],
'filesiz_threshold' => NULL,
'stop-on-failure' => FALSE,
'force' => TRUE,
'no-progress' => FALSE,
Expand All @@ -335,7 +337,7 @@ public function testWarmupForce(): void {
/**
* @covers ::warmup
*/
public function testWarmupF(): void {
public function testWarmupFid(): void {
$query = $this->createMock(QueryInterface::class);
$query->expects($this->once())
->method('accessCheck')
Expand Down Expand Up @@ -395,6 +397,7 @@ public function testWarmupF(): void {
'filemime' => [
'application/pdf',
],
'filesiz_threshold' => NULL,
'stop-on-failure' => FALSE,
'force' => FALSE,
'no-progress' => FALSE,
Expand Down

0 comments on commit bbec8df

Please sign in to comment.