diff --git a/docling/cli/main.py b/docling/cli/main.py index 60a3c296..b35ce174 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -194,6 +194,13 @@ def convert( help="Show version information.", ), ] = None, + document_timeout: Annotated[ + Optional[int], + typer.Option( + ..., + help="The timeout for processing each document, in seconds.", + ), + ] = None, ): logging.basicConfig(level=logging.INFO) @@ -238,6 +245,7 @@ def convert( do_ocr=ocr, ocr_options=ocr_options, do_table_structure=True, + document_timeout=document_timeout, ) pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching pipeline_options.table_structure_options.mode = table_mode diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 2b9d228c..d341f510 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -82,3 +82,4 @@ class PdfPipelineOptions(PipelineOptions): generate_page_images: bool = False generate_picture_images: bool = False generate_table_images: bool = False + document_timeout: Optional[int] = None \ No newline at end of file diff --git a/docling/pipeline/base_pipeline.py b/docling/pipeline/base_pipeline.py index 5013ad58..6bd6b334 100644 --- a/docling/pipeline/base_pipeline.py +++ b/docling/pipeline/base_pipeline.py @@ -41,7 +41,7 @@ def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionRes ): # These steps are building and assembling the structure of the # output DoclingDocument - conv_res = self._build_document(conv_res) + conv_res = self._build_document(conv_res, pdf_document_timeout=self.pipeline_options.document_timeout) conv_res = self._assemble_document(conv_res) # From this stage, all operations should rely only on conv_res.output conv_res = self._enrich_document(conv_res) @@ -54,7 +54,7 @@ def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionRes return conv_res @abstractmethod - def _build_document(self, conv_res: ConversionResult) -> ConversionResult: + def _build_document(self, conv_res: ConversionResult, pdf_document_timeout=None) -> ConversionResult: pass def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult: @@ -115,7 +115,7 @@ def _apply_on_pages( yield from page_batch - def _build_document(self, conv_res: ConversionResult) -> ConversionResult: + def _build_document(self, conv_res: ConversionResult, pdf_document_timeout=None) -> ConversionResult: if not isinstance(conv_res.input._backend, PdfDocumentBackend): raise RuntimeError( @@ -126,6 +126,7 @@ def _build_document(self, conv_res: ConversionResult) -> ConversionResult: # conv_res.status = ConversionStatus.FAILURE # return conv_res + start_time = time.time() with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT): for i in range(0, conv_res.input.page_count): @@ -137,6 +138,13 @@ def _build_document(self, conv_res: ConversionResult) -> ConversionResult: conv_res.pages, settings.perf.page_batch_size ): start_pb_time = time.time() + elapsed_time = start_pb_time - start_time + if pdf_document_timeout is not None and elapsed_time > pdf_document_timeout: + _log.warning( + f"Document processing time ({int(elapsed_time)} s) exceeded the specified timeout of {pdf_document_timeout} s" + ) + conv_res.status = ConversionStatus.PARTIAL_SUCCESS + break # 1. Initialise the page resources init_pages = map(