Skip to content

Commit

Permalink
enhancement: Add timeout limit to document parsing job. DS4SD#270
Browse files Browse the repository at this point in the history
Testing:
(.venv) mario@Abhisheks-MacBook-Air docling % docling https://arxiv.org/pdf/2206.01062 --document-timeout=100
INFO:docling.document_converter:Going to convert document batch...
Fetching 9 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 87584.07it/s]
INFO:docling.pipeline.base_pipeline:Processing document 2206.01062v1.pdf
INFO:docling.document_converter:Finished converting document 2206.01062v1.pdf in 24.12 sec.
INFO:docling.cli.main:writing Markdown output to 2206.01062v1.md
INFO:docling.cli.main:Processed 1 docs, of which 0 failed
INFO:docling.cli.main:All documents were converted in 24.13 seconds.

(.venv) mario@Abhisheks-MacBook-Air docling % docling https://arxiv.org/pdf/2206.01062 --document-timeout=5
INFO:docling.document_converter:Going to convert document batch...
Fetching 9 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 29037.49it/s]
INFO:docling.pipeline.base_pipeline:Processing document 2206.01062v1.pdf
WARNING:docling.pipeline.base_pipeline:Document processing time (6 s) exceeded the specified timeout of 5 s
INFO:docling.document_converter:Finished converting document 2206.01062v1.pdf in 10.82 sec.
WARNING:docling.cli.main:Document /var/folders/d7/dsfkllxs0xs8x2t4fcjknj4c0000gn/T/tmpzedg349h/2206.01062v1.pdf failed to convert.
INFO:docling.cli.main:Processed 1 docs, of which 1 failed
INFO:docling.cli.main:All documents were converted in 10.82 seconds.

(.venv) mario@Abhisheks-MacBook-Air docling % docling https://arxiv.org/pdf/2206.01062
INFO:docling.document_converter:Going to convert document batch...
Fetching 9 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 88197.98it/s]
INFO:docling.pipeline.base_pipeline:Processing document 2206.01062v1.pdf
INFO:docling.document_converter:Finished converting document 2206.01062v1.pdf in 22.59 sec.
INFO:docling.cli.main:writing Markdown output to 2206.01062v1.md
INFO:docling.cli.main:Processed 1 docs, of which 0 failed
INFO:docling.cli.main:All documents were converted in 22.60 seconds.

(.venv) mario@Abhisheks-MacBook-Air docling % docling

 Usage: docling [OPTIONS] source

╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ *    input_sources      source  PDF files to convert. Can be local file / directory paths or URL. [default: None] [required]                                                                             │
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ --from                                       [docx|pptx|html|image|pdf|asciidoc|md]  Specify input formats to convert from. Defaults to all formats. [default: None]                                     │
│ --to                                         [md|json|text|doctags]                  Specify output formats. Defaults to Markdown. [default: None]                                                       │
│ --ocr                 --no-ocr                                                       If enabled, the bitmap content will be processed using OCR. [default: ocr]                                          │
│ --force-ocr           --no-force-ocr                                                 Replace any existing text with OCR generated text over the full content. [default: no-force-ocr]                    │
│ --ocr-engine                                 [easyocr|tesseract_cli|tesseract]       The OCR engine to use. [default: easyocr]                                                                           │
│ --pdf-backend                                [pypdfium2|dlparse_v1|dlparse_v2]       The PDF backend to use. [default: dlparse_v1]                                                                       │
│ --table-mode                                 [fast|accurate]                         The mode to use in the table structure model. [default: fast]                                                       │
│ --artifacts-path                             PATH                                    If provided, the location of the model artifacts. [default: None]                                                   │
│ --abort-on-error      --no-abort-on-error                                            If enabled, the processing will be aborted when the first error is encountered. [default: no-abort-on-error]        │
│ --output                                     PATH                                    Output directory where results are saved. [default: .]                                                              │
│ --version                                                                            Show version information.                                                                                           │
│ --document-timeout                           INTEGER                                 The timeout for processing each document, in seconds. [default: None]                                               │
│ --help                                                                               Show this message and exit.                                                                                         │
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
  • Loading branch information
Abhishek Kumar authored and Abhishek Kumar committed Nov 12, 2024
1 parent 2c0c439 commit 089e39e
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 3 deletions.
8 changes: 8 additions & 0 deletions docling/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,13 @@ def convert(
help="Show version information.",
),
] = None,
document_timeout: Annotated[
Optional[int],
typer.Option(
...,
help="The timeout for processing each document, in seconds.",
),
] = None,
):
logging.basicConfig(level=logging.INFO)

Expand Down Expand Up @@ -238,6 +245,7 @@ def convert(
do_ocr=ocr,
ocr_options=ocr_options,
do_table_structure=True,
document_timeout=document_timeout,
)
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
pipeline_options.table_structure_options.mode = table_mode
Expand Down
1 change: 1 addition & 0 deletions docling/datamodel/pipeline_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,4 @@ class PdfPipelineOptions(PipelineOptions):
generate_page_images: bool = False
generate_picture_images: bool = False
generate_table_images: bool = False
document_timeout: Optional[int] = None
14 changes: 11 additions & 3 deletions docling/pipeline/base_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionRes
):
# These steps are building and assembling the structure of the
# output DoclingDocument
conv_res = self._build_document(conv_res)
conv_res = self._build_document(conv_res, pdf_document_timeout=self.pipeline_options.document_timeout)
conv_res = self._assemble_document(conv_res)
# From this stage, all operations should rely only on conv_res.output
conv_res = self._enrich_document(conv_res)
Expand All @@ -54,7 +54,7 @@ def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionRes
return conv_res

@abstractmethod
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
def _build_document(self, conv_res: ConversionResult, pdf_document_timeout=None) -> ConversionResult:
pass

def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
Expand Down Expand Up @@ -115,7 +115,7 @@ def _apply_on_pages(

yield from page_batch

def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
def _build_document(self, conv_res: ConversionResult, pdf_document_timeout=None) -> ConversionResult:

if not isinstance(conv_res.input._backend, PdfDocumentBackend):
raise RuntimeError(
Expand All @@ -126,6 +126,7 @@ def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
# conv_res.status = ConversionStatus.FAILURE
# return conv_res

start_time = time.time()
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):

for i in range(0, conv_res.input.page_count):
Expand All @@ -137,6 +138,13 @@ def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
conv_res.pages, settings.perf.page_batch_size
):
start_pb_time = time.time()
elapsed_time = start_pb_time - start_time
if pdf_document_timeout is not None and elapsed_time > pdf_document_timeout:
_log.warning(
f"Document processing time ({int(elapsed_time)} s) exceeded the specified timeout of {pdf_document_timeout} s"
)
conv_res.status = ConversionStatus.PARTIAL_SUCCESS
break

# 1. Initialise the page resources
init_pages = map(
Expand Down

0 comments on commit 089e39e

Please sign in to comment.