Softer OCR heuristics, enable float batch multipliers

VikParuchuri · May 28, 2024 · 05729da
1 parent 0281aea
commit 05729da
Show file tree

Hide file tree

Showing 8 changed files with 34 additions and 32 deletions.
diff --git a/.gitignore b/.gitignore
@@ -6,6 +6,8 @@ test_data
 training
 wandb
 *.dat
+report.json
+benchmark_data
 
 # Byte-compiled / optimized / DLL files
 __pycache__/

diff --git a/marker/layout/layout.py b/marker/layout/layout.py
@@ -21,7 +21,7 @@ def surya_layout(doc, pages: List[Page], layout_model, batch_multiplier=1):
     text_detection_results = [p.text_lines for p in pages]
 
     processor = layout_model.processor
-    layout_results = batch_layout_detection(images, layout_model, processor, detection_results=text_detection_results, batch_size=get_batch_size() * batch_multiplier)
+    layout_results = batch_layout_detection(images, layout_model, processor, detection_results=text_detection_results, batch_size=int(get_batch_size() * batch_multiplier))
     for page, layout_result in zip(pages, layout_results):
         page.layout = layout_result
 

diff --git a/marker/layout/order.py b/marker/layout/order.py
@@ -30,7 +30,7 @@ def surya_order(doc, pages: List[Page], order_model, batch_multiplier=1):
         bboxes.append(bbox)
 
     processor = order_model.processor
-    order_results = batch_ordering(images, bboxes, order_model, processor, batch_size=get_batch_size() * batch_multiplier)
+    order_results = batch_ordering(images, bboxes, order_model, processor, batch_size=int(get_batch_size() * batch_multiplier))
     for page, order_result in zip(pages, order_results):
         page.order = order_result
 

diff --git a/marker/ocr/detection.py b/marker/ocr/detection.py
@@ -21,7 +21,7 @@ def surya_detection(doc: PdfDocument, pages: List[Page], det_model, batch_multip
     max_len = min(len(pages), len(doc))
     images = [render_image(doc[pnum], dpi=settings.SURYA_DETECTOR_DPI) for pnum in range(max_len)]
 
-    predictions = batch_text_detection(images, det_model, processor, batch_size=get_batch_size() * batch_multiplier)
+    predictions = batch_text_detection(images, det_model, processor, batch_size=int(get_batch_size() * batch_multiplier))
     for (page, pred) in zip(pages, predictions):
         page.text_lines = pred
 

diff --git a/marker/ocr/heuristics.py b/marker/ocr/heuristics.py
@@ -12,7 +12,7 @@ def should_ocr_page(page: Page, no_text: bool):
 
     # OCR page if we got minimal text, or if we got too many spaces
     conditions = [
-        no_text , # Full doc has no text, and needs full OCR
+        no_text, # Full doc has no text, and needs full OCR
         (len(page.prelim_text) > 0 and detect_bad_ocr(page.prelim_text)),  # Bad OCR
         detected_lines_found is False, # didn't extract text for all detected lines
     ]
@@ -39,7 +39,7 @@ def detect_bad_ocr(text, space_threshold=.7, newline_threshold=.6, alphanum_thre
         return True
 
     invalid_chars = len([c for c in text if c in settings.INVALID_CHARS])
-    if invalid_chars > max(4.0, len(text) * .03):
+    if invalid_chars > max(6.0, len(text) * .03):
         return True
 
     return False
@@ -52,7 +52,7 @@ def no_text_found(pages: List[Page]):
     return len(full_text.strip()) == 0
 
 
-def detected_line_coverage(page: Page, intersect_thresh=.5, detection_thresh=.65):
+def detected_line_coverage(page: Page, intersect_thresh=.5, detection_thresh=.4):
     found_lines = 0
     for detected_line in page.text_lines.bboxes:
 

diff --git a/marker/ocr/recognition.py b/marker/ocr/recognition.py
@@ -83,7 +83,7 @@ def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[P
     detection_results = [p.text_lines.bboxes for p in selected_pages]
     polygons = [[b.polygon for b in bboxes] for bboxes in detection_results]
 
-    results = run_recognition(images, surya_langs, rec_model, processor, polygons=polygons, batch_size=get_batch_size() * batch_multiplier)
+    results = run_recognition(images, surya_langs, rec_model, processor, polygons=polygons, batch_size=int(get_batch_size() * batch_multiplier))
 
     new_pages = []
     for (page_idx, result, old_page) in zip(page_idxs, results, selected_pages):

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "marker-pdf"
-version = "0.2.9"
+version = "0.2.10"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <[email protected]>"]
 readme = "README.md"
@@ -33,10 +33,10 @@ tabulate = "^0.9.0"
 ftfy = "^6.1.1"
 texify = "^0.1.9"
 rapidfuzz = "^3.8.1"
-surya-ocr = "^0.4.8"
+surya-ocr = "^0.4.10"
 filetype = "^1.2.0"
 regex = "^2024.4.28"
-pdftext = "^0.3.8"
+pdftext = "^0.3.10"
 grpcio = "^1.63.0"
 
 [tool.poetry.group.dev.dependencies]