Merge pull request #261 from VikParuchuri/dev
Integrate new OCR
kyomano committed Aug 19, 2024
2 parents e9b8bb2 + 8b8d9a7 commit 4d13410
Showing 24 changed files with 1,653 additions and 88 deletions.
8 changes: 6 additions & 2 deletions .github/workflows/tests.yml
@@ -27,8 +27,12 @@ jobs:
          unzip -o benchmark_data.zip
      - name: Run benchmark test
        run: |
          poetry run python benchmark.py benchmark_data/pdfs benchmark_data/references report.json
          poetry run python scripts/verify_benchmark_scores.py report.json
          poetry run python benchmarks/overall.py benchmark_data/pdfs benchmark_data/references report.json
          poetry run python scripts/verify_benchmark_scores.py report.json --type marker
      - name: Run table benchmark
        run: |
          poetry run python benchmarks/table.py tables.json
          poetry run python scripts/verify_benchmark_scores.py tables.json --type table
40 changes: 28 additions & 12 deletions README.md
@@ -88,32 +88,40 @@ First, some configuration:

- Inspect the settings in `marker/settings.py`. You can override any settings with environment variables.
- Your torch device will be automatically detected, but you can override this. For example, `TORCH_DEVICE=cuda`.
- If using GPU, set `INFERENCE_RAM` to your GPU VRAM (per GPU). For example, if you have 16 GB of VRAM, set `INFERENCE_RAM=16`.
- Depending on your document types, marker's average memory usage per task can vary slightly. You can configure `VRAM_PER_TASK` to adjust this if you notice tasks failing with GPU out of memory errors.
- By default, marker will use `surya` for OCR. Surya is slower on CPU, but more accurate than tesseract. If you want faster OCR, set `OCR_ENGINE` to `ocrmypdf`. This also requires external dependencies (see above). If you don't want OCR at all, set `OCR_ENGINE` to `None`.
- By default, marker will use `surya` for OCR. Surya is slower on CPU, but more accurate than tesseract. It also doesn't require you to specify the languages in the document. If you want faster OCR, set `OCR_ENGINE` to `ocrmypdf`. This also requires external dependencies (see above). If you don't want OCR at all, set `OCR_ENGINE` to `None`.
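
For example, a minimal sketch of setting these overrides from a wrapper script (the variable names come from the bullets above; the values are illustrative):

```python
import os

# Set overrides before importing any marker modules, so the settings
# module picks them up; names come from the bullets above.
os.environ["TORCH_DEVICE"] = "cuda"
os.environ["OCR_ENGINE"] = "ocrmypdf"  # or "surya" (the default), or "None" to disable OCR

from marker.models import load_all_models

model_lst = load_all_models()
```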

## Interactive App

I've included a streamlit app that lets you interactively try marker with some basic options. Run it with:

```shell
pip install streamlit
marker_gui
```

## Convert a single file

```shell
marker_single /path/to/file.pdf /path/to/output/folder --batch_multiplier 2 --max_pages 10 --langs English
marker_single /path/to/file.pdf /path/to/output/folder --batch_multiplier 2 --max_pages 10
```

- `--batch_multiplier` is how much to multiply default batch sizes by if you have extra VRAM. Higher numbers will take more VRAM, but process faster. Set to 2 by default. The default batch sizes will take ~3GB of VRAM.
- `--max_pages` is the maximum number of pages to process. Omit this to convert the entire document.
- `--langs` is a comma separated list of the languages in the document, for OCR
- `--langs` is an optional comma separated list of the languages in the document, for OCR. It can be omitted with surya (the default OCR engine), but is required if you use tesseract.
- `--ocr_all_pages` is an optional flag that forces OCR on every page of the PDF. If either this flag or the env var `OCR_ALL_PAGES` is set to true, OCR will be forced.

Make sure the `DEFAULT_LANG` setting is set appropriately for your document. The list of supported languages for OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/languages.py). If you need more languages, you can use any language supported by [Tesseract](https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016) if you set `OCR_ENGINE` to `ocrmypdf`. If you don't need OCR, marker can work with any language.
The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/languages.py). If you need more languages, you can use any language supported by [Tesseract](https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016) if you set `OCR_ENGINE` to `ocrmypdf`. If you don't need OCR, marker can work with any language.
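
If you'd rather call marker from Python than the CLI, here is a hedged sketch of the equivalent of `marker_single`; the keyword arguments mirror the flags above and the `convert_single_pdf` signature shown later in this diff, and the paths are placeholders:

```python
from marker.convert import convert_single_pdf
from marker.models import load_all_models

model_lst = load_all_models()
full_text, images, out_meta = convert_single_pdf(
    "/path/to/file.pdf",
    model_lst,
    max_pages=10,
    langs=["English"],   # optional for surya (the default), required for tesseract
    batch_multiplier=2,
    ocr_all_pages=False,
)

# Write the markdown out; convert_single.py uses a save_markdown helper instead
with open("/path/to/output/file.md", "w") as f:
    f.write(full_text)
```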

## Convert multiple files

```shell
marker /path/to/input/folder /path/to/output/folder --workers 10 --max 10 --metadata_file /path/to/metadata.json --min_length 10000
marker /path/to/input/folder /path/to/output/folder --workers 4 --max 10 --min_length 10000
```

- `--workers` is the number of pdfs to convert at once. This is set to 1 by default, but you can increase it to increase throughput, at the cost of more CPU/GPU usage. Parallelism will not increase beyond `INFERENCE_RAM / VRAM_PER_TASK` if you're using GPU.
- `--workers` is the number of pdfs to convert at once. This is set to 5 by default, but you can raise it for higher throughput, at the cost of more CPU/GPU usage. Marker will use 5GB of VRAM per worker at peak, and 3.5GB on average.
- `--max` is the maximum number of pdfs to convert. Omit this to convert all pdfs in the folder.
- `--min_length` is the minimum number of characters that need to be extracted from a pdf before it will be considered for processing. If you're processing a lot of pdfs, I recommend setting this to avoid OCRing pdfs that are mostly images, since OCR slows everything down.
- `--metadata_file` is an optional path to a json file with metadata about the pdfs. If you provide it, it will be used to set the language for each pdf. If not, `DEFAULT_LANG` will be used. The format is:
- `--metadata_file` is an optional path to a json file with metadata about the pdfs. If you provide it, it will be used to set the language for each pdf. Setting the language is optional for surya (the default), but required for tesseract. The format is:

```
{
  "pdf1.pdf": {"languages": ["English"]},
  "pdf2.pdf": {"languages": ["Spanish", "Russian"]},
  "pdf3.pdf": {"languages": ["English", "German"]}
}
```

@@ -133,7 +141,7 @@ MIN_LENGTH=10000 METADATA_FILE=../pdf_meta.json NUM_DEVICES=4 NUM_WORKERS=15 mar

- `METADATA_FILE` is an optional path to a json file with metadata about the pdfs. See above for the format.
- `NUM_DEVICES` is the number of GPUs to use. Should be `2` or greater.
- `NUM_WORKERS` is the number of parallel processes to run on each GPU. Per-GPU parallelism will not increase beyond `INFERENCE_RAM / VRAM_PER_TASK`.
- `NUM_WORKERS` is the number of parallel processes to run on each GPU.
- `MIN_LENGTH` is the minimum number of characters that need to be extracted from a pdf before it will be considered for processing. If you're processing a lot of pdfs, I recommend setting this to avoid OCRing pdfs that are mostly images, since OCR slows everything down.

Note that the env variables above are specific to this script, and cannot be set in `local.env`.
@@ -199,16 +207,24 @@ git clone https://github.com/VikParuchuri/marker.git
poetry install
```

Download the benchmark data [here](https://drive.google.com/file/d/1ZSeWDo2g1y0BRLT7KnbmytV2bjWARWba/view?usp=sharing) and unzip. Then run `benchmark.py` like this:
Download the benchmark data [here](https://drive.google.com/file/d/1ZSeWDo2g1y0BRLT7KnbmytV2bjWARWba/view?usp=sharing) and unzip. Then run the overall benchmark like this:

```shell
python benchmark.py data/pdfs data/references report.json --nougat
python benchmarks/overall.py data/pdfs data/references report.json --nougat
```

This will benchmark marker against other text extraction methods. It sets up batch sizes for nougat and marker to use a similar amount of GPU RAM for each.

Omit `--nougat` to exclude nougat from the benchmark. I don't recommend running nougat on CPU, since it is very slow.

### Table benchmark

There is a benchmark for table parsing, which you can run with:

```shell
python benchmarks/table.py test_data/tables.json
```
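
The script writes a JSON list of per-table results; the field names below follow the `results` entries built in `benchmarks/table.py` (shown later in this diff):

```python
import json

with open("test_data/tables.json") as f:
    results = json.load(f)

# Each entry pairs a fuzzy-match score with provenance for the table
scores = [r["score"] for r in results]
print(f"{len(scores)} tables, mean score {sum(scores) / len(scores):.3f}")
print(results[0]["arxiv_id"], results[0]["page_idx"])
```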

# Thanks

This work would not have been possible without amazing open source models and datasets, including (but not limited to):
File renamed without changes.
77 changes: 77 additions & 0 deletions benchmarks/table.py
@@ -0,0 +1,77 @@
import argparse
import json

import datasets
from surya.schema import LayoutResult, LayoutBox
from tqdm import tqdm

from marker.benchmark.table import score_table
from marker.schema.bbox import rescale_bbox
from marker.schema.page import Page
from marker.tables.table import format_tables


def main():
    parser = argparse.ArgumentParser(description="Benchmark table conversion.")
    parser.add_argument("out_file", help="Output filename for results")
    parser.add_argument("--dataset", type=str, help="Dataset to use", default="vikp/table_bench")
    args = parser.parse_args()

    ds = datasets.load_dataset(args.dataset, split="train")

    results = []
    for i in tqdm(range(len(ds)), desc="Evaluating tables"):
        row = ds[i]
        marker_page = Page(**json.loads(row["marker_page"]))
        table_bbox = row["table_bbox"]
        gpt4_table = json.loads(row["gpt_4_table"])["markdown_table"]

        # Counterclockwise polygon from top left
        table_poly = [
            [table_bbox[0], table_bbox[1]],
            [table_bbox[2], table_bbox[1]],
            [table_bbox[2], table_bbox[3]],
            [table_bbox[0], table_bbox[3]],
        ]

        # Remove all other tables from the layout results
        layout_result = LayoutResult(
            bboxes=[
                LayoutBox(
                    label="Table",
                    polygon=table_poly
                )
            ],
            segmentation_map="",
            image_bbox=marker_page.text_lines.image_bbox
        )

        marker_page.layout = layout_result
        format_tables([marker_page])

        table_blocks = [block for block in marker_page.blocks if block.block_type == "Table"]
        if len(table_blocks) != 1:
            continue

        table_block = table_blocks[0]
        table_md = table_block.lines[0].spans[0].text

        results.append({
            "score": score_table(table_md, gpt4_table),
            "arxiv_id": row["arxiv_id"],
            "page_idx": row["page_idx"],
            "marker_table": table_md,
            "gpt4_table": gpt4_table,
            "table_bbox": table_bbox
        })

    avg_score = sum([r["score"] for r in results]) / len(results)
    print(f"Evaluated {len(results)} tables, average score is {avg_score}.")

    with open(args.out_file, "w+") as f:
        json.dump(results, f, indent=2)


if __name__ == "__main__":
    main()
11 changes: 2 additions & 9 deletions convert.py
@@ -73,8 +73,8 @@ def main():
parser.add_argument("--chunk_idx", type=int, default=0, help="Chunk index to convert")
parser.add_argument("--num_chunks", type=int, default=1, help="Number of chunks being processed in parallel")
parser.add_argument("--max", type=int, default=None, help="Maximum number of pdfs to convert")
parser.add_argument("--workers", type=int, default=5, help="Number of worker processes to use")
parser.add_argument("--metadata_file", type=str, default=None, help="Metadata json file to use for filtering")
parser.add_argument("--workers", type=int, default=5, help="Number of worker processes to use. Peak VRAM usage per process is 5GB, but avg is closer to 3.5GB.")
parser.add_argument("--metadata_file", type=str, default=None, help="Metadata json file to use for languages")
parser.add_argument("--min_length", type=int, default=None, help="Minimum length of pdf to convert")

args = parser.parse_args()
@@ -104,13 +104,6 @@ def main():

    total_processes = min(len(files_to_convert), args.workers)

    # Dynamically set GPU allocation per task based on GPU ram
    if settings.CUDA:
        tasks_per_gpu = settings.INFERENCE_RAM // settings.VRAM_PER_TASK if settings.CUDA else 0
        total_processes = int(min(tasks_per_gpu, total_processes))
    else:
        total_processes = int(total_processes)

    try:
        mp.set_start_method('spawn')  # Required for CUDA, forkserver doesn't work
    except RuntimeError:
5 changes: 3 additions & 2 deletions convert_single.py
@@ -20,17 +20,18 @@ def main():
parser.add_argument("output", help="Output base folder path")
parser.add_argument("--max_pages", type=int, default=None, help="Maximum number of pages to parse")
parser.add_argument("--start_page", type=int, default=None, help="Page to start processing at")
parser.add_argument("--langs", type=str, help="Languages to use for OCR, comma separated", default=None)
parser.add_argument("--langs", type=str, help="Optional languages to use for OCR, comma separated", default=None)
parser.add_argument("--batch_multiplier", type=int, default=2, help="How much to increase batch sizes")
parser.add_argument("--debug", action="store_true", help="Enable debug logging", default=False)
parser.add_argument("--ocr_all_pages", action="store_true", help="Force OCR on all pages", default=False)
args = parser.parse_args()

langs = args.langs.split(",") if args.langs else None

fname = args.filename
model_lst = load_all_models()
start = time.time()
full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, langs=langs, batch_multiplier=args.batch_multiplier, start_page=args.start_page)
full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, langs=langs, batch_multiplier=args.batch_multiplier, start_page=args.start_page, ocr_all_pages=args.ocr_all_pages)

fname = os.path.basename(fname)
subfolder_path = save_markdown(args.output, fname, full_text, images, out_meta)
2 changes: 1 addition & 1 deletion marker/benchmark/scoring.py
@@ -37,4 +37,4 @@ def score_text(hypothesis, reference):
    hypothesis_chunks = chunk_text(hypothesis)
    reference_chunks = chunk_text(reference)
    chunk_scores = overlap_score(hypothesis_chunks, reference_chunks)
    return mean(chunk_scores)
    return mean(chunk_scores)
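
As a usage sketch (assuming `chunk_text` and `overlap_score`, defined earlier in this file, behave as their names suggest), `score_text` returns the mean per-chunk overlap between a hypothesis and a reference:

```python
from marker.benchmark.scoring import score_text

reference = "Marker converts PDF, EPUB, and MOBI files to markdown."
hypothesis = "Marker converts PDF, EPUB and MOBI files to markdown."

# Near-identical strings should score close to 1.0; unrelated text near 0
print(score_text(hypothesis, reference))
```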
41 changes: 41 additions & 0 deletions marker/benchmark/table.py
@@ -0,0 +1,41 @@
from rapidfuzz import fuzz
import re


def split_to_cells(table):
    table = table.strip()
    table = re.sub(r" {2,}", "", table)
    table_rows = table.split("\n")
    table_rows = [t for t in table_rows if t.strip()]
    table_cells = [r.split("|") for r in table_rows]
    return table_cells


def align_rows(hypothesis, ref_row):
    best_alignment = []
    best_alignment_score = 0
    for j in range(0, len(hypothesis)):
        alignments = []
        for i in range(len(ref_row)):
            if i >= len(hypothesis[j]):
                alignments.append(0)
                continue
            alignment = fuzz.ratio(hypothesis[j][i], ref_row[i], score_cutoff=30) / 100
            alignments.append(alignment)
        if len(alignments) == 0:
            continue
        alignment_score = sum(alignments) / len(alignments)
        if alignment_score >= best_alignment_score:
            best_alignment = alignments
            best_alignment_score = alignment_score
    return best_alignment


def score_table(hypothesis, reference):
    hypothesis = split_to_cells(hypothesis)
    reference = split_to_cells(reference)

    alignments = []
    for i in range(0, len(reference)):
        alignments.extend(align_rows(hypothesis, reference[i]))
    return sum(alignments) / len(alignments)
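
A hedged usage sketch: `score_table` matches each reference row against its best-aligned hypothesis row, scoring cells with rapidfuzz at a cutoff of 30, so identical markdown tables score 1.0 and unrelated ones score near 0:

```python
from marker.benchmark.table import score_table

reference = """
| Name  | Age |
|-------|-----|
| Alice | 30  |
| Bob   | 32  |
"""

hypothesis = """
| Name  | Age |
|-------|-----|
| Alice | 30  |
| Bob   | 31  |
"""

# One cell differs, so the score lands a little below 1.0
print(score_table(hypothesis, reference))
```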
9 changes: 4 additions & 5 deletions marker/convert.py
@@ -41,11 +41,10 @@ def convert_single_pdf(
        start_page: int = None,
        metadata: Optional[Dict] = None,
        langs: Optional[List[str]] = None,
        batch_multiplier: int = 1
        batch_multiplier: int = 1,
        ocr_all_pages: bool = False
) -> Tuple[str, Dict[str, Image.Image], Dict]:
    # Set language needed for OCR
    if langs is None:
        langs = [settings.DEFAULT_LANG]
    ocr_all_pages = ocr_all_pages or settings.OCR_ALL_PAGES

    if metadata:
        langs = metadata.get("languages", langs)
@@ -91,7 +90,7 @@ def convert_single_pdf(
    flush_cuda_memory()

    # OCR pages as needed
    pages, ocr_stats = run_ocr(doc, pages, langs, ocr_model, batch_multiplier=batch_multiplier)
    pages, ocr_stats = run_ocr(doc, pages, langs, ocr_model, batch_multiplier=batch_multiplier, ocr_all_pages=ocr_all_pages)
    flush_cuda_memory()

    out_meta["ocr_stats"] = ocr_stats
3 changes: 2 additions & 1 deletion marker/equations/inference.py
@@ -1,4 +1,5 @@
from texify.inference import batch_inference
from tqdm import tqdm

from marker.settings import settings
import os
@@ -22,7 +23,7 @@ def get_latex_batched(images, token_counts, texify_model, batch_multiplier=1):
predictions = [""] * len(images)
batch_size = get_batch_size() * batch_multiplier

for i in range(0, len(images), batch_size):
for i in tqdm(range(0, len(images), batch_size), desc="Recognizing equations"):
# Dynamically set max length to save inference time
min_idx = i
max_idx = min(min_idx + batch_size, len(images))
10 changes: 5 additions & 5 deletions marker/models.py
@@ -13,11 +13,11 @@
from surya.model.ordering.processor import load_processor as load_order_processor


def setup_recognition_model(langs, device=None, dtype=None):
def setup_recognition_model(device=None, dtype=None):
    if device:
        rec_model = load_recognition_model(langs=langs, device=device, dtype=dtype)
        rec_model = load_recognition_model(device=device, dtype=dtype)
    else:
        rec_model = load_recognition_model(langs=langs)
        rec_model = load_recognition_model()
    rec_processor = load_recognition_processor()
    rec_model.processor = rec_processor
    return rec_model
@@ -64,7 +64,7 @@ def setup_order_model(device=None, dtype=None):
    return model


def load_all_models(langs=None, device=None, dtype=None, force_load_ocr=False):
def load_all_models(device=None, dtype=None, force_load_ocr=False):
    if device is not None:
        assert dtype is not None, "Must provide dtype if device is provided"

@@ -75,7 +75,7 @@ def load_all_models(langs=None, device=None, dtype=None, force_load_ocr=False):
    edit = load_editing_model(device, dtype)

    # Only load recognition model if we'll need it for all pdfs
    ocr = setup_recognition_model(langs, device, dtype)
    ocr = setup_recognition_model(device, dtype)
    texify = setup_texify_model(device, dtype)
    model_lst = [texify, layout, order, edit, detection, ocr]
    return model_lst
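
A hedged usage sketch of the new `load_all_models` signature: `langs` is no longer needed at load time, and per the assertion above, an explicit `device` must come with an explicit `dtype`:

```python
import torch

from marker.models import load_all_models

# Default: device and dtype are picked from settings
model_lst = load_all_models()

# An explicit device requires an explicit dtype (enforced by the assert above)
model_lst = load_all_models(device="cuda", dtype=torch.float16)
```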
4 changes: 2 additions & 2 deletions marker/ocr/heuristics.py
@@ -7,7 +7,7 @@
from marker.settings import settings


def should_ocr_page(page: Page, no_text: bool):
def should_ocr_page(page: Page, no_text: bool, ocr_all_pages=False):
    detected_lines_found, total_lines = detected_line_coverage(page)

    # No reason to OCR page if it has no text lines
@@ -21,7 +21,7 @@ def should_ocr_page(page: Page, no_text: bool):
        detected_lines_found is False,  # didn't extract text for all detected lines
    ]

    return any(conditions) or settings.OCR_ALL_PAGES
    return any(conditions) or ocr_all_pages


def detect_bad_ocr(text, space_threshold=.7, newline_threshold=.6, alphanum_threshold=.3):
8 changes: 8 additions & 0 deletions marker/ocr/lang.py
@@ -15,10 +15,16 @@ def langs_to_ids(langs: List[str]):

def replace_langs_with_codes(langs):
    if settings.OCR_ENGINE == "surya":
        if langs is None:
            return
        for i, lang in enumerate(langs):
            if lang.title() in LANGUAGE_TO_CODE:
                langs[i] = LANGUAGE_TO_CODE[lang.title()]
    else:
        if langs is None:
            langs = [settings.DEFAULT_LANG]
            print(f"No languages specified for tesseract, defaulting to {settings.DEFAULT_LANG}.")

        for i, lang in enumerate(langs):
            if lang in LANGUAGE_TO_CODE:
                langs[i] = LANGUAGE_TO_TESSERACT_CODE[lang]
@@ -27,6 +33,8 @@ def replace_langs_with_codes(langs):

def validate_langs(langs):
    if settings.OCR_ENGINE == "surya":
        if langs is None:
            return
        for lang in langs:
            if lang not in CODE_TO_LANGUAGE:
                raise ValueError(f"Invalid language code {lang} for Surya OCR")
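
To make the new `None` handling concrete, a hedged sketch (assuming `settings.OCR_ENGINE == "surya"`, and that surya's `LANGUAGE_TO_CODE` maps names like "English" to codes like "en"):

```python
from marker.ocr.lang import replace_langs_with_codes, validate_langs

langs = ["English", "French"]
replace_langs_with_codes(langs)  # mutates in place, e.g. ["en", "fr"]
validate_langs(langs)            # raises ValueError on unknown codes

# With the new guards, passing no languages is a no-op for surya
replace_langs_with_codes(None)
validate_langs(None)
```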
