diff --git a/.gitignore b/.gitignore
index f4cf5dcf..24a84212 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,7 @@ wandb
 *.dat
 report.json
 benchmark_data
+debug
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/README.md b/README.md
index 313651bf..5f977367 100644
--- a/README.md
+++ b/README.md
@@ -89,6 +89,7 @@ First, some configuration:
 - Inspect the settings in `marker/settings.py`. You can override any settings with environment variables.
 - Your torch device will be automatically detected, but you can override this. For example, `TORCH_DEVICE=cuda`.
 - By default, marker will use `surya` for OCR. Surya is slower on CPU, but more accurate than tesseract. It also doesn't require you to specify the languages in the document. If you want faster OCR, set `OCR_ENGINE` to `ocrmypdf`. This also requires external dependencies (see above). If you don't want OCR at all, set `OCR_ENGINE` to `None`.
+- Some PDFs, even digital ones, have bad text in them. Set `OCR_ALL_PAGES=true` to force OCR if you find bad output from marker.
 
 ## Interactive App
 
@@ -107,15 +108,15 @@ marker_single /path/to/file.pdf /path/to/output/folder --batch_multiplier 2 --ma
 
 - `--batch_multiplier` is how much to multiply default batch sizes by if you have extra VRAM. Higher numbers will take more VRAM, but process faster. Set to 2 by default. The default batch sizes will take ~3GB of VRAM.
 - `--max_pages` is the maximum number of pages to process. Omit this to convert the entire document.
+- `--start_page` is the page to start from (default is None, which starts from the first page).
 - `--langs` is an optional comma-separated list of the languages in the document, for OCR. Optional by default, required if you use tesseract.
-- `--ocr_all_pages` is an optional argument to force OCR on all pages of the PDF. If this or the env var `OCR_ALL_PAGES` are true, OCR will be forced.
 
 The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/languages.py). If you need more languages, you can use any language supported by [Tesseract](https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016) if you set `OCR_ENGINE` to `ocrmypdf`. If you don't need OCR, marker can work with any language.
 
 ## Convert multiple files
 
 ```shell
-marker /path/to/input/folder /path/to/output/folder --workers 4 --max 10 --min_length 10000
+marker /path/to/input/folder /path/to/output/folder --workers 4 --max 10
 ```
 
 - `--workers` is the number of pdfs to convert at once. This is set to 1 by default, but you can increase it to increase throughput, at the cost of more CPU/GPU usage. Marker will use 5GB of VRAM per worker at the peak, and 3.5GB average.
@@ -136,7 +137,7 @@ You can use language names or codes. The exact codes depend on the OCR engine.
 ## Convert multiple files on multiple GPUs
 
 ```shell
-MIN_LENGTH=10000 METADATA_FILE=../pdf_meta.json NUM_DEVICES=4 NUM_WORKERS=15 marker_chunk_convert ../pdf_in ../md_out
+METADATA_FILE=../pdf_meta.json NUM_DEVICES=4 NUM_WORKERS=15 marker_chunk_convert ../pdf_in ../md_out
 ```
 
 - `METADATA_FILE` is an optional path to a json file with metadata about the pdfs. See above for the format.
@@ -150,15 +151,18 @@ Note that the env variables above are specific to this script, and cannot be set
 
 There are some settings that you may find useful if things aren't working the way you expect:
 
-- `OCR_ALL_PAGES` - set this to true to force OCR all pages. This can be very useful if the table layouts aren't recognized properly by default, or if there is garbled text.
+- `OCR_ALL_PAGES` - set this to true to force OCR all pages. This can be very useful if there is garbled text in the output of marker.
 - `TORCH_DEVICE` - set this to force marker to use a given torch device for inference.
 - `OCR_ENGINE` - can set this to `surya` or `ocrmypdf`.
-- `DEBUG` - setting this to `True` shows ray logs when converting multiple pdfs
 - Verify that you set the languages correctly, or passed in a metadata file.
 - If you're getting out of memory errors, decrease worker count (increase the `VRAM_PER_TASK` setting). You can also try splitting up long PDFs into multiple files.
 
 In general, if output is not what you expect, trying to OCR the PDF is a good first step. Not all PDFs have good text/bboxes embedded in them.
 
+## Debugging
+
+Set `DEBUG=true` to save data to the `debug` subfolder in the marker root directory. This will save images of each page with detected layout and text, as well as output a json file with additional bounding box information.
+
 ## Useful settings
 
 These settings can improve/change output quality:
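For reference, the settings named in the README changes above (`OCR_ALL_PAGES`, `DEBUG`, `TORCH_DEVICE`, ...) are plain pydantic `BaseSettings` fields in `marker/settings.py`, which is why environment variables override them. A minimal sketch of that mechanism; the `DemoSettings` class is illustrative, not marker's real class:

```python
import os
from pydantic_settings import BaseSettings


# Illustrative subset of marker's Settings class: pydantic-settings reads
# each field from an environment variable of the same name if one is set.
class DemoSettings(BaseSettings):
    DEBUG: bool = False
    OCR_ALL_PAGES: bool = False
    OCR_ENGINE: str = "surya"


os.environ["OCR_ALL_PAGES"] = "true"  # e.g. exported in the shell
print(DemoSettings().OCR_ALL_PAGES)   # True - the env var overrides the default
```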
diff --git a/convert_single.py b/convert_single.py
index eb99bf1d..3469e3ba 100755
--- a/convert_single.py
+++ b/convert_single.py
@@ -2,6 +2,9 @@
 import pypdfium2 # Needs to be at the top to avoid warnings
 import os
+
+from marker.settings import settings
+
 os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers decided to use .isin for a simple op, which is not supported on MPS
 
 import argparse
@@ -22,8 +25,6 @@ def main():
     parser.add_argument("--start_page", type=int, default=None, help="Page to start processing at")
     parser.add_argument("--langs", type=str, help="Optional languages to use for OCR, comma separated", default=None)
     parser.add_argument("--batch_multiplier", type=int, default=2, help="How much to increase batch sizes")
-    parser.add_argument("--debug", action="store_true", help="Enable debug logging", default=False)
-    parser.add_argument("--ocr_all_pages", action="store_true", help="Force OCR on all pages", default=False)
     args = parser.parse_args()
 
     langs = args.langs.split(",") if args.langs else None
@@ -31,14 +32,13 @@ def main():
     fname = args.filename
     model_lst = load_all_models()
     start = time.time()
-    full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, langs=langs, batch_multiplier=args.batch_multiplier, start_page=args.start_page, ocr_all_pages=args.ocr_all_pages)
+    full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, langs=langs, batch_multiplier=args.batch_multiplier, start_page=args.start_page)
 
     fname = os.path.basename(fname)
     subfolder_path = save_markdown(args.output, fname, full_text, images, out_meta)
 
     print(f"Saved markdown to the {subfolder_path} folder")
-    if args.debug:
-        print(f"Total time: {time.time() - start}")
+    print(f"Total time: {time.time() - start}")
 
 
 if __name__ == "__main__":
diff --git a/marker/cleaners/headings.py b/marker/cleaners/headings.py
index 385bec3d..0e2703bb 100644
--- a/marker/cleaners/headings.py
+++ b/marker/cleaners/headings.py
@@ -68,7 +68,7 @@ def bucket_headings(line_heights, num_levels=settings.HEADING_LEVEL_COUNT):
     data_labels = np.concatenate([data, labels.reshape(-1, 1)], axis=1)
     data_labels = np.sort(data_labels, axis=0)
 
-    cluster_means = {label: np.mean(data[labels == label, 0]) for label in np.unique(labels)}
+    cluster_means = {label: np.mean(data_labels[data_labels[:, 1] == label, 0]) for label in np.unique(labels)}
     label_max = None
     label_min = None
     heading_ranges = []
@@ -95,15 +95,14 @@ def bucket_headings(line_heights, num_levels=settings.HEADING_LEVEL_COUNT):
     return heading_ranges
 
 
-def infer_heading_levels(pages: List[Page]):
+def infer_heading_levels(pages: List[Page], height_tol=.99):
     all_line_heights = []
     for page in pages:
         for block in page.blocks:
             if block.block_type not in ["Title", "Section-header"]:
                 continue
 
-            block_heights = [min(l.height, l.width) for l in block.lines] # Account for rotation
-            all_line_heights.extend(block_heights)
+            all_line_heights.extend([l.height for l in block.lines])
 
     heading_ranges = bucket_headings(all_line_heights)
 
@@ -112,11 +111,11 @@ def infer_heading_levels(pages: List[Page]):
             if block.block_type not in ["Title", "Section-header"]:
                 continue
 
-            block_heights = [min(l.height, l.width) for l in block.lines] # Account for rotation
+            block_heights = [l.height for l in block.lines]
             avg_height = sum(block_heights) / len(block_heights)
             for idx, (min_height, max_height) in enumerate(heading_ranges):
-                if avg_height >= min_height:
-                    block.heading_level = len(heading_ranges) - idx
+                if avg_height >= min_height * height_tol:
+                    block.heading_level = idx + 1
                     break
 
             if block.heading_level is None:
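The headings change above replaces the count-down level assignment with a direct index mapping plus a small tolerance. A standalone toy sketch of the new mapping, assuming `heading_ranges` is ordered from the largest line heights to the smallest (the helper and sample numbers are illustrative, not marker's actual code):

```python
def assign_level(avg_height, heading_ranges, height_tol=0.99):
    # heading_ranges is assumed sorted from largest to smallest heights,
    # so index 0 corresponds to the top-level (largest) heading.
    for idx, (min_height, max_height) in enumerate(heading_ranges):
        if avg_height >= min_height * height_tol:
            return idx + 1  # level 1 = largest heading
    return None


ranges = [(20.0, 24.0), (14.0, 16.0), (10.0, 12.0)]
print(assign_level(21.0, ranges))  # 1
print(assign_level(13.9, ranges))  # 2 - height_tol absorbs small measurement noise
```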
diff --git a/marker/convert.py b/marker/convert.py
index 7915abdb..4e1baf2e 100644
--- a/marker/convert.py
+++ b/marker/convert.py
@@ -10,7 +10,7 @@
 from marker.utils import flush_cuda_memory
 from marker.tables.table import format_tables
-from marker.debug.data import dump_bbox_debug_data
+from marker.debug.data import dump_bbox_debug_data, draw_page_debug_images
 from marker.layout.layout import surya_layout, annotate_block_types
 from marker.layout.order import surya_order, sort_blocks_in_reading_order
 from marker.ocr.lang import replace_langs_with_codes, validate_langs
@@ -108,7 +108,8 @@ def convert_single_pdf(
     annotate_block_types(pages)
 
     # Dump debug data if flags are set
-    dump_bbox_debug_data(doc, fname, pages)
+    draw_page_debug_images(fname, pages)
+    dump_bbox_debug_data(fname, pages)
 
     # Find reading order for blocks
     # Sort blocks by reading order
diff --git a/marker/debug/data.py b/marker/debug/data.py
index 26bb47cd..1dfd3b7a 100644
--- a/marker/debug/data.py
+++ b/marker/debug/data.py
@@ -1,74 +1,62 @@
-import base64
 import json
+import math
 import os
 from typing import List
 
-from marker.pdf.images import render_image
+from marker.debug.render import render_on_image
+from marker.schema.bbox import rescale_bbox
 from marker.schema.page import Page
 from marker.settings import settings
 from PIL import Image
-import io
 
 
-def dump_equation_debug_data(doc, images, converted_spans):
-    if not settings.DEBUG_DATA_FOLDER or settings.DEBUG_LEVEL == 0:
+def draw_page_debug_images(fname, pages: List[Page]):
+    if not settings.DEBUG:
         return
 
-    if len(images) == 0:
-        return
+    # Remove extension from doc name
+    doc_base = os.path.basename(fname).rsplit(".", 1)[0]
 
-    # We attempted one conversion per image
-    assert len(converted_spans) == len(images)
-
-    data_lines = []
-    for idx, (pil_image, converted_span) in enumerate(zip(images, converted_spans)):
-        if converted_span is None:
-            continue
-        # Image is a BytesIO object
-        img_bytes = io.BytesIO()
-        pil_image.save(img_bytes, format="WEBP", lossless=True)
-        b64_image = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
-        data_lines.append({
-            "image": b64_image,
-            "text": converted_span.text,
-            "bbox": converted_span.bbox
-        })
+    debug_folder = os.path.join(settings.DEBUG_DATA_FOLDER, doc_base)
+    os.makedirs(debug_folder, exist_ok=True)
+    for idx, page in enumerate(pages):
+        img_size = (int(math.ceil(page.text_lines.image_bbox[2])), int(math.ceil(page.text_lines.image_bbox[3])))
+        png_image = Image.new("RGB", img_size, color="white")
 
-    # Remove extension from doc name
-    doc_base = os.path.basename(doc.name).rsplit(".", 1)[0]
+        line_bboxes = []
+        line_text = []
+        for block in page.blocks:
+            for line in block.lines:
+                line_bboxes.append(rescale_bbox(page.bbox, page.text_lines.image_bbox, line.bbox))
+                line_text.append(line.prelim_text)
 
-    debug_file = os.path.join(settings.DEBUG_DATA_FOLDER, f"{doc_base}_equations.json")
-    with open(debug_file, "w+") as f:
-        json.dump(data_lines, f)
+        render_on_image(line_bboxes, png_image, labels=line_text, color="black", draw_bbox=False)
+
+        line_bboxes = [line.bbox for line in page.text_lines.bboxes]
+        render_on_image(line_bboxes, png_image, color="blue")
+
+        layout_boxes = [rescale_bbox(page.layout.image_bbox, page.text_lines.image_bbox, box.bbox) for box in page.layout.bboxes]
+        layout_labels = [box.label for box in page.layout.bboxes]
+        render_on_image(layout_boxes, png_image, labels=layout_labels, color="red")
 
-def dump_bbox_debug_data(doc, fname, blocks: List[Page]):
-    if not settings.DEBUG_DATA_FOLDER or settings.DEBUG_LEVEL < 2:
+        debug_file = os.path.join(debug_folder, f"page_{idx}.png")
+        png_image.save(debug_file)
+
+
+def dump_bbox_debug_data(fname, pages: List[Page]):
+    if not settings.DEBUG:
         return
 
     # Remove extension from doc name
-    doc_base = fname.rsplit(".", 1)[0]
+    doc_base = os.path.basename(fname).rsplit(".", 1)[0]
 
     debug_file = os.path.join(settings.DEBUG_DATA_FOLDER, f"{doc_base}_bbox.json")
     debug_data = []
-    for idx, page_blocks in enumerate(blocks):
-        page = doc[idx]
-
-        png_image = render_image(page, dpi=settings.TEXIFY_DPI)
-        width, height = png_image.size
-        max_dimension = 6000
-        if width > max_dimension or height > max_dimension:
-            scaling_factor = min(max_dimension / width, max_dimension / height)
-            png_image = png_image.resize((int(width * scaling_factor), int(height * scaling_factor)), Image.ANTIALIAS)
-
-        img_bytes = io.BytesIO()
-        png_image.save(img_bytes, format="WEBP", lossless=True, quality=100)
-        b64_image = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
-
+    for idx, page_blocks in enumerate(pages):
         page_data = page_blocks.model_dump(exclude=["images", "layout", "text_lines"])
         page_data["layout"] = page_blocks.layout.model_dump(exclude=["segmentation_map"])
        page_data["text_lines"] = page_blocks.text_lines.model_dump(exclude=["heatmap", "affinity_map"])
-        #page_data["image"] = b64_image
         debug_data.append(page_data)
 
     with open(debug_file, "w+") as f:
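The new debug renderer above repeatedly converts boxes between coordinate systems (page coordinates vs. model image coordinates) with `rescale_bbox`. A minimal sketch of what such a rescale does, assuming all boxes are `[x0, y0, x1, y1]` lists (the helper name and numbers are illustrative):

```python
def rescale_bbox_sketch(src_bbox, dst_bbox, bbox):
    # Map a box from the src coordinate space into the dst space, e.g.
    # from PDF page coordinates to model-image pixel coordinates.
    w_scale = (dst_bbox[2] - dst_bbox[0]) / (src_bbox[2] - src_bbox[0])
    h_scale = (dst_bbox[3] - dst_bbox[1]) / (src_bbox[3] - src_bbox[1])
    return [
        dst_bbox[0] + (bbox[0] - src_bbox[0]) * w_scale,
        dst_bbox[1] + (bbox[1] - src_bbox[1]) * h_scale,
        dst_bbox[0] + (bbox[2] - src_bbox[0]) * w_scale,
        dst_bbox[1] + (bbox[3] - src_bbox[1]) * h_scale,
    ]


# A 612x792pt page mapped onto a 1224x1584px model image: a 2x scale.
print(rescale_bbox_sketch([0, 0, 612, 792], [0, 0, 1224, 1584], [100, 50, 300, 80]))
# [200.0, 100.0, 600.0, 160.0]
```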
diff --git a/marker/equations/equations.py b/marker/equations/equations.py
index 6c796e21..9886aa20 100644
--- a/marker/equations/equations.py
+++ b/marker/equations/equations.py
@@ -2,7 +2,6 @@
 from copy import deepcopy
 from typing import List
 
-from marker.debug.data import dump_equation_debug_data
 from marker.equations.inference import get_total_texify_tokens, get_latex_batched
 from marker.pdf.images import render_bbox_image
 from marker.schema.bbox import rescale_bbox
@@ -177,7 +176,4 @@ def replace_equations(doc, pages: List[Page], texify_model, batch_multiplier=1):
         successful_ocr += success_count
         unsuccessful_ocr += fail_count
 
-    # If debug mode is on, dump out conversions for comparison
-    dump_equation_debug_data(doc, images, converted_spans)
-
     return pages, {"successful_ocr": successful_ocr, "unsuccessful_ocr": unsuccessful_ocr, "equations": eq_count}
diff --git a/marker/postprocessors/markdown.py b/marker/postprocessors/markdown.py
index a1743868..bf15c96c 100644
--- a/marker/postprocessors/markdown.py
+++ b/marker/postprocessors/markdown.py
@@ -146,22 +146,22 @@ def merge_lines(blocks: List[List[MergedBlock]]):
     prev_line = None
     block_text = ""
     block_type = ""
-    block_heading_level = None
+    prev_heading_level = None
 
     for idx, page in enumerate(blocks):
         for block in page:
             block_type = block.block_type
-            if block_type != prev_type and prev_type:
+            if (block_type != prev_type and prev_type) or (block.heading_level != prev_heading_level and prev_heading_level):
                 text_blocks.append(
                     FullyMergedBlock(
-                        text=block_surround(block_text, prev_type, block_heading_level),
+                        text=block_surround(block_text, prev_type, prev_heading_level),
                         block_type=prev_type
                     )
                 )
                 block_text = ""
 
             prev_type = block_type
-            block_heading_level = block.heading_level
+            prev_heading_level = block.heading_level
 
             # Join lines in the block together properly
             for i, line in enumerate(block.lines):
                 line_height = line.bbox[3] - line.bbox[1]
@@ -180,7 +180,7 @@ def merge_lines(blocks: List[List[MergedBlock]]):
     # Append the final block
     text_blocks.append(
         FullyMergedBlock(
-            text=block_surround(block_text, prev_type, block_heading_level),
+            text=block_surround(block_text, prev_type, prev_heading_level),
             block_type=block_type
         )
     )
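The `merge_lines` change above means a new output block is now also started when the heading level changes, not only when the block type changes, so consecutive headings at different levels no longer collapse into one heading. A standalone sketch of just the flush condition, with names simplified from the diff:

```python
def should_flush(block_type, prev_type, heading_level, prev_heading_level):
    # Flush the accumulated text when the block type changes (as before)
    # OR when the heading level changes (new in this diff).
    return (block_type != prev_type and bool(prev_type)) or \
           (heading_level != prev_heading_level and bool(prev_heading_level))


# Two Section-header blocks in a row, levels 1 then 2:
# previously merged into one heading, now split into two.
print(should_flush("Section-header", "Section-header", 2, 1))  # True
```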
"unsuccessful_ocr": unsuccessful_ocr, "equations": eq_count} diff --git a/marker/postprocessors/markdown.py b/marker/postprocessors/markdown.py index a1743868..bf15c96c 100644 --- a/marker/postprocessors/markdown.py +++ b/marker/postprocessors/markdown.py @@ -146,22 +146,22 @@ def merge_lines(blocks: List[List[MergedBlock]]): prev_line = None block_text = "" block_type = "" - block_heading_level = None + prev_heading_level = None for idx, page in enumerate(blocks): for block in page: block_type = block.block_type - if block_type != prev_type and prev_type: + if (block_type != prev_type and prev_type) or (block.heading_level != prev_heading_level and prev_heading_level): text_blocks.append( FullyMergedBlock( - text=block_surround(block_text, prev_type, block_heading_level), + text=block_surround(block_text, prev_type, prev_heading_level), block_type=prev_type ) ) block_text = "" prev_type = block_type - block_heading_level = block.heading_level + prev_heading_level = block.heading_level # Join lines in the block together properly for i, line in enumerate(block.lines): line_height = line.bbox[3] - line.bbox[1] @@ -180,7 +180,7 @@ def merge_lines(blocks: List[List[MergedBlock]]): # Append the final block text_blocks.append( FullyMergedBlock( - text=block_surround(block_text, prev_type, block_heading_level), + text=block_surround(block_text, prev_type, prev_heading_level), block_type=block_type ) ) diff --git a/marker/settings.py b/marker/settings.py index 39d6b3e0..2f0a0526 100644 --- a/marker/settings.py +++ b/marker/settings.py @@ -4,6 +4,7 @@ from pydantic import computed_field from pydantic_settings import BaseSettings import torch +import os class Settings(BaseSettings): @@ -12,6 +13,7 @@ class Settings(BaseSettings): IMAGE_DPI: int = 96 # DPI to render images pulled from pdf at EXTRACT_IMAGES: bool = True # Extract images from pdfs and save them PAGINATE_OUTPUT: bool = False # Paginate output markdown + BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) @computed_field @property @@ -84,9 +86,11 @@ def TORCH_DEVICE_MODEL(self) -> str: HEADING_DEFAULT_LEVEL: int = 2 # Debug - DEBUG: bool = False # Enable debug logging - DEBUG_DATA_FOLDER: Optional[str] = None - DEBUG_LEVEL: int = 0 # 0 to 2, 2 means log everything + DEBUG_DATA_FOLDER: str = os.path.join(BASE_DIR, "debug") + DEBUG: bool = False + FONT_DIR: str = os.path.join(BASE_DIR, "static", "fonts") + DEBUG_RENDER_FONT: str = os.path.join(FONT_DIR, "GoNotoCurrent-Regular.ttf") + FONT_DL_BASE: str = "https://github.com/satbyy/go-noto-universal/releases/download/v7.0" @computed_field @property diff --git a/marker/tables/table.py b/marker/tables/table.py index f9b1ebcb..1cb16a12 100644 --- a/marker/tables/table.py +++ b/marker/tables/table.py @@ -62,7 +62,12 @@ def get_table_boxes(pages: List[Page], doc: PdfDocument, fname): out_img_sizes = [] for i in range(len(table_counts)): if i in table_idxs: - text_lines.extend([sel_text_lines.pop(0)] * table_counts[i]) + page_ocred = pages[i].ocr_method is not None + if page_ocred: + # This will force re-detection of cells if the page was ocred (the text lines are not accurate) + text_lines.extend([None] * table_counts[i]) + else: + text_lines.extend([sel_text_lines.pop(0)] * table_counts[i]) out_img_sizes.extend([img_sizes[i]] * table_counts[i]) assert len(table_imgs) == len(table_bboxes) == len(text_lines) == len(out_img_sizes) diff --git a/static/fonts/.gitignore b/static/fonts/.gitignore new file mode 100644 index 00000000..c96a04f0 --- /dev/null +++ 
diff --git a/static/fonts/.gitignore b/static/fonts/.gitignore
new file mode 100644
index 00000000..c96a04f0
--- /dev/null
+++ b/static/fonts/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
\ No newline at end of file
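The `static/fonts` folder is git-ignored because the debug render font is meant to be fetched at runtime from `FONT_DL_BASE`. A hedged sketch of what that download could look like; the `ensure_debug_font` helper and the `requests` usage are illustrative, not marker's actual downloader:

```python
import os

import requests

# Assumed values, mirroring the new settings fields above.
FONT_DL_BASE = "https://github.com/satbyy/go-noto-universal/releases/download/v7.0"
FONT_DIR = os.path.join("static", "fonts")
FONT_NAME = "GoNotoCurrent-Regular.ttf"


def ensure_debug_font():
    # Download the debug render font once, into the git-ignored folder.
    font_path = os.path.join(FONT_DIR, FONT_NAME)
    if not os.path.exists(font_path):
        os.makedirs(FONT_DIR, exist_ok=True)
        resp = requests.get(f"{FONT_DL_BASE}/{FONT_NAME}", timeout=60)
        resp.raise_for_status()
        with open(font_path, "wb") as f:
            f.write(resp.content)
    return font_path
```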