Skip to content

Commit

Permalink
Fix marker app
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Nov 21, 2024
1 parent a0f0ea6 commit 1575e78
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 33 deletions.
18 changes: 8 additions & 10 deletions marker/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,20 @@ def output_exists(output_dir: str, fname_base: str):
return False


def save_output(rendered: BaseModel, output_dir: str, fname_base: str):
def text_from_rendered(rendered: BaseModel):
if isinstance(rendered, MarkdownOutput):
ext = "md"
text = rendered.markdown
images = rendered.images
return rendered.markdown, "md", rendered.images
elif isinstance(rendered, HTMLOutput):
ext = "html"
text = rendered.html
images = rendered.images
return rendered.html, "html", rendered.images
elif isinstance(rendered, JSONOutput):
ext = "json"
text = rendered.model_dump_json(exclude=["metadata"], indent=2)
images = {}
return rendered.model_dump_json(exclude=["metadata"], indent=2), "json", {}
else:
raise ValueError("Invalid output type")


def save_output(rendered: BaseModel, output_dir: str, fname_base: str):
text, ext, images = text_from_rendered(rendered)

with open(os.path.join(output_dir, f"{fname_base}.{ext}"), "w+") as f:
f.write(text)
with open(os.path.join(output_dir, f"{fname_base}_meta.json"), "w+") as f:
Expand Down
9 changes: 7 additions & 2 deletions marker/processors/ignoretext.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ class IgnoreTextProcessor(BaseProcessor):
"""
block_types = (BlockTypes.Text,)
common_element_threshold = .25
common_element_min_blocks = 3
max_blocks = 1
text_match_threshold = 90

Expand Down Expand Up @@ -50,12 +51,16 @@ def clean_text(text):

def filter_common_elements(self, document, blocks):
# We can't filter if we don't have enough pages to find common elements
if len(blocks) < 3:
if len(blocks) < self.common_element_min_blocks:
return

text = [self.clean_text(b.raw_text(document)) for b in blocks]
counter = Counter(text)
common = [k for k, v in counter.items() if v > len(blocks) * self.common_element_threshold]
common = [
k for k, v in counter.items()
if v > len(blocks) * self.common_element_threshold
and v > self.common_element_min_blocks
]
if len(common) == 0:
return

Expand Down
56 changes: 36 additions & 20 deletions marker_app.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os

os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["IN_STREAMLIT"] = "true"
os.environ["PDFTEXT_CPU_WORKERS"] = "1"
Expand All @@ -7,23 +8,30 @@
import io
import re
import tempfile
from typing import List, Any, Dict
from typing import Any, Dict

import pypdfium2
import streamlit as st

from marker.convert import convert_single_pdf
from marker.models import load_all_models
from surya.languages import CODE_TO_LANGUAGE
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.config.parser import ConfigParser
from marker.output import text_from_rendered

@st.cache_resource()
def load_models():
return load_all_models()
return create_model_dict()


def convert_pdf(fname: str, langs: List[str] | None, max_pages: int | None, ocr_all_pages: bool) -> (str, Dict[str, Any], dict):
full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=max_pages, langs=langs, ocr_all_pages=ocr_all_pages)
return full_text, images, out_meta
def convert_pdf(fname: str, **kwargs) -> (str, Dict[str, Any], dict):
config_parser = ConfigParser(kwargs)
converter = PdfConverter(
config=config_parser.generate_config_dict(),
artifact_dict=model_dict,
processor_list=config_parser.get_processors(),
renderer=config_parser.get_renderer()
)
return converter(fname)


def open_pdf(pdf_file):
Expand All @@ -41,8 +49,9 @@ def img_to_html(img, img_alt):


def markdown_insert_images(markdown, images):
image_tags = re.findall(r'(!\[(?P<image_title>[^\]]+)\]\((?P<image_path>[^\)"\s]+)\s*([^\)]*)\))', markdown)
image_tags = re.findall(r'(!\[(?P<image_title>[^\]]*)\]\((?P<image_path>[^\)"\s]+)\s*([^\)]*)\))', markdown)

print(image_tags)
for image in image_tags:
image_markdown = image[0]
image_alt = image[1]
Expand All @@ -57,7 +66,7 @@ def get_page_image(pdf_file, page_num, dpi=96):
doc = open_pdf(pdf_file)
renderer = doc.render(
pypdfium2.PdfBitmap.to_pil,
page_indices=[page_num - 1],
page_indices=[page_num],
scale=dpi / 72,
)
png = list(renderer)[0]
Expand All @@ -68,13 +77,13 @@ def get_page_image(pdf_file, page_num, dpi=96):
@st.cache_data()
def page_count(pdf_file):
doc = open_pdf(pdf_file)
return len(doc)
return len(doc) - 1


st.set_page_config(layout="wide")
col1, col2 = st.columns([.5, .5])

model_lst = load_models()
model_dict = load_models()


st.markdown("""
Expand All @@ -86,9 +95,6 @@ def page_count(pdf_file):
""")

in_file = st.sidebar.file_uploader("PDF file:", type=["pdf"])
languages = st.sidebar.multiselect("Languages", sorted(list(CODE_TO_LANGUAGE.values())), default=[], max_selections=4, help="Select the languages in the pdf (if known) to improve OCR accuracy. Optional.")
max_pages = st.sidebar.number_input("Max pages to parse", min_value=1, value=10, help="Optional maximum number of pages to convert")
ocr_all_pages = st.sidebar.checkbox("Force OCR on all pages", help="Force OCR on all pages, even if they are images", value=False)

if in_file is None:
st.stop()
Expand All @@ -97,11 +103,14 @@ def page_count(pdf_file):

with col1:
page_count = page_count(in_file)
page_number = st.number_input(f"Page number out of {page_count}:", min_value=1, value=1, max_value=page_count)
page_number = st.number_input(f"Page number out of {page_count}:", min_value=0, value=0, max_value=page_count)
pil_image = get_page_image(in_file, page_number)

st.image(pil_image, caption="PDF file (preview)", use_column_width=True)
st.image(pil_image, caption="PDF file (preview)", use_container_width=True)

page_range = st.sidebar.text_input("Page range to parse, comma separated like 0,5-10,20", value=f"{page_number}-{page_number+1}")
force_ocr = st.sidebar.checkbox("Force OCR on all pages", help="Force OCR on all pages, even if they are images", value=False)
output_format = st.sidebar.selectbox("Output format", ["markdown", "json", "html"], index=0)
run_marker = st.sidebar.button("Run Marker")

if not run_marker:
Expand All @@ -112,8 +121,15 @@ def page_count(pdf_file):
temp_pdf.write(in_file.getvalue())
temp_pdf.seek(0)
filename = temp_pdf.name
md_text, images, out_meta = convert_pdf(filename, languages, max_pages, ocr_all_pages)
md_text = markdown_insert_images(md_text, images)
rendered = convert_pdf(filename, page_range=page_range, force_ocr=force_ocr, output_format=output_format)

text, ext, images = text_from_rendered(rendered)
with col2:
st.markdown(md_text, unsafe_allow_html=True)
if output_format == "markdown":
text = markdown_insert_images(text, images)
st.markdown(text, unsafe_allow_html=True)
elif output_format == "json":
st.json(text)
elif output_format == "html":
st.html(text)

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "marker-pdf"
version = "0.3.10"
version = "1.0.0"
description = "Convert PDF to markdown with high speed and accuracy."
authors = ["Vik Paruchuri <[email protected]>"]
readme = "README.md"
Expand Down

0 comments on commit 1575e78

Please sign in to comment.