Skip to content

Commit

Permalink
Review comments
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Nov 21, 2024
1 parent 78fd0a7 commit 0000792
Show file tree
Hide file tree
Showing 7 changed files with 189 additions and 183 deletions.
4 changes: 2 additions & 2 deletions marker/processors/line_numbers.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,14 @@ def __call__(self, document: Document):
if block.structure is None:
continue

all_lines = [page.get_block(line_id) for line_id in block.structure]
all_lines = block.structure_blocks(document)
if len(all_lines) < self.min_lines_in_block:
continue

starts_with_number = []
ends_with_number = []
for line in all_lines:
spans = [page.get_block(span_id) for span_id in line.structure]
spans = line.structure_blocks(document)
if len(spans) < 2:
starts_with_number.append(False)
ends_with_number.append(False)
Expand Down
4 changes: 2 additions & 2 deletions marker/processors/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def __call__(self, document: Document):
if next_block.structure is None: # This is odd though, why do we have text blocks with no structure?
continue

new_block_lines = [page.get_block(block_id) for block_id in next_block.structure]
new_block_lines = next_block.structure_blocks(document)
else: # page break
next_page = document.get_next_page(page)
if next_page is None:
Expand Down Expand Up @@ -80,7 +80,7 @@ def __call__(self, document: Document):
min_x = math.ceil(min([l.polygon.x_start for l in new_block_lines]))
next_block_starts_indented = new_block_lines[0].polygon.x_start > min_x

lines: List[Line] = [page.get_block(block_id) for block_id in block.structure]
lines: List[Line] = block.structure_blocks(document)
max_x = math.floor(max([l.polygon.x_end for l in lines]))
last_line_is_full_width = lines[-1].polygon.x_end >= max_x

Expand Down
4 changes: 4 additions & 0 deletions marker/schema/blocks/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ def id(self) -> BlockId:
block_type=self.block_type
)

def structure_blocks(self, document_page) -> List[Block]:
    """
    Resolve the ids in ``self.structure`` into block objects.

    Args:
        document_page: Any object exposing ``get_block(block_id)``. The
            processors touched by this change pass the ``Document``.

    Returns:
        One looked-up block per id, in the same order as ``self.structure``.
        An empty list is returned when ``self.structure`` is None, mirroring
        how ``contained_blocks`` treats a block with no structure.
    """
    # ``structure`` stays None until ``add_structure`` is first called, so
    # guard here instead of raising TypeError when iterating None.
    if self.structure is None:
        return []

    return [document_page.get_block(block_id) for block_id in self.structure]

def add_structure(self, block: Block):
if self.structure is None:
self.structure = [block.id]
Expand Down Expand Up @@ -128,6 +131,7 @@ def assign_section_hierarchy(self, section_hierarchy):
def contained_blocks(self, document: Document, block_types: Sequence[BlockTypes] = None):
if self.structure is None:
return []

blocks = []
for block_id in self.structure:
block = document.get_block(block_id)
Expand Down
322 changes: 161 additions & 161 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ ftfy = "^6.1.1"
texify = "^0.2.1"
rapidfuzz = "^3.8.1"
surya-ocr = { git = "https://github.com/VikParuchuri/surya.git", branch = "layout2" }
filetype = "^1.2.0"
regex = "^2024.4.28"
pdftext = "^0.3.19"
tabled-pdf = { git = "https://github.com/VikParuchuri/tabled.git", branch = "dev-mose/compilation-updates" }
Expand Down
18 changes: 1 addition & 17 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,6 @@
from marker.models import setup_detection_model, setup_layout_model, \
setup_recognition_model, setup_table_rec_model, \
setup_texify_model
from marker.processors.code import CodeProcessor
from marker.processors.debug import DebugProcessor
from marker.processors.document_toc import DocumentTOCProcessor
from marker.processors.equation import EquationProcessor
from marker.processors.sectionheader import SectionHeaderProcessor
from marker.processors.table import TableProcessor
from marker.processors.text import TextProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import Block
from marker.renderers.markdown import MarkdownRenderer
Expand Down Expand Up @@ -111,18 +104,9 @@ def pdf_converter(request, config, layout_model, texify_model, recognition_model
"table_rec_model": table_rec_model,
"detection_model": detection_model
}
processor_list = [
EquationProcessor,
TableProcessor,
SectionHeaderProcessor,
TextProcessor,
CodeProcessor,
DocumentTOCProcessor,
DebugProcessor,
]
yield PdfConverter(
artifact_dict=model_dict,
processor_list=classes_to_strings(processor_list),
processor_list=None,
renderer=classes_to_strings([renderer])[0],
config=config
)
Expand Down
19 changes: 19 additions & 0 deletions tests/processors/test_ignoretext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import pytest

from marker.processors.ignoretext import IgnoreTextProcessor
from marker.schema import BlockTypes


@pytest.mark.filename("bio_pdf.pdf")
@pytest.mark.config({"page_range": list(range(6))})
@pytest.mark.skip(reason="Need to wait for layout model to stabilize before activating.")
def test_ignoretext_processor(pdf_document):
    """
    Run IgnoreTextProcessor over the document and check the page-0 header.

    The first Text block on page 0 must contain "bioRxiv", and its first
    Span must be flagged with ``ignore_for_output``.
    """
    processor = IgnoreTextProcessor()
    processor(pdf_document)

    # First Text block on the first page is expected to be the running header.
    page0_header = pdf_document.pages[0].contained_blocks(pdf_document, [BlockTypes.Text])[0]
    assert "bioRxiv" in page0_header.raw_text(pdf_document)

    # No debugger hook here: a stray breakpoint() would stall or fail an
    # unattended test run as soon as the skip marker is removed.
    first_span = page0_header.contained_blocks(pdf_document, [BlockTypes.Span])[0]
    assert first_span.ignore_for_output is True

0 comments on commit 0000792

Please sign in to comment.