Skip to content

Commit

Permalink
Merge pull request #6 from explosion/fix/bounding-boxes
Browse files Browse the repository at this point in the history
Fix bounding boxes for bottom left origin
  • Loading branch information
ines authored Nov 20, 2024
2 parents 6276911 + e679a06 commit 45ff613
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 19 deletions.
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[metadata]
version = 0.0.3
version = 0.0.4
description = Use spaCy with PDFs, Word docs and other documents
url = https://github.com/explosion/spacy-layout
author = Explosion
Expand Down
45 changes: 27 additions & 18 deletions spacy_layout/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.document_converter import ConversionResult, DocumentConverter, FormatOption
from docling_core.types.doc.base import CoordOrigin
from docling_core.types.doc.labels import DocItemLabel
from spacy.language import Language
from spacy.tokens import Doc, Span, SpanGroup
Expand Down Expand Up @@ -61,31 +62,39 @@ def _get_source(self, source: str | Path | bytes) -> str | Path | DocumentStream

def _result_to_doc(self, result: ConversionResult) -> Doc:
    """Build a spaCy Doc, with layout attributes, from a conversion result.

    Every non-empty text item in ``result.document.texts`` becomes one
    ``(text, label, bounding_box)`` input for ``self._texts_to_doc``. The
    bounding box is expressed relative to the top-left corner of its page;
    it is ``None`` when the item has no provenance, when its page is not
    part of ``result.pages``, or when the page size is unknown (zero).

    Args:
        result: The conversion result holding the pages and the document.

    Returns:
        The Doc produced by ``self._texts_to_doc``, with the page layouts
        stored under the ``self.attrs.doc_layout`` extension attribute.
    """
    # Pages are keyed by ``page.page_no + 1`` and looked up below with
    # ``prov.page_no``.
    # NOTE(review): this assumes ``page.page_no`` is zero-based while
    # ``prov.page_no`` is one-based - confirm against docling.
    pages = {
        (page.page_no + 1): PageLayout(
            page_no=page.page_no + 1,
            width=page.size.width if page.size else 0,
            height=page.size.height if page.size else 0,
        )
        for page in result.pages
    }
    inputs = []
    for item in result.document.texts:
        if item.text == "":
            continue
        bounding_box = None
        if item.prov:
            # Only the first provenance entry is used for the layout.
            prov = item.prov[0]
            # A provenance entry may name a page that was not converted;
            # treat that like an unknown page size instead of raising.
            page = pages.get(prov.page_no)
            if page is not None and page.width and page.height:
                box = prov.bbox
                if box.coord_origin == CoordOrigin.TOPLEFT:
                    # t is already the distance from the top of the page.
                    y = box.t
                    height = box.b - box.t
                else:
                    # Bottom-left origin: t and b are measured upwards from
                    # the bottom edge (so t >= b). Flip the top edge against
                    # the page height and keep the height positive.
                    y = page.height - box.t
                    height = box.t - box.b
                bounding_box = SpanLayout(
                    x=box.l,
                    y=y,
                    width=box.r - box.l,
                    height=height,
                    page_no=prov.page_no,
                )
        inputs.append((item.text, item.label, bounding_box))
    doc = self._texts_to_doc(inputs)
    doc._.set(self.attrs.doc_layout, DocLayout(pages=list(pages.values())))
    return doc

def _texts_to_doc(self, inputs: list[tuple[str, str, SpanLayout]]) -> Doc:
Expand Down

0 comments on commit 45ff613

Please sign in to comment.