Skip to content

Commit

Permalink
update tests
Browse files (browse the repository at this point in the history)
  • Loading branch information
rue-a committed Apr 15, 2024
1 parent c23a8ed commit b41709a
Show file tree
Hide file tree
Showing 8 changed files with 4,072 additions and 4,142 deletions.
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,4 @@ build
dist
*~
.vscode
ocrd.log
test_workspace
ocrd.log
68 changes: 0 additions & 68 deletions mets.xml

This file was deleted.

1,263 changes: 631 additions & 632 deletions tests/workspace/reference_page_xml/18xx-Missio-EMU-0042.xml

Large diffs are not rendered by default.

1,065 changes: 532 additions & 533 deletions tests/workspace/reference_page_xml/Ansiedlung_Korotschin_UZS_Sign_22a_0018.xml

Large diffs are not rendered by default.

Large diffs are not rendered by default.

1,045 changes: 522 additions & 523 deletions tests/workspace/reference_page_xml/Lodz_UZS_25_0056.xml

Large diffs are not rendered by default.

4,267 changes: 2,135 additions & 2,132 deletions tests/workspace/reference_page_xml/nowa_doba.xml

Large diffs are not rendered by default.

62 changes: 31 additions & 31 deletions textract2page/convert_aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,18 @@


text_type_map: Final = {"PRINTED": "printed", "HANDWRITING": "handwritten-cursive"}
layout_type_map: Final = {
"LAYOUT_TITLE": "heading",
"LAYOUT_HEADER": "header",
"LAYOUT_FOOTER": "footer",
"LAYOUT_SECTION_HEADER": "heading",
"LAYOUT_PAGE_NUMBER": "page-number",
"LAYOUT_LIST": "other",
"LAYOUT_FIGURE": "other",
"LAYOUT_TABLE": "other",
"LAYOUT_KEY_VALUE_SET": "other",
"LAYOUT_TEXT": "paragraph",
}


class TextractGeometry(ABC):
Expand Down Expand Up @@ -113,7 +125,7 @@ class TextractBlock(ABC):
def __init__(self, aws_block: Dict) -> None:
self.id = aws_block.get("Id")
self.geometry = build_aws_geometry(aws_block.get("Geometry"))
self.confidence = float(aws_block.get("Confidence"))
self.confidence = float(aws_block.get("Confidence")) / 100


class TextractLayout(TextractBlock):
Expand Down Expand Up @@ -141,18 +153,7 @@ def __init__(
) -> None:
super().__init__(aws_block=aws_layout_block)
# Textract layout types -> Page layout types
layout_type_map: Final = {
"LAYOUT_TITLE": "heading",
"LAYOUT_HEADER": "header",
"LAYOUT_FOOTER": "footer",
"LAYOUT_SECTION_HEADER": "heading",
"LAYOUT_PAGE_NUMBER": "page-number",
"LAYOUT_LIST": "other",
"LAYOUT_FIGURE": "other",
"LAYOUT_TABLE": "other",
"LAYOUT_KEY_VALUE_SET": "other",
"LAYOUT_TEXT": "paragraph",
}

self.page_layout_type = layout_type_map.get(aws_layout_block["BlockType"])
self.textract_layout_type = aws_layout_block["BlockType"]

Expand Down Expand Up @@ -686,15 +687,18 @@ def convert_file(
# --------------------------------------------------------------------------
# build PRIMAPageXML
pil_img = Image.open(img_path)
img_width = pil_img.width
img_height = pil_img.height
pil_img.close()
now = datetime.now()
page_content_type = PcGtsType(
Metadata=MetadataType(
Creator="OCR-D/core %s" % VERSION, Created=now, LastChange=now
)
)
pagexml_page = PageType(
imageWidth=pil_img.width,
imageHeight=pil_img.height,
imageWidth=img_width,
imageHeight=img_height,
imageFilename=img_path,
)
page_content_type.set_Page(pagexml_page)
Expand Down Expand Up @@ -744,7 +748,7 @@ def convert_file(
pagexml_text_region_line = TextRegionType(
Coords=CoordsType(
points=points_from_aws_geometry(
line.geometry, pil_img.width, pil_img.height
line.geometry, img_width, img_height
)
),
id=line_region_id,
Expand All @@ -765,7 +769,7 @@ def convert_file(
pagexml_text_line = TextLineType(
Coords=CoordsType(
points=points_from_aws_geometry(
line.geometry, pil_img.width, pil_img.height
line.geometry, img_width, img_height
)
),
id=f"line-{line_id}",
Expand All @@ -781,7 +785,7 @@ def convert_file(
pagexml_word = WordType(
Coords=CoordsType(
points=points_from_aws_geometry(
word.geometry, pil_img.width, pil_img.height
word.geometry, img_width, img_height
)
),
id=f"word-{word.id}",
Expand All @@ -796,10 +800,10 @@ def convert_file(
for layout in layouts:
# ignore layout_type: other
if layout.textract_layout_type == "LAYOUT_FIGURE":
pagexml_text_region = ImageRegionType(
pagexml_img_region = ImageRegionType(
Coords=CoordsType(
points=points_from_aws_geometry(
layout.geometry, pil_img.width, pil_img.height
layout.geometry, img_width, img_height
)
),
id=f"layout-image-region-{layout.id}",
Expand All @@ -815,9 +819,7 @@ def convert_file(

pagexml_text_region = TextRegionType(
Coords=CoordsType(
points=points_from_aws_geometry(
layout.geometry, pil_img.width, pil_img.height
)
points=points_from_aws_geometry(layout.geometry, img_width, img_height)
),
id=f"layout-text-region-{layout.id}",
type_=layout.page_layout_type,
Expand All @@ -839,7 +841,7 @@ def convert_file(
pagexml_text_line = TextLineType(
Coords=CoordsType(
points=points_from_aws_geometry(
line.geometry, pil_img.width, pil_img.height
line.geometry, img_width, img_height
)
),
id=f"line-{line.id}",
Expand All @@ -855,7 +857,7 @@ def convert_file(
pagexml_word = WordType(
Coords=CoordsType(
points=points_from_aws_geometry(
word.geometry, pil_img.width, pil_img.height
word.geometry, img_width, img_height
)
),
id=f"word-{word.id}",
Expand All @@ -873,9 +875,7 @@ def convert_file(

pagexml_table_region = TableRegionType(
Coords=CoordsType(
points=points_from_aws_geometry(
table.geometry, pil_img.width, pil_img.height
)
points=points_from_aws_geometry(table.geometry, img_width, img_height)
),
id=f"table-region-{table_id}",
rows=table.rows,
Expand All @@ -898,7 +898,7 @@ def convert_file(
pagexml_cell_region = TextRegionType(
Coords=CoordsType(
points=points_from_aws_geometry(
cell.geometry, pil_img.width, pil_img.height
cell.geometry, img_width, img_height
)
),
id=cell_region_id,
Expand Down Expand Up @@ -935,7 +935,7 @@ def convert_file(
pagexml_text_line = TextLineType(
Coords=CoordsType(
points=points_from_aws_geometry(
line.geometry, pil_img.width, pil_img.height
line.geometry, img_width, img_height
)
),
id=f"line-{line.id}-{cell.row_index}-{cell.column_index}",
Expand All @@ -951,7 +951,7 @@ def convert_file(
pagexml_word = WordType(
Coords=CoordsType(
points=points_from_aws_geometry(
word.geometry, pil_img.width, pil_img.height
word.geometry, img_width, img_height
)
),
id=f"word-{word.id}-{cell.row_index}-{cell.column_index}",
Expand Down

0 comments on commit b41709a

Please sign in to comment.