Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Top-level reading order #23

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.7', '3.8', '3.9', '3.10']
# 3.7 fails because ocrd discontinued support for Python 3.7
python-version: ['3.8', '3.9', '3.10']

steps:
- uses: actions/checkout@v3
Expand Down
102 changes: 45 additions & 57 deletions tests/test_workspace.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from pathlib import Path
from os import chdir
from difflib import unified_diff
from unittest import TestCase, skip, main
from tempfile import NamedTemporaryFile
from pytest import fixture
from ocrd_utils import pushd_popd
from ocrd import Resolver
from ocrd_models.ocrd_page import parseEtree
from ocrd_models.constants import NAMESPACES as NS
from lxml import etree as ET
Expand All @@ -11,60 +12,47 @@

THIS_DIR = Path(__file__).resolve().parent

@fixture
def workspace_path(tmpdir):
workspace = str(THIS_DIR / "workspace" / "mets.xml")
workspace = Resolver().workspace_from_url(workspace, dst_dir=tmpdir, download=True)
with pushd_popd(tmpdir):
yield tmpdir

class TestConvertTextract(TestCase):
def setUp(self):
workspace = THIS_DIR / "workspace"
chdir(str(workspace))
def test_api(workspace_path, tmpdir):
test_path_dict = [
{
"aws": Path("textract_responses") / f"{filename.name.split('.', 1)[0]}.json",
"img": Path("images") / filename.name,
"xml": Path("reference_page_xml") / f"{filename.name.split('.', 1)[0]}.xml",
}
for filename in Path("images").iterdir()
]
for path in test_path_dict:
_, target_tree, _, _ = parseEtree(path["xml"], silence=True)
convert_file(str(path["aws"]), str(path["img"]), str(tmpdir/path["xml"]))
_, result_tree, _, _ = parseEtree(tmpdir/path["xml"], silence=True)
# remove elements bearing dates (Created, LastChange, Creator/Version)
for meta in target_tree.xpath(
"/page:PcGts/page:Metadata/*",
namespaces=NS,
) + result_tree.xpath(
"/page:PcGts/page:Metadata/*",
namespaces=NS,
):
meta.getparent().remove(meta)
# remove img path from Page element

self.test_path_dict = [
{
"aws": Path("textract_responses")
/ f"{filename.name.split('.', 1)[0]}.json",
"img": Path("images") / filename.name,
"xml": Path("reference_page_xml")
/ f"{filename.name.split('.', 1)[0]}.xml",
}
for filename in (workspace / "images").iterdir()
]
print(self.test_path_dict)

def test_api(self):
for path in self.test_path_dict:
print(path)
_, target_tree, _, _ = parseEtree(path["xml"], silence=True)
with NamedTemporaryFile() as out:
convert_file(str(path["aws"]), str(path["img"]), out.name)
_, result_tree, _, _ = parseEtree(out.name, silence=True)
# remove elements bearing dates (Created, LastChange, Creator/Version)
for meta in target_tree.xpath(
"/pc:PcGts/pc:Metadata/*",
namespaces={
"pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
},
) + result_tree.xpath(
"/pc:PcGts/pc:Metadata/*",
namespaces={
"pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
},
):
meta.getparent().remove(meta)
# remove img path from Page element

res_img_path_elem = result_tree.find(
"pc:Page",
namespaces={
"pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
},
)
del res_img_path_elem.attrib["imageFilename"]
tar_img_path_elem = target_tree.find(
"pc:Page",
namespaces={
"pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
},
)
del tar_img_path_elem.attrib["imageFilename"]
target_xml = ET.tostring(target_tree, pretty_print=True, encoding='UTF-8').decode('utf-8')
result_xml = ET.tostring(result_tree, pretty_print=True, encoding='UTF-8').decode('utf-8')
assert target_xml == result_xml
res_img_path_elem = result_tree.find(
"page:Page",
namespaces=NS,
)
del res_img_path_elem.attrib["imageFilename"]
tar_img_path_elem = target_tree.find(
"page:Page",
namespaces=NS,
)
del tar_img_path_elem.attrib["imageFilename"]
target_xml = ET.tostring(target_tree, pretty_print=True, encoding='UTF-8').decode('utf-8')
result_xml = ET.tostring(result_tree, pretty_print=True, encoding='UTF-8').decode('utf-8')
assert result_xml == target_xml, path
Binary file added tests/workspace/images/sn1991-01-03_0001.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/workspace/images/sn1991-02-09_pr_0002.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
60 changes: 56 additions & 4 deletions tests/workspace/mets.xml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,12 @@
<mets:file ID="OCR-D-IMG_nowa_doba" MIMETYPE="image/jpeg">
<mets:FLocat xlink:href="images/nowa_doba.jpg" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="OCR-D-IMG_sn1991-02-09_pr_0002" MIMETYPE="image/jpeg">
<mets:FLocat xlink:href="images/sn1991-02-09_pr_0002.jpg" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="OCR-D-IMG_sn1991-01-03_0001" MIMETYPE="image/jpeg">
<mets:FLocat xlink:href="images/sn1991-01-03_0001.jpg" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
</mets:fileGrp>
<mets:fileGrp USE="OCR-D-SEG-PAGE">
<mets:file ID="OCR-D-SEG-PAGE_f18xx-Missio-EMU-0042" MIMETYPE="application/vnd.prima.page+xml">
Expand All @@ -49,41 +55,87 @@
<mets:file ID="OCR-D-SEG-PAGE_Lodz_UZS_25_0056" MIMETYPE="application/vnd.prima.page+xml">
<mets:FLocat xlink:href="reference_page_xml/Lodz_UZS_25_0056.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="OCR-D-SEG-PAGE_nd1969-01-21_03" MIMETYPE="application/vnd.prima.page+xml">
<mets:FLocat xlink:href="reference_page_xml/nd1969-01-21_03.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
<mets:file ID="OCR-D-SEG-PAGE_nd1969-01-21_3" MIMETYPE="application/vnd.prima.page+xml">
<mets:FLocat xlink:href="reference_page_xml/nd1969-01-21_3.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="OCR-D-SEG-PAGE_nowa_doba" MIMETYPE="application/vnd.prima.page+xml">
<mets:FLocat xlink:href="reference_page_xml/nowa_doba.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="OCR-D-SEG-PAGE_sn1991-02-09_pr_0002" MIMETYPE="application/vnd.prima.page+xml">
<mets:FLocat xlink:href="reference_page_xml/sn1991-02-09_pr_0002.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="OCR-D-SEG-PAGE_sn1991-01-03_0001" MIMETYPE="application/vnd.prima.page+xml">
<mets:FLocat xlink:href="reference_page_xml/sn1991-01-03_0001.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
</mets:fileGrp>
<mets:fileGrp USE="AWS">
<mets:file ID="AWS_18xx-Missio-EMU-0042" MIMETYPE="application/json">
<mets:FLocat xlink:href="textract_responses/18xx-Missio-EMU-0042.json" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="AWS_Ansiedlung_Korotschin_UZS_Sign_22a_0018" MIMETYPE="application/json">
<mets:FLocat xlink:href="textract_responses/Ansiedlung_Korotschin_UZS_Sign_22a_0018.json" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="AWS_Ansiedlung_WD_Wielun_Lentschütz_0053" MIMETYPE="application/json">
<mets:FLocat xlink:href="textract_responses/Ansiedlung_WD_Wielun_Lentschütz_0053.json" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="AWS_Lodz_UZS_25_0056" MIMETYPE="application/json">
<mets:FLocat xlink:href="textract_responses/Lodz_UZS_25_0056.json" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="AWS_nd1969-01-21_3" MIMETYPE="application/json">
<mets:FLocat xlink:href="textract_responses/nd1969-01-21_3.json" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="AWS_nowa_doba" MIMETYPE="application/json">
<mets:FLocat xlink:href="textract_responses/nowa_doba.json" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="AWS_sn1991-02-09_pr_0002" MIMETYPE="application/json">
<mets:FLocat xlink:href="textract_responses/sn1991-02-09_pr_0002.json" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="AWS_sn1991-01-03_0001" MIMETYPE="application/json">
<mets:FLocat xlink:href="textract_responses/sn1991-01-03_0001.json" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
</mets:fileGrp>
</mets:fileSec>
<mets:structMap TYPE="PHYSICAL">
<mets:div TYPE="physSequence">
<mets:div TYPE="page" ID="f18xx-Missio-EMU-0042">
<mets:fptr FILEID="OCR-D-IMG_f18xx-Missio-EMU-0042"/>
<mets:fptr FILEID="OCR-D-SEG-PAGE_f18xx-Missio-EMU-0042"/>
<mets:fptr FILEID="AWS_18xx-Missio-EMU-0042"/>
</mets:div>
<mets:div TYPE="page" ID="Ansiedlung_Korotschin_UZS_Sign_22a_0018">
<mets:fptr FILEID="OCR-D-IMG_Ansiedlung_Korotschin_UZS_Sign_22a_0018"/>
<mets:fptr FILEID="OCR-D-SEG-PAGE_Ansiedlung_Korotschin_UZS_Sign_22a_0018"/>
<mets:fptr FILEID="AWS_Ansiedlung_Korotschin_UZS_Sign_22a_0018"/>
</mets:div>
<mets:div TYPE="page" ID="Ansiedlung_WD_Wielun_Lentschütz_0053">
<mets:fptr FILEID="OCR-D-IMG_Ansiedlung_WD_Wielun_Lentschütz_0053"/>
<mets:fptr FILEID="OCR-D-SEG-PAGE_Ansiedlung_WD_Wielun_Lentschütz_0053"/>
<mets:fptr FILEID="AWS_Ansiedlung_WD_Wielun_Lentschütz_0053"/>
</mets:div>
<mets:div TYPE="page" ID="Lodz_UZS_25_0056">
<mets:fptr FILEID="OCR-D-IMG_Lodz_UZS_25_0056"/>
<mets:fptr FILEID="OCR-D-SEG-PAGE_Lodz_UZS_25_0056"/>
<mets:fptr FILEID="AWS_Lodz_UZS_25_0056"/>
</mets:div>
<mets:div TYPE="page" ID="nd1969-01-21_3">
<mets:fptr FILEID="OCR-D-IMG_nd1969-01-21_3"/>
<mets:fptr FILEID="OCR-D-SEG-PAGE_nd1969-01-21_3"/>
<mets:fptr FILEID="AWS_nd1969-01-21_3"/>
</mets:div>
<mets:div TYPE="page" ID="nowa_doba">
<mets:fptr FILEID="OCR-D-IMG_nowa_doba"/>
<mets:fptr FILEID="OCR-D-SEG-PAGE_nowa_doba"/>
<mets:fptr FILEID="AWS_nowa_doba"/>
</mets:div>
<mets:div TYPE="page" ID="sn1991-02-09_pr_0002">
<mets:fptr FILEID="OCR-D-IMG_sn1991-02-09_pr_0002"/>
<mets:fptr FILEID="OCR-D-SEG-PAGE_sn1991-02-09_pr_0002"/>
<mets:fptr FILEID="AWS_sn1991-02-09_pr_0002"/>
</mets:div>
<mets:div TYPE="page" ID="nd1969-01-21_03">
<mets:fptr FILEID="OCR-D-SEG-PAGE_nd1969-01-21_03"/>
<mets:div TYPE="page" ID="sn1991-01-03_0001">
<mets:fptr FILEID="OCR-D-IMG_sn1991-01-03_0001"/>
<mets:fptr FILEID="OCR-D-SEG-PAGE_sn1991-01-03_0001"/>
<mets:fptr FILEID="AWS_sn1991-01-03_0001"/>
</mets:div>
</mets:div>
</mets:structMap>
Expand Down
Loading
Loading