Skip to content

Commit

Permalink
Test case and fix for get_image_paths sometimes returning non-image files
Browse files Browse the repository at this point in the history
Fixes #51: false warning about the number of images per fileGrp/page
  • Loading branch information
hnesk committed Oct 23, 2022
1 parent bdf70c3 commit e2f799b
Show file tree
Hide file tree
Showing 13 changed files with 4,337 additions and 5 deletions.
9 changes: 4 additions & 5 deletions ocrd_browser/model/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ def get_file_index(self) -> Dict[str, OcrdFile]:

return file_index

def get_image_paths(self, file_group: str) -> Dict[str, Path]:
def get_image_paths(self, file_group: FileGroupHandle) -> Dict[str, Path]:
"""
Builds a Dict ID->Path for all page_ids fast
Expand All @@ -256,12 +256,11 @@ def get_image_paths(self, file_group: str) -> Dict[str, Path]:
image_paths = {}
file_index = self.get_file_index()
for page_id in self.page_ids:
images = [image for image in file_index.values() if
image.static_page_id == page_id and image.fileGrp == file_group]
if len(images) == 1:
images = [image for image in file_index.values() if image.static_page_id == page_id and file_group.match(image)]
if len(images) > 0:
image_paths[page_id] = self.directory.joinpath(images[0].local_filename)
else:
log.warning('Found %d images for PAGE %s and fileGrp %s, expected 1', len(images), page_id, file_group)
log.warning('Found no images for PAGE %s and fileGrp %s', page_id, file_group)
image_paths[page_id] = None
return image_paths

Expand Down

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
<?xml version="1.0" encoding="UTF-8"?>
<pc:PcGts xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd" pcGtsId="OCR-D-IMG-BIN_0001">
<pc:Metadata>
<pc:Creator>OCR-D/core 2.38.0</pc:Creator>
<pc:Created>2022-10-23T22:26:03.882709</pc:Created>
<pc:LastChange>2022-10-23T22:26:03.882709</pc:LastChange>
<pc:MetadataItem type="processingStep" name="preprocessing/optimization/binarization" value="ocrd-doxa-binarize">
<pc:Labels externalModel="ocrd-tool" externalId="parameters">
<pc:Label value="0" type="dpi"/>
<pc:Label value="page" type="level-of-operation"/>
<pc:Label value="ISauvola" type="algorithm"/>
<pc:Label value="{}" type="parameters"/>
</pc:Labels>
</pc:MetadataItem>
</pc:Metadata>
<pc:Page imageFilename="OCR-D-IMG/INPUT_0017.tif" imageWidth="1457" imageHeight="2083">
<pc:AlternativeImage filename="OCR-D-IMG-BIN/OCR-D-IMG-BIN_0001.IMG-BIN.png" comments=",binarized"/>
</pc:Page>
</pc:PcGts>
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
<?xml version="1.0" encoding="UTF-8"?>
<pc:PcGts xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd" pcGtsId="OCR-D-IMG-BIN_0002">
<pc:Metadata>
<pc:Creator>OCR-D/core 2.38.0</pc:Creator>
<pc:Created>2022-10-23T22:26:04.307376</pc:Created>
<pc:LastChange>2022-10-23T22:26:04.307376</pc:LastChange>
<pc:MetadataItem type="processingStep" name="preprocessing/optimization/binarization" value="ocrd-doxa-binarize">
<pc:Labels externalModel="ocrd-tool" externalId="parameters">
<pc:Label value="0" type="dpi"/>
<pc:Label value="page" type="level-of-operation"/>
<pc:Label value="ISauvola" type="algorithm"/>
<pc:Label value="{}" type="parameters"/>
</pc:Labels>
</pc:MetadataItem>
</pc:Metadata>
<pc:Page imageFilename="OCR-D-IMG/INPUT_0020.tif" imageWidth="1457" imageHeight="2084">
<pc:AlternativeImage filename="OCR-D-IMG-BIN/OCR-D-IMG-BIN_0002.IMG-BIN.png" comments=",binarized"/>
</pc:Page>
</pc:PcGts>
Binary file not shown.
Binary file not shown.
83 changes: 83 additions & 0 deletions tests/example/workspaces/kant_aufklaerung_1784_bin/mets.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
<?xml version="1.0" encoding="UTF-8"?>
<mets:mets xmlns:mets="http://www.loc.gov/METS/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xlink="http://www.w3.org/1999/xlink" xsi:schemaLocation="info:lc/xmlns/premis-v2 http://www.loc.gov/standards/premis/v2/premis-v2-0.xsd http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-6.xsd http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/version17/mets.v1-7.xsd http://www.loc.gov/mix/v10 http://www.loc.gov/standards/mix/mix10/mix10.xsd">
<mets:metsHdr CREATEDATE="2017-11-30T16:18:26">
<mets:agent OTHERTYPE="SOFTWARE" ROLE="CREATOR" TYPE="OTHER">
<mets:name>DFG-Koordinierungsprojekt zur Weiterentwicklung von Verfahren der Optical Character Recognition (OCR-D)</mets:name>
<mets:note>OCR-D</mets:note>
</mets:agent>
<mets:agent TYPE="OTHER" OTHERTYPE="SOFTWARE" ROLE="OTHER" OTHERROLE="preprocessing/optimization/binarization">
<mets:name>ocrd-doxa-binarize v0.0.2</mets:name>
<mets:note xmlns:ocrd="https://ocr-d.de" ocrd:option="input-file-grp">OCR-D-IMG</mets:note>
<mets:note xmlns:ocrd="https://ocr-d.de" ocrd:option="output-file-grp">OCR-D-IMG-BIN</mets:note>
<mets:note xmlns:ocrd="https://ocr-d.de" ocrd:option="parameter">{"dpi": 0, "level-of-operation": "page", "algorithm": "ISauvola", "parameters": {}}</mets:note>
<mets:note xmlns:ocrd="https://ocr-d.de" ocrd:option="page-id"/>
</mets:agent>
</mets:metsHdr>
<mets:dmdSec ID="DMDLOG_0001">
<mets:mdWrap MDTYPE="MODS">
<mets:xmlData>
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
<mods:identifier type="purl">http://kant_aufklaerung_1784</mods:identifier>
</mods:mods>
</mets:xmlData>
</mets:mdWrap>
</mets:dmdSec>
<mets:fileSec>
<mets:fileGrp USE="OCR-D-IMG">
<mets:file ID="INPUT_0017" MIMETYPE="image/tiff">
<mets:FLocat LOCTYPE="OTHER" xlink:href="OCR-D-IMG/INPUT_0017.tif" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="INPUT_0020" MIMETYPE="image/tiff">
<mets:FLocat LOCTYPE="OTHER" xlink:href="OCR-D-IMG/INPUT_0020.tif" OTHERLOCTYPE="FILE"/>
</mets:file>
</mets:fileGrp>
<mets:fileGrp USE="OCR-D-GT-PAGE">
<mets:file ID="PAGE_0017_PAGE" MIMETYPE="application/vnd.prima.page+xml">
<mets:FLocat LOCTYPE="OTHER" xlink:href="OCR-D-GT-PAGE/PAGE_0017_PAGE.xml" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="PAGE_0020_PAGE" MIMETYPE="application/vnd.prima.page+xml">
<mets:FLocat LOCTYPE="OTHER" xlink:href="OCR-D-GT-PAGE/PAGE_0020_PAGE.xml" OTHERLOCTYPE="FILE"/>
</mets:file>
</mets:fileGrp>
<mets:fileGrp USE="OCR-D-GT-ALTO">
<mets:file ID="PAGE_0017_ALTO" MIMETYPE="application/alto+xml">
<mets:FLocat LOCTYPE="OTHER" xlink:href="OCR-D-GT-ALTO/PAGE_0017_ALTO.xml" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="PAGE_0020_ALTO" MIMETYPE="application/alto+xml">
<mets:FLocat LOCTYPE="OTHER" xlink:href="OCR-D-GT-ALTO/PAGE_0020_ALTO.xml" OTHERLOCTYPE="FILE"/>
</mets:file>
</mets:fileGrp>
<mets:fileGrp USE="OCR-D-IMG-BIN">
<mets:file ID="OCR-D-IMG-BIN_0001.IMG-BIN" MIMETYPE="image/png">
<mets:FLocat LOCTYPE="OTHER" OTHERLOCTYPE="FILE" xlink:href="OCR-D-IMG-BIN/OCR-D-IMG-BIN_0001.IMG-BIN.png"/>
</mets:file>
<mets:file ID="OCR-D-IMG-BIN_0001" MIMETYPE="application/vnd.prima.page+xml">
<mets:FLocat LOCTYPE="OTHER" OTHERLOCTYPE="FILE" xlink:href="OCR-D-IMG-BIN/OCR-D-IMG-BIN_0001.xml"/>
</mets:file>
<mets:file ID="OCR-D-IMG-BIN_0002.IMG-BIN" MIMETYPE="image/png">
<mets:FLocat LOCTYPE="OTHER" OTHERLOCTYPE="FILE" xlink:href="OCR-D-IMG-BIN/OCR-D-IMG-BIN_0002.IMG-BIN.png"/>
</mets:file>
<mets:file ID="OCR-D-IMG-BIN_0002" MIMETYPE="application/vnd.prima.page+xml">
<mets:FLocat LOCTYPE="OTHER" OTHERLOCTYPE="FILE" xlink:href="OCR-D-IMG-BIN/OCR-D-IMG-BIN_0002.xml"/>
</mets:file>
</mets:fileGrp>
</mets:fileSec>
<mets:structMap TYPE="PHYSICAL">
<mets:div TYPE="physSequence">
<mets:div TYPE="page" ID="PHYS_0017">
<mets:fptr FILEID="INPUT_0017"/>
<mets:fptr FILEID="PAGE_0017_PAGE"/>
<mets:fptr FILEID="PAGE_0017_ALTO"/>
<mets:fptr FILEID="OCR-D-IMG-BIN_0001.IMG-BIN"/>
<mets:fptr FILEID="OCR-D-IMG-BIN_0001"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0020">
<mets:fptr FILEID="INPUT_0020"/>
<mets:fptr FILEID="PAGE_0020_PAGE"/>
<mets:fptr FILEID="PAGE_0020_ALTO"/>
<mets:fptr FILEID="OCR-D-IMG-BIN_0002.IMG-BIN"/>
<mets:fptr FILEID="OCR-D-IMG-BIN_0002"/>
</mets:div>
</mets:div>
</mets:structMap>
</mets:mets>
10 changes: 10 additions & 0 deletions tests/model/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,16 @@ def test_get_image_paths(self):
self.assertEqual('INPUT_0017.tif', image_paths['PHYS_0017'].name)
self.assertEqual('INPUT_0020.tif', image_paths['PHYS_0020'].name)

def test_get_image_paths_only_returns_matching_groups(self):
    """
    Regression test for https://github.com/hnesk/browse-ocrd/issues/51

    Loads the kant_aufklaerung_1784_bin example workspace and requests the
    image paths for the 'OCR-D-IMG-BIN' group together with 'image/png'.
    Exactly one entry per page is expected, each pointing at the PNG file.
    """
    document = Document.load(ASSETS_PATH / '../example/workspaces/kant_aufklaerung_1784_bin/mets.xml')
    handle = FileGroupHandle('OCR-D-IMG-BIN', 'image/png')
    paths_by_page = document.get_image_paths(handle)

    # One path per physical page of the workspace.
    self.assertEqual(2, len(paths_by_page))

    # Page ID -> file name that must be resolved for that page.
    expected_names = {
        'PHYS_0017': 'OCR-D-IMG-BIN_0001.IMG-BIN.png',
        'PHYS_0020': 'OCR-D-IMG-BIN_0002.IMG-BIN.png',
    }
    for page_id, file_name in expected_names.items():
        self.assertEqual(file_name, paths_by_page[page_id].name)

def test_get_default_image_group(self):
doc = Document.load(ASSETS_PATH / 'kant_aufklaerung_1784-complex/data/mets.xml')
file_group = doc.get_default_image_group(['OCR-D-IMG-BIN', 'OCR-D-IMG.*'])
Expand Down

0 comments on commit e2f799b

Please sign in to comment.