-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Testcase and fix for get_image_paths returning non images sometimes
Fixes #51 False warning about number of images per grp/page
- Loading branch information
Showing
13 changed files
with
4,337 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
282 changes: 282 additions & 0 deletions
282
tests/example/workspaces/kant_aufklaerung_1784_bin/OCR-D-GT-ALTO/PAGE_0017_ALTO.xml
Large diffs are not rendered by default.
Oops, something went wrong.
358 changes: 358 additions & 0 deletions
358
tests/example/workspaces/kant_aufklaerung_1784_bin/OCR-D-GT-ALTO/PAGE_0020_ALTO.xml
Large diffs are not rendered by default.
Oops, something went wrong.
1,428 changes: 1,428 additions & 0 deletions
1,428
tests/example/workspaces/kant_aufklaerung_1784_bin/OCR-D-GT-PAGE/PAGE_0017_PAGE.xml
Large diffs are not rendered by default.
Oops, something went wrong.
2,134 changes: 2,134 additions & 0 deletions
2,134
tests/example/workspaces/kant_aufklaerung_1784_bin/OCR-D-GT-PAGE/PAGE_0020_PAGE.xml
Large diffs are not rendered by default.
Oops, something went wrong.
Binary file added
BIN
+137 KB
...rkspaces/kant_aufklaerung_1784_bin/OCR-D-IMG-BIN/OCR-D-IMG-BIN_0001.IMG-BIN.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
19 changes: 19 additions & 0 deletions
19
tests/example/workspaces/kant_aufklaerung_1784_bin/OCR-D-IMG-BIN/OCR-D-IMG-BIN_0001.xml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<pc:PcGts xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd" pcGtsId="OCR-D-IMG-BIN_0001"> | ||
<pc:Metadata> | ||
<pc:Creator>OCR-D/core 2.38.0</pc:Creator> | ||
<pc:Created>2022-10-23T22:26:03.882709</pc:Created> | ||
<pc:LastChange>2022-10-23T22:26:03.882709</pc:LastChange> | ||
<pc:MetadataItem type="processingStep" name="preprocessing/optimization/binarization" value="ocrd-doxa-binarize"> | ||
<pc:Labels externalModel="ocrd-tool" externalId="parameters"> | ||
<pc:Label value="0" type="dpi"/> | ||
<pc:Label value="page" type="level-of-operation"/> | ||
<pc:Label value="ISauvola" type="algorithm"/> | ||
<pc:Label value="{}" type="parameters"/> | ||
</pc:Labels> | ||
</pc:MetadataItem> | ||
</pc:Metadata> | ||
<pc:Page imageFilename="OCR-D-IMG/INPUT_0017.tif" imageWidth="1457" imageHeight="2083"> | ||
<pc:AlternativeImage filename="OCR-D-IMG-BIN/OCR-D-IMG-BIN_0001.IMG-BIN.png" comments=",binarized"/> | ||
</pc:Page> | ||
</pc:PcGts> |
Binary file added
BIN
+142 KB
...rkspaces/kant_aufklaerung_1784_bin/OCR-D-IMG-BIN/OCR-D-IMG-BIN_0002.IMG-BIN.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
19 changes: 19 additions & 0 deletions
19
tests/example/workspaces/kant_aufklaerung_1784_bin/OCR-D-IMG-BIN/OCR-D-IMG-BIN_0002.xml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<pc:PcGts xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd" pcGtsId="OCR-D-IMG-BIN_0002"> | ||
<pc:Metadata> | ||
<pc:Creator>OCR-D/core 2.38.0</pc:Creator> | ||
<pc:Created>2022-10-23T22:26:04.307376</pc:Created> | ||
<pc:LastChange>2022-10-23T22:26:04.307376</pc:LastChange> | ||
<pc:MetadataItem type="processingStep" name="preprocessing/optimization/binarization" value="ocrd-doxa-binarize"> | ||
<pc:Labels externalModel="ocrd-tool" externalId="parameters"> | ||
<pc:Label value="0" type="dpi"/> | ||
<pc:Label value="page" type="level-of-operation"/> | ||
<pc:Label value="ISauvola" type="algorithm"/> | ||
<pc:Label value="{}" type="parameters"/> | ||
</pc:Labels> | ||
</pc:MetadataItem> | ||
</pc:Metadata> | ||
<pc:Page imageFilename="OCR-D-IMG/INPUT_0020.tif" imageWidth="1457" imageHeight="2084"> | ||
<pc:AlternativeImage filename="OCR-D-IMG-BIN/OCR-D-IMG-BIN_0002.IMG-BIN.png" comments=",binarized"/> | ||
</pc:Page> | ||
</pc:PcGts> |
Binary file added
BIN
+1.91 MB
tests/example/workspaces/kant_aufklaerung_1784_bin/OCR-D-IMG/INPUT_0017.tif
Binary file not shown.
Binary file added
BIN
+1.98 MB
tests/example/workspaces/kant_aufklaerung_1784_bin/OCR-D-IMG/INPUT_0020.tif
Binary file not shown.
83 changes: 83 additions & 0 deletions
83
tests/example/workspaces/kant_aufklaerung_1784_bin/mets.xml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<mets:mets xmlns:mets="http://www.loc.gov/METS/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xlink="http://www.w3.org/1999/xlink" xsi:schemaLocation="info:lc/xmlns/premis-v2 http://www.loc.gov/standards/premis/v2/premis-v2-0.xsd http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-6.xsd http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/version17/mets.v1-7.xsd http://www.loc.gov/mix/v10 http://www.loc.gov/standards/mix/mix10/mix10.xsd"> | ||
<mets:metsHdr CREATEDATE="2017-11-30T16:18:26"> | ||
<mets:agent OTHERTYPE="SOFTWARE" ROLE="CREATOR" TYPE="OTHER"> | ||
<mets:name>DFG-Koordinierungsprojekt zur Weiterentwicklung von Verfahren der Optical Character Recognition (OCR-D)</mets:name> | ||
<mets:note>OCR-D</mets:note> | ||
</mets:agent> | ||
<mets:agent TYPE="OTHER" OTHERTYPE="SOFTWARE" ROLE="OTHER" OTHERROLE="preprocessing/optimization/binarization"> | ||
<mets:name>ocrd-doxa-binarize v0.0.2</mets:name> | ||
<mets:note xmlns:ocrd="https://ocr-d.de" ocrd:option="input-file-grp">OCR-D-IMG</mets:note> | ||
<mets:note xmlns:ocrd="https://ocr-d.de" ocrd:option="output-file-grp">OCR-D-IMG-BIN</mets:note> | ||
<mets:note xmlns:ocrd="https://ocr-d.de" ocrd:option="parameter">{"dpi": 0, "level-of-operation": "page", "algorithm": "ISauvola", "parameters": {}}</mets:note> | ||
<mets:note xmlns:ocrd="https://ocr-d.de" ocrd:option="page-id"/> | ||
</mets:agent> | ||
</mets:metsHdr> | ||
<mets:dmdSec ID="DMDLOG_0001"> | ||
<mets:mdWrap MDTYPE="MODS"> | ||
<mets:xmlData> | ||
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3"> | ||
<mods:identifier type="purl">http://kant_aufklaerung_1784</mods:identifier> | ||
</mods:mods> | ||
</mets:xmlData> | ||
</mets:mdWrap> | ||
</mets:dmdSec> | ||
<mets:fileSec> | ||
<mets:fileGrp USE="OCR-D-IMG"> | ||
<mets:file ID="INPUT_0017" MIMETYPE="image/tiff"> | ||
<mets:FLocat LOCTYPE="OTHER" xlink:href="OCR-D-IMG/INPUT_0017.tif" OTHERLOCTYPE="FILE"/> | ||
</mets:file> | ||
<mets:file ID="INPUT_0020" MIMETYPE="image/tiff"> | ||
<mets:FLocat LOCTYPE="OTHER" xlink:href="OCR-D-IMG/INPUT_0020.tif" OTHERLOCTYPE="FILE"/> | ||
</mets:file> | ||
</mets:fileGrp> | ||
<mets:fileGrp USE="OCR-D-GT-PAGE"> | ||
<mets:file ID="PAGE_0017_PAGE" MIMETYPE="application/vnd.prima.page+xml"> | ||
<mets:FLocat LOCTYPE="OTHER" xlink:href="OCR-D-GT-PAGE/PAGE_0017_PAGE.xml" OTHERLOCTYPE="FILE"/> | ||
</mets:file> | ||
<mets:file ID="PAGE_0020_PAGE" MIMETYPE="application/vnd.prima.page+xml"> | ||
<mets:FLocat LOCTYPE="OTHER" xlink:href="OCR-D-GT-PAGE/PAGE_0020_PAGE.xml" OTHERLOCTYPE="FILE"/> | ||
</mets:file> | ||
</mets:fileGrp> | ||
<mets:fileGrp USE="OCR-D-GT-ALTO"> | ||
<mets:file ID="PAGE_0017_ALTO" MIMETYPE="application/alto+xml"> | ||
<mets:FLocat LOCTYPE="OTHER" xlink:href="OCR-D-GT-ALTO/PAGE_0017_ALTO.xml" OTHERLOCTYPE="FILE"/> | ||
</mets:file> | ||
<mets:file ID="PAGE_0020_ALTO" MIMETYPE="application/alto+xml"> | ||
<mets:FLocat LOCTYPE="OTHER" xlink:href="OCR-D-GT-ALTO/PAGE_0020_ALTO.xml" OTHERLOCTYPE="FILE"/> | ||
</mets:file> | ||
</mets:fileGrp> | ||
<mets:fileGrp USE="OCR-D-IMG-BIN"> | ||
<mets:file ID="OCR-D-IMG-BIN_0001.IMG-BIN" MIMETYPE="image/png"> | ||
<mets:FLocat LOCTYPE="OTHER" OTHERLOCTYPE="FILE" xlink:href="OCR-D-IMG-BIN/OCR-D-IMG-BIN_0001.IMG-BIN.png"/> | ||
</mets:file> | ||
<mets:file ID="OCR-D-IMG-BIN_0001" MIMETYPE="application/vnd.prima.page+xml"> | ||
<mets:FLocat LOCTYPE="OTHER" OTHERLOCTYPE="FILE" xlink:href="OCR-D-IMG-BIN/OCR-D-IMG-BIN_0001.xml"/> | ||
</mets:file> | ||
<mets:file ID="OCR-D-IMG-BIN_0002.IMG-BIN" MIMETYPE="image/png"> | ||
<mets:FLocat LOCTYPE="OTHER" OTHERLOCTYPE="FILE" xlink:href="OCR-D-IMG-BIN/OCR-D-IMG-BIN_0002.IMG-BIN.png"/> | ||
</mets:file> | ||
<mets:file ID="OCR-D-IMG-BIN_0002" MIMETYPE="application/vnd.prima.page+xml"> | ||
<mets:FLocat LOCTYPE="OTHER" OTHERLOCTYPE="FILE" xlink:href="OCR-D-IMG-BIN/OCR-D-IMG-BIN_0002.xml"/> | ||
</mets:file> | ||
</mets:fileGrp> | ||
</mets:fileSec> | ||
<mets:structMap TYPE="PHYSICAL"> | ||
<mets:div TYPE="physSequence"> | ||
<mets:div TYPE="page" ID="PHYS_0017"> | ||
<mets:fptr FILEID="INPUT_0017"/> | ||
<mets:fptr FILEID="PAGE_0017_PAGE"/> | ||
<mets:fptr FILEID="PAGE_0017_ALTO"/> | ||
<mets:fptr FILEID="OCR-D-IMG-BIN_0001.IMG-BIN"/> | ||
<mets:fptr FILEID="OCR-D-IMG-BIN_0001"/> | ||
</mets:div> | ||
<mets:div TYPE="page" ID="PHYS_0020"> | ||
<mets:fptr FILEID="INPUT_0020"/> | ||
<mets:fptr FILEID="PAGE_0020_PAGE"/> | ||
<mets:fptr FILEID="PAGE_0020_ALTO"/> | ||
<mets:fptr FILEID="OCR-D-IMG-BIN_0002.IMG-BIN"/> | ||
<mets:fptr FILEID="OCR-D-IMG-BIN_0002"/> | ||
</mets:div> | ||
</mets:div> | ||
</mets:structMap> | ||
</mets:mets> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters