Skip to content

Commit

Permalink
Merge pull request #11 from LvanWissen/master
Browse files Browse the repository at this point in the history
  • Loading branch information
brambg authored Sep 27, 2023
2 parents e121c92 + 2a7754a commit 91f3170
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 16 deletions.
38 changes: 38 additions & 0 deletions pagexml/helper/pagexml_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,44 @@ def read_line_format_file(line_format_files: Union[str, List[str]],
yield {header: row[hi] if len(row) > hi else None for hi, header in enumerate(headers)}


def get_custom_tags(doc: pdm.PageXMLDoc) -> List[Dict[str, any]]:
"""
Get all custom tags and their textual values from a PageXMLDoc.
This function assumes that the PageXML document is generated with
input of some `custom_tags` in the parse_pagexml_file function.
This helper retrieves those tags from all TextLines and finds the
corresponding text from their offset and length. It returns a
dictionary with the tag type, the textual value, region and line
id, and the offset and length.
:param doc: A PageXMLDoc
:type doc: pdm.PageXMLDoc
:return: List of custom tags
:rtype: List[Dict[str, any]]
"""
custom_tags = []

for region in doc.text_regions:
for line in region.lines:
for tag_el in line.metadata.get("custom_tags", []):
tag = tag_el["type"]
offset = tag_el["offset"]
length = tag_el["length"]

value = line.text[offset:offset+length]

custom_tags.append({
"type": tag,
"value": value,
"region_id": region.id,
"line_id": line.id,
"offset": offset,
"length": length,
})

return custom_tags

class LineIterable:

def __init__(self, line_format_files: Union[str, List[str]], headers: List[str] = None):
Expand Down
67 changes: 51 additions & 16 deletions pagexml/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import re
from xml.parsers import expat
from datetime import datetime
from typing import Generator, List, Dict, Union
from typing import Generator, List, Dict, Union, Iterable

import xmltodict
from dateutil.parser import parse as date_parse
Expand Down Expand Up @@ -78,13 +78,13 @@ def parse_text_equiv(text_equiv: dict) -> Union[str, None]:
return None


def parse_textline(textline: dict) -> pdm.PageXMLTextLine:
def parse_textline(textline: dict, custom_tags: Iterable = []) -> pdm.PageXMLTextLine:
text = parse_text_equiv(textline['TextEquiv']) if 'TextEquiv' in textline else None
try:
return pdm.PageXMLTextLine(
xheight=int(textline['@xheight']) if '@xheight' in textline else None,
doc_id=textline['@id'] if '@id' in textline else None,
metadata=parse_custom_metadata(textline)
metadata=parse_custom_metadata(textline, custom_tags)
if '@custom' in textline
else None,
coords=parse_coords(textline['Coords']),
Expand All @@ -110,8 +110,8 @@ def parse_conf(text_element: dict) -> Union[float, None]:
return None


def parse_textline_list(textline_list: list) -> List[pdm.PageXMLTextLine]:
return [parse_textline(textline) for textline in textline_list]
def parse_textline_list(textline_list: list, custom_tags: Iterable = []) -> List[pdm.PageXMLTextLine]:
return [parse_textline(textline, custom_tags) for textline in textline_list]


def parse_custom_metadata_element(custom_string: str, custom_field: str) -> Dict[str, str]:
Expand All @@ -129,7 +129,35 @@ def parse_custom_metadata_element(custom_string: str, custom_field: str) -> Dict
return metadata


def parse_custom_metadata(text_element: Dict[str, any]) -> Dict[str, any]:
def parse_custom_metadata_element_list(custom_string: str, custom_field: str) -> List[Dict[str, str]]:
metadata_list = []

matches = re.finditer(r'\b(' + custom_field + r') {(.*?)}', custom_string)

for match in matches:
tag = match.group(1)
metadata = {"type": tag}
structure_parts = match.group(2).strip().split(';')

for part in structure_parts:
if part == '':
continue
field, value = part.split(':')

field = field.strip()
value = value.strip()

if field in ('offset', 'length'):
metadata[field] = int(value)
else:
metadata[field] = value

metadata_list.append(metadata)

return metadata_list


def parse_custom_metadata(text_element: Dict[str, any], custom_tags: Iterable = []) -> Dict[str, any]:
"""Parse custom metadata, like readingOrder, structure."""
metadata = {}
if '@custom' not in text_element:
Expand All @@ -140,10 +168,15 @@ def parse_custom_metadata(text_element: Dict[str, any]) -> Dict[str, any]:
metadata['structure'] = parse_custom_metadata_element(text_element['@custom'], 'structure')
if 'type' in metadata['structure']:
metadata['type'] = metadata['structure']['type']
if 'textStyle {' in text_element['@custom']:
metadata['text_style'] = parse_custom_metadata_element_list(text_element['@custom'], 'textStyle')
if custom_tags:
regex_tags = r'(?:' + '|'.join(custom_tags) + r')'
metadata['custom_tags'] = parse_custom_metadata_element_list(text_element['@custom'], regex_tags)
return metadata


def parse_textregion(text_region_dict: dict) -> Union[pdm.PageXMLTextRegion, None]:
def parse_textregion(text_region_dict: dict, custom_tags: Iterable = []) -> Union[pdm.PageXMLTextRegion, None]:
text_region = pdm.PageXMLTextRegion(
doc_id=text_region_dict['@id'] if '@id' in text_region_dict else None,
orientation=float(text_region_dict['@orientation']) if '@orientation' in text_region_dict else None,
Expand All @@ -157,17 +190,17 @@ def parse_textregion(text_region_dict: dict) -> Union[pdm.PageXMLTextRegion, Non
text_region.text = parse_text_equiv(text_region_dict[child])
if child == 'TextLine':
if isinstance(text_region_dict['TextLine'], list):
text_region.lines = parse_textline_list(text_region_dict['TextLine'])
text_region.lines = parse_textline_list(text_region_dict['TextLine'], custom_tags)
else:
text_region.lines = [parse_textline(text_region_dict['TextLine'])]
text_region.lines = [parse_textline(text_region_dict['TextLine'], custom_tags)]
text_region.set_as_parent(text_region.lines)
if not text_region.coords:
text_region.coords = parse_derived_coords(text_region.lines)
if child == 'TextRegion':
text_region.text_regions = []
if isinstance(text_region_dict['TextRegion'], list) is False:
text_region_dict['TextRegion'] = [text_region_dict['TextRegion']]
for tr in parse_textregion_list(text_region_dict['TextRegion']):
for tr in parse_textregion_list(text_region_dict['TextRegion'], custom_tags):
if tr is not None:
text_region.text_regions.append(tr)
text_region.set_as_parent(text_region.text_regions)
Expand All @@ -180,8 +213,8 @@ def parse_textregion(text_region_dict: dict) -> Union[pdm.PageXMLTextRegion, Non
return text_region


def parse_textregion_list(textregion_dict_list: list) -> List[pdm.PageXMLTextRegion]:
return [parse_textregion(textregion_dict) for textregion_dict in textregion_dict_list]
def parse_textregion_list(textregion_dict_list: list, custom_tags: Iterable = []) -> List[pdm.PageXMLTextRegion]:
return [parse_textregion(textregion_dict, custom_tags) for textregion_dict in textregion_dict_list]


def parse_page_metadata(metadata_json: dict) -> dict:
Expand Down Expand Up @@ -233,7 +266,7 @@ def parse_page_reading_order(page_json: dict) -> dict:
return reading_order


def parse_pagexml_json(pagexml_file: str, scan_json: dict) -> pdm.PageXMLScan:
def parse_pagexml_json(pagexml_file: str, scan_json: dict, custom_tags: Iterable = []) -> pdm.PageXMLScan:
"""Parse a JSON/xmltodict representation of a PageXML file and return a PageXMLScan object."""
doc_id = pagexml_file
coords, text_regions = None, None
Expand All @@ -254,7 +287,7 @@ def parse_pagexml_json(pagexml_file: str, scan_json: dict) -> pdm.PageXMLScan:
text_regions = []
if isinstance(scan_json['TextRegion'], list) is False:
scan_json['TextRegion'] = [scan_json['TextRegion']]
for tr in parse_textregion_list(scan_json['TextRegion']):
for tr in parse_textregion_list(scan_json['TextRegion'], custom_tags=custom_tags):
if tr is not None:
text_regions.append(tr)
if 'ReadingOrder' in scan_json and scan_json['ReadingOrder']:
Expand All @@ -277,7 +310,7 @@ def read_pagexml_file(pagexml_file: str, encoding: str = 'utf-8') -> str:
return fh.read()


def parse_pagexml_file(pagexml_file: str, pagexml_data: Union[str, None] = None,
def parse_pagexml_file(pagexml_file: str, pagexml_data: Union[str, None] = None, custom_tags: Iterable = {},
encoding: str = 'utf-8') -> pdm.PageXMLScan:
"""Read PageXML from file (or content of file passed separately if read from elsewhere,
e.g. tarball) and return a PageXMLScan object.
Expand All @@ -286,6 +319,8 @@ def parse_pagexml_file(pagexml_file: str, pagexml_data: Union[str, None] = None,
:type pagexml_file: str
:param pagexml_data: string representation of PageXML document (corresponding to the content of pagexml_file)
:type pagexml_data: str
:param custom_tags: list of custom tags to be parsed in the metadata
:type custom_tags: list
:param encoding: the encoding of the file (default utf-8)
:type encoding: str
:return: a pdm.PageXMLScan object
Expand All @@ -295,7 +330,7 @@ def parse_pagexml_file(pagexml_file: str, pagexml_data: Union[str, None] = None,
pagexml_data = read_pagexml_file(pagexml_file, encoding=encoding)
scan_json = xmltodict.parse(pagexml_data)
try:
scan_doc = parse_pagexml_json(pagexml_file, scan_json)
scan_doc = parse_pagexml_json(pagexml_file, scan_json, custom_tags=custom_tags)
except BaseException:
print(f'Error parsing file {pagexml_file}')
raise
Expand Down

0 comments on commit 91f3170

Please sign in to comment.