From 298aba3af6e1c2a350731de840644cf6a4c514e3 Mon Sep 17 00:00:00 2001 From: Leon van Wissen Date: Wed, 2 Aug 2023 17:13:13 +0200 Subject: [PATCH 1/3] feat: parse textStyle and tags in line metadata --- pagexml/parser.py | 60 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 44 insertions(+), 16 deletions(-) diff --git a/pagexml/parser.py b/pagexml/parser.py index f5f7c61..a7a9020 100644 --- a/pagexml/parser.py +++ b/pagexml/parser.py @@ -3,7 +3,7 @@ import re from xml.parsers import expat from datetime import datetime -from typing import Generator, List, Dict, Union +from typing import Generator, List, Dict, Union, Iterable import xmltodict from dateutil.parser import parse as date_parse @@ -78,13 +78,13 @@ def parse_text_equiv(text_equiv: dict) -> Union[str, None]: return None -def parse_textline(textline: dict) -> pdm.PageXMLTextLine: +def parse_textline(textline: dict, custom_tags: Iterable = []) -> pdm.PageXMLTextLine: text = parse_text_equiv(textline['TextEquiv']) if 'TextEquiv' in textline else None try: return pdm.PageXMLTextLine( xheight=int(textline['@xheight']) if '@xheight' in textline else None, doc_id=textline['@id'] if '@id' in textline else None, - metadata=parse_custom_metadata(textline) + metadata=parse_custom_metadata(textline, custom_tags) if '@custom' in textline else None, coords=parse_coords(textline['Coords']), @@ -110,8 +110,8 @@ def parse_conf(text_element: dict) -> Union[float, None]: return None -def parse_textline_list(textline_list: list) -> List[pdm.PageXMLTextLine]: - return [parse_textline(textline) for textline in textline_list] +def parse_textline_list(textline_list: list, custom_tags: Iterable = []) -> List[pdm.PageXMLTextLine]: + return [parse_textline(textline, custom_tags) for textline in textline_list] def parse_custom_metadata_element(custom_string: str, custom_field: str) -> Dict[str, str]: @@ -129,7 +129,28 @@ def parse_custom_metadata_element(custom_string: str, custom_field: str) -> Dict return metadata -def parse_custom_metadata(text_element: Dict[str, any]) -> Dict[str, any]: +def parse_custom_metadata_element_list(custom_string: str, custom_field: str) -> List[Dict[str, str]]: + metadata_list = [] + + matches = re.finditer(r'\b(' + custom_field + r') {(.*?)}', custom_string) + + for match in matches: + tag = match.group(1) + metadata = {"type": tag} + structure_parts = match.group(2).strip().split(';') + + for part in structure_parts: + if part == '': + continue + field, value = part.split(':') + metadata[field.strip()] = (value.strip()) + + metadata_list.append(metadata) + + return metadata_list + + +def parse_custom_metadata(text_element: Dict[str, any], custom_tags: Iterable = []) -> Dict[str, any]: """Parse custom metadata, like readingOrder, structure.""" metadata = {} if '@custom' not in text_element: @@ -140,10 +161,15 @@ def parse_custom_metadata(text_element: Dict[str, any]) -> Dict[str, any]: metadata['structure'] = parse_custom_metadata_element(text_element['@custom'], 'structure') if 'type' in metadata['structure']: metadata['type'] = metadata['structure']['type'] + if 'textStyle {' in text_element['@custom']: + metadata['text_style'] = parse_custom_metadata_element_list(text_element['@custom'], 'textStyle') + if custom_tags: + regex_tags = r'(?:' + '|'.join(custom_tags) + r')' + metadata['custom_tags'] = parse_custom_metadata_element_list(text_element['@custom'], regex_tags) return metadata -def parse_textregion(text_region_dict: dict) -> Union[pdm.PageXMLTextRegion, None]: +def parse_textregion(text_region_dict: dict, custom_tags: Iterable = []) -> Union[pdm.PageXMLTextRegion, None]: text_region = pdm.PageXMLTextRegion( doc_id=text_region_dict['@id'] if '@id' in text_region_dict else None, orientation=float(text_region_dict['@orientation']) if '@orientation' in text_region_dict else None, @@ -157,9 +183,9 @@ def parse_textregion(text_region_dict: dict) -> Union[pdm.PageXMLTextRegion, Non text_region.text = parse_text_equiv(text_region_dict[child]) if child == 'TextLine': if isinstance(text_region_dict['TextLine'], list): - text_region.lines = parse_textline_list(text_region_dict['TextLine']) + text_region.lines = parse_textline_list(text_region_dict['TextLine'], custom_tags) else: - text_region.lines = [parse_textline(text_region_dict['TextLine'])] + text_region.lines = [parse_textline(text_region_dict['TextLine'], custom_tags)] text_region.set_as_parent(text_region.lines) if not text_region.coords: text_region.coords = parse_derived_coords(text_region.lines) @@ -167,7 +193,7 @@ def parse_textregion(text_region_dict: dict) -> Union[pdm.PageXMLTextRegion, Non text_region.text_regions = [] if isinstance(text_region_dict['TextRegion'], list) is False: text_region_dict['TextRegion'] = [text_region_dict['TextRegion']] - for tr in parse_textregion_list(text_region_dict['TextRegion']): + for tr in parse_textregion_list(text_region_dict['TextRegion'], custom_tags): if tr is not None: text_region.text_regions.append(tr) text_region.set_as_parent(text_region.text_regions) @@ -180,8 +206,8 @@ def parse_textregion(text_region_dict: dict) -> Union[pdm.PageXMLTextRegion, Non return text_region -def parse_textregion_list(textregion_dict_list: list) -> List[pdm.PageXMLTextRegion]: - return [parse_textregion(textregion_dict) for textregion_dict in textregion_dict_list] +def parse_textregion_list(textregion_dict_list: list, custom_tags: Iterable = []) -> List[pdm.PageXMLTextRegion]: + return [parse_textregion(textregion_dict, custom_tags) for textregion_dict in textregion_dict_list] def parse_page_metadata(metadata_json: dict) -> dict: @@ -233,7 +259,7 @@ def parse_page_reading_order(page_json: dict) -> dict: return reading_order -def parse_pagexml_json(pagexml_file: str, scan_json: dict) -> pdm.PageXMLScan: +def parse_pagexml_json(pagexml_file: str, scan_json: dict, custom_tags: Iterable = []) -> pdm.PageXMLScan: """Parse a JSON/xmltodict representation of a PageXML file and return a PageXMLScan object.""" doc_id = pagexml_file coords, text_regions = None, None @@ -254,7 +280,7 @@ def parse_pagexml_json(pagexml_file: str, scan_json: dict) -> pdm.PageXMLScan: text_regions = [] if isinstance(scan_json['TextRegion'], list) is False: scan_json['TextRegion'] = [scan_json['TextRegion']] - for tr in parse_textregion_list(scan_json['TextRegion']): + for tr in parse_textregion_list(scan_json['TextRegion'], custom_tags=custom_tags): if tr is not None: text_regions.append(tr) if 'ReadingOrder' in scan_json and scan_json['ReadingOrder']: @@ -277,7 +303,7 @@ def read_pagexml_file(pagexml_file: str, encoding: str = 'utf-8') -> str: return fh.read() -def parse_pagexml_file(pagexml_file: str, pagexml_data: Union[str, None] = None, +def parse_pagexml_file(pagexml_file: str, pagexml_data: Union[str, None] = None, custom_tags: Iterable = {}, encoding: str = 'utf-8') -> pdm.PageXMLScan: """Read PageXML from file (or content of file passed separately if read from elsewhere, e.g. tarball) and return a PageXMLScan object. @@ -286,6 +312,8 @@ def parse_pagexml_file(pagexml_file: str, pagexml_data: Union[str, None] = None, :type pagexml_file: str :param pagexml_data: string representation of PageXML document (corresponding to the content of pagexml_file) :type pagexml_data: str + :param custom_tags: list of custom tags to be parsed in the metadata + :type custom_tags: list :param encoding: the encoding of the file (default utf-8) :type encoding: str :return: a pdm.PageXMLScan object @@ -295,7 +323,7 @@ def parse_pagexml_file(pagexml_file: str, pagexml_data: Union[str, None] = None, pagexml_data = read_pagexml_file(pagexml_file, encoding=encoding) scan_json = xmltodict.parse(pagexml_data) try: - scan_doc = parse_pagexml_json(pagexml_file, scan_json) + scan_doc = parse_pagexml_json(pagexml_file, scan_json, custom_tags=custom_tags) except BaseException: print(f'Error parsing file {pagexml_file}') raise From 44c4aa32f40e21a59976805dbe7ca9f453ebdb68 Mon Sep 17 00:00:00 2001 From: Leon van Wissen Date: Thu, 3 Aug 2023 12:54:59 +0200 Subject: [PATCH 2/3] fix: offset and length values as int --- pagexml/parser.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pagexml/parser.py b/pagexml/parser.py index a7a9020..d0c83fa 100644 --- a/pagexml/parser.py +++ b/pagexml/parser.py @@ -143,7 +143,14 @@ def parse_custom_metadata_element_list(custom_string: str, custom_field: str) -> if part == '': continue field, value = part.split(':') - metadata[field.strip()] = (value.strip()) + + field = field.strip() + value = value.strip() + + if field in ('offset', 'length'): + metadata[field] = int(value) + else: + metadata[field] = value metadata_list.append(metadata) From 2a7754aab6feb0276cd41f4c8d37b436306093fd Mon Sep 17 00:00:00 2001 From: Leon van Wissen Date: Thu, 3 Aug 2023 12:55:23 +0200 Subject: [PATCH 3/3] feat: helper function to retrieve entities --- pagexml/helper/pagexml_helper.py | 38 ++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/pagexml/helper/pagexml_helper.py b/pagexml/helper/pagexml_helper.py index 54ef3fd..ad3967a 100644 --- a/pagexml/helper/pagexml_helper.py +++ b/pagexml/helper/pagexml_helper.py @@ -316,6 +316,44 @@ def read_line_format_file(line_format_files: Union[str, List[str]], yield {header: row[hi] if len(row) > hi else None for hi, header in enumerate(headers)} +def get_custom_tags(doc: pdm.PageXMLDoc) -> List[Dict[str, any]]: + """ + Get all custom tags and their textual values from a PageXMLDoc. + + This function assumes that the PageXML document is generated with + input of some `custom_tags` in the parse_pagexml_file function. + This helper retrieves those tags from all TextLines and finds the + corresponding text from their offset and length. It returns a + dictionary with the tag type, the textual value, region and line + id, and the offset and length. + + :param doc: A PageXMLDoc + :type doc: pdm.PageXMLDoc + :return: List of custom tags + :rtype: List[Dict[str, any]] + """ + custom_tags = [] + + for region in doc.text_regions: + for line in region.lines: + for tag_el in line.metadata.get("custom_tags", []): + tag = tag_el["type"] + offset = tag_el["offset"] + length = tag_el["length"] + + value = line.text[offset:offset+length] + + custom_tags.append({ + "type": tag, + "value": value, + "region_id": region.id, + "line_id": line.id, + "offset": offset, + "length": length, + }) + + return custom_tags + class LineIterable: def __init__(self, line_format_files: Union[str, List[str]], headers: List[str] = None):