From 8c47fc5e447719d5962fa32296c8cd8fae8dcc92 Mon Sep 17 00:00:00 2001 From: musicEnfanthen Date: Tue, 28 May 2024 17:11:31 +0200 Subject: [PATCH] fix(utils): improve extraction of item labels and description --- convert_source_description/utils_helper.py | 118 ++++++++++++--------- 1 file changed, 69 insertions(+), 49 deletions(-) diff --git a/convert_source_description/utils_helper.py b/convert_source_description/utils_helper.py index 00bf305..1a5f31d 100644 --- a/convert_source_description/utils_helper.py +++ b/convert_source_description/utils_helper.py @@ -31,10 +31,26 @@ ############################################ # Helper constants ############################################ -SYSTEM_STR = "System" -MEASURE_STR = "T." FOLIO_STR = "Bl." +M_SIGLE = "M " +M_STAR_SIGLE = "M* " +MEASURE_STR = "T." PAGE_STR = "S." +ROWTABLE_SHEET_ID = "SkRT" +STAR_STR = "star" +SYSTEM_STR = "System" + +COLON = ":" +DOT = "." +PARENTHESIS = "(" +SEMICOLON = ";" +SLASH = "/" +STAR = "*" +UNDERSCORE = "_" + +P_TAG = "p" +STRONG_TAG = "strong" +SUP_TAG = "sup" ############################################ @@ -75,8 +91,8 @@ def create_source_description(self, paras: List[Tag]) -> SourceDescription: source_description["id"] = source_id source_description["siglum"] = siglum source_description["siglumAddendum"] = siglum_addendum - source_description["type"] = self._strip_tag(paras[1], "p") or "" - source_description["location"] = self._strip_tag(paras[2], "p") or "" + source_description["type"] = self._strip_tag(paras[1], P_TAG) or "" + source_description["location"] = self._strip_tag(paras[2], P_TAG) or "" source_description["description"] = self._get_description(paras, source_id) return source_description @@ -165,7 +181,7 @@ def strip_tag_and_clean(self, content, tag): Returns: str: The content within the specified tags, with leading and trailing whitespace removed. """ - return self._strip_tag(self._strip_tag(content, tag), "p") + return self._strip_tag(self._strip_tag(content, tag), P_TAG) ############################################ # Helper function: _get_siglum @@ -181,7 +197,7 @@ def _get_siglum(self, paras: List[Tag]) -> Tuple[str, str]: Returns: Tuple[str, str]: A tuple containing the siglum and the siglum addendum. """ - siglum_sup_tag = paras[0].find("sup") or "" + siglum_sup_tag = paras[0].find(SUP_TAG) or "" siglum_addendum = siglum_sup_tag.get_text(strip=True) if siglum_sup_tag else "" if siglum_sup_tag: siglum_sup_tag.extract() @@ -204,7 +220,7 @@ def _get_description(self, paras: List[Tag], source_id: str) -> Description: Description: A dictionary representing the description of the source description. """ description = copy.deepcopy(defaultDescription) - desc = self._strip_tag(paras[3], "p") or "" + desc = self._strip_tag(paras[3], P_TAG) or "" description["desc"].append(desc) # Define labels and corresponding keys in the description dictionary @@ -285,13 +301,14 @@ def _extract_writing_instruments(self, writing_instruments_text: str) -> Writing # Default value for empty writing instruments writing_instruments = {"main": "", "secondary": []} if writing_instruments_text is not None: - stripped_writing_instruments = self._strip_by_delimiter(writing_instruments_text, ";") + stripped_writing_instruments = self._strip_by_delimiter( + writing_instruments_text, SEMICOLON) # Strip . from last main and secondary writing instruments - main = stripped_writing_instruments[0].strip().rstrip(".") + main = stripped_writing_instruments[0].strip().rstrip(DOT) if len(stripped_writing_instruments) > 1: secondary = [ - instr.strip().rstrip(".") + instr.strip().rstrip(DOT) for instr in self._strip_by_delimiter(stripped_writing_instruments[1], ",") ] else: @@ -316,10 +333,10 @@ def _find_siblings(self, sibling_para: Tag, paras: List[Tag]) -> List[Tag]: List[BeautifulSoup.Tag]: A list of all sibling paragraphs. """ # Check if the current paragraph contains a tag - if sibling_para.find("strong"): + if sibling_para.find(STRONG_TAG): return paras # Check if the current paragraph ends with a period - if sibling_para.text.endswith("."): + if sibling_para.text.endswith(DOT): paras.append(sibling_para) return paras # If the current paragraph does not meet the criteria, recursively search the next sibling @@ -459,41 +476,44 @@ def _get_item(self, para: Tag) -> ContentItem: item_label = "" item_link_to = {} item_description = "" - delimiter = "(" # Get content of para with inner tags - para_content = self._strip_tag(para, "p") - stripped_para_content = self._strip_by_delimiter(para_content, delimiter) - - # Get text content of para without inner tags - stripped_para_text = self._strip_by_delimiter(para.text, delimiter) - - if len(stripped_para_content) > 1: - if para_content.find("strong") and ( - stripped_para_text[0].startswith("M ") or stripped_para_text[0].startswith("M* ") - ): - # Extract itemLabel - item_label = stripped_para_text[0].strip() - - # Create itemLinkTo dictionary - sheet_id = item_label.replace(" ", "_").replace(".", "_").replace("*", "star") - complex_id = "".join(sheet_id.split("_")[0:2]).lower() - - item_link_to = {"complexId": complex_id, "sheetId": sheet_id} - - # When there is a slash in the item label, - # it means that we probably have multiple sketch items for a row table. - # In that case, link to 'SkRT' - if item_label.find("/") != -1: - item_link_to["sheetId"] = "SkRT" + para_content = self._strip_tag(para, P_TAG) + + # Check if the paragraph starts with a strong formatted sketch sigle + if para_content.find(STRONG_TAG) and ( + para.text.startswith(M_SIGLE) or para.text.startswith(M_STAR_SIGLE) + ): + # Extract itemLabel + # (Get first part of the text content of para, split by "(" ) + item_label = self._strip_by_delimiter(para.text, PARENTHESIS)[0].strip() + + # Create itemLinkTo dictionary + sheet_id = item_label.replace( + " ", + UNDERSCORE).replace( + DOT, + UNDERSCORE).replace( + STAR, + STAR_STR) + complex_id = "".join(sheet_id.split(UNDERSCORE)[0:2]).lower() + + item_link_to = {"complexId": complex_id, "sheetId": sheet_id} + + # When there is a slash in the item label, + # it means that we probably have multiple sketch items for a row table. + # In that case, link to 'SkRT' + if item_label.find(SLASH) != -1: + item_link_to["sheetId"] = ROWTABLE_SHEET_ID # Extract itemDescription - # (re-add delimiter that was removed in the stripping action above + # (re-add delimiter that gets removed in the stripping action # and remove trailing colon) - item_description = delimiter + stripped_para_content[1].strip().rstrip(":") + item_description = PARENTHESIS + \ + self._strip_by_delimiter(para_content, PARENTHESIS)[1].strip().rstrip(COLON) - elif len(stripped_para_content) == 1: - item_description = stripped_para_content[0].strip().rstrip(":") + else: + item_description = para_content.strip().rstrip(COLON) # Create item object item = copy.deepcopy(defaultContentItem) @@ -566,20 +586,20 @@ def _get_paragraph_content_by_label(self, label: str, paras: List[Tag]) -> str: if content_paragraph is None: return "" - stripped_content = self._strip_tag(content_paragraph, "p") + stripped_content = self._strip_tag(content_paragraph, P_TAG) content = self._strip_by_delimiter(stripped_content, label)[1] - if content.endswith(";"): + if content.endswith(SEMICOLON): # Check for sibling paragraphs that belong to the same content # (separated by semicolons) sibling = content_paragraph.next_sibling - while sibling is not None and sibling.name == "p": - sibling_content = self._strip_tag(sibling, "p") - if sibling_content.endswith("."): + while sibling is not None and sibling.name == P_TAG: + sibling_content = self._strip_tag(sibling, P_TAG) + if sibling_content.endswith(DOT): content += "
" + sibling_content break - if sibling_content.endswith(";"): + if sibling_content.endswith(SEMICOLON): content += "
" + sibling_content else: break @@ -641,7 +661,7 @@ def _get_system_group(self, stripped_para_text: List[str]) -> List[System]: # Extract system label if SYSTEM_STR in para: - stripped_system_text = self._strip_by_delimiter(para, ":") + stripped_system_text = self._strip_by_delimiter(para, COLON) system_label = stripped_system_text[0].replace(SYSTEM_STR, "").strip() system["system"] = system_label @@ -651,7 +671,7 @@ def _get_system_group(self, stripped_para_text: List[str]) -> List[System]: continue if MEASURE_STR in stripped_system_text[1]: - # Remove leading measure string and trailing colon or dot. + # Remove leading measure string and trailing dot or semicolon. measure_label = ( stripped_system_text[1].lstrip(MEASURE_STR).rstrip(".;").strip() )