From 7810f8cf62a693d96137b4b46f2ac134ee40a0d9 Mon Sep 17 00:00:00 2001 From: musicEnfanthen Date: Tue, 25 Jun 2024 15:01:09 +0200 Subject: [PATCH] feat(utils): apply updated source desc model with arrays --- convert_source_description/default_objects.py | 16 ++++---- convert_source_description/typed_classes.py | 16 ++++---- convert_source_description/utils_helper.py | 41 +++++++++++-------- 3 files changed, 40 insertions(+), 33 deletions(-) diff --git a/convert_source_description/default_objects.py b/convert_source_description/default_objects.py index 29eb761..ced120e 100644 --- a/convert_source_description/default_objects.py +++ b/convert_source_description/default_objects.py @@ -20,18 +20,18 @@ defaultDescription: Description = { "desc": [], - "writingMaterialString": "", + "writingMaterialStrings": [], "writingInstruments": { "main": "", "secondary": [] }, - "title": "", - "date": "", - "pagination": "", - "measureNumbers": "", - "instrumentation": "", - "annotations": "", - "content": [] + "titles": [], + "dates": [], + "paginations": [], + "measureNumbers": [], + "instrumentations": [], + "annotations": [], + "contents": [] } defaultContentItem: ContentItem = { diff --git a/convert_source_description/typed_classes.py b/convert_source_description/typed_classes.py index d915404..36ce731 100644 --- a/convert_source_description/typed_classes.py +++ b/convert_source_description/typed_classes.py @@ -52,15 +52,15 @@ class WritingInstruments(TypedDict): class Description(TypedDict): """A typed dictionary that represents a description of a source description.""" desc: List[str] - writingMaterialString: str + writingMaterialStrings: List[str] writingInstruments: WritingInstruments - title: str - date: str - pagination: str - measureNumbers: str - instrumentation: str - annotations: str - content: List[ContentItem] + titles: List[str] + dates: List[str] + paginations: List[str] + measureNumbers: List[str] + instrumentations: List[str] + annotations: List[str] + 
contents: List[ContentItem] class SourceDescription(TypedDict): diff --git a/convert_source_description/utils_helper.py b/convert_source_description/utils_helper.py index 1a5f31d..44e17d2 100644 --- a/convert_source_description/utils_helper.py +++ b/convert_source_description/utils_helper.py @@ -41,6 +41,7 @@ SYSTEM_STR = "System" COLON = ":" +COMMA = "," DOT = "." PARENTHESIS = "(" SEMICOLON = ";" @@ -225,25 +226,29 @@ def _get_description(self, paras: List[Tag], source_id: str) -> Description: # Define labels and corresponding keys in the description dictionary description_labels_keys = [ - ("Beschreibstoff:", "writingMaterialString"), + ("Beschreibstoff:", "writingMaterialStrings"), ("Schreibstoff:", "writingInstruments"), - ("Titel:", "title"), - ("Datierung:", "date"), - ("Paginierung:", "pagination"), + ("Titel:", "titles"), + ("Datierung:", "dates"), + ("Paginierung:", "paginations"), ("Taktzahlen:", "measureNumbers"), - ("Besetzung:", "instrumentation"), + ("Besetzung:", "instrumentations"), ("Eintragungen:", "annotations"), ] # Get content for each label and assign it to the corresponding key for label, key in description_labels_keys: content = self._get_paragraph_content_by_label(label, paras) + + # Writing instruments require special handling if key == "writingInstruments": - content = self._extract_writing_instruments(content) + content = self._extract_writing_instruments( + content[0]) if content else description[key] + description[key] = content # Get content items - description["content"] = self._get_content_items(paras, source_id) + description["contents"] = self._get_content_items(paras, source_id) return description @@ -309,7 +314,7 @@ def _extract_writing_instruments(self, writing_instruments_text: str) -> Writing if len(stripped_writing_instruments) > 1: secondary = [ instr.strip().rstrip(DOT) - for instr in self._strip_by_delimiter(stripped_writing_instruments[1], ",") + for instr in self._strip_by_delimiter(stripped_writing_instruments[1], 
COMMA) ] else: secondary = [] @@ -582,31 +587,33 @@ def _get_paragraph_content_by_label(self, label: str, paras: List[Tag]) -> str: with leading and trailing whitespace removed. """ content_paragraph = self._find_tag_with_label_in_soup(label, paras) + content_lines = [] if content_paragraph is None: - return "" + return content_lines stripped_content = self._strip_tag(content_paragraph, P_TAG) - content = self._strip_by_delimiter(stripped_content, label)[1] + initial_content = self._strip_by_delimiter(stripped_content, label)[1] - if content.endswith(SEMICOLON): + content_lines.append(initial_content.strip().rstrip(DOT).rstrip(SEMICOLON)) + + if initial_content.endswith(SEMICOLON): # Check for sibling paragraphs that belong to the same content # (separated by semicolons) sibling = content_paragraph.next_sibling while sibling is not None and sibling.name == P_TAG: sibling_content = self._strip_tag(sibling, P_TAG) - if sibling_content.endswith(DOT): - content += "
" + sibling_content - break - if sibling_content.endswith(SEMICOLON): - content += "
" + sibling_content + if sibling_content.endswith(DOT) or sibling_content.endswith(SEMICOLON): + content_lines.append(sibling_content.strip().rstrip(DOT).rstrip(SEMICOLON)) + if sibling_content.endswith(DOT): + break else: break sibling = sibling.next_sibling - return content.strip() + return content_lines ############################################ # Helper function: _get_paragraph_index_by_label