Merge pull request #36 from musicEnfanthen/main

feat(utils): apply updated source desc model with arrays
webern-unibas-ch · Jun 25, 2024 · 677ae22 · 677ae22
2 parents 7a5e33e + 7810f8c
commit 677ae22
Show file tree

Hide file tree

Showing 3 changed files with 40 additions and 33 deletions.
diff --git a/convert_source_description/default_objects.py b/convert_source_description/default_objects.py
@@ -20,18 +20,18 @@
 
 defaultDescription: Description = {
     "desc": [],
-    "writingMaterialString": "",
+    "writingMaterialStrings": [],
     "writingInstruments": {
         "main": "",
         "secondary": []
     },
-    "title": "",
-    "date": "",
-    "pagination": "",
-    "measureNumbers": "",
-    "instrumentation": "",
-    "annotations": "",
-    "content": []
+    "titles": [],
+    "dates": [],
+    "paginations": [],
+    "measureNumbers": [],
+    "instrumentations": [],
+    "annotations": [],
+    "contents": []
 }
 
 defaultContentItem: ContentItem = {

diff --git a/convert_source_description/typed_classes.py b/convert_source_description/typed_classes.py
@@ -52,15 +52,15 @@ class WritingInstruments(TypedDict):
 class Description(TypedDict):
     """A typed dictionary that represents a description of a source description."""
     desc: List[str]
-    writingMaterialString: str
+    writingMaterialStrings: List[str]
     writingInstruments: WritingInstruments
-    title: str
-    date: str
-    pagination: str
-    measureNumbers: str
-    instrumentation: str
-    annotations: str
-    content: List[ContentItem]
+    titles: List[str]
+    dates: List[str]
+    paginations: List[str]
+    measureNumbers: List[str]
+    instrumentations: List[str]
+    annotations: List[str]
+    contents: List[ContentItem]
 
 
 class SourceDescription(TypedDict):

diff --git a/convert_source_description/utils_helper.py b/convert_source_description/utils_helper.py
@@ -41,6 +41,7 @@
 SYSTEM_STR = "System"
 
 COLON = ":"
+COMMA = ","
 DOT = "."
 PARENTHESIS = "("
 SEMICOLON = ";"
@@ -225,25 +226,29 @@ def _get_description(self, paras: List[Tag], source_id: str) -> Description:
 
         # Define labels and corresponding keys in the description dictionary
         description_labels_keys = [
-            ("Beschreibstoff:", "writingMaterialString"),
+            ("Beschreibstoff:", "writingMaterialStrings"),
             ("Schreibstoff:", "writingInstruments"),
-            ("Titel:", "title"),
-            ("Datierung:", "date"),
-            ("Paginierung:", "pagination"),
+            ("Titel:", "titles"),
+            ("Datierung:", "dates"),
+            ("Paginierung:", "paginations"),
             ("Taktzahlen:", "measureNumbers"),
-            ("Besetzung:", "instrumentation"),
+            ("Besetzung:", "instrumentations"),
             ("Eintragungen:", "annotations"),
         ]
 
         # Get content for each label and assign it to the corresponding key
         for label, key in description_labels_keys:
             content = self._get_paragraph_content_by_label(label, paras)
+
+            # Writing instruments require special handling
             if key == "writingInstruments":
-                content = self._extract_writing_instruments(content)
+                content = self._extract_writing_instruments(
+                    content[0]) if content else description[key]
+
             description[key] = content
 
         # Get content items
-        description["content"] = self._get_content_items(paras, source_id)
+        description["contents"] = self._get_content_items(paras, source_id)
 
         return description
 
@@ -309,7 +314,7 @@ def _extract_writing_instruments(self, writing_instruments_text: str) -> Writing
             if len(stripped_writing_instruments) > 1:
                 secondary = [
                     instr.strip().rstrip(DOT)
-                    for instr in self._strip_by_delimiter(stripped_writing_instruments[1], ",")
+                    for instr in self._strip_by_delimiter(stripped_writing_instruments[1], COMMA)
                 ]
             else:
                 secondary = []
@@ -582,31 +587,33 @@ def _get_paragraph_content_by_label(self, label: str, paras: List[Tag]) -> str:
                 with leading and trailing whitespace removed.
         """
         content_paragraph = self._find_tag_with_label_in_soup(label, paras)
+        content_lines = []
 
         if content_paragraph is None:
-            return ""
+            return content_lines
 
         stripped_content = self._strip_tag(content_paragraph, P_TAG)
-        content = self._strip_by_delimiter(stripped_content, label)[1]
+        initial_content = self._strip_by_delimiter(stripped_content, label)[1]
 
-        if content.endswith(SEMICOLON):
+        content_lines.append(initial_content.strip().rstrip(DOT).rstrip(SEMICOLON))
+
+        if initial_content.endswith(SEMICOLON):
             # Check for sibling paragraphs that belong to the same content
             # (separated by semicolons)
             sibling = content_paragraph.next_sibling
 
             while sibling is not None and sibling.name == P_TAG:
                 sibling_content = self._strip_tag(sibling, P_TAG)
-                if sibling_content.endswith(DOT):
-                    content += "<br />" + sibling_content
-                    break
-                if sibling_content.endswith(SEMICOLON):
-                    content += "<br />" + sibling_content
+                if sibling_content.endswith(DOT) or sibling_content.endswith(SEMICOLON):
+                    content_lines.append(sibling_content.strip().rstrip(DOT).rstrip(SEMICOLON))
+                    if sibling_content.endswith(DOT):
+                        break
                 else:
                     break
 
                 sibling = sibling.next_sibling
 
-        return content.strip()
+        return content_lines
 
     ############################################
     # Helper function: _get_paragraph_index_by_label