diff --git a/convert_source_description/default_objects.py b/convert_source_description/default_objects.py
index 29eb761..ced120e 100644
--- a/convert_source_description/default_objects.py
+++ b/convert_source_description/default_objects.py
@@ -20,18 +20,18 @@
defaultDescription: Description = {
"desc": [],
- "writingMaterialString": "",
+ "writingMaterialStrings": [],
"writingInstruments": {
"main": "",
"secondary": []
},
- "title": "",
- "date": "",
- "pagination": "",
- "measureNumbers": "",
- "instrumentation": "",
- "annotations": "",
- "content": []
+ "titles": [],
+ "dates": [],
+ "paginations": [],
+ "measureNumbers": [],
+ "instrumentations": [],
+ "annotations": [],
+ "contents": []
}
defaultContentItem: ContentItem = {
diff --git a/convert_source_description/typed_classes.py b/convert_source_description/typed_classes.py
index d915404..36ce731 100644
--- a/convert_source_description/typed_classes.py
+++ b/convert_source_description/typed_classes.py
@@ -52,15 +52,15 @@ class WritingInstruments(TypedDict):
class Description(TypedDict):
"""A typed dictionary that represents a description of a source description."""
desc: List[str]
- writingMaterialString: str
+ writingMaterialStrings: List[str]
writingInstruments: WritingInstruments
- title: str
- date: str
- pagination: str
- measureNumbers: str
- instrumentation: str
- annotations: str
- content: List[ContentItem]
+ titles: List[str]
+ dates: List[str]
+ paginations: List[str]
+ measureNumbers: List[str]
+ instrumentations: List[str]
+ annotations: List[str]
+ contents: List[ContentItem]
class SourceDescription(TypedDict):
diff --git a/convert_source_description/utils_helper.py b/convert_source_description/utils_helper.py
index 1a5f31d..44e17d2 100644
--- a/convert_source_description/utils_helper.py
+++ b/convert_source_description/utils_helper.py
@@ -41,6 +41,7 @@
SYSTEM_STR = "System"
COLON = ":"
+COMMA = ","
DOT = "."
PARENTHESIS = "("
SEMICOLON = ";"
@@ -225,25 +226,29 @@ def _get_description(self, paras: List[Tag], source_id: str) -> Description:
# Define labels and corresponding keys in the description dictionary
description_labels_keys = [
- ("Beschreibstoff:", "writingMaterialString"),
+ ("Beschreibstoff:", "writingMaterialStrings"),
("Schreibstoff:", "writingInstruments"),
- ("Titel:", "title"),
- ("Datierung:", "date"),
- ("Paginierung:", "pagination"),
+ ("Titel:", "titles"),
+ ("Datierung:", "dates"),
+ ("Paginierung:", "paginations"),
("Taktzahlen:", "measureNumbers"),
- ("Besetzung:", "instrumentation"),
+ ("Besetzung:", "instrumentations"),
("Eintragungen:", "annotations"),
]
# Get content for each label and assign it to the corresponding key
for label, key in description_labels_keys:
content = self._get_paragraph_content_by_label(label, paras)
+
+ # Writing instruments require special handling
if key == "writingInstruments":
- content = self._extract_writing_instruments(content)
+ content = self._extract_writing_instruments(
+ content[0]) if content else description[key]
+
description[key] = content
# Get content items
- description["content"] = self._get_content_items(paras, source_id)
+ description["contents"] = self._get_content_items(paras, source_id)
return description
@@ -309,7 +314,7 @@ def _extract_writing_instruments(self, writing_instruments_text: str) -> Writing
if len(stripped_writing_instruments) > 1:
secondary = [
instr.strip().rstrip(DOT)
- for instr in self._strip_by_delimiter(stripped_writing_instruments[1], ",")
+ for instr in self._strip_by_delimiter(stripped_writing_instruments[1], COMMA)
]
else:
secondary = []
@@ -582,31 +587,33 @@ def _get_paragraph_content_by_label(self, label: str, paras: List[Tag]) -> str:
with leading and trailing whitespace removed.
"""
content_paragraph = self._find_tag_with_label_in_soup(label, paras)
+ content_lines = []
if content_paragraph is None:
- return ""
+ return content_lines
stripped_content = self._strip_tag(content_paragraph, P_TAG)
- content = self._strip_by_delimiter(stripped_content, label)[1]
+ initial_content = self._strip_by_delimiter(stripped_content, label)[1]
- if content.endswith(SEMICOLON):
+ content_lines.append(initial_content.strip().rstrip(DOT).rstrip(SEMICOLON))
+
+ if initial_content.endswith(SEMICOLON):
# Check for sibling paragraphs that belong to the same content
# (separated by semicolons)
sibling = content_paragraph.next_sibling
while sibling is not None and sibling.name == P_TAG:
sibling_content = self._strip_tag(sibling, P_TAG)
- if sibling_content.endswith(DOT):
- content += "\n" + sibling_content
- break
- if sibling_content.endswith(SEMICOLON):
- content += "\n" + sibling_content
+ if sibling_content.endswith(DOT) or sibling_content.endswith(SEMICOLON):
+ content_lines.append(sibling_content.strip().rstrip(DOT).rstrip(SEMICOLON))
+ if sibling_content.endswith(DOT):
+ break
else:
break
sibling = sibling.next_sibling
- return content.strip()
+ return content_lines
############################################
# Helper function: _get_paragraph_index_by_label