Skip to content

Commit

Permalink
Merge pull request #36 from musicEnfanthen/main
Browse files Browse the repository at this point in the history
feat(utils): apply updated source desc model with arrays
  • Loading branch information
musicEnfanthen authored Jun 25, 2024
2 parents 7a5e33e + 7810f8c commit 677ae22
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 33 deletions.
16 changes: 8 additions & 8 deletions convert_source_description/default_objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,18 @@

defaultDescription: Description = {
"desc": [],
"writingMaterialString": "",
"writingMaterialStrings": [],
"writingInstruments": {
"main": "",
"secondary": []
},
"title": "",
"date": "",
"pagination": "",
"measureNumbers": "",
"instrumentation": "",
"annotations": "",
"content": []
"titles": [],
"dates": [],
"paginations": [],
"measureNumbers": [],
"instrumentations": [],
"annotations": [],
"contents": []
}

defaultContentItem: ContentItem = {
Expand Down
16 changes: 8 additions & 8 deletions convert_source_description/typed_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,15 +52,15 @@ class WritingInstruments(TypedDict):
class Description(TypedDict):
"""A typed dictionary that represents a description of a source description."""
desc: List[str]
writingMaterialString: str
writingMaterialStrings: List[str]
writingInstruments: WritingInstruments
title: str
date: str
pagination: str
measureNumbers: str
instrumentation: str
annotations: str
content: List[ContentItem]
titles: List[str]
dates: List[str]
paginations: List[str]
measureNumbers: List[str]
instrumentations: List[str]
annotations: List[str]
contents: List[ContentItem]


class SourceDescription(TypedDict):
Expand Down
41 changes: 24 additions & 17 deletions convert_source_description/utils_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
SYSTEM_STR = "System"

COLON = ":"
COMMA = ","
DOT = "."
PARENTHESIS = "("
SEMICOLON = ";"
Expand Down Expand Up @@ -225,25 +226,29 @@ def _get_description(self, paras: List[Tag], source_id: str) -> Description:

# Define labels and corresponding keys in the description dictionary
description_labels_keys = [
("Beschreibstoff:", "writingMaterialString"),
("Beschreibstoff:", "writingMaterialStrings"),
("Schreibstoff:", "writingInstruments"),
("Titel:", "title"),
("Datierung:", "date"),
("Paginierung:", "pagination"),
("Titel:", "titles"),
("Datierung:", "dates"),
("Paginierung:", "paginations"),
("Taktzahlen:", "measureNumbers"),
("Besetzung:", "instrumentation"),
("Besetzung:", "instrumentations"),
("Eintragungen:", "annotations"),
]

# Get content for each label and assign it to the corresponding key
for label, key in description_labels_keys:
content = self._get_paragraph_content_by_label(label, paras)

# Writing instruments require special handling
if key == "writingInstruments":
content = self._extract_writing_instruments(content)
content = self._extract_writing_instruments(
content[0]) if content else description[key]

description[key] = content

# Get content items
description["content"] = self._get_content_items(paras, source_id)
description["contents"] = self._get_content_items(paras, source_id)

return description

Expand Down Expand Up @@ -309,7 +314,7 @@ def _extract_writing_instruments(self, writing_instruments_text: str) -> Writing
if len(stripped_writing_instruments) > 1:
secondary = [
instr.strip().rstrip(DOT)
for instr in self._strip_by_delimiter(stripped_writing_instruments[1], ",")
for instr in self._strip_by_delimiter(stripped_writing_instruments[1], COMMA)
]
else:
secondary = []
Expand Down Expand Up @@ -582,31 +587,33 @@ def _get_paragraph_content_by_label(self, label: str, paras: List[Tag]) -> str:
with leading and trailing whitespace removed.
"""
content_paragraph = self._find_tag_with_label_in_soup(label, paras)
content_lines = []

if content_paragraph is None:
return ""
return content_lines

stripped_content = self._strip_tag(content_paragraph, P_TAG)
content = self._strip_by_delimiter(stripped_content, label)[1]
initial_content = self._strip_by_delimiter(stripped_content, label)[1]

if content.endswith(SEMICOLON):
content_lines.append(initial_content.strip().rstrip(DOT).rstrip(SEMICOLON))

if initial_content.endswith(SEMICOLON):
# Check for sibling paragraphs that belong to the same content
# (separated by semicolons)
sibling = content_paragraph.next_sibling

while sibling is not None and sibling.name == P_TAG:
sibling_content = self._strip_tag(sibling, P_TAG)
if sibling_content.endswith(DOT):
content += "<br />" + sibling_content
break
if sibling_content.endswith(SEMICOLON):
content += "<br />" + sibling_content
if sibling_content.endswith(DOT) or sibling_content.endswith(SEMICOLON):
content_lines.append(sibling_content.strip().rstrip(DOT).rstrip(SEMICOLON))
if sibling_content.endswith(DOT):
break
else:
break

sibling = sibling.next_sibling

return content.strip()
return content_lines

############################################
# Helper function: _get_paragraph_index_by_label
Expand Down

0 comments on commit 677ae22

Please sign in to comment.