-
Notifications
You must be signed in to change notification settings - Fork 1
/
parse.py
141 lines (123 loc) · 5.01 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import re
from copy import copy
from bs4 import BeautifulSoup, NavigableString
from bs4.element import Tag
def join_hyphenated_words(words: list[str]) -> list[str]:
    """Join words that were split by end-of-line hyphenation.

    A word that ends with a hyphen is merged with the word that follows it:
    its trailing hyphen(s) are stripped and the next word is appended. The
    merge repeats while the merged word still ends with a hyphen, so a chain
    such as ``["a-", "b-", "c"]`` collapses to ``["abc"]``. A hyphen-terminated
    word in the last position has nothing to join with and is left unchanged.

    Args:
        words: Tokens in reading order. The list is updated in place.

    Returns:
        The same list object that was passed in, now holding the merged tokens.
    """
    HYPHEN = "-"
    merged: list[str] = []
    for word in words:
        # The last kept token is still "open" if it ends with a hyphen.
        if merged and merged[-1].endswith(HYPHEN):
            merged[-1] = merged[-1].rstrip(HYPHEN) + word
        else:
            merged.append(word)
    # Single pass instead of repeated mid-list deletions; the slice assignment
    # keeps the in-place contract for callers that hold the original list.
    words[:] = merged
    return words
def get_top_level_text(tag):
    """Return the text that sits directly inside ``tag``, whitespace-normalised.

    Only direct children that are ``NavigableString`` instances contribute;
    text belonging to nested elements is ignored. Every run of whitespace is
    collapsed to a single space and the result is trimmed at both ends.
    """
    # NOTE(review): bs4 comment nodes are presumably NavigableString subclasses
    # too, so their text would be included here - confirm if that matters.
    direct_strings = [
        node.string for node in tag.children if isinstance(node, NavigableString)
    ]
    collapsed = re.sub(r"\s+", " ", "".join(direct_strings))
    return collapsed.strip()
def convert_xml_content_to_string(raw_content: Tag):
    """Flatten the body text of a textbook section into a single string.

    Looks at every ``<ab type="Body">`` element under ``raw_content`` and, for
    each of its direct children that is a non-blank ``<w>`` element, collects
    the stripped strings it contains. Words split by end-of-line hyphenation
    are re-joined, and the tokens are returned separated by single spaces.
    """
    tokens = []
    body_blocks = raw_content.find_all("ab", attrs={"type": "Body"})
    for block in body_blocks:
        for node in block.children:
            # Blank nodes are rejected first; only <w> elements carry words.
            has_text = bool(node.text.strip())
            if has_text and node.name == "w":
                tokens.extend(piece.strip() for piece in node.stripped_strings)
    return " ".join(join_hyphenated_words(tokens))
def find_all_with_limit(soup, tag, max_depth, current_depth=0):
    """Depth-limited alternative to ``soup.find_all(tag, recursive=True)``.

    ``soup`` itself is at depth ``current_depth``; each level of children adds
    one. Elements deeper than ``max_depth`` are not visited. The starting
    element is included in the result when its own name equals ``tag``.

    Returns:
        A list of the matching elements, parents before their descendants.
    """
    # Too deep, or not an element (e.g. a text node): nothing to contribute.
    if current_depth > max_depth or not isinstance(soup, Tag):
        return []

    matches = [soup] if soup.name == tag else []
    next_depth = current_depth + 1
    for node in soup.children:
        matches += find_all_with_limit(node, tag, max_depth, next_depth)
    return matches
def get_subsection_refs(entry):
    """Return the ``target`` values of the subsections listed under a TOC entry.

    The subsections are expected in a ``<list>`` element that is the next
    sibling of ``entry``. When there is no next sibling, or it is not a
    ``<list>``, the entry has no subsections and an empty list is returned.
    ``<ref>`` elements are searched to a depth of 2 below that list, and those
    without a ``target`` attribute are skipped.
    """
    sibling = entry.find_next_sibling()
    if sibling and sibling.name == "list":
        targets = []
        for ref in find_all_with_limit(sibling, "ref", 2):
            if ref.has_attr("target"):
                targets.append(ref.attrs["target"])
        return targets
    return []
def remove_subsection_content(content_xml, subsection_refs):
    """Strip every subsection ``<div>`` out of ``content_xml`` in place.

    For each id in ``subsection_refs``, all ``<div>`` elements under
    ``content_xml`` whose ``xml:id`` equals that id are decomposed. Nothing is
    returned; ``content_xml`` itself is modified.
    """
    for section_id in subsection_refs:
        for nested_div in content_xml.find_all("div", {"xml:id": section_id}):
            nested_div.decompose()
def get_concepts(index, entry_id):
    """Collect the domain concepts that the index links to one section.

    Every ``<ref target=entry_id>`` under ``index`` is inspected; the concept
    is the parent element of that ref. A concept is kept only when its
    ``domain-specificity`` attribute is ``"core-domain"`` or ``"in-domain"``
    and its ``xml:id`` neither starts nor ends with ``"example"``.

    Args:
        index: The index element to search, or ``None`` when the document has
            no index.
        entry_id: The section id that the index refs must target.

    Returns:
        A dict keyed by concept ``xml:id``. Each value holds ``"name"`` and,
        when present in the concept's ``<seg>`` element, ``"definition"`` and
        ``"subject"``. Empty when ``index`` is ``None`` or nothing matches.

    Raises:
        KeyError: If a kept concept lacks ``xml:id``, or a subject ref lacks
            ``resource``.
    """
    # A document without an index has no concepts; avoid failing on None.find_all.
    if index is None:
        return {}

    concepts = {}
    # find_all yields an empty result set when nothing matches, so no None check is needed.
    for index_ref in index.find_all("ref", attrs={"target": entry_id}):
        concept_data = index_ref.parent
        if concept_data.get("domain-specificity") not in {"core-domain", "in-domain"}:
            continue
        concept_id = concept_data.attrs["xml:id"]
        if concept_id.startswith("example") or concept_id.endswith("example"):
            continue

        details = {"name": get_top_level_text(concept_data)}
        concepts[concept_id] = details

        concept = concept_data.find("seg")
        if concept is None:
            continue

        # NOTE(review): "gross" may be a typo for the TEI <gloss> element -
        # confirm against the source XML before changing it.
        definition = concept.find("gross", attrs={"property": "skos:definition"})
        if definition is not None:
            details["definition"] = definition.text

        subject = concept.find("ref", attrs={"property": "terms:subject"})
        if subject is not None:
            # Reduce the DBpedia category URL to a readable category name.
            details["subject"] = (
                subject.attrs["resource"]
                .removeprefix("http://dbpedia.org/resource/Category:")
                .replace("_", " ")
            )
    return concepts
def parse_xml(soup: BeautifulSoup) -> dict:
    """
    Parses TEI-encoded XML into a dictionary of TOC entries -> section contents

    The table of contents is the ``<list>`` inside the ``<div type="contents">``
    of ``<front>``. Each ``<item>`` that contains a ``<ref>`` with a ``target``
    is matched to the ``<div>`` in ``<body>`` whose ``xml:id`` equals that
    target; items without such a ref, or without a matching div, are omitted.

    Each value in the returned dict has the keys ``"entry"`` (item title),
    ``"level"`` (the div's ``n`` attribute as an int), ``"content"`` (section
    text with subsections removed), ``"word_count"`` (number of ``<w>``
    elements left after that removal), ``"subsections"`` (their ids) and
    ``"concepts"``.
    """
    front = soup.find("front")
    contents_div = front.find("div", attrs={"type": "contents"})
    toc = contents_div.find("list")
    body = soup.find("body")
    index = soup.find("div", attrs={"type": "index"})

    sections = {}
    for item in toc.find_all("item"):
        link = item.find("ref")
        if not (link and link.has_attr("target")):
            continue

        title = get_top_level_text(item)
        section_id = link.attrs["target"]
        # Work on a copy so that stripping subsections leaves `body` intact
        # for the entries that are processed later.
        section_xml = copy(body.find("div", attrs={"xml:id": section_id}))
        child_ids = get_subsection_refs(item)
        if not section_xml:
            continue

        depth = int(section_xml.attrs["n"])
        remove_subsection_content(section_xml, child_ids)
        text = convert_xml_content_to_string(section_xml)
        sections[section_id] = {
            "entry": title,
            "level": depth,
            "content": text,
            "word_count": len(section_xml.find_all("w")),
            "subsections": child_ids,
            "concepts": get_concepts(index, section_id),
        }
    return sections