Skip to content

Commit

Permalink
Merge pull request #161 from Auden-Musulin-Papers/dev
Browse files Browse the repository at this point in the history
Release v0.6.0
  • Loading branch information
linxOD authored Feb 27, 2024
2 parents 32ae58c + c3f10d2 commit 6a4d58f
Show file tree
Hide file tree
Showing 57 changed files with 2,950 additions and 2,148 deletions.
55 changes: 39 additions & 16 deletions amp-app.xpr
Original file line number Diff line number Diff line change
Expand Up @@ -32,27 +32,30 @@
</field>
<field name="scenarioIds">
<list>
<String>index</String>
<String>toc</String>
<String>search</String>
</list>
</field>
<field name="scenarioTypes">
<list>
<String>XSL</String>
<String>XSL</String>
</list>
</field>
<field name="scenarioStorageLocations">
<list>
<Byte>2</Byte>
<Byte>2</Byte>
</list>
</field>
</scenarioAssociation>
<scenarioAssociation>
<field name="url">
<String>data/editions/</String>
<String>build_app/ant/build.xml</String>
</field>
<field name="scenarioIds">
<list>
<String>editions</String>
<String>search</String>
</list>
</field>
<field name="scenarioTypes">
Expand All @@ -68,11 +71,11 @@
</scenarioAssociation>
<scenarioAssociation>
<field name="url">
<String>html/css/style.css</String>
<String>data/meta/auden-biography.xml</String>
</field>
<field name="scenarioIds">
<list>
<String>search</String>
<String>biographies</String>
</list>
</field>
<field name="scenarioTypes">
Expand All @@ -88,11 +91,11 @@
</scenarioAssociation>
<scenarioAssociation>
<field name="url">
<String>data/editions/photos/amp-transcript__0048.xml</String>
<String>data/editions/</String>
</field>
<field name="scenarioIds">
<list>
<String>index</String>
<String>editions</String>
</list>
</field>
<field name="scenarioTypes">
Expand All @@ -108,11 +111,11 @@
</scenarioAssociation>
<scenarioAssociation>
<field name="url">
<String>data/meta/editorial-declaration.xml</String>
<String>html/css/style.css</String>
</field>
<field name="scenarioIds">
<list>
<String>editorial-declaration</String>
<String>search</String>
</list>
</field>
<field name="scenarioTypes">
Expand All @@ -128,7 +131,7 @@
</scenarioAssociation>
<scenarioAssociation>
<field name="url">
<String>data/indices/listorg.xml</String>
<String>data/editions/photos/amp-transcript__0048.xml</String>
</field>
<field name="scenarioIds">
<list>
Expand All @@ -148,11 +151,11 @@
</scenarioAssociation>
<scenarioAssociation>
<field name="url">
<String>data/editions/memoirs/amp-transcript__0062.xml</String>
<String>data/meta/editorial-declaration.xml</String>
</field>
<field name="scenarioIds">
<list>
<String>editions</String>
<String>editorial-declaration</String>
</list>
</field>
<field name="scenarioTypes">
Expand All @@ -168,7 +171,27 @@
</scenarioAssociation>
<scenarioAssociation>
<field name="url">
<String>data/editions/memoirs/amp-transcript__0028.xml</String>
<String>data/indices/listorg.xml</String>
</field>
<field name="scenarioIds">
<list>
<String>index</String>
</list>
</field>
<field name="scenarioTypes">
<list>
<String>XSL</String>
</list>
</field>
<field name="scenarioStorageLocations">
<list>
<Byte>2</Byte>
</list>
</field>
</scenarioAssociation>
<scenarioAssociation>
<field name="url">
<String>data/editions/memoirs/amp-transcript__0062.xml</String>
</field>
<field name="scenarioIds">
<list>
Expand All @@ -188,11 +211,11 @@
</scenarioAssociation>
<scenarioAssociation>
<field name="url">
<String>data/meta/musulin-biography.xml</String>
<String>data/editions/memoirs/amp-transcript__0028.xml</String>
</field>
<field name="scenarioIds">
<list>
<String>biographies</String>
<String>editions</String>
</list>
</field>
<field name="scenarioTypes">
Expand All @@ -208,7 +231,7 @@
</scenarioAssociation>
<scenarioAssociation>
<field name="url">
<String>data/meta/auden-biography.xml</String>
<String>data/meta/musulin-biography.xml</String>
</field>
<field name="scenarioIds">
<list>
Expand Down
12 changes: 6 additions & 6 deletions build_app/ant/build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
<!-- <property name="xsl_editions_xml" value="${basedir}/xslt/editions-xml.xsl"/> -->
<property name="xsl_index" value="${basedir}/xslt/index.xsl"/>
<property name="xsl_toc" value="${basedir}/xslt/toc.xsl"/>
<property name="xsl_memoirs" value="${basedir}/xslt/memoirs.xsl"/>
<!-- <property name="xsl_memoirs" value="${basedir}/xslt/memoirs.xsl"/> -->
<property name="xsl_photos" value="${basedir}/xslt/photos.xsl"/>
<property name="xsl_am" value="${basedir}/xslt/additional-materials.xsl"/>
<property name="xsl_search" value="${basedir}/xslt/search.xsl"/>
Expand All @@ -40,19 +40,19 @@
<classpath location="${basedir}/saxon/saxon9he.jar"/>
</xslt>
<delete>
<fileset dir="${target}" includes="listbibl.html"/>
<fileset dir="${target}" includes="amp-index-works.html"/>
</delete>
<delete>
<fileset dir="${target}" includes="listplace.html"/>
<fileset dir="${target}" includes="amp-index-places.html"/>
</delete>
<delete>
<fileset dir="${target}" includes="listperson.html"/>
<fileset dir="${target}" includes="amp-index-persons.html"/>
</delete>
<delete>
<fileset dir="${target}" includes="listorg.html"/>
<fileset dir="${target}" includes="amp-index-organizations.html"/>
</delete>
<delete>
<fileset dir="${target}" includes="listevent.html"/>
<fileset dir="${target}" includes="amp-index-events.html"/>
</delete>
<xslt style="${xsl_indices}" basedir="${indices}" destdir="${target}" includes="*.xml">
<factory name="net.sf.saxon.TransformerFactoryImpl"/>
Expand Down
113 changes: 93 additions & 20 deletions build_app/python/make_ts_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,57 +64,122 @@
'type': 'string[]',
'facet': True,
'optional': True
}
},
{
'name': 'events',
'type': 'string[]',
'facet': True,
'optional': True
},
{
'name': 'document_type',
'type': 'string[]',
'optional': True,
'facet': True,
},
{
'name': 'image',
'type': 'string',
},
{"name": "page_int", "type": "int32", "sort": True},
{"name": "page_str", "type": "string"},
{"name": "comments_count", "type": "int32"},
{"name": "comments_bool", "type": "bool", "facet": True},
{"name": "poem_bool", "type": "bool", "facet": True},
{"name": "poem_count", "type": "int32"}
]
}

client.collections.create(current_schema)


def get_context(xpath, nodes=None):
    """Count the matches of an XPath expression across a page's nodes.

    Args:
        xpath: XPath expression evaluated relative to each node; the ``tei``
            prefix is bound to the TEI namespace.
        nodes: Iterable of nodes to search. When omitted, the module-level
            ``body`` is used, so existing one-argument calls keep working.

    Returns:
        A ``(found, count)`` tuple: ``count`` is the total number of matches
        over all nodes and ``found`` is ``True`` when ``count`` is non-zero.
    """
    if nodes is None:
        nodes = body
    total = 0
    for node in nodes:
        try:
            matches = node.xpath(xpath, namespaces={'tei': "http://www.tei-c.org/ns/1.0"})
        except AttributeError:
            # Items that expose no ``xpath`` method contribute nothing.
            continue
        total += len(matches)
    return (total > 0, total)


def get_entities(ent_type, ent_node, ent_name):
# Collect the names of all entities of one kind that are referenced in `body`.
#   ent_type: value matched against tei:rs/@type when gathering @ref pointers.
#   ent_node, ent_name: TEI element names used to build the lookup path
#       tei:<ent_node>[@xml:id=<ref>]//tei:<ent_name>[1].
# Reads the module-level names `body`, `doc` and `record`.
# Returns a sorted list of unique, whitespace-normalized names.
# NOTE(review): this span is a diff rendering without +/- markers or
# indentation. The bare xpath call and the first `for r in ref:` run appear to
# be the pre-change lines, each directly followed by its replacement — confirm
# against the repository before treating this as runnable code.
entities = []
e_path = f'.//tei:rs[@type="{ent_type}"]/@ref'
for p in body:
# NOTE(review): presumably the pre-change line (unguarded call).
ent = p.xpath(e_path, namespaces={'tei': "http://www.tei-c.org/ns/1.0"})
# Guarded variant: an item without an `xpath` method yields no references.
try:
ent = p.xpath(e_path, namespaces={'tei': "http://www.tei-c.org/ns/1.0"})
except AttributeError:
ent = []
# Each @ref value is split on whitespace and every '#' character is removed.
ref = [ref.replace("#", "") for e in ent if len(ent) > 0 for ref in e.split()]
# NOTE(review): presumably the pre-change loop (no emptiness check on `ref`).
for r in ref:
p_path = f'.//tei:{ent_node}[@xml:id="{r}"]//tei:{ent_name}[1]'
en = doc.any_xpath(p_path)
if en:
entity = " ".join(" ".join(en[0].xpath(".//text()")).split())
if len(entity) != 0:
entities.append(entity)
# NOTE(review): with indentation lost it is not visible here whether this
# `else` pairs with `if en:` or with `if len(entity) != 0:` — confirm.
else:
# Appends "<ref> in <record id>" to log-entities.txt.
with open("log-entities.txt", "a") as f:
f.write(f"{r} in {record['id']}\n")
# Same lookup as above, now wrapped in a check that `ref` is non-empty.
if len(ref) > 0:
for r in ref:
p_path = f'.//tei:{ent_node}[@xml:id="{r}"]//tei:{ent_name}[1]'
en = doc.any_xpath(p_path)
if en:
# Join all text nodes and collapse runs of whitespace to single spaces.
entity = " ".join(" ".join(en[0].xpath(".//text()")).split())
if len(entity) != 0:
entities.append(entity)
else:
with open("log-entities.txt", "a") as f:
f.write(f"{r} in {record['id']}\n")
# De-duplicate via set(), then return the names in sorted order.
return [ent for ent in sorted(set(entities))]


records = []
cfts_records = []
for x in tqdm(files, total=len(files)):
doc = TeiReader(xml=x, xsl='./xslt/preprocess_typesense.xsl')
facs = doc.any_xpath('.//tei:body/tei:div/tei:pb/@facs')
pages = 0
try:
corresp = doc.any_xpath('.//tei:text[@type="letter"]')[0]
except IndexError:
corresp = False
try:
photo = doc.any_xpath('.//tei:text[@type="photograph"]')[0]
except IndexError:
photo = False
facs = doc.any_xpath('.//tei:body/tei:div//tei:pb')
pages = 1
for v in facs:
p_group = f""".//tei:body/tei:div/tei:p[preceding-sibling::tei:pb[1]/@facs='{v}']|
.//tei:body/tei:div/tei:lg[preceding-sibling::tei:pb[1]/@facs='{v}']"""
facs_id = v.attrib['facs']
try:
facs_page = v.attrib['ed']
except KeyError:
facs_page = str(pages)
facs_type = v.attrib['type']
p_group = f""".//tei:body/tei:div/tei:p[preceding-sibling::tei:pb[1]/@facs='{facs_id}']|
.//tei:body/tei:div/tei:lg[preceding-sibling::tei:pb[1]/@facs='{facs_id}']|
.//tei:body/tei:div/tei:div/tei:ab[preceding-sibling::tei:pb[1]/@facs='{facs_id}']|
.//tei:body/tei:div/tei:div/tei:div[preceding-sibling::tei:pb[1]/@facs='{facs_id}']"""
body = doc.any_xpath(p_group)
pages += 1
cfts_record = {
'project': 'amp',
}
record = {}
record['id'] = os.path.split(x)[-1].replace('.xml', f".html?tab={str(pages)}")
if len(facs_id) > 0:
record['image'] = facs_id.split("/")[-2]
record['page_int'] = int(pages)
record['page_str'] = str(facs_page)
if corresp:
record["document_type"] = ["Correspondence"]
elif photo:
record["document_type"] = ["Photograph"]
else:
record["document_type"] = ["Other"]
record['id'] = os.path.split(x)[-1].replace('.xml', f".html?tab={facs_page}")
cfts_record['id'] = record['id']
cfts_record['resolver'] = f"https://amp.acdh.oeaw.ac.at/{record['id']}"
record['rec_id'] = os.path.split(x)[-1]
cfts_record['rec_id'] = record['rec_id']
r_title = " ".join(" ".join(doc.any_xpath('.//tei:titleStmt/tei:title[@level="a"]/text()')).split())
record['title'] = f"{r_title} Page {str(pages)}"
record['title'] = r_title
cfts_record['title'] = record['title']
try:
date_str = doc.any_xpath('//tei:origin/tei:origDate/@notBefore')[0]
date_str = doc.any_xpath('//tei:origin/tei:origDate/@notBefore-iso')[0]
except IndexError:
date_str = doc.any_xpath('//tei:origin/tei:origDate/text()')[0]
data_str = date_str.split("--")[0]
Expand Down Expand Up @@ -151,11 +216,19 @@ def get_entities(ent_type, ent_node, ent_name):
ent_node = "bibl"
record['works'] = get_entities(ent_type=ent_type, ent_node=ent_node, ent_name=ent_name)
cfts_record['works'] = record['works']
# get unique events per page
ent_type = "event"
ent_name = "label"
record['events'] = get_entities(ent_type=ent_type, ent_node=ent_type, ent_name=ent_name)
cfts_record['events'] = record['events']
record['full_text'] = "\n".join(" ".join("".join(p.itertext()).split()) for p in body)
if len(record['full_text']) > 0:
records.append(record)
cfts_record['full_text'] = record['full_text']
cfts_records.append(cfts_record)
record['comments_bool'], record['comments_count'] = get_context('.//node()[@ana]')
record['poem_bool'], record['poem_count'] = get_context('.//tei:l')
pages += 1

make_index = client.collections['amp'].documents.import_(records)
print(make_index)
Expand Down
Loading

0 comments on commit 6a4d58f

Please sign in to comment.