Merge pull request #161 from Auden-Musulin-Papers/dev

Release v0.6.0
Auden-Musulin-Papers · Feb 27, 2024 · 6a4d58f · 6a4d58f
2 parents 32ae58c + c3f10d2
commit 6a4d58f
Show file tree

Hide file tree

Showing 57 changed files with 2,950 additions and 2,148 deletions.
diff --git a/amp-app.xpr b/amp-app.xpr
@@ -32,27 +32,30 @@
                                 </field>
                                 <field name="scenarioIds">
                                     <list>
-                                        <String>index</String>
+                                        <String>toc</String>
+                                        <String>search</String>
                                     </list>
                                 </field>
                                 <field name="scenarioTypes">
                                     <list>
                                         <String>XSL</String>
+                                        <String>XSL</String>
                                     </list>
                                 </field>
                                 <field name="scenarioStorageLocations">
                                     <list>
                                         <Byte>2</Byte>
+                                        <Byte>2</Byte>
                                     </list>
                                 </field>
                             </scenarioAssociation>
                             <scenarioAssociation>
                                 <field name="url">
-                                    <String>data/editions/</String>
+                                    <String>build_app/ant/build.xml</String>
                                 </field>
                                 <field name="scenarioIds">
                                     <list>
-                                        <String>editions</String>
+                                        <String>search</String>
                                     </list>
                                 </field>
                                 <field name="scenarioTypes">
@@ -68,11 +71,11 @@
                             </scenarioAssociation>
                             <scenarioAssociation>
                                 <field name="url">
-                                    <String>html/css/style.css</String>
+                                    <String>data/meta/auden-biography.xml</String>
                                 </field>
                                 <field name="scenarioIds">
                                     <list>
-                                        <String>search</String>
+                                        <String>biographies</String>
                                     </list>
                                 </field>
                                 <field name="scenarioTypes">
@@ -88,11 +91,11 @@
                             </scenarioAssociation>
                             <scenarioAssociation>
                                 <field name="url">
-                                    <String>data/editions/photos/amp-transcript__0048.xml</String>
+                                    <String>data/editions/</String>
                                 </field>
                                 <field name="scenarioIds">
                                     <list>
-                                        <String>index</String>
+                                        <String>editions</String>
                                     </list>
                                 </field>
                                 <field name="scenarioTypes">
@@ -108,11 +111,11 @@
                             </scenarioAssociation>
                             <scenarioAssociation>
                                 <field name="url">
-                                    <String>data/meta/editorial-declaration.xml</String>
+                                    <String>html/css/style.css</String>
                                 </field>
                                 <field name="scenarioIds">
                                     <list>
-                                        <String>editorial-declaration</String>
+                                        <String>search</String>
                                     </list>
                                 </field>
                                 <field name="scenarioTypes">
@@ -128,7 +131,7 @@
                             </scenarioAssociation>
                             <scenarioAssociation>
                                 <field name="url">
-                                    <String>data/indices/listorg.xml</String>
+                                    <String>data/editions/photos/amp-transcript__0048.xml</String>
                                 </field>
                                 <field name="scenarioIds">
                                     <list>
@@ -148,11 +151,11 @@
                             </scenarioAssociation>
                             <scenarioAssociation>
                                 <field name="url">
-                                    <String>data/editions/memoirs/amp-transcript__0062.xml</String>
+                                    <String>data/meta/editorial-declaration.xml</String>
                                 </field>
                                 <field name="scenarioIds">
                                     <list>
-                                        <String>editions</String>
+                                        <String>editorial-declaration</String>
                                     </list>
                                 </field>
                                 <field name="scenarioTypes">
@@ -168,7 +171,27 @@
                             </scenarioAssociation>
                             <scenarioAssociation>
                                 <field name="url">
-                                    <String>data/editions/memoirs/amp-transcript__0028.xml</String>
+                                    <String>data/indices/listorg.xml</String>
+                                </field>
+                                <field name="scenarioIds">
+                                    <list>
+                                        <String>index</String>
+                                    </list>
+                                </field>
+                                <field name="scenarioTypes">
+                                    <list>
+                                        <String>XSL</String>
+                                    </list>
+                                </field>
+                                <field name="scenarioStorageLocations">
+                                    <list>
+                                        <Byte>2</Byte>
+                                    </list>
+                                </field>
+                            </scenarioAssociation>
+                            <scenarioAssociation>
+                                <field name="url">
+                                    <String>data/editions/memoirs/amp-transcript__0062.xml</String>
                                 </field>
                                 <field name="scenarioIds">
                                     <list>
@@ -188,11 +211,11 @@
                             </scenarioAssociation>
                             <scenarioAssociation>
                                 <field name="url">
-                                    <String>data/meta/musulin-biography.xml</String>
+                                    <String>data/editions/memoirs/amp-transcript__0028.xml</String>
                                 </field>
                                 <field name="scenarioIds">
                                     <list>
-                                        <String>biographies</String>
+                                        <String>editions</String>
                                     </list>
                                 </field>
                                 <field name="scenarioTypes">
@@ -208,7 +231,7 @@
                             </scenarioAssociation>
                             <scenarioAssociation>
                                 <field name="url">
-                                    <String>data/meta/auden-biography.xml</String>
+                                    <String>data/meta/musulin-biography.xml</String>
                                 </field>
                                 <field name="scenarioIds">
                                     <list>

diff --git a/build_app/ant/build.xml b/build_app/ant/build.xml
@@ -18,7 +18,7 @@
     <!-- <property name="xsl_editions_xml" value="${basedir}/xslt/editions-xml.xsl"/> -->
     <property name="xsl_index" value="${basedir}/xslt/index.xsl"/>
     <property name="xsl_toc" value="${basedir}/xslt/toc.xsl"/>
-    <property name="xsl_memoirs" value="${basedir}/xslt/memoirs.xsl"/>
+    <!-- <property name="xsl_memoirs" value="${basedir}/xslt/memoirs.xsl"/> -->
     <property name="xsl_photos" value="${basedir}/xslt/photos.xsl"/>
     <property name="xsl_am" value="${basedir}/xslt/additional-materials.xsl"/>
     <property name="xsl_search" value="${basedir}/xslt/search.xsl"/>
@@ -40,19 +40,19 @@
         <classpath location="${basedir}/saxon/saxon9he.jar"/>
     </xslt>
     <delete>
-        <fileset dir="${target}" includes="listbibl.html"/>
+        <fileset dir="${target}" includes="amp-index-works.html"/>
     </delete>
     <delete>
-        <fileset dir="${target}" includes="listplace.html"/>
+        <fileset dir="${target}" includes="amp-index-places.html"/>
     </delete>
     <delete>
-        <fileset dir="${target}" includes="listperson.html"/>
+        <fileset dir="${target}" includes="amp-index-persons.html"/>
     </delete>
     <delete>
-        <fileset dir="${target}" includes="listorg.html"/>
+        <fileset dir="${target}" includes="amp-index-organizations.html"/>
     </delete>
     <delete>
-        <fileset dir="${target}" includes="listevent.html"/>
+        <fileset dir="${target}" includes="amp-index-events.html"/>
     </delete>
     <xslt style="${xsl_indices}" basedir="${indices}" destdir="${target}" includes="*.xml">
         <factory name="net.sf.saxon.TransformerFactoryImpl"/>

diff --git a/build_app/python/make_ts_index.py b/build_app/python/make_ts_index.py
@@ -64,57 +64,122 @@
             'type': 'string[]',
             'facet': True,
             'optional': True
-        }
+        },
+        {
+            'name': 'events',
+            'type': 'string[]',
+            'facet': True,
+            'optional': True
+        },
+        {
+            'name': 'document_type',
+            'type': 'string[]',
+            'optional': True,
+            'facet': True,
+        },
+        {
+            'name': 'image',
+            'type': 'string',
+        },
+        {"name": "page_int", "type": "int32", "sort": True},
+        {"name": "page_str", "type": "string"},
+        {"name": "comments_count", "type": "int32"},
+        {"name": "comments_bool", "type": "bool", "facet": True},
+        {"name": "poem_bool", "type": "bool", "facet": True},
+        {"name": "poem_count", "type": "int32"}
     ]
 }
 
 client.collections.create(current_schema)
 
 
+def get_context(xpath):
+    comments = False
+    comments_len = 0
+    for p in body:
+        try:
+            ent = p.xpath(xpath, namespaces={'tei': "http://www.tei-c.org/ns/1.0"})
+        except AttributeError:
+            ent = []
+        if len(ent) > 0:
+            comments = True
+            comments_len += len(ent)
+    return (comments, comments_len)
+
+
 def get_entities(ent_type, ent_node, ent_name):
     entities = []
     e_path = f'.//tei:rs[@type="{ent_type}"]/@ref'
     for p in body:
-        ent = p.xpath(e_path, namespaces={'tei': "http://www.tei-c.org/ns/1.0"})
+        try:
+            ent = p.xpath(e_path, namespaces={'tei': "http://www.tei-c.org/ns/1.0"})
+        except AttributeError:
+            ent = []
         ref = [ref.replace("#", "") for e in ent if len(ent) > 0 for ref in e.split()]
-        for r in ref:
-            p_path = f'.//tei:{ent_node}[@xml:id="{r}"]//tei:{ent_name}[1]'
-            en = doc.any_xpath(p_path)
-            if en:
-                entity = " ".join(" ".join(en[0].xpath(".//text()")).split())
-                if len(entity) != 0:
-                    entities.append(entity)
-                else:
-                    with open("log-entities.txt", "a") as f:
-                        f.write(f"{r} in {record['id']}\n")
+        if len(ref) > 0:
+            for r in ref:
+                p_path = f'.//tei:{ent_node}[@xml:id="{r}"]//tei:{ent_name}[1]'
+                en = doc.any_xpath(p_path)
+                if en:
+                    entity = " ".join(" ".join(en[0].xpath(".//text()")).split())
+                    if len(entity) != 0:
+                        entities.append(entity)
+                    else:
+                        with open("log-entities.txt", "a") as f:
+                            f.write(f"{r} in {record['id']}\n")
     return [ent for ent in sorted(set(entities))]
 
 
 records = []
 cfts_records = []
 for x in tqdm(files, total=len(files)):
     doc = TeiReader(xml=x, xsl='./xslt/preprocess_typesense.xsl')
-    facs = doc.any_xpath('.//tei:body/tei:div/tei:pb/@facs')
-    pages = 0
+    try:
+        corresp = doc.any_xpath('.//tei:text[@type="letter"]')[0]
+    except IndexError:
+        corresp = False
+        try:
+            photo = doc.any_xpath('.//tei:text[@type="photograph"]')[0]
+        except IndexError:
+            photo = False
+    facs = doc.any_xpath('.//tei:body/tei:div//tei:pb')
+    pages = 1
     for v in facs:
-        p_group = f""".//tei:body/tei:div/tei:p[preceding-sibling::tei:pb[1]/@facs='{v}']|
-                      .//tei:body/tei:div/tei:lg[preceding-sibling::tei:pb[1]/@facs='{v}']"""
+        facs_id = v.attrib['facs']
+        try:
+            facs_page = v.attrib['ed']
+        except KeyError:
+            facs_page = str(pages)
+        facs_type = v.attrib['type']
+        p_group = f""".//tei:body/tei:div/tei:p[preceding-sibling::tei:pb[1]/@facs='{facs_id}']|
+                      .//tei:body/tei:div/tei:lg[preceding-sibling::tei:pb[1]/@facs='{facs_id}']|
+                      .//tei:body/tei:div/tei:div/tei:ab[preceding-sibling::tei:pb[1]/@facs='{facs_id}']|
+                      .//tei:body/tei:div/tei:div/tei:div[preceding-sibling::tei:pb[1]/@facs='{facs_id}']"""
         body = doc.any_xpath(p_group)
-        pages += 1
         cfts_record = {
             'project': 'amp',
         }
         record = {}
-        record['id'] = os.path.split(x)[-1].replace('.xml', f".html?tab={str(pages)}")
+        if len(facs_id) > 0:
+            record['image'] = facs_id.split("/")[-2]
+        record['page_int'] = int(pages)
+        record['page_str'] = str(facs_page)
+        if corresp:
+            record["document_type"] = ["Correspondence"]
+        elif photo:
+            record["document_type"] = ["Photograph"]
+        else:
+            record["document_type"] = ["Other"]
+        record['id'] = os.path.split(x)[-1].replace('.xml', f".html?tab={facs_page}")
         cfts_record['id'] = record['id']
         cfts_record['resolver'] = f"https://amp.acdh.oeaw.ac.at/{record['id']}"
         record['rec_id'] = os.path.split(x)[-1]
         cfts_record['rec_id'] = record['rec_id']
         r_title = " ".join(" ".join(doc.any_xpath('.//tei:titleStmt/tei:title[@level="a"]/text()')).split())
-        record['title'] = f"{r_title} Page {str(pages)}"
+        record['title'] = r_title
         cfts_record['title'] = record['title']
         try:
-            date_str = doc.any_xpath('//tei:origin/tei:origDate/@notBefore')[0]
+            date_str = doc.any_xpath('//tei:origin/tei:origDate/@notBefore-iso')[0]
         except IndexError:
             date_str = doc.any_xpath('//tei:origin/tei:origDate/text()')[0]
             data_str = date_str.split("--")[0]
@@ -151,11 +216,19 @@ def get_entities(ent_type, ent_node, ent_name):
             ent_node = "bibl"
             record['works'] = get_entities(ent_type=ent_type, ent_node=ent_node, ent_name=ent_name)
             cfts_record['works'] = record['works']
+            # get unique events per page
+            ent_type = "event"
+            ent_name = "label"
+            record['events'] = get_entities(ent_type=ent_type, ent_node=ent_type, ent_name=ent_name)
+            cfts_record['events'] = record['events']
             record['full_text'] = "\n".join(" ".join("".join(p.itertext()).split()) for p in body)
             if len(record['full_text']) > 0:
                 records.append(record)
                 cfts_record['full_text'] = record['full_text']
                 cfts_records.append(cfts_record)
+            record['comments_bool'], record['comments_count'] = get_context('.//node()[@ana]')
+            record['poem_bool'], record['poem_count'] = get_context('.//tei:l')
+        pages += 1
 
 make_index = client.collections['amp'].documents.import_(records)
 print(make_index)