-
Notifications
You must be signed in to change notification settings - Fork 0
/
make_ts_index.py
77 lines (66 loc) · 2.2 KB
/
make_ts_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import glob
import os
from typesense.api_call import ObjectNotFound
from acdh_cfts_pyutils import TYPESENSE_CLIENT as client, CFTS_COLLECTION
from acdh_tei_pyutils.tei import TeiReader
from acdh_tei_pyutils.utils import extract_fulltext
from tqdm import tqdm
# --- configuration ---------------------------------------------------------
# Name of the project-specific Typesense collection.
COLLECTION_NAME = "grundbuecher"
# TEI edition files that get indexed.
files = glob.glob("./data/editions/*.xml")
# Fully qualified element names handed to extract_fulltext() as its
# tag_blacklist when the body text is extracted.
tag_blacklist = ["{http://www.tei-c.org/ns/1.0}abbr"]

# --- (re)create the collection ---------------------------------------------
# Delete any previous collection first so it is rebuilt from scratch below.
try:
    client.collections[COLLECTION_NAME].delete()
except ObjectNotFound:
    # Nothing to delete (presumably the collection does not exist yet).
    pass

# Four sortable string fields plus an optional, facetable integer "year".
current_schema = {
    "name": COLLECTION_NAME,
    "enable_nested_fields": True,
    "fields": [
        *(
            {"name": field_name, "type": "string", "sort": True}
            for field_name in ("id", "rec_id", "title", "full_text")
        ),
        {
            "name": "year",
            "type": "int32",
            "optional": True,
            "facet": True,
            "sort": True,
        },
    ],
}
client.collections.create(current_schema)
# Build one document per TEI file for the project collection (`records`)
# and a parallel one for the CFTS collection (`cfts_records`).
records = []
cfts_records = []
for xml_path in tqdm(files, total=len(files)):
    doc = TeiReader(xml_path)
    try:
        body = doc.any_xpath(".//tei:body")[0]
    except IndexError:
        # No <tei:body> in this file: there is nothing to index, skip it.
        continue

    # File name without ".xml"; used as both "id" and "rec_id", so it is
    # derived exactly once.
    doc_id = os.path.split(xml_path)[-1].replace(".xml", "")

    # NOTE: a file without a titleStmt/title raises IndexError here and
    # aborts the run (unchanged from the previous behaviour).
    title = extract_fulltext(
        doc.any_xpath(".//tei:titleStmt/tei:title[1]")[0]
    )
    full_text = extract_fulltext(body, tag_blacklist=tag_blacklist)

    record = {
        "id": doc_id,
        "rec_id": doc_id,
        "title": title,
        "full_text": full_text,
    }
    # Same content plus the project name and a link to the rendered page.
    cfts_record = {
        "project": COLLECTION_NAME,
        "id": doc_id,
        "resolver": f"https://grundbuecher.acdh.oeaw.ac.at/{doc_id}.html",
        "rec_id": doc_id,
        "title": title,
        "full_text": full_text,
    }

    records.append(record)
    cfts_records.append(cfts_record)
# Bulk-import the project documents into the freshly created collection and
# print whatever the client returns for the import.
project_import_result = client.collections[COLLECTION_NAME].documents.import_(
    records
)
print(project_import_result)
print(f"done with indexing {COLLECTION_NAME}")

# Import the parallel documents into CFTS_COLLECTION (from acdh_cfts_pyutils)
# with action "upsert".
cfts_import_result = CFTS_COLLECTION.documents.import_(
    cfts_records, {"action": "upsert"}
)
print(cfts_import_result)
print(f"done with cfts-index {COLLECTION_NAME}")