-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
127 lines (108 loc) · 3.79 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import argparse
import os
import json
import csv
import sys
from elasticsearch import Elasticsearch
from PyQt6.QtWidgets import QApplication
from es_client import ESClient
from GUI import MyWidget
from searcher import Searcher
import yaml
# Load run-time paths from the YAML configuration file. The path is relative,
# so it is resolved against the current working directory.
yaml_path = './config.yaml'
with open(yaml_path, mode='rb') as f:
    # The file is parsed as a multi-document YAML stream; only the first
    # document is kept.
    yaml_dict = [*yaml.safe_load_all(f)][0]

# Locations handed to ESClient.index_meta / ESClient.index_trans below.
META_PATH, TRANS_ROOT = yaml_dict["meta_path"], yaml_dict["trans_root"]
# Index configuration: analysis settings plus one field mapping per index.
configurations = {
    # TODO: the meaning of the individual settings parameters is not yet
    # understood.
    # NOTE(review): no mapping below references "ngram_analyzer", and the
    # visible code only passes the mappings to create_index -- confirm
    # whether "settings" is consumed anywhere.
    "settings": {
        "index": {"number_of_replicas": 2},
        "analysis": {
            "filter": {
                "ngram_filter": {
                    "type": "edge_ngram",
                    "min_gram": 2,
                    "max_gram": 15,
                },
            },
            "analyzer": {
                "ngram_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase", "ngram_filter"],
                },
            },
        },
    },
    # Mapping for the 'episodes' index (show / episode metadata).
    "ep_mappings": {
        "properties": {
            **{
                field: {"type": "keyword"}
                for field in (
                    "show_filename_prefix",
                    "episode_filename_prefix",
                    "language",
                )
            },
            **{
                field: {"type": "text"}
                for field in (
                    "show_name",
                    "show_description",
                    "publisher",
                    "episode_name",
                    "episode_description",
                )
            },
            "duration": {"type": "double"},  # in minutes
        },
    },
    # Mapping for the 'transcripts' index (timed transcript segments).
    "tr_mappings": {
        "properties": {
            **{
                field: {"type": "keyword"}
                for field in ("show_filename_prefix", "episode_filename_prefix")
            },
            "transcript": {"type": "text"},
            # startTime is in seconds.
            **{
                field: {"type": "double"}
                for field in ("startTime", "endTime", "totalTime")
            },
        },
    },
}
# Sample "match" query: looks for the term "second" in a "description" field.
# NOTE(review): this is not referenced by the visible code, and neither mapping
# defined in this file has a field named "description" (they use
# "show_description" / "episode_description") -- confirm whether it is still needed.
search_query = {"match": {"description": "second"}}
def getargs(argv=None):
    """Parse the command-line options for this script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``.

    Returns:
        argparse.Namespace whose ``mode`` attribute is ``'index'`` (the
        default) or ``'search'``.

    Raises:
        SystemExit: Raised by argparse when an unknown option or an
            unsupported mode is given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-m', '--mode',
        type=str,
        default='index',
        # Restrict to the modes the script handles, so a typo is reported
        # instead of the program silently doing nothing.
        choices=('index', 'search'),
        help='index or search',
    )
    return parser.parse_args(argv)
def _prepare_index(es, index_name, mappings):
    """Create ``index_name``, asking first if an index of that name exists.

    Args:
        es: The ESClient wrapper; its ``es_client`` attribute is used for the
            exists/delete calls and its ``create_index`` method for creation.
        index_name: Name of the index to (re)create.
        mappings: Mapping definition forwarded to ``es.create_index``.

    Returns:
        True when a fresh index was created and should be populated; False
        when the index already exists and the user chose to keep it.
    """
    if es.es_client.indices.exists(index=index_name):
        answer = input(f"Index “{index_name}” already exists. Do you want to delete it and regenerate? (y/n): ")
        if answer.strip().lower() != 'y':
            # The user declined to regenerate: leave the existing index alone
            # rather than trying to create an index that is already there.
            return False
        es.es_client.indices.delete(index=index_name)
    es.create_index(index_name=index_name, mappings=mappings)
    return True


def main():
    """Run the script in 'index' mode (build both indices) or 'search' mode (GUI)."""
    args = getargs()
    # init es client
    es = ESClient(url="http://localhost:9200")

    if args.mode == 'index':
        ######################
        # Index documents
        ######################
        if _prepare_index(es, 'episodes', configurations['ep_mappings']):
            print("Indexing metadata...")
            es.index_meta(META_PATH)  # Index metadata

        if _prepare_index(es, 'transcripts', configurations['tr_mappings']):
            print("Indexing transcripts...")
            es.index_trans(TRANS_ROOT)  # Index transcripts
    elif args.mode == 'search':
        ######################
        # Execute search query
        ######################
        app = QApplication(sys.argv)
        search_engine = Searcher(es.es_client)
        # Hold a reference to the widget for the lifetime of the event loop.
        # NOTE(review): nothing here calls w.show(); presumably MyWidget shows
        # itself -- confirm.
        w = MyWidget(search_engine)
        sys.exit(app.exec())


if __name__ == "__main__":
    main()