-
Notifications
You must be signed in to change notification settings - Fork 0
/
medword_pipeline.py
109 lines (79 loc) · 3.35 KB
/
medword_pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
from shared.load_config import __CONFIG__
import os
import importlib
import json
import preprocess as pp
import embedding_fasttext
import embedding_word2vec
import embedding_word2vec_composite
import model_validation as mv
# Reload the project modules so that edits made during development are picked
# up even when older versions are still cached by the interpreter.
importlib.reload(pp)
importlib.reload(mv)
importlib.reload(embedding_fasttext)
importlib.reload(embedding_word2vec)
# The composite embedding module is imported and used by the script entry
# point as well, so it must not be left stale. It is reloaded last because it
# presumably builds on embedding_word2vec (TODO confirm) and should then see
# the freshly reloaded version of that module.
importlib.reload(embedding_word2vec_composite)
def run_pipeline(embedding):
    """Run the configured stages of the embedding pipeline.

    Depending on the flags found in ``embedding.config.config`` this
    computes new training data, trains a new embedding model and/or
    validates the embedding model, in that order.

    Args:
        embedding: Embedding object exposing a ``config`` attribute (whose
            own ``config`` member is the settings mapping) and a
            ``train_model(train_data_src, emb_model_dir, emb_model_fn)``
            method.

    Returns:
        None. The function also returns early, after printing a message,
        when ``running_mode`` is neither ``'develop'`` nor ``'normal'``.

    Raises:
        KeyError: If a required key is missing from the settings mapping.
    """
    # setup needed libraries, data structures etc.
    pp.setup()

    # ``config`` is the project config object (handed on to preprocessing);
    # ``settings`` is the plain key/value mapping it wraps.
    config = embedding.config
    settings = config.config

    ### script settings ###
    # produce a new train_data file from the data directory
    compute_new_train_data = settings['compute_new_data']
    # train a new embedding model from the train_data file
    train_new_model = settings['train_new_model']
    # run the validation
    run_validation = settings['run_validation']

    # data directories
    running_mode = settings['running_mode']
    if running_mode == 'develop':
        print('Running in DEVELOPPER mode.')
        base_data_dir = settings['develop_base_data_dir']
    elif running_mode == 'normal':
        print('Running in NORMAL mode.')
        base_data_dir = settings['base_data_dir']
    else:
        print("Running mode not recognized: set running_mode to 'normal' or 'develop'")
        return None

    # source paths for embeddings
    emb_model_dir = os.path.join(base_data_dir, 'embeddings/')
    emb_model_fn = settings['embedding_model_filename']

    # Create the embeddings folder if needed; exist_ok avoids the race
    # between a separate existence check and the directory creation.
    os.makedirs(emb_model_dir, exist_ok=True)

    # source paths for train_data
    train_data_dir = os.path.join(base_data_dir, 'train_data/')
    train_data_fn = settings['train_data_filename']
    train_data_src = os.path.join(train_data_dir, train_data_fn)

    # compute new train data if needed
    if compute_new_train_data:
        print("\n*** COMPUTING TRAIN DATA *** ")
        # NOTE: pp.create_train_data(train_data_src, <train_data_dir>/raw_data/,
        # config) was the previously used, now disabled, alternative.
        pp.create_intersection_train_data(train_data_src, train_data_dir,
                                          config)
        print("*** END COMPUTING TRAIN DATA *** ")

    # train embeddings
    if train_new_model:
        print("\n*** TRAINING NEW MODEL *** ")
        embedding.train_model(train_data_src, emb_model_dir, emb_model_fn)
        print("*** END TRAINING NEW MODEL *** ")

    # validate the embedding model
    if run_validation:
        print("\n*** VALIDATING MODEL *** ")
        mv.validate_model(embedding, emb_model_dir, emb_model_fn)
        print("*** END VALIDATING MODEL *** ")
if __name__ == '__main__':
    # Script entry point: build the embedding selected by the
    # 'embedding_method' config entry and run the pipeline with it.
    config = __CONFIG__

    # choose the embedding algorithm
    emb_method = config.config['embedding_method']
    if emb_method == 'fasttext':
        embedding = embedding_fasttext.EmbeddingFasttext(config)
    elif emb_method == 'word2vec':
        embedding = embedding_word2vec.EmbeddingWord2vec(config)
    elif emb_method == 'word2vec-compound':
        embedding = embedding_word2vec_composite.EmbeddingWord2vecComposite(config)
    else:
        # Name the config key that is actually read and list every supported
        # value, so the message matches the branches above.
        message = ('embedding_method (in config) must be "fasttext", '
                   '"word2vec" or "word2vec-compound"')
        print(message)
        raise AttributeError(message)

    run_pipeline(embedding)

    print("end_main")