forked from cmantas/tiramola_v3
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Coordinator.py
209 lines (168 loc) · 7.03 KB
/
Coordinator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
__author__ = 'cmantas'
from time import sleep
import CassandraCluster as Servers
from ClientsCluster import my_Clients as Clients
from lib.persistance_module import env_vars, home
from Monitoring import MonitorVms
from new_decision_module import RLDecisionMaker as DM
from lib.tiramola_logging import get_logger
from time import time
from os import remove
from threading import Thread
####### STATIC VARS ###############
# Module-level logger; writes to files/logs/Coordinator.log under the project home.
my_logger = get_logger('COORDINATOR', 'INFO', logfile=home+'files/logs/Coordinator.log')
my_logger.debug("--------- NEW RUN -----------------")
#the (pending) decision at the present moment, as produced by the decision module
decision = None
# handle of the background Thread currently implementing a decision (None when idle)
running_process=None
#get the endpoint for the monitoring system (one monitor per cluster side)
monitor_clients = MonitorVms(Clients.get_monitoring_endpoint())
monitor_servers = MonitorVms(Servers.get_monitoring_endpoint())
# last exception raised by a background action thread; None means "no error so far"
error = None
#check if cluster exists -- this coordinator only drives an already-created cluster
if Servers.exists():
    my_logger.info( "Cluster exists using it as is")
    #make sure no workload is running
else:
    # refuse to start without a cluster; exit code -1 signals the failure to the caller
    my_logger.error("Create the cluster first and then run the coordinator")
    exit(-1)
def implement_decision():
    """
    Asynchronously implement the decision most recently stored in the global
    `decision` by the run() loop.

    Side effects: adds or removes server nodes, refreshes the clients' host
    files and updates the decision module's pending_action/currentState.
    On failure the exception is stored in the global `error`, which the main
    loop detects via check_for_error().
    """
    global decision
    action = decision["action"]
    count = decision['count']
    try:
        if action == "ADD":
            decision_module.pending_action = action
            my_logger.debug("Will add %d nodes" % count)
            Servers.add_nodes(count)
            # artificially delay the decision in order to discard transient measurements
            my_logger.info("Sleeping! (artificial delay)")
            sleep(200 +env_vars['extra_decision_delay_per_node']*count)
        elif action == "REMOVE":
            decision_module.pending_action = action
            my_logger.debug("Will remove %d nodes" % count)
            Servers.remove_nodes(count)
        else:
            # not supposed to be here for a PASS decision -- nothing to implement
            return
        #update the hosts files in clients
        Clients.update_hostfiles(Servers.get_hosts())
        # update the state
        decision_module.pending_action = None
        decision_module.currentState = Servers.node_count()
    except Exception as e:
        # in case the action failed, set the global error var so that
        # check_for_error() re-raises it in the main thread.
        global error
        error = e
        # BUG FIX NOTE: the original code assigned `running_process = None`
        # here without a `global` declaration, creating a dead local variable
        # that never cleared the module-level thread handle. The assignment
        # was a no-op and has been removed; the main thread owns (and joins)
        # `running_process` itself in run()/check_for_error().
def check_for_error():
    """
    Check whether a background implement_decision() thread stored an
    exception in the global `error`, and if so re-raise it in the caller's
    (main) thread after joining the worker.

    @raise Exception: the exception captured by the failed action thread.
    """
    global error
    if error is not None:
        my_logger.error("I detected an error in a previous action. Raising exception")
        my_logger.error("Message:" + str(error))
        # BUG FIX: guard against a None handle -- joining unconditionally
        # raised AttributeError when no worker thread was ever started.
        if running_process is not None:
            running_process.join()
        raise error
def run(timeout=None):
    """
    Runs cluster with automatic decision taking.

    Main control loop: periodically fetches client/server metrics, asks the
    decision module for an action and implements non-PASS decisions in a
    background thread (joining any previous action first).

    @param timeout: the time in seconds this run should last (None = run forever)
    @raise Exception: any error captured by a background action thread
                      (propagated via check_for_error()).
    """
    my_logger.debug("run: Time starts now, the experiment will take %s sec" % (str(timeout)))
    # convert relative timeout to absolute time
    if timeout is not None:
        timeout = time() + timeout
    #set global error to None so stale errors from a previous run are ignored
    global error
    error = None
    # init the decision module
    global decision_module
    decision_module = DM(Servers)
    # the time interval between metrics refresh
    metrics_interval = env_vars["metric_fetch_interval"]
    # main loop that fetches metrics and takes decisions
    while (timeout is None) or (time() <= timeout):
        # re-raise any error produced by the previous (asynchronous) action
        check_for_error()
        sleep(metrics_interval)
        # refresh the metrics
        client_metrics = monitor_clients.refreshMetrics()
        server_metrics = monitor_servers.refreshMetrics()
        #take a decision based on the new metrics
        global decision
        decision = decision_module.take_decision(client_metrics, server_metrics)
        # nothing to implement for an absent or PASS decision
        if decision is None or decision["action"] == "PASS":
            continue
        # asynchronously implement that decision, first waiting for any
        # still-running previous action to finish
        global running_process
        if running_process is not None:
            running_process.join()
        running_process = Thread(target=implement_decision, args=())
        running_process.start()
    # DONE -- join the running_process so no action outlives the run
    if running_process is not None:
        running_process.join()
    my_logger.info(" run is finished")
def train():
    """
    Runs a training phase in order to collect a training set of metrics for
    the given cluster.

    Overrides several env_vars with the training settings, bootstraps a
    minimum-size cluster, pre-loads it with records, then runs one sinusoid
    workload period per cluster size between min and max, manually adding one
    node after each period while the decision module only records metrics
    (gain forced to '0').
    """
    #change the gain function for training purposes
    env_vars['gain'] = '0'
    # load the training vars into the regular enviroment vars
    t_vars = env_vars["training_vars"]
    env_vars['decision_interval'] = t_vars['decision_interval']
    env_vars['period'] = t_vars['period']
    env_vars['max_cluster_size'] = t_vars['max_cluster_size']
    env_vars['min_cluster_size'] = t_vars['min_cluster_size']
    env_vars["add_nodes"] = 1
    env_vars["rem_nodes"] = 1
    env_vars["measurements_file"] = env_vars["training_file"]
    env_vars['decision_threshold'] = 0
    # remove the old measurements/training file so that it is replaced.
    # best-effort: the file may simply not exist yet, so only OSError is
    # swallowed (the original bare `except` also hid KeyboardInterrupt).
    try:
        remove(env_vars["measurements_file"])
    except OSError:
        pass
    # # Sanity-Check the nodecount
    # if Servers.node_count() != t_vars['min_cluster_size']:
    #     my_logger.error("TRAINING: Start training with the Minimum cluster size, %d (now:%d)" %(t_vars['min_cluster_size'], Servers.node_count()))
    #     exit()
    # start from a clean, minimum-size cluster
    Clients.kill_nodes()
    Servers.kill_nodes()
    Servers.bootstrap_cluster(t_vars['min_cluster_size'])
    svr_hosts = Servers.get_hosts(private=env_vars["private_network"])
    Clients.run({'type': 'load', 'servers': Servers.get_hosts(), 'records': t_vars['records']})
    #create the parameters dictionary for the training phase
    params = {'type': 'sinusoid', 'servers': svr_hosts, 'target': t_vars['target_load'],
              'offset': t_vars['offset_load'], 'period': t_vars['period']}
    # init the decision module
    global decision_module
    decision_module = DM(Servers)
    #the time interval between metrics refresh
    metrics_interval = env_vars["metric_fetch_interval"]
    # run 1 period of workload for each of the the states between min and max cluster size
    # NOTE(review): the final iteration still performs a manual ADD, so the
    # cluster appears to end at max_cluster_size + 1 nodes -- confirm intended.
    for i in range(env_vars['max_cluster_size'] - t_vars['min_cluster_size'] + 1):
        my_logger.info("iteration "+str(i))
        #run the workload with the specified params to the clients
        Clients.run(params)
        #This should only decide to add a node after a period is passed
        global decision
        #run for 1 period
        timeout = time() + 60*env_vars['period']
        while time() <= timeout:
            #fetch metrics and takes decisions
            sleep(metrics_interval)
            # refresh the metrics
            client_metrics = monitor_clients.refreshMetrics()
            server_metrics = monitor_servers.refreshMetrics()
            #only refresh metrics (gain is '0', so no real decision is taken)
            decision_module.take_decision(client_metrics, server_metrics)
        #manually add a node
        decision = {"action": "ADD", 'count':1}
        # synchronously implement that decision
        implement_decision()
        #stop the clients after one period has passed
        Clients.kill_nodes()
    my_logger.info("TRAINING DONE")
def test_vars():
print env_vars['gain']
print env_vars['max_cluster_size']