Genetic Algorithm Implementation #6

Open · wants to merge 11 commits into base
160 changes: 128 additions & 32 deletions models/steps.py
@@ -1,20 +1,17 @@
 import copy

+import numpy as np
+import utils
 from models.cluster import KMeansClusters, create_kselection_model
 from models.factor_analysis import FactorAnalysis
 from models.preprocessing import (get_shuffle_indices, consolidate_columnlabels)
+from models.gp import GPRNP
 from models.lasso import LassoPath
+from models.xgboost import XGBR
 from models.util import DataUtil
+from models.parameters import params
-from models.preprocessing import (get_shuffle_indices, consolidate_columnlabels)
 from models.rf import RFR
-
-from models.gp import GPRNP
-
-from sklearn.preprocessing import StandardScaler
-from models.xgboost import XGBR
 from sklearn.gaussian_process import GaussianProcessRegressor
-import numpy as np
-from models.parameters import params

-import utils
+from sklearn.preprocessing import StandardScaler


@@ -28,7 +25,7 @@ def run_workload_characterization(metric_data):
     # Remove any constant columns
     nonconst_matrix = []
     nonconst_columnlabels = []
-    for col, (_,v) in zip(matrix.T, enumerate(columnlabels)):
+    for col, (_, v) in zip(matrix.T, enumerate(columnlabels)):
         if np.any(col != col[0]):
             nonconst_matrix.append(col.reshape(-1, 1))
             nonconst_columnlabels.append(v)
@@ -71,12 +68,12 @@ def run_workload_characterization(metric_data):
     return pruned_metrics


-def run_knob_identification(knob_data,metric_data,mode, logger):
+def run_knob_identification(knob_data, metric_data, mode, logger):
     knob_matrix = knob_data['data']
     knob_columnlabels = knob_data['columnlabels']

     metric_matrix = metric_data['data']
-    #metric_columnlabels = metric_data['columnlabels']
+    # metric_columnlabels = metric_data['columnlabels']

     encoded_knob_columnlabels = knob_columnlabels
     encoded_knob_matrix = knob_matrix
@@ -92,20 +89,20 @@ def run_knob_identification(knob_data, metric_data, mode, logger):
     shuffled_metric_matrix = standardized_metric_matrix[shuffle_indices, :]

     if mode == 'lasso':
         # run lasso algorithm
         lasso_model = LassoPath()
         lasso_model.fit(shuffled_knob_matrix, shuffled_metric_matrix, encoded_knob_columnlabels)
         encoded_knobs = lasso_model.get_ranked_features()
     elif mode == "XGB":
         xgb_model = XGBR()
-        xgb_model.fit(shuffled_knob_matrix, shuffled_metric_matrix,encoded_knob_columnlabels)
+        xgb_model.fit(shuffled_knob_matrix, shuffled_metric_matrix, encoded_knob_columnlabels)
         encoded_knobs = xgb_model.get_ranked_knobs()
         feature_imp = xgb_model.get_ranked_importance()
         logger.info('feature importance')
         logger.info(feature_imp)
     elif mode == "RF":
         rf = RFR()
-        rf.fit(shuffled_knob_matrix,shuffled_metric_matrix,encoded_knob_columnlabels)
+        rf.fit(shuffled_knob_matrix, shuffled_metric_matrix, encoded_knob_columnlabels)
         encoded_knobs = rf.get_ranked_features()
         feature_imp = rf.get_ranked_importance()
         logger.info('feature importance')
@@ -115,6 +112,7 @@ def run_knob_identification(knob_data, metric_data, mode, logger):

     return consolidated_knobs

+
 # def run_workload_mapping(knob_data, metric_data, target_knob, target_metric, params):
 #     '''
 #     Args:
@@ -200,24 +198,29 @@ def run_knob_identification(knob_data, metric_data, mode, logger):


 def configuration_recommendation(target_knob, target_metric, logger, gp_type='numpy', db_type='redis', data_type='RDB'):
-    X_columnlabels, X_scaler, X_scaled, y_scaled, X_max, X_min, _ = utils.process_training_data(target_knob, target_metric, data_type)
+    X_columnlabels, X_scaler, X_scaled, y_scaled, X_max, X_min, _ = utils.process_training_data(target_knob,
+                                                                                                target_metric,
+                                                                                                data_type)

     num_samples = params["NUM_SAMPLES"]
     X_samples = np.empty((num_samples, X_scaled.shape[1]))
     for i in range(X_scaled.shape[1]):
         X_samples[:, i] = np.random.rand(num_samples) * (X_max[i] - X_min[i]) + X_min[i]

+    res = None
+    model = None
     if gp_type == 'numpy':
         # DO GPRNP
-        model = GPRNP(length_scale = params["GPR_LENGTH_SCALE"],
-                      magnitude=params["GPR_MAGNITUDE"],
-                      max_train_size=params['GPR_MAX_TRAIN_SIZE'],
-                      batch_size=params['GPR_BATCH_SIZE'])
-        model.fit(X_scaled,y_scaled,ridge=params["GPR_RIDGE"])
-        res = model.predict(X_samples).ypreds
-        logger.info('do GPRNP')
-        del model
+        logger.info('do GPRNP\n')
+
+        model = GPRNP(
+            length_scale=params["GPR_LENGTH_SCALE"],
+            magnitude=params["GPR_MAGNITUDE"],
+            max_train_size=params['GPR_MAX_TRAIN_SIZE'],
+            batch_size=params['GPR_BATCH_SIZE']
+        )
+        model.fit(X_scaled, y_scaled, ridge=params["GPR_RIDGE"])
+        # res = model.predict(X_samples).ypreds
     elif gp_type == 'scikit':
         # # DO SCIKIT-LEARN GP
         # model = GaussianProcessRegressor().fit(X_scaled,y_scaled)
@@ -226,13 +229,106 @@ def configuration_recommendation(target_knob, target_metric, logger, gp_type='numpy', db_type='redis', data_type='RDB'):

         from sklearn.gaussian_process.kernels import DotProduct
         GPRkernel = DotProduct(sigma_0=0.5)
-        model = GaussianProcessRegressor(kernel = GPRkernel,
-                                         alpha = params["ALPHA"]).fit(X_scaled,y_scaled)
+        model = GaussianProcessRegressor(kernel=GPRkernel,
+                                         alpha=params["ALPHA"]).fit(X_scaled, y_scaled)
         res = model.predict(X_samples)
-        del model
+        # del model
     else:
         raise Exception("gp_type should be one of (numpy and scikit)")

+    res = model.predict(X_samples).ypreds
+    prev_best_score = res.ravel().max()
+
+    """
+    Genetic Algorithm
+    """
+    logger.info("#######################")
+    logger.info("Genetic Algorithm start")
+    logger.info("#######################\n")
+
+    # Hyper parameters
+    max_generation = 100
+    min_generation = 30
+    mutation_rate = 0.01
+
+    # Initialization
+    population = copy.deepcopy(X_samples)
+    n_genes = population.shape[1]
+
+    generation = 0
+    best_score = 0
+    best_generation = 0
+    best_population = copy.deepcopy(population)
+    while generation <= max_generation:
+        logger.info(f"Generation: {generation}")
+        n_population = population.shape[0]
+        logger.info(f"Number of Population: {n_population}")
+
+        # Fitness Evaluation
+        res = model.predict(population).ypreds
+        scores = res.ravel()
+        max_score = scores.max()
+        if generation >= min_generation and max_score <= best_score:
+            logger.info(f"Current generation's max score {max_score} <= best score {best_score}")
+            logger.info(f"Evolution stops here...\n")
+            break
+
+        logger.info(f"Max score: {max_score}\n")
+        if max_score > best_score:
+            best_score = max_score
+            best_generation = generation
+            best_population = copy.deepcopy(population)
+
+        sum_score = np.sum(scores)
+        weights = [(score / sum_score, i) for i, score in enumerate(scores)]
+
+        # Selection: top 20%
+        weights.sort(key=lambda x: x[0], reverse=True)
+        divide_n = 5
+        n_parents = n_population // divide_n
+        parents_idx = [weight[1] for weight in weights[:n_parents]]
+
+        # Production
+        np.random.shuffle(parents_idx)
+        middle = n_parents // 2
+        parent1_idxs = parents_idx[:middle]
+        parent2_idxs = parents_idx[middle:]
+        new_population = []
+        for i in range(middle):
+            parent1_idx = parent1_idxs[i]
+            parent2_idx = parent2_idxs[i]
+
+            # Crossover & Mutation
+            parent1 = population[parent1_idx, :]
+            parent2 = population[parent2_idx, :]
+            for j in range(divide_n * 2):
+                child = []
+                for k in range(n_genes):
+                    flag = np.random.uniform(0, 1)
+                    if flag < mutation_rate:
+                        child.append((parent1[k] + parent2[k]) / 2)
+                    elif flag < 0.5:
+                        child.append(parent1[k])
+                    else:
+                        child.append(parent2[k])
+                new_population.append(child)
+
+        # Go to next generation
+        population = np.array(new_population)
+        generation += 1
+
+    logger.info("#######################")
+    logger.info("Genetic Algorithm end")
+    logger.info("#######################\n")
+
+    logger.info(f"Best score")
+    logger.info(f"Before GA: {prev_best_score}")
+    logger.info(f"After GA: {best_score}, at generation {best_generation}\n")
+
+    # After evolution complete
+    res = model.predict(best_population).ypreds
+    del model
+
     best_config_idx = np.argmax(res.ravel())
     # if len(set(res.ravel()))==1:
     #     logger.info("FAIL TRAIN")
@@ -246,8 +342,8 @@ def configuration_recommendation(target_knob, target_metric, logger, gp_type='numpy', db_type='redis', data_type='RDB'):
     conf_map = {k: best_config[i] for i, k in enumerate(X_columnlabels)}
     # logger.info("\n\n\n")
     logger.info(conf_map)
-    #convert_dict_to_conf(conf_map, data_type)
+    # convert_dict_to_conf(conf_map, data_type)

     logger.info("FINISH TRAIN")
     print(np.max(res.ravel()))
     return True, np.max(res.ravel()), conf_map
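
For reference, the evolutionary loop this PR adds to configuration_recommendation can be read in isolation. Below is a minimal standalone sketch of the same scheme (top-20% selection, uniform crossover, blend "mutation", early stop after a warm-up period), with a toy quadratic fitness function standing in for the GP model's predict(...).ypreds. The names toy_fitness and evolve and the demo data are illustrative stand-ins, not code from this PR.

import numpy as np


def toy_fitness(population):
    # Stand-in for model.predict(population).ypreds: higher is better,
    # maximized when every gene equals 0.7.
    return -np.sum((population - 0.7) ** 2, axis=1)


def evolve(population, fitness_fn, max_generation=100, min_generation=30,
           mutation_rate=0.01, divide_n=5):
    n_genes = population.shape[1]
    best_score, best_population = -np.inf, population.copy()
    for generation in range(max_generation + 1):
        scores = fitness_fn(population)
        max_score = scores.max()
        # Stop once a generation fails to beat the best score (after warm-up).
        if generation >= min_generation and max_score <= best_score:
            break
        if max_score > best_score:
            best_score, best_population = max_score, population.copy()

        # Selection: keep the fittest 1/divide_n (top 20%) as parents.
        n_parents = population.shape[0] // divide_n
        parents_idx = np.argsort(scores)[::-1][:n_parents]
        np.random.shuffle(parents_idx)

        middle = n_parents // 2
        new_population = []
        for p1, p2 in zip(parents_idx[:middle], parents_idx[middle:]):
            parent1, parent2 = population[p1], population[p2]
            # Each pair yields 2 * divide_n children, restoring population size.
            for _ in range(divide_n * 2):
                flags = np.random.uniform(0, 1, n_genes)
                # Uniform crossover: take each gene from one parent or the other.
                child = np.where(flags < 0.5, parent1, parent2)
                # "Mutation" as in the PR: blend both parents' genes.
                blend = flags < mutation_rate
                child[blend] = (parent1[blend] + parent2[blend]) / 2
                new_population.append(child)
        population = np.array(new_population)
    return best_score, best_population


if __name__ == "__main__":
    best_score, _ = evolve(np.random.rand(100, 8), toy_fitness)
    print(best_score)

Because the surrogate is only consulted through fitness_fn, the same loop works unchanged with either the numpy or the scikit GP branch.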
2 changes: 1 addition & 1 deletion tuner/train.py
@@ -155,7 +155,7 @@

 assert all(train_knob_data['rowlabels'] == train_internal_data['rowlabels'])

-print(f'{opt.persistence} knob data = {len(knob_data)}, {knob_data.keys()}, {knob_data}')
+# print(f'{opt.persistence} knob data = {len(knob_data)}, {knob_data.keys()}, {knob_data}')
 # print(f'{opt.persistence} metric data = {len(metric_data)}, {metric_data.keys()}, {metric_data}')

 ### METRICS SIMPLIFICATION STAGE ###
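
The gp_type == 'scikit' branch above builds its surrogate with scikit-learn's GaussianProcessRegressor. As a self-contained sketch of just that modelling step, with random stand-in data and a fixed alpha=0.1 in place of params["ALPHA"] (assumptions for illustration, not values from this repository):

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct

# Random stand-ins for the scaled knob matrix and metric vector.
rng = np.random.default_rng(0)
X_scaled = rng.random((50, 4))
y_scaled = X_scaled @ rng.random(4)

# Same construction as the 'scikit' branch: a dot-product kernel with
# sigma_0=0.5, plus observation noise alpha.
model = GaussianProcessRegressor(kernel=DotProduct(sigma_0=0.5),
                                 alpha=0.1).fit(X_scaled, y_scaled)

# Score candidate configurations; the GA treats these as fitness values.
X_samples = rng.random((10, 4))
res = model.predict(X_samples)
print(res.max())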