#
# By using this file, you are agreeing to this product's EULA
#
# This product can be obtained in https://github.com/jespb/Python-STGP
#
# Copyright ©2019-2021 J. E. Batista
#


def _semicolon_list(text):
    '''
    Splits a ";"-separated command-line value (e.g. "a.csv;b.csv") into a list.
    '''
    return text.split(";")


def _flag_value(flag, default, cast=str):
    '''
    Returns the command-line value that follows "flag", converted with "cast".

    Parameters:
    flag (str): The flag to look for in argv (e.g. "-ps").
    default: Value returned unchanged when the flag is not in argv.
    cast (callable): Converts the raw string that follows the flag.

    Raises:
    SystemExit: if the flag is the last argument and has no value after it
                (the original indexing raised a bare IndexError here).
    '''
    if flag not in argv:
        return default
    position = argv.index(flag) + 1
    if position >= len(argv):
        raise SystemExit("Missing value for command-line flag " + flag)
    return cast(argv[position])


#
# Default values
#

# Operators to be used by the models
# Only these operators are available. To add more, edit stgp.Node.calculate(self, sample)
OPERATORS = ["+","-","*","/"]

# Initial Maximum depth
MAX_DEPTH = 6

# Number of models in the population
POPULATION_SIZE = 500

# Maximum number of iterations
MAX_GENERATION = 100

# Fraction of the dataset to be used as training (used by Main_STGP_standalone.py)
TRAIN_FRACTION = 0.70

# Number of individuals to be used in the tournament
TOURNAMENT_SIZE = 5

# Number of best individuals to be automatically moved to the next generation
ELITISM_SIZE = 1

# Shuffle the dataset
SHUFFLE = True

# Maximum depth an individual may reach
LIMIT_DEPTH=17

# Number of runs (used by Main_STGP_standalone.py)
RUNS = 30

# Verbose
VERBOSE = True

# Number of CPU Threads to be used
THREADS = 1


DATASETS_DIR = "datasets/"
OUTPUT_DIR = "results/"

DATASETS = ["heart.csv"]
OUTPUT = "Classification"


#
# Command-line overrides: every flag is optional and replaces one default
#

DATASETS_DIR = _flag_value("-dsdir", DATASETS_DIR)
OUTPUT_DIR = _flag_value("-odir", OUTPUT_DIR)
DATASETS = _flag_value("-d", DATASETS, _semicolon_list)
RUNS = _flag_value("-runs", RUNS, int)
OPERATORS = _flag_value("-op", OPERATORS, _semicolon_list)
MAX_DEPTH = _flag_value("-md", MAX_DEPTH, int)
POPULATION_SIZE = _flag_value("-ps", POPULATION_SIZE, int)
MAX_GENERATION = _flag_value("-mg", MAX_GENERATION, int)
TRAIN_FRACTION = _flag_value("-tf", TRAIN_FRACTION, float)
TOURNAMENT_SIZE = _flag_value("-ts", TOURNAMENT_SIZE, int)
ELITISM_SIZE = _flag_value("-es", ELITISM_SIZE, int)
THREADS = _flag_value("-t", THREADS, int)

# Flags without a value
if "-dontshuffle" in argv:
    SHUFFLE = False

if "-s" in argv:
    VERBOSE = False
# Minimal usage example: train an STGP model on one dataset and print its
# accuracy on a held-out test split.

filename = "heart.csv"

# Open the dataset; the last column is used as the class label
ds = pandas.read_csv("datasets/" + filename)
class_header = ds.columns[-1]

# Split the dataset (70% training, stratified by class, fixed seed)
Tr_X, Te_X, Tr_Y, Te_Y = train_test_split(ds.drop(columns=[class_header]), ds[class_header],
                                          train_size=0.7, random_state=42,
                                          stratify=ds[class_header])

# Train a model
model = STGP()
model.fit(Tr_X, Tr_Y)

# Predict test results
# (the original called "m3gp.predict", a name that is never defined in this script)
pred = model.predict(Te_X)

# Obtain test accuracy (scikit-learn's argument order is y_true, y_pred)
print( accuracy_score(Te_Y, pred) )
def openAndSplitDatasets(which,seed):
    '''
    Opens the dataset "which" from DATASETS_DIR and splits it into training
    and test partitions.

    The last column of the CSV file is used as the class label and the split
    is stratified on it.

    Parameters:
    which (str): File name of the dataset inside DATASETS_DIR.
    seed (int): Random state given to train_test_split.

    Returns:
    The result of train_test_split, in the order Tr_X, Te_X, Tr_Y, Te_Y.
    '''
    if VERBOSE:
        print( "> Opening: ", which )

    # Open dataset
    ds = pandas.read_csv(os.path.join(DATASETS_DIR, which))

    # Read header
    class_header = ds.columns[-1]

    return train_test_split(ds.drop(columns=[class_header]), ds[class_header],
                            train_size=TRAIN_FRACTION, random_state=seed,
                            stratify=ds[class_header])


def run(r,dataset):
    '''
    Trains one STGP model on "dataset" and collects its statistics.

    Parameters:
    r (int): Run identifier; also used as the seed of the train/test split.
    dataset (str): File name of the dataset.

    Returns:
    A tuple (tr_acc, te_acc, tr_rmse, te_rmse, size, times, model_str) built
    from the model's getAccuracyOverTime(), getRMSEOverTime(),
    getSizeOverTime(), getGenerationTimes() and getBestIndividual().
    '''
    if VERBOSE:
        print("> Starting run:")
        print("  > ID:", r)
        print("  > Dataset:", dataset)
        print()

    Tr_X, Te_X, Tr_Y, Te_Y = openAndSplitDatasets(dataset,r)

    # Train a model
    model = STGP(OPERATORS, MAX_DEPTH, POPULATION_SIZE, MAX_GENERATION, TOURNAMENT_SIZE,
        ELITISM_SIZE, LIMIT_DEPTH, THREADS, VERBOSE)
    model.fit(Tr_X, Tr_Y, Te_X, Te_Y)

    # Obtain training results
    accuracy  = model.getAccuracyOverTime()
    rmse      = model.getRMSEOverTime()
    size      = model.getSizeOverTime()
    model_str = str(model.getBestIndividual())
    times     = model.getGenerationTimes()

    # Index 0 holds the training values, index 1 the test values
    tr_acc  = accuracy[0]
    te_acc  = accuracy[1]
    tr_rmse = rmse[0]
    te_rmse = rmse[1]

    if VERBOSE:
        print("> Ending run:")
        print("  > ID:", r)
        print("  > Dataset:", dataset)
        print("  > Final model:", model_str)
        print("  > Training accuracy:", tr_acc[-1])
        print("  > Test accuracy:", te_acc[-1])
        print()

    return (tr_acc,te_acc,
            tr_rmse,te_rmse,
            size, times,
            model_str)


def callm3gp():
    '''
    Runs the algorithm RUNS times on every dataset in DATASETS and writes one
    CSV-like results file per dataset into OUTPUT_DIR.

    A dataset whose results file already exists is skipped, so interrupted
    experiments can be resumed without repeating finished datasets.
    '''
    # Only "directory already exists" is tolerated; any other failure
    # (e.g. missing permissions) is reported instead of silently ignored.
    if OUTPUT_DIR:
        os.makedirs(OUTPUT_DIR, exist_ok=True)

    attributes = ["Training-Accuracy","Test-Accuracy",
                  "Training-RMSE", "Test-RMSE",
                  "Size", "Time",
                  "Final_Model"]

    for dataset in DATASETS:
        outputFilename = os.path.join(OUTPUT_DIR, "stgp_" + dataset)
        if os.path.exists(outputFilename):
            print("Filename: " + outputFilename +" already exists.")
            continue

        # Run the algorithm several times
        results = [run(r, dataset) for r in range(RUNS)]

        # The context manager closes the file even if a write fails
        with open(outputFilename, "w") as out:
            # Write output header
            out.write("Attribute,Run,")
            for i in range(MAX_GENERATION):
                out.write(str(i)+",")
            out.write("\n")

            # Write attributes with value over time
            for ai in range(len(attributes)-1):
                for i in range(RUNS):
                    out.write("\n"+attributes[ai]+","+str(i)+",")
                    out.write( ",".join([str(val) for val in results[i][ai]]))
                out.write("\n")

            # Write the final models
            for i in range(len(results)):
                out.write("\n"+attributes[-1]+","+str(i)+",")
                out.write(results[i][-1])
            out.write("\n")

            # Write some parameters
            out.write("\n\nParameters")
            out.write("\nOperators,"+str(OPERATORS))
            out.write("\nMax Initial Depth,"+str(MAX_DEPTH))
            out.write("\nPopulation Size,"+str(POPULATION_SIZE))
            out.write("\nMax Generation,"+str(MAX_GENERATION))
            out.write("\nTournament Size,"+str(TOURNAMENT_SIZE))
            out.write("\nElitism Size,"+str(ELITISM_SIZE))
            out.write("\nDepth Limit,"+str(LIMIT_DEPTH))
            out.write("\nThreads,"+str(THREADS))


if __name__ == '__main__':
    callm3gp()
-This implementation of STGP uses the following command and flags: +This implementation of STGP can be used in a stand-alone fashion using the following command and flags: -$ python Main_STGP.py +$ python Main_STGP_standalone.py [-d datasets] - This flag expects a set of csv dataset names separated by ";" (e.g., a.csv;b.csv) - By default, the heart.csv dataset is used - [-dontshuffle] - - By using this flag, the dataset will not be shuffled; - - By default, the dataset is shuffled. - - [-dsdir dataset_dir] + [-dsdir dir] - States the dataset directory. - By default "datasets/" is used - Use "-dsdir ./" for the root directory @@ -32,7 +34,7 @@ $ python Main_STGP.py - This flag expects an integer with the maximum number of generations; - By default, this value is set to 100. - [-odir output_dir] + [-odir dir] - States the output directory. - By default "results/" is used - Use "-odir ./" for the root directory @@ -41,13 +43,10 @@ $ python Main_STGP.py - This flag expects a set of operators separated by ";" - Allowed operators: +;-;*;/ - By default, the used operators are the sum, subtraction, multiplication and protected division. + [-ps population_size] - This flag expects an integer with the size of the population; - - By default, this value is set to 200. - - [-r] - - States the this is a regression problem. - - By default the GSGP tries to classify samples as 0 or 1 + - By default, this value is set to 500. [-runs number_of_runs] - This flag expects an integer with the number of runs to be made; @@ -61,5 +60,65 @@ $ python Main_STGP.py - This flag expects an integer with the tournament size; - By default, this value is set to 5. + [-t number_of_threads] + - This flag expects an integer with the number of threads to use while evaluating the population; + - If the value is set to 1, the multiprocessing library will not be used + - By default, this value is set to 1. 
+ + + + + + + +How to import this implementation to your project: + - Download this repository; + - Copy the "stgp/" directory to your project directory; + - Import the STGP class using "from stgp.STGP import STGP". + +How to use this implementation: + $ from stgp.STGP import STGP + $ model = STGP() + $ model.fit( training_x, training_y) + +Arguments for STGP(): + operators -> Operators used by the individual (default: ["+","-","*","/"] ) + max_depth -> Maximum initial depth of the individuals (default: 6) + population_size -> Population size (default: 500) + max_generation -> Maximum number of generations (default: 100) + tournament_size -> Tournament size (default: 5) + elitism_size -> Elitism selection size (default: 1) + limit_depth -> Maximum individual depth (default: 17) + threads -> Number of CPU threads to be used (default: 1) + +Arguments for model.fit(): + Tr_X -> Training samples + Tr_Y -> Training labels + Te_X -> Test samples, used in the standalone version (default: None) + Te_Y -> Test labels, used in the standalone version (default: None) + + +Useful methods: + $ model = STGP() -> starts the model; + $ model.fit(X, Y) -> fits the model to the dataset; + $ model.predict(dataset) -> returns a list with the predictions for the given dataset. + + + + +How to edit this implementation: + Fitness Function ( stgp.Individual ): + - Change the getFitness() method to use your own fitness function; + - This implementation assumes that a higher fitness is always better. To change this, edit the __gt__ method in this class; + - You may use getTrainingClassPredictions() or getTrainingValuePredictions(), together with the training_X and training_Y attributes, to obtain the model's predictions and the training set; + - You can also explore the code behind the standard fitness function; + - Warning: STGP evaluates every model in every run, as such, I do not recommend complex fitness functions. You should invest in fast evaluation methods to train a population. 
def tournament(population,n):
    '''
    Draws "n" entrants at random (with replacement) and returns the winner.

    The population is expected to be sorted from best to worse, so the
    entrant with the lowest index wins.

    Parameters:
    population (list): A list of Individuals, sorted from best to worse.
    n (int): Number of entrants drawn for the tournament.
    '''
    last_index = len(population) - 1
    winner_index = min([randint(0, last_index) for _ in range(n)])
    return population[winner_index]


def getElite(population,n):
    '''
    Returns the first "n" Individuals of the population, i.e. the best ones.

    Parameters:
    population (list): A list of Individuals, sorted from best to worse.
    n (int): Number of Individuals to keep.
    '''
    elite = population[0:n]
    return elite


def getOffspring(population, tournament_size):
    '''
    Picks crossover (STXO) or mutation (STMUT) with equal probability and
    returns the list of offspring it produces: two Individuals for the
    crossover, one for the mutation.

    Parameters:
    population (list): A list of Individuals, sorted from best to worse.
    tournament_size (int): Tournament size used to select the parents.
    '''
    genetic_operator = STXO if random() < 0.5 else STMUT
    return genetic_operator(population, tournament_size)


def discardDeep(population, limit):
    '''
    Returns the Individuals whose depth does not exceed "limit", keeping
    their original order.

    Parameters:
    population (list): A list of Individuals.
    limit (int): Maximum depth allowed.
    '''
    return [ind for ind in population if ind.getDepth() <= limit]


def _toIndividual(template, head):
    '''
    Wraps the tree "head" in a new Individual that shares the operators,
    terminals and max_depth of "template".
    '''
    child = Individual(template.operators, template.terminals, template.max_depth)
    child.copy(head)
    return child


def STXO(population, tournament_size):
    '''
    Standard crossover: selects two parents by tournament, clones their
    trees, swaps one random node (and its sub-nodes) between the clones and
    returns the two resulting Individuals.

    Parameters:
    population (list): A list of Individuals, sorted from best to worse.
    tournament_size (int): Tournament size used to select the parents.
    '''
    parent_a = tournament(population, tournament_size)
    parent_b = tournament(population, tournament_size)

    # The parents stay untouched: only the clones are modified
    head_a = parent_a.head.clone()
    head_b = parent_b.head.clone()

    point_a = head_a.getRandomNode()
    point_b = head_b.getRandomNode()
    point_a.swap(point_b)

    # Both children inherit the configuration of the first parent
    return [_toIndividual(parent_a, head) for head in (head_a, head_b)]


def STMUT(population, tournament_size):
    '''
    Standard mutation: selects one parent by tournament, clones its tree,
    replaces one random node with a newly created sub-tree and returns the
    resulting Individual in a single-element list.

    Parameters:
    population (list): A list of Individuals, sorted from best to worse.
    tournament_size (int): Tournament size used to select the parent.
    '''
    parent = tournament(population, tournament_size)
    mutant_head = parent.head.clone()
    target = mutant_head.getRandomNode()

    fresh_subtree = Node()
    fresh_subtree.create(parent.operators, parent.terminals, parent.max_depth)
    target.swap(fresh_subtree)

    return [_toIndividual(parent, mutant_head)]
Batista # class Individual: - head = None + training_X = None + training_Y = None - trainingAccuracy = None - testAccuracy = None + operators = None + terminals = None + max_depth = None - trainingRMSE = None - testRMSE = None + labelToInt = None + intToLabel = None + + head = None + size = 0 + depth = 0 - trainingPredictions = None - testPredictions = None trainingClassPredictions = None + trainingValuePredictions = None testClassPredictions = None + testValuePredictions = None + fitness = None + model_name = ["SimpleThresholdClassifier"][0] + model = None - def __init__(self, node = None, fromString = None): - if fromString == None: - self.head = Node() if node == None else node - else: - self.head = Node(fromString = fromString.split()) + fitnessType = ["Accuracy", "RMSE"][0] - def predict(self, sample): - return 0 if self.calculate(sample) < 0.5 else 1 + def __init__(self, operators, terminals, max_depth): + self.operators = operators + self.terminals = terminals + self.max_depth = max_depth - def calculate(self, sample): - return self.head.calculate(sample) + def create(self): + self.head = Node() + self.head.create(self.operators, self.terminals, self.max_depth, full=True) + + def copy(self, head): + self.head = head - def getHead(self): - return self.head.clone() - def getDepth(self): - return self.head.getDepth() + def __gt__(self, other): + sf = self.getFitness() + ss = self.getSize() - def getSize(self): - return self.head.getSize() + of = other.getFitness() + os = other.getSize() + + return (sf > of) or (sf == of and ss < os) + + def __ge__(self, other): + return self.getFitness() >= other.getFitness() def __str__(self): return str(self.head) - def __gt__(self, other): - # Using RMSE as fitness - if OUTPUT == "Classification": - return self.getTrainingAccuracy() > other.getTrainingAccuracy() - else: - return self.getTrainingRMSE() < other.getTrainingRMSE() + def fit(self, Tr_x, Tr_y): + ''' + Trains the classifier which will be used in the fitness 
function + ''' + if self.model is None: + self.training_X = Tr_x + self.training_Y = Tr_y + + self.labelToInt = {} + self.intToLabel = {} + classes = list(set(self.training_Y)) + for i in range(len(classes)): + self.labelToInt[classes[i]] = i + self.intToLabel[i] = classes[i] + + if self.model_name == "SimpleThresholdClassifier": + self.model = SimpleThresholdClassifier() + + hyper_X = self.calculate(Tr_x) + + self.model.fit(hyper_X,Tr_y) + + + def getSize(self): + ''' + Returns the total number of nodes within an individual. + ''' + return self.head.getSize() + + + def getDepth(self): + ''' + Returns the depth of individual. + ''' + return self.head.getDepth() + + + def clone(self): + ''' + Returns a deep clone of the individual's list of dimensions. + ''' + ret = Individual() + ret.copy(head.clone()) + return ret + + def convertLabelsToInt(self, Y): + ret = [ self.labelToInt[label] for label in Y ] + return ret + + def convertIntToLabels(self, Y): + ret = [ self.intToLabel[value] for value in Y ] + return ret - ## FITNESS def getFitness(self): - if self.fitness == None: - self.fitness = self.getTrainingRMSE() - return self.fitness + ''' + Returns the individual's fitness. 
+ ''' + if self.fitness is None: + + if self.fitnessType == "Accuracy": + self.getTrainingClassPredictions() + acc = accuracy_score(self.trainingClassPredictions, self.convertLabelsToInt(self.training_Y) ) + self.fitness = acc + if self.fitnessType == "RMSE": + self.getTrainingValuePredictions() + waf = mean_squared_error(self.trainingValuePredictions, self.convertLabelsToInt(self.training_Y))**0.5 + self.fitness = waf + + return self.fitness - def getTrainingPredictions(self): - if self.trainingPredictions == None: - self.trainingPredictions = [ self.calculate(sample) for sample in getTrainingSet() ] - return self.trainingPredictions def getTrainingClassPredictions(self): - if self.trainingClassPredictions == None: - self.trainingClassPredictions = [ 0 if v < 0.5 else 1 for v in self.getTrainingPredictions() ] + if self.trainingClassPredictions is None: + self.trainingClassPredictions = self.predict(self.training_X, classOutput = True) + return self.trainingClassPredictions - def getTestPredictions(self): - if self.testPredictions == None: - self.testPredictions = [ self.calculate(sample) for sample in getTestSet() ] - return self.testPredictions + def getTestClassPredictions(self,X): + if self.testClassPredictions is None: + self.testClassPredictions = self.predict(X, classOutput = True) - def getTestClassPredictions(self): - if self.testClassPredictions == None: - self.testClassPredictions = [ 0 if v < 0.5 else 1 for v in self.getTestPredictions() ] return self.testClassPredictions + def getTrainingValuePredictions(self): + if self.trainingValuePredictions is None: + self.trainingValuePredictions = self.predict(self.training_X, classOutput = False) + + return self.trainingValuePredictions + + def getTestValuePredictions(self,X): + if self.testValuePredictions is None: + self.testValuePredictions = self.predict(X, classOutput = False) + + return self.testValuePredictions + + + + + def getAccuracy(self, X,Y,pred=None): + ''' + Returns the individual's accuracy. 
+ ''' + if pred == "Tr": + pred = self.getTrainingClassPredictions() + elif pred == "Te": + pred = self.getTestClassPredictions(X) + else: + pred = self.predict(X) + + return accuracy_score(pred, Y) + + + def getRMSE(self, X, Y,pred=None): + ''' + Returns the individual's WAF. + ''' + if pred == "Tr": + pred = self.getTrainingValuePredictions() + elif pred == "Te": + pred = self.getTestValuePredictions(X) + else: + pred = self.predict(X) + + return mean_squared_error(pred, Y)**0.5 + + + + def calculate(self, X): + ''' + Returns the converted input space. + ''' + return self.head.calculate(X) + + + def predict(self, X, classOutput=True): + ''' + Returns the class prediction of a sample. + ''' + hyper_X = self.calculate(X) + if classOutput: + predictions = self.model.predict(hyper_X) + else: + predictions = hyper_X + + return predictions + + + + def prun(self): + ''' + Remove the dimensions that degrade the fitness. + If simp==True, also simplifies each dimension. + ''' + done = False + while not done: + state = str(self.head) + self.head.prun(self.training_X) + done = state == str(self.head) + - def getTrainingRMSE(self): - if self.trainingRMSE == None: - pred = self.getTrainingPredictions() - acc = 0 - ds = getTrainingSet() - for i in range(len(ds)): - dif = pred[i] - ds[i][-1] - acc += dif**2 - acc /= len(ds) - acc = acc**0.5 - self.trainingRMSE = acc - return self.trainingRMSE - - def getTestRMSE(self): - if self.testRMSE == None: - pred = self.getTestPredictions() - acc = 0 - ds = getTestSet() - for i in range(len(ds)): - dif = pred[i] - ds[i][-1] - acc += dif**2 - acc /= len(ds) - acc = acc**0.5 - self.testRMSE = acc - return self.testRMSE - - def getTrainingAccuracy(self): - if self.trainingAccuracy == None: - if OUTPUT != "Classification": - self.trainingAccuracy = 0 - else: - pred = self.getTrainingClassPredictions() - hits = 0 - ds = getTrainingSet() - for i in range(len(ds)): - if pred[i] == ds[i][-1]: - hits += 1 - acc = hits/len(ds) - 
def create(self, operators=None, terminals=None, depth=None,full=False):
    '''
    Fills this Node with a random tree.

    While "depth" is above 1 the node becomes an operator with probability
    0.5 (always, when "full" is True) and gets two randomly created
    children with depth-1; otherwise it becomes a terminal. The "full"
    flag applies to this node only: children are created without it.

    Parameters:
    operators (list): Operators an inner node may hold.
    terminals (list): Terminals a leaf may hold.
    depth (int): Maximum depth of the tree rooted at this node.
    full (bool): Forces this node to be an operator when depth allows it.
    '''
    becomes_operator = depth > 1 and (random() < 0.5 or full == True)

    if not becomes_operator:
        # No literals: a leaf is always one of the given terminals
        self.value = terminals[randint(0, len(terminals) - 1)]
        return

    self.value = operators[randint(0, len(operators) - 1)]

    left_child = Node()
    self.left = left_child
    left_child.create(operators, terminals, depth - 1)

    right_child = Node()
    self.right = right_child
    right_child.create(operators, terminals, depth - 1)
def calculate(self, sample):
    '''
    Returns the calculated value of a sample.

    A leaf returns the column of "sample" named by its value or, when there
    is no such column, an array filled with the value converted to float.
    An inner node applies its operator to the results of its two children;
    the division is protected: a divisor of 0 is replaced by 1.

    Parameters:
    sample: Indexable dataset (e.g. a pandas DataFrame) with a "shape".

    Raises:
    ValueError: if a leaf is neither a column nor a number, or if the
                operator is not one of "+", "-", "*", "/".
    '''
    if self.left is None:
        try:
            return np.array( sample[self.value] )
        except (KeyError, IndexError, TypeError, ValueError):
            # Not a column of the dataset: the leaf holds a constant
            return np.array( [float(self.value)]*sample.shape[0] )

    left = self.left.calculate(sample)
    right = self.right.calculate(sample)

    if self.value == "+":
        return left + right
    if self.value == "-":
        return left - right
    if self.value == "*":
        return left * right
    if self.value == "/":
        # Protected division: X / 0 is evaluated as X / 1
        right = np.where(right==0, 1, right)
        return left / right

    # The original returned None here, which only failed later and far
    # away from the actual cause.
    raise ValueError("Unsupported operator: " + str(self.value))
def prun(self, tr_x):
    '''
    Simplifies this Node and, recursively, its sub-nodes.

    A node whose output is the same for every sample of "tr_x" becomes a
    constant leaf. Otherwise at most one algebraic rule per operator is
    applied in each call; the caller is expected to repeat the call until
    the tree stops changing.

    Parameters:
    tr_x: Dataset used to obtain the semantics of the node.
    '''
    def toConstant(node, text):
        # Turns "node" into a leaf holding the constant "text"
        node.value = text
        node.left = None
        node.right = None

    # Constant folding. The length is tested first so that an empty
    # dataset does not raise.
    semantics = self.getSemantics(tr_x)
    if len(semantics) > 1 and semantics.min() == semantics.max():
        toConstant(self, str(semantics.min()))

    # Inside each operator only one rule may fire: redirect() can replace
    # the operator of this node, and a second rule of the same block would
    # then be applied to the wrong operator (e.g. 0 + (B * B) -> 2 * B).

    # +
    if self.value == "+" and not self.isLeaf():
        if self.left.isLeaf() and self.left.value == "0.0":
            # 0 + X == X
            self.redirect(self.right)
        elif self.right.isLeaf() and self.right.value == "0.0":
            # X + 0 == X
            self.redirect(self.left)
        elif str(self.right) == str(self.left):
            # X + X == 2 * X
            self.value = "*"
            toConstant(self.left, "2.0")

    # -
    if self.value == "-" and not self.isLeaf():
        if self.right.isLeaf() and self.right.value == "0.0":
            # X - 0 == X
            self.redirect(self.left)
        elif str(self.right) == str(self.left):
            # X - X == 0
            toConstant(self, "0.0")

    # *
    if self.value == "*" and not self.isLeaf():
        if (self.left.isLeaf() and self.left.value == "0.0") or (self.right.isLeaf() and self.right.value == "0.0"):
            # X * 0 == 0, 0 * X == 0
            toConstant(self, "0.0")
        elif self.left.isLeaf() and self.left.value == "1.0":
            # 1 * X == X
            self.redirect(self.right)
        elif self.right.isLeaf() and self.right.value == "1.0":
            # X * 1 == X
            self.redirect(self.left)

    # / (protected division: a divisor of 0 is evaluated as 1)
    if self.value == "/" and not self.isLeaf():
        if self.right.isLeaf() and self.right.value in ("0.0", "1.0"):
            # X / 1 == X and, because the division is protected, X / 0 == X
            self.redirect(self.left)
        # There is no "X / X == 1" rule: X / X is 0 wherever X is 0. When
        # X has no zeros the constant folding above already yields 1.0.

    if self.left is not None:
        self.left.prun(tr_x)

    if self.right is not None:
        self.right.prun(tr_x)
= [] - self.trainingAccuracyOverTime = [] - self.testAccuracyOverTime = [] - self.trainingRmseOverTime = [] - self.testRmseOverTime = [] - self.sizeOverTime = [] - self.generationTimes = [] - for i in range(POPULATION_SIZE): - self.population.append(Individual()) + + while len(self.population) < self.population_size: + ind = Individual(self.operators, self.terminals, self.max_initial_depth) + ind.create() + self.population.append(ind) + + self.bestIndividual = self.population[0] + self.bestIndividual.fit(self.Tr_x, self.Tr_y) + + + if not self.Te_x is None: + self.trainingAccuracyOverTime = [] + self.testAccuracyOverTime = [] + self.trainingRMSEOverTime = [] + self.testRMSEOverTime = [] + self.sizeOverTime = [] + self.generationTimes = [] def stoppingCriteria(self): - genLimit = self.currentGeneration >= MAX_GENERATION - perfectTraining = self.bestIndividual != None - perfectTraining = perfectTraining and self.bestIndividual.getTrainingRMSE() == 0 + ''' + Returns True if the stopping criteria was reached. + ''' + genLimit = self.currentGeneration >= self.max_generation + perfectTraining = self.bestIndividual.getFitness() == 1 - return genLimit or perfectTraining + return genLimit or perfectTraining def train(self): - while self.currentGeneration < MAX_GENERATION: - duration = 0 + ''' + Training loop for the algorithm. 
+ ''' + if self.verbose: + print("> Running log:") + while self.currentGeneration < self.max_generation: if not self.stoppingCriteria(): t1 = time.time() self.nextGeneration() t2 = time.time() duration = t2-t1 - + else: + duration = 0 self.currentGeneration += 1 - self.trainingAccuracyOverTime.append(self.bestIndividual.getTrainingAccuracy()) - self.testAccuracyOverTime.append(self.bestIndividual.getTestAccuracy()) - self.trainingRmseOverTime.append(self.bestIndividual.getTrainingRMSE()) - self.testRmseOverTime.append(self.bestIndividual.getTestRMSE()) - self.sizeOverTime.append(self.bestIndividual.getSize()) - self.generationTimes.append(duration) + + if not self.Te_x is None: + self.trainingAccuracyOverTime.append(self.bestIndividual.getAccuracy(self.Tr_x, self.Tr_y, pred="Tr")) + self.testAccuracyOverTime.append(self.bestIndividual.getAccuracy(self.Te_x, self.Te_y, pred="Te")) + self.trainingRMSEOverTime.append(self.bestIndividual.getRMSE(self.Tr_x, self.Tr_y, pred="Tr")) + self.testRMSEOverTime.append(self.bestIndividual.getRMSE(self.Te_x, self.Te_y, pred="Te")) + self.sizeOverTime.append(self.bestIndividual.getSize()) + self.generationTimes.append(duration) + + if self.verbose: + print() def nextGeneration(self): - if THREADS > 1: - with mp.Pool(processes= THREADS) as pool: - fitArray = pool.map(getTrainingPredictions, self.population) + ''' + Generation algorithm: the population is sorted; the best individual is pruned; + the elite is selected; and the offspring are created. 
+ ''' + begin = time.time() + + # Calculates the accuracy of the population using multiprocessing + if self.threads > 1: + with mp.Pool(processes= self.threads) as pool: + model = pool.map(fitIndividuals, [(ind, self.Tr_x, self.Tr_y) for ind in self.population] ) for i in range(len(self.population)): - self.population[i].trainingPredictions = fitArray[i] + self.population[i].model = model[i][0].model + self.population[i].labelToInt = model[i][0].labelToInt + self.population[i].intToLabel = model[i][0].intToLabel + self.population[i].trainingPredictions = model[i][1] + self.population[i].training_X = self.Tr_x + self.population[i].training_Y = self.Tr_y + else: + [ ind.fit(self.Tr_x, self.Tr_y) for ind in self.population] + [ ind.getFitness() for ind in self.population ] # Sort the population from best to worse - self.population.sort(reverse=True) + self.population.sort(reverse=True) - # Update Best Individual - if(self.bestIndividual == None or self.population[0]>self.bestIndividual): + + # Update best individual + if self.population[0] > self.bestIndividual: self.bestIndividual = self.population[0] - if self.currentGeneration%10 == 0: - if OUTPUT == "Classification": - print("Gen#",self.currentGeneration, "- (TrA,TeA,TrRMSE):", self.bestIndividual.getTrainingAccuracy(),self.bestIndividual.getTestAccuracy(),self.bestIndividual.getTrainingRMSE()) - if OUTPUT == "Regression": - print("Gen#",self.currentGeneration, "- (TrRMSE,TeRMSE):", self.bestIndividual.getTrainingRMSE(), self.bestIndividual.getTestRMSE()) - # Generating Next Generation newPopulation = [] - newPopulation.extend( getElite(self.population) ) - while len(newPopulation) < POPULATION_SIZE: - newPopulation.extend( getOffspring(self.population) ) - self.population = newPopulation[:POPULATION_SIZE] + newPopulation.extend(getElite(self.population, self.elitism_size)) + while len(newPopulation) < self.population_size: + offspring = getOffspring(self.population, self.tournament_size) + offspring = 
discardDeep(offspring, self.limit_depth) + newPopulation.extend(offspring) + self.population = newPopulation[:self.population_size] + + + end = time.time() + + # Debug + if self.verbose and self.currentGeneration%5==0: + if not self.Te_x is None: + print(" > Gen #"+str(self.currentGeneration)+": Tr-Acc: "+ "%.6f" %self.bestIndividual.getAccuracy(self.Tr_x, self.Tr_y)+" // Te-Acc: "+ "%.6f" %self.bestIndividual.getAccuracy(self.Te_x, self.Te_y) + " // Time: " + str(end- begin) ) + else: + print(" > Gen #"+str(self.currentGeneration)+": Tr-Acc: "+ "%.6f" %self.bestIndividual.getAccuracy(self.Tr_x, self.Tr_y)) + def predict(self, sample): return "Population Not Trained" if self.bestIndividual == None else self.bestIndividual.predict(sample) @@ -104,9 +175,33 @@ def getBestIndividual(self): def getCurrentGeneration(self): return self.currentGeneration + def getTrainingAccuracyOverTime(self): + return self.trainingAccuracyOverTime + + def getTestAccuracyOverTime(self): + return self.testAccuracyOverTime + + def getTrainingRMSEOverTime(self): + return self.trainingRMSEOverTime + + def getTestRMSEOverTime(self): + return self.testRMSEOverTime + + def getSizeOverTime(self): + return self.sizeOverTime + def getGenerationTimes(self): return self.generationTimes +def calculateIndividualAccuracy_MultiProcessing(ind, fitArray, indIndex): + fitArray[indIndex] = ind.getTrainingAccuracy() + +def fitIndividuals(a): + ind,x,y = a + ind.fit(x,y) + + return ( ind, ind.predict(x) ) + def getTrainingPredictions(ind): return ind.getTrainingPredictions() diff --git a/stgp/STGP.py b/stgp/STGP.py index 01ebb5e..259122f 100644 --- a/stgp/STGP.py +++ b/stgp/STGP.py @@ -1,4 +1,3 @@ -from .Constants import * from .Population import Population # @@ -6,81 +5,123 @@ # # This product can be obtained in https://github.com/jespb/Python-STGP # -# Copyright ©2019 J. E. Batista +# Copyright ©2019-2021 J. E. 
Batista # +class ClassifierNotTrainedError(Exception): + """ You tried to use the classifier before training it. """ + + def __init__(self, expression, message = ""): + self.expression = expression + self.message = message + + class STGP: population = None - def __init__(self, Tr, Te): - setTerminals(Tr.columns[:-1]) + operators = None + max_depth = None + population_size = None + max_generation = None + tournament_size = None + elitism_size = None + limit_depth =None + threads = None + verbose = None + + def checkIfTrained(self): + if self.population == None: + raise ClassifierNotTrainedError("The classifier must be trained using the fit(Tr_X, Tr_Y) method before being used.") + + + def __init__(self, operators=["+","-","*","/"], max_depth = 6, population_size = 500, + max_generation = 100, tournament_size = 5, elitism_size = 1, limit_depth = 17, + threads=1, verbose = True): + + if sum( [0 if op in ["+","-","*","/"] else 0 for op in operators ] ) > 0: + print( "[Warning] Some of the following operators may not be supported:", operators) + self.operators = operators + self.max_depth = max_depth + self.population_size = population_size + self.max_generation = max_generation + self.tournament_size = tournament_size + self.elitism_size = elitism_size + self.limit_depth = limit_depth + self.threads = max(1, threads) + self.verbose = verbose + pass + + def __str__(self): + self.checkIfTrained() + + return str(self.getBestIndividual()) - Tr = [ list(sample) for sample in Tr.iloc] - Te = [ list(sample) for sample in Te.iloc] - setTrainingSet(Tr) - setTestSet(Te) + def fit(self,Tr_X, Tr_Y, Te_X = None, Te_Y = None): + if self.verbose: + print("Training a model with the following parameters: ", end="") + print("{Operators : "+str(self.operators)+"}, ", end="") + print("{Max Initial Depth : "+str(self.max_depth)+"}, ", end="") + print("{Population Size : "+str(self.population_size)+"}, ", end="") + print("{Max Generation : "+str(self.max_generation)+"}, ", end="") + 
print("{Tournament Size : "+str(self.tournament_size)+"}, ", end="") + print("{Elitism Size : "+str(self.elitism_size)+"}, ", end="") + print("{Depth Limit : "+str(self.limit_depth)+"}, ", end="") + print("{Threads : "+str(self.threads)+"}, ", end="") - self.population = Population() + self.population = Population(Tr_X, Tr_Y, Te_X, Te_Y, self.operators, self.max_depth, + self.population_size, self.max_generation, self.tournament_size, self.elitism_size, + self.limit_depth, self.threads, self.verbose) self.population.train() - - def getCurrentGeneration(self): - ''' - Returns the number of the current generation. - ''' - return self.population.getCurrentGeneration() - def getTrainingAccuracy(self): - ''' - Returns the training accuracy of the best individual - ''' - return self.population.bestIndividual.getTrainingAccuracy() + self.getBestIndividual().prun() - def getTestAccuracy(self): - ''' - Returns the test accuracy of the best individual - ''' - return self.population.bestIndividual.getTestAccuracy() - - def getTrainingRMSE(self): + + def predict(self, dataset): ''' - Returns the training rmse of the best individual + Returns the predictions for the samples in a dataset. ''' - return self.population.bestIndividual.getTrainingRMSE() - - def getTestRMSE(self): + self.checkIfTrained() + + return self.population.getBestIndividual().predict(dataset) + + def getBestIndividual(self): ''' - Returns the test rmse of the best individual + Returns the final M3GP model. ''' - return self.population.bestIndividual.getTestRMSE() - + self.checkIfTrained() + + return self.population.getBestIndividual() + def getAccuracyOverTime(self): ''' - Returns the training and test accuracy over the generations + Returns the training and test accuracy of the best model in each generation. 
''' - return [self.population.trainingAccuracyOverTime, self.population.testAccuracyOverTime] + self.checkIfTrained() + + return [self.population.getTrainingAccuracyOverTime(), self.population.getTestAccuracyOverTime()] - def getRmseOverTime(self): + def getRMSEOverTime(self): ''' - Returns the training and test rmse over the generations + Returns the training and test accuracy of the best model in each generation. ''' - return [self.population.trainingRmseOverTime, self.population.testRmseOverTime] + self.checkIfTrained() + + return [self.population.getTrainingRMSEOverTime(), self.population.getTestRMSEOverTime()] + def getSizeOverTime(self): ''' - Returns the size of the best individual over the generations + Returns the size and number of dimensions of the best model in each generation. ''' - return self.population.sizeOverTime + self.checkIfTrained() + return self.population.getSizeOverTime() def getGenerationTimes(self): ''' Returns the time spent in each generation. ''' - return self.population.getGenerationTimes() + self.checkIfTrained() - def getBestIndividual(self): - ''' - Returns the best individual - ''' - return self.population.bestIndividual \ No newline at end of file + return self.population.getGenerationTimes() \ No newline at end of file diff --git a/stgp/SimpleThresholdClassifier.py b/stgp/SimpleThresholdClassifier.py new file mode 100644 index 0000000..3956130 --- /dev/null +++ b/stgp/SimpleThresholdClassifier.py @@ -0,0 +1,28 @@ + +# +# By using this file, you are agreeing to this product's EULA +# +# This product can be obtained in https://github.com/jespb/Python-STGP +# +# Copyright ©2019-2021 J. E. 
Batista +# + + +class SimpleThresholdClassifier: + + threshold = None + + def __init__(self, threshold = 0): + self.threshold = threshold + + def fit(self,X=None,Y=None): + pass + + + def predict(self, X): + """ + Receives X, a 1-D array of real values + Return a list of predictions based on the value + """ + predictions = [ 1 if value > self.threshold else 0 for value in X] + return predictions \ No newline at end of file