Update: includes adding random_states and changing the name from STGP to StdGP
João Batista committed Oct 3, 2022
1 parent 26f15b9 commit ba5713e
Showing 9 changed files with 1,302 additions and 16 deletions.
22 changes: 19 additions & 3 deletions Arguments.py
@@ -3,15 +3,17 @@
#
# By using this file, you are agreeing to this product's EULA
#
# This product can be obtained in https://github.com/jespb/Python-STGP
# This product can be obtained in https://github.com/jespb/Python-StdGP
#
# Copyright ©2019-2021 J. E. Batista
# Copyright ©2019-2022 J. E. Batista
#


# Operators to be used by the models
# Only these operators are available. To add more, edit m3gp.Node.calculate(self, sample)
OPERATORS = ["+","-","*","/"]

#OPERATORS = [("+",2),("-",2),("*",2),("/",2),("log2",1), ("max", 3)] # Example
OPERATORS = [("+",2),("-",2),("*",2),("/",2)] # Default

# Initial Maximum depth
MAX_DEPTH = 6
@@ -46,6 +48,14 @@
# Number of CPU Threads to be used
THREADS = 1

# Random state
RANDOM_STATE = 42

# Models wrapped by the StdGP models
MODEL_NAME = ["SimpleThresholdClassifier"][0]

# Fitness used by the StdGP models
FITNESS_TYPE = ["Accuracy", "MSE", "WAF", "2FOLD"][0]



@@ -72,6 +82,9 @@

if "-op" in argv:
OPERATORS = argv[argv.index("-op")+1].split(";")
for i in range(len(OPERATORS)):
OPERATORS[i] = OPERATORS[i].split(",")
OPERATORS[i][1] = int(OPERATORS[i][1])

if "-md" in argv:
MAX_DEPTH = int(argv[argv.index("-md")+1])
@@ -100,4 +113,7 @@
if "-t" in argv:
THREADS = int(argv[argv.index("-t")+1])

if "-rs" in argv:
RANDOM_STATE = int(argv[argv.index("-rs")+1])
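
For reference, a minimal sketch of what the "-op" parsing above produces (editorial illustration, not part of the commit; the flag value "+,2;-,2;log2,1" is hypothetical):

# Each operator is given as "name,arity"; operators are separated by ";".
value = "+,2;-,2;log2,1"
operators = value.split(";")
for i in range(len(operators)):
    operators[i] = operators[i].split(",")
    operators[i][1] = int(operators[i][1])
print(operators)   # [['+', 2], ['-', 2], ['log2', 1]]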


40 changes: 40 additions & 0 deletions Main_StdGP_classification_example.py
@@ -0,0 +1,40 @@
import pandas

from stdgp.StdGP import StdGP

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score



#
# By using this file, you are agreeing to this product's EULA
#
# This product can be obtained in https://github.com/jespb/Python-StdGP
#
# Copyright ©2019-2022 J. E. Batista
#



filename= "heart.csv"

# Open the dataset
ds = pandas.read_csv("datasets/"+filename)
class_header = ds.columns[-1]

# Split the dataset
Tr_X, Te_X, Tr_Y, Te_Y = train_test_split(ds.drop(columns=[class_header]), ds[class_header],
train_size=0.7, random_state = 42, stratify = ds[class_header])

# Train a model
model = StdGP()
model.fit(Tr_X, Tr_Y)

# Predict test results
pred = model.predict(Te_X)

# Obtain test accuracy
print( accuracy_score(pred, Te_Y) )
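
Beyond predict(), the fitted model exposes the accessors that Main_StdGP_standalone.py (further down) relies on; a minimal sketch, assuming the same StdGP API applies here:

# Print the evolved expression of the best individual; the accessor and the
# use of str() mirror Main_StdGP_standalone.py and are assumptions here.
print( str(model.getBestIndividual()) )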

157 changes: 157 additions & 0 deletions Main_StdGP_standalone.py
@@ -0,0 +1,157 @@
import pandas

from stdgp.StdGP import StdGP
from sys import argv
from Arguments import *
import os

from sklearn.model_selection import train_test_split

import numpy as np




#
# By using this file, you are agreeing to this product's EULA
#
# This product can be obtained in https://github.com/jespb/Python-StdGP
#
# Copyright ©2019-2022 J. E. Batista
#




def openAndSplitDatasets(which,seed):
if VERBOSE:
print( "> Opening: ", which )

# Open dataset
ds = pandas.read_csv(DATASETS_DIR+which)

# Read header
class_header = ds.columns[-1]

return train_test_split(ds.drop(columns=[class_header]), ds[class_header],
train_size=TRAIN_FRACTION, random_state=seed,
stratify = ds[class_header])


def run(r,dataset):
if VERBOSE:
print("> Starting run:")
print(" > ID:", r)
print(" > Dataset: "+dataset)
print()

Tr_X, Te_X, Tr_Y, Te_Y = openAndSplitDatasets(dataset,r)

# Train a model
model = StdGP(OPERATORS, MAX_DEPTH, POPULATION_SIZE, MAX_GENERATION, TOURNAMENT_SIZE,
ELITISM_SIZE, LIMIT_DEPTH, THREADS, r, VERBOSE, MODEL_NAME, FITNESS_TYPE)
model.fit(Tr_X, Tr_Y, Te_X, Te_Y)


# Obtain training results
accuracy = model.getAccuracyOverTime()
waf = model.getWaFOverTime()
kappa = model.getKappaOverTime()
mse = model.getMSEOverTime()
size = model.getSizeOverTime()
model_str = str(model.getBestIndividual())
times = model.getGenerationTimes()

tr_acc = accuracy[0]
te_acc = accuracy[1]
tr_waf = waf[0]
te_waf = waf[1]
tr_kappa = kappa[0]
te_kappa = kappa[1]
tr_mse = mse[0]
te_mse = mse[1]

if VERBOSE:
print("> Ending run:")
print(" > ID:", r)
print(" > Dataset:", dataset)
print(" > Final model:", model_str)
print(" > Training accuracy:", tr_acc[-1])
print(" > Test accuracy:", te_acc[-1])
print()

return (tr_acc,te_acc,
tr_waf,te_waf,
tr_kappa,te_kappa,
tr_mse,te_mse,
size,
times,
model_str)


def call_StdGP():
try:
os.makedirs(OUTPUT_DIR)
except:
pass

for dataset in DATASETS:
outputFilename = OUTPUT_DIR+"StdGP_"+ dataset
if not os.path.exists(outputFilename):
results = []

# Run the algorithm several times
for r in range(RUNS):
results.append(run(r,dataset))

# Write output header
file = open(outputFilename , "w")
file.write("Attribute,Run,")
for i in range(MAX_GENERATION):
file.write(str(i)+",")
file.write("\n")

attributes= ["Training-Accuracy","Test-Accuracy",
"Training-WaF", "Test-WaF",
"Training-Kappa", "Test-Kappa",
"Training-MSE", "Test-MSE",
"Size",
"Time",
"Final_Model"]

# Write attributes with value over time
for ai in range(len(attributes)-1):
for i in range(RUNS):
file.write("\n"+attributes[ai]+","+str(i)+",")
file.write( ",".join([str(val) for val in results[i][ai]]))
file.write("\n")

# Write the final models
for i in range(len(results)):
file.write("\n"+attributes[-1]+","+str(i)+",")
file.write(results[i][-1])
file.write("\n")

# Write some parameters
file.write("\n\nParameters")
file.write("\nOperators,"+str(OPERATORS))
file.write("\nMax Initial Depth,"+str(MAX_DEPTH))
file.write("\nPopulation Size,"+str(POPULATION_SIZE))
file.write("\nMax Generation,"+str(MAX_GENERATION))
file.write("\nTournament Size,"+str(TOURNAMENT_SIZE))
file.write("\nElitism Size,"+str(ELITISM_SIZE))
file.write("\nDepth Limit,"+str(LIMIT_DEPTH))
file.write("\nWrapped Model,"+MODEL_NAME)
file.write("\nFitness Type,"+FITNESS_TYPE)
file.write("\nThreads,"+str(THREADS))
file.write("\nRandom State,"+str(list(range(RUNS))))
file.write("\nDataset,"+dataset)


file.close()
else:
print("Filename: " + outputFilename +" already exists.")


if __name__ == '__main__':
call_StdGP()
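
For programmatic use of the per-run results, a rough sketch (editorial addition, not part of the commit; it assumes this file is importable as Main_StdGP_standalone and that heart.csv is available in DATASETS_DIR):

from Main_StdGP_standalone import run

# run() returns the per-generation series in the same order as the
# `attributes` list in call_StdGP(), so index 1 is the "Test-Accuracy"
# series and its last entry is the final test accuracy of that run.
results = [run(r, "heart.csv") for r in range(2)]
print([res[1][-1] for res in results])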
26 changes: 13 additions & 13 deletions README.txt
@@ -1,17 +1,17 @@
This is an easy-to-use, scikit-learn inspired version of the STGP algorithm.
This is an easy-to-use, scikit-learn inspired version of the Standard Genetic Programming (StdGP) algorithm.


By using this file, you are agreeing to this product's EULA
This product can be obtained in https://github.com/jespb/Python-STGP
Copyright ©2019-2021 J. E. Batista
This product can be obtained in https://github.com/jespb/Python-StdGP
Copyright ©2019-2022 J. E. Batista


This file describes the command and flags used in the stand-alone version of this implementation and explains how to import, use, and edit this implementation.


This implementation of STGP can be used in a stand-alone fashion using the following command and flags:
This implementation of StdGP can be used in a stand-alone fashion using the following command and flags:

$ python Main_STGP_standalone.py
$ python Main_StdGP_standalone.py

[-d datasets]
- This flag expects a set of csv dataset names separated by ";" (e.g., a.csv;b.csv)
@@ -77,11 +77,11 @@ How to import this implementation to your project:
- import the STGP class using "from stgp.STGP import STGP".

How to use this implementation:
$ from stgp.STGP import STGP
$ model = STGP()
$ from stdgp.StdGP import StdGP
$ model = StdGP()
$ model.fit( training_x, training_y)

Arguments for STGP():
Arguments for StdGP():
operators -> Operators used by the individual (default: ["+","-","*","/"] )
max_depth -> Max initial depths of the individuals (default: 6)
population_size -> Population size (default: 500)
@@ -99,25 +99,25 @@ Arguments for model.fit():


Useful methods:
$ model = STGP() -> starts the model;
$ model = StdGP() -> starts the model;
$ model.fit(X, Y) -> fits the model to the dataset;
$ model.predict(dataset) -> Returns a list with the predictions for the given dataset.




How to edit this implementation:
Fitness Function ( stgp.Individual ):
Fitness Function ( stdgp.Individual ):
- Change the getFitness() method to use your own fitness function;
- This implementation assumes that a higher fitness is always better. To change this, edit the __gt__ method in this class;
- You may use the getTrainingPredictions() and getTrainingSet() to obtain the model's predictions and the training set;
- You can also explore the logic behind the standard fitness function;
- Warning: STGP evaluates every model in every run; as such, I do not recommend complex fitness functions. You should invest in fast evaluation methods to train a population.
- Warning: StdGP evaluates every model in every run; as such, I do not recommend complex fitness functions. You should invest in fast evaluation methods to train a population.

Classification method ( stgp.Individual ):
Classification method ( stdgp.Individual ):
- Change the trainModel() method to use your own classifier;
- Assuming it is a scikit-learn implementation, you may only need to change the first few lines of this method;
- Warning: STGP evaluates every model in every run; as such, I do not recommend complex classification models. You should invest in fast classification methods to train a population and then use a more complex method (if you wish) on the final model.
- Warning: StdGP evaluates every model in every run; as such, I do not recommend complex classification models. You should invest in fast classification methods to train a population and then use a more complex method (if you wish) on the final model.


Reference:
