Revamped to match the new Python-M3GP implementation

jespb · May 31, 2021 · 2c2202a · 2c2202a
1 parent c883fd0
commit 2c2202a
Show file tree

Hide file tree

Showing 10 changed files with 1,030 additions and 370 deletions.
diff --git a/stgp/Constants.py → Arguments.py b/stgp/Constants.py → Arguments.py
@@ -5,95 +5,99 @@
 #
 # This product can be obtained in https://github.com/jespb/Python-STGP
 #
-# Copyright ©2019 J. E. Batista
+# Copyright ©2019-2021 J. E. Batista
 #
 
+
+# Operators to be used by the models
+# Only these operators are available. To add more, edit m3gp.Node.calculate(self, sample)
 OPERATORS = ["+","-","*","/"]
-MAX_DEPTH = 6 # max depth of the initial trees and the trees used for mutation
-POPULATION_SIZE = 200
+
+# Initial Maximum depth
+MAX_DEPTH = 6
+
+# Number of models in the population
+POPULATION_SIZE = 500
+
+# Maximum number of iterations
 MAX_GENERATION = 100
+
+# Fraction of the dataset to be used as training (used by Main_STGP_standalone.py)
 TRAIN_FRACTION = 0.70
-TOURNAMENT_SIZE = 10
+
+# Number of individuals to be used in the tournament
+TOURNAMENT_SIZE = 5
+
+# Number of best individuals to be automatically moved to the next generation
 ELITISM_SIZE = 1
+
+# Shuffle the dataset (used by Main_M3GP_standalone.py)
 SHUFFLE = True
-LIMIT_DEPTH=15
+
+# Dimensions maximum depth
+LIMIT_DEPTH=17
+
+# Number of runs (used by Main_STGP_standalone.py)
 RUNS = 30
+
+# Verbose
 VERBOSE = True
+
+# Number of CPU Threads to be used
+THREADS = 1
+
+
+
+
 DATASETS_DIR = "datasets/"
 OUTPUT_DIR = "results/"
+
 DATASETS = ["heart.csv"]
 OUTPUT = "Classification"
-out = None
-THREADS = 1
 
 
 
 
 if "-dsdir" in argv:
 	DATASETS_DIR = argv[argv.index("-dsdir")+1]
+
 if "-odir" in argv:
 	OUTPUT_DIR = argv[argv.index("-odir")+1]
+
 if "-d" in argv:
 	DATASETS = argv[argv.index("-d")+1].split(";")
-if "-r" in argv:
-	OUTPUT = "Regression"
+
 if "-runs" in argv:
 	RUNS = int(argv[argv.index("-runs")+1])
+
 if "-op" in argv:
 	OPERATORS = argv[argv.index("-op")+1].split(";")
+
 if "-md" in argv:
 	MAX_DEPTH = int(argv[argv.index("-md")+1])
+
 if "-ps" in argv:
 	POPULATION_SIZE = int(argv[argv.index("-ps")+1])
+
 if "-mg" in argv:
 	MAX_GENERATION = int(argv[argv.index("-mg")+1])
+
 if "-tf" in argv:
-	TRAIN_FRACTION = float(argv[argv.index("-train")+1])
+	TRAIN_FRACTION = float(argv[argv.index("-tf")+1])
+
 if "-ts" in argv:
 	TOURNAMENT_SIZE = int(argv[argv.index("-ts")+1])
+
 if "-es" in argv:
 	ELITISM_SIZE = int(argv[argv.index("-es")+1])
+
 if "-dontshuffle" in argv:
 	SHUFFLE = False
+
 if "-s" in argv:
 	VERBOSE = False
-if "-ms" in argv:
-	MUTATION_STEP = float(argv[argv.index("-ms")+1])
+
 if "-t" in argv:
 	THREADS = int(argv[argv.index("-t")+1])
 
 
-
-
-def openFile(name):
-	global out
-	out = open(name,"w")
-
-def writeToFile(msg):
-	global out
-	out.write(msg)
-
-def closeFile():
-	global out
-	out.close()
-
-terminals = None
-def setTerminals(l):
-	global terminals 
-	terminals = l
-def getTerminals():
-	return terminals
-
-trainingSet = None
-def setTrainingSet(ds):
-	global trainingSet
-	trainingSet = ds
-def getTrainingSet():
-	return trainingSet
-
-testSet = None
-def setTestSet(ds):
-	global testSet
-	testSet = ds
-def getTestSet():
-	return testSet
diff --git a/Main_STGP_example.py b/Main_STGP_example.py
@@ -0,0 +1,44 @@
+import pandas
+
+from stgp.STGP import STGP
+
+from sklearn.model_selection import train_test_split
+
+from sklearn.metrics import accuracy_score
+
+import warnings
+
+warnings.filterwarnings("ignore", category=FutureWarning,
+                        message="From version 0.21, test_size will always complement",
+                        module="sklearn")
+
+# 
+# By using this file, you are agreeing to this product's EULA
+#
+# This product can be obtained in https://github.com/jespb/Python-STGP
+#
+# Copyright ©2019-2021 J. E. Batista
+#
+
+
+
+filename= "heart.csv"
+
+# Open the dataset; its last column is used as the class column
+ds = pandas.read_csv("datasets/"+filename)
+class_header = ds.columns[-1]
+
+# Split the dataset (70% for training, stratified by class, fixed random state)
+Tr_X, Te_X, Tr_Y, Te_Y = train_test_split(ds.drop(columns=[class_header]), ds[class_header], 
+		train_size=0.7, random_state = 42, stratify = ds[class_header])
+
+# Train a model
+model = STGP()
+model.fit(Tr_X, Tr_Y)
+
+# Predict test results using the trained `model` (no `m3gp` name exists in this script)
+pred = model.predict(Te_X)
+
+# Obtain test accuracy
+print( accuracy_score(pred, Te_Y) )
+
diff --git a/Main_STGP_standalone.py b/Main_STGP_standalone.py
@@ -0,0 +1,145 @@
+import pandas
+
+from stgp.STGP import STGP
+from sys import argv
+from Arguments import *
+import os
+
+from sklearn.model_selection import train_test_split
+
+import numpy as np
+
+import warnings
+
+warnings.filterwarnings("ignore", category=FutureWarning,
+                        message="From version 0.21, test_size will always complement",
+                        module="sklearn")
+
+
+# 
+# By using this file, you are agreeing to this product's EULA
+#
+# This product can be obtained in https://github.com/jespb/Python-STGP
+#
+# Copyright ©2019-2021 J. E. Batista
+#
+
+
+
+
+def openAndSplitDatasets(which,seed):
+	"""
+	Load the CSV file `which` from DATASETS_DIR and split it into training
+	and test partitions.
+
+	The last column of the file is used as the class column. TRAIN_FRACTION
+	of the samples go to the training partition, the split is stratified by
+	the class column, and `seed` is used as the random state.
+
+	Returns whatever train_test_split returns for (features, classes), which
+	the caller unpacks as Tr_X, Te_X, Tr_Y, Te_Y.
+	"""
+	if VERBOSE:
+		print( "> Opening: ", which )
+
+	# Load the file and identify the class column (the last one)
+	dataset = pandas.read_csv(DATASETS_DIR+which)
+	target_name = dataset.columns[-1]
+
+	# Separate the feature columns from the class column
+	features = dataset.drop(columns=[target_name])
+	labels = dataset[target_name]
+
+	return train_test_split(features, labels,
+		train_size=TRAIN_FRACTION, random_state=seed,
+		stratify = labels)
+
+
+def run(r,dataset):
+	"""
+	Execute one run of STGP on `dataset`.
+
+	`r` is used both as the run ID shown in the log messages and as the seed
+	passed to openAndSplitDatasets.
+
+	Returns a tuple with, in this order: training accuracy, test accuracy,
+	training RMSE, test RMSE, size, generation times (all as returned by the
+	model's getters) and the string form of the best individual.
+	"""
+	if VERBOSE:
+		print("> Starting run:")
+		print("  > ID:", r)
+		print("  > Dataset:", dataset)
+		print()
+
+	# The run ID doubles as the seed of the train/test split
+	Tr_X, Te_X, Tr_Y, Te_Y = openAndSplitDatasets(dataset,r)
+
+	# Train a model
+	stgp_model = STGP(OPERATORS, MAX_DEPTH, POPULATION_SIZE, MAX_GENERATION,
+		TOURNAMENT_SIZE, ELITISM_SIZE, LIMIT_DEPTH, THREADS, VERBOSE)
+	stgp_model.fit(Tr_X, Tr_Y, Te_X, Te_Y)
+
+	# Obtain training results
+	acc_history  = stgp_model.getAccuracyOverTime()
+	rmse_history = stgp_model.getRMSEOverTime()
+	size         = stgp_model.getSizeOverTime()
+	best_str     = str(stgp_model.getBestIndividual())
+	times        = stgp_model.getGenerationTimes()
+
+	# Position 0 is read as the training values and position 1 as the test values
+	tr_acc, te_acc   = acc_history[0], acc_history[1]
+	tr_rmse, te_rmse = rmse_history[0], rmse_history[1]
+
+	if VERBOSE:
+		print("> Ending run:")
+		print("  > ID:", r)
+		print("  > Dataset:", dataset)
+		print("  > Final model:", best_str)
+		print("  > Training accuracy:", tr_acc[-1])
+		print("  > Test accuracy:", te_acc[-1])
+		print()
+
+	return (tr_acc, te_acc, tr_rmse, te_rmse, size, times, best_str)
+
+
+def callm3gp():
+	"""
+	Run STGP RUNS times on every dataset in DATASETS and write one results
+	file per dataset to OUTPUT_DIR (named "stgp_" + dataset name).
+
+	A dataset whose results file already exists is skipped with a message.
+	The file holds one row per attribute and run with the values over the
+	generations, the final model of each run, and the parameters used.
+	"""
+	# Create the output directory if needed. Unlike a bare "except: pass",
+	# this still reports real failures (e.g. missing permissions).
+	os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+	attributes= ["Training-Accuracy","Test-Accuracy",
+				 "Training-RMSE", "Test-RMSE",
+				 "Size", "Time",
+				 "Final_Model"]
+
+	for dataset in DATASETS:
+		outputFilename = OUTPUT_DIR+"stgp_"+ dataset
+		if os.path.exists(outputFilename):
+			print("Filename: " + outputFilename +" already exists.")
+			continue
+
+		# Run the algorithm several times
+		results = [run(r,dataset) for r in range(RUNS)]
+
+		# The context manager closes the file even if a write fails
+		with open(outputFilename , "w") as file:
+			# Write output header
+			file.write("Attribute,Run,")
+			file.write("".join(str(i)+"," for i in range(MAX_GENERATION)))
+			file.write("\n")
+
+			# Write attributes with value over time (all but the last attribute)
+			for ai, attribute in enumerate(attributes[:-1]):
+				for i, result in enumerate(results):
+					file.write("\n"+attribute+","+str(i)+",")
+					file.write( ",".join([str(val) for val in result[ai]]))
+				file.write("\n")
+
+			# Write the final models (last element of each run's result)
+			for i, result in enumerate(results):
+				file.write("\n"+attributes[-1]+","+str(i)+",")
+				file.write(result[-1])
+			file.write("\n")
+
+			# Write some parameters
+			file.write("\n\nParameters")
+			file.write("\nOperators,"+str(OPERATORS))
+			file.write("\nMax Initial Depth,"+str(MAX_DEPTH))
+			file.write("\nPopulation Size,"+str(POPULATION_SIZE))
+			file.write("\nMax Generation,"+str(MAX_GENERATION))
+			file.write("\nTournament Size,"+str(TOURNAMENT_SIZE))
+			file.write("\nElitism Size,"+str(ELITISM_SIZE))
+			file.write("\nDepth Limit,"+str(LIMIT_DEPTH))
+			file.write("\nThreads,"+str(THREADS))
+
+
+# Entry point: run the experiments only when this file is executed as a script
+if __name__ == '__main__':
+	callm3gp()