diff --git a/dfpl/__main__.py b/dfpl/__main__.py
index 2682c35..8294e5c 100755
--- a/dfpl/__main__.py
+++ b/dfpl/__main__.py
@@ -21,6 +21,7 @@
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
+from sklearn.metrics import r2_score
 
 project_directory = pathlib.Path(".").parent.parent.absolute()
 test_train_opts = options.Options(
diff --git a/dfpl/single_label_model.py b/dfpl/single_label_model.py
index 61ec5d2..96bcb48 100644
--- a/dfpl/single_label_model.py
+++ b/dfpl/single_label_model.py
@@ -25,12 +25,13 @@
 from tensorflow.keras.models import Model
 from tensorflow.keras.models import Sequential
 import tensorflow.keras.backend as K
+from verstack import stratified_continuous_split
 
 from dfpl import callbacks as cb
 from dfpl import options
 from dfpl import plot as pl
 from dfpl import settings
-
+import wandb
 
 
 def sample_down_data(opts: options.Options, df: pd.DataFrame, target: str, column: str) -> (np.ndarray, np.ndarray):
@@ -315,7 +316,27 @@
             yield True
         else:
             yield False
+#def calculate_r2(y_true, y_pred,t: float = 0.02 ):
+    # ss_res = K.sum(K.square(y_true - y_pred))
+    # ss_tot = K.sum(K.square(y_true - K.mean(y_true)))
+    # return 1 - ss_res / (ss_tot + K.epsilon())
+def calculate_r2(y_true, y_pred):
+    """
+    Calculate R² (coefficient of determination) manually.
+
+    :param y_true: Array of true values
+    :param y_pred: Array of predicted values
+    :return: R² value
+    """
+    # Residual sum of squares (SS_res)
+    ss_res = np.sum((y_true - y_pred) ** 2)
+
+    # Total sum of squares (SS_tot)
+    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
+    # Calculate R²
+    r2 = 1 - (ss_res / (ss_tot + np.finfo(float).eps))  # Add small epsilon to avoid division by zero
+    return r2
 
 
 def evaluate_regression_model(x_test: np.ndarray, y_test: np.ndarray,file_prefix: str, model: Model,
                               target: str, fold: int, threshold: float = 0.05) -> pd.DataFrame:
@@ -346,9 +367,10 @@
     abs_error = abs(error)
 
     # Compute R² (coefficient of determination)
-    ss_res = np.sum((y_test - y_predict) ** 2)  # Residual sum of squares
-    ss_tot = np.sum((y_test - np.mean(y_test)) ** 2)  # Total sum of squares
-    r2 = 1 - (ss_res / (ss_tot + np.finfo(float).eps))  # Add small epsilon to avoid division by zero
+    # ss_res = np.sum((y_test - y_predict) ** 2)  # Residual sum of squares
+    # ss_tot = np.sum((y_test - np.mean(y_test)) ** 2)  # Total sum of squares
+    # r2 = 1 - (ss_res / (ss_tot + np.finfo(float).eps))  # Add small epsilon to avoid division by zero
+    r2=calculate_r2(y_test, y_predict)
 
     regression_metrics = ['MSE', 'MAE', 'MdAE', 'ACPER', 'MAPE', 'RMSE','R2']
     metric_values = [
@@ -500,13 +522,14 @@ def fit_and_evaluate_model(x_train: np.ndarray, x_test: np.ndarray, y_train: np.
     rmse_test = np.sqrt(np.mean((y_test - y_test_pred) ** 2))
 
     # R² calculations
-    ss_res_train = np.sum((y_train - y_train_pred) ** 2)
-    ss_tot_train = np.sum((y_train - np.mean(y_train)) ** 2)
-    r2_train = 1 - (ss_res_train / (ss_tot_train + np.finfo(float).eps))
-
-    ss_res_test = np.sum((y_test - y_test_pred) ** 2)
-    ss_tot_test = np.sum((y_test - np.mean(y_test)) ** 2)
-    r2_test = 1 - (ss_res_test / (ss_tot_test + np.finfo(float).eps))
+    #ss_res_train = np.sum((y_train - y_train_pred) ** 2)
+    # ss_tot_train = np.sum((y_train - np.mean(y_train)) ** 2)
+    #r2_train = 1 - (ss_res_train / (ss_tot_train + np.finfo(float).eps))
+    r2_train = calculate_r2(y_train, y_train_pred)
+    r2_test = calculate_r2(y_test, y_test_pred)
+    # ss_res_test = np.sum((y_test - y_test_pred) ** 2)
+    #ss_tot_test = np.sum((y_test - np.mean(y_test)) ** 2)
+    # r2_test = 1 - (ss_res_test / (ss_tot_test + np.finfo(float).eps))
 
     # Save metrics to a separate file
     metrics_file = f"{model_file_prefix}.metrics.csv"
@@ -519,6 +542,15 @@
     logging.info(f"Metrics saved to {metrics_file}")
 
+    # Log metrics to W&B
+    wandb.log({
+        "Train RMSE": rmse_train,
+        "Test RMSE": rmse_test,
+        "Train R²": r2_train,
+        "Test R²": r2_test
+    })
+
+
     # Plot predictions vs. actual for both train and test
     y_train_pred = callback_model.predict(x_train).flatten()
     y_test_pred = callback_model.predict(x_test).flatten()
@@ -593,6 +625,13 @@
                                                         test_size=opts.testSize,
                                                         random_state=split_random_state)
 
+    #x_train, x_test, y_train, y_test = stratified_continuous_split(x, y,
+    #                                                               stratify=target,
+    #                                                               test_size=opts.testSize,
+    #                                                               train_size= 1-test_size,
+    #                                                               continuous=True,
+    #                                                               random_state=split_random_state)
+
     performance = fit_and_evaluate_model(x_train=x_train, x_test=x_test, y_train=y_train, y_test=y_test,
                                          fold=0, target=target, opts=opts)
diff --git a/example/dilshana-sweep-trial/trial_sweep.json b/example/dilshana-sweep-trial/trial_sweep.json
new file mode 100644
index 0000000..5744ccc
--- /dev/null
+++ b/example/dilshana-sweep-trial/trial_sweep.json
@@ -0,0 +1,25 @@
+{
+"py/object": "dfpl.options.Options",
+"inputFile": "/home/shanavas/PycharmProjects/generate_dfpl_regression_data/data/cytoxicity_zscore.csv",
+"outputDir": "example/cytotox/wandb",
+"ecModelDir": "/home/shanavas/PycharmProjects/deepFPlearn/example/models/generic_encoder/",
+"type": "smiles",
+"fpType": "topological",
+"fpSize": 2048,
+"encFPSize": 256,
+"enableMultiLabel": false,
+"verbose": 2,
+"trainAC": false,
+"trainFNN": true,
+"compressFeatures": true,
+"kFolds": 5,
+"testSize": 0.2,
+"optimizer": "Adam",
+"lossFunction": "mse",
+"epochs": 5000,
+"activationFunction": "tanh",
+"fnnType": "REG",
+"wabTracking": true,
+"normalize" : true
+}
+
diff --git a/example/train_cytotox.json b/example/train_cytotox.json
index 93b7a7f..fef6789 100755
--- a/example/train_cytotox.json
+++ b/example/train_cytotox.json
@@ -1,7 +1,7 @@
 {
   "py/object": "dfpl.options.Options",
   "inputFile": "/home/shanavas/PycharmProjects/generate_dfpl_regression_data/data/cytoxicity_zscore.csv",
-  "outputDir": "example/cytotox/new/train_comp3",
+  "outputDir": "example/cytotox/new/train_trail",
   "ecModelDir": "example/models/generic_encoder/",
   "type": "smiles",
   "fpType": "topological",
@@ -17,11 +17,11 @@
   "optimizer": "Adam",
   "lossFunction": "Huber",
   "epochs": 5000,
-  "batchSize": 16 ,
+  "batchSize": 128 ,
   "activationFunction": "tanh",
-  "dropout": 0.001 ,
-  "learningRate" : 0.00001 ,
-  "l2reg" : 0.00000005,
+  "dropout": 0.06600139458057926 ,
+  "learningRate" : 0.0008753463402134107 ,
+  "l2reg" : 0.00007087829555603556,
   "fnnType": "REG",
   "normalize" : true
 }
diff --git a/example/train_cytotox_good.json b/example/train_cytotox_good.json
new file mode 100755
index 0000000..dd2080c
--- /dev/null
+++ b/example/train_cytotox_good.json
@@ -0,0 +1,27 @@
+{
+  "py/object": "dfpl.options.Options",
+  "inputFile": "/home/shanavas/PycharmProjects/generate_dfpl_regression_data/data/cytoxicity_zscore.csv",
+  "outputDir": "example/cytotox/new/train_comp5",
+  "ecModelDir": "example/models/generic_encoder/",
+  "type": "smiles",
+  "fpType": "topological",
+  "fpSize": 2048,
+  "encFPSize": 256,
+  "enableMultiLabel": false,
+  "verbose": 2,
+  "trainAC": false,
+  "trainFNN": true,
+  "compressFeatures": true,
+  "kFolds": 5,
+  "testSize": 0.2,
+  "optimizer": "Adam",
+  "lossFunction": "Huber",
+  "epochs": 5000,
+  "batchSize": 32 ,
+  "activationFunction": "tanh",
+  "dropout": 0.05 ,
+  "learningRate" : 0.00001 ,
+  "l2reg" : 0.001,
+  "fnnType": "REG",
+  "normalize" : true
+}
diff --git a/sweep_cytotox.yaml b/sweep_cytotox.yaml
index b8df4cc..8a4cb6e 100644
--- a/sweep_cytotox.yaml
+++ b/sweep_cytotox.yaml
@@ -6,9 +6,10 @@ command:
   - ${program}
   - "train"
   - "-f"
-  - "/home/shanavas/PycharmProjects/deepFPlearn/example/dilshana-sweep-trial/cytotox.json"
+  - "/home/shanavas/PycharmProjects/deepFPlearn/example/dilshana-sweep-trial/trial_sweep.json"
   - ${args}
 method: random
+
 metric:
   name: val_loss
   goal: minimize
@@ -26,7 +27,9 @@ parameters:
   learningRate:
     min: 0.0001
    max: 0.001
   batchSize: # For batch size, it’s better to choose from discrete values
-    values: [32, 64, 128, 256, 512]
+    values: [32, 64, 128]
   wabTarget:
-    value: AR
+    value: actual
+
+
diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log
index 8e9b264..a7c2e48 120000
--- a/wandb/debug-internal.log
+++ b/wandb/debug-internal.log
@@ -1 +1 @@
-run-20241126_125844-yj2mtwy7/logs/debug-internal.log
\ No newline at end of file
+run-20241203_133139-yqh3ifss/logs/debug-internal.log
\ No newline at end of file
diff --git a/wandb/debug.log b/wandb/debug.log
index 240ca66..b33309d 120000
--- a/wandb/debug.log
+++ b/wandb/debug.log
@@ -1 +1 @@
-run-20241126_125844-yj2mtwy7/logs/debug.log
\ No newline at end of file
+run-20241203_133139-yqh3ifss/logs/debug.log
\ No newline at end of file
diff --git a/wandb/latest-run b/wandb/latest-run
index 3a5d584..1908800 120000
--- a/wandb/latest-run
+++ b/wandb/latest-run
@@ -1 +1 @@
-run-20241126_125844-yj2mtwy7
\ No newline at end of file
+run-20241203_133139-yqh3ifss
\ No newline at end of file
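
Note on the new calculate_r2 helper: dfpl/__main__.py now imports sklearn.metrics.r2_score while single_label_model.py uses the hand-rolled helper. A minimal sketch (not part of the patch; the array values are made up) showing the two agree on plain NumPy arrays, which would let the sklearn import back the metric later:

import numpy as np
from sklearn.metrics import r2_score

def calculate_r2(y_true, y_pred):
    # Same formula as the patched helper: 1 - SS_res / (SS_tot + eps)
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    return 1 - (ss_res / (ss_tot + np.finfo(float).eps))

y_true = np.array([1.2, -0.3, 0.8, 2.1])  # hypothetical targets
y_pred = np.array([1.0, -0.1, 0.9, 1.8])  # hypothetical predictions
assert np.isclose(calculate_r2(y_true, y_pred), r2_score(y_true, y_pred))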
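
Note on the W&B logging added to fit_and_evaluate_model: wandb.log() raises an error when no run is active, so the call relies on wandb.init() having run earlier (e.g. under the wabTracking option that trial_sweep.json enables). A guarded sketch, assuming a tracking flag and a project name that are not part of the patch:

import wandb

def log_regression_metrics(rmse_train, rmse_test, r2_train, r2_test, tracking=True):
    # Mirrors the patch's metric dict, but skips logging when tracking is off.
    if not tracking:       # e.g. opts.wabTracking is False
        return
    if wandb.run is None:  # wandb.log() errors without an active run
        wandb.init(project="dfpl-cytotox")  # hypothetical project name
    wandb.log({
        "Train RMSE": rmse_train,
        "Test RMSE": rmse_test,
        "Train R²": r2_train,
        "Test R²": r2_test,
    })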
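
Note on the commented-out stratified split: verstack's documented entry point is the scsplit function inside the stratified_continuous_split module, so `from verstack import stratified_continuous_split` likely binds the module rather than a callable, and stratify expects the continuous target values rather than a column-name string. If the block is revived, a sketch along these lines (signature per verstack's documented API; treat as an assumption and verify against the installed version):

from verstack.stratified_continuous_split import scsplit

x_train, x_test, y_train, y_test = scsplit(
    x, y,                            # feature matrix and continuous target from the caller
    stratify=y,                      # bin-stratify on the target itself
    test_size=opts.testSize,         # same option the sklearn split uses
    random_state=split_random_state,
)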