diff --git a/.github/workflows/macos_test_cases.yml b/.github/workflows/macos_test_cases.yml
index 85d8429..e01cb98 100644
--- a/.github/workflows/macos_test_cases.yml
+++ b/.github/workflows/macos_test_cases.yml
@@ -5,7 +5,7 @@ on: [push, pull_request, workflow_dispatch]
 
 jobs:
   build:
-    runs-on: ["macos-latest"]
+    runs-on: ["macos-13"]
 
     steps:
       - uses: actions/checkout@v4
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
new file mode 100644
index 0000000..9a4925a
--- /dev/null
+++ b/.github/workflows/pre-commit.yml
@@ -0,0 +1,17 @@
+name: pre-commit
+
+on:
+  pull_request:
+  push:
+    branches: [main]
+
+jobs:
+  pre-commit:
+    runs-on: macos-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.8'
+      - run: pip3 install pre-commit
+      - run: pre-commit run --files spare_scores/*
diff --git a/setup.cfg b/setup.cfg
index effd740..aae17b1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -17,6 +17,7 @@ per-file-ignores =
     __init__.py: F401, F403
 
 [mypy]
+exclude= tests
 # Import discovery
 namespace_packages = False
 ignore_missing_imports = True
diff --git a/spare_scores/classes.py b/spare_scores/classes.py
index 0918af1..1a11fff 100644
--- a/spare_scores/classes.py
+++ b/spare_scores/classes.py
@@ -94,9 +94,7 @@ def train_model(self, df: pd.DataFrame, **kwargs: Any) -> Any:
 
         try:
             result = self.model.fit(
-                df[self.predictors + [self.key_var] + [self.target]],
-                self.verbose,
-                **kwargs,
+                df[self.predictors + [self.key_var] + [self.target]], self.verbose
             )
         except Exception as e:
             err = "\033[91m\033[1m" + "spare_train(): Model fit failed." + "\033[0m\n"
diff --git a/spare_scores/data_prep.py b/spare_scores/data_prep.py
index 64fe56f..a262c70 100644
--- a/spare_scores/data_prep.py
+++ b/spare_scores/data_prep.py
@@ -299,7 +299,9 @@ def age_sex_match(
         )[1]
     else:
         p_sex = 1
-    logging.debug(f" Original: P_age: {np.round(p_age,2)}/ P_sex: {np.round(p_sex,2)}")
+    logging.debug(
+        f" Original: P_age: {np.round(p_age, 2)}/ P_sex: {np.round(p_sex, 2)}"
+    )
     p_age_all, p_sex_all = np.array(p_age), np.array(p_sex)
 
     while np.min([p_age, p_sex]) < p_threshold:
@@ -345,7 +347,7 @@ def age_sex_match(
         df1, df2 = df2.copy(), df1.copy()
 
     logging.debug(f" {n_orig - len(df1.index) - len(df2.index)} participants excluded")
-    logging.debug(f" Final: P_age: {np.round(p_age,2)}/ P_sex {np.round(p_sex,2)}")
+    logging.debug(f" Final: P_age: {np.round(p_age, 2)}/ P_sex {np.round(p_sex, 2)}")
     logging.info("Age/Sex matched!")
     if no_df2:
         return pd.concat([df1, df2], ignore_index=True)
diff --git a/spare_scores/mlp.py b/spare_scores/mlp.py
index 675c637..d101563 100644
--- a/spare_scores/mlp.py
+++ b/spare_scores/mlp.py
@@ -99,7 +99,7 @@ def __init__(
             "mlp__alpha": [0.001, 0.01, 0.05, 0.1],
             "mlp__learning_rate": ["constant", "adaptive"],
             "mlp__early_stopping": [True],
-            "mlp__max_iter": [5000],
+            "mlp__max_iter": [500],
         }
 
     def set_parameters(self, **parameters: Any) -> None:
@@ -112,11 +112,11 @@ def _fit(self, df: pd.DataFrame) -> None:
         y = df[self.to_predict].astype("float64")
 
         if self.task == "Regression":
-            mlp = MLPRegressor(early_stopping=True, max_iter=5000)
+            mlp = MLPRegressor(early_stopping=True, max_iter=500)
             scoring = "neg_mean_absolute_error"
             metrics = ["MAE", "RMSE", "R2"]
         else:
-            mlp = MLPClassifier(early_stopping=True, max_iter=5000)
+            mlp = MLPClassifier(early_stopping=True, max_iter=500)
             scoring = "balanced_accuracy"
             metrics = [
                 "AUC",
@@ -136,7 +136,7 @@ def _fit(self, df: pd.DataFrame) -> None:
             pipeline_obj,
             self.param_grid,
             scoring=scoring,
-            cv=KFold(n_splits=5, shuffle=True, random_state=10086),
+            cv=KFold(n_splits=5, shuffle=True, random_state=42),
             refit=True,
         )
         grid_search.fit(X, y)
@@ -226,13 +226,23 @@ def predict(self, df: pd.DataFrame) -> np.ndarray:
 
         return y_pred
 
-    def get_stats(self, y_test: np.ndarray, y_score: np.ndarray) -> None:
-        if len(y_test.unique()) == 2:
-            fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score, pos_label=1)
+    def get_stats(self, y: np.ndarray, y_hat: np.ndarray) -> None:
+        """
+        Return the stats from the training
+
+        :param y: original labels
+        :type y: np.ndarray
+        :param y_hat: predicted values
+        :type y_hat: np.ndarray
+
+        """
+
+        if len(y.unique()) == 2:
+            fpr, tpr, thresholds = metrics.roc_curve(y, y_hat, pos_label=1)
             self.stats["AUC"].append(metrics.auc(fpr, tpr))
             # tn, fp, fn, tp = metrics.confusion_matrix(y_test, (y_score >= thresholds[np.argmax(tpr - fpr)])*2-1).ravel()
-            tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_score).ravel()
+            tn, fp, fn, tp = metrics.confusion_matrix(y, y_hat).ravel()
             self.stats["Accuracy"].append((tp + tn) / (tp + tn + fp + fn))
             self.stats["Sensitivity"].append(tp / (tp + fp))
             self.stats["Specificity"].append(tn / (tn + fn))
@@ -241,15 +251,14 @@ def get_stats(self, y_test: np.ndarray, y_score: np.ndarray) -> None:
             self.stats["Recall"].append(recall)
             self.stats["F1"].append(2 * precision * recall / (precision + recall))
         else:
-            self.stats["MAE"].append(metrics.mean_absolute_error(y_test, y_score))
+            self.stats["MAE"].append(metrics.mean_absolute_error(y, y_hat))
             self.stats["RMSE"].append(
-                metrics.mean_squared_error(y_test, y_score, squared=False)
+                metrics.mean_squared_error(y, y_hat, squared=False)
             )
-            self.stats["R2"].append(metrics.r2_score(y_test, y_score))
-            # logging.debug(' > ' + ' / '.join([f'{key}={value[-1]:#.4f}' for key, value in self.stats.items()]))
+            self.stats["R2"].append(metrics.r2_score(y, y_hat))
 
     def output_stats(self) -> None:
         for key, value in self.stats.items():
             logging.info(
-                f">> {key} = {np.mean(value):#.4f} \u00B1 {np.std(value):#.4f}"
+                f">> {key} = {np.mean(value): #.4f} \u00B1 {np.std(value): #.4f}"
             )
diff --git a/spare_scores/mlp_torch.py b/spare_scores/mlp_torch.py
index 79a1189..3715432 100644
--- a/spare_scores/mlp_torch.py
+++ b/spare_scores/mlp_torch.py
@@ -1,4 +1,5 @@
 import logging
+import os
 import time
 from typing import Any, Tuple
 
@@ -28,6 +29,7 @@
 from sklearn.utils._testing import ignore_warnings
 from torch.utils.data import DataLoader, Dataset
 
+os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"  # for MPS backend
 device = (
     "cuda"
     if torch.cuda.is_available()
@@ -86,64 +88,43 @@ class SimpleMLP(nn.Module):
 
     def __init__(
         self,
-        num_features: int = 147,
         hidden_size: int = 256,
         classification: bool = True,
-        dropout: Any = 0.2,
+        dropout: float = 0.2,
         use_bn: bool = False,
         bn: str = "bn",
     ) -> None:
         super(SimpleMLP, self).__init__()
 
-        self.num_features = num_features
         self.hidden_size = hidden_size
         self.dropout = dropout
         self.classification = classification
         self.use_bn = use_bn
 
-        self.linear1 = nn.Linear(self.num_features, self.hidden_size)
-        self.norm1 = (
-            nn.InstanceNorm1d(self.hidden_size, eps=1e-15)
-            if bn != "bn"
-            else nn.BatchNorm1d(self.hidden_size, eps=1e-15)
-        )
+        def MLPLayer(hidden_size: int) -> nn.Module:
+            """
+            Our model contains 2 MLPLayers(see bellow)
+            """
+            return nn.Sequential(
+                nn.LazyLinear(hidden_size),
+                (
+                    nn.InstanceNorm1d(hidden_size, eps=1e-15)
+                    if bn != "bn"
+                    else nn.BatchNorm1d(hidden_size, eps=1e-15)
+                ),
+                nn.ReLU(),
+                nn.Dropout(p=0.2),
+            )
 
-        self.linear2 = nn.Linear(self.hidden_size, self.hidden_size // 2)
-        self.norm2 = (
-            nn.InstanceNorm1d(self.hidden_size // 2, eps=1e-15)
-            if bn != "bn"
-            else nn.BatchNorm1d(self.hidden_size // 2, eps=1e-15)
+        self.model = nn.Sequential(
+            MLPLayer(self.hidden_size),
+            MLPLayer(self.hidden_size // 2),
+            nn.LazyLinear(1),
+            nn.Sigmoid() if self.classification else nn.ReLU(),
         )
-        self.linear3 = nn.Linear(self.hidden_size // 2, 1)
-
-        self.relu = nn.ReLU()
-        self.dropout = nn.Dropout(p=0.2)
-        self.sigmoid = nn.Sigmoid()
-
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        # first layer
-        x = self.linear1(x)
-        if self.use_bn:
-            x = self.norm1(x)
-        x = self.dropout(self.relu(x))
-
-        # second layer
-        x = self.linear2(x)
-        if self.use_bn:
-            x = self.norm2(x)
-        x = self.relu(x)
-        x = self.dropout(x)
-
-        # thrid layer
-        x = self.linear3(x)
-
-        if self.classification:
-            x = self.sigmoid(x)
-        else:
-            x = self.relu(x)
-
-        return x.squeeze()
+        return self.model(x).squeeze()
 
 
 class MLPTorchModel:
@@ -226,8 +207,7 @@ def __init__(
 
         if "num_epochs" not in kwargs.keys():
             self.num_epochs = 100
-
-        if device != "cuda":
+        if device != "cuda" and device != "mps":
             print("You are not using the GPU! Check your device")
 
         # Model settings
@@ -240,6 +220,19 @@ def __init__(
         self.val_dl: Any
 
     def find_best_threshold(self, y_hat: list, y: list) -> Any:
+        """
+        Returns best threshold value using the roc_curve
+
+        :param y_hat: predicted values
+        :type y_hat: list
+        :param y: original labels
+        :type y: list
+
+        :return: the best threshold value
+        :rtype: Any
+
+        """
+
         fpr, tpr, thresholds_roc = roc_curve(y, y_hat, pos_label=1)
         youden_index = tpr - fpr
         best_threshold_youden = thresholds_roc[np.argmax(youden_index)]
@@ -297,21 +290,21 @@ def get_all_stats(self, y_hat: list, y: list, classification: bool = True) -> di
         return res_dict
 
     def object(self, trial: Any) -> float:
-
         evaluation_metric = (
-            "Balanced Accuarcy" if self.task == "Classification" else "MAE"
+            "Balanced Accuracy" if self.task == "Classification" else "MAE"
         )
         assert self.train_dl is not None
         assert self.val_dl is not None
 
-        hidden_size = trial.suggest_categorical("hidden_size", [128, 256, 512])
-        dropout = trial.suggest_float("dropout", 0.1, 0.8, step=0.05)
+        hidden_size = trial.suggest_categorical(
+            "hidden_size", [x for x in range(32, 512, 32)]
+        )
+        dropout = trial.suggest_float("dropout", 0.1, 0.8, step=0.03)
         lr = trial.suggest_float("lr", 1e-4, 1e-1, log=True)
         use_bn = trial.suggest_categorical("use_bn", [False, True])
         bn = trial.suggest_categorical("bn", ["bn", "in"])
 
         model = SimpleMLP(
-            num_features=len(self.predictors),
             hidden_size=hidden_size,
             classification=self.classification,
             dropout=dropout,
@@ -430,7 +423,6 @@ def fit(self, df: pd.DataFrame, verbose: int = 1, **kwargs: Any) -> dict:
         best_model_state_dict = best_checkpoint["model_state_dict"]
 
         self.mdl = SimpleMLP(
-            num_features=len(self.predictors),
             hidden_size=best_hyperparams["hidden_size"],
             classification=self.classification,
             dropout=best_hyperparams["dropout"],
@@ -496,5 +488,5 @@ def predict(self, df: pd.DataFrame) -> np.ndarray:
     def output_stats(self) -> None:
         for key, value in self.stats.items():
             logging.info(
-                f">> {key} = {np.mean(value):#.4f} \u00B1 {np.std(value):#.4f}"
+                f">> {key} = {np.mean(value): #.4f} \u00B1 {np.std(value): #.4f}"
             )
diff --git a/spare_scores/svm.py b/spare_scores/svm.py
index c33e375..5734927 100644
--- a/spare_scores/svm.py
+++ b/spare_scores/svm.py
@@ -272,7 +272,7 @@ def train_initialize(self, df: pd.DataFrame, to_predict: str) -> None:
     def run_CV(self, df: pd.DataFrame) -> None:
         for i, fold in enumerate(self.folds):
             if i % self.n_repeats == 0:
-                logging.info(f" FOLD {int(i/self.n_repeats+1)}...")
+                logging.info(f" FOLD {int(i / self.n_repeats + 1)}...")
             X_train, X_test, y_train, y_test = self.prepare_sample(
                 df, fold, self.scaler[i], classify=self.classify
             )
@@ -360,5 +360,5 @@ def correct_reg_bias(self, fold: Any, y_test: list) -> Any:
     def output_stats(self) -> None:
         for key, value in self.stats.items():
             logging.info(
-                f">> {key} = {np.mean(value):#.4f} \u00B1 {np.std(value):#.4f}"
+                f">> {key} = {np.mean(value): #.4f} \u00B1 {np.std(value): #.4f}"
             )
diff --git a/test_data_prep.py b/test_data_prep.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/unit/test_spare_scores.py b/tests/unit/test_spare_scores.py
index 0ea7058..46857c9 100644
--- a/tests/unit/test_spare_scores.py
+++ b/tests/unit/test_spare_scores.py
@@ -12,7 +12,7 @@
 
 
 class CheckSpareScores(unittest.TestCase):
 
-    def test_spare_test(self):
+    def test_spare_test_SVM(self):
         self.df_fixture = load_df("../fixtures/sample_data.csv")
         self.model_fixture = load_model("../fixtures/sample_model.pkl.gz")
 
@@ -55,7 +55,71 @@ def test_spare_test(self):
         )
         self.assertTrue(result == ["ROI1"])
 
-    def test_spare_train(self):
+    def test_spare_train_MLP(self):
+        self.df_fixture = load_df("../fixtures/sample_data.csv")
+        self.model_fixture = load_model("../fixtures/sample_model.pkl.gz")
+        # Test case 1: Testing spare_train with MLP model
+        result = spare_train(
+            self.df_fixture,
+            "Age",
+            model_type="MLP",
+            data_vars=[
+                "ROI1",
+                "ROI2",
+                "ROI3",
+                "ROI4",
+                "ROI5",
+                "ROI6",
+                "ROI7",
+                "ROI8",
+                "ROI9",
+                "ROI10",
+            ],
+        )
+        status, result_data = result["status"], result["data"]
+        metadata = result_data[1]
+        self.assertTrue(status == "OK")
+        self.assertTrue(metadata["mdl_type"] == "MLP")
+        self.assertTrue(metadata["kernel"] == "linear")
+        self.assertTrue(
+            set(metadata["predictors"]) == set(self.model_fixture[1]["predictors"])
+        )
+        self.assertTrue(metadata["to_predict"] == self.model_fixture[1]["to_predict"])
+
+    def test_spare_train_MLPTorch(self):
+        self.df_fixture = load_df("../fixtures/sample_data.csv")
+        self.model_fixture = load_model("../fixtures/sample_model.pkl.gz")
+        # Test case 1: testing training an MLPTorch model
+        result = spare_train(
+            self.df_fixture,
+            "Age",
+            model_type="MLPTorch",
+            data_vars=[
+                "ROI1",
+                "ROI2",
+                "ROI3",
+                "ROI4",
+                "ROI5",
+                "ROI6",
+                "ROI7",
+                "ROI8",
+                "ROI9",
+                "ROI10",
+            ],
+        )
+
+        status, result_data = result["status"], result["data"]
+
+        metadata = result_data[1]
+        self.assertTrue(status == "OK")
+        self.assertTrue(metadata["mdl_type"] == "MLPTorch")
+        self.assertTrue(metadata["kernel"] == "linear")
+        self.assertTrue(
+            set(metadata["predictors"]) == set(self.model_fixture[1]["predictors"])
+        )
+        self.assertTrue(metadata["to_predict"] == self.model_fixture[1]["to_predict"])
+
+    def test_spare_train_SVM(self):
         self.df_fixture = load_df("../fixtures/sample_data.csv")
         self.model_fixture = load_model("../fixtures/sample_model.pkl.gz")