
Commit

Merge pull request #18 from spirosmaggioros/algorithms_optimization
Optimizations to MLPTorch module | Some more test cases | Minor syntax fixes and pre-commit workflow
spirosmaggioros authored Aug 1, 2024
2 parents 4dc1230 + 3268904 commit 28c3e50
Showing 10 changed files with 157 additions and 74 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/macos_test_cases.yml
@@ -5,7 +5,7 @@ on: [push, pull_request, workflow_dispatch]

jobs:
build:
runs-on: ["macos-latest"]
runs-on: ["macos-13"]

steps:
- uses: actions/checkout@v4
17 changes: 17 additions & 0 deletions .github/workflows/pre-commit.yml
@@ -0,0 +1,17 @@
name: pre-commit

on:
pull_request:
push:
branches: [main]

jobs:
pre-commit:
runs-on: macos-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.8'
- run: pip3 install pre-commit
- run: pre-commit run --files spare_scores/*
1 change: 1 addition & 0 deletions setup.cfg
@@ -17,6 +17,7 @@ per-file-ignores =
__init__.py: F401, F403

[mypy]
exclude= tests
# Import discovery
namespace_packages = False
ignore_missing_imports = True
4 changes: 1 addition & 3 deletions spare_scores/classes.py
@@ -94,9 +94,7 @@ def train_model(self, df: pd.DataFrame, **kwargs: Any) -> Any:

try:
result = self.model.fit(
df[self.predictors + [self.key_var] + [self.target]],
self.verbose,
**kwargs,
df[self.predictors + [self.key_var] + [self.target]], self.verbose
)
except Exception as e:
err = "\033[91m\033[1m" + "spare_train(): Model fit failed." + "\033[0m\n"
6 changes: 4 additions & 2 deletions spare_scores/data_prep.py
@@ -299,7 +299,9 @@ def age_sex_match(
)[1]
else:
p_sex = 1
logging.debug(f" Original: P_age: {np.round(p_age,2)}/ P_sex: {np.round(p_sex,2)}")
logging.debug(
f" Original: P_age: {np.round(p_age, 2)}/ P_sex: {np.round(p_sex, 2)}"
)

p_age_all, p_sex_all = np.array(p_age), np.array(p_sex)
while np.min([p_age, p_sex]) < p_threshold:
@@ -345,7 +347,7 @@ def age_sex_match(
df1, df2 = df2.copy(), df1.copy()

logging.debug(f" {n_orig - len(df1.index) - len(df2.index)} participants excluded")
logging.debug(f" Final: P_age: {np.round(p_age,2)}/ P_sex {np.round(p_sex,2)}")
logging.debug(f" Final: P_age: {np.round(p_age, 2)}/ P_sex {np.round(p_sex, 2)}")
logging.info("Age/Sex matched!")
if no_df2:
return pd.concat([df1, df2], ignore_index=True)
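
Note: the hunk above only reflows the debug logging, but it sits inside the matching loop, where age and sex p-values are recomputed while participants are dropped until both clear p_threshold. For reference, a minimal standalone sketch of that idea, assuming a Welch t-test for age and a chi-squared test for sex; the column names "Age"/"Sex", the default threshold, and the drop heuristic are illustrative, not the package's actual implementation:

    import numpy as np
    import pandas as pd
    from scipy import stats

    def age_sex_match_sketch(df1: pd.DataFrame, df2: pd.DataFrame,
                             p_threshold: float = 0.15) -> tuple:
        """Illustrative only: trim the larger group until age/sex p-values clear the threshold."""
        while True:
            # Welch's t-test on age, chi-squared test on the sex-by-group contingency table
            p_age = stats.ttest_ind(df1["Age"], df2["Age"], equal_var=False).pvalue
            sex = pd.concat([df1["Sex"], df2["Sex"]]).to_numpy()
            group = np.r_[np.zeros(len(df1)), np.ones(len(df2))]
            p_sex = stats.chi2_contingency(pd.crosstab(sex, group))[1]
            if min(p_age, p_sex) >= p_threshold or min(len(df1), len(df2)) <= 2:
                return df1, df2
            # crude heuristic: drop the participant in the larger group whose age is
            # farthest from the other group's mean age
            src, other = (df1, df2) if len(df1) >= len(df2) else (df2, df1)
            worst = (src["Age"] - other["Age"].mean()).abs().idxmax()
            if src is df1:
                df1 = df1.drop(index=worst)
            else:
                df2 = df2.drop(index=worst)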
35 changes: 22 additions & 13 deletions spare_scores/mlp.py
@@ -99,7 +99,7 @@ def __init__(
"mlp__alpha": [0.001, 0.01, 0.05, 0.1],
"mlp__learning_rate": ["constant", "adaptive"],
"mlp__early_stopping": [True],
"mlp__max_iter": [5000],
"mlp__max_iter": [500],
}

def set_parameters(self, **parameters: Any) -> None:
@@ -112,11 +112,11 @@ def _fit(self, df: pd.DataFrame) -> None:
y = df[self.to_predict].astype("float64")

if self.task == "Regression":
mlp = MLPRegressor(early_stopping=True, max_iter=5000)
mlp = MLPRegressor(early_stopping=True, max_iter=500)
scoring = "neg_mean_absolute_error"
metrics = ["MAE", "RMSE", "R2"]
else:
mlp = MLPClassifier(early_stopping=True, max_iter=5000)
mlp = MLPClassifier(early_stopping=True, max_iter=500)
scoring = "balanced_accuracy"
metrics = [
"AUC",
@@ -136,7 +136,7 @@ def _fit(self, df: pd.DataFrame) -> None:
pipeline_obj,
self.param_grid,
scoring=scoring,
cv=KFold(n_splits=5, shuffle=True, random_state=10086),
cv=KFold(n_splits=5, shuffle=True, random_state=42),
refit=True,
)
grid_search.fit(X, y)
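
For reference, a standalone sketch of the tuning this hunk configures: a pipeline whose final step is named "mlp" (matching the mlp__ prefixes in the param_grid above), grid-searched with the seeded 5-fold split. The toy regression data and the StandardScaler step are assumptions for illustration:

    from sklearn.datasets import make_regression
    from sklearn.model_selection import GridSearchCV, KFold
    from sklearn.neural_network import MLPRegressor
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    X, y = make_regression(n_samples=200, n_features=10, noise=0.1, random_state=0)

    pipeline_obj = Pipeline([
        ("scaler", StandardScaler()),
        ("mlp", MLPRegressor(early_stopping=True, max_iter=500)),
    ])
    param_grid = {
        "mlp__alpha": [0.001, 0.01, 0.05, 0.1],
        "mlp__learning_rate": ["constant", "adaptive"],
        "mlp__early_stopping": [True],
        "mlp__max_iter": [500],
    }
    grid_search = GridSearchCV(
        pipeline_obj,
        param_grid,
        scoring="neg_mean_absolute_error",  # "balanced_accuracy" for the classification branch
        cv=KFold(n_splits=5, shuffle=True, random_state=42),
        refit=True,
    )
    grid_search.fit(X, y)
    print(grid_search.best_params_)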
@@ -226,13 +226,23 @@ def predict(self, df: pd.DataFrame) -> np.ndarray:

return y_pred

def get_stats(self, y_test: np.ndarray, y_score: np.ndarray) -> None:
if len(y_test.unique()) == 2:
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score, pos_label=1)
def get_stats(self, y: np.ndarray, y_hat: np.ndarray) -> None:
"""
Return the stats from the training
:param y: original labels
:type y: np.ndarray
:param y_hat: predicted values
:type y_hat: np.ndarray
"""

if len(y.unique()) == 2:
fpr, tpr, thresholds = metrics.roc_curve(y, y_hat, pos_label=1)
self.stats["AUC"].append(metrics.auc(fpr, tpr))

# tn, fp, fn, tp = metrics.confusion_matrix(y_test, (y_score >= thresholds[np.argmax(tpr - fpr)])*2-1).ravel()
tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_score).ravel()
tn, fp, fn, tp = metrics.confusion_matrix(y, y_hat).ravel()
self.stats["Accuracy"].append((tp + tn) / (tp + tn + fp + fn))
self.stats["Sensitivity"].append(tp / (tp + fp))
self.stats["Specificity"].append(tn / (tn + fn))
@@ -241,15 +251,14 @@ def get_stats(self, y_test: np.ndarray, y_score: np.ndarray) -> None:
self.stats["Recall"].append(recall)
self.stats["F1"].append(2 * precision * recall / (precision + recall))
else:
self.stats["MAE"].append(metrics.mean_absolute_error(y_test, y_score))
self.stats["MAE"].append(metrics.mean_absolute_error(y, y_hat))
self.stats["RMSE"].append(
metrics.mean_squared_error(y_test, y_score, squared=False)
metrics.mean_squared_error(y, y_hat, squared=False)
)
self.stats["R2"].append(metrics.r2_score(y_test, y_score))
# logging.debug(' > ' + ' / '.join([f'{key}={value[-1]:#.4f}' for key, value in self.stats.items()]))
self.stats["R2"].append(metrics.r2_score(y, y_hat))

def output_stats(self) -> None:
for key, value in self.stats.items():
logging.info(
f">> {key} = {np.mean(value):#.4f} \u00B1 {np.std(value):#.4f}"
f">> {key} = {np.mean(value): #.4f} \u00B1 {np.std(value): #.4f}"
)
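
The only change in output_stats is inside the format spec: ": #.4f" adds the space sign option, which reserves a leading space for non-negative values so positive and negative stats line up in the log, while "#" keeps the decimal point. A quick comparison of the two specs:

    print(f"{3.14159:#.4f}")    # 3.1416
    print(f"{3.14159: #.4f}")   # ' 3.1416' -- leading space reserved for a sign
    print(f"{-3.14159: #.4f}")  # -3.1416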
94 changes: 43 additions & 51 deletions spare_scores/mlp_torch.py
@@ -1,4 +1,5 @@
import logging
import os
import time
from typing import Any, Tuple

@@ -28,6 +29,7 @@
from sklearn.utils._testing import ignore_warnings
from torch.utils.data import DataLoader, Dataset

os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0" # for MPS backend
device = (
"cuda"
if torch.cuda.is_available()
@@ -86,64 +88,43 @@ class SimpleMLP(nn.Module):

def __init__(
self,
num_features: int = 147,
hidden_size: int = 256,
classification: bool = True,
dropout: Any = 0.2,
dropout: float = 0.2,
use_bn: bool = False,
bn: str = "bn",
) -> None:
super(SimpleMLP, self).__init__()

self.num_features = num_features
self.hidden_size = hidden_size
self.dropout = dropout
self.classification = classification
self.use_bn = use_bn

self.linear1 = nn.Linear(self.num_features, self.hidden_size)
self.norm1 = (
nn.InstanceNorm1d(self.hidden_size, eps=1e-15)
if bn != "bn"
else nn.BatchNorm1d(self.hidden_size, eps=1e-15)
)
def MLPLayer(hidden_size: int) -> nn.Module:
"""
Our model contains 2 MLPLayers(see bellow)
"""
return nn.Sequential(
nn.LazyLinear(hidden_size),
(
nn.InstanceNorm1d(hidden_size, eps=1e-15)
if bn != "bn"
else nn.BatchNorm1d(hidden_size, eps=1e-15)
),
nn.ReLU(),
nn.Dropout(p=0.2),
)

self.linear2 = nn.Linear(self.hidden_size, self.hidden_size // 2)
self.norm2 = (
nn.InstanceNorm1d(self.hidden_size // 2, eps=1e-15)
if bn != "bn"
else nn.BatchNorm1d(self.hidden_size // 2, eps=1e-15)
self.model = nn.Sequential(
MLPLayer(self.hidden_size),
MLPLayer(self.hidden_size // 2),
nn.LazyLinear(1),
nn.Sigmoid() if self.classification else nn.ReLU(),
)

self.linear3 = nn.Linear(self.hidden_size // 2, 1)

self.relu = nn.ReLU()
self.dropout = nn.Dropout(p=0.2)
self.sigmoid = nn.Sigmoid()

def forward(self, x: torch.Tensor) -> torch.Tensor:
# first layer
x = self.linear1(x)
if self.use_bn:
x = self.norm1(x)
x = self.dropout(self.relu(x))

# second layer
x = self.linear2(x)
if self.use_bn:
x = self.norm2(x)
x = self.relu(x)
x = self.dropout(x)

# thrid layer
x = self.linear3(x)

if self.classification:
x = self.sigmoid(x)
else:
x = self.relu(x)

return x.squeeze()
return self.model(x).squeeze()
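
The rewritten SimpleMLP builds its hidden blocks on nn.LazyLinear, which infers the input width from the first batch it sees; that is why the num_features argument could be dropped from the constructor. A minimal sketch of that lazy-initialization behaviour (the feature count 147 here is just an example value):

    import torch
    import torch.nn as nn

    # LazyLinear defers weight allocation until the first forward pass
    block = nn.Sequential(nn.LazyLinear(256), nn.ReLU(), nn.LazyLinear(1))
    x = torch.randn(8, 147)          # batch of 8 samples, 147 features
    out = block(x)                   # weights are materialized here
    print(out.shape)                 # torch.Size([8, 1])
    print(block[0].weight.shape)     # torch.Size([256, 147]) -- input size was inferred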


class MLPTorchModel:
@@ -226,8 +207,7 @@ def __init__(

if "num_epochs" not in kwargs.keys():
self.num_epochs = 100

if device != "cuda":
if device != "cuda" and device != "mps":
print("You are not using the GPU! Check your device")

# Model settings
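
The GPU check now also accepts Apple's MPS backend, to match the device selection near the top of the file (partly cut off above). A sketch of that usual cuda → mps → cpu fallback, including the allocator override this commit adds for MPS; the exact expression in the module may differ slightly:

    import os
    import torch

    # let the MPS allocator exceed the default high-watermark limit (0.0 disables it)
    os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

    device = (
        "cuda"
        if torch.cuda.is_available()
        else "mps" if torch.backends.mps.is_available() else "cpu"
    )
    if device != "cuda" and device != "mps":
        print("You are not using the GPU! Check your device")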
@@ -240,6 +220,19 @@
self.val_dl: Any

def find_best_threshold(self, y_hat: list, y: list) -> Any:
"""
Returns best threshold value using the roc_curve
:param y_hat: predicted values
:type y_hat: list
:param y: original labels
:type y: list
:return: the best threshold value
:rtype: Any
"""

fpr, tpr, thresholds_roc = roc_curve(y, y_hat, pos_label=1)
youden_index = tpr - fpr
best_threshold_youden = thresholds_roc[np.argmax(youden_index)]
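
find_best_threshold picks the operating point with Youden's J statistic: the ROC threshold where tpr - fpr is largest. A self-contained check of that selection rule on toy scores:

    import numpy as np
    from sklearn.metrics import roc_curve

    y = np.array([0, 0, 0, 1, 1, 1])
    y_hat = np.array([0.1, 0.4, 0.35, 0.8, 0.65, 0.9])   # predicted scores

    fpr, tpr, thresholds_roc = roc_curve(y, y_hat, pos_label=1)
    youden_index = tpr - fpr
    best_threshold_youden = thresholds_roc[np.argmax(youden_index)]
    print(best_threshold_youden)   # 0.65 -- every positive scores above it, every negative below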
@@ -297,21 +290,21 @@ def get_all_stats(self, y_hat: list, y: list, classification: bool = True) -> di
return res_dict

def object(self, trial: Any) -> float:

evaluation_metric = (
"Balanced Accuarcy" if self.task == "Classification" else "MAE"
"Balanced Accuracy" if self.task == "Classification" else "MAE"
)
assert self.train_dl is not None
assert self.val_dl is not None

hidden_size = trial.suggest_categorical("hidden_size", [128, 256, 512])
dropout = trial.suggest_float("dropout", 0.1, 0.8, step=0.05)
hidden_size = trial.suggest_categorical(
"hidden_size", [x for x in range(32, 512, 32)]
)
dropout = trial.suggest_float("dropout", 0.1, 0.8, step=0.03)
lr = trial.suggest_float("lr", 1e-4, 1e-1, log=True)
use_bn = trial.suggest_categorical("use_bn", [False, True])
bn = trial.suggest_categorical("bn", ["bn", "in"])

model = SimpleMLP(
num_features=len(self.predictors),
hidden_size=hidden_size,
classification=self.classification,
dropout=dropout,
@@ -430,7 +423,6 @@ def fit(self, df: pd.DataFrame, verbose: int = 1, **kwargs: Any) -> dict:
best_model_state_dict = best_checkpoint["model_state_dict"]

self.mdl = SimpleMLP(
num_features=len(self.predictors),
hidden_size=best_hyperparams["hidden_size"],
classification=self.classification,
dropout=best_hyperparams["dropout"],
@@ -496,5 +488,5 @@ def predict(self, df: pd.DataFrame) -> np.ndarray:
def output_stats(self) -> None:
for key, value in self.stats.items():
logging.info(
f">> {key} = {np.mean(value):#.4f} \u00B1 {np.std(value):#.4f}"
f">> {key} = {np.mean(value): #.4f} \u00B1 {np.std(value): #.4f}"
)
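
The search-space changes earlier in this file widen the Optuna sweep: hidden_size now ranges over multiples of 32 below 512 and dropout moves in steps of 0.03. A minimal sketch of how those suggest calls drive a study; the stand-in objective body is an assumption (the real one trains SimpleMLP on the train/validation loaders and returns balanced accuracy or MAE):

    import optuna

    def objective(trial: optuna.Trial) -> float:
        hidden_size = trial.suggest_categorical("hidden_size", [x for x in range(32, 512, 32)])
        dropout = trial.suggest_float("dropout", 0.1, 0.8, step=0.03)
        lr = trial.suggest_float("lr", 1e-4, 1e-1, log=True)
        # stand-in score so the sketch runs; the real objective trains and validates the MLP
        return abs(hidden_size / 512.0 - dropout) + lr

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=20)
    print(study.best_params)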
4 changes: 2 additions & 2 deletions spare_scores/svm.py
@@ -272,7 +272,7 @@ def train_initialize(self, df: pd.DataFrame, to_predict: str) -> None:
def run_CV(self, df: pd.DataFrame) -> None:
for i, fold in enumerate(self.folds):
if i % self.n_repeats == 0:
logging.info(f" FOLD {int(i/self.n_repeats+1)}...")
logging.info(f" FOLD {int(i / self.n_repeats + 1)}...")
X_train, X_test, y_train, y_test = self.prepare_sample(
df, fold, self.scaler[i], classify=self.classify
)
@@ -360,5 +360,5 @@ def correct_reg_bias(self, fold: Any, y_test: list) -> Any:
def output_stats(self) -> None:
for key, value in self.stats.items():
logging.info(
f">> {key} = {np.mean(value):#.4f} \u00B1 {np.std(value):#.4f}"
f">> {key} = {np.mean(value): #.4f} \u00B1 {np.std(value): #.4f}"
)
Empty file added test_data_prep.py