Merge pull request #6817 from ZanMervic/scoring-sheet

[ENH] ScoringSheet and ScoringSheetViewer widgets added
biolab · Oct 28, 2024 · ef3e166 · ef3e166
2 parents c27a803 + 12bd5a1
commit ef3e166
Show file tree

Hide file tree

Showing 29 changed files with 2,633 additions and 0 deletions.
diff --git a/.coveragerc b/.coveragerc
@@ -10,6 +10,7 @@ omit =
     */tests/*
     */setup.py
     */*/setup.py
+    Orange/classification/utils/fasterrisk/*
 
 [report]
 exclude_lines =

diff --git a/Orange/classification/__init__.py b/Orange/classification/__init__.py
@@ -20,6 +20,7 @@
 from .sgd import *
 from .neural_network import *
 from .calibration import *
+from .scoringsheet import *
 try:
     from .catgb import *
 except ModuleNotFoundError:

diff --git a/Orange/classification/scoringsheet.py b/Orange/classification/scoringsheet.py
@@ -0,0 +1,152 @@
+import numpy as np
+from Orange.classification.utils.fasterrisk.fasterrisk import (
+    RiskScoreOptimizer,
+    RiskScoreClassifier,
+)
+
+from Orange.classification import Learner, Model
+from Orange.data import Table, Storage
+from Orange.data.filter import HasClass
+from Orange.preprocess import Discretize, Impute, Continuize, SelectBestFeatures
+from Orange.preprocess.discretize import Binning
+from Orange.preprocess.score import ReliefF
+
+
+def _change_class_var_values(y):
+    """
+    Swap a binary class variable between the {0, 1} and {-1, 1} encodings.
+
+    Zeros become -1 and -1 values become 0; every other value (in particular
+    1) is passed through unchanged.
+    """
+    was_zero = y == 0
+    was_minus_one = y == -1
+    # Map -1 back to 0 first, then overwrite the original zeros with -1.
+    zero_restored = np.where(was_minus_one, 0, y)
+    return np.where(was_zero, -1, zero_restored)
+
+
+class ScoringSheetModel(Model):
+    """
+    Orange model that delegates prediction to a wrapped risk-score classifier.
+
+    The wrapped object must provide ``predict(X)`` and ``predict_prob(X)``.
+    """
+
+    def __init__(self, model):
+        self.model = model
+        super().__init__()
+
+    def predict_storage(self, table):
+        """
+        Predict class values and class probabilities for the rows of ``table``.
+
+        Returns a ``(values, probabilities)`` pair. ``probabilities`` has two
+        columns, ``1 - p`` followed by ``p``, where ``p`` is the output of the
+        wrapped classifier's ``predict_prob``.
+
+        Raises ``TypeError`` when ``table`` is not an ``Orange.data.Storage``.
+        """
+        if not isinstance(table, Storage):
+            raise TypeError("Data is not a subclass of Orange.data.Storage.")
+
+        features = table.X
+        # The wrapped classifier's labels are run through the 0 <-> -1 swap.
+        predicted = _change_class_var_values(self.model.predict(features))
+        positive = self.model.predict_prob(features).reshape(-1, 1)
+        return predicted, np.hstack((1 - positive, positive))
+
+
+class ScoringSheetLearner(Learner):
+    """
+    Learner that fits a sparse, integer-bounded risk score ("scoring sheet")
+    for a two-valued class with the bundled FasterRisk optimizer.
+
+    Parameters
+    ----------
+    num_attr_after_selection : default 20
+        Number of attributes kept by the default ReliefF-based feature
+        selection. Only used when ``preprocessors`` is None.
+    num_decision_params : default 5
+        Passed to the optimizer as ``k``. Fitting lowers it one step at a
+        time whenever the optimizer raises a ``ValueError``.
+    max_points_per_param : default 5
+        Coefficient bound; the optimizer receives
+        ``lb=-max_points_per_param`` and ``ub=max_points_per_param``.
+    num_input_features : default None
+        Passed to the optimizer as ``group_sparsity``. When not None, the
+        features are additionally grouped by the variable they derive from.
+    preprocessors : default None
+        Custom preprocessors forwarded to ``Learner.__init__``. When None,
+        the class-level defaults are extended with ``SelectBestFeatures``.
+    """
+
+    __returns__ = ScoringSheetModel
+    preprocessors = [HasClass(), Discretize(method=Binning()), Impute(), Continuize()]
+
+    def __init__(
+        self,
+        num_attr_after_selection=20,
+        num_decision_params=5,
+        max_points_per_param=5,
+        num_input_features=None,
+        preprocessors=None,
+    ):
+        # Keep the constructor argument inspectable on the instance; it is
+        # otherwise only captured inside the SelectBestFeatures preprocessor.
+        self.num_attr_after_selection = num_attr_after_selection
+        self.num_decision_params = num_decision_params
+        self.max_points_per_param = max_points_per_param
+        self.num_input_features = num_input_features
+        # Filled in by fit_storage when num_input_features is set.
+        self.feature_to_group = None
+
+        if preprocessors is None:
+            # A new instance-level list is built; the class-level default
+            # list itself is not mutated.
+            self.preprocessors = [
+                *self.preprocessors,
+                SelectBestFeatures(method=ReliefF(), k=num_attr_after_selection),
+            ]
+
+        super().__init__(preprocessors=preprocessors)
+
+    def incompatibility_reason(self, domain):
+        """
+        Return a message explaining why ``domain`` cannot be learned from, or
+        None when the domain is acceptable.
+        """
+        reason = None
+        if len(domain.class_vars) > 1 and not self.supports_multiclass:
+            reason = "Too many target variables."
+        elif not domain.has_discrete_class:
+            reason = "Categorical class variable expected."
+        elif len(domain.class_vars[0].values) > 2:
+            reason = "Too many target variable values."
+        return reason
+
+    def fit_storage(self, table):
+        """
+        Fit a risk score on ``table`` and return it as a ``ScoringSheetModel``.
+
+        Instance weights are not used.
+
+        Raises
+        ------
+        TypeError
+            If ``table`` is not an ``Orange.data.Storage``.
+        ValueError
+            If the class column contains missing values, or if the optimizer
+            keeps failing until ``k`` would drop below 1.
+        """
+        if not isinstance(table, Storage):
+            raise TypeError("Data is not a subclass of Orange.data.Storage.")
+        if table.get_nan_count_class() > 0:
+            raise ValueError("Class variable contains missing values.")
+
+        if self.num_input_features is not None:
+            self._generate_feature_group_index(table)
+
+        X, y = table.X, table.Y
+        learner = RiskScoreOptimizer(
+            X=X,
+            # The optimizer is given the class column with 0 replaced by -1.
+            y=_change_class_var_values(y),
+            k=self.num_decision_params,
+            select_top_m=1,
+            lb=-self.max_points_per_param,
+            ub=self.max_points_per_param,
+            group_sparsity=self.num_input_features,
+            featureIndex_to_groupIndex=self.feature_to_group,
+        )
+
+        self._optimize_decision_params_adjustment(learner)
+
+        multipliers, intercepts, coefficients = learner.get_models()
+
+        # select_top_m=1 was requested above, so only the first model is used.
+        model = RiskScoreClassifier(
+            multiplier=multipliers[0],
+            intercept=intercepts[0],
+            coefficients=coefficients[0],
+            featureNames=[attribute.name for attribute in table.domain.attributes],
+            X_train=X if self.num_decision_params > 10 else None,
+        )
+
+        return ScoringSheetModel(model)
+
+    def _optimize_decision_params_adjustment(self, learner):
+        """
+        Fit ``learner``, lowering its number of decision parameters ('k')
+        whenever optimization fails with a ValueError.
+
+        Sometimes the number of decision parameters is too high for the
+        number of input features, which results in a ValueError. Each failure
+        decrements ``learner.k`` by one and retries. Returns True on success.
+
+        Raises
+        ------
+        ValueError
+            When 'k' drops below 1; chained to the optimizer's last error.
+        """
+        while True:
+            try:
+                learner.optimize()
+                return True
+            except ValueError as e:
+                learner.k -= 1
+                if learner.k < 1:
+                    # Raise a custom error when k falls below 1
+                    raise ValueError(
+                        "The number of input features is too low for the current settings."
+                    ) from e
+
+    def _generate_feature_group_index(self, table):
+        """
+        Build the feature-index to group-index mapping and store it in
+        ``self.feature_to_group`` (nothing is returned).
+
+        Features whose ``compute_value.variable`` share a name get the same
+        group index. Group indices are numbered in order of first appearance,
+        so the mapping is identical across runs.
+        """
+        original_feature_names = []
+        for attribute in table.domain.attributes:
+            transformation = getattr(attribute, "compute_value", None)
+            source = getattr(transformation, "variable", None)
+            # An attribute that does not expose a source variable (e.g. when
+            # custom preprocessors are used) forms a group under its own name.
+            original_feature_names.append(
+                attribute.name if source is None else source.name
+            )
+
+        # dict.fromkeys de-duplicates while keeping insertion order, unlike
+        # set(), whose iteration order for strings can change between runs.
+        feature_to_group_index = {
+            feature: idx
+            for idx, feature in enumerate(dict.fromkeys(original_feature_names))
+        }
+        self.feature_to_group = np.asarray(
+            [feature_to_group_index[feature] for feature in original_feature_names]
+        )
+
+
+if __name__ == "__main__":
+    # Manual smoke test: fit on a dataset fetched over the network, then run
+    # the fitted model on the same data.
+    mock_learner = ScoringSheetLearner(
+        num_attr_after_selection=20,
+        num_decision_params=5,
+        max_points_per_param=10,
+        num_input_features=None,
+    )
+    mock_table = Table("https://datasets.biolab.si/core/heart_disease.tab")
+    mock_model = mock_learner(mock_table)
+    mock_model(mock_table)
diff --git a/Orange/classification/utils/__init__.py b/Orange/classification/utils/__init__.py
diff --git a/Orange/classification/utils/fasterrisk/LICENSE b/Orange/classification/utils/fasterrisk/LICENSE
@@ -0,0 +1,32 @@
+
+
+BSD 3-Clause License
+
+Copyright (c) 2022, Jiachang Liu
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/Orange/classification/utils/fasterrisk/NOTICE b/Orange/classification/utils/fasterrisk/NOTICE
@@ -0,0 +1,7 @@
+Notice for Use of FasterRisk Code in Orange3
+
+This directory ('Orange/classification/utils/fasterrisk') contains code from the "FasterRisk" project by Jiachang Liu. This code is used under the BSD 3-Clause License. The source of this code can be found at https://github.com/jiachangliu/FasterRisk.
+
+The inclusion of the FasterRisk code in this project serves as a temporary solution to address compatibility and functionality issues arising from the strict requirements of the original package. This measure will remain in place until such time as the original maintainer updates the package to address these issues.
+
+A copy of the BSD 3-Clause License under which the FasterRisk code is licensed is included in this directory.
diff --git a/Orange/classification/utils/fasterrisk/__init__.py b/Orange/classification/utils/fasterrisk/__init__.py
diff --git a/Orange/classification/utils/fasterrisk/base_model.py b/Orange/classification/utils/fasterrisk/base_model.py
@@ -0,0 +1,123 @@
+import numpy as np
+import sys
+# import warnings
+# warnings.filterwarnings("ignore")
+from Orange.classification.utils.fasterrisk.utils import normalize_X, compute_logisticLoss_from_ExpyXB
+
+class logRegModel:
+    """
+    Logistic-regression state plus box-constrained coordinate-descent steps.
+
+    The model works on the output of ``normalize_X`` and stores ``ExpyXB``,
+    the element-wise ``exp(y * beta0 + yX @ betas)``, alongside the
+    coefficients so gradients can be evaluated from it directly.
+
+    NOTE(review): the formulas read as if ``y`` holds -1/+1 labels -- confirm
+    against the callers.
+    """
+
+    def __init__(self, X, y, lambda2=1e-8, intercept=True, original_lb=-5, original_ub=5):
+        """
+        Parameters
+        ----------
+        X : array
+            Feature matrix; passed through ``normalize_X``, whose first
+            result must be two-dimensional (n rows, p columns).
+        y : array
+            Labels; flattened to one dimension and cast to float.
+        lambda2 : float
+            Weight of the squared-coefficient penalty.
+        intercept : bool
+            Whether ``finetune_on_current_support`` updates the intercept.
+        original_lb, original_ub : number
+            Coefficient bounds; multiplied by ``X_norm`` for the features
+            listed in ``scaled_feature_indices``.
+        """
+        self.X = X
+        self.X_normalized, self.X_mean, self.X_norm, self.scaled_feature_indices = normalize_X(self.X)
+        self.n, self.p = self.X_normalized.shape
+        self.y = y.reshape(-1).astype(float)
+        self.yX = y.reshape(-1, 1) * self.X_normalized
+        # Store the transpose as a separate array so that row j holds yX[:, j].
+        self.yXT = np.zeros((self.p, self.n))
+        self.yXT[:] = np.transpose(self.yX)[:]
+        self.beta0 = 0
+        self.betas = np.zeros((self.p, ))
+        self.ExpyXB = np.exp(self.y * self.beta0 + self.yX.dot(self.betas))
+
+        self.intercept = intercept
+        self.lambda2 = lambda2
+        self.twoLambda2 = 2 * self.lambda2
+
+        # Denominator of the per-coordinate gradient step.
+        self.Lipschitz = 0.25 + self.twoLambda2
+        self.lbs = original_lb * np.ones(self.p)
+        self.lbs[self.scaled_feature_indices] *= self.X_norm[self.scaled_feature_indices]
+        self.ubs = original_ub * np.ones(self.p)
+        self.ubs[self.scaled_feature_indices] *= self.X_norm[self.scaled_feature_indices]
+
+        self.total_child_added = 0
+
+    def warm_start_from_original_beta0_betas(self, original_beta0, original_betas):
+        """
+        Set the current solution from coefficients given in the original
+        feature space and recompute ``ExpyXB``.
+
+        The given values are also kept as ``self.original_beta0`` and
+        ``self.original_betas``. Nothing is written to stdout.
+        """
+        self.original_beta0 = original_beta0
+        self.original_betas = original_betas
+        self.beta0, self.betas = self.transform_coefficients_to_normalized_space(self.original_beta0, self.original_betas)
+        self.ExpyXB = np.exp(self.y * self.beta0 + self.yX.dot(self.betas))
+
+    def warm_start_from_beta0_betas(self, beta0, betas):
+        """Set the current (normalized-space) solution and recompute ``ExpyXB``."""
+        self.beta0, self.betas = beta0, betas
+        self.ExpyXB = np.exp(self.y * self.beta0 + self.yX.dot(self.betas))
+
+    def warm_start_from_beta0_betas_ExpyXB(self, beta0, betas, ExpyXB):
+        """Set the current solution together with a caller-supplied ``ExpyXB``."""
+        self.beta0, self.betas, self.ExpyXB = beta0, betas, ExpyXB
+
+    def get_beta0_betas(self):
+        """Return ``(beta0, betas)`` as currently stored."""
+        return self.beta0, self.betas
+
+    def get_beta0_betas_ExpyXB(self):
+        """Return ``(beta0, betas, ExpyXB)`` as currently stored."""
+        return self.beta0, self.betas, self.ExpyXB
+
+    def get_original_beta0_betas(self):
+        """Return the current solution converted to the original feature space."""
+        return self.transform_coefficients_to_original_space(self.beta0, self.betas)
+
+    def transform_coefficients_to_original_space(self, beta0, betas):
+        """
+        Convert normalized-space coefficients to the original feature space.
+
+        Scaled features' coefficients are divided by ``X_norm``; the intercept
+        is reduced by ``X_mean . original_betas``. ``betas`` is not modified.
+        """
+        original_betas = betas.copy()
+        original_betas[self.scaled_feature_indices] = original_betas[self.scaled_feature_indices]/self.X_norm[self.scaled_feature_indices]
+        original_beta0 = beta0 - np.dot(self.X_mean, original_betas)
+        return original_beta0, original_betas
+
+    def transform_coefficients_to_normalized_space(self, original_beta0, original_betas):
+        """
+        Inverse of ``transform_coefficients_to_original_space``.
+
+        Scaled features' coefficients are multiplied by ``X_norm``; the
+        intercept is increased by ``X_mean . original_betas``.
+        ``original_betas`` is not modified.
+        """
+        betas = original_betas.copy()
+        betas[self.scaled_feature_indices] = betas[self.scaled_feature_indices] * self.X_norm[self.scaled_feature_indices]
+        beta0 = original_beta0 + self.X_mean.dot(original_betas)
+        return beta0, betas
+
+    def get_grad_at_coord(self, ExpyXB, betas_j, yX_j, j):
+        """
+        Return ``-<1 / (1 + ExpyXB), yX_j> + 2 * lambda2 * betas_j``.
+
+        ``j`` is not used in the computation; it is kept in the signature for
+        existing callers.
+        """
+        return -np.inner(np.reciprocal(1+ExpyXB), yX_j) + self.twoLambda2 * betas_j
+
+    def update_ExpyXB(self, ExpyXB, yX_j, diff_betas_j):
+        """Multiply ``ExpyXB`` in place by ``exp(yX_j * diff_betas_j)``."""
+        ExpyXB *= np.exp(yX_j * diff_betas_j)
+
+    def optimize_1step_at_coord(self, ExpyXB, betas, yX_j, j):
+        """
+        Take one gradient step on coordinate ``j``, clipped to
+        ``[lbs[j], ubs[j]]``.
+
+        ``betas[j]`` and ``ExpyXB`` are modified in place, so both must be
+        passed by reference (i.e. as mutable arrays); nothing is returned.
+        """
+        prev_betas_j = betas[j]
+        current_betas_j = prev_betas_j
+        grad_at_j = self.get_grad_at_coord(ExpyXB, current_betas_j, yX_j, j)
+        step_at_j = grad_at_j / self.Lipschitz
+        current_betas_j = prev_betas_j - step_at_j
+        # Scalar clip to the box constraints of coordinate j.
+        current_betas_j = max(self.lbs[j], min(self.ubs[j], current_betas_j))
+        diff_betas_j = current_betas_j - prev_betas_j
+        betas[j] = current_betas_j
+
+        self.update_ExpyXB(ExpyXB, yX_j, diff_betas_j)
+
+    def finetune_on_current_support(self, ExpyXB, beta0, betas, total_CD_steps=100):
+        """
+        Run coordinate descent restricted to the current support of ``betas``.
+
+        The support is every index with ``|betas| > 1e-9``, visited in order
+        of decreasing absolute gradient (computed once, up front). At most
+        ``total_CD_steps`` sweeps are made. Every tenth sweep the relative
+        change of the penalized loss is checked, and the loop stops once it
+        falls below 1e-8.
+
+        ``ExpyXB`` and ``betas`` are modified in place.
+
+        Returns
+        -------
+        tuple
+            ``(ExpyXB, beta0, betas)`` after fine-tuning.
+        """
+        support  = np.where(np.abs(betas) > 1e-9)[0]
+        grad_on_support = -self.yXT[support].dot(np.reciprocal(1+ExpyXB)) + self.twoLambda2 * betas[support]
+        abs_grad_on_support = np.abs(grad_on_support)
+        support = support[np.argsort(-abs_grad_on_support)]
+
+        loss_before = compute_logisticLoss_from_ExpyXB(ExpyXB) + self.lambda2 * betas[support].dot(betas[support])
+        for steps in range(total_CD_steps): # number of iterations for coordinate descent
+
+            if self.intercept:
+                grad_intercept = -np.reciprocal(1+ExpyXB).dot(self.y)
+                step_at_intercept = grad_intercept / (self.n * 0.25) # lipschitz constant is 0.25 at the intercept
+                beta0 = beta0 - step_at_intercept
+                ExpyXB *= np.exp(self.y * (-step_at_intercept))
+
+            for j in support:
+                self.optimize_1step_at_coord(ExpyXB, betas, self.yXT[j, :], j) # in-place modification on ExpyXB and betas
+
+            if steps % 10 == 0:
+                loss_after = compute_logisticLoss_from_ExpyXB(ExpyXB) + self.lambda2 * betas[support].dot(betas[support])
+                if abs(loss_before - loss_after)/loss_after < 1e-8:
+                    break
+                loss_before = loss_after
+
+        return ExpyXB, beta0, betas
+
+    def compute_yXB(self, beta0, betas):
+        """Return ``y * (beta0 + X_normalized @ betas)``."""
+        return self.y*(beta0 + np.dot(self.X_normalized, betas))
+