mihaiconstantin · mihaiconstantin · Sep 28, 2022 · Sep 28, 2022 · Sep 28, 2022 · Sep 28, 2022
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -32,7 +32,12 @@ Imports:
     ggplot2,
     rlang,
     mvtnorm,
-    patchwork
+    patchwork,
+    utils,
+    IsingSampler,
+    IsingFit,
+    igraph,
+    glmnet
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
@@ -43,6 +48,7 @@ Collate:
     'Model.R'
     'GgmModel.R'
     'Interpolation.R'
+    'IsingModel.R'
     'Spline.R'
     'StepThree.R'
     'StepTwo.R'

diff --git a/NAMESPACE b/NAMESPACE
@@ -10,6 +10,8 @@ S3method(summary,Validation)
 export(generate_model)
 export(powerly)
 export(validate)
+importFrom(IsingFit,IsingFit)
+importFrom(IsingSampler,IsingSampler)
 importFrom(R6,R6Class)
 importFrom(bootnet,genGGM)
 importFrom(bootnet,ggmGenerator)
@@ -40,6 +42,9 @@ importFrom(ggplot2,scale_y_continuous)
 importFrom(ggplot2,stat_ecdf)
 importFrom(ggplot2,theme)
 importFrom(ggplot2,theme_bw)
+importFrom(glmnet,glmnet)
+importFrom(igraph,erdos.renyi.game)
+importFrom(igraph,get.adjacency)
 importFrom(mvtnorm,rmvnorm)
 importFrom(osqp,osqp)
 importFrom(osqp,osqpSettings)
@@ -60,3 +65,4 @@ importFrom(rlang,.data)
 importFrom(rlang,.env)
 importFrom(splines2,bSpline)
 importFrom(splines2,iSpline)
+importFrom(utils,getFromNamespace)
diff --git a/NEWS.md b/NEWS.md
@@ -4,6 +4,15 @@ is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this
 project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## Unreleased
+## Added
+- Add *temporary* support for the Ising model using the approach described by
+  [van Borkulo et al. (2014)](https://www.nature.com/articles/srep05918). The
+  support is temporary in the sense that starting with `powerly` `v2.0.0`, an
+  API is introduced that developers can consume to hook into `powerly` for
+  conducting sample size computations for arbitrary models and performance
+  measures. Related to
+  [#10](https://github.com/mihaiconstantin/powerly/issues/10).
+
 ### Changed
 - Update `GitHub` action versions for all workflow files. Closes
   [#37](https://github.com/mihaiconstantin/powerly/issues/37).

diff --git a/R/IsingModel.R b/R/IsingModel.R
@@ -0,0 +1,234 @@
+#' @include Model.R
+
+# Extract the un-exported `C++` sampler from the `IsingSampler` namespace.
+IsingSamplerCpp <- utils::getFromNamespace("IsingSamplerCpp", "IsingSampler")
+
+# Ising model implementation.
+IsingModel <- R6::R6Class("IsingModel",
+    inherit = Model,
+
+    private = list(
+        .minimum_sample_size = 100,
+
+        .has_zero_variance = function(data) {
+            return(any(apply(data, 2, sd) == 0))
+        }
+    ),
+
+    public = list(
+        create = function(nodes, density, positive = .9, ...) {
+            # Compute the number of unique parameters.
+            number_parameters = (nodes * (nodes - 1)) / 2
+
+            # Determine the ratio of positive parameters.
+            ratio <- sample(c(-1, 1), number_parameters, TRUE, prob = c(1 - positive, positive))
+
+            # Generate random network structure.
+            network <- as.matrix(
+                igraph::get.adjacency(
+                    igraph::erdos.renyi.game(
+                        n = nodes,
+                        p.or.m = density,
+                        ...
+                    )
+                )
+            )
+
+            # Sample parameters with desired positive ratio.
+            parameters <- ratio * abs(rnorm(number_parameters, mean = 0, sd = 1))
+
+            # Map the parameters onto to the network structure.
+            network[upper.tri(network)] <- network[upper.tri(network)] * parameters
+            network[lower.tri(network)] <- t(network)[lower.tri(network)]
+
+            # Sample thresholds based on network structure.
+            diag(network) <- -abs(rnorm(nodes, colSums(network) / 2, abs(colSums(network) / 6)))
+
+            return(network)
+        },
+
+        generate = function(sample_size, true_parameters) {
+            # Prevent using a sample size smaller than 50.
+            if (sample_size < private$.minimum_sample_size) {
+                stop(paste0("Sample size must be greater than ", private$.minimum_sample_size, "."))
+            }
+
+            # Sample binary data.
+            data <- IsingSamplerCpp(
+                n = sample_size,
+                graph = true_parameters,
+                thresholds = diag(true_parameters),
+                beta = 1,
+                nIter = 100,
+                responses = c(0, 1),
+                exact = FALSE,
+                constrain = matrix(NA, sample_size, ncol(true_parameters))
+            )
+
+            # Inform user about the status of the data.
+            if (private$.has_zero_variance(data)) {
+                stop("Variable(s) with SD = 0 detected. Increase sample size.")
+            }
+
+            return(data)
+        },
+
+        # Adapted from: https://github.com/cvborkulo/IsingFit.
+        estimate = function(data, and = TRUE, gamma = 0.25) {
+            # Number of variables and observations.
+            n_var <- ncol(data)
+            n_obs <- nrow(data)
+
+            # Number of predictors (i.e., all other variables in the dataset).
+            n_pred <- n_var - 1
+
+            # Create storage for `glmnet` fit results.
+            intercepts <- vector(mode = "list", length = n_var)
+            betas <- vector(mode = "list", length = n_var)
+            lambdas <- vector(mode = "list", length = n_var)
+
+            # Number of lambdas.
+            n_lambdas <- rep(0, n_var)
+
+            # Fit for each variable in turn.
+            for (i in 1:n_var) {
+                # Fit regularized logistic regression.
+                fit <- glmnet::glmnet(data[, -i], data[, i], family = "binomial")
+
+                # Extract and store results.
+                intercepts[[i]] <- fit$a0
+                betas[[i]] <- as.matrix(fit$beta)
+                lambdas[[i]] <- fit$lambda
+
+                # Number of penalty values (i.e., tunning parameters) tried.
+                n_lambdas[i] <- length(lambdas[[i]])
+            }
+
+            # Maximum number of lambdas used by `glmnet`.
+            max_n_lambdas <- max(n_lambdas)
+
+            # The number of neighbors.
+            n_neighbors <- matrix(0, max_n_lambdas, n_var)
+
+            # Log-likelihood storage for each variable and penalty parameter.
+            log_likelihood <- array(0, dim = c(n_obs, max_n_lambdas, n_var))
+
+            # Lambda matrix.
+            lambda_mat <- matrix(NA, max_n_lambdas, n_var)
+
+            # Compute likelihood and the number of neighbors for each variable.
+            for (i in 1:n_var) {
+                # Compute number of neighbors.
+                n_neighbors[1:n_lambdas[i], i] <- colSums(betas[[i]] != 0)
+
+                # Calculate likelihood (i.e., equation 1 in van Borkulo et al., 2014).
+
+                # Extract predictors.
+                predictors <- data[, -i]
+
+                # Create storage for the sum of beta coefficients times predictors.
+                y <- matrix(0, n_obs, n_lambdas[i])
+
+                # For each predictor.
+                for (k in 1:n_pred) {
+                    # Extract the beta coefficients.
+                    b <- matrix(betas[[i]][k, ], n_obs, n_lambdas[i], TRUE)
+
+                    # Collect the sum.
+                    y <- y + b * predictors[, k]
+                }
+
+                # Add the intercept (i.e., tau) term to the sum.
+                y <- matrix(intercepts[[i]], n_obs, n_lambdas[i], TRUE) + y
+
+                # Handle `NA` due to uneven number of tuning parameters.
+                n_missing_lambdas <- max_n_lambdas - n_lambdas[i]
+
+                # If there are missing lambdas, append NAs.
+                if(n_missing_lambdas > 0) {
+                    y <- cbind(y, matrix(NA, n_obs, n_missing_lambdas))
+                }
+
+                # Calculate log-likelihood.
+                log_likelihood[, , i] <- log(exp(y * data[, i]) / (1 + exp(y)))
+
+                # Fill lambda matrix.
+                lambda_mat[, i] <- c(lambdas[[i]], rep(NA, max_n_lambdas - n_lambdas[i]))
+            }
+
+            # Sum log-likelihood (i.e., lambdas by variables).
+            sum_log_likelihood <- colSums(log_likelihood, 1, na.rm = FALSE)
+
+            # Mark any zero log-likelihood as missing (?).
+            sum_log_likelihood[sum_log_likelihood == 0] <- NA
+
+            # EBIC penalty part.
+            penalty <- n_neighbors * log(n_obs) + 2 * gamma * n_neighbors * log(n_pred)
+
+            # Compute the EBIC.
+            ebic <- -2 * sum_log_likelihood + penalty
+
+            # Get indices for optimal lambdas based on minimizing the EBIC.
+            lambda_optimal_indices <- apply(ebic, 2, which.min)
+
+            # Optimal thresholds (i.e., intercepts, or tau values).
+            thresholds_optimal <- vector(mode = "numeric", length = n_var)
+
+            # Optimal weights (i.e., slopes, or beta values).
+            asymmetric_weights_optimal <- matrix(NA, n_var, n_var)
+
+            # Store optimal values for each variable.
+            for (i in 1:n_var) {
+                # Thresholds.
+                thresholds_optimal[i] <- intercepts[[i]][lambda_optimal_indices[i]]
+
+                # Weights.
+                asymmetric_weights_optimal[i, -i] <- betas[[i]][, lambda_optimal_indices[i]]
+            }
+
+            # Apply rule to create symmetrical weights matrix for the undirected graph.
+            if (and) {
+                # Apply the "AND" rule (i.e., ensure pairs of betas in both directions are non-zero).
+                adjacency <- (asymmetric_weights_optimal != 0) * 1
+                weights <- adjacency * t(adjacency) * asymmetric_weights_optimal
+
+                # Take the mean.
+                weights_optimal <- (weights + t(weights)) / 2
+            } else {
+                # Apply the "OR" rule (i.e., take the mean of pairs of beta coefficients).
+                weights_optimal <- (asymmetric_weights_optimal + t(asymmetric_weights_optimal)) / 2
+            }
+
+            # Set the thresholds on the diagonal.
+            diag(weights_optimal) <- thresholds_optimal
+
+            return(weights_optimal)
+        },
+
+        evaluate = function(true_parameters, estimated_parameters, measure, ...) {
+            # Extract the true and estimated parameters from the weights matrices.
+            true <- true_parameters[upper.tri(true_parameters)]
+            esti <- estimated_parameters[upper.tri(estimated_parameters)]
+
+            # Check if model dimensions do not match.
+            if(length(true) != length(esti)) return(NA)
+
+            # Compute true/ false | positive/ negative rates.
+            tp <- sum(true != 0 & esti != 0)
+            fp <- sum(true == 0 & esti != 0)
+            tn <- sum(true == 0 & esti == 0)
+            fn <- sum(true != 0 & esti == 0)
+
+            # Compute correct measure.
+            return(
+                switch(measure,
+                    sen = tp / (tp + fn),
+                    spe = tn / (tn + fp),
+                    mcc = (tp * tn - fp * fn) / sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)),
+                    rho = cor(true, esti),
+                    stop(.__ERRORS__$not_developed)
+                )
+            )
+        }
+    )
+)
diff --git a/R/ModelFactory.R b/R/ModelFactory.R
@@ -6,6 +6,7 @@ ModelFactory <- R6::R6Class("ModelFactory",
             return(
                 switch(type,
                     ggm = GgmModel$new(),
+                    ising = IsingModel$new(),
                     stop(.__ERRORS__$not_developed)
                 )
             )

diff --git a/R/powerly-package.R b/R/powerly-package.R
@@ -34,6 +34,12 @@
 #' @importFrom quadprog solve.QP
 #' @importFrom splines2 iSpline bSpline
 #' @importFrom mvtnorm rmvnorm
+#' @importFrom glmnet glmnet
+#' @importFrom igraph get.adjacency
+#' @importFrom igraph erdos.renyi.game
+#' @importFrom IsingSampler IsingSampler
+#' @importFrom IsingFit IsingFit
+#' @importFrom utils getFromNamespace
 
 #' @include logo.R
 

diff --git a/man-roxygen/generate_model.R b/man-roxygen/generate_model.R
@@ -7,7 +7,7 @@
 #' [powerly::powerly()].
 #'
 #' @param type Character string representing the type of true model. Possible
-#' values are `"ggm"` (the default).
+#' values are `"ggm"` (the default) or `"ising"`.
 #'
 #' @param ... Required arguments used for the generation of the true model. See
 #' the **True Models** section of [powerly::powerly()] for the arguments

diff --git a/man-roxygen/powerly.R b/man-roxygen/powerly.R
@@ -20,7 +20,7 @@
 #' candidate range.
 #'
 #' @param model A character string representing the type of true model to find a
-#' sample size for. Possible values are `"ggm"` (the default).
+#' sample size for. Possible values are `"ggm"` (the default) or `"ising"`.
 #'
 #' @param ... Required arguments used for the generation of the true model. See
 #' the **True Models** section for the arguments required for each true model.
@@ -142,6 +142,15 @@
 #'     - Yin, J., and Li, H. (2011). A sparse conditional gaussian graphical model for analysis of genetical genomics data. *The annals of applied statistics*, 5(4), 2630.
 #' - supported performance measures: `sen`, `spe`, `mcc`, `rho`
 #'
+#' **Ising Model**
+#' - type: cross-sectional
+#' - symbol: `ising`
+#' - `...` arguments for generating true models:
+#'   - `nodes`: A single positive integer representing the number of nodes in the network (e.g., `10`).
+#'   - `density`: A single numerical value indicating the density of the network (e.g., `0.4`).
+#'   - `positive`: A single numerical value representing the proportion of positive edges in the network (e.g., `0.9` for 90% positive edges).
+#' - supported performance measures: `sen`, `spe`, `mcc`, `rho`
+#'
 #' @section Performance Measures:
 #'
 #' | **Performance Measure**  | **Symbol** | **Lower**   | **Upper**  |

diff --git a/man/generate_model.Rd b/man/generate_model.Rd
diff --git a/man/powerly.Rd b/man/powerly.Rd