diff --git a/DESCRIPTION b/DESCRIPTION index dcc3d31..46df5ec 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -32,7 +32,12 @@ Imports: ggplot2, rlang, mvtnorm, - patchwork + patchwork, + utils, + IsingSampler, + IsingFit, + igraph, + glmnet Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) @@ -43,6 +48,7 @@ Collate: 'Model.R' 'GgmModel.R' 'Interpolation.R' + 'IsingModel.R' 'Spline.R' 'StepThree.R' 'StepTwo.R' diff --git a/NAMESPACE b/NAMESPACE index ac733c7..c496e7f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,6 +10,8 @@ S3method(summary,Validation) export(generate_model) export(powerly) export(validate) +importFrom(IsingFit,IsingFit) +importFrom(IsingSampler,IsingSampler) importFrom(R6,R6Class) importFrom(bootnet,genGGM) importFrom(bootnet,ggmGenerator) @@ -40,6 +42,9 @@ importFrom(ggplot2,scale_y_continuous) importFrom(ggplot2,stat_ecdf) importFrom(ggplot2,theme) importFrom(ggplot2,theme_bw) +importFrom(glmnet,glmnet) +importFrom(igraph,erdos.renyi.game) +importFrom(igraph,get.adjacency) importFrom(mvtnorm,rmvnorm) importFrom(osqp,osqp) importFrom(osqp,osqpSettings) @@ -60,3 +65,4 @@ importFrom(rlang,.data) importFrom(rlang,.env) importFrom(splines2,bSpline) importFrom(splines2,iSpline) +importFrom(utils,getFromNamespace) diff --git a/NEWS.md b/NEWS.md index d192b7c..ec70360 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,15 @@ is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## Unreleased +## Added +- Add *temporary* support for the Ising model using the approach described by + [van Borkulo et al. (2014)](https://www.nature.com/articles/srep05918). The + support is temporary in the sense that starting with `powerly` `v2.0.0`, an + API is introduced that developers can consume to hook into `powerly` for + conducting sample size computations for arbitrary models and performance + measures. Related to + [#10](https://github.com/mihaiconstantin/powerly/issues/10). + ### Changed - Update `GitHub` action versions for all workflow files. Closes [#37](https://github.com/mihaiconstantin/powerly/issues/37). diff --git a/R/IsingModel.R b/R/IsingModel.R new file mode 100644 index 0000000..0d8bbe0 --- /dev/null +++ b/R/IsingModel.R @@ -0,0 +1,234 @@ +#' @include Model.R + +# Extract the un-exported `C++` sampler from the `IsingSampler` namespace. +IsingSamplerCpp <- utils::getFromNamespace("IsingSamplerCpp", "IsingSampler") + +# Ising model implementation. +IsingModel <- R6::R6Class("IsingModel", + inherit = Model, + + private = list( + .minimum_sample_size = 100, + + .has_zero_variance = function(data) { + return(any(apply(data, 2, sd) == 0)) + } + ), + + public = list( + create = function(nodes, density, positive = .9, ...) { + # Compute the number of unique parameters. + number_parameters = (nodes * (nodes - 1)) / 2 + + # Determine the ratio of positive parameters. + ratio <- sample(c(-1, 1), number_parameters, TRUE, prob = c(1 - positive, positive)) + + # Generate random network structure. + network <- as.matrix( + igraph::get.adjacency( + igraph::erdos.renyi.game( + n = nodes, + p.or.m = density, + ... + ) + ) + ) + + # Sample parameters with desired positive ratio. + parameters <- ratio * abs(rnorm(number_parameters, mean = 0, sd = 1)) + + # Map the parameters onto to the network structure. + network[upper.tri(network)] <- network[upper.tri(network)] * parameters + network[lower.tri(network)] <- t(network)[lower.tri(network)] + + # Sample thresholds based on network structure. + diag(network) <- -abs(rnorm(nodes, colSums(network) / 2, abs(colSums(network) / 6))) + + return(network) + }, + + generate = function(sample_size, true_parameters) { + # Prevent using a sample size smaller than 50. + if (sample_size < private$.minimum_sample_size) { + stop(paste0("Sample size must be greater than ", private$.minimum_sample_size, ".")) + } + + # Sample binary data. + data <- IsingSamplerCpp( + n = sample_size, + graph = true_parameters, + thresholds = diag(true_parameters), + beta = 1, + nIter = 100, + responses = c(0, 1), + exact = FALSE, + constrain = matrix(NA, sample_size, ncol(true_parameters)) + ) + + # Inform user about the status of the data. + if (private$.has_zero_variance(data)) { + stop("Variable(s) with SD = 0 detected. Increase sample size.") + } + + return(data) + }, + + # Adapted from: https://github.com/cvborkulo/IsingFit. + estimate = function(data, and = TRUE, gamma = 0.25) { + # Number of variables and observations. + n_var <- ncol(data) + n_obs <- nrow(data) + + # Number of predictors (i.e., all other variables in the dataset). + n_pred <- n_var - 1 + + # Create storage for `glmnet` fit results. + intercepts <- vector(mode = "list", length = n_var) + betas <- vector(mode = "list", length = n_var) + lambdas <- vector(mode = "list", length = n_var) + + # Number of lambdas. + n_lambdas <- rep(0, n_var) + + # Fit for each variable in turn. + for (i in 1:n_var) { + # Fit regularized logistic regression. + fit <- glmnet::glmnet(data[, -i], data[, i], family = "binomial") + + # Extract and store results. + intercepts[[i]] <- fit$a0 + betas[[i]] <- as.matrix(fit$beta) + lambdas[[i]] <- fit$lambda + + # Number of penalty values (i.e., tunning parameters) tried. + n_lambdas[i] <- length(lambdas[[i]]) + } + + # Maximum number of lambdas used by `glmnet`. + max_n_lambdas <- max(n_lambdas) + + # The number of neighbors. + n_neighbors <- matrix(0, max_n_lambdas, n_var) + + # Log-likelihood storage for each variable and penalty parameter. + log_likelihood <- array(0, dim = c(n_obs, max_n_lambdas, n_var)) + + # Lambda matrix. + lambda_mat <- matrix(NA, max_n_lambdas, n_var) + + # Compute likelihood and the number of neighbors for each variable. + for (i in 1:n_var) { + # Compute number of neighbors. + n_neighbors[1:n_lambdas[i], i] <- colSums(betas[[i]] != 0) + + # Calculate likelihood (i.e., equation 1 in van Borkulo et al., 2014). + + # Extract predictors. + predictors <- data[, -i] + + # Create storage for the sum of beta coefficients times predictors. + y <- matrix(0, n_obs, n_lambdas[i]) + + # For each predictor. + for (k in 1:n_pred) { + # Extract the beta coefficients. + b <- matrix(betas[[i]][k, ], n_obs, n_lambdas[i], TRUE) + + # Collect the sum. + y <- y + b * predictors[, k] + } + + # Add the intercept (i.e., tau) term to the sum. + y <- matrix(intercepts[[i]], n_obs, n_lambdas[i], TRUE) + y + + # Handle `NA` due to uneven number of tuning parameters. + n_missing_lambdas <- max_n_lambdas - n_lambdas[i] + + # If there are missing lambdas, append NAs. + if(n_missing_lambdas > 0) { + y <- cbind(y, matrix(NA, n_obs, n_missing_lambdas)) + } + + # Calculate log-likelihood. + log_likelihood[, , i] <- log(exp(y * data[, i]) / (1 + exp(y))) + + # Fill lambda matrix. + lambda_mat[, i] <- c(lambdas[[i]], rep(NA, max_n_lambdas - n_lambdas[i])) + } + + # Sum log-likelihood (i.e., lambdas by variables). + sum_log_likelihood <- colSums(log_likelihood, 1, na.rm = FALSE) + + # Mark any zero log-likelihood as missing (?). + sum_log_likelihood[sum_log_likelihood == 0] <- NA + + # EBIC penalty part. + penalty <- n_neighbors * log(n_obs) + 2 * gamma * n_neighbors * log(n_pred) + + # Compute the EBIC. + ebic <- -2 * sum_log_likelihood + penalty + + # Get indices for optimal lambdas based on minimizing the EBIC. + lambda_optimal_indices <- apply(ebic, 2, which.min) + + # Optimal thresholds (i.e., intercepts, or tau values). + thresholds_optimal <- vector(mode = "numeric", length = n_var) + + # Optimal weights (i.e., slopes, or beta values). + asymmetric_weights_optimal <- matrix(NA, n_var, n_var) + + # Store optimal values for each variable. + for (i in 1:n_var) { + # Thresholds. + thresholds_optimal[i] <- intercepts[[i]][lambda_optimal_indices[i]] + + # Weights. + asymmetric_weights_optimal[i, -i] <- betas[[i]][, lambda_optimal_indices[i]] + } + + # Apply rule to create symmetrical weights matrix for the undirected graph. + if (and) { + # Apply the "AND" rule (i.e., ensure pairs of betas in both directions are non-zero). + adjacency <- (asymmetric_weights_optimal != 0) * 1 + weights <- adjacency * t(adjacency) * asymmetric_weights_optimal + + # Take the mean. + weights_optimal <- (weights + t(weights)) / 2 + } else { + # Apply the "OR" rule (i.e., take the mean of pairs of beta coefficients). + weights_optimal <- (asymmetric_weights_optimal + t(asymmetric_weights_optimal)) / 2 + } + + # Set the thresholds on the diagonal. + diag(weights_optimal) <- thresholds_optimal + + return(weights_optimal) + }, + + evaluate = function(true_parameters, estimated_parameters, measure, ...) { + # Extract the true and estimated parameters from the weights matrices. + true <- true_parameters[upper.tri(true_parameters)] + esti <- estimated_parameters[upper.tri(estimated_parameters)] + + # Check if model dimensions do not match. + if(length(true) != length(esti)) return(NA) + + # Compute true/ false | positive/ negative rates. + tp <- sum(true != 0 & esti != 0) + fp <- sum(true == 0 & esti != 0) + tn <- sum(true == 0 & esti == 0) + fn <- sum(true != 0 & esti == 0) + + # Compute correct measure. + return( + switch(measure, + sen = tp / (tp + fn), + spe = tn / (tn + fp), + mcc = (tp * tn - fp * fn) / sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)), + rho = cor(true, esti), + stop(.__ERRORS__$not_developed) + ) + ) + } + ) +) diff --git a/R/ModelFactory.R b/R/ModelFactory.R index 82023b6..81eeaaf 100644 --- a/R/ModelFactory.R +++ b/R/ModelFactory.R @@ -6,6 +6,7 @@ ModelFactory <- R6::R6Class("ModelFactory", return( switch(type, ggm = GgmModel$new(), + ising = IsingModel$new(), stop(.__ERRORS__$not_developed) ) ) diff --git a/R/powerly-package.R b/R/powerly-package.R index 7f87d5e..bc3d175 100644 --- a/R/powerly-package.R +++ b/R/powerly-package.R @@ -34,6 +34,12 @@ #' @importFrom quadprog solve.QP #' @importFrom splines2 iSpline bSpline #' @importFrom mvtnorm rmvnorm +#' @importFrom glmnet glmnet +#' @importFrom igraph get.adjacency +#' @importFrom igraph erdos.renyi.game +#' @importFrom IsingSampler IsingSampler +#' @importFrom IsingFit IsingFit +#' @importFrom utils getFromNamespace #' @include logo.R diff --git a/man-roxygen/generate_model.R b/man-roxygen/generate_model.R index 9546af0..0523b11 100644 --- a/man-roxygen/generate_model.R +++ b/man-roxygen/generate_model.R @@ -7,7 +7,7 @@ #' [powerly::powerly()]. #' #' @param type Character string representing the type of true model. Possible -#' values are `"ggm"` (the default). +#' values are `"ggm"` (the default) or `"ising"`. #' #' @param ... Required arguments used for the generation of the true model. See #' the **True Models** section of [powerly::powerly()] for the arguments diff --git a/man-roxygen/powerly.R b/man-roxygen/powerly.R index 05e8685..439c9a9 100644 --- a/man-roxygen/powerly.R +++ b/man-roxygen/powerly.R @@ -20,7 +20,7 @@ #' candidate range. #' #' @param model A character string representing the type of true model to find a -#' sample size for. Possible values are `"ggm"` (the default). +#' sample size for. Possible values are `"ggm"` (the default) or `"ising"`. #' #' @param ... Required arguments used for the generation of the true model. See #' the **True Models** section for the arguments required for each true model. @@ -142,6 +142,15 @@ #' - Yin, J., and Li, H. (2011). A sparse conditional gaussian graphical model for analysis of genetical genomics data. *The annals of applied statistics*, 5(4), 2630. #' - supported performance measures: `sen`, `spe`, `mcc`, `rho` #' +#' **Ising Model** +#' - type: cross-sectional +#' - symbol: `ising` +#' - `...` arguments for generating true models: +#' - `nodes`: A single positive integer representing the number of nodes in the network (e.g., `10`). +#' - `density`: A single numerical value indicating the density of the network (e.g., `0.4`). +#' - `positive`: A single numerical value representing the proportion of positive edges in the network (e.g., `0.9` for 90% positive edges). +#' - supported performance measures: `sen`, `spe`, `mcc`, `rho` +#' #' @section Performance Measures: #' #' | **Performance Measure** | **Symbol** | **Lower** | **Upper** | diff --git a/man/generate_model.Rd b/man/generate_model.Rd index db047ad..08614f5 100644 --- a/man/generate_model.Rd +++ b/man/generate_model.Rd @@ -8,7 +8,7 @@ generate_model(type, ...) } \arguments{ \item{type}{Character string representing the type of true model. Possible -values are \code{"ggm"} (the default).} +values are \code{"ggm"} (the default) or \code{"ising"}.} \item{...}{Required arguments used for the generation of the true model. See the \strong{True Models} section of \code{\link[=powerly]{powerly()}} for the arguments diff --git a/man/powerly.Rd b/man/powerly.Rd index 8625c2f..1a9fddb 100644 --- a/man/powerly.Rd +++ b/man/powerly.Rd @@ -46,7 +46,7 @@ Monte Carlo replications to perform for each sample size selected from the candidate range.} \item{model}{A character string representing the type of true model to find a -sample size for. Possible values are \code{"ggm"} (the default).} +sample size for. Possible values are \code{"ggm"} (the default) or \code{"ising"}.} \item{...}{Required arguments used for the generation of the true model. See the \strong{True Models} section for the arguments required for each true model.} @@ -205,6 +205,19 @@ search shrinks below a specified value. } \item supported performance measures: \code{sen}, \code{spe}, \code{mcc}, \code{rho} } + +\strong{Ising Model} +\itemize{ +\item type: cross-sectional +\item symbol: \code{ising} +\item \code{...} arguments for generating true models: +\itemize{ +\item \code{nodes}: A single positive integer representing the number of nodes in the network (e.g., \code{10}). +\item \code{density}: A single numerical value indicating the density of the network (e.g., \code{0.4}). +\item \code{positive}: A single numerical value representing the proportion of positive edges in the network (e.g., \code{0.9} for 90\% positive edges). +} +\item supported performance measures: \code{sen}, \code{spe}, \code{mcc}, \code{rho} +} } \section{Performance Measures}{ diff --git a/tests/testthat/test-ising.R b/tests/testthat/test-ising.R new file mode 100644 index 0000000..b07d18f --- /dev/null +++ b/tests/testthat/test-ising.R @@ -0,0 +1,173 @@ +# Test 'IsingModel' class. + +test_that("'IsingModel' generates data correctly", { + # Sample size. + sample_size <- sample(500:2000, 1) + + # Nodes. + nodes <- sample(10:20, 1) + + # Density. + density <- sample(seq(.2, .5, .1), 1) + + # Create plain Ising model object. + ising <- IsingModel$new() + + # Create true model parameters. + true <- ising$create(nodes = nodes, density = density) + + # Generate data. + data <- ising$generate(sample_size = sample_size, true_parameters = true) + + # The data dimensions should match the number of nodes and the sample size. + expect_equal(ncol(data), ncol(true)) + expect_equal(nrow(data), sample_size) + + # The range of the data should match the expected range. + expect_equal(min(data), 0) + expect_equal(max(data), 1) + + # Sample sizes smaller than 50 are not permitted. + expect_error( + ising$generate(sample_size = 49, true_parameters = true), + "Sample size must be greater than 100." + ) +}) + + +test_that("'IsingModel' generated data matches 'IsingSampler' data", { + # Sample size. + sample_size <- sample(500:2000, 1) + + # Nodes. + nodes <- sample(10:20, 1) + + # Density. + density <- sample(seq(.2, .5, .1), 1) + + # Create plain Ising model object. + ising <- IsingModel$new() + + # Create true model parameters. + true <- ising$create(nodes = nodes, density = density) + + # Create seed for the comparison with bootnet. + seed <- sample(1:1e5, 1) + + # Generate data via 'IsingModel'. + set.seed(seed) + ising_model_data <- ising$generate(sample_size = sample_size, true_parameters = true) + + # Extract thresholds from the parameters matrix. + thresholds <- diag(true) + + # Set diagonal to zero for the requirements of the 'IsingSampler'. + diag(true) <- 0 + + # Generate data via 'IsingSampler'. + set.seed(seed) + ising_sampler_data <- IsingSampler::IsingSampler(n = sample_size, graph = true, thresholds = thresholds, method = "MH") + + # The data should be the same as 'IsingSampler' generated data. + expect_equal(ising_model_data, ising_sampler_data) +}) + + +test_that("'IsingModel' estimates model parameters correctly", { + # Sample size. + sample_size <- sample(500:2000, 1) + + # Nodes. + nodes <- sample(10:20, 1) + + # Density. + density <- sample(seq(.2, .5, .1), 1) + + # Create plain Ising model object. + ising <- IsingModel$new() + + # Create true parameters. + true <- ising$create(nodes, density) + + # Generate data. + data <- ising$generate(sample_size = sample_size, true_parameters = true) + + # Estimate via 'IsingFit'. + network_ising_fit <- IsingFit::IsingFit(data, plot = FALSE, progressbar = FALSE) + + # Estimate via 'IsingModel'. + network_ising_model <- ising$estimate(data) + + # Extract thresholds from model parameters matrix. + thresholds <- diag(network_ising_model) + + # Set diagonal of weights matrix to zero. + diag(network_ising_model) <- 0 + + # The weights should be identical across both methods. + expect_equal(network_ising_fit$weiadj, network_ising_model, ignore_attr = TRUE) + + # The thresholds should be identical across both methods. + expect_equal(network_ising_fit$thresholds, thresholds, ignore_attr = TRUE) + + # Make one variable invariant. + data[, 1] <- data[1, 1] + + # Expect the estimation to throw an error due to invariant variables. + expect_error(ising$estimate(data)) +}) + + +test_that("'IsingModel' computes the correct measure", { + # Sample size. + sample_size <- sample(500:2000, 1) + + # Nodes. + nodes <- sample(10:20, 1) + + # Density. + density <- sample(seq(.2, .5, .1), 1) + + # Create plain Ising model object. + ising <- IsingModel$new() + + # Create true parameters. + true <- ising$create(nodes, density) + + # Generate data. + data <- ising$generate(sample_size = sample_size, true_parameters = true) + + # Estimate parameters. + estimated <- ising$estimate(data) + + # Expect the computed measures to have the correct value. + expect_equal(ising$evaluate(true, estimated, measure = "sen"), compute_measure(true, estimated, "sen")) + expect_equal(ising$evaluate(true, estimated, measure = "spe"), compute_measure(true, estimated, "spe")) + + # For unknown measures and error should be thrown. + expect_error(ising$evaluate(true, estimated, measure = "unknown"), .__ERRORS__$not_developed) +}) + + +test_that("'IsingModel' does not evaluate models of different dimensions", { + # Create plain Ising model object. + ising <- IsingModel$new() + + # Create true parameters. + true <- ising$create(10, .3) + + # Generate data. + data <- ising$generate(sample_size = 500, true_parameters = true) + + # Estimate parameters. + estimated <- ising$estimate(data) + + # Drop one variable from the estimated parameters. + estimated <- estimated[, -1] + + # The evaluation should return NA because the model dimensions do not match. + expect_equal(ising$evaluate(true, estimated, measure = "sen"), NA) + expect_equal(ising$evaluate(true, estimated, measure = "spe"), NA) + expect_equal(ising$evaluate(true, estimated, measure = "mcc"), NA) + expect_equal(ising$evaluate(true, estimated, measure = "rho"), NA) +})