plot ccdf compare with the doc

sistm · Mar 27, 2024 · 3c18301 · 3c18301
1 parent a15ceb0
commit 3c18301
Showing 1 changed file with 182 additions and 0 deletions.
diff --git a/R/plot_ccdf_cit_gsa.R b/R/plot_ccdf_cit_gsa.R
@@ -0,0 +1,182 @@
+
+#' Function for plotting the CDF of all the genes within a gene set
+#'
+#' @param ccdf an R object that come from the CCDF function of the package
+
+#' @param number_y an integer value indicating the number of y thresholds for the summary curves. Default is \code{length(Y)}.
+#'
+#' @return a \code{\link[ggplot2]{ggplot}} object
+#' 
+#' @export
+#'
+#' @examples
+#' TO DO 
+#' 
+#' plot_ccdf_cit_gsa(ccdf_gs2_X_space, number_y=40)
+
+
+
+
+plot_ccdf_cit_gsa <- function(ccdf, number_y=length(ccdf[[1]]$y)){ 
+
+
+  genes <- names(ccdf)
+
+  # Create data frame with all the ccdf ----
+  data_gene <- do.call(rbind, lapply(genes, function(name) {
+    x <- ccdf[[name]]  
+    df <- cbind.data.frame(x$cdf, x$ccdf, x$x, x$y)
+    colnames(df) <- c("cdf", "ccdf", "x", "y") 
+    df$Gene <- rep(name, nrow(df))
+    return(df)
+  })) 
+
+  sev <- unique(data_gene$x)
+
+  # Complete data if no values for some y ----
+  data_gene_sep <- split(data_gene, list(data_gene$Gene, data_gene$x)) # separate the table by genes and severity
+
+  new_data_gene_sep <- lapply(data_gene_sep, function(df) {
+    # max value of current y
+    max_y_cur <- max(unique(df$y)) 
+    # max value of total y
+    max_y_all <- max(unique(data_gene$y))
+
+    # all the y that are between max current y value and max total y value
+    y_differ_data <- setdiff(data_gene$y, df$y) # y values that are not in current data 
+    miss_y <- subset(y_differ_data, y_differ_data >= max_y_cur & y_differ_data <= max_y_all) # y values that are not in current data (previous) and between the 2 max
+    n_miss_y <- length(miss_y)
+
+    # add the new y in the data + affect the max value of the ccdf to these y
+    # WARNING : ccdf trié pas y !
+    if (n_miss_y != 0){
+      df <- data.frame(cbind(rep(max(df$cdf),n_miss_y), 
+                             rep(max(df$ccdf),n_miss_y), 
+                             rep(df$x[1],n_miss_y), miss_y, 
+                             rep(df$Gene[1],n_miss_y)))
+      colnames(df) <- c("cdf","ccdf","x","y","Gene")
+      level <- 
+      label <- 
+      df$x <- factor(df$x, levels = rep(1:length(levels(data_gene$x))) , labels = levels(data_gene$x) ) 
+    } else{
+      df <- 0
+    }
+    return(df)
+  })
+
+  # Combine the data set
+  final_data <- do.call(rbind,lapply(genes, function(name){
+    # if some genes doesn't have missing y values
+    no_zero <- do.call(rbind,Filter(function(x) {!is.numeric(x) || !all(x == 0)}, new_data_gene_sep)) 
+    # new dataset completed
+    df <- rbind(data_gene[data_gene$Gene==name,], no_zero[no_zero$Gene==name,])
+    df$y <- as.numeric(df$y)
+    df$cdf <- as.numeric(df$cdf)
+    df$ccdf <- as.numeric(df$ccdf)
+    return(df)
+  }))
+
+  # Thresholds ----
+  Y_after <- final_data$y
+  # y value for each thresholds  
+  y_after <- seq(from = ifelse(length(which(Y_after==0))==0, min(Y_after), min(Y_after[-which(Y_after==0)])), 
+                 to = max(Y_after[-which.max(as.matrix(Y_after))]), length.out = number_y) 
+  #p_after <- length(y_after)
+  # index thresholds 
+  #index_jumps_after <- sapply(y_after[-p_after], function(i){sum(Y_after <= i)})  
+
+  seuils <- c(0,y_after)
+
+
+  # Compute the median ----
+  med <- lapply(sev, function(x){
+    med <- rep(NA, number_y)
+    filtre_x <- final_data$x == x
+    filtre_row_i0 <-  final_data$y >= seuils[1] & final_data$y < seuils[1+1]
+    indices0 <- which(filtre_x & filtre_row_i0)
+    med[1] <- median(final_data$ccdf[indices0])
+
+    for (i in 2:length(seuils)){
+
+      filtre_row_i <-  final_data$y >= seuils[i] & final_data$y < seuils[i+1]
+      indices <- which(filtre_x & filtre_row_i)
+
+      med[i] <- median(final_data$ccdf[indices])
+
+      ref_value <- med[i-1]
+      range_value <- final_data[filtre_x & filtre_row_i, ]$ccdf
+
+      closest_index <- min(which(range_value > ref_value))
+      closest_value <- range_value[closest_index]
+
+      if (med[i] < med[i-1] & closest_index != Inf ){ # force monotony when we can, if no values > previous median leave the current median 
+        med[i] <- closest_value
+      }
+    }
+    med <- data.frame(med[1:number_y])
+    med$y <- y_after 
+    med$x <- x 
+    names(med)[1] <- "ccdf"
+
+    return(med)
+  })
+
+  # Replace values with the max : if some values are below the max after it 
+  replace_max <- lapply(med, function(m){
+    ind_max <- which.max(m$ccdf)
+    m[ind_max:nrow(m),]$ccdf <- max(m$ccdf)
+    return(m)
+  })
+
+  # Step function : add the ccdf to the y values that are not a thresholds
+  step_function <- lapply(replace_max, function(df){
+    # y values that are not in the data of the thresholds
+    new_y <- data.frame(setdiff(final_data$y, df$y)) ; colnames(new_y) <- "y" 
+    # separate the values between the intervals of the thresholds
+    intervalle_indices <- findInterval(new_y$y, df$y, left.open = TRUE) 
+    indices_int <- intervalle_indices+1 
+
+    new_y$ccdf <- df$ccdf[indices_int]
+    new_y$x <- unique(df$x)
+
+    return(new_y)
+  })
+
+
+  # Data final -----
+  data_med <- rbind(do.call(rbind,step_function),do.call(rbind,med))
+
+
+
+  # Data modified for the legend
+  data_med$Gene <- "all"
+  data_med$Legend <- "Gene set summary"  
+
+  data_gene <- data_gene[-1]
+  data_gene$Legend <- "Genes" 
+
+
+  comb_data <- rbind(data_gene,data_med)
+
+
+  # Plot -----
+  ggplot(data = comb_data, aes(x = y, color=x, y = ccdf,linetype = Legend, size=Legend)) +
+    geom_line(aes(group = interaction(Gene,x))) + 
+    scale_size_manual(values = c(0.8,0.2)) + # change taille lignes selon variable dans aes(size) 
+    #scale_color_manual(values = c("ITU" = "brown3", "NITU" = "darkgoldenrod1", "Mild"="chartreuse3")) + 
+    labs(x = "Gene expression") +
+    theme_minimal() 
+
+}
+
+
+
+
+
+
+
+
+
+
+
+