diff --git a/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/no_scalar/generate_sim_data.R b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/no_scalar/generate_sim_data.R new file mode 100644 index 0000000..d6e8c2d --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/no_scalar/generate_sim_data.R @@ -0,0 +1,48 @@ +library(sva) +library(huge) +library(QUIC) +library(parallel) +#simulate data + +lambda=seq(0,1,length.out=30)[-1] +set.seed(101) +## generate simulated scale free network +dat <- huge.generator(n = 350, d = 5000, graph = "scale-free", v = NULL, u = NULL, + g = NULL, prob = NULL, vis = F, verbose = TRUE) + +sim.dat <- dat$data +sim.theta <- dat$theta +n <- nrow(sim.dat) +p <- ncol(sim.dat) +################### confounded +## confounded data + +sim.dat.scaled <- scale(sim.dat) +sim.confounded=sim.dat.scaled +set.seed(101) +grp=rnorm(n) +scalar_constant = 1 +for(i in 1000:2000){ + sim.confounded[,i] = sim.confounded[,i] + scalar_constant*grp +} + +set.seed(102) +grp=rnorm(n) + +for(i in 1:500){ + sim.confounded[,i] = sim.confounded[,i] + scalar_constant*grp +} + +### correct data +mod=matrix(1,nrow=dim(sim.confounded)[1],ncol=1) +colnames(mod)="Intercept" +nsv=num.sv(t(sim.confounded),mod, method = "be") +print(paste("the number of PCs estimated to be removed:", nsv)) +ss=svd(scale(sim.confounded)) +grp.est=ss$u[,1:nsv] +sim.corrected=lm(sim.confounded~grp.est)$residuals + +saveRDS(sim.corrected, file = "simulated_corrected.Rds") +saveRDS(sim.confounded, file = "simulated_confounded.Rds") +saveRDS(sim.dat, file = "simulated_data.Rds") +saveRDS(sim.theta, file = "simulated_network.Rds") diff --git a/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/no_scalar/infer_nets.R b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/no_scalar/infer_nets.R new file mode 100644 index 0000000..3db93e3 --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/no_scalar/infer_nets.R @@ -0,0 +1,43 @@ +library(sva) +library(huge) +library(QUIC) +library(parallel) +library(Matrix) + +######## functions +q_normalize <- function(dat){ + n = nrow(dat) + p = ncol(dat) + rank.dat = dat # matrix for ranking + for (i in 1:p){ + rank.dat[,i] = rank(dat[,i]) + } + U = rank.dat/(n+1) + norm.dat = qnorm(U) + norm.dat +} +################### + + +lambda=seq(0.1,1,length.out=20) + +## read input +inputargs <- commandArgs(TRUE) +fn <- inputargs[1] +#sn <- inputargs[2] +## load data +sim.dat <- readRDS(paste("simulated_", fn, ".Rds", sep = "")) + +## build networks +sim.cov <- cov(scale(sim.dat)) + +sim.net <- mclapply(lambda, function(graphlasso, thisdat){ + thisnet <- QUIC::QUIC(S = thisdat, rho = graphlasso)$X + thisnet[thisnet!=0] <- 1 + thisnet <- Matrix(thisnet, sparse = T) + thisnet +}, sim.cov, mc.cores = 29) + + +saveRDS(sim.net, file = paste("networks_", fn, ".Rds", sep = "")) + diff --git a/publication_rmd/simulation_scale-free_matched_gtex/run_sim.sh b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/no_scalar/run_sim.sh similarity index 52% rename from publication_rmd/simulation_scale-free_matched_gtex/run_sim.sh rename to publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/no_scalar/run_sim.sh index 15512b7..b43d6e0 100644 --- a/publication_rmd/simulation_scale-free_matched_gtex/run_sim.sh +++ b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/no_scalar/run_sim.sh @@ -4,12 +4,12 @@ #SBATCH --mail-user=pparsan1@jhu.edu #SBATCH --time=72:0:0 #SBATCH --partition=lrgmem -#SBATCH --ntasks-per-node=25 +#SBATCH --ntasks-per-node=10 #SBATCH --mem=70G module load gcc/5.5.0 module load R/3.4.0 -cd /work-zfs/abattle4/parsana/temp/scale-free_matched_sim/ -Rscript sim.R >sim_log +cd /work-zfs/abattle4/parsana/networks_correction/publication_rmd/simulation_scale-free_matched_gtex/no_scalar +Rscript infer_nets.R $1 >log/$1.log diff --git a/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/no_scalar/submit_jobs.sh b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/no_scalar/submit_jobs.sh new file mode 100644 index 0000000..6ec2f2d --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/no_scalar/submit_jobs.sh @@ -0,0 +1,3 @@ +sbatch run_sim.sh data +sbatch run_sim.sh confounded +sbatch run_sim.sh corrected diff --git a/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/generate_data.sh b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/generate_data.sh new file mode 100644 index 0000000..2878ffa --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/generate_data.sh @@ -0,0 +1,15 @@ +#!/bin/bash -l +#SBATCH +#SBATCH --mail-type=END +#SBATCH --mail-user=pparsan1@jhu.edu +#SBATCH --time=1:0:0 +#SBATCH --partition=lrgmem +#SBATCH --mem=10G + + +module load gcc/5.5.0 +module load R/3.4.0 + +cd /work-zfs/abattle4/parsana/networks_correction/publication_rmd/simulation_scale-free_matched_gtex/overlap_confound/ +Rscript generate_sim_data.R >log/generate_sim_data.log +~ diff --git a/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/generate_sim_data.R b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/generate_sim_data.R new file mode 100644 index 0000000..a3597c6 --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/generate_sim_data.R @@ -0,0 +1,52 @@ +library(sva) +library(huge) +library(QUIC) +library(parallel) +#simulate data + +set.seed(101) +## generate simulated scale free network +dat <- huge.generator(n = 350, d = 5000, graph = "scale-free", v = NULL, u = NULL, + g = NULL, prob = NULL, vis = F, verbose = TRUE) + +sim.dat <- dat$data +sim.theta <- dat$theta +n <- nrow(sim.dat) +p <- ncol(sim.dat) +################### confounded +## confounded data + +sim.dat.scaled <- scale(sim.dat) +sim.confounded=sim.dat.scaled + +## set seed +set.seed(101) +seed_samples <- sample(100:200, 5) + +for(eachseed in seed_samples){ + set.seed(eachseed) + thisgrp = rnorm(n) + thisprob = runif(500, 0,1) + thisgenes = sample(1000:5000, 500) + print(thisgenes) + gene_prob_counter = 0 + for(i in thisgenes){ + gene_prob_counter = gene_prob_counter + 1 + sim.confounded[,i] = sim.confounded[,i] + (thisgrp*thisprob[gene_prob_counter]) + } +} + + +### correct data +mod=matrix(1,nrow=dim(sim.confounded)[1],ncol=1) +colnames(mod)="Intercept" +nsv=num.sv(t(sim.confounded),mod, method = "be") +print(paste("the number of PCs estimated to be removed:", nsv)) +ss=svd(scale(sim.confounded)) +grp.est=ss$u[,1:nsv] +sim.corrected=lm(sim.confounded~grp.est)$residuals + +saveRDS(sim.corrected, file = "simulated_corrected.Rds") +saveRDS(sim.confounded, file = "simulated_confounded.Rds") +saveRDS(sim.dat, file = "simulated_data.Rds") +saveRDS(sim.theta, file = "simulated_network.Rds") diff --git a/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/infer_nets.R b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/infer_nets.R new file mode 100644 index 0000000..3db93e3 --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/infer_nets.R @@ -0,0 +1,43 @@ +library(sva) +library(huge) +library(QUIC) +library(parallel) +library(Matrix) + +######## functions +q_normalize <- function(dat){ + n = nrow(dat) + p = ncol(dat) + rank.dat = dat # matrix for ranking + for (i in 1:p){ + rank.dat[,i] = rank(dat[,i]) + } + U = rank.dat/(n+1) + norm.dat = qnorm(U) + norm.dat +} +################### + + +lambda=seq(0.1,1,length.out=20) + +## read input +inputargs <- commandArgs(TRUE) +fn <- inputargs[1] +#sn <- inputargs[2] +## load data +sim.dat <- readRDS(paste("simulated_", fn, ".Rds", sep = "")) + +## build networks +sim.cov <- cov(scale(sim.dat)) + +sim.net <- mclapply(lambda, function(graphlasso, thisdat){ + thisnet <- QUIC::QUIC(S = thisdat, rho = graphlasso)$X + thisnet[thisnet!=0] <- 1 + thisnet <- Matrix(thisnet, sparse = T) + thisnet +}, sim.cov, mc.cores = 29) + + +saveRDS(sim.net, file = paste("networks_", fn, ".Rds", sep = "")) + diff --git a/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/run_sim.sh b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/run_sim.sh new file mode 100644 index 0000000..4c88982 --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/run_sim.sh @@ -0,0 +1,14 @@ +#!/bin/bash -l +#SBATCH +#SBATCH --mail-type=END +#SBATCH --mail-user=pparsan1@jhu.edu +#SBATCH --time=72:0:0 +#SBATCH --partition=lrgmem +#SBATCH --ntasks-per-node=10 +#SBATCH --mem=55G + +module load gcc/5.5.0 +module load R/3.4.0 + +cd /work-zfs/abattle4/parsana/networks_correction/publication_rmd/simulation_scale-free_matched_gtex/overlap_confound +Rscript infer_nets.R $1 >log/$1.log diff --git a/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/submit_jobs.sh b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/submit_jobs.sh new file mode 100644 index 0000000..4e6fb89 --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/submit_jobs.sh @@ -0,0 +1,3 @@ +sbatch -J data run_sim.sh data +sbatch -J confounded run_sim.sh confounded +sbatch -J corrected run_sim.sh corrected diff --git a/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/generate_data.sh b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/generate_data.sh new file mode 100644 index 0000000..d1c973a --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/generate_data.sh @@ -0,0 +1,15 @@ +#!/bin/bash -l +#SBATCH +#SBATCH --mail-type=END +#SBATCH --mail-user=pparsan1@jhu.edu +#SBATCH --time=0:30:0 +#SBATCH --partition=quickdebug +#SBATCH --mem=10G + + +module load gcc/5.5.0 +module load R/3.4.0 + +cd /work-zfs/abattle4/parsana/networks_correction/publication_rmd/simulation_scale-free_matched_gtex/overlap_confound_withsf/ +Rscript generate_sim_data.R >log/generate_sim_data.log +~ diff --git a/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/generate_sim_data.R b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/generate_sim_data.R new file mode 100644 index 0000000..7d7eac3 --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/generate_sim_data.R @@ -0,0 +1,65 @@ +library(sva) +library(huge) +library(QUIC) +library(parallel) +#simulate data + +set.seed(101) + +if(!any(grepl("simulated_data.Rds", dir()))){ + +## generate simulated scale free network +dat <- huge.generator(n = 350, d = 5000, graph = "scale-free", v = NULL, u = NULL, + g = NULL, prob = NULL, vis = F, verbose = TRUE) + +sim.dat <- dat$data +sim.theta <- dat$theta +saveRDS(sim.dat, file = "simulated_data.Rds") +saveRDS(sim.theta, file = "simulated_network.Rds") +}else{ + sim.dat <- readRDS("simulated_data.Rds") +} + +n <- nrow(sim.dat) +p <- ncol(sim.dat) +################### confounded +## confounded data + +sim.dat.scaled <- scale(sim.dat) +sim.confounded=sim.dat.scaled + +## set seed +set.seed(111) +seed_samples <- sample(100:200, 30) + +for(eachseed in seed_samples){ + set.seed(eachseed) + thisgrp = rnorm(n) + sf = sample(2:3,1) + sel.genes = sample(250:500, 1) + thisgenes = sample(1500:3500, sel.genes) + thisprob = runif(sel.genes, 0, 1) + #print(thisgenes) + print(sf) + this_gene_prob = 0 + for(i in thisgenes){ + this_gene_prob = this_gene_prob + 1 + sim.confounded[,i] = sim.confounded[,i] + (sf*thisgrp*thisprob[this_gene_prob]) + } +} + + +### correct data +mod=matrix(1,nrow=dim(sim.confounded)[1],ncol=1) +colnames(mod)="Intercept" +nsv=num.sv(t(sim.confounded),mod, method = "be") +print(paste("the number of PCs estimated to be removed:", nsv)) +ss=svd(scale(sim.confounded)) +grp.est=ss$u[,1:nsv] +sim.corrected=lm(sim.confounded~grp.est)$residuals + +saveRDS(sim.corrected, file = "simulated_corrected.Rds") +saveRDS(sim.confounded, file = "simulated_confounded.Rds") +#saveRDS(sim.dat, file = "simulated_data.Rds") +#saveRDS(sim.theta, file = "simulated_network.Rds") +print("Done") diff --git a/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/infer_nets.R b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/infer_nets.R new file mode 100644 index 0000000..3db93e3 --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/infer_nets.R @@ -0,0 +1,43 @@ +library(sva) +library(huge) +library(QUIC) +library(parallel) +library(Matrix) + +######## functions +q_normalize <- function(dat){ + n = nrow(dat) + p = ncol(dat) + rank.dat = dat # matrix for ranking + for (i in 1:p){ + rank.dat[,i] = rank(dat[,i]) + } + U = rank.dat/(n+1) + norm.dat = qnorm(U) + norm.dat +} +################### + + +lambda=seq(0.1,1,length.out=20) + +## read input +inputargs <- commandArgs(TRUE) +fn <- inputargs[1] +#sn <- inputargs[2] +## load data +sim.dat <- readRDS(paste("simulated_", fn, ".Rds", sep = "")) + +## build networks +sim.cov <- cov(scale(sim.dat)) + +sim.net <- mclapply(lambda, function(graphlasso, thisdat){ + thisnet <- QUIC::QUIC(S = thisdat, rho = graphlasso)$X + thisnet[thisnet!=0] <- 1 + thisnet <- Matrix(thisnet, sparse = T) + thisnet +}, sim.cov, mc.cores = 29) + + +saveRDS(sim.net, file = paste("networks_", fn, ".Rds", sep = "")) + diff --git a/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/run_sim.sh b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/run_sim.sh new file mode 100644 index 0000000..efcc678 --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/run_sim.sh @@ -0,0 +1,15 @@ +#!/bin/bash -l +#SBATCH +#SBATCH --mail-type=END +#SBATCH --mail-user=pparsan1@jhu.edu +#SBATCH --time=30:0:0 +#SBATCH --partition=lrgmem +#SBATCH --ntasks-per-node=5 +#SBATCH --mem=55G + + +module load gcc/5.5.0 +module load R/3.4.0 + +cd /work-zfs/abattle4/parsana/networks_correction/publication_rmd/simulation_scale-free_matched_gtex/overlap_confound_withsf +Rscript infer_nets.R $1 >log/$1.log diff --git a/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/submit_jobs.sh b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/submit_jobs.sh new file mode 100644 index 0000000..6ec2f2d --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/submit_jobs.sh @@ -0,0 +1,3 @@ +sbatch run_sim.sh data +sbatch run_sim.sh confounded +sbatch run_sim.sh corrected diff --git a/publication_rmd/simulation_scale-free_matched_gtex/sim.R b/publication_rmd/simulation_scale-free_matched_gtex/sim.R deleted file mode 100644 index 3b9bd20..0000000 --- a/publication_rmd/simulation_scale-free_matched_gtex/sim.R +++ /dev/null @@ -1,67 +0,0 @@ -library(sva) -library(huge) -library(QUIC) -library(parallel) -#simulate data - -lambda=seq(0,1,length.out=30)[-1] -set.seed(101) -## generate simulated scale free network -dat <- huge.generator(n = 350, d = 5000, graph = "scale-free", v = NULL, u = NULL, - g = NULL, prob = NULL, vis = F, verbose = TRUE) -sim.dat <- dat$data -n <- nrow(sim.dat) -p <- ncol(sim.dat) - - -## build networks -sim.cov <- cov(scale(sim.dat)) -sim.net <- mclapply(lambda, function(graphlasso, thisdat){ - thisnet <- QUIC::QUIC(S = thisdat, rho = graphlasso)$X - thisnet[thisnet!=0] <- 1 - thisnet -}, sim.cov, mc.cores = 29) - -a <- 1 - -save(sim.net, file = "sim.Rdata") - -################### confounded -## confounded data - -sim.confounded=sim.dat -set.seed(101) -grp=rnorm(n) -for(i in 10:30){ - sim.confounded[,i] = sim.confounded[,i] + 5*grp -} - -sim.confounded.cov <- cov(scale(sim.confounded)) - -sim.confounded.net <- mclapply(lambda, function(graphlasso, thisdat){ - thisnet <- QUIC::QUIC(S = thisdat, rho = graphlasso)$X - thisnet[thisnet!=0] <- 1 - thisnet -}, sim.confounded.cov, mc.cores = 29) -save(sim.confounded.net, file = "sim_confounded.Rdata") - -################### corrected -mod=matrix(1,nrow=dim(sim.confounded)[1],ncol=1) -colnames(mod)="Intercept" -nsv=num.sv(t(sim.confounded),mod, method = "be") -print(paste("the number of PCs estimated to be removed:", nsv)) -ss=svd(scale(sim.confounded)) -grp.est=ss$u[,1:nsv] -sim.corrected=lm(sim.confounded~grp.est)$residuals - -sim.corrected.cov <- cov(scale(sim.corrected)) - -sim.corrected.net <- mclapply(lambda, function(graphlasso, thisdat){ - thisnet <- QUIC::QUIC(S = thisdat, rho = graphlasso)$X - thisnet[thisnet!=0] <- 1 - thisnet -}, sim.corrected.cov, mc.cores = 29) - -save(sim.corrected.net, file = "sim_corrected.Rdata") - -############################################## diff --git a/publication_rmd/simulation_scale-free_matched_gtex/sim_scale_free_matched.Rmd b/publication_rmd/simulation_scale-free_matched_gtex/sim_scale_free_matched.Rmd deleted file mode 100755 index 5ff4e38..0000000 --- a/publication_rmd/simulation_scale-free_matched_gtex/sim_scale_free_matched.Rmd +++ /dev/null @@ -1,69 +0,0 @@ ---- -title: "Simulation: Scale-free networks matched to GTEx" -output: - html_notebook: default - html_document: - df_print: paged - pdf_document: default ---- - -```{r, message=FALSE} -library(huge, quietly = T) -library(Matrix) -``` - -Here, we first simulated data from a multivariate gaussian distribution based on an underlying scale-free network structure in the precision matrix. We tried to match the number of samples to the number of samples in our empirical experiments done with GTEx data. This simulation included 5000 genes across 350 samples. We found that the number of false discoveries is much higher in confounded data. Further, this analysis shows that PC correction increased the number of true positives in the inferred networks - numbers that were similar to networks obtained from original simulated data(without confounding). -```{r} -set.seed(101) -## generate simulated scale free network -dat <- huge.generator(n = 350, d = 5000, graph = "scale-free", v = NULL, u = NULL, - g = NULL, prob = NULL, vis = F, verbose = TRUE) - - -dat.theta <- dat$theta - -rm(dat) -## load networks inferred with simulated data -sim.net <- readRDS("sim.Rds") -sim.confounded.net <- readRDS("sim_confounded.Rds") -sim.corrected.net <- readRDS("sim_corrected.Rds") - -a=5 - -## Count the total number of edges in the network -true_ecount <- sum(dat.theta == 1 & col(dat.theta) < row(dat.theta)) -confounded_ecount <- sapply(sim.confounded.net, function(x) sum(x == 1 & col(x) < row(x))) -sim_ecount <- sapply(sim.net,function(x) sum(x == 1 & col(x) < row(x))) -corrected_ecount <- sapply(sim.corrected.net, function(x) sum(x == 1 & col(x) < row(x))) - -gc() -## count the number of edges common with the simulated network -confounded_common <- sapply(sim.confounded.net, function(x,y) {sum((x+y) == 2 & col(x) < row(x))}, dat.theta) -sim_common <- sapply(sim.net, function(x,y) {sum((x+y) == 2 & col(x) < row(x))}, dat.theta) -corrected_common <- sapply(sim.corrected.net, function(x,y) {sum((x+y) == 2 & col(x) < row(x))}, dat.theta) - -print(paste("The number of edges in the true network:", true_ecount)) -print(paste("The number of edges in the inferred network", sim_ecount[a])) -print(paste("The number of edges in the confounded network", confounded_ecount[a])) -print(paste("The number of edges in the corrected network", corrected_ecount[a])) -print(paste("The number common edges in the confounded and true network", confounded_common[a])) -print(paste("The number common edges in the inferred and true network", sim_common[a])) -print(paste("The number common edges in the inferred and true network", corrected_common[a])) - -``` -Next, we checked the proportion of edges from original simulated data networks that were also identified in PC corrected networks, and proportion of edges from original simulated data networks that were also identified in confounded networks. -```{r} -sim_corrected_common <- mapply(function(x,y){ - sum(x+y == 2 & col(x) < row(x)) -}, sim.net, sim.corrected.net) - - -sim_corrected_common/sim_ecount - -sim_confounded_common <- mapply(function(x,y){ - sum(x+y == 2 & col(x) < row(x)) -}, sim.net, sim.confounded.net) - -sim_confounded_common/sim_ecount - -``` \ No newline at end of file diff --git a/publication_rmd/simulation_scale-free_matched_gtex/sim_scale_free_matched.html b/publication_rmd/simulation_scale-free_matched_gtex/sim_scale_free_matched.html deleted file mode 100755 index 994eb79..0000000 --- a/publication_rmd/simulation_scale-free_matched_gtex/sim_scale_free_matched.html +++ /dev/null @@ -1,400 +0,0 @@ - - - - -
- - - - - - - - -library(huge, quietly = T)
-library(Matrix)
-
-
-
-Here, we first simulated data from a multivariate gaussian distribution based on an underlying scale-free network structure in the precision matrix. We tried to match the number of samples to the number of samples in our empirical experiments done with GTEx data. This simulation included 5000 genes across 350 samples. We found that the number of false discoveries is much higher in confounded data. Further, this analysis shows that PC correction increased the number of true positives in the inferred networks - numbers that were similar to networks obtained from original simulated data(without confounding).
- - - -set.seed(101)
-## generate simulated scale free network
-dat <- huge.generator(n = 350, d = 5000, graph = "scale-free", v = NULL, u = NULL,
- g = NULL, prob = NULL, vis = F, verbose = TRUE)
-
-
-Generating data from the multivariate normal distribution with the scale-free graph structure....done.
-
-
-dat.theta <- dat$theta
-rm(dat)
-## load networks inferred with simulated data
-sim.net <- readRDS("sim.Rds")
-sim.confounded.net <- readRDS("sim_confounded.Rds")
-sim.corrected.net <- readRDS("sim_corrected.Rds")
-a=5
-## Count the total number of edges in the network
-true_ecount <- sum(dat.theta == 1 & col(dat.theta) < row(dat.theta))
-confounded_ecount <- sapply(sim.confounded.net, function(x) sum(x == 1 & col(x) < row(x)))
-sim_ecount <- sapply(sim.net,function(x) sum(x == 1 & col(x) < row(x)))
-corrected_ecount <- sapply(sim.corrected.net, function(x) sum(x == 1 & col(x) < row(x)))
-gc()
-
-
- used (Mb) gc trigger (Mb) max used (Mb)
-Ncells 2603099 139.1 4738752 253.1 4738752 253.1
-Vcells 20888753 159.4 105872022 807.8 2108080318 16083.4
-
-
-## count the number of edges common with the simulated network
-confounded_common <- sapply(sim.confounded.net, function(x,y) {sum((x+y) == 2 & col(x) < row(x))}, dat.theta)
-sim_common <- sapply(sim.net, function(x,y) {sum((x+y) == 2 & col(x) < row(x))}, dat.theta)
-corrected_common <- sapply(sim.corrected.net, function(x,y) {sum((x+y) == 2 & col(x) < row(x))}, dat.theta)
-print(paste("The number of edges in the true network:", true_ecount))
-
-
-[1] "The number of edges in the true network: 4999"
-
-
-print(paste("The number of edges in the inferred network", sim_ecount[a]))
-
-
-[1] "The number of edges in the inferred network 15284"
-
-
-print(paste("The number of edges in the confounded network", confounded_ecount[a]))
-
-
-[1] "The number of edges in the confounded network 240955"
-
-
-print(paste("The number of edges in the corrected network", corrected_ecount[a]))
-
-
-[1] "The number of edges in the corrected network 15730"
-
-
-print(paste("The number common edges in the confounded and true network", confounded_common[a]))
-
-
-[1] "The number common edges in the confounded and true network 163"
-
-
-print(paste("The number common edges in the inferred and true network", sim_common[a]))
-
-
-[1] "The number common edges in the inferred and true network 399"
-
-
-print(paste("The number common edges in the inferred and true network", corrected_common[a]))
-
-
-[1] "The number common edges in the inferred and true network 395"
-
-
-
-Next, we checked the proportion of edges from original simulated data networks that were also identified in PC corrected networks, and proportion of edges from original simulated data networks that were also identified in confounded networks.
- - - -sim_corrected_common <- mapply(function(x,y){
- sum(x+y == 2 & col(x) < row(x))
-}, sim.net, sim.corrected.net)
-sim_corrected_common/sim_ecount
-
-
- [1] 0.9471285 0.9508433 0.9498509 0.9429816 0.9325438 0.9284082 0.9000000 0.9565217 1.0000000
-[10] 1.0000000 1.0000000 NaN NaN NaN NaN NaN NaN NaN
-[19] NaN NaN NaN NaN NaN NaN NaN NaN NaN
-[28] NaN NaN
-
-
-sim_confounded_common <- mapply(function(x,y){
- sum(x+y == 2 & col(x) < row(x))
-}, sim.net, sim.confounded.net)
-sim_confounded_common/sim_ecount
-
-
- [1] 0.02322967 0.02221992 0.02121465 0.01983463 0.02152578 0.02208682 0.03846154 0.13043478
- [9] 0.12500000 0.00000000 0.00000000 NaN NaN NaN NaN NaN
-[17] NaN NaN NaN NaN NaN NaN NaN NaN
-[25] NaN NaN NaN NaN NaN
-
-
-
-library(huge, quietly = T)
-library(Matrix)
-
-
-
-Here, we first simulated data from a multivariate gaussian distribution based on an underlying scale-free network structure in the precision matrix. We tried to match the number of samples to the number of samples in our empirical experiments done with GTEx data. This simulation included 5000 genes across 350 samples. We found that the number of false discoveries is much higher in confounded data. Further, this analysis shows that PC correction increased the number of true positives in the inferred networks - numbers that were similar to networks obtained from original simulated data(without confounding).
- - - -set.seed(101)
-## generate simulated scale free network
-dat <- huge.generator(n = 350, d = 5000, graph = "scale-free", v = NULL, u = NULL,
- g = NULL, prob = NULL, vis = F, verbose = TRUE)
-
-
-Generating data from the multivariate normal distribution with the scale-free graph structure....done.
-
-
-dat.theta <- dat$theta
-rm(dat)
-## load networks inferred with simulated data
-sim.net <- readRDS("sim.Rds")
-sim.confounded.net <- readRDS("sim_confounded.Rds")
-sim.corrected.net <- readRDS("sim_corrected.Rds")
-a=5
-## Count the total number of edges in the network
-true_ecount <- sum(dat.theta == 1 & col(dat.theta) < row(dat.theta))
-confounded_ecount <- sapply(sim.confounded.net, function(x) sum(x == 1 & col(x) < row(x)))
-sim_ecount <- sapply(sim.net,function(x) sum(x == 1 & col(x) < row(x)))
-corrected_ecount <- sapply(sim.corrected.net, function(x) sum(x == 1 & col(x) < row(x)))
-gc()
-
-
- used (Mb) gc trigger (Mb) max used (Mb)
-Ncells 2603099 139.1 4738752 253.1 4738752 253.1
-Vcells 20888753 159.4 105872022 807.8 2108080318 16083.4
-
-
-## count the number of edges common with the simulated network
-confounded_common <- sapply(sim.confounded.net, function(x,y) {sum((x+y) == 2 & col(x) < row(x))}, dat.theta)
-sim_common <- sapply(sim.net, function(x,y) {sum((x+y) == 2 & col(x) < row(x))}, dat.theta)
-corrected_common <- sapply(sim.corrected.net, function(x,y) {sum((x+y) == 2 & col(x) < row(x))}, dat.theta)
-print(paste("The number of edges in the true network:", true_ecount))
-
-
-[1] "The number of edges in the true network: 4999"
-
-
-print(paste("The number of edges in the inferred network", sim_ecount[a]))
-
-
-[1] "The number of edges in the inferred network 15284"
-
-
-print(paste("The number of edges in the confounded network", confounded_ecount[a]))
-
-
-[1] "The number of edges in the confounded network 240955"
-
-
-print(paste("The number of edges in the corrected network", corrected_ecount[a]))
-
-
-[1] "The number of edges in the corrected network 15730"
-
-
-print(paste("The number common edges in the confounded and true network", confounded_common[a]))
-
-
-[1] "The number common edges in the confounded and true network 163"
-
-
-print(paste("The number common edges in the inferred and true network", sim_common[a]))
-
-
-[1] "The number common edges in the inferred and true network 399"
-
-
-print(paste("The number common edges in the inferred and true network", corrected_common[a]))
-
-
-[1] "The number common edges in the inferred and true network 395"
-
-
-
-Next, we checked the proportion of edges from original simulated data networks that were also identified in PC corrected networks, and proportion of edges from original simulated data networks that were also identified in confounded networks.
- - - -sim_corrected_common <- mapply(function(x,y){
- sum(x+y == 2 & col(x) < row(x))
-}, sim.net, sim.corrected.net)
-sim_corrected_common/sim_ecount
-
-
- [1] 0.9471285 0.9508433 0.9498509 0.9429816 0.9325438 0.9284082 0.9000000 0.9565217 1.0000000
-[10] 1.0000000 1.0000000 NaN NaN NaN NaN NaN NaN NaN
-[19] NaN NaN NaN NaN NaN NaN NaN NaN NaN
-[28] NaN NaN
-
-
-sim_confounded_common <- mapply(function(x,y){
- sum(x+y == 2 & col(x) < row(x))
-}, sim.net, sim.confounded.net)
-sim_confounded_common/sim_ecount
-
-
- [1] 0.02322967 0.02221992 0.02121465 0.01983463 0.02152578 0.02208682 0.03846154 0.13043478
- [9] 0.12500000 0.00000000 0.00000000 NaN NaN NaN NaN NaN
-[17] NaN NaN NaN NaN NaN NaN NaN NaN
-[25] NaN NaN NaN NaN NaN
-
-
-
-