From ff622d472fe7250575658cdf6bc269693d19cfe7 Mon Sep 17 00:00:00 2001 From: Princy Parsana Date: Tue, 12 Feb 2019 15:01:43 -0500 Subject: [PATCH] simulations matched to gtex --- .../no_scalar/generate_sim_data.R | 48 +++ .../alternate_versions/no_scalar/infer_nets.R | 43 ++ .../no_scalar}/run_sim.sh | 6 +- .../no_scalar/submit_jobs.sh | 3 + .../overlap_confound/generate_data.sh | 15 + .../overlap_confound/generate_sim_data.R | 52 +++ .../overlap_confound/infer_nets.R | 43 ++ .../overlap_confound/run_sim.sh | 14 + .../overlap_confound/submit_jobs.sh | 3 + .../overlap_confound_withsf/generate_data.sh | 15 + .../generate_sim_data.R | 65 +++ .../overlap_confound_withsf/infer_nets.R | 43 ++ .../overlap_confound_withsf/run_sim.sh | 15 + .../overlap_confound_withsf/submit_jobs.sh | 3 + .../simulation_scale-free_matched_gtex/sim.R | 67 --- .../sim_scale_free_matched.Rmd | 69 --- .../sim_scale_free_matched.html | 400 ------------------ .../sim_scale_free_matched.nb.html | 400 ------------------ .../weighted_confound/generate_data.sh | 16 + .../weighted_confound/generate_sim_data.R | 57 +++ .../weighted_confound/infer_nets.R | 43 ++ .../weighted_confound/run_sim.sh | 15 + .../weighted_confound/slurm-32414331.out | 35 ++ .../weighted_confound/slurm-32415126.out | 35 ++ .../weighted_confound/slurm-32415127.out | 35 ++ .../weighted_confound/slurm-32415128.out | 35 ++ .../weighted_confound/submit_jobs.sh | 3 + .../generate_data.sh | 16 + .../generate_sim_data.R | 57 +++ .../infer_nets.R | 43 ++ .../run_sim.sh | 15 + .../slurm-32479419.out | 36 ++ .../slurm-32481963.out | 37 ++ .../submit_jobs.sh | 3 + 34 files changed, 846 insertions(+), 939 deletions(-) create mode 100644 publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/no_scalar/generate_sim_data.R create mode 100644 publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/no_scalar/infer_nets.R rename publication_rmd/simulation_scale-free_matched_gtex/{ => alternate_versions/no_scalar}/run_sim.sh (52%) create mode 100644 publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/no_scalar/submit_jobs.sh create mode 100644 publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/generate_data.sh create mode 100644 publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/generate_sim_data.R create mode 100644 publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/infer_nets.R create mode 100644 publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/run_sim.sh create mode 100644 publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/submit_jobs.sh create mode 100644 publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/generate_data.sh create mode 100644 publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/generate_sim_data.R create mode 100644 publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/infer_nets.R create mode 100644 publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/run_sim.sh create mode 100644 publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/submit_jobs.sh delete mode 100644 publication_rmd/simulation_scale-free_matched_gtex/sim.R delete mode 100755 publication_rmd/simulation_scale-free_matched_gtex/sim_scale_free_matched.Rmd delete mode 100755 publication_rmd/simulation_scale-free_matched_gtex/sim_scale_free_matched.html delete mode 100755 publication_rmd/simulation_scale-free_matched_gtex/sim_scale_free_matched.nb.html create mode 100644 publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/generate_data.sh create mode 100644 publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/generate_sim_data.R create mode 100644 publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/infer_nets.R create mode 100644 publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/run_sim.sh create mode 100644 publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/slurm-32414331.out create mode 100644 publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/slurm-32415126.out create mode 100644 publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/slurm-32415127.out create mode 100644 publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/slurm-32415128.out create mode 100644 publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/submit_jobs.sh create mode 100644 publication_rmd/simulation_scale-free_matched_gtex/weighted_confounded_largesample/generate_data.sh create mode 100644 publication_rmd/simulation_scale-free_matched_gtex/weighted_confounded_largesample/generate_sim_data.R create mode 100644 publication_rmd/simulation_scale-free_matched_gtex/weighted_confounded_largesample/infer_nets.R create mode 100644 publication_rmd/simulation_scale-free_matched_gtex/weighted_confounded_largesample/run_sim.sh create mode 100644 publication_rmd/simulation_scale-free_matched_gtex/weighted_confounded_largesample/slurm-32479419.out create mode 100644 publication_rmd/simulation_scale-free_matched_gtex/weighted_confounded_largesample/slurm-32481963.out create mode 100644 publication_rmd/simulation_scale-free_matched_gtex/weighted_confounded_largesample/submit_jobs.sh diff --git a/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/no_scalar/generate_sim_data.R b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/no_scalar/generate_sim_data.R new file mode 100644 index 0000000..d6e8c2d --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/no_scalar/generate_sim_data.R @@ -0,0 +1,48 @@ +library(sva) +library(huge) +library(QUIC) +library(parallel) +#simulate data + +lambda=seq(0,1,length.out=30)[-1] +set.seed(101) +## generate simulated scale free network +dat <- huge.generator(n = 350, d = 5000, graph = "scale-free", v = NULL, u = NULL, + g = NULL, prob = NULL, vis = F, verbose = TRUE) + +sim.dat <- dat$data +sim.theta <- dat$theta +n <- nrow(sim.dat) +p <- ncol(sim.dat) +################### confounded +## confounded data + +sim.dat.scaled <- scale(sim.dat) +sim.confounded=sim.dat.scaled +set.seed(101) +grp=rnorm(n) +scalar_constant = 1 +for(i in 1000:2000){ + sim.confounded[,i] = sim.confounded[,i] + scalar_constant*grp +} + +set.seed(102) +grp=rnorm(n) + +for(i in 1:500){ + sim.confounded[,i] = sim.confounded[,i] + scalar_constant*grp +} + +### correct data +mod=matrix(1,nrow=dim(sim.confounded)[1],ncol=1) +colnames(mod)="Intercept" +nsv=num.sv(t(sim.confounded),mod, method = "be") +print(paste("the number of PCs estimated to be removed:", nsv)) +ss=svd(scale(sim.confounded)) +grp.est=ss$u[,1:nsv] +sim.corrected=lm(sim.confounded~grp.est)$residuals + +saveRDS(sim.corrected, file = "simulated_corrected.Rds") +saveRDS(sim.confounded, file = "simulated_confounded.Rds") +saveRDS(sim.dat, file = "simulated_data.Rds") +saveRDS(sim.theta, file = "simulated_network.Rds") diff --git a/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/no_scalar/infer_nets.R b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/no_scalar/infer_nets.R new file mode 100644 index 0000000..3db93e3 --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/no_scalar/infer_nets.R @@ -0,0 +1,43 @@ +library(sva) +library(huge) +library(QUIC) +library(parallel) +library(Matrix) + +######## functions +q_normalize <- function(dat){ + n = nrow(dat) + p = ncol(dat) + rank.dat = dat # matrix for ranking + for (i in 1:p){ + rank.dat[,i] = rank(dat[,i]) + } + U = rank.dat/(n+1) + norm.dat = qnorm(U) + norm.dat +} +################### + + +lambda=seq(0.1,1,length.out=20) + +## read input +inputargs <- commandArgs(TRUE) +fn <- inputargs[1] +#sn <- inputargs[2] +## load data +sim.dat <- readRDS(paste("simulated_", fn, ".Rds", sep = "")) + +## build networks +sim.cov <- cov(scale(sim.dat)) + +sim.net <- mclapply(lambda, function(graphlasso, thisdat){ + thisnet <- QUIC::QUIC(S = thisdat, rho = graphlasso)$X + thisnet[thisnet!=0] <- 1 + thisnet <- Matrix(thisnet, sparse = T) + thisnet +}, sim.cov, mc.cores = 29) + + +saveRDS(sim.net, file = paste("networks_", fn, ".Rds", sep = "")) + diff --git a/publication_rmd/simulation_scale-free_matched_gtex/run_sim.sh b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/no_scalar/run_sim.sh similarity index 52% rename from publication_rmd/simulation_scale-free_matched_gtex/run_sim.sh rename to publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/no_scalar/run_sim.sh index 15512b7..b43d6e0 100644 --- a/publication_rmd/simulation_scale-free_matched_gtex/run_sim.sh +++ b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/no_scalar/run_sim.sh @@ -4,12 +4,12 @@ #SBATCH --mail-user=pparsan1@jhu.edu #SBATCH --time=72:0:0 #SBATCH --partition=lrgmem -#SBATCH --ntasks-per-node=25 +#SBATCH --ntasks-per-node=10 #SBATCH --mem=70G module load gcc/5.5.0 module load R/3.4.0 -cd /work-zfs/abattle4/parsana/temp/scale-free_matched_sim/ -Rscript sim.R >sim_log +cd /work-zfs/abattle4/parsana/networks_correction/publication_rmd/simulation_scale-free_matched_gtex/no_scalar +Rscript infer_nets.R $1 >log/$1.log diff --git a/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/no_scalar/submit_jobs.sh b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/no_scalar/submit_jobs.sh new file mode 100644 index 0000000..6ec2f2d --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/no_scalar/submit_jobs.sh @@ -0,0 +1,3 @@ +sbatch run_sim.sh data +sbatch run_sim.sh confounded +sbatch run_sim.sh corrected diff --git a/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/generate_data.sh b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/generate_data.sh new file mode 100644 index 0000000..2878ffa --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/generate_data.sh @@ -0,0 +1,15 @@ +#!/bin/bash -l +#SBATCH +#SBATCH --mail-type=END +#SBATCH --mail-user=pparsan1@jhu.edu +#SBATCH --time=1:0:0 +#SBATCH --partition=lrgmem +#SBATCH --mem=10G + + +module load gcc/5.5.0 +module load R/3.4.0 + +cd /work-zfs/abattle4/parsana/networks_correction/publication_rmd/simulation_scale-free_matched_gtex/overlap_confound/ +Rscript generate_sim_data.R >log/generate_sim_data.log +~ diff --git a/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/generate_sim_data.R b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/generate_sim_data.R new file mode 100644 index 0000000..a3597c6 --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/generate_sim_data.R @@ -0,0 +1,52 @@ +library(sva) +library(huge) +library(QUIC) +library(parallel) +#simulate data + +set.seed(101) +## generate simulated scale free network +dat <- huge.generator(n = 350, d = 5000, graph = "scale-free", v = NULL, u = NULL, + g = NULL, prob = NULL, vis = F, verbose = TRUE) + +sim.dat <- dat$data +sim.theta <- dat$theta +n <- nrow(sim.dat) +p <- ncol(sim.dat) +################### confounded +## confounded data + +sim.dat.scaled <- scale(sim.dat) +sim.confounded=sim.dat.scaled + +## set seed +set.seed(101) +seed_samples <- sample(100:200, 5) + +for(eachseed in seed_samples){ + set.seed(eachseed) + thisgrp = rnorm(n) + thisprob = runif(500, 0,1) + thisgenes = sample(1000:5000, 500) + print(thisgenes) + gene_prob_counter = 0 + for(i in thisgenes){ + gene_prob_counter = gene_prob_counter + 1 + sim.confounded[,i] = sim.confounded[,i] + (thisgrp*thisprob[gene_prob_counter]) + } +} + + +### correct data +mod=matrix(1,nrow=dim(sim.confounded)[1],ncol=1) +colnames(mod)="Intercept" +nsv=num.sv(t(sim.confounded),mod, method = "be") +print(paste("the number of PCs estimated to be removed:", nsv)) +ss=svd(scale(sim.confounded)) +grp.est=ss$u[,1:nsv] +sim.corrected=lm(sim.confounded~grp.est)$residuals + +saveRDS(sim.corrected, file = "simulated_corrected.Rds") +saveRDS(sim.confounded, file = "simulated_confounded.Rds") +saveRDS(sim.dat, file = "simulated_data.Rds") +saveRDS(sim.theta, file = "simulated_network.Rds") diff --git a/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/infer_nets.R b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/infer_nets.R new file mode 100644 index 0000000..3db93e3 --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/infer_nets.R @@ -0,0 +1,43 @@ +library(sva) +library(huge) +library(QUIC) +library(parallel) +library(Matrix) + +######## functions +q_normalize <- function(dat){ + n = nrow(dat) + p = ncol(dat) + rank.dat = dat # matrix for ranking + for (i in 1:p){ + rank.dat[,i] = rank(dat[,i]) + } + U = rank.dat/(n+1) + norm.dat = qnorm(U) + norm.dat +} +################### + + +lambda=seq(0.1,1,length.out=20) + +## read input +inputargs <- commandArgs(TRUE) +fn <- inputargs[1] +#sn <- inputargs[2] +## load data +sim.dat <- readRDS(paste("simulated_", fn, ".Rds", sep = "")) + +## build networks +sim.cov <- cov(scale(sim.dat)) + +sim.net <- mclapply(lambda, function(graphlasso, thisdat){ + thisnet <- QUIC::QUIC(S = thisdat, rho = graphlasso)$X + thisnet[thisnet!=0] <- 1 + thisnet <- Matrix(thisnet, sparse = T) + thisnet +}, sim.cov, mc.cores = 29) + + +saveRDS(sim.net, file = paste("networks_", fn, ".Rds", sep = "")) + diff --git a/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/run_sim.sh b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/run_sim.sh new file mode 100644 index 0000000..4c88982 --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/run_sim.sh @@ -0,0 +1,14 @@ +#!/bin/bash -l +#SBATCH +#SBATCH --mail-type=END +#SBATCH --mail-user=pparsan1@jhu.edu +#SBATCH --time=72:0:0 +#SBATCH --partition=lrgmem +#SBATCH --ntasks-per-node=10 +#SBATCH --mem=55G + +module load gcc/5.5.0 +module load R/3.4.0 + +cd /work-zfs/abattle4/parsana/networks_correction/publication_rmd/simulation_scale-free_matched_gtex/overlap_confound +Rscript infer_nets.R $1 >log/$1.log diff --git a/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/submit_jobs.sh b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/submit_jobs.sh new file mode 100644 index 0000000..4e6fb89 --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound/submit_jobs.sh @@ -0,0 +1,3 @@ +sbatch -J data run_sim.sh data +sbatch -J confounded run_sim.sh confounded +sbatch -J corrected run_sim.sh corrected diff --git a/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/generate_data.sh b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/generate_data.sh new file mode 100644 index 0000000..d1c973a --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/generate_data.sh @@ -0,0 +1,15 @@ +#!/bin/bash -l +#SBATCH +#SBATCH --mail-type=END +#SBATCH --mail-user=pparsan1@jhu.edu +#SBATCH --time=0:30:0 +#SBATCH --partition=quickdebug +#SBATCH --mem=10G + + +module load gcc/5.5.0 +module load R/3.4.0 + +cd /work-zfs/abattle4/parsana/networks_correction/publication_rmd/simulation_scale-free_matched_gtex/overlap_confound_withsf/ +Rscript generate_sim_data.R >log/generate_sim_data.log +~ diff --git a/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/generate_sim_data.R b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/generate_sim_data.R new file mode 100644 index 0000000..7d7eac3 --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/generate_sim_data.R @@ -0,0 +1,65 @@ +library(sva) +library(huge) +library(QUIC) +library(parallel) +#simulate data + +set.seed(101) + +if(!any(grepl("simulated_data.Rds", dir()))){ + +## generate simulated scale free network +dat <- huge.generator(n = 350, d = 5000, graph = "scale-free", v = NULL, u = NULL, + g = NULL, prob = NULL, vis = F, verbose = TRUE) + +sim.dat <- dat$data +sim.theta <- dat$theta +saveRDS(sim.dat, file = "simulated_data.Rds") +saveRDS(sim.theta, file = "simulated_network.Rds") +}else{ + sim.dat <- readRDS("simulated_data.Rds") +} + +n <- nrow(sim.dat) +p <- ncol(sim.dat) +################### confounded +## confounded data + +sim.dat.scaled <- scale(sim.dat) +sim.confounded=sim.dat.scaled + +## set seed +set.seed(111) +seed_samples <- sample(100:200, 30) + +for(eachseed in seed_samples){ + set.seed(eachseed) + thisgrp = rnorm(n) + sf = sample(2:3,1) + sel.genes = sample(250:500, 1) + thisgenes = sample(1500:3500, sel.genes) + thisprob = runif(sel.genes, 0, 1) + #print(thisgenes) + print(sf) + this_gene_prob = 0 + for(i in thisgenes){ + this_gene_prob = this_gene_prob + 1 + sim.confounded[,i] = sim.confounded[,i] + (sf*thisgrp*thisprob[this_gene_prob]) + } +} + + +### correct data +mod=matrix(1,nrow=dim(sim.confounded)[1],ncol=1) +colnames(mod)="Intercept" +nsv=num.sv(t(sim.confounded),mod, method = "be") +print(paste("the number of PCs estimated to be removed:", nsv)) +ss=svd(scale(sim.confounded)) +grp.est=ss$u[,1:nsv] +sim.corrected=lm(sim.confounded~grp.est)$residuals + +saveRDS(sim.corrected, file = "simulated_corrected.Rds") +saveRDS(sim.confounded, file = "simulated_confounded.Rds") +#saveRDS(sim.dat, file = "simulated_data.Rds") +#saveRDS(sim.theta, file = "simulated_network.Rds") +print("Done") diff --git a/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/infer_nets.R b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/infer_nets.R new file mode 100644 index 0000000..3db93e3 --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/infer_nets.R @@ -0,0 +1,43 @@ +library(sva) +library(huge) +library(QUIC) +library(parallel) +library(Matrix) + +######## functions +q_normalize <- function(dat){ + n = nrow(dat) + p = ncol(dat) + rank.dat = dat # matrix for ranking + for (i in 1:p){ + rank.dat[,i] = rank(dat[,i]) + } + U = rank.dat/(n+1) + norm.dat = qnorm(U) + norm.dat +} +################### + + +lambda=seq(0.1,1,length.out=20) + +## read input +inputargs <- commandArgs(TRUE) +fn <- inputargs[1] +#sn <- inputargs[2] +## load data +sim.dat <- readRDS(paste("simulated_", fn, ".Rds", sep = "")) + +## build networks +sim.cov <- cov(scale(sim.dat)) + +sim.net <- mclapply(lambda, function(graphlasso, thisdat){ + thisnet <- QUIC::QUIC(S = thisdat, rho = graphlasso)$X + thisnet[thisnet!=0] <- 1 + thisnet <- Matrix(thisnet, sparse = T) + thisnet +}, sim.cov, mc.cores = 29) + + +saveRDS(sim.net, file = paste("networks_", fn, ".Rds", sep = "")) + diff --git a/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/run_sim.sh b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/run_sim.sh new file mode 100644 index 0000000..efcc678 --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/run_sim.sh @@ -0,0 +1,15 @@ +#!/bin/bash -l +#SBATCH +#SBATCH --mail-type=END +#SBATCH --mail-user=pparsan1@jhu.edu +#SBATCH --time=30:0:0 +#SBATCH --partition=lrgmem +#SBATCH --ntasks-per-node=5 +#SBATCH --mem=55G + + +module load gcc/5.5.0 +module load R/3.4.0 + +cd /work-zfs/abattle4/parsana/networks_correction/publication_rmd/simulation_scale-free_matched_gtex/overlap_confound_withsf +Rscript infer_nets.R $1 >log/$1.log diff --git a/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/submit_jobs.sh b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/submit_jobs.sh new file mode 100644 index 0000000..6ec2f2d --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/alternate_versions/overlap_confound_withsf/submit_jobs.sh @@ -0,0 +1,3 @@ +sbatch run_sim.sh data +sbatch run_sim.sh confounded +sbatch run_sim.sh corrected diff --git a/publication_rmd/simulation_scale-free_matched_gtex/sim.R b/publication_rmd/simulation_scale-free_matched_gtex/sim.R deleted file mode 100644 index 3b9bd20..0000000 --- a/publication_rmd/simulation_scale-free_matched_gtex/sim.R +++ /dev/null @@ -1,67 +0,0 @@ -library(sva) -library(huge) -library(QUIC) -library(parallel) -#simulate data - -lambda=seq(0,1,length.out=30)[-1] -set.seed(101) -## generate simulated scale free network -dat <- huge.generator(n = 350, d = 5000, graph = "scale-free", v = NULL, u = NULL, - g = NULL, prob = NULL, vis = F, verbose = TRUE) -sim.dat <- dat$data -n <- nrow(sim.dat) -p <- ncol(sim.dat) - - -## build networks -sim.cov <- cov(scale(sim.dat)) -sim.net <- mclapply(lambda, function(graphlasso, thisdat){ - thisnet <- QUIC::QUIC(S = thisdat, rho = graphlasso)$X - thisnet[thisnet!=0] <- 1 - thisnet -}, sim.cov, mc.cores = 29) - -a <- 1 - -save(sim.net, file = "sim.Rdata") - -################### confounded -## confounded data - -sim.confounded=sim.dat -set.seed(101) -grp=rnorm(n) -for(i in 10:30){ - sim.confounded[,i] = sim.confounded[,i] + 5*grp -} - -sim.confounded.cov <- cov(scale(sim.confounded)) - -sim.confounded.net <- mclapply(lambda, function(graphlasso, thisdat){ - thisnet <- QUIC::QUIC(S = thisdat, rho = graphlasso)$X - thisnet[thisnet!=0] <- 1 - thisnet -}, sim.confounded.cov, mc.cores = 29) -save(sim.confounded.net, file = "sim_confounded.Rdata") - -################### corrected -mod=matrix(1,nrow=dim(sim.confounded)[1],ncol=1) -colnames(mod)="Intercept" -nsv=num.sv(t(sim.confounded),mod, method = "be") -print(paste("the number of PCs estimated to be removed:", nsv)) -ss=svd(scale(sim.confounded)) -grp.est=ss$u[,1:nsv] -sim.corrected=lm(sim.confounded~grp.est)$residuals - -sim.corrected.cov <- cov(scale(sim.corrected)) - -sim.corrected.net <- mclapply(lambda, function(graphlasso, thisdat){ - thisnet <- QUIC::QUIC(S = thisdat, rho = graphlasso)$X - thisnet[thisnet!=0] <- 1 - thisnet -}, sim.corrected.cov, mc.cores = 29) - -save(sim.corrected.net, file = "sim_corrected.Rdata") - -############################################## diff --git a/publication_rmd/simulation_scale-free_matched_gtex/sim_scale_free_matched.Rmd b/publication_rmd/simulation_scale-free_matched_gtex/sim_scale_free_matched.Rmd deleted file mode 100755 index 5ff4e38..0000000 --- a/publication_rmd/simulation_scale-free_matched_gtex/sim_scale_free_matched.Rmd +++ /dev/null @@ -1,69 +0,0 @@ ---- -title: "Simulation: Scale-free networks matched to GTEx" -output: - html_notebook: default - html_document: - df_print: paged - pdf_document: default ---- - -```{r, message=FALSE} -library(huge, quietly = T) -library(Matrix) -``` - -Here, we first simulated data from a multivariate gaussian distribution based on an underlying scale-free network structure in the precision matrix. We tried to match the number of samples to the number of samples in our empirical experiments done with GTEx data. This simulation included 5000 genes across 350 samples. We found that the number of false discoveries is much higher in confounded data. Further, this analysis shows that PC correction increased the number of true positives in the inferred networks - numbers that were similar to networks obtained from original simulated data(without confounding). -```{r} -set.seed(101) -## generate simulated scale free network -dat <- huge.generator(n = 350, d = 5000, graph = "scale-free", v = NULL, u = NULL, - g = NULL, prob = NULL, vis = F, verbose = TRUE) - - -dat.theta <- dat$theta - -rm(dat) -## load networks inferred with simulated data -sim.net <- readRDS("sim.Rds") -sim.confounded.net <- readRDS("sim_confounded.Rds") -sim.corrected.net <- readRDS("sim_corrected.Rds") - -a=5 - -## Count the total number of edges in the network -true_ecount <- sum(dat.theta == 1 & col(dat.theta) < row(dat.theta)) -confounded_ecount <- sapply(sim.confounded.net, function(x) sum(x == 1 & col(x) < row(x))) -sim_ecount <- sapply(sim.net,function(x) sum(x == 1 & col(x) < row(x))) -corrected_ecount <- sapply(sim.corrected.net, function(x) sum(x == 1 & col(x) < row(x))) - -gc() -## count the number of edges common with the simulated network -confounded_common <- sapply(sim.confounded.net, function(x,y) {sum((x+y) == 2 & col(x) < row(x))}, dat.theta) -sim_common <- sapply(sim.net, function(x,y) {sum((x+y) == 2 & col(x) < row(x))}, dat.theta) -corrected_common <- sapply(sim.corrected.net, function(x,y) {sum((x+y) == 2 & col(x) < row(x))}, dat.theta) - -print(paste("The number of edges in the true network:", true_ecount)) -print(paste("The number of edges in the inferred network", sim_ecount[a])) -print(paste("The number of edges in the confounded network", confounded_ecount[a])) -print(paste("The number of edges in the corrected network", corrected_ecount[a])) -print(paste("The number common edges in the confounded and true network", confounded_common[a])) -print(paste("The number common edges in the inferred and true network", sim_common[a])) -print(paste("The number common edges in the inferred and true network", corrected_common[a])) - -``` -Next, we checked the proportion of edges from original simulated data networks that were also identified in PC corrected networks, and proportion of edges from original simulated data networks that were also identified in confounded networks. -```{r} -sim_corrected_common <- mapply(function(x,y){ - sum(x+y == 2 & col(x) < row(x)) -}, sim.net, sim.corrected.net) - - -sim_corrected_common/sim_ecount - -sim_confounded_common <- mapply(function(x,y){ - sum(x+y == 2 & col(x) < row(x)) -}, sim.net, sim.confounded.net) - -sim_confounded_common/sim_ecount - -``` \ No newline at end of file diff --git a/publication_rmd/simulation_scale-free_matched_gtex/sim_scale_free_matched.html b/publication_rmd/simulation_scale-free_matched_gtex/sim_scale_free_matched.html deleted file mode 100755 index 994eb79..0000000 --- a/publication_rmd/simulation_scale-free_matched_gtex/sim_scale_free_matched.html +++ /dev/null @@ -1,400 +0,0 @@ - - - - - - - - - - - - - -Simulation: Scale-free networks matched to GTEx - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - - - - -
library(huge, quietly = T)
-library(Matrix)
- - - -

Here, we first simulated data from a multivariate gaussian distribution based on an underlying scale-free network structure in the precision matrix. We tried to match the number of samples to the number of samples in our empirical experiments done with GTEx data. This simulation included 5000 genes across 350 samples. We found that the number of false discoveries is much higher in confounded data. Further, this analysis shows that PC correction increased the number of true positives in the inferred networks - numbers that were similar to networks obtained from original simulated data(without confounding).

- - - -
set.seed(101)
-## generate simulated scale free network
-dat <- huge.generator(n = 350, d = 5000, graph = "scale-free", v = NULL, u = NULL,
-               g = NULL, prob = NULL, vis = F, verbose = TRUE)
- - -
Generating data from the multivariate normal distribution with the scale-free graph structure....done.
- - -
dat.theta <- dat$theta
-rm(dat)
-## load networks inferred with simulated data
-sim.net <- readRDS("sim.Rds")
-sim.confounded.net <- readRDS("sim_confounded.Rds")
-sim.corrected.net <- readRDS("sim_corrected.Rds")
-a=5
-## Count the total number of edges in the network
-true_ecount <- sum(dat.theta == 1 & col(dat.theta) < row(dat.theta))
-confounded_ecount <- sapply(sim.confounded.net, function(x) sum(x == 1 & col(x) < row(x)))
-sim_ecount <- sapply(sim.net,function(x) sum(x == 1 & col(x) < row(x)))
-corrected_ecount <- sapply(sim.corrected.net, function(x) sum(x == 1 & col(x) < row(x)))
-gc()
- - -
           used  (Mb) gc trigger  (Mb)   max used    (Mb)
-Ncells  2603099 139.1    4738752 253.1    4738752   253.1
-Vcells 20888753 159.4  105872022 807.8 2108080318 16083.4
- - -
## count the number of edges common with the simulated network
-confounded_common <- sapply(sim.confounded.net, function(x,y) {sum((x+y) == 2 & col(x) < row(x))}, dat.theta)
-sim_common <- sapply(sim.net, function(x,y) {sum((x+y) == 2 & col(x) < row(x))}, dat.theta)
-corrected_common <- sapply(sim.corrected.net, function(x,y) {sum((x+y) == 2 & col(x) < row(x))}, dat.theta)
-print(paste("The number of edges in the true network:", true_ecount))
- - -
[1] "The number of edges in the true network: 4999"
- - -
print(paste("The number of edges in the inferred network", sim_ecount[a]))
- - -
[1] "The number of edges in the inferred network 15284"
- - -
print(paste("The number of edges in the confounded network", confounded_ecount[a]))
- - -
[1] "The number of edges in the confounded network 240955"
- - -
print(paste("The number of edges in the corrected network", corrected_ecount[a]))
- - -
[1] "The number of edges in the corrected network 15730"
- - -
print(paste("The number common edges in the confounded and true network", confounded_common[a]))
- - -
[1] "The number common edges in the confounded and true network 163"
- - -
print(paste("The number common edges in the inferred and true network", sim_common[a]))
- - -
[1] "The number common edges in the inferred and true network 399"
- - -
print(paste("The number common edges in the inferred and true network", corrected_common[a]))
- - -
[1] "The number common edges in the inferred and true network 395"
- - - -

Next, we checked the proportion of edges from original simulated data networks that were also identified in PC corrected networks, and proportion of edges from original simulated data networks that were also identified in confounded networks.

- - - -
sim_corrected_common <- mapply(function(x,y){
-  sum(x+y == 2 & col(x) < row(x))
-}, sim.net, sim.corrected.net)
-sim_corrected_common/sim_ecount
- - -
 [1] 0.9471285 0.9508433 0.9498509 0.9429816 0.9325438 0.9284082 0.9000000 0.9565217 1.0000000
-[10] 1.0000000 1.0000000       NaN       NaN       NaN       NaN       NaN       NaN       NaN
-[19]       NaN       NaN       NaN       NaN       NaN       NaN       NaN       NaN       NaN
-[28]       NaN       NaN
- - -
sim_confounded_common <- mapply(function(x,y){
-  sum(x+y == 2 & col(x) < row(x))
-}, sim.net, sim.confounded.net)
-sim_confounded_common/sim_ecount
- - -
 [1] 0.02322967 0.02221992 0.02121465 0.01983463 0.02152578 0.02208682 0.03846154 0.13043478
- [9] 0.12500000 0.00000000 0.00000000        NaN        NaN        NaN        NaN        NaN
-[17]        NaN        NaN        NaN        NaN        NaN        NaN        NaN        NaN
-[25]        NaN        NaN        NaN        NaN        NaN
- - - -
LS0tDQp0aXRsZTogIlNpbXVsYXRpb246IFNjYWxlLWZyZWUgbmV0d29ya3MgbWF0Y2hlZCB0byBHVEV4Ig0Kb3V0cHV0Og0KICBodG1sX25vdGVib29rOiBkZWZhdWx0DQogIGh0bWxfZG9jdW1lbnQ6DQogICAgZGZfcHJpbnQ6IHBhZ2VkDQogIHBkZl9kb2N1bWVudDogZGVmYXVsdA0KLS0tDQoNCmBgYHtyLCBtZXNzYWdlPUZBTFNFfQ0KbGlicmFyeShodWdlLCBxdWlldGx5ID0gVCkNCmxpYnJhcnkoTWF0cml4KQ0KYGBgDQoNCkhlcmUsIHdlIGZpcnN0IHNpbXVsYXRlZCBkYXRhIGZyb20gYSBtdWx0aXZhcmlhdGUgZ2F1c3NpYW4gZGlzdHJpYnV0aW9uIGJhc2VkIG9uIGFuIHVuZGVybHlpbmcgc2NhbGUtZnJlZSBuZXR3b3JrIHN0cnVjdHVyZSBpbiB0aGUgcHJlY2lzaW9uIG1hdHJpeC4gV2UgdHJpZWQgdG8gbWF0Y2ggdGhlIG51bWJlciBvZiBzYW1wbGVzIHRvIHRoZSBudW1iZXIgb2Ygc2FtcGxlcyBpbiBvdXIgZW1waXJpY2FsIGV4cGVyaW1lbnRzIGRvbmUgd2l0aCBHVEV4IGRhdGEuIFRoaXMgc2ltdWxhdGlvbiBpbmNsdWRlZCA1MDAwIGdlbmVzIGFjcm9zcyAzNTAgc2FtcGxlcy4gV2UgZm91bmQgdGhhdCB0aGUgbnVtYmVyIG9mIGZhbHNlIGRpc2NvdmVyaWVzIGlzIG11Y2ggaGlnaGVyIGluIGNvbmZvdW5kZWQgZGF0YS4gRnVydGhlciwgdGhpcyBhbmFseXNpcyBzaG93cyB0aGF0IFBDIGNvcnJlY3Rpb24gaW5jcmVhc2VkIHRoZSBudW1iZXIgb2YgdHJ1ZSBwb3NpdGl2ZXMgaW4gdGhlIGluZmVycmVkIG5ldHdvcmtzIC0gbnVtYmVycyB0aGF0IHdlcmUgc2ltaWxhciB0byBuZXR3b3JrcyBvYnRhaW5lZCBmcm9tIG9yaWdpbmFsIHNpbXVsYXRlZCBkYXRhKHdpdGhvdXQgY29uZm91bmRpbmcpLiANCmBgYHtyfQ0Kc2V0LnNlZWQoMTAxKQ0KIyMgZ2VuZXJhdGUgc2ltdWxhdGVkIHNjYWxlIGZyZWUgbmV0d29yaw0KZGF0IDwtIGh1Z2UuZ2VuZXJhdG9yKG4gPSAzNTAsIGQgPSA1MDAwLCBncmFwaCA9ICJzY2FsZS1mcmVlIiwgdiA9IE5VTEwsIHUgPSBOVUxMLA0KICAgICAgICAgICAgICAgZyA9IE5VTEwsIHByb2IgPSBOVUxMLCB2aXMgPSBGLCB2ZXJib3NlID0gVFJVRSkNCg0KDQpkYXQudGhldGEgPC0gZGF0JHRoZXRhDQoNCnJtKGRhdCkNCiMjIGxvYWQgbmV0d29ya3MgaW5mZXJyZWQgd2l0aCBzaW11bGF0ZWQgZGF0YQ0Kc2ltLm5ldCA8LSByZWFkUkRTKCJzaW0uUmRzIikNCnNpbS5jb25mb3VuZGVkLm5ldCA8LSByZWFkUkRTKCJzaW1fY29uZm91bmRlZC5SZHMiKQ0Kc2ltLmNvcnJlY3RlZC5uZXQgPC0gcmVhZFJEUygic2ltX2NvcnJlY3RlZC5SZHMiKQ0KDQphPTUNCg0KIyMgQ291bnQgdGhlIHRvdGFsIG51bWJlciBvZiBlZGdlcyBpbiB0aGUgbmV0d29yaw0KdHJ1ZV9lY291bnQgPC0gc3VtKGRhdC50aGV0YSA9PSAxICYgY29sKGRhdC50aGV0YSkgPCByb3coZGF0LnRoZXRhKSkNCmNvbmZvdW5kZWRfZWNvdW50IDwtIHNhcHBseShzaW0uY29uZm91bmRlZC5uZXQsIGZ1bmN0aW9uKHgpIHN1bSh4ID09IDEgJiBjb2woeCkgPCByb3coeCkpKQ0Kc2ltX2Vjb3VudCA8LSBzYXBwbHkoc2ltLm5ldCxmdW5jdGlvbih4KSBzdW0oeCA9PSAxICYgY29sKHgpIDwgcm93KHgpKSkNCmNvcnJlY3RlZF9lY291bnQgPC0gc2FwcGx5KHNpbS5jb3JyZWN0ZWQubmV0LCBmdW5jdGlvbih4KSBzdW0oeCA9PSAxICYgY29sKHgpIDwgcm93KHgpKSkNCg0KZ2MoKQ0KIyMgY291bnQgdGhlIG51bWJlciBvZiBlZGdlcyBjb21tb24gd2l0aCB0aGUgc2ltdWxhdGVkIG5ldHdvcmsNCmNvbmZvdW5kZWRfY29tbW9uIDwtIHNhcHBseShzaW0uY29uZm91bmRlZC5uZXQsIGZ1bmN0aW9uKHgseSkge3N1bSgoeCt5KSA9PSAyICYgY29sKHgpIDwgcm93KHgpKX0sIGRhdC50aGV0YSkNCnNpbV9jb21tb24gPC0gc2FwcGx5KHNpbS5uZXQsIGZ1bmN0aW9uKHgseSkge3N1bSgoeCt5KSA9PSAyICYgY29sKHgpIDwgcm93KHgpKX0sIGRhdC50aGV0YSkNCmNvcnJlY3RlZF9jb21tb24gPC0gc2FwcGx5KHNpbS5jb3JyZWN0ZWQubmV0LCBmdW5jdGlvbih4LHkpIHtzdW0oKHgreSkgPT0gMiAmIGNvbCh4KSA8IHJvdyh4KSl9LCBkYXQudGhldGEpDQoNCnByaW50KHBhc3RlKCJUaGUgbnVtYmVyIG9mIGVkZ2VzIGluIHRoZSB0cnVlIG5ldHdvcms6IiwgdHJ1ZV9lY291bnQpKQ0KcHJpbnQocGFzdGUoIlRoZSBudW1iZXIgb2YgZWRnZXMgaW4gdGhlIGluZmVycmVkIG5ldHdvcmsiLCBzaW1fZWNvdW50W2FdKSkNCnByaW50KHBhc3RlKCJUaGUgbnVtYmVyIG9mIGVkZ2VzIGluIHRoZSBjb25mb3VuZGVkIG5ldHdvcmsiLCBjb25mb3VuZGVkX2Vjb3VudFthXSkpDQpwcmludChwYXN0ZSgiVGhlIG51bWJlciBvZiBlZGdlcyBpbiB0aGUgY29ycmVjdGVkIG5ldHdvcmsiLCBjb3JyZWN0ZWRfZWNvdW50W2FdKSkNCnByaW50KHBhc3RlKCJUaGUgbnVtYmVyIGNvbW1vbiBlZGdlcyBpbiB0aGUgY29uZm91bmRlZCBhbmQgdHJ1ZSBuZXR3b3JrIiwgY29uZm91bmRlZF9jb21tb25bYV0pKQ0KcHJpbnQocGFzdGUoIlRoZSBudW1iZXIgY29tbW9uIGVkZ2VzIGluIHRoZSBpbmZlcnJlZCBhbmQgdHJ1ZSBuZXR3b3JrIiwgc2ltX2NvbW1vblthXSkpDQpwcmludChwYXN0ZSgiVGhlIG51bWJlciBjb21tb24gZWRnZXMgaW4gdGhlIGluZmVycmVkIGFuZCB0cnVlIG5ldHdvcmsiLCBjb3JyZWN0ZWRfY29tbW9uW2FdKSkNCg0KYGBgDQpOZXh0LCB3ZSBjaGVja2VkIHRoZSBwcm9wb3J0aW9uIG9mIGVkZ2VzIGZyb20gb3JpZ2luYWwgc2ltdWxhdGVkIGRhdGEgbmV0d29ya3MgdGhhdCB3ZXJlIGFsc28gaWRlbnRpZmllZCBpbiBQQyBjb3JyZWN0ZWQgbmV0d29ya3MsIGFuZCBwcm9wb3J0aW9uIG9mIGVkZ2VzIGZyb20gb3JpZ2luYWwgc2ltdWxhdGVkIGRhdGEgbmV0d29ya3MgdGhhdCB3ZXJlIGFsc28gaWRlbnRpZmllZCBpbiBjb25mb3VuZGVkIG5ldHdvcmtzLg0KYGBge3J9DQpzaW1fY29ycmVjdGVkX2NvbW1vbiA8LSBtYXBwbHkoZnVuY3Rpb24oeCx5KXsNCiAgc3VtKHgreSA9PSAyICYgY29sKHgpIDwgcm93KHgpKQ0KfSwgc2ltLm5ldCwgc2ltLmNvcnJlY3RlZC5uZXQpDQoNCg0Kc2ltX2NvcnJlY3RlZF9jb21tb24vc2ltX2Vjb3VudA0KDQpzaW1fY29uZm91bmRlZF9jb21tb24gPC0gbWFwcGx5KGZ1bmN0aW9uKHgseSl7DQogIHN1bSh4K3kgPT0gMiAmIGNvbCh4KSA8IHJvdyh4KSkNCn0sIHNpbS5uZXQsIHNpbS5jb25mb3VuZGVkLm5ldCkNCg0Kc2ltX2NvbmZvdW5kZWRfY29tbW9uL3NpbV9lY291bnQNCg0KYGBg
- - - -
- - - - - - - - diff --git a/publication_rmd/simulation_scale-free_matched_gtex/sim_scale_free_matched.nb.html b/publication_rmd/simulation_scale-free_matched_gtex/sim_scale_free_matched.nb.html deleted file mode 100755 index 994eb79..0000000 --- a/publication_rmd/simulation_scale-free_matched_gtex/sim_scale_free_matched.nb.html +++ /dev/null @@ -1,400 +0,0 @@ - - - - - - - - - - - - - -Simulation: Scale-free networks matched to GTEx - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - - - - -
library(huge, quietly = T)
-library(Matrix)
- - - -

Here, we first simulated data from a multivariate gaussian distribution based on an underlying scale-free network structure in the precision matrix. We tried to match the number of samples to the number of samples in our empirical experiments done with GTEx data. This simulation included 5000 genes across 350 samples. We found that the number of false discoveries is much higher in confounded data. Further, this analysis shows that PC correction increased the number of true positives in the inferred networks - numbers that were similar to networks obtained from original simulated data(without confounding).

- - - -
set.seed(101)
-## generate simulated scale free network
-dat <- huge.generator(n = 350, d = 5000, graph = "scale-free", v = NULL, u = NULL,
-               g = NULL, prob = NULL, vis = F, verbose = TRUE)
- - -
Generating data from the multivariate normal distribution with the scale-free graph structure....done.
- - -
dat.theta <- dat$theta
-rm(dat)
-## load networks inferred with simulated data
-sim.net <- readRDS("sim.Rds")
-sim.confounded.net <- readRDS("sim_confounded.Rds")
-sim.corrected.net <- readRDS("sim_corrected.Rds")
-a=5
-## Count the total number of edges in the network
-true_ecount <- sum(dat.theta == 1 & col(dat.theta) < row(dat.theta))
-confounded_ecount <- sapply(sim.confounded.net, function(x) sum(x == 1 & col(x) < row(x)))
-sim_ecount <- sapply(sim.net,function(x) sum(x == 1 & col(x) < row(x)))
-corrected_ecount <- sapply(sim.corrected.net, function(x) sum(x == 1 & col(x) < row(x)))
-gc()
- - -
           used  (Mb) gc trigger  (Mb)   max used    (Mb)
-Ncells  2603099 139.1    4738752 253.1    4738752   253.1
-Vcells 20888753 159.4  105872022 807.8 2108080318 16083.4
- - -
## count the number of edges common with the simulated network
-confounded_common <- sapply(sim.confounded.net, function(x,y) {sum((x+y) == 2 & col(x) < row(x))}, dat.theta)
-sim_common <- sapply(sim.net, function(x,y) {sum((x+y) == 2 & col(x) < row(x))}, dat.theta)
-corrected_common <- sapply(sim.corrected.net, function(x,y) {sum((x+y) == 2 & col(x) < row(x))}, dat.theta)
-print(paste("The number of edges in the true network:", true_ecount))
- - -
[1] "The number of edges in the true network: 4999"
- - -
print(paste("The number of edges in the inferred network", sim_ecount[a]))
- - -
[1] "The number of edges in the inferred network 15284"
- - -
print(paste("The number of edges in the confounded network", confounded_ecount[a]))
- - -
[1] "The number of edges in the confounded network 240955"
- - -
print(paste("The number of edges in the corrected network", corrected_ecount[a]))
- - -
[1] "The number of edges in the corrected network 15730"
- - -
print(paste("The number common edges in the confounded and true network", confounded_common[a]))
- - -
[1] "The number common edges in the confounded and true network 163"
- - -
print(paste("The number common edges in the inferred and true network", sim_common[a]))
- - -
[1] "The number common edges in the inferred and true network 399"
- - -
print(paste("The number common edges in the inferred and true network", corrected_common[a]))
- - -
[1] "The number common edges in the inferred and true network 395"
- - - -

Next, we checked the proportion of edges from original simulated data networks that were also identified in PC corrected networks, and proportion of edges from original simulated data networks that were also identified in confounded networks.

- - - -
sim_corrected_common <- mapply(function(x,y){
-  sum(x+y == 2 & col(x) < row(x))
-}, sim.net, sim.corrected.net)
-sim_corrected_common/sim_ecount
- - -
 [1] 0.9471285 0.9508433 0.9498509 0.9429816 0.9325438 0.9284082 0.9000000 0.9565217 1.0000000
-[10] 1.0000000 1.0000000       NaN       NaN       NaN       NaN       NaN       NaN       NaN
-[19]       NaN       NaN       NaN       NaN       NaN       NaN       NaN       NaN       NaN
-[28]       NaN       NaN
- - -
sim_confounded_common <- mapply(function(x,y){
-  sum(x+y == 2 & col(x) < row(x))
-}, sim.net, sim.confounded.net)
-sim_confounded_common/sim_ecount
- - -
 [1] 0.02322967 0.02221992 0.02121465 0.01983463 0.02152578 0.02208682 0.03846154 0.13043478
- [9] 0.12500000 0.00000000 0.00000000        NaN        NaN        NaN        NaN        NaN
-[17]        NaN        NaN        NaN        NaN        NaN        NaN        NaN        NaN
-[25]        NaN        NaN        NaN        NaN        NaN
- - - -
LS0tDQp0aXRsZTogIlNpbXVsYXRpb246IFNjYWxlLWZyZWUgbmV0d29ya3MgbWF0Y2hlZCB0byBHVEV4Ig0Kb3V0cHV0Og0KICBodG1sX25vdGVib29rOiBkZWZhdWx0DQogIGh0bWxfZG9jdW1lbnQ6DQogICAgZGZfcHJpbnQ6IHBhZ2VkDQogIHBkZl9kb2N1bWVudDogZGVmYXVsdA0KLS0tDQoNCmBgYHtyLCBtZXNzYWdlPUZBTFNFfQ0KbGlicmFyeShodWdlLCBxdWlldGx5ID0gVCkNCmxpYnJhcnkoTWF0cml4KQ0KYGBgDQoNCkhlcmUsIHdlIGZpcnN0IHNpbXVsYXRlZCBkYXRhIGZyb20gYSBtdWx0aXZhcmlhdGUgZ2F1c3NpYW4gZGlzdHJpYnV0aW9uIGJhc2VkIG9uIGFuIHVuZGVybHlpbmcgc2NhbGUtZnJlZSBuZXR3b3JrIHN0cnVjdHVyZSBpbiB0aGUgcHJlY2lzaW9uIG1hdHJpeC4gV2UgdHJpZWQgdG8gbWF0Y2ggdGhlIG51bWJlciBvZiBzYW1wbGVzIHRvIHRoZSBudW1iZXIgb2Ygc2FtcGxlcyBpbiBvdXIgZW1waXJpY2FsIGV4cGVyaW1lbnRzIGRvbmUgd2l0aCBHVEV4IGRhdGEuIFRoaXMgc2ltdWxhdGlvbiBpbmNsdWRlZCA1MDAwIGdlbmVzIGFjcm9zcyAzNTAgc2FtcGxlcy4gV2UgZm91bmQgdGhhdCB0aGUgbnVtYmVyIG9mIGZhbHNlIGRpc2NvdmVyaWVzIGlzIG11Y2ggaGlnaGVyIGluIGNvbmZvdW5kZWQgZGF0YS4gRnVydGhlciwgdGhpcyBhbmFseXNpcyBzaG93cyB0aGF0IFBDIGNvcnJlY3Rpb24gaW5jcmVhc2VkIHRoZSBudW1iZXIgb2YgdHJ1ZSBwb3NpdGl2ZXMgaW4gdGhlIGluZmVycmVkIG5ldHdvcmtzIC0gbnVtYmVycyB0aGF0IHdlcmUgc2ltaWxhciB0byBuZXR3b3JrcyBvYnRhaW5lZCBmcm9tIG9yaWdpbmFsIHNpbXVsYXRlZCBkYXRhKHdpdGhvdXQgY29uZm91bmRpbmcpLiANCmBgYHtyfQ0Kc2V0LnNlZWQoMTAxKQ0KIyMgZ2VuZXJhdGUgc2ltdWxhdGVkIHNjYWxlIGZyZWUgbmV0d29yaw0KZGF0IDwtIGh1Z2UuZ2VuZXJhdG9yKG4gPSAzNTAsIGQgPSA1MDAwLCBncmFwaCA9ICJzY2FsZS1mcmVlIiwgdiA9IE5VTEwsIHUgPSBOVUxMLA0KICAgICAgICAgICAgICAgZyA9IE5VTEwsIHByb2IgPSBOVUxMLCB2aXMgPSBGLCB2ZXJib3NlID0gVFJVRSkNCg0KDQpkYXQudGhldGEgPC0gZGF0JHRoZXRhDQoNCnJtKGRhdCkNCiMjIGxvYWQgbmV0d29ya3MgaW5mZXJyZWQgd2l0aCBzaW11bGF0ZWQgZGF0YQ0Kc2ltLm5ldCA8LSByZWFkUkRTKCJzaW0uUmRzIikNCnNpbS5jb25mb3VuZGVkLm5ldCA8LSByZWFkUkRTKCJzaW1fY29uZm91bmRlZC5SZHMiKQ0Kc2ltLmNvcnJlY3RlZC5uZXQgPC0gcmVhZFJEUygic2ltX2NvcnJlY3RlZC5SZHMiKQ0KDQphPTUNCg0KIyMgQ291bnQgdGhlIHRvdGFsIG51bWJlciBvZiBlZGdlcyBpbiB0aGUgbmV0d29yaw0KdHJ1ZV9lY291bnQgPC0gc3VtKGRhdC50aGV0YSA9PSAxICYgY29sKGRhdC50aGV0YSkgPCByb3coZGF0LnRoZXRhKSkNCmNvbmZvdW5kZWRfZWNvdW50IDwtIHNhcHBseShzaW0uY29uZm91bmRlZC5uZXQsIGZ1bmN0aW9uKHgpIHN1bSh4ID09IDEgJiBjb2woeCkgPCByb3coeCkpKQ0Kc2ltX2Vjb3VudCA8LSBzYXBwbHkoc2ltLm5ldCxmdW5jdGlvbih4KSBzdW0oeCA9PSAxICYgY29sKHgpIDwgcm93KHgpKSkNCmNvcnJlY3RlZF9lY291bnQgPC0gc2FwcGx5KHNpbS5jb3JyZWN0ZWQubmV0LCBmdW5jdGlvbih4KSBzdW0oeCA9PSAxICYgY29sKHgpIDwgcm93KHgpKSkNCg0KZ2MoKQ0KIyMgY291bnQgdGhlIG51bWJlciBvZiBlZGdlcyBjb21tb24gd2l0aCB0aGUgc2ltdWxhdGVkIG5ldHdvcmsNCmNvbmZvdW5kZWRfY29tbW9uIDwtIHNhcHBseShzaW0uY29uZm91bmRlZC5uZXQsIGZ1bmN0aW9uKHgseSkge3N1bSgoeCt5KSA9PSAyICYgY29sKHgpIDwgcm93KHgpKX0sIGRhdC50aGV0YSkNCnNpbV9jb21tb24gPC0gc2FwcGx5KHNpbS5uZXQsIGZ1bmN0aW9uKHgseSkge3N1bSgoeCt5KSA9PSAyICYgY29sKHgpIDwgcm93KHgpKX0sIGRhdC50aGV0YSkNCmNvcnJlY3RlZF9jb21tb24gPC0gc2FwcGx5KHNpbS5jb3JyZWN0ZWQubmV0LCBmdW5jdGlvbih4LHkpIHtzdW0oKHgreSkgPT0gMiAmIGNvbCh4KSA8IHJvdyh4KSl9LCBkYXQudGhldGEpDQoNCnByaW50KHBhc3RlKCJUaGUgbnVtYmVyIG9mIGVkZ2VzIGluIHRoZSB0cnVlIG5ldHdvcms6IiwgdHJ1ZV9lY291bnQpKQ0KcHJpbnQocGFzdGUoIlRoZSBudW1iZXIgb2YgZWRnZXMgaW4gdGhlIGluZmVycmVkIG5ldHdvcmsiLCBzaW1fZWNvdW50W2FdKSkNCnByaW50KHBhc3RlKCJUaGUgbnVtYmVyIG9mIGVkZ2VzIGluIHRoZSBjb25mb3VuZGVkIG5ldHdvcmsiLCBjb25mb3VuZGVkX2Vjb3VudFthXSkpDQpwcmludChwYXN0ZSgiVGhlIG51bWJlciBvZiBlZGdlcyBpbiB0aGUgY29ycmVjdGVkIG5ldHdvcmsiLCBjb3JyZWN0ZWRfZWNvdW50W2FdKSkNCnByaW50KHBhc3RlKCJUaGUgbnVtYmVyIGNvbW1vbiBlZGdlcyBpbiB0aGUgY29uZm91bmRlZCBhbmQgdHJ1ZSBuZXR3b3JrIiwgY29uZm91bmRlZF9jb21tb25bYV0pKQ0KcHJpbnQocGFzdGUoIlRoZSBudW1iZXIgY29tbW9uIGVkZ2VzIGluIHRoZSBpbmZlcnJlZCBhbmQgdHJ1ZSBuZXR3b3JrIiwgc2ltX2NvbW1vblthXSkpDQpwcmludChwYXN0ZSgiVGhlIG51bWJlciBjb21tb24gZWRnZXMgaW4gdGhlIGluZmVycmVkIGFuZCB0cnVlIG5ldHdvcmsiLCBjb3JyZWN0ZWRfY29tbW9uW2FdKSkNCg0KYGBgDQpOZXh0LCB3ZSBjaGVja2VkIHRoZSBwcm9wb3J0aW9uIG9mIGVkZ2VzIGZyb20gb3JpZ2luYWwgc2ltdWxhdGVkIGRhdGEgbmV0d29ya3MgdGhhdCB3ZXJlIGFsc28gaWRlbnRpZmllZCBpbiBQQyBjb3JyZWN0ZWQgbmV0d29ya3MsIGFuZCBwcm9wb3J0aW9uIG9mIGVkZ2VzIGZyb20gb3JpZ2luYWwgc2ltdWxhdGVkIGRhdGEgbmV0d29ya3MgdGhhdCB3ZXJlIGFsc28gaWRlbnRpZmllZCBpbiBjb25mb3VuZGVkIG5ldHdvcmtzLg0KYGBge3J9DQpzaW1fY29ycmVjdGVkX2NvbW1vbiA8LSBtYXBwbHkoZnVuY3Rpb24oeCx5KXsNCiAgc3VtKHgreSA9PSAyICYgY29sKHgpIDwgcm93KHgpKQ0KfSwgc2ltLm5ldCwgc2ltLmNvcnJlY3RlZC5uZXQpDQoNCg0Kc2ltX2NvcnJlY3RlZF9jb21tb24vc2ltX2Vjb3VudA0KDQpzaW1fY29uZm91bmRlZF9jb21tb24gPC0gbWFwcGx5KGZ1bmN0aW9uKHgseSl7DQogIHN1bSh4K3kgPT0gMiAmIGNvbCh4KSA8IHJvdyh4KSkNCn0sIHNpbS5uZXQsIHNpbS5jb25mb3VuZGVkLm5ldCkNCg0Kc2ltX2NvbmZvdW5kZWRfY29tbW9uL3NpbV9lY291bnQNCg0KYGBg
- - - -
- - - - - - - - diff --git a/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/generate_data.sh b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/generate_data.sh new file mode 100644 index 0000000..68eb139 --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/generate_data.sh @@ -0,0 +1,16 @@ +#!/bin/bash -l +#SBATCH +#SBATCH --mail-type=END +#SBATCH --mail-user=pparsan1@jhu.edu +#SBATCH --time=1:0:0 +#SBATCH --partition=lrgmem +#SBATCH --mem=10G + + +module load gcc/5.5.0 +module load R/3.4.0 + +cd /work-zfs/abattle4/parsana/networks_correction/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/ +Rscript generate_sim_data.R >log/generate_sim_data.log + + diff --git a/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/generate_sim_data.R b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/generate_sim_data.R new file mode 100644 index 0000000..87ef777 --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/generate_sim_data.R @@ -0,0 +1,57 @@ +library(sva) +library(huge) +library(QUIC) +library(parallel) +#simulate data + +## generate simulated scale free network + +if(!any(grepl("simulated_data.Rds", dir()))){ +set.seed(101) +## generate simulated scale free network +dat <- huge.generator(n = 350, d = 5000, graph = "scale-free", v = NULL, u = NULL, + g = NULL, prob = NULL, vis = F, verbose = TRUE) + +sim.dat <- dat$data +sim.theta <- dat$theta +saveRDS(sim.dat, file = "simulated_data.Rds") +saveRDS(sim.theta, file = "simulated_network.Rds") +}else{ + sim.dat <- readRDS("simulated_data.Rds") +} + +n <- nrow(sim.dat) +p <- ncol(sim.dat) + +sim.dat.scaled <- scale(sim.dat) + +sim.confounded=sim.dat.scaled +set.seed(101) +grp1=rnorm(n) +set.seed(102) +grp2=rnorm(n) +scalar_constant = 2 +set.seed(103) +prob1 = runif(1:1500, 0.1, 1) +prob2 = runif(1:1500, 0.1, 1) + +probcount = 0 + +for(i in c(4001:4500, 1001:2000)){ + probcount = probcount + 1 + sim.confounded[,i] = sim.confounded[,i] + (scalar_constant * prob1[probcount]*grp1) + (scalar_constant * prob2[probcount]*grp2) +} + +### correct data +mod=matrix(1,nrow=dim(sim.confounded)[1],ncol=1) +colnames(mod)="Intercept" +nsv=num.sv(t(sim.confounded),mod, method = "be") +print(paste("the number of PCs estimated to be removed:", nsv)) +ss=svd(scale(sim.confounded)) +grp.est=ss$u[,1:nsv] +sim.corrected=lm(sim.confounded~grp.est)$residuals + +saveRDS(sim.corrected, file = "simulated_corrected.Rds") +saveRDS(sim.confounded, file = "simulated_confounded.Rds") +#saveRDS(sim.dat, file = "simulated_data.Rds") +#saveRDS(sim.theta, file = "simulated_network.Rds") diff --git a/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/infer_nets.R b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/infer_nets.R new file mode 100644 index 0000000..3db93e3 --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/infer_nets.R @@ -0,0 +1,43 @@ +library(sva) +library(huge) +library(QUIC) +library(parallel) +library(Matrix) + +######## functions +q_normalize <- function(dat){ + n = nrow(dat) + p = ncol(dat) + rank.dat = dat # matrix for ranking + for (i in 1:p){ + rank.dat[,i] = rank(dat[,i]) + } + U = rank.dat/(n+1) + norm.dat = qnorm(U) + norm.dat +} +################### + + +lambda=seq(0.1,1,length.out=20) + +## read input +inputargs <- commandArgs(TRUE) +fn <- inputargs[1] +#sn <- inputargs[2] +## load data +sim.dat <- readRDS(paste("simulated_", fn, ".Rds", sep = "")) + +## build networks +sim.cov <- cov(scale(sim.dat)) + +sim.net <- mclapply(lambda, function(graphlasso, thisdat){ + thisnet <- QUIC::QUIC(S = thisdat, rho = graphlasso)$X + thisnet[thisnet!=0] <- 1 + thisnet <- Matrix(thisnet, sparse = T) + thisnet +}, sim.cov, mc.cores = 29) + + +saveRDS(sim.net, file = paste("networks_", fn, ".Rds", sep = "")) + diff --git a/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/run_sim.sh b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/run_sim.sh new file mode 100644 index 0000000..3b63f6d --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/run_sim.sh @@ -0,0 +1,15 @@ +#!/bin/bash -l +#SBATCH +#SBATCH --mail-type=END +#SBATCH --mail-user=pparsan1@jhu.edu +#SBATCH --time=72:0:0 +#SBATCH --partition=lrgmem +#SBATCH --ntasks-per-node=10 +#SBATCH --mem=70G + + +module load gcc/5.5.0 +module load R/3.4.0 + +cd /work-zfs/abattle4/parsana/networks_correction/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound +Rscript infer_nets.R $1 >log/$1.log diff --git a/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/slurm-32414331.out b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/slurm-32414331.out new file mode 100644 index 0000000..56633f2 --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/slurm-32414331.out @@ -0,0 +1,35 @@ + +Lmod is automatically replacing "intel/18.0" with "gcc/5.5.0". + + +Due to MODULEPATH changes, the following have been reloaded: + 1) R/3.4.0 2) openmpi/3.1 + +Loading required package: mgcv +Loading required package: nlme +This is mgcv 1.8-25. For overview type 'help("mgcv-package")'. +Loading required package: genefilter +Loading required package: BiocParallel +Loading required package: methods +Loading required package: Matrix +Loading required package: lattice +Loading required package: igraph + +Attaching package: ‘igraph’ + +The following objects are masked from ‘package:stats’: + + decompose, spectrum + +The following object is masked from ‘package:base’: + + union + +Loading required package: MASS + +Attaching package: ‘MASS’ + +The following object is masked from ‘package:genefilter’: + + area + diff --git a/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/slurm-32415126.out b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/slurm-32415126.out new file mode 100644 index 0000000..56633f2 --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/slurm-32415126.out @@ -0,0 +1,35 @@ + +Lmod is automatically replacing "intel/18.0" with "gcc/5.5.0". + + +Due to MODULEPATH changes, the following have been reloaded: + 1) R/3.4.0 2) openmpi/3.1 + +Loading required package: mgcv +Loading required package: nlme +This is mgcv 1.8-25. For overview type 'help("mgcv-package")'. +Loading required package: genefilter +Loading required package: BiocParallel +Loading required package: methods +Loading required package: Matrix +Loading required package: lattice +Loading required package: igraph + +Attaching package: ‘igraph’ + +The following objects are masked from ‘package:stats’: + + decompose, spectrum + +The following object is masked from ‘package:base’: + + union + +Loading required package: MASS + +Attaching package: ‘MASS’ + +The following object is masked from ‘package:genefilter’: + + area + diff --git a/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/slurm-32415127.out b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/slurm-32415127.out new file mode 100644 index 0000000..56633f2 --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/slurm-32415127.out @@ -0,0 +1,35 @@ + +Lmod is automatically replacing "intel/18.0" with "gcc/5.5.0". + + +Due to MODULEPATH changes, the following have been reloaded: + 1) R/3.4.0 2) openmpi/3.1 + +Loading required package: mgcv +Loading required package: nlme +This is mgcv 1.8-25. For overview type 'help("mgcv-package")'. +Loading required package: genefilter +Loading required package: BiocParallel +Loading required package: methods +Loading required package: Matrix +Loading required package: lattice +Loading required package: igraph + +Attaching package: ‘igraph’ + +The following objects are masked from ‘package:stats’: + + decompose, spectrum + +The following object is masked from ‘package:base’: + + union + +Loading required package: MASS + +Attaching package: ‘MASS’ + +The following object is masked from ‘package:genefilter’: + + area + diff --git a/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/slurm-32415128.out b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/slurm-32415128.out new file mode 100644 index 0000000..56633f2 --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/slurm-32415128.out @@ -0,0 +1,35 @@ + +Lmod is automatically replacing "intel/18.0" with "gcc/5.5.0". + + +Due to MODULEPATH changes, the following have been reloaded: + 1) R/3.4.0 2) openmpi/3.1 + +Loading required package: mgcv +Loading required package: nlme +This is mgcv 1.8-25. For overview type 'help("mgcv-package")'. +Loading required package: genefilter +Loading required package: BiocParallel +Loading required package: methods +Loading required package: Matrix +Loading required package: lattice +Loading required package: igraph + +Attaching package: ‘igraph’ + +The following objects are masked from ‘package:stats’: + + decompose, spectrum + +The following object is masked from ‘package:base’: + + union + +Loading required package: MASS + +Attaching package: ‘MASS’ + +The following object is masked from ‘package:genefilter’: + + area + diff --git a/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/submit_jobs.sh b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/submit_jobs.sh new file mode 100644 index 0000000..6ec2f2d --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound/submit_jobs.sh @@ -0,0 +1,3 @@ +sbatch run_sim.sh data +sbatch run_sim.sh confounded +sbatch run_sim.sh corrected diff --git a/publication_rmd/simulation_scale-free_matched_gtex/weighted_confounded_largesample/generate_data.sh b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confounded_largesample/generate_data.sh new file mode 100644 index 0000000..f86ae61 --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confounded_largesample/generate_data.sh @@ -0,0 +1,16 @@ +#!/bin/bash -l +#SBATCH +#SBATCH --mail-type=END +#SBATCH --mail-user=pparsan1@jhu.edu +#SBATCH --time=20:0:0 +#SBATCH --partition=lrgmem +#SBATCH --mem=70G + + +module load gcc/5.5.0 +module load R/3.4.0 + +cd /work-zfs/abattle4/parsana/networks_correction/publication_rmd/simulation_scale-free_matched_gtex/weighted_confounded_largesample/ +Rscript generate_sim_data.R >log/generate_sim_data.log + + diff --git a/publication_rmd/simulation_scale-free_matched_gtex/weighted_confounded_largesample/generate_sim_data.R b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confounded_largesample/generate_sim_data.R new file mode 100644 index 0000000..1216cd5 --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confounded_largesample/generate_sim_data.R @@ -0,0 +1,57 @@ +library(sva) +library(huge) +library(QUIC) +library(parallel) +#simulate data + +## generate simulated scale free network + +if(!any(grepl("simulated_data.Rds", dir()))){ +set.seed(101) +## generate simulated scale free network +dat <- huge.generator(n = 30000, d = 5000, graph = "scale-free", v = NULL, u = NULL, + g = NULL, prob = NULL, vis = F, verbose = TRUE) + +sim.dat <- dat$data +sim.theta <- dat$theta +saveRDS(sim.dat, file = "simulated_data.Rds") +saveRDS(sim.theta, file = "simulated_network.Rds") +}else{ + sim.dat <- readRDS("simulated_data.Rds") +} + +n <- nrow(sim.dat) +p <- ncol(sim.dat) + +sim.dat.scaled <- scale(sim.dat) + +sim.confounded=sim.dat.scaled +set.seed(101) +grp1=rnorm(n) +set.seed(102) +grp2=rnorm(n) +scalar_constant = 2 +set.seed(103) +prob1 = runif(1:1500, 0.1, 1) +prob2 = runif(1:1500, 0.1, 1) + +probcount = 0 + +for(i in c(4001:4500, 1001:2000)){ + probcount = probcount + 1 + sim.confounded[,i] = sim.confounded[,i] + (scalar_constant * prob1[probcount]*grp1) + (scalar_constant * prob2[probcount]*grp2) +} + +### correct data +mod=matrix(1,nrow=dim(sim.confounded)[1],ncol=1) +colnames(mod)="Intercept" +nsv=num.sv(t(sim.confounded),mod, method = "be") +print(paste("the number of PCs estimated to be removed:", nsv)) +ss=svd(scale(sim.confounded)) +grp.est=ss$u[,1:nsv] +sim.corrected=lm(sim.confounded~grp.est)$residuals + +saveRDS(sim.corrected, file = "simulated_corrected.Rds") +saveRDS(sim.confounded, file = "simulated_confounded.Rds") +#saveRDS(sim.dat, file = "simulated_data.Rds") +#saveRDS(sim.theta, file = "simulated_network.Rds") diff --git a/publication_rmd/simulation_scale-free_matched_gtex/weighted_confounded_largesample/infer_nets.R b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confounded_largesample/infer_nets.R new file mode 100644 index 0000000..3db93e3 --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confounded_largesample/infer_nets.R @@ -0,0 +1,43 @@ +library(sva) +library(huge) +library(QUIC) +library(parallel) +library(Matrix) + +######## functions +q_normalize <- function(dat){ + n = nrow(dat) + p = ncol(dat) + rank.dat = dat # matrix for ranking + for (i in 1:p){ + rank.dat[,i] = rank(dat[,i]) + } + U = rank.dat/(n+1) + norm.dat = qnorm(U) + norm.dat +} +################### + + +lambda=seq(0.1,1,length.out=20) + +## read input +inputargs <- commandArgs(TRUE) +fn <- inputargs[1] +#sn <- inputargs[2] +## load data +sim.dat <- readRDS(paste("simulated_", fn, ".Rds", sep = "")) + +## build networks +sim.cov <- cov(scale(sim.dat)) + +sim.net <- mclapply(lambda, function(graphlasso, thisdat){ + thisnet <- QUIC::QUIC(S = thisdat, rho = graphlasso)$X + thisnet[thisnet!=0] <- 1 + thisnet <- Matrix(thisnet, sparse = T) + thisnet +}, sim.cov, mc.cores = 29) + + +saveRDS(sim.net, file = paste("networks_", fn, ".Rds", sep = "")) + diff --git a/publication_rmd/simulation_scale-free_matched_gtex/weighted_confounded_largesample/run_sim.sh b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confounded_largesample/run_sim.sh new file mode 100644 index 0000000..3b63f6d --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confounded_largesample/run_sim.sh @@ -0,0 +1,15 @@ +#!/bin/bash -l +#SBATCH +#SBATCH --mail-type=END +#SBATCH --mail-user=pparsan1@jhu.edu +#SBATCH --time=72:0:0 +#SBATCH --partition=lrgmem +#SBATCH --ntasks-per-node=10 +#SBATCH --mem=70G + + +module load gcc/5.5.0 +module load R/3.4.0 + +cd /work-zfs/abattle4/parsana/networks_correction/publication_rmd/simulation_scale-free_matched_gtex/weighted_confound +Rscript infer_nets.R $1 >log/$1.log diff --git a/publication_rmd/simulation_scale-free_matched_gtex/weighted_confounded_largesample/slurm-32479419.out b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confounded_largesample/slurm-32479419.out new file mode 100644 index 0000000..d265055 --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confounded_largesample/slurm-32479419.out @@ -0,0 +1,36 @@ + +Lmod is automatically replacing "intel/18.0" with "gcc/5.5.0". + + +Due to MODULEPATH changes, the following have been reloaded: + 1) R/3.4.0 2) openmpi/3.1 + +Loading required package: mgcv +Loading required package: nlme +This is mgcv 1.8-25. For overview type 'help("mgcv-package")'. +Loading required package: genefilter +Loading required package: BiocParallel +Loading required package: methods +Loading required package: Matrix +Loading required package: lattice +Loading required package: igraph + +Attaching package: ‘igraph’ + +The following objects are masked from ‘package:stats’: + + decompose, spectrum + +The following object is masked from ‘package:base’: + + union + +Loading required package: MASS + +Attaching package: ‘MASS’ + +The following object is masked from ‘package:genefilter’: + + area + +slurmstepd: error: *** JOB 32479419 ON bigmem0049 CANCELLED AT 2019-02-11T17:20:54 DUE TO TIME LIMIT *** diff --git a/publication_rmd/simulation_scale-free_matched_gtex/weighted_confounded_largesample/slurm-32481963.out b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confounded_largesample/slurm-32481963.out new file mode 100644 index 0000000..abdb61f --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confounded_largesample/slurm-32481963.out @@ -0,0 +1,37 @@ + +Lmod is automatically replacing "intel/18.0" with "gcc/5.5.0". + + +Due to MODULEPATH changes, the following have been reloaded: + 1) R/3.4.0 2) openmpi/3.1 + +Loading required package: mgcv +Loading required package: nlme +This is mgcv 1.8-25. For overview type 'help("mgcv-package")'. +Loading required package: genefilter +Loading required package: BiocParallel +Loading required package: methods +Loading required package: Matrix +Loading required package: lattice +Loading required package: igraph + +Attaching package: ‘igraph’ + +The following objects are masked from ‘package:stats’: + + decompose, spectrum + +The following object is masked from ‘package:base’: + + union + +Loading required package: MASS + +Attaching package: ‘MASS’ + +The following object is masked from ‘package:genefilter’: + + area + +/var/spool/slurm//job32481963/slurm_script: line 14: 74795 Killed Rscript generate_sim_data.R > log/generate_sim_data.log +slurmstepd: error: Detected 1 oom-kill event(s) in step 32481963.batch cgroup. Some of your processes may have been killed by the cgroup out-of-memory handler. diff --git a/publication_rmd/simulation_scale-free_matched_gtex/weighted_confounded_largesample/submit_jobs.sh b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confounded_largesample/submit_jobs.sh new file mode 100644 index 0000000..6ec2f2d --- /dev/null +++ b/publication_rmd/simulation_scale-free_matched_gtex/weighted_confounded_largesample/submit_jobs.sh @@ -0,0 +1,3 @@ +sbatch run_sim.sh data +sbatch run_sim.sh confounded +sbatch run_sim.sh corrected