From b6b0ec7e8930ed2841c33852663bf87f0f2b071b Mon Sep 17 00:00:00 2001
From: drneavin <d.neavin@garvan.org.au>
Date: Wed, 31 Aug 2022 20:34:07 +1000
Subject: [PATCH] updating scripts

---
 Clustering/All_Integrated_QC.R                |   9 +-
 Expression_Boxplots/Expression_Boxplots.R     |  66 +-
 .../pluripotency_deg_boxplots.R               | 128 +++
 .../Expression_Correlation.R                  | 260 +++++-
 Nona_multiome/cell_line_proportions.R         |   8 +-
 RNAvelocity/latent_on_seurat2.R               |  34 +-
 Variance/Distribution_tests.R                 |   7 +
 .../post_review/prepare_pseudotime.R          |   6 +-
 .../post_review/pseudotime_effect.R           | 231 ++++-
 .../post_review/pseudotime_effect.smk         |  41 +
 .../post_review/pseudotime_effect_combine.R   | 412 +++++++++
 .../post_review/pseudotime_effect_snake.sh    | 135 +++
 .../variance_partition_post_review_combine.R  | 125 ++-
 Variance/test_interaction_nb_model.R          |  89 +-
 eQTL_check/eQTL_village_interaction.sh        | 109 +++
 eQTL_check/eQTL_village_interaction.smk       |  59 ++
 eQTL_check/eQTL_village_interaction_combine.R | 357 +++++++
 .../eQTL_village_interaction_deboever.R       | 177 ++++
 .../eQTL_village_interaction_kilpinen.R       | 177 ++++
 eQTL_check/filter_snps_kilpinen.R             |  83 ++
 eQTL_check/multi-passage/filter_snps.R        |  88 ++
 eQTL_check/multi-passage/genes2test.sh        |  30 +
 eQTL_check/multi-passage/metadata_prep.R      |  18 +
 eQTL_check/multi-passage/pull_eQTL_SNPs.sh    |  85 ++
 eQTL_check/multi-passage/test_eQTL.R          | 204 ++++
 eQTL_check/multi-passage/test_eQTL.sh         | 109 +++
 eQTL_check/multi-passage/test_eQTL.smk        |  53 ++
 .../test_eQTL_combine_deboever.R              | 114 +++
 eQTL_check/pull_eQTL_SNPs.sh                  |  15 +-
 eQTL_check/test_eQTL.smk                      |  25 +-
 eQTL_check/test_eQTL_combine.R                | 177 +++-
 eQTL_check/test_eQTL_combine_deboever.R       |  96 ++
 eQTL_check/test_eQTL_combine_kilpinen.R       | 109 +++
 eQTL_check/test_eQTL_kilpinen.R               | 206 +++++
 .../variance_partition_multipassage.R         | 358 +++++++
 .../variance_partition_multipassage.smk       |  99 ++
 .../variance_partition_multipassage_combine.R | 874 ++++++++++++++++++
 ...nce_partition_multipassage_integratedSCT.R | 357 +++++++
 ...artition_multipassage_integratedSCT_Ncov.R | 358 +++++++
 .../variance_partition_multipassage_snake.sh  | 136 +++
 multi-passage/preQC.R                         | 362 +++++++-
 multi-passage/test_vireo_ambient/vireo.sh     |   6 +
 .../test_vireo_ambient/vireo_submit.sh        |  41 +
 multi-passage/transfer_data.sh                |  45 +
 scCODA/sccod.py                               | 354 +++++++
 scCODA/sccoda_plots.R                         | 105 +++
 scCODA/sccoda_preparation.R                   |  40 +
 47 files changed, 6874 insertions(+), 103 deletions(-)
 create mode 100644 Expression_Boxplots/pluripotency_deg_boxplots.R
 create mode 100644 Variance/RNAvelocity/post_review/pseudotime_effect.smk
 create mode 100644 Variance/RNAvelocity/post_review/pseudotime_effect_combine.R
 create mode 100644 Variance/RNAvelocity/post_review/pseudotime_effect_snake.sh
 create mode 100644 eQTL_check/eQTL_village_interaction.sh
 create mode 100644 eQTL_check/eQTL_village_interaction.smk
 create mode 100644 eQTL_check/eQTL_village_interaction_combine.R
 create mode 100644 eQTL_check/eQTL_village_interaction_deboever.R
 create mode 100644 eQTL_check/eQTL_village_interaction_kilpinen.R
 create mode 100644 eQTL_check/filter_snps_kilpinen.R
 create mode 100644 eQTL_check/multi-passage/filter_snps.R
 create mode 100644 eQTL_check/multi-passage/genes2test.sh
 create mode 100644 eQTL_check/multi-passage/metadata_prep.R
 create mode 100644 eQTL_check/multi-passage/pull_eQTL_SNPs.sh
 create mode 100644 eQTL_check/multi-passage/test_eQTL.R
 create mode 100644 eQTL_check/multi-passage/test_eQTL.sh
 create mode 100644 eQTL_check/multi-passage/test_eQTL.smk
 create mode 100644 eQTL_check/multi-passage/test_eQTL_combine_deboever.R
 create mode 100644 eQTL_check/test_eQTL_combine_deboever.R
 create mode 100644 eQTL_check/test_eQTL_combine_kilpinen.R
 create mode 100644 eQTL_check/test_eQTL_kilpinen.R
 create mode 100644 multi-passage/Variance/variance_partition_multipassage.R
 create mode 100644 multi-passage/Variance/variance_partition_multipassage.smk
 create mode 100644 multi-passage/Variance/variance_partition_multipassage_combine.R
 create mode 100644 multi-passage/Variance/variance_partition_multipassage_integratedSCT.R
 create mode 100644 multi-passage/Variance/variance_partition_multipassage_integratedSCT_Ncov.R
 create mode 100644 multi-passage/Variance/variance_partition_multipassage_snake.sh
 create mode 100644 multi-passage/test_vireo_ambient/vireo.sh
 create mode 100644 multi-passage/test_vireo_ambient/vireo_submit.sh
 create mode 100644 multi-passage/transfer_data.sh
 create mode 100644 scCODA/sccod.py
 create mode 100644 scCODA/sccoda_plots.R
 create mode 100644 scCODA/sccoda_preparation.R

diff --git a/Clustering/All_Integrated_QC.R b/Clustering/All_Integrated_QC.R
index a9dc280..5ae1641 100644
--- a/Clustering/All_Integrated_QC.R
+++ b/Clustering/All_Integrated_QC.R
@@ -38,14 +38,15 @@ save_figs <- function(plot, basename, width = 17, height = 17, units = "cm"){
 seurat <- readRDS(paste0(datadir,"seurat_integrated_all_times_clustered.rds"))
 seurat@meta.data$Location <- gsub("_.+", "", seurat@meta.data$Location_Time)
 seurat@meta.data$phases <- factor(seurat@meta.data$phases, levels = c("G1", "S", "G2M"))
-
+seurat@meta.data$Village <- gsub("Baseline", "Uni-Culture", seurat@meta.data$Time) %>% gsub("Village Day 4", "Village", .) %>% gsub("Thawed Village Day 0", "Uni-Culture", .) %>% gsub("Thawed Village Day 7", "Village", .)
 
 ##### Set up Colors #####
 cell_line_colors <- c("FSA0006" = "#F79E29", "MBE1006" = "#9B2C99", "TOB0421"= "#35369C")
 replicate_colors <- c("1" = "#ACD39E", "2" = "#5AAE61", "3" = "#1B7837")
 time_colors <- c("Baseline" = "#b9cee4", "Village Day 4" = "#8a92bb", "Thawed Village Day 0" = "#7d57a0", "Thawed Village Day 7" = "#853786")
+village_colors <- c("Uni-Culture" = "#613246", "Village" = "#A286AA")
 cycle_colors <- c("G1" = "#4393C3", "S" = "#92C5DE", "G2M" = "#D1E5F0")
-site_colors <- c("Brisbane" = "#5D59AB", "Sydney" = "#A7C9A9", "Melbourne" = "#179085")
+site_colors <- c("Brisbane" = "#536CB4", "Sydney" = "#62BD67", "Melbourne" = "#D24F72")
 cluster_colors <- c("0" = "#C9D8EA", "1" = "#928CC5", "2" = "#A7C9A9", "3" = "#179085", "4" = "#F79F9F", "5" = "#C35B76", "6" = "#F4C893", "7" = "#F6AA4B")
 
 
@@ -54,6 +55,10 @@ cluster_colors <- c("0" = "#C9D8EA", "1" = "#928CC5", "2" = "#A7C9A9", "3" = "#1
 UMAP_cell_line <- DimPlot(seurat, reduction = "umap", group.by = c("integrated_snn_res.0.28"), cols = cluster_colors) + labs(color="Cluster") + ggtitle(NULL) 
 save_figs(UMAP_cell_line, basename = paste0(outdir,"Cluster_umap"),width = 15, height = 15)
 
+## With Village ##
+UMAP_village <- DimPlot(seurat, reduction = "umap", group.by = c("Village"), cols = village_colors) + labs(color="Village") + ggtitle(NULL) 
+save_figs(UMAP_village, basename = paste0(outdir,"Village_umap"),width = 15, height = 15)
+
 ## By Cell Line ##
 UMAP_cell_line <- DimPlot(seurat, reduction = "umap", group.by = c("Final_Assignment"), cols = cell_line_colors) + labs(color="Cell Line") + ggtitle(NULL) 
 save_figs(UMAP_cell_line, basename = paste0(outdir,"Cell_Line_umap"),width = 15, height = 15)
diff --git a/Expression_Boxplots/Expression_Boxplots.R b/Expression_Boxplots/Expression_Boxplots.R
index fe312ed..d6cc51b 100644
--- a/Expression_Boxplots/Expression_Boxplots.R
+++ b/Expression_Boxplots/Expression_Boxplots.R
@@ -40,7 +40,7 @@ save_figs <- function(plot, basename, width = 17, height = 17, units = "cm"){
 ##### Set up colors #####
 variable_colors <- c(Village = "#A2B0D0", Replicate = "#64A66B", Line = "#68319B") 
 line_colors <- c(FSA0006 = "#F79E29", MBE1006 = "#9B2C99", TOB0421 = "#35369C")
-village_colors <- c(Baseline = "#b9cee4", Village = "#8a92bb" )
+village_colors <- c("Uni-Culture" = "#613246", "Village" = "#A286AA")
 
 site_updates <- c("Brisbane" = "Site 1" ,"Melbourne" = "Site 2", "Sydney" = "Site 3")
 
@@ -57,7 +57,7 @@ names(seurat_list) <- gsub("_seurat.rds", "", files)
 
 seurat_list <- lapply(seurat_list, function(x){
 	x$Location <- ifelse(x$Time %in% c("Thawed Village Day 0", "Thawed Village Day 7"), "Sydney_Cryopreserved", x$Location)
-	x$Time <- ifelse(x$Time %in% c("Village Day 4", "Thawed Village Day 7"), "Village", "Baseline")
+	x$Time <- ifelse(x$Time %in% c("Village Day 4", "Thawed Village Day 7"), "Village", "Uni-Culture")
 	for (location in names(site_updates)){
 		x$Location <- gsub(location, site_updates[location], x$Location)
 	}
@@ -455,29 +455,73 @@ for (gene in pluri_genes$Gene){
 	df_list[[gene]] <- expression_df(seurat_list, pluri_genes[which(pluri_genes$Gene == gene),"ENSG"], c("Location", "Time", "Final_Assignment"))
 	df_list[[gene]]$Gene <- gene
 }
-df <- do.call(rbind, df_list)
+df <- data.table(do.call(rbind, df_list))
 
 dir.create(paste0(outdir,"pluri_genes/"))
 
 
-p_counts <- ggplot(df[which(df$Location != "Site 3_Cryopreserved"),], aes(x = Final_Assignment, y = Counts, color = Time)) +
-					geom_boxplot(outlier.size = 0.5) +
+
+
+##### Read in significance #####
+pluri_deg <- readRDS("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Expression_Boxplots/pluri_degs/LR_DEGs_4pluri_genes.rds")
+
+
+pluri_deg <- lapply(names(pluri_deg), function(x){
+	strings <- unlist(str_split(x, "_"))
+	pluri_deg[[x]]$Location <- gsub("Brisbane", "Site 1", strings[1]) %>% gsub("Melbourne", "Site 2", .) %>% gsub("Sydney", "Site 3", .)
+	pluri_deg[[x]]$Cryopreservation <- strings[2]
+	pluri_deg[[x]]$Final_Assignment <- strings[3]
+	return(pluri_deg[[x]])
+})
+
+pluri_deg_dt <- do.call(rbind,pluri_deg)
+colnames(pluri_deg_dt) <- gsub("GeneID", "Gene", colnames(pluri_deg_dt))
+pluri_deg_dt$Symbol <- "*"
+pluri_deg_dt$counts_position <- ifelse(pluri_deg_dt$Gene == "MYC", 26,
+									ifelse(pluri_deg_dt$Gene == "NANOG", 22, 
+										ifelse(pluri_deg_dt$Gene == "POU5F1", 115, 60)))
+
+df$Cryopreservation <- ifelse(df$Location != "3_Cryopreserved", "Fresh", "Cryopreserved")
+df$Location <- gsub("_Cryopreserved", "", df$Location)
+
+
+p_counts <- ggplot(df[Cryopreservation != "Cryopreserved"], aes(x = Final_Assignment, y = Counts, color = Time)) +
+					geom_boxplot(aes(fill = Time), outlier.size = 0.5) +
 					theme_classic() +
-					facet_grid(Gene ~ Location, scales = "free_y") +
 					scale_color_manual(values = village_colors) +
+					scale_fill_manual(values = alpha(village_colors, 0.3)) +
 					theme(legend.position = "none",
-						axis.title.x = element_blank())
-save_figs(p_counts, paste0(outdir,"pluri_genes/pluri_counts"), width = 16, height = 14)
+						axis.title.x = element_blank()) +
+					geom_text(
+						data = pluri_deg_dt[Cryopreservation != "Cryopreserved"],
+						aes(x = Final_Assignment, y = counts_position,label = Symbol), 
+						color = "black",
+						size = 4) +
+					facet_grid(Gene ~ Location, scales = "free_y") +
+					theme(axis.text.x = element_text(angle = 45, hjust = 1))
+					
+save_figs(p_counts, paste0(outdir,"pluri_genes/pluri_counts"), width = 16, height = 16)
 
 
-p_scaled <- ggplot(df[which(df$Location != "Site 3_Cryopreserved"),], aes(x = Final_Assignment, y = Normalized, color = Time)) +
-					geom_boxplot(outlier.size = 0.5) +
+
+pluri_deg_dt$counts_position <- ifelse(pluri_deg_dt$Gene == "MYC", 15.5,
+									ifelse(pluri_deg_dt$Gene == "NANOG", 10, 
+										ifelse(pluri_deg_dt$Gene == "POU5F1", 9, 12)))
+
+p_scaled <- ggplot(df[Location != "Cryopreserved"], aes(x = Final_Assignment, y = Normalized, color = Time)) +
+					geom_boxplot(aes(fill = Time),outlier.size = 0.15, lwd=0.3) +
 					theme_classic() +
 					facet_grid(Gene ~ Location, scales = "free_y") +
 					scale_color_manual(values = village_colors) +
+					scale_fill_manual(values = alpha(village_colors, 0.4)) +
 					theme(axis.title.x = element_blank()) +
 					ylab("Normalized Expression") +
-					theme(axis.text.x = element_text(angle = 45, hjust = 1))
+					theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
+					geom_text(
+						data = pluri_deg_dt[Cryopreservation != "Cryopreserved"],
+						aes(x = Final_Assignment, y = counts_position,label = Symbol), 
+						color = "black",
+						size = 3) 
 save_figs(p_scaled, paste0(outdir,"pluri_genes/pluri_normalized"), width = 15, height = 10)
 
 
diff --git a/Expression_Boxplots/pluripotency_deg_boxplots.R b/Expression_Boxplots/pluripotency_deg_boxplots.R
new file mode 100644
index 0000000..b1f2f89
--- /dev/null
+++ b/Expression_Boxplots/pluripotency_deg_boxplots.R
@@ -0,0 +1,128 @@
+library(data.table)
+library(tidyverse)
+library(Seurat)
+
+
+
+
+outdir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Expression_Boxplots/pluri_degs/"
+dir.create(outdir, recursive = TRUE)
+
+
+pluri_genes <- fread("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/data/pluripotency_genes.tsv")
+
+seurat <- readRDS("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/All_data_integrated_remove_bad/seurat_integrated_all_times_clustered.rds")
+seurat@meta.data$Location <- gsub("_.+", "", seurat@meta.data$Location)
+seurat@meta.data$Cryopreserved <- ifelse(grepl("Thawed", seurat@meta.data$Location_Time), "Cryopreserved", "Fresh")
+seurat@meta.data$Location_Cryopreserved_Line <- paste0(seurat@meta.data$Location, "_", seurat@meta.data$Cryopreserved, "_", seurat@meta.data$Final_Assignment)
+seurat@meta.data$Village <- gsub("Baseline", "Uni-Culture", seurat@meta.data$Time) %>% 
+        gsub("Thawed Village Day 0", "Uni-Culture", .) %>%
+        gsub("Thawed Village Day 7", "Village", .) %>%
+        gsub("Village Day 4", "Village", .)
+
+
+##### Combine replicates from same groups together #####
+seurat_list <- lapply(unique(seurat@meta.data$Location_Cryopreserved_Line), function(x){
+	subset(seurat, subset = Location_Cryopreserved_Line == x)
+})
+names(seurat_list) <- unique(seurat@meta.data$Location_Cryopreserved_Line)
+
+
+
+seurat_list <- lapply(seurat_list, function(x){
+    Idents(x) <- "Village"
+    return(x)
+})
+
+
+
+seurat_list <- lapply(seurat_list, function(x){
+    SCTransform(x, vars.to.regress = c("scores.G1", "scores.S", "scores.G2M", "percent.mt", "percent.rb"), return.only.var.genes = FALSE)
+})
+
+seurat_list <- lapply(seurat_list, function(x){
+    PrepSCTFindMarkers(x)
+})
+
+saveRDS(seurat_list, paste0(outdir, "sct_location_cryo_line.rds"))
+seurat_list <- readRDS(paste0(outdir, "sct_location_cryo_line.rds"))
+
+
+
+##### Combine Village and Baselines together #####
+### Try with logistic MAST ###
+MAST_DEGs <- lapply(seurat_list, function(x){
+    FindMarkers(x, ident.1 = "Uni-Culture", ident.2 = "Village", latent.vars = "MULTI_classification", test.use = "MAST", logfc.threshold = 0, assay = "RNA")
+})
+
+saveRDS(MAST_DEGs, paste0(outdir, "MAST_DEGs.rds"))
+MAST_DEGs <- readRDS(paste0(outdir, "MAST_DEGs.rds"))
+
+
+### Update multiple testing for all sites ###
+MAST_DEGs_nrow_list <- lapply(MAST_DEGs, function(x) data.table(nrow(x)))
+MAST_DEGs_nrow <- sum(do.call(rbind, MAST_DEGs_nrow_list)$V1)
+
+MAST_DEGs <- lapply(MAST_DEGs, function(x){
+    x$p_val_adj_updated <- p.adjust(x$p_val, method = "bonferroni", n = MAST_DEGs_nrow)
+    return(x)
+})
+
+
+MAST_DEGs_pluri <- lapply(MAST_DEGs, function(x){
+    x$ENSG <- rownames(x)
+    x <- data.table(x)
+    x <- x[pluri_genes, on = c("ENSG")]
+    x <- x[!is.na(p_val) & p_val_adj_updated < 0.05]
+    return(x)
+})
+
+MAST_DEGs_pluri <- lapply(MAST_DEGs, function(x){
+    x$ENSG <- rownames(x)
+    x <- data.table(x)
+    x <- x[pluri_genes, on = c("ENSG")]
+    x <- x[!is.na(p_val) & p_val_adj < 0.05]
+    return(x)
+})
+
+MAST_DEGs_pluri_4 <- lapply(MAST_DEGs_pluri, function(x){
+    x[GeneID %in% c("MYC", "NANOG", "POU5F1", "SOX2")]
+})
+
+
+### Try with logistic regression ###
+LR_DEGs <- lapply(seurat_list, function(x){
+    FindMarkers(x, ident.1 = "Uni-Culture", ident.2 = "Village", latent.vars = "MULTI_classification", test.use = "LR", logfc.threshold = 0)
+})
+
+saveRDS(LR_DEGs, paste0(outdir, "LR_DEGs.rds"))
+##### Will use LR instead of MAST - MAST finds more DEG and lower p-values which I'm not sure is legitimate #####
+
+
+### Update multiple testing for all sites ###
+LR_DEGs_nrow_list <- lapply(LR_DEGs, function(x) data.table(nrow(x)))
+LR_DEGs_nrow <- sum(do.call(rbind, LR_DEGs_nrow_list)$V1)
+
+LR_DEGs <- lapply(LR_DEGs, function(x){
+    x$p_val_adj_updated <- p.adjust(x$p_val, method = "bonferroni", n = LR_DEGs_nrow)
+    return(x)
+})
+
+
+LR_DEGs_pluri <- lapply(LR_DEGs, function(x){
+    x$ENSG <- rownames(x)
+    x <- data.table(x)
+    x <- x[pluri_genes, on = c("ENSG")]
+    x <- x[!is.na(p_val) & p_val_adj_updated < 0.05]
+    return(x)
+})
+
+
+LR_DEGs_pluri_4 <- lapply(LR_DEGs_pluri, function(x){
+    x[GeneID %in% c("MYC", "NANOG", "POU5F1", "SOX2")]
+})
+
+
+
+saveRDS(LR_DEGs_pluri_4, paste0(outdir, "LR_DEGs_4pluri_genes.rds"))
+
diff --git a/Expression_Correlation/Expression_Correlation.R b/Expression_Correlation/Expression_Correlation.R
index 855d768..eb421e0 100644
--- a/Expression_Correlation/Expression_Correlation.R
+++ b/Expression_Correlation/Expression_Correlation.R
@@ -227,8 +227,266 @@ summary_SCT_list <- lapply(names(SCT_combined_list_list), function(x){
 
 
 
-summary_SCT <- do.call(rbind, summary_SCT_list)
+summary_SCT <- data.table(do.call(rbind, summary_SCT_list))
 summary_SCT$Replicate <- gsub("Brisbane", "Replicate", summary_SCT$Replicate) %>% gsub("Melbourne", "Replicate", .) %>% gsub("Sydney", "Replicate", .)
+
+
+
+##### Get unique id for each  #####
+summary_SCT$ID <- paste(summary_SCT$Location, summary_SCT$Time, summary_SCT$Line, summary_SCT$Replicate, sep = "-")
+
+
+correlations <- lapply(unique(summary_SCT$ID), function(group1){
+	print(group1)
+	tmp <- lapply(unique(summary_SCT$ID), function(group2){
+		print(group2)
+		genes <- summary_SCT[ID == group1][summary_SCT[ID == group1]$Gene %in% summary_SCT[ID == group2]$Gene]$Gene
+		print(all(summary_SCT[ID == group1][Gene %in% genes]$Gene == summary_SCT[ID == group2][Gene %in% genes]$Gene))
+		return(data.table(Group1 = group1, Group2 = group2, Spearman = cor(summary_SCT[ID == group1][Gene %in% genes]$Mean, summary_SCT[ID == group2][Gene %in% genes]$Mean, method = "spearman")))
+	})
+	return(do.call(rbind, tmp))
+})
+
+saveRDS(correlations, paste0(outdir, "correlations.rds"))
+
+
+correlations_pearson <- lapply(unique(summary_SCT$ID), function(group1){
+	print(group1)
+	tmp <- lapply(unique(summary_SCT$ID), function(group2){
+		print(group2)
+		genes <- summary_SCT[ID == group1][summary_SCT[ID == group1]$Gene %in% summary_SCT[ID == group2]$Gene]$Gene
+		print(all(summary_SCT[ID == group1][Gene %in% genes]$Gene == summary_SCT[ID == group2][Gene %in% genes]$Gene))
+		return(data.table(Group1 = group1, Group2 = group2, Pearson = cor(summary_SCT[ID == group1][Gene %in% genes]$Mean, summary_SCT[ID == group2][Gene %in% genes]$Mean)))
+	})
+	return(do.call(rbind, tmp))
+})
+
+correlations_pearson_dt <- do.call(rbind, correlations_pearson)
+
+tmp <- summary_SCT[ID == "Brisbane-Baseline-FSA0006-Replicate1"][summary_SCT[ID == "Brisbane-Baseline-TOB0421-Replicate1"], on = "Gene"]
+
+scatter_test <- ggplot(tmp, aes(Mean,i.Mean)) +
+	geom_point() +
+	theme_classic()
+
+ggsave(scatter_test, filename = paste0(outdir, "test_correlation.png"))
+
+
+correlations_dt <- do.call(rbind, correlations)
+correlations_dt[, c("Location1", "Village1", "Line1", "Replicate1") := tstrsplit(Group1, "-", fixed=TRUE)]
+correlations_dt[, c("Location2", "Village2", "Line2", "Replicate2") := tstrsplit(Group2, "-", fixed=TRUE)]
+
+
+
+##### Try with data slot instead #####
+summary_SCT_data_list <- lapply(names(SCT_combined_list_list), function(x){
+	temp3 <- lapply(names(SCT_combined_list_list[[x]]), function(y){
+		temp2 <- lapply(names(SCT_combined_list_list[[x]][[y]]), function(z){
+			temp <- lapply(names(SCT_combined_list_list[[x]][[y]][[z]]), function(rep){
+				data.frame(Gene = rownames(SCT_combined_list_list[[x]][[y]][[z]][[rep]]), Mean = rowMeans(SCT_combined_list_list[[x]][[y]][[z]][[rep]][["SCT"]]@data), N = ncol(SCT_combined_list_list[[x]][[y]][[z]][[rep]][["SCT"]]@data), Replicate = rep, Line = z, Location = y, Time = x)
+			})
+			do.call(rbind, temp)
+		})
+		do.call(rbind, temp2)
+	})
+	do.call(rbind, temp3)
+})
+
+
+
+summary_SCT_data <- data.table(do.call(rbind, summary_SCT_data_list))
+summary_SCT_data$Replicate <- gsub("Brisbane", "Replicate", summary_SCT_data$Replicate) %>% gsub("Melbourne", "Replicate", .) %>% gsub("Sydney", "Replicate", .)
+
+
+
+##### Get unique id for each  #####
+summary_SCT_data$ID <- paste(summary_SCT_data$Location, summary_SCT_data$Time, summary_SCT_data$Line, summary_SCT_data$Replicate, sep = "-")
+
+
+correlations_data <- lapply(unique(summary_SCT_data$ID), function(group1){
+	print(group1)
+	tmp <- lapply(unique(summary_SCT_data$ID)[(which(unique(summary_SCT_data$ID) == group1) + 1):length(unique(summary_SCT_data$ID))], function(group2){
+		print(group2)
+		genes <- summary_SCT_data[ID == group1][summary_SCT_data[ID == group1]$Gene %in% summary_SCT_data[ID == group2]$Gene]$Gene
+		print(all(summary_SCT_data[ID == group1][Gene %in% genes]$Gene == summary_SCT_data[ID == group2][Gene %in% genes]$Gene))
+		return(data.table(Group1 = group1, Group2 = group2, Spearman = cor(summary_SCT_data[ID == group1][Gene %in% genes]$Mean, summary_SCT_data[ID == group2][Gene %in% genes]$Mean, method = "spearman")))
+	})
+	return(do.call(rbind, tmp))
+})
+
+
+
+correlations_data_dt <- do.call(rbind, correlations_data)
+correlations_data_dt[, c("Location1", "Village1", "Line1", "Replicate1") := tstrsplit(Group1, "-", fixed=TRUE)]
+correlations_data_dt[, c("Location2", "Village2", "Line2", "Replicate2") := tstrsplit(Group2, "-", fixed=TRUE)]
+
+correlations_data_fresh_dt <- correlations_data_dt[Location1 != "Sydney_Cryopreserved" | Location2 != "Sydney_Cryopreserved"]
+
+
+#####  #####
+correlations_data_fresh_village_dt <- correlations_data_fresh_dt[Replicate1 == Replicate2 & Line1 == Line2 & Location1 == Location2 & Village1 != Village2 & Group1 != Group2]
+correlations_data_fresh_village_dt$Group <- "Village"
+
+correlations_data_fresh_replicate_dt <- correlations_data_fresh_dt[Village1 == Village2 & Line1 == Line2 & Location1 == Location2 & Replicate1 != Replicate2 & Group1 != Group2]
+correlations_data_fresh_replicate_dt$Group <- "Replicate"
+
+correlations_data_fresh_location_dt <- correlations_data_fresh_dt[Replicate1 == Replicate2 & Line1 == Line2 & Village1 == Village2 & Location1 != Location2 & Group1 != Group2]
+correlations_data_fresh_location_dt$Group <- "Location"
+
+correlations_data_fresh_line_dt <- correlations_data_fresh_dt[Village1 == Village2 & Location1 == Location2 & Line1 != Line2 & Group1 != Group2]
+correlations_data_fresh_line_dt$Group <- "Line"
+
+correlations_data_fresh_combined_dt <- rbind(correlations_data_fresh_village_dt, correlations_data_fresh_replicate_dt, correlations_data_fresh_location_dt, correlations_data_fresh_line_dt)
+
+correlations_data_fresh_combined_dt$Group <- factor(correlations_data_fresh_combined_dt$Group, levels = c("Replicate", "Village", "Line", "Location"))
+
+
+res <- list()
+
+for (group1 in unique(correlations_data_fresh_combined_dt$Group)){
+	for (group2 in unique(correlations_data_fresh_combined_dt$Group)[(which(unique(correlations_data_fresh_combined_dt$Group) == group1)+1):length(unique(correlations_data_fresh_combined_dt$Group))]){
+		res[[group1]][[group2]] <- wilcox.test(correlations_data_fresh_combined_dt[Group == group1]$Spearman, correlations_data_fresh_combined_dt[Group == group2]$Spearman,exact = FALSE)
+	}
+}
+
+
+
+
+correlations_data_fresh_combined_dt_med <- correlations_data_fresh_combined_dt %>% 
+  group_by(Group) %>%
+  mutate(median = median(as.numeric(Spearman)))
+
+
+correlation_fresh_dist <- ggplot(correlations_data_fresh_combined_dt, aes(Spearman)) +
+	geom_histogram(bins = 50) +
+	facet_wrap(vars(factor(Group, levels = c("Replicate", "Village", "Line", "Location"))), ncol = 1, scales = "free_y") +
+	theme_classic() +
+	geom_vline(data = correlations_data_fresh_combined_dt_med, aes(xintercept = median), linetype = "dashed")
+
+ggsave(correlation_fresh_dist, filename = paste0(outdir, "correlation_distributions_fresh.png"), width = 3)
+ggsave(correlation_fresh_dist, filename = paste0(outdir, "correlation_distributions_fresh.pdf"), width = 3)
+
+
+##### For cryopreserdd #####
+
+correlations_data_combined_cryo_dt <- correlations_data_dt[grepl("Sydney", Location1) & grepl("Sydney", Location2)]
+correlations_data_combined_cryo_dt$Cryopreserved1 <- gsub("Sydney_", "",correlations_data_combined_cryo_dt$Location1)
+correlations_data_combined_cryo_dt$Cryopreserved2 <- gsub("Sydney_", "",correlations_data_combined_cryo_dt$Location2)
+
+
+#####  #####
+correlations_data_cryo_village_dt <- correlations_data_combined_cryo_dt[Replicate1 == Replicate2 & Line1 == Line2 & Cryopreserved1 == Cryopreserved2  & Village1 != Village2 & Group1 != Group2]
+correlations_data_cryo_village_dt$Group <- "Village"
+
+correlations_data_cryo_replicate_dt <- correlations_data_combined_cryo_dt[Village1 == Village2 & Line1 == Line2 & Cryopreserved1 == Cryopreserved2 & Replicate1 != Replicate2 & Group1 != Group2]
+correlations_data_cryo_replicate_dt$Group <- "Replicate"
+
+correlations_data_cryo_cryo_dt <- correlations_data_combined_cryo_dt[Replicate1 == Replicate2 & Line1 == Line2 & Village1 == Village2 & Cryopreserved1 != Cryopreserved2 & Group1 != Group2]
+correlations_data_cryo_cryo_dt$Group <- "Cryopreserved"
+
+correlations_data_cryo_line_dt <- correlations_data_combined_cryo_dt[Village1 == Village2 & Cryopreserved1 == Cryopreserved2  & Line1 != Line2 & Group1 != Group2]
+correlations_data_cryo_line_dt$Group <- "Line"
+
+correlations_data_cryo_combined_dt <- rbind(correlations_data_cryo_village_dt, correlations_data_cryo_replicate_dt, correlations_data_cryo_cryo_dt, correlations_data_cryo_line_dt)
+
+correlations_data_cryo_combined_dt$Group <- factor(correlations_data_cryo_combined_dt$Group, levels = c("Replicate", "Cryopreserved", "Village", "Line"))
+
+correlations_data_cryo_combined_dt_med <- correlations_data_cryo_combined_dt %>% 
+  group_by(Group) %>%
+  mutate(median = median(as.numeric(Spearman)))
+
+
+
+correlation_cryo_dist <- ggplot(correlations_data_cryo_combined_dt, aes(Spearman)) +
+	geom_histogram(bins = 50) +
+	facet_wrap(vars(factor(Group, levels = c("Replicate", "Cryopreserved", "Village", "Line"))), ncol = 1, scales = "free_y") +
+	theme_classic() +
+	geom_vline(data = correlations_data_cryo_combined_dt_med, aes(xintercept = median), linetype = "dashed")
+
+ggsave(correlation_cryo_dist, filename = paste0(outdir, "correlation_distributions_cryo.png"), width = 3)
+ggsave(correlation_cryo_dist, filename = paste0(outdir, "correlation_distributions_cryo.pdf"), width = 3)
+
+
+
+
+
+##### Try with counts slot instead #####
+summary_SCT_counts_list <- lapply(names(SCT_combined_list_list), function(x){
+	temp3 <- lapply(names(SCT_combined_list_list[[x]]), function(y){
+		temp2 <- lapply(names(SCT_combined_list_list[[x]][[y]]), function(z){
+			temp <- lapply(names(SCT_combined_list_list[[x]][[y]][[z]]), function(rep){
+				data.frame(Gene = rownames(SCT_combined_list_list[[x]][[y]][[z]][[rep]]), Mean = rowMeans(SCT_combined_list_list[[x]][[y]][[z]][[rep]][["SCT"]]@counts), N = ncol(SCT_combined_list_list[[x]][[y]][[z]][[rep]][["SCT"]]@counts), Replicate = rep, Line = z, Location = y, Time = x)
+			})
+			do.call(rbind, temp)
+		})
+		do.call(rbind, temp2)
+	})
+	do.call(rbind, temp3)
+})
+
+
+
+summary_SCT_counts <- data.table(do.call(rbind, summary_SCT_counts_list))
+summary_SCT_counts$Replicate <- gsub("Brisbane", "Replicate", summary_SCT_counts$Replicate) %>% gsub("Melbourne", "Replicate", .) %>% gsub("Sydney", "Replicate", .)
+
+
+
+##### Get unique id for each  #####
+summary_SCT_counts$ID <- paste(summary_SCT_counts$Location, summary_SCT_counts$Time, summary_SCT_counts$Line, summary_SCT_counts$Replicate, sep = "-")
+
+
+correlations_counts <- lapply(unique(summary_SCT_counts$ID), function(group1){
+	print(group1)
+	tmp <- lapply(unique(summary_SCT_counts$ID)[(which(unique(summary_SCT_counts$ID) == group1) + 1):length(unique(summary_SCT_counts$ID))], function(group2){
+		print(group2)
+		genes <- summary_SCT_counts[ID == group1][summary_SCT_counts[ID == group1]$Gene %in% summary_SCT_counts[ID == group2]$Gene]$Gene
+		print(all(summary_SCT_counts[ID == group1][Gene %in% genes]$Gene == summary_SCT_counts[ID == group2][Gene %in% genes]$Gene))
+		return(data.table(Group1 = group1, Group2 = group2, Spearman = cor(summary_SCT_counts[ID == group1][Gene %in% genes]$Mean, summary_SCT_counts[ID == group2][Gene %in% genes]$Mean, method = "spearman")))
+	})
+	return(do.call(rbind, tmp))
+})
+
+
+
+correlations_counts_dt <- do.call(rbind, correlations_counts)
+correlations_counts_dt[, c("Location1", "Village1", "Line1", "Replicate1") := tstrsplit(Group1, "-", fixed=TRUE)]
+correlations_counts_dt[, c("Location2", "Village2", "Line2", "Replicate2") := tstrsplit(Group2, "-", fixed=TRUE)]
+
+correlations_counts_village_dt <- correlations_counts_dt[Replicate1 == Replicate2 & Line1 == Line2 & Location1 == Location2 & Village1 != Village2 & Group1 != Group2]
+correlations_counts_village_dt$Group <- "Village"
+
+correlations_counts_replicate_dt <- correlations_counts_dt[Village1 == Village2 & Line1 == Line2 & Location1 == Location2 & Replicate1 != Replicate2 & Group1 != Group2]
+correlations_counts_replicate_dt$Group <- "Replicate"
+
+correlations_counts_location_dt <- correlations_counts_dt[Replicate1 == Replicate2 & Line1 == Line2 & Village1 == Village2 & Location1 != Location2 & Group1 != Group2]
+correlations_counts_location_dt$Group <- "Location"
+
+correlations_counts_line_dt <- correlations_counts_dt[Replicate1 == Replicate2 & Village1 == Village2 & Location1 == Location2 & Line1 != Line2 & Group1 != Group2]
+correlations_counts_line_dt$Group <- "Line"
+
+correlations_counts_combined_dt <- rbind(correlations_counts_village_dt, correlations_counts_replicate_dt, correlations_counts_location_dt, correlations_counts_line_dt)
+
+correlations_counts_combined_dt_med <- correlations_counts_combined_dt %>% 
+  group_by(Group) %>%
+  mutate(median = median(as.numeric(Spearman)))
+
+
+correlations_counts_combined_dt$Group <- factor(correlations_counts_combined_dt$Group, levels = c("Replicate", "Village", "Line", "Location"))
+
+
+correlation_dist <- ggplot(correlations_counts_combined_dt, aes(Spearman)) +
+	geom_histogram(bins = 50) +
+	facet_wrap(vars(factor(Group, levels = c("Replicate", "Village", "Line", "Location"))), ncol = 1, scales = "free_y") +
+	theme_classic() +
+	geom_vline(data = correlations_counts_combined_dt_med, aes(xintercept = median), linetype = "dashed")
+
+ggsave(correlation_dist, filename = paste0(outdir, "correlation_distributions_counts.png"), width = 3)
+
+
+
+
+
+
 summary_SCT_wide <- pivot_wider(summary_SCT, names_from = Replicate, values_from = c(Mean, N))
 summary_SCT_wide$SD <- rowSds(as.matrix(summary_SCT_wide[,c("Mean_Replicate1", "Mean_Replicate2", "Mean_Replicate3")]))
 summary_SCT_wide$Mean <- (summary_SCT_wide$Mean_Replicate1 * summary_SCT_wide$N_Replicate1 + summary_SCT_wide$Mean_Replicate2 * summary_SCT_wide$N_Replicate2 + summary_SCT_wide$Mean_Replicate3 * summary_SCT_wide$N_Replicate3)/(summary_SCT_wide$N_Replicate1 + summary_SCT_wide$N_Replicate2 + summary_SCT_wide$N_Replicate3)
diff --git a/Nona_multiome/cell_line_proportions.R b/Nona_multiome/cell_line_proportions.R
index fbbb345..c8f47a4 100644
--- a/Nona_multiome/cell_line_proportions.R
+++ b/Nona_multiome/cell_line_proportions.R
@@ -32,7 +32,7 @@ village_id_list <- lapply(villages, function(x){
     # tmp <- fread(paste0(datadir,x,"/CombinedResults/Final_Assignments_demultiplexing_doublets_new_edit.txt"), sep = "\t")
     tmp <- readRDS(paste0(non_dir,x))
     dt <- data.table(tmp@meta.data)
-    dt$Pool_ID <- gsub("_DemuxALL.rds", "",x))
+    dt$Pool_ID <- gsub("_DemuxALL.rds", "",x)
     dt$Day <- as.numeric(as.character(gsub("Village_Day", "", dt$Pool_ID)))
     return(dt)
 })
@@ -66,6 +66,11 @@ village_summary_singlets <- data.table(prop.table(table(village_id[Assignment !=
 
 village_summary_singlets$Assignment <- factor(village_summary_singlets$Assignment, levels = rev(village_summary_singlets[Day_updated == 15]$Assignment[order(village_summary_singlets[Day_updated == 15]$N)]))
 
+colors <- c("#f44336", "#e81f63", "#9c27b0", "#673ab7", "#3f51b5", "#2096f3","#2096f3", "#009688", "#4caf50", "#8bc34a", "#cddc39", "#ffeb3b", "#ffc108", "#ff9801", "#ff5723" ,"#795548", "#9e9e9e", "#607d8b")
+names(colors) <- levels(village_summary_singlets$Assignment)
+
+saveRDS(colors, paste0(outdir,"line_colors"))
+
 
 ##### Make proportion plots (area plot) #####
 p_stacked_area <- ggplot(village_summary_singlets, aes(x = as.numeric(as.character(Day_updated)), y = N, fill = factor(Assignment), group = Assignment)) +
@@ -80,6 +85,7 @@ ggsave(p_stacked_area, filename = paste0(outdir,"stacked_area.pdf"), width = 7,
 
 
 
+
 ##### Make line plot of propotion over time #####
 p_line <- ggplot(village_summary_singlets, aes(x = as.numeric(as.character(Day_updated)), y = N, color = Assignment)) +
     geom_point() +
diff --git a/RNAvelocity/latent_on_seurat2.R b/RNAvelocity/latent_on_seurat2.R
index 3e27439..ab321a0 100644
--- a/RNAvelocity/latent_on_seurat2.R
+++ b/RNAvelocity/latent_on_seurat2.R
@@ -1,10 +1,11 @@
 library(data.table)
 library(Seurat)
 library(ggplot2)
-library(Nebulosa)
-library(schex)
-library(R.utils)
-library(Hmisc)
+library(tidyverse)
+# library(Nebulosa)
+# library(schex)
+# library(R.utils)
+# library(Hmisc)
 
 
 ##### Setting up Directories
@@ -16,6 +17,7 @@ dir.create(outdir, recursive = TRUE)
 
 
 line_colors = c(FSA0006 = "#F79E29", MBE1006 = "#9B2C99", TOB0421 = "#35369C")
+village_colors <- c("Uni-Culture" = "#613246", "Village" = "#A286AA")
 
 
 ##### Read in data #####
@@ -397,15 +399,29 @@ summary(lm(POU5F1 ~ equal_groups, data = latent_split))
 
 
 ##### Make histograms of each location, colored by time, faceted by location #####
-seurat_noNA@meta.data$Time <- gsub("Village Day 4","Village", seurat_noNA@meta.data$Time) %>% gsub("Thawed Village Day 7", "Village", .) %>% gsub("Thawed Village Day 0","Baseline", .)
-
 seurat_noNA@meta.data$Location <- ifelse(grepl("Thawed", seurat_noNA@meta.data$Location), "Sydney-Cryopreserved", gsub("_.+", "", seurat_noNA@meta.data$Location))
+seurat_noNA@meta.data$Location <- gsub("Brisbane", "Site 1", seurat_noNA@meta.data$Location) %>% gsub("Melbourne", "Site 2", .) %>% gsub("Sydney", "Site 3", .)
+seurat_noNA@meta.data$Village <- gsub("Baseline", "Uni-Culture", seurat_noNA@meta.data$Time) %>% gsub("Village Day 4", "Village", .) %>% gsub("Thawed Village Day 0", "Uni-Culture", .) %>% gsub("Thawed Village Day 7", "Village", .)
+
 
 seurat_noNA@meta.data$Location_Individual <- paste0(seurat_noNA@meta.data$Location, "-", seurat_noNA@meta.data$Final_Assignment)
 
-pLocation_Time_latent <- ggplot(seurat_noNA@meta.data, aes(latent_time, color = Time)) +
-	geom_density() +
+
+pLocation_Time_latent <- ggplot(seurat_noNA@meta.data, aes(latent_time, fill = Village)) +
+	geom_density(alpha = 0.75) +
 	theme_classic() +
-	facet_grid(Location ~ Final_Assignment)
+	facet_grid(Location ~ Final_Assignment) +
+	scale_fill_manual(values = village_colors)
 ggsave(pLocation_Time_latent, filename = paste0(outdir,"faceted_histogram_latent.png"))
+ggsave(pLocation_Time_latent, filename = paste0(outdir,"faceted_histogram_latent.pdf"))
+
+
+
+pLocation_Time_latent_line <- ggplot(seurat_noNA@meta.data, aes(latent_time, fill = Village)) +
+	geom_density(alpha = 0.75) +
+	theme_classic() +
+	facet_grid(vars(Final_Assignment)) +
+	scale_fill_manual(values = village_colors)
+ggsave(pLocation_Time_latent_line, filename = paste0(outdir,"faceted_histogram_latent_line.png"), width = 4, height = 3)
+ggsave(pLocation_Time_latent_line, filename = paste0(outdir,"faceted_histogram_latent_line.pdf"), width = 4, height = 3)
 
diff --git a/Variance/Distribution_tests.R b/Variance/Distribution_tests.R
index f50065b..4e090b9 100644
--- a/Variance/Distribution_tests.R
+++ b/Variance/Distribution_tests.R
@@ -5,6 +5,7 @@ library(gamlss)
 library(fitdistrplus)
 library(VGAM)
 library(emdbook)
+library(data.table)
 
 
 save_figs <- function(plot, basename, width = 17, height = 17, units = "cm"){
@@ -214,6 +215,12 @@ seurat <- readRDS(paste0(datadir,"seurat_integrated_all_times_clustered.rds"))
 ## Keep only genes expressed in at least 1% of cells 
 seurat_sub <- subset(seurat, features = rownames(seurat)[which(rowSums(seurat[["SCT"]]@counts > 0)/ncol(seurat[["SCT"]]@counts) >= 0.01)])
 saveRDS(seurat_sub, paste0(outdir,"seurat_integrated_all_times_clustered_1pct_expressing.rds"))
+seurat_sub <- readRDS(paste0(outdir,"seurat_integrated_all_times_clustered_1pct_expressing.rds"))
+
+
+### Make a list of the genes to be used by snakemake in variance_partition_post_reviedw.smk ###
+genes_dt <- data.table(Gene = rownames(seurat_sub))
+fwrite(genes_dt, paste0(outdir,"seurat_integrated_all_times_clustered_1pct_expressing_genelist.tsv"), sep = "\t")
 
 
 aic_df <- as.data.frame(matrix(nrow = 0, ncol = 3))
diff --git a/Variance/RNAvelocity/post_review/prepare_pseudotime.R b/Variance/RNAvelocity/post_review/prepare_pseudotime.R
index c440c7d..cdcd507 100644
--- a/Variance/RNAvelocity/post_review/prepare_pseudotime.R
+++ b/Variance/RNAvelocity/post_review/prepare_pseudotime.R
@@ -41,4 +41,8 @@ seurat_noNA@meta.data$Cryopreserved <-ifelse(seurat_noNA@meta.data$Location == "
 seurat_noNA@meta.data$Location <- gsub("_Cryopreserved", "", seurat_noNA@meta.data$Location)
 
 
-saveRDS(seurat_noNA, paste0(outdir, "seurat_integrated_all_times_clustered_1pct_expressing_pseudotime.rds"))
\ No newline at end of file
+saveRDS(seurat_noNA, paste0(outdir, "seurat_integrated_all_times_clustered_1pct_expressing_pseudotime.rds"))
+seurat_noNA <- readRDS(paste0(outdir, "seurat_integrated_all_times_clustered_1pct_expressing_pseudotime.rds"))
+
+
+fwrite(data.table(Gene = rownames(seurat_noNA)), paste0(outdir,"seurat_integrated_all_times_clustered_1pct_expressing_pseudotime.tsv"))
\ No newline at end of file
diff --git a/Variance/RNAvelocity/post_review/pseudotime_effect.R b/Variance/RNAvelocity/post_review/pseudotime_effect.R
index dfb1a32..363d1c6 100644
--- a/Variance/RNAvelocity/post_review/pseudotime_effect.R
+++ b/Variance/RNAvelocity/post_review/pseudotime_effect.R
@@ -34,17 +34,18 @@ icc_glmmtmb <- function(model, percent = TRUE) {
 args <- commandArgs(trailingOnly = TRUE)
 icc_interaction_outdir <- paste0(args[1])
 icc_outdir <- paste0(args[2])
-model_interaction_outdir <- paste0(args[3])
-model_outdir <- paste0(args[4])
-resid_outdir <- paste0(args[5])
-gene <- as.character(args[6])
+plot_outdir <- paste0(args[3])
+gene <- as.character(args[4])
+effects_outdir <- paste0(args[5])
 
+
+print(icc_interaction_outdir)
 print(icc_outdir)
-print(icc_outdir)
-print(model_outdir)
-print(resid_outdir)
+print(plot_outdir)
 print(gene)
 
+line_colors <- c(FSA0006 = "#F79E29", MBE1006 = "#9B2C99", TOB0421 = "#35369C")
+
 
 
 ##### Read in data #####
@@ -57,47 +58,217 @@ colnames(icc_summary) <- gsub("gene", "ensg", colnames(icc_summary))
 
 
 ### Make DF for modeling ###
-df_hier_unscale <- data.frame("Expression" = seurat[["SCT"]]@scale.data[gene,], "Village" = as.factor(ifelse(seurat@meta.data$Time == "Baseline", 0, 1)), "Line" = seurat@meta.data$Final_Assignment, "Replicate" = as.factor(gsub("[A-Z][a-z]+", "", seurat@meta.data$MULTI_ID)),  "Cryopreserved" = seurat$Cryopreserved, "Site" = seurat$Location, "Pseudotime" = round(seurat$latent_time, 2))
+df_hier_unscale <- data.frame("Expression" = seurat[["SCT"]]@scale.data[gene,],"Normalized Counts" = seurat[["SCT"]]@counts[gene,], "Log Expression" = seurat[["SCT"]]@data[gene,],  "Village" = as.factor(ifelse(seurat@meta.data$Time == "Baseline", 0, 1)), "Line" = seurat@meta.data$Final_Assignment, "Replicate" = as.factor(gsub("[A-Z][a-z]+", "", seurat@meta.data$MULTI_ID)),  "Cryopreserved" = seurat$Cryopreserved, "Site" = seurat$Location, "Pseudotime" = round(seurat$latent_time, 2))
 colnames(df_hier_unscale)[1] <- "Expression"
 
 
 
 ##### Get list of variables to fit before testing pseudotime effect #####
-variables <- icc_summary[ensg == gene & grp != "Residual"]$grp
+variables <- c(icc_summary[ensg == gene & grp != "Residual"]$grp, "Pseudotime")
+variables <- c(variables[!grepl(":",variables)], variables[grepl(":", variables)])
 
+model_all <- as.formula(paste0("Expression ~ (1|", paste0(variables, collapse = ") + (1|"), ")"))
 
+if (length(variables) > 1){
+    model <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% "Pseudotime"], collapse = ") + (1|"), ")"))
+} else {
+    model <- as.formula(paste0("Expression ~ 1"))
+}
 
-##### Test pseudotime impact #####
-### Fit the known variables to get residuals ###
-model_all <- as.formula(paste0("Expression ~ (1|", paste0(variables, collapse = ") + (1|"), ")"))
-model_glmmtmb <- suppress_warnings(glmmTMB(formula = noquote(model_all), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+
+boolFalse<-F
+while(boolFalse==F & length(variables) > 0){
+  tryCatch({
+    print(variables)
+    model_glmmtmb <- suppress_warnings(glmmTMB(formula = noquote(model_all), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+    model_glmmtmb_loo <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+    boolFalse<-T
+  },error=function(e){
+    if (length(variables) > 1){
+        variables <- variables[1:(length(variables) -1)]
+    } else {
+        variables <- c()
+    }
+  })
+}
 
 
-### Test pseudotime on residuals ###
-df_hier_unscale$Residuals <- resid(model_glmmtmb)
+if ("Pseudotime" %in% variables){
+
+    ### Deal with singular fits by removing last variable until a fit can be found - ordered in variables buy importance
+    while ((!model_glmmtmb$sdr$pdHess | !model_glmmtmb_loo$sdr$pdHess) & length(variables) > 0){
+        print("Singular fit: removing last variable and rerunning with one less covariate.")
+        if (length(variables) > 1){
+            variables <- variables[1:(length(variables) -1)]
+            print(variables)
+            model_all <- as.formula(paste0("Expression ~ (1|", paste0(variables, collapse = ") + (1|"), ")"))
+            model_glmmtmb <- suppress_warnings(glmmTMB(formula = noquote(model_all), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+            model <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% "Pseudotime"], collapse = ") + (1|"), ")"))
+            model_glmmtmb_loo <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+        } else {
+            variables <- c()
+        }
+    }
 
-model_pseudotime <- suppress_warnings(glmmTMB(Residuals ~ Pseudotime, data = df_hier_unscale, REML = TRUE), "giveCsparse")
-model_pseudotime2 <- suppress_warnings(glmmTMB(Residuals ~ 1, data = df_hier_unscale, REML = TRUE), "giveCsparse")
+    print(variables)
+    
+    if ("Pseudotime" %in% variables){
 
-### Test with Anova ###
-P_value <- anova(model_pseudotime2, model_pseudotime)$`Pr(>Chisq)`[2]
-P_value <- anova(model_glmmtmb, model_pseudotime2)$`Pr(>Chisq)`[2]
+        icc <- data.table(grp = c("Pseudotime", "Residual"), P = as.numeric(NA))
 
+        icc[grp == "Pseudotime"]$P <- anova(model_glmmtmb_loo, model_glmmtmb)$`Pr(>Chisq)`[2]
 
-test_plot <- ggplot(df_hier_unscale, aes(Pseudotime, Residuals, color = Site)) +
-    geom_point()  +
-    facet_grid(~Line) +
-    geom_smooth(method = "lm", se = FALSE)
 
-ggsave(test_plot, filename = "/directflow/SCCGGroupShare/projects/DrewNeavin/test.png")
+        if (icc[grp == "Pseudotime"]$P < 0.05/length(variables)){
+
+            ##### Calculate full model #####
+            updated_model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(variables, collapse = ") + (1|"), ")"))
+
+
+            model_loo_updated <- list()
+            model_loo_updated[["all"]] <- suppress_warnings(glmmTMB(formula = noquote(updated_model), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+
+            ### Calculate the variance explained by each of the included variables ###
+            icc <- icc_glmmtmb(model_loo_updated[["all"]])
+
+            ### Recalfulate significance ###
+            icc$P <- as.numeric(NA)
+            icc$gene <- gene
+
+            for (variable in variables){
+                print(variable)
+                if (length(variables) > 1){
+                    model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")"))
+                } else {
+                    model <- as.formula(paste0("Expression ~ 1"))
+                }
+                model_loo_updated[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+                icc[grp == variable]$P <- anova(model_loo_updated[[variable]], model_loo_updated[["all"]])$`Pr(>Chisq)`[2]
+            }
+        }
+
+        interaction_variables <- c()
+
+        if ("Line" %in% variables & "Pseudotime" %in% variables){
+            interaction_variables <- c(interaction_variables, "Line:Pseudotime")
 
+            model_all_interaction <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, interaction_variables), collapse = ") + (1|"), ")"))
 
-if (P_value < 0.05/(length(variables) + 1)){
-    ### Test for amount of variance explained ###
-    model_pseudotime2 <- suppress_warnings(glmmTMB(Residuals ~ 1 + Pseudotime, data = df_hier_unscale, REML = TRUE), "giveCsparse")
+            boolFalse<-F
+            while(boolFalse==F & length(interaction_variables) > 0){
+            tryCatch({
+                print(c(variables, interaction_variables))
+                model_glmmtmb_interaction <- suppress_warnings(glmmTMB(formula = noquote(model_all_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+                boolFalse<-T
+            },error=function(e){
+                if (length(interaction_variables) > 1){
+                    interaction_variables <- interaction_variables[1:(length(interaction_variables) -1)]
+                } else {
+                    interaction_variables <- c()
+                }
+            })
+            }
+
+            if ("Line:Pseudotime" %in% interaction_variables){
+                ### Deal with singular fits by removing last variable until a fit can be found - ordered in variables buy importance
+                while (!model_glmmtmb_interaction$sdr$pdHess & length(interaction_variables) > 0 ){
+                    print("Singular fit: removing last variable and rerunning with one less covariate.")
+                    if (length(interaction_variables) > 1){
+                        interaction_variables <- interaction_variables[1:(length(interaction_variables) -1)]
+                        print(c(interaction_variables, variables))
+                        model_all_interaction <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, interaction_variables), collapse = ") + (1|"), ")"))
+                        model_glmmtmb_interaction <- suppress_warnings(glmmTMB(formula = noquote(model_all_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+                    } else {
+                        interaction_variables <- c()
+                    }
+                }
+
+                if ("Line:Pseudotime" %in% interaction_variables){
+                    
+                    model_loo_interaction <- list()
+
+                    icc_interaction <- data.table(grp = interaction_variables, P = as.numeric(NA))
+
+                    for (variable in c(interaction_variables)){
+                        print(variable)
+                        if (length(interaction_variables) > 1){
+                            model_interaction <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, interaction_variables)[!c(variables, interaction_variables) %in% variable], collapse = ") + (1|"), ")"))
+                        } else {
+                            model_interaction <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")"))
+                        }
+                        model_loo_interaction[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+                        icc_interaction[grp == variable]$P <- anova(model_loo_interaction[[variable]], model_glmmtmb_interaction)$`Pr(>Chisq)`[2]
+                    }
+
+                    if (icc_interaction[grp == "Line:Pseudotime"]$P < 0.05/length(c(variables, interaction_variables)) & !is.na(icc_interaction[grp == "Line:Pseudotime"]$P)){
+                        print("interaction significant:")
+                        print(icc_interaction)
+
+                        model_loo_interaction_updated <- model_loo_interaction
+
+                        updated_model_interaction <- as.formula(paste0("Expression ~ 1 + (1|", paste0(c(variables, interaction_variables), collapse = ") + (1|"), ")"))
+
+                        model_loo_interaction_updated[["all"]] <- suppress_warnings(glmmTMB(formula = noquote(updated_model_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+
+                        ### Calculate the variance explained by each of the included variables ###
+                        icc_interaction <- icc_glmmtmb(model_loo_interaction_updated[["all"]])
+
+
+                        ### Recalculate significance ###
+                        icc_interaction$P <- as.numeric(NA)
+                        icc_interaction$gene <- gene
+
+                        for (variable in c(variables, interaction_variables)){
+                            print(variable)
+                            # if (length(c(interaction_variables)) > 1){
+                                model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(c(variables, interaction_variables)[!c(variables, interaction_variables) %in% variable], collapse = ") + (1|"), ")"))
+                            # } else {
+                            #     model <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")"))
+                            # }
+                            model_loo_interaction_updated[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+                            icc_interaction[grp == variable]$P <- anova(model_loo_interaction_updated[[variable]], model_loo_interaction_updated[["all"]])$`Pr(>Chisq)`[2]
+                        }
+
+
+                        plot <- ggplot(df_hier_unscale, aes(Pseudotime, Expression, color = Line)) +
+                                    geom_point(alpha = 0.3)  +
+                                    # geom_smooth(method = "lm", se = TRUE) +
+                                    geom_smooth(se = TRUE) +
+                                    theme_classic() +
+                                    scale_color_manual(values = line_colors)
+
+                        ggsave(plot, filename = paste0(plot_outdir, gene,".png"), width = 5, height = 4)
+
+                        plot_facet <- ggplot(df_hier_unscale, aes(Pseudotime, Expression, color = Line)) +
+                                    geom_point(alpha = 0.3)  +
+                                    facet_wrap(vars(paste0(Site, " ", Cryopreserved)), ncol = 1, scales = "free_y") +
+                                    geom_smooth(se = TRUE) +
+                                    # geom_smooth(method = "lm", se = TRUE) +
+                                    theme_classic() +
+                                    scale_color_manual(values = line_colors)
+
+                        ggsave(plot_facet, filename = paste0(plot_outdir, gene,"_site_facet.png"), width = 4, height = 7)
+
+
+                        saveRDS(icc_interaction, paste0(icc_interaction_outdir, gene, "_icc.rds"), compress = TRUE)
+
+
+                        model_interaction_effects_formula <- as.formula(paste0("Expression ~ ", paste0(c(variables, interaction_variables), collapse = " + ")))
+                        model_interaction_effects <- suppress_warnings(glmmTMB(formula = noquote(model_interaction_effects_formula), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+
+                        dt <- data.table(grp = rownames(coef(summary(model_interaction_effects))$cond), Effect = coef(summary(model_interaction_effects))$cond[,"Estimate"], P = coef(summary(model_interaction_effects))$cond[,"Pr(>|z|)"])
+
+                        saveRDS(dt, paste0(effects_outdir, gene, "_effects.rds"), compress = TRUE)
+                    }
+                } 
+            } 
+        } 
+    } else {
+        icc <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character())
+    }
+}
 
 
+saveRDS(icc, paste0(icc_outdir, gene, "_icc.rds"), compress = TRUE)
 
-    ### Test for interactions
 
-}
diff --git a/Variance/RNAvelocity/post_review/pseudotime_effect.smk b/Variance/RNAvelocity/post_review/pseudotime_effect.smk
new file mode 100644
index 0000000..939eba1
--- /dev/null
+++ b/Variance/RNAvelocity/post_review/pseudotime_effect.smk
@@ -0,0 +1,41 @@
+import pandas as pd
+
+
+genes_file = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Variance/RNAvelocity/post_review/data/seurat_integrated_all_times_clustered_1pct_expressing_pseudotime.tsv"
+genes = pd.read_csv(genes_file, sep = "\t")
+# genes = genes.iloc[1:1000]
+
+
+rule all:
+    input:
+        expand( "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Variance/RNAvelocity/post_review/results/gene_separated/icc/{gene}_icc.rds", gene = genes.Gene)
+
+
+rule partition_variance:
+    input:
+        seurat = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Variance/RNAvelocity/post_review/data/seurat_integrated_all_times_clustered_1pct_expressing_pseudotime.rds"
+    output:
+        "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Variance/RNAvelocity/post_review/results/gene_separated/icc/{gene}_icc.rds"
+    resources:
+        mem_per_thread_gb = lambda wildcards, attempt: attempt * 64,
+        disk_per_thread_gb = lambda wildcards, attempt: attempt * 64
+    threads: 1
+    params:
+        script = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/Variance/RNAvelocity/post_review/pseudotime_effect.R",
+        out_icc="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Variance/RNAvelocity/post_review/results/gene_separated/icc/",
+        out_icc_interaction = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Variance/RNAvelocity/post_review/results/gene_separated/icc_interaction/",
+        out_model_interaction = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Variance/RNAvelocity/post_review/results/gene_separated/icc_interaction/",
+        out_plot = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Variance/RNAvelocity/post_review/results/gene_separated/plots/",
+        out_effects = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Variance/RNAvelocity/post_review/results/gene_separated/effect_betas/"
+    log:
+    shell:
+        """
+        mkdir -p {params.out_icc_interaction}
+        mkdir -p {params.out_model_interaction}
+        mkdir -p {params.out_plot}
+        mkdir -p {params.out_effects}
+        
+        /directflow/SCCGGroupShare/projects/DrewNeavin/software/anaconda3/envs/baseR402/bin/Rscript {params.script} {params.out_icc_interaction} {params.out_icc} {params.out_plot} {wildcards.gene} {params.out_effects}
+        """
+
+
diff --git a/Variance/RNAvelocity/post_review/pseudotime_effect_combine.R b/Variance/RNAvelocity/post_review/pseudotime_effect_combine.R
new file mode 100644
index 0000000..b5733f8
--- /dev/null
+++ b/Variance/RNAvelocity/post_review/pseudotime_effect_combine.R
@@ -0,0 +1,412 @@
+library(tidyverse)
+library(ggplot2)
+library(ggrepel)
+library(data.table)
+library(Seurat)
+library(viridis)
+library(colorspace)
+library(RColorBrewer)
+library(ggsignif)
+
+
+dir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/"
+datadir <- paste0(dir,"output/Variance/RNAvelocity/post_review/results/gene_separated/")
+icc_dir <- paste0(datadir,"icc/")
+icc_interaction_dir <- paste0(datadir,"icc_interaction/")
+effect_interaction_dir <- paste0(datadir,"effect_betas/")
+outdir <- paste0(dir,"output/Variance/RNAvelocity/post_review/results/combined/")
+dir.create(outdir)
+
+
+
+
+save_figs <- function(plot, basename, width = 17, height = 17, units = "cm"){
+    ggsave(plot, filename = paste0(basename,".png"), height = height, width = width, units = units)
+    ggsave(plot, filename = paste0(basename,".pdf"), height = height, width = width, units = units)
+    ggsave(plot, filename = paste0(basename,".eps"), height = height, width = width, units = units)
+}
+
+
+
+##### Set up colors #####
+variable_colors <- c(Village = "#a186aa", Replicate = "#f2c1ce", Line = "#4734a9") 
+line_colors <- c(FSA0006 = "#F79E29", MBE1006 = "#9B2C99", TOB0421 = "#35369C")
+
+
+site_updates <- c("Brisbane" = "Site 1", "Melbourne" = "Site 2" ,"Sydney" = "Site 3")
+
+
+##### Add gene IDs for easy identification downstream #####
+GeneConversion1 <- read_delim("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/data/Expression_200128_A00152_0196_BH3HNFDSXY/GE/DRENEA_1/outs/filtered_feature_bc_matrix/features.tsv.gz", col_names = F, delim = "\t")
+GeneConversion2 <- read_delim("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/data/Expression_200128_A00152_0196_BH3HNFDSXY/GE/Village_B_1_week/outs/filtered_feature_bc_matrix/features.tsv.gz", col_names = F, delim = "\t")
+
+GeneConversion <- unique(rbind(GeneConversion1, GeneConversion2))
+GeneConversion <- GeneConversion[!duplicated(GeneConversion$X1),]
+GeneConversion$X3 <- NULL
+colnames(GeneConversion) <- c("ENSG_ID", "Gene_ID")
+
+
+
+
+if (!file.exists(paste0(outdir,"variance_partitionin_df.tsv"))){
+	##### Get list of files #####
+	icc_files <- list.files(icc_dir)
+	icc_interaction_files <- list.files(icc_interaction_dir)
+	effect_interaction_files <- list.files(effect_interaction_dir)
+
+
+	##### Read in the files #####
+	icc_list <- lapply(icc_files, function(icc){
+		readRDS(paste0(icc_dir,icc))
+	})
+	names(icc_list) <- gsub("_icc.rds", "", icc_files)
+
+
+	icc_interaction_list <- lapply(icc_interaction_files, function(icc){
+		readRDS(paste0(icc_interaction_dir,icc))
+	})
+	names(icc_interaction_list) <- gsub("_icc.rds", "", icc_interaction_files)
+
+
+	effect_interaction_list <- lapply(effect_interaction_files, function(icc){
+		readRDS(paste0(effect_interaction_dir,icc))
+	})
+	names(effect_interaction_list) <- gsub("_effects.rds", "", effect_interaction_files)
+
+
+	##### Make dataframe of effect sizes from models #####
+	effect_interaction_dt_list <- lapply(effect_interaction_list, function(x){
+		tmp <- data.table(grp=character(), gene=character(), effect=numeric())
+
+	})
+
+
+	##### Combine fits Results #####
+	fits_df <- do.call(rbind, fits)
+
+	fits_df$Location <- gsub("_ENSG.+", "",rownames(fits_df))
+
+	write_delim(fits_df, paste0(outdir,"variance_partitionin_df.tsv"), delim = "\t")
+
+} else {
+	fits_df <- read_delim(paste0(outdir,"variance_partitionin_df.tsv"), delim = "\t")
+}
+
+
+### Read in the seurat data objects
+seurat_list <- list()
+for (location in unique(fits_df$Location)){
+	seurat_list[[location]] <- readRDS(paste0("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Variance/RNAvelocity/nb_partitioning/data/",location,"_SCT_seurat_1pct.rds"))
+}
+
+### Make dataframe of location x number for faulty models ###
+faulty_df <- data.table(Location = gsub("_nb\\.o.+", "", faulty$V1), Number = gsub(".+nb\\.o\\d+\\.", "", faulty$V1))
+
+## Add gene id to the faulty dataframe
+faulty_df$ENSG <- NA
+for (row in 1:nrow(faulty_df)){
+	print(row)
+	faulty_df$ENSG[row] <- rownames(seurat_list[[faulty_df$Location[row]]])[as.numeric(faulty_df$Number[row])] 
+}
+
+
+### Remove the faulty genes ###
+fits_df <- setDT(fits_df)[!faulty_df, on = c("Location", "Gene" = "ENSG")]
+
+
+
+##### Make long dataframe for plotting #####
+fits_df_long <- pivot_longer(fits_df, cols = c("Line", "Village", "Replicate"), names_to = "Covariate", values_to = "Variance_Explained")
+fits_df_long$Variance_Explained <- round(fits_df_long$Variance_Explained,6)
+
+fits_df_long <- left_join(fits_df_long, GeneConversion, by = c("Gene" = "ENSG_ID"))
+
+
+
+
+##### Make some figures!!! #####
+### FRESH ###
+pTotal_Cont <- ggplot(fits_df_long[which(fits_df_long$Location != "Sydney_Cryopreserved"),], aes(Variance_Explained*100, color = Covariate)) +
+	geom_density() +
+	facet_grid(vars(Location)) +
+	theme_classic()  +
+	# scale_y_continuous(trans = "log10") +
+	scale_x_continuous(trans = "log10") +
+	scale_color_manual(values = variable_colors) +
+	geom_vline(xintercept = 1, linetype="dashed")
+
+save_figs(pTotal_Cont, paste0(outdir, "Total_Contribution_Histogram_cov"))
+
+
+table(subset(fits_df_long[which(fits_df_long$Location != "Sydney_Cryopreserved"),], (Variance_Explained > 0.90))$Gene_ID)
+subset(fits_df_long[which(fits_df_long$Location != "Sydney_Cryopreserved"),], (Variance_Explained > 0.90))
+fits_df_long[grepl("XIST", fits_df_long$Gene_ID),]
+fits_df_long[grepl("MT-ATP6", fits_df_long$Gene_ID),]
+
+table(subset(fits_df_long[which(fits_df_long$Location != "Sydney_Cryopreserved" & fits_df_long$Covariate == "Village"),], (Variance_Explained > 0.46))$Gene_ID)
+
+for (location in names(site_updates)){
+	fits_df_long$Location <- gsub(location, site_updates[location], fits_df_long$Location)
+}
+
+fits_df_long$Location <- gsub("Cryo", "\nCryo", fits_df_long$Location)
+
+fits_df_long$Quartile <- gsub(".+_Q", "Q", fits_df_long$Location)
+fits_df_long$Location <- gsub("_Q\\d", "", fits_df_long$Location)
+fits_df_long$Location <- gsub("_", "", fits_df_long$Location)
+
+pVar_Explained_box <- ggplot(fits_df_long, aes(x = Covariate, y = Variance_Explained*100, color = Covariate)) +
+	geom_boxplot(outlier.size = 0.25) +
+	facet_grid(Location ~ Quartile) +
+	scale_color_manual(values = variable_colors) +
+	theme_classic() +
+	ylab("Percent Gene Expression Variance Explained")  +
+	theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
+save_figs(pVar_Explained_box, paste0(outdir, "Total_Contribution_Boxplot_cov"), width = 20, height = 12)
+
+
+
+fits_df_long[which(fits_df_long$Variance_Explained > 0.5 & fits_df_long$Covariate == "Village"),]
+table(fits_df_long[which(fits_df_long$Variance_Explained > 0.4 & fits_df_long$Covariate == "Village"),]$Gene)
+table(fits_df_long[which(fits_df_long$Variance_Explained > 0.5 & fits_df_long$Covariate == "Line"),]$Gene)
+head(fits_df_long[which(fits_df_long$Covariate == "Village"),][rev(order(na.omit(fits_df_long[which(fits_df_long$Covariate == "Village"),"Variance_Explained"]))),])
+
+
+
+
+### Variance explained correlation between sites ###
+fits_df_long$Location_Quartile <- paste0(fits_df_long$Location, "-", fits_df_long$Quartile)
+fits_df_long <- data.table(fits_df_long)
+
+fwrite(fits_df_long, sep = "\t", paste0(outdir, "var_explained_long.tsv"))
+
+### Subset
+fits_df_long_line <- fits_df_long[Covariate == "Line"]
+fits_df_long_rep <- fits_df_long[Covariate == "Replicate"]
+fits_df_long_village <- fits_df_long[Covariate == "Village"]
+
+
+### identify the combinatinons to use
+pairs_df <- as.data.table(t(combn(sort(unique(fits_df_long$Location_Quartile)), 2, simplify = TRUE)))
+colnames(pairs_df) <- c("Location_Quartile.x", "Location_Quartile.y")
+
+
+### Self Join ###
+fits_df_long_line_joined <- merge(pairs_df, fits_df_long_line, by.x = "Location_Quartile.x", by.y = "Location_Quartile", allow.cartesian=TRUE)
+fits_df_long_line_joined <- merge(fits_df_long_line_joined, fits_df_long_line, by.x = c("Location_Quartile.y", "Gene", "Covariate", "Gene_ID"), by.y = c("Location_Quartile", "Gene", "Covariate", "Gene_ID"), allow.cartesian=TRUE)
+fits_df_long_line_joined <- na.omit(fits_df_long_line_joined)
+
+fits_df_long_rep_joined <- merge(pairs_df, fits_df_long_rep, by.x = "Location_Quartile.x", by.y = "Location_Quartile", allow.cartesian=TRUE)
+fits_df_long_rep_joined <- merge(fits_df_long_rep_joined, fits_df_long_rep, by.x = c("Location_Quartile.y", "Gene", "Covariate", "Gene_ID"), by.y = c("Location_Quartile", "Gene", "Covariate", "Gene_ID"), allow.cartesian=TRUE)
+fits_df_long_rep_joined <- na.omit(fits_df_long_rep_joined)
+
+fits_df_long_village_joined <- merge(pairs_df, fits_df_long_village, by.x = "Location_Quartile.x", by.y = "Location_Quartile", allow.cartesian=TRUE)
+fits_df_long_village_joined <- merge(fits_df_long_village_joined, fits_df_long_village, by.x = c("Location_Quartile.y", "Gene", "Covariate", "Gene_ID"), by.y = c("Location_Quartile", "Gene", "Covariate", "Gene_ID"), allow.cartesian=TRUE)
+fits_df_long_village_joined <- na.omit(fits_df_long_village_joined)
+
+
+### Remove the ones that are merges between the same ###
+fits_df_long_line_joined <- fits_df_long_line_joined[which(fits_df_long_line_joined$Location_Quartile.x != fits_df_long_line_joined$Location_Quartile.y),]
+fits_df_long_rep_joined <- fits_df_long_rep_joined[which(fits_df_long_rep_joined$Location_Quartile.x != fits_df_long_rep_joined$Location_Quartile.y),]
+fits_df_long_village_joined <- fits_df_long_village_joined[which(fits_df_long_village_joined$Location_Quartile.x != fits_df_long_village_joined$Location_Quartile.y),]
+
+
+fits_df_long_joined_list <- list(fits_df_long_line_joined, fits_df_long_rep_joined, fits_df_long_village_joined)
+names(fits_df_long_joined_list) <- c("Line", "Replicate", "Village")
+
+
+# ################
+# lapply(fits_df_long_joined_list, function(x){
+# 	x$Location <- gsub("Site 3", "Site 4", x$Location) %>% gsub("Site 2", "Site 3", .) %>% gsub("Site 4", "Site 3", .) 
+# 	x$Location_Quartile.x <- gsub("Site 3", "Site 4", x$Location_Quartile.x) %>% gsub("Site 2", "Site 3", .) %>% gsub("Site 4", "Site 3", .) 
+# 	x$Location_Quartile.y <- gsub("Site 3", "Site 4", x$Location_Quartile.y) %>% gsub("Site 2", "Site 3", .) %>% gsub("Site 4", "Site 3", .) 
+# 	return(x)
+# })
+
+# ################
+
+
+
+saveRDS(fits_df_long_joined_list, paste0(outdir, "variance_explained_long_list.rds"))
+fits_df_long_joined_list <- readRDS(paste0(outdir, "variance_explained_long_list.rds"))
+
+Rsquared_list <- lapply(names(fits_df_long_joined_list), function(x){
+	tmp <- pairs_df
+	tmp$Covariate <- x
+	tmp$Rsquared <- NA
+	return(data.table(tmp))
+})
+names(Rsquared_list) <- names(fits_df_long_joined_list)
+
+
+
+Rsquared_list <- lapply(names(Rsquared_list), function(x){
+	print(x)
+	for (row in 1:nrow(Rsquared_list[[x]])){
+		print(row)
+		Rsquared_list[[x]]$Rsquared[row] <- round(summary(lm(Variance_Explained.y ~ Variance_Explained.x, fits_df_long_joined_list[[x]][Location_Quartile.x == Rsquared_list[[x]]$Location_Quartile.x[row] & Location_Quartile.y == Rsquared_list[[x]]$Location_Quartile.y[row],]))$r.squared,2)
+		# Rsquared_list[[x]]$pearson[row] <- round(cor(fits_df_long_joined_list[[x]][which(fits_df_long_joined_list[[x]]$Location_Quartile.x == Rsquared_list[[x]]$Location_Quartile.x[row] & 
+		# 																					fits_df_long_joined_list[[x]]$Location_Quartile.y == Rsquared_list[[x]]$Location_Quartile.y[row]),]$Variance_Explained.y, 
+		# 						fits_df_long_joined_list[[x]][which(fits_df_long_joined_list[[x]]$Location_Quartile.x == Rsquared_list[[x]]$Location_Quartile.x[row] & 
+		# 																					fits_df_long_joined_list[[x]]$Location_Quartile.y == Rsquared_list[[x]]$Location_Quartile.y[row]),]$Variance_Explained.x, use="complete.obs"), 2)
+	}
+
+	Rsquared_list[[x]] <- na.omit(Rsquared_list[[x]])
+	return(Rsquared_list[[x]])
+})
+names(Rsquared_list) <- names(fits_df_long_joined_list)
+
+saveRDS(Rsquared_list, paste0(outdir, "R_squared_list.rds"))
+
+
+
+
+fits_df_long_joined_list <- readRDS(paste0(outdir, "variance_explained_long_list.rds"))
+Rsquared_list <- readRDS(paste0(outdir, "R_squared_list.rds"))
+
+
+Rsquared_list <- lapply(Rsquared_list, function(x){
+	x$Location_Quartile.x <- gsub("Site 3", "Site 4", x$Location_Quartile.x) %>% gsub("Site 2", "Site 3", .) %>% gsub("Site 4", "Site 2", .) %>% gsub("Cryo", "\nCryo", .)
+	x$Location_Quartile.y <- gsub("Site 3", "Site 4", x$Location_Quartile.y) %>% gsub("Site 2", "Site 3", .) %>% gsub("Site 4", "Site 2", .) %>% gsub("Cryo", "\nCryo", .)
+	return(x)
+})
+
+
+pCorr_tile <- ggplot(Rsquared_list[["Line"]], aes(Location_Quartile.x, Location_Quartile.y, fill = Rsquared)) +
+				geom_tile() +
+				theme_classic() +
+				theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
+				scale_fill_continuous_sequential(palette = "Blues")
+ggsave(pCorr_tile, filename = paste0(outdir,"Rsquared_heatmap.png"), height = 5.85)
+
+
+
+##### Make lists that are only same site #####
+Rsquared_list_subset <- lapply(Rsquared_list, function(x){
+	tmp <- list()
+	for (location in unique(gsub("-.+", "", unique(c(x$Location_Quartile.x, x$Location_Quartile.y))))){
+		tmp[[location]] <- x[grepl(paste0(location, "-"), x$Location_Quartile.x) & grepl(paste0(location, "-"), Location_Quartile.y)]
+		tmp[[location]]$Location <- location
+		tmp[[location]]$Quintile.x <- gsub(".+-", "", tmp[[location]]$Location_Quartile.x)
+		tmp[[location]]$Quintile.y <- gsub(".+-", "", tmp[[location]]$Location_Quartile.y)
+	}
+	tmp2 <- do.call(rbind, tmp)
+	return(tmp2)
+})
+
+
+pCorr_tile <- ggplot(Rsquared_list_subset[["Line"]], aes(Quintile.x, Quintile.y, fill = Rsquared)) +
+				geom_tile() +
+				theme_classic() +
+				facet_wrap(vars(Location), nrow = 2) +
+				theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
+				scale_fill_continuous_sequential(palette = "Purples") +
+				xlab("Quintile") +
+				ylab("Quintile") +
+				labs(fill = "Line\nR^2")
+ggsave(pCorr_tile, filename = paste0(outdir,"Rsquared_heatmap_location_facet.png"), height = 3.25, width = 3.5)
+ggsave(pCorr_tile, filename = paste0(outdir,"Rsquared_heatmap_location_facet.pdf"), height = 3.25, width = 3.5)
+
+
+pCorr_Village<- ggplot(Rsquared_list_subset[["Village"]], aes(Quintile.x, Quintile.y, fill = Rsquared)) +
+				geom_tile() +
+				theme_classic() +
+				facet_wrap(vars(Location), nrow = 2) +
+				theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
+				scale_fill_continuous_sequential(palette = "Blues") +				
+				xlab("Quintile") +
+				ylab("Quintile") +
+				labs(fill = "Village\nR^2")
+ggsave(pCorr_Village, filename = paste0(outdir,"Rsquared_heatmap_village_facet.png"), height = 3.25, width = 3.5)
+ggsave(pCorr_Village, filename = paste0(outdir,"Rsquared_heatmap_village_facet.pdf"), height = 3.25, width = 3.5)
+
+
+
+
+
+##### Make lists that are only same quintile #####
+fits_df_long <- fread(sep = "\t", paste0(outdir, "var_explained_long.tsv"))
+
+
+Rsquared_list_subset_quint <- lapply(Rsquared_list, function(x){
+	tmp <- list()
+	for (quintile in unique(fits_df_long$Quartile)){
+		tmp[[quintile]] <- x[grepl(paste0("-", quintile), x$Location_Quartile.x) & grepl(paste0("-", quintile), x$Location_Quartile.y)]
+		tmp[[quintile]]$Quintile <- quintile
+		tmp[[quintile]]$Location.x <- gsub("-.+", "", tmp[[quintile]]$Location_Quartile.x)
+		tmp[[quintile]]$Location.y <- gsub("-.+", "", tmp[[quintile]]$Location_Quartile.y)
+	}
+	tmp2 <- do.call(rbind, tmp)
+	return(tmp2)
+})
+
+bluecols <- colorRampPalette(brewer.pal(9, "Blues"))
+purplecols <- colorRampPalette(brewer.pal(9, "Purples"))
+
+pCorr_Line_quant <- ggplot(Rsquared_list_subset_quint[["Line"]], aes(Location.x, Location.y, fill = Rsquared)) +
+				geom_tile() +
+				theme_classic() +
+				facet_wrap(vars(Quintile), nrow = 1) +
+				theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
+				scale_fill_gradientn(colours = purplecols(100), limits=c(0, 1))
+				# scale_fill_continuous_sequential(palette = "Purples")
+ggsave(pCorr_Line_quant, filename = paste0(outdir,"Rsquared_heatmap_line_facet_quint.png"), height = 2.5, width = 6.5)
+
+pCorr_Village_quant <- ggplot(Rsquared_list_subset_quint[["Village"]], aes(Location.x, Location.y, fill = Rsquared)) +
+				geom_tile() +
+				theme_classic() +
+				facet_wrap(vars(Quintile), nrow = 1) +
+				theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
+				scale_fill_gradientn(colours = bluecols(100), limits=c(0, 1))
+				# scale_fill_continuous_sequential(palette = "Blues", begin = 0, end = 1)
+ggsave(pCorr_Village_quant, filename = paste0(outdir,"Rsquared_heatmap_village_facet_quint.png"), height = 2.5, width = 6.5)
+
+
+
+pCorr_Location_Point_Line <- ggplot(fits_df_long_joined_list[["Line"]], aes(Variance_Explained.x*100, Variance_Explained.y*100, color = Covariate)) +
+	geom_point(size = 0.5, alpha = 0.5) +
+	facet_grid(rows = vars(Location.y), cols = vars(Location.x)) +
+	scale_color_manual(values = variable_colors[["Line"]]) +
+	theme_classic() +
+	ylim(0,110)+
+	# geom_text(size = 2.5, x = 0, y = 110, aes(label = paste0("R^2 = ", Rsquared)), data = Rsquared_list[["Line"]], hjust = 0) +
+	xlab("Percent Gene Expression Variance Explained") +
+	ylab("Percent Gene Expression Variance Explained") 
+save_figs(pCorr_Location_Point_Line, paste0(outdir, "Correlation_Point_Line"), width = 50, height = 5*7.25)
+
+
+pCorr_Location_Point_Village <- ggplot(fits_df_joined[which(fits_df_joined$Covariate == "Village" & fits_df_joined$Location.x != "Site 2_Cryopreserved" & fits_df_joined$Location.y != "Site 2_Cryopreserved"),], aes(Variance_Explained.x*100, Variance_Explained.y*100, color = Covariate)) +
+	geom_point(size = 0.5, alpha = 0.5) +
+	facet_grid(rows = vars(Location.y), cols = vars(Location.x)) +
+	scale_color_manual(values = variable_colors[["Village"]]) +
+	theme_classic() +
+	ylim(0,70)+
+	geom_text(size = 2.5, x = 0, y = 70, aes(label = paste0("R^2 = ", Rsquared)), data = Rsquared[which(Rsquared$Covariate == "Village" & Rsquared$Location.x != "Site 2_Cryopreserved" & Rsquared$Location.y != "Site 2_Cryopreserved"),], hjust = 0) +
+	xlab("Percent Gene Expression\nVariance Explained") +
+	ylab("Percent Gene Expression\nVariance Explained") +
+	geom_text_repel(size = 2.5, data = subset(fits_df_joined[which(fits_df_joined$Location.x != "Site 2_Cryopreserved" & fits_df_joined$Location.y != "Site 2_Cryopreserved"),], 
+							(Gene_ID %in% c("MT-ATP6", "MT-ND1") & Covariate == "Village")), 
+							aes(label = Gene_ID), box.padding =1)
+save_figs(pCorr_Location_Point_Village, paste0(outdir, "Correlation_Point_Village"), width = 10, height = 7.25)
+
+
+
+
+##### Make a point figure instead of heatmap for Correlation between sites for each quintile
+Rsquared_list_subset <- do.call(rbind, Rsquared_list_subset_quint)
+
+pR2_point <- ggplot(Rsquared_list_subset[Covariate != "Replicate"], aes(Covariate, Rsquared, color = Quintile)) +
+				geom_jitter(width = 0.1, size = 0.75) +
+				theme_classic() +
+				ylab("R^2") +
+				scale_color_viridis(discrete = TRUE) +
+				geom_signif(comparisons = list(c("Line", "Village")), 
+								map_signif_level=TRUE, y = 1,
+								test = "t.test", test.args=list(alternative = "two.sided", var.equal = FALSE, paired=TRUE))
+
+ggsave(pR2_point, filename = paste0(outdir, "R2_scatter.png"), width = 2.5, height = 3)
+ggsave(pR2_point, filename = paste0(outdir, "R2_scatter.pdf"), width = 2.5, height = 3)
+
+
+t.test(Rsquared_list_subset[Covariate == "Village"]$Rsquared, Rsquared_list_subset[Covariate == "Line"]$Rsquared, alternative = "two.sided", var.equal = FALSE, paired=TRUE)
+
diff --git a/Variance/RNAvelocity/post_review/pseudotime_effect_snake.sh b/Variance/RNAvelocity/post_review/pseudotime_effect_snake.sh
new file mode 100644
index 0000000..54f6b76
--- /dev/null
+++ b/Variance/RNAvelocity/post_review/pseudotime_effect_snake.sh
@@ -0,0 +1,135 @@
+
+SNAKEFILE="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/Variance/RNAvelocity/post_review/pseudotime_effect.smk"
+LOG="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Variance/RNAvelocity/post_review/logs/"
+
+mkdir -p $LOG
+
+cd /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/Variance/RNAvelocity/post_review
+
+snakemake \
+  --snakefile $SNAKEFILE \
+  --dryrun \
+  --cores 1 \
+  --quiet \
+  --rerun-incomplete \
+  --unlock
+
+
+snakemake \
+  --snakefile $SNAKEFILE \
+  --dryrun \
+  --cores 1 \
+  --reason > jobs2run.txt
+
+
+
+nohup \
+  snakemake \
+    --snakefile $SNAKEFILE \
+    --jobs 200 \
+    --use-singularity \
+    --restart-times 1 \
+    --rerun-incomplete \
+    --keep-going \
+    --cluster \
+        "qsub -S /bin/bash \
+        -q short.q \
+        -r yes \
+        -pe smp {threads} \
+        -l tmp_requested={resources.disk_per_thread_gb}G \
+        -l mem_requested={resources.mem_per_thread_gb}G \
+        -e $LOG \
+        -o $LOG \
+        -j y \
+        -V" \
+  > $LOG/nohup_`date +%Y-%m-%d.%H:%M:%S`.log &
+
+
+### Without -pe smp
+nohup \
+  snakemake \
+    --snakefile $SNAKEFILE \
+    --jobs 200 \
+    --use-singularity \
+    --restart-times 1 \
+    --rerun-incomplete \
+    --keep-going \
+    --cluster \
+        "qsub -S /bin/bash \
+        -q short.q \
+        -r yes \
+        -l tmp_requested={resources.disk_per_thread_gb}G \
+        -l mem_requested={resources.mem_per_thread_gb}G \
+        -e $LOG \
+        -o $LOG \
+        -j y \
+        -V" \
+  > $LOG/nohup_`date +%Y-%m-%d.%H:%M:%S`.log &
+
+34342
+
+snakemake \
+  --snakefile $SNAKEFILE \
+  --dryrun \
+  --cores 1 \
+  --quiet \
+  --unlock
+
+
+snakemake \
+  --snakefile $SNAKEFILE \
+  --dryrun \
+  --cores 1 \
+  --quiet \
+  --cleanup-metadata \
+  /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/icc/ENSG00000116001_icc.rds \
+  /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/fit_models/ENSG00000116001_fitted_models.rds \
+  /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/icc/ENSG00000084112_icc.rds \
+  /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/fit_models/ENSG00000084112_fitted_models.rds \
+  /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/icc/ENSG00000130706_icc.rds \
+  /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/fit_models/ENSG00000130706_fitted_models.rds
+
+
+
+
+
+
+ENSG00000140481
+ENSG00000158552
+ENSG00000167004
+ENSG00000184281
+ENSG00000185437
+ENSG00000185875
+ENSG00000196109
+ENSG00000197536
+ENSG00000215910
+ENSG00000215883
+ENSG00000258297
+ENSG00000259488
+ENSG00000274877
+
+
+ENSG00000158555
+ENSG00000167005
+ENSG00000184292
+ENSG00000184281
+ENSG00000185437
+ENSG00000185875
+ENSG00000196109
+ENSG00000197536
+ENSG00000215866
+ENSG00000258289
+ENSG00000259485
+ENSG00000274828
+
+
+rm ENSG00000167011_fitted_models.rds
+rm ENSG00000184304_fitted_models.rds
+rm ENSG00000185477_fitted_models.rds
+rm ENSG00000185900_fitted_models.rds
+rm ENSG00000196132_fitted_models.rds
+rm ENSG00000197566_fitted_models.rds
+rm ENSG00000217930_fitted_models.rds
+rm ENSG00000258593_fitted_models.rds
+rm ENSG00000259715_fitted_models.rds
+rm ENSG00000275074_fitted_models.rds
diff --git a/Variance/post_review2/variance_partition_post_review_combine.R b/Variance/post_review2/variance_partition_post_review_combine.R
index 5f9c3ce..ae6d13f 100644
--- a/Variance/post_review2/variance_partition_post_review_combine.R
+++ b/Variance/post_review2/variance_partition_post_review_combine.R
@@ -27,7 +27,7 @@ dir.create(outdir, recursive = TRUE)
 
 vars <- c("Line", "Village", "Cryopreserved", "Site",  "Replicate","Line:Village", "Line:Cryopreserved", "Line:Site", "Village:Cryopreserved","Village:Site",  "Replicate:Village", "Replicate:Line","Replicate:Cryopreserved",  "Replicate:Site", "Residual")
 # var_colors <- c(colorRampPalette(brewer.pal(11, "Spectral"))(14), "grey90")
-var_colors <- c("#4734a9", rev(c("#115cc7", "#a4c3c8", "#499090", "#405940", "#685a54", "#f7d312", "#f8bf33", "#e4910e", "#f65d19", "#931519", "#f2c1ce", "#e17aab", "#a186aa")), "gray90")
+var_colors <- c("#4734a9", rev(c("#115cc7", "#a4c3c8", "#499090", "#405940", "#685a54", "#f7d312", "#f8bf33", "#e4910e", "#f65d19", "#931519", "#f2c1ce", "#e17aab", "#a186aa")), "gray80")
 # var_colors <- c("#a2104d", "#fde64b", "#3e90c1", "#358856", "#f5764e", "#685ba6", "#728d01", "#abd302", "#8b6248", "#6ec4aa", "gray90")
 names(var_colors) <- vars
 
@@ -95,6 +95,9 @@ icc_interaction_dt$grp_size <- factor(icc_interaction_dt$grp_size, levels = uniq
 ### *** Need to add individual effects without interaction in to interaction dt *** ###
 icc_interaction_plus_dt <- rbind(icc_interaction_dt, icc_dt[!(gene %in% icc_interaction_dt$gene)])
 
+fwrite(icc_interaction_plus_dt, paste0(outdir, "effect_results.tsv"), sep = "\t")
+icc_interaction_plus_dt <- fread(paste0(outdir, "effect_results.tsv"), sep = "\t")
+
 
 group_size  <- data.table(table(icc_interaction_plus_dt$grp))
 colnames(group_size) <- c("grp", "size")
@@ -209,20 +212,24 @@ icc_interaction_plus_dt$grp_size <- factor(icc_interaction_plus_dt$grp_size, lev
                                                                                        unique(grep("Replicate:Site\nN ",  icc_interaction_plus_dt$grp_size, value = TRUE)), 
                                                                                        unique(grep("Residual\nN = ", icc_interaction_plus_dt$grp_size, value = TRUE))))
 
-pRaincloud_interaction <- ggplot(icc_interaction_plus_dt, aes(x = percent, y = factor(grp_size, levels = rev(levels(grp_size))), fill = factor(grp, levels = rev(vars)))) + 
-                geom_density_ridges(stat = "binline", bins = 90, scale = 0.7, draw_baseline = FALSE, aes(height =..ndensity..), alpha = 0.75) +
-                geom_boxplot(size = 0.5,width = .15, outlier.size = 0.25, position = position_nudge(y=-0.12), alpha = 0.75) +
+pRaincloud_interaction <- ggplot(icc_interaction_plus_dt, aes(x = percent, y = factor(grp_size, levels = levels(grp_size)), fill = factor(grp, levels = rev(vars)))) + 
+                # geom_density_ridges(size = 0.1,stat = "binline", bins = 100, scale = 0.7, draw_baseline = FALSE, aes(height =..ndensity..), alpha = 0.75) +
+                # geom_point(size =1,position = position_nudge(y=-0.1), shape = "|", aes(color = factor(grp, levels = rev(vars))), alpha = 0.75) +
+                geom_density_ridges(size = 0.1,stat = "binline", bins = 100, scale = 0.7, draw_baseline = FALSE, aes(height =..ndensity..)) +
+                geom_point(size =1, position = position_nudge(y=-0.11), shape = "|", aes(color = factor(grp, levels = rev(vars)))) +
+                # geom_boxplot(size = 0.1,width = .15, outlier.size = 0.1, position = position_nudge(y=-0.12), alpha = 0.75) +
                 coord_cartesian(xlim = c(1.2, NA), clip = "off") +
                 theme_classic() +
                 theme(axis.title.y=element_blank()) +
                 xlab("Percent Variance Explained") +
                 scale_y_discrete(expand = c(0.03, 0)) +
-                scale_fill_manual(values = var_colors) +
-                geom_vline(xintercept = 1, linetype = "dashed", color = "red")
+                scale_fill_manual(values = var_colors, name = "Variable") +
+                scale_color_manual(values = var_colors, name = "Variable") +
+                geom_vline(xintercept = 1, lty="11", color = "grey50", size = 0.5)
 
 
-ggsave(pRaincloud_interaction, filename = paste0(outdir, "variance_explained_raincloud_interaction.png"), height = 8, width = 7)
-ggsave(pRaincloud_interaction, filename = paste0(outdir, "variance_explained_raincloud_interaction.pdf"), height = 8, width = 7)
+ggsave(pRaincloud_interaction, filename = paste0(outdir, "variance_explained_raincloud_interaction.png"), height = 4, width = 7)
+ggsave(pRaincloud_interaction, filename = paste0(outdir, "variance_explained_raincloud_interaction.pdf"), height = 4, width = 7)
 
 
 
@@ -596,6 +603,108 @@ ggsave(pPluri_Genes_largest_Cont_eqtl, filename = paste0(outdir, "eQTL_Genes_Var
 
 
 
+eqtls_icc_1pct_grouped_line_dt <- eqtls_icc_1pct_grouped_dt[gene %in% unique(eqtls_icc_1pct_grouped_dt[grp == "Line"]$gene)]
+
+
+table(eqtls_icc_1pct_grouped_line_dt[gene %in% unique(eqtls_icc_1pct_grouped_dt[grp == "Line:Village"]$gene)]$grp)
+
+
+length(unique(eqtls$gene)) ## number that has snp-gene pairs in deboever and our data 4112
+length(unique(eqtls_icc$gene)) ## 2337
+length(unique(eqtls_icc_1pct_grouped_dt$gene)) ### 944
+table(eqtls_icc_1pct_grouped_dt$grp) ## 909 sig in line
+909/944 ### 0.9629237
+
+
+##### do for deboever as well #####
+
+##### check for variance explained for eQTL genes (from Kilpinen et al) that are #####
+deboever_eqtls <- fread("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/deboever_imputed_overlapping_filtered_header_pruned_snp_gene.tsv", sep = "\t")
+
+
+deboever_eqtls_icc <- icc_interaction_plus_dt[unique(deboever_eqtls[,"gene_id"]), on = c("gene" = "gene_id")]
+deboever_eqtls_icc$grp <- factor(deboever_eqtls_icc$grp, levels = rev(vars))
+deboever_eqtls_icc <- deboever_eqtls_icc[data.table(gene = deboever_eqtls_icc[grp == "Residual"][order(percent)]$gene), on = "gene"]
+deboever_eqtls_icc$Gene_ID <- factor(deboever_eqtls_icc$Gene_ID, levels = unique(deboever_eqtls_icc$Gene_ID))
+ 
+
+group_size_eqtl  <- data.table(table(deboever_eqtls_icc$grp))
+colnames(group_size_eqtl) <- c("grp", "size")
+group_size_eqtl$grp_size <- paste0(group_size_eqtl$grp, "\nN = ", group_size_eqtl$size)
+
+deboever_eqtls_icc <- group_size_eqtl[deboever_eqtls_icc, on = "grp"]
+grp_size_order_eqtl <- c("Line\nN = 2371", "Village\nN = 1836", "Site\nN = 2461", "Replicate\nN = 869", "Line:Village\nN = 367", "Line:Site\nN = 911", "Village:Site\nN = 898","Replicate:Village\nN = 305", "Replicate:Line\nN = 25", "Replicate:Site\nN = 78", "Residual\nN = 2542")
+deboever_eqtls_icc$grp_size <- factor(deboever_eqtls_icc$grp_size, levels = grp_size_order_eqtl)
+
+
+
+deboever_eqtls_icc_1pct <- deboever_eqtls_icc
+
+
+for (ensg in unique(deboever_eqtls_icc$gene)){
+    if (!any(deboever_eqtls_icc[gene == ensg & grp != "Residual"]$percent > 1)){
+        deboever_eqtls_icc_1pct <- deboever_eqtls_icc_1pct[gene != ensg]
+    }
+}
+
+
+
+deboever_eqtls_icc_1pct_grouped_list <- list()
+
+
+for (ensg in unique(deboever_eqtls_icc_1pct$gene)){
+    group <- deboever_eqtls_icc_1pct[gene == ensg & grp != "Residual"][which.max(percent)]$grp
+    deboever_eqtls_icc_1pct_grouped_list[[group]][[ensg]] <- deboever_eqtls_icc_1pct[gene == ensg]
+}
+
+deboever_eqtls_icc_1pct_grouped <- lapply(deboever_eqtls_icc_1pct_grouped_list, function(x) do.call(rbind, x))
+deboever_eqtls_icc_1pct_grouped <- lapply(names(deboever_eqtls_icc_1pct_grouped), function(x){
+    deboever_eqtls_icc_1pct_grouped[[x]]$largest_contributor <- x
+    return(deboever_eqtls_icc_1pct_grouped[[x]])
+})
+
+deboever_eqtls_icc_1pct_grouped_dt <- do.call(rbind, deboever_eqtls_icc_1pct_grouped)
+
+deboever_eqtls_icc_1pct_grouped_dt$largest_contributor <- factor(deboever_eqtls_icc_1pct_grouped_dt$largest_contributor, levels = c("Line", "Village", "Site", "Line:Village", "Line:Site", "Village:Site", "Replicate:Village"))
+
+
+
+pPluri_Genes_largest_Cont_eqtl <- ggplot() +
+						geom_bar(data = deboever_eqtls_icc_1pct_grouped_dt, aes(Gene_ID, percent, fill = factor(grp, levels = rev(vars))), position = "stack", stat = "identity", alpha = 0.75) +
+						theme_classic() +
+						facet_grid(. ~ largest_contributor, scales = "free_x", space = "free_x") +
+						scale_fill_manual(values = var_colors) +
+						ylab("Percent Gene Expression Variance Explained") +
+						theme(axis.title.x=element_blank(),
+                            axis.text.x = element_blank(),
+                            panel.spacing.x=unit(0, "lines"),
+                            axis.ticks.x = element_blank()) +
+                        geom_hline(yintercept = 1, linetype = "dashed") 
+                        # scale_y_discrete(expand = c(0.03, 0))
+                        # scale_x_discrete(expand = c(0.03, 0))
+
+
+ggsave(pPluri_Genes_largest_Cont_eqtl, filename = paste0(outdir, "deboever_eQTL_Genes_Variance_Contributions_1pct_largest_cont.png"), width = 10, height = 4)
+ggsave(pPluri_Genes_largest_Cont_eqtl, filename = paste0(outdir, "deboever_eQTL_Genes_Variance_Contributions_1pct_largest_cont.pdf"), width = 10, height = 4)
+
+
+
+deboever_eqtls_icc_1pct_grouped_line_dt <- deboever_eqtls_icc_1pct_grouped_dt[gene %in% unique(deboever_eqtls_icc_1pct_grouped_dt[grp == "Line"]$gene)]
+
+
+table(deboever_eqtls_icc_1pct_grouped_line_dt[gene %in% unique(deboever_eqtls_icc_1pct_grouped_dt[grp == "Line:Village"]$gene)]$grp)
+
+
+length(unique(deboever_eqtls$gene)) ## number that has snp-gene pairs in deboever and 
+length(unique(deboever_eqtls_icc$gene)) ## 2548
+length(unique(deboever_eqtls_icc_1pct_grouped_dt$gene)) ### 1189
+table(deboever_eqtls_icc_1pct_grouped_dt$grp) ## 1189 sig in line
+length(unique(deboever_eqtls_icc_1pct_grouped_dt$gene))/table(deboever_eqtls_icc_1pct_grouped_dt$grp)["Line"] ### 1
+
+
+
+
+
 
 
 
diff --git a/Variance/test_interaction_nb_model.R b/Variance/test_interaction_nb_model.R
index 926966c..51e8ba4 100644
--- a/Variance/test_interaction_nb_model.R
+++ b/Variance/test_interaction_nb_model.R
@@ -14,6 +14,8 @@ library(lmerTest)
 library(tictoc)
 library(Rfast)
 library(sjstats)
+library(car)
+library(data.table)
 
 
 
@@ -405,6 +407,89 @@ sum(icc_glmmtmb(model_glmmtmb_mt)$vcov)
 model_glmmtmb_short <- glmmTMB(Expression ~ (1 | Replicate) + (1 | Village) + (1 | Line) + (1 | Site), data = df_hier_unscale, REML = TRUE)
 model_glmmtmb_short_interaction <- glmmTMB(Expression ~ (1 | Replicate) + (1 | Village) + (1 | Line) + (1 | Site) + (1 | Village:Line), data = df_hier_unscale, REML = TRUE)
 
-anova(model_glmmtmb_short, model_glmmtmb_short_interaction)$`Pr(>Chisq)`
+anova(model_glmmtmb_short, model_glmmtmb_short_interaction)$`Pr(>Chisq)` ### the order of testing matters!!!
 
-ranova_mt <- ranova(model_glmmtmb_mt)
+
+
+# ranova_mt <- ranova(model_glmmtmb_mt)
+
+
+##### Leave one out method #####
+variables <- c("Replicate", "Village", "Line", "Site", "Replicate:Village", "Replicate:Line", "Replicate:Site", "Village:Line", "Village:Site", "Line:Site")
+
+model_loo <- list()
+
+sig_dt <- data.table(var = variables, P = as.numeric(NA))
+
+for (variable in variables){
+    print(variable)
+    model <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")"))
+    model_loo[[variable]] <- glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE)
+    sig_dt[var == variable]$P <- anova(model_loo[[variable]], model_glmmtmb)$`Pr(>Chisq)`[2]
+}
+
+
+updated_variables <- sig_dt[P < 0.05/10]$var
+updated_model <- as.formula(paste0("Expression ~ (1|", paste0(updated_variables, collapse = ") + (1|"), ")"))
+
+final_full_model <- glmmTMB(formula = noquote(updated_model), data = df_hier_unscale, REML = TRUE)
+
+icc_glmmtmb(final_full_model)
+
+model_loo_updated <- list()
+
+sig_dt_updated <- data.table(var = variables, P = as.numeric(NA))
+
+for (variable in updated_variables){
+    print(variable)
+    model <- as.formula(paste0("Expression ~ (1|", paste0(updated_variables[!updated_variables %in% variable], collapse = ") + (1|"), ")"))
+    model_loo_updated[[variable]] <- glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE)
+    sig_dt_updated[var == variable]$P <- anova(model_loo_updated[[variable]], final_full_model)$`Pr(>Chisq)`[2]
+}
+
+
+### Test mt gene
+model_loo_mt <- list()
+
+sig_dt_mt <- data.table(var = variables, P = as.numeric(NA))
+
+for (variable in variables){
+    print(variable)
+    model <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")"))
+    model_loo_mt[[variable]] <- glmmTMB(formula = noquote(model), data = df_hier_unscale_mt, REML = TRUE)
+    sig_dt_mt[var == variable]$P <- anova(model_loo_mt[[variable]], model_glmmtmb_mt)$`Pr(>Chisq)`[2]
+}
+
+
+updated_variables_mt <- sig_dt_mt[P < 0.05/10]$var
+updated_model_mt<- as.formula(paste0("Expression ~ (1|", paste0(updated_variables_mt, collapse = ") + (1|"), ")"))
+
+final_full_model_mt <- glmmTMB(formula = noquote(updated_model_mt), data = df_hier_unscale_mt, REML = TRUE)
+
+icc_glmmtmb(final_full_model_mt)
+
+model_loo_mt_updated <- list()
+
+sig_dt_mt_updated <- data.table(var = variables, P = as.numeric(NA))
+
+for (variable in updated_variables){
+    print(variable)
+    model <- as.formula(paste0("Expression ~ (1|", paste0(updated_variables[!updated_variables %in% variable], collapse = ") + (1|"), ")"))
+    model_loo_mt_updated[[variable]] <- glmmTMB(formula = noquote(model), data = df_hier_unscale_mt, REML = TRUE)
+    sig_dt_mt_updated[var == variable]$P <- anova(model_loo_mt_updated[[variable]], final_full_model)$`Pr(>Chisq)`[2]
+}
+
+
+
+### Check if order of fixed effects impacts the residuals (will save to be used for eQTL detection) ###
+model_fixed <- as.formula(paste0("Expression ~ ", paste0(variables, collapse = " + ")))
+model_random <- as.formula(paste0("Expression ~ (1|", paste0(variables, collapse = ") + (1|"), ")"))
+model_glmmtmb_fixed <- glmmTMB(model, data = df_hier_unscale_mt, REML = TRUE)
+model_glmmtmb_random <- glmmTMB(model, data = df_hier_unscale_mt, REML = TRUE)
+
+
+all(resid(model_glmmtmb_fixed) == resid(model_glmmtmb_random))
+
+
+
+model_glmmtmb_mt <- glmmTMB(Expression ~ (1 | Replicate) + (1 | Village) + (1 | Line) + (1 | Site) + (1|Replicate:Village) + (1|Replicate:Line) + (1|Replicate:Site) + (1|Village:Line) + (1|Village:Site) + (1|Line:Site), data = df_hier_unscale_mt, REML = TRUE)
diff --git a/eQTL_check/eQTL_village_interaction.sh b/eQTL_check/eQTL_village_interaction.sh
new file mode 100644
index 0000000..dd3a06a
--- /dev/null
+++ b/eQTL_check/eQTL_village_interaction.sh
@@ -0,0 +1,109 @@
+
+
+SNAKEFILE="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/eQTL_check/eQTL_village_interaction.smk"
+LOG="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/eQTL_village_interaction/deboever/logs"
+
+mkdir -p $LOG
+
+snakemake \
+  --snakefile $SNAKEFILE \
+  --dryrun \
+  --cores 1 \
+  --quiet \
+  --unlock
+
+
+snakemake \
+  --snakefile $SNAKEFILE \
+  --dryrun \
+  --cores 1 \
+  --reason > jobs2run.txt
+
+
+
+nohup \
+  snakemake \
+    --snakefile $SNAKEFILE \
+    --jobs 200 \
+    --use-singularity \
+    --restart-times 1 \
+    --keep-going \
+    --cluster \
+        "qsub -S /bin/bash \
+        -q short.q \
+        -r yes \
+        -pe smp {threads} \
+        -l tmp_requested={resources.disk_per_thread_gb}G \
+        -l mem_requested={resources.mem_per_thread_gb}G \
+        -e $LOG \
+        -o $LOG \
+        -j y \
+        -V" \
+  > $LOG/nohup_`date +%Y-%m-%d.%H:%M:%S`.log &
+
+
+snakemake \
+  --snakefile $SNAKEFILE \
+  --dryrun \
+  --cores 1 \
+  --quiet \
+  --unlock
+
+
+snakemake \
+  --snakefile $SNAKEFILE \
+  --dryrun \
+  --cores 1 \
+  --quiet \
+  --cleanup-metadata \
+  /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/icc/ENSG00000116001_icc.rds \
+  /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/fit_models/ENSG00000116001_fitted_models.rds \
+  /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/icc/ENSG00000084112_icc.rds \
+  /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/fit_models/ENSG00000084112_fitted_models.rds \
+  /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/icc/ENSG00000130706_icc.rds \
+  /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/fit_models/ENSG00000130706_fitted_models.rds
+
+
+
+
+
+
+ENSG00000140481
+ENSG00000158552
+ENSG00000167004
+ENSG00000184281
+ENSG00000185437
+ENSG00000185875
+ENSG00000196109
+ENSG00000197536
+ENSG00000215910
+ENSG00000215883
+ENSG00000258297
+ENSG00000259488
+ENSG00000274877
+
+
+ENSG00000158555
+ENSG00000167005
+ENSG00000184292
+ENSG00000184281
+ENSG00000185437
+ENSG00000185875
+ENSG00000196109
+ENSG00000197536
+ENSG00000215866
+ENSG00000258289
+ENSG00000259485
+ENSG00000274828
+
+
+rm ENSG00000167011_fitted_models.rds
+rm ENSG00000184304_fitted_models.rds
+rm ENSG00000185477_fitted_models.rds
+rm ENSG00000185900_fitted_models.rds
+rm ENSG00000196132_fitted_models.rds
+rm ENSG00000197566_fitted_models.rds
+rm ENSG00000217930_fitted_models.rds
+rm ENSG00000258593_fitted_models.rds
+rm ENSG00000259715_fitted_models.rds
+rm ENSG00000275074_fitted_models.rds
diff --git a/eQTL_check/eQTL_village_interaction.smk b/eQTL_check/eQTL_village_interaction.smk
new file mode 100644
index 0000000..b37e717
--- /dev/null
+++ b/eQTL_check/eQTL_village_interaction.smk
@@ -0,0 +1,59 @@
+import pandas as pd
+
+
+genes_file = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/deboever_imputed_overlapping_filtered_header_pruned_snp_gene.tsv"
+genes_file_kilpinen = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/kilpinen_imputed_overlapping_filtered_header_pruned_snp_gene.tsv"
+genes = pd.read_csv(genes_file, sep = "\t")
+# genes = genes.iloc[1]
+genes_kilpenen = pd.read_csv(genes_file_kilpinen, sep = "\t")
+# genes_kilpenen = genes_kilpenen.iloc[1]
+
+rule all:
+    input:
+        expand("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/eQTL_village_interaction/deboever/results/{gene}_snpXvillage_interactions.tsv", gene = genes.gene_id),
+        expand("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/eQTL_village_interaction/kilpinen/results/{gene}_snpXvillage_interactions.tsv", gene = genes_kilpenen.gene_id),
+
+
+rule interaction_deboever:
+    input:
+        bed = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/deboever_imputed_overlapping_filtered_header_pruned.bed"
+    output:
+        "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/eQTL_village_interaction/deboever/results/{gene}_snpXvillage_interactions.tsv",
+    resources:
+        mem_per_thread_gb = lambda wildcards, attempt: attempt * 8,
+        disk_per_thread_gb = lambda wildcards, attempt: attempt * 8
+    threads: 4
+    params:
+        script = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/eQTL_check/eQTL_village_interaction_deboever.R",
+        outdir="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/eQTL_village_interaction/deboever/results/",
+        snp=lambda wildcards: genes.ID_ref_alt[genes.gene_id == wildcards.gene],
+        datadir = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/eQTL_village_interaction/deboever/data/"
+    log:
+    shell:
+        """        
+        mkdir -p {params.datadir}
+        /directflow/SCCGGroupShare/projects/DrewNeavin/software/anaconda3/envs/baseR402/bin/Rscript {params.script} {wildcards.gene} {params.outdir} {input.bed} {params.datadir}
+        """
+
+
+
+rule interaction_kilpinen:
+    input:
+        bed = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/kilpinen_imputed_overlapping_filtered_header_pruned.bed"
+    output:
+        "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/eQTL_village_interaction/kilpinen/results/{gene}_snpXvillage_interactions.tsv",
+    resources:
+        mem_per_thread_gb = lambda wildcards, attempt: attempt * 8,
+        disk_per_thread_gb = lambda wildcards, attempt: attempt * 8
+    threads: 4
+    params:
+        script = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/eQTL_check/eQTL_village_interaction_kilpinen.R",
+        outdir="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/eQTL_village_interaction/kilpinen/results/",
+        snp=lambda wildcards: genes.ID_ref_alt[genes.gene_id == wildcards.gene],
+        datadir = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/eQTL_village_interaction/kilpinen/data/"
+    log:
+    shell:
+        """  
+        mkdir -p {params.datadir}
+        /directflow/SCCGGroupShare/projects/DrewNeavin/software/anaconda3/envs/baseR402/bin/Rscript {params.script} {wildcards.gene} {params.outdir} {input.bed} {params.datadir}
+        """
diff --git a/eQTL_check/eQTL_village_interaction_combine.R b/eQTL_check/eQTL_village_interaction_combine.R
new file mode 100644
index 0000000..0e7973c
--- /dev/null
+++ b/eQTL_check/eQTL_village_interaction_combine.R
@@ -0,0 +1,357 @@
+library(data.table)
+library(dplyr)
+library(ggplot2)
+library(colorspace)
+library(DEGreport)
+library(ggpp)
+
+
+deboever_indir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/eQTL_village_interaction/deboever/results/"
+kilpinen_indir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/eQTL_village_interaction/kilpinen/results/"
+outdir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/eQTL_village_interaction/combined/"
+
+dir.create(outdir, recursive = TRUE)
+
+
+##### Read in results #####
+### Deboever
+deboever_files <- list.files(deboever_indir)
+
+
+deboever_results_list <- lapply(deboever_files, function(x){
+    tmp <- fread(paste0(deboever_indir, x), sep = "\t")
+    tmp$ensg <- gsub("_snpXvillage_interactions.tsv", "", x)
+    return(tmp)
+})
+
+
+
+deboever_results_dt <- do.call(rbind, deboever_results_list)
+deboever_results_dt$snp_fdr <- p.adjust(deboever_results_dt$snp_p, method = "fdr")
+deboever_results_dt$snpXvillage_fdr <- p.adjust(deboever_results_dt$snpXvillage_p, method = "fdr")
+deboever_results_dt$study <- "DeBoever"
+
+
+### Kilpinen
+kilpinen_files <- list.files(kilpinen_indir)
+
+kilpinen_results_list <- lapply(kilpinen_files, function(x){
+    tmp <- fread(paste0(kilpinen_indir, x), sep = "\t")
+    tmp$ensg <- gsub("_snpXvillage_interactions.tsv", "", x)
+    return(tmp)
+})
+
+
+kilpinen_results_dt <- do.call(rbind, kilpinen_results_list)
+kilpinen_results_dt$snp_fdr <- p.adjust(kilpinen_results_dt$snp_p, method = "fdr")
+kilpinen_results_dt$snpXvillage_fdr <- p.adjust(kilpinen_results_dt$snpXvillage_p, method = "fdr")
+kilpinen_results_dt$study <- "Kilpinen"
+
+
+results_dt <- rbind(deboever_results_dt,kilpinen_results_dt)
+
+
+results_dt[order(snp_fdr)]
+results_dt[order(snpXvillage_fdr)]
+
+### See how many snps have all different genotypes for each cell line
+results_dt[TOB0421 != FSA0006 & TOB0421 != MBE1006 & FSA0006 != MBE1006] ## 1108 total
+
+fwrite(results_dt, paste0(outdir, "snpXvillage_results.tsv"), sep = "\t")
+
+
+results_dt$significant <- ifelse(results_dt$snpXvillage_fdr < 0.05, TRUE, FALSE)
+
+results_dt[ensg == "ENSG00000106153"]
+
+results_dt$snpXvillage_sig <- ifelse(results_dt$snpXvillage_fdr < 0.05, "significant", "not significant")
+
+
+correlation <- list()
+correlation_dt <- data.table(study = unique(results_dt$study), rho = as.numeric(NA), p = as.numeric(NA))
+
+for (Study in results_dt$study){
+    correlation[[Study]] <- cor.test(results_dt[study == Study]$snp_beta, results_dt[study == Study]$snpXvillage_beta, method = "spearman", exact = TRUE)
+    correlation_dt[study == Study, "rho"] <- correlation[[Study]]$estimate[1]
+    correlation_dt[study == Study, "p"] <- correlation[[Study]]$p.value
+}
+
+
+correlation_plot <- ggplot(results_dt[study == "DeBoever"]) +
+        geom_hline(yintercept=0) +
+        geom_vline(xintercept=0) +
+        geom_point(aes(snp_beta, snpXvillage_beta)) +
+        theme_classic() +
+        # facet_wrap(vars(study)) +
+        xlab("SNP Effect Size") +
+        ylab("SNP x Village Interaction Effect Size") +
+        geom_text(data = correlation_dt, aes(x = -3.85, y = 5.5, label = paste0("Rho = ",round(rho,2),"***")))
+        #  scale_color_discrete_sequential(palette = "ag_GrnYl", rev = FALSE) +
+
+ggsave(correlation_plot, filename = paste0(outdir, "correlation_plot.png"), height = 3, width = 5.5)
+
+
+
+### add number of genes in each quadrant
+dim(results_dt[snp_beta < 0 & snpXvillage_beta < 0]) # 2216
+dim(results_dt[snp_beta > 0 & snpXvillage_beta > 0]) # 2103
+
+dim(results_dt[snp_beta < 0 & snpXvillage_beta > 0]) # 1155
+dim(results_dt[snp_beta > 0 & snpXvillage_beta < 0]) # 1160
+
+
+dim(results_dt[snp_beta < 0 & snpXvillage_beta < 0 & study == "DeBoever"]) # 1670
+dim(results_dt[snp_beta > 0 & snpXvillage_beta > 0 & study == "DeBoever"]) # 1582
+
+dim(results_dt[snp_beta < 0 & snpXvillage_beta > 0 & study == "DeBoever"]) # 546
+dim(results_dt[snp_beta > 0 & snpXvillage_beta < 0 & study == "DeBoever"]) # 586
+
+
+dim(results_dt[snp_beta < 0 & snpXvillage_beta < 0 & study == "Kilpinen"]) # 546
+dim(results_dt[snp_beta > 0 & snpXvillage_beta > 0 & study == "Kilpinen"]) # 521
+
+dim(results_dt[snp_beta < 0 & snpXvillage_beta > 0 & study == "Kilpinen"]) # 609
+dim(results_dt[snp_beta > 0 & snpXvillage_beta < 0 & study == "Kilpinen"]) # 574
+
+
+### Sig
+dim(results_dt[snp_beta < 0 & snpXvillage_beta < 0 & snpXvillage_sig == "significant"]) # 15
+dim(results_dt[snp_beta > 0 & snpXvillage_beta > 0 & snpXvillage_sig == "significant"]) # 18
+
+dim(results_dt[snp_beta < 0 & snpXvillage_beta > 0 & snpXvillage_sig == "significant"]) # 67
+dim(results_dt[snp_beta > 0 & snpXvillage_beta < 0 & snpXvillage_sig == "significant"]) # 72
+
+
+dim(results_dt[snp_beta < 0 & snpXvillage_beta < 0 & study == "DeBoever" & snpXvillage_sig == "significant"]) # 13
+dim(results_dt[snp_beta > 0 & snpXvillage_beta > 0 & study == "DeBoever" & snpXvillage_sig == "significant"]) # 14
+
+dim(results_dt[snp_beta < 0 & snpXvillage_beta > 0 & study == "DeBoever" & snpXvillage_sig == "significant"]) # 47
+dim(results_dt[snp_beta > 0 & snpXvillage_beta < 0 & study == "DeBoever" & snpXvillage_sig == "significant"]) # 46
+
+
+dim(results_dt[snp_beta < 0 & snpXvillage_beta < 0 & study == "Kilpinen" & snpXvillage_sig == "significant"]) # 2
+dim(results_dt[snp_beta > 0 & snpXvillage_beta > 0 & study == "Kilpinen" & snpXvillage_sig == "significant"]) # 4
+
+dim(results_dt[snp_beta < 0 & snpXvillage_beta > 0 & study == "Kilpinen" & snpXvillage_sig == "significant"]) # 20
+dim(results_dt[snp_beta > 0 & snpXvillage_beta < 0 & study == "Kilpinen" & snpXvillage_sig == "significant"]) # 26
+
+
+### Filter by genes that matched direction in original dataset ###
+effect_match_dt <- fread( "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/eQTL_overlap/combined/overlap_results.tsv")
+
+effect_match_dt_sig_match <- effect_match_dt[direction == "match" & fdr < 0.05]
+
+
+results_dt_sig_match <- results_dt[effect_match_dt_sig_match[,c("study", "ensg", "ID_ref_alt")], on = c("study", "ensg", "ID_ref_alt")]
+
+
+
+correlation_sig_match <- list()
+correlation_sig_match_dt <- data.table(study = unique(results_dt_sig_match$study), rho = as.numeric(NA), p = as.numeric(NA))
+
+for (Study in results_dt_sig_match$study){
+    correlation_sig_match[[Study]] <- cor.test(results_dt_sig_match[study == Study]$snp_beta, results_dt_sig_match[study == Study]$snpXvillage_beta, method = "spearman", exact = TRUE)
+    correlation_sig_match_dt[study == Study, "rho"] <- correlation_sig_match[[Study]]$estimate[1]
+    correlation_sig_match_dt[study == Study, "p"] <- correlation_sig_match[[Study]]$p.value
+}
+
+
+correlation_sig_match_plot <- ggplot(results_dt_sig_match[study == "DeBoever"], aes(snp_beta, snpXvillage_beta)) +
+        geom_hline(yintercept=0) +
+        geom_vline(xintercept=0) +
+        geom_point() +
+        theme_classic() +
+        # facet_wrap(vars(study)) +
+        xlab("SNP Effect Size") +
+        ylab("SNP*Village Interaction\nEffect Size") +
+        geom_text(data = correlation_sig_match_dt, aes(x = -4, y = 0.5, label = paste0("Rho = ",round(rho,2),"***"))) +
+        stat_quadrant_counts() +
+        xlim(-7,3) +
+        ylim(-7,3)
+        #  scale_color_discrete_sequential(palette = "ag_GrnYl", rev = FALSE) +
+
+ggsave(correlation_sig_match_plot, filename = paste0(outdir, "correlation_sig_match_plot.png"), height = 2.5, width = 2.75)
+ggsave(correlation_sig_match_plot, filename = paste0(outdir, "correlation_sig_match_plot.pdf"), height = 2.5, width = 2.75)
+
+
+
+### add number of genes in each quadrant
+dim(results_dt_sig_match[snp_beta < 0 & snpXvillage_beta < 0]) # 2216
+dim(results_dt_sig_match[snp_beta > 0 & snpXvillage_beta > 0]) # 2103
+
+dim(results_dt_sig_match[snp_beta < 0 & snpXvillage_beta > 0]) # 1155
+dim(results_dt_sig_match[snp_beta > 0 & snpXvillage_beta < 0]) # 1160
+
+
+dim(results_dt_sig_match[snp_beta < 0 & snpXvillage_beta < 0 & study == "DeBoever"]) # 1670
+dim(results_dt_sig_match[snp_beta > 0 & snpXvillage_beta > 0 & study == "DeBoever"]) # 1582
+
+dim(results_dt_sig_match[snp_beta < 0 & snpXvillage_beta > 0 & study == "DeBoever"]) # 546
+dim(results_dt_sig_match[snp_beta > 0 & snpXvillage_beta < 0 & study == "DeBoever"]) # 586
+
+
+dim(results_dt_sig_match[snp_beta < 0 & snpXvillage_beta < 0 & study == "Kilpinen"]) # 546
+dim(results_dt_sig_match[snp_beta > 0 & snpXvillage_beta > 0 & study == "Kilpinen"]) # 521
+
+dim(results_dt_sig_match[snp_beta < 0 & snpXvillage_beta > 0 & study == "Kilpinen"]) # 609
+dim(results_dt_sig_match[snp_beta > 0 & snpXvillage_beta < 0 & study == "Kilpinen"]) # 574
+
+
+### Sig
+dim(results_dt_sig_match[snp_beta < 0 & snpXvillage_beta < 0 & snpXvillage_sig == "significant"]) # 15
+dim(results_dt_sig_match[snp_beta > 0 & snpXvillage_beta > 0 & snpXvillage_sig == "significant"]) # 18
+
+dim(results_dt_sig_match[snp_beta < 0 & snpXvillage_beta > 0 & snpXvillage_sig == "significant"]) # 67
+dim(results_dt_sig_match[snp_beta > 0 & snpXvillage_beta < 0 & snpXvillage_sig == "significant"]) # 72
+
+
+dim(results_dt_sig_match[snp_beta < 0 & snpXvillage_beta < 0 & study == "DeBoever" & snpXvillage_sig == "significant"]) # 13
+dim(results_dt_sig_match[snp_beta > 0 & snpXvillage_beta > 0 & study == "DeBoever" & snpXvillage_sig == "significant"]) # 14
+
+dim(results_dt_sig_match[snp_beta < 0 & snpXvillage_beta > 0 & study == "DeBoever" & snpXvillage_sig == "significant"]) # 47
+dim(results_dt_sig_match[snp_beta > 0 & snpXvillage_beta < 0 & study == "DeBoever" & snpXvillage_sig == "significant"]) # 46
+
+
+dim(results_dt_sig_match[snp_beta < 0 & snpXvillage_beta < 0 & study == "Kilpinen" & snpXvillage_sig == "significant"]) # 2
+dim(results_dt_sig_match[snp_beta > 0 & snpXvillage_beta > 0 & study == "Kilpinen" & snpXvillage_sig == "significant"]) # 4
+
+dim(results_dt_sig_match[snp_beta < 0 & snpXvillage_beta > 0 & study == "Kilpinen" & snpXvillage_sig == "significant"]) # 20
+dim(results_dt_sig_match[snp_beta > 0 & snpXvillage_beta < 0 & study == "Kilpinen" & snpXvillage_sig == "significant"]) # 26
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+### Filter by original interaction significant genes ###
+effects_dt <- fread("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review2/combined/effect_results.tsv")
+
+interaction_sig_genes <- unique(effects_dt[grp == "Line:Village", "gene"])
+
+results_dt_subset <- results_dt[ensg %in% interaction_sig_genes$gene]
+
+correlation_plot_sub <- ggplot(results_dt_subset, aes(snp_beta, snpXvillage_beta, color = snpXvillage_sig)) +
+        geom_point() +
+        theme_classic() +
+         scale_color_discrete_sequential(palette = "ag_GrnYl", rev = FALSE) +
+         facet_wrap(vars(study)) +
+         geom_hline(yintercept=0) +
+         geom_vline(xintercept=0)
+        # scale_color_manual(values = c("grey","black"))
+
+ggsave(correlation_plot_sub, filename = paste0(outdir, "correlation_plot_var_sig.png"), height = 2.25, width = 5.5)
+
+
+
+### add number of genes in each quadrant
+dim(results_dt_subset[snp_beta < 0 & snpXvillage_beta < 0]) # 202
+dim(results_dt_subset[snp_beta > 0 & snpXvillage_beta > 0]) # 232
+
+dim(results_dt_subset[snp_beta < 0 & snpXvillage_beta > 0]) # 304
+dim(results_dt_subset[snp_beta > 0 & snpXvillage_beta < 0]) # 323
+
+
+dim(results_dt_subset[snp_beta < 0 & snpXvillage_beta < 0 & study == "DeBoever"]) # 144
+dim(results_dt_subset[snp_beta > 0 & snpXvillage_beta > 0 & study == "DeBoever"]) # 153
+
+dim(results_dt_subset[snp_beta < 0 & snpXvillage_beta > 0 & study == "DeBoever"]) # 199
+dim(results_dt_subset[snp_beta > 0 & snpXvillage_beta < 0 & study == "DeBoever"]) # 219
+
+
+dim(results_dt_subset[snp_beta < 0 & snpXvillage_beta < 0 & study == "Kilpinen"]) # 58
+dim(results_dt_subset[snp_beta > 0 & snpXvillage_beta > 0 & study == "Kilpinen"]) # 79
+
+dim(results_dt_subset[snp_beta < 0 & snpXvillage_beta > 0 & study == "Kilpinen"]) # 105
+dim(results_dt_subset[snp_beta > 0 & snpXvillage_beta < 0 & study == "Kilpinen"]) # 104
+
+
+
+deboever_eqtls <- fread("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/deboever/combined/deboever_overlap_results.tsv", sep = "\t")
+deboever_eqtls_dif_snp <- deboever_eqtls[direction == "different_snp"]
+
+results_dt_subset_correct_direction <- results_dt_subset[!(ID_ref_alt %in% deboever_eqtls_dif_snp$gene_id_deboever)]
+
+
+deboever_results_dt[deboever_eqtls[, .(gene_id_deboever, ID_ref_alt)], on = c("ensg" = "gene_id_deboever", "ID_ref_alt")][ensg == "ENSG00000106153"]
+
+
+
+### add number of genes in each quadrant
+dim(results_dt_subset_correct_direction[snp_beta < 0 & snpXvillage_beta < 0]) # 144
+dim(results_dt_subset_correct_direction[snp_beta > 0 & snpXvillage_beta > 0]) # 153
+
+dim(results_dt_subset_correct_direction[snp_beta < 0 & snpXvillage_beta > 0]) # 199
+dim(results_dt_subset_correct_direction[snp_beta > 0 & snpXvillage_beta < 0]) # 219
+
+
+dim(results_dt_subset_correct_direction[snp_beta < 0 & snpXvillage_beta < 0 & study == "DeBoever"]) # 144
+dim(results_dt_subset_correct_direction[snp_beta > 0 & snpXvillage_beta > 0 & study == "DeBoever"]) # 153
+
+dim(results_dt_subset_correct_direction[snp_beta < 0 & snpXvillage_beta > 0 & study == "DeBoever"]) # 199
+dim(results_dt_subset_correct_direction[snp_beta > 0 & snpXvillage_beta < 0 & study == "DeBoever"]) # 219
+
+
+dim(results_dt_subset_correct_direction[snp_beta < 0 & snpXvillage_beta < 0 & study == "Kilpinen"]) # 58
+dim(results_dt_subset_correct_direction[snp_beta > 0 & snpXvillage_beta > 0 & study == "Kilpinen"]) # 79
+
+dim(results_dt_subset_correct_direction[snp_beta < 0 & snpXvillage_beta > 0 & study == "Kilpinen"]) # 105
+dim(results_dt_subset_correct_direction[snp_beta > 0 & snpXvillage_beta < 0 & study == "Kilpinen"]) # 104
+
+
+
+
+
+histogram_plot <- ggplot(results_dt, aes(fdr, fill = direction)) +
+        geom_histogram(position = "stack", binwidth = 0.05) +
+        theme_classic() +
+        geom_vline(aes(xintercept=0.05),
+            color="red", linetype="dashed", size=1) +
+        scale_fill_manual(values = c("grey","black"))
+        
+ggsave(histogram_plot, filename = paste0(outdir, "histogram_plot.png"))
+
+histogram_plot_log <- ggplot(results_dt, aes(log10(fdr), fill = direction)) +
+        geom_histogram(position = "stack", binwidth = 1) +
+        theme_classic() +
+        geom_vline(aes(xintercept=log10(0.05)),
+            color="red", linetype="dashed", size=0.25) +
+        scale_fill_manual(values = c("grey","black")) +
+        scale_y_continuous(expand = c(0, 0)) +
+        ylab("Number of Associations") +
+        labs(fill = "eQTL\nEffect\nDirection\nAgreement")
+
+        
+ggsave(histogram_plot_log, filename = paste0(outdir, "histogram_plot_log.png"))
+
+
+popout_histogram_plot_log <- ggplot(results_dt[log10(fdr) < -10], aes(log10(fdr), fill = direction)) +
+        geom_histogram(position = "stack", binwidth = 1) +
+        theme_classic() +
+        scale_fill_manual(values = c("grey","black")) +
+        theme(axis.title.x=element_blank(),
+            axis.title.y=element_blank(),
+            legend.position="none",
+            axis.text.x=element_text(size=7),
+            axis.text.y=element_text(size=7)) +
+        scale_y_continuous(expand = c(0, 0))
+
+ggsave(popout_histogram_plot_log, filename = paste0(outdir, "popout_histogram_plot_log.png"))
+
+combined_plot_log <- histogram_plot_log + annotation_custom(ggplotGrob(popout_histogram_plot_log), xmin = -35, xmax = -15, 
+                       ymin = 500, ymax = 1225)
+
+ggsave(combined_plot_log, filename = paste0(outdir, "histogram_plot_log_w_popout.png"), width = 5, height = 3)
+ggsave(combined_plot_log, filename = paste0(outdir, "histogram_plot_log_w_popout.pdf"), width = 5, height = 3)
+
+
+
+### Calculate number of eQTLs and write in notion for writing paper time ###
diff --git a/eQTL_check/eQTL_village_interaction_deboever.R b/eQTL_check/eQTL_village_interaction_deboever.R
new file mode 100644
index 0000000..8899377
--- /dev/null
+++ b/eQTL_check/eQTL_village_interaction_deboever.R
@@ -0,0 +1,177 @@
+### Author: Drew neavin
+### Date: 25 June, 2022
+### Rational: Reviewer asked if eQTL effects are consistently higher in a village so need to look at village*snp effect for eQTLs
+
+
+##### Read in Libraries #####
+library(data.table)
+library(tidyverse)
+library(Seurat)
+library(vcfR)
+library(pkgcond)
+library(glmmTMB)
+
+
+
+##### Set up directories #####
+args <- commandArgs(trailingOnly = TRUE)
+ensg <- as.character(args[1])
+outdir <- args[2]
+bed <- args[3]
+datadir <- args[4]
+
+
+
+##### Read in data #####
+### significant effects for all genes dataframe ###
+effects_dt <- fread("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review2/combined/effect_results.tsv")
+
+### Seurat object ###
+seurat <- readRDS("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/seurat_integrated_noncryo_1pct_expressing.rds")
+
+
+### SNP vcf ###
+vcf <- read.vcfR("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/deboever_finalized_snps.recode.vcf")
+
+
+##### Read in dataframe of pairs to test #####
+snp_gene_pairs <- fread(bed)
+snp_gene_pairs$gene_id <- gsub("\\..+", "", snp_gene_pairs$gene_id)
+
+
+##### Subset the correct snp #####
+snp_gene_pairs_subset <- snp_gene_pairs[gene_id == ensg]
+
+
+##### Make model dataframe #####
+df_hier_unscale <- data.frame("Expression" = seurat[["SCT"]]@scale.data[ensg,], "Village" = as.factor(ifelse(seurat@meta.data$Time == "Baseline", 0, 1)), "Line" = seurat@meta.data$Final_Assignment, "Replicate" = as.factor(gsub("[A-Z][a-z]+", "", seurat@meta.data$MULTI_ID)), "Site" = seurat$Location)
+colnames(df_hier_unscale)[1] <- "Expression"
+
+
+
+
+##### Get the Correct Variables for Modeling #####
+variables <- unique(effects_dt[gene == ensg]$grp)
+variables <- variables[!(variables %in% c("Line", "Line:Village", "Residual"))]
+
+
+
+##### Fit effects except line and village*line effects ######
+model_all <- as.formula(paste0("Expression ~ (1|", paste0(variables, collapse = ") + (1|"), ")"))
+model_glmmtmb <- suppress_warnings(glmmTMB(formula = noquote(model_all), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+
+
+
+###### Summarize data to site x replicate x village #####
+data <- data.table(df_hier_unscale)
+data$Residuals <- resid(model_glmmtmb)
+data_sum <- data[, .(residual=mean(Residuals)), by = .(Village, Line, Site, Replicate)]
+
+
+
+##### Get the snp from the vcf #####
+## GT ##
+snp_dt <- data.table(extract.gt(element = "GT",vcf, IDtoRowNames = F))
+
+
+if (!all(colSums(is.na(snp_dt)) == nrow(snp_dt))){
+    message("Found GT genotype format in cluster vcf. Will use that metric for cluster correlation.")
+    format_clust = "GT"
+
+    if (any(grepl("\\|",snp_dt[,1]))){
+        separator = "|"
+        message("Detected | separator for GT genotype format in cluster vcf")
+    } else if (any(grepl("/",snp_dt[,1]))) {
+        separator = "/"
+        message("Detected / separator for GT genotype format in cluster vcf")
+    } else {
+        format_clust = NA
+        message("Can't identify a separator for the GT field in cluster vcf, moving on to using GP.")
+    }
+    if (!is.na(format_clust)){
+        snp_dt <- data.table(as_tibble(lapply(snp_dt, function(x) {gsub(paste0("0\\",separator,"0"),0, x)}) %>%
+                                lapply(., function(x) {gsub(paste0("0\\",separator,"1"),1, x)}) %>%
+                                lapply(., function(x) {gsub(paste0("1\\",separator,"0"),1, x)}) %>%
+                                lapply(., function(x) {gsub(paste0("1\\",separator,"1"),2, x)})))
+    }
+}
+
+
+colnames(snp_dt) <- gsub("36_TOB00421_i_E8", "TOB0421", colnames(snp_dt)) %>%
+                        gsub("22_FSA", "FSA0006", .) %>%
+                        gsub("29_MBE", "MBE1006", .)
+
+    
+## Get specific SNP ##
+snp_dt$ID_ref_alt <- paste0(vcf@fix[,'CHROM'],":", vcf@fix[,'POS'],"_", vcf@fix[,'REF'],"_", vcf@fix[,'ALT'])
+
+snp_dt_subset <- snp_dt[ID_ref_alt %in% snp_gene_pairs_subset$ID_ref_alt]
+snp_dt_subset_long <- melt(snp_dt_subset, id.vars = c("ID_ref_alt"), measure.vars = c("TOB0421", "FSA0006", "MBE1006"))
+
+
+
+##### Make dataframe with results, snp genotypes and our beta + pvalue #####
+results_dt <- snp_gene_pairs_subset[,c("ID", "REF", "ALT", "ID_ref_alt")]
+results_dt <- results_dt[snp_dt_subset, on = "ID_ref_alt"]
+
+results_dt$snp_beta <- as.numeric(NA)
+results_dt$snp_beta_se <- as.numeric(NA)
+results_dt$snp_z <- as.numeric(NA)
+results_dt$snp_p <- as.numeric(NA)
+results_dt$snpXvillage_beta <- as.numeric(NA)
+results_dt$snpXvillage_beta_se <- as.numeric(NA)
+results_dt$snpXvillage_z <- as.numeric(NA)
+results_dt$snpXvillage_p <- as.numeric(NA)
+
+
+
+
+for (snp in unique(snp_dt_subset_long$ID_ref_alt)){
+
+    ### Add SNP data to data data.table ###
+    data_sum_snp <- snp_dt_subset_long[ID_ref_alt == snp][data_sum, on = c("variable" = "Line")]
+
+    gt_long <- melt(results_dt[ID_ref_alt == snp][,c("ID_ref_alt", "TOB0421", "FSA0006", "MBE1006", "REF", "ALT")], measure.vars = c("TOB0421", "FSA0006", "MBE1006"))
+    gt_long$Genotype <- ifelse(gt_long$value == 0, paste0(gt_long$REF, "/", gt_long$REF), 
+                            ifelse(gt_long$value == 1, paste0(gt_long$REF, "/", gt_long$ALT),
+                                ifelse(gt_long$value == 2, paste0(gt_long$ALT, "/", gt_long$ALT), NA)))
+
+    gt_long <- unique(gt_long)
+
+    data_sum_snp <- gt_long[data_sum_snp, on = c("value", "variable", "ID_ref_alt")]
+
+    data_sum_snp$Genotype <- factor(data_sum_snp$Genotype, levels = unique(data_sum_snp[order(value)]$Genotype))
+
+
+    ##### Fit SNP => store resids #####
+    ##### Check for beta #####
+    base_model <- glmmTMB(residual ~ 1, data = data_sum_snp, REML = TRUE)
+    snp_model <- glmmTMB(as.numeric(residual) ~ as.numeric(value), data = data_sum_snp, REML = TRUE)
+
+    anova_results <- anova(base_model, snp_model)
+
+    results_dt[ID_ref_alt == snp]$snp_beta <- summary(snp_model)$coefficients$cond[2,1]
+    results_dt[ID_ref_alt == snp]$snp_beta_se <- summary(snp_model)$coefficients$cond[2,2]
+    results_dt[ID_ref_alt == snp]$snp_z <- summary(snp_model)$coefficients$cond[2,3]
+    results_dt[ID_ref_alt == snp]$snp_p <- anova_results$`Pr(>Chisq)`[2]
+
+
+    ##### resids fit SNP*village #####
+    data_sum_snp$residual2 <- resid(snp_model)
+
+    snpXvillage_model <- glmmTMB(as.numeric(residual) ~ as.numeric(value)*Village, data = data_sum_snp, REML = TRUE)
+
+    anova_results2 <- anova(snp_model, snpXvillage_model)
+
+
+    results_dt[ID_ref_alt == snp]$snpXvillage_beta <- summary(snpXvillage_model)$coefficients$cond[2,1]
+    results_dt[ID_ref_alt == snp]$snpXvillage_beta_se <- summary(snpXvillage_model)$coefficients$cond[2,2]
+    results_dt[ID_ref_alt == snp]$snpXvillage_z <- summary(snpXvillage_model)$coefficients$cond[2,3]
+    results_dt[ID_ref_alt == snp]$snpXvillage_p <- anova_results2$`Pr(>Chisq)`[2]
+
+    fwrite(data_sum_snp, paste0(datadir, ensg, "_",snp,"_dataframe"), sep = "\t")
+}
+
+fwrite(results_dt, paste0(outdir, ensg, "_snpXvillage_interactions.tsv"), sep = "\t")
+
+
diff --git a/eQTL_check/eQTL_village_interaction_kilpinen.R b/eQTL_check/eQTL_village_interaction_kilpinen.R
new file mode 100644
index 0000000..316f018
--- /dev/null
+++ b/eQTL_check/eQTL_village_interaction_kilpinen.R
@@ -0,0 +1,177 @@
+### Author: Drew neavin
+### Date: 25 June, 2022
+### Rational: Reviewer asked if eQTL effects are consistently higher in a village so need to look at village*snp effect for eQTLs
+
+
+##### Read in Libraries #####
+library(data.table)
+library(tidyverse)
+library(Seurat)
+library(vcfR)
+library(pkgcond)
+library(glmmTMB)
+
+
+
+##### Set up directories #####
+args <- commandArgs(trailingOnly = TRUE)
+ensg <- as.character(args[1])
+outdir <- args[2]
+bed <- args[3]
+datadir <- args[4]
+
+
+
+##### Read in data #####
+### significant effects for all genes dataframe ###
+effects_dt <- fread("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review2/combined/effect_results.tsv")
+
+### Seurat object ###
+seurat <- readRDS("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/seurat_integrated_noncryo_1pct_expressing.rds")
+
+
+### SNP vcf ###
+vcf <- read.vcfR("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/finalized_snps.recode.vcf")
+
+
+##### Read in dataframe of pairs to test #####
+snp_gene_pairs <- fread(bed)
+snp_gene_pairs$gene_id <- gsub("\\..+", "", snp_gene_pairs$gene_id)
+
+
+##### Subset the correct snp #####
+snp_gene_pairs_subset <- snp_gene_pairs[gene_id == ensg]
+
+
+##### Make model dataframe #####
+df_hier_unscale <- data.frame("Expression" = seurat[["SCT"]]@scale.data[ensg,], "Village" = as.factor(ifelse(seurat@meta.data$Time == "Baseline", 0, 1)), "Line" = seurat@meta.data$Final_Assignment, "Replicate" = as.factor(gsub("[A-Z][a-z]+", "", seurat@meta.data$MULTI_ID)), "Site" = seurat$Location)
+colnames(df_hier_unscale)[1] <- "Expression"
+
+
+
+
+##### Get the Correct Variables for Modeling #####
+variables <- unique(effects_dt[gene == ensg]$grp)
+variables <- variables[!(variables %in% c("Line", "Line:Village", "Residual"))]
+
+
+
+##### Fit effects except line and village*line effects ######
+model_all <- as.formula(paste0("Expression ~ (1|", paste0(variables, collapse = ") + (1|"), ")"))
+model_glmmtmb <- suppress_warnings(glmmTMB(formula = noquote(model_all), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+
+
+
+###### Summarize data to site x replicate x village #####
+data <- data.table(df_hier_unscale)
+data$Residuals <- resid(model_glmmtmb)
+data_sum <- data[, .(residual=mean(Residuals)), by = .(Village, Line, Site, Replicate)]
+
+
+
+##### Get the snp from the vcf #####
+## GT ##
+snp_dt <- data.table(extract.gt(element = "GT",vcf, IDtoRowNames = F))
+
+
+if (!all(colSums(is.na(snp_dt)) == nrow(snp_dt))){
+    message("Found GT genotype format in cluster vcf. Will use that metric for cluster correlation.")
+    format_clust = "GT"
+
+    if (any(grepl("\\|",snp_dt[,1]))){
+        separator = "|"
+        message("Detected | separator for GT genotype format in cluster vcf")
+    } else if (any(grepl("/",snp_dt[,1]))) {
+        separator = "/"
+        message("Detected / separator for GT genotype format in cluster vcf")
+    } else {
+        format_clust = NA
+        message("Can't identify a separator for the GT field in cluster vcf, moving on to using GP.")
+    }
+    if (!is.na(format_clust)){
+        snp_dt <- data.table(as_tibble(lapply(snp_dt, function(x) {gsub(paste0("0\\",separator,"0"),0, x)}) %>%
+                                lapply(., function(x) {gsub(paste0("0\\",separator,"1"),1, x)}) %>%
+                                lapply(., function(x) {gsub(paste0("1\\",separator,"0"),1, x)}) %>%
+                                lapply(., function(x) {gsub(paste0("1\\",separator,"1"),2, x)})))
+    }
+}
+
+
+colnames(snp_dt) <- gsub("36_TOB00421_i_E8", "TOB0421", colnames(snp_dt)) %>%
+                        gsub("22_FSA", "FSA0006", .) %>%
+                        gsub("29_MBE", "MBE1006", .)
+
+    
+## Get specific SNP ##
+snp_dt$ID_ref_alt <- paste0(vcf@fix[,'CHROM'],":", vcf@fix[,'POS'],"_", vcf@fix[,'REF'],"_", vcf@fix[,'ALT'])
+
+snp_dt_subset <- snp_dt[ID_ref_alt %in% snp_gene_pairs_subset$ID_ref_alt]
+snp_dt_subset_long <- melt(snp_dt_subset, id.vars = c("ID_ref_alt"), measure.vars = c("TOB0421", "FSA0006", "MBE1006"))
+
+
+
+##### Make dataframe with results, snp genotypes and our beta + pvalue #####
+results_dt <- snp_gene_pairs_subset[,c("ID", "REF", "ALT", "ID_ref_alt")]
+results_dt <- results_dt[snp_dt_subset, on = "ID_ref_alt"]
+
+results_dt$snp_beta <- as.numeric(NA)
+results_dt$snp_beta_se <- as.numeric(NA)
+results_dt$snp_z <- as.numeric(NA)
+results_dt$snp_p <- as.numeric(NA)
+results_dt$snpXvillage_beta <- as.numeric(NA)
+results_dt$snpXvillage_beta_se <- as.numeric(NA)
+results_dt$snpXvillage_z <- as.numeric(NA)
+results_dt$snpXvillage_p <- as.numeric(NA)
+
+
+
+
+for (snp in unique(snp_dt_subset_long$ID_ref_alt)){
+
+    ### Add SNP data to data data.table ###
+    data_sum_snp <- snp_dt_subset_long[ID_ref_alt == snp][data_sum, on = c("variable" = "Line")]
+
+    gt_long <- melt(results_dt[ID_ref_alt == snp][,c("ID_ref_alt", "TOB0421", "FSA0006", "MBE1006", "REF", "ALT")], measure.vars = c("TOB0421", "FSA0006", "MBE1006"))
+    gt_long$Genotype <- ifelse(gt_long$value == 0, paste0(gt_long$REF, "/", gt_long$REF), 
+                            ifelse(gt_long$value == 1, paste0(gt_long$REF, "/", gt_long$ALT),
+                                ifelse(gt_long$value == 2, paste0(gt_long$ALT, "/", gt_long$ALT), NA)))
+
+    gt_long <- unique(gt_long)
+
+    data_sum_snp <- gt_long[data_sum_snp, on = c("value", "variable", "ID_ref_alt")]
+
+    data_sum_snp$Genotype <- factor(data_sum_snp$Genotype, levels = unique(data_sum_snp[order(value)]$Genotype))
+
+
+    ##### Fit SNP => store resids #####
+    ##### Check for beta #####
+    base_model <- glmmTMB(residual ~ 1, data = data_sum_snp, REML = TRUE)
+    snp_model <- glmmTMB(as.numeric(residual) ~ as.numeric(value), data = data_sum_snp, REML = TRUE)
+
+    anova_results <- anova(base_model, snp_model)
+
+    results_dt[ID_ref_alt == snp]$snp_beta <- summary(snp_model)$coefficients$cond[2,1]
+    results_dt[ID_ref_alt == snp]$snp_beta_se <- summary(snp_model)$coefficients$cond[2,2]
+    results_dt[ID_ref_alt == snp]$snp_z <- summary(snp_model)$coefficients$cond[2,3]
+    results_dt[ID_ref_alt == snp]$snp_p <- anova_results$`Pr(>Chisq)`[2]
+
+
+    ##### resids fit SNP*village #####
+    data_sum_snp$residual2 <- resid(snp_model)
+
+    snpXvillage_model <- glmmTMB(as.numeric(residual) ~ as.numeric(value)*Village, data = data_sum_snp, REML = TRUE)
+
+    anova_results2 <- anova(snp_model, snpXvillage_model)
+
+
+    results_dt[ID_ref_alt == snp]$snpXvillage_beta <- summary(snpXvillage_model)$coefficients$cond[2,1]
+    results_dt[ID_ref_alt == snp]$snpXvillage_beta_se <- summary(snpXvillage_model)$coefficients$cond[2,2]
+    results_dt[ID_ref_alt == snp]$snpXvillage_z <- summary(snpXvillage_model)$coefficients$cond[2,3]
+    results_dt[ID_ref_alt == snp]$snpXvillage_p <- anova_results2$`Pr(>Chisq)`[2]
+
+    fwrite(data_sum_snp, paste0(datadir, ensg, "_",snp,"_dataframe"), sep = "\t")
+}
+
+fwrite(results_dt, paste0(outdir, ensg, "_snpXvillage_interactions.tsv"), sep = "\t")
+
+
diff --git a/eQTL_check/filter_snps_kilpinen.R b/eQTL_check/filter_snps_kilpinen.R
new file mode 100644
index 0000000..57528e6
--- /dev/null
+++ b/eQTL_check/filter_snps_kilpinen.R
@@ -0,0 +1,83 @@
+library(vcfR)
+library(data.table)
+library(tidyverse)
+
+
+##### Set up variables #####
+dir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/"
+
+
+iPSC_Village/scripts/QCmetric_figs
+##### Read in data #####
+vcf <- read.vcfR(paste0(dir, "finalized_snps.recode.vcf"))
+eqtls_overlap_dt <- fread(paste0(dir,"kilpinen_imputed_overlapping_filtered_header.bed"), sep = "\t")
+
+geno <- data.table(extract.gt(element = "GT",vcf, IDtoRowNames = F))
+
+
+if (!all(colSums(is.na(geno)) == nrow(geno))){
+    message("Found GT genotype format in cluster vcf. Will use that metric for cluster correlation.")
+    format_clust = "GT"
+
+    if (any(grepl("\\|",geno[,1]))){
+        separator = "|"
+        message("Detected | separator for GT genotype format in cluster vcf")
+    } else if (any(grepl("/",geno[,1]))) {
+        separator = "/"
+        message("Detected / separator for GT genotype format in cluster vcf")
+    } else {
+        format_clust = NA
+        message("Can't identify a separator for the GT field in cluster vcf, moving on to using GP.")
+    }
+    if (!is.na(format_clust)){
+        geno <- data.table(as_tibble(lapply(geno, function(x) {gsub(paste0("0\\",separator,"0"),0, x)}) %>%
+                                lapply(., function(x) {gsub(paste0("0\\",separator,"1"),1, x)}) %>%
+                                lapply(., function(x) {gsub(paste0("1\\",separator,"0"),1, x)}) %>%
+                                lapply(., function(x) {gsub(paste0("1\\",separator,"1"),2, x)})))
+    }
+}
+
+
+geno$ID <- vcf@fix[,'ID']
+geno$ID_ref_alt <- paste0(vcf@fix[,'CHROM'],":", vcf@fix[,'POS'],"_", vcf@fix[,'REF'],"_", vcf@fix[,'ALT'])
+
+eqtls_overlap_dt <- eqtls_overlap_dt[ID %in% unique(geno$ID)]
+eqtls_overlap_dt$ID_ref_alt <- paste0(eqtls_overlap_dt$ID, "_", eqtls_overlap_dt$REF, "_", eqtls_overlap_dt$ALT)
+
+
+temp <- list()
+
+for (gene in unique(eqtls_overlap_dt$gene_id)){
+    print(gene)
+    temp[[gene]] <- eqtls_overlap_dt[gene_id == gene]
+    k = 1
+    while (k < nrow(temp[[gene]])){
+        test_snp <- temp[[gene]]$ID_ref_alt[k]
+        for (snp in temp[[gene]][(k+1):nrow(temp[[gene]]),]$ID_ref_alt){
+            if (abs(cor(as.numeric(geno[ID_ref_alt == test_snp,1:3]), as.numeric(geno[ID_ref_alt == snp,1:3]))) == 1){
+                temp[[gene]] <- temp[[gene]][ID_ref_alt != snp]
+            } 
+        }
+        k = k + 1
+    }
+}
+
+
+eqtls_overlap_dt_subset <- do.call(rbind, temp)
+eqtls_overlap_dt_subset$gene_id <- gsub("\\..+", "", eqtls_overlap_dt_subset$gene_id)
+
+
+resid_files <- list.files("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review2/gene_separated/residuals4qtl")
+ensg_list <- unique(gsub("_residuals4qtl.rds", "", resid_files))
+
+eqtls_overlap_dt_subset <- eqtls_overlap_dt_subset[gene_id %in% ensg_list]
+
+
+fwrite(eqtls_overlap_dt_subset, paste0(dir, "kilpinen_imputed_overlapping_filtered_header_pruned.bed"), sep = "\t")
+
+
+eqtls_overlap_dt_subset_gene_snp <- eqtls_overlap_dt_subset[,c("gene_id","ID_ref_alt")]
+
+fwrite(eqtls_overlap_dt_subset_gene_snp, paste0(dir, "kilpinen_imputed_overlapping_filtered_header_pruned_snp_gene.tsv"), sep = "\t")
+
+
diff --git a/eQTL_check/multi-passage/filter_snps.R b/eQTL_check/multi-passage/filter_snps.R
new file mode 100644
index 0000000..4321269
--- /dev/null
+++ b/eQTL_check/multi-passage/filter_snps.R
@@ -0,0 +1,88 @@
+library(vcfR)
+library(data.table)
+library(tidyverse)
+
+
+##### Set up variables #####
+dir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/multi-passage/"
+
+
+iPSC_Village/scripts/QCmetric_figs
+##### Read in data #####
+vcf <- read.vcfR(paste0(dir, "deboever_finalized_snps.vcf"))
+eqtls_overlap_dt <- fread(paste0(dir,"deboever_imputed_overlapping_filtered_header.bed"), sep = "\t")
+
+geno <- data.table(extract.gt(element = "GT",vcf, IDtoRowNames = F))
+
+
+if (!all(colSums(is.na(geno)) == nrow(geno))){
+    message("Found GT genotype format in cluster vcf. Will use that metric for cluster correlation.")
+    format_clust = "GT"
+
+    if (any(grepl("\\|",geno[,1]))){
+        separator = "|"
+        message("Detected | separator for GT genotype format in cluster vcf")
+    } else if (any(grepl("/",geno[,1]))) {
+        separator = "/"
+        message("Detected / separator for GT genotype format in cluster vcf")
+    } else {
+        format_clust = NA
+        message("Can't identify a separator for the GT field in cluster vcf, moving on to using GP.")
+    }
+    if (!is.na(format_clust)){
+        geno <- data.table(as_tibble(lapply(geno, function(x) {gsub(paste0("0\\",separator,"0"),0, x)}) %>%
+                                lapply(., function(x) {gsub(paste0("0\\",separator,"1"),1, x)}) %>%
+                                lapply(., function(x) {gsub(paste0("1\\",separator,"0"),1, x)}) %>%
+                                lapply(., function(x) {gsub(paste0("1\\",separator,"1"),2, x)})))
+    }
+}
+
+
+geno$ID <- vcf@fix[,'ID']
+# geno$ID_ref_alt <- paste0(vcf@fix[,'CHROM'],":", vcf@fix[,'POS'],"_", vcf@fix[,'REF'],"_", vcf@fix[,'ALT'])
+
+eqtls_overlap_dt <- eqtls_overlap_dt[ID %in% unique(geno$ID)]
+eqtls_overlap_numberic_gt <- apply(eqtls_overlap_dt[,38:ncol(eqtls_overlap_dt)], 2, function(y) as.numeric(gsub("[0-9]\\|[0-9]:", "", y) %>% gsub(":.+", "", .)))
+eqtls_overlap_dt <- cbind(eqtls_overlap_dt[,1:37],eqtls_overlap_numberic_gt)
+
+
+# eqtls_overlap_dt$ID_ref_alt <- paste0(eqtls_overlap_dt$ID, "_", eqtls_overlap_dt$REF, "_", eqtls_overlap_dt$ALT)
+
+
+temp <- list()
+
+for (gene in unique(eqtls_overlap_dt$gene_id)){
+    print(gene)
+    temp[[gene]] <- eqtls_overlap_dt[gene_id == gene]
+    k = 1
+    while (k < nrow(temp[[gene]])){
+        test_snp <- temp[[gene]]$ID[k]
+        for (snp in temp[[gene]][(k+1):nrow(temp[[gene]]),]$ID){
+            if (abs(cor(as.numeric(geno[ID == test_snp,1:(ncol(geno) - 1)]), as.numeric(geno[ID == snp,1:(ncol(geno) - 1)]))) == 1){
+                temp[[gene]] <- temp[[gene]][ID != snp]
+            } 
+        }
+        k = k + 1
+    }
+}
+
+
+eqtls_overlap_dt_subset <- do.call(rbind, temp)
+eqtls_overlap_dt_subset$gene_id <- gsub("\\..+", "", eqtls_overlap_dt_subset$gene_id)
+
+
+resid_files <- list.files("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_integratedSCT/gene_separated/residuals4qtl")
+
+ensg_list <- unique(gsub("_residuals4qtl.rds", "", resid_files))
+
+eqtls_overlap_dt_subset <- eqtls_overlap_dt_subset[gene_id %in% ensg_list]
+
+
+fwrite(eqtls_overlap_dt_subset, paste0(dir, "deboever_imputed_overlapping_filtered_header_pruned.bed"), sep = "\t")
+
+
+eqtls_overlap_dt_subset_gene_snp <- eqtls_overlap_dt_subset[,c("gene_id","ID")]
+
+fwrite(eqtls_overlap_dt_subset_gene_snp, paste0(dir, "deboever_imputed_overlapping_filtered_header_pruned_snp_gene.tsv"), sep = "\t")
+
+
diff --git a/eQTL_check/multi-passage/genes2test.sh b/eQTL_check/multi-passage/genes2test.sh
new file mode 100644
index 0000000..562495e
--- /dev/null
+++ b/eQTL_check/multi-passage/genes2test.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+OUTDIR="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/multi-passage"
+mkdir -p $OUTDIR
+
+kilpinen_genes="$OUTDIR/../KilpinenOverlap/gene_snp_list.tsv"
+deboever_genes="$OUTDIR/../KilpinenOverlap/deboever_gene_snp_list.tsv"
+variance_genes="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_integratedSCT/gene_separated/residuals4qtl"
+
+
+ls $variance_genes | sed "s/_residuals4qtl.rds//g" | sort -u > $OUTDIR/residual_genes.tsv
+awk '{print $2}' $kilpinen_genes | sort -u > $OUTDIR/kilpinen_genes.tsv
+awk '{print $2}' $deboever_genes | sort -u > $OUTDIR/deboever_genes.tsv
+
+
+comm -12 $OUTDIR/residual_genes.tsv head $OUTDIR/kilpinen_genes.tsv > $OUTDIR/genes_residual_and_kilpinen.tsv ## NONE
+comm -12 $OUTDIR/residual_genes.tsv $OUTDIR/deboever_genes.tsv > $OUTDIR/genes_residual_and_deboever.tsv
+
+
+### Get finalized list of genes that were eQTLs and demonstrate line effects
+grep -F -f $OUTDIR/genes_residual_and_deboever.tsv $deboever_genes > $OUTDIR/finalized_deboever_gene_snp_list.tsv
+
+
+
+### Filter vcf for just these snps ###
+awk '{print $1}' $OUTDIR/finalized_deboever_gene_snp_list.tsv | sort -u > $OUTDIR/finalized_deboever_snps.tsv
+
+grep "#" $OUTDIR/nona_cardiac_multiome_Filtered_INFO_0.4_MAF0.05_complete_cases_snps_filtered_diff_genotypes.vcf >$OUTDIR/deboever_finalized_snps.vcf
+grep -f $OUTDIR/finalized_deboever_snps.tsv $OUTDIR/nona_cardiac_multiome_Filtered_INFO_0.4_MAF0.05_complete_cases_snps_filtered_diff_genotypes.vcf >> $OUTDIR/deboever_finalized_snps.vcf
+
diff --git a/eQTL_check/multi-passage/metadata_prep.R b/eQTL_check/multi-passage/metadata_prep.R
new file mode 100644
index 0000000..7ee4031
--- /dev/null
+++ b/eQTL_check/multi-passage/metadata_prep.R
@@ -0,0 +1,18 @@
+library(Seurat)
+library(tivdyverse)
+library(data.table)
+
+
+dir.create("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Kilpinen_eQTLs/")
+
+
+
+seurat <- readRDS("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/preQC/time-integrated_filtered_seurat_1pct_expressing.rds")
+
+
+### Make DF for modeling ###
+df_hier_unscale <- data.frame("Passage" = as.factor(gsub("Village_", "", seurat@meta.data$Pool)), "Line" = seurat@meta.data$AtLeastHalfSinglet_Individual_Assignment)
+df_hier_unscale$Barcode <- colnames(seurat)
+
+fwrite(df_hier_unscale, paste0("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/multi-passage/cell_meta.tsv"), sep = "\t")
+
diff --git a/eQTL_check/multi-passage/pull_eQTL_SNPs.sh b/eQTL_check/multi-passage/pull_eQTL_SNPs.sh
new file mode 100644
index 0000000..61ba936
--- /dev/null
+++ b/eQTL_check/multi-passage/pull_eQTL_SNPs.sh
@@ -0,0 +1,85 @@
+#!/bin/bash
+
+
+vcf=/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/data/multi-passage/nona_cardiac_multiome_Filtered_INFO_0.4_MAF0.05_complete_cases.recode.vcf
+kilpinen=/directflow/SCCGGroupShare/projects/DrewNeavin/References/iPSC_eQTLs/KilpineniPSCeQTLs.txt
+deboever=/directflow/SCCGGroupShare/projects/DrewNeavin/References/iPSC_eQTLs/DeBoeveriPSCeQTLs.txt
+
+OUT=/directflow/SCCGGroupShare/projects/DrewNeavin/References/iPSC_eQTLs
+OUT_TMP=/directflow/SCCGGroupShare/projects/DrewNeavin/References/iPSC_eQTLs/tmp
+
+mkdir -p $OUT_TMP
+
+INTERSECT_OUT=/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/multi-passage
+
+mkdir -p $INTERSECT_OUT
+
+
+
+### Filter on individuals and MAF for our lines ###
+sed '/<CN/d' $vcf > $INTERSECT_OUT/nona_cardiac_multiome_Filtered_INFO_0.4_MAF0.05_complete_cases_snps.vcf
+sed -i '/<INV>/d' $INTERSECT_OUT/nona_cardiac_multiome_Filtered_INFO_0.4_MAF0.05_complete_cases_snps.vcf
+
+
+
+conda activate vcftools
+
+    vcftools --vcf $INTERSECT_OUT/nona_cardiac_multiome_Filtered_INFO_0.4_MAF0.05_complete_cases_snps.vcf --recode --mac 1 --out $INTERSECT_OUT/nona_cardiac_multiome_Filtered_INFO_0.4_MAF0.05_complete_cases_snps_filtered
+
+    bcftools view -e 'COUNT(GT="AA")=N_SAMPLES || COUNT(GT="RR")=N_SAMPLES || COUNT(GT="RA")=N_SAMPLES' $INTERSECT_OUT/nona_cardiac_multiome_Filtered_INFO_0.4_MAF0.05_complete_cases_snps_filtered.recode.vcf > $INTERSECT_OUT/nona_cardiac_multiome_Filtered_INFO_0.4_MAF0.05_complete_cases_snps_filtered_diff_genotypes.vcf
+
+conda deactivate
+
+### Redo intersection with these SNPs ##$#
+conda activate bedtools
+
+    bedtools intersect -a $OUT/kilpinen.bed -b $INTERSECT_OUT/nona_cardiac_multiome_Filtered_INFO_0.4_MAF0.05_complete_cases_snps_filtered_diff_genotypes.vcf -wa -wb > $INTERSECT_OUT/kilpinen_imputed_overlapping_filtered.bed
+
+conda deactivate
+
+
+
+head -n 1 $OUT/kilpinen.bed > $INTERSECT_OUT/kilpinen_header_bed.tsv
+grep "#CHROM" $INTERSECT_OUT/nona_cardiac_multiome_Filtered_INFO_0.4_MAF0.05_complete_cases_snps_filtered_diff_genotypes.vcf > $INTERSECT_OUT/kilpinen_header_vcf.tsv
+paste -d"\t" $INTERSECT_OUT/kilpinen_header_bed.tsv $INTERSECT_OUT/kilpinen_header_vcf.tsv > $INTERSECT_OUT/kilpinen_combined_header.tsv
+
+cat $INTERSECT_OUT/kilpinen_combined_header.tsv > $INTERSECT_OUT/kilpinen_imputed_overlapping_filtered_header.bed
+cat $INTERSECT_OUT/kilpinen_imputed_overlapping_filtered.bed >> $INTERSECT_OUT/kilpinen_imputed_overlapping_filtered_header.bed
+
+
+
+
+
+### Use this file to test for trends in gene
+awk 'BEGIN{FS=OFS="\t"}{print($24, $7)}' $INTERSECT_OUT/kilpinen_imputed_overlapping_filtered_header.bed | sed 's/\..*//g' > $INTERSECT_OUT/gene_snp_list.tsv
+
+sed -i '1i snp\tgene' $INTERSECT_OUT/gene_snp_list.tsv
+
+
+
+
+
+
+
+##### DEBOEVER DATA #####
+### Intersect with these SNPs ##$#
+conda activate bedtools
+
+    bedtools intersect -a $OUT/deboever.bed -b $INTERSECT_OUT/nona_cardiac_multiome_Filtered_INFO_0.4_MAF0.05_complete_cases_snps_filtered_diff_genotypes.vcf -wa -wb > $INTERSECT_OUT/deboever_imputed_overlapping_filtered.bed
+
+conda deactivate
+
+
+head -n 1 $OUT/deboever.bed > $INTERSECT_OUT/deboever_header_bed.tsv
+grep "#CHROM" $INTERSECT_OUT/nona_cardiac_multiome_Filtered_INFO_0.4_MAF0.05_complete_cases_snps_filtered_diff_genotypes.vcf > $INTERSECT_OUT/deboever_header_vcf.tsv
+paste -d"\t" $INTERSECT_OUT/deboever_header_bed.tsv $INTERSECT_OUT/deboever_header_vcf.tsv > $INTERSECT_OUT/deboever_combined_header.tsv
+
+cat $INTERSECT_OUT/deboever_combined_header.tsv > $INTERSECT_OUT/deboever_imputed_overlapping_filtered_header.bed
+cat $INTERSECT_OUT/deboever_imputed_overlapping_filtered.bed >> $INTERSECT_OUT/deboever_imputed_overlapping_filtered_header.bed
+
+
+### Use this file to test for trends in gene
+awk 'BEGIN{FS=OFS="\t"}{print($31, $15, $25)}' $INTERSECT_OUT/deboever_imputed_overlapping_filtered_header.bed | sed 's/\..*\t/\t/g' > $INTERSECT_OUT/deboever_gene_snp_list.tsv
+
+
+
diff --git a/eQTL_check/multi-passage/test_eQTL.R b/eQTL_check/multi-passage/test_eQTL.R
new file mode 100644
index 0000000..4bab96e
--- /dev/null
+++ b/eQTL_check/multi-passage/test_eQTL.R
@@ -0,0 +1,204 @@
+### Reason: test eQTLs identified in Kilpinen et al in our dataset
+### Author: Drew Neavin
+### Date: 1 April, 2022
+
+
+library(data.table)
+library(Seurat)
+library(vcfR)
+library(glmmTMB)
+library(dplyr)
+library(ggplot2)
+
+
+
+##### Bring in variables #####
+### Bring in arguments
+args <- commandArgs(trailingOnly = TRUE)
+ensg <- args[1]
+outdir <- args[2]
+bed <- args[3]
+
+
+cell_line_colors <- c("TOB0199" = "#f44336", "TOB0220" = "#e81f63", "MBE2900" = "#9c27b0", "MBE0953" = "#673ab7", "TOB0421" = "#3f51b5", "MBE2817" = "#2096f3","FSA0004" = "#2096f3", "TOB0435" = "#009688", "WAB0004" = "#4caf50", "IST1877" = "#8bc34a", "WAB0103" = "#cddc39", "TOB0198" = "#ffeb3b", "TOB0205" = "#ffc108", "WAB0038" = "#ff9801", "IST3323" = "#ff5723" , "FSA0001" = "#795548", "180N" = "#9e9e9e", "166" = "#607d8b")
+
+##### Read in dataframe of pairs to test #####
+snp_gene_pairs <- fread(bed)
+
+
+
+
+##### Subset the correct  snp #####
+snp_gene_pairs_subset <- snp_gene_pairs[gene_id == ensg]
+
+
+
+##### Read in data #####
+residuals <- readRDS(paste0("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_integratedSCT/gene_separated/residuals4qtl/", ensg, "_residuals4qtl.rds"))
+residual_dt <- data.table(residuals)
+colnames(residual_dt) <- c("residual")
+residual_dt$Barcode <- names(residuals)
+
+vcf <- read.vcfR("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/multi-passage/deboever_finalized_snps.vcf")
+meta <- fread("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/multi-passage/cell_meta.tsv", sep = "\t")
+data <- meta[residual_dt, on = "Barcode"]
+data$Barcode <- NULL
+
+
+
+
+### Summarize data by site and line and replicatae
+data_sum <- data[, .(residual=mean(residual)), by = .(Passage, Line)]
+data_sum$Line <- gsub("^0_", "", data_sum$Line) %>%
+                        gsub("D-", "", .) %>%
+                            gsub("\\.\\d\\.", "", .) %>%
+                                gsub("N-", "", .) %>%
+                                    gsub("-P36", "", .)  %>%
+                                        gsub("-", "", .)
+
+
+##### Get the snp from the vcf #####
+## GT ##
+snp_dt <- data.table(extract.gt(element = "GT",vcf, IDtoRowNames = F))
+
+
+if (!all(colSums(is.na(snp_dt)) == nrow(snp_dt))){
+    message("Found GT genotype format in cluster vcf. Will use that metric for cluster correlation.")
+    format_clust = "GT"
+
+    if (any(grepl("\\|",snp_dt[,1]))){
+        separator = "|"
+        message("Detected | separator for GT genotype format in cluster vcf")
+    } else if (any(grepl("/",snp_dt[,1]))) {
+        separator = "/"
+        message("Detected / separator for GT genotype format in cluster vcf")
+    } else {
+        format_clust = NA
+        message("Can't identify a separator for the GT field in cluster vcf, moving on to using GP.")
+    }
+    if (!is.na(format_clust)){
+        snp_dt <- data.table(as_tibble(lapply(snp_dt, function(x) {gsub(paste0("0\\",separator,"0"),0, x)}) %>%
+                                lapply(., function(x) {gsub(paste0("0\\",separator,"1"),1, x)}) %>%
+                                lapply(., function(x) {gsub(paste0("1\\",separator,"0"),1, x)}) %>%
+                                lapply(., function(x) {gsub(paste0("1\\",separator,"1"),2, x)})))
+    }
+}
+
+
+colnames(snp_dt) <- gsub("^0_", "", colnames(snp_dt)) %>%
+                        gsub("D-", "", .) %>%
+                            gsub("\\.\\d\\.", "", .) %>%
+                                gsub("N-", "", .) %>%
+                                    gsub("-P36", "", .)  %>%
+                                        gsub("-", "", .)
+
+    
+## Get specific SNP ##
+snp_dt$ID <- paste0(vcf@fix[,'CHROM'],":", vcf@fix[,'POS'],":", vcf@fix[,'REF'],":", vcf@fix[,'ALT'])
+
+snp_dt_subset <- snp_dt[ID %in% snp_gene_pairs_subset$ID]
+
+
+### Add SNP data to data data.table ###
+snp_dt_subset_long <- melt(snp_dt_subset, id.vars = c("ID"), measure.vars = colnames(snp_dt_subset)[1:(ncol(snp_dt_subset) - 1)])
+snp_dt_subset_long$variable <- gsub("^0_", "", snp_dt_subset_long$variable) %>%
+                        gsub("D-", "", .) %>%
+                            gsub("\\.\\d\\.", "", .) %>%
+                                gsub("N-", "", .) %>%
+                                    gsub("-P36", "", .)  %>%
+                                        gsub("-", "", .)
+
+
+
+##### Make dataframe with deboever results, snp genotypes and our beta + pvalue #####
+results_dt <- snp_gene_pairs_subset[,c("chrom", "start", "end", "rsid", "maf", "stat", "pvalue", "beta", "sebeta", "gene_id", "gene_name", "ref","alt", "ID", "REF", "ALT")]
+colnames(results_dt)[5:13] <- paste0(colnames(results_dt)[5:13], "_deboever")
+results_dt <- results_dt[snp_dt_subset, on = "ID"]
+
+results_dt$dataset_beta <- as.numeric(NA)
+results_dt$dataset_beta_se <- as.numeric(NA)
+results_dt$dataset_z <- as.numeric(NA)
+results_dt$dataset_p <- as.numeric(NA)
+results_dt$direction <- as.character(NA)
+
+
+for (snp in unique(snp_dt_subset_long$ID)){
+
+    data_sum_snp <- snp_dt_subset_long[ID == snp][data_sum, on = c("variable" = "Line")]
+
+    columns <- c("ID",  unique(snp_dt_subset_long$variable), "REF", "ALT")
+    gt_long <- melt(results_dt[ID == snp][,..columns], measure.vars =  unique(snp_dt_subset_long$variable))
+    gt_long$Genotype <- ifelse(gt_long$value == 0, paste0(gt_long$REF, "/", gt_long$REF), 
+                            ifelse(gt_long$value == 1, paste0(gt_long$REF, "/", gt_long$ALT),
+                                ifelse(gt_long$value == 2, paste0(gt_long$ALT, "/", gt_long$ALT), NA)))
+
+    gt_long <- unique(gt_long)
+
+    data_sum_snp <- gt_long[data_sum_snp, on = c("value", "variable", "ID")]
+
+    data_sum_snp$Genotype <- factor(data_sum_snp$Genotype, levels = unique(data_sum_snp[order(value)]$Genotype))
+
+
+    ##### Check for beta #####
+    base_model <- glmmTMB(residual ~ 1, data = data_sum_snp, REML = TRUE)
+    snp_model <- glmmTMB(as.numeric(residual) ~ as.numeric(value), data = data_sum_snp, REML = TRUE)
+
+    anova_results <- anova(base_model, snp_model)
+
+    results_dt[ID == snp]$dataset_beta <- summary(snp_model)$coefficients$cond[2,1]
+    results_dt[ID == snp]$dataset_beta_se <- summary(snp_model)$coefficients$cond[2,2]
+    results_dt[ID == snp]$dataset_z <- summary(snp_model)$coefficients$cond[2,3]
+    results_dt[ID == snp]$dataset_p <- anova_results$`Pr(>Chisq)`[2]
+
+    if (results_dt[ID == snp]$ref_deboever == results_dt[ID == snp]$REF & results_dt[ID == snp]$alt_deboever == results_dt[ID == snp]$ALT) {
+        if ((results_dt[ID == snp]$beta_deboever * results_dt[ID == snp]$dataset_beta) > 0){
+            results_dt[ID == snp]$direction <- "match"
+        } else {
+            results_dt[ID == snp]$direction <- "opposite"
+        }
+    } else if (results_dt[ID == snp]$ref_deboever == results_dt[ID == snp]$ALT & results_dt[ID == snp]$REF == results_dt[ID == snp]$alt_deboever) {
+        if ((results_dt[ID == snp]$beta_deboever * results_dt[ID == snp]$dataset_beta) < 0){
+            results_dt[ID == snp]$direction <- "match"
+        } else {
+            results_dt[ID == snp]$direction <- "opposite"
+        }
+    } else {
+        results_dt[ID == snp]$direction <- "different_snp"
+    }
+
+    ###### Make a figure of the results
+    ### Different shapes for location
+    ### Different fill for village and uniculture
+    ### Color by line
+    if (!results_dt[ID == snp]$direction ==  "different_snp"){
+
+        data_sum_snp$value <- factor(data_sum_snp$value, levels = sort(unique(data_sum_snp$value)))
+
+        labels <- levels(data_sum_snp$Genotype)
+        names(labels) <- as.numeric(levels(data_sum_snp$value))
+
+        plot <- ggplot(data_sum_snp, aes(value, residual)) +
+            geom_point(aes(color = variable)) +
+            theme_classic() +
+            scale_color_manual(values = cell_line_colors) +
+            scale_fill_manual(values = cell_line_colors) +
+            geom_smooth(aes(as.numeric(value), residual), position = "identity",method = "lm", color = "black", se=FALSE) +
+            scale_x_discrete(labels=labels) +
+            ylab("Normalized Expression") +
+            ggtitle(paste0(results_dt[ID == snp]$rsid, "-", results_dt[ID == snp]$gene_name_deboever, " eQTL"),
+                    subtitle = paste0("beta = ", round(results_dt[ID == snp]$dataset_beta, 3))) +
+            xlab(paste0(results_dt[ID == snp]$rsid, '\nGenotype')) +
+            theme(plot.title = element_text(hjust = 0.5),
+                    plot.subtitle = element_text(hjust = 0.5)) +
+            labs(color = "iPSC Line", shape = "Site & Village", fill = NULL)
+
+        ggsave(plot, filename = paste0(outdir,"plots/",ensg, "_", snp,"_deboever_eQTL_results.png"), width = 5, height = 3.5)
+        ggsave(plot, filename = paste0(outdir,"plots/",ensg, "_", snp,"_deboever_eQTL_results.pdf"), width = 5, height = 3.5)
+    }
+}
+
+fwrite(results_dt, paste0(outdir,"beds/",ensg, "_deboever_eQTL_results.bed"), sep = "\t")
+
+
+
+
diff --git a/eQTL_check/multi-passage/test_eQTL.sh b/eQTL_check/multi-passage/test_eQTL.sh
new file mode 100644
index 0000000..7bf4e60
--- /dev/null
+++ b/eQTL_check/multi-passage/test_eQTL.sh
@@ -0,0 +1,109 @@
+#!/bin/bash
+
+SNAKEFILE="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/eQTL_check/multi-passage/test_eQTL.smk"
+LOG="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output//multi-passage/deboever/gene_separate/logs/"
+
+mkdir -p $LOG
+
+snakemake \
+  --snakefile $SNAKEFILE \
+  --dryrun \
+  --cores 1 \
+  --quiet \
+  --unlock
+
+
+snakemake \
+  --snakefile $SNAKEFILE \
+  --dryrun \
+  --cores 1 \
+  --reason > jobs2run.txt
+
+
+
+nohup \
+  snakemake \
+    --snakefile $SNAKEFILE \
+    --jobs 200 \
+    --use-singularity \
+    --restart-times 1 \
+    --keep-going \
+    --cluster \
+        "qsub -S /bin/bash \
+        -q short.q \
+        -r yes \
+        -pe smp {threads} \
+        -l tmp_requested={resources.disk_per_thread_gb}G \
+        -l mem_requested={resources.mem_per_thread_gb}G \
+        -e $LOG \
+        -o $LOG \
+        -j y \
+        -V" \
+  > $LOG/nohup_`date +%Y-%m-%d.%H:%M:%S`.log &
+
+
+snakemake \
+  --snakefile $SNAKEFILE \
+  --dryrun \
+  --cores 1 \
+  --quiet \
+  --unlock
+
+
+snakemake \
+  --snakefile $SNAKEFILE \
+  --dryrun \
+  --cores 1 \
+  --quiet \
+  --cleanup-metadata \
+  /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/icc/ENSG00000116001_icc.rds \
+  /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/fit_models/ENSG00000116001_fitted_models.rds \
+  /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/icc/ENSG00000084112_icc.rds \
+  /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/fit_models/ENSG00000084112_fitted_models.rds \
+  /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/icc/ENSG00000130706_icc.rds \
+  /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/fit_models/ENSG00000130706_fitted_models.rds
+
+
+
+
+
+
+ENSG00000140481
+ENSG00000158552
+ENSG00000167004
+ENSG00000184281
+ENSG00000185437
+ENSG00000185875
+ENSG00000196109
+ENSG00000197536
+ENSG00000215910
+ENSG00000215883
+ENSG00000258297
+ENSG00000259488
+ENSG00000274877
+
+
+ENSG00000158555
+ENSG00000167005
+ENSG00000184292
+ENSG00000184281
+ENSG00000185437
+ENSG00000185875
+ENSG00000196109
+ENSG00000197536
+ENSG00000215866
+ENSG00000258289
+ENSG00000259485
+ENSG00000274828
+
+
+rm ENSG00000167011_fitted_models.rds
+rm ENSG00000184304_fitted_models.rds
+rm ENSG00000185477_fitted_models.rds
+rm ENSG00000185900_fitted_models.rds
+rm ENSG00000196132_fitted_models.rds
+rm ENSG00000197566_fitted_models.rds
+rm ENSG00000217930_fitted_models.rds
+rm ENSG00000258593_fitted_models.rds
+rm ENSG00000259715_fitted_models.rds
+rm ENSG00000275074_fitted_models.rds
diff --git a/eQTL_check/multi-passage/test_eQTL.smk b/eQTL_check/multi-passage/test_eQTL.smk
new file mode 100644
index 0000000..d63e8e9
--- /dev/null
+++ b/eQTL_check/multi-passage/test_eQTL.smk
@@ -0,0 +1,53 @@
+import pandas as pd
+
+
+genes_file = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/multi-passage/deboever_imputed_overlapping_filtered_header_pruned_snp_gene.tsv"
+# genes_file_kilpinen = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/kilpinen_imputed_overlapping_filtered_header_pruned_snp_gene.tsv"
+genes = pd.read_csv(genes_file, sep = "\t")
+# genes_kilpenen = pd.read_csv(genes_file_kilpinen, sep = "\t")
+# genes_kilpenen = genes_kilpenen.iloc[1]
+
+rule all:
+    input:
+        expand("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/deboever/gene_separate/beds/{gene}_deboever_eQTL_results.bed", gene = genes.gene_id),
+        # expand("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/kilpinen/gene_separate/beds/{gene}_kilpinen_eQTL_results.bed", gene = genes_kilpenen.gene_id),
+
+
+rule eqtl:
+    input:
+        bed = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/multi-passage/deboever_imputed_overlapping_filtered_header_pruned.bed"
+    output:
+        "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/deboever/gene_separate/beds/{gene}_deboever_eQTL_results.bed"
+    resources:
+        mem_per_thread_gb = lambda wildcards, attempt: attempt * 4,
+        disk_per_thread_gb = lambda wildcards, attempt: attempt * 4
+    threads: 4
+    params:
+        script = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/eQTL_check/multi-passage/test_eQTL.R",
+        outdir="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/deboever/gene_separate/",
+        snp=lambda wildcards: genes.ID[genes.gene_id == wildcards.gene],
+    log:
+    shell:
+        """        
+        /directflow/SCCGGroupShare/projects/DrewNeavin/software/anaconda3/envs/baseR402/bin/Rscript {params.script} {wildcards.gene} {params.outdir} {input.bed}
+        """
+
+
+rule eqtl_kilpinen:
+    input:
+        bed = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/kilpinen_imputed_overlapping_filtered_header_pruned.bed"
+    output:
+        "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/kilpinen/gene_separate/beds/{gene}_kilpinen_eQTL_results.bed"
+    resources:
+        mem_per_thread_gb = lambda wildcards, attempt: attempt * 4,
+        disk_per_thread_gb = lambda wildcards, attempt: attempt * 4
+    threads: 4
+    params:
+        script = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/eQTL_check/test_eQTL_kilpinen.R",
+        outdir="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/kilpinen/gene_separate/",
+        snp=lambda wildcards: genes.ID_ref_alt[genes.gene_id == wildcards.gene],
+    log:
+    shell:
+        """        
+        /directflow/SCCGGroupShare/projects/DrewNeavin/software/anaconda3/envs/baseR402/bin/Rscript {params.script} {wildcards.gene} {params.outdir} {input.bed}
+        """
diff --git a/eQTL_check/multi-passage/test_eQTL_combine_deboever.R b/eQTL_check/multi-passage/test_eQTL_combine_deboever.R
new file mode 100644
index 0000000..7d152f4
--- /dev/null
+++ b/eQTL_check/multi-passage/test_eQTL_combine_deboever.R
@@ -0,0 +1,114 @@
+library(data.table)
+library(dplyr)
+library(ggplot2)
+library(colorspace)
+library(ggpp)
+
+
+indir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/deboever/gene_separate/beds/"
+outdir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/deboever/combined/"
+
+dir.create(outdir, recursive = TRUE)
+
+
+### Read in results
+files <- list.files(indir)
+
+bed_results_list <- lapply(files, function(x){
+    fread(paste0(indir, x), sep = "\t")
+})
+
+
+results_dt <- do.call(rbind, bed_results_list)
+
+
+results_dt$fdr <- p.adjust(results_dt$dataset_p, method = "fdr")
+
+
+results_dt <- results_dt[order(fdr)]
+
+head(results_dt, n =100)
+
+
+fwrite(results_dt, paste0(outdir, "deboever_overlap_results.tsv"), sep = "\t")
+
+results_dt <- results_dt[direction != "different_snp"]
+results_dt$significant <- ifelse(results_dt$fdr < 0.05, TRUE, FALSE)
+
+
+results_dt$direction <- factor(results_dt$direction, levels = c("opposite","match"))
+
+
+correlation_plot <- ggplot(results_dt, aes(beta_deboever, dataset_beta, color = log(fdr))) +
+        geom_point() +
+        theme_classic() +
+         scale_color_continuous_sequential(palette = "Purp", rev = FALSE)
+        # scale_color_manual(values = c("grey","black"))
+
+ggsave(correlation_plot, filename = paste0(outdir, "correlation_plot.png"))
+
+
+histogram_plot <- ggplot(results_dt, aes(fdr, fill = direction)) +
+        geom_histogram(position = "stack", binwidth = 0.05) +
+        theme_classic() +
+        geom_vline(aes(xintercept=0.05),
+            color="red", linetype="dashed", size=1) +
+        scale_fill_manual(values = c("grey","black"))
+        
+ggsave(histogram_plot, filename = paste0(outdir, "histogram_plot.png"))
+
+histogram_plot_log <- ggplot(results_dt, aes(log10(fdr), fill = direction)) +
+        geom_histogram(position = "stack", binwidth = 1) +
+        theme_classic() +
+        geom_vline(aes(xintercept=log10(0.05)),
+            color="red", linetype="dashed", size=0.25) +
+        scale_fill_manual(values = c("grey","black")) +
+        scale_y_continuous(expand = c(0, 0)) +
+        ylab("Number of Associations") +
+        labs(fill = "eQTL\nEffect\nDirection\nAgreement")
+
+        
+ggsave(histogram_plot_log, filename = paste0(outdir, "histogram_plot_log.png"))
+
+
+popout_histogram_plot_log <- ggplot(results_dt[log10(fdr) < -10], aes(log10(fdr), fill = direction)) +
+        geom_histogram(position = "stack", binwidth = 1) +
+        theme_classic() +
+        scale_fill_manual(values = c("grey","black")) +
+        theme(axis.title.x=element_blank(),
+            axis.title.y=element_blank(),
+            legend.position="none",
+            axis.text.x=element_text(size=7),
+            axis.text.y=element_text(size=7)) +
+        scale_y_continuous(expand = c(0, 0))
+
+ggsave(popout_histogram_plot_log, filename = paste0(outdir, "popout_histogram_plot_log.png"))
+
+combined_plot_log <- histogram_plot_log + annotation_custom(ggplotGrob(popout_histogram_plot_log), xmin = -35, xmax = -15, 
+                       ymin = 500, ymax = 1225)
+
+ggsave(combined_plot_log, filename = paste0(outdir, "histogram_plot_log_w_popout.png"), width = 5, height = 3)
+ggsave(combined_plot_log, filename = paste0(outdir, "histogram_plot_log_w_popout.pdf"), width = 5, height = 3)
+
+
+results_dt$updated_beta <- ifelse(results_dt$ref_deboever != results_dt$REF, results_dt$beta_deboever * -1, results_dt$beta_deboever)
+
+
+
+
+correlation_plot <- ggplot(results_dt, aes(updated_beta, dataset_beta, color = direction)) +
+        geom_hline(yintercept=0) +
+        geom_vline(xintercept=0) +
+        geom_point(alpha = 0.5) +
+        theme_classic() +
+        ylab("This Dataset Effect Size") +
+        xlab("DeBoever Effect Size") +
+        scale_color_manual(values = c("grey60", "black"), name = "Effect\nDirections") +
+        stat_quadrant_counts() +
+        xlim(-2.25,2.25) +
+        ylim(-8.5,8.5) +
+        theme(legend.position = "none")
+
+ggsave(correlation_plot, filename = paste0(outdir, "correlation_plot_DeBoever.png"), height = 2.5, width = 2.5)
+ggsave(correlation_plot, filename = paste0(outdir, "correlation_plot_DeBoever.pdf"), height = 2.5, width = 2.5)
+
diff --git a/eQTL_check/pull_eQTL_SNPs.sh b/eQTL_check/pull_eQTL_SNPs.sh
index 4fd2b48..95a89a5 100644
--- a/eQTL_check/pull_eQTL_SNPs.sh
+++ b/eQTL_check/pull_eQTL_SNPs.sh
@@ -71,8 +71,21 @@ conda activate bedtools
 
 conda deactivate
 
+
+
+head -n 1 $OUT/kilpinen.bed > $INTERSECT_OUT/kilpinen_header_bed.tsv
+grep "#CHROM" $INTERSECT_OUT/merged_imputed_AllChrs_iPSC_R2_0.3_filtered_diff_genotypes.vcf > $INTERSECT_OUT/kilpinen_header_vcf.tsv
+paste -d"\t" $INTERSECT_OUT/kilpinen_header_bed.tsv $INTERSECT_OUT/kilpinen_header_vcf.tsv > $INTERSECT_OUT/kilpinen_combined_header.tsv
+
+cat $INTERSECT_OUT/kilpinen_combined_header.tsv > $INTERSECT_OUT/kilpinen_imputed_overlapping_filtered_header.bed
+cat $INTERSECT_OUT/kilpinen_imputed_overlapping_filtered.bed >> $INTERSECT_OUT/kilpinen_imputed_overlapping_filtered_header.bed
+
+
+
+
+
 ### Use this file to test for trends in gene
-awk 'BEGIN{FS=OFS="\t"}{print($24, $7)}' $INTERSECT_OUT/kilpinen_imputed_overlapping_filtered.bed | sed 's/\..*//g' > $INTERSECT_OUT/gene_snp_list.tsv
+awk 'BEGIN{FS=OFS="\t"}{print($24, $7)}' $INTERSECT_OUT/kilpinen_imputed_overlapping_filtered_header.bed | sed 's/\..*//g' > $INTERSECT_OUT/gene_snp_list.tsv
 
 sed -i '1i snp\tgene' $INTERSECT_OUT/gene_snp_list.tsv
 
diff --git a/eQTL_check/test_eQTL.smk b/eQTL_check/test_eQTL.smk
index 85465ea..ee79b38 100644
--- a/eQTL_check/test_eQTL.smk
+++ b/eQTL_check/test_eQTL.smk
@@ -2,12 +2,15 @@ import pandas as pd
 
 
 genes_file = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/deboever_imputed_overlapping_filtered_header_pruned_snp_gene.tsv"
+genes_file_kilpinen = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/kilpinen_imputed_overlapping_filtered_header_pruned_snp_gene.tsv"
 genes = pd.read_csv(genes_file, sep = "\t")
-# genes = genes.iloc[1]
+genes_kilpenen = pd.read_csv(genes_file_kilpinen, sep = "\t")
+# genes_kilpenen = genes_kilpenen.iloc[1]
 
 rule all:
     input:
-        expand("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/deboever/gene_separate/beds/{gene}_deboever_eQTL_results.bed", gene = genes.gene_id),
+        # expand("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/deboever/gene_separate/beds/{gene}_deboever_eQTL_results.bed", gene = genes.gene_id),
+        expand("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/kilpinen/gene_separate/beds/{gene}_kilpinen_eQTL_results.bed", gene = genes_kilpenen.gene_id),
 
 
 rule eqtl:
@@ -30,3 +33,21 @@ rule eqtl:
         """
 
 
+rule eqtl_kilpinen:
+    input:
+        bed = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/kilpinen_imputed_overlapping_filtered_header_pruned.bed"
+    output:
+        "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/kilpinen/gene_separate/beds/{gene}_kilpinen_eQTL_results.bed"
+    resources:
+        mem_per_thread_gb = lambda wildcards, attempt: attempt * 4,
+        disk_per_thread_gb = lambda wildcards, attempt: attempt * 4
+    threads: 4
+    params:
+        script = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/eQTL_check/test_eQTL_kilpinen.R",
+        outdir="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/kilpinen/gene_separate/",
+        snp=lambda wildcards: genes.ID_ref_alt[genes.gene_id == wildcards.gene],
+    log:
+    shell:
+        """        
+        /directflow/SCCGGroupShare/projects/DrewNeavin/software/anaconda3/envs/baseR402/bin/Rscript {params.script} {wildcards.gene} {params.outdir} {input.bed}
+        """
diff --git a/eQTL_check/test_eQTL_combine.R b/eQTL_check/test_eQTL_combine.R
index 6252323..390a764 100644
--- a/eQTL_check/test_eQTL_combine.R
+++ b/eQTL_check/test_eQTL_combine.R
@@ -2,26 +2,53 @@ library(data.table)
 library(dplyr)
 library(ggplot2)
 library(colorspace)
+library(ggpp)
 
 
-indir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/deboever/gene_separate/beds/"
-outdir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/deboever/combined/"
+indir_deboever <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/deboever/gene_separate/beds/"
+indir_kilpinen <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/kilpinen/gene_separate/beds/"
+outdir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/eQTL_overlap/combined/"
+
 
 dir.create(outdir, recursive = TRUE)
 
 
 ### Read in results
-files <- list.files(indir)
+deboever_files <- list.files(indir_deboever)
+
+bed_results_list <- lapply(deboever_files, function(x){
+    fread(paste0(indir_deboever, x), sep = "\t")
+})
+
+
+deboever_results_dt <- do.call(rbind, bed_results_list)
+deboever_results_dt$fdr <- p.adjust(deboever_results_dt$dataset_p, method = "fdr")
+deboever_results_dt$study <- "DeBoever"
+
+
+### Kilpinen
+kilpinen_files <- list.files(indir_kilpinen)
+
+kilpinen_results_list <- lapply(kilpinen_files, function(x){
+    fread(paste0(indir_kilpinen, x), sep = "\t")
 
-bed_results_list <- lapply(files, function(x){
-    fread(paste0(indir, x), sep = "\t")
 })
 
 
-results_dt <- do.call(rbind, bed_results_list)
+kilpinen_results_dt <- do.call(rbind, kilpinen_results_list)
+kilpinen_results_dt$fdr <- p.adjust(kilpinen_results_dt$dataset_p, method = "fdr")
+kilpinen_results_dt$study <- "Kilpinen"
+
 
+deboever_results_dt_prep <- deboever_results_dt[,c("chrom", "start", "end", "maf_deboever", "ref_deboever", "alt_deboever", "gene_id_deboever", "beta_deboever", "ID_ref_alt", "REF", "ALT", "TOB0421", "FSA0006", "MBE1006", "dataset_beta", "dataset_p", "direction", "fdr", "study")]
+colnames(deboever_results_dt_prep) <- c("chrom", "start", "end", "maf", "REF", "ALT", "ensg", "beta", "ID_ref_alt", "dataset_REF", "dataset_ALT", "TOB0421", "FSA0006", "MBE1006", "dataset_beta", "dataset_p", "direction", "fdr", "study")
+kilpinen_results_dt_prep <- kilpinen_results_dt[,c("#chrom", "start", "end", "maf_kilpinen", "REF", "ALT", "gene_id_kilpinen", "lmm_peer_beta_kilpinen", "ID_ref_alt", "REF", "ALT", "TOB0421", "FSA0006", "MBE1006", "dataset_beta",  "dataset_p", "direction", "fdr", "study")]
+colnames(kilpinen_results_dt_prep) <- c("chrom", "start", "end", "maf", "REF", "ALT", "ensg", "beta", "ID_ref_alt", "dataset_REF", "dataset_ALT", "TOB0421", "FSA0006", "MBE1006", "dataset_beta",  "dataset_p", "direction", "fdr", "study")
+kilpinen_results_dt_prep$REF <- NA
+kilpinen_results_dt_prep$ALT <- NA
 
-results_dt$fdr <- p.adjust(results_dt$dataset_p, method = "fdr")
+
+results_dt <- rbind(deboever_results_dt_prep, kilpinen_results_dt_prep)
 
 
 results_dt <- results_dt[order(fdr)]
@@ -30,31 +57,97 @@ head(results_dt, n =100)
 
 results_dt[TOB0421 != FSA0006 & TOB0421 != MBE1006 & FSA0006 != MBE1006]
 
-fwrite(results_dt, paste0(outdir, "deboever_overlap_results.tsv"), sep = "\t")
+fwrite(results_dt, paste0(outdir, "overlap_results.tsv"), sep = "\t")
 
 results_dt <- results_dt[direction != "different_snp"]
 results_dt$significant <- ifelse(results_dt$fdr < 0.05, TRUE, FALSE)
 
-results_dt[gene_name_deboever == "CHCHD2"]
+results_dt[ensg == "ENSG00000106153"]
 
 results_dt$direction <- factor(results_dt$direction, levels = c("opposite","match"))
 
 
-correlation_plot <- ggplot(results_dt, aes(beta_deboever, dataset_beta, color = log(fdr))) +
-        geom_point() +
+results_dt$updated_beta <- ifelse(results_dt$study == "Kilpinen" & results_dt$direction == "match" & results_dt$beta * results_dt$dataset_beta < 0, results_dt$beta * -1, 
+    ifelse(results_dt$study == "Kilpinen" & results_dt$direction == "opposite" & results_dt$beta * results_dt$dataset_beta > 0, results_dt$beta * -1, 
+        ifelse(results_dt$study == "DeBoever" & results_dt$REF != results_dt$dataset_REF, results_dt$beta * -1, results_dt$beta)))
+
+
+correlation <- list()
+correlation_dt <- data.table(study = unique(results_dt$study), rho = as.numeric(NA), p = as.numeric(NA))
+
+for (Study in results_dt$study){
+    correlation[[Study]] <- cor.test(results_dt[study == Study]$updated_beta, results_dt[study == Study]$dataset_beta, method = "spearman", exact = TRUE)
+    correlation_dt[study == Study, "rho"] <- correlation[[Study]]$estimate[1]
+    correlation_dt[study == Study, "p"] <- correlation[[Study]]$p.value
+}
+
+correlation_plot <- ggplot(results_dt, aes(updated_beta, dataset_beta, color = significant)) +
+        geom_hline(yintercept=0) +
+        geom_vline(xintercept=0) +
+        geom_point(alpha = 0.25) +
         theme_classic() +
-         scale_color_continuous_sequential(palette = "Purp", rev = FALSE)
+        # scale_color_continuous_sequential(palette = "Purp", rev = FALSE) +
+        scale_color_manual(values = c("lightgrey", "black")) +
+        facet_wrap(vars(study))
         # scale_color_manual(values = c("grey","black"))
 
 ggsave(correlation_plot, filename = paste0(outdir, "correlation_plot.png"))
 
 
+
+correlation_plot <- ggplot(results_dt[study == "DeBoever" & significant == TRUE], aes(updated_beta, dataset_beta, color = direction)) +
+        geom_hline(yintercept=0) +
+        geom_vline(xintercept=0) +
+        geom_point(alpha = 0.25) +
+        theme_classic() +
+        ylab("This Dataset Effect Size") +
+        xlab("DeBoever Effect Size") +
+        scale_color_manual(values = c("grey60", "black"), name = "Effect\nDirections") +
+        stat_quadrant_counts() +
+        xlim(-2.25,2.25) +
+        ylim(-8.5,8.5) +
+        theme(legend.position = "none")
+
+ggsave(correlation_plot, filename = paste0(outdir, "correlation_plot_DeBoever.png"), height = 2.5, width = 2.5)
+ggsave(correlation_plot, filename = paste0(outdir, "correlation_plot_DeBoever.pdf"), height = 2.5, width = 2.5)
+
+
+
+
+### add number of genes in each quadrant
+dim(results_dt[updated_beta < 0 & dataset_beta < 0]) # 2255
+dim(results_dt[updated_beta > 0 & dataset_beta > 0]) # 2116
+
+dim(results_dt[updated_beta < 0 & dataset_beta > 0]) # 1071
+dim(results_dt[updated_beta > 0 & dataset_beta < 0]) # 1008
+
+
+dim(results_dt[updated_beta < 0 & dataset_beta < 0 & study == "DeBoever"]) # 1534
+dim(results_dt[updated_beta > 0 & dataset_beta > 0 & study == "DeBoever"]) # 1378
+
+dim(results_dt[updated_beta < 0 & dataset_beta > 0 & study == "DeBoever"]) # 688
+dim(results_dt[updated_beta > 0 & dataset_beta < 0 & study == "DeBoever"]) # 600
+
+
+dim(results_dt[updated_beta < 0 & dataset_beta < 0 & study == "Kilpinen"]) # 721
+dim(results_dt[updated_beta > 0 & dataset_beta > 0 & study == "Kilpinen"]) # 738
+
+dim(results_dt[updated_beta < 0 & dataset_beta > 0 & study == "Kilpinen"]) # 383
+dim(results_dt[updated_beta > 0 & dataset_beta < 0 & study == "Kilpinen"]) # 408
+
+
+
+
+
+
+
 histogram_plot <- ggplot(results_dt, aes(fdr, fill = direction)) +
         geom_histogram(position = "stack", binwidth = 0.05) +
         theme_classic() +
         geom_vline(aes(xintercept=0.05),
             color="red", linetype="dashed", size=1) +
-        scale_fill_manual(values = c("grey","black"))
+        scale_fill_manual(values = c("grey","black")) +
+        facet_wrap(vars(study))
         
 ggsave(histogram_plot, filename = paste0(outdir, "histogram_plot.png"))
 
@@ -64,9 +157,11 @@ histogram_plot_log <- ggplot(results_dt, aes(log10(fdr), fill = direction)) +
         geom_vline(aes(xintercept=log10(0.05)),
             color="red", linetype="dashed", size=0.25) +
         scale_fill_manual(values = c("grey","black")) +
-        scale_y_continuous(expand = c(0, 0))
+        scale_y_continuous(expand = c(0, 0)) +
+        ylab("Number of Associations") +
+        labs(fill = "eQTL\nEffect\nDirection\nAgreement") +
+        facet_wrap(vars(study), scales = "free")
 
-        
 ggsave(histogram_plot_log, filename = paste0(outdir, "histogram_plot_log.png"))
 
 
@@ -79,14 +174,58 @@ popout_histogram_plot_log <- ggplot(results_dt[log10(fdr) < -10], aes(log10(fdr)
             legend.position="none",
             axis.text.x=element_text(size=7),
             axis.text.y=element_text(size=7)) +
-        scale_y_continuous(expand = c(0, 0))
+        scale_y_continuous(expand = c(0, 0)) +
+        facet_wrap(vars(study), scales = "free")
+
 
 ggsave(popout_histogram_plot_log, filename = paste0(outdir, "popout_histogram_plot_log.png"))
 
-combined_plot_log <- histogram_plot_log + annotation_custom(ggplotGrob(popout_histogram_plot_log), xmin = -35, xmax = -15, 
+annotation_custom2 <- function (grob, xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf, data) 
+{
+  layer(data = data, stat = StatIdentity, position = PositionIdentity, 
+        geom = ggplot2:::GeomCustomAnn,
+        inherit.aes = TRUE, params = list(grob = grob, 
+                                          xmin = xmin, xmax = xmax, 
+                                          ymin = ymin, ymax = ymax))
+}
+
+get_inset <- function(df){
+  p <- ggplot(data=df %>% 
+                group_by(study) %>% 
+                slice(1),
+               aes(x = fdr, fill = direction)) +
+    geom_histogram(position = "stack", binwidth = 1) +
+        theme_classic() +
+        scale_fill_manual(values = c("grey","black")) +
+        theme(axis.title.x=element_blank(),
+            axis.title.y=element_blank(),
+            legend.position="none",
+            axis.text.x=element_text(size=7),
+            axis.text.y=element_text(size=7)) +
+        scale_y_continuous(expand = c(0, 0)) 
+  return(p)
+}
+
+
+insets <- results_dt %>% 
+  split(f = .$study) %>%
+  purrr::map(~annotation_custom2(
+    grob = ggplotGrob(get_inset(.)), 
+    data = data.frame(category=unique(.$study)),
+    ymin = -500, ymax=1225, xmin=-32, xmax=-15)
+  )
+
+
+combined_plot_log <- histogram_plot_log + annotation_custom2(ggplotGrob(popout_histogram_plot_log), xmin = -35, xmax = -15, 
+                       ymin = 500, ymax = 1225)
+
+combined_plot_log <- histogram_plot_log + annotation_custom(grob=ggplotGrob(insets), xmin = -35, xmax = -15, 
                        ymin = 500, ymax = 1225)
 
-ggsave(combined_plot_log, filename = paste0(outdir, "histogram_plot_log_w_popout.png"), width = 5, height = 3)
-ggsave(combined_plot_log, filename = paste0(outdir, "histogram_plot_log_w_popout.pdf"), width = 5, height = 3)
+
+ggsave(combined_plot_log, filename = paste0(outdir, "histogram_plot_log_w_popout.png"), width = 5, height = 2.5)
+ggsave(combined_plot_log, filename = paste0(outdir, "histogram_plot_log_w_popout.pdf"), width = 5, height = 2.5)
+
 
 
+### Calculate number of eQTLs and write in notion for writing paper time ###
diff --git a/eQTL_check/test_eQTL_combine_deboever.R b/eQTL_check/test_eQTL_combine_deboever.R
new file mode 100644
index 0000000..8aa6c76
--- /dev/null
+++ b/eQTL_check/test_eQTL_combine_deboever.R
@@ -0,0 +1,96 @@
+library(data.table)
+library(dplyr)
+library(ggplot2)
+library(colorspace)
+
+
+indir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/deboever/gene_separate/beds/"
+outdir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/deboever/combined/"
+
+dir.create(outdir, recursive = TRUE)
+
+
+### Read in results
+files <- list.files(indir)
+
+bed_results_list <- lapply(files, function(x){
+    fread(paste0(indir, x), sep = "\t")
+})
+
+
+results_dt <- do.call(rbind, bed_results_list)
+
+
+results_dt$fdr <- p.adjust(results_dt$dataset_p, method = "fdr")
+
+
+results_dt <- results_dt[order(fdr)]
+
+head(results_dt, n =100)
+
+results_dt[TOB0421 != FSA0006 & TOB0421 != MBE1006 & FSA0006 != MBE1006]
+
+fwrite(results_dt, paste0(outdir, "deboever_overlap_results.tsv"), sep = "\t")
+
+results_dt <- results_dt[direction != "different_snp"]
+results_dt$significant <- ifelse(results_dt$fdr < 0.05, TRUE, FALSE)
+
+results_dt[gene_name_deboever == "CHCHD2"]
+
+results_dt$direction <- factor(results_dt$direction, levels = c("opposite","match"))
+
+
+correlation_plot <- ggplot(results_dt, aes(beta_deboever, dataset_beta, color = log(fdr))) +
+        geom_point() +
+        theme_classic() +
+         scale_color_continuous_sequential(palette = "Purp", rev = FALSE)
+        # scale_color_manual(values = c("grey","black"))
+
+ggsave(correlation_plot, filename = paste0(outdir, "correlation_plot.png"))
+
+
+histogram_plot <- ggplot(results_dt, aes(fdr, fill = direction)) +
+        geom_histogram(position = "stack", binwidth = 0.05) +
+        theme_classic() +
+        geom_vline(aes(xintercept=0.05),
+            color="red", linetype="dashed", size=1) +
+        scale_fill_manual(values = c("grey","black"))
+        
+ggsave(histogram_plot, filename = paste0(outdir, "histogram_plot.png"))
+
+histogram_plot_log <- ggplot(results_dt, aes(log10(fdr), fill = direction)) +
+        geom_histogram(position = "stack", binwidth = 1) +
+        theme_classic() +
+        geom_vline(aes(xintercept=log10(0.05)),
+            color="red", linetype="dashed", size=0.25) +
+        scale_fill_manual(values = c("grey","black")) +
+        scale_y_continuous(expand = c(0, 0)) +
+        ylab("Number of Associations") +
+        labs(fill = "eQTL\nEffect\nDirection\nAgreement")
+
+        
+ggsave(histogram_plot_log, filename = paste0(outdir, "histogram_plot_log.png"))
+
+
+popout_histogram_plot_log <- ggplot(results_dt[log10(fdr) < -10], aes(log10(fdr), fill = direction)) +
+        geom_histogram(position = "stack", binwidth = 1) +
+        theme_classic() +
+        scale_fill_manual(values = c("grey","black")) +
+        theme(axis.title.x=element_blank(),
+            axis.title.y=element_blank(),
+            legend.position="none",
+            axis.text.x=element_text(size=7),
+            axis.text.y=element_text(size=7)) +
+        scale_y_continuous(expand = c(0, 0))
+
+ggsave(popout_histogram_plot_log, filename = paste0(outdir, "popout_histogram_plot_log.png"))
+
+combined_plot_log <- histogram_plot_log + annotation_custom(ggplotGrob(popout_histogram_plot_log), xmin = -35, xmax = -15, 
+                       ymin = 500, ymax = 1225)
+
+ggsave(combined_plot_log, filename = paste0(outdir, "histogram_plot_log_w_popout.png"), width = 5, height = 3)
+ggsave(combined_plot_log, filename = paste0(outdir, "histogram_plot_log_w_popout.pdf"), width = 5, height = 3)
+
+
+
+### Calculate number of eQTLs and write in notion for writing paper time ###
diff --git a/eQTL_check/test_eQTL_combine_kilpinen.R b/eQTL_check/test_eQTL_combine_kilpinen.R
new file mode 100644
index 0000000..fde0f74
--- /dev/null
+++ b/eQTL_check/test_eQTL_combine_kilpinen.R
@@ -0,0 +1,109 @@
+library(data.table)
+library(dplyr)
+library(ggplot2)
+library(colorspace)
+
+
+indir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/kilpinen/gene_separate/beds/"
+outdir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/kilpinen/combined/"
+
+dir.create(outdir, recursive = TRUE)
+
+
+### Read in results
+files <- list.files(indir)
+
+bed_results_list <- lapply(files, function(x){
+    fread(paste0(indir, x), sep = "\t")
+})
+
+
+results_dt <- do.call(rbind, bed_results_list)
+
+
+results_dt$fdr <- p.adjust(results_dt$dataset_p, method = "fdr")
+
+
+results_dt <- results_dt[order(fdr)]
+
+head(results_dt, n =100)
+
+results_dt[TOB0421 != FSA0006 & TOB0421 != MBE1006 & FSA0006 != MBE1006]
+
+fwrite(results_dt, paste0(outdir, "kilpinen_overlap_results.tsv"), sep = "\t")
+
+results_dt <- results_dt[direction != "different_snp"]
+results_dt$significant <- ifelse(results_dt$fdr < 0.05, TRUE, FALSE)
+
+results_dt[Gene_ID == "CHCHD2"]
+
+results_dt$direction <- factor(results_dt$direction, levels = c("opposite","match"))
+
+results_dt$lmm_peer_beta_kilpinen4plotting <- ifelse(results_dt$maf_kilpinen > 0.5, (-1 * results_dt$lmm_peer_beta_kilpinen), results_dt$lmm_peer_beta_kilpinen)
+results_dt$dataset_beta4plotting <- ifelse(results_dt$dataset_maf > 0.5, (-1 * results_dt$dataset_beta), results_dt$dataset_beta)
+
+
+
+correlation_plot <- ggplot(results_dt, aes(lmm_peer_beta_kilpinen4plotting, dataset_beta4plotting, color = log(fdr))) +
+        geom_point() +
+        theme_classic() +
+         scale_color_continuous_sequential(palette = "Purp", rev = FALSE)
+        # scale_color_manual(values = c("grey","black"))
+
+ggsave(correlation_plot, filename = paste0(outdir, "kilpinen_correlation_plot.png"))
+
+
+histogram_plot <- ggplot(results_dt, aes(fdr, fill = direction)) +
+        geom_histogram(position = "stack", binwidth = 0.05) +
+        theme_classic() +
+        geom_vline(aes(xintercept=0.05),
+            color="red", linetype="dashed", size=1) +
+        scale_fill_manual(values = c("grey","black"))
+        
+ggsave(histogram_plot, filename = paste0(outdir, "kilpinen_histogram_plot.png"))
+
+
+histogram_plot_log <- ggplot(results_dt, aes(log10(fdr), fill = direction)) +
+        geom_histogram(position = "stack", binwidth = 1) +
+        theme_classic() +
+        geom_vline(aes(xintercept=log10(0.05)),
+            color="red", linetype="dashed", size=0.25) +
+        scale_fill_manual(values = c("grey","black")) +
+        scale_y_continuous(expand = c(0, 0)) +
+        ylab("Number of Associations") +
+        labs(fill = "eQTL\nEffect\nDirection\nAgreement")
+
+        
+ggsave(histogram_plot_log, filename = paste0(outdir, "kilpinen_histogram_plot_log.png"))
+
+
+popout_histogram_plot_log <- ggplot(results_dt[log10(fdr) < -10], aes(log10(fdr), fill = direction)) +
+        geom_histogram(position = "stack", binwidth = 1) +
+        theme_classic() +
+        scale_fill_manual(values = c("grey","black")) +
+        theme(axis.title.x=element_blank(),
+            axis.title.y=element_blank(),
+            legend.position="none",
+            axis.text.x=element_text(size=7),
+            axis.text.y=element_text(size=7)) +
+        scale_y_continuous(expand = c(0, 0))
+
+ggsave(popout_histogram_plot_log, filename = paste0(outdir, "kilpinen_popout_histogram_plot_log.png"))
+
+combined_plot_log <- histogram_plot_log + annotation_custom(ggplotGrob(popout_histogram_plot_log), xmin = -29, xmax = -12, 
+                       ymin = 275, ymax = 750)
+
+ggsave(combined_plot_log, filename = paste0(outdir, "kilpinen_histogram_plot_log_w_popout.png"), width = 5, height = 3)
+ggsave(combined_plot_log, filename = paste0(outdir, "kilpinen_histogram_plot_log_w_popout.pdf"), width = 5, height = 3)
+
+
+
+### Calculate number of eQTLs and write in notion for writing paper time ###
+nrow(results_dt)
+
+table(results_dt$direction)
+
+nrow(results_dt[fdr < 0.05])
+
+table(results_dt[fdr < 0.05]$direction)
+
diff --git a/eQTL_check/test_eQTL_kilpinen.R b/eQTL_check/test_eQTL_kilpinen.R
new file mode 100644
index 0000000..1b4bd07
--- /dev/null
+++ b/eQTL_check/test_eQTL_kilpinen.R
@@ -0,0 +1,206 @@
+### Reason: test eQTLs identified in Kilpinen et al in our dataset
+### Author: Drew Neavin
+### Date: 1 April, 2022
+
+
+library(data.table)
+library(Seurat)
+library(vcfR)
+library(glmmTMB)
+library(dplyr)
+library(ggplot2)
+
+
+
+##### Bring in variables #####
+### Bring in arguments
+args <- commandArgs(trailingOnly = TRUE)
+ensg <- args[1]
+outdir <- args[2]
+bed <- args[3]
+datadir <- args[4]
+
+
+cell_line_colors <- c("FSA0006" = "#F79E29", "MBE1006" = "#9B2C99", "TOB0421"= "#35369C")
+
+
+##### Read in dataframe of pairs to test #####
+snp_gene_pairs <- fread(bed)
+snp_gene_pairs$gene_id <- gsub("\\..+", "", snp_gene_pairs$gene_id)
+
+
+
+
+##### Subset the correct  snp #####
+snp_gene_pairs_subset <- snp_gene_pairs[gene_id == ensg]
+
+
+
+##### Read in data #####
+residuals <- readRDS(paste0("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review2/gene_separated/residuals4qtl/", ensg, "_residuals4qtl.rds"))
+residual_dt <- data.table(residuals)
+colnames(residual_dt) <- c("residual")
+residual_dt$Barcode <- names(residuals)
+
+vcf <- read.vcfR("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/finalized_snps.recode.vcf")
+meta <- fread("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Kilpinen_eQTLs/cell_meta.tsv", sep = "\t")
+data <- meta[residual_dt, on = "Barcode"]
+data$Barcode <- NULL
+
+
+
+
+### Summarize data by site and line and replicatae
+data_sum <- data[, .(residual=mean(residual)), by = .(Village, Line, Site, Replicate)]
+
+
+
+##### Get the snp from the vcf #####
+## GT ##
+snp_dt <- data.table(extract.gt(element = "GT",vcf, IDtoRowNames = F))
+
+
+if (!all(colSums(is.na(snp_dt)) == nrow(snp_dt))){
+    message("Found GT genotype format in cluster vcf. Will use that metric for cluster correlation.")
+    format_clust = "GT"
+
+    if (any(grepl("\\|",snp_dt[,1]))){
+        separator = "|"
+        message("Detected | separator for GT genotype format in cluster vcf")
+    } else if (any(grepl("/",snp_dt[,1]))) {
+        separator = "/"
+        message("Detected / separator for GT genotype format in cluster vcf")
+    } else {
+        format_clust = NA
+        message("Can't identify a separator for the GT field in cluster vcf, moving on to using GP.")
+    }
+    if (!is.na(format_clust)){
+        snp_dt <- data.table(as_tibble(lapply(snp_dt, function(x) {gsub(paste0("0\\",separator,"0"),0, x)}) %>%
+                                lapply(., function(x) {gsub(paste0("0\\",separator,"1"),1, x)}) %>%
+                                lapply(., function(x) {gsub(paste0("1\\",separator,"0"),1, x)}) %>%
+                                lapply(., function(x) {gsub(paste0("1\\",separator,"1"),2, x)})))
+    }
+}
+print("Completed GT conversion")
+
+
+colnames(snp_dt) <- gsub("36_TOB00421_i_E8", "TOB0421", colnames(snp_dt)) %>%
+                        gsub("22_FSA", "FSA0006", .) %>%
+                        gsub("29_MBE", "MBE1006", .)
+
+    
+## Get specific SNP ##
+snp_dt$ID_ref_alt <- paste0(vcf@fix[,'CHROM'],":", vcf@fix[,'POS'],"_", vcf@fix[,'REF'],"_", vcf@fix[,'ALT'])
+snp_dt$dataset_maf <- (1 -  as.numeric(gsub(";.+", "", vcf@fix[,'INFO']) %>% gsub("AF=", "", .)))
+
+snp_dt_subset <- snp_dt[ID_ref_alt %in% snp_gene_pairs_subset$ID_ref_alt]
+
+
+
+### Add SNP data to data data.table ###
+snp_dt_subset_long <- melt(snp_dt_subset, id.vars = c("ID_ref_alt", "dataset_maf"), measure.vars = c("TOB0421", "FSA0006", "MBE1006"))
+
+
+
+##### Make dataframe with kilpinen results, snp genotypes and our beta + pvalue #####
+results_dt <- snp_gene_pairs_subset[,c("#chrom", "start", "end", "variant_id", "info_score", "maf", "gene_id", "lmm_pvalue", "lmm_qvalue", "lmm_peer_beta", "primary_eQTL", "iPSC_specific", "ID", "REF", "ALT", "ID_ref_alt")]
+colnames(results_dt) <- c(colnames(results_dt)[1:4],paste0(colnames(results_dt)[5:12], "_kilpinen"), colnames(results_dt)[13:ncol(results_dt)])
+results_dt <- results_dt[snp_dt_subset, on = "ID_ref_alt"]
+
+
+
+### Add genes to dataframe ###
+gene_conversion <- fread("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/data/Expression_200128_A00152_0196_BH3HNFDSXY/GE/DRENEA_1/outs/filtered_feature_bc_matrix/features.tsv.gz", header = FALSE, col.names = c("ENSG", "Gene_ID", "Assay"))
+results_dt$Gene_ID <- gene_conversion[ENSG == ensg]$Gene_ID
+
+
+
+results_dt$dataset_beta <- as.numeric(NA)
+results_dt$dataset_beta_se <- as.numeric(NA)
+results_dt$dataset_z <- as.numeric(NA)
+results_dt$dataset_p <- as.numeric(NA)
+results_dt$direction <- as.character(NA)
+
+
+for (snp in unique(snp_dt_subset_long$ID_ref_alt)){
+
+    data_sum_snp <- snp_dt_subset_long[ID_ref_alt == snp][data_sum, on = c("variable" = "Line")]
+
+    gt_long <- melt(results_dt[ID_ref_alt == snp][,c("ID_ref_alt", "TOB0421", "FSA0006", "MBE1006", "REF", "ALT")], measure.vars = c("TOB0421", "FSA0006", "MBE1006"))
+    gt_long$Genotype <- ifelse(gt_long$value == 0, paste0(gt_long$REF, "/", gt_long$REF), 
+                            ifelse(gt_long$value == 1, paste0(gt_long$REF, "/", gt_long$ALT),
+                                ifelse(gt_long$value == 2, paste0(gt_long$ALT, "/", gt_long$ALT), NA)))
+
+    gt_long <- unique(gt_long)
+
+    data_sum_snp <- gt_long[data_sum_snp, on = c("value", "variable", "ID_ref_alt")]
+
+    data_sum_snp$Genotype <- factor(data_sum_snp$Genotype, levels = unique(data_sum_snp[order(value)]$Genotype))
+
+
+    ##### Check for beta #####
+    base_model <- glmmTMB(residual ~ 1, data = data_sum_snp, REML = TRUE)
+    snp_model <- glmmTMB(as.numeric(residual) ~ as.numeric(value), data = data_sum_snp, REML = TRUE)
+
+    anova_results <- anova(base_model, snp_model)
+
+    results_dt[ID_ref_alt == snp]$dataset_beta <- summary(snp_model)$coefficients$cond[2,1]
+    results_dt[ID_ref_alt == snp]$dataset_beta_se <- summary(snp_model)$coefficients$cond[2,2]
+    results_dt[ID_ref_alt == snp]$dataset_z <- summary(snp_model)$coefficients$cond[2,3]
+    results_dt[ID_ref_alt == snp]$dataset_p <- anova_results$`Pr(>Chisq)`[2]
+
+    if ((results_dt[ID_ref_alt == snp]$maf < 0.4 & results_dt[ID_ref_alt == snp]$dataset_maf < 0.4) | (results_dt[ID_ref_alt == snp]$maf > 0.6 & results_dt[ID_ref_alt == snp]$dataset_maf > 0.6)) {
+        if ((results_dt[ID_ref_alt == snp]$lmm_peer_beta * results_dt[ID_ref_alt == snp]$dataset_beta) > 0){
+            results_dt[ID_ref_alt == snp]$direction <- "match"
+        } else {
+            results_dt[ID_ref_alt == snp]$direction <- "opposite"
+        }
+    } else {
+        if ((results_dt[ID_ref_alt == snp]$lmm_peer_beta * results_dt[ID_ref_alt == snp]$dataset_beta) < 0){
+            results_dt[ID_ref_alt == snp]$direction <- "match"
+        } else {
+            results_dt[ID_ref_alt == snp]$direction <- "opposite"
+        }
+    } 
+
+
+    ###### Make a figure of the results
+    ### Different shapes for location
+    ### Different fill for village and uniculture
+    ### Color by line
+    if (!results_dt[ID_ref_alt == snp]$direction ==  "different_snp"){
+        shapes <- c(0,15,1,16,2,17)
+        names(shapes) <- paste0(c("Sydney", "Sydney", "Melbourne", "Melbourne", "Brisbane", "Brisbane"), " ", rep(c("Uni-culture","Village")))
+
+        data_sum_snp$shapes <- paste0(data_sum_snp$Site, " ", gsub(0, "Uni-culture",gsub(1, "Village", data_sum_snp$Village)))
+        data_sum_snp$value <- factor(data_sum_snp$value, levels = sort(unique(data_sum_snp$value)))
+
+        labels <- levels(data_sum_snp$Genotype)
+        names(labels) <- as.numeric(levels(data_sum_snp$value))
+
+        plot <- ggplot(data_sum_snp, aes(value, residual)) +
+            geom_point(aes(shape = shapes, color = variable)) +
+            theme_classic() +
+            scale_color_manual(values = cell_line_colors) +
+            scale_fill_manual(values = cell_line_colors) +
+            scale_shape_manual(values = shapes) +
+            geom_smooth(aes(as.numeric(value), residual), position = "identity",method = "lm", color = "black", se=FALSE) +
+            scale_x_discrete(labels=labels) +
+            ylab("Normalized Expression") +
+            ggtitle(paste0(snp, "-", results_dt[ID_ref_alt == snp]$Gene_ID, " eQTL"),
+                    subtitle = paste0("beta = ", round(results_dt[ID_ref_alt == snp]$dataset_beta, 3))) +
+            xlab(paste0(results_dt[ID_ref_alt == snp]$rsid, '\nGenotype')) +
+            theme(plot.title = element_text(hjust = 0.5),
+                    plot.subtitle = element_text(hjust = 0.5)) +
+            labs(color = "iPSC Line", shape = "Site & Village", fill = NULL)
+
+        ggsave(plot, filename = paste0(outdir,"plots/",ensg, "_", snp,"_kilpinen_eQTL_results.png"), width = 5, height = 3.5)
+        ggsave(plot, filename = paste0(outdir,"plots/",ensg, "_", snp,"_kilpinen_eQTL_results.pdf"), width = 5, height = 3.5)
+    }
+}
+
+fwrite(results_dt, paste0(outdir,"beds/",ensg, "_kilpinen_eQTL_results.bed"), sep = "\t")
+
+
+
+
diff --git a/multi-passage/Variance/variance_partition_multipassage.R b/multi-passage/Variance/variance_partition_multipassage.R
new file mode 100644
index 0000000..cffa815
--- /dev/null
+++ b/multi-passage/Variance/variance_partition_multipassage.R
@@ -0,0 +1,358 @@
+library(haven)
+library(ggplot2)
+library(glmmTMB)
+library(Seurat)
+library(tidyverse)
+library(specr)
+library(data.table)
+library(dsLib)
+library(pkgcond)
+library(texreg)
+
+
+inicio("Starting Analysis")
+
+
+##### Define functions #####
+icc_glmmtmb <- function(model, percent = TRUE) {
+    tmp <- VarCorr(model)
+    var <- do.call(rbind, lapply(names(tmp$cond), function(x) data.table("grp" = x, "vcov" = attr(tmp$cond[[x]], "stddev")^2)))
+    var <- rbind(var, data.table("grp" = "Residual", "vcov" = sigma(model)^2))
+    sum_var <- sum(var$vcov)
+    var <- var %>% dplyr::mutate(icc = vcov/sum_var)
+    if (isTRUE(percent)) {
+        var <- var %>% dplyr::mutate(percent = .data$icc * 100)
+    }
+    return(var)
+}
+
+
+
+##### Bring in variables #####
+### Bring in arguments
+args <- commandArgs(trailingOnly = TRUE)
+icc_interaction_outdir <- paste0(args[1])
+icc_outdir <- paste0(args[2])
+model_interaction_outdir <- paste0(args[3])
+model_outdir <- paste0(args[4])
+resid_outdir <- paste0(args[5])
+gene <- as.character(args[6])
+
+print(icc_outdir)
+print(icc_outdir)
+print(model_outdir)
+print(resid_outdir)
+print(gene)
+
+
+
+##### Read in seurat with genes #####
+seurat <- readRDS("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/preQC/seurat_joint_SCT_1pct_expressing.rds")
+
+
+
+### Make DF for modeling ###
+df_hier_unscale <- data.frame("Expression" = seurat[["SCT"]]@scale.data[gene,], "Line" = seurat@meta.data$Assignment, "Time" = as.factor(seurat@meta.data$Pool))
+colnames(df_hier_unscale)[1] <- "Expression"
+
+
+
+##### Leave one out method #####
+variables <- c("Line", "Time")
+
+model_all <- as.formula(paste0("Expression ~ (1|", paste0(variables, collapse = ") + (1|"), ")"))
+
+
+boolFalse<-F
+while(boolFalse==F & length(variables) > 0){
+  tryCatch({
+    print(variables)
+    model_glmmtmb <- suppress_warnings(glmmTMB(formula = noquote(model_all), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+    boolFalse<-T
+  },error=function(e){
+    if (length(variables) > 1){
+        variables <- variables[1:(length(variables) -1)]
+    } else {
+        variables <- c()
+    }
+  })
+}
+
+
+if (!length(variables) == 0){
+
+
+### Deal with singular fits by removing last variable until a fit can be found - ordered in variables buy importance
+while (!model_glmmtmb$sdr$pdHess & length(variables) > 0 ){
+    print("Singular fit: removing last variable and rerunning with one less covariate.")
+    if (length(variables) > 1){
+        variables <- variables[1:(length(variables) -1)]
+        print(variables)
+        model_all <- as.formula(paste0("Expression ~ (1|", paste0(variables, collapse = ") + (1|"), ")"))
+        model_glmmtmb <- suppress_warnings(glmmTMB(formula = noquote(model_all), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+    } else {
+        variables <- c()
+    }
+}
+
+print(variables)
+
+if (length(variables) > 0){
+
+    model_loo <- list()
+
+    icc <- data.table(grp = variables, P = as.numeric(NA))
+
+    for (variable in variables){
+        print(variable)
+        if (length(variables) > 1){
+            model <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")"))
+        } else {
+            model <- as.formula(paste0("Expression ~ 1"))
+        }
+        model_loo[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+        icc[grp == variable]$P <- anova(model_loo[[variable]], model_glmmtmb)$`Pr(>Chisq)`[2]
+    }
+
+
+    if (!(any(icc[grp != "Residual"]$P > 0.05/length(variables)) | any(is.na(icc[grp != "Residual"]$P)))){
+        model_loo_updated <- model_loo
+
+            updated_model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(variables, collapse = ") + (1|"), ")"))
+
+            model_loo_updated[["all"]] <- suppress_warnings(glmmTMB(formula = noquote(updated_model), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+
+            ### Calculate the variance explained by each of the included variables ###
+            icc <- icc_glmmtmb(model_loo_updated[["all"]])
+
+
+            ### Recalculate significance ###
+            icc$P <- as.numeric(NA)
+            icc$gene <- gene
+
+            for (variable in variables){
+                print(variable)
+                if (length(variables) > 1){
+                    model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")"))
+                } else {
+                    model <- as.formula(paste0("Expression ~ 1"))
+                }
+                model_loo_updated[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+                icc[grp == variable]$P <- anova(model_loo_updated[[variable]], model_loo_updated[["all"]])$`Pr(>Chisq)`[2]
+            }
+        }
+
+
+    while((any(icc[grp != "Residual"]$P > 0.05/length(variables)) | any(is.na(icc[grp != "Residual"]$P)))){
+
+        print("Removing non-significant vartiables and retesting signficance")
+
+        ##### Identify variables to keep #####
+        variables <- icc[P < 0.05/length(variables)]$grp
+
+        if (length(variables) > 0){
+            
+            ##### Calculate full model #####
+            updated_model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(variables, collapse = ") + (1|"), ")"))
+
+
+            model_loo_updated <- list()
+            model_loo_updated[["all"]] <- suppress_warnings(glmmTMB(formula = noquote(updated_model), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+
+
+
+            ### Calculate the variance explained by each of the included variables ###
+            icc <- icc_glmmtmb(model_loo_updated[["all"]])
+
+
+
+            ### Recalfulate significance ###
+            icc$P <- as.numeric(NA)
+            icc$gene <- gene
+
+            for (variable in variables){
+                print(variable)
+                if (length(variables) > 1){
+                    model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")"))
+                } else {
+                    model <- as.formula(paste0("Expression ~ 1"))
+                }
+                model_loo_updated[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+                icc[grp == variable]$P <- anova(model_loo_updated[[variable]], model_loo_updated[["all"]])$`Pr(>Chisq)`[2]
+            }
+
+
+    
+        } else {
+            icc <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character())
+            model_loo_updated <- list()
+        }
+    }
+
+        interaction_variables <- c()
+
+        if (length(variables) > 1){
+            ### Add in interactions of the significant variables
+            if ("Line" %in% variables & "Time" %in% variables){
+                interaction_variables <- c(interaction_variables, "Line:Time")
+            }
+
+            model_all_interaction <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, interaction_variables), collapse = ") + (1|"), ")"))
+
+
+            boolFalse<-F
+            while(boolFalse==F & length(interaction_variables) > 0){
+            tryCatch({
+                print(c(variables, interaction_variables))
+                model_glmmtmb_interaction <- suppress_warnings(glmmTMB(formula = noquote(model_all_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+                boolFalse<-T
+            },error=function(e){
+                if (length(interaction_variables) > 1){
+                    interaction_variables <- interaction_variables[1:(length(interaction_variables) -1)]
+                } else {
+                    interaction_variables <- c()
+            }
+            })
+            }
+
+            ### Deal with singular fits by removing last variable until a fit can be found - ordered in variables buy importance
+            while (!model_glmmtmb_interaction$sdr$pdHess & length(interaction_variables) > 0 ){
+                print("Singular fit: removing last variable and rerunning with one less covariate.")
+                if (length(interaction_variables) > 1){
+                    interaction_variables <- interaction_variables[1:(length(interaction_variables) -1)]
+                    print(c(interaction_variables, variables))
+                    model_all_interaction <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, interaction_variables), collapse = ") + (1|"), ")"))
+                    model_glmmtmb_interaction <- suppress_warnings(glmmTMB(formula = noquote(model_all_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+                } else {
+                    interaction_variables <- c()
+                }
+            }
+
+            if (length(interaction_variables) > 0){
+
+                model_loo_interaction <- list()
+
+                icc_interaction <- data.table(grp = interaction_variables, P = as.numeric(NA))
+
+                for (variable in c(interaction_variables)){
+                    print(variable)
+                    if (length(interaction_variables) > 1){
+                        model_interaction <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, interaction_variables)[!c(variables, interaction_variables) %in% variable], collapse = ") + (1|"), ")"))
+                    } else {
+                        model_interaction <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")"))
+                    }
+                    model_loo_interaction[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+                    icc_interaction[grp == variable]$P <- anova(model_loo_interaction[[variable]], model_glmmtmb_interaction)$`Pr(>Chisq)`[2]
+                }
+
+
+                if (!(any(icc_interaction[grp != "Residual"]$P > 0.05/length(c(variables, interaction_variables))) | any(is.na(icc_interaction[grp != "Residual"]$P)))){
+                    model_loo_interaction_updated <- model_loo_interaction
+
+                    updated_model_interaction <- as.formula(paste0("Expression ~ 1 + (1|", paste0(c(variables, interaction_variables), collapse = ") + (1|"), ")"))
+
+                    model_loo_interaction_updated[["all"]] <- suppress_warnings(glmmTMB(formula = noquote(updated_model_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+
+                    ### Calculate the variance explained by each of the included variables ###
+                    icc_interaction <- icc_glmmtmb(model_loo_interaction_updated[["all"]])
+
+
+                    ### Recalculate significance ###
+                    icc_interaction$P <- as.numeric(NA)
+                    icc_interaction$gene <- gene
+
+                    for (variable in c(variables, interaction_variables)){
+                        print(variable)
+                        if (length(c(interaction_variables)) > 1){
+                            model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(c(variables, interaction_variables)[!c(variables, interaction_variables) %in% variable], collapse = ") + (1|"), ")"))
+                        } else {
+                            model <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")"))
+                        }
+                        model_loo_interaction_updated[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+                        icc_interaction[grp == variable]$P <- anova(model_loo_interaction_updated[[variable]], model_loo_interaction_updated[["all"]])$`Pr(>Chisq)`[2]
+                    }
+                }
+
+
+                while((any(icc_interaction[!(grp %in% c("Residual", variables))]$P > 0.05/length(c(variables, interaction_variables))) | any(is.na(icc_interaction[!(grp %in%c("Residual", variables))]$P)))){
+
+                    print("Removing non-significant vartiables and retesting signficance")
+
+                    ##### Identify variables to keep #####
+                    interaction_variables <- icc_interaction[!(grp %in% c("Residual", variables)) & P < 0.05/length(c(variables, interaction_variables))]$grp
+
+                    if (length(interaction_variables) > 0){
+                        
+                        ##### Calculate full model #####
+                        updated_model_interaction <- as.formula(paste0("Expression ~ 1 + (1|", paste0(c(variables, interaction_variables), collapse = ") + (1|"), ")"))
+
+
+                        model_loo_interaction_updated <- list()
+                        model_loo_interaction_updated[["all"]] <- suppress_warnings(glmmTMB(formula = noquote(updated_model_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+
+
+
+                        ### Calculate the variance explained by each of the included variables ###
+                        icc_interaction <- icc_glmmtmb(model_loo_interaction_updated[["all"]])
+
+
+
+                        ### Recalfulate significance ###
+                        icc_interaction$P <- as.numeric(NA)
+                        icc_interaction$gene <- gene
+
+                        for (variable in c(variables, interaction_variables)){
+                            print(variable)
+                            model_interaction <- as.formula(paste0("Expression ~ 1 + (1|", paste0(c(variables, interaction_variables)[!(c(variables, interaction_variables) %in% variable)], collapse = ") + (1|"), ")"))
+                            model_loo_interaction_updated[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+                            icc_interaction[grp == variable]$P <- anova(model_loo_interaction_updated[[variable]], model_loo_interaction_updated[["all"]])$`Pr(>Chisq)`[2]
+                        }
+                    } else {
+                        icc_interaction <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character())
+                        model_loo_interaction_updated <- list()
+                    }
+
+                    if (nrow(icc_interaction) > nrow(icc)){
+                        saveRDS(icc_interaction, paste0(icc_interaction_outdir, gene, "_icc.rds"), compress = TRUE)
+                        saveRDS(model_loo_interaction_updated, paste0(model_interaction_outdir, gene, "_fitted_models.rds"), compress = TRUE)
+                    }
+                }
+            } else {
+                icc_interaction <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character())
+                model_loo_interaction_updated <- list()
+            }
+
+
+        ### If line is significant, then get residuals for downstream qtl checks ###
+        if ("Line" %in% variables){
+            print("Making residuals for qtl detection")
+            if (length(variables) > 1){
+                    if (length(interaction_variables) > 0){
+                        model_no_line <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, interaction_variables)[-grep("Line", c(variables, interaction_variables))], collapse = ") + (1|"), ")"))
+                    } else {
+                        model_no_line <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% "Line"], collapse = ") + (1|"), ")"))
+                    }
+            } else {
+                model_no_line <- as.formula(paste0("Expression ~ 1"))
+            }
+            fit_no_line <- glmmTMB(formula = noquote(model_no_line), data = df_hier_unscale, REML = TRUE)
+            residuals <- resid(fit_no_line)
+            saveRDS(residuals, paste0(resid_outdir, gene, "_residuals4qtl.rds"), compress = TRUE)
+        }
+    
+        } else {
+            icc <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character())
+            model_loo_updated <- list()
+        }
+    } else {
+        icc <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character())
+        model_loo_updated <- list()
+    }
+}
+
+
+saveRDS(icc, paste0(icc_outdir, gene, "_icc.rds"), compress = TRUE)
+saveRDS(model_loo_updated, paste0(model_outdir, gene, "_fitted_models.rds"), compress = TRUE)
+
+
+fin()
\ No newline at end of file
diff --git a/multi-passage/Variance/variance_partition_multipassage.smk b/multi-passage/Variance/variance_partition_multipassage.smk
new file mode 100644
index 0000000..9bb352f
--- /dev/null
+++ b/multi-passage/Variance/variance_partition_multipassage.smk
@@ -0,0 +1,99 @@
+import pandas as pd
+
+
+genes_file = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/preQC/genes_1pct_expressing.tsv"
+genes = pd.read_csv(genes_file, sep = "\t")
+# genes = genes.iloc[1]
+
+
+rule all:
+    input:
+        # expand("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance/gene_separated/fit_models/{gene}_fitted_models.rds", gene = genes.Gene),
+        # expand("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_ncov/gene_separated/fit_models/{gene}_fitted_models.rds", gene = genes.Gene),
+        expand("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_integratedSCT/gene_separated/fit_models/{gene}_fitted_models.rds", gene = genes.Gene),
+
+
+rule partition_variance:
+    input:
+       "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/preQC/seurat_joint_SCT_1pct_expressing.rds"
+    output:
+        "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance/gene_separated/icc/{gene}_icc.rds",
+        "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance/gene_separated/fit_models/{gene}_fitted_models.rds"
+    resources:
+        mem_per_thread_gb = lambda wildcards, attempt: attempt * 16,
+        disk_per_thread_gb = lambda wildcards, attempt: attempt * 16
+    threads: 4
+    params:
+        script = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/multi-passage/Variance/variance_partition_multipassage.R",
+        out_icc="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance/gene_separated/icc/",
+        out_model="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance/gene_separated/fit_models/",
+        out_resids="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance/gene_separated/residuals4qtl/",
+        out_icc_interaction = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance/gene_separated/icc_interaction/",
+        out_model_interaction = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance/gene_separated/icc_interaction/"
+    log:
+    shell:
+        """
+        mkdir -p {params.out_icc_interaction}
+        mkdir -p {params.out_model_interaction}
+        mkdir -p {params.out_resids}
+        
+        /directflow/SCCGGroupShare/projects/DrewNeavin/software/anaconda3/envs/baseR402/bin/Rscript {params.script} {params.out_icc_interaction} {params.out_icc} {params.out_model_interaction} {params.out_model} {params.out_resids} {wildcards.gene}
+        """
+
+
+rule partition_variance_ncov:
+    input:
+        seurat = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/preQC/seurat_joint_SCT_1pct_expressing.rds"
+    output:
+        "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_ncov/gene_separated/icc/{gene}_icc.rds",
+        "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_ncov/gene_separated/icc_interaction/{gene}_icc.rds",
+        "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_ncov/gene_separated/fit_models/{gene}_fitted_models.rds"
+    resources:
+        mem_per_thread_gb = lambda wildcards, attempt: attempt * 16,
+        disk_per_thread_gb = lambda wildcards, attempt: attempt * 16
+    threads: 4
+    params:
+        script = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/multi-passage/Variance/variance_partition_multipassage_integratedSCT_Ncov.R",
+        out_icc="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_ncov/gene_separated/icc/",
+        out_model="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_ncov/gene_separated/fit_models/",
+        out_resids="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_ncov/gene_separated/residuals4qtl/",
+        out_icc_interaction = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_ncov/gene_separated/icc_interaction/",
+        out_model_interaction = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_ncov/gene_separated/icc_interaction/"
+    log:
+    shell:
+        """
+        mkdir -p {params.out_icc_interaction}
+        mkdir -p {params.out_model_interaction}
+        mkdir -p {params.out_resids}
+        
+        /directflow/SCCGGroupShare/projects/DrewNeavin/software/anaconda3/envs/baseR402/bin/Rscript {params.script} {params.out_icc_interaction} {params.out_icc} {params.out_model_interaction} {params.out_model} {params.out_resids} {wildcards.gene}
+        """
+        
+
+### Tried manually with just CHCHD2 and the Ncov impacts the ability to get the line effect so cannot run this way (could downsample some of the pools to be more even sizes)
+rule partition_variance_integratedSCT:
+    input:
+        seurat =  ancient("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/preQC/time-integrated_filtered_seurat_1pct_expressing.rds")
+    output:
+        "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_integratedSCT/gene_separated/icc/{gene}_icc.rds",
+        "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_integratedSCT/gene_separated/fit_models/{gene}_fitted_models.rds"
+    resources:
+        mem_per_thread_gb = lambda wildcards, attempt: attempt * 8,
+        disk_per_thread_gb = lambda wildcards, attempt: attempt * 8
+    threads: 4
+    params:
+        script = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/multi-passage/Variance/variance_partition_multipassage_integratedSCT.R",
+        out_icc="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_integratedSCT/gene_separated/icc/",
+        out_model="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_integratedSCT/gene_separated/fit_models/",
+        out_resids="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_integratedSCT/gene_separated/residuals4qtl/",
+        out_icc_interaction = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_integratedSCT/gene_separated/icc_interaction/",
+        out_model_interaction = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_integratedSCT/gene_separated/icc_interaction/"
+    log:
+    shell:
+        """
+        mkdir -p {params.out_icc_interaction}
+        mkdir -p {params.out_model_interaction}
+        mkdir -p {params.out_resids}
+        
+        /directflow/SCCGGroupShare/projects/DrewNeavin/software/anaconda3/envs/baseR402/bin/Rscript {params.script} {params.out_icc_interaction} {params.out_icc} {params.out_model_interaction} {params.out_model} {params.out_resids} {wildcards.gene}
+        """
\ No newline at end of file
diff --git a/multi-passage/Variance/variance_partition_multipassage_combine.R b/multi-passage/Variance/variance_partition_multipassage_combine.R
new file mode 100644
index 0000000..37150ad
--- /dev/null
+++ b/multi-passage/Variance/variance_partition_multipassage_combine.R
@@ -0,0 +1,874 @@
+##### Reason: combine the results for the variance explained by different factors for each gene
+##### Author: Drew Neavin
+##### Date: 14 March, 2022
+
+
+##### Load in libraries #####
+library(data.table)
+library(tidyverse)
+library(ggridges)
+library(raincloudplots)
+library(ggdist)
+library(clusterProfiler)
+library(org.Hs.eg.db)
+library(GOSemSim)
+library(RColorBrewer)
+
+
+
+##### Set up directories #####
+dir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/"
+icc_dir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_integratedSCT/gene_separated/icc/"
+icc_dir2 <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance/gene_separated/icc/"
+icc_interaction_dir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_integratedSCT/gene_separated/icc_interaction/"
+outdir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_integratedSCT/combined/"
+outdir_comparison <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/comparison/"
+
+dir.create(outdir, recursive = TRUE)
+dir.create(outdir_comparison, recursive = TRUE)
+
+
+vars <- c("Line", "Passage", "Line:Passage", "Residual")
+selected_vars <- c("Line", "Passage", "Line:Passage", "Residual")
+var_colors <- c("#4734a9", "#78c0fe", "#97cf8a", "gray80")
+
+names(var_colors) <- vars
+
+var_colors <- var_colors[selected_vars]
+
+
+
+##### Get list of icc files #####
+icc_files <- list.files(icc_dir)
+# icc_files2 <- list.files(icc_dir2)
+
+
+
+##### Read in icc results #####
+icc_results_list <- lapply(icc_files, function(x){
+    readRDS(paste0(icc_dir,x))
+})
+names(icc_results_list) <- icc_files
+
+# icc_results_list2 <- lapply(icc_files2, function(x){
+#     readRDS(paste0(icc_dir2,x))
+# })
+# names(icc_results_list2) <- icc_files2
+
+
+
+##### Get list of icc interaction files #####
+icc_interaction_files <- list.files(icc_interaction_dir, pattern = "_icc.rds")
+
+
+
+##### Read in icc results #####
+icc_interaction_results_list <- lapply(icc_interaction_files, function(x){
+    readRDS(paste0(icc_interaction_dir,x))
+})
+names(icc_interaction_results_list) <- icc_interaction_files
+
+
+
+##### Merge icc results into a single data.table #####
+icc_dt <- do.call(rbind, icc_results_list)
+# icc_dt2 <- do.call(rbind, icc_results_list2)
+# colnames(icc_dt2) <- paste0("integrate_sct_", colnames(icc_dt2))
+
+
+# icc_dt_joined <- icc_dt[icc_dt2, on = c("grp" = "integrate_sct_grp", "gene" = "integrate_sct_gene")]
+# icc_dt_joined$diff <- icc_dt_joined$percent - icc_dt_joined$integrate_sct_percent
+# max(na.omit(icc_dt_joined$diff))
+
+# icc_dt_joined[diff == max(na.omit(icc_dt_joined$diff))]
+
+# icc_dt_joined[gene == "ENSG00000106153"]
+
+
+# diff_dist <- ggplot(icc_dt_joined, aes(diff)) +
+#     geom_histogram() +
+#     facet_wrap(vars(grp))
+#     # theme_classic()
+
+# ggsave(diff_dist, filename = paste0(outdir_comparison, "diff_histogram.png"))
+
+
+# scatter <- ggplot(icc_dt_joined, aes(percent,integrate_sct_percent)) +
+#     geom_point() +
+#     facet_wrap(vars(grp), scales = "free") +
+#     theme_classic()
+
+# ggsave(scatter, filename = paste0(outdir_comparison, "scatter_correlation.png"), height = 4.5)
+
+
+# icc_dt_joined_long <- melt(icc_dt_joined, id.vars = c("grp", "gene"),measure.vars = c("percent", "integrate_sct_percent"))
+
+
+# dist <- ggplot(icc_dt_joined_long, aes(value)) +
+#     geom_histogram() +
+#     facet_grid(grp ~ variable) +
+#     theme_classic()
+
+# ggsave(dist, filename = paste0(outdir_comparison, "histogram.png"))
+
+
+# ### want to use the integrate_sct_percent
+# ### compared the two methods and appears that fitting when have been separately sct assessed =>  Line effect well correlated but time is much less (and not well correlated between the two methods) so likely inducing Time effects from normalizing all together
+
+
+
+icc_dt$percent_round <- round(icc_dt$percent)
+
+icc_dt$grp <- gsub("Time", "Passage", icc_dt$grp)
+icc_dt$grp <- factor(icc_dt$grp, levels= rev(c("Line", "Passage", "Line:Passage", "Residual")))
+
+group_size  <- data.table(table(icc_dt$grp))
+colnames(group_size) <- c("grp", "size")
+group_size$grp_size <- paste0(group_size$grp, "\nN = ", group_size$size)
+
+icc_dt <- group_size[icc_dt, on = "grp"]
+icc_dt$grp_size <- factor(icc_dt$grp_size, levels = unique(group_size$grp_size))
+
+
+##### Merge icc_interaction results into a single data.table #####
+icc_interaction_dt <- do.call(rbind, icc_interaction_results_list)
+
+icc_interaction_dt$percent_round <- round(icc_interaction_dt$percent)
+
+icc_interaction_dt$grp <- gsub("Time", "Passage", icc_interaction_dt$grp)
+icc_interaction_dt$grp <- factor(icc_interaction_dt$grp, levels= rev(c("Line", "Passage", "Line:Passage", "Residual")))
+
+group_size  <- data.table(table(icc_interaction_dt$grp))
+colnames(group_size) <- c("grp", "size")
+group_size$grp_size <- paste0(group_size$grp, "\nN = ", group_size$size)
+
+icc_interaction_dt <- group_size[icc_interaction_dt, on = "grp"]
+icc_interaction_dt$grp_size <- factor(icc_interaction_dt$grp_size, levels = unique(group_size$grp_size))
+
+### *** Need to add individual effects without interaction in to interaction dt *** ###
+icc_interaction_plus_dt <- rbind(icc_interaction_dt, icc_dt[!(gene %in% icc_interaction_dt$gene)])
+
+
+group_size  <- data.table(table(icc_interaction_plus_dt$grp))
+colnames(group_size) <- c("grp", "size")
+group_size$grp_size <- paste0(group_size$grp, "\nN = ", group_size$size)
+
+icc_interaction_plus_dt <- group_size[icc_interaction_plus_dt, on = "grp"]
+icc_interaction_plus_dt$grp_size <- factor(icc_interaction_plus_dt$grp_size, levels = unique(group_size$grp_size))
+
+
+
+##### Check difference in percent explained with and without interactions #####
+icc_interaction_dt_joined <- icc_dt[icc_interaction_dt, on = c("grp", "gene")]
+
+icc_interaction_dt_joined$difference <- icc_interaction_dt_joined$percent - icc_interaction_dt_joined$i.percent
+
+
+pRaincloud_dif <- ggplot(icc_interaction_dt_joined, aes(x = difference, y = factor(grp_size, levels = rev(levels(grp_size))), fill = factor(grp, levels = rev(selected_vars)))) + 
+                geom_density_ridges(stat = "binline", bins = 90, scale = 0.7, draw_baseline = FALSE, aes(height =..ndensity..), alpha = 0.75) +
+                geom_boxplot(size = 0.5,width = .15, outlier.size = 0.25, position = position_nudge(y=-0.12), alpha = 0.75) +
+                coord_cartesian(xlim = c(1.2, NA), clip = "off") +
+                theme_classic() +
+                theme(axis.title.y=element_blank()) +
+                xlab("Percent Variance Explained") +
+                scale_y_discrete(expand = c(0.03, 0)) +
+                scale_fill_manual(values = var_colors)
+
+ggsave(pRaincloud_dif, filename = paste0(outdir, "variance_explained_interaction_difference_raincloud.png"), height = 8, width = 7)
+ggsave(pRaincloud_dif, filename = paste0(outdir, "variance_explained_interaction_difference_raincloud.pdf"), height = 8, width = 7)
+
+
+
+##### Make a figure of stacked variance explained #####
+### Order based on line variance explained ###
+genes_list <- list()
+
+for (group in c("Line", "Passage", "Line:Passage", "Residual")){
+    genes_list[[group]] <- icc_dt[grp == group][rev(order(percent_round))]$gene
+}
+
+genes <- unique(unlist(genes_list))
+
+icc_dt$gene <- factor(icc_dt$gene, levels = genes)
+
+icc_dt$grp <- factor(icc_dt$grp, levels= rev(c("Line", "Passage", "Line:Passage", "Residual")))
+
+
+## First on line percent, then village percent ##
+bar_proportions <- ggplot(icc_dt, aes(x = gene, y = percent, fill = grp)) +
+    geom_bar(position="stack", stat="identity") +
+    theme_classic() +
+    theme(axis.title.x=element_blank(),
+        axis.text.x=element_blank(),
+        axis.ticks.x=element_blank()) +
+    scale_fill_manual(values = var_colors)
+
+
+ggsave(bar_proportions, filename = paste0(outdir, "variance_explained_bar.png"), width = 20)
+
+
+
+### Try boxplot ###
+boxplot <- ggplot(icc_dt, aes(x = factor(grp, levels = rev(levels(grp))), y = percent, fill = factor(grp, levels = rev(levels(grp))), color = factor(grp, levels = rev(levels(grp))))) +
+    geom_boxplot(alpha = 0.5, size = 0.5) +
+    theme_classic() +
+    xlab("Covariate") +
+    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
+        legend.position="none") +
+    scale_fill_manual(values = var_colors) +
+    scale_color_manual(values = var_colors)
+    
+
+ggsave(boxplot, filename = paste0(outdir, "variance_explained_box.png"), height = 3, width = 2)
+
+
+### Try ridgplots ###
+pRidges <- ggplot(icc_dt[grp != "Residual"], aes(x = percent, y = factor(grp, levels = rev(levels(grp))), fill = factor(grp, levels = rev(levels(grp))))) +
+    geom_density_ridges(stat = "binline", bins = 200, scale = 0.95, draw_baseline = FALSE) +
+    # geom_density_ridges() +
+    theme_classic()
+
+ggsave(pRidges, filename = paste0(outdir, "variance_explained_ridge.png"), height = 8, width = 10)
+
+### Try ridgplots ###
+pRidges_pop <- ggplot(icc_dt[grp != "Residual"][percent > 10], aes(x = percent, y = factor(grp, levels = rev(levels(grp))), fill = factor(grp, levels = rev(levels(grp))))) +
+    geom_density_ridges(stat = "binline", bins = 180, scale = 0.95, draw_baseline = FALSE) +
+    # geom_density_ridges() +
+    theme_classic()
+
+ggsave(pRidges_pop, filename = paste0(outdir, "variance_explained_ridge_pop.png"), height = 8, width = 10)
+
+
+# pRaincloud <- ggplot(icc_dt, aes(x = percent, y = factor(grp_size, levels = rev(levels(grp_size))), fill = factor(grp, levels = rev(vars)))) + 
+#                 geom_density_ridges(stat = "binline", bins = 90, scale = 0.7, draw_baseline = FALSE, aes(height =..ndensity..), alpha = 0.75) +
+#                 geom_boxplot(size = 0.5,width = .15, outlier.size = 0.25, position = position_nudge(y=-0.12), alpha = 0.75) +
+#                 coord_cartesian(xlim = c(1.2, NA), clip = "off") +
+#                 theme_classic() +
+#                 theme(axis.title.y=element_blank()) +
+#                 xlab("Percent Variance Explained") +
+#                 scale_y_discrete(expand = c(0.03, 0)) +
+#                 scale_fill_manual(values = var_colors)
+
+pRaincloud <- ggplot(icc_dt, aes(x = percent, y = factor(grp_size, levels = rev(levels(grp_size))), fill = factor(grp, levels = rev(vars)))) + 
+                geom_density_ridges(size = 0.1,stat = "binline", bins = 100, scale = 0.7, draw_baseline = FALSE, aes(height =..ndensity..)) +
+                geom_point(size =1, position = position_nudge(y=-0.09), shape = "|", aes(color = factor(grp, levels = rev(vars)))) +
+                coord_cartesian(xlim = c(1.2, NA), clip = "off") +
+                theme_classic() +
+                theme(axis.title.y=element_blank()) +
+                xlab("Percent Variance Explained") +
+                scale_y_discrete(expand = c(0.1, 0)) +
+                scale_fill_manual(values = var_colors, name = "Variable") +
+                scale_color_manual(values = var_colors, name = "Variable") +
+                geom_vline(xintercept = 1, lty="11", color = "grey50", size = 0.5)
+
+ggsave(pRaincloud, filename = paste0(outdir, "variance_explained_raincloud.png"), height = 2, width = 5)
+ggsave(pRaincloud, filename = paste0(outdir, "variance_explained_raincloud.pdf"), height = 2, width = 5)
+
+
+
+icc_interaction_plus_dt$grp_size <- factor(icc_interaction_plus_dt$grp_size, levels = c("Line\nN = 750", "Passage\nN = 750", "Line:Passage\nN = 619", "Residual\nN = 750"))
+
+
+# pRaincloud_interaction <- ggplot(icc_interaction_plus_dt, aes(x = percent, y = grp_size,  fill = factor(grp, levels = rev(vars)))) + 
+#                 geom_density_ridges(size = 0.1, stat = "binline", bins = 100, scale = 0.7, draw_baseline = FALSE, aes(height =..ndensity.., fill = factor(grp, levels = rev(vars))), alpha = 0.75) +
+#                 geom_boxplot(outlier.shape=20, size = 0.1,width = .15, outlier.size = 0.01, position = position_nudge(y=-0.12), alpha = 0.75) +
+#                 coord_cartesian(xlim = c(1.2, NA), clip = "off") +
+#                 theme_classic() +
+#                 theme(axis.title.y=element_blank()) +
+#                 xlab("Percent Variance Explained") +
+#                 scale_y_discrete(expand = c(0.03, 0)) +
+#                 scale_fill_manual(values = var_colors) +
+#                 scale_color_manual(values = var_colors) +
+#                 geom_vline(xintercept = 1,color = "grey70", size = 0.4, lty="11")
+
+pRaincloud_interaction <- ggplot(icc_interaction_plus_dt, aes(x = percent, y = factor(grp_size, levels = levels(grp_size)), fill = factor(grp, levels = rev(vars)))) + 
+                geom_density_ridges(size = 0.1,stat = "binline", bins = 100, scale = 0.7, draw_baseline = FALSE, aes(height =..ndensity..)) +
+                geom_point(size =1, position = position_nudge(y=-0.09), shape = "|", aes(color = factor(grp, levels = rev(vars)))) +
+                coord_cartesian(xlim = c(1.2, NA), clip = "off") +
+                theme_classic() +
+                theme(axis.title.y=element_blank()) +
+                xlab("Percent Variance Explained") +
+                scale_y_discrete(expand = c(0.1, 0)) +
+                scale_fill_manual(values = var_colors, name = "Variable") +
+                scale_color_manual(values = var_colors, name = "Variable") +
+                geom_vline(xintercept = 1, lty="11", color = "grey50", size = 0.5)
+
+ggsave(pRaincloud_interaction, filename = paste0(outdir, "variance_explained_raincloud_interaction.png"), height = 2, width = 4)
+ggsave(pRaincloud_interaction, filename = paste0(outdir, "variance_explained_raincloud_interaction.pdf"), height = 2, width = 4)
+
+
+
+icc_interaction_sig_list <- list()
+
+for (ensg in unique(icc_interaction_plus_dt$gene)){
+    icc_interaction_sig_list[[ensg]] <- icc_interaction_plus_dt[gene == ensg][P < 0.05/(nrow(icc_interaction_plus_dt[gene == ensg])-1)]
+    icc_interaction_sig_list[[ensg]] <- rbind(icc_interaction_sig_list[[ensg]], icc_interaction_plus_dt[gene == ensg & grp == "Residual"])
+}
+
+icc_interaction_sig_dt <- do.call(rbind, icc_interaction_sig_list) 
+
+
+
+group_size  <- data.table(table(icc_interaction_sig_dt$grp))
+colnames(group_size) <- c("grp", "size")
+group_size$grp_size <- paste0(group_size$grp, "\nN = ", group_size$size)
+
+icc_interaction_sig_dt <- group_size[icc_interaction_sig_dt, on = "grp"]
+icc_interaction_sig_dt$grp_size <- factor(icc_interaction_sig_dt$grp_size, levels = unique(group_size$grp_size))
+
+grp_size_order <- c("Line\nN = 750", "Passage\nN = 750", "Line:Passage\nN = 619", "Residual\nN = 750")
+
+
+
+# pRaincloud_interaction_sig <- ggplot(icc_interaction_sig_dt, aes(x = percent, y = factor(grp_size, levels = grp_size_order), fill = factor(grp, levels = rev(vars)))) + 
+#                 geom_density_ridges(stat = "binline", bins = 100, scale = 0.7, draw_baseline = FALSE, aes(height =..ndensity..), alpha = 0.75) +
+#                 geom_boxplot(size = 0.5,width = .15, outlier.size = 0.15, position = position_nudge(y=-0.12), alpha = 0.75) +
+#                 coord_cartesian(xlim = c(1.2, NA), clip = "off") +
+#                 theme_classic() +
+#                 theme(axis.title.y=element_blank()) +
+#                 xlab("Percent Variance Explained") +
+#                 scale_y_discrete(expand = c(0.03, 0)) +
+#                 scale_fill_manual(values = var_colors) +
+#                 geom_vline(xintercept = 1, lty="11", color = "grey50", size = 0.5)
+
+pRaincloud_interaction_sig <- ggplot(icc_interaction_sig_dt, aes(x = percent, y = factor(grp_size, levels = grp_size_order), fill = factor(grp, levels = rev(vars)))) + 
+                geom_density_ridges(size = 0.1,stat = "binline", bins = 100, scale = 0.7, draw_baseline = FALSE, aes(height =..ndensity..)) +
+                geom_point(size =1, position = position_nudge(y=-0.09), shape = "|", aes(color = factor(grp, levels = rev(vars)))) +
+                coord_cartesian(xlim = c(1.2, NA), clip = "off") +
+                theme_classic() +
+                theme(axis.title.y=element_blank()) +
+                xlab("Percent Variance Explained") +
+                scale_y_discrete(expand = c(0.07, 0)) +
+                scale_fill_manual(values = var_colors, name = "Variable") +
+                scale_color_manual(values = var_colors, name = "Variable") +
+                geom_vline(xintercept = 1, lty="11", color = "grey50", size = 0.5)
+
+ggsave(pRaincloud_interaction_sig, filename = paste0(outdir, "variance_explained_raincloud_interaction_significant.png"), height = 2, width = 4)
+ggsave(pRaincloud_interaction_sig, filename = paste0(outdir, "variance_explained_raincloud_interaction_significant.pdf"), height = 2, width = 4)
+
+
+total <- icc_interaction_sig_dt[,.(count = .N), by = .(grp)]
+total_less1pct <- icc_interaction_sig_dt[percent <= 1][,.(count_less_1pct = .N), by = .(grp)]
+total_less5pct <- icc_interaction_sig_dt[percent <= 5][,.(count_less_5pct = .N), by = .(grp)]
+total_less10pct <- icc_interaction_sig_dt[percent <= 10][,.(count_less_10pct = .N), by = .(grp)]
+summary <- total[total_less1pct, on = "grp"]
+summary <- summary[total_less5pct, on = "grp"]
+summary <- summary[total_less10pct, on = "grp"]
+summary$count_greater_1pct <- summary$count - summary$count_less_1pct
+summary$count_greater_5pct <- summary$count - summary$count_less_5pct
+summary$count_greater_10pct <- summary$count - summary$count_less_10pct
+summary$percent_1pct <- (summary$count_less_1pct/summary$count)*100
+summary$percent_5pct <- (summary$count_less_5pct/summary$count)*100
+summary$percent_10pct <- (summary$count_less_10pct/summary$count)*100
+
+
+# pRaincloud_interaction_sig_1pct <- ggplot(icc_interaction_sig_dt[percent >= 1], aes(x = percent, y = factor(grp_size, levels = grp_size_order), fill = factor(grp, levels = rev(vars)))) + 
+#                 geom_density_ridges(stat = "binline", bins = 90, scale = 0.7, draw_baseline = FALSE, aes(height =..ndensity..), alpha = 0.75) +
+#                 geom_boxplot(size = 0.5,width = .15, outlier.size = 0.25, position = position_nudge(y=-0.12), alpha = 0.75) +
+#                 coord_cartesian(xlim = c(1.2, NA), clip = "off") +
+#                 theme_classic() +
+#                 theme(axis.title.y=element_blank()) +
+#                 xlab("Percent Variance Explained") +
+#                 scale_y_discrete(expand = c(0.03, 0)) +
+#                 scale_fill_manual(values = var_colors) +
+#                 geom_vline(xintercept = 1, linetype = "dashed", color = "firebrick3")
+
+pRaincloud_interaction_sig_1pct <- ggplot(icc_interaction_sig_dt[percent >= 1], aes(x = percent, y = factor(grp_size, levels = grp_size_order), fill = factor(grp, levels = rev(vars)))) + 
+                geom_density_ridges(size = 0.1,stat = "binline", bins = 100, scale = 0.7, draw_baseline = FALSE, aes(height =..ndensity..)) +
+                geom_point(size =1, position = position_nudge(y=-0.09), shape = "|", aes(color = factor(grp, levels = rev(vars)))) +
+                coord_cartesian(xlim = c(1.2, NA), clip = "off") +
+                theme_classic() +
+                theme(axis.title.y=element_blank()) +
+                xlab("Percent Variance Explained") +
+                scale_y_discrete(expand = c(0.07, 0)) +
+                scale_fill_manual(values = var_colors, name = "Variable") +
+                scale_color_manual(values = var_colors, name = "Variable") +
+                geom_vline(xintercept = 1, lty="11", color = "grey50", size = 0.5)
+
+ggsave(pRaincloud_interaction_sig_1pct, filename = paste0(outdir, "variance_explained_raincloud_interaction_significant_1pct.png"), height = 2, width = 4)
+ggsave(pRaincloud_interaction_sig_1pct, filename = paste0(outdir, "variance_explained_raincloud_interaction_significant_1pct.pdf"), height = 2, width = 4)
+
+
+
+
+##### Pull just the significant variances genome-wide #####
+icc_interaction_plus_dt$fdr <- p.adjust(icc_interaction_plus_dt$P, method="fdr")
+icc_interaction_sig_gw_dt <- icc_interaction_plus_dt[fdr < 0.05 | is.na(fdr)]
+
+group_size_sig_gw  <- data.table(table(icc_interaction_sig_gw_dt$grp))
+colnames(group_size_sig_gw) <- c("grp", "size")
+group_size_sig_gw$grp_size <- paste0(group_size_sig_gw$grp, "\nN = ", formatC(group_size_sig_gw$size, format="d", big.mark=","))
+
+icc_interaction_sig_gw_dt <- group_size_sig_gw[icc_interaction_sig_gw_dt, on = "grp"]
+icc_interaction_sig_gw_dt$grp_size <- factor(icc_interaction_sig_gw_dt$grp_size, levels = unique(group_size_sig_gw$grp_size))
+
+
+group_size_sig_gw_order <- c("Line\nN = 750", "Passage\nN = 750", "Line:Passage\nN = 619", "Residual\nN = 750")
+
+
+pRaincloud_interaction_sig_gw <- ggplot(icc_interaction_sig_gw_dt, aes(x = percent, y = factor(grp_size, levels = group_size_sig_gw_order), fill = factor(grp, levels = rev(vars)))) + 
+                geom_density_ridges(stat = "binline", bins = 100, scale = 0.7, draw_baseline = FALSE, aes(height =..ndensity..), alpha = 0.75) +
+                geom_boxplot(size = 0.5,width = .15, outlier.size = 0.25, position = position_nudge(y=-0.12), alpha = 0.75) +
+                coord_cartesian(xlim = c(1.2, NA), clip = "off") +
+                theme_classic() +
+                theme(axis.title.y=element_blank()) +
+                xlab("Percent Variance Explained") +
+                scale_y_discrete(expand = c(0.03, 0)) +
+                scale_fill_manual(values = var_colors) +
+                geom_vline(xintercept = 1, linetype = "dashed", color = "firebrick3") +
+                labs(fill="Covariate")
+
+ggsave(pRaincloud_interaction_sig_gw, filename = paste0(outdir, "variance_explained_raincloud_interaction_significant_genome_wide.png"), height = 8, width = 7)
+ggsave(pRaincloud_interaction_sig_gw, filename = paste0(outdir, "variance_explained_raincloud_interaction_significant_genome_wide.pdf"), height = 8, width = 7)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+icc_interaction_sig_dt[gene == "ENSG00000106153"]
+icc_dt[gene == "ENSG00000106153"]
+icc_interaction_plus_dt[gene == "ENSG00000106153"]
+icc_interaction_sig_gw_dt[gene == "ENSG00000106153"]
+icc_interaction_sig_gw_dt2[gene == "ENSG00000106153"]
+
+
+
+
+
+
+
+##### Add gene IDs for easy identification downstream #####
+GeneConversion1 <- read_delim("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/data/Expression_200128_A00152_0196_BH3HNFDSXY/GE/DRENEA_1/outs/filtered_feature_bc_matrix/features.tsv.gz", col_names = F, delim = "\t")
+GeneConversion2 <- read_delim("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/data/Expression_200128_A00152_0196_BH3HNFDSXY/GE/Village_B_1_week/outs/filtered_feature_bc_matrix/features.tsv.gz", col_names = F, delim = "\t")
+
+GeneConversion <- unique(rbind(GeneConversion1, GeneConversion2))
+GeneConversion <- GeneConversion[!duplicated(GeneConversion$X1),]
+GeneConversion$X3 <- NULL
+colnames(GeneConversion) <- c("gene", "Gene_ID")
+
+GeneConversion <- data.table(GeneConversion)
+
+
+### Add the gene IDs to the icc_dt ###
+
+icc_interaction_sig_gw_dt <- GeneConversion[icc_interaction_sig_gw_dt, on = "gene"]
+
+icc_interaction_sig_gw_dt[grp == "Cryopreserved"& percent_round > 1][rev(order(percent))]$Gene_ID
+head(icc_interaction_sig_gw_dt[grp == "Cryopreserved" & percent_round > 1][rev(order(percent))][,c("Gene_ID", "percent_round")], n = 50)
+icc_interaction_sig_gw_dt[grp == "Line"][rev(order(percent))]$Gene_ID
+head(icc_interaction_sig_gw_dt[grp == "Line" & percent_round > 1][rev(order(percent))][,c("Gene_ID", "percent_round")], n = 50)
+icc_interaction_sig_gw_dt[grp == "Village"][rev(order(percent))]$Gene_ID
+head(icc_interaction_sig_gw_dt[grp == "Village" & percent_round > 1][rev(order(percent))][,c("Gene_ID", "percent_round")], n = 50)
+
+fwrite(icc_interaction_sig_gw_dt, paste0(outdir, "sig_results.tsv.gz"), sep = "\t", compress = "gzip")
+
+## Highlight 
+##          X chromosome genes - wouldn't expect these to be Line-biased because expressed by both males and females
+##          Y chromosome genes - should be line-biased because expressed by only males and have some male(s) and some female(s)
+##          mt genes
+##          ribosomal genes
+##          look at gsea and kegg pathways for each
+
+## Read in gtf used as reference and pull just X, Y or MT chromosome genes from it, use ribosomal file for rb genes
+gtf <- fread("/directflow/GWCCGPipeline/projects/reference/refdata-cellranger-GRCh38-3.0.0/genes/genes.gtf", sep = "\t", autostart = 6, header = FALSE)
+
+gtf_genes <- gtf[!(grep("transcript_id", V9))]
+
+gtf_genes$V9 <- gsub("gene_id \"", "",gtf_genes$V9 ) %>%
+            gsub("\"; gene_version \"", ";", .) %>%
+            gsub("\"; gene_name \"", ";", .) %>%
+            gsub("\"; gene_source \"", ";", .) %>%
+            gsub("\"; gene_biotype \"", ";", .) %>%
+            gsub("\"", "", .)
+
+gtf_genes[, c("gene_id", "gene_version", "gene_name", "gene_source", "gene_biotype") := data.table(str_split_fixed(V9,";", 5))]
+
+icc_interaction_plus_dt <- GeneConversion[icc_interaction_plus_dt, on = "gene"]
+
+
+
+X_chromosome_genes <- gtf_genes[V1 == "X"]
+X_genelist <- X_chromosome_genes$gene_id[X_chromosome_genes$gene_id %in% genes]
+Y_chromosome_genes <- gtf_genes[V1 == "Y"]
+Y_genelist <- Y_chromosome_genes$gene_id[Y_chromosome_genes$gene_id %in% genes]
+MT_chromosome_genes <- gtf_genes[V1 == "MT"]
+MT_genelist <- MT_chromosome_genes$gene_id[MT_chromosome_genes$gene_id %in% genes]
+RbGeneList <- read.delim(file = "/directflow/SCCGGroupShare/projects/DrewNeavin/References/RibosomalGeneList_GeneID_ENSG.txt")
+Rb_genelist <- RbGeneList$ENSG[RbGeneList$ENSG %in% genes]
+
+
+### Make stacked bar plots of the variance explained by different factors for these gene groups
+
+
+
+
+## Figure of x chromosome genes ##
+icc_x <- icc_interaction_plus_dt[data.table(gene = icc_interaction_plus_dt[grp == "Residual"][gene %in% X_genelist][order(percent_round)]$gene), on = "gene"]
+icc_x$grp <- factor(icc_x$grp, levels = rev(selected_vars))
+icc_x$gene <- factor(icc_x$gene, levels = unique(icc_x$gene))
+
+bar_proportions_x <- ggplot(icc_x, aes(x = gene, y = percent, fill = grp)) +
+    geom_bar(position="stack", stat="identity", alpha = 0.75) +
+    theme_classic() +
+    theme(axis.title.x=element_blank(),
+        axis.text.x=element_blank(),
+        axis.ticks.x=element_blank(),
+        plot.title = element_text(hjust = 0.5)) +
+    scale_fill_manual(values = var_colors) +
+    ggtitle("Variance Explained of\nX Chromosome Genes")
+
+
+ggsave(bar_proportions_x, filename = paste0(outdir, "variance_explained_bar_x_genes.png"), width = 20)
+
+
+## Figure of y chromosome genes ##
+icc_y <- icc_interaction_plus_dt[data.table(gene = icc_interaction_plus_dt[grp == "Residual"][gene %in% Y_genelist][order(percent_round)]$gene), on = "gene"]
+icc_y$grp <- factor(icc_y$grp, levels = rev(selected_vars))
+icc_y$gene <- factor(icc_y$gene, levels = unique(icc_y$gene))
+icc_y$Gene_ID <- factor(icc_y$Gene_ID, levels = unique(icc_y$Gene_ID))
+
+
+bar_proportions_y <- ggplot(icc_y, aes(x = Gene_ID, y = percent, fill = grp)) +
+    geom_bar(position="stack", stat="identity", alpha = 0.75) +
+    theme_classic() +
+    theme(axis.title.x=element_blank(),
+        # axis.text.x=element_blank(),
+        # axis.ticks.x=element_blank(),
+        plot.title = element_text(hjust = 0.5),
+        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
+    scale_fill_manual(values = var_colors) +
+    scale_y_continuous(expand = c(0, 0)) +
+    ggtitle("Variance Explained of\nY Chromosome Genes") +
+    ylab("Percent")
+
+
+ggsave(bar_proportions_y, filename = paste0(outdir, "variance_explained_bar_y_genes.png"), width = 4.5, height = 4.5)
+ggsave(bar_proportions_y, filename = paste0(outdir, "variance_explained_bar_y_genes.pdf"), width = 4.5, height = 4.5)
+
+
+
+## Figure of mt chromosome genes ##
+icc_mt <- icc_interaction_plus_dt[data.table(gene = icc_interaction_plus_dt[grp == "Residual"][gene %in% MT_genelist][order(percent_round)]$gene), on = "gene"]
+icc_mt$grp <- factor(icc_mt$grp, levels = rev(selected_vars))
+icc_mt$gene <- factor(icc_mt$gene, levels = unique(icc_mt$gene))
+icc_mt$Gene_ID <- factor(icc_mt$Gene_ID, levels = unique(icc_mt$Gene_ID))
+
+
+bar_proportions_mt <- ggplot(icc_mt, aes(x = Gene_ID, y = percent, fill = grp)) +
+    geom_bar(position="stack", stat="identity", alpha = 0.75) +
+    theme_classic() +
+    theme(axis.title.x=element_blank(),
+        # axis.text.x=element_blank(),
+        # axis.ticks.x=element_blank(),
+        plot.title = element_text(hjust = 0.5),
+        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
+    scale_fill_manual(values = var_colors) +
+    scale_y_continuous(expand = c(0, 0)) +
+    ggtitle("Variance Explained of\nMitochondrial Genes") +
+    ylab("Percent")
+
+ggsave(bar_proportions_mt, filename = paste0(outdir, "variance_explained_bar_mt_genes.png"), width = 4.5, height = 4.5)
+ggsave(bar_proportions_mt, filename = paste0(outdir, "variance_explained_bar_mt_genes.pdf"), width = 4.5, height = 4.5)
+
+
+
+## Figure of mt chromosome genes ##
+icc_rb <- icc_interaction_plus_dt[data.table(gene = icc_interaction_plus_dt[grp == "Residual"][gene %in% Rb_genelist][order(percent_round)]$gene), on = "gene"]
+icc_rb$grp <- factor(icc_rb$grp, levels = rev(selected_vars))
+icc_rb$gene <- factor(icc_rb$gene, levels = unique(icc_rb$gene))
+
+
+bar_proportions_rb <- ggplot(icc_rb, aes(x = gene, y = percent, fill = grp)) +
+    geom_bar(position="stack", stat="identity", alpha = 0.75) +
+    theme_classic() +
+    theme(axis.title.x=element_blank(),
+        axis.text.x=element_blank(),
+        axis.ticks.x=element_blank(),
+        plot.title = element_text(hjust = 0.5)) +
+    scale_fill_manual(values = var_colors) +
+    scale_y_continuous(expand = c(0, 0)) +
+    ggtitle("Variance Explained\nof Ribosomal Genes")
+
+
+ggsave(bar_proportions_rb, filename = paste0(outdir, "variance_explained_bar_rb_genes.png"), width = 10, height = 4)
+ggsave(bar_proportions_rb, filename = paste0(outdir, "variance_explained_bar_rb_genes.pdf"), width = 10, height = 4)
+
+
+### Plot Pluripotency Genes ###
+pluri_genes <- fread(paste0(dir,"data/pluripotency_genes.tsv"), sep = "\t", col.names = "Gene_ID", header = FALSE)
+
+
+pluri_genes <- GeneConversion[pluri_genes, on = "Gene_ID"]
+
+
+icc_dt_pluri_genes <- icc_interaction_plus_dt[pluri_genes,on = c("gene")]
+icc_dt_pluri_genes$grp <- factor(icc_dt_pluri_genes$grp, levels = rev(selected_vars))
+icc_dt_pluri_genes <- icc_dt_pluri_genes[data.table(gene = icc_dt_pluri_genes[grp == "Residual"][order(percent)]$gene), on = "gene"]
+icc_dt_pluri_genes$Gene_ID <- factor(icc_dt_pluri_genes$Gene_ID, levels = unique(icc_dt_pluri_genes$Gene_ID))
+ 
+
+
+pPluri_Genes_Cont <- ggplot() +
+						geom_bar(data = icc_dt_pluri_genes, aes(Gene_ID, percent, fill = grp), position = "stack", stat = "identity", alpha = 0.75) +
+						theme_classic() +
+						# facet_wrap(Gene_ID ~ ., nrow = 3) +
+						scale_fill_manual(values = var_colors) +
+						theme(plot.title = element_text(hjust = 0.5),
+                            axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
+						ylab("Percent Gene Expression Variance Explained") +
+                        ggtitle("Variance Explained of\nPluripotency Genes") +
+						theme(axis.title.x=element_blank())
+
+ggsave(pPluri_Genes_Cont, filename = paste0(outdir, "Pluripotent_Gene_Variable_Contributions.png"), width = 6, height = 4.5)
+ggsave(pPluri_Genes_Cont, filename = paste0(outdir, "Pluripotent_Gene_Variable_Contributions.pdf"), width = 6, height = 4.5)
+
+
+
+##### check for variance explained for eQTL genes (from Kilpinen et al) that are #####
+eqtls <- fread("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/gene_snp_list.tsv", sep = "\t")
+
+
+eqtls_icc <- icc_interaction_plus_dt[unique(eqtls[,"gene"]), on = "gene"]
+eqtls_icc$grp <- factor(eqtls_icc$grp, levels = rev(vars))
+eqtls_icc <- eqtls_icc[data.table(gene = eqtls_icc[grp == "Residual"][order(percent)]$gene), on = "gene"]
+eqtls_icc$Gene_ID <- factor(eqtls_icc$Gene_ID, levels = unique(eqtls_icc$Gene_ID))
+ 
+
+group_size_eqtl  <- data.table(table(eqtls_icc$grp))
+colnames(group_size_eqtl) <- c("grp", "size")
+group_size_eqtl$grp_size <- paste0(group_size_eqtl$grp, "\nN = ", group_size_eqtl$size)
+
+eqtls_icc <- group_size_eqtl[eqtls_icc, on = "grp"]
+grp_size_order_eqtl <- c("Line\nN = 2371", "Village\nN = 1836", "Site\nN = 2461", "Replicate\nN = 869", "Line:Village\nN = 367", "Line:Site\nN = 911", "Village:Site\nN = 898","Replicate:Village\nN = 305", "Replicate:Line\nN = 25", "Replicate:Site\nN = 78", "Residual\nN = 2542")
+eqtls_icc$grp_size <- factor(eqtls_icc$grp_size, levels = grp_size_order_eqtl)
+
+
+
+eqtls_icc_1pct <- eqtls_icc
+
+
+for (ensg in unique(eqtls_icc$gene)){
+    if (!any(eqtls_icc[gene == ensg & grp != "Residual"]$percent > 1)){
+        eqtls_icc_1pct <- eqtls_icc_1pct[gene != ensg]
+    }
+}
+
+
+
+eqtls_icc_1pct_grouped_list <- list()
+
+
+for (ensg in unique(eqtls_icc_1pct$gene)){
+    group <- eqtls_icc_1pct[gene == ensg & grp != "Residual"][which.max(percent)]$grp
+    eqtls_icc_1pct_grouped_list[[group]][[ensg]] <- eqtls_icc_1pct[gene == ensg]
+}
+
+eqtls_icc_1pct_grouped <- lapply(eqtls_icc_1pct_grouped_list, function(x) do.call(rbind, x))
+eqtls_icc_1pct_grouped <- lapply(names(eqtls_icc_1pct_grouped), function(x){
+    eqtls_icc_1pct_grouped[[x]]$largest_contributor <- x
+    return(eqtls_icc_1pct_grouped[[x]])
+})
+
+eqtls_icc_1pct_grouped_dt <- do.call(rbind, eqtls_icc_1pct_grouped)
+
+eqtls_icc_1pct_grouped_dt$largest_contributor <- factor(eqtls_icc_1pct_grouped_dt$largest_contributor, levels = c("Line", "Village", "Site", "Line:Village", "Line:Site", "Village:Site", "Replicate:Village"))
+
+
+
+pPluri_Genes_largest_Cont_eqtl <- ggplot() +
+						geom_bar(data = eqtls_icc_1pct_grouped_dt, aes(Gene_ID, percent, fill = factor(grp, levels = rev(vars))), position = "stack", stat = "identity", alpha = 0.75) +
+						theme_classic() +
+						facet_grid(. ~ largest_contributor, scales = "free_x", space = "free_x") +
+						scale_fill_manual(values = var_colors) +
+						ylab("Percent Gene Expression Variance Explained") +
+						theme(axis.title.x=element_blank(),
+                            axis.text.x = element_blank(),
+                            panel.spacing.x=unit(0, "lines"),
+                            axis.ticks.x = element_blank()) +
+                        geom_hline(yintercept = 1, linetype = "dashed") 
+                        # scale_y_discrete(expand = c(0.03, 0))
+                        # scale_x_discrete(expand = c(0.03, 0))
+
+
+ggsave(pPluri_Genes_largest_Cont_eqtl, filename = paste0(outdir, "eQTL_Genes_Variance_Contributions_1pct_largest_cont.png"), width = 10, height = 4)
+ggsave(pPluri_Genes_largest_Cont_eqtl, filename = paste0(outdir, "eQTL_Genes_Variance_Contributions_1pct_largest_cont.pdf"), width = 10, height = 4)
+
+
+
+
+
+
+
+# ### Count number of each chromosome/gene category type 
+# ### numbers are the numbers of that category that where a significant percent of variance is explained by this variable
+# ### percent of that category that where a significant percent of variance is explained by this variable
+
+# x_number <- lapply(genes_list, function(x){
+#     length(which(x %in% X_chromosome_genes$gene_id))
+# })
+
+# x_percent <- lapply(genes_list, function(x){
+#     length(which(x %in% X_chromosome_genes$gene_id))/length(X_chromosome_genes$gene_id)
+# })
+
+# y_number <- lapply(genes_list, function(x){
+#     length(which(x %in% Y_chromosome_genes$gene_id))
+# })
+
+# y_percent <- lapply(genes_list, function(x){
+#     length(which(x %in% Y_chromosome_genes$gene_id))/length(Y_chromosome_genes$gene_id)
+# })
+
+# mt_number <- lapply(genes_list, function(x){
+#     length(which(x %in% MT_chromosome_genes$gene_id))
+# })
+
+# mt_percent <- lapply(genes_list, function(x){
+#     length(which(x %in% MT_chromosome_genes$gene_id))/length(MT_chromosome_genes$gene_id)
+# })
+
+# rb_number <- lapply(genes_list, function(x){
+#     length(which(x %in% RbGeneList$ENSG))
+# })
+
+# rb_percent <- lapply(genes_list, function(x){
+#     length(which(x %in% RbGeneList$ENSG))/length(RbGeneList$ENSG)
+# })
+
+
+
+# ### for > 1% var explained
+# ### Count number of each chromosome/gene category type 
+# x_number_1 <- lapply(vars, function(group){
+#     genes <- icc_dt[grp == group][rev(order(percent_round))][percent > 1]$gene
+#     length(which(genes %in% X_chromosome_genes$gene_id))
+# })
+# names(x_number_1) <- vars
+
+# x_percent_1 <- lapply(vars, function(group){
+#     genes <- icc_dt[grp == group][rev(order(percent_round))][percent > 1]$gene
+#     length(which(genes %in% X_chromosome_genes$gene_id))/length(X_chromosome_genes$gene_id)
+# })
+# names(x_percent_1) <- vars
+
+# y_number_1 <- lapply(vars, function(group){
+#     genes <- icc_dt[grp == group][rev(order(percent_round))][percent > 1]$gene
+#     length(which(genes %in% Y_chromosome_genes$gene_id))
+# })
+# names(y_number_1) <- vars
+
+# y_percent_1 <- lapply(vars, function(group){
+#     genes <- icc_dt[grp == group][rev(order(percent_round))][percent > 1]$gene
+#     length(which(genes %in% Y_chromosome_genes$gene_id))/length(Y_chromosome_genes$gene_id)
+# })
+# names(y_percent_1) <- vars
+
+# mt_number_1 <- lapply(vars, function(group){
+#     genes <- icc_dt[grp == group][rev(order(percent_round))][percent > 1]$gene
+#     length(which(genes %in% MT_chromosome_genes$gene_id))
+# })
+# names(mt_number_1) <- vars
+
+# mt_percent_1 <- lapply(vars, function(group){
+#     genes <- icc_dt[grp == group][rev(order(percent_round))][percent > 1]$gene
+#     length(which(genes %in% MT_chromosome_genes$gene_id))/length( MT_chromosome_genes$gene_id)
+# })
+# names(mt_percent_1) <- vars
+
+# rb_number_1 <- lapply(vars, function(group){
+#     genes <- icc_dt[grp == group][rev(order(percent_round))][percent > 1]$gene
+#     length(which(genes %in% RbGeneList$ENSG))
+# })
+# names(rb_number_1) <- vars
+
+# rb_percent_1 <- lapply(vars, function(group){
+#     genes <- icc_dt[grp == group][rev(order(percent_round))][percent > 1]$gene
+#     length(which(genes %in% RbGeneList$ENSG))/length(RbGeneList$ENSG)
+# })
+# names(rb_percent_1) <- vars
+
+
+### Pathway analysis ###
+geneList <- lapply(genes_list, function(x){
+    tmp <- bitr(x, fromType = "ENSEMBL",
+        toType = c("ENTREZID"),
+        OrgDb = org.Hs.eg.db)$ENTREZID
+    tmp[!is.na(tmp)]
+})
+
+
+gg <- list()
+kk <- list()
+
+
+
+df = as.data.frame(org.Hs.egGO)
+go_gene_list = unique(sort(df$gene_id))
+
+dfk = as.data.frame(org.Hs.egPATH)
+kegg_gene_list = unique(sort(dfk$gene_id))
+
+
+
+for (group in c("Line", "Village", "Cryopreserved",  "Replicate","Line:Village", "Line:Cryopreserved", "Village:Cryopreserved", "Replicate:Village", "Replicate:Line", "Replicate:Cryopreserved", "Residual")){
+    kk[[group]] <- enrichKEGG(gene = geneList[[group]],
+                    universe = geneList[[group]],
+                    organism     = 'hsa',
+                    pvalueCutoff = 0.05,
+                    keyType = 'ncbi-geneid')
+
+    gg[[group]] <- groupGO(gene = geneList[[group]],
+                OrgDb = org.Hs.eg.db,
+                readable = TRUE)
+
+}
+
+
+hsGO <- godata('org.Hs.eg.db', ont="MF")
+
+
+sim_results <- list()
+
+vars <- c("Line", "Village", "Cryopreserved",  "Replicate","Village:Line", "Line:Cryopreserved", "Village:Cryopreserved", "Replicate:Village", "Replicate:Line", "Replicate:Cryopreserved")
+
+for (group1 in vars){
+    print(group1)
+    for (group2 in vars[(grep(paste0("^",group1, "$"), vars) + 1): length(vars)]){
+        print(group2)
+        sim_results[[group1]][[group2]] <- clusterSim(geneList[[group1]], geneList[[group2]], semData=hsGO, measure="Wang", combine="BMA")
+    }
+}
+
+
+
+
+
+# genes2rerun <- c(character())
+
+# for (g in unique(icc_dt$gene)){
+#     print(g)
+#     genes2rerun <- c(genes2rerun, as.character(unique(icc_dt[gene == g][P > 0.05/(nrow(icc_dt[gene == g])-1)]$gene)))
+# }
+
+
+# for (gene in genes2rerun){
+#     print(gene)
+#     # unlink(paste0(icc_dir,gene,"_icc.rds"))
+#     print(file.exists(paste0(icc_dir,gene,"_icc.rds")))
+#     unlink(paste0("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/fit_models/",gene,"_fitted_models.rds"))
+#     unlink(paste0("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/residuals4qtl/",gene,"_residuals4qtl.rds"))
+# }
+
+
diff --git a/multi-passage/Variance/variance_partition_multipassage_integratedSCT.R b/multi-passage/Variance/variance_partition_multipassage_integratedSCT.R
new file mode 100644
index 0000000..c7ddd7b
--- /dev/null
+++ b/multi-passage/Variance/variance_partition_multipassage_integratedSCT.R
@@ -0,0 +1,357 @@
+library(haven)
+library(ggplot2)
+library(glmmTMB)
+library(Seurat)
+library(tidyverse)
+library(specr)
+library(data.table)
+library(dsLib)
+library(pkgcond)
+library(texreg)
+
+
+inicio("Starting Analysis")
+
+
+##### Define functions #####
+icc_glmmtmb <- function(model, percent = TRUE) {
+    tmp <- VarCorr(model)
+    var <- do.call(rbind, lapply(names(tmp$cond), function(x) data.table("grp" = x, "vcov" = attr(tmp$cond[[x]], "stddev")^2)))
+    var <- rbind(var, data.table("grp" = "Residual", "vcov" = sigma(model)^2))
+    sum_var <- sum(var$vcov)
+    var <- var %>% dplyr::mutate(icc = vcov/sum_var)
+    if (isTRUE(percent)) {
+        var <- var %>% dplyr::mutate(percent = .data$icc * 100)
+    }
+    return(var)
+}
+
+
+
+##### Bring in variables #####
+### Bring in arguments
+args <- commandArgs(trailingOnly = TRUE)
+icc_action_outdir <- paste0(args[1])
+icc_outdir <- paste0(args[2])
+model_action_outdir <- paste0(args[3])
+model_outdir <- paste0(args[4])
+resid_outdir <- paste0(args[5])
+gene <- as.character(args[6])
+
+print(icc_outdir)
+print(icc_outdir)
+print(model_outdir)
+print(resid_outdir)
+print(gene)
+
+
+
+##### Read in seurat with genes #####
+seurat <- readRDS("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/preQC/time-integrated_filtered_seurat_1pct_expressing.rds")
+
+
+
+### Make DF for modeling ###
+df_hier_unscale <- data.frame("Expression" = seurat[["SCT"]]@scale.data[gene,], "Line" = seurat@meta.data$Assignment, "Time" = as.factor(seurat@meta.data$Pool))
+colnames(df_hier_unscale)[1] <- "Expression"
+
+
+
+##### Leave one out method #####
+variables <- c("Line", "Time")
+
+model_all <- as.formula(paste0("Expression ~ (1|", paste0(variables, collapse = ") + (1|"), ")"))
+
+
+boolFalse<-F
+while(boolFalse==F & length(variables) > 0){
+  tryCatch({
+    print(variables)
+    model_glmmtmb <- suppress_warnings(glmmTMB(formula = noquote(model_all), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+    boolFalse<-T
+  },error=function(e){
+    if (length(variables) > 1){
+        variables <- variables[1:(length(variables) -1)]
+    } else {
+        variables <- c()
+    }
+  })
+}
+
+
+if (!length(variables) == 0){
+
+
+    ### Deal with singular fits by removing last variable until a fit can be found - ordered in variables buy importance
+    while (!model_glmmtmb$sdr$pdHess & length(variables) > 0 ){
+        print("Singular fit: removing last variable and rerunning with one less covariate.")
+        if (length(variables) > 1){
+            variables <- variables[1:(length(variables) -1)]
+            print(variables)
+            model_all <- as.formula(paste0("Expression ~ (1|", paste0(variables, collapse = ") + (1|"), ")"))
+            model_glmmtmb <- suppress_warnings(glmmTMB(formula = noquote(model_all), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+        } else {
+            variables <- c()
+        }
+    }
+
+    print(variables)
+
+    if (length(variables) > 0){
+
+        model_loo <- list()
+
+        icc <- data.table(grp = variables, P = as.numeric(NA))
+
+        for (variable in variables){
+            print(variable)
+            if (length(variables) > 1){
+                model <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")"))
+            } else {
+                model <- as.formula(paste0("Expression ~ 1"))
+            }
+            model_loo[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+            icc[grp == variable]$P <- anova(model_loo[[variable]], model_glmmtmb)$`Pr(>Chisq)`[2]
+        }
+
+
+        if (!(any(icc[grp != "Residual"]$P > 0.05/length(variables)) | any(is.na(icc[grp != "Residual"]$P)))){
+            model_loo_updated <- model_loo
+
+            updated_model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(variables, collapse = ") + (1|"), ")"))
+
+            model_loo_updated[["all"]] <- suppress_warnings(glmmTMB(formula = noquote(updated_model), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+
+            ### Calculate the variance explained by each of the included variables ###
+            icc <- icc_glmmtmb(model_loo_updated[["all"]])
+
+
+            ### Recalculate significance ###
+            icc$P <- as.numeric(NA)
+            icc$gene <- gene
+
+            for (variable in variables){
+                print(variable)
+                if (length(variables) > 1){
+                    model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")"))
+                } else {
+                    model <- as.formula(paste0("Expression ~ 1"))
+                }
+                model_loo_updated[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+                icc[grp == variable]$P <- anova(model_loo_updated[[variable]], model_loo_updated[["all"]])$`Pr(>Chisq)`[2]
+            }
+        }
+
+
+        while((any(icc[grp != "Residual"]$P > 0.05/length(variables)) | any(is.na(icc[grp != "Residual"]$P)))){
+
+            print("Removing non-significant vartiables and retesting signficance")
+
+            ##### Identify variables to keep #####
+            variables <- icc[P < 0.05/length(variables)]$grp
+
+            if (length(variables) > 0){
+                
+                ##### Calculate full model #####
+                updated_model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(variables, collapse = ") + (1|"), ")"))
+
+
+                model_loo_updated <- list()
+                model_loo_updated[["all"]] <- suppress_warnings(glmmTMB(formula = noquote(updated_model), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+
+
+
+                ### Calculate the variance explained by each of the included variables ###
+                icc <- icc_glmmtmb(model_loo_updated[["all"]])
+
+
+
+                ### Recalfulate significance ###
+                icc$P <- as.numeric(NA)
+                icc$gene <- gene
+
+                for (variable in variables){
+                    print(variable)
+                    if (length(variables) > 1){
+                        model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")"))
+                    } else {
+                        model <- as.formula(paste0("Expression ~ 1"))
+                    }
+                    model_loo_updated[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+                    icc[grp == variable]$P <- anova(model_loo_updated[[variable]], model_loo_updated[["all"]])$`Pr(>Chisq)`[2]
+                }
+
+
+        
+            } else {
+                icc <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character())
+                model_loo_updated <- list()
+            }
+        }
+
+        action_variables <- c()
+
+        if (length(variables) > 1){
+            ### Add in actions of the significant variables
+            if ("Line" %in% variables & "Time" %in% variables){
+                action_variables <- c(action_variables, "Line:Time")
+            }
+
+            model_all_action <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, action_variables), collapse = ") + (1|"), ")"))
+
+
+            boolFalse<-F
+            while(boolFalse==F & length(action_variables) > 0){
+                tryCatch({
+                    print(c(variables, action_variables))
+                    model_glmmtmb_action <- suppress_warnings(glmmTMB(formula = noquote(model_all_action), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+                    boolFalse<-T
+                },error=function(e){
+                    if (length(action_variables) > 1){
+                        action_variables <- action_variables[1:(length(action_variables) -1)]
+                    } else {
+                        action_variables <- c()
+                    }   
+                })
+            }
+
+            ### Deal with singular fits by removing last variable until a fit can be found - ordered in variables buy importance
+            while (!model_glmmtmb_action$sdr$pdHess & length(action_variables) > 0 ){
+                print("Singular fit: removing last variable and rerunning with one less covariate.")
+                if (length(action_variables) > 1){
+                    action_variables <- action_variables[1:(length(action_variables) -1)]
+                    print(c(action_variables, variables))
+                    model_all_action <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, action_variables), collapse = ") + (1|"), ")"))
+                    model_glmmtmb_action <- suppress_warnings(glmmTMB(formula = noquote(model_all_action), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+                } else {
+                    action_variables <- c()
+                }
+            }
+
+            if (length(action_variables) > 0){
+
+                model_loo_action <- list()
+
+                icc_action <- data.table(grp = action_variables, P = as.numeric(NA))
+
+                for (variable in c(action_variables)){
+                    print(variable)
+                    if (length(action_variables) > 1){
+                        model_action <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, action_variables)[!c(variables, action_variables) %in% variable], collapse = ") + (1|"), ")"))
+                    } else {
+                        model_action <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")"))
+                    }
+                    model_loo_action[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model_action), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+                    icc_action[grp == variable]$P <- anova(model_loo_action[[variable]], model_glmmtmb_action)$`Pr(>Chisq)`[2]
+                }
+
+
+                if (!(any(icc_action[grp != "Residual"]$P > 0.05/length(c(variables, action_variables))) | any(is.na(icc_action[grp != "Residual"]$P)))){
+                    model_loo_action_updated <- model_loo_action
+
+                    updated_model_action <- as.formula(paste0("Expression ~ 1 + (1|", paste0(c(variables, action_variables), collapse = ") + (1|"), ")"))
+
+                    model_loo_action_updated[["all"]] <- suppress_warnings(glmmTMB(formula = noquote(updated_model_action), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+
+                    ### Calculate the variance explained by each of the included variables ###
+                    icc_action <- icc_glmmtmb(model_loo_action_updated[["all"]])
+
+
+                    ### Recalculate significance ###
+                    icc_action$P <- as.numeric(NA)
+                    icc_action$gene <- gene
+
+                    for (variable in c(variables, action_variables)){
+                        print(variable)
+                        if (length(c(action_variables)) > 1){
+                            model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(c(variables, action_variables)[!c(variables, action_variables) %in% variable], collapse = ") + (1|"), ")"))
+                        } else {
+                            model <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")"))
+                        }
+                        model_loo_action_updated[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+                        icc_action[grp == variable]$P <- anova(model_loo_action_updated[[variable]], model_loo_action_updated[["all"]])$`Pr(>Chisq)`[2]
+                    }
+                }
+
+
+                while((any(icc_action[!(grp %in% c("Residual", variables))]$P > 0.05/length(c(variables, action_variables))) | any(is.na(icc_action[!(grp %in%c("Residual", variables))]$P)))){
+
+                    print("Removing non-significant vartiables and retesting signficance")
+
+                    ##### Identify variables to keep #####
+                    action_variables <- icc_action[!(grp %in% c("Residual", variables)) & P < 0.05/length(c(variables, action_variables))]$grp
+
+                    if (length(action_variables) > 0){
+                        
+                        ##### Calculate full model #####
+                        updated_model_action <- as.formula(paste0("Expression ~ 1 + (1|", paste0(c(variables, action_variables), collapse = ") + (1|"), ")"))
+
+
+                        model_loo_action_updated <- list()
+                        model_loo_action_updated[["all"]] <- suppress_warnings(glmmTMB(formula = noquote(updated_model_action), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+
+
+
+                        ### Calculate the variance explained by each of the included variables ###
+                        icc_action <- icc_glmmtmb(model_loo_action_updated[["all"]])
+
+
+
+                        ### Recalfulate significance ###
+                        icc_action$P <- as.numeric(NA)
+                        icc_action$gene <- gene
+
+                        for (variable in c(variables, action_variables)){
+                            print(variable)
+                            model_action <- as.formula(paste0("Expression ~ 1 + (1|", paste0(c(variables, action_variables)[!(c(variables, action_variables) %in% variable)], collapse = ") + (1|"), ")"))
+                            model_loo_action_updated[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model_action), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+                            icc_action[grp == variable]$P <- anova(model_loo_action_updated[[variable]], model_loo_action_updated[["all"]])$`Pr(>Chisq)`[2]
+                        }
+                    } else {
+                        icc_action <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character())
+                        model_loo_action_updated <- list()
+                    }
+                }
+            } else {
+                icc_action <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character())
+                model_loo_action_updated <- list()
+            }
+            if (nrow(icc_action) > nrow(icc)){
+                saveRDS(icc_action, paste0(icc_action_outdir, gene, "_icc.rds"), compress = TRUE)
+                saveRDS(model_loo_action_updated, paste0(model_action_outdir, gene, "_fitted_models.rds"), compress = TRUE)
+            }
+
+
+            ### If line is significant, then get residuals for downstream qtl checks ###
+            if ("Line" %in% variables){
+                print("Making residuals for qtl detection")
+                if (length(variables) > 1){
+                        if (length(action_variables) > 0){
+                            model_no_line <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, action_variables)[-grep("Line", c(variables, action_variables))], collapse = ") + (1|"), ")"))
+                        } else {
+                            model_no_line <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% "Line"], collapse = ") + (1|"), ")"))
+                        }
+                } else {
+                    model_no_line <- as.formula(paste0("Expression ~ 1"))
+                }
+                fit_no_line <- glmmTMB(formula = noquote(model_no_line), data = df_hier_unscale, REML = TRUE)
+                residuals <- resid(fit_no_line)
+                saveRDS(residuals, paste0(resid_outdir, gene, "_residuals4qtl.rds"), compress = TRUE)
+            }
+        
+        } else {
+            icc <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character())
+            model_loo_updated <- list()
+        }
+    } else {
+        icc <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character())
+        model_loo_updated <- list()
+    }
+}
+
+
+saveRDS(icc, paste0(icc_outdir, gene, "_icc.rds"), compress = TRUE)
+saveRDS(model_loo_updated, paste0(model_outdir, gene, "_fitted_models.rds"), compress = TRUE)
+
+
+fin()
\ No newline at end of file
diff --git a/multi-passage/Variance/variance_partition_multipassage_integratedSCT_Ncov.R b/multi-passage/Variance/variance_partition_multipassage_integratedSCT_Ncov.R
new file mode 100644
index 0000000..6b23a9a
--- /dev/null
+++ b/multi-passage/Variance/variance_partition_multipassage_integratedSCT_Ncov.R
@@ -0,0 +1,358 @@
+library(haven)
+library(ggplot2)
+library(glmmTMB)
+library(Seurat)
+library(tidyverse)
+library(specr)
+library(data.table)
+library(dsLib)
+library(pkgcond)
+library(texreg)
+
+
+inicio("Starting Analysis")
+
+
+##### Define functions #####
+icc_glmmtmb <- function(model, percent = TRUE) {
+    tmp <- VarCorr(model)
+    var <- do.call(rbind, lapply(names(tmp$cond), function(x) data.table("grp" = x, "vcov" = attr(tmp$cond[[x]], "stddev")^2)))
+    var <- rbind(var, data.table("grp" = "Residual", "vcov" = sigma(model)^2))
+    sum_var <- sum(var$vcov)
+    var <- var %>% dplyr::mutate(icc = vcov/sum_var)
+    if (isTRUE(percent)) {
+        var <- var %>% dplyr::mutate(percent = .data$icc * 100)
+    }
+    return(var)
+}
+
+
+
+##### Bring in variables #####
+### Bring in arguments
+args <- commandArgs(trailingOnly = TRUE)
+icc_interaction_outdir <- paste0(args[1])
+icc_outdir <- paste0(args[2])
+model_interaction_outdir <- paste0(args[3])
+model_outdir <- paste0(args[4])
+resid_outdir <- paste0(args[5])
+gene <- as.character(args[6])
+
+print(icc_outdir)
+print(icc_outdir)
+print(model_outdir)
+print(resid_outdir)
+print(gene)
+
+
+
+##### Read in seurat with genes #####
+seurat <- readRDS("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/preQC/time-integrated_filtered_seurat_1pct_expressing.rds")
+
+
+
+### Make DF for modeling ###
+df_hier_unscale <- data.frame("Expression" = seurat[["SCT"]]@scale.data[gene,], "Line" = seurat@meta.data$Assignment, "Time" = as.factor(seurat@meta.data$Pool), "Ncov" = seurat$Ncov)
+colnames(df_hier_unscale)[1] <- "Expression"
+
+
+
+##### Leave one out method #####
+variables <- c("Line", "Time", "Ncov")
+
+model_all <- as.formula(paste0("Expression ~ (1|", paste0(variables, collapse = ") + (1|"), ")"))
+
+
+boolFalse<-F
+while(boolFalse==F & length(variables) > 0){
+  tryCatch({
+    print(variables)
+    model_glmmtmb <- suppress_warnings(glmmTMB(formula = noquote(model_all), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+    boolFalse<-T
+  },error=function(e){
+    if (length(variables) > 1){
+        variables <- variables[1:(length(variables) -1)]
+    } else {
+        variables <- c()
+    }
+  })
+}
+
+
+if (!length(variables) == 0){
+
+
+    ### Deal with singular fits by removing last variable until a fit can be found - ordered in variables buy importance
+    while (!model_glmmtmb$sdr$pdHess & length(variables) > 0 ){
+        print("Singular fit: removing last variable and rerunning with one less covariate.")
+        if (length(variables) > 1){
+            variables <- variables[1:(length(variables) -1)]
+            print(variables)
+            model_all <- as.formula(paste0("Expression ~ (1|", paste0(variables, collapse = ") + (1|"), ")"))
+            model_glmmtmb <- suppress_warnings(glmmTMB(formula = noquote(model_all), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+        } else {
+            variables <- c()
+        }
+    }
+
+    print(variables)
+
+    if (length(variables) > 0){
+
+        model_loo <- list()
+
+        icc <- data.table(grp = variables, P = as.numeric(NA))
+
+        for (variable in variables){
+            print(variable)
+            if (length(variables) > 1){
+                model <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")"))
+            } else {
+                model <- as.formula(paste0("Expression ~ 1"))
+            }
+            model_loo[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+            icc[grp == variable]$P <- anova(model_loo[[variable]], model_glmmtmb)$`Pr(>Chisq)`[2]
+        }
+
+
+        if (!(any(icc[grp != "Residual"]$P > 0.05/length(variables)) | any(is.na(icc[grp != "Residual"]$P)))){
+            model_loo_updated <- model_loo
+
+            updated_model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(variables, collapse = ") + (1|"), ")"))
+
+            model_loo_updated[["all"]] <- suppress_warnings(glmmTMB(formula = noquote(updated_model), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+
+            ### Calculate the variance explained by each of the included variables ###
+            icc <- icc_glmmtmb(model_loo_updated[["all"]])
+
+
+            ### Recalculate significance ###
+            icc$P <- as.numeric(NA)
+            icc$gene <- gene
+
+            for (variable in variables){
+                print(variable)
+                if (length(variables) > 1){
+                    model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")"))
+                } else {
+                    model <- as.formula(paste0("Expression ~ 1"))
+                }
+                model_loo_updated[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+                icc[grp == variable]$P <- anova(model_loo_updated[[variable]], model_loo_updated[["all"]])$`Pr(>Chisq)`[2]
+            }
+        }
+
+
+        while((any(icc[grp != "Residual"]$P > 0.05/length(variables)) | any(is.na(icc[grp != "Residual"]$P)))){
+
+            print("Removing non-significant vartiables and retesting signficance")
+
+            ##### Identify variables to keep #####
+            variables <- icc[P < 0.05/length(variables)]$grp
+
+            if (length(variables) > 0){
+                
+                ##### Calculate full model #####
+                updated_model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(variables, collapse = ") + (1|"), ")"))
+
+
+                model_loo_updated <- list()
+                model_loo_updated[["all"]] <- suppress_warnings(glmmTMB(formula = noquote(updated_model), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+
+
+
+                ### Calculate the variance explained by each of the included variables ###
+                icc <- icc_glmmtmb(model_loo_updated[["all"]])
+
+
+
+                ### Recalfulate significance ###
+                icc$P <- as.numeric(NA)
+                icc$gene <- gene
+
+                for (variable in variables){
+                    print(variable)
+                    if (length(variables) > 1){
+                        model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")"))
+                    } else {
+                        model <- as.formula(paste0("Expression ~ 1"))
+                    }
+                    model_loo_updated[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+                    icc[grp == variable]$P <- anova(model_loo_updated[[variable]], model_loo_updated[["all"]])$`Pr(>Chisq)`[2]
+                }
+
+
+        
+            } else {
+                icc <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character())
+                model_loo_updated <- list()
+            }
+        }
+
+        interaction_variables <- c()
+
+        if (length(variables) > 1){
+            ### Add in interactions of the significant variables
+            if ("Line" %in% variables & "Time" %in% variables){
+                interaction_variables <- c(interaction_variables, "Line:Time")
+            }
+
+            model_all_interaction <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, interaction_variables), collapse = ") + (1|"), ")"))
+
+
+            boolFalse<-F
+            while(boolFalse==F & length(interaction_variables) > 0){
+                tryCatch({
+                    print(c(variables, interaction_variables))
+                    model_glmmtmb_interaction <- suppress_warnings(glmmTMB(formula = noquote(model_all_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+                    boolFalse<-T
+                },error=function(e){
+                    if (length(interaction_variables) > 1){
+                        interaction_variables <- interaction_variables[1:(length(interaction_variables) -1)]
+                    } else {
+                        interaction_variables <- c()
+                    }
+                })
+            }
+
+        ### Deal with singular fits by removing last variable until a fit can be found - ordered in variables buy importance
+        while (!model_glmmtmb_interaction$sdr$pdHess & length(interaction_variables) > 0 ){
+            print("Singular fit: removing last variable and rerunning with one less covariate.")
+            if (length(interaction_variables) > 1){
+                interaction_variables <- interaction_variables[1:(length(interaction_variables) -1)]
+                print(c(interaction_variables, variables))
+                model_all_interaction <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, interaction_variables), collapse = ") + (1|"), ")"))
+                model_glmmtmb_interaction <- suppress_warnings(glmmTMB(formula = noquote(model_all_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+            } else {
+                interaction_variables <- c()
+            }
+        }
+
+        if (length(interaction_variables) > 0){
+
+            model_loo_interaction <- list()
+
+            icc_interaction <- data.table(grp = interaction_variables, P = as.numeric(NA))
+
+            for (variable in c(interaction_variables)){
+                print(variable)
+                if (length(interaction_variables) > 1){
+                    model_interaction <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, interaction_variables)[!c(variables, interaction_variables) %in% variable], collapse = ") + (1|"), ")"))
+                } else {
+                    model_interaction <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")"))
+                }
+                model_loo_interaction[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+                icc_interaction[grp == variable]$P <- anova(model_loo_interaction[[variable]], model_glmmtmb_interaction)$`Pr(>Chisq)`[2]
+            }
+
+
+            if (!(any(icc_interaction[grp != "Residual"]$P > 0.05/length(c(variables, interaction_variables))) | any(is.na(icc_interaction[grp != "Residual"]$P)))){
+                model_loo_interaction_updated <- model_loo_interaction
+
+                updated_model_interaction <- as.formula(paste0("Expression ~ 1 + (1|", paste0(c(variables, interaction_variables), collapse = ") + (1|"), ")"))
+
+                model_loo_interaction_updated[["all"]] <- suppress_warnings(glmmTMB(formula = noquote(updated_model_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+
+                ### Calculate the variance explained by each of the included variables ###
+                icc_interaction <- icc_glmmtmb(model_loo_interaction_updated[["all"]])
+
+
+                ### Recalculate significance ###
+                icc_interaction$P <- as.numeric(NA)
+                icc_interaction$gene <- gene
+
+                for (variable in c(variables, interaction_variables)){
+                    print(variable)
+                    if (length(c(interaction_variables)) > 1){
+                        model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(c(variables, interaction_variables)[!c(variables, interaction_variables) %in% variable], collapse = ") + (1|"), ")"))
+                    } else {
+                        model <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")"))
+                    }
+                    model_loo_interaction_updated[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+                    icc_interaction[grp == variable]$P <- anova(model_loo_interaction_updated[[variable]], model_loo_interaction_updated[["all"]])$`Pr(>Chisq)`[2]
+                }
+            }
+
+
+            while((any(icc_interaction[!(grp %in% c("Residual", variables))]$P > 0.05/length(c(variables, interaction_variables))) | any(is.na(icc_interaction[!(grp %in%c("Residual", variables))]$P)))){
+
+                print("Removing non-significant vartiables and retesting signficance")
+
+                ##### Identify variables to keep #####
+                interaction_variables <- icc_interaction[!(grp %in% c("Residual", variables)) & P < 0.05/length(c(variables, interaction_variables))]$grp
+
+                if (length(interaction_variables) > 0){
+                    
+                    ##### Calculate full model #####
+                    updated_model_interaction <- as.formula(paste0("Expression ~ 1 + (1|", paste0(c(variables, interaction_variables), collapse = ") + (1|"), ")"))
+
+
+                    model_loo_interaction_updated <- list()
+                    model_loo_interaction_updated[["all"]] <- suppress_warnings(glmmTMB(formula = noquote(updated_model_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+
+
+
+                    ### Calculate the variance explained by each of the included variables ###
+                    icc_interaction <- icc_glmmtmb(model_loo_interaction_updated[["all"]])
+
+
+
+                    ### Recalfulate significance ###
+                    icc_interaction$P <- as.numeric(NA)
+                    icc_interaction$gene <- gene
+
+                    for (variable in c(variables, interaction_variables)){
+                        print(variable)
+                        model_interaction <- as.formula(paste0("Expression ~ 1 + (1|", paste0(c(variables, interaction_variables)[!(c(variables, interaction_variables) %in% variable)], collapse = ") + (1|"), ")"))
+                        model_loo_interaction_updated[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse")
+                        icc_interaction[grp == variable]$P <- anova(model_loo_interaction_updated[[variable]], model_loo_interaction_updated[["all"]])$`Pr(>Chisq)`[2]
+                    }
+                } else {
+                    icc_interaction <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character())
+                    model_loo_interaction_updated <- list()
+                }
+
+                if (nrow(icc_interaction) > nrow(icc)){
+                    saveRDS(icc_interaction, paste0(icc_interaction_outdir, gene, "_icc.rds"), compress = TRUE)
+                    saveRDS(model_loo_interaction_updated, paste0(model_interaction_outdir, gene, "_fitted_models.rds"), compress = TRUE)
+                }
+            }
+        } else {
+            icc_interaction <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character())
+            model_loo_interaction_updated <- list()
+        }
+
+
+        ### If line is significant, then get residuals for downstream qtl checks ###
+        if ("Line" %in% variables){
+            print("Making residuals for qtl detection")
+            if (length(variables) > 1){
+                    if (length(interaction_variables) > 0){
+                        model_no_line <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, interaction_variables)[-grep("Line", c(variables, interaction_variables))], collapse = ") + (1|"), ")"))
+                    } else {
+                        model_no_line <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% "Line"], collapse = ") + (1|"), ")"))
+                    }
+            } else {
+                model_no_line <- as.formula(paste0("Expression ~ 1"))
+            }
+            fit_no_line <- glmmTMB(formula = noquote(model_no_line), data = df_hier_unscale, REML = TRUE)
+            residuals <- resid(fit_no_line)
+            saveRDS(residuals, paste0(resid_outdir, gene, "_residuals4qtl.rds"), compress = TRUE)
+        }
+    
+        } else {
+            icc <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character())
+            model_loo_updated <- list()
+        }
+    } else {
+        icc <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character())
+        model_loo_updated <- list()
+    }
+}
+
+
+saveRDS(icc, paste0(icc_outdir, gene, "_icc.rds"), compress = TRUE)
+saveRDS(model_loo_updated, paste0(model_outdir, gene, "_fitted_models.rds"), compress = TRUE)
+
+
+fin()
\ No newline at end of file
diff --git a/multi-passage/Variance/variance_partition_multipassage_snake.sh b/multi-passage/Variance/variance_partition_multipassage_snake.sh
new file mode 100644
index 0000000..8e72118
--- /dev/null
+++ b/multi-passage/Variance/variance_partition_multipassage_snake.sh
@@ -0,0 +1,136 @@
+
+cd /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/multi-passage/Variance/
+
+SNAKEFILE="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/multi-passage/Variance/variance_partition_multipassage.smk"
+LOG="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/logs/"
+
+mkdir -p $LOG
+
+snakemake \
+  --snakefile $SNAKEFILE \
+  --dryrun \
+  --cores 1 \
+  --quiet \
+  --unlock
+
+
+snakemake \
+  --snakefile $SNAKEFILE \
+  --dryrun \
+  --cores 1 \
+  --reason > jobs2run.txt
+
+
+
+nohup \
+  snakemake \
+    --snakefile $SNAKEFILE \
+    --jobs 100 \
+    --use-singularity \
+    --restart-times 1 \
+    --keep-going \
+    --cluster \
+        "qsub -S /bin/bash \
+        -q short.q \
+        -r yes \
+        -pe smp {threads} \
+        -l tmp_requested={resources.disk_per_thread_gb}G \
+        -l mem_requested={resources.mem_per_thread_gb}G \
+        -e $LOG \
+        -o $LOG \
+        -j y \
+        -V" \
+  > $LOG/nohup_`date +%Y-%m-%d.%H:%M:%S`.log &
+
+
+snakemake \
+  --snakefile $SNAKEFILE \
+  --dryrun \
+  --cores 1 \
+  --quiet \
+  --unlock
+
+
+snakemake \
+  --snakefile $SNAKEFILE \
+  --dryrun \
+  --cores 1 \
+  --quiet \
+  --cleanup-metadata \
+  /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/icc/ENSG00000116001_icc.rds \
+  /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/fit_models/ENSG00000116001_fitted_models.rds \
+  /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/icc/ENSG00000084112_icc.rds \
+  /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/fit_models/ENSG00000084112_fitted_models.rds \
+  /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/icc/ENSG00000130706_icc.rds \
+  /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/fit_models/ENSG00000130706_fitted_models.rds
+
+
+
+rm ENSG00000047617_icc.rds
+rm ENSG00000089250_icc.rds
+rm ENSG00000124602_icc.rds
+rm ENSG00000130052_icc.rds
+rm ENSG00000136099_icc.rds
+rm ENSG00000143412_icc.rds
+rm ENSG00000147124_icc.rds
+rm ENSG00000158104_icc.rds
+rm ENSG00000160188_icc.rds
+rm ENSG00000163393_icc.rds
+rm ENSG00000163947_icc.rds
+rm ENSG00000167083_icc.rds
+rm ENSG00000170113_icc.rds
+rm ENSG00000170961_icc.rds
+rm ENSG00000172748_icc.rds
+rm ENSG00000184916_icc.rds
+rm ENSG00000185386_icc.rds
+rm ENSG00000186141_icc.rds
+rm ENSG00000188536_icc.rds
+rm ENSG00000189164_icc.rds
+rm ENSG00000230606_icc.rds
+rm ENSG00000242687_icc.rds
+rm ENSG00000243444_icc.rds
+rm ENSG00000253438_icc.rds
+rm ENSG00000253731_icc.rds
+rm ENSG00000258813_icc.rds
+rm ENSG00000258944_icc.rds
+rm ENSG00000260528_icc.rds
+rm ENSG00000272030_icc.rds
+rm ENSG00000272973_icc.rds
+rm ENSG00000274367_icc.rds
+rm ENSG00000275580_icc.rds
+
+
+
+
+rm ENSG00000047617_fitted_models.rds
+rm ENSG00000089250_fitted_models.rds
+rm ENSG00000124602_fitted_models.rds
+rm ENSG00000130052_fitted_models.rds
+rm ENSG00000136099_fitted_models.rds
+rm ENSG00000143412_fitted_models.rds
+rm ENSG00000147124_fitted_models.rds
+rm ENSG00000158104_fitted_models.rds
+rm ENSG00000160188_fitted_models.rds
+rm ENSG00000163393_fitted_models.rds
+rm ENSG00000163947_fitted_models.rds
+rm ENSG00000167083_fitted_models.rds
+rm ENSG00000170113_fitted_models.rds
+rm ENSG00000170961_fitted_models.rds
+rm ENSG00000172748_fitted_models.rds
+rm ENSG00000184916_fitted_models.rds
+rm ENSG00000185386_fitted_models.rds
+rm ENSG00000186141_fitted_models.rds
+rm ENSG00000188536_fitted_models.rds
+rm ENSG00000189164_fitted_models.rds
+rm ENSG00000230606_fitted_models.rds
+rm ENSG00000242687_fitted_models.rds
+rm ENSG00000243444_fitted_models.rds
+rm ENSG00000253438_fitted_models.rds
+rm ENSG00000253731_fitted_models.rds
+rm ENSG00000258813_fitted_models.rds
+rm ENSG00000258944_fitted_models.rds
+rm ENSG00000260528_fitted_models.rds
+rm ENSG00000272030_fitted_models.rds
+rm ENSG00000272973_fitted_models.rds
+rm ENSG00000274367_fitted_models.rds
+rm ENSG00000275580_fitted_models.rds
diff --git a/multi-passage/preQC.R b/multi-passage/preQC.R
index 323d606..bd1859d 100644
--- a/multi-passage/preQC.R
+++ b/multi-passage/preQC.R
@@ -4,11 +4,13 @@ library(dplyr)
 library(tidyverse)
 library(colorspace)
 library(Nebulosa)
+library(RColorBrewer)
+library(scran)
 
 
 ##### Set up directories #####
 dir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/"
-datadir <- paste0(dir, "data/multi-passage/GIMR_GWCCG_211213_JOSPOW_10x_3p_results/GIMR_GWCCG_211213_JOSPOW_10x_3p/analyses/")
+datadir <- paste0(dir, "data/multi-passage/resequencing/220405/")
 outdir <- paste0(dir, "output/multi-passage/preQC/")
 
 
@@ -20,7 +22,7 @@ pools <- dir(datadir)
 ## Read in expression data
 counts_list <- lapply(pools, function(x){
     print(x)
-    Read10X(paste0(datadir,x, "/GE/",x,"/outs/per_sample_outs/",x,"/count/sample_feature_bc_matrix"), gene.column = 1)
+    Read10X(paste0(datadir,x, "/GE/summary/sample_feature_bc_matrix/"), gene.column = 1)
 })
 names(counts_list) <- pools
 
@@ -34,8 +36,10 @@ names(counts_list) <- pools
 
 
 ## Make seurat object ##
-seurat_list <- lapply(counts_list, function(x){
-    CreateSeuratObject(counts = x)
+seurat_list <- lapply(names(counts_list), function(x){
+    temp <- CreateSeuratObject(counts = counts_list[[x]])
+    temp@meta.data$Pool <- x
+    return(temp)
 })
 names(seurat_list) <- pools
 
@@ -65,6 +69,44 @@ RbGeneList <- RbGeneList[which(RbGeneList$ENSG %in% rownames(seurat)),]
 seurat <- PercentageFeatureSet(seurat, features = RbGeneList$ENSG, col.name = "percent.rb")
 
 
+## Add Demultiplexing Results ##
+demultiplexing_list <- lapply(pools, function(pool){
+    temp <- data.frame(fread(paste0(dir, "output/multi-passage/demultiplexed/with_demuxlet/updated_2022_06_26/atleasthalf_singlet/", pool, "/atleasthalf_singlet_w_combined_assignments.tsv")))
+    rownames(temp) <- gsub("-1", "", paste0(pool, "_", temp$Barcode))
+    return(temp)
+})
+
+demultiplexing_dt <- do.call(rbind,demultiplexing_list)
+
+
+seurat <- AddMetaData(seurat, demultiplexing_dt)
+
+
+print("Starting cell cycle step")
+hs.pairs <- readRDS(system.file("exdata", "human_cycle_markers.rds", package="scran"))
+
+countsENSG <- seurat[["RNA"]]@counts # Create a counts matrix that has the ENSG gene clasifiers so can determine cell cycle phase
+print("The size of the counts object in genes x cells is:")
+print(dim(countsENSG))
+
+### Run the cell cycle identification ###
+print("Starting cell cycle determination")
+assigned <- cyclone(countsENSG, pairs=hs.pairs)  #Note, this takes hours
+table(assigned$phases)
+write.table(assigned, file = paste0(outdir,"CellCycleProportions.txt"), quote = F, sep = "\t") #Save so that can read in and don't have to wait to recompute again
+
+
+assigned <- read.table(paste0(outdir,"CellCycleProportions.txt"), sep = "\t")
+assigned <- as.data.frame(assigned)
+rownames(assigned) <- colnames(seurat)
+write.table(assigned, file = paste0(outdir,"CellCycleProportions.txt"), quote = F, sep = "\t") #Save so that can read in and don't have to wait to recompute again
+assigned <- read.table(paste0(outdir,"CellCycleProportions.txt"), sep = "\t")
+
+seurat <- AddMetaData(seurat, assigned)
+saveRDS(seurat, paste0(outdir,"seurat_all_cells_cell_cycle.rds"))
+
+
+
 ### Make pre-QC figures ###
 seurat <- NormalizeData(seurat, verbose = TRUE)
 seurat <- FindVariableFeatures(seurat, selection.method = "mean.var.plot")
@@ -75,45 +117,333 @@ seurat <- FindClusters(seurat, resolution = 0.5)
 seurat <- RunUMAP(seurat, dims = 1:10)
 
 
-seurat$Pool <- gsub("_[ATCG]+", "", colnames(seurat))
-
 
 ### QC Figures pre filtering ###
 plot_mt_pct <- VlnPlot(seurat, features = c( "percent.mt"), group.by = "Pool", pt.size = 0) +
-                    scale_fill_discrete_sequential(palette = "SunsetDark")
+                    scale_fill_discrete_sequential(palette = "SunsetDark") +
+                    theme(plot.background = element_rect(fill = "white"))
 ggsave(plot_mt_pct, filename = paste0(outdir,"Mt_pct_vln.png"))
 
 plot_rb_pct <- VlnPlot(seurat, features = c( "percent.rb"), group.by = "Pool", pt.size = 0) +
-                    scale_fill_discrete_sequential(palette = "SunsetDark")
+                    scale_fill_discrete_sequential(palette = "SunsetDark") +
+                    theme(plot.background = element_rect(fill = "white"))
 ggsave(plot_rb_pct, filename = paste0(outdir,"Rb_pct_vln.png"))
 
 plot_n_count <- VlnPlot(seurat, features = c( "nCount_RNA"), group.by = "Pool", pt.size = 0) +
-                    scale_fill_discrete_sequential(palette = "SunsetDark")
+                    scale_fill_discrete_sequential(palette = "SunsetDark") +
+                    theme(plot.background = element_rect(fill = "white"))
 ggsave(plot_n_count, filename = paste0(outdir,"N_count_vln.png"))
 
 plot_nFeature_RNA <- VlnPlot(seurat, features = c( "nFeature_RNA"), group.by = "Pool", pt.size = 0) +
-                    scale_fill_discrete_sequential(palette = "SunsetDark")
+                    scale_fill_discrete_sequential(palette = "SunsetDark") +
+                    theme(plot.background = element_rect(fill = "white"))
 ggsave(plot_nFeature_RNA, filename = paste0(outdir,"nFeature_RNA_vln.png"))
 
 lib_mt <- FeatureScatter(seurat, feature1 = "nCount_RNA", feature2 = "percent.mt", group.by = "Pool") +
-                    scale_color_discrete_sequential(palette = "SunsetDark", alpha = 0.25)
+                    scale_color_discrete_sequential(palette = "SunsetDark", alpha = 0.25) +
+                    theme(plot.background = element_rect(fill = "white"))
 ggsave(lib_mt, filename = paste0(outdir,"lib_mt.png"))
 
 lib_genes <- FeatureScatter(seurat, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", group.by = "Pool") +
-                    scale_color_discrete_sequential(palette = "SunsetDark", alpha = 0.25)
+                    scale_color_discrete_sequential(palette = "SunsetDark", alpha = 0.25) +
+                    theme(plot.background = element_rect(fill = "white"))
 ggsave(lib_genes, filename = paste0(outdir,"lib_genes.png"))
 
-nebulosa_mt_umap <- plot_density(seurat, "percent.mt", pal = "plasma")
+nebulosa_mt_umap <- plot_density(seurat, "percent.mt", pal = "plasma") +
+                    theme(plot.background = element_rect(fill = "white"))
 ggsave(nebulosa_mt_umap, filename = paste0(outdir,"mt_percent_umap.png"))
 
-nebulosa_rb_umap <- plot_density(seurat, "percent.rb", pal = "plasma")
+nebulosa_rb_umap <- plot_density(seurat, "percent.rb", pal = "plasma") +
+                    theme(plot.background = element_rect(fill = "white"))
 ggsave(nebulosa_rb_umap, filename = paste0(outdir,"rb_percent_umap.png"))
 
 umap_Pool <- DimPlot(seurat, group.by = "Pool") +
-                    scale_color_discrete_sequential(palette = "SunsetDark")
+                    scale_color_discrete_sequential(palette = "SunsetDark") +
+                    theme(plot.background = element_rect(fill = "white"))
 ggsave(umap_Pool, filename = paste0(outdir,"pool_umap.png"))
 
 mt_umap <- FeaturePlot(seurat, features = "percent.mt") +
-                    scale_color_continuous_sequential(palette = "RedPurple")
+                    scale_color_continuous_sequential(palette = "RedPurple") +
+                    theme(plot.background = element_rect(fill = "white"))
 ggsave(mt_umap, filename = paste0(outdir,"mt_percent_umap_seurat.png"))
 
+colourCount = length(unique(seurat@meta.data$AtLeastHalfSinglet_Individual_Assignment))
+getPalette = colorRampPalette(brewer.pal(9, "Set1"))
+
+umap_individual <- DimPlot(seurat, group.by = "AtLeastHalfSinglet_Individual_Assignment") +
+                    scale_color_manual(values = getPalette(colourCount)) +
+                    theme(plot.background = element_rect(fill = "white"))
+ggsave(umap_individual, filename = paste0(outdir,"pool_individual.png"))
+
+umap_droplet_type <- DimPlot(seurat, group.by = "AtLeastHalfSinglet_DropletType") +
+                    scale_color_discrete_sequential(palette = "Red-Blue", rev = FALSE) +
+                    theme(plot.background = element_rect(fill = "white"))
+ggsave(umap_droplet_type, filename = paste0(outdir,"pool_droplet_type.png"))
+
+umap_phases <- DimPlot(seurat, group.by = "phases") +
+                    scale_color_discrete_sequential(palette = "Sunset", rev = FALSE) +
+                    theme(plot.background = element_rect(fill = "white"))
+ggsave(umap_phases, filename = paste0(outdir,"cell_cycle_type.png"))
+
+
+
+##### Subset seurat object by cells that have no info for cell assignments (removed by dropletQC) #####
+seurat_sub <- subset(seurat, subset = AtLeastHalfSinglet_Individual_Assignment != "doublet")
+
+
+
+### Make pre-QC figures ###
+seurat_sub <- NormalizeData(seurat_sub, verbose = TRUE)
+seurat_sub <- FindVariableFeatures(seurat_sub, selection.method = "mean.var.plot")
+seurat_sub <- ScaleData(seurat_sub, features = VariableFeatures(seurat_sub))
+seurat_sub <- RunPCA(seurat_sub, features = VariableFeatures(object = seurat_sub))
+seurat_sub <- FindNeighbors(seurat_sub, dims = 1:10)
+seurat_sub <- FindClusters(seurat_sub, resolution = 0.5)
+seurat_sub <- RunUMAP(seurat_sub, dims = 1:10)
+
+
+
+### QC Figures pre filtering ###
+plot_mt_pct <- VlnPlot(seurat_sub, features = c( "percent.mt"), group.by = "Pool", pt.size = 0) +
+                    scale_fill_discrete_sequential(palette = "SunsetDark") +
+                    theme(plot.background = element_rect(fill = "white"))
+ggsave(plot_mt_pct, filename = paste0(outdir,"Mt_pct_vln_singlets.png"))
+
+plot_rb_pct <- VlnPlot(seurat_sub, features = c( "percent.rb"), group.by = "Pool", pt.size = 0) +
+                    scale_fill_discrete_sequential(palette = "SunsetDark") +
+                    theme(plot.background = element_rect(fill = "white"))
+ggsave(plot_rb_pct, filename = paste0(outdir,"Rb_pct_vln_singlets.png"))
+
+plot_n_count <- VlnPlot(seurat_sub, features = c( "nCount_RNA"), group.by = "Pool", pt.size = 0) +
+                    scale_fill_discrete_sequential(palette = "SunsetDark") +
+                    theme(plot.background = element_rect(fill = "white"))
+ggsave(plot_n_count, filename = paste0(outdir,"N_count_vln_singlets.png"))
+
+plot_nFeature_RNA <- VlnPlot(seurat_sub, features = c( "nFeature_RNA"), group.by = "Pool", pt.size = 0) +
+                    scale_fill_discrete_sequential(palette = "SunsetDark") +
+                    theme(plot.background = element_rect(fill = "white"))
+ggsave(plot_nFeature_RNA, filename = paste0(outdir,"nFeature_RNA_vln_singlets.png"))
+
+lib_mt <- FeatureScatter(seurat_sub, feature1 = "nCount_RNA", feature2 = "percent.mt", group.by = "Pool") +
+                    scale_color_discrete_sequential(palette = "SunsetDark", alpha = 0.25) +
+                    theme(plot.background = element_rect(fill = "white"))
+ggsave(lib_mt, filename = paste0(outdir,"lib_mt_singlets.png"))
+
+lib_genes <- FeatureScatter(seurat_sub, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", group.by = "Pool") +
+                    scale_color_discrete_sequential(palette = "SunsetDark", alpha = 0.25) +
+                    theme(plot.background = element_rect(fill = "white"))
+ggsave(lib_genes, filename = paste0(outdir,"lib_genes_singlets.png"))
+
+nebulosa_mt_umap <- plot_density(seurat_sub, "percent.mt", pal = "plasma") +
+                    theme(plot.background = element_rect(fill = "white"))
+ggsave(nebulosa_mt_umap, filename = paste0(outdir,"mt_percent_umap_singlets.png"))
+
+nebulosa_rb_umap <- plot_density(seurat_sub, "percent.rb", pal = "plasma") +
+                    theme(plot.background = element_rect(fill = "white"))
+ggsave(nebulosa_rb_umap, filename = paste0(outdir,"rb_percent_umap_singlets.png"))
+
+umap_Pool <- DimPlot(seurat_sub, group.by = "Pool") +
+                    scale_color_discrete_sequential(palette = "SunsetDark") +
+                    theme(plot.background = element_rect(fill = "white"))
+ggsave(umap_Pool, filename = paste0(outdir,"pool_umap_singlets.png"))
+
+mt_umap <- FeaturePlot(seurat_sub, features = "percent.mt") +
+                    scale_color_continuous_sequential(palette = "RedPurple") +
+                    theme(plot.background = element_rect(fill = "white"))
+ggsave(mt_umap, filename = paste0(outdir,"mt_percent_umap_seurat_singlets.png"))
+
+colourCount = length(unique(seurat_sub@meta.data$AtLeastHalfSinglet_Individual_Assignment))
+getPalette = colorRampPalette(brewer.pal(9, "Set1"))
+
+umap_individual <- DimPlot(seurat_sub, group.by = "AtLeastHalfSinglet_Individual_Assignment") +
+                    scale_color_manual(values = getPalette(colourCount)) +
+                    theme(plot.background = element_rect(fill = "white"))
+ggsave(umap_individual, filename = paste0(outdir,"pool_individual_singlets.png"))
+
+umap_droplet_type <- DimPlot(seurat_sub, group.by = "AtLeastHalfSinglet_DropletType") +
+                    scale_color_discrete_sequential(palette = "Red-Blue", rev = FALSE) +
+                    theme(plot.background = element_rect(fill = "white"))
+ggsave(umap_droplet_type, filename = paste0(outdir,"pool_droplet_type_singlets.png"))
+
+umap_phases <- DimPlot(seurat_sub, group.by = "phases") +
+                    scale_color_discrete_sequential(palette = "Sunset", rev = FALSE) +
+                    theme(plot.background = element_rect(fill = "white"))
+ggsave(umap_phases, filename = paste0(outdir,"cell_cycle_type_singlets.png"))
+
+
+
+
+
+##### Remove high mitochondrial % cells #####
+seurat_sub_mt <- subset(seurat_sub, subset = percent.mt < 25)
+
+
+### Integrate different time-points
+seurat_sub_mt_list <- SplitObject(seurat_sub_mt, split.by = "Pool")
+seurat_sub_mt_list <- lapply(X = seurat_sub_mt_list, function(x) SCTransform(x, vars.to.regress = c("scores.G1", "scores.S", "scores.G2M", "percent.mt", "percent.rb"), return.only.var.genes = FALSE))
+features <- SelectIntegrationFeatures(object.list = seurat_sub_mt_list, nfeatures = 3000)
+seurat_sub_mt_list <- PrepSCTIntegration(object.list = seurat_sub_mt_list, anchor.features = features)
+
+saveRDS(seurat_sub_mt_list, paste0(outdir,"seurat_sub_mt_list.rds"))
+seurat_sub_mt_list <- readRDS(paste0(outdir,"seurat_sub_mt_list.rds"))
+
+seurat_sub_mt_anchors <- FindIntegrationAnchors(object.list = seurat_sub_mt_list, normalization.method = "SCT",
+    anchor.features = features)
+combined_sct <- IntegrateData(anchorset = seurat_sub_mt_anchors, normalization.method = "SCT")
+
+combined_sct <- RunPCA(combined_sct, verbose = FALSE)
+combined_sct <- RunUMAP(combined_sct, reduction = "pca", dims = 1:30)
+
+
+### QC Figures  ###
+plot_mt_pct <- VlnPlot(combined_sct, features = c( "percent.mt"), group.by = "Pool", pt.size = 0) +
+                    scale_fill_discrete_sequential(palette = "SunsetDark") +
+                    theme(plot.background = element_rect(fill = "white"))
+ggsave(plot_mt_pct, filename = paste0(outdir,"Mt_pct_vln_singlets_filtered.png"))
+
+plot_rb_pct <- VlnPlot(combined_sct, features = c( "percent.rb"), group.by = "Pool", pt.size = 0) +
+                    scale_fill_discrete_sequential(palette = "SunsetDark") +
+                    theme(plot.background = element_rect(fill = "white"))
+ggsave(plot_rb_pct, filename = paste0(outdir,"Rb_pct_vln_singlets_filtered.png"))
+
+plot_n_count <- VlnPlot(combined_sct, features = c( "nCount_RNA"), group.by = "Pool", pt.size = 0) +
+                    scale_fill_discrete_sequential(palette = "SunsetDark") +
+                    theme(plot.background = element_rect(fill = "white"))
+ggsave(plot_n_count, filename = paste0(outdir,"N_count_vln_singlets_filtered.png"))
+
+plot_nFeature_RNA <- VlnPlot(combined_sct, features = c( "nFeature_RNA"), group.by = "Pool", pt.size = 0) +
+                    scale_fill_discrete_sequential(palette = "SunsetDark") +
+                    theme(plot.background = element_rect(fill = "white"))
+ggsave(plot_nFeature_RNA, filename = paste0(outdir,"nFeature_RNA_vln_singlets_filtered.png"))
+
+lib_mt <- FeatureScatter(combined_sct, feature1 = "nCount_RNA", feature2 = "percent.mt", group.by = "Pool") +
+                    scale_color_discrete_sequential(palette = "SunsetDark", alpha = 0.25) +
+                    theme(plot.background = element_rect(fill = "white"))
+ggsave(lib_mt, filename = paste0(outdir,"lib_mt_singlets_filtered.png"))
+
+lib_genes <- FeatureScatter(combined_sct, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", group.by = "Pool") +
+                    scale_color_discrete_sequential(palette = "SunsetDark", alpha = 0.25) +
+                    theme(plot.background = element_rect(fill = "white"))
+ggsave(lib_genes, filename = paste0(outdir,"lib_genes_singlets_filtered.png"))
+
+nebulosa_mt_umap <- plot_density(combined_sct, "percent.mt", pal = "plasma") +
+                    theme(plot.background = element_rect(fill = "white"))
+ggsave(nebulosa_mt_umap, filename = paste0(outdir,"mt_percent_umap_singlets_filtered.png"))
+
+nebulosa_rb_umap <- plot_density(combined_sct, "percent.rb", pal = "plasma") +
+                    theme(plot.background = element_rect(fill = "white"))
+ggsave(nebulosa_rb_umap, filename = paste0(outdir,"rb_percent_umap_singlets_filtered.png"))
+
+umap_Pool <- DimPlot(combined_sct, group.by = "Pool") +
+                    scale_color_discrete_sequential(palette = "SunsetDark") +
+                    theme(plot.background = element_rect(fill = "white"))
+ggsave(umap_Pool, filename = paste0(outdir,"pool_umap_singlets_filtered.png"))
+
+mt_umap <- FeaturePlot(combined_sct, features = "percent.mt") +
+                    scale_color_continuous_sequential(palette = "RedPurple") +
+                    theme(plot.background = element_rect(fill = "white"))
+ggsave(mt_umap, filename = paste0(outdir,"mt_percent_umap_seurat_singlets_filtered.png"))
+
+colourCount = length(unique(combined_sct@meta.data$AtLeastHalfSinglet_Individual_Assignment))
+getPalette = colorRampPalette(brewer.pal(9, "Set1"))
+
+umap_individual <- DimPlot(combined_sct, group.by = "AtLeastHalfSinglet_Individual_Assignment") +
+                    scale_color_manual(values = getPalette(colourCount)) +
+                    theme(plot.background = element_rect(fill = "white"))
+ggsave(umap_individual, filename = paste0(outdir,"pool_individual_singlets_filtered.png"))
+
+umap_droplet_type <- DimPlot(combined_sct, group.by = "AtLeastHalfSinglet_DropletType") +
+                    scale_color_discrete_sequential(palette = "Red-Blue", rev = FALSE) +
+                    theme(plot.background = element_rect(fill = "white"))
+ggsave(umap_droplet_type, filename = paste0(outdir,"pool_droplet_type_singlets_filtered.png"))
+
+umap_phases <- DimPlot(combined_sct, group.by = "phases") +
+                    scale_color_discrete_sequential(palette = "Sunset", rev = FALSE) +
+                    theme(plot.background = element_rect(fill = "white"))
+ggsave(umap_phases, filename = paste0(outdir,"cell_cycle_type_singlets_filtered.png"))
+
+
+### Update individual IDs to match what used for Nona's plots ###
+combined_sct$Assignment <- gsub("^0_", "", combined_sct$AtLeastHalfSinglet_Individual_Assignment) %>%
+                                gsub("D-", "", .) %>%
+                                    gsub("\\.\\d\\.", "", .) %>%
+                                        gsub("N-", "", .) %>%
+                                            gsub("-P36", "", .)  %>%
+                                                gsub("-", "", .)
+
+seurat_sub$Assignment <- gsub("^0_", "", seurat_sub$AtLeastHalfSinglet_Individual_Assignment) %>%
+                                gsub("D-", "", .) %>%
+                                    gsub("\\.\\d\\.", "", .) %>%
+                                        gsub("N-", "", .) %>%
+                                            gsub("-P36", "", .)  %>%
+                                                gsub("-", "", .)
+
+
+##### Make proportion plots (area plot) #####
+village_summary <- data.table(prop.table(table(seurat_sub@meta.data[,c("Assignment", "Pool")]), margin = 2))
+village_summary$Assignment <- factor(village_summary$Assignment, levels = rev(village_summary[Pool == "Village_P8"]$Assignment[order(village_summary[Pool == "Village_P8"]$N)]))
+
+colors_original <- readRDS("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Nona_multiome/line_colors")
+colors_original <- colors_original[levels(village_summary$Assignment)]
+
+
+p_stacked_area <- ggplot(village_summary, aes(x = as.numeric(as.character(gsub("Village_P","", Pool))), y = N, fill = factor(Assignment), group = Assignment)) +
+    geom_area(alpha=0.6 , size=0.5, colour="black") +
+    theme_classic() +
+    scale_fill_manual(values = colors_original) +
+    xlab("Passage") +
+    ylab("Proportion of Cells")
+ggsave(p_stacked_area, filename = paste0(outdir,"stacked_area.png"), width = 7, height = 4)
+ggsave(p_stacked_area, filename = paste0(outdir,"stacked_area.pdf"), width = 7, height = 4)
+
+
+village_summary_singlets <- data.table(prop.table(table(combined_sct@meta.data[,c("Assignment", "Pool")]), margin = 2))
+village_summary_singlets$Assignment <- factor(village_summary_singlets$Assignment, levels = rev(village_summary_singlets[Pool == "Village_P8"]$Assignment[order(village_summary_singlets[Pool == "Village_P8"]$N)]))
+
+colors <- readRDS("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Nona_multiome/line_colors")
+colors <- colors[levels(village_summary_singlets$Assignment)]
+
+
+p_stacked_area <- ggplot(village_summary_singlets, aes(x = as.numeric(as.character(gsub("Village_P","", Pool))), y = N, fill = factor(Assignment), group = Assignment)) +
+    geom_area(alpha=0.6 , size=0.5, colour="black") +
+    theme_classic() +
+    scale_fill_manual(values = colors) +
+    xlab("Passage") +
+    ylab("Proportion of Cells")
+ggsave(p_stacked_area, filename = paste0(outdir,"stacked_area_filtered.png"), width = 7, height = 4)
+ggsave(p_stacked_area, filename = paste0(outdir,"stacked_area_filtered.pdf"), width = 7, height = 4)
+
+
+
+
+
+
+
+saveRDS(combined_sct, paste0(outdir,"time-integrated_filtered_seurat.rds"))
+combined_sct <- readRDS(paste0(outdir,"time-integrated_filtered_seurat.rds"))
+
+
+### Add number of cells at each time into metadata and 1/n for models
+freq_dt <- as.data.frame(table(combined_sct$Assignment, combined_sct$Pool))
+freq_dt$Ncov <- ifelse(freq_dt$Freq == 0, NA, 1/freq_dt$Freq)
+
+meta <- left_join(combined_sct@meta.data, freq_dt, by = c("Assignment" = "Var1", "Pool" = "Var2"))
+rownames(meta) <- rownames(combined_sct@meta.data)
+
+combined_sct_meta <- AddMetaData(combined_sct, meta)
+
+
+### SCT the dataset
+feats <- rownames(combined_sct_meta[["SCT"]]@counts)[which((rowSums(combined_sct_meta[["SCT"]]@counts > 0)/ncol(combined_sct_meta[["SCT"]]@counts)) >= 0.01)]
+combined_sct_filt <- subset(combined_sct_meta, features = feats)
+
+saveRDS(combined_sct_filt, paste0(outdir, "time-integrated_filtered_seurat_1pct_expressing.rds"))
+
+
+DefaultAssay(combined_sct_filt) <- "RNA"
+
+combined_sct_filt <- SCTransform(combined_sct_filt, verbose = TRUE, vars.to.regress = c("scores.G1", "scores.S", "scores.G2M", "percent.mt", "percent.rb"), return.only.var.genes = FALSE)
+
+saveRDS(combined_sct_filt, paste0(outdir, "seurat_joint_SCT_1pct_expressing.rds"))
+
+fwrite(data.table(Gene = rownames(combined_sct_filt)), paste0(outdir, "genes_1pct_expressing.tsv"), sep = "\t")
diff --git a/multi-passage/test_vireo_ambient/vireo.sh b/multi-passage/test_vireo_ambient/vireo.sh
new file mode 100644
index 0000000..4743d99
--- /dev/null
+++ b/multi-passage/test_vireo_ambient/vireo.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+
+
+
+singularity exec --bind /directflow $SIF vireo -c $VIREO_INDIR -d $VIREO_INDIR/donor_subset.vcf -o $VIREO_OUTDIR -t $FIELD --callAmbientRNAs
\ No newline at end of file
diff --git a/multi-passage/test_vireo_ambient/vireo_submit.sh b/multi-passage/test_vireo_ambient/vireo_submit.sh
new file mode 100644
index 0000000..8861288
--- /dev/null
+++ b/multi-passage/test_vireo_ambient/vireo_submit.sh
@@ -0,0 +1,41 @@
+
+
+DIR=/directflow/SCCGGroupShare/projects/himaro/demuxafy/demultiplex_scRNA-seq/log_dir/vireo_new_barcode
+SIF=/directflow/SCCGGroupShare/projects/himaro/demuxafy/image/Demuxafy.sif
+PIPELINE="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/multi-passage/test_vireo_ambient/vireo.sh"
+
+THREADS=4
+N=18
+
+
+for pool in `ls /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/data/multi-passage/resequencing/220405/`
+do
+
+    FIELD="GP"
+
+
+    VIREO_INDIR=/directflow/SCCGGroupShare/projects/himaro/demuxafy/demultiplex_scRNA-seq/log_new/vireo_new_barcode/$pool
+    VIREO_OUTDIR=/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/vireo_test/$pool
+    LOG=$VIREO_OUTDIR/logs
+    mkdir -p $LOG
+
+
+
+    qsub -S /bin/bash \
+        -cwd \
+        -pe smp 4 \
+        -N vireo \
+        -q short.q \
+        -l mem_requested=8G \
+        -l tmp_requested=8G \
+        -e $LOG \
+        -o $LOG \
+        -r yes \
+        -j y \
+        -M d.neavin@garvan.org.au \
+        -v DIR=$DIR,SIF=$SIF,VIREO_INDIR=$VIREO_INDIR,VIREO_OUTDIR=$VIREO_OUTDIR,FIELD=$FIELD \
+        -V \
+        -C '' $PIPELINE
+
+
+done
\ No newline at end of file
diff --git a/multi-passage/transfer_data.sh b/multi-passage/transfer_data.sh
new file mode 100644
index 0000000..36f08a6
--- /dev/null
+++ b/multi-passage/transfer_data.sh
@@ -0,0 +1,45 @@
+
+
+INDIR=/directflow/SCCGGroupShare/projects/himaro/demuxafy/demultiplex_scRNA-seq/log_new/combined_results/AtLeastHalfSinglet
+OUTDIR=/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/demultiplexed
+
+mkdir -p $OUTDIR
+
+# for VIL in `ls $INDIR`
+# do
+#     cp -R $VIL $OUTDIR
+# done
+
+### Asked Himanshi to redo the overlap with demuxlet included since it allowed us to identify additionao 
+mkdir -p $OUTDIR/with_demuxlet
+
+for VIL in `ls $INDIR`
+do
+    cp -R $INDIR/$VIL $OUTDIR/with_demuxlet
+done
+
+
+
+conda activate baseR402
+
+for village in `ls /directflow/SCCGGroupShare/projects/himaro/demuxafy/demultiplex_scRNA-seq/log_new/demuxlet_updated_barcodes/`
+do
+    mkdir -p /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/demultiplexed/with_demuxlet/updated_2022_06_26/atleasthalf_singlet/$village
+    mkdir -p /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/demultiplexed/with_demuxlet/updated_2022_06_26/majority_singlet/$village
+
+    Rscript /directflow/SCCGGroupShare/projects/DrewNeavin/Demultiplex_Benchmark/Demultiplexing_Doublet_Detecting_Docs/scripts/Combine_Results.R \
+        -o /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/demultiplexed/with_demuxlet/updated_2022_06_26/majority_singlet/$village/majority_singlet_new.tsv \
+        -d /directflow/SCCGGroupShare/projects/himaro/demuxafy/demultiplex_scRNA-seq/log_new/demuxlet_updated_barcodes/$village \
+        -f /directflow/SCCGGroupShare/projects/himaro/demuxafy/demultiplex_scRNA-seq/log_new/freemuxlet_new_barcode/$village \
+        -u /directflow/SCCGGroupShare/projects/himaro/demuxafy/demultiplex_scRNA-seq/log_new/souporcell_new_barcode/$village \
+        -v /directflow/SCCGGroupShare/projects/himaro/demuxafy/demultiplex_scRNA-seq/log_new/vireo_new_barcode/$village \
+        --method "MajoritySinglet"
+
+        Rscript /directflow/SCCGGroupShare/projects/DrewNeavin/Demultiplex_Benchmark/Demultiplexing_Doublet_Detecting_Docs/scripts/Combine_Results.R \
+        -o /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/demultiplexed/with_demuxlet/updated_2022_06_26/atleasthalf_singlet/$village/atleasthalf_singlet_new.tsv \
+        -d /directflow/SCCGGroupShare/projects/himaro/demuxafy/demultiplex_scRNA-seq/log_new/demuxlet_updated_barcodes/$village \
+        -f /directflow/SCCGGroupShare/projects/himaro/demuxafy/demultiplex_scRNA-seq/log_new/freemuxlet_new_barcode/$village \
+        -u /directflow/SCCGGroupShare/projects/himaro/demuxafy/demultiplex_scRNA-seq/log_new/souporcell_new_barcode/$village \
+        -v /directflow/SCCGGroupShare/projects/himaro/demuxafy/demultiplex_scRNA-seq/log_new/vireo_new_barcode/$village \
+        --method "AtLeastHalfSinglet"
+done
\ No newline at end of file
diff --git a/scCODA/sccod.py b/scCODA/sccod.py
new file mode 100644
index 0000000..6ddc350
--- /dev/null
+++ b/scCODA/sccod.py
@@ -0,0 +1,354 @@
+# Setup
+import importlib
+import warnings
+warnings.filterwarnings("ignore")
+
+import pandas as pd
+import pickle as pkl
+import matplotlib.pyplot as plt
+
+from sccoda.util import comp_ana as mod
+from sccoda.util import cell_composition_data as dat
+from sccoda.util import data_visualization as viz
+
+import sccoda.datasets as scd
+
+
+
+### Set up directories ###
+dir = '/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Expression_Boxplots/pluri_degs/'
+outdir = '/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/scCODA/'
+
+
+# Load data
+cell_counts = pd.read_csv(outdir + 'cell_line_numbers.tsv', sep = '\t')
+
+print(cell_counts)
+
+
+# Convert data to anndata object
+data_all = dat.from_pandas(cell_counts, covariate_columns=["Group"])
+
+print(data_all)
+
+
+# Extract condition from mouse name and add it as an extra column to the covariates
+data_all.obs["Location"] = data_all.obs["Group"].str.replace(r"_.+", "", regex=True)
+data_all.obs["Village"] = data_all.obs["Group"].str.replace(r"Brisbane_", "", regex=True).str.replace(r"Melbourne_", "", regex=True).str.replace(r"Sydney_", "", regex=True).str.replace(r"_Replicate[0-9]_.+", "", regex=True)
+data_all.obs["Replicate"] = data_all.obs["Group"].str.replace(r".+_Replicate", "Replicate", regex=True).str.replace("_.+", "", regex=True)
+data_all.obs["Cryopreservation"] = data_all.obs["Group"].str.replace(r".+_", "", regex=True)
+
+
+# Uni-culture vs village
+## Brisbane
+data_brisbane = data_all[data_all.obs["Location"].isin(["Brisbane"])]
+viz.boxplots(data_brisbane, feature_name="Village")
+plt.savefig(outdir + 'brisbane_numbers.png')
+
+model_brisbane = mod.CompositionalAnalysis(data_brisbane, formula="Village", reference_cell_type="automatic")
+brisbane_sim_results = model_brisbane.sample_hmc()
+
+brisbane_sim_results.summary()
+print(brisbane_sim_results.credible_effects())
+
+
+# Run scCODA with each cell type as the reference
+cell_types_brisbane = data_brisbane.var.index
+results_cycle_brisbane = pd.DataFrame(index=cell_types_brisbane, columns=["times_credible"]).fillna(0)
+
+models_brisbane = []
+results_brisbane = []
+
+for ct in cell_types_brisbane:
+    print(f"Reference: {ct}")
+    # Run inference
+    model_temp = mod.CompositionalAnalysis(data_brisbane, formula="Village", reference_cell_type=ct)
+    models_brisbane.append(model_temp)
+    results_temp = model_temp.sample_hmc(num_results=20000)
+    results_brisbane.append(results_temp)
+    # Select credible effects
+    cred_eff = results_temp.credible_effects()
+    cred_eff.index = cred_eff.index.droplevel(level=0)
+    # add up credible effects
+    results_cycle_brisbane["times_credible"] += cred_eff.astype("int")
+
+# Calculate percentages
+results_cycle_brisbane["pct_credible"] = results_cycle_brisbane["times_credible"]/len(cell_types_brisbane)
+results_cycle_brisbane["is_credible"] = results_cycle_brisbane["pct_credible"] > 0.5
+print(results_cycle_brisbane)
+
+# Get average difference
+### couldn't work out how to pull from summary in python, will do manually - angry emoji
+results_brisbane[0].summary()
+# Effects:
+#                               Final Parameter  Expected Sample  log2-fold change
+# Covariate          Cell Type                                                    
+# Village[T.Village] FSA0006           0.000000      1735.548669          0.855849
+#                    MBE1006          -0.665075      1502.677050         -0.103652
+#                    TOB0421          -1.146676       899.107614         -0.798455
+
+results_brisbane[1].summary()
+# Effects:
+#                               Final Parameter  Expected Sample  log2-fold change
+# Covariate          Cell Type                                                    
+# Village[T.Village] FSA0006           0.672738      1738.916835          0.863100
+#                    MBE1006           0.000000      1501.590528         -0.107455
+#                    TOB0421          -0.481418       896.825971         -0.801995
+
+results_brisbane[2].summary()
+# Effects:
+#                               Final Parameter  Expected Sample  log2-fold change
+# Covariate          Cell Type                                                    
+# Village[T.Village] FSA0006           1.155125      1739.197970          0.864988
+#                    MBE1006           0.480325      1500.238853         -0.108542
+#                    TOB0421           0.000000       897.896510         -0.801505
+
+
+
+brisbane_logfc_results_dt = pd.DataFrame({'Cell Type': ['FSA0006', 'MBE1006', 'TOB0421'],
+        'FSA0006_ref': [0.855849, -0.103652, -0.798455],
+        'MBE1006_ref': [0.863100, -0.107455, -0.801995],
+        'TOB0421_ref': [0.864988, -0.108542, -0.801505]})
+
+brisbane_logfc_results_dt['mean'] = brisbane_logfc_results_dt.mean(axis=1)
+
+brisbane_logfc_results_dt['is_credible'] = [True, True, True]
+
+brisbane_logfc_results_dt.to_csv(outdir + "brisbane_logfc_results.tsv", sep = "\t")
+
+
+
+
+## Melbourne
+data_melbourne = data_all[data_all.obs["Location"].isin(["Melbourne"])]
+viz.boxplots(data_melbourne, feature_name="Village")
+plt.savefig(outdir + 'melbourne_numbers.png')
+
+model_melbourne = mod.CompositionalAnalysis(data_melbourne, formula="Village", reference_cell_type="automatic")
+melbourne_sim_results = model_melbourne.sample_hmc()
+
+melbourne_sim_results.summary()
+print(melbourne_sim_results.credible_effects())
+
+
+# Run scCODA with each cell type as the reference
+cell_types_melbourne = data_melbourne.var.index
+results_cycle_melbourne = pd.DataFrame(index=cell_types_melbourne, columns=["times_credible"]).fillna(0)
+
+models_melbourne = []
+results_melbourne = []
+
+for ct in cell_types_melbourne:
+    print(f"Reference: {ct}")
+    # Run inference
+    model_temp = mod.CompositionalAnalysis(data_melbourne, formula="Village", reference_cell_type=ct)
+    models_melbourne.append(model_temp)
+    results_temp = model_temp.sample_hmc(num_results=20000)
+    results_melbourne.append(results_temp)
+    # Select credible effects
+    cred_eff = results_temp.credible_effects()
+    cred_eff.index = cred_eff.index.droplevel(level=0)
+    # add up credible effects
+    results_cycle_melbourne["times_credible"] += cred_eff.astype("int")
+
+# Calculate percentages
+results_cycle_melbourne["pct_credible"] = results_cycle_melbourne["times_credible"]/len(cell_types_melbourne)
+results_cycle_melbourne["is_credible"] = results_cycle_melbourne["pct_credible"] > 0.5
+print(results_cycle_melbourne)
+
+
+# Get average difference
+### couldn't work out how to pull from summary in python, will do manually - angry emoji
+results_melbourne[0].summary()
+# Effects:
+#                               Final Parameter  Expected Sample  log2-fold change
+# Covariate          Cell Type                                                    
+# Village[T.Village] FSA0006           0.000000      3471.687790          1.147833
+#                    MBE1006          -1.930117       648.252082         -1.636737
+#                    TOB0421          -1.203827      1065.893461         -0.588923
+
+results_melbourne[1].summary()
+# Effects:
+#                               Final Parameter  Expected Sample  log2-fold change
+# Covariate          Cell Type                                                    
+# Village[T.Village] FSA0006           1.932333      3478.980493          1.152990
+#                    MBE1006           0.000000       650.122718         -1.634778
+#                    TOB0421           0.716773      1056.730123         -0.600693
+
+results_melbourne[2].summary()
+# Effects:
+#                               Final Parameter  Expected Sample  log2-fold change
+# Covariate          Cell Type                                                    
+# Village[T.Village] FSA0006           1.139058      3411.776681          1.136354
+#                    MBE1006          -0.780606       662.039516         -1.633135
+#                    TOB0421           0.000000      1112.017136         -0.506959
+
+melbourne_logfc_results_dt = pd.DataFrame({'Cell Type': ['FSA0006', 'MBE1006', 'TOB0421'],
+        'FSA0006_ref': [1.147833, -1.636737, -0.588923],
+        'MBE1006_ref': [1.152990, -1.634778, -0.600693],
+        'TOB0421_ref': [1.136354, -1.633135, -0.506959]})
+
+melbourne_logfc_results_dt['mean'] = melbourne_logfc_results_dt.mean(axis=1)
+
+melbourne_logfc_results_dt['is_credible'] = [True, True, True]
+
+melbourne_logfc_results_dt.to_csv(outdir + "melbourne_logfc_results.tsv", sep = "\t")
+
+
+
+
+
+
+## Sydney - Fresh
+data_sydney = data_all[data_all.obs["Location"].isin(["Sydney"]) & data_all.obs["Cryopreservation"].isin(["Fresh"])]
+viz.boxplots(data_sydney, feature_name="Village")
+plt.savefig(outdir + 'sydney_numbers.png')
+
+model_sydney = mod.CompositionalAnalysis(data_sydney, formula="Village", reference_cell_type="TOB0421")
+
+sydney_sim_results = model_sydney.sample_hmc()
+sydney_sim_results.summary()
+print(sydney_sim_results.credible_effects())
+
+
+# Run scCODA with each cell type as the reference
+cell_types_sydney = data_sydney.var.index
+results_cycle_sydney = pd.DataFrame(index=cell_types_sydney, columns=["times_credible"]).fillna(0)
+
+models_sydney = []
+results_sydney = []
+
+for ct in cell_types_sydney:
+    print(f"Reference: {ct}")
+    # Run inference
+    model_temp = mod.CompositionalAnalysis(data_sydney, formula="Village", reference_cell_type=ct)
+    models_sydney.append(model_temp)
+    results_temp = model_temp.sample_hmc(num_results=20000)
+    results_sydney.append(results_temp)
+    # Select credible effects
+    cred_eff = results_temp.credible_effects()
+    cred_eff.index = cred_eff.index.droplevel(level=0)
+    # add up credible effects
+    results_cycle_sydney["times_credible"] += cred_eff.astype("int")
+
+# Calculate percentages
+results_cycle_sydney["pct_credible"] = results_cycle_sydney["times_credible"]/len(cell_types_sydney)
+results_cycle_sydney["is_credible"] = results_cycle_sydney["pct_credible"] > 0.5
+print(results_cycle_sydney)
+
+# Get average difference
+### couldn't work out how to pull from summary in python, will do manually - angry emoji
+results_sydney[0].summary()
+# Effects:
+#                               Final Parameter  Expected Sample  log2-fold change
+# Covariate          Cell Type                                                    
+# Village[T.Village] FSA0006                0.0      1130.524812               0.0
+#                    MBE1006                0.0      1255.685903               0.0
+#                    TOB0421                0.0       819.289285               0.0
+
+results_sydney[1].summary()
+# Effects:
+#                               Final Parameter  Expected Sample  log2-fold change
+# Covariate          Cell Type                                                    
+# Village[T.Village] FSA0006                0.0      1132.029342               0.0
+#                    MBE1006                0.0      1244.846090               0.0
+#                    TOB0421                0.0       828.624568               0.0
+
+results_sydney[2].summary()
+# Effects:
+#                               Final Parameter  Expected Sample  log2-fold change
+# Covariate          Cell Type                                                    
+# Village[T.Village] FSA0006                0.0      1125.134806               0.0
+#                    MBE1006                0.0      1255.963310               0.0
+#                    TOB0421                0.0       824.401884               0.0
+
+
+sydney_logfc_results_dt = pd.DataFrame({'Cell Type': ['FSA0006', 'MBE1006', 'TOB0421'],
+        'FSA0006_ref': [0, 0, 0],
+        'MBE1006_ref': [0, 0, 0],
+        'TOB0421_ref': [0, 0, 0]})
+
+sydney_logfc_results_dt['mean'] = sydney_logfc_results_dt.mean(axis=1)
+
+sydney_logfc_results_dt['is_credible'] = [False, False, False]
+
+sydney_logfc_results_dt.to_csv(outdir + "sydney_logfc_results.tsv", sep = "\t")
+
+
+
+
+
+## Sydney - Cryopreserved
+data_sydney_cryo = data_all[data_all.obs["Location"].isin(["Sydney"]) & data_all.obs["Cryopreservation"].isin(["Cryopreserved"])]
+viz.boxplots(data_sydney_cryo, feature_name="Village")
+plt.savefig(outdir + 'sydney_numbers.png')
+
+model_sydney_cryo = mod.CompositionalAnalysis(data_sydney_cryo, formula="Village", reference_cell_type="automatic")
+
+sydney_sim_results_cryo = model_sydney_cryo.sample_hmc()
+sydney_sim_results_cryo.summary()
+print(sydney_sim_results_cryo.credible_effects())
+
+
+# Run scCODA with each cell type as the reference
+cell_types_sydney_cryo = data_sydney_cryo.var.index
+results_cycle_sydney_cryo = pd.DataFrame(index=cell_types_sydney_cryo, columns=["times_credible"]).fillna(0)
+
+models_sydney_cryo = []
+results_sydney_cryo = []
+
+for ct in cell_types_sydney_cryo:
+    print(f"Reference: {ct}")
+    # Run inference
+    model_temp = mod.CompositionalAnalysis(data_sydney_cryo, formula="Village", reference_cell_type=ct)
+    models_sydney_cryo.append(model_temp)
+    results_temp = model_temp.sample_hmc(num_results=20000)
+    results_sydney_cryo.append(results_temp)
+    # Select credible effects
+    cred_eff = results_temp.credible_effects()
+    cred_eff.index = cred_eff.index.droplevel(level=0)
+    # add up credible effects
+    results_cycle_sydney_cryo["times_credible"] += cred_eff.astype("int")
+
+# Calculate percentages
+results_cycle_sydney_cryo["pct_credible"] = results_cycle_sydney_cryo["times_credible"]/len(results_cycle_sydney_cryo)
+results_cycle_sydney_cryo["is_credible"] = results_cycle_sydney_cryo["pct_credible"] > 0.5
+print(results_cycle_sydney_cryo)
+
+# Get average difference
+### couldn't work out how to pull from summary in python, will do manually - angry emoji
+results_sydney_cryo[0].summary()
+# Effects:
+#                               Final Parameter  Expected Sample  log2-fold change
+# Covariate          Cell Type                                                    
+# Village[T.Village] FSA0006            0.00000      1620.195457          1.368064
+#                    MBE1006           -2.05755       294.934049         -1.600354
+#                    TOB0421           -1.82538       280.037161         -1.265402
+
+results_sydney_cryo[1].summary()
+# Effects:
+#                               Final Parameter  Expected Sample  log2-fold change
+# Covariate          Cell Type                                                    
+# Village[T.Village] FSA0006           1.967652      1628.040102          1.372877
+#                    MBE1006           0.000000       308.733991         -1.465845
+#                    TOB0421           0.000000       258.392574         -1.465845
+
+results_sydney_cryo[2].summary()
+# Effects:
+#                               Final Parameter  Expected Sample  log2-fold change
+# Covariate          Cell Type                                                    
+# Village[T.Village] FSA0006           1.925718      1608.624854          1.359406
+#                    MBE1006           0.000000       320.030556         -1.418819
+#                    TOB0421           0.000000       266.511256         -1.418819
+
+sydney_logfc_results_dt_cryo = pd.DataFrame({'Cell Type': ['FSA0006', 'MBE1006', 'TOB0421'],
+        'FSA0006_ref': [1.368064, -1.600354, -1.265402],
+        'MBE1006_ref': [1.372877, -1.465845, -1.465845],
+        'TOB0421_ref': [1.359406, -1.418819, -1.418819]})
+
+sydney_logfc_results_dt_cryo['mean'] = sydney_logfc_results_dt_cryo.mean(axis=1)
+
+sydney_logfc_results_dt_cryo['is_credible'] = [True, False, False]
+
+sydney_logfc_results_dt_cryo.to_csv(outdir + "sydney_cryo_logfc_results.tsv", sep = "\t")
+
diff --git a/scCODA/sccoda_plots.R b/scCODA/sccoda_plots.R
new file mode 100644
index 0000000..8f782ae
--- /dev/null
+++ b/scCODA/sccoda_plots.R
@@ -0,0 +1,105 @@
+library(data.table)
+library(tidyverse)
+
+
+
+dir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/scCODA/"
+
+line_colors <- c(FSA0006 = "#F79E29", MBE1006 = "#9B2C99", TOB0421 = "#35369C")
+groups <- c("brisbane", "melbourne", "sydney", "sydney_cryo")
+
+##### Read in Results #####
+results <- lapply(groups, function(group){
+    tmp <- fread(paste0(dir, group, "_logfc_results.tsv"))
+    tmp$group <- group
+    return(tmp)
+})
+
+results_dt <- do.call(rbind, results)
+results_dt$V1 <- NULL
+results_dt$group <- gsub("brisbane", "Brisbane", results_dt$group) %>%
+                        gsub("melbourne", "Melbourne", .) %>%
+                        gsub("^sydney$", "Sydney", .) %>%
+                        gsub("sydney_cryo", "Sydney\nCryopreserved", .)
+
+
+results_long <- melt(results_dt, id.vars = c("Cell Type", "is_credible", "group"),
+                measure.vars = c("FSA0006_ref", "MBE1006_ref", "TOB0421_ref"))
+
+
+
+plot <- ggplot(results_long, aes(`Cell Type`, value, color = `Cell Type`)) +
+            geom_hline(yintercept = 0, linetype = "dashed") +
+            geom_jitter(width = 0.2, alpha = 0.7) +
+            theme_classic() +
+            scale_color_manual(values = line_colors) +
+            facet_wrap(vars(group), nrow = 1) +
+            theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
+                axis.title.x = element_blank()) +
+            ylab("log2(Fold Change)") +
+            geom_text(
+                data = unique(results_long[,c("Cell Type", "is_credible", "group")]),
+                aes(y = 1.6,label = gsub(TRUE, "\\^", is_credible) %>% gsub(FALSE, " ", .)), 
+                color = "black",
+                # position = position_dodge(width = 1), 
+                # vjust = -0.5, 
+                size = 3, 
+                # stat = "unique", 
+                # parse = TRUE
+            )
+
+ggsave(plot, filename = paste0(dir, "fold_change_plot.png"), height = 3, width = 6)
+ggsave(plot, filename = paste0(dir, "fold_change_plot.pdf"), height = 3, width = 6)
+
+
+
+
+plot_fresh <- ggplot(results_long[group != "Sydney\nCryopreserved"], aes(`Cell Type`, value, color = `Cell Type`)) +
+            geom_hline(yintercept = 0, linetype = "dashed") +
+            geom_jitter(width = 0.2, alpha = 0.7) +
+            theme_classic() +
+            scale_color_manual(values = line_colors) +
+            facet_wrap(vars(group), nrow = 1) +
+            theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
+                axis.title.x = element_blank()) +
+            ylab("log2(Fold Change)") +
+            geom_text(
+                data = unique(results_long[group != "Sydney\nCryopreserved",c("Cell Type", "is_credible", "group")]),
+                aes(y = 1.6,label = gsub(TRUE, "\\^", is_credible) %>% gsub(FALSE, " ", .)), 
+                color = "black",
+                # position = position_dodge(width = 1), 
+                # vjust = -0.5, 
+                size = 3, 
+                # stat = "unique", 
+                # parse = TRUE
+            )
+
+ggsave(plot_fresh, filename = paste0(dir, "fold_change_plot_fresh.png"), height = 3, width = 4.5)
+ggsave(plot_fresh, filename = paste0(dir, "fold_change_plot_fresh.pdf"), height = 3, width = 4.5)
+
+
+
+
+plot_cryo <- ggplot(results_long[grep("Sydney", group)], aes(`Cell Type`, value, color = `Cell Type`)) +
+            geom_hline(yintercept = 0, linetype = "dashed") +
+            geom_jitter(width = 0.2, alpha = 0.7) +
+            theme_classic() +
+            scale_color_manual(values = line_colors) +
+            facet_wrap(vars(group), nrow = 1) +
+            theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
+                axis.title.x = element_blank()) +
+            ylab("log2(Fold Change)") +
+            geom_text(
+                data = unique(results_long[grep("Sydney", group),c("Cell Type", "is_credible", "group")]),
+                aes(y = 1.6,label = gsub(TRUE, "\\^", is_credible) %>% gsub(FALSE, " ", .)), 
+                color = "black",
+                # position = position_dodge(width = 1), 
+                # vjust = -0.5, 
+                size = 3, 
+                # stat = "unique", 
+                # parse = TRUE
+            )
+
+ggsave(plot_cryo, filename = paste0(dir, "fold_change_plot_cryo.png"), height = 3, width = 3.75)
+ggsave(plot_cryo, filename = paste0(dir, "fold_change_plot_cryo.pdf"), height = 3, width = 3.75)
+
diff --git a/scCODA/sccoda_preparation.R b/scCODA/sccoda_preparation.R
new file mode 100644
index 0000000..83347f5
--- /dev/null
+++ b/scCODA/sccoda_preparation.R
@@ -0,0 +1,40 @@
+library(data.table)
+library(tidyverse)
+library(Seurat)
+
+
+
+
+outdir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/scCODA/"
+dir.create(outdir, recursive = TRUE)
+
+
+
+seurat <- readRDS("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/All_data_integrated_remove_bad/seurat_integrated_all_times_clustered.rds")
+
+
+head(seurat@meta.data)
+
+seurat@meta.data$Location <- gsub("_.+", "", seurat@meta.data$Location)
+seurat@meta.data$Cryopreserved <- ifelse(grepl("Thawed", seurat@meta.data$Location_Time), "Cryopreserved", "Fresh")
+seurat@meta.data$Location_Cryopreserved_Line <- paste0(seurat@meta.data$Location, "_", seurat@meta.data$Cryopreserved, "_", seurat@meta.data$Final_Assignment)
+seurat@meta.data$Village <- gsub("Baseline", "Uni-Culture", seurat@meta.data$Time) %>% 
+        gsub("Thawed Village Day 0", "Uni-Culture", .) %>%
+        gsub("Thawed Village Day 7", "Village", .) %>%
+        gsub("Village Day 4", "Village", .)
+seurat@meta.data$Replicate <- gsub("Brisbane", "Replicate", seurat@meta.data$MULTI_ID) %>%
+        gsub("Melbourne", "Replicate", .) %>% 
+        gsub("Sydney", "Replicate", .)
+
+
+seurat@meta.data$Group <- paste0(seurat@meta.data$Location, "_", seurat@meta.data$Village, "_",seurat@meta.data$Replicate, "_", seurat@meta.data$Cryopreserved)
+
+
+cell_prop_long <- data.table(table(seurat@meta.data$Group, seurat@meta.data$Final_Assignment))
+
+
+cell_prop_dt <- dcast(cell_prop_long, V1 ~ V2, value.var = "N")
+colnames(cell_prop_dt)[1] <- c("Group")
+
+
+fwrite(cell_prop_dt, paste0(outdir, "cell_line_numbers.tsv"), sep = "\t")