From b6b0ec7e8930ed2841c33852663bf87f0f2b071b Mon Sep 17 00:00:00 2001 From: drneavin Date: Wed, 31 Aug 2022 20:34:07 +1000 Subject: [PATCH] updating scripts --- Clustering/All_Integrated_QC.R | 9 +- Expression_Boxplots/Expression_Boxplots.R | 66 +- .../pluripotency_deg_boxplots.R | 128 +++ .../Expression_Correlation.R | 260 +++++- Nona_multiome/cell_line_proportions.R | 8 +- RNAvelocity/latent_on_seurat2.R | 34 +- Variance/Distribution_tests.R | 7 + .../post_review/prepare_pseudotime.R | 6 +- .../post_review/pseudotime_effect.R | 231 ++++- .../post_review/pseudotime_effect.smk | 41 + .../post_review/pseudotime_effect_combine.R | 412 +++++++++ .../post_review/pseudotime_effect_snake.sh | 135 +++ .../variance_partition_post_review_combine.R | 125 ++- Variance/test_interaction_nb_model.R | 89 +- eQTL_check/eQTL_village_interaction.sh | 109 +++ eQTL_check/eQTL_village_interaction.smk | 59 ++ eQTL_check/eQTL_village_interaction_combine.R | 357 +++++++ .../eQTL_village_interaction_deboever.R | 177 ++++ .../eQTL_village_interaction_kilpinen.R | 177 ++++ eQTL_check/filter_snps_kilpinen.R | 83 ++ eQTL_check/multi-passage/filter_snps.R | 88 ++ eQTL_check/multi-passage/genes2test.sh | 30 + eQTL_check/multi-passage/metadata_prep.R | 18 + eQTL_check/multi-passage/pull_eQTL_SNPs.sh | 85 ++ eQTL_check/multi-passage/test_eQTL.R | 204 ++++ eQTL_check/multi-passage/test_eQTL.sh | 109 +++ eQTL_check/multi-passage/test_eQTL.smk | 53 ++ .../test_eQTL_combine_deboever.R | 114 +++ eQTL_check/pull_eQTL_SNPs.sh | 15 +- eQTL_check/test_eQTL.smk | 25 +- eQTL_check/test_eQTL_combine.R | 177 +++- eQTL_check/test_eQTL_combine_deboever.R | 96 ++ eQTL_check/test_eQTL_combine_kilpinen.R | 109 +++ eQTL_check/test_eQTL_kilpinen.R | 206 +++++ .../variance_partition_multipassage.R | 358 +++++++ .../variance_partition_multipassage.smk | 99 ++ .../variance_partition_multipassage_combine.R | 874 ++++++++++++++++++ ...nce_partition_multipassage_integratedSCT.R | 357 +++++++ ...artition_multipassage_integratedSCT_Ncov.R | 358 +++++++ .../variance_partition_multipassage_snake.sh | 136 +++ multi-passage/preQC.R | 362 +++++++- multi-passage/test_vireo_ambient/vireo.sh | 6 + .../test_vireo_ambient/vireo_submit.sh | 41 + multi-passage/transfer_data.sh | 45 + scCODA/sccod.py | 354 +++++++ scCODA/sccoda_plots.R | 105 +++ scCODA/sccoda_preparation.R | 40 + 47 files changed, 6874 insertions(+), 103 deletions(-) create mode 100644 Expression_Boxplots/pluripotency_deg_boxplots.R create mode 100644 Variance/RNAvelocity/post_review/pseudotime_effect.smk create mode 100644 Variance/RNAvelocity/post_review/pseudotime_effect_combine.R create mode 100644 Variance/RNAvelocity/post_review/pseudotime_effect_snake.sh create mode 100644 eQTL_check/eQTL_village_interaction.sh create mode 100644 eQTL_check/eQTL_village_interaction.smk create mode 100644 eQTL_check/eQTL_village_interaction_combine.R create mode 100644 eQTL_check/eQTL_village_interaction_deboever.R create mode 100644 eQTL_check/eQTL_village_interaction_kilpinen.R create mode 100644 eQTL_check/filter_snps_kilpinen.R create mode 100644 eQTL_check/multi-passage/filter_snps.R create mode 100644 eQTL_check/multi-passage/genes2test.sh create mode 100644 eQTL_check/multi-passage/metadata_prep.R create mode 100644 eQTL_check/multi-passage/pull_eQTL_SNPs.sh create mode 100644 eQTL_check/multi-passage/test_eQTL.R create mode 100644 eQTL_check/multi-passage/test_eQTL.sh create mode 100644 eQTL_check/multi-passage/test_eQTL.smk create mode 100644 eQTL_check/multi-passage/test_eQTL_combine_deboever.R create mode 100644 eQTL_check/test_eQTL_combine_deboever.R create mode 100644 eQTL_check/test_eQTL_combine_kilpinen.R create mode 100644 eQTL_check/test_eQTL_kilpinen.R create mode 100644 multi-passage/Variance/variance_partition_multipassage.R create mode 100644 multi-passage/Variance/variance_partition_multipassage.smk create mode 100644 multi-passage/Variance/variance_partition_multipassage_combine.R create mode 100644 multi-passage/Variance/variance_partition_multipassage_integratedSCT.R create mode 100644 multi-passage/Variance/variance_partition_multipassage_integratedSCT_Ncov.R create mode 100644 multi-passage/Variance/variance_partition_multipassage_snake.sh create mode 100644 multi-passage/test_vireo_ambient/vireo.sh create mode 100644 multi-passage/test_vireo_ambient/vireo_submit.sh create mode 100644 multi-passage/transfer_data.sh create mode 100644 scCODA/sccod.py create mode 100644 scCODA/sccoda_plots.R create mode 100644 scCODA/sccoda_preparation.R diff --git a/Clustering/All_Integrated_QC.R b/Clustering/All_Integrated_QC.R index a9dc280..5ae1641 100644 --- a/Clustering/All_Integrated_QC.R +++ b/Clustering/All_Integrated_QC.R @@ -38,14 +38,15 @@ save_figs <- function(plot, basename, width = 17, height = 17, units = "cm"){ seurat <- readRDS(paste0(datadir,"seurat_integrated_all_times_clustered.rds")) seurat@meta.data$Location <- gsub("_.+", "", seurat@meta.data$Location_Time) seurat@meta.data$phases <- factor(seurat@meta.data$phases, levels = c("G1", "S", "G2M")) - +seurat@meta.data$Village <- gsub("Baseline", "Uni-Culture", seurat@meta.data$Time) %>% gsub("Village Day 4", "Village", .) %>% gsub("Thawed Village Day 0", "Uni-Culture", .) %>% gsub("Thawed Village Day 7", "Village", .) ##### Set up Colors ##### cell_line_colors <- c("FSA0006" = "#F79E29", "MBE1006" = "#9B2C99", "TOB0421"= "#35369C") replicate_colors <- c("1" = "#ACD39E", "2" = "#5AAE61", "3" = "#1B7837") time_colors <- c("Baseline" = "#b9cee4", "Village Day 4" = "#8a92bb", "Thawed Village Day 0" = "#7d57a0", "Thawed Village Day 7" = "#853786") +village_colors <- c("Uni-Culture" = "#613246", "Village" = "#A286AA") cycle_colors <- c("G1" = "#4393C3", "S" = "#92C5DE", "G2M" = "#D1E5F0") -site_colors <- c("Brisbane" = "#5D59AB", "Sydney" = "#A7C9A9", "Melbourne" = "#179085") +site_colors <- c("Brisbane" = "#536CB4", "Sydney" = "#62BD67", "Melbourne" = "#D24F72") cluster_colors <- c("0" = "#C9D8EA", "1" = "#928CC5", "2" = "#A7C9A9", "3" = "#179085", "4" = "#F79F9F", "5" = "#C35B76", "6" = "#F4C893", "7" = "#F6AA4B") @@ -54,6 +55,10 @@ cluster_colors <- c("0" = "#C9D8EA", "1" = "#928CC5", "2" = "#A7C9A9", "3" = "#1 UMAP_cell_line <- DimPlot(seurat, reduction = "umap", group.by = c("integrated_snn_res.0.28"), cols = cluster_colors) + labs(color="Cluster") + ggtitle(NULL) save_figs(UMAP_cell_line, basename = paste0(outdir,"Cluster_umap"),width = 15, height = 15) +## With Village ## +UMAP_village <- DimPlot(seurat, reduction = "umap", group.by = c("Village"), cols = village_colors) + labs(color="Village") + ggtitle(NULL) +save_figs(UMAP_village, basename = paste0(outdir,"Village_umap"),width = 15, height = 15) + ## By Cell Line ## UMAP_cell_line <- DimPlot(seurat, reduction = "umap", group.by = c("Final_Assignment"), cols = cell_line_colors) + labs(color="Cell Line") + ggtitle(NULL) save_figs(UMAP_cell_line, basename = paste0(outdir,"Cell_Line_umap"),width = 15, height = 15) diff --git a/Expression_Boxplots/Expression_Boxplots.R b/Expression_Boxplots/Expression_Boxplots.R index fe312ed..d6cc51b 100644 --- a/Expression_Boxplots/Expression_Boxplots.R +++ b/Expression_Boxplots/Expression_Boxplots.R @@ -40,7 +40,7 @@ save_figs <- function(plot, basename, width = 17, height = 17, units = "cm"){ ##### Set up colors ##### variable_colors <- c(Village = "#A2B0D0", Replicate = "#64A66B", Line = "#68319B") line_colors <- c(FSA0006 = "#F79E29", MBE1006 = "#9B2C99", TOB0421 = "#35369C") -village_colors <- c(Baseline = "#b9cee4", Village = "#8a92bb" ) +village_colors <- c("Uni-Culture" = "#613246", "Village" = "#A286AA") site_updates <- c("Brisbane" = "Site 1" ,"Melbourne" = "Site 2", "Sydney" = "Site 3") @@ -57,7 +57,7 @@ names(seurat_list) <- gsub("_seurat.rds", "", files) seurat_list <- lapply(seurat_list, function(x){ x$Location <- ifelse(x$Time %in% c("Thawed Village Day 0", "Thawed Village Day 7"), "Sydney_Cryopreserved", x$Location) - x$Time <- ifelse(x$Time %in% c("Village Day 4", "Thawed Village Day 7"), "Village", "Baseline") + x$Time <- ifelse(x$Time %in% c("Village Day 4", "Thawed Village Day 7"), "Village", "Uni-Culture") for (location in names(site_updates)){ x$Location <- gsub(location, site_updates[location], x$Location) } @@ -455,29 +455,73 @@ for (gene in pluri_genes$Gene){ df_list[[gene]] <- expression_df(seurat_list, pluri_genes[which(pluri_genes$Gene == gene),"ENSG"], c("Location", "Time", "Final_Assignment")) df_list[[gene]]$Gene <- gene } -df <- do.call(rbind, df_list) +df <- data.table(do.call(rbind, df_list)) dir.create(paste0(outdir,"pluri_genes/")) -p_counts <- ggplot(df[which(df$Location != "Site 3_Cryopreserved"),], aes(x = Final_Assignment, y = Counts, color = Time)) + - geom_boxplot(outlier.size = 0.5) + + + +##### Read in significance ##### +pluri_deg <- readRDS("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Expression_Boxplots/pluri_degs/LR_DEGs_4pluri_genes.rds") + + +pluri_deg <- lapply(names(pluri_deg), function(x){ + strings <- unlist(str_split(x, "_")) + pluri_deg[[x]]$Location <- gsub("Brisbane", "Site 1", strings[1]) %>% gsub("Melbourne", "Site 2", .) %>% gsub("Sydney", "Site 3", .) + pluri_deg[[x]]$Cryopreservation <- strings[2] + pluri_deg[[x]]$Final_Assignment <- strings[3] + return(pluri_deg[[x]]) +}) + +pluri_deg_dt <- do.call(rbind,pluri_deg) +colnames(pluri_deg_dt) <- gsub("GeneID", "Gene", colnames(pluri_deg_dt)) +pluri_deg_dt$Symbol <- "*" +pluri_deg_dt$counts_position <- ifelse(pluri_deg_dt$Gene == "MYC", 26, + ifelse(pluri_deg_dt$Gene == "NANOG", 22, + ifelse(pluri_deg_dt$Gene == "POU5F1", 115, 60))) + +df$Cryopreservation <- ifelse(df$Location != "3_Cryopreserved", "Fresh", "Cryopreserved") +df$Location <- gsub("_Cryopreserved", "", df$Location) + + +p_counts <- ggplot(df[Cryopreservation != "Cryopreserved"], aes(x = Final_Assignment, y = Counts, color = Time)) + + geom_boxplot(aes(fill = Time), outlier.size = 0.5) + theme_classic() + - facet_grid(Gene ~ Location, scales = "free_y") + scale_color_manual(values = village_colors) + + scale_fill_manual(values = alpha(village_colors, 0.3)) + theme(legend.position = "none", - axis.title.x = element_blank()) -save_figs(p_counts, paste0(outdir,"pluri_genes/pluri_counts"), width = 16, height = 14) + axis.title.x = element_blank()) + + geom_text( + data = pluri_deg_dt[Cryopreservation != "Cryopreserved"], + aes(x = Final_Assignment, y = counts_position,label = Symbol), + color = "black", + size = 4) + + facet_grid(Gene ~ Location, scales = "free_y") + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + +save_figs(p_counts, paste0(outdir,"pluri_genes/pluri_counts"), width = 16, height = 16) -p_scaled <- ggplot(df[which(df$Location != "Site 3_Cryopreserved"),], aes(x = Final_Assignment, y = Normalized, color = Time)) + - geom_boxplot(outlier.size = 0.5) + + +pluri_deg_dt$counts_position <- ifelse(pluri_deg_dt$Gene == "MYC", 15.5, + ifelse(pluri_deg_dt$Gene == "NANOG", 10, + ifelse(pluri_deg_dt$Gene == "POU5F1", 9, 12))) + +p_scaled <- ggplot(df[Location != "Cryopreserved"], aes(x = Final_Assignment, y = Normalized, color = Time)) + + geom_boxplot(aes(fill = Time),outlier.size = 0.15, lwd=0.3) + theme_classic() + facet_grid(Gene ~ Location, scales = "free_y") + scale_color_manual(values = village_colors) + + scale_fill_manual(values = alpha(village_colors, 0.4)) + theme(axis.title.x = element_blank()) + ylab("Normalized Expression") + - theme(axis.text.x = element_text(angle = 45, hjust = 1)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + + geom_text( + data = pluri_deg_dt[Cryopreservation != "Cryopreserved"], + aes(x = Final_Assignment, y = counts_position,label = Symbol), + color = "black", + size = 3) save_figs(p_scaled, paste0(outdir,"pluri_genes/pluri_normalized"), width = 15, height = 10) diff --git a/Expression_Boxplots/pluripotency_deg_boxplots.R b/Expression_Boxplots/pluripotency_deg_boxplots.R new file mode 100644 index 0000000..b1f2f89 --- /dev/null +++ b/Expression_Boxplots/pluripotency_deg_boxplots.R @@ -0,0 +1,128 @@ +library(data.table) +library(tidyverse) +library(Seurat) + + + + +outdir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Expression_Boxplots/pluri_degs/" +dir.create(outdir, recursive = TRUE) + + +pluri_genes <- fread("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/data/pluripotency_genes.tsv") + +seurat <- readRDS("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/All_data_integrated_remove_bad/seurat_integrated_all_times_clustered.rds") +seurat@meta.data$Location <- gsub("_.+", "", seurat@meta.data$Location) +seurat@meta.data$Cryopreserved <- ifelse(grepl("Thawed", seurat@meta.data$Location_Time), "Cryopreserved", "Fresh") +seurat@meta.data$Location_Cryopreserved_Line <- paste0(seurat@meta.data$Location, "_", seurat@meta.data$Cryopreserved, "_", seurat@meta.data$Final_Assignment) +seurat@meta.data$Village <- gsub("Baseline", "Uni-Culture", seurat@meta.data$Time) %>% + gsub("Thawed Village Day 0", "Uni-Culture", .) %>% + gsub("Thawed Village Day 7", "Village", .) %>% + gsub("Village Day 4", "Village", .) + + +##### Combine replicates from same groups together ##### +seurat_list <- lapply(unique(seurat@meta.data$Location_Cryopreserved_Line), function(x){ + subset(seurat, subset = Location_Cryopreserved_Line == x) +}) +names(seurat_list) <- unique(seurat@meta.data$Location_Cryopreserved_Line) + + + +seurat_list <- lapply(seurat_list, function(x){ + Idents(x) <- "Village" + return(x) +}) + + + +seurat_list <- lapply(seurat_list, function(x){ + SCTransform(x, vars.to.regress = c("scores.G1", "scores.S", "scores.G2M", "percent.mt", "percent.rb"), return.only.var.genes = FALSE) +}) + +seurat_list <- lapply(seurat_list, function(x){ + PrepSCTFindMarkers(x) +}) + +saveRDS(seurat_list, paste0(outdir, "sct_location_cryo_line.rds")) +seurat_list <- readRDS(paste0(outdir, "sct_location_cryo_line.rds")) + + + +##### Combine Village and Baselines together ##### +### Try with logistic MAST ### +MAST_DEGs <- lapply(seurat_list, function(x){ + FindMarkers(x, ident.1 = "Uni-Culture", ident.2 = "Village", latent.vars = "MULTI_classification", test.use = "MAST", logfc.threshold = 0, assay = "RNA") +}) + +saveRDS(MAST_DEGs, paste0(outdir, "MAST_DEGs.rds")) +MAST_DEGs <- readRDS(paste0(outdir, "MAST_DEGs.rds")) + + +### Update multiple testing for all sites ### +MAST_DEGs_nrow_list <- lapply(MAST_DEGs, function(x) data.table(nrow(x))) +MAST_DEGs_nrow <- sum(do.call(rbind, MAST_DEGs_nrow_list)$V1) + +MAST_DEGs <- lapply(MAST_DEGs, function(x){ + x$p_val_adj_updated <- p.adjust(x$p_val, method = "bonferroni", n = MAST_DEGs_nrow) + return(x) +}) + + +MAST_DEGs_pluri <- lapply(MAST_DEGs, function(x){ + x$ENSG <- rownames(x) + x <- data.table(x) + x <- x[pluri_genes, on = c("ENSG")] + x <- x[!is.na(p_val) & p_val_adj_updated < 0.05] + return(x) +}) + +MAST_DEGs_pluri <- lapply(MAST_DEGs, function(x){ + x$ENSG <- rownames(x) + x <- data.table(x) + x <- x[pluri_genes, on = c("ENSG")] + x <- x[!is.na(p_val) & p_val_adj < 0.05] + return(x) +}) + +MAST_DEGs_pluri_4 <- lapply(MAST_DEGs_pluri, function(x){ + x[GeneID %in% c("MYC", "NANOG", "POU5F1", "SOX2")] +}) + + +### Try with logistic regression ### +LR_DEGs <- lapply(seurat_list, function(x){ + FindMarkers(x, ident.1 = "Uni-Culture", ident.2 = "Village", latent.vars = "MULTI_classification", test.use = "LR", logfc.threshold = 0) +}) + +saveRDS(LR_DEGs, paste0(outdir, "LR_DEGs.rds")) +##### Will use LR instead of MAST - MAST finds more DEG and lower p-values which I'm not sure is legitimate ##### + + +### Update multiple testing for all sites ### +LR_DEGs_nrow_list <- lapply(LR_DEGs, function(x) data.table(nrow(x))) +LR_DEGs_nrow <- sum(do.call(rbind, LR_DEGs_nrow_list)$V1) + +LR_DEGs <- lapply(LR_DEGs, function(x){ + x$p_val_adj_updated <- p.adjust(x$p_val, method = "bonferroni", n = LR_DEGs_nrow) + return(x) +}) + + +LR_DEGs_pluri <- lapply(LR_DEGs, function(x){ + x$ENSG <- rownames(x) + x <- data.table(x) + x <- x[pluri_genes, on = c("ENSG")] + x <- x[!is.na(p_val) & p_val_adj_updated < 0.05] + return(x) +}) + + +LR_DEGs_pluri_4 <- lapply(LR_DEGs_pluri, function(x){ + x[GeneID %in% c("MYC", "NANOG", "POU5F1", "SOX2")] +}) + + + +saveRDS(LR_DEGs_pluri_4, paste0(outdir, "LR_DEGs_4pluri_genes.rds")) + diff --git a/Expression_Correlation/Expression_Correlation.R b/Expression_Correlation/Expression_Correlation.R index 855d768..eb421e0 100644 --- a/Expression_Correlation/Expression_Correlation.R +++ b/Expression_Correlation/Expression_Correlation.R @@ -227,8 +227,266 @@ summary_SCT_list <- lapply(names(SCT_combined_list_list), function(x){ -summary_SCT <- do.call(rbind, summary_SCT_list) +summary_SCT <- data.table(do.call(rbind, summary_SCT_list)) summary_SCT$Replicate <- gsub("Brisbane", "Replicate", summary_SCT$Replicate) %>% gsub("Melbourne", "Replicate", .) %>% gsub("Sydney", "Replicate", .) + + + +##### Get unique id for each ##### +summary_SCT$ID <- paste(summary_SCT$Location, summary_SCT$Time, summary_SCT$Line, summary_SCT$Replicate, sep = "-") + + +correlations <- lapply(unique(summary_SCT$ID), function(group1){ + print(group1) + tmp <- lapply(unique(summary_SCT$ID), function(group2){ + print(group2) + genes <- summary_SCT[ID == group1][summary_SCT[ID == group1]$Gene %in% summary_SCT[ID == group2]$Gene]$Gene + print(all(summary_SCT[ID == group1][Gene %in% genes]$Gene == summary_SCT[ID == group2][Gene %in% genes]$Gene)) + return(data.table(Group1 = group1, Group2 = group2, Spearman = cor(summary_SCT[ID == group1][Gene %in% genes]$Mean, summary_SCT[ID == group2][Gene %in% genes]$Mean, method = "spearman"))) + }) + return(do.call(rbind, tmp)) +}) + +saveRDS(correlations, paste0(outdir, "correlations.rds")) + + +correlations_pearson <- lapply(unique(summary_SCT$ID), function(group1){ + print(group1) + tmp <- lapply(unique(summary_SCT$ID), function(group2){ + print(group2) + genes <- summary_SCT[ID == group1][summary_SCT[ID == group1]$Gene %in% summary_SCT[ID == group2]$Gene]$Gene + print(all(summary_SCT[ID == group1][Gene %in% genes]$Gene == summary_SCT[ID == group2][Gene %in% genes]$Gene)) + return(data.table(Group1 = group1, Group2 = group2, Pearson = cor(summary_SCT[ID == group1][Gene %in% genes]$Mean, summary_SCT[ID == group2][Gene %in% genes]$Mean))) + }) + return(do.call(rbind, tmp)) +}) + +correlations_pearson_dt <- do.call(rbind, correlations_pearson) + +tmp <- summary_SCT[ID == "Brisbane-Baseline-FSA0006-Replicate1"][summary_SCT[ID == "Brisbane-Baseline-TOB0421-Replicate1"], on = "Gene"] + +scatter_test <- ggplot(tmp, aes(Mean,i.Mean)) + + geom_point() + + theme_classic() + +ggsave(scatter_test, filename = paste0(outdir, "test_correlation.png")) + + +correlations_dt <- do.call(rbind, correlations) +correlations_dt[, c("Location1", "Village1", "Line1", "Replicate1") := tstrsplit(Group1, "-", fixed=TRUE)] +correlations_dt[, c("Location2", "Village2", "Line2", "Replicate2") := tstrsplit(Group2, "-", fixed=TRUE)] + + + +##### Try with data slot instead ##### +summary_SCT_data_list <- lapply(names(SCT_combined_list_list), function(x){ + temp3 <- lapply(names(SCT_combined_list_list[[x]]), function(y){ + temp2 <- lapply(names(SCT_combined_list_list[[x]][[y]]), function(z){ + temp <- lapply(names(SCT_combined_list_list[[x]][[y]][[z]]), function(rep){ + data.frame(Gene = rownames(SCT_combined_list_list[[x]][[y]][[z]][[rep]]), Mean = rowMeans(SCT_combined_list_list[[x]][[y]][[z]][[rep]][["SCT"]]@data), N = ncol(SCT_combined_list_list[[x]][[y]][[z]][[rep]][["SCT"]]@data), Replicate = rep, Line = z, Location = y, Time = x) + }) + do.call(rbind, temp) + }) + do.call(rbind, temp2) + }) + do.call(rbind, temp3) +}) + + + +summary_SCT_data <- data.table(do.call(rbind, summary_SCT_data_list)) +summary_SCT_data$Replicate <- gsub("Brisbane", "Replicate", summary_SCT_data$Replicate) %>% gsub("Melbourne", "Replicate", .) %>% gsub("Sydney", "Replicate", .) + + + +##### Get unique id for each ##### +summary_SCT_data$ID <- paste(summary_SCT_data$Location, summary_SCT_data$Time, summary_SCT_data$Line, summary_SCT_data$Replicate, sep = "-") + + +correlations_data <- lapply(unique(summary_SCT_data$ID), function(group1){ + print(group1) + tmp <- lapply(unique(summary_SCT_data$ID)[(which(unique(summary_SCT_data$ID) == group1) + 1):length(unique(summary_SCT_data$ID))], function(group2){ + print(group2) + genes <- summary_SCT_data[ID == group1][summary_SCT_data[ID == group1]$Gene %in% summary_SCT_data[ID == group2]$Gene]$Gene + print(all(summary_SCT_data[ID == group1][Gene %in% genes]$Gene == summary_SCT_data[ID == group2][Gene %in% genes]$Gene)) + return(data.table(Group1 = group1, Group2 = group2, Spearman = cor(summary_SCT_data[ID == group1][Gene %in% genes]$Mean, summary_SCT_data[ID == group2][Gene %in% genes]$Mean, method = "spearman"))) + }) + return(do.call(rbind, tmp)) +}) + + + +correlations_data_dt <- do.call(rbind, correlations_data) +correlations_data_dt[, c("Location1", "Village1", "Line1", "Replicate1") := tstrsplit(Group1, "-", fixed=TRUE)] +correlations_data_dt[, c("Location2", "Village2", "Line2", "Replicate2") := tstrsplit(Group2, "-", fixed=TRUE)] + +correlations_data_fresh_dt <- correlations_data_dt[Location1 != "Sydney_Cryopreserved" | Location2 != "Sydney_Cryopreserved"] + + +##### ##### +correlations_data_fresh_village_dt <- correlations_data_fresh_dt[Replicate1 == Replicate2 & Line1 == Line2 & Location1 == Location2 & Village1 != Village2 & Group1 != Group2] +correlations_data_fresh_village_dt$Group <- "Village" + +correlations_data_fresh_replicate_dt <- correlations_data_fresh_dt[Village1 == Village2 & Line1 == Line2 & Location1 == Location2 & Replicate1 != Replicate2 & Group1 != Group2] +correlations_data_fresh_replicate_dt$Group <- "Replicate" + +correlations_data_fresh_location_dt <- correlations_data_fresh_dt[Replicate1 == Replicate2 & Line1 == Line2 & Village1 == Village2 & Location1 != Location2 & Group1 != Group2] +correlations_data_fresh_location_dt$Group <- "Location" + +correlations_data_fresh_line_dt <- correlations_data_fresh_dt[Village1 == Village2 & Location1 == Location2 & Line1 != Line2 & Group1 != Group2] +correlations_data_fresh_line_dt$Group <- "Line" + +correlations_data_fresh_combined_dt <- rbind(correlations_data_fresh_village_dt, correlations_data_fresh_replicate_dt, correlations_data_fresh_location_dt, correlations_data_fresh_line_dt) + +correlations_data_fresh_combined_dt$Group <- factor(correlations_data_fresh_combined_dt$Group, levels = c("Replicate", "Village", "Line", "Location")) + + +res <- list() + +for (group1 in unique(correlations_data_fresh_combined_dt$Group)){ + for (group2 in unique(correlations_data_fresh_combined_dt$Group)[(which(unique(correlations_data_fresh_combined_dt$Group) == group1)+1):length(unique(correlations_data_fresh_combined_dt$Group))]){ + res[[group1]][[group2]] <- wilcox.test(correlations_data_fresh_combined_dt[Group == group1]$Spearman, correlations_data_fresh_combined_dt[Group == group2]$Spearman,exact = FALSE) + } +} + + + + +correlations_data_fresh_combined_dt_med <- correlations_data_fresh_combined_dt %>% + group_by(Group) %>% + mutate(median = median(as.numeric(Spearman))) + + +correlation_fresh_dist <- ggplot(correlations_data_fresh_combined_dt, aes(Spearman)) + + geom_histogram(bins = 50) + + facet_wrap(vars(factor(Group, levels = c("Replicate", "Village", "Line", "Location"))), ncol = 1, scales = "free_y") + + theme_classic() + + geom_vline(data = correlations_data_fresh_combined_dt_med, aes(xintercept = median), linetype = "dashed") + +ggsave(correlation_fresh_dist, filename = paste0(outdir, "correlation_distributions_fresh.png"), width = 3) +ggsave(correlation_fresh_dist, filename = paste0(outdir, "correlation_distributions_fresh.pdf"), width = 3) + + +##### For cryopreserdd ##### + +correlations_data_combined_cryo_dt <- correlations_data_dt[grepl("Sydney", Location1) & grepl("Sydney", Location2)] +correlations_data_combined_cryo_dt$Cryopreserved1 <- gsub("Sydney_", "",correlations_data_combined_cryo_dt$Location1) +correlations_data_combined_cryo_dt$Cryopreserved2 <- gsub("Sydney_", "",correlations_data_combined_cryo_dt$Location2) + + +##### ##### +correlations_data_cryo_village_dt <- correlations_data_combined_cryo_dt[Replicate1 == Replicate2 & Line1 == Line2 & Cryopreserved1 == Cryopreserved2 & Village1 != Village2 & Group1 != Group2] +correlations_data_cryo_village_dt$Group <- "Village" + +correlations_data_cryo_replicate_dt <- correlations_data_combined_cryo_dt[Village1 == Village2 & Line1 == Line2 & Cryopreserved1 == Cryopreserved2 & Replicate1 != Replicate2 & Group1 != Group2] +correlations_data_cryo_replicate_dt$Group <- "Replicate" + +correlations_data_cryo_cryo_dt <- correlations_data_combined_cryo_dt[Replicate1 == Replicate2 & Line1 == Line2 & Village1 == Village2 & Cryopreserved1 != Cryopreserved2 & Group1 != Group2] +correlations_data_cryo_cryo_dt$Group <- "Cryopreserved" + +correlations_data_cryo_line_dt <- correlations_data_combined_cryo_dt[Village1 == Village2 & Cryopreserved1 == Cryopreserved2 & Line1 != Line2 & Group1 != Group2] +correlations_data_cryo_line_dt$Group <- "Line" + +correlations_data_cryo_combined_dt <- rbind(correlations_data_cryo_village_dt, correlations_data_cryo_replicate_dt, correlations_data_cryo_cryo_dt, correlations_data_cryo_line_dt) + +correlations_data_cryo_combined_dt$Group <- factor(correlations_data_cryo_combined_dt$Group, levels = c("Replicate", "Cryopreserved", "Village", "Line")) + +correlations_data_cryo_combined_dt_med <- correlations_data_cryo_combined_dt %>% + group_by(Group) %>% + mutate(median = median(as.numeric(Spearman))) + + + +correlation_cryo_dist <- ggplot(correlations_data_cryo_combined_dt, aes(Spearman)) + + geom_histogram(bins = 50) + + facet_wrap(vars(factor(Group, levels = c("Replicate", "Cryopreserved", "Village", "Line"))), ncol = 1, scales = "free_y") + + theme_classic() + + geom_vline(data = correlations_data_cryo_combined_dt_med, aes(xintercept = median), linetype = "dashed") + +ggsave(correlation_cryo_dist, filename = paste0(outdir, "correlation_distributions_cryo.png"), width = 3) +ggsave(correlation_cryo_dist, filename = paste0(outdir, "correlation_distributions_cryo.pdf"), width = 3) + + + + + +##### Try with counts slot instead ##### +summary_SCT_counts_list <- lapply(names(SCT_combined_list_list), function(x){ + temp3 <- lapply(names(SCT_combined_list_list[[x]]), function(y){ + temp2 <- lapply(names(SCT_combined_list_list[[x]][[y]]), function(z){ + temp <- lapply(names(SCT_combined_list_list[[x]][[y]][[z]]), function(rep){ + data.frame(Gene = rownames(SCT_combined_list_list[[x]][[y]][[z]][[rep]]), Mean = rowMeans(SCT_combined_list_list[[x]][[y]][[z]][[rep]][["SCT"]]@counts), N = ncol(SCT_combined_list_list[[x]][[y]][[z]][[rep]][["SCT"]]@counts), Replicate = rep, Line = z, Location = y, Time = x) + }) + do.call(rbind, temp) + }) + do.call(rbind, temp2) + }) + do.call(rbind, temp3) +}) + + + +summary_SCT_counts <- data.table(do.call(rbind, summary_SCT_counts_list)) +summary_SCT_counts$Replicate <- gsub("Brisbane", "Replicate", summary_SCT_counts$Replicate) %>% gsub("Melbourne", "Replicate", .) %>% gsub("Sydney", "Replicate", .) + + + +##### Get unique id for each ##### +summary_SCT_counts$ID <- paste(summary_SCT_counts$Location, summary_SCT_counts$Time, summary_SCT_counts$Line, summary_SCT_counts$Replicate, sep = "-") + + +correlations_counts <- lapply(unique(summary_SCT_counts$ID), function(group1){ + print(group1) + tmp <- lapply(unique(summary_SCT_counts$ID)[(which(unique(summary_SCT_counts$ID) == group1) + 1):length(unique(summary_SCT_counts$ID))], function(group2){ + print(group2) + genes <- summary_SCT_counts[ID == group1][summary_SCT_counts[ID == group1]$Gene %in% summary_SCT_counts[ID == group2]$Gene]$Gene + print(all(summary_SCT_counts[ID == group1][Gene %in% genes]$Gene == summary_SCT_counts[ID == group2][Gene %in% genes]$Gene)) + return(data.table(Group1 = group1, Group2 = group2, Spearman = cor(summary_SCT_counts[ID == group1][Gene %in% genes]$Mean, summary_SCT_counts[ID == group2][Gene %in% genes]$Mean, method = "spearman"))) + }) + return(do.call(rbind, tmp)) +}) + + + +correlations_counts_dt <- do.call(rbind, correlations_counts) +correlations_counts_dt[, c("Location1", "Village1", "Line1", "Replicate1") := tstrsplit(Group1, "-", fixed=TRUE)] +correlations_counts_dt[, c("Location2", "Village2", "Line2", "Replicate2") := tstrsplit(Group2, "-", fixed=TRUE)] + +correlations_counts_village_dt <- correlations_counts_dt[Replicate1 == Replicate2 & Line1 == Line2 & Location1 == Location2 & Village1 != Village2 & Group1 != Group2] +correlations_counts_village_dt$Group <- "Village" + +correlations_counts_replicate_dt <- correlations_counts_dt[Village1 == Village2 & Line1 == Line2 & Location1 == Location2 & Replicate1 != Replicate2 & Group1 != Group2] +correlations_counts_replicate_dt$Group <- "Replicate" + +correlations_counts_location_dt <- correlations_counts_dt[Replicate1 == Replicate2 & Line1 == Line2 & Village1 == Village2 & Location1 != Location2 & Group1 != Group2] +correlations_counts_location_dt$Group <- "Location" + +correlations_counts_line_dt <- correlations_counts_dt[Replicate1 == Replicate2 & Village1 == Village2 & Location1 == Location2 & Line1 != Line2 & Group1 != Group2] +correlations_counts_line_dt$Group <- "Line" + +correlations_counts_combined_dt <- rbind(correlations_counts_village_dt, correlations_counts_replicate_dt, correlations_counts_location_dt, correlations_counts_line_dt) + +correlations_counts_combined_dt_med <- correlations_counts_combined_dt %>% + group_by(Group) %>% + mutate(median = median(as.numeric(Spearman))) + + +correlations_counts_combined_dt$Group <- factor(correlations_counts_combined_dt$Group, levels = c("Replicate", "Village", "Line", "Location")) + + +correlation_dist <- ggplot(correlations_counts_combined_dt, aes(Spearman)) + + geom_histogram(bins = 50) + + facet_wrap(vars(factor(Group, levels = c("Replicate", "Village", "Line", "Location"))), ncol = 1, scales = "free_y") + + theme_classic() + + geom_vline(data = correlations_counts_combined_dt_med, aes(xintercept = median), linetype = "dashed") + +ggsave(correlation_dist, filename = paste0(outdir, "correlation_distributions_counts.png"), width = 3) + + + + + + summary_SCT_wide <- pivot_wider(summary_SCT, names_from = Replicate, values_from = c(Mean, N)) summary_SCT_wide$SD <- rowSds(as.matrix(summary_SCT_wide[,c("Mean_Replicate1", "Mean_Replicate2", "Mean_Replicate3")])) summary_SCT_wide$Mean <- (summary_SCT_wide$Mean_Replicate1 * summary_SCT_wide$N_Replicate1 + summary_SCT_wide$Mean_Replicate2 * summary_SCT_wide$N_Replicate2 + summary_SCT_wide$Mean_Replicate3 * summary_SCT_wide$N_Replicate3)/(summary_SCT_wide$N_Replicate1 + summary_SCT_wide$N_Replicate2 + summary_SCT_wide$N_Replicate3) diff --git a/Nona_multiome/cell_line_proportions.R b/Nona_multiome/cell_line_proportions.R index fbbb345..c8f47a4 100644 --- a/Nona_multiome/cell_line_proportions.R +++ b/Nona_multiome/cell_line_proportions.R @@ -32,7 +32,7 @@ village_id_list <- lapply(villages, function(x){ # tmp <- fread(paste0(datadir,x,"/CombinedResults/Final_Assignments_demultiplexing_doublets_new_edit.txt"), sep = "\t") tmp <- readRDS(paste0(non_dir,x)) dt <- data.table(tmp@meta.data) - dt$Pool_ID <- gsub("_DemuxALL.rds", "",x)) + dt$Pool_ID <- gsub("_DemuxALL.rds", "",x) dt$Day <- as.numeric(as.character(gsub("Village_Day", "", dt$Pool_ID))) return(dt) }) @@ -66,6 +66,11 @@ village_summary_singlets <- data.table(prop.table(table(village_id[Assignment != village_summary_singlets$Assignment <- factor(village_summary_singlets$Assignment, levels = rev(village_summary_singlets[Day_updated == 15]$Assignment[order(village_summary_singlets[Day_updated == 15]$N)])) +colors <- c("#f44336", "#e81f63", "#9c27b0", "#673ab7", "#3f51b5", "#2096f3","#2096f3", "#009688", "#4caf50", "#8bc34a", "#cddc39", "#ffeb3b", "#ffc108", "#ff9801", "#ff5723" ,"#795548", "#9e9e9e", "#607d8b") +names(colors) <- levels(village_summary_singlets$Assignment) + +saveRDS(colors, paste0(outdir,"line_colors")) + ##### Make proportion plots (area plot) ##### p_stacked_area <- ggplot(village_summary_singlets, aes(x = as.numeric(as.character(Day_updated)), y = N, fill = factor(Assignment), group = Assignment)) + @@ -80,6 +85,7 @@ ggsave(p_stacked_area, filename = paste0(outdir,"stacked_area.pdf"), width = 7, + ##### Make line plot of propotion over time ##### p_line <- ggplot(village_summary_singlets, aes(x = as.numeric(as.character(Day_updated)), y = N, color = Assignment)) + geom_point() + diff --git a/RNAvelocity/latent_on_seurat2.R b/RNAvelocity/latent_on_seurat2.R index 3e27439..ab321a0 100644 --- a/RNAvelocity/latent_on_seurat2.R +++ b/RNAvelocity/latent_on_seurat2.R @@ -1,10 +1,11 @@ library(data.table) library(Seurat) library(ggplot2) -library(Nebulosa) -library(schex) -library(R.utils) -library(Hmisc) +library(tidyverse) +# library(Nebulosa) +# library(schex) +# library(R.utils) +# library(Hmisc) ##### Setting up Directories @@ -16,6 +17,7 @@ dir.create(outdir, recursive = TRUE) line_colors = c(FSA0006 = "#F79E29", MBE1006 = "#9B2C99", TOB0421 = "#35369C") +village_colors <- c("Uni-Culture" = "#613246", "Village" = "#A286AA") ##### Read in data ##### @@ -397,15 +399,29 @@ summary(lm(POU5F1 ~ equal_groups, data = latent_split)) ##### Make histograms of each location, colored by time, faceted by location ##### -seurat_noNA@meta.data$Time <- gsub("Village Day 4","Village", seurat_noNA@meta.data$Time) %>% gsub("Thawed Village Day 7", "Village", .) %>% gsub("Thawed Village Day 0","Baseline", .) - seurat_noNA@meta.data$Location <- ifelse(grepl("Thawed", seurat_noNA@meta.data$Location), "Sydney-Cryopreserved", gsub("_.+", "", seurat_noNA@meta.data$Location)) +seurat_noNA@meta.data$Location <- gsub("Brisbane", "Site 1", seurat_noNA@meta.data$Location) %>% gsub("Melbourne", "Site 2", .) %>% gsub("Sydney", "Site 3", .) +seurat_noNA@meta.data$Village <- gsub("Baseline", "Uni-Culture", seurat_noNA@meta.data$Time) %>% gsub("Village Day 4", "Village", .) %>% gsub("Thawed Village Day 0", "Uni-Culture", .) %>% gsub("Thawed Village Day 7", "Village", .) + seurat_noNA@meta.data$Location_Individual <- paste0(seurat_noNA@meta.data$Location, "-", seurat_noNA@meta.data$Final_Assignment) -pLocation_Time_latent <- ggplot(seurat_noNA@meta.data, aes(latent_time, color = Time)) + - geom_density() + + +pLocation_Time_latent <- ggplot(seurat_noNA@meta.data, aes(latent_time, fill = Village)) + + geom_density(alpha = 0.75) + theme_classic() + - facet_grid(Location ~ Final_Assignment) + facet_grid(Location ~ Final_Assignment) + + scale_fill_manual(values = village_colors) ggsave(pLocation_Time_latent, filename = paste0(outdir,"faceted_histogram_latent.png")) +ggsave(pLocation_Time_latent, filename = paste0(outdir,"faceted_histogram_latent.pdf")) + + + +pLocation_Time_latent_line <- ggplot(seurat_noNA@meta.data, aes(latent_time, fill = Village)) + + geom_density(alpha = 0.75) + + theme_classic() + + facet_grid(vars(Final_Assignment)) + + scale_fill_manual(values = village_colors) +ggsave(pLocation_Time_latent_line, filename = paste0(outdir,"faceted_histogram_latent_line.png"), width = 4, height = 3) +ggsave(pLocation_Time_latent_line, filename = paste0(outdir,"faceted_histogram_latent_line.pdf"), width = 4, height = 3) diff --git a/Variance/Distribution_tests.R b/Variance/Distribution_tests.R index f50065b..4e090b9 100644 --- a/Variance/Distribution_tests.R +++ b/Variance/Distribution_tests.R @@ -5,6 +5,7 @@ library(gamlss) library(fitdistrplus) library(VGAM) library(emdbook) +library(data.table) save_figs <- function(plot, basename, width = 17, height = 17, units = "cm"){ @@ -214,6 +215,12 @@ seurat <- readRDS(paste0(datadir,"seurat_integrated_all_times_clustered.rds")) ## Keep only genes expressed in at least 1% of cells seurat_sub <- subset(seurat, features = rownames(seurat)[which(rowSums(seurat[["SCT"]]@counts > 0)/ncol(seurat[["SCT"]]@counts) >= 0.01)]) saveRDS(seurat_sub, paste0(outdir,"seurat_integrated_all_times_clustered_1pct_expressing.rds")) +seurat_sub <- readRDS(paste0(outdir,"seurat_integrated_all_times_clustered_1pct_expressing.rds")) + + +### Make a list of the genes to be used by snakemake in variance_partition_post_reviedw.smk ### +genes_dt <- data.table(Gene = rownames(seurat_sub)) +fwrite(genes_dt, paste0(outdir,"seurat_integrated_all_times_clustered_1pct_expressing_genelist.tsv"), sep = "\t") aic_df <- as.data.frame(matrix(nrow = 0, ncol = 3)) diff --git a/Variance/RNAvelocity/post_review/prepare_pseudotime.R b/Variance/RNAvelocity/post_review/prepare_pseudotime.R index c440c7d..cdcd507 100644 --- a/Variance/RNAvelocity/post_review/prepare_pseudotime.R +++ b/Variance/RNAvelocity/post_review/prepare_pseudotime.R @@ -41,4 +41,8 @@ seurat_noNA@meta.data$Cryopreserved <-ifelse(seurat_noNA@meta.data$Location == " seurat_noNA@meta.data$Location <- gsub("_Cryopreserved", "", seurat_noNA@meta.data$Location) -saveRDS(seurat_noNA, paste0(outdir, "seurat_integrated_all_times_clustered_1pct_expressing_pseudotime.rds")) \ No newline at end of file +saveRDS(seurat_noNA, paste0(outdir, "seurat_integrated_all_times_clustered_1pct_expressing_pseudotime.rds")) +seurat_noNA <- readRDS(paste0(outdir, "seurat_integrated_all_times_clustered_1pct_expressing_pseudotime.rds")) + + +fwrite(data.table(Gene = rownames(seurat_noNA)), paste0(outdir,"seurat_integrated_all_times_clustered_1pct_expressing_pseudotime.tsv")) \ No newline at end of file diff --git a/Variance/RNAvelocity/post_review/pseudotime_effect.R b/Variance/RNAvelocity/post_review/pseudotime_effect.R index dfb1a32..363d1c6 100644 --- a/Variance/RNAvelocity/post_review/pseudotime_effect.R +++ b/Variance/RNAvelocity/post_review/pseudotime_effect.R @@ -34,17 +34,18 @@ icc_glmmtmb <- function(model, percent = TRUE) { args <- commandArgs(trailingOnly = TRUE) icc_interaction_outdir <- paste0(args[1]) icc_outdir <- paste0(args[2]) -model_interaction_outdir <- paste0(args[3]) -model_outdir <- paste0(args[4]) -resid_outdir <- paste0(args[5]) -gene <- as.character(args[6]) +plot_outdir <- paste0(args[3]) +gene <- as.character(args[4]) +effects_outdir <- paste0(args[5]) + +print(icc_interaction_outdir) print(icc_outdir) -print(icc_outdir) -print(model_outdir) -print(resid_outdir) +print(plot_outdir) print(gene) +line_colors <- c(FSA0006 = "#F79E29", MBE1006 = "#9B2C99", TOB0421 = "#35369C") + ##### Read in data ##### @@ -57,47 +58,217 @@ colnames(icc_summary) <- gsub("gene", "ensg", colnames(icc_summary)) ### Make DF for modeling ### -df_hier_unscale <- data.frame("Expression" = seurat[["SCT"]]@scale.data[gene,], "Village" = as.factor(ifelse(seurat@meta.data$Time == "Baseline", 0, 1)), "Line" = seurat@meta.data$Final_Assignment, "Replicate" = as.factor(gsub("[A-Z][a-z]+", "", seurat@meta.data$MULTI_ID)), "Cryopreserved" = seurat$Cryopreserved, "Site" = seurat$Location, "Pseudotime" = round(seurat$latent_time, 2)) +df_hier_unscale <- data.frame("Expression" = seurat[["SCT"]]@scale.data[gene,],"Normalized Counts" = seurat[["SCT"]]@counts[gene,], "Log Expression" = seurat[["SCT"]]@data[gene,], "Village" = as.factor(ifelse(seurat@meta.data$Time == "Baseline", 0, 1)), "Line" = seurat@meta.data$Final_Assignment, "Replicate" = as.factor(gsub("[A-Z][a-z]+", "", seurat@meta.data$MULTI_ID)), "Cryopreserved" = seurat$Cryopreserved, "Site" = seurat$Location, "Pseudotime" = round(seurat$latent_time, 2)) colnames(df_hier_unscale)[1] <- "Expression" ##### Get list of variables to fit before testing pseudotime effect ##### -variables <- icc_summary[ensg == gene & grp != "Residual"]$grp +variables <- c(icc_summary[ensg == gene & grp != "Residual"]$grp, "Pseudotime") +variables <- c(variables[!grepl(":",variables)], variables[grepl(":", variables)]) +model_all <- as.formula(paste0("Expression ~ (1|", paste0(variables, collapse = ") + (1|"), ")")) +if (length(variables) > 1){ + model <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% "Pseudotime"], collapse = ") + (1|"), ")")) +} else { + model <- as.formula(paste0("Expression ~ 1")) +} -##### Test pseudotime impact ##### -### Fit the known variables to get residuals ### -model_all <- as.formula(paste0("Expression ~ (1|", paste0(variables, collapse = ") + (1|"), ")")) -model_glmmtmb <- suppress_warnings(glmmTMB(formula = noquote(model_all), data = df_hier_unscale, REML = TRUE), "giveCsparse") + +boolFalse<-F +while(boolFalse==F & length(variables) > 0){ + tryCatch({ + print(variables) + model_glmmtmb <- suppress_warnings(glmmTMB(formula = noquote(model_all), data = df_hier_unscale, REML = TRUE), "giveCsparse") + model_glmmtmb_loo <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse") + boolFalse<-T + },error=function(e){ + if (length(variables) > 1){ + variables <- variables[1:(length(variables) -1)] + } else { + variables <- c() + } + }) +} -### Test pseudotime on residuals ### -df_hier_unscale$Residuals <- resid(model_glmmtmb) +if ("Pseudotime" %in% variables){ + + ### Deal with singular fits by removing last variable until a fit can be found - ordered in variables buy importance + while ((!model_glmmtmb$sdr$pdHess | !model_glmmtmb_loo$sdr$pdHess) & length(variables) > 0){ + print("Singular fit: removing last variable and rerunning with one less covariate.") + if (length(variables) > 1){ + variables <- variables[1:(length(variables) -1)] + print(variables) + model_all <- as.formula(paste0("Expression ~ (1|", paste0(variables, collapse = ") + (1|"), ")")) + model_glmmtmb <- suppress_warnings(glmmTMB(formula = noquote(model_all), data = df_hier_unscale, REML = TRUE), "giveCsparse") + model <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% "Pseudotime"], collapse = ") + (1|"), ")")) + model_glmmtmb_loo <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse") + } else { + variables <- c() + } + } -model_pseudotime <- suppress_warnings(glmmTMB(Residuals ~ Pseudotime, data = df_hier_unscale, REML = TRUE), "giveCsparse") -model_pseudotime2 <- suppress_warnings(glmmTMB(Residuals ~ 1, data = df_hier_unscale, REML = TRUE), "giveCsparse") + print(variables) + + if ("Pseudotime" %in% variables){ -### Test with Anova ### -P_value <- anova(model_pseudotime2, model_pseudotime)$`Pr(>Chisq)`[2] -P_value <- anova(model_glmmtmb, model_pseudotime2)$`Pr(>Chisq)`[2] + icc <- data.table(grp = c("Pseudotime", "Residual"), P = as.numeric(NA)) + icc[grp == "Pseudotime"]$P <- anova(model_glmmtmb_loo, model_glmmtmb)$`Pr(>Chisq)`[2] -test_plot <- ggplot(df_hier_unscale, aes(Pseudotime, Residuals, color = Site)) + - geom_point() + - facet_grid(~Line) + - geom_smooth(method = "lm", se = FALSE) -ggsave(test_plot, filename = "/directflow/SCCGGroupShare/projects/DrewNeavin/test.png") + if (icc[grp == "Pseudotime"]$P < 0.05/length(variables)){ + + ##### Calculate full model ##### + updated_model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(variables, collapse = ") + (1|"), ")")) + + + model_loo_updated <- list() + model_loo_updated[["all"]] <- suppress_warnings(glmmTMB(formula = noquote(updated_model), data = df_hier_unscale, REML = TRUE), "giveCsparse") + + ### Calculate the variance explained by each of the included variables ### + icc <- icc_glmmtmb(model_loo_updated[["all"]]) + + ### Recalfulate significance ### + icc$P <- as.numeric(NA) + icc$gene <- gene + + for (variable in variables){ + print(variable) + if (length(variables) > 1){ + model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")")) + } else { + model <- as.formula(paste0("Expression ~ 1")) + } + model_loo_updated[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse") + icc[grp == variable]$P <- anova(model_loo_updated[[variable]], model_loo_updated[["all"]])$`Pr(>Chisq)`[2] + } + } + + interaction_variables <- c() + + if ("Line" %in% variables & "Pseudotime" %in% variables){ + interaction_variables <- c(interaction_variables, "Line:Pseudotime") + model_all_interaction <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, interaction_variables), collapse = ") + (1|"), ")")) -if (P_value < 0.05/(length(variables) + 1)){ - ### Test for amount of variance explained ### - model_pseudotime2 <- suppress_warnings(glmmTMB(Residuals ~ 1 + Pseudotime, data = df_hier_unscale, REML = TRUE), "giveCsparse") + boolFalse<-F + while(boolFalse==F & length(interaction_variables) > 0){ + tryCatch({ + print(c(variables, interaction_variables)) + model_glmmtmb_interaction <- suppress_warnings(glmmTMB(formula = noquote(model_all_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse") + boolFalse<-T + },error=function(e){ + if (length(interaction_variables) > 1){ + interaction_variables <- interaction_variables[1:(length(interaction_variables) -1)] + } else { + interaction_variables <- c() + } + }) + } + + if ("Line:Pseudotime" %in% interaction_variables){ + ### Deal with singular fits by removing last variable until a fit can be found - ordered in variables buy importance + while (!model_glmmtmb_interaction$sdr$pdHess & length(interaction_variables) > 0 ){ + print("Singular fit: removing last variable and rerunning with one less covariate.") + if (length(interaction_variables) > 1){ + interaction_variables <- interaction_variables[1:(length(interaction_variables) -1)] + print(c(interaction_variables, variables)) + model_all_interaction <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, interaction_variables), collapse = ") + (1|"), ")")) + model_glmmtmb_interaction <- suppress_warnings(glmmTMB(formula = noquote(model_all_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse") + } else { + interaction_variables <- c() + } + } + + if ("Line:Pseudotime" %in% interaction_variables){ + + model_loo_interaction <- list() + + icc_interaction <- data.table(grp = interaction_variables, P = as.numeric(NA)) + + for (variable in c(interaction_variables)){ + print(variable) + if (length(interaction_variables) > 1){ + model_interaction <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, interaction_variables)[!c(variables, interaction_variables) %in% variable], collapse = ") + (1|"), ")")) + } else { + model_interaction <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")")) + } + model_loo_interaction[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse") + icc_interaction[grp == variable]$P <- anova(model_loo_interaction[[variable]], model_glmmtmb_interaction)$`Pr(>Chisq)`[2] + } + + if (icc_interaction[grp == "Line:Pseudotime"]$P < 0.05/length(c(variables, interaction_variables)) & !is.na(icc_interaction[grp == "Line:Pseudotime"]$P)){ + print("interaction significant:") + print(icc_interaction) + + model_loo_interaction_updated <- model_loo_interaction + + updated_model_interaction <- as.formula(paste0("Expression ~ 1 + (1|", paste0(c(variables, interaction_variables), collapse = ") + (1|"), ")")) + + model_loo_interaction_updated[["all"]] <- suppress_warnings(glmmTMB(formula = noquote(updated_model_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse") + + ### Calculate the variance explained by each of the included variables ### + icc_interaction <- icc_glmmtmb(model_loo_interaction_updated[["all"]]) + + + ### Recalculate significance ### + icc_interaction$P <- as.numeric(NA) + icc_interaction$gene <- gene + + for (variable in c(variables, interaction_variables)){ + print(variable) + # if (length(c(interaction_variables)) > 1){ + model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(c(variables, interaction_variables)[!c(variables, interaction_variables) %in% variable], collapse = ") + (1|"), ")")) + # } else { + # model <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")")) + # } + model_loo_interaction_updated[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse") + icc_interaction[grp == variable]$P <- anova(model_loo_interaction_updated[[variable]], model_loo_interaction_updated[["all"]])$`Pr(>Chisq)`[2] + } + + + plot <- ggplot(df_hier_unscale, aes(Pseudotime, Expression, color = Line)) + + geom_point(alpha = 0.3) + + # geom_smooth(method = "lm", se = TRUE) + + geom_smooth(se = TRUE) + + theme_classic() + + scale_color_manual(values = line_colors) + + ggsave(plot, filename = paste0(plot_outdir, gene,".png"), width = 5, height = 4) + + plot_facet <- ggplot(df_hier_unscale, aes(Pseudotime, Expression, color = Line)) + + geom_point(alpha = 0.3) + + facet_wrap(vars(paste0(Site, " ", Cryopreserved)), ncol = 1, scales = "free_y") + + geom_smooth(se = TRUE) + + # geom_smooth(method = "lm", se = TRUE) + + theme_classic() + + scale_color_manual(values = line_colors) + + ggsave(plot_facet, filename = paste0(plot_outdir, gene,"_site_facet.png"), width = 4, height = 7) + + + saveRDS(icc_interaction, paste0(icc_interaction_outdir, gene, "_icc.rds"), compress = TRUE) + + + model_interaction_effects_formula <- as.formula(paste0("Expression ~ ", paste0(c(variables, interaction_variables), collapse = " + "))) + model_interaction_effects <- suppress_warnings(glmmTMB(formula = noquote(model_interaction_effects_formula), data = df_hier_unscale, REML = TRUE), "giveCsparse") + + dt <- data.table(grp = rownames(coef(summary(model_interaction_effects))$cond), Effect = coef(summary(model_interaction_effects))$cond[,"Estimate"], P = coef(summary(model_interaction_effects))$cond[,"Pr(>|z|)"]) + + saveRDS(dt, paste0(effects_outdir, gene, "_effects.rds"), compress = TRUE) + } + } + } + } + } else { + icc <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character()) + } +} +saveRDS(icc, paste0(icc_outdir, gene, "_icc.rds"), compress = TRUE) - ### Test for interactions -} diff --git a/Variance/RNAvelocity/post_review/pseudotime_effect.smk b/Variance/RNAvelocity/post_review/pseudotime_effect.smk new file mode 100644 index 0000000..939eba1 --- /dev/null +++ b/Variance/RNAvelocity/post_review/pseudotime_effect.smk @@ -0,0 +1,41 @@ +import pandas as pd + + +genes_file = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Variance/RNAvelocity/post_review/data/seurat_integrated_all_times_clustered_1pct_expressing_pseudotime.tsv" +genes = pd.read_csv(genes_file, sep = "\t") +# genes = genes.iloc[1:1000] + + +rule all: + input: + expand( "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Variance/RNAvelocity/post_review/results/gene_separated/icc/{gene}_icc.rds", gene = genes.Gene) + + +rule partition_variance: + input: + seurat = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Variance/RNAvelocity/post_review/data/seurat_integrated_all_times_clustered_1pct_expressing_pseudotime.rds" + output: + "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Variance/RNAvelocity/post_review/results/gene_separated/icc/{gene}_icc.rds" + resources: + mem_per_thread_gb = lambda wildcards, attempt: attempt * 64, + disk_per_thread_gb = lambda wildcards, attempt: attempt * 64 + threads: 1 + params: + script = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/Variance/RNAvelocity/post_review/pseudotime_effect.R", + out_icc="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Variance/RNAvelocity/post_review/results/gene_separated/icc/", + out_icc_interaction = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Variance/RNAvelocity/post_review/results/gene_separated/icc_interaction/", + out_model_interaction = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Variance/RNAvelocity/post_review/results/gene_separated/icc_interaction/", + out_plot = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Variance/RNAvelocity/post_review/results/gene_separated/plots/", + out_effects = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Variance/RNAvelocity/post_review/results/gene_separated/effect_betas/" + log: + shell: + """ + mkdir -p {params.out_icc_interaction} + mkdir -p {params.out_model_interaction} + mkdir -p {params.out_plot} + mkdir -p {params.out_effects} + + /directflow/SCCGGroupShare/projects/DrewNeavin/software/anaconda3/envs/baseR402/bin/Rscript {params.script} {params.out_icc_interaction} {params.out_icc} {params.out_plot} {wildcards.gene} {params.out_effects} + """ + + diff --git a/Variance/RNAvelocity/post_review/pseudotime_effect_combine.R b/Variance/RNAvelocity/post_review/pseudotime_effect_combine.R new file mode 100644 index 0000000..b5733f8 --- /dev/null +++ b/Variance/RNAvelocity/post_review/pseudotime_effect_combine.R @@ -0,0 +1,412 @@ +library(tidyverse) +library(ggplot2) +library(ggrepel) +library(data.table) +library(Seurat) +library(viridis) +library(colorspace) +library(RColorBrewer) +library(ggsignif) + + +dir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/" +datadir <- paste0(dir,"output/Variance/RNAvelocity/post_review/results/gene_separated/") +icc_dir <- paste0(datadir,"icc/") +icc_interaction_dir <- paste0(datadir,"icc_interaction/") +effect_interaction_dir <- paste0(datadir,"effect_betas/") +outdir <- paste0(dir,"output/Variance/RNAvelocity/post_review/results/combined/") +dir.create(outdir) + + + + +save_figs <- function(plot, basename, width = 17, height = 17, units = "cm"){ + ggsave(plot, filename = paste0(basename,".png"), height = height, width = width, units = units) + ggsave(plot, filename = paste0(basename,".pdf"), height = height, width = width, units = units) + ggsave(plot, filename = paste0(basename,".eps"), height = height, width = width, units = units) +} + + + +##### Set up colors ##### +variable_colors <- c(Village = "#a186aa", Replicate = "#f2c1ce", Line = "#4734a9") +line_colors <- c(FSA0006 = "#F79E29", MBE1006 = "#9B2C99", TOB0421 = "#35369C") + + +site_updates <- c("Brisbane" = "Site 1", "Melbourne" = "Site 2" ,"Sydney" = "Site 3") + + +##### Add gene IDs for easy identification downstream ##### +GeneConversion1 <- read_delim("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/data/Expression_200128_A00152_0196_BH3HNFDSXY/GE/DRENEA_1/outs/filtered_feature_bc_matrix/features.tsv.gz", col_names = F, delim = "\t") +GeneConversion2 <- read_delim("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/data/Expression_200128_A00152_0196_BH3HNFDSXY/GE/Village_B_1_week/outs/filtered_feature_bc_matrix/features.tsv.gz", col_names = F, delim = "\t") + +GeneConversion <- unique(rbind(GeneConversion1, GeneConversion2)) +GeneConversion <- GeneConversion[!duplicated(GeneConversion$X1),] +GeneConversion$X3 <- NULL +colnames(GeneConversion) <- c("ENSG_ID", "Gene_ID") + + + + +if (!file.exists(paste0(outdir,"variance_partitionin_df.tsv"))){ + ##### Get list of files ##### + icc_files <- list.files(icc_dir) + icc_interaction_files <- list.files(icc_interaction_dir) + effect_interaction_files <- list.files(effect_interaction_dir) + + + ##### Read in the files ##### + icc_list <- lapply(icc_files, function(icc){ + readRDS(paste0(icc_dir,icc)) + }) + names(icc_list) <- gsub("_icc.rds", "", icc_files) + + + icc_interaction_list <- lapply(icc_interaction_files, function(icc){ + readRDS(paste0(icc_interaction_dir,icc)) + }) + names(icc_interaction_list) <- gsub("_icc.rds", "", icc_interaction_files) + + + effect_interaction_list <- lapply(effect_interaction_files, function(icc){ + readRDS(paste0(effect_interaction_dir,icc)) + }) + names(effect_interaction_list) <- gsub("_effects.rds", "", effect_interaction_files) + + + ##### Make dataframe of effect sizes from models ##### + effect_interaction_dt_list <- lapply(effect_interaction_list, function(x){ + tmp <- data.table(grp=character(), gene=character(), effect=numeric()) + + }) + + + ##### Combine fits Results ##### + fits_df <- do.call(rbind, fits) + + fits_df$Location <- gsub("_ENSG.+", "",rownames(fits_df)) + + write_delim(fits_df, paste0(outdir,"variance_partitionin_df.tsv"), delim = "\t") + +} else { + fits_df <- read_delim(paste0(outdir,"variance_partitionin_df.tsv"), delim = "\t") +} + + +### Read in the seurat data objects +seurat_list <- list() +for (location in unique(fits_df$Location)){ + seurat_list[[location]] <- readRDS(paste0("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Variance/RNAvelocity/nb_partitioning/data/",location,"_SCT_seurat_1pct.rds")) +} + +### Make dataframe of location x number for faulty models ### +faulty_df <- data.table(Location = gsub("_nb\\.o.+", "", faulty$V1), Number = gsub(".+nb\\.o\\d+\\.", "", faulty$V1)) + +## Add gene id to the faulty dataframe +faulty_df$ENSG <- NA +for (row in 1:nrow(faulty_df)){ + print(row) + faulty_df$ENSG[row] <- rownames(seurat_list[[faulty_df$Location[row]]])[as.numeric(faulty_df$Number[row])] +} + + +### Remove the faulty genes ### +fits_df <- setDT(fits_df)[!faulty_df, on = c("Location", "Gene" = "ENSG")] + + + +##### Make long dataframe for plotting ##### +fits_df_long <- pivot_longer(fits_df, cols = c("Line", "Village", "Replicate"), names_to = "Covariate", values_to = "Variance_Explained") +fits_df_long$Variance_Explained <- round(fits_df_long$Variance_Explained,6) + +fits_df_long <- left_join(fits_df_long, GeneConversion, by = c("Gene" = "ENSG_ID")) + + + + +##### Make some figures!!! ##### +### FRESH ### +pTotal_Cont <- ggplot(fits_df_long[which(fits_df_long$Location != "Sydney_Cryopreserved"),], aes(Variance_Explained*100, color = Covariate)) + + geom_density() + + facet_grid(vars(Location)) + + theme_classic() + + # scale_y_continuous(trans = "log10") + + scale_x_continuous(trans = "log10") + + scale_color_manual(values = variable_colors) + + geom_vline(xintercept = 1, linetype="dashed") + +save_figs(pTotal_Cont, paste0(outdir, "Total_Contribution_Histogram_cov")) + + +table(subset(fits_df_long[which(fits_df_long$Location != "Sydney_Cryopreserved"),], (Variance_Explained > 0.90))$Gene_ID) +subset(fits_df_long[which(fits_df_long$Location != "Sydney_Cryopreserved"),], (Variance_Explained > 0.90)) +fits_df_long[grepl("XIST", fits_df_long$Gene_ID),] +fits_df_long[grepl("MT-ATP6", fits_df_long$Gene_ID),] + +table(subset(fits_df_long[which(fits_df_long$Location != "Sydney_Cryopreserved" & fits_df_long$Covariate == "Village"),], (Variance_Explained > 0.46))$Gene_ID) + +for (location in names(site_updates)){ + fits_df_long$Location <- gsub(location, site_updates[location], fits_df_long$Location) +} + +fits_df_long$Location <- gsub("Cryo", "\nCryo", fits_df_long$Location) + +fits_df_long$Quartile <- gsub(".+_Q", "Q", fits_df_long$Location) +fits_df_long$Location <- gsub("_Q\\d", "", fits_df_long$Location) +fits_df_long$Location <- gsub("_", "", fits_df_long$Location) + +pVar_Explained_box <- ggplot(fits_df_long, aes(x = Covariate, y = Variance_Explained*100, color = Covariate)) + + geom_boxplot(outlier.size = 0.25) + + facet_grid(Location ~ Quartile) + + scale_color_manual(values = variable_colors) + + theme_classic() + + ylab("Percent Gene Expression Variance Explained") + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +save_figs(pVar_Explained_box, paste0(outdir, "Total_Contribution_Boxplot_cov"), width = 20, height = 12) + + + +fits_df_long[which(fits_df_long$Variance_Explained > 0.5 & fits_df_long$Covariate == "Village"),] +table(fits_df_long[which(fits_df_long$Variance_Explained > 0.4 & fits_df_long$Covariate == "Village"),]$Gene) +table(fits_df_long[which(fits_df_long$Variance_Explained > 0.5 & fits_df_long$Covariate == "Line"),]$Gene) +head(fits_df_long[which(fits_df_long$Covariate == "Village"),][rev(order(na.omit(fits_df_long[which(fits_df_long$Covariate == "Village"),"Variance_Explained"]))),]) + + + + +### Variance explained correlation between sites ### +fits_df_long$Location_Quartile <- paste0(fits_df_long$Location, "-", fits_df_long$Quartile) +fits_df_long <- data.table(fits_df_long) + +fwrite(fits_df_long, sep = "\t", paste0(outdir, "var_explained_long.tsv")) + +### Subset +fits_df_long_line <- fits_df_long[Covariate == "Line"] +fits_df_long_rep <- fits_df_long[Covariate == "Replicate"] +fits_df_long_village <- fits_df_long[Covariate == "Village"] + + +### identify the combinatinons to use +pairs_df <- as.data.table(t(combn(sort(unique(fits_df_long$Location_Quartile)), 2, simplify = TRUE))) +colnames(pairs_df) <- c("Location_Quartile.x", "Location_Quartile.y") + + +### Self Join ### +fits_df_long_line_joined <- merge(pairs_df, fits_df_long_line, by.x = "Location_Quartile.x", by.y = "Location_Quartile", allow.cartesian=TRUE) +fits_df_long_line_joined <- merge(fits_df_long_line_joined, fits_df_long_line, by.x = c("Location_Quartile.y", "Gene", "Covariate", "Gene_ID"), by.y = c("Location_Quartile", "Gene", "Covariate", "Gene_ID"), allow.cartesian=TRUE) +fits_df_long_line_joined <- na.omit(fits_df_long_line_joined) + +fits_df_long_rep_joined <- merge(pairs_df, fits_df_long_rep, by.x = "Location_Quartile.x", by.y = "Location_Quartile", allow.cartesian=TRUE) +fits_df_long_rep_joined <- merge(fits_df_long_rep_joined, fits_df_long_rep, by.x = c("Location_Quartile.y", "Gene", "Covariate", "Gene_ID"), by.y = c("Location_Quartile", "Gene", "Covariate", "Gene_ID"), allow.cartesian=TRUE) +fits_df_long_rep_joined <- na.omit(fits_df_long_rep_joined) + +fits_df_long_village_joined <- merge(pairs_df, fits_df_long_village, by.x = "Location_Quartile.x", by.y = "Location_Quartile", allow.cartesian=TRUE) +fits_df_long_village_joined <- merge(fits_df_long_village_joined, fits_df_long_village, by.x = c("Location_Quartile.y", "Gene", "Covariate", "Gene_ID"), by.y = c("Location_Quartile", "Gene", "Covariate", "Gene_ID"), allow.cartesian=TRUE) +fits_df_long_village_joined <- na.omit(fits_df_long_village_joined) + + +### Remove the ones that are merges between the same ### +fits_df_long_line_joined <- fits_df_long_line_joined[which(fits_df_long_line_joined$Location_Quartile.x != fits_df_long_line_joined$Location_Quartile.y),] +fits_df_long_rep_joined <- fits_df_long_rep_joined[which(fits_df_long_rep_joined$Location_Quartile.x != fits_df_long_rep_joined$Location_Quartile.y),] +fits_df_long_village_joined <- fits_df_long_village_joined[which(fits_df_long_village_joined$Location_Quartile.x != fits_df_long_village_joined$Location_Quartile.y),] + + +fits_df_long_joined_list <- list(fits_df_long_line_joined, fits_df_long_rep_joined, fits_df_long_village_joined) +names(fits_df_long_joined_list) <- c("Line", "Replicate", "Village") + + +# ################ +# lapply(fits_df_long_joined_list, function(x){ +# x$Location <- gsub("Site 3", "Site 4", x$Location) %>% gsub("Site 2", "Site 3", .) %>% gsub("Site 4", "Site 3", .) +# x$Location_Quartile.x <- gsub("Site 3", "Site 4", x$Location_Quartile.x) %>% gsub("Site 2", "Site 3", .) %>% gsub("Site 4", "Site 3", .) +# x$Location_Quartile.y <- gsub("Site 3", "Site 4", x$Location_Quartile.y) %>% gsub("Site 2", "Site 3", .) %>% gsub("Site 4", "Site 3", .) +# return(x) +# }) + +# ################ + + + +saveRDS(fits_df_long_joined_list, paste0(outdir, "variance_explained_long_list.rds")) +fits_df_long_joined_list <- readRDS(paste0(outdir, "variance_explained_long_list.rds")) + +Rsquared_list <- lapply(names(fits_df_long_joined_list), function(x){ + tmp <- pairs_df + tmp$Covariate <- x + tmp$Rsquared <- NA + return(data.table(tmp)) +}) +names(Rsquared_list) <- names(fits_df_long_joined_list) + + + +Rsquared_list <- lapply(names(Rsquared_list), function(x){ + print(x) + for (row in 1:nrow(Rsquared_list[[x]])){ + print(row) + Rsquared_list[[x]]$Rsquared[row] <- round(summary(lm(Variance_Explained.y ~ Variance_Explained.x, fits_df_long_joined_list[[x]][Location_Quartile.x == Rsquared_list[[x]]$Location_Quartile.x[row] & Location_Quartile.y == Rsquared_list[[x]]$Location_Quartile.y[row],]))$r.squared,2) + # Rsquared_list[[x]]$pearson[row] <- round(cor(fits_df_long_joined_list[[x]][which(fits_df_long_joined_list[[x]]$Location_Quartile.x == Rsquared_list[[x]]$Location_Quartile.x[row] & + # fits_df_long_joined_list[[x]]$Location_Quartile.y == Rsquared_list[[x]]$Location_Quartile.y[row]),]$Variance_Explained.y, + # fits_df_long_joined_list[[x]][which(fits_df_long_joined_list[[x]]$Location_Quartile.x == Rsquared_list[[x]]$Location_Quartile.x[row] & + # fits_df_long_joined_list[[x]]$Location_Quartile.y == Rsquared_list[[x]]$Location_Quartile.y[row]),]$Variance_Explained.x, use="complete.obs"), 2) + } + + Rsquared_list[[x]] <- na.omit(Rsquared_list[[x]]) + return(Rsquared_list[[x]]) +}) +names(Rsquared_list) <- names(fits_df_long_joined_list) + +saveRDS(Rsquared_list, paste0(outdir, "R_squared_list.rds")) + + + + +fits_df_long_joined_list <- readRDS(paste0(outdir, "variance_explained_long_list.rds")) +Rsquared_list <- readRDS(paste0(outdir, "R_squared_list.rds")) + + +Rsquared_list <- lapply(Rsquared_list, function(x){ + x$Location_Quartile.x <- gsub("Site 3", "Site 4", x$Location_Quartile.x) %>% gsub("Site 2", "Site 3", .) %>% gsub("Site 4", "Site 2", .) %>% gsub("Cryo", "\nCryo", .) + x$Location_Quartile.y <- gsub("Site 3", "Site 4", x$Location_Quartile.y) %>% gsub("Site 2", "Site 3", .) %>% gsub("Site 4", "Site 2", .) %>% gsub("Cryo", "\nCryo", .) + return(x) +}) + + +pCorr_tile <- ggplot(Rsquared_list[["Line"]], aes(Location_Quartile.x, Location_Quartile.y, fill = Rsquared)) + + geom_tile() + + theme_classic() + + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + + scale_fill_continuous_sequential(palette = "Blues") +ggsave(pCorr_tile, filename = paste0(outdir,"Rsquared_heatmap.png"), height = 5.85) + + + +##### Make lists that are only same site ##### +Rsquared_list_subset <- lapply(Rsquared_list, function(x){ + tmp <- list() + for (location in unique(gsub("-.+", "", unique(c(x$Location_Quartile.x, x$Location_Quartile.y))))){ + tmp[[location]] <- x[grepl(paste0(location, "-"), x$Location_Quartile.x) & grepl(paste0(location, "-"), Location_Quartile.y)] + tmp[[location]]$Location <- location + tmp[[location]]$Quintile.x <- gsub(".+-", "", tmp[[location]]$Location_Quartile.x) + tmp[[location]]$Quintile.y <- gsub(".+-", "", tmp[[location]]$Location_Quartile.y) + } + tmp2 <- do.call(rbind, tmp) + return(tmp2) +}) + + +pCorr_tile <- ggplot(Rsquared_list_subset[["Line"]], aes(Quintile.x, Quintile.y, fill = Rsquared)) + + geom_tile() + + theme_classic() + + facet_wrap(vars(Location), nrow = 2) + + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + + scale_fill_continuous_sequential(palette = "Purples") + + xlab("Quintile") + + ylab("Quintile") + + labs(fill = "Line\nR^2") +ggsave(pCorr_tile, filename = paste0(outdir,"Rsquared_heatmap_location_facet.png"), height = 3.25, width = 3.5) +ggsave(pCorr_tile, filename = paste0(outdir,"Rsquared_heatmap_location_facet.pdf"), height = 3.25, width = 3.5) + + +pCorr_Village<- ggplot(Rsquared_list_subset[["Village"]], aes(Quintile.x, Quintile.y, fill = Rsquared)) + + geom_tile() + + theme_classic() + + facet_wrap(vars(Location), nrow = 2) + + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + + scale_fill_continuous_sequential(palette = "Blues") + + xlab("Quintile") + + ylab("Quintile") + + labs(fill = "Village\nR^2") +ggsave(pCorr_Village, filename = paste0(outdir,"Rsquared_heatmap_village_facet.png"), height = 3.25, width = 3.5) +ggsave(pCorr_Village, filename = paste0(outdir,"Rsquared_heatmap_village_facet.pdf"), height = 3.25, width = 3.5) + + + + + +##### Make lists that are only same quintile ##### +fits_df_long <- fread(sep = "\t", paste0(outdir, "var_explained_long.tsv")) + + +Rsquared_list_subset_quint <- lapply(Rsquared_list, function(x){ + tmp <- list() + for (quintile in unique(fits_df_long$Quartile)){ + tmp[[quintile]] <- x[grepl(paste0("-", quintile), x$Location_Quartile.x) & grepl(paste0("-", quintile), x$Location_Quartile.y)] + tmp[[quintile]]$Quintile <- quintile + tmp[[quintile]]$Location.x <- gsub("-.+", "", tmp[[quintile]]$Location_Quartile.x) + tmp[[quintile]]$Location.y <- gsub("-.+", "", tmp[[quintile]]$Location_Quartile.y) + } + tmp2 <- do.call(rbind, tmp) + return(tmp2) +}) + +bluecols <- colorRampPalette(brewer.pal(9, "Blues")) +purplecols <- colorRampPalette(brewer.pal(9, "Purples")) + +pCorr_Line_quant <- ggplot(Rsquared_list_subset_quint[["Line"]], aes(Location.x, Location.y, fill = Rsquared)) + + geom_tile() + + theme_classic() + + facet_wrap(vars(Quintile), nrow = 1) + + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + + scale_fill_gradientn(colours = purplecols(100), limits=c(0, 1)) + # scale_fill_continuous_sequential(palette = "Purples") +ggsave(pCorr_Line_quant, filename = paste0(outdir,"Rsquared_heatmap_line_facet_quint.png"), height = 2.5, width = 6.5) + +pCorr_Village_quant <- ggplot(Rsquared_list_subset_quint[["Village"]], aes(Location.x, Location.y, fill = Rsquared)) + + geom_tile() + + theme_classic() + + facet_wrap(vars(Quintile), nrow = 1) + + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + + scale_fill_gradientn(colours = bluecols(100), limits=c(0, 1)) + # scale_fill_continuous_sequential(palette = "Blues", begin = 0, end = 1) +ggsave(pCorr_Village_quant, filename = paste0(outdir,"Rsquared_heatmap_village_facet_quint.png"), height = 2.5, width = 6.5) + + + +pCorr_Location_Point_Line <- ggplot(fits_df_long_joined_list[["Line"]], aes(Variance_Explained.x*100, Variance_Explained.y*100, color = Covariate)) + + geom_point(size = 0.5, alpha = 0.5) + + facet_grid(rows = vars(Location.y), cols = vars(Location.x)) + + scale_color_manual(values = variable_colors[["Line"]]) + + theme_classic() + + ylim(0,110)+ + # geom_text(size = 2.5, x = 0, y = 110, aes(label = paste0("R^2 = ", Rsquared)), data = Rsquared_list[["Line"]], hjust = 0) + + xlab("Percent Gene Expression Variance Explained") + + ylab("Percent Gene Expression Variance Explained") +save_figs(pCorr_Location_Point_Line, paste0(outdir, "Correlation_Point_Line"), width = 50, height = 5*7.25) + + +pCorr_Location_Point_Village <- ggplot(fits_df_joined[which(fits_df_joined$Covariate == "Village" & fits_df_joined$Location.x != "Site 2_Cryopreserved" & fits_df_joined$Location.y != "Site 2_Cryopreserved"),], aes(Variance_Explained.x*100, Variance_Explained.y*100, color = Covariate)) + + geom_point(size = 0.5, alpha = 0.5) + + facet_grid(rows = vars(Location.y), cols = vars(Location.x)) + + scale_color_manual(values = variable_colors[["Village"]]) + + theme_classic() + + ylim(0,70)+ + geom_text(size = 2.5, x = 0, y = 70, aes(label = paste0("R^2 = ", Rsquared)), data = Rsquared[which(Rsquared$Covariate == "Village" & Rsquared$Location.x != "Site 2_Cryopreserved" & Rsquared$Location.y != "Site 2_Cryopreserved"),], hjust = 0) + + xlab("Percent Gene Expression\nVariance Explained") + + ylab("Percent Gene Expression\nVariance Explained") + + geom_text_repel(size = 2.5, data = subset(fits_df_joined[which(fits_df_joined$Location.x != "Site 2_Cryopreserved" & fits_df_joined$Location.y != "Site 2_Cryopreserved"),], + (Gene_ID %in% c("MT-ATP6", "MT-ND1") & Covariate == "Village")), + aes(label = Gene_ID), box.padding =1) +save_figs(pCorr_Location_Point_Village, paste0(outdir, "Correlation_Point_Village"), width = 10, height = 7.25) + + + + +##### Make a point figure instead of heatmap for Correlation between sites for each quintile +Rsquared_list_subset <- do.call(rbind, Rsquared_list_subset_quint) + +pR2_point <- ggplot(Rsquared_list_subset[Covariate != "Replicate"], aes(Covariate, Rsquared, color = Quintile)) + + geom_jitter(width = 0.1, size = 0.75) + + theme_classic() + + ylab("R^2") + + scale_color_viridis(discrete = TRUE) + + geom_signif(comparisons = list(c("Line", "Village")), + map_signif_level=TRUE, y = 1, + test = "t.test", test.args=list(alternative = "two.sided", var.equal = FALSE, paired=TRUE)) + +ggsave(pR2_point, filename = paste0(outdir, "R2_scatter.png"), width = 2.5, height = 3) +ggsave(pR2_point, filename = paste0(outdir, "R2_scatter.pdf"), width = 2.5, height = 3) + + +t.test(Rsquared_list_subset[Covariate == "Village"]$Rsquared, Rsquared_list_subset[Covariate == "Line"]$Rsquared, alternative = "two.sided", var.equal = FALSE, paired=TRUE) + diff --git a/Variance/RNAvelocity/post_review/pseudotime_effect_snake.sh b/Variance/RNAvelocity/post_review/pseudotime_effect_snake.sh new file mode 100644 index 0000000..54f6b76 --- /dev/null +++ b/Variance/RNAvelocity/post_review/pseudotime_effect_snake.sh @@ -0,0 +1,135 @@ + +SNAKEFILE="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/Variance/RNAvelocity/post_review/pseudotime_effect.smk" +LOG="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Variance/RNAvelocity/post_review/logs/" + +mkdir -p $LOG + +cd /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/Variance/RNAvelocity/post_review + +snakemake \ + --snakefile $SNAKEFILE \ + --dryrun \ + --cores 1 \ + --quiet \ + --rerun-incomplete \ + --unlock + + +snakemake \ + --snakefile $SNAKEFILE \ + --dryrun \ + --cores 1 \ + --reason > jobs2run.txt + + + +nohup \ + snakemake \ + --snakefile $SNAKEFILE \ + --jobs 200 \ + --use-singularity \ + --restart-times 1 \ + --rerun-incomplete \ + --keep-going \ + --cluster \ + "qsub -S /bin/bash \ + -q short.q \ + -r yes \ + -pe smp {threads} \ + -l tmp_requested={resources.disk_per_thread_gb}G \ + -l mem_requested={resources.mem_per_thread_gb}G \ + -e $LOG \ + -o $LOG \ + -j y \ + -V" \ + > $LOG/nohup_`date +%Y-%m-%d.%H:%M:%S`.log & + + +### Without -pe smp +nohup \ + snakemake \ + --snakefile $SNAKEFILE \ + --jobs 200 \ + --use-singularity \ + --restart-times 1 \ + --rerun-incomplete \ + --keep-going \ + --cluster \ + "qsub -S /bin/bash \ + -q short.q \ + -r yes \ + -l tmp_requested={resources.disk_per_thread_gb}G \ + -l mem_requested={resources.mem_per_thread_gb}G \ + -e $LOG \ + -o $LOG \ + -j y \ + -V" \ + > $LOG/nohup_`date +%Y-%m-%d.%H:%M:%S`.log & + +34342 + +snakemake \ + --snakefile $SNAKEFILE \ + --dryrun \ + --cores 1 \ + --quiet \ + --unlock + + +snakemake \ + --snakefile $SNAKEFILE \ + --dryrun \ + --cores 1 \ + --quiet \ + --cleanup-metadata \ + /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/icc/ENSG00000116001_icc.rds \ + /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/fit_models/ENSG00000116001_fitted_models.rds \ + /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/icc/ENSG00000084112_icc.rds \ + /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/fit_models/ENSG00000084112_fitted_models.rds \ + /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/icc/ENSG00000130706_icc.rds \ + /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/fit_models/ENSG00000130706_fitted_models.rds + + + + + + +ENSG00000140481 +ENSG00000158552 +ENSG00000167004 +ENSG00000184281 +ENSG00000185437 +ENSG00000185875 +ENSG00000196109 +ENSG00000197536 +ENSG00000215910 +ENSG00000215883 +ENSG00000258297 +ENSG00000259488 +ENSG00000274877 + + +ENSG00000158555 +ENSG00000167005 +ENSG00000184292 +ENSG00000184281 +ENSG00000185437 +ENSG00000185875 +ENSG00000196109 +ENSG00000197536 +ENSG00000215866 +ENSG00000258289 +ENSG00000259485 +ENSG00000274828 + + +rm ENSG00000167011_fitted_models.rds +rm ENSG00000184304_fitted_models.rds +rm ENSG00000185477_fitted_models.rds +rm ENSG00000185900_fitted_models.rds +rm ENSG00000196132_fitted_models.rds +rm ENSG00000197566_fitted_models.rds +rm ENSG00000217930_fitted_models.rds +rm ENSG00000258593_fitted_models.rds +rm ENSG00000259715_fitted_models.rds +rm ENSG00000275074_fitted_models.rds diff --git a/Variance/post_review2/variance_partition_post_review_combine.R b/Variance/post_review2/variance_partition_post_review_combine.R index 5f9c3ce..ae6d13f 100644 --- a/Variance/post_review2/variance_partition_post_review_combine.R +++ b/Variance/post_review2/variance_partition_post_review_combine.R @@ -27,7 +27,7 @@ dir.create(outdir, recursive = TRUE) vars <- c("Line", "Village", "Cryopreserved", "Site", "Replicate","Line:Village", "Line:Cryopreserved", "Line:Site", "Village:Cryopreserved","Village:Site", "Replicate:Village", "Replicate:Line","Replicate:Cryopreserved", "Replicate:Site", "Residual") # var_colors <- c(colorRampPalette(brewer.pal(11, "Spectral"))(14), "grey90") -var_colors <- c("#4734a9", rev(c("#115cc7", "#a4c3c8", "#499090", "#405940", "#685a54", "#f7d312", "#f8bf33", "#e4910e", "#f65d19", "#931519", "#f2c1ce", "#e17aab", "#a186aa")), "gray90") +var_colors <- c("#4734a9", rev(c("#115cc7", "#a4c3c8", "#499090", "#405940", "#685a54", "#f7d312", "#f8bf33", "#e4910e", "#f65d19", "#931519", "#f2c1ce", "#e17aab", "#a186aa")), "gray80") # var_colors <- c("#a2104d", "#fde64b", "#3e90c1", "#358856", "#f5764e", "#685ba6", "#728d01", "#abd302", "#8b6248", "#6ec4aa", "gray90") names(var_colors) <- vars @@ -95,6 +95,9 @@ icc_interaction_dt$grp_size <- factor(icc_interaction_dt$grp_size, levels = uniq ### *** Need to add individual effects without interaction in to interaction dt *** ### icc_interaction_plus_dt <- rbind(icc_interaction_dt, icc_dt[!(gene %in% icc_interaction_dt$gene)]) +fwrite(icc_interaction_plus_dt, paste0(outdir, "effect_results.tsv"), sep = "\t") +icc_interaction_plus_dt <- fread(paste0(outdir, "effect_results.tsv"), sep = "\t") + group_size <- data.table(table(icc_interaction_plus_dt$grp)) colnames(group_size) <- c("grp", "size") @@ -209,20 +212,24 @@ icc_interaction_plus_dt$grp_size <- factor(icc_interaction_plus_dt$grp_size, lev unique(grep("Replicate:Site\nN ", icc_interaction_plus_dt$grp_size, value = TRUE)), unique(grep("Residual\nN = ", icc_interaction_plus_dt$grp_size, value = TRUE)))) -pRaincloud_interaction <- ggplot(icc_interaction_plus_dt, aes(x = percent, y = factor(grp_size, levels = rev(levels(grp_size))), fill = factor(grp, levels = rev(vars)))) + - geom_density_ridges(stat = "binline", bins = 90, scale = 0.7, draw_baseline = FALSE, aes(height =..ndensity..), alpha = 0.75) + - geom_boxplot(size = 0.5,width = .15, outlier.size = 0.25, position = position_nudge(y=-0.12), alpha = 0.75) + +pRaincloud_interaction <- ggplot(icc_interaction_plus_dt, aes(x = percent, y = factor(grp_size, levels = levels(grp_size)), fill = factor(grp, levels = rev(vars)))) + + # geom_density_ridges(size = 0.1,stat = "binline", bins = 100, scale = 0.7, draw_baseline = FALSE, aes(height =..ndensity..), alpha = 0.75) + + # geom_point(size =1,position = position_nudge(y=-0.1), shape = "|", aes(color = factor(grp, levels = rev(vars))), alpha = 0.75) + + geom_density_ridges(size = 0.1,stat = "binline", bins = 100, scale = 0.7, draw_baseline = FALSE, aes(height =..ndensity..)) + + geom_point(size =1, position = position_nudge(y=-0.11), shape = "|", aes(color = factor(grp, levels = rev(vars)))) + + # geom_boxplot(size = 0.1,width = .15, outlier.size = 0.1, position = position_nudge(y=-0.12), alpha = 0.75) + coord_cartesian(xlim = c(1.2, NA), clip = "off") + theme_classic() + theme(axis.title.y=element_blank()) + xlab("Percent Variance Explained") + scale_y_discrete(expand = c(0.03, 0)) + - scale_fill_manual(values = var_colors) + - geom_vline(xintercept = 1, linetype = "dashed", color = "red") + scale_fill_manual(values = var_colors, name = "Variable") + + scale_color_manual(values = var_colors, name = "Variable") + + geom_vline(xintercept = 1, lty="11", color = "grey50", size = 0.5) -ggsave(pRaincloud_interaction, filename = paste0(outdir, "variance_explained_raincloud_interaction.png"), height = 8, width = 7) -ggsave(pRaincloud_interaction, filename = paste0(outdir, "variance_explained_raincloud_interaction.pdf"), height = 8, width = 7) +ggsave(pRaincloud_interaction, filename = paste0(outdir, "variance_explained_raincloud_interaction.png"), height = 4, width = 7) +ggsave(pRaincloud_interaction, filename = paste0(outdir, "variance_explained_raincloud_interaction.pdf"), height = 4, width = 7) @@ -596,6 +603,108 @@ ggsave(pPluri_Genes_largest_Cont_eqtl, filename = paste0(outdir, "eQTL_Genes_Var +eqtls_icc_1pct_grouped_line_dt <- eqtls_icc_1pct_grouped_dt[gene %in% unique(eqtls_icc_1pct_grouped_dt[grp == "Line"]$gene)] + + +table(eqtls_icc_1pct_grouped_line_dt[gene %in% unique(eqtls_icc_1pct_grouped_dt[grp == "Line:Village"]$gene)]$grp) + + +length(unique(eqtls$gene)) ## number that has snp-gene pairs in deboever and our data 4112 +length(unique(eqtls_icc$gene)) ## 2337 +length(unique(eqtls_icc_1pct_grouped_dt$gene)) ### 944 +table(eqtls_icc_1pct_grouped_dt$grp) ## 909 sig in line +909/944 ### 0.9629237 + + +##### do for deboever as well ##### + +##### check for variance explained for eQTL genes (from Kilpinen et al) that are ##### +deboever_eqtls <- fread("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/deboever_imputed_overlapping_filtered_header_pruned_snp_gene.tsv", sep = "\t") + + +deboever_eqtls_icc <- icc_interaction_plus_dt[unique(deboever_eqtls[,"gene_id"]), on = c("gene" = "gene_id")] +deboever_eqtls_icc$grp <- factor(deboever_eqtls_icc$grp, levels = rev(vars)) +deboever_eqtls_icc <- deboever_eqtls_icc[data.table(gene = deboever_eqtls_icc[grp == "Residual"][order(percent)]$gene), on = "gene"] +deboever_eqtls_icc$Gene_ID <- factor(deboever_eqtls_icc$Gene_ID, levels = unique(deboever_eqtls_icc$Gene_ID)) + + +group_size_eqtl <- data.table(table(deboever_eqtls_icc$grp)) +colnames(group_size_eqtl) <- c("grp", "size") +group_size_eqtl$grp_size <- paste0(group_size_eqtl$grp, "\nN = ", group_size_eqtl$size) + +deboever_eqtls_icc <- group_size_eqtl[deboever_eqtls_icc, on = "grp"] +grp_size_order_eqtl <- c("Line\nN = 2371", "Village\nN = 1836", "Site\nN = 2461", "Replicate\nN = 869", "Line:Village\nN = 367", "Line:Site\nN = 911", "Village:Site\nN = 898","Replicate:Village\nN = 305", "Replicate:Line\nN = 25", "Replicate:Site\nN = 78", "Residual\nN = 2542") +deboever_eqtls_icc$grp_size <- factor(deboever_eqtls_icc$grp_size, levels = grp_size_order_eqtl) + + + +deboever_eqtls_icc_1pct <- deboever_eqtls_icc + + +for (ensg in unique(deboever_eqtls_icc$gene)){ + if (!any(deboever_eqtls_icc[gene == ensg & grp != "Residual"]$percent > 1)){ + deboever_eqtls_icc_1pct <- deboever_eqtls_icc_1pct[gene != ensg] + } +} + + + +deboever_eqtls_icc_1pct_grouped_list <- list() + + +for (ensg in unique(deboever_eqtls_icc_1pct$gene)){ + group <- deboever_eqtls_icc_1pct[gene == ensg & grp != "Residual"][which.max(percent)]$grp + deboever_eqtls_icc_1pct_grouped_list[[group]][[ensg]] <- deboever_eqtls_icc_1pct[gene == ensg] +} + +deboever_eqtls_icc_1pct_grouped <- lapply(deboever_eqtls_icc_1pct_grouped_list, function(x) do.call(rbind, x)) +deboever_eqtls_icc_1pct_grouped <- lapply(names(deboever_eqtls_icc_1pct_grouped), function(x){ + deboever_eqtls_icc_1pct_grouped[[x]]$largest_contributor <- x + return(deboever_eqtls_icc_1pct_grouped[[x]]) +}) + +deboever_eqtls_icc_1pct_grouped_dt <- do.call(rbind, deboever_eqtls_icc_1pct_grouped) + +deboever_eqtls_icc_1pct_grouped_dt$largest_contributor <- factor(deboever_eqtls_icc_1pct_grouped_dt$largest_contributor, levels = c("Line", "Village", "Site", "Line:Village", "Line:Site", "Village:Site", "Replicate:Village")) + + + +pPluri_Genes_largest_Cont_eqtl <- ggplot() + + geom_bar(data = deboever_eqtls_icc_1pct_grouped_dt, aes(Gene_ID, percent, fill = factor(grp, levels = rev(vars))), position = "stack", stat = "identity", alpha = 0.75) + + theme_classic() + + facet_grid(. ~ largest_contributor, scales = "free_x", space = "free_x") + + scale_fill_manual(values = var_colors) + + ylab("Percent Gene Expression Variance Explained") + + theme(axis.title.x=element_blank(), + axis.text.x = element_blank(), + panel.spacing.x=unit(0, "lines"), + axis.ticks.x = element_blank()) + + geom_hline(yintercept = 1, linetype = "dashed") + # scale_y_discrete(expand = c(0.03, 0)) + # scale_x_discrete(expand = c(0.03, 0)) + + +ggsave(pPluri_Genes_largest_Cont_eqtl, filename = paste0(outdir, "deboever_eQTL_Genes_Variance_Contributions_1pct_largest_cont.png"), width = 10, height = 4) +ggsave(pPluri_Genes_largest_Cont_eqtl, filename = paste0(outdir, "deboever_eQTL_Genes_Variance_Contributions_1pct_largest_cont.pdf"), width = 10, height = 4) + + + +deboever_eqtls_icc_1pct_grouped_line_dt <- deboever_eqtls_icc_1pct_grouped_dt[gene %in% unique(deboever_eqtls_icc_1pct_grouped_dt[grp == "Line"]$gene)] + + +table(deboever_eqtls_icc_1pct_grouped_line_dt[gene %in% unique(deboever_eqtls_icc_1pct_grouped_dt[grp == "Line:Village"]$gene)]$grp) + + +length(unique(deboever_eqtls$gene)) ## number that has snp-gene pairs in deboever and +length(unique(deboever_eqtls_icc$gene)) ## 2548 +length(unique(deboever_eqtls_icc_1pct_grouped_dt$gene)) ### 1189 +table(deboever_eqtls_icc_1pct_grouped_dt$grp) ## 1189 sig in line +length(unique(deboever_eqtls_icc_1pct_grouped_dt$gene))/table(deboever_eqtls_icc_1pct_grouped_dt$grp)["Line"] ### 1 + + + + + diff --git a/Variance/test_interaction_nb_model.R b/Variance/test_interaction_nb_model.R index 926966c..51e8ba4 100644 --- a/Variance/test_interaction_nb_model.R +++ b/Variance/test_interaction_nb_model.R @@ -14,6 +14,8 @@ library(lmerTest) library(tictoc) library(Rfast) library(sjstats) +library(car) +library(data.table) @@ -405,6 +407,89 @@ sum(icc_glmmtmb(model_glmmtmb_mt)$vcov) model_glmmtmb_short <- glmmTMB(Expression ~ (1 | Replicate) + (1 | Village) + (1 | Line) + (1 | Site), data = df_hier_unscale, REML = TRUE) model_glmmtmb_short_interaction <- glmmTMB(Expression ~ (1 | Replicate) + (1 | Village) + (1 | Line) + (1 | Site) + (1 | Village:Line), data = df_hier_unscale, REML = TRUE) -anova(model_glmmtmb_short, model_glmmtmb_short_interaction)$`Pr(>Chisq)` +anova(model_glmmtmb_short, model_glmmtmb_short_interaction)$`Pr(>Chisq)` ### the order of testing matters!!! -ranova_mt <- ranova(model_glmmtmb_mt) + + +# ranova_mt <- ranova(model_glmmtmb_mt) + + +##### Leave one out method ##### +variables <- c("Replicate", "Village", "Line", "Site", "Replicate:Village", "Replicate:Line", "Replicate:Site", "Village:Line", "Village:Site", "Line:Site") + +model_loo <- list() + +sig_dt <- data.table(var = variables, P = as.numeric(NA)) + +for (variable in variables){ + print(variable) + model <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")")) + model_loo[[variable]] <- glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE) + sig_dt[var == variable]$P <- anova(model_loo[[variable]], model_glmmtmb)$`Pr(>Chisq)`[2] +} + + +updated_variables <- sig_dt[P < 0.05/10]$var +updated_model <- as.formula(paste0("Expression ~ (1|", paste0(updated_variables, collapse = ") + (1|"), ")")) + +final_full_model <- glmmTMB(formula = noquote(updated_model), data = df_hier_unscale, REML = TRUE) + +icc_glmmtmb(final_full_model) + +model_loo_updated <- list() + +sig_dt_updated <- data.table(var = variables, P = as.numeric(NA)) + +for (variable in updated_variables){ + print(variable) + model <- as.formula(paste0("Expression ~ (1|", paste0(updated_variables[!updated_variables %in% variable], collapse = ") + (1|"), ")")) + model_loo_updated[[variable]] <- glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE) + sig_dt_updated[var == variable]$P <- anova(model_loo_updated[[variable]], final_full_model)$`Pr(>Chisq)`[2] +} + + +### Test mt gene +model_loo_mt <- list() + +sig_dt_mt <- data.table(var = variables, P = as.numeric(NA)) + +for (variable in variables){ + print(variable) + model <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")")) + model_loo_mt[[variable]] <- glmmTMB(formula = noquote(model), data = df_hier_unscale_mt, REML = TRUE) + sig_dt_mt[var == variable]$P <- anova(model_loo_mt[[variable]], model_glmmtmb_mt)$`Pr(>Chisq)`[2] +} + + +updated_variables_mt <- sig_dt_mt[P < 0.05/10]$var +updated_model_mt<- as.formula(paste0("Expression ~ (1|", paste0(updated_variables_mt, collapse = ") + (1|"), ")")) + +final_full_model_mt <- glmmTMB(formula = noquote(updated_model_mt), data = df_hier_unscale_mt, REML = TRUE) + +icc_glmmtmb(final_full_model_mt) + +model_loo_mt_updated <- list() + +sig_dt_mt_updated <- data.table(var = variables, P = as.numeric(NA)) + +for (variable in updated_variables){ + print(variable) + model <- as.formula(paste0("Expression ~ (1|", paste0(updated_variables[!updated_variables %in% variable], collapse = ") + (1|"), ")")) + model_loo_mt_updated[[variable]] <- glmmTMB(formula = noquote(model), data = df_hier_unscale_mt, REML = TRUE) + sig_dt_mt_updated[var == variable]$P <- anova(model_loo_mt_updated[[variable]], final_full_model)$`Pr(>Chisq)`[2] +} + + + +### Check if order of fixed effects impacts the residuals (will save to be used for eQTL detection) ### +model_fixed <- as.formula(paste0("Expression ~ ", paste0(variables, collapse = " + "))) +model_random <- as.formula(paste0("Expression ~ (1|", paste0(variables, collapse = ") + (1|"), ")")) +model_glmmtmb_fixed <- glmmTMB(model, data = df_hier_unscale_mt, REML = TRUE) +model_glmmtmb_random <- glmmTMB(model, data = df_hier_unscale_mt, REML = TRUE) + + +all(resid(model_glmmtmb_fixed) == resid(model_glmmtmb_random)) + + + +model_glmmtmb_mt <- glmmTMB(Expression ~ (1 | Replicate) + (1 | Village) + (1 | Line) + (1 | Site) + (1|Replicate:Village) + (1|Replicate:Line) + (1|Replicate:Site) + (1|Village:Line) + (1|Village:Site) + (1|Line:Site), data = df_hier_unscale_mt, REML = TRUE) diff --git a/eQTL_check/eQTL_village_interaction.sh b/eQTL_check/eQTL_village_interaction.sh new file mode 100644 index 0000000..dd3a06a --- /dev/null +++ b/eQTL_check/eQTL_village_interaction.sh @@ -0,0 +1,109 @@ + + +SNAKEFILE="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/eQTL_check/eQTL_village_interaction.smk" +LOG="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/eQTL_village_interaction/deboever/logs" + +mkdir -p $LOG + +snakemake \ + --snakefile $SNAKEFILE \ + --dryrun \ + --cores 1 \ + --quiet \ + --unlock + + +snakemake \ + --snakefile $SNAKEFILE \ + --dryrun \ + --cores 1 \ + --reason > jobs2run.txt + + + +nohup \ + snakemake \ + --snakefile $SNAKEFILE \ + --jobs 200 \ + --use-singularity \ + --restart-times 1 \ + --keep-going \ + --cluster \ + "qsub -S /bin/bash \ + -q short.q \ + -r yes \ + -pe smp {threads} \ + -l tmp_requested={resources.disk_per_thread_gb}G \ + -l mem_requested={resources.mem_per_thread_gb}G \ + -e $LOG \ + -o $LOG \ + -j y \ + -V" \ + > $LOG/nohup_`date +%Y-%m-%d.%H:%M:%S`.log & + + +snakemake \ + --snakefile $SNAKEFILE \ + --dryrun \ + --cores 1 \ + --quiet \ + --unlock + + +snakemake \ + --snakefile $SNAKEFILE \ + --dryrun \ + --cores 1 \ + --quiet \ + --cleanup-metadata \ + /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/icc/ENSG00000116001_icc.rds \ + /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/fit_models/ENSG00000116001_fitted_models.rds \ + /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/icc/ENSG00000084112_icc.rds \ + /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/fit_models/ENSG00000084112_fitted_models.rds \ + /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/icc/ENSG00000130706_icc.rds \ + /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/fit_models/ENSG00000130706_fitted_models.rds + + + + + + +ENSG00000140481 +ENSG00000158552 +ENSG00000167004 +ENSG00000184281 +ENSG00000185437 +ENSG00000185875 +ENSG00000196109 +ENSG00000197536 +ENSG00000215910 +ENSG00000215883 +ENSG00000258297 +ENSG00000259488 +ENSG00000274877 + + +ENSG00000158555 +ENSG00000167005 +ENSG00000184292 +ENSG00000184281 +ENSG00000185437 +ENSG00000185875 +ENSG00000196109 +ENSG00000197536 +ENSG00000215866 +ENSG00000258289 +ENSG00000259485 +ENSG00000274828 + + +rm ENSG00000167011_fitted_models.rds +rm ENSG00000184304_fitted_models.rds +rm ENSG00000185477_fitted_models.rds +rm ENSG00000185900_fitted_models.rds +rm ENSG00000196132_fitted_models.rds +rm ENSG00000197566_fitted_models.rds +rm ENSG00000217930_fitted_models.rds +rm ENSG00000258593_fitted_models.rds +rm ENSG00000259715_fitted_models.rds +rm ENSG00000275074_fitted_models.rds diff --git a/eQTL_check/eQTL_village_interaction.smk b/eQTL_check/eQTL_village_interaction.smk new file mode 100644 index 0000000..b37e717 --- /dev/null +++ b/eQTL_check/eQTL_village_interaction.smk @@ -0,0 +1,59 @@ +import pandas as pd + + +genes_file = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/deboever_imputed_overlapping_filtered_header_pruned_snp_gene.tsv" +genes_file_kilpinen = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/kilpinen_imputed_overlapping_filtered_header_pruned_snp_gene.tsv" +genes = pd.read_csv(genes_file, sep = "\t") +# genes = genes.iloc[1] +genes_kilpenen = pd.read_csv(genes_file_kilpinen, sep = "\t") +# genes_kilpenen = genes_kilpenen.iloc[1] + +rule all: + input: + expand("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/eQTL_village_interaction/deboever/results/{gene}_snpXvillage_interactions.tsv", gene = genes.gene_id), + expand("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/eQTL_village_interaction/kilpinen/results/{gene}_snpXvillage_interactions.tsv", gene = genes_kilpenen.gene_id), + + +rule interaction_deboever: + input: + bed = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/deboever_imputed_overlapping_filtered_header_pruned.bed" + output: + "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/eQTL_village_interaction/deboever/results/{gene}_snpXvillage_interactions.tsv", + resources: + mem_per_thread_gb = lambda wildcards, attempt: attempt * 8, + disk_per_thread_gb = lambda wildcards, attempt: attempt * 8 + threads: 4 + params: + script = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/eQTL_check/eQTL_village_interaction_deboever.R", + outdir="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/eQTL_village_interaction/deboever/results/", + snp=lambda wildcards: genes.ID_ref_alt[genes.gene_id == wildcards.gene], + datadir = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/eQTL_village_interaction/deboever/data/" + log: + shell: + """ + mkdir -p {params.datadir} + /directflow/SCCGGroupShare/projects/DrewNeavin/software/anaconda3/envs/baseR402/bin/Rscript {params.script} {wildcards.gene} {params.outdir} {input.bed} {params.datadir} + """ + + + +rule interaction_kilpinen: + input: + bed = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/kilpinen_imputed_overlapping_filtered_header_pruned.bed" + output: + "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/eQTL_village_interaction/kilpinen/results/{gene}_snpXvillage_interactions.tsv", + resources: + mem_per_thread_gb = lambda wildcards, attempt: attempt * 8, + disk_per_thread_gb = lambda wildcards, attempt: attempt * 8 + threads: 4 + params: + script = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/eQTL_check/eQTL_village_interaction_kilpinen.R", + outdir="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/eQTL_village_interaction/kilpinen/results/", + snp=lambda wildcards: genes.ID_ref_alt[genes.gene_id == wildcards.gene], + datadir = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/eQTL_village_interaction/kilpinen/data/" + log: + shell: + """ + mkdir -p {params.datadir} + /directflow/SCCGGroupShare/projects/DrewNeavin/software/anaconda3/envs/baseR402/bin/Rscript {params.script} {wildcards.gene} {params.outdir} {input.bed} {params.datadir} + """ diff --git a/eQTL_check/eQTL_village_interaction_combine.R b/eQTL_check/eQTL_village_interaction_combine.R new file mode 100644 index 0000000..0e7973c --- /dev/null +++ b/eQTL_check/eQTL_village_interaction_combine.R @@ -0,0 +1,357 @@ +library(data.table) +library(dplyr) +library(ggplot2) +library(colorspace) +library(DEGreport) +library(ggpp) + + +deboever_indir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/eQTL_village_interaction/deboever/results/" +kilpinen_indir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/eQTL_village_interaction/kilpinen/results/" +outdir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/eQTL_village_interaction/combined/" + +dir.create(outdir, recursive = TRUE) + + +##### Read in results ##### +### Deboever +deboever_files <- list.files(deboever_indir) + + +deboever_results_list <- lapply(deboever_files, function(x){ + tmp <- fread(paste0(deboever_indir, x), sep = "\t") + tmp$ensg <- gsub("_snpXvillage_interactions.tsv", "", x) + return(tmp) +}) + + + +deboever_results_dt <- do.call(rbind, deboever_results_list) +deboever_results_dt$snp_fdr <- p.adjust(deboever_results_dt$snp_p, method = "fdr") +deboever_results_dt$snpXvillage_fdr <- p.adjust(deboever_results_dt$snpXvillage_p, method = "fdr") +deboever_results_dt$study <- "DeBoever" + + +### Kilpinen +kilpinen_files <- list.files(kilpinen_indir) + +kilpinen_results_list <- lapply(kilpinen_files, function(x){ + tmp <- fread(paste0(kilpinen_indir, x), sep = "\t") + tmp$ensg <- gsub("_snpXvillage_interactions.tsv", "", x) + return(tmp) +}) + + +kilpinen_results_dt <- do.call(rbind, kilpinen_results_list) +kilpinen_results_dt$snp_fdr <- p.adjust(kilpinen_results_dt$snp_p, method = "fdr") +kilpinen_results_dt$snpXvillage_fdr <- p.adjust(kilpinen_results_dt$snpXvillage_p, method = "fdr") +kilpinen_results_dt$study <- "Kilpinen" + + +results_dt <- rbind(deboever_results_dt,kilpinen_results_dt) + + +results_dt[order(snp_fdr)] +results_dt[order(snpXvillage_fdr)] + +### See how many snps have all different genotypes for each cell line +results_dt[TOB0421 != FSA0006 & TOB0421 != MBE1006 & FSA0006 != MBE1006] ## 1108 total + +fwrite(results_dt, paste0(outdir, "snpXvillage_results.tsv"), sep = "\t") + + +results_dt$significant <- ifelse(results_dt$snpXvillage_fdr < 0.05, TRUE, FALSE) + +results_dt[ensg == "ENSG00000106153"] + +results_dt$snpXvillage_sig <- ifelse(results_dt$snpXvillage_fdr < 0.05, "significant", "not significant") + + +correlation <- list() +correlation_dt <- data.table(study = unique(results_dt$study), rho = as.numeric(NA), p = as.numeric(NA)) + +for (Study in results_dt$study){ + correlation[[Study]] <- cor.test(results_dt[study == Study]$snp_beta, results_dt[study == Study]$snpXvillage_beta, method = "spearman", exact = TRUE) + correlation_dt[study == Study, "rho"] <- correlation[[Study]]$estimate[1] + correlation_dt[study == Study, "p"] <- correlation[[Study]]$p.value +} + + +correlation_plot <- ggplot(results_dt[study == "DeBoever"]) + + geom_hline(yintercept=0) + + geom_vline(xintercept=0) + + geom_point(aes(snp_beta, snpXvillage_beta)) + + theme_classic() + + # facet_wrap(vars(study)) + + xlab("SNP Effect Size") + + ylab("SNP x Village Interaction Effect Size") + + geom_text(data = correlation_dt, aes(x = -3.85, y = 5.5, label = paste0("Rho = ",round(rho,2),"***"))) + # scale_color_discrete_sequential(palette = "ag_GrnYl", rev = FALSE) + + +ggsave(correlation_plot, filename = paste0(outdir, "correlation_plot.png"), height = 3, width = 5.5) + + + +### add number of genes in each quadrant +dim(results_dt[snp_beta < 0 & snpXvillage_beta < 0]) # 2216 +dim(results_dt[snp_beta > 0 & snpXvillage_beta > 0]) # 2103 + +dim(results_dt[snp_beta < 0 & snpXvillage_beta > 0]) # 1155 +dim(results_dt[snp_beta > 0 & snpXvillage_beta < 0]) # 1160 + + +dim(results_dt[snp_beta < 0 & snpXvillage_beta < 0 & study == "DeBoever"]) # 1670 +dim(results_dt[snp_beta > 0 & snpXvillage_beta > 0 & study == "DeBoever"]) # 1582 + +dim(results_dt[snp_beta < 0 & snpXvillage_beta > 0 & study == "DeBoever"]) # 546 +dim(results_dt[snp_beta > 0 & snpXvillage_beta < 0 & study == "DeBoever"]) # 586 + + +dim(results_dt[snp_beta < 0 & snpXvillage_beta < 0 & study == "Kilpinen"]) # 546 +dim(results_dt[snp_beta > 0 & snpXvillage_beta > 0 & study == "Kilpinen"]) # 521 + +dim(results_dt[snp_beta < 0 & snpXvillage_beta > 0 & study == "Kilpinen"]) # 609 +dim(results_dt[snp_beta > 0 & snpXvillage_beta < 0 & study == "Kilpinen"]) # 574 + + +### Sig +dim(results_dt[snp_beta < 0 & snpXvillage_beta < 0 & snpXvillage_sig == "significant"]) # 15 +dim(results_dt[snp_beta > 0 & snpXvillage_beta > 0 & snpXvillage_sig == "significant"]) # 18 + +dim(results_dt[snp_beta < 0 & snpXvillage_beta > 0 & snpXvillage_sig == "significant"]) # 67 +dim(results_dt[snp_beta > 0 & snpXvillage_beta < 0 & snpXvillage_sig == "significant"]) # 72 + + +dim(results_dt[snp_beta < 0 & snpXvillage_beta < 0 & study == "DeBoever" & snpXvillage_sig == "significant"]) # 13 +dim(results_dt[snp_beta > 0 & snpXvillage_beta > 0 & study == "DeBoever" & snpXvillage_sig == "significant"]) # 14 + +dim(results_dt[snp_beta < 0 & snpXvillage_beta > 0 & study == "DeBoever" & snpXvillage_sig == "significant"]) # 47 +dim(results_dt[snp_beta > 0 & snpXvillage_beta < 0 & study == "DeBoever" & snpXvillage_sig == "significant"]) # 46 + + +dim(results_dt[snp_beta < 0 & snpXvillage_beta < 0 & study == "Kilpinen" & snpXvillage_sig == "significant"]) # 2 +dim(results_dt[snp_beta > 0 & snpXvillage_beta > 0 & study == "Kilpinen" & snpXvillage_sig == "significant"]) # 4 + +dim(results_dt[snp_beta < 0 & snpXvillage_beta > 0 & study == "Kilpinen" & snpXvillage_sig == "significant"]) # 20 +dim(results_dt[snp_beta > 0 & snpXvillage_beta < 0 & study == "Kilpinen" & snpXvillage_sig == "significant"]) # 26 + + +### Filter by genes that matched direction in original dataset ### +effect_match_dt <- fread( "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/eQTL_overlap/combined/overlap_results.tsv") + +effect_match_dt_sig_match <- effect_match_dt[direction == "match" & fdr < 0.05] + + +results_dt_sig_match <- results_dt[effect_match_dt_sig_match[,c("study", "ensg", "ID_ref_alt")], on = c("study", "ensg", "ID_ref_alt")] + + + +correlation_sig_match <- list() +correlation_sig_match_dt <- data.table(study = unique(results_dt_sig_match$study), rho = as.numeric(NA), p = as.numeric(NA)) + +for (Study in results_dt_sig_match$study){ + correlation_sig_match[[Study]] <- cor.test(results_dt_sig_match[study == Study]$snp_beta, results_dt_sig_match[study == Study]$snpXvillage_beta, method = "spearman", exact = TRUE) + correlation_sig_match_dt[study == Study, "rho"] <- correlation_sig_match[[Study]]$estimate[1] + correlation_sig_match_dt[study == Study, "p"] <- correlation_sig_match[[Study]]$p.value +} + + +correlation_sig_match_plot <- ggplot(results_dt_sig_match[study == "DeBoever"], aes(snp_beta, snpXvillage_beta)) + + geom_hline(yintercept=0) + + geom_vline(xintercept=0) + + geom_point() + + theme_classic() + + # facet_wrap(vars(study)) + + xlab("SNP Effect Size") + + ylab("SNP*Village Interaction\nEffect Size") + + geom_text(data = correlation_sig_match_dt, aes(x = -4, y = 0.5, label = paste0("Rho = ",round(rho,2),"***"))) + + stat_quadrant_counts() + + xlim(-7,3) + + ylim(-7,3) + # scale_color_discrete_sequential(palette = "ag_GrnYl", rev = FALSE) + + +ggsave(correlation_sig_match_plot, filename = paste0(outdir, "correlation_sig_match_plot.png"), height = 2.5, width = 2.75) +ggsave(correlation_sig_match_plot, filename = paste0(outdir, "correlation_sig_match_plot.pdf"), height = 2.5, width = 2.75) + + + +### add number of genes in each quadrant +dim(results_dt_sig_match[snp_beta < 0 & snpXvillage_beta < 0]) # 2216 +dim(results_dt_sig_match[snp_beta > 0 & snpXvillage_beta > 0]) # 2103 + +dim(results_dt_sig_match[snp_beta < 0 & snpXvillage_beta > 0]) # 1155 +dim(results_dt_sig_match[snp_beta > 0 & snpXvillage_beta < 0]) # 1160 + + +dim(results_dt_sig_match[snp_beta < 0 & snpXvillage_beta < 0 & study == "DeBoever"]) # 1670 +dim(results_dt_sig_match[snp_beta > 0 & snpXvillage_beta > 0 & study == "DeBoever"]) # 1582 + +dim(results_dt_sig_match[snp_beta < 0 & snpXvillage_beta > 0 & study == "DeBoever"]) # 546 +dim(results_dt_sig_match[snp_beta > 0 & snpXvillage_beta < 0 & study == "DeBoever"]) # 586 + + +dim(results_dt_sig_match[snp_beta < 0 & snpXvillage_beta < 0 & study == "Kilpinen"]) # 546 +dim(results_dt_sig_match[snp_beta > 0 & snpXvillage_beta > 0 & study == "Kilpinen"]) # 521 + +dim(results_dt_sig_match[snp_beta < 0 & snpXvillage_beta > 0 & study == "Kilpinen"]) # 609 +dim(results_dt_sig_match[snp_beta > 0 & snpXvillage_beta < 0 & study == "Kilpinen"]) # 574 + + +### Sig +dim(results_dt_sig_match[snp_beta < 0 & snpXvillage_beta < 0 & snpXvillage_sig == "significant"]) # 15 +dim(results_dt_sig_match[snp_beta > 0 & snpXvillage_beta > 0 & snpXvillage_sig == "significant"]) # 18 + +dim(results_dt_sig_match[snp_beta < 0 & snpXvillage_beta > 0 & snpXvillage_sig == "significant"]) # 67 +dim(results_dt_sig_match[snp_beta > 0 & snpXvillage_beta < 0 & snpXvillage_sig == "significant"]) # 72 + + +dim(results_dt_sig_match[snp_beta < 0 & snpXvillage_beta < 0 & study == "DeBoever" & snpXvillage_sig == "significant"]) # 13 +dim(results_dt_sig_match[snp_beta > 0 & snpXvillage_beta > 0 & study == "DeBoever" & snpXvillage_sig == "significant"]) # 14 + +dim(results_dt_sig_match[snp_beta < 0 & snpXvillage_beta > 0 & study == "DeBoever" & snpXvillage_sig == "significant"]) # 47 +dim(results_dt_sig_match[snp_beta > 0 & snpXvillage_beta < 0 & study == "DeBoever" & snpXvillage_sig == "significant"]) # 46 + + +dim(results_dt_sig_match[snp_beta < 0 & snpXvillage_beta < 0 & study == "Kilpinen" & snpXvillage_sig == "significant"]) # 2 +dim(results_dt_sig_match[snp_beta > 0 & snpXvillage_beta > 0 & study == "Kilpinen" & snpXvillage_sig == "significant"]) # 4 + +dim(results_dt_sig_match[snp_beta < 0 & snpXvillage_beta > 0 & study == "Kilpinen" & snpXvillage_sig == "significant"]) # 20 +dim(results_dt_sig_match[snp_beta > 0 & snpXvillage_beta < 0 & study == "Kilpinen" & snpXvillage_sig == "significant"]) # 26 + + + + + + + + + + + + + + + +### Filter by original interaction significant genes ### +effects_dt <- fread("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review2/combined/effect_results.tsv") + +interaction_sig_genes <- unique(effects_dt[grp == "Line:Village", "gene"]) + +results_dt_subset <- results_dt[ensg %in% interaction_sig_genes$gene] + +correlation_plot_sub <- ggplot(results_dt_subset, aes(snp_beta, snpXvillage_beta, color = snpXvillage_sig)) + + geom_point() + + theme_classic() + + scale_color_discrete_sequential(palette = "ag_GrnYl", rev = FALSE) + + facet_wrap(vars(study)) + + geom_hline(yintercept=0) + + geom_vline(xintercept=0) + # scale_color_manual(values = c("grey","black")) + +ggsave(correlation_plot_sub, filename = paste0(outdir, "correlation_plot_var_sig.png"), height = 2.25, width = 5.5) + + + +### add number of genes in each quadrant +dim(results_dt_subset[snp_beta < 0 & snpXvillage_beta < 0]) # 202 +dim(results_dt_subset[snp_beta > 0 & snpXvillage_beta > 0]) # 232 + +dim(results_dt_subset[snp_beta < 0 & snpXvillage_beta > 0]) # 304 +dim(results_dt_subset[snp_beta > 0 & snpXvillage_beta < 0]) # 323 + + +dim(results_dt_subset[snp_beta < 0 & snpXvillage_beta < 0 & study == "DeBoever"]) # 144 +dim(results_dt_subset[snp_beta > 0 & snpXvillage_beta > 0 & study == "DeBoever"]) # 153 + +dim(results_dt_subset[snp_beta < 0 & snpXvillage_beta > 0 & study == "DeBoever"]) # 199 +dim(results_dt_subset[snp_beta > 0 & snpXvillage_beta < 0 & study == "DeBoever"]) # 219 + + +dim(results_dt_subset[snp_beta < 0 & snpXvillage_beta < 0 & study == "Kilpinen"]) # 58 +dim(results_dt_subset[snp_beta > 0 & snpXvillage_beta > 0 & study == "Kilpinen"]) # 79 + +dim(results_dt_subset[snp_beta < 0 & snpXvillage_beta > 0 & study == "Kilpinen"]) # 105 +dim(results_dt_subset[snp_beta > 0 & snpXvillage_beta < 0 & study == "Kilpinen"]) # 104 + + + +deboever_eqtls <- fread("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/deboever/combined/deboever_overlap_results.tsv", sep = "\t") +deboever_eqtls_dif_snp <- deboever_eqtls[direction == "different_snp"] + +results_dt_subset_correct_direction <- results_dt_subset[!(ID_ref_alt %in% deboever_eqtls_dif_snp$gene_id_deboever)] + + +deboever_results_dt[deboever_eqtls[, .(gene_id_deboever, ID_ref_alt)], on = c("ensg" = "gene_id_deboever", "ID_ref_alt")][ensg == "ENSG00000106153"] + + + +### add number of genes in each quadrant +dim(results_dt_subset_correct_direction[snp_beta < 0 & snpXvillage_beta < 0]) # 144 +dim(results_dt_subset_correct_direction[snp_beta > 0 & snpXvillage_beta > 0]) # 153 + +dim(results_dt_subset_correct_direction[snp_beta < 0 & snpXvillage_beta > 0]) # 199 +dim(results_dt_subset_correct_direction[snp_beta > 0 & snpXvillage_beta < 0]) # 219 + + +dim(results_dt_subset_correct_direction[snp_beta < 0 & snpXvillage_beta < 0 & study == "DeBoever"]) # 144 +dim(results_dt_subset_correct_direction[snp_beta > 0 & snpXvillage_beta > 0 & study == "DeBoever"]) # 153 + +dim(results_dt_subset_correct_direction[snp_beta < 0 & snpXvillage_beta > 0 & study == "DeBoever"]) # 199 +dim(results_dt_subset_correct_direction[snp_beta > 0 & snpXvillage_beta < 0 & study == "DeBoever"]) # 219 + + +dim(results_dt_subset_correct_direction[snp_beta < 0 & snpXvillage_beta < 0 & study == "Kilpinen"]) # 58 +dim(results_dt_subset_correct_direction[snp_beta > 0 & snpXvillage_beta > 0 & study == "Kilpinen"]) # 79 + +dim(results_dt_subset_correct_direction[snp_beta < 0 & snpXvillage_beta > 0 & study == "Kilpinen"]) # 105 +dim(results_dt_subset_correct_direction[snp_beta > 0 & snpXvillage_beta < 0 & study == "Kilpinen"]) # 104 + + + + + +histogram_plot <- ggplot(results_dt, aes(fdr, fill = direction)) + + geom_histogram(position = "stack", binwidth = 0.05) + + theme_classic() + + geom_vline(aes(xintercept=0.05), + color="red", linetype="dashed", size=1) + + scale_fill_manual(values = c("grey","black")) + +ggsave(histogram_plot, filename = paste0(outdir, "histogram_plot.png")) + +histogram_plot_log <- ggplot(results_dt, aes(log10(fdr), fill = direction)) + + geom_histogram(position = "stack", binwidth = 1) + + theme_classic() + + geom_vline(aes(xintercept=log10(0.05)), + color="red", linetype="dashed", size=0.25) + + scale_fill_manual(values = c("grey","black")) + + scale_y_continuous(expand = c(0, 0)) + + ylab("Number of Associations") + + labs(fill = "eQTL\nEffect\nDirection\nAgreement") + + +ggsave(histogram_plot_log, filename = paste0(outdir, "histogram_plot_log.png")) + + +popout_histogram_plot_log <- ggplot(results_dt[log10(fdr) < -10], aes(log10(fdr), fill = direction)) + + geom_histogram(position = "stack", binwidth = 1) + + theme_classic() + + scale_fill_manual(values = c("grey","black")) + + theme(axis.title.x=element_blank(), + axis.title.y=element_blank(), + legend.position="none", + axis.text.x=element_text(size=7), + axis.text.y=element_text(size=7)) + + scale_y_continuous(expand = c(0, 0)) + +ggsave(popout_histogram_plot_log, filename = paste0(outdir, "popout_histogram_plot_log.png")) + +combined_plot_log <- histogram_plot_log + annotation_custom(ggplotGrob(popout_histogram_plot_log), xmin = -35, xmax = -15, + ymin = 500, ymax = 1225) + +ggsave(combined_plot_log, filename = paste0(outdir, "histogram_plot_log_w_popout.png"), width = 5, height = 3) +ggsave(combined_plot_log, filename = paste0(outdir, "histogram_plot_log_w_popout.pdf"), width = 5, height = 3) + + + +### Calculate number of eQTLs and write in notion for writing paper time ### diff --git a/eQTL_check/eQTL_village_interaction_deboever.R b/eQTL_check/eQTL_village_interaction_deboever.R new file mode 100644 index 0000000..8899377 --- /dev/null +++ b/eQTL_check/eQTL_village_interaction_deboever.R @@ -0,0 +1,177 @@ +### Author: Drew neavin +### Date: 25 June, 2022 +### Rational: Reviewer asked if eQTL effects are consistently higher in a village so need to look at village*snp effect for eQTLs + + +##### Read in Libraries ##### +library(data.table) +library(tidyverse) +library(Seurat) +library(vcfR) +library(pkgcond) +library(glmmTMB) + + + +##### Set up directories ##### +args <- commandArgs(trailingOnly = TRUE) +ensg <- as.character(args[1]) +outdir <- args[2] +bed <- args[3] +datadir <- args[4] + + + +##### Read in data ##### +### significant effects for all genes dataframe ### +effects_dt <- fread("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review2/combined/effect_results.tsv") + +### Seurat object ### +seurat <- readRDS("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/seurat_integrated_noncryo_1pct_expressing.rds") + + +### SNP vcf ### +vcf <- read.vcfR("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/deboever_finalized_snps.recode.vcf") + + +##### Read in dataframe of pairs to test ##### +snp_gene_pairs <- fread(bed) +snp_gene_pairs$gene_id <- gsub("\\..+", "", snp_gene_pairs$gene_id) + + +##### Subset the correct snp ##### +snp_gene_pairs_subset <- snp_gene_pairs[gene_id == ensg] + + +##### Make model dataframe ##### +df_hier_unscale <- data.frame("Expression" = seurat[["SCT"]]@scale.data[ensg,], "Village" = as.factor(ifelse(seurat@meta.data$Time == "Baseline", 0, 1)), "Line" = seurat@meta.data$Final_Assignment, "Replicate" = as.factor(gsub("[A-Z][a-z]+", "", seurat@meta.data$MULTI_ID)), "Site" = seurat$Location) +colnames(df_hier_unscale)[1] <- "Expression" + + + + +##### Get the Correct Variables for Modeling ##### +variables <- unique(effects_dt[gene == ensg]$grp) +variables <- variables[!(variables %in% c("Line", "Line:Village", "Residual"))] + + + +##### Fit effects except line and village*line effects ###### +model_all <- as.formula(paste0("Expression ~ (1|", paste0(variables, collapse = ") + (1|"), ")")) +model_glmmtmb <- suppress_warnings(glmmTMB(formula = noquote(model_all), data = df_hier_unscale, REML = TRUE), "giveCsparse") + + + +###### Summarize data to site x replicate x village ##### +data <- data.table(df_hier_unscale) +data$Residuals <- resid(model_glmmtmb) +data_sum <- data[, .(residual=mean(Residuals)), by = .(Village, Line, Site, Replicate)] + + + +##### Get the snp from the vcf ##### +## GT ## +snp_dt <- data.table(extract.gt(element = "GT",vcf, IDtoRowNames = F)) + + +if (!all(colSums(is.na(snp_dt)) == nrow(snp_dt))){ + message("Found GT genotype format in cluster vcf. Will use that metric for cluster correlation.") + format_clust = "GT" + + if (any(grepl("\\|",snp_dt[,1]))){ + separator = "|" + message("Detected | separator for GT genotype format in cluster vcf") + } else if (any(grepl("/",snp_dt[,1]))) { + separator = "/" + message("Detected / separator for GT genotype format in cluster vcf") + } else { + format_clust = NA + message("Can't identify a separator for the GT field in cluster vcf, moving on to using GP.") + } + if (!is.na(format_clust)){ + snp_dt <- data.table(as_tibble(lapply(snp_dt, function(x) {gsub(paste0("0\\",separator,"0"),0, x)}) %>% + lapply(., function(x) {gsub(paste0("0\\",separator,"1"),1, x)}) %>% + lapply(., function(x) {gsub(paste0("1\\",separator,"0"),1, x)}) %>% + lapply(., function(x) {gsub(paste0("1\\",separator,"1"),2, x)}))) + } +} + + +colnames(snp_dt) <- gsub("36_TOB00421_i_E8", "TOB0421", colnames(snp_dt)) %>% + gsub("22_FSA", "FSA0006", .) %>% + gsub("29_MBE", "MBE1006", .) + + +## Get specific SNP ## +snp_dt$ID_ref_alt <- paste0(vcf@fix[,'CHROM'],":", vcf@fix[,'POS'],"_", vcf@fix[,'REF'],"_", vcf@fix[,'ALT']) + +snp_dt_subset <- snp_dt[ID_ref_alt %in% snp_gene_pairs_subset$ID_ref_alt] +snp_dt_subset_long <- melt(snp_dt_subset, id.vars = c("ID_ref_alt"), measure.vars = c("TOB0421", "FSA0006", "MBE1006")) + + + +##### Make dataframe with results, snp genotypes and our beta + pvalue ##### +results_dt <- snp_gene_pairs_subset[,c("ID", "REF", "ALT", "ID_ref_alt")] +results_dt <- results_dt[snp_dt_subset, on = "ID_ref_alt"] + +results_dt$snp_beta <- as.numeric(NA) +results_dt$snp_beta_se <- as.numeric(NA) +results_dt$snp_z <- as.numeric(NA) +results_dt$snp_p <- as.numeric(NA) +results_dt$snpXvillage_beta <- as.numeric(NA) +results_dt$snpXvillage_beta_se <- as.numeric(NA) +results_dt$snpXvillage_z <- as.numeric(NA) +results_dt$snpXvillage_p <- as.numeric(NA) + + + + +for (snp in unique(snp_dt_subset_long$ID_ref_alt)){ + + ### Add SNP data to data data.table ### + data_sum_snp <- snp_dt_subset_long[ID_ref_alt == snp][data_sum, on = c("variable" = "Line")] + + gt_long <- melt(results_dt[ID_ref_alt == snp][,c("ID_ref_alt", "TOB0421", "FSA0006", "MBE1006", "REF", "ALT")], measure.vars = c("TOB0421", "FSA0006", "MBE1006")) + gt_long$Genotype <- ifelse(gt_long$value == 0, paste0(gt_long$REF, "/", gt_long$REF), + ifelse(gt_long$value == 1, paste0(gt_long$REF, "/", gt_long$ALT), + ifelse(gt_long$value == 2, paste0(gt_long$ALT, "/", gt_long$ALT), NA))) + + gt_long <- unique(gt_long) + + data_sum_snp <- gt_long[data_sum_snp, on = c("value", "variable", "ID_ref_alt")] + + data_sum_snp$Genotype <- factor(data_sum_snp$Genotype, levels = unique(data_sum_snp[order(value)]$Genotype)) + + + ##### Fit SNP => store resids ##### + ##### Check for beta ##### + base_model <- glmmTMB(residual ~ 1, data = data_sum_snp, REML = TRUE) + snp_model <- glmmTMB(as.numeric(residual) ~ as.numeric(value), data = data_sum_snp, REML = TRUE) + + anova_results <- anova(base_model, snp_model) + + results_dt[ID_ref_alt == snp]$snp_beta <- summary(snp_model)$coefficients$cond[2,1] + results_dt[ID_ref_alt == snp]$snp_beta_se <- summary(snp_model)$coefficients$cond[2,2] + results_dt[ID_ref_alt == snp]$snp_z <- summary(snp_model)$coefficients$cond[2,3] + results_dt[ID_ref_alt == snp]$snp_p <- anova_results$`Pr(>Chisq)`[2] + + + ##### resids fit SNP*village ##### + data_sum_snp$residual2 <- resid(snp_model) + + snpXvillage_model <- glmmTMB(as.numeric(residual) ~ as.numeric(value)*Village, data = data_sum_snp, REML = TRUE) + + anova_results2 <- anova(snp_model, snpXvillage_model) + + + results_dt[ID_ref_alt == snp]$snpXvillage_beta <- summary(snpXvillage_model)$coefficients$cond[2,1] + results_dt[ID_ref_alt == snp]$snpXvillage_beta_se <- summary(snpXvillage_model)$coefficients$cond[2,2] + results_dt[ID_ref_alt == snp]$snpXvillage_z <- summary(snpXvillage_model)$coefficients$cond[2,3] + results_dt[ID_ref_alt == snp]$snpXvillage_p <- anova_results2$`Pr(>Chisq)`[2] + + fwrite(data_sum_snp, paste0(datadir, ensg, "_",snp,"_dataframe"), sep = "\t") +} + +fwrite(results_dt, paste0(outdir, ensg, "_snpXvillage_interactions.tsv"), sep = "\t") + + diff --git a/eQTL_check/eQTL_village_interaction_kilpinen.R b/eQTL_check/eQTL_village_interaction_kilpinen.R new file mode 100644 index 0000000..316f018 --- /dev/null +++ b/eQTL_check/eQTL_village_interaction_kilpinen.R @@ -0,0 +1,177 @@ +### Author: Drew neavin +### Date: 25 June, 2022 +### Rational: Reviewer asked if eQTL effects are consistently higher in a village so need to look at village*snp effect for eQTLs + + +##### Read in Libraries ##### +library(data.table) +library(tidyverse) +library(Seurat) +library(vcfR) +library(pkgcond) +library(glmmTMB) + + + +##### Set up directories ##### +args <- commandArgs(trailingOnly = TRUE) +ensg <- as.character(args[1]) +outdir <- args[2] +bed <- args[3] +datadir <- args[4] + + + +##### Read in data ##### +### significant effects for all genes dataframe ### +effects_dt <- fread("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review2/combined/effect_results.tsv") + +### Seurat object ### +seurat <- readRDS("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/seurat_integrated_noncryo_1pct_expressing.rds") + + +### SNP vcf ### +vcf <- read.vcfR("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/finalized_snps.recode.vcf") + + +##### Read in dataframe of pairs to test ##### +snp_gene_pairs <- fread(bed) +snp_gene_pairs$gene_id <- gsub("\\..+", "", snp_gene_pairs$gene_id) + + +##### Subset the correct snp ##### +snp_gene_pairs_subset <- snp_gene_pairs[gene_id == ensg] + + +##### Make model dataframe ##### +df_hier_unscale <- data.frame("Expression" = seurat[["SCT"]]@scale.data[ensg,], "Village" = as.factor(ifelse(seurat@meta.data$Time == "Baseline", 0, 1)), "Line" = seurat@meta.data$Final_Assignment, "Replicate" = as.factor(gsub("[A-Z][a-z]+", "", seurat@meta.data$MULTI_ID)), "Site" = seurat$Location) +colnames(df_hier_unscale)[1] <- "Expression" + + + + +##### Get the Correct Variables for Modeling ##### +variables <- unique(effects_dt[gene == ensg]$grp) +variables <- variables[!(variables %in% c("Line", "Line:Village", "Residual"))] + + + +##### Fit effects except line and village*line effects ###### +model_all <- as.formula(paste0("Expression ~ (1|", paste0(variables, collapse = ") + (1|"), ")")) +model_glmmtmb <- suppress_warnings(glmmTMB(formula = noquote(model_all), data = df_hier_unscale, REML = TRUE), "giveCsparse") + + + +###### Summarize data to site x replicate x village ##### +data <- data.table(df_hier_unscale) +data$Residuals <- resid(model_glmmtmb) +data_sum <- data[, .(residual=mean(Residuals)), by = .(Village, Line, Site, Replicate)] + + + +##### Get the snp from the vcf ##### +## GT ## +snp_dt <- data.table(extract.gt(element = "GT",vcf, IDtoRowNames = F)) + + +if (!all(colSums(is.na(snp_dt)) == nrow(snp_dt))){ + message("Found GT genotype format in cluster vcf. Will use that metric for cluster correlation.") + format_clust = "GT" + + if (any(grepl("\\|",snp_dt[,1]))){ + separator = "|" + message("Detected | separator for GT genotype format in cluster vcf") + } else if (any(grepl("/",snp_dt[,1]))) { + separator = "/" + message("Detected / separator for GT genotype format in cluster vcf") + } else { + format_clust = NA + message("Can't identify a separator for the GT field in cluster vcf, moving on to using GP.") + } + if (!is.na(format_clust)){ + snp_dt <- data.table(as_tibble(lapply(snp_dt, function(x) {gsub(paste0("0\\",separator,"0"),0, x)}) %>% + lapply(., function(x) {gsub(paste0("0\\",separator,"1"),1, x)}) %>% + lapply(., function(x) {gsub(paste0("1\\",separator,"0"),1, x)}) %>% + lapply(., function(x) {gsub(paste0("1\\",separator,"1"),2, x)}))) + } +} + + +colnames(snp_dt) <- gsub("36_TOB00421_i_E8", "TOB0421", colnames(snp_dt)) %>% + gsub("22_FSA", "FSA0006", .) %>% + gsub("29_MBE", "MBE1006", .) + + +## Get specific SNP ## +snp_dt$ID_ref_alt <- paste0(vcf@fix[,'CHROM'],":", vcf@fix[,'POS'],"_", vcf@fix[,'REF'],"_", vcf@fix[,'ALT']) + +snp_dt_subset <- snp_dt[ID_ref_alt %in% snp_gene_pairs_subset$ID_ref_alt] +snp_dt_subset_long <- melt(snp_dt_subset, id.vars = c("ID_ref_alt"), measure.vars = c("TOB0421", "FSA0006", "MBE1006")) + + + +##### Make dataframe with results, snp genotypes and our beta + pvalue ##### +results_dt <- snp_gene_pairs_subset[,c("ID", "REF", "ALT", "ID_ref_alt")] +results_dt <- results_dt[snp_dt_subset, on = "ID_ref_alt"] + +results_dt$snp_beta <- as.numeric(NA) +results_dt$snp_beta_se <- as.numeric(NA) +results_dt$snp_z <- as.numeric(NA) +results_dt$snp_p <- as.numeric(NA) +results_dt$snpXvillage_beta <- as.numeric(NA) +results_dt$snpXvillage_beta_se <- as.numeric(NA) +results_dt$snpXvillage_z <- as.numeric(NA) +results_dt$snpXvillage_p <- as.numeric(NA) + + + + +for (snp in unique(snp_dt_subset_long$ID_ref_alt)){ + + ### Add SNP data to data data.table ### + data_sum_snp <- snp_dt_subset_long[ID_ref_alt == snp][data_sum, on = c("variable" = "Line")] + + gt_long <- melt(results_dt[ID_ref_alt == snp][,c("ID_ref_alt", "TOB0421", "FSA0006", "MBE1006", "REF", "ALT")], measure.vars = c("TOB0421", "FSA0006", "MBE1006")) + gt_long$Genotype <- ifelse(gt_long$value == 0, paste0(gt_long$REF, "/", gt_long$REF), + ifelse(gt_long$value == 1, paste0(gt_long$REF, "/", gt_long$ALT), + ifelse(gt_long$value == 2, paste0(gt_long$ALT, "/", gt_long$ALT), NA))) + + gt_long <- unique(gt_long) + + data_sum_snp <- gt_long[data_sum_snp, on = c("value", "variable", "ID_ref_alt")] + + data_sum_snp$Genotype <- factor(data_sum_snp$Genotype, levels = unique(data_sum_snp[order(value)]$Genotype)) + + + ##### Fit SNP => store resids ##### + ##### Check for beta ##### + base_model <- glmmTMB(residual ~ 1, data = data_sum_snp, REML = TRUE) + snp_model <- glmmTMB(as.numeric(residual) ~ as.numeric(value), data = data_sum_snp, REML = TRUE) + + anova_results <- anova(base_model, snp_model) + + results_dt[ID_ref_alt == snp]$snp_beta <- summary(snp_model)$coefficients$cond[2,1] + results_dt[ID_ref_alt == snp]$snp_beta_se <- summary(snp_model)$coefficients$cond[2,2] + results_dt[ID_ref_alt == snp]$snp_z <- summary(snp_model)$coefficients$cond[2,3] + results_dt[ID_ref_alt == snp]$snp_p <- anova_results$`Pr(>Chisq)`[2] + + + ##### resids fit SNP*village ##### + data_sum_snp$residual2 <- resid(snp_model) + + snpXvillage_model <- glmmTMB(as.numeric(residual) ~ as.numeric(value)*Village, data = data_sum_snp, REML = TRUE) + + anova_results2 <- anova(snp_model, snpXvillage_model) + + + results_dt[ID_ref_alt == snp]$snpXvillage_beta <- summary(snpXvillage_model)$coefficients$cond[2,1] + results_dt[ID_ref_alt == snp]$snpXvillage_beta_se <- summary(snpXvillage_model)$coefficients$cond[2,2] + results_dt[ID_ref_alt == snp]$snpXvillage_z <- summary(snpXvillage_model)$coefficients$cond[2,3] + results_dt[ID_ref_alt == snp]$snpXvillage_p <- anova_results2$`Pr(>Chisq)`[2] + + fwrite(data_sum_snp, paste0(datadir, ensg, "_",snp,"_dataframe"), sep = "\t") +} + +fwrite(results_dt, paste0(outdir, ensg, "_snpXvillage_interactions.tsv"), sep = "\t") + + diff --git a/eQTL_check/filter_snps_kilpinen.R b/eQTL_check/filter_snps_kilpinen.R new file mode 100644 index 0000000..57528e6 --- /dev/null +++ b/eQTL_check/filter_snps_kilpinen.R @@ -0,0 +1,83 @@ +library(vcfR) +library(data.table) +library(tidyverse) + + +##### Set up variables ##### +dir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/" + + +iPSC_Village/scripts/QCmetric_figs +##### Read in data ##### +vcf <- read.vcfR(paste0(dir, "finalized_snps.recode.vcf")) +eqtls_overlap_dt <- fread(paste0(dir,"kilpinen_imputed_overlapping_filtered_header.bed"), sep = "\t") + +geno <- data.table(extract.gt(element = "GT",vcf, IDtoRowNames = F)) + + +if (!all(colSums(is.na(geno)) == nrow(geno))){ + message("Found GT genotype format in cluster vcf. Will use that metric for cluster correlation.") + format_clust = "GT" + + if (any(grepl("\\|",geno[,1]))){ + separator = "|" + message("Detected | separator for GT genotype format in cluster vcf") + } else if (any(grepl("/",geno[,1]))) { + separator = "/" + message("Detected / separator for GT genotype format in cluster vcf") + } else { + format_clust = NA + message("Can't identify a separator for the GT field in cluster vcf, moving on to using GP.") + } + if (!is.na(format_clust)){ + geno <- data.table(as_tibble(lapply(geno, function(x) {gsub(paste0("0\\",separator,"0"),0, x)}) %>% + lapply(., function(x) {gsub(paste0("0\\",separator,"1"),1, x)}) %>% + lapply(., function(x) {gsub(paste0("1\\",separator,"0"),1, x)}) %>% + lapply(., function(x) {gsub(paste0("1\\",separator,"1"),2, x)}))) + } +} + + +geno$ID <- vcf@fix[,'ID'] +geno$ID_ref_alt <- paste0(vcf@fix[,'CHROM'],":", vcf@fix[,'POS'],"_", vcf@fix[,'REF'],"_", vcf@fix[,'ALT']) + +eqtls_overlap_dt <- eqtls_overlap_dt[ID %in% unique(geno$ID)] +eqtls_overlap_dt$ID_ref_alt <- paste0(eqtls_overlap_dt$ID, "_", eqtls_overlap_dt$REF, "_", eqtls_overlap_dt$ALT) + + +temp <- list() + +for (gene in unique(eqtls_overlap_dt$gene_id)){ + print(gene) + temp[[gene]] <- eqtls_overlap_dt[gene_id == gene] + k = 1 + while (k < nrow(temp[[gene]])){ + test_snp <- temp[[gene]]$ID_ref_alt[k] + for (snp in temp[[gene]][(k+1):nrow(temp[[gene]]),]$ID_ref_alt){ + if (abs(cor(as.numeric(geno[ID_ref_alt == test_snp,1:3]), as.numeric(geno[ID_ref_alt == snp,1:3]))) == 1){ + temp[[gene]] <- temp[[gene]][ID_ref_alt != snp] + } + } + k = k + 1 + } +} + + +eqtls_overlap_dt_subset <- do.call(rbind, temp) +eqtls_overlap_dt_subset$gene_id <- gsub("\\..+", "", eqtls_overlap_dt_subset$gene_id) + + +resid_files <- list.files("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review2/gene_separated/residuals4qtl") +ensg_list <- unique(gsub("_residuals4qtl.rds", "", resid_files)) + +eqtls_overlap_dt_subset <- eqtls_overlap_dt_subset[gene_id %in% ensg_list] + + +fwrite(eqtls_overlap_dt_subset, paste0(dir, "kilpinen_imputed_overlapping_filtered_header_pruned.bed"), sep = "\t") + + +eqtls_overlap_dt_subset_gene_snp <- eqtls_overlap_dt_subset[,c("gene_id","ID_ref_alt")] + +fwrite(eqtls_overlap_dt_subset_gene_snp, paste0(dir, "kilpinen_imputed_overlapping_filtered_header_pruned_snp_gene.tsv"), sep = "\t") + + diff --git a/eQTL_check/multi-passage/filter_snps.R b/eQTL_check/multi-passage/filter_snps.R new file mode 100644 index 0000000..4321269 --- /dev/null +++ b/eQTL_check/multi-passage/filter_snps.R @@ -0,0 +1,88 @@ +library(vcfR) +library(data.table) +library(tidyverse) + + +##### Set up variables ##### +dir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/multi-passage/" + + +iPSC_Village/scripts/QCmetric_figs +##### Read in data ##### +vcf <- read.vcfR(paste0(dir, "deboever_finalized_snps.vcf")) +eqtls_overlap_dt <- fread(paste0(dir,"deboever_imputed_overlapping_filtered_header.bed"), sep = "\t") + +geno <- data.table(extract.gt(element = "GT",vcf, IDtoRowNames = F)) + + +if (!all(colSums(is.na(geno)) == nrow(geno))){ + message("Found GT genotype format in cluster vcf. Will use that metric for cluster correlation.") + format_clust = "GT" + + if (any(grepl("\\|",geno[,1]))){ + separator = "|" + message("Detected | separator for GT genotype format in cluster vcf") + } else if (any(grepl("/",geno[,1]))) { + separator = "/" + message("Detected / separator for GT genotype format in cluster vcf") + } else { + format_clust = NA + message("Can't identify a separator for the GT field in cluster vcf, moving on to using GP.") + } + if (!is.na(format_clust)){ + geno <- data.table(as_tibble(lapply(geno, function(x) {gsub(paste0("0\\",separator,"0"),0, x)}) %>% + lapply(., function(x) {gsub(paste0("0\\",separator,"1"),1, x)}) %>% + lapply(., function(x) {gsub(paste0("1\\",separator,"0"),1, x)}) %>% + lapply(., function(x) {gsub(paste0("1\\",separator,"1"),2, x)}))) + } +} + + +geno$ID <- vcf@fix[,'ID'] +# geno$ID_ref_alt <- paste0(vcf@fix[,'CHROM'],":", vcf@fix[,'POS'],"_", vcf@fix[,'REF'],"_", vcf@fix[,'ALT']) + +eqtls_overlap_dt <- eqtls_overlap_dt[ID %in% unique(geno$ID)] +eqtls_overlap_numberic_gt <- apply(eqtls_overlap_dt[,38:ncol(eqtls_overlap_dt)], 2, function(y) as.numeric(gsub("[0-9]\\|[0-9]:", "", y) %>% gsub(":.+", "", .))) +eqtls_overlap_dt <- cbind(eqtls_overlap_dt[,1:37],eqtls_overlap_numberic_gt) + + +# eqtls_overlap_dt$ID_ref_alt <- paste0(eqtls_overlap_dt$ID, "_", eqtls_overlap_dt$REF, "_", eqtls_overlap_dt$ALT) + + +temp <- list() + +for (gene in unique(eqtls_overlap_dt$gene_id)){ + print(gene) + temp[[gene]] <- eqtls_overlap_dt[gene_id == gene] + k = 1 + while (k < nrow(temp[[gene]])){ + test_snp <- temp[[gene]]$ID[k] + for (snp in temp[[gene]][(k+1):nrow(temp[[gene]]),]$ID){ + if (abs(cor(as.numeric(geno[ID == test_snp,1:(ncol(geno) - 1)]), as.numeric(geno[ID == snp,1:(ncol(geno) - 1)]))) == 1){ + temp[[gene]] <- temp[[gene]][ID != snp] + } + } + k = k + 1 + } +} + + +eqtls_overlap_dt_subset <- do.call(rbind, temp) +eqtls_overlap_dt_subset$gene_id <- gsub("\\..+", "", eqtls_overlap_dt_subset$gene_id) + + +resid_files <- list.files("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_integratedSCT/gene_separated/residuals4qtl") + +ensg_list <- unique(gsub("_residuals4qtl.rds", "", resid_files)) + +eqtls_overlap_dt_subset <- eqtls_overlap_dt_subset[gene_id %in% ensg_list] + + +fwrite(eqtls_overlap_dt_subset, paste0(dir, "deboever_imputed_overlapping_filtered_header_pruned.bed"), sep = "\t") + + +eqtls_overlap_dt_subset_gene_snp <- eqtls_overlap_dt_subset[,c("gene_id","ID")] + +fwrite(eqtls_overlap_dt_subset_gene_snp, paste0(dir, "deboever_imputed_overlapping_filtered_header_pruned_snp_gene.tsv"), sep = "\t") + + diff --git a/eQTL_check/multi-passage/genes2test.sh b/eQTL_check/multi-passage/genes2test.sh new file mode 100644 index 0000000..562495e --- /dev/null +++ b/eQTL_check/multi-passage/genes2test.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +OUTDIR="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/multi-passage" +mkdir -p $OUTDIR + +kilpinen_genes="$OUTDIR/../KilpinenOverlap/gene_snp_list.tsv" +deboever_genes="$OUTDIR/../KilpinenOverlap/deboever_gene_snp_list.tsv" +variance_genes="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_integratedSCT/gene_separated/residuals4qtl" + + +ls $variance_genes | sed "s/_residuals4qtl.rds//g" | sort -u > $OUTDIR/residual_genes.tsv +awk '{print $2}' $kilpinen_genes | sort -u > $OUTDIR/kilpinen_genes.tsv +awk '{print $2}' $deboever_genes | sort -u > $OUTDIR/deboever_genes.tsv + + +comm -12 $OUTDIR/residual_genes.tsv head $OUTDIR/kilpinen_genes.tsv > $OUTDIR/genes_residual_and_kilpinen.tsv ## NONE +comm -12 $OUTDIR/residual_genes.tsv $OUTDIR/deboever_genes.tsv > $OUTDIR/genes_residual_and_deboever.tsv + + +### Get finalized list of genes that were eQTLs and demonstrate line effects +grep -F -f $OUTDIR/genes_residual_and_deboever.tsv $deboever_genes > $OUTDIR/finalized_deboever_gene_snp_list.tsv + + + +### Filter vcf for just these snps ### +awk '{print $1}' $OUTDIR/finalized_deboever_gene_snp_list.tsv | sort -u > $OUTDIR/finalized_deboever_snps.tsv + +grep "#" $OUTDIR/nona_cardiac_multiome_Filtered_INFO_0.4_MAF0.05_complete_cases_snps_filtered_diff_genotypes.vcf >$OUTDIR/deboever_finalized_snps.vcf +grep -f $OUTDIR/finalized_deboever_snps.tsv $OUTDIR/nona_cardiac_multiome_Filtered_INFO_0.4_MAF0.05_complete_cases_snps_filtered_diff_genotypes.vcf >> $OUTDIR/deboever_finalized_snps.vcf + diff --git a/eQTL_check/multi-passage/metadata_prep.R b/eQTL_check/multi-passage/metadata_prep.R new file mode 100644 index 0000000..7ee4031 --- /dev/null +++ b/eQTL_check/multi-passage/metadata_prep.R @@ -0,0 +1,18 @@ +library(Seurat) +library(tivdyverse) +library(data.table) + + +dir.create("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Kilpinen_eQTLs/") + + + +seurat <- readRDS("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/preQC/time-integrated_filtered_seurat_1pct_expressing.rds") + + +### Make DF for modeling ### +df_hier_unscale <- data.frame("Passage" = as.factor(gsub("Village_", "", seurat@meta.data$Pool)), "Line" = seurat@meta.data$AtLeastHalfSinglet_Individual_Assignment) +df_hier_unscale$Barcode <- colnames(seurat) + +fwrite(df_hier_unscale, paste0("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/multi-passage/cell_meta.tsv"), sep = "\t") + diff --git a/eQTL_check/multi-passage/pull_eQTL_SNPs.sh b/eQTL_check/multi-passage/pull_eQTL_SNPs.sh new file mode 100644 index 0000000..61ba936 --- /dev/null +++ b/eQTL_check/multi-passage/pull_eQTL_SNPs.sh @@ -0,0 +1,85 @@ +#!/bin/bash + + +vcf=/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/data/multi-passage/nona_cardiac_multiome_Filtered_INFO_0.4_MAF0.05_complete_cases.recode.vcf +kilpinen=/directflow/SCCGGroupShare/projects/DrewNeavin/References/iPSC_eQTLs/KilpineniPSCeQTLs.txt +deboever=/directflow/SCCGGroupShare/projects/DrewNeavin/References/iPSC_eQTLs/DeBoeveriPSCeQTLs.txt + +OUT=/directflow/SCCGGroupShare/projects/DrewNeavin/References/iPSC_eQTLs +OUT_TMP=/directflow/SCCGGroupShare/projects/DrewNeavin/References/iPSC_eQTLs/tmp + +mkdir -p $OUT_TMP + +INTERSECT_OUT=/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/multi-passage + +mkdir -p $INTERSECT_OUT + + + +### Filter on individuals and MAF for our lines ### +sed '/ $INTERSECT_OUT/nona_cardiac_multiome_Filtered_INFO_0.4_MAF0.05_complete_cases_snps.vcf +sed -i '//d' $INTERSECT_OUT/nona_cardiac_multiome_Filtered_INFO_0.4_MAF0.05_complete_cases_snps.vcf + + + +conda activate vcftools + + vcftools --vcf $INTERSECT_OUT/nona_cardiac_multiome_Filtered_INFO_0.4_MAF0.05_complete_cases_snps.vcf --recode --mac 1 --out $INTERSECT_OUT/nona_cardiac_multiome_Filtered_INFO_0.4_MAF0.05_complete_cases_snps_filtered + + bcftools view -e 'COUNT(GT="AA")=N_SAMPLES || COUNT(GT="RR")=N_SAMPLES || COUNT(GT="RA")=N_SAMPLES' $INTERSECT_OUT/nona_cardiac_multiome_Filtered_INFO_0.4_MAF0.05_complete_cases_snps_filtered.recode.vcf > $INTERSECT_OUT/nona_cardiac_multiome_Filtered_INFO_0.4_MAF0.05_complete_cases_snps_filtered_diff_genotypes.vcf + +conda deactivate + +### Redo intersection with these SNPs ##$# +conda activate bedtools + + bedtools intersect -a $OUT/kilpinen.bed -b $INTERSECT_OUT/nona_cardiac_multiome_Filtered_INFO_0.4_MAF0.05_complete_cases_snps_filtered_diff_genotypes.vcf -wa -wb > $INTERSECT_OUT/kilpinen_imputed_overlapping_filtered.bed + +conda deactivate + + + +head -n 1 $OUT/kilpinen.bed > $INTERSECT_OUT/kilpinen_header_bed.tsv +grep "#CHROM" $INTERSECT_OUT/nona_cardiac_multiome_Filtered_INFO_0.4_MAF0.05_complete_cases_snps_filtered_diff_genotypes.vcf > $INTERSECT_OUT/kilpinen_header_vcf.tsv +paste -d"\t" $INTERSECT_OUT/kilpinen_header_bed.tsv $INTERSECT_OUT/kilpinen_header_vcf.tsv > $INTERSECT_OUT/kilpinen_combined_header.tsv + +cat $INTERSECT_OUT/kilpinen_combined_header.tsv > $INTERSECT_OUT/kilpinen_imputed_overlapping_filtered_header.bed +cat $INTERSECT_OUT/kilpinen_imputed_overlapping_filtered.bed >> $INTERSECT_OUT/kilpinen_imputed_overlapping_filtered_header.bed + + + + + +### Use this file to test for trends in gene +awk 'BEGIN{FS=OFS="\t"}{print($24, $7)}' $INTERSECT_OUT/kilpinen_imputed_overlapping_filtered_header.bed | sed 's/\..*//g' > $INTERSECT_OUT/gene_snp_list.tsv + +sed -i '1i snp\tgene' $INTERSECT_OUT/gene_snp_list.tsv + + + + + + + +##### DEBOEVER DATA ##### +### Intersect with these SNPs ##$# +conda activate bedtools + + bedtools intersect -a $OUT/deboever.bed -b $INTERSECT_OUT/nona_cardiac_multiome_Filtered_INFO_0.4_MAF0.05_complete_cases_snps_filtered_diff_genotypes.vcf -wa -wb > $INTERSECT_OUT/deboever_imputed_overlapping_filtered.bed + +conda deactivate + + +head -n 1 $OUT/deboever.bed > $INTERSECT_OUT/deboever_header_bed.tsv +grep "#CHROM" $INTERSECT_OUT/nona_cardiac_multiome_Filtered_INFO_0.4_MAF0.05_complete_cases_snps_filtered_diff_genotypes.vcf > $INTERSECT_OUT/deboever_header_vcf.tsv +paste -d"\t" $INTERSECT_OUT/deboever_header_bed.tsv $INTERSECT_OUT/deboever_header_vcf.tsv > $INTERSECT_OUT/deboever_combined_header.tsv + +cat $INTERSECT_OUT/deboever_combined_header.tsv > $INTERSECT_OUT/deboever_imputed_overlapping_filtered_header.bed +cat $INTERSECT_OUT/deboever_imputed_overlapping_filtered.bed >> $INTERSECT_OUT/deboever_imputed_overlapping_filtered_header.bed + + +### Use this file to test for trends in gene +awk 'BEGIN{FS=OFS="\t"}{print($31, $15, $25)}' $INTERSECT_OUT/deboever_imputed_overlapping_filtered_header.bed | sed 's/\..*\t/\t/g' > $INTERSECT_OUT/deboever_gene_snp_list.tsv + + + diff --git a/eQTL_check/multi-passage/test_eQTL.R b/eQTL_check/multi-passage/test_eQTL.R new file mode 100644 index 0000000..4bab96e --- /dev/null +++ b/eQTL_check/multi-passage/test_eQTL.R @@ -0,0 +1,204 @@ +### Reason: test eQTLs identified in Kilpinen et al in our dataset +### Author: Drew Neavin +### Date: 1 April, 2022 + + +library(data.table) +library(Seurat) +library(vcfR) +library(glmmTMB) +library(dplyr) +library(ggplot2) + + + +##### Bring in variables ##### +### Bring in arguments +args <- commandArgs(trailingOnly = TRUE) +ensg <- args[1] +outdir <- args[2] +bed <- args[3] + + +cell_line_colors <- c("TOB0199" = "#f44336", "TOB0220" = "#e81f63", "MBE2900" = "#9c27b0", "MBE0953" = "#673ab7", "TOB0421" = "#3f51b5", "MBE2817" = "#2096f3","FSA0004" = "#2096f3", "TOB0435" = "#009688", "WAB0004" = "#4caf50", "IST1877" = "#8bc34a", "WAB0103" = "#cddc39", "TOB0198" = "#ffeb3b", "TOB0205" = "#ffc108", "WAB0038" = "#ff9801", "IST3323" = "#ff5723" , "FSA0001" = "#795548", "180N" = "#9e9e9e", "166" = "#607d8b") + +##### Read in dataframe of pairs to test ##### +snp_gene_pairs <- fread(bed) + + + + +##### Subset the correct snp ##### +snp_gene_pairs_subset <- snp_gene_pairs[gene_id == ensg] + + + +##### Read in data ##### +residuals <- readRDS(paste0("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_integratedSCT/gene_separated/residuals4qtl/", ensg, "_residuals4qtl.rds")) +residual_dt <- data.table(residuals) +colnames(residual_dt) <- c("residual") +residual_dt$Barcode <- names(residuals) + +vcf <- read.vcfR("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/multi-passage/deboever_finalized_snps.vcf") +meta <- fread("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/multi-passage/cell_meta.tsv", sep = "\t") +data <- meta[residual_dt, on = "Barcode"] +data$Barcode <- NULL + + + + +### Summarize data by site and line and replicatae +data_sum <- data[, .(residual=mean(residual)), by = .(Passage, Line)] +data_sum$Line <- gsub("^0_", "", data_sum$Line) %>% + gsub("D-", "", .) %>% + gsub("\\.\\d\\.", "", .) %>% + gsub("N-", "", .) %>% + gsub("-P36", "", .) %>% + gsub("-", "", .) + + +##### Get the snp from the vcf ##### +## GT ## +snp_dt <- data.table(extract.gt(element = "GT",vcf, IDtoRowNames = F)) + + +if (!all(colSums(is.na(snp_dt)) == nrow(snp_dt))){ + message("Found GT genotype format in cluster vcf. Will use that metric for cluster correlation.") + format_clust = "GT" + + if (any(grepl("\\|",snp_dt[,1]))){ + separator = "|" + message("Detected | separator for GT genotype format in cluster vcf") + } else if (any(grepl("/",snp_dt[,1]))) { + separator = "/" + message("Detected / separator for GT genotype format in cluster vcf") + } else { + format_clust = NA + message("Can't identify a separator for the GT field in cluster vcf, moving on to using GP.") + } + if (!is.na(format_clust)){ + snp_dt <- data.table(as_tibble(lapply(snp_dt, function(x) {gsub(paste0("0\\",separator,"0"),0, x)}) %>% + lapply(., function(x) {gsub(paste0("0\\",separator,"1"),1, x)}) %>% + lapply(., function(x) {gsub(paste0("1\\",separator,"0"),1, x)}) %>% + lapply(., function(x) {gsub(paste0("1\\",separator,"1"),2, x)}))) + } +} + + +colnames(snp_dt) <- gsub("^0_", "", colnames(snp_dt)) %>% + gsub("D-", "", .) %>% + gsub("\\.\\d\\.", "", .) %>% + gsub("N-", "", .) %>% + gsub("-P36", "", .) %>% + gsub("-", "", .) + + +## Get specific SNP ## +snp_dt$ID <- paste0(vcf@fix[,'CHROM'],":", vcf@fix[,'POS'],":", vcf@fix[,'REF'],":", vcf@fix[,'ALT']) + +snp_dt_subset <- snp_dt[ID %in% snp_gene_pairs_subset$ID] + + +### Add SNP data to data data.table ### +snp_dt_subset_long <- melt(snp_dt_subset, id.vars = c("ID"), measure.vars = colnames(snp_dt_subset)[1:(ncol(snp_dt_subset) - 1)]) +snp_dt_subset_long$variable <- gsub("^0_", "", snp_dt_subset_long$variable) %>% + gsub("D-", "", .) %>% + gsub("\\.\\d\\.", "", .) %>% + gsub("N-", "", .) %>% + gsub("-P36", "", .) %>% + gsub("-", "", .) + + + +##### Make dataframe with deboever results, snp genotypes and our beta + pvalue ##### +results_dt <- snp_gene_pairs_subset[,c("chrom", "start", "end", "rsid", "maf", "stat", "pvalue", "beta", "sebeta", "gene_id", "gene_name", "ref","alt", "ID", "REF", "ALT")] +colnames(results_dt)[5:13] <- paste0(colnames(results_dt)[5:13], "_deboever") +results_dt <- results_dt[snp_dt_subset, on = "ID"] + +results_dt$dataset_beta <- as.numeric(NA) +results_dt$dataset_beta_se <- as.numeric(NA) +results_dt$dataset_z <- as.numeric(NA) +results_dt$dataset_p <- as.numeric(NA) +results_dt$direction <- as.character(NA) + + +for (snp in unique(snp_dt_subset_long$ID)){ + + data_sum_snp <- snp_dt_subset_long[ID == snp][data_sum, on = c("variable" = "Line")] + + columns <- c("ID", unique(snp_dt_subset_long$variable), "REF", "ALT") + gt_long <- melt(results_dt[ID == snp][,..columns], measure.vars = unique(snp_dt_subset_long$variable)) + gt_long$Genotype <- ifelse(gt_long$value == 0, paste0(gt_long$REF, "/", gt_long$REF), + ifelse(gt_long$value == 1, paste0(gt_long$REF, "/", gt_long$ALT), + ifelse(gt_long$value == 2, paste0(gt_long$ALT, "/", gt_long$ALT), NA))) + + gt_long <- unique(gt_long) + + data_sum_snp <- gt_long[data_sum_snp, on = c("value", "variable", "ID")] + + data_sum_snp$Genotype <- factor(data_sum_snp$Genotype, levels = unique(data_sum_snp[order(value)]$Genotype)) + + + ##### Check for beta ##### + base_model <- glmmTMB(residual ~ 1, data = data_sum_snp, REML = TRUE) + snp_model <- glmmTMB(as.numeric(residual) ~ as.numeric(value), data = data_sum_snp, REML = TRUE) + + anova_results <- anova(base_model, snp_model) + + results_dt[ID == snp]$dataset_beta <- summary(snp_model)$coefficients$cond[2,1] + results_dt[ID == snp]$dataset_beta_se <- summary(snp_model)$coefficients$cond[2,2] + results_dt[ID == snp]$dataset_z <- summary(snp_model)$coefficients$cond[2,3] + results_dt[ID == snp]$dataset_p <- anova_results$`Pr(>Chisq)`[2] + + if (results_dt[ID == snp]$ref_deboever == results_dt[ID == snp]$REF & results_dt[ID == snp]$alt_deboever == results_dt[ID == snp]$ALT) { + if ((results_dt[ID == snp]$beta_deboever * results_dt[ID == snp]$dataset_beta) > 0){ + results_dt[ID == snp]$direction <- "match" + } else { + results_dt[ID == snp]$direction <- "opposite" + } + } else if (results_dt[ID == snp]$ref_deboever == results_dt[ID == snp]$ALT & results_dt[ID == snp]$REF == results_dt[ID == snp]$alt_deboever) { + if ((results_dt[ID == snp]$beta_deboever * results_dt[ID == snp]$dataset_beta) < 0){ + results_dt[ID == snp]$direction <- "match" + } else { + results_dt[ID == snp]$direction <- "opposite" + } + } else { + results_dt[ID == snp]$direction <- "different_snp" + } + + ###### Make a figure of the results + ### Different shapes for location + ### Different fill for village and uniculture + ### Color by line + if (!results_dt[ID == snp]$direction == "different_snp"){ + + data_sum_snp$value <- factor(data_sum_snp$value, levels = sort(unique(data_sum_snp$value))) + + labels <- levels(data_sum_snp$Genotype) + names(labels) <- as.numeric(levels(data_sum_snp$value)) + + plot <- ggplot(data_sum_snp, aes(value, residual)) + + geom_point(aes(color = variable)) + + theme_classic() + + scale_color_manual(values = cell_line_colors) + + scale_fill_manual(values = cell_line_colors) + + geom_smooth(aes(as.numeric(value), residual), position = "identity",method = "lm", color = "black", se=FALSE) + + scale_x_discrete(labels=labels) + + ylab("Normalized Expression") + + ggtitle(paste0(results_dt[ID == snp]$rsid, "-", results_dt[ID == snp]$gene_name_deboever, " eQTL"), + subtitle = paste0("beta = ", round(results_dt[ID == snp]$dataset_beta, 3))) + + xlab(paste0(results_dt[ID == snp]$rsid, '\nGenotype')) + + theme(plot.title = element_text(hjust = 0.5), + plot.subtitle = element_text(hjust = 0.5)) + + labs(color = "iPSC Line", shape = "Site & Village", fill = NULL) + + ggsave(plot, filename = paste0(outdir,"plots/",ensg, "_", snp,"_deboever_eQTL_results.png"), width = 5, height = 3.5) + ggsave(plot, filename = paste0(outdir,"plots/",ensg, "_", snp,"_deboever_eQTL_results.pdf"), width = 5, height = 3.5) + } +} + +fwrite(results_dt, paste0(outdir,"beds/",ensg, "_deboever_eQTL_results.bed"), sep = "\t") + + + + diff --git a/eQTL_check/multi-passage/test_eQTL.sh b/eQTL_check/multi-passage/test_eQTL.sh new file mode 100644 index 0000000..7bf4e60 --- /dev/null +++ b/eQTL_check/multi-passage/test_eQTL.sh @@ -0,0 +1,109 @@ +#!/bin/bash + +SNAKEFILE="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/eQTL_check/multi-passage/test_eQTL.smk" +LOG="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output//multi-passage/deboever/gene_separate/logs/" + +mkdir -p $LOG + +snakemake \ + --snakefile $SNAKEFILE \ + --dryrun \ + --cores 1 \ + --quiet \ + --unlock + + +snakemake \ + --snakefile $SNAKEFILE \ + --dryrun \ + --cores 1 \ + --reason > jobs2run.txt + + + +nohup \ + snakemake \ + --snakefile $SNAKEFILE \ + --jobs 200 \ + --use-singularity \ + --restart-times 1 \ + --keep-going \ + --cluster \ + "qsub -S /bin/bash \ + -q short.q \ + -r yes \ + -pe smp {threads} \ + -l tmp_requested={resources.disk_per_thread_gb}G \ + -l mem_requested={resources.mem_per_thread_gb}G \ + -e $LOG \ + -o $LOG \ + -j y \ + -V" \ + > $LOG/nohup_`date +%Y-%m-%d.%H:%M:%S`.log & + + +snakemake \ + --snakefile $SNAKEFILE \ + --dryrun \ + --cores 1 \ + --quiet \ + --unlock + + +snakemake \ + --snakefile $SNAKEFILE \ + --dryrun \ + --cores 1 \ + --quiet \ + --cleanup-metadata \ + /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/icc/ENSG00000116001_icc.rds \ + /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/fit_models/ENSG00000116001_fitted_models.rds \ + /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/icc/ENSG00000084112_icc.rds \ + /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/fit_models/ENSG00000084112_fitted_models.rds \ + /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/icc/ENSG00000130706_icc.rds \ + /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/fit_models/ENSG00000130706_fitted_models.rds + + + + + + +ENSG00000140481 +ENSG00000158552 +ENSG00000167004 +ENSG00000184281 +ENSG00000185437 +ENSG00000185875 +ENSG00000196109 +ENSG00000197536 +ENSG00000215910 +ENSG00000215883 +ENSG00000258297 +ENSG00000259488 +ENSG00000274877 + + +ENSG00000158555 +ENSG00000167005 +ENSG00000184292 +ENSG00000184281 +ENSG00000185437 +ENSG00000185875 +ENSG00000196109 +ENSG00000197536 +ENSG00000215866 +ENSG00000258289 +ENSG00000259485 +ENSG00000274828 + + +rm ENSG00000167011_fitted_models.rds +rm ENSG00000184304_fitted_models.rds +rm ENSG00000185477_fitted_models.rds +rm ENSG00000185900_fitted_models.rds +rm ENSG00000196132_fitted_models.rds +rm ENSG00000197566_fitted_models.rds +rm ENSG00000217930_fitted_models.rds +rm ENSG00000258593_fitted_models.rds +rm ENSG00000259715_fitted_models.rds +rm ENSG00000275074_fitted_models.rds diff --git a/eQTL_check/multi-passage/test_eQTL.smk b/eQTL_check/multi-passage/test_eQTL.smk new file mode 100644 index 0000000..d63e8e9 --- /dev/null +++ b/eQTL_check/multi-passage/test_eQTL.smk @@ -0,0 +1,53 @@ +import pandas as pd + + +genes_file = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/multi-passage/deboever_imputed_overlapping_filtered_header_pruned_snp_gene.tsv" +# genes_file_kilpinen = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/kilpinen_imputed_overlapping_filtered_header_pruned_snp_gene.tsv" +genes = pd.read_csv(genes_file, sep = "\t") +# genes_kilpenen = pd.read_csv(genes_file_kilpinen, sep = "\t") +# genes_kilpenen = genes_kilpenen.iloc[1] + +rule all: + input: + expand("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/deboever/gene_separate/beds/{gene}_deboever_eQTL_results.bed", gene = genes.gene_id), + # expand("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/kilpinen/gene_separate/beds/{gene}_kilpinen_eQTL_results.bed", gene = genes_kilpenen.gene_id), + + +rule eqtl: + input: + bed = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/multi-passage/deboever_imputed_overlapping_filtered_header_pruned.bed" + output: + "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/deboever/gene_separate/beds/{gene}_deboever_eQTL_results.bed" + resources: + mem_per_thread_gb = lambda wildcards, attempt: attempt * 4, + disk_per_thread_gb = lambda wildcards, attempt: attempt * 4 + threads: 4 + params: + script = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/eQTL_check/multi-passage/test_eQTL.R", + outdir="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/deboever/gene_separate/", + snp=lambda wildcards: genes.ID[genes.gene_id == wildcards.gene], + log: + shell: + """ + /directflow/SCCGGroupShare/projects/DrewNeavin/software/anaconda3/envs/baseR402/bin/Rscript {params.script} {wildcards.gene} {params.outdir} {input.bed} + """ + + +rule eqtl_kilpinen: + input: + bed = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/kilpinen_imputed_overlapping_filtered_header_pruned.bed" + output: + "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/kilpinen/gene_separate/beds/{gene}_kilpinen_eQTL_results.bed" + resources: + mem_per_thread_gb = lambda wildcards, attempt: attempt * 4, + disk_per_thread_gb = lambda wildcards, attempt: attempt * 4 + threads: 4 + params: + script = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/eQTL_check/test_eQTL_kilpinen.R", + outdir="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/kilpinen/gene_separate/", + snp=lambda wildcards: genes.ID_ref_alt[genes.gene_id == wildcards.gene], + log: + shell: + """ + /directflow/SCCGGroupShare/projects/DrewNeavin/software/anaconda3/envs/baseR402/bin/Rscript {params.script} {wildcards.gene} {params.outdir} {input.bed} + """ diff --git a/eQTL_check/multi-passage/test_eQTL_combine_deboever.R b/eQTL_check/multi-passage/test_eQTL_combine_deboever.R new file mode 100644 index 0000000..7d152f4 --- /dev/null +++ b/eQTL_check/multi-passage/test_eQTL_combine_deboever.R @@ -0,0 +1,114 @@ +library(data.table) +library(dplyr) +library(ggplot2) +library(colorspace) +library(ggpp) + + +indir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/deboever/gene_separate/beds/" +outdir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/deboever/combined/" + +dir.create(outdir, recursive = TRUE) + + +### Read in results +files <- list.files(indir) + +bed_results_list <- lapply(files, function(x){ + fread(paste0(indir, x), sep = "\t") +}) + + +results_dt <- do.call(rbind, bed_results_list) + + +results_dt$fdr <- p.adjust(results_dt$dataset_p, method = "fdr") + + +results_dt <- results_dt[order(fdr)] + +head(results_dt, n =100) + + +fwrite(results_dt, paste0(outdir, "deboever_overlap_results.tsv"), sep = "\t") + +results_dt <- results_dt[direction != "different_snp"] +results_dt$significant <- ifelse(results_dt$fdr < 0.05, TRUE, FALSE) + + +results_dt$direction <- factor(results_dt$direction, levels = c("opposite","match")) + + +correlation_plot <- ggplot(results_dt, aes(beta_deboever, dataset_beta, color = log(fdr))) + + geom_point() + + theme_classic() + + scale_color_continuous_sequential(palette = "Purp", rev = FALSE) + # scale_color_manual(values = c("grey","black")) + +ggsave(correlation_plot, filename = paste0(outdir, "correlation_plot.png")) + + +histogram_plot <- ggplot(results_dt, aes(fdr, fill = direction)) + + geom_histogram(position = "stack", binwidth = 0.05) + + theme_classic() + + geom_vline(aes(xintercept=0.05), + color="red", linetype="dashed", size=1) + + scale_fill_manual(values = c("grey","black")) + +ggsave(histogram_plot, filename = paste0(outdir, "histogram_plot.png")) + +histogram_plot_log <- ggplot(results_dt, aes(log10(fdr), fill = direction)) + + geom_histogram(position = "stack", binwidth = 1) + + theme_classic() + + geom_vline(aes(xintercept=log10(0.05)), + color="red", linetype="dashed", size=0.25) + + scale_fill_manual(values = c("grey","black")) + + scale_y_continuous(expand = c(0, 0)) + + ylab("Number of Associations") + + labs(fill = "eQTL\nEffect\nDirection\nAgreement") + + +ggsave(histogram_plot_log, filename = paste0(outdir, "histogram_plot_log.png")) + + +popout_histogram_plot_log <- ggplot(results_dt[log10(fdr) < -10], aes(log10(fdr), fill = direction)) + + geom_histogram(position = "stack", binwidth = 1) + + theme_classic() + + scale_fill_manual(values = c("grey","black")) + + theme(axis.title.x=element_blank(), + axis.title.y=element_blank(), + legend.position="none", + axis.text.x=element_text(size=7), + axis.text.y=element_text(size=7)) + + scale_y_continuous(expand = c(0, 0)) + +ggsave(popout_histogram_plot_log, filename = paste0(outdir, "popout_histogram_plot_log.png")) + +combined_plot_log <- histogram_plot_log + annotation_custom(ggplotGrob(popout_histogram_plot_log), xmin = -35, xmax = -15, + ymin = 500, ymax = 1225) + +ggsave(combined_plot_log, filename = paste0(outdir, "histogram_plot_log_w_popout.png"), width = 5, height = 3) +ggsave(combined_plot_log, filename = paste0(outdir, "histogram_plot_log_w_popout.pdf"), width = 5, height = 3) + + +results_dt$updated_beta <- ifelse(results_dt$ref_deboever != results_dt$REF, results_dt$beta_deboever * -1, results_dt$beta_deboever) + + + + +correlation_plot <- ggplot(results_dt, aes(updated_beta, dataset_beta, color = direction)) + + geom_hline(yintercept=0) + + geom_vline(xintercept=0) + + geom_point(alpha = 0.5) + + theme_classic() + + ylab("This Dataset Effect Size") + + xlab("DeBoever Effect Size") + + scale_color_manual(values = c("grey60", "black"), name = "Effect\nDirections") + + stat_quadrant_counts() + + xlim(-2.25,2.25) + + ylim(-8.5,8.5) + + theme(legend.position = "none") + +ggsave(correlation_plot, filename = paste0(outdir, "correlation_plot_DeBoever.png"), height = 2.5, width = 2.5) +ggsave(correlation_plot, filename = paste0(outdir, "correlation_plot_DeBoever.pdf"), height = 2.5, width = 2.5) + diff --git a/eQTL_check/pull_eQTL_SNPs.sh b/eQTL_check/pull_eQTL_SNPs.sh index 4fd2b48..95a89a5 100644 --- a/eQTL_check/pull_eQTL_SNPs.sh +++ b/eQTL_check/pull_eQTL_SNPs.sh @@ -71,8 +71,21 @@ conda activate bedtools conda deactivate + + +head -n 1 $OUT/kilpinen.bed > $INTERSECT_OUT/kilpinen_header_bed.tsv +grep "#CHROM" $INTERSECT_OUT/merged_imputed_AllChrs_iPSC_R2_0.3_filtered_diff_genotypes.vcf > $INTERSECT_OUT/kilpinen_header_vcf.tsv +paste -d"\t" $INTERSECT_OUT/kilpinen_header_bed.tsv $INTERSECT_OUT/kilpinen_header_vcf.tsv > $INTERSECT_OUT/kilpinen_combined_header.tsv + +cat $INTERSECT_OUT/kilpinen_combined_header.tsv > $INTERSECT_OUT/kilpinen_imputed_overlapping_filtered_header.bed +cat $INTERSECT_OUT/kilpinen_imputed_overlapping_filtered.bed >> $INTERSECT_OUT/kilpinen_imputed_overlapping_filtered_header.bed + + + + + ### Use this file to test for trends in gene -awk 'BEGIN{FS=OFS="\t"}{print($24, $7)}' $INTERSECT_OUT/kilpinen_imputed_overlapping_filtered.bed | sed 's/\..*//g' > $INTERSECT_OUT/gene_snp_list.tsv +awk 'BEGIN{FS=OFS="\t"}{print($24, $7)}' $INTERSECT_OUT/kilpinen_imputed_overlapping_filtered_header.bed | sed 's/\..*//g' > $INTERSECT_OUT/gene_snp_list.tsv sed -i '1i snp\tgene' $INTERSECT_OUT/gene_snp_list.tsv diff --git a/eQTL_check/test_eQTL.smk b/eQTL_check/test_eQTL.smk index 85465ea..ee79b38 100644 --- a/eQTL_check/test_eQTL.smk +++ b/eQTL_check/test_eQTL.smk @@ -2,12 +2,15 @@ import pandas as pd genes_file = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/deboever_imputed_overlapping_filtered_header_pruned_snp_gene.tsv" +genes_file_kilpinen = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/kilpinen_imputed_overlapping_filtered_header_pruned_snp_gene.tsv" genes = pd.read_csv(genes_file, sep = "\t") -# genes = genes.iloc[1] +genes_kilpenen = pd.read_csv(genes_file_kilpinen, sep = "\t") +# genes_kilpenen = genes_kilpenen.iloc[1] rule all: input: - expand("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/deboever/gene_separate/beds/{gene}_deboever_eQTL_results.bed", gene = genes.gene_id), + # expand("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/deboever/gene_separate/beds/{gene}_deboever_eQTL_results.bed", gene = genes.gene_id), + expand("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/kilpinen/gene_separate/beds/{gene}_kilpinen_eQTL_results.bed", gene = genes_kilpenen.gene_id), rule eqtl: @@ -30,3 +33,21 @@ rule eqtl: """ +rule eqtl_kilpinen: + input: + bed = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/kilpinen_imputed_overlapping_filtered_header_pruned.bed" + output: + "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/kilpinen/gene_separate/beds/{gene}_kilpinen_eQTL_results.bed" + resources: + mem_per_thread_gb = lambda wildcards, attempt: attempt * 4, + disk_per_thread_gb = lambda wildcards, attempt: attempt * 4 + threads: 4 + params: + script = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/eQTL_check/test_eQTL_kilpinen.R", + outdir="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/kilpinen/gene_separate/", + snp=lambda wildcards: genes.ID_ref_alt[genes.gene_id == wildcards.gene], + log: + shell: + """ + /directflow/SCCGGroupShare/projects/DrewNeavin/software/anaconda3/envs/baseR402/bin/Rscript {params.script} {wildcards.gene} {params.outdir} {input.bed} + """ diff --git a/eQTL_check/test_eQTL_combine.R b/eQTL_check/test_eQTL_combine.R index 6252323..390a764 100644 --- a/eQTL_check/test_eQTL_combine.R +++ b/eQTL_check/test_eQTL_combine.R @@ -2,26 +2,53 @@ library(data.table) library(dplyr) library(ggplot2) library(colorspace) +library(ggpp) -indir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/deboever/gene_separate/beds/" -outdir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/deboever/combined/" +indir_deboever <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/deboever/gene_separate/beds/" +indir_kilpinen <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/kilpinen/gene_separate/beds/" +outdir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/eQTL_overlap/combined/" + dir.create(outdir, recursive = TRUE) ### Read in results -files <- list.files(indir) +deboever_files <- list.files(indir_deboever) + +bed_results_list <- lapply(deboever_files, function(x){ + fread(paste0(indir_deboever, x), sep = "\t") +}) + + +deboever_results_dt <- do.call(rbind, bed_results_list) +deboever_results_dt$fdr <- p.adjust(deboever_results_dt$dataset_p, method = "fdr") +deboever_results_dt$study <- "DeBoever" + + +### Kilpinen +kilpinen_files <- list.files(indir_kilpinen) + +kilpinen_results_list <- lapply(kilpinen_files, function(x){ + fread(paste0(indir_kilpinen, x), sep = "\t") -bed_results_list <- lapply(files, function(x){ - fread(paste0(indir, x), sep = "\t") }) -results_dt <- do.call(rbind, bed_results_list) +kilpinen_results_dt <- do.call(rbind, kilpinen_results_list) +kilpinen_results_dt$fdr <- p.adjust(kilpinen_results_dt$dataset_p, method = "fdr") +kilpinen_results_dt$study <- "Kilpinen" + +deboever_results_dt_prep <- deboever_results_dt[,c("chrom", "start", "end", "maf_deboever", "ref_deboever", "alt_deboever", "gene_id_deboever", "beta_deboever", "ID_ref_alt", "REF", "ALT", "TOB0421", "FSA0006", "MBE1006", "dataset_beta", "dataset_p", "direction", "fdr", "study")] +colnames(deboever_results_dt_prep) <- c("chrom", "start", "end", "maf", "REF", "ALT", "ensg", "beta", "ID_ref_alt", "dataset_REF", "dataset_ALT", "TOB0421", "FSA0006", "MBE1006", "dataset_beta", "dataset_p", "direction", "fdr", "study") +kilpinen_results_dt_prep <- kilpinen_results_dt[,c("#chrom", "start", "end", "maf_kilpinen", "REF", "ALT", "gene_id_kilpinen", "lmm_peer_beta_kilpinen", "ID_ref_alt", "REF", "ALT", "TOB0421", "FSA0006", "MBE1006", "dataset_beta", "dataset_p", "direction", "fdr", "study")] +colnames(kilpinen_results_dt_prep) <- c("chrom", "start", "end", "maf", "REF", "ALT", "ensg", "beta", "ID_ref_alt", "dataset_REF", "dataset_ALT", "TOB0421", "FSA0006", "MBE1006", "dataset_beta", "dataset_p", "direction", "fdr", "study") +kilpinen_results_dt_prep$REF <- NA +kilpinen_results_dt_prep$ALT <- NA -results_dt$fdr <- p.adjust(results_dt$dataset_p, method = "fdr") + +results_dt <- rbind(deboever_results_dt_prep, kilpinen_results_dt_prep) results_dt <- results_dt[order(fdr)] @@ -30,31 +57,97 @@ head(results_dt, n =100) results_dt[TOB0421 != FSA0006 & TOB0421 != MBE1006 & FSA0006 != MBE1006] -fwrite(results_dt, paste0(outdir, "deboever_overlap_results.tsv"), sep = "\t") +fwrite(results_dt, paste0(outdir, "overlap_results.tsv"), sep = "\t") results_dt <- results_dt[direction != "different_snp"] results_dt$significant <- ifelse(results_dt$fdr < 0.05, TRUE, FALSE) -results_dt[gene_name_deboever == "CHCHD2"] +results_dt[ensg == "ENSG00000106153"] results_dt$direction <- factor(results_dt$direction, levels = c("opposite","match")) -correlation_plot <- ggplot(results_dt, aes(beta_deboever, dataset_beta, color = log(fdr))) + - geom_point() + +results_dt$updated_beta <- ifelse(results_dt$study == "Kilpinen" & results_dt$direction == "match" & results_dt$beta * results_dt$dataset_beta < 0, results_dt$beta * -1, + ifelse(results_dt$study == "Kilpinen" & results_dt$direction == "opposite" & results_dt$beta * results_dt$dataset_beta > 0, results_dt$beta * -1, + ifelse(results_dt$study == "DeBoever" & results_dt$REF != results_dt$dataset_REF, results_dt$beta * -1, results_dt$beta))) + + +correlation <- list() +correlation_dt <- data.table(study = unique(results_dt$study), rho = as.numeric(NA), p = as.numeric(NA)) + +for (Study in results_dt$study){ + correlation[[Study]] <- cor.test(results_dt[study == Study]$updated_beta, results_dt[study == Study]$dataset_beta, method = "spearman", exact = TRUE) + correlation_dt[study == Study, "rho"] <- correlation[[Study]]$estimate[1] + correlation_dt[study == Study, "p"] <- correlation[[Study]]$p.value +} + +correlation_plot <- ggplot(results_dt, aes(updated_beta, dataset_beta, color = significant)) + + geom_hline(yintercept=0) + + geom_vline(xintercept=0) + + geom_point(alpha = 0.25) + theme_classic() + - scale_color_continuous_sequential(palette = "Purp", rev = FALSE) + # scale_color_continuous_sequential(palette = "Purp", rev = FALSE) + + scale_color_manual(values = c("lightgrey", "black")) + + facet_wrap(vars(study)) # scale_color_manual(values = c("grey","black")) ggsave(correlation_plot, filename = paste0(outdir, "correlation_plot.png")) + +correlation_plot <- ggplot(results_dt[study == "DeBoever" & significant == TRUE], aes(updated_beta, dataset_beta, color = direction)) + + geom_hline(yintercept=0) + + geom_vline(xintercept=0) + + geom_point(alpha = 0.25) + + theme_classic() + + ylab("This Dataset Effect Size") + + xlab("DeBoever Effect Size") + + scale_color_manual(values = c("grey60", "black"), name = "Effect\nDirections") + + stat_quadrant_counts() + + xlim(-2.25,2.25) + + ylim(-8.5,8.5) + + theme(legend.position = "none") + +ggsave(correlation_plot, filename = paste0(outdir, "correlation_plot_DeBoever.png"), height = 2.5, width = 2.5) +ggsave(correlation_plot, filename = paste0(outdir, "correlation_plot_DeBoever.pdf"), height = 2.5, width = 2.5) + + + + +### add number of genes in each quadrant +dim(results_dt[updated_beta < 0 & dataset_beta < 0]) # 2255 +dim(results_dt[updated_beta > 0 & dataset_beta > 0]) # 2116 + +dim(results_dt[updated_beta < 0 & dataset_beta > 0]) # 1071 +dim(results_dt[updated_beta > 0 & dataset_beta < 0]) # 1008 + + +dim(results_dt[updated_beta < 0 & dataset_beta < 0 & study == "DeBoever"]) # 1534 +dim(results_dt[updated_beta > 0 & dataset_beta > 0 & study == "DeBoever"]) # 1378 + +dim(results_dt[updated_beta < 0 & dataset_beta > 0 & study == "DeBoever"]) # 688 +dim(results_dt[updated_beta > 0 & dataset_beta < 0 & study == "DeBoever"]) # 600 + + +dim(results_dt[updated_beta < 0 & dataset_beta < 0 & study == "Kilpinen"]) # 721 +dim(results_dt[updated_beta > 0 & dataset_beta > 0 & study == "Kilpinen"]) # 738 + +dim(results_dt[updated_beta < 0 & dataset_beta > 0 & study == "Kilpinen"]) # 383 +dim(results_dt[updated_beta > 0 & dataset_beta < 0 & study == "Kilpinen"]) # 408 + + + + + + + histogram_plot <- ggplot(results_dt, aes(fdr, fill = direction)) + geom_histogram(position = "stack", binwidth = 0.05) + theme_classic() + geom_vline(aes(xintercept=0.05), color="red", linetype="dashed", size=1) + - scale_fill_manual(values = c("grey","black")) + scale_fill_manual(values = c("grey","black")) + + facet_wrap(vars(study)) ggsave(histogram_plot, filename = paste0(outdir, "histogram_plot.png")) @@ -64,9 +157,11 @@ histogram_plot_log <- ggplot(results_dt, aes(log10(fdr), fill = direction)) + geom_vline(aes(xintercept=log10(0.05)), color="red", linetype="dashed", size=0.25) + scale_fill_manual(values = c("grey","black")) + - scale_y_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0, 0)) + + ylab("Number of Associations") + + labs(fill = "eQTL\nEffect\nDirection\nAgreement") + + facet_wrap(vars(study), scales = "free") - ggsave(histogram_plot_log, filename = paste0(outdir, "histogram_plot_log.png")) @@ -79,14 +174,58 @@ popout_histogram_plot_log <- ggplot(results_dt[log10(fdr) < -10], aes(log10(fdr) legend.position="none", axis.text.x=element_text(size=7), axis.text.y=element_text(size=7)) + - scale_y_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0, 0)) + + facet_wrap(vars(study), scales = "free") + ggsave(popout_histogram_plot_log, filename = paste0(outdir, "popout_histogram_plot_log.png")) -combined_plot_log <- histogram_plot_log + annotation_custom(ggplotGrob(popout_histogram_plot_log), xmin = -35, xmax = -15, +annotation_custom2 <- function (grob, xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf, data) +{ + layer(data = data, stat = StatIdentity, position = PositionIdentity, + geom = ggplot2:::GeomCustomAnn, + inherit.aes = TRUE, params = list(grob = grob, + xmin = xmin, xmax = xmax, + ymin = ymin, ymax = ymax)) +} + +get_inset <- function(df){ + p <- ggplot(data=df %>% + group_by(study) %>% + slice(1), + aes(x = fdr, fill = direction)) + + geom_histogram(position = "stack", binwidth = 1) + + theme_classic() + + scale_fill_manual(values = c("grey","black")) + + theme(axis.title.x=element_blank(), + axis.title.y=element_blank(), + legend.position="none", + axis.text.x=element_text(size=7), + axis.text.y=element_text(size=7)) + + scale_y_continuous(expand = c(0, 0)) + return(p) +} + + +insets <- results_dt %>% + split(f = .$study) %>% + purrr::map(~annotation_custom2( + grob = ggplotGrob(get_inset(.)), + data = data.frame(category=unique(.$study)), + ymin = -500, ymax=1225, xmin=-32, xmax=-15) + ) + + +combined_plot_log <- histogram_plot_log + annotation_custom2(ggplotGrob(popout_histogram_plot_log), xmin = -35, xmax = -15, + ymin = 500, ymax = 1225) + +combined_plot_log <- histogram_plot_log + annotation_custom(grob=ggplotGrob(insets), xmin = -35, xmax = -15, ymin = 500, ymax = 1225) -ggsave(combined_plot_log, filename = paste0(outdir, "histogram_plot_log_w_popout.png"), width = 5, height = 3) -ggsave(combined_plot_log, filename = paste0(outdir, "histogram_plot_log_w_popout.pdf"), width = 5, height = 3) + +ggsave(combined_plot_log, filename = paste0(outdir, "histogram_plot_log_w_popout.png"), width = 5, height = 2.5) +ggsave(combined_plot_log, filename = paste0(outdir, "histogram_plot_log_w_popout.pdf"), width = 5, height = 2.5) + +### Calculate number of eQTLs and write in notion for writing paper time ### diff --git a/eQTL_check/test_eQTL_combine_deboever.R b/eQTL_check/test_eQTL_combine_deboever.R new file mode 100644 index 0000000..8aa6c76 --- /dev/null +++ b/eQTL_check/test_eQTL_combine_deboever.R @@ -0,0 +1,96 @@ +library(data.table) +library(dplyr) +library(ggplot2) +library(colorspace) + + +indir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/deboever/gene_separate/beds/" +outdir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/deboever/combined/" + +dir.create(outdir, recursive = TRUE) + + +### Read in results +files <- list.files(indir) + +bed_results_list <- lapply(files, function(x){ + fread(paste0(indir, x), sep = "\t") +}) + + +results_dt <- do.call(rbind, bed_results_list) + + +results_dt$fdr <- p.adjust(results_dt$dataset_p, method = "fdr") + + +results_dt <- results_dt[order(fdr)] + +head(results_dt, n =100) + +results_dt[TOB0421 != FSA0006 & TOB0421 != MBE1006 & FSA0006 != MBE1006] + +fwrite(results_dt, paste0(outdir, "deboever_overlap_results.tsv"), sep = "\t") + +results_dt <- results_dt[direction != "different_snp"] +results_dt$significant <- ifelse(results_dt$fdr < 0.05, TRUE, FALSE) + +results_dt[gene_name_deboever == "CHCHD2"] + +results_dt$direction <- factor(results_dt$direction, levels = c("opposite","match")) + + +correlation_plot <- ggplot(results_dt, aes(beta_deboever, dataset_beta, color = log(fdr))) + + geom_point() + + theme_classic() + + scale_color_continuous_sequential(palette = "Purp", rev = FALSE) + # scale_color_manual(values = c("grey","black")) + +ggsave(correlation_plot, filename = paste0(outdir, "correlation_plot.png")) + + +histogram_plot <- ggplot(results_dt, aes(fdr, fill = direction)) + + geom_histogram(position = "stack", binwidth = 0.05) + + theme_classic() + + geom_vline(aes(xintercept=0.05), + color="red", linetype="dashed", size=1) + + scale_fill_manual(values = c("grey","black")) + +ggsave(histogram_plot, filename = paste0(outdir, "histogram_plot.png")) + +histogram_plot_log <- ggplot(results_dt, aes(log10(fdr), fill = direction)) + + geom_histogram(position = "stack", binwidth = 1) + + theme_classic() + + geom_vline(aes(xintercept=log10(0.05)), + color="red", linetype="dashed", size=0.25) + + scale_fill_manual(values = c("grey","black")) + + scale_y_continuous(expand = c(0, 0)) + + ylab("Number of Associations") + + labs(fill = "eQTL\nEffect\nDirection\nAgreement") + + +ggsave(histogram_plot_log, filename = paste0(outdir, "histogram_plot_log.png")) + + +popout_histogram_plot_log <- ggplot(results_dt[log10(fdr) < -10], aes(log10(fdr), fill = direction)) + + geom_histogram(position = "stack", binwidth = 1) + + theme_classic() + + scale_fill_manual(values = c("grey","black")) + + theme(axis.title.x=element_blank(), + axis.title.y=element_blank(), + legend.position="none", + axis.text.x=element_text(size=7), + axis.text.y=element_text(size=7)) + + scale_y_continuous(expand = c(0, 0)) + +ggsave(popout_histogram_plot_log, filename = paste0(outdir, "popout_histogram_plot_log.png")) + +combined_plot_log <- histogram_plot_log + annotation_custom(ggplotGrob(popout_histogram_plot_log), xmin = -35, xmax = -15, + ymin = 500, ymax = 1225) + +ggsave(combined_plot_log, filename = paste0(outdir, "histogram_plot_log_w_popout.png"), width = 5, height = 3) +ggsave(combined_plot_log, filename = paste0(outdir, "histogram_plot_log_w_popout.pdf"), width = 5, height = 3) + + + +### Calculate number of eQTLs and write in notion for writing paper time ### diff --git a/eQTL_check/test_eQTL_combine_kilpinen.R b/eQTL_check/test_eQTL_combine_kilpinen.R new file mode 100644 index 0000000..fde0f74 --- /dev/null +++ b/eQTL_check/test_eQTL_combine_kilpinen.R @@ -0,0 +1,109 @@ +library(data.table) +library(dplyr) +library(ggplot2) +library(colorspace) + + +indir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/kilpinen/gene_separate/beds/" +outdir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/kilpinen/combined/" + +dir.create(outdir, recursive = TRUE) + + +### Read in results +files <- list.files(indir) + +bed_results_list <- lapply(files, function(x){ + fread(paste0(indir, x), sep = "\t") +}) + + +results_dt <- do.call(rbind, bed_results_list) + + +results_dt$fdr <- p.adjust(results_dt$dataset_p, method = "fdr") + + +results_dt <- results_dt[order(fdr)] + +head(results_dt, n =100) + +results_dt[TOB0421 != FSA0006 & TOB0421 != MBE1006 & FSA0006 != MBE1006] + +fwrite(results_dt, paste0(outdir, "kilpinen_overlap_results.tsv"), sep = "\t") + +results_dt <- results_dt[direction != "different_snp"] +results_dt$significant <- ifelse(results_dt$fdr < 0.05, TRUE, FALSE) + +results_dt[Gene_ID == "CHCHD2"] + +results_dt$direction <- factor(results_dt$direction, levels = c("opposite","match")) + +results_dt$lmm_peer_beta_kilpinen4plotting <- ifelse(results_dt$maf_kilpinen > 0.5, (-1 * results_dt$lmm_peer_beta_kilpinen), results_dt$lmm_peer_beta_kilpinen) +results_dt$dataset_beta4plotting <- ifelse(results_dt$dataset_maf > 0.5, (-1 * results_dt$dataset_beta), results_dt$dataset_beta) + + + +correlation_plot <- ggplot(results_dt, aes(lmm_peer_beta_kilpinen4plotting, dataset_beta4plotting, color = log(fdr))) + + geom_point() + + theme_classic() + + scale_color_continuous_sequential(palette = "Purp", rev = FALSE) + # scale_color_manual(values = c("grey","black")) + +ggsave(correlation_plot, filename = paste0(outdir, "kilpinen_correlation_plot.png")) + + +histogram_plot <- ggplot(results_dt, aes(fdr, fill = direction)) + + geom_histogram(position = "stack", binwidth = 0.05) + + theme_classic() + + geom_vline(aes(xintercept=0.05), + color="red", linetype="dashed", size=1) + + scale_fill_manual(values = c("grey","black")) + +ggsave(histogram_plot, filename = paste0(outdir, "kilpinen_histogram_plot.png")) + + +histogram_plot_log <- ggplot(results_dt, aes(log10(fdr), fill = direction)) + + geom_histogram(position = "stack", binwidth = 1) + + theme_classic() + + geom_vline(aes(xintercept=log10(0.05)), + color="red", linetype="dashed", size=0.25) + + scale_fill_manual(values = c("grey","black")) + + scale_y_continuous(expand = c(0, 0)) + + ylab("Number of Associations") + + labs(fill = "eQTL\nEffect\nDirection\nAgreement") + + +ggsave(histogram_plot_log, filename = paste0(outdir, "kilpinen_histogram_plot_log.png")) + + +popout_histogram_plot_log <- ggplot(results_dt[log10(fdr) < -10], aes(log10(fdr), fill = direction)) + + geom_histogram(position = "stack", binwidth = 1) + + theme_classic() + + scale_fill_manual(values = c("grey","black")) + + theme(axis.title.x=element_blank(), + axis.title.y=element_blank(), + legend.position="none", + axis.text.x=element_text(size=7), + axis.text.y=element_text(size=7)) + + scale_y_continuous(expand = c(0, 0)) + +ggsave(popout_histogram_plot_log, filename = paste0(outdir, "kilpinen_popout_histogram_plot_log.png")) + +combined_plot_log <- histogram_plot_log + annotation_custom(ggplotGrob(popout_histogram_plot_log), xmin = -29, xmax = -12, + ymin = 275, ymax = 750) + +ggsave(combined_plot_log, filename = paste0(outdir, "kilpinen_histogram_plot_log_w_popout.png"), width = 5, height = 3) +ggsave(combined_plot_log, filename = paste0(outdir, "kilpinen_histogram_plot_log_w_popout.pdf"), width = 5, height = 3) + + + +### Calculate number of eQTLs and write in notion for writing paper time ### +nrow(results_dt) + +table(results_dt$direction) + +nrow(results_dt[fdr < 0.05]) + +table(results_dt[fdr < 0.05]$direction) + diff --git a/eQTL_check/test_eQTL_kilpinen.R b/eQTL_check/test_eQTL_kilpinen.R new file mode 100644 index 0000000..1b4bd07 --- /dev/null +++ b/eQTL_check/test_eQTL_kilpinen.R @@ -0,0 +1,206 @@ +### Reason: test eQTLs identified in Kilpinen et al in our dataset +### Author: Drew Neavin +### Date: 1 April, 2022 + + +library(data.table) +library(Seurat) +library(vcfR) +library(glmmTMB) +library(dplyr) +library(ggplot2) + + + +##### Bring in variables ##### +### Bring in arguments +args <- commandArgs(trailingOnly = TRUE) +ensg <- args[1] +outdir <- args[2] +bed <- args[3] +datadir <- args[4] + + +cell_line_colors <- c("FSA0006" = "#F79E29", "MBE1006" = "#9B2C99", "TOB0421"= "#35369C") + + +##### Read in dataframe of pairs to test ##### +snp_gene_pairs <- fread(bed) +snp_gene_pairs$gene_id <- gsub("\\..+", "", snp_gene_pairs$gene_id) + + + + +##### Subset the correct snp ##### +snp_gene_pairs_subset <- snp_gene_pairs[gene_id == ensg] + + + +##### Read in data ##### +residuals <- readRDS(paste0("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review2/gene_separated/residuals4qtl/", ensg, "_residuals4qtl.rds")) +residual_dt <- data.table(residuals) +colnames(residual_dt) <- c("residual") +residual_dt$Barcode <- names(residuals) + +vcf <- read.vcfR("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/finalized_snps.recode.vcf") +meta <- fread("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Kilpinen_eQTLs/cell_meta.tsv", sep = "\t") +data <- meta[residual_dt, on = "Barcode"] +data$Barcode <- NULL + + + + +### Summarize data by site and line and replicatae +data_sum <- data[, .(residual=mean(residual)), by = .(Village, Line, Site, Replicate)] + + + +##### Get the snp from the vcf ##### +## GT ## +snp_dt <- data.table(extract.gt(element = "GT",vcf, IDtoRowNames = F)) + + +if (!all(colSums(is.na(snp_dt)) == nrow(snp_dt))){ + message("Found GT genotype format in cluster vcf. Will use that metric for cluster correlation.") + format_clust = "GT" + + if (any(grepl("\\|",snp_dt[,1]))){ + separator = "|" + message("Detected | separator for GT genotype format in cluster vcf") + } else if (any(grepl("/",snp_dt[,1]))) { + separator = "/" + message("Detected / separator for GT genotype format in cluster vcf") + } else { + format_clust = NA + message("Can't identify a separator for the GT field in cluster vcf, moving on to using GP.") + } + if (!is.na(format_clust)){ + snp_dt <- data.table(as_tibble(lapply(snp_dt, function(x) {gsub(paste0("0\\",separator,"0"),0, x)}) %>% + lapply(., function(x) {gsub(paste0("0\\",separator,"1"),1, x)}) %>% + lapply(., function(x) {gsub(paste0("1\\",separator,"0"),1, x)}) %>% + lapply(., function(x) {gsub(paste0("1\\",separator,"1"),2, x)}))) + } +} +print("Completed GT conversion") + + +colnames(snp_dt) <- gsub("36_TOB00421_i_E8", "TOB0421", colnames(snp_dt)) %>% + gsub("22_FSA", "FSA0006", .) %>% + gsub("29_MBE", "MBE1006", .) + + +## Get specific SNP ## +snp_dt$ID_ref_alt <- paste0(vcf@fix[,'CHROM'],":", vcf@fix[,'POS'],"_", vcf@fix[,'REF'],"_", vcf@fix[,'ALT']) +snp_dt$dataset_maf <- (1 - as.numeric(gsub(";.+", "", vcf@fix[,'INFO']) %>% gsub("AF=", "", .))) + +snp_dt_subset <- snp_dt[ID_ref_alt %in% snp_gene_pairs_subset$ID_ref_alt] + + + +### Add SNP data to data data.table ### +snp_dt_subset_long <- melt(snp_dt_subset, id.vars = c("ID_ref_alt", "dataset_maf"), measure.vars = c("TOB0421", "FSA0006", "MBE1006")) + + + +##### Make dataframe with kilpinen results, snp genotypes and our beta + pvalue ##### +results_dt <- snp_gene_pairs_subset[,c("#chrom", "start", "end", "variant_id", "info_score", "maf", "gene_id", "lmm_pvalue", "lmm_qvalue", "lmm_peer_beta", "primary_eQTL", "iPSC_specific", "ID", "REF", "ALT", "ID_ref_alt")] +colnames(results_dt) <- c(colnames(results_dt)[1:4],paste0(colnames(results_dt)[5:12], "_kilpinen"), colnames(results_dt)[13:ncol(results_dt)]) +results_dt <- results_dt[snp_dt_subset, on = "ID_ref_alt"] + + + +### Add genes to dataframe ### +gene_conversion <- fread("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/data/Expression_200128_A00152_0196_BH3HNFDSXY/GE/DRENEA_1/outs/filtered_feature_bc_matrix/features.tsv.gz", header = FALSE, col.names = c("ENSG", "Gene_ID", "Assay")) +results_dt$Gene_ID <- gene_conversion[ENSG == ensg]$Gene_ID + + + +results_dt$dataset_beta <- as.numeric(NA) +results_dt$dataset_beta_se <- as.numeric(NA) +results_dt$dataset_z <- as.numeric(NA) +results_dt$dataset_p <- as.numeric(NA) +results_dt$direction <- as.character(NA) + + +for (snp in unique(snp_dt_subset_long$ID_ref_alt)){ + + data_sum_snp <- snp_dt_subset_long[ID_ref_alt == snp][data_sum, on = c("variable" = "Line")] + + gt_long <- melt(results_dt[ID_ref_alt == snp][,c("ID_ref_alt", "TOB0421", "FSA0006", "MBE1006", "REF", "ALT")], measure.vars = c("TOB0421", "FSA0006", "MBE1006")) + gt_long$Genotype <- ifelse(gt_long$value == 0, paste0(gt_long$REF, "/", gt_long$REF), + ifelse(gt_long$value == 1, paste0(gt_long$REF, "/", gt_long$ALT), + ifelse(gt_long$value == 2, paste0(gt_long$ALT, "/", gt_long$ALT), NA))) + + gt_long <- unique(gt_long) + + data_sum_snp <- gt_long[data_sum_snp, on = c("value", "variable", "ID_ref_alt")] + + data_sum_snp$Genotype <- factor(data_sum_snp$Genotype, levels = unique(data_sum_snp[order(value)]$Genotype)) + + + ##### Check for beta ##### + base_model <- glmmTMB(residual ~ 1, data = data_sum_snp, REML = TRUE) + snp_model <- glmmTMB(as.numeric(residual) ~ as.numeric(value), data = data_sum_snp, REML = TRUE) + + anova_results <- anova(base_model, snp_model) + + results_dt[ID_ref_alt == snp]$dataset_beta <- summary(snp_model)$coefficients$cond[2,1] + results_dt[ID_ref_alt == snp]$dataset_beta_se <- summary(snp_model)$coefficients$cond[2,2] + results_dt[ID_ref_alt == snp]$dataset_z <- summary(snp_model)$coefficients$cond[2,3] + results_dt[ID_ref_alt == snp]$dataset_p <- anova_results$`Pr(>Chisq)`[2] + + if ((results_dt[ID_ref_alt == snp]$maf < 0.4 & results_dt[ID_ref_alt == snp]$dataset_maf < 0.4) | (results_dt[ID_ref_alt == snp]$maf > 0.6 & results_dt[ID_ref_alt == snp]$dataset_maf > 0.6)) { + if ((results_dt[ID_ref_alt == snp]$lmm_peer_beta * results_dt[ID_ref_alt == snp]$dataset_beta) > 0){ + results_dt[ID_ref_alt == snp]$direction <- "match" + } else { + results_dt[ID_ref_alt == snp]$direction <- "opposite" + } + } else { + if ((results_dt[ID_ref_alt == snp]$lmm_peer_beta * results_dt[ID_ref_alt == snp]$dataset_beta) < 0){ + results_dt[ID_ref_alt == snp]$direction <- "match" + } else { + results_dt[ID_ref_alt == snp]$direction <- "opposite" + } + } + + + ###### Make a figure of the results + ### Different shapes for location + ### Different fill for village and uniculture + ### Color by line + if (!results_dt[ID_ref_alt == snp]$direction == "different_snp"){ + shapes <- c(0,15,1,16,2,17) + names(shapes) <- paste0(c("Sydney", "Sydney", "Melbourne", "Melbourne", "Brisbane", "Brisbane"), " ", rep(c("Uni-culture","Village"))) + + data_sum_snp$shapes <- paste0(data_sum_snp$Site, " ", gsub(0, "Uni-culture",gsub(1, "Village", data_sum_snp$Village))) + data_sum_snp$value <- factor(data_sum_snp$value, levels = sort(unique(data_sum_snp$value))) + + labels <- levels(data_sum_snp$Genotype) + names(labels) <- as.numeric(levels(data_sum_snp$value)) + + plot <- ggplot(data_sum_snp, aes(value, residual)) + + geom_point(aes(shape = shapes, color = variable)) + + theme_classic() + + scale_color_manual(values = cell_line_colors) + + scale_fill_manual(values = cell_line_colors) + + scale_shape_manual(values = shapes) + + geom_smooth(aes(as.numeric(value), residual), position = "identity",method = "lm", color = "black", se=FALSE) + + scale_x_discrete(labels=labels) + + ylab("Normalized Expression") + + ggtitle(paste0(snp, "-", results_dt[ID_ref_alt == snp]$Gene_ID, " eQTL"), + subtitle = paste0("beta = ", round(results_dt[ID_ref_alt == snp]$dataset_beta, 3))) + + xlab(paste0(results_dt[ID_ref_alt == snp]$rsid, '\nGenotype')) + + theme(plot.title = element_text(hjust = 0.5), + plot.subtitle = element_text(hjust = 0.5)) + + labs(color = "iPSC Line", shape = "Site & Village", fill = NULL) + + ggsave(plot, filename = paste0(outdir,"plots/",ensg, "_", snp,"_kilpinen_eQTL_results.png"), width = 5, height = 3.5) + ggsave(plot, filename = paste0(outdir,"plots/",ensg, "_", snp,"_kilpinen_eQTL_results.pdf"), width = 5, height = 3.5) + } +} + +fwrite(results_dt, paste0(outdir,"beds/",ensg, "_kilpinen_eQTL_results.bed"), sep = "\t") + + + + diff --git a/multi-passage/Variance/variance_partition_multipassage.R b/multi-passage/Variance/variance_partition_multipassage.R new file mode 100644 index 0000000..cffa815 --- /dev/null +++ b/multi-passage/Variance/variance_partition_multipassage.R @@ -0,0 +1,358 @@ +library(haven) +library(ggplot2) +library(glmmTMB) +library(Seurat) +library(tidyverse) +library(specr) +library(data.table) +library(dsLib) +library(pkgcond) +library(texreg) + + +inicio("Starting Analysis") + + +##### Define functions ##### +icc_glmmtmb <- function(model, percent = TRUE) { + tmp <- VarCorr(model) + var <- do.call(rbind, lapply(names(tmp$cond), function(x) data.table("grp" = x, "vcov" = attr(tmp$cond[[x]], "stddev")^2))) + var <- rbind(var, data.table("grp" = "Residual", "vcov" = sigma(model)^2)) + sum_var <- sum(var$vcov) + var <- var %>% dplyr::mutate(icc = vcov/sum_var) + if (isTRUE(percent)) { + var <- var %>% dplyr::mutate(percent = .data$icc * 100) + } + return(var) +} + + + +##### Bring in variables ##### +### Bring in arguments +args <- commandArgs(trailingOnly = TRUE) +icc_interaction_outdir <- paste0(args[1]) +icc_outdir <- paste0(args[2]) +model_interaction_outdir <- paste0(args[3]) +model_outdir <- paste0(args[4]) +resid_outdir <- paste0(args[5]) +gene <- as.character(args[6]) + +print(icc_outdir) +print(icc_outdir) +print(model_outdir) +print(resid_outdir) +print(gene) + + + +##### Read in seurat with genes ##### +seurat <- readRDS("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/preQC/seurat_joint_SCT_1pct_expressing.rds") + + + +### Make DF for modeling ### +df_hier_unscale <- data.frame("Expression" = seurat[["SCT"]]@scale.data[gene,], "Line" = seurat@meta.data$Assignment, "Time" = as.factor(seurat@meta.data$Pool)) +colnames(df_hier_unscale)[1] <- "Expression" + + + +##### Leave one out method ##### +variables <- c("Line", "Time") + +model_all <- as.formula(paste0("Expression ~ (1|", paste0(variables, collapse = ") + (1|"), ")")) + + +boolFalse<-F +while(boolFalse==F & length(variables) > 0){ + tryCatch({ + print(variables) + model_glmmtmb <- suppress_warnings(glmmTMB(formula = noquote(model_all), data = df_hier_unscale, REML = TRUE), "giveCsparse") + boolFalse<-T + },error=function(e){ + if (length(variables) > 1){ + variables <- variables[1:(length(variables) -1)] + } else { + variables <- c() + } + }) +} + + +if (!length(variables) == 0){ + + +### Deal with singular fits by removing last variable until a fit can be found - ordered in variables buy importance +while (!model_glmmtmb$sdr$pdHess & length(variables) > 0 ){ + print("Singular fit: removing last variable and rerunning with one less covariate.") + if (length(variables) > 1){ + variables <- variables[1:(length(variables) -1)] + print(variables) + model_all <- as.formula(paste0("Expression ~ (1|", paste0(variables, collapse = ") + (1|"), ")")) + model_glmmtmb <- suppress_warnings(glmmTMB(formula = noquote(model_all), data = df_hier_unscale, REML = TRUE), "giveCsparse") + } else { + variables <- c() + } +} + +print(variables) + +if (length(variables) > 0){ + + model_loo <- list() + + icc <- data.table(grp = variables, P = as.numeric(NA)) + + for (variable in variables){ + print(variable) + if (length(variables) > 1){ + model <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")")) + } else { + model <- as.formula(paste0("Expression ~ 1")) + } + model_loo[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse") + icc[grp == variable]$P <- anova(model_loo[[variable]], model_glmmtmb)$`Pr(>Chisq)`[2] + } + + + if (!(any(icc[grp != "Residual"]$P > 0.05/length(variables)) | any(is.na(icc[grp != "Residual"]$P)))){ + model_loo_updated <- model_loo + + updated_model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(variables, collapse = ") + (1|"), ")")) + + model_loo_updated[["all"]] <- suppress_warnings(glmmTMB(formula = noquote(updated_model), data = df_hier_unscale, REML = TRUE), "giveCsparse") + + ### Calculate the variance explained by each of the included variables ### + icc <- icc_glmmtmb(model_loo_updated[["all"]]) + + + ### Recalculate significance ### + icc$P <- as.numeric(NA) + icc$gene <- gene + + for (variable in variables){ + print(variable) + if (length(variables) > 1){ + model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")")) + } else { + model <- as.formula(paste0("Expression ~ 1")) + } + model_loo_updated[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse") + icc[grp == variable]$P <- anova(model_loo_updated[[variable]], model_loo_updated[["all"]])$`Pr(>Chisq)`[2] + } + } + + + while((any(icc[grp != "Residual"]$P > 0.05/length(variables)) | any(is.na(icc[grp != "Residual"]$P)))){ + + print("Removing non-significant vartiables and retesting signficance") + + ##### Identify variables to keep ##### + variables <- icc[P < 0.05/length(variables)]$grp + + if (length(variables) > 0){ + + ##### Calculate full model ##### + updated_model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(variables, collapse = ") + (1|"), ")")) + + + model_loo_updated <- list() + model_loo_updated[["all"]] <- suppress_warnings(glmmTMB(formula = noquote(updated_model), data = df_hier_unscale, REML = TRUE), "giveCsparse") + + + + ### Calculate the variance explained by each of the included variables ### + icc <- icc_glmmtmb(model_loo_updated[["all"]]) + + + + ### Recalfulate significance ### + icc$P <- as.numeric(NA) + icc$gene <- gene + + for (variable in variables){ + print(variable) + if (length(variables) > 1){ + model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")")) + } else { + model <- as.formula(paste0("Expression ~ 1")) + } + model_loo_updated[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse") + icc[grp == variable]$P <- anova(model_loo_updated[[variable]], model_loo_updated[["all"]])$`Pr(>Chisq)`[2] + } + + + + } else { + icc <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character()) + model_loo_updated <- list() + } + } + + interaction_variables <- c() + + if (length(variables) > 1){ + ### Add in interactions of the significant variables + if ("Line" %in% variables & "Time" %in% variables){ + interaction_variables <- c(interaction_variables, "Line:Time") + } + + model_all_interaction <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, interaction_variables), collapse = ") + (1|"), ")")) + + + boolFalse<-F + while(boolFalse==F & length(interaction_variables) > 0){ + tryCatch({ + print(c(variables, interaction_variables)) + model_glmmtmb_interaction <- suppress_warnings(glmmTMB(formula = noquote(model_all_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse") + boolFalse<-T + },error=function(e){ + if (length(interaction_variables) > 1){ + interaction_variables <- interaction_variables[1:(length(interaction_variables) -1)] + } else { + interaction_variables <- c() + } + }) + } + + ### Deal with singular fits by removing last variable until a fit can be found - ordered in variables buy importance + while (!model_glmmtmb_interaction$sdr$pdHess & length(interaction_variables) > 0 ){ + print("Singular fit: removing last variable and rerunning with one less covariate.") + if (length(interaction_variables) > 1){ + interaction_variables <- interaction_variables[1:(length(interaction_variables) -1)] + print(c(interaction_variables, variables)) + model_all_interaction <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, interaction_variables), collapse = ") + (1|"), ")")) + model_glmmtmb_interaction <- suppress_warnings(glmmTMB(formula = noquote(model_all_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse") + } else { + interaction_variables <- c() + } + } + + if (length(interaction_variables) > 0){ + + model_loo_interaction <- list() + + icc_interaction <- data.table(grp = interaction_variables, P = as.numeric(NA)) + + for (variable in c(interaction_variables)){ + print(variable) + if (length(interaction_variables) > 1){ + model_interaction <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, interaction_variables)[!c(variables, interaction_variables) %in% variable], collapse = ") + (1|"), ")")) + } else { + model_interaction <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")")) + } + model_loo_interaction[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse") + icc_interaction[grp == variable]$P <- anova(model_loo_interaction[[variable]], model_glmmtmb_interaction)$`Pr(>Chisq)`[2] + } + + + if (!(any(icc_interaction[grp != "Residual"]$P > 0.05/length(c(variables, interaction_variables))) | any(is.na(icc_interaction[grp != "Residual"]$P)))){ + model_loo_interaction_updated <- model_loo_interaction + + updated_model_interaction <- as.formula(paste0("Expression ~ 1 + (1|", paste0(c(variables, interaction_variables), collapse = ") + (1|"), ")")) + + model_loo_interaction_updated[["all"]] <- suppress_warnings(glmmTMB(formula = noquote(updated_model_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse") + + ### Calculate the variance explained by each of the included variables ### + icc_interaction <- icc_glmmtmb(model_loo_interaction_updated[["all"]]) + + + ### Recalculate significance ### + icc_interaction$P <- as.numeric(NA) + icc_interaction$gene <- gene + + for (variable in c(variables, interaction_variables)){ + print(variable) + if (length(c(interaction_variables)) > 1){ + model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(c(variables, interaction_variables)[!c(variables, interaction_variables) %in% variable], collapse = ") + (1|"), ")")) + } else { + model <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")")) + } + model_loo_interaction_updated[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse") + icc_interaction[grp == variable]$P <- anova(model_loo_interaction_updated[[variable]], model_loo_interaction_updated[["all"]])$`Pr(>Chisq)`[2] + } + } + + + while((any(icc_interaction[!(grp %in% c("Residual", variables))]$P > 0.05/length(c(variables, interaction_variables))) | any(is.na(icc_interaction[!(grp %in%c("Residual", variables))]$P)))){ + + print("Removing non-significant vartiables and retesting signficance") + + ##### Identify variables to keep ##### + interaction_variables <- icc_interaction[!(grp %in% c("Residual", variables)) & P < 0.05/length(c(variables, interaction_variables))]$grp + + if (length(interaction_variables) > 0){ + + ##### Calculate full model ##### + updated_model_interaction <- as.formula(paste0("Expression ~ 1 + (1|", paste0(c(variables, interaction_variables), collapse = ") + (1|"), ")")) + + + model_loo_interaction_updated <- list() + model_loo_interaction_updated[["all"]] <- suppress_warnings(glmmTMB(formula = noquote(updated_model_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse") + + + + ### Calculate the variance explained by each of the included variables ### + icc_interaction <- icc_glmmtmb(model_loo_interaction_updated[["all"]]) + + + + ### Recalfulate significance ### + icc_interaction$P <- as.numeric(NA) + icc_interaction$gene <- gene + + for (variable in c(variables, interaction_variables)){ + print(variable) + model_interaction <- as.formula(paste0("Expression ~ 1 + (1|", paste0(c(variables, interaction_variables)[!(c(variables, interaction_variables) %in% variable)], collapse = ") + (1|"), ")")) + model_loo_interaction_updated[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse") + icc_interaction[grp == variable]$P <- anova(model_loo_interaction_updated[[variable]], model_loo_interaction_updated[["all"]])$`Pr(>Chisq)`[2] + } + } else { + icc_interaction <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character()) + model_loo_interaction_updated <- list() + } + + if (nrow(icc_interaction) > nrow(icc)){ + saveRDS(icc_interaction, paste0(icc_interaction_outdir, gene, "_icc.rds"), compress = TRUE) + saveRDS(model_loo_interaction_updated, paste0(model_interaction_outdir, gene, "_fitted_models.rds"), compress = TRUE) + } + } + } else { + icc_interaction <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character()) + model_loo_interaction_updated <- list() + } + + + ### If line is significant, then get residuals for downstream qtl checks ### + if ("Line" %in% variables){ + print("Making residuals for qtl detection") + if (length(variables) > 1){ + if (length(interaction_variables) > 0){ + model_no_line <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, interaction_variables)[-grep("Line", c(variables, interaction_variables))], collapse = ") + (1|"), ")")) + } else { + model_no_line <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% "Line"], collapse = ") + (1|"), ")")) + } + } else { + model_no_line <- as.formula(paste0("Expression ~ 1")) + } + fit_no_line <- glmmTMB(formula = noquote(model_no_line), data = df_hier_unscale, REML = TRUE) + residuals <- resid(fit_no_line) + saveRDS(residuals, paste0(resid_outdir, gene, "_residuals4qtl.rds"), compress = TRUE) + } + + } else { + icc <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character()) + model_loo_updated <- list() + } + } else { + icc <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character()) + model_loo_updated <- list() + } +} + + +saveRDS(icc, paste0(icc_outdir, gene, "_icc.rds"), compress = TRUE) +saveRDS(model_loo_updated, paste0(model_outdir, gene, "_fitted_models.rds"), compress = TRUE) + + +fin() \ No newline at end of file diff --git a/multi-passage/Variance/variance_partition_multipassage.smk b/multi-passage/Variance/variance_partition_multipassage.smk new file mode 100644 index 0000000..9bb352f --- /dev/null +++ b/multi-passage/Variance/variance_partition_multipassage.smk @@ -0,0 +1,99 @@ +import pandas as pd + + +genes_file = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/preQC/genes_1pct_expressing.tsv" +genes = pd.read_csv(genes_file, sep = "\t") +# genes = genes.iloc[1] + + +rule all: + input: + # expand("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance/gene_separated/fit_models/{gene}_fitted_models.rds", gene = genes.Gene), + # expand("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_ncov/gene_separated/fit_models/{gene}_fitted_models.rds", gene = genes.Gene), + expand("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_integratedSCT/gene_separated/fit_models/{gene}_fitted_models.rds", gene = genes.Gene), + + +rule partition_variance: + input: + "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/preQC/seurat_joint_SCT_1pct_expressing.rds" + output: + "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance/gene_separated/icc/{gene}_icc.rds", + "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance/gene_separated/fit_models/{gene}_fitted_models.rds" + resources: + mem_per_thread_gb = lambda wildcards, attempt: attempt * 16, + disk_per_thread_gb = lambda wildcards, attempt: attempt * 16 + threads: 4 + params: + script = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/multi-passage/Variance/variance_partition_multipassage.R", + out_icc="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance/gene_separated/icc/", + out_model="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance/gene_separated/fit_models/", + out_resids="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance/gene_separated/residuals4qtl/", + out_icc_interaction = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance/gene_separated/icc_interaction/", + out_model_interaction = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance/gene_separated/icc_interaction/" + log: + shell: + """ + mkdir -p {params.out_icc_interaction} + mkdir -p {params.out_model_interaction} + mkdir -p {params.out_resids} + + /directflow/SCCGGroupShare/projects/DrewNeavin/software/anaconda3/envs/baseR402/bin/Rscript {params.script} {params.out_icc_interaction} {params.out_icc} {params.out_model_interaction} {params.out_model} {params.out_resids} {wildcards.gene} + """ + + +rule partition_variance_ncov: + input: + seurat = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/preQC/seurat_joint_SCT_1pct_expressing.rds" + output: + "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_ncov/gene_separated/icc/{gene}_icc.rds", + "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_ncov/gene_separated/icc_interaction/{gene}_icc.rds", + "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_ncov/gene_separated/fit_models/{gene}_fitted_models.rds" + resources: + mem_per_thread_gb = lambda wildcards, attempt: attempt * 16, + disk_per_thread_gb = lambda wildcards, attempt: attempt * 16 + threads: 4 + params: + script = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/multi-passage/Variance/variance_partition_multipassage_integratedSCT_Ncov.R", + out_icc="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_ncov/gene_separated/icc/", + out_model="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_ncov/gene_separated/fit_models/", + out_resids="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_ncov/gene_separated/residuals4qtl/", + out_icc_interaction = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_ncov/gene_separated/icc_interaction/", + out_model_interaction = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_ncov/gene_separated/icc_interaction/" + log: + shell: + """ + mkdir -p {params.out_icc_interaction} + mkdir -p {params.out_model_interaction} + mkdir -p {params.out_resids} + + /directflow/SCCGGroupShare/projects/DrewNeavin/software/anaconda3/envs/baseR402/bin/Rscript {params.script} {params.out_icc_interaction} {params.out_icc} {params.out_model_interaction} {params.out_model} {params.out_resids} {wildcards.gene} + """ + + +### Tried manually with just CHCHD2 and the Ncov impacts the ability to get the line effect so cannot run this way (could downsample some of the pools to be more even sizes) +rule partition_variance_integratedSCT: + input: + seurat = ancient("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/preQC/time-integrated_filtered_seurat_1pct_expressing.rds") + output: + "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_integratedSCT/gene_separated/icc/{gene}_icc.rds", + "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_integratedSCT/gene_separated/fit_models/{gene}_fitted_models.rds" + resources: + mem_per_thread_gb = lambda wildcards, attempt: attempt * 8, + disk_per_thread_gb = lambda wildcards, attempt: attempt * 8 + threads: 4 + params: + script = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/multi-passage/Variance/variance_partition_multipassage_integratedSCT.R", + out_icc="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_integratedSCT/gene_separated/icc/", + out_model="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_integratedSCT/gene_separated/fit_models/", + out_resids="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_integratedSCT/gene_separated/residuals4qtl/", + out_icc_interaction = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_integratedSCT/gene_separated/icc_interaction/", + out_model_interaction = "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_integratedSCT/gene_separated/icc_interaction/" + log: + shell: + """ + mkdir -p {params.out_icc_interaction} + mkdir -p {params.out_model_interaction} + mkdir -p {params.out_resids} + + /directflow/SCCGGroupShare/projects/DrewNeavin/software/anaconda3/envs/baseR402/bin/Rscript {params.script} {params.out_icc_interaction} {params.out_icc} {params.out_model_interaction} {params.out_model} {params.out_resids} {wildcards.gene} + """ \ No newline at end of file diff --git a/multi-passage/Variance/variance_partition_multipassage_combine.R b/multi-passage/Variance/variance_partition_multipassage_combine.R new file mode 100644 index 0000000..37150ad --- /dev/null +++ b/multi-passage/Variance/variance_partition_multipassage_combine.R @@ -0,0 +1,874 @@ +##### Reason: combine the results for the variance explained by different factors for each gene +##### Author: Drew Neavin +##### Date: 14 March, 2022 + + +##### Load in libraries ##### +library(data.table) +library(tidyverse) +library(ggridges) +library(raincloudplots) +library(ggdist) +library(clusterProfiler) +library(org.Hs.eg.db) +library(GOSemSim) +library(RColorBrewer) + + + +##### Set up directories ##### +dir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/" +icc_dir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_integratedSCT/gene_separated/icc/" +icc_dir2 <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance/gene_separated/icc/" +icc_interaction_dir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_integratedSCT/gene_separated/icc_interaction/" +outdir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/variance_integratedSCT/combined/" +outdir_comparison <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/comparison/" + +dir.create(outdir, recursive = TRUE) +dir.create(outdir_comparison, recursive = TRUE) + + +vars <- c("Line", "Passage", "Line:Passage", "Residual") +selected_vars <- c("Line", "Passage", "Line:Passage", "Residual") +var_colors <- c("#4734a9", "#78c0fe", "#97cf8a", "gray80") + +names(var_colors) <- vars + +var_colors <- var_colors[selected_vars] + + + +##### Get list of icc files ##### +icc_files <- list.files(icc_dir) +# icc_files2 <- list.files(icc_dir2) + + + +##### Read in icc results ##### +icc_results_list <- lapply(icc_files, function(x){ + readRDS(paste0(icc_dir,x)) +}) +names(icc_results_list) <- icc_files + +# icc_results_list2 <- lapply(icc_files2, function(x){ +# readRDS(paste0(icc_dir2,x)) +# }) +# names(icc_results_list2) <- icc_files2 + + + +##### Get list of icc interaction files ##### +icc_interaction_files <- list.files(icc_interaction_dir, pattern = "_icc.rds") + + + +##### Read in icc results ##### +icc_interaction_results_list <- lapply(icc_interaction_files, function(x){ + readRDS(paste0(icc_interaction_dir,x)) +}) +names(icc_interaction_results_list) <- icc_interaction_files + + + +##### Merge icc results into a single data.table ##### +icc_dt <- do.call(rbind, icc_results_list) +# icc_dt2 <- do.call(rbind, icc_results_list2) +# colnames(icc_dt2) <- paste0("integrate_sct_", colnames(icc_dt2)) + + +# icc_dt_joined <- icc_dt[icc_dt2, on = c("grp" = "integrate_sct_grp", "gene" = "integrate_sct_gene")] +# icc_dt_joined$diff <- icc_dt_joined$percent - icc_dt_joined$integrate_sct_percent +# max(na.omit(icc_dt_joined$diff)) + +# icc_dt_joined[diff == max(na.omit(icc_dt_joined$diff))] + +# icc_dt_joined[gene == "ENSG00000106153"] + + +# diff_dist <- ggplot(icc_dt_joined, aes(diff)) + +# geom_histogram() + +# facet_wrap(vars(grp)) +# # theme_classic() + +# ggsave(diff_dist, filename = paste0(outdir_comparison, "diff_histogram.png")) + + +# scatter <- ggplot(icc_dt_joined, aes(percent,integrate_sct_percent)) + +# geom_point() + +# facet_wrap(vars(grp), scales = "free") + +# theme_classic() + +# ggsave(scatter, filename = paste0(outdir_comparison, "scatter_correlation.png"), height = 4.5) + + +# icc_dt_joined_long <- melt(icc_dt_joined, id.vars = c("grp", "gene"),measure.vars = c("percent", "integrate_sct_percent")) + + +# dist <- ggplot(icc_dt_joined_long, aes(value)) + +# geom_histogram() + +# facet_grid(grp ~ variable) + +# theme_classic() + +# ggsave(dist, filename = paste0(outdir_comparison, "histogram.png")) + + +# ### want to use the integrate_sct_percent +# ### compared the two methods and appears that fitting when have been separately sct assessed => Line effect well correlated but time is much less (and not well correlated between the two methods) so likely inducing Time effects from normalizing all together + + + +icc_dt$percent_round <- round(icc_dt$percent) + +icc_dt$grp <- gsub("Time", "Passage", icc_dt$grp) +icc_dt$grp <- factor(icc_dt$grp, levels= rev(c("Line", "Passage", "Line:Passage", "Residual"))) + +group_size <- data.table(table(icc_dt$grp)) +colnames(group_size) <- c("grp", "size") +group_size$grp_size <- paste0(group_size$grp, "\nN = ", group_size$size) + +icc_dt <- group_size[icc_dt, on = "grp"] +icc_dt$grp_size <- factor(icc_dt$grp_size, levels = unique(group_size$grp_size)) + + +##### Merge icc_interaction results into a single data.table ##### +icc_interaction_dt <- do.call(rbind, icc_interaction_results_list) + +icc_interaction_dt$percent_round <- round(icc_interaction_dt$percent) + +icc_interaction_dt$grp <- gsub("Time", "Passage", icc_interaction_dt$grp) +icc_interaction_dt$grp <- factor(icc_interaction_dt$grp, levels= rev(c("Line", "Passage", "Line:Passage", "Residual"))) + +group_size <- data.table(table(icc_interaction_dt$grp)) +colnames(group_size) <- c("grp", "size") +group_size$grp_size <- paste0(group_size$grp, "\nN = ", group_size$size) + +icc_interaction_dt <- group_size[icc_interaction_dt, on = "grp"] +icc_interaction_dt$grp_size <- factor(icc_interaction_dt$grp_size, levels = unique(group_size$grp_size)) + +### *** Need to add individual effects without interaction in to interaction dt *** ### +icc_interaction_plus_dt <- rbind(icc_interaction_dt, icc_dt[!(gene %in% icc_interaction_dt$gene)]) + + +group_size <- data.table(table(icc_interaction_plus_dt$grp)) +colnames(group_size) <- c("grp", "size") +group_size$grp_size <- paste0(group_size$grp, "\nN = ", group_size$size) + +icc_interaction_plus_dt <- group_size[icc_interaction_plus_dt, on = "grp"] +icc_interaction_plus_dt$grp_size <- factor(icc_interaction_plus_dt$grp_size, levels = unique(group_size$grp_size)) + + + +##### Check difference in percent explained with and without interactions ##### +icc_interaction_dt_joined <- icc_dt[icc_interaction_dt, on = c("grp", "gene")] + +icc_interaction_dt_joined$difference <- icc_interaction_dt_joined$percent - icc_interaction_dt_joined$i.percent + + +pRaincloud_dif <- ggplot(icc_interaction_dt_joined, aes(x = difference, y = factor(grp_size, levels = rev(levels(grp_size))), fill = factor(grp, levels = rev(selected_vars)))) + + geom_density_ridges(stat = "binline", bins = 90, scale = 0.7, draw_baseline = FALSE, aes(height =..ndensity..), alpha = 0.75) + + geom_boxplot(size = 0.5,width = .15, outlier.size = 0.25, position = position_nudge(y=-0.12), alpha = 0.75) + + coord_cartesian(xlim = c(1.2, NA), clip = "off") + + theme_classic() + + theme(axis.title.y=element_blank()) + + xlab("Percent Variance Explained") + + scale_y_discrete(expand = c(0.03, 0)) + + scale_fill_manual(values = var_colors) + +ggsave(pRaincloud_dif, filename = paste0(outdir, "variance_explained_interaction_difference_raincloud.png"), height = 8, width = 7) +ggsave(pRaincloud_dif, filename = paste0(outdir, "variance_explained_interaction_difference_raincloud.pdf"), height = 8, width = 7) + + + +##### Make a figure of stacked variance explained ##### +### Order based on line variance explained ### +genes_list <- list() + +for (group in c("Line", "Passage", "Line:Passage", "Residual")){ + genes_list[[group]] <- icc_dt[grp == group][rev(order(percent_round))]$gene +} + +genes <- unique(unlist(genes_list)) + +icc_dt$gene <- factor(icc_dt$gene, levels = genes) + +icc_dt$grp <- factor(icc_dt$grp, levels= rev(c("Line", "Passage", "Line:Passage", "Residual"))) + + +## First on line percent, then village percent ## +bar_proportions <- ggplot(icc_dt, aes(x = gene, y = percent, fill = grp)) + + geom_bar(position="stack", stat="identity") + + theme_classic() + + theme(axis.title.x=element_blank(), + axis.text.x=element_blank(), + axis.ticks.x=element_blank()) + + scale_fill_manual(values = var_colors) + + +ggsave(bar_proportions, filename = paste0(outdir, "variance_explained_bar.png"), width = 20) + + + +### Try boxplot ### +boxplot <- ggplot(icc_dt, aes(x = factor(grp, levels = rev(levels(grp))), y = percent, fill = factor(grp, levels = rev(levels(grp))), color = factor(grp, levels = rev(levels(grp))))) + + geom_boxplot(alpha = 0.5, size = 0.5) + + theme_classic() + + xlab("Covariate") + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), + legend.position="none") + + scale_fill_manual(values = var_colors) + + scale_color_manual(values = var_colors) + + +ggsave(boxplot, filename = paste0(outdir, "variance_explained_box.png"), height = 3, width = 2) + + +### Try ridgplots ### +pRidges <- ggplot(icc_dt[grp != "Residual"], aes(x = percent, y = factor(grp, levels = rev(levels(grp))), fill = factor(grp, levels = rev(levels(grp))))) + + geom_density_ridges(stat = "binline", bins = 200, scale = 0.95, draw_baseline = FALSE) + + # geom_density_ridges() + + theme_classic() + +ggsave(pRidges, filename = paste0(outdir, "variance_explained_ridge.png"), height = 8, width = 10) + +### Try ridgplots ### +pRidges_pop <- ggplot(icc_dt[grp != "Residual"][percent > 10], aes(x = percent, y = factor(grp, levels = rev(levels(grp))), fill = factor(grp, levels = rev(levels(grp))))) + + geom_density_ridges(stat = "binline", bins = 180, scale = 0.95, draw_baseline = FALSE) + + # geom_density_ridges() + + theme_classic() + +ggsave(pRidges_pop, filename = paste0(outdir, "variance_explained_ridge_pop.png"), height = 8, width = 10) + + +# pRaincloud <- ggplot(icc_dt, aes(x = percent, y = factor(grp_size, levels = rev(levels(grp_size))), fill = factor(grp, levels = rev(vars)))) + +# geom_density_ridges(stat = "binline", bins = 90, scale = 0.7, draw_baseline = FALSE, aes(height =..ndensity..), alpha = 0.75) + +# geom_boxplot(size = 0.5,width = .15, outlier.size = 0.25, position = position_nudge(y=-0.12), alpha = 0.75) + +# coord_cartesian(xlim = c(1.2, NA), clip = "off") + +# theme_classic() + +# theme(axis.title.y=element_blank()) + +# xlab("Percent Variance Explained") + +# scale_y_discrete(expand = c(0.03, 0)) + +# scale_fill_manual(values = var_colors) + +pRaincloud <- ggplot(icc_dt, aes(x = percent, y = factor(grp_size, levels = rev(levels(grp_size))), fill = factor(grp, levels = rev(vars)))) + + geom_density_ridges(size = 0.1,stat = "binline", bins = 100, scale = 0.7, draw_baseline = FALSE, aes(height =..ndensity..)) + + geom_point(size =1, position = position_nudge(y=-0.09), shape = "|", aes(color = factor(grp, levels = rev(vars)))) + + coord_cartesian(xlim = c(1.2, NA), clip = "off") + + theme_classic() + + theme(axis.title.y=element_blank()) + + xlab("Percent Variance Explained") + + scale_y_discrete(expand = c(0.1, 0)) + + scale_fill_manual(values = var_colors, name = "Variable") + + scale_color_manual(values = var_colors, name = "Variable") + + geom_vline(xintercept = 1, lty="11", color = "grey50", size = 0.5) + +ggsave(pRaincloud, filename = paste0(outdir, "variance_explained_raincloud.png"), height = 2, width = 5) +ggsave(pRaincloud, filename = paste0(outdir, "variance_explained_raincloud.pdf"), height = 2, width = 5) + + + +icc_interaction_plus_dt$grp_size <- factor(icc_interaction_plus_dt$grp_size, levels = c("Line\nN = 750", "Passage\nN = 750", "Line:Passage\nN = 619", "Residual\nN = 750")) + + +# pRaincloud_interaction <- ggplot(icc_interaction_plus_dt, aes(x = percent, y = grp_size, fill = factor(grp, levels = rev(vars)))) + +# geom_density_ridges(size = 0.1, stat = "binline", bins = 100, scale = 0.7, draw_baseline = FALSE, aes(height =..ndensity.., fill = factor(grp, levels = rev(vars))), alpha = 0.75) + +# geom_boxplot(outlier.shape=20, size = 0.1,width = .15, outlier.size = 0.01, position = position_nudge(y=-0.12), alpha = 0.75) + +# coord_cartesian(xlim = c(1.2, NA), clip = "off") + +# theme_classic() + +# theme(axis.title.y=element_blank()) + +# xlab("Percent Variance Explained") + +# scale_y_discrete(expand = c(0.03, 0)) + +# scale_fill_manual(values = var_colors) + +# scale_color_manual(values = var_colors) + +# geom_vline(xintercept = 1,color = "grey70", size = 0.4, lty="11") + +pRaincloud_interaction <- ggplot(icc_interaction_plus_dt, aes(x = percent, y = factor(grp_size, levels = levels(grp_size)), fill = factor(grp, levels = rev(vars)))) + + geom_density_ridges(size = 0.1,stat = "binline", bins = 100, scale = 0.7, draw_baseline = FALSE, aes(height =..ndensity..)) + + geom_point(size =1, position = position_nudge(y=-0.09), shape = "|", aes(color = factor(grp, levels = rev(vars)))) + + coord_cartesian(xlim = c(1.2, NA), clip = "off") + + theme_classic() + + theme(axis.title.y=element_blank()) + + xlab("Percent Variance Explained") + + scale_y_discrete(expand = c(0.1, 0)) + + scale_fill_manual(values = var_colors, name = "Variable") + + scale_color_manual(values = var_colors, name = "Variable") + + geom_vline(xintercept = 1, lty="11", color = "grey50", size = 0.5) + +ggsave(pRaincloud_interaction, filename = paste0(outdir, "variance_explained_raincloud_interaction.png"), height = 2, width = 4) +ggsave(pRaincloud_interaction, filename = paste0(outdir, "variance_explained_raincloud_interaction.pdf"), height = 2, width = 4) + + + +icc_interaction_sig_list <- list() + +for (ensg in unique(icc_interaction_plus_dt$gene)){ + icc_interaction_sig_list[[ensg]] <- icc_interaction_plus_dt[gene == ensg][P < 0.05/(nrow(icc_interaction_plus_dt[gene == ensg])-1)] + icc_interaction_sig_list[[ensg]] <- rbind(icc_interaction_sig_list[[ensg]], icc_interaction_plus_dt[gene == ensg & grp == "Residual"]) +} + +icc_interaction_sig_dt <- do.call(rbind, icc_interaction_sig_list) + + + +group_size <- data.table(table(icc_interaction_sig_dt$grp)) +colnames(group_size) <- c("grp", "size") +group_size$grp_size <- paste0(group_size$grp, "\nN = ", group_size$size) + +icc_interaction_sig_dt <- group_size[icc_interaction_sig_dt, on = "grp"] +icc_interaction_sig_dt$grp_size <- factor(icc_interaction_sig_dt$grp_size, levels = unique(group_size$grp_size)) + +grp_size_order <- c("Line\nN = 750", "Passage\nN = 750", "Line:Passage\nN = 619", "Residual\nN = 750") + + + +# pRaincloud_interaction_sig <- ggplot(icc_interaction_sig_dt, aes(x = percent, y = factor(grp_size, levels = grp_size_order), fill = factor(grp, levels = rev(vars)))) + +# geom_density_ridges(stat = "binline", bins = 100, scale = 0.7, draw_baseline = FALSE, aes(height =..ndensity..), alpha = 0.75) + +# geom_boxplot(size = 0.5,width = .15, outlier.size = 0.15, position = position_nudge(y=-0.12), alpha = 0.75) + +# coord_cartesian(xlim = c(1.2, NA), clip = "off") + +# theme_classic() + +# theme(axis.title.y=element_blank()) + +# xlab("Percent Variance Explained") + +# scale_y_discrete(expand = c(0.03, 0)) + +# scale_fill_manual(values = var_colors) + +# geom_vline(xintercept = 1, lty="11", color = "grey50", size = 0.5) + +pRaincloud_interaction_sig <- ggplot(icc_interaction_sig_dt, aes(x = percent, y = factor(grp_size, levels = grp_size_order), fill = factor(grp, levels = rev(vars)))) + + geom_density_ridges(size = 0.1,stat = "binline", bins = 100, scale = 0.7, draw_baseline = FALSE, aes(height =..ndensity..)) + + geom_point(size =1, position = position_nudge(y=-0.09), shape = "|", aes(color = factor(grp, levels = rev(vars)))) + + coord_cartesian(xlim = c(1.2, NA), clip = "off") + + theme_classic() + + theme(axis.title.y=element_blank()) + + xlab("Percent Variance Explained") + + scale_y_discrete(expand = c(0.07, 0)) + + scale_fill_manual(values = var_colors, name = "Variable") + + scale_color_manual(values = var_colors, name = "Variable") + + geom_vline(xintercept = 1, lty="11", color = "grey50", size = 0.5) + +ggsave(pRaincloud_interaction_sig, filename = paste0(outdir, "variance_explained_raincloud_interaction_significant.png"), height = 2, width = 4) +ggsave(pRaincloud_interaction_sig, filename = paste0(outdir, "variance_explained_raincloud_interaction_significant.pdf"), height = 2, width = 4) + + +total <- icc_interaction_sig_dt[,.(count = .N), by = .(grp)] +total_less1pct <- icc_interaction_sig_dt[percent <= 1][,.(count_less_1pct = .N), by = .(grp)] +total_less5pct <- icc_interaction_sig_dt[percent <= 5][,.(count_less_5pct = .N), by = .(grp)] +total_less10pct <- icc_interaction_sig_dt[percent <= 10][,.(count_less_10pct = .N), by = .(grp)] +summary <- total[total_less1pct, on = "grp"] +summary <- summary[total_less5pct, on = "grp"] +summary <- summary[total_less10pct, on = "grp"] +summary$count_greater_1pct <- summary$count - summary$count_less_1pct +summary$count_greater_5pct <- summary$count - summary$count_less_5pct +summary$count_greater_10pct <- summary$count - summary$count_less_10pct +summary$percent_1pct <- (summary$count_less_1pct/summary$count)*100 +summary$percent_5pct <- (summary$count_less_5pct/summary$count)*100 +summary$percent_10pct <- (summary$count_less_10pct/summary$count)*100 + + +# pRaincloud_interaction_sig_1pct <- ggplot(icc_interaction_sig_dt[percent >= 1], aes(x = percent, y = factor(grp_size, levels = grp_size_order), fill = factor(grp, levels = rev(vars)))) + +# geom_density_ridges(stat = "binline", bins = 90, scale = 0.7, draw_baseline = FALSE, aes(height =..ndensity..), alpha = 0.75) + +# geom_boxplot(size = 0.5,width = .15, outlier.size = 0.25, position = position_nudge(y=-0.12), alpha = 0.75) + +# coord_cartesian(xlim = c(1.2, NA), clip = "off") + +# theme_classic() + +# theme(axis.title.y=element_blank()) + +# xlab("Percent Variance Explained") + +# scale_y_discrete(expand = c(0.03, 0)) + +# scale_fill_manual(values = var_colors) + +# geom_vline(xintercept = 1, linetype = "dashed", color = "firebrick3") + +pRaincloud_interaction_sig_1pct <- ggplot(icc_interaction_sig_dt[percent >= 1], aes(x = percent, y = factor(grp_size, levels = grp_size_order), fill = factor(grp, levels = rev(vars)))) + + geom_density_ridges(size = 0.1,stat = "binline", bins = 100, scale = 0.7, draw_baseline = FALSE, aes(height =..ndensity..)) + + geom_point(size =1, position = position_nudge(y=-0.09), shape = "|", aes(color = factor(grp, levels = rev(vars)))) + + coord_cartesian(xlim = c(1.2, NA), clip = "off") + + theme_classic() + + theme(axis.title.y=element_blank()) + + xlab("Percent Variance Explained") + + scale_y_discrete(expand = c(0.07, 0)) + + scale_fill_manual(values = var_colors, name = "Variable") + + scale_color_manual(values = var_colors, name = "Variable") + + geom_vline(xintercept = 1, lty="11", color = "grey50", size = 0.5) + +ggsave(pRaincloud_interaction_sig_1pct, filename = paste0(outdir, "variance_explained_raincloud_interaction_significant_1pct.png"), height = 2, width = 4) +ggsave(pRaincloud_interaction_sig_1pct, filename = paste0(outdir, "variance_explained_raincloud_interaction_significant_1pct.pdf"), height = 2, width = 4) + + + + +##### Pull just the significant variances genome-wide ##### +icc_interaction_plus_dt$fdr <- p.adjust(icc_interaction_plus_dt$P, method="fdr") +icc_interaction_sig_gw_dt <- icc_interaction_plus_dt[fdr < 0.05 | is.na(fdr)] + +group_size_sig_gw <- data.table(table(icc_interaction_sig_gw_dt$grp)) +colnames(group_size_sig_gw) <- c("grp", "size") +group_size_sig_gw$grp_size <- paste0(group_size_sig_gw$grp, "\nN = ", formatC(group_size_sig_gw$size, format="d", big.mark=",")) + +icc_interaction_sig_gw_dt <- group_size_sig_gw[icc_interaction_sig_gw_dt, on = "grp"] +icc_interaction_sig_gw_dt$grp_size <- factor(icc_interaction_sig_gw_dt$grp_size, levels = unique(group_size_sig_gw$grp_size)) + + +group_size_sig_gw_order <- c("Line\nN = 750", "Passage\nN = 750", "Line:Passage\nN = 619", "Residual\nN = 750") + + +pRaincloud_interaction_sig_gw <- ggplot(icc_interaction_sig_gw_dt, aes(x = percent, y = factor(grp_size, levels = group_size_sig_gw_order), fill = factor(grp, levels = rev(vars)))) + + geom_density_ridges(stat = "binline", bins = 100, scale = 0.7, draw_baseline = FALSE, aes(height =..ndensity..), alpha = 0.75) + + geom_boxplot(size = 0.5,width = .15, outlier.size = 0.25, position = position_nudge(y=-0.12), alpha = 0.75) + + coord_cartesian(xlim = c(1.2, NA), clip = "off") + + theme_classic() + + theme(axis.title.y=element_blank()) + + xlab("Percent Variance Explained") + + scale_y_discrete(expand = c(0.03, 0)) + + scale_fill_manual(values = var_colors) + + geom_vline(xintercept = 1, linetype = "dashed", color = "firebrick3") + + labs(fill="Covariate") + +ggsave(pRaincloud_interaction_sig_gw, filename = paste0(outdir, "variance_explained_raincloud_interaction_significant_genome_wide.png"), height = 8, width = 7) +ggsave(pRaincloud_interaction_sig_gw, filename = paste0(outdir, "variance_explained_raincloud_interaction_significant_genome_wide.pdf"), height = 8, width = 7) + + + + + + + + + + + + + + + + + +icc_interaction_sig_dt[gene == "ENSG00000106153"] +icc_dt[gene == "ENSG00000106153"] +icc_interaction_plus_dt[gene == "ENSG00000106153"] +icc_interaction_sig_gw_dt[gene == "ENSG00000106153"] +icc_interaction_sig_gw_dt2[gene == "ENSG00000106153"] + + + + + + + +##### Add gene IDs for easy identification downstream ##### +GeneConversion1 <- read_delim("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/data/Expression_200128_A00152_0196_BH3HNFDSXY/GE/DRENEA_1/outs/filtered_feature_bc_matrix/features.tsv.gz", col_names = F, delim = "\t") +GeneConversion2 <- read_delim("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/data/Expression_200128_A00152_0196_BH3HNFDSXY/GE/Village_B_1_week/outs/filtered_feature_bc_matrix/features.tsv.gz", col_names = F, delim = "\t") + +GeneConversion <- unique(rbind(GeneConversion1, GeneConversion2)) +GeneConversion <- GeneConversion[!duplicated(GeneConversion$X1),] +GeneConversion$X3 <- NULL +colnames(GeneConversion) <- c("gene", "Gene_ID") + +GeneConversion <- data.table(GeneConversion) + + +### Add the gene IDs to the icc_dt ### + +icc_interaction_sig_gw_dt <- GeneConversion[icc_interaction_sig_gw_dt, on = "gene"] + +icc_interaction_sig_gw_dt[grp == "Cryopreserved"& percent_round > 1][rev(order(percent))]$Gene_ID +head(icc_interaction_sig_gw_dt[grp == "Cryopreserved" & percent_round > 1][rev(order(percent))][,c("Gene_ID", "percent_round")], n = 50) +icc_interaction_sig_gw_dt[grp == "Line"][rev(order(percent))]$Gene_ID +head(icc_interaction_sig_gw_dt[grp == "Line" & percent_round > 1][rev(order(percent))][,c("Gene_ID", "percent_round")], n = 50) +icc_interaction_sig_gw_dt[grp == "Village"][rev(order(percent))]$Gene_ID +head(icc_interaction_sig_gw_dt[grp == "Village" & percent_round > 1][rev(order(percent))][,c("Gene_ID", "percent_round")], n = 50) + +fwrite(icc_interaction_sig_gw_dt, paste0(outdir, "sig_results.tsv.gz"), sep = "\t", compress = "gzip") + +## Highlight +## X chromosome genes - wouldn't expect these to be Line-biased because expressed by both males and females +## Y chromosome genes - should be line-biased because expressed by only males and have some male(s) and some female(s) +## mt genes +## ribosomal genes +## look at gsea and kegg pathways for each + +## Read in gtf used as reference and pull just X, Y or MT chromosome genes from it, use ribosomal file for rb genes +gtf <- fread("/directflow/GWCCGPipeline/projects/reference/refdata-cellranger-GRCh38-3.0.0/genes/genes.gtf", sep = "\t", autostart = 6, header = FALSE) + +gtf_genes <- gtf[!(grep("transcript_id", V9))] + +gtf_genes$V9 <- gsub("gene_id \"", "",gtf_genes$V9 ) %>% + gsub("\"; gene_version \"", ";", .) %>% + gsub("\"; gene_name \"", ";", .) %>% + gsub("\"; gene_source \"", ";", .) %>% + gsub("\"; gene_biotype \"", ";", .) %>% + gsub("\"", "", .) + +gtf_genes[, c("gene_id", "gene_version", "gene_name", "gene_source", "gene_biotype") := data.table(str_split_fixed(V9,";", 5))] + +icc_interaction_plus_dt <- GeneConversion[icc_interaction_plus_dt, on = "gene"] + + + +X_chromosome_genes <- gtf_genes[V1 == "X"] +X_genelist <- X_chromosome_genes$gene_id[X_chromosome_genes$gene_id %in% genes] +Y_chromosome_genes <- gtf_genes[V1 == "Y"] +Y_genelist <- Y_chromosome_genes$gene_id[Y_chromosome_genes$gene_id %in% genes] +MT_chromosome_genes <- gtf_genes[V1 == "MT"] +MT_genelist <- MT_chromosome_genes$gene_id[MT_chromosome_genes$gene_id %in% genes] +RbGeneList <- read.delim(file = "/directflow/SCCGGroupShare/projects/DrewNeavin/References/RibosomalGeneList_GeneID_ENSG.txt") +Rb_genelist <- RbGeneList$ENSG[RbGeneList$ENSG %in% genes] + + +### Make stacked bar plots of the variance explained by different factors for these gene groups + + + + +## Figure of x chromosome genes ## +icc_x <- icc_interaction_plus_dt[data.table(gene = icc_interaction_plus_dt[grp == "Residual"][gene %in% X_genelist][order(percent_round)]$gene), on = "gene"] +icc_x$grp <- factor(icc_x$grp, levels = rev(selected_vars)) +icc_x$gene <- factor(icc_x$gene, levels = unique(icc_x$gene)) + +bar_proportions_x <- ggplot(icc_x, aes(x = gene, y = percent, fill = grp)) + + geom_bar(position="stack", stat="identity", alpha = 0.75) + + theme_classic() + + theme(axis.title.x=element_blank(), + axis.text.x=element_blank(), + axis.ticks.x=element_blank(), + plot.title = element_text(hjust = 0.5)) + + scale_fill_manual(values = var_colors) + + ggtitle("Variance Explained of\nX Chromosome Genes") + + +ggsave(bar_proportions_x, filename = paste0(outdir, "variance_explained_bar_x_genes.png"), width = 20) + + +## Figure of y chromosome genes ## +icc_y <- icc_interaction_plus_dt[data.table(gene = icc_interaction_plus_dt[grp == "Residual"][gene %in% Y_genelist][order(percent_round)]$gene), on = "gene"] +icc_y$grp <- factor(icc_y$grp, levels = rev(selected_vars)) +icc_y$gene <- factor(icc_y$gene, levels = unique(icc_y$gene)) +icc_y$Gene_ID <- factor(icc_y$Gene_ID, levels = unique(icc_y$Gene_ID)) + + +bar_proportions_y <- ggplot(icc_y, aes(x = Gene_ID, y = percent, fill = grp)) + + geom_bar(position="stack", stat="identity", alpha = 0.75) + + theme_classic() + + theme(axis.title.x=element_blank(), + # axis.text.x=element_blank(), + # axis.ticks.x=element_blank(), + plot.title = element_text(hjust = 0.5), + axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + + scale_fill_manual(values = var_colors) + + scale_y_continuous(expand = c(0, 0)) + + ggtitle("Variance Explained of\nY Chromosome Genes") + + ylab("Percent") + + +ggsave(bar_proportions_y, filename = paste0(outdir, "variance_explained_bar_y_genes.png"), width = 4.5, height = 4.5) +ggsave(bar_proportions_y, filename = paste0(outdir, "variance_explained_bar_y_genes.pdf"), width = 4.5, height = 4.5) + + + +## Figure of mt chromosome genes ## +icc_mt <- icc_interaction_plus_dt[data.table(gene = icc_interaction_plus_dt[grp == "Residual"][gene %in% MT_genelist][order(percent_round)]$gene), on = "gene"] +icc_mt$grp <- factor(icc_mt$grp, levels = rev(selected_vars)) +icc_mt$gene <- factor(icc_mt$gene, levels = unique(icc_mt$gene)) +icc_mt$Gene_ID <- factor(icc_mt$Gene_ID, levels = unique(icc_mt$Gene_ID)) + + +bar_proportions_mt <- ggplot(icc_mt, aes(x = Gene_ID, y = percent, fill = grp)) + + geom_bar(position="stack", stat="identity", alpha = 0.75) + + theme_classic() + + theme(axis.title.x=element_blank(), + # axis.text.x=element_blank(), + # axis.ticks.x=element_blank(), + plot.title = element_text(hjust = 0.5), + axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + + scale_fill_manual(values = var_colors) + + scale_y_continuous(expand = c(0, 0)) + + ggtitle("Variance Explained of\nMitochondrial Genes") + + ylab("Percent") + +ggsave(bar_proportions_mt, filename = paste0(outdir, "variance_explained_bar_mt_genes.png"), width = 4.5, height = 4.5) +ggsave(bar_proportions_mt, filename = paste0(outdir, "variance_explained_bar_mt_genes.pdf"), width = 4.5, height = 4.5) + + + +## Figure of mt chromosome genes ## +icc_rb <- icc_interaction_plus_dt[data.table(gene = icc_interaction_plus_dt[grp == "Residual"][gene %in% Rb_genelist][order(percent_round)]$gene), on = "gene"] +icc_rb$grp <- factor(icc_rb$grp, levels = rev(selected_vars)) +icc_rb$gene <- factor(icc_rb$gene, levels = unique(icc_rb$gene)) + + +bar_proportions_rb <- ggplot(icc_rb, aes(x = gene, y = percent, fill = grp)) + + geom_bar(position="stack", stat="identity", alpha = 0.75) + + theme_classic() + + theme(axis.title.x=element_blank(), + axis.text.x=element_blank(), + axis.ticks.x=element_blank(), + plot.title = element_text(hjust = 0.5)) + + scale_fill_manual(values = var_colors) + + scale_y_continuous(expand = c(0, 0)) + + ggtitle("Variance Explained\nof Ribosomal Genes") + + +ggsave(bar_proportions_rb, filename = paste0(outdir, "variance_explained_bar_rb_genes.png"), width = 10, height = 4) +ggsave(bar_proportions_rb, filename = paste0(outdir, "variance_explained_bar_rb_genes.pdf"), width = 10, height = 4) + + +### Plot Pluripotency Genes ### +pluri_genes <- fread(paste0(dir,"data/pluripotency_genes.tsv"), sep = "\t", col.names = "Gene_ID", header = FALSE) + + +pluri_genes <- GeneConversion[pluri_genes, on = "Gene_ID"] + + +icc_dt_pluri_genes <- icc_interaction_plus_dt[pluri_genes,on = c("gene")] +icc_dt_pluri_genes$grp <- factor(icc_dt_pluri_genes$grp, levels = rev(selected_vars)) +icc_dt_pluri_genes <- icc_dt_pluri_genes[data.table(gene = icc_dt_pluri_genes[grp == "Residual"][order(percent)]$gene), on = "gene"] +icc_dt_pluri_genes$Gene_ID <- factor(icc_dt_pluri_genes$Gene_ID, levels = unique(icc_dt_pluri_genes$Gene_ID)) + + + +pPluri_Genes_Cont <- ggplot() + + geom_bar(data = icc_dt_pluri_genes, aes(Gene_ID, percent, fill = grp), position = "stack", stat = "identity", alpha = 0.75) + + theme_classic() + + # facet_wrap(Gene_ID ~ ., nrow = 3) + + scale_fill_manual(values = var_colors) + + theme(plot.title = element_text(hjust = 0.5), + axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + + ylab("Percent Gene Expression Variance Explained") + + ggtitle("Variance Explained of\nPluripotency Genes") + + theme(axis.title.x=element_blank()) + +ggsave(pPluri_Genes_Cont, filename = paste0(outdir, "Pluripotent_Gene_Variable_Contributions.png"), width = 6, height = 4.5) +ggsave(pPluri_Genes_Cont, filename = paste0(outdir, "Pluripotent_Gene_Variable_Contributions.pdf"), width = 6, height = 4.5) + + + +##### check for variance explained for eQTL genes (from Kilpinen et al) that are ##### +eqtls <- fread("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/eQTL_check/KilpinenOverlap/gene_snp_list.tsv", sep = "\t") + + +eqtls_icc <- icc_interaction_plus_dt[unique(eqtls[,"gene"]), on = "gene"] +eqtls_icc$grp <- factor(eqtls_icc$grp, levels = rev(vars)) +eqtls_icc <- eqtls_icc[data.table(gene = eqtls_icc[grp == "Residual"][order(percent)]$gene), on = "gene"] +eqtls_icc$Gene_ID <- factor(eqtls_icc$Gene_ID, levels = unique(eqtls_icc$Gene_ID)) + + +group_size_eqtl <- data.table(table(eqtls_icc$grp)) +colnames(group_size_eqtl) <- c("grp", "size") +group_size_eqtl$grp_size <- paste0(group_size_eqtl$grp, "\nN = ", group_size_eqtl$size) + +eqtls_icc <- group_size_eqtl[eqtls_icc, on = "grp"] +grp_size_order_eqtl <- c("Line\nN = 2371", "Village\nN = 1836", "Site\nN = 2461", "Replicate\nN = 869", "Line:Village\nN = 367", "Line:Site\nN = 911", "Village:Site\nN = 898","Replicate:Village\nN = 305", "Replicate:Line\nN = 25", "Replicate:Site\nN = 78", "Residual\nN = 2542") +eqtls_icc$grp_size <- factor(eqtls_icc$grp_size, levels = grp_size_order_eqtl) + + + +eqtls_icc_1pct <- eqtls_icc + + +for (ensg in unique(eqtls_icc$gene)){ + if (!any(eqtls_icc[gene == ensg & grp != "Residual"]$percent > 1)){ + eqtls_icc_1pct <- eqtls_icc_1pct[gene != ensg] + } +} + + + +eqtls_icc_1pct_grouped_list <- list() + + +for (ensg in unique(eqtls_icc_1pct$gene)){ + group <- eqtls_icc_1pct[gene == ensg & grp != "Residual"][which.max(percent)]$grp + eqtls_icc_1pct_grouped_list[[group]][[ensg]] <- eqtls_icc_1pct[gene == ensg] +} + +eqtls_icc_1pct_grouped <- lapply(eqtls_icc_1pct_grouped_list, function(x) do.call(rbind, x)) +eqtls_icc_1pct_grouped <- lapply(names(eqtls_icc_1pct_grouped), function(x){ + eqtls_icc_1pct_grouped[[x]]$largest_contributor <- x + return(eqtls_icc_1pct_grouped[[x]]) +}) + +eqtls_icc_1pct_grouped_dt <- do.call(rbind, eqtls_icc_1pct_grouped) + +eqtls_icc_1pct_grouped_dt$largest_contributor <- factor(eqtls_icc_1pct_grouped_dt$largest_contributor, levels = c("Line", "Village", "Site", "Line:Village", "Line:Site", "Village:Site", "Replicate:Village")) + + + +pPluri_Genes_largest_Cont_eqtl <- ggplot() + + geom_bar(data = eqtls_icc_1pct_grouped_dt, aes(Gene_ID, percent, fill = factor(grp, levels = rev(vars))), position = "stack", stat = "identity", alpha = 0.75) + + theme_classic() + + facet_grid(. ~ largest_contributor, scales = "free_x", space = "free_x") + + scale_fill_manual(values = var_colors) + + ylab("Percent Gene Expression Variance Explained") + + theme(axis.title.x=element_blank(), + axis.text.x = element_blank(), + panel.spacing.x=unit(0, "lines"), + axis.ticks.x = element_blank()) + + geom_hline(yintercept = 1, linetype = "dashed") + # scale_y_discrete(expand = c(0.03, 0)) + # scale_x_discrete(expand = c(0.03, 0)) + + +ggsave(pPluri_Genes_largest_Cont_eqtl, filename = paste0(outdir, "eQTL_Genes_Variance_Contributions_1pct_largest_cont.png"), width = 10, height = 4) +ggsave(pPluri_Genes_largest_Cont_eqtl, filename = paste0(outdir, "eQTL_Genes_Variance_Contributions_1pct_largest_cont.pdf"), width = 10, height = 4) + + + + + + + +# ### Count number of each chromosome/gene category type +# ### numbers are the numbers of that category that where a significant percent of variance is explained by this variable +# ### percent of that category that where a significant percent of variance is explained by this variable + +# x_number <- lapply(genes_list, function(x){ +# length(which(x %in% X_chromosome_genes$gene_id)) +# }) + +# x_percent <- lapply(genes_list, function(x){ +# length(which(x %in% X_chromosome_genes$gene_id))/length(X_chromosome_genes$gene_id) +# }) + +# y_number <- lapply(genes_list, function(x){ +# length(which(x %in% Y_chromosome_genes$gene_id)) +# }) + +# y_percent <- lapply(genes_list, function(x){ +# length(which(x %in% Y_chromosome_genes$gene_id))/length(Y_chromosome_genes$gene_id) +# }) + +# mt_number <- lapply(genes_list, function(x){ +# length(which(x %in% MT_chromosome_genes$gene_id)) +# }) + +# mt_percent <- lapply(genes_list, function(x){ +# length(which(x %in% MT_chromosome_genes$gene_id))/length(MT_chromosome_genes$gene_id) +# }) + +# rb_number <- lapply(genes_list, function(x){ +# length(which(x %in% RbGeneList$ENSG)) +# }) + +# rb_percent <- lapply(genes_list, function(x){ +# length(which(x %in% RbGeneList$ENSG))/length(RbGeneList$ENSG) +# }) + + + +# ### for > 1% var explained +# ### Count number of each chromosome/gene category type +# x_number_1 <- lapply(vars, function(group){ +# genes <- icc_dt[grp == group][rev(order(percent_round))][percent > 1]$gene +# length(which(genes %in% X_chromosome_genes$gene_id)) +# }) +# names(x_number_1) <- vars + +# x_percent_1 <- lapply(vars, function(group){ +# genes <- icc_dt[grp == group][rev(order(percent_round))][percent > 1]$gene +# length(which(genes %in% X_chromosome_genes$gene_id))/length(X_chromosome_genes$gene_id) +# }) +# names(x_percent_1) <- vars + +# y_number_1 <- lapply(vars, function(group){ +# genes <- icc_dt[grp == group][rev(order(percent_round))][percent > 1]$gene +# length(which(genes %in% Y_chromosome_genes$gene_id)) +# }) +# names(y_number_1) <- vars + +# y_percent_1 <- lapply(vars, function(group){ +# genes <- icc_dt[grp == group][rev(order(percent_round))][percent > 1]$gene +# length(which(genes %in% Y_chromosome_genes$gene_id))/length(Y_chromosome_genes$gene_id) +# }) +# names(y_percent_1) <- vars + +# mt_number_1 <- lapply(vars, function(group){ +# genes <- icc_dt[grp == group][rev(order(percent_round))][percent > 1]$gene +# length(which(genes %in% MT_chromosome_genes$gene_id)) +# }) +# names(mt_number_1) <- vars + +# mt_percent_1 <- lapply(vars, function(group){ +# genes <- icc_dt[grp == group][rev(order(percent_round))][percent > 1]$gene +# length(which(genes %in% MT_chromosome_genes$gene_id))/length( MT_chromosome_genes$gene_id) +# }) +# names(mt_percent_1) <- vars + +# rb_number_1 <- lapply(vars, function(group){ +# genes <- icc_dt[grp == group][rev(order(percent_round))][percent > 1]$gene +# length(which(genes %in% RbGeneList$ENSG)) +# }) +# names(rb_number_1) <- vars + +# rb_percent_1 <- lapply(vars, function(group){ +# genes <- icc_dt[grp == group][rev(order(percent_round))][percent > 1]$gene +# length(which(genes %in% RbGeneList$ENSG))/length(RbGeneList$ENSG) +# }) +# names(rb_percent_1) <- vars + + +### Pathway analysis ### +geneList <- lapply(genes_list, function(x){ + tmp <- bitr(x, fromType = "ENSEMBL", + toType = c("ENTREZID"), + OrgDb = org.Hs.eg.db)$ENTREZID + tmp[!is.na(tmp)] +}) + + +gg <- list() +kk <- list() + + + +df = as.data.frame(org.Hs.egGO) +go_gene_list = unique(sort(df$gene_id)) + +dfk = as.data.frame(org.Hs.egPATH) +kegg_gene_list = unique(sort(dfk$gene_id)) + + + +for (group in c("Line", "Village", "Cryopreserved", "Replicate","Line:Village", "Line:Cryopreserved", "Village:Cryopreserved", "Replicate:Village", "Replicate:Line", "Replicate:Cryopreserved", "Residual")){ + kk[[group]] <- enrichKEGG(gene = geneList[[group]], + universe = geneList[[group]], + organism = 'hsa', + pvalueCutoff = 0.05, + keyType = 'ncbi-geneid') + + gg[[group]] <- groupGO(gene = geneList[[group]], + OrgDb = org.Hs.eg.db, + readable = TRUE) + +} + + +hsGO <- godata('org.Hs.eg.db', ont="MF") + + +sim_results <- list() + +vars <- c("Line", "Village", "Cryopreserved", "Replicate","Village:Line", "Line:Cryopreserved", "Village:Cryopreserved", "Replicate:Village", "Replicate:Line", "Replicate:Cryopreserved") + +for (group1 in vars){ + print(group1) + for (group2 in vars[(grep(paste0("^",group1, "$"), vars) + 1): length(vars)]){ + print(group2) + sim_results[[group1]][[group2]] <- clusterSim(geneList[[group1]], geneList[[group2]], semData=hsGO, measure="Wang", combine="BMA") + } +} + + + + + +# genes2rerun <- c(character()) + +# for (g in unique(icc_dt$gene)){ +# print(g) +# genes2rerun <- c(genes2rerun, as.character(unique(icc_dt[gene == g][P > 0.05/(nrow(icc_dt[gene == g])-1)]$gene))) +# } + + +# for (gene in genes2rerun){ +# print(gene) +# # unlink(paste0(icc_dir,gene,"_icc.rds")) +# print(file.exists(paste0(icc_dir,gene,"_icc.rds"))) +# unlink(paste0("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/fit_models/",gene,"_fitted_models.rds")) +# unlink(paste0("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/residuals4qtl/",gene,"_residuals4qtl.rds")) +# } + + diff --git a/multi-passage/Variance/variance_partition_multipassage_integratedSCT.R b/multi-passage/Variance/variance_partition_multipassage_integratedSCT.R new file mode 100644 index 0000000..c7ddd7b --- /dev/null +++ b/multi-passage/Variance/variance_partition_multipassage_integratedSCT.R @@ -0,0 +1,357 @@ +library(haven) +library(ggplot2) +library(glmmTMB) +library(Seurat) +library(tidyverse) +library(specr) +library(data.table) +library(dsLib) +library(pkgcond) +library(texreg) + + +inicio("Starting Analysis") + + +##### Define functions ##### +icc_glmmtmb <- function(model, percent = TRUE) { + tmp <- VarCorr(model) + var <- do.call(rbind, lapply(names(tmp$cond), function(x) data.table("grp" = x, "vcov" = attr(tmp$cond[[x]], "stddev")^2))) + var <- rbind(var, data.table("grp" = "Residual", "vcov" = sigma(model)^2)) + sum_var <- sum(var$vcov) + var <- var %>% dplyr::mutate(icc = vcov/sum_var) + if (isTRUE(percent)) { + var <- var %>% dplyr::mutate(percent = .data$icc * 100) + } + return(var) +} + + + +##### Bring in variables ##### +### Bring in arguments +args <- commandArgs(trailingOnly = TRUE) +icc_action_outdir <- paste0(args[1]) +icc_outdir <- paste0(args[2]) +model_action_outdir <- paste0(args[3]) +model_outdir <- paste0(args[4]) +resid_outdir <- paste0(args[5]) +gene <- as.character(args[6]) + +print(icc_outdir) +print(icc_outdir) +print(model_outdir) +print(resid_outdir) +print(gene) + + + +##### Read in seurat with genes ##### +seurat <- readRDS("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/preQC/time-integrated_filtered_seurat_1pct_expressing.rds") + + + +### Make DF for modeling ### +df_hier_unscale <- data.frame("Expression" = seurat[["SCT"]]@scale.data[gene,], "Line" = seurat@meta.data$Assignment, "Time" = as.factor(seurat@meta.data$Pool)) +colnames(df_hier_unscale)[1] <- "Expression" + + + +##### Leave one out method ##### +variables <- c("Line", "Time") + +model_all <- as.formula(paste0("Expression ~ (1|", paste0(variables, collapse = ") + (1|"), ")")) + + +boolFalse<-F +while(boolFalse==F & length(variables) > 0){ + tryCatch({ + print(variables) + model_glmmtmb <- suppress_warnings(glmmTMB(formula = noquote(model_all), data = df_hier_unscale, REML = TRUE), "giveCsparse") + boolFalse<-T + },error=function(e){ + if (length(variables) > 1){ + variables <- variables[1:(length(variables) -1)] + } else { + variables <- c() + } + }) +} + + +if (!length(variables) == 0){ + + + ### Deal with singular fits by removing last variable until a fit can be found - ordered in variables buy importance + while (!model_glmmtmb$sdr$pdHess & length(variables) > 0 ){ + print("Singular fit: removing last variable and rerunning with one less covariate.") + if (length(variables) > 1){ + variables <- variables[1:(length(variables) -1)] + print(variables) + model_all <- as.formula(paste0("Expression ~ (1|", paste0(variables, collapse = ") + (1|"), ")")) + model_glmmtmb <- suppress_warnings(glmmTMB(formula = noquote(model_all), data = df_hier_unscale, REML = TRUE), "giveCsparse") + } else { + variables <- c() + } + } + + print(variables) + + if (length(variables) > 0){ + + model_loo <- list() + + icc <- data.table(grp = variables, P = as.numeric(NA)) + + for (variable in variables){ + print(variable) + if (length(variables) > 1){ + model <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")")) + } else { + model <- as.formula(paste0("Expression ~ 1")) + } + model_loo[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse") + icc[grp == variable]$P <- anova(model_loo[[variable]], model_glmmtmb)$`Pr(>Chisq)`[2] + } + + + if (!(any(icc[grp != "Residual"]$P > 0.05/length(variables)) | any(is.na(icc[grp != "Residual"]$P)))){ + model_loo_updated <- model_loo + + updated_model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(variables, collapse = ") + (1|"), ")")) + + model_loo_updated[["all"]] <- suppress_warnings(glmmTMB(formula = noquote(updated_model), data = df_hier_unscale, REML = TRUE), "giveCsparse") + + ### Calculate the variance explained by each of the included variables ### + icc <- icc_glmmtmb(model_loo_updated[["all"]]) + + + ### Recalculate significance ### + icc$P <- as.numeric(NA) + icc$gene <- gene + + for (variable in variables){ + print(variable) + if (length(variables) > 1){ + model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")")) + } else { + model <- as.formula(paste0("Expression ~ 1")) + } + model_loo_updated[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse") + icc[grp == variable]$P <- anova(model_loo_updated[[variable]], model_loo_updated[["all"]])$`Pr(>Chisq)`[2] + } + } + + + while((any(icc[grp != "Residual"]$P > 0.05/length(variables)) | any(is.na(icc[grp != "Residual"]$P)))){ + + print("Removing non-significant vartiables and retesting signficance") + + ##### Identify variables to keep ##### + variables <- icc[P < 0.05/length(variables)]$grp + + if (length(variables) > 0){ + + ##### Calculate full model ##### + updated_model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(variables, collapse = ") + (1|"), ")")) + + + model_loo_updated <- list() + model_loo_updated[["all"]] <- suppress_warnings(glmmTMB(formula = noquote(updated_model), data = df_hier_unscale, REML = TRUE), "giveCsparse") + + + + ### Calculate the variance explained by each of the included variables ### + icc <- icc_glmmtmb(model_loo_updated[["all"]]) + + + + ### Recalfulate significance ### + icc$P <- as.numeric(NA) + icc$gene <- gene + + for (variable in variables){ + print(variable) + if (length(variables) > 1){ + model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")")) + } else { + model <- as.formula(paste0("Expression ~ 1")) + } + model_loo_updated[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse") + icc[grp == variable]$P <- anova(model_loo_updated[[variable]], model_loo_updated[["all"]])$`Pr(>Chisq)`[2] + } + + + + } else { + icc <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character()) + model_loo_updated <- list() + } + } + + action_variables <- c() + + if (length(variables) > 1){ + ### Add in actions of the significant variables + if ("Line" %in% variables & "Time" %in% variables){ + action_variables <- c(action_variables, "Line:Time") + } + + model_all_action <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, action_variables), collapse = ") + (1|"), ")")) + + + boolFalse<-F + while(boolFalse==F & length(action_variables) > 0){ + tryCatch({ + print(c(variables, action_variables)) + model_glmmtmb_action <- suppress_warnings(glmmTMB(formula = noquote(model_all_action), data = df_hier_unscale, REML = TRUE), "giveCsparse") + boolFalse<-T + },error=function(e){ + if (length(action_variables) > 1){ + action_variables <- action_variables[1:(length(action_variables) -1)] + } else { + action_variables <- c() + } + }) + } + + ### Deal with singular fits by removing last variable until a fit can be found - ordered in variables buy importance + while (!model_glmmtmb_action$sdr$pdHess & length(action_variables) > 0 ){ + print("Singular fit: removing last variable and rerunning with one less covariate.") + if (length(action_variables) > 1){ + action_variables <- action_variables[1:(length(action_variables) -1)] + print(c(action_variables, variables)) + model_all_action <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, action_variables), collapse = ") + (1|"), ")")) + model_glmmtmb_action <- suppress_warnings(glmmTMB(formula = noquote(model_all_action), data = df_hier_unscale, REML = TRUE), "giveCsparse") + } else { + action_variables <- c() + } + } + + if (length(action_variables) > 0){ + + model_loo_action <- list() + + icc_action <- data.table(grp = action_variables, P = as.numeric(NA)) + + for (variable in c(action_variables)){ + print(variable) + if (length(action_variables) > 1){ + model_action <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, action_variables)[!c(variables, action_variables) %in% variable], collapse = ") + (1|"), ")")) + } else { + model_action <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")")) + } + model_loo_action[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model_action), data = df_hier_unscale, REML = TRUE), "giveCsparse") + icc_action[grp == variable]$P <- anova(model_loo_action[[variable]], model_glmmtmb_action)$`Pr(>Chisq)`[2] + } + + + if (!(any(icc_action[grp != "Residual"]$P > 0.05/length(c(variables, action_variables))) | any(is.na(icc_action[grp != "Residual"]$P)))){ + model_loo_action_updated <- model_loo_action + + updated_model_action <- as.formula(paste0("Expression ~ 1 + (1|", paste0(c(variables, action_variables), collapse = ") + (1|"), ")")) + + model_loo_action_updated[["all"]] <- suppress_warnings(glmmTMB(formula = noquote(updated_model_action), data = df_hier_unscale, REML = TRUE), "giveCsparse") + + ### Calculate the variance explained by each of the included variables ### + icc_action <- icc_glmmtmb(model_loo_action_updated[["all"]]) + + + ### Recalculate significance ### + icc_action$P <- as.numeric(NA) + icc_action$gene <- gene + + for (variable in c(variables, action_variables)){ + print(variable) + if (length(c(action_variables)) > 1){ + model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(c(variables, action_variables)[!c(variables, action_variables) %in% variable], collapse = ") + (1|"), ")")) + } else { + model <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")")) + } + model_loo_action_updated[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse") + icc_action[grp == variable]$P <- anova(model_loo_action_updated[[variable]], model_loo_action_updated[["all"]])$`Pr(>Chisq)`[2] + } + } + + + while((any(icc_action[!(grp %in% c("Residual", variables))]$P > 0.05/length(c(variables, action_variables))) | any(is.na(icc_action[!(grp %in%c("Residual", variables))]$P)))){ + + print("Removing non-significant vartiables and retesting signficance") + + ##### Identify variables to keep ##### + action_variables <- icc_action[!(grp %in% c("Residual", variables)) & P < 0.05/length(c(variables, action_variables))]$grp + + if (length(action_variables) > 0){ + + ##### Calculate full model ##### + updated_model_action <- as.formula(paste0("Expression ~ 1 + (1|", paste0(c(variables, action_variables), collapse = ") + (1|"), ")")) + + + model_loo_action_updated <- list() + model_loo_action_updated[["all"]] <- suppress_warnings(glmmTMB(formula = noquote(updated_model_action), data = df_hier_unscale, REML = TRUE), "giveCsparse") + + + + ### Calculate the variance explained by each of the included variables ### + icc_action <- icc_glmmtmb(model_loo_action_updated[["all"]]) + + + + ### Recalfulate significance ### + icc_action$P <- as.numeric(NA) + icc_action$gene <- gene + + for (variable in c(variables, action_variables)){ + print(variable) + model_action <- as.formula(paste0("Expression ~ 1 + (1|", paste0(c(variables, action_variables)[!(c(variables, action_variables) %in% variable)], collapse = ") + (1|"), ")")) + model_loo_action_updated[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model_action), data = df_hier_unscale, REML = TRUE), "giveCsparse") + icc_action[grp == variable]$P <- anova(model_loo_action_updated[[variable]], model_loo_action_updated[["all"]])$`Pr(>Chisq)`[2] + } + } else { + icc_action <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character()) + model_loo_action_updated <- list() + } + } + } else { + icc_action <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character()) + model_loo_action_updated <- list() + } + if (nrow(icc_action) > nrow(icc)){ + saveRDS(icc_action, paste0(icc_action_outdir, gene, "_icc.rds"), compress = TRUE) + saveRDS(model_loo_action_updated, paste0(model_action_outdir, gene, "_fitted_models.rds"), compress = TRUE) + } + + + ### If line is significant, then get residuals for downstream qtl checks ### + if ("Line" %in% variables){ + print("Making residuals for qtl detection") + if (length(variables) > 1){ + if (length(action_variables) > 0){ + model_no_line <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, action_variables)[-grep("Line", c(variables, action_variables))], collapse = ") + (1|"), ")")) + } else { + model_no_line <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% "Line"], collapse = ") + (1|"), ")")) + } + } else { + model_no_line <- as.formula(paste0("Expression ~ 1")) + } + fit_no_line <- glmmTMB(formula = noquote(model_no_line), data = df_hier_unscale, REML = TRUE) + residuals <- resid(fit_no_line) + saveRDS(residuals, paste0(resid_outdir, gene, "_residuals4qtl.rds"), compress = TRUE) + } + + } else { + icc <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character()) + model_loo_updated <- list() + } + } else { + icc <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character()) + model_loo_updated <- list() + } +} + + +saveRDS(icc, paste0(icc_outdir, gene, "_icc.rds"), compress = TRUE) +saveRDS(model_loo_updated, paste0(model_outdir, gene, "_fitted_models.rds"), compress = TRUE) + + +fin() \ No newline at end of file diff --git a/multi-passage/Variance/variance_partition_multipassage_integratedSCT_Ncov.R b/multi-passage/Variance/variance_partition_multipassage_integratedSCT_Ncov.R new file mode 100644 index 0000000..6b23a9a --- /dev/null +++ b/multi-passage/Variance/variance_partition_multipassage_integratedSCT_Ncov.R @@ -0,0 +1,358 @@ +library(haven) +library(ggplot2) +library(glmmTMB) +library(Seurat) +library(tidyverse) +library(specr) +library(data.table) +library(dsLib) +library(pkgcond) +library(texreg) + + +inicio("Starting Analysis") + + +##### Define functions ##### +icc_glmmtmb <- function(model, percent = TRUE) { + tmp <- VarCorr(model) + var <- do.call(rbind, lapply(names(tmp$cond), function(x) data.table("grp" = x, "vcov" = attr(tmp$cond[[x]], "stddev")^2))) + var <- rbind(var, data.table("grp" = "Residual", "vcov" = sigma(model)^2)) + sum_var <- sum(var$vcov) + var <- var %>% dplyr::mutate(icc = vcov/sum_var) + if (isTRUE(percent)) { + var <- var %>% dplyr::mutate(percent = .data$icc * 100) + } + return(var) +} + + + +##### Bring in variables ##### +### Bring in arguments +args <- commandArgs(trailingOnly = TRUE) +icc_interaction_outdir <- paste0(args[1]) +icc_outdir <- paste0(args[2]) +model_interaction_outdir <- paste0(args[3]) +model_outdir <- paste0(args[4]) +resid_outdir <- paste0(args[5]) +gene <- as.character(args[6]) + +print(icc_outdir) +print(icc_outdir) +print(model_outdir) +print(resid_outdir) +print(gene) + + + +##### Read in seurat with genes ##### +seurat <- readRDS("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/preQC/time-integrated_filtered_seurat_1pct_expressing.rds") + + + +### Make DF for modeling ### +df_hier_unscale <- data.frame("Expression" = seurat[["SCT"]]@scale.data[gene,], "Line" = seurat@meta.data$Assignment, "Time" = as.factor(seurat@meta.data$Pool), "Ncov" = seurat$Ncov) +colnames(df_hier_unscale)[1] <- "Expression" + + + +##### Leave one out method ##### +variables <- c("Line", "Time", "Ncov") + +model_all <- as.formula(paste0("Expression ~ (1|", paste0(variables, collapse = ") + (1|"), ")")) + + +boolFalse<-F +while(boolFalse==F & length(variables) > 0){ + tryCatch({ + print(variables) + model_glmmtmb <- suppress_warnings(glmmTMB(formula = noquote(model_all), data = df_hier_unscale, REML = TRUE), "giveCsparse") + boolFalse<-T + },error=function(e){ + if (length(variables) > 1){ + variables <- variables[1:(length(variables) -1)] + } else { + variables <- c() + } + }) +} + + +if (!length(variables) == 0){ + + + ### Deal with singular fits by removing last variable until a fit can be found - ordered in variables buy importance + while (!model_glmmtmb$sdr$pdHess & length(variables) > 0 ){ + print("Singular fit: removing last variable and rerunning with one less covariate.") + if (length(variables) > 1){ + variables <- variables[1:(length(variables) -1)] + print(variables) + model_all <- as.formula(paste0("Expression ~ (1|", paste0(variables, collapse = ") + (1|"), ")")) + model_glmmtmb <- suppress_warnings(glmmTMB(formula = noquote(model_all), data = df_hier_unscale, REML = TRUE), "giveCsparse") + } else { + variables <- c() + } + } + + print(variables) + + if (length(variables) > 0){ + + model_loo <- list() + + icc <- data.table(grp = variables, P = as.numeric(NA)) + + for (variable in variables){ + print(variable) + if (length(variables) > 1){ + model <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")")) + } else { + model <- as.formula(paste0("Expression ~ 1")) + } + model_loo[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse") + icc[grp == variable]$P <- anova(model_loo[[variable]], model_glmmtmb)$`Pr(>Chisq)`[2] + } + + + if (!(any(icc[grp != "Residual"]$P > 0.05/length(variables)) | any(is.na(icc[grp != "Residual"]$P)))){ + model_loo_updated <- model_loo + + updated_model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(variables, collapse = ") + (1|"), ")")) + + model_loo_updated[["all"]] <- suppress_warnings(glmmTMB(formula = noquote(updated_model), data = df_hier_unscale, REML = TRUE), "giveCsparse") + + ### Calculate the variance explained by each of the included variables ### + icc <- icc_glmmtmb(model_loo_updated[["all"]]) + + + ### Recalculate significance ### + icc$P <- as.numeric(NA) + icc$gene <- gene + + for (variable in variables){ + print(variable) + if (length(variables) > 1){ + model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")")) + } else { + model <- as.formula(paste0("Expression ~ 1")) + } + model_loo_updated[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse") + icc[grp == variable]$P <- anova(model_loo_updated[[variable]], model_loo_updated[["all"]])$`Pr(>Chisq)`[2] + } + } + + + while((any(icc[grp != "Residual"]$P > 0.05/length(variables)) | any(is.na(icc[grp != "Residual"]$P)))){ + + print("Removing non-significant vartiables and retesting signficance") + + ##### Identify variables to keep ##### + variables <- icc[P < 0.05/length(variables)]$grp + + if (length(variables) > 0){ + + ##### Calculate full model ##### + updated_model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(variables, collapse = ") + (1|"), ")")) + + + model_loo_updated <- list() + model_loo_updated[["all"]] <- suppress_warnings(glmmTMB(formula = noquote(updated_model), data = df_hier_unscale, REML = TRUE), "giveCsparse") + + + + ### Calculate the variance explained by each of the included variables ### + icc <- icc_glmmtmb(model_loo_updated[["all"]]) + + + + ### Recalfulate significance ### + icc$P <- as.numeric(NA) + icc$gene <- gene + + for (variable in variables){ + print(variable) + if (length(variables) > 1){ + model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")")) + } else { + model <- as.formula(paste0("Expression ~ 1")) + } + model_loo_updated[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse") + icc[grp == variable]$P <- anova(model_loo_updated[[variable]], model_loo_updated[["all"]])$`Pr(>Chisq)`[2] + } + + + + } else { + icc <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character()) + model_loo_updated <- list() + } + } + + interaction_variables <- c() + + if (length(variables) > 1){ + ### Add in interactions of the significant variables + if ("Line" %in% variables & "Time" %in% variables){ + interaction_variables <- c(interaction_variables, "Line:Time") + } + + model_all_interaction <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, interaction_variables), collapse = ") + (1|"), ")")) + + + boolFalse<-F + while(boolFalse==F & length(interaction_variables) > 0){ + tryCatch({ + print(c(variables, interaction_variables)) + model_glmmtmb_interaction <- suppress_warnings(glmmTMB(formula = noquote(model_all_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse") + boolFalse<-T + },error=function(e){ + if (length(interaction_variables) > 1){ + interaction_variables <- interaction_variables[1:(length(interaction_variables) -1)] + } else { + interaction_variables <- c() + } + }) + } + + ### Deal with singular fits by removing last variable until a fit can be found - ordered in variables buy importance + while (!model_glmmtmb_interaction$sdr$pdHess & length(interaction_variables) > 0 ){ + print("Singular fit: removing last variable and rerunning with one less covariate.") + if (length(interaction_variables) > 1){ + interaction_variables <- interaction_variables[1:(length(interaction_variables) -1)] + print(c(interaction_variables, variables)) + model_all_interaction <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, interaction_variables), collapse = ") + (1|"), ")")) + model_glmmtmb_interaction <- suppress_warnings(glmmTMB(formula = noquote(model_all_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse") + } else { + interaction_variables <- c() + } + } + + if (length(interaction_variables) > 0){ + + model_loo_interaction <- list() + + icc_interaction <- data.table(grp = interaction_variables, P = as.numeric(NA)) + + for (variable in c(interaction_variables)){ + print(variable) + if (length(interaction_variables) > 1){ + model_interaction <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, interaction_variables)[!c(variables, interaction_variables) %in% variable], collapse = ") + (1|"), ")")) + } else { + model_interaction <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")")) + } + model_loo_interaction[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse") + icc_interaction[grp == variable]$P <- anova(model_loo_interaction[[variable]], model_glmmtmb_interaction)$`Pr(>Chisq)`[2] + } + + + if (!(any(icc_interaction[grp != "Residual"]$P > 0.05/length(c(variables, interaction_variables))) | any(is.na(icc_interaction[grp != "Residual"]$P)))){ + model_loo_interaction_updated <- model_loo_interaction + + updated_model_interaction <- as.formula(paste0("Expression ~ 1 + (1|", paste0(c(variables, interaction_variables), collapse = ") + (1|"), ")")) + + model_loo_interaction_updated[["all"]] <- suppress_warnings(glmmTMB(formula = noquote(updated_model_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse") + + ### Calculate the variance explained by each of the included variables ### + icc_interaction <- icc_glmmtmb(model_loo_interaction_updated[["all"]]) + + + ### Recalculate significance ### + icc_interaction$P <- as.numeric(NA) + icc_interaction$gene <- gene + + for (variable in c(variables, interaction_variables)){ + print(variable) + if (length(c(interaction_variables)) > 1){ + model <- as.formula(paste0("Expression ~ 1 + (1|", paste0(c(variables, interaction_variables)[!c(variables, interaction_variables) %in% variable], collapse = ") + (1|"), ")")) + } else { + model <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% variable], collapse = ") + (1|"), ")")) + } + model_loo_interaction_updated[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model), data = df_hier_unscale, REML = TRUE), "giveCsparse") + icc_interaction[grp == variable]$P <- anova(model_loo_interaction_updated[[variable]], model_loo_interaction_updated[["all"]])$`Pr(>Chisq)`[2] + } + } + + + while((any(icc_interaction[!(grp %in% c("Residual", variables))]$P > 0.05/length(c(variables, interaction_variables))) | any(is.na(icc_interaction[!(grp %in%c("Residual", variables))]$P)))){ + + print("Removing non-significant vartiables and retesting signficance") + + ##### Identify variables to keep ##### + interaction_variables <- icc_interaction[!(grp %in% c("Residual", variables)) & P < 0.05/length(c(variables, interaction_variables))]$grp + + if (length(interaction_variables) > 0){ + + ##### Calculate full model ##### + updated_model_interaction <- as.formula(paste0("Expression ~ 1 + (1|", paste0(c(variables, interaction_variables), collapse = ") + (1|"), ")")) + + + model_loo_interaction_updated <- list() + model_loo_interaction_updated[["all"]] <- suppress_warnings(glmmTMB(formula = noquote(updated_model_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse") + + + + ### Calculate the variance explained by each of the included variables ### + icc_interaction <- icc_glmmtmb(model_loo_interaction_updated[["all"]]) + + + + ### Recalfulate significance ### + icc_interaction$P <- as.numeric(NA) + icc_interaction$gene <- gene + + for (variable in c(variables, interaction_variables)){ + print(variable) + model_interaction <- as.formula(paste0("Expression ~ 1 + (1|", paste0(c(variables, interaction_variables)[!(c(variables, interaction_variables) %in% variable)], collapse = ") + (1|"), ")")) + model_loo_interaction_updated[[variable]] <- suppress_warnings(glmmTMB(formula = noquote(model_interaction), data = df_hier_unscale, REML = TRUE), "giveCsparse") + icc_interaction[grp == variable]$P <- anova(model_loo_interaction_updated[[variable]], model_loo_interaction_updated[["all"]])$`Pr(>Chisq)`[2] + } + } else { + icc_interaction <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character()) + model_loo_interaction_updated <- list() + } + + if (nrow(icc_interaction) > nrow(icc)){ + saveRDS(icc_interaction, paste0(icc_interaction_outdir, gene, "_icc.rds"), compress = TRUE) + saveRDS(model_loo_interaction_updated, paste0(model_interaction_outdir, gene, "_fitted_models.rds"), compress = TRUE) + } + } + } else { + icc_interaction <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character()) + model_loo_interaction_updated <- list() + } + + + ### If line is significant, then get residuals for downstream qtl checks ### + if ("Line" %in% variables){ + print("Making residuals for qtl detection") + if (length(variables) > 1){ + if (length(interaction_variables) > 0){ + model_no_line <- as.formula(paste0("Expression ~ (1|", paste0(c(variables, interaction_variables)[-grep("Line", c(variables, interaction_variables))], collapse = ") + (1|"), ")")) + } else { + model_no_line <- as.formula(paste0("Expression ~ (1|", paste0(variables[!variables %in% "Line"], collapse = ") + (1|"), ")")) + } + } else { + model_no_line <- as.formula(paste0("Expression ~ 1")) + } + fit_no_line <- glmmTMB(formula = noquote(model_no_line), data = df_hier_unscale, REML = TRUE) + residuals <- resid(fit_no_line) + saveRDS(residuals, paste0(resid_outdir, gene, "_residuals4qtl.rds"), compress = TRUE) + } + + } else { + icc <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character()) + model_loo_updated <- list() + } + } else { + icc <- data.table(grp=character(), vcov=numeric(), icc=numeric(), percent=numeric(), P=numeric(), gene=character()) + model_loo_updated <- list() + } +} + + +saveRDS(icc, paste0(icc_outdir, gene, "_icc.rds"), compress = TRUE) +saveRDS(model_loo_updated, paste0(model_outdir, gene, "_fitted_models.rds"), compress = TRUE) + + +fin() \ No newline at end of file diff --git a/multi-passage/Variance/variance_partition_multipassage_snake.sh b/multi-passage/Variance/variance_partition_multipassage_snake.sh new file mode 100644 index 0000000..8e72118 --- /dev/null +++ b/multi-passage/Variance/variance_partition_multipassage_snake.sh @@ -0,0 +1,136 @@ + +cd /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/multi-passage/Variance/ + +SNAKEFILE="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/multi-passage/Variance/variance_partition_multipassage.smk" +LOG="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/Variance/logs/" + +mkdir -p $LOG + +snakemake \ + --snakefile $SNAKEFILE \ + --dryrun \ + --cores 1 \ + --quiet \ + --unlock + + +snakemake \ + --snakefile $SNAKEFILE \ + --dryrun \ + --cores 1 \ + --reason > jobs2run.txt + + + +nohup \ + snakemake \ + --snakefile $SNAKEFILE \ + --jobs 100 \ + --use-singularity \ + --restart-times 1 \ + --keep-going \ + --cluster \ + "qsub -S /bin/bash \ + -q short.q \ + -r yes \ + -pe smp {threads} \ + -l tmp_requested={resources.disk_per_thread_gb}G \ + -l mem_requested={resources.mem_per_thread_gb}G \ + -e $LOG \ + -o $LOG \ + -j y \ + -V" \ + > $LOG/nohup_`date +%Y-%m-%d.%H:%M:%S`.log & + + +snakemake \ + --snakefile $SNAKEFILE \ + --dryrun \ + --cores 1 \ + --quiet \ + --unlock + + +snakemake \ + --snakefile $SNAKEFILE \ + --dryrun \ + --cores 1 \ + --quiet \ + --cleanup-metadata \ + /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/icc/ENSG00000116001_icc.rds \ + /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/fit_models/ENSG00000116001_fitted_models.rds \ + /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/icc/ENSG00000084112_icc.rds \ + /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/fit_models/ENSG00000084112_fitted_models.rds \ + /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/icc/ENSG00000130706_icc.rds \ + /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/variance_partition_post_review/gene_separated/fit_models/ENSG00000130706_fitted_models.rds + + + +rm ENSG00000047617_icc.rds +rm ENSG00000089250_icc.rds +rm ENSG00000124602_icc.rds +rm ENSG00000130052_icc.rds +rm ENSG00000136099_icc.rds +rm ENSG00000143412_icc.rds +rm ENSG00000147124_icc.rds +rm ENSG00000158104_icc.rds +rm ENSG00000160188_icc.rds +rm ENSG00000163393_icc.rds +rm ENSG00000163947_icc.rds +rm ENSG00000167083_icc.rds +rm ENSG00000170113_icc.rds +rm ENSG00000170961_icc.rds +rm ENSG00000172748_icc.rds +rm ENSG00000184916_icc.rds +rm ENSG00000185386_icc.rds +rm ENSG00000186141_icc.rds +rm ENSG00000188536_icc.rds +rm ENSG00000189164_icc.rds +rm ENSG00000230606_icc.rds +rm ENSG00000242687_icc.rds +rm ENSG00000243444_icc.rds +rm ENSG00000253438_icc.rds +rm ENSG00000253731_icc.rds +rm ENSG00000258813_icc.rds +rm ENSG00000258944_icc.rds +rm ENSG00000260528_icc.rds +rm ENSG00000272030_icc.rds +rm ENSG00000272973_icc.rds +rm ENSG00000274367_icc.rds +rm ENSG00000275580_icc.rds + + + + +rm ENSG00000047617_fitted_models.rds +rm ENSG00000089250_fitted_models.rds +rm ENSG00000124602_fitted_models.rds +rm ENSG00000130052_fitted_models.rds +rm ENSG00000136099_fitted_models.rds +rm ENSG00000143412_fitted_models.rds +rm ENSG00000147124_fitted_models.rds +rm ENSG00000158104_fitted_models.rds +rm ENSG00000160188_fitted_models.rds +rm ENSG00000163393_fitted_models.rds +rm ENSG00000163947_fitted_models.rds +rm ENSG00000167083_fitted_models.rds +rm ENSG00000170113_fitted_models.rds +rm ENSG00000170961_fitted_models.rds +rm ENSG00000172748_fitted_models.rds +rm ENSG00000184916_fitted_models.rds +rm ENSG00000185386_fitted_models.rds +rm ENSG00000186141_fitted_models.rds +rm ENSG00000188536_fitted_models.rds +rm ENSG00000189164_fitted_models.rds +rm ENSG00000230606_fitted_models.rds +rm ENSG00000242687_fitted_models.rds +rm ENSG00000243444_fitted_models.rds +rm ENSG00000253438_fitted_models.rds +rm ENSG00000253731_fitted_models.rds +rm ENSG00000258813_fitted_models.rds +rm ENSG00000258944_fitted_models.rds +rm ENSG00000260528_fitted_models.rds +rm ENSG00000272030_fitted_models.rds +rm ENSG00000272973_fitted_models.rds +rm ENSG00000274367_fitted_models.rds +rm ENSG00000275580_fitted_models.rds diff --git a/multi-passage/preQC.R b/multi-passage/preQC.R index 323d606..bd1859d 100644 --- a/multi-passage/preQC.R +++ b/multi-passage/preQC.R @@ -4,11 +4,13 @@ library(dplyr) library(tidyverse) library(colorspace) library(Nebulosa) +library(RColorBrewer) +library(scran) ##### Set up directories ##### dir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/" -datadir <- paste0(dir, "data/multi-passage/GIMR_GWCCG_211213_JOSPOW_10x_3p_results/GIMR_GWCCG_211213_JOSPOW_10x_3p/analyses/") +datadir <- paste0(dir, "data/multi-passage/resequencing/220405/") outdir <- paste0(dir, "output/multi-passage/preQC/") @@ -20,7 +22,7 @@ pools <- dir(datadir) ## Read in expression data counts_list <- lapply(pools, function(x){ print(x) - Read10X(paste0(datadir,x, "/GE/",x,"/outs/per_sample_outs/",x,"/count/sample_feature_bc_matrix"), gene.column = 1) + Read10X(paste0(datadir,x, "/GE/summary/sample_feature_bc_matrix/"), gene.column = 1) }) names(counts_list) <- pools @@ -34,8 +36,10 @@ names(counts_list) <- pools ## Make seurat object ## -seurat_list <- lapply(counts_list, function(x){ - CreateSeuratObject(counts = x) +seurat_list <- lapply(names(counts_list), function(x){ + temp <- CreateSeuratObject(counts = counts_list[[x]]) + temp@meta.data$Pool <- x + return(temp) }) names(seurat_list) <- pools @@ -65,6 +69,44 @@ RbGeneList <- RbGeneList[which(RbGeneList$ENSG %in% rownames(seurat)),] seurat <- PercentageFeatureSet(seurat, features = RbGeneList$ENSG, col.name = "percent.rb") +## Add Demultiplexing Results ## +demultiplexing_list <- lapply(pools, function(pool){ + temp <- data.frame(fread(paste0(dir, "output/multi-passage/demultiplexed/with_demuxlet/updated_2022_06_26/atleasthalf_singlet/", pool, "/atleasthalf_singlet_w_combined_assignments.tsv"))) + rownames(temp) <- gsub("-1", "", paste0(pool, "_", temp$Barcode)) + return(temp) +}) + +demultiplexing_dt <- do.call(rbind,demultiplexing_list) + + +seurat <- AddMetaData(seurat, demultiplexing_dt) + + +print("Starting cell cycle step") +hs.pairs <- readRDS(system.file("exdata", "human_cycle_markers.rds", package="scran")) + +countsENSG <- seurat[["RNA"]]@counts # Create a counts matrix that has the ENSG gene clasifiers so can determine cell cycle phase +print("The size of the counts object in genes x cells is:") +print(dim(countsENSG)) + +### Run the cell cycle identification ### +print("Starting cell cycle determination") +assigned <- cyclone(countsENSG, pairs=hs.pairs) #Note, this takes hours +table(assigned$phases) +write.table(assigned, file = paste0(outdir,"CellCycleProportions.txt"), quote = F, sep = "\t") #Save so that can read in and don't have to wait to recompute again + + +assigned <- read.table(paste0(outdir,"CellCycleProportions.txt"), sep = "\t") +assigned <- as.data.frame(assigned) +rownames(assigned) <- colnames(seurat) +write.table(assigned, file = paste0(outdir,"CellCycleProportions.txt"), quote = F, sep = "\t") #Save so that can read in and don't have to wait to recompute again +assigned <- read.table(paste0(outdir,"CellCycleProportions.txt"), sep = "\t") + +seurat <- AddMetaData(seurat, assigned) +saveRDS(seurat, paste0(outdir,"seurat_all_cells_cell_cycle.rds")) + + + ### Make pre-QC figures ### seurat <- NormalizeData(seurat, verbose = TRUE) seurat <- FindVariableFeatures(seurat, selection.method = "mean.var.plot") @@ -75,45 +117,333 @@ seurat <- FindClusters(seurat, resolution = 0.5) seurat <- RunUMAP(seurat, dims = 1:10) -seurat$Pool <- gsub("_[ATCG]+", "", colnames(seurat)) - ### QC Figures pre filtering ### plot_mt_pct <- VlnPlot(seurat, features = c( "percent.mt"), group.by = "Pool", pt.size = 0) + - scale_fill_discrete_sequential(palette = "SunsetDark") + scale_fill_discrete_sequential(palette = "SunsetDark") + + theme(plot.background = element_rect(fill = "white")) ggsave(plot_mt_pct, filename = paste0(outdir,"Mt_pct_vln.png")) plot_rb_pct <- VlnPlot(seurat, features = c( "percent.rb"), group.by = "Pool", pt.size = 0) + - scale_fill_discrete_sequential(palette = "SunsetDark") + scale_fill_discrete_sequential(palette = "SunsetDark") + + theme(plot.background = element_rect(fill = "white")) ggsave(plot_rb_pct, filename = paste0(outdir,"Rb_pct_vln.png")) plot_n_count <- VlnPlot(seurat, features = c( "nCount_RNA"), group.by = "Pool", pt.size = 0) + - scale_fill_discrete_sequential(palette = "SunsetDark") + scale_fill_discrete_sequential(palette = "SunsetDark") + + theme(plot.background = element_rect(fill = "white")) ggsave(plot_n_count, filename = paste0(outdir,"N_count_vln.png")) plot_nFeature_RNA <- VlnPlot(seurat, features = c( "nFeature_RNA"), group.by = "Pool", pt.size = 0) + - scale_fill_discrete_sequential(palette = "SunsetDark") + scale_fill_discrete_sequential(palette = "SunsetDark") + + theme(plot.background = element_rect(fill = "white")) ggsave(plot_nFeature_RNA, filename = paste0(outdir,"nFeature_RNA_vln.png")) lib_mt <- FeatureScatter(seurat, feature1 = "nCount_RNA", feature2 = "percent.mt", group.by = "Pool") + - scale_color_discrete_sequential(palette = "SunsetDark", alpha = 0.25) + scale_color_discrete_sequential(palette = "SunsetDark", alpha = 0.25) + + theme(plot.background = element_rect(fill = "white")) ggsave(lib_mt, filename = paste0(outdir,"lib_mt.png")) lib_genes <- FeatureScatter(seurat, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", group.by = "Pool") + - scale_color_discrete_sequential(palette = "SunsetDark", alpha = 0.25) + scale_color_discrete_sequential(palette = "SunsetDark", alpha = 0.25) + + theme(plot.background = element_rect(fill = "white")) ggsave(lib_genes, filename = paste0(outdir,"lib_genes.png")) -nebulosa_mt_umap <- plot_density(seurat, "percent.mt", pal = "plasma") +nebulosa_mt_umap <- plot_density(seurat, "percent.mt", pal = "plasma") + + theme(plot.background = element_rect(fill = "white")) ggsave(nebulosa_mt_umap, filename = paste0(outdir,"mt_percent_umap.png")) -nebulosa_rb_umap <- plot_density(seurat, "percent.rb", pal = "plasma") +nebulosa_rb_umap <- plot_density(seurat, "percent.rb", pal = "plasma") + + theme(plot.background = element_rect(fill = "white")) ggsave(nebulosa_rb_umap, filename = paste0(outdir,"rb_percent_umap.png")) umap_Pool <- DimPlot(seurat, group.by = "Pool") + - scale_color_discrete_sequential(palette = "SunsetDark") + scale_color_discrete_sequential(palette = "SunsetDark") + + theme(plot.background = element_rect(fill = "white")) ggsave(umap_Pool, filename = paste0(outdir,"pool_umap.png")) mt_umap <- FeaturePlot(seurat, features = "percent.mt") + - scale_color_continuous_sequential(palette = "RedPurple") + scale_color_continuous_sequential(palette = "RedPurple") + + theme(plot.background = element_rect(fill = "white")) ggsave(mt_umap, filename = paste0(outdir,"mt_percent_umap_seurat.png")) +colourCount = length(unique(seurat@meta.data$AtLeastHalfSinglet_Individual_Assignment)) +getPalette = colorRampPalette(brewer.pal(9, "Set1")) + +umap_individual <- DimPlot(seurat, group.by = "AtLeastHalfSinglet_Individual_Assignment") + + scale_color_manual(values = getPalette(colourCount)) + + theme(plot.background = element_rect(fill = "white")) +ggsave(umap_individual, filename = paste0(outdir,"pool_individual.png")) + +umap_droplet_type <- DimPlot(seurat, group.by = "AtLeastHalfSinglet_DropletType") + + scale_color_discrete_sequential(palette = "Red-Blue", rev = FALSE) + + theme(plot.background = element_rect(fill = "white")) +ggsave(umap_droplet_type, filename = paste0(outdir,"pool_droplet_type.png")) + +umap_phases <- DimPlot(seurat, group.by = "phases") + + scale_color_discrete_sequential(palette = "Sunset", rev = FALSE) + + theme(plot.background = element_rect(fill = "white")) +ggsave(umap_phases, filename = paste0(outdir,"cell_cycle_type.png")) + + + +##### Subset seurat object by cells that have no info for cell assignments (removed by dropletQC) ##### +seurat_sub <- subset(seurat, subset = AtLeastHalfSinglet_Individual_Assignment != "doublet") + + + +### Make pre-QC figures ### +seurat_sub <- NormalizeData(seurat_sub, verbose = TRUE) +seurat_sub <- FindVariableFeatures(seurat_sub, selection.method = "mean.var.plot") +seurat_sub <- ScaleData(seurat_sub, features = VariableFeatures(seurat_sub)) +seurat_sub <- RunPCA(seurat_sub, features = VariableFeatures(object = seurat_sub)) +seurat_sub <- FindNeighbors(seurat_sub, dims = 1:10) +seurat_sub <- FindClusters(seurat_sub, resolution = 0.5) +seurat_sub <- RunUMAP(seurat_sub, dims = 1:10) + + + +### QC Figures pre filtering ### +plot_mt_pct <- VlnPlot(seurat_sub, features = c( "percent.mt"), group.by = "Pool", pt.size = 0) + + scale_fill_discrete_sequential(palette = "SunsetDark") + + theme(plot.background = element_rect(fill = "white")) +ggsave(plot_mt_pct, filename = paste0(outdir,"Mt_pct_vln_singlets.png")) + +plot_rb_pct <- VlnPlot(seurat_sub, features = c( "percent.rb"), group.by = "Pool", pt.size = 0) + + scale_fill_discrete_sequential(palette = "SunsetDark") + + theme(plot.background = element_rect(fill = "white")) +ggsave(plot_rb_pct, filename = paste0(outdir,"Rb_pct_vln_singlets.png")) + +plot_n_count <- VlnPlot(seurat_sub, features = c( "nCount_RNA"), group.by = "Pool", pt.size = 0) + + scale_fill_discrete_sequential(palette = "SunsetDark") + + theme(plot.background = element_rect(fill = "white")) +ggsave(plot_n_count, filename = paste0(outdir,"N_count_vln_singlets.png")) + +plot_nFeature_RNA <- VlnPlot(seurat_sub, features = c( "nFeature_RNA"), group.by = "Pool", pt.size = 0) + + scale_fill_discrete_sequential(palette = "SunsetDark") + + theme(plot.background = element_rect(fill = "white")) +ggsave(plot_nFeature_RNA, filename = paste0(outdir,"nFeature_RNA_vln_singlets.png")) + +lib_mt <- FeatureScatter(seurat_sub, feature1 = "nCount_RNA", feature2 = "percent.mt", group.by = "Pool") + + scale_color_discrete_sequential(palette = "SunsetDark", alpha = 0.25) + + theme(plot.background = element_rect(fill = "white")) +ggsave(lib_mt, filename = paste0(outdir,"lib_mt_singlets.png")) + +lib_genes <- FeatureScatter(seurat_sub, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", group.by = "Pool") + + scale_color_discrete_sequential(palette = "SunsetDark", alpha = 0.25) + + theme(plot.background = element_rect(fill = "white")) +ggsave(lib_genes, filename = paste0(outdir,"lib_genes_singlets.png")) + +nebulosa_mt_umap <- plot_density(seurat_sub, "percent.mt", pal = "plasma") + + theme(plot.background = element_rect(fill = "white")) +ggsave(nebulosa_mt_umap, filename = paste0(outdir,"mt_percent_umap_singlets.png")) + +nebulosa_rb_umap <- plot_density(seurat_sub, "percent.rb", pal = "plasma") + + theme(plot.background = element_rect(fill = "white")) +ggsave(nebulosa_rb_umap, filename = paste0(outdir,"rb_percent_umap_singlets.png")) + +umap_Pool <- DimPlot(seurat_sub, group.by = "Pool") + + scale_color_discrete_sequential(palette = "SunsetDark") + + theme(plot.background = element_rect(fill = "white")) +ggsave(umap_Pool, filename = paste0(outdir,"pool_umap_singlets.png")) + +mt_umap <- FeaturePlot(seurat_sub, features = "percent.mt") + + scale_color_continuous_sequential(palette = "RedPurple") + + theme(plot.background = element_rect(fill = "white")) +ggsave(mt_umap, filename = paste0(outdir,"mt_percent_umap_seurat_singlets.png")) + +colourCount = length(unique(seurat_sub@meta.data$AtLeastHalfSinglet_Individual_Assignment)) +getPalette = colorRampPalette(brewer.pal(9, "Set1")) + +umap_individual <- DimPlot(seurat_sub, group.by = "AtLeastHalfSinglet_Individual_Assignment") + + scale_color_manual(values = getPalette(colourCount)) + + theme(plot.background = element_rect(fill = "white")) +ggsave(umap_individual, filename = paste0(outdir,"pool_individual_singlets.png")) + +umap_droplet_type <- DimPlot(seurat_sub, group.by = "AtLeastHalfSinglet_DropletType") + + scale_color_discrete_sequential(palette = "Red-Blue", rev = FALSE) + + theme(plot.background = element_rect(fill = "white")) +ggsave(umap_droplet_type, filename = paste0(outdir,"pool_droplet_type_singlets.png")) + +umap_phases <- DimPlot(seurat_sub, group.by = "phases") + + scale_color_discrete_sequential(palette = "Sunset", rev = FALSE) + + theme(plot.background = element_rect(fill = "white")) +ggsave(umap_phases, filename = paste0(outdir,"cell_cycle_type_singlets.png")) + + + + + +##### Remove high mitochondrial % cells ##### +seurat_sub_mt <- subset(seurat_sub, subset = percent.mt < 25) + + +### Integrate different time-points +seurat_sub_mt_list <- SplitObject(seurat_sub_mt, split.by = "Pool") +seurat_sub_mt_list <- lapply(X = seurat_sub_mt_list, function(x) SCTransform(x, vars.to.regress = c("scores.G1", "scores.S", "scores.G2M", "percent.mt", "percent.rb"), return.only.var.genes = FALSE)) +features <- SelectIntegrationFeatures(object.list = seurat_sub_mt_list, nfeatures = 3000) +seurat_sub_mt_list <- PrepSCTIntegration(object.list = seurat_sub_mt_list, anchor.features = features) + +saveRDS(seurat_sub_mt_list, paste0(outdir,"seurat_sub_mt_list.rds")) +seurat_sub_mt_list <- readRDS(paste0(outdir,"seurat_sub_mt_list.rds")) + +seurat_sub_mt_anchors <- FindIntegrationAnchors(object.list = seurat_sub_mt_list, normalization.method = "SCT", + anchor.features = features) +combined_sct <- IntegrateData(anchorset = seurat_sub_mt_anchors, normalization.method = "SCT") + +combined_sct <- RunPCA(combined_sct, verbose = FALSE) +combined_sct <- RunUMAP(combined_sct, reduction = "pca", dims = 1:30) + + +### QC Figures ### +plot_mt_pct <- VlnPlot(combined_sct, features = c( "percent.mt"), group.by = "Pool", pt.size = 0) + + scale_fill_discrete_sequential(palette = "SunsetDark") + + theme(plot.background = element_rect(fill = "white")) +ggsave(plot_mt_pct, filename = paste0(outdir,"Mt_pct_vln_singlets_filtered.png")) + +plot_rb_pct <- VlnPlot(combined_sct, features = c( "percent.rb"), group.by = "Pool", pt.size = 0) + + scale_fill_discrete_sequential(palette = "SunsetDark") + + theme(plot.background = element_rect(fill = "white")) +ggsave(plot_rb_pct, filename = paste0(outdir,"Rb_pct_vln_singlets_filtered.png")) + +plot_n_count <- VlnPlot(combined_sct, features = c( "nCount_RNA"), group.by = "Pool", pt.size = 0) + + scale_fill_discrete_sequential(palette = "SunsetDark") + + theme(plot.background = element_rect(fill = "white")) +ggsave(plot_n_count, filename = paste0(outdir,"N_count_vln_singlets_filtered.png")) + +plot_nFeature_RNA <- VlnPlot(combined_sct, features = c( "nFeature_RNA"), group.by = "Pool", pt.size = 0) + + scale_fill_discrete_sequential(palette = "SunsetDark") + + theme(plot.background = element_rect(fill = "white")) +ggsave(plot_nFeature_RNA, filename = paste0(outdir,"nFeature_RNA_vln_singlets_filtered.png")) + +lib_mt <- FeatureScatter(combined_sct, feature1 = "nCount_RNA", feature2 = "percent.mt", group.by = "Pool") + + scale_color_discrete_sequential(palette = "SunsetDark", alpha = 0.25) + + theme(plot.background = element_rect(fill = "white")) +ggsave(lib_mt, filename = paste0(outdir,"lib_mt_singlets_filtered.png")) + +lib_genes <- FeatureScatter(combined_sct, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", group.by = "Pool") + + scale_color_discrete_sequential(palette = "SunsetDark", alpha = 0.25) + + theme(plot.background = element_rect(fill = "white")) +ggsave(lib_genes, filename = paste0(outdir,"lib_genes_singlets_filtered.png")) + +nebulosa_mt_umap <- plot_density(combined_sct, "percent.mt", pal = "plasma") + + theme(plot.background = element_rect(fill = "white")) +ggsave(nebulosa_mt_umap, filename = paste0(outdir,"mt_percent_umap_singlets_filtered.png")) + +nebulosa_rb_umap <- plot_density(combined_sct, "percent.rb", pal = "plasma") + + theme(plot.background = element_rect(fill = "white")) +ggsave(nebulosa_rb_umap, filename = paste0(outdir,"rb_percent_umap_singlets_filtered.png")) + +umap_Pool <- DimPlot(combined_sct, group.by = "Pool") + + scale_color_discrete_sequential(palette = "SunsetDark") + + theme(plot.background = element_rect(fill = "white")) +ggsave(umap_Pool, filename = paste0(outdir,"pool_umap_singlets_filtered.png")) + +mt_umap <- FeaturePlot(combined_sct, features = "percent.mt") + + scale_color_continuous_sequential(palette = "RedPurple") + + theme(plot.background = element_rect(fill = "white")) +ggsave(mt_umap, filename = paste0(outdir,"mt_percent_umap_seurat_singlets_filtered.png")) + +colourCount = length(unique(combined_sct@meta.data$AtLeastHalfSinglet_Individual_Assignment)) +getPalette = colorRampPalette(brewer.pal(9, "Set1")) + +umap_individual <- DimPlot(combined_sct, group.by = "AtLeastHalfSinglet_Individual_Assignment") + + scale_color_manual(values = getPalette(colourCount)) + + theme(plot.background = element_rect(fill = "white")) +ggsave(umap_individual, filename = paste0(outdir,"pool_individual_singlets_filtered.png")) + +umap_droplet_type <- DimPlot(combined_sct, group.by = "AtLeastHalfSinglet_DropletType") + + scale_color_discrete_sequential(palette = "Red-Blue", rev = FALSE) + + theme(plot.background = element_rect(fill = "white")) +ggsave(umap_droplet_type, filename = paste0(outdir,"pool_droplet_type_singlets_filtered.png")) + +umap_phases <- DimPlot(combined_sct, group.by = "phases") + + scale_color_discrete_sequential(palette = "Sunset", rev = FALSE) + + theme(plot.background = element_rect(fill = "white")) +ggsave(umap_phases, filename = paste0(outdir,"cell_cycle_type_singlets_filtered.png")) + + +### Update individual IDs to match what used for Nona's plots ### +combined_sct$Assignment <- gsub("^0_", "", combined_sct$AtLeastHalfSinglet_Individual_Assignment) %>% + gsub("D-", "", .) %>% + gsub("\\.\\d\\.", "", .) %>% + gsub("N-", "", .) %>% + gsub("-P36", "", .) %>% + gsub("-", "", .) + +seurat_sub$Assignment <- gsub("^0_", "", seurat_sub$AtLeastHalfSinglet_Individual_Assignment) %>% + gsub("D-", "", .) %>% + gsub("\\.\\d\\.", "", .) %>% + gsub("N-", "", .) %>% + gsub("-P36", "", .) %>% + gsub("-", "", .) + + +##### Make proportion plots (area plot) ##### +village_summary <- data.table(prop.table(table(seurat_sub@meta.data[,c("Assignment", "Pool")]), margin = 2)) +village_summary$Assignment <- factor(village_summary$Assignment, levels = rev(village_summary[Pool == "Village_P8"]$Assignment[order(village_summary[Pool == "Village_P8"]$N)])) + +colors_original <- readRDS("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Nona_multiome/line_colors") +colors_original <- colors_original[levels(village_summary$Assignment)] + + +p_stacked_area <- ggplot(village_summary, aes(x = as.numeric(as.character(gsub("Village_P","", Pool))), y = N, fill = factor(Assignment), group = Assignment)) + + geom_area(alpha=0.6 , size=0.5, colour="black") + + theme_classic() + + scale_fill_manual(values = colors_original) + + xlab("Passage") + + ylab("Proportion of Cells") +ggsave(p_stacked_area, filename = paste0(outdir,"stacked_area.png"), width = 7, height = 4) +ggsave(p_stacked_area, filename = paste0(outdir,"stacked_area.pdf"), width = 7, height = 4) + + +village_summary_singlets <- data.table(prop.table(table(combined_sct@meta.data[,c("Assignment", "Pool")]), margin = 2)) +village_summary_singlets$Assignment <- factor(village_summary_singlets$Assignment, levels = rev(village_summary_singlets[Pool == "Village_P8"]$Assignment[order(village_summary_singlets[Pool == "Village_P8"]$N)])) + +colors <- readRDS("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Nona_multiome/line_colors") +colors <- colors[levels(village_summary_singlets$Assignment)] + + +p_stacked_area <- ggplot(village_summary_singlets, aes(x = as.numeric(as.character(gsub("Village_P","", Pool))), y = N, fill = factor(Assignment), group = Assignment)) + + geom_area(alpha=0.6 , size=0.5, colour="black") + + theme_classic() + + scale_fill_manual(values = colors) + + xlab("Passage") + + ylab("Proportion of Cells") +ggsave(p_stacked_area, filename = paste0(outdir,"stacked_area_filtered.png"), width = 7, height = 4) +ggsave(p_stacked_area, filename = paste0(outdir,"stacked_area_filtered.pdf"), width = 7, height = 4) + + + + + + + +saveRDS(combined_sct, paste0(outdir,"time-integrated_filtered_seurat.rds")) +combined_sct <- readRDS(paste0(outdir,"time-integrated_filtered_seurat.rds")) + + +### Add number of cells at each time into metadata and 1/n for models +freq_dt <- as.data.frame(table(combined_sct$Assignment, combined_sct$Pool)) +freq_dt$Ncov <- ifelse(freq_dt$Freq == 0, NA, 1/freq_dt$Freq) + +meta <- left_join(combined_sct@meta.data, freq_dt, by = c("Assignment" = "Var1", "Pool" = "Var2")) +rownames(meta) <- rownames(combined_sct@meta.data) + +combined_sct_meta <- AddMetaData(combined_sct, meta) + + +### SCT the dataset +feats <- rownames(combined_sct_meta[["SCT"]]@counts)[which((rowSums(combined_sct_meta[["SCT"]]@counts > 0)/ncol(combined_sct_meta[["SCT"]]@counts)) >= 0.01)] +combined_sct_filt <- subset(combined_sct_meta, features = feats) + +saveRDS(combined_sct_filt, paste0(outdir, "time-integrated_filtered_seurat_1pct_expressing.rds")) + + +DefaultAssay(combined_sct_filt) <- "RNA" + +combined_sct_filt <- SCTransform(combined_sct_filt, verbose = TRUE, vars.to.regress = c("scores.G1", "scores.S", "scores.G2M", "percent.mt", "percent.rb"), return.only.var.genes = FALSE) + +saveRDS(combined_sct_filt, paste0(outdir, "seurat_joint_SCT_1pct_expressing.rds")) + +fwrite(data.table(Gene = rownames(combined_sct_filt)), paste0(outdir, "genes_1pct_expressing.tsv"), sep = "\t") diff --git a/multi-passage/test_vireo_ambient/vireo.sh b/multi-passage/test_vireo_ambient/vireo.sh new file mode 100644 index 0000000..4743d99 --- /dev/null +++ b/multi-passage/test_vireo_ambient/vireo.sh @@ -0,0 +1,6 @@ +#!/bin/bash + + + + +singularity exec --bind /directflow $SIF vireo -c $VIREO_INDIR -d $VIREO_INDIR/donor_subset.vcf -o $VIREO_OUTDIR -t $FIELD --callAmbientRNAs \ No newline at end of file diff --git a/multi-passage/test_vireo_ambient/vireo_submit.sh b/multi-passage/test_vireo_ambient/vireo_submit.sh new file mode 100644 index 0000000..8861288 --- /dev/null +++ b/multi-passage/test_vireo_ambient/vireo_submit.sh @@ -0,0 +1,41 @@ + + +DIR=/directflow/SCCGGroupShare/projects/himaro/demuxafy/demultiplex_scRNA-seq/log_dir/vireo_new_barcode +SIF=/directflow/SCCGGroupShare/projects/himaro/demuxafy/image/Demuxafy.sif +PIPELINE="/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/scripts/multi-passage/test_vireo_ambient/vireo.sh" + +THREADS=4 +N=18 + + +for pool in `ls /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/data/multi-passage/resequencing/220405/` +do + + FIELD="GP" + + + VIREO_INDIR=/directflow/SCCGGroupShare/projects/himaro/demuxafy/demultiplex_scRNA-seq/log_new/vireo_new_barcode/$pool + VIREO_OUTDIR=/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/vireo_test/$pool + LOG=$VIREO_OUTDIR/logs + mkdir -p $LOG + + + + qsub -S /bin/bash \ + -cwd \ + -pe smp 4 \ + -N vireo \ + -q short.q \ + -l mem_requested=8G \ + -l tmp_requested=8G \ + -e $LOG \ + -o $LOG \ + -r yes \ + -j y \ + -M d.neavin@garvan.org.au \ + -v DIR=$DIR,SIF=$SIF,VIREO_INDIR=$VIREO_INDIR,VIREO_OUTDIR=$VIREO_OUTDIR,FIELD=$FIELD \ + -V \ + -C '' $PIPELINE + + +done \ No newline at end of file diff --git a/multi-passage/transfer_data.sh b/multi-passage/transfer_data.sh new file mode 100644 index 0000000..36f08a6 --- /dev/null +++ b/multi-passage/transfer_data.sh @@ -0,0 +1,45 @@ + + +INDIR=/directflow/SCCGGroupShare/projects/himaro/demuxafy/demultiplex_scRNA-seq/log_new/combined_results/AtLeastHalfSinglet +OUTDIR=/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/demultiplexed + +mkdir -p $OUTDIR + +# for VIL in `ls $INDIR` +# do +# cp -R $VIL $OUTDIR +# done + +### Asked Himanshi to redo the overlap with demuxlet included since it allowed us to identify additionao +mkdir -p $OUTDIR/with_demuxlet + +for VIL in `ls $INDIR` +do + cp -R $INDIR/$VIL $OUTDIR/with_demuxlet +done + + + +conda activate baseR402 + +for village in `ls /directflow/SCCGGroupShare/projects/himaro/demuxafy/demultiplex_scRNA-seq/log_new/demuxlet_updated_barcodes/` +do + mkdir -p /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/demultiplexed/with_demuxlet/updated_2022_06_26/atleasthalf_singlet/$village + mkdir -p /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/demultiplexed/with_demuxlet/updated_2022_06_26/majority_singlet/$village + + Rscript /directflow/SCCGGroupShare/projects/DrewNeavin/Demultiplex_Benchmark/Demultiplexing_Doublet_Detecting_Docs/scripts/Combine_Results.R \ + -o /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/demultiplexed/with_demuxlet/updated_2022_06_26/majority_singlet/$village/majority_singlet_new.tsv \ + -d /directflow/SCCGGroupShare/projects/himaro/demuxafy/demultiplex_scRNA-seq/log_new/demuxlet_updated_barcodes/$village \ + -f /directflow/SCCGGroupShare/projects/himaro/demuxafy/demultiplex_scRNA-seq/log_new/freemuxlet_new_barcode/$village \ + -u /directflow/SCCGGroupShare/projects/himaro/demuxafy/demultiplex_scRNA-seq/log_new/souporcell_new_barcode/$village \ + -v /directflow/SCCGGroupShare/projects/himaro/demuxafy/demultiplex_scRNA-seq/log_new/vireo_new_barcode/$village \ + --method "MajoritySinglet" + + Rscript /directflow/SCCGGroupShare/projects/DrewNeavin/Demultiplex_Benchmark/Demultiplexing_Doublet_Detecting_Docs/scripts/Combine_Results.R \ + -o /directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/multi-passage/demultiplexed/with_demuxlet/updated_2022_06_26/atleasthalf_singlet/$village/atleasthalf_singlet_new.tsv \ + -d /directflow/SCCGGroupShare/projects/himaro/demuxafy/demultiplex_scRNA-seq/log_new/demuxlet_updated_barcodes/$village \ + -f /directflow/SCCGGroupShare/projects/himaro/demuxafy/demultiplex_scRNA-seq/log_new/freemuxlet_new_barcode/$village \ + -u /directflow/SCCGGroupShare/projects/himaro/demuxafy/demultiplex_scRNA-seq/log_new/souporcell_new_barcode/$village \ + -v /directflow/SCCGGroupShare/projects/himaro/demuxafy/demultiplex_scRNA-seq/log_new/vireo_new_barcode/$village \ + --method "AtLeastHalfSinglet" +done \ No newline at end of file diff --git a/scCODA/sccod.py b/scCODA/sccod.py new file mode 100644 index 0000000..6ddc350 --- /dev/null +++ b/scCODA/sccod.py @@ -0,0 +1,354 @@ +# Setup +import importlib +import warnings +warnings.filterwarnings("ignore") + +import pandas as pd +import pickle as pkl +import matplotlib.pyplot as plt + +from sccoda.util import comp_ana as mod +from sccoda.util import cell_composition_data as dat +from sccoda.util import data_visualization as viz + +import sccoda.datasets as scd + + + +### Set up directories ### +dir = '/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Expression_Boxplots/pluri_degs/' +outdir = '/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/scCODA/' + + +# Load data +cell_counts = pd.read_csv(outdir + 'cell_line_numbers.tsv', sep = '\t') + +print(cell_counts) + + +# Convert data to anndata object +data_all = dat.from_pandas(cell_counts, covariate_columns=["Group"]) + +print(data_all) + + +# Extract condition from mouse name and add it as an extra column to the covariates +data_all.obs["Location"] = data_all.obs["Group"].str.replace(r"_.+", "", regex=True) +data_all.obs["Village"] = data_all.obs["Group"].str.replace(r"Brisbane_", "", regex=True).str.replace(r"Melbourne_", "", regex=True).str.replace(r"Sydney_", "", regex=True).str.replace(r"_Replicate[0-9]_.+", "", regex=True) +data_all.obs["Replicate"] = data_all.obs["Group"].str.replace(r".+_Replicate", "Replicate", regex=True).str.replace("_.+", "", regex=True) +data_all.obs["Cryopreservation"] = data_all.obs["Group"].str.replace(r".+_", "", regex=True) + + +# Uni-culture vs village +## Brisbane +data_brisbane = data_all[data_all.obs["Location"].isin(["Brisbane"])] +viz.boxplots(data_brisbane, feature_name="Village") +plt.savefig(outdir + 'brisbane_numbers.png') + +model_brisbane = mod.CompositionalAnalysis(data_brisbane, formula="Village", reference_cell_type="automatic") +brisbane_sim_results = model_brisbane.sample_hmc() + +brisbane_sim_results.summary() +print(brisbane_sim_results.credible_effects()) + + +# Run scCODA with each cell type as the reference +cell_types_brisbane = data_brisbane.var.index +results_cycle_brisbane = pd.DataFrame(index=cell_types_brisbane, columns=["times_credible"]).fillna(0) + +models_brisbane = [] +results_brisbane = [] + +for ct in cell_types_brisbane: + print(f"Reference: {ct}") + # Run inference + model_temp = mod.CompositionalAnalysis(data_brisbane, formula="Village", reference_cell_type=ct) + models_brisbane.append(model_temp) + results_temp = model_temp.sample_hmc(num_results=20000) + results_brisbane.append(results_temp) + # Select credible effects + cred_eff = results_temp.credible_effects() + cred_eff.index = cred_eff.index.droplevel(level=0) + # add up credible effects + results_cycle_brisbane["times_credible"] += cred_eff.astype("int") + +# Calculate percentages +results_cycle_brisbane["pct_credible"] = results_cycle_brisbane["times_credible"]/len(cell_types_brisbane) +results_cycle_brisbane["is_credible"] = results_cycle_brisbane["pct_credible"] > 0.5 +print(results_cycle_brisbane) + +# Get average difference +### couldn't work out how to pull from summary in python, will do manually - angry emoji +results_brisbane[0].summary() +# Effects: +# Final Parameter Expected Sample log2-fold change +# Covariate Cell Type +# Village[T.Village] FSA0006 0.000000 1735.548669 0.855849 +# MBE1006 -0.665075 1502.677050 -0.103652 +# TOB0421 -1.146676 899.107614 -0.798455 + +results_brisbane[1].summary() +# Effects: +# Final Parameter Expected Sample log2-fold change +# Covariate Cell Type +# Village[T.Village] FSA0006 0.672738 1738.916835 0.863100 +# MBE1006 0.000000 1501.590528 -0.107455 +# TOB0421 -0.481418 896.825971 -0.801995 + +results_brisbane[2].summary() +# Effects: +# Final Parameter Expected Sample log2-fold change +# Covariate Cell Type +# Village[T.Village] FSA0006 1.155125 1739.197970 0.864988 +# MBE1006 0.480325 1500.238853 -0.108542 +# TOB0421 0.000000 897.896510 -0.801505 + + + +brisbane_logfc_results_dt = pd.DataFrame({'Cell Type': ['FSA0006', 'MBE1006', 'TOB0421'], + 'FSA0006_ref': [0.855849, -0.103652, -0.798455], + 'MBE1006_ref': [0.863100, -0.107455, -0.801995], + 'TOB0421_ref': [0.864988, -0.108542, -0.801505]}) + +brisbane_logfc_results_dt['mean'] = brisbane_logfc_results_dt.mean(axis=1) + +brisbane_logfc_results_dt['is_credible'] = [True, True, True] + +brisbane_logfc_results_dt.to_csv(outdir + "brisbane_logfc_results.tsv", sep = "\t") + + + + +## Melbourne +data_melbourne = data_all[data_all.obs["Location"].isin(["Melbourne"])] +viz.boxplots(data_melbourne, feature_name="Village") +plt.savefig(outdir + 'melbourne_numbers.png') + +model_melbourne = mod.CompositionalAnalysis(data_melbourne, formula="Village", reference_cell_type="automatic") +melbourne_sim_results = model_melbourne.sample_hmc() + +melbourne_sim_results.summary() +print(melbourne_sim_results.credible_effects()) + + +# Run scCODA with each cell type as the reference +cell_types_melbourne = data_melbourne.var.index +results_cycle_melbourne = pd.DataFrame(index=cell_types_melbourne, columns=["times_credible"]).fillna(0) + +models_melbourne = [] +results_melbourne = [] + +for ct in cell_types_melbourne: + print(f"Reference: {ct}") + # Run inference + model_temp = mod.CompositionalAnalysis(data_melbourne, formula="Village", reference_cell_type=ct) + models_melbourne.append(model_temp) + results_temp = model_temp.sample_hmc(num_results=20000) + results_melbourne.append(results_temp) + # Select credible effects + cred_eff = results_temp.credible_effects() + cred_eff.index = cred_eff.index.droplevel(level=0) + # add up credible effects + results_cycle_melbourne["times_credible"] += cred_eff.astype("int") + +# Calculate percentages +results_cycle_melbourne["pct_credible"] = results_cycle_melbourne["times_credible"]/len(cell_types_melbourne) +results_cycle_melbourne["is_credible"] = results_cycle_melbourne["pct_credible"] > 0.5 +print(results_cycle_melbourne) + + +# Get average difference +### couldn't work out how to pull from summary in python, will do manually - angry emoji +results_melbourne[0].summary() +# Effects: +# Final Parameter Expected Sample log2-fold change +# Covariate Cell Type +# Village[T.Village] FSA0006 0.000000 3471.687790 1.147833 +# MBE1006 -1.930117 648.252082 -1.636737 +# TOB0421 -1.203827 1065.893461 -0.588923 + +results_melbourne[1].summary() +# Effects: +# Final Parameter Expected Sample log2-fold change +# Covariate Cell Type +# Village[T.Village] FSA0006 1.932333 3478.980493 1.152990 +# MBE1006 0.000000 650.122718 -1.634778 +# TOB0421 0.716773 1056.730123 -0.600693 + +results_melbourne[2].summary() +# Effects: +# Final Parameter Expected Sample log2-fold change +# Covariate Cell Type +# Village[T.Village] FSA0006 1.139058 3411.776681 1.136354 +# MBE1006 -0.780606 662.039516 -1.633135 +# TOB0421 0.000000 1112.017136 -0.506959 + +melbourne_logfc_results_dt = pd.DataFrame({'Cell Type': ['FSA0006', 'MBE1006', 'TOB0421'], + 'FSA0006_ref': [1.147833, -1.636737, -0.588923], + 'MBE1006_ref': [1.152990, -1.634778, -0.600693], + 'TOB0421_ref': [1.136354, -1.633135, -0.506959]}) + +melbourne_logfc_results_dt['mean'] = melbourne_logfc_results_dt.mean(axis=1) + +melbourne_logfc_results_dt['is_credible'] = [True, True, True] + +melbourne_logfc_results_dt.to_csv(outdir + "melbourne_logfc_results.tsv", sep = "\t") + + + + + + +## Sydney - Fresh +data_sydney = data_all[data_all.obs["Location"].isin(["Sydney"]) & data_all.obs["Cryopreservation"].isin(["Fresh"])] +viz.boxplots(data_sydney, feature_name="Village") +plt.savefig(outdir + 'sydney_numbers.png') + +model_sydney = mod.CompositionalAnalysis(data_sydney, formula="Village", reference_cell_type="TOB0421") + +sydney_sim_results = model_sydney.sample_hmc() +sydney_sim_results.summary() +print(sydney_sim_results.credible_effects()) + + +# Run scCODA with each cell type as the reference +cell_types_sydney = data_sydney.var.index +results_cycle_sydney = pd.DataFrame(index=cell_types_sydney, columns=["times_credible"]).fillna(0) + +models_sydney = [] +results_sydney = [] + +for ct in cell_types_sydney: + print(f"Reference: {ct}") + # Run inference + model_temp = mod.CompositionalAnalysis(data_sydney, formula="Village", reference_cell_type=ct) + models_sydney.append(model_temp) + results_temp = model_temp.sample_hmc(num_results=20000) + results_sydney.append(results_temp) + # Select credible effects + cred_eff = results_temp.credible_effects() + cred_eff.index = cred_eff.index.droplevel(level=0) + # add up credible effects + results_cycle_sydney["times_credible"] += cred_eff.astype("int") + +# Calculate percentages +results_cycle_sydney["pct_credible"] = results_cycle_sydney["times_credible"]/len(cell_types_sydney) +results_cycle_sydney["is_credible"] = results_cycle_sydney["pct_credible"] > 0.5 +print(results_cycle_sydney) + +# Get average difference +### couldn't work out how to pull from summary in python, will do manually - angry emoji +results_sydney[0].summary() +# Effects: +# Final Parameter Expected Sample log2-fold change +# Covariate Cell Type +# Village[T.Village] FSA0006 0.0 1130.524812 0.0 +# MBE1006 0.0 1255.685903 0.0 +# TOB0421 0.0 819.289285 0.0 + +results_sydney[1].summary() +# Effects: +# Final Parameter Expected Sample log2-fold change +# Covariate Cell Type +# Village[T.Village] FSA0006 0.0 1132.029342 0.0 +# MBE1006 0.0 1244.846090 0.0 +# TOB0421 0.0 828.624568 0.0 + +results_sydney[2].summary() +# Effects: +# Final Parameter Expected Sample log2-fold change +# Covariate Cell Type +# Village[T.Village] FSA0006 0.0 1125.134806 0.0 +# MBE1006 0.0 1255.963310 0.0 +# TOB0421 0.0 824.401884 0.0 + + +sydney_logfc_results_dt = pd.DataFrame({'Cell Type': ['FSA0006', 'MBE1006', 'TOB0421'], + 'FSA0006_ref': [0, 0, 0], + 'MBE1006_ref': [0, 0, 0], + 'TOB0421_ref': [0, 0, 0]}) + +sydney_logfc_results_dt['mean'] = sydney_logfc_results_dt.mean(axis=1) + +sydney_logfc_results_dt['is_credible'] = [False, False, False] + +sydney_logfc_results_dt.to_csv(outdir + "sydney_logfc_results.tsv", sep = "\t") + + + + + +## Sydney - Cryopreserved +data_sydney_cryo = data_all[data_all.obs["Location"].isin(["Sydney"]) & data_all.obs["Cryopreservation"].isin(["Cryopreserved"])] +viz.boxplots(data_sydney_cryo, feature_name="Village") +plt.savefig(outdir + 'sydney_numbers.png') + +model_sydney_cryo = mod.CompositionalAnalysis(data_sydney_cryo, formula="Village", reference_cell_type="automatic") + +sydney_sim_results_cryo = model_sydney_cryo.sample_hmc() +sydney_sim_results_cryo.summary() +print(sydney_sim_results_cryo.credible_effects()) + + +# Run scCODA with each cell type as the reference +cell_types_sydney_cryo = data_sydney_cryo.var.index +results_cycle_sydney_cryo = pd.DataFrame(index=cell_types_sydney_cryo, columns=["times_credible"]).fillna(0) + +models_sydney_cryo = [] +results_sydney_cryo = [] + +for ct in cell_types_sydney_cryo: + print(f"Reference: {ct}") + # Run inference + model_temp = mod.CompositionalAnalysis(data_sydney_cryo, formula="Village", reference_cell_type=ct) + models_sydney_cryo.append(model_temp) + results_temp = model_temp.sample_hmc(num_results=20000) + results_sydney_cryo.append(results_temp) + # Select credible effects + cred_eff = results_temp.credible_effects() + cred_eff.index = cred_eff.index.droplevel(level=0) + # add up credible effects + results_cycle_sydney_cryo["times_credible"] += cred_eff.astype("int") + +# Calculate percentages +results_cycle_sydney_cryo["pct_credible"] = results_cycle_sydney_cryo["times_credible"]/len(results_cycle_sydney_cryo) +results_cycle_sydney_cryo["is_credible"] = results_cycle_sydney_cryo["pct_credible"] > 0.5 +print(results_cycle_sydney_cryo) + +# Get average difference +### couldn't work out how to pull from summary in python, will do manually - angry emoji +results_sydney_cryo[0].summary() +# Effects: +# Final Parameter Expected Sample log2-fold change +# Covariate Cell Type +# Village[T.Village] FSA0006 0.00000 1620.195457 1.368064 +# MBE1006 -2.05755 294.934049 -1.600354 +# TOB0421 -1.82538 280.037161 -1.265402 + +results_sydney_cryo[1].summary() +# Effects: +# Final Parameter Expected Sample log2-fold change +# Covariate Cell Type +# Village[T.Village] FSA0006 1.967652 1628.040102 1.372877 +# MBE1006 0.000000 308.733991 -1.465845 +# TOB0421 0.000000 258.392574 -1.465845 + +results_sydney_cryo[2].summary() +# Effects: +# Final Parameter Expected Sample log2-fold change +# Covariate Cell Type +# Village[T.Village] FSA0006 1.925718 1608.624854 1.359406 +# MBE1006 0.000000 320.030556 -1.418819 +# TOB0421 0.000000 266.511256 -1.418819 + +sydney_logfc_results_dt_cryo = pd.DataFrame({'Cell Type': ['FSA0006', 'MBE1006', 'TOB0421'], + 'FSA0006_ref': [1.368064, -1.600354, -1.265402], + 'MBE1006_ref': [1.372877, -1.465845, -1.465845], + 'TOB0421_ref': [1.359406, -1.418819, -1.418819]}) + +sydney_logfc_results_dt_cryo['mean'] = sydney_logfc_results_dt_cryo.mean(axis=1) + +sydney_logfc_results_dt_cryo['is_credible'] = [True, False, False] + +sydney_logfc_results_dt_cryo.to_csv(outdir + "sydney_cryo_logfc_results.tsv", sep = "\t") + diff --git a/scCODA/sccoda_plots.R b/scCODA/sccoda_plots.R new file mode 100644 index 0000000..8f782ae --- /dev/null +++ b/scCODA/sccoda_plots.R @@ -0,0 +1,105 @@ +library(data.table) +library(tidyverse) + + + +dir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/scCODA/" + +line_colors <- c(FSA0006 = "#F79E29", MBE1006 = "#9B2C99", TOB0421 = "#35369C") +groups <- c("brisbane", "melbourne", "sydney", "sydney_cryo") + +##### Read in Results ##### +results <- lapply(groups, function(group){ + tmp <- fread(paste0(dir, group, "_logfc_results.tsv")) + tmp$group <- group + return(tmp) +}) + +results_dt <- do.call(rbind, results) +results_dt$V1 <- NULL +results_dt$group <- gsub("brisbane", "Brisbane", results_dt$group) %>% + gsub("melbourne", "Melbourne", .) %>% + gsub("^sydney$", "Sydney", .) %>% + gsub("sydney_cryo", "Sydney\nCryopreserved", .) + + +results_long <- melt(results_dt, id.vars = c("Cell Type", "is_credible", "group"), + measure.vars = c("FSA0006_ref", "MBE1006_ref", "TOB0421_ref")) + + + +plot <- ggplot(results_long, aes(`Cell Type`, value, color = `Cell Type`)) + + geom_hline(yintercept = 0, linetype = "dashed") + + geom_jitter(width = 0.2, alpha = 0.7) + + theme_classic() + + scale_color_manual(values = line_colors) + + facet_wrap(vars(group), nrow = 1) + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), + axis.title.x = element_blank()) + + ylab("log2(Fold Change)") + + geom_text( + data = unique(results_long[,c("Cell Type", "is_credible", "group")]), + aes(y = 1.6,label = gsub(TRUE, "\\^", is_credible) %>% gsub(FALSE, " ", .)), + color = "black", + # position = position_dodge(width = 1), + # vjust = -0.5, + size = 3, + # stat = "unique", + # parse = TRUE + ) + +ggsave(plot, filename = paste0(dir, "fold_change_plot.png"), height = 3, width = 6) +ggsave(plot, filename = paste0(dir, "fold_change_plot.pdf"), height = 3, width = 6) + + + + +plot_fresh <- ggplot(results_long[group != "Sydney\nCryopreserved"], aes(`Cell Type`, value, color = `Cell Type`)) + + geom_hline(yintercept = 0, linetype = "dashed") + + geom_jitter(width = 0.2, alpha = 0.7) + + theme_classic() + + scale_color_manual(values = line_colors) + + facet_wrap(vars(group), nrow = 1) + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), + axis.title.x = element_blank()) + + ylab("log2(Fold Change)") + + geom_text( + data = unique(results_long[group != "Sydney\nCryopreserved",c("Cell Type", "is_credible", "group")]), + aes(y = 1.6,label = gsub(TRUE, "\\^", is_credible) %>% gsub(FALSE, " ", .)), + color = "black", + # position = position_dodge(width = 1), + # vjust = -0.5, + size = 3, + # stat = "unique", + # parse = TRUE + ) + +ggsave(plot_fresh, filename = paste0(dir, "fold_change_plot_fresh.png"), height = 3, width = 4.5) +ggsave(plot_fresh, filename = paste0(dir, "fold_change_plot_fresh.pdf"), height = 3, width = 4.5) + + + + +plot_cryo <- ggplot(results_long[grep("Sydney", group)], aes(`Cell Type`, value, color = `Cell Type`)) + + geom_hline(yintercept = 0, linetype = "dashed") + + geom_jitter(width = 0.2, alpha = 0.7) + + theme_classic() + + scale_color_manual(values = line_colors) + + facet_wrap(vars(group), nrow = 1) + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), + axis.title.x = element_blank()) + + ylab("log2(Fold Change)") + + geom_text( + data = unique(results_long[grep("Sydney", group),c("Cell Type", "is_credible", "group")]), + aes(y = 1.6,label = gsub(TRUE, "\\^", is_credible) %>% gsub(FALSE, " ", .)), + color = "black", + # position = position_dodge(width = 1), + # vjust = -0.5, + size = 3, + # stat = "unique", + # parse = TRUE + ) + +ggsave(plot_cryo, filename = paste0(dir, "fold_change_plot_cryo.png"), height = 3, width = 3.75) +ggsave(plot_cryo, filename = paste0(dir, "fold_change_plot_cryo.pdf"), height = 3, width = 3.75) + diff --git a/scCODA/sccoda_preparation.R b/scCODA/sccoda_preparation.R new file mode 100644 index 0000000..83347f5 --- /dev/null +++ b/scCODA/sccoda_preparation.R @@ -0,0 +1,40 @@ +library(data.table) +library(tidyverse) +library(Seurat) + + + + +outdir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/scCODA/" +dir.create(outdir, recursive = TRUE) + + + +seurat <- readRDS("/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/All_data_integrated_remove_bad/seurat_integrated_all_times_clustered.rds") + + +head(seurat@meta.data) + +seurat@meta.data$Location <- gsub("_.+", "", seurat@meta.data$Location) +seurat@meta.data$Cryopreserved <- ifelse(grepl("Thawed", seurat@meta.data$Location_Time), "Cryopreserved", "Fresh") +seurat@meta.data$Location_Cryopreserved_Line <- paste0(seurat@meta.data$Location, "_", seurat@meta.data$Cryopreserved, "_", seurat@meta.data$Final_Assignment) +seurat@meta.data$Village <- gsub("Baseline", "Uni-Culture", seurat@meta.data$Time) %>% + gsub("Thawed Village Day 0", "Uni-Culture", .) %>% + gsub("Thawed Village Day 7", "Village", .) %>% + gsub("Village Day 4", "Village", .) +seurat@meta.data$Replicate <- gsub("Brisbane", "Replicate", seurat@meta.data$MULTI_ID) %>% + gsub("Melbourne", "Replicate", .) %>% + gsub("Sydney", "Replicate", .) + + +seurat@meta.data$Group <- paste0(seurat@meta.data$Location, "_", seurat@meta.data$Village, "_",seurat@meta.data$Replicate, "_", seurat@meta.data$Cryopreserved) + + +cell_prop_long <- data.table(table(seurat@meta.data$Group, seurat@meta.data$Final_Assignment)) + + +cell_prop_dt <- dcast(cell_prop_long, V1 ~ V2, value.var = "N") +colnames(cell_prop_dt)[1] <- c("Group") + + +fwrite(cell_prop_dt, paste0(outdir, "cell_line_numbers.tsv"), sep = "\t")