update records

powellgenomicslab · Mar 8, 2022 · d19889f · d19889f
1 parent 456dc90
commit d19889f
Show file tree

Hide file tree

Showing 7 changed files with 780 additions and 0 deletions.
diff --git a/Growth_rate/growth_rate.R b/Growth_rate/growth_rate.R
@@ -0,0 +1,55 @@
+library(data.table)
+library(tidyverse)
+library(gtools)
+library(growthcurver)
+library("ggpubr")
+
+
+dir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/data/"
+data <- fread(paste0(dir, "Growth_rate/growth_rate_measurements.csv"))
+outdir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/growth_rate/"
+
+
+dir.create(outdir, recursive = TRUE)
+
+
+models <- lapply(unique(data$CellLine), function(line){
+	SummarizeGrowth(data[CellLine == line]$Time, data[CellLine == line]$Count)
+})
+names(models) <- unique(data$CellLine)
+
+predict <- lapply(names(models), function(line){
+	tmp <- data.table(Time = data[CellLine == line]$Time, pred.wt = predict(models[[line]]$model))
+	tmp$CellLine <- line
+	return(tmp)
+})
+
+predict_df <- do.call(rbind, predict)
+
+
+
+
+### Make Plot ###
+pBasic <- ggplot(data, aes(Time, Count, color = CellLine)) +
+  geom_point() +
+  theme_classic() +
+  geom_line(data=predict_df, aes(y=pred.wt))
+
+ggsave(pBasic, filename = paste0(outdir, "/Basic_growth_rates_plot.png"))
+
+
+
+
+##### Correlate growth rate with confluency #####
+pCorr <- ggplot(data, aes(Count,Confluency, color = CellLine)) +
+			geom_point() +
+			theme_classic()
+
+ggsave(pCorr, filename = paste0(outdir, "Count_confluency_corr.png"))
+
+
+pCorr <- ggscatter(data, x = "Count", y = "Confluency", color = "CellLine",
+			add = "reg.line", conf.int = TRUE, 
+			cor.method = "pearson") +
+			stat_cor(aes(color = CellLine), label.x = 90)
+ggsave(pCorr, filename = paste0(outdir, "Count_confluency_corr.png"))
diff --git a/Growth_rate/ratrack.sh b/Growth_rate/ratrack.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+RATRACK=/directflow/SCCGGroupShare/projects/DrewNeavin/software/ratrack/
+SNAKEFILE=/directflow/SCCGGroupShare/projects/DrewNeavin/software/ratrack/Snakefile
+
+
+cd $RATRACK
+mkdir logs
+
+conda activate ratrack
+
+# ### First test with demo ###
+# snakemake results/minimal.pdf results/minimal.fit.csv --dryrun
+# snakemake results/kcl22.pdf results/kcl22.fit.csv
+
+
+# snakemake results/TOB0421_ratrack.pdf results/TOB0421_ratrack.fit.csv --dryrun
+# nohup snakemake results/TOB0421_ratrack.pdf results/TOB0421_ratrack.fit.csv --cores 5 > logs/nohup_`date +%Y-%m-%d.%H:%M:%S`_TOB0421.log &
+
+# snakemake results/FSA0006_ratrack.pdf results/FSA0006_ratrack.fit.csv --dryrun
+# nohup snakemake results/FSA0006_ratrack.pdf results/FSA0006_ratrack.fit.csv --cores 5 > logs/nohup_`date +%Y-%m-%d.%H:%M:%S`_FSA0006.log &
+
+# snakemake results/MBE1006_ratrack.pdf results/MBE1006_ratrack.fit.csv --dryrun
+# nohup snakemake results/MBE1006_ratrack.pdf results/MBE1006_ratrack.fit.csv --cores 5 > logs/nohup_`date +%Y-%m-%d.%H:%M:%S`_MBE1006.log &
+
+
+### Try with original counts + dillution info ###
+snakemake results/TOB0421_original_counts.pdf results/TOB0421_original_counts.fit.csv --dryrun
+nohup snakemake results/TOB0421_original_counts.pdf results/TOB0421_original_counts.fit.csv --cores 5 > logs/nohup_`date +%Y-%m-%d.%H:%M:%S`_TOB0421.log &
+
+snakemake results/FSA0006_original_counts.pdf results/FSA0006_original_counts.fit.csv --dryrun
+nohup snakemake results/FSA0006_original_counts.pdf results/FSA0006_original_counts.fit.csv --cores 5 > logs/nohup_`date +%Y-%m-%d.%H:%M:%S`_FSA0006.log &
+
+snakemake results/MBE1006_original_counts.pdf results/MBE1006_original_counts.fit.csv --dryrun
+nohup snakemake results/MBE1006_original_counts.pdf results/MBE1006_original_counts.fit.csv --cores 5 > logs/nohup_`date +%Y-%m-%d.%H:%M:%S`_MBE1006.log &
+
+
+
+
+
+snakemake results/TOB0421_ratrack.pdf results/TOB0421_ratrack.fit.csv --dryrun --unlock
diff --git a/Nona_multiome/cell_line_proportions.R b/Nona_multiome/cell_line_proportions.R
@@ -0,0 +1,146 @@
+##### Author: Drew Neavin
+##### Date: 2 December, 2021
+##### Reason: Look at the proportions of each line at each time of Nona's multi-ome experiment
+
+
+
+
+##### Load in libraries #####
+library(data.table)
+library(Seurat)
+library(tidyverse)
+library("ggpomological")
+
+
+##### Set up directories #####
+datadir <- "/directflow/SCCGGroupShare/projects/himaro/imputing_snp/demultiplexing/demultiplex_Nona/processed_data_demultiplex2/log_dir/"
+outdir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Nona_multiome/"
+
+dir.create(outdir, recursive = TRUE)
+
+
+##### Get a list of the village pools #####
+villages <- list.files(datadir, pattern = "Village")
+
+
+
+##### Get the singlets from the file #####
+village_id_list <- lapply(villages, function(x){
+    print(x)
+    tmp <- fread(paste0(datadir,x,"/CombinedResults/Final_Assignments_demultiplexing_doublets_new_edit.txt"), sep = "\t")
+    tmp$Pool_ID <- x
+    tmp$Day <- as.numeric(as.character(gsub("Village_Day", "", tmp$Pool_ID)))
+    tmp$V1 <- NULL
+    return(tmp)
+})
+
+village_id <- do.call(rbind, village_id_list)
+
+### Update columnes ###
+village_id$Day <- factor(village_id$Day, levels = c(0,1,2,3,4,5,7,15))
+village_id$Assignment <- gsub("^0_", "", village_id$Assignment) %>%
+                                gsub("D-", "", .) %>%
+                                    gsub("\\.\\d\\.", "", .) %>%
+                                        gsub("N-", "", .) %>%
+                                            gsub("-P36", "", .)  %>%
+                                                gsub("-", "", .)
+
+village_id$DropletType <- ifelse(village_id$DropletType != "singlet", "doublet", village_id$DropletType)
+
+data.table(prop.table(table(village_id[,c("DropletType", "Day")]), margin = 2))
+
+village_summary <- data.table(prop.table(table(village_id[,c("Assignment", "Day")]), margin = 2))
+
+village_summary_singlets <- data.table(prop.table(table(village_id[Assignment != "unassigned" & Assignment != "doublet",c("Assignment", "Day")]), margin = 2))
+
+village_summary_singlets$Assignment <- factor(village_summary_singlets$Assignment, levels = rev(village_summary_singlets[Day == 15]$Assignment[order(village_summary_singlets[Day == 15]$N)]))
+
+
+##### Make proportion plots (area plot) #####
+p_stacked_area <- ggplot(village_summary_singlets, aes(x = as.numeric(as.character(Day)), y = N, fill = factor(Assignment), group = Assignment)) +
+    geom_area(alpha=0.6 , size=0.5, colour="black") +
+    theme_classic() +
+    scale_fill_manual(values = c("#f44336", "#e81f63", "#9c27b0", "#673ab7", "#3f51b5", "#2096f3","#2096f3", "#009688", "#4caf50", "#8bc34a", "#cddc39", "#ffeb3b", "#ffc108", "#ff9801", "#ff5723" ,"#795548", "#9e9e9e", "#607d8b")) +
+    xlab("Days") +
+    ylab("Proportion of Cells")
+ggsave(p_stacked_area, filename = paste0(outdir,"stacked_area.png"), width = 7, height = 4)
+ggsave(p_stacked_area, filename = paste0(outdir,"stacked_area.pdf"), width = 7, height = 4)
+
+
+
+
+##### Make line plot of propotion over time #####
+p_line <- ggplot(village_summary_singlets, aes(x = as.numeric(as.character(Day)), y = N, color = Assignment)) +
+    geom_point() +
+    theme_classic() +
+    geom_line() +
+    scale_color_manual(values = c("#f44336", "#e81f63", "#9c27b0", "#673ab7", "#3f51b5", "#2096f3","#2096f3", "#009688", "#4caf50", "#8bc34a", "#cddc39", "#ffeb3b", "#ffc108", "#ff9801", "#ff5723" ,"#795548", "#9e9e9e", "#607d8b")) +
+    xlab("Days") +
+    ylab("Proportion of Cells")
+
+ggsave(p_line, filename = paste0(outdir,"line_proportions.png"), width = 7, height = 4)
+ggsave(p_line, filename = paste0(outdir,"line_proportions.pdf"), width = 7, height = 4)
+
+
+
+##### Check QC metrics #####
+### Load in Data ###
+tenxdir <- "/directflow/SCCGGroupShare/projects/annsen/ATACseq/"
+
+tenx_list <- lapply(villages, function(x){
+    Read10X(paste0(tenxdir,x, "/outs/filtered_feature_bc_matrix"))
+})
+
+
+seurat_list <- lapply(tenx_list, function(x){
+    CreateSeuratObject(counts = x)
+})
+
+seurat <- merge(seurat_list[[1]], y = seurat_list[2:length(seurat_list)], add.cell.ids = villages)
+
+seurat$Pool <- gsub("_[GTAC]+-1", "", rownames(seurat@meta.data))
+seurat$Day <- as.numeric(as.character(gsub("Village_Day", "", seurat$Pool)))
+
+seurat$mt_percent <- PercentageFeatureSet(seurat, pattern = "^MT-")
+seurat$rb_percent <- PercentageFeatureSet(seurat, pattern = "^RP[SL]")
+
+### Generate some plots by pool to show the QC metrics ###
+### N UMI ###
+p_UMI_vnl <- VlnPlot(seurat, features = "nCount_RNA", group.by = "Day", split.by = "Pool", pt.size = 0) +
+                    scale_fill_manual(values = c("#c03728", "#919c4c", "#f18721", "#f5c049", "#e68c7c", "#828585", "#c3c377", "#4f5157")) +
+                    NoLegend()
+
+ggsave(p_UMI_vnl, filename = paste0(outdir,"umi_vln.png"))
+
+
+### N Genes ###
+p_gene_vnl <- VlnPlot(seurat, features = "nFeature_RNA", group.by = "Day", split.by = "Day", pt.size = 0) +
+                    scale_fill_manual(values = c("#c03728", "#919c4c", "#f18721", "#f5c049", "#e68c7c", "#828585", "#c3c377", "#4f5157")) +
+                    NoLegend()
+
+ggsave(p_gene_vnl, filename = paste0(outdir,"gene_vln.png"))
+
+
+### Mt % ###
+p_mt_vnl <- VlnPlot(seurat, features = "mt_percent", group.by = "Day", split.by = "Day", pt.size = 0) +
+                    scale_fill_manual(values = c("#c03728", "#919c4c", "#f18721", "#f5c049", "#e68c7c", "#828585", "#c3c377", "#4f5157")) +
+                    NoLegend()
+
+ggsave(p_mt_vnl, filename = paste0(outdir,"mt_vln.png"))
+
+### Rb % ###
+p_rb_vnl <- VlnPlot(seurat, features = "rb_percent", group.by = "Day", split.by = "Day", pt.size = 0) +
+                    scale_fill_manual(values = c("#c03728", "#919c4c", "#f18721", "#f5c049", "#e68c7c", "#828585", "#c3c377", "#4f5157")) +
+                    NoLegend()
+
+ggsave(p_rb_vnl, filename = paste0(outdir,"rb_vln.png"))
+
+### Mt % vs N UMIs ###
+plot1 <- FeatureScatter(seurat, feature1 = "nCount_RNA", feature2 = "mt_percent") +
+            scale_color_manual()
+
+
+### N UMI vs N Genes ###
+plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
+
+### Data was already pretty clean since just used intronic reads, probably don't need to filter further for the high quality cells
diff --git a/QCmetric_figs/QC_metric_figs.R b/QCmetric_figs/QC_metric_figs.R
@@ -136,7 +136,20 @@ indiv_prop_df_long <- separate(indiv_prop_df_long, sep = "\\.", col = "Category"
 indiv_prop_df_long$Location_Time <- gsub("Brisbane_", "Brisbane.", indiv_prop_df_long$Location_Time) %>%
                                 gsub("Sydney_", "Sydney.", .) %>%
                                 gsub("Melbourne_", "Melbourne.", .)
+
 indiv_prop_df_long$Location_Time <- factor(indiv_prop_df_long$Location_Time, levels = c("Brisbane.Baseline","Brisbane.Village_Day_4", "Melbourne.Baseline", "Melbourne.Village_Day_4", "Sydney.Baseline", "Sydney.Village_Day_4", "Sydney.Thawed_Village_Day_0", "Sydney.Thawed_Village_Day_7"))
+indiv_prop_df_long$Location_Rep_Time <- factor(paste0(indiv_prop_df_long$Location, "-", gsub("\\D+", "", indiv_prop_df_long$Site_rep), ".", indiv_prop_df_long$Time),levels = c("Site 1-1.Baseline", "Site 1-2.Baseline", "Site 1-3.Baseline",
+																																												"Site 1-1.Village_Day_4", "Site 1-2.Village_Day_4", "Site 1-3.Village_Day_4", 
+																																												"Site 2-1.Baseline", "Site 2-2.Baseline", "Site 2-3.Baseline",
+																																												"Site 2-1.Village_Day_4", "Site 2-2.Village_Day_4", "Site 2-3.Village_Day_4", 
+																																												"Site 3-1.Baseline", "Site 3-2.Baseline", "Site 3-3.Baseline",
+																																												"Site 3-1.Village_Day_4", "Site 3-2.Village_Day_4", "Site 3-3.Village_Day_4",
+																																												"Site 3-1.Thawed_Village_Day_0", "Site 3-2.Thawed_Village_Day_0", "Site 3-3.Thawed_Village_Day_0",
+																																												"Site 3-1.Thawed_Village_Day_7", "Site 3-2.Thawed_Village_Day_7", "Site 3-3.Thawed_Village_Day_7"))
+
+
+
+
 
 ### Make column for location only and update to sites 1, 2 and 3
 indiv_prop_df_long <- separate(indiv_prop_df_long, col = "Location_Time", sep = "\\.", into = c("Location", "Time"), remove = FALSE)
@@ -154,8 +167,18 @@ save_figs(cell_line_loc_time,  paste0(outdir,"cell_line_stacked_bar_location_tim
 
 
 
+cell_line_loc_time <- ggbarplot(indiv_prop_df_long, "Location_Rep_Time", "Proportion", fill = "CellLine") +
+    rotate_x_text(45) +
+    scale_fill_manual(values = cell_line_colors)
+
+save_figs(cell_line_loc_time,  paste0(outdir,"cell_line_stacked_bar_location_rep_time"))
+
+
 ## Proportion of cell lines for just first two timepoints (baseline and village day 4)
 indiv_prop_df_long$Time <- gsub("Village_Day_4", "Village", indiv_prop_df_long$Time)
+indiv_prop_df_long$Time_rep <- paste0(indiv_prop_df_long$Time, "-", gsub("\\D+", "", indiv_prop_df_long$Site_rep))
+
+
 cell_line_loc_time_first2 <- ggbarplot(indiv_prop_df_long[which(indiv_prop_df_long$Time == "Baseline" | indiv_prop_df_long$Time == "Village"),], "Time", "Proportion", add = c("mean_se"), fill = "CellLine", facet.by = "Location", size = 0.25) +
     scale_fill_manual(values = cell_line_colors) +
 	rotate_x_text(45) +
@@ -164,6 +187,15 @@ cell_line_loc_time_first2 <- ggbarplot(indiv_prop_df_long[which(indiv_prop_df_lo
 save_figs(cell_line_loc_time_first2,  paste0(outdir,"cell_line_stacked_bar_location_time_baseline_village_4days"), width = 6, height = 8)
 
 
+cell_line_loc_time_rep <- ggbarplot(indiv_prop_df_long[which(indiv_prop_df_long$Time == "Baseline" | indiv_prop_df_long$Time == "Village"),], "Time_rep", "Proportion", fill = "CellLine", facet.by = "Location", size = 0.25) +
+    scale_fill_manual(values = cell_line_colors) +
+	rotate_x_text(45) +
+	theme(axis.title.x=element_blank())
+
+save_figs(cell_line_loc_time_rep,  paste0(outdir,"cell_line_stacked_bar_location_time_rep_baseline_village_4days"), width = 10, height = 8)
+
+
+
 ### Compare the FSA0006 level at baseline vs village ###
 indiv_prop_df_long <- data.table(indiv_prop_df_long)
 
@@ -182,6 +214,17 @@ for (site in unique(indiv_prop_df_long$Location)){
 fwrite(t_test_df, paste0(outdir,"hPSC_line_proportions.tsv"), sep = "\t")
 
 
+Mod <- glm(data = DF, cbind(irregular,regular) ~ Treat, family = "binomial") 
+summary(Mod)      #This prints the results, p.values and statistics. 
+exp(confint(Mod)) #This gives you the CIs for the different terms in the model
+
+
+
+
+
+
+
+
 ## Proportion of cell lines for Sydney
 indiv_prop_df_long_syd <- indiv_prop_df_long[which(indiv_prop_df_long$Location == "Site 3"),]
 indiv_prop_df_long_syd$Village <- ifelse((indiv_prop_df_long_syd$Time == "Thawed_Village_Day_0" | indiv_prop_df_long_syd$Time == "Baseline"), "Baseline", "Village")

diff --git a/Variance/nb_variance_w_interactions.R b/Variance/nb_variance_w_interactions.R
@@ -0,0 +1,66 @@
+##### Reason: want to test nb interaction model before running on large number of genes
+
+library(haven)
+library(ggplot2)
+library(lme4)
+library(glmmTMB)
+library(Seurat)
+library(tidyverse)
+library(lmtest)
+
+
+
+dir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/"
+outdir <- paste0(dir,"output/test_interaction_nb_model")
+dir.create(outdir, recursive = TRUE)
+
+
+##### Read in seurat with genes #####
+seurat <- readRDS(paste0(dir,"output/Distribution_tests/seurat_integrated_all_times_clustered_1pct_expressing.rds"))
+
+seurat@meta.data$Location <- gsub("_Baseline", "", seurat@meta.data$Location) %>% gsub("_Village.+", "", .) %>% gsub("Thawed", "Cryopreserved",.)
+seurat@meta.data$Time <- gsub("Thawed Village Day 0", "Baseline", seurat@meta.data$Time) %>% gsub("Thawed Village Day 7", "Village", .)
+
+
+
+
+### Make dataframes for modeling ###
+gene <- "ENSG00000106153"
+
+df <- data.frame("Expression" = data.frame(seurat[["SCT"]]@counts[gene,]), "Line" = seurat@meta.data$Final_Assignment, "Village" = ifelse(seurat@meta.data$Time == "Baseline", 0, 1), "Replicate" = gsub("[A-Z][a-z]+", "", seurat@meta.data$MULTI_ID), "Location" = seurat@meta.data$Location) ### The results are not dependent on the order or the covariates included so don't need to include extra covariates
+colnames(df)[1] <- "Expression"
+
+
+
+
+
+### Calculate R2 for each separately + remaining (1 - model with all in ) & use proporiton of these in figure
+mode <- list()
+model[["Base"]] <- glmmTMB(Expression ~ 1, data = df, family = nbinom2)
+model[["Line"]] <- glmmTMB(Expression ~ 1 + Line , data = df, family = nbinom2)
+model[["Replicate"]] <- glmmTMB(Expression ~ 1 + Replicate , data = df, family = nbinom2)
+model[["Location"]] <- glmmTMB(Expression ~ 1 + Location , data = df, family = nbinom2)
+model[["Village"]] <- glmmTMB(Expression ~ 1 + Village, data = df, family = nbinom2)
+model[["Line_Replicate_Location_Village"]] <- glmmTMB(Expression ~ 1 + Village, data = df, family = nbinom2)
+model[["Line_Replicate_Location_Village"]]
+
+
+
+### Calculate model together ###
+pseudoR2[[1-logLik(model_line)/logLik(base_model)
+
+
+1-logLik(model_rep)/logLik(model)
+
+
+1-logLik(model_site)/logLik(model)
+
+
+
+1-logLik(model_line_village)/logLik(model)
+1-logLik(model_village_line)/logLik(model)
+
+
+### fit model with interaction - lrt
+anova(model_village_line_rep_site2, model_village_line_rep_site_interaction2)
+pchisq(137.96, df=1, lower.tail=FALSE)/2