Skip to content

Commit

Permalink
update records
Browse files Browse the repository at this point in the history
  • Loading branch information
drneavin committed Mar 8, 2022
1 parent 456dc90 commit d19889f
Show file tree
Hide file tree
Showing 7 changed files with 780 additions and 0 deletions.
55 changes: 55 additions & 0 deletions Growth_rate/growth_rate.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
library(data.table)
library(tidyverse)
library(gtools)
library(growthcurver)
library("ggpubr")


dir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/data/"
data <- fread(paste0(dir, "Growth_rate/growth_rate_measurements.csv"))
outdir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/growth_rate/"


dir.create(outdir, recursive = TRUE)


models <- lapply(unique(data$CellLine), function(line){
SummarizeGrowth(data[CellLine == line]$Time, data[CellLine == line]$Count)
})
names(models) <- unique(data$CellLine)

predict <- lapply(names(models), function(line){
tmp <- data.table(Time = data[CellLine == line]$Time, pred.wt = predict(models[[line]]$model))
tmp$CellLine <- line
return(tmp)
})

predict_df <- do.call(rbind, predict)




### Make Plot ###
pBasic <- ggplot(data, aes(Time, Count, color = CellLine)) +
geom_point() +
theme_classic() +
geom_line(data=predict_df, aes(y=pred.wt))

ggsave(pBasic, filename = paste0(outdir, "/Basic_growth_rates_plot.png"))




##### Correlate growth rate with confluency #####
pCorr <- ggplot(data, aes(Count,Confluency, color = CellLine)) +
geom_point() +
theme_classic()

ggsave(pCorr, filename = paste0(outdir, "Count_confluency_corr.png"))


pCorr <- ggscatter(data, x = "Count", y = "Confluency", color = "CellLine",
add = "reg.line", conf.int = TRUE,
cor.method = "pearson") +
stat_cor(aes(color = CellLine), label.x = 90)
ggsave(pCorr, filename = paste0(outdir, "Count_confluency_corr.png"))
41 changes: 41 additions & 0 deletions Growth_rate/ratrack.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/bin/bash

RATRACK=/directflow/SCCGGroupShare/projects/DrewNeavin/software/ratrack/
SNAKEFILE=/directflow/SCCGGroupShare/projects/DrewNeavin/software/ratrack/Snakefile


cd $RATRACK
mkdir logs

conda activate ratrack

# ### First test with demo ###
# snakemake results/minimal.pdf results/minimal.fit.csv --dryrun
# snakemake results/kcl22.pdf results/kcl22.fit.csv


# snakemake results/TOB0421_ratrack.pdf results/TOB0421_ratrack.fit.csv --dryrun
# nohup snakemake results/TOB0421_ratrack.pdf results/TOB0421_ratrack.fit.csv --cores 5 > logs/nohup_`date +%Y-%m-%d.%H:%M:%S`_TOB0421.log &

# snakemake results/FSA0006_ratrack.pdf results/FSA0006_ratrack.fit.csv --dryrun
# nohup snakemake results/FSA0006_ratrack.pdf results/FSA0006_ratrack.fit.csv --cores 5 > logs/nohup_`date +%Y-%m-%d.%H:%M:%S`_FSA0006.log &

# snakemake results/MBE1006_ratrack.pdf results/MBE1006_ratrack.fit.csv --dryrun
# nohup snakemake results/MBE1006_ratrack.pdf results/MBE1006_ratrack.fit.csv --cores 5 > logs/nohup_`date +%Y-%m-%d.%H:%M:%S`_MBE1006.log &


### Try with original counts + dillution info ###
snakemake results/TOB0421_original_counts.pdf results/TOB0421_original_counts.fit.csv --dryrun
nohup snakemake results/TOB0421_original_counts.pdf results/TOB0421_original_counts.fit.csv --cores 5 > logs/nohup_`date +%Y-%m-%d.%H:%M:%S`_TOB0421.log &

snakemake results/FSA0006_original_counts.pdf results/FSA0006_original_counts.fit.csv --dryrun
nohup snakemake results/FSA0006_original_counts.pdf results/FSA0006_original_counts.fit.csv --cores 5 > logs/nohup_`date +%Y-%m-%d.%H:%M:%S`_FSA0006.log &

snakemake results/MBE1006_original_counts.pdf results/MBE1006_original_counts.fit.csv --dryrun
nohup snakemake results/MBE1006_original_counts.pdf results/MBE1006_original_counts.fit.csv --cores 5 > logs/nohup_`date +%Y-%m-%d.%H:%M:%S`_MBE1006.log &





snakemake results/TOB0421_ratrack.pdf results/TOB0421_ratrack.fit.csv --dryrun --unlock
146 changes: 146 additions & 0 deletions Nona_multiome/cell_line_proportions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
##### Author: Drew Neavin
##### Date: 2 December, 2021
##### Reason: Look at the proportions of each line at each time of Nona's multi-ome experiment




##### Load in libraries #####
library(data.table)
library(Seurat)
library(tidyverse)
library("ggpomological")


##### Set up directories #####
datadir <- "/directflow/SCCGGroupShare/projects/himaro/imputing_snp/demultiplexing/demultiplex_Nona/processed_data_demultiplex2/log_dir/"
outdir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/output/Nona_multiome/"

dir.create(outdir, recursive = TRUE)


##### Get a list of the village pools #####
villages <- list.files(datadir, pattern = "Village")



##### Get the singlets from the file #####
village_id_list <- lapply(villages, function(x){
print(x)
tmp <- fread(paste0(datadir,x,"/CombinedResults/Final_Assignments_demultiplexing_doublets_new_edit.txt"), sep = "\t")
tmp$Pool_ID <- x
tmp$Day <- as.numeric(as.character(gsub("Village_Day", "", tmp$Pool_ID)))
tmp$V1 <- NULL
return(tmp)
})

village_id <- do.call(rbind, village_id_list)

### Update columnes ###
village_id$Day <- factor(village_id$Day, levels = c(0,1,2,3,4,5,7,15))
village_id$Assignment <- gsub("^0_", "", village_id$Assignment) %>%
gsub("D-", "", .) %>%
gsub("\\.\\d\\.", "", .) %>%
gsub("N-", "", .) %>%
gsub("-P36", "", .) %>%
gsub("-", "", .)

village_id$DropletType <- ifelse(village_id$DropletType != "singlet", "doublet", village_id$DropletType)

data.table(prop.table(table(village_id[,c("DropletType", "Day")]), margin = 2))

village_summary <- data.table(prop.table(table(village_id[,c("Assignment", "Day")]), margin = 2))

village_summary_singlets <- data.table(prop.table(table(village_id[Assignment != "unassigned" & Assignment != "doublet",c("Assignment", "Day")]), margin = 2))

village_summary_singlets$Assignment <- factor(village_summary_singlets$Assignment, levels = rev(village_summary_singlets[Day == 15]$Assignment[order(village_summary_singlets[Day == 15]$N)]))


##### Make proportion plots (area plot) #####
p_stacked_area <- ggplot(village_summary_singlets, aes(x = as.numeric(as.character(Day)), y = N, fill = factor(Assignment), group = Assignment)) +
geom_area(alpha=0.6 , size=0.5, colour="black") +
theme_classic() +
scale_fill_manual(values = c("#f44336", "#e81f63", "#9c27b0", "#673ab7", "#3f51b5", "#2096f3","#2096f3", "#009688", "#4caf50", "#8bc34a", "#cddc39", "#ffeb3b", "#ffc108", "#ff9801", "#ff5723" ,"#795548", "#9e9e9e", "#607d8b")) +
xlab("Days") +
ylab("Proportion of Cells")
ggsave(p_stacked_area, filename = paste0(outdir,"stacked_area.png"), width = 7, height = 4)
ggsave(p_stacked_area, filename = paste0(outdir,"stacked_area.pdf"), width = 7, height = 4)




##### Make line plot of propotion over time #####
p_line <- ggplot(village_summary_singlets, aes(x = as.numeric(as.character(Day)), y = N, color = Assignment)) +
geom_point() +
theme_classic() +
geom_line() +
scale_color_manual(values = c("#f44336", "#e81f63", "#9c27b0", "#673ab7", "#3f51b5", "#2096f3","#2096f3", "#009688", "#4caf50", "#8bc34a", "#cddc39", "#ffeb3b", "#ffc108", "#ff9801", "#ff5723" ,"#795548", "#9e9e9e", "#607d8b")) +
xlab("Days") +
ylab("Proportion of Cells")

ggsave(p_line, filename = paste0(outdir,"line_proportions.png"), width = 7, height = 4)
ggsave(p_line, filename = paste0(outdir,"line_proportions.pdf"), width = 7, height = 4)



##### Check QC metrics #####
### Load in Data ###
tenxdir <- "/directflow/SCCGGroupShare/projects/annsen/ATACseq/"

tenx_list <- lapply(villages, function(x){
Read10X(paste0(tenxdir,x, "/outs/filtered_feature_bc_matrix"))
})


seurat_list <- lapply(tenx_list, function(x){
CreateSeuratObject(counts = x)
})

seurat <- merge(seurat_list[[1]], y = seurat_list[2:length(seurat_list)], add.cell.ids = villages)

seurat$Pool <- gsub("_[GTAC]+-1", "", rownames(seurat@meta.data))
seurat$Day <- as.numeric(as.character(gsub("Village_Day", "", seurat$Pool)))

seurat$mt_percent <- PercentageFeatureSet(seurat, pattern = "^MT-")
seurat$rb_percent <- PercentageFeatureSet(seurat, pattern = "^RP[SL]")

### Generate some plots by pool to show the QC metrics ###
### N UMI ###
p_UMI_vnl <- VlnPlot(seurat, features = "nCount_RNA", group.by = "Day", split.by = "Pool", pt.size = 0) +
scale_fill_manual(values = c("#c03728", "#919c4c", "#f18721", "#f5c049", "#e68c7c", "#828585", "#c3c377", "#4f5157")) +
NoLegend()

ggsave(p_UMI_vnl, filename = paste0(outdir,"umi_vln.png"))


### N Genes ###
p_gene_vnl <- VlnPlot(seurat, features = "nFeature_RNA", group.by = "Day", split.by = "Day", pt.size = 0) +
scale_fill_manual(values = c("#c03728", "#919c4c", "#f18721", "#f5c049", "#e68c7c", "#828585", "#c3c377", "#4f5157")) +
NoLegend()

ggsave(p_gene_vnl, filename = paste0(outdir,"gene_vln.png"))


### Mt % ###
p_mt_vnl <- VlnPlot(seurat, features = "mt_percent", group.by = "Day", split.by = "Day", pt.size = 0) +
scale_fill_manual(values = c("#c03728", "#919c4c", "#f18721", "#f5c049", "#e68c7c", "#828585", "#c3c377", "#4f5157")) +
NoLegend()

ggsave(p_mt_vnl, filename = paste0(outdir,"mt_vln.png"))

### Rb % ###
p_rb_vnl <- VlnPlot(seurat, features = "rb_percent", group.by = "Day", split.by = "Day", pt.size = 0) +
scale_fill_manual(values = c("#c03728", "#919c4c", "#f18721", "#f5c049", "#e68c7c", "#828585", "#c3c377", "#4f5157")) +
NoLegend()

ggsave(p_rb_vnl, filename = paste0(outdir,"rb_vln.png"))

### Mt % vs N UMIs ###
plot1 <- FeatureScatter(seurat, feature1 = "nCount_RNA", feature2 = "mt_percent") +
scale_color_manual()


### N UMI vs N Genes ###
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")

### Data was already pretty clean since just used intronic reads, probably don't need to filter further for the high quality cells
43 changes: 43 additions & 0 deletions QCmetric_figs/QC_metric_figs.R
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,20 @@ indiv_prop_df_long <- separate(indiv_prop_df_long, sep = "\\.", col = "Category"
indiv_prop_df_long$Location_Time <- gsub("Brisbane_", "Brisbane.", indiv_prop_df_long$Location_Time) %>%
gsub("Sydney_", "Sydney.", .) %>%
gsub("Melbourne_", "Melbourne.", .)

indiv_prop_df_long$Location_Time <- factor(indiv_prop_df_long$Location_Time, levels = c("Brisbane.Baseline","Brisbane.Village_Day_4", "Melbourne.Baseline", "Melbourne.Village_Day_4", "Sydney.Baseline", "Sydney.Village_Day_4", "Sydney.Thawed_Village_Day_0", "Sydney.Thawed_Village_Day_7"))
indiv_prop_df_long$Location_Rep_Time <- factor(paste0(indiv_prop_df_long$Location, "-", gsub("\\D+", "", indiv_prop_df_long$Site_rep), ".", indiv_prop_df_long$Time),levels = c("Site 1-1.Baseline", "Site 1-2.Baseline", "Site 1-3.Baseline",
"Site 1-1.Village_Day_4", "Site 1-2.Village_Day_4", "Site 1-3.Village_Day_4",
"Site 2-1.Baseline", "Site 2-2.Baseline", "Site 2-3.Baseline",
"Site 2-1.Village_Day_4", "Site 2-2.Village_Day_4", "Site 2-3.Village_Day_4",
"Site 3-1.Baseline", "Site 3-2.Baseline", "Site 3-3.Baseline",
"Site 3-1.Village_Day_4", "Site 3-2.Village_Day_4", "Site 3-3.Village_Day_4",
"Site 3-1.Thawed_Village_Day_0", "Site 3-2.Thawed_Village_Day_0", "Site 3-3.Thawed_Village_Day_0",
"Site 3-1.Thawed_Village_Day_7", "Site 3-2.Thawed_Village_Day_7", "Site 3-3.Thawed_Village_Day_7"))





### Make column for location only and update to sites 1, 2 and 3
indiv_prop_df_long <- separate(indiv_prop_df_long, col = "Location_Time", sep = "\\.", into = c("Location", "Time"), remove = FALSE)
Expand All @@ -154,8 +167,18 @@ save_figs(cell_line_loc_time, paste0(outdir,"cell_line_stacked_bar_location_tim



cell_line_loc_time <- ggbarplot(indiv_prop_df_long, "Location_Rep_Time", "Proportion", fill = "CellLine") +
rotate_x_text(45) +
scale_fill_manual(values = cell_line_colors)

save_figs(cell_line_loc_time, paste0(outdir,"cell_line_stacked_bar_location_rep_time"))


## Proportion of cell lines for just first two timepoints (baseline and village day 4)
indiv_prop_df_long$Time <- gsub("Village_Day_4", "Village", indiv_prop_df_long$Time)
indiv_prop_df_long$Time_rep <- paste0(indiv_prop_df_long$Time, "-", gsub("\\D+", "", indiv_prop_df_long$Site_rep))


cell_line_loc_time_first2 <- ggbarplot(indiv_prop_df_long[which(indiv_prop_df_long$Time == "Baseline" | indiv_prop_df_long$Time == "Village"),], "Time", "Proportion", add = c("mean_se"), fill = "CellLine", facet.by = "Location", size = 0.25) +
scale_fill_manual(values = cell_line_colors) +
rotate_x_text(45) +
Expand All @@ -164,6 +187,15 @@ cell_line_loc_time_first2 <- ggbarplot(indiv_prop_df_long[which(indiv_prop_df_lo
save_figs(cell_line_loc_time_first2, paste0(outdir,"cell_line_stacked_bar_location_time_baseline_village_4days"), width = 6, height = 8)


cell_line_loc_time_rep <- ggbarplot(indiv_prop_df_long[which(indiv_prop_df_long$Time == "Baseline" | indiv_prop_df_long$Time == "Village"),], "Time_rep", "Proportion", fill = "CellLine", facet.by = "Location", size = 0.25) +
scale_fill_manual(values = cell_line_colors) +
rotate_x_text(45) +
theme(axis.title.x=element_blank())

save_figs(cell_line_loc_time_rep, paste0(outdir,"cell_line_stacked_bar_location_time_rep_baseline_village_4days"), width = 10, height = 8)



### Compare the FSA0006 level at baseline vs village ###
indiv_prop_df_long <- data.table(indiv_prop_df_long)

Expand All @@ -182,6 +214,17 @@ for (site in unique(indiv_prop_df_long$Location)){
fwrite(t_test_df, paste0(outdir,"hPSC_line_proportions.tsv"), sep = "\t")


Mod <- glm(data = DF, cbind(irregular,regular) ~ Treat, family = "binomial")
summary(Mod) #This prints the results, p.values and statistics.
exp(confint(Mod)) #This gives you the CIs for the different terms in the model








## Proportion of cell lines for Sydney
indiv_prop_df_long_syd <- indiv_prop_df_long[which(indiv_prop_df_long$Location == "Site 3"),]
indiv_prop_df_long_syd$Village <- ifelse((indiv_prop_df_long_syd$Time == "Thawed_Village_Day_0" | indiv_prop_df_long_syd$Time == "Baseline"), "Baseline", "Village")
Expand Down
66 changes: 66 additions & 0 deletions Variance/nb_variance_w_interactions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
##### Reason: want to test nb interaction model before running on large number of genes

library(haven)
library(ggplot2)
library(lme4)
library(glmmTMB)
library(Seurat)
library(tidyverse)
library(lmtest)



dir <- "/directflow/SCCGGroupShare/projects/DrewNeavin/iPSC_Village/"
outdir <- paste0(dir,"output/test_interaction_nb_model")
dir.create(outdir, recursive = TRUE)


##### Read in seurat with genes #####
seurat <- readRDS(paste0(dir,"output/Distribution_tests/seurat_integrated_all_times_clustered_1pct_expressing.rds"))

seurat@meta.data$Location <- gsub("_Baseline", "", seurat@meta.data$Location) %>% gsub("_Village.+", "", .) %>% gsub("Thawed", "Cryopreserved",.)
seurat@meta.data$Time <- gsub("Thawed Village Day 0", "Baseline", seurat@meta.data$Time) %>% gsub("Thawed Village Day 7", "Village", .)




### Make dataframes for modeling ###
gene <- "ENSG00000106153"

df <- data.frame("Expression" = data.frame(seurat[["SCT"]]@counts[gene,]), "Line" = seurat@meta.data$Final_Assignment, "Village" = ifelse(seurat@meta.data$Time == "Baseline", 0, 1), "Replicate" = gsub("[A-Z][a-z]+", "", seurat@meta.data$MULTI_ID), "Location" = seurat@meta.data$Location) ### The results are not dependent on the order or the covariates included so don't need to include extra covariates
colnames(df)[1] <- "Expression"





### Calculate R2 for each separately + remaining (1 - model with all in ) & use proporiton of these in figure
mode <- list()
model[["Base"]] <- glmmTMB(Expression ~ 1, data = df, family = nbinom2)
model[["Line"]] <- glmmTMB(Expression ~ 1 + Line , data = df, family = nbinom2)
model[["Replicate"]] <- glmmTMB(Expression ~ 1 + Replicate , data = df, family = nbinom2)
model[["Location"]] <- glmmTMB(Expression ~ 1 + Location , data = df, family = nbinom2)
model[["Village"]] <- glmmTMB(Expression ~ 1 + Village, data = df, family = nbinom2)
model[["Line_Replicate_Location_Village"]] <- glmmTMB(Expression ~ 1 + Village, data = df, family = nbinom2)
model[["Line_Replicate_Location_Village"]]



### Calculate model together ###
pseudoR2[[1-logLik(model_line)/logLik(base_model)


1-logLik(model_rep)/logLik(model)


1-logLik(model_site)/logLik(model)



1-logLik(model_line_village)/logLik(model)
1-logLik(model_village_line)/logLik(model)


### fit model with interaction - lrt
anova(model_village_line_rep_site2, model_village_line_rep_site_interaction2)
pchisq(137.96, df=1, lower.tail=FALSE)/2
Loading

0 comments on commit d19889f

Please sign in to comment.