Sam Buckberry 2023-06-23
source("R/project_functions.R")
Load DNA methylation metadata
mdat <- read.csv("wgbs/metadata/wgbs_metadata_local.csv")
Load the DMR object with annotations
all_dmrs <- readRDS("wgbs/processed_data/classified_dmrs_granges.Rds")
Write DMR results to file
all_dmrs_df <- as.data.frame(all_dmrs)
all_dmrs_df$loci <- gr_to_loci(all_dmrs)
write.table(all_dmrs_df, "wgbs/processed_data/all_dmrs_reults.csv", quote = FALSE, sep = ",", row.names = FALSE)
Make CG-DMR mCG/CG matrix for relevant samples
fib_libs <- c("RL415", "RL702")
primed_libs <- c("RL417", "RL418", "RL703", "RL1751_1771")
naive_libs <- c("RL411", "RL412", "RL838")
tnt_libs <- c("RL1124", "RL1125")
ntp_libs <- c("RL936", "RL937", "RL837")
esc_libs <- c("RL2352_merge", "RL2351_merge", "SRS114877",
"SRR1561745_merge", "SRS004213", "SRS606777", "SRS606778")
libs <- c(fib_libs, primed_libs, naive_libs, tnt_libs, ntp_libs, esc_libs)
ind <- match(libs, mdat$Library_id)
dmr_mCG_matrix <- make_mC_matrix(obj_fls = mdat$BSseq_CG[ind],
gr = all_dmrs, cores = 4)
## Making matrix of mC levels for regions...
colnames(dmr_mCG_matrix) <- mdat$Library_id[ind]
Caclculate mean delta for hiPSCs compared to hESCs
## Calculate mean delta
esc_n2p_d13_delta <- rowMeans(dmr_mCG_matrix[ ,tnt_libs], na.rm = TRUE) - rowMeans(dmr_mCG_matrix[ ,esc_libs], na.rm = TRUE)
esc_n2p_p10_delta <- rowMeans(dmr_mCG_matrix[ ,ntp_libs], na.rm = TRUE) - rowMeans(dmr_mCG_matrix[ ,esc_libs], na.rm = TRUE)
esc_naive_delta <- rowMeans(dmr_mCG_matrix[ ,naive_libs], na.rm = TRUE) - rowMeans(dmr_mCG_matrix[ ,esc_libs], na.rm = TRUE)
esc_ips_delta <- rowMeans(dmr_mCG_matrix[ ,primed_libs], na.rm = TRUE) - rowMeans(dmr_mCG_matrix[ ,esc_libs], na.rm = TRUE)
esc_fib_delta <- rowMeans(dmr_mCG_matrix[ ,fib_libs], na.rm = TRUE) - rowMeans(dmr_mCG_matrix[ ,esc_libs], na.rm = TRUE)
all_dmrs$esc_n2p_d13_delta <- esc_n2p_d13_delta
all_dmrs$esc_n2p_p10_delta <- esc_n2p_p10_delta
all_dmrs$esc_ips_delta <- esc_ips_delta
all_dmrs$esc_naive_delta <- esc_naive_delta
#all_dmrs$esc_fib_delta <- esc_fib_delta
all_dmrs$loci <- gr_to_loci(all_dmrs)
Plot DMR delta values
delta_df <- data.frame(TNT=all_dmrs$esc_n2p_d13_delta,
NtP=all_dmrs$esc_n2p_p10_delta,
Primed=all_dmrs$esc_ips_delta,
Naive=all_dmrs$esc_naive_delta,
#Fibroblast=esc_fib_delta,
class=all_dmrs$class)
delta_df_melt <- reshape2::melt(delta_df)
## Using class as id variables
delta_df_melt$variable <- factor(delta_df_melt$variable,
levels=c("Primed", "Naive", "NtP", "TNT"))
delta_df_melt$class <- factor(delta_df_melt$class)
line_mm <- 0.5 / 2.835
gg_delta_correct <- ggplot(delta_df_melt,
aes(x = value, group=variable,
color=variable,
fill=variable)) +
geom_histogram(bins = 30, position="identity") +
geom_vline(xintercept = c(-.2, .2), linetype='dashed', size = line_mm) +
#geom_density() +
scale_color_manual(values=reprog_pal[c("Primed", "Naive", "TNT", "NtP")]) +
scale_fill_manual(values=reprog_pal[c("Primed", "Naive", "TNT", "NtP")]) +
facet_grid(class~variable, scales = "free_y") +
scale_x_continuous(limits = c(-1, 1)) +
xlab("DMR mCG delta \n(iPSC mCG - ESC mCG)") +
ylab("Number of DMRs") +
sams_pub_theme(legend_pos = 'right', x.text.angle = 0, hjust = 0.5)
pdf("wgbs/plots/mcg_dmr_esc_delta_histograms.pdf", width = 5, height = 2)
gg_delta_correct
dev.off()
## quartz_off_screen
## 2
gg_delta_correct_combined <- ggplot(delta_df_melt,
aes(x = value, group=variable,
color=variable,
fill=variable)) +
geom_histogram(bins = 30, position="identity") +
geom_vline(xintercept = c(-.2, .2), linetype='dashed', size = line_mm) +
#geom_density() +
scale_color_manual(values=reprog_pal[c("Primed", "Naive", "TNT", "NtP")]) +
scale_fill_manual(values=reprog_pal[c("Primed", "Naive", "TNT", "NtP")]) +
facet_grid(variable~.) +
scale_x_continuous(limits = c(-1, 1)) +
xlab("DMR mCG delta \n(iPSC mCG - ESC mCG)") +
ylab("Number of DMRs") +
sams_pub_theme(legend_pos = 'right', x.text.angle = 0, hjust = 0.5)
pdf("wgbs/plots/mcg_dmr_esc_delta_histograms_combined_vertical.pdf", width = 2.5, height = 3.25)
gg_delta_correct_combined
dev.off()
## quartz_off_screen
## 2
wb_fig3 <- openxlsx::createWorkbook()
openxlsx::addWorksheet(wb_fig3, sheetName = "Fig_3c")
openxlsx::writeData(wb = wb_fig3, sheet = "Fig_3c",
x = gg_delta_correct_combined$data)
Assign DMRs as corrected or uncorrected and write bed files
## Seperate if corrected or uncorrected
all_dmrs$tnt_corrected <- abs(all_dmrs$esc_n2p_d13_delta) < 0.2
all_dmrs$ntp_corrected <- abs(all_dmrs$esc_n2p_p10_delta) < 0.2
all_dmrs$both_corrected <- (all_dmrs$tnt_corrected == TRUE) &
(all_dmrs$ntp_corrected == TRUE)
all_dmrs$any_corrected <- (all_dmrs$tnt_corrected == TRUE) | (all_dmrs$ntp_corrected == TRUE)
all_dmrs$correction_class <- str_c(all_dmrs$class, "_", all_dmrs$both_corrected)
table(all_dmrs$correction_class)
##
## Aberrant_hypermethylation_FALSE Aberrant_hypermethylation_TRUE
## 94 105
## Aberrant_hypomethylation_FALSE Aberrant_hypomethylation_TRUE
## 64 76
## Memory_hypermethylation_FALSE Memory_hypermethylation_TRUE
## 86 67
## Memory_hypomethylation_FALSE Memory_hypomethylation_TRUE
## 447 1202
## Partial_hypermethyation_memory_FALSE Partial_hypermethyation_memory_TRUE
## 4 11
## Partial_hypomethylation_memory_FALSE Partial_hypomethylation_memory_TRUE
## 171 400
all_dmrs
## GRanges object with 2727 ranges and 35 metadata columns:
## seqnames ranges strand | L area beta
## <Rle> <IRanges> <Rle> | <integer> <numeric> <numeric>
## [1] chr12 130642603-130650204 * | 338 168.5753 1.248068
## [2] chr10 133108848-133111109 * | 119 80.4240 1.776016
## [3] chr4 8856630-8879787 * | 928 323.4014 0.926906
## [4] chr16 53406798-53407808 * | 57 46.7636 -2.083580
## [5] chr12 130386982-130389235 * | 152 93.3987 1.732472
## ... ... ... ... . ... ... ...
## [2723] chr20 60237468-60237766 * | 15 2.34649 -0.566588
## [2724] chr18 50343719-50345032 * | 9 2.22555 -0.806841
## [2725] chr4 34350170-34350596 * | 5 1.56146 -1.035597
## [2726] chr16 6709279-6710023 * | 12 3.17818 -0.727783
## [2727] chr8 140148147-140149073 * | 11 1.58127 -0.492863
## stat pval qval index g1_mean g2_mean
## <numeric> <numeric> <numeric> <IRanges> <numeric> <numeric>
## [1] 55.6732 3.51537e-06 0.0205001 16910132-16910469 0.057 0.535
## [2] 35.1931 3.51537e-06 0.0205001 14538777-14538895 0.032 0.729
## [3] 33.8936 3.51537e-06 0.0205001 5734171-5735098 0.133 0.504
## [4] -33.7511 3.51537e-06 0.0205001 19792147-19792203 0.882 0.088
## [5] 31.0682 3.51537e-06 0.0205001 16906067-16906218 0.012 0.631
## ... ... ... ... ... ... ...
## [2723] -6.11683 0.0498198 0.481918 23468510-23468524 0.883 0.664
## [2724] -6.11666 0.0498374 0.481974 21669383-21669391 0.860 0.583
## [2725] -6.11634 0.0498585 0.482064 5910155-5910159 0.877 0.444
## [2726] -6.11504 0.0499358 0.482546 19420156-19420167 0.909 0.655
## [2727] -6.11414 0.0499991 0.482737 12193960-12193970 0.828 0.623
## delta Smith_H9_KSR Lister_H9_E8 Lister_MEL1_E8 HESO7
## <numeric> <numeric> <numeric> <numeric> <numeric>
## [1] -0.478 0.03588355 0.0713293 0.0490713 0.04832948
## [2] -0.696 0.02356021 0.0210970 0.0153551 0.03076923
## [3] -0.371 0.09316851 0.1891220 0.0883625 0.12799310
## [4] 0.794 0.95044248 0.8170530 0.9393939 0.69470405
## [5] -0.618 0.00928793 0.0148342 0.0118577 0.00788436
## ... ... ... ... ... ...
## [2723] 0.219 0.870968 0.902174 0.945055 0.798611
## [2724] 0.277 0.949153 0.879257 0.935115 0.746914
## [2725] 0.433 0.973333 0.870445 0.889503 0.902439
## [2726] 0.254 0.857143 0.873684 0.912162 0.952941
## [2727] 0.206 0.858896 0.775735 0.596970 0.903382
## HESO8 Ecker_H9 Ecker_H1 RL1751_1771 RL417 RL418
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## [1] 0.0451249 0.04066667 0.1072961 0.491974 0.592696 0.5650394
## [2] 0.0249855 0.03183024 0.0794913 0.932629 0.657875 0.4189308
## [3] 0.1256141 0.14296771 0.1668120 0.893547 0.292004 0.4642339
## [4] 0.9276074 0.94273128 0.9012195 0.188377 0.052439 0.0600775
## [5] 0.0087500 0.00522648 0.0265487 0.965190 0.562500 0.4903943
## ... ... ... ... ... ... ...
## [2723] 0.810056 0.940678 0.911765 0.695238 0.658683 0.680672
## [2724] 0.715152 0.865854 0.928571 0.644578 0.562189 0.561905
## [2725] 0.893805 0.900000 0.707792 0.450980 0.389706 0.433962
## [2726] 0.940945 0.909639 0.915730 0.759615 0.670588 0.502591
## [2727] 0.901554 0.891026 0.870690 0.698925 0.696970 0.398907
## RL703 blacklist significant class
## <numeric> <logical> <logical> <factor>
## [1] 0.4905745 FALSE TRUE Memory_hypermethylation
## [2] 0.9053763 FALSE TRUE Aberrant_hypermethylation
## [3] 0.3672399 FALSE TRUE Aberrant_hypermethylation
## [4] 0.0512048 FALSE TRUE Memory_hypomethylation
## [5] 0.5040650 FALSE TRUE Aberrant_hypermethylation
## ... ... ... ... ...
## [2723] 0.622222 FALSE TRUE Memory_hypomethylation
## [2724] 0.564655 FALSE TRUE Memory_hypomethylation
## [2725] 0.500000 FALSE TRUE Memory_hypomethylation
## [2726] 0.688525 FALSE TRUE Memory_hypomethylation
## [2727] 0.695652 FALSE TRUE Partial_hypomethylation_memory
## direction esc_n2p_d13_delta esc_n2p_p10_delta esc_ips_delta
## <factor> <numeric> <numeric> <numeric>
## [1] Hyper 0.01873569 0.179476 0.478256
## [2] Hyper 0.05501255 0.261937 0.696261
## [3] Hyper 0.00894135 0.219105 0.370822
## [4] Hypo -0.84033627 -0.587228 -0.793854
## [5] Hyper 0.02438335 0.179902 0.618482
## ... ... ... ... ...
## [2723] Hypo -0.0503771 -0.0330782 -0.218554
## [2724] Hypo -0.1762265 -0.1455069 -0.276670
## [2725] Hypo -0.1135322 -0.0634828 -0.433098
## [2726] Hypo -0.0947447 -0.0612849 -0.253562
## [2727] Hypo -0.0131534 -0.0290512 -0.205708
## esc_naive_delta loci tnt_corrected ntp_corrected
## <numeric> <character> <logical> <logical>
## [1] 0.0886185 chr12:130642603-1306.. TRUE TRUE
## [2] 0.0217845 chr10:133108848-1331.. TRUE FALSE
## [3] 0.1996891 chr4:8856630-8879787 TRUE FALSE
## [4] -0.7967694 chr16:53406798-53407.. FALSE FALSE
## [5] 0.1176200 chr12:130386982-1303.. TRUE TRUE
## ... ... ... ... ...
## [2723] -0.285546 chr20:60237468-60237.. TRUE TRUE
## [2724] -0.615811 chr18:50343719-50345.. TRUE TRUE
## [2725] -0.735022 chr4:34350170-34350596 TRUE TRUE
## [2726] -0.463575 chr16:6709279-6710023 TRUE TRUE
## [2727] -0.561072 chr8:140148147-14014.. TRUE TRUE
## both_corrected any_corrected correction_class
## <logical> <logical> <character>
## [1] TRUE TRUE Memory_hypermethylat..
## [2] FALSE TRUE Aberrant_hypermethyl..
## [3] FALSE TRUE Aberrant_hypermethyl..
## [4] FALSE FALSE Memory_hypomethylati..
## [5] TRUE TRUE Aberrant_hypermethyl..
## ... ... ... ...
## [2723] TRUE TRUE Memory_hypomethylati..
## [2724] TRUE TRUE Memory_hypomethylati..
## [2725] TRUE TRUE Memory_hypomethylati..
## [2726] TRUE TRUE Memory_hypomethylati..
## [2727] TRUE TRUE Partial_hypomethylat..
## -------
## seqinfo: 22 sequences from an unspecified genome; no seqlengths
corrected_dmr_gr <- all_dmrs[all_dmrs$any_corrected == TRUE]
corrected_dmr_df <- data.frame(chr=seqnames(corrected_dmr_gr),
start=start(corrected_dmr_gr),
end=end(corrected_dmr_gr),
name=1:length(corrected_dmr_gr),
score=corrected_dmr_gr$esc_ips_delta, strand="+")
write.table(x = corrected_dmr_df, file = "wgbs/processed_data/corrected_dmrs.bed",
quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
uncorrected_dmr_gr <- all_dmrs[all_dmrs$any_corrected == FALSE]
uncorrected_dmr_df <- data.frame(chr=seqnames(uncorrected_dmr_gr),
start=start(uncorrected_dmr_gr),
end=end(uncorrected_dmr_gr),
name=1:length(uncorrected_dmr_gr),
score=uncorrected_dmr_gr$esc_ips_delta, strand="+")
write.table(x = uncorrected_dmr_df, file = "wgbs/processed_data/uncorrected_dmrs.bed",
quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
saveRDS(object = all_dmrs, file = "wgbs/processed_data/all_dmrs_annotated_granges.Rds")
Plot corrected fraction of DMRs
# barplot correction fraction
correction_tab <- mcols(all_dmrs)[ ,c("tnt_corrected", "ntp_corrected", "class")] %>%
data.frame() %>% reshape2::melt(id.vars=c('class')) %>%
dplyr::group_by(class, variable, value) %>% dplyr::tally()
correction_tab$variable <- ifelse(test = correction_tab$variable == "tnt_corrected",
yes = "TNT-hiPSCs", no = "NtP-hiPSCs")
correction_tab$group <- ifelse(test = grepl(pattern = "emory",
x = correction_tab$class),
yes = "Memory", no = "Abberant")
bar_cols <- as.character(c(reprog_pal[7], vitC[2]))
dmr_element_corrected_gg <- ggplot(correction_tab,
aes(x = variable, y=n, fill=value)) +
geom_bar(position = "fill", stat="identity", alpha=0.9) +
facet_grid(.~group+class, scales = "free") +
scale_fill_manual(values = bar_cols) +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
axis.text.y = element_text(size=10)) +
ylab("Proportion of DMRs") +
scale_y_continuous(expand = c(0,0)) +
sams_pub_theme(legend_pos = "right")
dmr_element_corrected_gg
pdf("wgbs/plots/dmr_class_corrected_barplots.pdf",
width = 4, height =2)
dmr_element_corrected_gg
dev.off()
## quartz_off_screen
## 2
wb_ed_fig3 <- openxlsx::createWorkbook()
openxlsx::addWorksheet(wb_ed_fig3, sheetName = "Fig_3f")
openxlsx::writeData(wb = wb_ed_fig3, sheet = "Fig_3f",
x = dmr_element_corrected_gg$data)
correction_tab2 <- mcols(all_dmrs)[ ,c("tnt_corrected", "ntp_corrected", "loci")] %>%
data.frame() %>% reshape2::melt(id.vars='loci') %>%
dplyr::group_by(variable, value) %>% dplyr::tally()
correction_tab2$variable <- ifelse(test = correction_tab2$variable == "tnt_corrected",
yes = "TNT-hiPSC", no = "NtP-hiPSC")
dmr_element_corrected2_gg <- ggplot(correction_tab2,
aes(x = variable, y=n, fill=value)) +
geom_bar(position = "fill", stat="identity", alpha=0.9) +
scale_fill_manual(values = bar_cols) +
# theme(axis.text.x = element_text(angle = 0, hjust = 0.5, size = 10),
# axis.text.y = element_text(size=10)) +
ylab("Proportion of DMRs") +
coord_flip() +
scale_y_continuous(expand = c(0,0)) +
sams_pub_theme(legend_pos = "right", x.text.angle = 0, hjust = 0.5)
pdf("wgbs/plots/dmr_corrected_barplots.pdf", width = 2.9, height = 1)
dmr_element_corrected2_gg
dev.off()
## quartz_off_screen
## 2
sum(all_dmrs$tnt_corrected)
## [1] 1946
sum(all_dmrs$ntp_corrected)
## [1] 2122
dmr_element_corrected2_gg
openxlsx::addWorksheet(wb_fig3, sheetName = "Fig_3b")
openxlsx::writeData(wb = wb_fig3, sheet = "Fig_3b",
x = dmr_element_corrected2_gg$data)
Venn diagram of corrected DMR overlaps
venn_df <- data.frame(tnt=sum(all_dmrs$tnt_corrected),
ntp=sum(all_dmrs$ntp_corrected),
olap=sum(all_dmrs$both_corrected))
pdf(file = "wgbs/plots/DMR_corrected_venn.pdf", width = 2, height = 2)
VennDiagram::draw.pairwise.venn(area1 = venn_df$tnt,
area2 = venn_df$ntp,
cross.area = venn_df$olap, euler.d = TRUE)
## (polygon[GRID.polygon.1428], polygon[GRID.polygon.1429], polygon[GRID.polygon.1430], polygon[GRID.polygon.1431], text[GRID.text.1432], text[GRID.text.1433], lines[GRID.lines.1434], text[GRID.text.1435], text[GRID.text.1436], text[GRID.text.1437])
dev.off()
## quartz_off_screen
## 2
VennDiagram::draw.pairwise.venn(area1 = venn_df$tnt,
area2 = venn_df$ntp,
cross.area = venn_df$olap, euler.d = TRUE)
## (polygon[GRID.polygon.1438], polygon[GRID.polygon.1439], polygon[GRID.polygon.1440], polygon[GRID.polygon.1441], text[GRID.text.1442], text[GRID.text.1443], lines[GRID.lines.1444], text[GRID.text.1445], text[GRID.text.1446], text[GRID.text.1447])
openxlsx::addWorksheet(wb_ed_fig3, sheetName = "Fig_3e")
openxlsx::writeData(wb = wb_ed_fig3, sheet = "Fig_3e",
x = venn_df)
Plot DMR correction heatmap
## Rank matrix by variance
dmr_dat <- dmr_mCG_matrix
var_dat <- dmr_mCG_matrix[ ,c(esc_libs, primed_libs)] %>% as.matrix() %>% rowVars()
dmr_dat <- dmr_dat[order(var_dat, decreasing = FALSE), ]
# Split heatmap by corrected and uncorrected
corrected_ids <- all_dmrs$loci[all_dmrs$any_corrected == TRUE]
uncorrected_ids <- all_dmrs$loci[all_dmrs$any_corrected == FALSE]
dmr_dat_split <- rbind(dmr_dat[rownames(dmr_dat) %in% corrected_ids, ],
dmr_dat[rownames(dmr_dat) %in% uncorrected_ids, ])
heatmap_ind <- match(colnames(dmr_dat_split), mdat$Library_id)
colnames(dmr_dat_split) <- mdat$Manuscript.Name[heatmap_ind]
png(filename = "wgbs/plots/dmr_var_sorted_correction_split_heatmap.png",
height = 2.5, width = 3, units = "in", res = 300)
pheatmap(dmr_dat_split, cluster_rows = FALSE, cluster_cols = FALSE, fontsize = 6,
show_rownames = FALSE, gaps_col = c(2,6,9,11,14),
gaps_row = length(corrected_ids), border_color = NA)
dev.off()
## quartz_off_screen
## 3
pdf(file = "wgbs/plots/dmr_var_sorted_correction_split_heatmap.pdf",
height = 2.5, width = 3)
pheatmap(dmr_dat_split, cluster_rows = FALSE, cluster_cols = FALSE, fontsize = 6,
show_rownames = FALSE, gaps_col = c(2,6,9,11,14),
gaps_row = length(corrected_ids), border_color = NA)
dev.off()
## quartz_off_screen
## 3
gr_to_bed(gr = GRanges(seqnames = rownames(dmr_dat)),
out_path = str_c("wgbs/processed_data/all_DMR_variance_sort.bed"))
pheatmap(dmr_dat_split, cluster_rows = FALSE, cluster_cols = FALSE, fontsize = 6,
show_rownames = FALSE, gaps_col = c(2,6,9,11,14),
gaps_row = length(corrected_ids), border_color = NA)
openxlsx::addWorksheet(wb_fig3, sheetName = "Fig_3d")
openxlsx::writeData(wb = wb_fig3, sheet = "Fig_3d",
x = dmr_dat_split)
Profile plot of all samples against hESC mean
stopifnot(all(rownames(dmr_mCG_matrix) == all_dmrs$loci))
dmr_sample_delta_mat <- dmr_mCG_matrix - (rowMeans(dmr_mCG_matrix[ ,esc_libs]))
dmr_sample_delta_mat <- dmr_sample_delta_mat[ ,!colnames(dmr_sample_delta_mat) %in% fib_libs]
dmr_sample_delta_mat <- data.frame(dmr_sample_delta_mat)
dmr_sample_delta_mat$class <- as.character(all_dmrs$class)
dmr_sample_delta_mat <- reshape2::melt(dmr_sample_delta_mat)
## Using class as id variables
dmr_sample_delta_mat <- dmr_sample_delta_mat[!is.na(dmr_sample_delta_mat$value), ]
head(dmr_sample_delta_mat)
## class variable value
## 1 Memory_hypermethylation RL417 0.5358814
## 2 Aberrant_hypermethylation RL417 0.6254337
## 3 Aberrant_hypermethylation RL417 0.1585693
## 4 Memory_hypomethylation RL417 -0.8294398
## 5 Aberrant_hypermethylation RL417 0.5504444
## 6 Aberrant_hypermethylation RL417 0.2207794
table(dmr_sample_delta_mat$variable)
##
## RL417 RL418 RL703 RL1751_1771
## 2727 2727 2727 2727
## RL411 RL412 RL838 RL1124
## 2727 2727 2708 2727
## RL1125 RL936 RL937 RL837
## 2726 2727 2726 2727
## RL2352_merge RL2351_merge SRS114877 SRR1561745_merge
## 2727 2727 2727 2727
## SRS004213 SRS606777 SRS606778
## 2727 2727 2727
dmr_sample_delta_mat$sample_group <- NA
sample_order <- c(esc_libs, primed_libs, naive_libs, tnt_libs, ntp_libs) %>% rev()
dmr_sample_delta_mat$variable <- factor(dmr_sample_delta_mat$variable,
levels=sample_order)
dmr_sample_delta_mat$sample_group[dmr_sample_delta_mat$variable %in% primed_libs] <- "Primed-hiPSC"
dmr_sample_delta_mat$sample_group[dmr_sample_delta_mat$variable %in% tnt_libs] <- "TNT-hiPSC"
dmr_sample_delta_mat$sample_group[dmr_sample_delta_mat$variable %in% ntp_libs] <- "NtP-hiPSC"
dmr_sample_delta_mat$sample_group[dmr_sample_delta_mat$variable %in% esc_libs] <- "hESC"
dmr_sample_delta_mat$sample_group[dmr_sample_delta_mat$variable %in% naive_libs] <- "Naive-hiPSC"
dens_ind <- match(dmr_sample_delta_mat$variable, mdat$Library_id)
dmr_sample_delta_mat$id <- mdat$Manuscript.Name[dens_ind]
factor_ind <- match(sample_order, mdat$Library_id)
dmr_sample_delta_mat$id <- factor(dmr_sample_delta_mat$id, levels=mdat$Manuscript.Name[factor_ind])
gg_ridges <- ggplot(data = dmr_sample_delta_mat,
mapping = aes(y = id, x = value, fill=stat(x))) +
#facet_grid(sample_group~., drop = TRUE, scales = "free_y") +
geom_density_ridges_gradient(height=3) +
scale_fill_gradient2(low = "firebrick", mid = "white", high = "firebrick", midpoint = 0) +
geom_vline(xintercept = c(-0.2, .2), alpha=0.4, linetype='dashed') +
scale_x_continuous(limits = c(-1, 1)) + xlab("Cell line DMR mCG - hESC mean") +
sams_pub_theme(x.text.angle = 0, legend_pos = "right", hjust = 0.5)
pdf("wgbs/plots/mcg_dmr_esc_delta_density_sample_plots_.pdf", width = 3.5, height = 4)
gg_ridges
## Warning: `stat(x)` was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(x)` instead.
## Picking joint bandwidth of 0.0215
dev.off()
## quartz_off_screen
## 2
gg_ridges
## Picking joint bandwidth of 0.0215
openxlsx::addWorksheet(wb_ed_fig3, sheetName = "Fig_3c")
openxlsx::writeData(wb = wb_ed_fig3, sheet = "Fig_3c",
x = gg_ridges$data)
DMR correction scatter-plots for hiPSCs relative to hESCs and fibroblasts
# Make scatterplot of DMR correction
primed_delta <- rowMeans(dmr_mCG_matrix[ ,primed_libs], na.rm = TRUE) - rowMeans(dmr_mCG_matrix[ ,esc_libs], na.rm = TRUE)
NtoP_delta <- rowMeans(dmr_mCG_matrix[ ,ntp_libs], na.rm = TRUE) - rowMeans(dmr_mCG_matrix[ ,esc_libs], na.rm = TRUE)
TNT_delta <- rowMeans(dmr_mCG_matrix[ ,tnt_libs], na.rm = TRUE) - rowMeans(dmr_mCG_matrix[ ,esc_libs], na.rm = TRUE)
fib_primed_delta <- rowMeans(dmr_mCG_matrix[ ,primed_libs], na.rm = TRUE) - rowMeans(dmr_mCG_matrix[ ,fib_libs], na.rm = TRUE)
fib_tnt_delta <- rowMeans(dmr_mCG_matrix[ ,tnt_libs], na.rm = TRUE) - rowMeans(dmr_mCG_matrix[ ,fib_libs], na.rm = TRUE)
fib_NtoP_delta <- rowMeans(dmr_mCG_matrix[ ,ntp_libs], na.rm = TRUE) - rowMeans(dmr_mCG_matrix[ ,fib_libs], na.rm = TRUE)
dmr_delta_df <- data.frame(Primed_delta = primed_delta, TNT_delta = TNT_delta, NtoP_delta = NtoP_delta,
fib_primed_delta = fib_primed_delta, fib_tnt_delta = fib_tnt_delta, fib_NtoP_delta = fib_NtoP_delta)
dmr_delta_df$Class <- ifelse(test = primed_delta < 0, yes = "Hypo", no = "Hyper")
primed_scatter <- ggplot(data = dmr_delta_df, mapping = aes(x = Primed_delta, y = fib_primed_delta, colour = Class, fill = Class)) +
geom_vline(xintercept = c(-.2, .2), alpha=0.5) + geom_hline(yintercept = c(-.2, .2), alpha=0.5) +
geom_point(alpha=0.1, inherit.aes = TRUE, size=1) +
scale_x_continuous(limits = c(-1, 1), expand = c(0,0)) + scale_y_continuous(limits = c(-1, 1), expand = c(0,0)) +
ylab("Primed hiPSC mCG - Fibroblast mCG") +
xlab("Primed hiPSC mCG - hESC mCG") +
scale_colour_manual(values = vitC[c(6,1)]) +
#scale_colour_manual(values = brewer.pal(n = 2, name = "Dark2")) +
sams_pub_theme(x.text.angle = 0, hjust = 0.5)
primed_scatter_gg <- ggExtra::ggMarginal(primed_scatter, groupColour = TRUE, groupFill=TRUE)
tnt_scatter <- ggplot(data = dmr_delta_df, mapping = aes(x = TNT_delta, y = fib_tnt_delta, colour = Class, fill = Class)) +
geom_vline(xintercept = c(-.2, .2), alpha=0.5) + geom_hline(yintercept = c(-.2, .2), alpha=0.5) +
geom_point(alpha=0.1, inherit.aes = TRUE, size=1) +
scale_x_continuous(limits = c(-1, 1), expand = c(0,0)) + scale_y_continuous(limits = c(-1, 1), expand = c(0,0)) +
ylab("TNT hiPSC mCG - Fibroblast mCG") +
xlab("TNT hiPSC mCG - hESC mCG") +
scale_colour_manual(values = vitC[c(6,1)]) +
#scale_colour_manual(values = brewer.pal(n = 2, name = "Dark2")) +
sams_pub_theme(x.text.angle = 0, hjust = 0.5)
tnt_scatter_gg <- ggExtra::ggMarginal(tnt_scatter, groupColour = TRUE, groupFill=TRUE)
ntp_scatter <- ggplot(data = dmr_delta_df, mapping = aes(x = NtoP_delta, y = fib_NtoP_delta, colour = Class, fill = Class)) +
geom_vline(xintercept = c(-.2, .2), alpha=0.5) + geom_hline(yintercept = c(-.2, .2), alpha=0.5) +
geom_point(alpha=0.1, inherit.aes = TRUE, size=1) +
scale_x_continuous(limits = c(-1, 1), expand = c(0,0)) + scale_y_continuous(limits = c(-1, 1), expand = c(0,0)) +
ylab("NtP hiPSC mCG - Fibroblast mCG") +
xlab("NtP hiPSC mCG - hESC mCG") +
scale_colour_manual(values = vitC[c(6,1)]) +
#scale_colour_manual(values = brewer.pal(n = 2, name = "Dark2")) +
sams_pub_theme(x.text.angle = 0, hjust = 0.5)
ntp_scatter_gg <- ggExtra::ggMarginal(ntp_scatter, groupColour = TRUE, groupFill=TRUE)
pdf("wgbs/plots/donor_DMR_correction_scatterplots_with_histograms.pdf", height = 2, width = 6)
plot_grid(primed_scatter_gg, tnt_scatter_gg, ntp_scatter_gg, ncol = 3)
dev.off()
## quartz_off_screen
## 2
Primed-hiPSC
primed_scatter_gg
openxlsx::addWorksheet(wb_ed_fig3, sheetName = "Fig_3d1")
openxlsx::writeData(wb = wb_ed_fig3, sheet = "Fig_3d1",
x = primed_scatter$data)
TNT-hiPSC
tnt_scatter_gg
openxlsx::addWorksheet(wb_ed_fig3, sheetName = "Fig_3d2")
openxlsx::writeData(wb = wb_ed_fig3, sheet = "Fig_3d2",
x = tnt_scatter$data)
NtP-hiPSC
ntp_scatter_gg
openxlsx::addWorksheet(wb_ed_fig3, sheetName = "Fig_3d3")
openxlsx::writeData(wb = wb_ed_fig3, sheet = "Fig_3d3",
x = ntp_scatter$data)
Enrichment tests for DMRs (First chunk run on remote server)
source("R/server_libraries_and_functions.R")
library(regioneR)
## Load CG DMRs
all_dmrs <- readRDS("wgbs/processed_data/all_dmrs_annotated_granges.Rds")
## Load list of genomics elements
all_elements_gr <- readRDS("resources/all_genomic_elements_granges_final.Rds")
### Add CH-DMRs
library(readxl)
real_ch <- read_excel(path = "resources/nature13551-s3.xlsx")
mCH_gr <- GRanges(seqnames = real_ch$chr,
ranges = IRanges(start = as.numeric(real_ch$start),
end = as.numeric(real_ch$end)))
mCH_gr$study <- real_ch$presence
mCH_gr$class <- "CH-DMR"
all_elements_gr <- c(all_elements_gr, mCH_gr)
target_elements <- unique(all_elements_gr$class) %>% as.character()
target_elements <- target_elements[!target_elements %in% c("Other", "Satellite", "snRNA", "tRNA", "srpRNA",
"rRNA", "RC", "scRNA", "RNA")]
unmappable_gr <- bed_to_gr("resources/d0_d3_d7_merged_zero_coverage_regions.bed")
test_element_enrichment <- function(target_element="Enhancer", dmr_class=TRUE,
permutations=200, mask=unmappable_gr){
gc()
message(Sys.time())
message(str_c("Running ", target_element, " ", dmr_class, "..."))
gr_dmr <- all_dmrs[all_dmrs$any_corrected == dmr_class]
target_gr <- all_elements_gr[all_elements_gr$class == target_element]
set.seed(123)
perm_test_results <- regioneR::permTest(A=gr_dmr, B=target_gr,
alternative = "auto",
randomize.function=regioneR::randomizeRegions,
ntimes = permutations,
evaluate.function=regioneR::numOverlaps,
count.once = TRUE,
genome="hg19",
mask = unmappable_gr,
allow.overlaps=FALSE,
per.chromosome=TRUE,
mc.cores=20,
mc.set.seed=FALSE,
force.parallel=TRUE)
lz <- localZScore(pt=perm_test_results, A=gr_dmr, B=target_gr,
step = mean(width(gr_dmr)/2),
window = 100000)
results <- list(pt=perm_test_results, lz=lz, element=target_element,
class=dmr_class)
message("Done!")
return(results)
}
test <- test_element_enrichment(target_element="Promoter", dmr_class=FALSE,
permutations=100, mask=unmappable_gr)
all_dmr_corrected_test <- lapply(X = target_elements,
FUN = test_element_enrichment,
dmr_class=TRUE)
saveRDS(object = all_dmr_corrected_test,
file = "wgbs/processed_data/corrected_dmr_enrichment_tests.Rds")
all_dmr_uncorrected_test <- lapply(X = target_elements,
FUN = test_element_enrichment,
dmr_class=FALSE)
saveRDS(object = all_dmr_uncorrected_test,
file = "wgbs/processed_data/uncorrected_dmr_enrichment_tests.Rds")
Plot the enrichment results
all_dmr_uncorrected_test <- readRDS("wgbs/processed_data/uncorrected_dmr_enrichment_tests.Rds")
all_dmr_corrected_test <- readRDS("wgbs/processed_data/corrected_dmr_enrichment_tests.Rds")
aggregate_enrich_results <- function(x, enrich_list=all_dmr_corrected_test){
dat <- enrich_list[[x]]
df <- data.frame(DMR=dat$class, Element=dat$element, olaps=dat$pt$`regioneR::numOverlaps`$observed,
Pval=dat$pt$`regioneR::numOverlaps`$pval, Zscore=dat$pt$`regioneR::numOverlaps`$zscore)
return(df)
}
corrected_dat <- lapply(1:length(all_dmr_corrected_test), aggregate_enrich_results, enrich_list=all_dmr_corrected_test) %>%
do.call(rbind, .)
uncorrected_dat <- lapply(1:length(all_dmr_uncorrected_test), aggregate_enrich_results, enrich_list=all_dmr_uncorrected_test) %>% do.call(rbind, .)
all_enrich_dat <- rbind(corrected_dat, uncorrected_dat)
all_enrich_dat <- all_enrich_dat[order(all_enrich_dat$Zscore, decreasing = TRUE), ]
all_enrich_dat$Element <- factor(all_enrich_dat$Element,
levels=unique(rev(all_enrich_dat$Element[all_enrich_dat$DMR == TRUE])))
#all_enrich_dat <- all_enrich_dat[all_enrich_dat$olaps > 10, ]
all_enrich_dat$pc <- lapply(X = 1:nrow(all_enrich_dat), FUN = function(x){
ifelse(test = all_enrich_dat$DMR[x] == TRUE,
yes = all_enrich_dat$olaps[x] / length(all_dmrs[all_dmrs$any_corrected == TRUE]),
no = all_enrich_dat$olaps[x] / length(all_dmrs[all_dmrs$any_corrected == FALSE]))
}) %>% unlist()
all_enrich_dat$DMR <- ifelse(test = all_enrich_dat$DMR == TRUE, yes = "Corrected", no = "Uncorrected")
gg_enrich <- ggplot(data = all_enrich_dat, aes(y = Zscore, x = Element, fill=DMR)) +
geom_col(position = 'dodge') +
scale_fill_manual(values = rev(bar_cols)) +
#facet_grid(.~DMR) +
coord_flip() +
sams_pub_theme(x.text.angle = 0, legend_pos = "right")
pdf("wgbs/plots/dmr_enrichment_zscore_barplots_all.pdf", width = 4, height = 2)
gg_enrich
dev.off()
## quartz_off_screen
## 2
gg_enrich
wb_ed_fig4 <- openxlsx::createWorkbook()
openxlsx::addWorksheet(wb_ed_fig4, sheetName = "Fig_4a")
openxlsx::writeData(wb = wb_ed_fig4, sheet = "Fig_4a",
x = gg_enrich$data)
element_order <- c("Fibroblast_H3K9me3", "Fibroblast_PMD", "Fibroblast_LAD",
"Constitutive_H3K9me3", "Constitutive_LAD",
"ESC_H3K9me3", "ESC_LAD")
all_enrich_dat$DMR <- factor(all_enrich_dat$DMR, levels=c("Corrected", "Uncorrected"))
gg_enrich_sub <- ggplot(data = all_enrich_dat, aes(y = Zscore, x = Element, fill=DMR)) +
geom_col(position = 'dodge') +
#facet_wrap(DMR~.) +
scale_x_discrete(limits=element_order, drop=TRUE) +
scale_y_continuous(limits = c(-25, 55), breaks = c(-25, 0, 25, 50)) +
#coord_flip() +
scale_fill_manual(values = rev(bar_cols)) +
sams_pub_theme(x.text.angle = 45, legend_pos = "right")
gg_enrich_sub
## Warning: Removed 18 rows containing missing values (`geom_col()`).
pdf("wgbs/plots/dmr_enrichment_zscore_sub_barplots.pdf", width = 2.5, height = 2)
gg_enrich_sub
## Warning: Removed 18 rows containing missing values (`geom_col()`).
dev.off()
## quartz_off_screen
## 2
# gg_olaps <- ggplot(data = all_enrich_dat, aes(y = pc, x = Element)) +
# geom_col() +
# facet_wrap(DMR~.) +
# #geom_hline(yintercept = 959) +
# coord_flip()
# gg_olaps
pdf("wgbs/plots/dmr_enrichment_zscore_barplots.pdf", width = 4, height = 1.75)
gg_enrich
gg_enrich_sub
## Warning: Removed 18 rows containing missing values (`geom_col()`).
dev.off()
## quartz_off_screen
## 2
gg_enrich_sub
## Warning: Removed 18 rows containing missing values (`geom_col()`).
openxlsx::addWorksheet(wb_fig3, sheetName = "Fig_3e")
openxlsx::writeData(wb = wb_fig3, sheet = "Fig_3e",
x = gg_enrich_sub$data)
Plot the CG-DMRs in CH-DMRs
real_ch <- read_excel(path = "resources/nature13551-s3.xlsx")
mCH_gr <- GRanges(seqnames = real_ch$chr,
ranges = IRanges(start = as.numeric(real_ch$start),
end = as.numeric(real_ch$end)))
### PLot CH dmrs overlapping CG dmrs
all_dmrs <- readRDS("wgbs/processed_data/all_dmrs_annotated_granges.Rds")
all_dmrs$CH_olap <- overlapsAny(all_dmrs, mCH_gr)
ch_olap_dat <- mcols(all_dmrs)[ ,colnames(mcols(all_dmrs)) %in% c("esc_ips_delta",
"esc_n2p_d13_delta",
"esc_n2p_p10_delta",
"CH_olap",
"class",
"loci")] %>% data.frame()
colnames(ch_olap_dat)[2:4] <- c("TNT-hiPSC", "NtP-hiPSC", "Primed-hiPSC")
ch_olap_dat <- reshape2::melt(ch_olap_dat)
## Using class, loci, CH_olap as id variables
ch_olap_dat$CH_olap <- ifelse(test = ch_olap_dat$CH_olap, yes = "CH-DMR_overlap", no = "No overlap")
ch_olap_dat$CH_olap <- factor(ch_olap_dat$CH_olap, levels = c("FALSE", "TRUE"))
ch_olap_dat$variable <- factor(ch_olap_dat$variable,
levels = c("Primed-hiPSC", "TNT-hiPSC", "NtP-hiPSC"))
gg_ch_olap_violin <- ggplot(ch_olap_dat, aes(x = variable, y = abs(value), fill=CH_olap)) +
#geom_point(alpha=0.05, position = 'jitter') +
geom_violin(scale = 'width', draw_quantiles = 0.5, alpha=0.7,
trim = TRUE, size=0.2) +
scale_fill_pander()+ ylab("mCG delta in CG-DMRs") +
#geom_boxplot(color='black', outlier.alpha=0.05, outlier.size = 0.5) +
scale_y_continuous(breaks = seq(from=0, to=1, by=0.2)) +
#facet_grid(.~CH_olap) +
sams_pub_theme(legend_pos = "right")
pdf(file = "wgbs/plots/cg_dmr_delta_ch_olaps_violin.pdf", height = 2, width = 2.5)
gg_ch_olap_violin
dev.off()
## quartz_off_screen
## 2
gg_ch_olap_violin
openxlsx::addWorksheet(wb_ed_fig4, sheetName = "Fig_4c")
openxlsx::writeData(wb = wb_ed_fig4, sheet = "Fig_4c",
x = gg_ch_olap_violin$data)
CG DMR in CH DMR sum
## CG DMR in CH DMR sum
cg_in_ch <- sum(all_dmrs[all_dmrs$CH_olap == TRUE]$any_corrected) /
length(all_dmrs[all_dmrs$CH_olap == TRUE]$any_corrected)
cg_no_ch <- sum(all_dmrs[all_dmrs$CH_olap == FALSE]$any_corrected) /
length(all_dmrs[all_dmrs$CH_olap == FALSE]$any_corrected)
ch_cor_df <- data.frame(group=c("CG-DMR overlap CH-DMR", "No CH-DMR overlap"),
pc=c(cg_in_ch, cg_no_ch))
ch_dmr_olap_count <- ggplot(ch_cor_df, aes(x = group, y = pc*100)) +
geom_col() + scale_y_continuous(limits = c(0,100)) +
ylab("Proportion of CG-DMRs corrected (%)") +
geom_text(aes(label = sprintf("%.1f", pc*100), y= pc*100), vjust = 3) +
sams_pub_theme()
pdf("wgbs/plots/cg_dmr_ch_olaps_barplot.pdf", width = 1, height = 2.5)
ch_dmr_olap_count
dev.off()
## quartz_off_screen
## 2
ch_dmr_olap_count
openxlsx::addWorksheet(wb_ed_fig4, sheetName = "Fig_4b")
openxlsx::writeData(wb = wb_ed_fig4, sheet = "Fig_4b",
x = ch_dmr_olap_count$data)
Plot H3K9me3 signal in corrected CG DMRs (first code block run on remote server)
##### Get H3K9me3 in CG DMRs
# And fold-enrichment in CG-DMRs
##! Run this section on server
all_dmrs <- readRDS("wgbs/processed_data/all_dmrs_annotated_granges.Rds")
fibroblast <- c("ENCFF735TXC.bigWig")
ESC <- c("ENCFF108MOZ.bigWig")
Primed <- c("Primed_32F_H3K9me3_FE_sorted.bw", "Primed_38F_H3K9me3_FE_sorted.bw")
Naive <- c("RL2011_2020_02_06_P33_32F_Naive_SR_clone1_H3K9me3_ChIP_S7_merged_lanes_dedup_FE.bigwig")
TNT <- c("TNT_32F_H3K9me3_r1_FE_sorted.bw", "TNT_32F_H3K9me3_r2_FE_sorted.bw", "TNT_38F_H3K9me3_FE_sorted.bw")
N2P <- c("N2P_38F_H3K9me3_r1_FE_sorted.bw", "N2P_38F_H3K9me3_r2_FE_sorted.bw")
bw_files <- c(fibroblast, ESC, Primed, Naive, TNT, N2P)
bw_files <- str_c("/d/home/hg19ips/hg19ips_samb/mcc_hg19ips/sam/hs-reprogram/ChIPseq/bigwigs/", bw_files)
file.exists(bw_files)
bw_path <- bw_files[1]
gr <- all_dmrs
get_dmr_means <- function(bw_path, gr){
id <- basename(bw_path)
bw_dat <- import.bw(con = bw_path, which = gr)
hits <- findOverlaps(gr, bw_dat, ignore.strand = TRUE)
ans <- gr
mcols(ans) <- aggregate(bw_dat, hits, score.mean = mean(score),
drop = FALSE)
dat <- matrix(data = mcols(ans)$score.mean, nrow = length(gr),
byrow = TRUE) %>% data.frame()
dat$class <- gr$class
dat$loci <- gr_to_loci(gr)
dat$id <- id
colnames(dat)[1] <- "value"
return(dat)
}
all_cg_window_dat <- mclapply(X = bw_files, FUN = get_dmr_means, mc.cores = 10,
gr = all_dmrs)
all_cg_window_dat <- do.call(rbind, all_cg_window_dat)
write.table(all_cg_window_dat, "ChIPseq/processed_data/all_CG_DMR_h3k9me3_FE_means.txt",
quote = FALSE, sep = "\t", row.names = TRUE, col.names = TRUE)
fibroblast <- c("ENCFF735TXC.bigWig")
ESC <- c("ENCFF108MOZ.bigWig")
Primed <- c("Primed_32F_H3K9me3_FE_sorted.bw", "Primed_38F_H3K9me3_FE_sorted.bw")
Naive <- c("RL2011_2020_02_06_P33_32F_Naive_SR_clone1_H3K9me3_ChIP_S7_merged_lanes_dedup_FE.bigwig")
TNT <- c("TNT_32F_H3K9me3_r1_FE_sorted.bw", "TNT_32F_H3K9me3_r2_FE_sorted.bw", "TNT_38F_H3K9me3_FE_sorted.bw")
N2P <- c("N2P_38F_H3K9me3_r1_FE_sorted.bw", "N2P_38F_H3K9me3_r2_FE_sorted.bw")
all_cg_window_dat <- read.table("ChIPseq/processed_data/all_CG_DMR_h3k9me3_FE_means.txt")
h3k9_dmr_mat <- reshape2::dcast(data = all_cg_window_dat[ ,c(1,3,4)], formula = loci~id)
rownames(h3k9_dmr_mat) <- h3k9_dmr_mat$loci
h3k9_dmr_mat$loci <- NULL
colnames(h3k9_dmr_mat) <- str_remove(string = colnames(h3k9_dmr_mat), pattern = "merged_lanes_dedup_FE.bigwig")
TNT_corrected <- all_dmrs$loci[all_dmrs$tnt_corrected == TRUE]
N2P_corrected <- all_dmrs$loci[all_dmrs$ntp_corrected == TRUE]
all_cg_window_dat$TNT_corrected <- all_cg_window_dat$loci %in% TNT_corrected
all_cg_window_dat$N2P_corrected <- all_cg_window_dat$loci %in% N2P_corrected
all_cg_window_dat$id %>% unique()
## [1] "ENCFF735TXC.bigWig"
## [2] "ENCFF108MOZ.bigWig"
## [3] "Primed_32F_H3K9me3_FE_sorted.bw"
## [4] "Primed_38F_H3K9me3_FE_sorted.bw"
## [5] "RL2011_2020_02_06_P33_32F_Naive_SR_clone1_H3K9me3_ChIP_S7_merged_lanes_dedup_FE.bigwig"
## [6] "TNT_32F_H3K9me3_r1_FE_sorted.bw"
## [7] "TNT_32F_H3K9me3_r2_FE_sorted.bw"
## [8] "TNT_38F_H3K9me3_FE_sorted.bw"
## [9] "N2P_38F_H3K9me3_r1_FE_sorted.bw"
## [10] "N2P_38F_H3K9me3_r2_FE_sorted.bw"
chip_keep <- list(Fibroblast=fibroblast, ESC=ESC, Primed=Primed, Naive=Naive, TNT=TNT, N2P=N2P) %>%
do.call(rbind, .) %>% reshape2::melt() %>% .[ ,c(1,3)] %>% unique()
## Warning in rbind(...): number of columns of result is not a multiple of vector
## length (arg 3)
all_cg_window_dat <- all_cg_window_dat[all_cg_window_dat$id %in% chip_keep$value, ]
chip_keep$group <- NA
chip_keep$group[1] <- "Fibroblast"
chip_keep$group[2] <- "ESC (H9)"
chip_keep$group[3:nrow(chip_keep)] <- grepl(pattern = "32F",
x = chip_keep$value[3:nrow(chip_keep)]) %>%
ifelse(yes = "32F", no = "38F")
ind <- match(all_cg_window_dat$id, chip_keep$value)
all_cg_window_dat$group <- chip_keep$Var1[ind]
all_cg_window_dat$donor <- chip_keep$group[ind]
all_cg_window_dat$both_corrected <- all_cg_window_dat$TNT_corrected == TRUE |
all_cg_window_dat$N2P_corrected == TRUE
# Get the mean for each group where there are replicates
replicate_mean_dat <- all_cg_window_dat[ ,c(1,3,4,7:9)] %>%
dplyr::group_by(loci, group, donor, both_corrected) %>%
dplyr::summarize(Mean = mean(value, na.rm=TRUE)) %>% data.frame()
## `summarise()` has grouped output by 'loci', 'group', 'donor'. You can override
## using the `.groups` argument.
replicate_mean_dat$group <- factor(replicate_mean_dat$group,
levels = c("Fibroblast", "ESC", "Primed",
"N2P", "TNT", "Naive"))
replicate_mean_dat$donor <- factor(replicate_mean_dat$donor,
levels = c("Fibroblast", "ESC (H9)",
"32F", "38F"))
replicate_mean_dat <- replicate_mean_dat[is.finite(replicate_mean_dat$Mean), ]
replicate_mean_dat <- replicate_mean_dat[replicate_mean_dat$group != "Naive", ]
replicate_mean_dat$group <- factor(replicate_mean_dat$group,
levels = c("Fibroblast", "Primed", "N2P",
"TNT", "ESC"))
h3k9me3_boxplots <- ggplot(replicate_mean_dat, aes(x = group, y=Mean, fill=group)) +
geom_boxplot(color='black', outlier.alpha=0.2, lwd = line_mm,
outlier.shape = NA, notch = TRUE) +
#geom_boxplot(color='black', outlier.alpha=0.2, lwd = line_mm,notch = TRUE) +
ylab("H3K9me3 ChIP-seq fold enrichment in CG-DMRs") +
#facet_grid(.~donor, scales = "free_x", space = "free", drop = TRUE) +
sams_pub_theme()
pdf(file = "ChIPseq/plots/h3k9me3_in_CG_DMRs_boxplot.pdf",
width = 1.5, height = 2.5)
h3k9me3_boxplots
dev.off()
## quartz_off_screen
## 2
## Stats test of groups
replicate_spread <- reshape2::recast(replicate_mean_dat[ ,c(1,2,5)], formula = loci~group, fun.aggregate = mean)
## Using loci, group as id variables
replicate_spread <- round(replicate_spread[ ,-1], digits = 3)
run_wilcox <- function(x,y){
wt <- wilcox.test(x = log2(replicate_spread[ ,x]), conf.int = TRUE,
y = log2(replicate_spread[ ,y]), alternative = 'greater',
paired = TRUE)
#return(wt)
return(wt$p.value)
}
wilcox_df <- data.frame(Primed_v_NtP = run_wilcox(x = "Primed", y = "N2P"),
Primed_v_TNT = run_wilcox(x = "Primed", y = "TNT"),
TNT_v_ESC = run_wilcox(x = "TNT", y = "ESC"),
NtP_v_ESC = run_wilcox(x = "N2P", y = "ESC"),
Primed_v_ESC = run_wilcox(x = "Primed", y = "ESC"))
h3k9me3_boxplots
openxlsx::addWorksheet(wb_fig3, sheetName = "Fig_3f")
openxlsx::writeData(wb = wb_fig3, sheet = "Fig_3f",
x = h3k9me3_boxplots$data)
Calculate mCA methylayion in CH-DMRs. This code was not run locally and needs a high memory machine.
CH_dmr <- bed_to_gr("resources/nature13551-s3.bed")
make_window_matrix_nc_corrected <- function(x, gr, bins=30, nc_correct=TRUE,
nc_contig="chrL"){
rds_path <- mCA_files[x]
message(basename(rds_path))
#id <- mdat_wgbs$Library_id[x]
gr_expand <- gr + width(gr)
gr_expand$loci <- gr_to_loci(gr_expand)
gr_tiles <- tile(gr_expand, n = bins)
add_loci_to_grl <- function(x){
grt <- gr_tiles[[x]]
grt$loci <- gr_to_loci(gr_expand[x])
return(grt)
}
read_bs_obj <- function(rds_path){
stopifnot(file.exists(rds_path))
message(str_c("Reading ", rds_path))
message(Sys.time())
bs_obj <- readRDS(file = rds_path)
return(bs_obj)
}
# Calculate C coverage and methylation for ranges. Returns GRanges object
calc_mC_window <- function(bs_obj, gr){
message("Calculating coverage...")
gr$Cov <- getCoverage(BSseq = bs_obj, regions = gr, type = "Cov",
what = "perRegionTotal")
message("Calculating M...")
gr$M <- getCoverage(BSseq = bs_obj, regions = gr, type = "M",
what = "perRegionTotal")
message("Calculating methylation percentage...")
gr$pc <- gr$M / gr$Cov
return(gr)
}
gr_tiles <- lapply(1:length(gr_tiles), add_loci_to_grl) %>%
GRangesList() %>% unlist()
bs_obj <- read_bs_obj(rds_path)
mC_dat <- calc_mC_window(bs_obj = bs_obj, gr = gr_tiles)
dat <- matrix(data = mC_dat$pc, nrow = length(gr), byrow = TRUE)
# Correct for non-conversion
correct_nc <- function(bs_obj, nc_contig = "chrL"){
nc_obj <- chrSelectBSseq(BSseq = bs_obj, seqnames = nc_contig)
nc_cov <- getCoverage(BSseq = nc_obj, type = "Cov",
what = "perBase")
nc_m <- getCoverage(BSseq = nc_obj, type = "M",
what = "perBase")
nc_rate <- sum(nc_m) / sum(nc_cov)
return(nc_rate)
}
if (nc_correct == TRUE){
message("Correcting for non-conversion...")
nc_rate <- correct_nc(bs_obj = bs_obj, nc_contig = nc_contig)
dat <- dat - nc_rate
dat[dat < 0] <- 0
}
dat <- data.frame(dat)
rownames(dat) <- gr_to_loci(gr)
colnames(dat) <- 1:ncol(dat)
dat$id <- basename(rds_path)
return(dat)
}
mCA_files <- mdat_sub$BSseq_CA
mdat_sub$Library_id[!file.exists(mCA_files)]
mCA_files <- mCA_files[file.exists(mCA_files)]
all(file.exists(mCA_files))
dfl <- lapply(1:length(mCA_files), make_window_matrix_nc_corrected,
gr=CH_dmr)
dfl <- do.call(rbind, dfl)
saveRDS(dfl, "wgbs/processed_data/mCH_DMR_window_dat_all.Rds")
CH-DMR plots
dfl <- readRDS("wgbs/processed_data/mCH_DMR_window_dat_all.Rds")
id_ind <- match(dfl$id, basename(mdat$BSseq_CA))
dfl$id <- mdat$Library_id[id_ind]
CH_dat <- readxl::read_excel(path = "resources/nature13551-s3.xlsx")
CH_dmr <- GRanges(seqnames = CH_dat$chr,
ranges = IRanges(start = as.numeric(CH_dat$start),
end = as.numeric(CH_dat$end)))
CH_dmr$presence <- CH_dat$presence
CH_dmr$loci <- gr_to_loci(CH_dmr)
dfl$loci <- rownames(dfl)
dfl$loci <- rep(gr_to_loci(CH_dmr), times=length(unique(dfl$id)))
keep <- dfl$loci %in% CH_dmr$loci[CH_dmr$presence == "all iPSCs"]
table(keep)
## keep
## FALSE TRUE
## 7811 3139
dfl <- dfl[keep, ]
dfl <- dfl[dfl$id %in% c(esc_libs, primed_libs, tnt_libs, ntp_libs), ]
### Normalise to max (flank normalisation)
max_norm <- function(x){
vec <- dfl[x, 1:30] / max(dfl[x, 1:30], na.rm = TRUE)
return(vec)
}
dfl_norm <- lapply(X = 1:nrow(dfl), max_norm) %>% do.call(rbind, .)
dfl_norm$id <- dfl$id
dfl_norm$loci <- dfl$loci
ind <- match(dfl_norm$id, mdat$Library_id)
dfl_norm$grp <- mdat$Group[ind]
dfl_norm$progenitor <- mdat$Progenitor[ind]
dfl_norm$batch <- mdat$Batch[ind]
dfl_norm$lab <- mdat$Lab[ind]
dfl_norm$background <- mdat$Background[ind]
dfl_norm <- dfl_norm[dfl_norm$lab != "Smith", ] # No lambda spike to control non-conversion
norm_dat <- reshape2::melt(dfl_norm)
## Using id, loci, grp, progenitor, batch, lab, background as id variables
### heatmap each dmr
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:VariantAnnotation':
##
## select
## The following object is masked from 'package:AnnotationDbi':
##
## select
## The following objects are masked from 'package:Biostrings':
##
## collapse, intersect, setdiff, setequal, union
## The following object is masked from 'package:XVector':
##
## slice
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following object is masked from 'package:bsseq':
##
## combine
## The following object is masked from 'package:matrixStats':
##
## count
## The following objects are masked from 'package:GenomicRanges':
##
## intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
##
## intersect
## The following objects are masked from 'package:IRanges':
##
## collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
##
## first, intersect, rename, setdiff, setequal, union
## The following object is masked from 'package:Biobase':
##
## combine
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
hm_dat <- norm_dat[norm_dat$variable %in% 11:20, ]
hm_dat$grp <- as.character(hm_dat$grp)
hm_dat <- hm_dat[ ,c(2,3,9)] %>%
dplyr::group_by(loci, grp) %>%
dplyr::summarise(mean=mean(value, na.rm=TRUE))
## `summarise()` has grouped output by 'loci'. You can override using the
## `.groups` argument.
hm_dat <- hm_dat %>% tidyr::spread(grp, mean) %>% data.frame()
rownames(hm_dat) <- hm_dat$loci
hm_dat$loci <- NULL
pdf(file = "wgbs/plots/CH_DMR_mCA_flank_norm_heatmap.pdf", width = 2, height = 4)
pheatmap(hm_dat, border_color = NA, show_rownames = FALSE)
dev.off()
## quartz_off_screen
## 3
pheatmap(hm_dat, border_color = NA)
openxlsx::addWorksheet(wb_ed_fig4, sheetName = "Fig_4d")
openxlsx::writeData(wb = wb_ed_fig4, sheet = "Fig_4d",
x = hm_dat)
up <- rownames(hm_dat)[hm_dat$Primed.hiPSC < hm_dat$hESC]
up_gr <- GRanges(up)
gr_to_bed(up_gr, out_path = "wgbs/processed_data/mCH_DMR_primed_demethylated.bed")
head(norm_dat)
## id loci grp progenitor batch lab background
## 1 RL1124 chr1:49365000-50570000 TNT-hiPSC Fibroblast A Lister 32F
## 2 RL1124 chr1:2370000-3505000 TNT-hiPSC Fibroblast A Lister 32F
## 3 RL1124 chr1:3980000-5925000 TNT-hiPSC Fibroblast A Lister 32F
## 4 RL1124 chr1:152085000-152955000 TNT-hiPSC Fibroblast A Lister 32F
## 5 RL1124 chr10:37065000-38300000 TNT-hiPSC Fibroblast A Lister 32F
## 6 RL1124 chr10:131980000-133695000 TNT-hiPSC Fibroblast A Lister 32F
## variable value
## 1 1 0.6777856
## 2 1 0.7069248
## 3 1 0.9935123
## 4 1 1.0000000
## 5 1 0.9092462
## 6 1 0.8270384
norm_dat$up <- norm_dat$loci %in% up
line_mm <- 0.5 / 2.835
norm_dat$grp <- factor(norm_dat$grp, levels=rev(c("Primed-hiPSC", "NtP-hiPSC",
"TNT-hiPSC", "hESC")))
barplot(c(1,1,1,1,1,1), col=vitC); barplot(c(1,1,1,1,1), col=reprog_pal)
pal <- rev(c(reprog_pal[c(1)], vitC[c(6,2)], reprog_pal[4]))
gg_mch <- ggplot(data = norm_dat, mapping = aes(x = variable, y = value,
group=grp, col=grp, fill=grp)) +
stat_summary(geom = "line", na.rm = TRUE, size=0.5, alpha = 0.7) +
#geom_smooth(na.rm = TRUE, span = 0.25) +
#stat_summary(geom = 'ribbon', fun.data = mean_se, alpha=0.2, col=NA, na.rm=TRUE) +
#scale_fill_manual(values = pal) +
#scale_color_manual(values = pal) +
scale_color_colorblind() +
facet_grid(up~., scales = "free_y") +
ylab("Normalised mCA/CA") +
xlab("") +
geom_vline(xintercept = c(11,20), linetype="dashed", size=line_mm) +
theme_bw() +
theme(plot.background = element_blank(),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_blank(),
strip.text.y = element_text(size = 6),
text = element_text(size=6),
strip.background = element_blank(),
legend.position = "right",
axis.line.x = element_line(color = 'black', size = line_mm),
axis.text.y = element_text(color = 'black'),
axis.line.y = element_line(color = 'black', size = line_mm),
axis.ticks.y = element_line(color = 'black', size = line_mm),
axis.ticks.x = element_blank(),
axis.text.x = element_blank())
gg_mch
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
ggsave(filename = "wgbs/plots/mch_dmr_summary_plot.pdf", width = 3, height = 2)
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
openxlsx::addWorksheet(wb_fig3, sheetName = "Fig_3g")
openxlsx::writeData(wb = wb_fig3, sheet = "Fig_3g",
x = gg_mch$data)
openxlsx::addWorksheet(wb_ed_fig4, sheetName = "Fig_4e")
openxlsx::writeData(wb = wb_ed_fig4, sheet = "Fig_4e",
x = gg_mch$data)
Plot H3K9me3 for CH-DMRs (first code block run on remote server)
source("R/server_libraries_and_functions.R")
gr <- bed_to_gr("resources/nature13551-s3.bed")
bw_files <- list.files(pattern = "", full.names = TRUE, path = "ChIPseq/bigwigs")
#H9_H3K9me3 <- "ENCFF108MOZ.bigWig"
#fibroblast_H3K9me3 <- "ENCFF735TXC.bigWig"
#bw_files <- c(bw_files, H9_H3K9me3, fibroblast_H3K9me3)
bw_path <- bw_files[1]
make_dmr_window_means <- function(bw_path, gr, bins=30){
id <- basename(bw_path)
gr_expand <- gr + width(gr)
gr_expand$loci <- gr_to_loci(gr_expand)
gr_tiles <- tile(gr_expand, n = bins)
add_loci_to_grl <- function(x){
grt <- gr_tiles[[x]]
grt$loci <- gr_to_loci(gr_expand[x])
return(grt)
}
gr_tiles <- lapply(1:length(gr_tiles), add_loci_to_grl) %>%
GRangesList() %>% unlist()
bw_dat <- import.bw(con = bw_path, which = gr_tiles)
hits <- findOverlaps(gr_tiles, bw_dat, ignore.strand = TRUE)
ans <- gr_tiles
mcols(ans) <- aggregate(bw_dat, hits, score.mean = mean(score),
drop = FALSE)
dat <- matrix(data = mcols(ans)$score.mean, nrow = length(gr),
byrow = TRUE) %>% data.frame()
dat$loci <- gr_to_loci(gr)
dat$id <- id
return(dat)
}
all_window_dat <- mclapply(X = bw_files, FUN = make_dmr_window_means, mc.cores = 18, gr = gr)
all_window_df <- do.call(rbind, all_window_dat)
write.table(all_window_df, "ChIPseq/processed_data/all_CH_DMR_h3k9me3_logFE_means.txt",
quote = FALSE, sep = "\t", row.names = TRUE, col.names = TRUE)
all_window_df <- read.table("ChIPseq/processed_data/all_CH_DMR_h3k9me3_logFE_means.txt")
colnames(all_window_df) <- colnames(all_window_df) %>% str_remove(pattern = "X")
#all_window_df$loci <- rownames(all_window_df)
table(all_window_df$id)
##
## ENCFF108MOZ.bigWig
## 150
## ENCFF735TXC.bigWig
## 150
## N2P_32F_H3K9me3_r1_FE_sorted.bw
## 150
## N2P_38F_H3K9me3_r1_FE_sorted.bw
## 150
## N2P_38F_H3K9me3_r2_FE_sorted.bw
## 150
## N2P_H3K9me3_FE_sorted.bw
## 150
## Primed_32F_H3K9me3_FE_sorted.bw
## 150
## Primed_38F_H3K9me3_FE_sorted.bw
## 150
## Primed_H3K9me3_FE_sorted.bw
## 150
## RL2005_2020_02_06_P17_MEL1_HDF_to_SR_H3K9me3_ChIP_S1_merged_lanes_dedup_FE.bigwig
## 150
## RL2006_2020_02_06_P13_plus_20_MEL1_to_E8_H3K9me3_ChIP_S2_merged_lanes_dedup_FE.bigwig
## 150
## RL2007_2020_02_06_P18_MEL1_HDF_to_E8_H3K9me3_ChIP_S3_merged_lanes_dedup_FE.bigwig
## 150
## RL2008_2020_02_06_P17_MEL1_D13_TNT_H3K9me3_ChIP_S4_merged_lanes_dedup_FE.bigwig
## 150
## RL2009_2020_02_06_P4_plus_10_TNT_MEL1_H3K9me3_ChIP_S5_merged_lanes_dedup_FE.bigwig
## 150
## RL2010_2020_02_06_P9_plus_6_TNT_MEL1_H3K9me3_ChIP_S6_merged_lanes_dedup_FE.bigwig
## 150
## RL2011_2020_02_06_P33_32F_Naive_SR_clone1_H3K9me3_ChIP_S7_merged_lanes_dedup_FE.bigwig
## 150
## RL2012_2020_02_06_P7_32F_HDFa_Aug_2019_H3K9me3_ChIP_S8_merged_lanes_dedup_FE.bigwig
## 150
## RL2013_2020_02_06_P20_MEL1_to_P6_HDFa_H3K9me3_ChIP_S9_merged_lanes_dedup_FE.bigwig
## 150
## RL2014_2020_02_06_P7_32F_HDFa_Oct_2019_H3K9me3_ChIP_S10_merged_lanes_dedup_FE.bigwig
## 150
## RL2015_2020_02_06_P9_38F_HDFa_H3K9me3_ChIP_S11_merged_lanes_dedup_FE.bigwig
## 150
## TNT_32F_H3K9me3_r1_FE_sorted.bw
## 150
## TNT_32F_H3K9me3_r2_FE_sorted.bw
## 150
## TNT_38F_H3K9me3_FE_sorted.bw
## 150
## TNT_H3K9me3_FE_sorted.bw
## 150
table(all_window_df$loci %in% CH_dmr$loci)
##
## TRUE
## 3600
head(all_window_df)
## 1 2 3 4 5 6 7
## 1 0.8161353 0.9166089 0.8690738 0.9994672 1.1810257 1.0843122 0.8362058
## 2 0.8804295 0.7452304 0.8173114 0.7604953 0.9293903 2.3804093 2.7781496
## 3 0.7985588 0.6271983 0.6016571 0.6389467 0.6850343 0.8164633 0.8054794
## 4 0.7277600 0.8186797 1.2667908 3.3399080 3.1504939 0.7676743 0.8542513
## 5 2.8642192 2.8017036 3.0059634 2.6886044 2.7980874 2.6298348 3.0797360
## 6 0.7443335 0.7789159 0.6911676 0.6936853 0.8145397 0.8015978 0.7197695
## 8 9 10 11 12 13 14 15
## 1 0.8304317 0.8589303 0.9468344 2.783606 2.962831 3.208627 3.056240 2.995155
## 2 2.8726232 3.0953939 2.9407263 2.899702 3.058350 3.057658 3.051382 3.060446
## 3 0.7008003 0.7434213 0.8612542 0.701840 1.497120 2.951440 3.448113 3.407489
## 4 0.7731590 0.7978956 0.9286602 1.125752 1.423666 1.325595 1.095944 1.280401
## 5 2.7723321 2.5035173 2.6806967 2.559016 2.497548 2.976839 2.890653 2.825433
## 6 0.9706597 0.8211206 2.3364557 3.262949 3.047996 2.967371 3.153054 3.293995
## 16 17 18 19 20 21 22 23
## 1 2.985045 3.000356 2.877596 2.8852633 3.1541258 3.0708987 3.0117867 0.9951730
## 2 3.147131 3.100159 3.115457 3.3392855 3.0691969 2.7415128 3.0455581 2.9434633
## 3 2.057002 0.729445 0.944228 0.7587574 0.8498437 0.6998497 0.8638159 0.8358737
## 4 1.242841 1.281522 1.228403 1.1491141 1.1676808 0.7621017 0.7312021 0.6969597
## 5 2.491691 2.530878 2.811663 3.0376571 3.1451127 3.2443145 2.4871034 1.4985741
## 6 2.901171 3.171254 3.230500 3.2314908 3.2159000 2.6625520 2.6875548 3.1618481
## 24 25 26 27 28 29 30
## 1 0.8187580 0.7263985 0.7798237 0.8939533 0.8664973 0.8414255 0.6987161
## 2 2.7986421 2.4614677 0.8100529 0.9663161 0.9080945 0.8294612 0.7575552
## 3 0.0000000 1.1972503 1.0482895 1.2795265 1.5948747 1.2771958 1.2652769
## 4 0.6187949 0.7160526 0.8309455 0.8736097 1.0142609 0.8310721 0.7983465
## 5 1.7672349 1.2670175 2.5656156 2.6060339 0.7214993 2.2581043 2.9156145
## 6 3.3533527 2.8090388 1.9513595 0.7890664 0.6996271 0.6575542 0.7068788
## loci id
## 1 chr1:49365000-50570000 ENCFF108MOZ.bigWig
## 2 chr1:159335000-159610000 ENCFF108MOZ.bigWig
## 3 chr1:2370000-3505000 ENCFF108MOZ.bigWig
## 4 chr1:3980000-5925000 ENCFF108MOZ.bigWig
## 5 chr1:248335000-248555000 ENCFF108MOZ.bigWig
## 6 chr1:152085000-152955000 ENCFF108MOZ.bigWig
### Check for infinate rows
#keep <- is.finite(rowSums(all_window_df[ , 1:30]))
#all_window_df <- all_window_df[keep, ]
### Normalise to max (flank normalisation)
min_norm <- function(x){
vec <- all_window_df[x, 1:30] +1 # stop dividing by zero
vec <- vec / min(vec, na.rm = TRUE)
#vec <- vec - 1
return(vec)
}
norm_window_df <- lapply(X = 1:nrow(all_window_df), min_norm) %>% do.call(rbind, .)
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
## Warning in min(structure(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
## no non-missing arguments to min; returning Inf
norm_window_df$id <- all_window_df$id
norm_window_df$loci <- all_window_df$loci
### Get relative mean for each sample
sample_ids <- unique(norm_window_df$id)
## Filter for plotted CH DMRs
loci_class <- norm_dat[ ,c("loci", "up")] %>% unique()
norm_window_df <- norm_window_df[norm_window_df$loci %in% loci_class$loci, ]
ind <- match(norm_window_df$loci, loci_class$loci)
norm_window_df$up <- loci_class$up[ind]
head(norm_window_df)
## 1 2 3 4 5 6 7 8
## 1 1.069122 1.128269 1.100286 1.177046 1.283926 1.226993 1.080937 1.077538
## 3 1.798559 1.627198 1.601657 1.638947 1.685034 1.816463 1.805479 1.700800
## 4 1.067312 1.123478 1.400295 2.680950 2.563941 1.091969 1.145452 1.095357
## 6 1.052354 1.073217 1.020279 1.021798 1.094709 1.086901 1.037534 1.188896
## 9 1.903994 2.009877 1.972632 1.896889 1.881570 1.962512 1.894117 2.033977
## 11 1.167172 1.116643 1.125415 1.129168 1.129697 1.197032 1.127807 1.248535
## 9 10 11 12 13 14 15 16
## 1 1.094315 1.146062 2.227333 2.332839 2.477534 2.387827 2.351867 2.345916
## 3 1.743421 1.861254 1.701840 2.497120 3.951440 4.448113 4.407489 3.057002
## 4 1.110638 1.191417 1.313169 1.497204 1.436621 1.294756 1.408703 1.385500
## 6 1.098679 2.012879 2.571831 2.442150 2.393509 2.505532 2.590561 2.353571
## 9 1.916459 1.879691 3.659867 3.550741 3.475193 3.778470 3.960645 3.619152
## 11 1.070628 1.075318 1.646557 1.258055 1.156703 1.232273 1.183187 1.185769
## 17 18 19 20 21 22 23 24
## 1 2.354929 2.282663 2.287176 2.445450 2.396456 2.361658 1.174518 1.070666
## 3 1.729445 1.944228 1.758757 1.849844 1.699850 1.863816 1.835874 1.000000
## 4 1.409395 1.376582 1.327601 1.339071 1.088527 1.069439 1.048286 1.000000
## 6 2.516512 2.552254 2.552852 2.543446 2.209612 2.224696 2.510837 2.626371
## 9 3.813443 3.816992 4.074554 3.752327 3.145022 3.404574 2.341535 3.966292
## 11 1.184087 1.201166 1.139579 1.231112 1.085433 1.074097 1.050367 1.000000
## 25 26 27 28 29 30 id
## 1 1.016296 1.047746 1.114932 1.098769 1.084010 1.000000 ENCFF108MOZ.bigWig
## 3 2.197250 2.048290 2.279527 2.594875 2.277196 2.265277 ENCFF108MOZ.bigWig
## 4 1.060080 1.131055 1.157410 1.244297 1.131133 1.110917 ENCFF108MOZ.bigWig
## 6 2.297987 1.780551 1.079341 1.025383 1.000000 1.029757 ENCFF108MOZ.bigWig
## 9 3.977840 1.815847 4.286488 1.000000 1.000000 1.000000 ENCFF108MOZ.bigWig
## 11 1.182682 1.159927 1.026690 1.174469 1.097615 2.444871 ENCFF108MOZ.bigWig
## loci up
## 1 chr1:49365000-50570000 FALSE
## 3 chr1:2370000-3505000 FALSE
## 4 chr1:3980000-5925000 TRUE
## 6 chr1:152085000-152955000 FALSE
## 9 chr10:37065000-38300000 FALSE
## 11 chr10:131980000-133695000 TRUE
calc_dmr_means_scaled <- function(id){
df <- norm_window_df[norm_window_df$id == id, ]
#keep <- is.finite(rowSums(df[ ,1:30]))
#df <- df[keep, ]
sample_means_up <- colMeans(df[df$up == TRUE, 1:30], na.rm = TRUE)
sample_means_up <- sample_means_up - (min(sample_means_up) -1)
sample_means_down <- colMeans(df[df$up == FALSE, 1:30], na.rm = TRUE)
sample_means_down <- sample_means_down - (min(sample_means_down) -1)
out_df <- rbind(sample_means_up, sample_means_down) %>% data.frame()
out_df$direction <- c("Up", "Down")
out_df$id <- id
return(out_df)
}
norm_window_scaled_df <- lapply(sample_ids, calc_dmr_means_scaled) %>%
do.call(rbind, .) %>% data.frame()
window_melt <- reshape2::melt(norm_window_scaled_df)
## Using direction, id as id variables
#window_melt <- melt(all_window_df)
head(window_melt)
## direction id variable value
## 1 Up ENCFF108MOZ.bigWig X1 1.086362
## 2 Down ENCFF108MOZ.bigWig X1 1.344499
## 3 Up ENCFF735TXC.bigWig X1 1.000000
## 4 Down ENCFF735TXC.bigWig X1 1.066396
## 5 Up N2P_32F_H3K9me3_r1_FE_sorted.bw X1 1.074708
## 6 Down N2P_32F_H3K9me3_r1_FE_sorted.bw X1 1.000000
window_melt$variable <- str_remove(string = window_melt$variable, pattern = "X")
window_melt$variable <- as.numeric(window_melt$variable)
window_melt$pooled <- !grepl(pattern = "F_", x = window_melt$id)
#window_melt <- window_melt[window_melt$loci %in% norm_dat$loci, ]
#window_melt[window_melt$id == "ENCFF067LWX.bigWig", ]$value <-
# log(window_melt[window_melt$id == "ENCFF067LWX.bigWig", ]$value +1)
ids <- unique(window_melt$id)
id <- ids[1]
#window_melt <- window_melt[window_melt$pooled == TRUE, ]
fibroblast <- c("ENCFF735TXC.bigWig")
ESC <- c("ENCFF108MOZ.bigWig")
Primed <- c("Primed_32F_H3K9me3_FE_sorted.bw", "Primed_38F_H3K9me3_FE_sorted.bw")
Naive <- c("RL2011_2020_02_06_P33_32F_Naive_SR_clone1_H3K9me3_ChIP_S7_merged_lanes_dedup_FE.bigwig")
TNT <- c("TNT_32F_H3K9me3_r1_FE_sorted.bw", "TNT_32F_H3K9me3_r2_FE_sorted.bw", "TNT_38F_H3K9me3_FE_sorted.bw")
N2P <- c("N2P_38F_H3K9me3_r1_FE_sorted.bw", "N2P_38F_H3K9me3_r2_FE_sorted.bw")
bw_files <- c(fibroblast, ESC, Primed, Naive, TNT, N2P)
chip_keep <- list(Fibroblast=fibroblast, ESC=ESC, Primed=Primed, Naive=Naive, TNT=TNT, N2P=N2P) %>%
do.call(rbind, .) %>% reshape2::melt() %>% .[ ,c(1,3)] %>% unique()
## Warning in rbind(...): number of columns of result is not a multiple of vector
## length (arg 3)
window_melt <- window_melt[window_melt$id %in% chip_keep$value, ]
ind <- match(window_melt$id, chip_keep$value)
window_melt$group <- chip_keep$Var1[ind]
gg_mch_chip <- ggplot(data = window_melt, mapping = aes(x = variable, y = value,
group=group, col=group, fill=group)) +
facet_grid(direction~., scales = "free_y") +
stat_summary(geom = "line", na.rm = TRUE, size=0.5, alpha = 0.7) +
#geom_smooth(na.rm = TRUE, span = 0.25) +
#stat_summary(geom = 'ribbon', fun.data = mean_se, alpha=0.2, na.rm=TRUE) +
#scale_fill_manual(values = pal) +
#scale_color_manual(values = pal) +
#scale_color_colorblind() +
ylab("Normalised H3K9me3 ChIP-seq enrichment") +
xlab("") +
geom_vline(xintercept = c(11,20), linetype="dashed", size=line_mm) +
theme_bw() +
theme(plot.background = element_blank(),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_blank(),
strip.text.y = element_text(size = 6),
text = element_text(size=6),
strip.background = element_blank(),
legend.position = "right",
axis.line.x = element_line(color = 'black', size = line_mm),
axis.text.y = element_text(color = 'black'),
axis.line.y = element_line(color = 'black', size = line_mm),
axis.ticks.y = element_line(color = 'black', size = line_mm),
axis.ticks.x = element_blank(),
axis.text.x = element_blank())
gg_mch_chip
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
ggsave(filename = "ChIPseq/plots/mch_dmr_h3k9me3_summary_plot.pdf", width = 3.5, height = 2)
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
pdf("wgbs/plots/mch_dmr_summary_plot.pdf", width = 2.75, height = 4)
plot_grid(gg_mch, gg_mch_chip, align = "hv", ncol = 1)
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
dev.off()
## quartz_off_screen
## 2
openxlsx::addWorksheet(wb_fig3, sheetName = "Fig_3h")
openxlsx::writeData(wb = wb_fig3, sheet = "Fig_3h",
x = gg_mch_chip$data)
openxlsx::addWorksheet(wb_ed_fig4, sheetName = "Fig_4e2")
openxlsx::writeData(wb = wb_ed_fig4, sheet = "Fig_4e2",
x = gg_mch_chip$data)
ICR plots for Fig. 3j and Fig. S4
icr_gr <- readRDS("resources/imprint_control_regions_granges_hg19.Rds")
icr_mCG <- readRDS("wgbs/processed_data/icr_mcg_all.Rds")
icr_endpoint <- c("RL417", "RL418", "RL1751_1771", "RL703", ## Primed
"RL1124", "RL1125", # TNT
"RL936", "RL937", "RL837", #NtP
"RL2351_merge", "RL2352_merge",
"SRR1561745_merge", "SRS004213", "SRS114877",
"SRS606777", "SRS606778") ## ESC
icr_endpoint_df <- icr_mCG[ ,icr_endpoint] %>% data.frame()
icr_endpoint_df$class <- as.character(icr_gr$Categoly)
icr_endpoint_df <- reshape2::melt(icr_endpoint_df)
## Using class as id variables
icr_endpoint_ind <- match(icr_endpoint_df$variable, mdat$Library_id)
icr_endpoint_ind2 <- match(icr_endpoint, mdat$Library_id)
icr_endpoint_df$id <- factor(mdat$Manuscript.Name[icr_endpoint_ind],
levels=mdat$Manuscript.Name[icr_endpoint_ind2])
icr_endpoint_df$group <- factor(mdat$State[icr_endpoint_ind],
levels = c("Primed", "TNT", "NtP",
"Naive", "ESC"))
gg_icr_endpoint_box <- ggplot(icr_endpoint_df, aes(x = id, y = value,
fill=group, alpha=0.8)) +
geom_boxplot(outlier.shape = NA) +
facet_grid(class~group, drop = TRUE, space = "free_x", scales = "free_x") +
scale_fill_manual(values = reprog_pal2[c(2,1,3,4)]) +
ylab("ICR mCG/CG") + xlab("") +
sams_pub_theme()
gg_icr_endpoint_box_maternal <- ggplot(icr_endpoint_df[icr_endpoint_df$class == "Maternal germline ICR", ],
aes(x = id, y = value, fill=group)) +
geom_boxplot(outlier.shape = NA, lwd=0.18) +
facet_grid(.~group, drop = TRUE, space = "free_x", scales = "free_x") +
geom_hline(yintercept = c(0.25, 0.75), linetype="dashed", size=line_mm) +
scale_fill_manual(values = reprog_pal2[c(2,1,3,4)]) +
ylab("ICR mCG/CG") + xlab("") +
sams_pub_theme()
gg_icr_endpoint_box_NOT_maternal <- ggplot(icr_endpoint_df[icr_endpoint_df$class == "Maternal germline ICR", ],
aes(x = id, y = value, fill=group)) +
geom_boxplot(outlier.shape = NA, lwd=0.18) +
facet_grid(.~group, drop = TRUE, space = "free_x", scales = "free_x") +
geom_hline(yintercept = c(0.25, 0.75), linetype="dashed", size=line_mm) +
scale_fill_manual(values = reprog_pal2[c(2,1,3,4)]) +
ylab("ICR mCG/CG") + xlab("") +
sams_pub_theme()
pdf("wgbs/plots/icr_endpoint_boxplots_maternal_germline.pdf",
height = 2, width = 3)
gg_icr_endpoint_box_maternal
dev.off()
## quartz_off_screen
## 2
pdf("wgbs/plots/icr_endpoint_boxplots_NOT_maternal_germline.pdf",
height = 3.5, width = 3)
gg_icr_endpoint_box_NOT_maternal
dev.off()
## quartz_off_screen
## 2
gg_icr_endpoint_box_maternal
openxlsx::addWorksheet(wb_fig3, sheetName = "Fig_3j")
openxlsx::writeData(wb = wb_fig3, sheet = "Fig_3j",
x = gg_icr_endpoint_box_maternal$data)
Do some ‘wider’ statistics for ICRs, as asked by reviewer 4
icr_df <- icr_mCG[ ,c("RL415", "RL702", icr_endpoint)] %>% data.frame()
## Setup anova for each ICR
groups <- factor(mdat$State[match(colnames(icr_df), mdat$Library_id)])
design <- model.matrix(~ 0+groups)
colnames(design) <- c("Fibroblast", "ESC", "NtP", "Primed", "TNT")
contrast.matrix <- makeContrasts(Fibroblast - Primed,
Fibroblast - TNT,
Fibroblast - NtP,
Primed - TNT,
Primed - NtP,
levels = design)
fit <- lmFit(icr_df, design)
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)
tt_primed <- topTable(fit2, coef=1, adjust="BH",
number = nrow(icr_df), sort="none")
#tt_primed <- tt_primed[match(rownames(icr_df), rownames(tt_primed)), ]
tt_tnt <- topTable(fit2, coef=2, adjust="BH",
number = nrow(icr_df), sort="none")
tt_ntp <- topTable(fit2, coef=3, adjust="BH",
number = nrow(icr_df), sort="none")
row_dat <- data.frame(row.names = rownames(icr_df),
class = icr_gr$Categoly,
Primed = as.numeric(tt_primed$adj.P.Val < 0.05),
TNT = as.numeric(tt_tnt$adj.P.Val < 0.05),
NtP = as.numeric(tt_ntp$adj.P.Val < 0.05)
)
col_dat <- data.frame(row.names = colnames(icr_df),
group = groups)
icr_hm <- pheatmap(icr_df[ ,1:11],
annotation_row = row_dat,
fontsize = 6,
annotation_col = col_dat,
labels_col = mdat$Manuscript.Name[match(colnames(icr_df),
mdat$Library_id)][1:11],
labels_row = icr_gr$Name,
cluster_cols = FALSE, cluster_rows = FALSE,
gaps_col = c(2, 6, 8),
gaps_row = c(29, 31, 46))
pdf("wgbs/plots/icr_endpoint_annotated_heatmap.pdf", width = 5)
icr_hm
dev.off()
## quartz_off_screen
## 2
icr_hm
wb_ed_fig6 <- openxlsx::createWorkbook()
openxlsx::addWorksheet(wb_ed_fig6, sheetName = "Fig_6a")
openxlsx::writeData(wb = wb_ed_fig6, sheet = "Fig_6a",
x = icr_df[ ,1:11])
Plot FACS sorted cell data for NSC differentiations
nsc_dat <- read.csv(file = "differentiation_expts/NSC_facs_daniel_20221102.csv", header = FALSE)
nsc_dat <- reshape2::melt(nsc_dat)
## Using V1 as id variables
grp <- str_split(string = nsc_dat$V1, pattern = "_", n = 2, simplify = TRUE) %>% data.frame()
colnames(grp) <- c("Background", "Group")
grp$Group[grp$Group == "d8"] <- "Primed-hiPSC"
grp$Group[grp$Group == "TNT"] <- "TNT-hiPSC"
grp$Group[grp$Group == "ESC"] <- "hESC"
grp$Group[grp$Group == "hESC" & grp$Background == "Mel1"] <- "hESC (MEL1)"
grp$Group[grp$Group == "hESC" & grp$Background == "H9"] <- "hESC (H9)"
grp$Background[grp$Group %in% c("hESC (MEL1)", "hESC (H9)")] <- "hESC"
nsc_dat <- cbind(nsc_dat, grp)
nsc_dat$Background <- factor(nsc_dat$Background, levels=c("hESC", "32F", "Mel1", "MSC", "NHEK"))
nsc_dat <- nsc_dat[!is.na(nsc_dat$value), ]
nsc_dat <- nsc_dat[nsc_dat$V1 %in% c("32F_d8", "32F_TNT", "H9_ESC"), ]
gg_msc <- ggplot(data = nsc_dat, aes(x = Group, y = value, fill=Group, colour="black")) +
geom_point(size=2, alpha=0.7, shape=21) +
scale_color_manual(values = c("#000000")) +
scale_fill_manual(values = c("#5B7876", "#5B7876", "#009e73", "#eebc4c")) +
stat_summary(fun.data=mean_sdl, fun.args = list(mult=1),
geom="errorbar", color="black", width=0.1,
position = position_nudge(x = 0.25)) +
stat_summary(fun=mean, geom="point", color="black",
position = position_nudge(x = 0.25)) +
ylab("NCAM+ / FAP- cells (%)") + xlab("") +
facet_grid(.~Background, scales = "free_x", space = "free", drop = TRUE) +
scale_y_continuous(breaks = c(10,20,30,40,50,60)) +
theme_bw(base_size = 7, base_line_size = line_mm) +
theme(axis.text.x = element_text(angle = 45, hjust = 1,
colour = 'black'),
plot.background = element_blank(),
panel.grid.minor = element_blank(),
panel.grid.major.y = element_line(size = line_mm, colour = "grey30", linetype = "dotted"),
legend.position = "none")
nsc_dat
## V1 variable value Background Group
## 1 32F_d8 V2 4.1 32F Primed-hiPSC
## 2 32F_TNT V2 11.2 32F TNT-hiPSC
## 10 H9_ESC V2 20.8 hESC hESC (H9)
## 11 32F_d8 V3 0.5 32F Primed-hiPSC
## 12 32F_TNT V3 17.2 32F TNT-hiPSC
## 20 H9_ESC V3 38.3 hESC hESC (H9)
## 21 32F_d8 V4 6.9 32F Primed-hiPSC
## 22 32F_TNT V4 16.4 32F TNT-hiPSC
## 30 H9_ESC V4 17.7 hESC hESC (H9)
## 31 32F_d8 V5 5.1 32F Primed-hiPSC
## 32 32F_TNT V5 21.8 32F TNT-hiPSC
## 40 H9_ESC V5 16.7 hESC hESC (H9)
## 41 32F_d8 V6 3.3 32F Primed-hiPSC
## 42 32F_TNT V6 5.6 32F TNT-hiPSC
## 50 H9_ESC V6 13.1 hESC hESC (H9)
## 51 32F_d8 V7 0.7 32F Primed-hiPSC
## 52 32F_TNT V7 6.3 32F TNT-hiPSC
## 60 H9_ESC V7 10.9 hESC hESC (H9)
## 61 32F_d8 V8 0.5 32F Primed-hiPSC
## 62 32F_TNT V8 56.9 32F TNT-hiPSC
## 71 32F_d8 V9 2.6 32F Primed-hiPSC
## 72 32F_TNT V9 6.9 32F TNT-hiPSC
## 81 32F_d8 V10 0.4 32F Primed-hiPSC
## 82 32F_TNT V10 7.5 32F TNT-hiPSC
pdf(file = "differentiation_expts/nsc_ncam_fap_plots.pdf", width = 4, height = 3.5)
gg_msc
dev.off()
## quartz_off_screen
## 2
gg_msc
openxlsx::addWorksheet(wb_fig3, sheetName = "Fig_3m")
openxlsx::writeData(wb = wb_fig3, sheet = "Fig_3m",
x = gg_msc$data)
Plot NSC CG methylation in corrected CG DMRs
# Import the methylation data ---------------------------------------------
# Create a list of the BSseq objects that have the CG methylation data
bs_obj_list <- c("wgbs/CG_bsobj/RL702_all_lanes_merged_CG_bsseq_obj.Rds",
"wgbs/CG_bsobj/RL703_all_merge_CG_bsseq_obj.Rds",
"wgbs/CG_bsobj/RL1812_2019_09_19_38F_NSCs_primed_p8_S3_all_lanes_merge_CG_bsseq_obj.Rds",
"wgbs/CG_bsobj/RL837_2017_09_28_P12_plus_10_38F_SmithR_to_E8_merge_CG_bsseq_obj.Rds",
"wgbs/CG_bsobj/RL1811_2019_09_19_38F_NSCs_N2P_p8_S2_all_lanes_merge_CG_bsseq_obj.Rds")
all_dmrs <- readRDS("wgbs/processed_data/all_dmrs_annotated_granges.Rds")
# keep only corrected DMRs for this analysis
corrected_dmrs <- all_dmrs[all_dmrs$any_corrected == TRUE]
nsc_dmr_mCG <- make_mC_matrix(obj_fls = bs_obj_list,
gr = corrected_dmrs, cores = 4)
## Making matrix of mC levels for regions...
dim(nsc_dmr_mCG)
## [1] 2207 5
colnames(mcols(corrected_dmrs))
## [1] "L" "area" "beta"
## [4] "stat" "pval" "qval"
## [7] "index" "g1_mean" "g2_mean"
## [10] "delta" "Smith_H9_KSR" "Lister_H9_E8"
## [13] "Lister_MEL1_E8" "HESO7" "HESO8"
## [16] "Ecker_H9" "Ecker_H1" "RL1751_1771"
## [19] "RL417" "RL418" "RL703"
## [22] "blacklist" "significant" "class"
## [25] "direction" "esc_n2p_d13_delta" "esc_n2p_p10_delta"
## [28] "esc_ips_delta" "esc_naive_delta" "loci"
## [31] "tnt_corrected" "ntp_corrected" "both_corrected"
## [34] "any_corrected" "correction_class"
mC_esc <- mcols(corrected_dmrs)[ ,c("g1_mean")]
all_mCG <- cbind(mC_esc, nsc_dmr_mCG)
all_mCG <- all_mCG[complete.cases(all_mCG), ]
primed_means <- rowMeans(as.matrix(all_mCG[ ,c("RL1812_2019_09_19_38F_NSCs_primed_p8_S3_all_lanes_merge_CG_bsseq_obj.Rds",
"RL703_all_merge_CG_bsseq_obj.Rds")]), na.rm = TRUE)
all_mCG <- as.data.frame(all_mCG)
all_mCG <- all_mCG[complete.cases(all_mCG), ]
ind <- order(abs(all_mCG$mC_esc - primed_means))
colnames(all_mCG) <- c("hESCs (mean)", "day 0 (38F)",
"Primed-hiPSC (38F)", "Primed-NSC (38F)",
"NtP-hiPSC (38F)", "NtP-NSC (38F)")
ind <- order(abs(all_mCG$`hESCs (mean)` - primed_means))
pdf(file = "wgbs/plots/memory_dmr_NSC_diff_38F_heatmap.pdf",
width = 2.5, height = 5)
pheatmap(all_mCG[ind, ],
cluster_cols = FALSE, gaps_col = c(1,2,4),
border_color = NA, cluster_rows = FALSE, show_rownames = FALSE)
dev.off()
## quartz_off_screen
## 3
png(file = "wgbs/plots/memory_dmr_NSC_diff_38F_heatmap.png", width = 2,
height = 2, units = 'in', res = 600, pointsize = 7)
pheatmap(all_mCG[ind, ],
cluster_cols = FALSE, gaps_col = c(1,2,4),
border_color = NA, cluster_rows = FALSE, show_rownames = FALSE)
dev.off()
## quartz_off_screen
## 3
openxlsx::addWorksheet(wb_fig3, sheetName = "Fig_3o")
openxlsx::writeData(wb = wb_fig3, sheet = "Fig_3o",
x = all_mCG[ind, ])
NSC CG methylation in CH-DMRs
#CH_dmr <- bed_to_gr("resources/nature13551-s3.bed")
CH_dmr <- bed_to_gr("wgbs/processed_data/mCH_DMR_primed_demethylated.bed")
rds_paths <- c("wgbs/CG_bsobj/RL1812_2019_09_19_38F_NSCs_primed_p8_S3_all_lanes_merge_CG_bsseq_obj.Rds",
"wgbs/CG_bsobj/RL1811_2019_09_19_38F_NSCs_N2P_p8_S2_all_lanes_merge_CG_bsseq_obj.Rds",
"wgbs/CG_bsobj/RL702_all_lanes_merged_CG_bsseq_obj.Rds")
make_window_matrix <- function(rds_path, gr, bins=30){
message(basename(rds_path))
gr_expand <- gr + width(gr)
gr_expand$loci <- gr_to_loci(gr_expand)
gr_tiles <- tile(gr_expand, n = bins)
add_loci_to_grl <- function(x){
grt <- gr_tiles[[x]]
grt$loci <- gr_to_loci(gr_expand[x])
return(grt)
}
read_bs_obj <- function(rds_path){
stopifnot(file.exists(rds_path))
message(str_c("Reading ", rds_path))
message(Sys.time())
bs_obj <- readRDS(file = rds_path)
return(bs_obj)
}
# Calculate C coverage and methylation for ranges. Returns GRanges object
calc_mC_window <- function(bs_obj, gr){
message("Calculating coverage...")
gr$Cov <- getCoverage(BSseq = bs_obj, regions = gr, type = "Cov",
what = "perRegionTotal")
message("Calculating M...")
gr$M <- getCoverage(BSseq = bs_obj, regions = gr, type = "M",
what = "perRegionTotal")
message("Calculating methylation percentage...")
gr$pc <- gr$M / gr$Cov
return(gr)
}
gr_tiles <- lapply(1:length(gr_tiles), add_loci_to_grl)
gr_tiles <- GRangesList(gr_tiles)
gr_tiles <- unlist(gr_tiles)
bs_obj <- read_bs_obj(rds_path)
mC_dat <- calc_mC_window(bs_obj = bs_obj, gr = gr_tiles)
dat <- matrix(data = mC_dat$pc, nrow = length(gr), byrow = TRUE)
dat <- data.frame(dat)
rownames(dat) <- gr_to_loci(gr)
colnames(dat) <- 1:ncol(dat)
dat$id <- basename(rds_path)
return(dat)
}
dfl <- lapply(rds_paths, make_window_matrix, gr=CH_dmr)
## RL1812_2019_09_19_38F_NSCs_primed_p8_S3_all_lanes_merge_CG_bsseq_obj.Rds
## Reading wgbs/CG_bsobj/RL1812_2019_09_19_38F_NSCs_primed_p8_S3_all_lanes_merge_CG_bsseq_obj.Rds
## 2023-06-23 17:15:01
## Calculating coverage...
## Calculating M...
## Calculating methylation percentage...
## RL1811_2019_09_19_38F_NSCs_N2P_p8_S2_all_lanes_merge_CG_bsseq_obj.Rds
## Reading wgbs/CG_bsobj/RL1811_2019_09_19_38F_NSCs_N2P_p8_S2_all_lanes_merge_CG_bsseq_obj.Rds
## 2023-06-23 17:15:08
## Calculating coverage...
## Calculating M...
## Calculating methylation percentage...
## RL702_all_lanes_merged_CG_bsseq_obj.Rds
## Reading wgbs/CG_bsobj/RL702_all_lanes_merged_CG_bsseq_obj.Rds
## 2023-06-23 17:15:13
## Calculating coverage...
## Calculating M...
## Calculating methylation percentage...
dfl <- do.call(rbind, dfl)
saveRDS(dfl, "wgbs/processed_data/NSC_mCG_in_mCH_DMR_window_dat_all.Rds")
id_ind <- match(dfl$id, basename(mdat$BSseq_CG))
dfl$id <- mdat$Library_id[id_ind]
### Normalise to max (flank normalisation)
max_norm <- function(x){
vec <- dfl[x, 1:30] / max(dfl[x, 1:30], na.rm = TRUE)
return(vec)
}
dfl_norm <- lapply(X = 1:nrow(dfl), max_norm) %>% do.call(rbind, .)
dfl_norm$id <- dfl$id
dfl_norm$loci <- dfl$loci
norm_dat <- reshape2::melt(dfl_norm)
## Using id as id variables
norm_dat2 <- norm_dat %>% dplyr::group_by(id, variable) %>%
dplyr::summarise(value=mean(.data$value, na.rm=TRUE))
## `summarise()` has grouped output by 'id'. You can override using the `.groups`
## argument.
norm_dat2 <- data.frame(norm_dat2)
## Scale to max=1 for flank normalisation to show relative mCG difference
max_norm2 <- function(id){
dat_sub <- norm_dat2[norm_dat2$id == id, ]
dat_sub$scaled <- dat_sub$value / max(dat_sub$value, na.rm = TRUE)
return(dat_sub)
}
norm_dat2 <- lapply(unique(norm_dat2$id), max_norm2) %>% do.call(rbind, .)
gg_nsc_mch_dmr <- ggplot(data = norm_dat2, mapping = aes(x = variable, y = scaled,
group=id, fill=id,
colour=id)) +
geom_line() +
scale_color_colorblind() +
ylab("Normalised mCA/CA") +
xlab("") +
geom_vline(xintercept = c(11,20), linetype="dashed", size=line_mm) +
theme_bw() +
theme(plot.background = element_blank(),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_blank(),
strip.text.y = element_text(size = 6),
text = element_text(size=6),
strip.background = element_blank(),
legend.position = "right",
axis.line.x = element_line(color = 'black', size = line_mm),
axis.text.y = element_text(color = 'black'),
axis.line.y = element_line(color = 'black', size = line_mm),
axis.ticks.y = element_line(color = 'black', size = line_mm),
axis.ticks.x = element_blank(),
axis.text.x = element_blank())
gg_nsc_mch_dmr
openxlsx::addWorksheet(wb_fig3, sheetName = "Fig_3p")
openxlsx::writeData(wb = wb_fig3, sheet = "Fig_3p",
x = gg_nsc_mch_dmr$data)
Per-read methylation values for ICR in Extended data Fig 6b
mdat <- read.csv("wgbs/metadata/wgbs_metadata_local.csv")
icr_gr <- readRDS("resources/imprint_control_regions_granges_hg19.Rds")
mcols(icr_gr) <- icr_gr$Categoly
per_read_list <- list.files("wgbs/ICR_per_read_data/",
pattern = "_per_read_ICR_mC_calls.txt", full.names = TRUE)
process_per_read <- function(per_read_file){
dat <- read.table(per_read_file, colClasses = c("character", "numeric", "character", "character"))
dat$read_length <- nchar(dat$V4)
count_character <- function(x, char){sum(unlist(strsplit(dat$V4[x], "")) == char)}
dat$mCG_count <- lapply(1:nrow(dat), count_character, char = "X") %>% unlist()
dat$CG_count <- lapply(1:nrow(dat), count_character, char = "x") %>% unlist()
# remove rows with all 0
dat <- dat[rowSums(dat[ ,c("mCG_count", "CG_count")]) > 0, ]
dat$read_mCG <- dat$mCG_count / (dat$mCG_count + dat$CG_count)
dat$file <- basename(per_read_file) %>% str_remove("_per_read_ICR_mC_calls.txt")
return(dat)
}
per_read_mat <- lapply(per_read_list, process_per_read) %>% do.call(rbind, .)
per_read_mat$read_mCG <- as.numeric(per_read_mat$read_mCG)
per_read_mat$read_class <- "Discordant"
per_read_mat$read_class[per_read_mat$read_mCG == 1] <- "Methylated"
per_read_mat$read_class[per_read_mat$read_mCG == 0] <- "Unmethylated"
per_read_gr <- GRanges(seqnames = per_read_mat$V1, ranges = IRanges(per_read_mat$V2, per_read_mat$V2 + per_read_mat$read_length))
mcols(per_read_gr) <- per_read_mat
icr_olaps <- findOverlaps(per_read_gr, icr_gr, type = 'within')
per_read_gr <- per_read_gr[icr_olaps@from]
mcols(per_read_gr) <- cbind(mcols(per_read_gr), mcols(icr_gr)[icr_olaps@to, ])
per_read_mat <- as.data.frame(per_read_gr)
colnames(per_read_mat)[16] <- "Category"
per_read_mat$Category[grepl("Secondary", per_read_mat$Categoly)] <- "Secondary DMR"
per_read_mat$file <- str_c(per_read_mat$file, ".bam")
bam_ind <- match(per_read_mat$file, basename(mdat$BAM))
per_read_mat$Group <- mdat$Group[bam_ind]
per_read_mat$Short_name <- mdat$Manuscript.Name[bam_ind]
per_read_mat$Short_name <- factor(per_read_mat$Short_name)
per_read_mat$State <- mdat$State[bam_ind]
per_read_mat$State <- factor(per_read_mat$State, levels= c("Early", "Naive", "Primed", "TNT", "NtP", "ESC"))
per_read_mat_counts <- per_read_mat %>% dplyr::group_by(Short_name, read_class, Category, State) %>% dplyr::tally()
# Remove groups that didn't end up in paper
per_read_mat_counts <- per_read_mat_counts[!grepl(pattern = "Placenta", per_read_mat_counts$Category), ]
per_read_mat_counts <- per_read_mat_counts[per_read_mat_counts$State != "ESC", ]
gg_icr_per_read <- ggplot(data = per_read_mat_counts[!grepl(pattern = "Placenta", per_read_mat_counts$Category), ],
aes(x = Short_name, y = n, fill = read_class)) +
geom_bar(position = "fill", stat = "identity") +
facet_grid(Category~State, scales = 'free', space = 'free', drop = TRUE) +
theme_linedraw() +
scale_y_continuous(expand = c(0,0)) +
xlab(label = "") +
ylab(label = "Proportion of WGBS reads in ICRs") +
theme_bw() +
theme(
plot.background = element_blank()
,panel.grid.major = element_blank()
,panel.grid.minor = element_blank()
,panel.border = element_blank(),
axis.text = element_text(colour = "black"),
axis.line = element_line(color = 'black'),
axis.text.x = element_text(angle = 90, size=8,
hjust = 1, vjust = 0.5)
) +
# Edit legend
scale_fill_manual(values = vitC[c(1,3,5)], name="Read class") + sams_pub_theme(legend_pos = "right")
pdf("wgbs/plots/ICR_per_read_mCG_proportion_bar_plots.pdf", width = 6, height = 4)
gg_icr_per_read
dev.off()
## quartz_off_screen
## 2
gg_icr_per_read
openxlsx::addWorksheet(wb_ed_fig6, sheetName = "Fig_6b")
openxlsx::writeData(wb = wb_ed_fig6, sheet = "Fig_6b",
x = gg_icr_per_read$data)
X-chromosome analysis. Extended Data Figure 6c-d
### Bin chrX and profile mCG -------------------
### Make the bins
## Calculate window mCA/CA for DMR testing
gr_genome <- GRanges(seqinfo(BSgenome.Hsapiens.UCSC.hg19))
gr_chrX <- gr_genome[seqnames(gr_genome) == "chrX"]
#chrX_windows <- slidingWindows(x = gr_chrX, width = 5000, step = 1000)
chrX_windows <- tile(x = gr_chrX, width = 5000)
chrX_windows <- unlist(chrX_windows)
chrX_ids <- c("SRS114877", "RL1124", "RL1125", "RL1751_1771", "RL1752_1772", "RL412",
"RL415", "RL417", "RL418", "RL702", "RL703", "RL837", "RL838", "RL936",
"RL937", "SRS606777")
chrX_matrix <- make_mC_matrix(obj_fls = mdat[mdat$Library_id %in% chrX_ids],
gr = chrX_windows, cores = 3)
saveRDS(chrX_matrix, "wgbs/processed_data/chrX_5kb_bin_mCG.Rds")
chrX_matrix <- readRDS("wgbs/processed_data/chrX_5kb_bin_mCG.Rds")
ind <- match(colnames(chrX_matrix), mdat$Library_id)
colnames(chrX_matrix) <- mdat$Library_id[ind]
head(chrX_matrix)
## SRS114877 RL1124 RL1125 RL1751_1771 RL1752_1772 RL412 RL415
## chrX:1-4999 NA NA NA NA NA NA NA
## chrX:5000-9999 NA NA NA NA NA NA NA
## chrX:10000-14999 NA NA NA NA NA NA NA
## chrX:15000-19999 NA NA NA NA NA NA NA
## chrX:20000-24999 NA NA NA NA NA NA NA
## chrX:25000-29999 NA NA NA NA NA NA NA
## RL417 RL418 RL702 RL703 RL837 RL838 RL936 RL937 SRS606777
## chrX:1-4999 NA NA NA NA NA NA NA NA NA
## chrX:5000-9999 NA NA NA NA NA NA NA NA NA
## chrX:10000-14999 NA NA NA NA NA NA NA NA NA
## chrX:15000-19999 NA NA NA NA NA NA NA NA NA
## chrX:20000-24999 NA NA NA NA NA NA NA NA NA
## chrX:25000-29999 NA NA NA NA NA NA NA NA NA
chrX_matrix_cc <- chrX_matrix[complete.cases(chrX_matrix), ]
### plot the correlation heatmap
## Plot the correlation heatmap
cormat <- round(cor(chrX_matrix_cc, method='pearson'),5)
head(cormat)
## SRS114877 RL1124 RL1125 RL1751_1771 RL1752_1772 RL412 RL415
## SRS114877 1.00000 0.69295 0.69346 0.77650 0.68884 0.35501 0.41969
## RL1124 0.69295 1.00000 0.82288 0.74694 0.73605 0.43594 0.60725
## RL1125 0.69346 0.82288 1.00000 0.70879 0.72934 0.49858 0.60337
## RL1751_1771 0.77650 0.74694 0.70879 1.00000 0.70493 0.29921 0.42185
## RL1752_1772 0.68884 0.73605 0.72934 0.70493 1.00000 0.41827 0.51285
## RL412 0.35501 0.43594 0.49858 0.29921 0.41827 1.00000 0.52825
## RL417 RL418 RL702 RL703 RL837 RL838 RL936 RL937
## SRS114877 0.67773 0.62642 0.27563 0.66524 0.72680 0.36693 0.68277 0.66951
## RL1124 0.84268 0.72787 0.42743 0.82521 0.76235 0.43653 0.86584 0.82985
## RL1125 0.79830 0.72031 0.47982 0.76069 0.74902 0.51934 0.80616 0.82912
## RL1751_1771 0.71708 0.58500 0.26933 0.72837 0.75036 0.31489 0.74226 0.69658
## RL1752_1772 0.71228 0.66796 0.40329 0.73908 0.79031 0.47486 0.74270 0.72430
## RL412 0.46954 0.57105 0.49877 0.45134 0.37336 0.77342 0.47912 0.42308
## SRS606777
## SRS114877 0.69475
## RL1124 0.83618
## RL1125 0.81137
## RL1751_1771 0.69961
## RL1752_1772 0.71748
## RL412 0.46683
library(reshape2)
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
##
## dcast, melt
melted_cormat <- melt(cormat)
head(melted_cormat)
## Var1 Var2 value
## 1 SRS114877 SRS114877 1.00000
## 2 RL1124 SRS114877 0.69295
## 3 RL1125 SRS114877 0.69346
## 4 RL1751_1771 SRS114877 0.77650
## 5 RL1752_1772 SRS114877 0.68884
## 6 RL412 SRS114877 0.35501
annot_dat <- mdat[ind ,c("State", "Media")] %>% data.frame()
rownames(annot_dat) <- mdat$Manuscript.Name[ind]
colnames(cormat) <- mdat$Manuscript.Name[ind]
rownames(cormat) <- mdat$Manuscript.Name[ind]
pdf("wgbs/plots/chrX_5kb_bin_mCG_correlation_heatmap.pdf", width = 8, height = 6)
pheatmap(mat = cormat, annotation_col = annot_dat, border_color = NA)
dev.off()
## quartz_off_screen
## 3
openxlsx::addWorksheet(wb_ed_fig6, sheetName = "ED_Fig_6c")
openxlsx::writeData(wb = wb_ed_fig6, sheet = "ED_Fig_6c",
x = cormat)
## Promoter analysis
# Load object generated for figure 1 promoter analysis
pro_mCG <- readRDS("wgbs/processed_data/promoter_mCG.Rds")
pro_mCG <- pro_mCG[!duplicated(pro_mCG), ]
pro_mCG <- pro_mCG[ ,colnames(pro_mCG) %in% mdat$Library_id[ind]]
table(complete.cases(pro_mCG))
##
## FALSE TRUE
## 751 31559
pro_mCG <- pro_mCG[complete.cases(pro_mCG), ]
pro_mCG_gr <- GRanges(seqnames = rownames(pro_mCG))
chrX_hit <- as.character(seqnames(pro_mCG_gr)) == "chrX"
pro_mCG_gr <- pro_mCG_gr[chrX_hit]
pro_mCG <- pro_mCG[chrX_hit, ]
chrX_cgi_gr <- bed_to_gr("resources/chrX_CG_islands.bed")
pro_mCG_gr$CGI <- overlapsAny(pro_mCG_gr, chrX_cgi_gr)
ind_pro <- match(colnames(pro_mCG), mdat$Library_id)
colnames(pro_mCG) <- mdat$Manuscript.Name[ind_pro]
pdf("wgbs/plots/chrX_promoter_mCG_heatmaps.pdf", height = 5, width = 5)
pheatmap(pro_mCG[pro_mCG_gr$CGI, ], cluster_rows = TRUE, show_rownames = FALSE,
main = "chrX CGI promoter mCG/CG",
#labels_col = labs,
clustering_distance_cols = 'correlation',
clustering_distance_rows = 'correlation')
pheatmap(pro_mCG[pro_mCG_gr$CGI == FALSE, ], cluster_rows = TRUE, show_rownames = FALSE,
main = "chrX Non-CGI promoter mCG/CG",
#annotation_col = annot_dat,
#labels_col = labs,
clustering_distance_cols = 'correlation', clustering_distance_rows = 'correlation')
dev.off()
## quartz_off_screen
## 3
chrX_pro_source_dat <- cbind(pro_mCG, data.frame(CGI=pro_mCG_gr$CGI))
openxlsx::addWorksheet(wb_ed_fig6, sheetName = "ED_Fig_6d")
openxlsx::writeData(wb = wb_ed_fig6, sheet = "ED_Fig_6d",
x = chrX_pro_source_dat)
type_32F <- readRDS("scRNAseq/32F_celltype.rds")
pdf("scRNAseq/32F_celltype_umap.pdf", width = 5, height = 4)
type_32F
dev.off()
## quartz_off_screen
## 2
type_32F
openxlsx::addWorksheet(wb_ed_fig6, sheetName = "ED_Fig_6f_type")
openxlsx::writeData(wb = wb_ed_fig6, sheet = "ED_Fig_6f_type",
x = type_32F$data)
treat_32F <- readRDS("scRNAseq/32F_treatment.rds")
pdf("scRNAseq/32F_treatment_umap.pdf", width = 4, height = 3)
treat_32F
dev.off()
## quartz_off_screen
## 2
treat_32F
UMAP
gg_treat_32F <- ggplot(data = treat_32F$data, aes(x = UMAP_1, y = UMAP_2,
colour=group)) +
geom_point(size=0.1) +
sams_pub_theme(legend_pos = "right")
gg_type_32F <- ggplot(data = type_32F$data, aes(x = UMAP_1, y = UMAP_2,
colour=clusterprop)) +
geom_point(size=0.1) +
sams_pub_theme(legend_pos = "right")
cowplot::plot_grid(gg_treat_32F, gg_type_32F)
openxlsx::addWorksheet(wb_ed_fig6, sheetName = "ED_Fig_6f_treatment")
openxlsx::writeData(wb = wb_ed_fig6, sheet = "ED_Fig_6f_treatment",
x = treat_32F$data)
Close out source data files
openxlsx::saveWorkbook(wb = wb_fig3,
file = "Figure_3_source_data.xlsx", overwrite = TRUE)
openxlsx::saveWorkbook(wb = wb_ed_fig3,
file = "ED_Figure_3cdef_source_data.xlsx",
overwrite = TRUE)
openxlsx::saveWorkbook(wb = wb_ed_fig4,
file = "ED_Figure_4abcde_source_data.xlsx",
overwrite = TRUE)
openxlsx::saveWorkbook(wb = wb_ed_fig6,
file = "ED_Figure_6abcd_source_data.xlsx",
overwrite = TRUE)
Session info
sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] reshape2_1.4.4
## [2] dplyr_1.0.10
## [3] RColorBrewer_1.1-3
## [4] XML_3.99-0.12
## [5] ggExtra_0.10.0
## [6] gprofiler2_0.2.1
## [7] gt_0.8.0
## [8] Gviz_1.40.1
## [9] edgeR_3.38.4
## [10] limma_3.52.4
## [11] UpSetR_1.4.0
## [12] gtools_3.9.4
## [13] ggdendro_0.1.23
## [14] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
## [15] ChIPpeakAnno_3.30.1
## [16] ggridges_0.5.4
## [17] ggalluvial_0.12.3
## [18] alluvial_0.1-2
## [19] VariantAnnotation_1.42.1
## [20] Rsamtools_2.12.0
## [21] ggthemes_4.2.4
## [22] cowplot_1.1.1
## [23] ggrepel_0.9.2
## [24] ggfortify_0.4.15
## [25] pheatmap_1.0.12
## [26] GenomicFeatures_1.48.4
## [27] AnnotationDbi_1.58.0
## [28] BSgenome.Hsapiens.UCSC.hg19_1.4.3
## [29] BSgenome_1.64.0
## [30] rtracklayer_1.56.1
## [31] Biostrings_2.64.1
## [32] XVector_0.36.0
## [33] data.table_1.14.6
## [34] readxl_1.4.1
## [35] openxlsx_4.2.5.1
## [36] stringr_1.5.0
## [37] magrittr_2.0.3
## [38] bsseq_1.32.0
## [39] SummarizedExperiment_1.26.1
## [40] MatrixGenerics_1.8.1
## [41] matrixStats_0.63.0
## [42] GenomicRanges_1.48.0
## [43] GenomeInfoDb_1.32.4
## [44] IRanges_2.30.1
## [45] S4Vectors_0.34.0
## [46] e1071_1.7-12
## [47] caret_6.0-93
## [48] lattice_0.20-45
## [49] ggplot2_3.4.1
## [50] Biobase_2.56.0
## [51] BiocGenerics_0.42.0
## [52] preprocessCore_1.58.0
##
## loaded via a namespace (and not attached):
## [1] rappdirs_0.3.3 ModelMetrics_1.2.2.2
## [3] R.methodsS3_1.8.2 ragg_1.2.5
## [5] tidyr_1.2.1 bit64_4.0.5
## [7] knitr_1.41 DelayedArray_0.22.0
## [9] R.utils_2.12.2 rpart_4.1.19
## [11] KEGGREST_1.36.3 hardhat_1.2.0
## [13] RCurl_1.98-1.9 AnnotationFilter_1.20.0
## [15] generics_0.1.3 lambda.r_1.2.4
## [17] RSQLite_2.2.19 proxy_0.4-27
## [19] future_1.29.0 bit_4.0.5
## [21] xml2_1.3.3 lubridate_1.9.0
## [23] httpuv_1.6.6 assertthat_0.2.1
## [25] gower_1.0.0 xfun_0.35
## [27] hms_1.1.2 evaluate_0.18
## [29] promises_1.2.0.1 fansi_1.0.4
## [31] restfulr_0.0.15 progress_1.2.2
## [33] dbplyr_2.2.1 DBI_1.1.3
## [35] htmlwidgets_1.5.4 futile.logger_1.4.3
## [37] purrr_0.3.5 ellipsis_0.3.2
## [39] backports_1.4.1 permute_0.9-7
## [41] biomaRt_2.52.0 deldir_1.0-6
## [43] sparseMatrixStats_1.8.0 vctrs_0.5.2
## [45] ensembldb_2.20.2 cachem_1.0.6
## [47] withr_2.5.0 checkmate_2.1.0
## [49] GenomicAlignments_1.32.1 prettyunits_1.1.1
## [51] cluster_2.1.4 lazyeval_0.2.2
## [53] crayon_1.5.2 labeling_0.4.2
## [55] recipes_1.0.3 pkgconfig_2.0.3
## [57] nlme_3.1-160 ProtGenerics_1.28.0
## [59] nnet_7.3-18 rlang_1.0.6
## [61] globals_0.16.2 lifecycle_1.0.3
## [63] miniUI_0.1.1.1 filelock_1.0.2
## [65] BiocFileCache_2.4.0 dichromat_2.0-0.1
## [67] VennDiagram_1.7.3 cellranger_1.1.0
## [69] graph_1.74.0 Matrix_1.5-3
## [71] Rhdf5lib_1.18.2 base64enc_0.1-3
## [73] png_0.1-8 viridisLite_0.4.1
## [75] rjson_0.2.21 bitops_1.0-7
## [77] R.oo_1.25.0 rhdf5filters_1.8.0
## [79] pROC_1.18.0 blob_1.2.3
## [81] DelayedMatrixStats_1.18.2 regioneR_1.28.0
## [83] parallelly_1.32.1 jpeg_0.1-10
## [85] scales_1.2.1 memoise_2.0.1
## [87] plyr_1.8.8 zlibbioc_1.42.0
## [89] compiler_4.2.1 BiocIO_1.6.0
## [91] cli_3.6.0 listenv_0.8.0
## [93] htmlTable_2.4.1 formatR_1.12
## [95] Formula_1.2-4 MASS_7.3-58.1
## [97] tidyselect_1.2.0 stringi_1.7.12
## [99] textshaping_0.3.6 highr_0.9
## [101] yaml_2.3.6 locfit_1.5-9.6
## [103] latticeExtra_0.6-30 tools_4.2.1
## [105] timechange_0.1.1 future.apply_1.10.0
## [107] rstudioapi_0.14 foreach_1.5.2
## [109] foreign_0.8-83 gridExtra_2.3
## [111] prodlim_2019.11.13 farver_2.1.1
## [113] digest_0.6.30 shiny_1.7.3
## [115] lava_1.7.0 Rcpp_1.0.9
## [117] later_1.3.0 httr_1.4.4
## [119] biovizBase_1.44.0 colorspace_2.1-0
## [121] splines_4.2.1 RBGL_1.72.0
## [123] multtest_2.52.0 systemfonts_1.0.4
## [125] plotly_4.10.1 xtable_1.8-4
## [127] jsonlite_1.8.3 futile.options_1.0.1
## [129] timeDate_4021.106 ipred_0.9-13
## [131] R6_2.5.1 Hmisc_4.7-2
## [133] pillar_1.8.1 htmltools_0.5.3
## [135] mime_0.12 glue_1.6.2
## [137] fastmap_1.1.0 BiocParallel_1.30.4
## [139] class_7.3-20 codetools_0.2-18
## [141] utf8_1.2.3 tibble_3.1.8
## [143] curl_4.3.3 zip_2.2.2
## [145] interp_1.1-3 survival_3.4-0
## [147] rmarkdown_2.18 InteractionSet_1.24.0
## [149] munsell_0.5.0 rhdf5_2.40.0
## [151] GenomeInfoDbData_1.2.8 iterators_1.0.14
## [153] HDF5Array_1.24.2 gtable_0.3.1