-
Notifications
You must be signed in to change notification settings - Fork 0
/
analyze_newatac.R
211 lines (155 loc) · 9.01 KB
/
analyze_newatac.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
sumcolpairs <- function(x){
y <- matrix(0,ncol=ncol(x)/2,nrow=nrow(x))
for(i in 1:ncol(y)){
y[,i] <- x[,i*2-1]+x[,i*2]
}
y
}
### Read peak annotation
newatac_ann <- read.csv("atac/mouse/ATACall_peaks.red.ann.csv",sep="\t",stringsAsFactors = FALSE)
colnames(newatac_ann)[1] <- "peakid" #or something
head(newatac_ann)
### Read background counts and figure out average counts
newatac_inv <- read.table("atac/mouse/ATACall_peaks.inv.bed",sep="\t",stringsAsFactors = FALSE)
newatac_inv_sum <- sum((newatac_inv$V3-newatac_inv$V2))
newatac_bg <- read.table("atac/mouse/counts.f.bg.csv",sep="\t",stringsAsFactors = FALSE)
newatac_bg <- apply(sumcolpairs(newatac_bg[,-(1:3)]),2,sum)
newatac_bg_avgreads <- newatac_bg/newatac_inv_sum
### Read peak counts
newatac_peaks <- read.table("atac/mouse/counts.f.peaks.csv",sep="\t",stringsAsFactors = FALSE)
newatac_peaks <- cbind(newatac_peaks[,4,drop=FALSE],sumcolpairs(newatac_peaks[,-(1:6)]))
colnames(newatac_peaks) <- c("peakid","Naive","Th2_2h","Th2_4h","Th2_24h","Th2_48h","Th2_72h") #consider other parts of the code
newatac_peaks <- smerge(newatac_peaks,newatac_ann)
#newatac_peaks <- newatac_peaks[order(newatac_peaks$Th2_4h,decreasing = TRUE),]
newatac_peaks <- newatac_peaks[order(newatac_peaks$Th2_72h,decreasing = TRUE),]
newatac_peaks <- newatac_peaks[order(newatac_peaks$Th2_4h/newatac_peaks$Th2_72h,decreasing = TRUE),]
newatac_peaks <- newatac_peaks[order(apply(newatac_peaks[,1+(1:6)],1,sum),decreasing = TRUE),]
newatac_peaks$rcrank <- 1:nrow(newatac_peaks)
#Normalize peaks by background and length
newatac_peaks_norm <- newatac_peaks
newatac_peakslen <- (newatac_peaks$End-newatac_peaks$Start)
for(i in 1:6){
#Arbitrary unit to make it easier to think. now most peaks in 0-2. with up to 10
newatac_peaks_norm[,i+1] <- 1e5*(newatac_peaks[,i+1]-newatac_bg_avgreads[i])/newatac_bg[i]/newatac_peakslen
}
#head(newatac_peaks$Detailed.Annotation,n=100)
#t(newatac_peaks[47,])
# plot(as.double(newatac_peaks[1,1+(1:6)]),type="l",ylim=c)
# for(i in 1:100){
# lines(as.double(newatac_peaks[i,1+(1:6)]),type="l")
# }
#grep("Satellite",newatac_peaks$Detailed.Annotation)
#newatac_peaks <- newatac_peaks[-grep("Satellite",newatac_peaks$Detailed.Annotation),]
###### scaling law over time?
h<-hist(log(apply(newatac_peaks[,1+(1:6)],1,sum)),breaks=100)
plot(h$mids,log(h$density),type="l")
###### scaling law over time? this with log-y
h<-hist(log(newatac_peaks_norm[,1+1]),breaks=100,plot = FALSE)
plot(h$mids,log(h$density),type="l",xlim=c(-13,0)) #naive
h<-hist(log(newatac_peaks_norm[,2+1]),breaks=100,plot = FALSE)
lines(h$mids,log(h$density),type="l",xlim=c(-13,0)) #2h
h<-hist(log(newatac_peaks_norm[,3+1]),breaks=100,plot = FALSE)
lines(h$mids,log(h$density),type="l",xlim=c(-13,0)) #4h
h<-hist(log(newatac_peaks_norm[,4+1]),breaks=100,plot = FALSE) ##Suddenly all low-expressing peaks went up 4-fold
lines(h$mids,log(h$density),type="l",xlim=c(-13,0)) #24h
h<-hist(log(newatac_peaks_norm[,5+1]),breaks=100,plot = FALSE) #slight broadening but same trend
lines(h$mids,log(h$density),type="l",xlim=c(-13,0)) #48h
h<-hist(log(newatac_peaks_norm[,6+1]),breaks=100,plot = FALSE) #here everything went down except the mid-population
lines(h$mids,log(h$density),type="l",xlim=c(-13,0))
###### scaling law over time? this w/o log-y
h<-hist(log(newatac_peaks_norm[,1+1]),breaks=100,plot = FALSE)
plot(h$mids,(h$density),type="l",xlim=c(-13,0)) #naive
h<-hist(log(newatac_peaks_norm[,2+1]),breaks=100,plot = FALSE)
lines(h$mids,(h$density),type="l",xlim=c(-13,0)) #2h
h<-hist(log(newatac_peaks_norm[,3+1]),breaks=100,plot = FALSE)
lines(h$mids,(h$density),type="l",xlim=c(-13,0)) #4h
h<-hist(log(newatac_peaks_norm[,4+1]),breaks=100,plot = FALSE) ##Suddenly all low-expressing peaks went up 4-fold
lines(h$mids,(h$density),type="l",xlim=c(-13,0)) #24h
h<-hist(log(newatac_peaks_norm[,5+1]),breaks=100,plot = FALSE) #slight broadening but same trend
lines(h$mids,(h$density),type="l",xlim=c(-13,0)) #48h
h<-hist(log(newatac_peaks_norm[,6+1]),breaks=100,plot = FALSE) #here everything went down except the mid-population
lines(h$mids,(h$density),type="l",xlim=c(-13,0))
h<-hist(log(apply(newatac_peaks[,2+(1:6)],1,sum)),breaks=100)
plot(h$mids,log(h$density),type="l",xlim=c(-13,0))
plot(as.double(newatac_peaks_norm[1001,1+(1:6)]),type="l")
plot(apply(newatac_peaks_norm[1:500,1+(1:6)],2,mean),type="l")
plot(apply(newatac_peaks_norm[1:500,1+(1:6)],2,mean),type="l")
plot(as.double(newatac_peaks_norm[70000,1+(1:6)]),type="l")
plot(as.double(newatac_peaks_norm[1,1+(1:6)]),type="l",ylim=c(0,2))
plot(as.double(newatac_peaks_norm[1,1+(1:6)]),type="l",ylim=c(0,0.1))
for(i in 2:100){
lines(as.double(newatac_peaks_norm[i,1+(1:6)]),type="l")
}
#### due to the unfolding, some regions become hyperaccessible. These are then taken down to normal levels. Which ones are affected?
#hist(newatac_peaks[newatac_peaks[,7]>1e-13,7])
# hist(newatac_peaks_norm[newatac_peaks_norm[,2]>0.1,2],breaks = 20)
# hist(newatac_peaks_norm[newatac_peaks_norm[,3]>0.1,3],breaks = 20)
# hist(newatac_peaks_norm[newatac_peaks_norm[,4]>0.1,4],breaks = 20)
# hist(newatac_peaks_norm[newatac_peaks_norm[,5]>0.1,5],breaks = 20)
# hist(newatac_peaks_norm[newatac_peaks_norm[,6]>0.1,6],breaks = 20)
###Now much more clear: eno
# plot( apply(newatac_peaks_norm[,1+(1:6)],2,function(x) quantile(x,0.9)),type="l",ylim=c(0,1e-3))
# lines(apply(newatac_peaks_norm[,1+(1:6)],2,function(x) quantile(x,0.7)),col="blue")
# lines(apply(newatac_peaks_norm[,1+(1:6)],2,function(x) quantile(x,0.5)),col="blue")
# lines(apply(newatac_peaks_norm[,1+(1:6)],2,function(x) quantile(x,0.3)),col="blue")
# lines(apply(newatac_peaks_norm[,1+(1:6)],2,function(x) quantile(x,0.1)),col="blue")
colnames(newatac_peaks)
newatac_peaks_sum <- apply(newatac_peaks[,-(1)],2,sum)
barplot(newatac_bg)
barplot(newatac_peaks_sum)
barplot(newatac_peaks_sum/newatac_bg)
### The peaks with most reads recondense. The peaks with fewer reads however do not.
#if we instead used the peaks with few reads as reference then the large ones would decrease even more!
newatac_peaks_norm_sum <- apply(newatac_peaks_norm[(1:1000),1+(1:6)],2,sum)
barplot(newatac_peaks_norm_sum)
newatac_peaks_norm_sum <- apply(newatac_peaks_norm[-(1:35000),1+(1:6)],2,sum)
barplot(newatac_peaks_norm_sum)
newatac_peaks_norm_sum <- apply(newatac_peaks_norm[-(33000:37000),1+(1:6)],2,sum)
barplot(newatac_peaks_norm_sum)
newatac_peaks_norm_sum <- apply(newatac_peaks_norm[-(1:50000),1+(1:6)],2,sum)
barplot(newatac_peaks_norm_sum)
newatac_peaks_norm_sum <- apply(newatac_peaks_norm[-(1:70000),1+(1:6)],2,sum)
barplot(newatac_peaks_norm_sum)
###Categorize the peaks around IL4. These are index 35000+, and average of these is decreasing a bit
newatac_peaks_norm[newatac_peaks_norm$peakid %in% c("ATACall_peak_11314","ATACall_peak_11312"),]
newatac_peaks
sqldf("select median(rcrank) from newatac_peaks_norm group by Chr")
sqldf("select avg(rcrank) from newatac_peaks_norm group by Chr")
#Chromesome 21 has a much lower mean and median rank! also the smallest chromosome. suggesting that this one is turn down in favour
#of others. https://en.wikipedia.org/wiki/Category:Genes_on_human_chromosome_21
#Appears to have many of the immune genes
#Infgr2, Il10 receptor, runx1, Aire,
#Chromosome 20 has the highest mean/median.
#Nfatc2.
#Chromosome 8 and 17 among the lowest otherwise. 17 has the Ccl, Fli1,
#### Read chromosome sizes
chrsize <- read.table("atac/mm10.chrom.sizes",sep="\t", stringsAsFactors = FALSE)
colnames(chrsize)<-c("Chr","Chrsize")
rownames(chrsize) <- chrsize$Chr
chrsize <- chrsize[order(chrsize$Chr),]
chrsize <- chrsize[-grep("_",chrsize$Chr),]
normalizebychrsize <- function(x){
round(x/chrsize[names(x),]$Chrsize*1e8,digits = 3)
}
#For sorting by average size
normalizebychrsize(table(newatac_peaks_norm$Chr[1:100])) #chr9 has many annoying. nfil3 jak mtorc il33 here. 17 relatively many
normalizebychrsize(table(newatac_peaks_norm$Chr[1000:2000]))
sort(table(newatac_peaks_norm$Chr[40000:80000])/table(newatac_peaks_norm$Chr[1000:20000]))
#table(newatac_peaks_norm$Chr[100:20000])
#Sorting by final size
normalizebychrsize(table(newatac_peaks_norm$Chr[1:100])) #Chromosome 9 has the biggest increases in openness. then 17, 7 in general
normalizebychrsize(table(newatac_peaks_norm$Chr[1:1000])) #9 second largest
normalizebychrsize(table(newatac_peaks_norm$Chr[1:10000])) #Y extremely small
normalizebychrsize(table(newatac_peaks_norm$Chr[1:30000])) #Y extremely small, X larger. otherwise even
normalizebychrsize(table(newatac_peaks_norm$Chr[1:70000])) #Y still very small and X not catching up. 11 most open
#1,9,16 supposed to have large chunks of heterochromatin https://en.wikipedia.org/wiki/Heterochromatin
#Easiest is to add up the replicates
head(newatac_peaks)
peakpat <- read.csv("out_tc/fimo.txt",stringsAsFactors = FALSE,sep="\t")
head(peakpat)
colnames(peakpat)[1]<-"motifid" ###there is the TF name there straight!
#colnames(peakpat)[2]<-"peak"
colnames(peakpat)[3]<-"peakrename"
hist(log(peakpat$p.value))
newatac_peaks_norm$peakrename <- sprintf("%s:%s-%s",newatac_peaks_norm$Chr, newatac_peaks_norm$Start, newatac_peaks_norm$End)
#chr7:25120342-25120810