-
Notifications
You must be signed in to change notification settings - Fork 0
/
plotKMCgenomesize.dynamic.minimum.small_genomesize.R
executable file
·72 lines (58 loc) · 2.85 KB
/
plotKMCgenomesize.dynamic.minimum.small_genomesize.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#Rscript
args <- commandArgs(trailingOnly = TRUE)
print(args[1])
inputfile <- (args[1])
TITLE <- (args[2])
kmercounts <- read.table(inputfile)
#plot(kmercounts[0:20,],type="l")
#points(kmercounts[2:20,])
#sum(as.numeric(kmercounts[2:10000,1]*kmercounts[2:10000,2]))
#2 103 603 758
#allKmers <- sum(as.numeric(kmercounts[2:10000,1]*kmercounts[2:10000,2]))
#max(kmercounts[2:60,2])
#45619475
peak <- max(kmercounts[3:1000,2])
#kmercounts$V1[kmercounts$V2==peak]
maxKmer <- kmercounts$V1[kmercounts$V2==peak]
peak.minimum <- min(kmercounts[0:(maxKmer),2])
#peak.minimum = 2
#minKmer <- kmercounts$V1[kmercounts$V2==peak.minimum]
minKmer = 2
allKmers <- sum(as.numeric(kmercounts[minKmer:10000,1]*kmercounts[minKmer:10000,2]))
ALLSUM <- round((allKmers/maxKmer)/1000000, digits = 2)
#ALLSUM
#525.9009 Mb
## only single copy region (peak between 2 and 15
#peakregionKmers <- sum(as.numeric(kmercounts[2:(10+maxKmer*3),1]*kmercounts[2:(10+maxKmer*3),2]))
peakregionKmers <- sum(as.numeric(kmercounts[minKmer:(10+maxKmer*3),1]*kmercounts[minKmer:(10+maxKmer*3),2]))
PEAKSUM <- round((peakregionKmers/maxKmer)/1000000, digits = 2)
#PEAKSUM
#357.3006 Mb
#--> n=1 kmers mostly seq error, but since peak is at n=4, its v close, thus the proximal end of the distribution is inside the n=1 peak, thus th$
# proportion
PROPORTION <- round(PEAKSUM/ALLSUM, digits = 4)
#PROPORTION
#0.6794067
REPETITIVE <- round(1-(PEAKSUM/ALLSUM), digits = 4)
#REPETITIVE
#0.3205933
PLOTTITLE1 <- paste(ALLSUM," Mb ",sep = "")
PLOTTITLE2 <- paste(PEAKSUM," Mb ",sep = "")
PLOTTITLE3 <- paste(REPETITIVE," repetitive",sep = "")
#For an inbred line, two set of genomes have mostly identical sequence and the major peak would be diploid (2C) peak. #On the otherhand, for high$
#compare the peak shape with poisson distribution
singleC <- (peakregionKmers/maxKmer)
png(paste(inputfile,".smallvalue_genomesize.png",sep=""),width = 1500, height = 1500)
plot(1:(20+maxKmer*3),dpois(1:(20+maxKmer*3), maxKmer)*singleC, type = "l", col=3, lty=2,xlab="kmer depth", ylab="kmers")
lines(kmercounts[1:(20+maxKmer*3),],type="l")
points(kmercounts[2:(20+maxKmer*3),])
points(kmercounts[maxKmer,],type = "p",pch = 16, col = "black",bg ="black", cex = 3)
points(kmercounts[minKmer,],type = "p",pch = 16, col = "black",bg ="black", cex = 3)
points(kmercounts[(10+maxKmer*3),],type = "p",pch = 16, col = "black",bg ="black", cex = 3)
text(maxKmer, (peak*1.01), labels=maxKmer, cex= 1,pos=3)
text(maxKmer+5, (peak*1.15), labels=paste(TITLE,PLOTTITLE1,PLOTTITLE2,PLOTTITLE3,sep = "\n"), cex= 2,pos=3)
dev.off()
new.df <- data.frame(NAME = c(TITLE), ALL = c(ALLSUM), PEAK = c(PEAKSUM), REP = c(REPETITIVE))
#write.table(new.df, file = paste0("/scratch/ek/testoutR",".summary.txt"), sep='\t', row.names=FALSE, col.names=FALSE)
write.table(new.df, file = paste0(inputfile,".small_value.summary.txt"), sep='\t', row.names=FALSE, col.names=FALSE)
print("done")