forked from npchar/Phylogenomic
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSilix-NumberOfGenesPerFamilyPerSpecies.R
executable file
·71 lines (57 loc) · 2.97 KB
/
Silix-NumberOfGenesPerFamilyPerSpecies.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env Rscript
library("optparse")
option_list = list(
make_option(c("-f", "--file"), type="character", default=NULL,
help="Silix fnodes file name required", metavar="character"),
make_option(c("-o", "--out"), type="character", default=NULL,
help="Matrix output file name required", metavar="character")
);
usage = "This R script will produce an abundance matrix (Number of sequences per family per sp) from a silix Fnodes file.\nWe assume that
the species name is at the begining of the sequence name and separated by an underscore \'_\' (sp1_SeqName).\n It will also report the n
umber of SCO (with different occupancy thresholds) as well as the number of Family."
opt_parser = OptionParser(option_list=option_list, description=usage);
opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);
if (is.null(opt$file) & is.null(opt$out)){
print_help(opt_parser)
stop("Two argument must be supplied (input and output files names)\n", call.=FALSE)
}
inc <- function(x)
{
eval.parent(substitute(x <- x + 1))
}
NODES = read.table(opt$file, h=F)
Name_SP = levels(as.factor(t(as.data.frame(strsplit(as.character(NODES$V2), "_")))[,1]))
Name_FAM = levels(as.factor(as.character(NODES$V1)))
# Extract Silix parameters from the filename :
#---------------------------------------------
#Assuming nammed as follow : silix-allTsa.i0.3_r0.7_l250_m0.5.fnodes
n=opt$file
Words = strsplit(n, "_")
Sidentity = as.numeric(sub("^.*silix-allTsa.i","",Words[[1]][1]))
Soverlap = as.numeric(sub("r","",Words[[1]][2]))
Sminlength = as.numeric(sub("l","",Words[[1]][3]))
Sminoverlap = as.numeric(sub(".fnodes","",sub("m","",Words[[1]][4])))
#Create Matrix from Fnodes file :
#--------------------------------
MAT = as.data.frame(matrix(rep(0, c(length(Name_FAM)*length(Name_SP))),nrow=length(Name_FAM), ncol=length(Name_SP)), row.names=Name_FAM)
names(MAT) = Name_SP
for(i in seq(1,length(NODES$V1))){
SPI = sub("_[0-9]*", "",NODES[i,]$V2)
inc(MAT[rownames(MAT)==NODES[i,]$V1,names(MAT)==SPI])
}
#Dump matrix
write.table(MAT, quote=F, file=opt$out)
# Compute some metrics :
#-----------------------
#Number of SCO (different level of missing values) :
SCO = MAT[apply(MAT[,1:length(MAT[1,])], 1, function(x) sum(x<2)==length(x)),]
SCO100 = dim(SCO[apply(SCO[,1:length(SCO[1,])], 1, function(x) sum(x)== length(x)),])[1]
SCO75 = dim(SCO[apply(SCO[,1:length(SCO[1,])], 1, function(x) sum(x)> length(x)*0.75),])[1]
SCO50 = dim(SCO[apply(SCO[,1:length(SCO[1,])], 1, function(x) sum(x)> length(x)*0.5),])[1]
SCO375 = dim(SCO[apply(SCO[,1:length(SCO[1,])], 1, function(x) sum(x)> length(x)*0.375),])[1]
NumberFam=dim(MAT)[1]
header=c("FileName", "identity", "overlap", "minlength", "minoverlap", "SCO100", "SCO75", "SCO50", "SCO375", "Nfam")
body=c(n, Sidentity, Soverlap, Sminlength, Sminoverlap, SCO100, SCO75, SCO50, SCO375, NumberFam)
InfosFile=paste(opt$out, '.metrics', sep='')
write.table(rbind(header,body), quote=F, row.names=F, col.names=F,file=InfosFile)