-
Notifications
You must be signed in to change notification settings - Fork 0
/
toolBox.R
1039 lines (865 loc) · 42.6 KB
/
toolBox.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
###############################################################################
# set of some usefull small tools
# TODO: translate comments comment
#
# Author: jfmartin
###############################################################################
###### Fonction qui plotte les meileures correlation entre les var de 2 matrices A et B
# Elimine les data manquantes par couples
plotrsup = function(A, B, rmin, method="pearson")
{
for(i in 1:dim(A)[2]) {
if(is.numeric(A[,i]))
for(j in 1:dim(B)[2]) {
if(is.numeric(B[,j]))
{
AnNa <- A[!is.na(A[,i]) & !is.na(B[,j]),i]
BnNa <- B[!is.na(A[,i]) & !is.na(B[,j]),j]
r = cor(AnNa,BnNa, method = method)
if(abs(r) >= rmin && r<0.99999)
{
plot(AnNa, BnNa, xlab = dimnames(A)[[2]][i], ylab = dimnames(B)[[2]][j])
abline(lsfit(AnNa, BnNa))
title(paste("r=", as.character(round(r, 2))))
}
}
}
}
}
###### Fonction qui ecrit dans un dataframe les meileures correlation entre les var de 2 matrices A et B
# Elimine les data manquantes par couples
write.rsup = function(A, B, rmin)
{
resr <- array(0,c(dim(A)[2]*dim(B)[2],3)); it <- 1
for(i in 1:dim(A)[2]) {
if(is.numeric(A[,i]))
for(j in 1:dim(B)[2]) {
if(is.numeric(B[,j]))
{
AnNa=A[!is.na(A[,i]) & !is.na(B[,j]),i]
BnNa=B[!is.na(A[,i]) & !is.na(B[,j]),j]
r = cor(AnNa,BnNa)
if(abs(r) >= rmin && r<0.99999)
{
resr[it,1] <- dimnames(A)[[2]][i]
resr[it,2] <- dimnames(B)[[2]][j]
resr[it,3] <- r
it <- it +1
}
}
}
}
resr <- resr[resr[,1]!=0,]
return(resr)
}
##### Plot A,B avec symboles differents en fonction du facteur s
plotby = function(s, A, B, lx ="", ly ="", reg=T ,regby=F, leg=T ,rmin)
{
if(!is.factor(s)) stop("Le premier argument doit etre un FACTEUR")
if(!is.numeric(A)) stop("Le deuxieme argument doit etre NUMERIQUE")
if(!is.numeric(B)) stop("Le troisieme argument doit etre NUMERIQUE")
AnNa=A[!is.na(A)& !is.na(B)]
BnNa=B[!is.na(A)& !is.na(B)]
snNa=s[!is.na(A)& !is.na(B)]
D <- data.frame(f=snNa, x=AnNa, y=BnNa)
r <- cor (D$x, y=D$y)
if(abs(r) >= rmin && r<0.99999) {
plot (D$x , D$y , type="n", xlab=lx , ylab=ly )
if ( reg==T ) abline(lsfit(D$x , D$y), lty=1)
title (paste("r=", as.character(round(r, 2))))
nl <- length(levels(D$f))
cl=c(1,17,16,2,3,4); sy=c(1,16,17,3,5,12)
for (i in 1:nl) {
sD <- D[ D$f==levels(D$f)[i],]
points (sD$x , sD$y, pch=sy[i], col=cl[i])
if ( regby==T ) abline(lsfit( sD$x , sD$y), lty=i+1)
}
if (leg==T) {
py <- max(D$y)
if (r >= 0) px <- min(D$x)
if (r < 0) px <- min(D$x)+(max(D$x)-min(D$x))/2
legend(px, py , levels(D$f),pch=sy[1:nl])
}
}
}
##### Plot les boxplot des var d un dataframe qui comporte des obs a +/- nsd ecartypes
plot_outlier = function (ids, by, nsd)
{
for (i in 1:dim(ids)[2])
if (is.numeric(ids[,i]))
{
mini=min(ids[!is.na(ids[,i]),i])
maxi=max(ids[!is.na(ids[,i]),i])
moy=mean(ids[!is.na(ids[,i]),i])
ect=sd(ids[!is.na(ids[,i]),i])
if (mini < moy- nsd*ect | maxi > moy+nsd*ect) boxplot (ids[,i]~ by, ylab=dimnames(ids)[[2]][i])
}
}
acpEllipse=function(ids,fact,firstvar,lastvar=dim(ids)[2],scaling="uv",meth="svd",plotloading=T,tc=0.66) {
# fait une ACP sur ids sachant que la colonne 1 contient l'identificateur d'individu
# tc est la taille du caractere utilisé pour les plots
# la colonne 2 contient le facteur définissant la couleur des individus
# Les colonnes suivantes 3:dim(ids)[2] sont les données quantitatives utilisées pour l'ACP
# L'ACP est réalisé avec le scaling "uv","none","pareto"
# L'Algo d'ACP est : nipals (si données manquantes) ou svdImpute ou Bayesian ou svd ou probalistic
# Le loading de l'ACP est plotté si plotloading==T (valeur par defaut)
require(ade4)
classe=factor(ids[[fact]])
idions=dimnames(ids[,firstvar:lastvar])[2]
colour=1:length(levels(classe))
ions=as.matrix(ids[,c(firstvar:lastvar)])
# choix du scaling : "uv","none","pareto"
object=prep(ions, scale=scaling, center=TRUE)
# ALGO: nipals,svdImpute, Bayesian, svd, probalistic=F
result <- pca(object, center=F, method=meth, nPcs=2)
# ADE4 : representation des ellipsoides des individus de chaque classe
s.class(result@scores, classe, cpoint = 1,xax=1,yax=2,col=colour,sub=sprintf("Scores - PCs %sx%s",1,2), possub="bottomright")
if (plotloading==T)
{s.label(result@loadings,cpoint = 0,boxes=F,clabel=tc, xax=1,yax=2,sub="Loadings",possub="bottomright")}
}
acplight=function(ids, scaling="uv") {
# fait une ACP sur ids sachant que la colonne 1 contient l'identificateur d'individu
# la colonne 2:nf contient les facteurs définissant la couleur des individus
for (i in 1:3) {
# i=1
idss=data.frame(ids[ids[,i+1]!="",])
classe=as.factor(idss[[i+1]])
idsample=as.character(idss[[1]])
colour=1:length(levels(classe))
ions=as.matrix(idss[,5:dim(idss)[2]])
# choix du scaling : "uv","none","pareto"
object=prep(ions, scale=scaling, center=TRUE)
# ALGO: nipals,svdImpute, Bayesian, svd, probalistic=F
result <- pca(object, center=F, method="svd", nPcs=2)
# ADE4 : representation des ellipsoides des individus de chaque classe
s.class(result@scores, classe, cpoint = 1,xax=1,yax=2,col=colour,sub=sprintf("Scores - PCs %sx%s",1,2), possub="bottomright")
#s.label(result@loadings,label = ions, cpoint = 0, clabel=0.4, xax=1,yax=2,sub="Loadings",possub="bottomright")
}
}
# Programme de plot des ions avec mis en évidence des facteurs
plotInterCI = function (ids,indf1,indf2,firstvar) {
# Function qui plotte une variable (ions) avec 2 facteurs croisés. Si cinetique l'utiliser en facteur 2
# En entrée, ids input dataframe, indf1 et indf2 indice dans la dataframe des 2 facteurs
# ids=x;i=25;indf1=6;indf2=10;indSubject=4
lastvar=dim(ids)[2]
for (i in firstvar:lastvar) {
par(las=2)
xp=ids[!is.na(ids[[i]]),c(indf1,indf2,i)];
xp=data.frame(xp,interaction(xp[[1]],xp[[2]],sep="."));
dimnames(xp)[[2]][4]="Inter";
levInter=c(1:length(levels(xp[[4]])));nl1=length(levels(xp[[1]]));nl2=length(levels(xp[[2]]));
levInter=array(data=levInter,dim=c(nl1,nl2))
connection=vector("list",nl1)
for (i in 1:nl1) {connection[[i]]=levInter[i,]}
plotmeans(xp[[3]]~ xp[[4]],
connect=connection,
ccol=c(1:nl1), barwidth=1.5, pch=16,
main=dimnames(xp)[[2]][3],xlab=" ",
ylab="Intensity"
);
# plot.design(xp[[3]]~xp[[1]]+xp[[2]]+xp[[4]],fun=mean,main=dimnames(xp)[[2]][3],xlab=" ",ylab=" ");
# boxplot(xp[[3]]~xp[[4]],ylab="Intensity");
}
}
BoxHistDesign = function (ids, fact, firstvar,lastvar=dim(ids)[2]) {
# ids input dataset; fact type vecteur contenant les indices des facteurs étudiés; firstvar 1ere var quantitative
# Function qui pour une dataset ids plotte un histogramme de chaque var quantitative AVEC 2 FACTEURS Minimum
# plotte un boxplot by facteur fact[i]
# plotte le plo.design des facteurs
# S'il y a une cinétique, mettre l'indice en premier dans le vecteur d'indice "fact"
# lastvar=dim(ids)[2]
nbfact=length(fact); nbinter=0.5*(nbfact^2)-0.5*nbfact; gco=c(1,2,3,3,4);gli=c(3,3,4,6,6)
for (i in firstvar:lastvar) {
if (nbfact < 6) { par(mfrow=c(gli[nbfact],gco[nbfact]))} else { par(mfrow=c(nbfact-1,nbinter-1))}
# cat("var ",i," -> ",names(ids)[i],"\n")
# eliminantion des données manquantes
ods=ids[!is.na(ids[[i]]),]; lab=names(ods)[i]
xpf=ods[,fact]
xpv=ods[,i]
# xpi=array(data="inter",nrow=dim(ods)[1],ncol=nbinter)
xb=data.frame(xpf,xpv); indvar=dim(xb)[2];lab=names(ods)[i];names(xb)[indvar]=lab
hist(xb[[indvar]],main=lab,xlab=" ",ylab=" ")
plot.design(xb,fun=mean,main=" ",xlab=" ",ylab=" ")
for (b in 1:nbfact) {
boxplot(xb[[indvar]]~xb[[b]],ylab="Intensity")
}
## Plot des interactions en fonction du nombre de facteurs
if (nbfact>1) {
y=0
for (j in 1:(nbfact-1)){ for (u in (j+1):nbfact){
y=y+1
xpi=interaction(ods[[fact[j]]],ods[[fact[u]]],sep="*")
xp=data.frame(xpf,xpi,xpv);
indinter=length(fact)+1; names(xp)[indinter]="interaction";
indvar=dim(xp)[2]; names(xp)[indvar]=names(ods)[i];
indinter=length(fact)+1; names(xp)[indinter]="interaction";
boxplot(xp[[indvar]]~xp[[indinter]],ylab="Intensity")
interaction.plot(xp[[j]],xp[[u]],xp[[indvar]])
}}
}
}
}
BoxHistDesign2 = function (ids, fact, firstvar,lastvar=dim(ids)[2]) {
# ids input dataset; fact type vecteur contenant les indices des facteurs étudiés; firstvar 1ere var quantitative
# Function qui pour une dataset ids plotte un histogramme de chaque var quantitative AVEC 2 FACTEURS Minimum
# plotte un boxplot by facteur fact[i]
# plotte le plo.design des facteurs
# S'il y a une cinétique, mettre l'indice en premier dans le vecteur d'indice "fact"
# lastvar=dim(ids)[2]
nbfact=length(fact); nbinter=0.5*(nbfact^2)-0.5*nbfact; gco=c(1,2,3,3,4);gli=c(3,3,4,6,6)
for (i in firstvar:lastvar) {
if (nbfact < 6) { par(mfrow=c(gli[nbfact],gco[nbfact]))} else { par(mfrow=c(nbfact-1,nbinter-1))}
# cat("var ",i," -> ",names(ids)[i],"\n")
# eliminantion des données manquantes
ods=ids[!is.na(ids[[i]]),]; lab=names(ods)[i]
xpf=ods[,fact]
xpv=ods[,i]
# xpi=array(data="inter",nrow=dim(ods)[1],ncol=nbinter)
xb=data.frame(xpf,xpv); indvar=dim(xb)[2];lab=names(ods)[i];names(xb)[indvar]=lab
hist(xb[[indvar]],main=lab,xlab=" ",ylab=" ")
plot.design(xb,fun=mean,main=" ",xlab=" ",ylab=" ")
for (b in 1:nbfact) {
boxplot(xb[[indvar]]~xb[[b]],ylab="Intensity")
}
}
}
deter_ioni <- function (aninfo, pm)
{
# determine l'ionisation de l'annotation de ProbMetab
# si la difference m/z et masse proposée par probmetab ~1 on utilise le signe de la difference pour definir l'ionisation
# si adduits diff >>1 alors on cherche les chaines "+" ou "2+" ou "3+" ou "-"... pour définir l'ionisation
# aninfo : vecteur resultant du parse (seprateur #) du champ annotation de ProbMetab
if (round(abs(as.numeric(aninfo[1]) - pm),0) ==1) {
if (as.numeric(aninfo[1]) - pm <0) {esi <- "n"} else {esi <- "p"}
} else
if (!is.na(aninfo[4])) {
anstr <- aninfo[4]
cat(anstr)
if ((grepl("]+",anstr,fixed=T)==T) || (grepl("]2+",anstr,fixed=T)==T) || (grepl("]3+",anstr,fixed=T)==T)) { esi <- "p"}
else
if ((grepl("]-",anstr,fixed=T)==T) || (grep("]2-",anstr,fixed=T)==T) || (grep("]3-",anstr,fixed=T)==T)) { esi <- "n"}
cat(" ioni ",esi,"\n")
} else
{ esi <- "u"}
return(esi)
}
##################### fonction VennSignif #####################
## compute common signif variables in a varMedata file with columns of significant variable define with 1 for significant
## And draw a Venn diagram. Able to draw for only 2 or 3 factors.
vennSignif <- function(ids,sigsub,siglab,sigcol,chx=1.5)
## ids : input dataframe
## nv : number of factor to compute and draw as a Venn diagram : 2 or 3
## sigsub : vector of factor subscrip
## siglab : vector of names of factors
## sigcol : vector of color filling of the circles
{
nv <- length(sigsub)
if (nv==2) {
n1 <- nrow(subset(ids, ids[[sigsub[1]]]==1))
n2 <- nrow(subset(ids, ids[[sigsub[2]]]==1))
cat(colnames(ids)[sigsub[1]]," & ",colnames(ids)[sigsub[2]],"\n")
n12 <- nrow(subset(ids, (ids[[sigsub[1]]]==1 & ids[[sigsub[2]]]==1)))
draw.pairwise.venn(area1 = n1, area2 = n2, cross.area = n12, category = siglab,
cex=chx, cat.cex = chx,
fill = sigcol[1:2], cat.pos = c(0,0), cat.dist = rep(0.025, 2))
} else
if (nv==3) {
n1 <- nrow(subset(ids, ids[[sigsub[1]]]==1))
n2 <- nrow(subset(ids, ids[[sigsub[2]]]==1))
n3 <- nrow(subset(ids, ids[[sigsub[3]]]==1))
cat(colnames(ids)[sigsub[1]]," & ",colnames(ids)[sigsub[2]]," & ",colnames(ids)[sigsub[3]],"\n")
n12 <- nrow(subset(ids, (ids[[sigsub[1]]]==1 & ids[[sigsub[2]]]==1)))
n13 <- nrow(subset(ids, (ids[[sigsub[1]]]==1 & ids[[sigsub[3]]]==1)))
n23 <- nrow(subset(ids, (ids[[sigsub[2]]]==1 & ids[[sigsub[3]]]==1)))
n123 <- nrow(subset(ids, (ids[[sigsub[1]]]==1 & ids[[sigsub[2]]]==1 & ids[[sigsub[3]]]==1)))
draw.triple.venn(area1 = n1, area2 = n2, area3 = n3, n12,n23,n13,n123,
cex=chx, cat.cex = chx,
category = siglab, lty = "blank",
fill = sigcol[1:3])
} else stop("Only 2 ou 3 factors are supported")
}
crea3files <- function(vmfile, vipfile, splfile, dmfile, outsuf) {
# This function creates the W4M 3 metadata + matrix files for univariate statistics corresponding with significant VIP
# vmfile: variableMetadata file with xcms xtract infos(annotation, ions in rows, columns= mz CAMERA intensities annotations
# vipfile : file of ions + vip n. These ions must be a subset of ions of vmfile, last column define the signifcance
# splfile : file of sample(rows) + metadata samples. These samples must be a subset of samples of vmfile
# dmfile : file of data matrix to be used for univariate test.
# outsuf : out file common prefix name to create the 3 differents files
## read variable metadata file
VM <- read.table(file=vmfile,header=TRUE, sep="\t",stringsAsFactors =FALSE)
## order by ions id which must be in the first column
VM <- VM[order(VM[[1]]),]
VIP <- read.table(file=vipfile,header=TRUE, sep="\t",stringsAsFactors =FALSE)
VIP <- VIP[order(VIP[[1]]),]
if (sum(VIP[,1] != VM[,1]) > 0) stop("ions id are different in vip and annot files")
## selection of signifcant VIP ions, last column of VIP file
VIPsignif <- VIP[VIP[ncol(VIP)] == 1,]
## merge to keep only signifcant ions
VMs <- merge(x = VM, y=VIPsignif, by.x=1, by.y =1, all.x=FALSE, all.y=TRUE)
ionsSignif <- VIPsignif[[1]]
## samples avec uniquement les sampleType ="sample"
SPL <- read.table(splfile, header=TRUE,sep="\t",stringsAsFactors = FALSE)
SPLs <- SPL[SPL$sampleType=="sample",]
splist <- SPLs[[1]]
## dataMatrix avec les ions signifs et les sampleType ="sample" uniquement et classés
## read table data matrix
DM <- read.table(file = dmfile, header = TRUE, sep="\t")
DM <- DM[order(DM[[1]]),]
## on conserve les ions signifs
DMsignif <- DM[DM[[1]] %in% ionsSignif,]
DMSS <- data.frame(DMsignif[[1]],DMsignif[,colnames(DMsignif) %in% splist])
################################## create output files ##################################
write.table(DMSS,file=paste(outsuf,"DM.txt",sep=""), sep="\t",col.names=TRUE,row.names=FALSE,quote=FALSE)
write.table(SPLs,file=paste(outsuf,"SPL.txt",sep=""),sep="\t",col.names=TRUE,row.names=FALSE,quote=FALSE)
write.table(VMs, file=paste(outsuf,"VM.txt",sep=""), sep="\t",col.names=TRUE,row.names=FALSE,quote=FALSE)
#return(annotAll)
}
moyByFactor <- function(imtx, ispl, subfact, onlyspl=TRUE, oper="mean") {
## imtx correspond to W4M data matrix : var in rows Id var in the first columns, samples in columns given by vector isample
## ispl correspond to W4M samples metadata : samples in rows (same as the columns of imtx data matrix)
## The function computes the mean for each level of factor in column #subact, present in the ispl metadata sample dataframe
## if onlyspl = TRUE, the average (or other) is computed only on samples defined as "sample" in the sampleType
## for each variables of which subscripts given in ivar vector) of the imtx dataframe.
## returns a data matrix of means with variales in rows and levels of factor in columns
## selection of biological sample sampleType=="sample"
## in the sampleMetadata ispl
if (onlyspl) {ispl <- ispl[ispl$sampleType == "sample",]}
## Selection of biological sample in the dataMatrix by merging with ispl
varName <- colnames(imtx)[1]
rownames(imtx) <- imtx[[1]]
imtx <- imtx[,-1]
Timtx <- t(imtx)
Timtx <- data.frame(row.names(Timtx),Timtx)
ids <- merge(x=ispl, y=Timtx, by.x=1, by.y=1, all.x=TRUE, all.y=FALSE)
## compute mean by levels of factors diet
firstion <- ncol(ispl)+1
lastion <- ncol(ids)
nbions <- lastion-firstion+1
levnames <- levels(as.factor(ispl[[subfact]]))
nlevels <- length(levnames)
avInt <- array(0,dim=c(nlevels,nbions))
colnames(avInt) <- colnames(ids)[c(firstion:lastion)]
row.names(avInt) <- levnames
for (i in 1:nbions) {avInt[,i] <- tapply(ids[[(i+firstion-1)]], ids[[subfact]], FUN=oper)}
TavInt <- t(avInt)
resMoy <- data.frame(row.names(TavInt),TavInt)
colnames(resMoy)[1] <- varName
return(resMoy)
}
foldchange <- function(ids,reflev,collev=c(2:ncol(ids)), fctype=c("log2","simple")[1]) {
## computes fold change between all levels of a factor
## ids : an input dataframe with variables in rows and
## id of variable in 1st column and average of the level of a factor in the other columns
## this matrix is computed by the function : moyByFactor
## reflev is the reference level which is used as a denominator in fold change computing
## collev is a vector of subscript of the different columns of levels.
## ids may contains others columns but after the id+columns of average
## So it is possible tu use this function for all the levels in successive runs
## fold change by default is "log2 ratio" or "simple" numerator/denominator
ireflev <- which(colnames(ids)==reflev)
#collev <- collev[-which(collev==ireflev)]
nblev <- length(collev)
foldch <- data.frame(array(NA,dim=c(nrow(ids),nblev)),stringsAsFactors = FALSE)
colnames(foldch) <- paste(colnames(ids)[collev],"_vs_",reflev,sep="")
rownames(foldch) <- ids[[1]]
for(i in 1:ncol(foldch))
#if (fctype=="simple") foldch[i] <- ids[i+1]/ids[ireflev] else foldch[i] <- log2(ids[i+1]/ids[ireflev])
if (fctype=="simple") foldch[i] <- ids[collev[i]]/ids[ireflev] else foldch[i] <- log2(ids[collev[i]]/ids[ireflev])
#resids <- cbind(ids,foldch)
resids <- foldch
return(resids)
}
ggplotFC <- function(ids,labDEN,labNUM,sNAME,mtit,stit="log2 Fold change") {
## function to ggplot a fold change (labNUM/labDEN) in descending order of absolute value
## needs an input dataframe ids (var in rows and FC in columns) with all informations on average and FC
## with at least 3 columns :
# - id in the first column
# - Name of the variable which is used on the plot
# - Fold change between labNUM/labDEM
FC2plot <- paste(labNUM,"_vs_",labDEN,sep="")
sFC2plot <- which(colnames(ids)==FC2plot)
ids <- ids[,c(1,sNAME,sFC2plot)]
mtitle <- paste(mtit,FC2plot, sep=" ")
## construct the name of the column to plot and then define his subscrip in log2FC
sens <- ifelse(ids[[3]] < 0, labDEN, labNUM)
ids <- cbind(ids,sens)
### pour ordonner soit dans le sens decroissant de FC [[3]] soit par ordre alpha de name [[2]]
ids <- ids[order(abs(ids[[3]])),]
## ids <- ids[order(ids[[2]]),]
colnames(ids)[3] <- "log2FC"
colnames(ids)[2] <- "name"
ids$name <- factor(ids$name, levels = ids$name) # convert to factor to retain sorted order in plot.
## ggplot of the foldchange
p <- ggplot(ids, aes(x=name, y=log2FC, label= log2FC)) +
geom_bar(stat='identity', aes(fill=sens), width=.5) +
#geom_text(aes(label = round(pvalue24h3DEM_vs_C,4)), nudge_y = 2) +
#scale_fill_manual(name="log2FC", labels = c(labDEN, labNUM), values = c(labDEN="green", labNUM="red")) +
labs(subtitle=stit, title= mtitle) +
coord_flip()
print(p)
}
meanFCggplot <- function(VM,VMsel,DM,SPL,mfact,labDEN,labNUM,sNAME,mtit) {
## compute means of levels of factor mfact, then computes all the possible FC among levels of factor mfact
## add these means and FC values to VM file and save it
## Then between the chosen levels labNUM/labDEN for a selection of variable defined by the file VMsel
## ggplot the fold change in descending order of the absolute values of FC
## sNAME is the colum number defining the label of molecules uses on Y axis
subfact <- which(colnames(SPL)==mfact)
moyDM <- moyByFactor(imtx=DM, ispl=SPL, subfact, onlyspl=TRUE, oper="mean")
moyVM <- merge(x = VM, y =moyDM, by.x=1, by.y=1, all.x=TRUE, all.y=FALSE)
# compute log2 fold change for all level = columns of moyVM corresponding to the means of the different levels
nlev <- ncol(x = moyDM) - 1
collev=c((ncol(moyVM)-nlev+1):ncol(moyVM))
lablev <- colnames(moyVM)[collev]
## for each level defined as denominator of FC, compute FC and store and write a txt file based on
for (l in 1:nlev) {
FCDEN <- lablev[l]
FC <- foldchange(moyVM,reflev=FCDEN,collev, fctype="log2")
#subVM contains all the FC results for each level of the chosen factor mfact
moyVM <- cbind(moyVM,FC)
}
## for the mfact factor using 2 levels as Numerator and Denominator performs graphics of the fold change
## prepare dataframe for ploting. If exist
if (!is.null(VMsel)) VMsel <- merge(x=VMsel, y=moyVM, by.x=1, by.y=1, all.x=TRUE, all.y=FALSE) else VMsel <- moyVM
ggplotFC(ids=VMsel, labDEN=labDEN,labNUM=labNUM,sNAME,mtit=mtit, stit="log2 Fold change")
return(VMsel)
}
ggplotOR <- function(ids,sNAME=2,sOR=3,mtit,stit="Odd Ratio") {
## function to ggplot an Odd ratio in descending order
## needs an input dataframe ids with at least 3 columns :
# - id in the first column
# - Name of the variable which is used on the plot (column sNAME)
# - odd ratio (column sOR)
## sens define the color with a threshold for OR=1
sens <- ifelse(ids[[3]] < 1, yes="OR<1",no="OR>1")
ids <- cbind(ids,sens)
ids <- ids[order(abs(ids[[3]])),]
colnames(ids)[3] <- "OR"
colnames(ids)[2] <- "name"
ids$name <- factor(ids$name, levels = ids$name) # convert to factor to retain sorted order in plot.
## ggplot of the foldchange
p <- ggplot(ids, aes(x=name, y=OR, label= OR)) +
geom_bar(stat='identity', aes(fill=sens), width=.5) +
labs(subtitle=stit, title= mtit) +
coord_flip()
print(p)
}
parse_chemFormula <- function(formul, atom2search=c("C","H","O","N","S")) {
## function able to count the number of atom (defined by teh vector atom2search)
## in a vector formul contaninig only chemical formulas
## return a matrix with the number of rows corresponding to formul rows and the number of columns
## corresponding to the number of atom define by atom2search
## eliminate blank in the formula
## ignore case
nbfc <- length(formul)
nba2s <- length(atom2search)
nbatom <- matrix(data=NA,nrow=nbfc,ncol=nba2s)
colnames(nbatom) <- atom2search
## for each formula in vector formul....
for (cmol in 1:nbfc) {
# elimination of blank and transform lower to uppercase
f1 <- toupper(gsub(" ", "", as.character(formul[cmol]), fixed = TRUE))
## init of vector position
posa2s <- rep(0,nba2s)
## detection of the position of the number associated to each atom define by atom2search
for (a in 1:nba2s) {
res <- regexpr(atom2search[a],f1, ignore.case = TRUE)
if (res[1]>0) posa2s[a] <- res[1] + nchar(atom2search[a])
else posa2s[a] <- 0
}
## detection of number associated to each atom 1 to 99 (none <=> 1)
for (a in 1:nba2s) {
i <- posa2s[a]
if (i >0) { ## i=0 means that atom is not in the formula
nba <- c(NA,NA); j <-0
repeat{
nblu <- substr(f1,i,i)
if (!is.na(as.numeric(nblu)) & i <= nchar(f1)) {
j<-j+1
nba[j] <- as.numeric(nblu)
i <- i+1
} else break
}
if (j==0) nba[1] <- 1 ## j=0 means atom without number => value=1
if (is.na(nba[2])) nbatom[cmol,a] <- nba[1] else nbatom[cmol,a] <- (10*nba[1]+nba[2])
}
else nbatom[cmol,a] <-0
}
}
return(nbatom)
}
plot_prince <- function(ids,spl,s_var,s_fact, nbcomp) {
## compute the % explained of each factor(spl) on the first ncomp PCA components compute on ids
## ids : data matrix with variables in rows and samples in columns : type dataframe with the First column = id of ions
## s_Var : substring of quantitative variables to use in ids
## spl : sample metadata with sample in row (idem colmuns of ids) and factors in columns. First column= id of samples
## s_fact : substring of factors tu use in sample Metadata spl :
## nb_comp : number of components tu use as "top" in prince plot
data <- as.matrix(ids[,-1]) ## on enleve la colonne 1 des identifiants variables
data <- data[,order(colnames(data))]
row.names(data) <- ids[[1]] ## ident variable -> colonne 1
row.names(spl) <- spl[[1]]
spl <- spl[order(row.names(spl)),]
raw.prince <- prince(as.matrix(data),spl[,s_fact],top=nbcomp)
prince.plot(raw.prince,Rsquared = T,key=T,cexRow=1,xlab="Principal Components (%total variation)")
}
molobsfreq <- function(vmdata,dm, spl,nTimeBL=10, intThreshold=0,freqBy=0,iId=1) {
## This function computes the observed frequency of the differents ions of a peaklist
## Threshold is defined as nTimeBL*average of non zero blank value (nTimeBL) or an abosulte value (nTimeBL=0 and intThreshold=absolute value of threshold)
## for each ion: freq <- 1-(number of samples with intensity<intThreshold / number of samples)
## This freq can be computed for all samples or by levels of a factor (freqBy = subscript of the factor)
## use the 3 W4M files vmdata : var metadata , dm : dataMatrix, spl : sample metadata
## iId : subscript of ID var vmdata
tdm <- t(as.matrix(dm[,-1]))
tdm <- data.frame(row.names(tdm),tdm)
dsf <- merge(x=spl,y=tdm,by.x = 1, by.y=1, all.x=TRUE, all.y = TRUE )
firstvar <- ncol(spl)+1
lastvar <- ncol(dsf)
## selection of samples only
splonly <- dsf[dsf$sampleType=="sample",]
if (freqBy >0) {
levby <- as.character(levels(as.factor(splonly[[freqBy]])) )
nlev <- nlevels(as.factor(splonly[[freqBy]]))
} else
{
levby <- "Freq"
nlev=1
}
## extract only blank to compute an average of non zero sample to define the threshold by multiplying this average by nTimeBL
if (nTimeBL > 0){
thresh <- rep(NA,lastvar - firstvar +1)
blonly <- dsf[dsf$sampleType=="blank",]
for(j in firstvar:lastvar) {
inz <- which(blonly[[j]]>0)
## if all blank =0 then threshold is fixed to nTimeBL * intThreshold
if (length(inz)==0) thresh[j-firstvar+1] <- nTimeBL * intThreshold
else thresh[j-firstvar+1] <- nTimeBL*mean(blonly[inz,j])
}
}
## if nTimeBL==0 means the same arbitrary threshold for every ions of the data matrix
if (nTimeBL == 0) thresh <- rep(intThreshold,lastvar - firstvar +1)
## init result dataframe 1 line / ion nlev colonnes si by factor + 1 for mean Intensity
nVar <- nrow(vmdata)
resFreq <- data.frame(vmdata[,iId],matrix(data=NA, nrow = nVar, ncol = nlev),stringsAsFactors = FALSE)
resInt <- data.frame(vmdata[,iId],matrix(data=NA, nrow = nVar, ncol = nlev),stringsAsFactors = FALSE)
resMin <- data.frame(vmdata[,iId],matrix(data=NA, nrow = nVar, ncol = nlev),stringsAsFactors = FALSE)
resMax <- data.frame(vmdata[,iId],matrix(data=NA, nrow = nVar, ncol = nlev),stringsAsFactors = FALSE)
colnames(resFreq) <- c("ions",paste(levby,"F",sep="."))
colnames(resInt) <- c("ions",paste(levby,"Med",sep="."))
colnames(resMin) <- c("ions",paste(levby,"min",sep="."))
colnames(resMax) <- c("ions",paste(levby,"MAX",sep="."))
for (i in 1:nVar) {
ions <- dm[i,1]
subds <- splonly[,c(1:firstvar-1,firstvar+i-1)]
## subscript of current variable (ions) last column of subds
sv <- ncol(subds)
## compute number of sample with intensity > intensity Threshold thresh
if (freqBy > 0) {
nul1 <- rep(0,nlev)
int1 <- rep(0,nlev)
min1 <- rep(0,nlev)
max1 <- rep(0,nlev)
for (p in 1:nlev) {
sublev <- subds[subds[[freqBy]]== levby[p],]
## number of sample < threshold
nul1[p] <- nrow(sublev[sublev[[sv]]< thresh[i],])
## frequency : 1 - taux d'inferieur au seuil
nul1[p] <- (1-(nul1[p]/nrow(subds[subds[[freqBy]]== levby[p],])))
## intensity
int1[p] <- median(sublev[sublev[[sv]]>= thresh[i],sv])
## minimum or threshold (computed by blank intensities) if min in sample < threshold
minp <- min(sublev[sublev[[sv]] >= thresh[i],sv])
if ( minp == "Inf") min1[p] <- thresh[i]
else min1[p] <- min(sublev[sublev[[sv]]>= thresh[i],sv])
max1[p] <- max(sublev[[sv]])
}
} else
{
nul1 <- 1-(nrow(subds[subds[[sv]]< thresh[i],])/nrow(subds))
int1 <- median(subds[subds[[sv]] >= thresh[i],sv])
if (min(subds[[sv]]) < thresh[i]) min1 <- thresh[i]
else min1 <- min <- thresh[i]
max1 <- max(subds[[sv]])
}
## results
resFreq[i,1] <- ions
resFreq[i,c(2:(ncol(resFreq)))] <- nul1
resInt[i,1] <- ions
resInt[i,c(2:(ncol(resInt)))] <- int1
resMin[i,1] <- ions
resMin[i,c(2:(ncol(resMin)))] <- min1
resMax[i,1] <- ions
resMax[i,c(2:(ncol(resMax)))] <- max1
#cat(resFreq[i,1],"\n")
}
#resFI <- merge(x=resFreq,y=resInt,by.x=1,by.y=1,all.x=TRUE,all.y=TRUE)
resFI <- cbind(resFreq, resInt, resMin, resMax)
return(resFI)
}
NA_replace <- function(DM,method=c("zero","randmin","halfmin")[2]) {
## replace NA values in the dataMatrix by a random value between 0 and min(ions)
## data matrix DM is supposed to have ions names in the first column
## methods of replacement :
## - "zero" : NA are replaced by 0
## - "randmin" : NA are replaced by a random value between 0 and min(ions)
## - "halfmin" : NA are replaced by min(ions)/2
rownames(DM) <- DM[[1]]; IDvar <- colnames(DM)[1]
tDM <- data.frame(t(DM[,-1]), stringAsFactor =FALSE)
for (i in 1:ncol(tDM)) {
# iNA subscript of obs with NA
iNA <- which(is.na(tDM[[i]]))
if (method=="randmin") {
# min value of the current ion is the minimum without NA values
minval <- min(tDM[-iNA,i])
tDM[iNA,i] <- runif(n=length(iNA),min=0, max=minval)
}
else if (method=="halfmin") {
minval <- min(tDM[-iNA,i])
tDM[iNA,i] <- minval/2
}
else {
## for other method 0 replace NA
tDM[iNA,i] <- 0
}
}
DMnoNA <- t(tDM)
DMnoNA <- data.frame(row.names(DMnoNA),DMnoNA,stringsAsFactors = FALSE); colnames(DMnoNA)[1] <- IDvar
return(DMnoNA)
}
QCBL_filter <- function(VM,DM,spl, threshBL) {
## filtration of ions based on QC/BL intensity (must be > thresBL)
## this function aimed to filter the xtracted ions () for which QC/BL < threhBL
## First check if we have BL and QC or Sample
splType <- as.character(levels(as.factor(spl$sampleType)))
spools <- which(splType == "pool")
ssamp <- which(splType == "sample")
sbl <- which(splType == "blank")
itype <- which(colnames(spl)=="sampleType")
## compute mean by sampleType
meanByType <- moyByFactor(imtx=DM,ispl = spl,subfact=itype,onlyspl=FALSE, oper="mean")
## ratio QC/BL average and set a filter
QCBL <- meanByType$pool/meanByType$blank
QCBLfilter <- QCBL>threshBL
## merge ratio and filter to VM and DM
filterVM <- data.frame(cbind(meanByType,QCBL,QCBLfilter),stringsAsFactors = FALSE)
filterVM <- merge(x = VM, y=filterVM, by.x=1, by.y=1, all.x=TRUE, all.y=TRUE)
filterVM <- filterVM[filterVM$QCBLfilter==TRUE ,]
filterDM <- data.frame(cbind(DM,QCBLfilter),stringsAsFactors = FALSE)
filterDM <- filterDM[filterDM$QCBLfilter==TRUE ,-ncol(filterDM)]
## a tester stopifnot("error VM and DM don't match",filterVM[[1]] != filterDM[[1]])
filterData <- list(filterVM,filterDM)
return(filterData)
}
########## extraction function for both neg ou pos ionisation
## pgen : general param common to both ionisation
## pioni: parameters with possibly different values for each ionisation
xcms_xtract <- function(ioniUsed,pgen,pioni,spl,idmanip,repIoni) {
resSuf=paste(idmanip,ioniUsed,sep="")
resfile = paste(resSuf,"txt",sep=".")
resGroup= paste(resSuf,"2ndGrouping","pdf",sep=".")
resIdent= paste(resSuf,"Ident",sep="_")
cdffiles <- list.files(repIoni,recursive=TRUE,full.names=TRUE,pattern=fileform,ignore.case=TRUE)
###### save the cpu time used for each step using system.time
tpuc <- array(data = NA,dim = c(6,3))
tpucl <- c("readMS","findChromPeak","1stGroup","adjustRtime","2ndGroup","fillPeaks")
## data frame of sample data based on sampleMetadata load before
samples <- sub(basename(cdffiles),pattern = paste(".",fileform,sep=""), replacement = "", fixed = TRUE)
mfgroup <- spl[[pgen$grIndex]] ## subscript of the column group
pd <- data.frame(sample_name = samples,
sample_group=mfgroup,
stringsAsFactors = FALSE)
## readMSdata
tuc <- system.time(raw_data <- readMSData(files = cdffiles, pdata = new("NAnnotatedDataFrame", pd), mode = "onDisk"))
tpuc[1,] <-tuc[c(1:3)]
## find chromatogram peaks xtract from readMS files
cwp <- CentWaveParam(ppm = pioni$ppm,
peakwidth = pioni$pw,
noise = pioni$noise,
snthresh = pioni$sn,
mzdiff = pioni$mzd,
prefilter = c(5,pioni$noise)
)
tuc <-system.time(xdata <- findChromPeaks(raw_data, param = cwp))
tpuc[2,] <-tuc[c(1:3)]
nameGroup <- levels(as.factor(x = xdata$sample_group))
group_colors <- brewer.pal(n=length(x = nameGroup),"Set3")
## group and rt correction
## 1st grouping.
pdp1 <- PeakDensityParam(sampleGroups = xdata$sample_group,
minFraction = pgen$minf, bw <- pioni$bw1, binSize= pioni$mzw)
tuc <- system.time(xdata <- groupChromPeaks(xdata, param = pdp1) )
tpuc[3,] <-tuc[c(1:3)]
##### P O U R T E S T
# x11()
# ## Define the mz slice.
# mzr <- c(205.05, 205.1)
# pdp0 <- PeakDensityParam(sampleGroups = xdata$sample_group,
# minFraction = pgen$minf,
# bw <- 2, binSize= pioni$mzw)
#
# plotChromPeakDensity(xdata, mz = mzr, param = pdp0, pch = 16, xlim = c(400, 600))
# plotChromPeakDensity(xdata, mz = mzr, param = pdp1, pch = 16, xlim = c(400, 600))
##### F I N T E S T
## retention time correction.
pgp <- PeakGroupsParam(minFraction = pgen$miss)
tuc <- system.time(xdata <- adjustRtime(xdata, param = pgp))
tpuc[4,]<-tuc[c(1:3)]
x11(); plotAdjustedRtime(xdata , col = group_colors)
# example with obiwarp
# xdata <- adjustRtime(xdata, param = ObiwarpParam(binSize = 0.6))
## 2nd group after adustRtime
pdp2 <- PeakDensityParam(sampleGroups = xdata$sample_group,
minFraction = pgen$minf, bw <- pioni$bw2, binSize= pioni$mzw)
tuc <- system.time(xdata <- groupChromPeaks(xdata, param = pdp2))
tpuc[5,]<-tuc[c(1:3)]
## fillcchromPeaks replace NA values
tuc <- system.time(xdata <- fillChromPeaks(xdata))
tpuc[6,]<-tuc[c(1:3)]
## saving dataframe of result
save(xdata, file=paste(ioniUsed,"XtractFill.Rdata",sep=""))
## list of features
xf <- featureDefinitions(xdata)
## CAMERA ####################################################################################
xs <- as(xdata, 'xcmsSet')
## xs <- getxcmsSetObject(xdata)
#xsa <- xsAnnotate(xs,polarity =ioniUsed)
#Group after RT value of the xcms grouped peak
#xsaF <- groupFWHM(xsa, perfwhm=0.6)
#Verify grouping
#xsaC <- groupCorr(xsaF)
#Annotate isotopes, could be done before groupCorr
#xsaFI <- findIsotopes(xsaC)
#Annotate adducts
#xsaFA <- findAdducts(xsaFI, polarity=ioniUsed)
### OUUUUUUUUUUUUUUUUUUUUUUUUUUU A utiliser au lieu des 5 dernieres lignes de code
xsaFA <- annotate(xs, sample=NA, sigma=6, perfwhm=0.6,
cor_eic_th=0.75, graphMethod="hcs", pval=0.05, calcCiS=TRUE,
calcIso=FALSE, calcCaS=FALSE, maxcharge=3, maxiso=4, minfrac=0.5,
ppm=pioni$ppm, mzabs=0.015, quick=FALSE, psg_list=NULL, rules=NULL,
polarity=ioniUsed, multiplier=3, max_peaks=100 ,intval="maxo")
# Peaklist post CAMERA
xC <- (getPeaklist(xsaFA))
ions <- paste(substr(ioniUsed,1,1),round(xC$mz,4),"T",round((xC$rt/60),1),sep="")
xC <- data.frame(cbind(ions,xC$isotopes,xC$adduct,xC$pcgroup), stringsAsFactors = FALSE)
colnames(xC)[2:4] <- c("isotopes","adducts","pcgroup")
## variable metadata
ncr <- dim(xf)[2]-1
ions <- paste(substr(ioniUsed,1,1),round(xf$mzmed,4),"T",round((xf$rtmed/60),1),sep="")
VM <- data.frame(ions,xf[,c(1:ncr)])
## merge VM with CAMERA annotation results
VM <- merge(x=VM, y=xC, by.x=1, by.y=1, all.x=TRUE, all.y=TRUE)
## data matrix
## default missing=NA but can be : missing = "rowmin_half"
## values = "into" or "maxo"
DM <- featureValues(xdata, value = "maxo")
colnames(DM) <- samples
DM <- data.frame(ions,DM,stringsAsFactors = FALSE)
## NA replace
DM <- NA_replace(DM,method="randmin")
############################# F I L T r A T I O N QC/Blank > threshBL #################################
fildata <- QCBL_filter(VM,DM,spl,threshBL)
## write on disk
tpuc <- data.frame(tpucl,tpuc)
allres <- list(VM,DM,tpuc)
return(allres)
}
computeCV <- function(VM,DMsimca,spl) {
## compute CV pools and samples using :
## - spl metadataSample file
## - VM variable metadata sample file
## - DMsimca : data matrix simca with ions in column and sample in rows
# subscript of samples and pools
ispl <- which(DMsimca$sampleType=="sample")
iQC <- which(DMsimca$sampleType=="pool")
if (length(ispl)<2 | length(iQC)<1) {stop("pas assez de sample ou de QC")}
firstion <- ncol(spl)+1
lastion <- ncol(DMsimca)
nbions <- lastion - firstion+1
labion <- rep(NA,nrow(VM))
CV <- matrix(0,nrow=nbions, ncol = 3); row.names(CV) <- VM[[1]]; colnames(CV) <- c("CVpools","CVsamples","CVok")
for (j in firstion:lastion) {
i <- j-firstion+1
labion[i] <- colnames(DMsimca)[j]
## cv pools
CV[i,1] <- sd(DMsimca[iQC,j])/mean(DMsimca[iQC,j])
## cv samples
CV[i,2] <- sd(DMsimca[ispl,j])/mean(DMsimca[ispl,j])
## if CV pool < 30% and CV pools < CV samples then CV ok =1
if (!is.na(CV[i,1]) & !is.na(CV[i,2]) & CV[i,1] <0.3 & CV[i,1] < CV[i,2]) CV[i,3]=1
}
CV <- data.frame(labion,CV, stringsAsFactors = FALSE)
VM <- merge(VM,CV, by.x=1,by.y=1)
return(VM)
}
callDIABLO <- function(Y, data, ncpUser=NA, testkeep, crossValfold = 7, crossValrep=10) {
### use mixOmics DIABLO to run multi-block sparsePLS
### limited to 2 blocks of data (example Pos and Neg mass spec. ionisation)
### Y : qualitative variable to be explained (1 Dimension)
### data : a list with the 2 numeric matrix coresponding to the 2 blocks of variables
### ncpuser: used to force the number of component. If=NA the algo define the best number of components.
### testKeep correspond to the number of variate choosen for each component
### crossValdfold : cross validation Mfold
### crossValrep : cross validation number of iteration
design = matrix(0.1, ncol = length(data), nrow = length(data),
dimnames = list(names(data), names(data)))
diag(design) = 0
## call misomics block splsda
if (!is.na(ncpUser)) {nc <- ncpUser} else {nc <- (length(levels(Y))-1)}
sgccda.res = block.splsda(X = data, Y = Y, ncomp = nc, design = design)
set.seed(123) # for reproducibility, only when the `cpus' argument is not used
# this code takes a couple of min to run
perf.diablo = perf(sgccda.res, validation = 'Mfold', folds = crossValfold, nrepeat = crossValrep)
#perf.diablo # lists the different outputs
# x11()
# plot(perf.diablo)
# dev.off()
## diablo or userdefined number of spls component
if (!is.na(ncpUser)) ncomp <- ncpUser else ncomp = perf.diablo$choice.ncomp$WeightedVote["Overall.BER", "centroids.dist"]
cat("nb comp perf.diablo",ncomp,"\n")
## tune keepX parameter
tune.Zebra = tune.block.splsda(X = data, Y = Y, ncomp = ncomp,
test.keepX = testkeep, design = design,
validation = 'Mfold', folds = crossValfold, nrepeat = crossValrep,
cpus = 2, dist = "centroids.dist")
list.keepX = tune.Zebra$choice.keepX
## retained model with results of tune
sgccda.res = block.splsda(X = data, Y = Y, ncomp = ncomp, keepX = list.keepX, design = design)
print(sgccda.res)
############################### select for each block and each component, variables selected by sparse ##########################
nbvartot <- 0
for (i in 1:length(data)) nbvartot <- nbvartot + ncol(data[[i]])
labvar <- colnames(data[[1]])
for (i in 2:length(data)) labvar <- c(labvar,colnames(data[[i]]))
## init resu selectVar
resSel <- data.frame(labvar,array(0,dim = c(nbvartot,ncomp)))
for (i in 1:length(data))
for (j in 1:ncomp) {
varblock <- selectVar(sgccda.res, block = c(1,2), comp = j)[[i]]$name
resSel[match(varblock,resSel[[1]]),j+1] <- 1
}
################################################ diagnostic graphics #############################################################
pdf(file="graphs_evaluation.pdf", width = 11.5, height = 7.8)