-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.r
105 lines (75 loc) · 2.54 KB
/
utils.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# Isolation-forest outlier screen for the samples of a phyloseq object.
#
# Trains an H2O isolation forest on the OTU abundance table (one row per
# sample, one column per taxon) and treats as outliers the samples whose
# `mean_length` prediction lies strictly below the `outlier_quantile`
# quantile of all predictions.
#
# Arguments:
#   pseq              phyloseq object; otu_table(pseq) supplies the features.
#   outlier_quantile  Single probability in [0, 1] used as the cutoff on
#                     `mean_length`. The default 0.1 reproduces the original
#                     hard-coded first-decile cutoff.
#
# Returns:
#   Character vector with the row names of the samples that are kept.
#
# Side effects:
#   Attaches h2o and phyloseq, calls h2o.init() on localhost, calls
#   h2o.removeAll() (which clears the objects held by that cluster), turns
#   off H2O progress bars and prints progress messages.
isoForest <- function(pseq, outlier_quantile = 0.1) {
  stopifnot(
    is.numeric(outlier_quantile),
    length(outlier_quantile) == 1,
    !is.na(outlier_quantile),
    outlier_quantile >= 0,
    outlier_quantile <= 1
  )

  # library() stops with an error when a package is missing, whereas
  # require() would only return FALSE and let the failure surface later.
  library(h2o)
  library(phyloseq)

  # The model needs samples as rows. Transpose only when the table is stored
  # with taxa as rows; an unconditional t() would put taxa in the rows for
  # objects stored the other way round.
  otu <- otu_table(pseq)
  if (taxa_are_rows(otu)) {
    otu <- t(otu)
  }
  datos <- as.data.frame(methods::as(otu, "matrix"))

  # Initialise the H2O cluster.
  print('Initializing Isolation Forest...')
  h2o.init(
    ip = "localhost",
    # Use every available core.
    nthreads = -1,
    # Upper bound on the memory given to the cluster.
    max_mem_size = "8g"
  )
  # NOTE(review): this wipes whatever the cluster already holds, including
  # frames/models created by the caller - confirm that is intended.
  h2o.removeAll()
  h2o.no_progress()

  # Upload the data to the H2O cluster.
  datos_h2o <- as.h2o(x = datos)

  # Fit the isolation forest on every column.
  print('Creating Isolation Forest Model...')
  isoforest <- h2o.isolationForest(
    model_id = "isoforest",
    training_frame = datos_h2o,
    x = colnames(datos_h2o),
    max_depth = 350, # Maximum tree depth
    ntrees = 100, # Number of trees
    # Row sampling rate per tree; -1 is kept from the original call
    # (presumably "unset", so H2O falls back to sample_size - TODO confirm).
    sample_rate = -1
  )

  # Score the same samples that were used for training.
  predicciones_h2o <- h2o.predict(
    object = isoforest,
    newdata = datos_h2o
  )
  predicciones <- as.data.frame(predicciones_h2o)

  # Samples strictly below the cutoff quantile of mean_length are dropped.
  cutoff <- stats::quantile(
    x = predicciones$mean_length,
    probs = outlier_quantile,
    names = FALSE
  )
  rmPats <- which(predicciones$mean_length < cutoff)
  Pats <- setdiff(rownames(datos), rownames(datos)[rmPats])

  print(paste0('Removing ', length(rmPats), ' outliers patients'))
  print(rmPats)
  # res = prune_samples(Pats, pseq)
  print('Isolation forest analysis done!')
  return(Pats)
}
# Label and class-balance a phyloseq object for a target variable.
#
# When `target` is 'COUNTRY', the samples are restricted to COUNTRY values
# 'USA' and 'United Kingdom', and the larger of the two groups is randomly
# down-sampled to the size of the smaller one. For any other `target` the
# object is passed through unchanged. In every case the class counts of
# `target` are printed.
#
# Arguments:
#   physeq  phyloseq object whose sample data holds the `target` variable.
#   target  Single string naming the sample variable to label by.
#
# Returns:
#   The (possibly subset and balanced) phyloseq object.
#
# Side effects:
#   Attaches phyloseq, prints progress messages, and - on the 'COUNTRY'
#   path - calls set.seed(5555), which leaves the global RNG reseeded for the
#   caller.
labeling <- function(physeq, target) {
  # library() errors when phyloseq is missing; require() would only
  # return FALSE and fail later with a less helpful message.
  library(phyloseq)
  print(paste0('Labeling data by ', target))

  if (target == 'COUNTRY') {
    physeq <- subset_samples(
      physeq,
      COUNTRY == 'USA' | COUNTRY == 'United Kingdom'
    )

    # Balance data: collect the sample names of each class.
    usa <- rownames(sample_data(subset_samples(physeq, COUNTRY == 'USA')))
    uk <- rownames(
      sample_data(subset_samples(physeq, COUNTRY == 'United Kingdom'))
    )

    # Down-sample whichever class is larger. Assuming 'USA' is always the
    # majority makes sample() fail when it is not, because it cannot draw
    # more names than it is given without replacement.
    if (length(usa) >= length(uk)) {
      major <- usa
      minor <- uk
    } else {
      major <- uk
      minor <- usa
    }

    # Fixed seed so the same samples are drawn on every run.
    set.seed(5555)
    subSampling <- sample(x = major, size = length(minor))
    physeq <- prune_samples(c(subSampling, minor), physeq)
  }

  print('Balancing data ...')
  print(table(get_variable(physeq, target)))
  return(physeq)
}
# FCBF feature selection.
#
# Splits `data` into the `target` column and the remaining feature columns,
# discretizes the transposed feature table with discretize_exprs(), runs
# fcbf() on it and returns only the features listed in the `index` column of
# the fcbf() result, together with the target.
#
# Arguments:
#   data    Data frame with one column per feature plus a column `target`.
#   thresh  Value forwarded unchanged to fcbf() as its `thresh` argument.
#
# Returns:
#   Data frame holding the selected feature columns (original names kept)
#   followed by `target` as a factor.
#
# Note: discretize_exprs() and fcbf() are not loaded here and must already be
# in scope (presumably from the FCBF package - see the commented-out
# require() below).
fast.cor.FS <- function(data, thresh) {
  stopifnot('target' %in% names(data))
  # require(FCBF)
  y <- as.factor(data$target)
  x <- subset(data, select = -c(target))

  # The discretizer is fed the transposed table (features in rows).
  dis <- discretize_exprs(t(x))
  # su_plot(dis, y)
  selected <- fcbf(dis, y, verbose = TRUE, thresh = thresh)

  # drop = FALSE keeps a data frame even when a single feature is selected.
  # Without it the column collapses to a bare vector, cbind() builds a
  # matrix, the feature name is lost and the factor target is turned into
  # its integer codes.
  xx <- x[, selected$index, drop = FALSE]
  xx <- as.data.frame(cbind(xx, target = y))
  return(xx)
}