-
Notifications
You must be signed in to change notification settings - Fork 8
/
run_random_forest.py
123 lines (112 loc) · 6.46 KB
/
run_random_forest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import pandas as pd
import sys
from scipy.stats import pearsonr
import os
import argparse
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from music_utils import *
# ---- Command-line interface -------------------------------------------------
parser = argparse.ArgumentParser(description='Train and evaluate random forest regressors.')
parser.add_argument('--outprefix', help='Prefix for files generated. E.g. /path/to/output/directory/fileIdentifier')
parser.add_argument('--fold', type=int, help='Specify which fold of k-fold cross validation to train.')
parser.add_argument('--emd_label', nargs='+', help='Label for each embedding file.')
# type=int keeps CLI values consistent with the all-ints default ([1] * n).
parser.add_argument('--train_set', nargs='+', type=int, help='For each embedding data, specify which training set to use. In order with <emd_label>. Default to 1 for each emd_label.')
parser.add_argument('--n_estimators', type=int, default=1000, help='The number of trees in the forest.')
parser.add_argument('--max_depth', type=int, default=30, help='The maximum depth of the tree.')
parser.add_argument('--n_jobs', type=int, default=8, help='The number of jobs to run in parallel.')
args = parser.parse_args()

if args.outprefix is None:
    raise ValueError('Please specify --outprefix: Prefix for files generated. E.g., /path/to/output/directory/fileIdentifier')
outprefix = args.outprefix

# Check if output directory exists, make directory if not exist.
# os.path.dirname may return '' when outprefix has no directory component
# (the old os.mkdir('') would crash); os.makedirs also creates intermediate
# directories and exist_ok avoids a check-then-create race.
outdir = os.path.dirname(outprefix)
if outdir:
    os.makedirs(outdir, exist_ok=True)
# Pull hyper-parameters and the fold selector out of the parsed arguments.
fold = args.fold
max_depth = args.max_depth
n_estimators = args.n_estimators
n_jobs = args.n_jobs
emd_label = args.emd_label

# One training-set index per embedding; default to set 1 for every modality.
train_set = [1] * len(emd_label) if args.train_set is None else args.train_set
if len(train_set) != len(emd_label):
    raise ValueError('Entered train_set does not match emd_label.')

# Collect '<label>_<set>' tags for the model file name, verifying along the
# way that the requested training set exists for each embedding.
tmp_label = []
for label, tset in zip(emd_label, train_set):
    tmp_label.append('{}_{}'.format(label, tset))
    if not os.path.exists('{}_{}/training_set_{}'.format(outprefix, label, tset)):
        raise ValueError('{} does not have training set {}'.format(label, tset))

# Directory where the pickled model will be written.
model_dir = '{}_trained_models'.format(outprefix)
if not os.path.exists(model_dir):
    os.mkdir(model_dir)
outfname = '{}/{}.RF_maxDep_{}_nEst_{}.fold_{}.pkl'.format(
    model_dir, '_'.join(tmp_label), max_depth, n_estimators, fold)
print('Trained model will be saved in {}'.format(outfname))
# ---- Build and train the random forest --------------------------------------
rf = RandomForestRegressor(max_depth=max_depth,
                           n_estimators=n_estimators,
                           n_jobs=n_jobs)
print(rf)

sample_dir = '{}_train_test_data'.format(outprefix)
# Balanced, shuffled row indices selecting this fold's training samples.
train_idx = np.load('{}/train_idx_{}.balanced.shuffled.npy'.format(sample_dir, fold), allow_pickle=True)
# Target values, restricted to the same subset of rows.
y_train = np.load('{}/y_train_genepair_{}.npy'.format(sample_dir, fold), allow_pickle=True)[train_idx]

# Load the per-modality feature matrices, then stack them column-wise in the
# order given by emd_label so feature columns are reproducible.
feature_blocks = []
emd_dim = []
for label, tset in zip(emd_label, train_set):
    block = np.load('{}_{}/training_set_{}/X_train_{}.by_gp.npy'.format(outprefix, label, tset, fold),
                    allow_pickle=True)[train_idx]
    emd_dim.append(block.shape[1])
    feature_blocks.append(block)
X_train = np.concatenate(feature_blocks, axis=1)
print('... loaded training data')

if len(set(emd_dim)) > 1:
    print('NOTE: we recommend using same number of features for each data modality.')
print('Start training random forest regressor on {} samples with {} total features...'.format(X_train.shape[0],
                                                                                              X_train.shape[1]))
rf.fit(X_train, y_train)
print('... finished training')
save_obj(rf, outfname)
print('... saved trained random forest regressor')
# ---- Predict the held-out test set ------------------------------------------
print('\nStart predicting test data...')
# Load every modality's test features and stack them column-wise in the same
# order used for training so feature columns line up with the fitted model.
test_blocks = [np.load('{}_{}/training_set_{}/X_test_{}.npy'.format(outprefix, label, tset, fold),
                       allow_pickle=True)
               for label, tset in zip(emd_label, train_set)]
X_test = np.concatenate(test_blocks, axis=1)
print('... loaded {} test samples'.format(X_test.shape[0]))
y_pred = rf.predict(X_test)

pred_dir = '{}_yPred_output'.format(outprefix)
# makedirs(exist_ok=True) is nested-safe and avoids a check-then-create race.
os.makedirs(pred_dir, exist_ok=True)
# Swap only the trailing '.pkl' extension for '.npy'. The previous
# .replace('pkl', 'npy') substituted EVERY occurrence of the substring,
# corrupting the name whenever a label contained 'pkl'.
pred_base = os.path.splitext(os.path.basename(outfname))[0]
np.save('{}/{}.npy'.format(pred_dir, pred_base), y_pred, allow_pickle=True)
print('... finished predicting test samples')

# Report test-set performance as the Pearson correlation with the truth.
y_test = np.load('{}/y_test_genepair_{}.npy'.format(sample_dir, fold), allow_pickle=True)
print('Pearson r: {}'.format(pearsonr(y_test, y_pred)[0]))
# ---- Predict gene pairs without specific GO annotations ----------------------
print('\nStart predicting gene pairs without specific GO annotations...')
# Stack the per-modality feature matrices of the remaining gene pairs in the
# same modality order used for training.
rest_blocks = [np.load('{}_{}/training_set_{}/rest_genepair.npy'.format(outprefix, label, tset),
                       allow_pickle=True)
               for label, tset in zip(emd_label, train_set)]
X_rest = np.concatenate(rest_blocks, axis=1)
y_rest = rf.predict(X_rest)
# Strip only the trailing '.pkl' extension. The previous .replace('.pkl', '')
# removed EVERY occurrence of the substring, corrupting names that happened to
# contain '.pkl' elsewhere.
model_base = os.path.splitext(os.path.basename(outfname))[0]
np.save('{}/{}.restGP.npy'.format(pred_dir, model_base), y_rest, allow_pickle=True)
print('\n=== finished! ===')