# Useful_func.py (forked from lpernie/Keras_Tutorial)
import glob
import json
import logging

import numpy as np
import pandas as pd
from numpy.lib.recfunctions import stack_arrays
from root_numpy import root2array
from sklearn.feature_selection import SelectKBest, f_classif


def root2panda(files_path, tree_name, **kwargs):
    '''
    Args:
    -----
        files_path: a string like './data/*.root', for example
        tree_name: a string like 'Collection_Tree' corresponding to the name of the TTree inside the
            ROOT file that we want to open
        kwargs: arguments taken by root2array, such as branches to consider, start, stop, step, etc.
    Returns:
    --------
        output_panda: a pandas dataframe like allbkg_df in which all the info from the ROOT file will be stored
    Note:
    -----
        if you are working with .root files that contain different branches, you might have to mask
        your data; in that case, return pd.DataFrame(ss.data)
    '''
    # -- create list of .root files to process
    files = glob.glob(files_path)

    # -- process the ntuples into record arrays and stack them into one array
    ss = stack_arrays([root2array(fpath, tree_name, **kwargs) for fpath in files])
    try:
        return pd.DataFrame(ss)
    except Exception:  # ss is a masked record array: fall back to its underlying data
        return pd.DataFrame(ss.data)
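
# Example usage (a sketch; the glob pattern and branch names are hypothetical,
# 'Collection_Tree' is the name used in the docstring above):
#   allbkg_df = root2panda('./data/*.root', 'Collection_Tree', branches=['jet_pt', 'jet_eta'])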


def flatten(column):
    '''
    Args:
    -----
        column: a column of a pandas df whose entries are lists (or regular entries -- in which case nothing is done)
            e.g.: my_df['some_variable']
    Returns:
    --------
        flattened out version of the column.
        For example, it will turn:
            [1791, 2719, 1891]
            [1717, 1, 0, 171, 9181, 537, 12]
            [82, 11]
            ...
        into:
            1791, 2719, 1891, 1717, 1, 0, 171, 9181, 537, 12, 82, 11, ...
    '''
    try:
        return np.array([v for e in column for v in e])
    except (TypeError, ValueError):  # the entries are scalars, not lists: nothing to flatten
        return column
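
# Example (a sketch; 'my_df' and its column name are hypothetical):
#   all_track_pts = flatten(my_df['trk_pt'])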


def create_stream(df, num_obj, sort_col):
    '''
    Build the (nb_events, num_obj, n_variables) tensor for one input stream:
    sort the objects in each event by sort_col, truncate/pad to num_obj,
    then split into train/test and scale the features.
    '''
    n_variables = df.shape[1]
    var_names = df.keys()
    data = np.zeros((df.shape[0], num_obj, n_variables), dtype='float32')

    # -- call functions to build X (a.k.a. data)
    sort_objects(df, data, sort_col, num_obj)

    # -- ix_train and ix_test are assumed to exist at module level,
    #    from a previous train/test split or a previously stored ordering
    Xobj_train = data[ix_train]
    Xobj_test = data[ix_test]

    scale(Xobj_train, var_names, savevars=True)   # scale training sample and save scaling
    scale(Xobj_test, var_names, savevars=False)   # apply the stored scaling to the test set

    return Xobj_train, Xobj_test
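
# Example (a sketch; ix_train / ix_test must exist at module level, e.g. from
# sklearn.model_selection.train_test_split on the event indices, and the
# num_obj / sort_col values here are hypothetical):
#   ix_train, ix_test = train_test_split(np.arange(df.shape[0]), test_size=0.3)
#   Xobj_train, Xobj_test = create_stream(df, num_obj=15, sort_col='trk_pt')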


def sort_objects(df, data, SORT_COL, max_nobj):
    '''
    sort objects using your preferred variable
    Args:
    -----
        df: a dataframe with event-level structure where each event is described by a sequence of jets, muons, etc.
        data: an array of shape (nb_events, nb_particles, nb_features)
        SORT_COL: a string representing the column to sort the objects by
        max_nobj: number of particles to keep per event; events with more are truncated, events with fewer are padded with -999
    Returns:
    --------
        modifies @a data in place. Pads with -999
    '''
    # i = event number, event = all the variables for that event
    # (wrap df.iterrows() in tqdm.tqdm(..., total=df.shape[0]) for a progress bar)
    for i, event in df.iterrows():
        # objs = [[pt's], [eta's], ...] of the particles in this event,
        # with the columns sorted by SORT_COL in descending order
        objs = np.array(
            [v.tolist() for v in event.values],
            dtype='float32'
        )[:, (np.argsort(event[SORT_COL]))[::-1]]
        # total number of tracks in this event
        nobjs = objs.shape[1]
        n_keep = min(nobjs, max_nobj)
        # take all tracks unless there are more than max_nobj
        data[i, :n_keep, :] = objs.T[:n_keep, :]
        # default value for the missing tracks
        data[i, n_keep:, :] = -999
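
# Example (a sketch; keeping 15 objects per event and sorting by 'trk_pt' are
# hypothetical choices):
#   data = np.zeros((df.shape[0], 15, df.shape[1]), dtype='float32')
#   sort_objects(df, data, SORT_COL='trk_pt', max_nobj=15)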


def scale(data, var_names, savevars, VAR_FILE_NAME='scaling.json'):
    '''
    Args:
    -----
        data: a numpy array of shape (nb_events, nb_particles, n_variables)
        var_names: list of keys to be used for the model
        savevars: bool -- True for training, False for testing
            it decides whether we want to fit on data to find mean and std
            or if we want to use those stored in the json file
    Returns:
    --------
        modifies data in place, writes out scaling dictionary
    '''
    scale_dict = {}
    if savevars:
        for v, name in enumerate(var_names):
            f = data[:, :, v]
            slc = f[f != -999]  # exclude the -999 padding from the statistics
            m, s = slc.mean(), slc.std()
            slc -= m
            slc /= s
            data[:, :, v][f != -999] = slc.astype('float32')
            scale_dict[name] = {'mean': float(m), 'sd': float(s)}
        with open(VAR_FILE_NAME, 'w') as varfile:
            json.dump(scale_dict, varfile)
    else:
        with open(VAR_FILE_NAME, 'r') as varfile:
            varinfo = json.load(varfile)
        for v, name in enumerate(var_names):
            f = data[:, :, v]
            slc = f[f != -999]
            m = varinfo[name]['mean']
            s = varinfo[name]['sd']
            slc -= m
            slc /= s
            data[:, :, v][f != -999] = slc.astype('float32')
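
# Example (a sketch, reusing the names from create_stream above):
#   scale(Xobj_train, var_names, savevars=True)    # fit mean/sd and write scaling.json
#   scale(Xobj_test, var_names, savevars=False)    # reuse the stored mean/sd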


def feature_selection(train_data, features, k):
    """
    Definition:
    -----------
        !! ONLY USED FOR INTUITION, IT'S USING A LINEAR MODEL TO DETERMINE IMPORTANCE !!
        Gives an approximate ranking of variable importance and prints out the top k
    Args:
    -----
        train_data: dictionary containing keys "X" and "y" for the training set, where:
            X = ndarray of dim (# training examples, # features)
            y = array of dim (# training examples) with target values
        features: names of the features used for training, in the order in which they were inserted into X
        k: int, the function will print the top k features in order of importance
    """
    # -- select the k top features, as ranked using the ANOVA F-score
    tf = SelectKBest(score_func=f_classif, k=k)
    tf.fit_transform(train_data["X"], train_data["y"])

    # -- log the names of the top features, sorted by decreasing score
    top_features = [f for (_, f) in sorted(zip(tf.scores_, features), reverse=True)][:k]
    logging.getLogger("process_data").info(
        "The {} most important features are {}".format(k, top_features)
    )
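
# Example (a sketch; X, y and the feature names are hypothetical):
#   feature_selection({'X': X, 'y': y}, ['jet_pt', 'jet_eta', 'jet_m'], k=2)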


def my_min(a, b):
    if a < b:
        return a
    return b


def my_max(a, b):
    if a > b:
        return a
    return b


class PDF(object):
    '''Display a PDF file inline in a Jupyter notebook (HTML iframe) or a LaTeX export.'''
    def __init__(self, pdf, size=(200, 200)):
        self.pdf = pdf
        self.size = size

    def _repr_html_(self):
        return '<iframe src={0} width={1[0]} height={1[1]}></iframe>'.format(self.pdf, self.size)

    def _repr_latex_(self):
        return r'\includegraphics[width=1.0\textwidth]{{{0}}}'.format(self.pdf)
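
# Example (a sketch; 'plots/roc.pdf' is a hypothetical path; evaluate as the last
# expression of a notebook cell so IPython renders _repr_html_):
#   PDF('plots/roc.pdf', size=(600, 400))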