-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathloaders.py
118 lines (91 loc) · 3.1 KB
/
loaders.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
from functools import partial
import logging
import os
import h5py
import numpy as np
from calodata.features import extract_features, extract_dataframe
# Shorthand for np.concatenate with axis=0 pre-bound: joins a sequence of
# arrays along their first axis.
concat = partial(np.concatenate, axis=0)
def load_calodata(fpaths):
    """
    Load and stack the per-layer datasets from one or more HDF5 files.

    Each file is opened read-only and its datasets 'layer_0', 'layer_1' and
    'layer_2' are read fully into memory. Arrays that belong to the same
    layer are then concatenated along axis 0, in the order the files are
    given.

    Parameters:
    -----------
    fpaths: an iterable of paths to HDF5 files. A single path (str or
        os.PathLike) is also accepted and treated as a one-element list.

    Returns:
    --------
    data: a list of 3 numpy arrays, representing the energy deposition in
        each layer for the group of showers contained in the files 'fpaths'

    Raises:
    -------
    ValueError: if 'fpaths' contains no paths (e.g. a glob matched nothing)
    """
    n_layers = 3

    # A bare string would otherwise be iterated character by character.
    if isinstance(fpaths, (str, os.PathLike)):
        fpaths = [fpaths]

    # One list of chunks per layer; concatenating once at the end avoids
    # re-copying the accumulated arrays for every additional file.
    per_layer = [[] for _ in range(n_layers)]
    for fpath in fpaths:
        with h5py.File(fpath, "r") as h5:
            for i, chunks in enumerate(per_layer):
                # [:] copies the dataset into memory before the file closes.
                chunks.append(h5["layer_{}".format(i)][:])

    if not per_layer[0]:
        raise ValueError("load_calodata: no input files were given")

    # With a single file, hand back the arrays as read (no extra copy).
    return [
        chunks[0] if len(chunks) == 1 else np.concatenate(chunks, axis=0)
        for chunks in per_layer
    ]
def load_all_data(
    basedir,
    class_one="piplus",
    class_two="eplus",
    ending="_angle_position_5deg_xy.h5",
):
    """
    Build a shuffled 70/30 train/test split for a two-class shower dataset.

    For each class, every file matching '<basedir>/<class><ending>' is loaded
    with load_calodata. The two classes are stacked layer by layer
    (class_one first), labelled 1 for class_one and 0 for class_two, and
    features are computed with extract_features and divided column-wise by
    their maximum absolute value over the whole (train + test) sample.

    The shuffle seeds NumPy's global random state with 0, so the split is
    reproducible, and the global state is reset as a side effect.

    Other class pairings (the original author's notes mention 'gamma' vs
    'eplus') are selected through class_one / class_two.

    Parameters:
    -----------
    basedir: directory holding the HDF5 files
    class_one: file-name prefix of the class labelled 1
    class_two: file-name prefix of the class labelled 0
    ending: file-name suffix (may contain glob wildcards) appended to the
        class prefix

    Returns:
    --------
    a dict with keys 'ix_train', 'ix_test', 'features_train', 'data_train',
    'labels_train', 'features_test', 'data_test', 'labels_test',
    'raveled_train' and 'raveled_test'
    """
    import glob

    logger = logging.getLogger(__name__)
    root = os.path.abspath(basedir)

    def _load_class(name):
        # Find and load every file for one particle class.
        matches = glob.glob(os.path.join(root, "{}{}".format(name, ending)))
        logger.info("Extracting data for {} from {}".format(name, matches))
        return load_calodata(matches)

    showers_one = _load_class(class_one)
    showers_two = _load_class(class_two)

    # Stack the two classes per layer, class_one rows first.
    data = [concat(pair) for pair in zip(showers_one, showers_two)]
    n_one = showers_one[0].shape[0]
    n_two = showers_two[0].shape[0]
    labels = np.array([1] * n_one + [0] * n_two)
    logger.info("Number of {} events = {}".format(class_one, n_one))
    logger.info("Number of {} events = {}".format(class_two, n_two))

    # Shower-shape features, each column scaled by its largest magnitude.
    features = extract_features(data)
    col_scale = np.abs(features).max(axis=0)[np.newaxis, :]
    features = features / col_scale

    # Deterministic permutation of the event indices.
    np.random.seed(0)
    ix = np.array(range(len(labels)))
    np.random.shuffle(ix)

    # First 70% of the shuffled indices train, the remainder test.
    nb_train = int(0.7 * len(ix))
    ix_train, ix_test = ix[:nb_train], ix[nb_train:]

    def _images(idx):
        # Pick the events, add a trailing axis of length 1, divide by 1000
        # (presumably a unit conversion -- TODO confirm).
        return [np.expand_dims(layer[idx], -1) / 1000.0 for layer in data]

    def _ravel(images):
        # Flatten every layer per event and join the layers side by side.
        flat = [img.reshape(img.shape[0], -1) for img in images]
        return np.concatenate(flat, axis=-1)

    data_train = _images(ix_train)
    data_test = _images(ix_test)

    return {
        "ix_train": ix_train,
        "ix_test": ix_test,
        "features_train": features[ix_train],
        "data_train": data_train,
        "labels_train": labels[ix_train],
        "features_test": features[ix_test],
        "data_test": data_test,
        "labels_test": labels[ix_test],
        "raveled_train": _ravel(data_train),
        "raveled_test": _ravel(data_test),
    }