Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add std variables #23

Merged
merged 3 commits into from
Mar 8, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ Once the models are trained, run the compare_models.py scripts, making sure the
```
conda env create -n event_types -f environment.yml
source activate event_types
pip install -e .
python setup.py install
```

Once the package is set up, next time just run
Expand Down
184 changes: 157 additions & 27 deletions event_types/event_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ def branches_to_read():
'''

branches = [
'MCe0',
'MCze',
'MCaz',
'Ze',
Expand Down Expand Up @@ -163,13 +164,13 @@ def branches_to_read():
'meanPedvar_Image',
'fui',
'cross',
'crossO',
'R',
'ES',
'MWR',
'MLR',
'asym',
'tgrad_x',
'width',
'length',
'DispWoff_T',
]

return branches
Expand Down Expand Up @@ -198,25 +199,47 @@ def nominal_labels_train_features():
'MSCW',
'MSCL',
'log_EChi2S',
'log_av_size',
'log_EmissionHeight',
'log_EmissionHeightChi2',
'av_dist',
'log_DispDiff',
'log_dESabs',
'loss_sum',
'NTrig',
'meanPedvar_Image',
'MSWOL',
'log_av_size',
'log_me_size',
'log_std_size',
'av_dist',
'me_dist',
'std_dist',
'av_fui',
'me_fui',
'std_fui',
'av_cross',
'av_crossO',
'me_cross',
'std_cross',
'av_R',
'me_R',
'std_R',
'av_ES',
'MWR',
'MLR',
'MSWOL',
'me_ES',
'std_ES',
'av_asym',
'me_asym',
'std_asym',
'av_tgrad_x',
'me_tgrad_x',
'std_tgrad_x',
'av_width',
'me_width',
'std_width',
'av_length',
'me_length',
'std_length',
'av_dispCombine',
'me_dispCombine',
'std_dispCombine',
]

return labels, train_features
Expand All @@ -241,6 +264,11 @@ def extract_df_from_dl2(root_filename):
A pandas DataFrame with variables to use in the regression/classification, after cuts.
'''

DISP_ERROR_EXPONENTIAL = 50

def disp_error_weight(disp_error):
return np.exp(-1 * DISP_ERROR_EXPONENTIAL * abs(disp_error))

branches = branches_to_read()

particle_file = uproot.open(root_filename)
Expand Down Expand Up @@ -283,8 +311,8 @@ def extract_df_from_dl2(root_filename):
)

# Variables for training:
av_size = [np.average(sizes) for sizes in data_arrays['size'][gamma_like_events]]
reco_energy = data_arrays['ErecS'][gamma_like_events]
true_energy = data_arrays['MCe0'][gamma_like_events]
NTels_reco = data_arrays['NImages'][gamma_like_events]
x_cores = data_arrays['Xcore'][gamma_like_events]
y_cores = data_arrays['Ycore'][gamma_like_events]
Expand All @@ -301,24 +329,67 @@ def extract_df_from_dl2(root_filename):
EmissionHeight = data_arrays['EmissionHeight'][gamma_like_events]
EmissionHeightChi2 = data_arrays['EmissionHeightChi2'][gamma_like_events]
dist = data_arrays['dist'][gamma_like_events]
av_dist = [np.average(dists) for dists in dist]
DispDiff = data_arrays['DispDiff'][gamma_like_events]
dESabs = data_arrays['dESabs'][gamma_like_events]
loss_sum = [np.sum(losses) for losses in data_arrays['loss'][gamma_like_events]]
NTrig = data_arrays['NTrig'][gamma_like_events]
meanPedvar_Image = data_arrays['meanPedvar_Image'][gamma_like_events]

av_size = [np.average(sizes) for sizes in data_arrays['size'][gamma_like_events]]
me_size = [np.median(sizes) for sizes in data_arrays['size'][gamma_like_events]]
std_size = [np.std(sizes) for sizes in data_arrays['size'][gamma_like_events]]

av_dist = [np.average(dists) for dists in dist]
me_dist = [np.median(dists) for dists in dist]
std_dist = [np.std(dists) for dists in dist]

av_fui = [np.average(fui) for fui in data_arrays['fui'][gamma_like_events]]
me_fui = [np.median(fui) for fui in data_arrays['fui'][gamma_like_events]]
std_fui = [np.std(fui) for fui in data_arrays['fui'][gamma_like_events]]

av_cross = [np.average(cross) for cross in data_arrays['cross'][gamma_like_events]]
av_crossO = [np.average(crossO) for crossO in data_arrays['crossO'][gamma_like_events]]
me_cross = [np.median(cross) for cross in data_arrays['cross'][gamma_like_events]]
std_cross = [np.std(cross) for cross in data_arrays['cross'][gamma_like_events]]

av_R = [np.average(R) for R in data_arrays['R'][gamma_like_events]]
me_R = [np.median(R) for R in data_arrays['R'][gamma_like_events]]
std_R = [np.std(R) for R in data_arrays['R'][gamma_like_events]]

av_ES = [np.average(ES) for ES in data_arrays['ES'][gamma_like_events]]
MWR = data_arrays['MWR'][gamma_like_events]
MLR = data_arrays['MLR'][gamma_like_events]
me_ES = [np.median(ES) for ES in data_arrays['ES'][gamma_like_events]]
std_ES = [np.std(ES) for ES in data_arrays['ES'][gamma_like_events]]

av_asym = [np.average(asym) for asym in data_arrays['asym'][gamma_like_events]]
me_asym = [np.median(asym) for asym in data_arrays['asym'][gamma_like_events]]
std_asym = [np.std(asym) for asym in data_arrays['asym'][gamma_like_events]]

av_tgrad_x = [np.average(tgrad_x) for tgrad_x in data_arrays['tgrad_x'][gamma_like_events]]
me_tgrad_x = [np.median(tgrad_x) for tgrad_x in data_arrays['tgrad_x'][gamma_like_events]]
std_tgrad_x = [np.std(tgrad_x) for tgrad_x in data_arrays['tgrad_x'][gamma_like_events]]

av_width = [np.average(width) for width in data_arrays['width'][gamma_like_events]]
me_width = [np.median(width) for width in data_arrays['width'][gamma_like_events]]
std_width = [np.std(width) for width in data_arrays['width'][gamma_like_events]]

av_length = [np.average(length) for length in data_arrays['length'][gamma_like_events]]
me_length = [np.median(length) for length in data_arrays['length'][gamma_like_events]]
std_length = [np.std(length) for length in data_arrays['length'][gamma_like_events]]

av_dispCombine = [
np.average(disp_error_weight(disp_error))
for disp_error in data_arrays['DispWoff_T'][gamma_like_events]
]
me_dispCombine = [
np.median(disp_error_weight(disp_error))
for disp_error in data_arrays['DispWoff_T'][gamma_like_events]
]
std_dispCombine = [
np.std(disp_error_weight(disp_error))
for disp_error in data_arrays['DispWoff_T'][gamma_like_events]
]

data_dict['log_ang_diff'].extend(tuple(np.log10(ang_diff.value)))
data_dict['log_av_size'].extend(tuple(np.log10(av_size)))
data_dict['log_true_energy'].extend(tuple(np.log10(true_energy)))
data_dict['log_reco_energy'].extend(tuple(np.log10(reco_energy)))
data_dict['log_NTels_reco'].extend(tuple(np.log10(NTels_reco)))
data_dict['array_distance'].extend(tuple(array_distance))
Expand All @@ -331,23 +402,56 @@ def extract_df_from_dl2(root_filename):
data_dict['MSCL'].extend(tuple(MSCL))
data_dict['log_EmissionHeight'].extend(tuple(np.log10(EmissionHeight)))
data_dict['log_EmissionHeightChi2'].extend(tuple(np.log10(EmissionHeightChi2)))
data_dict['av_dist'].extend(tuple(av_dist))
data_dict['log_DispDiff'].extend(tuple(np.log10(DispDiff)))
data_dict['log_dESabs'].extend(tuple(np.log10(dESabs)))
data_dict['loss_sum'].extend(tuple(loss_sum))
data_dict['NTrig'].extend(tuple(NTrig))
data_dict['meanPedvar_Image'].extend(tuple(meanPedvar_Image))
data_dict['MSWOL'].extend(tuple(MSCW/MSCL))

data_dict['log_av_size'].extend(tuple(np.log10(av_size)))
data_dict['log_me_size'].extend(tuple(np.log10(me_size)))
data_dict['log_std_size'].extend(tuple(np.log10(std_size)))

data_dict['av_dist'].extend(tuple(av_dist))
data_dict['me_dist'].extend(tuple(me_dist))
data_dict['std_dist'].extend(tuple(std_dist))

data_dict['av_fui'].extend(tuple(av_fui))
data_dict['me_fui'].extend(tuple(me_fui))
data_dict['std_fui'].extend(tuple(std_fui))

data_dict['av_cross'].extend(tuple(av_cross))
data_dict['av_crossO'].extend(tuple(av_crossO))
data_dict['me_cross'].extend(tuple(me_cross))
data_dict['std_cross'].extend(tuple(std_cross))

data_dict['av_R'].extend(tuple(av_R))
data_dict['me_R'].extend(tuple(me_R))
data_dict['std_R'].extend(tuple(std_R))

data_dict['av_ES'].extend(tuple(av_ES))
data_dict['MWR'].extend(tuple(MWR))
data_dict['MLR'].extend(tuple(MLR))
data_dict['MSWOL'].extend(tuple(MSCW/MSCL))
data_dict['MWOL'].extend(tuple(MWR/MLR))
data_dict['me_ES'].extend(tuple(me_ES))
data_dict['std_ES'].extend(tuple(std_ES))

data_dict['av_asym'].extend(tuple(av_asym))
data_dict['me_asym'].extend(tuple(me_asym))
data_dict['std_asym'].extend(tuple(std_asym))

data_dict['av_tgrad_x'].extend(tuple(av_tgrad_x))
data_dict['me_tgrad_x'].extend(tuple(me_tgrad_x))
data_dict['std_tgrad_x'].extend(tuple(std_tgrad_x))

data_dict['av_width'].extend(tuple(av_width))
data_dict['me_width'].extend(tuple(me_width))
data_dict['std_width'].extend(tuple(std_width))

data_dict['av_length'].extend(tuple(av_length))
data_dict['me_length'].extend(tuple(me_length))
data_dict['std_length'].extend(tuple(std_length))

data_dict['av_dispCombine'].extend(tuple(av_dispCombine))
data_dict['me_dispCombine'].extend(tuple(me_dispCombine))
data_dict['std_dispCombine'].extend(tuple(std_dispCombine))

return pd.DataFrame(data=data_dict)

Expand Down Expand Up @@ -895,6 +999,27 @@ def save_test_dtf(dtf_e_test, suffix='default'):
return


def save_scores(scores):
    '''
    Save the scores of the trained models to disk.
    Each model's scores are written to scores/'model name'.joblib.

    Parameters
    ----------
    scores: dict
        A dict of scores per energy range per trained sklearn model,
        keys=model names, values=list of scores.
    '''

    scores_dir = Path('scores')
    # mkdir returns None, so do not assign its result; the call is
    # idempotent thanks to exist_ok=True.
    scores_dir.mkdir(parents=True, exist_ok=True)
    for model_name, these_scores in scores.items():
        file_name = scores_dir.joinpath('{}.joblib'.format(model_name))
        # compress=3 keeps the saved files small at a modest CPU cost.
        dump(these_scores, file_name, compress=3)

    return


def load_test_dtf(suffix='default'):
'''
Load the test data together with load_models().
Expand Down Expand Up @@ -1441,14 +1566,14 @@ def plot_score_comparison(dtf_e_test, trained_models):

fig, ax = plt.subplots(figsize=(8, 6))

scores = defaultdict(list)
rms_scores = defaultdict(list)
scores = defaultdict(dict)
energy_bins = extract_energy_bins(trained_models[next(iter(trained_models))].keys())

for this_model_name, trained_model in trained_models.items():

print('Calculating scores for {}'.format(this_model_name))

scores_this_model = list()
for this_e_range, this_model in trained_model.items():

# To keep lines short
Expand All @@ -1459,18 +1584,23 @@ def plot_score_comparison(dtf_e_test, trained_models):

y_pred = this_model['model'].predict(X_test)

scores[this_model_name].append(this_model['model'].score(X_test, y_test))
# rms_scores[this_model_name].append(metrics.mean_squared_error(y_test, y_pred))
scores_this_model.append(this_model['model'].score(X_test, y_test))

ax.plot(energy_bins, scores[this_model_name], label=this_model_name)
scores[this_model_name]['scores'] = scores_this_model
scores[this_model_name]['energy'] = energy_bins
ax.plot(
scores[this_model_name]['energy'],
scores[this_model_name]['scores'],
label=this_model_name
)

ax.set_xlabel('E [TeV]')
ax.set_ylabel('score')
ax.set_xscale('log')
ax.legend()
plt.tight_layout()

return plt
return plt, scores


def plot_confusion_matrix(event_types, trained_model_name, n_types=2):
Expand Down
48 changes: 26 additions & 22 deletions scripts/compare_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,10 @@

labels, train_features = event_types.nominal_labels_train_features()

plot_predict_dist = True
plot_predict_dist = False
plot_scores = True
plot_confusion_matrix = True
plot_confusion_matrix = False
plot_1d_conf_matrix = False
n_types = 3
type_bins = list(np.linspace(0, 1, n_types + 1))
# type_bins = [0, 0.2, 0.8, 1]
Expand Down Expand Up @@ -50,11 +51,11 @@
# models_to_compare = ['All', 'no_asym', 'no_tgrad_x', 'no_asym_tgrad_x']
# models_to_compare = ['All']
models_to_compare = [
'test_size_55p',
'test_size_65p',
'test_size_75p',
'test_size_85p',
'test_size_95p',
'all_features',
'no_width',
'no_length',
'no_dispCombine',
'old_features',
]
if len(models_to_compare) > 1:
group_models_to_compare = np.array_split(
Expand Down Expand Up @@ -85,10 +86,11 @@
plt.clf()

if plot_scores:
plt = event_types.plot_score_comparison(dtf_e_test, trained_models)
plt, scores = event_types.plot_score_comparison(dtf_e_test, trained_models)
plt.savefig('plots/scores_features_{}.pdf'.format(i_group + 1))
plt.savefig('plots/scores_features_{}.png'.format(i_group + 1))
plt.clf()
event_types.save_scores(scores)

if plot_confusion_matrix:

Expand All @@ -114,19 +116,21 @@
n_types
))

plt = event_types.plot_1d_confusion_matrix(
this_event_types,
this_trained_model_name,
n_types
)

plt.savefig('plots/{}_1d_confusion_matrix_n_types_{}.pdf'.format(
this_trained_model_name,
n_types
))
plt.savefig('plots/{}_1d_confusion_matrix_n_types_{}.png'.format(
this_trained_model_name,
n_types
))
if plot_1d_conf_matrix:

plt = event_types.plot_1d_confusion_matrix(
this_event_types,
this_trained_model_name,
n_types
)

plt.savefig('plots/{}_1d_confusion_matrix_n_types_{}.pdf'.format(
this_trained_model_name,
n_types
))
plt.savefig('plots/{}_1d_confusion_matrix_n_types_{}.png'.format(
this_trained_model_name,
n_types
))

plt.clf()
Loading