From 5314a07bf336c4194716527aae6b621de7596ad0 Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 10:37:01 +0200 Subject: [PATCH 01/66] up --- scripts/clic/postprocessing.py | 150 ++++++++++++++++++++++++++++----- 1 file changed, 129 insertions(+), 21 deletions(-) diff --git a/scripts/clic/postprocessing.py b/scripts/clic/postprocessing.py index 54d2a857b..4d32f210b 100644 --- a/scripts/clic/postprocessing.py +++ b/scripts/clic/postprocessing.py @@ -1,20 +1,21 @@ import os -# to prevent https://stackoverflow.com/questions/52026652/openblas-blas-thread-init-pthread-create-resource-temporarily-unavailable +# noqa: to prevent https://stackoverflow.com/questions/52026652/openblas-blas-thread-init-pthread-create-resource-temporarily-unavailable os.environ["OMP_NUM_THREADS"] = "1" os.environ["OPENBLAS_NUM_THREADS"] = "1" os.environ["MKL_NUM_THREADS"] = "1" os.environ["VECLIB_MAXIMUM_THREADS"] = "1" os.environ["NUMEXPR_NUM_THREADS"] = "1" -import numpy as np +import bz2 + import awkward +import fastjet +import numpy as np +import pyhepmc +import tqdm import uproot import vector -import tqdm -import pyhepmc -import bz2 -import fastjet from scipy.sparse import coo_matrix track_coll = "SiTracks_Refitted" @@ -61,6 +62,16 @@ "sigma_x", "sigma_y", "sigma_z", + # added by farouk + "energyError", + "sigma_energy", + "sigma_x_weighted", + "sigma_y_weighted", + "sigma_z_weighted", + "energy_weighted_width", + "pos_shower_max", + "width_shower_max", + "energy_shower_max", ] hit_feature_order = [ "elemtype", @@ -137,7 +148,9 @@ def __init__( self.cluster_features = cluster_features # feature matrix of the calo clusters self.track_features = track_features # feature matrix of the tracks self.genparticle_to_hit = genparticle_to_hit # sparse COO matrix of genparticles to hits (idx_gp, idx_hit, weight) - self.genparticle_to_track = genparticle_to_track # sparse COO matrix of genparticles to tracks (idx_gp, idx_track, weight) + self.genparticle_to_track = ( + genparticle_to_track # sparse COO matrix of genparticles to tracks (idx_gp, idx_track, weight) + ) self.hit_to_cluster = hit_to_cluster # sparse COO matrix of hits to clusters (idx_hit, idx_cluster, weight) self.gp_merges = gp_merges # sparse COO matrix of any merged genparticles @@ -203,7 +216,10 @@ def get_calohit_matrix_and_genadj(hit_data, calohit_links, iev, collectionIDs): hit_idx_global += 1 hit_idx_local_to_global = {v: k for k, v in hit_idx_global_to_local.items()} hit_feature_matrix = awkward.Record( - {k: awkward.concatenate([hit_feature_matrix[i][k] for i in range(len(hit_feature_matrix))]) for k in hit_feature_matrix[0].fields} + { + k: awkward.concatenate([hit_feature_matrix[i][k] for i in range(len(hit_feature_matrix))]) + for k in hit_feature_matrix[0].fields + } ) # add all edges from genparticle to calohit @@ -269,7 +285,9 @@ def gen_to_features(prop_data, iev): gen_arr = {k.replace(mc_coll + ".", ""): gen_arr[k] for k in gen_arr.fields} MCParticles_p4 = vector.awk( - awkward.zip({"mass": gen_arr["mass"], "x": gen_arr["momentum.x"], "y": gen_arr["momentum.y"], "z": gen_arr["momentum.z"]}) + awkward.zip( + {"mass": gen_arr["mass"], "x": gen_arr["momentum.x"], "y": gen_arr["momentum.y"], "z": gen_arr["momentum.z"]} + ) ) gen_arr["pt"] = MCParticles_p4.pt gen_arr["eta"] = MCParticles_p4.eta @@ -311,7 +329,7 @@ def genparticle_track_adj(sitrack_links, iev): def cluster_to_features(prop_data, hit_features, hit_to_cluster, iev): cluster_arr = prop_data["PandoraClusters"][iev] - feats = ["type", "position.x", "position.y", "position.z", 
"iTheta", "phi", "energy"] + feats = ["type", "position.x", "position.y", "position.z", "iTheta", "phi", "energy", "energyError"] ret = {feat: cluster_arr["PandoraClusters." + feat] for feat in feats} hit_idx = np.array(hit_to_cluster[0]) @@ -324,8 +342,16 @@ def cluster_to_features(prop_data, hit_features, hit_to_cluster, iev): cl_sigma_y = [] cl_sigma_z = [] + # added by farouk + cl_sigma_energy = [] + cl_sigma_x_weighted, cl_sigma_y_weighted, cl_sigma_z_weighted = [], [], [] + cl_energy_weighted_width = [] + cl_pos_shower_max, cl_energy_shower_max, cl_width_shower_max = [], [], [] + n_cl = len(ret["energy"]) - for cl in range(n_cl): + + # xs, ys, zs, es = [], [], [], [] + for i, cl in enumerate(range(n_cl)): msk_cl = cluster_idx == cl hits = hit_idx[msk_cl] @@ -351,6 +377,57 @@ def cluster_to_features(prop_data, hit_features, hit_to_cluster, iev): cl_sigma_y.append(np.std(hits_posy)) cl_sigma_z.append(np.std(hits_posz)) + # added by farouk + cl_sigma_energy.append(np.std(hits_energy)) + cl_sigma_x_weighted.append(np.std(hits_posx * hits_energy)) + cl_sigma_y_weighted.append(np.std(hits_posy * hits_energy)) + cl_sigma_z_weighted.append(np.std(hits_posz * hits_energy)) + + # z_bar = np.sum(hits_posz * hits_energy) / np.sum(hits_energy) # energy weighted average + x_bar = np.sum(hits_posx * hits_energy) / np.sum(hits_energy) # energy weighted average + y_bar = np.sum(hits_posy * hits_energy) / np.sum(hits_energy) # energy weighted average + + num = (np.sum(hits_energy * (hits_posx - x_bar) ** 2)) + (np.sum(hits_energy * (hits_posy - y_bar) ** 2)) + den = np.sum(hits_energy) + + cl_energy_weighted_width.append(num / den) + + # if i==1: + # xs += [np.array(hits_posx)] + # ys += [np.array(hits_posy)] + # zs += [np.array(hits_posz)] + # es += [np.array(hits_energy)] + + # get position at shower max + # for each unique z integrate the energy of all the hits to find zmax + zmax, emax = 0, -1000 + for z in np.unique(np.array(hits_posz)): + msk = np.array(hits_posz) == z + ez = np.sum(np.array(hits_energy)[msk]) + + if ez > emax: + zmax, emax = z, ez + + cl_pos_shower_max.append(zmax) + cl_energy_shower_max.append(emax) + + # get width at shower max + msk = np.array(hits_posz) == zmax # select the hits at zmax + + x_bar = np.sum(np.array(hits_posx)[msk] * np.array(hits_energy)[msk]) / np.sum( + np.array(hits_energy)[msk] + ) # energy weighted average + y_bar = np.sum(np.array(hits_posy)[msk] * np.array(hits_energy)[msk]) / np.sum( + np.array(hits_energy)[msk] + ) # energy weighted average + + num = (np.sum(np.array(hits_energy)[msk] * (np.array(hits_posx)[msk] - x_bar) ** 2)) + ( + np.sum(np.array(hits_energy)[msk] * (np.array(hits_posy)[msk] - y_bar) ** 2) + ) + den = np.sum(np.array(hits_energy)[msk]) + + cl_width_shower_max.append(num / den) + ret["energy_ecal"] = np.array(cl_energy_ecal) ret["energy_hcal"] = np.array(cl_energy_hcal) ret["energy_other"] = np.array(cl_energy_other) @@ -374,6 +451,17 @@ def cluster_to_features(prop_data, hit_features, hit_to_cluster, iev): ret["sin_phi"] = np.sin(ret["phi"]) ret["cos_phi"] = np.cos(ret["phi"]) + # added by farouk + ret["sigma_energy"] = np.array(cl_sigma_energy) + ret["sigma_x_weighted"] = np.array(cl_sigma_x_weighted) + ret["sigma_y_weighted"] = np.array(cl_sigma_y_weighted) + ret["sigma_z_weighted"] = np.array(cl_sigma_z_weighted) + ret["energy_weighted_width"] = np.array(cl_energy_weighted_width) + + ret["pos_shower_max"] = np.array(cl_pos_shower_max) + ret["energy_shower_max"] = np.array(cl_energy_shower_max) + ret["width_shower_max"] = 
np.array(cl_width_shower_max) + return awkward.Record(ret) @@ -425,7 +513,9 @@ def filter_adj(adj, all_to_filtered): def get_genparticles_and_adjacencies(prop_data, hit_data, calohit_links, sitrack_links, iev, collectionIDs): gen_features = gen_to_features(prop_data, iev) - hit_features, genparticle_to_hit, hit_idx_local_to_global = get_calohit_matrix_and_genadj(hit_data, calohit_links, iev, collectionIDs) + hit_features, genparticle_to_hit, hit_idx_local_to_global = get_calohit_matrix_and_genadj( + hit_data, calohit_links, iev, collectionIDs + ) hit_to_cluster = hit_cluster_adj(prop_data, hit_idx_local_to_global, iev) cluster_features = cluster_to_features(prop_data, hit_features, hit_to_cluster, iev) track_features = track_to_features(prop_data, iev) @@ -438,7 +528,9 @@ def get_genparticles_and_adjacencies(prop_data, hit_data, calohit_links, sitrack if len(genparticle_to_track[0]) > 0: gp_to_track = ( - coo_matrix((genparticle_to_track[2], (genparticle_to_track[0], genparticle_to_track[1])), shape=(n_gp, n_track)).max(axis=1).todense() + coo_matrix((genparticle_to_track[2], (genparticle_to_track[0], genparticle_to_track[1])), shape=(n_gp, n_track)) + .max(axis=1) + .todense() ) else: gp_to_track = np.zeros((n_gp, 1)) @@ -491,8 +583,12 @@ def assign_genparticles_to_obj_and_merge(gpdata): ).todense() ) - gp_to_calohit = coo_matrix((gpdata.genparticle_to_hit[2], (gpdata.genparticle_to_hit[0], gpdata.genparticle_to_hit[1])), shape=(n_gp, n_hit)) - calohit_to_cluster = coo_matrix((gpdata.hit_to_cluster[2], (gpdata.hit_to_cluster[0], gpdata.hit_to_cluster[1])), shape=(n_hit, n_cluster)) + gp_to_calohit = coo_matrix( + (gpdata.genparticle_to_hit[2], (gpdata.genparticle_to_hit[0], gpdata.genparticle_to_hit[1])), shape=(n_gp, n_hit) + ) + calohit_to_cluster = coo_matrix( + (gpdata.hit_to_cluster[2], (gpdata.hit_to_cluster[0], gpdata.hit_to_cluster[1])), shape=(n_hit, n_cluster) + ) gp_to_cluster = np.array((gp_to_calohit * calohit_to_cluster).todense()) @@ -657,7 +753,9 @@ def get_reco_properties(prop_data, iev): reco_arr = {k.replace("MergedRecoParticles.", ""): reco_arr[k] for k in reco_arr.fields} reco_p4 = vector.awk( - awkward.zip({"mass": reco_arr["mass"], "x": reco_arr["momentum.x"], "y": reco_arr["momentum.y"], "z": reco_arr["momentum.z"]}) + awkward.zip( + {"mass": reco_arr["mass"], "x": reco_arr["momentum.x"], "y": reco_arr["momentum.y"], "z": reco_arr["momentum.z"]} + ) ) reco_arr["pt"] = reco_p4.pt reco_arr["eta"] = reco_p4.eta @@ -879,19 +977,29 @@ def process_one_file(fn, ofn): assert np.all(used_rps == 1) gps_track = get_particle_feature_matrix(track_to_gp_all, gpdata_cleaned.gen_features, particle_feature_order) - gps_track[:, 0] = np.array([map_neutral_to_charged(map_pdgid_to_candid(p, c)) for p, c in zip(gps_track[:, 0], gps_track[:, 1])]) + gps_track[:, 0] = np.array( + [map_neutral_to_charged(map_pdgid_to_candid(p, c)) for p, c in zip(gps_track[:, 0], gps_track[:, 1])] + ) gps_cluster = get_particle_feature_matrix(cluster_to_gp_all, gpdata_cleaned.gen_features, particle_feature_order) - gps_cluster[:, 0] = np.array([map_charged_to_neutral(map_pdgid_to_candid(p, c)) for p, c in zip(gps_cluster[:, 0], gps_cluster[:, 1])]) + gps_cluster[:, 0] = np.array( + [map_charged_to_neutral(map_pdgid_to_candid(p, c)) for p, c in zip(gps_cluster[:, 0], gps_cluster[:, 1])] + ) gps_cluster[:, 1] = 0 rps_track = get_particle_feature_matrix(track_to_rp_all, reco_features, particle_feature_order) - rps_track[:, 0] = np.array([map_neutral_to_charged(map_pdgid_to_candid(p, c)) for p, c in 
zip(rps_track[:, 0], rps_track[:, 1])]) + rps_track[:, 0] = np.array( + [map_neutral_to_charged(map_pdgid_to_candid(p, c)) for p, c in zip(rps_track[:, 0], rps_track[:, 1])] + ) rps_cluster = get_particle_feature_matrix(cluster_to_rp_all, reco_features, particle_feature_order) - rps_cluster[:, 0] = np.array([map_charged_to_neutral(map_pdgid_to_candid(p, c)) for p, c in zip(rps_cluster[:, 0], rps_cluster[:, 1])]) + rps_cluster[:, 0] = np.array( + [map_charged_to_neutral(map_pdgid_to_candid(p, c)) for p, c in zip(rps_cluster[:, 0], rps_cluster[:, 1])] + ) rps_cluster[:, 1] = 0 # all initial gen/reco particle energy must be reconstructable - assert abs(np.sum(gps_track[:, 6]) + np.sum(gps_cluster[:, 6]) - np.sum(gpdata_cleaned.gen_features["energy"])) < 1e-2 + assert ( + abs(np.sum(gps_track[:, 6]) + np.sum(gps_cluster[:, 6]) - np.sum(gpdata_cleaned.gen_features["energy"])) < 1e-2 + ) assert abs(np.sum(rps_track[:, 6]) + np.sum(rps_cluster[:, 6]) - np.sum(reco_features["energy"])) < 1e-2 From e0ea0791029c5e514ce80bd7fb0ab85e947ed8e0 Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 11:48:53 +0200 Subject: [PATCH 02/66] up utils.edm.py with updated feature list --- mlpf/heptfds/clic_pf_edm4hep/utils_edm.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py b/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py index b0f152d9c..c63d74994 100644 --- a/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py +++ b/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py @@ -1,6 +1,7 @@ +import random + import awkward as ak import numpy as np -import random # from fcc/postprocessing.py X_FEATURES_TRK = [ @@ -39,6 +40,16 @@ "sigma_x", "sigma_y", "sigma_z", + # added by farouk + "energyError", + "sigma_energy", + "sigma_x_weighted", + "sigma_y_weighted", + "sigma_z_weighted", + "energy_weighted_width", + "pos_shower_max", + "width_shower_max", + "energy_shower_max", ] Y_FEATURES = ["PDG", "charge", "pt", "eta", "sin_phi", "cos_phi", "energy", "ispu"] From 5096060d301b0a15c323c6c4704d3eefc4ffb4a7 Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 12:15:35 +0200 Subject: [PATCH 03/66] tag 2.2.0 --- mlpf/heptfds/clic_pf_edm4hep/ttbar.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlpf/heptfds/clic_pf_edm4hep/ttbar.py b/mlpf/heptfds/clic_pf_edm4hep/ttbar.py index 9a01aa81f..124d2c9a8 100644 --- a/mlpf/heptfds/clic_pf_edm4hep/ttbar.py +++ b/mlpf/heptfds/clic_pf_edm4hep/ttbar.py @@ -1,6 +1,7 @@ from pathlib import Path import tensorflow as tf +import tensorflow_datasets as tfds from utils_edm import ( X_FEATURES_CL, X_FEATURES_TRK, @@ -9,8 +10,6 @@ split_sample, ) -import tensorflow_datasets as tfds - _DESCRIPTION = """ CLIC EDM4HEP dataset with ee -> ttbar at 380GeV. - X: reconstructed tracks and clusters, variable number N per event @@ -36,6 +35,7 @@ class ClicEdmTtbarPf(tfds.core.GeneratorBasedBuilder): "1.5.0": "Regenerate with ARRAY_RECORD", "2.0.0": "Add ispu, genjets, genmet; disable genjet_idx; truth def not based on gp.status==1", "2.1.0": "Bump dataset size", + "2.2.0": "Additional cluster input features", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ For the raw input files in ROOT EDM4HEP format, please see the citation above. 
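A short, self-contained restatement of the cluster-shape features that PATCH 01 adds in cluster_to_features (energy_weighted_width, pos_shower_max, energy_shower_max, width_shower_max), since their definitions are hard to read inside the diff. This is an illustrative sketch only: the per-cluster hit arrays x, y, z, e and the function name are assumptions made for the example, not names from the repository, and the "widths" are energy-weighted second moments (no square root), matching what the patch computes.

import numpy as np

def cluster_shape_features(x, y, z, e):
    """x, y, z: hit positions of one cluster; e: the corresponding hit energies."""
    etot = np.sum(e)
    # energy-weighted transverse width of the whole cluster
    x_bar = np.sum(x * e) / etot
    y_bar = np.sum(y * e) / etot
    energy_weighted_width = (np.sum(e * (x - x_bar) ** 2) + np.sum(e * (y - y_bar) ** 2)) / etot
    # shower maximum: the z layer that collects the most energy
    z_layers = np.unique(z)
    e_per_layer = np.array([np.sum(e[z == zi]) for zi in z_layers])
    imax = np.argmax(e_per_layer)
    pos_shower_max, energy_shower_max = z_layers[imax], e_per_layer[imax]
    # energy-weighted transverse width restricted to the hits at the shower maximum
    m = z == pos_shower_max
    xb = np.sum(x[m] * e[m]) / np.sum(e[m])
    yb = np.sum(y[m] * e[m]) / np.sum(e[m])
    width_shower_max = (np.sum(e[m] * (x[m] - xb) ** 2) + np.sum(e[m] * (y[m] - yb) ** 2)) / np.sum(e[m])
    return energy_weighted_width, pos_shower_max, energy_shower_max, width_shower_max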
From dd6bad75a2b7a168e35cc97b6ae0967fe138e5ff Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 12:16:13 +0200 Subject: [PATCH 04/66] up --- mlpf/heptfds/clic_pf_edm4hep/ttbar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlpf/heptfds/clic_pf_edm4hep/ttbar.py b/mlpf/heptfds/clic_pf_edm4hep/ttbar.py index 124d2c9a8..74837eb43 100644 --- a/mlpf/heptfds/clic_pf_edm4hep/ttbar.py +++ b/mlpf/heptfds/clic_pf_edm4hep/ttbar.py @@ -25,7 +25,7 @@ class ClicEdmTtbarPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("2.1.0") + VERSION = tfds.core.Version("2.2.0") RELEASE_NOTES = { "1.0.0": "Initial release.", "1.1.0": "update stats, move to 380 GeV", From 7e1ac711da50c0db0c911e52e19735c4cc2b34fd Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 12:24:05 +0200 Subject: [PATCH 05/66] process whole dir --- scripts/clic/postprocessing.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/scripts/clic/postprocessing.py b/scripts/clic/postprocessing.py index 4d32f210b..000812773 100644 --- a/scripts/clic/postprocessing.py +++ b/scripts/clic/postprocessing.py @@ -1044,16 +1044,27 @@ def parse_args(): import argparse parser = argparse.ArgumentParser() - parser.add_argument("--input", type=str, help="Input file ROOT file", required=True) + parser.add_argument( + "--input", type=str, help="Input ROOT file - else if dir then will process all files inside", required=True + ) parser.add_argument("--outpath", type=str, default="raw", help="output path") args = parser.parse_args() return args def process(args): - infile = args.input - outfile = os.path.join(args.outpath, os.path.basename(infile).split(".")[0] + ".parquet") - process_one_file(infile, outfile) + + if os.path.isdir(args.input) is True: + import glob + + flist = glob.glob(args.input) + for infile in flist: + outfile = os.path.join(args.outpath, os.path.basename(infile).split(".")[0] + ".parquet") + process_one_file(infile, outfile) + else: + infile = args.input + outfile = os.path.join(args.outpath, os.path.basename(infile).split(".")[0] + ".parquet") + process_one_file(infile, outfile) if __name__ == "__main__": From c7d8a088fad623dfbd1862cff9de6759b86c487d Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 12:25:10 +0200 Subject: [PATCH 06/66] debug --- scripts/clic/postprocessing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/clic/postprocessing.py b/scripts/clic/postprocessing.py index 000812773..98b09da32 100644 --- a/scripts/clic/postprocessing.py +++ b/scripts/clic/postprocessing.py @@ -1035,7 +1035,8 @@ def process_one_file(fn, ofn): } ) ret.append(this_ev) - + if iev == 3: + break ret = awkward.Record({k: awkward.from_iter([r[k] for r in ret]) for k in ret[0].fields}) awkward.to_parquet(ret, ofn) From d20de3a163236c4fd24730085b0fbb8074c8c3a4 Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 12:27:27 +0200 Subject: [PATCH 07/66] up --- scripts/clic/postprocessing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/clic/postprocessing.py b/scripts/clic/postprocessing.py index 98b09da32..0634cc385 100644 --- a/scripts/clic/postprocessing.py +++ b/scripts/clic/postprocessing.py @@ -1056,6 +1056,7 @@ def parse_args(): def process(args): if os.path.isdir(args.input) is True: + print("yes") import glob flist = glob.glob(args.input) From a20d8e87030407230cb4009dfcb344c45d1ed3cf Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 12:27:55 +0200 Subject: [PATCH 08/66] up --- 
scripts/clic/postprocessing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/clic/postprocessing.py b/scripts/clic/postprocessing.py index 0634cc385..45ad73bbe 100644 --- a/scripts/clic/postprocessing.py +++ b/scripts/clic/postprocessing.py @@ -1060,6 +1060,7 @@ def process(args): import glob flist = glob.glob(args.input) + print("flist", flist) for infile in flist: outfile = os.path.join(args.outpath, os.path.basename(infile).split(".")[0] + ".parquet") process_one_file(infile, outfile) From 78fbf1ab1d7af84228cab6c9e3e0831b66e9ce33 Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 12:29:19 +0200 Subject: [PATCH 09/66] up --- scripts/clic/postprocessing.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/clic/postprocessing.py b/scripts/clic/postprocessing.py index 45ad73bbe..49b95c104 100644 --- a/scripts/clic/postprocessing.py +++ b/scripts/clic/postprocessing.py @@ -1056,11 +1056,9 @@ def parse_args(): def process(args): if os.path.isdir(args.input) is True: - print("yes") import glob - flist = glob.glob(args.input) - print("flist", flist) + flist = glob.glob(args.input + "/*.root") for infile in flist: outfile = os.path.join(args.outpath, os.path.basename(infile).split(".")[0] + ".parquet") process_one_file(infile, outfile) From 1e2f3d1b6a8a943f005fb441b080e2afabd3b5f8 Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 12:29:53 +0200 Subject: [PATCH 10/66] up --- scripts/clic/postprocessing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/clic/postprocessing.py b/scripts/clic/postprocessing.py index 49b95c104..e930ed649 100644 --- a/scripts/clic/postprocessing.py +++ b/scripts/clic/postprocessing.py @@ -1,3 +1,4 @@ +import glob import os # noqa: to prevent https://stackoverflow.com/questions/52026652/openblas-blas-thread-init-pthread-create-resource-temporarily-unavailable @@ -1056,7 +1057,7 @@ def parse_args(): def process(args): if os.path.isdir(args.input) is True: - import glob + print("Will process all files in " + args.input) flist = glob.glob(args.input + "/*.root") for infile in flist: From f2e4181f0b823f64340ce56386da47ed7073af4c Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 12:32:27 +0200 Subject: [PATCH 11/66] remove break --- scripts/clic/postprocessing.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/clic/postprocessing.py b/scripts/clic/postprocessing.py index e930ed649..43c7b06b1 100644 --- a/scripts/clic/postprocessing.py +++ b/scripts/clic/postprocessing.py @@ -1036,8 +1036,7 @@ def process_one_file(fn, ofn): } ) ret.append(this_ev) - if iev == 3: - break + ret = awkward.Record({k: awkward.from_iter([r[k] for r in ret]) for k in ret[0].fields}) awkward.to_parquet(ret, ofn) From df08d50eda05d43aba756c26d74716b4ee01f299 Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 15:49:47 +0200 Subject: [PATCH 12/66] up --- parameters/pytorch/pyg-clic-ttbar.yaml | 121 +++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 parameters/pytorch/pyg-clic-ttbar.yaml diff --git a/parameters/pytorch/pyg-clic-ttbar.yaml b/parameters/pytorch/pyg-clic-ttbar.yaml new file mode 100644 index 000000000..2525286ec --- /dev/null +++ b/parameters/pytorch/pyg-clic-ttbar.yaml @@ -0,0 +1,121 @@ +backend: pytorch + +dataset: clic +sort_data: no +data_dir: +gpus: 1 +gpu_batch_multiplier: 1 +load: +num_epochs: 100 +patience: 20 +lr: 0.0001 +lr_schedule: cosinedecay # constant, cosinedecay, onecycle +conv_type: attention # gnn_lsh, 
attention, mamba, flashattention +ntrain: +ntest: +nvalid: +num_workers: 0 +prefetch_factor: +checkpoint_freq: +comet_name: particleflow-pt +comet_offline: False +comet_step_freq: 100 +dtype: float32 +val_freq: # run an extra validation run every val_freq training steps + +model: + trainable: all + learned_representation_mode: last #last, concat + input_encoding: joint #split, joint + pt_mode: linear + eta_mode: linear + sin_phi_mode: linear + cos_phi_mode: linear + energy_mode: linear + + gnn_lsh: + conv_type: gnn_lsh + embedding_dim: 512 + width: 512 + num_convs: 8 + activation: "elu" + # gnn-lsh specific parameters + bin_size: 32 + max_num_bins: 200 + distance_dim: 128 + layernorm: True + num_node_messages: 2 + ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 2 + + attention: + conv_type: attention + num_convs: 12 + dropout_ff: 0.1 + dropout_conv_id_mha: 0.0 + dropout_conv_id_ff: 0.0 + dropout_conv_reg_mha: 0.0 + dropout_conv_reg_ff: 0.0 + activation: "relu" + head_dim: 32 + num_heads: 32 + attention_type: math + use_pre_layernorm: True + + mamba: + conv_type: mamba + embedding_dim: 128 + width: 128 + num_convs: 2 + dropout: 0.0 + activation: "elu" + # transformer specific paramters + num_heads: 2 + # mamba specific paramters + d_state: 16 + d_conv: 4 + expand: 2 + +lr_schedule_config: + onecycle: + pct_start: 0.3 + +raytune: + local_dir: # Note: please specify an absolute path + sched: # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + +valid_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + +test_dataset: + clic_edm_ttbar_pf: + version: 2.1.0 \ No newline at end of file From e13f323a3f34f3f94918c60e02c69ba2e482fd31 Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 15:53:01 +0200 Subject: [PATCH 13/66] up logging --- mlpf/pyg/training.py | 140 +++++++++++++++++++++++++++++++++---------- 1 file changed, 109 insertions(+), 31 deletions(-) diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index 7c2a67e35..93218f098 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -1,19 +1,17 @@ +import csv +import json +import logging import os import os.path as osp import pickle as pkl +import shutil import time +from datetime import datetime from pathlib import Path from tempfile import TemporaryDirectory from typing import Optional -import logging -import shutil -from datetime import datetime -import tqdm -import yaml -import csv -import json -import sklearn -import sklearn.metrics + +import fastjet import numpy as np import pandas import matplotlib @@ -25,6 +23,25 @@ import torch import torch.distributed as dist import torch.multiprocessing as mp +import tqdm +import yaml +from pyg.inference import make_plots, run_predictions +from pyg.logger import _configLogger, _logger +from pyg.mlpf import MLPF +from pyg.PFDataset import Collater, PFDataset, get_interleaved_dataloaders +from pyg.utils import ( + CLASS_LABELS, + ELEM_TYPES_NONZERO, + X_FEATURES, + count_parameters, + get_lr_schedule, + get_model_state_dict, + load_checkpoint, + save_checkpoint, + save_HPs, + unpack_predictions, + unpack_target, +) from torch import 
Tensor, nn from torch.nn import functional as F from torch.profiler import ProfilerActivity, profile, record_function @@ -54,6 +71,10 @@ from pyg.PFDataset import Collater, PFDataset, get_interleaved_dataloaders from utils import create_comet_experiment +# comet needs to be imported before torch +from comet_ml import OfflineExperiment, Experiment # noqa: F401, isort:skip + + # Ignore divide by 0 errors np.seterr(divide="ignore", invalid="ignore") @@ -146,12 +167,12 @@ def mlpf_loss(y, ypred, batch): pred_met = torch.sqrt(torch.sum(pred_px, axis=-2) ** 2 + torch.sum(pred_py, axis=-2) ** 2) loss["MET"] = torch.nn.functional.huber_loss(pred_met.squeeze(dim=-1), batch.genmet).mean() - was_input_pred = torch.concat([torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1) * batch.mask.unsqueeze( - axis=-1 - ) - was_input_true = torch.concat([torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long)), y["momentum"]], axis=-1) * batch.mask.unsqueeze( - axis=-1 - ) + was_input_pred = torch.concat( + [torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1 + ) * batch.mask.unsqueeze(axis=-1) + was_input_true = torch.concat( + [torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long)), y["momentum"]], axis=-1 + ) * batch.mask.unsqueeze(axis=-1) # standardize Wasserstein loss std = was_input_true[batch.mask].std(axis=0) @@ -193,7 +214,9 @@ class FocalLoss(nn.Module): - y: (batch_size,) or (batch_size, d1, d2, ..., dK), K > 0. """ - def __init__(self, alpha: Optional[Tensor] = None, gamma: float = 0.0, reduction: str = "mean", ignore_index: int = -100): + def __init__( + self, alpha: Optional[Tensor] = None, gamma: float = 0.0, reduction: str = "mean", ignore_index: int = -100 + ): """Constructor. Args: alpha (Tensor, optional): Weights for each class. Defaults to None. 
@@ -457,7 +480,9 @@ def train_and_valid( if (world_size > 1) and (rank != 0): iterator = enumerate(data_loader) else: - iterator = tqdm.tqdm(enumerate(data_loader), total=len(data_loader), desc=f"Epoch {epoch} {train_or_valid} loop on rank={rank}") + iterator = tqdm.tqdm( + enumerate(data_loader), total=len(data_loader), desc=f"Epoch {epoch} {train_or_valid} loop on rank={rank}" + ) device_type = "cuda" if isinstance(rank, int) else "cpu" @@ -492,13 +517,19 @@ def train_and_valid( if not is_train: cm_X_gen += sklearn.metrics.confusion_matrix( - batch.X[:, :, 0][batch.mask].detach().cpu().numpy(), ygen["cls_id"][batch.mask].detach().cpu().numpy(), labels=range(13) + batch.X[:, :, 0][batch.mask].detach().cpu().numpy(), + ygen["cls_id"][batch.mask].detach().cpu().numpy(), + labels=range(13), ) cm_X_pred += sklearn.metrics.confusion_matrix( - batch.X[:, :, 0][batch.mask].detach().cpu().numpy(), ypred["cls_id"][batch.mask].detach().cpu().numpy(), labels=range(13) + batch.X[:, :, 0][batch.mask].detach().cpu().numpy(), + ypred["cls_id"][batch.mask].detach().cpu().numpy(), + labels=range(13), ) cm_id += sklearn.metrics.confusion_matrix( - ygen["cls_id"][batch.mask].detach().cpu().numpy(), ypred["cls_id"][batch.mask].detach().cpu().numpy(), labels=range(13) + ygen["cls_id"][batch.mask].detach().cpu().numpy(), + ypred["cls_id"][batch.mask].detach().cpu().numpy(), + labels=range(13), ) # save the events of the first validation batch for quick checks if (rank == 0 or rank == "cpu") and itrain == 0: @@ -604,10 +635,20 @@ def train_and_valid( if not is_train and comet_experiment: comet_experiment.log_confusion_matrix( - matrix=cm_X_gen, title="Element to target", row_label="X", column_label="target", epoch=epoch, file_name="cm_X_gen.json" + matrix=cm_X_gen, + title="Element to target", + row_label="X", + column_label="target", + epoch=epoch, + file_name="cm_X_gen.json", ) comet_experiment.log_confusion_matrix( - matrix=cm_X_pred, title="Element to pred", row_label="X", column_label="pred", epoch=epoch, file_name="cm_X_pred.json" + matrix=cm_X_pred, + title="Element to pred", + row_label="X", + column_label="pred", + epoch=epoch, + file_name="cm_X_pred.json", ) comet_experiment.log_confusion_matrix( matrix=cm_id, title="Target to pred", row_label="gen", column_label="pred", epoch=epoch, file_name="cm_id.json" @@ -698,7 +739,9 @@ def train_mlpf( # training step, edit here to profile a specific epoch if epoch == -1: - with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, with_stack=True) as prof: + with profile( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, with_stack=True + ) as prof: with record_function("model_train"): losses_t = train_and_valid( rank, @@ -845,10 +888,20 @@ def train_mlpf( time_per_epoch = (t1 - t0_initial) / epoch eta = epochs_remaining * time_per_epoch / 60 + # _logger.info( + # f"Rank {rank}: epoch={epoch} / {num_epochs} " + # + f"train_loss={losses_t['Total']:.4f} " + # + f"valid_loss={losses_v['Total']:.4f} " + # + f"stale={stale_epochs} " + # + f"epoch_train_time={round((t_train-t0)/60, 2)}m " + # + f"epoch_valid_time={round((t_valid-t_train)/60, 2)}m " + # + f"epoch_total_time={round((t1-t0)/60, 2)}m " + # + f"eta={round(eta, 1)}m", + # color="bold", + # ) + _logger.info( f"Rank {rank}: epoch={epoch} / {num_epochs} " - + f"train_loss={losses_t['Total']:.4f} " - + f"valid_loss={losses_v['Total']:.4f} " + f"stale={stale_epochs} " + f"epoch_train_time={round((t_train-t0)/60, 2)}m " + 
f"epoch_valid_time={round((t_valid-t_train)/60, 2)}m " @@ -857,6 +910,22 @@ def train_mlpf( color="bold", ) + _logger.info( + f"train: loss_total={losses_t['Total']:.4f} " + + f"loss_clf={losses_t['Classification']:.4f} " + + f"loss_clfbinary={losses_t['Classification_binary']:.4f} " + + f"loss_reg={losses_t['Regression']:.4f} ", + color="bold", + ) + + _logger.info( + f"valid: loss_total={losses_v['Total']:.4f} " + + f"loss_clf={losses_v['Classification']:.4f} " + + f"loss_clfbinary={losses_v['Classification_binary']:.4f} " + + f"loss_reg={losses_v['Regression']:.4f} ", + color="bold", + ) + # save separate json files with stats for each epoch, this is robust to crashed-then-resumed trainings history_path = Path(outdir) / "history" history_path.mkdir(parents=True, exist_ok=True) @@ -958,7 +1027,9 @@ def run(rank, world_size, config, args, outdir, logfile): _logger.info(f"Model directory {outdir}", color="bold") if args.comet: - comet_experiment = create_comet_experiment(config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir) + comet_experiment = create_comet_experiment( + config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir + ) comet_experiment.set_name(f"rank_{rank}_{Path(outdir).name}") comet_experiment.log_parameter("run_id", Path(outdir).name) comet_experiment.log_parameter("world_size", world_size) @@ -1197,7 +1268,9 @@ def train_ray_trial(config, args, outdir=None): loaders = get_interleaved_dataloaders(world_size, rank, config, use_cuda, use_ray=True) if args.comet: - comet_experiment = create_comet_experiment(config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir) + comet_experiment = create_comet_experiment( + config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir + ) comet_experiment.set_name(f"world_rank_{world_rank}_{Path(outdir).name}") comet_experiment.log_parameter("run_id", Path(outdir).name) comet_experiment.log_parameter("world_size", world_size) @@ -1231,7 +1304,9 @@ def train_ray_trial(config, args, outdir=None): if args.resume_training: model, optimizer = load_checkpoint(checkpoint, model, optimizer) start_epoch = checkpoint["extra_state"]["epoch"] + 1 - lr_schedule = get_lr_schedule(config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch=start_epoch - 1) + lr_schedule = get_lr_schedule( + config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch=start_epoch - 1 + ) else: # start a new training with model weights loaded from a pre-trained model model = load_checkpoint(checkpoint, model) @@ -1346,7 +1421,6 @@ def run_hpo(config, args): import ray from ray import tune from ray.train.torch import TorchTrainer - from raytune.pt_search_space import raytune_num_samples, search_space from raytune.utils import get_raytune_schedule, get_raytune_search_alg @@ -1395,7 +1469,9 @@ def run_hpo(config, args): if tune.Tuner.can_restore(str(expdir)): # resume unfinished HPO run - tuner = tune.Tuner.restore(str(expdir), trainable=trainer, resume_errored=True, restart_errored=False, resume_unfinished=True) + tuner = tune.Tuner.restore( + str(expdir), trainable=trainer, resume_errored=True, restart_errored=False, resume_unfinished=True + ) else: # start new HPO run search_space = {"train_loop_config": search_space} # the ray TorchTrainer only takes a single arg: train_loop_config @@ -1436,4 +1512,6 @@ def run_hpo(config, args): print(result_df.columns) logging.info("Total time of Tuner.fit(): {}".format(end - start)) - logging.info("Best hyperparameters found according to {} were: 
{}".format(config["raytune"]["default_metric"], best_config)) + logging.info( + "Best hyperparameters found according to {} were: {}".format(config["raytune"]["default_metric"], best_config) + ) From e9930ebe92d2088da49d37dc7c29c88034cb9349 Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 15:54:03 +0200 Subject: [PATCH 14/66] up --- parameters/pytorch/pyg-clic-ttbar.yaml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/parameters/pytorch/pyg-clic-ttbar.yaml b/parameters/pytorch/pyg-clic-ttbar.yaml index 2525286ec..21aa978b5 100644 --- a/parameters/pytorch/pyg-clic-ttbar.yaml +++ b/parameters/pytorch/pyg-clic-ttbar.yaml @@ -50,17 +50,18 @@ model: attention: conv_type: attention - num_convs: 12 - dropout_ff: 0.1 + num_convs: 8 + dropout_ff: 0.0 dropout_conv_id_mha: 0.0 dropout_conv_id_ff: 0.0 dropout_conv_reg_mha: 0.0 dropout_conv_reg_ff: 0.0 activation: "relu" - head_dim: 32 - num_heads: 32 + head_dim: 64 + num_heads: 12 attention_type: math - use_pre_layernorm: True + use_improved_ffn: True + use_improved_attblock: False mamba: conv_type: mamba From 34b89a0dd8b60e9da5e697b80db8526dfecd4e1f Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 15:54:44 +0200 Subject: [PATCH 15/66] up --- parameters/pytorch/pyg-clic-ttbar.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/parameters/pytorch/pyg-clic-ttbar.yaml b/parameters/pytorch/pyg-clic-ttbar.yaml index 21aa978b5..0e2d2e5bb 100644 --- a/parameters/pytorch/pyg-clic-ttbar.yaml +++ b/parameters/pytorch/pyg-clic-ttbar.yaml @@ -60,8 +60,6 @@ model: head_dim: 64 num_heads: 12 attention_type: math - use_improved_ffn: True - use_improved_attblock: False mamba: conv_type: mamba From 99082ad4ac39f96ee7772429715fdec1d6cbcaf6 Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 16:31:57 +0200 Subject: [PATCH 16/66] up --- parameters/pytorch/pyg-clic-ttbar.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parameters/pytorch/pyg-clic-ttbar.yaml b/parameters/pytorch/pyg-clic-ttbar.yaml index 0e2d2e5bb..7d404963f 100644 --- a/parameters/pytorch/pyg-clic-ttbar.yaml +++ b/parameters/pytorch/pyg-clic-ttbar.yaml @@ -26,7 +26,7 @@ val_freq: # run an extra validation run every val_freq training steps model: trainable: all learned_representation_mode: last #last, concat - input_encoding: joint #split, joint + input_encoding: split #split, joint pt_mode: linear eta_mode: linear sin_phi_mode: linear From 47c00b210516f95fa3b2eb0c5e4b1992913829dc Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 17:10:15 +0200 Subject: [PATCH 17/66] standardize inputs --- mlpf/pyg/clic_standardization.json | 1 + mlpf/pyg/mlpf.py | 60 +++++++++++++++++++++++--- parameters/pytorch/pyg-clic-ttbar.yaml | 1 + 3 files changed, 55 insertions(+), 7 deletions(-) create mode 100644 mlpf/pyg/clic_standardization.json diff --git a/mlpf/pyg/clic_standardization.json b/mlpf/pyg/clic_standardization.json new file mode 100644 index 000000000..99114010b --- /dev/null +++ b/mlpf/pyg/clic_standardization.json @@ -0,0 +1 @@ +{"2.1.0": {"PFelement1": {"mean": [1.0, 3.306542158126831, -0.0016281852731481194, 0.00044645098387263715, 0.008498543873429298, 4.233071804046631, 31.000581741333008, 15.700974464416504, 0.0, 0.0, 43.88750457763672, -0.002807975746691227, 0.0019595674239099026, 7.4845520430244505e-06, -0.07514938712120056, -1.0, 0.0], "std": [0.0, 14.216273307800293, 0.864812970161438, 0.7057437300682068, 0.708417534828186, 20.747175216674805, 1617.955078125, 6.582387924194336, 0.0, 0.0, 
56.95081329345703, 1.3340226411819458, 2.9433951377868652, 0.0037820138968527317, 37.608154296875, 0.0, 0.0]}, "PFelement2": {"mean": [2.0, 2.568070650100708, -0.00073066825279966, -0.0011971204075962305, 0.004265286959707737, 3.290896415710449, 11.28282356262207, -0.4471941590309143, -4.9265971183776855, 1.571304440498352, 1.8862318992614746, 0.9784500598907471, 1.2197442629258148e-05, 80.69485473632812, 50.48505783081055, 50.41227722167969, 51.77717971801758], "std": [0.0, 4.781670093536377, 0.9176656603813171, 0.707072377204895, 0.70712810754776, 5.762104511260986, 1084.093505859375, 1080.34375, 1554.9664306640625, 0.6987221240997314, 3.8536908626556396, 3.2011756896972656, 0.00021397981618065387, 105.88664245605469, 72.17912292480469, 71.81172180175781, 72.6884765625]}}} diff --git a/mlpf/pyg/mlpf.py b/mlpf/pyg/mlpf.py index 59d7564ab..5a2770973 100644 --- a/mlpf/pyg/mlpf.py +++ b/mlpf/pyg/mlpf.py @@ -1,13 +1,13 @@ +import math + +import numpy as np import torch import torch.nn as nn - -from .gnn_lsh import CombinedGraphLayer - from pyg.logger import _logger -import math -import numpy as np from torch.nn.attention import SDPBackend, sdpa_kernel +from .gnn_lsh import CombinedGraphLayer + def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0): # From https://github.com/rwightman/pytorch-image-models/blob/ @@ -57,6 +57,37 @@ def norm_cdf(x): return tensor +def standardize_inputs(X, elemtypes_nonzero): + import json + + import numpy as np + + with open("clic_standardization.json", "rb") as f: + standard_dict = json.load(f)["2.1.0"] + + for i, ielem in enumerate(elemtypes_nonzero): + + # get mean/std of features of that elem + mean = np.array(standard_dict[f"PFelement{ielem}"]["mean"]) + std = np.array(standard_dict[f"PFelement{ielem}"]["std"]) + + # standardize + Xfeat_normed_msked = X.clone() + Xfeat_normed_msked[..., 1:] = (Xfeat_normed_msked[..., 1:] - mean[..., 1:]) / std[..., 1:] + + # msk other elements + msk = Xfeat_normed_msked[..., 0:1] == ielem + Xfeat_normed_msked = Xfeat_normed_msked * msk + Xfeat_normed_msked = torch.nan_to_num(Xfeat_normed_msked, nan=0.0) + + if i == 0: + Xfeat_normed = Xfeat_normed_msked + else: + Xfeat_normed += Xfeat_normed_msked + + return Xfeat_normed + + def get_activation(activation): if activation == "elu": act = nn.ELU @@ -96,7 +127,9 @@ def __init__( self.mha = torch.nn.MultiheadAttention(embedding_dim, num_heads, dropout=dropout_mha, batch_first=True) self.norm0 = torch.nn.LayerNorm(embedding_dim) self.norm1 = torch.nn.LayerNorm(embedding_dim) - self.seq = torch.nn.Sequential(nn.Linear(embedding_dim, width), self.act(), nn.Linear(width, embedding_dim), self.act()) + self.seq = torch.nn.Sequential( + nn.Linear(embedding_dim, width), self.act(), nn.Linear(width, embedding_dim), self.act() + ) self.dropout = torch.nn.Dropout(dropout_ff) _logger.info("using attention_type={}".format(attention_type)) # params for torch sdp_kernel @@ -262,6 +295,12 @@ def __init__( dropout_conv_id_mha=0.0, dropout_conv_id_ff=0.0, use_pre_layernorm=False, + # mamba specific parameters + d_state=16, + d_conv=4, + expand=2, + # standardize_inputs + standardize_inputs=False, ): super(MLPF, self).__init__() @@ -281,6 +320,8 @@ def __init__( self.use_pre_layernorm = use_pre_layernorm + self.standardize_inputs = standardize_inputs + if self.conv_type == "attention": embedding_dim = num_heads * head_dim width = num_heads * head_dim @@ -375,6 +416,9 @@ def __init__( def forward(self, X_features, mask): Xfeat_normed = X_features + if self.standardize_inputs: + Xfeat_normed = 
standardize_inputs(X_features) + embeddings_id, embeddings_reg = [], [] if self.num_convs != 0: if self.input_encoding == "joint": @@ -434,7 +478,9 @@ def forward(self, X_features, mask): e_real[~mask] = 0 e_real[torch.isinf(e_real)] = 0 e_real[torch.isnan(e_real)] = 0 - preds_energy = e_real + torch.nn.functional.relu(self.nn_energy(X_features, final_embedding_reg, X_features[..., 5:6])) + preds_energy = e_real + torch.nn.functional.relu( + self.nn_energy(X_features, final_embedding_reg, X_features[..., 5:6]) + ) preds_momentum = torch.cat([preds_pt, preds_eta, preds_sin_phi, preds_cos_phi, preds_energy], axis=-1) return preds_binary_particle, preds_pid, preds_momentum diff --git a/parameters/pytorch/pyg-clic-ttbar.yaml b/parameters/pytorch/pyg-clic-ttbar.yaml index 7d404963f..e949ac554 100644 --- a/parameters/pytorch/pyg-clic-ttbar.yaml +++ b/parameters/pytorch/pyg-clic-ttbar.yaml @@ -60,6 +60,7 @@ model: head_dim: 64 num_heads: 12 attention_type: math + standardize_inputs: True mamba: conv_type: mamba From 97a31944c1596b65ff421f2b0f7114ad12562312 Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 17:10:49 +0200 Subject: [PATCH 18/66] up elemtypes_nonzero --- mlpf/pyg/mlpf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlpf/pyg/mlpf.py b/mlpf/pyg/mlpf.py index 5a2770973..9d3d6afd6 100644 --- a/mlpf/pyg/mlpf.py +++ b/mlpf/pyg/mlpf.py @@ -417,7 +417,7 @@ def forward(self, X_features, mask): Xfeat_normed = X_features if self.standardize_inputs: - Xfeat_normed = standardize_inputs(X_features) + Xfeat_normed = standardize_inputs(X_features, self.elemtypes_nonzero) embeddings_id, embeddings_reg = [], [] if self.num_convs != 0: From eb51a50dd197bac4ac44ad495c1e6c0096668853 Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 17:12:43 +0200 Subject: [PATCH 19/66] up --- mlpf/pyg/mlpf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlpf/pyg/mlpf.py b/mlpf/pyg/mlpf.py index 9d3d6afd6..db30d57c5 100644 --- a/mlpf/pyg/mlpf.py +++ b/mlpf/pyg/mlpf.py @@ -62,7 +62,7 @@ def standardize_inputs(X, elemtypes_nonzero): import numpy as np - with open("clic_standardization.json", "rb") as f: + with open("/pfvolcentral/clic_standardization.json", "rb") as f: standard_dict = json.load(f)["2.1.0"] for i, ielem in enumerate(elemtypes_nonzero): From 9520e2092c7db4ae92c3d911e85ea2dfc76b88df Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 17:14:25 +0200 Subject: [PATCH 20/66] up --- mlpf/pyg/mlpf.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mlpf/pyg/mlpf.py b/mlpf/pyg/mlpf.py index db30d57c5..3a1fd9c2b 100644 --- a/mlpf/pyg/mlpf.py +++ b/mlpf/pyg/mlpf.py @@ -60,19 +60,18 @@ def norm_cdf(x): def standardize_inputs(X, elemtypes_nonzero): import json - import numpy as np - with open("/pfvolcentral/clic_standardization.json", "rb") as f: standard_dict = json.load(f)["2.1.0"] for i, ielem in enumerate(elemtypes_nonzero): + Xfeat_normed_msked = X.clone() + # get mean/std of features of that elem - mean = np.array(standard_dict[f"PFelement{ielem}"]["mean"]) - std = np.array(standard_dict[f"PFelement{ielem}"]["std"]) + mean = torch.tensor(standard_dict[f"PFelement{ielem}"]["mean"]).to(Xfeat_normed_msked.device) + std = torch.tensor(standard_dict[f"PFelement{ielem}"]["std"]).to(Xfeat_normed_msked.device) # standardize - Xfeat_normed_msked = X.clone() Xfeat_normed_msked[..., 1:] = (Xfeat_normed_msked[..., 1:] - mean[..., 1:]) / std[..., 1:] # msk other elements From 1ec4e9b2ed4ea7dbd15c9cc1217e8a29fea43edf Mon Sep 17 
00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 10:57:51 +0200 Subject: [PATCH 21/66] up --- mlpf/pyg/mlpf.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mlpf/pyg/mlpf.py b/mlpf/pyg/mlpf.py index 3a1fd9c2b..f47c7cec0 100644 --- a/mlpf/pyg/mlpf.py +++ b/mlpf/pyg/mlpf.py @@ -294,10 +294,6 @@ def __init__( dropout_conv_id_mha=0.0, dropout_conv_id_ff=0.0, use_pre_layernorm=False, - # mamba specific parameters - d_state=16, - d_conv=4, - expand=2, # standardize_inputs standardize_inputs=False, ): From 963f81223abac32f4d84bffd7196e14bbeda1f1b Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 11:01:07 +0200 Subject: [PATCH 22/66] up --- mlpf/pyg/training.py | 85 +++++++++++++++++++++++--------------------- 1 file changed, 45 insertions(+), 40 deletions(-) diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index 93218f098..ac56db023 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -1,4 +1,5 @@ import csv +import glob import json import logging import os @@ -12,14 +13,12 @@ from typing import Optional import fastjet -import numpy as np -import pandas import matplotlib import matplotlib.pyplot as plt -import glob - -# comet needs to be imported before torch -from comet_ml import OfflineExperiment, Experiment # noqa: F401, isort:skip +import numpy as np +import pandas +import sklearn +import sklearn.metrics import torch import torch.distributed as dist import torch.multiprocessing as mp @@ -27,7 +26,7 @@ import yaml from pyg.inference import make_plots, run_predictions from pyg.logger import _configLogger, _logger -from pyg.mlpf import MLPF +from pyg.mlpf import MLPF, set_save_attention from pyg.PFDataset import Collater, PFDataset, get_interleaved_dataloaders from pyg.utils import ( CLASS_LABELS, @@ -46,35 +45,11 @@ from torch.nn import functional as F from torch.profiler import ProfilerActivity, profile, record_function from torch.utils.tensorboard import SummaryWriter - -from pyg.logger import _logger, _configLogger -from pyg.utils import ( - unpack_predictions, - unpack_target, - get_model_state_dict, - load_checkpoint, - save_checkpoint, - CLASS_LABELS, - X_FEATURES, - ELEM_TYPES_NONZERO, - save_HPs, - get_lr_schedule, - count_parameters, -) - - -import fastjet -from pyg.inference import make_plots, run_predictions - -from pyg.mlpf import set_save_attention -from pyg.mlpf import MLPF -from pyg.PFDataset import Collater, PFDataset, get_interleaved_dataloaders from utils import create_comet_experiment # comet needs to be imported before torch from comet_ml import OfflineExperiment, Experiment # noqa: F401, isort:skip - # Ignore divide by 0 errors np.seterr(divide="ignore", invalid="ignore") @@ -119,7 +94,9 @@ def mlpf_loss(y, ypred, batch): # binary loss for particle / no-particle classification # loss_binary_classification = loss_obj_id(ypred["cls_binary"], (y["cls_id"] != 0).long()).reshape(y["cls_id"].shape) - loss_binary_classification = 10 * torch.nn.functional.cross_entropy(ypred["cls_binary"], (y["cls_id"] != 0).long(), reduction="none") + loss_binary_classification = 10 * torch.nn.functional.cross_entropy( + ypred["cls_binary"], (y["cls_id"] != 0).long(), reduction="none" + ) # compare the particle type, only for cases where there was a true particle loss_pid_classification = loss_obj_id(ypred["cls_id_onehot"], y["cls_id"]).reshape(y["cls_id"].shape) @@ -406,18 +383,30 @@ def validation_plots(batch, ypred_raw, ygen, ypred, tensorboard_writer, epoch, o ratio = (ypred_raw[2][batch.mask][:, 1] / batch.ygen[batch.mask][:, 3])[batch.ygen[batch.mask][:, 
0] != 0] tensorboard_writer.add_histogram("eta_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("sphi_target", torch.clamp(batch.ygen[batch.mask][:, 4], -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("sphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 2], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram( + "sphi_target", torch.clamp(batch.ygen[batch.mask][:, 4], -10, 10), global_step=epoch + ) + tensorboard_writer.add_histogram( + "sphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 2], -10, 10), global_step=epoch + ) ratio = (ypred_raw[2][batch.mask][:, 2] / batch.ygen[batch.mask][:, 4])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("sphi_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("cphi_target", torch.clamp(batch.ygen[batch.mask][:, 5], -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("cphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 3], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram( + "cphi_target", torch.clamp(batch.ygen[batch.mask][:, 5], -10, 10), global_step=epoch + ) + tensorboard_writer.add_histogram( + "cphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 3], -10, 10), global_step=epoch + ) ratio = (ypred_raw[2][batch.mask][:, 3] / batch.ygen[batch.mask][:, 5])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("cphi_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("energy_target", torch.clamp(batch.ygen[batch.mask][:, 6], -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("energy_pred", torch.clamp(ypred_raw[2][batch.mask][:, 4], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram( + "energy_target", torch.clamp(batch.ygen[batch.mask][:, 6], -10, 10), global_step=epoch + ) + tensorboard_writer.add_histogram( + "energy_pred", torch.clamp(ypred_raw[2][batch.mask][:, 4], -10, 10), global_step=epoch + ) ratio = (ypred_raw[2][batch.mask][:, 4] / batch.ygen[batch.mask][:, 6])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("energy_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) @@ -910,19 +899,35 @@ def train_mlpf( color="bold", ) + log_t = ( + losses_t["Regression_pt"] + + losses_t["Regression_eta"] + + losses_t["Regression_sin_phi"] + + losses_t["Regression_cos_phi"] + + losses_t["Regression_energy"] + ) + _logger.info( f"train: loss_total={losses_t['Total']:.4f} " + f"loss_clf={losses_t['Classification']:.4f} " + f"loss_clfbinary={losses_t['Classification_binary']:.4f} " - + f"loss_reg={losses_t['Regression']:.4f} ", + + f"loss_reg={log_t:.4f} ", color="bold", ) + log_v = ( + losses_v["Regression_pt"] + + losses_v["Regression_eta"] + + losses_v["Regression_sin_phi"] + + losses_v["Regression_cos_phi"] + + losses_v["Regression_energy"] + ) + _logger.info( f"valid: loss_total={losses_v['Total']:.4f} " + f"loss_clf={losses_v['Classification']:.4f} " + f"loss_clfbinary={losses_v['Classification_binary']:.4f} " - + f"loss_reg={losses_v['Regression']:.4f} ", + + f"loss_reg={log_v:.4f} ", color="bold", ) From c3dea3c298e5d2b5d4336f6d9531ef08dabc9afe Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 11:18:00 +0200 Subject: [PATCH 23/66] up --- parameters/pytorch/pyg-clic-ttbar.yaml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/parameters/pytorch/pyg-clic-ttbar.yaml b/parameters/pytorch/pyg-clic-ttbar.yaml index e949ac554..38ac0a553 100644 --- 
a/parameters/pytorch/pyg-clic-ttbar.yaml +++ b/parameters/pytorch/pyg-clic-ttbar.yaml @@ -1,5 +1,6 @@ backend: pytorch +save_attention: yes dataset: clic sort_data: no data_dir: @@ -27,11 +28,11 @@ model: trainable: all learned_representation_mode: last #last, concat input_encoding: split #split, joint - pt_mode: linear + pt_mode: direct-elemtype-split eta_mode: linear sin_phi_mode: linear cos_phi_mode: linear - energy_mode: linear + energy_mode: direct-elemtype-split gnn_lsh: conv_type: gnn_lsh @@ -60,7 +61,8 @@ model: head_dim: 64 num_heads: 12 attention_type: math - standardize_inputs: True + standardize_inputs: False + use_pre_layernorm: True mamba: conv_type: mamba From 8057f0d13c2ffa341423d5b997b610512a713cee Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 11:18:57 +0200 Subject: [PATCH 24/66] up --- ...clic-ttbar.yaml => pyg-clic-ttbar-21.yaml} | 0 parameters/pytorch/pyg-clic-ttbar-22.yaml | 123 ++++++++++++++++++ 2 files changed, 123 insertions(+) rename parameters/pytorch/{pyg-clic-ttbar.yaml => pyg-clic-ttbar-21.yaml} (100%) create mode 100644 parameters/pytorch/pyg-clic-ttbar-22.yaml diff --git a/parameters/pytorch/pyg-clic-ttbar.yaml b/parameters/pytorch/pyg-clic-ttbar-21.yaml similarity index 100% rename from parameters/pytorch/pyg-clic-ttbar.yaml rename to parameters/pytorch/pyg-clic-ttbar-21.yaml diff --git a/parameters/pytorch/pyg-clic-ttbar-22.yaml b/parameters/pytorch/pyg-clic-ttbar-22.yaml new file mode 100644 index 000000000..90a12c4bc --- /dev/null +++ b/parameters/pytorch/pyg-clic-ttbar-22.yaml @@ -0,0 +1,123 @@ +backend: pytorch + +save_attention: yes +dataset: clic +sort_data: no +data_dir: +gpus: 1 +gpu_batch_multiplier: 1 +load: +num_epochs: 100 +patience: 20 +lr: 0.0001 +lr_schedule: cosinedecay # constant, cosinedecay, onecycle +conv_type: attention # gnn_lsh, attention, mamba, flashattention +ntrain: +ntest: +nvalid: +num_workers: 0 +prefetch_factor: +checkpoint_freq: +comet_name: particleflow-pt +comet_offline: False +comet_step_freq: 100 +dtype: float32 +val_freq: # run an extra validation run every val_freq training steps + +model: + trainable: all + learned_representation_mode: last #last, concat + input_encoding: split #split, joint + pt_mode: direct-elemtype-split + eta_mode: linear + sin_phi_mode: linear + cos_phi_mode: linear + energy_mode: direct-elemtype-split + + gnn_lsh: + conv_type: gnn_lsh + embedding_dim: 512 + width: 512 + num_convs: 8 + activation: "elu" + # gnn-lsh specific parameters + bin_size: 32 + max_num_bins: 200 + distance_dim: 128 + layernorm: True + num_node_messages: 2 + ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 2 + + attention: + conv_type: attention + num_convs: 8 + dropout_ff: 0.0 + dropout_conv_id_mha: 0.0 + dropout_conv_id_ff: 0.0 + dropout_conv_reg_mha: 0.0 + dropout_conv_reg_ff: 0.0 + activation: "relu" + head_dim: 64 + num_heads: 12 + attention_type: math + standardize_inputs: False + use_pre_layernorm: True + + mamba: + conv_type: mamba + embedding_dim: 128 + width: 128 + num_convs: 2 + dropout: 0.0 + activation: "elu" + # transformer specific paramters + num_heads: 2 + # mamba specific paramters + d_state: 16 + d_conv: 4 + expand: 2 + +lr_schedule_config: + onecycle: + pct_start: 0.3 + +raytune: + local_dir: # Note: please specify an absolute path + sched: # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: 
+ max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.2.0 + +valid_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.2.0 + +test_dataset: + clic_edm_ttbar_pf: + version: 2.2.0 \ No newline at end of file From 60994cebc9e21b94fd40957ad4c32eaf317e615a Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 11:24:25 +0200 Subject: [PATCH 25/66] up 26 feats --- mlpf/pyg/training.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index ac56db023..77e32877a 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -993,8 +993,13 @@ def run(rank, world_size, config, args, outdir, logfile): model, optimizer = load_checkpoint(checkpoint, model, optimizer) else: # instantiate a new model in the outdir created + + input_dim = ( + len(X_FEATURES[config["dataset"]]) if config["test_dataset"]["clic_edm_ttbar_pf"]["version"] != "2.2.0" else 26 + ) + model_kwargs = { - "input_dim": len(X_FEATURES[config["dataset"]]), + "input_dim": input_dim, "num_classes": len(CLASS_LABELS[config["dataset"]]), "input_encoding": config["model"]["input_encoding"], "pt_mode": config["model"]["pt_mode"], @@ -1230,8 +1235,12 @@ def train_ray_trial(config, args, outdir=None): world_rank = ray.train.get_context().get_world_rank() world_size = ray.train.get_context().get_world_size() + input_dim = ( + len(X_FEATURES[config["dataset"]]) if config["test_dataset"]["clic_edm_ttbar_pf"]["version"] != "2.2.0" else 26 + ) + model_kwargs = { - "input_dim": len(X_FEATURES[config["dataset"]]), + "input_dim": input_dim, "num_classes": len(CLASS_LABELS[config["dataset"]]), "input_encoding": config["model"]["input_encoding"], "pt_mode": config["model"]["pt_mode"], From c9853230247b3869ef15f5da3f635d93587b2aab Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 11:32:24 +0200 Subject: [PATCH 26/66] up --- mlpf/pyg/training.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index 77e32877a..d02ed1c04 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -50,6 +50,7 @@ # comet needs to be imported before torch from comet_ml import OfflineExperiment, Experiment # noqa: F401, isort:skip + # Ignore divide by 0 errors np.seterr(divide="ignore", invalid="ignore") @@ -993,13 +994,8 @@ def run(rank, world_size, config, args, outdir, logfile): model, optimizer = load_checkpoint(checkpoint, model, optimizer) else: # instantiate a new model in the outdir created - - input_dim = ( - len(X_FEATURES[config["dataset"]]) if config["test_dataset"]["clic_edm_ttbar_pf"]["version"] != "2.2.0" else 26 - ) - model_kwargs = { - "input_dim": input_dim, + "input_dim": len(X_FEATURES[config["dataset"]]), "num_classes": len(CLASS_LABELS[config["dataset"]]), "input_encoding": config["model"]["input_encoding"], "pt_mode": config["model"]["pt_mode"], @@ -1235,12 +1231,8 @@ def train_ray_trial(config, args, outdir=None): world_rank = ray.train.get_context().get_world_rank() world_size = ray.train.get_context().get_world_size() - input_dim = ( - len(X_FEATURES[config["dataset"]]) if config["test_dataset"]["clic_edm_ttbar_pf"]["version"] != "2.2.0" else 26 - ) - model_kwargs = { - "input_dim": input_dim, + "input_dim": len(X_FEATURES[config["dataset"]]), "num_classes": len(CLASS_LABELS[config["dataset"]]), 
"input_encoding": config["model"]["input_encoding"], "pt_mode": config["model"]["pt_mode"], From b41ba20edb645cad01104e852ac168b1e11d95eb Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 11:33:15 +0200 Subject: [PATCH 27/66] add 26 input_dim --- mlpf/pyg/training.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index d02ed1c04..c2edfa89b 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -994,8 +994,13 @@ def run(rank, world_size, config, args, outdir, logfile): model, optimizer = load_checkpoint(checkpoint, model, optimizer) else: # instantiate a new model in the outdir created + + input_dim = ( + len(X_FEATURES[config["dataset"]]) if config["test_dataset"]["clic_edm_ttbar_pf"]["version"] != "2.2.0" else 26 + ) + model_kwargs = { - "input_dim": len(X_FEATURES[config["dataset"]]), + "input_dim": input_dim, "num_classes": len(CLASS_LABELS[config["dataset"]]), "input_encoding": config["model"]["input_encoding"], "pt_mode": config["model"]["pt_mode"], @@ -1231,8 +1236,12 @@ def train_ray_trial(config, args, outdir=None): world_rank = ray.train.get_context().get_world_rank() world_size = ray.train.get_context().get_world_size() + input_dim = ( + len(X_FEATURES[config["dataset"]]) if config["test_dataset"]["clic_edm_ttbar_pf"]["version"] != "2.2.0" else 26 + ) + model_kwargs = { - "input_dim": len(X_FEATURES[config["dataset"]]), + "input_dim": input_dim, "num_classes": len(CLASS_LABELS[config["dataset"]]), "input_encoding": config["model"]["input_encoding"], "pt_mode": config["model"]["pt_mode"], From 31bd710833f4e5732aa6ddccd414c6804a8fadd2 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 11:35:51 +0200 Subject: [PATCH 28/66] up --- mlpf/pyg/training.py | 91 ++++++++++++++++++++++---------------------- 1 file changed, 46 insertions(+), 45 deletions(-) diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index c2edfa89b..dd8a433fb 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -17,8 +17,9 @@ import matplotlib.pyplot as plt import numpy as np import pandas -import sklearn -import sklearn.metrics + +# import sklearn +# import sklearn.metrics import torch import torch.distributed as dist import torch.multiprocessing as mp @@ -479,10 +480,10 @@ def train_and_valid( loss_accum = 0.0 val_freq_time_0 = time.time() - if not is_train: - cm_X_gen = np.zeros((13, 13)) - cm_X_pred = np.zeros((13, 13)) - cm_id = np.zeros((13, 13)) + # if not is_train: + # cm_X_gen = np.zeros((13, 13)) + # cm_X_pred = np.zeros((13, 13)) + # cm_id = np.zeros((13, 13)) for itrain, batch in iterator: set_save_attention(model, outdir, False) @@ -505,25 +506,25 @@ def train_and_valid( ypred = unpack_predictions(ypred_raw) - if not is_train: - cm_X_gen += sklearn.metrics.confusion_matrix( - batch.X[:, :, 0][batch.mask].detach().cpu().numpy(), - ygen["cls_id"][batch.mask].detach().cpu().numpy(), - labels=range(13), - ) - cm_X_pred += sklearn.metrics.confusion_matrix( - batch.X[:, :, 0][batch.mask].detach().cpu().numpy(), - ypred["cls_id"][batch.mask].detach().cpu().numpy(), - labels=range(13), - ) - cm_id += sklearn.metrics.confusion_matrix( - ygen["cls_id"][batch.mask].detach().cpu().numpy(), - ypred["cls_id"][batch.mask].detach().cpu().numpy(), - labels=range(13), - ) - # save the events of the first validation batch for quick checks - if (rank == 0 or rank == "cpu") and itrain == 0: - validation_plots(batch, ypred_raw, ygen, ypred, tensorboard_writer, epoch, outdir) + # if not is_train: + # 
cm_X_gen += sklearn.metrics.confusion_matrix( + # batch.X[:, :, 0][batch.mask].detach().cpu().numpy(), + # ygen["cls_id"][batch.mask].detach().cpu().numpy(), + # labels=range(13), + # ) + # cm_X_pred += sklearn.metrics.confusion_matrix( + # batch.X[:, :, 0][batch.mask].detach().cpu().numpy(), + # ypred["cls_id"][batch.mask].detach().cpu().numpy(), + # labels=range(13), + # ) + # cm_id += sklearn.metrics.confusion_matrix( + # ygen["cls_id"][batch.mask].detach().cpu().numpy(), + # ypred["cls_id"][batch.mask].detach().cpu().numpy(), + # labels=range(13), + # ) + # # save the events of the first validation batch for quick checks + # if (rank == 0 or rank == "cpu") and itrain == 0: + # validation_plots(batch, ypred_raw, ygen, ypred, tensorboard_writer, epoch, outdir) with torch.autocast(device_type=device_type, dtype=dtype, enabled=device_type == "cuda"): if is_train: loss = mlpf_loss(ygen, ypred, batch) @@ -623,26 +624,26 @@ def train_and_valid( comet_experiment.log_metrics(intermediate_losses_v, prefix="valid", step=step) val_freq_time_0 = time.time() # reset intermediate validation spacing timer - if not is_train and comet_experiment: - comet_experiment.log_confusion_matrix( - matrix=cm_X_gen, - title="Element to target", - row_label="X", - column_label="target", - epoch=epoch, - file_name="cm_X_gen.json", - ) - comet_experiment.log_confusion_matrix( - matrix=cm_X_pred, - title="Element to pred", - row_label="X", - column_label="pred", - epoch=epoch, - file_name="cm_X_pred.json", - ) - comet_experiment.log_confusion_matrix( - matrix=cm_id, title="Target to pred", row_label="gen", column_label="pred", epoch=epoch, file_name="cm_id.json" - ) + # if not is_train and comet_experiment: + # comet_experiment.log_confusion_matrix( + # matrix=cm_X_gen, + # title="Element to target", + # row_label="X", + # column_label="target", + # epoch=epoch, + # file_name="cm_X_gen.json", + # ) + # comet_experiment.log_confusion_matrix( + # matrix=cm_X_pred, + # title="Element to pred", + # row_label="X", + # column_label="pred", + # epoch=epoch, + # file_name="cm_X_pred.json", + # ) + # comet_experiment.log_confusion_matrix( + # matrix=cm_id, title="Target to pred", row_label="gen", column_label="pred", epoch=epoch, file_name="cm_id.json" + # ) num_data = torch.tensor(len(data_loader), device=rank) # sum up the number of steps from all workers From 73818768828c362531844cf5cc701eb0ac45ae43 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 11:53:49 +0200 Subject: [PATCH 29/66] up vs 2.2.0 for standardization --- mlpf/pyg/clic_standardization.json | 2 +- mlpf/pyg/mlpf.py | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/mlpf/pyg/clic_standardization.json b/mlpf/pyg/clic_standardization.json index 99114010b..616aa0023 100644 --- a/mlpf/pyg/clic_standardization.json +++ b/mlpf/pyg/clic_standardization.json @@ -1 +1 @@ -{"2.1.0": {"PFelement1": {"mean": [1.0, 3.306542158126831, -0.0016281852731481194, 0.00044645098387263715, 0.008498543873429298, 4.233071804046631, 31.000581741333008, 15.700974464416504, 0.0, 0.0, 43.88750457763672, -0.002807975746691227, 0.0019595674239099026, 7.4845520430244505e-06, -0.07514938712120056, -1.0, 0.0], "std": [0.0, 14.216273307800293, 0.864812970161438, 0.7057437300682068, 0.708417534828186, 20.747175216674805, 1617.955078125, 6.582387924194336, 0.0, 0.0, 56.95081329345703, 1.3340226411819458, 2.9433951377868652, 0.0037820138968527317, 37.608154296875, 0.0, 0.0]}, "PFelement2": {"mean": [2.0, 2.568070650100708, -0.00073066825279966, 
-0.0011971204075962305, 0.004265286959707737, 3.290896415710449, 11.28282356262207, -0.4471941590309143, -4.9265971183776855, 1.571304440498352, 1.8862318992614746, 0.9784500598907471, 1.2197442629258148e-05, 80.69485473632812, 50.48505783081055, 50.41227722167969, 51.77717971801758], "std": [0.0, 4.781670093536377, 0.9176656603813171, 0.707072377204895, 0.70712810754776, 5.762104511260986, 1084.093505859375, 1080.34375, 1554.9664306640625, 0.6987221240997314, 3.8536908626556396, 3.2011756896972656, 0.00021397981618065387, 105.88664245605469, 72.17912292480469, 71.81172180175781, 72.6884765625]}}} +{"2.2.0": {"PFelement1": {"mean": [1.0, 3.313861608505249, 0.0016492522554472089, 0.0001337795110885054, 0.008735032752156258, 4.222240924835205, 52.052459716796875, 15.738365173339844, 0.0, 0.0, 43.952632904052734, 0.003598652081564069, 0.0025099683552980423, 1.8601234614834539e-06, 0.06345824152231216, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], "std": [0.0, 9.41322135925293, 0.8651331663131714, 0.707984983921051, 0.7061750888824463, 11.704730033874512, 9963.4365234375, 6.578129768371582, 0.0, 0.0, 57.3457145690918, 1.3837608098983765, 2.9924261569976807, 0.0038007558323442936, 27.722505569458008, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}, "PFelement2": {"mean": [2.0, 2.5798463821411133, -0.0016945624956861138, 0.0002545498718973249, 0.006834524683654308, 3.3009438514709473, 15.00655460357666, 0.9023095965385437, -5.961066722869873, 1.5725995302200317, 1.894817590713501, 0.9789467453956604, 1.2573817912198137e-05, 80.9168701171875, 50.47904586791992, 50.581722259521484, 51.96408462524414, 0.6345308423042297, 0.041190944612026215, 41.63414001464844, 41.84788513183594, 54.959861755371094, 11619.2177734375, -5.938568115234375, 1578.74169921875, 0.6185510158538818], "std": [0.0, 4.814330577850342, 0.918420672416687, 0.7074259519577026, 0.7067552208900452, 5.785495281219482, 1081.1585693359375, 1084.4302978515625, 1554.4815673828125, 0.6989641189575195, 3.899383783340454, 3.1981284618377686, 0.00021599895262625068, 106.33512878417969, 71.94786834716797, 72.29435729980469, 73.6717758178711, 0.6663865447044373, 0.045639555901288986, 61.58401870727539, 62.009708404541016, 88.85784912109375, 33432.66796875, 1562.9705810546875, 7268.15087890625, 1.1162511110305786]}}, "2.1.0": {"PFelement1": {"mean": [1.0, 3.306542158126831, -0.001628185505978763, 0.00044645112939178944, 0.008498544804751873, 4.233072757720947, 31.00058364868164, 15.700974464416504, 0.0, 0.0, 43.88750457763672, -0.0028079766780138016, 0.001959568355232477, 7.4845515882771e-06, -0.07514937967061996, -1.0, 0.0], "std": [0.0, 14.216273307800293, 0.864812970161438, 0.7057437300682068, 0.708417534828186, 20.747175216674805, 1617.955078125, 6.582387924194336, 0.0, 0.0, 56.95081329345703, 1.3340226411819458, 2.9433951377868652, 0.0037820138968527317, 37.608154296875, 0.0, 0.0]}, "PFelement2": {"mean": [2.0, 2.568070650100708, -0.000730668194591999, -0.0011971204075962305, 0.004265286028385162, 3.2908971309661865, 11.28282356262207, -0.4471946060657501, -4.926597595214844, 1.571304440498352, 1.886231780052185, 0.9784500002861023, 1.2197442629258148e-05, 80.69485473632812, 50.48505783081055, 50.41227340698242, 51.77717208862305], "std": [0.0, 4.781670093536377, 0.9176656603813171, 0.707072377204895, 0.70712810754776, 5.762104511260986, 1084.093505859375, 1080.34375, 1554.9664306640625, 0.6987221240997314, 3.8536908626556396, 3.2011756896972656, 0.00021397981618065387, 105.88664245605469, 72.17912292480469, 
71.81172180175781, 72.6884765625]}}} \ No newline at end of file diff --git a/mlpf/pyg/mlpf.py b/mlpf/pyg/mlpf.py index f47c7cec0..a3920288f 100644 --- a/mlpf/pyg/mlpf.py +++ b/mlpf/pyg/mlpf.py @@ -58,10 +58,16 @@ def norm_cdf(x): def standardize_inputs(X, elemtypes_nonzero): + + if X.shape[-1] == 26: + vs = "2.2.0" + else: + vs = "2.1.0" + import json with open("/pfvolcentral/clic_standardization.json", "rb") as f: - standard_dict = json.load(f)["2.1.0"] + standard_dict = json.load(f)[vs] for i, ielem in enumerate(elemtypes_nonzero): From a7f9a46d5d39b2c30fb4ef677e931d4e4e313919 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 13:47:39 +0200 Subject: [PATCH 30/66] more configs --- parameters/pytorch/pyg-clic-ttbar-21-std.yaml | 123 ++++++++++++++++++ parameters/pytorch/pyg-clic-ttbar-22-std.yaml | 123 ++++++++++++++++++ 2 files changed, 246 insertions(+) create mode 100644 parameters/pytorch/pyg-clic-ttbar-21-std.yaml create mode 100644 parameters/pytorch/pyg-clic-ttbar-22-std.yaml diff --git a/parameters/pytorch/pyg-clic-ttbar-21-std.yaml b/parameters/pytorch/pyg-clic-ttbar-21-std.yaml new file mode 100644 index 000000000..4b3152b5a --- /dev/null +++ b/parameters/pytorch/pyg-clic-ttbar-21-std.yaml @@ -0,0 +1,123 @@ +backend: pytorch + +save_attention: yes +dataset: clic +sort_data: no +data_dir: +gpus: 1 +gpu_batch_multiplier: 1 +load: +num_epochs: 100 +patience: 20 +lr: 0.0001 +lr_schedule: cosinedecay # constant, cosinedecay, onecycle +conv_type: attention # gnn_lsh, attention, mamba, flashattention +ntrain: +ntest: +nvalid: +num_workers: 0 +prefetch_factor: +checkpoint_freq: +comet_name: particleflow-pt +comet_offline: False +comet_step_freq: 100 +dtype: float32 +val_freq: # run an extra validation run every val_freq training steps + +model: + trainable: all + learned_representation_mode: last #last, concat + input_encoding: split #split, joint + pt_mode: direct-elemtype-split + eta_mode: linear + sin_phi_mode: linear + cos_phi_mode: linear + energy_mode: direct-elemtype-split + + gnn_lsh: + conv_type: gnn_lsh + embedding_dim: 512 + width: 512 + num_convs: 8 + activation: "elu" + # gnn-lsh specific parameters + bin_size: 32 + max_num_bins: 200 + distance_dim: 128 + layernorm: True + num_node_messages: 2 + ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 2 + + attention: + conv_type: attention + num_convs: 8 + dropout_ff: 0.0 + dropout_conv_id_mha: 0.0 + dropout_conv_id_ff: 0.0 + dropout_conv_reg_mha: 0.0 + dropout_conv_reg_ff: 0.0 + activation: "relu" + head_dim: 64 + num_heads: 12 + attention_type: math + standardize_inputs: True + use_pre_layernorm: True + + mamba: + conv_type: mamba + embedding_dim: 128 + width: 128 + num_convs: 2 + dropout: 0.0 + activation: "elu" + # transformer specific paramters + num_heads: 2 + # mamba specific paramters + d_state: 16 + d_conv: 4 + expand: 2 + +lr_schedule_config: + onecycle: + pct_start: 0.3 + +raytune: + local_dir: # Note: please specify an absolute path + sched: # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + +valid_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + +test_dataset: 
+ clic_edm_ttbar_pf: + version: 2.1.0 \ No newline at end of file diff --git a/parameters/pytorch/pyg-clic-ttbar-22-std.yaml b/parameters/pytorch/pyg-clic-ttbar-22-std.yaml new file mode 100644 index 000000000..d061cfc07 --- /dev/null +++ b/parameters/pytorch/pyg-clic-ttbar-22-std.yaml @@ -0,0 +1,123 @@ +backend: pytorch + +save_attention: yes +dataset: clic +sort_data: no +data_dir: +gpus: 1 +gpu_batch_multiplier: 1 +load: +num_epochs: 100 +patience: 20 +lr: 0.0001 +lr_schedule: cosinedecay # constant, cosinedecay, onecycle +conv_type: attention # gnn_lsh, attention, mamba, flashattention +ntrain: +ntest: +nvalid: +num_workers: 0 +prefetch_factor: +checkpoint_freq: +comet_name: particleflow-pt +comet_offline: False +comet_step_freq: 100 +dtype: float32 +val_freq: # run an extra validation run every val_freq training steps + +model: + trainable: all + learned_representation_mode: last #last, concat + input_encoding: split #split, joint + pt_mode: direct-elemtype-split + eta_mode: linear + sin_phi_mode: linear + cos_phi_mode: linear + energy_mode: direct-elemtype-split + + gnn_lsh: + conv_type: gnn_lsh + embedding_dim: 512 + width: 512 + num_convs: 8 + activation: "elu" + # gnn-lsh specific parameters + bin_size: 32 + max_num_bins: 200 + distance_dim: 128 + layernorm: True + num_node_messages: 2 + ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 2 + + attention: + conv_type: attention + num_convs: 8 + dropout_ff: 0.0 + dropout_conv_id_mha: 0.0 + dropout_conv_id_ff: 0.0 + dropout_conv_reg_mha: 0.0 + dropout_conv_reg_ff: 0.0 + activation: "relu" + head_dim: 64 + num_heads: 12 + attention_type: math + standardize_inputs: True + use_pre_layernorm: True + + mamba: + conv_type: mamba + embedding_dim: 128 + width: 128 + num_convs: 2 + dropout: 0.0 + activation: "elu" + # transformer specific paramters + num_heads: 2 + # mamba specific paramters + d_state: 16 + d_conv: 4 + expand: 2 + +lr_schedule_config: + onecycle: + pct_start: 0.3 + +raytune: + local_dir: # Note: please specify an absolute path + sched: # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.2.0 + +valid_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.2.0 + +test_dataset: + clic_edm_ttbar_pf: + version: 2.2.0 \ No newline at end of file From 74e635f6f81df40bb8025678f4504f7d162942b1 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 14:00:00 +0200 Subject: [PATCH 31/66] better docs --- mlpf/heptfds/clic_pf_edm4hep/utils_edm.py | 2 +- scripts/clic/postprocessing.py | 14 +++----------- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py b/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py index c63d74994..68e0b610e 100644 --- a/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py +++ b/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py @@ -40,7 +40,7 @@ "sigma_x", "sigma_y", "sigma_z", - # added by farouk + # additional cluster input features "energyError", "sigma_energy", "sigma_x_weighted", diff --git a/scripts/clic/postprocessing.py b/scripts/clic/postprocessing.py index 43c7b06b1..685d6191d 100644 --- a/scripts/clic/postprocessing.py +++ 
b/scripts/clic/postprocessing.py @@ -63,7 +63,7 @@ "sigma_x", "sigma_y", "sigma_z", - # added by farouk + # additional cluster input features "energyError", "sigma_energy", "sigma_x_weighted", @@ -343,7 +343,6 @@ def cluster_to_features(prop_data, hit_features, hit_to_cluster, iev): cl_sigma_y = [] cl_sigma_z = [] - # added by farouk cl_sigma_energy = [] cl_sigma_x_weighted, cl_sigma_y_weighted, cl_sigma_z_weighted = [], [], [] cl_energy_weighted_width = [] @@ -378,7 +377,6 @@ def cluster_to_features(prop_data, hit_features, hit_to_cluster, iev): cl_sigma_y.append(np.std(hits_posy)) cl_sigma_z.append(np.std(hits_posz)) - # added by farouk cl_sigma_energy.append(np.std(hits_energy)) cl_sigma_x_weighted.append(np.std(hits_posx * hits_energy)) cl_sigma_y_weighted.append(np.std(hits_posy * hits_energy)) @@ -393,14 +391,8 @@ def cluster_to_features(prop_data, hit_features, hit_to_cluster, iev): cl_energy_weighted_width.append(num / den) - # if i==1: - # xs += [np.array(hits_posx)] - # ys += [np.array(hits_posy)] - # zs += [np.array(hits_posz)] - # es += [np.array(hits_energy)] - # get position at shower max - # for each unique z integrate the energy of all the hits to find zmax + # at each unique "z" integrate the energy of all the hits to find zmax zmax, emax = 0, -1000 for z in np.unique(np.array(hits_posz)): msk = np.array(hits_posz) == z @@ -452,7 +444,7 @@ def cluster_to_features(prop_data, hit_features, hit_to_cluster, iev): ret["sin_phi"] = np.sin(ret["phi"]) ret["cos_phi"] = np.cos(ret["phi"]) - # added by farouk + # additional cluster input features ret["sigma_energy"] = np.array(cl_sigma_energy) ret["sigma_x_weighted"] = np.array(cl_sigma_x_weighted) ret["sigma_y_weighted"] = np.array(cl_sigma_y_weighted) From dd4c43a2b1cbae3c2f71ca8115b1f282e67e27e8 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 14:43:30 +0200 Subject: [PATCH 32/66] up standardization pipeline --- mlpf/pyg/clic_standardization.json | 1 - mlpf/pyg/mlpf.py | 26 +++------- mlpf/pyg/training.py | 47 ++++++++++++++++++- parameters/pytorch/pyg-clic-ttbar-21-std.yaml | 2 +- parameters/pytorch/pyg-clic-ttbar-21.yaml | 2 +- parameters/pytorch/pyg-clic-ttbar-22-std.yaml | 2 +- parameters/pytorch/pyg-clic-ttbar-22.yaml | 2 +- 7 files changed, 55 insertions(+), 27 deletions(-) delete mode 100644 mlpf/pyg/clic_standardization.json diff --git a/mlpf/pyg/clic_standardization.json b/mlpf/pyg/clic_standardization.json deleted file mode 100644 index 616aa0023..000000000 --- a/mlpf/pyg/clic_standardization.json +++ /dev/null @@ -1 +0,0 @@ -{"2.2.0": {"PFelement1": {"mean": [1.0, 3.313861608505249, 0.0016492522554472089, 0.0001337795110885054, 0.008735032752156258, 4.222240924835205, 52.052459716796875, 15.738365173339844, 0.0, 0.0, 43.952632904052734, 0.003598652081564069, 0.0025099683552980423, 1.8601234614834539e-06, 0.06345824152231216, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], "std": [0.0, 9.41322135925293, 0.8651331663131714, 0.707984983921051, 0.7061750888824463, 11.704730033874512, 9963.4365234375, 6.578129768371582, 0.0, 0.0, 57.3457145690918, 1.3837608098983765, 2.9924261569976807, 0.0038007558323442936, 27.722505569458008, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}, "PFelement2": {"mean": [2.0, 2.5798463821411133, -0.0016945624956861138, 0.0002545498718973249, 0.006834524683654308, 3.3009438514709473, 15.00655460357666, 0.9023095965385437, -5.961066722869873, 1.5725995302200317, 1.894817590713501, 0.9789467453956604, 1.2573817912198137e-05, 80.9168701171875, 50.47904586791992, 
50.581722259521484, 51.96408462524414, 0.6345308423042297, 0.041190944612026215, 41.63414001464844, 41.84788513183594, 54.959861755371094, 11619.2177734375, -5.938568115234375, 1578.74169921875, 0.6185510158538818], "std": [0.0, 4.814330577850342, 0.918420672416687, 0.7074259519577026, 0.7067552208900452, 5.785495281219482, 1081.1585693359375, 1084.4302978515625, 1554.4815673828125, 0.6989641189575195, 3.899383783340454, 3.1981284618377686, 0.00021599895262625068, 106.33512878417969, 71.94786834716797, 72.29435729980469, 73.6717758178711, 0.6663865447044373, 0.045639555901288986, 61.58401870727539, 62.009708404541016, 88.85784912109375, 33432.66796875, 1562.9705810546875, 7268.15087890625, 1.1162511110305786]}}, "2.1.0": {"PFelement1": {"mean": [1.0, 3.306542158126831, -0.001628185505978763, 0.00044645112939178944, 0.008498544804751873, 4.233072757720947, 31.00058364868164, 15.700974464416504, 0.0, 0.0, 43.88750457763672, -0.0028079766780138016, 0.001959568355232477, 7.4845515882771e-06, -0.07514937967061996, -1.0, 0.0], "std": [0.0, 14.216273307800293, 0.864812970161438, 0.7057437300682068, 0.708417534828186, 20.747175216674805, 1617.955078125, 6.582387924194336, 0.0, 0.0, 56.95081329345703, 1.3340226411819458, 2.9433951377868652, 0.0037820138968527317, 37.608154296875, 0.0, 0.0]}, "PFelement2": {"mean": [2.0, 2.568070650100708, -0.000730668194591999, -0.0011971204075962305, 0.004265286028385162, 3.2908971309661865, 11.28282356262207, -0.4471946060657501, -4.926597595214844, 1.571304440498352, 1.886231780052185, 0.9784500002861023, 1.2197442629258148e-05, 80.69485473632812, 50.48505783081055, 50.41227340698242, 51.77717208862305], "std": [0.0, 4.781670093536377, 0.9176656603813171, 0.707072377204895, 0.70712810754776, 5.762104511260986, 1084.093505859375, 1080.34375, 1554.9664306640625, 0.6987221240997314, 3.8536908626556396, 3.2011756896972656, 0.00021397981618065387, 105.88664245605469, 72.17912292480469, 71.81172180175781, 72.6884765625]}}} \ No newline at end of file diff --git a/mlpf/pyg/mlpf.py b/mlpf/pyg/mlpf.py index a3920288f..7f7ecb922 100644 --- a/mlpf/pyg/mlpf.py +++ b/mlpf/pyg/mlpf.py @@ -57,25 +57,15 @@ def norm_cdf(x): return tensor -def standardize_inputs(X, elemtypes_nonzero): - - if X.shape[-1] == 26: - vs = "2.2.0" - else: - vs = "2.1.0" - - import json - - with open("/pfvolcentral/clic_standardization.json", "rb") as f: - standard_dict = json.load(f)[vs] +def standardize_inputs(X, elemtypes_nonzero, standardization_dict): for i, ielem in enumerate(elemtypes_nonzero): Xfeat_normed_msked = X.clone() # get mean/std of features of that elem - mean = torch.tensor(standard_dict[f"PFelement{ielem}"]["mean"]).to(Xfeat_normed_msked.device) - std = torch.tensor(standard_dict[f"PFelement{ielem}"]["std"]).to(Xfeat_normed_msked.device) + mean = torch.tensor(standardization_dict[f"PFelement{ielem}"]["mean"]).to(Xfeat_normed_msked.device) + std = torch.tensor(standardization_dict[f"PFelement{ielem}"]["std"]).to(Xfeat_normed_msked.device) # standardize Xfeat_normed_msked[..., 1:] = (Xfeat_normed_msked[..., 1:] - mean[..., 1:]) / std[..., 1:] @@ -300,8 +290,6 @@ def __init__( dropout_conv_id_mha=0.0, dropout_conv_id_ff=0.0, use_pre_layernorm=False, - # standardize_inputs - standardize_inputs=False, ): super(MLPF, self).__init__() @@ -321,8 +309,6 @@ def __init__( self.use_pre_layernorm = use_pre_layernorm - self.standardize_inputs = standardize_inputs - if self.conv_type == "attention": embedding_dim = num_heads * head_dim width = num_heads * head_dim @@ -414,11 +400,11 @@ def 
__init__( self.final_norm_reg = torch.nn.LayerNorm(embed_dim) # @torch.compile - def forward(self, X_features, mask): + def forward(self, X_features, mask, standardization_dict=None): Xfeat_normed = X_features - if self.standardize_inputs: - Xfeat_normed = standardize_inputs(X_features, self.elemtypes_nonzero) + if standardization_dict is not None: + Xfeat_normed = standardize_inputs(X_features, self.elemtypes_nonzero, standardization_dict) embeddings_id, embeddings_reg = [], [] if self.num_convs != 0: diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index dd8a433fb..e756a7f08 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -450,6 +450,7 @@ def train_and_valid( dtype=torch.float32, tensorboard_writer=None, save_attention=False, + standardization_dict=None, ): """ Performs training over a given epoch. Will run a validation step every N_STEPS and after the last training batch. @@ -496,13 +497,13 @@ def train_and_valid( with torch.autocast(device_type=device_type, dtype=dtype, enabled=device_type == "cuda"): if is_train: - ypred_raw = model(batch.X, batch.mask) + ypred_raw = model(batch.X, batch.mask, standardization_dict) else: with torch.no_grad(): # save some attention matrices if save_attention and (rank == 0 or rank == "cpu") and itrain == 0: set_save_attention(model, outdir, True) - ypred_raw = model(batch.X, batch.mask) + ypred_raw = model(batch.X, batch.mask, standardization_dict) ypred = unpack_predictions(ypred_raw) @@ -684,6 +685,7 @@ def train_mlpf( comet_step_freq=None, val_freq=None, save_attention=False, + standardization_dict=None, ): """ Will run a full training by calling train(). @@ -747,6 +749,7 @@ def train_mlpf( lr_schedule=lr_schedule, val_freq=val_freq, dtype=dtype, + standardization_dict=standardization_dict, ) prof.export_chrome_trace("trace.json") else: @@ -767,6 +770,7 @@ def train_mlpf( val_freq=val_freq, dtype=dtype, tensorboard_writer=tensorboard_writer_train, + standardization_dict=standardization_dict, ) t_train = time.time() # epoch time excluding validation @@ -787,6 +791,7 @@ def train_mlpf( dtype=dtype, tensorboard_writer=tensorboard_writer_valid, save_attention=save_attention, + standardization_dict=standardization_dict, ) t_valid = time.time() @@ -1072,6 +1077,42 @@ def run(rank, world_size, config, args, outdir, logfile): last_epoch = -1 if start_epoch == 1 else start_epoch - 1 lr_schedule = get_lr_schedule(config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch) + def get_standardization_dict(dataset, train_loader, nsubset=10_000): + + standardization_dict = {} + + for ielem in ELEM_TYPES_NONZERO[dataset]: + standardization_dict["PFelement" + str(ielem)] = {} + + tot_events = 0 + for i, batch in enumerate(train_loader): + + tot_events += batch.X.shape[0] + + # remove the first dimension because we will stack all PFelements anyway to compute the mean/std + batch.X = batch.X.view(-1, batch.X.shape[-1]) + + msk = (batch.X[:, 0] == ielem) & (batch.X[:, 0] != 0) # skip 0 padded elements + + if i == 0: + # initialize + concatenated_pfelements = batch.X[msk] + else: + concatenated_pfelements = torch.cat([concatenated_pfelements, batch.X[msk]]) + + standardization_dict["PFelement" + str(ielem)]["mean"] = torch.mean(concatenated_pfelements, axis=0).tolist() + standardization_dict["PFelement" + str(ielem)]["std"] = torch.std(concatenated_pfelements, axis=0).tolist() + + if tot_events > nsubset: + break + + return standardization_dict + + if config["standardize_inputs"] is True: + standardization_dict = 
get_standardization_dict(config["dataset"], loaders["train"]) + else: + standardization_dict = None + train_mlpf( rank, world_size, @@ -1092,6 +1133,7 @@ def run(rank, world_size, config, args, outdir, logfile): comet_step_freq=config["comet_step_freq"], val_freq=config["val_freq"], save_attention=config["save_attention"], + standardization_dict=standardization_dict, ) checkpoint = torch.load(f"{outdir}/best_weights.pth", map_location=torch.device(rank)) @@ -1345,6 +1387,7 @@ def train_ray_trial(config, args, outdir=None): comet_step_freq=config["comet_step_freq"], dtype=getattr(torch, config["dtype"]), val_freq=config["val_freq"], + standardization_dict=None, ) diff --git a/parameters/pytorch/pyg-clic-ttbar-21-std.yaml b/parameters/pytorch/pyg-clic-ttbar-21-std.yaml index 4b3152b5a..17c3006c9 100644 --- a/parameters/pytorch/pyg-clic-ttbar-21-std.yaml +++ b/parameters/pytorch/pyg-clic-ttbar-21-std.yaml @@ -1,5 +1,6 @@ backend: pytorch +standardize_inputs: True save_attention: yes dataset: clic sort_data: no @@ -61,7 +62,6 @@ model: head_dim: 64 num_heads: 12 attention_type: math - standardize_inputs: True use_pre_layernorm: True mamba: diff --git a/parameters/pytorch/pyg-clic-ttbar-21.yaml b/parameters/pytorch/pyg-clic-ttbar-21.yaml index 38ac0a553..cd3126713 100644 --- a/parameters/pytorch/pyg-clic-ttbar-21.yaml +++ b/parameters/pytorch/pyg-clic-ttbar-21.yaml @@ -1,5 +1,6 @@ backend: pytorch +standardize_inputs: False save_attention: yes dataset: clic sort_data: no @@ -61,7 +62,6 @@ model: head_dim: 64 num_heads: 12 attention_type: math - standardize_inputs: False use_pre_layernorm: True mamba: diff --git a/parameters/pytorch/pyg-clic-ttbar-22-std.yaml b/parameters/pytorch/pyg-clic-ttbar-22-std.yaml index d061cfc07..fa3d8173c 100644 --- a/parameters/pytorch/pyg-clic-ttbar-22-std.yaml +++ b/parameters/pytorch/pyg-clic-ttbar-22-std.yaml @@ -1,5 +1,6 @@ backend: pytorch +standardize_inputs: True save_attention: yes dataset: clic sort_data: no @@ -61,7 +62,6 @@ model: head_dim: 64 num_heads: 12 attention_type: math - standardize_inputs: True use_pre_layernorm: True mamba: diff --git a/parameters/pytorch/pyg-clic-ttbar-22.yaml b/parameters/pytorch/pyg-clic-ttbar-22.yaml index 90a12c4bc..438052100 100644 --- a/parameters/pytorch/pyg-clic-ttbar-22.yaml +++ b/parameters/pytorch/pyg-clic-ttbar-22.yaml @@ -1,5 +1,6 @@ backend: pytorch +standardize_inputs: False save_attention: yes dataset: clic sort_data: no @@ -61,7 +62,6 @@ model: head_dim: 64 num_heads: 12 attention_type: math - standardize_inputs: False use_pre_layernorm: True mamba: From 80bd33058a9d64b296c7127405c6ce2365ad1005 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 14:45:07 +0200 Subject: [PATCH 33/66] better docs --- mlpf/pyg/training.py | 34 ++---------------------------- mlpf/pyg/utils.py | 50 +++++++++++++++++++++++++++++++++++++++----- 2 files changed, 47 insertions(+), 37 deletions(-) diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index e756a7f08..20d1d2c1d 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -34,6 +34,7 @@ ELEM_TYPES_NONZERO, X_FEATURES, count_parameters, + get_input_standardization, get_lr_schedule, get_model_state_dict, load_checkpoint, @@ -1077,39 +1078,8 @@ def run(rank, world_size, config, args, outdir, logfile): last_epoch = -1 if start_epoch == 1 else start_epoch - 1 lr_schedule = get_lr_schedule(config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch) - def get_standardization_dict(dataset, train_loader, nsubset=10_000): - - standardization_dict = {} - - 
for ielem in ELEM_TYPES_NONZERO[dataset]: - standardization_dict["PFelement" + str(ielem)] = {} - - tot_events = 0 - for i, batch in enumerate(train_loader): - - tot_events += batch.X.shape[0] - - # remove the first dimension because we will stack all PFelements anyway to compute the mean/std - batch.X = batch.X.view(-1, batch.X.shape[-1]) - - msk = (batch.X[:, 0] == ielem) & (batch.X[:, 0] != 0) # skip 0 padded elements - - if i == 0: - # initialize - concatenated_pfelements = batch.X[msk] - else: - concatenated_pfelements = torch.cat([concatenated_pfelements, batch.X[msk]]) - - standardization_dict["PFelement" + str(ielem)]["mean"] = torch.mean(concatenated_pfelements, axis=0).tolist() - standardization_dict["PFelement" + str(ielem)]["std"] = torch.std(concatenated_pfelements, axis=0).tolist() - - if tot_events > nsubset: - break - - return standardization_dict - if config["standardize_inputs"] is True: - standardization_dict = get_standardization_dict(config["dataset"], loaders["train"]) + standardization_dict = get_input_standardization(config["dataset"], loaders["train"]) else: standardization_dict = None diff --git a/mlpf/pyg/utils.py b/mlpf/pyg/utils.py index 6ec64c480..d55e7ec30 100644 --- a/mlpf/pyg/utils.py +++ b/mlpf/pyg/utils.py @@ -1,11 +1,11 @@ import json +import logging import pickle as pkl import pandas as pd import torch import torch.utils.data -from torch.optim.lr_scheduler import OneCycleLR, CosineAnnealingLR, ConstantLR -import logging +from torch.optim.lr_scheduler import ConstantLR, CosineAnnealingLR, OneCycleLR # https://github.com/ahlinist/cmssw/blob/1df62491f48ef964d198f574cdfcccfd17c70425/DataFormats/ParticleFlowReco/interface/PFBlockElement.h#L33 # https://github.com/cms-sw/cmssw/blob/master/DataFormats/ParticleFlowCandidate/src/PFCandidate.cc#L254 @@ -162,7 +162,9 @@ def unpack_target(y, model): # note ~ momentum = ["pt", "eta", "sin_phi", "cos_phi", "energy"] ret["momentum"] = y[..., 2:7].to(dtype=torch.float32) - ret["p4"] = torch.cat([ret["pt"].unsqueeze(-1), ret["eta"].unsqueeze(-1), ret["phi"].unsqueeze(-1), ret["energy"].unsqueeze(-1)], axis=-1) + ret["p4"] = torch.cat( + [ret["pt"].unsqueeze(-1), ret["eta"].unsqueeze(-1), ret["phi"].unsqueeze(-1), ret["energy"].unsqueeze(-1)], axis=-1 + ) ret["ispu"] = y[..., -1] @@ -280,7 +282,11 @@ def load_lr_schedule(lr_schedule, checkpoint): lr_schedule.load_state_dict(checkpoint["extra_state"]["lr_schedule_state_dict"]) return lr_schedule else: - raise KeyError("Couldn't find LR schedule state dict in checkpoint. extra_state contains: {}".format(checkpoint["extra_state"].keys())) + raise KeyError( + "Couldn't find LR schedule state dict in checkpoint. 
extra_state contains: {}".format( + checkpoint["extra_state"].keys() + ) + ) def get_lr_schedule(config, opt, epochs=None, steps_per_epoch=None, last_epoch=-1): @@ -298,7 +304,9 @@ def get_lr_schedule(config, opt, epochs=None, steps_per_epoch=None, last_epoch=- pct_start=config["lr_schedule_config"]["onecycle"]["pct_start"] or 0.3, ) elif config["lr_schedule"] == "cosinedecay": - lr_schedule = CosineAnnealingLR(opt, T_max=steps_per_epoch * epochs, last_epoch=last_batch, eta_min=config["lr"] * 0.1) + lr_schedule = CosineAnnealingLR( + opt, T_max=steps_per_epoch * epochs, last_epoch=last_batch, eta_min=config["lr"] * 0.1 + ) else: raise ValueError("Supported values for lr_schedule are 'constant', 'onecycle' and 'cosinedecay'.") return lr_schedule @@ -328,3 +336,35 @@ def count_parameters(model): ) trainable_params += params return trainable_params, nontrainable_params, table + + +def get_input_standardization(dataset, train_loader, nsubset=10_000): + + standardization_dict = {} + + for ielem in ELEM_TYPES_NONZERO[dataset]: + standardization_dict["PFelement" + str(ielem)] = {} + + tot_events = 0 + for i, batch in enumerate(train_loader): + + tot_events += batch.X.shape[0] + + # remove the first dimension because we will stack all PFelements anyway to compute the mean/std + batch.X = batch.X.view(-1, batch.X.shape[-1]) + + msk = (batch.X[:, 0] == ielem) & (batch.X[:, 0] != 0) # skip 0 padded elements + + if i == 0: + # initialize + concatenated_pfelements = batch.X[msk] + else: + concatenated_pfelements = torch.cat([concatenated_pfelements, batch.X[msk]]) + + standardization_dict["PFelement" + str(ielem)]["mean"] = torch.mean(concatenated_pfelements, axis=0).tolist() + standardization_dict["PFelement" + str(ielem)]["std"] = torch.std(concatenated_pfelements, axis=0).tolist() + + if tot_events > nsubset: + break + + return standardization_dict From c0403998fd029df4adaab0ec3ff905f10e68ea73 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 15:14:55 +0200 Subject: [PATCH 34/66] fix input dim for other datasets --- mlpf/pyg/training.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index 20d1d2c1d..8ae2fa222 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -1001,10 +1001,14 @@ def run(rank, world_size, config, args, outdir, logfile): model, optimizer = load_checkpoint(checkpoint, model, optimizer) else: # instantiate a new model in the outdir created - - input_dim = ( - len(X_FEATURES[config["dataset"]]) if config["test_dataset"]["clic_edm_ttbar_pf"]["version"] != "2.2.0" else 26 - ) + + input_dim = len(X_FEATURES[config["dataset"]]) + if config["dataset"] == "clic": + # extract the version of the dataset + for sample in config["test_dataset"]: + if config["test_dataset"][sample]["version"] == "2.2.0": + input_dim = 26 + break model_kwargs = { "input_dim": input_dim, @@ -1249,9 +1253,13 @@ def train_ray_trial(config, args, outdir=None): world_rank = ray.train.get_context().get_world_rank() world_size = ray.train.get_context().get_world_size() - input_dim = ( - len(X_FEATURES[config["dataset"]]) if config["test_dataset"]["clic_edm_ttbar_pf"]["version"] != "2.2.0" else 26 - ) + input_dim = len(X_FEATURES[config["dataset"]]) + if config["dataset"] == "clic": + # extract the version of the dataset + for sample in config["test_dataset"]: + if config["test_dataset"][sample]["version"] == "2.2.0": + input_dim = 26 + break model_kwargs = { "input_dim": input_dim, From 
bcfb2771921e86b9dbcc078f6ce3dbdfa0e32a5f Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 15:15:29 +0200 Subject: [PATCH 35/66] pca --- mlpf/pyg/mlpf.py | 8 +- mlpf/pyg/training.py | 74 ++++++------------- mlpf/pyg/utils.py | 14 +--- parameters/pytorch/pyg-clic-ttbar-21-std.yaml | 2 +- parameters/pytorch/pyg-clic-ttbar-21.yaml | 2 +- parameters/pytorch/pyg-clic-ttbar-22-std.yaml | 2 +- parameters/pytorch/pyg-clic-ttbar-22.yaml | 2 +- scripts/clic/postprocessing.py | 65 ++++------------ 8 files changed, 47 insertions(+), 122 deletions(-) diff --git a/mlpf/pyg/mlpf.py b/mlpf/pyg/mlpf.py index 7f7ecb922..33d431c09 100644 --- a/mlpf/pyg/mlpf.py +++ b/mlpf/pyg/mlpf.py @@ -122,9 +122,7 @@ def __init__( self.mha = torch.nn.MultiheadAttention(embedding_dim, num_heads, dropout=dropout_mha, batch_first=True) self.norm0 = torch.nn.LayerNorm(embedding_dim) self.norm1 = torch.nn.LayerNorm(embedding_dim) - self.seq = torch.nn.Sequential( - nn.Linear(embedding_dim, width), self.act(), nn.Linear(width, embedding_dim), self.act() - ) + self.seq = torch.nn.Sequential(nn.Linear(embedding_dim, width), self.act(), nn.Linear(width, embedding_dim), self.act()) self.dropout = torch.nn.Dropout(dropout_ff) _logger.info("using attention_type={}".format(attention_type)) # params for torch sdp_kernel @@ -465,9 +463,7 @@ def forward(self, X_features, mask, standardization_dict=None): e_real[~mask] = 0 e_real[torch.isinf(e_real)] = 0 e_real[torch.isnan(e_real)] = 0 - preds_energy = e_real + torch.nn.functional.relu( - self.nn_energy(X_features, final_embedding_reg, X_features[..., 5:6]) - ) + preds_energy = e_real + torch.nn.functional.relu(self.nn_energy(X_features, final_embedding_reg, X_features[..., 5:6])) preds_momentum = torch.cat([preds_pt, preds_eta, preds_sin_phi, preds_cos_phi, preds_energy], axis=-1) return preds_binary_particle, preds_pid, preds_momentum diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index 8ae2fa222..91dd4f3c0 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -97,9 +97,7 @@ def mlpf_loss(y, ypred, batch): # binary loss for particle / no-particle classification # loss_binary_classification = loss_obj_id(ypred["cls_binary"], (y["cls_id"] != 0).long()).reshape(y["cls_id"].shape) - loss_binary_classification = 10 * torch.nn.functional.cross_entropy( - ypred["cls_binary"], (y["cls_id"] != 0).long(), reduction="none" - ) + loss_binary_classification = 10 * torch.nn.functional.cross_entropy(ypred["cls_binary"], (y["cls_id"] != 0).long(), reduction="none") # compare the particle type, only for cases where there was a true particle loss_pid_classification = loss_obj_id(ypred["cls_id_onehot"], y["cls_id"]).reshape(y["cls_id"].shape) @@ -147,12 +145,12 @@ def mlpf_loss(y, ypred, batch): pred_met = torch.sqrt(torch.sum(pred_px, axis=-2) ** 2 + torch.sum(pred_py, axis=-2) ** 2) loss["MET"] = torch.nn.functional.huber_loss(pred_met.squeeze(dim=-1), batch.genmet).mean() - was_input_pred = torch.concat( - [torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1 - ) * batch.mask.unsqueeze(axis=-1) - was_input_true = torch.concat( - [torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long)), y["momentum"]], axis=-1 - ) * batch.mask.unsqueeze(axis=-1) + was_input_pred = torch.concat([torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1) * batch.mask.unsqueeze( + axis=-1 + ) + was_input_true = torch.concat([torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long)), y["momentum"]], axis=-1) * 
batch.mask.unsqueeze( + axis=-1 + ) # standardize Wasserstein loss std = was_input_true[batch.mask].std(axis=0) @@ -194,9 +192,7 @@ class FocalLoss(nn.Module): - y: (batch_size,) or (batch_size, d1, d2, ..., dK), K > 0. """ - def __init__( - self, alpha: Optional[Tensor] = None, gamma: float = 0.0, reduction: str = "mean", ignore_index: int = -100 - ): + def __init__(self, alpha: Optional[Tensor] = None, gamma: float = 0.0, reduction: str = "mean", ignore_index: int = -100): """Constructor. Args: alpha (Tensor, optional): Weights for each class. Defaults to None. @@ -386,30 +382,18 @@ def validation_plots(batch, ypred_raw, ygen, ypred, tensorboard_writer, epoch, o ratio = (ypred_raw[2][batch.mask][:, 1] / batch.ygen[batch.mask][:, 3])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("eta_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram( - "sphi_target", torch.clamp(batch.ygen[batch.mask][:, 4], -10, 10), global_step=epoch - ) - tensorboard_writer.add_histogram( - "sphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 2], -10, 10), global_step=epoch - ) + tensorboard_writer.add_histogram("sphi_target", torch.clamp(batch.ygen[batch.mask][:, 4], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram("sphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 2], -10, 10), global_step=epoch) ratio = (ypred_raw[2][batch.mask][:, 2] / batch.ygen[batch.mask][:, 4])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("sphi_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram( - "cphi_target", torch.clamp(batch.ygen[batch.mask][:, 5], -10, 10), global_step=epoch - ) - tensorboard_writer.add_histogram( - "cphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 3], -10, 10), global_step=epoch - ) + tensorboard_writer.add_histogram("cphi_target", torch.clamp(batch.ygen[batch.mask][:, 5], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram("cphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 3], -10, 10), global_step=epoch) ratio = (ypred_raw[2][batch.mask][:, 3] / batch.ygen[batch.mask][:, 5])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("cphi_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram( - "energy_target", torch.clamp(batch.ygen[batch.mask][:, 6], -10, 10), global_step=epoch - ) - tensorboard_writer.add_histogram( - "energy_pred", torch.clamp(ypred_raw[2][batch.mask][:, 4], -10, 10), global_step=epoch - ) + tensorboard_writer.add_histogram("energy_target", torch.clamp(batch.ygen[batch.mask][:, 6], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram("energy_pred", torch.clamp(ypred_raw[2][batch.mask][:, 4], -10, 10), global_step=epoch) ratio = (ypred_raw[2][batch.mask][:, 4] / batch.ygen[batch.mask][:, 6])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("energy_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) @@ -473,9 +457,7 @@ def train_and_valid( if (world_size > 1) and (rank != 0): iterator = enumerate(data_loader) else: - iterator = tqdm.tqdm( - enumerate(data_loader), total=len(data_loader), desc=f"Epoch {epoch} {train_or_valid} loop on rank={rank}" - ) + iterator = tqdm.tqdm(enumerate(data_loader), total=len(data_loader), desc=f"Epoch {epoch} {train_or_valid} loop on rank={rank}") device_type = "cuda" if isinstance(rank, int) else "cpu" @@ -733,9 +715,7 @@ def train_mlpf( # training step, edit here to profile a specific epoch if epoch == -1: - with profile( - 
activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, with_stack=True - ) as prof: + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, with_stack=True) as prof: with record_function("model_train"): losses_t = train_and_valid( rank, @@ -1001,7 +981,7 @@ def run(rank, world_size, config, args, outdir, logfile): model, optimizer = load_checkpoint(checkpoint, model, optimizer) else: # instantiate a new model in the outdir created - + input_dim = len(X_FEATURES[config["dataset"]]) if config["dataset"] == "clic": # extract the version of the dataset @@ -1049,9 +1029,7 @@ def run(rank, world_size, config, args, outdir, logfile): _logger.info(f"Model directory {outdir}", color="bold") if args.comet: - comet_experiment = create_comet_experiment( - config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir - ) + comet_experiment = create_comet_experiment(config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir) comet_experiment.set_name(f"rank_{rank}_{Path(outdir).name}") comet_experiment.log_parameter("run_id", Path(outdir).name) comet_experiment.log_parameter("world_size", world_size) @@ -1304,9 +1282,7 @@ def train_ray_trial(config, args, outdir=None): loaders = get_interleaved_dataloaders(world_size, rank, config, use_cuda, use_ray=True) if args.comet: - comet_experiment = create_comet_experiment( - config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir - ) + comet_experiment = create_comet_experiment(config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir) comet_experiment.set_name(f"world_rank_{world_rank}_{Path(outdir).name}") comet_experiment.log_parameter("run_id", Path(outdir).name) comet_experiment.log_parameter("world_size", world_size) @@ -1340,9 +1316,7 @@ def train_ray_trial(config, args, outdir=None): if args.resume_training: model, optimizer = load_checkpoint(checkpoint, model, optimizer) start_epoch = checkpoint["extra_state"]["epoch"] + 1 - lr_schedule = get_lr_schedule( - config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch=start_epoch - 1 - ) + lr_schedule = get_lr_schedule(config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch=start_epoch - 1) else: # start a new training with model weights loaded from a pre-trained model model = load_checkpoint(checkpoint, model) @@ -1506,9 +1480,7 @@ def run_hpo(config, args): if tune.Tuner.can_restore(str(expdir)): # resume unfinished HPO run - tuner = tune.Tuner.restore( - str(expdir), trainable=trainer, resume_errored=True, restart_errored=False, resume_unfinished=True - ) + tuner = tune.Tuner.restore(str(expdir), trainable=trainer, resume_errored=True, restart_errored=False, resume_unfinished=True) else: # start new HPO run search_space = {"train_loop_config": search_space} # the ray TorchTrainer only takes a single arg: train_loop_config @@ -1549,6 +1521,4 @@ def run_hpo(config, args): print(result_df.columns) logging.info("Total time of Tuner.fit(): {}".format(end - start)) - logging.info( - "Best hyperparameters found according to {} were: {}".format(config["raytune"]["default_metric"], best_config) - ) + logging.info("Best hyperparameters found according to {} were: {}".format(config["raytune"]["default_metric"], best_config)) diff --git a/mlpf/pyg/utils.py b/mlpf/pyg/utils.py index d55e7ec30..3ab3509d7 100644 --- a/mlpf/pyg/utils.py +++ b/mlpf/pyg/utils.py @@ -162,9 +162,7 @@ def unpack_target(y, model): # note ~ momentum = ["pt", "eta", "sin_phi", "cos_phi", "energy"] 
ret["momentum"] = y[..., 2:7].to(dtype=torch.float32) - ret["p4"] = torch.cat( - [ret["pt"].unsqueeze(-1), ret["eta"].unsqueeze(-1), ret["phi"].unsqueeze(-1), ret["energy"].unsqueeze(-1)], axis=-1 - ) + ret["p4"] = torch.cat([ret["pt"].unsqueeze(-1), ret["eta"].unsqueeze(-1), ret["phi"].unsqueeze(-1), ret["energy"].unsqueeze(-1)], axis=-1) ret["ispu"] = y[..., -1] @@ -282,11 +280,7 @@ def load_lr_schedule(lr_schedule, checkpoint): lr_schedule.load_state_dict(checkpoint["extra_state"]["lr_schedule_state_dict"]) return lr_schedule else: - raise KeyError( - "Couldn't find LR schedule state dict in checkpoint. extra_state contains: {}".format( - checkpoint["extra_state"].keys() - ) - ) + raise KeyError("Couldn't find LR schedule state dict in checkpoint. extra_state contains: {}".format(checkpoint["extra_state"].keys())) def get_lr_schedule(config, opt, epochs=None, steps_per_epoch=None, last_epoch=-1): @@ -304,9 +298,7 @@ def get_lr_schedule(config, opt, epochs=None, steps_per_epoch=None, last_epoch=- pct_start=config["lr_schedule_config"]["onecycle"]["pct_start"] or 0.3, ) elif config["lr_schedule"] == "cosinedecay": - lr_schedule = CosineAnnealingLR( - opt, T_max=steps_per_epoch * epochs, last_epoch=last_batch, eta_min=config["lr"] * 0.1 - ) + lr_schedule = CosineAnnealingLR(opt, T_max=steps_per_epoch * epochs, last_epoch=last_batch, eta_min=config["lr"] * 0.1) else: raise ValueError("Supported values for lr_schedule are 'constant', 'onecycle' and 'cosinedecay'.") return lr_schedule diff --git a/parameters/pytorch/pyg-clic-ttbar-21-std.yaml b/parameters/pytorch/pyg-clic-ttbar-21-std.yaml index 17c3006c9..ae6b5a01e 100644 --- a/parameters/pytorch/pyg-clic-ttbar-21-std.yaml +++ b/parameters/pytorch/pyg-clic-ttbar-21-std.yaml @@ -120,4 +120,4 @@ valid_dataset: test_dataset: clic_edm_ttbar_pf: - version: 2.1.0 \ No newline at end of file + version: 2.1.0 diff --git a/parameters/pytorch/pyg-clic-ttbar-21.yaml b/parameters/pytorch/pyg-clic-ttbar-21.yaml index cd3126713..6aea54096 100644 --- a/parameters/pytorch/pyg-clic-ttbar-21.yaml +++ b/parameters/pytorch/pyg-clic-ttbar-21.yaml @@ -120,4 +120,4 @@ valid_dataset: test_dataset: clic_edm_ttbar_pf: - version: 2.1.0 \ No newline at end of file + version: 2.1.0 diff --git a/parameters/pytorch/pyg-clic-ttbar-22-std.yaml b/parameters/pytorch/pyg-clic-ttbar-22-std.yaml index fa3d8173c..39e3e8247 100644 --- a/parameters/pytorch/pyg-clic-ttbar-22-std.yaml +++ b/parameters/pytorch/pyg-clic-ttbar-22-std.yaml @@ -120,4 +120,4 @@ valid_dataset: test_dataset: clic_edm_ttbar_pf: - version: 2.2.0 \ No newline at end of file + version: 2.2.0 diff --git a/parameters/pytorch/pyg-clic-ttbar-22.yaml b/parameters/pytorch/pyg-clic-ttbar-22.yaml index 438052100..1512a6b3e 100644 --- a/parameters/pytorch/pyg-clic-ttbar-22.yaml +++ b/parameters/pytorch/pyg-clic-ttbar-22.yaml @@ -120,4 +120,4 @@ valid_dataset: test_dataset: clic_edm_ttbar_pf: - version: 2.2.0 \ No newline at end of file + version: 2.2.0 diff --git a/scripts/clic/postprocessing.py b/scripts/clic/postprocessing.py index 685d6191d..e05b2ae9c 100644 --- a/scripts/clic/postprocessing.py +++ b/scripts/clic/postprocessing.py @@ -149,9 +149,7 @@ def __init__( self.cluster_features = cluster_features # feature matrix of the calo clusters self.track_features = track_features # feature matrix of the tracks self.genparticle_to_hit = genparticle_to_hit # sparse COO matrix of genparticles to hits (idx_gp, idx_hit, weight) - self.genparticle_to_track = ( - genparticle_to_track # sparse COO matrix of genparticles to 
tracks (idx_gp, idx_track, weight) - ) + self.genparticle_to_track = genparticle_to_track # sparse COO matrix of genparticles to tracks (idx_gp, idx_track, weight) self.hit_to_cluster = hit_to_cluster # sparse COO matrix of hits to clusters (idx_hit, idx_cluster, weight) self.gp_merges = gp_merges # sparse COO matrix of any merged genparticles @@ -217,10 +215,7 @@ def get_calohit_matrix_and_genadj(hit_data, calohit_links, iev, collectionIDs): hit_idx_global += 1 hit_idx_local_to_global = {v: k for k, v in hit_idx_global_to_local.items()} hit_feature_matrix = awkward.Record( - { - k: awkward.concatenate([hit_feature_matrix[i][k] for i in range(len(hit_feature_matrix))]) - for k in hit_feature_matrix[0].fields - } + {k: awkward.concatenate([hit_feature_matrix[i][k] for i in range(len(hit_feature_matrix))]) for k in hit_feature_matrix[0].fields} ) # add all edges from genparticle to calohit @@ -286,9 +281,7 @@ def gen_to_features(prop_data, iev): gen_arr = {k.replace(mc_coll + ".", ""): gen_arr[k] for k in gen_arr.fields} MCParticles_p4 = vector.awk( - awkward.zip( - {"mass": gen_arr["mass"], "x": gen_arr["momentum.x"], "y": gen_arr["momentum.y"], "z": gen_arr["momentum.z"]} - ) + awkward.zip({"mass": gen_arr["mass"], "x": gen_arr["momentum.x"], "y": gen_arr["momentum.y"], "z": gen_arr["momentum.z"]}) ) gen_arr["pt"] = MCParticles_p4.pt gen_arr["eta"] = MCParticles_p4.eta @@ -407,12 +400,8 @@ def cluster_to_features(prop_data, hit_features, hit_to_cluster, iev): # get width at shower max msk = np.array(hits_posz) == zmax # select the hits at zmax - x_bar = np.sum(np.array(hits_posx)[msk] * np.array(hits_energy)[msk]) / np.sum( - np.array(hits_energy)[msk] - ) # energy weighted average - y_bar = np.sum(np.array(hits_posy)[msk] * np.array(hits_energy)[msk]) / np.sum( - np.array(hits_energy)[msk] - ) # energy weighted average + x_bar = np.sum(np.array(hits_posx)[msk] * np.array(hits_energy)[msk]) / np.sum(np.array(hits_energy)[msk]) # energy weighted average + y_bar = np.sum(np.array(hits_posy)[msk] * np.array(hits_energy)[msk]) / np.sum(np.array(hits_energy)[msk]) # energy weighted average num = (np.sum(np.array(hits_energy)[msk] * (np.array(hits_posx)[msk] - x_bar) ** 2)) + ( np.sum(np.array(hits_energy)[msk] * (np.array(hits_posy)[msk] - y_bar) ** 2) @@ -506,9 +495,7 @@ def filter_adj(adj, all_to_filtered): def get_genparticles_and_adjacencies(prop_data, hit_data, calohit_links, sitrack_links, iev, collectionIDs): gen_features = gen_to_features(prop_data, iev) - hit_features, genparticle_to_hit, hit_idx_local_to_global = get_calohit_matrix_and_genadj( - hit_data, calohit_links, iev, collectionIDs - ) + hit_features, genparticle_to_hit, hit_idx_local_to_global = get_calohit_matrix_and_genadj(hit_data, calohit_links, iev, collectionIDs) hit_to_cluster = hit_cluster_adj(prop_data, hit_idx_local_to_global, iev) cluster_features = cluster_to_features(prop_data, hit_features, hit_to_cluster, iev) track_features = track_to_features(prop_data, iev) @@ -521,9 +508,7 @@ def get_genparticles_and_adjacencies(prop_data, hit_data, calohit_links, sitrack if len(genparticle_to_track[0]) > 0: gp_to_track = ( - coo_matrix((genparticle_to_track[2], (genparticle_to_track[0], genparticle_to_track[1])), shape=(n_gp, n_track)) - .max(axis=1) - .todense() + coo_matrix((genparticle_to_track[2], (genparticle_to_track[0], genparticle_to_track[1])), shape=(n_gp, n_track)).max(axis=1).todense() ) else: gp_to_track = np.zeros((n_gp, 1)) @@ -576,12 +561,8 @@ def assign_genparticles_to_obj_and_merge(gpdata): ).todense() ) - 
gp_to_calohit = coo_matrix( - (gpdata.genparticle_to_hit[2], (gpdata.genparticle_to_hit[0], gpdata.genparticle_to_hit[1])), shape=(n_gp, n_hit) - ) - calohit_to_cluster = coo_matrix( - (gpdata.hit_to_cluster[2], (gpdata.hit_to_cluster[0], gpdata.hit_to_cluster[1])), shape=(n_hit, n_cluster) - ) + gp_to_calohit = coo_matrix((gpdata.genparticle_to_hit[2], (gpdata.genparticle_to_hit[0], gpdata.genparticle_to_hit[1])), shape=(n_gp, n_hit)) + calohit_to_cluster = coo_matrix((gpdata.hit_to_cluster[2], (gpdata.hit_to_cluster[0], gpdata.hit_to_cluster[1])), shape=(n_hit, n_cluster)) gp_to_cluster = np.array((gp_to_calohit * calohit_to_cluster).todense()) @@ -746,9 +727,7 @@ def get_reco_properties(prop_data, iev): reco_arr = {k.replace("MergedRecoParticles.", ""): reco_arr[k] for k in reco_arr.fields} reco_p4 = vector.awk( - awkward.zip( - {"mass": reco_arr["mass"], "x": reco_arr["momentum.x"], "y": reco_arr["momentum.y"], "z": reco_arr["momentum.z"]} - ) + awkward.zip({"mass": reco_arr["mass"], "x": reco_arr["momentum.x"], "y": reco_arr["momentum.y"], "z": reco_arr["momentum.z"]}) ) reco_arr["pt"] = reco_p4.pt reco_arr["eta"] = reco_p4.eta @@ -970,29 +949,19 @@ def process_one_file(fn, ofn): assert np.all(used_rps == 1) gps_track = get_particle_feature_matrix(track_to_gp_all, gpdata_cleaned.gen_features, particle_feature_order) - gps_track[:, 0] = np.array( - [map_neutral_to_charged(map_pdgid_to_candid(p, c)) for p, c in zip(gps_track[:, 0], gps_track[:, 1])] - ) + gps_track[:, 0] = np.array([map_neutral_to_charged(map_pdgid_to_candid(p, c)) for p, c in zip(gps_track[:, 0], gps_track[:, 1])]) gps_cluster = get_particle_feature_matrix(cluster_to_gp_all, gpdata_cleaned.gen_features, particle_feature_order) - gps_cluster[:, 0] = np.array( - [map_charged_to_neutral(map_pdgid_to_candid(p, c)) for p, c in zip(gps_cluster[:, 0], gps_cluster[:, 1])] - ) + gps_cluster[:, 0] = np.array([map_charged_to_neutral(map_pdgid_to_candid(p, c)) for p, c in zip(gps_cluster[:, 0], gps_cluster[:, 1])]) gps_cluster[:, 1] = 0 rps_track = get_particle_feature_matrix(track_to_rp_all, reco_features, particle_feature_order) - rps_track[:, 0] = np.array( - [map_neutral_to_charged(map_pdgid_to_candid(p, c)) for p, c in zip(rps_track[:, 0], rps_track[:, 1])] - ) + rps_track[:, 0] = np.array([map_neutral_to_charged(map_pdgid_to_candid(p, c)) for p, c in zip(rps_track[:, 0], rps_track[:, 1])]) rps_cluster = get_particle_feature_matrix(cluster_to_rp_all, reco_features, particle_feature_order) - rps_cluster[:, 0] = np.array( - [map_charged_to_neutral(map_pdgid_to_candid(p, c)) for p, c in zip(rps_cluster[:, 0], rps_cluster[:, 1])] - ) + rps_cluster[:, 0] = np.array([map_charged_to_neutral(map_pdgid_to_candid(p, c)) for p, c in zip(rps_cluster[:, 0], rps_cluster[:, 1])]) rps_cluster[:, 1] = 0 # all initial gen/reco particle energy must be reconstructable - assert ( - abs(np.sum(gps_track[:, 6]) + np.sum(gps_cluster[:, 6]) - np.sum(gpdata_cleaned.gen_features["energy"])) < 1e-2 - ) + assert abs(np.sum(gps_track[:, 6]) + np.sum(gps_cluster[:, 6]) - np.sum(gpdata_cleaned.gen_features["energy"])) < 1e-2 assert abs(np.sum(rps_track[:, 6]) + np.sum(rps_cluster[:, 6]) - np.sum(reco_features["energy"])) < 1e-2 @@ -1037,9 +1006,7 @@ def parse_args(): import argparse parser = argparse.ArgumentParser() - parser.add_argument( - "--input", type=str, help="Input ROOT file - else if dir then will process all files inside", required=True - ) + parser.add_argument("--input", type=str, help="Input ROOT file - else if dir then will process all files 
inside", required=True) parser.add_argument("--outpath", type=str, default="raw", help="output path") args = parser.parse_args() return args From b2e7c2e9222efefaafc780b10d7004826fa9609e Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 15:29:26 +0200 Subject: [PATCH 36/66] add standardize_inputs: False to all configs --- parameters/pytorch/pyg-cld.yaml | 1 + parameters/pytorch/pyg-clic-hits.yaml | 1 + parameters/pytorch/pyg-clic.yaml | 1 + parameters/pytorch/pyg-cms-finetune.yaml | 1 + parameters/pytorch/pyg-cms-ttbar-nopu.yaml | 1 + parameters/pytorch/pyg-cms.yaml | 1 + 6 files changed, 6 insertions(+) diff --git a/parameters/pytorch/pyg-cld.yaml b/parameters/pytorch/pyg-cld.yaml index 204689385..f3ae2e957 100644 --- a/parameters/pytorch/pyg-cld.yaml +++ b/parameters/pytorch/pyg-cld.yaml @@ -1,5 +1,6 @@ backend: pytorch +standardize_inputs: False dataset: cld sort_data: no data_dir: diff --git a/parameters/pytorch/pyg-clic-hits.yaml b/parameters/pytorch/pyg-clic-hits.yaml index 62b470931..4a8b5e3b1 100644 --- a/parameters/pytorch/pyg-clic-hits.yaml +++ b/parameters/pytorch/pyg-clic-hits.yaml @@ -1,5 +1,6 @@ backend: pytorch +standardize_inputs: False dataset: clic data_dir: gpus: 1 diff --git a/parameters/pytorch/pyg-clic.yaml b/parameters/pytorch/pyg-clic.yaml index a51540683..e4dcdde1e 100644 --- a/parameters/pytorch/pyg-clic.yaml +++ b/parameters/pytorch/pyg-clic.yaml @@ -1,5 +1,6 @@ backend: pytorch +standardize_inputs: False save_attention: yes dataset: clic sort_data: no diff --git a/parameters/pytorch/pyg-cms-finetune.yaml b/parameters/pytorch/pyg-cms-finetune.yaml index b70d3df4a..2c362ea39 100644 --- a/parameters/pytorch/pyg-cms-finetune.yaml +++ b/parameters/pytorch/pyg-cms-finetune.yaml @@ -1,5 +1,6 @@ backend: pytorch +standardize_inputs: False dataset: cms sort_data: yes data_dir: diff --git a/parameters/pytorch/pyg-cms-ttbar-nopu.yaml b/parameters/pytorch/pyg-cms-ttbar-nopu.yaml index cfacab525..2e1ac6e94 100644 --- a/parameters/pytorch/pyg-cms-ttbar-nopu.yaml +++ b/parameters/pytorch/pyg-cms-ttbar-nopu.yaml @@ -1,5 +1,6 @@ backend: pytorch +standardize_inputs: False dataset: cms sort_data: yes data_dir: diff --git a/parameters/pytorch/pyg-cms.yaml b/parameters/pytorch/pyg-cms.yaml index 7d5f7e4a1..76770c1cb 100644 --- a/parameters/pytorch/pyg-cms.yaml +++ b/parameters/pytorch/pyg-cms.yaml @@ -1,5 +1,6 @@ backend: pytorch +standardize_inputs: False save_attention: no dataset: cms sort_data: yes From 0b901589dd57813858e201e7fc1e2cdabc6fa1f7 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 15:45:29 +0200 Subject: [PATCH 37/66] up --- parameters/pytorch/pyg-clic-std.yaml | 129 +++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 parameters/pytorch/pyg-clic-std.yaml diff --git a/parameters/pytorch/pyg-clic-std.yaml b/parameters/pytorch/pyg-clic-std.yaml new file mode 100644 index 000000000..287069d31 --- /dev/null +++ b/parameters/pytorch/pyg-clic-std.yaml @@ -0,0 +1,129 @@ +backend: pytorch + +standardize_inputs: True +save_attention: yes +dataset: clic +sort_data: no +data_dir: +gpus: 1 +gpu_batch_multiplier: 1 +load: +num_epochs: 100 +patience: 20 +lr: 0.0001 +lr_schedule: cosinedecay # constant, cosinedecay, onecycle +conv_type: attention # gnn_lsh, attention, mamba, flashattention +ntrain: +ntest: +nvalid: +num_workers: 0 +prefetch_factor: +checkpoint_freq: +comet_name: particleflow-pt +comet_offline: False +comet_step_freq: 100 +dtype: float32 +val_freq: # run an extra validation run every val_freq training steps + 
+model: + trainable: all + learned_representation_mode: last #last, concat + input_encoding: split #split, joint + pt_mode: direct-elemtype-split + eta_mode: linear + sin_phi_mode: linear + cos_phi_mode: linear + energy_mode: direct-elemtype-split + + gnn_lsh: + conv_type: gnn_lsh + embedding_dim: 512 + width: 512 + num_convs: 8 + activation: "elu" + # gnn-lsh specific parameters + bin_size: 32 + max_num_bins: 200 + distance_dim: 128 + layernorm: True + num_node_messages: 2 + ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 2 + + attention: + conv_type: attention + num_convs: 4 + dropout_ff: 0.0 + dropout_conv_id_mha: 0.0 + dropout_conv_id_ff: 0.0 + dropout_conv_reg_mha: 0.0 + dropout_conv_reg_ff: 0.0 + activation: "gelu" + head_dim: 32 + num_heads: 32 + attention_type: math + use_pre_layernorm: True + + mamba: + conv_type: mamba + embedding_dim: 128 + width: 128 + num_convs: 2 + dropout: 0.0 + activation: "elu" + # transformer specific paramters + num_heads: 2 + # mamba specific paramters + d_state: 16 + d_conv: 4 + expand: 2 + +lr_schedule_config: + onecycle: + pct_start: 0.3 + +raytune: + local_dir: # Note: please specify an absolute path + sched: # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + clic_edm_qq_pf: + version: 2.1.0 + +valid_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + clic_edm_qq_pf: + version: 2.1.0 + +test_dataset: + clic_edm_ttbar_pf: + version: 2.1.0 + clic_edm_qq_pf: + version: 2.1.0 From a253a4a6129bc91222e31120ffedacb3c75949c1 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 15:49:38 +0200 Subject: [PATCH 38/66] debug --- parameters/pytorch/pyg-clic-std.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/parameters/pytorch/pyg-clic-std.yaml b/parameters/pytorch/pyg-clic-std.yaml index 287069d31..f59d258f2 100644 --- a/parameters/pytorch/pyg-clic-std.yaml +++ b/parameters/pytorch/pyg-clic-std.yaml @@ -109,8 +109,6 @@ train_dataset: samples: clic_edm_ttbar_pf: version: 2.1.0 - clic_edm_qq_pf: - version: 2.1.0 valid_dataset: clic: From bb92cb52946dc2408b983c0bab700e898a2d9223 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 15:50:54 +0200 Subject: [PATCH 39/66] up --- parameters/pytorch/pyg-clic-std.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parameters/pytorch/pyg-clic-std.yaml b/parameters/pytorch/pyg-clic-std.yaml index f59d258f2..2f08386cf 100644 --- a/parameters/pytorch/pyg-clic-std.yaml +++ b/parameters/pytorch/pyg-clic-std.yaml @@ -107,7 +107,7 @@ train_dataset: physical: batch_size: 1 samples: - clic_edm_ttbar_pf: + clic_edm_qq_pf: version: 2.1.0 valid_dataset: From 8e945594d024a919b3f94f7ca9cd0e77c9903e1b Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 15:52:19 +0200 Subject: [PATCH 40/66] revert --- parameters/pytorch/pyg-clic-std.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/parameters/pytorch/pyg-clic-std.yaml b/parameters/pytorch/pyg-clic-std.yaml index 2f08386cf..287069d31 100644 --- a/parameters/pytorch/pyg-clic-std.yaml +++ b/parameters/pytorch/pyg-clic-std.yaml @@ -107,6 +107,8 @@ train_dataset: physical: batch_size: 1 
samples: + clic_edm_ttbar_pf: + version: 2.1.0 clic_edm_qq_pf: version: 2.1.0 From a8522567ee8968d055dfa2bcc26cf91ec4c59190 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:01:19 +0200 Subject: [PATCH 41/66] debug --- mlpf/pyg/utils.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/mlpf/pyg/utils.py b/mlpf/pyg/utils.py index 3ab3509d7..16d31896e 100644 --- a/mlpf/pyg/utils.py +++ b/mlpf/pyg/utils.py @@ -162,7 +162,9 @@ def unpack_target(y, model): # note ~ momentum = ["pt", "eta", "sin_phi", "cos_phi", "energy"] ret["momentum"] = y[..., 2:7].to(dtype=torch.float32) - ret["p4"] = torch.cat([ret["pt"].unsqueeze(-1), ret["eta"].unsqueeze(-1), ret["phi"].unsqueeze(-1), ret["energy"].unsqueeze(-1)], axis=-1) + ret["p4"] = torch.cat( + [ret["pt"].unsqueeze(-1), ret["eta"].unsqueeze(-1), ret["phi"].unsqueeze(-1), ret["energy"].unsqueeze(-1)], axis=-1 + ) ret["ispu"] = y[..., -1] @@ -280,7 +282,11 @@ def load_lr_schedule(lr_schedule, checkpoint): lr_schedule.load_state_dict(checkpoint["extra_state"]["lr_schedule_state_dict"]) return lr_schedule else: - raise KeyError("Couldn't find LR schedule state dict in checkpoint. extra_state contains: {}".format(checkpoint["extra_state"].keys())) + raise KeyError( + "Couldn't find LR schedule state dict in checkpoint. extra_state contains: {}".format( + checkpoint["extra_state"].keys() + ) + ) def get_lr_schedule(config, opt, epochs=None, steps_per_epoch=None, last_epoch=-1): @@ -298,7 +304,9 @@ def get_lr_schedule(config, opt, epochs=None, steps_per_epoch=None, last_epoch=- pct_start=config["lr_schedule_config"]["onecycle"]["pct_start"] or 0.3, ) elif config["lr_schedule"] == "cosinedecay": - lr_schedule = CosineAnnealingLR(opt, T_max=steps_per_epoch * epochs, last_epoch=last_batch, eta_min=config["lr"] * 0.1) + lr_schedule = CosineAnnealingLR( + opt, T_max=steps_per_epoch * epochs, last_epoch=last_batch, eta_min=config["lr"] * 0.1 + ) else: raise ValueError("Supported values for lr_schedule are 'constant', 'onecycle' and 'cosinedecay'.") return lr_schedule @@ -336,6 +344,7 @@ def get_input_standardization(dataset, train_loader, nsubset=10_000): for ielem in ELEM_TYPES_NONZERO[dataset]: standardization_dict["PFelement" + str(ielem)] = {} + print(standardization_dict.keys()) tot_events = 0 for i, batch in enumerate(train_loader): From 46415578ef54f50e83ee025630fc2b5057e11d18 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:02:13 +0200 Subject: [PATCH 42/66] up --- mlpf/pyg/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mlpf/pyg/utils.py b/mlpf/pyg/utils.py index 16d31896e..648a2d1ca 100644 --- a/mlpf/pyg/utils.py +++ b/mlpf/pyg/utils.py @@ -342,6 +342,9 @@ def get_input_standardization(dataset, train_loader, nsubset=10_000): standardization_dict = {} + print("dataset", dataset) + print("ELEM_TYPES_NONZERO[dataset]", ELEM_TYPES_NONZERO[dataset]) + for ielem in ELEM_TYPES_NONZERO[dataset]: standardization_dict["PFelement" + str(ielem)] = {} print(standardization_dict.keys()) From 4d56a63c0f0b0659613c844ef6d27d2281adec9c Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:03:29 +0200 Subject: [PATCH 43/66] oops --- mlpf/pyg/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlpf/pyg/utils.py b/mlpf/pyg/utils.py index 648a2d1ca..18e02ae57 100644 --- a/mlpf/pyg/utils.py +++ b/mlpf/pyg/utils.py @@ -365,10 +365,10 @@ def get_input_standardization(dataset, train_loader, nsubset=10_000): else: concatenated_pfelements = 
torch.cat([concatenated_pfelements, batch.X[msk]]) + if tot_events > nsubset: + break + standardization_dict["PFelement" + str(ielem)]["mean"] = torch.mean(concatenated_pfelements, axis=0).tolist() standardization_dict["PFelement" + str(ielem)]["std"] = torch.std(concatenated_pfelements, axis=0).tolist() - if tot_events > nsubset: - break - return standardization_dict From 8f190b77d0ab7533ca82ab0cde4f16eff083a9de Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:04:22 +0200 Subject: [PATCH 44/66] fixed --- mlpf/pyg/utils.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mlpf/pyg/utils.py b/mlpf/pyg/utils.py index 18e02ae57..2e7f763b6 100644 --- a/mlpf/pyg/utils.py +++ b/mlpf/pyg/utils.py @@ -342,12 +342,8 @@ def get_input_standardization(dataset, train_loader, nsubset=10_000): standardization_dict = {} - print("dataset", dataset) - print("ELEM_TYPES_NONZERO[dataset]", ELEM_TYPES_NONZERO[dataset]) - for ielem in ELEM_TYPES_NONZERO[dataset]: standardization_dict["PFelement" + str(ielem)] = {} - print(standardization_dict.keys()) tot_events = 0 for i, batch in enumerate(train_loader): From 57e2924cb429cd4406fa2bd4f6a5958306d9c400 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:12:21 +0200 Subject: [PATCH 45/66] up --- mlpf/pyg/utils.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/mlpf/pyg/utils.py b/mlpf/pyg/utils.py index 2e7f763b6..a58869439 100644 --- a/mlpf/pyg/utils.py +++ b/mlpf/pyg/utils.py @@ -162,9 +162,7 @@ def unpack_target(y, model): # note ~ momentum = ["pt", "eta", "sin_phi", "cos_phi", "energy"] ret["momentum"] = y[..., 2:7].to(dtype=torch.float32) - ret["p4"] = torch.cat( - [ret["pt"].unsqueeze(-1), ret["eta"].unsqueeze(-1), ret["phi"].unsqueeze(-1), ret["energy"].unsqueeze(-1)], axis=-1 - ) + ret["p4"] = torch.cat([ret["pt"].unsqueeze(-1), ret["eta"].unsqueeze(-1), ret["phi"].unsqueeze(-1), ret["energy"].unsqueeze(-1)], axis=-1) ret["ispu"] = y[..., -1] @@ -282,11 +280,7 @@ def load_lr_schedule(lr_schedule, checkpoint): lr_schedule.load_state_dict(checkpoint["extra_state"]["lr_schedule_state_dict"]) return lr_schedule else: - raise KeyError( - "Couldn't find LR schedule state dict in checkpoint. extra_state contains: {}".format( - checkpoint["extra_state"].keys() - ) - ) + raise KeyError("Couldn't find LR schedule state dict in checkpoint. 
extra_state contains: {}".format(checkpoint["extra_state"].keys())) def get_lr_schedule(config, opt, epochs=None, steps_per_epoch=None, last_epoch=-1): @@ -304,9 +298,7 @@ def get_lr_schedule(config, opt, epochs=None, steps_per_epoch=None, last_epoch=- pct_start=config["lr_schedule_config"]["onecycle"]["pct_start"] or 0.3, ) elif config["lr_schedule"] == "cosinedecay": - lr_schedule = CosineAnnealingLR( - opt, T_max=steps_per_epoch * epochs, last_epoch=last_batch, eta_min=config["lr"] * 0.1 - ) + lr_schedule = CosineAnnealingLR(opt, T_max=steps_per_epoch * epochs, last_epoch=last_batch, eta_min=config["lr"] * 0.1) else: raise ValueError("Supported values for lr_schedule are 'constant', 'onecycle' and 'cosinedecay'.") return lr_schedule From cad00fa63f774472e44b5390ab4fe84f1a025bc6 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:25:48 +0200 Subject: [PATCH 46/66] logging --- mlpf/pyg/training.py | 86 +++++++++++++++++++++++++++----------------- 1 file changed, 53 insertions(+), 33 deletions(-) diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index 91dd4f3c0..2d8b43d67 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -97,7 +97,9 @@ def mlpf_loss(y, ypred, batch): # binary loss for particle / no-particle classification # loss_binary_classification = loss_obj_id(ypred["cls_binary"], (y["cls_id"] != 0).long()).reshape(y["cls_id"].shape) - loss_binary_classification = 10 * torch.nn.functional.cross_entropy(ypred["cls_binary"], (y["cls_id"] != 0).long(), reduction="none") + loss_binary_classification = 10 * torch.nn.functional.cross_entropy( + ypred["cls_binary"], (y["cls_id"] != 0).long(), reduction="none" + ) # compare the particle type, only for cases where there was a true particle loss_pid_classification = loss_obj_id(ypred["cls_id_onehot"], y["cls_id"]).reshape(y["cls_id"].shape) @@ -145,12 +147,12 @@ def mlpf_loss(y, ypred, batch): pred_met = torch.sqrt(torch.sum(pred_px, axis=-2) ** 2 + torch.sum(pred_py, axis=-2) ** 2) loss["MET"] = torch.nn.functional.huber_loss(pred_met.squeeze(dim=-1), batch.genmet).mean() - was_input_pred = torch.concat([torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1) * batch.mask.unsqueeze( - axis=-1 - ) - was_input_true = torch.concat([torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long)), y["momentum"]], axis=-1) * batch.mask.unsqueeze( - axis=-1 - ) + was_input_pred = torch.concat( + [torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1 + ) * batch.mask.unsqueeze(axis=-1) + was_input_true = torch.concat( + [torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long)), y["momentum"]], axis=-1 + ) * batch.mask.unsqueeze(axis=-1) # standardize Wasserstein loss std = was_input_true[batch.mask].std(axis=0) @@ -192,7 +194,9 @@ class FocalLoss(nn.Module): - y: (batch_size,) or (batch_size, d1, d2, ..., dK), K > 0. """ - def __init__(self, alpha: Optional[Tensor] = None, gamma: float = 0.0, reduction: str = "mean", ignore_index: int = -100): + def __init__( + self, alpha: Optional[Tensor] = None, gamma: float = 0.0, reduction: str = "mean", ignore_index: int = -100 + ): """Constructor. Args: alpha (Tensor, optional): Weights for each class. Defaults to None. 
@@ -382,18 +386,30 @@ def validation_plots(batch, ypred_raw, ygen, ypred, tensorboard_writer, epoch, o ratio = (ypred_raw[2][batch.mask][:, 1] / batch.ygen[batch.mask][:, 3])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("eta_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("sphi_target", torch.clamp(batch.ygen[batch.mask][:, 4], -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("sphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 2], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram( + "sphi_target", torch.clamp(batch.ygen[batch.mask][:, 4], -10, 10), global_step=epoch + ) + tensorboard_writer.add_histogram( + "sphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 2], -10, 10), global_step=epoch + ) ratio = (ypred_raw[2][batch.mask][:, 2] / batch.ygen[batch.mask][:, 4])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("sphi_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("cphi_target", torch.clamp(batch.ygen[batch.mask][:, 5], -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("cphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 3], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram( + "cphi_target", torch.clamp(batch.ygen[batch.mask][:, 5], -10, 10), global_step=epoch + ) + tensorboard_writer.add_histogram( + "cphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 3], -10, 10), global_step=epoch + ) ratio = (ypred_raw[2][batch.mask][:, 3] / batch.ygen[batch.mask][:, 5])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("cphi_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("energy_target", torch.clamp(batch.ygen[batch.mask][:, 6], -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("energy_pred", torch.clamp(ypred_raw[2][batch.mask][:, 4], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram( + "energy_target", torch.clamp(batch.ygen[batch.mask][:, 6], -10, 10), global_step=epoch + ) + tensorboard_writer.add_histogram( + "energy_pred", torch.clamp(ypred_raw[2][batch.mask][:, 4], -10, 10), global_step=epoch + ) ratio = (ypred_raw[2][batch.mask][:, 4] / batch.ygen[batch.mask][:, 6])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("energy_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) @@ -457,7 +473,9 @@ def train_and_valid( if (world_size > 1) and (rank != 0): iterator = enumerate(data_loader) else: - iterator = tqdm.tqdm(enumerate(data_loader), total=len(data_loader), desc=f"Epoch {epoch} {train_or_valid} loop on rank={rank}") + iterator = tqdm.tqdm( + enumerate(data_loader), total=len(data_loader), desc=f"Epoch {epoch} {train_or_valid} loop on rank={rank}" + ) device_type = "cuda" if isinstance(rank, int) else "cpu" @@ -715,7 +733,9 @@ def train_mlpf( # training step, edit here to profile a specific epoch if epoch == -1: - with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, with_stack=True) as prof: + with profile( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, with_stack=True + ) as prof: with record_function("model_train"): losses_t = train_and_valid( rank, @@ -865,18 +885,6 @@ def train_mlpf( time_per_epoch = (t1 - t0_initial) / epoch eta = epochs_remaining * time_per_epoch / 60 - # _logger.info( - # f"Rank {rank}: epoch={epoch} / {num_epochs} " - # + f"train_loss={losses_t['Total']:.4f} " - # + f"valid_loss={losses_v['Total']:.4f} 
" - # + f"stale={stale_epochs} " - # + f"epoch_train_time={round((t_train-t0)/60, 2)}m " - # + f"epoch_valid_time={round((t_valid-t_train)/60, 2)}m " - # + f"epoch_total_time={round((t1-t0)/60, 2)}m " - # + f"eta={round(eta, 1)}m", - # color="bold", - # ) - _logger.info( f"Rank {rank}: epoch={epoch} / {num_epochs} " + f"stale={stale_epochs} " @@ -1029,7 +1037,9 @@ def run(rank, world_size, config, args, outdir, logfile): _logger.info(f"Model directory {outdir}", color="bold") if args.comet: - comet_experiment = create_comet_experiment(config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir) + comet_experiment = create_comet_experiment( + config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir + ) comet_experiment.set_name(f"rank_{rank}_{Path(outdir).name}") comet_experiment.log_parameter("run_id", Path(outdir).name) comet_experiment.log_parameter("world_size", world_size) @@ -1061,6 +1071,8 @@ def run(rank, world_size, config, args, outdir, logfile): lr_schedule = get_lr_schedule(config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch) if config["standardize_inputs"] is True: + if (rank == 0) or (rank == "cpu"): + _logger.info("Will standardize the input features before running the training") standardization_dict = get_input_standardization(config["dataset"], loaders["train"]) else: standardization_dict = None @@ -1282,7 +1294,9 @@ def train_ray_trial(config, args, outdir=None): loaders = get_interleaved_dataloaders(world_size, rank, config, use_cuda, use_ray=True) if args.comet: - comet_experiment = create_comet_experiment(config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir) + comet_experiment = create_comet_experiment( + config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir + ) comet_experiment.set_name(f"world_rank_{world_rank}_{Path(outdir).name}") comet_experiment.log_parameter("run_id", Path(outdir).name) comet_experiment.log_parameter("world_size", world_size) @@ -1316,7 +1330,9 @@ def train_ray_trial(config, args, outdir=None): if args.resume_training: model, optimizer = load_checkpoint(checkpoint, model, optimizer) start_epoch = checkpoint["extra_state"]["epoch"] + 1 - lr_schedule = get_lr_schedule(config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch=start_epoch - 1) + lr_schedule = get_lr_schedule( + config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch=start_epoch - 1 + ) else: # start a new training with model weights loaded from a pre-trained model model = load_checkpoint(checkpoint, model) @@ -1480,7 +1496,9 @@ def run_hpo(config, args): if tune.Tuner.can_restore(str(expdir)): # resume unfinished HPO run - tuner = tune.Tuner.restore(str(expdir), trainable=trainer, resume_errored=True, restart_errored=False, resume_unfinished=True) + tuner = tune.Tuner.restore( + str(expdir), trainable=trainer, resume_errored=True, restart_errored=False, resume_unfinished=True + ) else: # start new HPO run search_space = {"train_loop_config": search_space} # the ray TorchTrainer only takes a single arg: train_loop_config @@ -1521,4 +1539,6 @@ def run_hpo(config, args): print(result_df.columns) logging.info("Total time of Tuner.fit(): {}".format(end - start)) - logging.info("Best hyperparameters found according to {} were: {}".format(config["raytune"]["default_metric"], best_config)) + logging.info( + "Best hyperparameters found according to {} were: {}".format(config["raytune"]["default_metric"], best_config) + ) From f91da8196b7d5042ac97da4ec1215d540521edd1 Mon Sep 17 
00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:27:26 +0200 Subject: [PATCH 47/66] up --- mlpf/pyg_pipeline.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/mlpf/pyg_pipeline.py b/mlpf/pyg_pipeline.py index 4110e2dea..c592c3018 100644 --- a/mlpf/pyg_pipeline.py +++ b/mlpf/pyg_pipeline.py @@ -27,7 +27,9 @@ parser.add_argument("--prefix", type=str, default=None, help="prefix appended to result dir name") parser.add_argument("--data-dir", type=str, default=None, help="path to `tensorflow_datasets/`") parser.add_argument("--gpus", type=int, default=None, help="to use CPU set to 0; else e.g., 4") -parser.add_argument("--gpu-batch-multiplier", type=int, default=None, help="Increase batch size per GPU by this constant factor") +parser.add_argument( + "--gpu-batch-multiplier", type=int, default=None, help="Increase batch size per GPU by this constant factor" +) parser.add_argument( "--dataset", type=str, @@ -38,7 +40,9 @@ ) parser.add_argument("--num-workers", type=int, default=None, help="number of processes to load the data") parser.add_argument("--prefetch-factor", type=int, default=None, help="number of samples to fetch & prefetch at every call") -parser.add_argument("--resume-training", type=str, default=None, help="training dir containing the checkpointed training to resume") +parser.add_argument( + "--resume-training", type=str, default=None, help="training dir containing the checkpointed training to resume" +) parser.add_argument("--load", type=str, default=None, help="load checkpoint and start new training from epoch 1") parser.add_argument("--train", action="store_true", default=None, help="initiates a training") @@ -53,7 +57,9 @@ help="which graph layer to use", choices=["attention", "gnn_lsh", "mamba"], ) -parser.add_argument("--num-convs", type=int, default=None, help="number of cross-particle convolution (GNN, attention, Mamba) layers") +parser.add_argument( + "--num-convs", type=int, default=None, help="number of cross-particle convolution (GNN, attention, Mamba) layers" +) parser.add_argument("--make-plots", action="store_true", default=None, help="make plots of the test predictions") parser.add_argument("--export-onnx", action="store_true", default=None, help="exports the model to onnx") parser.add_argument("--ntrain", type=int, default=None, help="training samples to use, if None use entire dataset") @@ -88,6 +94,10 @@ ) parser.add_argument("--test-datasets", nargs="+", default=[], help="test samples to process") +parser.add_argument( + "--standardize_inputs", action="store_true", default=None, help="will standardize the input features before training" +) + def get_outdir(resume_training, load): outdir = None @@ -149,6 +159,9 @@ def main(): } config["test_dataset"] = {"cms_pf_ttbar": config["test_dataset"]["cms_pf_ttbar"]} + if args.standardize_inputs: + config["standardize_inputs"] = True + # override loaded config with values from command line args config = override_config(config, args) From 010eef5dd6db830b32ace63180ab0dfe39dab201 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:29:30 +0200 Subject: [PATCH 48/66] up --- mlpf/pyg/mlpf.py | 12 +- mlpf/pyg/training.py | 2 +- mlpf/pyg_pipeline.py | 6 +- parameters/pytorch/pyg-cld.yaml | 2 +- parameters/pytorch/pyg-clic-hits.yaml | 2 +- parameters/pytorch/pyg-clic-ttbar-21-std.yaml | 123 ------------------ parameters/pytorch/pyg-clic-ttbar-22-std.yaml | 123 ------------------ parameters/pytorch/pyg-clic.yaml | 2 +- parameters/pytorch/pyg-cms-finetune.yaml | 2 +- 
parameters/pytorch/pyg-cms-ttbar-nopu.yaml | 2 +- parameters/pytorch/pyg-cms.yaml | 2 +- 11 files changed, 18 insertions(+), 260 deletions(-) delete mode 100644 parameters/pytorch/pyg-clic-ttbar-21-std.yaml delete mode 100644 parameters/pytorch/pyg-clic-ttbar-22-std.yaml diff --git a/mlpf/pyg/mlpf.py b/mlpf/pyg/mlpf.py index 33d431c09..5759c1f50 100644 --- a/mlpf/pyg/mlpf.py +++ b/mlpf/pyg/mlpf.py @@ -57,7 +57,7 @@ def norm_cdf(x): return tensor -def standardize_inputs(X, elemtypes_nonzero, standardization_dict): +def standardize_input(X, elemtypes_nonzero, standardization_dict): for i, ielem in enumerate(elemtypes_nonzero): @@ -122,7 +122,9 @@ def __init__( self.mha = torch.nn.MultiheadAttention(embedding_dim, num_heads, dropout=dropout_mha, batch_first=True) self.norm0 = torch.nn.LayerNorm(embedding_dim) self.norm1 = torch.nn.LayerNorm(embedding_dim) - self.seq = torch.nn.Sequential(nn.Linear(embedding_dim, width), self.act(), nn.Linear(width, embedding_dim), self.act()) + self.seq = torch.nn.Sequential( + nn.Linear(embedding_dim, width), self.act(), nn.Linear(width, embedding_dim), self.act() + ) self.dropout = torch.nn.Dropout(dropout_ff) _logger.info("using attention_type={}".format(attention_type)) # params for torch sdp_kernel @@ -402,7 +404,7 @@ def forward(self, X_features, mask, standardization_dict=None): Xfeat_normed = X_features if standardization_dict is not None: - Xfeat_normed = standardize_inputs(X_features, self.elemtypes_nonzero, standardization_dict) + Xfeat_normed = standardize_input(X_features, self.elemtypes_nonzero, standardization_dict) embeddings_id, embeddings_reg = [], [] if self.num_convs != 0: @@ -463,7 +465,9 @@ def forward(self, X_features, mask, standardization_dict=None): e_real[~mask] = 0 e_real[torch.isinf(e_real)] = 0 e_real[torch.isnan(e_real)] = 0 - preds_energy = e_real + torch.nn.functional.relu(self.nn_energy(X_features, final_embedding_reg, X_features[..., 5:6])) + preds_energy = e_real + torch.nn.functional.relu( + self.nn_energy(X_features, final_embedding_reg, X_features[..., 5:6]) + ) preds_momentum = torch.cat([preds_pt, preds_eta, preds_sin_phi, preds_cos_phi, preds_energy], axis=-1) return preds_binary_particle, preds_pid, preds_momentum diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index 2d8b43d67..c561ab5c6 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -1070,7 +1070,7 @@ def run(rank, world_size, config, args, outdir, logfile): last_epoch = -1 if start_epoch == 1 else start_epoch - 1 lr_schedule = get_lr_schedule(config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch) - if config["standardize_inputs"] is True: + if config["standardize_input"] is True: if (rank == 0) or (rank == "cpu"): _logger.info("Will standardize the input features before running the training") standardization_dict = get_input_standardization(config["dataset"], loaders["train"]) diff --git a/mlpf/pyg_pipeline.py b/mlpf/pyg_pipeline.py index c592c3018..d07423ed3 100644 --- a/mlpf/pyg_pipeline.py +++ b/mlpf/pyg_pipeline.py @@ -95,7 +95,7 @@ parser.add_argument("--test-datasets", nargs="+", default=[], help="test samples to process") parser.add_argument( - "--standardize_inputs", action="store_true", default=None, help="will standardize the input features before training" + "--standardize_input", action="store_true", default=None, help="will standardize the input features before training" ) @@ -159,8 +159,8 @@ def main(): } config["test_dataset"] = {"cms_pf_ttbar": config["test_dataset"]["cms_pf_ttbar"]} - if 
args.standardize_inputs: - config["standardize_inputs"] = True + if args.standardize_input: + config["standardize_input"] = True # override loaded config with values from command line args config = override_config(config, args) diff --git a/parameters/pytorch/pyg-cld.yaml b/parameters/pytorch/pyg-cld.yaml index f3ae2e957..e2353086a 100644 --- a/parameters/pytorch/pyg-cld.yaml +++ b/parameters/pytorch/pyg-cld.yaml @@ -1,6 +1,6 @@ backend: pytorch -standardize_inputs: False +standardize_input: False dataset: cld sort_data: no data_dir: diff --git a/parameters/pytorch/pyg-clic-hits.yaml b/parameters/pytorch/pyg-clic-hits.yaml index 4a8b5e3b1..7f6aa796f 100644 --- a/parameters/pytorch/pyg-clic-hits.yaml +++ b/parameters/pytorch/pyg-clic-hits.yaml @@ -1,6 +1,6 @@ backend: pytorch -standardize_inputs: False +standardize_input: False dataset: clic data_dir: gpus: 1 diff --git a/parameters/pytorch/pyg-clic-ttbar-21-std.yaml b/parameters/pytorch/pyg-clic-ttbar-21-std.yaml deleted file mode 100644 index ae6b5a01e..000000000 --- a/parameters/pytorch/pyg-clic-ttbar-21-std.yaml +++ /dev/null @@ -1,123 +0,0 @@ -backend: pytorch - -standardize_inputs: True -save_attention: yes -dataset: clic -sort_data: no -data_dir: -gpus: 1 -gpu_batch_multiplier: 1 -load: -num_epochs: 100 -patience: 20 -lr: 0.0001 -lr_schedule: cosinedecay # constant, cosinedecay, onecycle -conv_type: attention # gnn_lsh, attention, mamba, flashattention -ntrain: -ntest: -nvalid: -num_workers: 0 -prefetch_factor: -checkpoint_freq: -comet_name: particleflow-pt -comet_offline: False -comet_step_freq: 100 -dtype: float32 -val_freq: # run an extra validation run every val_freq training steps - -model: - trainable: all - learned_representation_mode: last #last, concat - input_encoding: split #split, joint - pt_mode: direct-elemtype-split - eta_mode: linear - sin_phi_mode: linear - cos_phi_mode: linear - energy_mode: direct-elemtype-split - - gnn_lsh: - conv_type: gnn_lsh - embedding_dim: 512 - width: 512 - num_convs: 8 - activation: "elu" - # gnn-lsh specific parameters - bin_size: 32 - max_num_bins: 200 - distance_dim: 128 - layernorm: True - num_node_messages: 2 - ffn_dist_hidden_dim: 128 - ffn_dist_num_layers: 2 - - attention: - conv_type: attention - num_convs: 8 - dropout_ff: 0.0 - dropout_conv_id_mha: 0.0 - dropout_conv_id_ff: 0.0 - dropout_conv_reg_mha: 0.0 - dropout_conv_reg_ff: 0.0 - activation: "relu" - head_dim: 64 - num_heads: 12 - attention_type: math - use_pre_layernorm: True - - mamba: - conv_type: mamba - embedding_dim: 128 - width: 128 - num_convs: 2 - dropout: 0.0 - activation: "elu" - # transformer specific paramters - num_heads: 2 - # mamba specific paramters - d_state: 16 - d_conv: 4 - expand: 2 - -lr_schedule_config: - onecycle: - pct_start: 0.3 - -raytune: - local_dir: # Note: please specify an absolute path - sched: # asha, hyperband - search_alg: # bayes, bohb, hyperopt, nevergrad, scikit - default_metric: "val_loss" - default_mode: "min" - # Tune schedule specific parameters - asha: - max_t: 200 - reduction_factor: 4 - brackets: 1 - grace_period: 10 - hyperband: - max_t: 200 - reduction_factor: 4 - hyperopt: - n_random_steps: 10 - nevergrad: - n_random_steps: 10 - -train_dataset: - clic: - physical: - batch_size: 1 - samples: - clic_edm_ttbar_pf: - version: 2.1.0 - -valid_dataset: - clic: - physical: - batch_size: 1 - samples: - clic_edm_ttbar_pf: - version: 2.1.0 - -test_dataset: - clic_edm_ttbar_pf: - version: 2.1.0 diff --git a/parameters/pytorch/pyg-clic-ttbar-22-std.yaml 
b/parameters/pytorch/pyg-clic-ttbar-22-std.yaml deleted file mode 100644 index 39e3e8247..000000000 --- a/parameters/pytorch/pyg-clic-ttbar-22-std.yaml +++ /dev/null @@ -1,123 +0,0 @@ -backend: pytorch - -standardize_inputs: True -save_attention: yes -dataset: clic -sort_data: no -data_dir: -gpus: 1 -gpu_batch_multiplier: 1 -load: -num_epochs: 100 -patience: 20 -lr: 0.0001 -lr_schedule: cosinedecay # constant, cosinedecay, onecycle -conv_type: attention # gnn_lsh, attention, mamba, flashattention -ntrain: -ntest: -nvalid: -num_workers: 0 -prefetch_factor: -checkpoint_freq: -comet_name: particleflow-pt -comet_offline: False -comet_step_freq: 100 -dtype: float32 -val_freq: # run an extra validation run every val_freq training steps - -model: - trainable: all - learned_representation_mode: last #last, concat - input_encoding: split #split, joint - pt_mode: direct-elemtype-split - eta_mode: linear - sin_phi_mode: linear - cos_phi_mode: linear - energy_mode: direct-elemtype-split - - gnn_lsh: - conv_type: gnn_lsh - embedding_dim: 512 - width: 512 - num_convs: 8 - activation: "elu" - # gnn-lsh specific parameters - bin_size: 32 - max_num_bins: 200 - distance_dim: 128 - layernorm: True - num_node_messages: 2 - ffn_dist_hidden_dim: 128 - ffn_dist_num_layers: 2 - - attention: - conv_type: attention - num_convs: 8 - dropout_ff: 0.0 - dropout_conv_id_mha: 0.0 - dropout_conv_id_ff: 0.0 - dropout_conv_reg_mha: 0.0 - dropout_conv_reg_ff: 0.0 - activation: "relu" - head_dim: 64 - num_heads: 12 - attention_type: math - use_pre_layernorm: True - - mamba: - conv_type: mamba - embedding_dim: 128 - width: 128 - num_convs: 2 - dropout: 0.0 - activation: "elu" - # transformer specific paramters - num_heads: 2 - # mamba specific paramters - d_state: 16 - d_conv: 4 - expand: 2 - -lr_schedule_config: - onecycle: - pct_start: 0.3 - -raytune: - local_dir: # Note: please specify an absolute path - sched: # asha, hyperband - search_alg: # bayes, bohb, hyperopt, nevergrad, scikit - default_metric: "val_loss" - default_mode: "min" - # Tune schedule specific parameters - asha: - max_t: 200 - reduction_factor: 4 - brackets: 1 - grace_period: 10 - hyperband: - max_t: 200 - reduction_factor: 4 - hyperopt: - n_random_steps: 10 - nevergrad: - n_random_steps: 10 - -train_dataset: - clic: - physical: - batch_size: 1 - samples: - clic_edm_ttbar_pf: - version: 2.2.0 - -valid_dataset: - clic: - physical: - batch_size: 1 - samples: - clic_edm_ttbar_pf: - version: 2.2.0 - -test_dataset: - clic_edm_ttbar_pf: - version: 2.2.0 diff --git a/parameters/pytorch/pyg-clic.yaml b/parameters/pytorch/pyg-clic.yaml index e4dcdde1e..185368c12 100644 --- a/parameters/pytorch/pyg-clic.yaml +++ b/parameters/pytorch/pyg-clic.yaml @@ -1,6 +1,6 @@ backend: pytorch -standardize_inputs: False +standardize_input: False save_attention: yes dataset: clic sort_data: no diff --git a/parameters/pytorch/pyg-cms-finetune.yaml b/parameters/pytorch/pyg-cms-finetune.yaml index 2c362ea39..03f5af6c8 100644 --- a/parameters/pytorch/pyg-cms-finetune.yaml +++ b/parameters/pytorch/pyg-cms-finetune.yaml @@ -1,6 +1,6 @@ backend: pytorch -standardize_inputs: False +standardize_input: False dataset: cms sort_data: yes data_dir: diff --git a/parameters/pytorch/pyg-cms-ttbar-nopu.yaml b/parameters/pytorch/pyg-cms-ttbar-nopu.yaml index 2e1ac6e94..8485611c4 100644 --- a/parameters/pytorch/pyg-cms-ttbar-nopu.yaml +++ b/parameters/pytorch/pyg-cms-ttbar-nopu.yaml @@ -1,6 +1,6 @@ backend: pytorch -standardize_inputs: False +standardize_input: False dataset: cms sort_data: yes 
data_dir: diff --git a/parameters/pytorch/pyg-cms.yaml b/parameters/pytorch/pyg-cms.yaml index 76770c1cb..7507d848d 100644 --- a/parameters/pytorch/pyg-cms.yaml +++ b/parameters/pytorch/pyg-cms.yaml @@ -1,6 +1,6 @@ backend: pytorch -standardize_inputs: False +standardize_input: False save_attention: no dataset: cms sort_data: yes From 9156459182461c668a3ac02d1aed91a53a91e57e Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:30:48 +0200 Subject: [PATCH 49/66] check --- mlpf/pyg_pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlpf/pyg_pipeline.py b/mlpf/pyg_pipeline.py index d07423ed3..278c0911a 100644 --- a/mlpf/pyg_pipeline.py +++ b/mlpf/pyg_pipeline.py @@ -159,8 +159,8 @@ def main(): } config["test_dataset"] = {"cms_pf_ttbar": config["test_dataset"]["cms_pf_ttbar"]} - if args.standardize_input: - config["standardize_input"] = True + # if args.standardize_input: + # config["standardize_input"] = True # override loaded config with values from command line args config = override_config(config, args) From 812f05c092577ece9c19dd8224260c1bac8837de Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:31:51 +0200 Subject: [PATCH 50/66] remove unnecessary config --- parameters/pytorch/pyg-clic-std.yaml | 129 --------------------------- 1 file changed, 129 deletions(-) delete mode 100644 parameters/pytorch/pyg-clic-std.yaml diff --git a/parameters/pytorch/pyg-clic-std.yaml b/parameters/pytorch/pyg-clic-std.yaml deleted file mode 100644 index 287069d31..000000000 --- a/parameters/pytorch/pyg-clic-std.yaml +++ /dev/null @@ -1,129 +0,0 @@ -backend: pytorch - -standardize_inputs: True -save_attention: yes -dataset: clic -sort_data: no -data_dir: -gpus: 1 -gpu_batch_multiplier: 1 -load: -num_epochs: 100 -patience: 20 -lr: 0.0001 -lr_schedule: cosinedecay # constant, cosinedecay, onecycle -conv_type: attention # gnn_lsh, attention, mamba, flashattention -ntrain: -ntest: -nvalid: -num_workers: 0 -prefetch_factor: -checkpoint_freq: -comet_name: particleflow-pt -comet_offline: False -comet_step_freq: 100 -dtype: float32 -val_freq: # run an extra validation run every val_freq training steps - -model: - trainable: all - learned_representation_mode: last #last, concat - input_encoding: split #split, joint - pt_mode: direct-elemtype-split - eta_mode: linear - sin_phi_mode: linear - cos_phi_mode: linear - energy_mode: direct-elemtype-split - - gnn_lsh: - conv_type: gnn_lsh - embedding_dim: 512 - width: 512 - num_convs: 8 - activation: "elu" - # gnn-lsh specific parameters - bin_size: 32 - max_num_bins: 200 - distance_dim: 128 - layernorm: True - num_node_messages: 2 - ffn_dist_hidden_dim: 128 - ffn_dist_num_layers: 2 - - attention: - conv_type: attention - num_convs: 4 - dropout_ff: 0.0 - dropout_conv_id_mha: 0.0 - dropout_conv_id_ff: 0.0 - dropout_conv_reg_mha: 0.0 - dropout_conv_reg_ff: 0.0 - activation: "gelu" - head_dim: 32 - num_heads: 32 - attention_type: math - use_pre_layernorm: True - - mamba: - conv_type: mamba - embedding_dim: 128 - width: 128 - num_convs: 2 - dropout: 0.0 - activation: "elu" - # transformer specific paramters - num_heads: 2 - # mamba specific paramters - d_state: 16 - d_conv: 4 - expand: 2 - -lr_schedule_config: - onecycle: - pct_start: 0.3 - -raytune: - local_dir: # Note: please specify an absolute path - sched: # asha, hyperband - search_alg: # bayes, bohb, hyperopt, nevergrad, scikit - default_metric: "val_loss" - default_mode: "min" - # Tune schedule specific parameters - asha: - max_t: 200 - reduction_factor: 4 - 
brackets: 1 - grace_period: 10 - hyperband: - max_t: 200 - reduction_factor: 4 - hyperopt: - n_random_steps: 10 - nevergrad: - n_random_steps: 10 - -train_dataset: - clic: - physical: - batch_size: 1 - samples: - clic_edm_ttbar_pf: - version: 2.1.0 - clic_edm_qq_pf: - version: 2.1.0 - -valid_dataset: - clic: - physical: - batch_size: 1 - samples: - clic_edm_ttbar_pf: - version: 2.1.0 - clic_edm_qq_pf: - version: 2.1.0 - -test_dataset: - clic_edm_ttbar_pf: - version: 2.1.0 - clic_edm_qq_pf: - version: 2.1.0 From 0968e51046d3bde32ac2800aac3647d63e80d1a5 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:34:25 +0200 Subject: [PATCH 51/66] up --- mlpf/pyg_pipeline.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mlpf/pyg_pipeline.py b/mlpf/pyg_pipeline.py index 278c0911a..1ed7116e5 100644 --- a/mlpf/pyg_pipeline.py +++ b/mlpf/pyg_pipeline.py @@ -95,7 +95,7 @@ parser.add_argument("--test-datasets", nargs="+", default=[], help="test samples to process") parser.add_argument( - "--standardize_input", action="store_true", default=None, help="will standardize the input features before training" + "--standardize-input", action="store_true", default=None, help="will standardize the input features before training" ) @@ -159,9 +159,6 @@ def main(): } config["test_dataset"] = {"cms_pf_ttbar": config["test_dataset"]["cms_pf_ttbar"]} - # if args.standardize_input: - # config["standardize_input"] = True - # override loaded config with values from command line args config = override_config(config, args) From 42ff712e2e662454faec8fd86395e0f3e2916b22 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:48:19 +0200 Subject: [PATCH 52/66] debug --- mlpf/pyg/utils.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/mlpf/pyg/utils.py b/mlpf/pyg/utils.py index a58869439..f5ab29ad8 100644 --- a/mlpf/pyg/utils.py +++ b/mlpf/pyg/utils.py @@ -162,7 +162,9 @@ def unpack_target(y, model): # note ~ momentum = ["pt", "eta", "sin_phi", "cos_phi", "energy"] ret["momentum"] = y[..., 2:7].to(dtype=torch.float32) - ret["p4"] = torch.cat([ret["pt"].unsqueeze(-1), ret["eta"].unsqueeze(-1), ret["phi"].unsqueeze(-1), ret["energy"].unsqueeze(-1)], axis=-1) + ret["p4"] = torch.cat( + [ret["pt"].unsqueeze(-1), ret["eta"].unsqueeze(-1), ret["phi"].unsqueeze(-1), ret["energy"].unsqueeze(-1)], axis=-1 + ) ret["ispu"] = y[..., -1] @@ -280,7 +282,11 @@ def load_lr_schedule(lr_schedule, checkpoint): lr_schedule.load_state_dict(checkpoint["extra_state"]["lr_schedule_state_dict"]) return lr_schedule else: - raise KeyError("Couldn't find LR schedule state dict in checkpoint. extra_state contains: {}".format(checkpoint["extra_state"].keys())) + raise KeyError( + "Couldn't find LR schedule state dict in checkpoint. 
extra_state contains: {}".format( + checkpoint["extra_state"].keys() + ) + ) def get_lr_schedule(config, opt, epochs=None, steps_per_epoch=None, last_epoch=-1): @@ -298,7 +304,9 @@ def get_lr_schedule(config, opt, epochs=None, steps_per_epoch=None, last_epoch=- pct_start=config["lr_schedule_config"]["onecycle"]["pct_start"] or 0.3, ) elif config["lr_schedule"] == "cosinedecay": - lr_schedule = CosineAnnealingLR(opt, T_max=steps_per_epoch * epochs, last_epoch=last_batch, eta_min=config["lr"] * 0.1) + lr_schedule = CosineAnnealingLR( + opt, T_max=steps_per_epoch * epochs, last_epoch=last_batch, eta_min=config["lr"] * 0.1 + ) else: raise ValueError("Supported values for lr_schedule are 'constant', 'onecycle' and 'cosinedecay'.") return lr_schedule @@ -352,7 +360,7 @@ def get_input_standardization(dataset, train_loader, nsubset=10_000): concatenated_pfelements = batch.X[msk] else: concatenated_pfelements = torch.cat([concatenated_pfelements, batch.X[msk]]) - + print("concatenated_pfelements", concatenated_pfelements.device) if tot_events > nsubset: break From fb9e68a78de5fa6b3fb4f2ac9d20d1628de15807 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:49:31 +0200 Subject: [PATCH 53/66] revert --- mlpf/pyg/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlpf/pyg/utils.py b/mlpf/pyg/utils.py index f5ab29ad8..2e7f763b6 100644 --- a/mlpf/pyg/utils.py +++ b/mlpf/pyg/utils.py @@ -360,7 +360,7 @@ def get_input_standardization(dataset, train_loader, nsubset=10_000): concatenated_pfelements = batch.X[msk] else: concatenated_pfelements = torch.cat([concatenated_pfelements, batch.X[msk]]) - print("concatenated_pfelements", concatenated_pfelements.device) + if tot_events > nsubset: break From 181c5341c4dab56222d6b53f96464752b9163ca7 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:51:35 +0200 Subject: [PATCH 54/66] up new config for all samples --- mlpf/pyg/mlpf.py | 8 +- mlpf/pyg/training.py | 72 +++------- mlpf/pyg/utils.py | 14 +- mlpf/pyg_pipeline.py | 16 +-- parameters/pytorch/pyg-clic-allsamples.yaml | 141 ++++++++++++++++++++ 5 files changed, 171 insertions(+), 80 deletions(-) create mode 100644 parameters/pytorch/pyg-clic-allsamples.yaml diff --git a/mlpf/pyg/mlpf.py b/mlpf/pyg/mlpf.py index 5759c1f50..163b880e2 100644 --- a/mlpf/pyg/mlpf.py +++ b/mlpf/pyg/mlpf.py @@ -122,9 +122,7 @@ def __init__( self.mha = torch.nn.MultiheadAttention(embedding_dim, num_heads, dropout=dropout_mha, batch_first=True) self.norm0 = torch.nn.LayerNorm(embedding_dim) self.norm1 = torch.nn.LayerNorm(embedding_dim) - self.seq = torch.nn.Sequential( - nn.Linear(embedding_dim, width), self.act(), nn.Linear(width, embedding_dim), self.act() - ) + self.seq = torch.nn.Sequential(nn.Linear(embedding_dim, width), self.act(), nn.Linear(width, embedding_dim), self.act()) self.dropout = torch.nn.Dropout(dropout_ff) _logger.info("using attention_type={}".format(attention_type)) # params for torch sdp_kernel @@ -465,9 +463,7 @@ def forward(self, X_features, mask, standardization_dict=None): e_real[~mask] = 0 e_real[torch.isinf(e_real)] = 0 e_real[torch.isnan(e_real)] = 0 - preds_energy = e_real + torch.nn.functional.relu( - self.nn_energy(X_features, final_embedding_reg, X_features[..., 5:6]) - ) + preds_energy = e_real + torch.nn.functional.relu(self.nn_energy(X_features, final_embedding_reg, X_features[..., 5:6])) preds_momentum = torch.cat([preds_pt, preds_eta, preds_sin_phi, preds_cos_phi, preds_energy], axis=-1) return preds_binary_particle, preds_pid, preds_momentum diff 
--git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index c561ab5c6..c526fcb0e 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -97,9 +97,7 @@ def mlpf_loss(y, ypred, batch): # binary loss for particle / no-particle classification # loss_binary_classification = loss_obj_id(ypred["cls_binary"], (y["cls_id"] != 0).long()).reshape(y["cls_id"].shape) - loss_binary_classification = 10 * torch.nn.functional.cross_entropy( - ypred["cls_binary"], (y["cls_id"] != 0).long(), reduction="none" - ) + loss_binary_classification = 10 * torch.nn.functional.cross_entropy(ypred["cls_binary"], (y["cls_id"] != 0).long(), reduction="none") # compare the particle type, only for cases where there was a true particle loss_pid_classification = loss_obj_id(ypred["cls_id_onehot"], y["cls_id"]).reshape(y["cls_id"].shape) @@ -147,12 +145,12 @@ def mlpf_loss(y, ypred, batch): pred_met = torch.sqrt(torch.sum(pred_px, axis=-2) ** 2 + torch.sum(pred_py, axis=-2) ** 2) loss["MET"] = torch.nn.functional.huber_loss(pred_met.squeeze(dim=-1), batch.genmet).mean() - was_input_pred = torch.concat( - [torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1 - ) * batch.mask.unsqueeze(axis=-1) - was_input_true = torch.concat( - [torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long)), y["momentum"]], axis=-1 - ) * batch.mask.unsqueeze(axis=-1) + was_input_pred = torch.concat([torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1) * batch.mask.unsqueeze( + axis=-1 + ) + was_input_true = torch.concat([torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long)), y["momentum"]], axis=-1) * batch.mask.unsqueeze( + axis=-1 + ) # standardize Wasserstein loss std = was_input_true[batch.mask].std(axis=0) @@ -194,9 +192,7 @@ class FocalLoss(nn.Module): - y: (batch_size,) or (batch_size, d1, d2, ..., dK), K > 0. """ - def __init__( - self, alpha: Optional[Tensor] = None, gamma: float = 0.0, reduction: str = "mean", ignore_index: int = -100 - ): + def __init__(self, alpha: Optional[Tensor] = None, gamma: float = 0.0, reduction: str = "mean", ignore_index: int = -100): """Constructor. Args: alpha (Tensor, optional): Weights for each class. Defaults to None. 
@@ -386,30 +382,18 @@ def validation_plots(batch, ypred_raw, ygen, ypred, tensorboard_writer, epoch, o ratio = (ypred_raw[2][batch.mask][:, 1] / batch.ygen[batch.mask][:, 3])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("eta_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram( - "sphi_target", torch.clamp(batch.ygen[batch.mask][:, 4], -10, 10), global_step=epoch - ) - tensorboard_writer.add_histogram( - "sphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 2], -10, 10), global_step=epoch - ) + tensorboard_writer.add_histogram("sphi_target", torch.clamp(batch.ygen[batch.mask][:, 4], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram("sphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 2], -10, 10), global_step=epoch) ratio = (ypred_raw[2][batch.mask][:, 2] / batch.ygen[batch.mask][:, 4])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("sphi_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram( - "cphi_target", torch.clamp(batch.ygen[batch.mask][:, 5], -10, 10), global_step=epoch - ) - tensorboard_writer.add_histogram( - "cphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 3], -10, 10), global_step=epoch - ) + tensorboard_writer.add_histogram("cphi_target", torch.clamp(batch.ygen[batch.mask][:, 5], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram("cphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 3], -10, 10), global_step=epoch) ratio = (ypred_raw[2][batch.mask][:, 3] / batch.ygen[batch.mask][:, 5])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("cphi_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram( - "energy_target", torch.clamp(batch.ygen[batch.mask][:, 6], -10, 10), global_step=epoch - ) - tensorboard_writer.add_histogram( - "energy_pred", torch.clamp(ypred_raw[2][batch.mask][:, 4], -10, 10), global_step=epoch - ) + tensorboard_writer.add_histogram("energy_target", torch.clamp(batch.ygen[batch.mask][:, 6], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram("energy_pred", torch.clamp(ypred_raw[2][batch.mask][:, 4], -10, 10), global_step=epoch) ratio = (ypred_raw[2][batch.mask][:, 4] / batch.ygen[batch.mask][:, 6])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("energy_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) @@ -473,9 +457,7 @@ def train_and_valid( if (world_size > 1) and (rank != 0): iterator = enumerate(data_loader) else: - iterator = tqdm.tqdm( - enumerate(data_loader), total=len(data_loader), desc=f"Epoch {epoch} {train_or_valid} loop on rank={rank}" - ) + iterator = tqdm.tqdm(enumerate(data_loader), total=len(data_loader), desc=f"Epoch {epoch} {train_or_valid} loop on rank={rank}") device_type = "cuda" if isinstance(rank, int) else "cpu" @@ -733,9 +715,7 @@ def train_mlpf( # training step, edit here to profile a specific epoch if epoch == -1: - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, with_stack=True - ) as prof: + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, with_stack=True) as prof: with record_function("model_train"): losses_t = train_and_valid( rank, @@ -1037,9 +1017,7 @@ def run(rank, world_size, config, args, outdir, logfile): _logger.info(f"Model directory {outdir}", color="bold") if args.comet: - comet_experiment = create_comet_experiment( - config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir - ) + 
comet_experiment = create_comet_experiment(config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir) comet_experiment.set_name(f"rank_{rank}_{Path(outdir).name}") comet_experiment.log_parameter("run_id", Path(outdir).name) comet_experiment.log_parameter("world_size", world_size) @@ -1294,9 +1272,7 @@ def train_ray_trial(config, args, outdir=None): loaders = get_interleaved_dataloaders(world_size, rank, config, use_cuda, use_ray=True) if args.comet: - comet_experiment = create_comet_experiment( - config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir - ) + comet_experiment = create_comet_experiment(config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir) comet_experiment.set_name(f"world_rank_{world_rank}_{Path(outdir).name}") comet_experiment.log_parameter("run_id", Path(outdir).name) comet_experiment.log_parameter("world_size", world_size) @@ -1330,9 +1306,7 @@ def train_ray_trial(config, args, outdir=None): if args.resume_training: model, optimizer = load_checkpoint(checkpoint, model, optimizer) start_epoch = checkpoint["extra_state"]["epoch"] + 1 - lr_schedule = get_lr_schedule( - config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch=start_epoch - 1 - ) + lr_schedule = get_lr_schedule(config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch=start_epoch - 1) else: # start a new training with model weights loaded from a pre-trained model model = load_checkpoint(checkpoint, model) @@ -1496,9 +1470,7 @@ def run_hpo(config, args): if tune.Tuner.can_restore(str(expdir)): # resume unfinished HPO run - tuner = tune.Tuner.restore( - str(expdir), trainable=trainer, resume_errored=True, restart_errored=False, resume_unfinished=True - ) + tuner = tune.Tuner.restore(str(expdir), trainable=trainer, resume_errored=True, restart_errored=False, resume_unfinished=True) else: # start new HPO run search_space = {"train_loop_config": search_space} # the ray TorchTrainer only takes a single arg: train_loop_config @@ -1539,6 +1511,4 @@ def run_hpo(config, args): print(result_df.columns) logging.info("Total time of Tuner.fit(): {}".format(end - start)) - logging.info( - "Best hyperparameters found according to {} were: {}".format(config["raytune"]["default_metric"], best_config) - ) + logging.info("Best hyperparameters found according to {} were: {}".format(config["raytune"]["default_metric"], best_config)) diff --git a/mlpf/pyg/utils.py b/mlpf/pyg/utils.py index 2e7f763b6..a58869439 100644 --- a/mlpf/pyg/utils.py +++ b/mlpf/pyg/utils.py @@ -162,9 +162,7 @@ def unpack_target(y, model): # note ~ momentum = ["pt", "eta", "sin_phi", "cos_phi", "energy"] ret["momentum"] = y[..., 2:7].to(dtype=torch.float32) - ret["p4"] = torch.cat( - [ret["pt"].unsqueeze(-1), ret["eta"].unsqueeze(-1), ret["phi"].unsqueeze(-1), ret["energy"].unsqueeze(-1)], axis=-1 - ) + ret["p4"] = torch.cat([ret["pt"].unsqueeze(-1), ret["eta"].unsqueeze(-1), ret["phi"].unsqueeze(-1), ret["energy"].unsqueeze(-1)], axis=-1) ret["ispu"] = y[..., -1] @@ -282,11 +280,7 @@ def load_lr_schedule(lr_schedule, checkpoint): lr_schedule.load_state_dict(checkpoint["extra_state"]["lr_schedule_state_dict"]) return lr_schedule else: - raise KeyError( - "Couldn't find LR schedule state dict in checkpoint. extra_state contains: {}".format( - checkpoint["extra_state"].keys() - ) - ) + raise KeyError("Couldn't find LR schedule state dict in checkpoint. 
extra_state contains: {}".format(checkpoint["extra_state"].keys())) def get_lr_schedule(config, opt, epochs=None, steps_per_epoch=None, last_epoch=-1): @@ -304,9 +298,7 @@ def get_lr_schedule(config, opt, epochs=None, steps_per_epoch=None, last_epoch=- pct_start=config["lr_schedule_config"]["onecycle"]["pct_start"] or 0.3, ) elif config["lr_schedule"] == "cosinedecay": - lr_schedule = CosineAnnealingLR( - opt, T_max=steps_per_epoch * epochs, last_epoch=last_batch, eta_min=config["lr"] * 0.1 - ) + lr_schedule = CosineAnnealingLR(opt, T_max=steps_per_epoch * epochs, last_epoch=last_batch, eta_min=config["lr"] * 0.1) else: raise ValueError("Supported values for lr_schedule are 'constant', 'onecycle' and 'cosinedecay'.") return lr_schedule diff --git a/mlpf/pyg_pipeline.py b/mlpf/pyg_pipeline.py index 1ed7116e5..6aa6ab6fd 100644 --- a/mlpf/pyg_pipeline.py +++ b/mlpf/pyg_pipeline.py @@ -27,9 +27,7 @@ parser.add_argument("--prefix", type=str, default=None, help="prefix appended to result dir name") parser.add_argument("--data-dir", type=str, default=None, help="path to `tensorflow_datasets/`") parser.add_argument("--gpus", type=int, default=None, help="to use CPU set to 0; else e.g., 4") -parser.add_argument( - "--gpu-batch-multiplier", type=int, default=None, help="Increase batch size per GPU by this constant factor" -) +parser.add_argument("--gpu-batch-multiplier", type=int, default=None, help="Increase batch size per GPU by this constant factor") parser.add_argument( "--dataset", type=str, @@ -40,9 +38,7 @@ ) parser.add_argument("--num-workers", type=int, default=None, help="number of processes to load the data") parser.add_argument("--prefetch-factor", type=int, default=None, help="number of samples to fetch & prefetch at every call") -parser.add_argument( - "--resume-training", type=str, default=None, help="training dir containing the checkpointed training to resume" -) +parser.add_argument("--resume-training", type=str, default=None, help="training dir containing the checkpointed training to resume") parser.add_argument("--load", type=str, default=None, help="load checkpoint and start new training from epoch 1") parser.add_argument("--train", action="store_true", default=None, help="initiates a training") @@ -57,9 +53,7 @@ help="which graph layer to use", choices=["attention", "gnn_lsh", "mamba"], ) -parser.add_argument( - "--num-convs", type=int, default=None, help="number of cross-particle convolution (GNN, attention, Mamba) layers" -) +parser.add_argument("--num-convs", type=int, default=None, help="number of cross-particle convolution (GNN, attention, Mamba) layers") parser.add_argument("--make-plots", action="store_true", default=None, help="make plots of the test predictions") parser.add_argument("--export-onnx", action="store_true", default=None, help="exports the model to onnx") parser.add_argument("--ntrain", type=int, default=None, help="training samples to use, if None use entire dataset") @@ -94,9 +88,7 @@ ) parser.add_argument("--test-datasets", nargs="+", default=[], help="test samples to process") -parser.add_argument( - "--standardize-input", action="store_true", default=None, help="will standardize the input features before training" -) +parser.add_argument("--standardize-input", action="store_true", default=None, help="will standardize the input features before training") def get_outdir(resume_training, load): diff --git a/parameters/pytorch/pyg-clic-allsamples.yaml b/parameters/pytorch/pyg-clic-allsamples.yaml new file mode 100644 index 000000000..290370b73 --- /dev/null 
+++ b/parameters/pytorch/pyg-clic-allsamples.yaml @@ -0,0 +1,141 @@ +backend: pytorch + +standardize_input: False +save_attention: yes +dataset: clic +sort_data: no +data_dir: +gpus: 1 +gpu_batch_multiplier: 1 +load: +num_epochs: 100 +patience: 20 +lr: 0.0001 +lr_schedule: cosinedecay # constant, cosinedecay, onecycle +conv_type: attention # gnn_lsh, attention, mamba, flashattention +ntrain: +ntest: +nvalid: +num_workers: 0 +prefetch_factor: +checkpoint_freq: +comet_name: particleflow-pt +comet_offline: False +comet_step_freq: 100 +dtype: float32 +val_freq: # run an extra validation run every val_freq training steps + +model: + trainable: all + learned_representation_mode: last #last, concat + input_encoding: split #split, joint + pt_mode: direct-elemtype-split + eta_mode: linear + sin_phi_mode: linear + cos_phi_mode: linear + energy_mode: direct-elemtype-split + + gnn_lsh: + conv_type: gnn_lsh + embedding_dim: 512 + width: 512 + num_convs: 8 + activation: "elu" + # gnn-lsh specific parameters + bin_size: 32 + max_num_bins: 200 + distance_dim: 128 + layernorm: True + num_node_messages: 2 + ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 2 + + attention: + conv_type: attention + num_convs: 4 + dropout_ff: 0.0 + dropout_conv_id_mha: 0.0 + dropout_conv_id_ff: 0.0 + dropout_conv_reg_mha: 0.0 + dropout_conv_reg_ff: 0.0 + activation: "gelu" + head_dim: 32 + num_heads: 32 + attention_type: math + use_pre_layernorm: True + + mamba: + conv_type: mamba + embedding_dim: 128 + width: 128 + num_convs: 2 + dropout: 0.0 + activation: "elu" + # transformer specific paramters + num_heads: 2 + # mamba specific paramters + d_state: 16 + d_conv: 4 + expand: 2 + +lr_schedule_config: + onecycle: + pct_start: 0.3 + +raytune: + local_dir: # Note: please specify an absolute path + sched: # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + clic_edm_qq_pf: + version: 2.1.0 + clic_edm_ww_fullhad_pf/: + version: 2.1.0 + clic_edm_zh_tautau_pf/: + version: 2.1.0 + clic_edm_z_tautau_pf/: + version: 2.1.0 + +valid_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + clic_edm_qq_pf: + version: 2.1.0 + clic_edm_ww_fullhad_pf/: + version: 2.1.0 + clic_edm_zh_tautau_pf/: + version: 2.1.0 + clic_edm_z_tautau_pf/: + version: 2.1.0 + +test_dataset: + clic_edm_ttbar_pf: + version: 2.1.0 + clic_edm_qq_pf: + version: 2.1.0 From 829399013c1e014b7602e5ab6393e8d1b2cb21c3 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:52:32 +0200 Subject: [PATCH 55/66] oopsie --- parameters/pytorch/pyg-clic-allsamples.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/parameters/pytorch/pyg-clic-allsamples.yaml b/parameters/pytorch/pyg-clic-allsamples.yaml index 290370b73..bff38ac5b 100644 --- a/parameters/pytorch/pyg-clic-allsamples.yaml +++ b/parameters/pytorch/pyg-clic-allsamples.yaml @@ -111,11 +111,11 @@ train_dataset: version: 2.1.0 clic_edm_qq_pf: version: 2.1.0 - clic_edm_ww_fullhad_pf/: + clic_edm_ww_fullhad_pf: version: 2.1.0 - clic_edm_zh_tautau_pf/: + clic_edm_zh_tautau_pf: version: 2.1.0 - clic_edm_z_tautau_pf/: + clic_edm_z_tautau_pf: version: 
2.1.0 valid_dataset: @@ -127,11 +127,11 @@ valid_dataset: version: 2.1.0 clic_edm_qq_pf: version: 2.1.0 - clic_edm_ww_fullhad_pf/: + clic_edm_ww_fullhad_pf: version: 2.1.0 - clic_edm_zh_tautau_pf/: + clic_edm_zh_tautau_pf: version: 2.1.0 - clic_edm_z_tautau_pf/: + clic_edm_z_tautau_pf: version: 2.1.0 test_dataset: From 687b5d71ed396c9b0d36d8c7082d85ce5642c3e1 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:59:33 +0200 Subject: [PATCH 56/66] up --- parameters/pytorch/pyg-clic-f.yaml | 129 +++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 parameters/pytorch/pyg-clic-f.yaml diff --git a/parameters/pytorch/pyg-clic-f.yaml b/parameters/pytorch/pyg-clic-f.yaml new file mode 100644 index 000000000..3d061e6ec --- /dev/null +++ b/parameters/pytorch/pyg-clic-f.yaml @@ -0,0 +1,129 @@ +backend: pytorch + +standardize_input: False +save_attention: yes +dataset: clic +sort_data: no +data_dir: +gpus: 1 +gpu_batch_multiplier: 1 +load: +num_epochs: 100 +patience: 20 +lr: 0.0001 +lr_schedule: cosinedecay # constant, cosinedecay, onecycle +conv_type: attention # gnn_lsh, attention, mamba, flashattention +ntrain: +ntest: +nvalid: +num_workers: 0 +prefetch_factor: +checkpoint_freq: +comet_name: particleflow-pt +comet_offline: False +comet_step_freq: 100 +dtype: float32 +val_freq: # run an extra validation run every val_freq training steps + +model: + trainable: all + learned_representation_mode: last #last, concat + input_encoding: split #split, joint + pt_mode: direct-elemtype-split + eta_mode: linear + sin_phi_mode: linear + cos_phi_mode: linear + energy_mode: direct-elemtype-split + + gnn_lsh: + conv_type: gnn_lsh + embedding_dim: 512 + width: 512 + num_convs: 8 + activation: "elu" + # gnn-lsh specific parameters + bin_size: 32 + max_num_bins: 200 + distance_dim: 128 + layernorm: True + num_node_messages: 2 + ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 2 + + attention: + conv_type: attention + num_convs: 8 + dropout_ff: 0.0 + dropout_conv_id_mha: 0.0 + dropout_conv_id_ff: 0.0 + dropout_conv_reg_mha: 0.0 + dropout_conv_reg_ff: 0.0 + activation: "relu" + head_dim: 64 + num_heads: 12 + attention_type: math + use_pre_layernorm: True + + mamba: + conv_type: mamba + embedding_dim: 128 + width: 128 + num_convs: 2 + dropout: 0.0 + activation: "elu" + # transformer specific paramters + num_heads: 2 + # mamba specific paramters + d_state: 16 + d_conv: 4 + expand: 2 + +lr_schedule_config: + onecycle: + pct_start: 0.3 + +raytune: + local_dir: # Note: please specify an absolute path + sched: # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + clic_edm_qq_pf: + version: 2.1.0 + +valid_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + clic_edm_qq_pf: + version: 2.1.0 + +test_dataset: + clic_edm_ttbar_pf: + version: 2.1.0 + clic_edm_qq_pf: + version: 2.1.0 From efdb489d3d68ae8c7d9fabf580c60c46ce116695 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 17:27:15 +0200 Subject: [PATCH 57/66] pca --- parameters/pytorch/pyg-clic-allsamples.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/parameters/pytorch/pyg-clic-allsamples.yaml b/parameters/pytorch/pyg-clic-allsamples.yaml index bff38ac5b..9b240c9fa 100644 --- a/parameters/pytorch/pyg-clic-allsamples.yaml +++ b/parameters/pytorch/pyg-clic-allsamples.yaml @@ -132,7 +132,7 @@ valid_dataset: clic_edm_zh_tautau_pf: version: 2.1.0 clic_edm_z_tautau_pf: - version: 2.1.0 + version: 2.1.0 test_dataset: clic_edm_ttbar_pf: From df1ecbad3da484b0f22cf0fec1730d7c4956e95a Mon Sep 17 00:00:00 2001 From: Farouk Date: Tue, 24 Sep 2024 11:18:15 +0200 Subject: [PATCH 58/66] up configs --- parameters/pytorch/pyg-clic-f.yaml | 129 ---------------------- parameters/pytorch/pyg-clic-ttbar-21.yaml | 2 +- parameters/pytorch/pyg-clic-ttbar-22.yaml | 2 +- 3 files changed, 2 insertions(+), 131 deletions(-) delete mode 100644 parameters/pytorch/pyg-clic-f.yaml diff --git a/parameters/pytorch/pyg-clic-f.yaml b/parameters/pytorch/pyg-clic-f.yaml deleted file mode 100644 index 3d061e6ec..000000000 --- a/parameters/pytorch/pyg-clic-f.yaml +++ /dev/null @@ -1,129 +0,0 @@ -backend: pytorch - -standardize_input: False -save_attention: yes -dataset: clic -sort_data: no -data_dir: -gpus: 1 -gpu_batch_multiplier: 1 -load: -num_epochs: 100 -patience: 20 -lr: 0.0001 -lr_schedule: cosinedecay # constant, cosinedecay, onecycle -conv_type: attention # gnn_lsh, attention, mamba, flashattention -ntrain: -ntest: -nvalid: -num_workers: 0 -prefetch_factor: -checkpoint_freq: -comet_name: particleflow-pt -comet_offline: False -comet_step_freq: 100 -dtype: float32 -val_freq: # run an extra validation run every val_freq training steps - -model: - trainable: all - learned_representation_mode: last #last, concat - input_encoding: split #split, joint - pt_mode: direct-elemtype-split - eta_mode: linear - sin_phi_mode: linear - cos_phi_mode: linear - energy_mode: direct-elemtype-split - - gnn_lsh: - conv_type: gnn_lsh - embedding_dim: 512 - width: 512 - num_convs: 8 - activation: "elu" - # gnn-lsh specific parameters - bin_size: 32 - max_num_bins: 200 - distance_dim: 128 - layernorm: True - num_node_messages: 2 - ffn_dist_hidden_dim: 128 - ffn_dist_num_layers: 2 - - attention: - conv_type: attention - num_convs: 8 - dropout_ff: 0.0 - dropout_conv_id_mha: 0.0 - dropout_conv_id_ff: 0.0 - dropout_conv_reg_mha: 0.0 - dropout_conv_reg_ff: 0.0 - activation: "relu" - head_dim: 64 - num_heads: 12 - attention_type: math - use_pre_layernorm: True - - mamba: - conv_type: mamba - embedding_dim: 128 - width: 128 - num_convs: 2 - dropout: 0.0 - activation: "elu" - # transformer specific paramters - num_heads: 2 - # mamba specific paramters - d_state: 16 - d_conv: 4 - expand: 2 - -lr_schedule_config: - onecycle: - pct_start: 0.3 - -raytune: - local_dir: # Note: please specify an absolute path - sched: # asha, hyperband - search_alg: # bayes, bohb, hyperopt, nevergrad, scikit - default_metric: "val_loss" - default_mode: "min" - # Tune schedule specific parameters - asha: - max_t: 200 - reduction_factor: 4 - brackets: 1 - grace_period: 10 - hyperband: - max_t: 200 - reduction_factor: 4 - hyperopt: - n_random_steps: 10 - nevergrad: - n_random_steps: 10 - -train_dataset: - clic: - physical: - batch_size: 1 - samples: - clic_edm_ttbar_pf: - version: 2.1.0 - clic_edm_qq_pf: - version: 2.1.0 - -valid_dataset: - clic: - physical: - batch_size: 1 - samples: - clic_edm_ttbar_pf: - version: 2.1.0 - clic_edm_qq_pf: - version: 2.1.0 - -test_dataset: - clic_edm_ttbar_pf: - version: 2.1.0 - clic_edm_qq_pf: - version: 2.1.0 diff --git a/parameters/pytorch/pyg-clic-ttbar-21.yaml 
b/parameters/pytorch/pyg-clic-ttbar-21.yaml index 6aea54096..ae6b5a01e 100644 --- a/parameters/pytorch/pyg-clic-ttbar-21.yaml +++ b/parameters/pytorch/pyg-clic-ttbar-21.yaml @@ -1,6 +1,6 @@ backend: pytorch -standardize_inputs: False +standardize_inputs: True save_attention: yes dataset: clic sort_data: no diff --git a/parameters/pytorch/pyg-clic-ttbar-22.yaml b/parameters/pytorch/pyg-clic-ttbar-22.yaml index 1512a6b3e..39e3e8247 100644 --- a/parameters/pytorch/pyg-clic-ttbar-22.yaml +++ b/parameters/pytorch/pyg-clic-ttbar-22.yaml @@ -1,6 +1,6 @@ backend: pytorch -standardize_inputs: False +standardize_inputs: True save_attention: yes dataset: clic sort_data: no From d6252c05c0a6d246af849b327566ef4133d45bd5 Mon Sep 17 00:00:00 2001 From: Farouk Date: Tue, 24 Sep 2024 11:19:36 +0200 Subject: [PATCH 59/66] up --- parameters/pytorch/pyg-clic-ttbar-21.yaml | 8 ++++---- parameters/pytorch/pyg-clic-ttbar-22.yaml | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/parameters/pytorch/pyg-clic-ttbar-21.yaml b/parameters/pytorch/pyg-clic-ttbar-21.yaml index ae6b5a01e..3d0f9cc64 100644 --- a/parameters/pytorch/pyg-clic-ttbar-21.yaml +++ b/parameters/pytorch/pyg-clic-ttbar-21.yaml @@ -52,15 +52,15 @@ model: attention: conv_type: attention - num_convs: 8 + num_convs: 4 dropout_ff: 0.0 dropout_conv_id_mha: 0.0 dropout_conv_id_ff: 0.0 dropout_conv_reg_mha: 0.0 dropout_conv_reg_ff: 0.0 - activation: "relu" - head_dim: 64 - num_heads: 12 + activation: "gelu" + head_dim: 32 + num_heads: 32 attention_type: math use_pre_layernorm: True diff --git a/parameters/pytorch/pyg-clic-ttbar-22.yaml b/parameters/pytorch/pyg-clic-ttbar-22.yaml index 39e3e8247..3f920fe2b 100644 --- a/parameters/pytorch/pyg-clic-ttbar-22.yaml +++ b/parameters/pytorch/pyg-clic-ttbar-22.yaml @@ -52,15 +52,15 @@ model: attention: conv_type: attention - num_convs: 8 + num_convs: 4 dropout_ff: 0.0 dropout_conv_id_mha: 0.0 dropout_conv_id_ff: 0.0 dropout_conv_reg_mha: 0.0 dropout_conv_reg_ff: 0.0 - activation: "relu" - head_dim: 64 - num_heads: 12 + activation: "gelu" + head_dim: 32 + num_heads: 32 attention_type: math use_pre_layernorm: True From 8d3d685f58fd8e11d858dd454384435c5451c2b3 Mon Sep 17 00:00:00 2001 From: Farouk Date: Tue, 24 Sep 2024 11:20:39 +0200 Subject: [PATCH 60/66] up --- parameters/pytorch/pyg-clic-ttbar-21.yaml | 2 +- parameters/pytorch/pyg-clic-ttbar-22.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/parameters/pytorch/pyg-clic-ttbar-21.yaml b/parameters/pytorch/pyg-clic-ttbar-21.yaml index 3d0f9cc64..376f2b461 100644 --- a/parameters/pytorch/pyg-clic-ttbar-21.yaml +++ b/parameters/pytorch/pyg-clic-ttbar-21.yaml @@ -1,6 +1,6 @@ backend: pytorch -standardize_inputs: True +standardize_input: True save_attention: yes dataset: clic sort_data: no diff --git a/parameters/pytorch/pyg-clic-ttbar-22.yaml b/parameters/pytorch/pyg-clic-ttbar-22.yaml index 3f920fe2b..0fc73c684 100644 --- a/parameters/pytorch/pyg-clic-ttbar-22.yaml +++ b/parameters/pytorch/pyg-clic-ttbar-22.yaml @@ -1,6 +1,6 @@ backend: pytorch -standardize_inputs: True +standardize_input: True save_attention: yes dataset: clic sort_data: no From 1aa21a93d9a268f039636f811210f06a38d9b5e3 Mon Sep 17 00:00:00 2001 From: Farouk Date: Tue, 24 Sep 2024 11:32:54 +0200 Subject: [PATCH 61/66] up configs --- .../pytorch/pyg-clic-ttbar-21-joint.yaml | 123 ++++++++++++++++++ .../pytorch/pyg-clic-ttbar-22-joint.yaml | 123 ++++++++++++++++++ 2 files changed, 246 insertions(+) create mode 100644 
parameters/pytorch/pyg-clic-ttbar-21-joint.yaml create mode 100644 parameters/pytorch/pyg-clic-ttbar-22-joint.yaml diff --git a/parameters/pytorch/pyg-clic-ttbar-21-joint.yaml b/parameters/pytorch/pyg-clic-ttbar-21-joint.yaml new file mode 100644 index 000000000..915c6cb91 --- /dev/null +++ b/parameters/pytorch/pyg-clic-ttbar-21-joint.yaml @@ -0,0 +1,123 @@ +backend: pytorch + +standardize_input: True +save_attention: yes +dataset: clic +sort_data: no +data_dir: +gpus: 1 +gpu_batch_multiplier: 1 +load: +num_epochs: 100 +patience: 20 +lr: 0.0001 +lr_schedule: cosinedecay # constant, cosinedecay, onecycle +conv_type: attention # gnn_lsh, attention, mamba, flashattention +ntrain: +ntest: +nvalid: +num_workers: 0 +prefetch_factor: +checkpoint_freq: +comet_name: particleflow-pt +comet_offline: False +comet_step_freq: 100 +dtype: float32 +val_freq: # run an extra validation run every val_freq training steps + +model: + trainable: all + learned_representation_mode: last #last, concat + input_encoding: joint #split, joint + pt_mode: direct-elemtype-split + eta_mode: linear + sin_phi_mode: linear + cos_phi_mode: linear + energy_mode: direct-elemtype-split + + gnn_lsh: + conv_type: gnn_lsh + embedding_dim: 512 + width: 512 + num_convs: 8 + activation: "elu" + # gnn-lsh specific parameters + bin_size: 32 + max_num_bins: 200 + distance_dim: 128 + layernorm: True + num_node_messages: 2 + ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 2 + + attention: + conv_type: attention + num_convs: 4 + dropout_ff: 0.0 + dropout_conv_id_mha: 0.0 + dropout_conv_id_ff: 0.0 + dropout_conv_reg_mha: 0.0 + dropout_conv_reg_ff: 0.0 + activation: "gelu" + head_dim: 32 + num_heads: 32 + attention_type: math + use_pre_layernorm: True + + mamba: + conv_type: mamba + embedding_dim: 128 + width: 128 + num_convs: 2 + dropout: 0.0 + activation: "elu" + # transformer specific paramters + num_heads: 2 + # mamba specific paramters + d_state: 16 + d_conv: 4 + expand: 2 + +lr_schedule_config: + onecycle: + pct_start: 0.3 + +raytune: + local_dir: # Note: please specify an absolute path + sched: # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + +valid_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + +test_dataset: + clic_edm_ttbar_pf: + version: 2.1.0 diff --git a/parameters/pytorch/pyg-clic-ttbar-22-joint.yaml b/parameters/pytorch/pyg-clic-ttbar-22-joint.yaml new file mode 100644 index 000000000..19c15e8d7 --- /dev/null +++ b/parameters/pytorch/pyg-clic-ttbar-22-joint.yaml @@ -0,0 +1,123 @@ +backend: pytorch + +standardize_input: True +save_attention: yes +dataset: clic +sort_data: no +data_dir: +gpus: 1 +gpu_batch_multiplier: 1 +load: +num_epochs: 100 +patience: 20 +lr: 0.0001 +lr_schedule: cosinedecay # constant, cosinedecay, onecycle +conv_type: attention # gnn_lsh, attention, mamba, flashattention +ntrain: +ntest: +nvalid: +num_workers: 0 +prefetch_factor: +checkpoint_freq: +comet_name: particleflow-pt +comet_offline: False +comet_step_freq: 100 +dtype: float32 +val_freq: # run an extra validation run every val_freq training steps + +model: + trainable: all + 
learned_representation_mode: last #last, concat + input_encoding: joint #split, joint + pt_mode: direct-elemtype-split + eta_mode: linear + sin_phi_mode: linear + cos_phi_mode: linear + energy_mode: direct-elemtype-split + + gnn_lsh: + conv_type: gnn_lsh + embedding_dim: 512 + width: 512 + num_convs: 8 + activation: "elu" + # gnn-lsh specific parameters + bin_size: 32 + max_num_bins: 200 + distance_dim: 128 + layernorm: True + num_node_messages: 2 + ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 2 + + attention: + conv_type: attention + num_convs: 4 + dropout_ff: 0.0 + dropout_conv_id_mha: 0.0 + dropout_conv_id_ff: 0.0 + dropout_conv_reg_mha: 0.0 + dropout_conv_reg_ff: 0.0 + activation: "gelu" + head_dim: 32 + num_heads: 32 + attention_type: math + use_pre_layernorm: True + + mamba: + conv_type: mamba + embedding_dim: 128 + width: 128 + num_convs: 2 + dropout: 0.0 + activation: "elu" + # transformer specific paramters + num_heads: 2 + # mamba specific paramters + d_state: 16 + d_conv: 4 + expand: 2 + +lr_schedule_config: + onecycle: + pct_start: 0.3 + +raytune: + local_dir: # Note: please specify an absolute path + sched: # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.2.0 + +valid_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.2.0 + +test_dataset: + clic_edm_ttbar_pf: + version: 2.2.0 From 832df1ca159934314a366a7c977191b60948753c Mon Sep 17 00:00:00 2001 From: Farouk Date: Tue, 24 Sep 2024 11:54:56 +0200 Subject: [PATCH 62/66] try new loss --- mlpf/pyg/training.py | 102 ++++++++++++++++++++++++++++--------------- 1 file changed, 68 insertions(+), 34 deletions(-) diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index c526fcb0e..79b0b71c9 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -73,7 +73,7 @@ def sliced_wasserstein_loss(y_pred, y_true, num_projections=200): return ret -def mlpf_loss(y, ypred, batch): +def mlpf_loss(y, ypred, batch, epoch): """ Args y [dict]: relevant keys are "cls_id, momentum, charge" @@ -97,7 +97,9 @@ def mlpf_loss(y, ypred, batch): # binary loss for particle / no-particle classification # loss_binary_classification = loss_obj_id(ypred["cls_binary"], (y["cls_id"] != 0).long()).reshape(y["cls_id"].shape) - loss_binary_classification = 10 * torch.nn.functional.cross_entropy(ypred["cls_binary"], (y["cls_id"] != 0).long(), reduction="none") + loss_binary_classification = 10 * torch.nn.functional.cross_entropy( + ypred["cls_binary"], (y["cls_id"] != 0).long(), reduction="none" + ) # compare the particle type, only for cases where there was a true particle loss_pid_classification = loss_obj_id(ypred["cls_id_onehot"], y["cls_id"]).reshape(y["cls_id"].shape) @@ -145,27 +147,31 @@ def mlpf_loss(y, ypred, batch): pred_met = torch.sqrt(torch.sum(pred_px, axis=-2) ** 2 + torch.sum(pred_py, axis=-2) ** 2) loss["MET"] = torch.nn.functional.huber_loss(pred_met.squeeze(dim=-1), batch.genmet).mean() - was_input_pred = torch.concat([torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1) * batch.mask.unsqueeze( - axis=-1 - ) - was_input_true = 
torch.concat([torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long)), y["momentum"]], axis=-1) * batch.mask.unsqueeze( - axis=-1 - ) + was_input_pred = torch.concat( + [torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1 + ) * batch.mask.unsqueeze(axis=-1) + was_input_true = torch.concat( + [torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long)), y["momentum"]], axis=-1 + ) * batch.mask.unsqueeze(axis=-1) # standardize Wasserstein loss std = was_input_true[batch.mask].std(axis=0) loss["Sliced_Wasserstein_Loss"] = sliced_wasserstein_loss(was_input_pred / std, was_input_true / std).mean() - # this is the final loss to be optimized - loss["Total"] = ( - loss["Classification_binary"] - + loss["Classification"] - + loss["Regression_pt"] - + loss["Regression_eta"] - + loss["Regression_sin_phi"] - + loss["Regression_cos_phi"] - + loss["Regression_energy"] - ) + loss["Total"] = loss["Classification_binary"] + + if epoch >= 2: + loss["Total"] += loss["Classification"] + + elif epoch >= 4: + # this is the final loss to be optimized + loss["Total"] = ( + +loss["Regression_pt"] + + loss["Regression_eta"] + + loss["Regression_sin_phi"] + + loss["Regression_cos_phi"] + + loss["Regression_energy"] + ) # store these separately but detached loss["Classification_binary"] = loss["Classification_binary"].detach() @@ -192,7 +198,9 @@ class FocalLoss(nn.Module): - y: (batch_size,) or (batch_size, d1, d2, ..., dK), K > 0. """ - def __init__(self, alpha: Optional[Tensor] = None, gamma: float = 0.0, reduction: str = "mean", ignore_index: int = -100): + def __init__( + self, alpha: Optional[Tensor] = None, gamma: float = 0.0, reduction: str = "mean", ignore_index: int = -100 + ): """Constructor. Args: alpha (Tensor, optional): Weights for each class. Defaults to None. 
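The hunks above turn the single combined objective into an epoch-gated curriculum: the binary particle/no-particle term is always optimized, the PID classification term is switched on after a couple of epochs, and the momentum regression terms only later (the thresholds are retuned in the next two commits and the scheme is dropped again in the final one). A minimal sketch of such a cumulative schedule follows; the helper name staged_total_loss and the concrete thresholds are illustrative assumptions rather than code from the patch, and the gates are written as independent if statements so a later stage is not shadowed by an earlier one.

import torch

def staged_total_loss(loss: dict, epoch: int) -> torch.Tensor:
    # stage 0: always optimize the particle / no-particle binary classifier
    total = loss["Classification_binary"]
    # stage 1: after a few epochs, also optimize the particle-type classifier
    if epoch >= 2:
        total = total + loss["Classification"]
    # stage 2: later still, add the momentum regression terms on top
    if epoch >= 4:
        total = total + (
            loss["Regression_pt"]
            + loss["Regression_eta"]
            + loss["Regression_sin_phi"]
            + loss["Regression_cos_phi"]
            + loss["Regression_energy"]
        )
    return total

# hypothetical usage inside mlpf_loss: loss["Total"] = staged_total_loss(loss, epoch)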
@@ -382,18 +390,30 @@ def validation_plots(batch, ypred_raw, ygen, ypred, tensorboard_writer, epoch, o ratio = (ypred_raw[2][batch.mask][:, 1] / batch.ygen[batch.mask][:, 3])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("eta_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("sphi_target", torch.clamp(batch.ygen[batch.mask][:, 4], -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("sphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 2], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram( + "sphi_target", torch.clamp(batch.ygen[batch.mask][:, 4], -10, 10), global_step=epoch + ) + tensorboard_writer.add_histogram( + "sphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 2], -10, 10), global_step=epoch + ) ratio = (ypred_raw[2][batch.mask][:, 2] / batch.ygen[batch.mask][:, 4])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("sphi_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("cphi_target", torch.clamp(batch.ygen[batch.mask][:, 5], -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("cphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 3], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram( + "cphi_target", torch.clamp(batch.ygen[batch.mask][:, 5], -10, 10), global_step=epoch + ) + tensorboard_writer.add_histogram( + "cphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 3], -10, 10), global_step=epoch + ) ratio = (ypred_raw[2][batch.mask][:, 3] / batch.ygen[batch.mask][:, 5])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("cphi_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("energy_target", torch.clamp(batch.ygen[batch.mask][:, 6], -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("energy_pred", torch.clamp(ypred_raw[2][batch.mask][:, 4], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram( + "energy_target", torch.clamp(batch.ygen[batch.mask][:, 6], -10, 10), global_step=epoch + ) + tensorboard_writer.add_histogram( + "energy_pred", torch.clamp(ypred_raw[2][batch.mask][:, 4], -10, 10), global_step=epoch + ) ratio = (ypred_raw[2][batch.mask][:, 4] / batch.ygen[batch.mask][:, 6])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("energy_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) @@ -457,7 +477,9 @@ def train_and_valid( if (world_size > 1) and (rank != 0): iterator = enumerate(data_loader) else: - iterator = tqdm.tqdm(enumerate(data_loader), total=len(data_loader), desc=f"Epoch {epoch} {train_or_valid} loop on rank={rank}") + iterator = tqdm.tqdm( + enumerate(data_loader), total=len(data_loader), desc=f"Epoch {epoch} {train_or_valid} loop on rank={rank}" + ) device_type = "cuda" if isinstance(rank, int) else "cpu" @@ -511,12 +533,12 @@ def train_and_valid( # validation_plots(batch, ypred_raw, ygen, ypred, tensorboard_writer, epoch, outdir) with torch.autocast(device_type=device_type, dtype=dtype, enabled=device_type == "cuda"): if is_train: - loss = mlpf_loss(ygen, ypred, batch) + loss = mlpf_loss(ygen, ypred, batch, epoch) for param in model.parameters(): param.grad = None else: with torch.no_grad(): - loss = mlpf_loss(ygen, ypred, batch) + loss = mlpf_loss(ygen, ypred, batch, epoch) if is_train: loss["Total"].backward() @@ -715,7 +737,9 @@ def train_mlpf( # training step, edit here to profile a specific epoch if epoch == -1: - with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], 
record_shapes=True, with_stack=True) as prof: + with profile( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, with_stack=True + ) as prof: with record_function("model_train"): losses_t = train_and_valid( rank, @@ -1017,7 +1041,9 @@ def run(rank, world_size, config, args, outdir, logfile): _logger.info(f"Model directory {outdir}", color="bold") if args.comet: - comet_experiment = create_comet_experiment(config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir) + comet_experiment = create_comet_experiment( + config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir + ) comet_experiment.set_name(f"rank_{rank}_{Path(outdir).name}") comet_experiment.log_parameter("run_id", Path(outdir).name) comet_experiment.log_parameter("world_size", world_size) @@ -1272,7 +1298,9 @@ def train_ray_trial(config, args, outdir=None): loaders = get_interleaved_dataloaders(world_size, rank, config, use_cuda, use_ray=True) if args.comet: - comet_experiment = create_comet_experiment(config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir) + comet_experiment = create_comet_experiment( + config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir + ) comet_experiment.set_name(f"world_rank_{world_rank}_{Path(outdir).name}") comet_experiment.log_parameter("run_id", Path(outdir).name) comet_experiment.log_parameter("world_size", world_size) @@ -1306,7 +1334,9 @@ def train_ray_trial(config, args, outdir=None): if args.resume_training: model, optimizer = load_checkpoint(checkpoint, model, optimizer) start_epoch = checkpoint["extra_state"]["epoch"] + 1 - lr_schedule = get_lr_schedule(config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch=start_epoch - 1) + lr_schedule = get_lr_schedule( + config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch=start_epoch - 1 + ) else: # start a new training with model weights loaded from a pre-trained model model = load_checkpoint(checkpoint, model) @@ -1470,7 +1500,9 @@ def run_hpo(config, args): if tune.Tuner.can_restore(str(expdir)): # resume unfinished HPO run - tuner = tune.Tuner.restore(str(expdir), trainable=trainer, resume_errored=True, restart_errored=False, resume_unfinished=True) + tuner = tune.Tuner.restore( + str(expdir), trainable=trainer, resume_errored=True, restart_errored=False, resume_unfinished=True + ) else: # start new HPO run search_space = {"train_loop_config": search_space} # the ray TorchTrainer only takes a single arg: train_loop_config @@ -1511,4 +1543,6 @@ def run_hpo(config, args): print(result_df.columns) logging.info("Total time of Tuner.fit(): {}".format(end - start)) - logging.info("Best hyperparameters found according to {} were: {}".format(config["raytune"]["default_metric"], best_config)) + logging.info( + "Best hyperparameters found according to {} were: {}".format(config["raytune"]["default_metric"], best_config) + ) From e336230d1b8a1485ec9875a03c7375dddb801239 Mon Sep 17 00:00:00 2001 From: Farouk Date: Tue, 24 Sep 2024 12:01:13 +0200 Subject: [PATCH 63/66] up --- mlpf/pyg/training.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index 79b0b71c9..8fb0863e8 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -160,10 +160,10 @@ def mlpf_loss(y, ypred, batch, epoch): loss["Total"] = loss["Classification_binary"] - if epoch >= 2: + if epoch >= 3: loss["Total"] += loss["Classification"] - elif epoch >= 4: + elif epoch >= 6: # this is the final 
loss to be optimized loss["Total"] = ( +loss["Regression_pt"] @@ -906,9 +906,10 @@ def train_mlpf( + losses_t["Regression_cos_phi"] + losses_t["Regression_energy"] ) + log_tot = losses_t["Classification"] + losses_t["Classification_binary"] + log_t _logger.info( - f"train: loss_total={losses_t['Total']:.4f} " + f"train: loss_total={log_tot:.4f} " + f"loss_clf={losses_t['Classification']:.4f} " + f"loss_clfbinary={losses_t['Classification_binary']:.4f} " + f"loss_reg={log_t:.4f} ", @@ -922,9 +923,10 @@ def train_mlpf( + losses_v["Regression_cos_phi"] + losses_v["Regression_energy"] ) + log_tot = losses_v["Classification"] + losses_v["Classification_binary"] + log_v _logger.info( - f"valid: loss_total={losses_v['Total']:.4f} " + f"valid: loss_total={log_tot:.4f} " + f"loss_clf={losses_v['Classification']:.4f} " + f"loss_clfbinary={losses_v['Classification_binary']:.4f} " + f"loss_reg={log_v:.4f} ", From 84957d88cd135e582dba567271fd6de7ce838a58 Mon Sep 17 00:00:00 2001 From: Farouk Date: Tue, 24 Sep 2024 12:24:14 +0200 Subject: [PATCH 64/66] up --- mlpf/pyg/training.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index 8fb0863e8..8a4dea1dc 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -158,15 +158,11 @@ def mlpf_loss(y, ypred, batch, epoch): std = was_input_true[batch.mask].std(axis=0) loss["Sliced_Wasserstein_Loss"] = sliced_wasserstein_loss(was_input_pred / std, was_input_true / std).mean() - loss["Total"] = loss["Classification_binary"] - - if epoch >= 3: - loss["Total"] += loss["Classification"] - - elif epoch >= 6: + loss["Total"] = loss["Classification_binary"] + loss["Classification"] + if epoch >= 5: # this is the final loss to be optimized - loss["Total"] = ( - +loss["Regression_pt"] + loss["Total"] += ( + loss["Regression_pt"] + loss["Regression_eta"] + loss["Regression_sin_phi"] + loss["Regression_cos_phi"] From 91c1fac67ff2528ea35750c2bea49bfdb58f2b72 Mon Sep 17 00:00:00 2001 From: Farouk Date: Tue, 24 Sep 2024 12:48:46 +0200 Subject: [PATCH 65/66] up --- mlpf/pyg/training.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index 8a4dea1dc..15ced511b 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -158,16 +158,16 @@ def mlpf_loss(y, ypred, batch, epoch): std = was_input_true[batch.mask].std(axis=0) loss["Sliced_Wasserstein_Loss"] = sliced_wasserstein_loss(was_input_pred / std, was_input_true / std).mean() - loss["Total"] = loss["Classification_binary"] + loss["Classification"] - if epoch >= 5: - # this is the final loss to be optimized - loss["Total"] += ( - loss["Regression_pt"] - + loss["Regression_eta"] - + loss["Regression_sin_phi"] - + loss["Regression_cos_phi"] - + loss["Regression_energy"] - ) + # this is the final loss to be optimized + loss["Total"] = ( + loss["Classification_binary"] + + loss["Classification"] + + loss["Regression_pt"] + + loss["Regression_eta"] + + loss["Regression_sin_phi"] + + loss["Regression_cos_phi"] + + loss["Regression_energy"] + ) # store these separately but detached loss["Classification_binary"] = loss["Classification_binary"].detach() From cc523e0fbdd0275665a117d2190407bfca21aff7 Mon Sep 17 00:00:00 2001 From: Farouk Date: Tue, 24 Sep 2024 15:48:14 +0200 Subject: [PATCH 66/66] fix pca --- mlpf/pyg/training.py | 72 +++++++++++++------------------------------- 1 file changed, 21 insertions(+), 51 deletions(-) diff --git a/mlpf/pyg/training.py 
b/mlpf/pyg/training.py index 15ced511b..2e1432125 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -97,9 +97,7 @@ def mlpf_loss(y, ypred, batch, epoch): # binary loss for particle / no-particle classification # loss_binary_classification = loss_obj_id(ypred["cls_binary"], (y["cls_id"] != 0).long()).reshape(y["cls_id"].shape) - loss_binary_classification = 10 * torch.nn.functional.cross_entropy( - ypred["cls_binary"], (y["cls_id"] != 0).long(), reduction="none" - ) + loss_binary_classification = 10 * torch.nn.functional.cross_entropy(ypred["cls_binary"], (y["cls_id"] != 0).long(), reduction="none") # compare the particle type, only for cases where there was a true particle loss_pid_classification = loss_obj_id(ypred["cls_id_onehot"], y["cls_id"]).reshape(y["cls_id"].shape) @@ -147,12 +145,12 @@ def mlpf_loss(y, ypred, batch, epoch): pred_met = torch.sqrt(torch.sum(pred_px, axis=-2) ** 2 + torch.sum(pred_py, axis=-2) ** 2) loss["MET"] = torch.nn.functional.huber_loss(pred_met.squeeze(dim=-1), batch.genmet).mean() - was_input_pred = torch.concat( - [torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1 - ) * batch.mask.unsqueeze(axis=-1) - was_input_true = torch.concat( - [torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long)), y["momentum"]], axis=-1 - ) * batch.mask.unsqueeze(axis=-1) + was_input_pred = torch.concat([torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1) * batch.mask.unsqueeze( + axis=-1 + ) + was_input_true = torch.concat([torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long)), y["momentum"]], axis=-1) * batch.mask.unsqueeze( + axis=-1 + ) # standardize Wasserstein loss std = was_input_true[batch.mask].std(axis=0) @@ -194,9 +192,7 @@ class FocalLoss(nn.Module): - y: (batch_size,) or (batch_size, d1, d2, ..., dK), K > 0. """ - def __init__( - self, alpha: Optional[Tensor] = None, gamma: float = 0.0, reduction: str = "mean", ignore_index: int = -100 - ): + def __init__(self, alpha: Optional[Tensor] = None, gamma: float = 0.0, reduction: str = "mean", ignore_index: int = -100): """Constructor. Args: alpha (Tensor, optional): Weights for each class. Defaults to None. 
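The Sliced_Wasserstein_Loss term in the hunk above compares the full predicted and target particle sets (softmaxed class probabilities concatenated with the regressed momentum features), after dividing both by the per-feature standard deviation of the targets so that no single feature dominates the random projections. Its implementation is not part of this diff; the sketch below is one standard formulation written under that assumption, keeping only the num_projections=200 default that is visible in the function signature earlier in the file.

import torch

def sliced_wasserstein_loss(y_pred: torch.Tensor, y_true: torch.Tensor, num_projections: int = 200) -> torch.Tensor:
    # y_pred, y_true: (batch, num_particles, num_features)
    num_features = y_true.shape[-1]
    # random unit directions in feature space
    proj = torch.randn(num_features, num_projections, dtype=y_true.dtype, device=y_true.device)
    proj = proj / proj.norm(dim=0, keepdim=True)
    # project both particle sets: (batch, num_particles, num_projections)
    pred_proj = y_pred @ proj
    true_proj = y_true @ proj
    # 1D Wasserstein distance per projection, approximated by sorting along the particle axis
    pred_sorted, _ = torch.sort(pred_proj, dim=1)
    true_sorted, _ = torch.sort(true_proj, dim=1)
    # mean over particles and projections, one value per event (the caller takes .mean())
    return torch.mean((pred_sorted - true_sorted) ** 2, dim=(1, 2))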
@@ -386,30 +382,18 @@ def validation_plots(batch, ypred_raw, ygen, ypred, tensorboard_writer, epoch, o ratio = (ypred_raw[2][batch.mask][:, 1] / batch.ygen[batch.mask][:, 3])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("eta_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram( - "sphi_target", torch.clamp(batch.ygen[batch.mask][:, 4], -10, 10), global_step=epoch - ) - tensorboard_writer.add_histogram( - "sphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 2], -10, 10), global_step=epoch - ) + tensorboard_writer.add_histogram("sphi_target", torch.clamp(batch.ygen[batch.mask][:, 4], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram("sphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 2], -10, 10), global_step=epoch) ratio = (ypred_raw[2][batch.mask][:, 2] / batch.ygen[batch.mask][:, 4])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("sphi_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram( - "cphi_target", torch.clamp(batch.ygen[batch.mask][:, 5], -10, 10), global_step=epoch - ) - tensorboard_writer.add_histogram( - "cphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 3], -10, 10), global_step=epoch - ) + tensorboard_writer.add_histogram("cphi_target", torch.clamp(batch.ygen[batch.mask][:, 5], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram("cphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 3], -10, 10), global_step=epoch) ratio = (ypred_raw[2][batch.mask][:, 3] / batch.ygen[batch.mask][:, 5])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("cphi_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram( - "energy_target", torch.clamp(batch.ygen[batch.mask][:, 6], -10, 10), global_step=epoch - ) - tensorboard_writer.add_histogram( - "energy_pred", torch.clamp(ypred_raw[2][batch.mask][:, 4], -10, 10), global_step=epoch - ) + tensorboard_writer.add_histogram("energy_target", torch.clamp(batch.ygen[batch.mask][:, 6], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram("energy_pred", torch.clamp(ypred_raw[2][batch.mask][:, 4], -10, 10), global_step=epoch) ratio = (ypred_raw[2][batch.mask][:, 4] / batch.ygen[batch.mask][:, 6])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("energy_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) @@ -473,9 +457,7 @@ def train_and_valid( if (world_size > 1) and (rank != 0): iterator = enumerate(data_loader) else: - iterator = tqdm.tqdm( - enumerate(data_loader), total=len(data_loader), desc=f"Epoch {epoch} {train_or_valid} loop on rank={rank}" - ) + iterator = tqdm.tqdm(enumerate(data_loader), total=len(data_loader), desc=f"Epoch {epoch} {train_or_valid} loop on rank={rank}") device_type = "cuda" if isinstance(rank, int) else "cpu" @@ -733,9 +715,7 @@ def train_mlpf( # training step, edit here to profile a specific epoch if epoch == -1: - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, with_stack=True - ) as prof: + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, with_stack=True) as prof: with record_function("model_train"): losses_t = train_and_valid( rank, @@ -1039,9 +1019,7 @@ def run(rank, world_size, config, args, outdir, logfile): _logger.info(f"Model directory {outdir}", color="bold") if args.comet: - comet_experiment = create_comet_experiment( - config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir - ) + 
comet_experiment = create_comet_experiment(config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir) comet_experiment.set_name(f"rank_{rank}_{Path(outdir).name}") comet_experiment.log_parameter("run_id", Path(outdir).name) comet_experiment.log_parameter("world_size", world_size) @@ -1296,9 +1274,7 @@ def train_ray_trial(config, args, outdir=None): loaders = get_interleaved_dataloaders(world_size, rank, config, use_cuda, use_ray=True) if args.comet: - comet_experiment = create_comet_experiment( - config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir - ) + comet_experiment = create_comet_experiment(config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir) comet_experiment.set_name(f"world_rank_{world_rank}_{Path(outdir).name}") comet_experiment.log_parameter("run_id", Path(outdir).name) comet_experiment.log_parameter("world_size", world_size) @@ -1332,9 +1308,7 @@ def train_ray_trial(config, args, outdir=None): if args.resume_training: model, optimizer = load_checkpoint(checkpoint, model, optimizer) start_epoch = checkpoint["extra_state"]["epoch"] + 1 - lr_schedule = get_lr_schedule( - config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch=start_epoch - 1 - ) + lr_schedule = get_lr_schedule(config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch=start_epoch - 1) else: # start a new training with model weights loaded from a pre-trained model model = load_checkpoint(checkpoint, model) @@ -1498,9 +1472,7 @@ def run_hpo(config, args): if tune.Tuner.can_restore(str(expdir)): # resume unfinished HPO run - tuner = tune.Tuner.restore( - str(expdir), trainable=trainer, resume_errored=True, restart_errored=False, resume_unfinished=True - ) + tuner = tune.Tuner.restore(str(expdir), trainable=trainer, resume_errored=True, restart_errored=False, resume_unfinished=True) else: # start new HPO run search_space = {"train_loop_config": search_space} # the ray TorchTrainer only takes a single arg: train_loop_config @@ -1541,6 +1513,4 @@ def run_hpo(config, args): print(result_df.columns) logging.info("Total time of Tuner.fit(): {}".format(end - start)) - logging.info( - "Best hyperparameters found according to {} were: {}".format(config["raytune"]["default_metric"], best_config) - ) + logging.info("Best hyperparameters found according to {} were: {}".format(config["raytune"]["default_metric"], best_config))
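To close the series, a stand-alone sketch of the cosinedecay schedule that these configs select (lr_schedule: cosinedecay, lr: 0.0001) and that the resume path above rebuilds with an explicit last_epoch. The toy model, the optimizer choice and the step counts are assumptions for illustration; only the scheduler arguments (T_max = steps_per_epoch * epochs, eta_min = 0.1 * lr, one step per optimizer step) follow the get_lr_schedule hunk at the top of this section.

import torch
from torch.optim.lr_scheduler import CosineAnnealingLR

model = torch.nn.Linear(16, 4)      # stand-in for the MLPF model (assumption)
base_lr = 1e-4                      # lr: 0.0001 in the yaml configs
opt = torch.optim.AdamW(model.parameters(), lr=base_lr)

steps_per_epoch, num_epochs = 100, 100
total_steps = steps_per_epoch * num_epochs

# fresh training: decay the LR from base_lr down to 0.1 * base_lr over all steps
lr_schedule = CosineAnnealingLR(opt, T_max=total_steps, eta_min=base_lr * 0.1)

for step in range(steps_per_epoch):                 # one toy epoch
    loss = model(torch.randn(8, 16)).sum()
    loss.backward()
    opt.step()
    opt.zero_grad()
    lr_schedule.step()                              # stepped once per training step

# resuming from a checkpoint: CosineAnnealingLR only accepts an explicit last_epoch
# if 'initial_lr' is present in the optimizer param groups (or its state_dict is loaded)
start_epoch = 2
for group in opt.param_groups:
    group.setdefault("initial_lr", base_lr)
lr_schedule = CosineAnnealingLR(
    opt, T_max=total_steps, eta_min=base_lr * 0.1, last_epoch=steps_per_epoch * start_epoch - 1
)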