From 5314a07bf336c4194716527aae6b621de7596ad0 Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 10:37:01 +0200 Subject: [PATCH 01/66] up --- scripts/clic/postprocessing.py | 150 ++++++++++++++++++++++++++++----- 1 file changed, 129 insertions(+), 21 deletions(-) diff --git a/scripts/clic/postprocessing.py b/scripts/clic/postprocessing.py index 54d2a857b..4d32f210b 100644 --- a/scripts/clic/postprocessing.py +++ b/scripts/clic/postprocessing.py @@ -1,20 +1,21 @@ import os -# to prevent https://stackoverflow.com/questions/52026652/openblas-blas-thread-init-pthread-create-resource-temporarily-unavailable +# noqa: to prevent https://stackoverflow.com/questions/52026652/openblas-blas-thread-init-pthread-create-resource-temporarily-unavailable os.environ["OMP_NUM_THREADS"] = "1" os.environ["OPENBLAS_NUM_THREADS"] = "1" os.environ["MKL_NUM_THREADS"] = "1" os.environ["VECLIB_MAXIMUM_THREADS"] = "1" os.environ["NUMEXPR_NUM_THREADS"] = "1" -import numpy as np +import bz2 + import awkward +import fastjet +import numpy as np +import pyhepmc +import tqdm import uproot import vector -import tqdm -import pyhepmc -import bz2 -import fastjet from scipy.sparse import coo_matrix track_coll = "SiTracks_Refitted" @@ -61,6 +62,16 @@ "sigma_x", "sigma_y", "sigma_z", + # added by farouk + "energyError", + "sigma_energy", + "sigma_x_weighted", + "sigma_y_weighted", + "sigma_z_weighted", + "energy_weighted_width", + "pos_shower_max", + "width_shower_max", + "energy_shower_max", ] hit_feature_order = [ "elemtype", @@ -137,7 +148,9 @@ def __init__( self.cluster_features = cluster_features # feature matrix of the calo clusters self.track_features = track_features # feature matrix of the tracks self.genparticle_to_hit = genparticle_to_hit # sparse COO matrix of genparticles to hits (idx_gp, idx_hit, weight) - self.genparticle_to_track = genparticle_to_track # sparse COO matrix of genparticles to tracks (idx_gp, idx_track, weight) + self.genparticle_to_track = ( + genparticle_to_track # sparse COO matrix of genparticles to tracks (idx_gp, idx_track, weight) + ) self.hit_to_cluster = hit_to_cluster # sparse COO matrix of hits to clusters (idx_hit, idx_cluster, weight) self.gp_merges = gp_merges # sparse COO matrix of any merged genparticles @@ -203,7 +216,10 @@ def get_calohit_matrix_and_genadj(hit_data, calohit_links, iev, collectionIDs): hit_idx_global += 1 hit_idx_local_to_global = {v: k for k, v in hit_idx_global_to_local.items()} hit_feature_matrix = awkward.Record( - {k: awkward.concatenate([hit_feature_matrix[i][k] for i in range(len(hit_feature_matrix))]) for k in hit_feature_matrix[0].fields} + { + k: awkward.concatenate([hit_feature_matrix[i][k] for i in range(len(hit_feature_matrix))]) + for k in hit_feature_matrix[0].fields + } ) # add all edges from genparticle to calohit @@ -269,7 +285,9 @@ def gen_to_features(prop_data, iev): gen_arr = {k.replace(mc_coll + ".", ""): gen_arr[k] for k in gen_arr.fields} MCParticles_p4 = vector.awk( - awkward.zip({"mass": gen_arr["mass"], "x": gen_arr["momentum.x"], "y": gen_arr["momentum.y"], "z": gen_arr["momentum.z"]}) + awkward.zip( + {"mass": gen_arr["mass"], "x": gen_arr["momentum.x"], "y": gen_arr["momentum.y"], "z": gen_arr["momentum.z"]} + ) ) gen_arr["pt"] = MCParticles_p4.pt gen_arr["eta"] = MCParticles_p4.eta @@ -311,7 +329,7 @@ def genparticle_track_adj(sitrack_links, iev): def cluster_to_features(prop_data, hit_features, hit_to_cluster, iev): cluster_arr = prop_data["PandoraClusters"][iev] - feats = ["type", "position.x", "position.y", "position.z", 
"iTheta", "phi", "energy"] + feats = ["type", "position.x", "position.y", "position.z", "iTheta", "phi", "energy", "energyError"] ret = {feat: cluster_arr["PandoraClusters." + feat] for feat in feats} hit_idx = np.array(hit_to_cluster[0]) @@ -324,8 +342,16 @@ def cluster_to_features(prop_data, hit_features, hit_to_cluster, iev): cl_sigma_y = [] cl_sigma_z = [] + # added by farouk + cl_sigma_energy = [] + cl_sigma_x_weighted, cl_sigma_y_weighted, cl_sigma_z_weighted = [], [], [] + cl_energy_weighted_width = [] + cl_pos_shower_max, cl_energy_shower_max, cl_width_shower_max = [], [], [] + n_cl = len(ret["energy"]) - for cl in range(n_cl): + + # xs, ys, zs, es = [], [], [], [] + for i, cl in enumerate(range(n_cl)): msk_cl = cluster_idx == cl hits = hit_idx[msk_cl] @@ -351,6 +377,57 @@ def cluster_to_features(prop_data, hit_features, hit_to_cluster, iev): cl_sigma_y.append(np.std(hits_posy)) cl_sigma_z.append(np.std(hits_posz)) + # added by farouk + cl_sigma_energy.append(np.std(hits_energy)) + cl_sigma_x_weighted.append(np.std(hits_posx * hits_energy)) + cl_sigma_y_weighted.append(np.std(hits_posy * hits_energy)) + cl_sigma_z_weighted.append(np.std(hits_posz * hits_energy)) + + # z_bar = np.sum(hits_posz * hits_energy) / np.sum(hits_energy) # energy weighted average + x_bar = np.sum(hits_posx * hits_energy) / np.sum(hits_energy) # energy weighted average + y_bar = np.sum(hits_posy * hits_energy) / np.sum(hits_energy) # energy weighted average + + num = (np.sum(hits_energy * (hits_posx - x_bar) ** 2)) + (np.sum(hits_energy * (hits_posy - y_bar) ** 2)) + den = np.sum(hits_energy) + + cl_energy_weighted_width.append(num / den) + + # if i==1: + # xs += [np.array(hits_posx)] + # ys += [np.array(hits_posy)] + # zs += [np.array(hits_posz)] + # es += [np.array(hits_energy)] + + # get position at shower max + # for each unique z integrate the energy of all the hits to find zmax + zmax, emax = 0, -1000 + for z in np.unique(np.array(hits_posz)): + msk = np.array(hits_posz) == z + ez = np.sum(np.array(hits_energy)[msk]) + + if ez > emax: + zmax, emax = z, ez + + cl_pos_shower_max.append(zmax) + cl_energy_shower_max.append(emax) + + # get width at shower max + msk = np.array(hits_posz) == zmax # select the hits at zmax + + x_bar = np.sum(np.array(hits_posx)[msk] * np.array(hits_energy)[msk]) / np.sum( + np.array(hits_energy)[msk] + ) # energy weighted average + y_bar = np.sum(np.array(hits_posy)[msk] * np.array(hits_energy)[msk]) / np.sum( + np.array(hits_energy)[msk] + ) # energy weighted average + + num = (np.sum(np.array(hits_energy)[msk] * (np.array(hits_posx)[msk] - x_bar) ** 2)) + ( + np.sum(np.array(hits_energy)[msk] * (np.array(hits_posy)[msk] - y_bar) ** 2) + ) + den = np.sum(np.array(hits_energy)[msk]) + + cl_width_shower_max.append(num / den) + ret["energy_ecal"] = np.array(cl_energy_ecal) ret["energy_hcal"] = np.array(cl_energy_hcal) ret["energy_other"] = np.array(cl_energy_other) @@ -374,6 +451,17 @@ def cluster_to_features(prop_data, hit_features, hit_to_cluster, iev): ret["sin_phi"] = np.sin(ret["phi"]) ret["cos_phi"] = np.cos(ret["phi"]) + # added by farouk + ret["sigma_energy"] = np.array(cl_sigma_energy) + ret["sigma_x_weighted"] = np.array(cl_sigma_x_weighted) + ret["sigma_y_weighted"] = np.array(cl_sigma_y_weighted) + ret["sigma_z_weighted"] = np.array(cl_sigma_z_weighted) + ret["energy_weighted_width"] = np.array(cl_energy_weighted_width) + + ret["pos_shower_max"] = np.array(cl_pos_shower_max) + ret["energy_shower_max"] = np.array(cl_energy_shower_max) + ret["width_shower_max"] = 
np.array(cl_width_shower_max) + return awkward.Record(ret) @@ -425,7 +513,9 @@ def filter_adj(adj, all_to_filtered): def get_genparticles_and_adjacencies(prop_data, hit_data, calohit_links, sitrack_links, iev, collectionIDs): gen_features = gen_to_features(prop_data, iev) - hit_features, genparticle_to_hit, hit_idx_local_to_global = get_calohit_matrix_and_genadj(hit_data, calohit_links, iev, collectionIDs) + hit_features, genparticle_to_hit, hit_idx_local_to_global = get_calohit_matrix_and_genadj( + hit_data, calohit_links, iev, collectionIDs + ) hit_to_cluster = hit_cluster_adj(prop_data, hit_idx_local_to_global, iev) cluster_features = cluster_to_features(prop_data, hit_features, hit_to_cluster, iev) track_features = track_to_features(prop_data, iev) @@ -438,7 +528,9 @@ def get_genparticles_and_adjacencies(prop_data, hit_data, calohit_links, sitrack if len(genparticle_to_track[0]) > 0: gp_to_track = ( - coo_matrix((genparticle_to_track[2], (genparticle_to_track[0], genparticle_to_track[1])), shape=(n_gp, n_track)).max(axis=1).todense() + coo_matrix((genparticle_to_track[2], (genparticle_to_track[0], genparticle_to_track[1])), shape=(n_gp, n_track)) + .max(axis=1) + .todense() ) else: gp_to_track = np.zeros((n_gp, 1)) @@ -491,8 +583,12 @@ def assign_genparticles_to_obj_and_merge(gpdata): ).todense() ) - gp_to_calohit = coo_matrix((gpdata.genparticle_to_hit[2], (gpdata.genparticle_to_hit[0], gpdata.genparticle_to_hit[1])), shape=(n_gp, n_hit)) - calohit_to_cluster = coo_matrix((gpdata.hit_to_cluster[2], (gpdata.hit_to_cluster[0], gpdata.hit_to_cluster[1])), shape=(n_hit, n_cluster)) + gp_to_calohit = coo_matrix( + (gpdata.genparticle_to_hit[2], (gpdata.genparticle_to_hit[0], gpdata.genparticle_to_hit[1])), shape=(n_gp, n_hit) + ) + calohit_to_cluster = coo_matrix( + (gpdata.hit_to_cluster[2], (gpdata.hit_to_cluster[0], gpdata.hit_to_cluster[1])), shape=(n_hit, n_cluster) + ) gp_to_cluster = np.array((gp_to_calohit * calohit_to_cluster).todense()) @@ -657,7 +753,9 @@ def get_reco_properties(prop_data, iev): reco_arr = {k.replace("MergedRecoParticles.", ""): reco_arr[k] for k in reco_arr.fields} reco_p4 = vector.awk( - awkward.zip({"mass": reco_arr["mass"], "x": reco_arr["momentum.x"], "y": reco_arr["momentum.y"], "z": reco_arr["momentum.z"]}) + awkward.zip( + {"mass": reco_arr["mass"], "x": reco_arr["momentum.x"], "y": reco_arr["momentum.y"], "z": reco_arr["momentum.z"]} + ) ) reco_arr["pt"] = reco_p4.pt reco_arr["eta"] = reco_p4.eta @@ -879,19 +977,29 @@ def process_one_file(fn, ofn): assert np.all(used_rps == 1) gps_track = get_particle_feature_matrix(track_to_gp_all, gpdata_cleaned.gen_features, particle_feature_order) - gps_track[:, 0] = np.array([map_neutral_to_charged(map_pdgid_to_candid(p, c)) for p, c in zip(gps_track[:, 0], gps_track[:, 1])]) + gps_track[:, 0] = np.array( + [map_neutral_to_charged(map_pdgid_to_candid(p, c)) for p, c in zip(gps_track[:, 0], gps_track[:, 1])] + ) gps_cluster = get_particle_feature_matrix(cluster_to_gp_all, gpdata_cleaned.gen_features, particle_feature_order) - gps_cluster[:, 0] = np.array([map_charged_to_neutral(map_pdgid_to_candid(p, c)) for p, c in zip(gps_cluster[:, 0], gps_cluster[:, 1])]) + gps_cluster[:, 0] = np.array( + [map_charged_to_neutral(map_pdgid_to_candid(p, c)) for p, c in zip(gps_cluster[:, 0], gps_cluster[:, 1])] + ) gps_cluster[:, 1] = 0 rps_track = get_particle_feature_matrix(track_to_rp_all, reco_features, particle_feature_order) - rps_track[:, 0] = np.array([map_neutral_to_charged(map_pdgid_to_candid(p, c)) for p, c in 
zip(rps_track[:, 0], rps_track[:, 1])]) + rps_track[:, 0] = np.array( + [map_neutral_to_charged(map_pdgid_to_candid(p, c)) for p, c in zip(rps_track[:, 0], rps_track[:, 1])] + ) rps_cluster = get_particle_feature_matrix(cluster_to_rp_all, reco_features, particle_feature_order) - rps_cluster[:, 0] = np.array([map_charged_to_neutral(map_pdgid_to_candid(p, c)) for p, c in zip(rps_cluster[:, 0], rps_cluster[:, 1])]) + rps_cluster[:, 0] = np.array( + [map_charged_to_neutral(map_pdgid_to_candid(p, c)) for p, c in zip(rps_cluster[:, 0], rps_cluster[:, 1])] + ) rps_cluster[:, 1] = 0 # all initial gen/reco particle energy must be reconstructable - assert abs(np.sum(gps_track[:, 6]) + np.sum(gps_cluster[:, 6]) - np.sum(gpdata_cleaned.gen_features["energy"])) < 1e-2 + assert ( + abs(np.sum(gps_track[:, 6]) + np.sum(gps_cluster[:, 6]) - np.sum(gpdata_cleaned.gen_features["energy"])) < 1e-2 + ) assert abs(np.sum(rps_track[:, 6]) + np.sum(rps_cluster[:, 6]) - np.sum(reco_features["energy"])) < 1e-2 From e0ea0791029c5e514ce80bd7fb0ab85e947ed8e0 Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 11:48:53 +0200 Subject: [PATCH 02/66] up utils.edm.py with updated feature list --- mlpf/heptfds/clic_pf_edm4hep/utils_edm.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py b/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py index b0f152d9c..c63d74994 100644 --- a/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py +++ b/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py @@ -1,6 +1,7 @@ +import random + import awkward as ak import numpy as np -import random # from fcc/postprocessing.py X_FEATURES_TRK = [ @@ -39,6 +40,16 @@ "sigma_x", "sigma_y", "sigma_z", + # added by farouk + "energyError", + "sigma_energy", + "sigma_x_weighted", + "sigma_y_weighted", + "sigma_z_weighted", + "energy_weighted_width", + "pos_shower_max", + "width_shower_max", + "energy_shower_max", ] Y_FEATURES = ["PDG", "charge", "pt", "eta", "sin_phi", "cos_phi", "energy", "ispu"] From 5096060d301b0a15c323c6c4704d3eefc4ffb4a7 Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 12:15:35 +0200 Subject: [PATCH 03/66] tag 2.2.0 --- mlpf/heptfds/clic_pf_edm4hep/ttbar.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlpf/heptfds/clic_pf_edm4hep/ttbar.py b/mlpf/heptfds/clic_pf_edm4hep/ttbar.py index 9a01aa81f..124d2c9a8 100644 --- a/mlpf/heptfds/clic_pf_edm4hep/ttbar.py +++ b/mlpf/heptfds/clic_pf_edm4hep/ttbar.py @@ -1,6 +1,7 @@ from pathlib import Path import tensorflow as tf +import tensorflow_datasets as tfds from utils_edm import ( X_FEATURES_CL, X_FEATURES_TRK, @@ -9,8 +10,6 @@ split_sample, ) -import tensorflow_datasets as tfds - _DESCRIPTION = """ CLIC EDM4HEP dataset with ee -> ttbar at 380GeV. - X: reconstructed tracks and clusters, variable number N per event @@ -36,6 +35,7 @@ class ClicEdmTtbarPf(tfds.core.GeneratorBasedBuilder): "1.5.0": "Regenerate with ARRAY_RECORD", "2.0.0": "Add ispu, genjets, genmet; disable genjet_idx; truth def not based on gp.status==1", "2.1.0": "Bump dataset size", + "2.2.0": "Additional cluster input features", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ For the raw input files in ROOT EDM4HEP format, please see the citation above. 
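A short, self-contained restatement of the cluster-shape features that PATCH 01 adds in cluster_to_features (energy_weighted_width, pos_shower_max, energy_shower_max, width_shower_max), since their definitions are hard to read inside the diff. This is an illustrative sketch only: the per-cluster hit arrays x, y, z, e and the function name are assumptions made for the example, not names from the repository, and the "widths" are energy-weighted second moments (no square root), matching what the patch computes.

import numpy as np

def cluster_shape_features(x, y, z, e):
    """x, y, z: hit positions of one cluster; e: the corresponding hit energies."""
    etot = np.sum(e)
    # energy-weighted transverse width of the whole cluster
    x_bar = np.sum(x * e) / etot
    y_bar = np.sum(y * e) / etot
    energy_weighted_width = (np.sum(e * (x - x_bar) ** 2) + np.sum(e * (y - y_bar) ** 2)) / etot
    # shower maximum: the z layer that collects the most energy
    z_layers = np.unique(z)
    e_per_layer = np.array([np.sum(e[z == zi]) for zi in z_layers])
    imax = np.argmax(e_per_layer)
    pos_shower_max, energy_shower_max = z_layers[imax], e_per_layer[imax]
    # energy-weighted transverse width restricted to the hits at the shower maximum
    m = z == pos_shower_max
    xb = np.sum(x[m] * e[m]) / np.sum(e[m])
    yb = np.sum(y[m] * e[m]) / np.sum(e[m])
    width_shower_max = (np.sum(e[m] * (x[m] - xb) ** 2) + np.sum(e[m] * (y[m] - yb) ** 2)) / np.sum(e[m])
    return energy_weighted_width, pos_shower_max, energy_shower_max, width_shower_max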
From dd6bad75a2b7a168e35cc97b6ae0967fe138e5ff Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 12:16:13 +0200 Subject: [PATCH 04/66] up --- mlpf/heptfds/clic_pf_edm4hep/ttbar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlpf/heptfds/clic_pf_edm4hep/ttbar.py b/mlpf/heptfds/clic_pf_edm4hep/ttbar.py index 124d2c9a8..74837eb43 100644 --- a/mlpf/heptfds/clic_pf_edm4hep/ttbar.py +++ b/mlpf/heptfds/clic_pf_edm4hep/ttbar.py @@ -25,7 +25,7 @@ class ClicEdmTtbarPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("2.1.0") + VERSION = tfds.core.Version("2.2.0") RELEASE_NOTES = { "1.0.0": "Initial release.", "1.1.0": "update stats, move to 380 GeV", From 7e1ac711da50c0db0c911e52e19735c4cc2b34fd Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 12:24:05 +0200 Subject: [PATCH 05/66] process whole dir --- scripts/clic/postprocessing.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/scripts/clic/postprocessing.py b/scripts/clic/postprocessing.py index 4d32f210b..000812773 100644 --- a/scripts/clic/postprocessing.py +++ b/scripts/clic/postprocessing.py @@ -1044,16 +1044,27 @@ def parse_args(): import argparse parser = argparse.ArgumentParser() - parser.add_argument("--input", type=str, help="Input file ROOT file", required=True) + parser.add_argument( + "--input", type=str, help="Input ROOT file - else if dir then will process all files inside", required=True + ) parser.add_argument("--outpath", type=str, default="raw", help="output path") args = parser.parse_args() return args def process(args): - infile = args.input - outfile = os.path.join(args.outpath, os.path.basename(infile).split(".")[0] + ".parquet") - process_one_file(infile, outfile) + + if os.path.isdir(args.input) is True: + import glob + + flist = glob.glob(args.input) + for infile in flist: + outfile = os.path.join(args.outpath, os.path.basename(infile).split(".")[0] + ".parquet") + process_one_file(infile, outfile) + else: + infile = args.input + outfile = os.path.join(args.outpath, os.path.basename(infile).split(".")[0] + ".parquet") + process_one_file(infile, outfile) if __name__ == "__main__": From c7d8a088fad623dfbd1862cff9de6759b86c487d Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 12:25:10 +0200 Subject: [PATCH 06/66] debug --- scripts/clic/postprocessing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/clic/postprocessing.py b/scripts/clic/postprocessing.py index 000812773..98b09da32 100644 --- a/scripts/clic/postprocessing.py +++ b/scripts/clic/postprocessing.py @@ -1035,7 +1035,8 @@ def process_one_file(fn, ofn): } ) ret.append(this_ev) - + if iev == 3: + break ret = awkward.Record({k: awkward.from_iter([r[k] for r in ret]) for k in ret[0].fields}) awkward.to_parquet(ret, ofn) From d20de3a163236c4fd24730085b0fbb8074c8c3a4 Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 12:27:27 +0200 Subject: [PATCH 07/66] up --- scripts/clic/postprocessing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/clic/postprocessing.py b/scripts/clic/postprocessing.py index 98b09da32..0634cc385 100644 --- a/scripts/clic/postprocessing.py +++ b/scripts/clic/postprocessing.py @@ -1056,6 +1056,7 @@ def parse_args(): def process(args): if os.path.isdir(args.input) is True: + print("yes") import glob flist = glob.glob(args.input) From a20d8e87030407230cb4009dfcb344c45d1ed3cf Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 12:27:55 +0200 Subject: [PATCH 08/66] up --- 
scripts/clic/postprocessing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/clic/postprocessing.py b/scripts/clic/postprocessing.py index 0634cc385..45ad73bbe 100644 --- a/scripts/clic/postprocessing.py +++ b/scripts/clic/postprocessing.py @@ -1060,6 +1060,7 @@ def process(args): import glob flist = glob.glob(args.input) + print("flist", flist) for infile in flist: outfile = os.path.join(args.outpath, os.path.basename(infile).split(".")[0] + ".parquet") process_one_file(infile, outfile) From 78fbf1ab1d7af84228cab6c9e3e0831b66e9ce33 Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 12:29:19 +0200 Subject: [PATCH 09/66] up --- scripts/clic/postprocessing.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/clic/postprocessing.py b/scripts/clic/postprocessing.py index 45ad73bbe..49b95c104 100644 --- a/scripts/clic/postprocessing.py +++ b/scripts/clic/postprocessing.py @@ -1056,11 +1056,9 @@ def parse_args(): def process(args): if os.path.isdir(args.input) is True: - print("yes") import glob - flist = glob.glob(args.input) - print("flist", flist) + flist = glob.glob(args.input + "/*.root") for infile in flist: outfile = os.path.join(args.outpath, os.path.basename(infile).split(".")[0] + ".parquet") process_one_file(infile, outfile) From 1e2f3d1b6a8a943f005fb441b080e2afabd3b5f8 Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 12:29:53 +0200 Subject: [PATCH 10/66] up --- scripts/clic/postprocessing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/clic/postprocessing.py b/scripts/clic/postprocessing.py index 49b95c104..e930ed649 100644 --- a/scripts/clic/postprocessing.py +++ b/scripts/clic/postprocessing.py @@ -1,3 +1,4 @@ +import glob import os # noqa: to prevent https://stackoverflow.com/questions/52026652/openblas-blas-thread-init-pthread-create-resource-temporarily-unavailable @@ -1056,7 +1057,7 @@ def parse_args(): def process(args): if os.path.isdir(args.input) is True: - import glob + print("Will process all files in " + args.input) flist = glob.glob(args.input + "/*.root") for infile in flist: From f2e4181f0b823f64340ce56386da47ed7073af4c Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 12:32:27 +0200 Subject: [PATCH 11/66] remove break --- scripts/clic/postprocessing.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/clic/postprocessing.py b/scripts/clic/postprocessing.py index e930ed649..43c7b06b1 100644 --- a/scripts/clic/postprocessing.py +++ b/scripts/clic/postprocessing.py @@ -1036,8 +1036,7 @@ def process_one_file(fn, ofn): } ) ret.append(this_ev) - if iev == 3: - break + ret = awkward.Record({k: awkward.from_iter([r[k] for r in ret]) for k in ret[0].fields}) awkward.to_parquet(ret, ofn) From df08d50eda05d43aba756c26d74716b4ee01f299 Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 15:49:47 +0200 Subject: [PATCH 12/66] up --- parameters/pytorch/pyg-clic-ttbar.yaml | 121 +++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 parameters/pytorch/pyg-clic-ttbar.yaml diff --git a/parameters/pytorch/pyg-clic-ttbar.yaml b/parameters/pytorch/pyg-clic-ttbar.yaml new file mode 100644 index 000000000..2525286ec --- /dev/null +++ b/parameters/pytorch/pyg-clic-ttbar.yaml @@ -0,0 +1,121 @@ +backend: pytorch + +dataset: clic +sort_data: no +data_dir: +gpus: 1 +gpu_batch_multiplier: 1 +load: +num_epochs: 100 +patience: 20 +lr: 0.0001 +lr_schedule: cosinedecay # constant, cosinedecay, onecycle +conv_type: attention # gnn_lsh, 
attention, mamba, flashattention +ntrain: +ntest: +nvalid: +num_workers: 0 +prefetch_factor: +checkpoint_freq: +comet_name: particleflow-pt +comet_offline: False +comet_step_freq: 100 +dtype: float32 +val_freq: # run an extra validation run every val_freq training steps + +model: + trainable: all + learned_representation_mode: last #last, concat + input_encoding: joint #split, joint + pt_mode: linear + eta_mode: linear + sin_phi_mode: linear + cos_phi_mode: linear + energy_mode: linear + + gnn_lsh: + conv_type: gnn_lsh + embedding_dim: 512 + width: 512 + num_convs: 8 + activation: "elu" + # gnn-lsh specific parameters + bin_size: 32 + max_num_bins: 200 + distance_dim: 128 + layernorm: True + num_node_messages: 2 + ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 2 + + attention: + conv_type: attention + num_convs: 12 + dropout_ff: 0.1 + dropout_conv_id_mha: 0.0 + dropout_conv_id_ff: 0.0 + dropout_conv_reg_mha: 0.0 + dropout_conv_reg_ff: 0.0 + activation: "relu" + head_dim: 32 + num_heads: 32 + attention_type: math + use_pre_layernorm: True + + mamba: + conv_type: mamba + embedding_dim: 128 + width: 128 + num_convs: 2 + dropout: 0.0 + activation: "elu" + # transformer specific paramters + num_heads: 2 + # mamba specific paramters + d_state: 16 + d_conv: 4 + expand: 2 + +lr_schedule_config: + onecycle: + pct_start: 0.3 + +raytune: + local_dir: # Note: please specify an absolute path + sched: # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + +valid_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + +test_dataset: + clic_edm_ttbar_pf: + version: 2.1.0 \ No newline at end of file From e13f323a3f34f3f94918c60e02c69ba2e482fd31 Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 15:53:01 +0200 Subject: [PATCH 13/66] up logging --- mlpf/pyg/training.py | 140 +++++++++++++++++++++++++++++++++---------- 1 file changed, 109 insertions(+), 31 deletions(-) diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index 7c2a67e35..93218f098 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -1,19 +1,17 @@ +import csv +import json +import logging import os import os.path as osp import pickle as pkl +import shutil import time +from datetime import datetime from pathlib import Path from tempfile import TemporaryDirectory from typing import Optional -import logging -import shutil -from datetime import datetime -import tqdm -import yaml -import csv -import json -import sklearn -import sklearn.metrics + +import fastjet import numpy as np import pandas import matplotlib @@ -25,6 +23,25 @@ import torch import torch.distributed as dist import torch.multiprocessing as mp +import tqdm +import yaml +from pyg.inference import make_plots, run_predictions +from pyg.logger import _configLogger, _logger +from pyg.mlpf import MLPF +from pyg.PFDataset import Collater, PFDataset, get_interleaved_dataloaders +from pyg.utils import ( + CLASS_LABELS, + ELEM_TYPES_NONZERO, + X_FEATURES, + count_parameters, + get_lr_schedule, + get_model_state_dict, + load_checkpoint, + save_checkpoint, + save_HPs, + unpack_predictions, + unpack_target, +) from torch import 
Tensor, nn from torch.nn import functional as F from torch.profiler import ProfilerActivity, profile, record_function @@ -54,6 +71,10 @@ from pyg.PFDataset import Collater, PFDataset, get_interleaved_dataloaders from utils import create_comet_experiment +# comet needs to be imported before torch +from comet_ml import OfflineExperiment, Experiment # noqa: F401, isort:skip + + # Ignore divide by 0 errors np.seterr(divide="ignore", invalid="ignore") @@ -146,12 +167,12 @@ def mlpf_loss(y, ypred, batch): pred_met = torch.sqrt(torch.sum(pred_px, axis=-2) ** 2 + torch.sum(pred_py, axis=-2) ** 2) loss["MET"] = torch.nn.functional.huber_loss(pred_met.squeeze(dim=-1), batch.genmet).mean() - was_input_pred = torch.concat([torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1) * batch.mask.unsqueeze( - axis=-1 - ) - was_input_true = torch.concat([torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long)), y["momentum"]], axis=-1) * batch.mask.unsqueeze( - axis=-1 - ) + was_input_pred = torch.concat( + [torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1 + ) * batch.mask.unsqueeze(axis=-1) + was_input_true = torch.concat( + [torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long)), y["momentum"]], axis=-1 + ) * batch.mask.unsqueeze(axis=-1) # standardize Wasserstein loss std = was_input_true[batch.mask].std(axis=0) @@ -193,7 +214,9 @@ class FocalLoss(nn.Module): - y: (batch_size,) or (batch_size, d1, d2, ..., dK), K > 0. """ - def __init__(self, alpha: Optional[Tensor] = None, gamma: float = 0.0, reduction: str = "mean", ignore_index: int = -100): + def __init__( + self, alpha: Optional[Tensor] = None, gamma: float = 0.0, reduction: str = "mean", ignore_index: int = -100 + ): """Constructor. Args: alpha (Tensor, optional): Weights for each class. Defaults to None. 
@@ -457,7 +480,9 @@ def train_and_valid( if (world_size > 1) and (rank != 0): iterator = enumerate(data_loader) else: - iterator = tqdm.tqdm(enumerate(data_loader), total=len(data_loader), desc=f"Epoch {epoch} {train_or_valid} loop on rank={rank}") + iterator = tqdm.tqdm( + enumerate(data_loader), total=len(data_loader), desc=f"Epoch {epoch} {train_or_valid} loop on rank={rank}" + ) device_type = "cuda" if isinstance(rank, int) else "cpu" @@ -492,13 +517,19 @@ def train_and_valid( if not is_train: cm_X_gen += sklearn.metrics.confusion_matrix( - batch.X[:, :, 0][batch.mask].detach().cpu().numpy(), ygen["cls_id"][batch.mask].detach().cpu().numpy(), labels=range(13) + batch.X[:, :, 0][batch.mask].detach().cpu().numpy(), + ygen["cls_id"][batch.mask].detach().cpu().numpy(), + labels=range(13), ) cm_X_pred += sklearn.metrics.confusion_matrix( - batch.X[:, :, 0][batch.mask].detach().cpu().numpy(), ypred["cls_id"][batch.mask].detach().cpu().numpy(), labels=range(13) + batch.X[:, :, 0][batch.mask].detach().cpu().numpy(), + ypred["cls_id"][batch.mask].detach().cpu().numpy(), + labels=range(13), ) cm_id += sklearn.metrics.confusion_matrix( - ygen["cls_id"][batch.mask].detach().cpu().numpy(), ypred["cls_id"][batch.mask].detach().cpu().numpy(), labels=range(13) + ygen["cls_id"][batch.mask].detach().cpu().numpy(), + ypred["cls_id"][batch.mask].detach().cpu().numpy(), + labels=range(13), ) # save the events of the first validation batch for quick checks if (rank == 0 or rank == "cpu") and itrain == 0: @@ -604,10 +635,20 @@ def train_and_valid( if not is_train and comet_experiment: comet_experiment.log_confusion_matrix( - matrix=cm_X_gen, title="Element to target", row_label="X", column_label="target", epoch=epoch, file_name="cm_X_gen.json" + matrix=cm_X_gen, + title="Element to target", + row_label="X", + column_label="target", + epoch=epoch, + file_name="cm_X_gen.json", ) comet_experiment.log_confusion_matrix( - matrix=cm_X_pred, title="Element to pred", row_label="X", column_label="pred", epoch=epoch, file_name="cm_X_pred.json" + matrix=cm_X_pred, + title="Element to pred", + row_label="X", + column_label="pred", + epoch=epoch, + file_name="cm_X_pred.json", ) comet_experiment.log_confusion_matrix( matrix=cm_id, title="Target to pred", row_label="gen", column_label="pred", epoch=epoch, file_name="cm_id.json" @@ -698,7 +739,9 @@ def train_mlpf( # training step, edit here to profile a specific epoch if epoch == -1: - with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, with_stack=True) as prof: + with profile( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, with_stack=True + ) as prof: with record_function("model_train"): losses_t = train_and_valid( rank, @@ -845,10 +888,20 @@ def train_mlpf( time_per_epoch = (t1 - t0_initial) / epoch eta = epochs_remaining * time_per_epoch / 60 + # _logger.info( + # f"Rank {rank}: epoch={epoch} / {num_epochs} " + # + f"train_loss={losses_t['Total']:.4f} " + # + f"valid_loss={losses_v['Total']:.4f} " + # + f"stale={stale_epochs} " + # + f"epoch_train_time={round((t_train-t0)/60, 2)}m " + # + f"epoch_valid_time={round((t_valid-t_train)/60, 2)}m " + # + f"epoch_total_time={round((t1-t0)/60, 2)}m " + # + f"eta={round(eta, 1)}m", + # color="bold", + # ) + _logger.info( f"Rank {rank}: epoch={epoch} / {num_epochs} " - + f"train_loss={losses_t['Total']:.4f} " - + f"valid_loss={losses_v['Total']:.4f} " + f"stale={stale_epochs} " + f"epoch_train_time={round((t_train-t0)/60, 2)}m " + 
f"epoch_valid_time={round((t_valid-t_train)/60, 2)}m " @@ -857,6 +910,22 @@ def train_mlpf( color="bold", ) + _logger.info( + f"train: loss_total={losses_t['Total']:.4f} " + + f"loss_clf={losses_t['Classification']:.4f} " + + f"loss_clfbinary={losses_t['Classification_binary']:.4f} " + + f"loss_reg={losses_t['Regression']:.4f} ", + color="bold", + ) + + _logger.info( + f"valid: loss_total={losses_v['Total']:.4f} " + + f"loss_clf={losses_v['Classification']:.4f} " + + f"loss_clfbinary={losses_v['Classification_binary']:.4f} " + + f"loss_reg={losses_v['Regression']:.4f} ", + color="bold", + ) + # save separate json files with stats for each epoch, this is robust to crashed-then-resumed trainings history_path = Path(outdir) / "history" history_path.mkdir(parents=True, exist_ok=True) @@ -958,7 +1027,9 @@ def run(rank, world_size, config, args, outdir, logfile): _logger.info(f"Model directory {outdir}", color="bold") if args.comet: - comet_experiment = create_comet_experiment(config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir) + comet_experiment = create_comet_experiment( + config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir + ) comet_experiment.set_name(f"rank_{rank}_{Path(outdir).name}") comet_experiment.log_parameter("run_id", Path(outdir).name) comet_experiment.log_parameter("world_size", world_size) @@ -1197,7 +1268,9 @@ def train_ray_trial(config, args, outdir=None): loaders = get_interleaved_dataloaders(world_size, rank, config, use_cuda, use_ray=True) if args.comet: - comet_experiment = create_comet_experiment(config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir) + comet_experiment = create_comet_experiment( + config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir + ) comet_experiment.set_name(f"world_rank_{world_rank}_{Path(outdir).name}") comet_experiment.log_parameter("run_id", Path(outdir).name) comet_experiment.log_parameter("world_size", world_size) @@ -1231,7 +1304,9 @@ def train_ray_trial(config, args, outdir=None): if args.resume_training: model, optimizer = load_checkpoint(checkpoint, model, optimizer) start_epoch = checkpoint["extra_state"]["epoch"] + 1 - lr_schedule = get_lr_schedule(config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch=start_epoch - 1) + lr_schedule = get_lr_schedule( + config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch=start_epoch - 1 + ) else: # start a new training with model weights loaded from a pre-trained model model = load_checkpoint(checkpoint, model) @@ -1346,7 +1421,6 @@ def run_hpo(config, args): import ray from ray import tune from ray.train.torch import TorchTrainer - from raytune.pt_search_space import raytune_num_samples, search_space from raytune.utils import get_raytune_schedule, get_raytune_search_alg @@ -1395,7 +1469,9 @@ def run_hpo(config, args): if tune.Tuner.can_restore(str(expdir)): # resume unfinished HPO run - tuner = tune.Tuner.restore(str(expdir), trainable=trainer, resume_errored=True, restart_errored=False, resume_unfinished=True) + tuner = tune.Tuner.restore( + str(expdir), trainable=trainer, resume_errored=True, restart_errored=False, resume_unfinished=True + ) else: # start new HPO run search_space = {"train_loop_config": search_space} # the ray TorchTrainer only takes a single arg: train_loop_config @@ -1436,4 +1512,6 @@ def run_hpo(config, args): print(result_df.columns) logging.info("Total time of Tuner.fit(): {}".format(end - start)) - logging.info("Best hyperparameters found according to {} were: 
{}".format(config["raytune"]["default_metric"], best_config)) + logging.info( + "Best hyperparameters found according to {} were: {}".format(config["raytune"]["default_metric"], best_config) + ) From e9930ebe92d2088da49d37dc7c29c88034cb9349 Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 15:54:03 +0200 Subject: [PATCH 14/66] up --- parameters/pytorch/pyg-clic-ttbar.yaml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/parameters/pytorch/pyg-clic-ttbar.yaml b/parameters/pytorch/pyg-clic-ttbar.yaml index 2525286ec..21aa978b5 100644 --- a/parameters/pytorch/pyg-clic-ttbar.yaml +++ b/parameters/pytorch/pyg-clic-ttbar.yaml @@ -50,17 +50,18 @@ model: attention: conv_type: attention - num_convs: 12 - dropout_ff: 0.1 + num_convs: 8 + dropout_ff: 0.0 dropout_conv_id_mha: 0.0 dropout_conv_id_ff: 0.0 dropout_conv_reg_mha: 0.0 dropout_conv_reg_ff: 0.0 activation: "relu" - head_dim: 32 - num_heads: 32 + head_dim: 64 + num_heads: 12 attention_type: math - use_pre_layernorm: True + use_improved_ffn: True + use_improved_attblock: False mamba: conv_type: mamba From 34b89a0dd8b60e9da5e697b80db8526dfecd4e1f Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 15:54:44 +0200 Subject: [PATCH 15/66] up --- parameters/pytorch/pyg-clic-ttbar.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/parameters/pytorch/pyg-clic-ttbar.yaml b/parameters/pytorch/pyg-clic-ttbar.yaml index 21aa978b5..0e2d2e5bb 100644 --- a/parameters/pytorch/pyg-clic-ttbar.yaml +++ b/parameters/pytorch/pyg-clic-ttbar.yaml @@ -60,8 +60,6 @@ model: head_dim: 64 num_heads: 12 attention_type: math - use_improved_ffn: True - use_improved_attblock: False mamba: conv_type: mamba From 99082ad4ac39f96ee7772429715fdec1d6cbcaf6 Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 16:31:57 +0200 Subject: [PATCH 16/66] up --- parameters/pytorch/pyg-clic-ttbar.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parameters/pytorch/pyg-clic-ttbar.yaml b/parameters/pytorch/pyg-clic-ttbar.yaml index 0e2d2e5bb..7d404963f 100644 --- a/parameters/pytorch/pyg-clic-ttbar.yaml +++ b/parameters/pytorch/pyg-clic-ttbar.yaml @@ -26,7 +26,7 @@ val_freq: # run an extra validation run every val_freq training steps model: trainable: all learned_representation_mode: last #last, concat - input_encoding: joint #split, joint + input_encoding: split #split, joint pt_mode: linear eta_mode: linear sin_phi_mode: linear From 47c00b210516f95fa3b2eb0c5e4b1992913829dc Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 17:10:15 +0200 Subject: [PATCH 17/66] standardize inputs --- mlpf/pyg/clic_standardization.json | 1 + mlpf/pyg/mlpf.py | 60 +++++++++++++++++++++++--- parameters/pytorch/pyg-clic-ttbar.yaml | 1 + 3 files changed, 55 insertions(+), 7 deletions(-) create mode 100644 mlpf/pyg/clic_standardization.json diff --git a/mlpf/pyg/clic_standardization.json b/mlpf/pyg/clic_standardization.json new file mode 100644 index 000000000..99114010b --- /dev/null +++ b/mlpf/pyg/clic_standardization.json @@ -0,0 +1 @@ +{"2.1.0": {"PFelement1": {"mean": [1.0, 3.306542158126831, -0.0016281852731481194, 0.00044645098387263715, 0.008498543873429298, 4.233071804046631, 31.000581741333008, 15.700974464416504, 0.0, 0.0, 43.88750457763672, -0.002807975746691227, 0.0019595674239099026, 7.4845520430244505e-06, -0.07514938712120056, -1.0, 0.0], "std": [0.0, 14.216273307800293, 0.864812970161438, 0.7057437300682068, 0.708417534828186, 20.747175216674805, 1617.955078125, 6.582387924194336, 0.0, 0.0, 
56.95081329345703, 1.3340226411819458, 2.9433951377868652, 0.0037820138968527317, 37.608154296875, 0.0, 0.0]}, "PFelement2": {"mean": [2.0, 2.568070650100708, -0.00073066825279966, -0.0011971204075962305, 0.004265286959707737, 3.290896415710449, 11.28282356262207, -0.4471941590309143, -4.9265971183776855, 1.571304440498352, 1.8862318992614746, 0.9784500598907471, 1.2197442629258148e-05, 80.69485473632812, 50.48505783081055, 50.41227722167969, 51.77717971801758], "std": [0.0, 4.781670093536377, 0.9176656603813171, 0.707072377204895, 0.70712810754776, 5.762104511260986, 1084.093505859375, 1080.34375, 1554.9664306640625, 0.6987221240997314, 3.8536908626556396, 3.2011756896972656, 0.00021397981618065387, 105.88664245605469, 72.17912292480469, 71.81172180175781, 72.6884765625]}}} diff --git a/mlpf/pyg/mlpf.py b/mlpf/pyg/mlpf.py index 59d7564ab..5a2770973 100644 --- a/mlpf/pyg/mlpf.py +++ b/mlpf/pyg/mlpf.py @@ -1,13 +1,13 @@ +import math + +import numpy as np import torch import torch.nn as nn - -from .gnn_lsh import CombinedGraphLayer - from pyg.logger import _logger -import math -import numpy as np from torch.nn.attention import SDPBackend, sdpa_kernel +from .gnn_lsh import CombinedGraphLayer + def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0): # From https://github.com/rwightman/pytorch-image-models/blob/ @@ -57,6 +57,37 @@ def norm_cdf(x): return tensor +def standardize_inputs(X, elemtypes_nonzero): + import json + + import numpy as np + + with open("clic_standardization.json", "rb") as f: + standard_dict = json.load(f)["2.1.0"] + + for i, ielem in enumerate(elemtypes_nonzero): + + # get mean/std of features of that elem + mean = np.array(standard_dict[f"PFelement{ielem}"]["mean"]) + std = np.array(standard_dict[f"PFelement{ielem}"]["std"]) + + # standardize + Xfeat_normed_msked = X.clone() + Xfeat_normed_msked[..., 1:] = (Xfeat_normed_msked[..., 1:] - mean[..., 1:]) / std[..., 1:] + + # msk other elements + msk = Xfeat_normed_msked[..., 0:1] == ielem + Xfeat_normed_msked = Xfeat_normed_msked * msk + Xfeat_normed_msked = torch.nan_to_num(Xfeat_normed_msked, nan=0.0) + + if i == 0: + Xfeat_normed = Xfeat_normed_msked + else: + Xfeat_normed += Xfeat_normed_msked + + return Xfeat_normed + + def get_activation(activation): if activation == "elu": act = nn.ELU @@ -96,7 +127,9 @@ def __init__( self.mha = torch.nn.MultiheadAttention(embedding_dim, num_heads, dropout=dropout_mha, batch_first=True) self.norm0 = torch.nn.LayerNorm(embedding_dim) self.norm1 = torch.nn.LayerNorm(embedding_dim) - self.seq = torch.nn.Sequential(nn.Linear(embedding_dim, width), self.act(), nn.Linear(width, embedding_dim), self.act()) + self.seq = torch.nn.Sequential( + nn.Linear(embedding_dim, width), self.act(), nn.Linear(width, embedding_dim), self.act() + ) self.dropout = torch.nn.Dropout(dropout_ff) _logger.info("using attention_type={}".format(attention_type)) # params for torch sdp_kernel @@ -262,6 +295,12 @@ def __init__( dropout_conv_id_mha=0.0, dropout_conv_id_ff=0.0, use_pre_layernorm=False, + # mamba specific parameters + d_state=16, + d_conv=4, + expand=2, + # standardize_inputs + standardize_inputs=False, ): super(MLPF, self).__init__() @@ -281,6 +320,8 @@ def __init__( self.use_pre_layernorm = use_pre_layernorm + self.standardize_inputs = standardize_inputs + if self.conv_type == "attention": embedding_dim = num_heads * head_dim width = num_heads * head_dim @@ -375,6 +416,9 @@ def __init__( def forward(self, X_features, mask): Xfeat_normed = X_features + if self.standardize_inputs: + Xfeat_normed = 
standardize_inputs(X_features) + embeddings_id, embeddings_reg = [], [] if self.num_convs != 0: if self.input_encoding == "joint": @@ -434,7 +478,9 @@ def forward(self, X_features, mask): e_real[~mask] = 0 e_real[torch.isinf(e_real)] = 0 e_real[torch.isnan(e_real)] = 0 - preds_energy = e_real + torch.nn.functional.relu(self.nn_energy(X_features, final_embedding_reg, X_features[..., 5:6])) + preds_energy = e_real + torch.nn.functional.relu( + self.nn_energy(X_features, final_embedding_reg, X_features[..., 5:6]) + ) preds_momentum = torch.cat([preds_pt, preds_eta, preds_sin_phi, preds_cos_phi, preds_energy], axis=-1) return preds_binary_particle, preds_pid, preds_momentum diff --git a/parameters/pytorch/pyg-clic-ttbar.yaml b/parameters/pytorch/pyg-clic-ttbar.yaml index 7d404963f..e949ac554 100644 --- a/parameters/pytorch/pyg-clic-ttbar.yaml +++ b/parameters/pytorch/pyg-clic-ttbar.yaml @@ -60,6 +60,7 @@ model: head_dim: 64 num_heads: 12 attention_type: math + standardize_inputs: True mamba: conv_type: mamba From 97a31944c1596b65ff421f2b0f7114ad12562312 Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 17:10:49 +0200 Subject: [PATCH 18/66] up elemtypes_nonzero --- mlpf/pyg/mlpf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlpf/pyg/mlpf.py b/mlpf/pyg/mlpf.py index 5a2770973..9d3d6afd6 100644 --- a/mlpf/pyg/mlpf.py +++ b/mlpf/pyg/mlpf.py @@ -417,7 +417,7 @@ def forward(self, X_features, mask): Xfeat_normed = X_features if self.standardize_inputs: - Xfeat_normed = standardize_inputs(X_features) + Xfeat_normed = standardize_inputs(X_features, self.elemtypes_nonzero) embeddings_id, embeddings_reg = [], [] if self.num_convs != 0: From eb51a50dd197bac4ac44ad495c1e6c0096668853 Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 17:12:43 +0200 Subject: [PATCH 19/66] up --- mlpf/pyg/mlpf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlpf/pyg/mlpf.py b/mlpf/pyg/mlpf.py index 9d3d6afd6..db30d57c5 100644 --- a/mlpf/pyg/mlpf.py +++ b/mlpf/pyg/mlpf.py @@ -62,7 +62,7 @@ def standardize_inputs(X, elemtypes_nonzero): import numpy as np - with open("clic_standardization.json", "rb") as f: + with open("/pfvolcentral/clic_standardization.json", "rb") as f: standard_dict = json.load(f)["2.1.0"] for i, ielem in enumerate(elemtypes_nonzero): From 9520e2092c7db4ae92c3d911e85ea2dfc76b88df Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 20 Sep 2024 17:14:25 +0200 Subject: [PATCH 20/66] up --- mlpf/pyg/mlpf.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mlpf/pyg/mlpf.py b/mlpf/pyg/mlpf.py index db30d57c5..3a1fd9c2b 100644 --- a/mlpf/pyg/mlpf.py +++ b/mlpf/pyg/mlpf.py @@ -60,19 +60,18 @@ def norm_cdf(x): def standardize_inputs(X, elemtypes_nonzero): import json - import numpy as np - with open("/pfvolcentral/clic_standardization.json", "rb") as f: standard_dict = json.load(f)["2.1.0"] for i, ielem in enumerate(elemtypes_nonzero): + Xfeat_normed_msked = X.clone() + # get mean/std of features of that elem - mean = np.array(standard_dict[f"PFelement{ielem}"]["mean"]) - std = np.array(standard_dict[f"PFelement{ielem}"]["std"]) + mean = torch.tensor(standard_dict[f"PFelement{ielem}"]["mean"]).to(Xfeat_normed_msked.device) + std = torch.tensor(standard_dict[f"PFelement{ielem}"]["std"]).to(Xfeat_normed_msked.device) # standardize - Xfeat_normed_msked = X.clone() Xfeat_normed_msked[..., 1:] = (Xfeat_normed_msked[..., 1:] - mean[..., 1:]) / std[..., 1:] # msk other elements From 1ec4e9b2ed4ea7dbd15c9cc1217e8a29fea43edf Mon Sep 17 
00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 10:57:51 +0200 Subject: [PATCH 21/66] up --- mlpf/pyg/mlpf.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mlpf/pyg/mlpf.py b/mlpf/pyg/mlpf.py index 3a1fd9c2b..f47c7cec0 100644 --- a/mlpf/pyg/mlpf.py +++ b/mlpf/pyg/mlpf.py @@ -294,10 +294,6 @@ def __init__( dropout_conv_id_mha=0.0, dropout_conv_id_ff=0.0, use_pre_layernorm=False, - # mamba specific parameters - d_state=16, - d_conv=4, - expand=2, # standardize_inputs standardize_inputs=False, ): From 963f81223abac32f4d84bffd7196e14bbeda1f1b Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 11:01:07 +0200 Subject: [PATCH 22/66] up --- mlpf/pyg/training.py | 85 +++++++++++++++++++++++--------------------- 1 file changed, 45 insertions(+), 40 deletions(-) diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index 93218f098..ac56db023 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -1,4 +1,5 @@ import csv +import glob import json import logging import os @@ -12,14 +13,12 @@ from typing import Optional import fastjet -import numpy as np -import pandas import matplotlib import matplotlib.pyplot as plt -import glob - -# comet needs to be imported before torch -from comet_ml import OfflineExperiment, Experiment # noqa: F401, isort:skip +import numpy as np +import pandas +import sklearn +import sklearn.metrics import torch import torch.distributed as dist import torch.multiprocessing as mp @@ -27,7 +26,7 @@ import yaml from pyg.inference import make_plots, run_predictions from pyg.logger import _configLogger, _logger -from pyg.mlpf import MLPF +from pyg.mlpf import MLPF, set_save_attention from pyg.PFDataset import Collater, PFDataset, get_interleaved_dataloaders from pyg.utils import ( CLASS_LABELS, @@ -46,35 +45,11 @@ from torch.nn import functional as F from torch.profiler import ProfilerActivity, profile, record_function from torch.utils.tensorboard import SummaryWriter - -from pyg.logger import _logger, _configLogger -from pyg.utils import ( - unpack_predictions, - unpack_target, - get_model_state_dict, - load_checkpoint, - save_checkpoint, - CLASS_LABELS, - X_FEATURES, - ELEM_TYPES_NONZERO, - save_HPs, - get_lr_schedule, - count_parameters, -) - - -import fastjet -from pyg.inference import make_plots, run_predictions - -from pyg.mlpf import set_save_attention -from pyg.mlpf import MLPF -from pyg.PFDataset import Collater, PFDataset, get_interleaved_dataloaders from utils import create_comet_experiment # comet needs to be imported before torch from comet_ml import OfflineExperiment, Experiment # noqa: F401, isort:skip - # Ignore divide by 0 errors np.seterr(divide="ignore", invalid="ignore") @@ -119,7 +94,9 @@ def mlpf_loss(y, ypred, batch): # binary loss for particle / no-particle classification # loss_binary_classification = loss_obj_id(ypred["cls_binary"], (y["cls_id"] != 0).long()).reshape(y["cls_id"].shape) - loss_binary_classification = 10 * torch.nn.functional.cross_entropy(ypred["cls_binary"], (y["cls_id"] != 0).long(), reduction="none") + loss_binary_classification = 10 * torch.nn.functional.cross_entropy( + ypred["cls_binary"], (y["cls_id"] != 0).long(), reduction="none" + ) # compare the particle type, only for cases where there was a true particle loss_pid_classification = loss_obj_id(ypred["cls_id_onehot"], y["cls_id"]).reshape(y["cls_id"].shape) @@ -406,18 +383,30 @@ def validation_plots(batch, ypred_raw, ygen, ypred, tensorboard_writer, epoch, o ratio = (ypred_raw[2][batch.mask][:, 1] / batch.ygen[batch.mask][:, 3])[batch.ygen[batch.mask][:, 
0] != 0] tensorboard_writer.add_histogram("eta_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("sphi_target", torch.clamp(batch.ygen[batch.mask][:, 4], -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("sphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 2], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram( + "sphi_target", torch.clamp(batch.ygen[batch.mask][:, 4], -10, 10), global_step=epoch + ) + tensorboard_writer.add_histogram( + "sphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 2], -10, 10), global_step=epoch + ) ratio = (ypred_raw[2][batch.mask][:, 2] / batch.ygen[batch.mask][:, 4])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("sphi_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("cphi_target", torch.clamp(batch.ygen[batch.mask][:, 5], -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("cphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 3], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram( + "cphi_target", torch.clamp(batch.ygen[batch.mask][:, 5], -10, 10), global_step=epoch + ) + tensorboard_writer.add_histogram( + "cphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 3], -10, 10), global_step=epoch + ) ratio = (ypred_raw[2][batch.mask][:, 3] / batch.ygen[batch.mask][:, 5])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("cphi_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("energy_target", torch.clamp(batch.ygen[batch.mask][:, 6], -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("energy_pred", torch.clamp(ypred_raw[2][batch.mask][:, 4], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram( + "energy_target", torch.clamp(batch.ygen[batch.mask][:, 6], -10, 10), global_step=epoch + ) + tensorboard_writer.add_histogram( + "energy_pred", torch.clamp(ypred_raw[2][batch.mask][:, 4], -10, 10), global_step=epoch + ) ratio = (ypred_raw[2][batch.mask][:, 4] / batch.ygen[batch.mask][:, 6])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("energy_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) @@ -910,19 +899,35 @@ def train_mlpf( color="bold", ) + log_t = ( + losses_t["Regression_pt"] + + losses_t["Regression_eta"] + + losses_t["Regression_sin_phi"] + + losses_t["Regression_cos_phi"] + + losses_t["Regression_energy"] + ) + _logger.info( f"train: loss_total={losses_t['Total']:.4f} " + f"loss_clf={losses_t['Classification']:.4f} " + f"loss_clfbinary={losses_t['Classification_binary']:.4f} " - + f"loss_reg={losses_t['Regression']:.4f} ", + + f"loss_reg={log_t:.4f} ", color="bold", ) + log_v = ( + losses_v["Regression_pt"] + + losses_v["Regression_eta"] + + losses_v["Regression_sin_phi"] + + losses_v["Regression_cos_phi"] + + losses_v["Regression_energy"] + ) + _logger.info( f"valid: loss_total={losses_v['Total']:.4f} " + f"loss_clf={losses_v['Classification']:.4f} " + f"loss_clfbinary={losses_v['Classification_binary']:.4f} " - + f"loss_reg={losses_v['Regression']:.4f} ", + + f"loss_reg={log_v:.4f} ", color="bold", ) From c3dea3c298e5d2b5d4336f6d9531ef08dabc9afe Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 11:18:00 +0200 Subject: [PATCH 23/66] up --- parameters/pytorch/pyg-clic-ttbar.yaml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/parameters/pytorch/pyg-clic-ttbar.yaml b/parameters/pytorch/pyg-clic-ttbar.yaml index e949ac554..38ac0a553 100644 --- 
a/parameters/pytorch/pyg-clic-ttbar.yaml +++ b/parameters/pytorch/pyg-clic-ttbar.yaml @@ -1,5 +1,6 @@ backend: pytorch +save_attention: yes dataset: clic sort_data: no data_dir: @@ -27,11 +28,11 @@ model: trainable: all learned_representation_mode: last #last, concat input_encoding: split #split, joint - pt_mode: linear + pt_mode: direct-elemtype-split eta_mode: linear sin_phi_mode: linear cos_phi_mode: linear - energy_mode: linear + energy_mode: direct-elemtype-split gnn_lsh: conv_type: gnn_lsh @@ -60,7 +61,8 @@ model: head_dim: 64 num_heads: 12 attention_type: math - standardize_inputs: True + standardize_inputs: False + use_pre_layernorm: True mamba: conv_type: mamba From 8057f0d13c2ffa341423d5b997b610512a713cee Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 11:18:57 +0200 Subject: [PATCH 24/66] up --- ...clic-ttbar.yaml => pyg-clic-ttbar-21.yaml} | 0 parameters/pytorch/pyg-clic-ttbar-22.yaml | 123 ++++++++++++++++++ 2 files changed, 123 insertions(+) rename parameters/pytorch/{pyg-clic-ttbar.yaml => pyg-clic-ttbar-21.yaml} (100%) create mode 100644 parameters/pytorch/pyg-clic-ttbar-22.yaml diff --git a/parameters/pytorch/pyg-clic-ttbar.yaml b/parameters/pytorch/pyg-clic-ttbar-21.yaml similarity index 100% rename from parameters/pytorch/pyg-clic-ttbar.yaml rename to parameters/pytorch/pyg-clic-ttbar-21.yaml diff --git a/parameters/pytorch/pyg-clic-ttbar-22.yaml b/parameters/pytorch/pyg-clic-ttbar-22.yaml new file mode 100644 index 000000000..90a12c4bc --- /dev/null +++ b/parameters/pytorch/pyg-clic-ttbar-22.yaml @@ -0,0 +1,123 @@ +backend: pytorch + +save_attention: yes +dataset: clic +sort_data: no +data_dir: +gpus: 1 +gpu_batch_multiplier: 1 +load: +num_epochs: 100 +patience: 20 +lr: 0.0001 +lr_schedule: cosinedecay # constant, cosinedecay, onecycle +conv_type: attention # gnn_lsh, attention, mamba, flashattention +ntrain: +ntest: +nvalid: +num_workers: 0 +prefetch_factor: +checkpoint_freq: +comet_name: particleflow-pt +comet_offline: False +comet_step_freq: 100 +dtype: float32 +val_freq: # run an extra validation run every val_freq training steps + +model: + trainable: all + learned_representation_mode: last #last, concat + input_encoding: split #split, joint + pt_mode: direct-elemtype-split + eta_mode: linear + sin_phi_mode: linear + cos_phi_mode: linear + energy_mode: direct-elemtype-split + + gnn_lsh: + conv_type: gnn_lsh + embedding_dim: 512 + width: 512 + num_convs: 8 + activation: "elu" + # gnn-lsh specific parameters + bin_size: 32 + max_num_bins: 200 + distance_dim: 128 + layernorm: True + num_node_messages: 2 + ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 2 + + attention: + conv_type: attention + num_convs: 8 + dropout_ff: 0.0 + dropout_conv_id_mha: 0.0 + dropout_conv_id_ff: 0.0 + dropout_conv_reg_mha: 0.0 + dropout_conv_reg_ff: 0.0 + activation: "relu" + head_dim: 64 + num_heads: 12 + attention_type: math + standardize_inputs: False + use_pre_layernorm: True + + mamba: + conv_type: mamba + embedding_dim: 128 + width: 128 + num_convs: 2 + dropout: 0.0 + activation: "elu" + # transformer specific paramters + num_heads: 2 + # mamba specific paramters + d_state: 16 + d_conv: 4 + expand: 2 + +lr_schedule_config: + onecycle: + pct_start: 0.3 + +raytune: + local_dir: # Note: please specify an absolute path + sched: # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: 
+ max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.2.0 + +valid_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.2.0 + +test_dataset: + clic_edm_ttbar_pf: + version: 2.2.0 \ No newline at end of file From 60994cebc9e21b94fd40957ad4c32eaf317e615a Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 11:24:25 +0200 Subject: [PATCH 25/66] up 26 feats --- mlpf/pyg/training.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index ac56db023..77e32877a 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -993,8 +993,13 @@ def run(rank, world_size, config, args, outdir, logfile): model, optimizer = load_checkpoint(checkpoint, model, optimizer) else: # instantiate a new model in the outdir created + + input_dim = ( + len(X_FEATURES[config["dataset"]]) if config["test_dataset"]["clic_edm_ttbar_pf"]["version"] != "2.2.0" else 26 + ) + model_kwargs = { - "input_dim": len(X_FEATURES[config["dataset"]]), + "input_dim": input_dim, "num_classes": len(CLASS_LABELS[config["dataset"]]), "input_encoding": config["model"]["input_encoding"], "pt_mode": config["model"]["pt_mode"], @@ -1230,8 +1235,12 @@ def train_ray_trial(config, args, outdir=None): world_rank = ray.train.get_context().get_world_rank() world_size = ray.train.get_context().get_world_size() + input_dim = ( + len(X_FEATURES[config["dataset"]]) if config["test_dataset"]["clic_edm_ttbar_pf"]["version"] != "2.2.0" else 26 + ) + model_kwargs = { - "input_dim": len(X_FEATURES[config["dataset"]]), + "input_dim": input_dim, "num_classes": len(CLASS_LABELS[config["dataset"]]), "input_encoding": config["model"]["input_encoding"], "pt_mode": config["model"]["pt_mode"], From c9853230247b3869ef15f5da3f635d93587b2aab Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 11:32:24 +0200 Subject: [PATCH 26/66] up --- mlpf/pyg/training.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index 77e32877a..d02ed1c04 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -50,6 +50,7 @@ # comet needs to be imported before torch from comet_ml import OfflineExperiment, Experiment # noqa: F401, isort:skip + # Ignore divide by 0 errors np.seterr(divide="ignore", invalid="ignore") @@ -993,13 +994,8 @@ def run(rank, world_size, config, args, outdir, logfile): model, optimizer = load_checkpoint(checkpoint, model, optimizer) else: # instantiate a new model in the outdir created - - input_dim = ( - len(X_FEATURES[config["dataset"]]) if config["test_dataset"]["clic_edm_ttbar_pf"]["version"] != "2.2.0" else 26 - ) - model_kwargs = { - "input_dim": input_dim, + "input_dim": len(X_FEATURES[config["dataset"]]), "num_classes": len(CLASS_LABELS[config["dataset"]]), "input_encoding": config["model"]["input_encoding"], "pt_mode": config["model"]["pt_mode"], @@ -1235,12 +1231,8 @@ def train_ray_trial(config, args, outdir=None): world_rank = ray.train.get_context().get_world_rank() world_size = ray.train.get_context().get_world_size() - input_dim = ( - len(X_FEATURES[config["dataset"]]) if config["test_dataset"]["clic_edm_ttbar_pf"]["version"] != "2.2.0" else 26 - ) - model_kwargs = { - "input_dim": input_dim, + "input_dim": len(X_FEATURES[config["dataset"]]), "num_classes": len(CLASS_LABELS[config["dataset"]]), 
"input_encoding": config["model"]["input_encoding"], "pt_mode": config["model"]["pt_mode"], From b41ba20edb645cad01104e852ac168b1e11d95eb Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 11:33:15 +0200 Subject: [PATCH 27/66] add 26 input_dim --- mlpf/pyg/training.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index d02ed1c04..c2edfa89b 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -994,8 +994,13 @@ def run(rank, world_size, config, args, outdir, logfile): model, optimizer = load_checkpoint(checkpoint, model, optimizer) else: # instantiate a new model in the outdir created + + input_dim = ( + len(X_FEATURES[config["dataset"]]) if config["test_dataset"]["clic_edm_ttbar_pf"]["version"] != "2.2.0" else 26 + ) + model_kwargs = { - "input_dim": len(X_FEATURES[config["dataset"]]), + "input_dim": input_dim, "num_classes": len(CLASS_LABELS[config["dataset"]]), "input_encoding": config["model"]["input_encoding"], "pt_mode": config["model"]["pt_mode"], @@ -1231,8 +1236,12 @@ def train_ray_trial(config, args, outdir=None): world_rank = ray.train.get_context().get_world_rank() world_size = ray.train.get_context().get_world_size() + input_dim = ( + len(X_FEATURES[config["dataset"]]) if config["test_dataset"]["clic_edm_ttbar_pf"]["version"] != "2.2.0" else 26 + ) + model_kwargs = { - "input_dim": len(X_FEATURES[config["dataset"]]), + "input_dim": input_dim, "num_classes": len(CLASS_LABELS[config["dataset"]]), "input_encoding": config["model"]["input_encoding"], "pt_mode": config["model"]["pt_mode"], From 31bd710833f4e5732aa6ddccd414c6804a8fadd2 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 11:35:51 +0200 Subject: [PATCH 28/66] up --- mlpf/pyg/training.py | 91 ++++++++++++++++++++++---------------------- 1 file changed, 46 insertions(+), 45 deletions(-) diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index c2edfa89b..dd8a433fb 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -17,8 +17,9 @@ import matplotlib.pyplot as plt import numpy as np import pandas -import sklearn -import sklearn.metrics + +# import sklearn +# import sklearn.metrics import torch import torch.distributed as dist import torch.multiprocessing as mp @@ -479,10 +480,10 @@ def train_and_valid( loss_accum = 0.0 val_freq_time_0 = time.time() - if not is_train: - cm_X_gen = np.zeros((13, 13)) - cm_X_pred = np.zeros((13, 13)) - cm_id = np.zeros((13, 13)) + # if not is_train: + # cm_X_gen = np.zeros((13, 13)) + # cm_X_pred = np.zeros((13, 13)) + # cm_id = np.zeros((13, 13)) for itrain, batch in iterator: set_save_attention(model, outdir, False) @@ -505,25 +506,25 @@ def train_and_valid( ypred = unpack_predictions(ypred_raw) - if not is_train: - cm_X_gen += sklearn.metrics.confusion_matrix( - batch.X[:, :, 0][batch.mask].detach().cpu().numpy(), - ygen["cls_id"][batch.mask].detach().cpu().numpy(), - labels=range(13), - ) - cm_X_pred += sklearn.metrics.confusion_matrix( - batch.X[:, :, 0][batch.mask].detach().cpu().numpy(), - ypred["cls_id"][batch.mask].detach().cpu().numpy(), - labels=range(13), - ) - cm_id += sklearn.metrics.confusion_matrix( - ygen["cls_id"][batch.mask].detach().cpu().numpy(), - ypred["cls_id"][batch.mask].detach().cpu().numpy(), - labels=range(13), - ) - # save the events of the first validation batch for quick checks - if (rank == 0 or rank == "cpu") and itrain == 0: - validation_plots(batch, ypred_raw, ygen, ypred, tensorboard_writer, epoch, outdir) + # if not is_train: + # 
cm_X_gen += sklearn.metrics.confusion_matrix( + # batch.X[:, :, 0][batch.mask].detach().cpu().numpy(), + # ygen["cls_id"][batch.mask].detach().cpu().numpy(), + # labels=range(13), + # ) + # cm_X_pred += sklearn.metrics.confusion_matrix( + # batch.X[:, :, 0][batch.mask].detach().cpu().numpy(), + # ypred["cls_id"][batch.mask].detach().cpu().numpy(), + # labels=range(13), + # ) + # cm_id += sklearn.metrics.confusion_matrix( + # ygen["cls_id"][batch.mask].detach().cpu().numpy(), + # ypred["cls_id"][batch.mask].detach().cpu().numpy(), + # labels=range(13), + # ) + # # save the events of the first validation batch for quick checks + # if (rank == 0 or rank == "cpu") and itrain == 0: + # validation_plots(batch, ypred_raw, ygen, ypred, tensorboard_writer, epoch, outdir) with torch.autocast(device_type=device_type, dtype=dtype, enabled=device_type == "cuda"): if is_train: loss = mlpf_loss(ygen, ypred, batch) @@ -623,26 +624,26 @@ def train_and_valid( comet_experiment.log_metrics(intermediate_losses_v, prefix="valid", step=step) val_freq_time_0 = time.time() # reset intermediate validation spacing timer - if not is_train and comet_experiment: - comet_experiment.log_confusion_matrix( - matrix=cm_X_gen, - title="Element to target", - row_label="X", - column_label="target", - epoch=epoch, - file_name="cm_X_gen.json", - ) - comet_experiment.log_confusion_matrix( - matrix=cm_X_pred, - title="Element to pred", - row_label="X", - column_label="pred", - epoch=epoch, - file_name="cm_X_pred.json", - ) - comet_experiment.log_confusion_matrix( - matrix=cm_id, title="Target to pred", row_label="gen", column_label="pred", epoch=epoch, file_name="cm_id.json" - ) + # if not is_train and comet_experiment: + # comet_experiment.log_confusion_matrix( + # matrix=cm_X_gen, + # title="Element to target", + # row_label="X", + # column_label="target", + # epoch=epoch, + # file_name="cm_X_gen.json", + # ) + # comet_experiment.log_confusion_matrix( + # matrix=cm_X_pred, + # title="Element to pred", + # row_label="X", + # column_label="pred", + # epoch=epoch, + # file_name="cm_X_pred.json", + # ) + # comet_experiment.log_confusion_matrix( + # matrix=cm_id, title="Target to pred", row_label="gen", column_label="pred", epoch=epoch, file_name="cm_id.json" + # ) num_data = torch.tensor(len(data_loader), device=rank) # sum up the number of steps from all workers From 73818768828c362531844cf5cc701eb0ac45ae43 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 11:53:49 +0200 Subject: [PATCH 29/66] up vs 2.2.0 for standardization --- mlpf/pyg/clic_standardization.json | 2 +- mlpf/pyg/mlpf.py | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/mlpf/pyg/clic_standardization.json b/mlpf/pyg/clic_standardization.json index 99114010b..616aa0023 100644 --- a/mlpf/pyg/clic_standardization.json +++ b/mlpf/pyg/clic_standardization.json @@ -1 +1 @@ -{"2.1.0": {"PFelement1": {"mean": [1.0, 3.306542158126831, -0.0016281852731481194, 0.00044645098387263715, 0.008498543873429298, 4.233071804046631, 31.000581741333008, 15.700974464416504, 0.0, 0.0, 43.88750457763672, -0.002807975746691227, 0.0019595674239099026, 7.4845520430244505e-06, -0.07514938712120056, -1.0, 0.0], "std": [0.0, 14.216273307800293, 0.864812970161438, 0.7057437300682068, 0.708417534828186, 20.747175216674805, 1617.955078125, 6.582387924194336, 0.0, 0.0, 56.95081329345703, 1.3340226411819458, 2.9433951377868652, 0.0037820138968527317, 37.608154296875, 0.0, 0.0]}, "PFelement2": {"mean": [2.0, 2.568070650100708, -0.00073066825279966, 
-0.0011971204075962305, 0.004265286959707737, 3.290896415710449, 11.28282356262207, -0.4471941590309143, -4.9265971183776855, 1.571304440498352, 1.8862318992614746, 0.9784500598907471, 1.2197442629258148e-05, 80.69485473632812, 50.48505783081055, 50.41227722167969, 51.77717971801758], "std": [0.0, 4.781670093536377, 0.9176656603813171, 0.707072377204895, 0.70712810754776, 5.762104511260986, 1084.093505859375, 1080.34375, 1554.9664306640625, 0.6987221240997314, 3.8536908626556396, 3.2011756896972656, 0.00021397981618065387, 105.88664245605469, 72.17912292480469, 71.81172180175781, 72.6884765625]}}} +{"2.2.0": {"PFelement1": {"mean": [1.0, 3.313861608505249, 0.0016492522554472089, 0.0001337795110885054, 0.008735032752156258, 4.222240924835205, 52.052459716796875, 15.738365173339844, 0.0, 0.0, 43.952632904052734, 0.003598652081564069, 0.0025099683552980423, 1.8601234614834539e-06, 0.06345824152231216, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], "std": [0.0, 9.41322135925293, 0.8651331663131714, 0.707984983921051, 0.7061750888824463, 11.704730033874512, 9963.4365234375, 6.578129768371582, 0.0, 0.0, 57.3457145690918, 1.3837608098983765, 2.9924261569976807, 0.0038007558323442936, 27.722505569458008, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}, "PFelement2": {"mean": [2.0, 2.5798463821411133, -0.0016945624956861138, 0.0002545498718973249, 0.006834524683654308, 3.3009438514709473, 15.00655460357666, 0.9023095965385437, -5.961066722869873, 1.5725995302200317, 1.894817590713501, 0.9789467453956604, 1.2573817912198137e-05, 80.9168701171875, 50.47904586791992, 50.581722259521484, 51.96408462524414, 0.6345308423042297, 0.041190944612026215, 41.63414001464844, 41.84788513183594, 54.959861755371094, 11619.2177734375, -5.938568115234375, 1578.74169921875, 0.6185510158538818], "std": [0.0, 4.814330577850342, 0.918420672416687, 0.7074259519577026, 0.7067552208900452, 5.785495281219482, 1081.1585693359375, 1084.4302978515625, 1554.4815673828125, 0.6989641189575195, 3.899383783340454, 3.1981284618377686, 0.00021599895262625068, 106.33512878417969, 71.94786834716797, 72.29435729980469, 73.6717758178711, 0.6663865447044373, 0.045639555901288986, 61.58401870727539, 62.009708404541016, 88.85784912109375, 33432.66796875, 1562.9705810546875, 7268.15087890625, 1.1162511110305786]}}, "2.1.0": {"PFelement1": {"mean": [1.0, 3.306542158126831, -0.001628185505978763, 0.00044645112939178944, 0.008498544804751873, 4.233072757720947, 31.00058364868164, 15.700974464416504, 0.0, 0.0, 43.88750457763672, -0.0028079766780138016, 0.001959568355232477, 7.4845515882771e-06, -0.07514937967061996, -1.0, 0.0], "std": [0.0, 14.216273307800293, 0.864812970161438, 0.7057437300682068, 0.708417534828186, 20.747175216674805, 1617.955078125, 6.582387924194336, 0.0, 0.0, 56.95081329345703, 1.3340226411819458, 2.9433951377868652, 0.0037820138968527317, 37.608154296875, 0.0, 0.0]}, "PFelement2": {"mean": [2.0, 2.568070650100708, -0.000730668194591999, -0.0011971204075962305, 0.004265286028385162, 3.2908971309661865, 11.28282356262207, -0.4471946060657501, -4.926597595214844, 1.571304440498352, 1.886231780052185, 0.9784500002861023, 1.2197442629258148e-05, 80.69485473632812, 50.48505783081055, 50.41227340698242, 51.77717208862305], "std": [0.0, 4.781670093536377, 0.9176656603813171, 0.707072377204895, 0.70712810754776, 5.762104511260986, 1084.093505859375, 1080.34375, 1554.9664306640625, 0.6987221240997314, 3.8536908626556396, 3.2011756896972656, 0.00021397981618065387, 105.88664245605469, 72.17912292480469, 
71.81172180175781, 72.6884765625]}}} \ No newline at end of file diff --git a/mlpf/pyg/mlpf.py b/mlpf/pyg/mlpf.py index f47c7cec0..a3920288f 100644 --- a/mlpf/pyg/mlpf.py +++ b/mlpf/pyg/mlpf.py @@ -58,10 +58,16 @@ def norm_cdf(x): def standardize_inputs(X, elemtypes_nonzero): + + if X.shape[-1] == 26: + vs = "2.2.0" + else: + vs = "2.1.0" + import json with open("/pfvolcentral/clic_standardization.json", "rb") as f: - standard_dict = json.load(f)["2.1.0"] + standard_dict = json.load(f)[vs] for i, ielem in enumerate(elemtypes_nonzero): From a7f9a46d5d39b2c30fb4ef677e931d4e4e313919 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 13:47:39 +0200 Subject: [PATCH 30/66] more configs --- parameters/pytorch/pyg-clic-ttbar-21-std.yaml | 123 ++++++++++++++++++ parameters/pytorch/pyg-clic-ttbar-22-std.yaml | 123 ++++++++++++++++++ 2 files changed, 246 insertions(+) create mode 100644 parameters/pytorch/pyg-clic-ttbar-21-std.yaml create mode 100644 parameters/pytorch/pyg-clic-ttbar-22-std.yaml diff --git a/parameters/pytorch/pyg-clic-ttbar-21-std.yaml b/parameters/pytorch/pyg-clic-ttbar-21-std.yaml new file mode 100644 index 000000000..4b3152b5a --- /dev/null +++ b/parameters/pytorch/pyg-clic-ttbar-21-std.yaml @@ -0,0 +1,123 @@ +backend: pytorch + +save_attention: yes +dataset: clic +sort_data: no +data_dir: +gpus: 1 +gpu_batch_multiplier: 1 +load: +num_epochs: 100 +patience: 20 +lr: 0.0001 +lr_schedule: cosinedecay # constant, cosinedecay, onecycle +conv_type: attention # gnn_lsh, attention, mamba, flashattention +ntrain: +ntest: +nvalid: +num_workers: 0 +prefetch_factor: +checkpoint_freq: +comet_name: particleflow-pt +comet_offline: False +comet_step_freq: 100 +dtype: float32 +val_freq: # run an extra validation run every val_freq training steps + +model: + trainable: all + learned_representation_mode: last #last, concat + input_encoding: split #split, joint + pt_mode: direct-elemtype-split + eta_mode: linear + sin_phi_mode: linear + cos_phi_mode: linear + energy_mode: direct-elemtype-split + + gnn_lsh: + conv_type: gnn_lsh + embedding_dim: 512 + width: 512 + num_convs: 8 + activation: "elu" + # gnn-lsh specific parameters + bin_size: 32 + max_num_bins: 200 + distance_dim: 128 + layernorm: True + num_node_messages: 2 + ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 2 + + attention: + conv_type: attention + num_convs: 8 + dropout_ff: 0.0 + dropout_conv_id_mha: 0.0 + dropout_conv_id_ff: 0.0 + dropout_conv_reg_mha: 0.0 + dropout_conv_reg_ff: 0.0 + activation: "relu" + head_dim: 64 + num_heads: 12 + attention_type: math + standardize_inputs: True + use_pre_layernorm: True + + mamba: + conv_type: mamba + embedding_dim: 128 + width: 128 + num_convs: 2 + dropout: 0.0 + activation: "elu" + # transformer specific paramters + num_heads: 2 + # mamba specific paramters + d_state: 16 + d_conv: 4 + expand: 2 + +lr_schedule_config: + onecycle: + pct_start: 0.3 + +raytune: + local_dir: # Note: please specify an absolute path + sched: # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + +valid_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + +test_dataset: 
+ clic_edm_ttbar_pf: + version: 2.1.0 \ No newline at end of file diff --git a/parameters/pytorch/pyg-clic-ttbar-22-std.yaml b/parameters/pytorch/pyg-clic-ttbar-22-std.yaml new file mode 100644 index 000000000..d061cfc07 --- /dev/null +++ b/parameters/pytorch/pyg-clic-ttbar-22-std.yaml @@ -0,0 +1,123 @@ +backend: pytorch + +save_attention: yes +dataset: clic +sort_data: no +data_dir: +gpus: 1 +gpu_batch_multiplier: 1 +load: +num_epochs: 100 +patience: 20 +lr: 0.0001 +lr_schedule: cosinedecay # constant, cosinedecay, onecycle +conv_type: attention # gnn_lsh, attention, mamba, flashattention +ntrain: +ntest: +nvalid: +num_workers: 0 +prefetch_factor: +checkpoint_freq: +comet_name: particleflow-pt +comet_offline: False +comet_step_freq: 100 +dtype: float32 +val_freq: # run an extra validation run every val_freq training steps + +model: + trainable: all + learned_representation_mode: last #last, concat + input_encoding: split #split, joint + pt_mode: direct-elemtype-split + eta_mode: linear + sin_phi_mode: linear + cos_phi_mode: linear + energy_mode: direct-elemtype-split + + gnn_lsh: + conv_type: gnn_lsh + embedding_dim: 512 + width: 512 + num_convs: 8 + activation: "elu" + # gnn-lsh specific parameters + bin_size: 32 + max_num_bins: 200 + distance_dim: 128 + layernorm: True + num_node_messages: 2 + ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 2 + + attention: + conv_type: attention + num_convs: 8 + dropout_ff: 0.0 + dropout_conv_id_mha: 0.0 + dropout_conv_id_ff: 0.0 + dropout_conv_reg_mha: 0.0 + dropout_conv_reg_ff: 0.0 + activation: "relu" + head_dim: 64 + num_heads: 12 + attention_type: math + standardize_inputs: True + use_pre_layernorm: True + + mamba: + conv_type: mamba + embedding_dim: 128 + width: 128 + num_convs: 2 + dropout: 0.0 + activation: "elu" + # transformer specific paramters + num_heads: 2 + # mamba specific paramters + d_state: 16 + d_conv: 4 + expand: 2 + +lr_schedule_config: + onecycle: + pct_start: 0.3 + +raytune: + local_dir: # Note: please specify an absolute path + sched: # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.2.0 + +valid_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.2.0 + +test_dataset: + clic_edm_ttbar_pf: + version: 2.2.0 \ No newline at end of file From 74e635f6f81df40bb8025678f4504f7d162942b1 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 14:00:00 +0200 Subject: [PATCH 31/66] better docs --- mlpf/heptfds/clic_pf_edm4hep/utils_edm.py | 2 +- scripts/clic/postprocessing.py | 14 +++----------- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py b/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py index c63d74994..68e0b610e 100644 --- a/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py +++ b/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py @@ -40,7 +40,7 @@ "sigma_x", "sigma_y", "sigma_z", - # added by farouk + # additional cluster input features "energyError", "sigma_energy", "sigma_x_weighted", diff --git a/scripts/clic/postprocessing.py b/scripts/clic/postprocessing.py index 43c7b06b1..685d6191d 100644 --- a/scripts/clic/postprocessing.py +++ 
b/scripts/clic/postprocessing.py @@ -63,7 +63,7 @@ "sigma_x", "sigma_y", "sigma_z", - # added by farouk + # additional cluster input features "energyError", "sigma_energy", "sigma_x_weighted", @@ -343,7 +343,6 @@ def cluster_to_features(prop_data, hit_features, hit_to_cluster, iev): cl_sigma_y = [] cl_sigma_z = [] - # added by farouk cl_sigma_energy = [] cl_sigma_x_weighted, cl_sigma_y_weighted, cl_sigma_z_weighted = [], [], [] cl_energy_weighted_width = [] @@ -378,7 +377,6 @@ def cluster_to_features(prop_data, hit_features, hit_to_cluster, iev): cl_sigma_y.append(np.std(hits_posy)) cl_sigma_z.append(np.std(hits_posz)) - # added by farouk cl_sigma_energy.append(np.std(hits_energy)) cl_sigma_x_weighted.append(np.std(hits_posx * hits_energy)) cl_sigma_y_weighted.append(np.std(hits_posy * hits_energy)) @@ -393,14 +391,8 @@ def cluster_to_features(prop_data, hit_features, hit_to_cluster, iev): cl_energy_weighted_width.append(num / den) - # if i==1: - # xs += [np.array(hits_posx)] - # ys += [np.array(hits_posy)] - # zs += [np.array(hits_posz)] - # es += [np.array(hits_energy)] - # get position at shower max - # for each unique z integrate the energy of all the hits to find zmax + # at each unique "z" integrate the energy of all the hits to find zmax zmax, emax = 0, -1000 for z in np.unique(np.array(hits_posz)): msk = np.array(hits_posz) == z @@ -452,7 +444,7 @@ def cluster_to_features(prop_data, hit_features, hit_to_cluster, iev): ret["sin_phi"] = np.sin(ret["phi"]) ret["cos_phi"] = np.cos(ret["phi"]) - # added by farouk + # additional cluster input features ret["sigma_energy"] = np.array(cl_sigma_energy) ret["sigma_x_weighted"] = np.array(cl_sigma_x_weighted) ret["sigma_y_weighted"] = np.array(cl_sigma_y_weighted) From dd4c43a2b1cbae3c2f71ca8115b1f282e67e27e8 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 14:43:30 +0200 Subject: [PATCH 32/66] up standardization pipeline --- mlpf/pyg/clic_standardization.json | 1 - mlpf/pyg/mlpf.py | 26 +++------- mlpf/pyg/training.py | 47 ++++++++++++++++++- parameters/pytorch/pyg-clic-ttbar-21-std.yaml | 2 +- parameters/pytorch/pyg-clic-ttbar-21.yaml | 2 +- parameters/pytorch/pyg-clic-ttbar-22-std.yaml | 2 +- parameters/pytorch/pyg-clic-ttbar-22.yaml | 2 +- 7 files changed, 55 insertions(+), 27 deletions(-) delete mode 100644 mlpf/pyg/clic_standardization.json diff --git a/mlpf/pyg/clic_standardization.json b/mlpf/pyg/clic_standardization.json deleted file mode 100644 index 616aa0023..000000000 --- a/mlpf/pyg/clic_standardization.json +++ /dev/null @@ -1 +0,0 @@ -{"2.2.0": {"PFelement1": {"mean": [1.0, 3.313861608505249, 0.0016492522554472089, 0.0001337795110885054, 0.008735032752156258, 4.222240924835205, 52.052459716796875, 15.738365173339844, 0.0, 0.0, 43.952632904052734, 0.003598652081564069, 0.0025099683552980423, 1.8601234614834539e-06, 0.06345824152231216, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], "std": [0.0, 9.41322135925293, 0.8651331663131714, 0.707984983921051, 0.7061750888824463, 11.704730033874512, 9963.4365234375, 6.578129768371582, 0.0, 0.0, 57.3457145690918, 1.3837608098983765, 2.9924261569976807, 0.0038007558323442936, 27.722505569458008, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}, "PFelement2": {"mean": [2.0, 2.5798463821411133, -0.0016945624956861138, 0.0002545498718973249, 0.006834524683654308, 3.3009438514709473, 15.00655460357666, 0.9023095965385437, -5.961066722869873, 1.5725995302200317, 1.894817590713501, 0.9789467453956604, 1.2573817912198137e-05, 80.9168701171875, 50.47904586791992, 
50.581722259521484, 51.96408462524414, 0.6345308423042297, 0.041190944612026215, 41.63414001464844, 41.84788513183594, 54.959861755371094, 11619.2177734375, -5.938568115234375, 1578.74169921875, 0.6185510158538818], "std": [0.0, 4.814330577850342, 0.918420672416687, 0.7074259519577026, 0.7067552208900452, 5.785495281219482, 1081.1585693359375, 1084.4302978515625, 1554.4815673828125, 0.6989641189575195, 3.899383783340454, 3.1981284618377686, 0.00021599895262625068, 106.33512878417969, 71.94786834716797, 72.29435729980469, 73.6717758178711, 0.6663865447044373, 0.045639555901288986, 61.58401870727539, 62.009708404541016, 88.85784912109375, 33432.66796875, 1562.9705810546875, 7268.15087890625, 1.1162511110305786]}}, "2.1.0": {"PFelement1": {"mean": [1.0, 3.306542158126831, -0.001628185505978763, 0.00044645112939178944, 0.008498544804751873, 4.233072757720947, 31.00058364868164, 15.700974464416504, 0.0, 0.0, 43.88750457763672, -0.0028079766780138016, 0.001959568355232477, 7.4845515882771e-06, -0.07514937967061996, -1.0, 0.0], "std": [0.0, 14.216273307800293, 0.864812970161438, 0.7057437300682068, 0.708417534828186, 20.747175216674805, 1617.955078125, 6.582387924194336, 0.0, 0.0, 56.95081329345703, 1.3340226411819458, 2.9433951377868652, 0.0037820138968527317, 37.608154296875, 0.0, 0.0]}, "PFelement2": {"mean": [2.0, 2.568070650100708, -0.000730668194591999, -0.0011971204075962305, 0.004265286028385162, 3.2908971309661865, 11.28282356262207, -0.4471946060657501, -4.926597595214844, 1.571304440498352, 1.886231780052185, 0.9784500002861023, 1.2197442629258148e-05, 80.69485473632812, 50.48505783081055, 50.41227340698242, 51.77717208862305], "std": [0.0, 4.781670093536377, 0.9176656603813171, 0.707072377204895, 0.70712810754776, 5.762104511260986, 1084.093505859375, 1080.34375, 1554.9664306640625, 0.6987221240997314, 3.8536908626556396, 3.2011756896972656, 0.00021397981618065387, 105.88664245605469, 72.17912292480469, 71.81172180175781, 72.6884765625]}}} \ No newline at end of file diff --git a/mlpf/pyg/mlpf.py b/mlpf/pyg/mlpf.py index a3920288f..7f7ecb922 100644 --- a/mlpf/pyg/mlpf.py +++ b/mlpf/pyg/mlpf.py @@ -57,25 +57,15 @@ def norm_cdf(x): return tensor -def standardize_inputs(X, elemtypes_nonzero): - - if X.shape[-1] == 26: - vs = "2.2.0" - else: - vs = "2.1.0" - - import json - - with open("/pfvolcentral/clic_standardization.json", "rb") as f: - standard_dict = json.load(f)[vs] +def standardize_inputs(X, elemtypes_nonzero, standardization_dict): for i, ielem in enumerate(elemtypes_nonzero): Xfeat_normed_msked = X.clone() # get mean/std of features of that elem - mean = torch.tensor(standard_dict[f"PFelement{ielem}"]["mean"]).to(Xfeat_normed_msked.device) - std = torch.tensor(standard_dict[f"PFelement{ielem}"]["std"]).to(Xfeat_normed_msked.device) + mean = torch.tensor(standardization_dict[f"PFelement{ielem}"]["mean"]).to(Xfeat_normed_msked.device) + std = torch.tensor(standardization_dict[f"PFelement{ielem}"]["std"]).to(Xfeat_normed_msked.device) # standardize Xfeat_normed_msked[..., 1:] = (Xfeat_normed_msked[..., 1:] - mean[..., 1:]) / std[..., 1:] @@ -300,8 +290,6 @@ def __init__( dropout_conv_id_mha=0.0, dropout_conv_id_ff=0.0, use_pre_layernorm=False, - # standardize_inputs - standardize_inputs=False, ): super(MLPF, self).__init__() @@ -321,8 +309,6 @@ def __init__( self.use_pre_layernorm = use_pre_layernorm - self.standardize_inputs = standardize_inputs - if self.conv_type == "attention": embedding_dim = num_heads * head_dim width = num_heads * head_dim @@ -414,11 +400,11 @@ def 
__init__( self.final_norm_reg = torch.nn.LayerNorm(embed_dim) # @torch.compile - def forward(self, X_features, mask): + def forward(self, X_features, mask, standardization_dict=None): Xfeat_normed = X_features - if self.standardize_inputs: - Xfeat_normed = standardize_inputs(X_features, self.elemtypes_nonzero) + if standardization_dict is not None: + Xfeat_normed = standardize_inputs(X_features, self.elemtypes_nonzero, standardization_dict) embeddings_id, embeddings_reg = [], [] if self.num_convs != 0: diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index dd8a433fb..e756a7f08 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -450,6 +450,7 @@ def train_and_valid( dtype=torch.float32, tensorboard_writer=None, save_attention=False, + standardization_dict=None, ): """ Performs training over a given epoch. Will run a validation step every N_STEPS and after the last training batch. @@ -496,13 +497,13 @@ def train_and_valid( with torch.autocast(device_type=device_type, dtype=dtype, enabled=device_type == "cuda"): if is_train: - ypred_raw = model(batch.X, batch.mask) + ypred_raw = model(batch.X, batch.mask, standardization_dict) else: with torch.no_grad(): # save some attention matrices if save_attention and (rank == 0 or rank == "cpu") and itrain == 0: set_save_attention(model, outdir, True) - ypred_raw = model(batch.X, batch.mask) + ypred_raw = model(batch.X, batch.mask, standardization_dict) ypred = unpack_predictions(ypred_raw) @@ -684,6 +685,7 @@ def train_mlpf( comet_step_freq=None, val_freq=None, save_attention=False, + standardization_dict=None, ): """ Will run a full training by calling train(). @@ -747,6 +749,7 @@ def train_mlpf( lr_schedule=lr_schedule, val_freq=val_freq, dtype=dtype, + standardization_dict=standardization_dict, ) prof.export_chrome_trace("trace.json") else: @@ -767,6 +770,7 @@ def train_mlpf( val_freq=val_freq, dtype=dtype, tensorboard_writer=tensorboard_writer_train, + standardization_dict=standardization_dict, ) t_train = time.time() # epoch time excluding validation @@ -787,6 +791,7 @@ def train_mlpf( dtype=dtype, tensorboard_writer=tensorboard_writer_valid, save_attention=save_attention, + standardization_dict=standardization_dict, ) t_valid = time.time() @@ -1072,6 +1077,42 @@ def run(rank, world_size, config, args, outdir, logfile): last_epoch = -1 if start_epoch == 1 else start_epoch - 1 lr_schedule = get_lr_schedule(config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch) + def get_standardization_dict(dataset, train_loader, nsubset=10_000): + + standardization_dict = {} + + for ielem in ELEM_TYPES_NONZERO[dataset]: + standardization_dict["PFelement" + str(ielem)] = {} + + tot_events = 0 + for i, batch in enumerate(train_loader): + + tot_events += batch.X.shape[0] + + # remove the first dimension because we will stack all PFelements anyway to compute the mean/std + batch.X = batch.X.view(-1, batch.X.shape[-1]) + + msk = (batch.X[:, 0] == ielem) & (batch.X[:, 0] != 0) # skip 0 padded elements + + if i == 0: + # initialize + concatenated_pfelements = batch.X[msk] + else: + concatenated_pfelements = torch.cat([concatenated_pfelements, batch.X[msk]]) + + standardization_dict["PFelement" + str(ielem)]["mean"] = torch.mean(concatenated_pfelements, axis=0).tolist() + standardization_dict["PFelement" + str(ielem)]["std"] = torch.std(concatenated_pfelements, axis=0).tolist() + + if tot_events > nsubset: + break + + return standardization_dict + + if config["standardize_inputs"] is True: + standardization_dict = 
get_standardization_dict(config["dataset"], loaders["train"]) + else: + standardization_dict = None + train_mlpf( rank, world_size, @@ -1092,6 +1133,7 @@ def run(rank, world_size, config, args, outdir, logfile): comet_step_freq=config["comet_step_freq"], val_freq=config["val_freq"], save_attention=config["save_attention"], + standardization_dict=standardization_dict, ) checkpoint = torch.load(f"{outdir}/best_weights.pth", map_location=torch.device(rank)) @@ -1345,6 +1387,7 @@ def train_ray_trial(config, args, outdir=None): comet_step_freq=config["comet_step_freq"], dtype=getattr(torch, config["dtype"]), val_freq=config["val_freq"], + standardization_dict=None, ) diff --git a/parameters/pytorch/pyg-clic-ttbar-21-std.yaml b/parameters/pytorch/pyg-clic-ttbar-21-std.yaml index 4b3152b5a..17c3006c9 100644 --- a/parameters/pytorch/pyg-clic-ttbar-21-std.yaml +++ b/parameters/pytorch/pyg-clic-ttbar-21-std.yaml @@ -1,5 +1,6 @@ backend: pytorch +standardize_inputs: True save_attention: yes dataset: clic sort_data: no @@ -61,7 +62,6 @@ model: head_dim: 64 num_heads: 12 attention_type: math - standardize_inputs: True use_pre_layernorm: True mamba: diff --git a/parameters/pytorch/pyg-clic-ttbar-21.yaml b/parameters/pytorch/pyg-clic-ttbar-21.yaml index 38ac0a553..cd3126713 100644 --- a/parameters/pytorch/pyg-clic-ttbar-21.yaml +++ b/parameters/pytorch/pyg-clic-ttbar-21.yaml @@ -1,5 +1,6 @@ backend: pytorch +standardize_inputs: False save_attention: yes dataset: clic sort_data: no @@ -61,7 +62,6 @@ model: head_dim: 64 num_heads: 12 attention_type: math - standardize_inputs: False use_pre_layernorm: True mamba: diff --git a/parameters/pytorch/pyg-clic-ttbar-22-std.yaml b/parameters/pytorch/pyg-clic-ttbar-22-std.yaml index d061cfc07..fa3d8173c 100644 --- a/parameters/pytorch/pyg-clic-ttbar-22-std.yaml +++ b/parameters/pytorch/pyg-clic-ttbar-22-std.yaml @@ -1,5 +1,6 @@ backend: pytorch +standardize_inputs: True save_attention: yes dataset: clic sort_data: no @@ -61,7 +62,6 @@ model: head_dim: 64 num_heads: 12 attention_type: math - standardize_inputs: True use_pre_layernorm: True mamba: diff --git a/parameters/pytorch/pyg-clic-ttbar-22.yaml b/parameters/pytorch/pyg-clic-ttbar-22.yaml index 90a12c4bc..438052100 100644 --- a/parameters/pytorch/pyg-clic-ttbar-22.yaml +++ b/parameters/pytorch/pyg-clic-ttbar-22.yaml @@ -1,5 +1,6 @@ backend: pytorch +standardize_inputs: False save_attention: yes dataset: clic sort_data: no @@ -61,7 +62,6 @@ model: head_dim: 64 num_heads: 12 attention_type: math - standardize_inputs: False use_pre_layernorm: True mamba: From 80bd33058a9d64b296c7127405c6ce2365ad1005 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 14:45:07 +0200 Subject: [PATCH 33/66] better docs --- mlpf/pyg/training.py | 34 ++---------------------------- mlpf/pyg/utils.py | 50 +++++++++++++++++++++++++++++++++++++++----- 2 files changed, 47 insertions(+), 37 deletions(-) diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index e756a7f08..20d1d2c1d 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -34,6 +34,7 @@ ELEM_TYPES_NONZERO, X_FEATURES, count_parameters, + get_input_standardization, get_lr_schedule, get_model_state_dict, load_checkpoint, @@ -1077,39 +1078,8 @@ def run(rank, world_size, config, args, outdir, logfile): last_epoch = -1 if start_epoch == 1 else start_epoch - 1 lr_schedule = get_lr_schedule(config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch) - def get_standardization_dict(dataset, train_loader, nsubset=10_000): - - standardization_dict = {} - - 
for ielem in ELEM_TYPES_NONZERO[dataset]: - standardization_dict["PFelement" + str(ielem)] = {} - - tot_events = 0 - for i, batch in enumerate(train_loader): - - tot_events += batch.X.shape[0] - - # remove the first dimension because we will stack all PFelements anyway to compute the mean/std - batch.X = batch.X.view(-1, batch.X.shape[-1]) - - msk = (batch.X[:, 0] == ielem) & (batch.X[:, 0] != 0) # skip 0 padded elements - - if i == 0: - # initialize - concatenated_pfelements = batch.X[msk] - else: - concatenated_pfelements = torch.cat([concatenated_pfelements, batch.X[msk]]) - - standardization_dict["PFelement" + str(ielem)]["mean"] = torch.mean(concatenated_pfelements, axis=0).tolist() - standardization_dict["PFelement" + str(ielem)]["std"] = torch.std(concatenated_pfelements, axis=0).tolist() - - if tot_events > nsubset: - break - - return standardization_dict - if config["standardize_inputs"] is True: - standardization_dict = get_standardization_dict(config["dataset"], loaders["train"]) + standardization_dict = get_input_standardization(config["dataset"], loaders["train"]) else: standardization_dict = None diff --git a/mlpf/pyg/utils.py b/mlpf/pyg/utils.py index 6ec64c480..d55e7ec30 100644 --- a/mlpf/pyg/utils.py +++ b/mlpf/pyg/utils.py @@ -1,11 +1,11 @@ import json +import logging import pickle as pkl import pandas as pd import torch import torch.utils.data -from torch.optim.lr_scheduler import OneCycleLR, CosineAnnealingLR, ConstantLR -import logging +from torch.optim.lr_scheduler import ConstantLR, CosineAnnealingLR, OneCycleLR # https://github.com/ahlinist/cmssw/blob/1df62491f48ef964d198f574cdfcccfd17c70425/DataFormats/ParticleFlowReco/interface/PFBlockElement.h#L33 # https://github.com/cms-sw/cmssw/blob/master/DataFormats/ParticleFlowCandidate/src/PFCandidate.cc#L254 @@ -162,7 +162,9 @@ def unpack_target(y, model): # note ~ momentum = ["pt", "eta", "sin_phi", "cos_phi", "energy"] ret["momentum"] = y[..., 2:7].to(dtype=torch.float32) - ret["p4"] = torch.cat([ret["pt"].unsqueeze(-1), ret["eta"].unsqueeze(-1), ret["phi"].unsqueeze(-1), ret["energy"].unsqueeze(-1)], axis=-1) + ret["p4"] = torch.cat( + [ret["pt"].unsqueeze(-1), ret["eta"].unsqueeze(-1), ret["phi"].unsqueeze(-1), ret["energy"].unsqueeze(-1)], axis=-1 + ) ret["ispu"] = y[..., -1] @@ -280,7 +282,11 @@ def load_lr_schedule(lr_schedule, checkpoint): lr_schedule.load_state_dict(checkpoint["extra_state"]["lr_schedule_state_dict"]) return lr_schedule else: - raise KeyError("Couldn't find LR schedule state dict in checkpoint. extra_state contains: {}".format(checkpoint["extra_state"].keys())) + raise KeyError( + "Couldn't find LR schedule state dict in checkpoint. 
extra_state contains: {}".format( + checkpoint["extra_state"].keys() + ) + ) def get_lr_schedule(config, opt, epochs=None, steps_per_epoch=None, last_epoch=-1): @@ -298,7 +304,9 @@ def get_lr_schedule(config, opt, epochs=None, steps_per_epoch=None, last_epoch=- pct_start=config["lr_schedule_config"]["onecycle"]["pct_start"] or 0.3, ) elif config["lr_schedule"] == "cosinedecay": - lr_schedule = CosineAnnealingLR(opt, T_max=steps_per_epoch * epochs, last_epoch=last_batch, eta_min=config["lr"] * 0.1) + lr_schedule = CosineAnnealingLR( + opt, T_max=steps_per_epoch * epochs, last_epoch=last_batch, eta_min=config["lr"] * 0.1 + ) else: raise ValueError("Supported values for lr_schedule are 'constant', 'onecycle' and 'cosinedecay'.") return lr_schedule @@ -328,3 +336,35 @@ def count_parameters(model): ) trainable_params += params return trainable_params, nontrainable_params, table + + +def get_input_standardization(dataset, train_loader, nsubset=10_000): + + standardization_dict = {} + + for ielem in ELEM_TYPES_NONZERO[dataset]: + standardization_dict["PFelement" + str(ielem)] = {} + + tot_events = 0 + for i, batch in enumerate(train_loader): + + tot_events += batch.X.shape[0] + + # remove the first dimension because we will stack all PFelements anyway to compute the mean/std + batch.X = batch.X.view(-1, batch.X.shape[-1]) + + msk = (batch.X[:, 0] == ielem) & (batch.X[:, 0] != 0) # skip 0 padded elements + + if i == 0: + # initialize + concatenated_pfelements = batch.X[msk] + else: + concatenated_pfelements = torch.cat([concatenated_pfelements, batch.X[msk]]) + + standardization_dict["PFelement" + str(ielem)]["mean"] = torch.mean(concatenated_pfelements, axis=0).tolist() + standardization_dict["PFelement" + str(ielem)]["std"] = torch.std(concatenated_pfelements, axis=0).tolist() + + if tot_events > nsubset: + break + + return standardization_dict From c0403998fd029df4adaab0ec3ff905f10e68ea73 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 15:14:55 +0200 Subject: [PATCH 34/66] fix input dim for other datasets --- mlpf/pyg/training.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index 20d1d2c1d..8ae2fa222 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -1001,10 +1001,14 @@ def run(rank, world_size, config, args, outdir, logfile): model, optimizer = load_checkpoint(checkpoint, model, optimizer) else: # instantiate a new model in the outdir created - - input_dim = ( - len(X_FEATURES[config["dataset"]]) if config["test_dataset"]["clic_edm_ttbar_pf"]["version"] != "2.2.0" else 26 - ) + + input_dim = len(X_FEATURES[config["dataset"]]) + if config["dataset"] == "clic": + # extract the version of the dataset + for sample in config["test_dataset"]: + if config["test_dataset"][sample]["version"] == "2.2.0": + input_dim = 26 + break model_kwargs = { "input_dim": input_dim, @@ -1249,9 +1253,13 @@ def train_ray_trial(config, args, outdir=None): world_rank = ray.train.get_context().get_world_rank() world_size = ray.train.get_context().get_world_size() - input_dim = ( - len(X_FEATURES[config["dataset"]]) if config["test_dataset"]["clic_edm_ttbar_pf"]["version"] != "2.2.0" else 26 - ) + input_dim = len(X_FEATURES[config["dataset"]]) + if config["dataset"] == "clic": + # extract the version of the dataset + for sample in config["test_dataset"]: + if config["test_dataset"][sample]["version"] == "2.2.0": + input_dim = 26 + break model_kwargs = { "input_dim": input_dim, From 
bcfb2771921e86b9dbcc078f6ce3dbdfa0e32a5f Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 15:15:29 +0200 Subject: [PATCH 35/66] pca --- mlpf/pyg/mlpf.py | 8 +- mlpf/pyg/training.py | 74 ++++++------------- mlpf/pyg/utils.py | 14 +--- parameters/pytorch/pyg-clic-ttbar-21-std.yaml | 2 +- parameters/pytorch/pyg-clic-ttbar-21.yaml | 2 +- parameters/pytorch/pyg-clic-ttbar-22-std.yaml | 2 +- parameters/pytorch/pyg-clic-ttbar-22.yaml | 2 +- scripts/clic/postprocessing.py | 65 ++++------------ 8 files changed, 47 insertions(+), 122 deletions(-) diff --git a/mlpf/pyg/mlpf.py b/mlpf/pyg/mlpf.py index 7f7ecb922..33d431c09 100644 --- a/mlpf/pyg/mlpf.py +++ b/mlpf/pyg/mlpf.py @@ -122,9 +122,7 @@ def __init__( self.mha = torch.nn.MultiheadAttention(embedding_dim, num_heads, dropout=dropout_mha, batch_first=True) self.norm0 = torch.nn.LayerNorm(embedding_dim) self.norm1 = torch.nn.LayerNorm(embedding_dim) - self.seq = torch.nn.Sequential( - nn.Linear(embedding_dim, width), self.act(), nn.Linear(width, embedding_dim), self.act() - ) + self.seq = torch.nn.Sequential(nn.Linear(embedding_dim, width), self.act(), nn.Linear(width, embedding_dim), self.act()) self.dropout = torch.nn.Dropout(dropout_ff) _logger.info("using attention_type={}".format(attention_type)) # params for torch sdp_kernel @@ -465,9 +463,7 @@ def forward(self, X_features, mask, standardization_dict=None): e_real[~mask] = 0 e_real[torch.isinf(e_real)] = 0 e_real[torch.isnan(e_real)] = 0 - preds_energy = e_real + torch.nn.functional.relu( - self.nn_energy(X_features, final_embedding_reg, X_features[..., 5:6]) - ) + preds_energy = e_real + torch.nn.functional.relu(self.nn_energy(X_features, final_embedding_reg, X_features[..., 5:6])) preds_momentum = torch.cat([preds_pt, preds_eta, preds_sin_phi, preds_cos_phi, preds_energy], axis=-1) return preds_binary_particle, preds_pid, preds_momentum diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index 8ae2fa222..91dd4f3c0 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -97,9 +97,7 @@ def mlpf_loss(y, ypred, batch): # binary loss for particle / no-particle classification # loss_binary_classification = loss_obj_id(ypred["cls_binary"], (y["cls_id"] != 0).long()).reshape(y["cls_id"].shape) - loss_binary_classification = 10 * torch.nn.functional.cross_entropy( - ypred["cls_binary"], (y["cls_id"] != 0).long(), reduction="none" - ) + loss_binary_classification = 10 * torch.nn.functional.cross_entropy(ypred["cls_binary"], (y["cls_id"] != 0).long(), reduction="none") # compare the particle type, only for cases where there was a true particle loss_pid_classification = loss_obj_id(ypred["cls_id_onehot"], y["cls_id"]).reshape(y["cls_id"].shape) @@ -147,12 +145,12 @@ def mlpf_loss(y, ypred, batch): pred_met = torch.sqrt(torch.sum(pred_px, axis=-2) ** 2 + torch.sum(pred_py, axis=-2) ** 2) loss["MET"] = torch.nn.functional.huber_loss(pred_met.squeeze(dim=-1), batch.genmet).mean() - was_input_pred = torch.concat( - [torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1 - ) * batch.mask.unsqueeze(axis=-1) - was_input_true = torch.concat( - [torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long)), y["momentum"]], axis=-1 - ) * batch.mask.unsqueeze(axis=-1) + was_input_pred = torch.concat([torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1) * batch.mask.unsqueeze( + axis=-1 + ) + was_input_true = torch.concat([torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long)), y["momentum"]], axis=-1) * 
batch.mask.unsqueeze( + axis=-1 + ) # standardize Wasserstein loss std = was_input_true[batch.mask].std(axis=0) @@ -194,9 +192,7 @@ class FocalLoss(nn.Module): - y: (batch_size,) or (batch_size, d1, d2, ..., dK), K > 0. """ - def __init__( - self, alpha: Optional[Tensor] = None, gamma: float = 0.0, reduction: str = "mean", ignore_index: int = -100 - ): + def __init__(self, alpha: Optional[Tensor] = None, gamma: float = 0.0, reduction: str = "mean", ignore_index: int = -100): """Constructor. Args: alpha (Tensor, optional): Weights for each class. Defaults to None. @@ -386,30 +382,18 @@ def validation_plots(batch, ypred_raw, ygen, ypred, tensorboard_writer, epoch, o ratio = (ypred_raw[2][batch.mask][:, 1] / batch.ygen[batch.mask][:, 3])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("eta_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram( - "sphi_target", torch.clamp(batch.ygen[batch.mask][:, 4], -10, 10), global_step=epoch - ) - tensorboard_writer.add_histogram( - "sphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 2], -10, 10), global_step=epoch - ) + tensorboard_writer.add_histogram("sphi_target", torch.clamp(batch.ygen[batch.mask][:, 4], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram("sphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 2], -10, 10), global_step=epoch) ratio = (ypred_raw[2][batch.mask][:, 2] / batch.ygen[batch.mask][:, 4])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("sphi_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram( - "cphi_target", torch.clamp(batch.ygen[batch.mask][:, 5], -10, 10), global_step=epoch - ) - tensorboard_writer.add_histogram( - "cphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 3], -10, 10), global_step=epoch - ) + tensorboard_writer.add_histogram("cphi_target", torch.clamp(batch.ygen[batch.mask][:, 5], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram("cphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 3], -10, 10), global_step=epoch) ratio = (ypred_raw[2][batch.mask][:, 3] / batch.ygen[batch.mask][:, 5])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("cphi_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram( - "energy_target", torch.clamp(batch.ygen[batch.mask][:, 6], -10, 10), global_step=epoch - ) - tensorboard_writer.add_histogram( - "energy_pred", torch.clamp(ypred_raw[2][batch.mask][:, 4], -10, 10), global_step=epoch - ) + tensorboard_writer.add_histogram("energy_target", torch.clamp(batch.ygen[batch.mask][:, 6], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram("energy_pred", torch.clamp(ypred_raw[2][batch.mask][:, 4], -10, 10), global_step=epoch) ratio = (ypred_raw[2][batch.mask][:, 4] / batch.ygen[batch.mask][:, 6])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("energy_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) @@ -473,9 +457,7 @@ def train_and_valid( if (world_size > 1) and (rank != 0): iterator = enumerate(data_loader) else: - iterator = tqdm.tqdm( - enumerate(data_loader), total=len(data_loader), desc=f"Epoch {epoch} {train_or_valid} loop on rank={rank}" - ) + iterator = tqdm.tqdm(enumerate(data_loader), total=len(data_loader), desc=f"Epoch {epoch} {train_or_valid} loop on rank={rank}") device_type = "cuda" if isinstance(rank, int) else "cpu" @@ -733,9 +715,7 @@ def train_mlpf( # training step, edit here to profile a specific epoch if epoch == -1: - with profile( - 
activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, with_stack=True - ) as prof: + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, with_stack=True) as prof: with record_function("model_train"): losses_t = train_and_valid( rank, @@ -1001,7 +981,7 @@ def run(rank, world_size, config, args, outdir, logfile): model, optimizer = load_checkpoint(checkpoint, model, optimizer) else: # instantiate a new model in the outdir created - + input_dim = len(X_FEATURES[config["dataset"]]) if config["dataset"] == "clic": # extract the version of the dataset @@ -1049,9 +1029,7 @@ def run(rank, world_size, config, args, outdir, logfile): _logger.info(f"Model directory {outdir}", color="bold") if args.comet: - comet_experiment = create_comet_experiment( - config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir - ) + comet_experiment = create_comet_experiment(config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir) comet_experiment.set_name(f"rank_{rank}_{Path(outdir).name}") comet_experiment.log_parameter("run_id", Path(outdir).name) comet_experiment.log_parameter("world_size", world_size) @@ -1304,9 +1282,7 @@ def train_ray_trial(config, args, outdir=None): loaders = get_interleaved_dataloaders(world_size, rank, config, use_cuda, use_ray=True) if args.comet: - comet_experiment = create_comet_experiment( - config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir - ) + comet_experiment = create_comet_experiment(config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir) comet_experiment.set_name(f"world_rank_{world_rank}_{Path(outdir).name}") comet_experiment.log_parameter("run_id", Path(outdir).name) comet_experiment.log_parameter("world_size", world_size) @@ -1340,9 +1316,7 @@ def train_ray_trial(config, args, outdir=None): if args.resume_training: model, optimizer = load_checkpoint(checkpoint, model, optimizer) start_epoch = checkpoint["extra_state"]["epoch"] + 1 - lr_schedule = get_lr_schedule( - config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch=start_epoch - 1 - ) + lr_schedule = get_lr_schedule(config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch=start_epoch - 1) else: # start a new training with model weights loaded from a pre-trained model model = load_checkpoint(checkpoint, model) @@ -1506,9 +1480,7 @@ def run_hpo(config, args): if tune.Tuner.can_restore(str(expdir)): # resume unfinished HPO run - tuner = tune.Tuner.restore( - str(expdir), trainable=trainer, resume_errored=True, restart_errored=False, resume_unfinished=True - ) + tuner = tune.Tuner.restore(str(expdir), trainable=trainer, resume_errored=True, restart_errored=False, resume_unfinished=True) else: # start new HPO run search_space = {"train_loop_config": search_space} # the ray TorchTrainer only takes a single arg: train_loop_config @@ -1549,6 +1521,4 @@ def run_hpo(config, args): print(result_df.columns) logging.info("Total time of Tuner.fit(): {}".format(end - start)) - logging.info( - "Best hyperparameters found according to {} were: {}".format(config["raytune"]["default_metric"], best_config) - ) + logging.info("Best hyperparameters found according to {} were: {}".format(config["raytune"]["default_metric"], best_config)) diff --git a/mlpf/pyg/utils.py b/mlpf/pyg/utils.py index d55e7ec30..3ab3509d7 100644 --- a/mlpf/pyg/utils.py +++ b/mlpf/pyg/utils.py @@ -162,9 +162,7 @@ def unpack_target(y, model): # note ~ momentum = ["pt", "eta", "sin_phi", "cos_phi", "energy"] 
ret["momentum"] = y[..., 2:7].to(dtype=torch.float32) - ret["p4"] = torch.cat( - [ret["pt"].unsqueeze(-1), ret["eta"].unsqueeze(-1), ret["phi"].unsqueeze(-1), ret["energy"].unsqueeze(-1)], axis=-1 - ) + ret["p4"] = torch.cat([ret["pt"].unsqueeze(-1), ret["eta"].unsqueeze(-1), ret["phi"].unsqueeze(-1), ret["energy"].unsqueeze(-1)], axis=-1) ret["ispu"] = y[..., -1] @@ -282,11 +280,7 @@ def load_lr_schedule(lr_schedule, checkpoint): lr_schedule.load_state_dict(checkpoint["extra_state"]["lr_schedule_state_dict"]) return lr_schedule else: - raise KeyError( - "Couldn't find LR schedule state dict in checkpoint. extra_state contains: {}".format( - checkpoint["extra_state"].keys() - ) - ) + raise KeyError("Couldn't find LR schedule state dict in checkpoint. extra_state contains: {}".format(checkpoint["extra_state"].keys())) def get_lr_schedule(config, opt, epochs=None, steps_per_epoch=None, last_epoch=-1): @@ -304,9 +298,7 @@ def get_lr_schedule(config, opt, epochs=None, steps_per_epoch=None, last_epoch=- pct_start=config["lr_schedule_config"]["onecycle"]["pct_start"] or 0.3, ) elif config["lr_schedule"] == "cosinedecay": - lr_schedule = CosineAnnealingLR( - opt, T_max=steps_per_epoch * epochs, last_epoch=last_batch, eta_min=config["lr"] * 0.1 - ) + lr_schedule = CosineAnnealingLR(opt, T_max=steps_per_epoch * epochs, last_epoch=last_batch, eta_min=config["lr"] * 0.1) else: raise ValueError("Supported values for lr_schedule are 'constant', 'onecycle' and 'cosinedecay'.") return lr_schedule diff --git a/parameters/pytorch/pyg-clic-ttbar-21-std.yaml b/parameters/pytorch/pyg-clic-ttbar-21-std.yaml index 17c3006c9..ae6b5a01e 100644 --- a/parameters/pytorch/pyg-clic-ttbar-21-std.yaml +++ b/parameters/pytorch/pyg-clic-ttbar-21-std.yaml @@ -120,4 +120,4 @@ valid_dataset: test_dataset: clic_edm_ttbar_pf: - version: 2.1.0 \ No newline at end of file + version: 2.1.0 diff --git a/parameters/pytorch/pyg-clic-ttbar-21.yaml b/parameters/pytorch/pyg-clic-ttbar-21.yaml index cd3126713..6aea54096 100644 --- a/parameters/pytorch/pyg-clic-ttbar-21.yaml +++ b/parameters/pytorch/pyg-clic-ttbar-21.yaml @@ -120,4 +120,4 @@ valid_dataset: test_dataset: clic_edm_ttbar_pf: - version: 2.1.0 \ No newline at end of file + version: 2.1.0 diff --git a/parameters/pytorch/pyg-clic-ttbar-22-std.yaml b/parameters/pytorch/pyg-clic-ttbar-22-std.yaml index fa3d8173c..39e3e8247 100644 --- a/parameters/pytorch/pyg-clic-ttbar-22-std.yaml +++ b/parameters/pytorch/pyg-clic-ttbar-22-std.yaml @@ -120,4 +120,4 @@ valid_dataset: test_dataset: clic_edm_ttbar_pf: - version: 2.2.0 \ No newline at end of file + version: 2.2.0 diff --git a/parameters/pytorch/pyg-clic-ttbar-22.yaml b/parameters/pytorch/pyg-clic-ttbar-22.yaml index 438052100..1512a6b3e 100644 --- a/parameters/pytorch/pyg-clic-ttbar-22.yaml +++ b/parameters/pytorch/pyg-clic-ttbar-22.yaml @@ -120,4 +120,4 @@ valid_dataset: test_dataset: clic_edm_ttbar_pf: - version: 2.2.0 \ No newline at end of file + version: 2.2.0 diff --git a/scripts/clic/postprocessing.py b/scripts/clic/postprocessing.py index 685d6191d..e05b2ae9c 100644 --- a/scripts/clic/postprocessing.py +++ b/scripts/clic/postprocessing.py @@ -149,9 +149,7 @@ def __init__( self.cluster_features = cluster_features # feature matrix of the calo clusters self.track_features = track_features # feature matrix of the tracks self.genparticle_to_hit = genparticle_to_hit # sparse COO matrix of genparticles to hits (idx_gp, idx_hit, weight) - self.genparticle_to_track = ( - genparticle_to_track # sparse COO matrix of genparticles to 
tracks (idx_gp, idx_track, weight) - ) + self.genparticle_to_track = genparticle_to_track # sparse COO matrix of genparticles to tracks (idx_gp, idx_track, weight) self.hit_to_cluster = hit_to_cluster # sparse COO matrix of hits to clusters (idx_hit, idx_cluster, weight) self.gp_merges = gp_merges # sparse COO matrix of any merged genparticles @@ -217,10 +215,7 @@ def get_calohit_matrix_and_genadj(hit_data, calohit_links, iev, collectionIDs): hit_idx_global += 1 hit_idx_local_to_global = {v: k for k, v in hit_idx_global_to_local.items()} hit_feature_matrix = awkward.Record( - { - k: awkward.concatenate([hit_feature_matrix[i][k] for i in range(len(hit_feature_matrix))]) - for k in hit_feature_matrix[0].fields - } + {k: awkward.concatenate([hit_feature_matrix[i][k] for i in range(len(hit_feature_matrix))]) for k in hit_feature_matrix[0].fields} ) # add all edges from genparticle to calohit @@ -286,9 +281,7 @@ def gen_to_features(prop_data, iev): gen_arr = {k.replace(mc_coll + ".", ""): gen_arr[k] for k in gen_arr.fields} MCParticles_p4 = vector.awk( - awkward.zip( - {"mass": gen_arr["mass"], "x": gen_arr["momentum.x"], "y": gen_arr["momentum.y"], "z": gen_arr["momentum.z"]} - ) + awkward.zip({"mass": gen_arr["mass"], "x": gen_arr["momentum.x"], "y": gen_arr["momentum.y"], "z": gen_arr["momentum.z"]}) ) gen_arr["pt"] = MCParticles_p4.pt gen_arr["eta"] = MCParticles_p4.eta @@ -407,12 +400,8 @@ def cluster_to_features(prop_data, hit_features, hit_to_cluster, iev): # get width at shower max msk = np.array(hits_posz) == zmax # select the hits at zmax - x_bar = np.sum(np.array(hits_posx)[msk] * np.array(hits_energy)[msk]) / np.sum( - np.array(hits_energy)[msk] - ) # energy weighted average - y_bar = np.sum(np.array(hits_posy)[msk] * np.array(hits_energy)[msk]) / np.sum( - np.array(hits_energy)[msk] - ) # energy weighted average + x_bar = np.sum(np.array(hits_posx)[msk] * np.array(hits_energy)[msk]) / np.sum(np.array(hits_energy)[msk]) # energy weighted average + y_bar = np.sum(np.array(hits_posy)[msk] * np.array(hits_energy)[msk]) / np.sum(np.array(hits_energy)[msk]) # energy weighted average num = (np.sum(np.array(hits_energy)[msk] * (np.array(hits_posx)[msk] - x_bar) ** 2)) + ( np.sum(np.array(hits_energy)[msk] * (np.array(hits_posy)[msk] - y_bar) ** 2) @@ -506,9 +495,7 @@ def filter_adj(adj, all_to_filtered): def get_genparticles_and_adjacencies(prop_data, hit_data, calohit_links, sitrack_links, iev, collectionIDs): gen_features = gen_to_features(prop_data, iev) - hit_features, genparticle_to_hit, hit_idx_local_to_global = get_calohit_matrix_and_genadj( - hit_data, calohit_links, iev, collectionIDs - ) + hit_features, genparticle_to_hit, hit_idx_local_to_global = get_calohit_matrix_and_genadj(hit_data, calohit_links, iev, collectionIDs) hit_to_cluster = hit_cluster_adj(prop_data, hit_idx_local_to_global, iev) cluster_features = cluster_to_features(prop_data, hit_features, hit_to_cluster, iev) track_features = track_to_features(prop_data, iev) @@ -521,9 +508,7 @@ def get_genparticles_and_adjacencies(prop_data, hit_data, calohit_links, sitrack if len(genparticle_to_track[0]) > 0: gp_to_track = ( - coo_matrix((genparticle_to_track[2], (genparticle_to_track[0], genparticle_to_track[1])), shape=(n_gp, n_track)) - .max(axis=1) - .todense() + coo_matrix((genparticle_to_track[2], (genparticle_to_track[0], genparticle_to_track[1])), shape=(n_gp, n_track)).max(axis=1).todense() ) else: gp_to_track = np.zeros((n_gp, 1)) @@ -576,12 +561,8 @@ def assign_genparticles_to_obj_and_merge(gpdata): ).todense() ) - 
gp_to_calohit = coo_matrix( - (gpdata.genparticle_to_hit[2], (gpdata.genparticle_to_hit[0], gpdata.genparticle_to_hit[1])), shape=(n_gp, n_hit) - ) - calohit_to_cluster = coo_matrix( - (gpdata.hit_to_cluster[2], (gpdata.hit_to_cluster[0], gpdata.hit_to_cluster[1])), shape=(n_hit, n_cluster) - ) + gp_to_calohit = coo_matrix((gpdata.genparticle_to_hit[2], (gpdata.genparticle_to_hit[0], gpdata.genparticle_to_hit[1])), shape=(n_gp, n_hit)) + calohit_to_cluster = coo_matrix((gpdata.hit_to_cluster[2], (gpdata.hit_to_cluster[0], gpdata.hit_to_cluster[1])), shape=(n_hit, n_cluster)) gp_to_cluster = np.array((gp_to_calohit * calohit_to_cluster).todense()) @@ -746,9 +727,7 @@ def get_reco_properties(prop_data, iev): reco_arr = {k.replace("MergedRecoParticles.", ""): reco_arr[k] for k in reco_arr.fields} reco_p4 = vector.awk( - awkward.zip( - {"mass": reco_arr["mass"], "x": reco_arr["momentum.x"], "y": reco_arr["momentum.y"], "z": reco_arr["momentum.z"]} - ) + awkward.zip({"mass": reco_arr["mass"], "x": reco_arr["momentum.x"], "y": reco_arr["momentum.y"], "z": reco_arr["momentum.z"]}) ) reco_arr["pt"] = reco_p4.pt reco_arr["eta"] = reco_p4.eta @@ -970,29 +949,19 @@ def process_one_file(fn, ofn): assert np.all(used_rps == 1) gps_track = get_particle_feature_matrix(track_to_gp_all, gpdata_cleaned.gen_features, particle_feature_order) - gps_track[:, 0] = np.array( - [map_neutral_to_charged(map_pdgid_to_candid(p, c)) for p, c in zip(gps_track[:, 0], gps_track[:, 1])] - ) + gps_track[:, 0] = np.array([map_neutral_to_charged(map_pdgid_to_candid(p, c)) for p, c in zip(gps_track[:, 0], gps_track[:, 1])]) gps_cluster = get_particle_feature_matrix(cluster_to_gp_all, gpdata_cleaned.gen_features, particle_feature_order) - gps_cluster[:, 0] = np.array( - [map_charged_to_neutral(map_pdgid_to_candid(p, c)) for p, c in zip(gps_cluster[:, 0], gps_cluster[:, 1])] - ) + gps_cluster[:, 0] = np.array([map_charged_to_neutral(map_pdgid_to_candid(p, c)) for p, c in zip(gps_cluster[:, 0], gps_cluster[:, 1])]) gps_cluster[:, 1] = 0 rps_track = get_particle_feature_matrix(track_to_rp_all, reco_features, particle_feature_order) - rps_track[:, 0] = np.array( - [map_neutral_to_charged(map_pdgid_to_candid(p, c)) for p, c in zip(rps_track[:, 0], rps_track[:, 1])] - ) + rps_track[:, 0] = np.array([map_neutral_to_charged(map_pdgid_to_candid(p, c)) for p, c in zip(rps_track[:, 0], rps_track[:, 1])]) rps_cluster = get_particle_feature_matrix(cluster_to_rp_all, reco_features, particle_feature_order) - rps_cluster[:, 0] = np.array( - [map_charged_to_neutral(map_pdgid_to_candid(p, c)) for p, c in zip(rps_cluster[:, 0], rps_cluster[:, 1])] - ) + rps_cluster[:, 0] = np.array([map_charged_to_neutral(map_pdgid_to_candid(p, c)) for p, c in zip(rps_cluster[:, 0], rps_cluster[:, 1])]) rps_cluster[:, 1] = 0 # all initial gen/reco particle energy must be reconstructable - assert ( - abs(np.sum(gps_track[:, 6]) + np.sum(gps_cluster[:, 6]) - np.sum(gpdata_cleaned.gen_features["energy"])) < 1e-2 - ) + assert abs(np.sum(gps_track[:, 6]) + np.sum(gps_cluster[:, 6]) - np.sum(gpdata_cleaned.gen_features["energy"])) < 1e-2 assert abs(np.sum(rps_track[:, 6]) + np.sum(rps_cluster[:, 6]) - np.sum(reco_features["energy"])) < 1e-2 @@ -1037,9 +1006,7 @@ def parse_args(): import argparse parser = argparse.ArgumentParser() - parser.add_argument( - "--input", type=str, help="Input ROOT file - else if dir then will process all files inside", required=True - ) + parser.add_argument("--input", type=str, help="Input ROOT file - else if dir then will process all files 
inside", required=True) parser.add_argument("--outpath", type=str, default="raw", help="output path") args = parser.parse_args() return args From b2e7c2e9222efefaafc780b10d7004826fa9609e Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 15:29:26 +0200 Subject: [PATCH 36/66] add standardize_inputs: False to all configs --- parameters/pytorch/pyg-cld.yaml | 1 + parameters/pytorch/pyg-clic-hits.yaml | 1 + parameters/pytorch/pyg-clic.yaml | 1 + parameters/pytorch/pyg-cms-finetune.yaml | 1 + parameters/pytorch/pyg-cms-ttbar-nopu.yaml | 1 + parameters/pytorch/pyg-cms.yaml | 1 + 6 files changed, 6 insertions(+) diff --git a/parameters/pytorch/pyg-cld.yaml b/parameters/pytorch/pyg-cld.yaml index 204689385..f3ae2e957 100644 --- a/parameters/pytorch/pyg-cld.yaml +++ b/parameters/pytorch/pyg-cld.yaml @@ -1,5 +1,6 @@ backend: pytorch +standardize_inputs: False dataset: cld sort_data: no data_dir: diff --git a/parameters/pytorch/pyg-clic-hits.yaml b/parameters/pytorch/pyg-clic-hits.yaml index 62b470931..4a8b5e3b1 100644 --- a/parameters/pytorch/pyg-clic-hits.yaml +++ b/parameters/pytorch/pyg-clic-hits.yaml @@ -1,5 +1,6 @@ backend: pytorch +standardize_inputs: False dataset: clic data_dir: gpus: 1 diff --git a/parameters/pytorch/pyg-clic.yaml b/parameters/pytorch/pyg-clic.yaml index a51540683..e4dcdde1e 100644 --- a/parameters/pytorch/pyg-clic.yaml +++ b/parameters/pytorch/pyg-clic.yaml @@ -1,5 +1,6 @@ backend: pytorch +standardize_inputs: False save_attention: yes dataset: clic sort_data: no diff --git a/parameters/pytorch/pyg-cms-finetune.yaml b/parameters/pytorch/pyg-cms-finetune.yaml index b70d3df4a..2c362ea39 100644 --- a/parameters/pytorch/pyg-cms-finetune.yaml +++ b/parameters/pytorch/pyg-cms-finetune.yaml @@ -1,5 +1,6 @@ backend: pytorch +standardize_inputs: False dataset: cms sort_data: yes data_dir: diff --git a/parameters/pytorch/pyg-cms-ttbar-nopu.yaml b/parameters/pytorch/pyg-cms-ttbar-nopu.yaml index cfacab525..2e1ac6e94 100644 --- a/parameters/pytorch/pyg-cms-ttbar-nopu.yaml +++ b/parameters/pytorch/pyg-cms-ttbar-nopu.yaml @@ -1,5 +1,6 @@ backend: pytorch +standardize_inputs: False dataset: cms sort_data: yes data_dir: diff --git a/parameters/pytorch/pyg-cms.yaml b/parameters/pytorch/pyg-cms.yaml index 7d5f7e4a1..76770c1cb 100644 --- a/parameters/pytorch/pyg-cms.yaml +++ b/parameters/pytorch/pyg-cms.yaml @@ -1,5 +1,6 @@ backend: pytorch +standardize_inputs: False save_attention: no dataset: cms sort_data: yes From 0b901589dd57813858e201e7fc1e2cdabc6fa1f7 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 15:45:29 +0200 Subject: [PATCH 37/66] up --- parameters/pytorch/pyg-clic-std.yaml | 129 +++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 parameters/pytorch/pyg-clic-std.yaml diff --git a/parameters/pytorch/pyg-clic-std.yaml b/parameters/pytorch/pyg-clic-std.yaml new file mode 100644 index 000000000..287069d31 --- /dev/null +++ b/parameters/pytorch/pyg-clic-std.yaml @@ -0,0 +1,129 @@ +backend: pytorch + +standardize_inputs: True +save_attention: yes +dataset: clic +sort_data: no +data_dir: +gpus: 1 +gpu_batch_multiplier: 1 +load: +num_epochs: 100 +patience: 20 +lr: 0.0001 +lr_schedule: cosinedecay # constant, cosinedecay, onecycle +conv_type: attention # gnn_lsh, attention, mamba, flashattention +ntrain: +ntest: +nvalid: +num_workers: 0 +prefetch_factor: +checkpoint_freq: +comet_name: particleflow-pt +comet_offline: False +comet_step_freq: 100 +dtype: float32 +val_freq: # run an extra validation run every val_freq training steps + 
+model: + trainable: all + learned_representation_mode: last #last, concat + input_encoding: split #split, joint + pt_mode: direct-elemtype-split + eta_mode: linear + sin_phi_mode: linear + cos_phi_mode: linear + energy_mode: direct-elemtype-split + + gnn_lsh: + conv_type: gnn_lsh + embedding_dim: 512 + width: 512 + num_convs: 8 + activation: "elu" + # gnn-lsh specific parameters + bin_size: 32 + max_num_bins: 200 + distance_dim: 128 + layernorm: True + num_node_messages: 2 + ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 2 + + attention: + conv_type: attention + num_convs: 4 + dropout_ff: 0.0 + dropout_conv_id_mha: 0.0 + dropout_conv_id_ff: 0.0 + dropout_conv_reg_mha: 0.0 + dropout_conv_reg_ff: 0.0 + activation: "gelu" + head_dim: 32 + num_heads: 32 + attention_type: math + use_pre_layernorm: True + + mamba: + conv_type: mamba + embedding_dim: 128 + width: 128 + num_convs: 2 + dropout: 0.0 + activation: "elu" + # transformer specific paramters + num_heads: 2 + # mamba specific paramters + d_state: 16 + d_conv: 4 + expand: 2 + +lr_schedule_config: + onecycle: + pct_start: 0.3 + +raytune: + local_dir: # Note: please specify an absolute path + sched: # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + clic_edm_qq_pf: + version: 2.1.0 + +valid_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + clic_edm_qq_pf: + version: 2.1.0 + +test_dataset: + clic_edm_ttbar_pf: + version: 2.1.0 + clic_edm_qq_pf: + version: 2.1.0 From a253a4a6129bc91222e31120ffedacb3c75949c1 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 15:49:38 +0200 Subject: [PATCH 38/66] debug --- parameters/pytorch/pyg-clic-std.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/parameters/pytorch/pyg-clic-std.yaml b/parameters/pytorch/pyg-clic-std.yaml index 287069d31..f59d258f2 100644 --- a/parameters/pytorch/pyg-clic-std.yaml +++ b/parameters/pytorch/pyg-clic-std.yaml @@ -109,8 +109,6 @@ train_dataset: samples: clic_edm_ttbar_pf: version: 2.1.0 - clic_edm_qq_pf: - version: 2.1.0 valid_dataset: clic: From bb92cb52946dc2408b983c0bab700e898a2d9223 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 15:50:54 +0200 Subject: [PATCH 39/66] up --- parameters/pytorch/pyg-clic-std.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parameters/pytorch/pyg-clic-std.yaml b/parameters/pytorch/pyg-clic-std.yaml index f59d258f2..2f08386cf 100644 --- a/parameters/pytorch/pyg-clic-std.yaml +++ b/parameters/pytorch/pyg-clic-std.yaml @@ -107,7 +107,7 @@ train_dataset: physical: batch_size: 1 samples: - clic_edm_ttbar_pf: + clic_edm_qq_pf: version: 2.1.0 valid_dataset: From 8e945594d024a919b3f94f7ca9cd0e77c9903e1b Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 15:52:19 +0200 Subject: [PATCH 40/66] revert --- parameters/pytorch/pyg-clic-std.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/parameters/pytorch/pyg-clic-std.yaml b/parameters/pytorch/pyg-clic-std.yaml index 2f08386cf..287069d31 100644 --- a/parameters/pytorch/pyg-clic-std.yaml +++ b/parameters/pytorch/pyg-clic-std.yaml @@ -107,6 +107,8 @@ train_dataset: physical: batch_size: 1 
samples: + clic_edm_ttbar_pf: + version: 2.1.0 clic_edm_qq_pf: version: 2.1.0 From a8522567ee8968d055dfa2bcc26cf91ec4c59190 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:01:19 +0200 Subject: [PATCH 41/66] debug --- mlpf/pyg/utils.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/mlpf/pyg/utils.py b/mlpf/pyg/utils.py index 3ab3509d7..16d31896e 100644 --- a/mlpf/pyg/utils.py +++ b/mlpf/pyg/utils.py @@ -162,7 +162,9 @@ def unpack_target(y, model): # note ~ momentum = ["pt", "eta", "sin_phi", "cos_phi", "energy"] ret["momentum"] = y[..., 2:7].to(dtype=torch.float32) - ret["p4"] = torch.cat([ret["pt"].unsqueeze(-1), ret["eta"].unsqueeze(-1), ret["phi"].unsqueeze(-1), ret["energy"].unsqueeze(-1)], axis=-1) + ret["p4"] = torch.cat( + [ret["pt"].unsqueeze(-1), ret["eta"].unsqueeze(-1), ret["phi"].unsqueeze(-1), ret["energy"].unsqueeze(-1)], axis=-1 + ) ret["ispu"] = y[..., -1] @@ -280,7 +282,11 @@ def load_lr_schedule(lr_schedule, checkpoint): lr_schedule.load_state_dict(checkpoint["extra_state"]["lr_schedule_state_dict"]) return lr_schedule else: - raise KeyError("Couldn't find LR schedule state dict in checkpoint. extra_state contains: {}".format(checkpoint["extra_state"].keys())) + raise KeyError( + "Couldn't find LR schedule state dict in checkpoint. extra_state contains: {}".format( + checkpoint["extra_state"].keys() + ) + ) def get_lr_schedule(config, opt, epochs=None, steps_per_epoch=None, last_epoch=-1): @@ -298,7 +304,9 @@ def get_lr_schedule(config, opt, epochs=None, steps_per_epoch=None, last_epoch=- pct_start=config["lr_schedule_config"]["onecycle"]["pct_start"] or 0.3, ) elif config["lr_schedule"] == "cosinedecay": - lr_schedule = CosineAnnealingLR(opt, T_max=steps_per_epoch * epochs, last_epoch=last_batch, eta_min=config["lr"] * 0.1) + lr_schedule = CosineAnnealingLR( + opt, T_max=steps_per_epoch * epochs, last_epoch=last_batch, eta_min=config["lr"] * 0.1 + ) else: raise ValueError("Supported values for lr_schedule are 'constant', 'onecycle' and 'cosinedecay'.") return lr_schedule @@ -336,6 +344,7 @@ def get_input_standardization(dataset, train_loader, nsubset=10_000): for ielem in ELEM_TYPES_NONZERO[dataset]: standardization_dict["PFelement" + str(ielem)] = {} + print(standardization_dict.keys()) tot_events = 0 for i, batch in enumerate(train_loader): From 46415578ef54f50e83ee025630fc2b5057e11d18 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:02:13 +0200 Subject: [PATCH 42/66] up --- mlpf/pyg/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mlpf/pyg/utils.py b/mlpf/pyg/utils.py index 16d31896e..648a2d1ca 100644 --- a/mlpf/pyg/utils.py +++ b/mlpf/pyg/utils.py @@ -342,6 +342,9 @@ def get_input_standardization(dataset, train_loader, nsubset=10_000): standardization_dict = {} + print("dataset", dataset) + print("ELEM_TYPES_NONZERO[dataset]", ELEM_TYPES_NONZERO[dataset]) + for ielem in ELEM_TYPES_NONZERO[dataset]: standardization_dict["PFelement" + str(ielem)] = {} print(standardization_dict.keys()) From 4d56a63c0f0b0659613c844ef6d27d2281adec9c Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:03:29 +0200 Subject: [PATCH 43/66] oops --- mlpf/pyg/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlpf/pyg/utils.py b/mlpf/pyg/utils.py index 648a2d1ca..18e02ae57 100644 --- a/mlpf/pyg/utils.py +++ b/mlpf/pyg/utils.py @@ -365,10 +365,10 @@ def get_input_standardization(dataset, train_loader, nsubset=10_000): else: concatenated_pfelements = 
torch.cat([concatenated_pfelements, batch.X[msk]]) + if tot_events > nsubset: + break + standardization_dict["PFelement" + str(ielem)]["mean"] = torch.mean(concatenated_pfelements, axis=0).tolist() standardization_dict["PFelement" + str(ielem)]["std"] = torch.std(concatenated_pfelements, axis=0).tolist() - if tot_events > nsubset: - break - return standardization_dict From 8f190b77d0ab7533ca82ab0cde4f16eff083a9de Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:04:22 +0200 Subject: [PATCH 44/66] fixed --- mlpf/pyg/utils.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mlpf/pyg/utils.py b/mlpf/pyg/utils.py index 18e02ae57..2e7f763b6 100644 --- a/mlpf/pyg/utils.py +++ b/mlpf/pyg/utils.py @@ -342,12 +342,8 @@ def get_input_standardization(dataset, train_loader, nsubset=10_000): standardization_dict = {} - print("dataset", dataset) - print("ELEM_TYPES_NONZERO[dataset]", ELEM_TYPES_NONZERO[dataset]) - for ielem in ELEM_TYPES_NONZERO[dataset]: standardization_dict["PFelement" + str(ielem)] = {} - print(standardization_dict.keys()) tot_events = 0 for i, batch in enumerate(train_loader): From 57e2924cb429cd4406fa2bd4f6a5958306d9c400 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:12:21 +0200 Subject: [PATCH 45/66] up --- mlpf/pyg/utils.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/mlpf/pyg/utils.py b/mlpf/pyg/utils.py index 2e7f763b6..a58869439 100644 --- a/mlpf/pyg/utils.py +++ b/mlpf/pyg/utils.py @@ -162,9 +162,7 @@ def unpack_target(y, model): # note ~ momentum = ["pt", "eta", "sin_phi", "cos_phi", "energy"] ret["momentum"] = y[..., 2:7].to(dtype=torch.float32) - ret["p4"] = torch.cat( - [ret["pt"].unsqueeze(-1), ret["eta"].unsqueeze(-1), ret["phi"].unsqueeze(-1), ret["energy"].unsqueeze(-1)], axis=-1 - ) + ret["p4"] = torch.cat([ret["pt"].unsqueeze(-1), ret["eta"].unsqueeze(-1), ret["phi"].unsqueeze(-1), ret["energy"].unsqueeze(-1)], axis=-1) ret["ispu"] = y[..., -1] @@ -282,11 +280,7 @@ def load_lr_schedule(lr_schedule, checkpoint): lr_schedule.load_state_dict(checkpoint["extra_state"]["lr_schedule_state_dict"]) return lr_schedule else: - raise KeyError( - "Couldn't find LR schedule state dict in checkpoint. extra_state contains: {}".format( - checkpoint["extra_state"].keys() - ) - ) + raise KeyError("Couldn't find LR schedule state dict in checkpoint. 
extra_state contains: {}".format(checkpoint["extra_state"].keys())) def get_lr_schedule(config, opt, epochs=None, steps_per_epoch=None, last_epoch=-1): @@ -304,9 +298,7 @@ def get_lr_schedule(config, opt, epochs=None, steps_per_epoch=None, last_epoch=- pct_start=config["lr_schedule_config"]["onecycle"]["pct_start"] or 0.3, ) elif config["lr_schedule"] == "cosinedecay": - lr_schedule = CosineAnnealingLR( - opt, T_max=steps_per_epoch * epochs, last_epoch=last_batch, eta_min=config["lr"] * 0.1 - ) + lr_schedule = CosineAnnealingLR(opt, T_max=steps_per_epoch * epochs, last_epoch=last_batch, eta_min=config["lr"] * 0.1) else: raise ValueError("Supported values for lr_schedule are 'constant', 'onecycle' and 'cosinedecay'.") return lr_schedule From cad00fa63f774472e44b5390ab4fe84f1a025bc6 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:25:48 +0200 Subject: [PATCH 46/66] logging --- mlpf/pyg/training.py | 86 +++++++++++++++++++++++++++----------------- 1 file changed, 53 insertions(+), 33 deletions(-) diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index 91dd4f3c0..2d8b43d67 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -97,7 +97,9 @@ def mlpf_loss(y, ypred, batch): # binary loss for particle / no-particle classification # loss_binary_classification = loss_obj_id(ypred["cls_binary"], (y["cls_id"] != 0).long()).reshape(y["cls_id"].shape) - loss_binary_classification = 10 * torch.nn.functional.cross_entropy(ypred["cls_binary"], (y["cls_id"] != 0).long(), reduction="none") + loss_binary_classification = 10 * torch.nn.functional.cross_entropy( + ypred["cls_binary"], (y["cls_id"] != 0).long(), reduction="none" + ) # compare the particle type, only for cases where there was a true particle loss_pid_classification = loss_obj_id(ypred["cls_id_onehot"], y["cls_id"]).reshape(y["cls_id"].shape) @@ -145,12 +147,12 @@ def mlpf_loss(y, ypred, batch): pred_met = torch.sqrt(torch.sum(pred_px, axis=-2) ** 2 + torch.sum(pred_py, axis=-2) ** 2) loss["MET"] = torch.nn.functional.huber_loss(pred_met.squeeze(dim=-1), batch.genmet).mean() - was_input_pred = torch.concat([torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1) * batch.mask.unsqueeze( - axis=-1 - ) - was_input_true = torch.concat([torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long)), y["momentum"]], axis=-1) * batch.mask.unsqueeze( - axis=-1 - ) + was_input_pred = torch.concat( + [torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1 + ) * batch.mask.unsqueeze(axis=-1) + was_input_true = torch.concat( + [torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long)), y["momentum"]], axis=-1 + ) * batch.mask.unsqueeze(axis=-1) # standardize Wasserstein loss std = was_input_true[batch.mask].std(axis=0) @@ -192,7 +194,9 @@ class FocalLoss(nn.Module): - y: (batch_size,) or (batch_size, d1, d2, ..., dK), K > 0. """ - def __init__(self, alpha: Optional[Tensor] = None, gamma: float = 0.0, reduction: str = "mean", ignore_index: int = -100): + def __init__( + self, alpha: Optional[Tensor] = None, gamma: float = 0.0, reduction: str = "mean", ignore_index: int = -100 + ): """Constructor. Args: alpha (Tensor, optional): Weights for each class. Defaults to None. 
@@ -382,18 +386,30 @@ def validation_plots(batch, ypred_raw, ygen, ypred, tensorboard_writer, epoch, o ratio = (ypred_raw[2][batch.mask][:, 1] / batch.ygen[batch.mask][:, 3])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("eta_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("sphi_target", torch.clamp(batch.ygen[batch.mask][:, 4], -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("sphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 2], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram( + "sphi_target", torch.clamp(batch.ygen[batch.mask][:, 4], -10, 10), global_step=epoch + ) + tensorboard_writer.add_histogram( + "sphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 2], -10, 10), global_step=epoch + ) ratio = (ypred_raw[2][batch.mask][:, 2] / batch.ygen[batch.mask][:, 4])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("sphi_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("cphi_target", torch.clamp(batch.ygen[batch.mask][:, 5], -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("cphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 3], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram( + "cphi_target", torch.clamp(batch.ygen[batch.mask][:, 5], -10, 10), global_step=epoch + ) + tensorboard_writer.add_histogram( + "cphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 3], -10, 10), global_step=epoch + ) ratio = (ypred_raw[2][batch.mask][:, 3] / batch.ygen[batch.mask][:, 5])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("cphi_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("energy_target", torch.clamp(batch.ygen[batch.mask][:, 6], -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("energy_pred", torch.clamp(ypred_raw[2][batch.mask][:, 4], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram( + "energy_target", torch.clamp(batch.ygen[batch.mask][:, 6], -10, 10), global_step=epoch + ) + tensorboard_writer.add_histogram( + "energy_pred", torch.clamp(ypred_raw[2][batch.mask][:, 4], -10, 10), global_step=epoch + ) ratio = (ypred_raw[2][batch.mask][:, 4] / batch.ygen[batch.mask][:, 6])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("energy_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) @@ -457,7 +473,9 @@ def train_and_valid( if (world_size > 1) and (rank != 0): iterator = enumerate(data_loader) else: - iterator = tqdm.tqdm(enumerate(data_loader), total=len(data_loader), desc=f"Epoch {epoch} {train_or_valid} loop on rank={rank}") + iterator = tqdm.tqdm( + enumerate(data_loader), total=len(data_loader), desc=f"Epoch {epoch} {train_or_valid} loop on rank={rank}" + ) device_type = "cuda" if isinstance(rank, int) else "cpu" @@ -715,7 +733,9 @@ def train_mlpf( # training step, edit here to profile a specific epoch if epoch == -1: - with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, with_stack=True) as prof: + with profile( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, with_stack=True + ) as prof: with record_function("model_train"): losses_t = train_and_valid( rank, @@ -865,18 +885,6 @@ def train_mlpf( time_per_epoch = (t1 - t0_initial) / epoch eta = epochs_remaining * time_per_epoch / 60 - # _logger.info( - # f"Rank {rank}: epoch={epoch} / {num_epochs} " - # + f"train_loss={losses_t['Total']:.4f} " - # + f"valid_loss={losses_v['Total']:.4f} 
" - # + f"stale={stale_epochs} " - # + f"epoch_train_time={round((t_train-t0)/60, 2)}m " - # + f"epoch_valid_time={round((t_valid-t_train)/60, 2)}m " - # + f"epoch_total_time={round((t1-t0)/60, 2)}m " - # + f"eta={round(eta, 1)}m", - # color="bold", - # ) - _logger.info( f"Rank {rank}: epoch={epoch} / {num_epochs} " + f"stale={stale_epochs} " @@ -1029,7 +1037,9 @@ def run(rank, world_size, config, args, outdir, logfile): _logger.info(f"Model directory {outdir}", color="bold") if args.comet: - comet_experiment = create_comet_experiment(config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir) + comet_experiment = create_comet_experiment( + config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir + ) comet_experiment.set_name(f"rank_{rank}_{Path(outdir).name}") comet_experiment.log_parameter("run_id", Path(outdir).name) comet_experiment.log_parameter("world_size", world_size) @@ -1061,6 +1071,8 @@ def run(rank, world_size, config, args, outdir, logfile): lr_schedule = get_lr_schedule(config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch) if config["standardize_inputs"] is True: + if (rank == 0) or (rank == "cpu"): + _logger.info("Will standardize the input features before running the training") standardization_dict = get_input_standardization(config["dataset"], loaders["train"]) else: standardization_dict = None @@ -1282,7 +1294,9 @@ def train_ray_trial(config, args, outdir=None): loaders = get_interleaved_dataloaders(world_size, rank, config, use_cuda, use_ray=True) if args.comet: - comet_experiment = create_comet_experiment(config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir) + comet_experiment = create_comet_experiment( + config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir + ) comet_experiment.set_name(f"world_rank_{world_rank}_{Path(outdir).name}") comet_experiment.log_parameter("run_id", Path(outdir).name) comet_experiment.log_parameter("world_size", world_size) @@ -1316,7 +1330,9 @@ def train_ray_trial(config, args, outdir=None): if args.resume_training: model, optimizer = load_checkpoint(checkpoint, model, optimizer) start_epoch = checkpoint["extra_state"]["epoch"] + 1 - lr_schedule = get_lr_schedule(config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch=start_epoch - 1) + lr_schedule = get_lr_schedule( + config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch=start_epoch - 1 + ) else: # start a new training with model weights loaded from a pre-trained model model = load_checkpoint(checkpoint, model) @@ -1480,7 +1496,9 @@ def run_hpo(config, args): if tune.Tuner.can_restore(str(expdir)): # resume unfinished HPO run - tuner = tune.Tuner.restore(str(expdir), trainable=trainer, resume_errored=True, restart_errored=False, resume_unfinished=True) + tuner = tune.Tuner.restore( + str(expdir), trainable=trainer, resume_errored=True, restart_errored=False, resume_unfinished=True + ) else: # start new HPO run search_space = {"train_loop_config": search_space} # the ray TorchTrainer only takes a single arg: train_loop_config @@ -1521,4 +1539,6 @@ def run_hpo(config, args): print(result_df.columns) logging.info("Total time of Tuner.fit(): {}".format(end - start)) - logging.info("Best hyperparameters found according to {} were: {}".format(config["raytune"]["default_metric"], best_config)) + logging.info( + "Best hyperparameters found according to {} were: {}".format(config["raytune"]["default_metric"], best_config) + ) From f91da8196b7d5042ac97da4ec1215d540521edd1 Mon Sep 17 
00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:27:26 +0200 Subject: [PATCH 47/66] up --- mlpf/pyg_pipeline.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/mlpf/pyg_pipeline.py b/mlpf/pyg_pipeline.py index 4110e2dea..c592c3018 100644 --- a/mlpf/pyg_pipeline.py +++ b/mlpf/pyg_pipeline.py @@ -27,7 +27,9 @@ parser.add_argument("--prefix", type=str, default=None, help="prefix appended to result dir name") parser.add_argument("--data-dir", type=str, default=None, help="path to `tensorflow_datasets/`") parser.add_argument("--gpus", type=int, default=None, help="to use CPU set to 0; else e.g., 4") -parser.add_argument("--gpu-batch-multiplier", type=int, default=None, help="Increase batch size per GPU by this constant factor") +parser.add_argument( + "--gpu-batch-multiplier", type=int, default=None, help="Increase batch size per GPU by this constant factor" +) parser.add_argument( "--dataset", type=str, @@ -38,7 +40,9 @@ ) parser.add_argument("--num-workers", type=int, default=None, help="number of processes to load the data") parser.add_argument("--prefetch-factor", type=int, default=None, help="number of samples to fetch & prefetch at every call") -parser.add_argument("--resume-training", type=str, default=None, help="training dir containing the checkpointed training to resume") +parser.add_argument( + "--resume-training", type=str, default=None, help="training dir containing the checkpointed training to resume" +) parser.add_argument("--load", type=str, default=None, help="load checkpoint and start new training from epoch 1") parser.add_argument("--train", action="store_true", default=None, help="initiates a training") @@ -53,7 +57,9 @@ help="which graph layer to use", choices=["attention", "gnn_lsh", "mamba"], ) -parser.add_argument("--num-convs", type=int, default=None, help="number of cross-particle convolution (GNN, attention, Mamba) layers") +parser.add_argument( + "--num-convs", type=int, default=None, help="number of cross-particle convolution (GNN, attention, Mamba) layers" +) parser.add_argument("--make-plots", action="store_true", default=None, help="make plots of the test predictions") parser.add_argument("--export-onnx", action="store_true", default=None, help="exports the model to onnx") parser.add_argument("--ntrain", type=int, default=None, help="training samples to use, if None use entire dataset") @@ -88,6 +94,10 @@ ) parser.add_argument("--test-datasets", nargs="+", default=[], help="test samples to process") +parser.add_argument( + "--standardize_inputs", action="store_true", default=None, help="will standardize the input features before training" +) + def get_outdir(resume_training, load): outdir = None @@ -149,6 +159,9 @@ def main(): } config["test_dataset"] = {"cms_pf_ttbar": config["test_dataset"]["cms_pf_ttbar"]} + if args.standardize_inputs: + config["standardize_inputs"] = True + # override loaded config with values from command line args config = override_config(config, args) From 010eef5dd6db830b32ace63180ab0dfe39dab201 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:29:30 +0200 Subject: [PATCH 48/66] up --- mlpf/pyg/mlpf.py | 12 +- mlpf/pyg/training.py | 2 +- mlpf/pyg_pipeline.py | 6 +- parameters/pytorch/pyg-cld.yaml | 2 +- parameters/pytorch/pyg-clic-hits.yaml | 2 +- parameters/pytorch/pyg-clic-ttbar-21-std.yaml | 123 ------------------ parameters/pytorch/pyg-clic-ttbar-22-std.yaml | 123 ------------------ parameters/pytorch/pyg-clic.yaml | 2 +- parameters/pytorch/pyg-cms-finetune.yaml | 2 +- 
parameters/pytorch/pyg-cms-ttbar-nopu.yaml | 2 +- parameters/pytorch/pyg-cms.yaml | 2 +- 11 files changed, 18 insertions(+), 260 deletions(-) delete mode 100644 parameters/pytorch/pyg-clic-ttbar-21-std.yaml delete mode 100644 parameters/pytorch/pyg-clic-ttbar-22-std.yaml diff --git a/mlpf/pyg/mlpf.py b/mlpf/pyg/mlpf.py index 33d431c09..5759c1f50 100644 --- a/mlpf/pyg/mlpf.py +++ b/mlpf/pyg/mlpf.py @@ -57,7 +57,7 @@ def norm_cdf(x): return tensor -def standardize_inputs(X, elemtypes_nonzero, standardization_dict): +def standardize_input(X, elemtypes_nonzero, standardization_dict): for i, ielem in enumerate(elemtypes_nonzero): @@ -122,7 +122,9 @@ def __init__( self.mha = torch.nn.MultiheadAttention(embedding_dim, num_heads, dropout=dropout_mha, batch_first=True) self.norm0 = torch.nn.LayerNorm(embedding_dim) self.norm1 = torch.nn.LayerNorm(embedding_dim) - self.seq = torch.nn.Sequential(nn.Linear(embedding_dim, width), self.act(), nn.Linear(width, embedding_dim), self.act()) + self.seq = torch.nn.Sequential( + nn.Linear(embedding_dim, width), self.act(), nn.Linear(width, embedding_dim), self.act() + ) self.dropout = torch.nn.Dropout(dropout_ff) _logger.info("using attention_type={}".format(attention_type)) # params for torch sdp_kernel @@ -402,7 +404,7 @@ def forward(self, X_features, mask, standardization_dict=None): Xfeat_normed = X_features if standardization_dict is not None: - Xfeat_normed = standardize_inputs(X_features, self.elemtypes_nonzero, standardization_dict) + Xfeat_normed = standardize_input(X_features, self.elemtypes_nonzero, standardization_dict) embeddings_id, embeddings_reg = [], [] if self.num_convs != 0: @@ -463,7 +465,9 @@ def forward(self, X_features, mask, standardization_dict=None): e_real[~mask] = 0 e_real[torch.isinf(e_real)] = 0 e_real[torch.isnan(e_real)] = 0 - preds_energy = e_real + torch.nn.functional.relu(self.nn_energy(X_features, final_embedding_reg, X_features[..., 5:6])) + preds_energy = e_real + torch.nn.functional.relu( + self.nn_energy(X_features, final_embedding_reg, X_features[..., 5:6]) + ) preds_momentum = torch.cat([preds_pt, preds_eta, preds_sin_phi, preds_cos_phi, preds_energy], axis=-1) return preds_binary_particle, preds_pid, preds_momentum diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index 2d8b43d67..c561ab5c6 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -1070,7 +1070,7 @@ def run(rank, world_size, config, args, outdir, logfile): last_epoch = -1 if start_epoch == 1 else start_epoch - 1 lr_schedule = get_lr_schedule(config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch) - if config["standardize_inputs"] is True: + if config["standardize_input"] is True: if (rank == 0) or (rank == "cpu"): _logger.info("Will standardize the input features before running the training") standardization_dict = get_input_standardization(config["dataset"], loaders["train"]) diff --git a/mlpf/pyg_pipeline.py b/mlpf/pyg_pipeline.py index c592c3018..d07423ed3 100644 --- a/mlpf/pyg_pipeline.py +++ b/mlpf/pyg_pipeline.py @@ -95,7 +95,7 @@ parser.add_argument("--test-datasets", nargs="+", default=[], help="test samples to process") parser.add_argument( - "--standardize_inputs", action="store_true", default=None, help="will standardize the input features before training" + "--standardize_input", action="store_true", default=None, help="will standardize the input features before training" ) @@ -159,8 +159,8 @@ def main(): } config["test_dataset"] = {"cms_pf_ttbar": config["test_dataset"]["cms_pf_ttbar"]} - if 
args.standardize_inputs: - config["standardize_inputs"] = True + if args.standardize_input: + config["standardize_input"] = True # override loaded config with values from command line args config = override_config(config, args) diff --git a/parameters/pytorch/pyg-cld.yaml b/parameters/pytorch/pyg-cld.yaml index f3ae2e957..e2353086a 100644 --- a/parameters/pytorch/pyg-cld.yaml +++ b/parameters/pytorch/pyg-cld.yaml @@ -1,6 +1,6 @@ backend: pytorch -standardize_inputs: False +standardize_input: False dataset: cld sort_data: no data_dir: diff --git a/parameters/pytorch/pyg-clic-hits.yaml b/parameters/pytorch/pyg-clic-hits.yaml index 4a8b5e3b1..7f6aa796f 100644 --- a/parameters/pytorch/pyg-clic-hits.yaml +++ b/parameters/pytorch/pyg-clic-hits.yaml @@ -1,6 +1,6 @@ backend: pytorch -standardize_inputs: False +standardize_input: False dataset: clic data_dir: gpus: 1 diff --git a/parameters/pytorch/pyg-clic-ttbar-21-std.yaml b/parameters/pytorch/pyg-clic-ttbar-21-std.yaml deleted file mode 100644 index ae6b5a01e..000000000 --- a/parameters/pytorch/pyg-clic-ttbar-21-std.yaml +++ /dev/null @@ -1,123 +0,0 @@ -backend: pytorch - -standardize_inputs: True -save_attention: yes -dataset: clic -sort_data: no -data_dir: -gpus: 1 -gpu_batch_multiplier: 1 -load: -num_epochs: 100 -patience: 20 -lr: 0.0001 -lr_schedule: cosinedecay # constant, cosinedecay, onecycle -conv_type: attention # gnn_lsh, attention, mamba, flashattention -ntrain: -ntest: -nvalid: -num_workers: 0 -prefetch_factor: -checkpoint_freq: -comet_name: particleflow-pt -comet_offline: False -comet_step_freq: 100 -dtype: float32 -val_freq: # run an extra validation run every val_freq training steps - -model: - trainable: all - learned_representation_mode: last #last, concat - input_encoding: split #split, joint - pt_mode: direct-elemtype-split - eta_mode: linear - sin_phi_mode: linear - cos_phi_mode: linear - energy_mode: direct-elemtype-split - - gnn_lsh: - conv_type: gnn_lsh - embedding_dim: 512 - width: 512 - num_convs: 8 - activation: "elu" - # gnn-lsh specific parameters - bin_size: 32 - max_num_bins: 200 - distance_dim: 128 - layernorm: True - num_node_messages: 2 - ffn_dist_hidden_dim: 128 - ffn_dist_num_layers: 2 - - attention: - conv_type: attention - num_convs: 8 - dropout_ff: 0.0 - dropout_conv_id_mha: 0.0 - dropout_conv_id_ff: 0.0 - dropout_conv_reg_mha: 0.0 - dropout_conv_reg_ff: 0.0 - activation: "relu" - head_dim: 64 - num_heads: 12 - attention_type: math - use_pre_layernorm: True - - mamba: - conv_type: mamba - embedding_dim: 128 - width: 128 - num_convs: 2 - dropout: 0.0 - activation: "elu" - # transformer specific paramters - num_heads: 2 - # mamba specific paramters - d_state: 16 - d_conv: 4 - expand: 2 - -lr_schedule_config: - onecycle: - pct_start: 0.3 - -raytune: - local_dir: # Note: please specify an absolute path - sched: # asha, hyperband - search_alg: # bayes, bohb, hyperopt, nevergrad, scikit - default_metric: "val_loss" - default_mode: "min" - # Tune schedule specific parameters - asha: - max_t: 200 - reduction_factor: 4 - brackets: 1 - grace_period: 10 - hyperband: - max_t: 200 - reduction_factor: 4 - hyperopt: - n_random_steps: 10 - nevergrad: - n_random_steps: 10 - -train_dataset: - clic: - physical: - batch_size: 1 - samples: - clic_edm_ttbar_pf: - version: 2.1.0 - -valid_dataset: - clic: - physical: - batch_size: 1 - samples: - clic_edm_ttbar_pf: - version: 2.1.0 - -test_dataset: - clic_edm_ttbar_pf: - version: 2.1.0 diff --git a/parameters/pytorch/pyg-clic-ttbar-22-std.yaml 
b/parameters/pytorch/pyg-clic-ttbar-22-std.yaml deleted file mode 100644 index 39e3e8247..000000000 --- a/parameters/pytorch/pyg-clic-ttbar-22-std.yaml +++ /dev/null @@ -1,123 +0,0 @@ -backend: pytorch - -standardize_inputs: True -save_attention: yes -dataset: clic -sort_data: no -data_dir: -gpus: 1 -gpu_batch_multiplier: 1 -load: -num_epochs: 100 -patience: 20 -lr: 0.0001 -lr_schedule: cosinedecay # constant, cosinedecay, onecycle -conv_type: attention # gnn_lsh, attention, mamba, flashattention -ntrain: -ntest: -nvalid: -num_workers: 0 -prefetch_factor: -checkpoint_freq: -comet_name: particleflow-pt -comet_offline: False -comet_step_freq: 100 -dtype: float32 -val_freq: # run an extra validation run every val_freq training steps - -model: - trainable: all - learned_representation_mode: last #last, concat - input_encoding: split #split, joint - pt_mode: direct-elemtype-split - eta_mode: linear - sin_phi_mode: linear - cos_phi_mode: linear - energy_mode: direct-elemtype-split - - gnn_lsh: - conv_type: gnn_lsh - embedding_dim: 512 - width: 512 - num_convs: 8 - activation: "elu" - # gnn-lsh specific parameters - bin_size: 32 - max_num_bins: 200 - distance_dim: 128 - layernorm: True - num_node_messages: 2 - ffn_dist_hidden_dim: 128 - ffn_dist_num_layers: 2 - - attention: - conv_type: attention - num_convs: 8 - dropout_ff: 0.0 - dropout_conv_id_mha: 0.0 - dropout_conv_id_ff: 0.0 - dropout_conv_reg_mha: 0.0 - dropout_conv_reg_ff: 0.0 - activation: "relu" - head_dim: 64 - num_heads: 12 - attention_type: math - use_pre_layernorm: True - - mamba: - conv_type: mamba - embedding_dim: 128 - width: 128 - num_convs: 2 - dropout: 0.0 - activation: "elu" - # transformer specific paramters - num_heads: 2 - # mamba specific paramters - d_state: 16 - d_conv: 4 - expand: 2 - -lr_schedule_config: - onecycle: - pct_start: 0.3 - -raytune: - local_dir: # Note: please specify an absolute path - sched: # asha, hyperband - search_alg: # bayes, bohb, hyperopt, nevergrad, scikit - default_metric: "val_loss" - default_mode: "min" - # Tune schedule specific parameters - asha: - max_t: 200 - reduction_factor: 4 - brackets: 1 - grace_period: 10 - hyperband: - max_t: 200 - reduction_factor: 4 - hyperopt: - n_random_steps: 10 - nevergrad: - n_random_steps: 10 - -train_dataset: - clic: - physical: - batch_size: 1 - samples: - clic_edm_ttbar_pf: - version: 2.2.0 - -valid_dataset: - clic: - physical: - batch_size: 1 - samples: - clic_edm_ttbar_pf: - version: 2.2.0 - -test_dataset: - clic_edm_ttbar_pf: - version: 2.2.0 diff --git a/parameters/pytorch/pyg-clic.yaml b/parameters/pytorch/pyg-clic.yaml index e4dcdde1e..185368c12 100644 --- a/parameters/pytorch/pyg-clic.yaml +++ b/parameters/pytorch/pyg-clic.yaml @@ -1,6 +1,6 @@ backend: pytorch -standardize_inputs: False +standardize_input: False save_attention: yes dataset: clic sort_data: no diff --git a/parameters/pytorch/pyg-cms-finetune.yaml b/parameters/pytorch/pyg-cms-finetune.yaml index 2c362ea39..03f5af6c8 100644 --- a/parameters/pytorch/pyg-cms-finetune.yaml +++ b/parameters/pytorch/pyg-cms-finetune.yaml @@ -1,6 +1,6 @@ backend: pytorch -standardize_inputs: False +standardize_input: False dataset: cms sort_data: yes data_dir: diff --git a/parameters/pytorch/pyg-cms-ttbar-nopu.yaml b/parameters/pytorch/pyg-cms-ttbar-nopu.yaml index 2e1ac6e94..8485611c4 100644 --- a/parameters/pytorch/pyg-cms-ttbar-nopu.yaml +++ b/parameters/pytorch/pyg-cms-ttbar-nopu.yaml @@ -1,6 +1,6 @@ backend: pytorch -standardize_inputs: False +standardize_input: False dataset: cms sort_data: yes 
data_dir: diff --git a/parameters/pytorch/pyg-cms.yaml b/parameters/pytorch/pyg-cms.yaml index 76770c1cb..7507d848d 100644 --- a/parameters/pytorch/pyg-cms.yaml +++ b/parameters/pytorch/pyg-cms.yaml @@ -1,6 +1,6 @@ backend: pytorch -standardize_inputs: False +standardize_input: False save_attention: no dataset: cms sort_data: yes From 9156459182461c668a3ac02d1aed91a53a91e57e Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:30:48 +0200 Subject: [PATCH 49/66] check --- mlpf/pyg_pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlpf/pyg_pipeline.py b/mlpf/pyg_pipeline.py index d07423ed3..278c0911a 100644 --- a/mlpf/pyg_pipeline.py +++ b/mlpf/pyg_pipeline.py @@ -159,8 +159,8 @@ def main(): } config["test_dataset"] = {"cms_pf_ttbar": config["test_dataset"]["cms_pf_ttbar"]} - if args.standardize_input: - config["standardize_input"] = True + # if args.standardize_input: + # config["standardize_input"] = True # override loaded config with values from command line args config = override_config(config, args) From 812f05c092577ece9c19dd8224260c1bac8837de Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:31:51 +0200 Subject: [PATCH 50/66] remove unnecessary config --- parameters/pytorch/pyg-clic-std.yaml | 129 --------------------------- 1 file changed, 129 deletions(-) delete mode 100644 parameters/pytorch/pyg-clic-std.yaml diff --git a/parameters/pytorch/pyg-clic-std.yaml b/parameters/pytorch/pyg-clic-std.yaml deleted file mode 100644 index 287069d31..000000000 --- a/parameters/pytorch/pyg-clic-std.yaml +++ /dev/null @@ -1,129 +0,0 @@ -backend: pytorch - -standardize_inputs: True -save_attention: yes -dataset: clic -sort_data: no -data_dir: -gpus: 1 -gpu_batch_multiplier: 1 -load: -num_epochs: 100 -patience: 20 -lr: 0.0001 -lr_schedule: cosinedecay # constant, cosinedecay, onecycle -conv_type: attention # gnn_lsh, attention, mamba, flashattention -ntrain: -ntest: -nvalid: -num_workers: 0 -prefetch_factor: -checkpoint_freq: -comet_name: particleflow-pt -comet_offline: False -comet_step_freq: 100 -dtype: float32 -val_freq: # run an extra validation run every val_freq training steps - -model: - trainable: all - learned_representation_mode: last #last, concat - input_encoding: split #split, joint - pt_mode: direct-elemtype-split - eta_mode: linear - sin_phi_mode: linear - cos_phi_mode: linear - energy_mode: direct-elemtype-split - - gnn_lsh: - conv_type: gnn_lsh - embedding_dim: 512 - width: 512 - num_convs: 8 - activation: "elu" - # gnn-lsh specific parameters - bin_size: 32 - max_num_bins: 200 - distance_dim: 128 - layernorm: True - num_node_messages: 2 - ffn_dist_hidden_dim: 128 - ffn_dist_num_layers: 2 - - attention: - conv_type: attention - num_convs: 4 - dropout_ff: 0.0 - dropout_conv_id_mha: 0.0 - dropout_conv_id_ff: 0.0 - dropout_conv_reg_mha: 0.0 - dropout_conv_reg_ff: 0.0 - activation: "gelu" - head_dim: 32 - num_heads: 32 - attention_type: math - use_pre_layernorm: True - - mamba: - conv_type: mamba - embedding_dim: 128 - width: 128 - num_convs: 2 - dropout: 0.0 - activation: "elu" - # transformer specific paramters - num_heads: 2 - # mamba specific paramters - d_state: 16 - d_conv: 4 - expand: 2 - -lr_schedule_config: - onecycle: - pct_start: 0.3 - -raytune: - local_dir: # Note: please specify an absolute path - sched: # asha, hyperband - search_alg: # bayes, bohb, hyperopt, nevergrad, scikit - default_metric: "val_loss" - default_mode: "min" - # Tune schedule specific parameters - asha: - max_t: 200 - reduction_factor: 4 - 
brackets: 1 - grace_period: 10 - hyperband: - max_t: 200 - reduction_factor: 4 - hyperopt: - n_random_steps: 10 - nevergrad: - n_random_steps: 10 - -train_dataset: - clic: - physical: - batch_size: 1 - samples: - clic_edm_ttbar_pf: - version: 2.1.0 - clic_edm_qq_pf: - version: 2.1.0 - -valid_dataset: - clic: - physical: - batch_size: 1 - samples: - clic_edm_ttbar_pf: - version: 2.1.0 - clic_edm_qq_pf: - version: 2.1.0 - -test_dataset: - clic_edm_ttbar_pf: - version: 2.1.0 - clic_edm_qq_pf: - version: 2.1.0 From 0968e51046d3bde32ac2800aac3647d63e80d1a5 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:34:25 +0200 Subject: [PATCH 51/66] up --- mlpf/pyg_pipeline.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mlpf/pyg_pipeline.py b/mlpf/pyg_pipeline.py index 278c0911a..1ed7116e5 100644 --- a/mlpf/pyg_pipeline.py +++ b/mlpf/pyg_pipeline.py @@ -95,7 +95,7 @@ parser.add_argument("--test-datasets", nargs="+", default=[], help="test samples to process") parser.add_argument( - "--standardize_input", action="store_true", default=None, help="will standardize the input features before training" + "--standardize-input", action="store_true", default=None, help="will standardize the input features before training" ) @@ -159,9 +159,6 @@ def main(): } config["test_dataset"] = {"cms_pf_ttbar": config["test_dataset"]["cms_pf_ttbar"]} - # if args.standardize_input: - # config["standardize_input"] = True - # override loaded config with values from command line args config = override_config(config, args) From 42ff712e2e662454faec8fd86395e0f3e2916b22 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:48:19 +0200 Subject: [PATCH 52/66] debug --- mlpf/pyg/utils.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/mlpf/pyg/utils.py b/mlpf/pyg/utils.py index a58869439..f5ab29ad8 100644 --- a/mlpf/pyg/utils.py +++ b/mlpf/pyg/utils.py @@ -162,7 +162,9 @@ def unpack_target(y, model): # note ~ momentum = ["pt", "eta", "sin_phi", "cos_phi", "energy"] ret["momentum"] = y[..., 2:7].to(dtype=torch.float32) - ret["p4"] = torch.cat([ret["pt"].unsqueeze(-1), ret["eta"].unsqueeze(-1), ret["phi"].unsqueeze(-1), ret["energy"].unsqueeze(-1)], axis=-1) + ret["p4"] = torch.cat( + [ret["pt"].unsqueeze(-1), ret["eta"].unsqueeze(-1), ret["phi"].unsqueeze(-1), ret["energy"].unsqueeze(-1)], axis=-1 + ) ret["ispu"] = y[..., -1] @@ -280,7 +282,11 @@ def load_lr_schedule(lr_schedule, checkpoint): lr_schedule.load_state_dict(checkpoint["extra_state"]["lr_schedule_state_dict"]) return lr_schedule else: - raise KeyError("Couldn't find LR schedule state dict in checkpoint. extra_state contains: {}".format(checkpoint["extra_state"].keys())) + raise KeyError( + "Couldn't find LR schedule state dict in checkpoint. 
extra_state contains: {}".format( + checkpoint["extra_state"].keys() + ) + ) def get_lr_schedule(config, opt, epochs=None, steps_per_epoch=None, last_epoch=-1): @@ -298,7 +304,9 @@ def get_lr_schedule(config, opt, epochs=None, steps_per_epoch=None, last_epoch=- pct_start=config["lr_schedule_config"]["onecycle"]["pct_start"] or 0.3, ) elif config["lr_schedule"] == "cosinedecay": - lr_schedule = CosineAnnealingLR(opt, T_max=steps_per_epoch * epochs, last_epoch=last_batch, eta_min=config["lr"] * 0.1) + lr_schedule = CosineAnnealingLR( + opt, T_max=steps_per_epoch * epochs, last_epoch=last_batch, eta_min=config["lr"] * 0.1 + ) else: raise ValueError("Supported values for lr_schedule are 'constant', 'onecycle' and 'cosinedecay'.") return lr_schedule @@ -352,7 +360,7 @@ def get_input_standardization(dataset, train_loader, nsubset=10_000): concatenated_pfelements = batch.X[msk] else: concatenated_pfelements = torch.cat([concatenated_pfelements, batch.X[msk]]) - + print("concatenated_pfelements", concatenated_pfelements.device) if tot_events > nsubset: break From fb9e68a78de5fa6b3fb4f2ac9d20d1628de15807 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:49:31 +0200 Subject: [PATCH 53/66] revert --- mlpf/pyg/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlpf/pyg/utils.py b/mlpf/pyg/utils.py index f5ab29ad8..2e7f763b6 100644 --- a/mlpf/pyg/utils.py +++ b/mlpf/pyg/utils.py @@ -360,7 +360,7 @@ def get_input_standardization(dataset, train_loader, nsubset=10_000): concatenated_pfelements = batch.X[msk] else: concatenated_pfelements = torch.cat([concatenated_pfelements, batch.X[msk]]) - print("concatenated_pfelements", concatenated_pfelements.device) + if tot_events > nsubset: break From 181c5341c4dab56222d6b53f96464752b9163ca7 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:51:35 +0200 Subject: [PATCH 54/66] up new config for all samples --- mlpf/pyg/mlpf.py | 8 +- mlpf/pyg/training.py | 72 +++------- mlpf/pyg/utils.py | 14 +- mlpf/pyg_pipeline.py | 16 +-- parameters/pytorch/pyg-clic-allsamples.yaml | 141 ++++++++++++++++++++ 5 files changed, 171 insertions(+), 80 deletions(-) create mode 100644 parameters/pytorch/pyg-clic-allsamples.yaml diff --git a/mlpf/pyg/mlpf.py b/mlpf/pyg/mlpf.py index 5759c1f50..163b880e2 100644 --- a/mlpf/pyg/mlpf.py +++ b/mlpf/pyg/mlpf.py @@ -122,9 +122,7 @@ def __init__( self.mha = torch.nn.MultiheadAttention(embedding_dim, num_heads, dropout=dropout_mha, batch_first=True) self.norm0 = torch.nn.LayerNorm(embedding_dim) self.norm1 = torch.nn.LayerNorm(embedding_dim) - self.seq = torch.nn.Sequential( - nn.Linear(embedding_dim, width), self.act(), nn.Linear(width, embedding_dim), self.act() - ) + self.seq = torch.nn.Sequential(nn.Linear(embedding_dim, width), self.act(), nn.Linear(width, embedding_dim), self.act()) self.dropout = torch.nn.Dropout(dropout_ff) _logger.info("using attention_type={}".format(attention_type)) # params for torch sdp_kernel @@ -465,9 +463,7 @@ def forward(self, X_features, mask, standardization_dict=None): e_real[~mask] = 0 e_real[torch.isinf(e_real)] = 0 e_real[torch.isnan(e_real)] = 0 - preds_energy = e_real + torch.nn.functional.relu( - self.nn_energy(X_features, final_embedding_reg, X_features[..., 5:6]) - ) + preds_energy = e_real + torch.nn.functional.relu(self.nn_energy(X_features, final_embedding_reg, X_features[..., 5:6])) preds_momentum = torch.cat([preds_pt, preds_eta, preds_sin_phi, preds_cos_phi, preds_energy], axis=-1) return preds_binary_particle, preds_pid, preds_momentum diff 
--git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index c561ab5c6..c526fcb0e 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -97,9 +97,7 @@ def mlpf_loss(y, ypred, batch): # binary loss for particle / no-particle classification # loss_binary_classification = loss_obj_id(ypred["cls_binary"], (y["cls_id"] != 0).long()).reshape(y["cls_id"].shape) - loss_binary_classification = 10 * torch.nn.functional.cross_entropy( - ypred["cls_binary"], (y["cls_id"] != 0).long(), reduction="none" - ) + loss_binary_classification = 10 * torch.nn.functional.cross_entropy(ypred["cls_binary"], (y["cls_id"] != 0).long(), reduction="none") # compare the particle type, only for cases where there was a true particle loss_pid_classification = loss_obj_id(ypred["cls_id_onehot"], y["cls_id"]).reshape(y["cls_id"].shape) @@ -147,12 +145,12 @@ def mlpf_loss(y, ypred, batch): pred_met = torch.sqrt(torch.sum(pred_px, axis=-2) ** 2 + torch.sum(pred_py, axis=-2) ** 2) loss["MET"] = torch.nn.functional.huber_loss(pred_met.squeeze(dim=-1), batch.genmet).mean() - was_input_pred = torch.concat( - [torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1 - ) * batch.mask.unsqueeze(axis=-1) - was_input_true = torch.concat( - [torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long)), y["momentum"]], axis=-1 - ) * batch.mask.unsqueeze(axis=-1) + was_input_pred = torch.concat([torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1) * batch.mask.unsqueeze( + axis=-1 + ) + was_input_true = torch.concat([torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long)), y["momentum"]], axis=-1) * batch.mask.unsqueeze( + axis=-1 + ) # standardize Wasserstein loss std = was_input_true[batch.mask].std(axis=0) @@ -194,9 +192,7 @@ class FocalLoss(nn.Module): - y: (batch_size,) or (batch_size, d1, d2, ..., dK), K > 0. """ - def __init__( - self, alpha: Optional[Tensor] = None, gamma: float = 0.0, reduction: str = "mean", ignore_index: int = -100 - ): + def __init__(self, alpha: Optional[Tensor] = None, gamma: float = 0.0, reduction: str = "mean", ignore_index: int = -100): """Constructor. Args: alpha (Tensor, optional): Weights for each class. Defaults to None. 
@@ -386,30 +382,18 @@ def validation_plots(batch, ypred_raw, ygen, ypred, tensorboard_writer, epoch, o ratio = (ypred_raw[2][batch.mask][:, 1] / batch.ygen[batch.mask][:, 3])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("eta_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram( - "sphi_target", torch.clamp(batch.ygen[batch.mask][:, 4], -10, 10), global_step=epoch - ) - tensorboard_writer.add_histogram( - "sphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 2], -10, 10), global_step=epoch - ) + tensorboard_writer.add_histogram("sphi_target", torch.clamp(batch.ygen[batch.mask][:, 4], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram("sphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 2], -10, 10), global_step=epoch) ratio = (ypred_raw[2][batch.mask][:, 2] / batch.ygen[batch.mask][:, 4])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("sphi_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram( - "cphi_target", torch.clamp(batch.ygen[batch.mask][:, 5], -10, 10), global_step=epoch - ) - tensorboard_writer.add_histogram( - "cphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 3], -10, 10), global_step=epoch - ) + tensorboard_writer.add_histogram("cphi_target", torch.clamp(batch.ygen[batch.mask][:, 5], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram("cphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 3], -10, 10), global_step=epoch) ratio = (ypred_raw[2][batch.mask][:, 3] / batch.ygen[batch.mask][:, 5])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("cphi_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram( - "energy_target", torch.clamp(batch.ygen[batch.mask][:, 6], -10, 10), global_step=epoch - ) - tensorboard_writer.add_histogram( - "energy_pred", torch.clamp(ypred_raw[2][batch.mask][:, 4], -10, 10), global_step=epoch - ) + tensorboard_writer.add_histogram("energy_target", torch.clamp(batch.ygen[batch.mask][:, 6], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram("energy_pred", torch.clamp(ypred_raw[2][batch.mask][:, 4], -10, 10), global_step=epoch) ratio = (ypred_raw[2][batch.mask][:, 4] / batch.ygen[batch.mask][:, 6])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("energy_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) @@ -473,9 +457,7 @@ def train_and_valid( if (world_size > 1) and (rank != 0): iterator = enumerate(data_loader) else: - iterator = tqdm.tqdm( - enumerate(data_loader), total=len(data_loader), desc=f"Epoch {epoch} {train_or_valid} loop on rank={rank}" - ) + iterator = tqdm.tqdm(enumerate(data_loader), total=len(data_loader), desc=f"Epoch {epoch} {train_or_valid} loop on rank={rank}") device_type = "cuda" if isinstance(rank, int) else "cpu" @@ -733,9 +715,7 @@ def train_mlpf( # training step, edit here to profile a specific epoch if epoch == -1: - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, with_stack=True - ) as prof: + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, with_stack=True) as prof: with record_function("model_train"): losses_t = train_and_valid( rank, @@ -1037,9 +1017,7 @@ def run(rank, world_size, config, args, outdir, logfile): _logger.info(f"Model directory {outdir}", color="bold") if args.comet: - comet_experiment = create_comet_experiment( - config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir - ) + 
comet_experiment = create_comet_experiment(config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir) comet_experiment.set_name(f"rank_{rank}_{Path(outdir).name}") comet_experiment.log_parameter("run_id", Path(outdir).name) comet_experiment.log_parameter("world_size", world_size) @@ -1294,9 +1272,7 @@ def train_ray_trial(config, args, outdir=None): loaders = get_interleaved_dataloaders(world_size, rank, config, use_cuda, use_ray=True) if args.comet: - comet_experiment = create_comet_experiment( - config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir - ) + comet_experiment = create_comet_experiment(config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir) comet_experiment.set_name(f"world_rank_{world_rank}_{Path(outdir).name}") comet_experiment.log_parameter("run_id", Path(outdir).name) comet_experiment.log_parameter("world_size", world_size) @@ -1330,9 +1306,7 @@ def train_ray_trial(config, args, outdir=None): if args.resume_training: model, optimizer = load_checkpoint(checkpoint, model, optimizer) start_epoch = checkpoint["extra_state"]["epoch"] + 1 - lr_schedule = get_lr_schedule( - config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch=start_epoch - 1 - ) + lr_schedule = get_lr_schedule(config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch=start_epoch - 1) else: # start a new training with model weights loaded from a pre-trained model model = load_checkpoint(checkpoint, model) @@ -1496,9 +1470,7 @@ def run_hpo(config, args): if tune.Tuner.can_restore(str(expdir)): # resume unfinished HPO run - tuner = tune.Tuner.restore( - str(expdir), trainable=trainer, resume_errored=True, restart_errored=False, resume_unfinished=True - ) + tuner = tune.Tuner.restore(str(expdir), trainable=trainer, resume_errored=True, restart_errored=False, resume_unfinished=True) else: # start new HPO run search_space = {"train_loop_config": search_space} # the ray TorchTrainer only takes a single arg: train_loop_config @@ -1539,6 +1511,4 @@ def run_hpo(config, args): print(result_df.columns) logging.info("Total time of Tuner.fit(): {}".format(end - start)) - logging.info( - "Best hyperparameters found according to {} were: {}".format(config["raytune"]["default_metric"], best_config) - ) + logging.info("Best hyperparameters found according to {} were: {}".format(config["raytune"]["default_metric"], best_config)) diff --git a/mlpf/pyg/utils.py b/mlpf/pyg/utils.py index 2e7f763b6..a58869439 100644 --- a/mlpf/pyg/utils.py +++ b/mlpf/pyg/utils.py @@ -162,9 +162,7 @@ def unpack_target(y, model): # note ~ momentum = ["pt", "eta", "sin_phi", "cos_phi", "energy"] ret["momentum"] = y[..., 2:7].to(dtype=torch.float32) - ret["p4"] = torch.cat( - [ret["pt"].unsqueeze(-1), ret["eta"].unsqueeze(-1), ret["phi"].unsqueeze(-1), ret["energy"].unsqueeze(-1)], axis=-1 - ) + ret["p4"] = torch.cat([ret["pt"].unsqueeze(-1), ret["eta"].unsqueeze(-1), ret["phi"].unsqueeze(-1), ret["energy"].unsqueeze(-1)], axis=-1) ret["ispu"] = y[..., -1] @@ -282,11 +280,7 @@ def load_lr_schedule(lr_schedule, checkpoint): lr_schedule.load_state_dict(checkpoint["extra_state"]["lr_schedule_state_dict"]) return lr_schedule else: - raise KeyError( - "Couldn't find LR schedule state dict in checkpoint. extra_state contains: {}".format( - checkpoint["extra_state"].keys() - ) - ) + raise KeyError("Couldn't find LR schedule state dict in checkpoint. 
extra_state contains: {}".format(checkpoint["extra_state"].keys())) def get_lr_schedule(config, opt, epochs=None, steps_per_epoch=None, last_epoch=-1): @@ -304,9 +298,7 @@ def get_lr_schedule(config, opt, epochs=None, steps_per_epoch=None, last_epoch=- pct_start=config["lr_schedule_config"]["onecycle"]["pct_start"] or 0.3, ) elif config["lr_schedule"] == "cosinedecay": - lr_schedule = CosineAnnealingLR( - opt, T_max=steps_per_epoch * epochs, last_epoch=last_batch, eta_min=config["lr"] * 0.1 - ) + lr_schedule = CosineAnnealingLR(opt, T_max=steps_per_epoch * epochs, last_epoch=last_batch, eta_min=config["lr"] * 0.1) else: raise ValueError("Supported values for lr_schedule are 'constant', 'onecycle' and 'cosinedecay'.") return lr_schedule diff --git a/mlpf/pyg_pipeline.py b/mlpf/pyg_pipeline.py index 1ed7116e5..6aa6ab6fd 100644 --- a/mlpf/pyg_pipeline.py +++ b/mlpf/pyg_pipeline.py @@ -27,9 +27,7 @@ parser.add_argument("--prefix", type=str, default=None, help="prefix appended to result dir name") parser.add_argument("--data-dir", type=str, default=None, help="path to `tensorflow_datasets/`") parser.add_argument("--gpus", type=int, default=None, help="to use CPU set to 0; else e.g., 4") -parser.add_argument( - "--gpu-batch-multiplier", type=int, default=None, help="Increase batch size per GPU by this constant factor" -) +parser.add_argument("--gpu-batch-multiplier", type=int, default=None, help="Increase batch size per GPU by this constant factor") parser.add_argument( "--dataset", type=str, @@ -40,9 +38,7 @@ ) parser.add_argument("--num-workers", type=int, default=None, help="number of processes to load the data") parser.add_argument("--prefetch-factor", type=int, default=None, help="number of samples to fetch & prefetch at every call") -parser.add_argument( - "--resume-training", type=str, default=None, help="training dir containing the checkpointed training to resume" -) +parser.add_argument("--resume-training", type=str, default=None, help="training dir containing the checkpointed training to resume") parser.add_argument("--load", type=str, default=None, help="load checkpoint and start new training from epoch 1") parser.add_argument("--train", action="store_true", default=None, help="initiates a training") @@ -57,9 +53,7 @@ help="which graph layer to use", choices=["attention", "gnn_lsh", "mamba"], ) -parser.add_argument( - "--num-convs", type=int, default=None, help="number of cross-particle convolution (GNN, attention, Mamba) layers" -) +parser.add_argument("--num-convs", type=int, default=None, help="number of cross-particle convolution (GNN, attention, Mamba) layers") parser.add_argument("--make-plots", action="store_true", default=None, help="make plots of the test predictions") parser.add_argument("--export-onnx", action="store_true", default=None, help="exports the model to onnx") parser.add_argument("--ntrain", type=int, default=None, help="training samples to use, if None use entire dataset") @@ -94,9 +88,7 @@ ) parser.add_argument("--test-datasets", nargs="+", default=[], help="test samples to process") -parser.add_argument( - "--standardize-input", action="store_true", default=None, help="will standardize the input features before training" -) +parser.add_argument("--standardize-input", action="store_true", default=None, help="will standardize the input features before training") def get_outdir(resume_training, load): diff --git a/parameters/pytorch/pyg-clic-allsamples.yaml b/parameters/pytorch/pyg-clic-allsamples.yaml new file mode 100644 index 000000000..290370b73 --- /dev/null 
+++ b/parameters/pytorch/pyg-clic-allsamples.yaml @@ -0,0 +1,141 @@ +backend: pytorch + +standardize_input: False +save_attention: yes +dataset: clic +sort_data: no +data_dir: +gpus: 1 +gpu_batch_multiplier: 1 +load: +num_epochs: 100 +patience: 20 +lr: 0.0001 +lr_schedule: cosinedecay # constant, cosinedecay, onecycle +conv_type: attention # gnn_lsh, attention, mamba, flashattention +ntrain: +ntest: +nvalid: +num_workers: 0 +prefetch_factor: +checkpoint_freq: +comet_name: particleflow-pt +comet_offline: False +comet_step_freq: 100 +dtype: float32 +val_freq: # run an extra validation run every val_freq training steps + +model: + trainable: all + learned_representation_mode: last #last, concat + input_encoding: split #split, joint + pt_mode: direct-elemtype-split + eta_mode: linear + sin_phi_mode: linear + cos_phi_mode: linear + energy_mode: direct-elemtype-split + + gnn_lsh: + conv_type: gnn_lsh + embedding_dim: 512 + width: 512 + num_convs: 8 + activation: "elu" + # gnn-lsh specific parameters + bin_size: 32 + max_num_bins: 200 + distance_dim: 128 + layernorm: True + num_node_messages: 2 + ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 2 + + attention: + conv_type: attention + num_convs: 4 + dropout_ff: 0.0 + dropout_conv_id_mha: 0.0 + dropout_conv_id_ff: 0.0 + dropout_conv_reg_mha: 0.0 + dropout_conv_reg_ff: 0.0 + activation: "gelu" + head_dim: 32 + num_heads: 32 + attention_type: math + use_pre_layernorm: True + + mamba: + conv_type: mamba + embedding_dim: 128 + width: 128 + num_convs: 2 + dropout: 0.0 + activation: "elu" + # transformer specific paramters + num_heads: 2 + # mamba specific paramters + d_state: 16 + d_conv: 4 + expand: 2 + +lr_schedule_config: + onecycle: + pct_start: 0.3 + +raytune: + local_dir: # Note: please specify an absolute path + sched: # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + clic_edm_qq_pf: + version: 2.1.0 + clic_edm_ww_fullhad_pf/: + version: 2.1.0 + clic_edm_zh_tautau_pf/: + version: 2.1.0 + clic_edm_z_tautau_pf/: + version: 2.1.0 + +valid_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + clic_edm_qq_pf: + version: 2.1.0 + clic_edm_ww_fullhad_pf/: + version: 2.1.0 + clic_edm_zh_tautau_pf/: + version: 2.1.0 + clic_edm_z_tautau_pf/: + version: 2.1.0 + +test_dataset: + clic_edm_ttbar_pf: + version: 2.1.0 + clic_edm_qq_pf: + version: 2.1.0 From 829399013c1e014b7602e5ab6393e8d1b2cb21c3 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:52:32 +0200 Subject: [PATCH 55/66] oopsie --- parameters/pytorch/pyg-clic-allsamples.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/parameters/pytorch/pyg-clic-allsamples.yaml b/parameters/pytorch/pyg-clic-allsamples.yaml index 290370b73..bff38ac5b 100644 --- a/parameters/pytorch/pyg-clic-allsamples.yaml +++ b/parameters/pytorch/pyg-clic-allsamples.yaml @@ -111,11 +111,11 @@ train_dataset: version: 2.1.0 clic_edm_qq_pf: version: 2.1.0 - clic_edm_ww_fullhad_pf/: + clic_edm_ww_fullhad_pf: version: 2.1.0 - clic_edm_zh_tautau_pf/: + clic_edm_zh_tautau_pf: version: 2.1.0 - clic_edm_z_tautau_pf/: + clic_edm_z_tautau_pf: version: 
2.1.0 valid_dataset: @@ -127,11 +127,11 @@ valid_dataset: version: 2.1.0 clic_edm_qq_pf: version: 2.1.0 - clic_edm_ww_fullhad_pf/: + clic_edm_ww_fullhad_pf: version: 2.1.0 - clic_edm_zh_tautau_pf/: + clic_edm_zh_tautau_pf: version: 2.1.0 - clic_edm_z_tautau_pf/: + clic_edm_z_tautau_pf: version: 2.1.0 test_dataset: From 687b5d71ed396c9b0d36d8c7082d85ce5642c3e1 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 16:59:33 +0200 Subject: [PATCH 56/66] up --- parameters/pytorch/pyg-clic-f.yaml | 129 +++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 parameters/pytorch/pyg-clic-f.yaml diff --git a/parameters/pytorch/pyg-clic-f.yaml b/parameters/pytorch/pyg-clic-f.yaml new file mode 100644 index 000000000..3d061e6ec --- /dev/null +++ b/parameters/pytorch/pyg-clic-f.yaml @@ -0,0 +1,129 @@ +backend: pytorch + +standardize_input: False +save_attention: yes +dataset: clic +sort_data: no +data_dir: +gpus: 1 +gpu_batch_multiplier: 1 +load: +num_epochs: 100 +patience: 20 +lr: 0.0001 +lr_schedule: cosinedecay # constant, cosinedecay, onecycle +conv_type: attention # gnn_lsh, attention, mamba, flashattention +ntrain: +ntest: +nvalid: +num_workers: 0 +prefetch_factor: +checkpoint_freq: +comet_name: particleflow-pt +comet_offline: False +comet_step_freq: 100 +dtype: float32 +val_freq: # run an extra validation run every val_freq training steps + +model: + trainable: all + learned_representation_mode: last #last, concat + input_encoding: split #split, joint + pt_mode: direct-elemtype-split + eta_mode: linear + sin_phi_mode: linear + cos_phi_mode: linear + energy_mode: direct-elemtype-split + + gnn_lsh: + conv_type: gnn_lsh + embedding_dim: 512 + width: 512 + num_convs: 8 + activation: "elu" + # gnn-lsh specific parameters + bin_size: 32 + max_num_bins: 200 + distance_dim: 128 + layernorm: True + num_node_messages: 2 + ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 2 + + attention: + conv_type: attention + num_convs: 8 + dropout_ff: 0.0 + dropout_conv_id_mha: 0.0 + dropout_conv_id_ff: 0.0 + dropout_conv_reg_mha: 0.0 + dropout_conv_reg_ff: 0.0 + activation: "relu" + head_dim: 64 + num_heads: 12 + attention_type: math + use_pre_layernorm: True + + mamba: + conv_type: mamba + embedding_dim: 128 + width: 128 + num_convs: 2 + dropout: 0.0 + activation: "elu" + # transformer specific paramters + num_heads: 2 + # mamba specific paramters + d_state: 16 + d_conv: 4 + expand: 2 + +lr_schedule_config: + onecycle: + pct_start: 0.3 + +raytune: + local_dir: # Note: please specify an absolute path + sched: # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + clic_edm_qq_pf: + version: 2.1.0 + +valid_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + clic_edm_qq_pf: + version: 2.1.0 + +test_dataset: + clic_edm_ttbar_pf: + version: 2.1.0 + clic_edm_qq_pf: + version: 2.1.0 From efdb489d3d68ae8c7d9fabf580c60c46ce116695 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Sep 2024 17:27:15 +0200 Subject: [PATCH 57/66] pca --- parameters/pytorch/pyg-clic-allsamples.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/parameters/pytorch/pyg-clic-allsamples.yaml b/parameters/pytorch/pyg-clic-allsamples.yaml index bff38ac5b..9b240c9fa 100644 --- a/parameters/pytorch/pyg-clic-allsamples.yaml +++ b/parameters/pytorch/pyg-clic-allsamples.yaml @@ -132,7 +132,7 @@ valid_dataset: clic_edm_zh_tautau_pf: version: 2.1.0 clic_edm_z_tautau_pf: - version: 2.1.0 + version: 2.1.0 test_dataset: clic_edm_ttbar_pf: From df1ecbad3da484b0f22cf0fec1730d7c4956e95a Mon Sep 17 00:00:00 2001 From: Farouk Date: Tue, 24 Sep 2024 11:18:15 +0200 Subject: [PATCH 58/66] up configs --- parameters/pytorch/pyg-clic-f.yaml | 129 ---------------------- parameters/pytorch/pyg-clic-ttbar-21.yaml | 2 +- parameters/pytorch/pyg-clic-ttbar-22.yaml | 2 +- 3 files changed, 2 insertions(+), 131 deletions(-) delete mode 100644 parameters/pytorch/pyg-clic-f.yaml diff --git a/parameters/pytorch/pyg-clic-f.yaml b/parameters/pytorch/pyg-clic-f.yaml deleted file mode 100644 index 3d061e6ec..000000000 --- a/parameters/pytorch/pyg-clic-f.yaml +++ /dev/null @@ -1,129 +0,0 @@ -backend: pytorch - -standardize_input: False -save_attention: yes -dataset: clic -sort_data: no -data_dir: -gpus: 1 -gpu_batch_multiplier: 1 -load: -num_epochs: 100 -patience: 20 -lr: 0.0001 -lr_schedule: cosinedecay # constant, cosinedecay, onecycle -conv_type: attention # gnn_lsh, attention, mamba, flashattention -ntrain: -ntest: -nvalid: -num_workers: 0 -prefetch_factor: -checkpoint_freq: -comet_name: particleflow-pt -comet_offline: False -comet_step_freq: 100 -dtype: float32 -val_freq: # run an extra validation run every val_freq training steps - -model: - trainable: all - learned_representation_mode: last #last, concat - input_encoding: split #split, joint - pt_mode: direct-elemtype-split - eta_mode: linear - sin_phi_mode: linear - cos_phi_mode: linear - energy_mode: direct-elemtype-split - - gnn_lsh: - conv_type: gnn_lsh - embedding_dim: 512 - width: 512 - num_convs: 8 - activation: "elu" - # gnn-lsh specific parameters - bin_size: 32 - max_num_bins: 200 - distance_dim: 128 - layernorm: True - num_node_messages: 2 - ffn_dist_hidden_dim: 128 - ffn_dist_num_layers: 2 - - attention: - conv_type: attention - num_convs: 8 - dropout_ff: 0.0 - dropout_conv_id_mha: 0.0 - dropout_conv_id_ff: 0.0 - dropout_conv_reg_mha: 0.0 - dropout_conv_reg_ff: 0.0 - activation: "relu" - head_dim: 64 - num_heads: 12 - attention_type: math - use_pre_layernorm: True - - mamba: - conv_type: mamba - embedding_dim: 128 - width: 128 - num_convs: 2 - dropout: 0.0 - activation: "elu" - # transformer specific paramters - num_heads: 2 - # mamba specific paramters - d_state: 16 - d_conv: 4 - expand: 2 - -lr_schedule_config: - onecycle: - pct_start: 0.3 - -raytune: - local_dir: # Note: please specify an absolute path - sched: # asha, hyperband - search_alg: # bayes, bohb, hyperopt, nevergrad, scikit - default_metric: "val_loss" - default_mode: "min" - # Tune schedule specific parameters - asha: - max_t: 200 - reduction_factor: 4 - brackets: 1 - grace_period: 10 - hyperband: - max_t: 200 - reduction_factor: 4 - hyperopt: - n_random_steps: 10 - nevergrad: - n_random_steps: 10 - -train_dataset: - clic: - physical: - batch_size: 1 - samples: - clic_edm_ttbar_pf: - version: 2.1.0 - clic_edm_qq_pf: - version: 2.1.0 - -valid_dataset: - clic: - physical: - batch_size: 1 - samples: - clic_edm_ttbar_pf: - version: 2.1.0 - clic_edm_qq_pf: - version: 2.1.0 - -test_dataset: - clic_edm_ttbar_pf: - version: 2.1.0 - clic_edm_qq_pf: - version: 2.1.0 diff --git a/parameters/pytorch/pyg-clic-ttbar-21.yaml 
b/parameters/pytorch/pyg-clic-ttbar-21.yaml index 6aea54096..ae6b5a01e 100644 --- a/parameters/pytorch/pyg-clic-ttbar-21.yaml +++ b/parameters/pytorch/pyg-clic-ttbar-21.yaml @@ -1,6 +1,6 @@ backend: pytorch -standardize_inputs: False +standardize_inputs: True save_attention: yes dataset: clic sort_data: no diff --git a/parameters/pytorch/pyg-clic-ttbar-22.yaml b/parameters/pytorch/pyg-clic-ttbar-22.yaml index 1512a6b3e..39e3e8247 100644 --- a/parameters/pytorch/pyg-clic-ttbar-22.yaml +++ b/parameters/pytorch/pyg-clic-ttbar-22.yaml @@ -1,6 +1,6 @@ backend: pytorch -standardize_inputs: False +standardize_inputs: True save_attention: yes dataset: clic sort_data: no From d6252c05c0a6d246af849b327566ef4133d45bd5 Mon Sep 17 00:00:00 2001 From: Farouk Date: Tue, 24 Sep 2024 11:19:36 +0200 Subject: [PATCH 59/66] up --- parameters/pytorch/pyg-clic-ttbar-21.yaml | 8 ++++---- parameters/pytorch/pyg-clic-ttbar-22.yaml | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/parameters/pytorch/pyg-clic-ttbar-21.yaml b/parameters/pytorch/pyg-clic-ttbar-21.yaml index ae6b5a01e..3d0f9cc64 100644 --- a/parameters/pytorch/pyg-clic-ttbar-21.yaml +++ b/parameters/pytorch/pyg-clic-ttbar-21.yaml @@ -52,15 +52,15 @@ model: attention: conv_type: attention - num_convs: 8 + num_convs: 4 dropout_ff: 0.0 dropout_conv_id_mha: 0.0 dropout_conv_id_ff: 0.0 dropout_conv_reg_mha: 0.0 dropout_conv_reg_ff: 0.0 - activation: "relu" - head_dim: 64 - num_heads: 12 + activation: "gelu" + head_dim: 32 + num_heads: 32 attention_type: math use_pre_layernorm: True diff --git a/parameters/pytorch/pyg-clic-ttbar-22.yaml b/parameters/pytorch/pyg-clic-ttbar-22.yaml index 39e3e8247..3f920fe2b 100644 --- a/parameters/pytorch/pyg-clic-ttbar-22.yaml +++ b/parameters/pytorch/pyg-clic-ttbar-22.yaml @@ -52,15 +52,15 @@ model: attention: conv_type: attention - num_convs: 8 + num_convs: 4 dropout_ff: 0.0 dropout_conv_id_mha: 0.0 dropout_conv_id_ff: 0.0 dropout_conv_reg_mha: 0.0 dropout_conv_reg_ff: 0.0 - activation: "relu" - head_dim: 64 - num_heads: 12 + activation: "gelu" + head_dim: 32 + num_heads: 32 attention_type: math use_pre_layernorm: True From 8d3d685f58fd8e11d858dd454384435c5451c2b3 Mon Sep 17 00:00:00 2001 From: Farouk Date: Tue, 24 Sep 2024 11:20:39 +0200 Subject: [PATCH 60/66] up --- parameters/pytorch/pyg-clic-ttbar-21.yaml | 2 +- parameters/pytorch/pyg-clic-ttbar-22.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/parameters/pytorch/pyg-clic-ttbar-21.yaml b/parameters/pytorch/pyg-clic-ttbar-21.yaml index 3d0f9cc64..376f2b461 100644 --- a/parameters/pytorch/pyg-clic-ttbar-21.yaml +++ b/parameters/pytorch/pyg-clic-ttbar-21.yaml @@ -1,6 +1,6 @@ backend: pytorch -standardize_inputs: True +standardize_input: True save_attention: yes dataset: clic sort_data: no diff --git a/parameters/pytorch/pyg-clic-ttbar-22.yaml b/parameters/pytorch/pyg-clic-ttbar-22.yaml index 3f920fe2b..0fc73c684 100644 --- a/parameters/pytorch/pyg-clic-ttbar-22.yaml +++ b/parameters/pytorch/pyg-clic-ttbar-22.yaml @@ -1,6 +1,6 @@ backend: pytorch -standardize_inputs: True +standardize_input: True save_attention: yes dataset: clic sort_data: no From 1aa21a93d9a268f039636f811210f06a38d9b5e3 Mon Sep 17 00:00:00 2001 From: Farouk Date: Tue, 24 Sep 2024 11:32:54 +0200 Subject: [PATCH 61/66] up configs --- .../pytorch/pyg-clic-ttbar-21-joint.yaml | 123 ++++++++++++++++++ .../pytorch/pyg-clic-ttbar-22-joint.yaml | 123 ++++++++++++++++++ 2 files changed, 246 insertions(+) create mode 100644 
parameters/pytorch/pyg-clic-ttbar-21-joint.yaml create mode 100644 parameters/pytorch/pyg-clic-ttbar-22-joint.yaml diff --git a/parameters/pytorch/pyg-clic-ttbar-21-joint.yaml b/parameters/pytorch/pyg-clic-ttbar-21-joint.yaml new file mode 100644 index 000000000..915c6cb91 --- /dev/null +++ b/parameters/pytorch/pyg-clic-ttbar-21-joint.yaml @@ -0,0 +1,123 @@ +backend: pytorch + +standardize_input: True +save_attention: yes +dataset: clic +sort_data: no +data_dir: +gpus: 1 +gpu_batch_multiplier: 1 +load: +num_epochs: 100 +patience: 20 +lr: 0.0001 +lr_schedule: cosinedecay # constant, cosinedecay, onecycle +conv_type: attention # gnn_lsh, attention, mamba, flashattention +ntrain: +ntest: +nvalid: +num_workers: 0 +prefetch_factor: +checkpoint_freq: +comet_name: particleflow-pt +comet_offline: False +comet_step_freq: 100 +dtype: float32 +val_freq: # run an extra validation run every val_freq training steps + +model: + trainable: all + learned_representation_mode: last #last, concat + input_encoding: joint #split, joint + pt_mode: direct-elemtype-split + eta_mode: linear + sin_phi_mode: linear + cos_phi_mode: linear + energy_mode: direct-elemtype-split + + gnn_lsh: + conv_type: gnn_lsh + embedding_dim: 512 + width: 512 + num_convs: 8 + activation: "elu" + # gnn-lsh specific parameters + bin_size: 32 + max_num_bins: 200 + distance_dim: 128 + layernorm: True + num_node_messages: 2 + ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 2 + + attention: + conv_type: attention + num_convs: 4 + dropout_ff: 0.0 + dropout_conv_id_mha: 0.0 + dropout_conv_id_ff: 0.0 + dropout_conv_reg_mha: 0.0 + dropout_conv_reg_ff: 0.0 + activation: "gelu" + head_dim: 32 + num_heads: 32 + attention_type: math + use_pre_layernorm: True + + mamba: + conv_type: mamba + embedding_dim: 128 + width: 128 + num_convs: 2 + dropout: 0.0 + activation: "elu" + # transformer specific paramters + num_heads: 2 + # mamba specific paramters + d_state: 16 + d_conv: 4 + expand: 2 + +lr_schedule_config: + onecycle: + pct_start: 0.3 + +raytune: + local_dir: # Note: please specify an absolute path + sched: # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + +valid_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + +test_dataset: + clic_edm_ttbar_pf: + version: 2.1.0 diff --git a/parameters/pytorch/pyg-clic-ttbar-22-joint.yaml b/parameters/pytorch/pyg-clic-ttbar-22-joint.yaml new file mode 100644 index 000000000..19c15e8d7 --- /dev/null +++ b/parameters/pytorch/pyg-clic-ttbar-22-joint.yaml @@ -0,0 +1,123 @@ +backend: pytorch + +standardize_input: True +save_attention: yes +dataset: clic +sort_data: no +data_dir: +gpus: 1 +gpu_batch_multiplier: 1 +load: +num_epochs: 100 +patience: 20 +lr: 0.0001 +lr_schedule: cosinedecay # constant, cosinedecay, onecycle +conv_type: attention # gnn_lsh, attention, mamba, flashattention +ntrain: +ntest: +nvalid: +num_workers: 0 +prefetch_factor: +checkpoint_freq: +comet_name: particleflow-pt +comet_offline: False +comet_step_freq: 100 +dtype: float32 +val_freq: # run an extra validation run every val_freq training steps + +model: + trainable: all + 
learned_representation_mode: last #last, concat + input_encoding: joint #split, joint + pt_mode: direct-elemtype-split + eta_mode: linear + sin_phi_mode: linear + cos_phi_mode: linear + energy_mode: direct-elemtype-split + + gnn_lsh: + conv_type: gnn_lsh + embedding_dim: 512 + width: 512 + num_convs: 8 + activation: "elu" + # gnn-lsh specific parameters + bin_size: 32 + max_num_bins: 200 + distance_dim: 128 + layernorm: True + num_node_messages: 2 + ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 2 + + attention: + conv_type: attention + num_convs: 4 + dropout_ff: 0.0 + dropout_conv_id_mha: 0.0 + dropout_conv_id_ff: 0.0 + dropout_conv_reg_mha: 0.0 + dropout_conv_reg_ff: 0.0 + activation: "gelu" + head_dim: 32 + num_heads: 32 + attention_type: math + use_pre_layernorm: True + + mamba: + conv_type: mamba + embedding_dim: 128 + width: 128 + num_convs: 2 + dropout: 0.0 + activation: "elu" + # transformer specific paramters + num_heads: 2 + # mamba specific paramters + d_state: 16 + d_conv: 4 + expand: 2 + +lr_schedule_config: + onecycle: + pct_start: 0.3 + +raytune: + local_dir: # Note: please specify an absolute path + sched: # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.2.0 + +valid_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.2.0 + +test_dataset: + clic_edm_ttbar_pf: + version: 2.2.0 From 832df1ca159934314a366a7c977191b60948753c Mon Sep 17 00:00:00 2001 From: Farouk Date: Tue, 24 Sep 2024 11:54:56 +0200 Subject: [PATCH 62/66] try new loss --- mlpf/pyg/training.py | 102 ++++++++++++++++++++++++++++--------------- 1 file changed, 68 insertions(+), 34 deletions(-) diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index c526fcb0e..79b0b71c9 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -73,7 +73,7 @@ def sliced_wasserstein_loss(y_pred, y_true, num_projections=200): return ret -def mlpf_loss(y, ypred, batch): +def mlpf_loss(y, ypred, batch, epoch): """ Args y [dict]: relevant keys are "cls_id, momentum, charge" @@ -97,7 +97,9 @@ def mlpf_loss(y, ypred, batch): # binary loss for particle / no-particle classification # loss_binary_classification = loss_obj_id(ypred["cls_binary"], (y["cls_id"] != 0).long()).reshape(y["cls_id"].shape) - loss_binary_classification = 10 * torch.nn.functional.cross_entropy(ypred["cls_binary"], (y["cls_id"] != 0).long(), reduction="none") + loss_binary_classification = 10 * torch.nn.functional.cross_entropy( + ypred["cls_binary"], (y["cls_id"] != 0).long(), reduction="none" + ) # compare the particle type, only for cases where there was a true particle loss_pid_classification = loss_obj_id(ypred["cls_id_onehot"], y["cls_id"]).reshape(y["cls_id"].shape) @@ -145,27 +147,31 @@ def mlpf_loss(y, ypred, batch): pred_met = torch.sqrt(torch.sum(pred_px, axis=-2) ** 2 + torch.sum(pred_py, axis=-2) ** 2) loss["MET"] = torch.nn.functional.huber_loss(pred_met.squeeze(dim=-1), batch.genmet).mean() - was_input_pred = torch.concat([torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1) * batch.mask.unsqueeze( - axis=-1 - ) - was_input_true = 
torch.concat([torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long)), y["momentum"]], axis=-1) * batch.mask.unsqueeze( - axis=-1 - ) + was_input_pred = torch.concat( + [torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1 + ) * batch.mask.unsqueeze(axis=-1) + was_input_true = torch.concat( + [torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long)), y["momentum"]], axis=-1 + ) * batch.mask.unsqueeze(axis=-1) # standardize Wasserstein loss std = was_input_true[batch.mask].std(axis=0) loss["Sliced_Wasserstein_Loss"] = sliced_wasserstein_loss(was_input_pred / std, was_input_true / std).mean() - # this is the final loss to be optimized - loss["Total"] = ( - loss["Classification_binary"] - + loss["Classification"] - + loss["Regression_pt"] - + loss["Regression_eta"] - + loss["Regression_sin_phi"] - + loss["Regression_cos_phi"] - + loss["Regression_energy"] - ) + loss["Total"] = loss["Classification_binary"] + + if epoch >= 2: + loss["Total"] += loss["Classification"] + + elif epoch >= 4: + # this is the final loss to be optimized + loss["Total"] = ( + +loss["Regression_pt"] + + loss["Regression_eta"] + + loss["Regression_sin_phi"] + + loss["Regression_cos_phi"] + + loss["Regression_energy"] + ) # store these separately but detached loss["Classification_binary"] = loss["Classification_binary"].detach() @@ -192,7 +198,9 @@ class FocalLoss(nn.Module): - y: (batch_size,) or (batch_size, d1, d2, ..., dK), K > 0. """ - def __init__(self, alpha: Optional[Tensor] = None, gamma: float = 0.0, reduction: str = "mean", ignore_index: int = -100): + def __init__( + self, alpha: Optional[Tensor] = None, gamma: float = 0.0, reduction: str = "mean", ignore_index: int = -100 + ): """Constructor. Args: alpha (Tensor, optional): Weights for each class. Defaults to None. 
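The hunks above turn the single combined objective into an epoch-gated curriculum: the binary particle/no-particle term is always optimized, the PID classification term is switched on after a couple of epochs, and the momentum regression terms only later (the thresholds are retuned in the next two commits and the scheme is dropped again in the final one). A minimal sketch of such a cumulative schedule follows; the helper name staged_total_loss and the concrete thresholds are illustrative assumptions rather than code from the patch, and the gates are written as independent if statements so a later stage is not shadowed by an earlier one.

import torch

def staged_total_loss(loss: dict, epoch: int) -> torch.Tensor:
    # stage 0: always optimize the particle / no-particle binary classifier
    total = loss["Classification_binary"]
    # stage 1: after a few epochs, also optimize the particle-type classifier
    if epoch >= 2:
        total = total + loss["Classification"]
    # stage 2: later still, add the momentum regression terms on top
    if epoch >= 4:
        total = total + (
            loss["Regression_pt"]
            + loss["Regression_eta"]
            + loss["Regression_sin_phi"]
            + loss["Regression_cos_phi"]
            + loss["Regression_energy"]
        )
    return total

# hypothetical usage inside mlpf_loss: loss["Total"] = staged_total_loss(loss, epoch)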
@@ -382,18 +390,30 @@ def validation_plots(batch, ypred_raw, ygen, ypred, tensorboard_writer, epoch, o ratio = (ypred_raw[2][batch.mask][:, 1] / batch.ygen[batch.mask][:, 3])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("eta_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("sphi_target", torch.clamp(batch.ygen[batch.mask][:, 4], -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("sphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 2], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram( + "sphi_target", torch.clamp(batch.ygen[batch.mask][:, 4], -10, 10), global_step=epoch + ) + tensorboard_writer.add_histogram( + "sphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 2], -10, 10), global_step=epoch + ) ratio = (ypred_raw[2][batch.mask][:, 2] / batch.ygen[batch.mask][:, 4])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("sphi_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("cphi_target", torch.clamp(batch.ygen[batch.mask][:, 5], -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("cphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 3], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram( + "cphi_target", torch.clamp(batch.ygen[batch.mask][:, 5], -10, 10), global_step=epoch + ) + tensorboard_writer.add_histogram( + "cphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 3], -10, 10), global_step=epoch + ) ratio = (ypred_raw[2][batch.mask][:, 3] / batch.ygen[batch.mask][:, 5])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("cphi_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("energy_target", torch.clamp(batch.ygen[batch.mask][:, 6], -10, 10), global_step=epoch) - tensorboard_writer.add_histogram("energy_pred", torch.clamp(ypred_raw[2][batch.mask][:, 4], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram( + "energy_target", torch.clamp(batch.ygen[batch.mask][:, 6], -10, 10), global_step=epoch + ) + tensorboard_writer.add_histogram( + "energy_pred", torch.clamp(ypred_raw[2][batch.mask][:, 4], -10, 10), global_step=epoch + ) ratio = (ypred_raw[2][batch.mask][:, 4] / batch.ygen[batch.mask][:, 6])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("energy_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) @@ -457,7 +477,9 @@ def train_and_valid( if (world_size > 1) and (rank != 0): iterator = enumerate(data_loader) else: - iterator = tqdm.tqdm(enumerate(data_loader), total=len(data_loader), desc=f"Epoch {epoch} {train_or_valid} loop on rank={rank}") + iterator = tqdm.tqdm( + enumerate(data_loader), total=len(data_loader), desc=f"Epoch {epoch} {train_or_valid} loop on rank={rank}" + ) device_type = "cuda" if isinstance(rank, int) else "cpu" @@ -511,12 +533,12 @@ def train_and_valid( # validation_plots(batch, ypred_raw, ygen, ypred, tensorboard_writer, epoch, outdir) with torch.autocast(device_type=device_type, dtype=dtype, enabled=device_type == "cuda"): if is_train: - loss = mlpf_loss(ygen, ypred, batch) + loss = mlpf_loss(ygen, ypred, batch, epoch) for param in model.parameters(): param.grad = None else: with torch.no_grad(): - loss = mlpf_loss(ygen, ypred, batch) + loss = mlpf_loss(ygen, ypred, batch, epoch) if is_train: loss["Total"].backward() @@ -715,7 +737,9 @@ def train_mlpf( # training step, edit here to profile a specific epoch if epoch == -1: - with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], 
record_shapes=True, with_stack=True) as prof: + with profile( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, with_stack=True + ) as prof: with record_function("model_train"): losses_t = train_and_valid( rank, @@ -1017,7 +1041,9 @@ def run(rank, world_size, config, args, outdir, logfile): _logger.info(f"Model directory {outdir}", color="bold") if args.comet: - comet_experiment = create_comet_experiment(config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir) + comet_experiment = create_comet_experiment( + config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir + ) comet_experiment.set_name(f"rank_{rank}_{Path(outdir).name}") comet_experiment.log_parameter("run_id", Path(outdir).name) comet_experiment.log_parameter("world_size", world_size) @@ -1272,7 +1298,9 @@ def train_ray_trial(config, args, outdir=None): loaders = get_interleaved_dataloaders(world_size, rank, config, use_cuda, use_ray=True) if args.comet: - comet_experiment = create_comet_experiment(config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir) + comet_experiment = create_comet_experiment( + config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir + ) comet_experiment.set_name(f"world_rank_{world_rank}_{Path(outdir).name}") comet_experiment.log_parameter("run_id", Path(outdir).name) comet_experiment.log_parameter("world_size", world_size) @@ -1306,7 +1334,9 @@ def train_ray_trial(config, args, outdir=None): if args.resume_training: model, optimizer = load_checkpoint(checkpoint, model, optimizer) start_epoch = checkpoint["extra_state"]["epoch"] + 1 - lr_schedule = get_lr_schedule(config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch=start_epoch - 1) + lr_schedule = get_lr_schedule( + config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch=start_epoch - 1 + ) else: # start a new training with model weights loaded from a pre-trained model model = load_checkpoint(checkpoint, model) @@ -1470,7 +1500,9 @@ def run_hpo(config, args): if tune.Tuner.can_restore(str(expdir)): # resume unfinished HPO run - tuner = tune.Tuner.restore(str(expdir), trainable=trainer, resume_errored=True, restart_errored=False, resume_unfinished=True) + tuner = tune.Tuner.restore( + str(expdir), trainable=trainer, resume_errored=True, restart_errored=False, resume_unfinished=True + ) else: # start new HPO run search_space = {"train_loop_config": search_space} # the ray TorchTrainer only takes a single arg: train_loop_config @@ -1511,4 +1543,6 @@ def run_hpo(config, args): print(result_df.columns) logging.info("Total time of Tuner.fit(): {}".format(end - start)) - logging.info("Best hyperparameters found according to {} were: {}".format(config["raytune"]["default_metric"], best_config)) + logging.info( + "Best hyperparameters found according to {} were: {}".format(config["raytune"]["default_metric"], best_config) + ) From e336230d1b8a1485ec9875a03c7375dddb801239 Mon Sep 17 00:00:00 2001 From: Farouk Date: Tue, 24 Sep 2024 12:01:13 +0200 Subject: [PATCH 63/66] up --- mlpf/pyg/training.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index 79b0b71c9..8fb0863e8 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -160,10 +160,10 @@ def mlpf_loss(y, ypred, batch, epoch): loss["Total"] = loss["Classification_binary"] - if epoch >= 2: + if epoch >= 3: loss["Total"] += loss["Classification"] - elif epoch >= 4: + elif epoch >= 6: # this is the final 
loss to be optimized loss["Total"] = ( +loss["Regression_pt"] @@ -906,9 +906,10 @@ def train_mlpf( + losses_t["Regression_cos_phi"] + losses_t["Regression_energy"] ) + log_tot = losses_t["Classification"] + losses_t["Classification_binary"] + log_t _logger.info( - f"train: loss_total={losses_t['Total']:.4f} " + f"train: loss_total={log_tot:.4f} " + f"loss_clf={losses_t['Classification']:.4f} " + f"loss_clfbinary={losses_t['Classification_binary']:.4f} " + f"loss_reg={log_t:.4f} ", @@ -922,9 +923,10 @@ def train_mlpf( + losses_v["Regression_cos_phi"] + losses_v["Regression_energy"] ) + log_tot = losses_v["Classification"] + losses_v["Classification_binary"] + log_v _logger.info( - f"valid: loss_total={losses_v['Total']:.4f} " + f"valid: loss_total={log_tot:.4f} " + f"loss_clf={losses_v['Classification']:.4f} " + f"loss_clfbinary={losses_v['Classification_binary']:.4f} " + f"loss_reg={log_v:.4f} ", From 84957d88cd135e582dba567271fd6de7ce838a58 Mon Sep 17 00:00:00 2001 From: Farouk Date: Tue, 24 Sep 2024 12:24:14 +0200 Subject: [PATCH 64/66] up --- mlpf/pyg/training.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index 8fb0863e8..8a4dea1dc 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -158,15 +158,11 @@ def mlpf_loss(y, ypred, batch, epoch): std = was_input_true[batch.mask].std(axis=0) loss["Sliced_Wasserstein_Loss"] = sliced_wasserstein_loss(was_input_pred / std, was_input_true / std).mean() - loss["Total"] = loss["Classification_binary"] - - if epoch >= 3: - loss["Total"] += loss["Classification"] - - elif epoch >= 6: + loss["Total"] = loss["Classification_binary"] + loss["Classification"] + if epoch >= 5: # this is the final loss to be optimized - loss["Total"] = ( - +loss["Regression_pt"] + loss["Total"] += ( + loss["Regression_pt"] + loss["Regression_eta"] + loss["Regression_sin_phi"] + loss["Regression_cos_phi"] From 91c1fac67ff2528ea35750c2bea49bfdb58f2b72 Mon Sep 17 00:00:00 2001 From: Farouk Date: Tue, 24 Sep 2024 12:48:46 +0200 Subject: [PATCH 65/66] up --- mlpf/pyg/training.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index 8a4dea1dc..15ced511b 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -158,16 +158,16 @@ def mlpf_loss(y, ypred, batch, epoch): std = was_input_true[batch.mask].std(axis=0) loss["Sliced_Wasserstein_Loss"] = sliced_wasserstein_loss(was_input_pred / std, was_input_true / std).mean() - loss["Total"] = loss["Classification_binary"] + loss["Classification"] - if epoch >= 5: - # this is the final loss to be optimized - loss["Total"] += ( - loss["Regression_pt"] - + loss["Regression_eta"] - + loss["Regression_sin_phi"] - + loss["Regression_cos_phi"] - + loss["Regression_energy"] - ) + # this is the final loss to be optimized + loss["Total"] = ( + loss["Classification_binary"] + + loss["Classification"] + + loss["Regression_pt"] + + loss["Regression_eta"] + + loss["Regression_sin_phi"] + + loss["Regression_cos_phi"] + + loss["Regression_energy"] + ) # store these separately but detached loss["Classification_binary"] = loss["Classification_binary"].detach() From cc523e0fbdd0275665a117d2190407bfca21aff7 Mon Sep 17 00:00:00 2001 From: Farouk Date: Tue, 24 Sep 2024 15:48:14 +0200 Subject: [PATCH 66/66] fix pca --- mlpf/pyg/training.py | 72 +++++++++++++------------------------------- 1 file changed, 21 insertions(+), 51 deletions(-) diff --git a/mlpf/pyg/training.py 
b/mlpf/pyg/training.py index 15ced511b..2e1432125 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -97,9 +97,7 @@ def mlpf_loss(y, ypred, batch, epoch): # binary loss for particle / no-particle classification # loss_binary_classification = loss_obj_id(ypred["cls_binary"], (y["cls_id"] != 0).long()).reshape(y["cls_id"].shape) - loss_binary_classification = 10 * torch.nn.functional.cross_entropy( - ypred["cls_binary"], (y["cls_id"] != 0).long(), reduction="none" - ) + loss_binary_classification = 10 * torch.nn.functional.cross_entropy(ypred["cls_binary"], (y["cls_id"] != 0).long(), reduction="none") # compare the particle type, only for cases where there was a true particle loss_pid_classification = loss_obj_id(ypred["cls_id_onehot"], y["cls_id"]).reshape(y["cls_id"].shape) @@ -147,12 +145,12 @@ def mlpf_loss(y, ypred, batch, epoch): pred_met = torch.sqrt(torch.sum(pred_px, axis=-2) ** 2 + torch.sum(pred_py, axis=-2) ** 2) loss["MET"] = torch.nn.functional.huber_loss(pred_met.squeeze(dim=-1), batch.genmet).mean() - was_input_pred = torch.concat( - [torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1 - ) * batch.mask.unsqueeze(axis=-1) - was_input_true = torch.concat( - [torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long)), y["momentum"]], axis=-1 - ) * batch.mask.unsqueeze(axis=-1) + was_input_pred = torch.concat([torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1) * batch.mask.unsqueeze( + axis=-1 + ) + was_input_true = torch.concat([torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long)), y["momentum"]], axis=-1) * batch.mask.unsqueeze( + axis=-1 + ) # standardize Wasserstein loss std = was_input_true[batch.mask].std(axis=0) @@ -194,9 +192,7 @@ class FocalLoss(nn.Module): - y: (batch_size,) or (batch_size, d1, d2, ..., dK), K > 0. """ - def __init__( - self, alpha: Optional[Tensor] = None, gamma: float = 0.0, reduction: str = "mean", ignore_index: int = -100 - ): + def __init__(self, alpha: Optional[Tensor] = None, gamma: float = 0.0, reduction: str = "mean", ignore_index: int = -100): """Constructor. Args: alpha (Tensor, optional): Weights for each class. Defaults to None. 
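The Sliced_Wasserstein_Loss term in the hunk above compares the full predicted and target particle sets (softmaxed class probabilities concatenated with the regressed momentum features), after dividing both by the per-feature standard deviation of the targets so that no single feature dominates the random projections. Its implementation is not part of this diff; the sketch below is one standard formulation written under that assumption, keeping only the num_projections=200 default that is visible in the function signature earlier in the file.

import torch

def sliced_wasserstein_loss(y_pred: torch.Tensor, y_true: torch.Tensor, num_projections: int = 200) -> torch.Tensor:
    # y_pred, y_true: (batch, num_particles, num_features)
    num_features = y_true.shape[-1]
    # random unit directions in feature space
    proj = torch.randn(num_features, num_projections, dtype=y_true.dtype, device=y_true.device)
    proj = proj / proj.norm(dim=0, keepdim=True)
    # project both particle sets: (batch, num_particles, num_projections)
    pred_proj = y_pred @ proj
    true_proj = y_true @ proj
    # 1D Wasserstein distance per projection, approximated by sorting along the particle axis
    pred_sorted, _ = torch.sort(pred_proj, dim=1)
    true_sorted, _ = torch.sort(true_proj, dim=1)
    # mean over particles and projections, one value per event (the caller takes .mean())
    return torch.mean((pred_sorted - true_sorted) ** 2, dim=(1, 2))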
@@ -386,30 +382,18 @@ def validation_plots(batch, ypred_raw, ygen, ypred, tensorboard_writer, epoch, o ratio = (ypred_raw[2][batch.mask][:, 1] / batch.ygen[batch.mask][:, 3])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("eta_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram( - "sphi_target", torch.clamp(batch.ygen[batch.mask][:, 4], -10, 10), global_step=epoch - ) - tensorboard_writer.add_histogram( - "sphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 2], -10, 10), global_step=epoch - ) + tensorboard_writer.add_histogram("sphi_target", torch.clamp(batch.ygen[batch.mask][:, 4], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram("sphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 2], -10, 10), global_step=epoch) ratio = (ypred_raw[2][batch.mask][:, 2] / batch.ygen[batch.mask][:, 4])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("sphi_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram( - "cphi_target", torch.clamp(batch.ygen[batch.mask][:, 5], -10, 10), global_step=epoch - ) - tensorboard_writer.add_histogram( - "cphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 3], -10, 10), global_step=epoch - ) + tensorboard_writer.add_histogram("cphi_target", torch.clamp(batch.ygen[batch.mask][:, 5], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram("cphi_pred", torch.clamp(ypred_raw[2][batch.mask][:, 3], -10, 10), global_step=epoch) ratio = (ypred_raw[2][batch.mask][:, 3] / batch.ygen[batch.mask][:, 5])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("cphi_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) - tensorboard_writer.add_histogram( - "energy_target", torch.clamp(batch.ygen[batch.mask][:, 6], -10, 10), global_step=epoch - ) - tensorboard_writer.add_histogram( - "energy_pred", torch.clamp(ypred_raw[2][batch.mask][:, 4], -10, 10), global_step=epoch - ) + tensorboard_writer.add_histogram("energy_target", torch.clamp(batch.ygen[batch.mask][:, 6], -10, 10), global_step=epoch) + tensorboard_writer.add_histogram("energy_pred", torch.clamp(ypred_raw[2][batch.mask][:, 4], -10, 10), global_step=epoch) ratio = (ypred_raw[2][batch.mask][:, 4] / batch.ygen[batch.mask][:, 6])[batch.ygen[batch.mask][:, 0] != 0] tensorboard_writer.add_histogram("energy_ratio", torch.clamp(ratio, -10, 10), global_step=epoch) @@ -473,9 +457,7 @@ def train_and_valid( if (world_size > 1) and (rank != 0): iterator = enumerate(data_loader) else: - iterator = tqdm.tqdm( - enumerate(data_loader), total=len(data_loader), desc=f"Epoch {epoch} {train_or_valid} loop on rank={rank}" - ) + iterator = tqdm.tqdm(enumerate(data_loader), total=len(data_loader), desc=f"Epoch {epoch} {train_or_valid} loop on rank={rank}") device_type = "cuda" if isinstance(rank, int) else "cpu" @@ -733,9 +715,7 @@ def train_mlpf( # training step, edit here to profile a specific epoch if epoch == -1: - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, with_stack=True - ) as prof: + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, with_stack=True) as prof: with record_function("model_train"): losses_t = train_and_valid( rank, @@ -1039,9 +1019,7 @@ def run(rank, world_size, config, args, outdir, logfile): _logger.info(f"Model directory {outdir}", color="bold") if args.comet: - comet_experiment = create_comet_experiment( - config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir - ) + 
comet_experiment = create_comet_experiment(config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir) comet_experiment.set_name(f"rank_{rank}_{Path(outdir).name}") comet_experiment.log_parameter("run_id", Path(outdir).name) comet_experiment.log_parameter("world_size", world_size) @@ -1296,9 +1274,7 @@ def train_ray_trial(config, args, outdir=None): loaders = get_interleaved_dataloaders(world_size, rank, config, use_cuda, use_ray=True) if args.comet: - comet_experiment = create_comet_experiment( - config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir - ) + comet_experiment = create_comet_experiment(config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir) comet_experiment.set_name(f"world_rank_{world_rank}_{Path(outdir).name}") comet_experiment.log_parameter("run_id", Path(outdir).name) comet_experiment.log_parameter("world_size", world_size) @@ -1332,9 +1308,7 @@ def train_ray_trial(config, args, outdir=None): if args.resume_training: model, optimizer = load_checkpoint(checkpoint, model, optimizer) start_epoch = checkpoint["extra_state"]["epoch"] + 1 - lr_schedule = get_lr_schedule( - config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch=start_epoch - 1 - ) + lr_schedule = get_lr_schedule(config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch=start_epoch - 1) else: # start a new training with model weights loaded from a pre-trained model model = load_checkpoint(checkpoint, model) @@ -1498,9 +1472,7 @@ def run_hpo(config, args): if tune.Tuner.can_restore(str(expdir)): # resume unfinished HPO run - tuner = tune.Tuner.restore( - str(expdir), trainable=trainer, resume_errored=True, restart_errored=False, resume_unfinished=True - ) + tuner = tune.Tuner.restore(str(expdir), trainable=trainer, resume_errored=True, restart_errored=False, resume_unfinished=True) else: # start new HPO run search_space = {"train_loop_config": search_space} # the ray TorchTrainer only takes a single arg: train_loop_config @@ -1541,6 +1513,4 @@ def run_hpo(config, args): print(result_df.columns) logging.info("Total time of Tuner.fit(): {}".format(end - start)) - logging.info( - "Best hyperparameters found according to {} were: {}".format(config["raytune"]["default_metric"], best_config) - ) + logging.info("Best hyperparameters found according to {} were: {}".format(config["raytune"]["default_metric"], best_config))
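To close the series, a stand-alone sketch of the cosinedecay schedule that these configs select (lr_schedule: cosinedecay, lr: 0.0001) and that the resume path above rebuilds with an explicit last_epoch. The toy model, the optimizer choice and the step counts are assumptions for illustration; only the scheduler arguments (T_max = steps_per_epoch * epochs, eta_min = 0.1 * lr, one step per optimizer step) follow the get_lr_schedule hunk at the top of this section.

import torch
from torch.optim.lr_scheduler import CosineAnnealingLR

model = torch.nn.Linear(16, 4)      # stand-in for the MLPF model (assumption)
base_lr = 1e-4                      # lr: 0.0001 in the yaml configs
opt = torch.optim.AdamW(model.parameters(), lr=base_lr)

steps_per_epoch, num_epochs = 100, 100
total_steps = steps_per_epoch * num_epochs

# fresh training: decay the LR from base_lr down to 0.1 * base_lr over all steps
lr_schedule = CosineAnnealingLR(opt, T_max=total_steps, eta_min=base_lr * 0.1)

for step in range(steps_per_epoch):                 # one toy epoch
    loss = model(torch.randn(8, 16)).sum()
    loss.backward()
    opt.step()
    opt.zero_grad()
    lr_schedule.step()                              # stepped once per training step

# resuming from a checkpoint: CosineAnnealingLR only accepts an explicit last_epoch
# if 'initial_lr' is present in the optimizer param groups (or its state_dict is loaded)
start_epoch = 2
for group in opt.param_groups:
    group.setdefault("initial_lr", base_lr)
lr_schedule = CosineAnnealingLR(
    opt, T_max=total_steps, eta_min=base_lr * 0.1, last_epoch=steps_per_epoch * start_epoch - 1
)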