From 88e4875b91f94d468bfbfbee3f0ca942c21923fa Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Mon, 11 Nov 2024 14:25:49 +0200 Subject: [PATCH] datasets 2.5.0, CMS plotting code (#350) * add back some brems * generate 2.5.0 with fixed truth * split tfds * add ztt * clean multiparticlegun * cms plotting * fix splits * plotting * fix pid map * use correct yaml list * update datasets --- apptainer/python_base.txt | 63 ---- apptainer/python_tf.txt | 16 - apptainer/tf-2.13.0.singularity | 32 -- mlpf/data/clic/postprocessing_jobs.py | 2 +- mlpf/data/cms/README.md | 112 +----- mlpf/data/cms/genjob_nopu.sh | 2 +- mlpf/data/cms/genjob_pu55to75.sh | 4 +- mlpf/data/cms/plot_cms.py | 351 ++++++++++++++++++ mlpf/data/cms/postprocessing2.py | 272 +++++++------- mlpf/data/cms/postprocessing_jobs.py | 21 +- mlpf/data/cms/prepare_args.py | 30 +- mlpf/heptfds/clic_pf_edm4hep/qq.py | 19 +- mlpf/heptfds/clic_pf_edm4hep/single_gamma.py | 78 ---- mlpf/heptfds/clic_pf_edm4hep/single_kaon0L.py | 78 ---- mlpf/heptfds/clic_pf_edm4hep/single_pi.py | 78 ---- mlpf/heptfds/clic_pf_edm4hep/ttbar.py | 19 +- mlpf/heptfds/clic_pf_edm4hep/utils_edm.py | 48 ++- mlpf/heptfds/clic_pf_edm4hep/ww_fullhad.py | 19 +- mlpf/heptfds/clic_pf_edm4hep/z.py | 81 ---- mlpf/heptfds/clic_pf_edm4hep/zh.py | 84 ----- mlpf/heptfds/cms_pf/cms_utils.py | 54 ++- mlpf/heptfds/cms_pf/qcd.py | 13 +- mlpf/heptfds/cms_pf/qcd_nopu.py | 11 +- .../cms_pf/{vbf_nopu.py => singleele.py} | 31 +- mlpf/heptfds/cms_pf/ttbar.py | 16 +- mlpf/heptfds/cms_pf/ttbar_nopu.py | 11 +- .../cms_pf/{multiparticlegun.py => ztt.py} | 37 +- mlpf/heptfds/cms_pf/{vbf.py => ztt_nopu.py} | 31 +- mlpf/model/PFDataset.py | 39 +- mlpf/model/inference.py | 17 - mlpf/model/training.py | 14 +- mlpf/pipeline.py | 4 +- mlpf/plotting/plots_cms.py | 114 ------ notebooks/cms/cms-validate-onnx.ipynb | 39 +- .../cms/cms-validate-postprocessing.ipynb | 37 +- .../cms-validate-root-vs-postprocessing.ipynb | 278 ++++++++++++++ notebooks/cms/cms-validate-root.ipynb | 126 
+++++-- parameters/pytorch/pyg-clic.yaml | 29 +- parameters/pytorch/pyg-cms-nopu.yaml | 155 ++++++++ parameters/pytorch/pyg-cms-ttbar-nopu.yaml | 17 +- parameters/pytorch/pyg-cms.yaml | 58 ++- requirements.txt | 1 - scripts/generate_tfds.sh | 12 +- scripts/local_test_torch.sh | 9 +- scripts/lumi/pytorch-clic-8.sh | 2 +- scripts/lumi/pytorch-cms-8.sh | 8 +- .../a100-mig/pytorch-small-eval-clic.sh | 4 +- .../a100-mig/pytorch-small-eval-cms.sh | 9 +- scripts/tallinn/a100/pytorch.sh | 4 +- scripts/tallinn/copy_dataset_lxplus.sh | 7 + scripts/tallinn/generate_tfds.sh | 11 + scripts/tallinn/prepare_dataset_lxplus.sh | 42 +++ scripts/tallinn/submit_tfds.sh | 22 ++ 53 files changed, 1509 insertions(+), 1162 deletions(-) delete mode 100644 apptainer/python_base.txt delete mode 100644 apptainer/python_tf.txt delete mode 100644 apptainer/tf-2.13.0.singularity create mode 100644 mlpf/data/cms/plot_cms.py delete mode 100644 mlpf/heptfds/clic_pf_edm4hep/single_gamma.py delete mode 100644 mlpf/heptfds/clic_pf_edm4hep/single_kaon0L.py delete mode 100644 mlpf/heptfds/clic_pf_edm4hep/single_pi.py delete mode 100644 mlpf/heptfds/clic_pf_edm4hep/z.py delete mode 100644 mlpf/heptfds/clic_pf_edm4hep/zh.py rename mlpf/heptfds/cms_pf/{vbf_nopu.py => singleele.py} (62%) rename mlpf/heptfds/cms_pf/{multiparticlegun.py => ztt.py} (56%) rename mlpf/heptfds/cms_pf/{vbf.py => ztt_nopu.py} (60%) delete mode 100644 mlpf/plotting/plots_cms.py create mode 100644 notebooks/cms/cms-validate-root-vs-postprocessing.ipynb create mode 100644 parameters/pytorch/pyg-cms-nopu.yaml create mode 100755 scripts/tallinn/copy_dataset_lxplus.sh create mode 100644 scripts/tallinn/generate_tfds.sh create mode 100755 scripts/tallinn/prepare_dataset_lxplus.sh create mode 100755 scripts/tallinn/submit_tfds.sh diff --git a/apptainer/python_base.txt b/apptainer/python_base.txt deleted file mode 100644 index 6984a5bb6..000000000 --- a/apptainer/python_base.txt +++ /dev/null @@ -1,63 +0,0 @@ -POT -PyYAML -astropy -awkward 
-awkward0 -black -bokeh -boost-histogram -corner -cupy-cuda11x -dask -distributed -docopt -emcee -energyflow -fastjet -fastparquet -flake8 -girder-client -hdbscan -healpy -hydra-core -imageio -imageio-ffmpeg -ipyparallel -isort -jupyter -jupyterlab -kaleido -line_profiler -lmfit -lz4 -matplotlib -memory_profiler -mpl_scatter_density -mplhep -networkx -notebook -numba -numpy -pandas -papermill -parsl -particle -plotly -pre-commit -pyarrow -pydot -pygraphviz -pyhf -pymultinest -pynbody -pytest -scikit-learn -scipy -seaborn -tables -tensorboard -tqdm -uproot -vector -xxhash -zenodo_get diff --git a/apptainer/python_tf.txt b/apptainer/python_tf.txt deleted file mode 100644 index 8bbf178af..000000000 --- a/apptainer/python_tf.txt +++ /dev/null @@ -1,16 +0,0 @@ -comet-ml -datasets -keras-flops -keras-tuner -keras_core -onnx -onnxruntime -ray[default] -ray[tune] -tensorboard_plugin_profile -tensorflow-datasets -tensorflow-estimator -tensorflow-model-optimization -tensorflow-text -tf2onnx -transformers diff --git a/apptainer/tf-2.13.0.singularity b/apptainer/tf-2.13.0.singularity deleted file mode 100644 index 07b724439..000000000 --- a/apptainer/tf-2.13.0.singularity +++ /dev/null @@ -1,32 +0,0 @@ -Bootstrap: docker - -From: tensorflow/tensorflow:2.13.0-gpu-jupyter - -%files - specs/python_base.txt /opt/python_base.txt - specs/python_tf.txt /opt/python_tf.txt - -%post - apt update -y --fix-missing - DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt install -y tzdata - apt install -y make cmake parallel gcc g++ gfortran binutils - apt install -y libblas3 libblas-dev liblapack3 liblapack-dev libatlas3-base libatlas-base-dev - apt install -y libtcmalloc-minimal4 - apt install -y graphviz graphviz-dev - apt install -y git - apt install -y wget - - python3 -m pip install --upgrade pip - python3 -m pip install -r /opt/python_base.txt - python3 -m pip install -r /opt/python_tf.txt - python3 -m pip install hls4ml[profiling] - HOROVOD_WITH_TENSORFLOW=1 python3 -m pip install 
horovod[tensorflow,keras] - python3 -m pip install open3d-cpu - -%environment - export PIP_DEFAULT_TIMEOUT=500 - export LC_ALL="" - export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu - -%runscript - /bin/bash diff --git a/mlpf/data/clic/postprocessing_jobs.py b/mlpf/data/clic/postprocessing_jobs.py index 757c51c35..107ffe6a7 100644 --- a/mlpf/data/clic/postprocessing_jobs.py +++ b/mlpf/data/clic/postprocessing_jobs.py @@ -20,7 +20,7 @@ def write_script(infiles, outpath): for inf in infiles: s += [ "singularity exec -B /local /home/software/singularity/pytorch.simg:2024-08-18 python3 " - + f"scripts/clic/postprocessing.py --input {inf} --outpath {outpath}" + + f"mlpf/data/clic/postprocessing.py --input {inf} --outpath {outpath}" ] ret = "\n".join(s) diff --git a/mlpf/data/cms/README.md b/mlpf/data/cms/README.md index 70122a9d9..09ef35d9b 100644 --- a/mlpf/data/cms/README.md +++ b/mlpf/data/cms/README.md @@ -1,111 +1 @@ -## Validation data - -``` -gfal-copy -r root://xrootd.hep.kbfi.ee:1094//store/user/jpata/mlpf/results/cms/CMSSW_14_1_0_pre3 ./ -``` - -See below for the steps to reproduce these samples. - -The resulting plots can be found at: -``` -https://jpata.web.cern.ch/jpata/mlpf/cms/results/acat2022_20221004_model40M_revalidation20240523/ -https://jpata.web.cern.ch/jpata/mlpf/cms/results/acat2022_20221004_model40M_revalidation_CMSSW14_20240527/ -``` - -## Code setup - -The following should work on lxplus. 
-``` -#ensure proxy is set -voms-proxy-init -voms cms -valid 192:00 -voms-proxy-info - -#Initialize EL8 -cmssw-el8 - -export SCRAM_ARCH=el8_amd64_gcc12 -cmsrel CMSSW_14_1_0_pre3 -cd CMSSW_14_1_0_pre3/src -cmsenv -git cms-init - -#set the directories we want to check out -echo "/Configuration/Generator/" >> .git/info/sparse-checkout -echo "/IOMC/ParticleGuns/" >> .git/info/sparse-checkout -echo "/RecoParticleFlow/PFProducer/" >> .git/info/sparse-checkout -echo "/Validation/RecoParticleFlow/" >> .git/info/sparse-checkout - -#checkout the CMSSW code -git remote add jpata https://github.com/jpata/cmssw.git -git fetch -a jpata -git checkout pfanalysis_caloparticle_CMSSW_14_1_0_pre3_acat2022 - -#compile -scram b -j4 - -#download the latest MLPF model -mkdir -p RecoParticleFlow/PFProducer/data/mlpf/ -wget https://huggingface.co/jpata/particleflow/resolve/main/cms/2022_10_04_gnnlsh_model40M_acat2022/dev.onnx?download=true -O RecoParticleFlow/PFProducer/data/mlpf/dev.onnx - -# must be b786aa6de49b51f703c87533a66326d6 -md5sum RecoParticleFlow/PFProducer/data/mlpf/dev.onnx -``` - -## Running MLPF in CMSSW - -### PF validation -To test MLPF on higher statistics, it's not practical to redo full reconstruction before the particle flow step. -We can follow a similar logic as the PF validation, where only the relevant PF sequences are rerun. 
- -We use the following datasets for this: -``` -/RelValQCD_FlatPt_15_3000HS_14/CMSSW_14_1_0_pre3-PU_140X_mcRun3_2024_realistic_v8_STD_2024_PU-v2/GEN-SIM-DIGI-RAW -/RelValTTbar_14TeV/CMSSW_14_1_0_pre3-PU_140X_mcRun3_2024_realistic_v8_STD_2024_PU-v2/GEN-SIM-DIGI-RAW -/RelValQQToHToTauTau_14TeV/CMSSW_14_1_0_pre3-PU_140X_mcRun3_2024_realistic_v8_STD_2024_PU-v2/GEN-SIM-DIGI-RAW -/RelValSingleEFlatPt2To100/CMSSW_14_1_0_pre3-PU_140X_mcRun3_2024_realistic_v8_STD_2024_PU-v2/GEN-SIM-DIGI-RAW -/RelValSingleGammaFlatPt8To150/CMSSW_14_1_0_pre3-PU_140X_mcRun3_2024_realistic_v8_STD_2024_PU-v2/GEN-SIM-DIGI-RAW -/RelValSinglePiFlatPt0p7To10/CMSSW_14_1_0_pre3-PU_140X_mcRun3_2024_realistic_v8_STD_2024_PU-v2/GEN-SIM-DIGI-RAW -``` - -#### MINIAOD with PF and MLPF -The PF validation workflows can be run using the scripts in -``` -cd particleflow - -#the number 1 signifies the row index (filename) in the input file to process -./scripts/cmssw/validation_job.sh mlpf scripts/cmssw/qcd_pu.txt QCD_PU 1 -./scripts/cmssw/validation_job.sh pf scripts/cmssw/qcd_pu.txt QCD_PU 1 -``` - -The MINIAOD output will be in `$CMSSW_BASE/out/QCD_PU_mlpf` and `$CMSSW_BASE/out/QCD_PU_pf`. 
- -## Generating MLPF training samples - -If you want to regenerate ML training samples from scratch with CMSSW, check the scripts -``` -mlpf/data_cms/genjob_nopu.sh -mlpf/data_cms/genjob_pu55to75.sh -``` - -## pytorch training - -Copy the datasets from EOS: -``` -rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/tensorflow_datasets/cms ./tensorflow_datasets -``` - -Download the pytorch distribution: -``` -wget https://jpata.web.cern.ch/jpata/pytorch.simg -``` - -On a machine with a single GPU, the following is a quick test of the training workflow -``` -singularity exec --env CUDA_VISIBLE_DEVICES=0 -B /scratch/persistent --nv \ - --env PYTHONPATH=`pwd` \ - --env KERAS_BACKEND=torch \ - pytorch.simg python3.10 mlpf/pipeline.py --dataset cms --gpus 1 \ - --data-dir ./tensorflow_datasets --config parameters/pytorch/pyg-cms.yaml \ - --train --test --make-plots --conv-type attention --num-epochs 10 --gpu-batch-multiplier 1 \ - --num-workers 4 --prefetch-factor 100 --checkpoint-freq 1 --ntrain 1000 --ntest 1000 --nvalid 1000 -``` +https://github.com/jpata/particleflow/wiki/CMS diff --git a/mlpf/data/cms/genjob_nopu.sh b/mlpf/data/cms/genjob_nopu.sh index 3277eea77..44e91bc84 100755 --- a/mlpf/data/cms/genjob_nopu.sh +++ b/mlpf/data/cms/genjob_nopu.sh @@ -80,7 +80,7 @@ cmsRun step3_phase1_new.py > /dev/null mv pfntuple.root pfntuple_${SEED}.root cp pfntuple_${SEED}.root $OUTDIR/$SAMPLE/root/ -# python3 ${MLPF_PATH}/mlpf/data_cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ +# python3 ${MLPF_PATH}/mlpf/data/cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ # bzip2 -z pfntuple_${SEED}.pkl # cp *.pkl.bz2 $OUTDIR/$SAMPLE/raw/ diff --git a/mlpf/data/cms/genjob_pu55to75.sh b/mlpf/data/cms/genjob_pu55to75.sh index ae3634133..5859c67a8 100755 --- a/mlpf/data/cms/genjob_pu55to75.sh +++ b/mlpf/data/cms/genjob_pu55to75.sh @@ -21,7 +21,7 @@ mkdir -p $WORKDIR mkdir -p $OUTDIR PILEUP=Run3_Flat55To75_PoissonOOTPU 
-PILEUP_INPUT=filelist:${MLPF_PATH}/mlpf/data_cms/pu_files_local.txt +PILEUP_INPUT=filelist:${MLPF_PATH}/mlpf/data/cms/pu_files_local.txt N=50 @@ -81,7 +81,7 @@ cmsRun step3_phase1_new.py > /dev/null mv pfntuple.root pfntuple_${SEED}.root cp pfntuple_${SEED}.root $OUTDIR/$SAMPLE/root/ -# python3 ${MLPF_PATH}/mlpf/data_cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ +# python3 ${MLPF_PATH}/mlpf/data/cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ # bzip2 -z pfntuple_${SEED}.pkl # cp *.pkl.bz2 $OUTDIR/$SAMPLE/raw/ diff --git a/mlpf/data/cms/plot_cms.py b/mlpf/data/cms/plot_cms.py new file mode 100644 index 000000000..8c90f2d77 --- /dev/null +++ b/mlpf/data/cms/plot_cms.py @@ -0,0 +1,351 @@ +import boost_histogram as bh +import glob +import pickle +import uproot +import awkward as ak +import numpy as np +import vector +import bz2 +import pandas +import os +import fastjet +import random +import time +import tqdm + +from mlpf import jet_utils + + +def chunks(lst, n): + """Yield successive n-sized chunks from lst.""" + for i in range(0, len(lst), n): + yield lst[i : i + n] + + +def load_tree(ttree): + particles_pythia = ttree.arrays(["gen_pt", "gen_eta", "gen_phi", "gen_energy", "gen_pdgid", "gen_status", "gen_daughters"]) + particles_cp = ttree.arrays(["caloparticle_pt", "caloparticle_eta", "caloparticle_phi", "caloparticle_energy", "caloparticle_pid"]) + genjet = ttree.arrays(["genjet_pt", "genjet_eta", "genjet_phi", "genjet_energy"]) + genmet = ttree.arrays(["genmet_pt"]) + return ak.Array({"pythia": particles_pythia, "cp": particles_cp, "genjet": genjet, "genmet": genmet}) + + +def sum_overflow_into_last_bin(all_values): + values = all_values[1:-1] + values[-1] = values[-1] + all_values[-1] + values[0] = values[0] + all_values[0] + return values + + +def to_bh(data, bins, cumulative=False): + h1 = bh.Histogram(bh.axis.Variable(bins)) + h1.fill(data) + if cumulative: + h1[:] = np.sum(h1.values()) - np.cumsum(h1) + h1[:] = 
sum_overflow_into_last_bin(h1.values(flow=True)[:]) + return h1 + + +def compute_met(pt, phi, mask=None): + if mask is None: + mask = np.ones_like(pt, dtype=bool) + met = np.sqrt(ak.sum(pt[mask] * np.sin(phi[mask]), axis=1) ** 2 + ak.sum(pt[mask] * np.cos(phi[mask]), axis=1) ** 2) + return met + + +def process_files(sample_folder, rootfiles, pklfiles, outfile): + pause = random.randint(100, 1000) / 1000.0 + time.sleep(pause) + + # check that the root and pkl file lists correspond to each other + if len(rootfiles) > 0: + assert len(rootfiles) == len(pklfiles) + for fn1, fn2 in zip(rootfiles, pklfiles): + assert os.path.basename(fn1).split(".")[0] == os.path.basename(fn2).split(".")[0] + + # load root files + tts = [load_tree(uproot.open(fn)["pfana/pftree"]) for fn in rootfiles] + tts = ak.concatenate(tts, axis=0) + particles_pythia = tts["pythia"] + particles_cp = tts["cp"] + + # load pkl files + pickle_data = sum( + [pickle.load(bz2.BZ2File(fn)) for fn in pklfiles], + [], + ) + + for i in range(len(pickle_data)): + for coll in ["ytarget", "ycand"]: + pickle_data[i][coll] = pandas.DataFrame(pickle_data[i][coll]) + pickle_data[i][coll]["phi"] = np.arctan2(pickle_data[i][coll]["sin_phi"], pickle_data[i][coll]["cos_phi"]) + + # get awkward and flat arrays from the data + arrs_awk = {} + arrs_flat = {} + + # tracks and clusters + for coll in ["Xelem"]: + arrs_awk[coll] = {} + arrs_flat[coll] = {} + for feat in ["typ", "pt", "eta", "phi", "energy"]: + arr = [np.array(p[coll][feat][p[coll]["typ"] != 0]) for p in pickle_data] + arrs_awk[coll][feat] = ak.unflatten(ak.concatenate(arr), [len(a) for a in arr]) + arr = [np.array(p[coll][feat]) for p in pickle_data] + arrs_flat[coll][feat] = ak.unflatten(ak.concatenate(arr), [len(a) for a in arr]) + + # MLPF targets and PF reco + for coll in ["ytarget", "ycand"]: + arrs_awk[coll] = {} + arrs_flat[coll] = {} + for feat in ["pid", "pt", "eta", "phi", "energy", "ispu"]: + arr = [np.array(p[coll][feat][p[coll]["pid"] != 0]) for p in 
pickle_data] + arrs_awk[coll][feat] = ak.unflatten(ak.concatenate(arr), [len(a) for a in arr]) + arr = [np.array(p[coll][feat]) for p in pickle_data] + arrs_flat[coll][feat] = ak.unflatten(ak.concatenate(arr), [len(a) for a in arr]) + + # pythia generator level particles + arrs_awk["pythia"] = {} + arrs_awk["pythia"]["pid"] = ak.from_regular([np.array(p["pythia"][:, 0]) for p in pickle_data]) + arrs_awk["pythia"]["pt"] = ak.from_regular([np.array(p["pythia"][:, 1]) for p in pickle_data]) + arrs_awk["pythia"]["eta"] = ak.from_regular([np.array(p["pythia"][:, 2]) for p in pickle_data]) + arrs_awk["pythia"]["phi"] = ak.from_regular([np.array(p["pythia"][:, 3]) for p in pickle_data]) + arrs_awk["pythia"]["energy"] = ak.from_regular([np.array(p["pythia"][:, 4]) for p in pickle_data]) + + pu_mask = arrs_awk["ytarget"]["ispu"] < 0.5 + if len(rootfiles) > 0: + abs_pid = np.abs(particles_pythia["gen_pdgid"]) + mask_pythia_nonu = ( + (particles_pythia["gen_status"] == 1) + & (abs_pid != 12) + & (abs_pid != 14) + & (abs_pid != 16) # | + # ((particles_pythia["gen_status"]==2) & (ak.num(particles_pythia["gen_daughters"], axis=2) == 0)) + ) + mask_cp = np.abs(particles_cp["caloparticle_eta"]) < 5 + + # MET from MLPF targets and from PF particles + # ypythia_met = compute_met(particles_pythia["gen_pt"], particles_pythia["gen_phi"], mask_pythia_nonu) + # ycaloparticle_met = compute_met(particles_cp["caloparticle_pt"], particles_cp["caloparticle_phi"], mask_cp) + + # dummy mask + ytarget_met = compute_met(arrs_awk["ytarget"]["pt"], arrs_awk["ytarget"]["phi"]) + ytarget_nopu_met = compute_met(arrs_awk["ytarget"]["pt"], arrs_awk["ytarget"]["phi"], pu_mask) + ycand_met = compute_met(arrs_awk["ycand"]["pt"], arrs_awk["ycand"]["phi"]) + + # cluster jets + jets_coll = {} + jetdef = fastjet.JetDefinition(fastjet.antikt_algorithm, 0.4) + + # genMet, genJets from CMSSW (should be the same as computed from Pythia) + # genmet_cmssw = np.array([pickle_data[i]["genmet"][0, 0] for i in 
range(len(pickle_data))]) + genjet_cmssw = ak.from_regular([pickle_data[i]["genjet"] for i in range(len(pickle_data))]) + genmet_cmssw = ak.from_regular([pickle_data[i]["genmet"] for i in range(len(pickle_data))]) + genmet_cmssw = genmet_cmssw[:, 0, 0] + genjet_cmssw = vector.awk( + ak.zip( + { + "pt": genjet_cmssw[:, :, 0], + "eta": genjet_cmssw[:, :, 1], + "phi": genjet_cmssw[:, :, 2], + "energy": genjet_cmssw[:, :, 3], + } + ) + ) + jets_coll["genjet"] = genjet_cmssw + + if len(rootfiles) > 0: + vec = vector.awk( + ak.zip( + { + "pt": particles_pythia[mask_pythia_nonu]["gen_pt"], + "eta": particles_pythia[mask_pythia_nonu]["gen_eta"], + "phi": particles_pythia[mask_pythia_nonu]["gen_phi"], + "energy": particles_pythia[mask_pythia_nonu]["gen_energy"], + } + ) + ) + cluster = fastjet.ClusterSequence(vec.to_xyzt(), jetdef) + jets_coll["pythia_nonu"] = cluster.inclusive_jets(min_pt=3) + + vec = vector.awk( + ak.zip( + { + "pt": particles_cp[mask_cp]["caloparticle_pt"], + "eta": particles_cp[mask_cp]["caloparticle_eta"], + "phi": particles_cp[mask_cp]["caloparticle_phi"], + "energy": particles_cp[mask_cp]["caloparticle_energy"], + } + ) + ) + cluster = fastjet.ClusterSequence(vec.to_xyzt(), jetdef) + jets_coll["cp"] = cluster.inclusive_jets(min_pt=3) + + for coll in ["ytarget", "ycand"]: + vec = vector.awk( + ak.zip( + { + "pt": arrs_awk[coll]["pt"], + "eta": arrs_awk[coll]["eta"], + "phi": arrs_awk[coll]["phi"], + "energy": arrs_awk[coll]["energy"], + } + ) + ) + cluster = fastjet.ClusterSequence(vec.to_xyzt(), jetdef) + jets_coll[coll] = cluster.inclusive_jets(min_pt=3) + + vec = vector.awk( + ak.zip( + { + "pt": arrs_awk["ytarget"]["pt"][pu_mask], + "eta": arrs_awk["ytarget"]["eta"][pu_mask], + "phi": arrs_awk["ytarget"]["phi"][pu_mask], + "energy": arrs_awk["ytarget"]["energy"][pu_mask], + } + ) + ) + cluster = fastjet.ClusterSequence(vec.to_xyzt(), jetdef) + jets_coll["ytarget_nopu"] = cluster.inclusive_jets(min_pt=3) + + genjet_to_ytarget = 
jet_utils.match_two_jet_collections(jets_coll, "genjet", "ytarget", 0.1)
+    genjet_to_ytarget_nopu = jet_utils.match_two_jet_collections(jets_coll, "genjet", "ytarget_nopu", 0.1)
+    genjet_to_ycand = jet_utils.match_two_jet_collections(jets_coll, "genjet", "ycand", 0.1)
+
+    ret = {}
+
+    # particle pt distributions; pythia/caloparticle entries need the ROOT files, target/cand come from the pkl files
+    b = np.logspace(-4, 4, 100)
+    if len(rootfiles) > 0:
+        ret[f"{sample_folder}/particles_pt_pythia"] = to_bh(ak.flatten(particles_pythia[mask_pythia_nonu]["gen_pt"]), bins=b)
+        ret[f"{sample_folder}/particles_pt_caloparticle"] = to_bh(ak.flatten(particles_cp[mask_cp]["caloparticle_pt"]), bins=b)
+    ret[f"{sample_folder}/particles_pt_target"] = to_bh(ak.flatten(arrs_awk["ytarget"]["pt"]), bins=b)
+    ret[f"{sample_folder}/particles_pt_target_pumask"] = to_bh(ak.flatten(arrs_awk["ytarget"]["pt"][pu_mask]), bins=b)
+    ret[f"{sample_folder}/particles_pt_cand"] = to_bh(ak.flatten(arrs_awk["ycand"]["pt"]), bins=b)
+
+    # per-pid particle distribution: the pid list comes from the targets, each collection is selected with its own pid mask
+    pidset = np.unique(np.abs(ak.flatten(arrs_awk["ytarget"]["pid"])))
+    for pid in pidset:
+        pid = int(pid)
+        if len(rootfiles) > 0:
+            pidmask_pythia = np.abs(particles_pythia[mask_pythia_nonu]["gen_pdgid"]) == pid
+            ret[f"{sample_folder}/particle_{pid}_pt_pythia"] = to_bh(ak.flatten(particles_pythia[mask_pythia_nonu]["gen_pt"][pidmask_pythia]), bins=b)
+            pidmask_cp = np.abs(particles_cp[mask_cp]["caloparticle_pid"]) == pid
+            ret[f"{sample_folder}/particle_{pid}_pt_caloparticle"] = to_bh(ak.flatten(particles_cp[mask_cp]["caloparticle_pt"][pidmask_cp]), bins=b)
+        pidmask_ytarget = np.abs(arrs_awk["ytarget"]["pid"]) == pid
+        ret[f"{sample_folder}/particle_{pid}_pt_target"] = to_bh(ak.flatten(arrs_awk["ytarget"]["pt"][pidmask_ytarget]), bins=b)
+        ret[f"{sample_folder}/particle_{pid}_pt_target_pumask"] = to_bh(ak.flatten(arrs_awk["ytarget"]["pt"][pu_mask & pidmask_ytarget]), bins=b)
+        pidmask_ycand = np.abs(arrs_awk["ycand"]["pid"]) == pid
+        ret[f"{sample_folder}/particle_{pid}_pt_cand"] = to_bh(ak.flatten(arrs_awk["ycand"]["pt"][pidmask_ycand]), bins=b)
+
+    b = np.linspace(-5, 5, 100)
+    if len(rootfiles) > 0:
+        ret[f"{sample_folder}/particles_eta_pythia"] = to_bh(ak.flatten(particles_pythia[mask_pythia_nonu]["gen_eta"]), bins=b)
+        ret[f"{sample_folder}/particles_eta_caloparticle"] = to_bh(ak.flatten(particles_cp[mask_cp]["caloparticle_eta"]), bins=b)
+    ret[f"{sample_folder}/particles_eta_target"] = to_bh(ak.flatten(arrs_awk["ytarget"]["eta"]), bins=b)
+    ret[f"{sample_folder}/particles_eta_target_pumask"] = to_bh(ak.flatten(arrs_awk["ytarget"]["eta"][pu_mask]), bins=b)
+    ret[f"{sample_folder}/particles_eta_cand"] = to_bh(ak.flatten(arrs_awk["ycand"]["eta"]), bins=b)
+
+    # jet pt distribution
+    b = np.logspace(0, 4, 100)
+    ret[f"{sample_folder}/jets_pt_genjet"] = to_bh(ak.flatten(jets_coll["genjet"].pt), bins=b)
+    # ret[f"{sample_folder}/jets_pt_caloparticle"] = to_bh(ak.flatten(jets_coll["cp"].pt), bins=b)
+    ret[f"{sample_folder}/jets_pt_target"] = to_bh(ak.flatten(jets_coll["ytarget"].pt), bins=b)
+    ret[f"{sample_folder}/jets_pt_target_pumask"] = to_bh(ak.flatten(jets_coll["ytarget_nopu"].pt), bins=b)
+    ret[f"{sample_folder}/jets_pt_cand"] = to_bh(ak.flatten(jets_coll["ycand"].pt), bins=b)
+
+    b = np.linspace(-5, 5, 100)
+    ret[f"{sample_folder}/jets_eta_genjet"] = to_bh(ak.flatten(jets_coll["genjet"].eta), bins=b)
+    # ret[f"{sample_folder}/jets_eta_caloparticle"] = to_bh(ak.flatten(jets_coll["cp"].eta), bins=b)
+    ret[f"{sample_folder}/jets_eta_target"] = to_bh(ak.flatten(jets_coll["ytarget"].eta), bins=b)
+    ret[f"{sample_folder}/jets_eta_target_pumask"] = to_bh(ak.flatten(jets_coll["ytarget_nopu"].eta), bins=b)
+    ret[f"{sample_folder}/jets_eta_cand"] = to_bh(ak.flatten(jets_coll["ycand"].eta), bins=b)
+
+    # jet pt ratio
+    b = np.linspace(0, 5, 1000)
+    # ratio = ak.flatten((jets_coll["cp"][genjet_to_cp["cp"]].pt / jets_coll["genjet"][genjet_to_cp["genjet"]].pt))
+    # ret[f"{sample_folder}/jets_pt_ratio_caloparticle"] = to_bh(ratio, bins=b)
+    ratio = 
ak.flatten((jets_coll["ytarget"][genjet_to_ytarget["ytarget"]].pt / jets_coll["genjet"][genjet_to_ytarget["genjet"]].pt)) + ret[f"{sample_folder}/jets_pt_ratio_target"] = to_bh(ratio, bins=b) + ratio = ak.flatten( + (jets_coll["ytarget_nopu"][genjet_to_ytarget_nopu["ytarget_nopu"]].pt / jets_coll["genjet"][genjet_to_ytarget_nopu["genjet"]].pt) + ) + ret[f"{sample_folder}/jets_pt_ratio_target_pumask"] = to_bh(ratio, bins=b) + ratio = ak.flatten((jets_coll["ycand"][genjet_to_ycand["ycand"]].pt / jets_coll["genjet"][genjet_to_ycand["genjet"]].pt)) + ret[f"{sample_folder}/jets_pt_ratio_cand"] = to_bh(ratio, bins=b) + + b = np.linspace(0.5, 1.5, 1000) + # ratio = ak.flatten((jets_coll["cp"][genjet_to_cp["cp"]].pt / jets_coll["genjet"][genjet_to_cp["genjet"]].pt)) + # ret[f"{sample_folder}/jets_pt_ratio2_caloparticle"] = to_bh(ratio, bins=b) + ratio = ak.flatten((jets_coll["ytarget"][genjet_to_ytarget["ytarget"]].pt / jets_coll["genjet"][genjet_to_ytarget["genjet"]].pt)) + ret[f"{sample_folder}/jets_pt_ratio2_target"] = to_bh(ratio, bins=b) + ratio = ak.flatten( + (jets_coll["ytarget_nopu"][genjet_to_ytarget_nopu["ytarget_nopu"]].pt / jets_coll["genjet"][genjet_to_ytarget_nopu["genjet"]].pt) + ) + ret[f"{sample_folder}/jets_pt_ratio2_target_pumask"] = to_bh(ratio, bins=b) + ratio = ak.flatten((jets_coll["ycand"][genjet_to_ycand["ycand"]].pt / jets_coll["genjet"][genjet_to_ycand["genjet"]].pt)) + ret[f"{sample_folder}/jets_pt_ratio2_cand"] = to_bh(ratio, bins=b) + + b = np.logspace(-1, 3, 100) + ret[f"{sample_folder}/met_pythia"] = to_bh(genmet_cmssw, bins=b) + ret[f"{sample_folder}/met_target"] = to_bh(ytarget_met, bins=b) + ret[f"{sample_folder}/met_target_pumask"] = to_bh(ytarget_nopu_met, bins=b) + ret[f"{sample_folder}/met_cand"] = to_bh(ycand_met, bins=b) + + # print output + # for k in sorted(ret.keys()): + # print(k, ret[k].__class__.__name__) + + # save output + with open(outfile, "wb") as handle: + pickle.dump(ret, handle, protocol=pickle.HIGHEST_PROTOCOL) + + 
+if __name__ == "__main__":
+
+    perjob = 50
+    numjobs = 16
+    is_test = False
+
+    args = []
+    ijob = 0
+
+    # process pkl and ROOT files together (at most maxfiles matched pairs per sample)
+    maxfiles = 500
+    path = "/local/joosep/mlpf/cms/"
+    for pu_config in ["nopu", "pu55to75"]:
+        for sample_folder in ["QCDForPF_14TeV_TuneCUETP8M1_cfi", "TTbar_14TeV_TuneCUETP8M1_cfi", "ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi"]:
+            rootfiles = sorted(glob.glob(f"{path}/20240823_simcluster/{pu_config}/{sample_folder}/root/pfntuple_*.root"))
+            pklfiles = sorted(glob.glob(f"{path}/20240823_simcluster/{pu_config}/{sample_folder}/raw/pfntuple_*.pkl.bz2"))
+
+            rootfiles_d = {fn.split("/")[-1].split(".")[0]: fn for fn in rootfiles}
+            pklfiles_d = {fn.split("/")[-1].split(".")[0]: fn for fn in pklfiles}
+
+            # find the set of common filenames between the root and pkl files
+            common_keys = sorted(rootfiles_d.keys() & pklfiles_d.keys())[:maxfiles]
+
+            # prepare chunked arguments for process_files
+            for ck in chunks(common_keys, perjob):
+                args.append(
+                    (f"combined/{pu_config}/{sample_folder}", [rootfiles_d[c] for c in ck], [pklfiles_d[c] for c in ck], "out{}.pkl".format(ijob))
+                )
+                ijob += 1
+
+    # process only pkl files; maxfiles=None means no limit (a slice bound of -1 would silently drop the last file)
+    maxfiles = None
+    path = "/scratch/persistent/joosep/"
+    for pu_config in ["nopu", "pu55to75"]:
+        for sample_folder in ["QCDForPF_14TeV_TuneCUETP8M1_cfi", "TTbar_14TeV_TuneCUETP8M1_cfi", "ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi"]:
+            pklfiles = sorted(glob.glob(f"{path}/20240823_simcluster/{pu_config}/{sample_folder}/raw/pfntuple_*.pkl.bz2"))[:maxfiles]
+            for ck in chunks(pklfiles, perjob):
+                args.append((f"{pu_config}/{sample_folder}", [], ck, "out{}.pkl".format(ijob)))
+                ijob += 1
+
+    if is_test:
+        process_files(*args[0])
+    else:
+        import multiprocessing
+
+        pool = multiprocessing.Pool(numjobs)
+        pool.starmap(process_files, tqdm.tqdm(args, total=len(args)))
+        pool.close()
diff --git a/mlpf/data/cms/postprocessing2.py b/mlpf/data/cms/postprocessing2.py
index 6181a2ce8..6b74c937c 100644
--- a/mlpf/data/cms/postprocessing2.py +++ b/mlpf/data/cms/postprocessing2.py @@ -1,5 +1,13 @@ import math import os + +# to prevent https://stackoverflow.com/questions/52026652/openblas-blas-thread-init-pthread-create-resource-temporarily-unavailable +os.environ["OMP_NUM_THREADS"] = "1" +os.environ["OPENBLAS_NUM_THREADS"] = "1" +os.environ["MKL_NUM_THREADS"] = "1" +os.environ["VECLIB_MAXIMUM_THREADS"] = "1" +os.environ["NUMEXPR_NUM_THREADS"] = "1" + import pickle import networkx as nx @@ -85,8 +93,8 @@ "ispu", "generatorStatus", "simulatorStatus", - "gp_to_track", - "gp_to_cluster", + "cp_to_track", + "cp_to_cluster", "jet_idx", ] @@ -123,18 +131,6 @@ def print_gen(g, min_pt=1): print(node, g.nodes[node]["pt"], g.nodes[node]["eta"], g.nodes[node]["phi"], g.nodes[node]["pid"], children) -def map_pdgid_to_candid(pdgid, charge): - if pdgid in [22, 11, 13]: - return pdgid - - # charged hadron - if abs(charge) > 0: - return 211 - - # neutral hadron - return 130 - - def deltar_pairs(eta_vec, phi_vec, dr_cut): deta = np.abs(np.subtract.outer(eta_vec, eta_vec)) @@ -163,10 +159,11 @@ def get_charge(pid): raise Exception("Unknown pid: ", pid) -def compute_gen_met(g): - genpart = [elem for elem in g.nodes if elem[0] == "cp"] - sum_px = np.sum([g.nodes[elem]["pt"] * np.cos(g.nodes[elem]["phi"]) for elem in genpart]) - sum_py = np.sum([g.nodes[elem]["pt"] * np.sin(g.nodes[elem]["phi"]) for elem in genpart]) +def compute_gen_met(g, calopart=None): + if calopart is None: + calopart = [elem for elem in g.nodes if elem[0] == "cp"] + sum_px = np.sum([g.nodes[elem]["pt"] * np.cos(g.nodes[elem]["phi"]) for elem in calopart]) + sum_py = np.sum([g.nodes[elem]["pt"] * np.sin(g.nodes[elem]["phi"]) for elem in calopart]) met = np.sqrt(sum_px**2 + sum_py**2) return met @@ -207,8 +204,8 @@ def split_caloparticles(g, elem_type): ispu=g.nodes[cp]["ispu"], generatorStatus=0, simulatorStatus=1, - gp_to_track=g.nodes[cp]["gp_to_track"] * (lv_frac.e / lv.e), - 
gp_to_cluster=g.nodes[cp]["gp_to_cluster"] * (lv_frac.e / lv.e), + cp_to_track=g.nodes[cp]["cp_to_track"] * (lv_frac.e / lv.e), + cp_to_cluster=g.nodes[cp]["cp_to_cluster"] * (lv_frac.e / lv.e), jet_idx=-1, ) g.add_edge(("cp", new_cp_index), suc, weight=g.edges[cp, suc]["weight"]) @@ -216,28 +213,32 @@ def split_caloparticles(g, elem_type): g.remove_node(cp) -def find_representative_elements(g, elem_to_gp, gp_to_elem, elem_type): +def find_representative_elements(g, elem_to_cp, cp_to_elem, elem_type): unused_elems = [] elems = [(g.nodes[e]["pt"], e) for e in g.nodes if e[0] == "elem" and g.nodes[e]["typ"] == elem_type] elems_sorted = sorted(elems, key=lambda x: x[0], reverse=True) for _, elem in elems_sorted: - gps = list(g.predecessors(elem)) - gps_weight = [(g.edges[(gp, elem)]["weight"], gp) for gp in gps if gp not in gp_to_elem if gp[0] == "cp"] - gps_weight_sorted = sorted(gps_weight, key=lambda x: x[0], reverse=True) - if len(gps_weight_sorted) > 0: - gp = gps_weight_sorted[0][1] - elem_to_gp[elem] = gp - gp_to_elem[gp] = elem + cps = list(g.predecessors(elem)) + cps_weight = [(g.edges[(cp, elem)]["weight"], cp) for cp in cps if cp not in cp_to_elem if cp[0] == "cp"] + cps_weight_sorted = sorted(cps_weight, key=lambda x: x[0], reverse=True) + if len(cps_weight_sorted) > 0: + cp = cps_weight_sorted[0][1] + elem_to_cp[elem] = cp + cp_to_elem[cp] = elem else: unused_elems.append(elem) def prepare_normalized_table(g, iev): + # if there's more than one track per caloparticle, the caloparticle should be distributed among the tracks split_caloparticles(g, 1) + print("split, met={:.2f}".format(compute_gen_met(g))) + if save_debugging_pickle: pickle.dump(g, open("split_g_{}.pkl".format(iev), "wb"), pickle.HIGHEST_PROTOCOL) - all_genparticles = [] + # get the full list of caloparticles, elements and pfcandidates + all_caloparticles = [] all_elements = [] all_pfcandidates = [] for node in g.nodes: @@ -245,40 +246,42 @@ def prepare_normalized_table(g, iev): all_elements 
+= [node] for parent in g.predecessors(node): if parent[0] == "cp": - all_genparticles += [parent] + all_caloparticles += [parent] elif node[0] == "pfcand": all_pfcandidates += [node] - all_genparticles = list(set(all_genparticles)) - all_elements = sorted(all_elements) - - elem_to_gp = {} # map of element -> genparticles - gp_to_elem = {} # map of genparticle -> element - - # assign genparticles in reverse pt order uniquely to best element - find_representative_elements(g, elem_to_gp, gp_to_elem, 1) # tracks - find_representative_elements(g, elem_to_gp, gp_to_elem, 6) # gsf - find_representative_elements(g, elem_to_gp, gp_to_elem, 4) # ecal - find_representative_elements(g, elem_to_gp, gp_to_elem, 5) # hcal - find_representative_elements(g, elem_to_gp, gp_to_elem, 8) # HF - find_representative_elements(g, elem_to_gp, gp_to_elem, 9) # HF - find_representative_elements(g, elem_to_gp, gp_to_elem, 10) # - find_representative_elements(g, elem_to_gp, gp_to_elem, 11) - - s1 = set(list(gp_to_elem.keys())) - s2 = set(all_genparticles) - unmatched_gp = list(s2 - s1) - - # assign unmatched genparticles to best element, allowing for overlaps - elem_to_gp = {k: [v] for k, v in elem_to_gp.items()} - for gp in sorted(unmatched_gp, key=lambda x: g.nodes[x]["pt"], reverse=True): - elems = [e for e in g.successors(gp)] + + all_caloparticles = list(set(all_caloparticles)) + print("all caloparticles, met={:.2f}".format(compute_gen_met(g, all_caloparticles))) + all_elements = sorted(list(set(all_elements))) + + elem_to_cp = {} # map of element -> caloparticles + cp_to_elem = {} # map of caloparticle -> element + + # assign caloparticles in reverse pt order uniquely to best element + find_representative_elements(g, elem_to_cp, cp_to_elem, 1) # tracks + find_representative_elements(g, elem_to_cp, cp_to_elem, 6) # gsf + find_representative_elements(g, elem_to_cp, cp_to_elem, 4) # ecal + find_representative_elements(g, elem_to_cp, cp_to_elem, 5) # hcal + find_representative_elements(g, 
elem_to_cp, cp_to_elem, 8) # HF + find_representative_elements(g, elem_to_cp, cp_to_elem, 9) # HF + find_representative_elements(g, elem_to_cp, cp_to_elem, 10) # + find_representative_elements(g, elem_to_cp, cp_to_elem, 11) + + s1 = set(list(cp_to_elem.keys())) + s2 = set(all_caloparticles) + unmatched_cp = list(s2 - s1) + + # assign unmatched caloparticles to best element, allowing for overlaps + elem_to_cp = {k: [v] for k, v in elem_to_cp.items()} + for cp in sorted(unmatched_cp, key=lambda x: g.nodes[x]["pt"], reverse=True): + elems = [e for e in g.successors(cp)] elems_sorted = sorted( - [(g.edges[gp, e]["weight"], e) for e in elems], + [(g.edges[cp, e]["weight"], e) for e in elems], key=lambda x: x[0], reverse=True, ) _, elem = elems_sorted[0] - elem_to_gp[elem] += [gp] + elem_to_cp[elem] += [cp] # Find primary element for each PFCandidate unmatched_cand = [] @@ -303,8 +306,6 @@ def prepare_normalized_table(g, iev): # other particles will be assigned to the highest-energy cluster (ECAL, HCAL, HFEM, HFHAD, SC) else: - # neighbors = [n for n in neighbors if g.nodes[n]["typ"] in [4,5,8,9,10]] - # sorted_neighbors = sorted(neighbors, key=lambda x: g.nodes[x]["energy"], reverse=True) sorted_neighbors = sorted( neighbors, key=lambda x: g.edges[(x, cand)]["weight"], @@ -336,12 +337,14 @@ def prepare_normalized_table(g, iev): ) ycand.fill(0.0) + # find primary element for each CaloParticle and PFCandidate for ielem, elem in enumerate(all_elements): - genparticles = sorted( - elem_to_gp.get(elem, []), + caloparticles = sorted( + elem_to_cp.get(elem, []), key=lambda x: g.edges[(x, elem)]["weight"], reverse=True, ) + candidate = elem_to_cand.get(elem, None) for j in range(len(elem_branches)): @@ -354,27 +357,27 @@ def prepare_normalized_table(g, iev): lv = vector.obj(x=0, y=0, z=0, t=0) # if several CaloParticles are associated to ONLY this element, merge them, as they are not reconstructable separately - if len(genparticles) > 0: - pids_e = sorted([(g.nodes[gp]["pid"], 
g.nodes[gp]["energy"]) for gp in genparticles], key=lambda x: x[1], reverse=True) + if len(caloparticles) > 0: + pids_e = sorted([(g.nodes[cp]["pid"], g.nodes[cp]["energy"]) for cp in caloparticles], key=lambda x: x[1], reverse=True) # get the pid of the highest-energy particle associated with this element pid = pids_e[0][0] - charge = g.nodes[genparticles[0]]["charge"] - pid = map_pdgid_to_candid(pid, charge) + charge = g.nodes[caloparticles[0]]["charge"] + # pid = map_pdgid_to_candid(pid, charge) sum_pu = 0.0 sum_tot = 0.0 - for gp in genparticles: + for cp in caloparticles: lv += vector.obj( - pt=g.nodes[gp]["pt"], - eta=g.nodes[gp]["eta"], - phi=g.nodes[gp]["phi"], - energy=g.nodes[gp]["energy"], + pt=g.nodes[cp]["pt"], + eta=g.nodes[cp]["eta"], + phi=g.nodes[cp]["phi"], + energy=g.nodes[cp]["energy"], ) - sum_pu += g.nodes[gp]["ispu"] * g.nodes[gp]["energy"] - sum_tot += g.nodes[gp]["energy"] + sum_pu += g.nodes[cp]["ispu"] * g.nodes[cp]["energy"] + sum_tot += g.nodes[cp]["energy"] - gp = { + cp = { "pt": lv.rho, "eta": lv.eta, "sin_phi": np.sin(lv.phi), @@ -386,15 +389,15 @@ def prepare_normalized_table(g, iev): "pz": lv.z, "ispu": sum_pu / sum_tot, "charge": charge, - "gp_to_track": np.sum([g.nodes[gp]["gp_to_track"] for gp in genparticles]), - "gp_to_cluster": np.sum([g.nodes[gp]["gp_to_cluster"] for gp in genparticles]), + "cp_to_track": np.sum([g.nodes[cp]["cp_to_track"] for cp in caloparticles]), + "cp_to_cluster": np.sum([g.nodes[cp]["cp_to_cluster"] for cp in caloparticles]), "generatorStatus": 0, "simulatorStatus": 2, "jet_idx": -1, } for j in range(len(particle_feature_order)): - ytarget[particle_feature_order[j]][ielem] = gp[particle_feature_order[j]] + ytarget[particle_feature_order[j]][ielem] = cp[particle_feature_order[j]] px = np.sum(ytarget["pt"] * ytarget["cos_phi"]) py = np.sum(ytarget["pt"] * ytarget["sin_phi"]) @@ -607,8 +610,8 @@ def make_graph(ev, iev): ispu=float(trackingparticle_ev[iobj] != 0), generatorStatus=0, simulatorStatus=0, - 
gp_to_track=0, - gp_to_cluster=0, + cp_to_track=0, + cp_to_cluster=0, jet_idx=-1, ) @@ -625,8 +628,8 @@ def make_graph(ev, iev): ispu=float(caloparticle_ev[iobj] != 0), generatorStatus=0, simulatorStatus=0, - gp_to_track=0, - gp_to_cluster=0, + cp_to_track=0, + cp_to_cluster=0, jet_idx=-1, ) itp = caloparticle_idx_trackingparticle[iobj] @@ -664,8 +667,8 @@ def make_graph(ev, iev): ispu=0.0, # for PF candidates, we don't know if it was PU or not generatorStatus=0, simulatorStatus=0, - gp_to_track=0, - gp_to_cluster=0, + cp_to_track=0, + cp_to_cluster=0, jet_idx=-1, ) @@ -691,11 +694,9 @@ def make_graph(ev, iev): caloparticle_to_element_second, caloparticle_to_element_cmp, ): - if g.nodes[("elem", elem)]["typ"] in [2, 3, 7]: - continue - if ("cp", iobj) in g.nodes and ("elem", elem) in g.nodes: - # print(("cp", iobj), ("elem", elem), c) - g.add_edge(("cp", iobj), ("elem", elem), weight=c) + if not (g.nodes[("elem", elem)]["typ"] in [2, 3, 7]): + if ("cp", iobj) in g.nodes and ("elem", elem) in g.nodes: + g.add_edge(("cp", iobj), ("elem", elem), weight=c) simcluster_to_element_first = ev["simcluster_to_element.first"][iev] simcluster_to_element_second = ev["simcluster_to_element.second"][iev] @@ -706,8 +707,8 @@ def make_graph(ev, iev): simcluster_to_element_cmp, ): if not (g.nodes[("elem", elem)]["typ"] in [2, 3, 7]): - # print(("sc", iobj), ("elem", elem), c) - g.add_edge(("sc", iobj), ("elem", elem), weight=c) + if ("sc", iobj) in g.nodes and ("elem", elem) in g.nodes: + g.add_edge(("sc", iobj), ("elem", elem), weight=c) print("make_graph init, met={:.2f}".format(compute_gen_met(g))) @@ -721,21 +722,25 @@ def make_graph(ev, iev): if (pred, suc) not in g.edges: # print(pred, tp, suc) g.add_edge(pred, suc, weight=g.edges[(tp, suc)]["weight"]) - # remove tracking particles + + # remove tracking particles from graph g.remove_nodes_from(tps) if save_debugging_pickle: pickle.dump(g, open("init_g_{}.pkl".format(iev), "wb"), pickle.HIGHEST_PROTOCOL) - # add any remaining 
links between SimClusters and Elements using delta-R proximity with dR<0.05 - # note: this may have issues with phi wraparound - elems = [n for n in g.nodes if n[0] == "elem"] + # sometimes, the SimClusters are clearly leaving tracks, but the simulation links between the tracks and SimClusters don't seem to exist. + # add any remaining links between SimClusters and tracks using delta-R proximity with dR<0.05 + # note: this is in general a hack for missing simulation information, currently has issues with phi wraparound + elems = [n for n in g.nodes if n[0] == "elem" and g.nodes[n]["typ"] == 1] scs = [node for node in g.nodes if node[0] == "sc"] sc_coords = np.array([[g.nodes[n]["eta"] for n in scs], [g.nodes[n]["phi"] for n in scs]]) if len(sc_coords.T) > 0: tree = KDTree(sc_coords.T, leaf_size=32) for elem in elems: - if len(list(g.predecessors(elem))) == 0 and g.nodes[elem]["pt"] > 1: + # PFElement must have no links and a high-enough pT + # note: this can have some effect on the target - genjet matching + if len(list(g.predecessors(elem))) == 0 and g.nodes[elem]["pt"] > 1.0: eta = g.nodes[elem]["eta"] phi = g.nodes[elem]["phi"] nearby_scs = tree.query_radius([[eta, phi]], 0.05)[0] @@ -754,11 +759,13 @@ def make_graph(ev, iev): if (pred, suc) not in g.edges: # print(pred, sc, suc) g.add_edge(pred, suc, weight=g.edges[(sc, suc)]["weight"]) - # remove simclusters + + # remove simclusters from graph g.remove_nodes_from(scs) + print("make_graph duplicates removed, met={:.2f}".format(compute_gen_met(g))) - # now remove PS and BREM elements, as they are not informative + # now remove PS and BREM elements, as they are not that informative elems = [n for n in g.nodes if n[0] == "elem"] nodes_to_remove = [] for elem in elems: @@ -766,7 +773,6 @@ def make_graph(ev, iev): nodes_to_remove.append(elem) g.remove_nodes_from(nodes_to_remove) - # merge_closeby_particles(g) print("cleanup done, met={:.2f}".format(compute_gen_met(g))) element_to_candidate_first = 
ev["element_to_candidate.first"][iev] @@ -775,49 +781,31 @@ def make_graph(ev, iev): if ("elem", elem) in g.nodes: g.add_edge(("elem", elem), ("pfcand", pfcand), weight=1.0) + num_gen = len([n for n in g.nodes if n[0] == "gen" and g.nodes[n]["status"] == 1]) num_cp = len([n for n in g.nodes if n[0] == "cp"]) num_sc = len([n for n in g.nodes if n[0] == "sc"]) num_tp = len([n for n in g.nodes if n[0] == "tp"]) num_pf = len([n for n in g.nodes if n[0] == "pfcand"]) num_elem = len([n for n in g.nodes if n[0] == "elem"]) - print(f"CP={num_cp} SC={num_sc} TP={num_tp} PF={num_pf} EL={num_elem}") + print(f"GEN={num_gen} CP={num_cp} SC={num_sc} TP={num_tp} PF={num_pf} EL={num_elem}") - # for caloparticles, compute the total energy deposited to tracks or clusters for node in g.nodes: if node[0] == "cp": elems_children = list(g.successors(node)) - gp_to_track = 0 - gp_to_cluster = 0 + cp_to_track = 0 + cp_to_cluster = 0 for elem in elems_children: w = g.edges[node, elem]["weight"] elem_type = g.nodes[elem]["typ"] if elem_type in [1, 6]: - gp_to_track += w + cp_to_track += w else: - gp_to_cluster += w - g.nodes[node]["gp_to_track"] = gp_to_track - g.nodes[node]["gp_to_cluster"] = gp_to_cluster + cp_to_cluster += w + g.nodes[node]["cp_to_track"] = cp_to_track + g.nodes[node]["cp_to_cluster"] = cp_to_cluster if save_debugging_pickle: pickle.dump(g, open("cleanup_g_{}.pkl".format(iev), "wb"), pickle.HIGHEST_PROTOCOL) - # print_gen(g) - return g - - -def cleanup_graph(g): - all_removed_edges = [] - elems = [n for n in g.nodes if n[0] == "elem"] - for elem in elems: - edges_to_remove = [] - for pred in g.predecessors(elem): - edge = (pred, elem) - if g.edges[edge]["weight"] / g.nodes[elem]["energy"] < 0.1: - edges_to_remove.append(edge) - all_removed_edges += edges_to_remove - # print("removed edges:", all_removed_edges) - # for edge in all_removed_edges: - # print(g.nodes[edge[0]]["energy"], g.nodes[edge[1]]["energy"], g.edges[edge]["weight"]) - 
g.remove_edges_from(all_removed_edges) return g @@ -837,7 +825,6 @@ def process(args): for iev in tqdm.tqdm(events_to_process): print("processing iev={}, genmet_cmssw={:.2f}".format(iev, ev["genmet_pt"][iev][0])) g = make_graph(ev, iev) - # g = cleanup_graph(g) # associate target particles to input elements Xelem, ycand, ytarget = prepare_normalized_table(g, iev) @@ -845,21 +832,40 @@ def process(args): # produce a list of stable pythia particles for downstream validation # stable: status=1 (typical) or status=2 and no daughters (B hadrons) - ptcls_pythia = [ - n - for n in g.nodes - if n[0] == "gen" and ((g.nodes[n]["status"] == 1) or ((g.nodes[n]["status"] == 2) and g.nodes[n]["num_daughters"] == 0)) - ] + ptcls_pythia = [n for n in g.nodes if n[0] == "gen" and ((g.nodes[n]["status"] == 1) and (abs(g.nodes[n]["pid"]) not in [12, 14, 16]))] feats = ["pid", "pt", "eta", "phi", "energy"] arr_ptcls_pythia = np.array([[g.nodes[n][f] for f in feats] for n in ptcls_pythia]) + # note for simulation 20240823_simcluster: genjet from CMSSW contains neutrinos (ak4GenJet), + # so it's somewhat mismatched with respect to CaloParticles, which don't contain neutrinos + # genjet_pt = ev["genjet_pt"][iev] + # genjet_eta = ev["genjet_eta"][iev] + # genjet_phi = ev["genjet_phi"][iev] + # genjet_energy = ev["genjet_energy"][iev] + # genjet = np.stack( + # [awkward.to_numpy(genjet_pt), awkward.to_numpy(genjet_eta), awkward.to_numpy(genjet_phi), awkward.to_numpy(genjet_energy)], axis=-1 + # ) + # produce pythia-level genjets and genmet - genjet_pt = ev["genjet_pt"][iev] - genjet_eta = ev["genjet_eta"][iev] - genjet_phi = ev["genjet_phi"][iev] - genjet_energy = ev["genjet_energy"][iev] + pythia_p4 = vector.awk( + awkward.zip( + { + "pt": arr_ptcls_pythia[:, 1], + "eta": arr_ptcls_pythia[:, 2], + "phi": arr_ptcls_pythia[:, 3], + "energy": arr_ptcls_pythia[:, 4], + } + ) + ) + pythia_jets = compute_jets(pythia_p4) genjet = np.stack( - [awkward.to_numpy(genjet_pt), 
awkward.to_numpy(genjet_eta), awkward.to_numpy(genjet_phi), awkward.to_numpy(genjet_energy)], axis=-1 + [ + awkward.to_numpy(pythia_jets.pt), + awkward.to_numpy(pythia_jets.eta), + awkward.to_numpy(pythia_jets.phi), + awkward.to_numpy(pythia_jets.energy), + ], + axis=-1, ) genmet_pt = ev["genmet_pt"][iev] diff --git a/mlpf/data/cms/postprocessing_jobs.py b/mlpf/data/cms/postprocessing_jobs.py index f99896ed3..ab5e6cd84 100644 --- a/mlpf/data/cms/postprocessing_jobs.py +++ b/mlpf/data/cms/postprocessing_jobs.py @@ -24,7 +24,7 @@ def write_script(infiles, outfiles): s += [f"if [ ! -f {outf} ]; then"] s += [ " singularity exec -B /local /home/software/singularity/pytorch.simg:2024-08-18" - + f" python3 mlpf/data_cms/postprocessing2.py --input {inf} --outpath {outpath}" + + f" python3 mlpf/data/cms/postprocessing2.py --input {inf} --outpath {outpath}" ] s += [f" bzip2 -z {outf_no_bzip}"] s += ["fi"] @@ -33,26 +33,29 @@ def write_script(infiles, outfiles): samples = [ - # "/local/joosep/mlpf/cms/20240823_simcluster/nopu/TTbar_14TeV_TuneCUETP8M1_cfi", - # "/local/joosep/mlpf/cms/20240823_simcluster/nopu/QCDForPF_14TeV_TuneCUETP8M1_cfi", + # PU # "/local/joosep/mlpf/cms/20240823_simcluster/pu55to75/TTbar_14TeV_TuneCUETP8M1_cfi", # "/local/joosep/mlpf/cms/20240823_simcluster/pu55to75/QCDForPF_14TeV_TuneCUETP8M1_cfi", - # "/local/joosep/mlpf/cms/20240823_simcluster/nopu/MultiParticlePFGun50_cfi", + # "/local/joosep/mlpf/cms/20240823_simcluster/pu55to75/ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi", # NoPU + # "/local/joosep/mlpf/cms/20240823_simcluster/nopu/TTbar_14TeV_TuneCUETP8M1_cfi", + # "/local/joosep/mlpf/cms/20240823_simcluster/nopu/QCDForPF_14TeV_TuneCUETP8M1_cfi", + # "/local/joosep/mlpf/cms/20240823_simcluster/nopu/ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi", + # Single particle gun # "/local/joosep/mlpf/cms/20240823_simcluster/nopu/SinglePiMinusFlatPt0p7To1000_cfi" - # "/local/joosep/mlpf/cms/20240823_simcluster/nopu/SinglePi0Pt1To1000_pythia8_cfi" + # 
"/local/joosep/mlpf/cms/20240823_simcluster/nopu/SinglePi0Pt1To1000_pythia8_cfi", # "/local/joosep/mlpf/cms/20240823_simcluster/nopu/SingleGammaFlatPt1To1000_pythia8_cfi", # "/local/joosep/mlpf/cms/20240823_simcluster/nopu/SingleK0FlatPt1To1000_pythia8_cfi", # "/local/joosep/mlpf/cms/20240823_simcluster/nopu/SingleElectronFlatPt1To1000_pythia8_cfi", # "/local/joosep/mlpf/cms/20240823_simcluster/nopu/SingleMuFlatPt1To1000_pythia8_cfi", - # "/local/joosep/mlpf/cms/20240823_simcluster/nopu/SingleNeutronFlatPt0p7To1000_cfi", - # "/local/joosep/mlpf/cms/20240823_simcluster/nopu/SingleProtonMinusFlatPt0p7To1000_cfi", - # "/local/joosep/mlpf/cms/20240823_simcluster/nopu/SingleTauFlatPt1To1000_cfi", + "/local/joosep/mlpf/cms/20240823_simcluster/nopu/SingleNeutronFlatPt0p7To1000_cfi", + "/local/joosep/mlpf/cms/20240823_simcluster/nopu/SingleProtonMinusFlatPt0p7To1000_cfi", + "/local/joosep/mlpf/cms/20240823_simcluster/nopu/SingleTauFlatPt1To1000_cfi", ] ichunk = 1 for sample in samples: - infiles = list(glob.glob(f"{sample}/root/pfntuple*.root")) + infiles = sorted(list(glob.glob(f"{sample}/root/pfntuple*.root"))) for infiles_chunk in chunks(infiles, 50): outfiles_chunk = [inf.replace(".root", ".pkl.bz2").replace("/root/", "/raw/") for inf in infiles_chunk] os.makedirs(os.path.dirname(outfiles_chunk[0]), exist_ok=True) diff --git a/mlpf/data/cms/prepare_args.py b/mlpf/data/cms/prepare_args.py index 7541fdcec..411d24bf7 100644 --- a/mlpf/data/cms/prepare_args.py +++ b/mlpf/data/cms/prepare_args.py @@ -6,27 +6,27 @@ outdir = "/local/joosep/mlpf/cms/20240823_simcluster" samples = [ - # ("TTbar_14TeV_TuneCUETP8M1_cfi", 100000, 120010, "genjob_pu55to75.sh", outdir + "/pu55to75"), - # ("ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi", 200000, 220010, "genjob_pu55to75.sh", outdir + "/pu55to75"), - # ("QCDForPF_14TeV_TuneCUETP8M1_cfi", 300000, 320010, "genjob_pu55to75.sh", outdir + "/pu55to75"), + # ("TTbar_14TeV_TuneCUETP8M1_cfi", 100000, 110010, "genjob_pu55to75.sh", outdir + 
"/pu55to75"), + # ("ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi", 200000, 210010, "genjob_pu55to75.sh", outdir + "/pu55to75"), + # ("QCDForPF_14TeV_TuneCUETP8M1_cfi", 300000, 310010, "genjob_pu55to75.sh", outdir + "/pu55to75"), # ("SMS-T1tttt_mGl-1500_mLSP-100_TuneCP5_14TeV_pythia8_cfi", 500000, 520010, "genjob_pu55to75.sh", outdir + "/pu55to75"), # ("ZpTT_1500_14TeV_TuneCP5_cfi", 600000, 620010, "genjob_pu55to75.sh", outdir + "/pu55to75"), # ("VBF_TuneCP5_14TeV_pythia8_cfi", 700000, 705010, "genjob_pu55to75.sh", outdir + "/pu55to75"), # ("SinglePiMinusFlatPt0p7To1000_cfi", 800000, 801010, "genjob_pu55to75.sh", outdir + "/pu55to75"), - # ("TTbar_14TeV_TuneCUETP8M1_cfi", 702000, 720000, "genjob_nopu.sh", outdir + "/nopu"), + # ("TTbar_14TeV_TuneCUETP8M1_cfi", 700000, 720010, "genjob_nopu.sh", outdir + "/nopu"), # ("VBF_TuneCP5_14TeV_pythia8_cfi", 900000, 920010, "genjob_nopu.sh", outdir + "/nopu"), # ("QCDForPF_14TeV_TuneCUETP8M1_cfi", 1000000,1020010, "genjob_nopu.sh", outdir + "/nopu"), - # ("ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi", 1100000,1100010, "genjob_nopu.sh", outdir + "/nopu"), + # ("ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi", 1100000,1120010, "genjob_nopu.sh", outdir + "/nopu"), # ("MultiParticlePFGun50_cfi", 800000, 801000, "genjob_nopu.sh", outdir + "/nopu"), - # ("SingleElectronFlatPt1To1000_pythia8_cfi", 900000, 905010, "genjob_nopu.sh", outdir + "/nopu"), - # ("SingleGammaFlatPt1To1000_pythia8_cfi", 1000000,1005010, "genjob_nopu.sh", outdir + "/nopu"), - # ("SingleMuFlatPt1To1000_pythia8_cfi", 1100000,1105010, "genjob_nopu.sh", outdir + "/nopu"), - # ("SingleNeutronFlatPt0p7To1000_cfi", 1200000,1205010, "genjob_nopu.sh", outdir + "/nopu"), - # ("SinglePi0Pt1To1000_pythia8_cfi", 1300000,1305010, "genjob_nopu.sh", outdir + "/nopu"), - # ("SinglePiMinusFlatPt0p7To1000_cfi", 1400000,1405010, "genjob_nopu.sh", outdir + "/nopu"), - # ("SingleProtonMinusFlatPt0p7To1000_cfi", 1500000,1505010, "genjob_nopu.sh", outdir + "/nopu"), - # ("SingleTauFlatPt1To1000_cfi", 
1600000,1605010, "genjob_nopu.sh", outdir + "/nopu"), - # ("SingleK0FlatPt1To1000_pythia8_cfi", 1700000,1705010, "genjob_nopu.sh", outdir + "/nopu"), + # ("SingleElectronFlatPt1To1000_pythia8_cfi", 900000, 901010, "genjob_nopu.sh", outdir + "/nopu"), + # ("SingleGammaFlatPt1To1000_pythia8_cfi", 1000000,1001010, "genjob_nopu.sh", outdir + "/nopu"), + # ("SingleMuFlatPt1To1000_pythia8_cfi", 1100000,1101010, "genjob_nopu.sh", outdir + "/nopu"), + # ("SingleNeutronFlatPt0p7To1000_cfi", 1200000,1201010, "genjob_nopu.sh", outdir + "/nopu"), + # ("SinglePi0Pt1To1000_pythia8_cfi", 1300000,1301010, "genjob_nopu.sh", outdir + "/nopu"), + # ("SinglePiMinusFlatPt0p7To1000_cfi", 1400000,1401010, "genjob_nopu.sh", outdir + "/nopu"), + # ("SingleProtonMinusFlatPt0p7To1000_cfi", 1500000,1501010, "genjob_nopu.sh", outdir + "/nopu"), + # ("SingleTauFlatPt1To1000_cfi", 1600000,1601010, "genjob_nopu.sh", outdir + "/nopu"), + # ("SingleK0FlatPt1To1000_pythia8_cfi", 1700000,1701010, "genjob_nopu.sh", outdir + "/nopu"), ] if __name__ == "__main__": @@ -39,5 +39,5 @@ p = this_outdir + "/" + samp + "/root/pfntuple_{}.root".format(seed) if not os.path.isfile(p): print( - f"sbatch --mem-per-cpu 8G --partition main --time 20:00:00 --cpus-per-task 1 scripts/tallinn/cmssw-el8.sh mlpf/data_cms/{script} {samp} {seed}" + f"sbatch --mem-per-cpu 8G --partition main --time 20:00:00 --cpus-per-task 1 scripts/tallinn/cmssw-el8.sh mlpf/data/cms/{script} {samp} {seed}" ) diff --git a/mlpf/heptfds/clic_pf_edm4hep/qq.py b/mlpf/heptfds/clic_pf_edm4hep/qq.py index 6e1997d8e..88aaff67a 100644 --- a/mlpf/heptfds/clic_pf_edm4hep/qq.py +++ b/mlpf/heptfds/clic_pf_edm4hep/qq.py @@ -1,13 +1,7 @@ from pathlib import Path import tensorflow as tf -from utils_edm import ( - X_FEATURES_CL, - X_FEATURES_TRK, - Y_FEATURES, - generate_examples, - split_sample, -) +from utils_edm import X_FEATURES_CL, X_FEATURES_TRK, Y_FEATURES, generate_examples, split_sample, NUM_SPLITS import tensorflow_datasets as tfds @@ -26,7 +20,7 @@ 
class ClicEdmQqPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("2.3.0") + VERSION = tfds.core.Version("2.5.0") RELEASE_NOTES = { "1.0.0": "Initial release.", "1.1.0": "update stats, move to 380 GeV", @@ -39,6 +33,7 @@ class ClicEdmQqPf(tfds.core.GeneratorBasedBuilder): "2.1.0": "Bump dataset size", "2.2.0": "New target definition, fix truth jets, add targetjets and jet idx", "2.3.0": "Fix target/truth momentum, st=1 more inclusive: PR352", + "2.5.0": "Use 10 splits, skip 2.4.0 to unify with CMS datasets", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ For the raw input files in ROOT EDM4HEP format, please see the citation above. @@ -47,6 +42,9 @@ class ClicEdmQqPf(tfds.core.GeneratorBasedBuilder): rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep/ ./ """ + # create configs 1 ... NUM_SPLITS + 1 that allow to parallelize the dataset building + BUILDER_CONFIGS = [tfds.core.BuilderConfig(name=str(group)) for group in range(1, NUM_SPLITS + 1)] + def __init__(self, *args, **kwargs): kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD super(ClicEdmQqPf, self).__init__(*args, **kwargs) @@ -72,8 +70,7 @@ def _info(self) -> tfds.core.DatasetInfo: "targetjets": tfds.features.Tensor(shape=(None, 4), dtype=tf.float32), } ), - supervised_keys=None, - homepage="", + homepage="https://github.com/jpata/particleflow", citation=_CITATION, metadata=tfds.core.MetadataDict( x_features_track=X_FEATURES_TRK, @@ -84,7 +81,7 @@ def _info(self) -> tfds.core.DatasetInfo: def _split_generators(self, dl_manager: tfds.download.DownloadManager): path = dl_manager.manual_dir - return split_sample(Path(path / "p8_ee_qq_ecm380/")) + return split_sample(Path(path / "p8_ee_qq_ecm380/"), self.builder_config, num_splits=NUM_SPLITS) def _generate_examples(self, files): return generate_examples(files) diff --git a/mlpf/heptfds/clic_pf_edm4hep/single_gamma.py b/mlpf/heptfds/clic_pf_edm4hep/single_gamma.py deleted file mode 100644 index 885d371e4..000000000 --- 
a/mlpf/heptfds/clic_pf_edm4hep/single_gamma.py +++ /dev/null @@ -1,78 +0,0 @@ -from pathlib import Path - -import tensorflow as tf -from utils_edm import ( - X_FEATURES_CL, - X_FEATURES_TRK, - Y_FEATURES, - generate_examples, - split_sample, -) - -import tensorflow_datasets as tfds -import numpy as np - -_DESCRIPTION = """ -CLIC EDM4HEP dataset with single gamma particle gun - - X: reconstructed tracks and clusters, variable number N per event - - ygen: stable generator particles, zero-padded to N per event - - ycand: baseline particle flow particles, zero-padded to N per event -""" - -_CITATION = """ -Pata, Joosep, Wulff, Eric, Duarte, Javier, Mokhtar, Farouk, Zhang, Mengke, Girone, Maria, & Southwick, David. (2023). -Simulated datasets for detector and particle flow reconstruction: CLIC detector (1.1) [Data set]. -Zenodo. https://doi.org/10.5281/zenodo.8260741 -""" - - -class ClicEdmSingleGammaPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.5.0") - RELEASE_NOTES = { - "1.5.0": "Regenerate with ARRAY_RECORD", - } - MANUAL_DOWNLOAD_INSTRUCTIONS = """ - For the raw input files in ROOT EDM4HEP format, please see the citation above. 
- - The processed tensorflow_dataset can also be downloaded from: - rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep/ ./ - """ - - def __init__(self, *args, **kwargs): - kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD - super(ClicEdmSingleGammaPf, self).__init__(*args, **kwargs) - - def _info(self) -> tfds.core.DatasetInfo: - """Returns the dataset metadata.""" - return tfds.core.DatasetInfo( - builder=self, - description=_DESCRIPTION, - features=tfds.features.FeaturesDict( - { - "X": tfds.features.Tensor( - shape=( - None, - max(len(X_FEATURES_TRK), len(X_FEATURES_CL)), - ), - dtype=tf.float32, - ), - "ygen": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=np.float32), - "ycand": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=np.float32), - } - ), - supervised_keys=None, - homepage="", - citation=_CITATION, - metadata=tfds.core.MetadataDict( - x_features_track=X_FEATURES_TRK, - x_features_cluster=X_FEATURES_CL, - y_features=Y_FEATURES, - ), - ) - - def _split_generators(self, dl_manager: tfds.download.DownloadManager): - path = dl_manager.manual_dir - return split_sample(Path(path / "gamma/")) - - def _generate_examples(self, files): - return generate_examples(files) diff --git a/mlpf/heptfds/clic_pf_edm4hep/single_kaon0L.py b/mlpf/heptfds/clic_pf_edm4hep/single_kaon0L.py deleted file mode 100644 index dcac642bf..000000000 --- a/mlpf/heptfds/clic_pf_edm4hep/single_kaon0L.py +++ /dev/null @@ -1,78 +0,0 @@ -from pathlib import Path - -import tensorflow as tf -from utils_edm import ( - X_FEATURES_CL, - X_FEATURES_TRK, - Y_FEATURES, - generate_examples, - split_sample, -) - -import tensorflow_datasets as tfds -import numpy as np - -_DESCRIPTION = """ -CLIC EDM4HEP dataset with single kaon0L particle gun - - X: reconstructed tracks and clusters, variable number N per event - - ygen: stable generator particles, zero-padded to N per event - - ycand: baseline particle flow particles, zero-padded to N per event -""" - 
-_CITATION = """ -Pata, Joosep, Wulff, Eric, Duarte, Javier, Mokhtar, Farouk, Zhang, Mengke, Girone, Maria, & Southwick, David. (2023). -Simulated datasets for detector and particle flow reconstruction: CLIC detector (1.1) [Data set]. -Zenodo. https://doi.org/10.5281/zenodo.8260741 -""" - - -class ClicEdmSingleKaon0lPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.5.0") - RELEASE_NOTES = { - "1.5.0": "Regenerate with ARRAY_RECORD", - } - MANUAL_DOWNLOAD_INSTRUCTIONS = """ - For the raw input files in ROOT EDM4HEP format, please see the citation above. - - The processed tensorflow_dataset can also be downloaded from: - rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep/ ./ - """ - - def __init__(self, *args, **kwargs): - kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD - super(ClicEdmSingleKaon0lPf, self).__init__(*args, **kwargs) - - def _info(self) -> tfds.core.DatasetInfo: - """Returns the dataset metadata.""" - return tfds.core.DatasetInfo( - builder=self, - description=_DESCRIPTION, - features=tfds.features.FeaturesDict( - { - "X": tfds.features.Tensor( - shape=( - None, - max(len(X_FEATURES_TRK), len(X_FEATURES_CL)), - ), - dtype=tf.float32, - ), - "ygen": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=np.float32), - "ycand": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=np.float32), - } - ), - supervised_keys=None, - homepage="", - citation=_CITATION, - metadata=tfds.core.MetadataDict( - x_features_track=X_FEATURES_TRK, - x_features_cluster=X_FEATURES_CL, - y_features=Y_FEATURES, - ), - ) - - def _split_generators(self, dl_manager: tfds.download.DownloadManager): - path = dl_manager.manual_dir - return split_sample(Path(path / "kaon0L/")) - - def _generate_examples(self, files): - return generate_examples(files) diff --git a/mlpf/heptfds/clic_pf_edm4hep/single_pi.py b/mlpf/heptfds/clic_pf_edm4hep/single_pi.py deleted file mode 100644 index b97c356b3..000000000 --- 
a/mlpf/heptfds/clic_pf_edm4hep/single_pi.py +++ /dev/null @@ -1,78 +0,0 @@ -from pathlib import Path - -import tensorflow as tf -from utils_edm import ( - X_FEATURES_CL, - X_FEATURES_TRK, - Y_FEATURES, - generate_examples, - split_sample_several, -) - -import tensorflow_datasets as tfds -import numpy as np - -_DESCRIPTION = """ -CLIC EDM4HEP dataset with single-pion particle gun - - X: reconstructed tracks and clusters, variable number N per event - - ygen: stable generator particles, zero-padded to N per event - - ycand: baseline particle flow particles, zero-padded to N per event -""" - -_CITATION = """ -Pata, Joosep, Wulff, Eric, Duarte, Javier, Mokhtar, Farouk, Zhang, Mengke, Girone, Maria, & Southwick, David. (2023). -Simulated datasets for detector and particle flow reconstruction: CLIC detector (1.1) [Data set]. -Zenodo. https://doi.org/10.5281/zenodo.8260741 -""" - - -class ClicEdmSinglePiPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.5.0") - RELEASE_NOTES = { - "1.5.0": "Regenerate with ARRAY_RECORD", - } - MANUAL_DOWNLOAD_INSTRUCTIONS = """ - For the raw input files in ROOT EDM4HEP format, please see the citation above. 
- - The processed tensorflow_dataset can also be downloaded from: - rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep/ ./ - """ - - def __init__(self, *args, **kwargs): - kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD - super(ClicEdmSinglePiPf, self).__init__(*args, **kwargs) - - def _info(self) -> tfds.core.DatasetInfo: - """Returns the dataset metadata.""" - return tfds.core.DatasetInfo( - builder=self, - description=_DESCRIPTION, - features=tfds.features.FeaturesDict( - { - "X": tfds.features.Tensor( - shape=( - None, - max(len(X_FEATURES_TRK), len(X_FEATURES_CL)), - ), - dtype=tf.float32, - ), - "ygen": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=np.float32), - "ycand": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=np.float32), - } - ), - supervised_keys=None, - homepage="", - citation=_CITATION, - metadata=tfds.core.MetadataDict( - x_features_track=X_FEATURES_TRK, - x_features_cluster=X_FEATURES_CL, - y_features=Y_FEATURES, - ), - ) - - def _split_generators(self, dl_manager: tfds.download.DownloadManager): - path = dl_manager.manual_dir - return split_sample_several([Path(path / "pi-/"), Path(path / "pi+/")]) - - def _generate_examples(self, files): - return generate_examples(files) diff --git a/mlpf/heptfds/clic_pf_edm4hep/ttbar.py b/mlpf/heptfds/clic_pf_edm4hep/ttbar.py index 0c88c4397..fbc83e427 100644 --- a/mlpf/heptfds/clic_pf_edm4hep/ttbar.py +++ b/mlpf/heptfds/clic_pf_edm4hep/ttbar.py @@ -1,13 +1,7 @@ from pathlib import Path import tensorflow as tf -from utils_edm import ( - X_FEATURES_CL, - X_FEATURES_TRK, - Y_FEATURES, - generate_examples, - split_sample, -) +from utils_edm import X_FEATURES_CL, X_FEATURES_TRK, Y_FEATURES, generate_examples, split_sample, NUM_SPLITS import tensorflow_datasets as tfds @@ -26,7 +20,7 @@ class ClicEdmTtbarPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("2.3.0") + VERSION = tfds.core.Version("2.5.0") RELEASE_NOTES = { "1.0.0": "Initial 
release.", "1.1.0": "update stats, move to 380 GeV", @@ -38,6 +32,7 @@ class ClicEdmTtbarPf(tfds.core.GeneratorBasedBuilder): "2.1.0": "Bump dataset size", "2.2.0": "New target definition, fix truth jets, add targetjets and jet idx", "2.3.0": "Fix target/truth momentum, st=1 more inclusive: PR352", + "2.5.0": "Use 10 splits, skip 2.4.0 to unify with CMS datasets", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ For the raw input files in ROOT EDM4HEP format, please see the citation above. @@ -46,6 +41,9 @@ class ClicEdmTtbarPf(tfds.core.GeneratorBasedBuilder): rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep/ ./ """ + # create configs 1 ... NUM_SPLITS + 1 that allow to parallelize the dataset building + BUILDER_CONFIGS = [tfds.core.BuilderConfig(name=str(group)) for group in range(1, NUM_SPLITS + 1)] + def __init__(self, *args, **kwargs): kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD super(ClicEdmTtbarPf, self).__init__(*args, **kwargs) @@ -71,8 +69,7 @@ def _info(self) -> tfds.core.DatasetInfo: "targetjets": tfds.features.Tensor(shape=(None, 4), dtype=tf.float32), } ), - supervised_keys=None, - homepage="", + homepage="https://github.com/jpata/particleflow", citation=_CITATION, metadata=tfds.core.MetadataDict( x_features_track=X_FEATURES_TRK, @@ -83,7 +80,7 @@ def _info(self) -> tfds.core.DatasetInfo: def _split_generators(self, dl_manager: tfds.download.DownloadManager): path = dl_manager.manual_dir - return split_sample(Path(path / "p8_ee_tt_ecm380/")) + return split_sample(Path(path / "p8_ee_tt_ecm380/"), self.builder_config, num_splits=NUM_SPLITS) def _generate_examples(self, files): return generate_examples(files) diff --git a/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py b/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py index 5e392091b..9cf0828e1 100644 --- a/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py +++ b/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py @@ -2,6 +2,8 @@ import numpy as np import random +NUM_SPLITS = 10 + # from fcc/postprocessing.py 
X_FEATURES_TRK = [ "elemtype", @@ -62,7 +64,20 @@ N_Y_FEATURES = len(Y_FEATURES) -def split_sample(path, test_frac=0.9): +def split_list(lst, x): + # Calculate the size of each sublist (except potentially the last) + sublist_size = len(lst) // x + + # Create x-1 sublists of equal size + result = [lst[i * sublist_size : (i + 1) * sublist_size] for i in range(x - 1)] + + # Add the remaining elements to the last sublist + result.append(lst[(x - 1) * sublist_size :]) + + return result + + +def split_sample(path, builder_config, num_splits=NUM_SPLITS, test_frac=0.9): files = sorted(list(path.glob("*.parquet"))) print("Found {} files in {}".format(len(files), path)) assert len(files) > 0 @@ -71,13 +86,19 @@ def split_sample(path, test_frac=0.9): files_test = files[idx_split:] assert len(files_train) > 0 assert len(files_test) > 0 + + split_index = int(builder_config.name) - 1 + files_train_split = split_list(files_train, num_splits) + files_test_split = split_list(files_test, num_splits) + return { - "train": generate_examples(files_train), - "test": generate_examples(files_test), + "train": generate_examples(files_train_split[split_index]), + "test": generate_examples(files_test_split[split_index]), } -def split_sample_several(paths, test_frac=0.9): +# merge and shuffle several samples (e.g. 
e+, e-), split into test/train +def split_sample_several(paths, builder_config, num_splits=NUM_SPLITS, test_frac=0.9): files = sum([list(path.glob("*.parquet")) for path in paths], []) random.shuffle(files) print("Found {} files".format(len(files))) @@ -87,9 +108,14 @@ def split_sample_several(paths, test_frac=0.9): files_test = files[idx_split:] assert len(files_train) > 0 assert len(files_test) > 0 + + split_index = int(builder_config.name) - 1 + files_train_split = split_list(files_train, num_splits) + files_test_split = split_list(files_test, num_splits) + return { - "train": generate_examples(files_train), - "test": generate_examples(files_test), + "train": generate_examples(files_train_split[split_index]), + "test": generate_examples(files_test_split[split_index]), } @@ -194,13 +220,3 @@ def generate_examples(files): "genjets": gj.astype(np.float32), "targetjets": tj.astype(np.float32), } - - -if __name__ == "__main__": - for ex in generate_examples( - [ - "/local/joosep/mlpf/clic_edm4hep/pi+/reco_pi+_98.parquet", - "/local/joosep/mlpf/clic_edm4hep/pi-/reco_pi-_11.parquet", - ] - ): - print(ex[0], ex[1]["X"].shape, ex[1]["ytarget"].shape, ex[1]["ycand"].shape) diff --git a/mlpf/heptfds/clic_pf_edm4hep/ww_fullhad.py b/mlpf/heptfds/clic_pf_edm4hep/ww_fullhad.py index e1be51900..a2e8af850 100644 --- a/mlpf/heptfds/clic_pf_edm4hep/ww_fullhad.py +++ b/mlpf/heptfds/clic_pf_edm4hep/ww_fullhad.py @@ -1,13 +1,7 @@ from pathlib import Path import tensorflow as tf -from utils_edm import ( - X_FEATURES_CL, - X_FEATURES_TRK, - Y_FEATURES, - generate_examples, - split_sample, -) +from utils_edm import X_FEATURES_CL, X_FEATURES_TRK, Y_FEATURES, generate_examples, split_sample, NUM_SPLITS import tensorflow_datasets as tfds @@ -26,7 +20,7 @@ class ClicEdmWwFullhadPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("2.3.0") + VERSION = tfds.core.Version("2.5.0") RELEASE_NOTES = { "1.3.0": "Update stats to ~1M events", "1.4.0": "Fix ycand matching", @@ -34,6 
+28,7 @@ class ClicEdmWwFullhadPf(tfds.core.GeneratorBasedBuilder): "2.1.0": "Add ispu, genjets, genmet; disable genjet_idx; truth def not based on gp.status==1", "2.2.0": "New target definition, fix truth jets, add targetjets and jet idx", "2.3.0": "Fix target/truth momentum, st=1 more inclusive: PR352", + "2.5.0": "Use 10 splits, skip 2.4.0 to unify with CMS datasets", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ For the raw input files in ROOT EDM4HEP format, please see the citation above. @@ -42,6 +37,9 @@ class ClicEdmWwFullhadPf(tfds.core.GeneratorBasedBuilder): rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep/ ./ """ + # create configs 1 ... NUM_SPLITS + 1 that allow to parallelize the dataset building + BUILDER_CONFIGS = [tfds.core.BuilderConfig(name=str(group)) for group in range(1, NUM_SPLITS + 1)] + def __init__(self, *args, **kwargs): kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD super(ClicEdmWwFullhadPf, self).__init__(*args, **kwargs) @@ -67,8 +65,7 @@ def _info(self) -> tfds.core.DatasetInfo: "targetjets": tfds.features.Tensor(shape=(None, 4), dtype=tf.float32), } ), - supervised_keys=None, - homepage="", + homepage="https://github.com/jpata/particleflow", citation=_CITATION, metadata=tfds.core.MetadataDict( x_features_track=X_FEATURES_TRK, @@ -79,7 +76,7 @@ def _info(self) -> tfds.core.DatasetInfo: def _split_generators(self, dl_manager: tfds.download.DownloadManager): path = dl_manager.manual_dir - return split_sample(Path(path / "p8_ee_WW_fullhad_ecm380/")) + return split_sample(Path(path / "p8_ee_WW_fullhad_ecm380/"), self.builder_config, num_splits=NUM_SPLITS) def _generate_examples(self, files): return generate_examples(files) diff --git a/mlpf/heptfds/clic_pf_edm4hep/z.py b/mlpf/heptfds/clic_pf_edm4hep/z.py deleted file mode 100644 index f0bc433ba..000000000 --- a/mlpf/heptfds/clic_pf_edm4hep/z.py +++ /dev/null @@ -1,81 +0,0 @@ -from pathlib import Path - -import tensorflow as tf -from utils_edm import ( - 
X_FEATURES_CL, - X_FEATURES_TRK, - Y_FEATURES, - generate_examples, - split_sample, -) - -import tensorflow_datasets as tfds - -_DESCRIPTION = """ -CLIC EDM4HEP dataset with Z->tautau - - X: reconstructed tracks and clusters, variable number N per event - - ygen: stable generator particles, zero-padded to N per event - - ycand: baseline particle flow particles, zero-padded to N per event -""" - -_CITATION = """ -""" - - -class ClicEdmZTautauPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("2.3.0") - RELEASE_NOTES = { - "1.3.0": "First version", - "1.4.0": "Fix ycand matching", - "1.5.0": "Regenerate with ARRAY_RECORD", - "2.1.0": "Add ispu, genjets, genmet; disable genjet_idx; truth def not based on gp.status==1", - "2.3.0": "Fix target/truth momentum, st=1 more inclusive: PR352", - } - MANUAL_DOWNLOAD_INSTRUCTIONS = """ - For the raw input files in ROOT EDM4HEP format, please see the citation above. - - The processed tensorflow_dataset can also be downloaded from: - rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep/ ./ - """ - - def __init__(self, *args, **kwargs): - kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD - super(ClicEdmZTautauPf, self).__init__(*args, **kwargs) - - def _info(self) -> tfds.core.DatasetInfo: - """Returns the dataset metadata.""" - return tfds.core.DatasetInfo( - builder=self, - description=_DESCRIPTION, - features=tfds.features.FeaturesDict( - { - "X": tfds.features.Tensor( - shape=( - None, - max(len(X_FEATURES_TRK), len(X_FEATURES_CL)), - ), - dtype=tf.float32, - ), - "ytarget": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), - "ycand": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), - "genmet": tfds.features.Scalar(dtype=tf.float32), - "genjets": tfds.features.Tensor(shape=(None, 4), dtype=tf.float32), - "targetjets": tfds.features.Tensor(shape=(None, 4), dtype=tf.float32), - } - ), - supervised_keys=None, - homepage="", - citation=_CITATION, 
- metadata=tfds.core.MetadataDict( - x_features_track=X_FEATURES_TRK, - x_features_cluster=X_FEATURES_CL, - y_features=Y_FEATURES, - ), - ) - - def _split_generators(self, dl_manager: tfds.download.DownloadManager): - path = dl_manager.manual_dir - return split_sample(Path(path / "p8_ee_Z_Ztautau_ecm380/")) - - def _generate_examples(self, files): - return generate_examples(files) diff --git a/mlpf/heptfds/clic_pf_edm4hep/zh.py b/mlpf/heptfds/clic_pf_edm4hep/zh.py deleted file mode 100644 index a9e9651af..000000000 --- a/mlpf/heptfds/clic_pf_edm4hep/zh.py +++ /dev/null @@ -1,84 +0,0 @@ -from pathlib import Path - -import tensorflow as tf -from utils_edm import ( - X_FEATURES_CL, - X_FEATURES_TRK, - Y_FEATURES, - generate_examples, - split_sample, -) - -import tensorflow_datasets as tfds - -_DESCRIPTION = """ -CLIC EDM4HEP dataset with ZH->tautau - - X: reconstructed tracks and clusters, variable number N per event - - ygen: stable generator particles, zero-padded to N per event - - ycand: baseline particle flow particles, zero-padded to N per event -""" - -_CITATION = """ -Pata, Joosep, Wulff, Eric, Duarte, Javier, Mokhtar, Farouk, Zhang, Mengke, Girone, Maria, & Southwick, David. (2023). -Simulated datasets for detector and particle flow reconstruction: CLIC detector (1.1) [Data set]. -Zenodo. https://doi.org/10.5281/zenodo.8260741 -""" - - -class ClicEdmZhTautauPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("2.3.0") - RELEASE_NOTES = { - "1.3.0": "First version", - "1.4.0": "Fix ycand matching", - "1.5.0": "Regenerate with ARRAY_RECORD", - "2.1.0": "Add ispu, genjets, genmet; disable genjet_idx; truth def not based on gp.status==1", - "2.3.0": "Fix target/truth momentum, st=1 more inclusive: PR352", - } - MANUAL_DOWNLOAD_INSTRUCTIONS = """ - For the raw input files in ROOT EDM4HEP format, please see the citation above. 
- - The processed tensorflow_dataset can also be downloaded from: - rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep/ ./ - """ - - def __init__(self, *args, **kwargs): - kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD - super(ClicEdmZhTautauPf, self).__init__(*args, **kwargs) - - def _info(self) -> tfds.core.DatasetInfo: - """Returns the dataset metadata.""" - return tfds.core.DatasetInfo( - builder=self, - description=_DESCRIPTION, - features=tfds.features.FeaturesDict( - { - "X": tfds.features.Tensor( - shape=( - None, - max(len(X_FEATURES_TRK), len(X_FEATURES_CL)), - ), - dtype=tf.float32, - ), - "ytarget": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), - "ycand": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), - "genmet": tfds.features.Scalar(dtype=tf.float32), - "genjets": tfds.features.Tensor(shape=(None, 4), dtype=tf.float32), - "targetjets": tfds.features.Tensor(shape=(None, 4), dtype=tf.float32), - } - ), - supervised_keys=None, - homepage="", - citation=_CITATION, - metadata=tfds.core.MetadataDict( - x_features_track=X_FEATURES_TRK, - x_features_cluster=X_FEATURES_CL, - y_features=Y_FEATURES, - ), - ) - - def _split_generators(self, dl_manager: tfds.download.DownloadManager): - path = dl_manager.manual_dir - return split_sample(Path(path / "p8_ee_ZH_Htautau_ecm380/")) - - def _generate_examples(self, files): - return generate_examples(files) diff --git a/mlpf/heptfds/cms_pf/cms_utils.py b/mlpf/heptfds/cms_pf/cms_utils.py index d070f6a88..810034dd2 100644 --- a/mlpf/heptfds/cms_pf/cms_utils.py +++ b/mlpf/heptfds/cms_pf/cms_utils.py @@ -114,11 +114,29 @@ "ispu", "generatorStatus", "simulatorStatus", - "gp_to_track", - "gp_to_cluster", + "cp_to_track", + "cp_to_cluster", "jet_idx", ] +# split each dataset into equal parts for faster building +NUM_SPLITS = 10 + + +def map_pdgid_to_candid(pdgid, charge): + if pdgid == 0: + return 0 + + if pdgid in [22, 11, 13]: + return pdgid + + # 
charged hadron + if abs(charge) > 0: + return 211 + + # neutral hadron + return 130 + def prepare_data_cms(fn): Xs = [] @@ -151,7 +169,8 @@ def prepare_data_cms(fn): Xelem["sin_phi"] = np.sin(Xelem["phi"]) Xelem["cos_phi"] = np.cos(Xelem["phi"]) Xelem["typ_idx"] = np.array([ELEM_LABELS_CMS.index(int(i)) for i in Xelem["typ"]], dtype=np.float32) - ytarget["typ_idx"] = np.array([CLASS_LABELS_CMS.index(abs(int(i))) for i in ytarget["pid"]], dtype=np.float32) + pids_remapped = [map_pdgid_to_candid(abs(int(pid)), q) for (pid, q) in zip(ytarget["pid"], ytarget["charge"])] + ytarget["typ_idx"] = np.array([CLASS_LABELS_CMS.index(pid) for pid in pids_remapped], dtype=np.float32) ycand["typ_idx"] = np.array([CLASS_LABELS_CMS.index(abs(int(i))) for i in ycand["pid"]], dtype=np.float32) Xelem_flat = ak.to_numpy( @@ -187,18 +206,35 @@ def prepare_data_cms(fn): return Xs, ytargets, ycands, genmets, genjets, targetjets -def split_sample(path, test_frac=0.9): +def split_list(lst, x): + # Calculate the size of each sublist (except potentially the last) + sublist_size = len(lst) // x + + # Create x-1 sublists of equal size + result = [lst[i * sublist_size : (i + 1) * sublist_size] for i in range(x - 1)] + + # Add the remaining elements to the last sublist + result.append(lst[(x - 1) * sublist_size :]) + + return result + + +def split_sample(path, builder_config, num_splits=NUM_SPLITS, train_frac=0.9): files = sorted(list(path.glob("*.pkl*"))) print("Found {} files in {}".format(len(files), path)) assert len(files) > 0 - idx_split = int(test_frac * len(files)) - files_train = files[:idx_split] - files_test = files[idx_split:] + idx_test = int(train_frac * len(files)) + files_train = files[:idx_test] + files_test = files[idx_test:] assert len(files_train) > 0 assert len(files_test) > 0 + + split_index = int(builder_config.name) - 1 + files_train_split = split_list(files_train, num_splits) + files_test_split = split_list(files_test, num_splits) return { - "train": 
generate_examples(files_train), - "test": generate_examples(files_test), + "train": generate_examples(files_train_split[split_index]), + "test": generate_examples(files_test_split[split_index]), } diff --git a/mlpf/heptfds/cms_pf/qcd.py b/mlpf/heptfds/cms_pf/qcd.py index 194ecc96e..be4eff03d 100644 --- a/mlpf/heptfds/cms_pf/qcd.py +++ b/mlpf/heptfds/cms_pf/qcd.py @@ -1,7 +1,6 @@ """CMS PF QCD dataset.""" import cms_utils import tensorflow as tf - import tensorflow_datasets as tfds X_FEATURES = cms_utils.X_FEATURES @@ -21,7 +20,7 @@ class CmsPfQcd(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_qcd dataset.""" - VERSION = tfds.core.Version("2.4.0") + VERSION = tfds.core.Version("2.5.0") RELEASE_NOTES = { "1.3.0": "12_2_0_pre2 generation with updated caloparticle/trackingparticle", "1.3.1": "Remove PS again", @@ -35,18 +34,21 @@ class CmsPfQcd(tfds.core.GeneratorBasedBuilder): "2.1.0": "Additional stats", "2.3.0": "Split CaloParticles along tracks", "2.4.0": "Add gp_to_track, gp_to_cluster, jet_idx", + "2.5.0": "Remove neutrinos from genjets, split to 10", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/tensorflow_datasets/cms/cms_pf_qcd ~/tensorflow_datasets/ """ + # create configs 1 ... 
NUM_SPLITS + 1 that allow to parallelize the dataset building + BUILDER_CONFIGS = [tfds.core.BuilderConfig(name=str(group)) for group in range(1, cms_utils.NUM_SPLITS + 1)] + def __init__(self, *args, **kwargs): kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD super(CmsPfQcd, self).__init__(*args, **kwargs) def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" - # TODO(cms_pf): Specifies the tfds.core.DatasetInfo object return tfds.core.DatasetInfo( builder=self, description=_DESCRIPTION, @@ -60,8 +62,7 @@ def _info(self) -> tfds.core.DatasetInfo: "targetjets": tfds.features.Tensor(shape=(None, 4), dtype=tf.float32), } ), - supervised_keys=("X", "ytarget"), - homepage="", + homepage="https://github.com/jpata/particleflow", citation=_CITATION, metadata=tfds.core.MetadataDict(x_features=X_FEATURES, y_features=Y_FEATURES), ) @@ -70,7 +71,7 @@ def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" path = dl_manager.manual_dir sample_dir = "QCDForPF_14TeV_TuneCUETP8M1_cfi" - return cms_utils.split_sample(path / sample_dir / "raw") + return cms_utils.split_sample(path / sample_dir / "raw", self.builder_config, num_splits=cms_utils.NUM_SPLITS) def _generate_examples(self, files): return cms_utils.generate_examples(files) diff --git a/mlpf/heptfds/cms_pf/qcd_nopu.py b/mlpf/heptfds/cms_pf/qcd_nopu.py index 89ff6b61b..517866468 100644 --- a/mlpf/heptfds/cms_pf/qcd_nopu.py +++ b/mlpf/heptfds/cms_pf/qcd_nopu.py @@ -21,15 +21,19 @@ class CmsPfQcdNopu(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_qcd_nopu dataset.""" - VERSION = tfds.core.Version("2.4.0") + VERSION = tfds.core.Version("2.5.0") RELEASE_NOTES = { "2.0.0": "New truth def based primarily on CaloParticles", "2.4.0": "Add gp_to_track, gp_to_cluster, jet_idx", + "2.5.0": "Remove neutrinos from truth jets, split to 10", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress 
lxplus.cern.ch:/eos/user/j/jpata/mlpf/tensorflow_datasets/cms/cms_pf_qcd_nopu ~/tensorflow_datasets/ """ + # create configs 1 ... NUM_SPLITS + 1 that allow to parallelize the dataset building + BUILDER_CONFIGS = [tfds.core.BuilderConfig(name=str(group)) for group in range(1, cms_utils.NUM_SPLITS + 1)] + def __init__(self, *args, **kwargs): kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD super(CmsPfQcdNopu, self).__init__(*args, **kwargs) @@ -49,8 +53,7 @@ def _info(self) -> tfds.core.DatasetInfo: "targetjets": tfds.features.Tensor(shape=(None, 4), dtype=tf.float32), } ), - supervised_keys=("X", "ytarget"), - homepage="", + homepage="https://github.com/jpata/particleflow", citation=_CITATION, metadata=tfds.core.MetadataDict(x_features=X_FEATURES, y_features=Y_FEATURES), ) @@ -59,7 +62,7 @@ def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" path = dl_manager.manual_dir sample_dir = "QCDForPF_14TeV_TuneCUETP8M1_cfi" - return cms_utils.split_sample(path / sample_dir / "raw") + return cms_utils.split_sample(path / sample_dir / "raw", self.builder_config, num_splits=cms_utils.NUM_SPLITS) def _generate_examples(self, files): return cms_utils.generate_examples(files) diff --git a/mlpf/heptfds/cms_pf/vbf_nopu.py b/mlpf/heptfds/cms_pf/singleele.py similarity index 62% rename from mlpf/heptfds/cms_pf/vbf_nopu.py rename to mlpf/heptfds/cms_pf/singleele.py index 98a0d094c..aa18abb81 100644 --- a/mlpf/heptfds/cms_pf/vbf_nopu.py +++ b/mlpf/heptfds/cms_pf/singleele.py @@ -1,7 +1,6 @@ -"""CMS PF TTbar dataset.""" +"""CMS PF SingleEle dataset.""" import cms_utils import tensorflow as tf - import tensorflow_datasets as tfds X_FEATURES = cms_utils.X_FEATURES @@ -10,7 +9,6 @@ _DESCRIPTION = """ Dataset generated with CMSSW and full detector sim. -VBF events without PU in a Run3 setup. 
""" # TODO(cms_pf): BibTeX citation @@ -18,22 +16,23 @@ """ -class CmsPfVbfNopu(tfds.core.GeneratorBasedBuilder): - """DatasetBuilder for cms_pf_vbf_nopu dataset.""" +class CmsPfSingleEle(tfds.core.GeneratorBasedBuilder, skip_registration=True): + """DatasetBuilder for cms_pf_ttbar dataset.""" - VERSION = tfds.core.Version("2.0.0") + VERSION = tfds.core.Version("2.5.0") RELEASE_NOTES = { - "1.7.1": "First version", - "1.8.0": "Add ispu, genjets, genmet; disable genjet_idx; improved merging", - "2.0.0": "New truth def based primarily on CaloParticles", + "2.5.0": "First version", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ - rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/tensorflow_datasets/cms/cms_pf_vbf_nopu ~/tensorflow_datasets/ + rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/tensorflow_datasets/cms/cms_pf_ttbar ~/tensorflow_datasets/ """ + # create configs 1 ... NUM_SPLITS + 1 that allow to parallelize the dataset building + BUILDER_CONFIGS = [tfds.core.BuilderConfig(name=str(1))] + def __init__(self, *args, **kwargs): kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD - super(CmsPfVbfNopu, self).__init__(*args, **kwargs) + super(CmsPfSingleEle, self).__init__(*args, **kwargs) def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" @@ -43,14 +42,14 @@ def _info(self) -> tfds.core.DatasetInfo: features=tfds.features.FeaturesDict( { "X": tfds.features.Tensor(shape=(None, len(X_FEATURES)), dtype=tf.float32), - "ygen": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), + "ytarget": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), "ycand": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), "genmet": tfds.features.Scalar(dtype=tf.float32), "genjets": tfds.features.Tensor(shape=(None, 4), dtype=tf.float32), + "targetjets": tfds.features.Tensor(shape=(None, 4), dtype=tf.float32), } ), - supervised_keys=("X", "ygen"), - homepage="", + 
homepage="https://github.com/jpata/particleflow", citation=_CITATION, metadata=tfds.core.MetadataDict(x_features=X_FEATURES, y_features=Y_FEATURES), ) @@ -58,8 +57,8 @@ def _info(self) -> tfds.core.DatasetInfo: def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" path = dl_manager.manual_dir - sample_dir = "VBF_TuneCP5_14TeV_pythia8_cfi" - return cms_utils.split_sample(path / sample_dir / "raw") + sample_dir = "SingleElectronFlatPt1To1000_pythia8_cfi" + return cms_utils.split_sample(path / sample_dir / "raw", self.builder_config, num_splits=1, train_frac=0.1) def _generate_examples(self, files): return cms_utils.generate_examples(files) diff --git a/mlpf/heptfds/cms_pf/ttbar.py b/mlpf/heptfds/cms_pf/ttbar.py index eae5a62a4..babff7729 100644 --- a/mlpf/heptfds/cms_pf/ttbar.py +++ b/mlpf/heptfds/cms_pf/ttbar.py @@ -1,7 +1,6 @@ """CMS PF TTbar dataset.""" import cms_utils import tensorflow as tf - import tensorflow_datasets as tfds X_FEATURES = cms_utils.X_FEATURES @@ -18,10 +17,10 @@ """ -class CmsPfTtbar(tfds.core.GeneratorBasedBuilder): - """DatasetBuilder for cms_pf dataset.""" +class CmsPfTtbar(tfds.core.GeneratorBasedBuilder, skip_registration=True): + """DatasetBuilder for cms_pf_ttbar dataset.""" - VERSION = tfds.core.Version("2.4.0") + VERSION = tfds.core.Version("2.5.0") RELEASE_NOTES = { "1.0.0": "Initial release.", "1.1.0": "Add muon type, fix electron GSF association", @@ -40,11 +39,15 @@ class CmsPfTtbar(tfds.core.GeneratorBasedBuilder): "2.2.0": "Split CaloParticles along tracks", "2.3.0": "Increase stats", "2.4.0": "Add gp_to_track, gp_to_cluster, jet_idx", + "2.5.0": "Remove neutrinos from genjets, split to 10", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/tensorflow_datasets/cms/cms_pf_ttbar ~/tensorflow_datasets/ """ + # create configs 1 ... 
NUM_SPLITS + 1 that allow to parallelize the dataset building + BUILDER_CONFIGS = [tfds.core.BuilderConfig(name=str(group)) for group in range(1, cms_utils.NUM_SPLITS + 1)] + def __init__(self, *args, **kwargs): kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD super(CmsPfTtbar, self).__init__(*args, **kwargs) @@ -64,8 +67,7 @@ def _info(self) -> tfds.core.DatasetInfo: "targetjets": tfds.features.Tensor(shape=(None, 4), dtype=tf.float32), } ), - supervised_keys=("X", "ytarget"), - homepage="", + homepage="https://github.com/jpata/particleflow", citation=_CITATION, metadata=tfds.core.MetadataDict(x_features=X_FEATURES, y_features=Y_FEATURES), ) @@ -74,7 +76,7 @@ def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" path = dl_manager.manual_dir sample_dir = "TTbar_14TeV_TuneCUETP8M1_cfi" - return cms_utils.split_sample(path / sample_dir / "raw") + return cms_utils.split_sample(path / sample_dir / "raw", self.builder_config, num_splits=cms_utils.NUM_SPLITS) def _generate_examples(self, files): return cms_utils.generate_examples(files) diff --git a/mlpf/heptfds/cms_pf/ttbar_nopu.py b/mlpf/heptfds/cms_pf/ttbar_nopu.py index 002e6342f..112714295 100644 --- a/mlpf/heptfds/cms_pf/ttbar_nopu.py +++ b/mlpf/heptfds/cms_pf/ttbar_nopu.py @@ -21,7 +21,7 @@ class CmsPfTtbarNopu(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_ttbar_nopu dataset.""" - VERSION = tfds.core.Version("2.4.0") + VERSION = tfds.core.Version("2.5.0") RELEASE_NOTES = { "1.7.1": "First version", "1.8.0": "Add ispu, genjets, genmet; disable genjet_idx; improved merging", @@ -29,11 +29,15 @@ class CmsPfTtbarNopu(tfds.core.GeneratorBasedBuilder): "2.2.0": "Split CaloParticles along tracks", "2.3.0": "Additional stats", "2.4.0": "Add gp_to_track, gp_to_cluster, jet_idx", + "2.5.0": "Remove neutrinos from genjets, split to 10", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress 
lxplus.cern.ch:/eos/user/j/jpata/mlpf/tensorflow_datasets/cms/cms_pf_ttbar_nopu ~/tensorflow_datasets/ """ + # create configs 1 ... NUM_SPLITS + 1 that allow to parallelize the dataset building + BUILDER_CONFIGS = [tfds.core.BuilderConfig(name=str(group)) for group in range(1, cms_utils.NUM_SPLITS + 1)] + def __init__(self, *args, **kwargs): kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD super(CmsPfTtbarNopu, self).__init__(*args, **kwargs) @@ -53,8 +57,7 @@ def _info(self) -> tfds.core.DatasetInfo: "targetjets": tfds.features.Tensor(shape=(None, 4), dtype=tf.float32), } ), - supervised_keys=("X", "ytarget"), - homepage="", + homepage="https://github.com/jpata/particleflow", citation=_CITATION, metadata=tfds.core.MetadataDict(x_features=X_FEATURES, y_features=Y_FEATURES), ) @@ -63,7 +66,7 @@ def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" path = dl_manager.manual_dir sample_dir = "TTbar_14TeV_TuneCUETP8M1_cfi" - return cms_utils.split_sample(path / sample_dir / "raw") + return cms_utils.split_sample(path / sample_dir / "raw", self.builder_config, num_splits=cms_utils.NUM_SPLITS) def _generate_examples(self, files): return cms_utils.generate_examples(files) diff --git a/mlpf/heptfds/cms_pf/multiparticlegun.py b/mlpf/heptfds/cms_pf/ztt.py similarity index 56% rename from mlpf/heptfds/cms_pf/multiparticlegun.py rename to mlpf/heptfds/cms_pf/ztt.py index 567793d5f..d7a19f5ff 100644 --- a/mlpf/heptfds/cms_pf/multiparticlegun.py +++ b/mlpf/heptfds/cms_pf/ztt.py @@ -1,7 +1,6 @@ -"""CMS PF SinglePi dataset.""" +"""CMS PF TTbar dataset.""" import cms_utils import tensorflow as tf - import tensorflow_datasets as tfds X_FEATURES = cms_utils.X_FEATURES @@ -10,7 +9,7 @@ _DESCRIPTION = """ Dataset generated with CMSSW and full detector sim. -Multi-particle gun events. +Ztautau, all-hadronic events with PU 55-75 in a Run3 setup. 
""" # TODO(cms_pf): BibTeX citation @@ -18,44 +17,40 @@ """ -class CmsPfMultiParticleGun(tfds.core.GeneratorBasedBuilder): - """DatasetBuilder for cms_pf_multi_particle_gun dataset.""" +class CmsPfZtt(tfds.core.GeneratorBasedBuilder, skip_registration=True): + """DatasetBuilder for cms_pf_ztt dataset.""" - VERSION = tfds.core.Version("2.0.0") + VERSION = tfds.core.Version("2.5.0") RELEASE_NOTES = { - "1.6.0": "Initial release", - "1.6.1": "Additional stats", - "1.7.0": "Add cluster shape vars", - "1.7.1": "Additional stats", - "2.0.0": "New truth def based primarily on CaloParticles", + "2.5.0": "Remove neutrinos from genjets, split to 10", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ - rsync -r --progress \ - lxplus.cern.ch:/eos/user/j/jpata/mlpf/tensorflow_datasets/cms/cms_pf_multi_particle_gun \ - ~/tensorflow_datasets/ + rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/tensorflow_datasets/cms/cms_pf_ztt ~/tensorflow_datasets/ """ + # create configs 1 ... NUM_SPLITS + 1 that allow to parallelize the dataset building + BUILDER_CONFIGS = [tfds.core.BuilderConfig(name=str(group)) for group in range(1, cms_utils.NUM_SPLITS + 1)] + def __init__(self, *args, **kwargs): kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD - super(CmsPfMultiParticleGun, self).__init__(*args, **kwargs) + super(CmsPfZtt, self).__init__(*args, **kwargs) def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" - # TODO(cms_pf): Specifies the tfds.core.DatasetInfo object return tfds.core.DatasetInfo( builder=self, description=_DESCRIPTION, features=tfds.features.FeaturesDict( { "X": tfds.features.Tensor(shape=(None, len(X_FEATURES)), dtype=tf.float32), - "ygen": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), + "ytarget": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), "ycand": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), "genmet": tfds.features.Scalar(dtype=tf.float32), "genjets": 
tfds.features.Tensor(shape=(None, 4), dtype=tf.float32), + "targetjets": tfds.features.Tensor(shape=(None, 4), dtype=tf.float32), } ), - supervised_keys=("X", "ygen"), - homepage="", + homepage="https://github.com/jpata/particleflow", citation=_CITATION, metadata=tfds.core.MetadataDict(x_features=X_FEATURES, y_features=Y_FEATURES), ) @@ -63,8 +58,8 @@ def _info(self) -> tfds.core.DatasetInfo: def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" path = dl_manager.manual_dir - sample_dir = "MultiParticlePFGun50_cfi" - return cms_utils.split_sample(path / sample_dir / "raw") + sample_dir = "ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi" + return cms_utils.split_sample(path / sample_dir / "raw", self.builder_config, num_splits=cms_utils.NUM_SPLITS) def _generate_examples(self, files): return cms_utils.generate_examples(files) diff --git a/mlpf/heptfds/cms_pf/vbf.py b/mlpf/heptfds/cms_pf/ztt_nopu.py similarity index 60% rename from mlpf/heptfds/cms_pf/vbf.py rename to mlpf/heptfds/cms_pf/ztt_nopu.py index ee05cbc1c..12dfc2302 100644 --- a/mlpf/heptfds/cms_pf/vbf.py +++ b/mlpf/heptfds/cms_pf/ztt_nopu.py @@ -1,7 +1,6 @@ """CMS PF TTbar dataset.""" import cms_utils import tensorflow as tf - import tensorflow_datasets as tfds X_FEATURES = cms_utils.X_FEATURES @@ -10,7 +9,7 @@ _DESCRIPTION = """ Dataset generated with CMSSW and full detector sim. -VBF events with PU 55-75 in a Run3 setup. +Ztautau, all-hadronic events with PU 55-75 in a Run3 setup. 
""" # TODO(cms_pf): BibTeX citation @@ -18,23 +17,23 @@ """ -class CmsPfVbf(tfds.core.GeneratorBasedBuilder): - """DatasetBuilder for cms_pf_vbf dataset.""" +class CmsPfZttNopu(tfds.core.GeneratorBasedBuilder): + """DatasetBuilder for cms_pf_ztt_nopu dataset.""" - VERSION = tfds.core.Version("2.1.0") + VERSION = tfds.core.Version("2.5.0") RELEASE_NOTES = { - "1.7.1": "First version", - "1.8.0": "Add ispu, genjets, genmet; disable genjet_idx; improved merging", - "2.0.0": "New truth def based primarily on CaloParticles", - "2.1.0": "Additional statistics", + "2.5.0": "Remove neutrinos from genjets, split to 10", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ - rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/tensorflow_datasets/cms/cms_pf_vbf ~/tensorflow_datasets/ + rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/tensorflow_datasets/cms/cms_pf_ztt ~/tensorflow_datasets/ """ + # create configs 1 ... NUM_SPLITS + 1 that allow to parallelize the dataset building + BUILDER_CONFIGS = [tfds.core.BuilderConfig(name=str(group)) for group in range(1, cms_utils.NUM_SPLITS + 1)] + def __init__(self, *args, **kwargs): kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD - super(CmsPfVbf, self).__init__(*args, **kwargs) + super(CmsPfZttNopu, self).__init__(*args, **kwargs) def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" @@ -44,14 +43,14 @@ def _info(self) -> tfds.core.DatasetInfo: features=tfds.features.FeaturesDict( { "X": tfds.features.Tensor(shape=(None, len(X_FEATURES)), dtype=tf.float32), - "ygen": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), + "ytarget": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), "ycand": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), "genmet": tfds.features.Scalar(dtype=tf.float32), "genjets": tfds.features.Tensor(shape=(None, 4), dtype=tf.float32), + "targetjets": tfds.features.Tensor(shape=(None, 4), dtype=tf.float32), } ), - 
supervised_keys=("X", "ygen"), - homepage="", + homepage="https://github.com/jpata/particleflow", citation=_CITATION, metadata=tfds.core.MetadataDict(x_features=X_FEATURES, y_features=Y_FEATURES), ) @@ -59,8 +58,8 @@ def _info(self) -> tfds.core.DatasetInfo: def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" path = dl_manager.manual_dir - sample_dir = "VBF_TuneCP5_14TeV_pythia8_cfi" - return cms_utils.split_sample(path / sample_dir / "raw") + sample_dir = "ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi" + return cms_utils.split_sample(path / sample_dir / "raw", self.builder_config, num_splits=cms_utils.NUM_SPLITS) def _generate_examples(self, files): return cms_utils.generate_examples(files) diff --git a/mlpf/model/PFDataset.py b/mlpf/model/PFDataset.py index 8a8f4379d..57802023f 100644 --- a/mlpf/model/PFDataset.py +++ b/mlpf/model/PFDataset.py @@ -7,6 +7,7 @@ from mlpf.model.logger import _logger import numpy as np +import sys class TFDSDataSource: @@ -112,8 +113,13 @@ def __init__(self, data_dir, name, split, num_samples=None, sort=False): if split == "valid": split = "test" - builder = tfds.builder(name, data_dir=data_dir) - + try: + builder = tfds.builder(name, data_dir=data_dir) + except Exception: + _logger.error( + "Could not find dataset {} in {}, please check that you have downloaded the correct version of the dataset".format(name, data_dir) + ) + sys.exit(1) self.ds = TFDSDataSource(builder.as_data_source(split=split), sort=sort) if num_samples and num_samples < len(self.ds): @@ -220,19 +226,22 @@ def get_interleaved_dataloaders(world_size, rank, config, use_cuda, use_ray): dataset = [] for sample in config[f"{split}_dataset"][config["dataset"]][type_]["samples"]: version = config[f"{split}_dataset"][config["dataset"]][type_]["samples"][sample]["version"] - - ds = PFDataset( - config["data_dir"], - f"{sample}:{version}", - split, - num_samples=config[f"n{split}"], - sort=config["sort_data"], - ).ds - - if 
(rank == 0) or (rank == "cpu"): - _logger.info(f"{split}_dataset: {sample}, {len(ds)}", color="blue") - - dataset.append(ds) + split_configs = config[f"{split}_dataset"][config["dataset"]][type_]["samples"][sample]["splits"] + print("split_configs", split_configs) + + for split_config in split_configs: + ds = PFDataset( + config["data_dir"], + f"{sample}/{split_config}:{version}", + split, + num_samples=config[f"n{split}"], + sort=config["sort_data"], + ).ds + + if (rank == 0) or (rank == "cpu"): + _logger.info(f"{split}_dataset: {sample}, {len(ds)}", color="blue") + + dataset.append(ds) dataset = torch.utils.data.ConcatDataset(dataset) shuffle = split == "train" diff --git a/mlpf/model/inference.py b/mlpf/model/inference.py index 809a276bd..ab9908653 100644 --- a/mlpf/model/inference.py +++ b/mlpf/model/inference.py @@ -32,23 +32,6 @@ from .utils import unpack_predictions, unpack_target -def cluster_particles(data_cls, data_pte, data_etaphi, jet_idx, iev): - pt = data_pte["pt"][iev] - eta = data_etaphi["eta"][iev] - phi = data_etaphi["phi"][iev] - energy = data_pte["energy"][iev] - p4 = np.stack([pt, eta, phi, energy], axis=-1) - - unique_jets = np.unique(jet_idx) - p4 = awkward.Array([p4[jet_idx == i] for i in unique_jets if i != 0]) - p4 = p4[(p4[:, :, 0] != 0) & (p4[:, :, 1] != 0) & (p4[:, :, 2] != 0)] - - p4 = vector.awk(awkward.zip({"pt": p4[:, :, 0], "eta": p4[:, :, 1], "phi": p4[:, :, 2], "energy": p4[:, :, 3]})) - - sum_jets = awkward.sum(p4, axis=1) - return sum_jets - - def predict_one_batch(conv_type, model, i, batch, rank, jetdef, jet_ptcut, jet_match_dr, outpath, dir_name, sample): # skip prediction if output exists diff --git a/mlpf/model/training.py b/mlpf/model/training.py index 122de2e2b..102116618 100644 --- a/mlpf/model/training.py +++ b/mlpf/model/training.py @@ -1030,7 +1030,19 @@ def run(rank, world_size, config, args, outdir, logfile): batch_size = config["gpu_batch_multiplier"] version = config["test_dataset"][sample]["version"] - ds = 
PFDataset(config["data_dir"], f"{sample}:{version}", "test", num_samples=config["ntest"]).ds + split_configs = config["test_dataset"][sample]["splits"] + print("split_configs", split_configs) + + dataset = [] + + ntest = None + if not (config["ntest"] is None): + ntest = config["ntest"] // len(split_configs) + + for split_config in split_configs: + ds = PFDataset(config["data_dir"], f"{sample}/{split_config}:{version}", "test", num_samples=ntest).ds + dataset.append(ds) + ds = torch.utils.data.ConcatDataset(dataset) if (rank == 0) or (rank == "cpu"): _logger.info(f"test_dataset: {sample}, {len(ds)}", color="blue") diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index e384266f3..ae405bc3f 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -49,7 +49,6 @@ ) parser.add_argument("--num-convs", type=int, default=None, help="number of cross-particle convolution (GNN, attention, Mamba) layers") parser.add_argument("--make-plots", action="store_true", default=None, help="make plots of the test predictions") -parser.add_argument("--export-onnx", action="store_true", default=None, help="exports the model to onnx") parser.add_argument("--ntrain", type=int, default=None, help="training samples to use, if None use entire dataset") parser.add_argument("--ntest", type=int, default=None, help="training samples to use, if None use entire dataset") parser.add_argument("--nvalid", type=int, default=None, help="validation samples to use") @@ -141,7 +140,10 @@ def main(): "samples": {"cms_pf_ttbar": config[ds]["cms"]["physical_pu"]["samples"]["cms_pf_ttbar"]}, } } + # load only the last config split + config[ds]["cms"]["physical_pu"]["samples"]["cms_pf_ttbar"]["splits"] = ["10"] config["test_dataset"] = {"cms_pf_ttbar": config["test_dataset"]["cms_pf_ttbar"]} + config["test_dataset"]["cms_pf_ttbar"]["splits"] = ["10"] # override loaded config with values from command line args config = override_config(config, args) diff --git a/mlpf/plotting/plots_cms.py 
b/mlpf/plotting/plots_cms.py deleted file mode 100644 index a6c229fdd..000000000 --- a/mlpf/plotting/plots_cms.py +++ /dev/null @@ -1,114 +0,0 @@ -import os.path as osp - -import matplotlib.pyplot as plt -import numpy as np -import sklearn.metrics -from plot_utils import ( - plot_confusion_matrix, - plot_E_reso, - plot_eta_reso, - plot_phi_reso, -) - -class_labels = list(range(8)) - - -def deltaphi(phi1, phi2): - return np.fmod(phi1 - phi2 + np.pi, 2 * np.pi) - np.pi - - -def prepare_resolution_plots(big_df, pid, bins, target="cand", outpath="./"): - msk_true = big_df["{}_pid".format(target)] == pid - msk_pred = big_df["pred_pid"] == pid - msk_both = msk_true & msk_pred - v0 = big_df[["{}_e".format(target), "pred_e"]].values - v1 = big_df[["{}_eta".format(target), "pred_eta"]].values - v2 = big_df[["{}_phi".format(target), "pred_phi"]].values - - plot_E_reso( - big_df, - pid, - v0, - msk_true, - msk_pred, - msk_both, - bins, - target=target, - outpath=outpath, - ) - plot_eta_reso( - big_df, - pid, - v1, - msk_true, - msk_pred, - msk_both, - bins, - target=target, - outpath=outpath, - ) - plot_phi_reso( - big_df, - pid, - v2, - msk_true, - msk_pred, - msk_both, - bins, - target=target, - outpath=outpath, - ) - - -def load_np(npfile): - X = np.load(npfile)["X"] - ycand = np.load(npfile)["ycand"] - ypred = np.load(npfile)["ypred"] - return X, ycand, ypred - - -def flatten(arr): - return arr.reshape((arr.shape[0] * arr.shape[1], arr.shape[2])) - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument( - "--target", - type=str, - choices=["cand", "gen"], - help="Regress to PFCandidates or GenParticles", - default="cand", - ) - parser.add_argument("--input", type=str, required=True) - args = parser.parse_args() - - X, ycand, ypred = load_np(args.input) - - X_flat = flatten(X) - ycand_flat = flatten(ycand) - ypred_flat = flatten(ypred) - msk = X_flat[:, 0] != 0 - - confusion = 
sklearn.metrics.confusion_matrix(ycand_flat[msk, 0], ypred_flat[msk, 0], labels=range(8)) - - fig, ax = plot_confusion_matrix( - cm=confusion, - target_names=[int(x) for x in class_labels], - normalize=True, - ) - - plt.savefig( - osp.join(osp.dirname(args.input), "confusion_mlpf.pdf"), - bbox_inches="tight", - ) - -# prepare_resolution_plots(big_df, 211, bins[211], target=args.target, outpath=osp.dirname(args.input)) -# prepare_resolution_plots(big_df, 130, bins[130], target=args.target, outpath=osp.dirname(args.input)) -# prepare_resolution_plots(big_df, 11, bins[11], target=args.target, outpath=osp.dirname(args.input)) -# prepare_resolution_plots(big_df, 13, bins[13], target=args.target, outpath=osp.dirname(args.input)) -# prepare_resolution_plots(big_df, 22, bins[22], target=args.target, outpath=osp.dirname(args.input)) -# prepare_resolution_plots(big_df, 1, bins[1], target=args.target, outpath=osp.dirname(args.input)) -# prepare_resolution_plots(big_df, 2, bins[2], target=args.target, outpath=osp.dirname(args.input)) diff --git a/notebooks/cms/cms-validate-onnx.ipynb b/notebooks/cms/cms-validate-onnx.ipynb index b3f0cb377..3223575a3 100644 --- a/notebooks/cms/cms-validate-onnx.ipynb +++ b/notebooks/cms/cms-validate-onnx.ipynb @@ -31,9 +31,10 @@ "from onnxconverter_common import float16\n", "from onnxscript.function_libs.torch_lib.tensor_typing import TFloat\n", "\n", - "sys.path.append(\"../../mlpf\")\n", - "from pyg.mlpf import MLPF\n", - "from pyg.utils import unpack_predictions, unpack_target" + "sys.path.append(\"../../\")\n", + "import mlpf\n", + "from mlpf.model.mlpf import MLPF\n", + "from mlpf.model.utils import unpack_predictions, unpack_target" ] }, { @@ -61,11 +62,11 @@ "dataset = \"cms_pf_ttbar\"\n", "\n", "#model checkpoints are here:\n", - "outdir = \"../../experiments/pyg-cms_20240710_123023_806687/\"\n", + "outdir = \"../../experiments/pyg-cms_20241101_090645_682892/\"\n", "\n", "#Load model arguments from existing training\n", "model_state = 
torch.load(\n", - " outdir + \"/checkpoints/checkpoint-06-20.165181.pth\", map_location=torch.device(\"cpu\")\n", + " outdir + \"/checkpoints/checkpoint-08-2.986092.pth\", map_location=torch.device(\"cpu\")\n", ")\n", "with open(f\"{outdir}/model_kwargs.pkl\", \"rb\") as f:\n", " model_kwargs = pkl.load(f)\n", @@ -250,9 +251,13 @@ " width = num_heads * head_dim\n", "\n", " # embedding of the inputs\n", - " self.nn0_id = ffn(self.input_dim, embedding_dim, width, self.act, dropout_ff)\n", - " self.nn0_reg = ffn(self.input_dim, embedding_dim, width, self.act, dropout_ff)\n", - "\n", + " self.nn0_id = nn.ModuleList()\n", + " for ielem in range(len(self.elemtypes_nonzero)):\n", + " self.nn0_id.append(ffn(self.input_dim, embedding_dim, width, self.act, dropout_ff))\n", + " self.nn0_reg = nn.ModuleList()\n", + " for ielem in range(len(self.elemtypes_nonzero)):\n", + " self.nn0_reg.append(ffn(self.input_dim, embedding_dim, width, self.act, dropout_ff))\n", + " \n", " self.conv_id = nn.ModuleList()\n", " self.conv_reg = nn.ModuleList()\n", "\n", @@ -281,8 +286,9 @@ " decoding_dim = self.input_dim + embedding_dim\n", "\n", " # DNN that acts on the node level to predict the PID\n", - " self.nn_id = ffn(decoding_dim, num_classes, width, self.act, dropout_ff)\n", - "\n", + " self.nn_binary_particle = ffn(decoding_dim, 2, width, self.act, dropout_ff)\n", + " self.nn_pid = ffn(decoding_dim, num_classes, width, self.act, dropout_ff)\n", + " \n", " # elementwise DNN for node momentum regression\n", " embed_dim = decoding_dim + num_classes\n", " self.nn_pt = RegressionOutput(\"linear\", embed_dim, width, self.act, dropout_ff, self.elemtypes_nonzero)\n", @@ -296,8 +302,15 @@ " Xfeat_normed = X_features\n", "\n", " embeddings_id, embeddings_reg = [], []\n", - " embedding_id = self.nn0_id(Xfeat_normed)\n", - " embedding_reg = self.nn0_reg(Xfeat_normed)\n", + " \n", + " embedding_id = torch.stack([nn0(Xfeat_normed) for nn0 in self.nn0_id], axis=-1)\n", + " elemtype_mask = 
torch.cat([X_features[..., 0:1] == elemtype for elemtype in self.elemtypes_nonzero], axis=-1)\n", + " embedding_id = torch.sum(embedding_id * elemtype_mask.unsqueeze(-2), axis=-1)\n", + "\n", + " embedding_reg = torch.stack([nn0(Xfeat_normed) for nn0 in self.nn0_reg], axis=-1)\n", + " elemtype_mask = torch.cat([X_features[..., 0:1] == elemtype for elemtype in self.elemtypes_nonzero], axis=-1)\n", + " embedding_reg = torch.sum(embedding_reg * elemtype_mask.unsqueeze(-2), axis=-1)\n", + "\n", "\n", " for num, conv in enumerate(self.conv_id):\n", " conv_input = embedding_id if num == 0 else embeddings_id[-1]\n", @@ -724,7 +737,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/notebooks/cms/cms-validate-postprocessing.ipynb b/notebooks/cms/cms-validate-postprocessing.ipynb index d75a90354..a184e88e6 100644 --- a/notebooks/cms/cms-validate-postprocessing.ipynb +++ b/notebooks/cms/cms-validate-postprocessing.ipynb @@ -55,12 +55,12 @@ "source": [ "#https://jpata.web.cern.ch/jpata/mlpf/cms/20240823_simcluster/nopu/TTbar_14TeV_TuneCUETP8M1_cfi/raw\n", "\n", - "sample = \"TTbar_14TeV_TuneCUETP8M1_cfi\"\n", + "sample = \"QCDForPF_14TeV_TuneCUETP8M1_cfi\"\n", "\n", "pickle_data = sum(\n", " [\n", " pickle.load(bz2.BZ2File(f, \"r\"))\n", - " for f in tqdm.tqdm(sorted(list(glob.glob(\"/local/joosep/mlpf/cms/20240823_simcluster/nopu/{}/raw/*.pkl.bz2\".format(sample))))[:10])\n", + " for f in tqdm.tqdm(sorted(list(glob.glob(\"/local/joosep/mlpf/cms/20240823_simcluster/nopu/{}/raw/*.pkl.bz2\".format(sample))))[:100])\n", " ],\n", " [],\n", ")\n", @@ -455,6 +455,28 @@ " plt.yscale(\"log\")\n", " plt.xlabel(\"jet $p_T$ / genjet $p_T$\")\n", " plt.legend(loc=2, fontsize=12)\n", + " plt.axvline(1.0, color=\"black\", ls=\"--\", lw=0.5)\n", + "\n", + "def plot_jet_ratio_ptcut2(ptcut1, ptcut2):\n", + " plt.figure(figsize=(5,5))\n", + " b = np.linspace(0.5,1.5,100)\n", + "\n", + 
" pt = jets_coll[\"cmssw\"][cmssw_to_ytarget[\"cmssw\"]].pt\n", + " plt.hist(\n", + " awkward.flatten(\n", + " (jets_coll[\"ytarget\"][cmssw_to_ytarget[\"ytarget\"]].pt / jets_coll[\"cmssw\"][cmssw_to_ytarget[\"cmssw\"]].pt)[(pt>=ptcut1) & (pt=ptcut1) & (pt0) & (np.sum(pid2==pid)>0):\n", + " plt.sca(axs[iax])\n", + " plt.hist(awkward.flatten(particles_pythia[mask_pythia_nonu & (pid1==pid)][\"gen_pt\"]), bins=b, label=\"Pythia\", histtype=\"step\")\n", + " plt.hist(awkward.flatten(particles_cp[mask_cp & (pid2==pid)][\"caloparticle_pt\"]), bins=b, label=\"CaloParticle\", histtype=\"step\")\n", + " \n", + " plt.xscale(\"log\")\n", + " plt.yscale(\"log\")\n", + " plt.xlabel(\"Particle $p_T$ [GeV]\")\n", + " plt.legend(fontsize=8)\n", + " plt.title(pid)\n", + " iax += 1\n", + " plt.axvline(0.3)\n", + "plt.tight_layout()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -171,7 +205,7 @@ "plt.sca(axs[1])\n", "\n", "#it seems like caloparticles and p\n", - "pythia_ptcut = (particles_pythia[\"gen_pt\"]>0.25)\n", + "pythia_ptcut = ((particles_pythia[\"gen_pdgid\"]==22) & (particles_pythia[\"gen_pt\"]>0.25)) | (particles_pythia[\"gen_pdgid\"]!=22)\n", "\n", "plt.title(\"$p_T>0.25$ GeV\")\n", "plt.hist2d(\n", @@ -300,11 +334,11 @@ "plt.figure(figsize=(5,5))\n", "b = np.logspace(-1,1,600)\n", "\n", - "plt.hist(\n", - " awkward.flatten(\n", - " (jets_coll[\"pythia\"][genjet_to_pythia[\"pythia\"]].pt / jets_coll[\"genjet\"][genjet_to_pythia[\"genjet\"]].pt)\n", - " ), bins=b, histtype=\"step\", lw=1, label=\"Pythia\"\n", - ");\n", + "# plt.hist(\n", + "# awkward.flatten(\n", + "# (jets_coll[\"pythia\"][genjet_to_pythia[\"pythia\"]].pt / jets_coll[\"genjet\"][genjet_to_pythia[\"genjet\"]].pt)\n", + "# ), bins=b, histtype=\"step\", lw=1, label=\"Pythia\"\n", + "# );\n", "\n", "plt.hist(\n", " awkward.flatten(\n", @@ -322,30 +356,60 @@ { "cell_type": "code", "execution_count": null, - "id": "260b8da0-9587-4d98-9edc-c07badbf5f4c", + "id": 
"eb9116c0-7dcf-4338-a7fa-216a8d6b6d33", "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(5,5))\n", - "b = np.logspace(-1,1,600)\n", + "b = np.linspace(0.5,1.5,100)\n", + "\n", + "# plt.hist(\n", + "# awkward.flatten(\n", + "# (jets_coll[\"pythia\"][genjet_to_pythia[\"pythia\"]].pt / jets_coll[\"genjet\"][genjet_to_pythia[\"genjet\"]].pt)\n", + "# ), bins=b, histtype=\"step\", lw=1, label=\"Pythia\"\n", + "# );\n", "\n", "plt.hist(\n", " awkward.flatten(\n", - " (jets_coll[\"cp\"][pythia_to_cp[\"cp\"]].pt / jets_coll[\"pythia\"][pythia_to_cp[\"pythia\"]].pt)\n", - " ), bins=b, histtype=\"step\", lw=1, label=\"CaloParticle vs Pythia\"\n", + " (jets_coll[\"cp\"][genjet_to_cp[\"cp\"]].pt / jets_coll[\"genjet\"][genjet_to_cp[\"genjet\"]].pt)\n", + " ), bins=b, histtype=\"bar\", lw=1, label=\"CaloParticle\"\n", ");\n", "\n", + "#plt.xscale(\"log\")\n", + "#plt.yscale(\"log\")\n", + "plt.xlabel(\"jet $p_T$ / gen-jet $p_T$\")\n", + "plt.legend(loc=1, fontsize=12)\n", + "plt.axvline(1.0, color=\"black\", ls=\"--\", lw=0.5)\n", + "plt.ylim(0,75000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "260b8da0-9587-4d98-9edc-c07badbf5f4c", + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(5,5))\n", + "b = np.logspace(-1,1,600)\n", + "\n", + "# plt.hist(\n", + "# awkward.flatten(\n", + "# (jets_coll[\"cp\"][pythia_to_cp[\"cp\"]].pt / jets_coll[\"pythia\"][pythia_to_cp[\"pythia\"]].pt)\n", + "# ), bins=b, histtype=\"step\", lw=1, label=\"CaloParticle vs Pythia\"\n", + "# );\n", + "\n", "plt.hist(\n", " awkward.flatten(\n", " (jets_coll[\"cp\"][pythia_nonu_to_cp[\"cp\"]].pt / jets_coll[\"pythia_nonu\"][pythia_nonu_to_cp[\"pythia_nonu\"]].pt)\n", " ), bins=b, histtype=\"step\", lw=1, label=\"CaloParticle vs. 
Pythia NoNu\"\n", ");\n", "\n", - "plt.hist(\n", - " awkward.flatten(\n", - " (jets_coll[\"cp\"][pythia_nonu_ptcut_to_cp[\"cp\"]].pt / jets_coll[\"pythia_nonu_ptcut\"][pythia_nonu_ptcut_to_cp[\"pythia_nonu_ptcut\"]].pt)\n", - " ), bins=b, histtype=\"step\", lw=1, label=\"CaloParticle vs. Pythia NoNu, pt>0.25GeV\"\n", - ");\n", + "# plt.hist(\n", + "# awkward.flatten(\n", + "# (jets_coll[\"cp\"][pythia_nonu_ptcut_to_cp[\"cp\"]].pt / jets_coll[\"pythia_nonu_ptcut\"][pythia_nonu_ptcut_to_cp[\"pythia_nonu_ptcut\"]].pt)\n", + "# ), bins=b, histtype=\"step\", lw=1, label=\"CaloParticle vs. Pythia NoNu, pt>0.25GeV\"\n", + "# );\n", "\n", "\n", "plt.xscale(\"log\")\n", @@ -366,13 +430,13 @@ "plt.figure(figsize=(5,5))\n", "b = np.linspace(0.9,1.1,600)\n", "\n", - "pt_pythia = jets_coll[\"pythia\"][pythia_to_cp[\"pythia\"]].pt\n", - "pt_cp = jets_coll[\"cp\"][pythia_to_cp[\"cp\"]].pt\n", - "plt.hist(\n", - " awkward.flatten(\n", - " (pt_cp / pt_pythia)[pt_pythia>50]\n", - " ), bins=b, histtype=\"step\", lw=1, label=\"CaloParticle vs Pythia\"\n", - ");\n", + "# pt_pythia = jets_coll[\"pythia\"][pythia_to_cp[\"pythia\"]].pt\n", + "# pt_cp = jets_coll[\"cp\"][pythia_to_cp[\"cp\"]].pt\n", + "# plt.hist(\n", + "# awkward.flatten(\n", + "# (pt_cp / pt_pythia)[pt_pythia>50]\n", + "# ), bins=b, histtype=\"step\", lw=1, label=\"CaloParticle vs Pythia\"\n", + "# );\n", "\n", "pt_pythia = jets_coll[\"pythia_nonu\"][pythia_nonu_to_cp[\"pythia_nonu\"]].pt\n", "pt_cp = jets_coll[\"cp\"][pythia_nonu_to_cp[\"cp\"]].pt\n", @@ -382,13 +446,13 @@ " ), bins=b, histtype=\"step\", lw=1, label=\"CaloParticle vs. Pythia NoNu\"\n", ");\n", "\n", - "pt_pythia = jets_coll[\"pythia_nonu_ptcut\"][pythia_nonu_ptcut_to_cp[\"pythia_nonu_ptcut\"]].pt\n", - "pt_cp = jets_coll[\"cp\"][pythia_nonu_ptcut_to_cp[\"cp\"]].pt\n", - "plt.hist(\n", - " awkward.flatten(\n", - " (pt_cp / pt_pythia)[pt_pythia>50]\n", - " ), bins=b, histtype=\"step\", lw=1, label=\"CaloParticle vs. 
Pythia NoNu, pt>0.25GeV\"\n", - ");\n", + "# pt_pythia = jets_coll[\"pythia_nonu_ptcut\"][pythia_nonu_ptcut_to_cp[\"pythia_nonu_ptcut\"]].pt\n", + "# pt_cp = jets_coll[\"cp\"][pythia_nonu_ptcut_to_cp[\"cp\"]].pt\n", + "# plt.hist(\n", + "# awkward.flatten(\n", + "# (pt_cp / pt_pythia)[pt_pythia>50]\n", + "# ), bins=b, histtype=\"step\", lw=1, label=\"CaloParticle vs. Pythia NoNu, pt>0.25GeV\"\n", + "# );\n", "\n", "#plt.xscale(\"log\")\n", "plt.yscale(\"log\")\n", diff --git a/parameters/pytorch/pyg-clic.yaml b/parameters/pytorch/pyg-clic.yaml index f50ff91d2..0ebc7ab62 100644 --- a/parameters/pytorch/pyg-clic.yaml +++ b/parameters/pytorch/pyg-clic.yaml @@ -57,7 +57,7 @@ model: dropout_conv_id_ff: 0.0 dropout_conv_reg_mha: 0.1 dropout_conv_reg_ff: 0.1 - activation: "gelu" + activation: "relu" head_dim: 32 num_heads: 32 attention_type: math @@ -107,11 +107,14 @@ train_dataset: batch_size: 1 samples: clic_edm_qq_pf: - version: 2.3.0 + version: 2.5.0 + splits: [1,2,3,4,5,6,7,8,9,10] clic_edm_ttbar_pf: - version: 2.3.0 + version: 2.5.0 + splits: [1,2,3,4,5,6,7,8,9,10] clic_edm_ww_fullhad_pf: - version: 2.3.0 + version: 2.5.0 + splits: [1,2,3,4,5,6,7,8,9,10] valid_dataset: clic: @@ -119,16 +122,22 @@ valid_dataset: batch_size: 1 samples: clic_edm_qq_pf: - version: 2.3.0 + version: 2.5.0 + splits: [1,2,3,4,5,6,7,8,9,10] clic_edm_ttbar_pf: - version: 2.3.0 + version: 2.5.0 + splits: [1,2,3,4,5,6,7,8,9,10] clic_edm_ww_fullhad_pf: - version: 2.3.0 + version: 2.5.0 + splits: [1,2,3,4,5,6,7,8,9,10] test_dataset: clic_edm_qq_pf: - version: 2.3.0 + version: 2.5.0 + splits: [1,2,3,4,5,6,7,8,9,10] clic_edm_ttbar_pf: - version: 2.3.0 + version: 2.5.0 + splits: [1,2,3,4,5,6,7,8,9,10] clic_edm_ww_fullhad_pf: - version: 2.3.0 + version: 2.5.0 + splits: [1,2,3,4,5,6,7,8,9,10] diff --git a/parameters/pytorch/pyg-cms-nopu.yaml b/parameters/pytorch/pyg-cms-nopu.yaml new file mode 100644 index 000000000..093852907 --- /dev/null +++ b/parameters/pytorch/pyg-cms-nopu.yaml @@ -0,0 +1,155 @@ 
+backend: pytorch + +save_attention: no +dataset: cms +sort_data: yes +data_dir: +gpus: 1 +gpu_batch_multiplier: 1 +load: +num_epochs: 100 +patience: 20 +lr: 0.0001 +lr_schedule: cosinedecay # constant, cosinedecay, onecycle +conv_type: attention +ntrain: +ntest: +nvalid: +num_workers: 0 +prefetch_factor: +checkpoint_freq: 1 +comet_name: particleflow-pt +comet_offline: False +comet_step_freq: 10 +dtype: bfloat16 +val_freq: # run an extra validation run every val_freq training steps + +model: + trainable: all + # - nn_energy + # - nn_pt + + learned_representation_mode: last #last, concat + input_encoding: split #split, joint + pt_mode: direct-elemtype-split + eta_mode: linear + sin_phi_mode: linear + cos_phi_mode: linear + energy_mode: direct-elemtype-split + + + gnn_lsh: + conv_type: gnn_lsh + embedding_dim: 512 + width: 512 + num_convs: 8 + dropout_ff: 0.0 + activation: "elu" + # gnn-lsh specific parameters + bin_size: 320 + max_num_bins: 200 + distance_dim: 128 + layernorm: True + num_node_messages: 2 + ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 2 + + attention: + conv_type: attention + num_convs: 6 + dropout_ff: 0.0 + dropout_conv_id_mha: 0.0 + dropout_conv_id_ff: 0.0 + dropout_conv_reg_mha: 0.0 + dropout_conv_reg_ff: 0.0 + activation: "relu" + head_dim: 32 + num_heads: 32 + attention_type: flash + use_pre_layernorm: True + + mamba: + conv_type: mamba + embedding_dim: 1024 + width: 1024 + num_convs: 4 + dropout_ff: 0.0 + activation: "elu" + # mamba specific paramters + d_state: 32 + d_conv: 4 + expand: 2 + +lr_schedule_config: + onecycle: + pct_start: 0.3 + +raytune: + local_dir: # Note: please specify an absolute path + sched: asha # asha, hyperband + search_alg: hyperopt # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 
+ nevergrad: + n_random_steps: 10 + +train_dataset: + cms: + physical_nopu: + batch_size: 8 + samples: + cms_pf_ttbar_nopu: + version: 2.5.0 + splits: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + cms_pf_qcd_nopu: + version: 2.5.0 + splits: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + cms_pf_ztt_nopu: + version: 2.5.0 + splits: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + +valid_dataset: + cms: + physical_nopu: + batch_size: 8 + samples: + cms_pf_ttbar_nopu: + version: 2.5.0 + splits: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + cms_pf_qcd_nopu: + version: 2.5.0 + splits: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + cms_pf_ztt_nopu: + version: 2.5.0 + splits: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + +test_dataset: + cms_pf_ttbar_nopu: + version: 2.5.0 + splits: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + cms_pf_qcd_nopu: + version: 2.5.0 + splits: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + cms_pf_ztt_nopu: + version: 2.5.0 + splits: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + cms_pf_ttbar: + version: 2.5.0 + splits: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + cms_pf_qcd: + version: 2.5.0 + splits: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + cms_pf_ztt: + version: 2.5.0 + splits: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] diff --git a/parameters/pytorch/pyg-cms-ttbar-nopu.yaml b/parameters/pytorch/pyg-cms-ttbar-nopu.yaml index f5909d02b..030ffad66 100644 --- a/parameters/pytorch/pyg-cms-ttbar-nopu.yaml +++ b/parameters/pytorch/pyg-cms-ttbar-nopu.yaml @@ -56,13 +56,13 @@ model: attention: conv_type: attention - num_convs: 4 + num_convs: 6 dropout_ff: 0.0 dropout_conv_id_mha: 0.0 dropout_conv_id_ff: 0.0 dropout_conv_reg_mha: 0.0 dropout_conv_reg_ff: 0.0 - activation: "gelu" + activation: "relu" head_dim: 32 num_heads: 32 attention_type: flash @@ -107,19 +107,22 @@ raytune: train_dataset: cms: physical_nopu: - batch_size: 8 + batch_size: 1 samples: cms_pf_ttbar_nopu: - version: 2.4.0 + version: 2.5.0 + splits: [1] valid_dataset: cms: physical_nopu: - batch_size: 8 + batch_size: 1 samples: cms_pf_ttbar_nopu: - version: 2.4.0 + version: 2.5.0 + splits: [1] test_dataset: cms_pf_ttbar_nopu: - 
version: 2.4.0 + version: 2.5.0 + splits: [1] diff --git a/parameters/pytorch/pyg-cms.yaml b/parameters/pytorch/pyg-cms.yaml index 33128774b..30b3931a3 100644 --- a/parameters/pytorch/pyg-cms.yaml +++ b/parameters/pytorch/pyg-cms.yaml @@ -56,13 +56,13 @@ model: attention: conv_type: attention - num_convs: 4 + num_convs: 6 dropout_ff: 0.0 dropout_conv_id_mha: 0.0 dropout_conv_id_ff: 0.0 dropout_conv_reg_mha: 0.0 dropout_conv_reg_ff: 0.0 - activation: "gelu" + activation: "relu" head_dim: 32 num_heads: 32 attention_type: flash @@ -110,16 +110,26 @@ train_dataset: batch_size: 1 samples: cms_pf_ttbar: - version: 2.4.0 + version: 2.5.0 + splits: [1,2,3,4,5,6,7,8,9,10] cms_pf_qcd: - version: 2.4.0 + version: 2.5.0 + splits: [1,2,3,4,5,6,7,8,9,10] + cms_pf_ztt: + version: 2.5.0 + splits: [1,2,3,4,5,6,7,8,9,10] physical_nopu: batch_size: 8 samples: cms_pf_ttbar_nopu: - version: 2.4.0 + version: 2.5.0 + splits: [1,2,3,4,5,6,7,8,9,10] cms_pf_qcd_nopu: - version: 2.4.0 + version: 2.5.0 + splits: [1,2,3,4,5,6,7,8,9,10] + cms_pf_ztt_nopu: + version: 2.5.0 + splits: [1,2,3,4,5,6,7,8,9,10] valid_dataset: cms: @@ -127,23 +137,43 @@ valid_dataset: batch_size: 1 samples: cms_pf_ttbar: - version: 2.4.0 + version: 2.5.0 + splits: [1,2,3,4,5,6,7,8,9,10] cms_pf_qcd: - version: 2.4.0 + version: 2.5.0 + splits: [1,2,3,4,5,6,7,8,9,10] + cms_pf_ztt: + version: 2.5.0 + splits: [1,2,3,4,5,6,7,8,9,10] physical_nopu: batch_size: 8 samples: cms_pf_ttbar_nopu: - version: 2.4.0 + version: 2.5.0 + splits: [1,2,3,4,5,6,7,8,9,10] cms_pf_qcd_nopu: - version: 2.4.0 + version: 2.5.0 + splits: [1,2,3,4,5,6,7,8,9,10] + cms_pf_ztt_nopu: + version: 2.5.0 + splits: [1,2,3,4,5,6,7,8,9,10] test_dataset: cms_pf_ttbar: - version: 2.4.0 + version: 2.5.0 + splits: [1,2,3,4,5,6,7,8,9,10] cms_pf_qcd: - version: 2.4.0 + version: 2.5.0 + splits: [1,2,3,4,5,6,7,8,9,10] + cms_pf_ztt: + version: 2.5.0 + splits: [1,2,3,4,5,6,7,8,9,10] cms_pf_ttbar_nopu: - version: 2.4.0 + version: 2.5.0 + splits: [1,2,3,4,5,6,7,8,9,10] 
cms_pf_qcd_nopu: - version: 2.4.0 + version: 2.5.0 + splits: [1,2,3,4,5,6,7,8,9,10] + cms_pf_ztt_nopu: + version: 2.5.0 + splits: [1,2,3,4,5,6,7,8,9,10] diff --git a/requirements.txt b/requirements.txt index e1183ef29..6cd2373fc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -apache-beam array-record autopep8 awkward diff --git a/scripts/generate_tfds.sh b/scripts/generate_tfds.sh index 84efe628b..07665bd0c 100755 --- a/scripts/generate_tfds.sh +++ b/scripts/generate_tfds.sh @@ -5,7 +5,7 @@ export PYTHONPATH="mlpf:$PYTHONPATH" # T2_EE_Estonia export IMG=/home/software/singularity/pytorch.simg:2024-08-18 -export CMD="singularity exec -B /local -B /scratch/persistent $IMG tfds build " +export CMD="singularity exec -B /local -B /scratch/persistent $IMG tfds build" # Desktop # export MANUAL_DIR=/media/joosep/data/cms/v3_1/ @@ -14,9 +14,11 @@ export CMD="singularity exec -B /local -B /scratch/persistent $IMG tfds build " # export CMD="singularity exec -B /media/joosep/data --env PYTHONPATH=$PYTHONPATH $IMG tfds build " # CMS -# export DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets -# export MANUAL_DIR=/local/joosep/mlpf/cms/20240823_simcluster -# $CMD mlpf/heptfds/cms_pf/ttbar --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/pu55to75 --overwrite &> logs/tfds_ttbar.log & +export DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets2 +export MANUAL_DIR=/local/joosep/mlpf/cms/20240823_simcluster +for i in `seq 1 10`; do + $CMD mlpf/heptfds/cms_pf/ttbar --config $i --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/pu55to75 --overwrite &> logs/tfds_ttbar.log$i & +done # $CMD mlpf/heptfds/cms_pf/qcd --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/pu55to75 --overwrite &> logs/tfds_qcd.log & # $CMD mlpf/heptfds/cms_pf/ztt --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/pu55to75 --overwrite &> logs/tfds_ztt.log & # $CMD mlpf/heptfds/cms_pf/qcd_high_pt --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/pu55to75 --overwrite &> logs/tfds_qcd_high_pt.log & @@ -34,7 
+36,7 @@ export CMD="singularity exec -B /local -B /scratch/persistent $IMG tfds build " # $CMD mlpf/heptfds/cms_pf/ttbar_nopu --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/nopu --overwrite &> logs/tfds_ttbar_nopu.log & # $CMD mlpf/heptfds/cms_pf/qcd_nopu --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/nopu --overwrite &> logs/tfds_qcd_nopu.log & # $CMD mlpf/heptfds/cms_pf/vbf_nopu --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/nopu --overwrite &> logs/tfds_vbf_nopu.log & -# wait +wait # CLIC cluster-based # export DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets diff --git a/scripts/local_test_torch.sh b/scripts/local_test_torch.sh index 6feb1e831..88f1c58d5 100755 --- a/scripts/local_test_torch.sh +++ b/scripts/local_test_torch.sh @@ -3,6 +3,7 @@ set -e export TFDS_DATA_DIR=`pwd`/tensorflow_datasets export PWD=`pwd` export PYTHONPATH=`pwd` +export KERAS_BACKEND=torch #create data directories rm -Rf local_test_data/TTbar_14TeV_TuneCUETP8M1_cfi @@ -25,15 +26,15 @@ for file in `\ls -1 local_test_data/TTbar_14TeV_TuneCUETP8M1_cfi/root/*.root`; d --num-events 10 done -#create the tensorflow dataset -tfds build mlpf/heptfds/cms_pf/ttbar --manual_dir ./local_test_data +#create the tensorflow dataset for the last split config only +tfds build mlpf/heptfds/cms_pf/ttbar --config 10 --manual_dir ./local_test_data mkdir -p experiments -#test transformer with onnx export +#test transformer python mlpf/pipeline.py --config parameters/pytorch/pyg-cms.yaml --data-dir ./tensorflow_datasets/ \ --prefix MLPF_test_ --num-epochs 2 --nvalid 1 --gpus 0 --train --test --make-plots --conv-type attention \ - --export-onnx --pipeline --dtype float32 --attention-type math --num-convs 1 + --pipeline --dtype float32 --attention-type math --num-convs 1 # test Ray Train training python mlpf/pipeline.py --config parameters/pytorch/pyg-cms.yaml --data-dir ${PWD}/tensorflow_datasets/ \ diff --git a/scripts/lumi/pytorch-clic-8.sh b/scripts/lumi/pytorch-clic-8.sh index 61c122947..c664fe71e 
100755 --- a/scripts/lumi/pytorch-clic-8.sh +++ b/scripts/lumi/pytorch-clic-8.sh @@ -40,4 +40,4 @@ singularity exec \ --env CUDA_VISIBLE_DEVICES=$ROCR_VISIBLE_DEVICES \ $IMG python3 mlpf/pipeline.py --gpus 8 \ --data-dir $TFDS_DATA_DIR --config parameters/pytorch/pyg-clic.yaml \ - --train --gpu-batch-multiplier 128 --num-workers 8 --prefetch-factor 100 --checkpoint-freq 1 --conv-type attention --dtype bfloat16 --lr 0.0001 --num-epochs 30 + --train --gpu-batch-multiplier 128 --num-workers 8 --prefetch-factor 100 --checkpoint-freq 1 --conv-type attention --dtype bfloat16 --lr 0.0001 --num-epochs 50 diff --git a/scripts/lumi/pytorch-cms-8.sh b/scripts/lumi/pytorch-cms-8.sh index 5fe9343e7..f69c52414 100755 --- a/scripts/lumi/pytorch-cms-8.sh +++ b/scripts/lumi/pytorch-cms-8.sh @@ -1,13 +1,13 @@ #!/bin/bash #SBATCH --job-name=mlpf-train #SBATCH --account=project_465000301 -#SBATCH --time=1-00:00:00 +#SBATCH --time=3-00:00:00 #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=32 #SBATCH --mem=400G #SBATCH --gpus-per-task=8 -#SBATCH --partition=standard-g +#SBATCH --partition=small-g #SBATCH --no-requeue #SBATCH -o logs/slurm-%x-%j-%N.out @@ -16,7 +16,7 @@ cd /scratch/project_465000301/particleflow module load LUMI/24.03 partition/G export IMG=/scratch/project_465000301/pytorch-rocm6.2.simg -export PYTHONPATH=hep_tfds +export PYTHONPATH=`pwd` export TFDS_DATA_DIR=/scratch/project_465000301/tensorflow_datasets #export MIOPEN_DISABLE_CACHE=true export MIOPEN_USER_DB_PATH=/tmp/${USER}-${SLURM_JOB_ID}-miopen-cache @@ -40,4 +40,4 @@ singularity exec \ --env CUDA_VISIBLE_DEVICES=$ROCR_VISIBLE_DEVICES \ $IMG python3 mlpf/pipeline.py --gpus 8 \ --data-dir $TFDS_DATA_DIR --config parameters/pytorch/pyg-cms.yaml \ - --train --gpu-batch-multiplier 6 --num-workers 8 --prefetch-factor 100 --checkpoint-freq 1 --conv-type attention --dtype bfloat16 --lr 0.0001 + --train --gpu-batch-multiplier 5 --num-workers 8 --prefetch-factor 50 --checkpoint-freq 1 --conv-type 
attention --dtype bfloat16 --lr 0.0001 diff --git a/scripts/tallinn/a100-mig/pytorch-small-eval-clic.sh b/scripts/tallinn/a100-mig/pytorch-small-eval-clic.sh index 5e11df2c0..c369543be 100644 --- a/scripts/tallinn/a100-mig/pytorch-small-eval-clic.sh +++ b/scripts/tallinn/a100-mig/pytorch-small-eval-clic.sh @@ -7,10 +7,10 @@ IMG=/home/software/singularity/pytorch.simg:2024-08-18 cd ~/particleflow -WEIGHTS=experiments/pyg-clic_20241001_215132_345408/checkpoints/checkpoint-26-2.004527.pth +WEIGHTS=experiments/pyg-clic_20241106_104416_929167/checkpoints/checkpoint-20-1.914489.pth singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=`pwd` \ --env KERAS_BACKEND=torch \ $IMG python3 mlpf/pipeline.py --gpus 1 \ --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-clic.yaml \ - --test --make-plots --gpu-batch-multiplier 100 --load $WEIGHTS --dtype bfloat16 --prefetch-factor 10 --num-workers 8 --load $WEIGHTS --ntest 50000 + --test --make-plots --gpu-batch-multiplier 100 --load $WEIGHTS --dtype bfloat16 --prefetch-factor 10 --num-workers 8 --ntest 50000 diff --git a/scripts/tallinn/a100-mig/pytorch-small-eval-cms.sh b/scripts/tallinn/a100-mig/pytorch-small-eval-cms.sh index 75cc32381..810a9774b 100644 --- a/scripts/tallinn/a100-mig/pytorch-small-eval-cms.sh +++ b/scripts/tallinn/a100-mig/pytorch-small-eval-cms.sh @@ -1,17 +1,18 @@ #!/bin/bash #SBATCH --partition gpu #SBATCH --gres gpu:mig:1 -#SBATCH --mem-per-gpu 200G +#SBATCH --mem-per-gpu 100G #SBATCH -o logs/slurm-%x-%j-%N.out IMG=/home/software/singularity/pytorch.simg:2024-08-18 cd ~/particleflow -WEIGHTS=experiments/pyg-cms_20241002_205216_443429/checkpoints/checkpoint-17-3.757689.pth +WEIGHTS=experiments/pyg-cms_20241101_090645_682892/checkpoints/checkpoint-08-2.986092.pth +DATASET=$1 env singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=`pwd` \ --env KERAS_BACKEND=torch \ $IMG python mlpf/pipeline.py --gpus 1 \ - --data-dir 
/scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-cms.yaml \ - --test --make-plots --gpu-batch-multiplier 2 --load $WEIGHTS --ntest 10000 --dtype bfloat16 --num-workers 8 --prefetch-factor 10 + --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-cms-nopu.yaml \ + --test --make-plots --gpu-batch-multiplier 2 --load $WEIGHTS --ntest 50000 --dtype bfloat16 --num-workers 8 --prefetch-factor 10 --test-datasets $DATASET diff --git a/scripts/tallinn/a100/pytorch.sh b/scripts/tallinn/a100/pytorch.sh index f12caf215..150207fb0 100755 --- a/scripts/tallinn/a100/pytorch.sh +++ b/scripts/tallinn/a100/pytorch.sh @@ -12,6 +12,6 @@ singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=`pwd` \ --env KERAS_BACKEND=torch \ $IMG python3 mlpf/pipeline.py --gpus 1 \ - --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-cms.yaml \ + --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-cms-nopu.yaml \ --train --test --make-plots --conv-type attention \ - --gpu-batch-multiplier 8 --checkpoint-freq 1 --num-workers 8 --prefetch-factor 50 --comet + --gpu-batch-multiplier 5 --checkpoint-freq 1 --num-workers 8 --prefetch-factor 50 --comet --num-epochs 30 diff --git a/scripts/tallinn/copy_dataset_lxplus.sh b/scripts/tallinn/copy_dataset_lxplus.sh new file mode 100755 index 000000000..0adb64c71 --- /dev/null +++ b/scripts/tallinn/copy_dataset_lxplus.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +if [ -n "$LXPLUSUSER" ]; then + rsync --progress --relative --files-from scripts/files_to_copy.txt / $LXPLUSUSER@lxplus.cern.ch:/eos/user/j/jpata/www/mlpf/cms/ +else + echo "Please define LXPLUSUSER" +fi diff --git a/scripts/tallinn/generate_tfds.sh b/scripts/tallinn/generate_tfds.sh new file mode 100644 index 000000000..0b1d132fc --- /dev/null +++ b/scripts/tallinn/generate_tfds.sh @@ -0,0 +1,11 @@ +#!/bin/bash +#SBATCH --partition main +#SBATCH --mem-per-cpu 40G 
+#SBATCH -o logs/slurm-%x-%j-%N.out + +export KERAS_BACKEND=tensorflow +export PYTHONPATH="mlpf" +export IMG=/home/software/singularity/pytorch.simg:2024-08-18 +export CMD="singularity exec -B /local -B /scratch/persistent $IMG tfds build" + +$CMD mlpf/heptfds/$1 --config $2 --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/$3 --overwrite diff --git a/scripts/tallinn/prepare_dataset_lxplus.sh b/scripts/tallinn/prepare_dataset_lxplus.sh new file mode 100755 index 000000000..de9fa4fae --- /dev/null +++ b/scripts/tallinn/prepare_dataset_lxplus.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +rm -f scripts/files_to_copy.txt + +maxfiles=100 + +samplestocopy=( + "nopu/SingleElectronFlatPt1To1000_pythia8_cfi" + "nopu/SingleGammaFlatPt1To1000_pythia8_cfi" + "nopu/SingleK0FlatPt1To1000_pythia8_cfi" + "nopu/SingleMuFlatPt1To1000_pythia8_cfi" + "nopu/SingleNeutronFlatPt0p7To1000_cfi" + "nopu/SinglePi0Pt1To1000_pythia8_cfi" + "nopu/SinglePiMinusFlatPt0p7To1000_cfi" + "nopu/SingleProtonMinusFlatPt0p7To1000_cfi" + "nopu/SingleTauFlatPt1To1000_cfi" + "nopu/QCDForPF_14TeV_TuneCUETP8M1_cfi" + "nopu/TTbar_14TeV_TuneCUETP8M1_cfi" + "nopu/ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi" + "pu55to75/QCDForPF_14TeV_TuneCUETP8M1_cfi" + "pu55to75/TTbar_14TeV_TuneCUETP8M1_cfi" + "pu55to75/ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi" +) + +#samplestocopy=( +# "p8_ee_qq_ecm380" +# "p8_ee_tt_ecm380" +# "p8_ee_WW_fullhad_ecm380" +# "p8_ee_ZH_Htautau_ecm380" +# "p8_ee_Z_Ztautau_ecm380" +#) + + +#get a few files from each sample, both the root and postprocessed (raw) +for sample in "${samplestocopy[@]}"; do + find "/local/joosep/mlpf/cms/./20240823_simcluster/$sample/root" -type f | sort | head -n$maxfiles >> scripts/files_to_copy.txt + find "/local/joosep/mlpf/cms/./20240823_simcluster/$sample/raw" -type f | sort | head -n$maxfiles >> scripts/files_to_copy.txt + # find /local/joosep/clic_edm4hep/./2024_07/$sample/root/ -type f | sort | head -n$maxfiles >> scripts/files_to_copy.txt +done + +#get the total size +# cat 
scripts/files_to_copy.txt | xargs du -ch diff --git a/scripts/tallinn/submit_tfds.sh b/scripts/tallinn/submit_tfds.sh new file mode 100755 index 000000000..d1011f266 --- /dev/null +++ b/scripts/tallinn/submit_tfds.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +SUB=scripts/tallinn/generate_tfds.sh + +export DATA_DIR=/local/joosep/mlpf/tensorflow_datasets/cms +export MANUAL_DIR=/local/joosep/mlpf/cms/20240823_simcluster +for i in `seq 1 10`; do + sbatch $SUB cms_pf/qcd_nopu $i nopu + sbatch $SUB cms_pf/ttbar_nopu $i nopu + sbatch $SUB cms_pf/ztt_nopu $i nopu + sbatch $SUB cms_pf/qcd $i pu55to75 + sbatch $SUB cms_pf/ttbar $i pu55to75 + sbatch $SUB cms_pf/ztt $i pu55to75 +done + +export DATA_DIR=/local/joosep/mlpf/tensorflow_datasets/clic +export MANUAL_DIR=/local/joosep/mlpf/clic_edm4hep/ +for i in `seq 1 10`; do + sbatch $SUB clic_pf_edm4hep/ttbar $i + sbatch $SUB clic_pf_edm4hep/qq $i + sbatch $SUB clic_pf_edm4hep/ww_fullhad $i +done