From acbb6c562ea4fe870c733a2557cc16f7e58c6765 Mon Sep 17 00:00:00 2001
From: Sakib Rahman
Date: Tue, 30 Jan 2024 16:29:08 -0500
Subject: [PATCH] Use a nested directory approach, following
 https://waterdata.usgs.gov/blog/snakemake-for-ml-experiments/, to uniquely
 identify models instead of a hash. Establishes a DAG with greater
 parallelization of processes.

---
 benchmarks/roman_pots/Snakefile               | 312 +++++++++---------
 .../roman_pots/train_dense_neural_network.py  | 120 +++----
 2 files changed, 207 insertions(+), 225 deletions(-)

diff --git a/benchmarks/roman_pots/Snakefile b/benchmarks/roman_pots/Snakefile
index 3827c080..d0d66e4f 100644
--- a/benchmarks/roman_pots/Snakefile
+++ b/benchmarks/roman_pots/Snakefile
@@ -1,197 +1,195 @@
 from itertools import product
-import hashlib
 
 DETECTOR_PATH = os.environ["DETECTOR_PATH"]
 DETECTOR_VERSION = os.environ["DETECTOR_VERSION"]
 SUBSYSTEM = "roman_pots"
 BENCHMARK = "dense_neural_network"
 
-DETECTOR_CONFIG = ["epic_ip6"]
-NUM_EPOCHS_PZ = [100]
-LEARNING_RATE_PZ = [0.01]
-SIZE_INPUT_PZ = [4]
-SIZE_OUTPUT_PZ = [1]
-N_LAYERS_PZ = [3,6]
-SIZE_FIRST_HIDDEN_LAYER_PZ = [128]
-MULTIPLIER_PZ = [0.5]
-LEAK_RATE_PZ = [0.025]
-NUM_EPOCHS_PY = [100]
-LEARNING_RATE_PY = [0.01]
-SIZE_INPUT_PY = [3]
-SIZE_OUTPUT_PY = [1]
-N_LAYERS_PY = [3,6]
-SIZE_FIRST_HIDDEN_LAYER_PY = [128]
-MULTIPLIER_PY = [0.5]
-LEAK_RATE_PY = [0.025]
-NUM_EPOCHS_PX = [100]
-LEARNING_RATE_PX = [0.01]
-SIZE_INPUT_PX = [3]
-SIZE_OUTPUT_PX = [1]
-N_LAYERS_PX = [3,7]
-SIZE_FIRST_HIDDEN_LAYER_PX = [128]
-MULTIPLIER_PX = [0.5]
-LEAK_RATE_PX = [0.025]
-MAX_HASH = 6
-NFILES = range(1,11)
-NEVENTS_PER_FILE = [100]
-NUM_TRAINING_INPUTS = [int(0.5*max(NFILES)),int(0.7*max(NFILES))]
-MODEL_VERSION = [
-    hashlib.sha512("_".join(map(str,x)).encode()).hexdigest()[:MAX_HASH]
-    for x in product(
-        NEVENTS_PER_FILE, NUM_TRAINING_INPUTS,
-        NUM_EPOCHS_PZ, LEARNING_RATE_PZ, SIZE_INPUT_PZ, SIZE_OUTPUT_PZ, N_LAYERS_PZ, SIZE_FIRST_HIDDEN_LAYER_PZ, MULTIPLIER_PZ, LEAK_RATE_PZ,
-        NUM_EPOCHS_PY, LEARNING_RATE_PY, SIZE_INPUT_PY, SIZE_OUTPUT_PY, N_LAYERS_PY, SIZE_FIRST_HIDDEN_LAYER_PY, MULTIPLIER_PY, LEAK_RATE_PY,
-        NUM_EPOCHS_PX, LEARNING_RATE_PX, SIZE_INPUT_PX, SIZE_OUTPUT_PX, N_LAYERS_PX, SIZE_FIRST_HIDDEN_LAYER_PX, MULTIPLIER_PX, LEAK_RATE_PX
-    )
-]
+DETECTOR_CONFIG = "epic_ip6"
+NEVENTS_PER_FILE = 5
+NFILES = range(1,6)
+MODEL_PZ = {
+    'num_epochs' : [100],
+    'learning_rate' : [0.01],
+    'size_input' : [4],
+    'size_output' : [1],
+    'n_layers' : [3,6],
+    'size_first_hidden_layer' : [128],
+    'multiplier' : [0.5],
+    'leak_rate' : [0.025],
+}
+MODEL_PY = {
+    'num_epochs' : [100],
+    'learning_rate' : [0.01],
+    'size_input' : [3],
+    'size_output' : [1],
+    'n_layers' : [3,6],
+    'size_first_hidden_layer' : [128],
+    'multiplier' : [0.5],
+    'leak_rate' : [0.025],
+}
+MODEL_PX = {
+    'num_epochs' : [100],
+    'learning_rate' : [0.01],
+    'size_input' : [3],
+    'size_output' : [1],
+    'n_layers' : [3,7],
+    'size_first_hidden_layer' : [128],
+    'multiplier' : [0.5],
+    'leak_rate' : [0.025],
+}
 
-rule target_generate:
+rule all:
     input:
-        expand("results/"+DETECTOR_VERSION+"/{detector_config}/detector_benchmarks/"+SUBSYSTEM+"/"+BENCHMARK+"/raw_data/"+DETECTOR_VERSION+"_{detector_config}_{index}.edm4hep.root",
-               detector_config=DETECTOR_CONFIG,
+        expand("results/"+DETECTOR_VERSION+"/"+DETECTOR_CONFIG+"/detector_benchmarks/"+SUBSYSTEM+"/"+BENCHMARK+"/raw_data/"+DETECTOR_VERSION+"_"+DETECTOR_CONFIG+"_{index}.edm4hep.root",
                index=NFILES),
-        expand("results/"+DETECTOR_VERSION+"/{detector_config}/detector_benchmarks/"+SUBSYSTEM+"/"+BENCHMARK+"/processed_data/"+DETECTOR_VERSION+"_{detector_config}_{index}.txt",
-               detector_config=DETECTOR_CONFIG,
+        expand("results/"+DETECTOR_VERSION+"/"+DETECTOR_CONFIG+"/detector_benchmarks/"+SUBSYSTEM+"/"+BENCHMARK+"/processed_data/"+DETECTOR_VERSION+"_"+DETECTOR_CONFIG+"_{index}.txt",
                index=NFILES),
-        expand("results/"+str(DETECTOR_VERSION)+"/{detector_config}/detector_benchmarks/"+str(SUBSYSTEM)+"/"+str(BENCHMARK)+"/metadata/"+str(DETECTOR_VERSION)+"_{detector_config}_"+str(SUBSYSTEM)+"_"+str(BENCHMARK)+"_{model_version}.txt",
-               detector_config=DETECTOR_CONFIG,
-               model_version=MODEL_VERSION)
+        expand("results/"+DETECTOR_VERSION+"/{detector_config}/detector_benchmarks/"+SUBSYSTEM+"/"+BENCHMARK+"/artifacts/model_pz/num_epochs_{num_epochs}/learning_rate_{learning_rate}/size_input_{size_input}/size_output_{size_output}/n_layers_{n_layers}/size_first_hidden_layer_{size_first_hidden_layer}/multiplier_{multiplier}/leak_rate_{leak_rate}/model_pz.pt",
+               detector_config=DETECTOR_CONFIG,
+               num_epochs=MODEL_PZ["num_epochs"],
+               learning_rate=MODEL_PZ["learning_rate"],
+               size_input=MODEL_PZ["size_input"],
+               size_output=MODEL_PZ["size_output"],
+               n_layers=MODEL_PZ["n_layers"],
+               size_first_hidden_layer=MODEL_PZ["size_first_hidden_layer"],
+               multiplier=MODEL_PZ["multiplier"],
+               leak_rate=MODEL_PZ["leak_rate"]
+        ),
+        expand("results/"+DETECTOR_VERSION+"/{detector_config}/detector_benchmarks/"+SUBSYSTEM+"/"+BENCHMARK+"/artifacts/model_pz/num_epochs_{num_epochs}/learning_rate_{learning_rate}/size_input_{size_input}/size_output_{size_output}/n_layers_{n_layers}/size_first_hidden_layer_{size_first_hidden_layer}/multiplier_{multiplier}/leak_rate_{leak_rate}/LossVsEpoch_model_pz.png",
+               detector_config=DETECTOR_CONFIG,
+               num_epochs=MODEL_PZ["num_epochs"],
+               learning_rate=MODEL_PZ["learning_rate"],
+               size_input=MODEL_PZ["size_input"],
+               size_output=MODEL_PZ["size_output"],
+               n_layers=MODEL_PZ["n_layers"],
+               size_first_hidden_layer=MODEL_PZ["size_first_hidden_layer"],
+               multiplier=MODEL_PZ["multiplier"],
+               leak_rate=MODEL_PZ["leak_rate"]
+        ),
+        expand("results/"+DETECTOR_VERSION+"/{detector_config}/detector_benchmarks/"+SUBSYSTEM+"/"+BENCHMARK+"/artifacts/model_py/num_epochs_{num_epochs}/learning_rate_{learning_rate}/size_input_{size_input}/size_output_{size_output}/n_layers_{n_layers}/size_first_hidden_layer_{size_first_hidden_layer}/multiplier_{multiplier}/leak_rate_{leak_rate}/model_py.pt",
+               detector_config=DETECTOR_CONFIG,
+               num_epochs=MODEL_PY["num_epochs"],
+               learning_rate=MODEL_PY["learning_rate"],
+               size_input=MODEL_PY["size_input"],
+               size_output=MODEL_PY["size_output"],
+               n_layers=MODEL_PY["n_layers"],
+               size_first_hidden_layer=MODEL_PY["size_first_hidden_layer"],
+               multiplier=MODEL_PY["multiplier"],
+               leak_rate=MODEL_PY["leak_rate"]
+        ),
+        expand("results/"+DETECTOR_VERSION+"/{detector_config}/detector_benchmarks/"+SUBSYSTEM+"/"+BENCHMARK+"/artifacts/model_py/num_epochs_{num_epochs}/learning_rate_{learning_rate}/size_input_{size_input}/size_output_{size_output}/n_layers_{n_layers}/size_first_hidden_layer_{size_first_hidden_layer}/multiplier_{multiplier}/leak_rate_{leak_rate}/LossVsEpoch_model_py.png",
+               detector_config=DETECTOR_CONFIG,
+               num_epochs=MODEL_PY["num_epochs"],
+               learning_rate=MODEL_PY["learning_rate"],
+               size_input=MODEL_PY["size_input"],
+               size_output=MODEL_PY["size_output"],
+               n_layers=MODEL_PY["n_layers"],
+               size_first_hidden_layer=MODEL_PY["size_first_hidden_layer"],
+               multiplier=MODEL_PY["multiplier"],
+               leak_rate=MODEL_PY["leak_rate"]
+        ),
+        expand("results/"+DETECTOR_VERSION+"/{detector_config}/detector_benchmarks/"+SUBSYSTEM+"/"+BENCHMARK+"/artifacts/model_px/num_epochs_{num_epochs}/learning_rate_{learning_rate}/size_input_{size_input}/size_output_{size_output}/n_layers_{n_layers}/size_first_hidden_layer_{size_first_hidden_layer}/multiplier_{multiplier}/leak_rate_{leak_rate}/model_px.pt",
+               detector_config=DETECTOR_CONFIG,
+               num_epochs=MODEL_PX["num_epochs"],
+               learning_rate=MODEL_PX["learning_rate"],
+               size_input=MODEL_PX["size_input"],
+               size_output=MODEL_PX["size_output"],
+               n_layers=MODEL_PX["n_layers"],
+               size_first_hidden_layer=MODEL_PX["size_first_hidden_layer"],
+               multiplier=MODEL_PX["multiplier"],
+               leak_rate=MODEL_PX["leak_rate"]
+        ),
+        expand("results/"+DETECTOR_VERSION+"/{detector_config}/detector_benchmarks/"+SUBSYSTEM+"/"+BENCHMARK+"/artifacts/model_px/num_epochs_{num_epochs}/learning_rate_{learning_rate}/size_input_{size_input}/size_output_{size_output}/n_layers_{n_layers}/size_first_hidden_layer_{size_first_hidden_layer}/multiplier_{multiplier}/leak_rate_{leak_rate}/LossVsEpoch_model_px.png",
+               detector_config=DETECTOR_CONFIG,
+               num_epochs=MODEL_PX["num_epochs"],
+               learning_rate=MODEL_PX["learning_rate"],
+               size_input=MODEL_PX["size_input"],
+               size_output=MODEL_PX["size_output"],
+               n_layers=MODEL_PX["n_layers"],
+               size_first_hidden_layer=MODEL_PX["size_first_hidden_layer"],
+               multiplier=MODEL_PX["multiplier"],
+               leak_rate=MODEL_PX["leak_rate"]
+        )
+
+
-rule target_train:
-    input:
-        expand("results/"+str(DETECTOR_VERSION)+"/{detector_config}/detector_benchmarks/"+str(SUBSYSTEM)+"/"+str(BENCHMARK)+"/trained_models/model_pz_"+str(DETECTOR_VERSION)+"_{detector_config}_"+str(SUBSYSTEM)+"_"+str(BENCHMARK)+"_{model_version}.pt",
-               detector_config=DETECTOR_CONFIG,
-               model_version=MODEL_VERSION),
-        expand("results/"+str(DETECTOR_VERSION)+"/{detector_config}/detector_benchmarks/"+str(SUBSYSTEM)+"/"+str(BENCHMARK)+"/trained_models/model_py_"+str(DETECTOR_VERSION)+"_{detector_config}_"+str(SUBSYSTEM)+"_"+str(BENCHMARK)+"_{model_version}.pt",
-               detector_config=DETECTOR_CONFIG,
-               model_version=MODEL_VERSION),
-        expand("results/"+str(DETECTOR_VERSION)+"/{detector_config}/detector_benchmarks/"+str(SUBSYSTEM)+"/"+str(BENCHMARK)+"/trained_models/model_px_"+str(DETECTOR_VERSION)+"_{detector_config}_"+str(SUBSYSTEM)+"_"+str(BENCHMARK)+"_{model_version}.pt",
-               detector_config=DETECTOR_CONFIG,
-               model_version=MODEL_VERSION),
-        expand("results/"+str(DETECTOR_VERSION)+"/{detector_config}/detector_benchmarks/"+str(SUBSYSTEM)+"/"+str(BENCHMARK)+"/artifacts/LossVsEpoch_model_pz_"+str(DETECTOR_VERSION)+"_{detector_config}_"+str(SUBSYSTEM)+"_"+str(BENCHMARK)+"_{model_version}.png",
-               detector_config=DETECTOR_CONFIG,
-               model_version=MODEL_VERSION),
-        expand("results/"+str(DETECTOR_VERSION)+"/{detector_config}/detector_benchmarks/"+str(SUBSYSTEM)+"/"+str(BENCHMARK)+"/artifacts/LossVsEpoch_model_py_"+str(DETECTOR_VERSION)+"_{detector_config}_"+str(SUBSYSTEM)+"_"+str(BENCHMARK)+"_{model_version}.png",
-               detector_config=DETECTOR_CONFIG,
-               model_version=MODEL_VERSION),
-        expand("results/"+str(DETECTOR_VERSION)+"/{detector_config}/detector_benchmarks/"+str(SUBSYSTEM)+"/"+str(BENCHMARK)+"/artifacts/LossVsEpoch_model_px_"+str(DETECTOR_VERSION)+"_{detector_config}_"+str(SUBSYSTEM)+"_"+str(BENCHMARK)+"_{model_version}.png",
-               detector_config=DETECTOR_CONFIG,
-               model_version=MODEL_VERSION)
 
 rule roman_pots_generate_events:
     input:
         script="steering_file.py"
     params:
         detector_path=DETECTOR_PATH,
-        nevents_per_file=NEVENTS_PER_FILE
+        nevents_per_file=NEVENTS_PER_FILE,
+        detector_config=DETECTOR_CONFIG
     output:
-        "results/"+DETECTOR_VERSION+"/{detector_config}/detector_benchmarks/"+SUBSYSTEM+"/"+BENCHMARK+"/raw_data/"+DETECTOR_VERSION+"_{detector_config}_{index}.edm4hep.root"
+        "results/"+DETECTOR_VERSION+"/"+DETECTOR_CONFIG+"/detector_benchmarks/"+SUBSYSTEM+"/"+BENCHMARK+"/raw_data/"+DETECTOR_VERSION+"_"+DETECTOR_CONFIG+"_{index}.edm4hep.root"
     shell:
         """
         npsim --steeringFile {input.script} \
-              --compactFile {params.detector_path}/{wildcards.detector_config}.xml \
+              --compactFile {params.detector_path}/{params.detector_config}.xml \
               --outputFile {output} \
               -N {params.nevents_per_file}
         """
 
 rule roman_pots_preprocess_model_training_data:
     input:
-        data = "results/"+DETECTOR_VERSION+"/{detector_config}/detector_benchmarks/"+SUBSYSTEM+"/"+BENCHMARK+"/raw_data/"+DETECTOR_VERSION+"_{detector_config}_{index}.edm4hep.root",
+        data = "results/"+DETECTOR_VERSION+"/"+DETECTOR_CONFIG+"/detector_benchmarks/"+SUBSYSTEM+"/"+BENCHMARK+"/raw_data/"+DETECTOR_VERSION+"_"+DETECTOR_CONFIG+"_{index}.edm4hep.root",
         script = "preprocess_model_training_data.cxx"
     output:
-        "results/"+DETECTOR_VERSION+"/{detector_config}/detector_benchmarks/"+SUBSYSTEM+"/"+BENCHMARK+"/processed_data/"+DETECTOR_VERSION+"_{detector_config}_{index}.txt"
+        "results/"+DETECTOR_VERSION+"/"+DETECTOR_CONFIG+"/detector_benchmarks/"+SUBSYSTEM+"/"+BENCHMARK+"/processed_data/"+DETECTOR_VERSION+"_"+DETECTOR_CONFIG+"_{index}.txt"
     shell:
        """
        root -q -b {input.script}\"(\\\"{input.data}\\\",\\\"{output}\\\")\"
        """
 
-rule roman_pots_generate_neural_network_configs:
-    input:
-    output:
-        expand("results/"+str(DETECTOR_VERSION)+"/{detector_config}/detector_benchmarks/"+str(SUBSYSTEM)+"/"+str(BENCHMARK)+"/metadata/"+str(DETECTOR_VERSION)+"_{detector_config}_"+str(SUBSYSTEM)+"_"+str(BENCHMARK)+"_{model_version}.txt",
-               detector_config=DETECTOR_CONFIG,
-               model_version=MODEL_VERSION)
-    run:
-        for detector_config, nevents_per_file, num_training_inputs, \
-            num_epochs_pz, learning_rate_pz, size_input_pz, size_output_pz, n_layers_pz, size_first_hidden_layer_pz, multiplier_pz, leak_rate_pz, \
-            num_epochs_py, learning_rate_py, size_input_py, size_output_py, n_layers_py, size_first_hidden_layer_py, multiplier_py, leak_rate_py, \
-            num_epochs_px, learning_rate_px, size_input_px, size_output_px, n_layers_px, size_first_hidden_layer_px, multiplier_px, leak_rate_px in \
-            product(DETECTOR_CONFIG, NEVENTS_PER_FILE, NUM_TRAINING_INPUTS,
-                    NUM_EPOCHS_PZ, LEARNING_RATE_PZ, SIZE_INPUT_PZ, SIZE_OUTPUT_PZ, N_LAYERS_PZ, SIZE_FIRST_HIDDEN_LAYER_PZ, MULTIPLIER_PZ, LEAK_RATE_PZ,
-                    NUM_EPOCHS_PY, LEARNING_RATE_PY, SIZE_INPUT_PY, SIZE_OUTPUT_PY, N_LAYERS_PY, SIZE_FIRST_HIDDEN_LAYER_PY, MULTIPLIER_PY, LEAK_RATE_PY,
-                    NUM_EPOCHS_PX, LEARNING_RATE_PX, SIZE_INPUT_PX, SIZE_OUTPUT_PX, N_LAYERS_PX, SIZE_FIRST_HIDDEN_LAYER_PX, MULTIPLIER_PX, LEAK_RATE_PX):
-            output_dir = "results/"+str(DETECTOR_VERSION)+"/"+str(detector_config)+"/detector_benchmarks/"+str(SUBSYSTEM)+"/"+str(BENCHMARK)+"/metadata"
-            output_file = str(nevents_per_file)+"_"+str(num_training_inputs)+"_"+\
-                          str(num_epochs_pz)+"_"+str(learning_rate_pz)+"_"+str(size_input_pz)+"_"+str(size_output_pz)+"_"+str(n_layers_pz)+"_"+str(size_first_hidden_layer_pz)+"_"+str(multiplier_pz)+"_"+str(leak_rate_pz)+"_"+\
-                          str(num_epochs_py)+"_"+str(learning_rate_py)+"_"+str(size_input_py)+"_"+str(size_output_py)+"_"+str(n_layers_py)+"_"+str(size_first_hidden_layer_py)+"_"+str(multiplier_py)+"_"+str(leak_rate_py)+"_"+\
-                          str(num_epochs_px)+"_"+str(learning_rate_px)+"_"+str(size_input_px)+"_"+str(size_output_px)+"_"+str(n_layers_px)+"_"+str(size_first_hidden_layer_px)+"_"+str(multiplier_px)+"_"+str(leak_rate_px)
-            model_hash = hashlib.sha512(output_file.encode()).hexdigest()[:MAX_HASH]
-            output_file_location = open(str(output_dir)+"/"+str(DETECTOR_VERSION)+"_"+str(detector_config)+"_"+str(SUBSYSTEM)+"_"+str(BENCHMARK)+"_"+str(model_hash)+".txt","w")
-            output_file_content = "--input_files\nresults/"+str(DETECTOR_VERSION)+"/"+str(detector_config)+"/detector_benchmarks/"+str(SUBSYSTEM)+"/"+str(BENCHMARK)+"/processed_data/"+str(DETECTOR_VERSION)+"_"+str(detector_config)+"_\n"+\
-                                  "--model_version\n"+str(DETECTOR_VERSION)+"_"+str(detector_config)+"_"+str(SUBSYSTEM)+"_"+str(BENCHMARK)+"_"+str(model_hash)+"\n"+\
-                                  "--nevents_per_file\n"+str(nevents_per_file)+"\n"+\
-                                  "--num_training_inputs\n"+str(num_training_inputs)+"\n"+\
-                                  "--num_epochs_pz\n"+str(num_epochs_pz)+"\n"+\
-                                  "--learning_rate_pz\n"+str(learning_rate_pz)+"\n"+\
-                                  "--size_input_pz\n"+str(size_input_pz)+"\n"+\
-                                  "--size_output_pz\n"+str(size_output_pz)+"\n"+\
-                                  "--n_layers_pz\n"+str(n_layers_pz)+"\n"+\
-                                  "--size_first_hidden_layer_pz\n"+str(size_first_hidden_layer_pz)+"\n"+\
-                                  "--multiplier_pz\n"+str(multiplier_pz)+"\n"+\
-                                  "--leak_rate_pz\n"+str(leak_rate_pz)+"\n"+\
-                                  "--num_epochs_py\n"+str(num_epochs_py)+"\n"+\
-                                  "--learning_rate_py\n"+str(learning_rate_py)+"\n"+\
-                                  "--size_input_py\n"+str(size_input_py)+"\n"+\
-                                  "--size_output_py\n"+str(size_output_py)+"\n"+\
-                                  "--n_layers_py\n"+str(n_layers_py)+"\n"+\
-                                  "--size_first_hidden_layer_py\n"+str(size_first_hidden_layer_py)+"\n"+\
-                                  "--multiplier_py\n"+str(multiplier_py)+"\n"+\
-                                  "--leak_rate_py\n"+str(leak_rate_py)+"\n"+\
-                                  "--num_epochs_px\n"+str(num_epochs_px)+"\n"+\
-                                  "--learning_rate_px\n"+str(learning_rate_px)+"\n"+\
-                                  "--size_input_px\n"+str(size_input_px)+"\n"+\
-                                  "--size_output_px\n"+str(size_output_px)+"\n"+\
-                                  "--n_layers_px\n"+str(n_layers_px)+"\n"+\
-                                  "--size_first_hidden_layer_px\n"+str(size_first_hidden_layer_px)+"\n"+\
-                                  "--multiplier_px\n"+str(multiplier_px)+"\n"+\
-                                  "--leak_rate_px\n"+str(leak_rate_px)
-            output_file_location.write(output_file_content)
-            print(output_file_location)
-            output_file_location.close()
 
-rule roman_pots_train_neural_networks:
+rule roman_pots_train_model_pz:
     input:
+        data = ["results/"+DETECTOR_VERSION+"/"+DETECTOR_CONFIG+"/detector_benchmarks/"+SUBSYSTEM+"/"+BENCHMARK+"/processed_data/"+DETECTOR_VERSION+"_"+DETECTOR_CONFIG+"_{index}.txt".format(index=index) for index in NFILES],
         script = "train_dense_neural_network.py"
+    params:
+        detector_version=DETECTOR_VERSION,
+        detector_config=DETECTOR_CONFIG,
+        subsystem=SUBSYSTEM,
+        benchmark=BENCHMARK
     output:
-        expand("results/"+str(DETECTOR_VERSION)+"/{detector_config}/detector_benchmarks/"+str(SUBSYSTEM)+"/"+str(BENCHMARK)+"/trained_models/model_pz_"+str(DETECTOR_VERSION)+"_{detector_config}_"+str(SUBSYSTEM)+"_"+str(BENCHMARK)+"_{model_version}.pt",
-               detector_config=DETECTOR_CONFIG,
-               model_version=MODEL_VERSION),
-        expand("results/"+str(DETECTOR_VERSION)+"/{detector_config}/detector_benchmarks/"+str(SUBSYSTEM)+"/"+str(BENCHMARK)+"/trained_models/model_py_"+str(DETECTOR_VERSION)+"_{detector_config}_"+str(SUBSYSTEM)+"_"+str(BENCHMARK)+"_{model_version}.pt",
-               detector_config=DETECTOR_CONFIG,
-               model_version=MODEL_VERSION),
-        expand("results/"+str(DETECTOR_VERSION)+"/{detector_config}/detector_benchmarks/"+str(SUBSYSTEM)+"/"+str(BENCHMARK)+"/trained_models/model_px_"+str(DETECTOR_VERSION)+"_{detector_config}_"+str(SUBSYSTEM)+"_"+str(BENCHMARK)+"_{model_version}.pt",
-               detector_config=DETECTOR_CONFIG,
-               model_version=MODEL_VERSION),
-        expand("results/"+str(DETECTOR_VERSION)+"/{detector_config}/detector_benchmarks/"+str(SUBSYSTEM)+"/"+str(BENCHMARK)+"/artifacts/LossVsEpoch_model_pz_"+str(DETECTOR_VERSION)+"_{detector_config}_"+str(SUBSYSTEM)+"_"+str(BENCHMARK)+"_{model_version}.png",
-               detector_config=DETECTOR_CONFIG,
-               model_version=MODEL_VERSION),
-        expand("results/"+str(DETECTOR_VERSION)+"/{detector_config}/detector_benchmarks/"+str(SUBSYSTEM)+"/"+str(BENCHMARK)+"/artifacts/LossVsEpoch_model_py_"+str(DETECTOR_VERSION)+"_{detector_config}_"+str(SUBSYSTEM)+"_"+str(BENCHMARK)+"_{model_version}.png",
-               detector_config=DETECTOR_CONFIG,
-               model_version=MODEL_VERSION),
-        expand("results/"+str(DETECTOR_VERSION)+"/{detector_config}/detector_benchmarks/"+str(SUBSYSTEM)+"/"+str(BENCHMARK)+"/artifacts/LossVsEpoch_model_px_"+str(DETECTOR_VERSION)+"_{detector_config}_"+str(SUBSYSTEM)+"_"+str(BENCHMARK)+"_{model_version}.png",
-               detector_config=DETECTOR_CONFIG,
-               model_version=MODEL_VERSION)
-
-    run:
-        for detector_config, model_version in product(DETECTOR_CONFIG,MODEL_VERSION):
-            os.system("python "+str(input.script)+" results/"+str(DETECTOR_VERSION)+"/"+str(detector_config)+"/detector_benchmarks/"+str(SUBSYSTEM)+"/"+str(BENCHMARK)+"/metadata/"+str(DETECTOR_VERSION)+"_"+str(detector_config)+"_"+str(SUBSYSTEM)+"_"+str(BENCHMARK)+"_"+str(model_version)+".txt")
-            os.system("mv model_pz_"+str(DETECTOR_VERSION)+"_"+str(detector_config)+"_"+str(SUBSYSTEM)+"_"+str(BENCHMARK)+"_"+str(model_version)+".pt results/"+str(DETECTOR_VERSION)+"/"+str(detector_config)+"/detector_benchmarks/"+str(SUBSYSTEM)+"/"+str(BENCHMARK)+"/trained_models/")
-            os.system("mv model_py_"+str(DETECTOR_VERSION)+"_"+str(detector_config)+"_"+str(SUBSYSTEM)+"_"+str(BENCHMARK)+"_"+str(model_version)+".pt results/"+str(DETECTOR_VERSION)+"/"+str(detector_config)+"/detector_benchmarks/"+str(SUBSYSTEM)+"/"+str(BENCHMARK)+"/trained_models/")
-            os.system("mv model_px_"+str(DETECTOR_VERSION)+"_"+str(detector_config)+"_"+str(SUBSYSTEM)+"_"+str(BENCHMARK)+"_"+str(model_version)+".pt results/"+str(DETECTOR_VERSION)+"/"+str(detector_config)+"/detector_benchmarks/"+str(SUBSYSTEM)+"/"+str(BENCHMARK)+"/trained_models/")
-            os.system("mv LossVsEpoch_model_pz_"+str(DETECTOR_VERSION)+"_"+str(detector_config)+"_"+str(SUBSYSTEM)+"_"+str(BENCHMARK)+"_"+str(model_version)+".png results/"+str(DETECTOR_VERSION)+"/"+str(detector_config)+"/detector_benchmarks/"+str(SUBSYSTEM)+"/"+str(BENCHMARK)+"/artifacts/")
-            os.system("mv LossVsEpoch_model_py_"+str(DETECTOR_VERSION)+"_"+str(detector_config)+"_"+str(SUBSYSTEM)+"_"+str(BENCHMARK)+"_"+str(model_version)+".png results/"+str(DETECTOR_VERSION)+"/"+str(detector_config)+"/detector_benchmarks/"+str(SUBSYSTEM)+"/"+str(BENCHMARK)+"/artifacts/")
-            os.system("mv LossVsEpoch_model_px_"+str(DETECTOR_VERSION)+"_"+str(detector_config)+"_"+str(SUBSYSTEM)+"_"+str(BENCHMARK)+"_"+str(model_version)+".png results/"+str(DETECTOR_VERSION)+"/"+str(detector_config)+"/detector_benchmarks/"+str(SUBSYSTEM)+"/"+str(BENCHMARK)+"/artifacts/")
-
-
+        "results/"+DETECTOR_VERSION+"/"+DETECTOR_CONFIG+"/detector_benchmarks/"+SUBSYSTEM+"/"+BENCHMARK+"/artifacts/model_pz/num_epochs_{num_epochs}/learning_rate_{learning_rate}/size_input_{size_input}/size_output_{size_output}/n_layers_{n_layers}/size_first_hidden_layer_{size_first_hidden_layer}/multiplier_{multiplier}/leak_rate_{leak_rate}/model_pz.pt",
+        "results/"+DETECTOR_VERSION+"/"+DETECTOR_CONFIG+"/detector_benchmarks/"+SUBSYSTEM+"/"+BENCHMARK+"/artifacts/model_pz/num_epochs_{num_epochs}/learning_rate_{learning_rate}/size_input_{size_input}/size_output_{size_output}/n_layers_{n_layers}/size_first_hidden_layer_{size_first_hidden_layer}/multiplier_{multiplier}/leak_rate_{leak_rate}/LossVsEpoch_model_pz.png"
+    shell:
+        """
+        python {input.script} --input_files {input.data} --model_name model_pz --model_dir results/{params.detector_version}/{params.detector_config}/detector_benchmarks/{params.subsystem}/{params.benchmark}/artifacts/model_pz/num_epochs_{wildcards.num_epochs}/learning_rate_{wildcards.learning_rate}/size_input_{wildcards.size_input}/size_output_{wildcards.size_output}/n_layers_{wildcards.n_layers}/size_first_hidden_layer_{wildcards.size_first_hidden_layer}/multiplier_{wildcards.multiplier}/leak_rate_{wildcards.leak_rate} --num_epochs {wildcards.num_epochs} --learning_rate {wildcards.learning_rate} --size_input {wildcards.size_input} --size_output {wildcards.size_output} --n_layers {wildcards.n_layers} --size_first_hidden_layer {wildcards.size_first_hidden_layer} --multiplier {wildcards.multiplier} --leak_rate {wildcards.leak_rate}
+        """
+
+rule roman_pots_train_model_py:
+    input:
+        data = ["results/"+DETECTOR_VERSION+"/"+DETECTOR_CONFIG+"/detector_benchmarks/"+SUBSYSTEM+"/"+BENCHMARK+"/processed_data/"+DETECTOR_VERSION+"_"+DETECTOR_CONFIG+"_{index}.txt".format(index=index) for index in NFILES],
+        script = "train_dense_neural_network.py"
+    params:
+        detector_version=DETECTOR_VERSION,
+        detector_config=DETECTOR_CONFIG,
+        subsystem=SUBSYSTEM,
+        benchmark=BENCHMARK
+    output:
+        "results/"+DETECTOR_VERSION+"/"+DETECTOR_CONFIG+"/detector_benchmarks/"+SUBSYSTEM+"/"+BENCHMARK+"/artifacts/model_py/num_epochs_{num_epochs}/learning_rate_{learning_rate}/size_input_{size_input}/size_output_{size_output}/n_layers_{n_layers}/size_first_hidden_layer_{size_first_hidden_layer}/multiplier_{multiplier}/leak_rate_{leak_rate}/model_py.pt",
+        "results/"+DETECTOR_VERSION+"/"+DETECTOR_CONFIG+"/detector_benchmarks/"+SUBSYSTEM+"/"+BENCHMARK+"/artifacts/model_py/num_epochs_{num_epochs}/learning_rate_{learning_rate}/size_input_{size_input}/size_output_{size_output}/n_layers_{n_layers}/size_first_hidden_layer_{size_first_hidden_layer}/multiplier_{multiplier}/leak_rate_{leak_rate}/LossVsEpoch_model_py.png"
+    shell:
+        """
+        python {input.script} --input_files {input.data} --model_name model_py --model_dir results/{params.detector_version}/{params.detector_config}/detector_benchmarks/{params.subsystem}/{params.benchmark}/artifacts/model_py/num_epochs_{wildcards.num_epochs}/learning_rate_{wildcards.learning_rate}/size_input_{wildcards.size_input}/size_output_{wildcards.size_output}/n_layers_{wildcards.n_layers}/size_first_hidden_layer_{wildcards.size_first_hidden_layer}/multiplier_{wildcards.multiplier}/leak_rate_{wildcards.leak_rate} --num_epochs {wildcards.num_epochs} --learning_rate {wildcards.learning_rate} --size_input {wildcards.size_input} --size_output {wildcards.size_output} --n_layers {wildcards.n_layers} --size_first_hidden_layer {wildcards.size_first_hidden_layer} --multiplier {wildcards.multiplier} --leak_rate {wildcards.leak_rate}
+        """
+
+rule roman_pots_train_model_px:
+    input:
+        data = ["results/"+DETECTOR_VERSION+"/"+DETECTOR_CONFIG+"/detector_benchmarks/"+SUBSYSTEM+"/"+BENCHMARK+"/processed_data/"+DETECTOR_VERSION+"_"+DETECTOR_CONFIG+"_{index}.txt".format(index=index) for index in NFILES],
+        script = "train_dense_neural_network.py"
+    params:
+        detector_version=DETECTOR_VERSION,
+        detector_config=DETECTOR_CONFIG,
+        subsystem=SUBSYSTEM,
+        benchmark=BENCHMARK
+    output:
+        "results/"+DETECTOR_VERSION+"/"+DETECTOR_CONFIG+"/detector_benchmarks/"+SUBSYSTEM+"/"+BENCHMARK+"/artifacts/model_px/num_epochs_{num_epochs}/learning_rate_{learning_rate}/size_input_{size_input}/size_output_{size_output}/n_layers_{n_layers}/size_first_hidden_layer_{size_first_hidden_layer}/multiplier_{multiplier}/leak_rate_{leak_rate}/model_px.pt",
+        "results/"+DETECTOR_VERSION+"/"+DETECTOR_CONFIG+"/detector_benchmarks/"+SUBSYSTEM+"/"+BENCHMARK+"/artifacts/model_px/num_epochs_{num_epochs}/learning_rate_{learning_rate}/size_input_{size_input}/size_output_{size_output}/n_layers_{n_layers}/size_first_hidden_layer_{size_first_hidden_layer}/multiplier_{multiplier}/leak_rate_{leak_rate}/LossVsEpoch_model_px.png"
    shell:
        """
        python {input.script} --input_files {input.data} --model_name model_px --model_dir results/{params.detector_version}/{params.detector_config}/detector_benchmarks/{params.subsystem}/{params.benchmark}/artifacts/model_px/num_epochs_{wildcards.num_epochs}/learning_rate_{wildcards.learning_rate}/size_input_{wildcards.size_input}/size_output_{wildcards.size_output}/n_layers_{wildcards.n_layers}/size_first_hidden_layer_{wildcards.size_first_hidden_layer}/multiplier_{wildcards.multiplier}/leak_rate_{wildcards.leak_rate} --num_epochs {wildcards.num_epochs} --learning_rate {wildcards.learning_rate} --size_input {wildcards.size_input} --size_output {wildcards.size_output} --n_layers {wildcards.n_layers} --size_first_hidden_layer {wildcards.size_first_hidden_layer} --multiplier {wildcards.multiplier} --leak_rate {wildcards.leak_rate}
        """
diff --git a/benchmarks/roman_pots/train_dense_neural_network.py b/benchmarks/roman_pots/train_dense_neural_network.py
index 5f9dec91..36bd9bd6 100644
--- a/benchmarks/roman_pots/train_dense_neural_network.py
+++ b/benchmarks/roman_pots/train_dense_neural_network.py
@@ -8,6 +8,8 @@ import matplotlib.pyplot as plt
 import argparse
 import sys
+import hashlib
+
 torch.set_default_dtype(torch.float32)
 
 if torch.cuda.is_available():
@@ -47,30 +49,13 @@ def standardize(x):
     standardized_tensor = (x - mean) / std
     return standardized_tensor, mean, std
 
-def train_model(name, input_tensor, target_tensor, model, hyperparameters):
-    # Set hyperparameters
-    match name:
-        case "model_pz":
-            num_epochs = int(hyperparameters.num_epochs_pz)
-            learning_rate = float(hyperparameters.learning_rate_pz)
-        case "model_py":
-            num_epochs = int(hyperparameters.num_epochs_py)
-            learning_rate = float(hyperparameters.learning_rate_py)
-        case "model_px":
-            num_epochs = int(hyperparameters.num_epochs_px)
-            learning_rate = float(hyperparameters.learning_rate_px)
-        case _:
-            print("No model name provided. Return without further processing")
-            return
-    print("Set number of epochs and learning rate to "+str(num_epochs)+" and "+str(learning_rate)+" for "+str(name)+" training.")
-
-
+def train_model(input_tensor, target_tensor, model, hyperparameters):
     # Send model to device
     model=model.to(device)
 
     # Define the loss function and optimizer
     criterion = torch.nn.HuberLoss(reduction='mean', delta=1.0)
-    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
+    optimizer = torch.optim.Adam(model.parameters(), lr=hyperparameters.learning_rate)
 
     # Create a learning rate scheduler
     scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,'min',patience=100,cooldown=100,factor=0.5,threshold=1e-4,verbose=True)
@@ -79,7 +64,7 @@ def train_model(name, input_tensor, target_tensor, model, hyperparameters):
     losses = []
 
     # Train the model
-    for epoch in range(num_epochs):
+    for epoch in range(hyperparameters.num_epochs):
         # Forward pass
         inputs, targets = input_tensor.to(device), target_tensor.to(device)
         predictions = model(inputs)
@@ -98,18 +83,18 @@ def train_model(name, input_tensor, target_tensor, model, hyperparameters):
 
         # Print progress
         if (epoch + 1) % 10 == 0:
-            print("Epoch "+str(epoch+1)+"/"+str(num_epochs)+", Loss: "+"{0:0.10f}".format(loss.item()))
+            print("Epoch "+str(epoch+1)+"/"+str(hyperparameters.num_epochs)+", Loss: "+"{0:0.10f}".format(loss.item()))
 
     # Plot the loss values
     plt.figure()
-    plt.plot(range(1, num_epochs+1), losses)
+    plt.plot(range(1, hyperparameters.num_epochs+1), losses)
     plt.xlabel('Epoch')
     plt.ylabel('Loss')
     plt.title('Loss as a Function of Epoch')
     plt.yscale('log')
-    plt.savefig("LossVsEpoch_"+name+"_"+str(hyperparameters.model_version)+".png")
+    plt.savefig(hyperparameters.model_dir+"/LossVsEpoch_"+hyperparameters.model_name+".png")
 
-    torch.jit.script(model).save(name+"_"+str(hyperparameters.model_version)+".pt")
+    torch.jit.script(model).save(hyperparameters.model_dir+"/"+hyperparameters.model_name+".pt")
     return
 
 def run_experiment(hyperparameters):
@@ -117,63 +102,62 @@ def run_experiment(hyperparameters):
 
     # Load training data in tensors
     training_data = pd.DataFrame()
-    for i in range(1,int(hyperparameters.num_training_inputs)+1):
-        temp_training_data = pd.read_csv(hyperparameters.input_files+str(i)+'.txt', delimiter='\t', header=None)
+    for i in hyperparameters.input_files:
+        temp_training_data = pd.read_csv(i, delimiter='\t', header=None)
         training_data = pd.concat([training_data, temp_training_data], ignore_index=True)
 
     training_RP_pos_tensor = torch.tensor(training_data.iloc[:,3:7].values, dtype=torch.float32)
     training_MC_mom_tensor = torch.tensor(training_data.iloc[:,0:3].values, dtype=torch.float32)
 
     # Standardize training data
-    source_pz = training_RP_pos_tensor
-    scaled_source_pz, mean_source_pz, std_source_pz = standardize(source_pz)
-    target_pz = training_MC_mom_tensor[:,2].unsqueeze(1)
-
-    source_py = torch.cat((training_RP_pos_tensor[:,2:4], training_MC_mom_tensor[:,2].unsqueeze(1)), 1)
-    scaled_source_py, mean_source_py, std_source_py = standardize(source_py)
-    target_py = training_MC_mom_tensor[:,1].unsqueeze(1)
+    match hyperparameters.model_name:
+        case "model_pz":
+            source = training_RP_pos_tensor
+            scaled_source, mean_source, std_source = standardize(source)
+            target = training_MC_mom_tensor[:,2].unsqueeze(1)
+
+        case "model_py":
+            source = torch.cat((training_RP_pos_tensor[:,2:4], training_MC_mom_tensor[:,2].unsqueeze(1)), 1)
+            scaled_source, mean_source, std_source = standardize(source)
+            target = training_MC_mom_tensor[:,1].unsqueeze(1)
+
+        case "model_px":
+            source = torch.cat((training_RP_pos_tensor[:,0:2], training_MC_mom_tensor[:,2].unsqueeze(1)), 1)
+            scaled_source, mean_source, std_source = standardize(source)
+            target = training_MC_mom_tensor[:,0].unsqueeze(1)
-    source_px = torch.cat((training_RP_pos_tensor[:,0:2], training_MC_mom_tensor[:,2].unsqueeze(1)), 1)
-    scaled_source_px, mean_source_px, std_source_px = standardize(source_px)
-    target_px = training_MC_mom_tensor[:,0].unsqueeze(1)
+        case _:
+            print("Unrecognized model name. Stopping further processing.")
+            return
 
     # Initialize models
-    initial_model_pz = NeuralNet(size_input=int(hyperparameters.size_input_pz),
-                                 size_output=int(hyperparameters.size_output_pz),
-                                 n_layers=int(hyperparameters.n_layers_pz),
-                                 size_first_hidden_layer=int(hyperparameters.size_first_hidden_layer_pz),
-                                 multiplier=float(hyperparameters.multiplier_pz),
-                                 leak_rate=float(hyperparameters.leak_rate_pz))
-    initial_model_py = NeuralNet(size_input=int(hyperparameters.size_input_py),
-                                 size_output=int(hyperparameters.size_output_py),
-                                 n_layers=int(hyperparameters.n_layers_py),
-                                 size_first_hidden_layer=int(hyperparameters.size_first_hidden_layer_py),
-                                 multiplier=float(hyperparameters.multiplier_py),
-                                 leak_rate=float(hyperparameters.leak_rate_py))
-    initial_model_px = NeuralNet(size_input=int(hyperparameters.size_input_px),
-                                 size_output=int(hyperparameters.size_output_px),
-                                 n_layers=int(hyperparameters.n_layers_px),
-                                 size_first_hidden_layer=int(hyperparameters.size_first_hidden_layer_px),
-                                 multiplier=float(hyperparameters.multiplier_px),
-                                 leak_rate=float(hyperparameters.leak_rate_px))
-
+    initial_model = NeuralNet(size_input=int(hyperparameters.size_input),
+                              size_output=int(hyperparameters.size_output),
+                              n_layers=int(hyperparameters.n_layers),
+                              size_first_hidden_layer=int(hyperparameters.size_first_hidden_layer),
+                              multiplier=float(hyperparameters.multiplier),
+                              leak_rate=float(hyperparameters.leak_rate))
+
     # Train models
-    train_model("model_pz", scaled_source_pz, target_pz, initial_model_pz, hyperparameters)
-    train_model("model_py", scaled_source_py, target_py, initial_model_py, hyperparameters)
-    train_model("model_px", scaled_source_px, target_px, initial_model_px, hyperparameters)
-
+    train_model(scaled_source, target, initial_model, hyperparameters)
+
     # Print end statement
-    print("Training completed using "+str(int(hyperparameters.nevents_per_file)*int(hyperparameters.num_training_inputs))+" generated events.")
+    print("Training completed using "+str(len(hyperparameters.input_files))+" files with "+str(training_RP_pos_tensor.shape[0])+" eligible events.")
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(fromfile_prefix_chars='@')
-    hyperparameters_list = ['--input_files', '--model_version', '--nevents_per_file', '--num_training_inputs',
-                            '--num_epochs_pz', '--learning_rate_pz', '--size_input_pz', '--size_output_pz', '--n_layers_pz', '--size_first_hidden_layer_pz', '--multiplier_pz', '--leak_rate_pz',
-                            '--num_epochs_py', '--learning_rate_py', '--size_input_py', '--size_output_py', '--n_layers_py', '--size_first_hidden_layer_py', '--multiplier_py', '--leak_rate_py',
-                            '--num_epochs_px', '--learning_rate_px', '--size_input_px', '--size_output_px', '--n_layers_px', '--size_first_hidden_layer_px', '--multiplier_px', '--leak_rate_px']
-    for hyperparameter in hyperparameters_list:
-        parser.add_argument(hyperparameter)
-    hyperparameters = parser.parse_args(['@'+str(sys.argv[1])])
+    parser = argparse.ArgumentParser(description="Train neural network model for roman pots")
+    parser.add_argument('--input_files', type=str, nargs='+', required=True, help='Specify locations of input files.')
+    parser.add_argument('--model_name', type=str, required=True, help='Specify model name.')
+    parser.add_argument('--model_dir', type=str, required=True, help='Specify location to save model.')
+    parser.add_argument('--num_epochs', type=int, required=True, help='Specify number of epochs.')
+    parser.add_argument('--learning_rate', type=float, required=True, help='Specify learning rate.')
+    parser.add_argument('--size_input', type=int, required=True, help='Specify input size.')
+    parser.add_argument('--size_output', type=int, required=True, help='Specify output size.')
+    parser.add_argument('--n_layers', type=int, required=True, help='Specify number of layers.')
+    parser.add_argument('--size_first_hidden_layer', type=int, required=True, help='Specify size of first hidden layer.')
+    parser.add_argument('--multiplier', type=float, required=True, help='Specify multiplier to calculate size of subsequent hidden layers.')
+    parser.add_argument('--leak_rate', type=float, required=True, help='Specify leak rate.')
+    hyperparameters = parser.parse_args()
 
     run_experiment(hyperparameters)
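
For illustration only (not part of the patch): the directory layout generated by the expand() calls above can be sketched in a few lines of plain Python. The dictionary mirrors MODEL_PZ from the Snakefile; the helper name model_dirs is hypothetical. Each hyperparameter combination maps to its own human-readable nested path, which is what gives every trained model a unique location and lets Snakemake schedule all combinations in parallel.

    from itertools import product

    # Mirrors MODEL_PZ in the Snakefile; model_dirs is an illustrative
    # helper, not part of the patch.
    MODEL_PZ = {
        'num_epochs' : [100],
        'learning_rate' : [0.01],
        'size_input' : [4],
        'size_output' : [1],
        'n_layers' : [3,6],
        'size_first_hidden_layer' : [128],
        'multiplier' : [0.5],
        'leak_rate' : [0.025],
    }

    def model_dirs(grid):
        """Yield one nested directory path per hyperparameter combination."""
        keys = list(grid)
        for values in product(*(grid[key] for key in keys)):
            yield "/".join(f"{key}_{value}" for key, value in zip(keys, values))

    for path in model_dirs(MODEL_PZ):
        print(path)
    # num_epochs_100/learning_rate_0.01/size_input_4/size_output_1/n_layers_3/size_first_hidden_layer_128/multiplier_0.5/leak_rate_0.025
    # num_epochs_100/learning_rate_0.01/size_input_4/size_output_1/n_layers_6/size_first_hidden_layer_128/multiplier_0.5/leak_rate_0.025

Compared with the previous truncated SHA-512 tag, the path itself documents the configuration, so no separate metadata file is needed to recover the hyperparameters of a saved model.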
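Because every output path now encodes its full configuration, a single model can also be rebuilt by naming its path as the Snakemake target, while the default target (rule all) builds the whole grid. An illustrative invocation, with hyperparameter values taken from MODEL_PZ above and <detector_version> as a placeholder:

    snakemake --cores 1 results/<detector_version>/epic_ip6/detector_benchmarks/roman_pots/dense_neural_network/artifacts/model_pz/num_epochs_100/learning_rate_0.01/size_input_4/size_output_1/n_layers_3/size_first_hidden_layer_128/multiplier_0.5/leak_rate_0.025/model_pz.pt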