diff --git a/.github/workflows/icenet-install-test.yml b/.github/workflows/icenet-install-test.yml index 144f5fed..28f9b683 100644 --- a/.github/workflows/icenet-install-test.yml +++ b/.github/workflows/icenet-install-test.yml @@ -113,12 +113,12 @@ jobs: run: | source setenv-github-actions.sh && maxevents=10000; source tests/runme_brem_reweight.sh echo "yes" | source superclean.sh - + # - - name: Deep Learning system integration test (zee 1) - run: | - source setenv-github-actions.sh && maxevents=100; source tests/runme_zee_gridtune.sh - echo "yes" | source superclean.sh + #- name: Deep Learning system integration test (zee 1) + # run: | + # source setenv-github-actions.sh && maxevents=100; source tests/runme_zee_gridtune.sh + # echo "yes" | source superclean.sh # - name: Deep Learning system integration test (zee 2) diff --git a/.gitignore b/.gitignore index 29d52d88..00a04844 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,8 @@ *.zip *.text *.log +*.out +*.output #*.json #*.txt diff --git a/icenet/__init__.py b/icenet/__init__.py index 01cd72d7..ff9bd575 100644 --- a/icenet/__init__.py +++ b/icenet/__init__.py @@ -3,7 +3,7 @@ import os import psutil -__version__ = '0.1.3.2' +__version__ = '0.1.3.3' __release__ = 'alpha' __date__ = '21/10/2024' __author__ = 'm.mieskolainen@imperial.ac.uk' diff --git a/icenet/tools/process.py b/icenet/tools/process.py index 1a531122..6b8975b1 100644 --- a/icenet/tools/process.py +++ b/icenet/tools/process.py @@ -224,18 +224,18 @@ def read_config(config_path='configs/xyz/', runmode='all'): hash_args = {} # Critical Python file content - files = {'inputvars': os.path.join(cwd, config_path, f'{args["inputvars"]}.py'), - 'cuts': os.path.join(cwd, config_path, f'cuts.py'), - 'filter': os.path.join(cwd, config_path, f'filter.py'), - 'common': os.path.join(cwd, args['rootname'], f'common.py')} - + files = {'inputvars': os.path.join(cwd, config_path, f'{args["inputvars"]}.py'), + 'cuts': os.path.join(cwd, config_path, f'cuts.py'), + 'filter': os.path.join(cwd, config_path, f'filter.py'), + 'common': os.path.join(cwd, 'ice' + args['rootname'], f'common.py')} + for key in files.keys(): if os.path.exists(files[key]): - print(f"Cache introspection for the file: '{files[key]}'") hash_args[f'__hash__{key}'] = io.make_hash_sha256_file(files[key]) + print(f"Cache introspection for the file: '{files[key]}' [done]", 'green') else: - print(f"Did not find: {files[key]} [may cause crash if your application depends on it]", 'red') - + print(f"Cache introspection did not find: {files[key]} [may cause crash if your application depends on it]", 'red') + # Genesis parameters as the first one hash_args.update(old_args['genesis_runmode']) diff --git a/tests/runme_zee_gridtune.sh b/tests/runme_zee_gridtune.sh index 3781f3f3..2f2421a9 100644 --- a/tests/runme_zee_gridtune.sh +++ b/tests/runme_zee_gridtune.sh @@ -88,6 +88,25 @@ fi echo "DATAPATH is set to $DATAPATH" echo "CONFIG is set to $CONFIG" +# ----------------------------------------------------------------------- +# Initialization + +# Ensure that GRID_ID and GRID_NODES are set to special values + +if [[ $GRID_ID == -1 && $GRID_NODES == 1 ]]; then + + python analysis/zee.py --runmode genesis $MAX --config ${CONFIG}.yml --datapath $DATAPATH + + python analysis/zee.py --runmode train $MAX --config ${CONFIG}.yml --datapath $DATAPATH \ + --modeltag GRIDTUNE --run_id "INIT" --compute 0 + + python analysis/zee.py --runmode eval $MAX --config ${CONFIG}.yml --datapath $DATAPATH \ + --modeltag GRIDTUNE --run_id "INIT" --compute 0 + + return 0 # do not use exit +fi + + # ----------------------------------------------------------------------- # Generic functions @@ -182,8 +201,14 @@ echo "" # 4. Run python analysis/zee.py --runmode genesis $MAX --config ${CONFIG}.yml --datapath $DATAPATH -python analysis/zee.py --runmode train $MAX --config ${CONFIG}.yml --datapath $DATAPATH --modeltag GRIDTUNE --run_id $RUN_ID --supertune "${SUPERTUNE}" # Note " " -python analysis/zee.py --runmode eval $MAX --config ${CONFIG}.yml --datapath $DATAPATH --modeltag GRIDTUNE --run_id $RUN_ID --evaltag "minloss" --supertune "models.iceboost_swd.readmode=-1" -python analysis/zee.py --runmode eval $MAX --config ${CONFIG}.yml --datapath $DATAPATH --modeltag GRIDTUNE --run_id $RUN_ID --evaltag "last" --supertune "models.iceboost_swd.readmode=-2" + +python analysis/zee.py --runmode train $MAX --config ${CONFIG}.yml --datapath $DATAPATH \ + --modeltag GRIDTUNE --run_id $RUN_ID --supertune "${SUPERTUNE}" # Note " " + +python analysis/zee.py --runmode eval $MAX --config ${CONFIG}.yml --datapath $DATAPATH \ + --modeltag GRIDTUNE --run_id $RUN_ID --evaltag "minloss" --supertune "models.iceboost_swd.readmode=-1" + +python analysis/zee.py --runmode eval $MAX --config ${CONFIG}.yml --datapath $DATAPATH \ + --modeltag GRIDTUNE --run_id $RUN_ID --evaltag "last" --supertune "models.iceboost_swd.readmode=-2" done diff --git a/tests/zee/gridtune.dag b/tests/zee/gridtune.dag new file mode 100644 index 00000000..c639e17d --- /dev/null +++ b/tests/zee/gridtune.dag @@ -0,0 +1,13 @@ +#!/bin/bash +# +# Grid tuning job with init structure using Condor DAGMan +# +# Submit with: +# condor_submit_dag gridtune.dag + +# Filename: job_dependency.dag +JOB A gridtune_init.job # First init job +JOB B gridtune_array.job # Array job + +# Make B depend on A finishing successfully +PARENT A CHILD B diff --git a/tests/zee/gridtune_array.job b/tests/zee/gridtune_array.job new file mode 100644 index 00000000..f6aa1292 --- /dev/null +++ b/tests/zee/gridtune_array.job @@ -0,0 +1,14 @@ +# Array job + +executable = gridtune_task.sh +arguments = "$(PROCESS) 4 $(ClusterId)" +error = gridtune_array.$(CLUSTER).$(PROCESS).out +output = gridtune_array.$(CLUSTER).$(PROCESS).output +log = gridtune_array.$(CLUSTER).$(PROCESS).log +request_gpus = 1 +request_memory = 80G +#requirements = TARGET.GPUs_DeviceName =?= "Tesla V100-PCIE-32GB" +requirements = TARGET.GPUs_DeviceName =?= "Tesla P100-PCIE-12GB" ++MaxRuntime = 86000 + +queue 4 diff --git a/tests/zee/gridtune_init.job b/tests/zee/gridtune_init.job new file mode 100644 index 00000000..11bca24a --- /dev/null +++ b/tests/zee/gridtune_init.job @@ -0,0 +1,13 @@ +# Initialization job + +executable = gridtune_task.sh +arguments = "-1 1 $(ClusterId)" +error = gridtune_init.$(CLUSTER).out +output = gridtune_init.$(CLUSTER).output +log = gridtune_init.$(CLUSTER).log +request_gpus = 1 +request_memory = 80G +#requirements = TARGET.GPUs_DeviceName =?= "Tesla V100-PCIE-32GB" +requirements = TARGET.GPUs_DeviceName =?= "Tesla P100-PCIE-12GB" ++MaxRuntime = 86000 +queue diff --git a/tests/zee/gridtune_task.sh b/tests/zee/gridtune_task.sh new file mode 100755 index 00000000..817eb412 --- /dev/null +++ b/tests/zee/gridtune_task.sh @@ -0,0 +1,34 @@ +#!/bin/sh +# +# GPU grid tuning task + +echo "Grid tuning job started" +pwd + +ICEPATH="/vols/cms/mmieskol/icenet" + +# ** icenet/setenv.sh uses these ** +export HTC_PROCESS_ID=$1 +export HTC_QUEUE_SIZE=$2 +export HTC_CLUSTER_ID=$3 + +# Init conda +source /home/hep/mmieskol/setconda.sh +conda activate icenet + +# Init icenet +mkdir $ICEPATH/tmp -p +cd $ICEPATH +source $ICEPATH/setenv.sh + +# Execute +DATAPATH="/vols/cms/pfk18/phd/hgg/Jul23/NN21July/N/validations/outputs/Csplit_Jsamp/files" +CONFIG="tune0_EB" +maxevents=150000 +source /vols/cms/mmieskol/icenet/tests/runme_zee_gridtune.sh + +# Create the done file when the job completes +donefile="${ICEPATH}/tmp/icenet_${HTC_CLUSTER_ID}_${HTC_PROCESS_ID}.done" +touch $donefile + +echo "Task done, created file: ${donefile}" diff --git a/tests/zee/submit.sh b/tests/zee/submit.sh new file mode 100644 index 00000000..3b6eac11 --- /dev/null +++ b/tests/zee/submit.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +# Condor submission with first init job and then +# an array job once that is finished +# +# Emulating DAGMan without using it. +# +# Run with: source submit.sh +# +# m.mieskolainen@imperial.ac.uk, 2024 + +ICEPATH="/vols/cms/mmieskol/icenet" + +TASK_SCRIPT="gridtune_task.sh" +INIT_JOB="gridtune_init.job" +ARRAY_JOB="gridtune_array.job" +PERIOD=15 + +# Submit the first job +echo "Submitting init job" +FIRST_JOB_ID=$(condor_submit $INIT_JOB | awk '/submitted to cluster/ {print int($6)}') +echo " " +cat $INIT_JOB + +# Check if job submission was successful +if [[ -z "$FIRST_JOB_ID" ]]; then + echo "Error: Failed to submit the first job" + exit 1 +fi + +sleep 5 + +echo "First job with ID = ${FIRST_JOB_ID}" +echo "Waiting for the first job to finish" + +# Initialize start time for cumulative waiting +start_time=$(date +%s) + +while true; do + # Check if the job is still in the queue + job_status=$(condor_q $FIRST_JOB_ID -format "%d" JobStatus 2>/dev/null) + + # If condor_q returns nothing, check condor_history + if [ -z "$job_status" ]; then + # Job is no longer in the queue, check the history + job_status=$(condor_history $FIRST_JOB_ID -limit 1 -format "%d" JobStatus 2>/dev/null) + + # Exit the loop if the job has completed + if [ "$job_status" -eq "4" ]; then + echo "Job completed successfully." + break + else + echo "Job is no longer running but didn't finish as expected -- exit" + exit 0 + fi + fi + + # Calculate the cumulative time spent waiting + current_time=$(date +%s) + elapsed_time=$((current_time - start_time)) + elapsed_minutes=$((elapsed_time / 60)) + elapsed_seconds=$((elapsed_time % 60)) + + # Otherwise, job is still in the queue, and we can wait + echo "Job is still running (status: $job_status). Checking again in ${PERIOD} seconds..." + echo "Cumulative waiting time: ${elapsed_minutes} minute(s) and ${elapsed_seconds} second(s)" + sleep $PERIOD +done + +# Submit the array job +echo "Submitting array job" +condor_submit $ARRAY_JOB +echo " " +cat $ARRAY_JOB