Commit
add new grid scripts [no ci]
mieskolainen committed Oct 23, 2024
1 parent 4448a21 commit ae98407
Showing 10 changed files with 192 additions and 17 deletions.
10 changes: 5 additions & 5 deletions .github/workflows/icenet-install-test.yml
@@ -113,12 +113,12 @@ jobs:
run: |
source setenv-github-actions.sh && maxevents=10000; source tests/runme_brem_reweight.sh
echo "yes" | source superclean.sh
#
-- name: Deep Learning system integration test (zee 1)
-  run: |
-    source setenv-github-actions.sh && maxevents=100; source tests/runme_zee_gridtune.sh
-    echo "yes" | source superclean.sh
+#- name: Deep Learning system integration test (zee 1)
+#  run: |
+#    source setenv-github-actions.sh && maxevents=100; source tests/runme_zee_gridtune.sh
+#    echo "yes" | source superclean.sh

#
- name: Deep Learning system integration test (zee 2)
2 changes: 2 additions & 0 deletions .gitignore
@@ -11,6 +11,8 @@
*.zip
*.text
*.log
+*.out
+*.output
#*.json
#*.txt

2 changes: 1 addition & 1 deletion icenet/__init__.py
@@ -3,7 +3,7 @@
import os
import psutil

-__version__ = '0.1.3.2'
+__version__ = '0.1.3.3'
__release__ = 'alpha'
__date__ = '21/10/2024'
__author__ = '[email protected]'
16 changes: 8 additions & 8 deletions icenet/tools/process.py
@@ -224,18 +224,18 @@ def read_config(config_path='configs/xyz/', runmode='all'):
hash_args = {}

# Critical Python file content
-files = {'inputvars': os.path.join(cwd, config_path, f'{args["inputvars"]}.py'),
-         'cuts':      os.path.join(cwd, config_path, f'cuts.py'),
-         'filter':    os.path.join(cwd, config_path, f'filter.py'),
-         'common':    os.path.join(cwd, args['rootname'], f'common.py')}
+files = {'inputvars': os.path.join(cwd, config_path, f'{args["inputvars"]}.py'),
+         'cuts':      os.path.join(cwd, config_path, f'cuts.py'),
+         'filter':    os.path.join(cwd, config_path, f'filter.py'),
+         'common':    os.path.join(cwd, 'ice' + args['rootname'], f'common.py')}

 for key in files.keys():
     if os.path.exists(files[key]):
-        print(f"Cache introspection for the file: '{files[key]}'")
         hash_args[f'__hash__{key}'] = io.make_hash_sha256_file(files[key])
+        print(f"Cache introspection for the file: '{files[key]}' [done]", 'green')
     else:
-        print(f"Did not find: {files[key]} [may cause crash if your application depends on it]", 'red')
+        print(f"Cache introspection did not find: {files[key]} [may cause crash if your application depends on it]", 'red')

# Genesis parameters as the first one
hash_args.update(old_args['genesis_runmode'])

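The hunk above hashes the byte content of each critical configuration file (via io.make_hash_sha256_file) so that cached pipeline outputs are invalidated whenever the cuts, filter, input-variable, or per-analysis common.py files change. A minimal shell sketch of the same content-hash idea, with a hypothetical file list and cache directory (not taken from the icenet codebase):

# Hypothetical sketch: derive a cache directory name from the SHA-256
# of the critical files, so that any edit forces a recompute
files="configs/xyz/cuts.py configs/xyz/filter.py"
hash=$(cat $files | sha256sum | cut -d ' ' -f 1)
cachedir="tmp/cache_${hash}"
if [[ -d "$cachedir" ]]; then
    echo "Cache hit: $cachedir"
else
    echo "Cache miss, recomputing into: $cachedir"
    mkdir -p "$cachedir"
fi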
31 changes: 28 additions & 3 deletions tests/runme_zee_gridtune.sh
@@ -88,6 +88,25 @@ fi
echo "DATAPATH is set to $DATAPATH"
echo "CONFIG is set to $CONFIG"

+# -----------------------------------------------------------------------
+# Initialization
+
+# If GRID_ID and GRID_NODES are set to the special init values, run only the one-off initialization pass
+
+if [[ $GRID_ID == -1 && $GRID_NODES == 1 ]]; then
+
+    python analysis/zee.py --runmode genesis $MAX --config ${CONFIG}.yml --datapath $DATAPATH
+
+    python analysis/zee.py --runmode train $MAX --config ${CONFIG}.yml --datapath $DATAPATH \
+        --modeltag GRIDTUNE --run_id "INIT" --compute 0
+
+    python analysis/zee.py --runmode eval $MAX --config ${CONFIG}.yml --datapath $DATAPATH \
+        --modeltag GRIDTUNE --run_id "INIT" --compute 0
+
+    return 0 # do not use exit (this script is sourced)
+fi
+
# -----------------------------------------------------------------------
# Generic functions

@@ -182,8 +201,14 @@ echo ""

# 4. Run
python analysis/zee.py --runmode genesis $MAX --config ${CONFIG}.yml --datapath $DATAPATH
-python analysis/zee.py --runmode train $MAX --config ${CONFIG}.yml --datapath $DATAPATH --modeltag GRIDTUNE --run_id $RUN_ID --supertune "${SUPERTUNE}" # Note " "
-python analysis/zee.py --runmode eval $MAX --config ${CONFIG}.yml --datapath $DATAPATH --modeltag GRIDTUNE --run_id $RUN_ID --evaltag "minloss" --supertune "models.iceboost_swd.readmode=-1"
-python analysis/zee.py --runmode eval $MAX --config ${CONFIG}.yml --datapath $DATAPATH --modeltag GRIDTUNE --run_id $RUN_ID --evaltag "last" --supertune "models.iceboost_swd.readmode=-2"
+
+python analysis/zee.py --runmode train $MAX --config ${CONFIG}.yml --datapath $DATAPATH \
+    --modeltag GRIDTUNE --run_id $RUN_ID --supertune "${SUPERTUNE}" # Note the quotes " "
+
+python analysis/zee.py --runmode eval $MAX --config ${CONFIG}.yml --datapath $DATAPATH \
+    --modeltag GRIDTUNE --run_id $RUN_ID --evaltag "minloss" --supertune "models.iceboost_swd.readmode=-1"
+
+python analysis/zee.py --runmode eval $MAX --config ${CONFIG}.yml --datapath $DATAPATH \
+    --modeltag GRIDTUNE --run_id $RUN_ID --evaltag "last" --supertune "models.iceboost_swd.readmode=-2"

done
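The sentinel pair GRID_ID == -1 with GRID_NODES == 1 selects the one-off initialization branch added above: genesis runs once, then train and eval run with --compute 0, which presumably builds the shared caches without the actual computation, so the array-job workers do not race to create them. A hedged sketch of the intended call pattern, assuming GRID_ID and GRID_NODES are plain environment variables read by this script:

# Assumed invocation pattern (illustration only): one init pass,
# then one worker pass per grid node
export GRID_ID=-1 GRID_NODES=1        # sentinel values: init pass
source tests/runme_zee_gridtune.sh    # builds caches, returns early

for id in 0 1 2 3; do                 # e.g. 4 worker nodes
    export GRID_ID=$id GRID_NODES=4
    source tests/runme_zee_gridtune.sh
done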
13 changes: 13 additions & 0 deletions tests/zee/gridtune.dag
@@ -0,0 +1,13 @@
# Filename: gridtune.dag
#
# Grid tuning job with init structure using Condor DAGMan
#
# Submit with:
# condor_submit_dag gridtune.dag

# A: first init job, B: array job (DAGMan comments must be on their own lines)
JOB A gridtune_init.job
JOB B gridtune_array.job

# Make B depend on A finishing successfully
PARENT A CHILD B
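This DAG runs the init job to completion before launching the array job. Typical submission and monitoring commands (standard HTCondor; the .dagman.out name follows DAGMan's default convention):

condor_submit_dag gridtune.dag       # submit the DAG
condor_q -dag                        # show DAGMan and its node jobs
tail -f gridtune.dag.dagman.out      # follow DAGMan's own log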
14 changes: 14 additions & 0 deletions tests/zee/gridtune_array.job
@@ -0,0 +1,14 @@
# Array job

executable = gridtune_task.sh
arguments = "$(PROCESS) 4 $(ClusterId)"
error = gridtune_array.$(CLUSTER).$(PROCESS).out
output = gridtune_array.$(CLUSTER).$(PROCESS).output
log = gridtune_array.$(CLUSTER).$(PROCESS).log
request_gpus = 1
request_memory = 80G
#requirements = TARGET.GPUs_DeviceName =?= "Tesla V100-PCIE-32GB"
requirements = TARGET.GPUs_DeviceName =?= "Tesla P100-PCIE-12GB"
+MaxRuntime = 86000

queue 4
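queue 4 spawns four processes with $(Process) = 0..3, which gridtune_task.sh forwards as HTC_PROCESS_ID alongside the queue size 4. A hedged sketch of how a worker could then claim its slice of the tuning grid; the modulo convention and hyperparameter names here are assumptions, not taken from the repo:

# Hypothetical slicing: node GRID_ID of GRID_NODES takes every
# GRID_NODES-th point of the flattened hyperparameter grid
GRID_ID=${1:-0}
GRID_NODES=${2:-4}
i=0
for lr in 0.01 0.05 0.1; do
    for depth in 4 6 8; do
        if (( i % GRID_NODES == GRID_ID )); then
            echo "node $GRID_ID runs: lr=$lr depth=$depth"
        fi
        i=$((i + 1))
    done
done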
13 changes: 13 additions & 0 deletions tests/zee/gridtune_init.job
@@ -0,0 +1,13 @@
# Initialization job

executable = gridtune_task.sh
arguments = "-1 1 $(ClusterId)"
error = gridtune_init.$(CLUSTER).out
output = gridtune_init.$(CLUSTER).output
log = gridtune_init.$(CLUSTER).log
request_gpus = 1
request_memory = 80G
#requirements = TARGET.GPUs_DeviceName =?= "Tesla V100-PCIE-32GB"
requirements = TARGET.GPUs_DeviceName =?= "Tesla P100-PCIE-12GB"
+MaxRuntime = 86000
queue
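Note how the init job's arguments "-1 1 $(ClusterId)" become HTC_PROCESS_ID=-1 and HTC_QUEUE_SIZE=1 in gridtune_task.sh below, which presumably map (via setenv.sh) to the GRID_ID == -1, GRID_NODES == 1 sentinel that triggers the initialization branch in runme_zee_gridtune.sh.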
34 changes: 34 additions & 0 deletions tests/zee/gridtune_task.sh
@@ -0,0 +1,34 @@
#!/bin/bash
#
# GPU grid tuning task

echo "Grid tuning job started"
pwd

ICEPATH="/vols/cms/mmieskol/icenet"

# ** icenet/setenv.sh uses these **
export HTC_PROCESS_ID=$1
export HTC_QUEUE_SIZE=$2
export HTC_CLUSTER_ID=$3

# Init conda
source /home/hep/mmieskol/setconda.sh
conda activate icenet

# Init icenet
mkdir -p "$ICEPATH/tmp"
cd $ICEPATH
source $ICEPATH/setenv.sh

# Execute
DATAPATH="/vols/cms/pfk18/phd/hgg/Jul23/NN21July/N/validations/outputs/Csplit_Jsamp/files"
CONFIG="tune0_EB"
maxevents=150000
source /vols/cms/mmieskol/icenet/tests/runme_zee_gridtune.sh

# Create the done file when the job completes
donefile="${ICEPATH}/tmp/icenet_${HTC_CLUSTER_ID}_${HTC_PROCESS_ID}.done"
touch $donefile

echo "Task done, created file: ${donefile}"
74 changes: 74 additions & 0 deletions tests/zee/submit.sh
@@ -0,0 +1,74 @@
#!/bin/bash

# Condor submission with first init job and then
# an array job once that is finished
#
# Emulating DAGMan without using it.
#
# Run with: source submit.sh
#
# [email protected], 2024

ICEPATH="/vols/cms/mmieskol/icenet"

TASK_SCRIPT="gridtune_task.sh"
INIT_JOB="gridtune_init.job"
ARRAY_JOB="gridtune_array.job"
PERIOD=15

# Submit the first job
echo "Submitting init job"
FIRST_JOB_ID=$(condor_submit $INIT_JOB | awk '/submitted to cluster/ {print int($6)}')
echo " "
cat $INIT_JOB

# Check if job submission was successful
if [[ -z "$FIRST_JOB_ID" ]]; then
echo "Error: Failed to submit the first job"
exit 1
fi

sleep 5

echo "First job with ID = ${FIRST_JOB_ID}"
echo "Waiting for the first job to finish"

# Initialize start time for cumulative waiting
start_time=$(date +%s)

while true; do
    # Check if the job is still in the queue
    job_status=$(condor_q $FIRST_JOB_ID -format "%d" JobStatus 2>/dev/null)

    # If condor_q returns nothing, check condor_history
    if [ -z "$job_status" ]; then
        # Job is no longer in the queue, check the history
        job_status=$(condor_history $FIRST_JOB_ID -limit 1 -format "%d" JobStatus 2>/dev/null)

        # Exit the loop if the job has completed (JobStatus 4 == Completed);
        # string comparison avoids an integer-expression error if empty
        if [ "$job_status" = "4" ]; then
            echo "Job completed successfully."
            break
        else
            echo "Job is no longer running but did not finish as expected (status: ${job_status:-unknown}) -- exit"
            exit 1
        fi
    fi

    # Calculate the cumulative time spent waiting
    current_time=$(date +%s)
    elapsed_time=$((current_time - start_time))
    elapsed_minutes=$((elapsed_time / 60))
    elapsed_seconds=$((elapsed_time % 60))

    # Otherwise, the job is still in the queue and we keep waiting
    echo "Job is still running (status: $job_status). Checking again in ${PERIOD} seconds..."
    echo "Cumulative waiting time: ${elapsed_minutes} minute(s) and ${elapsed_seconds} second(s)"
    sleep $PERIOD
done

# Submit the array job
echo "Submitting array job"
condor_submit $ARRAY_JOB
echo " "
cat $ARRAY_JOB
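Parsing the condor_submit banner with awk works, but condor_submit -terse prints a stable "<first>.<proc> - <last>.<proc>" job-id range that is simpler to capture; a possible alternative, assumed equivalent but not tested here:

# Alternative cluster-id capture via the machine-readable -terse output
FIRST_JOB_ID=$(condor_submit -terse "$INIT_JOB" | cut -d '.' -f 1)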
