diff --git a/helm/bigmon/charts/main/sandbox/local.py.template b/helm/bigmon/charts/main/sandbox/local.py.template index 786a701d..058669f4 100755 --- a/helm/bigmon/charts/main/sandbox/local.py.template +++ b/helm/bigmon/charts/main/sandbox/local.py.template @@ -42,7 +42,7 @@ dbaccess_postgres = { } # Oracle or Postgres -dbaccess = ${BIGMON_DB_ACCESS} +dbaccess = '${BIGMON_DB_ACCESS}' #object store OBJECT_STORE = { diff --git a/helm/bigmon/values.yaml b/helm/bigmon/values.yaml index 4125b049..8124f5c4 100644 --- a/helm/bigmon/values.yaml +++ b/helm/bigmon/values.yaml @@ -9,7 +9,7 @@ main: enabled: true image: - tag: "v0.6.17" + tag: "v0.6.25" autoStart: true diff --git a/helm/harvester/charts/harvester/queueconfig/lsst.panda_queueconfig.json b/helm/harvester/charts/harvester/queueconfig/lsst.panda_queueconfig.json index 5c003ba6..0644dbca 100644 --- a/helm/harvester/charts/harvester/queueconfig/lsst.panda_queueconfig.json +++ b/helm/harvester/charts/harvester/queueconfig/lsst.panda_queueconfig.json @@ -1014,7 +1014,7 @@ "submitter": { "nCore": 1, "nCorePerNode": 1, - "noPilotsWhenNoActiveJobs": false, + "noPilotsWhenNoActiveJobs": true, "ceQueueName": "EL9", "ceARCGridType": "arc", "ceHostname": ["arc-ce01.gridpp.rl.ac.uk", "arc-ce02.gridpp.rl.ac.uk"], @@ -1042,7 +1042,7 @@ "submitter": { "nCore": 1, "nCorePerNode": 1, - "noPilotsWhenNoActiveJobs": false, + "noPilotsWhenNoActiveJobs": true, "submitMode": "PULL", "ceQueueName": "EL9", "ceARCGridType": "arc", @@ -1071,7 +1071,7 @@ "submitter": { "nCore": 1, "nCorePerNode": 1, - "noPilotsWhenNoActiveJobs": false, + "noPilotsWhenNoActiveJobs": true, "submitMode": "PULL", "ceQueueName": "EL9", "ceARCGridType": "arc", @@ -1100,7 +1100,7 @@ "submitter": { "nCore": 1, "nCorePerNode": 1, - "noPilotsWhenNoActiveJobs": false, + "noPilotsWhenNoActiveJobs": true, "submitMode": "PULL", "ceQueueName": "EL9", "ceARCGridType": "arc", @@ -1129,7 +1129,7 @@ "submitter": { "nCore": 1, "nCorePerNode": 1, - "noPilotsWhenNoActiveJobs": false, + "noPilotsWhenNoActiveJobs": true, "submitMode": "PULL", "ceQueueName": "EL9", "ceARCGridType": "arc", diff --git a/helm/harvester/charts/harvester/queueconfig/lsst_prod.panda_queueconfig.json b/helm/harvester/charts/harvester/queueconfig/lsst_prod.panda_queueconfig.json index e279f77e..62900167 100644 --- a/helm/harvester/charts/harvester/queueconfig/lsst_prod.panda_queueconfig.json +++ b/helm/harvester/charts/harvester/queueconfig/lsst_prod.panda_queueconfig.json @@ -1012,7 +1012,7 @@ "submitter": { "nCore": 1, "nCorePerNode": 1, - "noPilotsWhenNoActiveJobs": false, + "noPilotsWhenNoActiveJobs": true, "ceQueueName": "EL9", "ceARCGridType": "arc", "ceHostname": ["arc-ce01.gridpp.rl.ac.uk", "arc-ce02.gridpp.rl.ac.uk"], @@ -1040,7 +1040,7 @@ "submitter": { "nCore": 1, "nCorePerNode": 1, - "noPilotsWhenNoActiveJobs": false, + "noPilotsWhenNoActiveJobs": true, "submitMode": "PULL", "ceQueueName": "EL9", "ceARCGridType": "arc", @@ -1069,7 +1069,7 @@ "submitter": { "nCore": 1, "nCorePerNode": 1, - "noPilotsWhenNoActiveJobs": false, + "noPilotsWhenNoActiveJobs": true, "submitMode": "PULL", "ceQueueName": "EL9", "ceARCGridType": "arc", @@ -1098,7 +1098,7 @@ "submitter": { "nCore": 1, "nCorePerNode": 1, - "noPilotsWhenNoActiveJobs": false, + "noPilotsWhenNoActiveJobs": true, "submitMode": "PULL", "ceQueueName": "EL9", "ceARCGridType": "arc", @@ -1127,7 +1127,7 @@ "submitter": { "nCore": 1, "nCorePerNode": 1, - "noPilotsWhenNoActiveJobs": false, + "noPilotsWhenNoActiveJobs": true, "submitMode": "PULL", "ceQueueName": "EL9", "ceARCGridType": "arc", diff --git a/helm/harvester/charts/harvester/sandbox/health_monitor.py b/helm/harvester/charts/harvester/sandbox/health_monitor.py new file mode 100644 index 00000000..600be822 --- /dev/null +++ b/helm/harvester/charts/harvester/sandbox/health_monitor.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python + +""" +check harvester health +""" + +import os +import re +import subprocess + + +def check_command(command, check_string): + print("Checking command : {0}".format(command)) + print("For string : {0}".format(check_string)) + + tmp_array = command.split() + output = ( + subprocess.Popen(tmp_array, stdout=subprocess.PIPE) + .communicate()[0] + .decode("ascii") + ) + + if re.search(check_string, output): + print("Found the string, return 100") + return 100 + else: + print("String not found, return 0") + return 0 + + +def uwsgi_process_availability(): + # check the uwsgi + process_avail = 0 + output = ( + subprocess.Popen( + "ps -eo pgid,args | grep uwsgi | grep -v grep", + stdout=subprocess.PIPE, + shell=True, + ) + .communicate()[0] + .decode("ascii") + ) + count = 0 + for line in output.split("\n"): + line = line.strip() + if line == "": + continue + count += 1 + if count >= 1: + process_avail = 100 + + print("uwsgi process check availability: %s" % process_avail) + return process_avail + + +def condor_process_availability(): + # check the condor + process_avail = 0 + output = ( + subprocess.Popen( + "ps -eo pgid,args | grep condor_schedd | grep -v grep", + stdout=subprocess.PIPE, + shell=True, + ) + .communicate()[0] + .decode("ascii") + ) + count = 0 + for line in output.split("\n"): + line = line.strip() + if line == "": + continue + count += 1 + if count >= 1: + process_avail = 100 + + print("condor_q process check availability: %s" % process_avail) + return process_avail + + +def condor_q_availability(): + # check the condor_q + process_avail = 0 + try: + result = subprocess.run( + ["condor_q"], + timeout=10, # Timeout in seconds + capture_output=True, + text=True + ) + print(f"command output: {result.stdout}") + process_avail = 100 + except subprocess.TimeoutExpired: + print("The command timed out!") + process_avail = 0 + + print("condor_q process check availability: %s" % process_avail) + return process_avail + + +def main(): + uwsgi_avail, condor_avail, condor_q_avail = 0, 0, 0 + try: + uwsgi_avail = uwsgi_process_availability() + condor_avail = condor_process_availability() + condor_q_avail = condor_q_availability() + except Exception as ex: + print(f"failed to check availability: {ex}") + + print(f"uwsgi_avail: {uwsgi_avail}, condor_avail: {condor_avail}, condor_q_avail: {condor_q_avail}") + + health_monitor_file = "/var/log/panda/harvester_healthy" + if uwsgi_avail and condor_avail and condor_q_avail: + with open(health_monitor_file, 'w') as f: + f.write("OK") + else: + if os.path.exists(health_monitor_file): + os.remove(health_monitor_file) + + +if __name__ == '__main__': + main() diff --git a/helm/harvester/charts/harvester/sandbox/health_monitor.sh b/helm/harvester/charts/harvester/sandbox/health_monitor.sh new file mode 100644 index 00000000..d365f669 --- /dev/null +++ b/helm/harvester/charts/harvester/sandbox/health_monitor.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +source /opt/harvester/bin/activate +source /data/condor/condor/condor.sh + +python /data/harvester/health_monitor.py diff --git a/helm/harvester/charts/harvester/sandbox/lsst.init-harvester b/helm/harvester/charts/harvester/sandbox/lsst.init-harvester index e303e276..491ed12a 100644 --- a/helm/harvester/charts/harvester/sandbox/lsst.init-harvester +++ b/helm/harvester/charts/harvester/sandbox/lsst.init-harvester @@ -22,25 +22,25 @@ yes|cp -fr etc/grid-security/vomsdir/lsst/* /etc/grid-security/vomsdir/lsst/ cd /data/harvester # gcloud config -cp /opt/harvester/etc/auth/gcloud_config.tar.gz /data/harvester -tar xzf gcloud_config.tar.gz -chmod 777 -R /data/harvester/gcloud_config +# cp /opt/harvester/etc/auth/gcloud_config.tar.gz /data/harvester +# tar xzf gcloud_config.tar.gz +# chmod 777 -R /data/harvester/gcloud_config # k8s config -cp /opt/harvester/etc/auth/k8s.tar.gz /data/harvester -tar xzf k8s.tar.gz -chmod 777 -R /data/harvester/k8s - -export CLOUDSDK_CONFIG=/data/harvester/gcloud_config -export KUBECONFIG=/data/harvester/gcloud_config/.kube - -mkdir -p /data/harvester/gcloud_config_rubin/ -for queue in moderatemem highmem extra-highmem merge highmem-non-preempt developmentcluster extra-highmem-non-preempt -do - export KUBECONFIG=/data/harvester/gcloud_config_rubin/$queue - gcloud container clusters get-credentials --region=us-central1 $queue - chmod og+rw $KUBECONFIG -done - -export KUBECONFIG=/data/harvester/gcloud_config/.kube +# cp /opt/harvester/etc/auth/k8s.tar.gz /data/harvester +# tar xzf k8s.tar.gz +# chmod 777 -R /data/harvester/k8s + +# export CLOUDSDK_CONFIG=/data/harvester/gcloud_config +# export KUBECONFIG=/data/harvester/gcloud_config/.kube + +# mkdir -p /data/harvester/gcloud_config_rubin/ +# for queue in moderatemem highmem extra-highmem merge highmem-non-preempt developmentcluster extra-highmem-non-preempt +# do +# export KUBECONFIG=/data/harvester/gcloud_config_rubin/$queue +# gcloud container clusters get-credentials --region=us-central1 $queue +# chmod og+rw $KUBECONFIG +# done + +# export KUBECONFIG=/data/harvester/gcloud_config/.kube diff --git a/helm/harvester/charts/harvester/sandbox/lsst.rubin-srun.sh b/helm/harvester/charts/harvester/sandbox/lsst.rubin-srun.sh index 51602a36..91b4c77b 100644 --- a/helm/harvester/charts/harvester/sandbox/lsst.rubin-srun.sh +++ b/helm/harvester/charts/harvester/sandbox/lsst.rubin-srun.sh @@ -84,8 +84,15 @@ fi # env echo -# cmd="$cmd --export=ALL /cvmfs/sw.lsst.eu/linux-x86_64/panda_env/v1.0.9/pilot/wrapper/rubin-wrapper.sh $@" -cmd="$cmd --export=ALL ${latest}/pilot/wrapper/rubin-wrapper.sh $@" + +# check if there is a local dev pilot +pilot_wrapper_local=/sdf/data/rubin/panda_jobs/panda_env_pilot/pilot_wrapper/rubin-wrapper.sh +if [[ -f ${pilot_wrapper_local} ]]; then + cmd="$cmd --export=ALL ${pilot_wrapper_local} $@" +else + # cmd="$cmd --export=ALL /cvmfs/sw.lsst.eu/linux-x86_64/panda_env/v1.0.9/pilot/wrapper/rubin-wrapper.sh $@" + cmd="$cmd --export=ALL ${latest}/pilot/wrapper/rubin-wrapper.sh $@" +fi echo $cmd ntasks=${ntasks_total} diff --git a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot.sdf b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot.sdf index f880a346..ffaf7644 100644 --- a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot.sdf +++ b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot.sdf @@ -5,9 +5,9 @@ executable = /opt/harvester/sandbox/lsst.rubin-wrapper.sh # arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -w generic --pilot-user rubin --url https://pandaserver-doma.cern.ch -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://pandaserver-doma.cern.ch:25080/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy" -# arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t -w generic --pilot-user rubin --allow-same-user false --url https://panda-dev-server.slac.stanford.edu -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://pandaserver-doma.cern.ch:25080/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-PRE.tar.gz --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog" +arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://panda-dev-server.slac.stanford.edu -d --harvester-submit-mode PUSH --queuedata-url http://pandaserver-doma.cern.ch:25080/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-PRE.tar.gz --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog" -arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-PRE.tar.gz --use-realtime-logging --realtime-logging-server loki --realtime-logname Panda-RubinLog" +# arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-PRE.tar.gz --use-realtime-logging --realtime-logging-server logserver='loki;https://sdfloki.slac.stanford.edu:80' --realtime-logname Panda-RubinLog" initialdir = {accessPoint} diff --git a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull.sdf b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull.sdf index af516c21..331b06b5 100644 --- a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull.sdf +++ b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull.sdf @@ -8,9 +8,9 @@ executable = /opt/harvester/sandbox/lsst.rubin-wrapper.sh # arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t -w generic --pilot-user rubin --url https://panda-dev-server.slac.stanford.edu -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://pandaserver-doma.cern.ch:25080/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-PRE.tar.gz" -# arguments = "--pilotnum {nCoreTotal} -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL {pilotResourceTypeOption} --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" +# arguments = "--pilotnum {nCoreTotal} -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" -arguments = " -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server loki --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" +arguments = " -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server logserver='loki;https://sdfloki.slac.stanford.edu:80' --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" initialdir = {accessPoint} diff --git a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull_srun.sdf b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull_srun.sdf index 0d0aab45..ff783f4b 100644 --- a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull_srun.sdf +++ b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull_srun.sdf @@ -1,6 +1,8 @@ executable = /opt/harvester/sandbox/lsst.rubin-srun.sh -arguments = "--ntasks-total {nCoreTotal} --ntasks 1 --cpus-per-task 1 --mem-per-cpu {requestRamPerCore} -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server loki --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" +arguments = "--ntasks-total {nCoreTotal} --ntasks 1 --cpus-per-task 1 --mem-per-cpu {requestRamPerCore} -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" + +# arguments = "--ntasks-total {nCoreTotal} --ntasks 1 --cpus-per-task 1 --mem-per-cpu {requestRamPerCore} -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server logserver='loki;https://sdfloki.slac.stanford.edu:80' --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" initialdir = {accessPoint} diff --git a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push.sdf b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push.sdf index b4db94af..7f18d2e7 100644 --- a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push.sdf +++ b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push.sdf @@ -7,9 +7,9 @@ executable = /opt/harvester/sandbox/lsst.rubin-wrapper.sh # arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t -w generic --pilot-user rubin --url https://panda-dev-server.slac.stanford.edu -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://pandaserver-doma.cern.ch:25080/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-PRE.tar.gz" -# arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" +arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" -arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server loki --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" +# arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server logserver='loki;https://sdfloki.slac.stanford.edu:80' --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" initialdir = {accessPoint} diff --git a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push_96.sdf b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push_96.sdf index abeb6a99..bb3657b3 100644 --- a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push_96.sdf +++ b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push_96.sdf @@ -7,9 +7,9 @@ executable = /opt/harvester/sandbox/lsst.rubin-wrapper.sh # arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t -w generic --pilot-user rubin --url https://panda-dev-server.slac.stanford.edu -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://pandaserver-doma.cern.ch:25080/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-PRE.tar.gz" -# arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" +arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" -arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server loki --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" +# arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server logserver='loki;https://sdfloki.slac.stanford.edu:80' --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" initialdir = {accessPoint} diff --git a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_slac.sdf b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_slac.sdf index 068b67f7..ee66630a 100644 --- a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_slac.sdf +++ b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_slac.sdf @@ -10,7 +10,7 @@ executable = /opt/harvester/sandbox/lsst.rubin-wrapper_slac.sh # arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t -w generic --pilot-user rubin --allow-same-user false --url https://panda-dev-server.slac.stanford.edu -d --harvester-submit-mode PULL {pilotResourceTypeOption} --queuedata-url http://pandaserver-doma.cern.ch:25080/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-dev.tar.gz" -arguments = "--pilotnum {nCoreTotal} -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logging-server loki --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" +arguments = "--pilotnum {nCoreTotal} -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" initialdir = {accessPoint} diff --git a/helm/harvester/charts/harvester/sandbox/lsst_prod.init-harvester b/helm/harvester/charts/harvester/sandbox/lsst_prod.init-harvester index e303e276..491ed12a 100644 --- a/helm/harvester/charts/harvester/sandbox/lsst_prod.init-harvester +++ b/helm/harvester/charts/harvester/sandbox/lsst_prod.init-harvester @@ -22,25 +22,25 @@ yes|cp -fr etc/grid-security/vomsdir/lsst/* /etc/grid-security/vomsdir/lsst/ cd /data/harvester # gcloud config -cp /opt/harvester/etc/auth/gcloud_config.tar.gz /data/harvester -tar xzf gcloud_config.tar.gz -chmod 777 -R /data/harvester/gcloud_config +# cp /opt/harvester/etc/auth/gcloud_config.tar.gz /data/harvester +# tar xzf gcloud_config.tar.gz +# chmod 777 -R /data/harvester/gcloud_config # k8s config -cp /opt/harvester/etc/auth/k8s.tar.gz /data/harvester -tar xzf k8s.tar.gz -chmod 777 -R /data/harvester/k8s - -export CLOUDSDK_CONFIG=/data/harvester/gcloud_config -export KUBECONFIG=/data/harvester/gcloud_config/.kube - -mkdir -p /data/harvester/gcloud_config_rubin/ -for queue in moderatemem highmem extra-highmem merge highmem-non-preempt developmentcluster extra-highmem-non-preempt -do - export KUBECONFIG=/data/harvester/gcloud_config_rubin/$queue - gcloud container clusters get-credentials --region=us-central1 $queue - chmod og+rw $KUBECONFIG -done - -export KUBECONFIG=/data/harvester/gcloud_config/.kube +# cp /opt/harvester/etc/auth/k8s.tar.gz /data/harvester +# tar xzf k8s.tar.gz +# chmod 777 -R /data/harvester/k8s + +# export CLOUDSDK_CONFIG=/data/harvester/gcloud_config +# export KUBECONFIG=/data/harvester/gcloud_config/.kube + +# mkdir -p /data/harvester/gcloud_config_rubin/ +# for queue in moderatemem highmem extra-highmem merge highmem-non-preempt developmentcluster extra-highmem-non-preempt +# do +# export KUBECONFIG=/data/harvester/gcloud_config_rubin/$queue +# gcloud container clusters get-credentials --region=us-central1 $queue +# chmod og+rw $KUBECONFIG +# done + +# export KUBECONFIG=/data/harvester/gcloud_config/.kube diff --git a/helm/harvester/charts/harvester/sandbox/rubin-voms.config.tar b/helm/harvester/charts/harvester/sandbox/rubin-voms.config.tar index 561fc8d0..72053ffb 100644 Binary files a/helm/harvester/charts/harvester/sandbox/rubin-voms.config.tar and b/helm/harvester/charts/harvester/sandbox/rubin-voms.config.tar differ diff --git a/helm/harvester/charts/harvester/sandbox/run-harvester-crons b/helm/harvester/charts/harvester/sandbox/run-harvester-crons index 09121868..89e38722 100644 --- a/helm/harvester/charts/harvester/sandbox/run-harvester-crons +++ b/helm/harvester/charts/harvester/sandbox/run-harvester-crons @@ -6,6 +6,10 @@ while true; do sleep 36000; /opt/harvester/bin/panda_common-install_igtf_ca > /v # log rotate while true; do /usr/sbin/logrotate /data/harvester/logrotate-harvester >> /var/log/panda/logrotate.log 2>&1; sleep 3600; done & +# health monitor + +while true; do bash /data/harvester/health_monitor.sh >> /var/log/panda/health_monitor.log 2>&1; sleep 600; done & + # experiment specific if [[ ! -z "${EXPERIMENT}" ]]; then CurrentDir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" diff --git a/helm/harvester/charts/harvester/templates/statefulset.yaml b/helm/harvester/charts/harvester/templates/statefulset.yaml index dea19458..e787741f 100644 --- a/helm/harvester/charts/harvester/templates/statefulset.yaml +++ b/helm/harvester/charts/harvester/templates/statefulset.yaml @@ -180,6 +180,13 @@ spec: {{- end }} image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" imagePullPolicy: {{ .Values.image.pullPolicy }} + livenessProbe: + exec: + command: + - cat + - /var/log/panda/harvester_healthy + initialDelaySeconds: 1800 + periodSeconds: 600 {{- if .Values.autoStart }} command: ["/bin/sh", "-c"] args: diff --git a/helm/harvester/charts/mariadb/templates/statefulset.yaml b/helm/harvester/charts/mariadb/templates/statefulset.yaml index 96474656..ab30fd05 100644 --- a/helm/harvester/charts/mariadb/templates/statefulset.yaml +++ b/helm/harvester/charts/mariadb/templates/statefulset.yaml @@ -47,6 +47,7 @@ spec: {{- if not .Values.autoscaling.enabled }} replicas: {{ .Values.replicaCount }} {{- end }} + serviceName: {{ include "mariadb.fullname" . }} selector: matchLabels: {{- include "mariadb.selectorLabels" . | nindent 6 }} diff --git a/helm/harvester/values.yaml b/helm/harvester/values.yaml index 8e29a75c..c4b881f8 100644 --- a/helm/harvester/values.yaml +++ b/helm/harvester/values.yaml @@ -10,7 +10,7 @@ harvester: enabled: true # container image and tag image: - tag: "v0.5.6" + tag: "v0.5.14" # tag: "master" # PV with selector support diff --git a/helm/harvester/values/values-lsst-prod.yaml b/helm/harvester/values/values-lsst-prod.yaml index b965de72..775d7161 100644 --- a/helm/harvester/values/values-lsst-prod.yaml +++ b/helm/harvester/values/values-lsst-prod.yaml @@ -22,7 +22,7 @@ harvester: mount: true class: sdf-data-rubin path: "/mnt/harvester-data" - logpath: "/mnt/harvester-data/panda_jobs/harvester_workdir_k8s/harvester_wdirs/SLAC_Harvester_SDF" + logpath: "/mnt/harvester-data/panda_jobs/harvester_workdir_k8s/harvester_wdirs/SLAC_Harvester_S3DF" size: 5Gi experiment: "lsst_prod" diff --git a/helm/harvester/values/values-lsst.yaml b/helm/harvester/values/values-lsst.yaml index a8f22075..90997783 100644 --- a/helm/harvester/values/values-lsst.yaml +++ b/helm/harvester/values/values-lsst.yaml @@ -21,7 +21,7 @@ harvester: mount: true class: sdf-data-rubin path: "/mnt/harvester-data" - logpath: "/mnt/harvester-data/panda_jobs/harvester_workdir_k8s/harvester_wdirs/SLAC_Harvester_SDF" + logpath: "/mnt/harvester-data/panda_jobs/harvester_workdir_k8s/harvester_wdirs/SLAC_Harvester_S3DF" size: 5Gi experiment: "lsst" diff --git a/helm/idds/charts/rest/idds_configmap.json b/helm/idds/charts/rest/idds_configmap.json index 354ff7a5..487c45e5 100644 --- a/helm/idds/charts/rest/idds_configmap.json +++ b/helm/idds/charts/rest/idds_configmap.json @@ -27,7 +27,8 @@ "username": "${IDDS_RECEIVER_USERNAME}", "password": "${IDDS_RECEIVER_PASSWORD}", "broker_timeout": 600}}, - "domapandawork.poll_panda_jobs_chunk_size": 2000 + "domapandawork.poll_panda_jobs_chunk_size": 2000, + "domapandawork.site_to_cloud": "SLAC:US,LANCS:EU,CC-IN2P3:EU,RAL:EU" }, "conductor": {"threshold_to_release_messages": 1000, diff --git a/helm/idds/charts/rest/templates/statefulset.yaml b/helm/idds/charts/rest/templates/statefulset.yaml index 1160f42a..664bbeb6 100644 --- a/helm/idds/charts/rest/templates/statefulset.yaml +++ b/helm/idds/charts/rest/templates/statefulset.yaml @@ -85,6 +85,13 @@ spec: runuser -u atlpan -g zp -- /opt/idds/bin/start-daemon.sh all {{- end}} {{ end -}} + livenessProbe: + exec: + command: + - cat + - /var/log/idds/idds_health + initialDelaySeconds: 600 + periodSeconds: 600 ports: - name: https containerPort: 8443 diff --git a/helm/idds/values.yaml b/helm/idds/values.yaml index fb5c3302..8a384995 100644 --- a/helm/idds/values.yaml +++ b/helm/idds/values.yaml @@ -8,7 +8,7 @@ global: rest: enabled: true image: - tag: "2.1.30" + tag: "2.2.7" resources: limits: diff --git a/helm/panda/charts/jedi/sandbox/run-jedi-crons b/helm/panda/charts/jedi/sandbox/run-jedi-crons index 2dad5de1..3e7157d9 100644 --- a/helm/panda/charts/jedi/sandbox/run-jedi-crons +++ b/helm/panda/charts/jedi/sandbox/run-jedi-crons @@ -2,7 +2,7 @@ tmpExe=/data/panda/run-jedi-crons-tmp-exe cat <> ${tmpExe} while true; do sleep 36000; /opt/panda/bin/panda_common-install_igtf_ca > /var/log/panda/install_igtf_ca.log 2>&1; done & -while true; do /usr/sbin/logrotate /data/panda/logrotate-jedi >> /var/log/panda/logrotate.log 2>&1; sleep 3600; done & +while true; do /usr/sbin/logrotate /data/panda/logrotate-jedi >> /var/log/panda/logrotate.log 2>&1; sleep 86400; done & EOT chmod +x ${tmpExe} diff --git a/helm/panda/charts/server/panda_server_config.json b/helm/panda/charts/server/panda_server_config.json index ce0502be..3ae03414 100644 --- a/helm/panda/charts/server/panda_server_config.json +++ b/helm/panda/charts/server/panda_server_config.json @@ -5,7 +5,8 @@ "CRIC_URL_DDMENDPOINTS": "$PANDA_CRIC_DDMENDPOINTS", "CRIC_URL_SCHEDCONFIG": "$PANDA_CRIC_SCHEDCONFIG", "CRIC_URL_SITES": "$PANDA_CRIC_SITES", - "adder_plugins": "wlcg:dataservice.AdderDummyPlugin:AdderDummyPlugin", + "CRIC_URL_TAGS": "$PANDA_CRIC_URLTAGS", + "RUCIO_RSE_USAGE": "/opt/panda/sandbox/rucio_rse_usage.json", "backend": "postgres", "schemaPANDA": "$PANDA_DB_SCHEMAPANDA", "schemaMETA": "$PANDA_DB_SCHEMAMETA", @@ -22,7 +23,7 @@ "dbpasswd": "$PANDA_DB_PASSWORD", "dbuser": "$PANDA_DB_USER", "dbname": "$PANDA_DB_NAME", - "adder_plugins": "wlcg:dataservice.adder_simple_plugin:AdderDummyPlugin,wlcg:dataservice.adder_simple_plugin:AdderSimplePlugin:sphenix", + "adder_plugins": "wlcg:dataservice.adder_dummy_plugin:AdderDummyPlugin,wlcg:dataservice.adder_simple_plugin:AdderSimplePlugin:sphenix", "setupper_plugins": "wlcg:dataservice.setupper_dummy_plugin:SetupperDummyPlugin", "token_authType": "oidc", "sandboxHostname": "$PANDA_HOSTNAME", diff --git a/helm/panda/charts/server/sandbox/run-panda-crons b/helm/panda/charts/server/sandbox/run-panda-crons index 3f29e076..10086c55 100644 --- a/helm/panda/charts/server/sandbox/run-panda-crons +++ b/helm/panda/charts/server/sandbox/run-panda-crons @@ -3,7 +3,7 @@ tmpExe=/data/panda/run-panda-crons-tmp-exe cat <> ${tmpExe} while true; do sleep 36000; /opt/panda/bin/panda_common-install_igtf_ca > /var/log/panda/install_igtf_ca.log 2>&1; done & while true; do /opt/cacheschedconfig/bin/cacheSC.sh >> /var/log/panda/cacheSC.out 2>&1; sleep 60; done & -while true; do /usr/sbin/logrotate /data/panda/logrotate-panda >> /var/log/panda/logrotate.log 2>&1; sleep 3600; done & +while true; do /usr/sbin/logrotate /data/panda/logrotate-panda >> /var/log/panda/logrotate.log 2>&1; sleep 86400; done & EOT chmod +x ${tmpExe} diff --git a/helm/panda/sandbox/rucio_rse_usage.json b/helm/panda/sandbox/rucio_rse_usage.json new file mode 100644 index 00000000..935b8819 --- /dev/null +++ b/helm/panda/sandbox/rucio_rse_usage.json @@ -0,0 +1 @@ +{"null": "null"} diff --git a/helm/panda/values.yaml b/helm/panda/values.yaml index a9121693..ebc0349f 100644 --- a/helm/panda/values.yaml +++ b/helm/panda/values.yaml @@ -11,7 +11,7 @@ jedi: # container image and tag image: - tag: "0.4.3" + tag: "0.4.5" # tag: "master" # PV with selector support @@ -32,7 +32,7 @@ server: # container image and tag image: - tag: "0.3.20" + tag: "0.4.2" # tag: "master" # PV with selector support