Skip to content

Commit

Permalink
Merge pull request #128 from wguanicedew/main
Browse files Browse the repository at this point in the history
upgrade panda system to new stable version
  • Loading branch information
tmaeno authored Nov 22, 2024
2 parents da8f4b6 + 65a14c2 commit 771ab8b
Show file tree
Hide file tree
Showing 30 changed files with 232 additions and 73 deletions.
2 changes: 1 addition & 1 deletion helm/bigmon/charts/main/sandbox/local.py.template
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ dbaccess_postgres = {
}

# Oracle or Postgres
dbaccess = ${BIGMON_DB_ACCESS}
dbaccess = '${BIGMON_DB_ACCESS}'

#object store
OBJECT_STORE = {
Expand Down
2 changes: 1 addition & 1 deletion helm/bigmon/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ main:
enabled: true

image:
tag: "v0.6.17"
tag: "v0.6.25"

autoStart: true

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1014,7 +1014,7 @@
"submitter": {
"nCore": 1,
"nCorePerNode": 1,
"noPilotsWhenNoActiveJobs": false,
"noPilotsWhenNoActiveJobs": true,
"ceQueueName": "EL9",
"ceARCGridType": "arc",
"ceHostname": ["arc-ce01.gridpp.rl.ac.uk", "arc-ce02.gridpp.rl.ac.uk"],
Expand Down Expand Up @@ -1042,7 +1042,7 @@
"submitter": {
"nCore": 1,
"nCorePerNode": 1,
"noPilotsWhenNoActiveJobs": false,
"noPilotsWhenNoActiveJobs": true,
"submitMode": "PULL",
"ceQueueName": "EL9",
"ceARCGridType": "arc",
Expand Down Expand Up @@ -1071,7 +1071,7 @@
"submitter": {
"nCore": 1,
"nCorePerNode": 1,
"noPilotsWhenNoActiveJobs": false,
"noPilotsWhenNoActiveJobs": true,
"submitMode": "PULL",
"ceQueueName": "EL9",
"ceARCGridType": "arc",
Expand Down Expand Up @@ -1100,7 +1100,7 @@
"submitter": {
"nCore": 1,
"nCorePerNode": 1,
"noPilotsWhenNoActiveJobs": false,
"noPilotsWhenNoActiveJobs": true,
"submitMode": "PULL",
"ceQueueName": "EL9",
"ceARCGridType": "arc",
Expand Down Expand Up @@ -1129,7 +1129,7 @@
"submitter": {
"nCore": 1,
"nCorePerNode": 1,
"noPilotsWhenNoActiveJobs": false,
"noPilotsWhenNoActiveJobs": true,
"submitMode": "PULL",
"ceQueueName": "EL9",
"ceARCGridType": "arc",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1012,7 +1012,7 @@
"submitter": {
"nCore": 1,
"nCorePerNode": 1,
"noPilotsWhenNoActiveJobs": false,
"noPilotsWhenNoActiveJobs": true,
"ceQueueName": "EL9",
"ceARCGridType": "arc",
"ceHostname": ["arc-ce01.gridpp.rl.ac.uk", "arc-ce02.gridpp.rl.ac.uk"],
Expand Down Expand Up @@ -1040,7 +1040,7 @@
"submitter": {
"nCore": 1,
"nCorePerNode": 1,
"noPilotsWhenNoActiveJobs": false,
"noPilotsWhenNoActiveJobs": true,
"submitMode": "PULL",
"ceQueueName": "EL9",
"ceARCGridType": "arc",
Expand Down Expand Up @@ -1069,7 +1069,7 @@
"submitter": {
"nCore": 1,
"nCorePerNode": 1,
"noPilotsWhenNoActiveJobs": false,
"noPilotsWhenNoActiveJobs": true,
"submitMode": "PULL",
"ceQueueName": "EL9",
"ceARCGridType": "arc",
Expand Down Expand Up @@ -1098,7 +1098,7 @@
"submitter": {
"nCore": 1,
"nCorePerNode": 1,
"noPilotsWhenNoActiveJobs": false,
"noPilotsWhenNoActiveJobs": true,
"submitMode": "PULL",
"ceQueueName": "EL9",
"ceARCGridType": "arc",
Expand Down Expand Up @@ -1127,7 +1127,7 @@
"submitter": {
"nCore": 1,
"nCorePerNode": 1,
"noPilotsWhenNoActiveJobs": false,
"noPilotsWhenNoActiveJobs": true,
"submitMode": "PULL",
"ceQueueName": "EL9",
"ceARCGridType": "arc",
Expand Down
122 changes: 122 additions & 0 deletions helm/harvester/charts/harvester/sandbox/health_monitor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
#!/usr/bin/env python

"""
check harvester health
"""

import os
import re
import subprocess


def check_command(command, check_string):
    """Run *command* and return 100 if *check_string* matches its stdout, else 0.

    Parameters:
        command (str): whitespace-separated command line; executed without a
            shell (argv list), so no shell quoting/expansion is performed.
        check_string (str): regular expression searched for in stdout.

    Returns:
        int: 100 when the pattern is found, 0 otherwise.
    """
    print("Checking command : {0}".format(command))
    print("For string : {0}".format(check_string))

    tmp_array = command.split()
    # subprocess.run waits for and reaps the child (the previous
    # Popen(...).communicate() pattern never closed the Popen object).
    result = subprocess.run(tmp_array, stdout=subprocess.PIPE)
    # errors="replace" so unexpected non-ASCII bytes in the output cannot
    # raise UnicodeDecodeError and crash the health probe.
    output = result.stdout.decode("ascii", errors="replace")

    if re.search(check_string, output):
        print("Found the string, return 100")
        return 100
    else:
        print("String not found, return 0")
        return 0


def uwsgi_process_availability():
    """Return 100 if at least one uwsgi process is running, else 0."""
    ps_cmd = "ps -eo pgid,args | grep uwsgi | grep -v grep"
    proc = subprocess.Popen(ps_cmd, stdout=subprocess.PIPE, shell=True)
    listing = proc.communicate()[0].decode("ascii")

    # Any non-blank line left after the grep filters means a live process.
    found = any(line.strip() for line in listing.split("\n"))
    process_avail = 100 if found else 0

    print("uwsgi process check availability: %s" % process_avail)
    return process_avail


def condor_process_availability():
    """Return 100 if at least one condor_schedd process is running, else 0."""
    ps_cmd = "ps -eo pgid,args | grep condor_schedd | grep -v grep"
    proc = subprocess.Popen(ps_cmd, stdout=subprocess.PIPE, shell=True)
    listing = proc.communicate()[0].decode("ascii")

    # Any non-blank line left after the grep filters means a live process.
    found = any(line.strip() for line in listing.split("\n"))
    process_avail = 100 if found else 0

    print("condor_q process check availability: %s" % process_avail)
    return process_avail


def condor_q_availability():
    """Return 100 if ``condor_q`` completes within 10 seconds, else 0.

    A missing ``condor_q`` binary (or any other OS-level failure to launch
    it) is reported as unavailable instead of raising — previously only
    TimeoutExpired was caught, so FileNotFoundError escaped this function.
    """
    process_avail = 0
    try:
        result = subprocess.run(
            ["condor_q"],
            timeout=10,  # Timeout in seconds; a hung schedd must not block the probe
            capture_output=True,
            text=True
        )
        print(f"command output: {result.stdout}")
        process_avail = 100
    except subprocess.TimeoutExpired:
        print("The command timed out!")
        process_avail = 0
    except OSError as ex:
        # e.g. FileNotFoundError when condor is not installed in this container
        print(f"failed to run condor_q: {ex}")
        process_avail = 0

    print("condor_q process check availability: %s" % process_avail)
    return process_avail


def main():
    """Run all health probes and maintain the healthy-marker file.

    The marker file is (re)written only when every probe reports available;
    otherwise any existing marker is removed so external checks fail.
    """
    uwsgi_avail = condor_avail = condor_q_avail = 0
    try:
        uwsgi_avail = uwsgi_process_availability()
        condor_avail = condor_process_availability()
        condor_q_avail = condor_q_availability()
    except Exception as ex:
        # best-effort: a broken probe must not crash the monitor itself
        print(f"failed to check availability: {ex}")

    print(f"uwsgi_avail: {uwsgi_avail}, condor_avail: {condor_avail}, condor_q_avail: {condor_q_avail}")

    health_monitor_file = "/var/log/panda/harvester_healthy"
    if uwsgi_avail and condor_avail and condor_q_avail:
        with open(health_monitor_file, 'w') as f:
            f.write("OK")
    elif os.path.exists(health_monitor_file):
        os.remove(health_monitor_file)


if __name__ == '__main__':
    main()
6 changes: 6 additions & 0 deletions helm/harvester/charts/harvester/sandbox/health_monitor.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash

# Wrapper for the harvester health check: activate the harvester Python
# environment and the HTCondor environment, then run the monitor script.

source /opt/harvester/bin/activate
source /data/condor/condor/condor.sh

python /data/harvester/health_monitor.py
38 changes: 19 additions & 19 deletions helm/harvester/charts/harvester/sandbox/lsst.init-harvester
Original file line number Diff line number Diff line change
Expand Up @@ -22,25 +22,25 @@ yes|cp -fr etc/grid-security/vomsdir/lsst/* /etc/grid-security/vomsdir/lsst/
cd /data/harvester

# gcloud config
cp /opt/harvester/etc/auth/gcloud_config.tar.gz /data/harvester
tar xzf gcloud_config.tar.gz
chmod 777 -R /data/harvester/gcloud_config
# cp /opt/harvester/etc/auth/gcloud_config.tar.gz /data/harvester
# tar xzf gcloud_config.tar.gz
# chmod 777 -R /data/harvester/gcloud_config

# k8s config
cp /opt/harvester/etc/auth/k8s.tar.gz /data/harvester
tar xzf k8s.tar.gz
chmod 777 -R /data/harvester/k8s

export CLOUDSDK_CONFIG=/data/harvester/gcloud_config
export KUBECONFIG=/data/harvester/gcloud_config/.kube

mkdir -p /data/harvester/gcloud_config_rubin/
for queue in moderatemem highmem extra-highmem merge highmem-non-preempt developmentcluster extra-highmem-non-preempt
do
export KUBECONFIG=/data/harvester/gcloud_config_rubin/$queue
gcloud container clusters get-credentials --region=us-central1 $queue
chmod og+rw $KUBECONFIG
done

export KUBECONFIG=/data/harvester/gcloud_config/.kube
# cp /opt/harvester/etc/auth/k8s.tar.gz /data/harvester
# tar xzf k8s.tar.gz
# chmod 777 -R /data/harvester/k8s

# export CLOUDSDK_CONFIG=/data/harvester/gcloud_config
# export KUBECONFIG=/data/harvester/gcloud_config/.kube

# mkdir -p /data/harvester/gcloud_config_rubin/
# for queue in moderatemem highmem extra-highmem merge highmem-non-preempt developmentcluster extra-highmem-non-preempt
# do
# export KUBECONFIG=/data/harvester/gcloud_config_rubin/$queue
# gcloud container clusters get-credentials --region=us-central1 $queue
# chmod og+rw $KUBECONFIG
# done

# export KUBECONFIG=/data/harvester/gcloud_config/.kube

11 changes: 9 additions & 2 deletions helm/harvester/charts/harvester/sandbox/lsst.rubin-srun.sh
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,15 @@ fi
# env

echo
# cmd="$cmd --export=ALL /cvmfs/sw.lsst.eu/linux-x86_64/panda_env/v1.0.9/pilot/wrapper/rubin-wrapper.sh $@"
cmd="$cmd --export=ALL ${latest}/pilot/wrapper/rubin-wrapper.sh $@"

# check if there is a local dev pilot
pilot_wrapper_local=/sdf/data/rubin/panda_jobs/panda_env_pilot/pilot_wrapper/rubin-wrapper.sh
if [[ -f ${pilot_wrapper_local} ]]; then
cmd="$cmd --export=ALL ${pilot_wrapper_local} $@"
else
# cmd="$cmd --export=ALL /cvmfs/sw.lsst.eu/linux-x86_64/panda_env/v1.0.9/pilot/wrapper/rubin-wrapper.sh $@"
cmd="$cmd --export=ALL ${latest}/pilot/wrapper/rubin-wrapper.sh $@"
fi
echo $cmd

ntasks=${ntasks_total}
Expand Down
4 changes: 2 additions & 2 deletions helm/harvester/charts/harvester/sandbox/lsst.submit_pilot.sdf
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ executable = /opt/harvester/sandbox/lsst.rubin-wrapper.sh

# arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -w generic --pilot-user rubin --url https://pandaserver-doma.cern.ch -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://pandaserver-doma.cern.ch:25080/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy"

# arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t -w generic --pilot-user rubin --allow-same-user false --url https://panda-dev-server.slac.stanford.edu -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://pandaserver-doma.cern.ch:25080/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-PRE.tar.gz --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog"
arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://panda-dev-server.slac.stanford.edu -d --harvester-submit-mode PUSH --queuedata-url http://pandaserver-doma.cern.ch:25080/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-PRE.tar.gz --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog"

arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-PRE.tar.gz --use-realtime-logging --realtime-logging-server loki --realtime-logname Panda-RubinLog"
# arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-PRE.tar.gz --use-realtime-logging --realtime-logging-server logserver='loki;https://sdfloki.slac.stanford.edu:80' --realtime-logname Panda-RubinLog"

initialdir = {accessPoint}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ executable = /opt/harvester/sandbox/lsst.rubin-wrapper.sh

# arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t -w generic --pilot-user rubin --url https://panda-dev-server.slac.stanford.edu -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://pandaserver-doma.cern.ch:25080/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-PRE.tar.gz"

# arguments = "--pilotnum {nCoreTotal} -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL {pilotResourceTypeOption} --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy"
# arguments = "--pilotnum {nCoreTotal} -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy"

arguments = " -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server loki --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy"
arguments = " -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server logserver='loki;https://sdfloki.slac.stanford.edu:80' --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy"

initialdir = {accessPoint}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
executable = /opt/harvester/sandbox/lsst.rubin-srun.sh

arguments = "--ntasks-total {nCoreTotal} --ntasks 1 --cpus-per-task 1 --mem-per-cpu {requestRamPerCore} -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server loki --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy"
arguments = "--ntasks-total {nCoreTotal} --ntasks 1 --cpus-per-task 1 --mem-per-cpu {requestRamPerCore} -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy"

# arguments = "--ntasks-total {nCoreTotal} --ntasks 1 --cpus-per-task 1 --mem-per-cpu {requestRamPerCore} -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server logserver='loki;https://sdfloki.slac.stanford.edu:80' --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy"

initialdir = {accessPoint}

Expand Down
Loading

0 comments on commit 771ab8b

Please sign in to comment.