feat: use user proxy when interacting with JobWrapper
aldbr committed Aug 21, 2024
1 parent ff08839 commit 02fe4f2
Showing 5 changed files with 144 additions and 122 deletions.
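
The core of this change: the new preProcessJob and postProcessJob methods in PushJobAgent are decorated with DIRAC's executeWithUserProxy and called with the payload owner's credentials, while the pilot proxy is kept for interactions with the computing element only. A minimal sketch of that calling pattern follows; the SomeAgent class and the argument values are placeholders rather than code from this commit, and running it requires a configured DIRAC client able to retrieve the requested proxy.

from DIRAC import S_OK
from DIRAC.Core.Utilities.Proxy import executeWithUserProxy


class SomeAgent:
    @executeWithUserProxy
    def preProcessJob(self, job):
        # the body runs with X509_USER_PROXY pointing at the proxy that the
        # decorator retrieved for the caller-supplied user name and group
        return S_OK(f"pre-processed {job}")


# executeWithUserProxy strips proxyUserName/proxyUserGroup from the keyword
# arguments, obtains that user's proxy, and only then invokes the method
result = SomeAgent().preProcessJob("job_123", proxyUserName="some_user", proxyUserGroup="some_group")
if not result["OK"]:
    print(result["Message"])
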
219 changes: 123 additions & 96 deletions src/DIRAC/WorkloadManagementSystem/Agent/PushJobAgent.py
@@ -26,6 +26,7 @@
from DIRAC.ConfigurationSystem.Client.Helpers.Resources import getQueues
from DIRAC.Core.Utilities import DErrno
from DIRAC.Core.Utilities.ObjectLoader import ObjectLoader
from DIRAC.Core.Utilities.Proxy import executeWithUserProxy
from DIRAC.Core.Utilities.Version import getVersion
from DIRAC.FrameworkSystem.Client.ProxyManagerClient import gProxyManager
from DIRAC.Resources.Computing import ComputingElement
@@ -106,9 +107,10 @@ def initialize(self):
if self.submissionPolicy not in ["Application", "JobWrapper"]:
return S_ERROR("SubmissionPolicy must be either Application or JobWrapper")

result = self._initializeComputingElement("Pool")
if not result["OK"]:
return result
if self.submissionPolicy == "Application":
result = self._initializeComputingElement("Pool")
if not result["OK"]:
return result

# on-the fly imports
ol = ObjectLoader()
@@ -133,14 +135,15 @@ def beginExecution(self):
return result
self.pilotDN, _ = result["Value"]

# Maximum number of jobs that can be handled at the same time by the agent
# (only for Application submission) Maximum number of jobs that can be handled at the same time by the agent
self.maxJobsToSubmit = self.am_getOption("MaxJobsToSubmit", self.maxJobsToSubmit)
self.computingElement.setParameters({"NumberOfProcessors": self.maxJobsToSubmit})
if self.submissionPolicy == "Application":
self.computingElement.setParameters({"NumberOfProcessors": self.maxJobsToSubmit})

self.failedQueueCycleFactor = self.am_getOption("FailedQueueCycleFactor", self.failedQueueCycleFactor)
self.cleanTask = self.am_getOption("CleanTask", self.cleanTask)

# Get the location of the CVMFS installation
# (only for JobWrapper submission) Get the location of the CVMFS installation
if self.submissionPolicy == "JobWrapper":
self.cvmfsLocation = self.am_getOption("CVMFSLocation", self.cvmfsLocation)
self.log.info("CVMFS location:", self.cvmfsLocation)
@@ -207,6 +210,15 @@ def execute(self):
queueDictItems = list(self.queueDict.items())
random.shuffle(queueDictItems)

# Get a pilot proxy
cpuTime = 86400 * 3
self.log.verbose("Getting pilot proxy", "for %s/%s %d long" % (self.pilotDN, self.vo, cpuTime))
pilotGroup = Operations(vo=self.vo).getValue("Pilot/GenericPilotGroup")
result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, pilotGroup, cpuTime)
if not result["OK"]:
return result
pilotProxy = result["Value"]

if self.submissionPolicy == "Application":
# Check that there is enough slots locally
result = self._checkCEAvailability(self.computingElement)
@@ -223,16 +235,9 @@
if not self._allowedToSubmit(queueName):
continue

# Get a working proxy
# Get the CE instance
ce = queueDictionary["CE"]
cpuTime = 86400 * 3
self.log.verbose("Getting pilot proxy", "for %s/%s %d long" % (self.pilotDN, self.vo, cpuTime))
pilotGroup = Operations(vo=self.vo).getValue("Pilot/GenericPilotGroup")
result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, pilotGroup, cpuTime)
if not result["OK"]:
return result
proxy = result["Value"]
ce.setProxy(proxy)
ce.setProxy(pilotProxy)

if self.submissionPolicy == "JobWrapper":
# Check errors that could have occurred during job submission and/or execution
@@ -360,6 +365,8 @@ def execute(self):
jobParams=params,
resourceParams=ceDict,
optimizerParams=optimizerParams,
owner=owner,
jobGroup=jobGroup,
processors=submissionParams["processors"],
)
if not result["OK"]:
@@ -465,6 +472,48 @@ def _checkMatchingIssues(self, jobRequest):

return S_OK()

#############################################################################

@executeWithUserProxy
def preProcessJob(self, job: JobWrapper):
"""Preprocess the job before submission: should be executed with the user proxy associated with the payload
:param JobWrapper job: job to preprocess
"""
if "InputSandbox" in job.jobArgs:
self.log.verbose("Getting the inputSandbox of job", job.jobID)
if not transferInputSandbox(job, job.jobArgs["InputSandbox"]):
return S_ERROR(f"Cannot get input sandbox of job {job.jobID}")
job.jobReport.commit()

if "InputData" in job.jobArgs and job.jobArgs["InputData"]:
self.log.verbose("Getting the inputData of job", job.jobID)
if not resolveInputData(job):
return S_ERROR(f"Cannot get input data of job {job.jobID}")
job.jobReport.commit()

# Preprocess the payload
try:
self.log.verbose("Pre-processing job", job.jobID)
result = job.preProcess()
if not result["OK"]:
self.log.error("JobWrapper failed the preprocessing phase for", f"{job.jobID}: {result['Message']}")
rescheduleResult = rescheduleFailedJob(
jobID=job.jobID, minorStatus=JobMinorStatus.JOB_WRAPPER_EXECUTION, jobReport=job.jobReport
)
job.sendJobAccounting(status=rescheduleResult, minorStatus=JobMinorStatus.JOB_WRAPPER_EXECUTION)
return S_ERROR(JobMinorStatus.JOB_WRAPPER_EXECUTION)
except Exception:
self.log.exception("JobWrapper failed the preprocessing phase for", job.jobID)
rescheduleResult = rescheduleFailedJob(
jobID=job.jobID, minorStatus=JobMinorStatus.JOB_WRAPPER_EXECUTION, jobReport=job.jobReport
)
job.sendJobAccounting(status=rescheduleResult, minorStatus=JobMinorStatus.JOB_WRAPPER_EXECUTION)
return S_ERROR(f"JobWrapper failed the preprocessing phase for {job.jobID}")

job.jobReport.commit()
return S_OK(result["Value"])

def _submitJobWrapper(
self,
jobID: str,
@@ -473,6 +522,8 @@
jobParams: dict,
resourceParams: dict,
optimizerParams: dict,
owner: str,
jobGroup: str,
processors: int,
):
"""Submit a JobWrapper to the remote site
Expand All @@ -482,7 +533,8 @@ def _submitJobWrapper(
:param jobParams: job parameters
:param resourceParams: resource parameters
:param optimizerParams: optimizer parameters
:param proxyChain: proxy chain
:param owner: owner of the job
:param jobGroup: group of the job
:param processors: number of processors
:return: S_OK
@@ -513,38 +565,10 @@
if not job:
return S_ERROR(f"Cannot get a JobWrapper instance for job {jobID}")

if "InputSandbox" in jobParams:
self.log.verbose("Getting the inputSandbox of job", jobID)
if not transferInputSandbox(job, jobParams["InputSandbox"], jobReport):
return S_ERROR(f"Cannot get input sandbox of job {jobID}")
jobReport.commit()

if "InputData" in jobParams and jobParams["InputData"]:
self.log.verbose("Getting the inputData of job", jobID)
if not resolveInputData(job, jobReport):
return S_ERROR(f"Cannot get input data of job {jobID}")
jobReport.commit()

# Preprocess the payload
try:
self.log.verbose("Pre-processing job", jobID)
result = job.preProcess()
if not result["OK"]:
self.log.error("JobWrapper failed the preprocessing phase for", f"{jobID}: {result['Message']}")
rescheduleResult = rescheduleFailedJob(
jobID=job.jobID, minorStatus=JobMinorStatus.JOB_WRAPPER_EXECUTION, jobReport=jobReport
)
job.sendJobAccounting(status=rescheduleResult, minorStatus=JobMinorStatus.JOB_WRAPPER_EXECUTION)
return S_ERROR(JobMinorStatus.JOB_WRAPPER_EXECUTION)
except Exception:
self.log.exception("JobWrapper failed the preprocessing phase for", jobID)
rescheduleResult = rescheduleFailedJob(
jobID=job.jobID, minorStatus=JobMinorStatus.JOB_WRAPPER_EXECUTION, jobReport=jobReport
)
job.sendJobAccounting(status=rescheduleResult, minorStatus=JobMinorStatus.JOB_WRAPPER_EXECUTION)
return S_ERROR(f"JobWrapper failed the preprocessing phase for {jobID}")
result = self.preProcessJob(job, proxyUserName=job.owner, proxyUserGroup=job.userGroup)
if not result["OK"]:
return result
payloadParams = result["Value"]
jobReport.commit()

# Dump the remote CFG config into the job directory: it is needed for the JobWrapperTemplate
cfgFilename = Path(job.jobIDPath) / "dirac.cfg"
@@ -622,6 +646,8 @@ def _submitJobWrapper(
time.sleep(self.jobSubmissionDelay)
return S_OK()

#############################################################################

def _checkOutputIntegrity(self, workingDirectory: str):
"""Make sure that output files are not corrupted.
@@ -650,6 +676,55 @@ def _checkOutputIntegrity(self, workingDirectory: str):

return S_OK()

@executeWithUserProxy
def postProcessJob(self, job: JobWrapper, payloadResults):
"""Perform post-processing for a job: should be executed with the user proxy associated with the payload.
:param job: JobWrapper instance
:param payloadResults: Payload results
"""
try:
result = job.postProcess(**payloadResults)
if not result["OK"]:
self.log.error("Failed to post process the job", f"{job.jobID}: {result['Message']}")
if result["Errno"] == DErrno.EWMSRESC:
self.log.warn("Asked to reschedule job")
rescheduleResult = rescheduleFailedJob(
jobID=job.jobID, minorStatus=JobMinorStatus.JOB_WRAPPER_EXECUTION, jobReport=job.jobReport
)
job.sendJobAccounting(status=rescheduleResult, minorStatus=JobMinorStatus.JOB_WRAPPER_EXECUTION)
shutil.rmtree(job.jobIDPath)
return

job.jobReport.setJobParameter("Error Message", result["Message"], sendFlag=False)
job.jobReport.setJobStatus(
status=JobStatus.FAILED, minorStatus=JobMinorStatus.EXCEPTION_DURING_EXEC, sendFlag=False
)
job.sendFailoverRequest()
job.sendJobAccounting(status=JobStatus.FAILED, minorStatus=JobMinorStatus.EXCEPTION_DURING_EXEC)
shutil.rmtree(job.jobIDPath)
return
except Exception as exc: # pylint: disable=broad-except
self.log.exception("Job raised exception during post processing phase")
job.jobReport.setJobParameter("Error Message", repr(exc), sendFlag=False)
job.jobReport.setJobStatus(
status=JobStatus.FAILED, minorStatus=JobMinorStatus.EXCEPTION_DURING_EXEC, sendFlag=False
)
job.sendFailoverRequest()
job.sendJobAccounting(status=JobStatus.FAILED, minorStatus=JobMinorStatus.EXCEPTION_DURING_EXEC)
shutil.rmtree(job.jobIDPath)
return

if "OutputSandbox" in job.jobArgs or "OutputData" in job.jobArgs:
self.log.verbose("Uploading the outputSandbox/Data of the job")
if not processJobOutputs(job):
shutil.rmtree(job.jobIDPath)
return
job.jobReport.commit()

# Clean job wrapper locally
job.finalize()

def _checkSubmittedJobWrappers(self, ce: ComputingElement, site: str):
"""Check the status of the submitted tasks.
If the task is finished, get the output and post process the job.
@@ -771,48 +846,7 @@ def _checkSubmittedJobWrappers(self, ce: ComputingElement, site: str):
continue

payloadResults = result["Value"]
# Post-process the job
try:
result = job.postProcess(**payloadResults)
if not result["OK"]:
self.log.error("Failed to post process the job", f"{jobID}: {result['Message']}")
if result["Errno"] == DErrno.EWMSRESC:
self.log.warn("Asked to reschedule job")
rescheduleResult = rescheduleFailedJob(
jobID=job.jobID, minorStatus=JobMinorStatus.JOB_WRAPPER_EXECUTION, jobReport=jobReport
)
job.sendJobAccounting(status=rescheduleResult, minorStatus=JobMinorStatus.JOB_WRAPPER_EXECUTION)
shutil.rmtree(job.jobIDPath)
continue

jobReport.setJobParameter("Error Message", result["Message"], sendFlag=False)
jobReport.setJobStatus(
status=JobStatus.FAILED, minorStatus=JobMinorStatus.EXCEPTION_DURING_EXEC, sendFlag=False
)
job.sendFailoverRequest()
job.sendJobAccounting(status=JobStatus.FAILED, minorStatus=JobMinorStatus.EXCEPTION_DURING_EXEC)
shutil.rmtree(job.jobIDPath)
continue
except Exception as exc: # pylint: disable=broad-except
self.log.exception("Job raised exception during post processing phase")
jobReport.setJobParameter("Error Message", repr(exc), sendFlag=False)
jobReport.setJobStatus(
status=JobStatus.FAILED, minorStatus=JobMinorStatus.EXCEPTION_DURING_EXEC, sendFlag=False
)
job.sendFailoverRequest()
job.sendJobAccounting(status=JobStatus.FAILED, minorStatus=JobMinorStatus.EXCEPTION_DURING_EXEC)
shutil.rmtree(job.jobIDPath)
continue

if "OutputSandbox" in job.jobArgs or "OutputData" in job.jobArgs:
self.log.verbose("Uploading the outputSandbox/Data of the job")
if not processJobOutputs(job, jobReport):
shutil.rmtree(job.jobIDPath)
continue
jobReport.commit()

# Clean job wrapper locally
job.finalize()
self.postProcessJob(job, payloadResults, proxyUserName=job.owner, proxyUserGroup=job.userGroup)

# Clean job in the remote resource
if self.cleanTask:
@@ -837,12 +871,5 @@ def finalize(self):
self.log.error("Problem while trying to get status of the last submitted jobs")
break
time.sleep(int(self.am_getOption("PollingTime")))
else:
for _, queueDictionary in self.queueDict.items():
ce = queueDictionary["CE"]
# Check the submitted JobWrappers a last time
result = self._checkSubmittedJobWrappers(ce, queueDictionary["ParametersDict"]["Site"])
if not result["OK"]:
self.log.error("Problem while trying to get status of the last submitted JobWrappers")

return S_OK()
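
Taken together, the changes to execute, _submitJobWrapper and _checkSubmittedJobWrappers split proxy usage in two: the pilot proxy is obtained once per agent cycle and set on each queue's ComputingElement, while sandbox transfer, input-data resolution and payload pre/post-processing run under the payload owner's proxy through the decorated methods. A condensed sketch of that flow, reusing names from the diff above; it assumes the agent's own attributes, a CE instance ce, a prepared JobWrapper job and its payloadResults, so it is an illustration rather than standalone code.

# 1. pilot proxy: fetched once per cycle and used to drive the CE
cpuTime = 86400 * 3
pilotGroup = Operations(vo=self.vo).getValue("Pilot/GenericPilotGroup")
result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, pilotGroup, cpuTime)
if not result["OK"]:
    return result
ce.setProxy(result["Value"])

# 2. user proxy: obtained by the executeWithUserProxy decorator, used for the
#    steps that touch the payload owner's sandboxes and data
result = self.preProcessJob(job, proxyUserName=job.owner, proxyUserGroup=job.userGroup)
if not result["OK"]:
    return result
self.postProcessJob(job, payloadResults, proxyUserName=job.owner, proxyUserGroup=job.userGroup)
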
(next changed file)
@@ -502,7 +502,7 @@ def process(self, command: str, env: dict):
#############################################################################
def postProcess(
self,
payloadStatus: int,
payloadStatus: int | None,
payloadOutput: str,
payloadExecutorError: str,
cpuTimeConsumed: list,
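
The hunk above widens the type hint of postProcess's payloadStatus parameter from int to int | None, i.e. a payload may finish without producing an exit status. A hedged illustration of a caller-side guard follows; it is not code from this commit and assumes payloadResults is the dictionary that gets passed to postProcess, as in the PushJobAgent diff above.

from DIRAC import gLogger

payloadStatus = payloadResults.get("payloadStatus")  # may now legitimately be None
if payloadStatus is None:
    gLogger.warn("Payload produced no exit status")
elif payloadStatus != 0:
    gLogger.error("Payload finished with a non-zero exit status", str(payloadStatus))
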
(next changed file)
@@ -53,7 +53,7 @@ def execute(jobID: int, arguments: dict, jobReport: JobReport):

if "InputSandbox" in arguments["Job"]:
jobReport.commit()
if not transferInputSandbox(job, arguments["Job"]["InputSandbox"], jobReport):
if not transferInputSandbox(job, arguments["Job"]["InputSandbox"]):
return 1
else:
gLogger.verbose("Job has no InputSandbox requirement")
@@ -62,7 +62,7 @@ def execute(jobID: int, arguments: dict, jobReport: JobReport):

if "InputData" in arguments["Job"]:
if arguments["Job"]["InputData"]:
if not resolveInputData(job, jobReport):
if not resolveInputData(job):
return 1
else:
gLogger.verbose("Job has a null InputData requirement:")
@@ -72,11 +72,11 @@ def execute(jobID: int, arguments: dict, jobReport: JobReport):

jobReport.commit()

if not executePayload(job, jobReport):
if not executePayload(job):
return 1

if "OutputSandbox" in arguments["Job"] or "OutputData" in arguments["Job"]:
if not processJobOutputs(job, jobReport):
if not processJobOutputs(job):
return 2
else:
gLogger.verbose("Job has no OutputData or OutputSandbox requirement")
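
In the hunks above, the helper functions transferInputSandbox, resolveInputData, executePayload and processJobOutputs lose their explicit jobReport argument. Consistent with the new preProcessJob/postProcessJob code earlier in this commit, the working assumption is that each helper now reports through the JobWrapper instance itself (job.jobReport). A condensed sketch of the resulting calling sequence inside this execute function, with job and arguments set up as earlier in it:

# each helper now receives only the JobWrapper (plus its own payload argument)
# and is expected to update job.jobReport internally
if not transferInputSandbox(job, arguments["Job"]["InputSandbox"]):
    return 1  # input sandbox could not be retrieved
if not resolveInputData(job):
    return 1  # input data could not be resolved
if not executePayload(job):
    return 1  # payload execution failed
if not processJobOutputs(job):
    return 2  # output sandbox/data upload failed
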
(remaining changed files not loaded on this page)
