From b6cef9b5c65d423ca0639835249a92397bf5bad1 Mon Sep 17 00:00:00 2001 From: aldbr Date: Mon, 8 Jul 2024 13:51:34 +0200 Subject: [PATCH] feat: introduce the JobExecutionCoordinator --- .../JobWrapper/JobExecutionCoordinator.py | 39 +++++++++++ .../JobWrapper/JobWrapper.py | 49 ++++++++------ .../JobWrapper/test/Test_JobWrapper.py | 67 +++++++++++-------- 3 files changed, 109 insertions(+), 46 deletions(-) create mode 100644 src/DIRAC/WorkloadManagementSystem/JobWrapper/JobExecutionCoordinator.py diff --git a/src/DIRAC/WorkloadManagementSystem/JobWrapper/JobExecutionCoordinator.py b/src/DIRAC/WorkloadManagementSystem/JobWrapper/JobExecutionCoordinator.py new file mode 100644 index 00000000000..c82e10a9de3 --- /dev/null +++ b/src/DIRAC/WorkloadManagementSystem/JobWrapper/JobExecutionCoordinator.py @@ -0,0 +1,39 @@ +from DIRAC import S_OK + + +class JobExecutionCoordinator: + """ + Abstract class for job execution coordinators. + + This class is responsible for pre-processing and post-processing jobs before and after execution. + It should be implemented by the community job execution coordinator. + It is used by the JobWrapper to handle the execution of jobs. + """ + + def __init__(self, jobArgs: dict, ceArgs: dict) -> None: + """ + Initialize the job execution coordinator. + + :param jobArgs: The job arguments + :param ceArgs: The environment arguments + """ + self.jobArgs = jobArgs + self.ceArgs = ceArgs + + def preProcess(self, command: str, exeEnv: dict): + """ + Pre-process a job before executing it. + This should handle tasks like downloading inputs, preparing commands, etc. + + :param job: The job to be pre-processed + """ + return S_OK({"command": command, "env": exeEnv}) + + def postProcess(self): + """ + Post-process a job after executing it. + This should handle tasks like uploading outputs, checking results, etc. + + :param job: The job to be post-processed + """ + return S_OK() diff --git a/src/DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapper.py b/src/DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapper.py index 0a003b10ad6..c17f2e3284d 100755 --- a/src/DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapper.py +++ b/src/DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapper.py @@ -97,8 +97,8 @@ def __init__(self, jobID=None, jobReport=None): if self.maxPeekLines < 0: self.maxPeekLines = 0 self.defaultCPUTime = gConfig.getValue(self.section + "/DefaultCPUTime", 600) - self.defaultOutputFile = gConfig.getValue(self.section + "/DefaultOutputFile", "std.out") - self.defaultErrorFile = gConfig.getValue(self.section + "/DefaultErrorFile", "std.err") + self.outputFile = gConfig.getValue(self.section + "/DefaultOutputFile", "std.out") + self.errorFile = gConfig.getValue(self.section + "/DefaultErrorFile", "std.err") self.diskSE = gConfig.getValue(self.section + "/DiskSE", ["-disk", "-DST", "-USER"]) self.tapeSE = gConfig.getValue(self.section + "/TapeSE", ["-tape", "-RDST", "-RAW"]) self.failoverRequestDelay = gConfig.getValue(self.section + "/FailoverRequestDelay", 45) @@ -175,6 +175,8 @@ def __init__(self, jobID=None, jobReport=None): self.optArgs = {} self.ceArgs = {} + self.jobExecutionCoordinator = None + # Store the result of the payload execution self.executionResults = {} @@ -194,6 +196,10 @@ def initialize(self, arguments): self.jobGroup = self.jobArgs.get("JobGroup", self.jobGroup) self.jobName = self.jobArgs.get("JobName", self.jobName) self.jobType = self.jobArgs.get("JobType", self.jobType) + # Prepare outputs + self.errorFile = self.jobArgs.get("StdError", self.errorFile) + self.outputFile = self.jobArgs.get("StdOutput", self.outputFile) + dataParam = self.jobArgs.get("InputData", []) if dataParam and not isinstance(dataParam, list): dataParam = [dataParam] @@ -232,6 +238,14 @@ def initialize(self, arguments): self.log.debug("================") self.log.debug(json.dumps(dict(os.environ), indent=4)) + # Load the Job Execution Coordinator: can be overriden by a specific implementation + result = ObjectLoader().loadObject( + "WorkloadManagementSystem.JobWrapper.JobExecutionCoordinator", "JobExecutionCoordinator" + ) + if not result["OK"]: + return result + self.jobExecutionCoordinator = result["Value"](jobArgs=self.jobArgs, ceArgs=self.ceArgs) + ############################################################################# def __setInitialJobParameters(self): """Sets some initial job parameters""" @@ -358,26 +372,19 @@ def preProcess(self): command = result["Value"] self.log.verbose(f"Execution command: {command}") - # Prepare outputs - errorFile = self.jobArgs.get("StdError", self.defaultErrorFile) - outputFile = self.jobArgs.get("StdOutput", self.defaultOutputFile) - result = self.__prepareEnvironment() if not result["OK"]: return result exeEnv = result["Value"] - return S_OK( - { - "command": command, - "error": errorFile, - "output": outputFile, - "env": exeEnv, - } - ) + if not (result := self.jobExecutionCoordinator.preProcess(command, exeEnv))["OK"]: + self.log.error("Failed to pre-process the job", result["Message"]) + return result + + return result ############################################################################# - def process(self, command: str, output: str, error: str, env: dict): + def process(self, command: str, env: dict): """This method calls the payload.""" self.log.info(f"Job Wrapper is starting the processing phase for job {self.jobID}") @@ -398,7 +405,9 @@ def process(self, command: str, output: str, error: str, env: dict): spObject = Subprocess(timeout=False, bufferLimit=int(self.bufferLimit)) with contextlib.chdir(self.jobIDPath): - exeThread = ExecutionThread(spObject, command, self.maxPeekLines, output, error, env, self.executionResults) + exeThread = ExecutionThread( + spObject, command, self.maxPeekLines, self.outputFile, self.errorFile, env, self.executionResults + ) exeThread.start() payloadPID = None for seconds in range(5, 40, 5): @@ -565,7 +574,11 @@ def postProcess( self.failedFlag = False self.__report(status=JobStatus.COMPLETING, minorStatus=JobMinorStatus.APP_SUCCESS, sendFlag=True) - return S_OK() + if not (result := self.jobExecutionCoordinator.postProcess())["OK"]: + self.log.error("Failed to post-process the job", result["Message"]) + return result + + return result ############################################################################# def execute(self): @@ -580,8 +593,6 @@ def execute(self): result = self.process( command=payloadParams["command"], - output=payloadParams["output"], - error=payloadParams["error"], env=payloadParams["env"], ) if not result["OK"]: diff --git a/src/DIRAC/WorkloadManagementSystem/JobWrapper/test/Test_JobWrapper.py b/src/DIRAC/WorkloadManagementSystem/JobWrapper/test/Test_JobWrapper.py index f999baf6826..0dc4d899e4b 100644 --- a/src/DIRAC/WorkloadManagementSystem/JobWrapper/test/Test_JobWrapper.py +++ b/src/DIRAC/WorkloadManagementSystem/JobWrapper/test/Test_JobWrapper.py @@ -15,6 +15,7 @@ from DIRAC.DataManagementSystem.Client.test.mock_DM import dm_mock from DIRAC.Resources.Catalog.test.mock_FC import fc_mock from DIRAC.WorkloadManagementSystem.Client import JobMinorStatus, JobStatus +from DIRAC.WorkloadManagementSystem.JobWrapper.JobExecutionCoordinator import JobExecutionCoordinator from DIRAC.WorkloadManagementSystem.JobWrapper.JobWrapper import JobWrapper getSystemSectionMock = MagicMock() @@ -35,48 +36,44 @@ def test_preProcess(mocker): # Test a simple command without argument jw = JobWrapper() jw.jobArgs = {"Executable": echoLocation} + jw.jobExecutionCoordinator = JobExecutionCoordinator(None, None) result = jw.preProcess() assert result["OK"] assert result["Value"]["command"] == echoLocation - assert result["Value"]["error"] == "std.err" - assert result["Value"]["output"] == "std.out" assert result["Value"]["env"]["DIRAC_PROCESSORS"] == "1" assert result["Value"]["env"]["DIRAC_WHOLENODE"] == "False" # Test a command with arguments jw = JobWrapper() jw.jobArgs = {"Executable": echoLocation, "Arguments": "hello"} + jw.jobExecutionCoordinator = JobExecutionCoordinator(None, None) result = jw.preProcess() assert result["OK"] assert result["Value"]["command"] == f"{echoLocation} hello" - assert result["Value"]["error"] == "std.err" - assert result["Value"]["output"] == "std.out" assert result["Value"]["env"]["DIRAC_PROCESSORS"] == "1" assert result["Value"]["env"]["DIRAC_WHOLENODE"] == "False" # Test a command that is included in the PATH jw = JobWrapper() jw.jobArgs = {"Executable": "echo", "Arguments": "hello"} + jw.jobExecutionCoordinator = JobExecutionCoordinator(None, None) result = jw.preProcess() assert result["OK"] assert result["Value"]["command"] == f"{echoLocation} hello" - assert result["Value"]["error"] == "std.err" - assert result["Value"]["output"] == "std.out" assert result["Value"]["env"]["DIRAC_PROCESSORS"] == "1" assert result["Value"]["env"]["DIRAC_WHOLENODE"] == "False" # Test a command and specify outputs jw = JobWrapper() jw.jobArgs = {"Executable": "echo", "Arguments": "hello", "StdError": "error.log", "StdOutput": "output.log"} + jw.jobExecutionCoordinator = JobExecutionCoordinator(None, None) result = jw.preProcess() assert result["OK"] assert result["Value"]["command"] == f"{echoLocation} hello" - assert result["Value"]["error"] == "error.log" - assert result["Value"]["output"] == "output.log" assert result["Value"]["env"]["DIRAC_PROCESSORS"] == "1" assert result["Value"]["env"]["DIRAC_WHOLENODE"] == "False" @@ -84,32 +81,32 @@ def test_preProcess(mocker): jw = JobWrapper() jw.jobArgs = {"Executable": "echo", "Arguments": "hello"} jw.ceArgs = {"Processors": 2} + jw.jobExecutionCoordinator = JobExecutionCoordinator(None, None) result = jw.preProcess() assert result["OK"] assert result["Value"]["command"] == f"{echoLocation} hello" - assert result["Value"]["error"] == "std.err" - assert result["Value"]["output"] == "std.out" assert result["Value"]["env"]["DIRAC_PROCESSORS"] == "2" assert result["Value"]["env"]["DIRAC_WHOLENODE"] == "False" # Test a command with environment variable in the executable jw = JobWrapper() jw.jobArgs = {"Executable": "${CMD}", "Arguments": "hello"} + jw.jobExecutionCoordinator = JobExecutionCoordinator(None, None) os.environ["CMD"] = echoLocation result = jw.preProcess() assert result["OK"] assert result["Value"]["command"] == f"{echoLocation} hello" - assert result["Value"]["error"] == "std.err" - assert result["Value"]["output"] == "std.out" assert result["Value"]["env"]["DIRAC_PROCESSORS"] == "1" assert result["Value"]["env"]["DIRAC_WHOLENODE"] == "False" # Test a command with an empty executable jw = JobWrapper() jw.jobArgs = {} + jw.jobExecutionCoordinator = JobExecutionCoordinator(None, None) + result = jw.preProcess() assert not result["OK"] assert result["Message"] == "Job 0 has no specified executable" @@ -117,6 +114,8 @@ def test_preProcess(mocker): # Test a command with an executable that does not exist jw = JobWrapper() jw.jobArgs = {"Executable": "pippo"} + jw.jobExecutionCoordinator = JobExecutionCoordinator(None, None) + result = jw.preProcess() assert not result["OK"] assert result["Message"] == f"Path to executable {os.getcwd()}/pippo not found" @@ -124,6 +123,7 @@ def test_preProcess(mocker): # Test dirac-jobexec jw = JobWrapper() jw.jobArgs = {"Executable": "dirac-jobexec", "Arguments": "jobDescription.xml"} + jw.jobExecutionCoordinator = JobExecutionCoordinator(None, None) result = jw.preProcess() assert result["OK"] @@ -136,8 +136,6 @@ def test_preProcess(mocker): assert ( result["Value"]["command"].strip() == f"{diracJobExecLocation} jobDescription.xml {' '.join(expectedOptions)}" ) - assert result["Value"]["error"] == "std.err" - assert result["Value"]["output"] == "std.out" assert result["Value"]["env"]["DIRAC_PROCESSORS"] == "1" assert result["Value"]["env"]["DIRAC_WHOLENODE"] == "False" @@ -145,8 +143,9 @@ def test_preProcess(mocker): jw = JobWrapper() jw.jobArgs = {"Executable": "dirac-jobexec", "Arguments": "jobDescription.xml"} jw.ceArgs = {"GridCE": "CE", "Queue": "Queue", "SubmissionPolicy": "Application"} - result = jw.preProcess() + jw.jobExecutionCoordinator = JobExecutionCoordinator(None, None) + result = jw.preProcess() assert result["OK"] expectedOptions = [ "-o /LocalSite/CPUNormalizationFactor=0.0", @@ -158,8 +157,6 @@ def test_preProcess(mocker): assert ( result["Value"]["command"].strip() == f"{diracJobExecLocation} jobDescription.xml {' '.join(expectedOptions)}" ) - assert result["Value"]["error"] == "std.err" - assert result["Value"]["output"] == "std.out" assert result["Value"]["env"]["DIRAC_PROCESSORS"] == "1" assert result["Value"]["env"]["DIRAC_WHOLENODE"] == "False" @@ -179,10 +176,10 @@ def test_processSuccessfulCommand(mocker): mocker.patch.object(jw, "_JobWrapper__setJobParam") with tempfile.NamedTemporaryFile(delete=True) as std_out, tempfile.NamedTemporaryFile(delete=True) as std_err: + jw.outputFile = std_out.name + jw.errorFile = std_err.name result = jw.process( command=f"{os.path.dirname(os.path.abspath(__file__))}/script-long.sh", - output=std_out.name, - error=std_err.name, env={}, ) @@ -209,10 +206,10 @@ def test_processSuccessfulDiracJobExec(mocker): with tempfile.NamedTemporaryFile(delete=True) as std_out, tempfile.NamedTemporaryFile(delete=True) as std_err: executable = shutil.which("dirac-jobexec") + jw.outputFile = std_out.name + jw.errorFile = std_err.name result = jw.process( command=f"{executable} {os.path.dirname(os.path.abspath(__file__))}/jobDescription.xml --o /DIRAC/Setup=Test", - output=std_out.name, - error=std_err.name, env={}, ) @@ -234,10 +231,10 @@ def test_processFailedCommand(mocker): mocker.patch.object(jw, "_JobWrapper__setJobParam") with tempfile.NamedTemporaryFile(delete=True) as std_out, tempfile.NamedTemporaryFile(delete=True) as std_err: + jw.outputFile = std_out.name + jw.errorFile = std_err.name result = jw.process( command=f"{os.path.dirname(os.path.abspath(__file__))}/script-fail.sh", - output=std_out.name, - error=std_err.name, env={}, ) @@ -271,7 +268,9 @@ def test_processFailedSubprocess(mocker): mocker.patch.object(jw, "_JobWrapper__setJobParam") with tempfile.NamedTemporaryFile(delete=True) as std_out, tempfile.NamedTemporaryFile(delete=True) as std_err: - result = jw.process("mock_command", std_out.name, std_err.name, {}) + jw.outputFile = std_out.name + jw.errorFile = std_err.name + result = jw.process("mock_command", {}) assert result["OK"] assert not result["Value"]["payloadStatus"] @@ -294,7 +293,9 @@ def test_processQuickExecutionNoWatchdog(mocker): mocker.patch.object(jw, "_JobWrapper__setJobParam") with tempfile.NamedTemporaryFile(delete=True) as std_out, tempfile.NamedTemporaryFile(delete=True) as std_err: - result = jw.process(command=f"echo hello", output=std_out.name, error=std_err.name, env={}) + jw.outputFile = std_out.name + jw.errorFile = std_err.name + result = jw.process(command=f"echo hello", env={}) assert result["OK"] assert result["Value"]["payloadStatus"] == 0 @@ -321,7 +322,9 @@ def test_processSubprocessFailureNoPid(mocker): mocker.patch("DIRAC.WorkloadManagementSystem.JobWrapper.JobWrapper.ExecutionThread", return_value=mock_exeThread) with tempfile.NamedTemporaryFile(delete=True) as std_out, tempfile.NamedTemporaryFile(delete=True) as std_err: - result = jw.process(command=f"mock_command", output=std_out.name, error=std_err.name, env={}) + jw.outputFile = std_out.name + jw.errorFile = std_err.name + result = jw.process(command=f"mock_command", env={}) assert not result["OK"] assert "Payload process could not start after 140 seconds" in result["Message"] @@ -344,6 +347,7 @@ def set_param_side_effect(*args, **kwargs): # Test when the payload finished successfully jw = JobWrapper() + jw.jobExecutionCoordinator = JobExecutionCoordinator(None, None) mocker.patch.object(jw, "_JobWrapper__report", side_effect=report_side_effect) mocker.patch.object(jw, "_JobWrapper__setJobParam", side_effect=set_param_side_effect) @@ -365,6 +369,7 @@ def set_param_side_effect(*args, **kwargs): # Test when the payload failed jw = JobWrapper() + jw.jobExecutionCoordinator = JobExecutionCoordinator(None, None) mocker.patch.object(jw, "_JobWrapper__report", side_effect=report_side_effect) mocker.patch.object(jw, "_JobWrapper__setJobParam", side_effect=set_param_side_effect) @@ -386,6 +391,7 @@ def set_param_side_effect(*args, **kwargs): # Test when the payload failed: should be rescheduled jw = JobWrapper() + jw.jobExecutionCoordinator = JobExecutionCoordinator(None, None) mocker.patch.object(jw, "_JobWrapper__report", side_effect=report_side_effect) mocker.patch.object(jw, "_JobWrapper__setJobParam", side_effect=set_param_side_effect) @@ -409,6 +415,7 @@ def set_param_side_effect(*args, **kwargs): # Test when there is no output jw = JobWrapper() + jw.jobExecutionCoordinator = JobExecutionCoordinator(None, None) mocker.patch.object(jw, "_JobWrapper__report", side_effect=report_side_effect) mocker.patch.object(jw, "_JobWrapper__setJobParam", side_effect=set_param_side_effect) @@ -430,6 +437,7 @@ def set_param_side_effect(*args, **kwargs): # Test when there is a watchdog error jw = JobWrapper() + jw.jobExecutionCoordinator = JobExecutionCoordinator(None, None) mocker.patch.object(jw, "_JobWrapper__report", side_effect=report_side_effect) mocker.patch.object(jw, "_JobWrapper__setJobParam", side_effect=set_param_side_effect) @@ -451,6 +459,7 @@ def set_param_side_effect(*args, **kwargs): # Test when the executor failed: no status defined jw = JobWrapper() + jw.jobExecutionCoordinator = JobExecutionCoordinator(None, None) mocker.patch.object(jw, "_JobWrapper__report", side_effect=report_side_effect) mocker.patch.object(jw, "_JobWrapper__setJobParam", side_effect=set_param_side_effect) @@ -472,6 +481,7 @@ def set_param_side_effect(*args, **kwargs): # Test when the executor failed: status defined jw = JobWrapper() + jw.jobExecutionCoordinator = JobExecutionCoordinator(None, None) mocker.patch.object(jw, "_JobWrapper__report", side_effect=report_side_effect) mocker.patch.object(jw, "_JobWrapper__setJobParam", side_effect=set_param_side_effect) @@ -494,6 +504,7 @@ def set_param_side_effect(*args, **kwargs): # Test when the subprocess did not complete jw = JobWrapper() + jw.jobExecutionCoordinator = JobExecutionCoordinator(None, None) mocker.patch.object(jw, "_JobWrapper__report", side_effect=report_side_effect) mocker.patch.object(jw, "_JobWrapper__setJobParam", side_effect=set_param_side_effect) @@ -552,9 +563,11 @@ def test_execute(mocker, executable, args, src, expectedResult): jw = JobWrapper() jw.jobArgs = {"Executable": executable} + jw.jobExecutionCoordinator = JobExecutionCoordinator(None, None) + if args: jw.jobArgs["Arguments"] = args - res = jw.execute() + jw.execute() assert expectedResult in jw.jobReport.jobStatusInfo[-1] if src: