From 492a5ba28f51a1a2d55fecdbbcf78373ac605392 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Wed, 27 Sep 2023 10:12:49 +0200 Subject: [PATCH] Refactoring. Storing trace curl error in file, discovered by job metrics --- PILOTVERSION | 2 +- pilot/user/atlas/jobmetrics.py | 29 +++++++++++++++++++++++++++-- pilot/util/constants.py | 2 +- pilot/util/default.cfg | 3 +++ pilot/util/monitoring.py | 5 ----- pilot/util/tracereport.py | 12 +++++++++++- 6 files changed, 43 insertions(+), 10 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 5bdb2840..e20a74ab 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.8.27 \ No newline at end of file +3.6.8.29 \ No newline at end of file diff --git a/pilot/user/atlas/jobmetrics.py b/pilot/user/atlas/jobmetrics.py index 7fd93160..02b59dcb 100644 --- a/pilot/user/atlas/jobmetrics.py +++ b/pilot/user/atlas/jobmetrics.py @@ -12,9 +12,11 @@ import logging from pilot.api import analytics +from pilot.common.exception import FileHandlingFailure +from pilot.util.config import config from pilot.util.jobmetrics import get_job_metrics_entry from pilot.util.features import MachineFeatures, JobFeatures -from pilot.util.filehandling import find_last_line +from pilot.util.filehandling import find_last_line, read_file from pilot.util.math import float_to_rounded_string from .cpu import get_core_count @@ -74,7 +76,7 @@ def get_job_metrics_string(job, extra={}): logger.info("will not add max space = %d B to job metrics", max_space) # is there a detected rucio trace service error? - trace_exit_code = os.environ.get('RUCIO_TRACE_ERROR', '0') + trace_exit_code = get_trace_exit_code(job.workdir) if trace_exit_code != '0': job_metrics += get_job_metrics_entry("rucioTraceError", trace_exit_code) @@ -100,6 +102,29 @@ def get_job_metrics_string(job, extra={}): return job_metrics +def get_trace_exit_code(workdir): + """ + Look for any rucio trace curl problems using an env var and a file. + + :param workdir: payload work directory (str) + :return: curl exit code (str). + """ + + trace_exit_code = os.environ.get('RUCIO_TRACE_ERROR', '0') + if trace_exit_code == '0': + # look for rucio_trace_error_file in case middleware container is used + path = os.path.join(workdir, config.Rucio.rucio_trace_error_file) + if os.path.exists(path): + try: + trace_exit_code = read_file(path) + except FileHandlingFailure as exc: + logger.warning(f'failed to read {path}: {exc}') + else: + logger.debug(f'read {trace_exit_code} from file {path}') + + return trace_exit_code + + def add_features(job_metrics, corecount, add=[]): """ Add job and machine feature data to the job metrics if available diff --git a/pilot/util/constants.py b/pilot/util/constants.py index a3b05f08..8d1433f9 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '8' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '27' # build number should be reset to '1' for every new development cycle +BUILD = '29' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/default.cfg b/pilot/util/default.cfg index bda415fc..eb8307e3 100644 --- a/pilot/util/default.cfg +++ b/pilot/util/default.cfg @@ -303,6 +303,9 @@ checksum_type: adler32 # Rucio server URL for traces url: https://rucio-lb-prod.cern.ch/traces/ +# Error info file in case of curl error +rucio_trace_error_file: rucio_trace_error.txt + # Rucio host host: https://voatlasrucio-server-prod.cern.ch:443 diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py index 4136cddc..ae2a367c 100644 --- a/pilot/util/monitoring.py +++ b/pilot/util/monitoring.py @@ -14,7 +14,6 @@ import subprocess from glob import glob from typing import Any -from re import findall from signal import SIGKILL from pilot.common.errorcodes import ErrorCodes @@ -94,10 +93,6 @@ def job_monitor_tasks(job, mt, args): # noqa: C901 # confirm that the worker node has a proper SC_CLK_TCK (problems seen on MPPMU) check_hz() - # verify that the process is still alive (again) - if not still_running(job.pid): - return 0, "" - try: cpuconsumptiontime = get_current_cpu_consumption_time(job.pid) except Exception as error: diff --git a/pilot/util/tracereport.py b/pilot/util/tracereport.py index 98060448..007835cd 100644 --- a/pilot/util/tracereport.py +++ b/pilot/util/tracereport.py @@ -19,7 +19,8 @@ from pilot.util.config import config from pilot.util.constants import get_pilot_version, get_rucio_client_version from pilot.util.container import execute, execute2 -from pilot.util.filehandling import append_to_file +from pilot.common.exception import FileHandlingFailure +from pilot.util.filehandling import append_to_file, write_file import logging logger = logging.getLogger(__name__) @@ -204,6 +205,15 @@ def send(self): if not exit_code: logger.info('no errors were detected from curl operation') else: + # better to store exit code in file since env var will not be seen outside container in case middleware + # container is used + path = os.path.join(self.workdir, config.Rucio.rucio_trace_error_file) + try: + write_file(path, str(exit_code)) + except FileHandlingFailure as exc: + logger.warning(f'failed to store curl exit code to file: {exc}') + else: + logger.info(f'wrote rucio trace exit code {exit_code} to file {path}') logger.debug(f"setting env var RUCIO_TRACE_ERROR to \'{exit_code}\' to be sent with job metrics") os.environ['RUCIO_TRACE_ERROR'] = str(exit_code)