Skip to content

Commit

Permalink
Refactoring. Storing trace curl error in file, discovered by job metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
PalNilsson committed Sep 27, 2023
1 parent 1287a69 commit 492a5ba
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 10 deletions.
2 changes: 1 addition & 1 deletion PILOTVERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.6.8.27
3.6.8.29
29 changes: 27 additions & 2 deletions pilot/user/atlas/jobmetrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,11 @@
import logging

from pilot.api import analytics
from pilot.common.exception import FileHandlingFailure
from pilot.util.config import config
from pilot.util.jobmetrics import get_job_metrics_entry
from pilot.util.features import MachineFeatures, JobFeatures
from pilot.util.filehandling import find_last_line
from pilot.util.filehandling import find_last_line, read_file
from pilot.util.math import float_to_rounded_string

from .cpu import get_core_count
Expand Down Expand Up @@ -74,7 +76,7 @@ def get_job_metrics_string(job, extra={}):
logger.info("will not add max space = %d B to job metrics", max_space)

# is there a detected rucio trace service error?
trace_exit_code = os.environ.get('RUCIO_TRACE_ERROR', '0')
trace_exit_code = get_trace_exit_code(job.workdir)
if trace_exit_code != '0':
job_metrics += get_job_metrics_entry("rucioTraceError", trace_exit_code)

Expand All @@ -100,6 +102,29 @@ def get_job_metrics_string(job, extra={}):
return job_metrics


def get_trace_exit_code(workdir):
    """
    Look for any rucio trace curl problems using an env var and a file.

    The env var RUCIO_TRACE_ERROR is checked first. Only if it is unset or '0' is the
    rucio trace error file (config.Rucio.rucio_trace_error_file, located in the given
    work directory) consulted, since the env var cannot be seen outside of a
    middleware container.

    Surrounding whitespace (e.g. a trailing newline) is removed from the file content,
    and an empty or unreadable file is ignored so that the default '0' is kept rather
    than reporting an empty exit code.

    :param workdir: payload work directory (str)
    :return: curl exit code (str).
    """

    trace_exit_code = os.environ.get('RUCIO_TRACE_ERROR', '0')
    if trace_exit_code != '0':
        # the env var already carries the error, no need to look at the file
        return trace_exit_code

    # look for rucio_trace_error_file in case middleware container is used
    path = os.path.join(workdir, config.Rucio.rucio_trace_error_file)
    if not os.path.exists(path):
        return trace_exit_code

    try:
        content = read_file(path)
    except FileHandlingFailure as exc:
        logger.warning(f'failed to read {path}: {exc}')
        return trace_exit_code

    # normalise the content: the exit code must not carry newlines or padding
    content = content.strip() if content else ''
    if content:
        trace_exit_code = content
        logger.debug(f'read {trace_exit_code} from file {path}')
    else:
        # an empty file carries no exit code; keep the default instead of reporting ''
        logger.warning(f'file {path} is empty - ignoring')

    return trace_exit_code


def add_features(job_metrics, corecount, add=[]):
"""
Add job and machine feature data to the job metrics if available
Expand Down
2 changes: 1 addition & 1 deletion pilot/util/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
RELEASE = '3' # released number should be fixed at 3 for Pilot 3
VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates
REVISION = '8' # revision number should be reset to '0' for every new version release, increased for small updates
BUILD = '27' # build number should be reset to '1' for every new development cycle
BUILD = '29' # build number should be reset to '1' for every new development cycle

SUCCESS = 0
FAILURE = 1
Expand Down
3 changes: 3 additions & 0 deletions pilot/util/default.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,9 @@ checksum_type: adler32
# Rucio server URL for traces
url: https://rucio-lb-prod.cern.ch/traces/

# Error info file in case of curl error
rucio_trace_error_file: rucio_trace_error.txt

# Rucio host
host: https://voatlasrucio-server-prod.cern.ch:443

Expand Down
5 changes: 0 additions & 5 deletions pilot/util/monitoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
import subprocess
from glob import glob
from typing import Any
from re import findall
from signal import SIGKILL

from pilot.common.errorcodes import ErrorCodes
Expand Down Expand Up @@ -94,10 +93,6 @@ def job_monitor_tasks(job, mt, args): # noqa: C901
# confirm that the worker node has a proper SC_CLK_TCK (problems seen on MPPMU)
check_hz()

# verify that the process is still alive (again)
if not still_running(job.pid):
return 0, ""

try:
cpuconsumptiontime = get_current_cpu_consumption_time(job.pid)
except Exception as error:
Expand Down
12 changes: 11 additions & 1 deletion pilot/util/tracereport.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
from pilot.util.config import config
from pilot.util.constants import get_pilot_version, get_rucio_client_version
from pilot.util.container import execute, execute2
from pilot.util.filehandling import append_to_file
from pilot.common.exception import FileHandlingFailure
from pilot.util.filehandling import append_to_file, write_file

import logging
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -204,6 +205,15 @@ def send(self):
if not exit_code:
logger.info('no errors were detected from curl operation')
else:
# better to store exit code in file since env var will not be seen outside container in case middleware
# container is used
path = os.path.join(self.workdir, config.Rucio.rucio_trace_error_file)
try:
write_file(path, str(exit_code))
except FileHandlingFailure as exc:
logger.warning(f'failed to store curl exit code to file: {exc}')
else:
logger.info(f'wrote rucio trace exit code {exit_code} to file {path}')
logger.debug(f"setting env var RUCIO_TRACE_ERROR to \'{exit_code}\' to be sent with job metrics")
os.environ['RUCIO_TRACE_ERROR'] = str(exit_code)

Expand Down

0 comments on commit 492a5ba

Please sign in to comment.