From dce080469de69738f3ceb1924c53de1b7e8df340 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tibor=20=C5=A0imko?= Date: Thu, 3 Feb 2022 14:08:47 +0100 Subject: [PATCH] htcondorcern: fix Singularity and Kerberos tokens Fixes CERN HTCondor compute backend for Singularity unpacked images execution mode where jobs couldn't access restricted EOS directories due to inaccessible Kerberos credentials. --- Dockerfile | 2 +- .../htcondorcern_job_manager.py | 29 +++++++++++++------ reana_job_controller/job_monitor.py | 2 +- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/Dockerfile b/Dockerfile index 58e37561..b193f6fb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # This file is part of REANA. -# Copyright (C) 2017, 2018, 2019, 2020, 2021 CERN. +# Copyright (C) 2017, 2018, 2019, 2020, 2021, 2022 CERN. # # REANA is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. diff --git a/reana_job_controller/htcondorcern_job_manager.py b/reana_job_controller/htcondorcern_job_manager.py index f0483a99..7d7c30bc 100644 --- a/reana_job_controller/htcondorcern_job_manager.py +++ b/reana_job_controller/htcondorcern_job_manager.py @@ -146,6 +146,7 @@ def execute(self): job_ad["MaxRunTime"] = 3600 if self.htcondor_accounting_group: job_ad["AccountingGroup"] = self.htcondor_accounting_group + job_ad["MY.SendCredential"] = True future = current_app.htcondor_executor.submit(self._submit, job_ad) clusterid = future.result() return clusterid @@ -229,12 +230,15 @@ def _copy_wrapper_file(self): ) else: template = ( - "#!/bin/bash \n" + "#!/bin/bash\n" + 'SINGULARITY_KRB5CCNAME="FILE:/srv/$(basename $KRB5CCNAME)"\n' "singularity exec " + "--contain " + "--ipc " + "--pid " "--home $PWD:/srv " - "--bind $PWD:/srv " "--bind /cvmfs " - "--bind /eos " + "--env KRB5CCNAME=$SINGULARITY_KRB5CCNAME " "{DOCKER_IMG} {CMD}".format( DOCKER_IMG=self.docker_img, CMD=self._format_arguments() + " | bash", @@ -253,7 +257,7 @@ def _copy_wrapper_file(self): def _submit(self, job_ad): """Execute submission transaction.""" ads = [] - schedd = HTCondorJobManagerCERN._get_schedd() + schedd, credd = HTCondorJobManagerCERN._get_schedd() logging.info("Submiting job - {}".format(job_ad)) clusterid = schedd.submit(job_ad, 1, True, ads) HTCondorJobManagerCERN._spool_input(ads) @@ -261,7 +265,7 @@ def _submit(self, job_ad): @retry(stop_max_attempt_number=MAX_NUM_RETRIES, wait_fixed=RETRY_WAIT_TIME) def _spool_input(ads): - schedd = HTCondorJobManagerCERN._get_schedd() + schedd, credd = HTCondorJobManagerCERN._get_schedd() logging.info("Spooling job inputs - {}".format(ads)) schedd.spool(ads) @@ -274,12 +278,19 @@ def _get_schedd(): thread_local, "MONITOR_THREAD_SCHEDD", htcondor.Schedd() # noqa: F821 ) logging.info("Getting schedd: {}".format(thread_local.MONITOR_THREAD_SCHEDD)) - return thread_local.MONITOR_THREAD_SCHEDD + credd = getattr(thread_local, "MONITOR_THREAD_CREDD", None) + if credd is None: + setattr( + thread_local, "MONITOR_THREAD_CREDD", htcondor.Credd() # noqa: F821 + ) + thread_local.MONITOR_THREAD_CREDD.add_user_cred(htcondor.CredTypes.Kerberos, None) + logging.info("Getting credd: {}".format(thread_local.MONITOR_THREAD_CREDD)) + return thread_local.MONITOR_THREAD_SCHEDD, thread_local.MONITOR_THREAD_CREDD def stop(backend_job_id): """Stop HTCondor job execution.""" try: - schedd = HTCondorJobManagerCERN._get_schedd() + schedd, credd = HTCondorJobManagerCERN._get_schedd() schedd.act( htcondor.JobAction.Remove, # noqa: F821 "ClusterId=={}".format(backend_job_id), @@ -290,7 +301,7 @@ def stop(backend_job_id): @retry(stop_max_attempt_number=MAX_NUM_RETRIES, wait_fixed=RETRY_WAIT_TIME) def spool_output(backend_job_id): """Transfer job output.""" - schedd = HTCondorJobManagerCERN._get_schedd() + schedd, credd = HTCondorJobManagerCERN._get_schedd() logging.info("Spooling jobs {} output.".format(backend_job_id)) schedd.retrieve("ClusterId == {}".format(backend_job_id)) @@ -316,7 +327,7 @@ def get_logs(backend_job_id, workspace): def find_job_in_history(backend_job_id): """Return job if present in condor history.""" - schedd = HTCondorJobManagerCERN._get_schedd() + schedd, credd = HTCondorJobManagerCERN._get_schedd() ads = ["ClusterId", "JobStatus", "ExitCode", "RemoveReason"] condor_it = schedd.history( "ClusterId == {0}".format(backend_job_id), ads, match=1 diff --git a/reana_job_controller/job_monitor.py b/reana_job_controller/job_monitor.py index 2441c62b..e5c15ad3 100644 --- a/reana_job_controller/job_monitor.py +++ b/reana_job_controller/job_monitor.py @@ -528,7 +528,7 @@ def query_condor_jobs(app, backend_job_ids): ads = ["ClusterId", "JobStatus", "ExitCode", "ExitStatus", "HoldReasonCode"] query = format_condor_job_que_query(backend_job_ids) htcondorcern_job_manager_cls = COMPUTE_BACKENDS["htcondorcern"]() - schedd = htcondorcern_job_manager_cls._get_schedd() + schedd, credd = htcondorcern_job_manager_cls._get_schedd() logging.info("Querying jobs {}".format(backend_job_ids)) condor_jobs = schedd.xquery(requirements=query, projection=ads) return condor_jobs