diff --git a/resources/playbook/roles/bibigrid/files/slurm/delete_server.py b/resources/playbook/roles/bibigrid/files/slurm/delete_server.py
index b74754c3..c9677393 100644
--- a/resources/playbook/roles/bibigrid/files/slurm/delete_server.py
+++ b/resources/playbook/roles/bibigrid/files/slurm/delete_server.py
@@ -12,9 +12,9 @@
 import time
 
 import os_client_config
+import requests
 import yaml
-
-from pyzabbix import ZabbixAPI
+from pyzabbix import ZabbixAPI, ZabbixAPIException
 
 LOGGER_FORMAT = "%(asctime)s [%(levelname)s] %(message)s"
 logging.basicConfig(format=LOGGER_FORMAT, filename="/var/log/slurm/delete_server.log", level=logging.INFO)
@@ -29,11 +29,11 @@
     logging.info("Your input %s with length %s", sys.argv, len(sys.argv))
     sys.exit(1)
 
-separator = ','
+SEPARATOR = ','
 if '\n' in sys.argv[1]:
-    separator = '\n'
+    SEPARATOR = '\n'
 
-terminate_workers = sys.argv[1].split(separator)
+terminate_workers = sys.argv[1].split(SEPARATOR)
 logging.info("Deleting instances %s", terminate_workers)
 
 GROUP_VARS_PATH = "/opt/playbook/group_vars"
@@ -61,8 +61,8 @@
 for worker_group in worker_groups:
     for terminate_worker in terminate_workers:
         # terminate all servers that are part of the current worker group
-        result = subprocess.run(["scontrol", "show", "hostname", worker_group["name"]],
-                                stdout=subprocess.PIPE, check=True)  # get all workers in worker_type
+        result = subprocess.run(["scontrol", "show", "hostname", worker_group["name"]], stdout=subprocess.PIPE,
+                                check=True)  # get all workers in worker_type
         possible_workers = result.stdout.decode("utf-8").strip().split("\n")
         if terminate_worker in possible_workers:
             result = connections[worker_group["cloud_identifier"]].delete_server(terminate_worker)
@@ -76,19 +76,32 @@
 # -------------------------------
 
 # connect to Zabbix API
-zapi = ZabbixAPI(server='http://localhost/zabbix')
-# authenticate
-zapi.login("Admin",common_config["zabbix_conf"]["admin_password"])
-# iterate over terminate_workers list
-for terminate_worker in terminate_workers:
-    # get list of hosts that matches the hostname
-    hosts = zapi.host.get(output=["hostid","name"],filter={"name": terminate_worker})
-    if not hosts:
-        logging.warning(f"Can't remove host '{terminate_worker}' from Zabbix: Host doesn't exist.")
-    else:
-        # remove host from Zabbix
-        zapi.host.delete(hosts[0]["hostid"])
-        logging.info(f"Remove host '{terminate_worker}' from Zabbix.")
+if common_config.get("enable_zabbix", False):  # a missing key is treated as "Zabbix disabled"
+    try:
+        # Connect to Zabbix API
+        zapi = ZabbixAPI(server='http://localhost/zabbix')
+
+        # Authenticate
+        zapi.login("Admin", common_config["zabbix_conf"]["admin_password"])
+
+        # Iterate over terminate_workers list
+        for terminate_worker in terminate_workers:
+            try:
+                # Get list of hosts that matches the hostname
+                hosts = zapi.host.get(output=["hostid", "name"], filter={"name": terminate_worker})
+                if not hosts:
+                    logging.warning("Can't remove host '%s' from Zabbix: Host doesn't exist.", terminate_worker)
+                else:
+                    # Remove host from Zabbix
+                    zapi.host.delete(hosts[0]["hostid"])
+                    logging.info("Removed host '%s' from Zabbix.", terminate_worker)
+            except ZabbixAPIException as e:
+                logging.error("Error while handling host '%s': %s", terminate_worker, e)
+    except ZabbixAPIException as e:
+        # API-level failures outside the per-host loop (e.g. a rejected login) are not requests errors
+        logging.error("Zabbix API request failed: %s", e)
+    except requests.exceptions.RequestException as e:
+        logging.error("Cannot connect to Zabbix server: %s", e)
 
 logging.info(f"Successful delete_server.py execution ({sys.argv[1]})!")
 time_in_s = time.time() - start_time
diff --git a/resources/playbook/roles/bibigrid/tasks/011-zabbix-agent.yaml b/resources/playbook/roles/bibigrid/tasks/011-zabbix-agent.yaml
index 890aaf33..a7e2e1af 100644
--- a/resources/playbook/roles/bibigrid/tasks/011-zabbix-agent.yaml
+++ b/resources/playbook/roles/bibigrid/tasks/011-zabbix-agent.yaml
@@ -1,18 +1,11 @@
-- name: Install zabbix python-api
-  pip:
-    name: zabbix-api
-
-- name: Install zabbix agent
-  apt:
-    name: zabbix-agent
-    state: present
-  when: "ansible_distribution_file_variety == 'Debian'"
-
-- name: Install zabbix agent
-  dnf:
-    name: zabbix-agent
-    state: present
-  when: "ansible_distribution_file_variety == 'RedHat'"
+- name: Ensure zabbix user exists
+  when: "'master' not in group_names"
+  user:
+    name: zabbix
+    comment: "Zabbix Monitoring User"
+    home: /var/lib/zabbix
+    shell: /usr/sbin/nologin
+    create_home: false
 
 - name: Create zabbix_agent dropin directory
   file:
@@ -35,22 +28,34 @@
     mode: 0644
   notify: zabbix-agent
 
-- name: Start and Enable zabbix-agent
-  systemd:
-    name: zabbix-agent
-    state: started
-    enabled: true
-
-- name: Install zabbix python-api
-  pip:
-    name: zabbix-api
-
 - name: Copy Zabbix Host delete script
   copy:
     src: zabbix/zabbix_host_delete.py
     dest: /usr/local/bin/zabbix_host_delete.py
     mode: 0755
 
+- name: Install zabbix python-api
+  pip:
+    name: zabbix-api
+
+- name: Install zabbix agent
+  apt:
+    name: zabbix-agent
+    state: present
+  when: "ansible_distribution_file_variety == 'Debian'"
+
+- name: Install zabbix agent
+  dnf:
+    name: zabbix-agent
+    state: present
+  when: "ansible_distribution_file_variety == 'RedHat'"
+
+- name: Start and Enable zabbix-agent
+  systemd:
+    name: zabbix-agent
+    state: started
+    enabled: true
+
 # --------------------------------------
 # -- Add worker node as zabbix hosts --
 # --------------------------------------