Skip to content

Commit

Permalink
Hotfix: Fixed key use master as compute not working properly (#460)
Browse files Browse the repository at this point in the history
* fixed rule setting for security groups

* fixed bug where multiple networks (now given as a list) caused errors.

* trying to figure out why route applying only works once.

* Added more echoes for better debugging.

* fixed remaining "subnet list gets handled as a single subnet" bug and finalized multiple routes handling.

* fixed None bug where [] is expected when no sshPublicKeyFile is given.

* removed master from compute if use master as compute is false

* restructured role "additional" in order to make it easier to include. Added quotes for consistency.

* Updated all tests (#448)

* updated most tests

* fixed validate_configuration.py tests.

* Updated tests for startup.py

* fixed bug in terminate that caused assume_yes to work as assume_no

* updated terminate_cluster tests.

* fixed formatting improved pylint

* adapted tests

* updated return threading test

* updated provider_handler

* tests not finished yet

* Fixed server regex issue

* test list clusters updated

* fixed too open cluster_id regex

* added missing "to"

* fixed id_generation tests

* renamed configuration handler to please linter

* removed unnecessary tests and updated remaining

* updated tests not finished yet

* improved code style

* fixed tests further. One to fix left.

* fixed additional tests

* fixed all tests for ansible configurator

* fixed comment

* fixed multiple tests

* fixed a few tests

* Fixed create

* fixed some issues regarding

* fixing test_provider.py

* removed infrastructure_cloud.yml

* minor fixes

* fixed all tests

* removed print

* changed prints to log

* removed log
  • Loading branch information
XaverStiensmeier authored Nov 28, 2023
1 parent 39a881f commit 5ce64a2
Show file tree
Hide file tree
Showing 33 changed files with 1,310 additions and 1,272 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ resources/playbook/ansible_hosts
resources/playbook/vars/
resources/playbook/host_vars/
resources/playbook/group_vars/
tests/resources/*
!tests/resources/test_configuration.yml

# any log files
*.log
Expand Down
3 changes: 1 addition & 2 deletions bibigrid/core/actions/ide.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
LOCALHOST = "127.0.0.1"



def sigint_handler(caught_signal, frame): # pylint: disable=unused-argument
"""
Is called when SIGINT is thrown and terminates the program
Expand Down Expand Up @@ -82,7 +81,7 @@ def ide(cluster_id, master_provider, master_configuration, log):
ssh_pkey=used_private_key,
local_bind_address=(LOCALHOST, used_local_bind_address),
remote_bind_address=(LOCALHOST, REMOTE_BIND_ADDRESS)) as server:
print("CTRL+C to close port forwarding when you are done.")
log.log(42, "CTRL+C to close port forwarding when you are done.")
with server:
# opens in existing window if any default program exists
webbrowser.open(f"http://localhost:{used_local_bind_address}", new=2)
Expand Down
4 changes: 2 additions & 2 deletions bibigrid/core/actions/list_clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from bibigrid.core.actions import create

SERVER_REGEX = re.compile(r"^bibigrid-((master)-([a-zA-Z0-9]+)|(worker|vpngtw)\d+-([a-zA-Z0-9]+)-\d+)$")
SERVER_REGEX = re.compile(r"^bibigrid-((master)-([a-zA-Z0-9]+)|(worker|vpngtw)-([a-zA-Z0-9]+)-\d+)$")


def dict_clusters(providers, log):
Expand Down Expand Up @@ -69,7 +69,7 @@ def log_list(cluster_id, providers, log):
master_count, worker_count, vpn_count = get_size_overview(cluster_dict[cluster_id], log)
log.log(42, f"\tCluster has {master_count} master, {vpn_count} vpngtw and {worker_count} regular workers. "
f"The cluster is spread over {vpn_count + master_count} reachable provider(s).")
pprint.pprint(cluster_dict[cluster_id])
log.log(42, pprint.pformat(cluster_dict[cluster_id]))
else:
log.info("Cluster with cluster-id {cluster_id} not found.")
log.log(42, f"Cluster with cluster-id {cluster_id} not found.")
Expand Down
12 changes: 6 additions & 6 deletions bibigrid/core/actions/terminate.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,11 @@ def terminate(cluster_id, providers, log, debug=False, assume_yes=False):
cluster_security_group_state = []
tmp_keyname = create.KEY_NAME.format(cluster_id=cluster_id)
local_keypairs_deleted = delete_local_keypairs(tmp_keyname, log)
if not assume_yes and (
local_keypairs_deleted or input(f"WARNING: No local temporary keyfiles found for cluster {cluster_id}. "
f"This might not be your cluster. Are you sure you want to terminate it?\n"
f"Any non-empty input to shutdown cluster {cluster_id}. "
f"Empty input to exit with cluster still alive:")):
if assume_yes or local_keypairs_deleted or input(
f"WARNING: No local temporary keyfiles found for cluster {cluster_id}. "
f"This might not be your cluster. Are you sure you want to terminate it?\n"
f"Any non-empty input to shutdown cluster {cluster_id}. "
f"Empty input to exit with cluster still alive:"):
for provider in providers:
log.info("Terminating cluster %s on cloud %s", cluster_id, provider.cloud_specification['identifier'])
server_list = provider.list_servers()
Expand All @@ -61,7 +61,7 @@ def terminate_servers(server_list, cluster_id, provider, log):
"""
log.info("Deleting servers on provider %s...", provider.cloud_specification['identifier'])
cluster_server_state = []
server_regex = re.compile(fr"^bibigrid-(master-{cluster_id}+|(worker|vpngtw)-{cluster_id}+-\d+)$")
server_regex = re.compile(fr"^bibigrid-(master-{cluster_id}|(worker|vpngtw)-{cluster_id}-\d+)$")
for server in server_list:
if server_regex.match(server["name"]):
log.info("Trying to terminate Server %s on cloud %s.", server['name'],
Expand Down
16 changes: 8 additions & 8 deletions bibigrid/core/utility/ansible_configurator.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,7 @@ def write_host_and_group_vars(configurations, providers, cluster_id, log): # py
master_dict = {"name": name, "image": master["image"], "network": configuration["network"],
"network_cidrs": configuration["subnet_cidrs"], "floating_ip": configuration["floating_ip"],
"flavor": flavor_dict, "private_v4": configuration["private_v4"],
"cloud_identifier": configuration["cloud_identifier"],
"volumes": configuration["volumes"],
"cloud_identifier": configuration["cloud_identifier"], "volumes": configuration["volumes"],
"fallback_on_other_image": configuration.get("fallbackOnOtherImage", False)}
if configuration.get("wireguard_peer"):
master_dict["wireguard"] = {"ip": "10.0.0.1", "peer": configuration.get("wireguard_peer")}
Expand Down Expand Up @@ -165,8 +164,8 @@ def generate_common_configuration_yaml(cidrs, configurations, cluster_id, ssh_us
master_configuration = configurations[0]
log.info("Generating common configuration file...")
# print(configuration.get("slurmConf", {}))
common_configuration_yaml = {"auto_mount": master_configuration.get("autoMount", False),
"cluster_id": cluster_id, "cluster_cidrs": cidrs, "default_user": default_user,
common_configuration_yaml = {"auto_mount": master_configuration.get("autoMount", False), "cluster_id": cluster_id,
"cluster_cidrs": cidrs, "default_user": default_user,
"local_fs": master_configuration.get("localFS", False),
"local_dns_lookup": master_configuration.get("localDNSlookup", False),
"use_master_as_compute": master_configuration.get("useMasterAsCompute", True),
Expand Down Expand Up @@ -266,17 +265,18 @@ def get_cidrs(configurations):
"""
all_cidrs = []
for configuration in configurations:
subnet = configuration["subnet_cidrs"]
provider_cidrs = {"cloud_identifier": configuration["cloud_identifier"], "provider_cidrs": subnet}
provider_cidrs = {"cloud_identifier": configuration["cloud_identifier"],
"provider_cidrs": configuration["subnet_cidrs"]}
all_cidrs.append(provider_cidrs)
return all_cidrs


def get_ansible_roles(ansible_roles, log):
"""
Checks if ansible_roles have all necessary values and returns True if so.
:param ansible_roles: ansible_roles from master configuration (first configuration)
:return: list of valid ansible_roles
@param ansible_roles: ansible_roles from master configuration (first configuration)
@param log:
@return: list of valid ansible_roles
"""
ansible_roles_yaml = []
for ansible_role in (ansible_roles or []):
Expand Down
10 changes: 5 additions & 5 deletions bibigrid/core/utility/handler/ssh_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
import time

import paramiko
import yaml
import sympy
import yaml

from bibigrid.core.utility import ansible_commands as aC
from bibigrid.models.exceptions import ConnectionException, ExecutionException
Expand Down Expand Up @@ -107,10 +107,10 @@ def is_active(client, floating_ip_address, private_key, username, log, gateway,
port = 22
if gateway:
log.info(f"Using SSH Gateway {gateway.get('ip')}")
octets = {f'oct{enum+1}': int(elem) for enum, elem in enumerate(floating_ip_address.split("."))}
octets = {f'oct{enum + 1}': int(elem) for enum, elem in enumerate(floating_ip_address.split("."))}
port = int(sympy.sympify(gateway["portFunction"]).subs(dict(octets)))
client.connect(hostname=gateway.get("ip") or floating_ip_address, username=username,
pkey=private_key, timeout=7, auth_timeout=5, port=port)
client.connect(hostname=gateway.get("ip") or floating_ip_address, username=username, pkey=private_key,
timeout=7, auth_timeout=5, port=port)
establishing_connection = False
log.info(f"Successfully connected to {floating_ip_address}")
except paramiko.ssh_exception.NoValidConnectionsError as exc:
Expand Down Expand Up @@ -158,7 +158,7 @@ def execute_ssh_cml_commands(client, commands, log):
:param log:
"""
for command in commands:
ssh_stdin, ssh_stdout, ssh_stderr = client.exec_command(command[0]) # pylint: disable=unused-variable
_, ssh_stdout, _ = client.exec_command(command[0])
ssh_stdout.channel.set_combine_stderr(True)
log.info(f"REMOTE: {command[1]}")

Expand Down
4 changes: 2 additions & 2 deletions bibigrid/core/utility/validate_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,8 +303,8 @@ def check_instance_type_image_combination(self, instance_type, instance_image, p
flavor = provider.get_flavor(instance_type)
if not flavor:
self.log.warning("Flavor %s does not exist.", instance_type)
print("Available flavors:")
print("\n".join(provider.get_active_flavors()))
self.log.log(42, "Available flavors:")
self.log.log(42, "\n".join(provider.get_active_flavors()))
return False
type_max_disk_space = flavor["disk"]
type_max_ram = flavor["ram"]
Expand Down
2 changes: 2 additions & 0 deletions bibigrid/openstack/openstack_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,8 @@ def create_server(self, name, flavor, image, network, key_name=None, wait=True,
except openstack.exceptions.BadRequestException as exc:
if "is not active" in str(exc):
raise ImageDeactivatedException() from exc
if "Invalid key_name provided" in str(exc):
raise ExecutionException() from exc
raise ConnectionError() from exc
except openstack.exceptions.SDKException as exc:
raise ExecutionException() from exc
Expand Down
28 changes: 0 additions & 28 deletions resources/playbook/roles/additional/example/meta/main.yml

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
- debug:
msg:
- "Hello {{ ansible_user }}!"

2 changes: 1 addition & 1 deletion resources/playbook/roles/bibigrid/tasks/010-bin-server.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
- name: Does folder exist
delegate_to: localhost
stat:
path: ~{{ ansible_facts.env.SUDO_USER }}/bin
path: "~{{ ansible_facts.env.SUDO_USER }}/bin"
register: folder

- when: folder.stat.exists
Expand Down
3 changes: 2 additions & 1 deletion resources/playbook/roles/bibigrid/templates/slurm/slurm.conf
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ SlurmdLogFile=/var/log/slurm/slurmd.log
# COMPUTE NODES
{% set sl = {} %}
{% set all = {"nodes":[]} %}
{% for node_name in groups.master+groups.workers %}
{% set master_or_empty = groups.master if use_master_as_compute else [] %}
{% for node_name in master_or_empty +groups.workers %}
{% set node = hostvars[node_name] %}
{% set mem = node.flavor.ram // 1024 * 1000 %}
{% if node.cloud_identifier not in sl %}
Expand Down
Loading

0 comments on commit 5ce64a2

Please sign in to comment.