Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hotfix: Fixed the `useMasterAsCompute` key not working properly #460

Merged
merged 14 commits into from
Nov 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ resources/playbook/ansible_hosts
resources/playbook/vars/
resources/playbook/host_vars/
resources/playbook/group_vars/
tests/resources/*
!tests/resources/test_configuration.yml

# any log files
*.log
Expand Down
3 changes: 1 addition & 2 deletions bibigrid/core/actions/ide.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
LOCALHOST = "127.0.0.1"



def sigint_handler(caught_signal, frame): # pylint: disable=unused-argument
"""
Is called when SIGINT is thrown and terminates the program
Expand Down Expand Up @@ -82,7 +81,7 @@ def ide(cluster_id, master_provider, master_configuration, log):
ssh_pkey=used_private_key,
local_bind_address=(LOCALHOST, used_local_bind_address),
remote_bind_address=(LOCALHOST, REMOTE_BIND_ADDRESS)) as server:
print("CTRL+C to close port forwarding when you are done.")
log.log(42, "CTRL+C to close port forwarding when you are done.")
with server:
# opens in existing window if any default program exists
webbrowser.open(f"http://localhost:{used_local_bind_address}", new=2)
Expand Down
4 changes: 2 additions & 2 deletions bibigrid/core/actions/list_clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from bibigrid.core.actions import create

SERVER_REGEX = re.compile(r"^bibigrid-((master)-([a-zA-Z0-9]+)|(worker|vpngtw)\d+-([a-zA-Z0-9]+)-\d+)$")
SERVER_REGEX = re.compile(r"^bibigrid-((master)-([a-zA-Z0-9]+)|(worker|vpngtw)-([a-zA-Z0-9]+)-\d+)$")


def dict_clusters(providers, log):
Expand Down Expand Up @@ -69,7 +69,7 @@ def log_list(cluster_id, providers, log):
master_count, worker_count, vpn_count = get_size_overview(cluster_dict[cluster_id], log)
log.log(42, f"\tCluster has {master_count} master, {vpn_count} vpngtw and {worker_count} regular workers. "
f"The cluster is spread over {vpn_count + master_count} reachable provider(s).")
pprint.pprint(cluster_dict[cluster_id])
log.log(42, pprint.pformat(cluster_dict[cluster_id]))
else:
log.info("Cluster with cluster-id {cluster_id} not found.")
log.log(42, f"Cluster with cluster-id {cluster_id} not found.")
Expand Down
12 changes: 6 additions & 6 deletions bibigrid/core/actions/terminate.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,11 @@ def terminate(cluster_id, providers, log, debug=False, assume_yes=False):
cluster_security_group_state = []
tmp_keyname = create.KEY_NAME.format(cluster_id=cluster_id)
local_keypairs_deleted = delete_local_keypairs(tmp_keyname, log)
if not assume_yes and (
local_keypairs_deleted or input(f"WARNING: No local temporary keyfiles found for cluster {cluster_id}. "
f"This might not be your cluster. Are you sure you want to terminate it?\n"
f"Any non-empty input to shutdown cluster {cluster_id}. "
f"Empty input to exit with cluster still alive:")):
if assume_yes or local_keypairs_deleted or input(
f"WARNING: No local temporary keyfiles found for cluster {cluster_id}. "
f"This might not be your cluster. Are you sure you want to terminate it?\n"
f"Any non-empty input to shutdown cluster {cluster_id}. "
f"Empty input to exit with cluster still alive:"):
for provider in providers:
log.info("Terminating cluster %s on cloud %s", cluster_id, provider.cloud_specification['identifier'])
server_list = provider.list_servers()
Expand All @@ -61,7 +61,7 @@ def terminate_servers(server_list, cluster_id, provider, log):
"""
log.info("Deleting servers on provider %s...", provider.cloud_specification['identifier'])
cluster_server_state = []
server_regex = re.compile(fr"^bibigrid-(master-{cluster_id}+|(worker|vpngtw)-{cluster_id}+-\d+)$")
server_regex = re.compile(fr"^bibigrid-(master-{cluster_id}|(worker|vpngtw)-{cluster_id}-\d+)$")
for server in server_list:
if server_regex.match(server["name"]):
log.info("Trying to terminate Server %s on cloud %s.", server['name'],
Expand Down
16 changes: 8 additions & 8 deletions bibigrid/core/utility/ansible_configurator.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,7 @@ def write_host_and_group_vars(configurations, providers, cluster_id, log): # py
master_dict = {"name": name, "image": master["image"], "network": configuration["network"],
"network_cidrs": configuration["subnet_cidrs"], "floating_ip": configuration["floating_ip"],
"flavor": flavor_dict, "private_v4": configuration["private_v4"],
"cloud_identifier": configuration["cloud_identifier"],
"volumes": configuration["volumes"],
"cloud_identifier": configuration["cloud_identifier"], "volumes": configuration["volumes"],
"fallback_on_other_image": configuration.get("fallbackOnOtherImage", False)}
if configuration.get("wireguard_peer"):
master_dict["wireguard"] = {"ip": "10.0.0.1", "peer": configuration.get("wireguard_peer")}
Expand Down Expand Up @@ -165,8 +164,8 @@ def generate_common_configuration_yaml(cidrs, configurations, cluster_id, ssh_us
master_configuration = configurations[0]
log.info("Generating common configuration file...")
# print(configuration.get("slurmConf", {}))
common_configuration_yaml = {"auto_mount": master_configuration.get("autoMount", False),
"cluster_id": cluster_id, "cluster_cidrs": cidrs, "default_user": default_user,
common_configuration_yaml = {"auto_mount": master_configuration.get("autoMount", False), "cluster_id": cluster_id,
"cluster_cidrs": cidrs, "default_user": default_user,
"local_fs": master_configuration.get("localFS", False),
"local_dns_lookup": master_configuration.get("localDNSlookup", False),
"use_master_as_compute": master_configuration.get("useMasterAsCompute", True),
Expand Down Expand Up @@ -266,17 +265,18 @@ def get_cidrs(configurations):
"""
all_cidrs = []
for configuration in configurations:
subnet = configuration["subnet_cidrs"]
provider_cidrs = {"cloud_identifier": configuration["cloud_identifier"], "provider_cidrs": subnet}
provider_cidrs = {"cloud_identifier": configuration["cloud_identifier"],
"provider_cidrs": configuration["subnet_cidrs"]}
all_cidrs.append(provider_cidrs)
return all_cidrs


def get_ansible_roles(ansible_roles, log):
"""
Checks if ansible_roles have all necessary values and returns True if so.
:param ansible_roles: ansible_roles from master configuration (first configuration)
:return: list of valid ansible_roles
@param ansible_roles: ansible_roles from master configuration (first configuration)
@param log:
@return: list of valid ansible_roles
"""
ansible_roles_yaml = []
for ansible_role in (ansible_roles or []):
Expand Down
10 changes: 5 additions & 5 deletions bibigrid/core/utility/handler/ssh_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
import time

import paramiko
import yaml
import sympy
import yaml

from bibigrid.core.utility import ansible_commands as aC
from bibigrid.models.exceptions import ConnectionException, ExecutionException
Expand Down Expand Up @@ -107,10 +107,10 @@ def is_active(client, floating_ip_address, private_key, username, log, gateway,
port = 22
if gateway:
log.info(f"Using SSH Gateway {gateway.get('ip')}")
octets = {f'oct{enum+1}': int(elem) for enum, elem in enumerate(floating_ip_address.split("."))}
octets = {f'oct{enum + 1}': int(elem) for enum, elem in enumerate(floating_ip_address.split("."))}
port = int(sympy.sympify(gateway["portFunction"]).subs(dict(octets)))
client.connect(hostname=gateway.get("ip") or floating_ip_address, username=username,
pkey=private_key, timeout=7, auth_timeout=5, port=port)
client.connect(hostname=gateway.get("ip") or floating_ip_address, username=username, pkey=private_key,
timeout=7, auth_timeout=5, port=port)
establishing_connection = False
log.info(f"Successfully connected to {floating_ip_address}")
except paramiko.ssh_exception.NoValidConnectionsError as exc:
Expand Down Expand Up @@ -158,7 +158,7 @@ def execute_ssh_cml_commands(client, commands, log):
:param log:
"""
for command in commands:
ssh_stdin, ssh_stdout, ssh_stderr = client.exec_command(command[0]) # pylint: disable=unused-variable
_, ssh_stdout, _ = client.exec_command(command[0])
ssh_stdout.channel.set_combine_stderr(True)
log.info(f"REMOTE: {command[1]}")

Expand Down
4 changes: 2 additions & 2 deletions bibigrid/core/utility/validate_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,8 +303,8 @@ def check_instance_type_image_combination(self, instance_type, instance_image, p
flavor = provider.get_flavor(instance_type)
if not flavor:
self.log.warning("Flavor %s does not exist.", instance_type)
print("Available flavors:")
print("\n".join(provider.get_active_flavors()))
self.log.log(42, "Available flavors:")
self.log.log(42, "\n".join(provider.get_active_flavors()))
return False
type_max_disk_space = flavor["disk"]
type_max_ram = flavor["ram"]
Expand Down
2 changes: 2 additions & 0 deletions bibigrid/openstack/openstack_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,8 @@ def create_server(self, name, flavor, image, network, key_name=None, wait=True,
except openstack.exceptions.BadRequestException as exc:
if "is not active" in str(exc):
raise ImageDeactivatedException() from exc
if "Invalid key_name provided" in str(exc):
raise ExecutionException() from exc
raise ConnectionError() from exc
except openstack.exceptions.SDKException as exc:
raise ExecutionException() from exc
Expand Down
28 changes: 0 additions & 28 deletions resources/playbook/roles/additional/example/meta/main.yml

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
- debug:
msg:
- "Hello {{ ansible_user }}!"

2 changes: 1 addition & 1 deletion resources/playbook/roles/bibigrid/tasks/010-bin-server.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
- name: Does folder exist
delegate_to: localhost
stat:
path: ~{{ ansible_facts.env.SUDO_USER }}/bin
path: "~{{ ansible_facts.env.SUDO_USER }}/bin"
register: folder

- when: folder.stat.exists
Expand Down
3 changes: 2 additions & 1 deletion resources/playbook/roles/bibigrid/templates/slurm/slurm.conf
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ SlurmdLogFile=/var/log/slurm/slurmd.log
# COMPUTE NODES
{% set sl = {} %}
{% set all = {"nodes":[]} %}
{% for node_name in groups.master+groups.workers %}
{% set master_or_empty = groups.master if use_master_as_compute else [] %}
{% for node_name in master_or_empty +groups.workers %}
{% set node = hostvars[node_name] %}
{% set mem = node.flavor.ram // 1024 * 1000 %}
{% if node.cloud_identifier not in sl %}
Expand Down
Loading
Loading