From 811e287861530559742cf58ab3c026c56ea1b265 Mon Sep 17 00:00:00 2001 From: dweinholz Date: Wed, 3 Apr 2024 08:05:22 +0200 Subject: [PATCH 1/2] fix(Scaling):fixed ep --- .../openstack_connector/openstack_connector.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/simple_vm_client/openstack_connector/openstack_connector.py b/simple_vm_client/openstack_connector/openstack_connector.py index 9959e4d..c9218ff 100644 --- a/simple_vm_client/openstack_connector/openstack_connector.py +++ b/simple_vm_client/openstack_connector/openstack_connector.py @@ -527,6 +527,7 @@ def get_image( replace_inactive: bool = False, ignore_not_active: bool = False, ignore_not_found: bool = False, + replace_not_found: bool = False, ) -> Image: logger.info(f"Get Image {name_or_id}") @@ -535,6 +536,14 @@ def get_image( raise ImageNotFoundException( message=f"Image {name_or_id} not found!", name_or_id=name_or_id ) + elif not image and replace_not_found: + for version in ["18.04", "20.04", "22.04", "1804", "2004", "2204"]: + if version in name_or_id: + image = self.get_active_image_by_os_version( + os_version=version, os_distro="ubuntu" + ) + break + elif image and image.status != "active" and replace_inactive: metadata = image.get("metadata", None) image_os_version = metadata.get("os_version", None) @@ -1214,7 +1223,10 @@ def start_server( key_name: str = None # type: ignore try: image: Image = self.get_image( - name_or_id=image_name, replace_inactive=True, ignore_not_found=True + name_or_id=image_name, + replace_not_found=True, + replace_inactive=True, + ignore_not_found=True, ) flavor: Flavor = self.get_flavor(name_or_id=flavor_name) network: Network = self.get_network() From 649c3d407dc473757950939bc657f5632f398ea7 Mon Sep 17 00:00:00 2001 From: dweinholz Date: Wed, 3 Apr 2024 08:46:14 +0200 Subject: [PATCH 2/2] feat(Cluster):added slurm version --- portal_client.thrift | 1 + simple_vm_client/VirtualMachineService-remote | 7 +- simple_vm_client/VirtualMachineService.py | 35 +++++++++ .../openstack_connector.py | 75 ++++++++++++++----- 4 files changed, 95 insertions(+), 23 deletions(-) diff --git a/portal_client.thrift b/portal_client.thrift index 830cec9..f650dfc 100644 --- a/portal_client.thrift +++ b/portal_client.thrift @@ -443,6 +443,7 @@ service VirtualMachineService { 9:list additional_keys, 10:optional string research_environment 11:optional list additional_security_group_ids, + 12:optional string slurm_version, ) diff --git a/simple_vm_client/VirtualMachineService-remote b/simple_vm_client/VirtualMachineService-remote index 7189003..e7c2706 100755 --- a/simple_vm_client/VirtualMachineService-remote +++ b/simple_vm_client/VirtualMachineService-remote @@ -53,7 +53,7 @@ if len(sys.argv) <= 1 or sys.argv[1] == "--help": print(" void delete_security_group_rule(string openstack_id)") print(" void delete_server(string openstack_id)") print( - " string start_server(string flavor_name, string image_name, string public_key, string servername, metadata, volume_ids_path_new, volume_ids_path_attach, additional_keys, string research_environment, additional_security_group_ids)" + " string start_server(string flavor_name, string image_name, string public_key, string servername, metadata, volume_ids_path_new, volume_ids_path_attach, additional_keys, string research_environment, additional_security_group_ids, string slurm_version)" ) print(" bool is_bibigrid_available()") print(" void detach_ip_from_server(string server_id, string floating_ip)") @@ -389,8 +389,8 @@ elif cmd == "delete_server": ) elif cmd == "start_server": - if len(args) != 10: - print("start_server requires 10 args") + if len(args) != 11: + print("start_server requires 11 args") sys.exit(1) pp.pprint( client.start_server( @@ -404,6 +404,7 @@ elif cmd == "start_server": eval(args[7]), args[8], eval(args[9]), + args[10], ) ) diff --git a/simple_vm_client/VirtualMachineService.py b/simple_vm_client/VirtualMachineService.py index e21b6b9..8920e40 100644 --- a/simple_vm_client/VirtualMachineService.py +++ b/simple_vm_client/VirtualMachineService.py @@ -202,6 +202,7 @@ def start_server( additional_keys, research_environment, additional_security_group_ids, + slurm_version, ): """ Parameters: @@ -215,6 +216,7 @@ def start_server( - additional_keys - research_environment - additional_security_group_ids + - slurm_version """ @@ -1458,6 +1460,7 @@ def start_server( additional_keys, research_environment, additional_security_group_ids, + slurm_version, ): """ Parameters: @@ -1471,6 +1474,7 @@ def start_server( - additional_keys - research_environment - additional_security_group_ids + - slurm_version """ self.send_start_server( @@ -1484,6 +1488,7 @@ def start_server( additional_keys, research_environment, additional_security_group_ids, + slurm_version, ) return self.recv_start_server() @@ -1499,6 +1504,7 @@ def send_start_server( additional_keys, research_environment, additional_security_group_ids, + slurm_version, ): self._oprot.writeMessageBegin("start_server", TMessageType.CALL, self._seqid) args = start_server_args() @@ -1512,6 +1518,7 @@ def send_start_server( args.additional_keys = additional_keys args.research_environment = research_environment args.additional_security_group_ids = additional_security_group_ids + args.slurm_version = slurm_version args.write(self._oprot) self._oprot.writeMessageEnd() self._oprot.trans.flush() @@ -4216,6 +4223,7 @@ def process_start_server(self, seqid, iprot, oprot): args.additional_keys, args.research_environment, args.additional_security_group_ids, + args.slurm_version, ) msg_type = TMessageType.REPLY except TTransport.TTransportException: @@ -9168,6 +9176,7 @@ class start_server_args(object): - additional_keys - research_environment - additional_security_group_ids + - slurm_version """ @@ -9183,6 +9192,7 @@ def __init__( additional_keys=None, research_environment=None, additional_security_group_ids=None, + slurm_version=None, ): self.flavor_name = flavor_name self.image_name = image_name @@ -9194,6 +9204,7 @@ def __init__( self.additional_keys = additional_keys self.research_environment = research_environment self.additional_security_group_ids = additional_security_group_ids + self.slurm_version = slurm_version def read(self, iprot): if ( @@ -9348,6 +9359,15 @@ def read(self, iprot): iprot.readListEnd() else: iprot.skip(ftype) + elif fid == 12: + if ftype == TType.STRING: + self.slurm_version = ( + iprot.readString().decode("utf-8", errors="replace") + if sys.version_info[0] == 2 + else iprot.readString() + ) + else: + iprot.skip(ftype) else: iprot.skip(ftype) iprot.readFieldEnd() @@ -9468,6 +9488,14 @@ def write(self, oprot): ) oprot.writeListEnd() oprot.writeFieldEnd() + if self.slurm_version is not None: + oprot.writeFieldBegin("slurm_version", TType.STRING, 12) + oprot.writeString( + self.slurm_version.encode("utf-8") + if sys.version_info[0] == 2 + else self.slurm_version + ) + oprot.writeFieldEnd() oprot.writeFieldStop() oprot.writeStructEnd() @@ -9559,6 +9587,13 @@ def __ne__(self, other): (TType.STRING, "UTF8", False), None, ), # 11 + ( + 12, + TType.STRING, + "slurm_version", + "UTF8", + None, + ), # 12 ) diff --git a/simple_vm_client/openstack_connector/openstack_connector.py b/simple_vm_client/openstack_connector/openstack_connector.py index c9218ff..034fd84 100644 --- a/simple_vm_client/openstack_connector/openstack_connector.py +++ b/simple_vm_client/openstack_connector/openstack_connector.py @@ -501,12 +501,10 @@ def get_servers_by_bibigrid_id(self, bibigrid_id: str) -> list[Server]: def get_active_image_by_os_version(self, os_version: str, os_distro: str) -> Image: logger.info(f"Get active Image by os-version: {os_version}") images = self.openstack_connection.list_images() - for img in images: - image: Image = img - metadata = image["metadata"] - image_os_version = metadata.get("os_version", None) - image_os_distro = metadata.get("os_distro", None) - base_image_ref = metadata.get("base_image_ref", None) + for image in images: + image_os_version = image.get("os_version", None) + image_os_distro = image.get("os_distro", None) + base_image_ref = image.get("properties", {}).get("base_image_ref", None) if ( os_version == image_os_version and image.status == "active" @@ -521,13 +519,35 @@ def get_active_image_by_os_version(self, os_version: str, os_distro: str) -> Ima name_or_id="", ) + def get_active_image_by_os_version_and_slurm_version( + self, os_version, os_distro, slurm_version + ) -> Image: + logger.info( + f"Get active Image by os-version: {os_version} and slurm_version {slurm_version}" + ) + images = self.openstack_connection.list_images() + backup_image = None + for image in images: + if image and image.status == "active": + image_os_version = image.get("os_version", None) + image_os_distro = image.get("os_distro", None) + properties = image.get("properties", None) + if os_version == image_os_version and "worker" in image.get("tags", []): + if os_distro and os_distro == image_os_distro: + backup_image = image + if properties.get("slurm_version" == slurm_version): + return image + + return backup_image + def get_image( self, name_or_id: str, replace_inactive: bool = False, ignore_not_active: bool = False, - ignore_not_found: bool = False, replace_not_found: bool = False, + ignore_not_found: bool = False, + slurm_version: str = None, ) -> Image: logger.info(f"Get Image {name_or_id}") @@ -536,21 +556,33 @@ def get_image( raise ImageNotFoundException( message=f"Image {name_or_id} not found!", name_or_id=name_or_id ) - elif not image and replace_not_found: - for version in ["18.04", "20.04", "22.04", "1804", "2004", "2204"]: + elif image is None and replace_not_found: + for version in ["20.04", "22.04", "2004", "2204"]: if version in name_or_id: - image = self.get_active_image_by_os_version( - os_version=version, os_distro="ubuntu" - ) - break + if slurm_version: + image = self.get_active_image_by_os_version_and_slurm_version( + os_version=version, + os_distro="ubuntu", + slurm_version=slurm_version, + ) + else: + image = self.get_active_image_by_os_version( + os_version=version, os_distro="ubuntu" + ) elif image and image.status != "active" and replace_inactive: - metadata = image.get("metadata", None) - image_os_version = metadata.get("os_version", None) - image_os_distro = metadata.get("os_distro", None) - image = self.get_active_image_by_os_version( - os_version=image_os_version, os_distro=image_os_distro - ) + image_os_version = image["os_version"] + image_os_distro = image["os_distro"] + if slurm_version: + image = self.get_active_image_by_os_version_and_slurm_version( + os_version=image_os_version, + os_distro=image_os_distro, + slurm_version=slurm_version, + ) + else: + image = self.get_active_image_by_os_version( + os_version=image_os_version, os_distro=image_os_distro + ) elif image and image.status != "active" and not ignore_not_active: raise ImageNotFoundException( message=f"Image {name_or_id} found but not active!", @@ -1217,16 +1249,19 @@ def start_server( volume_ids_path_attach: Union[list[dict[str, str]], None] = None, additional_keys: Union[list[str], None] = None, additional_security_group_ids: Union[list[str], None] = None, + slurm_version: str = None, ) -> str: logger.info(f"Start Server {servername}") key_name: str = None # type: ignore try: + image: Image = self.get_image( name_or_id=image_name, - replace_not_found=True, replace_inactive=True, ignore_not_found=True, + replace_not_found=True, + slurm_version=slurm_version, ) flavor: Flavor = self.get_flavor(name_or_id=flavor_name) network: Network = self.get_network()