From 6ab1d42c0cfb1b426378d67196697a1dfbdcd8fd Mon Sep 17 00:00:00 2001 From: Predrag Rogic Date: Thu, 12 Dec 2024 19:14:28 +0000 Subject: [PATCH] fix timeout when stopping KVM machine with CRI-O container runtime (#19758) * fix timeout when stopping KVM machine with CRI-O container runtime * Update pkg/drivers/kvm/kvm.go Co-authored-by: Steven Powell <44844360+spowelljr@users.noreply.github.com> * Update pkg/drivers/kvm/kvm.go Co-authored-by: Steven Powell <44844360+spowelljr@users.noreply.github.com> * improve error message * Update pkg/drivers/kvm/kvm.go Co-authored-by: Steven Powell <44844360+spowelljr@users.noreply.github.com> --------- Co-authored-by: Steven Powell <44844360+spowelljr@users.noreply.github.com> Co-authored-by: Medya Ghazizadeh --- pkg/drivers/kvm/domain.go | 10 +- pkg/drivers/kvm/kvm.go | 337 ++++++++++++++++++++++++------------- pkg/drivers/kvm/network.go | 18 +- 3 files changed, 238 insertions(+), 127 deletions(-) diff --git a/pkg/drivers/kvm/domain.go b/pkg/drivers/kvm/domain.go index 6a1589e74a92..15af5a2ec130 100644 --- a/pkg/drivers/kvm/domain.go +++ b/pkg/drivers/kvm/domain.go @@ -31,7 +31,7 @@ import ( func (d *Driver) getDomain() (*libvirt.Domain, *libvirt.Connect, error) { conn, err := getConnection(d.ConnectionURI) if err != nil { - return nil, nil, errors.Wrap(err, "getting domain") + return nil, nil, errors.Wrap(err, "getting libvirt connection") } dom, err := conn.LookupDomainByName(d.MachineName) @@ -71,9 +71,13 @@ func (d *Driver) createDomain() (*libvirt.Domain, error) { } conn, err := getConnection(d.ConnectionURI) if err != nil { - return nil, errors.Wrap(err, "error getting libvirt connection") + return nil, errors.Wrap(err, "getting libvirt connection") } - defer conn.Close() + defer func() { + if _, err := conn.Close(); err != nil { + log.Errorf("unable to close libvirt connection: %v", err) + } + }() log.Infof("define libvirt domain using xml: %v", domainXML.String()) // define the domain in libvirt using the generated XML diff --git a/pkg/drivers/kvm/kvm.go b/pkg/drivers/kvm/kvm.go index 00220e6827ec..63c628649926 100644 --- a/pkg/drivers/kvm/kvm.go +++ b/pkg/drivers/kvm/kvm.go @@ -117,35 +117,19 @@ func NewDriver(hostName, storePath string) *Driver { } } -// PreCommandCheck checks the connection before issuing a command -func (d *Driver) PreCommandCheck() error { - conn, err := getConnection(d.ConnectionURI) - if err != nil { - return errors.Wrap(err, "error connecting to libvirt socket. Have you added yourself to the libvirtd group?") - } - defer conn.Close() - - libVersion, err := conn.GetLibVersion() - if err != nil { - return errors.Wrap(err, "getting libvirt version") - } - log.Debugf("Using libvirt version %d", libVersion) - - return nil -} - // GetURL returns a Docker URL inside this host // e.g. tcp://1.2.3.4:2376 // more info https://github.com/docker/machine/blob/b170508bf44c3405e079e26d5fdffe35a64c6972/libmachine/provision/utils.go#L159_L175 func (d *Driver) GetURL() (string, error) { if err := d.PreCommandCheck(); err != nil { - return "", errors.Wrap(err, "getting URL, precheck failed") + return "", errors.Wrap(err, "prechecking") } ip, err := d.GetIP() if err != nil { - return "", errors.Wrap(err, "getting URL, could not get IP") + return "", errors.Wrap(err, "getting domain IP") } + if ip == "" { return "", nil } @@ -153,15 +137,37 @@ func (d *Driver) GetURL() (string, error) { return fmt.Sprintf("tcp://%s:2376", ip), nil } +// PreCommandCheck checks the connection before issuing a command +func (d *Driver) PreCommandCheck() error { + conn, err := getConnection(d.ConnectionURI) + if err != nil { + return errors.Wrap(err, "getting libvirt connection") + } + defer func() { + if _, err := conn.Close(); err != nil { + log.Errorf("unable to close libvirt connection: %v", err) + } + }() + + libVersion, err := conn.GetLibVersion() + if err != nil { + return errors.Wrap(err, "getting libvirt version") + } + + log.Debugf("using libvirt version %d", libVersion) + + return nil +} + // GetState returns the state that the host is in (running, stopped, etc) -func (d *Driver) GetState() (st state.State, err error) { +func (d *Driver) GetState() (state.State, error) { dom, conn, err := d.getDomain() if err != nil { - return state.None, errors.Wrap(err, "getting connection") + return state.None, errors.Wrap(err, "getting domain") } defer func() { - if ferr := closeDomain(dom, conn); ferr != nil { - err = ferr + if err := closeDomain(dom, conn); err != nil { + log.Errorf("unable to close domain: %v", err) } }() @@ -169,8 +175,8 @@ func (d *Driver) GetState() (st state.State, err error) { if err != nil { return state.None, errors.Wrap(err, "getting domain state") } - st = machineState(lvs) - return // st, err + + return machineState(lvs), nil } // machineState converts libvirt state to libmachine state @@ -210,17 +216,22 @@ func machineState(lvs libvirt.DomainState) state.State { func (d *Driver) GetIP() (string, error) { s, err := d.GetState() if err != nil { - return "", errors.Wrap(err, "machine in unknown state") + return "", errors.Wrap(err, "getting domain state") } + if s != state.Running { - return "", errors.New("host is not running") + return "", errors.New("domain is not running") } conn, err := getConnection(d.ConnectionURI) if err != nil { return "", errors.Wrap(err, "getting libvirt connection") } - defer conn.Close() + defer func() { + if _, err := conn.Close(); err != nil { + log.Errorf("unable to close libvirt connection: %v", err) + } + }() return ipFromXML(conn, d.MachineName, d.PrivateNetwork) } @@ -236,57 +247,79 @@ func (d *Driver) DriverName() string { } // Kill stops a host forcefully, including any containers that we are managing. -func (d *Driver) Kill() (err error) { +func (d *Driver) Kill() error { + s, err := d.GetState() + if err != nil { + return errors.Wrap(err, "getting domain state") + } + + if s == state.Stopped { + return nil + } + + log.Info("killing domain...") + dom, conn, err := d.getDomain() if err != nil { - return errors.Wrap(err, "getting connection") + return errors.Wrap(err, "getting domain") } defer func() { - if ferr := closeDomain(dom, conn); ferr != nil { - err = ferr + if err := closeDomain(dom, conn); err != nil { + log.Errorf("unable to close domain: %v", err) } }() + + // ref: https://libvirt.org/html/libvirt-libvirt-domain.html#virDomainDestroy + // "virDomainDestroy first requests that a guest terminate (e.g. SIGTERM), then waits for it to comply. + // After a reasonable timeout, if the guest still exists, virDomainDestroy will forcefully terminate the guest (e.g. SIGKILL) + // if necessary (which may produce undesirable results, for example unflushed disk cache in the guest). + // To avoid this possibility, it's recommended to instead call virDomainDestroyFlags, sending the VIR_DOMAIN_DESTROY_GRACEFUL flag." + // ref: https://libvirt.org/html/libvirt-libvirt-domain.html#virDomainDestroyFlags + // ref: https://libvirt.org/html/libvirt-libvirt-domain.html#virDomainDestroyFlagsValues + // we're using default virDomainDestroyFlags - ie, VIR_DOMAIN_DESTROY_DEFAULT (0), the "Default behavior - could lead to data loss!!" return dom.Destroy() } // Restart a host func (d *Driver) Restart() error { + log.Info("restarting domain...") return pkgdrivers.Restart(d) } // Start a host -func (d *Driver) Start() (err error) { +func (d *Driver) Start() error { + log.Info("starting domain...") + // this call ensures that all networks are active - log.Info("Ensuring networks are active...") - err = d.ensureNetwork() - if err != nil { + log.Info("ensuring networks are active...") + if err := d.ensureNetwork(); err != nil { return errors.Wrap(err, "ensuring active networks") } - log.Info("Getting domain xml...") + log.Info("getting domain XML...") dom, conn, err := d.getDomain() if err != nil { - return errors.Wrap(err, "getting connection") + return errors.Wrap(err, "getting domain XML") } defer func() { - if ferr := closeDomain(dom, conn); ferr != nil { - err = ferr + if err := closeDomain(dom, conn); err != nil { + log.Errorf("unable to close domain: %v", err) } }() - log.Info("Creating domain...") + log.Info("creating domain...") if err := dom.Create(); err != nil { - return errors.Wrap(err, "error creating VM") + return errors.Wrap(err, "creating domain") } - log.Info("Waiting to get IP...") + log.Info("waiting for IP...") if err := d.waitForStaticIP(conn); err != nil { - return errors.Wrap(err, "IP not available after waiting") + return errors.Wrap(err, "waiting for IP") } - log.Info("Waiting for SSH to be available...") + log.Info("waiting for SSH...") if err := drivers.WaitForSSH(d); err != nil { - return errors.Wrap(err, "SSH not available after waiting") + return errors.Wrap(err, "waiting for SSH") } return nil @@ -297,44 +330,48 @@ func (d *Driver) waitForStaticIP(conn *libvirt.Connect) error { query := func() error { sip, err := ipFromAPI(conn, d.MachineName, d.PrivateNetwork) if err != nil { - return fmt.Errorf("failed getting IP during machine start, will retry: %w", err) + return errors.Wrap(err, "getting domain IP, will retry") } + if sip == "" { - return fmt.Errorf("waiting for machine to come up") + return errors.New("waiting for domain to come up") } - log.Infof("Found IP for machine: %s", sip) + log.Infof("found domain IP: %s", sip) d.IPAddress = sip return nil } if err := retry.Local(query, 1*time.Minute); err != nil { - return fmt.Errorf("machine %s didn't return IP after 1 minute", d.MachineName) + return fmt.Errorf("domain %s didn't return IP after 1 minute", d.MachineName) } - log.Info("Reserving static IP address...") + log.Info("reserving static IP address...") if err := addStaticIP(conn, d.PrivateNetwork, d.MachineName, d.PrivateMAC, d.IPAddress); err != nil { - log.Warnf("Failed reserving static IP %s for host %s, will continue anyway: %v", d.IPAddress, d.MachineName, err) + log.Warnf("failed reserving static IP address %s for domain %s, will continue anyway: %v", d.IPAddress, d.MachineName, err) } else { - log.Infof("Reserved static IP address: %s", d.IPAddress) + log.Infof("reserved static IP address %s for domain %s", d.IPAddress, d.MachineName) } return nil } // Create a host using the driver's config -func (d *Driver) Create() (err error) { - log.Info("Creating KVM machine...") - err = d.createNetwork() - if err != nil { +func (d *Driver) Create() error { + log.Info("creating KVM machine...") + + log.Info("creating network...") + if err := d.createNetwork(); err != nil { return errors.Wrap(err, "creating network") } + if d.GPU { - log.Info("Creating devices...") - d.DevicesXML, err = getDevicesXML() + log.Info("getting devices XML...") + xml, err := getDevicesXML() if err != nil { - return errors.Wrap(err, "creating devices") + return errors.Wrap(err, "getting devices XML") } + d.DevicesXML = xml } if d.NUMANodeCount > 1 { @@ -346,22 +383,22 @@ func (d *Driver) Create() (err error) { } store := d.ResolveStorePath(".") - log.Infof("Setting up store path in %s ...", store) + log.Infof("setting up store path in %s ...", store) // 0755 because it must be accessible by libvirt/qemu across a variety of configs if err := os.MkdirAll(store, 0755); err != nil { return errors.Wrap(err, "creating store") } - log.Infof("Building disk image from %s", d.Boot2DockerURL) - if err = pkgdrivers.MakeDiskImage(d.BaseDriver, d.Boot2DockerURL, d.DiskSize); err != nil { - return errors.Wrap(err, "error creating disk") + log.Infof("building disk image from %s", d.Boot2DockerURL) + if err := pkgdrivers.MakeDiskImage(d.BaseDriver, d.Boot2DockerURL, d.DiskSize); err != nil { + return errors.Wrap(err, "creating disk") } if d.ExtraDisks > 20 { // Limiting the number of disks to 20 arbitrarily. If more disks are // needed, the logical name generation has to changed to create them if // the form hdaa, hdab, etc - return errors.Wrap(err, "cannot create more than 20 extra disks") + return errors.New("cannot create more than 20 extra disks") } for i := 0; i < d.ExtraDisks; i++ { diskpath := pkgdrivers.ExtraDiskPath(d.BaseDriver, i) @@ -381,22 +418,23 @@ func (d *Driver) Create() (err error) { log.Errorf("unable to ensure permissions on %s: %v", store, err) } - log.Info("Creating domain...") + log.Info("creating domain...") + dom, err := d.createDomain() if err != nil { return errors.Wrap(err, "creating domain") } defer func() { - if ferr := dom.Free(); ferr != nil { - log.Warnf("unable to free domain: %v", err) - err = ferr + if err := dom.Free(); err != nil { + log.Errorf("unable to free domain: %v", err) } }() - if err = d.Start(); err != nil { - log.Errorf("unable to start VM: %v", err) - return err + + if err := d.Start(); err != nil { + return errors.Wrap(err, "starting domain") } - log.Infof("KVM machine creation complete!") + + log.Infof("KVM machine creation complete") return nil } @@ -405,19 +443,22 @@ func ensureDirPermissions(store string) error { // traverse upwards from /home/user/.minikube/machines to ensure // that libvirt/qemu has execute access for dir := store; dir != "/"; dir = filepath.Dir(dir) { - log.Debugf("Checking permissions on dir: %s", dir) + log.Debugf("checking permissions on dir: %s", dir) + s, err := os.Stat(dir) if err != nil { return err } + owner := int(s.Sys().(*syscall.Stat_t).Uid) if owner != os.Geteuid() { - log.Debugf("Skipping %s - not owner", dir) + log.Debugf("skipping %s - not owner", dir) continue } + mode := s.Mode() if mode&0011 != 1 { - log.Infof("Setting executable bit set on %s (perms=%s)", dir, mode) + log.Infof("setting executable bit set on %s (perms=%s)", dir, mode) mode |= 0011 if err := os.Chmod(dir, mode); err != nil { return err @@ -427,100 +468,154 @@ func ensureDirPermissions(store string) error { return nil } -// Stop a host gracefully -func (d *Driver) Stop() (err error) { +// Stop a host gracefully or forcefully otherwise. +func (d *Driver) Stop() error { s, err := d.GetState() if err != nil { - return errors.Wrap(err, "getting state of VM") + return errors.Wrap(err, "getting domain state") } - if s != state.Stopped { - dom, conn, err := d.getDomain() - defer func() { - if ferr := closeDomain(dom, conn); ferr != nil { - err = ferr - } - }() - if err != nil { - return errors.Wrap(err, "getting connection") + if s == state.Stopped { + return nil + } + + log.Info("stopping domain...") + + dom, conn, err := d.getDomain() + if err != nil { + return errors.Wrap(err, "getting domain") + } + defer func() { + if err := closeDomain(dom, conn); err != nil { + log.Errorf("unable to close domain: %v", err) } + }() - err = dom.Shutdown() - if err != nil { - return errors.Wrap(err, "stopping vm") + log.Info("gracefully shutting down domain...") + + // ref: https://libvirt.org/html/libvirt-libvirt-domain.html#virDomainShutdownFlags + // note: "The order in which the hypervisor tries each shutdown method is undefined, and a hypervisor is not required to support all methods." + // so we skip "VIR_DOMAIN_SHUTDOWN_DEFAULT" - the "hypervisor choice" and use bitwise-OR of available virDomainShutdownFlagValues + // ref: https://libvirt.org/html/libvirt-libvirt-domain.html#virDomainShutdownFlagValues + // ref: https://bugzilla.redhat.com/show_bug.cgi?id=1744156 + // "libvirt's QEMU driver, used to manage KVM guests, only supports the 'agent' and 'acpi' reboot/shutdown modes because those are the ones QEMU itself supports" + // note: we don't install/use agent + if err := dom.ShutdownFlags(libvirt.DOMAIN_SHUTDOWN_ACPI_POWER_BTN); err != nil { + log.Warnf("setting virDomainShutdownFlags failed, will continue anyway with defaults: %v", err) + if err := dom.ShutdownFlags(libvirt.DOMAIN_SHUTDOWN_DEFAULT); err != nil { + log.Errorf("resetting virDomainShutdownFlags to defaults failed, will continue anyway: %v", err) + } + } + + if err := dom.Shutdown(); err != nil { + return errors.Wrap(err, "gracefully shutting down domain") + } + + if s, err = d.waitForStopState(90, "graceful shutdown"); err == nil { + log.Info("domain gracefully shutdown") + return nil + } + + // could not get domain state + if s == state.None { + return err + } + + // at this point shutdown failed, so we try with a little bit of force + log.Warn("waiting for domain graceful shutdown failed, will try to force-stop") + if err := d.Kill(); err != nil { + log.Warnf("force-stopping domain request failed: %v", err) + } + + if s, err := d.waitForStopState(30, "force-stop"); err != nil { + return fmt.Errorf("unable to stop domain %s, current state is %q", d.MachineName, s.String()) + } + + log.Info("domain force-stopped") + + return nil +} + +// waitForStopState waits maxsec for the domain to reach a stopped state. +func (d *Driver) waitForStopState(maxsec int, method string) (state.State, error) { + var s state.State + var err error + for i := 0; i < maxsec; i++ { + if s, err = d.GetState(); err != nil { + return s, errors.Wrap(err, "getting domain state") } - maxsec := 120 - for i := 0; i < maxsec; i++ { - s, err := d.GetState() - if err != nil { - return errors.Wrap(err, "error getting state of VM") - } - if s == state.Stopped { - return nil - } - log.Infof("Waiting for machine to stop %d/%d", i, maxsec) - time.Sleep(time.Second) + if s == state.Stopped { + return state.Stopped, nil } + log.Infof("waiting for domain %s %d/%d", method, i, maxsec) + time.Sleep(1 * time.Second) } - return fmt.Errorf("unable to stop vm, current state %q", s.String()) + return s, fmt.Errorf("timed out waiting for domain %s, current state is %q", method, s) } // Remove a host func (d *Driver) Remove() error { - log.Debug("Removing machine...") + log.Info("removing KVM machine...") + conn, err := getConnection(d.ConnectionURI) if err != nil { - return errors.Wrap(err, "getting connection") + return errors.Wrap(err, "getting libvirt connection") } - defer conn.Close() + defer func() { + if _, err := conn.Close(); err != nil { + log.Errorf("unable to close libvirt connection: %v", err) + } + }() // Tear down network if it exists and is not in use by another minikube instance - log.Debug("Trying to delete the networks (if possible)") + log.Info("deleting networks...") if err := d.deleteNetwork(); err != nil { - log.Warnf("Deleting of networks failed: %v", err) + log.Errorf("deleting networks failed, will continue anyway: %v", err) } else { - log.Info("Successfully deleted networks") + log.Info("successfully deleted networks") } // Tear down the domain now - log.Debug("Checking if the domain needs to be deleted") + log.Info("checking if the domain needs to be deleted") dom, err := conn.LookupDomainByName(d.MachineName) if err != nil { - log.Warnf("Domain %s does not exist, nothing to clean up...", d.MachineName) + log.Warnf("domain %s does not exist, nothing to clean up...", d.MachineName) return nil } - log.Infof("Domain %s exists, removing...", d.MachineName) + log.Infof("domain %s exists, removing...", d.MachineName) if err := d.destroyRunningDomain(dom); err != nil { return errors.Wrap(err, "destroying running domain") } if err := d.undefineDomain(conn, dom); err != nil { - return errors.Wrap(err, "undefine domain") + return errors.Wrap(err, "undefining domain") } - log.Info("Removing static IP address...") + log.Info("removing static IP address...") if err := delStaticIP(conn, d.PrivateNetwork, "", "", d.IPAddress); err != nil { - log.Warnf("failed removing static IP %s for host %s, will continue anyway: %v", d.IPAddress, d.MachineName, err) + log.Warnf("failed removing static IP address %s for domain %s, will continue anyway: %v", d.IPAddress, d.MachineName, err) } else { - log.Info("Removed static IP address") + log.Infof("removed static IP address %s for domain %s", d.IPAddress, d.MachineName) } + log.Infof("KVM machine removal complete") + return nil } func (d *Driver) destroyRunningDomain(dom *libvirt.Domain) error { - state, _, err := dom.GetState() + lvs, _, err := dom.GetState() if err != nil { return errors.Wrap(err, "getting domain state") } // if the domain is not running, we don't destroy it - if state != libvirt.DOMAIN_RUNNING { - log.Warnf("Domain %s already destroyed, skipping...", d.MachineName) + if machineState(lvs) != state.Running { + log.Warnf("domain %s already destroyed, skipping...", d.MachineName) return nil } @@ -530,7 +625,7 @@ func (d *Driver) destroyRunningDomain(dom *libvirt.Domain) error { func (d *Driver) undefineDomain(conn *libvirt.Connect, dom *libvirt.Domain) error { definedDomains, err := conn.ListDefinedDomains() if err != nil { - return errors.Wrap(err, "list defined domains") + return errors.Wrap(err, "listing domains") } var found bool @@ -542,7 +637,7 @@ func (d *Driver) undefineDomain(conn *libvirt.Connect, dom *libvirt.Domain) erro } if !found { - log.Warnf("Domain %s not defined, skipping undefine...", d.MachineName) + log.Warnf("domain %s not defined, skipping undefine...", d.MachineName) return nil } diff --git a/pkg/drivers/kvm/network.go b/pkg/drivers/kvm/network.go index 8ee6ad8adc1f..66e947ed5f29 100644 --- a/pkg/drivers/kvm/network.go +++ b/pkg/drivers/kvm/network.go @@ -118,7 +118,11 @@ func (d *Driver) ensureNetwork() error { if err != nil { return errors.Wrap(err, "getting libvirt connection") } - defer conn.Close() + defer func() { + if _, err := conn.Close(); err != nil { + log.Errorf("unable to close libvirt connection: %v", err) + } + }() // network: default @@ -162,7 +166,11 @@ func (d *Driver) createNetwork() error { if err != nil { return errors.Wrap(err, "getting libvirt connection") } - defer conn.Close() + defer func() { + if _, err := conn.Close(); err != nil { + log.Errorf("unable to close libvirt connection: %v", err) + } + }() // network: default // It is assumed that the libvirt/kvm installation has already created this network @@ -240,7 +248,11 @@ func (d *Driver) deleteNetwork() error { if err != nil { return errors.Wrap(err, "getting libvirt connection") } - defer conn.Close() + defer func() { + if _, err := conn.Close(); err != nil { + log.Errorf("unable to close libvirt connection: %v", err) + } + }() // network: default // It is assumed that the OS manages this network