From 0e62d0e08ec924dcb41ddb942d305f166dac7c2f Mon Sep 17 00:00:00 2001 From: Thomas Parrott Date: Tue, 17 Oct 2023 21:46:02 +0100 Subject: [PATCH] lxd-agent: Fixes vsock listener restart on boot due to vsock module not being fully initialised Sometimes the initial vsock context ID was reported as 4294967295 (equivalent to VMADDR_CID_ANY), and a few seconds later the valid context ID for the VM was returned. This caused the vsock listener to be restarted 30s after the VM booted, which terminated any exec sessions that were in progress at the time. This fixes the issue by waiting for a valid vsock context ID and ignoring the value 4294967295. Signed-off-by: Thomas Parrott --- lxd-agent/api_1.0.go | 33 +++++++++++++++++++++++++++++++-- lxd-agent/daemon.go | 4 ---- lxd-agent/main_agent.go | 29 ++++++++++++++++++----------- 3 files changed, 49 insertions(+), 17 deletions(-) diff --git a/lxd-agent/api_1.0.go b/lxd-agent/api_1.0.go index fdbd559c803a..561aaa50a6af 100644 --- a/lxd-agent/api_1.0.go +++ b/lxd-agent/api_1.0.go @@ -1,12 +1,14 @@ package main import ( + "context" "encoding/json" "errors" "fmt" "io" "net/http" "os" + "time" "github.com/mdlayher/vsock" @@ -191,14 +193,41 @@ func getClient(CID uint32, port int, serverCertificate string) (*http.Client, er return client, nil } +// waitVsockContextID checks for valid local context ID and returns it. +// If no valid context ID has been ascertained when the context is cancelled, the last error is returned. +func waitVsockContextID(ctx context.Context) (uint32, error) { + const CIDAny uint32 = 4294967295 // Equivalent to VMADDR_CID_ANY. + + for { + cid, err := vsock.ContextID() + if cid == CIDAny { + // Ignore VMADDR_CID_ANY as this seems to indicate the vsock module is still initialising. 
+ err = fmt.Errorf("Invalid context ID %d", cid) + } else if err == nil { + return cid, nil + } + + ctxErr := ctx.Err() + if ctxErr != nil { + if err != nil { + return 0, err + } + + return 0, ctxErr + } + + time.Sleep(time.Second) + } +} + func startHTTPServer(d *Daemon, debug bool) error { // Setup the listener on VM's context ID for inbound connections from LXD. - l, err := vsock.Listen(shared.HTTPSDefaultPort, nil) + l, err := vsock.ListenContextID(d.localCID, shared.HTTPSDefaultPort, nil) if err != nil { return fmt.Errorf("Failed to listen on vsock: %w", err) } - logger.Info("Started vsock listener") + logger.Info("Started vsock listener", logger.Ctx{"contextID": d.localCID}) // Load the expected server certificate. cert, err := shared.ReadCert("server.crt") diff --git a/lxd-agent/daemon.go b/lxd-agent/daemon.go index 9b99ce8a6e5b..6767f873ad1c 100644 --- a/lxd-agent/daemon.go +++ b/lxd-agent/daemon.go @@ -4,7 +4,6 @@ import ( "sync" "github.com/canonical/lxd/lxd/events" - "github.com/canonical/lxd/lxd/vsock" ) // A Daemon can respond to requests from a shared client. 
@@ -31,11 +30,8 @@ type Daemon struct { func newDaemon(debug, verbose bool) *Daemon { lxdEvents := events.NewServer(debug, verbose, nil) - cid, _ := vsock.ContextID() - return &Daemon{ events: lxdEvents, chConnected: make(chan struct{}), - localCID: cid, } } diff --git a/lxd-agent/main_agent.go b/lxd-agent/main_agent.go index 15f109c8cf35..130e4e181cf9 100644 --- a/lxd-agent/main_agent.go +++ b/lxd-agent/main_agent.go @@ -18,7 +18,6 @@ import ( "github.com/canonical/lxd/lxd/instance/instancetype" "github.com/canonical/lxd/lxd/storage/filesystem" "github.com/canonical/lxd/lxd/util" - "github.com/canonical/lxd/lxd/vsock" "github.com/canonical/lxd/shared" "github.com/canonical/lxd/shared/logger" ) @@ -130,6 +129,16 @@ func (c *cmdAgent) Run(cmd *cobra.Command, args []string) error { d := newDaemon(c.global.flagLogDebug, c.global.flagLogVerbose) + // Wait up to 30s to get a valid local vsock context ID. + ctx, cancel := context.WithTimeout(context.Background(), time.Second*30) + d.localCID, err = waitVsockContextID(ctx) + if err != nil { + cancel() + return fmt.Errorf("Failed getting vsock context ID: %w", err) + } + + cancel() + // Start the server. err = startHTTPServer(d, c.global.flagLogDebug) if err != nil { @@ -138,26 +147,24 @@ func (c *cmdAgent) Run(cmd *cobra.Command, args []string) error { // Check context ID periodically, and restart the HTTP server if needed. go func() { - for range time.Tick(30 * time.Second) { - cid, err := vsock.ContextID() - if err != nil { - continue - } - - if d.localCID == cid { + for { + time.Sleep(30 * time.Second) + cid, err := waitVsockContextID(context.Background()) + if err != nil || d.localCID == cid { continue } // Restart server + logger.Warn("Restarting the vsock server due to context ID change", logger.Ctx{"oldID": d.localCID, "newID": cid}) servers["http"].Close() + // Update context ID. + d.localCID = cid + err = startHTTPServer(d, c.global.flagLogDebug) if err != nil { errChan <- err } - - // Update context ID. 
- d.localCID = cid } }()