From d213adfaf723f0ea0dd152d4a01fe367298eeb99 Mon Sep 17 00:00:00 2001 From: Derek Su Date: Fri, 29 Nov 2024 16:31:40 +0800 Subject: [PATCH] fix: decrease ctrlr_loss_timeout_sec for base bdev (replica) Longhorn 9874 Signed-off-by: Derek Su --- pkg/client/types.go | 3 ++- pkg/spdk/backup.go | 11 ++++++----- pkg/spdk/disk.go | 10 ++++++---- pkg/spdk/disk/nvme/nvme.go | 14 ++++++++++--- pkg/spdk/engine.go | 40 ++++++++++++++++++++++++-------------- pkg/spdk/replica.go | 14 ++++++++----- pkg/spdk/restore.go | 7 ++++--- pkg/spdk/server.go | 22 +++++++++++---------- pkg/spdk/types.go | 25 +++++++++++++++++++++++- pkg/spdk/util.go | 7 +++---- pkg/util/block.go | 3 ++- 11 files changed, 104 insertions(+), 52 deletions(-) diff --git a/pkg/client/types.go b/pkg/client/types.go index c70a13bd..6f67132e 100644 --- a/pkg/client/types.go +++ b/pkg/client/types.go @@ -3,8 +3,9 @@ package client import ( "time" - "github.com/longhorn/types/pkg/generated/spdkrpc" "google.golang.org/grpc" + + "github.com/longhorn/types/pkg/generated/spdkrpc" ) const ( diff --git a/pkg/spdk/backup.go b/pkg/spdk/backup.go index c08dfe0c..307a8f22 100644 --- a/pkg/spdk/backup.go +++ b/pkg/spdk/backup.go @@ -12,12 +12,13 @@ import ( "github.com/sirupsen/logrus" "github.com/longhorn/backupstore" + "github.com/longhorn/go-spdk-helper/pkg/nvme" + btypes "github.com/longhorn/backupstore/types" commonbitmap "github.com/longhorn/go-common-libs/bitmap" commonnet "github.com/longhorn/go-common-libs/net" commonns "github.com/longhorn/go-common-libs/ns" commontypes "github.com/longhorn/go-common-libs/types" - "github.com/longhorn/go-spdk-helper/pkg/nvme" spdkclient "github.com/longhorn/go-spdk-helper/pkg/spdk/client" helpertypes "github.com/longhorn/go-spdk-helper/pkg/types" helperutil "github.com/longhorn/go-spdk-helper/pkg/util" @@ -166,10 +167,10 @@ func (b *Backup) OpenSnapshot(snapshotName, volumeName string) error { } b.initiator = initiator - b.log.Infof("Opening nvme device %v", b.initiator.Endpoint) + 
b.log.Infof("Opening NVMe device %v", b.initiator.Endpoint) devFh, err := os.OpenFile(b.initiator.Endpoint, os.O_RDONLY, 0666) if err != nil { - return errors.Wrapf(err, "failed to open nvme device %v for snapshot lvol bdev %v", b.initiator.Endpoint, lvolName) + return errors.Wrapf(err, "failed to open NVMe device %v for snapshot lvol bdev %v", b.initiator.Endpoint, lvolName) } b.devFh = devFh @@ -220,9 +221,9 @@ func (b *Backup) CloseSnapshot(snapshotName, volumeName string) error { b.Lock() defer b.Unlock() - b.log.Infof("Closing nvme device %v", b.initiator.Endpoint) + b.log.Infof("Closing NVMe device %v", b.initiator.Endpoint) if err := b.devFh.Close(); err != nil { - return errors.Wrapf(err, "failed to close nvme device %v", b.initiator.Endpoint) + return errors.Wrapf(err, "failed to close NVMe device %v", b.initiator.Endpoint) } b.log.Info("Stopping NVMe initiator") diff --git a/pkg/spdk/disk.go b/pkg/spdk/disk.go index c84cf722..7bda9523 100644 --- a/pkg/spdk/disk.go +++ b/pkg/spdk/disk.go @@ -6,20 +6,22 @@ import ( "github.com/pkg/errors" "github.com/sirupsen/logrus" + "google.golang.org/protobuf/types/known/emptypb" + grpccodes "google.golang.org/grpc/codes" grpcstatus "google.golang.org/grpc/status" - "google.golang.org/protobuf/types/known/emptypb" - commontypes "github.com/longhorn/go-common-libs/types" "github.com/longhorn/go-spdk-helper/pkg/jsonrpc" + "github.com/longhorn/types/pkg/generated/spdkrpc" + + commontypes "github.com/longhorn/go-common-libs/types" spdkclient "github.com/longhorn/go-spdk-helper/pkg/spdk/client" spdktypes "github.com/longhorn/go-spdk-helper/pkg/spdk/types" spdkutil "github.com/longhorn/go-spdk-helper/pkg/util" - "github.com/longhorn/types/pkg/generated/spdkrpc" + "github.com/longhorn/longhorn-spdk-engine/pkg/spdk/disk" "github.com/longhorn/longhorn-spdk-engine/pkg/util" - "github.com/longhorn/longhorn-spdk-engine/pkg/spdk/disk" _ "github.com/longhorn/longhorn-spdk-engine/pkg/spdk/disk/aio" _ 
"github.com/longhorn/longhorn-spdk-engine/pkg/spdk/disk/nvme" _ "github.com/longhorn/longhorn-spdk-engine/pkg/spdk/disk/virtio-blk" diff --git a/pkg/spdk/disk/nvme/nvme.go b/pkg/spdk/disk/nvme/nvme.go index a2533b9c..b7003892 100644 --- a/pkg/spdk/disk/nvme/nvme.go +++ b/pkg/spdk/disk/nvme/nvme.go @@ -10,12 +10,21 @@ import ( spdkclient "github.com/longhorn/go-spdk-helper/pkg/spdk/client" spdksetup "github.com/longhorn/go-spdk-helper/pkg/spdk/setup" spdktypes "github.com/longhorn/go-spdk-helper/pkg/spdk/types" - helpertypes "github.com/longhorn/go-spdk-helper/pkg/types" helperutil "github.com/longhorn/go-spdk-helper/pkg/util" "github.com/longhorn/longhorn-spdk-engine/pkg/spdk/disk" ) +const ( + // Timeouts for disk bdev + diskCtrlrLossTimeoutSec = 30 + diskReconnectDelaySec = 2 + diskFastIOFailTimeoutSec = 15 + diskTransportAckTimeout = 14 + diskKeepAliveTimeoutMs = 10000 + diskMultipath = "disable" +) + type DiskDriverNvme struct { } @@ -46,8 +55,7 @@ func (d *DiskDriverNvme) DiskCreate(spdkClient *spdkclient.Client, diskName, dis } }() bdevs, err := spdkClient.BdevNvmeAttachController(diskName, "", diskPath, "", "PCIe", "", - helpertypes.DefaultCtrlrLossTimeoutSec, helpertypes.DefaultReconnectDelaySec, helpertypes.DefaultFastIOFailTimeoutSec, - helpertypes.DefaultMultipath) + diskCtrlrLossTimeoutSec, diskReconnectDelaySec, diskFastIOFailTimeoutSec, diskMultipath) if err != nil { return "", errors.Wrapf(err, "failed to attach NVMe disk %v", diskPath) } diff --git a/pkg/spdk/engine.go b/pkg/spdk/engine.go index 6c79c7ff..1bc3a4f2 100644 --- a/pkg/spdk/engine.go +++ b/pkg/spdk/engine.go @@ -10,20 +10,22 @@ import ( "github.com/pkg/errors" "github.com/sirupsen/logrus" + grpccodes "google.golang.org/grpc/codes" grpcstatus "google.golang.org/grpc/status" + "github.com/longhorn/go-spdk-helper/pkg/jsonrpc" + "github.com/longhorn/go-spdk-helper/pkg/nvme" + "github.com/longhorn/types/pkg/generated/spdkrpc" + commonbitmap "github.com/longhorn/go-common-libs/bitmap" 
commonnet "github.com/longhorn/go-common-libs/net" commontypes "github.com/longhorn/go-common-libs/types" commonutils "github.com/longhorn/go-common-libs/utils" - "github.com/longhorn/go-spdk-helper/pkg/jsonrpc" - "github.com/longhorn/go-spdk-helper/pkg/nvme" spdkclient "github.com/longhorn/go-spdk-helper/pkg/spdk/client" spdktypes "github.com/longhorn/go-spdk-helper/pkg/spdk/types" helpertypes "github.com/longhorn/go-spdk-helper/pkg/types" helperutil "github.com/longhorn/go-spdk-helper/pkg/util" - "github.com/longhorn/types/pkg/generated/spdkrpc" "github.com/longhorn/longhorn-spdk-engine/pkg/api" "github.com/longhorn/longhorn-spdk-engine/pkg/client" @@ -47,6 +49,9 @@ type Engine struct { Nqn string Nguid string + ctrlrLossTimeout int + fastIOFailTimeoutSec int + ReplicaStatusMap map[string]*EngineReplicaStatus initiator *nvme.Initiator @@ -92,6 +97,10 @@ func NewEngine(engineName, volumeName, frontend string, specSize uint64, engineU Frontend: frontend, SpecSize: specSize, + // TODO: support user-defined values + ctrlrLossTimeout: replicaCtrlrLossTimeoutSec, + fastIOFailTimeoutSec: replicaFastIOFailTimeoutSec, + ReplicaStatusMap: map[string]*EngineReplicaStatus{}, State: types.InstanceStatePending, @@ -207,7 +216,7 @@ func (e *Engine) Create(spdkClient *spdkclient.Client, replicaAddressMap map[str Address: replicaAddr, } - bdevName, err := connectNVMfBdev(spdkClient, replicaName, replicaAddr) + bdevName, err := connectNVMfBdev(spdkClient, replicaName, replicaAddr, e.ctrlrLossTimeout, e.fastIOFailTimeoutSec) if err != nil { e.log.WithError(err).Warnf("Failed to get bdev from replica %s with address %s during creation, will mark the mode to ERR and continue", replicaName, replicaAddr) e.ReplicaStatusMap[replicaName].Mode = types.ModeERR @@ -673,7 +682,7 @@ func (e *Engine) ValidateAndUpdate(spdkClient *spdkclient.Client) (err error) { if replicaStatus.Mode != types.ModeERR { mode, err := e.validateAndUpdateReplicaNvme(replicaName, bdevMap[replicaStatus.BdevName]) if 
err != nil { - e.log.WithError(err).Errorf("Engine found valid nvme for replica %v, will update the mode from %s to ERR during ValidateAndUpdate", replicaName, replicaStatus.Mode) + e.log.WithError(err).Errorf("Engine found valid NVMe for replica %v, will update the mode from %s to ERR during ValidateAndUpdate", replicaName, replicaStatus.Mode) replicaStatus.Mode = types.ModeERR updateRequired = true } else if replicaStatus.Mode != mode { @@ -922,20 +931,20 @@ func (e *Engine) validateAndUpdateReplicaNvme(replicaName string, bdev *spdktype return types.ModeERR, fmt.Errorf("found bdev type %v rather than %v during replica %s mode validation", spdktypes.GetBdevType(bdev), spdktypes.BdevTypeNvme, replicaName) } if len(*bdev.DriverSpecific.Nvme) != 1 { - return types.ModeERR, fmt.Errorf("found zero or multiple nvme info in a nvme base bdev %v during replica %s mode validation", bdev.Name, replicaName) + return types.ModeERR, fmt.Errorf("found zero or multiple NVMe info in a NVMe base bdev %v during replica %s mode validation", bdev.Name, replicaName) } nvmeInfo := (*bdev.DriverSpecific.Nvme)[0] if !strings.EqualFold(string(nvmeInfo.Trid.Adrfam), string(spdktypes.NvmeAddressFamilyIPv4)) || !strings.EqualFold(string(nvmeInfo.Trid.Trtype), string(spdktypes.NvmeTransportTypeTCP)) { - return types.ModeERR, fmt.Errorf("found invalid address family %s and transport type %s in a remote nvme base bdev %s during replica %s mode validation", nvmeInfo.Trid.Adrfam, nvmeInfo.Trid.Trtype, bdev.Name, replicaName) + return types.ModeERR, fmt.Errorf("found invalid address family %s and transport type %s in a remote NVMe base bdev %s during replica %s mode validation", nvmeInfo.Trid.Adrfam, nvmeInfo.Trid.Trtype, bdev.Name, replicaName) } bdevAddr := net.JoinHostPort(nvmeInfo.Trid.Traddr, nvmeInfo.Trid.Trsvcid) if e.ReplicaStatusMap[replicaName].Address != bdevAddr { - return types.ModeERR, fmt.Errorf("found mismatching between replica bdev %s address %s and the nvme bdev actual address 
%s during replica %s mode validation", bdev.Name, e.ReplicaStatusMap[replicaName].Address, bdevAddr, replicaName) + return types.ModeERR, fmt.Errorf("found mismatching between replica bdev %s address %s and the NVMe bdev actual address %s during replica %s mode validation", bdev.Name, e.ReplicaStatusMap[replicaName].Address, bdevAddr, replicaName) } controllerName := helperutil.GetNvmeControllerNameFromNamespaceName(e.ReplicaStatusMap[replicaName].BdevName) if controllerName != replicaName { - return types.ModeERR, fmt.Errorf("found unexpected the nvme bdev controller name %s (bdev name %s) during replica %s mode validation", controllerName, bdev.Name, replicaName) + return types.ModeERR, fmt.Errorf("found unexpected the NVMe bdev controller name %s (bdev name %s) during replica %s mode validation", controllerName, bdev.Name, replicaName) } return e.ReplicaStatusMap[replicaName].Mode, nil @@ -1078,7 +1087,7 @@ func (e *Engine) ReplicaAdd(spdkClient *spdkclient.Client, dstReplicaName, dstRe } // Add rebuilding replica head bdev to the base bdev list of the RAID bdev - dstHeadLvolBdevName, err := connectNVMfBdev(spdkClient, dstReplicaName, dstHeadLvolAddress) + dstHeadLvolBdevName, err := connectNVMfBdev(spdkClient, dstReplicaName, dstHeadLvolAddress, e.ctrlrLossTimeout, e.fastIOFailTimeoutSec) if err != nil { return err } @@ -1246,7 +1255,7 @@ func (e *Engine) replicaAddFinish(srcReplicaServiceCli, dstReplicaServiceCli *cl } // Blindly ask the source replica to detach the rebuilding lvol - // If this detachment fails, there may be leftover rebuilding nvme controller in spdk_tgt of the src replica. We should continue since it's not a fatal error and shall not block the flow + // If this detachment fails, there may be leftover rebuilding NVMe controller in spdk_tgt of the src replica. We should continue since it's not a fatal error and shall not block the flow // Similarly, the below src/dst replica finish should not block the flow either. 
if srcReplicaErr := srcReplicaServiceCli.ReplicaRebuildingSrcDetach(srcReplicaName, dstReplicaName); srcReplicaErr != nil { e.log.WithError(srcReplicaErr).Errorf("Engine failed to detach the rebuilding lvol for rebuilding src replica %s, will ignore this error and continue", srcReplicaName) @@ -1631,7 +1640,7 @@ func (e *Engine) snapshotOperationWithoutLock(spdkClient *spdkclient.Client, rep func (e *Engine) replicaSnapshotOperation(spdkClient *spdkclient.Client, replicaClient *client.SPDKClient, replicaName, snapshotName string, snapshotOp SnapshotOperationType, opts *api.SnapshotOptions) error { switch snapshotOp { case SnapshotOperationCreate: - // TODO: execute `sync` for the nvme initiator before snapshot start + // TODO: execute `sync` for the NVMe initiator before snapshot start return replicaClient.ReplicaSnapshotCreate(replicaName, snapshotName, opts) case SnapshotOperationDelete: return replicaClient.ReplicaSnapshotDelete(replicaName, snapshotName) @@ -1648,7 +1657,7 @@ func (e *Engine) replicaSnapshotOperation(spdkClient *spdkclient.Client, replica if err := replicaClient.ReplicaSnapshotRevert(replicaName, snapshotName); err != nil { return err } - bdevName, err := connectNVMfBdev(spdkClient, replicaName, replicaStatus.Address) + bdevName, err := connectNVMfBdev(spdkClient, replicaName, replicaStatus.Address, e.ctrlrLossTimeout, e.fastIOFailTimeoutSec) if err != nil { return err } @@ -1954,8 +1963,9 @@ func (e *Engine) BackupRestoreFinish(spdkClient *spdkclient.Client) error { return err } e.log.Infof("Attaching replica %s with address %s before finishing restoration", replicaName, replicaAddress) - _, err = spdkClient.BdevNvmeAttachController(replicaName, helpertypes.GetNQN(replicaName), replicaIP, replicaPort, spdktypes.NvmeTransportTypeTCP, spdktypes.NvmeAddressFamilyIPv4, - helpertypes.DefaultCtrlrLossTimeoutSec, helpertypes.DefaultReconnectDelaySec, helpertypes.DefaultFastIOFailTimeoutSec, helpertypes.DefaultMultipath) + _, err = 
spdkClient.BdevNvmeAttachController(replicaName, helpertypes.GetNQN(replicaName), replicaIP, replicaPort, + spdktypes.NvmeTransportTypeTCP, spdktypes.NvmeAddressFamilyIPv4, + int32(e.ctrlrLossTimeout), replicaReconnectDelaySec, int32(e.fastIOFailTimeoutSec), replicaMultipath) if err != nil { return err } diff --git a/pkg/spdk/replica.go b/pkg/spdk/replica.go index d257c00b..5075ed96 100644 --- a/pkg/spdk/replica.go +++ b/pkg/spdk/replica.go @@ -11,21 +11,23 @@ import ( "github.com/pkg/errors" "github.com/sirupsen/logrus" + grpccodes "google.golang.org/grpc/codes" grpcstatus "google.golang.org/grpc/status" "github.com/longhorn/backupstore" + "github.com/longhorn/go-spdk-helper/pkg/jsonrpc" + "github.com/longhorn/types/pkg/generated/spdkrpc" + btypes "github.com/longhorn/backupstore/types" butil "github.com/longhorn/backupstore/util" commonbitmap "github.com/longhorn/go-common-libs/bitmap" commonnet "github.com/longhorn/go-common-libs/net" commonutils "github.com/longhorn/go-common-libs/utils" - "github.com/longhorn/go-spdk-helper/pkg/jsonrpc" spdkclient "github.com/longhorn/go-spdk-helper/pkg/spdk/client" spdktypes "github.com/longhorn/go-spdk-helper/pkg/spdk/types" helpertypes "github.com/longhorn/go-spdk-helper/pkg/types" helperutil "github.com/longhorn/go-spdk-helper/pkg/util" - "github.com/longhorn/types/pkg/generated/spdkrpc" "github.com/longhorn/longhorn-spdk-engine/pkg/api" "github.com/longhorn/longhorn-spdk-engine/pkg/types" @@ -1320,12 +1322,13 @@ func (r *Replica) RebuildingSrcAttach(spdkClient *spdkclient.Client, dstReplicaN if r.rebuildingSrcCache.dstRebuildingBdevName != "" { controllerName := helperutil.GetNvmeControllerNameFromNamespaceName(r.rebuildingSrcCache.dstRebuildingBdevName) if dstRebuildingLvolName != controllerName { - return fmt.Errorf("found mismatching between the required dst bdev nvme controller name %s and the expected dst controller name %s for replica %s rebuilding src attach", dstRebuildingLvolName, controllerName, r.Name) + return 
fmt.Errorf("found mismatching between the required dst bdev NVMe controller name %s and the expected dst controller name %s for replica %s rebuilding src attach", dstRebuildingLvolName, controllerName, r.Name) } return nil } - r.rebuildingSrcCache.dstRebuildingBdevName, err = connectNVMfBdev(spdkClient, dstRebuildingLvolName, dstRebuildingLvolAddress) + r.rebuildingSrcCache.dstRebuildingBdevName, err = connectNVMfBdev(spdkClient, dstRebuildingLvolName, dstRebuildingLvolAddress, + replicaCtrlrLossTimeoutSec, replicaFastIOFailTimeoutSec) if err != nil { return errors.Wrapf(err, "failed to connect rebuilding lvol %s with address %s as a NVMe bdev for replica %s rebuilding src attach", dstRebuildingLvolName, dstRebuildingLvolAddress, r.Name) } @@ -1432,7 +1435,8 @@ func (r *Replica) RebuildingDstStart(spdkClient *spdkclient.Client, srcReplicaNa r.rebuildingDstCache.srcReplicaAddress = srcReplicaAddress externalSnapshotLvolName := GetReplicaSnapshotLvolName(srcReplicaName, externalSnapshotName) - externalSnapshotBdevName, err := connectNVMfBdev(spdkClient, externalSnapshotLvolName, externalSnapshotAddress) + externalSnapshotBdevName, err := connectNVMfBdev(spdkClient, externalSnapshotLvolName, externalSnapshotAddress, + replicaCtrlrLossTimeoutSec, replicaFastIOFailTimeoutSec) if err != nil { return "", errors.Wrapf(err, "failed to connect the external src snapshot lvol %s with address %s as a NVMf bdev for dst replica %v rebuilding start", externalSnapshotLvolName, externalSnapshotAddress, r.Name) } diff --git a/pkg/spdk/restore.go b/pkg/spdk/restore.go index c2bc8abb..fc0b1534 100644 --- a/pkg/spdk/restore.go +++ b/pkg/spdk/restore.go @@ -10,10 +10,11 @@ import ( "github.com/pkg/errors" "github.com/sirupsen/logrus" + "github.com/longhorn/go-spdk-helper/pkg/nvme" + btypes "github.com/longhorn/backupstore/types" commonns "github.com/longhorn/go-common-libs/ns" commontypes "github.com/longhorn/go-common-libs/types" - "github.com/longhorn/go-spdk-helper/pkg/nvme" 
spdkclient "github.com/longhorn/go-spdk-helper/pkg/spdk/client" helpertypes "github.com/longhorn/go-spdk-helper/pkg/types" helperutil "github.com/longhorn/go-spdk-helper/pkg/util" @@ -156,9 +157,9 @@ func (r *Restore) OpenVolumeDev(volDevName string) (*os.File, string, error) { } func (r *Restore) CloseVolumeDev(volDev *os.File) error { - r.log.Infof("Closing nvme device %v", r.initiator.Endpoint) + r.log.Infof("Closing NVMe device %v", r.initiator.Endpoint) if err := volDev.Close(); err != nil { - return errors.Wrapf(err, "failed to close nvme device %v", r.initiator.Endpoint) + return errors.Wrapf(err, "failed to close NVMe device %v", r.initiator.Endpoint) } r.log.Info("Stopping NVMe initiator") diff --git a/pkg/spdk/server.go b/pkg/spdk/server.go index 24222192..f079257f 100644 --- a/pkg/spdk/server.go +++ b/pkg/spdk/server.go @@ -11,18 +11,20 @@ import ( "github.com/pkg/errors" "github.com/sirupsen/logrus" "golang.org/x/net/context" + + "google.golang.org/protobuf/types/known/emptypb" + grpccodes "google.golang.org/grpc/codes" grpcstatus "google.golang.org/grpc/status" - "google.golang.org/protobuf/types/known/emptypb" "github.com/longhorn/backupstore" + "github.com/longhorn/go-spdk-helper/pkg/jsonrpc" + "github.com/longhorn/types/pkg/generated/spdkrpc" + butil "github.com/longhorn/backupstore/util" commonbitmap "github.com/longhorn/go-common-libs/bitmap" - "github.com/longhorn/go-spdk-helper/pkg/jsonrpc" spdkclient "github.com/longhorn/go-spdk-helper/pkg/spdk/client" spdktypes "github.com/longhorn/go-spdk-helper/pkg/spdk/types" - helpertypes "github.com/longhorn/go-spdk-helper/pkg/types" - "github.com/longhorn/types/pkg/generated/spdkrpc" "github.com/longhorn/longhorn-spdk-engine/pkg/api" "github.com/longhorn/longhorn-spdk-engine/pkg/types" @@ -65,12 +67,12 @@ func NewServer(ctx context.Context, portStart, portEnd int32) (*Server, error) { } if _, err = cli.BdevNvmeSetOptions( - helpertypes.DefaultCtrlrLossTimeoutSec, - helpertypes.DefaultReconnectDelaySec, - 
helpertypes.DefaultFastIOFailTimeoutSec, - helpertypes.DefaultTransportAckTimeout, - helpertypes.DefaultKeepAliveTimeoutMs); err != nil { - return nil, errors.Wrap(err, "failed to set nvme options") + replicaCtrlrLossTimeoutSec, + replicaReconnectDelaySec, + replicaFastIOFailTimeoutSec, + replicaTransportAckTimeout, + replicaKeepAliveTimeoutMs); err != nil { + return nil, errors.Wrap(err, "failed to set NVMe options") } broadcasters := map[types.InstanceType]*broadcaster.Broadcaster{} diff --git a/pkg/spdk/types.go b/pkg/spdk/types.go index 2cd20f06..a56ac937 100644 --- a/pkg/spdk/types.go +++ b/pkg/spdk/types.go @@ -7,9 +7,10 @@ import ( "strings" "time" + "github.com/longhorn/types/pkg/generated/spdkrpc" + spdkclient "github.com/longhorn/go-spdk-helper/pkg/spdk/client" spdktypes "github.com/longhorn/go-spdk-helper/pkg/spdk/types" - "github.com/longhorn/types/pkg/generated/spdkrpc" "github.com/longhorn/longhorn-spdk-engine/pkg/client" "github.com/longhorn/longhorn-spdk-engine/pkg/types" @@ -31,6 +32,28 @@ const ( retryInterval = 1 * time.Second ) +const ( + // Timeouts for RAID base bdev (replica) + // The ctrlr_loss_timeout_sec setting applies to the base bdev's NVMe controller + // and defines how long (replicaCtrlrLossTimeoutSec, 15 seconds) SPDK keeps trying to reconnect to the controller + // after losing the connection. + // + // When an instance manager containing a replica is deleted, SPDK starts to reconnect to the base bdev's controller. + // If the connection cannot be reestablished within the ctrlr_loss_timeout_sec period, the base bdev is removed from the RAID bdev. + // + // Because the ctrl-loss-tmo for the NVMe-oF initiator connecting to the RAID target is set to 30 seconds, + // replicaCtrlrLossTimeoutSec and replicaFastIOFailTimeoutSec are set to shorter values: 15 seconds and 10 seconds, respectively. + // + // If an I/O operation to a replica (base bdev) gets no response within 10 seconds, an I/O error is returned, + // and the base bdev is deleted 5 seconds later, once the 15-second ctrlr_loss_timeout_sec expires. 
+ replicaCtrlrLossTimeoutSec = 15 + replicaReconnectDelaySec = 2 + replicaFastIOFailTimeoutSec = 10 + replicaTransportAckTimeout = 14 + replicaKeepAliveTimeoutMs = 10000 + replicaMultipath = "disable" +) + type Lvol struct { Name string UUID string diff --git a/pkg/spdk/util.go b/pkg/spdk/util.go index 0aabbe00..b5c154e8 100644 --- a/pkg/spdk/util.go +++ b/pkg/spdk/util.go @@ -12,13 +12,13 @@ import ( "github.com/longhorn/go-spdk-helper/pkg/jsonrpc" "github.com/longhorn/go-spdk-helper/pkg/nvme" - helperutil "github.com/longhorn/go-spdk-helper/pkg/util" commonns "github.com/longhorn/go-common-libs/ns" commonutils "github.com/longhorn/go-common-libs/utils" spdkclient "github.com/longhorn/go-spdk-helper/pkg/spdk/client" spdktypes "github.com/longhorn/go-spdk-helper/pkg/spdk/types" helpertypes "github.com/longhorn/go-spdk-helper/pkg/types" + helperutil "github.com/longhorn/go-spdk-helper/pkg/util" ) func exposeSnapshotLvolBdev(spdkClient *spdkclient.Client, lvsName, lvolName, ip string, port int32, executor *commonns.Executor) (subsystemNQN, controllerName string, err error) { @@ -77,7 +77,7 @@ func splitHostPort(address string) (string, int32, error) { // connectNVMfBdev connects to the NVMe-oF target, which is exposed by a remote lvol bdev. // controllerName is typically the lvol name, and address is the IP:port of the NVMe-oF target. 
-func connectNVMfBdev(spdkClient *spdkclient.Client, controllerName, address string) (bdevName string, err error) { +func connectNVMfBdev(spdkClient *spdkclient.Client, controllerName, address string, ctrlrLossTimeout, fastIOFailTimeoutSec int) (bdevName string, err error) { if controllerName == "" || address == "" { return "", fmt.Errorf("controllerName or address is empty") } @@ -89,8 +89,7 @@ func connectNVMfBdev(spdkClient *spdkclient.Client, controllerName, address stri nvmeBdevNameList, err := spdkClient.BdevNvmeAttachController(controllerName, helpertypes.GetNQN(controllerName), ip, port, spdktypes.NvmeTransportTypeTCP, spdktypes.NvmeAddressFamilyIPv4, - helpertypes.DefaultCtrlrLossTimeoutSec, helpertypes.DefaultReconnectDelaySec, helpertypes.DefaultFastIOFailTimeoutSec, - helpertypes.DefaultMultipath) + int32(ctrlrLossTimeout), replicaReconnectDelaySec, int32(fastIOFailTimeoutSec), replicaMultipath) if err != nil { return "", err } diff --git a/pkg/util/block.go b/pkg/util/block.go index d7a26a1c..8f598ce8 100644 --- a/pkg/util/block.go +++ b/pkg/util/block.go @@ -10,8 +10,9 @@ import ( "github.com/pkg/errors" "github.com/sirupsen/logrus" - commontypes "github.com/longhorn/go-common-libs/types" "github.com/longhorn/go-spdk-helper/pkg/types" + + commontypes "github.com/longhorn/go-common-libs/types" helperutil "github.com/longhorn/go-spdk-helper/pkg/util" )