Skip to content

Commit

Permalink
fix: decrease ctrlr_loss_timeout_sec for base bdev (replica)
Browse files Browse the repository at this point in the history
Longhorn 9874

Signed-off-by: Derek Su <[email protected]>
  • Loading branch information
derekbit committed Nov 29, 2024
1 parent ef6f828 commit d213adf
Show file tree
Hide file tree
Showing 11 changed files with 104 additions and 52 deletions.
3 changes: 2 additions & 1 deletion pkg/client/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@ package client
import (
"time"

"github.com/longhorn/types/pkg/generated/spdkrpc"
"google.golang.org/grpc"

"github.com/longhorn/types/pkg/generated/spdkrpc"
)

const (
Expand Down
11 changes: 6 additions & 5 deletions pkg/spdk/backup.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,13 @@ import (
"github.com/sirupsen/logrus"

"github.com/longhorn/backupstore"
"github.com/longhorn/go-spdk-helper/pkg/nvme"

btypes "github.com/longhorn/backupstore/types"
commonbitmap "github.com/longhorn/go-common-libs/bitmap"
commonnet "github.com/longhorn/go-common-libs/net"
commonns "github.com/longhorn/go-common-libs/ns"
commontypes "github.com/longhorn/go-common-libs/types"
"github.com/longhorn/go-spdk-helper/pkg/nvme"
spdkclient "github.com/longhorn/go-spdk-helper/pkg/spdk/client"
helpertypes "github.com/longhorn/go-spdk-helper/pkg/types"
helperutil "github.com/longhorn/go-spdk-helper/pkg/util"
Expand Down Expand Up @@ -166,10 +167,10 @@ func (b *Backup) OpenSnapshot(snapshotName, volumeName string) error {
}
b.initiator = initiator

b.log.Infof("Opening nvme device %v", b.initiator.Endpoint)
b.log.Infof("Opening NVMe device %v", b.initiator.Endpoint)
devFh, err := os.OpenFile(b.initiator.Endpoint, os.O_RDONLY, 0666)
if err != nil {
return errors.Wrapf(err, "failed to open nvme device %v for snapshot lvol bdev %v", b.initiator.Endpoint, lvolName)
return errors.Wrapf(err, "failed to open NVMe device %v for snapshot lvol bdev %v", b.initiator.Endpoint, lvolName)
}
b.devFh = devFh

Expand Down Expand Up @@ -220,9 +221,9 @@ func (b *Backup) CloseSnapshot(snapshotName, volumeName string) error {
b.Lock()
defer b.Unlock()

b.log.Infof("Closing nvme device %v", b.initiator.Endpoint)
b.log.Infof("Closing NVMe device %v", b.initiator.Endpoint)
if err := b.devFh.Close(); err != nil {
return errors.Wrapf(err, "failed to close nvme device %v", b.initiator.Endpoint)
return errors.Wrapf(err, "failed to close NVMe device %v", b.initiator.Endpoint)
}

b.log.Info("Stopping NVMe initiator")
Expand Down
10 changes: 6 additions & 4 deletions pkg/spdk/disk.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,22 @@ import (

"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"google.golang.org/protobuf/types/known/emptypb"

grpccodes "google.golang.org/grpc/codes"
grpcstatus "google.golang.org/grpc/status"
"google.golang.org/protobuf/types/known/emptypb"

commontypes "github.com/longhorn/go-common-libs/types"
"github.com/longhorn/go-spdk-helper/pkg/jsonrpc"
"github.com/longhorn/types/pkg/generated/spdkrpc"

commontypes "github.com/longhorn/go-common-libs/types"
spdkclient "github.com/longhorn/go-spdk-helper/pkg/spdk/client"
spdktypes "github.com/longhorn/go-spdk-helper/pkg/spdk/types"
spdkutil "github.com/longhorn/go-spdk-helper/pkg/util"
"github.com/longhorn/types/pkg/generated/spdkrpc"

"github.com/longhorn/longhorn-spdk-engine/pkg/spdk/disk"
"github.com/longhorn/longhorn-spdk-engine/pkg/util"

"github.com/longhorn/longhorn-spdk-engine/pkg/spdk/disk"
_ "github.com/longhorn/longhorn-spdk-engine/pkg/spdk/disk/aio"
_ "github.com/longhorn/longhorn-spdk-engine/pkg/spdk/disk/nvme"
_ "github.com/longhorn/longhorn-spdk-engine/pkg/spdk/disk/virtio-blk"
Expand Down
14 changes: 11 additions & 3 deletions pkg/spdk/disk/nvme/nvme.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,21 @@ import (
spdkclient "github.com/longhorn/go-spdk-helper/pkg/spdk/client"
spdksetup "github.com/longhorn/go-spdk-helper/pkg/spdk/setup"
spdktypes "github.com/longhorn/go-spdk-helper/pkg/spdk/types"
helpertypes "github.com/longhorn/go-spdk-helper/pkg/types"
helperutil "github.com/longhorn/go-spdk-helper/pkg/util"

"github.com/longhorn/longhorn-spdk-engine/pkg/spdk/disk"
)

// NVMe controller settings used when attaching a local disk as a bdev.
// DiskCreate hands the multipath mode and the three *Sec values to
// BdevNvmeAttachController.
const (
	// diskMultipath is the multipath argument for the attach call.
	diskMultipath = "disable"

	// diskCtrlrLossTimeoutSec is the controller-loss timeout, in seconds.
	diskCtrlrLossTimeoutSec = 30
	// diskFastIOFailTimeoutSec is the fast-I/O-fail timeout, in seconds.
	diskFastIOFailTimeoutSec = 15
	// diskReconnectDelaySec is the reconnect delay, in seconds.
	diskReconnectDelaySec = 2

	// diskKeepAliveTimeoutMs is a keep-alive timeout, in milliseconds.
	// NOTE(review): not passed to the attach call in DiskCreate; presumably
	// meant for an NVMe set-options call - confirm where it is consumed.
	diskKeepAliveTimeoutMs = 10000
	// diskTransportAckTimeout is a transport ACK timeout setting.
	// NOTE(review): the name carries no unit and DiskCreate does not use it -
	// confirm the unit and the consumer against the SPDK option it maps to.
	diskTransportAckTimeout = 14
)

// DiskDriverNvme is the disk driver for NVMe disks. It holds no state of its
// own; DiskCreate receives the SPDK client it operates on as an argument.
type DiskDriverNvme struct{}

Expand Down Expand Up @@ -46,8 +55,7 @@ func (d *DiskDriverNvme) DiskCreate(spdkClient *spdkclient.Client, diskName, dis
}
}()
bdevs, err := spdkClient.BdevNvmeAttachController(diskName, "", diskPath, "", "PCIe", "",
helpertypes.DefaultCtrlrLossTimeoutSec, helpertypes.DefaultReconnectDelaySec, helpertypes.DefaultFastIOFailTimeoutSec,
helpertypes.DefaultMultipath)
diskCtrlrLossTimeoutSec, diskReconnectDelaySec, diskFastIOFailTimeoutSec, diskMultipath)
if err != nil {
return "", errors.Wrapf(err, "failed to attach NVMe disk %v", diskPath)
}
Expand Down
40 changes: 25 additions & 15 deletions pkg/spdk/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,22 @@ import (

"github.com/pkg/errors"
"github.com/sirupsen/logrus"

grpccodes "google.golang.org/grpc/codes"
grpcstatus "google.golang.org/grpc/status"

"github.com/longhorn/go-spdk-helper/pkg/jsonrpc"
"github.com/longhorn/go-spdk-helper/pkg/nvme"
"github.com/longhorn/types/pkg/generated/spdkrpc"

commonbitmap "github.com/longhorn/go-common-libs/bitmap"
commonnet "github.com/longhorn/go-common-libs/net"
commontypes "github.com/longhorn/go-common-libs/types"
commonutils "github.com/longhorn/go-common-libs/utils"
"github.com/longhorn/go-spdk-helper/pkg/jsonrpc"
"github.com/longhorn/go-spdk-helper/pkg/nvme"
spdkclient "github.com/longhorn/go-spdk-helper/pkg/spdk/client"
spdktypes "github.com/longhorn/go-spdk-helper/pkg/spdk/types"
helpertypes "github.com/longhorn/go-spdk-helper/pkg/types"
helperutil "github.com/longhorn/go-spdk-helper/pkg/util"
"github.com/longhorn/types/pkg/generated/spdkrpc"

"github.com/longhorn/longhorn-spdk-engine/pkg/api"
"github.com/longhorn/longhorn-spdk-engine/pkg/client"
Expand All @@ -47,6 +49,9 @@ type Engine struct {
Nqn string
Nguid string

ctrlrLossTimeout int
fastIOFailTimeoutSec int

ReplicaStatusMap map[string]*EngineReplicaStatus

initiator *nvme.Initiator
Expand Down Expand Up @@ -92,6 +97,10 @@ func NewEngine(engineName, volumeName, frontend string, specSize uint64, engineU
Frontend: frontend,
SpecSize: specSize,

// TODO: support user-defined values
ctrlrLossTimeout: replicaCtrlrLossTimeoutSec,
fastIOFailTimeoutSec: replicaFastIOFailTimeoutSec,

ReplicaStatusMap: map[string]*EngineReplicaStatus{},

State: types.InstanceStatePending,
Expand Down Expand Up @@ -207,7 +216,7 @@ func (e *Engine) Create(spdkClient *spdkclient.Client, replicaAddressMap map[str
Address: replicaAddr,
}

bdevName, err := connectNVMfBdev(spdkClient, replicaName, replicaAddr)
bdevName, err := connectNVMfBdev(spdkClient, replicaName, replicaAddr, e.ctrlrLossTimeout, e.fastIOFailTimeoutSec)
if err != nil {
e.log.WithError(err).Warnf("Failed to get bdev from replica %s with address %s during creation, will mark the mode to ERR and continue", replicaName, replicaAddr)
e.ReplicaStatusMap[replicaName].Mode = types.ModeERR
Expand Down Expand Up @@ -673,7 +682,7 @@ func (e *Engine) ValidateAndUpdate(spdkClient *spdkclient.Client) (err error) {
if replicaStatus.Mode != types.ModeERR {
mode, err := e.validateAndUpdateReplicaNvme(replicaName, bdevMap[replicaStatus.BdevName])
if err != nil {
e.log.WithError(err).Errorf("Engine found valid nvme for replica %v, will update the mode from %s to ERR during ValidateAndUpdate", replicaName, replicaStatus.Mode)
e.log.WithError(err).Errorf("Engine found valid NVMe for replica %v, will update the mode from %s to ERR during ValidateAndUpdate", replicaName, replicaStatus.Mode)
replicaStatus.Mode = types.ModeERR
updateRequired = true
} else if replicaStatus.Mode != mode {
Expand Down Expand Up @@ -922,20 +931,20 @@ func (e *Engine) validateAndUpdateReplicaNvme(replicaName string, bdev *spdktype
return types.ModeERR, fmt.Errorf("found bdev type %v rather than %v during replica %s mode validation", spdktypes.GetBdevType(bdev), spdktypes.BdevTypeNvme, replicaName)
}
if len(*bdev.DriverSpecific.Nvme) != 1 {
return types.ModeERR, fmt.Errorf("found zero or multiple nvme info in a nvme base bdev %v during replica %s mode validation", bdev.Name, replicaName)
return types.ModeERR, fmt.Errorf("found zero or multiple NVMe info in a NVMe base bdev %v during replica %s mode validation", bdev.Name, replicaName)
}
nvmeInfo := (*bdev.DriverSpecific.Nvme)[0]
if !strings.EqualFold(string(nvmeInfo.Trid.Adrfam), string(spdktypes.NvmeAddressFamilyIPv4)) ||
!strings.EqualFold(string(nvmeInfo.Trid.Trtype), string(spdktypes.NvmeTransportTypeTCP)) {
return types.ModeERR, fmt.Errorf("found invalid address family %s and transport type %s in a remote nvme base bdev %s during replica %s mode validation", nvmeInfo.Trid.Adrfam, nvmeInfo.Trid.Trtype, bdev.Name, replicaName)
return types.ModeERR, fmt.Errorf("found invalid address family %s and transport type %s in a remote NVMe base bdev %s during replica %s mode validation", nvmeInfo.Trid.Adrfam, nvmeInfo.Trid.Trtype, bdev.Name, replicaName)
}
bdevAddr := net.JoinHostPort(nvmeInfo.Trid.Traddr, nvmeInfo.Trid.Trsvcid)
if e.ReplicaStatusMap[replicaName].Address != bdevAddr {
return types.ModeERR, fmt.Errorf("found mismatching between replica bdev %s address %s and the nvme bdev actual address %s during replica %s mode validation", bdev.Name, e.ReplicaStatusMap[replicaName].Address, bdevAddr, replicaName)
return types.ModeERR, fmt.Errorf("found mismatching between replica bdev %s address %s and the NVMe bdev actual address %s during replica %s mode validation", bdev.Name, e.ReplicaStatusMap[replicaName].Address, bdevAddr, replicaName)
}
controllerName := helperutil.GetNvmeControllerNameFromNamespaceName(e.ReplicaStatusMap[replicaName].BdevName)
if controllerName != replicaName {
return types.ModeERR, fmt.Errorf("found unexpected the nvme bdev controller name %s (bdev name %s) during replica %s mode validation", controllerName, bdev.Name, replicaName)
return types.ModeERR, fmt.Errorf("found unexpected the NVMe bdev controller name %s (bdev name %s) during replica %s mode validation", controllerName, bdev.Name, replicaName)
}

return e.ReplicaStatusMap[replicaName].Mode, nil
Expand Down Expand Up @@ -1078,7 +1087,7 @@ func (e *Engine) ReplicaAdd(spdkClient *spdkclient.Client, dstReplicaName, dstRe
}

// Add rebuilding replica head bdev to the base bdev list of the RAID bdev
dstHeadLvolBdevName, err := connectNVMfBdev(spdkClient, dstReplicaName, dstHeadLvolAddress)
dstHeadLvolBdevName, err := connectNVMfBdev(spdkClient, dstReplicaName, dstHeadLvolAddress, e.ctrlrLossTimeout, e.fastIOFailTimeoutSec)
if err != nil {
return err
}
Expand Down Expand Up @@ -1246,7 +1255,7 @@ func (e *Engine) replicaAddFinish(srcReplicaServiceCli, dstReplicaServiceCli *cl
}

// Blindly ask the source replica to detach the rebuilding lvol
// If this detachment fails, there may be leftover rebuilding nvme controller in spdk_tgt of the src replica. We should continue since it's not a fatal error and shall not block the flow
// If this detachment fails, there may be leftover rebuilding NVMe controller in spdk_tgt of the src replica. We should continue since it's not a fatal error and shall not block the flow
// Similarly, the below src/dst replica finish should not block the flow either.
if srcReplicaErr := srcReplicaServiceCli.ReplicaRebuildingSrcDetach(srcReplicaName, dstReplicaName); srcReplicaErr != nil {
e.log.WithError(srcReplicaErr).Errorf("Engine failed to detach the rebuilding lvol for rebuilding src replica %s, will ignore this error and continue", srcReplicaName)
Expand Down Expand Up @@ -1631,7 +1640,7 @@ func (e *Engine) snapshotOperationWithoutLock(spdkClient *spdkclient.Client, rep
func (e *Engine) replicaSnapshotOperation(spdkClient *spdkclient.Client, replicaClient *client.SPDKClient, replicaName, snapshotName string, snapshotOp SnapshotOperationType, opts *api.SnapshotOptions) error {
switch snapshotOp {
case SnapshotOperationCreate:
// TODO: execute `sync` for the nvme initiator before snapshot start
// TODO: execute `sync` for the NVMe initiator before snapshot start
return replicaClient.ReplicaSnapshotCreate(replicaName, snapshotName, opts)
case SnapshotOperationDelete:
return replicaClient.ReplicaSnapshotDelete(replicaName, snapshotName)
Expand All @@ -1648,7 +1657,7 @@ func (e *Engine) replicaSnapshotOperation(spdkClient *spdkclient.Client, replica
if err := replicaClient.ReplicaSnapshotRevert(replicaName, snapshotName); err != nil {
return err
}
bdevName, err := connectNVMfBdev(spdkClient, replicaName, replicaStatus.Address)
bdevName, err := connectNVMfBdev(spdkClient, replicaName, replicaStatus.Address, e.ctrlrLossTimeout, e.fastIOFailTimeoutSec)
if err != nil {
return err
}
Expand Down Expand Up @@ -1954,8 +1963,9 @@ func (e *Engine) BackupRestoreFinish(spdkClient *spdkclient.Client) error {
return err
}
e.log.Infof("Attaching replica %s with address %s before finishing restoration", replicaName, replicaAddress)
_, err = spdkClient.BdevNvmeAttachController(replicaName, helpertypes.GetNQN(replicaName), replicaIP, replicaPort, spdktypes.NvmeTransportTypeTCP, spdktypes.NvmeAddressFamilyIPv4,
helpertypes.DefaultCtrlrLossTimeoutSec, helpertypes.DefaultReconnectDelaySec, helpertypes.DefaultFastIOFailTimeoutSec, helpertypes.DefaultMultipath)
_, err = spdkClient.BdevNvmeAttachController(replicaName, helpertypes.GetNQN(replicaName), replicaIP, replicaPort,
spdktypes.NvmeTransportTypeTCP, spdktypes.NvmeAddressFamilyIPv4,
int32(e.ctrlrLossTimeout), replicaReconnectDelaySec, int32(e.fastIOFailTimeoutSec), replicaMultipath)
if err != nil {
return err
}
Expand Down
14 changes: 9 additions & 5 deletions pkg/spdk/replica.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,21 +11,23 @@ import (

"github.com/pkg/errors"
"github.com/sirupsen/logrus"

grpccodes "google.golang.org/grpc/codes"
grpcstatus "google.golang.org/grpc/status"

"github.com/longhorn/backupstore"
"github.com/longhorn/go-spdk-helper/pkg/jsonrpc"
"github.com/longhorn/types/pkg/generated/spdkrpc"

btypes "github.com/longhorn/backupstore/types"
butil "github.com/longhorn/backupstore/util"
commonbitmap "github.com/longhorn/go-common-libs/bitmap"
commonnet "github.com/longhorn/go-common-libs/net"
commonutils "github.com/longhorn/go-common-libs/utils"
"github.com/longhorn/go-spdk-helper/pkg/jsonrpc"
spdkclient "github.com/longhorn/go-spdk-helper/pkg/spdk/client"
spdktypes "github.com/longhorn/go-spdk-helper/pkg/spdk/types"
helpertypes "github.com/longhorn/go-spdk-helper/pkg/types"
helperutil "github.com/longhorn/go-spdk-helper/pkg/util"
"github.com/longhorn/types/pkg/generated/spdkrpc"

"github.com/longhorn/longhorn-spdk-engine/pkg/api"
"github.com/longhorn/longhorn-spdk-engine/pkg/types"
Expand Down Expand Up @@ -1320,12 +1322,13 @@ func (r *Replica) RebuildingSrcAttach(spdkClient *spdkclient.Client, dstReplicaN
if r.rebuildingSrcCache.dstRebuildingBdevName != "" {
controllerName := helperutil.GetNvmeControllerNameFromNamespaceName(r.rebuildingSrcCache.dstRebuildingBdevName)
if dstRebuildingLvolName != controllerName {
return fmt.Errorf("found mismatching between the required dst bdev nvme controller name %s and the expected dst controller name %s for replica %s rebuilding src attach", dstRebuildingLvolName, controllerName, r.Name)
return fmt.Errorf("found mismatching between the required dst bdev NVMe controller name %s and the expected dst controller name %s for replica %s rebuilding src attach", dstRebuildingLvolName, controllerName, r.Name)
}
return nil
}

r.rebuildingSrcCache.dstRebuildingBdevName, err = connectNVMfBdev(spdkClient, dstRebuildingLvolName, dstRebuildingLvolAddress)
r.rebuildingSrcCache.dstRebuildingBdevName, err = connectNVMfBdev(spdkClient, dstRebuildingLvolName, dstRebuildingLvolAddress,
replicaCtrlrLossTimeoutSec, replicaFastIOFailTimeoutSec)
if err != nil {
return errors.Wrapf(err, "failed to connect rebuilding lvol %s with address %s as a NVMe bdev for replica %s rebuilding src attach", dstRebuildingLvolName, dstRebuildingLvolAddress, r.Name)
}
Expand Down Expand Up @@ -1432,7 +1435,8 @@ func (r *Replica) RebuildingDstStart(spdkClient *spdkclient.Client, srcReplicaNa
r.rebuildingDstCache.srcReplicaAddress = srcReplicaAddress

externalSnapshotLvolName := GetReplicaSnapshotLvolName(srcReplicaName, externalSnapshotName)
externalSnapshotBdevName, err := connectNVMfBdev(spdkClient, externalSnapshotLvolName, externalSnapshotAddress)
externalSnapshotBdevName, err := connectNVMfBdev(spdkClient, externalSnapshotLvolName, externalSnapshotAddress,
replicaCtrlrLossTimeoutSec, replicaFastIOFailTimeoutSec)
if err != nil {
return "", errors.Wrapf(err, "failed to connect the external src snapshot lvol %s with address %s as a NVMf bdev for dst replica %v rebuilding start", externalSnapshotLvolName, externalSnapshotAddress, r.Name)
}
Expand Down
7 changes: 4 additions & 3 deletions pkg/spdk/restore.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,11 @@ import (
"github.com/pkg/errors"
"github.com/sirupsen/logrus"

"github.com/longhorn/go-spdk-helper/pkg/nvme"

btypes "github.com/longhorn/backupstore/types"
commonns "github.com/longhorn/go-common-libs/ns"
commontypes "github.com/longhorn/go-common-libs/types"
"github.com/longhorn/go-spdk-helper/pkg/nvme"
spdkclient "github.com/longhorn/go-spdk-helper/pkg/spdk/client"
helpertypes "github.com/longhorn/go-spdk-helper/pkg/types"
helperutil "github.com/longhorn/go-spdk-helper/pkg/util"
Expand Down Expand Up @@ -156,9 +157,9 @@ func (r *Restore) OpenVolumeDev(volDevName string) (*os.File, string, error) {
}

func (r *Restore) CloseVolumeDev(volDev *os.File) error {
r.log.Infof("Closing nvme device %v", r.initiator.Endpoint)
r.log.Infof("Closing NVMe device %v", r.initiator.Endpoint)
if err := volDev.Close(); err != nil {
return errors.Wrapf(err, "failed to close nvme device %v", r.initiator.Endpoint)
return errors.Wrapf(err, "failed to close NVMe device %v", r.initiator.Endpoint)
}

r.log.Info("Stopping NVMe initiator")
Expand Down
22 changes: 12 additions & 10 deletions pkg/spdk/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,20 @@ import (
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/net/context"

"google.golang.org/protobuf/types/known/emptypb"

grpccodes "google.golang.org/grpc/codes"
grpcstatus "google.golang.org/grpc/status"
"google.golang.org/protobuf/types/known/emptypb"

"github.com/longhorn/backupstore"
"github.com/longhorn/go-spdk-helper/pkg/jsonrpc"
"github.com/longhorn/types/pkg/generated/spdkrpc"

butil "github.com/longhorn/backupstore/util"
commonbitmap "github.com/longhorn/go-common-libs/bitmap"
"github.com/longhorn/go-spdk-helper/pkg/jsonrpc"
spdkclient "github.com/longhorn/go-spdk-helper/pkg/spdk/client"
spdktypes "github.com/longhorn/go-spdk-helper/pkg/spdk/types"
helpertypes "github.com/longhorn/go-spdk-helper/pkg/types"
"github.com/longhorn/types/pkg/generated/spdkrpc"

"github.com/longhorn/longhorn-spdk-engine/pkg/api"
"github.com/longhorn/longhorn-spdk-engine/pkg/types"
Expand Down Expand Up @@ -65,12 +67,12 @@ func NewServer(ctx context.Context, portStart, portEnd int32) (*Server, error) {
}

if _, err = cli.BdevNvmeSetOptions(
helpertypes.DefaultCtrlrLossTimeoutSec,
helpertypes.DefaultReconnectDelaySec,
helpertypes.DefaultFastIOFailTimeoutSec,
helpertypes.DefaultTransportAckTimeout,
helpertypes.DefaultKeepAliveTimeoutMs); err != nil {
return nil, errors.Wrap(err, "failed to set nvme options")
replicaCtrlrLossTimeoutSec,
replicaReconnectDelaySec,
replicaFastIOFailTimeoutSec,
replicaTransportAckTimeout,
replicaKeepAliveTimeoutMs); err != nil {
return nil, errors.Wrap(err, "failed to set NVMe options")
}

broadcasters := map[types.InstanceType]*broadcaster.Broadcaster{}
Expand Down
Loading

0 comments on commit d213adf

Please sign in to comment.