From f8a4890438b2fe2179389b2b2520fe614b851cc8 Mon Sep 17 00:00:00 2001 From: Derek Su Date: Sun, 1 Dec 2024 20:00:58 +0800 Subject: [PATCH 1/2] chore(vendor): update dependencies Longhorn 9874 Signed-off-by: Derek Su --- go.mod | 2 +- go.sum | 4 ++-- .../go-spdk-helper/pkg/types/types.go | 24 +++++++++++++++---- vendor/modules.txt | 2 +- 4 files changed, 24 insertions(+), 8 deletions(-) diff --git a/go.mod b/go.mod index a127f6dc..4166e234 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,7 @@ require ( github.com/google/uuid v1.6.0 github.com/longhorn/backupstore v0.0.0-20241130163459-2b482603a2c6 github.com/longhorn/go-common-libs v0.0.0-20241128023039-4d6c3a880dbc - github.com/longhorn/go-spdk-helper v0.0.0-20241130163407-e912304fab8b + github.com/longhorn/go-spdk-helper v0.0.0-20241202131855-7d9a097456b2 github.com/longhorn/types v0.0.0-20241123075624-48c550af4eab github.com/pkg/errors v0.9.1 github.com/sirupsen/logrus v1.9.3 diff --git a/go.sum b/go.sum index a43fc65a..80bab85a 100644 --- a/go.sum +++ b/go.sum @@ -48,8 +48,8 @@ github.com/longhorn/backupstore v0.0.0-20241130163459-2b482603a2c6 h1:hcIAm6c92I github.com/longhorn/backupstore v0.0.0-20241130163459-2b482603a2c6/go.mod h1:cQXypqB6WonN0aIxWZWtUBPCOKlNNoi0hqkfYFsZlkI= github.com/longhorn/go-common-libs v0.0.0-20241128023039-4d6c3a880dbc h1:Ok7qdNu2038Oj7tQNaKjFqP20NqokR31a3RVMV7ulms= github.com/longhorn/go-common-libs v0.0.0-20241128023039-4d6c3a880dbc/go.mod h1:gSa+qB058kcNlCaOOwIFPHb3tvqMTmKcxtL7HPTS4o4= -github.com/longhorn/go-spdk-helper v0.0.0-20241130163407-e912304fab8b h1:FtYxFNC/HfjT3tq6te2unDzEroJNm6pG1UNnoKfApWk= -github.com/longhorn/go-spdk-helper v0.0.0-20241130163407-e912304fab8b/go.mod h1:siQvee7KIyFESzr5iQUVQavHCcSlzw2AjbdjchpDm4o= +github.com/longhorn/go-spdk-helper v0.0.0-20241202131855-7d9a097456b2 h1:r55x3FfMPn5mj2aXLvJT6ijfa9BkWQSL+qNZK1CKsb4= +github.com/longhorn/go-spdk-helper v0.0.0-20241202131855-7d9a097456b2/go.mod h1:siQvee7KIyFESzr5iQUVQavHCcSlzw2AjbdjchpDm4o= 
github.com/longhorn/types v0.0.0-20241123075624-48c550af4eab h1:vW/sSHB0U/GPnornD1cdoKSEe9bdBwVKP68cZUs1Xh0= github.com/longhorn/types v0.0.0-20241123075624-48c550af4eab/go.mod h1:dIPa2yMBPOa514bn3pohT7kbVAQcZbZSWItpgxunuPs= github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= diff --git a/vendor/github.com/longhorn/go-spdk-helper/pkg/types/types.go b/vendor/github.com/longhorn/go-spdk-helper/pkg/types/types.go index d63d53fd..12481635 100644 --- a/vendor/github.com/longhorn/go-spdk-helper/pkg/types/types.go +++ b/vendor/github.com/longhorn/go-spdk-helper/pkg/types/types.go @@ -22,6 +22,24 @@ const ( ShallowCopyStateComplete = "complete" ShallowCopyStateError = "error" + ExecuteTimeout = 60 * time.Second +) + +const ( + // Sequence of Events: + // 1. Issuing I/O Command: The system sends a read command to the NVMe SSD. + // 2. Waiting for ACK: The system waits for an acknowledgment from the NVMe SSD. + // 3. Timeout: If no ACK is received within 2^transport_ack_timeout milliseconds, + // the system considers the I/O operation as a failure and attempts to resend it. + // 4. Fast I/O Failure: If multiple retries fail consecutively and the total elapsed + // time exceeds fast_io_fail_timeout_sec seconds, the system determines that the I/O + // operation is stuck and takes further actions, such as raising an alarm or switching + // to a backup path. + // 5. Controller Loss: If the NVMe SSD does not respond at all for more than + // ctrlr_loss_timeout_sec seconds, the system considers the controller as lost. + // 6. Reconnect Attempt: The system attempts to reconnect to the NVMe SSD every + // 2Reconnect_Delay_Sec seconds. + DefaultCtrlrLossTimeoutSec = 15 // DefaultReconnectDelaySec can't be more than DefaultFastIOFailTimeoutSec. DefaultReconnectDelaySec = 2 @@ -29,15 +47,13 @@ const ( // DefaultTransportAckTimeout value is not the timeout second. // The timeout formula is 2^(transport_ack_timeout) msec. 
- // DefaultTransportAckTimeout is 14, so the default timeout is 2^14 = 16384 msec = 16.384 sec. + // DefaultTransportAckTimeout is set to 10, so the default timeout is 2^10 = 1024 msec = 1.024 sec. // By default, error detection on a qpair is very slow for TCP transports. For fast error // detection, transport_ack_timeout should be set. - DefaultTransportAckTimeout = 14 + DefaultTransportAckTimeout = 10 DefaultKeepAliveTimeoutMs = 10000 DefaultMultipath = "disable" - - ExecuteTimeout = 60 * time.Second ) func GetNQN(name string) string { diff --git a/vendor/modules.txt b/vendor/modules.txt index ad728696..c75df122 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -64,7 +64,7 @@ github.com/longhorn/go-common-libs/sync github.com/longhorn/go-common-libs/sys github.com/longhorn/go-common-libs/types github.com/longhorn/go-common-libs/utils -# github.com/longhorn/go-spdk-helper v0.0.0-20241130163407-e912304fab8b +# github.com/longhorn/go-spdk-helper v0.0.0-20241202131855-7d9a097456b2 ## explicit; go 1.22.7 github.com/longhorn/go-spdk-helper/pkg/jsonrpc github.com/longhorn/go-spdk-helper/pkg/nvme From 2559c18694f2a7e89fae4f2c5eef2cc6367810d7 Mon Sep 17 00:00:00 2001 From: Derek Su Date: Sun, 1 Dec 2024 19:54:13 +0800 Subject: [PATCH 2/2] feat: decrease transport_ack_timeout Decreasing transport_ack_timeout value for improving the error detection of the transport layer. 
Longhorn 9874 Signed-off-by: Derek Su --- pkg/spdk/disk/nvme/nvme.go | 2 +- pkg/spdk/types.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/spdk/disk/nvme/nvme.go b/pkg/spdk/disk/nvme/nvme.go index b7003892..7162222b 100644 --- a/pkg/spdk/disk/nvme/nvme.go +++ b/pkg/spdk/disk/nvme/nvme.go @@ -20,7 +20,7 @@ const ( diskCtrlrLossTimeoutSec = 30 diskReconnectDelaySec = 2 diskFastIOFailTimeoutSec = 15 - diskTransportAckTimeout = 14 + diskTransportAckTimeout = 10 diskKeepAliveTimeoutMs = 10000 diskMultipath = "disable" ) diff --git a/pkg/spdk/types.go b/pkg/spdk/types.go index a56ac937..004cdb58 100644 --- a/pkg/spdk/types.go +++ b/pkg/spdk/types.go @@ -49,7 +49,7 @@ const ( replicaCtrlrLossTimeoutSec = 15 replicaReconnectDelaySec = 2 replicaFastIOFailTimeoutSec = 10 - replicaTransportAckTimeout = 14 + replicaTransportAckTimeout = 10 replicaKeepAliveTimeoutMs = 10000 replicaMultipath = "disable" )