From f4d0de9629d3704bbd6034190c7bbff8a01f9287 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Thu, 28 Nov 2024 10:55:43 +0000 Subject: [PATCH 1/3] Don't unwrap when removing invalid TCP state from fastpath. Now that the fastpath temporarily drops (and reacquires) the port lock when TCP state needs to be invalidated, we opened ourselves up to the possibility that another packet could have removed this state ahead of us. Equally, a packet could insert new TCP state which we might accidentally remove. This PR removes the unwrap on removal to account for the race, and only removes TCP flows if they are pointer-equal. Closes #618. --- lib/opte/src/engine/port.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/lib/opte/src/engine/port.rs b/lib/opte/src/engine/port.rs index 94fe776a..967a25a5 100644 --- a/lib/opte/src/engine/port.rs +++ b/lib/opte/src/engine/port.rs @@ -1395,9 +1395,19 @@ impl Port { let ufid_out = &flow_lock.outbound_ufid; let ufid_in = flow_lock.inbound_ufid.as_ref(); - self.uft_tcp_closed(&mut local_lock, ufid_out, ufid_in); - let _ = local_lock.tcp_flows.remove(ufid_out).unwrap(); + // Because we've dropped the port lock, another flow could have also + // invalidated this flow and removed the entry. It could even install + // new UFT/TCP entries, depending on ordering. + // + // Verify that the state we want to remove still exists, and is + // `Arc`-identical. + if let Some(found_entry) = local_lock.tcp_flows.get(ufid_out) { + if Arc::ptr_eq(found_entry, &entry) { + self.uft_tcp_closed(&mut local_lock, ufid_out, ufid_in); + _ = local_lock.tcp_flows.remove(ufid_out); + } + } if reprocess { lock = Some(local_lock); From 65380ae169d38406d72cadd596ee55dddb62d504 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Thu, 28 Nov 2024 12:18:31 +0000 Subject: [PATCH 2/3] Revisit comment. --- lib/opte/src/engine/port.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/opte/src/engine/port.rs b/lib/opte/src/engine/port.rs index 967a25a5..c0c4f9b8 100644 --- a/lib/opte/src/engine/port.rs +++ b/lib/opte/src/engine/port.rs @@ -1396,9 +1396,9 @@ impl Port { let ufid_in = flow_lock.inbound_ufid.as_ref(); - // Because we've dropped the port lock, another flow could have also + // Because we've dropped the port lock, another packet could have also // invalidated this flow and removed the entry. It could even install - // new UFT/TCP entries, depending on ordering. + // new UFT/TCP entries, depending on lock/process ordering. // // Verify that the state we want to remove still exists, and is // `Arc`-identical. From 178a1785f264fcf965e5edbfc20af86a3d5801c6 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 29 Nov 2024 15:01:49 +0000 Subject: [PATCH 3/3] Correctly set ioctl resp_buf capacity on ENOBUFS Closes #624. --- lib/opte-ioctl/src/lib.rs | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/lib/opte-ioctl/src/lib.rs b/lib/opte-ioctl/src/lib.rs index afe86ef2..505039e0 100644 --- a/lib/opte-ioctl/src/lib.rs +++ b/lib/opte-ioctl/src/lib.rs @@ -285,7 +285,8 @@ where // It would be a shame if the command failed and we didn't have enough bytes // to serialize the error response, so we set this to default to 16 KiB. - let mut resp_buf = Vec::with_capacity(16 * 1024); + const BASE_CAPACITY: usize = 16 * 1024; + let mut resp_buf = Vec::with_capacity(BASE_CAPACITY); let mut rioctl = OpteCmdIoctl { api_version: API_VERSION, cmd, @@ -313,8 +314,18 @@ where if raw_err == libc::ENOBUFS { assert!(rioctl.resp_len_actual != 0); assert!(rioctl.resp_len_actual > resp_buf.capacity()); - // Make room for the size the kernel claims to need - resp_buf.reserve(rioctl.resp_len_actual - resp_buf.capacity()); + + // Make room for at least the size the kernel claims to need. + // This can be slightly tricky: since every retry reruns the + // command, the size of the next resp could change from under + // us (increase or decrease). Keep some headroom to account + // for this. + let wanted_capacity = + BASE_CAPACITY / 4 + rioctl.resp_len_actual; + + // XDE could write into `resp_buf` (but does not). + // .len() *should* be zero -- but don't bank on it. + resp_buf.reserve(wanted_capacity - resp_buf.len()); rioctl.resp_bytes = resp_buf.as_mut_ptr(); rioctl.resp_len = resp_buf.capacity(); rioctl.resp_len_actual = 0;