diff --git a/Cargo.lock b/Cargo.lock index 81e14834..7b399c85 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -875,6 +875,9 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" [[package]] name = "illumos-sys-hdrs" version = "0.1.0" +dependencies = [ + "bitflags 2.6.0", +] [[package]] name = "indexmap" diff --git a/crates/illumos-sys-hdrs/Cargo.toml b/crates/illumos-sys-hdrs/Cargo.toml index 3fe76454..f6196a93 100644 --- a/crates/illumos-sys-hdrs/Cargo.toml +++ b/crates/illumos-sys-hdrs/Cargo.toml @@ -8,4 +8,7 @@ repository.workspace = true [features] default = [] -kernel = [] \ No newline at end of file +kernel = [] + +[dependencies] +bitflags.workspace = true diff --git a/crates/illumos-sys-hdrs/src/lib.rs b/crates/illumos-sys-hdrs/src/lib.rs index 12bb8d1d..455bf8fb 100644 --- a/crates/illumos-sys-hdrs/src/lib.rs +++ b/crates/illumos-sys-hdrs/src/lib.rs @@ -11,6 +11,10 @@ pub mod kernel; #[cfg(feature = "kernel")] pub use kernel::*; +use mac::mac_ether_offload_info_t; +use mac::mac_ether_tun_info_t; + +pub mod mac; use core::ptr; @@ -236,6 +240,9 @@ pub struct dblk_t { pub db_struioun: u64, // imprecise pub db_fthdr: *const c_void, // imprecise pub db_credp: *const c_void, // imprecise + + pub db_meoi: mac_ether_offload_info_t, + pub db_mett: mac_ether_tun_info_t, } impl Default for dblk_t { @@ -259,6 +266,9 @@ impl Default for dblk_t { db_struioun: 0, db_fthdr: ptr::null(), db_credp: ptr::null(), + + db_meoi: Default::default(), + db_mett: Default::default(), } } } diff --git a/crates/illumos-sys-hdrs/src/mac.rs b/crates/illumos-sys-hdrs/src/mac.rs new file mode 100644 index 00000000..c1618be5 --- /dev/null +++ b/crates/illumos-sys-hdrs/src/mac.rs @@ -0,0 +1,123 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ +// Copyright 2024 Oxide Computer Company + +#[cfg(feature = "kernel")] +use crate::mblk_t; +use bitflags::bitflags; + +// ====================================================================== +// uts/common/sys/mac_provider.h +// ====================================================================== + +bitflags! { +#[repr(C)] +#[derive(Clone, Copy, Debug, Default)] +/// Flags which denote the valid fields of a `mac_ether_offload_info_t` +/// or `mac_ether_tun_info_t`. +pub struct MacEtherOffloadFlags: u8 { + /// `l2hlen` and `l3proto` are set. + const L2INFO_SET = 1 << 0; + /// The ethernet header contains a VLAN tag. + const VLAN_TAGGED = 1 << 1; + /// `l3hlen` and `l4proto` are set. + const L3INFO_SET = 1 << 2; + /// `l4hlen` is set. + const L4INFO_SET = 1 << 3; + /// `tuntype` is set. + const TUNINFO_SET = 1 << 4; +} +} + +#[repr(C)] +#[derive(Copy, Clone, Debug, Default, Eq, PartialEq)] +pub struct MacTunType(u8); + +impl MacTunType { + pub const NONE: Self = Self(0); + pub const GENEVE: Self = Self(1); + pub const VXLAN: Self = Self(2); +} + +#[repr(C)] +#[derive(Clone, Copy, Debug, Default)] +pub struct mac_ether_offload_info_t { + pub meoi_flags: MacEtherOffloadFlags, + pub meoi_l2hlen: u8, + pub meoi_l3proto: u16, + pub meoi_l3hlen: u16, + pub meoi_l4proto: u8, + pub meoi_l4hlen: u8, + pub meoi_len: u32, +} + +#[repr(C)] +#[derive(Clone, Copy, Debug, Default)] +pub struct mac_ether_tun_info_t { + pub mett_flags: MacEtherOffloadFlags, + pub mett_tuntype: MacTunType, + pub mett_l2hlen: u8, + pub mett_l3proto: u16, + pub mett_l3hlen: u16, +} + +#[cfg(feature = "kernel")] +extern "C" { + pub fn lso_info_set(mp: *mut mblk_t, mss: u32, flags: u32); + pub fn lso_info_cleanup(mp: *mut mblk_t); + pub fn mac_hcksum_set( + mp: *mut mblk_t, + start: u32, + stuff: u32, + end: u32, + value: u32, + flags: u32, + ); + pub fn mac_hcksum_get( + mp: *mut mblk_t, + start: *mut u32, + stuff: *mut u32, + end: *mut u32, + value: *mut u32, + flags: *mut u32, + ); +} + +// 
====================================================================== +// uts/common/sys/pattr.h +// ====================================================================== + +bitflags! { +/// Flags which denote checksum and LSO state for an `mblk_t`. +pub struct MblkOffloadFlags: u16 { + /// Tx: IPv4 header checksum must be computed by hardware. + const HCK_IPV4_HDRCKSUM = 1 << 0; + /// Rx: IPv4 header checksum was verified correct by hardware. + const HCK_IPV4_HDRCKSUM_OK = Self::HCK_IPV4_HDRCKSUM.bits(); + /// * Tx: Compute partial checksum based on start/stuff/end offsets. + /// * Rx: Partial checksum computed and attached. + const HCK_PARTIALCKSUM = 1 << 1; + /// * Tx: Compute full (pseudo + l4 + payload) cksum for this packet. + /// * Rx: Full checksum was computed in hardware, and is attached. + const HCK_FULLCKSUM = 1 << 2; + /// Rx: Hardware has verified that L3/L4 checksums are correct. + const HCK_FULLCKSUM_OK = 1 << 3; + /// Tx: Hardware must perform LSO. + const HW_LSO = 1 << 4; + /// Tx: Hardware must compute all checksums for the outer tunnel + /// encapsulation of this packet. 
+ const HCK_FULLOUTERCKSUM = 1 << 5; + + const HCK_FLAGS = Self::HCK_IPV4_HDRCKSUM.bits() | + Self::HCK_PARTIALCKSUM.bits() | Self::HCK_FULLCKSUM.bits() | + Self::HCK_FULLCKSUM_OK.bits() | Self::HCK_FULLOUTERCKSUM.bits(); + + const HCK_TX_FLAGS = Self::HCK_IPV4_HDRCKSUM.bits() | + Self::HCK_PARTIALCKSUM.bits() | Self::HCK_FULLCKSUM.bits() | + Self::HCK_FULLOUTERCKSUM.bits(); + + const HW_LSO_FLAGS = Self::HW_LSO.bits(); +} +} diff --git a/lib/opte/Cargo.toml b/lib/opte/Cargo.toml index dcc77f47..4aa8fc9a 100644 --- a/lib/opte/Cargo.toml +++ b/lib/opte/Cargo.toml @@ -29,7 +29,7 @@ opte-api.workspace = true ingot.workspace = true -bitflags.workspace = true +bitflags = { workspace = true , features = ["serde"] } cfg-if.workspace = true crc32fast = { workspace = true, optional = true } dyn-clone.workspace = true diff --git a/lib/opte/src/ddi/mblk.rs b/lib/opte/src/ddi/mblk.rs index 13cacfe9..b06ff25a 100644 --- a/lib/opte/src/ddi/mblk.rs +++ b/lib/opte/src/ddi/mblk.rs @@ -27,6 +27,12 @@ use illumos_sys_hdrs as ddi; use illumos_sys_hdrs::c_uchar; #[cfg(any(feature = "std", test))] use illumos_sys_hdrs::dblk_t; +use illumos_sys_hdrs::mac::mac_ether_offload_info_t; +use illumos_sys_hdrs::mac::mac_ether_tun_info_t; +#[cfg(all(not(feature = "std"), not(test)))] +use illumos_sys_hdrs::mac::MacEtherOffloadFlags; +use illumos_sys_hdrs::mac::MacTunType; +use illumos_sys_hdrs::mac::MblkOffloadFlags; use illumos_sys_hdrs::mblk_t; use illumos_sys_hdrs::uintptr_t; use ingot::types::Emit; @@ -36,6 +42,14 @@ use ingot::types::Read; pub static MBLK_MAX_SIZE: usize = u16::MAX as usize; +/// Abstractions over an `mblk_t` which can be returned to their +/// raw pointer representation. +pub trait AsMblk { + /// Consume `self`, returning the underlying `mblk_t`. The caller of this + /// function now owns the underlying segment chain. + fn unwrap_mblk(self) -> Option>; +} + /// The head and tail of an mblk_t list. 
struct MsgBlkChainInner { head: NonNull, @@ -146,11 +160,10 @@ impl MsgBlkChain { self.0 = Some(MsgBlkChainInner { head: pkt, tail: pkt }); } } +} - /// Return the head of the underlying `mblk_t` packet chain and - /// consume `self`. The caller of this function now owns the - /// `mblk_t` segment chain. - pub fn unwrap_mblk(mut self) -> Option> { +impl AsMblk for MsgBlkChain { + fn unwrap_mblk(mut self) -> Option> { self.0.take().map(|v| v.head) } } @@ -615,9 +628,7 @@ impl MsgBlk { /// consume `self`. The caller of this function now owns the /// `mblk_t` segment chain. pub fn unwrap_mblk(self) -> NonNull { - let ptr_out = self.0; - _ = ManuallyDrop::new(self); - ptr_out + AsMblk::unwrap_mblk(self).unwrap() } /// Wrap the `mblk_t` packet in a [`MsgBlk`], taking ownership of @@ -707,6 +718,108 @@ impl MsgBlk { self.0 = head; } + + /// Copies the offload information from this message block to + /// another, including checksum/LSO flags and TCP MSS (if set). + pub fn copy_offload_info_to(&self, other: &mut Self) { + unsafe { + let info = offload_info(self.0); + set_offload_info(other.0, info); + } + } + + #[allow(unused)] + pub fn request_offload( + &mut self, + cksum_needed: bool, + is_tcp: bool, + mss: u32, + ) { + let ckflags = if cksum_needed { + MblkOffloadFlags::HCK_IPV4_HDRCKSUM + | MblkOffloadFlags::HCK_FULLCKSUM + } else { + MblkOffloadFlags::empty() + }; + #[cfg(all(not(feature = "std"), not(test)))] + unsafe { + if !ckflags.is_empty() { + illumos_sys_hdrs::mac::mac_hcksum_set( + self.0.as_ptr(), + 0, + 0, + 0, + 0, + ckflags.bits() as u32, + ); + } + if is_tcp { + illumos_sys_hdrs::mac::lso_info_set( + self.0.as_ptr(), + mss, + MblkOffloadFlags::HW_LSO.bits() as u32, + ); + } + } + } + + #[allow(unused)] + pub fn strip_lso(&mut self) { + #[cfg(all(not(feature = "std"), not(test)))] + unsafe { + illumos_sys_hdrs::mac::lso_info_cleanup(self.0.as_ptr()); + } + } + + #[allow(unused)] + pub fn set_tuntype(&mut self, tuntype: MacTunType) { + 
#[cfg(all(not(feature = "std"), not(test)))] + unsafe { + (*(*self.0.as_ptr()).b_datap).db_mett.mett_tuntype = tuntype; + (*(*self.0.as_ptr()).b_datap).db_mett.mett_flags |= + MacEtherOffloadFlags::TUNINFO_SET; + } + } + + #[allow(unused)] + pub fn fill_offload_info( + &mut self, + tun_meoi: &mac_ether_tun_info_t, + ulp_meoi: &mac_ether_offload_info_t, + ) { + #[cfg(all(not(feature = "std"), not(test)))] + unsafe { + (*(*self.0.as_ptr()).b_datap).db_mett = *tun_meoi; + (*(*self.0.as_ptr()).b_datap).db_meoi = *ulp_meoi; + } + } + + #[allow(unused)] + pub fn cksum_flags(&self) -> MblkOffloadFlags { + let mut out = 0u32; + + #[cfg(all(not(feature = "std"), not(test)))] + unsafe { + illumos_sys_hdrs::mac::mac_hcksum_get( + self.0.as_ptr(), + ptr::null_mut(), + ptr::null_mut(), + ptr::null_mut(), + ptr::null_mut(), + &raw mut out, + ) + }; + + MblkOffloadFlags::from_bits_retain(out as u16) + } +} + +impl AsMblk for MsgBlk { + fn unwrap_mblk(self) -> Option> { + let ptr_out = self.0; + _ = ManuallyDrop::new(self); + Some(ptr_out) + } } /// An interior node of an [`MsgBlk`]'s chain, accessed via iterator. @@ -1046,6 +1159,8 @@ pub fn mock_desballoc(buf: Vec) -> *mut mblk_t { db_struioun: 0, db_fthdr: ptr::null(), db_credp: ptr::null(), + + ..Default::default() }); let dbp = Box::into_raw(dblk); diff --git a/lib/opte/src/engine/packet.rs b/lib/opte/src/engine/packet.rs index 3269c0da..e3f86a06 100644 --- a/lib/opte/src/engine/packet.rs +++ b/lib/opte/src/engine/packet.rs @@ -35,6 +35,7 @@ use super::ip::L3; use super::parse::NoEncap; use super::parse::Ulp; use super::parse::UlpRepr; +use super::port::meta::ActionMeta; use super::rule::CompiledEncap; use super::rule::CompiledTransform; use super::rule::HdrTransform; @@ -964,7 +965,10 @@ impl Packet> { #[inline] /// Convert a packet's metadata into a set of instructions /// needed to serialize all its changes to the wire. 
- pub fn emit_spec(&mut self) -> Result + pub fn emit_spec( + &mut self, + action_meta: &ActionMeta, + ) -> Result where T::Chunk: ByteSliceMut, { @@ -1169,6 +1173,7 @@ impl Packet> { ulp_len: encapped_len as u32, prepend: PushSpec::Slowpath(push_spec.into()), l4_hash, + mtu_unrestricted: action_meta.is_internal_target(), }) } @@ -1594,11 +1599,18 @@ pub struct EmitSpec { pub(crate) l4_hash: u32, pub(crate) rewind: u16, pub(crate) ulp_len: u32, + pub(crate) mtu_unrestricted: bool, } impl Default for EmitSpec { fn default() -> Self { - Self { prepend: PushSpec::NoOp, l4_hash: 0, rewind: 0, ulp_len: 0 } + Self { + prepend: PushSpec::NoOp, + l4_hash: 0, + rewind: 0, + ulp_len: 0, + mtu_unrestricted: false, + } } } @@ -1610,6 +1622,14 @@ impl EmitSpec { self.l4_hash } + /// Return whether this packet's route allows the use of a full jumbo frame + /// MSS. + #[inline] + #[must_use] + pub fn mtu_unrestricted(&self) -> bool { + self.mtu_unrestricted + } + /// Perform final structural transformations to a packet (removal of /// existing headers, and copying in new/replacement headers). #[inline] @@ -1744,6 +1764,7 @@ impl EmitSpec { } if let Some(mut prepend) = prepend { + pkt.copy_offload_info_to(&mut prepend); prepend.append(pkt); prepend } else { diff --git a/lib/opte/src/engine/port/meta.rs b/lib/opte/src/engine/port/meta.rs new file mode 100644 index 00000000..5d7d65d4 --- /dev/null +++ b/lib/opte/src/engine/port/meta.rs @@ -0,0 +1,107 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2024 Oxide Computer Company +use alloc::collections::BTreeMap; +use alloc::string::String; +use alloc::string::ToString; + +/// A value meant to be used in the [`ActionMeta`] map. +/// +/// The purpose of this trait is to define the value's key as well +/// as serialization to/from strings. 
These are like Display and +/// FromStr; but here their focus is on unambiguous parsing. That +/// is, we can't necessarily rely on a type's Display impl being +/// good for serializing to a metadata string, but at the same +/// time we don't want to force its Display to have to work in +/// this constraint. +/// +/// A value doesn't have to implement this type; there is nothing +/// that enforces the strings stored in [`ActionMeta`] are strings +/// generated by this trait impl. It's just a convenient way to +/// mark and implement values meant to be used as action metadata. +pub trait ActionMetaValue: Sized { + const KEY: &'static str; + + fn key(&self) -> String { + Self::KEY.to_string() + } + + /// Create a representation of the value to be used in + /// [`ActionMeta`]. + fn as_meta(&self) -> String; + + /// Attempt to create a value assuming that `s` was created + /// with [`Self::as_meta()`]. + fn from_meta(s: &str) -> Result; +} + +/// The action metadata map. +/// +/// This metadata is accessible by all actions during layer +/// processing and acts as a form of inter-action communication. +/// Given that packets and their metadata are immutable (outside of +/// reified header transforms), this also allows actions to inform +/// OPTE of facts about a path or destination (e.g., MTU). +/// +/// Action metadata is nothing more than a map of string keys +/// to string values. It is up to the actions to decide what these strings +/// mean. However, *all keys prefaced with "opte:" are reserved for use by +/// operations on `ActionMeta`*. +#[derive(Default)] +pub struct ActionMeta { + inner: BTreeMap, +} + +impl ActionMeta { + const INTERNAL_TARGET: &str = "opte:internal-target"; + + pub fn new() -> Self { + Self::default() + } + + /// Clear all entries. + pub fn clear(&mut self) { + self.inner.clear(); + } + + /// Insert the key-value pair into the map, replacing any + /// existing key-value pair. Return the value being replaced, + /// or `None`. 
+ pub fn insert(&mut self, key: String, val: String) -> Option { + self.inner.insert(key, val) + } + + /// Remove the key-value pair with the specified key. Return + /// the value, or `None` if no such entry exists. + pub fn remove(&mut self, key: &str) -> Option { + self.inner.remove(key) + } + + /// Get a reference to the value with the given key, or `None` + /// if no such entry exists. + pub fn get(&self, key: &str) -> Option<&String> { + self.inner.get(key) + } + + /// Records whether this packet's destination can be reached using only + /// internal/private paths. + /// + /// The dataplane may use this to choose a larger (jumbo-frame) MSS for + /// TCP segmentation, or rely on other aspects of its internal network. + pub fn set_internal_target(&mut self, val: bool) { + _ = self.insert( + Self::INTERNAL_TARGET.into(), + if val { "1".into() } else { "0".into() }, + ); + } + + /// Returns whether this packet's destination can be reached using only + /// internal/private paths. + pub fn is_internal_target(&self) -> bool { + self.get(Self::INTERNAL_TARGET) + .and_then(|v| Some(v == "1")) + .unwrap_or_default() + } +} diff --git a/lib/opte/src/engine/port.rs b/lib/opte/src/engine/port/mod.rs similarity index 96% rename from lib/opte/src/engine/port.rs rename to lib/opte/src/engine/port/mod.rs index c0c4f9b8..ee7c40cc 100644 --- a/lib/opte/src/engine/port.rs +++ b/lib/opte/src/engine/port/mod.rs @@ -6,7 +6,6 @@ //! A virtual switch port. 
-use self::meta::ActionMeta; use super::ether::Ethernet; use super::flow_table::Dump; use super::flow_table::FlowEntry; @@ -46,6 +45,7 @@ use super::rule::Finalized; use super::rule::HdrTransform; use super::rule::HdrTransformError; use super::rule::Rule; +use super::rule::TransformFlags; use super::tcp::TcpState; use super::tcp::KEEPALIVE_EXPIRE_TTL; use super::tcp::TIME_WAIT_EXPIRE_TTL; @@ -94,12 +94,16 @@ use ingot::types::Emit; use ingot::types::HeaderLen; use ingot::types::Read; use ingot::udp::Udp; +use meta::ActionMeta; use opte_api::Direction; use opte_api::MacAddr; use opte_api::OpteError; use zerocopy::ByteSlice; use zerocopy::ByteSliceMut; +/// Metadata for inter-action communication. +pub mod meta; + pub type Result = result::Result; #[derive(Debug)] @@ -1423,7 +1427,7 @@ impl Port { let len = pkt.len(); let meta = pkt.meta_mut(); - let body_csum = if tx.checksums_dirty { + let body_csum = if tx.checksums_dirty() { meta.compute_body_csum() } else { None @@ -1439,6 +1443,7 @@ impl Port { _ => 0, }; let out = EmitSpec { + mtu_unrestricted: tx.local_destination(), prepend: PushSpec::Fastpath(tx), l4_hash, rewind, @@ -1532,7 +1537,7 @@ impl Port { } InternalProcessResult::Hairpin(v) => Ok(ProcessResult::Hairpin(v)), InternalProcessResult::Modified => pkt - .emit_spec() + .emit_spec(&ameta) .map_err(|_| ProcessError::BadEmitSpec) .map(ProcessResult::Modified), }); @@ -1727,7 +1732,7 @@ impl Transforms { } #[inline] - fn compile(mut self, checksums_dirty: bool) -> Arc { + fn compile(mut self, flags: TransformFlags) -> Arc { // Compile to a fasterpath transform iff. no body transform. 
if self.body.is_empty() { let mut still_permissable = true; @@ -1890,7 +1895,7 @@ impl Transforms { inner_ether: inner_ether.cloned(), inner_ip: inner_ip.cloned(), inner_ulp: inner_ulp.cloned(), - checksums_dirty, + flags, } .into(), ); @@ -2320,10 +2325,18 @@ impl Port { Err(e) => return Err(ProcessError::Layer(e)), } + let mut flags = TransformFlags::empty(); + if pkt.checksums_dirty() { + flags |= TransformFlags::CSUM_DIRTY; + } + if ameta.is_internal_target() { + flags |= TransformFlags::LOCAL_DESTINATION; + } + let ufid_out = pkt.flow().mirror(); let mut hte = UftEntry { - pair: KMutex::new(Some(ufid_out), KMutexType::Spin), - xforms: xforms.compile(pkt.checksums_dirty()), + pair: KMutex::new(Some(ufid_out), KMutexType::Driver), + xforms: xforms.compile(flags), epoch, l4_hash: ufid_in.crc32(), tcp_flow: None, @@ -2546,9 +2559,17 @@ impl Port { let flow_before = *pkt.flow(); let res = self.layers_process(data, Out, pkt, &mut xforms, ameta); + let mut flags = TransformFlags::empty(); + if pkt.checksums_dirty() { + flags |= TransformFlags::CSUM_DIRTY; + } + if ameta.is_internal_target() { + flags |= TransformFlags::LOCAL_DESTINATION; + } + let hte = UftEntry { - pair: KMutex::new(None, KMutexType::Spin), - xforms: xforms.compile(pkt.checksums_dirty()), + pair: KMutex::new(None, KMutexType::Driver), + xforms: xforms.compile(flags), epoch, l4_hash: flow_before.crc32(), tcp_flow, @@ -2855,7 +2876,7 @@ impl TcpFlowEntryState { bytes_in, bytes_out: 0, }, - KMutexType::Spin, + KMutexType::Driver, ), } } @@ -2876,7 +2897,7 @@ impl TcpFlowEntryState { bytes_in: 0, bytes_out, }, - KMutexType::Spin, + KMutexType::Driver, ), } } @@ -3045,82 +3066,3 @@ extern "C" { ifid: *const InnerFlowId, ); } - -/// Metadata for inter-action communication. -pub mod meta { - use alloc::collections::BTreeMap; - use alloc::string::String; - use alloc::string::ToString; - - /// A value meant to be used in the [`ActionMeta`] map. 
- /// - /// The purpose of this trait is to define the value's key as well - /// as serialization to/from strings. These are like Display and - /// FromStr; but here their focus is on unambiguous parsing. That - /// is, we can't necessarily rely on a type's Display impl being - /// good for serializing to a metadata string, but at the same - /// time we don't want to force its Display to have to work in - /// this constraint. - /// - /// A value doesn't have to implement this type; there is nothing - /// that enforces the strings stored in [`ActionMeta`] are strings - /// generated by this trait impl. It's just a convenient way to - /// mark and implement values meant to be used as action metadata. - pub trait ActionMetaValue: Sized { - const KEY: &'static str; - - fn key(&self) -> String { - Self::KEY.to_string() - } - - /// Create a representation of the value to be used in - /// [`ActionMeta`]. - fn as_meta(&self) -> String; - - /// Attempt to create a value assuming that `s` was created - /// with [`Self::as_meta()`]. - fn from_meta(s: &str) -> Result; - } - - /// The action metadata map. - /// - /// This metadata is accessible by all actions during layer - /// processing and acts as a form of inter-action communication. - /// The action metadata is nothing more than a map of string keys - /// to string values -- their meaning is opaque to OPTE itself. It - /// is up to the actions to decide what these strings mean. - #[derive(Default)] - pub struct ActionMeta { - inner: BTreeMap, - } - - impl ActionMeta { - pub fn new() -> Self { - Self::default() - } - - /// Clear all entries. - pub fn clear(&mut self) { - self.inner.clear(); - } - - /// Insert the key-value pair into the map, replacing any - /// existing key-value pair. Return the value being replaced, - /// or `None`. - pub fn insert(&mut self, key: String, val: String) -> Option { - self.inner.insert(key, val) - } - - /// Remove the key-value pair with the specified key. 
Return - /// the value, or `None` if no such entry exists. - pub fn remove(&mut self, key: &str) -> Option { - self.inner.remove(key) - } - - /// Get a reference to the value with the given key, or `None` - /// if no such entry exists. - pub fn get(&self, key: &str) -> Option<&String> { - self.inner.get(key) - } - } -} diff --git a/lib/opte/src/engine/rule.rs b/lib/opte/src/engine/rule.rs index e4fa9936..f10efa0c 100644 --- a/lib/opte/src/engine/rule.rs +++ b/lib/opte/src/engine/rule.rs @@ -45,6 +45,7 @@ use alloc::string::String; use alloc::string::ToString; use alloc::sync::Arc; use alloc::vec::Vec; +use bitflags::bitflags; use core::ffi::CStr; use core::fmt; use core::fmt::Debug; @@ -319,6 +320,14 @@ impl Display for HdrTransform { } } +bitflags! { + #[derive(Copy, Clone, Debug, Deserialize, Serialize)] + pub struct TransformFlags: u8 { + const CSUM_DIRTY = 1; + const LOCAL_DESTINATION = 1 << 1; + } +} + /// Header transformations matching a simple format, amenable /// to fastpath compilation: /// * Encap is either pushed or popped in its entirety, @@ -331,10 +340,20 @@ pub struct CompiledTransform { pub inner_ether: Option, pub inner_ip: Option, pub inner_ulp: Option, - pub checksums_dirty: bool, + pub flags: TransformFlags, } impl CompiledTransform { + #[inline] + pub fn checksums_dirty(&self) -> bool { + self.flags.contains(TransformFlags::CSUM_DIRTY) + } + + #[inline] + pub fn local_destination(&self) -> bool { + self.flags.contains(TransformFlags::LOCAL_DESTINATION) + } + #[inline(always)] pub fn transform_ether( &self, @@ -498,6 +517,7 @@ impl CompiledEncap { *l4_len_slot = (l4_len as u16).to_be_bytes(); if let Some(mut prepend) = prepend { + pkt.copy_offload_info_to(&mut prepend); prepend.append(pkt); prepend } else { diff --git a/lib/oxide-vpc/src/engine/overlay.rs b/lib/oxide-vpc/src/engine/overlay.rs index 1255d52f..a65a1289 100644 --- a/lib/oxide-vpc/src/engine/overlay.rs +++ b/lib/oxide-vpc/src/engine/overlay.rs @@ -238,6 +238,7 @@ impl StaticAction 
for EncapAction { let phys_target = match target { RouterTargetInternal::InternetGateway(_) => { + action_meta.set_internal_target(false); match self.v2b.get(&flow_id.dst_ip()) { Some(phys) => { // Hash the packet onto a route target. This is a very @@ -260,6 +261,7 @@ impl StaticAction for EncapAction { RouterTargetInternal::Ip(virt_ip) => match self.v2p.get(&virt_ip) { Some(phys) => { + action_meta.set_internal_target(true); PhysNet { ether: phys.ether, ip: phys.ip, vni: self.vni } } diff --git a/xde/src/dls/mod.rs b/xde/src/dls/mod.rs index 5cc7aaa0..d1d77631 100644 --- a/xde/src/dls/mod.rs +++ b/xde/src/dls/mod.rs @@ -21,7 +21,7 @@ use illumos_sys_hdrs::c_int; use illumos_sys_hdrs::datalink_id_t; use illumos_sys_hdrs::uintptr_t; use illumos_sys_hdrs::ENOENT; -use opte::ddi::mblk::MsgBlk; +use opte::ddi::mblk::AsMblk; pub use sys::*; /// An integer ID used by DLS to refer to a given link. @@ -198,7 +198,7 @@ impl DlsStream { /// but for now we pass only a single packet at a time. pub fn tx_drop_on_no_desc( &self, - pkt: MsgBlk, + pkt: impl AsMblk, hint: uintptr_t, flags: MacTxFlags, ) { @@ -209,13 +209,16 @@ impl DlsStream { // We must unwrap the raw `mblk_t` out of the `pkt` here, // otherwise the mblk_t would be dropped at the end of this // function along with `pkt`. 
+ let Some(mblk) = pkt.unwrap_mblk() else { + return; + }; let mut raw_flags = flags.bits(); raw_flags |= MAC_DROP_ON_NO_DESC; unsafe { // mac_tx(self.mch, pkt.unwrap_mblk(), hint, raw_flags, &mut ret_mp) str_mdata_fastpath_put( inner.dld_str.as_ptr(), - pkt.unwrap_mblk().as_ptr(), + mblk.as_ptr(), hint, raw_flags, ) diff --git a/xde/src/mac/mod.rs b/xde/src/mac/mod.rs index 28dcbd2d..4c7c6244 100644 --- a/xde/src/mac/mod.rs +++ b/xde/src/mac/mod.rs @@ -18,9 +18,12 @@ use alloc::sync::Arc; use bitflags::bitflags; use core::ffi::CStr; use core::fmt; +use core::mem::MaybeUninit; use core::ptr; use illumos_sys_hdrs::*; +use opte::ddi::mblk::AsMblk; use opte::ddi::mblk::MsgBlk; +use opte::ddi::mblk::MsgBlkChain; use opte::engine::ether::EtherAddr; pub use sys::*; @@ -70,6 +73,41 @@ impl MacHandle { } mac } + + pub fn get_min_max_sdu(&self) -> (u32, u32) { + let (mut min, mut max) = (0, 0); + + unsafe { + mac_sdu_get(self.0, &raw mut min, &raw mut max); + } + + (min, max) + } + + pub fn get_cso_capabs(&self) -> mac_capab_cso_t { + let mut cso = mac_capab_cso_t::default(); + unsafe { + mac_capab_get( + self.0, + mac_capab_t::MAC_CAPAB_HCKSUM, + (&raw mut cso) as *mut _, + ); + } + cso + } + + pub fn get_lso_capabs(&self) -> mac_capab_lso_t { + let mut lso = MaybeUninit::::zeroed(); + unsafe { + mac_capab_get( + self.0, + mac_capab_t::MAC_CAPAB_LSO, + (&raw mut lso) as *mut _, + ); + + lso.assume_init() + } + } } impl Drop for MacHandle { @@ -207,7 +245,7 @@ impl MacClientHandle { /// but for now we pass only a single packet at a time. pub fn tx( &self, - pkt: MsgBlk, + pkt: impl AsMblk, hint: uintptr_t, flags: MacTxFlags, ) -> Option { @@ -215,14 +253,11 @@ impl MacClientHandle { // otherwise the mblk_t would be dropped at the end of this // function along with `pkt`. 
+ let mut ret_mp = ptr::null_mut(); + let Some(mblk) = pkt.unwrap_mblk() else { + return None; + }; unsafe { - mac_tx( - self.mch, - pkt.unwrap_mblk().as_ptr(), - hint, - flags.bits(), - &mut ret_mp, - ) + mac_tx(self.mch, mblk.as_ptr(), hint, flags.bits(), &mut ret_mp) }; if !ret_mp.is_null() { // Unwrap: We know the ret_mp is valid because we gave @@ -248,7 +283,7 @@ impl MacClientHandle { /// but for now we pass only a single packet at a time. pub fn tx_drop_on_no_desc( &self, - pkt: MsgBlk, + pkt: impl AsMblk, hint: uintptr_t, flags: MacTxFlags, ) { @@ -258,14 +293,13 @@ impl MacClientHandle { let mut raw_flags = flags.bits(); raw_flags |= MAC_DROP_ON_NO_DESC; let mut ret_mp = ptr::null_mut(); + + let Some(mblk) = pkt.unwrap_mblk() else { + return; + }; + unsafe { - mac_tx( - self.mch, - pkt.unwrap_mblk().as_ptr(), - hint, - raw_flags, - &mut ret_mp, - ) + mac_tx(self.mch, mblk.as_ptr(), hint, raw_flags, &mut ret_mp) }; debug_assert_eq!(ret_mp, ptr::null_mut()); } @@ -394,3 +428,89 @@ impl Drop for MacPerimeterHandle { } } } + +bitflags! { +/// Classes of TCP segmentation offload supported by a MAC provider. +pub struct TcpLsoFlags: u32 { + /// The device supports TCP LSO over IPv4. + const BASIC_IPV4 = LSO_TX_BASIC_TCP_IPV4; + /// The device supports TCP LSO over IPv6. + const BASIC_IPV6 = LSO_TX_BASIC_TCP_IPV6; + /// The device supports LSO of TCP packets within IP-based tunnels. + const TUN = LSO_TX_TUNNEL_TCP; +} + +/// Supported LSO use specific to tunnelled TCP offload, i.e., +/// [`TcpLsoFlags::TUN`]. +pub struct TunnelTcpLsoFlags: u32 { + /// The device can fill the outer L4 (e.g., UDP) checksum + /// on generated tunnel packets. + const FILL_OUTER_CSUM = LSO_TX_TUNNEL_OUTER_CSUM; + /// The device supports *inner* TCP LSO over IPv4. + const INNER_IPV4 = LSO_TX_TUNNEL_INNER_IP4; + /// The device supports *inner* TCP LSO over IPv6. + const INNER_IPV6 = LSO_TX_TUNNEL_INNER_IP6; + /// LSO is supported with a Geneve outer transport. 
+ const GENEVE = LSO_TX_TUNNEL_GENEVE; + /// LSO is supported with a VXLAN outer transport. + const VXLAN = LSO_TX_TUNNEL_VXLAN; +} + +/// Classes of checksum offload supported by a MAC provider. +pub struct ChecksumOffloadCapabs: u32 { + /// CSO is enabled on the device. + const ENABLE = 1 << 0; + + /// Device can finalize packet checksum when provided with a partial + /// (pseudoheader) checksum. + const INET_PARTIAL = 1 << 1; + /// Device can compute full (L3+L4) checksum of TCP/UDP over IPv4. + const INET_FULL_V4 = 1 << 2; + /// Device can compute full (L4) checksum of TCP/UDP over IPv6. + const INET_FULL_V6 = 1 << 3; + /// Device can compute IPv4 header checksum. + const INET_HDRCKSUM = 1 << 4; + + const NON_TUN_CAPABS = + Self::ENABLE.bits() | Self::INET_PARTIAL.bits() | + Self::INET_FULL_V4.bits() | Self::INET_FULL_V6.bits() | + Self::INET_HDRCKSUM.bits(); + + /// Device can fill outer (UDP) checksum on tunnelled packets. + const TUNNEL_VALID = 1 << 5; +} + +/// Classes of tunnel supported by a MAC provider. +pub struct TunnelType: u32 { + const GENEVE = 1 << 0; + const VXLAN = 1 << 1; +} +} + +bitflags! { +/// Flagset for requesting emulation on any packets marked +/// with the given offloads. +pub struct MacEmul: u32 { + /// Calculate the L3/L4 checksums. + const HWCKSUM_EMUL = MAC_HWCKSUM_EMUL; + /// Calculate the IPv4 checksum, ignoring L4. + const IPCKSUM_EMUL = MAC_IPCKSUM_EMUL; + /// Segment TCP packets into MSS-sized chunks. + const LSO_EMUL = MAC_LSO_EMUL; +} +} + +/// Emulates various offloads (checksum, LSO) for packets on loopback paths. 
+pub fn mac_hw_emul(msg: impl AsMblk, flags: MacEmul) -> Option { + let mut chain = msg.unwrap_mblk()?.as_ptr(); + unsafe { + sys::mac_hw_emul( + &raw mut chain, + ptr::null_mut(), + ptr::null_mut(), + flags.bits(), + ); + } + + (!chain.is_null()).then(|| unsafe { MsgBlkChain::new(chain).unwrap() }) +} diff --git a/xde/src/mac/sys.rs b/xde/src/mac/sys.rs index e4fb9ee8..9b7c09e1 100644 --- a/xde/src/mac/sys.rs +++ b/xde/src/mac/sys.rs @@ -67,6 +67,8 @@ pub enum link_state_t { #[allow(unused_imports)] use mac_client_promisc_type_t::*; +use crate::ip::t_uscalar_t; + pub type mac_tx_cookie_t = uintptr_t; pub type mac_rx_fn = unsafe extern "C" fn( *mut c_void, @@ -159,6 +161,12 @@ extern "C" { mp_chain: *mut mblk_t, ); pub fn mac_private_minor() -> minor_t; + + pub fn mac_sdu_get( + mh: *mut mac_handle, + min_sdu: *mut c_uint, + max_sdu: *mut c_uint, + ); } // Private MAC functions needed to get us a Tx path. @@ -177,8 +185,80 @@ extern "C" { ) -> c_int; pub fn mac_perim_exit(mph: mac_perim_handle); pub fn mac_perim_held(mh: mac_handle) -> boolean_t; + + pub fn mac_hw_emul( + mp_chain: *mut *mut mblk_t, + otail: *mut *mut mblk_t, + ocount: *mut c_uint, + mac_emul: u32, + ); + + // VERY private to MAC. 
+ pub fn mac_capab_get( + mh: *mut mac_handle, + capab: mac_capab_t, + data: *mut c_void, + ) -> boolean_t; } +#[repr(C)] +#[derive(Clone, Copy, Default, Debug)] +pub struct cso_tunnel_t { + pub ct_flags: u32, + pub ct_encap_max: u32, + pub ct_types: u32, +} + +#[derive(Clone, Copy, Default, Debug)] +pub struct mac_capab_cso_t { + pub cso_flags: u32, + pub cso_tunnel: cso_tunnel_t, +} + +#[repr(C)] +#[derive(Clone, Copy, Default, Debug)] +pub struct lso_basic_tcp_ipv4_t { + pub lso_max: t_uscalar_t, +} + +#[repr(C)] +#[derive(Clone, Copy, Default, Debug)] +pub struct lso_basic_tcp_ipv6_t { + pub lso_max: t_uscalar_t, +} + +#[repr(C)] +#[derive(Clone, Copy, Default, Debug)] +pub struct lso_tunnel_tcp_t { + pub tun_pay_max: u32, + pub tun_encap_max: u32, + pub tun_flags: u32, + pub tun_types: u32, + pub tun_pad: [u32; 2], +} + +#[repr(C)] +#[derive(Clone, Copy, Default, Debug)] +pub struct mac_capab_lso_t { + pub lso_flags: t_uscalar_t, + pub lso_basic_tcp_ipv4: lso_basic_tcp_ipv4_t, + pub lso_basic_tcp_ipv6: lso_basic_tcp_ipv6_t, + + pub lso_tunnel_tcp: lso_tunnel_tcp_t, +} + +// Currently supported flags for LSO. +pub const LSO_TX_BASIC_TCP_IPV4: u32 = 0x01; +pub const LSO_TX_BASIC_TCP_IPV6: u32 = 0x02; +pub const LSO_TX_TUNNEL_TCP: u32 = 0x04; + +// Currently supported tunnel classes for tunnelled LSO offload. 
+pub const LSO_TX_TUNNEL_OUTER_CSUM: u32 = 0x01; +pub const LSO_TX_TUNNEL_INNER_IP4: u32 = 0x02; +pub const LSO_TX_TUNNEL_INNER_IP6: u32 = 0x04; +pub const LSO_TX_TUNNEL_GENEVE: u32 = 0x08; +pub const LSO_TX_TUNNEL_VXLAN: u32 = 0x10; + #[repr(C)] #[derive(Debug)] pub enum mac_diag { @@ -461,3 +541,10 @@ pub struct mac_register_t { pub m_v12n: u32, pub m_multicast_sdu: c_uint, } + +// ====================================================================== +// uts/common/sys/mac_client.h +// ====================================================================== +pub const MAC_HWCKSUM_EMUL: u32 = 1 << 0; +pub const MAC_IPCKSUM_EMUL: u32 = 1 << 1; +pub const MAC_LSO_EMUL: u32 = 1 << 2; diff --git a/xde/src/xde.rs b/xde/src/xde.rs index 79278806..5c76b013 100644 --- a/xde/src/xde.rs +++ b/xde/src/xde.rs @@ -18,17 +18,28 @@ use crate::dls::DlsStream; use crate::dls::LinkId; use crate::ioctl::IoctlEnvelope; use crate::mac; +use crate::mac::cso_tunnel_t; +use crate::mac::lso_basic_tcp_ipv4_t; +use crate::mac::lso_basic_tcp_ipv6_t; +use crate::mac::lso_tunnel_tcp_t; +use crate::mac::mac_capab_cso_t; +use crate::mac::mac_capab_lso_t; use crate::mac::mac_getinfo; +use crate::mac::mac_hw_emul; use crate::mac::mac_private_minor; +use crate::mac::ChecksumOffloadCapabs; +use crate::mac::MacEmul; use crate::mac::MacHandle; use crate::mac::MacPromiscHandle; use crate::mac::MacTxFlags; +use crate::mac::TcpLsoFlags; +use crate::mac::TunnelTcpLsoFlags; +use crate::mac::TunnelType; use crate::route::Route; use crate::route::RouteCache; use crate::route::RouteKey; use crate::secpolicy; use crate::stats::XdeStats; -use crate::sys; use crate::warn; use alloc::borrow::ToOwned; use alloc::boxed::Box; @@ -43,8 +54,16 @@ use core::ptr; use core::ptr::addr_of; use core::ptr::addr_of_mut; use core::time::Duration; +use illumos_sys_hdrs::mac::mac_ether_offload_info_t; +use illumos_sys_hdrs::mac::mac_ether_tun_info_t; +use illumos_sys_hdrs::mac::MacEtherOffloadFlags; +use 
illumos_sys_hdrs::mac::MacTunType; +use illumos_sys_hdrs::mac::MblkOffloadFlags; use illumos_sys_hdrs::*; +use ingot::ethernet::Ethertype; use ingot::geneve::GeneveRef; +use ingot::ip::IpProtocol; +use ingot::types::HeaderLen; use opte::api::ClearXdeUnderlayReq; use opte::api::CmdOk; use opte::api::Direction; @@ -57,6 +76,7 @@ use opte::api::XDE_IOC_OPTE_CMD; use opte::d_error::LabelBlock; use opte::ddi::kstat::KStatNamed; use opte::ddi::kstat::KStatProvider; +use opte::ddi::mblk::AsMblk; use opte::ddi::mblk::MsgBlk; use opte::ddi::mblk::MsgBlkChain; use opte::ddi::sync::KMutex; @@ -66,14 +86,17 @@ use opte::ddi::sync::KRwLockReadGuard; use opte::ddi::sync::KRwLockType; use opte::ddi::time::Interval; use opte::ddi::time::Periodic; +use opte::engine::ether::Ethernet; use opte::engine::ether::EthernetRef; use opte::engine::geneve::Vni; use opte::engine::headers::IpAddr; use opte::engine::ioctl::{self as api}; +use opte::engine::ip::v6::Ipv6; use opte::engine::ip::v6::Ipv6Addr; use opte::engine::packet::InnerFlowId; use opte::engine::packet::Packet; use opte::engine::packet::ParseError; +use opte::engine::parse::ValidUlp; use opte::engine::port::Port; use opte::engine::port::PortBuilder; use opte::engine::port::ProcessResult; @@ -214,6 +237,9 @@ pub struct xde_underlay_port { /// The MAC address associated with this underlay port. pub mac: [u8; 6], + /// The MTU of this link. + pub mtu: u32, + /// MAC promiscuous handle for receiving packets on the underlay link. mph: MacPromiscHandle, @@ -222,6 +248,148 @@ pub struct xde_underlay_port { stream: Arc, } +#[derive(Copy, Clone, Debug)] +struct OffloadInfo { + cso_state: mac_capab_cso_t, + lso_state: mac_capab_lso_t, + mtu: u32, +} + +impl OffloadInfo { + /// Forwards the underlay's tunnel checksum offload capabilities into + /// standard capabilities. 
+ fn upstream_csum(&self) -> mac_capab_cso_t { + let base_capabs = + ChecksumOffloadCapabs::from_bits_truncate(self.cso_state.cso_flags); + let mut out = mac_capab_cso_t::default(); + + out.cso_flags = (if base_capabs + .contains(ChecksumOffloadCapabs::TUNNEL_VALID) + && TunnelType::from_bits_truncate( + self.cso_state.cso_tunnel.ct_types, + ) + .contains(TunnelType::GENEVE) + { + base_capabs & ChecksumOffloadCapabs::NON_TUN_CAPABS + } else { + ChecksumOffloadCapabs::empty() + }) + .bits(); + + out + } + + /// Forwards the underlay's tunnel TCP LSO capabilities into + /// standard LSO capabilities. + fn upstream_lso(&self) -> mac_capab_lso_t { + let base_capabs = + TcpLsoFlags::from_bits_truncate(self.lso_state.lso_flags); + let mut out = mac_capab_lso_t::default(); + + if base_capabs.contains(TcpLsoFlags::TUN) { + let tun_flags = TunnelTcpLsoFlags::from_bits_truncate( + self.lso_state.lso_tunnel_tcp.tun_flags, + ); + let tun_types = TunnelType::from_bits_truncate( + self.lso_state.lso_tunnel_tcp.tun_types, + ); + + if tun_types.contains(TunnelType::GENEVE) { + out.lso_flags |= TcpLsoFlags::BASIC_IPV4.bits() + | TcpLsoFlags::BASIC_IPV6.bits(); + out.lso_basic_tcp_ipv4 = lso_basic_tcp_ipv4_t { + lso_max: self.lso_state.lso_tunnel_tcp.tun_pay_max, + }; + out.lso_basic_tcp_ipv6 = lso_basic_tcp_ipv6_t { + lso_max: self.lso_state.lso_tunnel_tcp.tun_pay_max, + }; + } + } + + out + } + + fn should_request_lso(&self) -> bool { + let base_capabs = + TcpLsoFlags::from_bits_truncate(self.lso_state.lso_flags); + + base_capabs.contains(TcpLsoFlags::TUN) + && TunnelType::from_bits_truncate( + self.lso_state.lso_tunnel_tcp.tun_types, + ) + .contains(TunnelType::GENEVE) + } + + fn should_request_cso(&self) -> bool { + let base_capabs = + ChecksumOffloadCapabs::from_bits_truncate(self.cso_state.cso_flags); + + base_capabs.contains(ChecksumOffloadCapabs::TUNNEL_VALID) + && TunnelType::from_bits_truncate( + self.cso_state.cso_tunnel.ct_types, + ) + .contains(TunnelType::GENEVE) + } 
+} + +impl core::ops::BitAnd for OffloadInfo { + type Output = Self; + + fn bitand(self, rhs: Self) -> Self::Output { + Self { + cso_state: mac_capab_cso_t { + cso_flags: self.cso_state.cso_flags & rhs.cso_state.cso_flags, + cso_tunnel: cso_tunnel_t { + ct_flags: self.cso_state.cso_tunnel.ct_flags + & rhs.cso_state.cso_tunnel.ct_flags, + ct_encap_max: self + .cso_state + .cso_tunnel + .ct_encap_max + .min(rhs.cso_state.cso_tunnel.ct_encap_max), + ct_types: self.cso_state.cso_tunnel.ct_types + & rhs.cso_state.cso_tunnel.ct_types, + }, + }, + lso_state: mac_capab_lso_t { + lso_flags: self.lso_state.lso_flags & rhs.lso_state.lso_flags, + lso_basic_tcp_ipv4: lso_basic_tcp_ipv4_t { + lso_max: self + .lso_state + .lso_basic_tcp_ipv4 + .lso_max + .min(rhs.lso_state.lso_basic_tcp_ipv4.lso_max), + }, + lso_basic_tcp_ipv6: lso_basic_tcp_ipv6_t { + lso_max: self + .lso_state + .lso_basic_tcp_ipv6 + .lso_max + .min(rhs.lso_state.lso_basic_tcp_ipv6.lso_max), + }, + lso_tunnel_tcp: lso_tunnel_tcp_t { + tun_pay_max: self + .lso_state + .lso_tunnel_tcp + .tun_pay_max + .min(rhs.lso_state.lso_tunnel_tcp.tun_pay_max), + tun_encap_max: self + .lso_state + .lso_tunnel_tcp + .tun_encap_max + .min(rhs.lso_state.lso_tunnel_tcp.tun_encap_max), + tun_flags: self.lso_state.lso_tunnel_tcp.tun_flags + & rhs.lso_state.lso_tunnel_tcp.tun_flags, + tun_types: self.lso_state.lso_tunnel_tcp.tun_types + & rhs.lso_state.lso_tunnel_tcp.tun_types, + tun_pad: [0; 2], + }, + }, + mtu: self.mtu.min(rhs.mtu), + } + } +} + struct XdeState { ectx: Arc, vpc_map: Arc, @@ -237,6 +405,7 @@ struct UnderlayState { // onto the underlay network u1: Arc, u2: Arc, + shared_props: OffloadInfo, } fn get_xde_state() -> &'static XdeState { @@ -305,6 +474,7 @@ pub struct XdeDev { // driver. 
pub u1: Arc, pub u2: Arc, + underlay_capab: OffloadInfo, // We make this a per-port cache rather than sharing between all // ports to theoretically reduce contention around route expiry @@ -724,6 +894,7 @@ fn create_xde(req: &CreateXdeReq) -> Result { passthrough: req.passthrough, u1: underlay.u1.clone(), u2: underlay.u2.clone(), + underlay_capab: underlay.shared_props, routes: RouteCache::default(), }); drop(underlay_); @@ -747,7 +918,7 @@ fn create_xde(req: &CreateXdeReq) -> Result { mreg.m_min_sdu = 1; mreg.m_max_sdu = 1500; // TODO hardcode mreg.m_multicast_sdu = 0; - mreg.m_margin = sys::VLAN_TAGSZ; + mreg.m_margin = crate::sys::VLAN_TAGSZ; mreg.m_v12n = mac::MAC_VIRT_NONE as u32; unsafe { @@ -1037,7 +1208,7 @@ fn create_underlay_port( link_name: String, // This parameter is likely to be used as part of the flows work. _mc_name: &str, -) -> Result { +) -> Result<(xde_underlay_port, OffloadInfo), OpteError> { let link_cstr = CString::new(link_name.as_str()).unwrap(); let link_id = @@ -1075,12 +1246,20 @@ fn create_underlay_port( }, )?; - Ok(xde_underlay_port { - name: link_name, - mac: mh.get_mac_addr(), - mph, - stream, - }) + let (.., mtu) = mh.get_min_max_sdu(); + let cso_state = mh.get_cso_capabs(); + let lso_state = mh.get_lso_capabs(); + + Ok(( + xde_underlay_port { + name: link_name, + mac: mh.get_mac_addr(), + mtu, + mph, + stream, + }, + OffloadInfo { lso_state, cso_state, mtu }, + )) } #[no_mangle] @@ -1088,9 +1267,10 @@ unsafe fn init_underlay_ingress_handlers( u1_name: String, u2_name: String, ) -> Result { - let u1 = Arc::new(create_underlay_port(u1_name, "xdeu0")?); - let u2 = Arc::new(create_underlay_port(u2_name, "xdeu1")?); - Ok(UnderlayState { u1, u2 }) + let (u1, i1) = create_underlay_port(u1_name, "xdeu0")?; + let (u2, i2) = create_underlay_port(u2_name, "xdeu1")?; + opte::engine::err!("I have {:#?} and {:#?}", &i1, &i2); + Ok(UnderlayState { u1: u1.into(), u2: u2.into(), shared_props: i1 & i2 }) } #[no_mangle] @@ -1431,14 +1611,33 @@ fn 
guest_loopback( // destination Port. match dest_dev.port.process(In, parsed_pkt) { Ok(ProcessResult::Modified(emit_spec)) => { - let pkt = emit_spec.apply(pkt); - unsafe { - mac::mac_rx( - dest_dev.mh, - ptr::null_mut(), - pkt.unwrap_mblk().as_ptr(), - ) + let mut pkt = emit_spec.apply(pkt); + + // BREAKS EVERYTHING. + // pkt.strip_lso(); + + // Having advertised offloads to our guest, looped back + // packets are liable to have zero-checksums. Fill these + // if necessary. + let pkt = if pkt + .cksum_flags() + .intersects(MblkOffloadFlags::HCK_TX_FLAGS) + { + mac_hw_emul(pkt, MacEmul::HWCKSUM_EMUL) + .and_then(|v| v.unwrap_mblk()) + } else { + Some(pkt.unwrap_mblk()) }; + + if let Some(pkt) = pkt { + unsafe { + mac::mac_rx( + dest_dev.mh, + ptr::null_mut(), + pkt.as_ptr(), + ) + }; + } } Ok(ProcessResult::Drop { reason }) => { @@ -1552,6 +1751,39 @@ unsafe fn xde_mc_tx_one(src_dev: &XdeDev, mut pkt: MsgBlk) -> *mut mblk_t { return ptr::null_mut(); } }; + let meoi_len = parsed_pkt.len() as u32; + + let meta = parsed_pkt.meta(); + let is_tcp = meta + .inner_ulp + .as_ref() + .map(|v| matches!(v, ValidUlp::Tcp(_))) + .unwrap_or_default(); + let non_eth_payl_bytes = (&meta.inner_l3, &meta.inner_ulp).packet_length(); + + let (l4_flag, l4_ty) = match &meta.inner_ulp { + Some(ValidUlp::Tcp(_)) => { + (MacEtherOffloadFlags::L4INFO_SET, IpProtocol::TCP.0) + } + Some(ValidUlp::Udp(_)) => { + (MacEtherOffloadFlags::L4INFO_SET, IpProtocol::UDP.0) + } + _ => (MacEtherOffloadFlags::empty(), 0), + }; + + let ulp_meoi = mac_ether_offload_info_t { + meoi_flags: MacEtherOffloadFlags::L2INFO_SET + | MacEtherOffloadFlags::L3INFO_SET + | l4_flag, + meoi_len, + meoi_l2hlen: meta.inner_eth.packet_length() as u8, + meoi_l3proto: meta.inner_eth.ethertype().0, + meoi_l3hlen: meta.inner_l3.packet_length() as u16, + meoi_l4proto: l4_ty, + meoi_l4hlen: meta.inner_ulp.packet_length() as u8, + + ..Default::default() + }; // Choose u1 as a starting point. 
This may be changed in the next_hop // function when we are actually able to determine what interface should be @@ -1604,8 +1836,9 @@ unsafe fn xde_mc_tx_one(src_dev: &XdeDev, mut pkt: MsgBlk) -> *mut mblk_t { } }; + let mtu_unrestricted = emit_spec.mtu_unrestricted(); let l4_hash = emit_spec.l4_hash(); - let out_pkt = emit_spec.apply(pkt); + let mut out_pkt = emit_spec.apply(pkt); if ip6_src == ip6_dst { let devs = unsafe { xde_devs.read() }; @@ -1613,6 +1846,42 @@ unsafe fn xde_mc_tx_one(src_dev: &XdeDev, mut pkt: MsgBlk) -> *mut mblk_t { return ptr::null_mut(); } + // TODO: should these not just be copied from the original input + // mblk? + let cso_possible = src_dev.underlay_capab.should_request_cso(); + let lso_possible = src_dev.underlay_capab.should_request_lso(); + + // CSO is a prerequisite for LSO. + if cso_possible { + // Boost MSS to use full jumbo frames if we know our path + // can be served purely on internal links. + // Recall that SDU does not include L2 size, hence 'non_eth_payl' + + let mss = if mtu_unrestricted { + src_dev.underlay_capab.mtu + - 70 + - (non_eth_payl_bytes as u32) + } else { + 1500 - (non_eth_payl_bytes as u32) + }; + + out_pkt.request_offload(true, is_tcp && lso_possible, mss); + + let tun_meoi = mac_ether_tun_info_t { + mett_flags: MacEtherOffloadFlags::L2INFO_SET + | MacEtherOffloadFlags::L3INFO_SET + | MacEtherOffloadFlags::TUNINFO_SET, + mett_l2hlen: Ethernet::MINIMUM_LENGTH as u8, + mett_l3proto: Ethertype::IPV6.0, + mett_l3hlen: Ipv6::MINIMUM_LENGTH as u16, + mett_tuntype: MacTunType::GENEVE, + + ..Default::default() + }; + + out_pkt.fill_offload_info(&tun_meoi, &ulp_meoi); + } + // Currently the overlay layer leaves the outer frame // destination and source zero'd. Ask IRE for the route // associated with the underlay destination. 
Then ask NCE @@ -1712,11 +1981,81 @@ where #[no_mangle] unsafe extern "C" fn xde_mc_getcapab( - _arg: *mut c_void, - _cap: mac::mac_capab_t, - _capb_data: *mut c_void, + arg: *mut c_void, + cap: mac::mac_capab_t, + capb_data: *mut c_void, ) -> boolean_t { - boolean_t::B_FALSE + let dev = arg as *mut XdeDev; + + let shared_underlay_caps = unsafe { (*dev).underlay_capab }; + + match cap { + // TODO: work out a safer interface for this. + mac::mac_capab_t::MAC_CAPAB_HCKSUM => { + // capab data is a *mut u32 (enum). + let capab = capb_data as *mut mac_capab_cso_t; + + opte::engine::err!("I see base as {:?}", &shared_underlay_caps); + + let desired_capabs = shared_underlay_caps.upstream_csum(); + unsafe { + // Don't write the newer capabs -- don't want to corrupt + // memory on older illumos and/or CI. + (*capab).cso_flags = desired_capabs.cso_flags; + } + + opte::engine::err!("Adverising CSO {:?}", &desired_capabs); + + // FORCE + // unsafe { + // (*capab).cso_flags = ChecksumOffloadCapabs::NON_TUN_CAPABS + // .difference(ChecksumOffloadCapabs::INET_PARTIAL).bits(); + // } + + // if desired_capabs.cso_flags == 0 { + // boolean_t::B_FALSE + // } else { + // boolean_t::B_TRUE + // } + + boolean_t::B_TRUE + } + mac::mac_capab_t::MAC_CAPAB_LSO => { + let capab = capb_data as *mut mac_capab_lso_t; + let desired_lso = shared_underlay_caps.upstream_lso(); + + opte::engine::err!("I see base as {:?}", &shared_underlay_caps); + opte::engine::err!("Adverising LSO {:?}", &desired_lso); + + // FORCE + // let desired_lso = mac_capab_lso_t { + // lso_flags: TcpLsoFlags::BASIC_IPV4.bits() + // | TcpLsoFlags::BASIC_IPV6.bits(), + // lso_basic_tcp_ipv4: lso_basic_tcp_ipv4_t { + // lso_max: u16::MAX as u32, + // }, + // lso_basic_tcp_ipv6: lso_basic_tcp_ipv6_t { + // lso_max: u16::MAX as u32, + // }, + // ..Default::default() + // }; + + unsafe { + // Don't write the newer capabs -- don't want to corrupt + // memory on older illumos and/or CI. 
+ (*capab).lso_flags = desired_lso.lso_flags; + (*capab).lso_basic_tcp_ipv4 = desired_lso.lso_basic_tcp_ipv4; + (*capab).lso_basic_tcp_ipv6 = desired_lso.lso_basic_tcp_ipv6; + } + + if desired_lso.lso_flags == 0 { + boolean_t::B_FALSE + } else { + boolean_t::B_TRUE + } + } + _ => boolean_t::B_FALSE, + } } #[no_mangle] @@ -1877,6 +2216,9 @@ unsafe fn xde_rx_one( return; }; + let is_tcp = matches!(meta.inner_ulp, ValidUlp::Tcp(_)); + let mss_estimate = 1500 - (&meta.inner_l3, &meta.inner_ulp).packet_length(); + // We are in passthrough mode, skip OPTE processing. if dev.passthrough { drop(parsed_pkt); @@ -1893,7 +2235,16 @@ unsafe fn xde_rx_one( mac::mac_rx(dev.mh, mrh, pkt.unwrap_mblk().as_ptr()); } Ok(ProcessResult::Modified(emit_spec)) => { - let npkt = emit_spec.apply(pkt); + let mut npkt = emit_spec.apply(pkt); + + // Due to possible pseudo-GRO, we need to inform mac/viona on how + // it can split up this packet, if the guest cannot receive it + // (e.g., no GRO/large frame support). + // HW_LSO will cause viona to treat this packet as though it were + // a locally delivered segment making use of LSO. + if is_tcp && npkt.len() > 1500 + Ethernet::MINIMUM_LENGTH { + npkt.request_offload(false, true, mss_estimate as u32); + } mac::mac_rx(dev.mh, mrh, npkt.unwrap_mblk().as_ptr()); }