diff --git a/.github/buildomat/jobs/test.sh b/.github/buildomat/jobs/test.sh index 236234a0..59a62dae 100755 --- a/.github/buildomat/jobs/test.sh +++ b/.github/buildomat/jobs/test.sh @@ -82,3 +82,15 @@ pfexec add_drv xde banner "test" pfexec chmod +x /input/xde/work/test/loopback pfexec /input/xde/work/test/loopback --nocapture + +# Multicast tests must run with --test-threads=1 because they share +# hardcoded device names (xde_test_sim0/1, xde_test_vnic0/1) that conflict +# when tests run in parallel +pfexec chmod +x /input/xde/work/test/multicast_rx +pfexec /input/xde/work/test/multicast_rx --nocapture --test-threads=1 + +pfexec chmod +x /input/xde/work/test/multicast_multi_sub +pfexec /input/xde/work/test/multicast_multi_sub --nocapture --test-threads=1 + +pfexec chmod +x /input/xde/work/test/multicast_validation +pfexec /input/xde/work/test/multicast_validation --nocapture --test-threads=1 diff --git a/.github/buildomat/jobs/xde.sh b/.github/buildomat/jobs/xde.sh index 3abe2881..82baf11c 100755 --- a/.github/buildomat/jobs/xde.sh +++ b/.github/buildomat/jobs/xde.sh @@ -14,6 +14,9 @@ #: "=/work/release/xde_link.so", #: "=/work/release/xde_link.so.sha256", #: "=/work/test/loopback", +#: "=/work/test/multicast_rx", +#: "=/work/test/multicast_multi_sub", +#: "=/work/test/multicast_validation", #: "=/work/xde.conf", #: ] #: @@ -116,5 +119,23 @@ loopback_test=$( cargo build -q --test loopback --message-format=json |\ jq -r "select(.profile.test == true) | .filenames[]" ) +cargo build --test multicast_rx +multicast_rx_test=$( + cargo build -q --test multicast_rx --message-format=json |\ + jq -r "select(.profile.test == true) | .filenames[]" +) +cargo build --test multicast_multi_sub +multicast_multi_sub_test=$( + cargo build -q --test multicast_multi_sub --message-format=json |\ + jq -r "select(.profile.test == true) | .filenames[]" +) +cargo build --test multicast_validation +multicast_validation_test=$( + cargo build -q --test multicast_validation 
--message-format=json |\ + jq -r "select(.profile.test == true) | .filenames[]" +) mkdir -p /work/test cp $loopback_test /work/test/loopback +cp $multicast_rx_test /work/test/multicast_rx +cp $multicast_multi_sub_test /work/test/multicast_multi_sub +cp $multicast_validation_test /work/test/multicast_validation diff --git a/.gitignore b/.gitignore index f82d74c0..5956d6b9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *.html target download -.DS_STORE +scripts +.DS_STORE \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index c3fb0628..743f68c8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2972,6 +2972,7 @@ dependencies = [ "anyhow", "libnet", "opte-ioctl", + "opte-test-utils", "oxide-vpc", "rand", "slog", diff --git a/bin/opteadm/src/bin/opteadm.rs b/bin/opteadm/src/bin/opteadm.rs index 219bf555..706b14a4 100644 --- a/bin/opteadm/src/bin/opteadm.rs +++ b/bin/opteadm/src/bin/opteadm.rs @@ -27,8 +27,10 @@ use oxide_vpc::api::AddFwRuleReq; use oxide_vpc::api::AddRouterEntryReq; use oxide_vpc::api::Address; use oxide_vpc::api::BOUNDARY_SERVICES_VNI; +use oxide_vpc::api::ClearMcastForwardingReq; use oxide_vpc::api::ClearVirt2BoundaryReq; use oxide_vpc::api::ClearVirt2PhysReq; +use oxide_vpc::api::DEFAULT_MULTICAST_VNI; use oxide_vpc::api::DelRouterEntryReq; use oxide_vpc::api::DelRouterEntryResp; use oxide_vpc::api::DhcpCfg; @@ -39,22 +41,26 @@ use oxide_vpc::api::FirewallRule; use oxide_vpc::api::IpCfg; use oxide_vpc::api::Ipv4Cfg; use oxide_vpc::api::Ipv6Cfg; +use oxide_vpc::api::NextHopV6; use oxide_vpc::api::PhysNet; use oxide_vpc::api::PortInfo; use oxide_vpc::api::Ports; use oxide_vpc::api::ProtoFilter; use oxide_vpc::api::RemFwRuleReq; use oxide_vpc::api::RemoveCidrResp; +use oxide_vpc::api::Replication; use oxide_vpc::api::RouterClass; use oxide_vpc::api::RouterTarget; use oxide_vpc::api::SNat4Cfg; use oxide_vpc::api::SNat6Cfg; use oxide_vpc::api::SetExternalIpsReq; use oxide_vpc::api::SetFwRulesReq; +use 
oxide_vpc::api::SetMcastForwardingReq; use oxide_vpc::api::SetVirt2BoundaryReq; use oxide_vpc::api::SetVirt2PhysReq; use oxide_vpc::api::TunnelEndpoint; use oxide_vpc::api::VpcCfg; +use oxide_vpc::print::print_mcast_fwd; use oxide_vpc::print::print_v2b; use oxide_vpc::print::print_v2p; use std::io; @@ -225,6 +231,31 @@ enum Command { /// Clear a virtual-to-boundary mapping ClearV2B { prefix: IpCidr, tunnel_endpoint: Vec }, + /// Set a multicast forwarding entry + SetMcastFwd { + /// The multicast group address (IPv4 or IPv6) + group: IpAddr, + /// Next hop IPv6 address + next_hop_addr: Ipv6Addr, + /// Next hop VNI (defaults to fleet-level DEFAULT_MULTICAST_VNI) + #[arg(default_value_t = Vni::new(DEFAULT_MULTICAST_VNI).unwrap())] + next_hop_vni: Vni, + /// Delivery mode (replication): + /// - external: local guests in same VNI + /// - underlay: infrastructure via underlay multicast + /// - all: both local and underlay + replication: Replication, + }, + + /// Clear a multicast forwarding entry + ClearMcastFwd { + /// The multicast group address (IPv4 or IPv6) + group: IpAddr, + }, + + /// Dump the multicast forwarding table + DumpMcastFwd, + /// Add a new router entry, either IPv4 or IPv6. 
AddRouterEntry { #[command(flatten)] @@ -764,6 +795,29 @@ fn main() -> anyhow::Result<()> { hdl.clear_v2b(&req)?; } + Command::SetMcastFwd { + group, + next_hop_addr, + next_hop_vni, + replication, + } => { + let next_hop = NextHopV6::new(next_hop_addr, next_hop_vni); + let req = SetMcastForwardingReq { + group, + next_hops: vec![(next_hop, replication)], + }; + hdl.set_mcast_fwd(&req)?; + } + + Command::ClearMcastFwd { group } => { + let req = ClearMcastForwardingReq { group }; + hdl.clear_mcast_fwd(&req)?; + } + + Command::DumpMcastFwd => { + print_mcast_fwd(&hdl.dump_mcast_fwd()?)?; + } + Command::AddRouterEntry { route: RouterRule { port, dest, target, class }, } => { diff --git a/crates/illumos-sys-hdrs/src/kernel.rs b/crates/illumos-sys-hdrs/src/kernel.rs index 9ac0c26b..c0d854d4 100644 --- a/crates/illumos-sys-hdrs/src/kernel.rs +++ b/crates/illumos-sys-hdrs/src/kernel.rs @@ -500,6 +500,8 @@ unsafe extern "C" { pub fn freemsg(mp: *mut mblk_t); pub fn freemsgchain(mp: *mut mblk_t); + pub fn msgpullup(mp: *mut mblk_t, n_bytes: isize) -> *mut mblk_t; + pub fn gethrtime() -> hrtime_t; pub fn getmajor(dev: dev_t) -> major_t; diff --git a/crates/opte-api/src/cmd.rs b/crates/opte-api/src/cmd.rs index 5c0f9986..de507062 100644 --- a/crates/opte-api/src/cmd.rs +++ b/crates/opte-api/src/cmd.rs @@ -25,31 +25,38 @@ pub const XDE_IOC_OPTE_CMD: i32 = XDE_IOC as i32 | 0x01; #[derive(Clone, Copy, Debug)] #[repr(C)] pub enum OpteCmd { - ListPorts = 1, // list all ports - AddFwRule = 20, // add firewall rule - RemFwRule = 21, // remove firewall rule - SetFwRules = 22, // set/replace all firewall rules at once - DumpTcpFlows = 30, // dump TCP flows - DumpLayer = 31, // dump the specified Layer - DumpUft = 32, // dump the Unified Flow Table - ListLayers = 33, // list the layers on a given port - ClearUft = 40, // clear the UFT - ClearLft = 41, // clear the given Layer's Flow Table - SetVirt2Phys = 50, // set a v2p mapping - DumpVirt2Phys = 51, // dump the v2p mappings - 
SetVirt2Boundary = 52, // set a v2b mapping - ClearVirt2Boundary = 53, // clear a v2b mapping - DumpVirt2Boundary = 54, // dump the v2b mappings - ClearVirt2Phys = 55, // clear a v2p mapping - AddRouterEntry = 60, // add a router entry for IP dest - DelRouterEntry = 61, // remove a router entry for IP dest - CreateXde = 70, // create a new xde device - DeleteXde = 71, // delete an xde device - SetXdeUnderlay = 72, // set xde underlay devices - ClearXdeUnderlay = 73, // clear xde underlay devices - SetExternalIps = 80, // set xde external IPs for a port - AllowCidr = 90, // allow ip block through gateway tx/rx - RemoveCidr = 91, // deny ip block through gateway tx/rx + ListPorts = 1, // list all ports + AddFwRule = 20, // add firewall rule + RemFwRule = 21, // remove firewall rule + SetFwRules = 22, // set/replace all firewall rules at once + DumpTcpFlows = 30, // dump TCP flows + DumpLayer = 31, // dump the specified Layer + DumpUft = 32, // dump the Unified Flow Table + ListLayers = 33, // list the layers on a given port + ClearUft = 40, // clear the UFT + ClearLft = 41, // clear the given Layer's Flow Table + SetVirt2Phys = 50, // set a v2p mapping + DumpVirt2Phys = 51, // dump the v2p mappings + SetVirt2Boundary = 52, // set a v2b mapping + ClearVirt2Boundary = 53, // clear a v2b mapping + DumpVirt2Boundary = 54, // dump the v2b mappings + ClearVirt2Phys = 55, // clear a v2p mapping + AddRouterEntry = 60, // add a router entry for IP dest + DelRouterEntry = 61, // remove a router entry for IP dest + CreateXde = 70, // create a new xde device + DeleteXde = 71, // delete an xde device + SetXdeUnderlay = 72, // set xde underlay devices + ClearXdeUnderlay = 73, // clear xde underlay devices + SetExternalIps = 80, // set xde external IPs for a port + AllowCidr = 90, // allow ip block through gateway tx/rx + RemoveCidr = 91, // deny ip block through gateway tx/rx + SetMcastForwarding = 100, // set multicast forwarding entries + ClearMcastForwarding = 101, // clear 
multicast forwarding entries + DumpMcastForwarding = 102, // dump multicast forwarding table + McastSubscribe = 103, // subscribe a port to a multicast group + McastUnsubscribe = 104, // unsubscribe a port from a multicast group + SetMcast2Phys = 105, // set M2P mapping (group -> underlay mcast) + ClearMcast2Phys = 106, // clear M2P mapping } impl TryFrom for OpteCmd { @@ -82,6 +89,13 @@ impl TryFrom for OpteCmd { 80 => Ok(Self::SetExternalIps), 90 => Ok(Self::AllowCidr), 91 => Ok(Self::RemoveCidr), + 100 => Ok(Self::SetMcastForwarding), + 101 => Ok(Self::ClearMcastForwarding), + 102 => Ok(Self::DumpMcastForwarding), + 103 => Ok(Self::McastSubscribe), + 104 => Ok(Self::McastUnsubscribe), + 105 => Ok(Self::SetMcast2Phys), + 106 => Ok(Self::ClearMcast2Phys), _ => Err(()), } } @@ -177,6 +191,7 @@ pub enum OpteError { dest: IpCidr, target: String, }, + InvalidUnderlayMulticast(String), LayerNotFound(String), MacExists { port: String, @@ -230,6 +245,7 @@ impl OpteError { Self::DeserCmdReq(_) => ENOMSG, Self::FlowExists(_) => EEXIST, Self::InvalidRouterEntry { .. } => EINVAL, + Self::InvalidUnderlayMulticast(_) => EINVAL, Self::LayerNotFound(_) => ENOENT, Self::MacExists { .. } => EEXIST, Self::MaxCapacity(_) => ENFILE, diff --git a/crates/opte-api/src/ip.rs b/crates/opte-api/src/ip.rs index 20fffaaa..3da20d9c 100644 --- a/crates/opte-api/src/ip.rs +++ b/crates/opte-api/src/ip.rs @@ -307,6 +307,15 @@ pub enum IpAddr { Ip6(Ipv6Addr), } +impl IpAddr { + pub const fn is_multicast(&self) -> bool { + match self { + IpAddr::Ip4(v4) => v4.is_multicast(), + IpAddr::Ip6(v6) => v6.is_multicast(), + } + } +} + impl From for IpAddr { fn from(ipv4: Ipv4Addr) -> Self { IpAddr::Ip4(ipv4) @@ -431,6 +440,10 @@ impl Ipv4Addr { // u32. 
u32::from_be_bytes(self.bytes()).to_be() } + + pub const fn is_multicast(&self) -> bool { + matches!(self.inner[0], 224..240) + } } impl From for Ipv4Addr { @@ -640,6 +653,24 @@ impl Ipv6Addr { self.inner[0] == 0xFF } + /// Return `true` if this is a multicast IPv6 address with administrative scope + /// (admin-local, site-local, or organization-local) as defined in RFC 4291 and RFC 7346. + /// + /// The three administrative scopes are: + /// - `0x4`: admin-local scope + /// - `0x5`: site-local scope + /// - `0x8`: organization-local scope + pub const fn is_admin_scoped_multicast(&self) -> bool { + if !self.is_multicast() { + return false; + } + + // Extract the scope field from the lower 4 bits of the second byte + // (first byte is 0xFF for all multicast, second byte contains flags and scope) + let scope = self.inner[1] & 0x0F; + matches!(scope, 0x4 | 0x5 | 0x8) + } + /// Return the bytes of the address. pub fn bytes(&self) -> [u8; 16] { self.inner @@ -989,6 +1020,12 @@ impl Display for Ipv4Cidr { } impl Ipv4Cidr { + /// IPv4 multicast address range, `224.0.0.0/4`. + pub const MCAST: Self = Self { + ip: Ipv4Addr::from_const([224, 0, 0, 0]), + prefix_len: Ipv4PrefixLen(4), + }; + pub fn ip(&self) -> Ipv4Addr { self.parts().0 } @@ -1146,6 +1183,24 @@ impl Ipv6Cidr { prefix_len: Ipv6PrefixLen(64), }; + /// IPv6 admin-local multicast scope prefix, `ff04::/16`. + pub const MCAST_ADMIN_LOCAL: Self = Self { + ip: Ipv6Addr::from_const([0xff04, 0, 0, 0, 0, 0, 0, 0]), + prefix_len: Ipv6PrefixLen(16), + }; + + /// IPv6 site-local multicast scope prefix, `ff05::/16`. + pub const MCAST_SITE_LOCAL: Self = Self { + ip: Ipv6Addr::from_const([0xff05, 0, 0, 0, 0, 0, 0, 0]), + prefix_len: Ipv6PrefixLen(16), + }; + + /// IPv6 organization-local multicast scope prefix, `ff08::/16`. 
+ pub const MCAST_ORG_LOCAL: Self = Self { + ip: Ipv6Addr::from_const([0xff08, 0, 0, 0, 0, 0, 0, 0]), + prefix_len: Ipv6PrefixLen(16), + }; + pub fn new(ip: Ipv6Addr, prefix_len: Ipv6PrefixLen) -> Self { let ip = ip.safe_mask(prefix_len); Ipv6Cidr { ip, prefix_len } @@ -1468,6 +1523,24 @@ mod test { assert_eq!(addr.solicited_node_multicast(), expected); } + #[test] + fn test_ipv6_admin_scoped_multicast() { + // Test the three valid administrative scopes + assert!(to_ipv6("ff04::1").is_admin_scoped_multicast()); // admin-local (0x4) + assert!(to_ipv6("ff05::1").is_admin_scoped_multicast()); // site-local (0x5) + assert!(to_ipv6("ff08::1").is_admin_scoped_multicast()); // organization-local (0x8) + + // Test non-admin scoped multicast addresses + assert!(!to_ipv6("ff01::1").is_admin_scoped_multicast()); // interface-local + assert!(!to_ipv6("ff02::1").is_admin_scoped_multicast()); // link-local + assert!(!to_ipv6("ff0e::1").is_admin_scoped_multicast()); // global + + // Test non-multicast addresses + assert!(!to_ipv6("fd00::1").is_admin_scoped_multicast()); // ULA + assert!(!to_ipv6("fe80::1").is_admin_scoped_multicast()); // link-local unicast + assert!(!to_ipv6("2001:db8::1").is_admin_scoped_multicast()); // global unicast + } + #[test] fn dhcp_fqdn() { let no_host = DhcpCfg { hostname: None, ..Default::default() }; diff --git a/crates/opte-api/src/lib.rs b/crates/opte-api/src/lib.rs index 7176e7a5..558a6e41 100644 --- a/crates/opte-api/src/lib.rs +++ b/crates/opte-api/src/lib.rs @@ -51,7 +51,7 @@ pub use ulp::*; /// /// We rely on CI and the check-api-version.sh script to verify that /// this number is incremented anytime the oxide-api code changes. -pub const API_VERSION: u64 = 37; +pub const API_VERSION: u64 = 38; /// Major version of the OPTE package. 
pub const MAJOR_VERSION: u64 = 0; diff --git a/crates/opte-api/src/mac.rs b/crates/opte-api/src/mac.rs index 1818a997..728774de 100644 --- a/crates/opte-api/src/mac.rs +++ b/crates/opte-api/src/mac.rs @@ -55,6 +55,11 @@ impl MacAddr { pub const fn from_const(bytes: [u8; 6]) -> Self { Self { inner: bytes } } + + /// Return whether this MAC address is broadcast/multicast. + pub const fn is_broadcast(&self) -> bool { + (self.inner[0] & 0b0000_0001) != 0 + } } impl From for smoltcp::wire::EthernetAddress { diff --git a/dtrace/opte-mcast-delivery.d b/dtrace/opte-mcast-delivery.d new file mode 100644 index 00000000..4924012a --- /dev/null +++ b/dtrace/opte-mcast-delivery.d @@ -0,0 +1,97 @@ +/* + * Track multicast packet delivery. + * + * dtrace -L ./lib -I . -Cqs ./opte-mcast-delivery.d + */ +#include "common.h" + +#define HDR_FMT "%-8s %-6s %-39s %-20s %-10s\n" +#define LINE_FMT "%-8s %-6d %-39s %-20s %-10s\n" + +BEGIN { + printf(HDR_FMT, "EVENT", "VNI", "GROUP", "PORT/NEXTHOP", "REPL"); + num = 0; +} + +sdt:xde::mcast-tx { + /* arg0=af, arg1=addr_ptr, arg2=vni, arg3=replication */ + this->af = arg0; + this->group_ptr = arg1; + this->vni = arg2; + this->repl = arg3; + + if (num >= 10) { + printf(HDR_FMT, "EVENT", "VNI", "GROUP", "PORT/NEXTHOP", "REPL"); + num = 0; + } + + this->group_str = (this->af == AF_INET) ? + inet_ntoa((ipaddr_t *)this->group_ptr) : + inet_ntoa6((in6_addr_t *)this->group_ptr); + this->repl_str = (this->repl == 0) ? "External" : + (this->repl == 1) ? "Underlay" : + (this->repl == 2) ? "All" : "Unknown"; + printf(LINE_FMT, "TX", this->vni, this->group_str, "-", this->repl_str); + num++; +} + +sdt:xde::mcast-rx { + /* arg0=af, arg1=addr_ptr, arg2=vni, arg3=replication */ + this->af = arg0; + this->group_ptr = arg1; + this->vni = arg2; + this->repl = arg3; + + if (num >= 10) { + printf(HDR_FMT, "EVENT", "VNI", "GROUP", "PORT/NEXTHOP", "REPL"); + num = 0; + } + + this->group_str = (this->af == AF_INET) ? 
+ inet_ntoa((ipaddr_t *)this->group_ptr) : + inet_ntoa6((in6_addr_t *)this->group_ptr); + this->repl_str = (this->repl == 0) ? "External" : + (this->repl == 1) ? "Underlay" : + (this->repl == 2) ? "All" : "Unknown"; + printf(LINE_FMT, "RX", this->vni, this->group_str, "-", this->repl_str); + num++; +} + +sdt:xde::mcast-local-delivery { + /* arg0=af, arg1=addr_ptr, arg2=vni, arg3=port */ + this->af = arg0; + this->group_ptr = arg1; + this->vni = arg2; + this->port = stringof(arg3); + + if (num >= 10) { + printf(HDR_FMT, "EVENT", "VNI", "GROUP", "PORT/NEXTHOP", "REPL"); + num = 0; + } + + this->group_str = (this->af == AF_INET) ? + inet_ntoa((ipaddr_t *)this->group_ptr) : + inet_ntoa6((in6_addr_t *)this->group_ptr); + printf(LINE_FMT, "DELIVER", this->vni, this->group_str, this->port, "-"); + num++; +} + +sdt:xde::mcast-underlay-fwd { + /* arg0=af, arg1=addr_ptr, arg2=vni, arg3=next_hop */ + this->af = arg0; + this->group_ptr = arg1; + this->vni = arg2; + this->next_hop = (in6_addr_t *)arg3; + + if (num >= 10) { + printf(HDR_FMT, "EVENT", "VNI", "GROUP", "PORT/NEXTHOP", "REPL"); + num = 0; + } + + this->group_str = (this->af == AF_INET) ? 
+ inet_ntoa((ipaddr_t *)this->group_ptr) : + inet_ntoa6((in6_addr_t *)this->group_ptr); + this->next_hop_str = inet_ntoa6(this->next_hop); + printf(LINE_FMT, "UNDERLAY", this->vni, this->group_str, this->next_hop_str, "-"); + num++; +} diff --git a/lib/opte-ioctl/src/lib.rs b/lib/opte-ioctl/src/lib.rs index c896ce4b..26fd831f 100644 --- a/lib/opte-ioctl/src/lib.rs +++ b/lib/opte-ioctl/src/lib.rs @@ -27,6 +27,8 @@ use opte::api::XDE_IOC_OPTE_CMD; use oxide_vpc::api::AddFwRuleReq; use oxide_vpc::api::AddRouterEntryReq; use oxide_vpc::api::AllowCidrReq; +use oxide_vpc::api::ClearMcast2PhysReq; +use oxide_vpc::api::ClearMcastForwardingReq; use oxide_vpc::api::ClearVirt2BoundaryReq; use oxide_vpc::api::ClearVirt2PhysReq; use oxide_vpc::api::CreateXdeReq; @@ -34,15 +36,20 @@ use oxide_vpc::api::DelRouterEntryReq; use oxide_vpc::api::DelRouterEntryResp; use oxide_vpc::api::DeleteXdeReq; use oxide_vpc::api::DhcpCfg; +use oxide_vpc::api::DumpMcastForwardingResp; use oxide_vpc::api::DumpVirt2BoundaryResp; use oxide_vpc::api::DumpVirt2PhysResp; use oxide_vpc::api::IpCidr; use oxide_vpc::api::ListPortsResp; +use oxide_vpc::api::McastSubscribeReq; +use oxide_vpc::api::McastUnsubscribeReq; use oxide_vpc::api::RemFwRuleReq; use oxide_vpc::api::RemoveCidrReq; use oxide_vpc::api::RemoveCidrResp; use oxide_vpc::api::SetExternalIpsReq; use oxide_vpc::api::SetFwRulesReq; +use oxide_vpc::api::SetMcast2PhysReq; +use oxide_vpc::api::SetMcastForwardingReq; use oxide_vpc::api::SetVirt2BoundaryReq; use oxide_vpc::api::SetVirt2PhysReq; use oxide_vpc::api::VpcCfg; @@ -205,6 +212,16 @@ impl OpteHdl { run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) } + pub fn set_m2p(&self, req: &SetMcast2PhysReq) -> Result { + let cmd = OpteCmd::SetMcast2Phys; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) + } + + pub fn clear_m2p(&self, req: &ClearMcast2PhysReq) -> Result { + let cmd = OpteCmd::ClearMcast2Phys; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) + } + pub fn 
set_v2b(&self, req: &SetVirt2BoundaryReq) -> Result { let cmd = OpteCmd::SetVirt2Boundary; run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) @@ -224,6 +241,48 @@ impl OpteHdl { run_cmd_ioctl(self.device.as_raw_fd(), cmd, None::<&()>) } + /// Set a multicast forwarding entry. + pub fn set_mcast_fwd( + &self, + req: &SetMcastForwardingReq, + ) -> Result { + let cmd = OpteCmd::SetMcastForwarding; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) + } + + /// Clear a multicast forwarding entry. + pub fn clear_mcast_fwd( + &self, + req: &ClearMcastForwardingReq, + ) -> Result { + let cmd = OpteCmd::ClearMcastForwarding; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) + } + + /// Dump the multicast forwarding table. + pub fn dump_mcast_fwd(&self) -> Result { + let cmd = OpteCmd::DumpMcastForwarding; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, None::<&()>) + } + + /// Subscribe a port to a multicast group. + pub fn mcast_subscribe( + &self, + req: &McastSubscribeReq, + ) -> Result { + let cmd = OpteCmd::McastSubscribe; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) + } + + /// Unsubscribe a port from a multicast group. + pub fn mcast_unsubscribe( + &self, + req: &McastUnsubscribeReq, + ) -> Result { + let cmd = OpteCmd::McastUnsubscribe; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) + } + /// Set xde underlay devices. pub fn set_xde_underlay( &self, diff --git a/lib/opte-test-utils/src/geneve_verify.rs b/lib/opte-test-utils/src/geneve_verify.rs new file mode 100644 index 00000000..9a510548 --- /dev/null +++ b/lib/opte-test-utils/src/geneve_verify.rs @@ -0,0 +1,183 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2025 Oxide Computer Company + +//! Module to parse and verify Geneve headers from snoop hex output. +//! +//! 
This uses the existing OPTE/ingot Geneve types to parse raw packet bytes +//! and extract key multicast-related fields for test assertions. + +use opte::engine::geneve::Vni; +use opte::engine::ip::v6::Ipv6Ref; +use opte::engine::parse::ValidGeneveOverV6; +use opte::ingot::geneve::GeneveRef; +use opte::ingot::types::HeaderParse; +use oxide_vpc::api::Ipv6Addr; +use oxide_vpc::api::Replication; +use oxide_vpc::engine::geneve::extract_multicast_replication; + +/// Parsed Geneve header information for test verification. +pub struct GeneveInfo { + pub vni: Vni, + pub outer_ipv6_dst: Ipv6Addr, + pub replication: Option, +} + +/// Parse a Geneve/IPv6 packet from raw bytes and extract multicast-related +/// fields. +/// +/// Returns VNI, outer IPv6 destination, and replication mode from Geneve +/// options. +pub fn parse_geneve_packet(bytes: &[u8]) -> Result { + let (pkt, _, _) = ValidGeneveOverV6::parse(bytes) + .map_err(|e| format!("Failed to parse Geneve/IPv6 packet: {e:?}"))?; + + let vni = pkt.outer_encap.vni(); + let outer_ipv6_dst = pkt.outer_v6.destination(); + let replication = extract_multicast_replication(&pkt.outer_encap); + + Ok(GeneveInfo { vni, outer_ipv6_dst, replication }) +} + +/// Parse hex string from snoop output into bytes. +/// +/// Snoop output with `-x0` flag is hex digits without separators: +/// "ffffffffffff001122334455..." +pub fn parse_snoop_hex(hex_str: &str) -> Result, String> { + hex_str + .as_bytes() + .chunks(2) + .map(|chunk| { + let hex_byte = std::str::from_utf8(chunk) + .map_err(|e| format!("Invalid UTF-8: {e}"))?; + u8::from_str_radix(hex_byte, 16) + .map_err(|e| format!("Invalid hex: {e}")) + }) + .collect() +} + +/// Extract snoop hex output from command output. +/// +/// We support common `snoop -P -x0` formats: +/// - Lines of contiguous hex digits (with or without spaces). +/// - Hex dumps with an offset prefix like `0:` or `0000:` followed by +/// groups of hex digits (2/4/8/16 chars). 
+/// +/// To avoid false positives from summary lines (e.g., "UDP port 6081"), the +/// tokenized fallback triggers only for lines that look like offset-prefixed +/// hex dumps. +pub fn extract_snoop_hex(snoop_output: &str) -> Result { + let mut hex_bytes = String::new(); + + for line in snoop_output.lines() { + let trimmed = line.trim(); + if trimmed.is_empty() || trimmed.contains("Using device") { + continue; + } + + // Case 1: entire line is hex digits + whitespace (e.g., "aa bb cc ..." or + // single long line of hex). Remove whitespace and append. + if trimmed.chars().all(|c| c.is_ascii_hexdigit() || c.is_whitespace()) { + for ch in trimmed.chars().filter(|c| c.is_ascii_hexdigit()) { + hex_bytes.push(ch); + } + continue; + } + + // Case 2: offset-prefixed hexdump lines (e.g., "0: 4500 003c ..."). + // Only consider tokenized parsing if the first token looks like an + // offset (decimal or hex) ending with a ':' to avoid pulling numbers + // from summary lines. + let mut tokens = trimmed.split_whitespace(); + let Some(first) = tokens.next() else { continue }; + if !first.ends_with(':') { + continue; // Not a hexdump line + } + let mut off = first.trim_end_matches(':'); + if off.starts_with("0x") || off.starts_with("0X") { + off = &off[2..]; + } + if !off.chars().all(|c| c.is_ascii_hexdigit()) { + continue; // Not a valid offset + } + + for tok in tokens { + let mut t = tok.trim_end_matches(':'); + if t.len() > 2 && (t.starts_with("0x") || t.starts_with("0X")) { + t = &t[2..]; + } + if t.is_empty() { + continue; + } + // Accept groups commonly used in dumps: bytes (2), words (4), dwords (8), + // or qwords (16). Ignore anything else to avoid accidental matches. + let len = t.len(); + if matches!(len, 2 | 4 | 8 | 16) + && t.chars().all(|c| c.is_ascii_hexdigit()) + { + hex_bytes.push_str(t); + } + } + } + + if hex_bytes.is_empty() { + return Err("No hex data found in snoop output".to_string()); + } + + // Ensure even number of nibbles to form complete bytes. 
+ if hex_bytes.len() % 2 == 1 { + hex_bytes.pop(); + } + + Ok(hex_bytes) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn extract_contiguous_hex() { + let input = "deadbeefCAFEBABE"; + let out = extract_snoop_hex(input).unwrap(); + assert_eq!(out, "deadbeefCAFEBABE"); + let bytes = parse_snoop_hex(&out).unwrap(); + assert_eq!(bytes, vec![0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe, 0xba, 0xbe]); + } + + #[test] + fn extract_bytes_with_spaces() { + let input = "45 00 00 3c 1c 46 40 00"; + let out = extract_snoop_hex(input).unwrap(); + assert_eq!(out, "4500003c1c464000"); + } + + #[test] + fn extract_offset_words() { + let input = "0: 4500 003c 1c46 4000"; + let out = extract_snoop_hex(input).unwrap(); + assert_eq!(out, "4500003c1c464000"); + } + + #[test] + fn extract_offset_bytes() { + let input = "0: 45 00 00 3c 1c 46 40 00"; + let out = extract_snoop_hex(input).unwrap(); + assert_eq!(out, "4500003c1c464000"); + } + + #[test] + fn ignore_summary_numbers() { + let input = r#" +Using device xde_test_sim1 (promiscuous) +UDP: fe80::1 > ff04::224.1.2.3, port 6081 +0: 4500 003c 1c46 4000 +"#; + let out = extract_snoop_hex(input).unwrap(); + assert_eq!(out, "4500003c1c464000"); + // Should not accidentally include "6081" + assert!(!out.contains("6081")); + } +} diff --git a/lib/opte-test-utils/src/lib.rs b/lib/opte-test-utils/src/lib.rs index a4f3cb7b..efbf2a0d 100644 --- a/lib/opte-test-utils/src/lib.rs +++ b/lib/opte-test-utils/src/lib.rs @@ -10,6 +10,7 @@ #![allow(dead_code)] pub mod dhcp; +pub mod geneve_verify; pub mod icmp; pub mod pcap; #[macro_use] @@ -84,6 +85,7 @@ pub use oxide_vpc::engine::gateway; pub use oxide_vpc::engine::geneve::OxideOptionType; pub use oxide_vpc::engine::nat; pub use oxide_vpc::engine::overlay; +pub use oxide_vpc::engine::overlay::PerVniMaps; pub use oxide_vpc::engine::overlay::TUNNEL_ENDPOINT_MAC; pub use oxide_vpc::engine::overlay::Virt2Boundary; pub use oxide_vpc::engine::overlay::Virt2Phys; @@ -253,7 +255,7 @@ fn 
oxide_net_builder( name: &str, cfg: &oxide_vpc::cfg::VpcCfg, vpc_map: Arc, - v2p: Arc, + vni_state: Arc, v2b: Arc, ) -> PortBuilder { #[allow(clippy::arc_with_non_send_sync)] @@ -268,11 +270,11 @@ fn oxide_net_builder( let dhcp = base_dhcp_config(); firewall::setup(&mut pb, fw_limit).expect("failed to add firewall layer"); - gateway::setup(&pb, cfg, vpc_map, fw_limit, &dhcp) + gateway::setup(&pb, cfg, vpc_map.clone(), fw_limit, &dhcp) .expect("failed to setup gateway layer"); router::setup(&pb, cfg, one_limit).expect("failed to add router layer"); nat::setup(&mut pb, cfg, snat_limit).expect("failed to add nat layer"); - overlay::setup(&pb, cfg, v2p, v2b, one_limit) + overlay::setup(&pb, cfg, vni_state, vpc_map.clone(), v2b, one_limit) .expect("failed to add overlay layer"); pb } @@ -383,10 +385,12 @@ pub fn oxide_net_setup2( let mut updates = vec![ // * Epoch starts at 1, adding router entry bumps it to 2. "set:epoch=2", - // * Allow inbound IPv6 traffic for guest. - // * Allow inbound IPv4 traffic for guest. + // * Allow inbound IPv4 unicast traffic for guest. + // * Allow inbound IPv4 multicast traffic for guest. + // * Allow inbound IPv6 unicast traffic for guest. + // * Allow inbound IPv6 multicast traffic for guest. // * Deny inbound NDP for guest. 
- "set:gateway.rules.in=3", + "set:gateway.rules.in=5", // IPv4 // ---- // @@ -394,7 +398,8 @@ pub fn oxide_net_setup2( // * ICMP Echo Reply for Gateway // * DHCP Offer // * DHCP Ack - // * Outbound traffic from Guest IP + MAC address + // * Outbound unicast traffic from Guest IP + MAC address + // * Outbound multicast traffic from Guest IP + MAC address // // IPv6 // ---- @@ -405,8 +410,9 @@ pub fn oxide_net_setup2( // * ICMPv6 Echo Reply for Gateway from Guest Link-Local // * ICMPv6 Echo Reply for Gateway from Guest VPC ULA // * DHCPv6 - // * Outbound traffic from Guest IPv6 + MAC Address - "set:gateway.rules.out=12", + // * Outbound unicast traffic from Guest IPv6 + MAC Address + // * Outbound multicast traffic from Guest IPv6 + MAC Address + "set:gateway.rules.out=14", // * Allow all outbound traffic "set:firewall.rules.out=0", // * Outbound IPv4 SNAT diff --git a/lib/opte/README.adoc b/lib/opte/README.adoc index 3bf6fe79..97f19242 100644 --- a/lib/opte/README.adoc +++ b/lib/opte/README.adoc @@ -209,11 +209,47 @@ resources. Pausing, Saving, & Restoring:: A port may be paused, saved, and restored for the purpose of live migration. The pausing of a state allows it to halt all packet processing and quiesce to a steady state. -In this state is is then possible to save the port's state which has +In this state it is then possible to save the port's state which has all data needed to restart the port without rebuilding the entire flow state. This is achieved by restoring the port based on some payload of save data. +=== Multicast Model + +OPTE implements multicast consistent with the rack networking +architecture described in https://rfd.shared.oxide.computer/rfd/0063[RFD 63] +and https://rfd.shared.oxide.computer/rfd/488[RFD 488]. Key points: + +Fleet VNI:: All multicast traffic uses a single fleet‑level Geneve VNI +(`DEFAULT_MULTICAST_VNI`, currently `77`) rather than per‑tenant VNIs. 
+Mappings from overlay multicast groups to underlay multicast addresses +are stored and validated under this VNI. (See `RFD 488` for the rationale behind +fleet-level VNI.) + +Delivery Modes (Replication):: The Oxide Geneve multicast option carries +the delivery mode as a 2‑bit field in the top two bits of the option +body's first byte: + +* External — local guest delivery within the same VNI: OPTE decapsulates + and delivers to all local subscribers (guests) on the port map. +* Underlay — infrastructure delivery: OPTE sends Geneve‑encapsulated + packets towards the configured underlay multicast address in fleet + VNI 77. The underlay performs any further replication. +* All — both behaviors above. + +Encapsulation Path:: The overlay layer sets `External` in the multicast +option on initial encapsulation. XDE uses its multicast forwarding table +to decide whether to additionally forward to underlay next hops, and, if +so, marks those forwarded copies as `Underlay` or `All` to prevent +re‑relay at downstream receivers. + +Constraints & Validation:: + +* M2P (multicast‑to‑physical) mappings must use `DEFAULT_MULTICAST_VNI`. +* Any next hop that causes underlay forwarding must specify VNI 77. +* Underlay multicast addresses must be IPv6 admin‑scoped (e.g., + `ff04::/16`, `ff05::/16`, `ff08::/16`). + === Layers The main function of the port is to process packets in a flow-based diff --git a/lib/opte/src/ddi/mblk.rs b/lib/opte/src/ddi/mblk.rs index e6bce52f..6b89aff8 100644 --- a/lib/opte/src/ddi/mblk.rs +++ b/lib/opte/src/ddi/mblk.rs @@ -16,6 +16,7 @@ use core::cmp::Ordering; use core::marker::PhantomData; use core::mem::ManuallyDrop; use core::mem::MaybeUninit; +use core::num::NonZeroUsize; use core::ops::Deref; use core::ops::DerefMut; use core::ptr; @@ -300,6 +301,68 @@ impl MsgBlk { out } + /// Copy the first `n` bytes of this packet into a new `mblk_t`, + /// increasing the refcount of all remaining segments. 
+ /// + /// On non-kernel platforms this will simply clone the underlying packet + /// with the desired segmentation. + pub fn pullup( + &self, + n: Option, + ) -> Result { + let totlen = self.byte_len(); + + if let Some(n) = n + && n.get() > totlen + { + // The DDI function will bail out if this is the case, but + // we'll be none the wiser to *what* the failure mode was. + return Err(PktPullupError::TooLong); + } + + cfg_if! { + if #[cfg(all(not(feature = "std"), not(test)))] { + let out = unsafe { + ddi::msgpullup( + self.0.as_ptr(), + n.map(|v| v.get() as isize).unwrap_or(-1), + ) + }; + + let mp = NonNull::new(out) + .ok_or(PktPullupError::AllocFailed)?; + + Ok(Self(mp)) + } else { + // We aren't (currently?) simulating refcount tracking at all + // in our userland mblk abstraction. + // Do the segmentation right, but otherwise it's fully cloned. + let to_ensure = n.map(|v| v.get()).unwrap_or(totlen); + let mut top_mblk = MsgBlk::new(to_ensure); + let mut still_to_write = to_ensure; + + for chunk in self.iter() { + let mut left_in_chunk = chunk.len(); + let to_take = chunk.len().min(still_to_write); + + if still_to_write != 0 { + top_mblk.write_bytes_back(&chunk[..to_take]) + .expect("to_take should be <= remaining capacity"); + } + + still_to_write -= to_take; + left_in_chunk -= to_take; + + if left_in_chunk != 0 { + top_mblk.append(MsgBlk::copy(&chunk[to_take..])); + } + } + + Ok(top_mblk) + } + } + } + /// Creates a new [`MsgBlk`] using a given set of packet headers. pub fn new_pkt(emit: impl Emit + EmitDoesNotRelyOnBufContents) -> Self { let mut pkt = Self::new(emit.packet_length()); @@ -1034,6 +1097,26 @@ impl core::fmt::Display for PktInfoError { } } +/// Reasons a [`MsgBlk`] could not be pulled up. +#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Hash)] +pub enum PktPullupError { + /// Requested pullup was longer than the underlying packet. + TooLong, + /// The OS was unable to allocate a [`MsgBlk`].
+ AllocFailed, +} + +impl core::error::Error for PktPullupError {} + +impl core::fmt::Display for PktPullupError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.write_str(match self { + Self::TooLong => "requested pullup is longer than packet", + Self::AllocFailed => "failed to allocate an mblk_t", + }) + } +} + /// Counts the number of segments in an `mblk_t` from `head`, linked /// via `b_cont`. unsafe fn count_mblk_chain(mut head: Option>) -> usize { diff --git a/lib/oxide-vpc/src/api.rs b/lib/oxide-vpc/src/api.rs index b1e82e62..14f443e8 100644 --- a/lib/oxide-vpc/src/api.rs +++ b/lib/oxide-vpc/src/api.rs @@ -20,6 +20,81 @@ use serde::Deserialize; use serde::Serialize; use uuid::Uuid; +/// Multicast packet replication strategy. +/// +/// Encoding and scope: +/// - The Geneve Oxide multicast option encodes replication in the top 2 bits +/// of the option body’s first byte (u2). The remaining 30 bits are reserved. +/// - External means local customer-facing delivery within the same VNI +/// - Underlay means Geneve-encapsulated forwarding to underlay infrastructure +/// members using the fleet multicast VNI. +/// - All combines both behaviors. +/// +/// Current implementation uses a single fleet VNI (DEFAULT_MULTICAST_VNI = 77) +/// for all multicast traffic rack-wide (RFD 488 "Multicast across VPCs"). +#[derive( + Clone, Copy, Debug, Default, Serialize, Deserialize, Eq, PartialEq, Hash, +)] +#[repr(u8)] +pub enum Replication { + /// Replicate packets to external/customer-facing members (guest instances). + /// + /// Local delivery within the same VNI. Packets are decapsulated at the + /// switch before delivery to guests. + #[default] + External = 0x00, + /// Replicate packets to underlay/infrastructure members. + /// + /// Forwards Geneve-encapsulated packets to underlay destinations for + /// infrastructure delivery (not directly to guest instances). Uses + /// DEFAULT_MULTICAST_VNI (77) for encapsulation. 
+ Underlay = 0x01, + /// Replicate packets to both external and underlay members (bifurcated). + /// + /// Combines both customer-facing (decapsulated to guests) and infrastructure + /// (encapsulated) delivery modes for comprehensive multicast distribution. + All = 0x02, + /// Reserved for future use. This value exists to account for all possible + /// values in the 2-bit Geneve option field. + Reserved = 0x03, +} + +impl Replication { + /// Merge two replication strategies, preferring the most permissive. + /// + /// Merging rules: + /// - Any `All` -> `All` + /// - `External` + `Underlay` -> `All` + /// - Same values -> keep the value + /// - Default to `All` for unexpected combinations + pub const fn merge(self, other: Self) -> Self { + match (self, other) { + (Self::All, _) | (_, Self::All) => Self::All, + (Self::External, Self::Underlay) + | (Self::Underlay, Self::External) => Self::All, + (a, b) if a as u8 == b as u8 => a, + // Prefer `All` for unexpected combinations + _ => Self::All, + } + } +} + +#[cfg(any(feature = "std", test))] +impl FromStr for Replication { + type Err = String; + + fn from_str(s: &str) -> Result { + match s.to_ascii_lowercase().as_str() { + "external" => Ok(Self::External), + "underlay" => Ok(Self::Underlay), + "all" => Ok(Self::All), + lower => Err(format!( + "unexpected replication type {lower} -- expected 'external', 'underlay', or 'all'" + )), + } + } +} + /// This is the MAC address that OPTE uses to act as the virtual gateway. pub const GW_MAC_ADDR: MacAddr = MacAddr::from_const([0xA8, 0x40, 0x25, 0xFF, 0x77, 0x77]); @@ -27,6 +102,19 @@ pub const GW_MAC_ADDR: MacAddr = /// tunnel endpoint. pub const BOUNDARY_SERVICES_VNI: u32 = 99u32; +/// Default VNI for rack-wide multicast groups (no VPC association). +/// Must match Omicron's DEFAULT_MULTICAST_VNI. +/// +/// This is the only VNI currently supported for multicast traffic. +/// All multicast groups (M2P mappings and forwarding entries) must use this VNI. 
+/// OPTE validates that multicast operations specify this VNI and rejects others. +/// +/// **Security model:** While M2P (Multicast-to-Physical) mappings are stored +/// per-VNI in the code, the enforcement of DEFAULT_MULTICAST_VNI means all +/// multicast traffic shares a single namespace across the rack, with no +/// VPC-level isolation (as multicast groups are fleet-wide). +pub const DEFAULT_MULTICAST_VNI: u32 = 77u32; + /// Description of Boundary Services, the endpoint used to route traffic /// to external networks. // @@ -303,6 +391,44 @@ pub struct PhysNet { pub vni: Vni, } +/// Represents an IPv6 next hop for multicast forwarding. +#[derive( + Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq, PartialOrd, Ord, +)] +pub struct NextHopV6 { + /// The IPv6 address of the next hop + pub addr: Ipv6Addr, + /// The VNI to use for this next hop + pub vni: Vni, +} + +impl NextHopV6 { + pub fn new(addr: Ipv6Addr, vni: Vni) -> Self { + Self { addr, vni } + } +} + +/// A next hop for multicast forwarding (supports both IPv4 and IPv6). +#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)] +pub struct NextHop { + /// The IP address of the next hop + pub addr: IpAddr, + /// The VNI to use for this next hop + pub vni: Vni, +} + +impl NextHop { + pub fn new(addr: IpAddr, vni: Vni) -> Self { + Self { addr, vni } + } +} + +impl From for NextHop { + fn from(v6: NextHopV6) -> Self { + Self { addr: v6.addr.into(), vni: v6.vni } + } +} + /// A Geneve tunnel endpoint. #[derive(Clone, Copy, Debug, Deserialize, Serialize)] pub struct TunnelEndpoint { @@ -366,12 +492,18 @@ impl From for GuestPhysAddr { /// abstraction, it's simply allowing one subnet to talk to another. /// There is no separate VPC router process, the real routing is done /// by the underlay. +/// +/// * Multicast: Packets matching this entry are multicast traffic. +/// Uses the M2P (Multicast-to-Physical) mapping to determine underlay +/// destinations. 
Does not apply SNAT; the outer IPv6 underlay source +/// is the physical IP. #[derive(Clone, Debug, Copy, Deserialize, Serialize)] pub enum RouterTarget { Drop, InternetGateway(Option), Ip(IpAddr), VpcSubnet(IpCidr), + Multicast(IpCidr), } #[cfg(any(feature = "std", test))] @@ -403,6 +535,15 @@ impl FromStr for RouterTarget { cidr6s.parse().map(|x| Self::VpcSubnet(IpCidr::Ip6(x))) } + Some(("mcast4", cidr4s)) => { + let cidr4 = cidr4s.parse()?; + Ok(Self::Multicast(IpCidr::Ip4(cidr4))) + } + + Some(("mcast6", cidr6s)) => { + cidr6s.parse().map(|x| Self::Multicast(IpCidr::Ip6(x))) + } + Some(("ig", uuid)) => Ok(Self::InternetGateway(Some( uuid.parse::().map_err(|e| e.to_string())?, ))), @@ -423,6 +564,12 @@ impl Display for RouterTarget { Self::Ip(IpAddr::Ip6(ip6)) => write!(f, "ip6={ip6}"), Self::VpcSubnet(IpCidr::Ip4(sub4)) => write!(f, "sub4={sub4}"), Self::VpcSubnet(IpCidr::Ip6(sub6)) => write!(f, "sub6={sub6}"), + Self::Multicast(IpCidr::Ip4(mcast4)) => { + write!(f, "mcast4={mcast4}") + } + Self::Multicast(IpCidr::Ip6(mcast6)) => { + write!(f, "mcast6={mcast6}") + } } } } @@ -529,6 +676,8 @@ pub struct VpcMapResp { pub vni: Vni, pub ip4: Vec<(Ipv4Addr, GuestPhysAddr)>, pub ip6: Vec<(Ipv6Addr, GuestPhysAddr)>, + pub mcast_ip4: Vec<(Ipv4Addr, Ipv6Addr)>, + pub mcast_ip6: Vec<(Ipv6Addr, Ipv6Addr)>, } #[derive(Debug, Deserialize, Serialize)] @@ -565,6 +714,28 @@ pub struct ClearVirt2PhysReq { pub phys: PhysNet, } +/// Set mapping from multicast group to underlay multicast address. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SetMcast2PhysReq { + /// Overlay multicast group address + pub group: IpAddr, + /// Underlay IPv6 multicast address + pub underlay: Ipv6Addr, + /// VNI for this mapping + pub vni: Vni, +} + +/// Clear a mapping from multicast group to underlay multicast address. 
+#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct ClearMcast2PhysReq { + /// Overlay multicast group address + pub group: IpAddr, + /// Underlay IPv6 multicast address + pub underlay: Ipv6Addr, + /// VNI for this mapping + pub vni: Vni, +} + /// Set a mapping from a VPC IP to boundary tunnel endpoint destination. #[derive(Clone, Debug, Deserialize, Serialize)] pub struct SetVirt2BoundaryReq { @@ -605,8 +776,60 @@ pub enum DelRouterEntryResp { NotFound, } +/// Set multicast forwarding entries for a multicast group. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SetMcastForwardingReq { + /// The multicast group address (overlay) + pub group: IpAddr, + /// The next hops (underlay IPv6 addresses) with replication information + pub next_hops: Vec<(NextHopV6, Replication)>, +} + +/// Clear multicast forwarding entries for a multicast group. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct ClearMcastForwardingReq { + /// The multicast group address + pub group: IpAddr, +} + +/// Response for dumping the multicast forwarding table. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct DumpMcastForwardingResp { + /// The multicast forwarding table entries + pub entries: Vec, +} + +impl CmdOk for DumpMcastForwardingResp {} + +/// A single multicast forwarding table entry. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct McastForwardingEntry { + /// The multicast group address (overlay) + pub group: IpAddr, + /// The next hops (underlay IPv6 addresses) with replication information + pub next_hops: Vec<(NextHopV6, Replication)>, +} + impl opte::api::cmd::CmdOk for DelRouterEntryResp {} +/// Subscribe a port to a multicast group. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct McastSubscribeReq { + /// The port name to subscribe + pub port_name: String, + /// The multicast group address + pub group: IpAddr, +} + +/// Unsubscribe a port from a multicast group. 
+#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct McastUnsubscribeReq { + /// The port name to unsubscribe + pub port_name: String, + /// The multicast group address + pub group: IpAddr, +} + #[derive(Clone, Debug, Deserialize, Serialize)] pub struct SetExternalIpsReq { pub port_name: String, diff --git a/lib/oxide-vpc/src/engine/gateway/mod.rs b/lib/oxide-vpc/src/engine/gateway/mod.rs index eb2c3b44..74ff34bc 100644 --- a/lib/oxide-vpc/src/engine/gateway/mod.rs +++ b/lib/oxide-vpc/src/engine/gateway/mod.rs @@ -56,6 +56,8 @@ use opte::api::Direction; use opte::api::OpteError; use opte::engine::ether::EtherMod; use opte::engine::headers::HeaderAction; +use opte::engine::ip::v4::Ipv4Cidr; +use opte::engine::ip::v6::Ipv6Cidr; use opte::engine::layer::DefaultAction; use opte::engine::layer::Layer; use opte::engine::layer::LayerActions; @@ -173,7 +175,7 @@ fn setup_ipv4( let vpc_meta = Arc::new(VpcMeta::new(vpc_mappings)); - let mut nospoof_out = Rule::new(1000, Action::Meta(vpc_meta)); + let mut nospoof_out = Rule::new(1000, Action::Meta(vpc_meta.clone())); nospoof_out.add_predicate(Predicate::InnerSrcIp4(vec![ Ipv4AddrMatch::Exact(ip_cfg.private_ip), ])); @@ -196,6 +198,27 @@ fn setup_ipv4( ])); layer.add_rule(Direction::In, unicast_in.finalize()); + // Multicast prefixes (224.0.0.0/4) + let ipv4_mcast = vec![Ipv4AddrMatch::Prefix(Ipv4Cidr::MCAST)]; + + // Outbound multicast - allow from guest's MAC to multicast destinations + let mut mcast_out = Rule::new(1001, Action::Meta(vpc_meta.clone())); + mcast_out.add_predicate(Predicate::InnerDstIp4(ipv4_mcast.clone())); + mcast_out.add_predicate(Predicate::InnerEtherSrc(vec![ + EtherAddrMatch::Exact(cfg.guest_mac), + ])); + layer.add_rule(Direction::Out, mcast_out.finalize()); + + // Inbound multicast - allow multicast destinations to guest + let mut mcast_in = Rule::new( + 1001, + Action::Static(Arc::new(RewriteSrcMac { + gateway_mac: cfg.gateway_mac, + })), + ); + 
mcast_in.add_predicate(Predicate::InnerDstIp4(ipv4_mcast)); + layer.add_rule(Direction::In, mcast_in.finalize()); + Ok(()) } @@ -209,7 +232,7 @@ fn setup_ipv6( icmpv6::setup(layer, cfg, ip_cfg)?; dhcpv6::setup(layer, cfg, dhcp_cfg)?; let vpc_meta = Arc::new(VpcMeta::new(vpc_mappings)); - let mut nospoof_out = Rule::new(1000, Action::Meta(vpc_meta)); + let mut nospoof_out = Rule::new(1000, Action::Meta(vpc_meta.clone())); nospoof_out.add_predicate(Predicate::InnerSrcIp6(vec![ Ipv6AddrMatch::Exact(ip_cfg.private_ip), ])); @@ -232,6 +255,32 @@ fn setup_ipv6( ])); layer.add_rule(Direction::In, unicast_in.finalize()); + // Admin-/site-/org-scoped multicast prefixes (for underlay forwarding) + let admin_mcast_prefixes = vec![ + Ipv6AddrMatch::Prefix(Ipv6Cidr::MCAST_ADMIN_LOCAL), + Ipv6AddrMatch::Prefix(Ipv6Cidr::MCAST_SITE_LOCAL), + Ipv6AddrMatch::Prefix(Ipv6Cidr::MCAST_ORG_LOCAL), + ]; + + // Outbound multicast - allow from guest's MAC to multicast destinations + let mut mcast_out = Rule::new(1001, Action::Meta(vpc_meta.clone())); + mcast_out + .add_predicate(Predicate::InnerDstIp6(admin_mcast_prefixes.clone())); + mcast_out.add_predicate(Predicate::InnerEtherSrc(vec![ + EtherAddrMatch::Exact(cfg.guest_mac), + ])); + layer.add_rule(Direction::Out, mcast_out.finalize()); + + // Inbound multicast - allow multicast destinations to guest + let mut mcast_in = Rule::new( + 1001, + Action::Static(Arc::new(RewriteSrcMac { + gateway_mac: cfg.gateway_mac, + })), + ); + mcast_in.add_predicate(Predicate::InnerDstIp6(admin_mcast_prefixes)); + layer.add_rule(Direction::In, mcast_in.finalize()); + Ok(()) } diff --git a/lib/oxide-vpc/src/engine/geneve.rs b/lib/oxide-vpc/src/engine/geneve.rs index f22ed8c6..0cb18be6 100644 --- a/lib/oxide-vpc/src/engine/geneve.rs +++ b/lib/oxide-vpc/src/engine/geneve.rs @@ -5,7 +5,72 @@ // Copyright 2025 Oxide Computer Company //! Geneve option types specific to the Oxide VPC dataplane. - +//! +//! # Oxide Geneve Options +//! +//! 
This module defines Geneve options used in the Oxide rack network to carry +//! VPC-specific metadata during packet encapsulation. All options use the Oxide +//! option class (`GENEVE_OPT_CLASS_OXIDE` = 0x0129). +//! +//! ## Option Types +//! +//! - **External** (0x00): Indicates a packet originated from outside the rack +//! and was encapsulated by the switch NAT ingress path with Geneve wrapping. +//! OPTE decapsulates before delivering to the guest. +//! - **Multicast** (0x01): Carries multicast replication strategy as a 2-bit +//! field for coordinating delivery between OPTE and sidecar switch logic. +//! - **Mss** (0x02): Carries original TCP MSS for MSS clamping/boosting to +//! prevent MTU issues during underlay encapsulation. +//! +//! ## Multicast Option Encoding +//! +//! The multicast option uses a compact 2-bit encoding aligned with sidecar.p4's +//! processing constraints: +//! +//! ```text +//! Option body (4 bytes): +//! ┌──────────┬────────────────────────────┐ +//! │ Bits 7-6 │ Bits 5-0 + remaining bytes │ +//! │ (u2) │ (reserved, must be 0) │ +//! └──────────┴────────────────────────────┘ +//! │ +//! └─> Replication mode: +//! 00 = External (local guest delivery) +//! 01 = Underlay (infrastructure forwarding) +//! 10 = All (both External and Underlay) +//! 11 = Reserved +//! ``` +//! +//! ### Replication Semantics +//! +//! - **External**: Packet should be decapsulated and delivered to local guest +//! instances subscribed to this multicast group. Switch sets `nat_egress_hit` +//! to trigger decapsulation before delivery. +//! - **Underlay**: Packet should remain encapsulated and forwarded to underlay +//! infrastructure destinations. +//! - **All**: Bifurcated delivery to both local guests (decapsulated) and +//! underlay destinations (encapsulated). +//! +//! All multicast packets are encapsulated with fleet VNI 77 (`DEFAULT_MULTICAST_VNI`) +//! regardless of replication mode. The replication mode determines delivery behavior, +//! 
not VNI selection. +//! +//! The 2-bit encoding allows efficient extraction in P4 programs without complex +//! parsing, aligning with the sidecar pipeline's tag-based routing decisions. +//! +//! ## Option Length Encoding +//! +//! Geneve has two length fields to consider (both measured in 4-byte words): +//! - Geneve header `opt_len` (6 bits): total size of the options area +//! (sums each option's 4-byte header + body). +//! - Option header `len` (5 bits): size of that option's body only. +//! +//! For Oxide options used here: +//! - External: geneve opt_len += 1; option len = 0 +//! - Multicast: geneve opt_len += 2; option len = 1 +//! - MSS: geneve opt_len += 2; option len = 1 + +use crate::api::Replication; use ingot::geneve::GeneveFlags; use ingot::geneve::GeneveRef; use ingot::geneve::ValidGeneve; @@ -84,28 +149,24 @@ impl<'a> OptionCast<'a> for ValidOxideOption<'a> { } } +/// Geneve multicast option body carrying replication strategy information. +/// +/// This option encodes the replication scope as a 2-bit field in the top two +/// bits of the first byte of the option body. The remaining 30 bits are +/// reserved for future use. The replication strategy determines whether the +/// packet is delivered to local guest instances (External), underlay +/// infrastructure destinations (Underlay), or both (All). #[derive(Debug, Clone, Ingot, Eq, PartialEq)] #[ingot(impl_default)] pub struct MulticastInfo { + /// Replication scope encoded as a u2 (top 2 bits of the first byte). + /// Values map to `Replication::{External, Underlay, All, Reserved}`. #[ingot(is = "u2")] pub version: Replication, + /// Reserved bits (remaining 30 bits of the body). rsvd: u30be, } -#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Hash)] -#[repr(u8)] -pub enum Replication { - /// Replicate packets to ports set for external multicast traffic. - #[default] - External = 0x00, - /// Replicate packets to ports set for underlay multicast traffic. 
- Underlay, - /// Replicate packets to ports set for underlay and external multicast - /// traffic (bifurcated). - All, - Reserved, -} - impl NetworkRepr for Replication { fn to_network(self) -> u2 { self as u8 @@ -118,7 +179,7 @@ impl NetworkRepr for Replication { 1 => Replication::Underlay, 2 => Replication::All, 3 => Replication::Reserved, - _ => panic!("outside bounds of u2"), + _ => unreachable!("u2 value out of range: {val}"), } } } @@ -157,6 +218,33 @@ pub fn validate_options( Ok(()) } +/// Extract multicast replication info from Geneve options. +/// Returns None if no multicast option is present, or Some(Replication) if found. +/// +/// Treats Reserved (value 3) as invalid and returns None, implementing fail-closed +/// behavior without crashing the parser. +/// +/// Note: This function silently skips options with parse errors (e.g., TooSmall). +/// Call `validate_options()` first if you want parse errors surfaced instead of +/// being silently ignored. +pub fn extract_multicast_replication( + pkt: &ValidGeneve, +) -> Option { + for opt in OxideOptions::from_raw(pkt) { + let Ok(opt) = opt else { continue }; + if let Some(ValidOxideOption::Multicast(mc_info)) = opt.option.known() { + let repl = mc_info.version(); + // Filter out Reserved (u2=3). This value exists in the 2-bit space + // but is not used by sidecar P4; treat as invalid. 
+ if matches!(repl, Replication::Reserved) { + return None; + } + return Some(repl); + } + } + None +} + #[cfg(test)] pub fn valid_geneve_has_oxide_external( pkt: &ValidGeneve, @@ -177,6 +265,7 @@ pub fn valid_geneve_has_oxide_external( #[cfg(test)] mod test { use super::*; + use alloc::vec::Vec; use ingot::types::HeaderParse; use ingot::udp::ValidUdp; @@ -201,7 +290,6 @@ mod test { 0x65, 0x58, // vni + reserved 0x00, 0x04, 0xD2, 0x00, - // option class 0x01, 0x29, // crt + type @@ -219,6 +307,57 @@ mod test { assert!(valid_geneve_has_oxide_external(&geneve)); } + #[test] + fn parse_multicast_replication_values() { + // Build a minimal UDP+Geneve packet with one Oxide multicast option + // Body's first byte top-2 bits carry Replication. + fn build_buf(rep: Replication) -> Vec { + #[rustfmt::skip] + let mut buf = vec![ + // UDP source + 0x1E, 0x61, + // UDP dest + 0x17, 0xC1, + // UDP length (8 UDP hdr + 8 Geneve hdr + 4 opt hdr + 4 opt body = 24 = 0x18) + 0x00, 0x18, + // UDP csum + 0x00, 0x00, + // Geneve: ver + opt len (2 words = 8 bytes: 4 opt hdr + 4 opt body) + 0x02, + // Geneve flags + 0x00, + // Geneve proto + 0x65, 0x58, + // Geneve vni + reserved + 0x00, 0x00, 0x00, 0x00, + // Geneve option: class 0x0129 (Oxide) + 0x01, 0x29, + // Geneve option: flags+type (non-critical, Multicast = 0x01) + 0x01, + // Geneve option: rsvd + len (1 word = 4 bytes body) + 0x01, + ]; + // Geneve option body: 4-byte body with replication in top 2 bits + buf.push((rep as u8) << 6); + buf.extend_from_slice(&[0x00, 0x00, 0x00]); + buf + } + + for (rep, expect) in [ + (Replication::External, Replication::External), + (Replication::Underlay, Replication::Underlay), + (Replication::All, Replication::All), + ] { + let buf = build_buf(rep); + let (.., rem) = ValidUdp::parse(&buf[..]).unwrap(); + let (geneve, ..) 
= ValidGeneve::parse(rem).unwrap(); + validate_options(&geneve).unwrap(); + + let got = extract_multicast_replication(&geneve).unwrap(); + assert_eq!(got, expect); + } + } + #[test] fn unknown_crit_option_fails() { // Create a packet with one extension header with the critical @@ -242,7 +381,6 @@ mod test { 0x65, 0x58, // vni + reserved 0x00, 0x04, 0xD2, 0x00, - // experimenter option class 0xff, 0xff, // crt + type @@ -281,7 +419,6 @@ mod test { 0x65, 0x58, // vni + reserved 0x00, 0x04, 0xD2, 0x00, - // experimenter option class 0x01, 0x29, // crt + type @@ -314,8 +451,8 @@ mod test { 0x1E, 0x61, // dest 0x17, 0xC1, - // length - 0x00, 0x1c, + // length (8 UDP hdr + 8 Geneve hdr + 20 options = 36 = 0x24) + 0x00, 0x24, // csum 0x00, 0x00, // ver + opt len @@ -326,14 +463,12 @@ mod test { 0x65, 0x58, // vni + reserved 0x00, 0x04, 0xD2, 0x00, - // option class 0x01, 0x29, // crt + type 0x00, // rsvd + len 0x00, - // experimenter option class 0xff, 0xff, // crt + type @@ -342,7 +477,6 @@ mod test { 0x01, // body 0x00, 0x00, 0x00, 0x00, - // experimenter option class 0xff, 0xff, // crt + type diff --git a/lib/oxide-vpc/src/engine/overlay.rs b/lib/oxide-vpc/src/engine/overlay.rs index 5149416a..111ccdf9 100644 --- a/lib/oxide-vpc/src/engine/overlay.rs +++ b/lib/oxide-vpc/src/engine/overlay.rs @@ -9,10 +9,12 @@ //! This implements the Oxide Network VPC Overlay. 
use super::geneve::OxideOptions; use super::router::RouterTargetInternal; +use crate::api::DEFAULT_MULTICAST_VNI; use crate::api::DumpVirt2BoundaryResp; use crate::api::DumpVirt2PhysResp; use crate::api::GuestPhysAddr; use crate::api::PhysNet; +use crate::api::Replication; use crate::api::TunnelEndpoint; use crate::api::V2bMapResp; use crate::api::VpcMapResp; @@ -80,7 +82,8 @@ pub const OVERLAY_LAYER_NAME: &str = "overlay"; pub fn setup( pb: &PortBuilder, cfg: &VpcCfg, - v2p: Arc, + vni_state: Arc, + vpc_map: Arc, v2b: Arc, ft_limit: core::num::NonZeroU32, ) -> core::result::Result<(), OpteError> { @@ -88,7 +91,8 @@ pub fn setup( let encap = Action::Static(Arc::new(EncapAction::new( cfg.phys_ip, cfg.vni, - v2p, + vni_state, + vpc_map, v2b, ))); @@ -182,7 +186,8 @@ pub struct EncapAction { // sending data. phys_ip_src: Ipv6Addr, vni: Vni, - v2p: Arc, + vni_state: Arc, + vpc_map: Arc, v2b: Arc, } @@ -190,10 +195,11 @@ impl EncapAction { pub fn new( phys_ip_src: Ipv6Addr, vni: Vni, - v2p: Arc, + vni_state: Arc, + vpc_map: Arc, v2b: Arc, ) -> Self { - Self { phys_ip_src, vni, v2p, v2b } + Self { phys_ip_src, vni, vni_state, vpc_map, v2b } } } @@ -241,9 +247,13 @@ impl StaticAction for EncapAction { } }; - let (is_internal, phys_target) = match target { + // Map the router target to a physical network location. + // The router layer has already made the routing decision - we just + // execute it here by looking up the appropriate physical mapping. + let dst_ip = flow_id.dst_ip(); + let (is_internal, phys_target, is_mcast) = match target { RouterTargetInternal::InternetGateway(_) => { - match self.v2b.get(&flow_id.dst_ip()) { + match self.v2b.get(&dst_ip) { Some(phys) => { // Hash the packet onto a route target. This is a very // rudimentary mechanism. 
Should level-up to an ECMP @@ -260,16 +270,46 @@ impl StaticAction for EncapAction { ip: target.ip, vni: target.vni, }, + false, ) } None => return Ok(AllowOrDeny::Deny), } } - RouterTargetInternal::Ip(virt_ip) => match self.v2p.get(&virt_ip) { + // Multicast target - use M2P mapping to get the multicast underlay address. + // The router has determined this packet should be multicast forwarded. + RouterTargetInternal::Multicast(_) => { + // Fleet-level multicast mappings live under DEFAULT_MULTICAST_VNI. + // Look up the underlay multicast IPv6 for this group using the + // global VPC mappings and encapsulate with the fleet multicast VNI. + let mvni = Vni::new(DEFAULT_MULTICAST_VNI).unwrap(); + match self.vpc_map.get_mcast_underlay(mvni, dst_ip) { + Some(underlay) => ( + true, + PhysNet { + ether: underlay.dst_mac(), + ip: underlay.0, + vni: mvni, + }, + true, + ), + None => { + // No mapping configured for this group; deny. + return Ok(AllowOrDeny::Deny); + } + } + } + + RouterTargetInternal::Ip(virt_ip) => match self + .vni_state + .v2p + .get(&virt_ip) + { Some(phys) => ( true, PhysNet { ether: phys.ether, ip: phys.ip, vni: self.vni }, + false, ), // The router target has specified a VPC IP we do not @@ -290,7 +330,7 @@ impl StaticAction for EncapAction { }, RouterTargetInternal::VpcSubnet(_) => { - match self.v2p.get(&flow_id.dst_ip()) { + match self.vni_state.v2p.get(&flow_id.dst_ip()) { Some(phys) => ( true, PhysNet { @@ -298,6 +338,7 @@ impl StaticAction for EncapAction { ip: phys.ip, vni: self.vni, }, + false, ), // The guest is attempting to contact a VPC IP we @@ -330,25 +371,54 @@ impl StaticAction for EncapAction { data: Cow::Borrowed(GENEVE_MSS_SIZE_OPT_BODY), }; + // For multicast originated from this host, we set External replication. + // The actual replication scope will be determined by the mcast_fwd table. 
+ // The first byte encodes Replication in the top 2 bits: + // External=0x00, Underlay=0x40, All=0x80, Reserved=0xC0 + const REPLICATION_EXTERNAL_BYTE: u8 = + (Replication::External as u8) << 6; + static GENEVE_MCAST_OPT_BODY: &[u8] = &[ + REPLICATION_EXTERNAL_BYTE, // Top 2 bits encode replication strategy + 0x00, + 0x00, + 0x00, // Reserved bytes + ]; + static GENEVE_MCAST_OPT: ArbitraryGeneveOption = + ArbitraryGeneveOption { + option_class: GENEVE_OPT_CLASS_OXIDE, + option_type: OxideOptionType::Multicast as u8, + data: Cow::Borrowed(GENEVE_MCAST_OPT_BODY), + }; + + let outer_mac = + if is_mcast { phys_target.ether } else { MacAddr::ZERO }; + let tfrm = HdrTransform { name: ENCAP_NAME.to_string(), // We leave the outer src/dst up to the driver. + // In the multicast case we can, however, derive this. outer_ether: HeaderAction::Push( Valid::validated(EtherMeta { + dst: outer_mac, src: MacAddr::ZERO, - dst: MacAddr::ZERO, ether_type: EtherType::Ipv6, }) .expect("Ethernet validation is infallible"), ), - outer_ip: HeaderAction::Push(Valid::validated(IpPush::from( - Ipv6Push { + outer_ip: HeaderAction::Push({ + let ip_push = IpPush::from(Ipv6Push { src: self.phys_ip_src, dst: phys_target.ip, proto: Protocol::UDP, exts: Cow::Borrowed(&[]), - }, - ))?), + }); + match Valid::validated(ip_push) { + Ok(v) => v, + Err(e) => { + return Err(e.into()); + } + } + }), // XXX Geneve uses the UDP source port as a flow label // value for the purposes of ECMP -- a hash of the // 5-tuple. However, when using Geneve in IPv6 one could @@ -369,30 +439,47 @@ impl StaticAction for EncapAction { EncapPush::from(GenevePush { vni: phys_target.vni, entropy: flow_id.crc32() as u16, - // Allocate space in which we can include the TCP MSS, when - // needed during MSS boosting. 
It's theoretically doable to - // gate this on seeing an unexpectedly high/low MSS option - // in the TCP handshake, but there are problems in doing so: - // * The MSS for the flow is negotiated, but the UFT entry - // containing this transform does not know the other side. - // * UFT invalidation means we may rerun this transform in - // the middle of a flow. - // So, emit it unconditionally for VPC-internal TCP traffic, - // which could need the original MSS to be carried when LSO - // is in use. - options: if pkt_meta.is_inner_tcp() && is_internal { - Cow::Borrowed(core::slice::from_ref( + options: match ( + pkt_meta.is_inner_tcp() && is_internal, + is_mcast, + ) { + // Allocate space in which we can include the TCP MSS, when + // needed during MSS boosting. It's theoretically doable to + // gate this on seeing an unexpectedly high/low MSS option + // in the TCP handshake, but there are problems in doing so: + // * The MSS for the flow is negotiated, but the UFT entry + // containing this transform does not know the other side. + // * UFT invalidation means we may rerun this transform in + // the middle of a flow. + // So, emit it unconditionally for VPC-internal TCP traffic, + // which could need the original MSS to be carried when LSO + // is in use. + (true, false) => Cow::Borrowed(core::slice::from_ref( &GENEVE_MSS_SIZE_OPT, - )) - } else { - Cow::Borrowed(&[]) + )), + (false, true) => Cow::Borrowed(core::slice::from_ref( + &GENEVE_MCAST_OPT, + )), + (false, false) => Cow::Borrowed(&[]), + // We do not support TCP over multicast delivery. + // Multicast replication semantics conflict with TCP's + // connection/ordering guarantees, so deny this case. + (true, true) => { + return Ok(AllowOrDeny::Deny); + } }, }), )?), - inner_ether: HeaderAction::Modify(EtherMod { - dst: Some(phys_target.ether), - ..Default::default() - }), + // For multicast packets, the inner destination MAC should already + // correspond to the inner L3 destination address. 
+ inner_ether: if is_mcast { + HeaderAction::Ignore + } else { + HeaderAction::Modify(EtherMod { + dst: Some(phys_target.ether), + ..Default::default() + }) + }, ..Default::default() }; @@ -483,31 +570,27 @@ impl StaticAction for DecapAction { } pub struct VpcMappings { - inner: KMutex>>, + inner: KMutex>>, } impl VpcMappings { + /// Generate a new mapping struct. + pub fn new() -> Self { + Self { inner: KMutex::new(BTreeMap::new()) } + } + /// Add a new mapping from VIP to [`PhysNet`], returning a pointer /// to the [`Virt2Phys`] this mapping belongs to. - pub fn add(&self, vip: IpAddr, phys: PhysNet) -> Arc { + pub fn add(&self, vip: IpAddr, phys: PhysNet) -> Arc { // We convert to GuestPhysAddr because it saves us from // redundant storage of the VNI. let guest_phys = GuestPhysAddr::from(phys); let mut lock = self.inner.lock(); - match lock.get(&phys.vni) { - Some(v2p) => { - v2p.set(vip, guest_phys); - v2p.clone() - } + let state = lock.entry(phys.vni).or_default(); + state.v2p.set(vip, guest_phys); - None => { - let v2p = Arc::new(Virt2Phys::new()); - v2p.set(vip, guest_phys); - lock.insert(phys.vni, v2p.clone()); - v2p - } - } + state.clone() } /// Delete the mapping for the given VIP in the given VNI. @@ -515,7 +598,7 @@ impl VpcMappings { /// Return the existing entry, if there is one. 
pub fn del(&self, vip: &IpAddr, phys: &PhysNet) -> Option { match self.inner.lock().get(&phys.vni) { - Some(v2p) => v2p.remove(vip).map(|guest_phys| PhysNet { + Some(state) => state.v2p.remove(vip).map(|guest_phys| PhysNet { ether: guest_phys.ether, ip: guest_phys.ip, vni: phys.vni, @@ -530,11 +613,13 @@ impl VpcMappings { let mut mappings = Vec::new(); let lock = self.inner.lock(); - for (vni, v2p) in lock.iter() { + for (vni, state) in lock.iter() { mappings.push(VpcMapResp { vni: *vni, - ip4: v2p.dump_ip4(), - ip6: v2p.dump_ip6(), + ip4: state.v2p.dump_ip4(), + ip6: state.v2p.dump_ip6(), + mcast_ip4: state.m2p.dump_ip4(), + mcast_ip6: state.m2p.dump_ip6(), }); } @@ -548,8 +633,8 @@ impl VpcMappings { /// assumption is enforced by the control plane; making sure that /// peered VPCs do not overlap their VIP ranges. pub fn ip_to_vni(&self, vip: &IpAddr) -> Option { - for (vni, v2p) in self.inner.lock().iter() { - if v2p.get(vip).is_some() { + for (vni, state) in self.inner.lock().iter() { + if state.v2p.get(vip).is_some() { return Some(*vni); } } @@ -557,8 +642,62 @@ impl VpcMappings { None } - pub fn new() -> Self { - VpcMappings { inner: KMutex::new(BTreeMap::new()) } + /// Add a multicast forwarding entry from a multicast group IP to a physical + /// underlay IP. 
+    ///
+    /// Returns an error if:
+    /// - The VNI is not DEFAULT_MULTICAST_VNI
+    /// - The underlay address is not an administratively-scoped IPv6 multicast address
+    pub fn add_mcast(
+        &self,
+        group: IpAddr,
+        underlay: Ipv6Addr,
+        vni: Vni,
+    ) -> Result<Arc<PerVniMaps>, OpteError> {
+        // Validate VNI is DEFAULT_MULTICAST_VNI for fleet-level multicast
+        if vni.as_u32() != DEFAULT_MULTICAST_VNI {
+            return Err(OpteError::System {
+                errno: illumos_sys_hdrs::EINVAL,
+                msg: format!(
+                    "multicast VNI must be DEFAULT_MULTICAST_VNI ({DEFAULT_MULTICAST_VNI}), got: {}",
+                    vni.as_u32()
+                ),
+            });
+        }
+
+        // Validate before locking: a rejected request must not create a VNI entry.
+        let mcast_underlay = MulticastUnderlay::new(underlay).ok_or_else(|| {
+            OpteError::InvalidUnderlayMulticast(format!(
+                "underlay address must be an administratively-scoped multicast address \
+                 (scope 0x4/admin-local, 0x5/site-local, or 0x8/organization-local): {underlay}",
+            ))
+        })?;
+
+        let mut lock = self.inner.lock();
+        let state = lock.entry(vni).or_default();
+        state.m2p.set(group, mcast_underlay);
+        Ok(state.clone())
+    }
+
+    /// Delete a multicast forwarding entry.
+    pub fn del_mcast(&self, group: IpAddr, _underlay: Ipv6Addr, vni: Vni) {
+        let mut lock = self.inner.lock();
+        if let Some(state) = lock.get_mut(&vni) {
+            state.m2p.remove(&group);
+        }
+    }
+
+    /// Get the underlay multicast for a given VNI and overlay multicast group.
+    pub fn get_mcast_underlay(
+        &self,
+        vni: Vni,
+        group: IpAddr,
+    ) -> Option<MulticastUnderlay> {
+        let lock = self.inner.lock();
+        lock.get(&vni).and_then(|state| match group {
+            IpAddr::Ip4(ip4) => state.m2p.ip4.lock().get(&ip4).copied(),
+            IpAddr::Ip6(ip6) => state.m2p.ip6.lock().get(&ip6).copied(),
+        })
     }
 }
@@ -568,6 +707,10 @@ impl Default for VpcMappings {
     }
 }
 
+// XXX: Should these not be RwLocks? This is a really unfortunate degree of
+// contention for multiple ports in the slowpath to block one another.
+// (Not common by any means, but needless when it does occur!)
+
 /// A mapping from virtual IPs to physical location.
pub struct Virt2Phys { // XXX We need to implement some sort of invalidation mechanism @@ -606,6 +749,29 @@ pub struct Virt2Boundary { pt6: KRwLock>>, } +// NOTE: This is structurally similar to V2P mapping, but maps to MulticastUnderlay +// which wraps only an IPv6 address. The destination MAC is derived algorithmically +// from the IPv6 multicast address rather than stored explicitly. +/// A mapping from inner multicast destination IPs to underlay multicast groups. +/// +/// Validation is enforced through the `MulticastUnderlay` newtype wrapper, which +/// ensures only valid IPv6 multicast addresses can be stored. +pub struct Mcast2Phys { + ip4: KMutex>, + ip6: KMutex>, +} + +/// Per-VNI mapping state containing both unicast and multicast address mappings. +/// +/// This struct holds all address-to-physical mappings organized by VNI: +/// - `v2p`: Unicast virtual IPs to physical locations +/// - `m2p`: Multicast group IPs to physical underlay addresses +#[derive(Default)] +pub struct PerVniMaps { + pub v2p: Virt2Phys, + pub m2p: Mcast2Phys, +} + pub const TUNNEL_ENDPOINT_MAC: [u8; 6] = [0xA8, 0x40, 0x25, 0x77, 0x77, 0x77]; impl Virt2Boundary { @@ -828,3 +994,88 @@ impl MappingResource for Virt2Phys { } } } + +impl Mcast2Phys { + /// Create a new empty multicast-to-physical mapping table. + pub fn new() -> Self { + Self { + ip4: KMutex::new(BTreeMap::new()), + ip6: KMutex::new(BTreeMap::new()), + } + } + + /// Dump all IPv4 overlay multicast group to underlay IPv6 multicast mappings. + pub fn dump_ip4(&self) -> Vec<(Ipv4Addr, Ipv6Addr)> { + self.ip4.lock().iter().map(|(vip, mcast)| (*vip, mcast.0)).collect() + } + + /// Dump all IPv6 overlay multicast group to underlay IPv6 multicast mappings. 
+ pub fn dump_ip6(&self) -> Vec<(Ipv6Addr, Ipv6Addr)> { + self.ip6.lock().iter().map(|(vip, mcast)| (*vip, mcast.0)).collect() + } +} + +impl Default for Mcast2Phys { + fn default() -> Self { + Self::new() + } +} + +/// An overlay multicast group address mapped to the underlay (outer) IPv6 multicast address. +/// +/// This type ensures that the wrapped IPv6 address is a valid multicast address +/// with administrative scope (admin-local, site-local, or organization-local). +/// +/// Administrative scopes per RFC 4291 and RFC 7346: +/// - `0x4`: admin-local scope +/// - `0x5`: site-local scope +/// - `0x8`: organization-local scope +#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)] +pub struct MulticastUnderlay(Ipv6Addr); + +impl MulticastUnderlay { + /// Create a new `MulticastUnderlay` if the address is a valid + /// administratively-scoped multicast IPv6 address (scope 0x4, 0x5, or 0x8). + pub fn new(addr: Ipv6Addr) -> Option { + if addr.is_admin_scoped_multicast() { Some(Self(addr)) } else { None } + } + + /// Return the underlying IPv6 multicast address. + pub fn addr(&self) -> Ipv6Addr { + self.0 + } + + /// Return the destination MAC address derived from the IPv6 multicast address. 
+ fn dst_mac(&self) -> MacAddr { + self.0.unchecked_multicast_mac() + } +} + +impl Resource for Mcast2Phys {} +impl ResourceEntry for MulticastUnderlay {} + +impl MappingResource for Mcast2Phys { + type Key = IpAddr; + type Entry = MulticastUnderlay; + + fn get(&self, vip: &Self::Key) -> Option { + match vip { + IpAddr::Ip4(ip4) => self.ip4.lock().get(ip4).cloned(), + IpAddr::Ip6(ip6) => self.ip6.lock().get(ip6).cloned(), + } + } + + fn remove(&self, vip: &Self::Key) -> Option { + match vip { + IpAddr::Ip4(ip4) => self.ip4.lock().remove(ip4), + IpAddr::Ip6(ip6) => self.ip6.lock().remove(ip6), + } + } + + fn set(&self, vip: Self::Key, mcast: Self::Entry) -> Option { + match vip { + IpAddr::Ip4(ip4) => self.ip4.lock().insert(ip4, mcast), + IpAddr::Ip6(ip6) => self.ip6.lock().insert(ip6, mcast), + } + } +} diff --git a/lib/oxide-vpc/src/engine/router.rs b/lib/oxide-vpc/src/engine/router.rs index cabe96e5..11263c63 100644 --- a/lib/oxide-vpc/src/engine/router.rs +++ b/lib/oxide-vpc/src/engine/router.rs @@ -65,6 +65,7 @@ pub enum RouterTargetInternal { InternetGateway(Option), Ip(IpAddr), VpcSubnet(IpCidr), + Multicast(IpCidr), } impl RouterTargetInternal { @@ -86,6 +87,7 @@ impl RouterTargetInternal { } RouterTargetInternal::Ip(_) => RouterTargetClass::Ip, RouterTargetInternal::VpcSubnet(_) => RouterTargetClass::VpcSubnet, + RouterTargetInternal::Multicast(_) => RouterTargetClass::Multicast, } } } @@ -117,6 +119,16 @@ impl ActionMetaValue for RouterTargetInternal { Ok(Self::VpcSubnet(IpCidr::Ip6(cidr6))) } + Some(("mcast4", cidr4_s)) => { + let cidr4 = cidr4_s.parse::()?; + Ok(Self::Multicast(IpCidr::Ip4(cidr4))) + } + + Some(("mcast6", cidr6_s)) => { + let cidr6 = cidr6_s.parse::()?; + Ok(Self::Multicast(IpCidr::Ip6(cidr6))) + } + Some(("ig", ig)) => { let ig = ig.parse::().map_err(|e| e.to_string())?; Ok(Self::InternetGateway(Some(ig))) @@ -141,6 +153,12 @@ impl ActionMetaValue for RouterTargetInternal { Self::VpcSubnet(IpCidr::Ip6(cidr6)) => { 
format!("sub6={cidr6}").into() } + Self::Multicast(IpCidr::Ip4(mcast4)) => { + format!("mcast4={mcast4}").into() + } + Self::Multicast(IpCidr::Ip6(mcast6)) => { + format!("mcast6={mcast6}").into() + } } } } @@ -151,6 +169,7 @@ impl fmt::Display for RouterTargetInternal { Self::InternetGateway(addr) => format!("IG({addr:?})"), Self::Ip(addr) => format!("IP: {addr}"), Self::VpcSubnet(sub) => format!("Subnet: {sub}"), + Self::Multicast(mcast) => format!("Multicast: {mcast}"), }; write!(f, "{s}") } @@ -161,6 +180,7 @@ pub enum RouterTargetClass { InternetGateway, Ip, VpcSubnet, + Multicast, } impl ActionMetaValue for RouterTargetClass { @@ -171,6 +191,7 @@ impl ActionMetaValue for RouterTargetClass { "ig" => Ok(Self::InternetGateway), "ip" => Ok(Self::Ip), "subnet" => Ok(Self::VpcSubnet), + "mcast" => Ok(Self::Multicast), _ => Err(format!("bad router target class: {s}")), } } @@ -180,6 +201,7 @@ impl ActionMetaValue for RouterTargetClass { Self::InternetGateway => "ig".into(), Self::Ip => "ip".into(), Self::VpcSubnet => "subnet".into(), + Self::Multicast => "mcast".into(), } } } @@ -190,6 +212,7 @@ impl fmt::Display for RouterTargetClass { Self::InternetGateway => write!(f, "IG"), Self::Ip => write!(f, "IP"), Self::VpcSubnet => write!(f, "Subnet"), + Self::Multicast => write!(f, "Multicast"), } } } @@ -278,6 +301,8 @@ fn valid_router_dest_target_pair(dest: &IpCidr, target: &RouterTarget) -> bool { (_, RouterTarget::Drop) | // Internet gateways are valid for any IP family. 
(_, RouterTarget::InternetGateway(_)) | + // Multicast targets are valid for any IP family + (_, RouterTarget::Multicast(_)) | // IPv4 destination, IPv4 address (IpCidr::Ip4(_), RouterTarget::Ip(IpAddr::Ip4(_))) | // IPv4 destination, IPv4 subnet @@ -362,6 +387,22 @@ fn make_rule( ))); (predicate, action) } + + RouterTarget::Multicast(mcast) => { + let predicate = match dest { + IpCidr::Ip4(ip4) => { + Predicate::InnerDstIp4(vec![Ipv4AddrMatch::Prefix(ip4)]) + } + + IpCidr::Ip6(ip6) => { + Predicate::InnerDstIp6(vec![Ipv6AddrMatch::Prefix(ip6)]) + } + }; + let action = Action::Meta(Arc::new(RouterAction::new( + RouterTargetInternal::Multicast(mcast), + ))); + (predicate, action) + } }; let priority = compute_rule_priority(&dest, class); diff --git a/lib/oxide-vpc/src/print.rs b/lib/oxide-vpc/src/print.rs index c6a46ef3..f69a8b4c 100644 --- a/lib/oxide-vpc/src/print.rs +++ b/lib/oxide-vpc/src/print.rs @@ -9,6 +9,7 @@ //! This is mostly just a place to hang printing routines so that they //! can be used by both opteadm and integration tests. +use crate::api::DumpMcastForwardingResp; use crate::api::DumpVirt2BoundaryResp; use crate::api::DumpVirt2PhysResp; use crate::api::GuestPhysAddr; @@ -135,3 +136,38 @@ fn print_v2p_ip6( std::net::Ipv6Addr::from(phys.ip.bytes()), ) } + +/// Print the header for the [`print_mcast_fwd()`] output. +fn print_mcast_fwd_header(t: &mut impl Write) -> std::io::Result<()> { + writeln!(t, "GROUP IP\tUNDERLAY IP\tVNI\tREPLICATION") +} + +/// Print a [`DumpMcastForwardingResp`]. +pub fn print_mcast_fwd(resp: &DumpMcastForwardingResp) -> std::io::Result<()> { + print_mcast_fwd_into(&mut std::io::stdout(), resp) +} + +/// Print a [`DumpMcastForwardingResp`] into a given writer. 
+pub fn print_mcast_fwd_into( + writer: &mut impl Write, + resp: &DumpMcastForwardingResp, +) -> std::io::Result<()> { + let mut t = TabWriter::new(writer); + writeln!(t, "Multicast Forwarding Table")?; + write_hrb(&mut t)?; + writeln!(t)?; + print_mcast_fwd_header(&mut t)?; + write_hr(&mut t)?; + + for entry in &resp.entries { + for (next_hop, replication) in &entry.next_hops { + writeln!( + t, + "{}\t{}\t{}\t{replication:?}", + entry.group, next_hop.addr, next_hop.vni + )?; + } + } + writeln!(t)?; + t.flush() +} diff --git a/lib/oxide-vpc/tests/integration_tests.rs b/lib/oxide-vpc/tests/integration_tests.rs index fe3454d6..57a1d541 100644 --- a/lib/oxide-vpc/tests/integration_tests.rs +++ b/lib/oxide-vpc/tests/integration_tests.rs @@ -36,6 +36,7 @@ use opte::engine::ip::v4::Ipv4Addr; use opte::engine::ip::v4::Ipv4Ref; use opte::engine::ip::v4::ValidIpv4; use opte::engine::ip::v6::Ipv6; +use opte::engine::ip::v6::Ipv6Addr; use opte::engine::ip::v6::Ipv6Ref; use opte::engine::ip::v6::ValidIpv6; use opte::engine::packet::InnerFlowId; @@ -43,10 +44,14 @@ use opte::engine::packet::MblkFullParsed; use opte::engine::packet::MismatchError; use opte::engine::packet::Packet; use opte::engine::parse::ValidUlp; +use opte::engine::port::DropReason; use opte::engine::port::ProcessError; +use opte::engine::port::ProcessResult; use opte::engine::tcp::TIME_WAIT_EXPIRE_SECS; +use opte::ingot::ethernet::Ethertype; use opte::ingot::geneve::GeneveRef; use opte::ingot::icmp::IcmpV6Ref; +use opte::ingot::ip::IpProtocol; use opte::ingot::tcp::TcpRef; use opte::ingot::types::Emit; use opte::ingot::types::HeaderLen; @@ -59,6 +64,7 @@ use oxide_vpc::api::ExternalIpCfg; use oxide_vpc::api::FirewallRule; use oxide_vpc::api::RouterClass; use oxide_vpc::api::VpcCfg; +use oxide_vpc::engine::geneve; use pcap::*; use smoltcp::phy::ChecksumCapabilities as CsumCapab; use smoltcp::wire::Icmpv4Packet; @@ -4678,7 +4684,7 @@ fn icmp_inner_has_nat_applied() { header: smoltcp::wire::Ipv4Repr { src_addr: 
remote_addr.into(), dst_addr: g1_cfg.ipv4().private_ip.into(), - next_header: IpProtocol::Udp, + next_header: smoltcp::wire::IpProtocol::Udp, payload_len: 256, hop_limit: 0, }, @@ -4747,7 +4753,7 @@ fn icmpv6_inner_has_nat_applied() { header: smoltcp::wire::Ipv6Repr { src_addr: eph_ip.into(), dst_addr: remote_addr.into(), - next_header: IpProtocol::Udp, + next_header: smoltcp::wire::IpProtocol::Udp, // Unimportant -- header is truncated. payload_len: 256, hop_limit: 255, @@ -4811,3 +4817,338 @@ fn icmpv6_inner_has_nat_applied() { let (v6, ..) = ValidIpv6::parse(body).unwrap(); assert_eq!(v6.source(), g1_cfg.ipv6().private_ip); } + +// Test that IPv6 multicast packets get encapsulated with Geneve +#[test] +fn test_ipv6_multicast_encapsulation() { + let g1_cfg = g1_cfg(); + let mut g1 = oxide_net_setup("g1_port", &g1_cfg, None, None); + + // Create an IPv6 multicast packet (ff04::1:3 - admin-local multicast) + let mcast_dst = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x03, + ]); + + // Create a multicast underlay address (must be multicast for forwarding) + let mcast_underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0xff, 0xff, + ]); + + // Add multicast forwarding entry BEFORE starting the port + let mcast_vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI).unwrap(); + g1.vpc_map.add_mcast(mcast_dst.into(), mcast_underlay, mcast_vni).unwrap(); + + g1.port.start(); + set!(g1, "port_state=running"); + + // Add router entry for IPv6 multicast traffic (ff00::/8) via Multicast target + router::add_entry( + &g1.port, + IpCidr::Ip6("ff00::/8".parse().unwrap()), + RouterTarget::Multicast(IpCidr::Ip6("ff00::/8".parse().unwrap())), + RouterClass::System, + ) + .unwrap(); + incr!(g1, ["epoch", "router.rules.out"]); + + // Build a UDP packet to the multicast address + // (TCP + multicast is incompatible and would be denied) + let eth = 
Ethernet { + destination: MacAddr::from([0x33, 0x33, 0x00, 0x01, 0x00, 0x03]), + source: g1_cfg.guest_mac, + ethertype: Ethertype::IPV6, + }; + let ip = Ipv6 { + source: g1_cfg.ipv6().private_ip, + destination: mcast_dst, + next_header: IpProtocol::UDP, + payload_len: (Udp::MINIMUM_LENGTH) as u16, + hop_limit: 64, + ..Default::default() + }; + let udp = Udp { + source: 12345, + destination: 5353, // mDNS port as an example multicast UDP service + length: Udp::MINIMUM_LENGTH as u16, + ..Default::default() + }; + let mut pkt_m = ulp_pkt(eth, ip, udp, &[]); + + let pkt = parse_outbound(&mut pkt_m, GenericUlp {}).unwrap(); + let res = g1.port.process(Out, pkt); + + // Verify packet was encapsulated + let Ok(Modified(spec)) = res else { + panic!("Expected Modified result, got {res:?}"); + }; + let mut pkt_m = spec.apply(pkt_m); + + // Parse the encapsulated packet as inbound (it's now on the wire with Geneve) + let parsed = Packet::parse_inbound(pkt_m.iter_mut(), VpcParser {}).unwrap(); + let meta = parsed.meta(); + + // Verify the outer IPv6 destination is the multicast underlay address + assert_eq!( + meta.outer_v6.destination(), + mcast_underlay, + "Outer IPv6 destination should be multicast underlay address" + ); + + // Verify the outer IPv6 source is the physical IP of the guest + assert_eq!( + meta.outer_v6.source(), + g1_cfg.phys_ip, + "Outer IPv6 source should be the physical IP" + ); + + // Verify the outer Ethernet destination MAC is the IPv6 multicast MAC + // For IPv6 multicast, MAC is 33:33:xx:xx:xx:xx where xx:xx:xx:xx are the last 4 bytes of the IPv6 address + let expected_outer_mac = mcast_underlay.multicast_mac().unwrap(); + assert_eq!( + meta.outer_eth.destination(), + expected_outer_mac, + "Outer Ethernet MAC should be IPv6 multicast MAC" + ); + + // Verify we have Geneve encapsulation with the correct VNI (fleet multicast VNI) + assert_eq!( + meta.outer_encap.vni(), + mcast_vni, + "Geneve VNI should match DEFAULT_MULTICAST_VNI" + ); + + // Verify the 
Geneve multicast option is present with External replication + let replication = geneve::extract_multicast_replication(&meta.outer_encap) + .expect("Geneve packet should have multicast option"); + assert_eq!( + replication, + oxide_vpc::api::Replication::External, + "Multicast option should have External replication" + ); +} + +// Test that TCP + multicast packets are denied (TCP is incompatible with multicast) +#[test] +fn test_tcp_multicast_denied() { + let g1_cfg = g1_cfg(); + let mut g1 = oxide_net_setup("g1_port", &g1_cfg, None, None); + + // Create an IPv6 multicast address + let mcast_dst = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x03, + ]); + + let mcast_underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0xff, 0xff, + ]); + + let mcast_vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI).unwrap(); + g1.vpc_map.add_mcast(mcast_dst.into(), mcast_underlay, mcast_vni).unwrap(); + + g1.port.start(); + set!(g1, "port_state=running"); + + router::add_entry( + &g1.port, + IpCidr::Ip6("ff00::/8".parse().unwrap()), + RouterTarget::Multicast(IpCidr::Ip6("ff00::/8".parse().unwrap())), + RouterClass::System, + ) + .unwrap(); + incr!(g1, ["epoch", "router.rules.out"]); + + // Build a TCP packet to the multicast address (should be denied) + let mut pkt_m = http_syn3( + g1_cfg.guest_mac, + g1_cfg.ipv6().private_ip, + MacAddr::from([0x33, 0x33, 0x00, 0x01, 0x00, 0x03]), + mcast_dst, + 12345, + 80, + ); + + let pkt = parse_outbound(&mut pkt_m, GenericUlp {}).unwrap(); + let res = g1.port.process(Out, pkt); + + // Verify packet was denied (TCP + multicast is incompatible) + match res { + Ok(Hairpin(_)) => panic!("Expected packet to be denied, got Hairpin"), + Ok(Modified(_)) => panic!("Expected packet to be denied, got Modified"), + Ok(ProcessResult::Drop { reason: DropReason::Layer { .. 
} }) => { + // Expected - TCP + multicast is denied by overlay layer + } + other => panic!("Expected Drop with Layer reason, got: {:?}", other), + } +} + +// Ensure packets with unknown critical Geneve options are rejected during +// option validation (fail-closed on unrecognised critical options). +#[test] +fn test_drop_on_unknown_critical_option() { + // Build Ethernet + IPv6 (with no extensions) + UDP + Geneve header + // carrying a single unknown critical option (class=0xffff, type=0x80, len=0). + // Minimal inner Ethernet + IPv4 + UDP follows to satisfy the parser. + let mut buf: Vec = Vec::new(); + + // Ethernet (14B) + buf.extend_from_slice(&[ + 0x33, 0x33, 0x00, 0x00, 0x00, 0x01, // dst + 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, // src + 0x86, 0xdd, // ethertype IPv6 + ]); + + // IPv6 header (40B) + // ver/tc/fl, payload_len, next_header=UDP(17), hop_limit + // payload_len = UDP length (we'll compute) + let ip6_hdr_pos = buf.len(); + buf.extend_from_slice(&[ + 0x60, 0x00, 0x00, 0x00, // ver+tc+fl + 0x00, 0x00, // payload length (placeholder) + 0x11, // next header UDP + 0x40, // hop limit + // src + 0xfd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, // dst + 0xff, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x03, + ]); + + // UDP header (8B) + let udp_pos = buf.len(); + buf.extend_from_slice(&[ + 0x1e, 0x61, // source port + 0x17, 0xc1, // dest 6081 + 0x00, 0x00, // length (placeholder) + 0x00, 0x00, // checksum + ]); + + // Geneve header (8B): ver+optlen=1 (4B option header), flags=critical opts + buf.extend_from_slice(&[ + 0x01, // ver=0, optlen=1 word (4B option header) + 0x40, // flags: critical options present + 0x65, 0x58, // protocol type 0x6558 + 0x00, 0x00, 0x00, 0x00, // VNI=0, reserved + ]); + // Unknown critical option: class=0xffff, type=0x80 (critical), len=0 + buf.extend_from_slice(&[ + 0xff, 0xff, // class + 0x80, // critical + type + 0x00, // rsvd+len=0 + 
]); + // No body (len=0) + + // Minimal inner Ethernet + IPv4 + UDP (to satisfy inner parse) + buf.extend_from_slice(&[ + // inner Ethernet + 0x00, 0x16, 0x3e, 0x00, 0x00, 0x02, 0x00, 0x16, 0x3e, 0x00, 0x00, 0x01, + 0x08, 0x00, // IPv4 + // inner IPv4 (20B) + 0x45, 0x00, 0x00, 0x1c, 0x00, 0x01, 0x00, 0x00, 0x11, 0x00, 0x0a, 0x00, + 0x00, 0x01, 0x0a, 0x00, 0x00, 0x02, // src=10.0.0.1, dst=10.0.0.2 + // inner UDP (8B) + 0x12, 0x34, 0x13, 0x37, 0x00, 0x08, 0x00, 0x00, + ]); + + // Compute UDP length and IPv6 payload length + let udp_len = (buf.len() - udp_pos) as u16; + buf[udp_pos + 4] = (udp_len >> 8) as u8; + buf[udp_pos + 5] = (udp_len & 0xff) as u8; + + let ip6_payload_len = (buf.len() - (ip6_hdr_pos + 40)) as u16; + buf[ip6_hdr_pos + 4] = (ip6_payload_len >> 8) as u8; + buf[ip6_hdr_pos + 5] = (ip6_payload_len & 0xff) as u8; + + // Parse Geneve directly from the UDP payload (skip L2/L3) and validate options + let geneve_offset = 14 /*eth*/ + 40 /*ipv6*/ + 8 /*udp*/; + let (geneve, _, _) = + opte::ingot::geneve::ValidGeneve::parse(&buf[geneve_offset..]) + .expect("parse geneve header"); + assert!(matches!( + geneve::validate_options(&geneve), + Err(opte::engine::packet::ParseError::UnrecognisedTunnelOpt { .. }) + )); +} + +// Ensure Geneve parsing works correctly when an IPv6 extension header is present +// before UDP (e.g., Hop-by-Hop). Verifies that option walking is positioned at +// the correct Geneve offset. 
+#[test] +fn test_v6_ext_hdr_geneve_offset_ok() { + let mut buf: Vec = Vec::new(); + + // Ethernet + buf.extend_from_slice(&[ + 0x33, 0x33, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, + 0x86, 0xdd, + ]); + + // IPv6 header (Next Header = Hop-by-Hop (0)) + let ip6_hdr_pos = buf.len(); + buf.extend_from_slice(&[ + 0x60, 0x00, 0x00, 0x00, 0x00, + 0x00, // payload length (placeholder) + 0x00, // next header: Hop-by-Hop + 0x40, // hop limit + // src + 0xfd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, // dst + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0xff, 0xff, + ]); + + // Hop-by-Hop extension header (8B) -> next header UDP (17), hdr ext len=0 + buf.extend_from_slice(&[0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]); + + // UDP header (8B) + let udp_pos = buf.len(); + buf.extend_from_slice(&[ + 0x1e, 0x61, // source + 0x17, 0xc1, // dest 6081 + 0x00, 0x00, // length (placeholder) + 0x00, 0x00, // checksum + ]); + + // Geneve header (8B): ver+optlen=2 (8B option area), flags=0 + buf.extend_from_slice(&[0x02, 0x00, 0x65, 0x58, 0x00, 0x00, 0x00, 0x00]); + // Multicast option: class=0x0129, type=0x01, len=1; body=4B with External + buf.extend_from_slice(&[ + 0x01, + 0x29, + 0x01, + 0x01, // class, type, rsvd+len + (oxide_vpc::api::Replication::External as u8) << 6, + 0x00, + 0x00, + 0x00, + ]); + + // Minimal inner Ethernet + IPv4 + UDP + buf.extend_from_slice(&[ + 0x00, 0x16, 0x3e, 0x00, 0x00, 0x02, 0x00, 0x16, 0x3e, 0x00, 0x00, 0x01, + 0x08, 0x00, 0x45, 0x00, 0x00, 0x1c, 0x00, 0x01, 0x00, 0x00, 0x11, 0x00, + 0x0a, 0x00, 0x00, 0x01, 0x0a, 0x00, 0x00, 0x02, 0x12, 0x34, 0x13, 0x37, + 0x00, 0x08, 0x00, 0x00, + ]); + + // Set UDP and IPv6 payload lengths + let udp_len = (buf.len() - udp_pos) as u16; + buf[udp_pos + 4] = (udp_len >> 8) as u8; + buf[udp_pos + 5] = (udp_len & 0xff) as u8; + + let ip6_payload_len = (buf.len() - (ip6_hdr_pos + 40)) as u16; + 
buf[ip6_hdr_pos + 4] = (ip6_payload_len >> 8) as u8; + buf[ip6_hdr_pos + 5] = (ip6_payload_len & 0xff) as u8; + + // Parse Geneve directly after IPv6 ext header and UDP, then check multicast option + let geneve_offset = 14 /*eth*/ + 40 /*ipv6*/ + 8 /*hop-by-hop*/ + 8 /*udp*/; + let (geneve, _, _) = + opte::ingot::geneve::ValidGeneve::parse(&buf[geneve_offset..]) + .expect("parse geneve header after ext hdr"); + let repl = geneve::extract_multicast_replication(&geneve) + .expect("multicast option present"); + assert_eq!(repl, oxide_vpc::api::Replication::External); +} diff --git a/xde-tests/Cargo.toml b/xde-tests/Cargo.toml index 84e0d5bd..6ca3dc3a 100644 --- a/xde-tests/Cargo.toml +++ b/xde-tests/Cargo.toml @@ -8,6 +8,7 @@ repository.workspace = true [dependencies] opte-ioctl.workspace = true +opte-test-utils.workspace = true oxide-vpc.workspace = true anyhow.workspace = true diff --git a/xde-tests/src/lib.rs b/xde-tests/src/lib.rs index 2fd8a634..d2908fe3 100644 --- a/xde-tests/src/lib.rs +++ b/xde-tests/src/lib.rs @@ -5,10 +5,15 @@ // Copyright 2025 Oxide Computer Company use anyhow::Result; +use anyhow::anyhow; +use anyhow::bail; use opte_ioctl::OpteHdl; use oxide_vpc::api::AddFwRuleReq; use oxide_vpc::api::AddRouterEntryReq; use oxide_vpc::api::Address; +use oxide_vpc::api::ClearMcast2PhysReq; +use oxide_vpc::api::ClearMcastForwardingReq; +use oxide_vpc::api::DEFAULT_MULTICAST_VNI; use oxide_vpc::api::DhcpCfg; use oxide_vpc::api::Direction; use oxide_vpc::api::ExternalIpCfg; @@ -21,27 +26,41 @@ use oxide_vpc::api::IpCidr; use oxide_vpc::api::Ipv4Addr; use oxide_vpc::api::Ipv4Cfg; use oxide_vpc::api::Ipv6Addr; +use oxide_vpc::api::Ipv6Cfg; use oxide_vpc::api::MacAddr; +use oxide_vpc::api::McastSubscribeReq; +use oxide_vpc::api::McastUnsubscribeReq; use oxide_vpc::api::PhysNet; use oxide_vpc::api::Ports; use oxide_vpc::api::RouterClass; use oxide_vpc::api::RouterTarget; use oxide_vpc::api::SNat4Cfg; +use oxide_vpc::api::SNat6Cfg; +use 
oxide_vpc::api::SetMcast2PhysReq; +use oxide_vpc::api::SetMcastForwardingReq; use oxide_vpc::api::SetVirt2PhysReq; use oxide_vpc::api::Vni; use oxide_vpc::api::VpcCfg; use rand::Rng; +use std::cell::RefCell; use std::collections::HashSet; +use std::process::Child; use std::process::Command; +use std::process::Stdio; use std::sync::Arc; use std::time::Duration; +use std::time::Instant; use zone::Zlogin; pub use ztest::*; -/// The overlay network used in all tests. +/// The IPv4 overlay network used in all tests. pub const OVERLAY_NET: &str = "10.0.0.0/24"; -/// The overlay OPTE gateway used in all tests. +/// The IPv4 overlay OPTE gateway used in all tests. pub const OVERLAY_GW: &str = "10.0.0.254"; +/// The IPv6 overlay network used in all tests. +pub const OVERLAY_NET_V6: &str = "fd00::/64"; +/// The IPv6 overlay OPTE gateway used in all tests. +pub const OVERLAY_GW_V6: &str = "fd00::254"; /// This is a wrapper around the ztest::Zone object that encapsulates common /// logic needed for running the OPTE tests zones used in this test suite. @@ -58,15 +77,48 @@ impl OpteZone { Ok(Self { zone }) } - /// Wait for the network to come up, then set up the overlay network. + /// Wait for the network to come up, then set up the IPv4 overlay network. fn setup(&self, devname: &str, addr: String) -> Result<()> { self.zone.wait_for_network()?; + // Configure IPv4 via DHCP self.zone - .zexec(&format!("ipadm create-addr -t -T dhcp {}/test", devname))?; + .zexec(&format!("ipadm create-addr -t -T dhcp {devname}/test"))?; + self.zone.zexec(&format!("route add -iface {OVERLAY_GW} {addr}"))?; + self.zone.zexec(&format!("route add {OVERLAY_NET} {OVERLAY_GW}"))?; + Ok(()) + } + + /// Wait for the network to come up, then set up dual-stack (IPv4 + IPv6) overlay network. 
+ fn setup_dualstack( + &self, + devname: &str, + ipv4_addr: String, + ipv6_addr: String, + ) -> Result<()> { + self.zone.wait_for_network()?; + // Configure IPv4 via DHCP self.zone - .zexec(&format!("route add -iface {} {}", OVERLAY_GW, addr))?; + .zexec(&format!("ipadm create-addr -t -T dhcp {devname}/testv4"))?; self.zone - .zexec(&format!("route add {} {}", OVERLAY_NET, OVERLAY_GW))?; + .zexec(&format!("route add -iface {OVERLAY_GW} {ipv4_addr}"))?; + self.zone.zexec(&format!("route add {OVERLAY_NET} {OVERLAY_GW}"))?; + + // Configure IPv6 with static address + // Use addrconf first to enable IPv6 on the interface, then add static address + self.zone.zexec(&format!( + "ipadm create-addr -t -T addrconf {devname}/addrconf" + ))?; + // Small delay to let addrconf initialize + std::thread::sleep(Duration::from_millis(500)); + self.zone.zexec(&format!( + "ipadm create-addr -t -T static -a {ipv6_addr}/64 {devname}/testv6" + ))?; + self.zone.zexec(&format!( + "route add -inet6 -iface {OVERLAY_GW_V6} {ipv6_addr}" + ))?; + self.zone.zexec(&format!( + "route add -inet6 {OVERLAY_NET_V6} {OVERLAY_GW_V6}" + ))?; Ok(()) } } @@ -77,6 +129,7 @@ impl OpteZone { pub struct OptePort { name: String, cfg: VpcCfg, + mcast_subscriptions: RefCell>, } impl OptePort { @@ -106,12 +159,67 @@ impl OptePort { }), guest_mac: guest_mac.parse().unwrap(), gateway_mac: "a8:40:25:00:00:01".parse().unwrap(), - vni: Vni::new(1701u32).unwrap(), + vni: Vni::new(DEFAULT_MULTICAST_VNI).unwrap(), + phys_ip: phys_ip.parse().unwrap(), + }; + let adm = OpteHdl::open()?; + adm.create_xde(name, cfg.clone(), DhcpCfg::default(), false)?; + Ok(OptePort { + name: name.into(), + cfg, + mcast_subscriptions: RefCell::new(Vec::new()), + }) + } + + /// Create a new OPTE port with dual-stack (IPv4 + IPv6) support. 
+ pub fn new_dualstack( + name: &str, + private_ip_v4: &str, + private_ip_v6: &str, + guest_mac: &str, + phys_ip: &str, + ) -> Result { + let cfg = VpcCfg { + ip_cfg: IpCfg::DualStack { + ipv4: Ipv4Cfg { + vpc_subnet: OVERLAY_NET.parse().unwrap(), + private_ip: private_ip_v4.parse().unwrap(), + gateway_ip: OVERLAY_GW.parse().unwrap(), + external_ips: ExternalIpCfg { + snat: Some(SNat4Cfg { + external_ip: "1.2.3.4".parse().unwrap(), + ports: 1000..=2000, + }), + ephemeral_ip: None, + floating_ips: vec![], + }, + }, + ipv6: Ipv6Cfg { + vpc_subnet: OVERLAY_NET_V6.parse().unwrap(), + private_ip: private_ip_v6.parse().unwrap(), + gateway_ip: OVERLAY_GW_V6.parse().unwrap(), + external_ips: ExternalIpCfg { + snat: Some(SNat6Cfg { + external_ip: "2001:db8::1".parse().unwrap(), + ports: 4097..=8192, + }), + ephemeral_ip: None, + floating_ips: vec![], + }, + }, + }, + guest_mac: guest_mac.parse().unwrap(), + gateway_mac: "a8:40:25:00:00:01".parse().unwrap(), + vni: Vni::new(DEFAULT_MULTICAST_VNI).unwrap(), phys_ip: phys_ip.parse().unwrap(), }; let adm = OpteHdl::open()?; adm.create_xde(name, cfg.clone(), DhcpCfg::default(), false)?; - Ok(OptePort { name: name.into(), cfg }) + Ok(OptePort { + name: name.into(), + cfg, + mcast_subscriptions: RefCell::new(Vec::new()), + }) } /// Add an overlay routing entry to this port. @@ -150,11 +258,20 @@ impl OptePort { self.cfg.guest_mac.bytes() } - /// Return the guest IP address as a string. + /// Return the guest IPv4 address as a string. pub fn ip(&self) -> String { match &self.cfg.ip_cfg { IpCfg::Ipv4(cfg) => cfg.private_ip.to_string(), - _ => panic!("expected ipv4 guest"), + IpCfg::DualStack { ipv4, .. } => ipv4.private_ip.to_string(), + _ => panic!("expected ipv4 or dualstack guest"), + } + } + + /// Return the guest IPv6 address as a string (for dual-stack ports). + pub fn ipv6(&self) -> Option { + match &self.cfg.ip_cfg { + IpCfg::DualStack { ipv6, .. 
} => Some(ipv6.private_ip.to_string()), + _ => None, } } @@ -162,6 +279,53 @@ impl OptePort { pub fn underlay_ip(&self) -> std::net::Ipv6Addr { self.cfg.phys_ip.into() } + + /// Return the port name. + pub fn name(&self) -> &str { + &self.name + } + + /// Subscribe this port to a multicast group. + /// Automatically tracks the subscription for cleanup on drop. + pub fn subscribe_multicast(&self, group: IpAddr) -> Result<()> { + let adm = OpteHdl::open()?; + adm.mcast_subscribe(&McastSubscribeReq { + port_name: self.name.clone(), + group, + })?; + self.mcast_subscriptions.borrow_mut().push(group); + Ok(()) + } + + /// Unsubscribe this port from a multicast group. + pub fn unsubscribe_multicast(&self, group: IpAddr) -> Result<()> { + let adm = OpteHdl::open()?; + adm.mcast_unsubscribe(&McastUnsubscribeReq { + port_name: self.name.clone(), + group, + })?; + self.mcast_subscriptions.borrow_mut().retain(|g| *g != group); + Ok(()) + } + + /// Add a multicast router entry for this port. + pub fn add_multicast_router_entry(&self, cidr: IpCidr) -> Result<()> { + let adm = OpteHdl::open()?; + adm.add_router_entry(&AddRouterEntryReq { + port_name: self.name.clone(), + dest: cidr, + target: RouterTarget::Multicast(cidr), + class: RouterClass::System, + })?; + Ok(()) + } + + /// Allow multicast CIDR through the overlay firewall for the given direction. 
+ pub fn allow_cidr(&self, cidr: IpCidr, direction: Direction) -> Result<()> { + let adm = OpteHdl::open()?; + adm.allow_cidr(&self.name, cidr, direction)?; + Ok(()) + } } impl Drop for OptePort { @@ -174,8 +338,23 @@ impl Drop for OptePort { return; } }; + + // Clean up multicast subscriptions + let subscriptions = self.mcast_subscriptions.borrow().clone(); + for group in subscriptions { + if let Err(e) = adm.mcast_unsubscribe(&McastUnsubscribeReq { + port_name: self.name.clone(), + group, + }) { + let name = &self.name; + eprintln!( + "failed to unsubscribe {name} from multicast group {group}: {e}" + ); + } + } + if let Err(e) = adm.delete_xde(&self.name) { - eprintln!("failed to delete xde on drop: {}", e); + eprintln!("failed to delete xde on drop: {e}"); } } } @@ -202,26 +381,143 @@ impl Xde { phys: PhysNet { ether: ether.parse().unwrap(), ip: ip.parse().unwrap(), - vni: Vni::new(1701u32).unwrap(), + vni: Vni::new(DEFAULT_MULTICAST_VNI).unwrap(), }, })?; Ok(()) } } impl Drop for Xde { - /// When this object is dropped, remove the xde kernel module from the - /// underlying system. fn drop(&mut self) { - // The module can no longer be successfully removed until the underlay - // has been cleared. This may not have been done, so this is fallible. + // Clear underlay to release references to simnet/vnic devices, + // allowing their cleanup to proceed. Driver remains loaded. if let Ok(adm) = OpteHdl::open() { let _ = adm.clear_xde_underlay(); } + } +} + +/// Helper to run `snoop` and ensure it doesn't outlive the test. +/// +/// This avoids leaked `snoop` processes pinning DLPI devices (causing EBUSY) +/// when tests time out. +pub struct SnoopGuard { + child: Option, +} + +impl SnoopGuard { + /// Start a `snoop` capture on `dev_name` with the provided BPF-like `filter`. + /// Captures a single packet (`-c 1`) and dumps hex output (`-x0`). + /// Uses `-r` to disable name resolution for deterministic numeric output. 
+ pub fn start(dev_name: &str, filter: &str) -> anyhow::Result { + let child = Command::new("pfexec") + .args(&[ + "snoop", "-r", "-d", dev_name, "-c", "1", "-P", "-x0", filter, + ]) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn()?; + Ok(Self { child: Some(child) }) + } + + /// Wait for completion with a timeout. Returns stdout if successful. + pub fn wait_with_timeout( + &mut self, + timeout: Duration, + ) -> anyhow::Result { + let deadline = Instant::now() + timeout; + + loop { + let child = self.child.as_mut().expect("child already taken"); + match child.try_wait()? { + Some(_status) => { + // Child exited; collect output. + let child = self.child.take().expect("child already taken"); + return Ok(child.wait_with_output()?); + } + None => { + if Instant::now() >= deadline { + // Timed out; kill snoop so it doesn't hold interfaces open. + let _ = child.kill(); + let _ = child.wait(); + bail!("snoop capture timed out"); + } + std::thread::sleep(Duration::from_millis(50)); + } + } + } + } +} + +impl Drop for SnoopGuard { + fn drop(&mut self) { + if let Some(child) = &mut self.child { + if let Ok(None) = child.try_wait() { + let _ = child.kill(); + let _ = child.wait(); + } + } + } +} + +/// Global multicast group state that cleans up M2P mappings and forwarding +/// entries on drop. Port-specific subscriptions are handled automatically by +/// OptePort::drop(). +/// +/// Use this to set up multicast groups in tests. Port subscriptions should use +/// `port.subscribe_multicast(group)` which tracks cleanup automatically. +pub struct MulticastGroup { + pub group: IpAddr, + pub underlay: Ipv6Addr, + pub vni: Vni, +} + +impl MulticastGroup { + pub fn new(group: IpAddr, underlay: Ipv6Addr, vni: Vni) -> Result { + let hdl = OpteHdl::open()?; + hdl.set_m2p(&SetMcast2PhysReq { group, underlay, vni })?; + Ok(Self { group, underlay, vni }) + } + + /// Set multicast forwarding entries for this group. 
+ pub fn set_forwarding( + &self, + next_hops: Vec<( + oxide_vpc::api::NextHopV6, + oxide_vpc::api::Replication, + )>, + ) -> Result<()> { + let hdl = OpteHdl::open()?; + hdl.set_mcast_fwd(&SetMcastForwardingReq { + group: self.group, + next_hops, + })?; + Ok(()) + } +} + +impl Drop for MulticastGroup { + fn drop(&mut self) { + let Ok(hdl) = OpteHdl::open() else { + eprintln!("failed to open xde device for multicast cleanup"); + return; + }; + + // Clear forwarding entry + let group = self.group; + if let Err(e) = + hdl.clear_mcast_fwd(&ClearMcastForwardingReq { group: self.group }) + { + eprintln!("failed to clear multicast forwarding for {group}: {e}"); + } - let mut cmd = Command::new("pfexec"); - cmd.args(["rem_drv", "xde"]); - if let Err(e) = cmd.output() { - eprintln!("failed to remove xde driver: {}", e); + // Clear M2P mapping + if let Err(e) = hdl.clear_m2p(&ClearMcast2PhysReq { + group: self.group, + underlay: self.underlay, + vni: self.vni, + }) { + eprintln!("failed to clear M2P mapping for {group}: {e}"); } } } @@ -244,6 +540,9 @@ impl TestNode { /// A topology of local zones interconnected with simlinks over /// an OPTE dataplane. // Note: these fields have a *very* sensitive drop order. +// Rust drops fields in declaration order. Zones must drop FIRST (to release +// references to network devices), then network infrastructure can clean up. +// Drop order: nodes -> null_ports -> v6_routes -> xde -> lls -> vnics -> simnet -> zfs pub struct Topology { pub nodes: Vec, pub null_ports: Vec, @@ -288,6 +587,14 @@ pub struct Topology { /// sanity checker to make sure basic opte/xde functionality is working - and /// that we're not hitting things like debug asserts in the OS. pub fn two_node_topology(brand: &str) -> Result { + two_node_topology_named(brand, "a", "b") +} + +pub fn two_node_topology_named( + brand: &str, + zone_a_name: &str, + zone_b_name: &str, +) -> Result { // Create the "underlay loopback". 
With simnet device pairs, any packet that // goes in one is forwarded to the other. In the topology depicted above, // this means that anything vopte0 sends, will be encapsulated onto the @@ -349,29 +656,198 @@ pub fn two_node_topology(brand: &str) -> Result { let zfs = Arc::new(Zfs::new("opte2node")?); // Create a pair of zones to simulate our VM instances. - println!("start zone a"); - let a = OpteZone::new("a", &zfs, &[&opte0.name], brand)?; - println!("start zone b"); - let b = OpteZone::new("b", &zfs, &[&opte1.name], brand)?; + println!("start zone {zone_a_name}"); + let a = OpteZone::new(zone_a_name, &zfs, &[&opte0.name], brand)?; + println!("start zone {zone_b_name}"); + let b = OpteZone::new(zone_b_name, &zfs, &[&opte1.name], brand)?; - println!("setup zone a"); + println!("setup zone {zone_a_name}"); a.setup(&opte0.name, opte0.ip())?; - println!("setup zone b"); + println!("setup zone {zone_b_name}"); b.setup(&opte1.name, opte1.ip())?; Ok(Topology { + nodes: vec![ + TestNode { zone: a, port: opte0 }, + TestNode { zone: b, port: opte1 }, + ], + null_ports: vec![], + v6_routes: vec![r0, r1], xde, lls: vec![ll0, ll1], vnics: vec![vn0, vn1], simnet: Some(sim), + zfs, + }) +} + +pub fn two_node_topology_dualstack(brand: &str) -> Result { + two_node_topology_dualstack_named(brand, "a", "b") +} + +pub fn two_node_topology_dualstack_named( + brand: &str, + zone_a_name: &str, + zone_b_name: &str, +) -> Result { + let sim = SimnetLink::new("xde_test_sim0", "xde_test_sim1")?; + let vn0 = Vnic::new("xde_test_vnic0", &sim.end_a)?; + let vn1 = Vnic::new("xde_test_vnic1", &sim.end_b)?; + let ll0 = LinkLocal::new(&vn0.name, "ll")?; + let ll1 = LinkLocal::new(&vn1.name, "ll")?; + + Xde::set_xde_underlay(&vn0.name, &vn1.name)?; + let xde = Xde {}; + + // Set up v2p mappings (same as IPv4-only version) + Xde::set_v2p("10.0.0.1", "a8:40:25:ff:00:01", "fd44::1")?; + Xde::set_v2p("10.0.0.2", "a8:40:25:ff:00:02", "fd77::1")?; + + // Create dual-stack OPTE ports + let opte0 = 
OptePort::new_dualstack( + "opte0", + "10.0.0.1", + "fd00::1", + "a8:40:25:ff:00:01", + "fd44::1", + )?; + opte0.add_router_entry("10.0.0.2")?; + opte0.fw_allow_all()?; + + println!("adding underlay route 0"); + let r0 = + RouteV6::new(opte0.underlay_ip(), 64, ll0.ip, Some(vn1.name.clone()))?; + + let opte1 = OptePort::new_dualstack( + "opte1", + "10.0.0.2", + "fd00::2", + "a8:40:25:ff:00:02", + "fd77::1", + )?; + opte1.add_router_entry("10.0.0.1")?; + opte1.fw_allow_all()?; + + println!("adding underlay route 1"); + let r1 = + RouteV6::new(opte1.underlay_ip(), 64, ll1.ip, Some(vn0.name.clone()))?; + + let zfs = Arc::new(Zfs::new("opte2node")?); + + println!("start zone {zone_a_name}"); + let a = OpteZone::new(zone_a_name, &zfs, &[&opte0.name], brand)?; + println!("start zone {zone_b_name}"); + let b = OpteZone::new(zone_b_name, &zfs, &[&opte1.name], brand)?; + + println!("setup zone {zone_a_name}"); + a.setup_dualstack(&opte0.name, opte0.ip(), "fd00::1".to_string())?; + + println!("setup zone {zone_b_name}"); + b.setup_dualstack(&opte1.name, opte1.ip(), "fd00::2".to_string())?; + + Ok(Topology { nodes: vec![ TestNode { zone: a, port: opte0 }, TestNode { zone: b, port: opte1 }, ], + null_ports: vec![], v6_routes: vec![r0, r1], + xde, + lls: vec![ll0, ll1], + vnics: vec![vn0, vn1], + simnet: Some(sim), zfs, + }) +} + +pub fn three_node_topology(brand: &str) -> Result { + three_node_topology_named(brand, "a", "b", "c") +} + +pub fn three_node_topology_named( + brand: &str, + zone_a_name: &str, + zone_b_name: &str, + zone_c_name: &str, +) -> Result { + // Create three-node topology for testing multicast fanout + let sim = SimnetLink::new("xde_test_sim0", "xde_test_sim1")?; + let vn0 = Vnic::new("xde_test_vnic0", &sim.end_a)?; + let vn1 = Vnic::new("xde_test_vnic1", &sim.end_b)?; + let ll0 = LinkLocal::new(&vn0.name, "ll")?; + let ll1 = LinkLocal::new(&vn1.name, "ll")?; + + Xde::set_xde_underlay(&vn0.name, &vn1.name)?; + let xde = Xde {}; + + // Set up V2P mappings for 
three nodes + Xde::set_v2p("10.0.0.1", "a8:40:25:ff:00:01", "fd44::1")?; + Xde::set_v2p("10.0.0.2", "a8:40:25:ff:00:02", "fd77::1")?; + Xde::set_v2p("10.0.0.3", "a8:40:25:ff:00:03", "fd88::1")?; + + // Create three OPTE ports + let opte0 = + OptePort::new("opte0", "10.0.0.1", "a8:40:25:ff:00:01", "fd44::1")?; + opte0.add_router_entry("10.0.0.2")?; + opte0.add_router_entry("10.0.0.3")?; + opte0.fw_allow_all()?; + + let opte1 = + OptePort::new("opte1", "10.0.0.2", "a8:40:25:ff:00:02", "fd77::1")?; + opte1.add_router_entry("10.0.0.1")?; + opte1.add_router_entry("10.0.0.3")?; + opte1.fw_allow_all()?; + + let opte2 = + OptePort::new("opte2", "10.0.0.3", "a8:40:25:ff:00:03", "fd88::1")?; + opte2.add_router_entry("10.0.0.1")?; + opte2.add_router_entry("10.0.0.2")?; + opte2.fw_allow_all()?; + + println!("adding underlay route 0"); + let r0 = + RouteV6::new(opte0.underlay_ip(), 64, ll0.ip, Some(vn1.name.clone()))?; + + println!("adding underlay route 1"); + let r1 = + RouteV6::new(opte1.underlay_ip(), 64, ll1.ip, Some(vn0.name.clone()))?; + + println!("adding underlay route 2"); + let r2 = + RouteV6::new(opte2.underlay_ip(), 64, ll1.ip, Some(vn0.name.clone()))?; + + let zfs = Arc::new(Zfs::new("opte3node")?); + + println!("start zone {zone_a_name}"); + let a = OpteZone::new(zone_a_name, &zfs, &[&opte0.name], brand)?; + println!("start zone {zone_b_name}"); + let b = OpteZone::new(zone_b_name, &zfs, &[&opte1.name], brand)?; + println!("start zone {zone_c_name}"); + let c = OpteZone::new(zone_c_name, &zfs, &[&opte2.name], brand)?; + + println!("setup zone {zone_a_name}"); + a.setup(&opte0.name, opte0.ip())?; + + println!("setup zone {zone_b_name}"); + b.setup(&opte1.name, opte1.ip())?; + + println!("setup zone {zone_c_name}"); + c.setup(&opte2.name, opte2.ip())?; + + Ok(Topology { + nodes: vec![ + TestNode { zone: a, port: opte0 }, + TestNode { zone: b, port: opte1 }, + TestNode { zone: c, port: opte2 }, + ], null_ports: vec![], + v6_routes: vec![r0, r1, r2], + xde, + lls: 
vec![ll0, ll1], + vnics: vec![vn0, vn1], + simnet: Some(sim), + zfs, }) } @@ -410,16 +886,16 @@ pub fn get_linklocal_addr(link_name: &str) -> Result { let text = std::str::from_utf8(&out.stdout)?; if !out.status.success() || text.lines().count() == 1 { - anyhow::bail!("could not find address {target_addr}"); + bail!("could not find address {target_addr}"); } let mut maybe_addr = text .lines() .nth(1) - .ok_or(anyhow::anyhow!("expected to find entry line for IP"))? + .ok_or(anyhow!("expected to find entry line for IP"))? .split_whitespace() .last() - .ok_or(anyhow::anyhow!("expected to find column for IP"))?; + .ok_or(anyhow!("expected to find column for IP"))?; // remove iface qualifier on link-local addr. if maybe_addr.contains('%') { @@ -443,7 +919,7 @@ pub fn single_node_over_real_nic( let max_macs = (1 << 20) - peers.len() - 1; if null_port_count > max_macs as u32 { - anyhow::bail!( + bail!( "Cannot allocate {null_port_count} ports: \ Oxide MAC space admits {max_macs} accounting for peers" ); @@ -522,13 +998,13 @@ pub fn single_node_over_real_nic( a.setup(&opte.name, opte.ip())?; Ok(Topology { + nodes: vec![TestNode { zone: a, port: opte }], + null_ports, + v6_routes, xde, lls: vec![], vnics: vec![], simnet: None, - nodes: vec![TestNode { zone: a, port: opte }], - null_ports, - v6_routes, zfs, }) } diff --git a/xde-tests/tests/loopback.rs b/xde-tests/tests/loopback.rs index c64990a8..4ceb8b52 100644 --- a/xde-tests/tests/loopback.rs +++ b/xde-tests/tests/loopback.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
-// Copyright 2024 Oxide Computer Company +// Copyright 2025 Oxide Computer Company use anyhow::Result; diff --git a/xde-tests/tests/multicast_multi_sub.rs b/xde-tests/tests/multicast_multi_sub.rs new file mode 100644 index 00000000..64f586d9 --- /dev/null +++ b/xde-tests/tests/multicast_multi_sub.rs @@ -0,0 +1,363 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2025 Oxide Computer Company + +//! XDE multicast multiple subscriber tests. + +use anyhow::Context; +use anyhow::Result; +use opte_ioctl::OpteHdl; +use opte_test_utils::geneve_verify; +use oxide_vpc::api::DEFAULT_MULTICAST_VNI; +use oxide_vpc::api::IpCidr; +use oxide_vpc::api::Ipv4Addr; +use oxide_vpc::api::Ipv6Addr; +use oxide_vpc::api::NextHopV6; +use oxide_vpc::api::Replication; +use oxide_vpc::api::Vni; +use std::time::Duration; +use xde_tests::MulticastGroup; +use xde_tests::SnoopGuard; + +#[test] +fn test_multicast_multiple_local_subscribers() -> Result<()> { + // Create 3-node topology to test local fanout + let topol = xde_tests::three_node_topology_named( + "omicron1", "mlsa", "mlsb", "mlsc", + )?; + + // IPv4 multicast group: 224.1.2.3 + let mcast_group = Ipv4Addr::from([224, 1, 2, 3]); + const MCAST_PORT: u16 = 9999; + let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; + + // M2P mapping - use admin-scoped IPv6 multicast per Omicron constraints + let mcast_underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 3, + ]); + + // Set up multicast state with automatic cleanup on drop + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay, vni)?; + + // Node B's underlay address for forwarding + let node_b_underlay = Ipv6Addr::from([ + 0xfd, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, + ]); + + // Set up multicast 
forwarding with External replication + // This will deliver to all local subscribers in the same VNI + mcast.set_forwarding(vec![( + NextHopV6::new(node_b_underlay, vni), + Replication::External, + )])?; + + // Allow IPv4 multicast traffic via Multicast target and subscribe to the group + let mcast_cidr = IpCidr::Ip4("224.0.0.0/4".parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + node.port.subscribe_multicast(mcast_group.into())?; + } + + // Start snoops on nodes B and C using SnoopGuard + let dev_name_b = topol.nodes[1].port.name().to_string(); + let dev_name_c = topol.nodes[2].port.name().to_string(); + let filter = format!("udp and ip dst {mcast_group} and port {MCAST_PORT}"); + + let mut snoop_b = SnoopGuard::start(&dev_name_b, &filter)?; + let mut snoop_c = SnoopGuard::start(&dev_name_c, &filter)?; + + // Also snoop underlay to verify NO underlay forwarding with External mode + let underlay_dev = "xde_test_sim1"; + let mut snoop_underlay = + SnoopGuard::start(underlay_dev, "ip6 and udp port 6081")?; + + // Send multicast packet from node A + let payload = "fanout test"; + let send_cmd = + format!("echo '{payload}' | nc -u -w1 {mcast_group} {MCAST_PORT}"); + topol.nodes[0] + .zone + .zone + .zexec(&send_cmd) + .context("Failed to send multicast UDP packet")?; + + // Wait for both snoops to capture packets + let snoop_output_b = snoop_b + .wait_with_timeout(Duration::from_secs(5)) + .context("Timeout waiting for snoop on node B")?; + let snoop_output_c = snoop_c + .wait_with_timeout(Duration::from_secs(5)) + .context("Timeout waiting for snoop on node C")?; + + // Verify both nodes received the packet + let stdout_b = String::from_utf8_lossy(&snoop_output_b.stdout); + assert!( + snoop_output_b.status.success() && stdout_b.contains("UDP"), + "Expected to capture multicast UDP packet on node B, snoop output:\n{stdout_b}" + ); + + let stdout_c = String::from_utf8_lossy(&snoop_output_c.stdout); + assert!( + 
snoop_output_c.status.success() && stdout_c.contains("UDP"), + "Expected to capture multicast UDP packet on node C, snoop output:\n{stdout_c}" + ); + + // Verify NO underlay forwarding (External mode = local-only) + if let Ok(output) = snoop_underlay.wait_with_timeout(Duration::from_secs(2)) + { + let stdout = String::from_utf8_lossy(&output.stdout); + panic!( + "External mode should NOT forward to underlay, but captured:\n{stdout}" + ); + } + + Ok(()) +} + +#[test] +fn test_multicast_underlay_replication() -> Result<()> { + // Create 2-node topology to test Underlay replication mode + let topol = xde_tests::two_node_topology_named("omicron1", "ura", "urb")?; + + // IPv4 multicast group + let mcast_group = Ipv4Addr::from([224, 1, 2, 4]); + const MCAST_PORT: u16 = 9999; + let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; + + // M2P mapping - use admin-scoped IPv6 multicast per Omicron constraints + let mcast_underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 4, + ]); + + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay, vni)?; + + // Debug: dump V2P/M2P mappings to verify M2P is set correctly + let hdl = OpteHdl::open()?; + let v2p_dump = hdl.dump_v2p()?; + println!("\n=== V2P/M2P Mappings ==="); + for vpc_map in &v2p_dump.mappings { + println!(" VNI {}: ", vpc_map.vni.as_u32()); + println!(" Unicast IPv4 mappings: {:?}", vpc_map.ip4); + println!(" Multicast IPv4 mappings: {:?}", vpc_map.mcast_ip4); + println!(" Multicast IPv6 mappings: {:?}", vpc_map.mcast_ip6); + } + println!("=== End V2P/M2P Mappings ===\n"); + + // Node B's underlay address + let node_b_underlay = Ipv6Addr::from([ + 0xfd, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, + ]); + + // Set up multicast forwarding with Underlay replication ONLY + // This should forward to underlay but NOT deliver to local ports + mcast.set_forwarding(vec![( + NextHopV6::new(node_b_underlay, vni), + 
Replication::Underlay, + )])?; + + // Allow IPv4 multicast traffic via Multicast target + // + // Note: We deliberately do NOT subscribe any nodes to verify that Underlay mode + // forwards to underlay regardless of local subscription state (zero subscribers) + let mcast_cidr = IpCidr::Ip4("224.0.0.0/4".parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + } + + // Add IPv6 multicast route for admin-scoped multicast (ff04::/16) + // This tells the kernel to route multicast packets through the underlay interface + let route_add_result = std::process::Command::new("pfexec") + .args(&[ + "route", + "add", + "-inet6", + "ff04::/16", + "-interface", + "xde_test_vnic0", + ]) + .output() + .context("Failed to add IPv6 multicast route")?; + if !route_add_result.status.success() { + println!( + "Warning: Failed to add IPv6 multicast route: {}", + String::from_utf8_lossy(&route_add_result.stderr) + ); + } + + // Start snoop on the UNDERLAY simnet device (not the OPTE port) + // to verify the packet is forwarded to the underlay + let underlay_dev = "xde_test_sim1"; // Underlay device + let mut snoop_underlay = + SnoopGuard::start(underlay_dev, "ip6 and udp port 6081")?; // Geneve port + + // Debug: dump forwarding table to verify configuration + let mfwd = hdl.dump_mcast_fwd()?; + println!("\n=== Multicast forwarding table (Underlay test) ==="); + for entry in &mfwd.entries { + println!( + " Group: {:?}, Next hops: {:?}", + entry.group, entry.next_hops + ); + } + + // Also snoop node B's OPTE port to verify NO local delivery with Underlay mode + let dev_name_b = topol.nodes[1].port.name().to_string(); + let filter_local = + format!("udp and ip dst {mcast_group} and port {MCAST_PORT}"); + let mut snoop_local = SnoopGuard::start(&dev_name_b, &filter_local)?; + + // Clear UFT right before sending to ensure fresh flow computation + hdl.clear_uft(topol.nodes[0].port.name())?; + + // Send multicast packet from node A + let payload = 
"underlay test"; + let send_cmd = + format!("echo '{payload}' | nc -u -w1 {mcast_group} {MCAST_PORT}"); + topol.nodes[0] + .zone + .zone + .zexec(&send_cmd) + .context("Failed to send multicast UDP packet")?; + + // Wait for snoop to capture the underlay packet + let snoop_output_underlay = snoop_underlay + .wait_with_timeout(Duration::from_secs(5)) + .context("Timeout waiting for snoop on underlay")?; + + // Verify packet was forwarded to underlay + let stdout_underlay = + String::from_utf8_lossy(&snoop_output_underlay.stdout); + + assert!( + snoop_output_underlay.status.success() + && stdout_underlay.contains("UDP"), + "Expected to capture Geneve packet on underlay, snoop output:\n{stdout_underlay}" + ); + + // Verify Geneve header fields (VNI, outer IPv6 dst, replication mode) + let hex_str = geneve_verify::extract_snoop_hex(&stdout_underlay) + .expect("Failed to extract hex from snoop output"); + + let packet_bytes = geneve_verify::parse_snoop_hex(&hex_str) + .expect("Failed to parse hex string"); + + let geneve_info = geneve_verify::parse_geneve_packet(&packet_bytes) + .expect("Failed to parse Geneve packet"); + + assert_eq!( + geneve_info.vni, vni, + "Geneve VNI should be DEFAULT_MULTICAST_VNI ({})", + DEFAULT_MULTICAST_VNI + ); + assert_eq!( + geneve_info.outer_ipv6_dst, mcast_underlay, + "Outer IPv6 dst should be multicast underlay address" + ); + assert_eq!( + geneve_info.replication, + Some(Replication::Underlay), + "Geneve replication mode should be Underlay" + ); + + // Verify NO local delivery (Underlay mode = remote-only) + if let Ok(output) = snoop_local.wait_with_timeout(Duration::from_secs(2)) { + let stdout = String::from_utf8_lossy(&output.stdout); + panic!( + "Underlay mode should NOT deliver locally, but captured:\n{stdout}" + ); + } + + Ok(()) +} + +#[test] +fn test_multicast_all_replication() -> Result<()> { + // Create 3-node topology to test All replication mode (bifurcated delivery) + let topol = + 
xde_tests::three_node_topology_named("omicron1", "ara", "arb", "arc")?; + + // IPv4 multicast group + let mcast_group = Ipv4Addr::from([224, 1, 2, 5]); + const MCAST_PORT: u16 = 9999; + let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; + + // M2P mapping - use admin-scoped IPv6 multicast per Omicron constraints + let mcast_underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 5, + ]); + + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay, vni)?; + + // Node B's underlay address for underlay forwarding + let node_b_underlay = Ipv6Addr::from([ + 0xfd, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, + ]); + + // Set up multicast forwarding with All replication + // This should deliver BOTH to local subscribers AND forward to underlay + mcast.set_forwarding(vec![( + NextHopV6::new(node_b_underlay, vni), + Replication::All, + )])?; + + // Allow IPv4 multicast traffic via Multicast target and subscribe to the group + let mcast_cidr = IpCidr::Ip4("224.0.0.0/4".parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + node.port.subscribe_multicast(mcast_group.into())?; + } + + // Start snoop on node B (local delivery) and underlay (underlay forwarding) + let dev_name_b = topol.nodes[1].port.name().to_string(); + let filter_local = + format!("udp and ip dst {mcast_group} and port {MCAST_PORT}"); + let mut snoop_local = SnoopGuard::start(&dev_name_b, &filter_local)?; + + let underlay_dev = "xde_test_sim1"; + let mut snoop_underlay = + SnoopGuard::start(underlay_dev, "ip6 and udp port 6081")?; + + // Send multicast packet from node A + let payload = "all replication test"; + let send_cmd = + format!("echo '{payload}' | nc -u -w1 {mcast_group} {MCAST_PORT}"); + topol.nodes[0] + .zone + .zone + .zexec(&send_cmd) + .context("Failed to send multicast UDP packet")?; + + // Wait for both snoops to capture packets + let 
snoop_output_local = snoop_local + .wait_with_timeout(Duration::from_secs(5)) + .context("Timeout waiting for local delivery snoop")?; + let snoop_output_underlay = snoop_underlay + .wait_with_timeout(Duration::from_secs(5)) + .context("Timeout waiting for underlay snoop")?; + + // Verify local delivery happened + let stdout_local = String::from_utf8_lossy(&snoop_output_local.stdout); + assert!( + snoop_output_local.status.success() && stdout_local.contains("UDP"), + "Expected local delivery to node B, snoop output:\n{stdout_local}" + ); + + // Verify underlay forwarding happened + let stdout_underlay = + String::from_utf8_lossy(&snoop_output_underlay.stdout); + assert!( + snoop_output_underlay.status.success() + && stdout_underlay.contains("UDP"), + "Expected underlay forwarding, snoop output:\n{stdout_underlay}" + ); + + Ok(()) +} diff --git a/xde-tests/tests/multicast_rx.rs b/xde-tests/tests/multicast_rx.rs new file mode 100644 index 00000000..f29d1697 --- /dev/null +++ b/xde-tests/tests/multicast_rx.rs @@ -0,0 +1,514 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2025 Oxide Computer Company + +//! XDE multicast RX-path tests. 
+ +use anyhow::Context; +use anyhow::Result; +use opte_ioctl::OpteHdl; +use oxide_vpc::api::Direction; +use oxide_vpc::api::IpCidr; +use oxide_vpc::api::Ipv4Addr; +use oxide_vpc::api::Ipv6Addr; +use oxide_vpc::api::NextHopV6; +use oxide_vpc::api::Replication; +use oxide_vpc::api::Vni; +use std::time::Duration; +use xde_tests::MulticastGroup; +use xde_tests::SnoopGuard; + +#[test] +fn test_xde_multicast_rx_ipv4() -> Result<()> { + // Create 2-node topology (IPv4 overlay: 10.0.0.0/24) + let topol = xde_tests::two_node_topology_named("omicron1", "rx4a", "rx4b")?; + + // IPv4 multicast group: 224.0.0.251 + let mcast_group = Ipv4Addr::from([224, 0, 0, 251]); + const MCAST_PORT: u16 = 9999; + let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + + // M2P mapping: overlay layer needs IPv6 multicast underlay address + // Use admin-scoped IPv6 multicast per Omicron's map_external_to_underlay_ip() + // Maps IPv4 multicast to ff04::/16 (admin-local scope) + IPv4 address + let mcast_underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 0, 0, 251, + ]); + + // Node B's underlay address - this is where we'll forward multicast packets + // From two_node_topology: node B (10.0.0.2) has underlay fd77::1 + let node_b_underlay = Ipv6Addr::from([ + 0xfd, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, + ]); + + // Set up multicast group with automatic cleanup on drop + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay, vni)?; + + // Set up multicast forwarding with External replication for unicast delivery. + // Maps overlay IPv4 multicast group -> underlay IPv6 unicast address of node B + mcast.set_forwarding(vec![( + NextHopV6::new(node_b_underlay, vni), + Replication::External, + )])?; + + // Allow IPv4 multicast traffic (224.0.0.0/4) via Multicast target. 
+ let mcast_cidr = IpCidr::Ip4("224.0.0.0/4".parse().unwrap()); + + // Allow outbound multicast traffic through the gateway layer + topol.nodes[0].port.allow_cidr(mcast_cidr, Direction::Out)?; + topol.nodes[1].port.allow_cidr(mcast_cidr, Direction::Out)?; + + // Add router entries for multicast + topol.nodes[0].port.add_multicast_router_entry(mcast_cidr)?; + topol.nodes[1].port.add_multicast_router_entry(mcast_cidr)?; + + // Subscribe both ports to the multicast group + topol.nodes[0].port.subscribe_multicast(mcast_group.into())?; + topol.nodes[1].port.subscribe_multicast(mcast_group.into())?; + + // Debug: dump multicast forwarding table + println!("\n=== Multicast forwarding table ==="); + let hdl = OpteHdl::open()?; + let mfwd = hdl.dump_mcast_fwd()?; + for entry in &mfwd.entries { + println!( + " Group: {:?}, Next hops: {:?}", + entry.group, entry.next_hops + ); + } + // Assert forwarding table contains expected next-hop + replication + let entry = mfwd + .entries + .iter() + .find(|e| e.group == mcast_group.into()) + .expect("missing multicast forwarding entry for group"); + assert!( + entry.next_hops.iter().any(|(nh, rep)| { + *rep == Replication::External + && nh.addr == node_b_underlay + && nh.vni == vni + }), + "expected External replication to {node_b_underlay:?} in forwarding table; got: {:?}", + entry.next_hops + ); + + // Start snoop using SnoopGuard to ensure cleanup + let dev_name_b = topol.nodes[1].port.name().to_string(); + let filter = format!("udp and ip dst {mcast_group} and port {MCAST_PORT}"); + let mut snoop = SnoopGuard::start(&dev_name_b, &filter)?; + + // Send UDP packet to the multicast address from zone A using netcat + // nc -u: IPv4 UDP mode + // -w1: timeout after 1 second + let payload = "multicast test"; + let send_cmd = + format!("echo '{payload}' | nc -u -w1 {mcast_group} {MCAST_PORT}"); + topol.nodes[0] + .zone + .zone + .zexec(&send_cmd) + .context("Failed to send multicast UDP packet")?; + + // Wait for snoop to capture the 
packet (or timeout) + let snoop_output = snoop + .wait_with_timeout(Duration::from_secs(5)) + .context("Timeout waiting for snoop to capture multicast packet")?; + + // Check that snoop successfully captured a packet and validate basics + let stdout = String::from_utf8_lossy(&snoop_output.stdout); + assert!( + snoop_output.status.success() && !stdout.is_empty(), + "Expected to capture multicast packet on {dev_name_b}, snoop output:\n{stdout}" + ); + // Protocol summary present + assert!( + stdout.contains("UDP"), + "expected UDP summary in snoop output:\n{stdout}" + ); + // Verify destination address appears in snoop output + // SnoopGuard uses -r flag, so we always get numeric addresses + assert!( + stdout.contains("224.0.0.251"), + "expected destination 224.0.0.251 in snoop output:\n{stdout}" + ); + // Payload present - check for substring in ASCII representation + // The full payload may wrap across lines, so just check for a distinctive part + assert!( + stdout.contains("ast test"), + "expected payload substring 'ast test' in ASCII portion of snoop output:\n{stdout}" + ); + // L2 dest: with current XDE/gateway pipeline, multicast RX to guests + // is delivered with broadcast dest MAC. snoop shows 16-bit grouped hex. 
+ assert!( + stdout.to_ascii_lowercase().contains("ffff ffff ffff"), + "expected L2 broadcast MAC 'ffff ffff ffff' in snoop output; got:\n{stdout}" + ); + + // Unsubscribe receiver and verify no further local delivery + topol.nodes[1].port.unsubscribe_multicast(mcast_group.into())?; + + let mut snoop2 = SnoopGuard::start(&dev_name_b, &filter)?; + let send_cmd2 = + format!("echo '{payload}' | nc -u -w1 {mcast_group} {MCAST_PORT}"); + topol.nodes[0] + .zone + .zone + .zexec(&send_cmd2) + .context("Failed to send multicast UDP packet (post-unsubscribe)")?; + let res = snoop2.wait_with_timeout(Duration::from_millis(800)); + match res { + Ok(out) => { + let stdout = String::from_utf8_lossy(&out.stdout); + panic!( + "expected no local delivery after unsubscribe; snoop output:\n{stdout}" + ); + } + Err(_) => {} + } + Ok(()) +} + +#[test] +fn test_xde_multicast_rx_ipv6() -> Result<()> { + // Create 2-node topology with dual-stack (IPv4 + IPv6) + let topol = xde_tests::two_node_topology_dualstack_named( + "omicron1", "rx6a", "rx6b", + )?; + + // IPv6 multicast group: ff05::1:3 (site-local, all-dhcp-agents) + let mcast_group = Ipv6Addr::from([ + 0xff, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x03, + ]); + const MCAST_PORT: u16 = 9999; + let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + + // M2P mapping: Map IPv6 multicast to admin-scoped underlay (ff04::/16) + // Per Omicron's map_external_to_underlay_ip(), convert ff05 -> ff04 + let mcast_underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x03, + ]); + + // Node B's underlay address + let node_b_underlay = Ipv6Addr::from([ + 0xfd, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, + ]); + + // Set up multicast group with automatic cleanup on drop + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay, vni)?; + + // Set up multicast forwarding 
with External replication for local delivery + mcast.set_forwarding(vec![( + NextHopV6::new(node_b_underlay, vni), + Replication::External, + )])?; + + // Allow IPv6 multicast traffic (ff05::/16 site-local) via Multicast target + let mcast_cidr = IpCidr::Ip6("ff05::/16".parse().unwrap()); + + // Add router entries for multicast + topol.nodes[0].port.add_multicast_router_entry(mcast_cidr)?; + topol.nodes[1].port.add_multicast_router_entry(mcast_cidr)?; + + // Subscribe both ports to the multicast group + topol.nodes[0].port.subscribe_multicast(mcast_group.into())?; + topol.nodes[1].port.subscribe_multicast(mcast_group.into())?; + + // Get node B's device name for snoop + let dev_name_b = topol.nodes[1].port.name().to_string(); + + // Start snoop using SnoopGuard to ensure cleanup + let filter = format!("udp and ip6 dst {mcast_group} and port {MCAST_PORT}"); + let mut snoop = SnoopGuard::start(&dev_name_b, &filter)?; + + // Send UDP packet to the multicast address from zone A using netcat + // nc -u -s <addr>: UDP mode, sent from the sender port's IPv6 address + // -w1: timeout after 1 second + let payload = "multicast test v6"; + let sender_v6 = topol.nodes[0] + .port + .ipv6() + .expect("dualstack port must have IPv6 address"); + // illumos netcat selects IPv6 based on the destination; avoid `-6` for compatibility. 
+ let send_cmd = format!( + "echo '{payload}' | nc -u -s {sender_v6} -w1 {mcast_group} {MCAST_PORT}" + ); + topol.nodes[0] + .zone + .zone + .zexec(&send_cmd) + .context("Failed to send IPv6 multicast UDP packet")?; + + // Wait for snoop to capture the packet (or timeout) + let snoop_output = + snoop.wait_with_timeout(Duration::from_secs(5)).context( + "Timeout waiting for snoop to capture IPv6 multicast packet", + )?; + + // Check that snoop successfully captured a packet + let stdout = String::from_utf8_lossy(&snoop_output.stdout); + assert!( + snoop_output.status.success() && !stdout.is_empty(), + "Expected to capture IPv6 multicast packet on {dev_name_b}, snoop output:\n{stdout}" + ); + + Ok(()) +} + +#[test] +fn test_reject_link_local_underlay_ff02() -> Result<()> { + let hdl = OpteHdl::open()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 99]); + let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + + let link_local_underlay = Ipv6Addr::from([ + 0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 99, + ]); + let result = hdl.set_m2p(&oxide_vpc::api::SetMcast2PhysReq { + group: mcast_group.into(), + underlay: link_local_underlay, + vni, + }); + assert!( + result.is_err(), + "Expected link-local underlay (ff02::) to be rejected" + ); + + Ok(()) +} + +#[test] +fn test_reject_global_underlay_ff0e() -> Result<()> { + let hdl = OpteHdl::open()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 99]); + let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + + let global_underlay = Ipv6Addr::from([ + 0xff, 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 99, + ]); + let result = hdl.set_m2p(&oxide_vpc::api::SetMcast2PhysReq { + group: mcast_group.into(), + underlay: global_underlay, + vni, + }); + assert!( + result.is_err(), + "Expected global underlay (ff0e::) to be rejected" + ); + + Ok(()) +} + +#[test] +fn test_accept_admin_local_underlay_ff04() -> Result<()> { + let hdl = 
OpteHdl::open()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 99]); + let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + + let admin_local = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 99, + ]); + let result = hdl.set_m2p(&oxide_vpc::api::SetMcast2PhysReq { + group: mcast_group.into(), + underlay: admin_local, + vni, + }); + assert!( + result.is_ok(), + "Expected admin-local underlay (ff04::) to be accepted" + ); + + Ok(()) +} + +#[test] +fn test_accept_site_local_underlay_ff05() -> Result<()> { + let hdl = OpteHdl::open()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 99]); + let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + + let site_local = Ipv6Addr::from([ + 0xff, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 99, + ]); + let result = hdl.set_m2p(&oxide_vpc::api::SetMcast2PhysReq { + group: mcast_group.into(), + underlay: site_local, + vni, + }); + assert!( + result.is_ok(), + "Expected site-local underlay (ff05::) to be accepted" + ); + + Ok(()) +} + +#[test] +fn test_accept_org_local_underlay_ff08() -> Result<()> { + let hdl = OpteHdl::open()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 99]); + let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + + let org_local = Ipv6Addr::from([ + 0xff, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 99, + ]); + let result = hdl.set_m2p(&oxide_vpc::api::SetMcast2PhysReq { + group: mcast_group.into(), + underlay: org_local, + vni, + }); + assert!( + result.is_ok(), + "Expected org-local underlay (ff08::) to be accepted" + ); + + Ok(()) +} + +#[test] +fn test_reject_wrong_vni() -> Result<()> { + let hdl = OpteHdl::open()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 100]); + let underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 100, + ]); + + let wrong_vni = Vni::new(1701u32)?; + let result = 
hdl.set_m2p(&oxide_vpc::api::SetMcast2PhysReq { + group: mcast_group.into(), + underlay, + vni: wrong_vni, + }); + assert!( + result.is_err(), + "Expected VNI 1701 to be rejected (must use DEFAULT_MULTICAST_VNI), got: {:?}", + result + ); + + Ok(()) +} + +#[test] +fn test_accept_default_multicast_vni() -> Result<()> { + let hdl = OpteHdl::open()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 100]); + let underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 100, + ]); + + let correct_vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + let result = hdl.set_m2p(&oxide_vpc::api::SetMcast2PhysReq { + group: mcast_group.into(), + underlay, + vni: correct_vni, + }); + assert!( + result.is_ok(), + "Expected DEFAULT_MULTICAST_VNI (77) to be accepted" + ); + + Ok(()) +} + +#[test] +fn test_multicast_rx_no_relay_loop() -> Result<()> { + // Test RX loop-prevention: packets arriving from underlay with + // Replication::Underlay should NOT be re-relayed back to underlay. + // This prevents infinite relay loops. 
+ + let topol = xde_tests::two_node_topology_named("omicron1", "lpa", "lpb")?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 200]); + let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + + let mcast_underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 200, + ]); + + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay, vni)?; + + let node_b_underlay = Ipv6Addr::from([ + 0xfd, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, + ]); + + // Set up forwarding with Underlay replication + mcast.set_forwarding(vec![( + NextHopV6::new(node_b_underlay, vni), + Replication::Underlay, + )])?; + + let mcast_cidr = IpCidr::Ip4("224.0.0.0/4".parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + node.port.subscribe_multicast(mcast_group.into())?; + } + + // Snoop the underlay to verify NO re-relay happens + let underlay_dev = "xde_test_sim1"; + let mut snoop_underlay = + SnoopGuard::start(underlay_dev, "ip6 and udp port 6081")?; + + // Simulate receiving a multicast packet FROM the underlay + // with Replication::Underlay already set (indicating it came from another host). + // Build a Geneve packet with the Underlay replication bit set. + let hdl = OpteHdl::open()?; + + // We need to inject a packet on the underlay that looks like it came from + // another host. Unfortunately, we can't easily inject raw packets in the test + // environment without significant plumbing. Instead, we verify the logic + // indirectly by checking that the dtrace probe shows the right behavior. + + // For now, document the expected behavior and add a TODO for full integration + // test once we have packet injection capability. 
+ println!("\n=== RX Loop Prevention Test ==="); + println!("Expected behavior: Packets arriving from underlay with"); + println!("Replication::Underlay should NOT be re-relayed."); + println!("\nThis requires packet injection capability to fully test."); + println!( + "Current implementation checks incoming delivery mode in Geneve options" + ); + println!("and only relays if delivery_mode is Underlay or All."); + + // Verify the multicast forwarding table is set up correctly + let mfwd = hdl.dump_mcast_fwd()?; + println!("\n=== Multicast forwarding table ==="); + for entry in &mfwd.entries { + println!( + " Group: {:?}, Next hops: {:?}", + entry.group, entry.next_hops + ); + } + + // Since we can't inject packets easily, verify NO spurious underlay traffic + // by waiting to ensure nothing appears on underlay without us sending anything + let snoop_result = snoop_underlay.wait_with_timeout(Duration::from_secs(2)); + + match snoop_result { + Ok(output) => { + let stdout = String::from_utf8_lossy(&output.stdout); + assert!( + stdout.is_empty(), + "No multicast traffic should appear on underlay without a sender:\n{stdout}" + ); + } + Err(_) => { + // Timeout is expected - no packets should appear + } + } + + Ok(()) +} diff --git a/xde-tests/tests/multicast_validation.rs b/xde-tests/tests/multicast_validation.rs new file mode 100644 index 00000000..5d472281 --- /dev/null +++ b/xde-tests/tests/multicast_validation.rs @@ -0,0 +1,239 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2025 Oxide Computer Company + +//! Validation tests covering multicast operations. 
+ +use anyhow::Context; +use anyhow::Result; +use opte_ioctl::OpteHdl; +use oxide_vpc::api::ClearMcast2PhysReq; +use oxide_vpc::api::IpCidr; +use oxide_vpc::api::Ipv4Addr; +use oxide_vpc::api::Ipv6Addr; +use oxide_vpc::api::McastSubscribeReq; +use oxide_vpc::api::McastUnsubscribeReq; +use oxide_vpc::api::NextHopV6; +use oxide_vpc::api::Replication; +use oxide_vpc::api::Vni; +use std::time::Duration; +use xde_tests::MulticastGroup; +use xde_tests::SnoopGuard; + +#[test] +fn test_subscribe_nonexistent_port() -> Result<()> { + let hdl = OpteHdl::open()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 100]); + + // Try to subscribe non-existent port + let result = hdl.mcast_subscribe(&McastSubscribeReq { + port_name: "this_port_does_not_exist_anywhere".to_string(), + group: mcast_group.into(), + }); + + // Should return error, not panic or succeed + assert!( + result.is_err(), + "Expected error when subscribing non-existent port, got Ok" + ); + + Ok(()) +} + +#[test] +fn test_subscribe_unicast_ip_as_group() -> Result<()> { + let topol = xde_tests::two_node_topology_named("omicron1", "unia", "unib")?; + let hdl = OpteHdl::open()?; + + // Try to subscribe to unicast IP (not multicast) - should be rejected + let unicast_ip = Ipv4Addr::from([10, 0, 0, 1]); + let result = hdl.mcast_subscribe(&McastSubscribeReq { + port_name: topol.nodes[0].port.name().to_string(), + group: unicast_ip.into(), + }); + + // Should reject non-multicast addresses + match result { + Ok(_) => { + panic!("Expected error when subscribing to unicast IP, got Ok") + } + Err(e) => { + assert!( + format!("{e:?}").contains("not a multicast address"), + "Expected 'not a multicast address' error, got: {e:?}", + ); + } + } + + Ok(()) +} + +#[test] +fn test_double_subscribe() -> Result<()> { + let topol = xde_tests::two_node_topology_named("omicron1", "dsa", "dsb")?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 101]); + const MCAST_PORT: u16 = 9999; // Avoid mDNS port 5353 + let vni = 
Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + + let underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 101, + ]); + + let mcast = MulticastGroup::new(mcast_group.into(), underlay, vni)?; + + let node_b_underlay = Ipv6Addr::from([ + 0xfd, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, + ]); + + mcast.set_forwarding(vec![( + NextHopV6::new(node_b_underlay, vni), + Replication::External, + )])?; + + let mcast_cidr = IpCidr::Ip4("224.0.0.0/4".parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + } + + // Subscribe once + topol.nodes[1].port.subscribe_multicast(mcast_group.into())?; + + // Subscribe again (should be idempotent) + let result = topol.nodes[1].port.subscribe_multicast(mcast_group.into()); + + // Should succeed (idempotent operation) + assert!( + result.is_ok(), + "Double subscribe should be idempotent, got error: {:?}", + result + ); + + // Verify delivery works and packet is NOT duplicated + let filter = format!("udp and ip dst {mcast_group} and port {MCAST_PORT}"); + let mut snoop = SnoopGuard::start(topol.nodes[1].port.name(), &filter)?; + + topol.nodes[0].zone.zone.zexec(&format!( + "echo 'test' | nc -u -w1 {mcast_group} {MCAST_PORT}" + ))?; + + let output = snoop + .wait_with_timeout(Duration::from_secs(5)) + .context("Timeout waiting for multicast delivery")?; + + let stdout = String::from_utf8_lossy(&output.stdout); + + // Verify packet received + assert!( + output.status.success() && stdout.contains("UDP"), + "Should receive multicast after double subscribe:\n{stdout}" + ); + + // Count occurrences - should be 1, not 2 (no duplication) + let count = stdout.matches("UDP").count(); + assert!( + count == 1, + "Packet should be delivered once, not duplicated. 
Found {count} deliveries" + ); + + Ok(()) +} + +#[test] +fn test_unsubscribe_never_subscribed() -> Result<()> { + let topol = xde_tests::two_node_topology_named("omicron1", "usa", "usb")?; + let hdl = OpteHdl::open()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 102]); + + // Try to unsubscribe without ever subscribing + let result = hdl.mcast_unsubscribe(&McastUnsubscribeReq { + port_name: topol.nodes[0].port.name().to_string(), + group: mcast_group.into(), + }); + + // Expected: Ok (no-op). Unsubscribe is idempotent for existing ports. + assert!( + result.is_ok(), + "Unsubscribe should be a no-op (Ok), got: {result:?}" + ); + + Ok(()) +} + +#[test] +fn test_subscribe_then_clear_m2p() -> Result<()> { + let topol = xde_tests::two_node_topology_named("omicron1", "sca", "scb")?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 103]); + const MCAST_PORT: u16 = 9999; // Avoid mDNS port 5353 + let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + + let underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 103, + ]); + + let mcast = MulticastGroup::new(mcast_group.into(), underlay, vni)?; + + let node_b_underlay = Ipv6Addr::from([ + 0xfd, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, + ]); + + mcast.set_forwarding(vec![( + NextHopV6::new(node_b_underlay, vni), + Replication::External, + )])?; + + let mcast_cidr = IpCidr::Ip4("224.0.0.0/4".parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + } + + topol.nodes[1].port.subscribe_multicast(mcast_group.into())?; + + // Clear M2P while subscription active + let hdl = OpteHdl::open()?; + hdl.clear_m2p(&ClearMcast2PhysReq { + group: mcast_group.into(), + underlay, + vni, + })?; + + // Start snoops to verify NO delivery occurs after M2P clear + let dev_name_b = topol.nodes[1].port.name().to_string(); + let filter_local = + format!("udp and ip dst {mcast_group} and 
port {MCAST_PORT}"); + let mut snoop_local = SnoopGuard::start(&dev_name_b, &filter_local)?; + + let underlay_dev = "xde_test_sim1"; + let mut snoop_underlay = + SnoopGuard::start(underlay_dev, "ip6 and udp port 6081")?; + + // Send packet - command should execute successfully regardless of delivery + let result = topol.nodes[0] + .zone + .zone + .zexec(&format!("echo 'test' | nc -u -w1 {mcast_group} {MCAST_PORT}")); + + // Expected: Ok (command executed). Delivery should NOT occur. + assert!(result.is_ok(), "Send after M2P clear should succeed: {result:?}"); + + // Verify no local delivery + if let Ok(out) = snoop_local.wait_with_timeout(Duration::from_secs(2)) { + let stdout = String::from_utf8_lossy(&out.stdout); + panic!("No local delivery expected; got:\n{stdout}"); + } + + // Verify no underlay forwarding (encap denied without M2P) + if let Ok(out) = snoop_underlay.wait_with_timeout(Duration::from_secs(2)) { + let stdout = String::from_utf8_lossy(&out.stdout); + panic!( + "No underlay forwarding expected after M2P clear; got:\n{stdout}" + ); + } + + Ok(()) +} diff --git a/xde/src/dev_map.rs b/xde/src/dev_map.rs index 5fbdf5c0..43e95e9e 100644 --- a/xde/src/dev_map.rs +++ b/xde/src/dev_map.rs @@ -7,12 +7,17 @@ use crate::postbox::Postbox; use crate::xde::XdeDev; use alloc::collections::btree_map::BTreeMap; +use alloc::collections::btree_map::Entry; +use alloc::collections::btree_set::BTreeSet; use alloc::string::String; use alloc::sync::Arc; +use opte::api::IpAddr; use opte::api::MacAddr; +use opte::api::OpteError; use opte::api::Vni; use opte::ddi::sync::KRwLock; use opte::ddi::sync::KRwLockReadGuard; +use opte::ddi::sync::KRwLockWriteGuard; /// A map/set lookup key for ports indexed on `(Vni, MacAddr)`. 
/// @@ -27,6 +32,11 @@ impl VniMac { pub fn new(vni: Vni, mac: MacAddr) -> Self { VniMac(vni.as_u32(), mac_to_u64(mac)) } + + #[inline] + pub fn vni(&self) -> Vni { + Vni::new(self.0).expect("VniMac contains valid VNI") + } } type Dev = Arc; @@ -41,6 +51,7 @@ type Dev = Arc; pub struct DevMap { devs: BTreeMap, names: BTreeMap, + mcast_groups: BTreeMap>, } impl Default for DevMap { @@ -51,7 +62,11 @@ impl Default for DevMap { impl DevMap { pub const fn new() -> Self { - Self { devs: BTreeMap::new(), names: BTreeMap::new() } + Self { + devs: BTreeMap::new(), + names: BTreeMap::new(), + mcast_groups: BTreeMap::new(), + } } /// Insert an `XdeDev`. @@ -66,9 +81,70 @@ impl DevMap { /// Remove an `XdeDev` using its name. pub fn remove(&mut self, name: &str) -> Option { let key = get_key(&self.names.remove(name)?); + + // Clean up all multicast group subscriptions for this port + self.mcast_groups.retain(|_group, subscribers| { + subscribers.remove(&key); + !subscribers.is_empty() + }); + self.devs.remove(&key) } + /// Allow a port to receive on a given multicast group. + /// + /// This takes the overlay (outer v6) multicast group address. + pub fn mcast_subscribe( + &mut self, + name: &str, + mcast_ip: IpAddr, + ) -> Result<(), OpteError> { + // Validate that the IP is actually a multicast address + if !mcast_ip.is_multicast() { + return Err(OpteError::BadState(format!( + "IP address {} is not a multicast address", + mcast_ip + ))); + } + + let port = self + .get_by_name(name) + .ok_or_else(|| OpteError::PortNotFound(name.into()))?; + let key = get_key(port); + + // TODO: probably could store Arcs or Weaks here, but want to be safe for now. + self.mcast_groups.entry(mcast_ip).or_default().insert(key); + + Ok(()) + } + + /// Rescind a port's ability to receive on a given multicast group. 
+ pub fn mcast_unsubscribe( + &mut self, + name: &str, + mcast_ip: IpAddr, + ) -> Result<(), OpteError> { + let port = self + .get_by_name(name) + .ok_or_else(|| OpteError::PortNotFound(name.into()))?; + let key = get_key(port); + + // TODO: Do we need handling for a special VNI from rack-external traffic? + if let Entry::Occupied(set) = self.mcast_groups.entry(mcast_ip) { + set.into_mut().remove(&key); + } + + Ok(()) + } + + /// Find the keys for all ports who want to receive a given multicast packet. + pub fn mcast_listeners( + &self, + mcast_ip: &IpAddr, + ) -> Option> { + self.mcast_groups.get(mcast_ip).map(|v| v.iter()) + } + /// Return a reference to an `XdeDev` using its address. #[inline] #[must_use] @@ -135,4 +211,8 @@ impl ReadOnlyDevMap { pub fn read(&self) -> KRwLockReadGuard<'_, DevMap> { self.0.read() } + + pub fn write(&self) -> KRwLockWriteGuard<'_, DevMap> { + self.0.write() + } } diff --git a/xde/src/stats.rs b/xde/src/stats.rs index 53a57076..02518ab8 100644 --- a/xde/src/stats.rs +++ b/xde/src/stats.rs @@ -55,9 +55,72 @@ pub struct XdeStats { out_drop_misc: KStatU64, // NOTE: tun_opt is not relevant to outbound packets -- no encapsulation // is in use. + /// The number of multicast packets delivered to external/customer + /// members (decapsulated packets to local guest instances). + mcast_tx_external: KStatU64, + /// The number of multicast packets forwarded to underlay/infrastructure + /// members (encapsulated Geneve packets to infrastructure destinations). + mcast_tx_underlay: KStatU64, + /// The number of times a stale multicast listener was encountered + /// during external delivery. + mcast_tx_stale_external: KStatU64, + + /// The number of multicast packets received and delivered to external/customer + /// members (decapsulated packets to local guest instances). 
+ mcast_rx_external: KStatU64, + /// The number of multicast packets received and forwarded to underlay/infrastructure + /// members (re-encapsulated Geneve packets to infrastructure destinations). + mcast_rx_underlay: KStatU64, + /// The number of times a stale multicast listener was encountered + /// during Rx external delivery. + mcast_rx_stale_external: KStatU64, + /// The number of multicast packets received with no forwarding entry. + mcast_rx_no_fwd_entry: KStatU64, + /// The number of times a pullup operation failed during multicast TX + /// (packet replication), causing a packet to be dropped. + mcast_tx_pullup_fail: KStatU64, + /// The number of times a pullup operation failed during multicast RX + /// (packet delivery/relay), causing a packet to be dropped. + mcast_rx_pullup_fail: KStatU64, } impl XdeStats { + pub fn mcast_tx_external(&self) -> &KStatU64 { + &self.mcast_tx_external + } + + pub fn mcast_tx_underlay(&self) -> &KStatU64 { + &self.mcast_tx_underlay + } + + pub fn mcast_tx_stale_external(&self) -> &KStatU64 { + &self.mcast_tx_stale_external + } + + pub fn mcast_rx_external(&self) -> &KStatU64 { + &self.mcast_rx_external + } + + pub fn mcast_rx_underlay(&self) -> &KStatU64 { + &self.mcast_rx_underlay + } + + pub fn mcast_rx_stale_external(&self) -> &KStatU64 { + &self.mcast_rx_stale_external + } + + pub fn mcast_rx_no_fwd_entry(&self) -> &KStatU64 { + &self.mcast_rx_no_fwd_entry + } + + pub fn mcast_tx_pullup_fail(&self) -> &KStatU64 { + &self.mcast_tx_pullup_fail + } + + pub fn mcast_rx_pullup_fail(&self) -> &KStatU64 { + &self.mcast_rx_pullup_fail + } + pub fn parse_error(&self, dir: Direction, err: &ParseError) { use Direction::*; (match (dir, err) { diff --git a/xde/src/xde.rs b/xde/src/xde.rs index b753484a..7267e581 100644 --- a/xde/src/xde.rs +++ b/xde/src/xde.rs @@ -153,6 +153,7 @@ use crate::sys::ncpus; use crate::warn; use alloc::borrow::ToOwned; use alloc::boxed::Box; +use alloc::collections::BTreeMap; use alloc::ffi::CString; use 
alloc::string::String; use alloc::string::ToString; @@ -160,6 +161,7 @@ use alloc::sync::Arc; use alloc::vec::Vec; use core::ffi::CStr; use core::num::NonZeroU32; +use core::num::NonZeroUsize; use core::ptr; use core::ptr::NonNull; use core::ptr::addr_of; @@ -167,6 +169,7 @@ use core::ptr::addr_of_mut; use core::time::Duration; use illumos_sys_hdrs::mac::MacEtherOffloadFlags; use illumos_sys_hdrs::mac::MblkOffloadFlags; +use illumos_sys_hdrs::mac::mac_ether_offload_info_t; use illumos_sys_hdrs::*; use ingot::geneve::Geneve; use ingot::geneve::GeneveOpt; @@ -185,6 +188,7 @@ use opte::api::DumpUftReq; use opte::api::DumpUftResp; use opte::api::ListLayersReq; use opte::api::ListLayersResp; +use opte::api::MacAddr; use opte::api::NoResp; use opte::api::OpteCmd; use opte::api::OpteCmdIoctl; @@ -212,6 +216,7 @@ use opte::engine::geneve::Vni; use opte::engine::geneve::WalkOptions; use opte::engine::headers::IpAddr; use opte::engine::ip::v6::Ipv6Addr; +use opte::engine::ip::v6::Ipv6Ref; use opte::engine::packet::InnerFlowId; use opte::engine::packet::Packet; use opte::engine::packet::ParseError; @@ -221,21 +226,32 @@ use opte::engine::port::PortBuilder; use opte::engine::port::ProcessResult; use oxide_vpc::api::AddFwRuleReq; use oxide_vpc::api::AddRouterEntryReq; +use oxide_vpc::api::ClearMcast2PhysReq; +use oxide_vpc::api::ClearMcastForwardingReq; use oxide_vpc::api::ClearVirt2BoundaryReq; use oxide_vpc::api::ClearVirt2PhysReq; use oxide_vpc::api::CreateXdeReq; +use oxide_vpc::api::DEFAULT_MULTICAST_VNI; use oxide_vpc::api::DelRouterEntryReq; use oxide_vpc::api::DelRouterEntryResp; use oxide_vpc::api::DeleteXdeReq; use oxide_vpc::api::DhcpCfg; +use oxide_vpc::api::DumpMcastForwardingResp; use oxide_vpc::api::DumpVirt2BoundaryResp; use oxide_vpc::api::DumpVirt2PhysResp; use oxide_vpc::api::ListPortsResp; +use oxide_vpc::api::McastForwardingEntry; +use oxide_vpc::api::McastSubscribeReq; +use oxide_vpc::api::McastUnsubscribeReq; +use oxide_vpc::api::NextHopV6; use 
oxide_vpc::api::PhysNet; use oxide_vpc::api::PortInfo; use oxide_vpc::api::RemFwRuleReq; use oxide_vpc::api::RemoveCidrResp; +use oxide_vpc::api::Replication; use oxide_vpc::api::SetFwRulesReq; +use oxide_vpc::api::SetMcast2PhysReq; +use oxide_vpc::api::SetMcastForwardingReq; use oxide_vpc::api::SetVirt2BoundaryReq; use oxide_vpc::api::SetVirt2PhysReq; use oxide_vpc::cfg::IpCfg; @@ -285,6 +301,30 @@ unsafe extern "C" { dst_port: uintptr_t, ); pub safe fn __dtrace_probe_hdlr__resp(resp_str: uintptr_t); + pub safe fn __dtrace_probe_mcast__tx( + af: uintptr_t, // AF_INET or AF_INET6 + inner_dst: uintptr_t, // *const Ipv4Addr or *const Ipv6Addr + vni: uintptr_t, + replication: uintptr_t, + ); + pub safe fn __dtrace_probe_mcast__rx( + af: uintptr_t, + inner_dst: uintptr_t, + vni: uintptr_t, + replication: uintptr_t, + ); + pub safe fn __dtrace_probe_mcast__local__delivery( + af: uintptr_t, + inner_dst: uintptr_t, + vni: uintptr_t, + port: uintptr_t, + ); + pub safe fn __dtrace_probe_mcast__underlay__fwd( + af: uintptr_t, + inner_dst: uintptr_t, + vni: uintptr_t, + next_hop: *const oxide_vpc::api::Ipv6Addr, + ); } fn bad_packet_parse_probe( @@ -377,6 +417,10 @@ struct XdeState { struct XdeMgmt { devs: Arc>, underlay: Option, + /// XDE-wide multicast forwarding table mapping multicast group addresses + /// to their physical next hops with replication information. 
+ /// Maps: IpAddr (overlay multicast group) -> BTreeMap + mcast_fwd: Arc>>>, } #[derive(Clone)] @@ -400,6 +444,7 @@ fn get_xde_state() -> &'static XdeState { impl XdeState { fn new() -> Self { + #[allow(clippy::arc_with_non_send_sync)] let ectx = Arc::new(ExecCtx { log: Box::new(opte::KernelLog {}) }); let dev_map = Arc::new(KRwLock::new(DevMap::default())); let devs = ReadOnlyDevMap::new(dev_map.clone()); @@ -408,6 +453,7 @@ impl XdeState { management_lock: TokenLock::new(XdeMgmt { devs: dev_map, underlay: None, + mcast_fwd: Arc::new(KRwLock::new(BTreeMap::new())), }), devs, ectx, @@ -444,7 +490,7 @@ pub struct XdeDev { // However, that's not where things are today. pub port: Arc>, vpc_cfg: VpcCfg, - port_v2p: Arc, + port_vni_state: Arc, // Pass the packets through to the underlay devices, skipping // opte-core processing. @@ -868,6 +914,41 @@ unsafe extern "C" fn xde_ioc_opte_cmd(karg: *mut c_void, mode: c_int) -> c_int { let resp = remove_cidr_hdlr(&mut env); hdlr_resp(&mut env, resp) } + + OpteCmd::SetMcastForwarding => { + let resp = set_mcast_forwarding_hdlr(&mut env); + hdlr_resp(&mut env, resp) + } + + OpteCmd::ClearMcastForwarding => { + let resp = clear_mcast_forwarding_hdlr(&mut env); + hdlr_resp(&mut env, resp) + } + + OpteCmd::DumpMcastForwarding => { + let resp = dump_mcast_forwarding_hdlr(); + hdlr_resp(&mut env, resp) + } + + OpteCmd::McastSubscribe => { + let resp = mcast_subscribe_hdlr(&mut env); + hdlr_resp(&mut env, resp) + } + + OpteCmd::McastUnsubscribe => { + let resp = mcast_unsubscribe_hdlr(&mut env); + hdlr_resp(&mut env, resp) + } + + OpteCmd::SetMcast2Phys => { + let resp = set_m2p_hdlr(&mut env); + hdlr_resp(&mut env, resp) + } + + OpteCmd::ClearMcast2Phys => { + let resp = clear_m2p_hdlr(&mut env); + hdlr_resp(&mut env, resp) + } } } @@ -961,7 +1042,7 @@ fn create_xde(req: &CreateXdeReq) -> Result { state.ectx.clone(), &req.dhcp, )?, - port_v2p, + port_vni_state: port_v2p, vni: cfg.vni, vpc_cfg: cfg, passthrough: req.passthrough, @@ 
-1236,6 +1317,9 @@ fn clear_xde_underlay() -> Result { }); } + // Clear multicast forwarding table to release any references + token.mcast_fwd.write().clear(); + if let Some(underlay) = token.underlay.take() { // If the underlay references have leaked/spread beyond `XdeDev`s and not // been cleaned up, we committed have a fatal programming error. @@ -1772,20 +1856,18 @@ fn guest_loopback_probe( fn guest_loopback( src_dev: &XdeDev, - entry_state: &DevMap, + dest_dev: &XdeDev, + port_key: VniMac, mut pkt: MsgBlk, - vni: Vni, postbox: &mut TxPostbox, ) { use Direction::*; let mblk_addr = pkt.mblk_addr(); - // Loopback now requires a reparse on loopback to account for UFT fastpath. - // When viona serves us larger packets, we needn't worry about allocing - // the encap on. - // We might be able to do better in the interim, but that costs us time. - + // Loopback requires a reparse to account for UFT fastpath. + // We might be able to do better, but the logistics in passing around + // the emitspec in lieu of 'full' metadata might be a little troublesome. let parsed_pkt = match Packet::parse_inbound(pkt.iter_mut(), VpcParser {}) { Ok(pkt) => pkt, Err(e) => { @@ -1810,78 +1892,755 @@ fn guest_loopback( let flow = parsed_pkt.flow(); - let ether_dst = parsed_pkt.meta().inner_eth.destination(); - let port_key = VniMac::new(vni, ether_dst); - let maybe_dest_dev = entry_state.get_by_key(port_key); - - match maybe_dest_dev { - Some(dest_dev) => { - guest_loopback_probe(mblk_addr, &flow, src_dev, dest_dev); - - // We have found a matching Port on this host; "loop back" - // the packet into the inbound processing path of the - // destination Port. 
- match dest_dev.port.process(In, parsed_pkt) { - Ok(ProcessResult::Modified(emit_spec)) => { - let mut pkt = emit_spec.apply(pkt); - if let Err(e) = pkt.fill_parse_info(&ulp_meoi, None) { - opte::engine::err!("failed to set offload info: {}", e); - } + guest_loopback_probe(mblk_addr, &flow, src_dev, dest_dev); - // Having advertised offloads to our guest, looped back - // packets are liable to have zero-checksums. Fill these - // if necessary. - let pkt = if pkt - .offload_flags() - .flags - .intersects(MblkOffloadFlags::HCK_TX_FLAGS) - { - // We have only asked for cksum emulation, so we - // will either have: - // * 0 pkts (checksum could not be emulated, - // packet dropped) - // * 1 pkt. - mac_hw_emul(pkt, MacEmul::HWCKSUM_EMUL) - .and_then(|mut v| v.pop_front()) - } else { - Some(pkt) - }; + match dest_dev.port.process(In, parsed_pkt) { + Ok(ProcessResult::Modified(emit_spec)) => { + let mut pkt = emit_spec.apply(pkt); + if let Err(e) = pkt.fill_parse_info(&ulp_meoi, None) { + opte::engine::err!("failed to set offload info: {}", e); + } + + // Having advertised offloads to our guest, looped back + // packets are liable to have zero-checksums. Fill these + // if necessary. + let pkt = if pkt + .offload_flags() + .flags + .intersects(MblkOffloadFlags::HCK_TX_FLAGS) + { + // We have only asked for cksum emulation, so we + // will either have: + // * 0 pkts (checksum could not be emulated, + // packet dropped) + // * 1 pkt. + mac_hw_emul(pkt, MacEmul::HWCKSUM_EMUL) + .and_then(|mut v| v.pop_front()) + } else { + Some(pkt) + }; + + if let Some(pkt) = pkt { + postbox.post_local(port_key, pkt); + } + } + + Ok(ProcessResult::Drop { reason }) => { + opte::engine::dbg!("loopback rx drop: {:?}", reason); + } + + Ok(ProcessResult::Hairpin(_hppkt)) => { + // There should be no reason for an loopback + // inbound packet to generate a hairpin response + // from the destination port. 
+ opte::engine::dbg!("unexpected loopback rx hairpin"); + } + + Err(e) => { + opte::engine::dbg!( + "loopback port process error: {} -> {} {:?}", + src_dev.port.name(), + dest_dev.port.name(), + e + ); + } + } +} - if let Some(pkt) = pkt { - postbox.post_local(port_key, pkt); +/// Locate the Oxide Multicast Geneve option and return the offset to its body. +/// +/// Walks through Geneve options starting at `geneve_offset + 8` to find the +/// Oxide Multicast option (class=0x0129, type=0x01). Returns the offset to the +/// option body (after the 4-byte option header) if found. +/// +/// # Safety +/// This function validates option headers as it walks to avoid reading beyond +/// packet boundaries. Returns None if the option is not found or if validation fails. +/// +/// # Geneve Option Format +/// Each option consists of: +/// - 2 bytes: Option class +/// - 1 byte: Flags (bit 7=critical) + Type (bits 0-6) +/// - 1 byte: Reserved (3 bits) + Length in 4-byte words (5 bits) +/// - N bytes: Option data (N = length field * 4) +fn find_mcast_option_offset( + pkt: &MsgBlk, + geneve_offset: usize, +) -> Option { + const GENEVE_HDR_LEN: usize = 8; + const OPT_HDR_LEN: usize = 4; + const OXIDE_OPT_CLASS: u16 = 0x0129; + const MULTICAST_OPT_TYPE: u8 = 0x01; + + // Read Geneve header to get option length + let geneve_hdr = pkt.get(geneve_offset..geneve_offset + GENEVE_HDR_LEN)?; + let opt_len_words = (geneve_hdr[0] & 0x3F) as usize; // Bottom 6 bits of first byte + + if opt_len_words == 0 { + return None; // No options present + } + + let opts_start = geneve_offset + GENEVE_HDR_LEN; + let opts_end = opts_start + (opt_len_words * 4); + + // Belt-and-braces: ensure options area doesn't exceed packet length + if opts_end > pkt.len() { + return None; + } + + let mut offset = opts_start; + + while offset + OPT_HDR_LEN <= opts_end { + let opt_hdr = pkt.get(offset..offset + OPT_HDR_LEN)?; + + let class = u16::from_be_bytes([opt_hdr[0], opt_hdr[1]]); + let opt_type = opt_hdr[2] & 0x7F; 
// Mask out critical bit + let opt_data_words = (opt_hdr[3] & 0x1F) as usize; // Bottom 5 bits + let opt_data_len = opt_data_words * 4; + + if class == OXIDE_OPT_CLASS && opt_type == MULTICAST_OPT_TYPE { + // Found it! Return offset to option body + return Some(offset + OPT_HDR_LEN); + } + + // Move to next option + offset += OPT_HDR_LEN + opt_data_len; + } + + None +} + +/// Update the Oxide Multicast Geneve option's replication field. +/// +/// Locates the multicast option and rewrites the replication strategy in the +/// first byte of the option body (top 2 bits encode the replication mode). +/// +/// Returns `true` if the option was found and updated, `false` otherwise. +/// +/// # Replication Encoding +/// The replication field uses the top 2 bits of the first byte: +/// - `External` (0): 0x00 +/// - `Underlay` (1): 0x40 +/// - `All` (2): 0x80 +/// - `Reserved` (3): 0xC0 +#[inline] +fn update_mcast_replication( + pkt: &mut MsgBlk, + geneve_offset: usize, + replication: Replication, +) -> bool { + let Some(mcast_body_off) = find_mcast_option_offset(pkt, geneve_offset) + else { + return false; + }; + + let Some(rep_byte) = pkt.get_mut(mcast_body_off..mcast_body_off + 1) else { + return false; + }; + + // Encode replication in top 2 bits, preserve bottom 6 bits + let repl_bits = (replication as u8) << 6; + rep_byte[0] = (rep_byte[0] & 0x3F) | repl_bits; + true +} + +/// Compute the combined replication strategy from a set of next hops. +/// +/// Starts from the first hop's replication and folds the rest using +/// `Replication::merge()` to avoid biasing toward `External`. +/// Returns `None` if `next_hops` is empty. 
+#[inline] +fn compute_replication_strategy( + next_hops: &BTreeMap, +) -> Option { + let mut acc: Option = None; + for repl in next_hops.values().copied() { + acc = Some(match acc { + None => repl, + Some(cur) => cur.merge(repl), + }); + } + acc +} + +struct MulticastTxContext<'a> { + inner_dst: oxide_vpc::api::IpAddr, // Inner/overlay destination IP + vni: Vni, + out_pkt: &'a MsgBlk, + encap_len: u32, + inner_eth_len: usize, + non_eth_payl_bytes: u32, + tun_meoi: &'a mac_ether_offload_info_t, + l4_hash: u32, +} + +struct MulticastRxContext<'a> { + inner_dst: oxide_vpc::api::IpAddr, // Inner/overlay destination IP + vni: Vni, + pkt: &'a MsgBlk, + pullup_len: usize, + geneve_offset: usize, + incoming_delivery_mode: Option, +} + +/// Handle multicast packet forwarding for both external/customer and +/// underlay/infrastructure delivery based on the XDE-wide multicast +/// forwarding table. +/// +/// - External: Customer-facing members, local guest instances (decapsulated) +/// - Underlay: Infrastructure members, underlay destinations (encapsulated Geneve) +fn handle_mcast_tx<'a>( + ctx: MulticastTxContext, + src_dev: &'a XdeDev, + postbox: &mut TxPostbox, + entry_state: &mut Option>>, +) { + // DTrace probe: capture TX entry + let (af, inner_addr_ptr) = match &ctx.inner_dst { + oxide_vpc::api::IpAddr::Ip4(v4) => { + (2usize, AsRef::<[u8]>::as_ref(v4).as_ptr() as uintptr_t) + } + oxide_vpc::api::IpAddr::Ip6(v6) => { + (26usize, AsRef::<[u8]>::as_ref(v6).as_ptr() as uintptr_t) + } + }; + + // Determine replication strategy from XDE-wide multicast forwarding table + let xde = get_xde_state(); + let mgmt = xde.management_lock.lock(); + let mcast_fwd = mgmt.mcast_fwd.read(); + + // Compute combined replication strategy from all next hops to govern local delivery. 
+ let delivery_mode = mcast_fwd + .get(&ctx.inner_dst) + .and_then(compute_replication_strategy) + .unwrap_or(Replication::External); + + // Drop locks before potentially expensive operations + drop(mcast_fwd); + drop(mgmt); + + // DTrace probe: multicast TX entry with delivery mode + let (af, addr_ptr) = match &ctx.inner_dst { + oxide_vpc::api::IpAddr::Ip4(v4) => { + (2usize, AsRef::<[u8]>::as_ref(v4).as_ptr() as uintptr_t) + } + oxide_vpc::api::IpAddr::Ip6(v6) => { + (26usize, AsRef::<[u8]>::as_ref(v6).as_ptr() as uintptr_t) + } + }; + __dtrace_probe_mcast__tx( + af, + addr_ptr, + ctx.vni.as_u32() as uintptr_t, + delivery_mode as uintptr_t, + ); + + // External/customer delivery if delivery mode is External or All + // Delivers decapsulated packets to customer-facing members in the same VNI + let do_external = matches!( + delivery_mode, + oxide_vpc::api::Replication::External + | oxide_vpc::api::Replication::All + ); + + if do_external { + let entry_state = + entry_state.get_or_insert_with(|| src_dev.port_map.read()); + if let Some(others) = entry_state.mcast_listeners(&ctx.inner_dst) { + let my_key = VniMac::new(ctx.vni, src_dev.port.mac_addr()); + for el in others { + // Filter by VNI - only deliver to listeners in the same VNI + if el.vni() != ctx.vni { + continue; + } + if my_key == *el { + continue; + } + + // This is a more lightweight clone in illumos, and + // gives us an owned form of the headers but a ref + // counted clone of the packet body. + // + // If there are any body transforms internally, OPTE + // will fully clone out the contents if required. 
+ let pullup_len = (ctx.encap_len as usize) + + (ctx.non_eth_payl_bytes as usize) + + ctx.inner_eth_len; + let Ok(my_pkt) = + ctx.out_pkt.pullup(NonZeroUsize::new(pullup_len)) + else { + opte::engine::dbg!( + "mcast TX external pullup failed: requested {} bytes", + pullup_len + ); + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_pullup_fail().incr(1); + continue; + }; + match entry_state.get_by_key(*el) { + Some(dev) => { + // DTrace probe: local delivery + let (af, addr_ptr) = match &ctx.inner_dst { + oxide_vpc::api::IpAddr::Ip4(v4) => ( + 2usize, + AsRef::<[u8]>::as_ref(v4).as_ptr() as uintptr_t, + ), + oxide_vpc::api::IpAddr::Ip6(v6) => ( + 26usize, + AsRef::<[u8]>::as_ref(v6).as_ptr() as uintptr_t, + ), + }; + __dtrace_probe_mcast__local__delivery( + af, + addr_ptr, + ctx.vni.as_u32() as uintptr_t, + dev.port.name_cstr().as_ptr() as uintptr_t, + ); + guest_loopback(src_dev, dev, *el, my_pkt, postbox); + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_external().incr(1); + } + None => { + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_stale_external().incr(1); } } + } + } + } - Ok(ProcessResult::Drop { reason }) => { - opte::engine::dbg!("loopback rx drop: {:?}", reason); + // Underlay/infrastructure forwarding only if the merged delivery mode + // calls for it. External-only means local delivery only, no underlay fanout. 
+ let do_underlay = matches!( + delivery_mode, + oxide_vpc::api::Replication::Underlay + | oxide_vpc::api::Replication::All + ); + + if do_underlay { + // Re-acquire locks for underlay forwarding + let xde = get_xde_state(); + let mgmt = xde.management_lock.lock(); + let mcast_fwd = mgmt.mcast_fwd.read(); + + if let Some(next_hops) = mcast_fwd.get(&ctx.inner_dst) { + // We found forwarding entries, replicate to each next hop + for (next_hop, replication) in next_hops.iter() { + // Clone packet with headers using pullup + let pullup_len = (ctx.encap_len as usize) + + (ctx.non_eth_payl_bytes as usize) + + ctx.inner_eth_len; + let Ok(mut fwd_pkt) = + ctx.out_pkt.pullup(NonZeroUsize::new(pullup_len)) + else { + opte::engine::dbg!( + "mcast TX underlay pullup failed: requested {} bytes", + pullup_len + ); + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_pullup_fail().incr(1); + continue; // Skip this destination on allocation failure + }; + + // Modify VNI in Geneve header to next_hop.vni + // Geneve header follows outer Ethernet + IPv6 + UDP + let geneve_offset = usize::from(ctx.tun_meoi.meoi_l2hlen) + + usize::from(ctx.tun_meoi.meoi_l3hlen) + + usize::from(ctx.tun_meoi.meoi_l4hlen); + + // Determine the actual outer IPv6 destination and whether to modify it + // - External: Override with unicast next_hop.addr for delivery to specific host + // - Underlay/All: Keep the multicast underlay address from OPTE (already set via M2P) + let ipv6_offset = usize::from(ctx.tun_meoi.meoi_l2hlen); + let actual_outer_dst = match replication { + oxide_vpc::api::Replication::External => { + // External replication: override with unicast destination + let ipv6_dst_offset = ipv6_offset + 24; + if let Some(dst_bytes) = fwd_pkt + .get_mut(ipv6_dst_offset..ipv6_dst_offset + 16) + { + dst_bytes.copy_from_slice(AsRef::<[u8]>::as_ref( + &next_hop.addr, + )); + } + next_hop.addr // Use unicast address for routing + } + oxide_vpc::api::Replication::Underlay + | 
oxide_vpc::api::Replication::All => { + // Underlay/All replication: The packet already has the correct + // multicast underlay address from OPTE's M2P mapping. + // Do NOT override it - just get it for route lookup + let xde = get_xde_state(); + match xde + .vpc_map + .get_mcast_underlay(ctx.vni, ctx.inner_dst) + { + Some(mcast_ul) => mcast_ul.addr(), // Use multicast address for routing + None => { + // No M2P mapping - skip this destination + continue; + } + } + } + _ => { + // Reserved or unknown replication type - skip + continue; + } + }; + + // VNI is at offset 4 in Geneve header (3 bytes) + if let Some(vni_bytes) = + fwd_pkt.get_mut(geneve_offset + 4..geneve_offset + 7) + { + let vni_be = next_hop.vni.as_u32().to_be_bytes(); + vni_bytes.copy_from_slice(&vni_be[1..4]); // VNI is 24 bits } + // Update Geneve multicast option to reflect underlay replication to prevent re-relay loops. + update_mcast_replication( + &mut fwd_pkt, + geneve_offset, + *replication, + ); + + // Route lookup for next hop to get outer MAC addresses + // Use the actual_outer_dst we determined above + let route_key = RouteKey { + dst: actual_outer_dst, + l4_hash: Some(ctx.l4_hash), + }; + let Route { src: mac_src, dst: mac_dst, underlay_idx } = + src_dev.routes.next_hop(route_key, src_dev); + + // Fill in outer MAC addresses + let final_pkt = unsafe { + let mblk = fwd_pkt.unwrap_mblk().as_ptr(); + let rptr = (*mblk).b_rptr; + ptr::copy(mac_dst.as_ptr(), rptr, 6); + ptr::copy(mac_src.as_ptr(), rptr.add(6), 6); + + // Note: The outer IPv6 destination was already set correctly in fwd_pkt + // based on the replication type, and we used the correct address for + // route lookup, so no need to modify it here. 
+ + MsgBlk::wrap_mblk(mblk).unwrap() + }; + + // DTrace probe: underlay forwarding + let (af, addr_ptr) = match &ctx.inner_dst { + oxide_vpc::api::IpAddr::Ip4(v4) => { + (2usize, v4 as *const _ as uintptr_t) + } + oxide_vpc::api::IpAddr::Ip6(v6) => { + (26usize, v6 as *const _ as uintptr_t) + } + }; + __dtrace_probe_mcast__underlay__fwd( + af, + addr_ptr, + ctx.vni.as_u32() as uintptr_t, + &next_hop.addr, + ); + + // Send to underlay + postbox.post_underlay( + underlay_idx, + TxHint::from_crc32(ctx.l4_hash), + final_pkt, + ); + + // Increment underlay forwarding stat + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_underlay().incr(1); + } + + // Release locks + drop(mcast_fwd); + drop(mgmt); + } + } +} - Ok(ProcessResult::Hairpin(_hppkt)) => { - // There should be no reason for an loopback - // inbound packet to generate a hairpin response - // from the destination port. - opte::engine::dbg!("unexpected loopback rx hairpin"); +/// Handle multicast packet reception from the underlay. +/// +/// This function processes incoming multicast packets and: +/// - Delivers to external/customer members in the same VNI (local listeners) +/// - Optionally forwards to underlay/infrastructure members (if acting as relay) +/// +/// Unlike Tx path which originates from a port, Rx path receives from underlay +/// and needs to determine all appropriate destinations. 
+fn handle_mcast_rx( + ctx: MulticastRxContext, + stream: &DlsStream, + devs: &DevMap, + postbox: &mut Postbox, +) { + // DTrace probe: multicast RX entry + let (af, addr_ptr) = match &ctx.inner_dst { + oxide_vpc::api::IpAddr::Ip4(v4) => { + (2usize, v4 as *const _ as uintptr_t) + } + oxide_vpc::api::IpAddr::Ip6(v6) => { + (26usize, v6 as *const _ as uintptr_t) + } + }; + __dtrace_probe_mcast__rx( + af, + addr_ptr, + ctx.vni.as_u32() as uintptr_t, + ctx.incoming_delivery_mode.map(|r| r as uintptr_t).unwrap_or(0), + ); + + // Determine replication strategy from XDE-wide multicast forwarding table + let xde = get_xde_state(); + let mgmt = xde.management_lock.lock(); + let mcast_fwd = mgmt.mcast_fwd.read(); + + // Compute combined replication strategy from all next hops + let has_fwd_entry = mcast_fwd.get(&ctx.inner_dst).is_some(); + let delivery_mode = mcast_fwd + .get(&ctx.inner_dst) + .and_then(compute_replication_strategy) + .unwrap_or(Replication::External); + + // Drop locks before potentially expensive operations + drop(mcast_fwd); + drop(mgmt); + + // If no forwarding entry exists, check for local listeners only + if !has_fwd_entry { + if let Some(ports) = devs.mcast_listeners(&ctx.inner_dst) { + // Deliver to local listeners in the same VNI only + for el in ports { + // Filter by VNI - only deliver to listeners in the incoming packet's VNI + if el.vni() != ctx.vni { + continue; } - Err(e) => { + let Ok(my_pkt) = + ctx.pkt.pullup(NonZeroUsize::new(ctx.pullup_len)) + else { opte::engine::dbg!( - "loopback port process error: {} -> {} {:?}", - src_dev.port.name(), - dest_dev.port.name(), - e + "mcast RX external pullup failed: requested {} bytes", + ctx.pullup_len ); + let xde = get_xde_state(); + xde.stats.vals.mcast_rx_pullup_fail().incr(1); + continue; + }; + match devs.get_by_key(*el) { + Some(dev) => { + // DTrace probe: RX local delivery + let (af, addr_ptr) = match &ctx.inner_dst { + oxide_vpc::api::IpAddr::Ip4(v4) => ( + 2usize, + 
AsRef::<[u8]>::as_ref(v4).as_ptr() as uintptr_t, + ), + oxide_vpc::api::IpAddr::Ip6(v6) => ( + 26usize, + AsRef::<[u8]>::as_ref(v6).as_ptr() as uintptr_t, + ), + }; + __dtrace_probe_mcast__local__delivery( + af, + addr_ptr, + ctx.vni.as_u32() as uintptr_t, + dev.port.name_cstr().as_ptr() as uintptr_t, + ); + xde_rx_one_direct(stream, dev, *el, my_pkt, postbox); + let xde = get_xde_state(); + xde.stats.vals.mcast_rx_external().incr(1); + } + None => { + let xde = get_xde_state(); + xde.stats.vals.mcast_rx_stale_external().incr(1); + } } } + } else { + // No forwarding entry and no local listeners + let xde = get_xde_state(); + xde.stats.vals.mcast_rx_no_fwd_entry().incr(1); } + return; + } - None => { - opte::engine::dbg!( - "underlay dest is same as src but the Port was not found \ - vni = {}, mac = {}", - vni.as_u32(), - ether_dst - ); + // External/customer delivery if delivery mode is External or All. + // + // Loop Prevention: If the incoming packet has Underlay or All replication set, + // it means this packet has already been relayed by another host and we should + // NOT deliver it locally. 
This prevents: + // - Duplicate delivery to local listeners + // - Infinite forwarding loops in the underlay network + let do_external = matches!( + delivery_mode, + oxide_vpc::api::Replication::External + | oxide_vpc::api::Replication::All + ) && !matches!( + ctx.incoming_delivery_mode, + Some(oxide_vpc::api::Replication::Underlay) + | Some(oxide_vpc::api::Replication::All) + ); + + if do_external && let Some(ports) = devs.mcast_listeners(&ctx.inner_dst) { + // Deliver to local listeners in the same VNI only + for el in ports { + // Filter by VNI - only deliver to listeners in the incoming packet's VNI + if el.vni() != ctx.vni { + continue; + } + + let Ok(my_pkt) = ctx.pkt.pullup(NonZeroUsize::new(ctx.pullup_len)) + else { + opte::engine::dbg!( + "mcast RX external pullup failed: requested {} bytes", + ctx.pullup_len + ); + let xde = get_xde_state(); + xde.stats.vals.mcast_rx_pullup_fail().incr(1); + continue; + }; + match devs.get_by_key(*el) { + Some(dev) => { + // DTrace probe: RX local delivery (with forwarding entry) + let (af, addr_ptr) = match &ctx.inner_dst { + oxide_vpc::api::IpAddr::Ip4(v4) => { + (2usize, v4 as *const _ as uintptr_t) + } + oxide_vpc::api::IpAddr::Ip6(v6) => { + (26usize, v6 as *const _ as uintptr_t) + } + }; + __dtrace_probe_mcast__local__delivery( + af, + addr_ptr, + ctx.vni.as_u32() as uintptr_t, + dev.port.name_cstr().as_ptr() as uintptr_t, + ); + xde_rx_one_direct(stream, dev, *el, my_pkt, postbox); + let xde = get_xde_state(); + xde.stats.vals.mcast_rx_external().incr(1); + } + None => { + let xde = get_xde_state(); + xde.stats.vals.mcast_rx_stale_external().incr(1); + } + } + } + } + + // Underlay/infrastructure forwarding if delivery mode is Underlay or All + // For Rx path, this would mean we're acting as a multicast relay/router + // + // Loop prevention: Don't relay if incoming packet already has Underlay or All + // replication set in its Geneve option, as this indicates it has already been + // relayed by another host. 
+ let should_relay = matches!( + delivery_mode, + oxide_vpc::api::Replication::Underlay + | oxide_vpc::api::Replication::All + ) && !matches!( + ctx.incoming_delivery_mode, + Some(oxide_vpc::api::Replication::Underlay) + | Some(oxide_vpc::api::Replication::All) + ); + + if should_relay { + // Re-acquire locks for underlay forwarding + let xde = get_xde_state(); + let mgmt = xde.management_lock.lock(); + let mcast_fwd = mgmt.mcast_fwd.read(); + + if let Some(next_hops) = mcast_fwd.get(&ctx.inner_dst) { + // Get routing info from any local device (all share same underlay) + let routing_dev = devs.iter().next(); + + for (next_hop, repl) in next_hops.iter() { + // Only forward to underlay destinations + if !matches!( + repl, + oxide_vpc::api::Replication::Underlay + | oxide_vpc::api::Replication::All + ) { + continue; + } + + // Clone the packet for this destination + let Ok(mut fwd_pkt) = + ctx.pkt.pullup(NonZeroUsize::new(ctx.pullup_len)) + else { + opte::engine::dbg!( + "mcast RX underlay relay pullup failed: requested {} bytes", + ctx.pullup_len + ); + let xde = get_xde_state(); + xde.stats.vals.mcast_rx_pullup_fail().incr(1); + continue; + }; + + // NOTE: For multicast underlay relaying, we do NOT modify the outer + // IPv6 destination. It's already set to the multicast underlay address + // (e.g., ff04::...224.1.2.4) by OPTE's encapsulation layer. + // The next_hop.addr is only used for routing/MAC lookup, which returns + // MAC addresses without modifying the packet. 
+ + // Modify VNI in Geneve header to next_hop.vni + // Use the Geneve offset calculated from parsed headers to handle VLANs and IPv6 extensions + let geneve_offset = ctx.geneve_offset; + + // VNI is at offset 4 in Geneve header (3 bytes) + if let Some(vni_bytes) = + fwd_pkt.get_mut(geneve_offset + 4..geneve_offset + 7) + { + let vni_be = next_hop.vni.as_u32().to_be_bytes(); + vni_bytes.copy_from_slice(&vni_be[1..4]); // VNI is 24 bits + } + // Mark multicast replication as Underlay/All to avoid re-relay by downstream receivers. + update_mcast_replication(&mut fwd_pkt, geneve_offset, *repl); + + // Compute hash once for both routing and flow distribution + let l4_hash = { + use core::hash::Hash; + let mut hasher = crc32fast::Hasher::new(); + next_hop.addr.hash(&mut hasher); + hasher.finalize() + }; + + // Get routing information if we have a device + let (mac_src, mac_dst) = if let Some(dev) = routing_dev { + let route_key = + RouteKey { dst: next_hop.addr, l4_hash: Some(l4_hash) }; + let Route { src, dst, .. 
} = + dev.routes.next_hop(route_key, dev); + (src, dst) + } else { + // No devices available for routing - use zero MACs + use opte::engine::ether::EtherAddr; + (EtherAddr::zero(), EtherAddr::zero()) + }; + + // Fill in outer MAC addresses + let final_pkt = unsafe { + let mblk = fwd_pkt.unwrap_mblk().as_ptr(); + let rptr = (*mblk).b_rptr; + ptr::copy(mac_dst.as_ptr(), rptr, 6); + ptr::copy(mac_src.as_ptr(), rptr.add(6), 6); + MsgBlk::wrap_mblk(mblk).unwrap() + }; + + // Send to underlay via stream (same underlay we received from) + stream.tx_drop_on_no_desc( + final_pkt, + TxHint::from_crc32(l4_hash), + MacTxFlags::empty(), + ); + + xde.stats.vals.mcast_rx_underlay().incr(1); + } } + + drop(mcast_fwd); + drop(mgmt); } } @@ -1999,6 +2758,21 @@ fn xde_mc_tx_one<'a>( let old_len = parsed_pkt.len(); let meta = parsed_pkt.meta(); + + // Extract inner destination IP for potential multicast processing + use opte::engine::ip::ValidL3; + use opte::engine::ip::v4::Ipv4Ref; + use opte::engine::ip::v6::Ipv6Ref; + let inner_dst_ip = match &meta.inner_l3 { + Some(ValidL3::Ipv4(v4)) => { + Some(oxide_vpc::api::IpAddr::from(v4.destination())) + } + Some(ValidL3::Ipv6(v6)) => { + Some(oxide_vpc::api::IpAddr::from(v6.destination())) + } + None => None, + }; + let Ok(non_eth_payl_bytes) = u32::try_from((&meta.inner_l3, &meta.inner_ulp).packet_length()) else { @@ -2006,6 +2780,8 @@ fn xde_mc_tx_one<'a>( return; }; + let inner_eth_len = meta.inner_eth.packet_length(); + let ulp_meoi = match meta.ulp_meoi(old_len) { Ok(ulp_meoi) => ulp_meoi, Err(e) => { @@ -2027,6 +2803,9 @@ fn xde_mc_tx_one<'a>( return; } + // Multicast packets go through normal port.process() which will use M2P + // for encapsulation. After that, we intercept them for unicast replication. 
+ let port = &src_dev.port; // The port processing code will fire a probe that describes what @@ -2039,24 +2818,34 @@ fn xde_mc_tx_one<'a>( // If the outer IPv6 destination is the same as the // source, then we need to loop the packet inbound to the // guest on this same host. - let (ip6_src, ip6_dst) = match emit_spec.outer_ip6_addrs() { - Some(v) => v, - None => { - // XXX add SDT probe - // XXX add stat - opte::engine::dbg!("no outer IPv6 header, dropping"); - return; - } + let Some((ip6_src, ip6_dst)) = emit_spec.outer_ip6_addrs() else { + // XXX add SDT probe + // XXX add stat + opte::engine::dbg!("no outer IPv6 header, dropping"); + return; }; - let vni = match emit_spec.outer_encap_vni() { - Some(vni) => vni, - None => { - // XXX add SDT probe - // XXX add stat - opte::engine::dbg!("no geneve header, dropping"); - return; - } + // EmitSpec applies pushes/pops, but modifications will have occurred + // by this point. Pull destination MAC to allow us to reuse code + // between unicast & multicast loopback. + // + // Ingot will have asserted that Ethernet came first, and that it was + // contiguous. 
+ let Some(ether_dst) = pkt + .get(..size_of::()) + .map(|v| MacAddr::from_const(v.try_into().unwrap())) + else { + // XXX add SDT probe + // XXX add stat + opte::engine::dbg!("couldn't re-read inner MAC, dropping"); + return; + }; + + let Some(vni) = emit_spec.outer_encap_vni() else { + // XXX add SDT probe + // XXX add stat + opte::engine::dbg!("no geneve header, dropping"); + return; }; let Some(tun_meoi) = emit_spec.encap_meoi() else { @@ -2074,7 +2863,21 @@ fn xde_mc_tx_one<'a>( if ip6_src == ip6_dst { let entry_state = entry_state.get_or_insert_with(|| src_dev.port_map.read()); - guest_loopback(src_dev, entry_state, out_pkt, vni, postbox); + + let key = VniMac::new(vni, ether_dst); + if let Some(dest_dev) = entry_state.get_by_key(key) { + // We have found a matching Port on this host; "loop back" + // the packet into the inbound processing path of the + // destination Port. + guest_loopback(src_dev, dest_dev, key, out_pkt, postbox); + } else { + opte::engine::dbg!( + "underlay dest is same as src but the Port was not found \ + vni = {}, mac = {}", + vni.as_u32(), + ether_dst + ); + } return; } @@ -2086,6 +2889,52 @@ fn xde_mc_tx_one<'a>( return; }; + // For a multicast outbound frame, deliver to external/customer members + // (local guest instances) and/or underlay/infrastructure members + // based on the replication configuration. 
+ // Check if this is a multicast packet by examining the outer IPv6 destination + // For multicast, OPTE should have set it to an ff0x:: address + let is_mcast_packet = ip6_dst.is_multicast(); + + if is_mcast_packet { + // This is a multicast packet - determine the inner destination + // from the packet contents or use a fallback + let inner_dst = inner_dst_ip.unwrap_or_else(|| { + // Fallback: derive from outer IPv6 multicast address + // For IPv4 multicast mapped to IPv6, the last 4 bytes contain the IPv4 address + if ip6_dst.bytes()[0] == 0xff && ip6_dst.bytes()[1] == 0x04 + { + // Admin-scoped IPv6 multicast, likely mapped from IPv4 + let bytes = ip6_dst.bytes(); + oxide_vpc::api::IpAddr::Ip4( + oxide_vpc::api::Ipv4Addr::from([ + bytes[12], bytes[13], bytes[14], bytes[15], + ]), + ) + } else { + // Use the IPv6 multicast address directly + oxide_vpc::api::IpAddr::Ip6(ip6_dst) + } + }); + + handle_mcast_tx( + MulticastTxContext { + inner_dst, + vni, + out_pkt: &out_pkt, + encap_len, + inner_eth_len, + non_eth_payl_bytes, + tun_meoi: &tun_meoi, + l4_hash, + }, + src_dev, + postbox, + entry_state, + ); + return; + } + // 'MSS boosting' is performed here -- we set a 9k (minus overheads) // MSS for compatible TCP traffic. This is a kind of 'pseudo-GRO', // sending larger frames internally rather than having the NIC/OS @@ -2333,7 +3182,7 @@ fn new_port( name: String, cfg: &VpcCfg, vpc_map: Arc, - v2p: Arc, + vni_state: Arc, v2b: Arc, ectx: Arc, dhcp_cfg: &DhcpCfg, @@ -2353,10 +3202,10 @@ fn new_port( // XXX some layers have no need for LFT, perhaps have two types // of Layer: one with, one without? 
- gateway::setup(&pb, &cfg, vpc_map, FT_LIMIT_ONE, dhcp_cfg)?; + gateway::setup(&pb, &cfg, vpc_map.clone(), FT_LIMIT_ONE, dhcp_cfg)?; router::setup(&pb, &cfg, FT_LIMIT_ONE)?; nat::setup(&mut pb, &cfg, nat_ft_limit)?; - overlay::setup(&pb, &cfg, v2p, v2b, FT_LIMIT_ONE)?; + overlay::setup(&pb, &cfg, vni_state, vpc_map, v2b, FT_LIMIT_ONE)?; // Set the overall unified flow and TCP flow table limits based on the total // configuration above, by taking the maximum of size of the individual @@ -2368,7 +3217,9 @@ fn new_port( let limit = NonZeroU32::new(FW_FT_LIMIT.get().max(nat_ft_limit.get())).unwrap(); let net = VpcNetwork { cfg }; - Ok(Arc::new(pb.create(net, limit, limit)?)) + #[allow(clippy::arc_with_non_send_sync)] + let port = Arc::new(pb.create(net, limit, limit)?); + Ok(port) } #[unsafe(no_mangle)] @@ -2454,10 +3305,17 @@ unsafe extern "C" fn xde_rx( head } -/// Processes an individual packet receiver on the underlay device `stream`. +/// Processes an individual packet received on the underlay device `stream`. /// /// This function returns any input `pkt` which is not of interest to XDE (e.g., /// the packet is not Geneve over v6, or no matching OPTE port could be found). +/// +/// `xde_rx_one_direct` largely replicates this function due to lifetime issues +/// around parsing, so changes here may need to be made there too. We could do this +/// with a single function using an `enum` control parameter (e.g., +/// `DoMcastCheck(&DevMap)`, `DeliverDirect(&XdeDev, VniMac)`) but we'd be +/// really reliant on rustc interpreting these as static choices and inlining +/// accordingly. 
#[inline] fn xde_rx_one( stream: &DlsStream, @@ -2490,6 +3348,64 @@ fn xde_rx_one( let meta = parsed_pkt.meta(); let old_len = parsed_pkt.len(); + let ip6_dst = meta.outer_v6.destination(); + if ip6_dst.is_multicast() { + let pullup_len = ( + &meta.outer_eth, + &meta.outer_v6, + &meta.outer_udp, + &meta.outer_encap, + &meta.inner_eth, + &meta.inner_l3, + &meta.inner_ulp, + ) + .packet_length(); + debug_assert!( + pullup_len > 0, + "pullup_len should be non-zero for valid multicast packet" + ); + let vni = meta.outer_encap.vni(); + + // Extract inner destination IP for multicast processing + use opte::engine::ip::ValidL3; + use opte::engine::ip::v4::Ipv4Ref; + use opte::engine::ip::v6::Ipv6Ref; + let inner_dst = match &meta.inner_l3 { + ValidL3::Ipv4(v4) => oxide_vpc::api::IpAddr::from(v4.destination()), + ValidL3::Ipv6(v6) => oxide_vpc::api::IpAddr::from(v6.destination()), + }; + + // Extract multicast delivery mode from Geneve options + let incoming_delivery_mode = + oxide_vpc::engine::geneve::extract_multicast_replication( + &meta.outer_encap, + ); + + // Calculate Geneve offset from parsed outer header lengths (robust to VLANs and IPv6 extensions) + let geneve_offset = meta.outer_eth.packet_length() + + meta.outer_v6.packet_length() + + meta.outer_udp.packet_length(); + + // Drop the parsed packet before calling handle_mcast_rx + drop(parsed_pkt); + + // Handle multicast packets using the XDE-wide forwarding table + handle_mcast_rx( + MulticastRxContext { + inner_dst, + vni, + pkt: &pkt, + pullup_len, + geneve_offset, + incoming_delivery_mode, + }, + stream, + devs, + postbox, + ); + return None; + } + let ulp_meoi = match meta.ulp_meoi(old_len) { Ok(ulp_meoi) => ulp_meoi, Err(e) => { @@ -2595,6 +3511,117 @@ fn xde_rx_one( None } +/// Processes an individual packet after multicast replication has taken place. +/// This primarily duplicates `xde_rx_one`. +/// +/// Lifetimes (arond Packet etc.) 
will make this difficult to simplify +/// the expression of both this and its original implementation. We could insert +/// the body using macros, but then we really lose a lot (line numbers on crash, +/// subpar rust-analyzer integration)... +#[inline] +fn xde_rx_one_direct( + stream: &DlsStream, + dev: &XdeDev, + port_key: VniMac, + mut pkt: MsgBlk, + postbox: &mut Postbox, +) { + // TODO: it would be great if we could tell Ingot 'here are all the + // layer lengths/types, please believe that they are correct'. And then + // to plumb that through `NetworkParser`. I can't say that I *like* + // doing this reparse here post-replication. + let parser = VpcParser {}; + let parsed_pkt = Packet::parse_inbound(pkt.iter_mut(), parser) + .expect("this is a reparse of a known-valid packet"); + + let meta = parsed_pkt.meta(); + let old_len = parsed_pkt.len(); + + let ulp_meoi = match meta.ulp_meoi(old_len) { + Ok(ulp_meoi) => ulp_meoi, + Err(e) => { + opte::engine::dbg!("{}", e); + return; + } + }; + + let non_payl_bytes = u32::from(ulp_meoi.meoi_l2hlen) + + u32::from(ulp_meoi.meoi_l3hlen) + + u32::from(ulp_meoi.meoi_l4hlen); + + // Large TCP frames include their MSS in-band, as recipients can require + // this to correctly process frames which have been split into + // larger chunks. + // + // This will be set to a nonzero value when TSO has been asked of the + // source packet. + let is_tcp = matches!(meta.inner_ulp, ValidUlp::Tcp(_)); + let recovered_mss = if is_tcp { + let mut out = None; + for opt in WalkOptions::from_raw(&meta.outer_encap) { + let Ok(opt) = opt else { break }; + if let Some(ValidOxideOption::Mss(el)) = opt.option.known() { + out = NonZeroU32::new(el.mss()); + break; + } + } + out + } else { + None + }; + + // We are in passthrough mode, skip OPTE processing.
+ if dev.passthrough { + drop(parsed_pkt); + postbox.post(port_key, pkt); + return; + } + + let port = &dev.port; + + let res = port.process(Direction::In, parsed_pkt); + + match res { + Ok(ProcessResult::Modified(emit_spec)) => { + let mut npkt = emit_spec.apply(pkt); + let len = npkt.byte_len(); + let pay_len = len + - usize::try_from(non_payl_bytes) + .expect("usize > 32b on x86_64"); + + // Due to possible pseudo-GRO, we need to inform mac/viona on how + // it can split up this packet, if the guest cannot receive it + // (e.g., no GRO/large frame support). + // HW_LSO will cause viona to treat this packet as though it were + // a locally delivered segment making use of LSO. + if let Some(mss) = recovered_mss + // This packet could be the last segment of a split frame at + // which point it could be smaller than the original MSS. + // Don't re-tag the MSS if so, as guests may be confused and + // MAC emulation will reject the packet if the guest does not + // support GRO. + && pay_len > usize::try_from(mss.get()).expect("usize > 32b on x86_64") + { + npkt.request_offload(MblkOffloadFlags::HW_LSO, mss.get()); + } + + if let Err(e) = npkt.fill_parse_info(&ulp_meoi, None) { + opte::engine::err!("failed to set offload info: {}", e); + } + + postbox.post(port_key, npkt); + } + Ok(ProcessResult::Hairpin(hppkt)) => { + stream.tx_drop_on_no_desc( + hppkt, + TxHint::NoneOrMixed, + MacTxFlags::empty(), + ); + } + _ => {} + } +} + #[unsafe(no_mangle)] fn add_router_entry_hdlr(env: &mut IoctlEnvelope) -> Result { let req: AddRouterEntryReq = env.copy_in_req()?; @@ -2682,6 +3709,63 @@ fn dump_v2p_hdlr() -> Result { Ok(state.vpc_map.dump()) } +#[unsafe(no_mangle)] +fn set_m2p_hdlr(env: &mut IoctlEnvelope) -> Result { + let req: SetMcast2PhysReq = env.copy_in_req()?; + + // Validate VNI is DEFAULT_MULTICAST_VNI for fleet-level multicast + if req.vni.as_u32() != DEFAULT_MULTICAST_VNI { + return Err(OpteError::System { + // Propagate an actionable errno so userspace sees an 
error + errno: EINVAL, + msg: format!( + "multicast VNI must be DEFAULT_MULTICAST_VNI ({}), got: {}", + DEFAULT_MULTICAST_VNI, + req.vni.as_u32() + ), + }); + } + + // Validate underlay multicast address is admin-scoped IPv6 (ff04, ff05, or ff08) + // Per Omicron constraints: underlay must be admin-scoped for rack-internal routing + let first_byte = req.underlay.bytes()[0]; + let second_byte = req.underlay.bytes()[1]; + // Check if it's multicast (ff00::/8) and admin-scoped (ff04, ff05, ff08) + if first_byte != 0xff + || (second_byte != 0x04 && second_byte != 0x05 && second_byte != 0x08) + { + return Err(OpteError::InvalidUnderlayMulticast(format!( + "underlay multicast address must be admin-scoped IPv6 (ff04::/16, ff05::/16, or ff08::/16), got: {}", + req.underlay + ))); + } + + let state = get_xde_state(); + state.vpc_map.add_mcast(req.group, req.underlay, req.vni)?; + Ok(NoResp::default()) +} + +#[unsafe(no_mangle)] +fn clear_m2p_hdlr(env: &mut IoctlEnvelope) -> Result { + let req: ClearMcast2PhysReq = env.copy_in_req()?; + + // Validate VNI is DEFAULT_MULTICAST_VNI (77) for fleet-level multicast + if req.vni.as_u32() != DEFAULT_MULTICAST_VNI { + return Err(OpteError::System { + errno: EINVAL, + msg: format!( + "multicast VNI must be DEFAULT_MULTICAST_VNI ({}), got: {}", + DEFAULT_MULTICAST_VNI, + req.vni.as_u32() + ), + }); + } + + let state = get_xde_state(); + state.vpc_map.del_mcast(req.group, req.underlay, req.vni); + Ok(NoResp::default()) +} + #[unsafe(no_mangle)] fn set_v2b_hdlr(env: &mut IoctlEnvelope) -> Result { let req: SetVirt2BoundaryReq = env.copy_in_req()?; @@ -2704,6 +3788,121 @@ fn dump_v2b_hdlr() -> Result { Ok(state.v2b.dump()) } +#[unsafe(no_mangle)] +fn set_mcast_forwarding_hdlr( + env: &mut IoctlEnvelope, +) -> Result { + let req: SetMcastForwardingReq = env.copy_in_req()?; + let state = get_xde_state(); + + // Fleet-level multicast: enforce DEFAULT_MULTICAST_VNI for any next hop + // that will result in underlay forwarding (Underlay/All). 
+ for (nh, rep) in &req.next_hops { + if matches!(rep, Replication::Underlay | Replication::All) + && nh.vni.as_u32() != DEFAULT_MULTICAST_VNI + { + return Err(OpteError::System { + errno: EINVAL, + msg: format!( + "multicast next-hop VNI must be DEFAULT_MULTICAST_VNI ({}), got: {}", + DEFAULT_MULTICAST_VNI, + nh.vni.as_u32() + ), + }); + } + } + + let token = state.management_lock.lock(); + let mut mcast_fwd = token.mcast_fwd.write(); + + // Convert Vec into BTreeMap + let next_hop_map: BTreeMap = + req.next_hops.into_iter().collect(); + + mcast_fwd.insert(req.group, next_hop_map); + + Ok(NoResp::default()) +} + +#[unsafe(no_mangle)] +fn clear_mcast_forwarding_hdlr( + env: &mut IoctlEnvelope, +) -> Result { + let req: ClearMcastForwardingReq = env.copy_in_req()?; + let state = get_xde_state(); + + let token = state.management_lock.lock(); + let mut mcast_fwd = token.mcast_fwd.write(); + + mcast_fwd.remove(&req.group); + + Ok(NoResp::default()) +} + +#[unsafe(no_mangle)] +fn dump_mcast_forwarding_hdlr() -> Result { + let state = get_xde_state(); + + let token = state.management_lock.lock(); + let mcast_fwd = token.mcast_fwd.read(); + + let entries: Vec = mcast_fwd + .iter() + .map(|(group, next_hops)| McastForwardingEntry { + group: *group, + next_hops: next_hops.iter().map(|(nh, rep)| (*nh, *rep)).collect(), + }) + .collect(); + + Ok(DumpMcastForwardingResp { entries }) +} + +#[unsafe(no_mangle)] +fn mcast_subscribe_hdlr(env: &mut IoctlEnvelope) -> Result { + let req: McastSubscribeReq = env.copy_in_req()?; + let state = get_xde_state(); + + // Update under management lock so we can refresh DevMap views used by TX/RX + let token = state.management_lock.lock(); + { + let mut devs = token.devs.write(); + devs.mcast_subscribe(&req.port_name, req.group)?; + refresh_maps( + devs, + token + .underlay + .as_ref() + .expect("underlay must exist while ports exist"), + ); + } + + Ok(NoResp::default()) +} + +#[unsafe(no_mangle)] +fn mcast_unsubscribe_hdlr( + env: &mut 
IoctlEnvelope, +) -> Result { + let req: McastUnsubscribeReq = env.copy_in_req()?; + let state = get_xde_state(); + + // Update under management lock so we can refresh DevMap views used by TX/RX + let token = state.management_lock.lock(); + { + let mut devs = token.devs.write(); + devs.mcast_unsubscribe(&req.port_name, req.group)?; + refresh_maps( + devs, + token + .underlay + .as_ref() + .expect("underlay must exist while ports exist"), + ); + } + + Ok(NoResp::default()) +} + #[unsafe(no_mangle)] fn list_layers_hdlr( env: &mut IoctlEnvelope,