Skip to content

Commit 36abfc6

Browse files
authored
feat: and now enhance machine_interfaces with a full MachineBootInterface (#2285)
1 parent af5c576 commit 36abfc6

9 files changed

Lines changed: 174 additions & 47 deletions

File tree

crates/api-core/src/handlers/managed_host.rs

Lines changed: 4 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -153,21 +153,6 @@ pub(crate) async fn set_primary_dpu(
153153
})?
154154
.mac_address;
155155

156-
// The operator picked this primary NIC by MAC. Resolve its Redfish interface
157-
// id from the host's exploration report so we can send a complete pair (which
158-
// enables the interface-id fallback); if the report has no id for it, target
159-
// the MAC alone.
160-
let boot_interface_id = db::explored_endpoints::find_by_ips(&mut txn, vec![bmc_addr])
161-
.await?
162-
.into_iter()
163-
.next()
164-
.and_then(|endpoint| {
165-
endpoint
166-
.report
167-
.find_interface_id_for_mac(primary_interface_mac_address)
168-
.map(str::to_string)
169-
});
170-
171156
txn.rollback().await?;
172157

173158
let Some(current_primary_interface_id) = current_primary_interface_id else {
@@ -183,10 +168,10 @@ pub(crate) async fn set_primary_dpu(
183168
.into());
184169
};
185170

186-
// Set the boot device. Send the complete (MAC + interface id) pair when the
187-
// report gave us an id, so the boot-order call can fall back to the interface
188-
// id; otherwise target the MAC alone.
189-
let boot_target = match boot_interface_id {
171+
// Set the boot device. The new primary interface row already stores its
172+
// Redfish interface id, so send the complete (MAC + id) pair when present,
173+
// allowing for interface ID fallback (and target the MAC alone otherwise).
174+
let boot_target = match new_primary_interface.boot_interface_id.clone() {
190175
Some(interface_id) => BootInterfaceTarget::Pair(MachineBootInterface {
191176
mac_address: primary_interface_mac_address,
192177
interface_id,

crates/api-core/src/tests/site_explorer.rs

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3410,6 +3410,49 @@ async fn test_site_explorer_auto_corrects_nic_mode_per_expected_machine(
34103410
Ok(())
34113411
}
34123412

3413+
/// A managed host's DPU-facing `machine_interface` is created (via DHCP) with
3414+
/// just a MAC and no `boot_interface_id`. The exploration that ingests the host
3415+
/// then backfills the vendor-specific Redfish interface id onto that row, matched
3416+
/// by MAC, at which point the primary interface ends up with a full `MachineBootInterface`.
3417+
/// This is the same backfill path any DHCP-derived interface takes (the capture is
3418+
/// keyed on MAC, not on how the row was created).
3419+
#[sqlx_test]
3420+
async fn test_site_explorer_backfills_boot_interface_id_onto_machine_interface(
3421+
pool: PgPool,
3422+
) -> Result<(), Box<dyn std::error::Error>> {
3423+
let env = common::api_fixtures::create_test_env(pool.clone()).await;
3424+
3425+
let dpu = DpuConfig::default();
3426+
let host_pf_mac = dpu.host_mac_address;
3427+
let mh = common::api_fixtures::create_managed_host_with_config(
3428+
&env,
3429+
ManagedHostConfig::with_dpus(vec![dpu]),
3430+
)
3431+
.await;
3432+
3433+
let mut txn = env.pool.begin().await?;
3434+
let interfaces = db::machine_interface::find_by_machine_ids(&mut txn, &[mh.id]).await?;
3435+
let primary = interfaces
3436+
.get(&mh.id)
3437+
.into_iter()
3438+
.flatten()
3439+
.find(|i| i.primary_interface)
3440+
.expect("ingested host should have a primary machine_interface");
3441+
3442+
// The primary row is the DPU host-PF interface (same factory MAC), now
3443+
// holding both halves of the pair: its MAC plus the Redfish interface id the
3444+
// host report named for it. The `ManagedHostConfig` fixture ids its DPU
3445+
// interfaces "NIC.Slot.{index + 5}-1", so the first DPU is "NIC.Slot.5-1".
3446+
assert_eq!(primary.mac_address, host_pf_mac);
3447+
assert_eq!(
3448+
primary.boot_interface_id.as_deref(),
3449+
Some("NIC.Slot.5-1"),
3450+
"exploration should backfill the Redfish interface id onto the machine_interface row",
3451+
);
3452+
3453+
Ok(())
3454+
}
3455+
34133456
/// A Managed Host whose `expected_machines` row is later removed becomes an
34143457
/// orphan: `audit_exploration_results` emits an `OrphanManagedHost` health
34153458
/// alert on the host's Machine. Re-adding the entry clears the alert on the
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
-- Add boot_interface_id to machine_interfaces so each interface row holds a
2+
-- full MachineBootInterface pair with the MAC plus the vendor-named Redfish
3+
-- EthernetInterface.Id.
4+
ALTER TABLE machine_interfaces ADD COLUMN boot_interface_id text;

crates/api-db/src/machine_interface.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,25 @@ pub async fn set_primary_interface(
182182
.map_err(|e| DatabaseError::query(query, e))
183183
}
184184

185+
/// Records the vendor-native Redfish `EthernetInterface.Id` on the machine_interface
186+
/// row(s) with the given MAC. Captured by site-explorer per exploration; callers only
187+
/// invoke this when the id resolves from the current report, so a wiped MAC leaves the
188+
/// last-known-good id in place.
189+
pub async fn set_boot_interface_id(
190+
mac_address: MacAddress,
191+
boot_interface_id: &str,
192+
txn: &mut PgConnection,
193+
) -> Result<(), DatabaseError> {
194+
let query = "UPDATE machine_interfaces SET boot_interface_id=$1 WHERE mac_address=$2";
195+
sqlx::query(query)
196+
.bind(boot_interface_id)
197+
.bind(mac_address)
198+
.execute(txn)
199+
.await
200+
.map(|_| ())
201+
.map_err(|e| DatabaseError::query(query, e))
202+
}
203+
185204
pub async fn associate_interface_with_dpu_machine(
186205
interface_id: &MachineInterfaceId,
187206
dpu_machine_id: &MachineId,

crates/api-model/src/machine/capabilities.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -630,6 +630,7 @@ mod tests {
630630
interface_type: InterfaceType::Data,
631631
primary_interface: true,
632632
mac_address: MacAddress::from_str("08:c0:eb:cb:0e:96").unwrap(),
633+
boot_interface_id: None,
633634
attached_dpu_machine_id: Some(
634635
MachineId::from_str(
635636
"fm100dsbiu5ckus880v8407u0mkcensa39cule26im5gnpvmuufckacguc0",
@@ -654,6 +655,7 @@ mod tests {
654655
interface_type: InterfaceType::Data,
655656
primary_interface: true,
656657
mac_address: MacAddress::from_str("08:c0:eb:cb:0e:97").unwrap(),
658+
boot_interface_id: None,
657659
attached_dpu_machine_id: Some(
658660
MachineId::from_str(
659661
"fm100dsg23d2f4tq4tt5m2hgib5pcldrm3gvefbduau7gj3itgc3iqg3lpg",
@@ -756,6 +758,7 @@ mod tests {
756758
interface_type: InterfaceType::Data,
757759
primary_interface: true,
758760
mac_address: MacAddress::from_str("00:00:00:00:00:00").unwrap(),
761+
boot_interface_id: None,
759762
attached_dpu_machine_id: None,
760763
domain_id: None,
761764
machine_id: None,

crates/api-model/src/machine/mod.rs

Lines changed: 75 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -131,11 +131,6 @@ pub struct ManagedHostStateSnapshot {
131131
pub host_snapshot: Machine,
132132
pub dpu_snapshots: Vec<Machine>,
133133
pub dpa_interface_snapshots: Vec<DpaInterface>,
134-
/// The host's boot interface (MAC + Redfish interface id), captured by
135-
/// site-explorer. Populated at read time by `load_object_state` (like
136-
/// `dpa_interface_snapshots`); `None` until the host has been explored with
137-
/// a fully-resolved boot interface.
138-
pub boot_interface: Option<MachineBootInterface>,
139134
/// If there is an instance provisioned on top of the machine, this holds
140135
/// its state
141136
pub instance: Option<InstanceSnapshot>,
@@ -213,8 +208,6 @@ impl<'r> sqlx::FromRow<'r, sqlx::postgres::PgRow> for ManagedHostStateSnapshot {
213208
host_snapshot,
214209
dpu_snapshots,
215210
dpa_interface_snapshots,
216-
// Filled by load_object_state later (see dpa_interface_snapshots).
217-
boot_interface: None,
218211
managed_state,
219212
instance,
220213
rack_health_overrides,
@@ -280,19 +273,38 @@ impl From<ManagedHostStateSnapshotError> for sqlx::Error {
280273
/// so it's on the caller to figure out. What this usually means is the
281274
/// caller passes `boot_interface_mac: None` to machine_setup, and then
282275
/// subsequent logic flows from there (e.g. ::NoDpu handling).
283-
fn pick_boot_interface_mac(
276+
fn pick_boot_interface(
284277
interfaces: &[MachineInterfaceSnapshot],
285-
) -> Option<mac_address::MacAddress> {
278+
) -> Option<&MachineInterfaceSnapshot> {
286279
// The primary wins!
287280
if let Some(primary) = interfaces.iter().find(|x| x.primary_interface) {
288-
return Some(primary.mac_address);
281+
return Some(primary);
289282
}
290283
// ..no primary, so let's try to find *some* interface.
291284
interfaces
292285
.iter()
293286
.filter(|x| x.network_segment_type != Some(NetworkSegmentType::Underlay))
294287
.min_by_key(|x| x.mac_address)
295-
.map(|x| x.mac_address)
288+
}
289+
290+
fn pick_boot_interface_mac(
291+
interfaces: &[MachineInterfaceSnapshot],
292+
) -> Option<mac_address::MacAddress> {
293+
pick_boot_interface(interfaces).map(|x| x.mac_address)
294+
}
295+
296+
/// Resolves the boot interface to a fully-populated [`MachineBootInterface`]
297+
/// (MAC + Redfish interface id) from the picked interface's own row. Split out
298+
/// like `pick_boot_interface_mac` so it's unit-testable without a full snapshot.
299+
fn pick_boot_interface_pair(
300+
interfaces: &[MachineInterfaceSnapshot],
301+
) -> Option<MachineBootInterface> {
302+
pick_boot_interface(interfaces).and_then(|interface| {
303+
MachineBootInterface::from_parts(
304+
Some(interface.mac_address),
305+
interface.boot_interface_id.clone(),
306+
)
307+
})
296308
}
297309

298310
impl ManagedHostStateSnapshot {
@@ -365,6 +377,19 @@ impl ManagedHostStateSnapshot {
365377
pick_boot_interface_mac(&self.host_snapshot.interfaces)
366378
}
367379

380+
/// Returns the host's boot interface as a fully-populated
381+
/// [`MachineBootInterface`] (MAC + Redfish interface id), derived from the
382+
/// same `machine_interface` row (primary, or fallback pick) that [`Self::boot_interface_mac`]
383+
/// selects.
384+
///
385+
/// Returns `None` when that row hasn't captured a Redfish interface id yet
386+
/// (e.g. not yet explored, or a zero-DPU host) -- callers then target the MAC
387+
/// alone. Because the MAC and id come from one row, the pair can never name a
388+
/// different interface than `boot_interface_mac`.
389+
pub fn boot_interface(&self) -> Option<MachineBootInterface> {
390+
pick_boot_interface_pair(&self.host_snapshot.interfaces)
391+
}
392+
368393
/// Returns `true` if override report is hw_health, `false` otherwise.
369394
fn merge_override_report_with_hw_health(
370395
output: &mut HealthReport,
@@ -2321,6 +2346,11 @@ pub struct MachineInterfaceSnapshot {
23212346
pub interface_type: InterfaceType,
23222347
pub primary_interface: bool,
23232348
pub mac_address: MacAddress,
2349+
/// Vendor-native Redfish `EthernetInterface.Id` for this interface, captured
2350+
/// by site-explorer alongside the MAC. Combined with `mac_address` it forms a
2351+
/// [`MachineBootInterface`]; for the `primary_interface` row that pair is the
2352+
/// host's boot device.
2353+
pub boot_interface_id: Option<String>,
23242354
pub attached_dpu_machine_id: Option<MachineId>,
23252355
pub domain_id: Option<DomainId>,
23262356
pub machine_id: Option<MachineId>,
@@ -2345,6 +2375,7 @@ impl MachineInterfaceSnapshot {
23452375
machine_id: None,
23462376
segment_id: uuid::Uuid::nil().into(),
23472377
mac_address,
2378+
boot_interface_id: None,
23482379
hostname: String::new(),
23492380
interface_type: InterfaceType::Data,
23502381
primary_interface: true,
@@ -2643,6 +2674,7 @@ impl<'r> FromRow<'r, PgRow> for MachineInterfaceSnapshot {
26432674
hostname: row.try_get("hostname")?,
26442675
interface_type: row.try_get("interface_type")?,
26452676
mac_address: row.try_get("mac_address")?,
2677+
boot_interface_id: row.try_get("boot_interface_id")?,
26462678
primary_interface: row.try_get("primary_interface")?,
26472679
created: row.try_get("created")?,
26482680
last_dhcp: row.try_get("last_dhcp")?,
@@ -3320,6 +3352,38 @@ mod tests {
33203352
);
33213353
}
33223354

3355+
// boot_interface() derives the full pair from the SAME primary row that the
3356+
// MAC selection uses, so the MAC and id can never name different interfaces.
3357+
#[test]
3358+
fn pick_boot_interface_pair_uses_primary_rows_mac_and_id() {
3359+
let other = build_mock_interface(
3360+
"05:00:00:00:00:01",
3361+
false,
3362+
Some(NetworkSegmentType::HostInband),
3363+
);
3364+
let primary = MachineInterfaceSnapshot {
3365+
boot_interface_id: Some("NIC.Slot.7-1-1".to_string()),
3366+
..build_mock_interface("10:00:00:00:00:01", true, Some(NetworkSegmentType::Admin))
3367+
};
3368+
3369+
assert_eq!(
3370+
pick_boot_interface_pair(&[other, primary]),
3371+
Some(MachineBootInterface {
3372+
mac_address: "10:00:00:00:00:01".parse().unwrap(),
3373+
interface_id: "NIC.Slot.7-1-1".to_string(),
3374+
})
3375+
);
3376+
}
3377+
3378+
// When the primary row hasn't captured a Redfish interface id yet, there's no
3379+
// complete pair -- callers fall back to the MAC alone.
3380+
#[test]
3381+
fn pick_boot_interface_pair_is_none_without_captured_id() {
3382+
let primary =
3383+
build_mock_interface("10:00:00:00:00:01", true, Some(NetworkSegmentType::Admin));
3384+
assert_eq!(pick_boot_interface_pair(&[primary]), None);
3385+
}
3386+
33233387
// Check the case where only the BMC has been discovered so far (which
33243388
// is common during early ingestion). In this case, there's no valid boot MAC
33253389
// yet; callers fall back to the `::NoDpu` handling downstream.

crates/machine-controller/src/boot_interface.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,17 +21,17 @@ use model::machine::ManagedHostStateSnapshot;
2121

2222
/// Resolve how to target this host's boot interface for Redfish setup calls.
2323
///
24-
/// Prefers the stored, fully-captured boot interface (MAC + Redfish interface
25-
/// id), which enables the MAC-first / interface-id fallback; it falls back to
26-
/// the machine-interfaces-derived MAC (no id fallback) when no captured boot
27-
/// interface is available yet.
24+
/// Uses the host's primary `machine_interface`: when that row has a captured
25+
/// Redfish interface id, the full pair is returned (enabling the MAC-first /
26+
/// interface-id fallback); otherwise it targets the MAC alone. Both come from the
27+
/// same row, so the pair can never name a different interface than the MAC.
2828
///
2929
/// Returns `None` only when the host has no boot interface at all (e.g. only the
3030
/// BMC has been discovered, or the primary NIC hasn't appeared yet).
3131
pub fn boot_interface_target(
3232
mh_snapshot: &ManagedHostStateSnapshot,
3333
) -> Option<BootInterfaceTarget> {
34-
if let Some(boot_interface) = mh_snapshot.boot_interface.clone() {
34+
if let Some(boot_interface) = mh_snapshot.boot_interface() {
3535
return Some(BootInterfaceTarget::Pair(boot_interface));
3636
}
3737
mh_snapshot

crates/machine-controller/src/io.rs

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -103,18 +103,6 @@ impl StateControllerIO for MachineStateControllerIO {
103103
let dpa_snapshots =
104104
db::dpa_interface::find_by_machine_id(&mut *txn, *machine_id).await?;
105105
retstate.dpa_interface_snapshots = dpa_snapshots;
106-
107-
// Populate the host's stored boot interface (MAC + Redfish interface id)
108-
// from its explored endpoint, so setup flows can target the MAC first
109-
// and fall back to the [stable] interface id when the MAC isn't resolvable.
110-
if let Ok(bmc_ip) = retstate.host_snapshot.bmc_info.ip_addr() {
111-
retstate.boot_interface =
112-
db::explored_endpoints::find_by_ips(&mut *txn, vec![bmc_ip])
113-
.await?
114-
.into_iter()
115-
.next()
116-
.and_then(|ep| ep.boot_interface());
117-
}
118106
};
119107

120108
return Ok(retstate);

crates/site-explorer/src/lib.rs

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1003,6 +1003,9 @@ impl SiteExplorer {
10031003

10041004
let mut managed_hosts = Vec::new();
10051005
let mut boot_interfaces: Vec<(IpAddr, MachineBootInterface)> = Vec::new();
1006+
// Per-DPU (host PF MAC -> Redfish interface id) pairs to stamp onto their
1007+
// machine_interfaces rows so the primary row holds the full boot pair.
1008+
let mut interface_boot_ids: Vec<(MacAddress, String)> = Vec::new();
10061009

10071010
for (_, ep) in explored_hosts {
10081011
// Resolve the operator-declared DPU mode for this host once;
@@ -1271,6 +1274,18 @@ impl SiteExplorer {
12711274
});
12721275
}
12731276

1277+
// Capture each explored DPU interface's Redfish interface id onto its
1278+
// machine_interfaces row (matched by MAC), so the primary-flagged row
1279+
// holds the full boot pair (MAC + id). Last-known-good: only record
1280+
// when the id resolves from this report -- a wiped MAC keeps its prior id.
1281+
for dpu in &dpus_explored_for_host {
1282+
if let Some(mac) = dpu.host_pf_mac_address
1283+
&& let Some(interface_id) = ep.report.find_interface_id_for_mac(mac)
1284+
{
1285+
interface_boot_ids.push((mac, interface_id.to_string()));
1286+
}
1287+
}
1288+
12741289
// For NicMode hosts, don't attach DPUs even if matching
12751290
// discovered some: the operator has declared "treat this host
12761291
// as zero-DPU". Any DPU hardware has already had `set_nic_mode`
@@ -1317,6 +1332,12 @@ impl SiteExplorer {
13171332
db::explored_endpoints::set_boot_interface(*address, boot_interface, &mut txn).await?;
13181333
}
13191334

1335+
// Stamp each DPU interface's Redfish id onto its machine_interfaces row so the
1336+
// primary-flagged row is the host's complete boot interface (MAC + id).
1337+
for (mac, interface_id) in &interface_boot_ids {
1338+
db::machine_interface::set_boot_interface_id(*mac, interface_id, &mut txn).await?;
1339+
}
1340+
13201341
txn.commit().await?;
13211342

13221343
Ok(managed_hosts)

0 commit comments

Comments
 (0)