Skip to content

Commit 7736892

Browse files
authored
Merge branch 'main' into feat/variable-frequency-health-reports
2 parents 5ff5880 + 36abfc6 commit 7736892

11 files changed

Lines changed: 208 additions & 53 deletions

File tree

crates/api-core/src/handlers/managed_host.rs

Lines changed: 4 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -153,21 +153,6 @@ pub(crate) async fn set_primary_dpu(
153153
})?
154154
.mac_address;
155155

156-
// The operator picked this primary NIC by MAC. Resolve its Redfish interface
157-
// id from the host's exploration report so we can send a complete pair (which
158-
// enables the interface-id fallback); if the report has no id for it, target
159-
// the MAC alone.
160-
let boot_interface_id = db::explored_endpoints::find_by_ips(&mut txn, vec![bmc_addr])
161-
.await?
162-
.into_iter()
163-
.next()
164-
.and_then(|endpoint| {
165-
endpoint
166-
.report
167-
.find_interface_id_for_mac(primary_interface_mac_address)
168-
.map(str::to_string)
169-
});
170-
171156
txn.rollback().await?;
172157

173158
let Some(current_primary_interface_id) = current_primary_interface_id else {
@@ -183,10 +168,10 @@ pub(crate) async fn set_primary_dpu(
183168
.into());
184169
};
185170

186-
// Set the boot device. Send the complete (MAC + interface id) pair when the
187-
// report gave us an id, so the boot-order call can fall back to the interface
188-
// id; otherwise target the MAC alone.
189-
let boot_target = match boot_interface_id {
171+
// Set the boot device. The new primary interface row already stores its
172+
// Redfish interface id, so send the complete (MAC + id) pair when present,
173+
// allowing for interface ID fallback (and target the MAC alone otherwise).
174+
let boot_target = match new_primary_interface.boot_interface_id.clone() {
190175
Some(interface_id) => BootInterfaceTarget::Pair(MachineBootInterface {
191176
mac_address: primary_interface_mac_address,
192177
interface_id,

crates/api-core/src/tests/common/metadata.rs

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ pub fn invalid_metadata_testcases(
135135
).to_string(),
136136
),
137137
(
138-
// Maximum of 10 labels
138+
// Maximum of 16 labels
139139
rpc::forge::Metadata {
140140
name: "aa".to_string(),
141141
description: "".to_string(),
@@ -182,9 +182,33 @@ pub fn invalid_metadata_testcases(
182182
rpc::forge::Label {
183183
key: "key11".to_string(),
184184
value: None,
185+
},
186+
rpc::forge::Label {
187+
key: "key12".to_string(),
188+
value: None,
189+
},
190+
rpc::forge::Label {
191+
key: "key13".to_string(),
192+
value: None,
193+
},
194+
rpc::forge::Label {
195+
key: "key14".to_string(),
196+
value: None,
197+
},
198+
rpc::forge::Label {
199+
key: "key15".to_string(),
200+
value: None,
201+
},
202+
rpc::forge::Label {
203+
key: "key16".to_string(),
204+
value: None,
205+
},
206+
rpc::forge::Label {
207+
key: "key17".to_string(),
208+
value: None,
185209
},],
186210
},
187-
"Invalid value: Cannot have more than 10 labels".to_string()
211+
"Invalid value: Cannot have more than 16 labels".to_string()
188212
),
189213
].to_vec();
190214

crates/api-core/src/tests/site_explorer.rs

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3410,6 +3410,49 @@ async fn test_site_explorer_auto_corrects_nic_mode_per_expected_machine(
34103410
Ok(())
34113411
}
34123412

3413+
/// A managed host's DPU-facing `machine_interface` is created (via DHCP) with
3414+
/// just a MAC and no `boot_interface_id`. The exploration that ingests the host
3415+
/// then backfills the vendor-specific Redfish interface id onto that row, matched
3416+
/// by MAC, at which point the primary interface ends up with a full `MachineBootInterface`.
3417+
/// This is the same backfill path any DHCP-derived interface takes (the capture is
3418+
/// keyed on MAC, not on how the row was created).
3419+
#[sqlx_test]
3420+
async fn test_site_explorer_backfills_boot_interface_id_onto_machine_interface(
3421+
pool: PgPool,
3422+
) -> Result<(), Box<dyn std::error::Error>> {
3423+
let env = common::api_fixtures::create_test_env(pool.clone()).await;
3424+
3425+
let dpu = DpuConfig::default();
3426+
let host_pf_mac = dpu.host_mac_address;
3427+
let mh = common::api_fixtures::create_managed_host_with_config(
3428+
&env,
3429+
ManagedHostConfig::with_dpus(vec![dpu]),
3430+
)
3431+
.await;
3432+
3433+
let mut txn = env.pool.begin().await?;
3434+
let interfaces = db::machine_interface::find_by_machine_ids(&mut txn, &[mh.id]).await?;
3435+
let primary = interfaces
3436+
.get(&mh.id)
3437+
.into_iter()
3438+
.flatten()
3439+
.find(|i| i.primary_interface)
3440+
.expect("ingested host should have a primary machine_interface");
3441+
3442+
// The primary row is the DPU host-PF interface (same factory MAC), now
3443+
// holding both halves of the pair: its MAC plus the Redfish interface id the
3444+
// host report named for it. The `ManagedHostConfig` fixture ids its DPU
3445+
// interfaces "NIC.Slot.{index + 5}-1", so the first DPU is "NIC.Slot.5-1".
3446+
assert_eq!(primary.mac_address, host_pf_mac);
3447+
assert_eq!(
3448+
primary.boot_interface_id.as_deref(),
3449+
Some("NIC.Slot.5-1"),
3450+
"exploration should backfill the Redfish interface id onto the machine_interface row",
3451+
);
3452+
3453+
Ok(())
3454+
}
3455+
34133456
/// A Managed Host whose `expected_machines` row is later removed becomes an
34143457
/// orphan: `audit_exploration_results` emits an `OrphanManagedHost` health
34153458
/// alert on the host's Machine. Re-adding the entry clears the alert on the
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
-- Add boot_interface_id to machine_interfaces so each interface row holds a
2+
-- full MachineBootInterface pair with the MAC plus the vendor-named Redfish
3+
-- EthernetInterface.Id.
4+
ALTER TABLE machine_interfaces ADD COLUMN boot_interface_id text;

crates/api-db/src/machine_interface.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,25 @@ pub async fn set_primary_interface(
182182
.map_err(|e| DatabaseError::query(query, e))
183183
}
184184

185+
/// Records the vendor-native Redfish `EthernetInterface.Id` on the machine_interface
186+
/// row(s) with the given MAC. Captured by site-explorer per exploration; callers only
187+
/// invoke this when the id resolves from the current report, so a wiped MAC leaves the
188+
/// last-known-good id in place.
189+
pub async fn set_boot_interface_id(
190+
mac_address: MacAddress,
191+
boot_interface_id: &str,
192+
txn: &mut PgConnection,
193+
) -> Result<(), DatabaseError> {
194+
let query = "UPDATE machine_interfaces SET boot_interface_id=$1 WHERE mac_address=$2";
195+
sqlx::query(query)
196+
.bind(boot_interface_id)
197+
.bind(mac_address)
198+
.execute(txn)
199+
.await
200+
.map(|_| ())
201+
.map_err(|e| DatabaseError::query(query, e))
202+
}
203+
185204
pub async fn associate_interface_with_dpu_machine(
186205
interface_id: &MachineInterfaceId,
187206
dpu_machine_id: &MachineId,

crates/api-model/src/machine/capabilities.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -630,6 +630,7 @@ mod tests {
630630
interface_type: InterfaceType::Data,
631631
primary_interface: true,
632632
mac_address: MacAddress::from_str("08:c0:eb:cb:0e:96").unwrap(),
633+
boot_interface_id: None,
633634
attached_dpu_machine_id: Some(
634635
MachineId::from_str(
635636
"fm100dsbiu5ckus880v8407u0mkcensa39cule26im5gnpvmuufckacguc0",
@@ -654,6 +655,7 @@ mod tests {
654655
interface_type: InterfaceType::Data,
655656
primary_interface: true,
656657
mac_address: MacAddress::from_str("08:c0:eb:cb:0e:97").unwrap(),
658+
boot_interface_id: None,
657659
attached_dpu_machine_id: Some(
658660
MachineId::from_str(
659661
"fm100dsg23d2f4tq4tt5m2hgib5pcldrm3gvefbduau7gj3itgc3iqg3lpg",
@@ -756,6 +758,7 @@ mod tests {
756758
interface_type: InterfaceType::Data,
757759
primary_interface: true,
758760
mac_address: MacAddress::from_str("00:00:00:00:00:00").unwrap(),
761+
boot_interface_id: None,
759762
attached_dpu_machine_id: None,
760763
domain_id: None,
761764
machine_id: None,

crates/api-model/src/machine/mod.rs

Lines changed: 75 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -131,11 +131,6 @@ pub struct ManagedHostStateSnapshot {
131131
pub host_snapshot: Machine,
132132
pub dpu_snapshots: Vec<Machine>,
133133
pub dpa_interface_snapshots: Vec<DpaInterface>,
134-
/// The host's boot interface (MAC + Redfish interface id), captured by
135-
/// site-explorer. Populated at read time by `load_object_state` (like
136-
/// `dpa_interface_snapshots`); `None` until the host has been explored with
137-
/// a fully-resolved boot interface.
138-
pub boot_interface: Option<MachineBootInterface>,
139134
/// If there is an instance provisioned on top of the machine, this holds
140135
/// its state
141136
pub instance: Option<InstanceSnapshot>,
@@ -213,8 +208,6 @@ impl<'r> sqlx::FromRow<'r, sqlx::postgres::PgRow> for ManagedHostStateSnapshot {
213208
host_snapshot,
214209
dpu_snapshots,
215210
dpa_interface_snapshots,
216-
// Filled by load_object_state later (see dpa_interface_snapshots).
217-
boot_interface: None,
218211
managed_state,
219212
instance,
220213
rack_health_overrides,
@@ -280,19 +273,38 @@ impl From<ManagedHostStateSnapshotError> for sqlx::Error {
280273
/// so it's on the caller to figure out. What this usually means is the
281274
/// caller passes `boot_interface_mac: None` to machine_setup, and then
282275
/// subsequent logic flows from there (e.g. ::NoDpu handling).
283-
fn pick_boot_interface_mac(
276+
fn pick_boot_interface(
284277
interfaces: &[MachineInterfaceSnapshot],
285-
) -> Option<mac_address::MacAddress> {
278+
) -> Option<&MachineInterfaceSnapshot> {
286279
// The primary wins!
287280
if let Some(primary) = interfaces.iter().find(|x| x.primary_interface) {
288-
return Some(primary.mac_address);
281+
return Some(primary);
289282
}
290283
// ..no primary, so let's try to find *some* interface.
291284
interfaces
292285
.iter()
293286
.filter(|x| x.network_segment_type != Some(NetworkSegmentType::Underlay))
294287
.min_by_key(|x| x.mac_address)
295-
.map(|x| x.mac_address)
288+
}
289+
290+
fn pick_boot_interface_mac(
291+
interfaces: &[MachineInterfaceSnapshot],
292+
) -> Option<mac_address::MacAddress> {
293+
pick_boot_interface(interfaces).map(|x| x.mac_address)
294+
}
295+
296+
/// Resolves the boot interface to a fully-populated [`MachineBootInterface`]
297+
/// (MAC + Redfish interface id) from the picked interface's own row. Split out
298+
/// like `pick_boot_interface_mac` so it's unit-testable without a full snapshot.
299+
fn pick_boot_interface_pair(
300+
interfaces: &[MachineInterfaceSnapshot],
301+
) -> Option<MachineBootInterface> {
302+
pick_boot_interface(interfaces).and_then(|interface| {
303+
MachineBootInterface::from_parts(
304+
Some(interface.mac_address),
305+
interface.boot_interface_id.clone(),
306+
)
307+
})
296308
}
297309

298310
impl ManagedHostStateSnapshot {
@@ -365,6 +377,19 @@ impl ManagedHostStateSnapshot {
365377
pick_boot_interface_mac(&self.host_snapshot.interfaces)
366378
}
367379

380+
/// Returns the host's boot interface as a fully-populated
381+
/// [`MachineBootInterface`] (MAC + Redfish interface id), derived from the
382+
/// same `machine_interface` row (primary, or the fallback pick) that [`Self::boot_interface_mac`]
383+
/// selects.
384+
///
385+
/// Returns `None` when that row hasn't captured a Redfish interface id yet
386+
/// (e.g. not yet explored, or a zero-DPU host) -- callers then target the MAC
387+
/// alone. Because the MAC and id come from one row, the pair can never name a
388+
/// different interface than `boot_interface_mac`.
389+
pub fn boot_interface(&self) -> Option<MachineBootInterface> {
390+
pick_boot_interface_pair(&self.host_snapshot.interfaces)
391+
}
392+
368393
/// Returns `true` if override report is hw_health, `false` otherwise.
369394
fn merge_override_report_with_hw_health(
370395
output: &mut HealthReport,
@@ -2321,6 +2346,11 @@ pub struct MachineInterfaceSnapshot {
23212346
pub interface_type: InterfaceType,
23222347
pub primary_interface: bool,
23232348
pub mac_address: MacAddress,
2349+
/// Vendor-native Redfish `EthernetInterface.Id` for this interface, captured
2350+
/// by site-explorer alongside the MAC. Combined with `mac_address` it forms a
2351+
/// [`MachineBootInterface`]; for the `primary_interface` row that pair is the
2352+
/// host's boot device.
2353+
pub boot_interface_id: Option<String>,
23242354
pub attached_dpu_machine_id: Option<MachineId>,
23252355
pub domain_id: Option<DomainId>,
23262356
pub machine_id: Option<MachineId>,
@@ -2345,6 +2375,7 @@ impl MachineInterfaceSnapshot {
23452375
machine_id: None,
23462376
segment_id: uuid::Uuid::nil().into(),
23472377
mac_address,
2378+
boot_interface_id: None,
23482379
hostname: String::new(),
23492380
interface_type: InterfaceType::Data,
23502381
primary_interface: true,
@@ -2643,6 +2674,7 @@ impl<'r> FromRow<'r, PgRow> for MachineInterfaceSnapshot {
26432674
hostname: row.try_get("hostname")?,
26442675
interface_type: row.try_get("interface_type")?,
26452676
mac_address: row.try_get("mac_address")?,
2677+
boot_interface_id: row.try_get("boot_interface_id")?,
26462678
primary_interface: row.try_get("primary_interface")?,
26472679
created: row.try_get("created")?,
26482680
last_dhcp: row.try_get("last_dhcp")?,
@@ -3320,6 +3352,38 @@ mod tests {
33203352
);
33213353
}
33223354

3355+
// boot_interface() derives the full pair from the SAME primary row that the
3356+
// MAC selection uses, so the MAC and id can never name different interfaces.
3357+
#[test]
3358+
fn pick_boot_interface_pair_uses_primary_rows_mac_and_id() {
3359+
let other = build_mock_interface(
3360+
"05:00:00:00:00:01",
3361+
false,
3362+
Some(NetworkSegmentType::HostInband),
3363+
);
3364+
let primary = MachineInterfaceSnapshot {
3365+
boot_interface_id: Some("NIC.Slot.7-1-1".to_string()),
3366+
..build_mock_interface("10:00:00:00:00:01", true, Some(NetworkSegmentType::Admin))
3367+
};
3368+
3369+
assert_eq!(
3370+
pick_boot_interface_pair(&[other, primary]),
3371+
Some(MachineBootInterface {
3372+
mac_address: "10:00:00:00:00:01".parse().unwrap(),
3373+
interface_id: "NIC.Slot.7-1-1".to_string(),
3374+
})
3375+
);
3376+
}
3377+
3378+
// When the primary row hasn't captured a Redfish interface id yet, there's no
3379+
// complete pair -- callers fall back to the MAC alone.
3380+
#[test]
3381+
fn pick_boot_interface_pair_is_none_without_captured_id() {
3382+
let primary =
3383+
build_mock_interface("10:00:00:00:00:01", true, Some(NetworkSegmentType::Admin));
3384+
assert_eq!(pick_boot_interface_pair(&[primary]), None);
3385+
}
3386+
33233387
// Check the case where only the BMC has been discovered so far (which
33243388
// is common during early ingestion). In this case, there's no valid boot MAC
33253389
// yet; callers fall back to the `::NoDpu` handling downstream.

crates/api-model/src/metadata.rs

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ use serde::Deserialize;
2121

2222
use crate::ConfigValidationError;
2323

24+
/// Maximum number of labels allowed on a resource's metadata.
25+
const MAX_LABELS: usize = 16;
26+
2427
/// Metadata that can get associated with Forge managed resources
2528
#[derive(Debug, Default, Clone, PartialEq, Eq, Deserialize)]
2629
pub struct Metadata {
@@ -96,9 +99,10 @@ impl Metadata {
9699
}
97100
}
98101

99-
if self.labels.len() > 10 {
102+
if self.labels.len() > MAX_LABELS {
100103
return Err(ConfigValidationError::InvalidValue(format!(
101-
"Cannot have more than 10 labels, got {}",
104+
"Cannot have more than {} labels, got {}",
105+
MAX_LABELS,
102106
self.labels.len()
103107
)));
104108
}
@@ -230,11 +234,11 @@ mod tests {
230234
Err(ConfigValidationError::InvalidValue(_))
231235
));
232236

233-
// Too many labels
237+
// Too many labels (17 > 16)
234238
let metadata = Metadata {
235239
name: "nice name".to_string(),
236240
description: "anything is fine".to_string(),
237-
labels: "abcdefghijk"
241+
labels: "abcdefghijklmnopq"
238242
.chars()
239243
.map(|c| (c.to_string(), "x".to_string()))
240244
.collect(),

crates/machine-controller/src/boot_interface.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,17 +21,17 @@ use model::machine::ManagedHostStateSnapshot;
2121

2222
/// Resolve how to target this host's boot interface for Redfish setup calls.
2323
///
24-
/// Prefers the stored, fully-captured boot interface (MAC + Redfish interface
25-
/// id), which enables the MAC-first / interface-id fallback; it falls back to
26-
/// the machine-interfaces-derived MAC (no id fallback) when no captured boot
27-
/// interface is available yet.
24+
/// Uses the host's primary `machine_interface`: when that row has a captured
25+
/// Redfish interface id, the full pair is returned (enabling the MAC-first /
26+
/// interface-id fallback); otherwise it targets the MAC alone. Both come from the
27+
/// same row, so the pair can never name a different interface than the MAC.
2828
///
2929
/// Returns `None` only when the host has no boot interface at all (e.g. only the
3030
/// BMC has been discovered, or the primary NIC hasn't appeared yet).
3131
pub fn boot_interface_target(
3232
mh_snapshot: &ManagedHostStateSnapshot,
3333
) -> Option<BootInterfaceTarget> {
34-
if let Some(boot_interface) = mh_snapshot.boot_interface.clone() {
34+
if let Some(boot_interface) = mh_snapshot.boot_interface() {
3535
return Some(BootInterfaceTarget::Pair(boot_interface));
3636
}
3737
mh_snapshot

0 commit comments

Comments
 (0)