Skip to content

Commit

Permalink
deviceplugin: if dev path of the device changes, mark the node as unhealthy to trigger a kubelet reconciliation to update the paths in the pod
Browse files Browse the repository at this point in the history

Signed-off-by: Muvaffak Onus <[email protected]>
  • Loading branch information
muvaf committed Jan 6, 2025
1 parent 327442f commit 5983fed
Showing 1 changed file with 33 additions and 1 deletion.
34 changes: 33 additions & 1 deletion agent/src/plugin_manager/device_plugin_instance_controller.rs
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,38 @@ impl InstanceDevicePlugin {
async fn update_slots(&self, slots: &HashMap<String, String>) -> Result<(), DevicePluginError> {
let my_slots = self.slots_status.lock().await;
let new_slots = construct_slots_map(slots)?;

// Track which specific slots have changed
let mut changed_slots = HashSet::new();
for (k, v) in new_slots.iter() {
if let Some(current_slot) = my_slots.borrow().get(*k) {
if current_slot != v {
changed_slots.insert(*k);
}
}
}

// If any slots changed, mark only those specific ones as unhealthy
if !changed_slots.is_empty() {
my_slots.send_if_modified(|current| {
for slot_id in &changed_slots {
if let Some(slot) = current.get_mut(*slot_id) {
if let DeviceUsage::Node(node) = slot {
if node == &self.node_name {
// Temporarily mark only this specific device as unhealthy
*node = "temporary-unhealthy".to_string();
}
}
}
}
true
});

// Give kubelet time to notice the unhealthy state
tokio::time::sleep(Duration::from_millis(100)).await;
}

// Now update to the new state
my_slots.send_if_modified(|current| {
let mut modified = false;
for (k, v) in new_slots.iter() {
Expand Down Expand Up @@ -476,7 +508,7 @@ enum ConfigurationSlot {

struct ConfigurationDevicePlugin {
instances: RwLock<HashMap<String, Arc<InstanceDevicePlugin>>>,
slots: Arc<RwLock<watch::Sender<HashMap<String, ConfigurationSlot>>>>,
slots: Arc<RwLock<watch::Sender<HashMap<String, ConfigurationSlot>>>>>,
config_name: String,
node_name: String,
stopper: Stopper,
Expand Down

0 comments on commit 5983fed

Please sign in to comment.