From d7a7f152f339a82a9065e46cf726186defec9907 Mon Sep 17 00:00:00 2001 From: nokyan Date: Sat, 14 Dec 2024 19:19:38 +0100 Subject: [PATCH 1/6] Add support for AMD NPUs --- lib/process_data/src/lib.rs | 218 +++++++++++++++++++++++++++--------- src/ui/window.rs | 21 +++- src/utils/app.rs | 47 ++++++++ src/utils/npu/amd.rs | 98 ++++++++++++++++ src/utils/npu/intel.rs | 8 +- src/utils/npu/mod.rs | 57 +++++++--- src/utils/npu/other.rs | 8 +- src/utils/process.rs | 36 +++++- 8 files changed, 414 insertions(+), 79 deletions(-) create mode 100644 src/utils/npu/amd.rs diff --git a/lib/process_data/src/lib.rs b/lib/process_data/src/lib.rs index 4bc932fa..74f9cf80 100644 --- a/lib/process_data/src/lib.rs +++ b/lib/process_data/src/lib.rs @@ -46,11 +46,17 @@ static RE_IO_READ: Lazy = lazy_regex!(r"read_bytes:\s*(\d+)"); static RE_IO_WRITE: Lazy = lazy_regex!(r"write_bytes:\s*(\d+)"); +static RE_DRM_DRIVER: Lazy = lazy_regex!(r"drm-driver:\s*(.+)"); + static RE_DRM_PDEV: Lazy = lazy_regex!(r"drm-pdev:\s*([0-9A-Fa-f]{4}:[0-9A-Fa-f]{2}:[0-9A-Fa-f]{2}\.[0-9A-Fa-f])"); static RE_DRM_CLIENT_ID: Lazy = lazy_regex!(r"drm-client-id:\s*(\d+)"); +// AMD only +static RE_DRM_ENGINE_NPU_AMDXDNA: Lazy = + lazy_regex!(r"drm-engine-npu-amdxdna:\s*(\d+)\s*ns"); + // AMD only static RE_DRM_ENGINE_GFX: Lazy = lazy_regex!(r"drm-engine-gfx:\s*(\d+)\s*ns"); @@ -75,6 +81,8 @@ static RE_DRM_ENGINE_RENDER: Lazy = lazy_regex!(r"drm-engine-render:\s*(\ // Intel only static RE_DRM_ENGINE_VIDEO: Lazy = lazy_regex!(r"drm-engine-video:\s*(\d+)\s*ns"); +static RE_DRM_TOTAL_MEMORY: Lazy = lazy_regex!(r"drm-total-memory:\s*(\d+)\s*KiB"); + static NVML: Lazy> = Lazy::new(Nvml::init); static NVML_DEVICES: Lazy> = Lazy::new(|| { @@ -135,6 +143,13 @@ pub struct GpuUsageStats { pub nvidia: bool, } +/// Represents NPU usage statistics per-process. +#[derive(Debug, Clone, Hash, PartialEq, Eq, Serialize, Deserialize, Copy)] +pub struct NpuUsageStats { + pub usage: u64, + pub mem: u64, +} + /// Data that could be transferred using `resources-processes`, separated from /// `Process` mainly due to `Icon` not being able to derive `Serialize` and /// `Deserialize`. @@ -159,6 +174,7 @@ pub struct ProcessData { pub timestamp: u64, /// Key: PCI Slot ID of the GPU pub gpu_usage_stats: BTreeMap, + pub npu_usage_stats: BTreeMap, } impl ProcessData { @@ -365,6 +381,8 @@ impl ProcessData { let gpu_usage_stats = Self::gpu_usage_stats(proc_path, pid); + let npu_usage_stats = Self::npu_usage_stats(proc_path, pid).unwrap_or_default(); + let timestamp = unix_as_millis(); Ok(Self { @@ -386,6 +404,7 @@ impl ProcessData { write_bytes, timestamp, gpu_usage_stats, + npu_usage_stats, }) } @@ -396,6 +415,68 @@ impl ProcessData { other_stats } + /// Returns the fd_num and the plausibility of whether this file might contain drm fdinfo data. + /// This function is cautious and will signal plausibility if there's an error during evaluation. + fn drm_fdinfo_plausible>( + fdinfo_path: P, + pid: libc::pid_t, + seen_fds: &HashSet, + ) -> (bool, usize) { + let fdinfo_path = fdinfo_path.as_ref(); + + // if our fd is 0, 1 or 2 it's probably just a std stream so skip it + let fd_num = fdinfo_path + .file_name() + .and_then(|osstr| osstr.to_str()) + .unwrap_or("0") + .parse::() + .unwrap_or(0); + if fd_num <= 2 { + return (false, fd_num); + } + + let _file = std::fs::File::open(&fdinfo_path); + if _file.is_err() { + return (true, fd_num); + } + let file = _file.unwrap(); + + let _metadata = file.metadata(); + if _metadata.is_err() { + return (true, fd_num); + } + let metadata = _metadata.unwrap(); + + if !metadata.is_file() { + return (false, fd_num); + } + + // Adapted from nvtop's `is_drm_fd()` + // https://github.com/Syllo/nvtop/blob/master/src/extract_processinfo_fdinfo.c + let fd_path = fdinfo_path.to_str().map(|s| s.replace("fdinfo", "fd")); + if let Some(fd_path) = fd_path { + if let Ok(fd_metadata) = std::fs::metadata(fd_path) { + let major = unsafe { libc::major(fd_metadata.st_rdev()) }; + if (fd_metadata.st_mode() & libc::S_IFMT) != libc::S_IFCHR || major != 226 { + return (false, fd_num); + } + } + } + + // Adapted from nvtop's `processinfo_sweep_fdinfos()` + // https://github.com/Syllo/nvtop/blob/master/src/extract_processinfo_fdinfo.c + // if we've already seen the file this fd refers to, skip + let not_unique = seen_fds.iter().any(|seen_fd| unsafe { + syscalls::syscall!(syscalls::Sysno::kcmp, pid, pid, 0, fd_num, *seen_fd).unwrap_or(0) + == 0 + }); + if not_unique { + return (false, fd_num); + } + + (true, fd_num) + } + fn other_gpu_usage_stats( proc_path: &Path, pid: i32, @@ -409,60 +490,14 @@ impl ProcessData { let entry = entry?; let fdinfo_path = entry.path(); - let _file = std::fs::File::open(&fdinfo_path); - if _file.is_err() { - continue; - } - let mut file = _file.unwrap(); - - let _metadata = file.metadata(); - if _metadata.is_err() { - continue; - } - let metadata = _metadata.unwrap(); - - // if our fd is 0, 1 or 2 it's probably just a std stream so skip it - let fd_num = fdinfo_path - .file_name() - .and_then(|osstr| osstr.to_str()) - .unwrap_or("0") - .parse::() - .unwrap_or(0); - if fd_num <= 2 { - continue; - } - - if !metadata.is_file() { - continue; - } - - // Adapted from nvtop's `is_drm_fd()` - // https://github.com/Syllo/nvtop/blob/master/src/extract_processinfo_fdinfo.c - let fd_path = fdinfo_path.to_str().map(|s| s.replace("fdinfo", "fd")); - if let Some(fd_path) = fd_path { - if let Ok(fd_metadata) = std::fs::metadata(fd_path) { - let major = unsafe { libc::major(fd_metadata.st_rdev()) }; - if (fd_metadata.st_mode() & libc::S_IFMT) != libc::S_IFCHR || major != 226 { - continue; - } - } - } - - // Adapted from nvtop's `processinfo_sweep_fdinfos()` - // https://github.com/Syllo/nvtop/blob/master/src/extract_processinfo_fdinfo.c - // if we've already seen the file this fd refers to, skip - let not_unique = seen_fds.iter().any(|seen_fd| unsafe { - syscalls::syscall!(syscalls::Sysno::kcmp, pid, pid, 0, fd_num, *seen_fd) - .unwrap_or(0) - == 0 - }); - if not_unique { + let (plausible, fd_num) = Self::drm_fdinfo_plausible(&fdinfo_path, pid, &seen_fds); + if !plausible { continue; } seen_fds.insert(fd_num); - if let Ok(stats) = Self::read_fdinfo(&mut file, metadata.len() as usize) { + if let Ok(stats) = Self::read_gpu_fdinfo(&fdinfo_path) { return_map .entry(stats.0) .and_modify(|existing_value: &mut GpuUsageStats| { @@ -486,11 +521,86 @@ impl ProcessData { Ok(return_map) } - fn read_fdinfo( - fdinfo_file: &mut File, - file_size: usize, - ) -> Result<(PciSlot, GpuUsageStats, i64)> { - let mut content = String::with_capacity(file_size); + fn npu_usage_stats(proc_path: &Path, pid: i32) -> Result> { + let fdinfo_dir = proc_path.join("fdinfo"); + + let mut seen_fds = HashSet::new(); + + let mut return_map = BTreeMap::new(); + for entry in std::fs::read_dir(fdinfo_dir)? { + let entry = entry?; + let fdinfo_path = entry.path(); + + let (plausible, fd_num) = Self::drm_fdinfo_plausible(&fdinfo_path, pid, &seen_fds); + if !plausible { + continue; + } + + seen_fds.insert(fd_num); + + if let Ok((pci_slot, stats)) = Self::read_npu_fdinfo(&fdinfo_path) { + return_map + .entry(pci_slot) + .and_modify(|existing_value: &mut NpuUsageStats| { + if stats.usage > existing_value.usage { + existing_value.usage = stats.usage; + } + if stats.mem > existing_value.mem { + existing_value.mem = stats.mem; + } + }) + .or_insert(stats); + } + } + + Ok(return_map) + } + + fn read_npu_fdinfo>(fdinfo_path: P) -> Result<(PciSlot, NpuUsageStats)> { + let mut content = String::new(); + let mut fdinfo_file = File::open(fdinfo_path.as_ref())?; + fdinfo_file.read_to_string(&mut content)?; + fdinfo_file.flush()?; + + let driver = RE_DRM_DRIVER + .captures(&content) + .and_then(|captures| captures.get(1)) + .map(|capture| capture.as_str()); + + if driver.is_some() { + let pci_slot = RE_DRM_PDEV + .captures(&content) + .and_then(|captures| captures.get(1)) + .and_then(|capture| PciSlot::from_str(capture.as_str()).ok()) + .unwrap_or_default(); + + let usage = RE_DRM_ENGINE_NPU_AMDXDNA + .captures(&content) + .and_then(|captures| captures.get(1)) + .and_then(|capture| capture.as_str().parse::().ok()) + .unwrap_or_default(); + + let total_memory = RE_DRM_TOTAL_MEMORY + .captures(&content) + .and_then(|captures| captures.get(1)) + .and_then(|capture| capture.as_str().parse::().ok()) + .unwrap_or_default() + .saturating_mul(1024); + + let stats = NpuUsageStats { + usage, + mem: total_memory, + }; + + return Ok((pci_slot, stats)); + } + + bail!("unable to find gpu information in this fdinfo"); + } + + fn read_gpu_fdinfo>(fdinfo_path: P) -> Result<(PciSlot, GpuUsageStats, i64)> { + let mut content = String::new(); + let mut fdinfo_file = File::open(fdinfo_path.as_ref())?; fdinfo_file.read_to_string(&mut content)?; fdinfo_file.flush()?; diff --git a/src/ui/window.rs b/src/ui/window.rs index 8e7862a0..89b7976b 100644 --- a/src/ui/window.rs +++ b/src/ui/window.rs @@ -578,17 +578,32 @@ impl MainWindow { page.refresh_page(&gpu_data); } - std::mem::drop(apps_context); - /* * Npu */ let npu_pages = imp.npu_pages.borrow(); - for ((_, page), npu_data) in npu_pages.values().zip(npu_data) { + for ((_, page), mut npu_data) in npu_pages.values().zip(npu_data) { let page = page.content().and_downcast::().unwrap(); + + let processes_npu_fraction = apps_context.npu_fraction(npu_data.pci_slot); + npu_data.usage_fraction = Some(f64::max( + npu_data.usage_fraction.unwrap_or(0.0), + processes_npu_fraction.into(), + )); + + if npu_data.total_memory.is_some() { + let processes_npu_memory_fraction = apps_context.npu_mem(npu_data.pci_slot); + npu_data.used_memory = Some(usize::max( + npu_data.used_memory.unwrap_or(0), + processes_npu_memory_fraction as usize, + )); + } + page.refresh_page(&npu_data); } + std::mem::drop(apps_context); + /* * Cpu */ diff --git a/src/utils/app.rs b/src/utils/app.rs index 3e684a9b..ad1d6fb7 100644 --- a/src/utils/app.rs +++ b/src/utils/app.rs @@ -638,6 +638,52 @@ impl AppsContext { .clamp(0.0, 1.0) } + pub fn npu_fraction(&self, pci_slot: PciSlot) -> f32 { + self.processes_iter() + .map(|process| { + ( + &process.data.npu_usage_stats, + &process.npu_usage_stats_last, + process.data.timestamp, + process.timestamp_last, + ) + }) + .map(|(new, old, timestamp, timestamp_last)| { + ( + new.get(&pci_slot), + old.get(&pci_slot), + timestamp, + timestamp_last, + ) + }) + .filter_map(|(new, old, timestamp, timestamp_last)| match (new, old) { + (Some(new), Some(old)) => Some((new, old, timestamp, timestamp_last)), + _ => None, + }) + .map(|(new, old, timestamp, timestamp_last)| { + if old.usage == 0 { + 0.0 + } else { + ((new.usage.saturating_sub(old.usage) as f32) + / (timestamp.saturating_sub(timestamp_last) as f32)) + .finite_or_default() + / 1_000_000.0 + } + }) + .sum::() + .clamp(0.0, 1.0) + } + + pub fn npu_mem(&self, pci_slot: PciSlot) -> u64 { + self.processes_iter() + .map(|process| process.data.npu_usage_stats.get(&pci_slot)) + .map(|stats| match stats { + Some(stats) => stats.mem, + None => 0, + }) + .sum() + } + fn app_associated_with_process(&self, process: &Process) -> Option { // TODO: tidy this up // ↓ look for whether we can find an ID in the cgroup @@ -787,6 +833,7 @@ impl AppsContext { old_process.read_bytes_last = old_process.data.read_bytes; old_process.write_bytes_last = old_process.data.write_bytes; old_process.gpu_usage_stats_last = old_process.data.gpu_usage_stats.clone(); + old_process.npu_usage_stats_last = old_process.data.npu_usage_stats.clone(); old_process.data = process_data.clone(); } else { diff --git a/src/utils/npu/amd.rs b/src/utils/npu/amd.rs new file mode 100644 index 00000000..8eba9c9a --- /dev/null +++ b/src/utils/npu/amd.rs @@ -0,0 +1,98 @@ +use anyhow::Result; +use process_data::pci_slot::PciSlot; + +use std::path::PathBuf; + +use crate::utils::pci::Device; + +use super::NpuImpl; + +#[derive(Debug, Clone, Default)] + +pub struct AmdNpu { + pub device: Option<&'static Device>, + pub pci_slot: PciSlot, + pub driver: String, + sysfs_path: PathBuf, + first_hwmon_path: Option, +} + +impl AmdNpu { + pub fn new( + device: Option<&'static Device>, + pci_slot: PciSlot, + driver: String, + sysfs_path: PathBuf, + first_hwmon_path: Option, + ) -> Self { + Self { + device, + pci_slot, + driver, + sysfs_path, + first_hwmon_path, + } + } +} + +impl NpuImpl for AmdNpu { + fn device(&self) -> Option<&'static Device> { + self.device + } + + fn pci_slot(&self) -> PciSlot { + self.pci_slot + } + + fn driver(&self) -> String { + self.driver.clone() + } + + fn sysfs_path(&self) -> PathBuf { + self.sysfs_path.clone() + } + + fn first_hwmon(&self) -> Option { + self.first_hwmon_path.clone() + } + + fn name(&self) -> Result { + self.drm_name() + } + + fn usage(&self) -> Result { + self.drm_usage().map(|usage| usage as f64 / 100.0) + } + + fn used_memory(&self) -> Result { + self.drm_used_memory().map(|usage| usage as usize) + } + + fn total_memory(&self) -> Result { + self.drm_total_memory().map(|usage| usage as usize) + } + + fn temperature(&self) -> Result { + self.hwmon_temperature() + } + + fn power_usage(&self) -> Result { + self.hwmon_power_usage() + } + + fn core_frequency(&self) -> Result { + self.hwmon_core_frequency() + } + + fn memory_frequency(&self) -> Result { + self.hwmon_vram_frequency() + } + + fn power_cap(&self) -> Result { + self.hwmon_power_cap() + } + + fn power_cap_max(&self) -> Result { + self.hwmon_power_cap_max() + } +} diff --git a/src/utils/npu/intel.rs b/src/utils/npu/intel.rs index bf9990a4..62029c05 100644 --- a/src/utils/npu/intel.rs +++ b/src/utils/npu/intel.rs @@ -82,12 +82,12 @@ impl NpuImpl for IntelNpu { Ok((delta_busy_time / delta_timestamp) / 1000.0) } - fn used_vram(&self) -> Result { - self.drm_used_vram().map(|usage| usage as usize) + fn used_memory(&self) -> Result { + self.drm_used_memory().map(|usage| usage as usize) } - fn total_vram(&self) -> Result { - self.drm_total_vram().map(|usage| usage as usize) + fn total_memory(&self) -> Result { + self.drm_total_memory().map(|usage| usage as usize) } fn temperature(&self) -> Result { diff --git a/src/utils/npu/mod.rs b/src/utils/npu/mod.rs index 97e80f55..d2c6339d 100644 --- a/src/utils/npu/mod.rs +++ b/src/utils/npu/mod.rs @@ -1,6 +1,8 @@ +mod amd; mod intel; mod other; +use amd::AmdNpu; use anyhow::{bail, Context, Result}; use log::{debug, info}; use process_data::pci_slot::PciSlot; @@ -21,6 +23,7 @@ use self::{intel::IntelNpu, other::OtherNpu}; use super::pci::Vendor; +pub const VID_AMD: u16 = 4098; pub const VID_INTEL: u16 = 0x8086; #[derive(Debug)] @@ -48,8 +51,8 @@ impl NpuData { let usage_fraction = npu.usage().ok(); - let total_memory = npu.total_vram().ok(); - let used_memory = npu.used_vram().ok(); + let total_memory = npu.total_memory().ok(); + let used_memory = npu.used_memory().ok(); let clock_speed = npu.core_frequency().ok(); let vram_speed = npu.memory_frequency().ok(); @@ -77,6 +80,7 @@ impl NpuData { #[derive(Debug, Clone)] pub enum Npu { + Amd(AmdNpu), Intel(IntelNpu), Other(OtherNpu), } @@ -96,8 +100,8 @@ pub trait NpuImpl { fn name(&self) -> Result; fn usage(&self) -> Result; - fn used_vram(&self) -> Result; - fn total_vram(&self) -> Result; + fn used_memory(&self) -> Result; + fn total_memory(&self) -> Result; fn temperature(&self) -> Result; fn power_usage(&self) -> Result; fn core_frequency(&self) -> Result; @@ -141,14 +145,17 @@ pub trait NpuImpl { } fn drm_usage(&self) -> Result { - bail!("usage fallback not implemented") + // This is purely a guess for the future since no NPU driver has actually implemented this statistic + self.read_device_int("npu_busy_percent") } - fn drm_used_vram(&self) -> Result { + fn drm_used_memory(&self) -> Result { + // This is purely a guess for the future since no NPU driver has actually implemented this statistic self.read_device_int("mem_info_vram_used") } - fn drm_total_vram(&self) -> Result { + fn drm_total_memory(&self) -> Result { + // This is purely a guess for the future since no NPU driver has actually implemented this statistic self.read_device_int("mem_info_vram_total") } @@ -256,6 +263,17 @@ impl Npu { )), "Intel", ) + } else if vid == VID_AMD || driver == "amdxdna" { + ( + Npu::Amd(AmdNpu::new( + device, + pci_slot, + driver, + path, + hwmon_vec.first().cloned(), + )), + "AMD", + ) } else { ( Npu::Other(OtherNpu::new( @@ -280,6 +298,7 @@ impl Npu { pub fn get_vendor(&self) -> Result<&'static Vendor> { Ok(match self { + Npu::Amd(npu) => npu.device(), Npu::Intel(npu) => npu.device(), Npu::Other(npu) => npu.device(), } @@ -289,6 +308,7 @@ impl Npu { pub fn pci_slot(&self) -> PciSlot { match self { + Npu::Amd(npu) => npu.pci_slot(), Npu::Intel(npu) => npu.pci_slot(), Npu::Other(npu) => npu.pci_slot(), } @@ -296,6 +316,7 @@ impl Npu { pub fn driver(&self) -> String { match self { + Npu::Amd(npu) => npu.driver(), Npu::Intel(npu) => npu.driver(), Npu::Other(npu) => npu.driver(), } @@ -303,6 +324,7 @@ impl Npu { pub fn name(&self) -> Result { match self { + Npu::Amd(npu) => npu.name(), Npu::Intel(npu) => npu.name(), Npu::Other(npu) => npu.name(), } @@ -310,27 +332,31 @@ impl Npu { pub fn usage(&self) -> Result { match self { + Npu::Amd(npu) => npu.usage(), Npu::Intel(npu) => npu.usage(), Npu::Other(npu) => npu.usage(), } } - pub fn used_vram(&self) -> Result { + pub fn used_memory(&self) -> Result { match self { - Npu::Intel(npu) => npu.used_vram(), - Npu::Other(npu) => npu.used_vram(), + Npu::Amd(npu) => npu.used_memory(), + Npu::Intel(npu) => npu.used_memory(), + Npu::Other(npu) => npu.used_memory(), } } - pub fn total_vram(&self) -> Result { + pub fn total_memory(&self) -> Result { match self { - Npu::Intel(npu) => npu.total_vram(), - Npu::Other(npu) => npu.total_vram(), + Npu::Amd(npu) => npu.total_memory(), + Npu::Intel(npu) => npu.total_memory(), + Npu::Other(npu) => npu.total_memory(), } } pub fn temperature(&self) -> Result { match self { + Npu::Amd(npu) => npu.temperature(), Npu::Intel(npu) => npu.temperature(), Npu::Other(npu) => npu.temperature(), } @@ -338,6 +364,7 @@ impl Npu { pub fn power_usage(&self) -> Result { match self { + Npu::Amd(npu) => npu.power_usage(), Npu::Intel(npu) => npu.power_usage(), Npu::Other(npu) => npu.power_usage(), } @@ -345,6 +372,7 @@ impl Npu { pub fn core_frequency(&self) -> Result { match self { + Npu::Amd(npu) => npu.core_frequency(), Npu::Intel(npu) => npu.core_frequency(), Npu::Other(npu) => npu.core_frequency(), } @@ -352,6 +380,7 @@ impl Npu { pub fn memory_frequency(&self) -> Result { match self { + Npu::Amd(npu) => npu.memory_frequency(), Npu::Intel(npu) => npu.memory_frequency(), Npu::Other(npu) => npu.memory_frequency(), } @@ -359,6 +388,7 @@ impl Npu { pub fn power_cap(&self) -> Result { match self { + Npu::Amd(npu) => npu.power_cap(), Npu::Intel(npu) => npu.power_cap(), Npu::Other(npu) => npu.power_cap(), } @@ -366,6 +396,7 @@ impl Npu { pub fn power_cap_max(&self) -> Result { match self { + Npu::Amd(npu) => npu.power_cap(), Npu::Intel(npu) => npu.power_cap_max(), Npu::Other(npu) => npu.power_cap_max(), } diff --git a/src/utils/npu/other.rs b/src/utils/npu/other.rs index 2104e303..8f851850 100644 --- a/src/utils/npu/other.rs +++ b/src/utils/npu/other.rs @@ -64,12 +64,12 @@ impl NpuImpl for OtherNpu { self.drm_usage().map(|usage| usage as f64 / 100.0) } - fn used_vram(&self) -> Result { - self.drm_used_vram().map(|usage| usage as usize) + fn used_memory(&self) -> Result { + self.drm_used_memory().map(|usage| usage as usize) } - fn total_vram(&self) -> Result { - self.drm_total_vram().map(|usage| usage as usize) + fn total_memory(&self) -> Result { + self.drm_total_memory().map(|usage| usage as usize) } fn temperature(&self) -> Result { diff --git a/src/utils/process.rs b/src/utils/process.rs index fa24ac06..0680f715 100644 --- a/src/utils/process.rs +++ b/src/utils/process.rs @@ -1,7 +1,7 @@ use anyhow::{bail, Context, Result}; use config::LIBEXECDIR; use log::{debug, error, info}; -use process_data::{pci_slot::PciSlot, GpuUsageStats, Niceness, ProcessData}; +use process_data::{pci_slot::PciSlot, GpuUsageStats, Niceness, NpuUsageStats, ProcessData}; use std::{ collections::BTreeMap, ffi::{OsStr, OsString}, @@ -69,6 +69,7 @@ pub struct Process { pub read_bytes_last: Option, pub write_bytes_last: Option, pub gpu_usage_stats_last: BTreeMap, + pub npu_usage_stats_last: BTreeMap, pub display_name: String, } @@ -152,6 +153,7 @@ impl Process { read_bytes_last, write_bytes_last, gpu_usage_stats_last: Default::default(), + npu_usage_stats_last: Default::default(), display_name, } } @@ -456,6 +458,38 @@ impl Process { .sum() } + #[must_use] + pub fn npu_usage(&self) -> f32 { + let mut returned_npu_usage = 0.0; + for (npu, usage) in &self.data.npu_usage_stats { + if let Some(old_usage) = self.npu_usage_stats_last.get(npu) { + let this_npu_usage = if old_usage.usage == 0 { + 0.0 + } else { + ((usage.usage.saturating_sub(old_usage.usage) as f32) + / (self.data.timestamp.saturating_sub(self.timestamp_last) as f32) + .finite_or_default()) + / 1_000_000.0 + }; + + if this_npu_usage > returned_npu_usage { + returned_npu_usage = this_npu_usage; + } + } + } + + returned_npu_usage + } + + #[must_use] + pub fn npu_mem_usage(&self) -> u64 { + self.data + .npu_usage_stats + .values() + .map(|stats| stats.mem) + .sum() + } + #[must_use] pub fn starttime(&self) -> f64 { self.data.starttime as f64 / *TICK_RATE as f64 From 40fe09b2c60d38098d635c7a884d09a18f77d08c Mon Sep 17 00:00:00 2001 From: nokyan Date: Sat, 14 Dec 2024 22:34:56 +0100 Subject: [PATCH 2/6] Use driver name to differentiate between NPU and GPU usage stats --- lib/process_data/src/lib.rs | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/lib/process_data/src/lib.rs b/lib/process_data/src/lib.rs index 74f9cf80..6a970535 100644 --- a/lib/process_data/src/lib.rs +++ b/lib/process_data/src/lib.rs @@ -26,6 +26,9 @@ const STAT_SYSTEM_CPU_TIME: usize = 14 - STAT_OFFSET; const STAT_NICE: usize = 18 - STAT_OFFSET; const STAT_STARTTIME: usize = 21 - STAT_OFFSET; +const GPU_DRIVER_NAMES: &[&str] = &["amdgpu", "i915"]; +const NPU_DRIVER_NAMES: &[&str] = &["amdxdna_accel_driver"]; + static USERS_CACHE: LazyLock> = LazyLock::new(|| unsafe { uzers::all_users() .map(|user| (user.uid(), user.name().to_string_lossy().to_string())) @@ -567,7 +570,11 @@ impl ProcessData { .and_then(|captures| captures.get(1)) .map(|capture| capture.as_str()); - if driver.is_some() { + if let Some(driver) = driver { + if !NPU_DRIVER_NAMES.contains(&driver) { + bail!("this is not an NPU") + } + let pci_slot = RE_DRM_PDEV .captures(&content) .and_then(|captures| captures.get(1)) @@ -614,7 +621,16 @@ impl ProcessData { .and_then(|captures| captures.get(1)) .and_then(|capture| capture.as_str().parse::().ok()); + let driver = RE_DRM_DRIVER + .captures(&content) + .and_then(|captures| captures.get(1)) + .map(|capture| capture.as_str()); + if let (Some(pci_slot), Some(client_id)) = (pci_slot, client_id) { + if !GPU_DRIVER_NAMES.contains(&driver.unwrap_or_default()) { + bail!("this is not a GPU"); + } + let gfx = RE_DRM_ENGINE_GFX // amd .captures(&content) .and_then(|captures| captures.get(1)) From 6935f679ee003aa2aad287027064da9b04bfe778 Mon Sep 17 00:00:00 2001 From: nokyan Date: Sun, 15 Dec 2024 08:08:44 +0100 Subject: [PATCH 3/6] Small cleanups --- lib/process_data/src/lib.rs | 169 ++++++++++++++++-------------------- 1 file changed, 76 insertions(+), 93 deletions(-) diff --git a/lib/process_data/src/lib.rs b/lib/process_data/src/lib.rs index 6a970535..e9e0ccc9 100644 --- a/lib/process_data/src/lib.rs +++ b/lib/process_data/src/lib.rs @@ -11,8 +11,6 @@ use nvml_wrapper::{Device, Nvml}; use pci_slot::PciSlot; use serde::{Deserialize, Serialize}; use std::collections::{BTreeMap, HashMap, HashSet}; -use std::fs::File; -use std::io::{Read, Write}; use std::os::linux::fs::MetadataExt; use std::path::Path; use std::str::FromStr; @@ -54,8 +52,6 @@ static RE_DRM_DRIVER: Lazy = lazy_regex!(r"drm-driver:\s*(.+)"); static RE_DRM_PDEV: Lazy = lazy_regex!(r"drm-pdev:\s*([0-9A-Fa-f]{4}:[0-9A-Fa-f]{2}:[0-9A-Fa-f]{2}\.[0-9A-Fa-f])"); -static RE_DRM_CLIENT_ID: Lazy = lazy_regex!(r"drm-client-id:\s*(\d+)"); - // AMD only static RE_DRM_ENGINE_NPU_AMDXDNA: Lazy = lazy_regex!(r"drm-engine-npu-amdxdna:\s*(\d+)\s*ns"); @@ -500,24 +496,24 @@ impl ProcessData { seen_fds.insert(fd_num); - if let Ok(stats) = Self::read_gpu_fdinfo(&fdinfo_path) { + if let Ok((pci_slot, stats)) = Self::read_gpu_fdinfo(&fdinfo_path) { return_map - .entry(stats.0) + .entry(pci_slot) .and_modify(|existing_value: &mut GpuUsageStats| { - if stats.1.gfx > existing_value.gfx { - existing_value.gfx = stats.1.gfx; + if stats.gfx > existing_value.gfx { + existing_value.gfx = stats.gfx; } - if stats.1.dec > existing_value.dec { - existing_value.dec = stats.1.dec; + if stats.dec > existing_value.dec { + existing_value.dec = stats.dec; } - if stats.1.enc > existing_value.enc { - existing_value.enc = stats.1.enc; + if stats.enc > existing_value.enc { + existing_value.enc = stats.enc; } - if stats.1.mem > existing_value.mem { - existing_value.mem = stats.1.mem; + if stats.mem > existing_value.mem { + existing_value.mem = stats.mem; } }) - .or_insert(stats.1); + .or_insert(stats); } } @@ -560,10 +556,7 @@ impl ProcessData { } fn read_npu_fdinfo>(fdinfo_path: P) -> Result<(PciSlot, NpuUsageStats)> { - let mut content = String::new(); - let mut fdinfo_file = File::open(fdinfo_path.as_ref())?; - fdinfo_file.read_to_string(&mut content)?; - fdinfo_file.flush()?; + let content = std::fs::read_to_string(fdinfo_path.as_ref())?; let driver = RE_DRM_DRIVER .captures(&content) @@ -605,96 +598,86 @@ impl ProcessData { bail!("unable to find gpu information in this fdinfo"); } - fn read_gpu_fdinfo>(fdinfo_path: P) -> Result<(PciSlot, GpuUsageStats, i64)> { - let mut content = String::new(); - let mut fdinfo_file = File::open(fdinfo_path.as_ref())?; - fdinfo_file.read_to_string(&mut content)?; - fdinfo_file.flush()?; + fn read_gpu_fdinfo>(fdinfo_path: P) -> Result<(PciSlot, GpuUsageStats)> { + let content = std::fs::read_to_string(fdinfo_path.as_ref())?; let pci_slot = RE_DRM_PDEV .captures(&content) .and_then(|captures| captures.get(1)) - .and_then(|capture| PciSlot::from_str(capture.as_str()).ok()); - - let client_id = RE_DRM_CLIENT_ID - .captures(&content) - .and_then(|captures| captures.get(1)) - .and_then(|capture| capture.as_str().parse::().ok()); + .and_then(|capture| PciSlot::from_str(capture.as_str()).ok()) + .context("can't parse PCI slot ID")?; let driver = RE_DRM_DRIVER .captures(&content) .and_then(|captures| captures.get(1)) - .map(|capture| capture.as_str()); - - if let (Some(pci_slot), Some(client_id)) = (pci_slot, client_id) { - if !GPU_DRIVER_NAMES.contains(&driver.unwrap_or_default()) { - bail!("this is not a GPU"); - } - - let gfx = RE_DRM_ENGINE_GFX // amd - .captures(&content) - .and_then(|captures| captures.get(1)) - .and_then(|capture| capture.as_str().parse::().ok()) - .or_else(|| { - // intel - RE_DRM_ENGINE_RENDER - .captures(&content) - .and_then(|captures| captures.get(1)) - .and_then(|capture| capture.as_str().parse::().ok()) - }) - .unwrap_or_default(); + .map(|capture| capture.as_str()) + .unwrap_or_default(); - let compute = RE_DRM_ENGINE_COMPUTE - .captures(&content) - .and_then(|captures| captures.get(1)) - .and_then(|capture| capture.as_str().parse::().ok()) - .unwrap_or_default(); + if !GPU_DRIVER_NAMES.contains(&driver) { + bail!("this is not a GPU"); + } - let enc = RE_DRM_ENGINE_ENC // amd - .captures(&content) - .and_then(|captures| captures.get(1)) - .and_then(|capture| capture.as_str().parse::().ok()) - .or_else(|| { - // intel - RE_DRM_ENGINE_VIDEO - .captures(&content) - .and_then(|captures| captures.get(1)) - .and_then(|capture| capture.as_str().parse::().ok()) - }) - .unwrap_or_default(); + let gfx = RE_DRM_ENGINE_GFX // amd + .captures(&content) + .and_then(|captures| captures.get(1)) + .and_then(|capture| capture.as_str().parse::().ok()) + .or_else(|| { + // intel + RE_DRM_ENGINE_RENDER + .captures(&content) + .and_then(|captures| captures.get(1)) + .and_then(|capture| capture.as_str().parse::().ok()) + }) + .unwrap_or_default(); - let dec = RE_DRM_ENGINE_DEC - .captures(&content) - .and_then(|captures| captures.get(1)) - .and_then(|capture| capture.as_str().parse::().ok()) - .unwrap_or_default(); + let compute = RE_DRM_ENGINE_COMPUTE + .captures(&content) + .and_then(|captures| captures.get(1)) + .and_then(|capture| capture.as_str().parse::().ok()) + .unwrap_or_default(); - let vram = RE_DRM_MEMORY_VRAM - .captures(&content) - .and_then(|captures| captures.get(1)) - .and_then(|capture| capture.as_str().parse::().ok()) - .unwrap_or_default() - .saturating_mul(1024); + let enc = RE_DRM_ENGINE_ENC // amd + .captures(&content) + .and_then(|captures| captures.get(1)) + .and_then(|capture| capture.as_str().parse::().ok()) + .or_else(|| { + // intel + RE_DRM_ENGINE_VIDEO + .captures(&content) + .and_then(|captures| captures.get(1)) + .and_then(|capture| capture.as_str().parse::().ok()) + }) + .unwrap_or_default(); - let gtt = RE_DRM_MEMORY_GTT - .captures(&content) - .and_then(|captures| captures.get(1)) - .and_then(|capture| capture.as_str().parse::().ok()) - .unwrap_or_default() - .saturating_mul(1024); + let dec = RE_DRM_ENGINE_DEC + .captures(&content) + .and_then(|captures| captures.get(1)) + .and_then(|capture| capture.as_str().parse::().ok()) + .unwrap_or_default(); - let stats = GpuUsageStats { - gfx: gfx.saturating_add(compute), - mem: vram.saturating_add(gtt), - enc, - dec, - nvidia: false, - }; + let vram = RE_DRM_MEMORY_VRAM + .captures(&content) + .and_then(|captures| captures.get(1)) + .and_then(|capture| capture.as_str().parse::().ok()) + .unwrap_or_default() + .saturating_mul(1024); - return Ok((pci_slot, stats, client_id)); - } + let gtt = RE_DRM_MEMORY_GTT + .captures(&content) + .and_then(|captures| captures.get(1)) + .and_then(|capture| capture.as_str().parse::().ok()) + .unwrap_or_default() + .saturating_mul(1024); + + let stats = GpuUsageStats { + gfx: gfx.saturating_add(compute), + mem: vram.saturating_add(gtt), + enc, + dec, + nvidia: false, + }; - bail!("unable to find gpu information in this fdinfo"); + return Ok((pci_slot, stats)); } fn nvidia_gpu_stats_all(pid: i32) -> BTreeMap { From 1514eee49ab5dbdd38f62a264070b1d8623c214a Mon Sep 17 00:00:00 2001 From: nokyan Date: Sun, 15 Dec 2024 08:20:49 +0100 Subject: [PATCH 4/6] Don't use '?' in for loops --- lib/process_data/src/lib.rs | 63 +++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 34 deletions(-) diff --git a/lib/process_data/src/lib.rs b/lib/process_data/src/lib.rs index e9e0ccc9..506a3590 100644 --- a/lib/process_data/src/lib.rs +++ b/lib/process_data/src/lib.rs @@ -485,8 +485,7 @@ impl ProcessData { let mut seen_fds = HashSet::new(); let mut return_map = BTreeMap::new(); - for entry in std::fs::read_dir(fdinfo_dir)? { - let entry = entry?; + for entry in std::fs::read_dir(fdinfo_dir)?.flatten() { let fdinfo_path = entry.path(); let (plausible, fd_num) = Self::drm_fdinfo_plausible(&fdinfo_path, pid, &seen_fds); @@ -526,8 +525,7 @@ impl ProcessData { let mut seen_fds = HashSet::new(); let mut return_map = BTreeMap::new(); - for entry in std::fs::read_dir(fdinfo_dir)? { - let entry = entry?; + for entry in std::fs::read_dir(fdinfo_dir)?.flatten() { let fdinfo_path = entry.path(); let (plausible, fd_num) = Self::drm_fdinfo_plausible(&fdinfo_path, pid, &seen_fds); @@ -558,44 +556,41 @@ impl ProcessData { fn read_npu_fdinfo>(fdinfo_path: P) -> Result<(PciSlot, NpuUsageStats)> { let content = std::fs::read_to_string(fdinfo_path.as_ref())?; - let driver = RE_DRM_DRIVER + let pci_slot = RE_DRM_PDEV .captures(&content) .and_then(|captures| captures.get(1)) - .map(|capture| capture.as_str()); - - if let Some(driver) = driver { - if !NPU_DRIVER_NAMES.contains(&driver) { - bail!("this is not an NPU") - } + .and_then(|capture| PciSlot::from_str(capture.as_str()).ok()) + .context("can't parse PCI slot ID")?; - let pci_slot = RE_DRM_PDEV - .captures(&content) - .and_then(|captures| captures.get(1)) - .and_then(|capture| PciSlot::from_str(capture.as_str()).ok()) - .unwrap_or_default(); + let driver = RE_DRM_DRIVER + .captures(&content) + .and_then(|captures| captures.get(1)) + .map(|capture| capture.as_str()) + .unwrap_or_default(); - let usage = RE_DRM_ENGINE_NPU_AMDXDNA - .captures(&content) - .and_then(|captures| captures.get(1)) - .and_then(|capture| capture.as_str().parse::().ok()) - .unwrap_or_default(); + if !NPU_DRIVER_NAMES.contains(&driver) { + bail!("this is not an NPU") + } - let total_memory = RE_DRM_TOTAL_MEMORY - .captures(&content) - .and_then(|captures| captures.get(1)) - .and_then(|capture| capture.as_str().parse::().ok()) - .unwrap_or_default() - .saturating_mul(1024); + let usage = RE_DRM_ENGINE_NPU_AMDXDNA + .captures(&content) + .and_then(|captures| captures.get(1)) + .and_then(|capture| capture.as_str().parse::().ok()) + .unwrap_or_default(); - let stats = NpuUsageStats { - usage, - mem: total_memory, - }; + let total_memory = RE_DRM_TOTAL_MEMORY + .captures(&content) + .and_then(|captures| captures.get(1)) + .and_then(|capture| capture.as_str().parse::().ok()) + .unwrap_or_default() + .saturating_mul(1024); - return Ok((pci_slot, stats)); - } + let stats = NpuUsageStats { + usage, + mem: total_memory, + }; - bail!("unable to find gpu information in this fdinfo"); + return Ok((pci_slot, stats)); } fn read_gpu_fdinfo>(fdinfo_path: P) -> Result<(PciSlot, GpuUsageStats)> { From 1d9b04e8859cedf2c24880ea76632353a04706cd Mon Sep 17 00:00:00 2001 From: nokyan Date: Sun, 15 Dec 2024 09:58:32 +0100 Subject: [PATCH 5/6] Add debug and trace prints for resources-processes --- Cargo.lock | 2 + lib/process_data/Cargo.toml | 2 + lib/process_data/src/lib.rs | 117 +++++++++++++++++++++++++++------ src/bin/resources-processes.rs | 11 ++++ src/utils/gpu/nvidia.rs | 12 +--- src/utils/process.rs | 17 ++++- 6 files changed, 128 insertions(+), 33 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e946152c..8f347de9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1278,9 +1278,11 @@ dependencies = [ "glob", "lazy-regex", "libc", + "log", "num_cpus", "nutype", "nvml-wrapper", + "pretty_env_logger", "serde", "syscalls", "sysconf", diff --git a/lib/process_data/Cargo.toml b/lib/process_data/Cargo.toml index 52489eb8..9de3bccb 100644 --- a/lib/process_data/Cargo.toml +++ b/lib/process_data/Cargo.toml @@ -21,9 +21,11 @@ anyhow = "1.0.94" glob = "0.3.1" lazy-regex = "3.3.0" libc = "0.2.167" +log = "0.4.22" num_cpus = "1.16.0" nutype = { version = "0.5.0", features = ["serde"] } nvml-wrapper = "0.10.0" +pretty_env_logger = "0.5" serde = { version = "1.0.215", features = ["serde_derive"] } syscalls = { version = "0.6.18", features = ["all"] } sysconf = "0.3.4" diff --git a/lib/process_data/src/lib.rs b/lib/process_data/src/lib.rs index 506a3590..e2ffbf9c 100644 --- a/lib/process_data/src/lib.rs +++ b/lib/process_data/src/lib.rs @@ -3,6 +3,7 @@ pub mod pci_slot; use anyhow::{bail, Context, Result}; use glob::glob; use lazy_regex::{lazy_regex, Lazy, Regex}; +use log::{debug, trace, warn}; use nutype::nutype; use nvml_wrapper::enums::device::UsedGpuMemory; use nvml_wrapper::error::NvmlError; @@ -27,10 +28,18 @@ const STAT_STARTTIME: usize = 21 - STAT_OFFSET; const GPU_DRIVER_NAMES: &[&str] = &["amdgpu", "i915"]; const NPU_DRIVER_NAMES: &[&str] = &["amdxdna_accel_driver"]; +const MAJOR: u32 = 226; + static USERS_CACHE: LazyLock> = LazyLock::new(|| unsafe { - uzers::all_users() - .map(|user| (user.uid(), user.name().to_string_lossy().to_string())) - .collect() + debug!("Initializing users cache…"); + let users: HashMap = uzers::all_users() + .map(|user| { + trace!("Found user {}", user.name().to_string_lossy()); + (user.uid(), user.name().to_string_lossy().to_string()) + }) + .collect(); + debug!("Found {} users", users.len()); + users }); static PAGESIZE: LazyLock = LazyLock::new(sysconf::pagesize); @@ -82,16 +91,25 @@ static RE_DRM_ENGINE_VIDEO: Lazy = lazy_regex!(r"drm-engine-video:\s*(\d+ static RE_DRM_TOTAL_MEMORY: Lazy = lazy_regex!(r"drm-total-memory:\s*(\d+)\s*KiB"); -static NVML: Lazy> = Lazy::new(Nvml::init); +static NVML: Lazy> = Lazy::new(|| { + debug!("Initializing connection to NVML…"); + Nvml::init().inspect_err(|err| warn!("Unable to connect to NVML: {err}")) +}); static NVML_DEVICES: Lazy> = Lazy::new(|| { if let Ok(nvml) = NVML.as_ref() { + debug!("Looking for NVIDIA devices…"); let device_count = nvml.device_count().unwrap_or(0); let mut return_vec = Vec::with_capacity(device_count as usize); for i in 0..device_count { if let Ok(gpu) = nvml.device_by_index(i) { if let Ok(pci_slot) = gpu.pci_info().map(|pci_info| pci_info.bus_id) { let pci_slot = PciSlot::from_str(&pci_slot).unwrap(); + debug!( + "Found {} at {}", + gpu.name().unwrap_or("N/A".into()), + pci_slot + ); return_vec.push((pci_slot, gpu)); } } @@ -253,13 +271,6 @@ impl ProcessData { pub fn try_from_path>(proc_path: P) -> Result { let proc_path = proc_path.as_ref(); - let stat = std::fs::read_to_string(proc_path.join("stat"))?; - let statm = std::fs::read_to_string(proc_path.join("statm"))?; - let status = std::fs::read_to_string(proc_path.join("status"))?; - let comm = std::fs::read_to_string(proc_path.join("comm"))?; - let commandline = std::fs::read_to_string(proc_path.join("cmdline"))?; - let io = std::fs::read_to_string(proc_path.join("io")).ok(); - let pid = proc_path .file_name() .context("proc_path terminates in ..")? @@ -267,6 +278,23 @@ impl ProcessData { .context("can't turn OsStr to str")? .parse()?; + trace!("Inspecting process {pid}…"); + + trace!("Reading info files…"); + let stat = std::fs::read_to_string(proc_path.join("stat")) + .inspect_err(|err| trace!("Error reading 'stat': {err}"))?; + let statm = std::fs::read_to_string(proc_path.join("statm")) + .inspect_err(|err| trace!("Error reading 'statm': {err}"))?; + let status = std::fs::read_to_string(proc_path.join("status")) + .inspect_err(|err| trace!("Error reading 'status': {err}"))?; + let comm = std::fs::read_to_string(proc_path.join("comm")) + .inspect_err(|err| trace!("Error reading 'comm': {err}"))?; + let commandline = std::fs::read_to_string(proc_path.join("cmdline")) + .inspect_err(|err| trace!("Error reading 'cmdline': {err}"))?; + let io = std::fs::read_to_string(proc_path.join("io")) + .inspect_err(|err| trace!("Error reading 'io': {err}")) + .ok(); + let user = USERS_CACHE .get(&Self::get_uid(proc_path)?) .cloned() @@ -275,7 +303,8 @@ impl ProcessData { let stat = stat .split(')') // since we don't care about the pid or the executable name, split after the executable name to make our life easier .last() - .context("stat doesn't have ')'")? + .context("stat doesn't have ')'") + .inspect_err(|err| trace!("Can't parse 'stat': {err}"))? .split(' ') .skip(1) // the first element would be a space, let's ignore that .collect::>(); @@ -288,23 +317,28 @@ impl ProcessData { let parent_pid = stat .get(STAT_PARENT_PID) .context("wrong stat file format") - .and_then(|x| x.parse().context("couldn't parse stat file content"))?; + .and_then(|x| x.parse().context("couldn't parse stat file content to int")) + .inspect_err(|err| trace!("Can't parse parent pid from 'stat': {err}"))?; let user_cpu_time = stat .get(STAT_USER_CPU_TIME) .context("wrong stat file format") - .and_then(|x| x.parse().context("couldn't parse stat file content"))?; + .and_then(|x| x.parse().context("couldn't parse stat file content to int")) + .inspect_err(|err| trace!("Can't parse user cpu time from 'stat': {err}"))?; let system_cpu_time = stat .get(STAT_SYSTEM_CPU_TIME) .context("wrong stat file format") - .and_then(|x| x.parse().context("couldn't parse stat file content"))?; + .and_then(|x| x.parse().context("couldn't parse stat file content to int")) + .inspect_err(|err| trace!("Can't parse system cpu time from 'stat': {err}"))?; let nice = stat .get(STAT_NICE) .context("wrong stat file format") - .and_then(|x| x.parse().context("couldn't parse stat file content"))?; + .and_then(|x| x.parse().context("couldn't parse stat file content to int")) + .inspect_err(|err| trace!("Can't parse nice from 'stat': {err}"))?; let starttime = stat .get(STAT_STARTTIME) .context("wrong stat file format") - .and_then(|x| x.parse().context("couldn't parse stat file content"))?; + .and_then(|x| x.parse().context("couldn't parse stat file content to int")) + .inspect_err(|err| trace!("Can't parse start time from 'stat': {err}"))?; let mut affinity = Vec::with_capacity(*NUM_CPUS); RE_AFFINITY @@ -340,7 +374,8 @@ impl ProcessData { .and_then(|x| { x.parse::() .context("couldn't parse statm file content") - })? + }) + .inspect_err(|err| trace!("Can't parse memory usage from 'statm': {err}"))? .saturating_sub( statm .get(2) @@ -353,6 +388,7 @@ impl ProcessData { .saturating_mul(*PAGESIZE); let cgroup = std::fs::read_to_string(proc_path.join("cgroup")) + .inspect_err(|err| trace!("Can't read cgroup: {err}")) .ok() .and_then(Self::sanitize_cgroup); @@ -408,6 +444,7 @@ impl ProcessData { } fn gpu_usage_stats(proc_path: &Path, pid: i32) -> BTreeMap { + trace!("Gathering GPU stats…"); let nvidia_stats = Self::nvidia_gpu_stats_all(pid); let mut other_stats = Self::other_gpu_usage_stats(proc_path, pid).unwrap_or_default(); other_stats.extend(nvidia_stats); @@ -431,22 +468,30 @@ impl ProcessData { .parse::() .unwrap_or(0); if fd_num <= 2 { + trace!( + "fdinfo {fd_num} deemed as not plausible. Reason: fd_num ≤ 2 (probably std stream)" + ); return (false, fd_num); } let _file = std::fs::File::open(&fdinfo_path); if _file.is_err() { - return (true, fd_num); + trace!("fdinfo {fd_num} deemed as not plausible. Reason: File can't be opened"); + return (false, fd_num); } let file = _file.unwrap(); let _metadata = file.metadata(); if _metadata.is_err() { - return (true, fd_num); + trace!( + "fdinfo {fd_num} deemed as not plausible. Reason: File's metadata can't be read" + ); + return (false, fd_num); } let metadata = _metadata.unwrap(); if !metadata.is_file() { + trace!("fdinfo {fd_num} deemed as not plausible. Reason: Not a file"); return (false, fd_num); } @@ -455,8 +500,15 @@ impl ProcessData { let fd_path = fdinfo_path.to_str().map(|s| s.replace("fdinfo", "fd")); if let Some(fd_path) = fd_path { if let Ok(fd_metadata) = std::fs::metadata(fd_path) { + if (fd_metadata.st_mode() & libc::S_IFMT) != libc::S_IFCHR { + trace!("fdinfo {fd_num} deemed as not plausible. Reason: Wrong st_mode"); + return (false, fd_num); + } let major = unsafe { libc::major(fd_metadata.st_rdev()) }; - if (fd_metadata.st_mode() & libc::S_IFMT) != libc::S_IFCHR || major != 226 { + if major != MAJOR { + trace!( + "fdinfo {fd_num} deemed as not plausible. Reason: Wrong major (expected: {MAJOR}, got: {major})" + ); return (false, fd_num); } } @@ -470,9 +522,11 @@ impl ProcessData { == 0 }); if not_unique { + trace!("fdinfo {fd_num} deemed as not plausible. Reason: kcmp indicated that we've already seen this file"); return (false, fd_num); } + trace!("fdinfo {fd_num} deemed as plausible"); (true, fd_num) } @@ -480,6 +534,7 @@ impl ProcessData { proc_path: &Path, pid: i32, ) -> Result> { + trace!("Gathering other GPU stats…"); let fdinfo_dir = proc_path.join("fdinfo"); let mut seen_fds = HashSet::new(); @@ -520,6 +575,7 @@ impl ProcessData { } fn npu_usage_stats(proc_path: &Path, pid: i32) -> Result> { + trace!("Gathering NPU stats…"); let fdinfo_dir = proc_path.join("fdinfo"); let mut seen_fds = HashSet::new(); @@ -554,6 +610,10 @@ impl ProcessData { } fn read_npu_fdinfo>(fdinfo_path: P) -> Result<(PciSlot, NpuUsageStats)> { + trace!( + "Reading and parsing {} for NPU stats…", + fdinfo_path.as_ref().to_string_lossy() + ); let content = std::fs::read_to_string(fdinfo_path.as_ref())?; let pci_slot = RE_DRM_PDEV @@ -569,6 +629,7 @@ impl ProcessData { .unwrap_or_default(); if !NPU_DRIVER_NAMES.contains(&driver) { + trace!("Driver '{driver}' is not known to be NPU-related, skipping"); bail!("this is not an NPU") } @@ -590,10 +651,16 @@ impl ProcessData { mem: total_memory, }; + trace!("Success reading GPU data for {pci_slot}: {stats:?}"); + return Ok((pci_slot, stats)); } fn read_gpu_fdinfo>(fdinfo_path: P) -> Result<(PciSlot, GpuUsageStats)> { + trace!( + "Reading and parsing {} for GPU stats…", + fdinfo_path.as_ref().to_string_lossy() + ); let content = std::fs::read_to_string(fdinfo_path.as_ref())?; let pci_slot = RE_DRM_PDEV @@ -609,6 +676,7 @@ impl ProcessData { .unwrap_or_default(); if !GPU_DRIVER_NAMES.contains(&driver) { + trace!("Driver {driver} is not known to be GPU-related, skipping"); bail!("this is not a GPU"); } @@ -672,10 +740,14 @@ impl ProcessData { nvidia: false, }; + trace!("Success reading GPU data for {pci_slot}: {stats:?}"); + return Ok((pci_slot, stats)); } fn nvidia_gpu_stats_all(pid: i32) -> BTreeMap { + trace!("Gathering NVIDIA GPU stats…"); + let mut return_map = BTreeMap::new(); for (pci_slot, _) in NVML_DEVICES.iter() { @@ -688,6 +760,7 @@ impl ProcessData { } fn nvidia_gpu_stats(pid: i32, pci_slot: PciSlot) -> Result { + trace!("Gathering GPU stats for NVIDIA GPU at {pci_slot}…"); let this_process_stats = NVIDIA_PROCESSES_STATS .read() .unwrap() @@ -722,6 +795,7 @@ impl ProcessData { } fn nvidia_process_infos() -> HashMap> { + trace!("Refreshing NVIDIA process infos…"); let mut return_map = HashMap::new(); for (pci_slot, gpu) in NVML_DEVICES.iter() { @@ -735,6 +809,7 @@ impl ProcessData { } fn nvidia_process_stats() -> HashMap> { + trace!("Refreshing NVIDIA process stats…"); let mut return_map = HashMap::new(); for (pci_slot, gpu) in NVML_DEVICES.iter() { diff --git a/src/bin/resources-processes.rs b/src/bin/resources-processes.rs index 64569716..154d01ad 100644 --- a/src/bin/resources-processes.rs +++ b/src/bin/resources-processes.rs @@ -1,4 +1,5 @@ use anyhow::Result; +use log::{info, trace}; use process_data::ProcessData; use ron::ser::PrettyConfig; use std::io::{Read, Write}; @@ -18,6 +19,11 @@ struct Args { } fn main() -> Result<()> { + // Initialize logger + pretty_env_logger::init(); + + info!("Starting resources-processes…"); + let args = Args::parse(); if args.once { @@ -29,12 +35,14 @@ fn main() -> Result<()> { let mut buffer = [0; 1]; std::io::stdin().read_exact(&mut buffer)?; + trace!("Received character"); output(args.ron)?; } } fn output(ron: bool) -> Result<()> { + trace!("Gathering process data…"); let data = ProcessData::all_process_data()?; let encoded = if ron { @@ -50,10 +58,13 @@ fn output(ron: bool) -> Result<()> { let stdout = std::io::stdout(); let mut handle = stdout.lock(); + trace!("Sending content length ({})…", encoded.len()); handle.write_all(&len_byte_array)?; + trace!("Sending content…"); handle.write_all(&encoded)?; + trace!("Flushing…"); handle.flush()?; Ok(()) } diff --git a/src/utils/gpu/nvidia.rs b/src/utils/gpu/nvidia.rs index f1668f33..4f47cfd4 100644 --- a/src/utils/gpu/nvidia.rs +++ b/src/utils/gpu/nvidia.rs @@ -10,15 +10,9 @@ use process_data::pci_slot::PciSlot; use std::{path::PathBuf, sync::LazyLock}; static NVML: LazyLock> = LazyLock::new(|| { - let nvml = Nvml::init(); - - if let Err(error) = nvml.as_ref() { - warn!("Connection to NVML failed, reason: {error}"); - } else { - debug!("Successfully connected to NVML"); - } - - nvml + Nvml::init() + .inspect_err(|err| warn!("Unable to connect to NVML: {err}")) + .inspect(|_| debug!("Successfully connected to NVML")) }); use crate::utils::pci::Device; diff --git a/src/utils/process.rs b/src/utils/process.rs index 0680f715..bf228d85 100644 --- a/src/utils/process.rs +++ b/src/utils/process.rs @@ -35,18 +35,29 @@ static OTHER_PROCESS: LazyLock> = LazyLock::new let child = if *IS_FLATPAK { debug!("Spawning resources-processes in Flatpak mode ({proxy_path})"); Command::new(FLATPAK_SPAWN) - .args(["--host", proxy_path.as_str()]) + .args([ + &format!( + "--env=RUST_LOG={}", + std::env::var("RUST_LOG=resources").unwrap_or("warn".into()) + ), + "--host", + proxy_path.as_str(), + ]) .stdin(Stdio::piped()) .stdout(Stdio::piped()) - .stderr(Stdio::null()) + .stderr(Stdio::inherit()) .spawn() .unwrap() } else { debug!("Spawning resources-processes in native mode ({proxy_path})"); Command::new(proxy_path) + .arg(&format!( + "--env=RUST_LOG={}", + std::env::var("RUST_LOG=resources").unwrap_or("warn".into()) + )) .stdin(Stdio::piped()) .stdout(Stdio::piped()) - .stderr(Stdio::null()) + .stderr(Stdio::inherit()) .spawn() .unwrap() }; From 5fb2982d8ad721779f62b4bfda53efb2a23d0d58 Mon Sep 17 00:00:00 2001 From: nokyan Date: Sun, 15 Dec 2024 10:11:49 +0100 Subject: [PATCH 6/6] Small typo --- lib/process_data/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/process_data/src/lib.rs b/lib/process_data/src/lib.rs index e2ffbf9c..477e0250 100644 --- a/lib/process_data/src/lib.rs +++ b/lib/process_data/src/lib.rs @@ -651,7 +651,7 @@ impl ProcessData { mem: total_memory, }; - trace!("Success reading GPU data for {pci_slot}: {stats:?}"); + trace!("Success reading NPU data for {pci_slot}: {stats:?}"); return Ok((pci_slot, stats)); }