//! resource_tracker/collector/gpu.rs
//!
//! GPU metrics collection: NVIDIA via NVML, AMD via libamdgpu_top.
1use crate::metrics::GpuMetrics;
2use libamdgpu_top::{
3    AMDGPU::{GpuMetrics as AmdHwMetrics, MetricsInfo},
4    DevicePath,
5    stat::{FdInfoStat, GpuActivity, ProcInfo, update_index_by_all_proc},
6};
7use nvml_wrapper::{
8    Nvml,
9    enum_wrappers::device::{Clock, TemperatureSensor},
10    enums::device::UsedGpuMemory,
11};
12use std::collections::{HashMap, HashSet};
13use std::path::{Path, PathBuf};
14use std::time::Duration;
15
16type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;
17
/// Collects per-GPU metrics from NVIDIA (via NVML) and AMD (via libamdgpu_top).
///
/// Both backends load their native libraries at runtime:
/// - NVML via `libloading` (`libnvidia-ml.so`) — absent on non-NVIDIA hosts.
/// - libdrm via the `libdrm_dynamic_loading` feature — absent on non-AMD hosts.
///
/// On a CPU-only host `collect()` returns an empty Vec with no error.
pub struct GpuCollector {
    /// NVML handle; `None` when init fails (no NVIDIA driver/library on host).
    nvml: Option<Nvml>,
    /// Per-process fdinfo state for AMD GPU utilization delta tracking.
    /// Populated lazily on first AMD host detection.
    amd_fdinfo: Option<FdInfoStat>,
}
31
32impl GpuCollector {
33    pub fn new() -> Self {
34        Self {
35            nvml: Nvml::init().ok(),
36            amd_fdinfo: None,
37        }
38    }
39
40    pub fn collect(&self) -> Result<Vec<GpuMetrics>> {
41        let mut metrics = Vec::new();
42        self.collect_nvidia(&mut metrics);
43        self.collect_amd(&mut metrics);
44        Ok(metrics)
45    }
46
    /// Return `(process_gpu_vram_mib, process_gpu_usage, process_gpu_utilized)` for the given PIDs.
    ///
    /// `pids` is the tracked process tree (root + descendants) as u32 values.
    /// `interval` is the elapsed time since the previous call; it is used as
    /// the denominator for the AMD fdinfo utilization delta.
    ///
    /// NVIDIA: queries NVML running-compute and running-graphics process lists
    /// for each device; sums `used_gpu_memory` for matched PIDs.
    /// SM utilization is sourced from `nvmlDeviceGetProcessUtilization`; the
    /// latest sample per PID is taken, summed across all matched PIDs and devices,
    /// then divided by 100 to yield fractional GPUs (e.g. 0.5 = half a GPU).
    ///
    /// AMD: reads `/proc/{pid}/fdinfo` for each PID, parses `drm-memory-vram`,
    /// `drm-pdev`, and `drm-engine-gfx` from DRM fdinfo entries (Linux kernel >= 5.17).
    /// GFX utilization is computed via `FdInfoStat` delta tracking and normalized
    /// to fractional GPUs.
    ///
    /// Returns `(None, None, None)` when no GPU is present on the host.
    /// Returns `(Some(0.0), Some(0.0), Some(0))` when a GPU is present but the
    /// process tree has no allocations.
    pub fn process_gpu_info(
        &mut self,
        pids: &[u32],
        interval: Duration,
    ) -> (Option<f64>, Option<f64>, Option<u32>) {
        let mut total_vram_bytes: u64 = 0;
        // Sum of raw per-process utilization values (0-100 per process);
        // divided by 100 at the end to yield fractional GPUs.
        let mut total_sm_util: f64 = 0.0;
        // Number of distinct devices with at least one allocation from the tree.
        let mut n_utilized: u32 = 0;
        // Whether any GPU backend is present at all (drives the None/Some split).
        let mut any_gpu = false;
        let mut has_nvml = false;
        let mut has_amd_util = false;

        // --- NVIDIA via NVML -------------------------------------------------
        if let Some(ref nvml) = self.nvml {
            any_gpu = true;
            has_nvml = true;
            let pid_set: HashSet<u32> = pids.iter().copied().collect();
            let count = nvml.device_count().unwrap_or(0);

            (0..count).for_each(|i| {
                let Ok(device) = nvml.device_by_index(i) else {
                    return;
                };
                // Consider both compute (CUDA) and graphics process lists.
                let procs: Vec<_> = device
                    .running_compute_processes()
                    .unwrap_or_default()
                    .into_iter()
                    .chain(device.running_graphics_processes().unwrap_or_default())
                    .collect();

                let mut device_vram: u64 = 0;
                let mut found = false;
                procs
                    .iter()
                    .filter(|p| pid_set.contains(&p.pid))
                    .for_each(|p| {
                        found = true;
                        if let UsedGpuMemory::Used(bytes) = p.used_gpu_memory {
                            device_vram += bytes;
                        }
                    });

                if found {
                    n_utilized += 1;
                    total_vram_bytes += device_vram;
                }

                // Per-process SM utilization: take the latest sample per PID
                // (nvmlDeviceGetProcessUtilization; no accounting mode required).
                let util_samples = device.process_utilization_stats(0u64).unwrap_or_default();
                let mut latest_sm: HashMap<u32, (u64, u32)> = HashMap::new();
                for s in &util_samples {
                    if pid_set.contains(&s.pid) {
                        let e = latest_sm.entry(s.pid).or_insert((0, 0));
                        if s.timestamp > e.0 {
                            *e = (s.timestamp, s.sm_util);
                        }
                    }
                }
                for (_, sm) in latest_sm.values() {
                    total_sm_util += f64::from(*sm);
                }
            });
        }

        // --- AMD via /proc/pid/fdinfo ----------------------------------------
        // DRM fdinfo (kernel >= 5.17): each open DRM fd exposes drm-memory-vram,
        // drm-pdev, and drm-engine-gfx so we can attribute VRAM and GFX engine
        // utilization per process and per device.
        if std::path::Path::new("/sys/module/amdgpu").exists() {
            any_gpu = true;
            has_amd_util = true;

            let device_paths = DevicePath::get_device_path_list();

            // Collect PCI addresses of all known AMD devices (lowercase for
            // case-insensitive comparison with kernel fdinfo drm-pdev values).
            let amd_pci_addrs: HashSet<String> = device_paths
                .iter()
                .map(|dp| format!("{}", dp.pci).to_lowercase())
                .collect();

            // Track which PCI addresses have any VRAM allocated by the tree.
            let mut utilized_pcis: HashSet<String> = HashSet::new();

            pids.iter().for_each(|&pid| {
                let fdinfo_dir = format!("/proc/{pid}/fdinfo");
                // Process may have exited or be unreadable; skip silently.
                let Ok(entries) = std::fs::read_dir(&fdinfo_dir) else {
                    return;
                };
                entries.filter_map(|e| e.ok()).for_each(|entry| {
                    let Ok(content) = std::fs::read_to_string(entry.path()) else {
                        return;
                    };

                    // Only process amdgpu DRM file descriptors.
                    if !content
                        .lines()
                        .any(|l| l.starts_with("drm-driver:") && l.contains("amdgpu"))
                    {
                        return;
                    }

                    // Match drm-pdev against our known AMD GPU list.
                    let pdev = content
                        .lines()
                        .find(|l| l.starts_with("drm-pdev:"))
                        .and_then(|l| l.split_whitespace().nth(1))
                        .map(|s| s.to_lowercase());
                    let Some(pdev) = pdev else { return };
                    if !amd_pci_addrs.contains(&pdev) {
                        return;
                    }

                    // Parse drm-memory-vram (value in KiB, unit label "KiB").
                    if let Some(kib) = content
                        .lines()
                        .find(|l| l.starts_with("drm-memory-vram:"))
                        .and_then(|l| l.split_whitespace().nth(1))
                        .and_then(|v| v.parse::<u64>().ok())
                    {
                        total_vram_bytes += kib * 1024;
                        utilized_pcis.insert(pdev.clone());
                    }
                });
            });

            n_utilized += u32::try_from(utilized_pcis.len()).unwrap_or(0);

            // Per-process GFX engine utilization via FdInfoStat delta tracking.
            // FdInfoStat accumulates cumulative ns on first call and computes %
            // utilization on subsequent calls using the interval as the denominator.
            let fdinfo = self.amd_fdinfo.get_or_insert_with(FdInfoStat::default);
            fdinfo.interval = interval;

            // Collect the DRM render+card paths for all AMD devices so
            // update_index_by_all_proc can identify which fds belong to AMD GPUs.
            let dev_paths: Vec<PathBuf> = device_paths
                .iter()
                .flat_map(|dp| [dp.render.clone(), dp.card.clone()])
                .collect();

            // Build ProcInfo only for the tracked PIDs.
            let pids_i32: Vec<i32> = pids.iter().filter_map(|&p| i32::try_from(p).ok()).collect();
            let mut proc_infos: Vec<ProcInfo> = Vec::new();
            update_index_by_all_proc(&mut proc_infos, &dev_paths, &pids_i32);

            fdinfo.get_all_proc_usage(&proc_infos);

            // Sum gfx utilization across all matched PIDs.
            for pu in &fdinfo.proc_usage {
                // Clamp negative values to 0 before accumulating.
                total_sm_util += f64::from(i32::try_from(pu.usage.gfx).unwrap_or(0).max(0));
            }
        }

        if !any_gpu {
            return (None, None, None);
        }

        let vram_mib = total_vram_bytes as f64 / 1_048_576.0;
        // Normalize to fractional GPUs (same convention as process_cores_used):
        // 1.0 = one GPU fully utilized at 100%.  Raw sm_util values are 0-100.
        let usage_pct = if has_nvml || has_amd_util {
            Some(total_sm_util / 100.0)
        } else {
            None
        };
        (Some(vram_mib), usage_pct, Some(n_utilized))
    }
234
    /// Return `(process_gpu_vram_mib, process_gpu_usage, process_gpu_utilized)` summed
    /// across ALL GPU processes on the host (no PID filter).  Used when tracking is not
    /// scoped to a specific PID so the full system-wide GPU allocation is
    /// reported in the `process_` CSV columns.
    ///
    /// NVIDIA: sums `used_gpu_memory` for every running compute and graphics
    /// process across all devices; counts each device that has at least one
    /// process as "utilized".
    /// SM utilization is summed across the latest sample per PID from
    /// `nvmlDeviceGetProcessUtilization`.
    ///
    /// AMD: reads `mem_info_vram_used` from sysfs for each device (the kernel
    /// already provides the system-wide VRAM used value there).
    /// System-wide GFX utilization is summed across all processes holding AMD
    /// DRM fds, via `FdInfoStat` delta tracking with `interval` as denominator.
    ///
    /// Returns `(None, None, None)` when no GPU is present on the host.
    pub fn all_gpu_process_info(
        &mut self,
        interval: Duration,
    ) -> (Option<f64>, Option<f64>, Option<u32>) {
        let mut total_vram_bytes: u64 = 0;
        // Sum of raw per-process utilization values (0-100 per process);
        // divided by 100 at the end to yield fractional GPUs.
        let mut total_sm_util: f64 = 0.0;
        let mut n_utilized: u32 = 0;
        let mut any_gpu = false;
        let mut has_nvml = false;
        let mut has_amd_util = false;

        // --- NVIDIA via NVML -------------------------------------------------
        if let Some(ref nvml) = self.nvml {
            any_gpu = true;
            has_nvml = true;
            let count = nvml.device_count().unwrap_or(0);

            (0..count).for_each(|i| {
                let Ok(device) = nvml.device_by_index(i) else {
                    return;
                };
                // Consider both compute (CUDA) and graphics process lists.
                let procs: Vec<_> = device
                    .running_compute_processes()
                    .unwrap_or_default()
                    .into_iter()
                    .chain(device.running_graphics_processes().unwrap_or_default())
                    .collect();

                if procs.is_empty() {
                    return;
                }
                n_utilized += 1;
                procs.iter().for_each(|p| {
                    if let UsedGpuMemory::Used(bytes) = p.used_gpu_memory {
                        total_vram_bytes += bytes;
                    }
                });

                // System-wide SM utilization: latest sample per PID, all processes.
                let util_samples = device.process_utilization_stats(0u64).unwrap_or_default();
                let mut latest_sm: HashMap<u32, (u64, u32)> = HashMap::new();
                for s in &util_samples {
                    let e = latest_sm.entry(s.pid).or_insert((0, 0));
                    if s.timestamp > e.0 {
                        *e = (s.timestamp, s.sm_util);
                    }
                }
                for (_, sm) in latest_sm.values() {
                    total_sm_util += f64::from(*sm);
                }
            });
        }

        // --- AMD via sysfs + fdinfo ------------------------------------------
        if std::path::Path::new("/sys/module/amdgpu").exists() {
            any_gpu = true;
            has_amd_util = true;

            let device_paths = DevicePath::get_device_path_list();

            // VRAM: the kernel exposes the system-wide used value per device.
            device_paths.iter().for_each(|dp| {
                let used = read_sysfs_u64(dp.sysfs_path.join("mem_info_vram_used"));
                if used > 0 {
                    total_vram_bytes += used;
                    n_utilized += 1;
                }
            });

            // System-wide GFX utilization: sum gfx % across all processes on
            // all AMD devices using FdInfoStat delta tracking.
            let fdinfo = self.amd_fdinfo.get_or_insert_with(FdInfoStat::default);
            fdinfo.interval = interval;

            let dev_paths: Vec<PathBuf> = device_paths
                .iter()
                .flat_map(|dp| [dp.render.clone(), dp.card.clone()])
                .collect();

            // Enumerate all processes on the system that have AMD GPU fds open.
            let all_pids = libamdgpu_top::stat::get_process_list();
            let mut proc_infos: Vec<ProcInfo> = Vec::new();
            update_index_by_all_proc(&mut proc_infos, &dev_paths, &all_pids);

            fdinfo.get_all_proc_usage(&proc_infos);

            for pu in &fdinfo.proc_usage {
                // Clamp negative values to 0 before accumulating.
                total_sm_util += f64::from(i32::try_from(pu.usage.gfx).unwrap_or(0).max(0));
            }
        }

        if !any_gpu {
            return (None, None, None);
        }

        let vram_mib = total_vram_bytes as f64 / 1_048_576.0;
        // Normalize to fractional GPUs (same convention as process_cores_used):
        // 1.0 = one GPU fully utilized at 100%.  Raw sm_util values are 0-100.
        let usage_pct = if has_nvml || has_amd_util {
            Some(total_sm_util / 100.0)
        } else {
            None
        };
        (Some(vram_mib), usage_pct, Some(n_utilized))
    }
355
356    // -----------------------------------------------------------------------
357    // NVIDIA — NVML runtime-loaded via libloading
358    // -----------------------------------------------------------------------
359
360    fn collect_nvidia(&self, out: &mut Vec<GpuMetrics>) {
361        let Some(ref nvml) = self.nvml else { return };
362
363        let count = nvml.device_count().unwrap_or(0);
364        let driver_version = nvml.sys_driver_version().unwrap_or_default();
365
366        for i in 0..count {
367            let Ok(device) = nvml.device_by_index(i) else {
368                continue;
369            };
370
371            let name = device.name().unwrap_or_default();
372            let uuid = device.uuid().unwrap_or_else(|_| format!("nvidia-{i}"));
373
374            let utilization_pct = device
375                .utilization_rates()
376                .map(|u| u.gpu as f64)
377                .unwrap_or(0.0);
378
379            let memory = device.memory_info().ok();
380            let vram_total_bytes = memory.as_ref().map(|m| m.total).unwrap_or(0);
381            let vram_used_bytes = memory.as_ref().map(|m| m.used).unwrap_or(0);
382            let vram_used_pct = if vram_total_bytes > 0 {
383                vram_used_bytes as f64 / vram_total_bytes as f64 * 100.0
384            } else {
385                0.0
386            };
387
388            let temperature_celsius = device.temperature(TemperatureSensor::Gpu).unwrap_or(0);
389
390            // NVML reports power in milliwatts; convert to watts.
391            let power_watts = device
392                .power_usage()
393                .map(|mw| mw as f64 / 1000.0)
394                .unwrap_or(0.0);
395
396            let frequency_mhz = device.clock_info(Clock::Graphics).unwrap_or(0);
397
398            let mut detail: HashMap<String, String> = HashMap::new();
399            if !driver_version.is_empty() {
400                detail.insert("driver_version".to_string(), driver_version.clone());
401            }
402            if let Ok(pci) = device.pci_info() {
403                detail.insert("pci_bus_id".to_string(), pci.bus_id);
404            }
405
406            out.push(GpuMetrics {
407                uuid,
408                name,
409                device_type: "GPU".to_string(),
410                host_id: i.to_string(),
411                detail,
412                utilization_pct,
413                vram_total_bytes,
414                vram_used_bytes,
415                vram_used_pct,
416                temperature_celsius,
417                power_watts,
418                frequency_mhz,
419                core_count: None,
420            });
421        }
422    }
423
424    // -----------------------------------------------------------------------
425    // AMD — libdrm runtime-loaded via libdrm_dynamic_loading feature.
426    // Dynamic metrics are read from the hardware gpu_metrics sysfs file;
427    // VRAM is read from per-device sysfs attributes (no DRM ioctl needed).
428    // -----------------------------------------------------------------------
429
    /// Append one `GpuMetrics` entry per AMD device to `out`.
    fn collect_amd(&self, out: &mut Vec<GpuMetrics>) {
        // libamdgpu_top panics when the amdgpu kernel module is not loaded.
        // `catch_unwind` cannot help here because the release profile uses
        // `panic = "abort"`.  Guard by checking the module's sysfs entry
        // before calling into the library at all.
        if !std::path::Path::new("/sys/module/amdgpu").exists() {
            return;
        }

        DevicePath::get_device_path_list()
            .into_iter()
            .for_each(|dp| {
                // VRAM: standard AMD GPU sysfs attributes, always available.
                let vram_total_bytes = read_sysfs_u64(dp.sysfs_path.join("mem_info_vram_total"));
                let vram_used_bytes = read_sysfs_u64(dp.sysfs_path.join("mem_info_vram_used"));
                // Guard against divide-by-zero when the total is unreadable.
                let vram_used_pct = if vram_total_bytes > 0 {
                    vram_used_bytes as f64 / vram_total_bytes as f64 * 100.0
                } else {
                    0.0
                };

                // Hardware gpu_metrics file: preferred source for dynamic metrics.
                let hw = AmdHwMetrics::get_from_sysfs_path(&dp.sysfs_path).ok();

                let utilization_pct = hw
                    .as_ref()
                    .and_then(|m: &AmdHwMetrics| m.get_average_gfx_activity())
                    .map(|u| u as f64)
                    .unwrap_or_else(|| {
                        // Fallback: sysfs gpu_busy_percent (older kernels / APUs).
                        GpuActivity::get_from_sysfs(&dp.sysfs_path).gfx.unwrap_or(0) as f64
                    });

                let frequency_mhz: u32 = hw
                    .as_ref()
                    .and_then(|m: &AmdHwMetrics| m.get_average_gfxclk_frequency())
                    .map(u32::from)
                    .unwrap_or(0);

                // get_temperature_edge() returns millidegrees on some ASICs.
                // Heuristic: values above 1000 are assumed to be millidegrees
                // (no real edge temperature exceeds 1000 °C).
                let temperature_celsius: u32 = hw
                    .as_ref()
                    .and_then(|m: &AmdHwMetrics| m.get_temperature_edge())
                    .map(|t| u32::from(if t > 1000 { t / 1000 } else { t }))
                    .unwrap_or(0);

                // get_average_socket_power() returns whole watts directly.
                let power_watts = hw
                    .as_ref()
                    .and_then(|m: &AmdHwMetrics| m.get_average_socket_power())
                    .map(|w| w as f64)
                    .unwrap_or(0.0);

                // AMD GPUs have no stable UUID; use PCI bus address instead.
                let host_id = format!("{}", dp.pci);

                let mut detail: HashMap<String, String> = HashMap::new();
                detail.insert("pci_bus".to_string(), host_id.clone());
                if let Some(rocm) = libamdgpu_top::get_rocm_version() {
                    detail.insert("rocm_version".to_string(), format!("{rocm:?}"));
                }

                out.push(GpuMetrics {
                    uuid: host_id.clone(),
                    name: dp.device_name.clone(),
                    device_type: "GPU".to_string(),
                    host_id,
                    detail,
                    utilization_pct,
                    vram_total_bytes,
                    vram_used_bytes,
                    vram_used_pct,
                    temperature_celsius,
                    power_watts,
                    frequency_mhz,
                    core_count: None,
                });
            });
    }
509}
510
511// ---------------------------------------------------------------------------
512// Unit tests
513// ---------------------------------------------------------------------------
514
#[cfg(test)]
mod tests {
    use super::*;

    // NOTE: these tests are host-dependent by design.  Each one first probes
    // whether NVML initializes and whether /sys/module/amdgpu exists, then
    // asserts only the invariants valid for that host class (CPU-only vs GPU),
    // so the suite passes on both without mocking hardware.

    // T-GPU-P1: process_gpu_info with an empty PID list must return (None, None, None)
    // on a CPU-only host, or (Some(0.0), Some(_), Some(0)) on a GPU host -- never panic,
    // and always return matching Some/None for vram and utilized.
    #[test]
    fn test_process_gpu_info_empty_pids_consistent() {
        let mut collector = GpuCollector::new();
        let (vram, _usage, utilized) = collector.process_gpu_info(&[], Duration::from_secs(1));
        match (vram, utilized) {
            (None, None) => {} // CPU-only host
            (Some(v), Some(u)) => {
                assert_eq!(v, 0.0, "empty PID list must produce 0.0 VRAM");
                assert_eq!(u, 0, "empty PID list must produce 0 utilized GPUs");
            }
            _ => panic!("vram_mib and gpu_utilized must both be Some or both be None"),
        }
    }

    // T-GPU-P2: process_gpu_info with the current process PID must not panic
    // and must return a consistent shape: (None, None, None) on CPU-only hosts, or
    // (Some(v), Some(_), Some(u)) with v >= 0.0 on GPU hosts.
    #[test]
    fn test_process_gpu_info_real_pid_does_not_panic() {
        let mut collector = GpuCollector::new();
        let pid = std::process::id();
        let (vram, _usage, utilized) = collector.process_gpu_info(&[pid], Duration::from_secs(1));
        match (vram, utilized) {
            (None, None) => {}
            (Some(v), Some(u)) => {
                assert!(v >= 0.0, "vram_mib must be non-negative, got {v}");
                let _ = u; // test process is unlikely to hold GPU allocations
            }
            _ => panic!("vram_mib and gpu_utilized must both be Some or both be None"),
        }
    }

    // T-GPU-P3: on a CPU-only host (no NVML, no /sys/module/amdgpu),
    // any PID list must return (None, None, None).  Skipped on GPU hosts.
    #[test]
    fn test_process_gpu_info_no_gpu_returns_none() {
        let nvml_unavailable = Nvml::init().is_err();
        let amd_absent = !std::path::Path::new("/sys/module/amdgpu").exists();
        if !nvml_unavailable || !amd_absent {
            // Host has a GPU; this test is not applicable.
            return;
        }
        let mut collector = GpuCollector::new();
        let (vram, usage, utilized) =
            collector.process_gpu_info(&[1, 2, 3], Duration::from_secs(1));
        assert_eq!(
            (vram, usage, utilized),
            (None, None, None),
            "CPU-only host must return (None, None, None) for any PID list"
        );
    }

    // T-GPU-A1: all_gpu_process_info() must not panic and must return a
    // consistent shape on any host: (None, None, None) on CPU-only, or
    // (Some(v), Some(_), Some(u)) with v >= 0.0 on GPU hosts.
    #[test]
    fn test_all_gpu_process_info_consistent() {
        let mut collector = GpuCollector::new();
        let (vram, _usage, utilized) = collector.all_gpu_process_info(Duration::from_secs(1));
        match (vram, utilized) {
            (None, None) => {} // CPU-only host
            (Some(v), Some(u)) => {
                assert!(v >= 0.0, "vram_mib must be non-negative, got {v}");
                let _ = u;
            }
            _ => panic!("vram_mib and gpu_utilized must both be Some or both be None"),
        }
    }

    // T-GPU-A2: on a CPU-only host, all_gpu_process_info() must return (None, None, None).
    // Skipped on GPU hosts.
    #[test]
    fn test_all_gpu_process_info_no_gpu_returns_none() {
        let nvml_unavailable = Nvml::init().is_err();
        let amd_absent = !std::path::Path::new("/sys/module/amdgpu").exists();
        if !nvml_unavailable || !amd_absent {
            return;
        }
        let mut collector = GpuCollector::new();
        let result = collector.all_gpu_process_info(Duration::from_secs(1));
        assert_eq!(
            result,
            (None, None, None),
            "CPU-only host must return (None, None, None)"
        );
    }

    // T-GPU-A3: on a GPU host, all_gpu_process_info() must return Some for vram
    // and utilized, with vram_mib >= 0.0.  Skipped on CPU-only hosts.
    #[test]
    fn test_all_gpu_process_info_gpu_host_returns_some() {
        let nvml_available = Nvml::init().is_ok();
        let amd_present = std::path::Path::new("/sys/module/amdgpu").exists();
        if !nvml_available && !amd_present {
            return; // CPU-only host; not applicable
        }
        let mut collector = GpuCollector::new();
        let (vram, _usage, utilized) = collector.all_gpu_process_info(Duration::from_secs(1));
        assert!(vram.is_some(), "GPU host: vram_mib must be Some, got None");
        assert!(
            utilized.is_some(),
            "GPU host: gpu_utilized must be Some, got None"
        );
        assert!(
            vram.unwrap() >= 0.0,
            "GPU host: vram_mib must be non-negative"
        );
    }

    // T-GPU-A4: all_gpu_process_info() must return >= the vram reported for an
    // empty PID list via process_gpu_info() (which returns Some(0.0) on GPU hosts).
    // Verifies that the no-PID path is strictly broader than a zero-PID-set query.
    #[test]
    fn test_all_gpu_process_info_ge_empty_pid_query() {
        let nvml_available = Nvml::init().is_ok();
        let amd_present = std::path::Path::new("/sys/module/amdgpu").exists();
        if !nvml_available && !amd_present {
            return;
        }
        let mut collector = GpuCollector::new();
        let interval = Duration::from_secs(1);
        let (all_vram, _, _) = collector.all_gpu_process_info(interval);
        let (pid_vram, _, _) = collector.process_gpu_info(&[], interval);
        // process_gpu_info(&[]) returns Some(0.0) on a GPU host; all_gpu_process_info
        // must return >= 0.0 (can be 0.0 if no GPU processes are running).
        if let (Some(av), Some(pv)) = (all_vram, pid_vram) {
            assert!(
                av >= pv,
                "all_gpu_process_info vram ({av}) must be >= process_gpu_info([]) vram ({pv})"
            );
        }
    }

    // T-GPU-C1: collect() does not panic and returns Ok on any host.
    #[test]
    fn test_gpu_collect_does_not_panic() {
        let collector = GpuCollector::new();
        let result = collector.collect();
        assert!(
            result.is_ok(),
            "collect() must return Ok on any host, got: {:?}",
            result.err()
        );
    }

    // T-GPU-C2: all returned GpuMetrics entries have non-empty uuid, name, and device_type.
    #[test]
    fn test_gpu_collect_identity_fields_nonempty() {
        let collector = GpuCollector::new();
        let gpus = collector.collect().expect("collect() failed");
        gpus.iter().for_each(|g| {
            assert!(!g.uuid.is_empty(), "uuid must not be empty");
            assert!(
                !g.name.is_empty(),
                "name must not be empty for uuid={}",
                g.uuid
            );
            assert!(
                !g.device_type.is_empty(),
                "device_type must not be empty for uuid={}",
                g.uuid
            );
        });
    }

    // T-GPU-C3: utilization_pct is in range 0.0..=100.0 for all reported GPUs.
    #[test]
    fn test_gpu_collect_utilization_in_range() {
        let collector = GpuCollector::new();
        let gpus = collector.collect().expect("collect() failed");
        gpus.iter().for_each(|g| {
            assert!(
                g.utilization_pct >= 0.0 && g.utilization_pct <= 100.0,
                "utilization_pct out of range for {}: {}",
                g.uuid,
                g.utilization_pct
            );
        });
    }

    // T-GPU-C4: vram_used_bytes does not exceed vram_total_bytes.
    #[test]
    fn test_gpu_collect_vram_used_le_total() {
        let collector = GpuCollector::new();
        let gpus = collector.collect().expect("collect() failed");
        gpus.iter().for_each(|g| {
            assert!(
                g.vram_used_bytes <= g.vram_total_bytes,
                "vram_used_bytes {} > vram_total_bytes {} for {}",
                g.vram_used_bytes,
                g.vram_total_bytes,
                g.uuid
            );
        });
    }
}
718
/// Read a `u64` from a single-line sysfs attribute file.
///
/// Returns 0 on any failure (missing file, unreadable contents, or a value
/// that does not parse as an integer), so callers can treat "unavailable"
/// and "zero" uniformly.
fn read_sysfs_u64(path: impl AsRef<Path>) -> u64 {
    match std::fs::read_to_string(path) {
        Ok(contents) => contents.trim().parse().unwrap_or(0),
        Err(_) => 0,
    }
}
725}