1use crate::metrics::GpuMetrics;
2use libamdgpu_top::{
3 AMDGPU::{GpuMetrics as AmdHwMetrics, MetricsInfo},
4 DevicePath,
5 stat::{FdInfoStat, GpuActivity, ProcInfo, update_index_by_all_proc},
6};
7use nvml_wrapper::{
8 Nvml,
9 enum_wrappers::device::{Clock, TemperatureSensor},
10 enums::device::UsedGpuMemory,
11};
12use std::collections::{HashMap, HashSet};
13use std::path::{Path, PathBuf};
14use std::time::Duration;
15
/// Module-local result alias whose error side is a boxed `std::error::Error`
/// trait object, so fallible functions here can return any error type.
type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;
17
/// Collects GPU metrics from NVIDIA devices (through NVML) and AMD devices
/// (through `libamdgpu_top`, sysfs and `/proc/<pid>/fdinfo`).
pub struct GpuCollector {
    /// NVML handle; `None` when `Nvml::init()` failed at construction time.
    nvml: Option<Nvml>,
    /// AMD fdinfo usage state; starts as `None` and is created on first use by
    /// the per-process query methods, then reused across calls.
    amd_fdinfo: Option<FdInfoStat>,
}
31
32impl GpuCollector {
33 pub fn new() -> Self {
34 Self {
35 nvml: Nvml::init().ok(),
36 amd_fdinfo: None,
37 }
38 }
39
40 pub fn collect(&self) -> Result<Vec<GpuMetrics>> {
41 let mut metrics = Vec::new();
42 self.collect_nvidia(&mut metrics);
43 self.collect_amd(&mut metrics);
44 Ok(metrics)
45 }
46
47 pub fn process_gpu_info(
66 &mut self,
67 pids: &[u32],
68 interval: Duration,
69 ) -> (Option<f64>, Option<f64>, Option<u32>) {
70 let mut total_vram_bytes: u64 = 0;
71 let mut total_sm_util: f64 = 0.0;
72 let mut n_utilized: u32 = 0;
73 let mut any_gpu = false;
74 let mut has_nvml = false;
75 let mut has_amd_util = false;
76
77 if let Some(ref nvml) = self.nvml {
79 any_gpu = true;
80 has_nvml = true;
81 let pid_set: HashSet<u32> = pids.iter().copied().collect();
82 let count = nvml.device_count().unwrap_or(0);
83
84 (0..count).for_each(|i| {
85 let Ok(device) = nvml.device_by_index(i) else {
86 return;
87 };
88 let procs: Vec<_> = device
89 .running_compute_processes()
90 .unwrap_or_default()
91 .into_iter()
92 .chain(device.running_graphics_processes().unwrap_or_default())
93 .collect();
94
95 let mut device_vram: u64 = 0;
96 let mut found = false;
97 procs
98 .iter()
99 .filter(|p| pid_set.contains(&p.pid))
100 .for_each(|p| {
101 found = true;
102 if let UsedGpuMemory::Used(bytes) = p.used_gpu_memory {
103 device_vram += bytes;
104 }
105 });
106
107 if found {
108 n_utilized += 1;
109 total_vram_bytes += device_vram;
110 }
111
112 let util_samples = device.process_utilization_stats(0u64).unwrap_or_default();
115 let mut latest_sm: HashMap<u32, (u64, u32)> = HashMap::new();
116 for s in &util_samples {
117 if pid_set.contains(&s.pid) {
118 let e = latest_sm.entry(s.pid).or_insert((0, 0));
119 if s.timestamp > e.0 {
120 *e = (s.timestamp, s.sm_util);
121 }
122 }
123 }
124 for (_, sm) in latest_sm.values() {
125 total_sm_util += f64::from(*sm);
126 }
127 });
128 }
129
130 if std::path::Path::new("/sys/module/amdgpu").exists() {
135 any_gpu = true;
136 has_amd_util = true;
137
138 let device_paths = DevicePath::get_device_path_list();
139
140 let amd_pci_addrs: HashSet<String> = device_paths
143 .iter()
144 .map(|dp| format!("{}", dp.pci).to_lowercase())
145 .collect();
146
147 let mut utilized_pcis: HashSet<String> = HashSet::new();
149
150 pids.iter().for_each(|&pid| {
151 let fdinfo_dir = format!("/proc/{pid}/fdinfo");
152 let Ok(entries) = std::fs::read_dir(&fdinfo_dir) else {
153 return;
154 };
155 entries.filter_map(|e| e.ok()).for_each(|entry| {
156 let Ok(content) = std::fs::read_to_string(entry.path()) else {
157 return;
158 };
159
160 if !content
162 .lines()
163 .any(|l| l.starts_with("drm-driver:") && l.contains("amdgpu"))
164 {
165 return;
166 }
167
168 let pdev = content
170 .lines()
171 .find(|l| l.starts_with("drm-pdev:"))
172 .and_then(|l| l.split_whitespace().nth(1))
173 .map(|s| s.to_lowercase());
174 let Some(pdev) = pdev else { return };
175 if !amd_pci_addrs.contains(&pdev) {
176 return;
177 }
178
179 if let Some(kib) = content
181 .lines()
182 .find(|l| l.starts_with("drm-memory-vram:"))
183 .and_then(|l| l.split_whitespace().nth(1))
184 .and_then(|v| v.parse::<u64>().ok())
185 {
186 total_vram_bytes += kib * 1024;
187 utilized_pcis.insert(pdev.clone());
188 }
189 });
190 });
191
192 n_utilized += u32::try_from(utilized_pcis.len()).unwrap_or(0);
193
194 let fdinfo = self.amd_fdinfo.get_or_insert_with(FdInfoStat::default);
198 fdinfo.interval = interval;
199
200 let dev_paths: Vec<PathBuf> = device_paths
203 .iter()
204 .flat_map(|dp| [dp.render.clone(), dp.card.clone()])
205 .collect();
206
207 let pids_i32: Vec<i32> = pids.iter().filter_map(|&p| i32::try_from(p).ok()).collect();
209 let mut proc_infos: Vec<ProcInfo> = Vec::new();
210 update_index_by_all_proc(&mut proc_infos, &dev_paths, &pids_i32);
211
212 fdinfo.get_all_proc_usage(&proc_infos);
213
214 for pu in &fdinfo.proc_usage {
216 total_sm_util += f64::from(i32::try_from(pu.usage.gfx).unwrap_or(0).max(0));
217 }
218 }
219
220 if !any_gpu {
221 return (None, None, None);
222 }
223
224 let vram_mib = total_vram_bytes as f64 / 1_048_576.0;
225 let usage_pct = if has_nvml || has_amd_util {
228 Some(total_sm_util / 100.0)
229 } else {
230 None
231 };
232 (Some(vram_mib), usage_pct, Some(n_utilized))
233 }
234
235 pub fn all_gpu_process_info(
252 &mut self,
253 interval: Duration,
254 ) -> (Option<f64>, Option<f64>, Option<u32>) {
255 let mut total_vram_bytes: u64 = 0;
256 let mut total_sm_util: f64 = 0.0;
257 let mut n_utilized: u32 = 0;
258 let mut any_gpu = false;
259 let mut has_nvml = false;
260 let mut has_amd_util = false;
261
262 if let Some(ref nvml) = self.nvml {
264 any_gpu = true;
265 has_nvml = true;
266 let count = nvml.device_count().unwrap_or(0);
267
268 (0..count).for_each(|i| {
269 let Ok(device) = nvml.device_by_index(i) else {
270 return;
271 };
272 let procs: Vec<_> = device
273 .running_compute_processes()
274 .unwrap_or_default()
275 .into_iter()
276 .chain(device.running_graphics_processes().unwrap_or_default())
277 .collect();
278
279 if procs.is_empty() {
280 return;
281 }
282 n_utilized += 1;
283 procs.iter().for_each(|p| {
284 if let UsedGpuMemory::Used(bytes) = p.used_gpu_memory {
285 total_vram_bytes += bytes;
286 }
287 });
288
289 let util_samples = device.process_utilization_stats(0u64).unwrap_or_default();
291 let mut latest_sm: HashMap<u32, (u64, u32)> = HashMap::new();
292 for s in &util_samples {
293 let e = latest_sm.entry(s.pid).or_insert((0, 0));
294 if s.timestamp > e.0 {
295 *e = (s.timestamp, s.sm_util);
296 }
297 }
298 for (_, sm) in latest_sm.values() {
299 total_sm_util += f64::from(*sm);
300 }
301 });
302 }
303
304 if std::path::Path::new("/sys/module/amdgpu").exists() {
306 any_gpu = true;
307 has_amd_util = true;
308
309 let device_paths = DevicePath::get_device_path_list();
310
311 device_paths.iter().for_each(|dp| {
312 let used = read_sysfs_u64(dp.sysfs_path.join("mem_info_vram_used"));
313 if used > 0 {
314 total_vram_bytes += used;
315 n_utilized += 1;
316 }
317 });
318
319 let fdinfo = self.amd_fdinfo.get_or_insert_with(FdInfoStat::default);
322 fdinfo.interval = interval;
323
324 let dev_paths: Vec<PathBuf> = device_paths
325 .iter()
326 .flat_map(|dp| [dp.render.clone(), dp.card.clone()])
327 .collect();
328
329 let all_pids = libamdgpu_top::stat::get_process_list();
331 let mut proc_infos: Vec<ProcInfo> = Vec::new();
332 update_index_by_all_proc(&mut proc_infos, &dev_paths, &all_pids);
333
334 fdinfo.get_all_proc_usage(&proc_infos);
335
336 for pu in &fdinfo.proc_usage {
337 total_sm_util += f64::from(i32::try_from(pu.usage.gfx).unwrap_or(0).max(0));
338 }
339 }
340
341 if !any_gpu {
342 return (None, None, None);
343 }
344
345 let vram_mib = total_vram_bytes as f64 / 1_048_576.0;
346 let usage_pct = if has_nvml || has_amd_util {
349 Some(total_sm_util / 100.0)
350 } else {
351 None
352 };
353 (Some(vram_mib), usage_pct, Some(n_utilized))
354 }
355
356 fn collect_nvidia(&self, out: &mut Vec<GpuMetrics>) {
361 let Some(ref nvml) = self.nvml else { return };
362
363 let count = nvml.device_count().unwrap_or(0);
364 let driver_version = nvml.sys_driver_version().unwrap_or_default();
365
366 for i in 0..count {
367 let Ok(device) = nvml.device_by_index(i) else {
368 continue;
369 };
370
371 let name = device.name().unwrap_or_default();
372 let uuid = device.uuid().unwrap_or_else(|_| format!("nvidia-{i}"));
373
374 let utilization_pct = device
375 .utilization_rates()
376 .map(|u| u.gpu as f64)
377 .unwrap_or(0.0);
378
379 let memory = device.memory_info().ok();
380 let vram_total_bytes = memory.as_ref().map(|m| m.total).unwrap_or(0);
381 let vram_used_bytes = memory.as_ref().map(|m| m.used).unwrap_or(0);
382 let vram_used_pct = if vram_total_bytes > 0 {
383 vram_used_bytes as f64 / vram_total_bytes as f64 * 100.0
384 } else {
385 0.0
386 };
387
388 let temperature_celsius = device.temperature(TemperatureSensor::Gpu).unwrap_or(0);
389
390 let power_watts = device
392 .power_usage()
393 .map(|mw| mw as f64 / 1000.0)
394 .unwrap_or(0.0);
395
396 let frequency_mhz = device.clock_info(Clock::Graphics).unwrap_or(0);
397
398 let mut detail: HashMap<String, String> = HashMap::new();
399 if !driver_version.is_empty() {
400 detail.insert("driver_version".to_string(), driver_version.clone());
401 }
402 if let Ok(pci) = device.pci_info() {
403 detail.insert("pci_bus_id".to_string(), pci.bus_id);
404 }
405
406 out.push(GpuMetrics {
407 uuid,
408 name,
409 device_type: "GPU".to_string(),
410 host_id: i.to_string(),
411 detail,
412 utilization_pct,
413 vram_total_bytes,
414 vram_used_bytes,
415 vram_used_pct,
416 temperature_celsius,
417 power_watts,
418 frequency_mhz,
419 core_count: None,
420 });
421 }
422 }
423
424 fn collect_amd(&self, out: &mut Vec<GpuMetrics>) {
431 if !std::path::Path::new("/sys/module/amdgpu").exists() {
436 return;
437 }
438
439 DevicePath::get_device_path_list()
440 .into_iter()
441 .for_each(|dp| {
442 let vram_total_bytes = read_sysfs_u64(dp.sysfs_path.join("mem_info_vram_total"));
444 let vram_used_bytes = read_sysfs_u64(dp.sysfs_path.join("mem_info_vram_used"));
445 let vram_used_pct = if vram_total_bytes > 0 {
446 vram_used_bytes as f64 / vram_total_bytes as f64 * 100.0
447 } else {
448 0.0
449 };
450
451 let hw = AmdHwMetrics::get_from_sysfs_path(&dp.sysfs_path).ok();
453
454 let utilization_pct = hw
455 .as_ref()
456 .and_then(|m: &AmdHwMetrics| m.get_average_gfx_activity())
457 .map(|u| u as f64)
458 .unwrap_or_else(|| {
459 GpuActivity::get_from_sysfs(&dp.sysfs_path).gfx.unwrap_or(0) as f64
461 });
462
463 let frequency_mhz: u32 = hw
464 .as_ref()
465 .and_then(|m: &AmdHwMetrics| m.get_average_gfxclk_frequency())
466 .map(u32::from)
467 .unwrap_or(0);
468
469 let temperature_celsius: u32 = hw
471 .as_ref()
472 .and_then(|m: &AmdHwMetrics| m.get_temperature_edge())
473 .map(|t| u32::from(if t > 1000 { t / 1000 } else { t }))
474 .unwrap_or(0);
475
476 let power_watts = hw
478 .as_ref()
479 .and_then(|m: &AmdHwMetrics| m.get_average_socket_power())
480 .map(|w| w as f64)
481 .unwrap_or(0.0);
482
483 let host_id = format!("{}", dp.pci);
485
486 let mut detail: HashMap<String, String> = HashMap::new();
487 detail.insert("pci_bus".to_string(), host_id.clone());
488 if let Some(rocm) = libamdgpu_top::get_rocm_version() {
489 detail.insert("rocm_version".to_string(), format!("{rocm:?}"));
490 }
491
492 out.push(GpuMetrics {
493 uuid: host_id.clone(),
494 name: dp.device_name.clone(),
495 device_type: "GPU".to_string(),
496 host_id,
497 detail,
498 utilization_pct,
499 vram_total_bytes,
500 vram_used_bytes,
501 vram_used_pct,
502 temperature_celsius,
503 power_watts,
504 frequency_mhz,
505 core_count: None,
506 });
507 });
508 }
509}
510
#[cfg(test)]
mod tests {
    use super::*;

    /// True when NVML initialises or the amdgpu kernel module directory exists.
    fn host_has_gpu() -> bool {
        let nvml_available = Nvml::init().is_ok();
        let amd_present = std::path::Path::new("/sys/module/amdgpu").exists();
        nvml_available || amd_present
    }

    /// Checks that VRAM and utilised-GPU count are reported together and
    /// returns them as a pair when present.
    fn paired(vram: Option<f64>, utilized: Option<u32>) -> Option<(f64, u32)> {
        match (vram, utilized) {
            (Some(v), Some(u)) => Some((v, u)),
            (None, None) => None,
            _ => panic!("vram_mib and gpu_utilized must both be Some or both be None"),
        }
    }

    /// An empty PID list yields either no data at all or all-zero data.
    #[test]
    fn test_process_gpu_info_empty_pids_consistent() {
        let mut collector = GpuCollector::new();
        let (vram, _usage, utilized) = collector.process_gpu_info(&[], Duration::from_secs(1));
        if let Some((v, u)) = paired(vram, utilized) {
            assert_eq!(v, 0.0, "empty PID list must produce 0.0 VRAM");
            assert_eq!(u, 0, "empty PID list must produce 0 utilized GPUs");
        }
    }

    /// Querying the test process itself must succeed on any host.
    #[test]
    fn test_process_gpu_info_real_pid_does_not_panic() {
        let mut collector = GpuCollector::new();
        let own_pid = std::process::id();
        let (vram, _usage, utilized) =
            collector.process_gpu_info(&[own_pid], Duration::from_secs(1));
        if let Some((v, _u)) = paired(vram, utilized) {
            assert!(v >= 0.0, "vram_mib must be non-negative, got {v}");
        }
    }

    /// Without any GPU backend every element of the tuple is `None`.
    #[test]
    fn test_process_gpu_info_no_gpu_returns_none() {
        if host_has_gpu() {
            return;
        }
        let mut collector = GpuCollector::new();
        let (vram, usage, utilized) =
            collector.process_gpu_info(&[1, 2, 3], Duration::from_secs(1));
        assert_eq!(
            (vram, usage, utilized),
            (None, None, None),
            "CPU-only host must return (None, None, None) for any PID list"
        );
    }

    /// The host-wide query reports VRAM and utilised count together.
    #[test]
    fn test_all_gpu_process_info_consistent() {
        let mut collector = GpuCollector::new();
        let (vram, _usage, utilized) = collector.all_gpu_process_info(Duration::from_secs(1));
        if let Some((v, _u)) = paired(vram, utilized) {
            assert!(v >= 0.0, "vram_mib must be non-negative, got {v}");
        }
    }

    /// Without any GPU backend the host-wide query returns all `None`.
    #[test]
    fn test_all_gpu_process_info_no_gpu_returns_none() {
        if host_has_gpu() {
            return;
        }
        let mut collector = GpuCollector::new();
        let result = collector.all_gpu_process_info(Duration::from_secs(1));
        assert_eq!(
            result,
            (None, None, None),
            "CPU-only host must return (None, None, None)"
        );
    }

    /// With a GPU backend the host-wide query must report values.
    #[test]
    fn test_all_gpu_process_info_gpu_host_returns_some() {
        if !host_has_gpu() {
            return;
        }
        let mut collector = GpuCollector::new();
        let (vram, _usage, utilized) = collector.all_gpu_process_info(Duration::from_secs(1));
        assert!(vram.is_some(), "GPU host: vram_mib must be Some, got None");
        assert!(
            utilized.is_some(),
            "GPU host: gpu_utilized must be Some, got None"
        );
        assert!(
            vram.unwrap() >= 0.0,
            "GPU host: vram_mib must be non-negative"
        );
    }

    /// Host-wide VRAM can never be below the VRAM of an empty PID selection.
    #[test]
    fn test_all_gpu_process_info_ge_empty_pid_query() {
        if !host_has_gpu() {
            return;
        }
        let mut collector = GpuCollector::new();
        let interval = Duration::from_secs(1);
        let (all_vram, _, _) = collector.all_gpu_process_info(interval);
        let (pid_vram, _, _) = collector.process_gpu_info(&[], interval);
        if let (Some(av), Some(pv)) = (all_vram, pid_vram) {
            assert!(
                av >= pv,
                "all_gpu_process_info vram ({av}) must be >= process_gpu_info([]) vram ({pv})"
            );
        }
    }

    /// `collect()` must succeed whether or not GPUs are present.
    #[test]
    fn test_gpu_collect_does_not_panic() {
        let collector = GpuCollector::new();
        let result = collector.collect();
        assert!(
            result.is_ok(),
            "collect() must return Ok on any host, got: {:?}",
            result.err()
        );
    }

    /// Every reported device carries a UUID, a name and a device type.
    #[test]
    fn test_gpu_collect_identity_fields_nonempty() {
        let collector = GpuCollector::new();
        let gpus = collector.collect().expect("collect() failed");
        for g in &gpus {
            assert!(!g.uuid.is_empty(), "uuid must not be empty");
            assert!(
                !g.name.is_empty(),
                "name must not be empty for uuid={}",
                g.uuid
            );
            assert!(
                !g.device_type.is_empty(),
                "device_type must not be empty for uuid={}",
                g.uuid
            );
        }
    }

    /// Utilisation is a percentage between 0 and 100 inclusive.
    #[test]
    fn test_gpu_collect_utilization_in_range() {
        let collector = GpuCollector::new();
        let gpus = collector.collect().expect("collect() failed");
        for g in &gpus {
            assert!(
                g.utilization_pct >= 0.0 && g.utilization_pct <= 100.0,
                "utilization_pct out of range for {}: {}",
                g.uuid,
                g.utilization_pct
            );
        }
    }

    /// Used VRAM never exceeds total VRAM.
    #[test]
    fn test_gpu_collect_vram_used_le_total() {
        let collector = GpuCollector::new();
        let gpus = collector.collect().expect("collect() failed");
        for g in &gpus {
            assert!(
                g.vram_used_bytes <= g.vram_total_bytes,
                "vram_used_bytes {} > vram_total_bytes {} for {}",
                g.vram_used_bytes,
                g.vram_total_bytes,
                g.uuid
            );
        }
    }
}
718
/// Reads a file expected to hold a single unsigned integer (as sysfs
/// attributes do) and returns it.
///
/// Surrounding whitespace is ignored. Returns 0 when the file cannot be read
/// or its trimmed content is not a valid `u64`.
fn read_sysfs_u64(path: impl AsRef<Path>) -> u64 {
    let Ok(raw) = std::fs::read_to_string(path) else {
        return 0;
    };
    raw.trim().parse::<u64>().unwrap_or(0)
}