Skip to main content

resource_tracker/
thread_util.rs

1//! Best-effort OS thread spawning — never panics when the kernel returns EAGAIN.
2
3use std::thread::{self, JoinHandle};
4
/// Spawns `f` on a new OS thread named `name`, without ever panicking.
///
/// Returns `Some(handle)` when the thread was created. Returns `None`, after
/// printing a warning to stderr, when it could not be, so callers can fall back
/// to inline work or skip the feature. That happens when:
///
/// * the OS refuses to create the thread (e.g. a PID/thread limit was hit), or
/// * `name` contains an interior NUL byte, which [`thread::Builder::spawn`]
///   would otherwise answer with a panic.
pub fn spawn_named<T, F>(name: &str, f: F) -> Option<JoinHandle<T>>
where
    F: FnOnce() -> T + Send + 'static,
    T: Send + 'static,
{
    // `Builder::spawn` panics (rather than returning `Err`) when the thread name
    // contains a NUL byte; reject such names here to keep the no-panic contract.
    if name.contains('\0') {
        eprintln!("warn: failed to spawn thread {name:?}: name contains an interior NUL byte");
        return None;
    }

    match thread::Builder::new().name(name.to_owned()).spawn(f) {
        Ok(handle) => Some(handle),
        Err(e) => {
            eprintln!("warn: failed to spawn thread '{name}': {e}");
            None
        }
    }
}
20
// The helpers below rely on prlimit(2) and /proc/self/status, which are
// Linux-specific (e.g. macOS has no prlimit), so the module is only compiled
// for Linux test builds.
#[cfg(all(test, target_os = "linux"))]
mod tests {
    use super::*;

    /// Effective-capability bits that exempt a process from `RLIMIT_NPROC`:
    /// CAP_SYS_ADMIN (bit 21) and CAP_SYS_RESOURCE (bit 24).
    const NPROC_EXEMPT_CAPS: u64 = (1 << 21) | (1 << 24);

    /// Restores the saved `RLIMIT_NPROC` on drop, including during unwinding from
    /// a failing assertion. Without this, a panicking test leaves the limit tight
    /// and starves subsequent tests with EAGAIN.
    struct NprocGuard(libc::rlimit);

    impl Drop for NprocGuard {
        fn drop(&mut self) {
            // SAFETY: pid 0 addresses the calling process, `&self.0` is a valid
            // `rlimit` for the duration of the call, and prlimit(2) accepts a
            // null `old_limit` pointer.
            let rc = unsafe { libc::prlimit(0, libc::RLIMIT_NPROC, &self.0, std::ptr::null_mut()) };
            if rc != 0 {
                eprintln!(
                    "warn: RLIMIT_NPROC restore failed: {}",
                    std::io::Error::last_os_error()
                );
            }
        }
    }

    /// Returns the trimmed value of the first line in `status` (the text of
    /// `/proc/self/status`) that starts with `key`, e.g. `"Threads:"`.
    fn status_field<'a>(status: &'a str, key: &str) -> Option<&'a str> {
        status
            .lines()
            .find_map(|line| line.strip_prefix(key))
            .map(str::trim)
    }

    /// Reports whether the kernel will actually enforce `RLIMIT_NPROC` for this
    /// process. Per getrlimit(2) the limit is ignored for real UID 0 and for
    /// processes holding CAP_SYS_ADMIN or CAP_SYS_RESOURCE; in those cases a
    /// spawn would succeed and the test below would fail spuriously.
    fn nproc_limit_is_enforced(status: &str) -> bool {
        // SAFETY: getuid(2) takes no arguments and always succeeds.
        if unsafe { libc::getuid() } == 0 {
            return false;
        }
        match status_field(status, "CapEff:").and_then(|hex| u64::from_str_radix(hex, 16).ok()) {
            Some(caps) => (caps & NPROC_EXEMPT_CAPS) == 0,
            // Capability set unreadable: skip rather than risk a false failure.
            None => false,
        }
    }

    /// Lowers the soft `RLIMIT_NPROC` to the current thread count so the next
    /// thread creation is rejected, and returns a guard that restores the old
    /// limit. Returns `None` (meaning "skip the test") when `/proc` or `prlimit`
    /// is unavailable or when the limit would not be enforced for this process.
    fn tighten_nproc() -> Option<NprocGuard> {
        let status = std::fs::read_to_string("/proc/self/status").ok()?;
        if !nproc_limit_is_enforced(&status) {
            return None;
        }
        let threads: libc::rlim_t = status_field(&status, "Threads:")?.parse().ok()?;

        let mut old = libc::rlimit {
            rlim_cur: 0,
            rlim_max: 0,
        };
        // SAFETY: pid 0 addresses the calling process, a null `new_limit` makes
        // this a pure query, and `&mut old` is a valid, writable `rlimit`.
        if unsafe { libc::prlimit(0, libc::RLIMIT_NPROC, std::ptr::null(), &mut old) } != 0 {
            return None;
        }

        let tight = libc::rlimit {
            // Never raise a soft limit that is already tighter than the thread count.
            rlim_cur: threads.min(old.rlim_cur),
            rlim_max: old.rlim_max,
        };
        // SAFETY: pid 0 addresses the calling process, `&tight` is a valid
        // `rlimit`, and prlimit(2) accepts a null `old_limit` pointer.
        if unsafe { libc::prlimit(0, libc::RLIMIT_NPROC, &tight, std::ptr::null_mut()) } != 0 {
            return None;
        }
        Some(NprocGuard(old))
    }

    // T-NPROC-01: spawn_named must return None instead of panicking when the OS
    // rejects thread creation with EAGAIN (the root cause of the exit-139 crash
    // under tight cgroup pids.max).  RLIMIT_NPROC is set to the current thread
    // count so the very next spawn attempt is rejected by the kernel.
    // NprocGuard restores the limit via Drop so subsequent tests are not starved.
    #[test]
    fn test_spawn_named_returns_none_on_nproc_limit() {
        let Some(_guard) = tighten_nproc() else {
            return; // /proc or prlimit unavailable, or limit not enforced; skip
        };

        let result = spawn_named("test-eagain", || ());

        assert!(
            result.is_none(),
            "spawn_named should return None under tight RLIMIT_NPROC, not panic"
        );
        // _guard drops here, restoring RLIMIT_NPROC
    }
}