// resource_tracker/config.rs

1use clap::{ArgAction, Parser, ValueEnum};
2use serde::Deserialize;
3
/// Polling interval, in seconds, used when neither `--interval` nor the
/// config file's `tracker.interval_secs` supplies a value.
const DEFAULT_INTERVAL_SECS: u64 = 1;
/// Config file path used as the default value of the `--config` flag.
const DEFAULT_CONFIG_FILE: &str = "resource-tracker.toml";
6
7// ---------------------------------------------------------------------------
8// Output format
9// ---------------------------------------------------------------------------
10
11/// Output format emitted to stdout on each polling interval.
12#[derive(Debug, Clone, Copy, PartialEq, ValueEnum)]
13pub enum OutputFormat {
14    /// JSON Lines - one JSON object per line (default).
15    Json,
16    /// CSV - header on first line, one row per interval.
17    /// Columns mirror Python resource-tracker's SystemTracker output.
18    Csv,
19}
20
21// ---------------------------------------------------------------------------
22// TOML file structure
23// ---------------------------------------------------------------------------
24
25#[derive(Debug, Default, Deserialize)]
26struct TomlConfig {
27    job: Option<TomlJob>,
28    tracker: Option<TomlTracker>,
29}
30
31#[derive(Debug, Deserialize)]
32struct TomlJob {
33    /// Human-readable label attached to every sample (e.g. "benchmark-run-42").
34    name: Option<String>,
35    /// Root PID of the process tree whose CPU usage should be attributed.
36    pid: Option<i32>,
37}
38
39#[derive(Debug, Deserialize)]
40struct TomlTracker {
41    /// How often to emit a sample, in seconds. Default: 1.
42    interval_secs: Option<u64>,
43}
44
45// ---------------------------------------------------------------------------
46// Job metadata (Section 9.3) - sent to Sentinel API at run registration
47// ---------------------------------------------------------------------------
48
/// All optional metadata fields from Section 9.3 of the spec.
/// Accepted via CLI flags and TRACKER_* environment variables.
/// Used when registering a run with the Sentinel API (Priority 4).
///
/// The `Default` value has every label unset and both lists empty. Values
/// compare field by field, so two metadata sets can be checked for equality.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct JobMetadata {
    /// Project name.
    pub project_name: Option<String>,
    /// Job name.
    pub job_name: Option<String>,
    /// Stage name (e.g. "train", "eval").
    pub stage_name: Option<String>,
    /// Task name.
    pub task_name: Option<String>,
    /// Team name.
    pub team: Option<String>,
    /// Environment label (e.g. "prod", "staging").
    pub env: Option<String>,
    /// Programming language label.
    pub language: Option<String>,
    /// Orchestrator label (e.g. "airflow", "prefect").
    pub orchestrator: Option<String>,
    /// Executor label (e.g. "kubernetes", "slurm").
    pub executor: Option<String>,
    /// External run ID from the calling system.
    pub external_run_id: Option<String>,
    /// Container image name/tag.
    pub container_image: Option<String>,
    /// Arbitrary key=value tags supplied via repeated --tag flags.
    pub tags: Vec<String>,
    /// Shell-wrapper command as a token list, e.g. ["stress", "--cpu", "4"].
    /// Empty when not running in shell-wrapper mode.
    pub command: Vec<String>,
}
71
72// ---------------------------------------------------------------------------
73// CLI arguments (clap derive)
74// ---------------------------------------------------------------------------
75
76#[derive(Debug, Parser)]
77#[command(
78    name = "resource-tracker",
79    about = "Lightweight Linux resource & GPU tracker.\n\n\
80             Shell-wrapper mode: resource-tracker [FLAGS] -- <command> [args...]\n\
81             The tracker will spawn <command>, monitor it, and exit when it exits.",
82    version
83)]
84struct Cli {
85    // -- Core flags ----------------------------------------------------------
86    /// Root PID of the process tree to track CPU usage for.
87    /// Overridden automatically in shell-wrapper mode.
88    #[arg(short = 'p', long, value_name = "PID")]
89    pid: Option<i32>,
90
91    /// Polling interval in seconds (must be >= 1).
92    #[arg(short = 'i', long, value_name = "SECS")]
93    interval: Option<u64>,
94
95    /// Path to TOML config file.
96    #[arg(short = 'c', long, value_name = "FILE", default_value = DEFAULT_CONFIG_FILE)]
97    config: String,
98
99    /// Output format: json (default) or csv.
100    #[arg(short = 'f', long, value_name = "FORMAT", default_value = "json")]
101    format: OutputFormat,
102
103    /// Write metric output to FILE instead of stdout.
104    /// Useful in shell-wrapper mode to keep the tracked app's stdout clean.
105    #[arg(short = 'o', long, value_name = "FILE", env = "TRACKER_OUTPUT")]
106    output: Option<String>,
107
108    /// Suppress metric output entirely (no stdout, no file).
109    /// Useful when streaming to Sentinel and local output is not needed.
110    #[arg(long, env = "TRACKER_QUIET")]
111    quiet: bool,
112
113    // -- Section 9.3 metadata flags ------------------------------------------
114    /// Project name for Sentinel run registration.
115    #[arg(long, value_name = "NAME", env = "TRACKER_PROJECT_NAME")]
116    project_name: Option<String>,
117
118    /// Job name attached to every sample and to the Sentinel run record.
119    #[arg(short = 'n', long, value_name = "NAME", env = "TRACKER_JOB_NAME")]
120    job_name: Option<String>,
121
122    /// Stage name (e.g. "train", "eval") for Sentinel run registration.
123    #[arg(long, value_name = "NAME", env = "TRACKER_STAGE_NAME")]
124    stage_name: Option<String>,
125
126    /// Task name for Sentinel run registration.
127    #[arg(long, value_name = "NAME", env = "TRACKER_TASK_NAME")]
128    task_name: Option<String>,
129
130    /// Team name for Sentinel run registration.
131    #[arg(long, value_name = "NAME", env = "TRACKER_TEAM")]
132    team: Option<String>,
133
134    /// Environment label (e.g. "prod", "staging") for Sentinel run registration.
135    #[arg(long, value_name = "ENV", env = "TRACKER_ENV")]
136    env: Option<String>,
137
138    /// Programming language label for Sentinel run registration.
139    #[arg(long, value_name = "LANG", env = "TRACKER_LANGUAGE")]
140    language: Option<String>,
141
142    /// Orchestrator label (e.g. "airflow", "prefect") for Sentinel run registration.
143    #[arg(long, value_name = "NAME", env = "TRACKER_ORCHESTRATOR")]
144    orchestrator: Option<String>,
145
146    /// Executor label (e.g. "kubernetes", "slurm") for Sentinel run registration.
147    #[arg(long, value_name = "NAME", env = "TRACKER_EXECUTOR")]
148    executor: Option<String>,
149
150    /// External run ID from the calling system for Sentinel run registration.
151    #[arg(long, value_name = "ID", env = "TRACKER_EXTERNAL_RUN_ID")]
152    external_run_id: Option<String>,
153
154    /// Container image name/tag for Sentinel run registration.
155    #[arg(long, value_name = "IMAGE", env = "TRACKER_CONTAINER_IMAGE")]
156    container_image: Option<String>,
157
158    /// Arbitrary key=value tag. May be repeated: --tag key1=val1 --tag key2=val2
159    #[arg(long = "tag", value_name = "KEY=VALUE", action = ArgAction::Append)]
160    tags: Vec<String>,
161
162    // -- Shell-wrapper mode --------------------------------------------------
163    /// Command to spawn and monitor. All tokens after -- are the command + args.
164    /// Example: resource-tracker -- Rscript model.R --epochs 10
165    #[arg(
166        trailing_var_arg = true,
167        allow_hyphen_values = true,
168        value_name = "COMMAND"
169    )]
170    command: Vec<String>,
171}
172
173// ---------------------------------------------------------------------------
174// Merged config
175// ---------------------------------------------------------------------------
176
177/// Resolved configuration after merging CLI args > TOML file > defaults.
178#[derive(Debug, Clone)]
179pub struct Config {
180    /// Root PID for per-process CPU attribution. None = system-wide only.
181    /// Set automatically from the spawned child PID in shell-wrapper mode.
182    pub pid: Option<i32>,
183    /// Polling interval in seconds.
184    pub interval_secs: u64,
185    /// Output format (JSON or CSV).
186    pub format: OutputFormat,
187    /// Write metric output to this file path instead of stdout.
188    /// None = write to stdout.
189    pub output_file: Option<String>,
190    /// Suppress all metric output (no stdout, no file).
191    pub quiet: bool,
192    /// Section 9.3 job metadata (used for Sentinel API registration).
193    pub metadata: JobMetadata,
194    /// Shell-wrapper command. Empty = standalone mode.
195    pub command: Vec<String>,
196}
197
198impl Config {
199    /// Parse CLI args, optionally load the TOML config file, and merge with
200    /// defaults.  CLI flags always win; config file wins over defaults.
201    pub fn load() -> Self {
202        let cli = Cli::parse();
203
204        // Silently skip missing or unparseable config files.
205        let toml: TomlConfig = std::fs::read_to_string(&cli.config)
206            .ok()
207            .and_then(|s| toml::from_str(&s).ok())
208            .unwrap_or_default();
209
210        let interval_secs = cli
211            .interval
212            .or_else(|| toml.tracker.as_ref().and_then(|t| t.interval_secs))
213            .unwrap_or(DEFAULT_INTERVAL_SECS);
214
215        if interval_secs == 0 {
216            eprintln!("error: --interval must be >= 1 (got 0)");
217            std::process::exit(1);
218        }
219
220        let pid = cli.pid.or_else(|| toml.job.as_ref().and_then(|j| j.pid));
221
222        let metadata = JobMetadata {
223            project_name: cli.project_name,
224            job_name: cli
225                .job_name
226                .or_else(|| toml.job.as_ref().and_then(|j| j.name.clone())),
227            stage_name: cli.stage_name,
228            task_name: cli.task_name,
229            team: cli.team,
230            env: cli.env,
231            language: cli.language,
232            orchestrator: cli.orchestrator,
233            executor: cli.executor,
234            external_run_id: cli.external_run_id,
235            container_image: cli.container_image,
236            tags: cli.tags,
237            command: cli.command.clone(),
238        };
239
240        Config {
241            pid,
242            interval_secs,
243            format: cli.format,
244            output_file: cli.output,
245            quiet: cli.quiet,
246            metadata,
247            command: cli.command,
248        }
249    }
250}
251
252// ---------------------------------------------------------------------------
253// Unit tests
254// ---------------------------------------------------------------------------
255
#[cfg(test)]
mod tests {
    use super::*;

    // T-CFG-01: TomlConfig deserializes from a valid TOML string.
    #[test]
    fn test_toml_config_deserializes() {
        let toml_str = r#"
[job]
name = "benchmark"
pid = 12345

[tracker]
interval_secs = 5
"#;
        let cfg: TomlConfig = toml::from_str(toml_str).expect("TOML parse failed");
        let job = cfg.job.as_ref().expect("job section missing");
        assert_eq!(job.name.as_deref(), Some("benchmark"));
        assert_eq!(job.pid, Some(12345));
        let tracker = cfg.tracker.as_ref().expect("tracker section missing");
        assert_eq!(tracker.interval_secs, Some(5));
    }

    // T-CFG-02: TomlConfig has all-None fields both by default and when the
    // file is empty.
    #[test]
    fn test_toml_config_default_is_all_none() {
        let cfg = TomlConfig::default();
        assert!(cfg.job.is_none(), "job must be None in default TomlConfig");
        assert!(
            cfg.tracker.is_none(),
            "tracker must be None in default TomlConfig"
        );

        let parsed: TomlConfig = toml::from_str("").expect("empty TOML must parse");
        assert!(parsed.job.is_none(), "job must be None for an empty file");
        assert!(
            parsed.tracker.is_none(),
            "tracker must be None for an empty file"
        );
    }

    // T-CFG-03: JobMetadata default produces all-None/empty fields.
    #[test]
    fn test_job_metadata_default_all_none() {
        let m = JobMetadata::default();
        assert!(m.project_name.is_none());
        assert!(m.job_name.is_none());
        assert!(m.stage_name.is_none());
        assert!(m.task_name.is_none());
        assert!(m.team.is_none());
        assert!(m.env.is_none());
        assert!(m.language.is_none());
        assert!(m.orchestrator.is_none());
        assert!(m.executor.is_none());
        assert!(m.external_run_id.is_none());
        assert!(m.container_image.is_none());
        assert!(
            m.tags.is_empty(),
            "tags must be empty in default JobMetadata"
        );
        assert!(
            m.command.is_empty(),
            "command must be empty in default JobMetadata"
        );
    }

    // T-CFG-04: OutputFormat variants compare correctly.
    #[test]
    fn test_output_format_equality() {
        assert_eq!(OutputFormat::Json, OutputFormat::Json);
        assert_eq!(OutputFormat::Csv, OutputFormat::Csv);
        assert_ne!(OutputFormat::Json, OutputFormat::Csv);
    }

    // T-CFG-05: TomlConfig gracefully ignores unknown keys.
    #[test]
    fn test_toml_config_ignores_unknown_keys() {
        let toml_str = r#"
[job]
name = "run1"
unknown_field = "ignored"
"#;
        // The TOML structs do not set `#[serde(deny_unknown_fields)]`, so
        // serde skips keys it does not recognise instead of failing. This
        // keeps older binaries working with config files that have gained
        // newer keys.
        let cfg: TomlConfig =
            toml::from_str(toml_str).expect("unknown keys must not be a parse error");
        let job = cfg.job.as_ref().expect("job section missing");
        assert_eq!(job.name.as_deref(), Some("run1"));
        assert_eq!(job.pid, None);
        assert!(cfg.tracker.is_none());
    }
}
334}