cu29_runtime/
monitoring.rs

1//! Some basic internal monitoring tooling Copper uses to monitor itself and the tasks it is running.
2//!
3
4use crate::config::CuConfig;
5use crate::config::{BridgeChannelConfigRepresentation, BridgeConfig, Flavor};
6use crate::cutask::CuMsgMetadata;
7use cu29_clock::{CuDuration, RobotClock};
8#[allow(unused_imports)]
9use cu29_log::CuLogLevel;
10use cu29_traits::{CuError, CuResult};
11use petgraph::visit::IntoEdgeReferences;
12use serde_derive::{Deserialize, Serialize};
13
14#[cfg(not(feature = "std"))]
15extern crate alloc;
16
17#[cfg(feature = "std")]
18use std::{collections::HashMap as Map, string::String, string::ToString, vec::Vec};
19
20#[cfg(not(feature = "std"))]
21use alloc::{collections::BTreeMap as Map, string::String, string::ToString, vec::Vec};
22
// `no_std` flavor of the `imp` shim: re-exports the allocator traits from `alloc`,
// the atomics from `core`, and `sqrt` from `libm` so the rest of this file can
// use them through `use imp::*`.
#[cfg(not(feature = "std"))]
mod imp {
    pub use alloc::alloc::{GlobalAlloc, Layout};
    pub use core::sync::atomic::{AtomicUsize, Ordering};
    pub use libm::sqrt;
}
29
// `std` flavor of the `imp` shim: re-exports the allocator traits and atomics
// from `std`. When the `memory_monitoring` feature is enabled it additionally
// registers `GLOBAL`, a `CountingAlloc` wrapping the `System` allocator, as the
// global allocator.
#[cfg(feature = "std")]
mod imp {
    #[cfg(feature = "memory_monitoring")]
    use super::CountingAlloc;
    #[cfg(feature = "memory_monitoring")]
    pub use std::alloc::System;
    pub use std::alloc::{GlobalAlloc, Layout};
    pub use std::sync::atomic::{AtomicUsize, Ordering};
    #[cfg(feature = "memory_monitoring")]
    #[global_allocator]
    pub static GLOBAL: CountingAlloc<System> = CountingAlloc::new(System);
}
42
43use imp::*;
44
/// The state of a task.
///
/// Handed to [`CuMonitor::process_error`] as its `step` argument, so a monitor
/// can tell which step the reported error is associated with.
#[derive(Debug, Serialize, Deserialize)]
pub enum CuTaskState {
    Start,
    Preprocess,
    Process,
    Postprocess,
    Stop,
}
54
/// Monitor decision to be taken when a task errored out.
#[derive(Debug)]
pub enum Decision {
    /// For a step (stop, start) or a copperlist: just stop trying to process it.
    Abort,
    /// Ignore this error and try to continue, i.e. keep calling the other tasks'
    /// steps, set a `None` return value and continue the copperlist.
    Ignore,
    /// This is a fatal error: shut Copper down as cleanly as possible.
    Shutdown,
}
62
/// Kind of component a [`MonitorNode`] stands for.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ComponentKind {
    /// Any graph node whose flavor is not `Flavor::Bridge`.
    Task,
    /// A graph node whose flavor is `Flavor::Bridge`.
    Bridge,
}
68
/// One node of the [`MonitorTopology`], as produced by [`build_monitor_topology`].
#[derive(Debug, Clone)]
pub struct MonitorNode {
    /// Identifier of the node in the configuration graph.
    pub id: String,
    /// Type string reported by the configuration node, when available.
    pub type_name: Option<String>,
    /// Whether the node is a task or a bridge.
    pub kind: ComponentKind,
    /// Ordered list of input port identifiers.
    ///
    /// For bridges these are the ids of the `Tx` channels; for tasks it is at
    /// most the single synthetic port `"in"`.
    pub inputs: Vec<String>,
    /// Ordered list of output port identifiers.
    ///
    /// For bridges these are the ids of the `Rx` channels; for tasks it is at
    /// most the single synthetic port `"out"`.
    pub outputs: Vec<String>,
}
79
/// One edge of the [`MonitorTopology`], linking an output port to an input port.
#[derive(Debug, Clone)]
pub struct MonitorConnection {
    /// Id of the node the connection starts from.
    pub src: String,
    /// Output port on `src`: the connection's explicit source channel if it has
    /// one, otherwise the first output port of the source node (if any).
    pub src_port: Option<String>,
    /// Id of the node the connection ends at.
    pub dst: String,
    /// Input port on `dst`: the connection's explicit destination channel if it
    /// has one, otherwise the first input port of the destination node (if any).
    pub dst_port: Option<String>,
    /// Copied from the connection's `msg` field (presumably the message type
    /// name -- confirm against the configuration module).
    pub msg: String,
}
88
/// Monitor-friendly view of the configuration graph: its nodes and the
/// connections between them. The `Default` value is an empty topology.
#[derive(Debug, Clone, Default)]
pub struct MonitorTopology {
    /// All nodes of the graph, tasks and bridges alike.
    pub nodes: Vec<MonitorNode>,
    /// One entry per edge of the graph.
    pub connections: Vec<MonitorConnection>,
}
94
/// Scratch record used while building the topology: remembers whether a node
/// appears as the destination and/or the source of at least one edge.
#[derive(Default, Debug, Clone, Copy)]
struct NodeIoUsage {
    /// The node is the destination of at least one edge.
    has_incoming: bool,
    /// The node is the source of at least one edge.
    has_outgoing: bool,
}
100
101/// Derive a monitor-friendly topology from the runtime configuration.
102pub fn build_monitor_topology(
103    config: &CuConfig,
104    mission: Option<&str>,
105) -> CuResult<MonitorTopology> {
106    let graph = config.get_graph(mission)?;
107    let mut nodes: Map<String, MonitorNode> = Map::new();
108    let mut io_usage: Map<String, NodeIoUsage> = Map::new();
109
110    let mut bridge_lookup: Map<&str, &BridgeConfig> = Map::new();
111    for bridge in &config.bridges {
112        bridge_lookup.insert(bridge.id.as_str(), bridge);
113    }
114
115    for edge in graph.0.edge_references() {
116        let cnx = edge.weight();
117        io_usage.entry(cnx.src.clone()).or_default().has_outgoing = true;
118        io_usage.entry(cnx.dst.clone()).or_default().has_incoming = true;
119    }
120
121    for (_, node) in graph.get_all_nodes() {
122        let kind = match node.get_flavor() {
123            Flavor::Bridge => ComponentKind::Bridge,
124            _ => ComponentKind::Task,
125        };
126        let node_id = node.get_id();
127
128        let mut inputs = Vec::new();
129        let mut outputs = Vec::new();
130        if kind == ComponentKind::Bridge {
131            if let Some(bridge) = bridge_lookup.get(node_id.as_str()) {
132                for ch in &bridge.channels {
133                    match ch {
134                        BridgeChannelConfigRepresentation::Rx { id, .. } => {
135                            outputs.push(id.clone())
136                        }
137                        BridgeChannelConfigRepresentation::Tx { id, .. } => inputs.push(id.clone()),
138                    }
139                }
140            }
141        } else {
142            let usage = io_usage.get(node_id.as_str()).cloned().unwrap_or_default();
143            if usage.has_incoming || !usage.has_outgoing {
144                inputs.push("in".to_string());
145            }
146            if usage.has_outgoing || !usage.has_incoming {
147                outputs.push("out".to_string());
148            }
149        }
150
151        nodes.insert(
152            node_id.clone(),
153            MonitorNode {
154                id: node_id,
155                type_name: Some(node.get_type().to_string()),
156                kind,
157                inputs,
158                outputs,
159            },
160        );
161    }
162
163    let mut connections = Vec::new();
164    for edge in graph.0.edge_references() {
165        let cnx = edge.weight();
166        let src = cnx.src.clone();
167        let dst = cnx.dst.clone();
168
169        let src_port = cnx.src_channel.clone().or_else(|| {
170            nodes
171                .get(&src)
172                .and_then(|node| node.outputs.first().cloned())
173        });
174        let dst_port = cnx.dst_channel.clone().or_else(|| {
175            nodes
176                .get(&dst)
177                .and_then(|node| node.inputs.first().cloned())
178        });
179
180        connections.push(MonitorConnection {
181            src,
182            src_port,
183            dst,
184            dst_port,
185            msg: cnx.msg.clone(),
186        });
187    }
188
189    Ok(MonitorTopology {
190        nodes: nodes.into_values().collect(),
191        connections,
192    })
193}
194
/// Trait to implement a monitoring task.
///
/// `new`, `process_copperlist` and `process_error` must be provided; the other
/// methods have default implementations that do nothing.
pub trait CuMonitor: Sized {
    /// Builds the monitor from the runtime configuration and the list of task ids.
    fn new(config: &CuConfig, taskids: &'static [&'static str]) -> CuResult<Self>
    where
        Self: Sized;

    /// Hands a [`MonitorTopology`] to the monitor. The default implementation
    /// discards it.
    fn set_topology(&mut self, _topology: MonitorTopology) {}

    /// Start hook of the monitor. The default implementation does nothing and
    /// returns `Ok(())`.
    fn start(&mut self, _clock: &RobotClock) -> CuResult<()> {
        Ok(())
    }

    /// Callback that will be triggered at the end of every copperlist (before, on or after the serialization).
    fn process_copperlist(&self, msgs: &[&CuMsgMetadata]) -> CuResult<()>;

    /// Called back when a task errored out. The runtime requires an immediate decision.
    ///
    /// `taskid` is presumably an index into the `taskids` slice given to `new`
    /// -- confirm against the runtime.
    fn process_error(&self, taskid: usize, step: CuTaskState, error: &CuError) -> Decision;

    /// Called back when Copper is stopping. The default implementation does
    /// nothing and returns `Ok(())`.
    fn stop(&mut self, _clock: &RobotClock) -> CuResult<()> {
        Ok(())
    }
}
218
219/// A do nothing monitor if no monitor is provided.
220/// This is basically defining the default behavior of Copper in case of error.
221pub struct NoMonitor {}
222impl CuMonitor for NoMonitor {
223    fn new(_config: &CuConfig, _taskids: &'static [&'static str]) -> CuResult<Self> {
224        Ok(NoMonitor {})
225    }
226
227    fn process_copperlist(&self, _msgs: &[&CuMsgMetadata]) -> CuResult<()> {
228        // By default, do nothing.
229        Ok(())
230    }
231
232    fn process_error(&self, _taskid: usize, _step: CuTaskState, _error: &CuError) -> Decision {
233        // By default, just try to continue.
234        Decision::Ignore
235    }
236}
237
/// A simple allocator wrapper that counts the number of bytes allocated and deallocated.
///
/// Every request is forwarded unchanged to the wrapped allocator `A`; the two
/// counters are running totals of the requested sizes and are only ever
/// increased (or cleared by [`CountingAlloc::reset`]). The number of live bytes
/// is therefore `allocated() - deallocated()`.
pub struct CountingAlloc<A: GlobalAlloc> {
    inner: A,
    allocated: AtomicUsize,
    deallocated: AtomicUsize,
}

impl<A: GlobalAlloc> CountingAlloc<A> {
    /// Wraps `inner` with both counters at zero. `const` so the result can be
    /// stored in a `static`.
    pub const fn new(inner: A) -> Self {
        CountingAlloc {
            inner,
            allocated: AtomicUsize::new(0),
            deallocated: AtomicUsize::new(0),
        }
    }

    /// Total number of bytes successfully allocated since creation or the last reset.
    pub fn allocated(&self) -> usize {
        self.allocated.load(Ordering::SeqCst)
    }

    /// Total number of bytes released since creation or the last reset.
    pub fn deallocated(&self) -> usize {
        self.deallocated.load(Ordering::SeqCst)
    }

    /// Sets both counters back to zero.
    ///
    /// The two stores are separate operations, so a concurrent reader may see
    /// one counter cleared and the other not yet.
    pub fn reset(&self) {
        self.allocated.store(0, Ordering::SeqCst);
        self.deallocated.store(0, Ordering::SeqCst);
    }
}

// SAFETY: every method hands its arguments unchanged to `inner`, which is itself
// a `GlobalAlloc`, and returns `inner`'s result untouched. The only additional
// work is updating atomic counters, which neither allocates nor unwinds.
unsafe impl<A: GlobalAlloc> GlobalAlloc for CountingAlloc<A> {
    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
        // SAFETY: the caller upholds the `GlobalAlloc::alloc` contract for `layout`.
        let p = unsafe { self.inner.alloc(layout) };
        // A failed allocation (null) must not be counted.
        if !p.is_null() {
            self.allocated.fetch_add(layout.size(), Ordering::SeqCst);
        }
        p
    }

    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
        // SAFETY: the caller guarantees `ptr` was allocated by this allocator
        // (hence by `inner`) with this `layout`.
        unsafe { self.inner.dealloc(ptr, layout) };
        self.deallocated.fetch_add(layout.size(), Ordering::SeqCst);
    }

    // Forwarded explicitly so the wrapped allocator's own zeroed-allocation path
    // is used instead of the trait default (`alloc` followed by a memset).
    unsafe fn alloc_zeroed(&self, layout: Layout) -> *mut u8 {
        // SAFETY: the caller upholds the `GlobalAlloc::alloc_zeroed` contract.
        let p = unsafe { self.inner.alloc_zeroed(layout) };
        if !p.is_null() {
            self.allocated.fetch_add(layout.size(), Ordering::SeqCst);
        }
        p
    }

    // Forwarded explicitly so the wrapped allocator can resize in place; the
    // trait default always allocates a new block, copies and frees the old one.
    // The accounting matches that default: on success the new size is counted
    // as allocated and the old size as deallocated; on failure nothing changes.
    unsafe fn realloc(&self, ptr: *mut u8, layout: Layout, new_size: usize) -> *mut u8 {
        // SAFETY: the caller upholds the `GlobalAlloc::realloc` contract, and
        // `ptr` was obtained from `inner` through this wrapper.
        let p = unsafe { self.inner.realloc(ptr, layout, new_size) };
        if !p.is_null() {
            self.allocated.fetch_add(new_size, Ordering::SeqCst);
            self.deallocated.fetch_add(layout.size(), Ordering::SeqCst);
        }
        p
    }
}
282
/// A simple struct that counts the number of bytes allocated and deallocated in a scope.
///
/// It snapshots the counters of the global `CountingAlloc` when created and
/// reports the difference with their current values on demand.
#[cfg(feature = "memory_monitoring")]
pub struct ScopedAllocCounter {
    /// Value of `GLOBAL.allocated()` when this counter was created.
    bf_allocated: usize,
    /// Value of `GLOBAL.deallocated()` when this counter was created.
    bf_deallocated: usize,
}

#[cfg(feature = "memory_monitoring")]
impl Default for ScopedAllocCounter {
    /// Same as [`ScopedAllocCounter::new`].
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(feature = "memory_monitoring")]
impl ScopedAllocCounter {
    /// Starts a new measurement by recording the current global counters.
    pub fn new() -> Self {
        ScopedAllocCounter {
            bf_allocated: GLOBAL.allocated(),
            bf_deallocated: GLOBAL.deallocated(),
        }
    }

    /// Returns the total number of bytes allocated in the current scope
    /// since the creation of this `ScopedAllocCounter`.
    ///
    /// Returns 0 instead of underflowing if the global counters were reset
    /// after this counter was created.
    ///
    /// # Example
    /// ```
    /// use cu29_runtime::monitoring::ScopedAllocCounter;
    ///
    /// let counter = ScopedAllocCounter::new();
    /// let _vec = vec![0u8; 1024];
    /// println!("Bytes allocated: {}", counter.allocated());
    /// ```
    pub fn allocated(&self) -> usize {
        GLOBAL.allocated().saturating_sub(self.bf_allocated)
    }

    /// Returns the total number of bytes deallocated in the current scope
    /// since the creation of this `ScopedAllocCounter`.
    ///
    /// Returns 0 instead of underflowing if the global counters were reset
    /// after this counter was created.
    ///
    /// # Example
    /// ```
    /// use cu29_runtime::monitoring::ScopedAllocCounter;
    ///
    /// let counter = ScopedAllocCounter::new();
    /// let _vec = vec![0u8; 1024];
    /// drop(_vec);
    /// println!("Bytes deallocated: {}", counter.deallocated());
    /// ```
    pub fn deallocated(&self) -> usize {
        GLOBAL.deallocated().saturating_sub(self.bf_deallocated)
    }
}

/// Build a difference between the number of bytes allocated and deallocated in the scope at drop time.
#[cfg(feature = "memory_monitoring")]
impl Drop for ScopedAllocCounter {
    fn drop(&mut self) {
        // Saturating: a drop handler must never panic, even if the global
        // counters were reset while this scope was alive.
        let _allocated = GLOBAL.allocated().saturating_sub(self.bf_allocated);
        let _deallocated = GLOBAL.deallocated().saturating_sub(self.bf_deallocated);
        // TODO(gbin): Fix this when the logger is ready.
        // debug!(
        //     "Allocations: +{}B -{}B",
        //     allocated = allocated,
        //     deallocated = deallocated,
        // );
    }
}
352
353const BUCKET_COUNT: usize = 1024;
354
355/// Accumulative stat object that can give your some real time statistics.
356/// Uses a fixed-size bucketed histogram for accurate percentile calculations.
357#[derive(Debug, Clone)]
358pub struct LiveStatistics {
359    buckets: [u64; BUCKET_COUNT],
360    min_val: u64,
361    max_val: u64,
362    sum: u64,
363    sum_sq: u64,
364    count: u64,
365    max_value: u64,
366}
367
368impl LiveStatistics {
369    /// Creates a new `LiveStatistics` instance with a specified maximum value.
370    ///
371    /// This function initializes a `LiveStatistics` structure with default values
372    /// for tracking statistical data, while setting an upper limit for the data
373    /// points that the structure tracks.
374    ///
375    /// # Parameters
376    /// - `max_value` (`u64`): The maximum value that can be recorded or tracked.
377    ///
378    /// # Returns
379    /// A new instance of `LiveStatistics` with:
380    /// - `buckets`: An array pre-filled with zeros to categorize data points.
381    /// - `min_val`: Initialized to the maximum possible `u64` value to track the minimum correctly.
382    /// - `max_val`: Initialized to zero.
383    /// - `sum`: The sum of all data points, initialized to zero.
384    /// - `sum_sq`: The sum of squares of all data points, initialized to zero.
385    /// - `count`: The total number of data points, initialized to zero.
386    /// - `max_value`: The maximum allowable value for data points, set to the provided `max_value`.
387    ///
388    pub fn new_with_max(max_value: u64) -> Self {
389        LiveStatistics {
390            buckets: [0; BUCKET_COUNT],
391            min_val: u64::MAX,
392            max_val: 0,
393            sum: 0,
394            sum_sq: 0,
395            count: 0,
396            max_value,
397        }
398    }
399
400    #[inline]
401    fn value_to_bucket(&self, value: u64) -> usize {
402        if value >= self.max_value {
403            BUCKET_COUNT - 1
404        } else {
405            ((value as u128 * BUCKET_COUNT as u128) / self.max_value as u128) as usize
406        }
407    }
408
409    #[inline]
410    pub fn min(&self) -> u64 {
411        if self.count == 0 {
412            0
413        } else {
414            self.min_val
415        }
416    }
417
418    #[inline]
419    pub fn max(&self) -> u64 {
420        self.max_val
421    }
422
423    #[inline]
424    pub fn mean(&self) -> f64 {
425        if self.count == 0 {
426            0.0
427        } else {
428            self.sum as f64 / self.count as f64
429        }
430    }
431
432    #[inline]
433    pub fn stdev(&self) -> f64 {
434        if self.count == 0 {
435            return 0.0;
436        }
437        let mean = self.mean();
438        let variance = (self.sum_sq as f64 / self.count as f64) - (mean * mean);
439        if variance < 0.0 {
440            return 0.0;
441        }
442        #[cfg(feature = "std")]
443        return variance.sqrt();
444        #[cfg(not(feature = "std"))]
445        return sqrt(variance);
446    }
447
448    #[inline]
449    pub fn percentile(&self, percentile: f64) -> u64 {
450        if self.count == 0 {
451            return 0;
452        }
453
454        let target_count = (self.count as f64 * percentile) as u64;
455        let mut accumulated = 0u64;
456
457        for (bucket_idx, &bucket_count) in self.buckets.iter().enumerate() {
458            accumulated += bucket_count;
459            if accumulated >= target_count {
460                // Linear interpolation within the bucket
461                let bucket_start = (bucket_idx as u64 * self.max_value) / BUCKET_COUNT as u64;
462                let bucket_end = ((bucket_idx + 1) as u64 * self.max_value) / BUCKET_COUNT as u64;
463                let bucket_fraction = if bucket_count > 0 {
464                    (target_count - (accumulated - bucket_count)) as f64 / bucket_count as f64
465                } else {
466                    0.5
467                };
468                return bucket_start
469                    + ((bucket_end - bucket_start) as f64 * bucket_fraction) as u64;
470            }
471        }
472
473        self.max_val
474    }
475
476    /// Adds a value to the statistics.
477    #[inline]
478    pub fn record(&mut self, value: u64) {
479        if value < self.min_val {
480            self.min_val = value;
481        }
482        if value > self.max_val {
483            self.max_val = value;
484        }
485        self.sum += value;
486        self.sum_sq += value * value;
487        self.count += 1;
488
489        let bucket = self.value_to_bucket(value);
490        self.buckets[bucket] += 1;
491    }
492
493    #[inline]
494    pub fn len(&self) -> u64 {
495        self.count
496    }
497
498    #[inline]
499    pub fn is_empty(&self) -> bool {
500        self.count == 0
501    }
502
503    #[inline]
504    pub fn reset(&mut self) {
505        self.buckets.fill(0);
506        self.min_val = u64::MAX;
507        self.max_val = 0;
508        self.sum = 0;
509        self.sum_sq = 0;
510        self.count = 0;
511    }
512}
513
514/// A Specialized statistics object for CuDuration.
515/// It will also keep track of the jitter between the values.
516#[derive(Debug, Clone)]
517pub struct CuDurationStatistics {
518    bare: LiveStatistics,
519    jitter: LiveStatistics,
520    last_value: CuDuration,
521}
522
523impl CuDurationStatistics {
524    pub fn new(max: CuDuration) -> Self {
525        let CuDuration(max) = max;
526        CuDurationStatistics {
527            bare: LiveStatistics::new_with_max(max),
528            jitter: LiveStatistics::new_with_max(max),
529            last_value: CuDuration::default(),
530        }
531    }
532
533    #[inline]
534    pub fn min(&self) -> CuDuration {
535        CuDuration(self.bare.min())
536    }
537
538    #[inline]
539    pub fn max(&self) -> CuDuration {
540        CuDuration(self.bare.max())
541    }
542
543    #[inline]
544    pub fn mean(&self) -> CuDuration {
545        CuDuration(self.bare.mean() as u64) // CuDuration is in ns, it is ok.
546    }
547
548    #[inline]
549    pub fn percentile(&self, percentile: f64) -> CuDuration {
550        CuDuration(self.bare.percentile(percentile))
551    }
552
553    #[inline]
554    pub fn stddev(&self) -> CuDuration {
555        CuDuration(self.bare.stdev() as u64)
556    }
557
558    #[inline]
559    pub fn len(&self) -> u64 {
560        self.bare.len()
561    }
562
563    #[inline]
564    pub fn is_empty(&self) -> bool {
565        self.bare.len() == 0
566    }
567
568    #[inline]
569    pub fn jitter_min(&self) -> CuDuration {
570        CuDuration(self.jitter.min())
571    }
572
573    #[inline]
574    pub fn jitter_max(&self) -> CuDuration {
575        CuDuration(self.jitter.max())
576    }
577
578    #[inline]
579    pub fn jitter_mean(&self) -> CuDuration {
580        CuDuration(self.jitter.mean() as u64)
581    }
582
583    #[inline]
584    pub fn jitter_stddev(&self) -> CuDuration {
585        CuDuration(self.jitter.stdev() as u64)
586    }
587
588    #[inline]
589    pub fn jitter_percentile(&self, percentile: f64) -> CuDuration {
590        CuDuration(self.jitter.percentile(percentile))
591    }
592
593    #[inline]
594    pub fn record(&mut self, value: CuDuration) {
595        let CuDuration(nanos) = value;
596        if self.bare.is_empty() {
597            self.bare.record(nanos);
598            self.last_value = value;
599            return;
600        }
601        self.bare.record(nanos);
602        let CuDuration(last_nanos) = self.last_value;
603        self.jitter.record(nanos.abs_diff(last_nanos));
604        self.last_value = value;
605    }
606
607    #[inline]
608    pub fn reset(&mut self) {
609        self.bare.reset();
610        self.jitter.reset();
611    }
612}
613
#[cfg(test)]
mod tests {
    use super::*;

    /// Percentile estimates over a uniform ramp must land near the matching rank.
    #[test]
    fn test_live_statistics_percentiles() {
        let mut stats = LiveStatistics::new_with_max(1000);

        // Feed the 100 values 0, 1, ..., 99.
        (0..100).for_each(|sample| stats.record(sample));

        assert_eq!(stats.len(), 100);
        assert_eq!(stats.min(), 0);
        assert_eq!(stats.max(), 99);
        // The exact mean of 0..=99 is 49.5; compare its integer part.
        assert_eq!(stats.mean() as u64, 49);

        let p50 = stats.percentile(0.5);
        let p90 = stats.percentile(0.90);
        let p95 = stats.percentile(0.95);
        let p99 = stats.percentile(0.99);

        // Each percentile should sit within a few units of the sample at that rank.
        assert!((p50 as i64 - 49).abs() < 5, "p50={} expected ~49", p50);
        assert!((p90 as i64 - 89).abs() < 5, "p90={} expected ~89", p90);
        assert!((p95 as i64 - 94).abs() < 5, "p95={} expected ~94", p95);
        assert!((p99 as i64 - 98).abs() < 5, "p99={} expected ~98", p99);
    }

    /// Durations and the jitter between consecutive durations are both tracked.
    #[test]
    fn test_duration_stats() {
        let mut stats = CuDurationStatistics::new(CuDuration(1000));
        for nanos in [100, 200, 500, 400] {
            stats.record(CuDuration(nanos));
        }

        // Statistics over the durations themselves.
        assert_eq!(stats.min(), CuDuration(100));
        assert_eq!(stats.max(), CuDuration(500));
        assert_eq!(stats.mean(), CuDuration(300));
        assert_eq!(stats.len(), 4);

        // Four samples give three consecutive differences: 100, 300 and 100.
        assert_eq!(stats.jitter.len(), 3);
        assert_eq!(stats.jitter_min(), CuDuration(100));
        assert_eq!(stats.jitter_max(), CuDuration(300));
        assert_eq!(stats.jitter_mean(), CuDuration((100 + 300 + 100) / 3));

        stats.reset();
        assert_eq!(stats.len(), 0);
    }
}
663}