diff --git a/profiles/monitoring/rules/node.nix b/profiles/monitoring/rules/node.nix index 968a1d6..fab9ede 100644 --- a/profiles/monitoring/rules/node.nix +++ b/profiles/monitoring/rules/node.nix @@ -240,32 +240,7 @@ labels = warning; }; - # Misc - - NodeLoad5Usage = { - expr = '' - node_load5 / ( - count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1.0 - ''; - for = "1m"; - labels = warning; - annotations.Load5PerCore = "{{ $value | humanize }}"; - }; - - NodeSystemdServiceFailed = { - expr = '' - node_systemd_unit_state{state="failed"} == 1 - ''; - for = "5m"; - labels = warning; - annotations.Service = "{{ $labels.name }}"; - }; - - NodeRequiresReboot = { - expr = "node_reboot_required > 0"; - for = "5m"; - labels = warning; - }; + # EDAC NodeEdacCorrectableErrorsDetected = { expr = '' @@ -284,4 +259,42 @@ labels = critical; annotations.DetectedErrors = "{{ $value }}"; }; + + # Misc + + NodeLoad5Usage = { + expr = '' + node_load5 / ( + count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1.0 + ''; + for = "1m"; + labels = warning; + annotations.Load5PerCore = "{{ $value | humanize }}"; + }; + + NodeSystemdUnitFailed = { + expr = '' + node_systemd_unit_state{state="failed"} == 1 + ''; + for = "5m"; + labels = warning; + annotations.Unit = "{{ $labels.name }}"; + }; + + NodeLastBorgmaticTooOld = { + expr = '' + time() + - (node_systemd_timer_last_trigger_seconds{name="borgmatic.timer"} + or on(instance) (node_systemd_version * 0)) > 26 * 60 * 60 + ''; + for = "0m"; + labels = warning; + annotations.Last = "{{ $value | humanizeDuration }}"; + }; + + NodeRequiresReboot = { + expr = "node_reboot_required > 0"; + for = "5m"; + labels = warning; + }; } diff --git a/profiles/prometheus-node-exporter.nix b/profiles/prometheus-node-exporter.nix index b309bf3..d57ccc3 100644 --- a/profiles/prometheus-node-exporter.nix +++ b/profiles/prometheus-node-exporter.nix @@ -1,4 +1,6 @@ -{ network, config, name, ... }: let +{ network, config, name, ... }: + +let port = config.services.prometheus.exporters.node.port; node = network.infra.nodes.${name}; in { @@ -6,5 +8,47 @@ in { services.prometheus.exporters.node = { enable = true; + enabledCollectors = [ + "arp" + "bonding" + "buddyinfo" + "cgroups" + "conntrack" + "cpu" + "cpu_vulnerabilities" + "cpufreq" + "diskstats" + "dmi" + "edac" + "entropy" + "filesystem" + "hwmon" + "interrupts" + "loadavg" + "meminfo" + "netclass" + "netdev" + "netstat" + "nvme" + "os" + "powersupplyclass" + "pressure" + "qdisc" + "rapl" + "schedstat" + "sockstat" + "softnet" + "stat" + "systemd" + "thermal_zone" + "time" + "timex" + "udp_queues" + "uname" + "vmstat" + "watchdog" + "zfs" + "zoneinfo" + ]; }; }