monitoring: cleanup rules + NodeLastBorgmaticTooOld

This commit is contained in:
jeltz 2025-04-07 20:28:12 +02:00
parent c7b9a8d839
commit 59789595d1
Signed by: jeltz
GPG key ID: 800882B66C0C3326
2 changed files with 84 additions and 27 deletions

View file

@ -240,32 +240,7 @@
labels = warning;
};
# Misc
# Load alert: node_load5 divided by the number of
# node_cpu_seconds_total{mode="idle"} series that remain after dropping the
# cpu and mode labels, compared against 1.0.
NodeLoad5Usage = {
expr = ''
node_load5 / (
count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1.0
'';
# Hold duration: the expression has to stay true for one minute.
for = "1m";
# `warning` is bound outside this hunk.
labels = warning;
# Attach the computed ratio to the alert in humanized form.
annotations.Load5PerCore = "{{ $value | humanize }}";
};
# Matches every node_systemd_unit_state series with state="failed" whose
# value equals 1.
NodeSystemdServiceFailed = {
expr = ''
node_systemd_unit_state{state="failed"} == 1
'';
# Hold duration: the expression has to stay true for five minutes.
for = "5m";
# `warning` is bound outside this hunk.
labels = warning;
# Report the series' `name` label on the alert.
annotations.Service = "{{ $labels.name }}";
};
# Matches when node_reboot_required is greater than zero.
# NOTE(review): the exporter that produces node_reboot_required is not
# visible in this view - confirm where it comes from.
NodeRequiresReboot = {
expr = "node_reboot_required > 0";
# Hold duration: the expression has to stay true for five minutes.
for = "5m";
# `warning` is bound outside this hunk.
labels = warning;
};
# EDAC
NodeEdacCorrectableErrorsDetected = {
expr = ''
@ -284,4 +259,42 @@
labels = critical;
annotations.DetectedErrors = "{{ $value }}";
};
# Misc
# Load alert: node_load5 divided by the number of
# node_cpu_seconds_total{mode="idle"} series that remain after dropping the
# cpu and mode labels, compared against 1.0.
NodeLoad5Usage = {
expr = ''
node_load5 / (
count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1.0
'';
# Hold duration: the expression has to stay true for one minute.
for = "1m";
# `warning` is bound outside this hunk.
labels = warning;
# Attach the computed ratio to the alert in humanized form.
annotations.Load5PerCore = "{{ $value | humanize }}";
};
# Matches every node_systemd_unit_state series with state="failed" whose
# value equals 1. The selector is not restricted to a particular unit name.
NodeSystemdUnitFailed = {
expr = ''
node_systemd_unit_state{state="failed"} == 1
'';
# Hold duration: the expression has to stay true for five minutes.
for = "5m";
# `warning` is bound outside this hunk.
labels = warning;
# Report the series' `name` label on the alert.
annotations.Unit = "{{ $labels.name }}";
};
# Matches when the time since the last trigger of borgmatic.timer exceeds
# 26 * 60 * 60 seconds (26 hours).
#
# The subtrahend is the timer's last-trigger timestamp. If no such series
# exists for an instance, the `or on(instance)` branch substitutes
# node_systemd_version * 0, i.e. 0, so the difference becomes time() itself
# and is above the threshold as well. Instances with no timer series
# therefore match too, as long as they expose node_systemd_version.
NodeLastBorgmaticTooOld = {
expr = ''
time()
- (node_systemd_timer_last_trigger_seconds{name="borgmatic.timer"}
or on(instance) (node_systemd_version * 0)) > 26 * 60 * 60
'';
# Zero hold duration: no waiting period beyond the expression matching.
for = "0m";
# `warning` is bound outside this hunk.
labels = warning;
# $value is the computed age in seconds, rendered as a duration. In the
# fallback case it equals the current Unix time.
annotations.Last = "{{ $value | humanizeDuration }}";
};
# Matches when node_reboot_required is greater than zero.
# NOTE(review): the exporter that produces node_reboot_required is not
# visible in this view - confirm where it comes from.
NodeRequiresReboot = {
expr = "node_reboot_required > 0";
# Hold duration: the expression has to stay true for five minutes.
for = "5m";
# `warning` is bound outside this hunk.
labels = warning;
};
}

View file

@ -1,4 +1,6 @@
{ network, config, name, ... }: let
{ network, config, name, ... }:
let
port = config.services.prometheus.exporters.node.port;
node = network.infra.nodes.${name};
in {
@ -6,5 +8,47 @@ in {
# Enable the Prometheus node exporter and list the collectors to turn on.
# The list is kept in alphabetical order.
services.prometheus.exporters.node = {
enable = true;
enabledCollectors = [
"arp"
"bonding"
"buddyinfo"
"cgroups"
"conntrack"
"cpu"
"cpu_vulnerabilities"
"cpufreq"
"diskstats"
"dmi"
"edac"
"entropy"
"filesystem"
"hwmon"
"interrupts"
"loadavg"
"meminfo"
"netclass"
"netdev"
"netstat"
"nvme"
"os"
"powersupplyclass"
"pressure"
"qdisc"
"rapl"
"schedstat"
"sockstat"
"softnet"
"stat"
# NOTE(review): presumably the source of the node_systemd_* series used by
# the systemd unit and timer alert rules - confirm.
"systemd"
"thermal_zone"
"time"
"timex"
"udp_queues"
"uname"
"vmstat"
"watchdog"
"zfs"
"zoneinfo"
];
};
}