monitoring: clean up rules and add NodeLastBorgmaticTooOld alert
This commit is contained in:
parent
c7b9a8d839
commit
59789595d1
2 changed files with 84 additions and 27 deletions
|
@ -240,32 +240,7 @@
|
|||
labels = warning;
|
||||
};
|
||||
|
||||
# Misc
|
||||
|
||||
# Alert rule: 5-minute load relative to CPU count.
# The expression divides node_load5 by the number of
# node_cpu_seconds_total{mode="idle"} series left after dropping the `cpu`
# and `mode` labels (presumably one series per CPU, so the quotient is load
# per core) and matches when the result is above 1.0.
# The condition has to hold for 1 minute (`for = "1m"`).
# `warning` is a label set defined outside this excerpt.
# The Load5PerCore annotation carries the computed quotient, humanized.
NodeLoad5Usage = {
|
||||
expr = ''
|
||||
node_load5 / (
|
||||
count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1.0
|
||||
'';
|
||||
for = "1m";
|
||||
labels = warning;
|
||||
annotations.Load5PerCore = "{{ $value | humanize }}";
|
||||
};
|
||||
|
||||
# Alert rule: a systemd unit is reported in the "failed" state.
# Matches every node_systemd_unit_state series with state="failed" whose
# value is 1, once that has held for 5 minutes.
# `warning` is a label set defined outside this excerpt.
# The Service annotation is filled from the series' `name` label.
NodeSystemdServiceFailed = {
|
||||
expr = ''
|
||||
node_systemd_unit_state{state="failed"} == 1
|
||||
'';
|
||||
for = "5m";
|
||||
labels = warning;
|
||||
annotations.Service = "{{ $labels.name }}";
|
||||
};
|
||||
|
||||
# Alert rule: matches when node_reboot_required is greater than 0 for
# 5 minutes.
# NOTE(review): the source of the node_reboot_required metric is not visible
# in this excerpt — confirm which exporter/collector publishes it.
# `warning` is a label set defined outside this excerpt.
NodeRequiresReboot = {
|
||||
expr = "node_reboot_required > 0";
|
||||
for = "5m";
|
||||
labels = warning;
|
||||
};
|
||||
# EDAC
|
||||
|
||||
NodeEdacCorrectableErrorsDetected = {
|
||||
expr = ''
|
||||
|
@ -284,4 +259,42 @@
|
|||
labels = critical;
|
||||
annotations.DetectedErrors = "{{ $value }}";
|
||||
};
|
||||
|
||||
# Misc
|
||||
|
||||
# Alert rule: 5-minute load relative to CPU count.
# The expression divides node_load5 by the number of
# node_cpu_seconds_total{mode="idle"} series left after dropping the `cpu`
# and `mode` labels (presumably one series per CPU, so the quotient is load
# per core) and matches when the result is above 1.0.
# The condition has to hold for 1 minute (`for = "1m"`).
# `warning` is a label set defined outside this excerpt.
# The Load5PerCore annotation carries the computed quotient, humanized.
NodeLoad5Usage = {
|
||||
expr = ''
|
||||
node_load5 / (
|
||||
count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1.0
|
||||
'';
|
||||
for = "1m";
|
||||
labels = warning;
|
||||
annotations.Load5PerCore = "{{ $value | humanize }}";
|
||||
};
|
||||
|
||||
# Alert rule: a systemd unit is reported in the "failed" state.
# Matches every node_systemd_unit_state series with state="failed" whose
# value is 1, once that has held for 5 minutes.
# `warning` is a label set defined outside this excerpt.
# The Unit annotation is filled from the series' `name` label.
NodeSystemdUnitFailed = {
|
||||
expr = ''
|
||||
node_systemd_unit_state{state="failed"} == 1
|
||||
'';
|
||||
for = "5m";
|
||||
labels = warning;
|
||||
annotations.Unit = "{{ $labels.name }}";
|
||||
};
|
||||
|
||||
# Alert rule: the last trigger of borgmatic.timer is more than 26 hours old.
# The expression subtracts the timer's last-trigger timestamp
# (node_systemd_timer_last_trigger_seconds{name="borgmatic.timer"}) from
# time() and compares the difference with 26 * 60 * 60 seconds.
# The `or on(instance) (node_systemd_version * 0)` branch substitutes 0 for
# any instance that has a node_systemd_version series but no borgmatic.timer
# series; `time() - 0` is then far above the threshold, so the rule also
# matches when the timer metric is missing entirely.
# `for = "0m"` means there is no pending period before the alert fires.
# `warning` is a label set defined outside this excerpt.
# The Last annotation renders the computed age via humanizeDuration.
NodeLastBorgmaticTooOld = {
|
||||
expr = ''
|
||||
time()
|
||||
- (node_systemd_timer_last_trigger_seconds{name="borgmatic.timer"}
|
||||
or on(instance) (node_systemd_version * 0)) > 26 * 60 * 60
|
||||
'';
|
||||
for = "0m";
|
||||
labels = warning;
|
||||
annotations.Last = "{{ $value | humanizeDuration }}";
|
||||
};
|
||||
|
||||
# Alert rule: matches when node_reboot_required is greater than 0 for
# 5 minutes.
# NOTE(review): the source of the node_reboot_required metric is not visible
# in this excerpt — confirm which exporter/collector publishes it.
# `warning` is a label set defined outside this excerpt.
NodeRequiresReboot = {
|
||||
expr = "node_reboot_required > 0";
|
||||
for = "5m";
|
||||
labels = warning;
|
||||
};
|
||||
}
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
{ network, config, name, ... }: let
|
||||
{ network, config, name, ... }:
|
||||
|
||||
let
|
||||
port = config.services.prometheus.exporters.node.port;
|
||||
node = network.infra.nodes.${name};
|
||||
in {
|
||||
|
@ -6,5 +8,47 @@ in {
|
|||
|
||||
# Enables the Prometheus node exporter and names the collectors to turn on
# explicitly.
# NOTE(review): the list includes "systemd", which presumably provides the
# node_systemd_* series the alert rules in this commit query — confirm.
# NOTE(review): whether enabledCollectors adds to or replaces the exporter's
# default collector set is not visible here — check the module documentation.
services.prometheus.exporters.node = {
|
||||
enable = true;
|
||||
enabledCollectors = [
|
||||
"arp"
|
||||
"bonding"
|
||||
"buddyinfo"
|
||||
"cgroups"
|
||||
"conntrack"
|
||||
"cpu"
|
||||
"cpu_vulnerabilities"
|
||||
"cpufreq"
|
||||
"diskstats"
|
||||
"dmi"
|
||||
"edac"
|
||||
"entropy"
|
||||
"filesystem"
|
||||
"hwmon"
|
||||
"interrupts"
|
||||
"loadavg"
|
||||
"meminfo"
|
||||
"netclass"
|
||||
"netdev"
|
||||
"netstat"
|
||||
"nvme"
|
||||
"os"
|
||||
"powersupplyclass"
|
||||
"pressure"
|
||||
"qdisc"
|
||||
"rapl"
|
||||
"schedstat"
|
||||
"sockstat"
|
||||
"softnet"
|
||||
"stat"
|
||||
"systemd"
|
||||
"thermal_zone"
|
||||
"time"
|
||||
"timex"
|
||||
"udp_queues"
|
||||
"uname"
|
||||
"vmstat"
|
||||
"watchdog"
|
||||
"zfs"
|
||||
"zoneinfo"
|
||||
];
|
||||
};
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue