nix/profiles/monitoring/rules/node.nix

300 lines
6.7 KiB
Nix

{ critical, warning, ... }:
{
# Memory
NodeOutOfMemory = {
  # The `critical` function argument is used verbatim as this rule's labels.
  labels = critical;
  for = "3m";
  # Matches when MemAvailable divided by MemTotal is below 0.1.
  expr = ''
    (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.1
  '';
  # The matched ratio, passed through the humanizePercentage template filter.
  annotations = {
    Available = "{{ $value | humanizePercentage }}";
  };
};
NodeUnderMemoryPressure = {
  # Matches when the per-second rate of node_vmstat_pgmajfault, taken over a
  # 5 minute window, is above 1000.
  expr = "rate(node_vmstat_pgmajfault[5m]) > 1000";
  for = "0m";
  labels = critical;
  # The matched rate, passed through the humanize template filter.
  annotations = {
    Pressure = "{{ $value | humanize }}";
  };
};
NodeSwapIsFillingUp = {
  labels = critical;
  for = "1m";
  # Used-swap fraction is computed as 1 - SwapFree / SwapTotal; the rule
  # matches when that fraction is above 0.5.
  expr = ''
    (1 - (node_memory_SwapFree_bytes
    / node_memory_SwapTotal_bytes)) > 0.5
  '';
  annotations = {
    UsedSwap = "{{ $value | humanizePercentage }}";
  };
};
NodeOomKillDetected = {
  labels = critical;
  # "0m" hold time: no waiting period is configured for this rule.
  for = "0m";
  # Matches when node_vmstat_oom_kill increased within the last minute.
  expr = "increase(node_vmstat_oom_kill[1m]) > 0";
};
# CPU
NodeCpuUsage = {
  # Per-instance CPU utilisation above 80%.
  #
  # Utilisation is derived as 1 minus the idle fraction averaged over all
  # CPUs of an instance. Averaging the `mode!="idle"` series directly (as
  # this rule previously did) averages across every (cpu, mode) pair, which
  # divides the busy time by the number of non-idle modes and keeps the
  # value far below the 0.8 threshold even on a saturated host.
  expr = ''
    (1 - avg by (instance)
    (rate(node_cpu_seconds_total{mode="idle"}[2m]))) > 0.8
  '';
  for = "10m";
  labels = warning;
  # The computed utilisation fraction, rendered as a percentage.
  annotations.AverageUsage = "{{ $value | humanizePercentage }}";
};
NodeCpuStealNoisyNeighbor = {
  labels = warning;
  for = "10m";
  # Averages the 2 minute rate of the mode="steal" CPU series per instance
  # and matches when that average is above 0.1.
  expr = ''
    avg by (instance) (
    rate(node_cpu_seconds_total{mode="steal"}[2m])
    ) > 0.1
  '';
  annotations = {
    Steal = "{{ $value | humanizePercentage }}";
  };
};
# Network
NodeLinkHighUsageIn = {
  # Receive byte rate (5 minute window) divided by node_network_speed_bytes,
  # with the two sides matched on the instance and device labels only.
  # Matches when the quotient is above .80.
  expr = ''
    (rate(node_network_receive_bytes_total[5m])
    / on(instance, device) node_network_speed_bytes) > .80
  '';
  for = "3m";
  labels = warning;
  annotations.Usage = "{{ $value | humanizePercentage }}";
  annotations.Device = "{{ $labels.device }}";
};
NodeLinkHighUsageOut = {
  # Transmit byte rate (5 minute window) divided by node_network_speed_bytes,
  # with the two sides matched on the instance and device labels only.
  # Matches when the quotient is above .80.
  expr = ''
    (rate(node_network_transmit_bytes_total[5m])
    / on(instance, device) node_network_speed_bytes) > .80
  '';
  for = "3m";
  labels = warning;
  annotations.Usage = "{{ $value | humanizePercentage }}";
  annotations.Device = "{{ $labels.device }}";
};
NodeConntrackLimit = {
  labels = warning;
  for = "5m";
  # Matches when conntrack entries divided by the conntrack entries limit
  # is above 0.8.
  expr = ''
    (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.8
  '';
  annotations = {
    Filled = "{{ $value | humanizePercentage }}";
  };
};
NodeNetworkReceiveErrors = {
  labels = warning;
  for = "2m";
  # Ratio of the receive-error rate to the receive-packet rate, both over a
  # 2 minute window; matches when the ratio is above 0.01.
  expr = ''
    rate(node_network_receive_errs_total[2m])
    / rate(node_network_receive_packets_total[2m]) > 0.01
  '';
  annotations.Errors = "{{ $value | humanizePercentage }}";
  annotations.Device = "{{ $labels.device }}";
};
NodeNetworkTransmitErrors = {
  labels = warning;
  for = "2m";
  # Ratio of the transmit-error rate to the transmit-packet rate, both over
  # a 2 minute window; matches when the ratio is above 0.01.
  expr = ''
    rate(node_network_transmit_errs_total[2m])
    / rate(node_network_transmit_packets_total[2m]) > 0.01
  '';
  annotations.Errors = "{{ $value | humanizePercentage }}";
  annotations.Device = "{{ $labels.device }}";
};
NodeNetworkBondDegraded = {
  # Matches when the number of active bond members differs from the number
  # of configured members for a bonding interface.
  expr = "node_bonding_active - node_bonding_slaves != 0";
  for = "2m";
  labels = warning;
  # node_exporter's bonding collector identifies the bond interface with the
  # `master` label; these series carry no `device` label, so referencing
  # `$labels.device` rendered an empty annotation.
  annotations.Device = "{{ $labels.master }}";
};
# Temperature
NodePhysicalComponentTooHot = {
  # Matches when a sensor's temperature exceeds its own reported maximum,
  # with that maximum capped at 79 (i.e. the threshold is
  # min(node_hwmon_temp_max_celsius, 79)).
  #
  # PromQL's signature is clamp_max(v instant-vector, max scalar): the vector
  # comes first and the scalar bound second. The previous argument order
  # (scalar first) is a type error and the rule could not be evaluated.
  expr = ''
    node_hwmon_temp_celsius > clamp_max(node_hwmon_temp_max_celsius, 79)
  '';
  for = "0m";
  labels = critical;
  annotations = {
    # $value is the left-hand side of the comparison, i.e. the temperature.
    Temperature = "{{ $value | humanize }} °C";
    Chip = "{{ $labels.chip }}";
    Sensor = "{{ $labels.sensor }}";
  };
};
NodeNodeOvertemperatureAlarm = {
  labels = critical;
  for = "0m";
  # Matches series of node_hwmon_temp_crit_alarm_celsius whose value is 1.
  expr = "node_hwmon_temp_crit_alarm_celsius == 1";
  # Both annotations are copied from the labels of the matching series.
  annotations.Chip = "{{ $labels.chip }}";
  annotations.Sensor = "{{ $labels.sensor }}";
};
# Storage and disks
NodeRaidArrayGotInactive = {
  labels = critical;
  for = "0m";
  # Matches node_md_state series labelled state="inactive" with a value
  # greater than 0.
  expr = ''
    node_md_state{state="inactive"} > 0
  '';
  annotations.Device = "{{ $labels.device }}";
};
NodeRaidDiskFailure = {
  # Matches when an md array reports more than zero disks in the "failed"
  # state.
  expr = ''
    node_md_disks{state="failed"} > 0
  '';
  for = "0m";
  labels = critical;
  annotations = {
    # node_md_disks identifies the array with the `device` label (the same
    # label NodeRaidArrayGotInactive uses for node_md_state); there is no
    # `md_device` label, so the previous template rendered empty.
    Device = "{{ $labels.device }}";
  };
};
NodeOutOfDiskSpace = {
  labels = critical;
  for = "5m";
  # Left side: free bytes divided by filesystem size is below 0.1.
  # Right side: the same instance/device/mountpoint has
  # node_filesystem_readonly equal to 0. Both must hold (`and`).
  expr = ''
    (node_filesystem_free_bytes / node_filesystem_size_bytes < 0.1)
    and on (instance, device, mountpoint) (node_filesystem_readonly) == 0
  '';
  annotations.Mountpoint = "{{ $labels.mountpoint }}";
  annotations.FreeSpace = "{{ $value | humanizePercentage }}";
};
NodeOutOfInodes = {
  labels = critical;
  for = "3m";
  # Matches when node_filesystem_files_free divided by node_filesystem_files
  # is below 0.1.
  expr = "node_filesystem_files_free / node_filesystem_files < 0.1";
  annotations.Mountpoint = "{{ $labels.mountpoint }}";
  annotations.FreeInodes = "{{ $value | humanizePercentage }}";
};
NodeUnhealthyDisk = {
  labels = critical;
  for = "10m";
  # Matches when smartmon_device_smart_healthy is below 1.
  expr = "smartmon_device_smart_healthy < 1";
  annotations = {
    Disk = "{{ $labels.disk }}";
  };
};
NodeZfsWrongState = {
  labels = critical;
  for = "5m";
  # Matches node_zfs_zpool_state series for any state other than "online"
  # whose value is greater than 0.
  expr = ''
    node_zfs_zpool_state{state!="online"} > 0
  '';
  annotations.State = "{{ $labels.state }}";
  annotations.ZPool = "{{ $labels.zpool }}";
};
# Clock
NodeClockSkew = {
  labels = warning;
  for = "2m";
  # Two symmetric cases joined with `or`:
  #   - offset above 0.05 while its 5 minute derivative is >= 0, or
  #   - offset below -0.05 while its 5 minute derivative is <= 0.
  # In both cases the offset is outside the band and not moving back
  # towards zero.
  expr = ''
    (node_timex_offset_seconds > 0.05
    and deriv(node_timex_offset_seconds[5m]) >= 0)
    or (node_timex_offset_seconds < -0.05
    and deriv(node_timex_offset_seconds[5m]) <= 0)
  '';
};
NodeClockNotSynchronising = {
  labels = warning;
  for = "2m";
  # Matches when the minimum of node_timex_sync_status over the last minute
  # is 0 and node_timex_maxerror_seconds is at least 16.
  expr = ''
    min_over_time(node_timex_sync_status[1m]) == 0
    and node_timex_maxerror_seconds >= 16
  '';
};
# EDAC
NodeEdacCorrectableErrorsDetected = {
  labels = warning;
  for = "0m";
  # Matches when the correctable-error counter increased within the last
  # minute.
  expr = ''
    increase(node_edac_correctable_errors_total[1m]) > 0
  '';
  # The matched increase, inserted without any formatting filter.
  annotations = {
    CorrectedErrors = "{{ $value }}";
  };
};
NodeEdacUncorrectableErrorsDetected = {
  labels = critical;
  for = "0m";
  # Matches when the uncorrectable-error counter increased within the last
  # minute.
  expr = ''
    increase(node_edac_uncorrectable_errors_total[1m]) > 0
  '';
  # The matched increase, inserted without any formatting filter.
  annotations = {
    DetectedErrors = "{{ $value }}";
  };
};
# Misc
NodeLoad5Usage = {
  labels = warning;
  for = "1m";
  # Divides node_load5 by the number of mode="idle" CPU series (counted
  # after dropping the cpu and mode labels) and matches when the quotient
  # is above 1.0.
  expr = ''
    node_load5 / (
    count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1.0
  '';
  annotations = {
    Load5PerCore = "{{ $value | humanize }}";
  };
};
NodeSystemdUnitFailed = {
  labels = warning;
  for = "5m";
  # Matches node_systemd_unit_state series labelled state="failed" whose
  # value equals 1.
  expr = ''
    node_systemd_unit_state{state="failed"} == 1
  '';
  # The unit is taken from the `name` label of the matching series.
  annotations = {
    Unit = "{{ $labels.name }}";
  };
};
NodeLastBorgmaticTooOld = {
  labels = warning;
  for = "0m";
  # Seconds elapsed since the last trigger of borgmatic.timer, compared
  # against 26 * 60 * 60 (26 hours).
  # The `or on(instance) (node_systemd_version * 0)` branch substitutes a
  # value of 0 for instances that expose node_systemd_version but have no
  # borgmatic.timer series, so the subtraction then yields time() itself.
  expr = ''
    time()
    - (node_systemd_timer_last_trigger_seconds{name="borgmatic.timer"}
    or on(instance) (node_systemd_version * 0)) > 26 * 60 * 60
  '';
  annotations = {
    Last = "{{ $value | humanizeDuration }}";
  };
};
NodeRequiresReboot = {
  labels = warning;
  for = "5m";
  # Matches when node_reboot_required is greater than 0.
  expr = "node_reboot_required > 0";
};
}