300 lines
6.7 KiB
Nix
300 lines
6.7 KiB
Nix
{ critical, warning, ... }:
|
|
{
|
|
# Memory
|
|
|
|
# Available memory below 10% of total memory, sustained for 3 minutes.
NodeOutOfMemory = {
  labels = critical;
  for = "3m";
  expr = ''
    (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.1
  '';
  annotations = {
    # $value is the available/total ratio computed by expr.
    Available = "{{ $value | humanizePercentage }}";
  };
};
|
|
|
|
# Per-second rate of node_vmstat_pgmajfault over a 5m window exceeds 1000.
# The hold period is "0m".
NodeUnderMemoryPressure = {
  for = "0m";
  labels = critical;
  expr = "rate(node_vmstat_pgmajfault[5m]) > 1000";
  annotations = {
    # $value is the rate computed by expr.
    Pressure = "{{ $value | humanize }}";
  };
};
|
|
|
|
# Used swap fraction (1 - free/total) above 75%, sustained for 1 minute.
NodeSwapIsFillingUp = {
  labels = critical;
  for = "1m";
  expr = ''
    (1 - (node_memory_SwapFree_bytes
    / node_memory_SwapTotal_bytes)) > 0.75
  '';
  annotations = {
    # $value is the used fraction computed by expr.
    UsedSwap = "{{ $value | humanizePercentage }}";
  };
};
|
|
|
|
# node_vmstat_oom_kill increased within the last 1m window.
# The hold period is "0m"; no annotations are attached.
NodeOomKillDetected = {
  labels = critical;
  for = "0m";
  expr = "increase(node_vmstat_oom_kill[1m]) > 0";
};
|
|
|
|
# CPU
|
|
|
|
# Average CPU utilisation per instance above 80%, sustained for 10 minutes.
#
# Utilisation is computed as 1 minus the average idle fraction across CPUs.
# Averaging the rates of all non-idle mode series directly (the previous
# form) also averages across modes, which divides the busy fraction by the
# number of non-idle modes and keeps the value far below the 0.8 threshold.
NodeCpuUsage = {
  expr = ''
    (1 - avg by (instance)
    (rate(node_cpu_seconds_total{mode="idle"}[2m]))) > 0.8
  '';
  for = "10m";
  labels = warning;
  # $value is the busy fraction (0..1) computed by expr.
  annotations.AverageUsage = "{{ $value | humanizePercentage }}";
};
|
|
|
|
# Per-instance average of the mode="steal" CPU rate above 10%,
# sustained for 10 minutes.
NodeCpuStealNoisyNeighbor = {
  labels = warning;
  for = "10m";
  expr = ''
    avg by (instance) (
    rate(node_cpu_seconds_total{mode="steal"}[2m])
    ) > 0.1
  '';
  annotations = {
    # $value is the averaged steal fraction computed by expr.
    Steal = "{{ $value | humanizePercentage }}";
  };
};
|
|
|
|
# Network
|
|
|
|
# Receive byte rate over 5m exceeds 80% of node_network_speed_bytes for the
# same (instance, device), sustained for 3 minutes.
NodeLinkHighUsageIn = {
  for = "3m";
  labels = warning;
  expr = ''
    (rate(node_network_receive_bytes_total[5m])
    / on(instance, device) node_network_speed_bytes) > .80
  '';
  # $value is the rate/speed ratio computed by expr.
  annotations.Usage = "{{ $value | humanizePercentage }}";
  annotations.Device = "{{ $labels.device }}";
};
|
|
|
|
# Transmit byte rate over 5m exceeds 80% of node_network_speed_bytes for the
# same (instance, device), sustained for 3 minutes.
NodeLinkHighUsageOut = {
  for = "3m";
  labels = warning;
  expr = ''
    (rate(node_network_transmit_bytes_total[5m])
    / on(instance, device) node_network_speed_bytes) > .80
  '';
  # $value is the rate/speed ratio computed by expr.
  annotations.Usage = "{{ $value | humanizePercentage }}";
  annotations.Device = "{{ $labels.device }}";
};
|
|
|
|
# Conntrack entries above 80% of node_nf_conntrack_entries_limit,
# sustained for 5 minutes.
NodeConntrackLimit = {
  labels = warning;
  for = "5m";
  expr = ''
    (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.8
  '';
  annotations = {
    # $value is the entries/limit ratio computed by expr.
    Filled = "{{ $value | humanizePercentage }}";
  };
};
|
|
|
|
# Ratio of receive-error rate to received-packet rate (both over 2m) above 1%,
# sustained for 2 minutes.
NodeNetworkReceiveErrors = {
  labels = warning;
  for = "2m";
  expr = ''
    rate(node_network_receive_errs_total[2m])
    / rate(node_network_receive_packets_total[2m]) > 0.01
  '';
  # $value is the error/packet ratio computed by expr.
  annotations.Errors = "{{ $value | humanizePercentage }}";
  annotations.Device = "{{ $labels.device }}";
};
|
|
|
|
# Ratio of transmit-error rate to transmitted-packet rate (both over 2m)
# above 1%, sustained for 2 minutes.
NodeNetworkTransmitErrors = {
  labels = warning;
  for = "2m";
  expr = ''
    rate(node_network_transmit_errs_total[2m])
    / rate(node_network_transmit_packets_total[2m]) > 0.01
  '';
  # $value is the error/packet ratio computed by expr.
  annotations.Errors = "{{ $value | humanizePercentage }}";
  annotations.Device = "{{ $labels.device }}";
};
|
|
|
|
# node_bonding_active differs from node_bonding_slaves for 2 minutes.
#
# The node_exporter bonding collector labels both metrics with `master`
# (the bond interface name); there is no `device` label on these series, so
# the annotation reads `master` to avoid rendering an empty value.
# The annotation key stays `Device` so consumers of the annotation keep working.
NodeNetworkBondDegraded = {
  expr = "node_bonding_active - node_bonding_slaves != 0";
  for = "2m";
  labels = warning;
  annotations.Device = "{{ $labels.master }}";
};
|
|
|
|
# Temperature
|
|
|
|
# Sensor temperature exceeds its node_hwmon_temp_max_celsius value, with that
# maximum capped at 79 via clamp_max. The hold period is "0m".
NodePhysicalComponentTooHot = {
  labels = critical;
  for = "0m";
  expr = ''
    node_hwmon_temp_celsius > clamp_max(node_hwmon_temp_max_celsius, 79)
  '';
  # $value is the node_hwmon_temp_celsius sample that passed the comparison.
  annotations.Temperature = "{{ $value | humanize }} °C";
  annotations.Chip = "{{ $labels.chip }}";
  annotations.Sensor = "{{ $labels.sensor }}";
};
|
|
|
|
# node_hwmon_temp_crit_alarm_celsius equals 1. The hold period is "0m".
NodeNodeOvertemperatureAlarm = {
  labels = critical;
  for = "0m";
  expr = "node_hwmon_temp_crit_alarm_celsius == 1";
  annotations.Chip = "{{ $labels.chip }}";
  annotations.Sensor = "{{ $labels.sensor }}";
};
|
|
|
|
# Storage and disks
|
|
|
|
# node_md_state with state="inactive" is above 0. The hold period is "0m".
NodeRaidArrayGotInactive = {
  labels = critical;
  for = "0m";
  expr = ''
    node_md_state{state="inactive"} > 0
  '';
  annotations.Device = "{{ $labels.device }}";
};
|
|
|
|
# node_md_disks with state="failed" is above 0. The hold period is "0m".
#
# node_md_disks is labelled with `device` and `state`; it has no `md_device`
# label, so the annotation reads `device` (the same label the
# NodeRaidArrayGotInactive rule uses for node_md_state) instead of rendering
# an empty value.
NodeRaidDiskFailure = {
  expr = ''
    node_md_disks{state="failed"} > 0
  '';
  for = "0m";
  labels = critical;
  annotations = {
    Device = "{{ $labels.device }}";
  };
};
|
|
|
|
# Free/size ratio of a filesystem below 10%, restricted to series whose
# (instance, device, mountpoint) has node_filesystem_readonly == 0;
# sustained for 5 minutes.
NodeOutOfDiskSpace = {
  labels = critical;
  for = "5m";
  expr = ''
    (node_filesystem_free_bytes / node_filesystem_size_bytes < 0.1)
    and on (instance, device, mountpoint) (node_filesystem_readonly) == 0
  '';
  annotations.Mountpoint = "{{ $labels.mountpoint }}";
  # $value is the free/size ratio from the left-hand side of the `and`.
  annotations.FreeSpace = "{{ $value | humanizePercentage }}";
};
|
|
|
|
# Ratio of node_filesystem_files_free to node_filesystem_files below 10%,
# sustained for 3 minutes.
NodeOutOfInodes = {
  labels = critical;
  for = "3m";
  expr = "node_filesystem_files_free / node_filesystem_files < 0.1";
  annotations.Mountpoint = "{{ $labels.mountpoint }}";
  # $value is the free/total ratio computed by expr.
  annotations.FreeInodes = "{{ $value | humanizePercentage }}";
};
|
|
|
|
# smartmon_device_smart_healthy is below 1, sustained for 10 minutes.
NodeUnhealthyDisk = {
  labels = critical;
  for = "10m";
  expr = "smartmon_device_smart_healthy < 1";
  annotations = {
    Disk = "{{ $labels.disk }}";
  };
};
|
|
|
|
# Any node_zfs_zpool_state series whose state label is not "online" is above 0,
# sustained for 5 minutes.
NodeZfsWrongState = {
  labels = critical;
  for = "5m";
  expr = ''
    node_zfs_zpool_state{state!="online"} > 0
  '';
  annotations.State = "{{ $labels.state }}";
  annotations.ZPool = "{{ $labels.zpool }}";
};
|
|
|
|
# Clock
|
|
|
|
# node_timex_offset_seconds is beyond +/-0.05 and its 5m derivative is not
# moving it back toward zero (>= 0 when positive, <= 0 when negative);
# sustained for 2 minutes. No annotations are attached.
NodeClockSkew = {
  labels = warning;
  for = "2m";
  expr = ''
    (node_timex_offset_seconds > 0.05
    and deriv(node_timex_offset_seconds[5m]) >= 0)
    or (node_timex_offset_seconds < -0.05
    and deriv(node_timex_offset_seconds[5m]) <= 0)
  '';
};
|
|
|
|
# Minimum of node_timex_sync_status over 1m is 0 while
# node_timex_maxerror_seconds is at least 16; sustained for 2 minutes.
# No annotations are attached.
NodeClockNotSynchronising = {
  labels = warning;
  for = "2m";
  expr = ''
    min_over_time(node_timex_sync_status[1m]) == 0
    and node_timex_maxerror_seconds >= 16
  '';
};
|
|
|
|
# EDAC
|
|
|
|
# node_edac_correctable_errors_total increased within the last 1m window.
# The hold period is "0m".
NodeEdacCorrectableErrorsDetected = {
  labels = warning;
  for = "0m";
  expr = ''
    increase(node_edac_correctable_errors_total[1m]) > 0
  '';
  annotations = {
    # $value is the increase computed by expr.
    CorrectedErrors = "{{ $value }}";
  };
};
|
|
|
|
# node_edac_uncorrectable_errors_total increased within the last 1m window.
# The hold period is "0m".
NodeEdacUncorrectableErrorsDetected = {
  labels = critical;
  for = "0m";
  expr = ''
    increase(node_edac_uncorrectable_errors_total[1m]) > 0
  '';
  annotations = {
    # $value is the increase computed by expr.
    DetectedErrors = "{{ $value }}";
  };
};
|
|
|
|
# Misc
|
|
|
|
# node_load5 divided by the count of mode="idle" CPU series (aggregated
# without the cpu and mode labels) above 1.25, sustained for 1 minute.
NodeLoad5Usage = {
  labels = warning;
  for = "1m";
  expr = ''
    node_load5 / (
    count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1.25
  '';
  annotations = {
    # $value is the load-per-series ratio computed by expr.
    Load5PerCore = "{{ $value | humanize }}";
  };
};
|
|
|
|
# node_systemd_unit_state with state="failed" equals 1, sustained for 5 minutes.
NodeSystemdUnitFailed = {
  labels = warning;
  for = "5m";
  expr = ''
    node_systemd_unit_state{state="failed"} == 1
  '';
  annotations = {
    Unit = "{{ $labels.name }}";
  };
};
|
|
|
|
# Time since the last trigger of borgmatic.timer exceeds 26 hours.
# When an instance has no series for that timer, the `or on(instance)` branch
# substitutes 0 (node_systemd_version * 0), so the difference becomes time()
# itself. The hold period is "0m".
NodeLastBorgmaticTooOld = {
  labels = warning;
  for = "0m";
  expr = ''
    time()
    - (node_systemd_timer_last_trigger_seconds{name="borgmatic.timer"}
    or on(instance) (node_systemd_version * 0)) > 26 * 60 * 60
  '';
  annotations = {
    # $value is the elapsed-seconds difference computed by expr.
    Last = "{{ $value | humanizeDuration }}";
  };
};
|
|
|
|
# node_reboot_required is above 0, sustained for 5 minutes.
# No annotations are attached.
NodeRequiresReboot = {
  labels = warning;
  for = "5m";
  expr = "node_reboot_required > 0";
};
|
|
}
|