# Alert rules built on node_exporter-style metrics (plus `smartmon_*`).
#
# Arguments:
#   critical, warning - values assigned verbatim to each rule's `labels`
#                       attribute. NOTE(review): presumably attribute sets such
#                       as `{ severity = "critical"; }`; confirm with the caller.
#   ...               - any further arguments are accepted and ignored.
#
# Returns an attribute set keyed by alert name. Every rule carries:
#   expr        - PromQL expression; the alert is active while it returns series
#   for         - how long `expr` must keep matching before the alert fires
#   labels      - one of the `critical` / `warning` arguments
#   annotations - (optional) Go-template strings rendered per firing series
{ critical, warning, ... }:
{
  # Memory

  # Less than 10% of total memory is available.
  NodeOutOfMemory = {
    expr = ''
      (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.1
    '';
    for = "3m";
    labels = critical;
    annotations.Available = "{{ $value | humanizePercentage }}";
  };

  # More than 1000 major page faults per second over the last 5 minutes.
  NodeUnderMemoryPressure = {
    expr = "rate(node_vmstat_pgmajfault[5m]) > 1000";
    labels = critical;
    for = "0m";
    annotations.Pressure = "{{ $value | humanize }}";
  };

  # More than half of swap is in use. Hosts without swap divide by zero,
  # which yields NaN and therefore never satisfies the comparison.
  NodeSwapIsFillingUp = {
    expr = ''
      (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) > 0.5
    '';
    for = "1m";
    labels = critical;
    annotations.UsedSwap = "{{ $value | humanizePercentage }}";
  };

  # The OOM-kill counter increased within the last minute.
  NodeOomKillDetected = {
    expr = "increase(node_vmstat_oom_kill[1m]) > 0";
    for = "0m";
    labels = critical;
  };

  # CPU

  # Average non-idle CPU fraction per instance above 80%.
  # Computed as 1 - idle: averaging the `mode!="idle"` series directly would
  # also average across modes and divide the real usage by the number of
  # modes, so the 0.8 threshold could never be reached.
  NodeCpuUsage = {
    expr = ''
      (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[2m]))) > 0.8
    '';
    for = "10m";
    labels = warning;
    annotations.AverageUsage = "{{ $value | humanizePercentage }}";
  };

  # Average steal time per instance above 10%.
  NodeCpuStealNoisyNeighbor = {
    expr = ''
      avg by (instance) (
        rate(node_cpu_seconds_total{mode="steal"}[2m])
      ) > 0.1
    '';
    for = "10m";
    labels = warning;
    annotations.Steal = "{{ $value | humanizePercentage }}";
  };

  # Network

  # Receive throughput above 80% of the reported link speed.
  NodeLinkHighUsageIn = {
    expr = ''
      (rate(node_network_receive_bytes_total[5m])
        / on(instance, device) node_network_speed_bytes) > .80
    '';
    labels = warning;
    for = "3m";
    annotations = {
      Usage = "{{ $value | humanizePercentage }}";
      Device = "{{ $labels.device }}";
    };
  };

  # Transmit throughput above 80% of the reported link speed.
  NodeLinkHighUsageOut = {
    expr = ''
      (rate(node_network_transmit_bytes_total[5m])
        / on(instance, device) node_network_speed_bytes) > .80
    '';
    labels = warning;
    for = "3m";
    annotations = {
      Usage = "{{ $value | humanizePercentage }}";
      Device = "{{ $labels.device }}";
    };
  };

  # Conntrack table more than 80% full.
  NodeConntrackLimit = {
    expr = ''
      (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.8
    '';
    for = "5m";
    labels = warning;
    annotations.Filled = "{{ $value | humanizePercentage }}";
  };

  # More than 1% of received packets are errors.
  NodeNetworkReceiveErrors = {
    expr = ''
      rate(node_network_receive_errs_total[2m])
        / rate(node_network_receive_packets_total[2m]) > 0.01
    '';
    for = "2m";
    labels = warning;
    annotations = {
      Errors = "{{ $value | humanizePercentage }}";
      Device = "{{ $labels.device }}";
    };
  };

  # More than 1% of transmitted packets are errors.
  NodeNetworkTransmitErrors = {
    expr = ''
      rate(node_network_transmit_errs_total[2m])
        / rate(node_network_transmit_packets_total[2m]) > 0.01
    '';
    for = "2m";
    labels = warning;
    annotations = {
      Errors = "{{ $value | humanizePercentage }}";
      Device = "{{ $labels.device }}";
    };
  };

  # Active and configured slave counts of a bond differ.
  # node_exporter's bonding metrics identify the bond through the `master`
  # label; they have no `device` label, so the annotation reads `master`.
  NodeNetworkBondDegraded = {
    expr = "node_bonding_active - node_bonding_slaves != 0";
    for = "2m";
    labels = warning;
    annotations.Device = "{{ $labels.master }}";
  };

  # Temperature

  # Sensor reading above the sensor's own maximum, capped at 79.
  # clamp_max takes (vector, scalar): the per-sensor maximum is the vector and
  # 79 is the upper bound applied to it.
  # NOTE(review): sensors that export no node_hwmon_temp_max_celsius series
  # have nothing to compare against and never match - confirm this is intended.
  NodePhysicalComponentTooHot = {
    expr = ''
      node_hwmon_temp_celsius > clamp_max(node_hwmon_temp_max_celsius, 79)
    '';
    for = "0m";
    labels = critical;
    annotations = {
      Temperature = "{{ $value | humanize }} °C";
      Chip = "{{ $labels.chip }}";
      Sensor = "{{ $labels.sensor }}";
    };
  };

  # The sensor's critical-temperature alarm series equals 1.
  NodeNodeOvertemperatureAlarm = {
    expr = "node_hwmon_temp_crit_alarm_celsius == 1";
    for = "0m";
    labels = critical;
    annotations = {
      Chip = "{{ $labels.chip }}";
      Sensor = "{{ $labels.sensor }}";
    };
  };

  # Storage and disks

  # An MD array reports the "inactive" state.
  NodeRaidArrayGotInactive = {
    expr = ''
      node_md_state{state="inactive"} > 0
    '';
    for = "0m";
    labels = critical;
    annotations = {
      Device = "{{ $labels.device }}";
    };
  };

  # An MD array reports at least one failed disk.
  # node_md_disks names the array in the `device` label, matching the rule
  # above; `md_device` is not a label on this metric.
  NodeRaidDiskFailure = {
    expr = ''
      node_md_disks{state="failed"} > 0
    '';
    for = "0m";
    labels = critical;
    annotations = {
      Device = "{{ $labels.device }}";
    };
  };

  # Less than 10% free space on a filesystem that is not read-only.
  NodeOutOfDiskSpace = {
    expr = ''
      (node_filesystem_free_bytes / node_filesystem_size_bytes < 0.1)
        and on (instance, device, mountpoint) (node_filesystem_readonly) == 0
    '';
    for = "5m";
    labels = critical;
    annotations = {
      Mountpoint = "{{ $labels.mountpoint }}";
      FreeSpace = "{{ $value | humanizePercentage }}";
    };
  };

  # Less than 10% free inodes.
  NodeOutOfInodes = {
    expr = "node_filesystem_files_free / node_filesystem_files < 0.1";
    for = "3m";
    labels = critical;
    annotations = {
      Mountpoint = "{{ $labels.mountpoint }}";
      FreeInodes = "{{ $value | humanizePercentage }}";
    };
  };

  # The SMART health metric for a disk dropped below 1.
  NodeUnhealthyDisk = {
    expr = "smartmon_device_smart_healthy < 1";
    for = "10m";
    labels = critical;
    annotations.Disk = "{{ $labels.disk }}";
  };

  # A zpool reports any state other than "online".
  NodeZfsWrongState = {
    expr = ''
      node_zfs_zpool_state{state!="online"} > 0
    '';
    for = "5m";
    labels = critical;
    annotations = {
      State = "{{ $labels.state }}";
      ZPool = "{{ $labels.zpool }}";
    };
  };

  # Clock

  # Offset beyond +/-50 ms and not moving back towards zero.
  NodeClockSkew = {
    expr = ''
      (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0)
        or
      (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
    '';
    for = "2m";
    labels = warning;
  };

  # Sync status was 0 within the last minute and the maximum error is >= 16 s.
  NodeClockNotSynchronising = {
    expr = ''
      min_over_time(node_timex_sync_status[1m]) == 0
        and node_timex_maxerror_seconds >= 16
    '';
    for = "2m";
    labels = warning;
  };

  # Misc

  # 5-minute load average exceeds the number of CPUs (counted via idle series).
  NodeLoad5Usage = {
    expr = ''
      node_load5
        / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1.0
    '';
    for = "1m";
    labels = warning;
    annotations.Load5PerCore = "{{ $value | humanize }}";
  };

  # A systemd unit is in the "failed" state.
  NodeSystemdServiceFailed = {
    expr = ''
      node_systemd_unit_state{state="failed"} == 1
    '';
    for = "5m";
    labels = warning;
    annotations.Service = "{{ $labels.name }}";
  };

  # The reboot-required metric is set.
  NodeRequiresReboot = {
    expr = "node_reboot_required > 0";
    for = "5m";
    labels = warning;
  };

  # Correctable EDAC errors were counted within the last minute.
  NodeEdacCorrectableErrorsDetected = {
    expr = ''
      increase(node_edac_correctable_errors_total[1m]) > 0
    '';
    for = "0m";
    labels = warning;
    annotations.CorrectedErrors = "{{ $value }}";
  };

  # Uncorrectable EDAC errors were counted within the last minute.
  NodeEdacUncorrectableErrorsDetected = {
    expr = ''
      increase(node_edac_uncorrectable_errors_total[1m]) > 0
    '';
    for = "0m";
    labels = critical;
    annotations.DetectedErrors = "{{ $value }}";
  };
}