wip: nixpkgs versions + infra network + monitoring
Signed-off-by: Jeltz <jeltz@federez.net>
This commit is contained in:
parent
01b5a0fe25
commit
a64b34810d
24 changed files with 1363 additions and 513 deletions
287
profiles/monitoring/rules/node.nix
Normal file
287
profiles/monitoring/rules/node.nix
Normal file
|
@ -0,0 +1,287 @@
|
|||
{ critical, warning, ... }:
|
||||
{
|
||||
# Memory
|
||||
|
||||
NodeOutOfMemory = {
  # Critical when the available/total memory ratio stays below 10% for 3 minutes.
  labels = critical;
  for = "3m";
  expr = ''
    (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.1
  '';
  annotations = {
    # $value is the available-memory ratio produced by the expression.
    Available = "{{ $value | humanizePercentage }}";
  };
};
|
||||
|
||||
NodeUnderMemoryPressure = {
  # Critical as soon as the 5-minute rate of major page faults exceeds
  # 1000 per second (no hold time).
  labels = critical;
  for = "0m";
  expr = "rate(node_vmstat_pgmajfault[5m]) > 1000";
  annotations = {
    Pressure = "{{ $value | humanize }}";
  };
};
|
||||
|
||||
NodeSwapIsFillingUp = {
  # Critical when more than half of the swap space has been in use for 1 minute.
  labels = critical;
  for = "1m";
  # Used fraction = 1 - free/total.
  expr = ''
    (1 - (node_memory_SwapFree_bytes
    / node_memory_SwapTotal_bytes)) > 0.5
  '';
  annotations = {
    UsedSwap = "{{ $value | humanizePercentage }}";
  };
};
|
||||
|
||||
NodeOomKillDetected = {
  # Critical immediately when the OOM-kill counter increased during the last minute.
  labels = critical;
  for = "0m";
  expr = "increase(node_vmstat_oom_kill[1m]) > 0";
};
|
||||
|
||||
# CPU
|
||||
|
||||
NodeCpuUsage = {
  # Warn when the average CPU usage across all cores of an instance stays
  # above 80% for 10 minutes.
  #
  # Usage is computed as 1 - idle fraction. Averaging the rates of every
  # series with mode!="idle" instead spreads the busy time over all
  # (cpu, mode) series, so the result is the real usage divided by the
  # number of non-idle modes and can never reach the 0.8 threshold.
  expr = ''
    (1 - avg by (instance)
      (rate(node_cpu_seconds_total{mode="idle"}[2m]))) > 0.8
  '';
  for = "10m";
  labels = warning;
  annotations.AverageUsage = "{{ $value | humanizePercentage }}";
};
|
||||
|
||||
NodeCpuStealNoisyNeighbor = {
  # Warn when the per-instance average steal-time rate stays above 10%
  # for 10 minutes.
  labels = warning;
  for = "10m";
  expr = ''
    avg by (instance) (
    rate(node_cpu_seconds_total{mode="steal"}[2m])
    ) > 0.1
  '';
  annotations = {
    Steal = "{{ $value | humanizePercentage }}";
  };
};
|
||||
|
||||
# Network
|
||||
|
||||
NodeLinkHighUsageIn = {
  # Warn when the inbound byte rate of an interface stays above 80% of its
  # reported link speed for 3 minutes.
  labels = warning;
  for = "3m";
  # Rate and speed series are paired on (instance, device).
  expr = ''
    (rate(node_network_receive_bytes_total[5m])
    / on(instance, device) node_network_speed_bytes) > .80
  '';
  annotations.Usage = "{{ $value | humanizePercentage }}";
  annotations.Device = "{{ $labels.device }}";
};
|
||||
|
||||
NodeLinkHighUsageOut = {
  # Warn when the outbound byte rate of an interface stays above 80% of its
  # reported link speed for 3 minutes.
  labels = warning;
  for = "3m";
  # Rate and speed series are paired on (instance, device).
  expr = ''
    (rate(node_network_transmit_bytes_total[5m])
    / on(instance, device) node_network_speed_bytes) > .80
  '';
  annotations.Usage = "{{ $value | humanizePercentage }}";
  annotations.Device = "{{ $labels.device }}";
};
|
||||
|
||||
NodeConntrackLimit = {
  # Warn when the conntrack table has been more than 80% full for 5 minutes.
  labels = warning;
  for = "5m";
  expr = ''
    (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.8
  '';
  annotations = {
    Filled = "{{ $value | humanizePercentage }}";
  };
};
|
||||
|
||||
NodeNetworkReceiveErrors = {
  # Warn when more than 1% of received packets are errors, sustained for
  # 2 minutes.
  labels = warning;
  for = "2m";
  expr = ''
    rate(node_network_receive_errs_total[2m])
    / rate(node_network_receive_packets_total[2m]) > 0.01
  '';
  annotations.Errors = "{{ $value | humanizePercentage }}";
  annotations.Device = "{{ $labels.device }}";
};
|
||||
|
||||
NodeNetworkTransmitErrors = {
  # Warn when more than 1% of transmitted packets are errors, sustained for
  # 2 minutes.
  labels = warning;
  for = "2m";
  expr = ''
    rate(node_network_transmit_errs_total[2m])
    / rate(node_network_transmit_packets_total[2m]) > 0.01
  '';
  annotations.Errors = "{{ $value | humanizePercentage }}";
  annotations.Device = "{{ $labels.device }}";
};
|
||||
|
||||
NodeNetworkBondDegraded = {
  # Warn when the number of active slaves of a bond has differed from the
  # number of configured slaves for 2 minutes.
  expr = "node_bonding_active - node_bonding_slaves != 0";
  for = "2m";
  labels = warning;
  # The node_exporter bonding metrics identify the bond through the `master`
  # label; there is no `device` label on them, so `$labels.device` rendered
  # as an empty string. The annotation key stays `Device` for consumers.
  annotations.Device = "{{ $labels.master }}";
};
|
||||
|
||||
# Temperature
|
||||
|
||||
NodePhysicalComponentTooHot = {
  # Critical as soon as a sensor reads above its own maximum temperature,
  # with that maximum capped at 79 °C.
  #
  # clamp_max takes the instant vector first and the scalar cap second;
  # with the arguments swapped, the expression is a PromQL type error.
  # Only sensors that also export node_hwmon_temp_max_celsius are matched.
  expr = ''
    node_hwmon_temp_celsius > clamp_max(node_hwmon_temp_max_celsius, 79)
  '';
  for = "0m";
  labels = critical;
  annotations = {
    Temperature = "{{ $value | humanize }} °C";
    Chip = "{{ $labels.chip }}";
    Sensor = "{{ $labels.sensor }}";
  };
};
|
||||
|
||||
NodeNodeOvertemperatureAlarm = {
  # Critical immediately when a sensor's critical-temperature alarm flag is 1.
  # NOTE(review): the doubled "Node" in the rule name looks unintentional;
  # kept as-is because the alert name may be referenced elsewhere.
  labels = critical;
  for = "0m";
  expr = "node_hwmon_temp_crit_alarm_celsius == 1";
  annotations.Chip = "{{ $labels.chip }}";
  annotations.Sensor = "{{ $labels.sensor }}";
};
|
||||
|
||||
# Storage and disks
|
||||
|
||||
NodeRaidArrayGotInactive = {
  # Critical immediately when an md array reports the "inactive" state.
  labels = critical;
  for = "0m";
  expr = ''
    node_md_state{state="inactive"} > 0
  '';
  annotations.Device = "{{ $labels.device }}";
};
|
||||
|
||||
NodeRaidDiskFailure = {
  # Critical immediately when an md array reports at least one failed disk.
  expr = ''
    node_md_disks{state="failed"} > 0
  '';
  for = "0m";
  labels = critical;
  annotations = {
    # node_md_disks names the array through the `device` label (same as
    # node_md_state); `md_device` is not a label of this metric and rendered
    # as an empty string.
    Device = "{{ $labels.device }}";
  };
};
|
||||
|
||||
NodeOutOfDiskSpace = {
  # Critical when a writable filesystem has had less than 10% free space
  # for 5 minutes.
  labels = critical;
  for = "5m";
  # The `and on (...)` clause drops filesystems flagged read-only.
  expr = ''
    (node_filesystem_free_bytes / node_filesystem_size_bytes < 0.1)
    and on (instance, device, mountpoint) (node_filesystem_readonly) == 0
  '';
  annotations.Mountpoint = "{{ $labels.mountpoint }}";
  annotations.FreeSpace = "{{ $value | humanizePercentage }}";
};
|
||||
|
||||
NodeOutOfInodes = {
  # Critical when a filesystem has had less than 10% free inodes for 3 minutes.
  labels = critical;
  for = "3m";
  expr = "node_filesystem_files_free / node_filesystem_files < 0.1";
  annotations.Mountpoint = "{{ $labels.mountpoint }}";
  annotations.FreeInodes = "{{ $value | humanizePercentage }}";
};
|
||||
|
||||
NodeUnhealthyDisk = {
  # Critical when a disk's SMART health flag has been below 1 for 10 minutes.
  labels = critical;
  for = "10m";
  expr = "smartmon_device_smart_healthy < 1";
  annotations = {
    Disk = "{{ $labels.disk }}";
  };
};
|
||||
|
||||
NodeZfsWrongState = {
  # Critical when a zpool has reported any state other than "online"
  # for 5 minutes.
  labels = critical;
  for = "5m";
  expr = ''
    node_zfs_zpool_state{state!="online"} > 0
  '';
  annotations.State = "{{ $labels.state }}";
  annotations.ZPool = "{{ $labels.zpool }}";
};
|
||||
|
||||
# Clock
|
||||
|
||||
NodeClockSkew = {
  # Warn when the clock offset exceeds 50 ms in either direction and is not
  # shrinking (derivative has the same sign as the offset), for 2 minutes.
  labels = warning;
  for = "2m";
  expr = ''
    (node_timex_offset_seconds > 0.05
    and deriv(node_timex_offset_seconds[5m]) >= 0)
    or (node_timex_offset_seconds < -0.05
    and deriv(node_timex_offset_seconds[5m]) <= 0)
  '';
};
|
||||
|
||||
NodeClockNotSynchronising = {
  # Warn when the sync status was 0 at some point in the last minute and the
  # maximum clock error is at least 16 seconds, sustained for 2 minutes.
  labels = warning;
  for = "2m";
  expr = ''
    min_over_time(node_timex_sync_status[1m]) == 0
    and node_timex_maxerror_seconds >= 16
  '';
};
|
||||
|
||||
# Misc
|
||||
|
||||
NodeLoad5Usage = {
  # Warn when the 5-minute load average per CPU core stays above 1.0
  # for 1 minute.
  labels = warning;
  for = "1m";
  # The core count is the number of per-CPU "idle" series.
  expr = ''
    node_load5 / (
    count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1.0
  '';
  annotations = {
    Load5PerCore = "{{ $value | humanize }}";
  };
};
|
||||
|
||||
NodeSystemdServiceFailed = {
  # Warn when a systemd unit has been in the "failed" state for 5 minutes.
  labels = warning;
  for = "5m";
  expr = ''
    node_systemd_unit_state{state="failed"} == 1
  '';
  annotations = {
    Service = "{{ $labels.name }}";
  };
};
|
||||
|
||||
NodeRequiresReboot = {
  # Warn when node_reboot_required has been above 0 for 5 minutes.
  labels = warning;
  for = "5m";
  expr = "node_reboot_required > 0";
};
|
||||
|
||||
NodeEdacCorrectableErrorsDetected = {
  # Warn immediately when the EDAC correctable-error counter increased
  # during the last minute.
  labels = warning;
  for = "0m";
  expr = ''
    increase(node_edac_correctable_errors_total[1m]) > 0
  '';
  annotations = {
    CorrectedErrors = "{{ $value }}";
  };
};
|
||||
|
||||
NodeEdacUncorrectableErrorsDetected = {
  # Critical immediately when the EDAC uncorrectable-error counter increased
  # during the last minute.
  labels = critical;
  for = "0m";
  expr = ''
    increase(node_edac_uncorrectable_errors_total[1m]) > 0
  '';
  annotations = {
    DetectedErrors = "{{ $value }}";
  };
};
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue