From 698bde585646bf3e70afe8769a38cbbac685f1f4 Mon Sep 17 00:00:00 2001 From: Jeltz Date: Sat, 21 Jun 2025 11:49:25 +0200 Subject: [PATCH] monitoring: refactoring + blackbox --- profiles/monitoring/alertbot.nix | 22 +++++ profiles/monitoring/blackbox.nix | 35 ++++++++ profiles/monitoring/default.nix | 109 ++++++++++++++----------- profiles/monitoring/rules/blackbox.nix | 10 +++ profiles/monitoring/rules/node.nix | 4 +- 5 files changed, 129 insertions(+), 51 deletions(-) create mode 100644 profiles/monitoring/alertbot.nix create mode 100644 profiles/monitoring/blackbox.nix create mode 100644 profiles/monitoring/rules/blackbox.nix diff --git a/profiles/monitoring/alertbot.nix b/profiles/monitoring/alertbot.nix new file mode 100644 index 0000000..2316bc2 --- /dev/null +++ b/profiles/monitoring/alertbot.nix @@ -0,0 +1,22 @@ +{ pkgs, lib, config, network, ... }: + +{ + imports = [ + ../../modules/alertbot.nix + ]; + + age.secrets.alertbot-matrix-password = { + file = ../../secrets/alertbot-matrix-password.age; + }; + + services.alertbot = { + enable = true; + listenPort = 8081; + matrix = { + homeserver = "https://matrix.federez.net"; + user = "@alertbot:federez.net"; + passwordFile = config.age.secrets.alertbot-matrix-password.path; + roomId = "!bVyCrycmkkLXdQRquJ:federez.net"; + }; + }; +} \ No newline at end of file diff --git a/profiles/monitoring/blackbox.nix b/profiles/monitoring/blackbox.nix new file mode 100644 index 0000000..f101a46 --- /dev/null +++ b/profiles/monitoring/blackbox.nix @@ -0,0 +1,35 @@ +{ pkgs, lib, config, network, ... 
}: + +let + blackboxConfig = (pkgs.formats.yaml { }).generate "blackbox-config.yml" { + modules = { + https_get_200 = { + prober = "http"; + http = { + valid_status_codes = [ 200 ]; + method = "GET"; + follow_redirects = false; + fail_if_not_ssl = true; + }; + }; + dns_dodecagon_ptr = { + prober = "dns"; + dns = { + query_type = "PTR"; + query_name = "162.193.54.193.in-addr.arpa."; + validate_answer_rrs = { + fail_if_not_matches_regexp = [ + "162\\.193\\.54\\.193\\.in-addr\\.arpa\\.\t.*\tIN\tPTR\tdodecagon\\.federez\\.net\\." + ]; + }; + }; + }; + }; + }; +in { + services.prometheus.exporters.blackbox = { + enable = true; + listenAddress = "localhost"; + configFile = blackboxConfig; + }; +} \ No newline at end of file diff --git a/profiles/monitoring/default.nix b/profiles/monitoring/default.nix index 4bf3941..689fe74 100644 --- a/profiles/monitoring/default.nix +++ b/profiles/monitoring/default.nix @@ -1,82 +1,95 @@ -{ lib, config, network, ... }: +{ pkgs, lib, config, network, ... }: let cfg = config.services.victoriametrics; - mkScrapeConfig = name: path: port: targets: { + victoriametricsPort = 8428; + alertmanagerPort = config.services.prometheus.alertmanager.port; + alertbotPort = config.services.alertbot.listenPort; + blackboxPort = config.services.prometheus.exporters.blackbox.port; + nodePort = 9100; + mkScrapeConfig = name: job: { job_name = name; - metrics_path = path; - static_configs = [ { targets = targets; } ]; + metrics_path = job.path; + static_configs = [ { targets = job.targets; } ]; + params = job.params or { }; relabel_configs = [ { source_labels = [ "__address__"]; target_label = "__param_target"; } { source_labels = [ "__param_target"]; target_label = "instance"; } { source_labels = [ "__param_target"]; target_label = "__address__"; - replacement = "$1.infra.federez.net:${toString port}"; + replacement = job.replacement; } ]; }; - nodePort = 9100; - vmPort = 8428; - nodesConfig = mkScrapeConfig "node" "/metrics" nodePort - 
(lib.attrsets.mapAttrsToList (n: _: n) network.infra.nodes); + mkScrapeConfigs = lib.attrsets.mapAttrsToList mkScrapeConfig; critical = { severity = "critical"; }; warning = { severity = "warning"; }; - importRules = path: let - attrs = import path { inherit critical warning; }; - in lib.attrsets.mapAttrsToList (n: a: a // { alert = n; }) attrs; + mkRuleGroups = lib.attrsets.mapAttrsToList (name: path: { + inherit name; + rules = lib.attrsets.mapAttrsToList + (alert: attrs: attrs // { inherit alert; }) + (import path { inherit critical warning; }); + }); in { imports = [ ../../modules/alertbot.nix + ./blackbox.nix + ./alertbot.nix ]; - age.secrets.alertbot-matrix-password = { - file = ../../secrets/alertbot-matrix-password.age; - }; - backups.directories = [ "/var/lib/${cfg.stateDir}" ]; - services.alertbot = { - enable = true; - listenPort = 8081; - matrix = { - homeserver = "https://matrix.federez.net"; - user = "@alertbot:federez.net"; - passwordFile = config.age.secrets.alertbot-matrix-password.path; - roomId = "!bVyCrycmkkLXdQRquJ:federez.net"; - }; - }; - services.victoriametrics = { enable = true; extraOptions = [ "-enableTCP6" ]; - listenAddress = "localhost:${toString vmPort}"; + listenAddress = "localhost:${toString victoriametricsPort}"; prometheusConfig = { - scrape_configs = [ nodesConfig ]; + scrape_configs = mkScrapeConfigs { + node = { + path = "/metrics"; + replacement = "$1.infra.federez.net:${toString nodePort}"; + targets = lib.attrsets.mapAttrsToList (n: _: n) network.infra.nodes; + }; + blackbox_https_get_200 = { + path = "/probe"; + replacement = "localhost:${toString blackboxPort}"; + params.module = [ "https_get_200" ]; + targets = [ + "https://federez.net/" + "https://re2o.federez.net/" + "https://gitlab2.federez.net/federez/nix" + "https://www.federez.net/" + "https://events.federez.net/" + "https://wiki.federez.net/" + "https://wiki-backup.federez.net/" + "https://lists.federez.net/postorius/lists/" + "https://element.federez.net/" + 
"https://chat.federez.net/login" + "https://nextcloud.federez.net/index.php/login" + "https://watch.federez.net/" + ]; + }; + }; }; }; services.vmalert = { enable = true; rules = { - groups = [ - { - name = "common"; - rules = importRules ./rules/common.nix; - } - { - name = "node"; - rules = importRules ./rules/node.nix; - } - ]; + groups = mkRuleGroups { + common = ./rules/common.nix; + node = ./rules/node.nix; + blackbox = ./rules/blackbox.nix; + }; }; settings = let - vmUrl = "http://localhost:${toString vmPort}"; - amUrl = "http://localhost:${toString config.services.prometheus.alertmanager.port}"; + victoriametricsUrl = "http://localhost:${toString victoriametricsPort}"; + alertmanagerUrl = "http://localhost:${toString alertmanagerPort}"; in { - "datasource.url" = vmUrl; - "remoteWrite.url" = vmUrl; - "remoteRead.url" = vmUrl; - "notifier.url" = [ amUrl ]; + "datasource.url" = victoriametricsUrl; + "remoteWrite.url" = victoriametricsUrl; + "remoteRead.url" = victoriametricsUrl; + "notifier.url" = [ alertmanagerUrl ]; }; }; @@ -100,11 +113,9 @@ in { receivers = [ { name = "webhook"; - webhook_configs = let - port = config.services.alertbot.listenPort; - in [ + webhook_configs = [ { - url = "http://localhost:${toString port}/webhook"; + url = "http://localhost:${toString alertbotPort}/webhook"; send_resolved = true; } ]; diff --git a/profiles/monitoring/rules/blackbox.nix b/profiles/monitoring/rules/blackbox.nix new file mode 100644 index 0000000..7ab582b --- /dev/null +++ b/profiles/monitoring/rules/blackbox.nix @@ -0,0 +1,10 @@ +{ critical, ... 
}: +{ + BlackboxHttps200Failure = { + expr = '' + probe_success{job="blackbox_https_get_200"} != 1 + ''; + for = "3m"; + labels = critical; + }; +} \ No newline at end of file diff --git a/profiles/monitoring/rules/node.nix b/profiles/monitoring/rules/node.nix index fab9ede..0b62631 100644 --- a/profiles/monitoring/rules/node.nix +++ b/profiles/monitoring/rules/node.nix @@ -21,7 +21,7 @@ NodeSwapIsFillingUp = { expr = '' (1 - (node_memory_SwapFree_bytes - / node_memory_SwapTotal_bytes)) > 0.5 + / node_memory_SwapTotal_bytes)) > 0.75 ''; for = "1m"; labels = critical; @@ -265,7 +265,7 @@ NodeLoad5Usage = { expr = '' node_load5 / ( - count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1.0 + count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1.1 ''; for = "1m"; labels = warning;