monitoring: refactoring + blackbox
This commit is contained in:
parent
59789595d1
commit
698bde5856
5 changed files with 129 additions and 51 deletions
22
profiles/monitoring/alertbot.nix
Normal file
22
profiles/monitoring/alertbot.nix
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
{ pkgs, lib, config, network, ... }:
|
||||||
|
|
||||||
|
{
|
||||||
|
imports = [
|
||||||
|
../../modules/alertbot.nix
|
||||||
|
];
|
||||||
|
|
||||||
|
age.secrets.alertbot-matrix-password = {
|
||||||
|
file = ../../secrets/alertbot-matrix-password.age;
|
||||||
|
};
|
||||||
|
|
||||||
|
services.alertbot = {
|
||||||
|
enable = true;
|
||||||
|
listenPort = 8081;
|
||||||
|
matrix = {
|
||||||
|
homeserver = "https://matrix.federez.net";
|
||||||
|
user = "@alertbot:federez.net";
|
||||||
|
passwordFile = config.age.secrets.alertbot-matrix-password.path;
|
||||||
|
roomId = "!bVyCrycmkkLXdQRquJ:federez.net";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
}
|
35
profiles/monitoring/blackbox.nix
Normal file
35
profiles/monitoring/blackbox.nix
Normal file
|
@ -0,0 +1,35 @@
|
||||||
|
{ pkgs, lib, config, network, ... }:
|
||||||
|
|
||||||
|
let
|
||||||
|
blackboxConfig = (pkgs.formats.yaml { }).generate "blackbox-config.yml" {
|
||||||
|
modules = {
|
||||||
|
https_get_200 = {
|
||||||
|
prober = "http";
|
||||||
|
http = {
|
||||||
|
valid_status_codes = [ 200 ];
|
||||||
|
method = "GET";
|
||||||
|
follow_redirects = false;
|
||||||
|
fail_if_not_ssl = true;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
dns_dodecagon_ptr = {
|
||||||
|
prober = "dns";
|
||||||
|
dns = {
|
||||||
|
query_type = "PTR";
|
||||||
|
query_name = "162.193.54.193.in-addr.arpa.";
|
||||||
|
validate_answer_rrs = {
|
||||||
|
fail_if_not_matches_regexp = [
|
||||||
|
# Nix collapses "\." to "." inside double-quoted strings, so the backslash is
# doubled to keep literal dots in the regexp; "\t" stays a real tab character,
# which matches the tab-separated resource record text.
"162\\.193\\.54\\.193\\.in-addr\\.arpa\\.\t.*\tIN\tPTR\tdodecagon\\.federez\\.net\\."
|
||||||
|
];
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
in {
|
||||||
|
services.prometheus.exporters.blackbox = {
|
||||||
|
enable = true;
|
||||||
|
listenAddress = "localhost";
|
||||||
|
configFile = blackboxConfig;
|
||||||
|
};
|
||||||
|
}
|
|
@ -1,82 +1,95 @@
|
||||||
{ lib, config, network, ... }:
|
{ pkgs, lib, config, network, ... }:
|
||||||
let
|
let
|
||||||
cfg = config.services.victoriametrics;
|
cfg = config.services.victoriametrics;
|
||||||
mkScrapeConfig = name: path: port: targets: {
|
victoriametricsPort = 8428;
|
||||||
|
alertmanagerPort = config.services.prometheus.alertmanager.port;
|
||||||
|
alertbotPort = config.services.alertbot.listenPort;
|
||||||
|
blackboxPort = config.services.prometheus.exporters.blackbox.port;
|
||||||
|
nodePort = 9100;
|
||||||
|
mkScrapeConfig = name: config: {
|
||||||
job_name = name;
|
job_name = name;
|
||||||
metrics_path = path;
|
metrics_path = config.path;
|
||||||
static_configs = [ { targets = targets; } ];
|
static_configs = [ { targets = config.targets; } ];
|
||||||
|
params = config.params or { };
|
||||||
relabel_configs = [
|
relabel_configs = [
|
||||||
{ source_labels = [ "__address__"]; target_label = "__param_target"; }
|
{ source_labels = [ "__address__"]; target_label = "__param_target"; }
|
||||||
{ source_labels = [ "__param_target"]; target_label = "instance"; }
|
{ source_labels = [ "__param_target"]; target_label = "instance"; }
|
||||||
{
|
{
|
||||||
source_labels = [ "__param_target"];
|
source_labels = [ "__param_target"];
|
||||||
target_label = "__address__";
|
target_label = "__address__";
|
||||||
replacement = "$1.infra.federez.net:${toString port}";
|
replacement = config.replacement;
|
||||||
}
|
}
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
nodePort = 9100;
|
mkScrapeConfigs = lib.attrsets.mapAttrsToList mkScrapeConfig;
|
||||||
vmPort = 8428;
|
|
||||||
nodesConfig = mkScrapeConfig "node" "/metrics" nodePort
|
|
||||||
(lib.attrsets.mapAttrsToList (n: _: n) network.infra.nodes);
|
|
||||||
critical = { severity = "critical"; };
|
critical = { severity = "critical"; };
|
||||||
warning = { severity = "warning"; };
|
warning = { severity = "warning"; };
|
||||||
importRules = path: let
|
mkRuleGroups = lib.attrsets.mapAttrsToList (name: path: {
|
||||||
attrs = import path { inherit critical warning; };
|
inherit name;
|
||||||
in lib.attrsets.mapAttrsToList (n: a: a // { alert = n; }) attrs;
|
rules = lib.attrsets.mapAttrsToList
|
||||||
|
(alert: attrs: attrs // { inherit alert; })
|
||||||
|
(import path { inherit critical warning; });
|
||||||
|
});
|
||||||
in {
|
in {
|
||||||
imports = [
|
imports = [
|
||||||
../../modules/alertbot.nix
|
../../modules/alertbot.nix
|
||||||
|
./blackbox.nix
|
||||||
|
./alertbot.nix
|
||||||
];
|
];
|
||||||
|
|
||||||
age.secrets.alertbot-matrix-password = {
|
|
||||||
file = ../../secrets/alertbot-matrix-password.age;
|
|
||||||
};
|
|
||||||
|
|
||||||
backups.directories = [ "/var/lib/${cfg.stateDir}" ];
|
backups.directories = [ "/var/lib/${cfg.stateDir}" ];
|
||||||
|
|
||||||
services.alertbot = {
|
|
||||||
enable = true;
|
|
||||||
listenPort = 8081;
|
|
||||||
matrix = {
|
|
||||||
homeserver = "https://matrix.federez.net";
|
|
||||||
user = "@alertbot:federez.net";
|
|
||||||
passwordFile = config.age.secrets.alertbot-matrix-password.path;
|
|
||||||
roomId = "!bVyCrycmkkLXdQRquJ:federez.net";
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
services.victoriametrics = {
|
services.victoriametrics = {
|
||||||
enable = true;
|
enable = true;
|
||||||
extraOptions = [ "-enableTCP6" ];
|
extraOptions = [ "-enableTCP6" ];
|
||||||
listenAddress = "localhost:${toString vmPort}";
|
listenAddress = "localhost:${toString victoriametricsPort}";
|
||||||
prometheusConfig = {
|
prometheusConfig = {
|
||||||
scrape_configs = [ nodesConfig ];
|
scrape_configs = mkScrapeConfigs {
|
||||||
|
node = {
|
||||||
|
path = "/metrics";
|
||||||
|
replacement = "$1.infra.federez.net:${toString nodePort}";
|
||||||
|
targets = lib.attrsets.mapAttrsToList (n: _: n) network.infra.nodes;
|
||||||
|
};
|
||||||
|
blackbox_https_get_200 = {
|
||||||
|
path = "/probe";
|
||||||
|
replacement = "localhost:${toString blackboxPort}";
|
||||||
|
params.module = [ "https_get_200" ];
|
||||||
|
targets = [
|
||||||
|
"https://federez.net/"
|
||||||
|
"https://re2o.federez.net/"
|
||||||
|
"https://gitlab2.federez.net/federez/nix"
|
||||||
|
"https://www.federez.net/"
|
||||||
|
"https://events.federez.net/"
|
||||||
|
"https://wiki.federez.net/"
|
||||||
|
"https://wiki-backup.federez.net/"
|
||||||
|
"https://lists.federez.net/postorius/lists/"
|
||||||
|
"https://element.federez.net/"
|
||||||
|
"https://chat.federez.net/login"
|
||||||
|
"https://nextcloud.federez.net/index.php/login"
|
||||||
|
"https://watch.federez.net/"
|
||||||
|
];
|
||||||
|
};
|
||||||
|
};
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
services.vmalert = {
|
services.vmalert = {
|
||||||
enable = true;
|
enable = true;
|
||||||
rules = {
|
rules = {
|
||||||
groups = [
|
groups = mkRuleGroups {
|
||||||
{
|
common = ./rules/common.nix;
|
||||||
name = "common";
|
node = ./rules/node.nix;
|
||||||
rules = importRules ./rules/common.nix;
|
blackbox = ./rules/blackbox.nix;
|
||||||
}
|
};
|
||||||
{
|
|
||||||
name = "node";
|
|
||||||
rules = importRules ./rules/node.nix;
|
|
||||||
}
|
|
||||||
];
|
|
||||||
};
|
};
|
||||||
settings = let
|
settings = let
|
||||||
vmUrl = "http://localhost:${toString vmPort}";
|
victoriametricsUrl = "http://localhost:${toString victoriametricsPort}";
|
||||||
amUrl = "http://localhost:${toString config.services.prometheus.alertmanager.port}";
|
alertmanagerUrl = "http://localhost:${toString alertmanagerPort}";
|
||||||
in {
|
in {
|
||||||
"datasource.url" = vmUrl;
|
"datasource.url" = victoriametricsUrl;
|
||||||
"remoteWrite.url" = vmUrl;
|
"remoteWrite.url" = victoriametricsUrl;
|
||||||
"remoteRead.url" = vmUrl;
|
"remoteRead.url" = victoriametricsUrl;
|
||||||
"notifier.url" = [ amUrl ];
|
"notifier.url" = [ alertmanagerUrl ];
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -100,11 +113,9 @@ in {
|
||||||
receivers = [
|
receivers = [
|
||||||
{
|
{
|
||||||
name = "webhook";
|
name = "webhook";
|
||||||
webhook_configs = let
|
webhook_configs = [
|
||||||
port = config.services.alertbot.listenPort;
|
|
||||||
in [
|
|
||||||
{
|
{
|
||||||
url = "http://localhost:${toString port}/webhook";
|
url = "http://localhost:${toString alertbotPort}/webhook";
|
||||||
send_resolved = true;
|
send_resolved = true;
|
||||||
}
|
}
|
||||||
];
|
];
|
||||||
|
|
10
profiles/monitoring/rules/blackbox.nix
Normal file
10
profiles/monitoring/rules/blackbox.nix
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
{ critical, ... }:
|
||||||
|
{
|
||||||
|
BlackboxHttps200Failure = {
|
||||||
|
expr = ''
|
||||||
|
probe_success{job="blackbox_https_get_200"} != 1
|
||||||
|
'';
|
||||||
|
for = "3m";
|
||||||
|
labels = critical;
|
||||||
|
};
|
||||||
|
}
|
|
@ -21,7 +21,7 @@
|
||||||
NodeSwapIsFillingUp = {
|
NodeSwapIsFillingUp = {
|
||||||
expr = ''
|
expr = ''
|
||||||
(1 - (node_memory_SwapFree_bytes
|
(1 - (node_memory_SwapFree_bytes
|
||||||
/ node_memory_SwapTotal_bytes)) > 0.5
|
/ node_memory_SwapTotal_bytes)) > 0.75
|
||||||
'';
|
'';
|
||||||
for = "1m";
|
for = "1m";
|
||||||
labels = critical;
|
labels = critical;
|
||||||
|
@ -265,7 +265,7 @@
|
||||||
NodeLoad5Usage = {
|
NodeLoad5Usage = {
|
||||||
expr = ''
|
expr = ''
|
||||||
node_load5 / (
|
node_load5 / (
|
||||||
count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1.0
|
count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1.1
|
||||||
'';
|
'';
|
||||||
for = "1m";
|
for = "1m";
|
||||||
labels = warning;
|
labels = warning;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue