fix: idols-aquamarine - alert (#221)

* fix: idols-aquamarine - alert

* feat: add dashboards for victoriametrics

* fix: node_exporter - exclude docker/podman/kubelet mounts and /home/ryan bindmounts

* fix: alert - add coredns, comment out some useless alert rules
This commit is contained in:
Ryan Yin
2025-09-14 10:48:38 +08:00
committed by GitHub
14 changed files with 6757 additions and 107 deletions

1
.gitignore vendored
View File

@@ -8,3 +8,4 @@ logs/
core*
!core/
!core.nix
!coredns*

6
flake.lock generated
View File

@@ -526,10 +526,10 @@
"mysecrets": {
"flake": false,
"locked": {
"lastModified": 1752678564,
"narHash": "sha256-x2sbH7Umncbyc9oca5mqX8kMChHVUTytKk+QXEcB4i4=",
"lastModified": 1757651423,
"narHash": "sha256-w2hBme0vg3uDoEjP+0WuBT9hAhf1xJa4Np+GS2zQKXU=",
"ref": "refs/heads/main",
"rev": "a231913597362c15c71fd9212cef5092ae85a64c",
"rev": "44b2943b7ebed5717bb9855c1b7a95c8a89fb7f7",
"shallow": true,
"type": "git",
"url": "ssh://git@github.com/ryan4yin/nix-secrets.git"

View File

@@ -88,6 +88,11 @@ in
encode zstd gzip
reverse_proxy http://localhost:9093
'';
virtualHosts."vmalert.writefor.fun".extraConfig = ''
${hostCommonConfig}
encode zstd gzip
reverse_proxy http://localhost:8880
'';
virtualHosts."minio.writefor.fun".extraConfig = ''
${hostCommonConfig}
encode zstd gzip

View File

@@ -32,3 +32,7 @@ mixin provides a comprehensive package for monitoring Loki in production.
- Instance:
https://github.com/cloudnative-pg/grafana-dashboards/blob/main/charts/cluster/grafana-dashboard.json
- Pooler(PGBouncer): https://github.com/cloudnative-pg/grafana-dashboards/issues/7
## VictoriaMetrics
- https://grafana.com/orgs/victoriametrics/dashboards

View File

@@ -0,0 +1,134 @@
{ config, lib, ... }:
{
# vmalert: evaluates the alerting/recording rule files below against the
# datasource and forwards resulting alerts to the notifier.
services.vmalert.enable = true;
services.vmalert.settings = {
  # Listen on loopback only; the web UI is expected to be reached through a
  # reverse proxy rather than directly.
  httpListenAddr = "127.0.0.1:8880";

  # Query endpoint the rules are evaluated against.
  # NOTE(review): presumably the local VictoriaMetrics instance — confirm the port.
  "datasource.url" = "http://localhost:9090";
  # Whether to disable long-lived connections to the datasource.
  "datasource.disableKeepAlive" = true;
  # Whether to avoid stripping sensitive information such as auth headers or passwords
  # from URLs in log messages or UI and exported metrics.
  "datasource.showURL" = false;

  # Alertmanager's API; must match the alertmanager listen port.
  "notifier.url" = [ "http://localhost:9093" ];

  # Glob over every rule file shipped in ./alert_rules (the directory is
  # interpolated as a path, so it is copied into the Nix store as a whole).
  rule = [ "${./alert_rules}/*.yml" ];
};
# Alertmanager: receives alerts from vmalert (notifier.url -> :9093), groups
# them and delivers notifications to Telegram.
services.prometheus.alertmanager = {
  enable = true;
  # Loopback only; external access is expected to go through a reverse proxy.
  listenAddress = "127.0.0.1";
  port = 9093;
  webExternalUrl = "http://alertmanager.writefor.fun";
  logLevel = "info";
  # File providing the $VARIABLES referenced in `configuration` below
  # (SMTP credentials, Telegram bot token), kept out of the Nix store via agenix.
  environmentFile = config.age.secrets."alertmanager.env".path;
  configuration = {
    global = {
      # The smarthost and SMTP sender used for mail notifications.
      smtp_smarthost = "smtp.qq.com:465";
      smtp_from = "$SMTP_SENDER_EMAIL";
      smtp_auth_username = "$SMTP_AUTH_USERNAME";
      smtp_auth_password = "$SMTP_AUTH_PASSWORD";
      # smtp.qq.com:465 supports implicit SSL only, so STARTTLS must not be required here.
      # https://service.mail.qq.com/detail/0/310
      smtp_require_tls = false;
    };
    route = {
      # Fallback receiver for anything not matched by a child route.
      receiver = "telegram";
      routes = [
        {
          receiver = "telegram";
          # Group alerts by these labels. Only labels listed here are available
          # as `.GroupLabels.<name>` in the message template below.
          group_by = [
            "job"
            # --- Alert labels ---
            "alertname"
            "alertgroup"
            # --- kubernetes labels ---
            "namespace"
            "service"
            # --- custom labels ---
            "cluster"
            "env"
            "type"
            "host"
          ];
          group_wait = "5m";
          group_interval = "5m";
          repeat_interval = "4h";
        }
        # {
        #   # Route only prod env's critical alerts to email (most severe alerts)
        #   match = {
        #     severity = "critical";
        #     env = "prd";
        #   };
        #   receiver = "email";
        #   group_by = [
        #     "host"
        #     "namespace"
        #     "pod"
        #     "job"
        #   ];
        #   group_wait = "1m";
        #   group_interval = "5m";
        #   repeat_interval = "2h";
        # }
      ];
    };
    receivers = [
      # {
      #   name = "email";
      #   email_configs = [
      #     {
      #       to = "ryan4yin@linux.com";
      #       # Whether to notify about resolved alerts.
      #       send_resolved = true;
      #     }
      #   ];
      # }
      {
        name = "telegram";
        telegram_configs = [
          {
            bot_token = "$TELEGRAM_BOT_TOKEN";
            chat_id = 586169186; # My Telegram ID
            # Whether to notify about resolved alerts.
            send_resolved = true;
            # Whether to deliver messages silently (no notification sound).
            # false = normal, audible notifications.
            disable_notifications = false;
            # Parse mode for the message
            parse_mode = "Markdown";
            # Message template (Go text/template, rendered by Alertmanager).
            message = ''
              *Alert:* {{ .GroupLabels.alertname }}
              *Status:* {{ .Status }}
              *Severity:* {{ .CommonLabels.severity }}
              {{ if .GroupLabels.namespace }}*Namespace:* {{ .GroupLabels.namespace }}{{ end }}
              {{ if .CommonLabels.pod }}*Pod:* {{ .CommonLabels.pod }}{{ end }}
              {{ if .GroupLabels.job }}*Job:* {{ .GroupLabels.job }}{{ end }}
              {{ if .GroupLabels.host }}*Host:* {{ .GroupLabels.host }}{{ end }}
              {{ range .Alerts }}
              *Alert:* {{ .Annotations.summary }}
              *Description:* {{ .Annotations.description }}
              {{ if .Labels.instance }}*Instance:* {{ .Labels.instance }}{{ end }}
              {{ if .Labels.container }}*Container:* {{ .Labels.container }}{{ end }}
              *Started:* {{ .StartsAt.Format "2006-01-02 15:04:05" }}
              {{ if eq .Status "resolved" }}
              *Ended:* {{ .EndsAt.Format "2006-01-02 15:04:05" }}
              {{ end }}
              {{ end }}
            '';
            # Template notes:
            # - `pod` is not part of group_by, so it is read from .CommonLabels
            #   (.GroupLabels.pod would always be empty).
            # - .EndsAt is a time.Time struct and therefore always truthy in a
            #   template `if`; the per-alert .Status is checked instead so that
            #   "Ended" is only shown for resolved alerts.
          }
        ];
      }
    ];
  };
};
}

View File

@@ -0,0 +1,13 @@
# Alerting rules for CoreDNS metrics.
groups:
  - name: CoreDNS Exporter
    rules:
      # Fires as soon as the panic counter has increased within the last
      # minute (`for: 0m` means no pending period).
      - alert: CorednsPanicCount
        expr: 'increase(coredns_panics_total[1m]) > 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "CoreDNS Panic Count (instance {{ $labels.instance }})"
          description: "Number of CoreDNS panics encountered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View File

@@ -69,7 +69,7 @@ groups:
annotations:
summary: Istio high 4xx error rate (instance {{ $labels.instance }})
description:
"High percentage of HTTP 5xx responses in Istio (> 5%).\n VALUE = {{ $value
"High percentage of HTTP 4xx responses in Istio (> 5%).\n VALUE = {{ $value
}}\n LABELS = {{ $labels }}"
- alert: IstioHigh5xxErrorRate

View File

@@ -203,18 +203,18 @@ groups:
summary: Host high CPU load (instance {{ $labels.instance }})
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuIsUnderutilized
expr:
'(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance)
group_left (nodename) node_uname_info{nodename=~".+"}'
for: 1w
labels:
severity: info
annotations:
summary: Host CPU is underutilized (instance {{ $labels.instance }})
description:
"CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{
$value }}\n LABELS = {{ $labels }}"
# - alert: HostCpuIsUnderutilized
# expr:
# '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance)
# group_left (nodename) node_uname_info{nodename=~".+"}'
# for: 1w
# labels:
# severity: info
# annotations:
# summary: Host CPU is underutilized (instance {{ $labels.instance }})
# description:
# "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{
# $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuStealNoisyNeighbor
expr:

View File

@@ -73,16 +73,16 @@ groups:
"PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS
= {{ $labels }}"
- alert: PostgresqlNotEnoughConnections
expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
for: 2m
labels:
severity: critical
annotations:
summary: Postgresql not enough connections (instance {{ $labels.instance }})
description:
"PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value
}}\n LABELS = {{ $labels }}"
# - alert: PostgresqlNotEnoughConnections
# expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
# for: 2m
# labels:
# severity: critical
# annotations:
# summary: Postgresql not enough connections (instance {{ $labels.instance }})
# description:
# "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value
# }}\n LABELS = {{ $labels }}"
- alert: PostgresqlDeadLocks
expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
@@ -109,17 +109,17 @@ groups:
"Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value
}}\n LABELS = {{ $labels }}"
- alert: PostgresqlCommitRateLow
expr:
'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
for: 2m
labels:
severity: critical
annotations:
summary: Postgresql commit rate low (instance {{ $labels.instance }})
description:
"Postgresql seems to be processing very few transactions\n VALUE = {{ $value
}}\n LABELS = {{ $labels }}"
# - alert: PostgresqlCommitRateLow
# expr:
# 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
# for: 2m
# labels:
# severity: critical
# annotations:
# summary: Postgresql commit rate low (instance {{ $labels.instance }})
# description:
# "Postgresql seems to be processing very few transactions\n VALUE = {{ $value
# }}\n LABELS = {{ $labels }}"
- alert: PostgresqlLowXidConsumption
expr: "rate(pg_txid_current[1m]) < 5"

View File

@@ -1,48 +0,0 @@
{ config, ... }:
{
services.prometheus.alertmanager = {
enable = true;
listenAddress = "127.0.0.1";
port = 9093;
webExternalUrl = "http://alertmanager.writefor.fun";
logLevel = "info";
environmentFile = config.age.secrets."alertmanager.env".path;
configuration = {
global = {
# The smarthost and SMTP sender used for mail notifications.
smtp_smarthost = "smtp.qq.com:465";
smtp_from = "$SMTP_SENDER_EMAIL";
smtp_auth_username = "$SMTP_AUTH_USERNAME";
smtp_auth_password = "$SMTP_AUTH_PASSWORD";
# smtp.qq.com:465 support SSL only, so we need to disable TLS here.
# https://service.mail.qq.com/detail/0/310
smtp_require_tls = false;
};
route = {
receiver = "default";
routes = [
{
group_by = [ "host" ];
group_wait = "5m";
group_interval = "5m";
repeat_interval = "4h";
receiver = "default";
}
];
};
receivers = [
{
name = "default";
email_configs = [
{
to = "ryan4yin@linux.com";
# Whether to notify about resolved alerts.
send_resolved = true;
}
];
}
];
};
};
}

View File

@@ -2,6 +2,6 @@
{
imports = [
./victoriametrics.nix
./alertmanager.nix
./alert.nix
];
}

View File

@@ -116,25 +116,4 @@
) [ ] myvars.networking.hostsAddr);
};
};
services.vmalert = {
enable = true;
settings = {
"datasource.url" = "http://localhost:9090";
"notifier.url" = [ "http://localhost:9093" ]; # alertmanager's api
# Whether to disable long-lived connections to the datasource.
"datasource.disableKeepAlive" = true;
# Whether to avoid stripping sensitive information such as auth headers or passwords
# from URLs in log messages or UI and exported metrics.
"datasource.showURL" = false;
rule = [
./alert_rules/node-exporter.yml
./alert_rules/kubestate-exporter.yml
./alert_rules/etcd_embedded-exporter.yml
./alert_rules/istio_embedded-exporter.yml
./alert_rules/coredns_embedded-exporter.yml
];
};
};
}

View File

@@ -14,5 +14,18 @@
# use either enabledCollectors or disabledCollectors
# disabledCollectors = [];
extraFlags = [
# Exclude pseudo/ephemeral FS:
# - /proc, /sys: kernel pseudo-FS, always size 0
# - /dev: tmpfs/devices, not meaningful for disk usage
# Exclude container/runtime mounts:
# - /var/lib/docker/, /var/lib/containers/ and /var/lib/kubelet/ → too many overlay/tmpfs mounts,
# often EACCES (strict perms, namespaces) → false alerts
# Exclude user bind mounts:
# - /home/ryan/.+ → bind-mounted from /persistent (NixOS tmpfs-root setup),
# monitoring /persistent is sufficient
"--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/containers/.+|var/lib/kubelet/.+|home/ryan/.+)($|/)"
];
};
}