mirror of
https://github.com/ryan4yin/nix-config.git
synced 2026-01-11 22:30:25 +01:00
fix: idols-aquamarine - alert (#221)
* fix: idols-aquamarine - alert * feat: add dashboards for victoriametrics * fix: node_exporter - exclude docker/podman/kubelet mounts and /home/ryan bindmounts * fix: alert - add coredns, comment out some useless alert rules
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -8,3 +8,4 @@ logs/
|
||||
core*
|
||||
!core/
|
||||
!core.nix
|
||||
!coredns*
|
||||
|
||||
6
flake.lock
generated
6
flake.lock
generated
@@ -526,10 +526,10 @@
|
||||
"mysecrets": {
|
||||
"flake": false,
|
||||
"locked": {
|
||||
"lastModified": 1752678564,
|
||||
"narHash": "sha256-x2sbH7Umncbyc9oca5mqX8kMChHVUTytKk+QXEcB4i4=",
|
||||
"lastModified": 1757651423,
|
||||
"narHash": "sha256-w2hBme0vg3uDoEjP+0WuBT9hAhf1xJa4Np+GS2zQKXU=",
|
||||
"ref": "refs/heads/main",
|
||||
"rev": "a231913597362c15c71fd9212cef5092ae85a64c",
|
||||
"rev": "44b2943b7ebed5717bb9855c1b7a95c8a89fb7f7",
|
||||
"shallow": true,
|
||||
"type": "git",
|
||||
"url": "ssh://git@github.com/ryan4yin/nix-secrets.git"
|
||||
|
||||
@@ -88,6 +88,11 @@ in
|
||||
encode zstd gzip
|
||||
reverse_proxy http://localhost:9093
|
||||
'';
|
||||
virtualHosts."vmalert.writefor.fun".extraConfig = ''
|
||||
${hostCommonConfig}
|
||||
encode zstd gzip
|
||||
reverse_proxy http://localhost:8880
|
||||
'';
|
||||
virtualHosts."minio.writefor.fun".extraConfig = ''
|
||||
${hostCommonConfig}
|
||||
encode zstd gzip
|
||||
|
||||
@@ -32,3 +32,7 @@ mixin provides a comprehensive package for monitoring Loki in production.
|
||||
- Instance:
|
||||
https://github.com/cloudnative-pg/grafana-dashboards/blob/main/charts/cluster/grafana-dashboard.json
|
||||
- Pooler(PGBouncer): https://github.com/cloudnative-pg/grafana-dashboards/issues/7
|
||||
|
||||
## VictoriaMetrics
|
||||
|
||||
- https://grafana.com/orgs/victoriametrics/dashboards
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
134
hosts/idols-aquamarine/monitoring/alert.nix
Normal file
134
hosts/idols-aquamarine/monitoring/alert.nix
Normal file
@@ -0,0 +1,134 @@
|
||||
{ config, lib, ... }:
|
||||
{
|
||||
services.vmalert = {
|
||||
enable = true;
|
||||
settings = {
|
||||
"httpListenAddr" = "127.0.0.1:8880";
|
||||
|
||||
"datasource.url" = "http://localhost:9090";
|
||||
"notifier.url" = [ "http://localhost:9093" ]; # alertmanager's api
|
||||
|
||||
# Whether to disable long-lived connections to the datasource.
|
||||
"datasource.disableKeepAlive" = true;
|
||||
# Whether to avoid stripping sensitive information such as auth headers or passwords
|
||||
# from URLs in log messages or UI and exported metrics.
|
||||
"datasource.showURL" = false;
|
||||
# Path to the files with alerting and/or recording rules.
|
||||
rule = [
|
||||
"${./alert_rules}/*.yml"
|
||||
];
|
||||
};
|
||||
};
|
||||
|
||||
services.prometheus.alertmanager = {
|
||||
enable = true;
|
||||
listenAddress = "127.0.0.1";
|
||||
port = 9093;
|
||||
webExternalUrl = "http://alertmanager.writefor.fun";
|
||||
logLevel = "info";
|
||||
environmentFile = config.age.secrets."alertmanager.env".path;
|
||||
configuration = {
|
||||
global = {
|
||||
# The smarthost and SMTP sender used for mail notifications.
|
||||
smtp_smarthost = "smtp.qq.com:465";
|
||||
smtp_from = "$SMTP_SENDER_EMAIL";
|
||||
smtp_auth_username = "$SMTP_AUTH_USERNAME";
|
||||
smtp_auth_password = "$SMTP_AUTH_PASSWORD";
|
||||
# smtp.qq.com:465 supports implicit SSL/TLS only, so requiring STARTTLS must be disabled here.
|
||||
# https://service.mail.qq.com/detail/0/310
|
||||
smtp_require_tls = false;
|
||||
};
|
||||
route = {
|
||||
receiver = "telegram";
|
||||
routes = [
|
||||
{
|
||||
receiver = "telegram";
|
||||
# group alerts by labels
|
||||
group_by = [
|
||||
"job"
|
||||
# --- Alert labels ---
|
||||
"alertname"
|
||||
"alertgroup"
|
||||
# --- kubernetes labels ---
|
||||
"namespace"
|
||||
"service"
|
||||
# --- custom labels ---
|
||||
"cluster"
|
||||
"env"
|
||||
"type"
|
||||
"host"
|
||||
];
|
||||
group_wait = "5m";
|
||||
group_interval = "5m";
|
||||
repeat_interval = "4h";
|
||||
}
|
||||
# {
|
||||
# # Route only prod env's critical alerts to email (most severe alerts)
|
||||
# match = {
|
||||
# severity = "critical";
|
||||
# env = "prd";
|
||||
# };
|
||||
# receiver = "email";
|
||||
# group_by = [
|
||||
# "host"
|
||||
# "namespace"
|
||||
# "pod"
|
||||
# "job"
|
||||
# ];
|
||||
# group_wait = "1m";
|
||||
# group_interval = "5m";
|
||||
# repeat_interval = "2h";
|
||||
# }
|
||||
];
|
||||
};
|
||||
receivers = [
|
||||
# {
|
||||
# name = "email";
|
||||
# email_configs = [
|
||||
# {
|
||||
# to = "ryan4yin@linux.com";
|
||||
# # Whether to notify about resolved alerts.
|
||||
# send_resolved = true;
|
||||
# }
|
||||
# ];
|
||||
# }
|
||||
{
|
||||
name = "telegram";
|
||||
telegram_configs = [
|
||||
{
|
||||
bot_token = "$TELEGRAM_BOT_TOKEN";
|
||||
chat_id = 586169186; # My Telegram ID
|
||||
# Whether to notify about resolved alerts.
|
||||
send_resolved = true;
|
||||
# Whether to send Telegram messages silently (no notification sound); false keeps normal notifications
|
||||
disable_notifications = false;
|
||||
# Parse mode for the message
|
||||
parse_mode = "Markdown";
|
||||
# Message template
|
||||
message = ''
|
||||
*Alert:* {{ .GroupLabels.alertname }}
|
||||
*Status:* {{ .Status }}
|
||||
*Severity:* {{ .CommonLabels.severity }}
|
||||
{{ if .GroupLabels.namespace }}*Namespace:* {{ .GroupLabels.namespace }}{{ end }}
|
||||
{{ if .GroupLabels.pod }}*Pod:* {{ .GroupLabels.pod }}{{ end }}
|
||||
{{ if .GroupLabels.job }}*Job:* {{ .GroupLabels.job }}{{ end }}
|
||||
{{ if .GroupLabels.host }}*Host:* {{ .GroupLabels.host }}{{ end }}
|
||||
|
||||
{{ range .Alerts }}
|
||||
*Alert:* {{ .Annotations.summary }}
|
||||
*Description:* {{ .Annotations.description }}
|
||||
{{ if .Labels.instance }}*Instance:* {{ .Labels.instance }}{{ end }}
|
||||
{{ if .Labels.container }}*Container:* {{ .Labels.container }}{{ end }}
|
||||
*Started:* {{ .StartsAt.Format "2006-01-02 15:04:05" }}
|
||||
{{ if not .EndsAt.IsZero }}{{/* EndsAt is a time.Time, which is always truthy in Go templates; test for the zero value so the Ended line is skipped when no end time is set */}}
*Ended:* {{ .EndsAt.Format "2006-01-02 15:04:05" }}
{{ end }}
|
||||
{{ end }}
|
||||
'';
|
||||
}
|
||||
];
|
||||
}
|
||||
];
|
||||
};
|
||||
};
|
||||
}
|
||||
@@ -0,0 +1,13 @@
|
||||
groups:
|
||||
- name: CoreDNS Exporter
|
||||
|
||||
rules:
|
||||
- alert: CorednsPanicCount
|
||||
expr: "increase(coredns_panics_total[1m]) > 0"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: CoreDNS Panic Count (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Number of CoreDNS panics encountered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
@@ -69,7 +69,7 @@ groups:
|
||||
annotations:
|
||||
summary: Istio high 4xx error rate (instance {{ $labels.instance }})
|
||||
description:
|
||||
"High percentage of HTTP 5xx responses in Istio (> 5%).\n VALUE = {{ $value
|
||||
"High percentage of HTTP 4xx responses in Istio (> 5%).\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioHigh5xxErrorRate
|
||||
|
||||
@@ -203,18 +203,18 @@ groups:
|
||||
summary: Host high CPU load (instance {{ $labels.instance }})
|
||||
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostCpuIsUnderutilized
|
||||
expr:
|
||||
'(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance)
|
||||
group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 1w
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host CPU is underutilized (instance {{ $labels.instance }})
|
||||
description:
|
||||
"CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{
|
||||
$value }}\n LABELS = {{ $labels }}"
|
||||
# - alert: HostCpuIsUnderutilized
|
||||
# expr:
|
||||
# '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance)
|
||||
# group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
# for: 1w
|
||||
# labels:
|
||||
# severity: info
|
||||
# annotations:
|
||||
# summary: Host CPU is underutilized (instance {{ $labels.instance }})
|
||||
# description:
|
||||
# "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{
|
||||
# $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostCpuStealNoisyNeighbor
|
||||
expr:
|
||||
|
||||
@@ -73,16 +73,16 @@ groups:
|
||||
"PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS
|
||||
= {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlNotEnoughConnections
|
||||
expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Postgresql not enough connections (instance {{ $labels.instance }})
|
||||
description:
|
||||
"PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
# - alert: PostgresqlNotEnoughConnections
|
||||
# expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
|
||||
# for: 2m
|
||||
# labels:
|
||||
# severity: critical
|
||||
# annotations:
|
||||
# summary: Postgresql not enough connections (instance {{ $labels.instance }})
|
||||
# description:
|
||||
# "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value
|
||||
# }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlDeadLocks
|
||||
expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
|
||||
@@ -109,17 +109,17 @@ groups:
|
||||
"Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlCommitRateLow
|
||||
expr:
|
||||
'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Postgresql commit rate low (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Postgresql seems to be processing very few transactions\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
# - alert: PostgresqlCommitRateLow
|
||||
# expr:
|
||||
# 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
|
||||
# for: 2m
|
||||
# labels:
|
||||
# severity: critical
|
||||
# annotations:
|
||||
# summary: Postgresql commit rate low (instance {{ $labels.instance }})
|
||||
# description:
|
||||
# "Postgresql seems to be processing very few transactions\n VALUE = {{ $value
|
||||
# }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlLowXidConsumption
|
||||
expr: "rate(pg_txid_current[1m]) < 5"
|
||||
|
||||
@@ -1,48 +0,0 @@
|
||||
{ config, ... }:
|
||||
{
|
||||
services.prometheus.alertmanager = {
|
||||
enable = true;
|
||||
listenAddress = "127.0.0.1";
|
||||
port = 9093;
|
||||
webExternalUrl = "http://alertmanager.writefor.fun";
|
||||
logLevel = "info";
|
||||
|
||||
environmentFile = config.age.secrets."alertmanager.env".path;
|
||||
configuration = {
|
||||
global = {
|
||||
# The smarthost and SMTP sender used for mail notifications.
|
||||
smtp_smarthost = "smtp.qq.com:465";
|
||||
smtp_from = "$SMTP_SENDER_EMAIL";
|
||||
smtp_auth_username = "$SMTP_AUTH_USERNAME";
|
||||
smtp_auth_password = "$SMTP_AUTH_PASSWORD";
|
||||
# smtp.qq.com:465 support SSL only, so we need to disable TLS here.
|
||||
# https://service.mail.qq.com/detail/0/310
|
||||
smtp_require_tls = false;
|
||||
};
|
||||
route = {
|
||||
receiver = "default";
|
||||
routes = [
|
||||
{
|
||||
group_by = [ "host" ];
|
||||
group_wait = "5m";
|
||||
group_interval = "5m";
|
||||
repeat_interval = "4h";
|
||||
receiver = "default";
|
||||
}
|
||||
];
|
||||
};
|
||||
receivers = [
|
||||
{
|
||||
name = "default";
|
||||
email_configs = [
|
||||
{
|
||||
to = "ryan4yin@linux.com";
|
||||
# Whether to notify about resolved alerts.
|
||||
send_resolved = true;
|
||||
}
|
||||
];
|
||||
}
|
||||
];
|
||||
};
|
||||
};
|
||||
}
|
||||
@@ -2,6 +2,6 @@
|
||||
{
|
||||
imports = [
|
||||
./victoriametrics.nix
|
||||
./alertmanager.nix
|
||||
./alert.nix
|
||||
];
|
||||
}
|
||||
|
||||
@@ -116,25 +116,4 @@
|
||||
) [ ] myvars.networking.hostsAddr);
|
||||
};
|
||||
};
|
||||
|
||||
services.vmalert = {
|
||||
enable = true;
|
||||
settings = {
|
||||
"datasource.url" = "http://localhost:9090";
|
||||
"notifier.url" = [ "http://localhost:9093" ]; # alertmanager's api
|
||||
|
||||
# Whether to disable long-lived connections to the datasource.
|
||||
"datasource.disableKeepAlive" = true;
|
||||
# Whether to avoid stripping sensitive information such as auth headers or passwords
|
||||
# from URLs in log messages or UI and exported metrics.
|
||||
"datasource.showURL" = false;
|
||||
rule = [
|
||||
./alert_rules/node-exporter.yml
|
||||
./alert_rules/kubestate-exporter.yml
|
||||
./alert_rules/etcd_embedded-exporter.yml
|
||||
./alert_rules/istio_embedded-exporter.yml
|
||||
./alert_rules/coredns_embedded-exporter.yml
|
||||
];
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
@@ -14,5 +14,18 @@
|
||||
|
||||
# use either enabledCollectors or disabledCollectors
|
||||
# disabledCollectors = [];
|
||||
|
||||
extraFlags = [
|
||||
# Exclude pseudo/ephemeral FS:
|
||||
# - /proc, /sys: kernel pseudo-FS, always size 0
|
||||
# - /dev: tmpfs/devices, not meaningful for disk usage
|
||||
# Exclude container/runtime mounts:
|
||||
# - /var/lib/docker/, /var/lib/containers/ and /var/lib/kubelet/ → too many overlay/tmpfs mounts,
|
||||
# often EACCES (strict perms, namespaces) → false alerts
|
||||
# Exclude user bind mounts:
|
||||
# - /home/ryan/.+ → bind-mounted from /persistent (NixOS tmpfs-root setup),
|
||||
# monitoring /persistent is sufficient
|
||||
"--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/containers/.+|var/lib/kubelet/.+|home/ryan/.+)($|/)"
|
||||
];
|
||||
};
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user