mirror of
https://github.com/ryan4yin/nix-config.git
synced 2026-04-25 17:38:29 +02:00
fix: idols-aquamarine - alert (#221)
* fix: idols-aquamarine - alert * feat: add dashboards for victoriametrics * fix: node_exporter - exclude docker/podman/kubelet mounts and /home/ryan bindmounts * fix: alert - add coredns, comment out some useless alert rules
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -8,3 +8,4 @@ logs/
|
|||||||
core*
|
core*
|
||||||
!core/
|
!core/
|
||||||
!core.nix
|
!core.nix
|
||||||
|
!coredns*
|
||||||
|
|||||||
6
flake.lock
generated
6
flake.lock
generated
@@ -526,10 +526,10 @@
|
|||||||
"mysecrets": {
|
"mysecrets": {
|
||||||
"flake": false,
|
"flake": false,
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1752678564,
|
"lastModified": 1757651423,
|
||||||
"narHash": "sha256-x2sbH7Umncbyc9oca5mqX8kMChHVUTytKk+QXEcB4i4=",
|
"narHash": "sha256-w2hBme0vg3uDoEjP+0WuBT9hAhf1xJa4Np+GS2zQKXU=",
|
||||||
"ref": "refs/heads/main",
|
"ref": "refs/heads/main",
|
||||||
"rev": "a231913597362c15c71fd9212cef5092ae85a64c",
|
"rev": "44b2943b7ebed5717bb9855c1b7a95c8a89fb7f7",
|
||||||
"shallow": true,
|
"shallow": true,
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "ssh://git@github.com/ryan4yin/nix-secrets.git"
|
"url": "ssh://git@github.com/ryan4yin/nix-secrets.git"
|
||||||
|
|||||||
@@ -88,6 +88,11 @@ in
|
|||||||
encode zstd gzip
|
encode zstd gzip
|
||||||
reverse_proxy http://localhost:9093
|
reverse_proxy http://localhost:9093
|
||||||
'';
|
'';
|
||||||
|
virtualHosts."vmalert.writefor.fun".extraConfig = ''
|
||||||
|
${hostCommonConfig}
|
||||||
|
encode zstd gzip
|
||||||
|
reverse_proxy http://localhost:8880
|
||||||
|
'';
|
||||||
virtualHosts."minio.writefor.fun".extraConfig = ''
|
virtualHosts."minio.writefor.fun".extraConfig = ''
|
||||||
${hostCommonConfig}
|
${hostCommonConfig}
|
||||||
encode zstd gzip
|
encode zstd gzip
|
||||||
|
|||||||
@@ -32,3 +32,7 @@ mixin provides a comprehensive package for monitoring Loki in production.
|
|||||||
- Instance:
|
- Instance:
|
||||||
https://github.com/cloudnative-pg/grafana-dashboards/blob/main/charts/cluster/grafana-dashboard.json
|
https://github.com/cloudnative-pg/grafana-dashboards/blob/main/charts/cluster/grafana-dashboard.json
|
||||||
- Pooler(PGBouncer): https://github.com/cloudnative-pg/grafana-dashboards/issues/7
|
- Pooler(PGBouncer): https://github.com/cloudnative-pg/grafana-dashboards/issues/7
|
||||||
|
|
||||||
|
## VictoriaMetrics
|
||||||
|
|
||||||
|
- https://grafana.com/orgs/victoriametrics/dashboards
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
134
hosts/idols-aquamarine/monitoring/alert.nix
Normal file
134
hosts/idols-aquamarine/monitoring/alert.nix
Normal file
@@ -0,0 +1,134 @@
|
|||||||
|
# Alerting stack for this host:
#   - vmalert evaluates the rule files under ./alert_rules and pushes firing
#     alerts to the notifier URL below.
#   - Alertmanager (listening on 127.0.0.1:9093) groups the alerts and sends
#     them to Telegram.
{ config, lib, ... }:
{
  services.vmalert = {
    enable = true;
    settings = {
      # Listen on loopback only.
      "httpListenAddr" = "127.0.0.1:8880";

      "datasource.url" = "http://localhost:9090";
      "notifier.url" = [ "http://localhost:9093" ]; # alertmanager's api

      # Whether to disable long-lived connections to the datasource.
      "datasource.disableKeepAlive" = true;
      # Whether to avoid stripping sensitive information such as auth headers or passwords
      # from URLs in log messages or UI and exported metrics.
      "datasource.showURL" = false;
      # Path to the files with alerting and/or recording rules.
      # Interpolating the path copies the whole ./alert_rules directory into
      # the Nix store; the `*.yml` glob is left for vmalert to expand.
      rule = [
        "${./alert_rules}/*.yml"
      ];
    };
  };

  services.prometheus.alertmanager = {
    enable = true;
    listenAddress = "127.0.0.1";
    port = 9093;
    webExternalUrl = "http://alertmanager.writefor.fun";
    logLevel = "info";
    # Supplies the values for the `$NAME` placeholders used in the
    # configuration below, so that the secrets stay out of the Nix store.
    environmentFile = config.age.secrets."alertmanager.env".path;
    configuration = {
      global = {
        # The smarthost and SMTP sender used for mail notifications.
        smtp_smarthost = "smtp.qq.com:465";
        smtp_from = "$SMTP_SENDER_EMAIL";
        smtp_auth_username = "$SMTP_AUTH_USERNAME";
        smtp_auth_password = "$SMTP_AUTH_PASSWORD";
        # smtp.qq.com:465 supports SSL only, so we need to disable TLS here.
        # https://service.mail.qq.com/detail/0/310
        smtp_require_tls = false;
      };
      route = {
        receiver = "telegram";
        routes = [
          {
            receiver = "telegram";
            # group alerts by labels
            group_by = [
              "job"
              # --- Alert labels ---
              "alertname"
              "alertgroup"
              # --- kubernetes labels ---
              "namespace"
              "service"
              # --- custom labels ---
              "cluster"
              "env"
              "type"
              "host"
            ];
            group_wait = "5m";
            group_interval = "5m";
            repeat_interval = "4h";
          }
          # {
          #   # Route only prod env's critical alerts to email (most severe alerts)
          #   match = {
          #     severity = "critical";
          #     env = "prd";
          #   };
          #   receiver = "email";
          #   group_by = [
          #     "host"
          #     "namespace"
          #     "pod"
          #     "job"
          #   ];
          #   group_wait = "1m";
          #   group_interval = "5m";
          #   repeat_interval = "2h";
          # }
        ];
      };
      receivers = [
        # {
        #   name = "email";
        #   email_configs = [
        #     {
        #       to = "ryan4yin@linux.com";
        #       # Whether to notify about resolved alerts.
        #       send_resolved = true;
        #     }
        #   ];
        # }
        {
          name = "telegram";
          telegram_configs = [
            {
              bot_token = "$TELEGRAM_BOT_TOKEN";
              chat_id = 586169186; # My Telegram ID
              # Whether to notify about resolved alerts.
              send_resolved = true;
              # When true, Telegram delivers the message silently; keep it
              # false so that alerts trigger a normal notification.
              disable_notifications = false;
              # Parse mode for the message
              parse_mode = "Markdown";
              # Message template
              #
              # Notes:
              #   - "pod" is not part of group_by above, so it is read from
              #     .CommonLabels (.GroupLabels only holds the group_by labels).
              #   - .EndsAt is a time.Time value, which Go templates always
              #     treat as true, so "Ended" is gated on the alert status.
              message = ''
                *Alert:* {{ .GroupLabels.alertname }}
                *Status:* {{ .Status }}
                *Severity:* {{ .CommonLabels.severity }}
                {{ if .GroupLabels.namespace }}*Namespace:* {{ .GroupLabels.namespace }}{{ end }}
                {{ if .CommonLabels.pod }}*Pod:* {{ .CommonLabels.pod }}{{ end }}
                {{ if .GroupLabels.job }}*Job:* {{ .GroupLabels.job }}{{ end }}
                {{ if .GroupLabels.host }}*Host:* {{ .GroupLabels.host }}{{ end }}

                {{ range .Alerts }}
                *Alert:* {{ .Annotations.summary }}
                *Description:* {{ .Annotations.description }}
                {{ if .Labels.instance }}*Instance:* {{ .Labels.instance }}{{ end }}
                {{ if .Labels.container }}*Container:* {{ .Labels.container }}{{ end }}
                *Started:* {{ .StartsAt.Format "2006-01-02 15:04:05" }}
                {{ if eq .Status "resolved" }}
                *Ended:* {{ .EndsAt.Format "2006-01-02 15:04:05" }}
                {{ end }}
                {{ end }}
              '';
            }
          ];
        }
      ];
    };
  };
}
|
||||||
@@ -0,0 +1,13 @@
|
|||||||
|
# Alerting rules for CoreDNS metrics.
groups:
  - name: "CoreDNS Exporter"
    rules:
      # Fires immediately (for: 0m) when the panic counter has increased
      # within the last minute.
      - alert: CorednsPanicCount
        expr: 'increase(coredns_panics_total[1m]) > 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "CoreDNS Panic Count (instance {{ $labels.instance }})"
          description: "Number of CoreDNS panics encountered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
@@ -69,7 +69,7 @@ groups:
|
|||||||
annotations:
|
annotations:
|
||||||
summary: Istio high 4xx error rate (instance {{ $labels.instance }})
|
summary: Istio high 4xx error rate (instance {{ $labels.instance }})
|
||||||
description:
|
description:
|
||||||
"High percentage of HTTP 5xx responses in Istio (> 5%).\n VALUE = {{ $value
|
"High percentage of HTTP 4xx responses in Istio (> 5%).\n VALUE = {{ $value
|
||||||
}}\n LABELS = {{ $labels }}"
|
}}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: IstioHigh5xxErrorRate
|
- alert: IstioHigh5xxErrorRate
|
||||||
|
|||||||
@@ -203,18 +203,18 @@ groups:
|
|||||||
summary: Host high CPU load (instance {{ $labels.instance }})
|
summary: Host high CPU load (instance {{ $labels.instance }})
|
||||||
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostCpuIsUnderutilized
|
# - alert: HostCpuIsUnderutilized
|
||||||
expr:
|
# expr:
|
||||||
'(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance)
|
# '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance)
|
||||||
group_left (nodename) node_uname_info{nodename=~".+"}'
|
# group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||||
for: 1w
|
# for: 1w
|
||||||
labels:
|
# labels:
|
||||||
severity: info
|
# severity: info
|
||||||
annotations:
|
# annotations:
|
||||||
summary: Host CPU is underutilized (instance {{ $labels.instance }})
|
# summary: Host CPU is underutilized (instance {{ $labels.instance }})
|
||||||
description:
|
# description:
|
||||||
"CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{
|
# "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{
|
||||||
$value }}\n LABELS = {{ $labels }}"
|
# $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostCpuStealNoisyNeighbor
|
- alert: HostCpuStealNoisyNeighbor
|
||||||
expr:
|
expr:
|
||||||
|
|||||||
@@ -73,16 +73,16 @@ groups:
|
|||||||
"PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS
|
"PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS
|
||||||
= {{ $labels }}"
|
= {{ $labels }}"
|
||||||
|
|
||||||
- alert: PostgresqlNotEnoughConnections
|
# - alert: PostgresqlNotEnoughConnections
|
||||||
expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
|
# expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
|
||||||
for: 2m
|
# for: 2m
|
||||||
labels:
|
# labels:
|
||||||
severity: critical
|
# severity: critical
|
||||||
annotations:
|
# annotations:
|
||||||
summary: Postgresql not enough connections (instance {{ $labels.instance }})
|
# summary: Postgresql not enough connections (instance {{ $labels.instance }})
|
||||||
description:
|
# description:
|
||||||
"PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value
|
# "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value
|
||||||
}}\n LABELS = {{ $labels }}"
|
# }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: PostgresqlDeadLocks
|
- alert: PostgresqlDeadLocks
|
||||||
expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
|
expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
|
||||||
@@ -109,17 +109,17 @@ groups:
|
|||||||
"Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value
|
"Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value
|
||||||
}}\n LABELS = {{ $labels }}"
|
}}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: PostgresqlCommitRateLow
|
# - alert: PostgresqlCommitRateLow
|
||||||
expr:
|
# expr:
|
||||||
'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
|
# 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
|
||||||
for: 2m
|
# for: 2m
|
||||||
labels:
|
# labels:
|
||||||
severity: critical
|
# severity: critical
|
||||||
annotations:
|
# annotations:
|
||||||
summary: Postgresql commit rate low (instance {{ $labels.instance }})
|
# summary: Postgresql commit rate low (instance {{ $labels.instance }})
|
||||||
description:
|
# description:
|
||||||
"Postgresql seems to be processing very few transactions\n VALUE = {{ $value
|
# "Postgresql seems to be processing very few transactions\n VALUE = {{ $value
|
||||||
}}\n LABELS = {{ $labels }}"
|
# }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: PostgresqlLowXidConsumption
|
- alert: PostgresqlLowXidConsumption
|
||||||
expr: "rate(pg_txid_current[1m]) < 5"
|
expr: "rate(pg_txid_current[1m]) < 5"
|
||||||
|
|||||||
@@ -1,48 +0,0 @@
|
|||||||
{ config, ... }:
|
|
||||||
{
|
|
||||||
services.prometheus.alertmanager = {
|
|
||||||
enable = true;
|
|
||||||
listenAddress = "127.0.0.1";
|
|
||||||
port = 9093;
|
|
||||||
webExternalUrl = "http://alertmanager.writefor.fun";
|
|
||||||
logLevel = "info";
|
|
||||||
|
|
||||||
environmentFile = config.age.secrets."alertmanager.env".path;
|
|
||||||
configuration = {
|
|
||||||
global = {
|
|
||||||
# The smarthost and SMTP sender used for mail notifications.
|
|
||||||
smtp_smarthost = "smtp.qq.com:465";
|
|
||||||
smtp_from = "$SMTP_SENDER_EMAIL";
|
|
||||||
smtp_auth_username = "$SMTP_AUTH_USERNAME";
|
|
||||||
smtp_auth_password = "$SMTP_AUTH_PASSWORD";
|
|
||||||
# smtp.qq.com:465 support SSL only, so we need to disable TLS here.
|
|
||||||
# https://service.mail.qq.com/detail/0/310
|
|
||||||
smtp_require_tls = false;
|
|
||||||
};
|
|
||||||
route = {
|
|
||||||
receiver = "default";
|
|
||||||
routes = [
|
|
||||||
{
|
|
||||||
group_by = [ "host" ];
|
|
||||||
group_wait = "5m";
|
|
||||||
group_interval = "5m";
|
|
||||||
repeat_interval = "4h";
|
|
||||||
receiver = "default";
|
|
||||||
}
|
|
||||||
];
|
|
||||||
};
|
|
||||||
receivers = [
|
|
||||||
{
|
|
||||||
name = "default";
|
|
||||||
email_configs = [
|
|
||||||
{
|
|
||||||
to = "ryan4yin@linux.com";
|
|
||||||
# Whether to notify about resolved alerts.
|
|
||||||
send_resolved = true;
|
|
||||||
}
|
|
||||||
];
|
|
||||||
}
|
|
||||||
];
|
|
||||||
};
|
|
||||||
};
|
|
||||||
}
|
|
||||||
@@ -2,6 +2,6 @@
|
|||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
./victoriametrics.nix
|
./victoriametrics.nix
|
||||||
./alertmanager.nix
|
./alert.nix
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -116,25 +116,4 @@
|
|||||||
) [ ] myvars.networking.hostsAddr);
|
) [ ] myvars.networking.hostsAddr);
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
services.vmalert = {
|
|
||||||
enable = true;
|
|
||||||
settings = {
|
|
||||||
"datasource.url" = "http://localhost:9090";
|
|
||||||
"notifier.url" = [ "http://localhost:9093" ]; # alertmanager's api
|
|
||||||
|
|
||||||
# Whether to disable long-lived connections to the datasource.
|
|
||||||
"datasource.disableKeepAlive" = true;
|
|
||||||
# Whether to avoid stripping sensitive information such as auth headers or passwords
|
|
||||||
# from URLs in log messages or UI and exported metrics.
|
|
||||||
"datasource.showURL" = false;
|
|
||||||
rule = [
|
|
||||||
./alert_rules/node-exporter.yml
|
|
||||||
./alert_rules/kubestate-exporter.yml
|
|
||||||
./alert_rules/etcd_embedded-exporter.yml
|
|
||||||
./alert_rules/istio_embedded-exporter.yml
|
|
||||||
./alert_rules/coredns_embedded-exporter.yml
|
|
||||||
];
|
|
||||||
};
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -14,5 +14,18 @@
|
|||||||
|
|
||||||
# use either enabledCollectors or disabledCollectors
|
# use either enabledCollectors or disabledCollectors
|
||||||
# disabledCollectors = [];
|
# disabledCollectors = [];
|
||||||
|
|
||||||
|
extraFlags = [
|
||||||
|
# Exclude pseudo/ephemeral FS:
|
||||||
|
# - /proc, /sys: kernel pseudo-FS, always size 0
|
||||||
|
# - /dev: tmpfs/devices, not meaningful for disk usage
|
||||||
|
# Exclude container/runtime mounts:
|
||||||
|
# - /var/lib/docker/, /var/lib/containers/ and /var/lib/kubelet/ → too many overlay/tmpfs mounts,
|
||||||
|
# often EACCES (strict perms, namespaces) → false alerts
|
||||||
|
# Exclude user bind mounts:
|
||||||
|
# - /home/ryan/.+ → bind-mounted from /persistent (NixOS tmpfs-root setup),
|
||||||
|
# monitoring /persistent is sufficient
|
||||||
|
"--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/containers/.+|var/lib/kubelet/.+|home/ryan/.+)($|/)"
|
||||||
|
];
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user