fix: idols-aquamarine - alert (#221)

* fix: idols-aquamarine - alert * feat: add dashboards for victoriametrics * fix: node_exporter - exclude docker/podman/kubelet mounts and /home/ryan bindmounts * fix: alert - add coredns, comment out some useless alert rules
2026-01-11 22:30:25 +01:00 · 2025-09-14 10:48:38 +08:00
parent 01b69e810e 7a82b8085a
commit 48a9d7c507
14 changed files with 6757 additions and 107 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,4 @@ logs/
 core*
 !core/
 !core.nix
+!coredns*
--- a/flake.lock
+++ b/flake.lock
@@ -526,10 +526,10 @@
    "mysecrets": {
      "flake": false,
      "locked": {
-        "lastModified": 1752678564,
-        "narHash": "sha256-x2sbH7Umncbyc9oca5mqX8kMChHVUTytKk+QXEcB4i4=",
+        "lastModified": 1757651423,
+        "narHash": "sha256-w2hBme0vg3uDoEjP+0WuBT9hAhf1xJa4Np+GS2zQKXU=",
        "ref": "refs/heads/main",
-        "rev": "a231913597362c15c71fd9212cef5092ae85a64c",
+        "rev": "44b2943b7ebed5717bb9855c1b7a95c8a89fb7f7",
        "shallow": true,
        "type": "git",
        "url": "ssh://git@github.com/ryan4yin/nix-secrets.git"
--- a/hosts/idols-aquamarine/caddy.nix
+++ b/hosts/idols-aquamarine/caddy.nix
@@ -88,6 +88,11 @@ in
      encode zstd gzip
      reverse_proxy http://localhost:9093
    '';
+    virtualHosts."vmalert.writefor.fun".extraConfig = ''
+      ${hostCommonConfig}
+      encode zstd gzip
+      reverse_proxy http://localhost:8880
+    '';
    virtualHosts."minio.writefor.fun".extraConfig = ''
      ${hostCommonConfig}
      encode zstd gzip
--- a/hosts/idols-aquamarine/grafana/dashboards/README.md
+++ b/hosts/idols-aquamarine/grafana/dashboards/README.md
@@ -32,3 +32,7 @@ mixin provides a comprehensive package for monitoring Loki in production.
   - Instance:
     https://github.com/cloudnative-pg/grafana-dashboards/blob/main/charts/cluster/grafana-dashboard.json
   - Pooler(PGBouncer): https://github.com/cloudnative-pg/grafana-dashboards/issues/7
+
+## VictoriaMetrics
+
+- https://grafana.com/orgs/victoriametrics/dashboards
--- a/hosts/idols-aquamarine/grafana/dashboards/victoriametrics/victoria-metrics-single.json
+++ b/hosts/idols-aquamarine/grafana/dashboards/victoriametrics/victoria-metrics-single.json
--- a/hosts/idols-aquamarine/monitoring/alert.nix
+++ b/hosts/idols-aquamarine/monitoring/alert.nix
@@ -0,0 +1,134 @@
+{ config, lib, ... }:
+{
+  services.vmalert = {
+    enable = true;
+    settings = {
+      "httpListenAddr" = "127.0.0.1:8880"; # loopback only
+
+      "datasource.url" = "http://localhost:9090";
+      "notifier.url" = [ "http://localhost:9093" ]; # Alertmanager's API (services.prometheus.alertmanager below uses port 9093)
+
+      # Whether to disable long-lived connections to the datasource.
+      "datasource.disableKeepAlive" = true;
+      # Whether to avoid stripping sensitive information such as auth headers or passwords
+      # from URLs in log messages or UI and exported metrics.
+      "datasource.showURL" = false;
+      # Glob matching every *.yml file (alerting and/or recording rules) in ./alert_rules.
+      rule = [
+        "${./alert_rules}/*.yml"
+      ];
+    };
+  };
+
+  services.prometheus.alertmanager = {
+    enable = true;
+    listenAddress = "127.0.0.1";
+    port = 9093;
+    webExternalUrl = "http://alertmanager.writefor.fun";
+    logLevel = "info";
+    environmentFile = config.age.secrets."alertmanager.env".path; # expected to define the $SMTP_* and $TELEGRAM_BOT_TOKEN variables used below
+    configuration = {
+      global = {
+        # The smarthost and SMTP sender used for mail notifications.
+        smtp_smarthost = "smtp.qq.com:465";
+        smtp_from = "$SMTP_SENDER_EMAIL";
+        smtp_auth_username = "$SMTP_AUTH_USERNAME";
+        smtp_auth_password = "$SMTP_AUTH_PASSWORD";
+        # smtp.qq.com:465 only supports implicit TLS (SMTPS), so STARTTLS must not be required here.
+        # https://service.mail.qq.com/detail/0/310
+        smtp_require_tls = false;
+      };
+      route = {
+        receiver = "telegram";
+        routes = [
+          {
+            receiver = "telegram";
+            # group alerts by labels
+            group_by = [
+              "job"
+              # --- Alert labels ---
+              "alertname"
+              "alertgroup"
+              # --- kubernetes labels ---
+              "namespace"
+              "service"
+              # --- custom labels ---
+              "cluster"
+              "env"
+              "type"
+              "host"
+            ];
+            group_wait = "5m";
+            group_interval = "5m";
+            repeat_interval = "4h";
+          }
+          # {
+          #   # Route only prod env's critical alerts to email (most severe alerts)
+          #   match = {
+          #     severity = "critical";
+          #     env = "prd";
+          #   };
+          #   receiver = "email";
+          #   group_by = [
+          #     "host"
+          #     "namespace"
+          #     "pod"
+          #     "job"
+          #   ];
+          #   group_wait = "1m";
+          #   group_interval = "5m";
+          #   repeat_interval = "2h";
+          # }
+        ];
+      };
+      receivers = [
+        # {
+        #   name = "email";
+        #   email_configs = [
+        #     {
+        #       to = "ryan4yin@linux.com";
+        #       # Whether to notify about resolved alerts.
+        #       send_resolved = true;
+        #     }
+        #   ];
+        # }
+        {
+          name = "telegram";
+          telegram_configs = [
+            {
+              bot_token = "$TELEGRAM_BOT_TOKEN";
+              chat_id = 586169186; # My Telegram ID
+              # Whether to notify about resolved alerts.
+              send_resolved = true;
+              # Whether to deliver the Telegram message silently (false = normal notification).
+              disable_notifications = false;
+              # Parse mode for the message (NOTE(review): unescaped `_`/`*` in label values may break Markdown parsing - verify)
+              parse_mode = "Markdown";
+              # Message template. "pod" is not a group_by label, so it is read from .CommonLabels; "Ended" is printed for resolved alerts only.
+              message = ''
+                *Alert:* {{ .GroupLabels.alertname }}
+                *Status:* {{ .Status }}
+                *Severity:* {{ .CommonLabels.severity }}
+                {{ if .GroupLabels.namespace }}*Namespace:* {{ .GroupLabels.namespace }}{{ end }}
+                {{ if .CommonLabels.pod }}*Pod:* {{ .CommonLabels.pod }}{{ end }}
+                {{ if .GroupLabels.job }}*Job:* {{ .GroupLabels.job }}{{ end }}
+                {{ if .GroupLabels.host }}*Host:* {{ .GroupLabels.host }}{{ end }}
+
+                {{ range .Alerts }}
+                *Alert:* {{ .Annotations.summary }}
+                *Description:* {{ .Annotations.description }}
+                {{ if .Labels.instance }}*Instance:* {{ .Labels.instance }}{{ end }}
+                {{ if .Labels.container }}*Container:* {{ .Labels.container }}{{ end }}
+                *Started:* {{ .StartsAt.Format "2006-01-02 15:04:05" }}
+                {{ if eq .Status "resolved" }}
+                *Ended:* {{ .EndsAt.Format "2006-01-02 15:04:05" }}
+                {{ end }}
+                {{ end }}
+              '';
+            }
+          ];
+        }
+      ];
+    };
+  };
+}
--- a/hosts/idols-aquamarine/monitoring/alert_rules/coredns-exporter.yml
+++ b/hosts/idols-aquamarine/monitoring/alert_rules/coredns-exporter.yml
@@ -0,0 +1,13 @@
+groups:
+  - name: CoreDNS Exporter
+
+    rules:
+      - alert: CorednsPanicCount  # any increase of coredns_panics_total within the last 1m
+        expr: "increase(coredns_panics_total[1m]) > 0"
+        for: 0m  # no pending period: fires on the first evaluation where expr holds
+        labels:
+          severity: critical
+        annotations:
+          summary: CoreDNS Panic Count (instance {{ $labels.instance }})
+          description:
+            "Number of CoreDNS panics encountered\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/hosts/idols-aquamarine/monitoring/alert_rules/istio-exporter.yml
+++ b/hosts/idols-aquamarine/monitoring/alert_rules/istio-exporter.yml
@@ -69,7 +69,7 @@ groups:
        annotations:
          summary: Istio high 4xx error rate (instance {{ $labels.instance }})
          description:
-            "High percentage of HTTP 5xx responses in Istio (> 5%).\n  VALUE = {{ $value
+            "High percentage of HTTP 4xx responses in Istio (> 5%).\n  VALUE = {{ $value
            }}\n  LABELS = {{ $labels }}"

      - alert: IstioHigh5xxErrorRate
--- a/hosts/idols-aquamarine/monitoring/alert_rules/node-exporter.yml
+++ b/hosts/idols-aquamarine/monitoring/alert_rules/node-exporter.yml
@@ -203,18 +203,18 @@ groups:
          summary: Host high CPU load (instance {{ $labels.instance }})
          description: "CPU load is > 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-      - alert: HostCpuIsUnderutilized
-        expr:
-          '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance)
-          group_left (nodename) node_uname_info{nodename=~".+"}'
-        for: 1w
-        labels:
-          severity: info
-        annotations:
-          summary: Host CPU is underutilized (instance {{ $labels.instance }})
-          description:
-            "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n  VALUE = {{
-            $value }}\n  LABELS = {{ $labels }}"
+      # - alert: HostCpuIsUnderutilized
+      #   expr:
+      #     '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance)
+      #     group_left (nodename) node_uname_info{nodename=~".+"}'
+      #   for: 1w
+      #   labels:
+      #     severity: info
+      #   annotations:
+      #     summary: Host CPU is underutilized (instance {{ $labels.instance }})
+      #     description:
+      #       "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n  VALUE = {{
+      #       $value }}\n  LABELS = {{ $labels }}"

      - alert: HostCpuStealNoisyNeighbor
        expr:
--- a/hosts/idols-aquamarine/monitoring/alert_rules/postgres-exporter.yml
+++ b/hosts/idols-aquamarine/monitoring/alert_rules/postgres-exporter.yml
@@ -73,16 +73,16 @@ groups:
            "PostgreSQL instance has too many connections (> 80%).\n  VALUE = {{ $value }}\n  LABELS
            = {{ $labels }}"

-      - alert: PostgresqlNotEnoughConnections
-        expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
-        for: 2m
-        labels:
-          severity: critical
-        annotations:
-          summary: Postgresql not enough connections (instance {{ $labels.instance }})
-          description:
-            "PostgreSQL instance should have more connections (> 5)\n  VALUE = {{ $value
-            }}\n  LABELS = {{ $labels }}"
+      # - alert: PostgresqlNotEnoughConnections
+      #   expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
+      #   for: 2m
+      #   labels:
+      #     severity: critical
+      #   annotations:
+      #     summary: Postgresql not enough connections (instance {{ $labels.instance }})
+      #     description:
+      #       "PostgreSQL instance should have more connections (> 5)\n  VALUE = {{ $value
+      #       }}\n  LABELS = {{ $labels }}"

      - alert: PostgresqlDeadLocks
        expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
@@ -109,17 +109,17 @@ groups:
            "Ratio of transactions being aborted compared to committed is > 2 %\n  VALUE = {{ $value
            }}\n  LABELS = {{ $labels }}"

-      - alert: PostgresqlCommitRateLow
-        expr:
-          'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
-        for: 2m
-        labels:
-          severity: critical
-        annotations:
-          summary: Postgresql commit rate low (instance {{ $labels.instance }})
-          description:
-            "Postgresql seems to be processing very few transactions\n  VALUE = {{ $value
-            }}\n  LABELS = {{ $labels }}"
+      # - alert: PostgresqlCommitRateLow
+      #   expr:
+      #     'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
+      #   for: 2m
+      #   labels:
+      #     severity: critical
+      #   annotations:
+      #     summary: Postgresql commit rate low (instance {{ $labels.instance }})
+      #     description:
+      #       "Postgresql seems to be processing very few transactions\n  VALUE = {{ $value
+      #       }}\n  LABELS = {{ $labels }}"

      - alert: PostgresqlLowXidConsumption
        expr: "rate(pg_txid_current[1m]) < 5"
--- a/hosts/idols-aquamarine/monitoring/alertmanager.nix
+++ b/hosts/idols-aquamarine/monitoring/alertmanager.nix
@@ -1,48 +0,0 @@
-{ config, ... }:
-{
-  services.prometheus.alertmanager = {
-    enable = true;
-    listenAddress = "127.0.0.1";
-    port = 9093;
-    webExternalUrl = "http://alertmanager.writefor.fun";
-    logLevel = "info";
-
-    environmentFile = config.age.secrets."alertmanager.env".path;
-    configuration = {
-      global = {
-        # The smarthost and SMTP sender used for mail notifications.
-        smtp_smarthost = "smtp.qq.com:465";
-        smtp_from = "$SMTP_SENDER_EMAIL";
-        smtp_auth_username = "$SMTP_AUTH_USERNAME";
-        smtp_auth_password = "$SMTP_AUTH_PASSWORD";
-        # smtp.qq.com:465 support SSL only, so we need to disable TLS here.
-        # https://service.mail.qq.com/detail/0/310
-        smtp_require_tls = false;
-      };
-      route = {
-        receiver = "default";
-        routes = [
-          {
-            group_by = [ "host" ];
-            group_wait = "5m";
-            group_interval = "5m";
-            repeat_interval = "4h";
-            receiver = "default";
-          }
-        ];
-      };
-      receivers = [
-        {
-          name = "default";
-          email_configs = [
-            {
-              to = "ryan4yin@linux.com";
-              # Whether to notify about resolved alerts.
-              send_resolved = true;
-            }
-          ];
-        }
-      ];
-    };
-  };
-}
--- a/hosts/idols-aquamarine/monitoring/default.nix
+++ b/hosts/idols-aquamarine/monitoring/default.nix
@@ -2,6 +2,6 @@
 {
  imports = [
    ./victoriametrics.nix
-    ./alertmanager.nix
+    ./alert.nix
  ];
 }
--- a/hosts/idols-aquamarine/monitoring/victoriametrics.nix
+++ b/hosts/idols-aquamarine/monitoring/victoriametrics.nix
@@ -116,25 +116,4 @@
      ) [ ] myvars.networking.hostsAddr);
    };
  };
-
-  services.vmalert = {
-    enable = true;
-    settings = {
-      "datasource.url" = "http://localhost:9090";
-      "notifier.url" = [ "http://localhost:9093" ]; # alertmanager's api
-
-      # Whether to disable long-lived connections to the datasource.
-      "datasource.disableKeepAlive" = true;
-      # Whether to avoid stripping sensitive information such as auth headers or passwords
-      # from URLs in log messages or UI and exported metrics.
-      "datasource.showURL" = false;
-      rule = [
-        ./alert_rules/node-exporter.yml
-        ./alert_rules/kubestate-exporter.yml
-        ./alert_rules/etcd_embedded-exporter.yml
-        ./alert_rules/istio_embedded-exporter.yml
-        ./alert_rules/coredns_embedded-exporter.yml
-      ];
-    };
-  };
 }
--- a/modules/nixos/base/monitoring.nix
+++ b/modules/nixos/base/monitoring.nix
@@ -14,5 +14,18 @@

    # use either enabledCollectors or disabledCollectors
    # disabledCollectors = [];
+
+    extraFlags = [
+      # Exclude pseudo/ephemeral FS:
+      #   - /proc, /sys: kernel pseudo-FS, always size 0
+      #   - /dev: tmpfs/devices, not meaningful for disk usage
+      # Exclude container/runtime mounts:
+      #   - /var/lib/docker/, /var/lib/containers/ and /var/lib/kubelet/ → too many overlay/tmpfs mounts,
+      #     often EACCES (strict perms, namespaces) → false alerts
+      # Exclude user bind mounts:
+      #   - /home/ryan/.+ → bind-mounted from /persistent (NixOS tmpfs-root setup),
+      #     monitoring /persistent is sufficient
+      "--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/containers/.+|var/lib/kubelet/.+|home/ryan/.+)($|/)"
+    ];
  };
 }