From 7a82b8085a08f725b0c1dabd32dbd64aeb8eb8cb Mon Sep 17 00:00:00 2001 From: Ryan Yin Date: Sun, 14 Sep 2025 10:40:12 +0800 Subject: [PATCH] fix: alert - add coredns, comment out some useless alert rules --- .gitignore | 1 + .../alert_rules/coredns-exporter.yml | 13 ++++++ .../monitoring/alert_rules/node-exporter.yml | 24 +++++------ .../alert_rules/postgres-exporter.yml | 42 +++++++++---------- 4 files changed, 47 insertions(+), 33 deletions(-) create mode 100644 hosts/idols-aquamarine/monitoring/alert_rules/coredns-exporter.yml diff --git a/.gitignore b/.gitignore index 5e9d423d..dc6ba733 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ logs/ core* !core/ !core.nix +!coredns* diff --git a/hosts/idols-aquamarine/monitoring/alert_rules/coredns-exporter.yml b/hosts/idols-aquamarine/monitoring/alert_rules/coredns-exporter.yml new file mode 100644 index 00000000..0ca8b86d --- /dev/null +++ b/hosts/idols-aquamarine/monitoring/alert_rules/coredns-exporter.yml @@ -0,0 +1,13 @@ +groups: + - name: CoreDNS Exporter + + rules: + - alert: CorednsPanicCount + expr: "increase(coredns_panics_total[1m]) > 0" + for: 0m + labels: + severity: critical + annotations: + summary: CoreDNS Panic Count (instance {{ $labels.instance }}) + description: + "Number of CoreDNS panics encountered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/hosts/idols-aquamarine/monitoring/alert_rules/node-exporter.yml b/hosts/idols-aquamarine/monitoring/alert_rules/node-exporter.yml index 8ea26f8c..b28f4f4b 100644 --- a/hosts/idols-aquamarine/monitoring/alert_rules/node-exporter.yml +++ b/hosts/idols-aquamarine/monitoring/alert_rules/node-exporter.yml @@ -203,18 +203,18 @@ groups: summary: Host high CPU load (instance {{ $labels.instance }}) description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HostCpuIsUnderutilized - expr: - '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) - group_left (nodename) node_uname_info{nodename=~".+"}' - for: 1w - labels: - severity: info - annotations: - summary: Host CPU is underutilized (instance {{ $labels.instance }}) - description: - "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ - $value }}\n LABELS = {{ $labels }}" + # - alert: HostCpuIsUnderutilized + # expr: + # '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) + # group_left (nodename) node_uname_info{nodename=~".+"}' + # for: 1w + # labels: + # severity: info + # annotations: + # summary: Host CPU is underutilized (instance {{ $labels.instance }}) + # description: + # "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ + # $value }}\n LABELS = {{ $labels }}" - alert: HostCpuStealNoisyNeighbor expr: diff --git a/hosts/idols-aquamarine/monitoring/alert_rules/postgres-exporter.yml b/hosts/idols-aquamarine/monitoring/alert_rules/postgres-exporter.yml index 75d86a6b..9ea6ef5a 100644 --- a/hosts/idols-aquamarine/monitoring/alert_rules/postgres-exporter.yml +++ b/hosts/idols-aquamarine/monitoring/alert_rules/postgres-exporter.yml @@ -73,16 +73,16 @@ groups: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: PostgresqlNotEnoughConnections - expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5' - for: 2m - labels: - severity: critical - annotations: - summary: Postgresql not enough connections (instance {{ $labels.instance }}) - description: - "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value - }}\n LABELS = {{ $labels }}" + # - alert: PostgresqlNotEnoughConnections + # expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5' + # for: 2m + # labels: + # severity: critical + # annotations: + # summary: Postgresql not enough connections (instance {{ $labels.instance }}) + # description: + # "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value + # }}\n LABELS = {{ $labels }}" - alert: PostgresqlDeadLocks expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5' @@ -109,17 +109,17 @@ groups: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: PostgresqlCommitRateLow - expr: - 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5' - for: 2m - labels: - severity: critical - annotations: - summary: Postgresql commit rate low (instance {{ $labels.instance }}) - description: - "Postgresql seems to be processing very few transactions\n VALUE = {{ $value - }}\n LABELS = {{ $labels }}" + # - alert: PostgresqlCommitRateLow + # expr: + # 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5' + # for: 2m + # labels: + # severity: critical + # annotations: + # summary: Postgresql commit rate low (instance {{ $labels.instance }}) + # description: + # "Postgresql seems to be processing very few transactions\n VALUE = {{ $value + # }}\n LABELS = {{ $labels }}" - alert: PostgresqlLowXidConsumption expr: "rate(pg_txid_current[1m]) < 5"