fix: alert - add coredns, comment out some useless alert rules

This commit is contained in:
Ryan Yin
2025-09-14 10:40:12 +08:00
parent c8182216ae
commit 7a82b8085a
4 changed files with 47 additions and 33 deletions

1
.gitignore vendored
View File

@@ -8,3 +8,4 @@ logs/
core*
!core/
!core.nix
!coredns*

View File

@@ -0,0 +1,13 @@
# Prometheus alerting rules for CoreDNS.
groups:
  - name: CoreDNS Exporter
    rules:
      # Fires as soon as the panic counter shows any increase within the
      # last minute; `for: 0m` means there is no pending period.
      # NOTE(review): presumably coredns_panics_total is scraped from
      # CoreDNS's own metrics endpoint -- confirm a scrape job exists.
      - alert: CorednsPanicCount
        expr: 'increase(coredns_panics_total[1m]) > 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: CoreDNS Panic Count (instance {{ $labels.instance }})
          description: "Number of CoreDNS panics encountered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View File

@@ -203,18 +203,18 @@ groups:
summary: Host high CPU load (instance {{ $labels.instance }})
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Info-severity rule: fires once the expression below has held for one week.
# The expression takes the 30m rate of idle-mode node_cpu_seconds_total,
# converts it to a usage figure (100 - idle * 100), keeps series under 20, and
# joins node_uname_info on `instance` to attach the `nodename` label.
# NOTE(review): no aggregation is applied before the comparison, so this
# presumably evaluates per CPU series rather than per host -- confirm intent.
- alert: HostCpuIsUnderutilized
expr:
'(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance)
group_left (nodename) node_uname_info{nodename=~".+"}'
for: 1w
labels:
severity: info
annotations:
summary: Host CPU is underutilized (instance {{ $labels.instance }})
description:
"CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{
$value }}\n LABELS = {{ $labels }}"
# - alert: HostCpuIsUnderutilized
# expr:
# '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance)
# group_left (nodename) node_uname_info{nodename=~".+"}'
# for: 1w
# labels:
# severity: info
# annotations:
# summary: Host CPU is underutilized (instance {{ $labels.instance }})
# description:
# "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{
# $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuStealNoisyNeighbor
expr:

View File

@@ -73,16 +73,16 @@ groups:
"PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS
= {{ $labels }}"
# Critical-severity rule: fires when, for 2m, the per-`datname` sum of
# pg_stat_activity_count is below 5. Databases whose name matches
# `template.*|postgres` are excluded from the sum.
# NOTE(review): `sum by (datname)` keeps only the `datname` label, so the
# `{{ $labels.instance }}` in the summary presumably renders empty -- confirm.
- alert: PostgresqlNotEnoughConnections
expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
for: 2m
labels:
severity: critical
annotations:
summary: Postgresql not enough connections (instance {{ $labels.instance }})
description:
"PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value
}}\n LABELS = {{ $labels }}"
# - alert: PostgresqlNotEnoughConnections
# expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
# for: 2m
# labels:
# severity: critical
# annotations:
# summary: Postgresql not enough connections (instance {{ $labels.instance }})
# description:
# "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value
# }}\n LABELS = {{ $labels }}"
- alert: PostgresqlDeadLocks
expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
@@ -109,17 +109,17 @@ groups:
"Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value
}}\n LABELS = {{ $labels }}"
# Critical-severity rule: fires when, for 2m, the 5m increase of
# pg_stat_database_xact_commit is below 5. Series with a `datname` matching
# `template.*|postgres` or with `datid` equal to "0" are excluded.
- alert: PostgresqlCommitRateLow
expr:
'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
for: 2m
labels:
severity: critical
annotations:
summary: Postgresql commit rate low (instance {{ $labels.instance }})
description:
"Postgresql seems to be processing very few transactions\n VALUE = {{ $value
}}\n LABELS = {{ $labels }}"
# - alert: PostgresqlCommitRateLow
# expr:
# 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
# for: 2m
# labels:
# severity: critical
# annotations:
# summary: Postgresql commit rate low (instance {{ $labels.instance }})
# description:
# "Postgresql seems to be processing very few transactions\n VALUE = {{ $value
# }}\n LABELS = {{ $labels }}"
- alert: PostgresqlLowXidConsumption
expr: "rate(pg_txid_current[1m]) < 5"