mirror of
https://github.com/ryan4yin/nix-config.git
synced 2026-01-11 22:30:25 +01:00
fix: alert - add coredns, comment out some useless alert rules
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -8,3 +8,4 @@ logs/
 core*
 !core/
 !core.nix
+!coredns*
@@ -0,0 +1,13 @@
+groups:
+  - name: CoreDNS Exporter
+
+    rules:
+      - alert: CorednsPanicCount
+        expr: "increase(coredns_panics_total[1m]) > 0"
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: CoreDNS Panic Count (instance {{ $labels.instance }})
+          description:
+            "Number of CoreDNS panics encountered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
@@ -203,18 +203,18 @@ groups:
|
|||||||
summary: Host high CPU load (instance {{ $labels.instance }})
|
summary: Host high CPU load (instance {{ $labels.instance }})
|
||||||
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostCpuIsUnderutilized
|
# - alert: HostCpuIsUnderutilized
|
||||||
expr:
|
# expr:
|
||||||
'(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance)
|
# '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance)
|
||||||
group_left (nodename) node_uname_info{nodename=~".+"}'
|
# group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||||
for: 1w
|
# for: 1w
|
||||||
labels:
|
# labels:
|
||||||
severity: info
|
# severity: info
|
||||||
annotations:
|
# annotations:
|
||||||
summary: Host CPU is underutilized (instance {{ $labels.instance }})
|
# summary: Host CPU is underutilized (instance {{ $labels.instance }})
|
||||||
description:
|
# description:
|
||||||
"CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{
|
# "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{
|
||||||
$value }}\n LABELS = {{ $labels }}"
|
# $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostCpuStealNoisyNeighbor
|
- alert: HostCpuStealNoisyNeighbor
|
||||||
expr:
|
expr:
|
||||||
|
|||||||
@@ -73,16 +73,16 @@ groups:
|
|||||||
"PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS
|
"PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS
|
||||||
= {{ $labels }}"
|
= {{ $labels }}"
|
||||||
|
|
||||||
- alert: PostgresqlNotEnoughConnections
|
# - alert: PostgresqlNotEnoughConnections
|
||||||
expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
|
# expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
|
||||||
for: 2m
|
# for: 2m
|
||||||
labels:
|
# labels:
|
||||||
severity: critical
|
# severity: critical
|
||||||
annotations:
|
# annotations:
|
||||||
summary: Postgresql not enough connections (instance {{ $labels.instance }})
|
# summary: Postgresql not enough connections (instance {{ $labels.instance }})
|
||||||
description:
|
# description:
|
||||||
"PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value
|
# "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value
|
||||||
}}\n LABELS = {{ $labels }}"
|
# }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: PostgresqlDeadLocks
|
- alert: PostgresqlDeadLocks
|
||||||
expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
|
expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
|
||||||
@@ -109,17 +109,17 @@ groups:
|
|||||||
"Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value
|
"Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value
|
||||||
}}\n LABELS = {{ $labels }}"
|
}}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: PostgresqlCommitRateLow
|
# - alert: PostgresqlCommitRateLow
|
||||||
expr:
|
# expr:
|
||||||
'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
|
# 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
|
||||||
for: 2m
|
# for: 2m
|
||||||
labels:
|
# labels:
|
||||||
severity: critical
|
# severity: critical
|
||||||
annotations:
|
# annotations:
|
||||||
summary: Postgresql commit rate low (instance {{ $labels.instance }})
|
# summary: Postgresql commit rate low (instance {{ $labels.instance }})
|
||||||
description:
|
# description:
|
||||||
"Postgresql seems to be processing very few transactions\n VALUE = {{ $value
|
# "Postgresql seems to be processing very few transactions\n VALUE = {{ $value
|
||||||
}}\n LABELS = {{ $labels }}"
|
# }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: PostgresqlLowXidConsumption
|
- alert: PostgresqlLowXidConsumption
|
||||||
expr: "rate(pg_txid_current[1m]) < 5"
|
expr: "rate(pg_txid_current[1m]) < 5"
|
||||||
|
|||||||
Reference in New Issue
Block a user