mirror of
https://github.com/ryan4yin/nix-config.git
synced 2026-03-21 08:59:28 +01:00
feat: migrate all nixos services from idols to 12kingdoms
This commit is contained in:
@@ -1,13 +1,3 @@
|
||||
# Idols - Ruby
|
||||
|
||||
Host running operation and maintenance related services:
|
||||
|
||||
1. Backup or sync my personal data to cloud or NAS.
|
||||
- For safety, this data should be encrypted before it is sent to the cloud or my NAS.
|
||||
1. Collect and monitor the metrics/logs of my homelab.
|
||||
|
||||
## Services
|
||||
|
||||
1. prometheus + alertmanager + grafana + loki: Monitor the metrics/logs of my homelab.
|
||||
1. restic: Backup my personal data to cloud or NAS.
|
||||
1. syncthing: Sync files between Android/MacBook/PC and NAS.
|
||||
TODO
|
||||
|
||||
@@ -1,74 +0,0 @@
|
||||
{
|
||||
config,
|
||||
attic,
|
||||
...
|
||||
}: {
|
||||
#=====================================================
|
||||
#
|
||||
# Attic
|
||||
#
|
||||
# A self-hostable Nix Binary Cache server
|
||||
# backed by an S3-compatible storage provider
|
||||
#
|
||||
# https://docs.attic.rs/tutorial.html
|
||||
#
|
||||
#=====================================================
|
||||
|
||||
imports = [
|
||||
attic.nixosModules.atticd
|
||||
];
|
||||
|
||||
# Self-Hosted Nix Cache Server
|
||||
# https://github.com/zhaofengli/attic
|
||||
#
|
||||
# The first thing to do after setting up the server is:
|
||||
# 1. Generate an admin token on the server with the command:
|
||||
# `sudo atticd-atticadm make-token --sub "admin-1" --validity "2y" --pull "*" --push "*" --delete "*" --create-cache "*" --configure-cache "*" --configure-cache-retention "*" --destroy-cache "*"`
|
||||
# 2. Login at the desktop via command:
|
||||
# `attic login central http://attic.writefor.fun <TOKEN>`
|
||||
# 3. Create a new cache via command:
|
||||
# `attic cache create rk3588`
|
||||
# `attic use rk3588`
|
||||
# 4. Push Caches to the cache server via:
|
||||
# it's similar to cachix, related docs:
|
||||
# https://docs.attic.rs/reference/attic-cli.html
|
||||
# https://docs.cachix.org/pushing#pushing
|
||||
services.atticd = {
|
||||
enable = true;
|
||||
|
||||
# Replace with absolute path to your credentials file
|
||||
# The HS256 JWT secret can be generated with the openssl:
|
||||
# openssl rand 64 | base64 -w0
|
||||
#
|
||||
# Content:
|
||||
# ATTIC_SERVER_TOKEN_HS256_SECRET_BASE64="output from openssl"
|
||||
credentialsFile = config.age.secrets."attic-nix-cache-server.env".path;
|
||||
|
||||
settings = {
|
||||
listen = "[::]:8888";
|
||||
|
||||
# Data chunking
|
||||
#
|
||||
# Warning: If you change any of the values here, it will be
|
||||
# difficult to reuse existing chunks for newly-uploaded NARs
|
||||
# since the cutpoints will be different. As a result, the
|
||||
# deduplication ratio will suffer for a while after the change.
|
||||
chunking = {
|
||||
# The minimum NAR size to trigger chunking
|
||||
#
|
||||
# If 0, chunking is disabled entirely for newly-uploaded NARs.
|
||||
# If 1, all NARs are chunked.
|
||||
nar-size-threshold = 64 * 1024; # 64 KiB
|
||||
|
||||
# The preferred minimum size of a chunk, in bytes
|
||||
min-size = 16 * 1024; # 16 KiB
|
||||
|
||||
# The preferred average size of a chunk, in bytes
|
||||
avg-size = 64 * 1024; # 64 KiB
|
||||
|
||||
# The preferred maximum size of a chunk, in bytes
|
||||
max-size = 256 * 1024; # 256 KiB
|
||||
};
|
||||
};
|
||||
};
|
||||
}
|
||||
@@ -1,40 +0,0 @@
|
||||
{myvars, ...}: {
|
||||
services.caddy = {
|
||||
enable = true;
|
||||
# Reload Caddy instead of restarting it when configuration file changes.
|
||||
enableReload = true;
|
||||
user = "caddy"; # User account under which caddy runs.
|
||||
dataDir = "/var/lib/caddy";
|
||||
logDir = "/var/log/caddy";
|
||||
|
||||
# Additional lines of configuration appended to the global config section of the Caddyfile.
|
||||
# Refer to https://caddyserver.com/docs/caddyfile/options#global-options for details on supported values.
|
||||
globalConfig = ''
|
||||
http_port 80
|
||||
https_port 443
|
||||
auto_https off
|
||||
'';
|
||||
|
||||
# ACME related settings.
|
||||
# email = myvars.useremail;
|
||||
# acmeCA = "https://acme-v02.api.letsencrypt.org/directory";
|
||||
|
||||
virtualHosts."http://grafana.writefor.fun".extraConfig = ''
|
||||
encode zstd gzip
|
||||
reverse_proxy http://localhost:3000
|
||||
'';
|
||||
virtualHosts."http://prometheus.writefor.fun".extraConfig = ''
|
||||
encode zstd gzip
|
||||
reverse_proxy http://localhost:9090
|
||||
'';
|
||||
virtualHosts."http://alertmanager.writefor.fun".extraConfig = ''
|
||||
encode zstd gzip
|
||||
reverse_proxy http://localhost:9093
|
||||
'';
|
||||
virtualHosts."http://attic.writefor.fun".extraConfig = ''
|
||||
encode zstd gzip
|
||||
reverse_proxy http://localhost:8888
|
||||
'';
|
||||
};
|
||||
networking.firewall.allowedTCPPorts = [80 443];
|
||||
}
|
||||
@@ -1,3 +0,0 @@
|
||||
{mylib, ...}: {
|
||||
imports = mylib.scanPaths ./.;
|
||||
}
|
||||
@@ -1,4 +0,0 @@
|
||||
{
|
||||
# TODO
|
||||
# https://github.com/NixOS/nixpkgs/blob/nixos-23.11/nixos/modules/services/monitoring/prometheus/exporters/pve.nix
|
||||
}
|
||||
@@ -1,20 +0,0 @@
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
# <string> a unique provider name. Required
|
||||
- name: "Dashboards"
|
||||
# <int> Org id. Default to 1
|
||||
orgId: 1
|
||||
# <string> provider type. Default to 'file'
|
||||
type: file
|
||||
# <bool> disable dashboard deletion
|
||||
disableDeletion: false
|
||||
# <int> how often Grafana will scan for changed dashboards
|
||||
updateIntervalSeconds: 20
|
||||
# <bool> allow updating provisioned dashboards from the UI
|
||||
allowUiUpdates: false
|
||||
options:
|
||||
# <string, required> path to dashboard files on disk. Required when using the 'file' type
|
||||
path: /etc/grafana/dashboards
|
||||
# <bool> use folder names from filesystem to create folders in Grafana
|
||||
foldersFromFilesStructure: true
|
||||
@@ -1,10 +0,0 @@
|
||||
# Grafana Dashboards
|
||||
|
||||
## Homelab
|
||||
|
||||
1. https://grafana.com/grafana/dashboards/1860-node-exporter-full/
|
||||
2. https://grafana.com/grafana/dashboards/9578-alertmanager/
|
||||
|
||||
## Kubernetes
|
||||
|
||||
1. https://github.com/dotdc/grafana-dashboards-kubernetes/
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,21 +0,0 @@
|
||||
# https://grafana.com/docs/grafana/latest/datasources/prometheus/
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: prometheus-homelab
|
||||
type: prometheus
|
||||
access: proxy
|
||||
# Access mode - proxy (server in the UI) or direct (browser in the UI).
|
||||
url: http://localhost:9090
|
||||
jsonData:
|
||||
httpMethod: POST
|
||||
manageAlerts: true
|
||||
prometheusType: Prometheus
|
||||
prometheusVersion: 2.49.0
|
||||
cacheLevel: "High"
|
||||
disableRecordingRules: false
|
||||
# As of Grafana 10, the Prometheus data source can be configured to query live dashboards
|
||||
# incrementally, instead of re-querying the entire duration on each dashboard refresh.
|
||||
# Increasing the duration of the incrementalQueryOverlapWindow will increase the size of every incremental query,
|
||||
# but might be helpful for instances that have inconsistent results for recent data.
|
||||
incrementalQueryOverlapWindow: 10m
|
||||
@@ -1,52 +0,0 @@
|
||||
{
|
||||
config,
|
||||
myvars,
|
||||
...
|
||||
}: {
|
||||
services.grafana = {
|
||||
enable = true;
|
||||
dataDir = "/var/lib/grafana";
|
||||
# DeclarativePlugins = with pkgs.grafanaPlugins; [ grafana-piechart-panel ];
|
||||
settings = {
|
||||
server = {
|
||||
http_addr = "127.0.0.1";
|
||||
http_port = 3000;
|
||||
protocol = "http";
|
||||
domain = "grafana.writefo.fun";
|
||||
# Redirect to correct domain if the host header does not match the domain. Prevents DNS rebinding attacks.
|
||||
serve_from_sub_path = false;
|
||||
# Add subpath to the root_url if serve_from_sub_path is true
|
||||
root_url = "%(protocol)s://%(domain)s:%(http_port)s/";
|
||||
enforce_domain = false;
|
||||
read_timeout = "180s";
|
||||
# Enable HTTP compression, this can improve transfer speed and bandwidth utilization.
|
||||
enable_gzip = true;
|
||||
# Cdn for accelerating loading of frontend assets.
|
||||
# cdn_url = "https://cdn.jsdelivr.net/npm/grafana@7.5.5";
|
||||
};
|
||||
|
||||
security = {
|
||||
admin_user = myvars.username;
|
||||
admin_email = myvars.useremail;
|
||||
# Use file provider to read the admin password from a file.
|
||||
# https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/#file-provider
|
||||
admin_password = "$__file{${config.age.secrets."grafana-admin-password".path}}";
|
||||
};
|
||||
users = {
|
||||
allow_sign_up = false;
|
||||
# home_page = "";
|
||||
default_theme = "dark";
|
||||
};
|
||||
};
|
||||
|
||||
# Declaratively provision Grafana's data sources, dashboards, and alerting rules.
|
||||
# Grafana's alerting rules are not recommended; we use Prometheus Alertmanager instead.
|
||||
# https://grafana.com/docs/grafana/latest/administration/provisioning/#data-sources
|
||||
provision = {
|
||||
datasources.path = ./datasources.yml;
|
||||
dashboards.path = ./dashboards.yml;
|
||||
};
|
||||
};
|
||||
|
||||
environment.etc."grafana/dashboards".source = ./dashboards;
|
||||
}
|
||||
@@ -1,6 +0,0 @@
|
||||
# Prometheus & Alertmanager
|
||||
|
||||
## Alert Rules
|
||||
|
||||
- [awesome-prometheus-alerts](https://github.com/samber/awesome-prometheus-alerts): Collection of
|
||||
Prometheus alerting rules
|
||||
@@ -1,13 +0,0 @@
|
||||
groups:
|
||||
- name: EmbeddedExporter
|
||||
|
||||
rules:
|
||||
- alert: CorednsPanicCount
|
||||
expr: "increase(coredns_panics_total[1m]) > 0"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: CoreDNS Panic Count (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Number of CoreDNS panics encountered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
@@ -1,162 +0,0 @@
|
||||
groups:
|
||||
- name: EmbeddedExporter
|
||||
|
||||
rules:
|
||||
- alert: EtcdInsufficientMembers
|
||||
expr: "count(etcd_server_id) % 2 == 0"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Etcd insufficient Members (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Etcd cluster should have an odd number of members\n VALUE = {{ $value }}\n LABELS =
|
||||
{{ $labels }}"
|
||||
|
||||
- alert: EtcdNoLeader
|
||||
expr: "etcd_server_has_leader == 0"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Etcd no Leader (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Etcd cluster has no leader\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdHighNumberOfLeaderChanges
|
||||
expr: "increase(etcd_server_leader_changes_seen_total[10m]) > 2"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd high number of leader changes (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Etcd leader changed more than 2 times during 10 minutes\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdHighNumberOfFailedGrpcRequests
|
||||
expr:
|
||||
'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[1m])) BY (grpc_service, grpc_method)
|
||||
/ sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.01'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd high number of failed GRPC requests (instance {{ $labels.instance }})
|
||||
description:
|
||||
"More than 1% GRPC request failure detected in Etcd\n VALUE = {{ $value }}\n LABELS =
|
||||
{{ $labels }}"
|
||||
|
||||
- alert: EtcdHighNumberOfFailedGrpcRequests
|
||||
expr:
|
||||
'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[1m])) BY (grpc_service, grpc_method)
|
||||
/ sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.05'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Etcd high number of failed GRPC requests (instance {{ $labels.instance }})
|
||||
description:
|
||||
"More than 5% GRPC request failure detected in Etcd\n VALUE = {{ $value }}\n LABELS =
|
||||
{{ $labels }}"
|
||||
|
||||
- alert: EtcdGrpcRequestsSlow
|
||||
expr:
|
||||
'histogram_quantile(0.99,
|
||||
sum(rate(grpc_server_handling_seconds_bucket{grpc_type="unary"}[1m])) by (grpc_service,
|
||||
grpc_method, le)) > 0.15'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd GRPC requests slow (instance {{ $labels.instance }})
|
||||
description:
|
||||
"GRPC requests slowing down, 99th percentile is over 0.15s\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdHighNumberOfFailedHttpRequests
|
||||
expr:
|
||||
"sum(rate(etcd_http_failed_total[1m])) BY (method) /
|
||||
sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd high number of failed HTTP requests (instance {{ $labels.instance }})
|
||||
description:
|
||||
"More than 1% HTTP failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{
|
||||
$labels }}"
|
||||
|
||||
- alert: EtcdHighNumberOfFailedHttpRequests
|
||||
expr:
|
||||
"sum(rate(etcd_http_failed_total[1m])) BY (method) /
|
||||
sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Etcd high number of failed HTTP requests (instance {{ $labels.instance }})
|
||||
description:
|
||||
"More than 5% HTTP failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{
|
||||
$labels }}"
|
||||
|
||||
- alert: EtcdHttpRequestsSlow
|
||||
expr:
|
||||
"histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[1m])) > 0.15"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd HTTP requests slow (instance {{ $labels.instance }})
|
||||
description:
|
||||
"HTTP requests slowing down, 99th percentile is over 0.15s\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdMemberCommunicationSlow
|
||||
expr:
|
||||
"histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) >
|
||||
0.15"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd member communication slow (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Etcd member communication slowing down, 99th percentile is over 0.15s\n VALUE = {{
|
||||
$value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdHighNumberOfFailedProposals
|
||||
expr: "increase(etcd_server_proposals_failed_total[1h]) > 5"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd high number of failed proposals (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Etcd server got more than 5 failed proposals past hour\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdHighFsyncDurations
|
||||
expr:
|
||||
"histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd high fsync durations (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Etcd WAL fsync duration increasing, 99th percentile is over 0.5s\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdHighCommitDurations
|
||||
expr:
|
||||
"histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) >
|
||||
0.25"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd high commit durations (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Etcd commit duration increasing, 99th percentile is over 0.25s\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
@@ -1,123 +0,0 @@
|
||||
groups:
|
||||
- name: EmbeddedExporter
|
||||
|
||||
rules:
|
||||
- alert: IstioKubernetesGatewayAvailabilityDrop
|
||||
expr:
|
||||
'min(kube_deployment_status_replicas_available{deployment="istio-ingressgateway",
|
||||
namespace="istio-system"}) without (instance, pod) < 2'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio Kubernetes gateway availability drop (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Gateway pods have dropped. Inbound traffic will likely be affected.\n VALUE = {{
|
||||
$value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioPilotHighTotalRequestRate
|
||||
expr: "sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5"
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio Pilot high total request rate (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Number of Istio Pilot push errors is too high (> 5%). Envoy sidecars might have
|
||||
outdated configuration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioMixerPrometheusDispatchesLow
|
||||
expr: 'sum(rate(mixer_runtime_dispatches_total{adapter=~"prometheus"}[1m])) < 180'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio Mixer Prometheus dispatches low (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Number of Mixer dispatches to Prometheus is too low. Istio metrics might not be being
|
||||
exported properly.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioHighTotalRequestRate
|
||||
expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) > 1000'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio high total request rate (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Global request rate in the service mesh is unusually high.\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioLowTotalRequestRate
|
||||
expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) < 100'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio low total request rate (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Global request rate in the service mesh is unusually low.\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioHigh4xxErrorRate
|
||||
expr:
|
||||
'sum(rate(istio_requests_total{reporter="destination", response_code=~"4.*"}[5m])) /
|
||||
sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio high 4xx error rate (instance {{ $labels.instance }})
|
||||
description:
|
||||
"High percentage of HTTP 4xx responses in Istio (> 5%).\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioHigh5xxErrorRate
|
||||
expr:
|
||||
'sum(rate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) /
|
||||
sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio high 5xx error rate (instance {{ $labels.instance }})
|
||||
description:
|
||||
"High percentage of HTTP 5xx responses in Istio (> 5%).\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioHighRequestLatency
|
||||
expr:
|
||||
'rate(istio_request_duration_milliseconds_sum{reporter="destination"}[1m]) /
|
||||
rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 100'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio high request latency (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Istio average requests execution is longer than 100ms.\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioLatency99Percentile
|
||||
expr:
|
||||
"histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by
|
||||
(destination_canonical_service, destination_workload_namespace, source_canonical_service,
|
||||
source_workload_namespace, le)) > 1000"
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio latency 99 percentile (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Istio 1% slowest requests are longer than 1000ms.\n VALUE = {{ $value }}\n LABELS =
|
||||
{{ $labels }}"
|
||||
|
||||
- alert: IstioPilotDuplicateEntry
|
||||
expr: "sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Istio Pilot Duplicate Entry (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Istio pilot duplicate entry error.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
@@ -1,435 +0,0 @@
|
||||
groups:
|
||||
- name: KubestateExporter
|
||||
|
||||
rules:
|
||||
- alert: KubernetesNodeNotReady
|
||||
expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes Node not ready (node {{ $labels.node }})
|
||||
description:
|
||||
"Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesNodeMemoryPressure
|
||||
expr: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes memory pressure (node {{ $labels.node }})
|
||||
description:
|
||||
"Node {{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS
|
||||
= {{ $labels }}"
|
||||
|
||||
- alert: KubernetesNodeDiskPressure
|
||||
expr: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes disk pressure (node {{ $labels.node }})
|
||||
description:
|
||||
"Node {{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS =
|
||||
{{ $labels }}"
|
||||
|
||||
- alert: KubernetesNodeNetworkUnavailable
|
||||
expr: 'kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes Node network unavailable (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Node {{ $labels.node }} has NetworkUnavailable condition\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesNodeOutOfPodCapacity
|
||||
expr:
|
||||
'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node)
|
||||
(0 * kube_pod_info{pod_template_hash=""})) / sum by (node)
|
||||
(kube_node_status_allocatable{resource="pods"}) * 100 > 90'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes Node out of pod capacity (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Node {{ $labels.node }} is out of pod capacity\n VALUE = {{ $value }}\n LABELS = {{
|
||||
$labels }}"
|
||||
|
||||
- alert: KubernetesContainerOomKiller
|
||||
expr:
|
||||
'(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total
|
||||
offset 10m >= 1) and ignoring (reason)
|
||||
min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m])
|
||||
== 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary:
|
||||
Kubernetes container oom killer ({{ $labels.namespace }}/{{ $labels.pod }}:{{
|
||||
$labels.container }})
|
||||
description:
|
||||
"Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has
|
||||
been OOMKilled {{ $value }} times in the last 10 minutes.\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesJobFailed
|
||||
expr: "kube_job_status_failed > 0"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes Job failed ({{ $labels.namespace }}/{{ $labels.job_name }})
|
||||
description:
|
||||
"Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete\n VALUE = {{
|
||||
$value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesCronjobSuspended
|
||||
expr: "kube_cronjob_spec_suspend != 0"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes CronJob suspended ({{ $labels.namespace }}/{{ $labels.cronjob }})
|
||||
description:
|
||||
"CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesPersistentvolumeclaimPending
|
||||
expr: 'kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary:
|
||||
Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{
|
||||
$labels.persistentvolumeclaim }})
|
||||
description:
|
||||
"PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is
|
||||
pending\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesVolumeOutOfDiskSpace
|
||||
expr:
|
||||
"kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes Volume out of disk space (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Volume is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesVolumeFullInFourDays
|
||||
expr: "predict_linear(kubelet_volume_stats_available_bytes[6h:5m], 4 * 24 * 3600) < 0"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes Volume full in four days (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to
|
||||
fill up within four days. Currently {{ $value | humanize }}% is available.\n VALUE = {{
|
||||
$value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesPersistentvolumeError
|
||||
expr:
|
||||
'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary:
|
||||
Kubernetes PersistentVolume error ({{ $labels.namespace }}/{{
|
||||
$labels.persistentvolumeclaim }})
|
||||
description:
|
||||
"Persistent volume {{ $labels.persistentvolume }} is in bad state\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesStatefulsetDown
|
||||
expr: "kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0"
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes StatefulSet down ({{ $labels.namespace }}/{{ $labels.statefulset }})
|
||||
description:
|
||||
"StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down\n VALUE = {{
|
||||
$value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesHpaScaleInability
|
||||
expr:
|
||||
'kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} ==
|
||||
1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes HPA scale inability (instance {{ $labels.instance }})
|
||||
description:
|
||||
"HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to
|
||||
scale\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesHpaMetricsUnavailability
|
||||
expr:
|
||||
'kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"}
|
||||
== 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes HPA metrics unavailability (instance {{ $labels.instance }})
|
||||
description:
|
||||
"HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect
|
||||
metrics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesHpaScaleMaximum
|
||||
expr:
|
||||
"kube_horizontalpodautoscaler_status_desired_replicas >=
|
||||
kube_horizontalpodautoscaler_spec_max_replicas"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Kubernetes HPA scale maximum (instance {{ $labels.instance }})
|
||||
description:
|
||||
"HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum
|
||||
number of desired pods\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesHpaUnderutilized
|
||||
expr:
|
||||
"max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) ==
|
||||
kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler) > 3"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Kubernetes HPA underutilized (instance {{ $labels.instance }})
|
||||
description:
|
||||
"HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is constantly at
|
||||
minimum replicas for 50% of the time. Potential cost saving here.\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesPodNotHealthy
|
||||
expr: 'sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes Pod not healthy ({{ $labels.namespace }}/{{ $labels.pod }})
|
||||
description:
|
||||
"Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for
|
||||
longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesPodCrashLooping
|
||||
expr: "increase(kube_pod_container_status_restarts_total[1m]) > 3"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes pod crash looping ({{ $labels.namespace }}/{{ $labels.pod }})
|
||||
description:
|
||||
"Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesReplicasetReplicasMismatch
|
||||
expr: "kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas"
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary:
|
||||
Kubernetes ReplicaSet mismatch ({{ $labels.namespace }}/{{ $labels.replicaset }})
|
||||
description:
|
||||
"ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch\n VALUE
|
||||
= {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesDeploymentReplicasMismatch
|
||||
expr: "kube_deployment_spec_replicas != kube_deployment_status_replicas_available"
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary:
|
||||
Kubernetes Deployment replicas mismatch ({{ $labels.namespace }}/{{ $labels.deployment
|
||||
}})
|
||||
description:
|
||||
"Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch\n VALUE
|
||||
= {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Fires when a StatefulSet's ready replica count has differed from its replica count for 10 minutes.
- alert: KubernetesStatefulsetReplicasMismatch
  expr: "kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas"
  for: 10m
  labels:
    severity: warning
  annotations:
    # Identify the object by namespace/statefulset, like the other StatefulSet rules in this group,
    # instead of by the scrape target's `instance` label.
    summary: Kubernetes StatefulSet replicas mismatch ({{ $labels.namespace }}/{{ $labels.statefulset }})
    description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} does not match the expected number of replicas.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesDeploymentGenerationMismatch
|
||||
expr: "kube_deployment_status_observed_generation != kube_deployment_metadata_generation"
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary:
|
||||
Kubernetes Deployment generation mismatch ({{ $labels.namespace }}/{{ $labels.deployment
|
||||
}})
|
||||
description:
|
||||
"Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been
|
||||
rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesStatefulsetGenerationMismatch
|
||||
expr: "kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation"
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary:
|
||||
Kubernetes StatefulSet generation mismatch ({{ $labels.namespace }}/{{
|
||||
$labels.statefulset }})
|
||||
description:
|
||||
"StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not
|
||||
been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesStatefulsetUpdateNotRolledOut
|
||||
expr:
|
||||
"max without (revision) (kube_statefulset_status_current_revision unless
|
||||
kube_statefulset_status_update_revision) * (kube_statefulset_replicas !=
|
||||
kube_statefulset_status_replicas_updated)"
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary:
|
||||
Kubernetes StatefulSet update not rolled out ({{ $labels.namespace }}/{{
|
||||
$labels.statefulset }})
|
||||
description:
|
||||
"StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been
|
||||
rolled out.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesDaemonsetRolloutStuck
|
||||
expr:
|
||||
"kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100
|
||||
< 100 or kube_daemonset_status_desired_number_scheduled -
|
||||
kube_daemonset_status_current_number_scheduled > 0"
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary:
|
||||
Kubernetes DaemonSet rollout stuck ({{ $labels.namespace }}/{{ $labels.daemonset }})
|
||||
description:
|
||||
"Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not
|
||||
scheduled or not ready\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesDaemonsetMisscheduled
|
||||
expr: "kube_daemonset_status_number_misscheduled > 0"
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary:
|
||||
Kubernetes DaemonSet misscheduled ({{ $labels.namespace }}/{{ $labels.daemonset }})
|
||||
description:
|
||||
"Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running
|
||||
where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesCronjobTooLong
|
||||
expr: "time() - kube_cronjob_next_schedule_time > 3600"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes CronJob too long ({{ $labels.namespace }}/{{ $labels.cronjob }})
|
||||
description:
|
||||
"CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to
|
||||
complete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesJobSlowCompletion
|
||||
expr: "kube_job_spec_completions - kube_job_status_succeeded - kube_job_status_failed > 0"
|
||||
for: 12h
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes job slow completion ({{ $labels.namespace }}/{{ $labels.job_name }})
|
||||
description:
|
||||
"Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in
|
||||
time.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesApiServerErrors
|
||||
expr:
|
||||
'sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) /
|
||||
sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes API server errors (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Kubernetes API server is experiencing high error rate\n VALUE = {{ $value }}\n LABELS
|
||||
= {{ $labels }}"
|
||||
|
||||
- alert: KubernetesApiClientErrors
|
||||
expr:
|
||||
'(sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) /
|
||||
sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes API client errors (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Kubernetes API client is experiencing high error rate\n VALUE = {{ $value }}\n LABELS
|
||||
= {{ $labels }}"
|
||||
|
||||
- alert: KubernetesClientCertificateExpiresNextWeek
|
||||
expr:
|
||||
'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and
|
||||
histogram_quantile(0.01, sum by (job, le)
|
||||
(rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) <
|
||||
7*24*60*60'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }})
|
||||
description:
|
||||
"A client certificate used to authenticate to the apiserver is expiring next
|
||||
week.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesClientCertificateExpiresSoon
|
||||
expr:
|
||||
'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and
|
||||
histogram_quantile(0.01, sum by (job, le)
|
||||
(rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) <
|
||||
24*60*60'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes client certificate expires soon (instance {{ $labels.instance }})
|
||||
description:
|
||||
"A client certificate used to authenticate to the apiserver is expiring in less than
|
||||
24.0 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesApiServerLatency
|
||||
expr:
|
||||
'histogram_quantile(0.99,
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}
|
||||
[10m])) WITHOUT (instance, resource)) > 1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes API server latency (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{
|
||||
$labels.verb }} {{ $labels.resource }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels
|
||||
}}"
|
||||
@@ -1,508 +0,0 @@
|
||||
groups:
|
||||
- name: NodeExporter
|
||||
|
||||
rules:
|
||||
- alert: HostOutOfMemory
|
||||
expr:
|
||||
'(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance)
|
||||
group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of memory (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels
|
||||
}}"
|
||||
|
||||
- alert: HostMemoryUnderMemoryPressure
|
||||
expr:
|
||||
'(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename)
|
||||
node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host memory under memory pressure (instance {{ $labels.instance }})
|
||||
description:
|
||||
"The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{
|
||||
$value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostMemoryIsUnderutilized
|
||||
expr:
|
||||
'(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes *
|
||||
100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 1w
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host Memory is underutilized (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{
|
||||
$labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualNetworkThroughputIn
|
||||
expr:
|
||||
'(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) *
|
||||
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual network throughput in (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{
|
||||
$value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualNetworkThroughputOut
|
||||
expr:
|
||||
'(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) *
|
||||
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual network throughput out (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{
|
||||
$value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskReadRate
|
||||
expr:
|
||||
'(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) *
|
||||
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk read rate (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS =
|
||||
{{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskWriteRate
|
||||
expr:
|
||||
'(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) *
|
||||
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk write rate (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS =
|
||||
{{ $labels }}"
|
||||
|
||||
- alert: HostOutOfDiskSpace
|
||||
expr:
|
||||
'((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance,
|
||||
device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename)
|
||||
node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of disk space (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostDiskWillFillIn24Hours
|
||||
expr:
|
||||
'((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance,
|
||||
device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 *
|
||||
3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) *
|
||||
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Filesystem is predicted to run out of space within the next 24 hours at current write
|
||||
rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostOutOfInodes
|
||||
expr:
|
||||
'(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"}
|
||||
* 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) *
|
||||
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of inodes (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostFilesystemDeviceError
|
||||
expr: "node_filesystem_device_error == 1"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host filesystem device error (instance {{ $labels.instance }})
|
||||
description:
|
||||
"{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }}
|
||||
filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostInodesWillFillIn24Hours
|
||||
expr:
|
||||
'(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"}
|
||||
* 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 *
|
||||
3600) < 0 and ON (instance, device, mountpoint)
|
||||
node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename)
|
||||
node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Filesystem is predicted to run out of inodes within the next 24 hours at current write
|
||||
rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskReadLatency
|
||||
expr:
|
||||
'(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m])
|
||||
> 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left
|
||||
(nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk read latency (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS =
|
||||
{{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskWriteLatency
|
||||
expr:
|
||||
'(rate(node_disk_write_time_seconds_total[1m]) /
|
||||
rate(node_disk_writes_completed_total[1m]) > 0.1 and
|
||||
rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename)
|
||||
node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk write latency (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS =
|
||||
{{ $labels }}"
|
||||
|
||||
- alert: HostHighCpuLoad
|
||||
expr:
|
||||
'(sum by (instance) (avg by (mode, instance)
|
||||
(rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left
|
||||
(nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host high CPU load (instance {{ $labels.instance }})
|
||||
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostCpuIsUnderutilized
|
||||
expr:
|
||||
'(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance)
|
||||
group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 1w
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host CPU is underutilized (instance {{ $labels.instance }})
|
||||
description:
|
||||
"CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{
|
||||
$value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostCpuStealNoisyNeighbor
|
||||
expr:
|
||||
'(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) *
|
||||
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
|
||||
description:
|
||||
"CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may
|
||||
be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostCpuHighIowait
|
||||
expr:
|
||||
'(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) *
|
||||
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host CPU high iowait (instance {{ $labels.instance }})
|
||||
description:
|
||||
"CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{
|
||||
$value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskIo
|
||||
expr:
|
||||
'(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename)
|
||||
node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk IO (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Time spent in IO is too high on {{ $labels.instance }}. Check storage for
|
||||
issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostContextSwitching
|
||||
expr:
|
||||
'((rate(node_context_switches_total[5m])) / (count without(cpu, mode)
|
||||
(node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename)
|
||||
node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host context switching (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Context switching is growing on the node (> 10000 / CPU / s)\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostSwapIsFillingUp
|
||||
expr:
|
||||
'((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) *
|
||||
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host swap is filling up (instance {{ $labels.instance }})
|
||||
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostSystemdServiceCrashed
|
||||
expr:
|
||||
'(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename)
|
||||
node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host systemd service crashed (instance {{ $labels.instance }})
|
||||
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostPhysicalComponentTooHot
|
||||
expr:
|
||||
'((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor)
|
||||
node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename)
|
||||
node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host physical component too hot (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNodeOvertemperatureAlarm
|
||||
expr:
|
||||
'(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename)
|
||||
node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{
|
||||
$labels }}"
|
||||
|
||||
- alert: HostRaidArrayGotInactive
|
||||
expr:
|
||||
'(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename)
|
||||
node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host RAID array got inactive (instance {{ $labels.instance }})
|
||||
description:
|
||||
"RAID array {{ $labels.device }} is in a degraded state due to one or more disk
|
||||
failures. The number of spare drives is insufficient to fix the issue
|
||||
automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Fires when an md RAID array has reported at least one failed member disk for 2 minutes.
- alert: HostRaidDiskFailure
  expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host RAID disk failure (instance {{ $labels.instance }})
    # Uses the `device` label, the same one HostRaidArrayGotInactive reads from the md metrics;
    # the previous `md_device` label is not referenced anywhere else and rendered as empty.
    description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Fires when the scraped hosts have reported more than one distinct kernel version for 6 hours.
- alert: HostKernelVersionDeviations
  # count(...) aggregates away every label, including `instance`, so the result is a single
  # fleet-wide series. The former `* on(instance) group_left (nodename) node_uname_info` join
  # could therefore never find a matching series and the alert could not fire; it is removed.
  expr: 'count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1'
  for: 6h
  labels:
    severity: warning
  annotations:
    # No per-instance label exists on the aggregated result, so the summary no longer references one.
    summary: Host kernel version deviations
    description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostOomKillDetected
|
||||
expr:
|
||||
'(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename)
|
||||
node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host OOM kill detected (instance {{ $labels.instance }})
|
||||
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Fires as soon as EDAC reports new correctable memory errors within the last minute.
- alert: HostEdacCorrectableErrorsDetected
  expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 0m
  labels:
    severity: info
  annotations:
    summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
    # The time window in the text now matches the [1m] range used by the expression.
    description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last minute.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Fires as soon as the EDAC uncorrectable-error counter is non-zero.
- alert: HostEdacUncorrectableErrorsDetected
  expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
    # The expression tests the raw cumulative counter (no rate/increase window), so the text
    # no longer claims the errors happened "in the last 5 minutes".
    description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC since the counter was last reset.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Fires when more than 1% of received packets on an interface have been errors for 2 minutes.
- alert: HostNetworkReceiveErrors
  expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host Network Receive Errors (instance {{ $labels.instance }})
    # $value is the errors/packets ratio computed by the expression, not an error count;
    # formatting it with "%.0f" as a count always rendered roughly "0". Show it as a percentage.
    description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Fires when more than 1% of transmitted packets on an interface have been errors for 2 minutes.
- alert: HostNetworkTransmitErrors
  expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host Network Transmit Errors (instance {{ $labels.instance }})
    # $value is the errors/packets ratio computed by the expression, not an error count;
    # formatting it with "%.0f" as a count always rendered roughly "0". Show it as a percentage.
    description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a transmit error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkInterfaceSaturated
|
||||
expr:
|
||||
'((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) +
|
||||
rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) /
|
||||
node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) *
|
||||
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
|
||||
description:
|
||||
"The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting
|
||||
overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkBondDegraded
|
||||
expr:
|
||||
'((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename)
|
||||
node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{
|
||||
$value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostConntrackLimit
|
||||
expr:
|
||||
'(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance)
|
||||
group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host conntrack limit (instance {{ $labels.instance }})
|
||||
description:
|
||||
"The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{
|
||||
$labels }}"
|
||||
|
||||
- alert: HostClockSkew
|
||||
expr:
|
||||
'((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or
|
||||
(node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) *
|
||||
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host clock skew (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this
|
||||
host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostClockNotSynchronising
|
||||
expr:
|
||||
'(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) *
|
||||
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host clock not synchronising (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostRequiresReboot
|
||||
expr:
|
||||
'(node_reboot_required > 0) * on(instance) group_left (nodename)
|
||||
node_uname_info{nodename=~".+"}'
|
||||
for: 4h
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host requires reboot (instance {{ $labels.instance }})
|
||||
description:
|
||||
"{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{
|
||||
$labels }}"
|
||||
@@ -1,157 +0,0 @@
|
||||
{
|
||||
config,
|
||||
myvars,
|
||||
...
|
||||
}: {
|
||||
# https://prometheus.io/docs/prometheus/latest/configuration/configuration/
|
||||
# Prometheus server: listens on loopback only, loads the alert rule files below,
# scrapes the homelab hosts/applications, and forwards alerts to the local Alertmanager.
services.prometheus = {
  enable = true;
  checkConfig = true;
  listenAddress = "127.0.0.1";
  port = 9090;
  webExternalUrl = "http://prometheus.writefor.fun";

  extraFlags = ["--storage.tsdb.retention.time=15d"];
  # Directory below /var/lib to store Prometheus metrics data.
  stateDir = "prometheus2";

  # Reload prometheus when configuration file changes (instead of restart).
  enableReload = true;
  # https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_read
  # remoteRead = [];

  # Rules are read from these files.
  # https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/
  #
  # Prometheus supports two types of rules which may be configured
  # and then evaluated at regular intervals:
  #   1. Recording rules
  #      Recording rules allow you to precompute frequently needed or computationally
  #      expensive expressions and save their result as a new set of time series.
  #      Querying the precomputed result will then often be much faster than executing the original expression.
  #      This is especially useful for dashboards, which need to query the same expression repeatedly every time they refresh.
  #   2. Alerting rules
  #      Alerting rules allow you to define alert conditions based on Prometheus expression language expressions
  #      and to send notifications about firing alerts to an external service.
  ruleFiles = [
    ./alert_rules/node-exporter.yml
    ./alert_rules/kubestate-exporter.yml
    ./alert_rules/etcd_embedded-exporter.yml
    ./alert_rules/istio_embedded-exporter.yml
    ./alert_rules/coredns_embedded-exporter.yml

    # ./recording_rules.yml
  ];

  # Specifies a set of targets and parameters describing how to scrape metrics from them.
  # https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config
  scrapeConfigs = [
    # --- Hosts --- #
    {
      job_name = "node-exporter";
      scrape_interval = "30s";
      metrics_path = "/metrics";
      static_configs = [
        {
          # All my NixOS hosts: one target per entry in myvars.networking.hostsAddr, on port 9100.
          targets =
            map (addr: "${addr.ipv4}:9100")
            (builtins.attrValues myvars.networking.hostsAddr);
          labels.type = "node";
        }
      ];
    }

    # --- Homelab Applications --- #

    {
      job_name = "dnsmasq-exporter";
      scrape_interval = "30s";
      metrics_path = "/metrics";
      static_configs = [
        {
          targets = ["${myvars.networking.hostsAddr.aquamarine.ipv4}:9153"];
          labels.type = "app";
          labels.app = "dnsmasq";
        }
      ];
    }

    {
      job_name = "v2ray-exporter";
      scrape_interval = "30s";
      metrics_path = "/metrics";
      static_configs = [
        {
          # NOTE(review): this is the same port (9153) as the dnsmasq exporter job above;
          # confirm the v2ray exporter on `kana` really listens there and this is not a copy-paste leftover.
          targets = ["${myvars.networking.hostsAddr.kana.ipv4}:9153"];
          labels.type = "app";
          labels.app = "v2ray";
        }
      ];
    }

    {
      job_name = "sftpgo-embedded-exporter";
      scrape_interval = "30s";
      metrics_path = "/metrics";
      static_configs = [
        {
          targets = ["${myvars.networking.hostsAddr.kana.ipv4}:10000"];
          labels.type = "app";
          # Was "v2ray" (copied from the job above), which mislabelled every sftpgo series.
          labels.app = "sftpgo";
        }
      ];
    }
  ];

  # Specifies Alertmanager instances the Prometheus server sends alerts to.
  # Points at the Alertmanager port on this same host.
  # https://prometheus.io/docs/prometheus/latest/configuration/configuration/#alertmanager_config
  alertmanagers = [{static_configs = [{targets = ["localhost:9093"];}];}];
};
|
||||
|
||||
# Alertmanager: bound to loopback, delivers every alert by e-mail to a single "default" receiver.
services.prometheus.alertmanager = {
  enable = true;
  logLevel = "info";

  # Network endpoint and the externally visible URL.
  port = 9093;
  listenAddress = "127.0.0.1";
  webExternalUrl = "http://alertmanager.writefor.fun";

  # agenix-managed secret file; presumably supplies the $SMTP_* variables referenced below — TODO confirm.
  environmentFile = config.age.secrets."alertmanager.env".path;

  configuration = {
    # Mail transport: the smarthost and SMTP sender/credentials used for mail notifications.
    global.smtp_smarthost = "smtp.qq.com:465";
    global.smtp_from = "$SMTP_SENDER_EMAIL";
    global.smtp_auth_username = "$SMTP_AUTH_USERNAME";
    global.smtp_auth_password = "$SMTP_AUTH_PASSWORD";
    # smtp.qq.com:465 supports SSL only, so the TLS requirement has to be disabled here.
    # https://service.mail.qq.com/detail/0/310
    global.smtp_require_tls = false;

    # Routing tree: everything ends up at the "default" receiver.
    route.receiver = "default";
    route.routes = [
      {
        receiver = "default";
        # NOTE(review): no scrape job or alert rule visible in this chunk attaches a `host` label;
        # confirm this is the intended grouping key.
        group_by = ["host"];
        group_wait = "5m";
        group_interval = "5m";
        repeat_interval = "4h";
      }
    ];

    receivers = [
      {
        name = "default";
        email_configs = [
          {
            to = "ryan4yin@linux.com";
            # Also send a mail when an alert resolves.
            send_resolved = true;
          }
        ];
      }
    ];
  };
};
|
||||
}
|
||||
@@ -1,83 +0,0 @@
|
||||
# Nightly restic backup of the homelab nodes.
#
# Before each run, /var/lib/vz of every node is pulled over SSH into a local
# staging directory; restic then backs that directory up to an rclone remote
# and the staging directory is removed again.
{pkgs, ...}: let
  passwordFile = "/etc/agenix/restic-password";
  sshKeyPath = "/etc/agenix/ssh-key-for-restic-backup";
  rcloneConfigFile = "/etc/agenix/rclone-conf-for-restic-backup";

  # Staging directory: filled by the prepare script, backed up by restic,
  # deleted by the cleanup command.
  stagingDir = "/tmp/restic-backup-temp";

  # Nushell script that collects the data to back up.
  #
  # It lives in its own store file instead of being passed inline via
  # `nu -c '...'`: the inline form was wrapped in shell single quotes, and an
  # apostrophe inside one of the script's comments ended that quoting early.
  prepareScript = pkgs.writeText "restic-backup-prepare.nu" ''
    # proxmox cluster nodes first, then the other hosts
    let pve_nodes = [
      "um560"
      "gtr5"
      "s500plus"
      "kana"
    ]

    # Make sure the staging directory exists before rsync writes into it.
    mkdir ${stagingDir}

    # The rsync invocation is kept on a single line so it does not rely on
    # line-continuation handling, and rsync/ssh are referenced by store path
    # so the script does not depend on the service's PATH.
    $pve_nodes | each {|it|
      ^${pkgs.rsync}/bin/rsync -avz -e "${pkgs.openssh}/bin/ssh -i ${sshKeyPath}" $"($it):/var/lib/vz" $"${stagingDir}/($it)"
    }
  '';
in {
  # https://github.com/NixOS/nixpkgs/blob/nixos-23.11/nixos/modules/services/backup/restic.nix
  services.restic.backups = {
    homelab-backup = {
      inherit passwordFile;
      initialize = true; # Initialize the repository if it doesn't exist.
      repository = "rclone:smb-downloads:/Downloads/proxmox-backup/"; # backup to a rclone remote
      # repository = "/mnt/backup-hdd"; # backup to a local directory

      # rclone related
      # rcloneOptions = {
      #   bwlimit = "100M"; # Limit the bandwidth used by rclone.
      # };
      inherit rcloneConfigFile;

      # Which local paths to backup, in addition to ones specified via `dynamicFilesFrom`.
      paths = [
        stagingDir
      ];
      #
      # A script that produces a list of files to back up. The
      # results of this command are given to the '--files-from'
      # option. The result is merged with paths specified via `paths`.
      # dynamicFilesFrom = "find /home/matt/git -type d -name .git";
      #
      # Patterns to exclude when backing up. See
      #   https://restic.readthedocs.io/en/latest/040_backup.html#excluding-files
      # for details on syntax.
      exclude = [];

      # A script that must run before starting the backup process.
      backupPrepareCommand = "${pkgs.nushell}/bin/nu ${prepareScript}";
      # A script that must run after finishing the backup process.
      backupCleanupCommand = "rm -rf ${stagingDir}";

      # Extra extended options to be passed to the restic --option flag.
      # extraOptions = [];

      # Extra arguments passed to restic backup.
      # extraBackupArgs = [
      #   "--exclude-file=/etc/restic/excludes-list"
      # ];

      # When to run the backup. See {manpage}`systemd.timer(5)` for details.
      timerConfig = {
        OnCalendar = "01:30";
        RandomizedDelaySec = "1h";
      };

      # A list of options (--keep-* et al.) for 'restic forget --prune',
      # to automatically prune old snapshots.
      # The 'forget' command is run *after* the 'backup' command, so
      # keep that in mind when constructing the --keep-* options.
      pruneOpts = [
        "--keep-daily 3"
        "--keep-weekly 3"
        "--keep-monthly 3"
        "--keep-yearly 3"
      ];
    };
  };
}
|
||||
Reference in New Issue
Block a user