mirror of
https://github.com/ryan4yin/nix-config.git
synced 2026-04-24 01:38:28 +02:00
refactor: Use haumea for filesystem-based module system for flake outputs
refactor: Use hyphen(`-`) for variable names & folder names(except Python), replace all underscores(`_`) with hyphens(`-`).
This commit is contained in:
14
hosts/idols-ruby/README.md
Normal file
14
hosts/idols-ruby/README.md
Normal file
@@ -0,0 +1,14 @@
|
||||
# Idols - Ruby
|
||||
|
||||
Host running operation and maintenance related services:
|
||||
|
||||
1. Backup or sync my personal data to cloud or NAS.
|
||||
- For safety, those data should be encrypted before sending to the cloud or my NAS.
|
||||
1. Collect and monitor the metrics/logs of my homelab.
|
||||
|
||||
## Services
|
||||
|
||||
1. prometheus + alertmanager + grafana + loki: Monitor the metrics/logs of my homelab.
|
||||
1. restic: Backup my personal data to cloud or NAS.
|
||||
1. syncthing: Sync files between android/macbook/PC and NAS.
|
||||
|
||||
74
hosts/idols-ruby/attic.nix
Normal file
74
hosts/idols-ruby/attic.nix
Normal file
@@ -0,0 +1,74 @@
|
||||
{
|
||||
config,
|
||||
attic,
|
||||
...
|
||||
}: {
|
||||
#=====================================================
|
||||
#
|
||||
# Attic
|
||||
#
|
||||
# A self-hostable Nix Binary Cache server
|
||||
# backed by an S3-compatible storage provider
|
||||
#
|
||||
# https://docs.attic.rs/tutorial.html
|
||||
#
|
||||
#=====================================================
|
||||
|
||||
imports = [
|
||||
attic.nixosModules.atticd
|
||||
];
|
||||
|
||||
# Self-Hosted Nix Cache Server
|
||||
# https://github.com/zhaofengli/attic
|
||||
#
|
||||
# The first thing to do after setting up the server is:
|
||||
# 1. Generate an admin token on the server via command:
|
||||
# `sudo atticd-atticadm make-token --sub "admin-1" --validity "2y" --pull "*" --push "*" --delete "*" --create-cache "*" --configure-cache "*" --configure-cache-retention "*" --destroy-cache "*"`
|
||||
# 2. Login at the desktop via command:
|
||||
# `attic login central http://attic.writefor.fun <TOKEN>`
|
||||
# 3. Create a new cache via command:
|
||||
# `attic cache create rk3588`
|
||||
# `attic use rk3588`
|
||||
# 4. Push Caches to the cache server via:
|
||||
# it's similar to cachix, related docs:
|
||||
# https://docs.attic.rs/reference/attic-cli.html
|
||||
# https://docs.cachix.org/pushing#pushing
|
||||
services.atticd = {
|
||||
enable = true;
|
||||
|
||||
# Replace with absolute path to your credentials file
|
||||
# The HS256 JWT secret can be generated with the openssl:
|
||||
# openssl rand 64 | base64 -w0
|
||||
#
|
||||
# Content:
|
||||
# ATTIC_SERVER_TOKEN_HS256_SECRET_BASE64="output from openssl"
|
||||
credentialsFile = config.age.secrets."attic-nix-cache-server.env".path;
|
||||
|
||||
settings = {
|
||||
listen = "[::]:8888";
|
||||
|
||||
# Data chunking
|
||||
#
|
||||
# Warning: If you change any of the values here, it will be
|
||||
# difficult to reuse existing chunks for newly-uploaded NARs
|
||||
# since the cutpoints will be different. As a result, the
|
||||
# deduplication ratio will suffer for a while after the change.
|
||||
chunking = {
|
||||
# The minimum NAR size to trigger chunking
|
||||
#
|
||||
# If 0, chunking is disabled entirely for newly-uploaded NARs.
|
||||
# If 1, all NARs are chunked.
|
||||
nar-size-threshold = 64 * 1024; # 64 KiB
|
||||
|
||||
# The preferred minimum size of a chunk, in bytes
|
||||
min-size = 16 * 1024; # 16 KiB
|
||||
|
||||
# The preferred average size of a chunk, in bytes
|
||||
avg-size = 64 * 1024; # 64 KiB
|
||||
|
||||
# The preferred maximum size of a chunk, in bytes
|
||||
max-size = 256 * 1024; # 256 KiB
|
||||
};
|
||||
};
|
||||
};
|
||||
}
|
||||
40
hosts/idols-ruby/caddy.nix
Normal file
40
hosts/idols-ruby/caddy.nix
Normal file
@@ -0,0 +1,40 @@
|
||||
{myvars, ...}: {
|
||||
services.caddy = {
|
||||
enable = true;
|
||||
# Reload Caddy instead of restarting it when configuration file changes.
|
||||
enableReload = true;
|
||||
user = "caddy"; # User account under which caddy runs.
|
||||
dataDir = "/var/lib/caddy";
|
||||
logDir = "/var/log/caddy";
|
||||
|
||||
# Additional lines of configuration appended to the global config section of the Caddyfile.
|
||||
# Refer to https://caddyserver.com/docs/caddyfile/options#global-options for details on supported values.
|
||||
globalConfig = ''
|
||||
http_port 80
|
||||
https_port 443
|
||||
auto_https off
|
||||
'';
|
||||
|
||||
# ACME related settings.
|
||||
# email = myvars.useremail;
|
||||
# acmeCA = "https://acme-v02.api.letsencrypt.org/directory";
|
||||
|
||||
virtualHosts."http://grafana.writefor.fun".extraConfig = ''
|
||||
encode zstd gzip
|
||||
reverse_proxy http://localhost:3000
|
||||
'';
|
||||
virtualHosts."http://prometheus.writefor.fun".extraConfig = ''
|
||||
encode zstd gzip
|
||||
reverse_proxy http://localhost:9090
|
||||
'';
|
||||
virtualHosts."http://alertmanager.writefor.fun".extraConfig = ''
|
||||
encode zstd gzip
|
||||
reverse_proxy http://localhost:9093
|
||||
'';
|
||||
virtualHosts."http://attic.writefor.fun".extraConfig = ''
|
||||
encode zstd gzip
|
||||
reverse_proxy http://localhost:8888
|
||||
'';
|
||||
};
|
||||
networking.firewall.allowedTCPPorts = [80 443];
|
||||
}
|
||||
51
hosts/idols-ruby/default.nix
Normal file
51
hosts/idols-ruby/default.nix
Normal file
@@ -0,0 +1,51 @@
|
||||
{
|
||||
myvars,
|
||||
mylib,
|
||||
...
|
||||
}:
|
||||
#############################################################
|
||||
#
|
||||
# Ruby - a NixOS VM running on Proxmox
|
||||
#
|
||||
#############################################################
|
||||
let
|
||||
hostName = "ruby"; # Define your hostname.
|
||||
hostAddress = myvars.networking.hostAddress.${hostName};
|
||||
in {
|
||||
imports = mylib.scanPaths ./.;
|
||||
|
||||
# supported file systems, so we can mount any removable disks with these filesystems
|
||||
boot.supportedFilesystems = [
|
||||
"ext4"
|
||||
"btrfs"
|
||||
"xfs"
|
||||
#"zfs"
|
||||
"ntfs"
|
||||
"fat"
|
||||
"vfat"
|
||||
"exfat"
|
||||
"cifs" # mount windows share
|
||||
];
|
||||
|
||||
boot.kernelModules = ["kvm-amd"];
|
||||
boot.extraModprobeConfig = "options kvm_amd nested=1"; # for amd cpu
|
||||
|
||||
networking = {
|
||||
inherit hostName;
|
||||
inherit (myvars.networking) defaultGateway nameservers;
|
||||
|
||||
networkmanager.enable = false;
|
||||
interfaces.ens18 = {
|
||||
useDHCP = false;
|
||||
ipv4.addresses = [hostAddress];
|
||||
};
|
||||
};
|
||||
|
||||
# This value determines the NixOS release from which the default
|
||||
# settings for stateful data, like file locations and database versions
|
||||
# on your system were taken. It‘s perfectly fine and recommended to leave
|
||||
# this value at the release version of the first install of this system.
|
||||
# Before changing this value read the documentation for this option
|
||||
# (e.g. man configuration.nix or on https://nixos.org/nixos/options.html).
|
||||
system.stateVersion = "23.11"; # Did you read the comment?
|
||||
}
|
||||
3
hosts/idols-ruby/exporters/default.nix
Normal file
3
hosts/idols-ruby/exporters/default.nix
Normal file
@@ -0,0 +1,3 @@
|
||||
{mylib, ...}: {
|
||||
imports = mylib.scanPaths ./.;
|
||||
}
|
||||
4
hosts/idols-ruby/exporters/pve.nix
Normal file
4
hosts/idols-ruby/exporters/pve.nix
Normal file
@@ -0,0 +1,4 @@
|
||||
{
|
||||
# TODO
|
||||
# https://github.com/NixOS/nixpkgs/blob/nixos-23.11/nixos/modules/services/monitoring/prometheus/exporters/pve.nix
|
||||
}
|
||||
20
hosts/idols-ruby/grafana/dashboards.yml
Normal file
20
hosts/idols-ruby/grafana/dashboards.yml
Normal file
@@ -0,0 +1,20 @@
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
# <string> a unique provider name. Required
|
||||
- name: 'Dashboards'
|
||||
# <int> Org id. Default to 1
|
||||
orgId: 1
|
||||
# <string> provider type. Default to 'file'
|
||||
type: file
|
||||
# <bool> disable dashboard deletion
|
||||
disableDeletion: false
|
||||
# <int> how often Grafana will scan for changed dashboards
|
||||
updateIntervalSeconds: 20
|
||||
# <bool> allow updating provisioned dashboards from the UI
|
||||
allowUiUpdates: false
|
||||
options:
|
||||
# <string, required> path to dashboard files on disk. Required when using the 'file' type
|
||||
path: /etc/grafana/dashboards
|
||||
# <bool> use folder names from filesystem to create folders in Grafana
|
||||
foldersFromFilesStructure: true
|
||||
11
hosts/idols-ruby/grafana/dashboards/README.md
Normal file
11
hosts/idols-ruby/grafana/dashboards/README.md
Normal file
@@ -0,0 +1,11 @@
|
||||
# Grafana Dashboards
|
||||
|
||||
## Homelab
|
||||
|
||||
1. https://grafana.com/grafana/dashboards/1860-node-exporter-full/
|
||||
2. https://grafana.com/grafana/dashboards/9578-alertmanager/
|
||||
|
||||
## Kubernetes
|
||||
|
||||
1. https://github.com/dotdc/grafana-dashboards-kubernetes/
|
||||
|
||||
11216
hosts/idols-ruby/grafana/dashboards/homelab/alertmanager-9578_rev4.json
Normal file
11216
hosts/idols-ruby/grafana/dashboards/homelab/alertmanager-9578_rev4.json
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
2989
hosts/idols-ruby/grafana/dashboards/kubernetes/k8s-views-global.json
Normal file
2989
hosts/idols-ruby/grafana/dashboards/kubernetes/k8s-views-global.json
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
3939
hosts/idols-ruby/grafana/dashboards/kubernetes/k8s-views-nodes.json
Normal file
3939
hosts/idols-ruby/grafana/dashboards/kubernetes/k8s-views-nodes.json
Normal file
File diff suppressed because it is too large
Load Diff
2674
hosts/idols-ruby/grafana/dashboards/kubernetes/k8s-views-pods.json
Normal file
2674
hosts/idols-ruby/grafana/dashboards/kubernetes/k8s-views-pods.json
Normal file
File diff suppressed because it is too large
Load Diff
21
hosts/idols-ruby/grafana/datasources.yml
Normal file
21
hosts/idols-ruby/grafana/datasources.yml
Normal file
@@ -0,0 +1,21 @@
|
||||
# https://grafana.com/docs/grafana/latest/datasources/prometheus/
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: prometheus-homelab
|
||||
type: prometheus
|
||||
access: proxy
|
||||
# Access mode - proxy (server in the UI) or direct (browser in the UI).
|
||||
url: http://localhost:9090
|
||||
jsonData:
|
||||
httpMethod: POST
|
||||
manageAlerts: true
|
||||
prometheusType: Prometheus
|
||||
prometheusVersion: 2.49.0
|
||||
cacheLevel: 'High'
|
||||
disableRecordingRules: false
|
||||
# As of Grafana 10, the Prometheus data source can be configured to query live dashboards
|
||||
# incrementally, instead of re-querying the entire duration on each dashboard refresh.
|
||||
# Increasing the duration of the incrementalQueryOverlapWindow will increase the size of every incremental query,
|
||||
# but might be helpful for instances that have inconsistent results for recent data.
|
||||
incrementalQueryOverlapWindow: 10m
|
||||
52
hosts/idols-ruby/grafana/default.nix
Normal file
52
hosts/idols-ruby/grafana/default.nix
Normal file
@@ -0,0 +1,52 @@
|
||||
{
|
||||
config,
|
||||
myvars,
|
||||
...
|
||||
}: {
|
||||
services.grafana = {
|
||||
enable = true;
|
||||
dataDir = "/var/lib/grafana";
|
||||
# DeclarativePlugins = with pkgs.grafanaPlugins; [ grafana-piechart-panel ];
|
||||
settings = {
|
||||
server = {
  # Grafana only listens on loopback; Caddy (virtual host
  # "http://grafana.writefor.fun") reverse-proxies to this address/port.
  http_addr = "127.0.0.1";
  http_port = 3000;
  protocol = "http";
  # Public hostname used to reach Grafana. Must match the Caddy virtual host,
  # otherwise generated links point to a non-existent domain.
  domain = "grafana.writefor.fun";
  # Serve Grafana from the sub path given in root_url (not used here).
  serve_from_sub_path = false;
  # Full public URL as seen by the browser. Grafana sits behind the reverse
  # proxy on the default HTTP port, so the internal http_port must NOT be part
  # of the URL. Add the sub path here if serve_from_sub_path is true.
  root_url = "%(protocol)s://%(domain)s/";
  # Redirect to correct domain if the host header does not match the domain.
  # Prevents DNS rebinding attacks.
  enforce_domain = false;
  read_timeout = "180s";
  # Enable HTTP compression, this can improve transfer speed and bandwidth utilization.
  enable_gzip = true;
  # Cdn for accelerating loading of frontend assets.
  # cdn_url = "https://cdn.jsdelivr.net/npm/grafana@7.5.5";
};
|
||||
|
||||
security = {
|
||||
admin_user = myvars.username;
|
||||
admin_email = myvars.useremail;
|
||||
# Use file provider to read the admin password from a file.
|
||||
# https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/#file-provider
|
||||
admin_password = "$__file{${config.age.secrets."grafana-admin-password".path}}";
|
||||
};
|
||||
users = {
|
||||
allow_sign_up = false;
|
||||
# home_page = "";
|
||||
default_theme = "dark";
|
||||
};
|
||||
};
|
||||
|
||||
# Declaratively provision Grafana's data sources, dashboards, and alerting rules.
|
||||
# Grafana's alerting rules is not recommended to use, we use Prometheus alertmanager instead.
|
||||
# https://grafana.com/docs/grafana/latest/administration/provisioning/#data-sources
|
||||
provision = {
|
||||
datasources.path = ./datasources.yml;
|
||||
dashboards.path = ./dashboards.yml;
|
||||
};
|
||||
};
|
||||
|
||||
environment.etc."grafana/dashboards".source = ./dashboards;
|
||||
}
|
||||
7
hosts/idols-ruby/prometheus/README.md
Normal file
7
hosts/idols-ruby/prometheus/README.md
Normal file
@@ -0,0 +1,7 @@
|
||||
# Prometheus & Alertmanager
|
||||
|
||||
## Alert Rules
|
||||
|
||||
- [awesome-prometheus-alerts](https://github.com/samber/awesome-prometheus-alerts): Collection of Prometheus alerting rules
|
||||
|
||||
|
||||
@@ -0,0 +1,14 @@
|
||||
groups:
|
||||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
rules:
|
||||
|
||||
- alert: CorednsPanicCount
|
||||
expr: 'increase(coredns_panics_total[1m]) > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: CoreDNS Panic Count (instance {{ $labels.instance }})
|
||||
description: "Number of CoreDNS panics encountered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
@@ -0,0 +1,122 @@
|
||||
groups:
|
||||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
rules:
|
||||
|
||||
- alert: EtcdInsufficientMembers
|
||||
expr: 'count(etcd_server_id) % 2 == 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Etcd insufficient Members (instance {{ $labels.instance }})
|
||||
description: "Etcd cluster should have an odd number of members\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdNoLeader
|
||||
expr: 'etcd_server_has_leader == 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Etcd no Leader (instance {{ $labels.instance }})
|
||||
description: "Etcd cluster have no leader\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdHighNumberOfLeaderChanges
|
||||
expr: 'increase(etcd_server_leader_changes_seen_total[10m]) > 2'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd high number of leader changes (instance {{ $labels.instance }})
|
||||
description: "Etcd leader changed more than 2 times during 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdHighNumberOfFailedGrpcRequests
|
||||
expr: 'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.01'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd high number of failed GRPC requests (instance {{ $labels.instance }})
|
||||
description: "More than 1% GRPC request failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdHighNumberOfFailedGrpcRequests
|
||||
expr: 'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.05'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Etcd high number of failed GRPC requests (instance {{ $labels.instance }})
|
||||
description: "More than 5% GRPC request failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdGrpcRequestsSlow
|
||||
expr: 'histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{grpc_type="unary"}[1m])) by (grpc_service, grpc_method, le)) > 0.15'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd GRPC requests slow (instance {{ $labels.instance }})
|
||||
description: "GRPC requests slowing down, 99th percentile is over 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdHighNumberOfFailedHttpRequests
|
||||
expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd high number of failed HTTP requests (instance {{ $labels.instance }})
|
||||
description: "More than 1% HTTP failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdHighNumberOfFailedHttpRequests
|
||||
expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Etcd high number of failed HTTP requests (instance {{ $labels.instance }})
|
||||
description: "More than 5% HTTP failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdHttpRequestsSlow
|
||||
expr: 'histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[1m])) > 0.15'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd HTTP requests slow (instance {{ $labels.instance }})
|
||||
description: "HTTP requests slowing down, 99th percentile is over 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdMemberCommunicationSlow
|
||||
expr: 'histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) > 0.15'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd member communication slow (instance {{ $labels.instance }})
|
||||
description: "Etcd member communication slowing down, 99th percentile is over 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdHighNumberOfFailedProposals
|
||||
expr: 'increase(etcd_server_proposals_failed_total[1h]) > 5'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd high number of failed proposals (instance {{ $labels.instance }})
|
||||
description: "Etcd server got more than 5 failed proposals past hour\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdHighFsyncDurations
|
||||
expr: 'histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd high fsync durations (instance {{ $labels.instance }})
|
||||
description: "Etcd WAL fsync duration increasing, 99th percentile is over 0.5s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdHighCommitDurations
|
||||
expr: 'histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) > 0.25'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd high commit durations (instance {{ $labels.instance }})
|
||||
description: "Etcd commit duration increasing, 99th percentile is over 0.25s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
@@ -0,0 +1,95 @@
|
||||
groups:
|
||||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
rules:
|
||||
|
||||
- alert: IstioKubernetesGatewayAvailabilityDrop
|
||||
expr: 'min(kube_deployment_status_replicas_available{deployment="istio-ingressgateway", namespace="istio-system"}) without (instance, pod) < 2'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio Kubernetes gateway availability drop (instance {{ $labels.instance }})
|
||||
description: "Gateway pods have dropped. Inbound traffic will likely be affected.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioPilotHighTotalRequestRate
|
||||
expr: 'sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio Pilot high total request rate (instance {{ $labels.instance }})
|
||||
description: "Number of Istio Pilot push errors is too high (> 5%). Envoy sidecars might have outdated configuration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioMixerPrometheusDispatchesLow
|
||||
expr: 'sum(rate(mixer_runtime_dispatches_total{adapter=~"prometheus"}[1m])) < 180'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio Mixer Prometheus dispatches low (instance {{ $labels.instance }})
|
||||
description: "Number of Mixer dispatches to Prometheus is too low. Istio metrics might not be being exported properly.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioHighTotalRequestRate
|
||||
expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) > 1000'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio high total request rate (instance {{ $labels.instance }})
|
||||
description: "Global request rate in the service mesh is unusually high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioLowTotalRequestRate
|
||||
expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) < 100'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio low total request rate (instance {{ $labels.instance }})
|
||||
description: "Global request rate in the service mesh is unusually low.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Fires when more than 5% of destination-reported requests returned a 4xx code over 5 minutes.
- alert: IstioHigh4xxErrorRate
  expr: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"4.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5'
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: Istio high 4xx error rate (instance {{ $labels.instance }})
    description: "High percentage of HTTP 4xx responses in Istio (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioHigh5xxErrorRate
|
||||
expr: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio high 5xx error rate (instance {{ $labels.instance }})
|
||||
description: "High percentage of HTTP 5xx responses in Istio (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioHighRequestLatency
|
||||
expr: 'rate(istio_request_duration_milliseconds_sum{reporter="destination"}[1m]) / rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 100'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio high request latency (instance {{ $labels.instance }})
|
||||
description: "Istio average requests execution is longer than 100ms.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioLatency99Percentile
|
||||
expr: 'histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1000'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio latency 99 percentile (instance {{ $labels.instance }})
|
||||
description: "Istio 1% slowest requests are longer than 1000ms.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioPilotDuplicateEntry
|
||||
expr: 'sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Istio Pilot Duplicate Entry (instance {{ $labels.instance }})
|
||||
description: "Istio pilot duplicate entry error.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
311
hosts/idols-ruby/prometheus/alert_rules/kubestate-exporter.yml
Normal file
311
hosts/idols-ruby/prometheus/alert_rules/kubestate-exporter.yml
Normal file
@@ -0,0 +1,311 @@
|
||||
groups:
|
||||
|
||||
- name: KubestateExporter
|
||||
|
||||
rules:
|
||||
|
||||
# Fires when a node's Ready condition has not been "true" for 10 minutes.
- alert: KubernetesNodeNotReady
  expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
  for: 10m
  labels:
    severity: critical
  annotations:
    summary: Kubernetes Node not ready (node {{ $labels.node }})
    description: "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesNodeMemoryPressure
|
||||
expr: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes memory pressure (node {{ $labels.node }})
|
||||
description: "Node {{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesNodeDiskPressure
|
||||
expr: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes disk pressure (node {{ $labels.node }})
|
||||
description: "Node {{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesNodeNetworkUnavailable
|
||||
expr: 'kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes Node network unavailable (instance {{ $labels.instance }})
|
||||
description: "Node {{ $labels.node }} has NetworkUnavailable condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesNodeOutOfPodCapacity
|
||||
expr: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes Node out of pod capacity (instance {{ $labels.instance }})
|
||||
description: "Node {{ $labels.node }} is out of pod capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesContainerOomKiller
|
||||
expr: '(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes container oom killer ({{ $labels.namespace }}/{{ $labels.pod }}:{{ $labels.container }})
|
||||
description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesJobFailed
|
||||
expr: 'kube_job_status_failed > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes Job failed ({{ $labels.namespace }}/{{ $labels.job_name }})
|
||||
description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesCronjobSuspended
|
||||
expr: 'kube_cronjob_spec_suspend != 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes CronJob suspended ({{ $labels.namespace }}/{{ $labels.cronjob }})
|
||||
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesPersistentvolumeclaimPending
|
||||
expr: 'kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }})
|
||||
description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesVolumeOutOfDiskSpace
|
||||
expr: 'kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes Volume out of disk space (instance {{ $labels.instance }})
|
||||
description: "Volume is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Linear extrapolation of the last 6h of free space; fires when the predicted
# available bytes four days from now drop below zero.
- alert: KubernetesVolumeFullInFourDays
  expr: 'predict_linear(kubelet_volume_stats_available_bytes[6h:5m], 4 * 24 * 3600) < 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Kubernetes Volume full in four days (instance {{ $labels.instance }})
    # $value is the predicted number of available bytes (negative when firing), not a percentage.
    description: "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Predicted available space in four days: {{ $value | humanize1024 }}B.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Fires as soon as a PersistentVolume reports the Failed or Pending phase.
- alert: KubernetesPersistentvolumeError
  expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    # The summary names the PersistentVolume itself, consistent with the description below.
    summary: Kubernetes PersistentVolume error ({{ $labels.persistentvolume }})
    description: "Persistent volume {{ $labels.persistentvolume }} is in bad state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesStatefulsetDown
|
||||
expr: 'kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes StatefulSet down ({{ $labels.namespace }}/{{ $labels.statefulset }})
|
||||
description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesHpaScaleInability
|
||||
expr: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes HPA scale inability (instance {{ $labels.instance }})
|
||||
description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to scale\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesHpaMetricsUnavailability
|
||||
expr: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes HPA metrics unavailability (instance {{ $labels.instance }})
|
||||
description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect metrics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesHpaScaleMaximum
|
||||
expr: 'kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Kubernetes HPA scale maximum (instance {{ $labels.instance }})
|
||||
description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum number of desired pods\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesHpaUnderutilized
|
||||
# Group by namespace as well: the description prints {{ $labels.namespace }}, and HPAs sharing a name across namespaces must not be merged.
expr: 'max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (namespace, horizontalpodautoscaler) > 3'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Kubernetes HPA underutilized (instance {{ $labels.instance }})
|
||||
description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is constantly at minimum replicas for 50% of the time. Potential cost saving here.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesPodNotHealthy
|
||||
expr: 'sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes Pod not healthy ({{ $labels.namespace }}/{{ $labels.pod }})
|
||||
description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesPodCrashLooping
|
||||
expr: 'increase(kube_pod_container_status_restarts_total[1m]) > 3'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes pod crash looping ({{ $labels.namespace }}/{{ $labels.pod }})
|
||||
description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesReplicasetReplicasMismatch
|
||||
expr: 'kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes ReplicaSet mismatch ({{ $labels.namespace }}/{{ $labels.replicaset }})
|
||||
description: "ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesDeploymentReplicasMismatch
|
||||
expr: 'kube_deployment_spec_replicas != kube_deployment_status_replicas_available'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes Deployment replicas mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
|
||||
description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesStatefulsetReplicasMismatch
|
||||
expr: 'kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})
|
||||
description: "StatefulSet does not match the expected number of replicas.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesDeploymentGenerationMismatch
|
||||
expr: 'kube_deployment_status_observed_generation != kube_deployment_metadata_generation'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes Deployment generation mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
|
||||
description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesStatefulsetGenerationMismatch
|
||||
expr: 'kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes StatefulSet generation mismatch ({{ $labels.namespace }}/{{ $labels.statefulset }})
|
||||
description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesStatefulsetUpdateNotRolledOut
|
||||
expr: 'max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes StatefulSet update not rolled out ({{ $labels.namespace }}/{{ $labels.statefulset }})
|
||||
description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesDaemonsetRolloutStuck
|
||||
expr: 'kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes DaemonSet rollout stuck ({{ $labels.namespace }}/{{ $labels.daemonset }})
|
||||
description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled or not ready\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesDaemonsetMisscheduled
|
||||
expr: 'kube_daemonset_status_number_misscheduled > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes DaemonSet misscheduled ({{ $labels.namespace }}/{{ $labels.daemonset }})
|
||||
description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesCronjobTooLong
|
||||
expr: 'time() - kube_cronjob_next_schedule_time > 3600'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes CronJob too long ({{ $labels.namespace }}/{{ $labels.cronjob }})
|
||||
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesJobSlowCompletion
|
||||
expr: 'kube_job_spec_completions - kube_job_status_succeeded - kube_job_status_failed > 0'
|
||||
for: 12h
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes job slow completion ({{ $labels.namespace }}/{{ $labels.job_name }})
|
||||
description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesApiServerErrors
|
||||
expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes API server errors (instance {{ $labels.instance }})
|
||||
description: "Kubernetes API server is experiencing high error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesApiClientErrors
|
||||
expr: '(sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes API client errors (instance {{ $labels.instance }})
|
||||
description: "Kubernetes API client is experiencing high error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesClientCertificateExpiresNextWeek
|
||||
expr: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }})
|
||||
description: "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesClientCertificateExpiresSoon
|
||||
expr: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes client certificate expires soon (instance {{ $labels.instance }})
|
||||
description: "A client certificate used to authenticate to the apiserver is expiring in less than 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesApiServerLatency
|
||||
expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) > 1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes API server latency (instance {{ $labels.instance }})
|
||||
description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
347
hosts/idols-ruby/prometheus/alert_rules/node-exporter.yml
Normal file
347
hosts/idols-ruby/prometheus/alert_rules/node-exporter.yml
Normal file
@@ -0,0 +1,347 @@
|
||||
groups:
|
||||
|
||||
- name: NodeExporter
|
||||
|
||||
rules:
|
||||
|
||||
- alert: HostOutOfMemory
|
||||
expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of memory (instance {{ $labels.instance }})
|
||||
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostMemoryUnderMemoryPressure
|
||||
expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host memory under memory pressure (instance {{ $labels.instance }})
|
||||
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostMemoryIsUnderutilized
|
||||
expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 1w
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host Memory is underutilized (instance {{ $labels.instance }})
|
||||
description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualNetworkThroughputIn
|
||||
expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual network throughput in (instance {{ $labels.instance }})
|
||||
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualNetworkThroughputOut
|
||||
expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual network throughput out (instance {{ $labels.instance }})
|
||||
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskReadRate
|
||||
expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk read rate (instance {{ $labels.instance }})
|
||||
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskWriteRate
|
||||
expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk write rate (instance {{ $labels.instance }})
|
||||
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostOutOfDiskSpace
|
||||
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of disk space (instance {{ $labels.instance }})
|
||||
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostDiskWillFillIn24Hours
|
||||
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
|
||||
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostOutOfInodes
|
||||
expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of inodes (instance {{ $labels.instance }})
|
||||
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostFilesystemDeviceError
|
||||
expr: 'node_filesystem_device_error == 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host filesystem device error (instance {{ $labels.instance }})
|
||||
description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostInodesWillFillIn24Hours
|
||||
expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
|
||||
description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskReadLatency
|
||||
expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk read latency (instance {{ $labels.instance }})
|
||||
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskWriteLatency
|
||||
expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk write latency (instance {{ $labels.instance }})
|
||||
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostHighCpuLoad
|
||||
expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host high CPU load (instance {{ $labels.instance }})
|
||||
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostCpuIsUnderutilized
|
||||
expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 1w
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host CPU is underutilized (instance {{ $labels.instance }})
|
||||
description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostCpuStealNoisyNeighbor
|
||||
expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
|
||||
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostCpuHighIowait
|
||||
expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host CPU high iowait (instance {{ $labels.instance }})
|
||||
description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskIo
|
||||
expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk IO (instance {{ $labels.instance }})
|
||||
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostContextSwitching
|
||||
expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host context switching (instance {{ $labels.instance }})
|
||||
description: "Context switching is growing on the node (> 10000 / CPU / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostSwapIsFillingUp
|
||||
expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host swap is filling up (instance {{ $labels.instance }})
|
||||
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostSystemdServiceCrashed
|
||||
expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host systemd service crashed (instance {{ $labels.instance }})
|
||||
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostPhysicalComponentTooHot
|
||||
expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host physical component too hot (instance {{ $labels.instance }})
|
||||
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNodeOvertemperatureAlarm
|
||||
expr: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
|
||||
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostRaidArrayGotInactive
|
||||
expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host RAID array got inactive (instance {{ $labels.instance }})
|
||||
description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostRaidDiskFailure
|
||||
expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host RAID disk failure (instance {{ $labels.instance }})
|
||||
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostKernelVersionDeviations
|
||||
# count() without a grouping clause drops every label, so the result cannot be joined to node_uname_info on(instance); compare the bare count instead so the alert can actually fire.
expr: 'count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1'
|
||||
for: 6h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host kernel version deviations (instance {{ $labels.instance }})
|
||||
description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostOomKillDetected
|
||||
expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host OOM kill detected (instance {{ $labels.instance }})
|
||||
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostEdacCorrectableErrorsDetected
|
||||
expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last minute.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostEdacUncorrectableErrorsDetected
|
||||
expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in total.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkReceiveErrors
|
||||
expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Receive Errors (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkTransmitErrors
|
||||
expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a transmit error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkInterfaceSaturated
|
||||
expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
|
||||
description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkBondDegraded
|
||||
expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
|
||||
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostConntrackLimit
|
||||
expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host conntrack limit (instance {{ $labels.instance }})
|
||||
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostClockSkew
|
||||
expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host clock skew (instance {{ $labels.instance }})
|
||||
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostClockNotSynchronising
|
||||
expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host clock not synchronising (instance {{ $labels.instance }})
|
||||
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostRequiresReboot
|
||||
expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 4h
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host requires reboot (instance {{ $labels.instance }})
|
||||
description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
157
hosts/idols-ruby/prometheus/default.nix
Normal file
157
hosts/idols-ruby/prometheus/default.nix
Normal file
@@ -0,0 +1,157 @@
|
||||
{
|
||||
config,
|
||||
myvars,
|
||||
...
|
||||
}: {
|
||||
# https://prometheus.io/docs/prometheus/latest/configuration/configuration/
|
||||
services.prometheus = {
|
||||
enable = true;
|
||||
checkConfig = true;
|
||||
listenAddress = "127.0.0.1";
|
||||
port = 9090;
|
||||
webExternalUrl = "http://prometheus.writefor.fun";
|
||||
|
||||
extraFlags = ["--storage.tsdb.retention.time=15d"];
|
||||
# Directory below /var/lib to store Prometheus metrics data.
|
||||
stateDir = "prometheus2";
|
||||
|
||||
# Reload prometheus when configuration file changes (instead of restart).
|
||||
enableReload = true;
|
||||
# https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_read
|
||||
# remoteRead = [];
|
||||
|
||||
# Rules are read from these files.
|
||||
# https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/
|
||||
#
|
||||
# Prometheus supports two types of rules which may be configured
|
||||
# and then evaluated at regular intervals:
|
||||
# 1. Recording rules
|
||||
# Recording rules allow you to precompute frequently needed or computationally
|
||||
# expensive expressions and save their result as a new set of time series.
|
||||
# Querying the precomputed result will then often be much faster than executing the original expression.
|
||||
# This is especially useful for dashboards, which need to query the same expression repeatedly every time they refresh.
|
||||
# 2. Alerting rules
|
||||
# Alerting rules allow you to define alert conditions based on Prometheus expression language expressions
|
||||
# and to send notifications about firing alerts to an external service.
|
||||
ruleFiles = [
|
||||
./alert_rules/node-exporter.yml
|
||||
./alert_rules/kubestate-exporter.yml
|
||||
./alert_rules/etcd_embedded-exporter.yml
|
||||
./alert_rules/istio_embedded-exporter.yml
|
||||
./alert_rules/coredns_embedded-exporter.yml
|
||||
|
||||
# ./recording_rules.yml
|
||||
];
|
||||
|
||||
# specifies a set of targets and parameters describing how to scrape metrics from them.
|
||||
# https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config
|
||||
scrapeConfigs = [
|
||||
# --- Hosts --- #
|
||||
{
|
||||
job_name = "node-exporter";
|
||||
scrape_interval = "30s";
|
||||
metrics_path = "/metrics";
|
||||
static_configs = [
|
||||
{
|
||||
# All my NixOS hosts.
|
||||
targets =
|
||||
map (host: "${host.address}:9100")
|
||||
(builtins.attrValues myvars.networking.hostAddress);
|
||||
labels.type = "node";
|
||||
}
|
||||
];
|
||||
}
|
||||
|
||||
# --- Homelab Applications --- #
|
||||
|
||||
{
|
||||
job_name = "dnsmasq-exporter";
|
||||
scrape_interval = "30s";
|
||||
metrics_path = "/metrics";
|
||||
static_configs = [
|
||||
{
|
||||
targets = ["${myvars.networking.hostAddress.aquamarine.address}:9153"];
|
||||
labels.type = "app";
|
||||
labels.app = "dnsmasq";
|
||||
}
|
||||
];
|
||||
}
|
||||
|
||||
{
|
||||
job_name = "v2ray-exporter";
|
||||
scrape_interval = "30s";
|
||||
metrics_path = "/metrics";
|
||||
static_configs = [
|
||||
{
|
||||
# NOTE(review): 9153 is also the port scraped for dnsmasq-exporter above; confirm the v2ray exporter on kana really listens on 9153.
targets = ["${myvars.networking.hostAddress.kana.address}:9153"];
|
||||
labels.type = "app";
|
||||
labels.app = "v2ray";
|
||||
}
|
||||
];
|
||||
}
|
||||
|
||||
{
|
||||
job_name = "sftpgo-embedded-exporter";
|
||||
scrape_interval = "30s";
|
||||
metrics_path = "/metrics";
|
||||
static_configs = [
|
||||
{
|
||||
targets = ["${myvars.networking.hostAddress.kana.address}:10000"];
|
||||
labels.type = "app";
|
||||
# Application label for this job's series; must identify sftpgo so its metrics are not mixed with the v2ray job's.
labels.app = "sftpgo";
|
||||
}
|
||||
];
|
||||
}
|
||||
];
|
||||
|
||||
# specifies Alertmanager instances the Prometheus server sends alerts to
|
||||
# https://prometheus.io/docs/prometheus/latest/configuration/configuration/#alertmanager_config
|
||||
alertmanagers = [{static_configs = [{targets = ["localhost:9093"];}];}];
|
||||
};
|
||||
|
||||
services.prometheus.alertmanager = {
|
||||
enable = true;
|
||||
listenAddress = "127.0.0.1";
|
||||
port = 9093;
|
||||
webExternalUrl = "http://alertmanager.writefor.fun";
|
||||
logLevel = "info";
|
||||
|
||||
environmentFile = config.age.secrets."alertmanager.env".path;
|
||||
configuration = {
|
||||
global = {
|
||||
# The smarthost and SMTP sender used for mail notifications.
|
||||
smtp_smarthost = "smtp.qq.com:465";
|
||||
smtp_from = "$SMTP_SENDER_EMAIL";
|
||||
# SMTP login name. Alertmanager expects the flat key `smtp_auth_username`;
# a dotted Nix attribute path here would generate a nested, unknown key instead.
smtp_auth_username = "$SMTP_AUTH_USERNAME";
|
||||
smtp_auth_password = "$SMTP_AUTH_PASSWORD";
|
||||
# smtp.qq.com:465 support SSL only, so we need to disable TLS here.
|
||||
# https://service.mail.qq.com/detail/0/310
|
||||
smtp_require_tls = false;
|
||||
};
|
||||
route = {
|
||||
receiver = "default";
|
||||
routes = [
|
||||
{
|
||||
# NOTE(review): the scrape configs and rule expressions visible in this module attach instance/nodename/type/app labels, not "host" — confirm "host" is the intended grouping key.
group_by = ["host"];
|
||||
group_wait = "5m";
|
||||
group_interval = "5m";
|
||||
repeat_interval = "4h";
|
||||
receiver = "default";
|
||||
}
|
||||
];
|
||||
};
|
||||
receivers = [
|
||||
{
|
||||
name = "default";
|
||||
email_configs = [
|
||||
{
|
||||
to = "ryan4yin@linux.com";
|
||||
# Whether to notify about resolved alerts.
|
||||
send_resolved = true;
|
||||
}
|
||||
];
|
||||
}
|
||||
];
|
||||
};
|
||||
};
|
||||
}
|
||||
83
hosts/idols-ruby/restic.nix
Normal file
83
hosts/idols-ruby/restic.nix
Normal file
@@ -0,0 +1,83 @@
|
||||
{pkgs, ...}: let
|
||||
passwordFile = "/etc/agenix/restic-password";
|
||||
sshKeyPath = "/etc/agenix/ssh-key-for-restic-backup";
|
||||
rcloneConfigFile = "/etc/agenix/rclone-conf-for-restic-backup";
|
||||
in {
|
||||
# https://github.com/NixOS/nixpkgs/blob/nixos-23.11/nixos/modules/services/backup/restic.nix
|
||||
services.restic.backups = {
|
||||
homelab-backup = {
|
||||
inherit passwordFile;
|
||||
initialize = true; # Initialize the repository if it doesn't exist.
|
||||
repository = "rclone:smb-downloads:/Downloads/proxmox-backup/"; # backup to a rclone remote
|
||||
|
||||
# rclone related
|
||||
# rcloneOptions = {
|
||||
# bwlimit = "100M"; # Limit the bandwidth used by rclone.
|
||||
# };
|
||||
inherit rcloneConfigFile;
|
||||
|
||||
# Which local paths to backup, in addition to ones specified via `dynamicFilesFrom`.
|
||||
paths = [
|
||||
"/tmp/restic-backup-temp"
|
||||
];
|
||||
#
|
||||
# A script that produces a list of files to back up. The
|
||||
# results of this command are given to the '--files-from'
|
||||
# option. The result is merged with paths specified via `paths`.
|
||||
# dynamicFilesFrom = "find /home/matt/git -type d -name .git";
|
||||
#
|
||||
# Patterns to exclude when backing up. See
|
||||
# https://restic.readthedocs.io/en/latest/040_backup.html#excluding-files
|
||||
# for details on syntax.
|
||||
exclude = [];
|
||||
|
||||
# A script that must run before starting the backup process.
|
||||
# Stage the data to back up: copy /var/lib/vz from every listed node into
# /tmp/restic-backup-temp, the directory named in `paths`.
# The nu script is passed to the shell inside single quotes, so it must not
# contain a single-quote character anywhere, including in comments.
backupPrepareCommand = ''
  ${pkgs.nushell}/bin/nu -c '
    let pve_nodes = [
      # nodes of the proxmox cluster
      "um560"
      "gtr5"
      "s500plus"

      # others
      "kana"
    ]

    # rsync only creates the last component of the destination path,
    # so the staging directory has to exist before the first transfer.
    mkdir /tmp/restic-backup-temp

    pve_nodes | each {|it|
      rsync -avz -e "ssh -i ${sshKeyPath}" $"($it):/var/lib/vz" $"/tmp/restic-backup-temp/($it)"
    }
  '
'';
|
||||
# A script that must run after finishing the backup process.
|
||||
backupCleanupCommand = "rm -rf /tmp/restic-backup-temp";
|
||||
|
||||
# Extra extended options to be passed to the restic --option flag.
|
||||
# extraOptions = [];
|
||||
|
||||
# Extra arguments passed to restic backup.
|
||||
# extraBackupArgs = [
|
||||
# "--exclude-file=/etc/restic/excludes-list"
|
||||
# ];
|
||||
|
||||
# repository = "/mnt/backup-hdd"; # backup to a local directory
|
||||
# When to run the backup. See {manpage}`systemd.timer(5)` for details.
|
||||
timerConfig = {
|
||||
OnCalendar = "01:30";
|
||||
RandomizedDelaySec = "1h";
|
||||
};
|
||||
# A list of options (--keep-* et al.) for 'restic forget --prune',
|
||||
# to automatically prune old snapshots.
|
||||
# The 'forget' command is run *after* the 'backup' command, so
|
||||
# keep that in mind when constructing the --keep-* options.
|
||||
pruneOpts = [
|
||||
"--keep-daily 3"
|
||||
"--keep-weekly 3"
|
||||
"--keep-monthly 3"
|
||||
"--keep-yearly 3"
|
||||
];
|
||||
};
|
||||
};
|
||||
}
|
||||
Reference in New Issue
Block a user