mirror of
https://github.com/ryan4yin/nix-config.git
synced 2026-05-19 04:56:51 +02:00
feat: migrate all nixos services from idols to 12kingdoms
This commit is contained in:
@@ -0,0 +1,143 @@
|
||||
# Rakushun - Disk and Installation
|
||||
|
||||
Disk layout:
|
||||
|
||||
```bash
|
||||
[ryan@rakushun:~]$ lsblk
|
||||
NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINTS
|
||||
sda 8:0 1 58.6G 0 disk
|
||||
└─sda1 8:1 1 487M 0 part
|
||||
mtdblock0 31:0 0 16M 0 disk
|
||||
zram0 254:0 0 0B 0 disk
|
||||
nvme0n1 259:0 0 1.8T 0 disk
|
||||
├─nvme0n1p1 259:1 0 630M 0 part /boot
|
||||
└─nvme0n1p2 259:2 0 1.8T 0 part
|
||||
└─encrypted 253:0 0 1.8T 0 crypt /tmp
|
||||
/swap
|
||||
/snapshots
|
||||
/home/ryan/tmp
|
||||
/home/ryan/nix-config
|
||||
/home/ryan/go
|
||||
/home/ryan/codes
|
||||
/home/ryan/.ssh
|
||||
/home/ryan/.local/state
|
||||
/home/ryan/.npm
|
||||
/home/ryan/.local/share
|
||||
/home/ryan/.conda
|
||||
/etc/ssh
|
||||
/etc/nix/inputs
|
||||
/etc/secureboot
|
||||
/etc/agenix
|
||||
/etc/NetworkManager/system-connections
|
||||
/etc/machine-id
|
||||
/nix/store
|
||||
/var/log
|
||||
/var/lib
|
||||
/nix
|
||||
/persistent
|
||||
|
||||
[ryan@rakushun:~]$ df -Th
|
||||
Filesystem Type Size Used Avail Use% Mounted on
|
||||
devtmpfs devtmpfs 785M 0 785M 0% /dev
|
||||
tmpfs tmpfs 7.7G 0 7.7G 0% /dev/shm
|
||||
tmpfs tmpfs 3.9G 6.8M 3.9G 1% /run
|
||||
tmpfs tmpfs 7.7G 1.9M 7.7G 1% /run/wrappers
|
||||
none tmpfs 4.0G 48K 4.0G 1% /
|
||||
/dev/mapper/crypted btrfs 1.9T 19G 1.8T 2% /persistent
|
||||
/dev/mapper/crypted btrfs 1.9T 19G 1.8T 2% /nix
|
||||
/dev/mapper/crypted btrfs 1.9T 19G 1.8T 2% /snapshots
|
||||
/dev/mapper/crypted btrfs 1.9T 19G 1.8T 2% /swap
|
||||
/dev/mapper/crypted btrfs 1.9T 19G 1.8T 2% /tmp
|
||||
/dev/nvme0n1p1 vfat 629M 96M 534M 16% /boot
|
||||
tmpfs tmpfs 1.6G 4.0K 1.6G 1% /run/user/1000
|
||||
```
|
||||
|
||||
CPU info:
|
||||
|
||||
```bash
|
||||
[ryan@rakushun:~]$ lscpu
|
||||
Architecture: aarch64
|
||||
CPU op-mode(s): 32-bit, 64-bit
|
||||
Byte Order: Little Endian
|
||||
CPU(s): 8
|
||||
On-line CPU(s) list: 0-7
|
||||
Vendor ID: ARM
|
||||
Model name: Cortex-A55
|
||||
Model: 0
|
||||
Thread(s) per core: 1
|
||||
Core(s) per socket: 4
|
||||
Socket(s): 1
|
||||
Stepping: r2p0
|
||||
CPU(s) scaling MHz: 67%
|
||||
CPU max MHz: 1800.0000
|
||||
CPU min MHz: 408.0000
|
||||
BogoMIPS: 48.00
|
||||
Flags: fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp asimdhp cpuid asimdrdm lrcpc dcpop asimddp
|
||||
Model name: Cortex-A76
|
||||
Model: 0
|
||||
Thread(s) per core: 1
|
||||
Core(s) per socket: 2
|
||||
Socket(s): 2
|
||||
Stepping: r4p0
|
||||
CPU(s) scaling MHz: 18%
|
||||
CPU max MHz: 2256.0000
|
||||
CPU min MHz: 408.0000
|
||||
BogoMIPS: 48.00
|
||||
Flags: fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp asimdhp cpuid asimdrdm lrcpc dcpop asimddp
|
||||
Caches (sum of all):
|
||||
L1d: 384 KiB (8 instances)
|
||||
L1i: 384 KiB (8 instances)
|
||||
L2: 2.5 MiB (8 instances)
|
||||
L3: 3 MiB (1 instance)
|
||||
```
|
||||
|
||||
## How to install NixOS on Orange Pi 5 Plus
|
||||
|
||||
### 1. Prepare a USB LUKS key
|
||||
|
||||
Generate a LUKS keyfile to encrypt the root partition; it is used by disko.
|
||||
|
||||
```bash
|
||||
# partition the usb stick
|
||||
DEV=/dev/sdX
|
||||
parted ${DEV} -- mklabel gpt
|
||||
parted ${DEV} -- mkpart OPI5P_DSC fat32 0% 512MB
|
||||
mkfs.fat -F 32 -n OPI5P_DSC ${DEV}1
|
||||
|
||||
# Generate a keyfile from the true random number generator
|
||||
KEYFILE=./orangepi5plus-luks-keyfile
|
||||
dd bs=512 count=64 iflag=fullblock if=/dev/random of=$KEYFILE
|
||||
|
||||
# copy the keyfile and token to the usb stick
|
||||
KEYFILE=./orangepi5plus-luks-keyfile
|
||||
DEVICE=/dev/disk/by-label/OPI5P_DSC
|
||||
# seek=128 skips the first 128 obs-sized output blocks, to avoid overwriting the filesystem header
|
||||
dd bs=512 count=64 iflag=fullblock seek=128 if=$KEYFILE of=$DEVICE
|
||||
```
|
||||
|
||||
### 2. Partition the SSD & install NixOS via disko
|
||||
|
||||
First, follow
|
||||
[UEFI - ryan4yin/nixos-rk3588](https://github.com/ryan4yin/nixos-rk3588/blob/main/UEFI.md) to
|
||||
install UEFI bootloader and boot into NixOS live environment via a USB stick.
|
||||
|
||||
Then, run the following commands:
|
||||
|
||||
```bash
|
||||
# transfer the nix-config to the target machine
|
||||
rsync -avzP ~/nix-config rk@<ip-addr>:/home/rk/
|
||||
|
||||
# login via ssh
|
||||
ssh rk@<ip-addr>
|
||||
|
||||
cd ~/nix-config/hosts/12kingdoms_rakushun
|
||||
# 1. change the disk device path in ./disko-fs.nix to the disk you want to use
|
||||
# 2. partition & format the disk via disko
|
||||
sudo nix --experimental-features "nix-command flakes" run github:nix-community/disko -- --mode disko ./disko-fs.nix
|
||||
|
||||
|
||||
cd ~/nix-config
|
||||
# install nixos
|
||||
# NOTE: the root password you set here will be discarded on reboot
|
||||
sudo nixos-install --root /mnt --flake .#rakushun --no-root-password --show-trace --verbose
|
||||
```
|
||||
@@ -2,148 +2,33 @@
|
||||
|
||||
LUKS encrypted SSD for NixOS, on Orange Pi 5 Plus.
|
||||
|
||||
Host running storage, operation and maintenance related services:
|
||||
|
||||
1. Storage such as git server, file server/browser, torrent downloader, etc.
|
||||
1. Backup or sync my personal data to cloud or NAS.
|
||||
- For safety, those data should be encrypted before sending to the cloud or my NAS.
|
||||
1. Collect and monitor the metrics/logs of my homelab.
|
||||
|
||||
## Showcases
|
||||
|
||||

|
||||
|
||||
Disk layout:
|
||||
## Features
|
||||
|
||||
```bash
|
||||
[ryan@rakushun:~]$ lsblk
|
||||
NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINTS
|
||||
sda 8:0 1 58.6G 0 disk
|
||||
└─sda1 8:1 1 487M 0 part
|
||||
mtdblock0 31:0 0 16M 0 disk
|
||||
zram0 254:0 0 0B 0 disk
|
||||
nvme0n1 259:0 0 1.8T 0 disk
|
||||
├─nvme0n1p1 259:1 0 630M 0 part /boot
|
||||
└─nvme0n1p2 259:2 0 1.8T 0 part
|
||||
└─encrypted 253:0 0 1.8T 0 crypt /tmp
|
||||
/swap
|
||||
/snapshots
|
||||
/home/ryan/tmp
|
||||
/home/ryan/nix-config
|
||||
/home/ryan/go
|
||||
/home/ryan/codes
|
||||
/home/ryan/.ssh
|
||||
/home/ryan/.local/state
|
||||
/home/ryan/.npm
|
||||
/home/ryan/.local/share
|
||||
/home/ryan/.conda
|
||||
/etc/ssh
|
||||
/etc/nix/inputs
|
||||
/etc/secureboot
|
||||
/etc/agenix
|
||||
/etc/NetworkManager/system-connections
|
||||
/etc/machine-id
|
||||
/nix/store
|
||||
/var/log
|
||||
/var/lib
|
||||
/nix
|
||||
/persistent
|
||||
Services:
|
||||
|
||||
[ryan@rakushun:~]$ df -Th
|
||||
Filesystem Type Size Used Avail Use% Mounted on
|
||||
devtmpfs devtmpfs 785M 0 785M 0% /dev
|
||||
tmpfs tmpfs 7.7G 0 7.7G 0% /dev/shm
|
||||
tmpfs tmpfs 3.9G 6.8M 3.9G 1% /run
|
||||
tmpfs tmpfs 7.7G 1.9M 7.7G 1% /run/wrappers
|
||||
none tmpfs 4.0G 48K 4.0G 1% /
|
||||
/dev/mapper/crypted btrfs 1.9T 19G 1.8T 2% /persistent
|
||||
/dev/mapper/crypted btrfs 1.9T 19G 1.8T 2% /nix
|
||||
/dev/mapper/crypted btrfs 1.9T 19G 1.8T 2% /snapshots
|
||||
/dev/mapper/crypted btrfs 1.9T 19G 1.8T 2% /swap
|
||||
/dev/mapper/crypted btrfs 1.9T 19G 1.8T 2% /tmp
|
||||
/dev/nvme0n1p1 vfat 629M 96M 534M 16% /boot
|
||||
tmpfs tmpfs 1.6G 4.0K 1.6G 1% /run/user/1000
|
||||
```
|
||||
1. prometheus + alertmanager + grafana + loki: Monitor the metrics/logs of my homelab.
|
||||
1. restic: Backup my personal data to cloud or NAS.
|
||||
1. syncthing: Sync files between Android/MacBook/PC and NAS.
|
||||
1. attic: Nix cache server.
|
||||
1. gitea: Self-hosted git service.
|
||||
1. sftpgo: SFTP server.
|
||||
1. transmission & AriaNg: Torrent downloader and HTTP downloader
|
||||
1. alist/filebrowser: File browser for local/SMB/Cloud
|
||||
|
||||
CPU info:
|
||||
All the services assume a reverse proxy to be set up in front of them; they are all listening on
|
||||
localhost, and a caddy service is listening on the local network interface and proxies the requests to
|
||||
the services.
|
||||
|
||||
```bash
|
||||
[ryan@rakushun:~]$ lscpu
|
||||
Architecture: aarch64
|
||||
CPU op-mode(s): 32-bit, 64-bit
|
||||
Byte Order: Little Endian
|
||||
CPU(s): 8
|
||||
On-line CPU(s) list: 0-7
|
||||
Vendor ID: ARM
|
||||
Model name: Cortex-A55
|
||||
Model: 0
|
||||
Thread(s) per core: 1
|
||||
Core(s) per socket: 4
|
||||
Socket(s): 1
|
||||
Stepping: r2p0
|
||||
CPU(s) scaling MHz: 67%
|
||||
CPU max MHz: 1800.0000
|
||||
CPU min MHz: 408.0000
|
||||
BogoMIPS: 48.00
|
||||
Flags: fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp asimdhp cpuid asimdrdm lrcpc dcpop asimddp
|
||||
Model name: Cortex-A76
|
||||
Model: 0
|
||||
Thread(s) per core: 1
|
||||
Core(s) per socket: 2
|
||||
Socket(s): 2
|
||||
Stepping: r4p0
|
||||
CPU(s) scaling MHz: 18%
|
||||
CPU max MHz: 2256.0000
|
||||
CPU min MHz: 408.0000
|
||||
BogoMIPS: 48.00
|
||||
Flags: fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp asimdhp cpuid asimdrdm lrcpc dcpop asimddp
|
||||
Caches (sum of all):
|
||||
L1d: 384 KiB (8 instances)
|
||||
L1i: 384 KiB (8 instances)
|
||||
L2: 2.5 MiB (8 instances)
|
||||
L3: 3 MiB (1 instance)
|
||||
```
|
||||
|
||||
## How to install NixOS on Orange Pi 5 Plus
|
||||
|
||||
### 1. Prepare a USB LUKS key
|
||||
|
||||
Generate LUKS keyfile to encrypt the root partition, it's used by disko.
|
||||
|
||||
```bash
|
||||
# partition the usb stick
|
||||
DEV=/dev/sdX
|
||||
parted ${DEV} -- mklabel gpt
|
||||
parted ${DEV} -- mkpart OPI5P_DSC fat32 0% 512MB
|
||||
mkfs.fat -F 32 -n OPI5P_DSC ${DEV}1
|
||||
|
||||
# Generate a keyfile from the true random number generator
|
||||
KEYFILE=./orangepi5plus-luks-keyfile
|
||||
dd bs=512 count=64 iflag=fullblock if=/dev/random of=$KEYFILE
|
||||
|
||||
# copy the keyfile and token to the usb stick
|
||||
KEYFILE=./orangepi5plus-luks-keyfile
|
||||
DEVICE=/dev/disk/by-label/OPI5P_DSC
|
||||
# seek=128 skip N obs-sized output blocks to avoid overwriting the filesystem header
|
||||
dd bs=512 count=64 iflag=fullblock seek=128 if=$KEYFILE of=$DEVICE
|
||||
```
|
||||
|
||||
### 2. Partition the SSD & install NixOS via disko
|
||||
|
||||
First, follow
|
||||
[UEFI - ryan4yin/nixos-rk3588](https://github.com/ryan4yin/nixos-rk3588/blob/main/UEFI.md) to
|
||||
install UEFI bootloader and boot into NixOS live environment via a USB stick.
|
||||
|
||||
Then, run the following commands:
|
||||
|
||||
```bash
|
||||
# transfer the nix-config to the target machine
|
||||
rsync -avzP ~/nix-config rk@<ip-addr>:/home/rk/
|
||||
|
||||
# login via ssh
|
||||
ssh rk@<ip-addr>
|
||||
|
||||
cd ~/nix-config/hosts/12kingdoms_rakushun
|
||||
# 1. change the disk device path in ./disko-fs.nix to the disk you want to use
|
||||
# 2. partition & format the disk via disko
|
||||
sudo nix --experimental-features "nix-command flakes" run github:nix-community/disko -- --mode disko ./disko-fs.nix
|
||||
|
||||
|
||||
cd ~/nix-config
|
||||
# install nixos
|
||||
# NOTE: the root password you set here will be discarded when reboot
|
||||
sudo nixos-install --root /mnt --flake .#rakushun --no-root-password --show-trace --verbose
|
||||
```
|
||||
TODO: create a private PKI for caddy, to achieve end-to-end encryption between caddy and the
|
||||
services.
|
||||
|
||||
@@ -0,0 +1,74 @@
|
||||
{
|
||||
config,
|
||||
attic,
|
||||
...
|
||||
}: {
|
||||
#=====================================================
|
||||
#
|
||||
# Attic
|
||||
#
|
||||
# A self-hostable Nix Binary Cache server
|
||||
# backed by an S3-compatible storage provider
|
||||
#
|
||||
# https://docs.attic.rs/tutorial.html
|
||||
#
|
||||
#=====================================================
|
||||
|
||||
imports = [
|
||||
attic.nixosModules.atticd
|
||||
];
|
||||
|
||||
# Self-Hosted Nix Cache Server
|
||||
# https://github.com/zhaofengli/attic
|
||||
#
|
||||
# The first thing to do after setting up the server is:
|
||||
# 1. Generate an admin token on the server via command:
|
||||
# `sudo atticd-atticadm make-token --sub "admin-1" --validity "2y" --pull "*" --push "*" --delete "*" --create-cache "*" --configure-cache "*" --configure-cache-retention "*" --destroy-cache "*"`
|
||||
# 2. Login at the desktop via command:
|
||||
# `attic login central http://attic.writefor.fun <TOKEN>`
|
||||
# 3. Create a new cache via command:
|
||||
# `attic cache create rk3588`
|
||||
# `attic use cache rk3588`
|
||||
# 4. Push Caches to the cache server via:
|
||||
# it's similar to cachix, related docs:
|
||||
# https://docs.attic.rs/reference/attic-cli.html
|
||||
# https://docs.cachix.org/pushing#pushing
|
||||
services.atticd = {
|
||||
enable = true;
|
||||
|
||||
# Replace with absolute path to your credentials file
|
||||
# The HS256 JWT secret can be generated with the openssl:
|
||||
# openssl rand 64 | base64 -w0
|
||||
#
|
||||
# Content:
|
||||
# ATTIC_SERVER_TOKEN_HS256_SECRET_BASE64="output from openssl"
|
||||
credentialsFile = config.age.secrets."attic-nix-cache-server.env".path;
|
||||
|
||||
settings = {
|
||||
listen = "127.0.0.1:3300";
|
||||
|
||||
# Data chunking
|
||||
#
|
||||
# Warning: If you change any of the values here, it will be
|
||||
# difficult to reuse existing chunks for newly-uploaded NARs
|
||||
# since the cutpoints will be different. As a result, the
|
||||
# deduplication ratio will suffer for a while after the change.
|
||||
chunking = {
|
||||
# The minimum NAR size to trigger chunking
|
||||
#
|
||||
# If 0, chunking is disabled entirely for newly-uploaded NARs.
|
||||
# If 1, all NARs are chunked.
|
||||
nar-size-threshold = 64 * 1024; # 64 KiB
|
||||
|
||||
# The preferred minimum size of a chunk, in bytes
|
||||
min-size = 16 * 1024; # 16 KiB
|
||||
|
||||
# The preferred average size of a chunk, in bytes
|
||||
avg-size = 64 * 1024; # 64 KiB
|
||||
|
||||
# The preferred maximum size of a chunk, in bytes
|
||||
max-size = 256 * 1024; # 256 KiB
|
||||
};
|
||||
};
|
||||
};
|
||||
}
|
||||
@@ -19,9 +19,10 @@
|
||||
# email = myvars.useremail;
|
||||
# acmeCA = "https://acme-v02.api.letsencrypt.org/directory";
|
||||
|
||||
virtualHosts."http://git.writefor.fun".extraConfig = ''
|
||||
# Dashboard
|
||||
virtualHosts."http://home.writefor.fun".extraConfig = ''
|
||||
encode zstd gzip
|
||||
reverse_proxy http://localhost:3000
|
||||
reverse_proxy http://localhost:4401
|
||||
'';
|
||||
|
||||
# https://caddyserver.com/docs/caddyfile/directives/file_server
|
||||
@@ -33,13 +34,54 @@
|
||||
precompressed zstd br gzip
|
||||
}
|
||||
'';
|
||||
|
||||
# Datastore
|
||||
virtualHosts."http://attic.writefor.fun".extraConfig = ''
|
||||
encode zstd gzip
|
||||
reverse_proxy http://localhost:3300
|
||||
'';
|
||||
|
||||
virtualHosts."http://git.writefor.fun".extraConfig = ''
|
||||
encode zstd gzip
|
||||
reverse_proxy http://localhost:3301
|
||||
'';
|
||||
virtualHosts."http://sftpgo.writefor.fun".extraConfig = ''
|
||||
encode zstd gzip
|
||||
reverse_proxy http://localhost:3302
|
||||
'';
|
||||
virtualHosts."http://webdav.writefor.fun".extraConfig = ''
|
||||
encode zstd gzip
|
||||
reverse_proxy http://localhost:3303
|
||||
'';
|
||||
virtualHosts."http://transmission.writefor.fun".extraConfig = ''
|
||||
encode zstd gzip
|
||||
reverse_proxy http://localhost:9091
|
||||
'';
|
||||
|
||||
# Monitoring
|
||||
virtualHosts."http://uptime-kuma.writefor.fun".extraConfig = ''
|
||||
encode zstd gzip
|
||||
reverse_proxy http://localhost:3350
|
||||
'';
|
||||
virtualHosts."http://grafana.writefor.fun".extraConfig = ''
|
||||
encode zstd gzip
|
||||
reverse_proxy http://localhost:3351
|
||||
'';
|
||||
virtualHosts."http://prometheus.writefor.fun".extraConfig = ''
|
||||
encode zstd gzip
|
||||
reverse_proxy http://localhost:9090
|
||||
'';
|
||||
virtualHosts."http://alertmanager.writefor.fun".extraConfig = ''
|
||||
encode zstd gzip
|
||||
reverse_proxy http://localhost:9093
|
||||
'';
|
||||
};
|
||||
networking.firewall.allowedTCPPorts = [80 443];
|
||||
|
||||
# Create Directories
|
||||
systemd.tmpfiles.rules = [
|
||||
"d /var/lib/caddy/fileserver/ 0755 caddy caddy"
|
||||
# directory for virual machine's images
|
||||
# directory for virtual machine's images
|
||||
"d /var/lib/caddy/fileserver/vms 0755 caddy caddy"
|
||||
];
|
||||
}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
{
|
||||
mylib,
|
||||
disko,
|
||||
nixos-rk3588,
|
||||
myvars,
|
||||
@@ -12,17 +13,13 @@
|
||||
let
|
||||
hostName = "rakushun"; # Define your hostname.
|
||||
in {
|
||||
imports = [
|
||||
# import the rk3588 module, which contains the configuration for bootloader/kernel/firmware
|
||||
nixos-rk3588.nixosModules.orangepi5plus.core
|
||||
disko.nixosModules.default
|
||||
./hardware-configuration.nix
|
||||
./disko-fs.nix
|
||||
./impermanence.nix
|
||||
|
||||
./gitea.nix
|
||||
./caddy.nix
|
||||
];
|
||||
imports =
|
||||
(mylib.scanPaths ./.)
|
||||
++ [
|
||||
# import the rk3588 module, which contains the configuration for bootloader/kernel/firmware
|
||||
nixos-rk3588.nixosModules.orangepi5plus.core
|
||||
disko.nixosModules.default
|
||||
];
|
||||
|
||||
networking = {
|
||||
inherit hostName;
|
||||
|
||||
@@ -0,0 +1,3 @@
|
||||
{mylib, ...}: {
|
||||
imports = mylib.scanPaths ./.;
|
||||
}
|
||||
@@ -21,7 +21,7 @@ in {
|
||||
server = {
|
||||
SSH_PORT = 2222;
|
||||
PROTOCOL = "http";
|
||||
HTTP_PORT = 3000;
|
||||
HTTP_PORT = 3301;
|
||||
HTTP_ADDR = "127.0.0.1";
|
||||
DOMAIN = "git.writefor.fun";
|
||||
};
|
||||
|
||||
@@ -0,0 +1,20 @@
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
# <string> a unique provider name. Required
|
||||
- name: "Dashboards"
|
||||
# <int> Org id. Default to 1
|
||||
orgId: 1
|
||||
# <string> provider type. Default to 'file'
|
||||
type: file
|
||||
# <bool> disable dashboard deletion
|
||||
disableDeletion: false
|
||||
# <int> how often Grafana will scan for changed dashboards
|
||||
updateIntervalSeconds: 20
|
||||
# <bool> allow updating provisioned dashboards from the UI
|
||||
allowUiUpdates: false
|
||||
options:
|
||||
# <string, required> path to dashboard files on disk. Required when using the 'file' type
|
||||
path: /etc/grafana/dashboards
|
||||
# <bool> use folder names from filesystem to create folders in Grafana
|
||||
foldersFromFilesStructure: true
|
||||
@@ -0,0 +1,10 @@
|
||||
# Grafana Dashboards
|
||||
|
||||
## Homelab
|
||||
|
||||
1. https://grafana.com/grafana/dashboards/1860-node-exporter-full/
|
||||
2. https://grafana.com/grafana/dashboards/9578-alertmanager/
|
||||
|
||||
## Kubernetes
|
||||
|
||||
1. https://github.com/dotdc/grafana-dashboards-kubernetes/
|
||||
File diff suppressed because it is too large
Load Diff
+23268
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
+2647
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,21 @@
|
||||
# https://grafana.com/docs/grafana/latest/datasources/prometheus/
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: prometheus-homelab
|
||||
type: prometheus
|
||||
access: proxy
|
||||
# Access mode - proxy (server in the UI) or direct (browser in the UI).
|
||||
url: http://localhost:9090
|
||||
jsonData:
|
||||
httpMethod: POST
|
||||
manageAlerts: true
|
||||
prometheusType: Prometheus
|
||||
prometheusVersion: 2.49.0
|
||||
cacheLevel: "High"
|
||||
disableRecordingRules: false
|
||||
# As of Grafana 10, the Prometheus data source can be configured to query live dashboards
|
||||
# incrementally, instead of re-querying the entire duration on each dashboard refresh.
|
||||
# Increasing the duration of the incrementalQueryOverlapWindow will increase the size of every incremental query,
|
||||
# but might be helpful for instances that have inconsistent results for recent data.
|
||||
incrementalQueryOverlapWindow: 10m
|
||||
@@ -0,0 +1,52 @@
|
||||
{
|
||||
config,
|
||||
myvars,
|
||||
...
|
||||
}: {
|
||||
services.grafana = {
|
||||
enable = true;
|
||||
dataDir = "/var/lib/grafana";
|
||||
# DeclarativePlugins = with pkgs.grafanaPlugins; [ grafana-piechart-panel ];
|
||||
settings = {
|
||||
server = {
|
||||
http_addr = "127.0.0.1";
|
||||
http_port = 3351;
|
||||
protocol = "http";
|
||||
domain = "grafana.writefor.fun"; # must match the hostname the caddy reverse proxy serves Grafana under
|
||||
# Redirect to correct domain if the host header does not match the domain. Prevents DNS rebinding attacks.
|
||||
serve_from_sub_path = false;
|
||||
# Add subpath to the root_url if serve_from_sub_path is true
|
||||
root_url = "%(protocol)s://%(domain)s:%(http_port)s/";
|
||||
enforce_domain = false;
|
||||
read_timeout = "180s";
|
||||
# Enable HTTP compression, this can improve transfer speed and bandwidth utilization.
|
||||
enable_gzip = true;
|
||||
# Cdn for accelerating loading of frontend assets.
|
||||
# cdn_url = "https://cdn.jsdelivr.net/npm/grafana@7.5.5";
|
||||
};
|
||||
|
||||
security = {
|
||||
admin_user = myvars.username;
|
||||
admin_email = myvars.useremail;
|
||||
# Use file provider to read the admin password from a file.
|
||||
# https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/#file-provider
|
||||
admin_password = "$__file{${config.age.secrets."grafana-admin-password".path}}";
|
||||
};
|
||||
users = {
|
||||
allow_sign_up = false;
|
||||
# home_page = "";
|
||||
default_theme = "dark";
|
||||
};
|
||||
};
|
||||
|
||||
# Declaratively provision Grafana's data sources, dashboards, and alerting rules.
|
||||
# Using Grafana's alerting rules is not recommended; we use Prometheus Alertmanager instead.
|
||||
# https://grafana.com/docs/grafana/latest/administration/provisioning/#data-sources
|
||||
provision = {
|
||||
datasources.path = ./datasources.yml;
|
||||
dashboards.path = ./dashboards.yml;
|
||||
};
|
||||
};
|
||||
|
||||
environment.etc."grafana/dashboards".source = ./dashboards;
|
||||
}
|
||||
@@ -0,0 +1,3 @@
|
||||
# Homepage for my Homelab
|
||||
|
||||
> WIP, just a demo for now
|
||||
@@ -0,0 +1,8 @@
|
||||
---
|
||||
- About Me:
|
||||
- Blog:
|
||||
- abbr: Blog
|
||||
href: https://thiscute.world/
|
||||
- Github:
|
||||
- abbr: GH
|
||||
href: https://github.com/ryan4yin
|
||||
@@ -0,0 +1,3 @@
|
||||
# kana-docker:
|
||||
# socket: /var/run/docker.sock
|
||||
#
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 5.5 MiB |
@@ -0,0 +1,6 @@
|
||||
# https://gethomepage.dev/latest/configs/kubernetes/
|
||||
|
||||
# uses the default kubeconfig to access the cluster
|
||||
# read kubeconfig from $KUBECONFIG or $HOME/.kube/config
|
||||
# mode: default
|
||||
mode: disabled
|
||||
@@ -0,0 +1,68 @@
|
||||
---
|
||||
# For configuration options and examples, please see:
|
||||
# https://gethomepage.dev/latest/configs/services
|
||||
|
||||
- Proxmox VE 虚拟化集群:
|
||||
- PVE-UM560:
|
||||
icon: si-proxmox
|
||||
href: https://192.168.5.173:8006/
|
||||
description: "CPU: R5-5625U / MEM: 32G / DISK: 512G+4T*2"
|
||||
siteMonitor: https://192.168.5.173:8006/
|
||||
|
||||
- PVE-S500Plus:
|
||||
icon: si-proxmox
|
||||
href: https://192.168.5.174:8006/
|
||||
description: "CPU: R7-5825U / MEM: 64G / DISK: 1T"
|
||||
siteMonitor: https://192.168.5.174:8006/
|
||||
|
||||
- PVE-GTR5:
|
||||
icon: si-proxmox
|
||||
href: https://192.168.5.172:8006/
|
||||
description: "CPU: R9-5900HX / MEM: 64G / DISK: 1T"
|
||||
siteMonitor: https://192.168.5.172:8006/
|
||||
|
||||
- Homelab Monitoring:
|
||||
- Grafana:
|
||||
icon: si-grafana
|
||||
href: http://grafana.writefor.fun
|
||||
description: Data visualised on dashboards
|
||||
siteMonitor: http://grafana.writefor.fun
|
||||
- Prometheus Dashboard:
|
||||
icon: si-prometheus
|
||||
href: http://prometheus.writefor.fun
|
||||
description: Monitoring - Prometheus
|
||||
siteMonitor: http://prometheus.writefor.fun
|
||||
- Uptime Kuma:
|
||||
icon: si-uptimekuma
|
||||
href: http://uptime-kuma.writefor.fun
|
||||
description: Uptime Checking
|
||||
siteMonitor: http://uptime-kuma.writefor.fun
|
||||
|
||||
- Homelab Applications:
|
||||
- SFTPGO:
|
||||
icon: sftpgo.png
|
||||
href: "http://sftpgo.writefor.fun/web/admin/folders"
|
||||
description: WebDAV & SFTP server
|
||||
siteMonitor: http://sftpgo.writefor.fun/
|
||||
# - Kubernetes Monitoring:
|
||||
# # TODO: Update this
|
||||
# - Emby:
|
||||
# icon: emby.png
|
||||
# href: "http://emby.home/"
|
||||
# description: Media server
|
||||
# namespace: media # The kubernetes namespace the app resides in
|
||||
# app: emby # The name of the deployed app
|
||||
#
|
||||
# - Element Chat:
|
||||
# icon: matrix-light.png
|
||||
# href: https://chat.example.com
|
||||
# description: Matrix Synapse Powered Chat
|
||||
# app: matrix-element
|
||||
# namespace: comms
|
||||
# pod-selector: >-
|
||||
# app.kubernetes.io/instance in (
|
||||
# matrix-element,
|
||||
# matrix-media-repo,
|
||||
# matrix-media-repo-postgresql,
|
||||
# matrix-synapse
|
||||
# )
|
||||
@@ -0,0 +1,82 @@
|
||||
---
|
||||
# For configuration options and examples, please see:
|
||||
# https://gethomepage.dev/latest/configs/settings
|
||||
|
||||
title: Ryan Yin's Homelab
|
||||
base: http://home.writefor.fun/
|
||||
favicon: https://thiscute.world/favicon.ico
|
||||
|
||||
# https://developer.mozilla.org/en-US/docs/Web/Manifest/start_url
|
||||
# Used by some browsers to determine the start page of the web application
|
||||
startUrl: http://home.writefor.fun/
|
||||
|
||||
language: zh
|
||||
|
||||
# Define shared API provider options and secrets here,
|
||||
# You can then pass provider instead of apiKey in your widget configuration.
|
||||
providers:
|
||||
# read api keys from environment variables
|
||||
openweathermap: { { HOMEPAGE_VAR_WEATHERAPI_APIKEY } }
|
||||
weatherapi: { { HOMEPAGE_VAR_WEATHERAPI_APIKEY } }
|
||||
|
||||
background:
|
||||
image: /images/rolling-girls.png
|
||||
blur: sm # sm, "", md, xl... see https://tailwindcss.com/docs/backdrop-blur
|
||||
saturate: 50 # 0, 50, 100... see https://tailwindcss.com/docs/backdrop-saturate
|
||||
brightness: 50 # 0, 50, 75... see https://tailwindcss.com/docs/backdrop-brightness
|
||||
opacity: 50 # 0-100
|
||||
|
||||
theme: dark # or light
|
||||
|
||||
# Supported colors are:
|
||||
# slate, gray, zinc, neutral, stone, amber,
|
||||
# yellow, lime, green, emerald, teal, cyan,
|
||||
# sky, blue, indigo, violet, purple, fuchsia, pink, rose, red, white
|
||||
color: indigo
|
||||
|
||||
# make all cards in a row the same height.
|
||||
useEqualHeights: true
|
||||
|
||||
# Groups and its layout
|
||||
# Groups Name should match the name defined in your services.yaml or widgets.yaml
|
||||
layout:
|
||||
Proxmox VE 虚拟化集群:
|
||||
icon: si-proxmox
|
||||
tab: First
|
||||
|
||||
Group A:
|
||||
initiallyCollapsed: true # collapsed by default
|
||||
tab: First
|
||||
style: row
|
||||
columns: 4
|
||||
|
||||
Second Service Group:
|
||||
useEqualHeights: true # overrides global setting
|
||||
tab: Second
|
||||
columns: 4
|
||||
|
||||
Third Service Group:
|
||||
tab: Third
|
||||
style: row
|
||||
|
||||
Bookmark Group on Fourth Tab:
|
||||
tab: Fourth
|
||||
|
||||
Service Group on every Tab:
|
||||
style: row
|
||||
columns: 4
|
||||
|
||||
# https://gethomepage.dev/latest/configs/services/#icons
|
||||
# iconStyle: theme # optional, defaults to gradient
|
||||
|
||||
# Typing in homepage to quick search
|
||||
quicklaunch:
|
||||
searchDescriptions: true
|
||||
hideInternetSearch: true
|
||||
showSearchSuggestions: true
|
||||
hideVisitURL: true
|
||||
|
||||
# Show docker stats
|
||||
showStats: true
|
||||
|
||||
hideErrors: false
|
||||
@@ -0,0 +1,21 @@
|
||||
# TODO: add access to kubernetes cluster
|
||||
# - kubernetes:
|
||||
# cluster:
|
||||
# show: true
|
||||
# cpu: true
|
||||
# memory: true
|
||||
# showLabel: true
|
||||
# label: "cluster"
|
||||
# nodes:
|
||||
# show: true
|
||||
# cpu: true
|
||||
# memory: true
|
||||
# showLabel: true
|
||||
# - resources:
|
||||
# backend: resources
|
||||
# expanded: true
|
||||
# cpu: true
|
||||
# memory: true
|
||||
- search:
|
||||
provider: google
|
||||
target: _blank
|
||||
@@ -0,0 +1,25 @@
|
||||
{pkgs, ...}: let
|
||||
configDir = "/var/lib/homepage-dashboard";
|
||||
in {
|
||||
# https://github.com/NixOS/nixpkgs/blob/nixos-unstable/nixos/modules/services/misc/homepage-dashboard.nix
|
||||
services.homepage-dashboard = {
|
||||
enable = true;
|
||||
listenPort = 4401;
|
||||
openFirewall = false;
|
||||
};
|
||||
systemd.services.homepage-dashboard.environment = {
|
||||
HOMEPAGE_CONFIG_DIR = configDir;
|
||||
|
||||
# 1. The value of env var HOMEPAGE_VAR_XXX will replace {{HOMEPAGE_VAR_XXX}} in any config
|
||||
# HOMEPAGE_VAR_XXX_APIKEY = "myapikey";
|
||||
# 2. The value of env var HOMEPAGE_FILE_XXX must be a file path,
|
||||
# the contents of which will be used to replace {{HOMEPAGE_FILE_XXX}} in any config
|
||||
};
|
||||
# Install the homepage-dashboard configuration files
|
||||
system.activationScripts.installHomepageDashboardConfig = ''
|
||||
mkdir -p ${configDir}
|
||||
${pkgs.rsync}/bin/rsync -avz --chmod=D2755,F600 ${./config}/ ${configDir}/
|
||||
|
||||
${pkgs.systemdMinimal}/bin/systemctl restart homepage-dashboard
|
||||
'';
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
{
|
||||
# Replace dashy with gethomepage, because dashy is too slow to start/reload.
|
||||
|
||||
# # Install the dashy configuration file instead of symlink it
|
||||
# system.activationScripts.installDashyConfig = ''
|
||||
# install -Dm 600 ${./dashy_conf.yml} /etc/dashy/dashy_conf.yml
|
||||
# '';
|
||||
#
|
||||
# # https://github.com/NixOS/nixpkgs/blob/nixos-23.11/nixos/modules/virtualisation/oci-containers.nix
|
||||
# virtualisation.oci-containers.containers = {
|
||||
# # check its logs via `journalctl -u podman-dashy`
|
||||
# dashy = {
|
||||
# hostname = "dashy";
|
||||
# image = "lissy93/dashy:latest";
|
||||
# ports = ["127.0.0.1:4000:80"];
|
||||
# environment = {
|
||||
# "NODE_ENV" = "production";
|
||||
# };
|
||||
# volumes = [
|
||||
# "/etc/dashy/dashy_conf.yml:/app/public/conf.yml"
|
||||
# ];
|
||||
# autoStart = true;
|
||||
# # cmd = [];
|
||||
# };
|
||||
# };
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
{
|
||||
lib,
|
||||
mylib,
|
||||
...
|
||||
}: {
|
||||
imports = mylib.scanPaths ./.;
|
||||
|
||||
virtualisation = {
|
||||
docker.enable = lib.mkForce false;
|
||||
podman = {
|
||||
enable = true;
|
||||
# Create a `docker` alias for podman, to use it as a drop-in replacement
|
||||
dockerCompat = true;
|
||||
# Required for containers under podman-compose to be able to talk to each other.
|
||||
defaultNetwork.settings.dns_enabled = true;
|
||||
# Periodically prune Podman resources
|
||||
autoPrune = {
|
||||
enable = true;
|
||||
dates = "weekly";
|
||||
flags = ["--all"];
|
||||
};
|
||||
};
|
||||
|
||||
oci-containers = {
|
||||
backend = "podman";
|
||||
};
|
||||
};
|
||||
}
|
||||
@@ -0,0 +1,6 @@
|
||||
# Prometheus & Alertmanager
|
||||
|
||||
## Alert Rules
|
||||
|
||||
- [awesome-prometheus-alerts](https://github.com/samber/awesome-prometheus-alerts): Collection of
|
||||
Prometheus alerting rules
|
||||
@@ -0,0 +1,13 @@
|
||||
groups:
|
||||
- name: EmbeddedExporter
|
||||
|
||||
rules:
|
||||
- alert: CorednsPanicCount
|
||||
expr: "increase(coredns_panics_total[1m]) > 0"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: CoreDNS Panic Count (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Number of CoreDNS panics encountered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
@@ -0,0 +1,162 @@
|
||||
groups:
|
||||
- name: EmbeddedExporter
|
||||
|
||||
rules:
|
||||
- alert: EtcdInsufficientMembers
|
||||
expr: "count(etcd_server_id) % 2 == 0"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Etcd insufficient Members (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Etcd cluster should have an odd number of members\n VALUE = {{ $value }}\n LABELS =
|
||||
{{ $labels }}"
|
||||
|
||||
- alert: EtcdNoLeader
|
||||
expr: "etcd_server_has_leader == 0"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Etcd no Leader (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Etcd cluster has no leader\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdHighNumberOfLeaderChanges
|
||||
expr: "increase(etcd_server_leader_changes_seen_total[10m]) > 2"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd high number of leader changes (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Etcd leader changed more than 2 times during 10 minutes\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdHighNumberOfFailedGrpcRequests
|
||||
expr:
|
||||
'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[1m])) BY (grpc_service, grpc_method)
|
||||
/ sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.01'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd high number of failed GRPC requests (instance {{ $labels.instance }})
|
||||
description:
|
||||
"More than 1% GRPC request failure detected in Etcd\n VALUE = {{ $value }}\n LABELS =
|
||||
{{ $labels }}"
|
||||
|
||||
- alert: EtcdHighNumberOfFailedGrpcRequests
|
||||
expr:
|
||||
'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[1m])) BY (grpc_service, grpc_method)
|
||||
/ sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.05'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Etcd high number of failed GRPC requests (instance {{ $labels.instance }})
|
||||
description:
|
||||
"More than 5% GRPC request failure detected in Etcd\n VALUE = {{ $value }}\n LABELS =
|
||||
{{ $labels }}"
|
||||
|
||||
- alert: EtcdGrpcRequestsSlow
|
||||
expr:
|
||||
'histogram_quantile(0.99,
|
||||
sum(rate(grpc_server_handling_seconds_bucket{grpc_type="unary"}[1m])) by (grpc_service,
|
||||
grpc_method, le)) > 0.15'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd GRPC requests slow (instance {{ $labels.instance }})
|
||||
description:
|
||||
"GRPC requests slowing down, 99th percentile is over 0.15s\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdHighNumberOfFailedHttpRequests
|
||||
expr:
|
||||
"sum(rate(etcd_http_failed_total[1m])) BY (method) /
|
||||
sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd high number of failed HTTP requests (instance {{ $labels.instance }})
|
||||
description:
|
||||
"More than 1% HTTP failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{
|
||||
$labels }}"
|
||||
|
||||
- alert: EtcdHighNumberOfFailedHttpRequests
|
||||
expr:
|
||||
"sum(rate(etcd_http_failed_total[1m])) BY (method) /
|
||||
sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Etcd high number of failed HTTP requests (instance {{ $labels.instance }})
|
||||
description:
|
||||
"More than 5% HTTP failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{
|
||||
$labels }}"
|
||||
|
||||
- alert: EtcdHttpRequestsSlow
|
||||
expr:
|
||||
"histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[1m])) > 0.15"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd HTTP requests slow (instance {{ $labels.instance }})
|
||||
description:
|
||||
"HTTP requests slowing down, 99th percentile is over 0.15s\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdMemberCommunicationSlow
|
||||
expr:
|
||||
"histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) >
|
||||
0.15"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd member communication slow (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Etcd member communication slowing down, 99th percentile is over 0.15s\n VALUE = {{
|
||||
$value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdHighNumberOfFailedProposals
|
||||
expr: "increase(etcd_server_proposals_failed_total[1h]) > 5"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd high number of failed proposals (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Etcd server got more than 5 failed proposals in the past hour\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdHighFsyncDurations
|
||||
expr:
|
||||
"histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd high fsync durations (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Etcd WAL fsync duration increasing, 99th percentile is over 0.5s\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdHighCommitDurations
|
||||
expr:
|
||||
"histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) >
|
||||
0.25"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd high commit durations (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Etcd commit duration increasing, 99th percentile is over 0.25s\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
@@ -0,0 +1,123 @@
|
||||
groups:
|
||||
- name: EmbeddedExporter
|
||||
|
||||
rules:
|
||||
- alert: IstioKubernetesGatewayAvailabilityDrop
|
||||
expr:
|
||||
'min(kube_deployment_status_replicas_available{deployment="istio-ingressgateway",
|
||||
namespace="istio-system"}) without (instance, pod) < 2'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio Kubernetes gateway availability drop (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Gateway pods have dropped. Inbound traffic will likely be affected.\n VALUE = {{
|
||||
$value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioPilotHighTotalRequestRate
|
||||
expr: "sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5"
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio Pilot high total request rate (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Number of Istio Pilot push errors is too high (> 5%). Envoy sidecars might have
|
||||
outdated configuration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioMixerPrometheusDispatchesLow
|
||||
expr: 'sum(rate(mixer_runtime_dispatches_total{adapter=~"prometheus"}[1m])) < 180'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio Mixer Prometheus dispatches low (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Number of Mixer dispatches to Prometheus is too low. Istio metrics might not be being
|
||||
exported properly.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioHighTotalRequestRate
|
||||
expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) > 1000'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio high total request rate (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Global request rate in the service mesh is unusually high.\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioLowTotalRequestRate
|
||||
expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) < 100'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio low total request rate (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Global request rate in the service mesh is unusually low.\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioHigh4xxErrorRate
|
||||
expr:
|
||||
'sum(rate(istio_requests_total{reporter="destination", response_code=~"4.*"}[5m])) /
|
||||
sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio high 4xx error rate (instance {{ $labels.instance }})
|
||||
description:
|
||||
"High percentage of HTTP 4xx responses in Istio (> 5%).\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioHigh5xxErrorRate
|
||||
expr:
|
||||
'sum(rate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) /
|
||||
sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio high 5xx error rate (instance {{ $labels.instance }})
|
||||
description:
|
||||
"High percentage of HTTP 5xx responses in Istio (> 5%).\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioHighRequestLatency
|
||||
expr:
|
||||
'rate(istio_request_duration_milliseconds_sum{reporter="destination"}[1m]) /
|
||||
rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 100'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio high request latency (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Istio average requests execution is longer than 100ms.\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioLatency99Percentile
|
||||
expr:
|
||||
"histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by
|
||||
(destination_canonical_service, destination_workload_namespace, source_canonical_service,
|
||||
source_workload_namespace, le)) > 1000"
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio latency 99 percentile (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Istio 1% slowest requests are longer than 1000ms.\n VALUE = {{ $value }}\n LABELS =
|
||||
{{ $labels }}"
|
||||
|
||||
- alert: IstioPilotDuplicateEntry
|
||||
expr: "sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Istio Pilot Duplicate Entry (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Istio pilot duplicate entry error.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
@@ -0,0 +1,435 @@
|
||||
groups:
|
||||
- name: KubestateExporter
|
||||
|
||||
rules:
|
||||
- alert: KubernetesNodeNotReady
|
||||
expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes Node not ready (node {{ $labels.node }})
|
||||
description:
|
||||
"Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesNodeMemoryPressure
|
||||
expr: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes memory pressure (node {{ $labels.node }})
|
||||
description:
|
||||
"Node {{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS
|
||||
= {{ $labels }}"
|
||||
|
||||
- alert: KubernetesNodeDiskPressure
|
||||
expr: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes disk pressure (node {{ $labels.node }})
|
||||
description:
|
||||
"Node {{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS =
|
||||
{{ $labels }}"
|
||||
|
||||
- alert: KubernetesNodeNetworkUnavailable
|
||||
expr: 'kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes Node network unavailable (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Node {{ $labels.node }} has NetworkUnavailable condition\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesNodeOutOfPodCapacity
|
||||
expr:
|
||||
'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node)
|
||||
(0 * kube_pod_info{pod_template_hash=""})) / sum by (node)
|
||||
(kube_node_status_allocatable{resource="pods"}) * 100 > 90'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes Node out of pod capacity (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Node {{ $labels.node }} is out of pod capacity\n VALUE = {{ $value }}\n LABELS = {{
|
||||
$labels }}"
|
||||
|
||||
- alert: KubernetesContainerOomKiller
|
||||
expr:
|
||||
'(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total
|
||||
offset 10m >= 1) and ignoring (reason)
|
||||
min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m])
|
||||
== 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary:
|
||||
Kubernetes container oom killer ({{ $labels.namespace }}/{{ $labels.pod }}:{{
|
||||
$labels.container }})
|
||||
description:
|
||||
"Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has
|
||||
been OOMKilled {{ $value }} times in the last 10 minutes.\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesJobFailed
|
||||
expr: "kube_job_status_failed > 0"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes Job failed ({{ $labels.namespace }}/{{ $labels.job_name }})
|
||||
description:
|
||||
"Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete\n VALUE = {{
|
||||
$value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesCronjobSuspended
|
||||
expr: "kube_cronjob_spec_suspend != 0"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes CronJob suspended ({{ $labels.namespace }}/{{ $labels.cronjob }})
|
||||
description:
|
||||
"CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesPersistentvolumeclaimPending
|
||||
expr: 'kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary:
|
||||
Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{
|
||||
$labels.persistentvolumeclaim }})
|
||||
description:
|
||||
"PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is
|
||||
pending\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesVolumeOutOfDiskSpace
|
||||
expr:
|
||||
"kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes Volume out of disk space (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Volume is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesVolumeFullInFourDays
|
||||
expr: "predict_linear(kubelet_volume_stats_available_bytes[6h:5m], 4 * 24 * 3600) < 0"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes Volume full in four days (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to
|
||||
fill up within four days. Currently {{ $value | humanize }}% is available.\n VALUE = {{
|
||||
$value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesPersistentvolumeError
|
||||
expr:
|
||||
'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary:
|
||||
Kubernetes PersistentVolume error ({{ $labels.persistentvolume }})
|
||||
description:
|
||||
"Persistent volume {{ $labels.persistentvolume }} is in bad state\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesStatefulsetDown
|
||||
expr: "kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0"
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes StatefulSet down ({{ $labels.namespace }}/{{ $labels.statefulset }})
|
||||
description:
|
||||
"StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down\n VALUE = {{
|
||||
$value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesHpaScaleInability
|
||||
expr:
|
||||
'kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} ==
|
||||
1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes HPA scale inability (instance {{ $labels.instance }})
|
||||
description:
|
||||
"HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to
|
||||
scale\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesHpaMetricsUnavailability
|
||||
expr:
|
||||
'kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"}
|
||||
== 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes HPA metrics unavailability (instance {{ $labels.instance }})
|
||||
description:
|
||||
"HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect
|
||||
metrics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesHpaScaleMaximum
|
||||
expr:
|
||||
"kube_horizontalpodautoscaler_status_desired_replicas >=
|
||||
kube_horizontalpodautoscaler_spec_max_replicas"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Kubernetes HPA scale maximum (instance {{ $labels.instance }})
|
||||
description:
|
||||
"HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum
|
||||
number of desired pods\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesHpaUnderutilized
|
||||
expr:
|
||||
"max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) ==
|
||||
kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler) > 3"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Kubernetes HPA underutilized (instance {{ $labels.instance }})
|
||||
description:
|
||||
"HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is constantly at
|
||||
minimum replicas for 50% of the time. Potential cost saving here.\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesPodNotHealthy
|
||||
expr: 'sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes Pod not healthy ({{ $labels.namespace }}/{{ $labels.pod }})
|
||||
description:
|
||||
"Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for
|
||||
longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesPodCrashLooping
|
||||
expr: "increase(kube_pod_container_status_restarts_total[1m]) > 3"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes pod crash looping ({{ $labels.namespace }}/{{ $labels.pod }})
|
||||
description:
|
||||
"Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesReplicasetReplicasMismatch
|
||||
expr: "kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas"
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary:
|
||||
Kubernetes ReplicaSet mismatch ({{ $labels.namespace }}/{{ $labels.replicaset }})
|
||||
description:
|
||||
"ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch\n VALUE
|
||||
= {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesDeploymentReplicasMismatch
|
||||
expr: "kube_deployment_spec_replicas != kube_deployment_status_replicas_available"
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary:
|
||||
Kubernetes Deployment replicas mismatch ({{ $labels.namespace }}/{{ $labels.deployment
|
||||
}})
|
||||
description:
|
||||
"Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch\n VALUE
|
||||
= {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesStatefulsetReplicasMismatch
|
||||
expr: "kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas"
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})
|
||||
description:
|
||||
"StatefulSet does not match the expected number of replicas.\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesDeploymentGenerationMismatch
|
||||
expr: "kube_deployment_status_observed_generation != kube_deployment_metadata_generation"
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary:
|
||||
Kubernetes Deployment generation mismatch ({{ $labels.namespace }}/{{ $labels.deployment
|
||||
}})
|
||||
description:
|
||||
"Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been
|
||||
rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesStatefulsetGenerationMismatch
|
||||
expr: "kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation"
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary:
|
||||
Kubernetes StatefulSet generation mismatch ({{ $labels.namespace }}/{{
|
||||
$labels.statefulset }})
|
||||
description:
|
||||
"StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not
|
||||
been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesStatefulsetUpdateNotRolledOut
|
||||
expr:
|
||||
"max without (revision) (kube_statefulset_status_current_revision unless
|
||||
kube_statefulset_status_update_revision) * (kube_statefulset_replicas !=
|
||||
kube_statefulset_status_replicas_updated)"
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary:
|
||||
Kubernetes StatefulSet update not rolled out ({{ $labels.namespace }}/{{
|
||||
$labels.statefulset }})
|
||||
description:
|
||||
"StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been
|
||||
rolled out.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesDaemonsetRolloutStuck
|
||||
expr:
|
||||
"kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100
|
||||
< 100 or kube_daemonset_status_desired_number_scheduled -
|
||||
kube_daemonset_status_current_number_scheduled > 0"
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary:
|
||||
Kubernetes DaemonSet rollout stuck ({{ $labels.namespace }}/{{ $labels.daemonset }})
|
||||
description:
|
||||
"Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not
|
||||
scheduled or not ready\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesDaemonsetMisscheduled
|
||||
expr: "kube_daemonset_status_number_misscheduled > 0"
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary:
|
||||
Kubernetes DaemonSet misscheduled ({{ $labels.namespace }}/{{ $labels.daemonset }})
|
||||
description:
|
||||
"Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running
|
||||
where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesCronjobTooLong
|
||||
expr: "time() - kube_cronjob_next_schedule_time > 3600"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes CronJob too long ({{ $labels.namespace }}/{{ $labels.cronjob }})
|
||||
description:
|
||||
"CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to
|
||||
complete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesJobSlowCompletion
|
||||
expr: "kube_job_spec_completions - kube_job_status_succeeded - kube_job_status_failed > 0"
|
||||
for: 12h
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes job slow completion ({{ $labels.namespace }}/{{ $labels.job_name }})
|
||||
description:
|
||||
"Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in
|
||||
time.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesApiServerErrors
|
||||
expr:
|
||||
'sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) /
|
||||
sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes API server errors (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Kubernetes API server is experiencing high error rate\n VALUE = {{ $value }}\n LABELS
|
||||
= {{ $labels }}"
|
||||
|
||||
- alert: KubernetesApiClientErrors
|
||||
expr:
|
||||
'(sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) /
|
||||
sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes API client errors (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Kubernetes API client is experiencing high error rate\n VALUE = {{ $value }}\n LABELS
|
||||
= {{ $labels }}"
|
||||
|
||||
- alert: KubernetesClientCertificateExpiresNextWeek
|
||||
expr:
|
||||
'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and
|
||||
histogram_quantile(0.01, sum by (job, le)
|
||||
(rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) <
|
||||
7*24*60*60'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }})
|
||||
description:
|
||||
"A client certificate used to authenticate to the apiserver is expiring next
|
||||
week.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesClientCertificateExpiresSoon
|
||||
expr:
|
||||
'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and
|
||||
histogram_quantile(0.01, sum by (job, le)
|
||||
(rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) <
|
||||
24*60*60'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes client certificate expires soon (instance {{ $labels.instance }})
|
||||
description:
|
||||
"A client certificate used to authenticate to the apiserver is expiring in less than
|
||||
24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesApiServerLatency
|
||||
expr:
|
||||
'histogram_quantile(0.99,
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}
|
||||
[10m])) WITHOUT (instance, resource)) > 1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes API server latency (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{
|
||||
$labels.verb }} {{ $labels.resource }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels
|
||||
}}"
|
||||
@@ -0,0 +1,508 @@
|
||||
groups:
|
||||
- name: NodeExporter
|
||||
|
||||
rules:
|
||||
- alert: HostOutOfMemory
|
||||
expr:
|
||||
'(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance)
|
||||
group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of memory (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels
|
||||
}}"
|
||||
|
||||
- alert: HostMemoryUnderMemoryPressure
|
||||
expr:
|
||||
'(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename)
|
||||
node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host memory under memory pressure (instance {{ $labels.instance }})
|
||||
description:
|
||||
"The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{
|
||||
$value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostMemoryIsUnderutilized
|
||||
expr:
|
||||
'(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes *
|
||||
100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 1w
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host Memory is underutilized (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{
|
||||
$labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualNetworkThroughputIn
|
||||
expr:
|
||||
'(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) *
|
||||
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual network throughput in (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{
|
||||
$value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualNetworkThroughputOut
|
||||
expr:
|
||||
'(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) *
|
||||
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual network throughput out (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{
|
||||
$value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskReadRate
|
||||
expr:
|
||||
'(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) *
|
||||
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk read rate (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS =
|
||||
{{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskWriteRate
|
||||
expr:
|
||||
'(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) *
|
||||
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk write rate (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS =
|
||||
{{ $labels }}"
|
||||
|
||||
- alert: HostOutOfDiskSpace
|
||||
expr:
|
||||
'((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance,
|
||||
device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename)
|
||||
node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of disk space (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostDiskWillFillIn24Hours
|
||||
expr:
|
||||
'((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance,
|
||||
device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 *
|
||||
3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) *
|
||||
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Filesystem is predicted to run out of space within the next 24 hours at current write
|
||||
rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostOutOfInodes
|
||||
expr:
|
||||
'(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"}
|
||||
* 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) *
|
||||
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of inodes (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostFilesystemDeviceError
|
||||
expr: "node_filesystem_device_error == 1"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host filesystem device error (instance {{ $labels.instance }})
|
||||
description:
|
||||
"{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }}
|
||||
filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostInodesWillFillIn24Hours
|
||||
expr:
|
||||
'(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"}
|
||||
* 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 *
|
||||
3600) < 0 and ON (instance, device, mountpoint)
|
||||
node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename)
|
||||
node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Filesystem is predicted to run out of inodes within the next 24 hours at current write
|
||||
rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskReadLatency
|
||||
expr:
|
||||
'(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m])
|
||||
> 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left
|
||||
(nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk read latency (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS =
|
||||
{{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskWriteLatency
|
||||
expr:
|
||||
'(rate(node_disk_write_time_seconds_total[1m]) /
|
||||
rate(node_disk_writes_completed_total[1m]) > 0.1 and
|
||||
rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename)
|
||||
node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk write latency (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS =
|
||||
{{ $labels }}"
|
||||
|
||||
- alert: HostHighCpuLoad
|
||||
expr:
|
||||
'(sum by (instance) (avg by (mode, instance)
|
||||
(rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left
|
||||
(nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host high CPU load (instance {{ $labels.instance }})
|
||||
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostCpuIsUnderutilized
|
||||
expr:
|
||||
'(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance)
|
||||
group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 1w
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host CPU is underutilized (instance {{ $labels.instance }})
|
||||
description:
|
||||
"CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{
|
||||
$value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostCpuStealNoisyNeighbor
|
||||
expr:
|
||||
'(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) *
|
||||
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
|
||||
description:
|
||||
"CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may
|
||||
be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostCpuHighIowait
|
||||
expr:
|
||||
'(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) *
|
||||
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host CPU high iowait (instance {{ $labels.instance }})
|
||||
description:
|
||||
"CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{
|
||||
$value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskIo
|
||||
expr:
|
||||
'(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename)
|
||||
node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk IO (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Time spent in IO is too high on {{ $labels.instance }}. Check storage for
|
||||
issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostContextSwitching
|
||||
expr:
|
||||
'((rate(node_context_switches_total[5m])) / (count without(cpu, mode)
|
||||
(node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename)
|
||||
node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host context switching (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Context switching is growing on the node (> 10000 / CPU / s)\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostSwapIsFillingUp
|
||||
expr:
|
||||
'((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) *
|
||||
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host swap is filling up (instance {{ $labels.instance }})
|
||||
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostSystemdServiceCrashed
|
||||
expr:
|
||||
'(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename)
|
||||
node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host systemd service crashed (instance {{ $labels.instance }})
|
||||
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostPhysicalComponentTooHot
|
||||
expr:
|
||||
'((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor)
|
||||
node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename)
|
||||
node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host physical component too hot (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNodeOvertemperatureAlarm
|
||||
expr:
|
||||
'(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename)
|
||||
node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{
|
||||
$labels }}"
|
||||
|
||||
- alert: HostRaidArrayGotInactive
|
||||
expr:
|
||||
'(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename)
|
||||
node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host RAID array got inactive (instance {{ $labels.instance }})
|
||||
description:
|
||||
"RAID array {{ $labels.device }} is in a degraded state due to one or more disk
|
||||
failures. The number of spare drives is insufficient to fix the issue
|
||||
automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostRaidDiskFailure
|
||||
expr:
|
||||
'(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename)
|
||||
node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host RAID disk failure (instance {{ $labels.instance }})
|
||||
description:
|
||||
"At least one device in RAID array on {{ $labels.instance }} failed. Array {{
|
||||
$labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostKernelVersionDeviations
  # Fires when more than one distinct kernel release (major.minor.patch) is reported across
  # all scraped hosts. count() removes every label from the result, so the result cannot be
  # joined back onto node_uname_info by `instance` (such a join matches nothing and the
  # alert would never fire). The alert is therefore fleet-wide and carries no instance label.
  expr:
    'count(sum(label_replace(node_uname_info, "kernel", "$1", "release",
    "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1'
  for: 6h
  labels:
    severity: warning
  annotations:
    summary: Host kernel version deviations
    description:
      "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostOomKillDetected
|
||||
expr:
|
||||
'(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename)
|
||||
node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host OOM kill detected (instance {{ $labels.instance }})
|
||||
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostEdacCorrectableErrorsDetected
  # Fires as soon as the EDAC correctable-error counter grows within the 1m window.
  # The description states the same 1-minute window the expression actually uses.
  expr:
    '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left
    (nodename) node_uname_info{nodename=~".+"}'
  for: 0m
  labels:
    severity: info
  annotations:
    summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
    description:
      "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory
      errors reported by EDAC in the last minute.\n VALUE = {{ $value }}\n LABELS = {{
      $labels }}"
|
||||
|
||||
- alert: HostEdacUncorrectableErrorsDetected
  # Fires whenever the EDAC uncorrectable-error counter is non-zero. The expression compares
  # the raw counter (no rate/increase window), so $value is the cumulative count and the
  # description must not claim a time window.
  expr:
    '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename)
    node_uname_info{nodename=~".+"}'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
    description:
      "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory
      errors reported by EDAC (cumulative counter value).\n VALUE = {{ $value }}\n LABELS = {{
      $labels }}"
|
||||
|
||||
- alert: HostNetworkReceiveErrors
  # Fires when more than 1% of received packets on an interface are errors (2m rate, held 2m).
  # $value is the errors/packets ratio, not an error count, so it is reported as a ratio.
  expr:
    '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m])
    > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host Network Receive Errors (instance {{ $labels.instance }})
    description:
      "Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio of
      {{ printf \"%.4f\" $value }} (errors per received packet) over the last two
      minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkTransmitErrors
  # Fires when more than 1% of transmitted packets on an interface are errors (2m rate, held 2m).
  # $value is the errors/packets ratio, not an error count, so it is reported as a ratio.
  expr:
    '(rate(node_network_transmit_errs_total[2m]) /
    rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename)
    node_uname_info{nodename=~".+"}'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host Network Transmit Errors (instance {{ $labels.instance }})
    description:
      "Host {{ $labels.instance }} interface {{ $labels.device }} has a transmit error ratio of
      {{ printf \"%.4f\" $value }} (errors per transmitted packet) over the last two
      minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkInterfaceSaturated
|
||||
expr:
|
||||
'((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) +
|
||||
rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) /
|
||||
node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) *
|
||||
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
|
||||
description:
|
||||
"The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting
|
||||
overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkBondDegraded
|
||||
expr:
|
||||
'((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename)
|
||||
node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{
|
||||
$value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostConntrackLimit
|
||||
expr:
|
||||
'(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance)
|
||||
group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host conntrack limit (instance {{ $labels.instance }})
|
||||
description:
|
||||
"The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{
|
||||
$labels }}"
|
||||
|
||||
- alert: HostClockSkew
|
||||
expr:
|
||||
'((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or
|
||||
(node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) *
|
||||
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host clock skew (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this
|
||||
host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostClockNotSynchronising
|
||||
expr:
|
||||
'(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) *
|
||||
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host clock not synchronising (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value
|
||||
}}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostRequiresReboot
|
||||
expr:
|
||||
'(node_reboot_required > 0) * on(instance) group_left (nodename)
|
||||
node_uname_info{nodename=~".+"}'
|
||||
for: 4h
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host requires reboot (instance {{ $labels.instance }})
|
||||
description:
|
||||
"{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{
|
||||
$labels }}"
|
||||
@@ -0,0 +1,157 @@
|
||||
{
|
||||
config,
|
||||
myvars,
|
||||
...
|
||||
}: {
|
||||
# https://prometheus.io/docs/prometheus/latest/configuration/configuration/
|
||||
services.prometheus = {
|
||||
enable = true;
|
||||
checkConfig = true;
|
||||
listenAddress = "127.0.0.1";
|
||||
port = 9090;
|
||||
webExternalUrl = "http://prometheus.writefor.fun";
|
||||
|
||||
extraFlags = ["--storage.tsdb.retention.time=45d"];
|
||||
# Directory below /var/lib to store Prometheus metrics data.
|
||||
stateDir = "prometheus2";
|
||||
|
||||
# Reload prometheus when configuration file changes (instead of restart).
|
||||
enableReload = true;
|
||||
# https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_read
|
||||
# remoteRead = [];
|
||||
|
||||
# Rules are read from these files.
|
||||
# https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/
|
||||
#
|
||||
# Prometheus supports two types of rules which may be configured
|
||||
# and then evaluated at regular intervals:
|
||||
# 1. Recording rules
|
||||
# Recording rules allow you to precompute frequently needed or computationally
|
||||
# expensive expressions and save their result as a new set of time series.
|
||||
# Querying the precomputed result will then often be much faster than executing the original expression.
|
||||
# This is especially useful for dashboards, which need to query the same expression repeatedly every time they refresh.
|
||||
# 2. Alerting rules
|
||||
# Alerting rules allow you to define alert conditions based on Prometheus expression language expressions
|
||||
# and to send notifications about firing alerts to an external service.
|
||||
ruleFiles = [
|
||||
./alert_rules/node-exporter.yml
|
||||
./alert_rules/kubestate-exporter.yml
|
||||
./alert_rules/etcd_embedded-exporter.yml
|
||||
./alert_rules/istio_embedded-exporter.yml
|
||||
./alert_rules/coredns_embedded-exporter.yml
|
||||
|
||||
# ./recording_rules.yml
|
||||
];
|
||||
|
||||
# specifies a set of targets and parameters describing how to scrape metrics from them.
|
||||
# https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config
|
||||
scrapeConfigs = [
|
||||
# --- Hosts --- #
|
||||
{
|
||||
job_name = "node-exporter";
|
||||
scrape_interval = "30s";
|
||||
metrics_path = "/metrics";
|
||||
static_configs = [
|
||||
{
|
||||
# All my NixOS hosts.
|
||||
targets =
|
||||
map (addr: "${addr.ipv4}:9100")
|
||||
(builtins.attrValues myvars.networking.hostsAddr);
|
||||
labels.type = "node";
|
||||
}
|
||||
];
|
||||
}
|
||||
|
||||
# --- Homelab Applications --- #
|
||||
|
||||
{
|
||||
job_name = "dnsmasq-exporter";
|
||||
scrape_interval = "30s";
|
||||
metrics_path = "/metrics";
|
||||
static_configs = [
|
||||
{
|
||||
targets = ["${myvars.networking.hostsAddr.suzi.ipv4}:9153"];
|
||||
labels.type = "app";
|
||||
labels.app = "dnsmasq";
|
||||
}
|
||||
];
|
||||
}
|
||||
|
||||
{
|
||||
job_name = "v2ray-exporter";
|
||||
scrape_interval = "30s";
|
||||
metrics_path = "/metrics";
|
||||
static_configs = [
|
||||
{
|
||||
targets = ["${myvars.networking.hostsAddr.rakushun.ipv4}:9153"];
|
||||
labels.type = "app";
|
||||
labels.app = "v2ray";
|
||||
}
|
||||
];
|
||||
}
|
||||
|
||||
{
  # Scrape job for SFTPGo: /metrics on rakushun, port 10000, every 30s.
  job_name = "sftpgo-embedded-exporter";
  scrape_interval = "30s";
  metrics_path = "/metrics";
  static_configs = [
    {
      targets = ["${myvars.networking.hostsAddr.rakushun.ipv4}:10000"];
      labels.type = "app";
      # Application label attached to every series scraped by this job.
      labels.app = "sftpgo";
    }
  ];
}
|
||||
];
|
||||
|
||||
# specifies Alertmanager instances the Prometheus server sends alerts to
|
||||
# https://prometheus.io/docs/prometheus/latest/configuration/configuration/#alertmanager_config
|
||||
alertmanagers = [{static_configs = [{targets = ["localhost:9093"];}];}];
|
||||
};
|
||||
|
||||
services.prometheus.alertmanager = {
|
||||
enable = true;
|
||||
listenAddress = "127.0.0.1";
|
||||
port = 9093;
|
||||
webExternalUrl = "http://alertmanager.writefor.fun";
|
||||
logLevel = "info";
|
||||
|
||||
environmentFile = config.age.secrets."alertmanager.env".path;
|
||||
configuration = {
|
||||
global = {
|
||||
# The smarthost and SMTP sender used for mail notifications.
|
||||
smtp_smarthost = "smtp.qq.com:465";
|
||||
smtp_from = "$SMTP_SENDER_EMAIL";
|
||||
smtp_auth_username = "$SMTP_AUTH_USERNAME";
|
||||
smtp_auth_password = "$SMTP_AUTH_PASSWORD";
|
||||
# smtp.qq.com:465 supports SSL only, so we need to disable TLS here.
|
||||
# https://service.mail.qq.com/detail/0/310
|
||||
smtp_require_tls = false;
|
||||
};
|
||||
route = {
|
||||
receiver = "default";
|
||||
routes = [
|
||||
{
|
||||
group_by = ["host"];
|
||||
group_wait = "5m";
|
||||
group_interval = "5m";
|
||||
repeat_interval = "4h";
|
||||
receiver = "default";
|
||||
}
|
||||
];
|
||||
};
|
||||
receivers = [
|
||||
{
|
||||
name = "default";
|
||||
email_configs = [
|
||||
{
|
||||
to = "ryan4yin@linux.com";
|
||||
# Whether to notify about resolved alerts.
|
||||
send_resolved = true;
|
||||
}
|
||||
];
|
||||
}
|
||||
];
|
||||
};
|
||||
};
|
||||
}
|
||||
@@ -0,0 +1,90 @@
|
||||
{
|
||||
# dae (running on aquamarine) does not provide an HTTP/SOCKS5 proxy server, so we use v2ray here.
|
||||
# https://github.com/v2fly
|
||||
services.v2ray = {
|
||||
enable = true;
|
||||
config = {
|
||||
# for monitoring
|
||||
"stats" = {};
|
||||
"api" = {
|
||||
"tag" = "api";
|
||||
"services" = [
|
||||
"StatsService"
|
||||
];
|
||||
};
|
||||
"policy" = {
|
||||
"levels" = {
|
||||
"0" = {
|
||||
"statsUserUplink" = true;
|
||||
"statsUserDownlink" = true;
|
||||
};
|
||||
};
|
||||
"system" = {
|
||||
"statsInboundUplink" = true;
|
||||
"statsInboundDownlink" = true;
|
||||
"statsOutboundUplink" = true;
|
||||
"statsOutboundDownlink" = true;
|
||||
};
|
||||
};
|
||||
|
||||
inbounds = [
|
||||
# core inbound
|
||||
{
|
||||
listen = "0.0.0.0";
|
||||
port = 7890;
|
||||
protocol = "http";
|
||||
}
|
||||
{
|
||||
listen = "0.0.0.0";
|
||||
port = 7891;
|
||||
protocol = "socks";
|
||||
settings = {
|
||||
auth = "noauth";
|
||||
udp = true;
|
||||
};
|
||||
}
|
||||
|
||||
# for monitoring
|
||||
{
|
||||
"tag" = "api";
|
||||
"listen" = "127.0.0.1";
|
||||
"port" = 54321;
|
||||
"protocol" = "dokodemo-door";
|
||||
"settings" = {
|
||||
"address" = "127.0.0.1";
|
||||
};
|
||||
}
|
||||
];
|
||||
outbounds = [
|
||||
# forward traffic directly via system's default network(to dae proxy running on aquamarine)
|
||||
{
|
||||
protocol = "freedom";
|
||||
tag = "freedom";
|
||||
}
|
||||
];
|
||||
|
||||
# for monitoring
|
||||
"routing" = {
|
||||
"rules" = [
|
||||
{
|
||||
"inboundTag" = [
|
||||
"api"
|
||||
];
|
||||
"outboundTag" = "api";
|
||||
"type" = "field";
|
||||
}
|
||||
];
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
# https://github.com/NixOS/nixpkgs/blob/nixos-23.11/nixos/modules/services/monitoring/prometheus/exporters/v2ray.nix
|
||||
# https://github.com/wi1dcard/v2ray-exporter
|
||||
services.prometheus.exporters.v2ray = {
|
||||
enable = true;
|
||||
listenAddress = "0.0.0.0";
|
||||
port = 9153;
|
||||
openFirewall = false;
|
||||
v2rayEndpoint = "127.0.0.1:54321";
|
||||
};
|
||||
}
|
||||
@@ -0,0 +1,83 @@
|
||||
{pkgs, ...}: let
|
||||
passwordFile = "/etc/agenix/restic-password";
|
||||
sshKeyPath = "/etc/agenix/ssh-key-for-restic-backup";
|
||||
rcloneConfigFile = "/etc/agenix/rclone-conf-for-restic-backup";
|
||||
in {
|
||||
# https://github.com/NixOS/nixpkgs/blob/nixos-23.11/nixos/modules/services/backup/restic.nix
|
||||
services.restic.backups = {
|
||||
homelab-backup = {
|
||||
inherit passwordFile;
|
||||
initialize = true; # Initialize the repository if it doesn't exist.
|
||||
repository = "rclone:smb-downloads:/Downloads/proxmox-backup/"; # backup to a rclone remote
|
||||
|
||||
# rclone related
|
||||
# rcloneOptions = {
|
||||
# bwlimit = "100M"; # Limit the bandwidth used by rclone.
|
||||
# };
|
||||
inherit rcloneConfigFile;
|
||||
|
||||
# Which local paths to backup, in addition to ones specified via `dynamicFilesFrom`.
|
||||
paths = [
|
||||
"/tmp/restic-backup-temp"
|
||||
];
|
||||
#
|
||||
# A script that produces a list of files to back up. The
|
||||
# results of this command are given to the '--files-from'
|
||||
# option. The result is merged with paths specified via `paths`.
|
||||
# dynamicFilesFrom = "find /home/matt/git -type d -name .git";
|
||||
#
|
||||
# Patterns to exclude when backing up. See
|
||||
# https://restic.readthedocs.io/en/latest/040_backup.html#excluding-files
|
||||
# for details on syntax.
|
||||
exclude = [];
|
||||
|
||||
# A script that must run before starting the backup process.
|
||||
# Script run before the backup starts: copies /var/lib/vz from every listed host into the
# staging directory /tmp/restic-backup-temp/<host>, which is the path restic backs up.
#
# The nu script is passed to the shell inside single quotes, so it must not contain any
# apostrophe (one would terminate the quoted argument early and break the whole command).
backupPrepareCommand = ''
  ${pkgs.nushell}/bin/nu -c '
    let pve_nodes = [
      # nodes of the proxmox cluster
      "um560"
      "gtr5"
      "s500plus"

      # others
      "kana"
    ]

    # The cleanup command removes the staging directory after every run, and rsync only
    # creates the last component of its destination, so recreate the parent first.
    mkdir /tmp/restic-backup-temp

    pve_nodes | each {|it|
      # Kept on one line so the call does not depend on backslash line continuation.
      rsync -avz -e "ssh -i ${sshKeyPath}" $"($it):/var/lib/vz" $"/tmp/restic-backup-temp/($it)"
    }
  '
'';
|
||||
# A script that must run after finishing the backup process.
|
||||
backupCleanupCommand = "rm -rf /tmp/restic-backup-temp";
|
||||
|
||||
# Extra extended options to be passed to the restic --option flag.
|
||||
# extraOptions = [];
|
||||
|
||||
# Extra arguments passed to restic backup.
|
||||
# extraBackupArgs = [
|
||||
# "--exclude-file=/etc/restic/excludes-list"
|
||||
# ];
|
||||
|
||||
# repository = "/mnt/backup-hdd"; # backup to a local directory
|
||||
# When to run the backup. See {manpage}`systemd.timer(5)` for details.
|
||||
timerConfig = {
|
||||
OnCalendar = "01:30";
|
||||
RandomizedDelaySec = "1h";
|
||||
};
|
||||
# A list of options (--keep-* et al.) for 'restic forget --prune',
|
||||
# to automatically prune old snapshots.
|
||||
# The 'forget' command is run *after* the 'backup' command, so
|
||||
# keep that in mind when constructing the --keep-* options.
|
||||
pruneOpts = [
|
||||
"--keep-daily 3"
|
||||
"--keep-weekly 3"
|
||||
"--keep-monthly 3"
|
||||
"--keep-yearly 3"
|
||||
];
|
||||
};
|
||||
};
|
||||
}
|
||||
@@ -0,0 +1,97 @@
|
||||
# SFTPGo file-sharing service.
# WebDAV and the web UI / REST API listen on loopback only (127.0.0.1);
# the Prometheus telemetry endpoint listens on all interfaces.
# NOTE(review): the loopback bindings plus `client_ip_proxy_header` suggest a
# reverse proxy in front of this service - confirm against the proxy config.
{config, ...}: {
  # Read SFTPGO_DEFAULT_ADMIN_USERNAME and SFTPGO_DEFAULT_ADMIN_PASSWORD from a file
  systemd.services.sftpgo.serviceConfig.EnvironmentFile = config.age.secrets."sftpgo.env".path;

  services.sftpgo = {
    enable = true;
    user = "sftpgo";
    dataDir = "/var/lib/sftpgo";
    extraArgs = [
      "--log-level"
      "info"
    ];
    # https://github.com/drakkan/sftpgo/blob/2.5.x/docs/full-configuration.md
    settings = {
      common = {
        # Auto-blocking policy for SFTPGo; helps to prevent DoS (Denial of Service)
        # and brute-force password guessing.
        defender = {
          # SFTPGo's configuration key is `enabled` (not the NixOS-style `enable`).
          enabled = true;
        };
      };
      # Where to store sftpgo's data
      data_provider = {
        driver = "sqlite";
        name = "sftpgo.db";
        password_hashing = {
          algo = "argon2id";
          # Options for the argon2id hashing algorithm.
          # The memory and iterations parameters control the computational cost of hashing the password.
          argon2_options = {
            memory = 65536; # KiB
            iterations = 2; # The number of iterations over the memory.
            parallelism = 2; # The number of threads (or lanes) used by the algorithm.
          };
        };
        password_validation = {
          # What entropy value should I use?
          # Somewhere in the 50-70 range seems "reasonable".
          # https://github.com/wagslane/go-password-validator#what-entropy-value-should-i-use
          admins.min_entropy = 60;
          users.min_entropy = 60;
        };
        # Cache passwords in memory to avoid hashing the same password multiple times (hashing is costly).
        password_caching = true;
        # Create the default admin user via the environment variables
        # SFTPGO_DEFAULT_ADMIN_USERNAME and SFTPGO_DEFAULT_ADMIN_PASSWORD
        create_default_admin = true;
      };

      # WebDAV is a popular protocol for file sharing, better than CIFS/SMB, NFS, etc.
      # It's safe to use WebDAV over HTTPS on public networks.
      webdavd.bindings = [
        {
          address = "127.0.0.1";
          port = 3303;
        }
      ];
      # HTTP server that provides a simple web interface to manage the server.
      httpd.bindings = [
        {
          address = "127.0.0.1";
          enable_https = false;
          port = 3302;
          client_ip_proxy_header = "X-Forwarded-For";
          # A basic built-in web interface that allows you to manage users,
          # virtual folders, admins and connections.
          # url: http://127.0.0.1:3302/web/admin
          enable_web_admin = true;
          # A basic front-end web interface for your users.
          # It allows end-users to browse and manage their files and change their credentials.
          enable_web_client = true;
          enable_rest_api = true;
        }
      ];
      # Prometheus metrics
      telemetry = {
        bind_port = 10000;
        bind_address = "0.0.0.0";
        # auth_user_file = "";
      };
      # Multi-factor authentication settings
      mfa.totp = [
        {
          # Unique configuration name, not visible to the authentication apps.
          # Should not be changed after the first user has been created.
          name = "SFTPGo";
          # Name of the issuing Organization/Company
          issuer = "SFTPGo";
          # Algorithm to use for HMAC.
          # Currently the Google Authenticator app on iPhone seems to only support sha1.
          algo = "sha1";
        }
      ];
      # SMTP configuration enables SFTPGo email sending capabilities
      # smtp = {};
    };
  };
}
|
||||
@@ -0,0 +1,117 @@
|
||||
# Headless Transmission BitTorrent daemon.
# The RPC/web interface binds to 127.0.0.1:9091 and its firewall port stays
# closed; only the peer ports are opened. Secrets such as `rpc-password` come
# from the agenix-managed credentials file, not from the Nix store.
{
  config,
  myvars,
  ...
}: let
  dataDir = "/var/lib/transmission";
  name = "transmission";
in {
  # the headless Transmission BitTorrent daemon
  # https://github.com/NixOS/nixpkgs/blob/nixos-23.11/nixos/modules/services/torrent/transmission.nix
  # https://wiki.archlinux.org/title/transmission
  services.transmission = {
    enable = true;
    user = name;
    group = name;
    home = dataDir;
    downloadDirPermissions = "0770";

    # Whether to enable tweaking of kernel parameters to open many more connections at the same time.
    # Note that you may also want to increase peer-limit-global.
    # And be aware that these settings are quite aggressive and might not suit your regular desktop use.
    # For instance, SSH sessions may time out more easily.
    performanceNetParameters = true;

    # Path to a JSON file to be merged with the settings.
    # Useful to merge a file which is better kept out of the Nix store to set secret config parameters like `rpc-password`.
    credentialsFile = config.age.secrets."transmission-credentials.json".path;

    # Whether to open the RPC port in the firewall.
    openRPCPort = false;
    openPeerPorts = true;

    # https://github.com/transmission/transmission/blob/main/docs/Editing-Configuration-Files.md
    settings = {
      # 0 = None, 1 = Critical, 2 = Error, 3 = Warn, 4 = Info, 5 = Debug, 6 = Trace;
      message-level = 3;

      # Encryption may help get around some ISP filtering,
      # but at the cost of slightly higher CPU use.
      # 0 = Prefer unencrypted connections,
      # 1 = Prefer encrypted connections,
      # 2 = Require encrypted connections (default = 1)
      encryption = 2;

      # rpc = Web Interface
      rpc-port = 9091;
      rpc-bind-address = "127.0.0.1";
      anti-brute-force-enabled = true;
      # After this amount of failed authentication attempts is surpassed,
      # the RPC server will deny any further authentication attempts until it is restarted.
      # This is not tracked per IP but in total.
      anti-brute-force-threshold = 20;
      rpc-authentication-required = true;

      # Comma-delimited list of IP addresses.
      # Wildcards allowed using '*'. Example: "127.0.0.*,192.168.*.*",
      rpc-whitelist-enabled = true;
      rpc-whitelist = "127.0.0.*,192.168.*.*";
      # Comma-delimited list of domain names.
      # Wildcards allowed using '*'. Example: "*.foo.org,example.com",
      rpc-host-whitelist-enabled = true;
      rpc-host-whitelist = "*.writefor.fun,localhost,192.168.5.*";
      # `rpc-username` is the key Transmission documents for the RPC login name.
      rpc-username = myvars.username;
      # rpc-password = "test"; # you'd better use the credentialsFile for this.

      incomplete-dir-enabled = true;
      incomplete-dir = "${dataDir}/incomplete";
      download-dir = "${dataDir}/downloads";

      # Watch a directory for torrent files and add them to transmission.
      watch-dir-enabled = false;
      watch-dir = "${dataDir}/watch";
      # Whether to enable Micro Transport Protocol (µTP).
      utp-enabled = true;
      # Executable to be run at torrent completion.
      script-torrent-done-enabled = false;
      # script-torrent-done-filename = "/path/to/script";

      # Enable Local Peer Discovery (LPD).
      lpd-enabled = true;
      # The peer port to listen for incoming connections.
      peer-port = 51413;
      # Enable UPnP or NAT-PMP to forward a port through your firewall (NAT).
      # https://github.com/transmission/transmission/blob/main/docs/Port-Forwarding-Guide.md
      port-forwarding-enabled = true;

      # "normal" speed limits
      speed-limit-down-enabled = true;
      speed-limit-down = 30000; # KB/s
      speed-limit-up-enabled = true;
      speed-limit-up = 500; # KB/s
      upload-slots-per-torrent = 8;

      # Start torrents as soon as they are added
      start-added-torrents = true;

      # Queuing
      # When true, Transmission will only download
      # download-queue-size non-stalled torrents at once.
      download-queue-enabled = true;
      download-queue-size = 5;

      # When true, torrents that have not shared data for
      # queue-stalled-minutes are treated as 'stalled'
      # and are not counted against the download-queue-size
      # and seed-queue-size limits.
      queue-stalled-enabled = true;
      queue-stalled-minutes = 60;

      # When true, Transmission will only seed seed-queue-size
      # non-stalled torrents at once.
      seed-queue-enabled = true;
      seed-queue-size = 10;
    };
  };
}
|
||||
@@ -0,0 +1,12 @@
|
||||
{
  # Uptime Kuma monitoring service, bound to loopback on port 3350.
  # NixOS module:
  # https://github.com/NixOS/nixpkgs/blob/nixos-23.11/nixos/modules/services/monitoring/uptime-kuma.nix
  services.uptime-kuma.enable = true;

  # Keys are Uptime Kuma environment variables; see
  # https://github.com/louislam/uptime-kuma/wiki/Environment-Variables
  services.uptime-kuma.settings = {
    UPTIME_KUMA_HOST = "127.0.0.1";
    UPTIME_KUMA_PORT = "3350";
    DATA_DIR = "/var/lib/uptime-kuma/";
  };
}
|
||||
Reference in New Issue
Block a user