feat: migrate all nixos services from idols to 12kingdoms

2026-07-10 23:02:58 +02:00 · 2024-03-29 11:58:48 +08:00
parent 4da995fa88
commit bf6bc4bee2
65 changed files with 422 additions and 446 deletions
@@ -0,0 +1,143 @@
+# Rakushun - Disk and Installation
+
+Disk layout:
+
+```bash
+[ryan@rakushun:~]$ lsblk
+NAME        MAJ:MIN RM  SIZE RO TYPE  MOUNTPOINTS
+sda           8:0    1 58.6G  0 disk
+└─sda1        8:1    1  487M  0 part
+mtdblock0    31:0    0   16M  0 disk
+zram0       254:0    0    0B  0 disk
+nvme0n1     259:0    0  1.8T  0 disk
+├─nvme0n1p1 259:1    0  630M  0 part  /boot
+└─nvme0n1p2 259:2    0  1.8T  0 part
+  └─encrypted 253:0    0  1.8T  0 crypt /tmp
+                                      /swap
+                                      /snapshots
+                                      /home/ryan/tmp
+                                      /home/ryan/nix-config
+                                      /home/ryan/go
+                                      /home/ryan/codes
+                                      /home/ryan/.ssh
+                                      /home/ryan/.local/state
+                                      /home/ryan/.npm
+                                      /home/ryan/.local/share
+                                      /home/ryan/.conda
+                                      /etc/ssh
+                                      /etc/nix/inputs
+                                      /etc/secureboot
+                                      /etc/agenix
+                                      /etc/NetworkManager/system-connections
+                                      /etc/machine-id
+                                      /nix/store
+                                      /var/log
+                                      /var/lib
+                                      /nix
+                                      /persistent
+
+[ryan@rakushun:~]$ df -Th
+Filesystem          Type      Size  Used Avail Use% Mounted on
+devtmpfs            devtmpfs  785M     0  785M   0% /dev
+tmpfs               tmpfs     7.7G     0  7.7G   0% /dev/shm
+tmpfs               tmpfs     3.9G  6.8M  3.9G   1% /run
+tmpfs               tmpfs     7.7G  1.9M  7.7G   1% /run/wrappers
+none                tmpfs     4.0G   48K  4.0G   1% /
+/dev/mapper/crypted btrfs     1.9T   19G  1.8T   2% /persistent
+/dev/mapper/crypted btrfs     1.9T   19G  1.8T   2% /nix
+/dev/mapper/crypted btrfs     1.9T   19G  1.8T   2% /snapshots
+/dev/mapper/crypted btrfs     1.9T   19G  1.8T   2% /swap
+/dev/mapper/crypted btrfs     1.9T   19G  1.8T   2% /tmp
+/dev/nvme0n1p1      vfat      629M   96M  534M  16% /boot
+tmpfs               tmpfs     1.6G  4.0K  1.6G   1% /run/user/1000
+```
+
+CPU info:
+
+```bash
+[ryan@rakushun:~]$ lscpu
+Architecture:           aarch64
+  CPU op-mode(s):       32-bit, 64-bit
+  Byte Order:           Little Endian
+CPU(s):                 8
+  On-line CPU(s) list:  0-7
+Vendor ID:              ARM
+  Model name:           Cortex-A55
+    Model:              0
+    Thread(s) per core: 1
+    Core(s) per socket: 4
+    Socket(s):          1
+    Stepping:           r2p0
+    CPU(s) scaling MHz: 67%
+    CPU max MHz:        1800.0000
+    CPU min MHz:        408.0000
+    BogoMIPS:           48.00
+    Flags:              fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp asimdhp cpuid asimdrdm lrcpc dcpop asimddp
+  Model name:           Cortex-A76
+    Model:              0
+    Thread(s) per core: 1
+    Core(s) per socket: 2
+    Socket(s):          2
+    Stepping:           r4p0
+    CPU(s) scaling MHz: 18%
+    CPU max MHz:        2256.0000
+    CPU min MHz:        408.0000
+    BogoMIPS:           48.00
+    Flags:              fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp asimdhp cpuid asimdrdm lrcpc dcpop asimddp
+Caches (sum of all):
+  L1d:                  384 KiB (8 instances)
+  L1i:                  384 KiB (8 instances)
+  L2:                   2.5 MiB (8 instances)
+  L3:                   3 MiB (1 instance)
+```
+
+## How to install NixOS on Orange Pi 5 Plus
+
+### 1. Prepare a USB LUKS key
+
+Generate a LUKS keyfile to encrypt the root partition; it's used by disko.
+
+```bash
+# partition the usb stick
+DEV=/dev/sdX
+parted ${DEV} -- mklabel gpt
+parted ${DEV} -- mkpart OPI5P_DSC fat32 0% 512MB
+mkfs.fat -F 32 -n OPI5P_DSC ${DEV}1
+
+# Generate a keyfile from the true random number generator
+KEYFILE=./orangepi5plus-luks-keyfile
+dd bs=512 count=64 iflag=fullblock if=/dev/random of=$KEYFILE
+
+# copy the keyfile and token to the usb stick
+KEYFILE=./orangepi5plus-luks-keyfile
+DEVICE=/dev/disk/by-label/OPI5P_DSC
+# seek=128 skips the first 128 obs-sized (512-byte) output blocks, i.e. 64 KiB, to avoid overwriting the filesystem header
+dd bs=512 count=64 iflag=fullblock seek=128 if=$KEYFILE of=$DEVICE
+```
+
+### 2. Partition the SSD & install NixOS via disko
+
+First, follow
+[UEFI - ryan4yin/nixos-rk3588](https://github.com/ryan4yin/nixos-rk3588/blob/main/UEFI.md) to
+install UEFI bootloader and boot into NixOS live environment via a USB stick.
+
+Then, run the following commands:
+
+```bash
+# transfer the nix-config to the target machine
+rsync -avzP ~/nix-config rk@<ip-addr>:/home/rk/
+
+# login via ssh
+ssh rk@<ip-addr>
+
+cd ~/nix-config/hosts/12kingdoms_rakushun
+# 1. change the disk device path in ./disko-fs.nix to the disk you want to use
+# 2. partition & format the disk via disko
+sudo nix --experimental-features "nix-command flakes" run github:nix-community/disko -- --mode disko ./disko-fs.nix
+
+
+cd ~/nix-config
+# install nixos
+# NOTE: the root password you set here will be discarded when reboot
+sudo nixos-install --root /mnt --flake .#rakushun --no-root-password --show-trace --verbose
+```
@@ -2,148 +2,33 @@

 LUKS encrypted SSD for NixOS, on Orange Pi 5 Plus.

+Host running storage, operation and maintenance related services:
+
+1. Storage such as git server, file server/browser, torrent downloader, etc.
+1. Backup or sync my personal data to cloud or NAS.
+   - For safety, those data should be encrypted before sending to the cloud or my NAS.
+1. Collect and monitor the metrics/logs of my homelab.
+
 ## Showcases

 ![](../../_img/2024-03-07_orangepi5plus_rakushun.webp)

-Disk layout:
+## Features

-```bash
-[ryan@rakushun:~]$ lsblk
-NAME        MAJ:MIN RM  SIZE RO TYPE  MOUNTPOINTS
-sda           8:0    1 58.6G  0 disk
-└─sda1        8:1    1  487M  0 part
-mtdblock0    31:0    0   16M  0 disk
-zram0       254:0    0    0B  0 disk
-nvme0n1     259:0    0  1.8T  0 disk
-├─nvme0n1p1 259:1    0  630M  0 part  /boot
-└─nvme0n1p2 259:2    0  1.8T  0 part
-  └─encrypted 253:0    0  1.8T  0 crypt /tmp
-                                      /swap
-                                      /snapshots
-                                      /home/ryan/tmp
-                                      /home/ryan/nix-config
-                                      /home/ryan/go
-                                      /home/ryan/codes
-                                      /home/ryan/.ssh
-                                      /home/ryan/.local/state
-                                      /home/ryan/.npm
-                                      /home/ryan/.local/share
-                                      /home/ryan/.conda
-                                      /etc/ssh
-                                      /etc/nix/inputs
-                                      /etc/secureboot
-                                      /etc/agenix
-                                      /etc/NetworkManager/system-connections
-                                      /etc/machine-id
-                                      /nix/store
-                                      /var/log
-                                      /var/lib
-                                      /nix
-                                      /persistent
+Services:

-[ryan@rakushun:~]$ df -Th
-Filesystem          Type      Size  Used Avail Use% Mounted on
-devtmpfs            devtmpfs  785M     0  785M   0% /dev
-tmpfs               tmpfs     7.7G     0  7.7G   0% /dev/shm
-tmpfs               tmpfs     3.9G  6.8M  3.9G   1% /run
-tmpfs               tmpfs     7.7G  1.9M  7.7G   1% /run/wrappers
-none                tmpfs     4.0G   48K  4.0G   1% /
-/dev/mapper/crypted btrfs     1.9T   19G  1.8T   2% /persistent
-/dev/mapper/crypted btrfs     1.9T   19G  1.8T   2% /nix
-/dev/mapper/crypted btrfs     1.9T   19G  1.8T   2% /snapshots
-/dev/mapper/crypted btrfs     1.9T   19G  1.8T   2% /swap
-/dev/mapper/crypted btrfs     1.9T   19G  1.8T   2% /tmp
-/dev/nvme0n1p1      vfat      629M   96M  534M  16% /boot
-tmpfs               tmpfs     1.6G  4.0K  1.6G   1% /run/user/1000
-```
+1. prometheus + alertmanager + grafana + loki: Monitor the metrics/logs of my homelab.
+1. restic: Backup my personal data to cloud or NAS.
+1. syncthing: Sync files between android/macbook/PC and NAS.
+1. attic: Nix cache server.
+1. gitea: Self-hosted git service.
+1. sftpgo: SFTP server.
+1. transmission & AriaNg: Torrent downloader and HTTP downloader
+1. alist/filebrowser: File browser for local/SMB/Cloud

-CPU info:
+All the services assume a reverse proxy is set up in front of them: they all listen on
+localhost, while a caddy service listens on the local network interface and proxies the requests to
+the services.

-```bash
-[ryan@rakushun:~]$ lscpu
-Architecture:           aarch64
-  CPU op-mode(s):       32-bit, 64-bit
-  Byte Order:           Little Endian
-CPU(s):                 8
-  On-line CPU(s) list:  0-7
-Vendor ID:              ARM
-  Model name:           Cortex-A55
-    Model:              0
-    Thread(s) per core: 1
-    Core(s) per socket: 4
-    Socket(s):          1
-    Stepping:           r2p0
-    CPU(s) scaling MHz: 67%
-    CPU max MHz:        1800.0000
-    CPU min MHz:        408.0000
-    BogoMIPS:           48.00
-    Flags:              fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp asimdhp cpuid asimdrdm lrcpc dcpop asimddp
-  Model name:           Cortex-A76
-    Model:              0
-    Thread(s) per core: 1
-    Core(s) per socket: 2
-    Socket(s):          2
-    Stepping:           r4p0
-    CPU(s) scaling MHz: 18%
-    CPU max MHz:        2256.0000
-    CPU min MHz:        408.0000
-    BogoMIPS:           48.00
-    Flags:              fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp asimdhp cpuid asimdrdm lrcpc dcpop asimddp
-Caches (sum of all):
-  L1d:                  384 KiB (8 instances)
-  L1i:                  384 KiB (8 instances)
-  L2:                   2.5 MiB (8 instances)
-  L3:                   3 MiB (1 instance)
-```
-
-## How to install NixOS on Orange Pi 5 Plus
-
-### 1. Prepare a USB LUKS key
-
-Generate LUKS keyfile to encrypt the root partition, it's used by disko.
-
-```bash
-# partition the usb stick
-DEV=/dev/sdX
-parted ${DEV} -- mklabel gpt
-parted ${DEV} -- mkpart OPI5P_DSC fat32 0% 512MB
-mkfs.fat -F 32 -n OPI5P_DSC ${DEV}1
-
-# Generate a keyfile from the true random number generator
-KEYFILE=./orangepi5plus-luks-keyfile
-dd bs=512 count=64 iflag=fullblock if=/dev/random of=$KEYFILE
-
-# copy the keyfile and token to the usb stick
-KEYFILE=./orangepi5plus-luks-keyfile
-DEVICE=/dev/disk/by-label/OPI5P_DSC
-# seek=128 skip N obs-sized output blocks to avoid overwriting the filesystem header
-dd bs=512 count=64 iflag=fullblock seek=128 if=$KEYFILE of=$DEVICE
-```
-
-### 2. Partition the SSD & install NixOS via disko
-
-First, follow
-[UEFI - ryan4yin/nixos-rk3588](https://github.com/ryan4yin/nixos-rk3588/blob/main/UEFI.md) to
-install UEFI bootloader and boot into NixOS live environment via a USB stick.
-
-Then, run the following commands:
-
-```bash
-# transfer the nix-config to the target machine
-rsync -avzP ~/nix-config rk@<ip-addr>:/home/rk/
-
-# login via ssh
-ssh rk@<ip-addr>
-
-cd ~/nix-config/hosts/12kingdoms_rakushun
-# 1. change the disk device path in ./disko-fs.nix to the disk you want to use
-# 2. partition & format the disk via disko
-sudo nix --experimental-features "nix-command flakes" run github:nix-community/disko -- --mode disko ./disko-fs.nix
-
-
-cd ~/nix-config
-# install nixos
-# NOTE: the root password you set here will be discarded when reboot
-sudo nixos-install --root /mnt --flake .#rakushun --no-root-password --show-trace --verbose
-```
+TODO: create a private PKI for caddy, to achieve end-to-end encryption between caddy and the
+services.
@@ -0,0 +1,74 @@
+{
+  config,
+  attic,
+  ...
+}: {
+  #=====================================================
+  #
+  # Attic
+  #
+  # A self-hostable Nix Binary Cache server
+  # backed by an S3-compatible storage provider
+  #
+  # https://docs.attic.rs/tutorial.html
+  #
+  #=====================================================
+
+  imports = [
+    attic.nixosModules.atticd
+  ];
+
+  # Self-Hosted Nix Cache Server
+  # https://github.com/zhaofengli/attic
+  #
+  # The first thing to do after setting up the server is:
+  # 1. Generate an admin token on the server via command:
+  #    `sudo atticd-atticadm make-token --sub "admin-1" --validity "2y" --pull "*" --push "*"  --delete "*" --create-cache "*" --configure-cache "*"  --configure-cache-retention "*"  --destroy-cache "*"`
+  # 2. Login at the desktop via command:
+  #    `attic login central http://attic.writefor.fun <TOKEN>`
+  # 3. Create a new cache and configure Nix to use it via commands:
+  #    `attic cache create rk3588`
+  #    `attic use rk3588`
+  # 4. Push Caches to the cache server via:
+  #    it's similar to cachix, related docs:
+  #    https://docs.attic.rs/reference/attic-cli.html
+  #    https://docs.cachix.org/pushing#pushing
+  services.atticd = {
+    enable = true;
+
+    # Absolute path to the credentials (environment) file, provided here as an agenix secret.
+    # The HS256 JWT secret can be generated with openssl:
+    #   openssl rand 64 | base64 -w0
+    #
+    # Content:
+    #   ATTIC_SERVER_TOKEN_HS256_SECRET_BASE64="output from openssl"
+    credentialsFile = config.age.secrets."attic-nix-cache-server.env".path;
+
+    settings = {
+      # Listen on localhost only; the caddy virtual host "http://attic.writefor.fun"
+      # reverse-proxies to this port.
+      listen = "127.0.0.1:3300";
+
+      # Data chunking
+      #
+      # Warning: If you change any of the values here, it will be
+      # difficult to reuse existing chunks for newly-uploaded NARs
+      # since the cutpoints will be different. As a result, the
+      # deduplication ratio will suffer for a while after the change.
+      chunking = {
+        # The minimum NAR size to trigger chunking
+        #
+        # If 0, chunking is disabled entirely for newly-uploaded NARs.
+        # If 1, all NARs are chunked.
+        nar-size-threshold = 64 * 1024; # 64 KiB
+
+        # The preferred minimum size of a chunk, in bytes
+        min-size = 16 * 1024; # 16 KiB
+
+        # The preferred average size of a chunk, in bytes
+        avg-size = 64 * 1024; # 64 KiB
+
+        # The preferred maximum size of a chunk, in bytes
+        max-size = 256 * 1024; # 256 KiB
+      };
+    };
+  };
+}
@@ -19,9 +19,10 @@
    # email = myvars.useremail;
    # acmeCA = "https://acme-v02.api.letsencrypt.org/directory";

-    virtualHosts."http://git.writefor.fun".extraConfig = ''
+    # Dashboard
+    virtualHosts."http://home.writefor.fun".extraConfig = ''
      encode zstd gzip
-      reverse_proxy http://localhost:3000
+      reverse_proxy http://localhost:4401
    '';

    # https://caddyserver.com/docs/caddyfile/directives/file_server
@@ -33,13 +34,54 @@
        precompressed zstd br gzip
      }
    '';
+
+    # Datastore
+    virtualHosts."http://attic.writefor.fun".extraConfig = ''
+      encode zstd gzip
+      reverse_proxy http://localhost:3300
+    '';
+
+    virtualHosts."http://git.writefor.fun".extraConfig = ''
+      encode zstd gzip
+      reverse_proxy http://localhost:3301
+    '';
+    virtualHosts."http://sftpgo.writefor.fun".extraConfig = ''
+      encode zstd gzip
+      reverse_proxy http://localhost:3302
+    '';
+    virtualHosts."http://webdav.writefor.fun".extraConfig = ''
+      encode zstd gzip
+      reverse_proxy http://localhost:3303
+    '';
+    virtualHosts."http://transmission.writefor.fun".extraConfig = ''
+      encode zstd gzip
+      reverse_proxy http://localhost:9091
+    '';
+
+    # Monitoring
+    virtualHosts."http://uptime-kuma.writefor.fun".extraConfig = ''
+      encode zstd gzip
+      reverse_proxy http://localhost:3350
+    '';
+    virtualHosts."http://grafana.writefor.fun".extraConfig = ''
+      encode zstd gzip
+      reverse_proxy http://localhost:3351
+    '';
+    virtualHosts."http://prometheus.writefor.fun".extraConfig = ''
+      encode zstd gzip
+      reverse_proxy http://localhost:9090
+    '';
+    virtualHosts."http://alertmanager.writefor.fun".extraConfig = ''
+      encode zstd gzip
+      reverse_proxy http://localhost:9093
+    '';
  };
  networking.firewall.allowedTCPPorts = [80 443];

  # Create Directories
  systemd.tmpfiles.rules = [
    "d /var/lib/caddy/fileserver/ 0755 caddy caddy"
-    # directory for virual machine's images
+    # directory for virtual machine's images
    "d /var/lib/caddy/fileserver/vms 0755 caddy caddy"
  ];
 }
@@ -1,4 +1,5 @@
 {
+  mylib,
  disko,
  nixos-rk3588,
  myvars,
@@ -12,17 +13,13 @@
 let
  hostName = "rakushun"; # Define your hostname.
 in {
-  imports = [
-    # import the rk3588 module, which contains the configuration for bootloader/kernel/firmware
-    nixos-rk3588.nixosModules.orangepi5plus.core
-    disko.nixosModules.default
-    ./hardware-configuration.nix
-    ./disko-fs.nix
-    ./impermanence.nix
-
-    ./gitea.nix
-    ./caddy.nix
-  ];
+  imports =
+    (mylib.scanPaths ./.)
+    ++ [
+      # import the rk3588 module, which contains the configuration for bootloader/kernel/firmware
+      nixos-rk3588.nixosModules.orangepi5plus.core
+      disko.nixosModules.default
+    ];

  networking = {
    inherit hostName;
@@ -0,0 +1,3 @@
+{mylib, ...}: let
+  # Whatever mylib.scanPaths returns for this directory (presumably its sibling modules).
+  modulesHere = mylib.scanPaths ./.;
+in {
+  imports = modulesHere;
+}
@@ -21,7 +21,7 @@ in {
      server = {
        SSH_PORT = 2222;
        PROTOCOL = "http";
-        HTTP_PORT = 3000;
+        HTTP_PORT = 3301;
        HTTP_ADDR = "127.0.0.1";
        DOMAIN = "git.writefor.fun";
      };
@@ -0,0 +1,20 @@
+apiVersion: 1
+
+providers:
+  # <string> a unique provider name. Required
+  - name: "Dashboards"
+    # <int> Org id. Default to 1
+    orgId: 1
+    # <string> provider type. Default to 'file'
+    type: file
+    # <bool> disable dashboard deletion
+    disableDeletion: false
+    # <int> how often Grafana will scan for changed dashboards
+    updateIntervalSeconds: 20
+    # <bool> allow updating provisioned dashboards from the UI
+    allowUiUpdates: false
+    options:
+      # <string, required> path to dashboard files on disk. Required when using the 'file' type
+      path: /etc/grafana/dashboards
+      # <bool> use folder names from filesystem to create folders in Grafana
+      foldersFromFilesStructure: true
@@ -0,0 +1,10 @@
+# Grafana Dashboards
+
+## Homelab
+
+1. https://grafana.com/grafana/dashboards/1860-node-exporter-full/
+2. https://grafana.com/grafana/dashboards/9578-alertmanager/
+
+## Kubernetes
+
+1. https://github.com/dotdc/grafana-dashboards-kubernetes/
@@ -0,0 +1,21 @@
+# https://grafana.com/docs/grafana/latest/datasources/prometheus/
+apiVersion: 1
+
+datasources:
+  - name: prometheus-homelab
+    type: prometheus
+    access: proxy
+    # Access mode - proxy (server in the UI) or direct (browser in the UI).
+    url: http://localhost:9090
+    jsonData:
+      httpMethod: POST
+      manageAlerts: true
+      prometheusType: Prometheus
+      prometheusVersion: 2.49.0
+      cacheLevel: "High"
+      disableRecordingRules: false
+      # As of Grafana 10, the Prometheus data source can be configured to query live dashboards
+      # incrementally, instead of re-querying the entire duration on each dashboard refresh.
+      # Increasing the duration of the incrementalQueryOverlapWindow will increase the size of every incremental query,
+      # but might be helpful for instances that have inconsistent results for recent data.
+      incrementalQueryOverlapWindow: 10m
@@ -0,0 +1,52 @@
+{
+  config,
+  myvars,
+  ...
+}: {
+  # Grafana, bound to localhost only. It is published on the local network by the caddy
+  # virtual host "http://grafana.writefor.fun", which reverse-proxies to port 3351.
+  services.grafana = {
+    enable = true;
+    dataDir = "/var/lib/grafana";
+    # DeclarativePlugins = with pkgs.grafanaPlugins; [ grafana-piechart-panel ];
+    settings = {
+      server = {
+        http_addr = "127.0.0.1";
+        http_port = 3351;
+        protocol = "http";
+        # Public host name; must match the virtual host served by the reverse proxy.
+        domain = "grafana.writefor.fun";
+        # Add subpath to the root_url if serve_from_sub_path is true
+        serve_from_sub_path = false;
+        # Full URL used to reach Grafana from a browser. The internal http_port is NOT part of it:
+        # port 3351 is only reachable on localhost, clients go through the reverse proxy.
+        root_url = "%(protocol)s://%(domain)s/";
+        # Redirect to correct domain if the host header does not match the domain. Prevents DNS rebinding attacks.
+        enforce_domain = false;
+        read_timeout = "180s";
+        # Enable HTTP compression, this can improve transfer speed and bandwidth utilization.
+        enable_gzip = true;
+        # Cdn for accelerating loading of frontend assets.
+        # cdn_url = "https://cdn.jsdelivr.net/npm/grafana@7.5.5";
+      };
+
+      security = {
+        admin_user = myvars.username;
+        admin_email = myvars.useremail;
+        # Use file provider to read the admin password from a file.
+        # https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/#file-provider
+        admin_password = "$__file{${config.age.secrets."grafana-admin-password".path}}";
+      };
+      users = {
+        allow_sign_up = false;
+        # home_page = "";
+        default_theme = "dark";
+      };
+    };
+
+    # Declaratively provision Grafana's data sources, dashboards, and alerting rules.
+    # Grafana's alerting rules is not recommended to use, we use Prometheus alertmanager instead.
+    # https://grafana.com/docs/grafana/latest/administration/provisioning/#data-sources
+    provision = {
+      datasources.path = ./datasources.yml;
+      dashboards.path = ./dashboards.yml;
+    };
+  };
+
+  # Dashboard JSON files; dashboards.yml points its file provider at /etc/grafana/dashboards.
+  environment.etc."grafana/dashboards".source = ./dashboards;
+}
@@ -0,0 +1,3 @@
+# Homepage for my Homelab
+
+> WIP, just a demo for now
@@ -0,0 +1,8 @@
+---
+- About Me:
+    - Blog:
+        - abbr: Blog
+          href: https://thiscute.world/
+    - Github:
+        - abbr: GH
+          href: https://github.com/ryan4yin
@@ -0,0 +1,3 @@
+# kana-docker:
+#   socket: /var/run/docker.sock
+#
@@ -0,0 +1,6 @@
+# https://gethomepage.dev/latest/configs/kubernetes/
+
+# uses the default kubeconfig to access the cluster
+# read kubeconfig from $KUBECONFIG or $HOME/.kube/config
+# mode: default
+mode: disabled
@@ -0,0 +1,68 @@
+---
+# For configuration options and examples, please see:
+# https://gethomepage.dev/latest/configs/services
+
+- Proxmox VE 虚拟化集群:
+    - PVE-UM560:
+        icon: si-proxmox
+        href: https://192.168.5.173:8006/
+        description: "CPU: R5-5625U / MEM: 32G / DISK: 512G+4T*2"
+        siteMonitor: https://192.168.5.173:8006/
+
+    - PVE-S500Plus:
+        icon: si-proxmox
+        href: https://192.168.5.174:8006/
+        description: "CPU: R7-5825U / MEM: 64G / DISK: 1T"
+        siteMonitor: https://192.168.5.174:8006/
+
+    - PVE-GTR5:
+        icon: si-proxmox
+        href: https://192.168.5.172:8006/
+        description: "CPU: R9-5900HX / MEM: 64G / DISK: 1T"
+        siteMonitor: https://192.168.5.172:8006/
+
+- Homelab Monitoring:
+    - Grafana:
+        icon: si-grafana
+        href: http://grafana.writefor.fun
+        description: Data visualised on dashboards
+        siteMonitor: http://grafana.writefor.fun
+    - Prometheus Dashboard:
+        icon: si-prometheus
+        href: http://prometheus.writefor.fun
+        description: Monitoring - Prometheus
+        siteMonitor: http://prometheus.writefor.fun
+    - Uptime Kuma:
+        icon: si-uptimekuma
+        href: http://uptime-kuma.writefor.fun
+        description: Uptime Checking
+        siteMonitor: http://uptime-kuma.writefor.fun
+
+- Homelab Applications:
+    # Service properties must be nested one level below the service name,
+    # otherwise they become siblings of the name and the service itself is empty.
+    - SFTPGO:
+        icon: sftpgo.png
+        href: "http://sftpgo.writefor.fun/web/admin/folders"
+        description: WebDAV & SFTP server
+        siteMonitor: http://sftpgo.writefor.fun/
+# - Kubernetes Monitoring:
+#     # TODO: Update this
+#     - Emby:
+#       icon: emby.png
+#       href: "http://emby.home/"
+#       description: Media server
+#       namespace: media # The kubernetes namespace the app resides in
+#       app: emby # The name of the deployed app
+#
+#     - Element Chat:
+#         icon: matrix-light.png
+#         href: https://chat.example.com
+#         description: Matrix Synapse Powered Chat
+#         app: matrix-element
+#         namespace: comms
+#         pod-selector: >-
+#           app.kubernetes.io/instance in (
+#               matrix-element,
+#               matrix-media-repo,
+#               matrix-media-repo-postgresql,
+#               matrix-synapse
+#           )
@@ -0,0 +1,82 @@
+---
+# For configuration options and examples, please see:
+# https://gethomepage.dev/latest/configs/settings
+
+title: Ryan Yin's Homelab
+base: http://home.writefor.fun/
+favicon: https://thiscute.world/favicon.ico
+
+# https://developer.mozilla.org/en-US/docs/Web/Manifest/start_url
+# Used by some browsers to determine the start page of the web application
+startUrl: http://home.writefor.fun/
+
+language: zh
+
+# Define shared API provider options and secrets here,
+# You can then pass provider instead of apiKey in your widget configuration.
+providers:
+  # Read api keys from environment variables.
+  # The placeholder must be written exactly as {{HOMEPAGE_VAR_XXX}} (no inner spaces) to be
+  # substituted, and is quoted so that YAML does not parse the braces as a flow mapping.
+  # NOTE(review): openweathermap reuses the weatherapi variable - confirm this is intended.
+  openweathermap: "{{HOMEPAGE_VAR_WEATHERAPI_APIKEY}}"
+  weatherapi: "{{HOMEPAGE_VAR_WEATHERAPI_APIKEY}}"
+
+background:
+  image: /images/rolling-girls.png
+  blur: sm # sm, "", md, xl... see https://tailwindcss.com/docs/backdrop-blur
+  saturate: 50 # 0, 50, 100... see https://tailwindcss.com/docs/backdrop-saturate
+  brightness: 50 # 0, 50, 75... see https://tailwindcss.com/docs/backdrop-brightness
+  opacity: 50 # 0-100
+
+theme: dark # or light
+
+# Supported colors are:
+# slate, gray, zinc, neutral, stone, amber,
+# yellow, lime, green, emerald, teal, cyan,
+# sky, blue, indigo, violet, purple, fuchsia, pink, rose, red, white
+color: indigo
+
+# make all cards in a row the same height.
+useEqualHeights: true
+
+# Groups and its layout
+# Groups Name should match the name defined in your services.yaml or widgets.yaml
+layout:
+  Proxmox VE 虚拟化集群:
+    icon: si-proxmox
+    tab: First
+
+  Group A:
+    initiallyCollapsed: true # collapsed by default
+    tab: First
+    style: row
+    columns: 4
+
+  Second Service Group:
+    useEqualHeights: true # overrides global setting
+    tab: Second
+    columns: 4
+
+  Third Service Group:
+    tab: Third
+    style: row
+
+  Bookmark Group on Fourth Tab:
+    tab: Fourth
+
+  Service Group on every Tab:
+    style: row
+    columns: 4
+
+# https://gethomepage.dev/latest/configs/services/#icons
+# iconStyle: theme # optional, defaults to gradient
+
+# Typing in homepage to quick search
+quicklaunch:
+  searchDescriptions: true
+  hideInternetSearch: true
+  showSearchSuggestions: true
+  hideVisitURL: true
+
+# Show docker stats
+showStats: true
+
+hideErrors: false
@@ -0,0 +1,21 @@
+# TODO: add access to kubernetes cluster
+# - kubernetes:
+#     cluster:
+#       show: true
+#       cpu: true
+#       memory: true
+#       showLabel: true
+#       label: "cluster"
+#     nodes:
+#       show: true
+#       cpu: true
+#       memory: true
+#       showLabel: true
+# - resources:
+#     backend: resources
+#     expanded: true
+#     cpu: true
+#     memory: true
+- search:
+    provider: google
+    target: _blank
@@ -0,0 +1,25 @@
+{pkgs, ...}: let
+  # Directory the dashboard is pointed at through HOMEPAGE_CONFIG_DIR.
+  homepageDir = "/var/lib/homepage-dashboard";
+in {
+  # https://github.com/NixOS/nixpkgs/blob/nixos-unstable/nixos/modules/services/misc/homepage-dashboard.nix
+  services.homepage-dashboard.enable = true;
+  # The caddy virtual host "http://home.writefor.fun" reverse-proxies to this port.
+  services.homepage-dashboard.listenPort = 4401;
+  services.homepage-dashboard.openFirewall = false;
+
+  # Environment variables understood by homepage:
+  # 1. The value of env var HOMEPAGE_VAR_XXX will replace {{HOMEPAGE_VAR_XXX}} in any config,
+  #    e.g. HOMEPAGE_VAR_XXX_APIKEY = "myapikey";
+  # 2. The value of env var HOMEPAGE_FILE_XXX must be a file path,
+  #    the contents of which will be used to replace {{HOMEPAGE_FILE_XXX}} in any config
+  systemd.services.homepage-dashboard.environment.HOMEPAGE_CONFIG_DIR = homepageDir;
+
+  # On activation: copy the configuration files from ./config into the config directory
+  # (real files rather than symlinks), then restart the service.
+  system.activationScripts.installHomepageDashboardConfig = ''
+    mkdir -p ${homepageDir}
+    ${pkgs.rsync}/bin/rsync -avz --chmod=D2755,F600 ${./config}/ ${homepageDir}/
+
+    ${pkgs.systemdMinimal}/bin/systemctl restart homepage-dashboard
+  '';
+}
@@ -0,0 +1,26 @@
+{
+  # Replace dashy with gethomepage, because dashy is too slow to start/reload.
+
+  # # Install the dashy configuration file instead of symlink it
+  # system.activationScripts.installDashyConfig = ''
+  #   install -Dm 600 ${./dashy_conf.yml} /etc/dashy/dashy_conf.yml
+  # '';
+  #
+  # # https://github.com/NixOS/nixpkgs/blob/nixos-23.11/nixos/modules/virtualisation/oci-containers.nix
+  # virtualisation.oci-containers.containers = {
+  #   # check its logs via `journalctl -u podman-dashy`
+  #   dashy = {
+  #     hostname = "dashy";
+  #     image = "lissy93/dashy:latest";
+  #     ports = ["127.0.0.1:4000:80"];
+  #     environment = {
+  #       "NODE_ENV" = "production";
+  #     };
+  #     volumes = [
+  #       "/etc/dashy/dashy_conf.yml:/app/public/conf.yml"
+  #     ];
+  #     autoStart = true;
+  #     # cmd = [];
+  #   };
+  # };
+}
@@ -0,0 +1,28 @@
+{
+  lib,
+  mylib,
+  ...
+}: {
+  imports = mylib.scanPaths ./.;
+
+  # Docker is forced off; podman is the container engine on this host.
+  virtualisation.docker.enable = lib.mkForce false;
+
+  # Run declarative OCI containers with podman.
+  virtualisation.oci-containers.backend = "podman";
+
+  virtualisation.podman = {
+    enable = true;
+    # Provide a `docker` alias so podman works as a drop-in replacement.
+    dockerCompat = true;
+    # Needed so that containers under podman-compose can talk to each other.
+    defaultNetwork.settings.dns_enabled = true;
+
+    # Periodically prune Podman resources
+    autoPrune.enable = true;
+    autoPrune.dates = "weekly";
+    autoPrune.flags = ["--all"];
+  };
+}
@@ -0,0 +1,6 @@
+# Prometheus & Alertmanager
+
+## Alert Rules
+
+- [awesome-prometheus-alerts](https://github.com/samber/awesome-prometheus-alerts): Collection of
+  Prometheus alerting rules
@@ -0,0 +1,13 @@
+groups:
+  - name: EmbeddedExporter
+
+    rules:
+      - alert: CorednsPanicCount
+        expr: "increase(coredns_panics_total[1m]) > 0"
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: CoreDNS Panic Count (instance {{ $labels.instance }})
+          description:
+            "Number of CoreDNS panics encountered\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
@@ -0,0 +1,162 @@
+groups:
+  - name: EmbeddedExporter
+
+    rules:
+      - alert: EtcdInsufficientMembers
+        expr: "count(etcd_server_id) % 2 == 0"
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Etcd insufficient Members (instance {{ $labels.instance }})
+          description:
+            "Etcd cluster should have an odd number of members\n  VALUE = {{ $value }}\n  LABELS =
+            {{ $labels }}"
+
+      - alert: EtcdNoLeader # fires immediately (for: 0m) when etcd_server_has_leader reads 0
+        expr: "etcd_server_has_leader == 0"
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Etcd no Leader (instance {{ $labels.instance }})
+          description:
+            "Etcd cluster has no leader\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: EtcdHighNumberOfLeaderChanges
+        expr: "increase(etcd_server_leader_changes_seen_total[10m]) > 2"
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Etcd high number of leader changes (instance {{ $labels.instance }})
+          description:
+            "Etcd leader changed more than 2 times during 10 minutes\n  VALUE = {{ $value
+            }}\n  LABELS = {{ $labels }}"
+
+      - alert: EtcdHighNumberOfFailedGrpcRequests
+        expr:
+          'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[1m])) BY (grpc_service, grpc_method)
+          / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.01'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Etcd high number of failed GRPC requests (instance {{ $labels.instance }})
+          description:
+            "More than 1% GRPC request failure detected in Etcd\n  VALUE = {{ $value }}\n  LABELS =
+            {{ $labels }}"
+
+      - alert: EtcdHighNumberOfFailedGrpcRequests
+        expr:
+          'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[1m])) BY (grpc_service, grpc_method)
+          / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.05'
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Etcd high number of failed GRPC requests (instance {{ $labels.instance }})
+          description:
+            "More than 5% GRPC request failure detected in Etcd\n  VALUE = {{ $value }}\n  LABELS =
+            {{ $labels }}"
+
+      - alert: EtcdGrpcRequestsSlow
+        expr:
+          'histogram_quantile(0.99,
+          sum(rate(grpc_server_handling_seconds_bucket{grpc_type="unary"}[1m])) by (grpc_service,
+          grpc_method, le)) > 0.15'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Etcd GRPC requests slow (instance {{ $labels.instance }})
+          description:
+            "GRPC requests slowing down, 99th percentile is over 0.15s\n  VALUE = {{ $value
+            }}\n  LABELS = {{ $labels }}"
+
+      - alert: EtcdHighNumberOfFailedHttpRequests
+        expr:
+          "sum(rate(etcd_http_failed_total[1m])) BY (method) /
+          sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01"
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Etcd high number of failed HTTP requests (instance {{ $labels.instance }})
+          description:
+            "More than 1% HTTP failure detected in Etcd\n  VALUE = {{ $value }}\n  LABELS = {{
+            $labels }}"
+
+      - alert: EtcdHighNumberOfFailedHttpRequests
+        expr:
+          "sum(rate(etcd_http_failed_total[1m])) BY (method) /
+          sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05"
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Etcd high number of failed HTTP requests (instance {{ $labels.instance }})
+          description:
+            "More than 5% HTTP failure detected in Etcd\n  VALUE = {{ $value }}\n  LABELS = {{
+            $labels }}"
+
+      - alert: EtcdHttpRequestsSlow
+        expr:
+          "histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[1m])) > 0.15"
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Etcd HTTP requests slow (instance {{ $labels.instance }})
+          description:
+            "HTTP requests slowing down, 99th percentile is over 0.15s\n  VALUE = {{ $value
+            }}\n  LABELS = {{ $labels }}"
+
+      - alert: EtcdMemberCommunicationSlow
+        expr:
+          "histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) >
+          0.15"
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Etcd member communication slow (instance {{ $labels.instance }})
+          description:
+            "Etcd member communication slowing down, 99th percentile is over 0.15s\n  VALUE = {{
+            $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: EtcdHighNumberOfFailedProposals
+        expr: "increase(etcd_server_proposals_failed_total[1h]) > 5"
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Etcd high number of failed proposals (instance {{ $labels.instance }})
+          description:
+            "Etcd server got more than 5 failed proposals past hour\n  VALUE = {{ $value
+            }}\n  LABELS = {{ $labels }}"
+
+      - alert: EtcdHighFsyncDurations
+        expr:
+          "histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5"
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Etcd high fsync durations (instance {{ $labels.instance }})
+          description:
+            "Etcd WAL fsync duration increasing, 99th percentile is over 0.5s\n  VALUE = {{ $value
+            }}\n  LABELS = {{ $labels }}"
+
+      - alert: EtcdHighCommitDurations
+        expr:
+          "histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) >
+          0.25"
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Etcd high commit durations (instance {{ $labels.instance }})
+          description:
+            "Etcd commit duration increasing, 99th percentile is over 0.25s\n  VALUE = {{ $value
+            }}\n  LABELS = {{ $labels }}"
@@ -0,0 +1,123 @@
+groups:
+  - name: EmbeddedExporter
+
+    rules:
+      - alert: IstioKubernetesGatewayAvailabilityDrop
+        expr:
+          'min(kube_deployment_status_replicas_available{deployment="istio-ingressgateway",
+          namespace="istio-system"}) without (instance, pod) < 2'
+        for: 1m
+        labels:
+          severity: warning
+        annotations:
+          summary: Istio Kubernetes gateway availability drop (instance {{ $labels.instance }})
+          description:
+            "Gateway pods have dropped. Inbound traffic will likely be affected.\n  VALUE = {{
+            $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: IstioPilotHighTotalRequestRate
+        expr: "sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5"
+        for: 1m
+        labels:
+          severity: warning
+        annotations:
+          summary: Istio Pilot high total request rate (instance {{ $labels.instance }})
+          description:
+            "Number of Istio Pilot push errors is too high (> 5%). Envoy sidecars might have
+            outdated configuration.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: IstioMixerPrometheusDispatchesLow
+        expr: 'sum(rate(mixer_runtime_dispatches_total{adapter=~"prometheus"}[1m])) < 180'
+        for: 1m
+        labels:
+          severity: warning
+        annotations:
+          summary: Istio Mixer Prometheus dispatches low (instance {{ $labels.instance }})
+          description:
+            "Number of Mixer dispatches to Prometheus is too low. Istio metrics might not be being
+            exported properly.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: IstioHighTotalRequestRate
+        expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) > 1000'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Istio high total request rate (instance {{ $labels.instance }})
+          description:
+            "Global request rate in the service mesh is unusually high.\n  VALUE = {{ $value
+            }}\n  LABELS = {{ $labels }}"
+
+      - alert: IstioLowTotalRequestRate
+        expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) < 100'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Istio low total request rate (instance {{ $labels.instance }})
+          description:
+            "Global request rate in the service mesh is unusually low.\n  VALUE = {{ $value
+            }}\n  LABELS = {{ $labels }}"
+
+      - alert: IstioHigh4xxErrorRate # 4xx share of destination-reported requests above 5%
+        expr:
+          'sum(rate(istio_requests_total{reporter="destination", response_code=~"4.*"}[5m])) /
+          sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5'
+        for: 1m
+        labels:
+          severity: warning
+        annotations:
+          summary: Istio high 4xx error rate (instance {{ $labels.instance }})
+          description:
+            "High percentage of HTTP 4xx responses in Istio (> 5%).\n  VALUE = {{ $value
+            }}\n  LABELS = {{ $labels }}"
+
+      - alert: IstioHigh5xxErrorRate
+        expr:
+          'sum(rate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) /
+          sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5'
+        for: 1m
+        labels:
+          severity: warning
+        annotations:
+          summary: Istio high 5xx error rate (instance {{ $labels.instance }})
+          description:
+            "High percentage of HTTP 5xx responses in Istio (> 5%).\n  VALUE = {{ $value
+            }}\n  LABELS = {{ $labels }}"
+
+      - alert: IstioHighRequestLatency
+        expr:
+          'rate(istio_request_duration_milliseconds_sum{reporter="destination"}[1m]) /
+          rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 100'
+        for: 1m
+        labels:
+          severity: warning
+        annotations:
+          summary: Istio high request latency (instance {{ $labels.instance }})
+          description:
+            "Istio average requests execution is longer than 100ms.\n  VALUE = {{ $value
+            }}\n  LABELS = {{ $labels }}"
+
+      - alert: IstioLatency99Percentile
+        expr:
+          "histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by
+          (destination_canonical_service, destination_workload_namespace, source_canonical_service,
+          source_workload_namespace, le)) > 1000"
+        for: 1m
+        labels:
+          severity: warning
+        annotations:
+          summary: Istio latency 99 percentile (instance {{ $labels.instance }})
+          description:
+            "Istio 1% slowest requests are longer than 1000ms.\n  VALUE = {{ $value }}\n  LABELS =
+            {{ $labels }}"
+
+      - alert: IstioPilotDuplicateEntry
+        expr: "sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0"
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Istio Pilot Duplicate Entry (instance {{ $labels.instance }})
+          description:
+            "Istio pilot duplicate entry error.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
@@ -0,0 +1,435 @@
+groups:
+  - name: KubestateExporter
+
+    rules:
+      - alert: KubernetesNodeNotReady # Ready condition's status="true" series reads 0 for 10m
+        expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          summary: Kubernetes Node not ready (node {{ $labels.node }})
+          description:
+            "Node {{ $labels.node }} has been unready for a long time\n  VALUE = {{ $value
+            }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesNodeMemoryPressure
+        expr: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1'
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Kubernetes memory pressure (node {{ $labels.node }})
+          description:
+            "Node {{ $labels.node }} has MemoryPressure condition\n  VALUE = {{ $value }}\n  LABELS
+            = {{ $labels }}"
+
+      - alert: KubernetesNodeDiskPressure
+        expr: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1'
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Kubernetes disk pressure (node {{ $labels.node }})
+          description:
+            "Node {{ $labels.node }} has DiskPressure condition\n  VALUE = {{ $value }}\n  LABELS =
+            {{ $labels }}"
+
+      - alert: KubernetesNodeNetworkUnavailable
+        expr: 'kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1'
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Kubernetes Node network unavailable (instance {{ $labels.instance }})
+          description:
+            "Node {{ $labels.node }} has NetworkUnavailable condition\n  VALUE = {{ $value
+            }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesNodeOutOfPodCapacity
+        expr:
+          'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node)
+          (0 * kube_pod_info{pod_template_hash=""})) / sum by (node)
+          (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Kubernetes Node out of pod capacity (instance {{ $labels.instance }})
+          description:
+            "Node {{ $labels.node }} is out of pod capacity\n  VALUE = {{ $value }}\n  LABELS = {{
+            $labels }}"
+
+      - alert: KubernetesContainerOomKiller
+        expr:
+          '(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total
+          offset 10m >= 1) and ignoring (reason)
+          min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m])
+          == 1'
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary:
+            Kubernetes container oom killer ({{ $labels.namespace }}/{{ $labels.pod }}:{{
+            $labels.container }})
+          description:
+            "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has
+            been OOMKilled {{ $value }} times in the last 10 minutes.\n  VALUE = {{ $value
+            }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesJobFailed
+        expr: "kube_job_status_failed > 0"
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Kubernetes Job failed ({{ $labels.namespace }}/{{ $labels.job_name }})
+          description:
+            "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete\n  VALUE = {{
+            $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesCronjobSuspended
+        expr: "kube_cronjob_spec_suspend != 0"
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Kubernetes CronJob suspended ({{ $labels.namespace }}/{{ $labels.cronjob }})
+          description:
+            "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n  VALUE = {{ $value
+            }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesPersistentvolumeclaimPending
+        expr: 'kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary:
+            Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{
+            $labels.persistentvolumeclaim }})
+          description:
+            "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is
+            pending\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesVolumeOutOfDiskSpace
+        expr:
+          "kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10"
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Kubernetes Volume out of disk space (instance {{ $labels.instance }})
+          description:
+            "Volume is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesVolumeFullInFourDays # $value is extrapolated free bytes, not a percentage
+        expr: "predict_linear(kubelet_volume_stats_available_bytes[6h:5m], 4 * 24 * 3600) < 0"
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Kubernetes Volume full in four days (instance {{ $labels.instance }})
+          description:
+            "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to
+            fill up within four days. Predicted available bytes in four days: {{ $value | humanize
+            }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesPersistentvolumeError
+        expr:
+          'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0'
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          # The metric is per PersistentVolume, so name the volume rather than a
+          # namespace/claim pair (presumably those labels are absent here; verify).
+          summary: Kubernetes PersistentVolume error ({{ $labels.persistentvolume }})
+          description:
+            "Persistent volume {{ $labels.persistentvolume }} is in bad state\n  VALUE = {{ $value
+            }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesStatefulsetDown
+        expr: "kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0"
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: Kubernetes StatefulSet down ({{ $labels.namespace }}/{{ $labels.statefulset }})
+          description:
+            "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down\n  VALUE = {{
+            $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesHpaScaleInability
+        expr:
+          'kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} ==
+          1'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Kubernetes HPA scale inability (instance {{ $labels.instance }})
+          description:
+            "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to
+            scale\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesHpaMetricsUnavailability
+        expr:
+          'kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"}
+          == 1'
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Kubernetes HPA metrics unavailability (instance {{ $labels.instance }})
+          description:
+            "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect
+            metrics\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesHpaScaleMaximum
+        expr:
+          "kube_horizontalpodautoscaler_status_desired_replicas >=
+          kube_horizontalpodautoscaler_spec_max_replicas"
+        for: 2m
+        labels:
+          severity: info
+        annotations:
+          summary: Kubernetes HPA scale maximum (instance {{ $labels.instance }})
+          description:
+            "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum
+            number of desired pods\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesHpaUnderutilized
+        expr:
+          "max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) ==
+          kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler) > 3"
+        for: 0m
+        labels:
+          severity: info
+        annotations:
+          summary: Kubernetes HPA underutilized (instance {{ $labels.instance }})
+          description:
+            "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is constantly at
+            minimum replicas for 50% of the time. Potential cost saving here.\n  VALUE = {{ $value
+            }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesPodNotHealthy
+        expr: 'sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0'
+        for: 15m
+        labels:
+          severity: critical
+        annotations:
+          summary: Kubernetes Pod not healthy ({{ $labels.namespace }}/{{ $labels.pod }})
+          description:
+            "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for
+            longer than 15 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesPodCrashLooping
+        expr: "increase(kube_pod_container_status_restarts_total[1m]) > 3"
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Kubernetes pod crash looping ({{ $labels.namespace }}/{{ $labels.pod }})
+          description:
+            "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping\n  VALUE = {{ $value
+            }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesReplicasetReplicasMismatch # spec vs. ready replica counts differ for 10m
+        expr: "kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas"
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary:
+            Kubernetes ReplicaSet replicas mismatch ({{ $labels.namespace }}/{{ $labels.replicaset }})
+          description:
+            "ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch\n  VALUE
+            = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesDeploymentReplicasMismatch
+        expr: "kube_deployment_spec_replicas != kube_deployment_status_replicas_available"
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary:
+            Kubernetes Deployment replicas mismatch ({{ $labels.namespace }}/{{ $labels.deployment
+            }})
+          description:
+            "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch\n  VALUE
+            = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesStatefulsetReplicasMismatch
+        expr: "kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas"
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})
+          description:
+            "StatefulSet does not match the expected number of replicas.\n  VALUE = {{ $value
+            }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesDeploymentGenerationMismatch
+        expr: "kube_deployment_status_observed_generation != kube_deployment_metadata_generation"
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          summary:
+            Kubernetes Deployment generation mismatch ({{ $labels.namespace }}/{{ $labels.deployment
+            }})
+          description:
+            "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been
+            rolled back.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesStatefulsetGenerationMismatch
+        expr: "kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation"
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          summary:
+            Kubernetes StatefulSet generation mismatch ({{ $labels.namespace }}/{{
+            $labels.statefulset }})
+          description:
+            "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not
+            been rolled back.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesStatefulsetUpdateNotRolledOut
+        expr:
+          "max without (revision) (kube_statefulset_status_current_revision unless
+          kube_statefulset_status_update_revision) * (kube_statefulset_replicas !=
+          kube_statefulset_status_replicas_updated)"
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary:
+            Kubernetes StatefulSet update not rolled out ({{ $labels.namespace }}/{{
+            $labels.statefulset }})
+          description:
+            "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been
+            rolled out.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesDaemonsetRolloutStuck
+        expr:
+          "kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100
+          < 100 or kube_daemonset_status_desired_number_scheduled -
+          kube_daemonset_status_current_number_scheduled > 0"
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary:
+            Kubernetes DaemonSet rollout stuck ({{ $labels.namespace }}/{{ $labels.daemonset }})
+          description:
+            "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not
+            scheduled or not ready\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesDaemonsetMisscheduled
+        expr: "kube_daemonset_status_number_misscheduled > 0"
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary:
+            Kubernetes DaemonSet misscheduled ({{ $labels.namespace }}/{{ $labels.daemonset }})
+          description:
+            "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running
+            where they are not supposed to run\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesCronjobTooLong
+        expr: "time() - kube_cronjob_next_schedule_time > 3600"
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Kubernetes CronJob too long ({{ $labels.namespace }}/{{ $labels.cronjob }})
+          description:
+            "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to
+            complete.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesJobSlowCompletion
+        expr: "kube_job_spec_completions - kube_job_status_succeeded - kube_job_status_failed > 0"
+        for: 12h
+        labels:
+          severity: critical
+        annotations:
+          summary: Kubernetes job slow completion ({{ $labels.namespace }}/{{ $labels.job_name }})
+          description:
+            "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in
+            time.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesApiServerErrors
+        expr:
+          'sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) /
+          sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3'
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Kubernetes API server errors (instance {{ $labels.instance }})
+          description:
+            "Kubernetes API server is experiencing high error rate\n  VALUE = {{ $value }}\n  LABELS
+            = {{ $labels }}"
+
+      - alert: KubernetesApiClientErrors
+        expr:
+          '(sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) /
+          sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1'
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Kubernetes API client errors (instance {{ $labels.instance }})
+          description:
+            "Kubernetes API client is experiencing high error rate\n  VALUE = {{ $value }}\n  LABELS
+            = {{ $labels }}"
+
+      - alert: KubernetesClientCertificateExpiresNextWeek
+        expr:
+          'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and
+          histogram_quantile(0.01, sum by (job, le)
+          (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) <
+          7*24*60*60'
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }})
+          description:
+            "A client certificate used to authenticate to the apiserver is expiring next
+            week.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesClientCertificateExpiresSoon
+        expr:
+          'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and
+          histogram_quantile(0.01, sum by (job, le)
+          (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) <
+          24*60*60'
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Kubernetes client certificate expires soon (instance {{ $labels.instance }})
+          description:
+            "A client certificate used to authenticate to the apiserver is expiring in less than
+            24.0 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: KubernetesApiServerLatency
+        expr:
+          'histogram_quantile(0.99,
+          sum(rate(apiserver_request_duration_seconds_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}
+          [10m])) WITHOUT (instance, resource)) > 1'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Kubernetes API server latency (instance {{ $labels.instance }})
+          description:
+            "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{
+            $labels.verb }} {{ $labels.resource }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels
+            }}"
@@ -0,0 +1,508 @@
+groups:
+  - name: NodeExporter
+
+    rules:
+      # Warns when available memory (MemAvailable / MemTotal) stays below 10% for 2m.
+      # The `* on(instance) group_left (nodename) node_uname_info{...}` join attaches the
+      # nodename label of the matching instance to the result.
+      - alert: HostOutOfMemory
+        expr:
+          '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance)
+          group_left (nodename) node_uname_info{nodename=~".+"}'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host out of memory (instance {{ $labels.instance }})
+          description:
+            "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels
+            }}"
+
+      # Warns when the major page fault rate (node_vmstat_pgmajfault, 1m rate) exceeds 1000 per
+      # second for 2m. The node_uname_info join adds the nodename label.
+      - alert: HostMemoryUnderMemoryPressure
+        expr:
+          '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename)
+          node_uname_info{nodename=~".+"}'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host memory under memory pressure (instance {{ $labels.instance }})
+          description:
+            "The node is under heavy memory pressure. High rate of major page faults\n  VALUE = {{
+            $value }}\n  LABELS = {{ $labels }}"
+
+      # Informational: memory usage, computed as 100 minus the 30m average of MemAvailable
+      # relative to MemTotal, has stayed below 20% for one week.
+      - alert: HostMemoryIsUnderutilized
+        expr:
+          '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes *
+          100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+        for: 1w
+        labels:
+          severity: info
+        annotations:
+          summary: Host Memory is underutilized (instance {{ $labels.instance }})
+          description:
+            "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{
+            $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      # Warns when the per-instance sum of network receive throughput (2m rate, bytes divided by
+      # 1024 twice) exceeds 100 for 5m.
+      - alert: HostUnusualNetworkThroughputIn
+        expr:
+          '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) *
+          on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host unusual network throughput in (instance {{ $labels.instance }})
+          description:
+            "Host network interfaces are probably receiving too much data (> 100 MB/s)\n  VALUE = {{
+            $value }}\n  LABELS = {{ $labels }}"
+
+      # Warns when the per-instance sum of network transmit throughput (2m rate, bytes divided by
+      # 1024 twice) exceeds 100 for 5m.
+      - alert: HostUnusualNetworkThroughputOut
+        expr:
+          '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) *
+          on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host unusual network throughput out (instance {{ $labels.instance }})
+          description:
+            "Host network interfaces are probably sending too much data (> 100 MB/s)\n  VALUE = {{
+            $value }}\n  LABELS = {{ $labels }}"
+
+      # Warns when the per-instance sum of disk read throughput (2m rate, bytes divided by 1024
+      # twice) exceeds 50 for 5m.
+      - alert: HostUnusualDiskReadRate
+        expr:
+          '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) *
+          on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host unusual disk read rate (instance {{ $labels.instance }})
+          description:
+            "Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}\n  LABELS =
+            {{ $labels }}"
+
+      # Warns when the per-instance sum of disk write throughput (2m rate, bytes divided by 1024
+      # twice) exceeds 50 for 2m.
+      - alert: HostUnusualDiskWriteRate
+        expr:
+          '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) *
+          on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host unusual disk write rate (instance {{ $labels.instance }})
+          description:
+            "Disk is probably writing too much data (> 50 MB/s)\n  VALUE = {{ $value }}\n  LABELS =
+            {{ $labels }}"
+
+      # Warns when a filesystem that is not read-only (node_filesystem_readonly == 0) has had
+      # less than 10% of its size available for 2m.
+      - alert: HostOutOfDiskSpace
+        expr:
+          '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance,
+          device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename)
+          node_uname_info{nodename=~".+"}'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host out of disk space (instance {{ $labels.instance }})
+          description:
+            "Disk is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      # Warns when a writable filesystem is already below 10% available AND a linear prediction
+      # based on the last 1h projects its available bytes below 0 within 24 * 3600 seconds.
+      # tmpfs filesystems are excluded from the prediction. Must hold for 2m.
+      - alert: HostDiskWillFillIn24Hours
+        expr:
+          '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance,
+          device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 *
+          3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) *
+          on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
+          description:
+            "Filesystem is predicted to run out of space within the next 24 hours at current write
+            rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      # Warns when a writable filesystem (msdosfs excluded) has had fewer than 10% of its inodes
+      # free for 2m.
+      - alert: HostOutOfInodes
+        expr:
+          '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"}
+          * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) *
+          on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host out of inodes (instance {{ $labels.instance }})
+          description:
+            "Disk is almost running out of available inodes (< 10% left)\n  VALUE = {{ $value
+            }}\n  LABELS = {{ $labels }}"
+
+      # Critical, fires immediately (for: 0m) when node_filesystem_device_error equals 1.
+      # Unlike the other rules in this group, no node_uname_info join is applied here.
+      - alert: HostFilesystemDeviceError
+        expr: "node_filesystem_device_error == 1"
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Host filesystem device error (instance {{ $labels.instance }})
+          description:
+            "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }}
+            filesystem\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      # Warns when a writable filesystem (msdosfs excluded) has fewer than 10% inodes free AND a
+      # linear prediction based on the last 1h projects free inodes below 0 within 24 * 3600
+      # seconds. Must hold for 2m.
+      - alert: HostInodesWillFillIn24Hours
+        expr:
+          '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"}
+          * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 *
+          3600) < 0 and ON (instance, device, mountpoint)
+          node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename)
+          node_uname_info{nodename=~".+"}'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
+          description:
+            "Filesystem is predicted to run out of inodes within the next 24 hours at current write
+            rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      # Warns when the average time per completed read (read time rate / completed reads rate,
+      # both over 1m) exceeds 0.1s for 2m. The `> 0` guard restricts the rule to disks that are
+      # actually completing reads.
+      - alert: HostUnusualDiskReadLatency
+        expr:
+          '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m])
+          > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left
+          (nodename) node_uname_info{nodename=~".+"}'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host unusual disk read latency (instance {{ $labels.instance }})
+          description:
+            "Disk latency is growing (read operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS =
+            {{ $labels }}"
+
+      # Warns when the average time per completed write (write time rate / completed writes rate,
+      # both over 1m) exceeds 0.1s for 2m. The `> 0` guard restricts the rule to disks that are
+      # actually completing writes.
+      - alert: HostUnusualDiskWriteLatency
+        expr:
+          '(rate(node_disk_write_time_seconds_total[1m]) /
+          rate(node_disk_writes_completed_total[1m]) > 0.1 and
+          rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename)
+          node_uname_info{nodename=~".+"}'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host unusual disk write latency (instance {{ $labels.instance }})
+          description:
+            "Disk latency is growing (write operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS =
+            {{ $labels }}"
+
+      # Warns when non-idle CPU time (2m rate), averaged per mode and instance and then summed
+      # per instance, stays above 0.8 for 10m.
+      - alert: HostHighCpuLoad
+        expr:
+          '(sum by (instance) (avg by (mode, instance)
+          (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left
+          (nodename) node_uname_info{nodename=~".+"}'
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host high CPU load (instance {{ $labels.instance }})
+          description: "CPU load is > 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      # Informational: 100 minus the idle-time rate (30m) times 100 has stayed below 20 for one
+      # week.
+      # NOTE(review): the idle rate is not aggregated over the cpu label, so this presumably
+      # evaluates each CPU series separately rather than the host as a whole - confirm intent.
+      - alert: HostCpuIsUnderutilized
+        expr:
+          '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance)
+          group_left (nodename) node_uname_info{nodename=~".+"}'
+        for: 1w
+        labels:
+          severity: info
+        annotations:
+          summary: Host CPU is underutilized (instance {{ $labels.instance }})
+          description:
+            "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n  VALUE = {{
+            $value }}\n  LABELS = {{ $labels }}"
+
+      # Warns immediately (for: 0m) when the per-instance average of steal-mode CPU time
+      # (5m rate) exceeds 10%.
+      - alert: HostCpuStealNoisyNeighbor
+        expr:
+          '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) *
+          on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
+          description:
+            "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may
+            be out of credit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      # Warns immediately (for: 0m) when the per-instance average of iowait-mode CPU time
+      # (5m rate) exceeds 10%.
+      - alert: HostCpuHighIowait
+        expr:
+          '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) *
+          on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host CPU high iowait (instance {{ $labels.instance }})
+          description:
+            "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n  VALUE = {{
+            $value }}\n  LABELS = {{ $labels }}"
+
+      # Warns when node_disk_io_time_seconds_total grows by more than 0.5 seconds per second
+      # (1m rate) for 5m.
+      - alert: HostUnusualDiskIo
+        expr:
+          '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename)
+          node_uname_info{nodename=~".+"}'
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host unusual disk IO (instance {{ $labels.instance }})
+          description:
+            "Time spent in IO is too high on {{ $labels.instance }}. Check storage for
+            issues.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      # Warns immediately (for: 0m) when the context switch rate (5m) divided by the number of
+      # idle-mode node_cpu_seconds_total series (used as the CPU count) exceeds 10000.
+      - alert: HostContextSwitching
+        expr:
+          '((rate(node_context_switches_total[5m])) / (count without(cpu, mode)
+          (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename)
+          node_uname_info{nodename=~".+"}'
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host context switching (instance {{ $labels.instance }})
+          description:
+            "Context switching is growing on the node (> 10000 / CPU / s)\n  VALUE = {{ $value
+            }}\n  LABELS = {{ $labels }}"
+
+      # Warns when used swap, computed as (1 - SwapFree / SwapTotal) * 100, stays above 80 for 2m.
+      - alert: HostSwapIsFillingUp
+        expr:
+          '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) *
+          on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host swap is filling up (instance {{ $labels.instance }})
+          description: "Swap is filling up (>80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      # Warns immediately (for: 0m) for every node_systemd_unit_state series with state="failed"
+      # whose value equals 1.
+      - alert: HostSystemdServiceCrashed
+        expr:
+          '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename)
+          node_uname_info{nodename=~".+"}'
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host systemd service crashed (instance {{ $labels.instance }})
+          description: "systemd service crashed\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      # Warns when a hwmon temperature reading, joined with node_hwmon_sensor_label (sensors
+      # labelled "tctl" are excluded), stays above 75 for 5m.
+      - alert: HostPhysicalComponentTooHot
+        expr:
+          '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor)
+          node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename)
+          node_uname_info{nodename=~".+"}'
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host physical component too hot (instance {{ $labels.instance }})
+          description:
+            "Physical hardware component too hot\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      # Critical, fires immediately (for: 0m) when node_hwmon_temp_crit_alarm_celsius equals 1.
+      - alert: HostNodeOvertemperatureAlarm
+        expr:
+          '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename)
+          node_uname_info{nodename=~".+"}'
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Host node overtemperature alarm (instance {{ $labels.instance }})
+          description:
+            "Physical node temperature alarm triggered\n  VALUE = {{ $value }}\n  LABELS = {{
+            $labels }}"
+
+      # Critical, fires immediately (for: 0m) when node_md_state{state="inactive"} is above 0.
+      # The description identifies the array via the `device` label.
+      - alert: HostRaidArrayGotInactive
+        expr:
+          '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename)
+          node_uname_info{nodename=~".+"}'
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Host RAID array got inactive (instance {{ $labels.instance }})
+          description:
+            "RAID array {{ $labels.device }} is in a degraded state due to one or more disk
+            failures. The number of spare drives is insufficient to fix the issue
+            automatically.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      # Warns when node_md_disks{state="failed"} has been above 0 for 2m, i.e. at least one
+      # member disk of an md array is reported as failed.
+      # The array is identified by the `device` label (the same label the RAID "inactive" rule
+      # of this group uses); the previous `md_device` reference does not match that label and
+      # would render as an empty string.
+      - alert: HostRaidDiskFailure
+        expr:
+          '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename)
+          node_uname_info{nodename=~".+"}'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host RAID disk failure (instance {{ $labels.instance }})
+          description:
+            "At least one device in RAID array on {{ $labels.instance }} failed. Array {{
+            $labels.device }} needs attention and possibly a disk swap\n  VALUE = {{ $value
+            }}\n  LABELS = {{ $labels }}"
+
+      # Warns when more than one distinct kernel version (major.minor.patch extracted from the
+      # `release` label of node_uname_info) has been seen across all hosts for 6h.
+      #
+      # The count() aggregation yields a single sample without any labels, so it must NOT be
+      # joined with node_uname_info `on(instance)`: a sample with an empty instance label never
+      # matches a scraped target, which made the previous expression return nothing and the
+      # alert unable to fire. For the same reason the summary no longer references
+      # $labels.instance - this is a fleet-wide alert.
+      - alert: HostKernelVersionDeviations
+        expr:
+          'count(sum(label_replace(node_uname_info, "kernel", "$1", "release",
+          "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1'
+        for: 6h
+        labels:
+          severity: warning
+        annotations:
+          summary: Host kernel version deviations
+          description:
+            "Different kernel versions are running\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      # Warns immediately (for: 0m) when node_vmstat_oom_kill increased within the last minute.
+      - alert: HostOomKillDetected
+        expr:
+          '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename)
+          node_uname_info{nodename=~".+"}'
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host OOM kill detected (instance {{ $labels.instance }})
+          description: "OOM kill detected\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      # Informational, fires immediately (for: 0m) when the EDAC correctable error counter
+      # increased within the last minute. The value reported in the description is that
+      # 1-minute increase, so the text states the same window as the `[1m]` range in the
+      # expression (it previously claimed "5 minutes").
+      - alert: HostEdacCorrectableErrorsDetected
+        expr:
+          '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left
+          (nodename) node_uname_info{nodename=~".+"}'
+        for: 0m
+        labels:
+          severity: info
+        annotations:
+          summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
+          description:
+            "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory
+            errors reported by EDAC in the last minute.\n  VALUE = {{ $value }}\n  LABELS = {{
+            $labels }}"
+
+      # Warns immediately (for: 0m) whenever the EDAC uncorrectable error counter is above 0.
+      # The expression tests the raw cumulative counter (no rate()/increase() window), so the
+      # alert keeps firing until the counter resets and $value is the running total. The
+      # description therefore no longer claims a "last 5 minutes" window.
+      - alert: HostEdacUncorrectableErrorsDetected
+        expr:
+          '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename)
+          node_uname_info{nodename=~".+"}'
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
+          description:
+            "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory
+            errors reported by EDAC in total (cumulative counter value).\n  VALUE = {{ $value
+            }}\n  LABELS = {{ $labels }}"
+
+      # Warns when more than 1% of received packets on an interface are errors
+      # (receive errors rate / received packets rate, both over 2m) for 2m.
+      # $value is that ratio, not an error count, so the description renders it as a percentage
+      # instead of formatting it with "%.0f" (which would round typical values to 0).
+      - alert: HostNetworkReceiveErrors
+        expr:
+          '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m])
+          > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host Network Receive Errors (instance {{ $labels.instance }})
+          description:
+            "Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio
+            of {{ $value | humanizePercentage }} over the last two minutes.\n  VALUE = {{ $value
+            }}\n  LABELS = {{ $labels }}"
+
+      # Warns when more than 1% of transmitted packets on an interface are errors
+      # (transmit errors rate / transmitted packets rate, both over 2m) for 2m.
+      # $value is that ratio, not an error count, so the description renders it as a percentage
+      # instead of formatting it with "%.0f" (which would round typical values to 0).
+      - alert: HostNetworkTransmitErrors
+        expr:
+          '(rate(node_network_transmit_errs_total[2m]) /
+          rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename)
+          node_uname_info{nodename=~".+"}'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host Network Transmit Errors (instance {{ $labels.instance }})
+          description:
+            "Host {{ $labels.instance }} interface {{ $labels.device }} has a transmit error ratio
+            of {{ $value | humanizePercentage }} over the last two minutes.\n  VALUE = {{ $value
+            }}\n  LABELS = {{ $labels }}"
+
+      # Warns when receive + transmit throughput (1m rates) relative to node_network_speed_bytes
+      # stays above 0.8 for 1m. Devices whose name starts with tap, vnet, veth or tun are
+      # excluded.
+      # NOTE(review): the additional `< 10000` bound presumably filters out bogus ratios caused
+      # by implausible reported link speeds - confirm.
+      - alert: HostNetworkInterfaceSaturated
+        expr:
+          '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) +
+          rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) /
+          node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) *
+          on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+        for: 1m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host Network Interface Saturated (instance {{ $labels.instance }})
+          description:
+            "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting
+            overloaded.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      # Warns when node_bonding_active differs from node_bonding_slaves for 2m.
+      - alert: HostNetworkBondDegraded
+        expr:
+          '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename)
+          node_uname_info{nodename=~".+"}'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host Network Bond Degraded (instance {{ $labels.instance }})
+          description:
+            "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n  VALUE = {{
+            $value }}\n  LABELS = {{ $labels }}"
+
+      # Warns when the number of conntrack entries stays above 80% of
+      # node_nf_conntrack_entries_limit for 5m.
+      - alert: HostConntrackLimit
+        expr:
+          '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance)
+          group_left (nodename) node_uname_info{nodename=~".+"}'
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host conntrack limit (instance {{ $labels.instance }})
+          description:
+            "The number of conntrack is approaching limit\n  VALUE = {{ $value }}\n  LABELS = {{
+            $labels }}"
+
+      # Warns when the clock offset has been beyond +/-0.05s for 10m and is not converging:
+      # either offset > 0.05 with a non-negative 5m derivative, or offset < -0.05 with a
+      # non-positive 5m derivative.
+      - alert: HostClockSkew
+        expr:
+          '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or
+          (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) *
+          on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host clock skew (instance {{ $labels.instance }})
+          description:
+            "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this
+            host.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      # Warns when the minimum of node_timex_sync_status over the last minute is 0 and
+      # node_timex_maxerror_seconds is at least 16, for 2m.
+      - alert: HostClockNotSynchronising
+        expr:
+          '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) *
+          on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host clock not synchronising (instance {{ $labels.instance }})
+          description:
+            "Clock not synchronising. Ensure NTP is configured on this host.\n  VALUE = {{ $value
+            }}\n  LABELS = {{ $labels }}"
+
+      # Informational: node_reboot_required has been above 0 for 4h.
+      - alert: HostRequiresReboot
+        expr:
+          '(node_reboot_required > 0) * on(instance) group_left (nodename)
+          node_uname_info{nodename=~".+"}'
+        for: 4h
+        labels:
+          severity: info
+        annotations:
+          summary: Host requires reboot (instance {{ $labels.instance }})
+          description:
+            "{{ $labels.instance }} requires a reboot.\n  VALUE = {{ $value }}\n  LABELS = {{
+            $labels }}"
@@ -0,0 +1,157 @@
+# NixOS module: Prometheus server (scraping + alert rules) and Alertmanager (e-mail delivery).
+#
+# Arguments:
+#   config - the NixOS configuration; used to look up the agenix secret holding the SMTP
+#            credentials for Alertmanager.
+#   myvars - project-wide variables; `myvars.networking.hostsAddr.<host>.ipv4` provides the
+#            scrape target addresses.
+{
+  config,
+  myvars,
+  ...
+}: {
+  # https://prometheus.io/docs/prometheus/latest/configuration/configuration/
+  services.prometheus = {
+    enable = true;
+    checkConfig = true;
+    # Only listen on loopback; `webExternalUrl` is the URL used in generated links.
+    listenAddress = "127.0.0.1";
+    port = 9090;
+    webExternalUrl = "http://prometheus.writefor.fun";
+
+    extraFlags = ["--storage.tsdb.retention.time=45d"];
+    # Directory below /var/lib to store Prometheus metrics data.
+    stateDir = "prometheus2";
+
+    # Reload prometheus when configuration file changes (instead of restart).
+    enableReload = true;
+    # https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_read
+    # remoteRead = [];
+
+    # Rules are read from these files.
+    # https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/
+    #
+    # Prometheus supports two types of rules which may be configured
+    # and then evaluated at regular intervals:
+    #   1. Recording rules
+    #      Recording rules allow you to precompute frequently needed or computationally
+    #      expensive expressions and save their result as a new set of time series.
+    #      Querying the precomputed result will then often be much faster than executing the original expression.
+    #      This is especially useful for dashboards, which need to query the same expression repeatedly every time they refresh.
+    #   2. Alerting rules
+    #      Alerting rules allow you to define alert conditions based on Prometheus expression language expressions
+    #      and to send notifications about firing alerts to an external service.
+    ruleFiles = [
+      ./alert_rules/node-exporter.yml
+      ./alert_rules/kubestate-exporter.yml
+      ./alert_rules/etcd_embedded-exporter.yml
+      ./alert_rules/istio_embedded-exporter.yml
+      ./alert_rules/coredns_embedded-exporter.yml
+
+      # ./recording_rules.yml
+    ];
+
+    # Specifies a set of targets and parameters describing how to scrape metrics from them.
+    # https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config
+    scrapeConfigs = [
+      # --- Hosts --- #
+      {
+        job_name = "node-exporter";
+        scrape_interval = "30s";
+        metrics_path = "/metrics";
+        static_configs = [
+          {
+            # All my NixOS hosts: one target per entry of `hostsAddr`, on port 9100.
+            targets =
+              map (addr: "${addr.ipv4}:9100")
+              (builtins.attrValues myvars.networking.hostsAddr);
+            labels.type = "node";
+          }
+        ];
+      }
+
+      # --- Homelab Applications --- #
+
+      {
+        job_name = "dnsmasq-exporter";
+        scrape_interval = "30s";
+        metrics_path = "/metrics";
+        static_configs = [
+          {
+            targets = ["${myvars.networking.hostsAddr.suzi.ipv4}:9153"];
+            labels.type = "app";
+            labels.app = "dnsmasq";
+          }
+        ];
+      }
+
+      {
+        job_name = "v2ray-exporter";
+        scrape_interval = "30s";
+        metrics_path = "/metrics";
+        static_configs = [
+          {
+            targets = ["${myvars.networking.hostsAddr.rakushun.ipv4}:9153"];
+            labels.type = "app";
+            labels.app = "v2ray";
+          }
+        ];
+      }
+
+      {
+        job_name = "sftpgo-embedded-exporter";
+        scrape_interval = "30s";
+        metrics_path = "/metrics";
+        static_configs = [
+          {
+            targets = ["${myvars.networking.hostsAddr.rakushun.ipv4}:10000"];
+            labels.type = "app";
+            # Was "v2ray" (copy-paste from the job above), which attributed sftpgo's metrics
+            # to the wrong application.
+            labels.app = "sftpgo";
+          }
+        ];
+      }
+    ];
+
+    # Specifies Alertmanager instances the Prometheus server sends alerts to.
+    # https://prometheus.io/docs/prometheus/latest/configuration/configuration/#alertmanager_config
+    alertmanagers = [{static_configs = [{targets = ["localhost:9093"];}];}];
+  };
+
+  services.prometheus.alertmanager = {
+    enable = true;
+    listenAddress = "127.0.0.1";
+    port = 9093;
+    webExternalUrl = "http://alertmanager.writefor.fun";
+    logLevel = "info";
+
+    # Provides the $SMTP_* variables referenced in `configuration.global` below.
+    environmentFile = config.age.secrets."alertmanager.env".path;
+    configuration = {
+      global = {
+        # The smarthost and SMTP sender used for mail notifications.
+        smtp_smarthost = "smtp.qq.com:465";
+        smtp_from = "$SMTP_SENDER_EMAIL";
+        smtp_auth_username = "$SMTP_AUTH_USERNAME";
+        smtp_auth_password = "$SMTP_AUTH_PASSWORD";
+        # smtp.qq.com:465 supports SSL only, so STARTTLS must not be required here.
+        # https://service.mail.qq.com/detail/0/310
+        smtp_require_tls = false;
+      };
+      route = {
+        receiver = "default";
+        routes = [
+          {
+            # NOTE(review): none of the scrape jobs in this module attach a `host` label
+            # (only `type` / `app`) - confirm the alerts really carry one, otherwise every
+            # alert ends up in the same group.
+            group_by = ["host"];
+            group_wait = "5m";
+            group_interval = "5m";
+            repeat_interval = "4h";
+            receiver = "default";
+          }
+        ];
+      };
+      receivers = [
+        {
+          name = "default";
+          email_configs = [
+            {
+              to = "ryan4yin@linux.com";
+              # Whether to notify about resolved alerts.
+              send_resolved = true;
+            }
+          ];
+        }
+      ];
+    };
+  };
+}
@@ -0,0 +1,90 @@
+# NixOS module: v2ray as a plain HTTP/SOCKS proxy front-end, plus its Prometheus exporter.
+{
+  # dae (running on aquamarine) does not provide an http/socks5 proxy server, so v2ray is
+  # used here.
+  # https://github.com/v2fly
+  services.v2ray.enable = true;
+  services.v2ray.config = {
+    # for monitoring
+    stats = {};
+    api = {
+      tag = "api";
+      services = ["StatsService"];
+    };
+    policy = {
+      levels."0" = {
+        statsUserUplink = true;
+        statsUserDownlink = true;
+      };
+      system = {
+        statsInboundUplink = true;
+        statsInboundDownlink = true;
+        statsOutboundUplink = true;
+        statsOutboundDownlink = true;
+      };
+    };
+
+    inbounds = [
+      # core inbounds: http proxy on 7890, socks proxy on 7891
+      {
+        protocol = "http";
+        listen = "0.0.0.0";
+        port = 7890;
+      }
+      {
+        protocol = "socks";
+        listen = "0.0.0.0";
+        port = 7891;
+        settings = {
+          auth = "noauth";
+          udp = true;
+        };
+      }
+
+      # for monitoring: loopback-only inbound tagged "api"
+      {
+        tag = "api";
+        protocol = "dokodemo-door";
+        listen = "127.0.0.1";
+        port = 54321;
+        settings.address = "127.0.0.1";
+      }
+    ];
+
+    outbounds = [
+      # forward traffic directly via system's default network (to the dae proxy running on
+      # aquamarine)
+      {
+        tag = "freedom";
+        protocol = "freedom";
+      }
+    ];
+
+    # for monitoring: route everything arriving on the "api" inbound to the "api" tag
+    routing.rules = [
+      {
+        type = "field";
+        inboundTag = ["api"];
+        outboundTag = "api";
+      }
+    ];
+  };
+
+  # https://github.com/NixOS/nixpkgs/blob/nixos-23.11/nixos/modules/services/monitoring/prometheus/exporters/v2ray.nix
+  # https://github.com/wi1dcard/v2ray-exporter
+  services.prometheus.exporters.v2ray = {
+    enable = true;
+    # The exporter talks to the loopback "api" inbound configured above (port 54321).
+    v2rayEndpoint = "127.0.0.1:54321";
+    listenAddress = "0.0.0.0";
+    port = 9153;
+    openFirewall = false;
+  };
+}
@@ -0,0 +1,83 @@
+# NixOS module: nightly restic backup of the Proxmox nodes' /var/lib/vz to an rclone remote.
+#
+# Flow: `backupPrepareCommand` rsyncs every node into a local staging directory, restic backs
+# that directory up, and `backupCleanupCommand` removes the staging directory again.
+{pkgs, ...}: let
+  # Secrets are read from fixed paths below /etc/agenix.
+  passwordFile = "/etc/agenix/restic-password";
+  sshKeyPath = "/etc/agenix/ssh-key-for-restic-backup";
+  rcloneConfigFile = "/etc/agenix/rclone-conf-for-restic-backup";
+in {
+  # https://github.com/NixOS/nixpkgs/blob/nixos-23.11/nixos/modules/services/backup/restic.nix
+  services.restic.backups = {
+    homelab-backup = {
+      inherit passwordFile;
+      initialize = true; # Initialize the repository if it doesn't exist.
+      repository = "rclone:smb-downloads:/Downloads/proxmox-backup/"; # backup to a rclone remote
+
+      # rclone related
+      # rcloneOptions = {
+      #   bwlimit = "100M";  # Limit the bandwidth used by rclone.
+      # };
+      inherit rcloneConfigFile;
+
+      # Which local paths to backup, in addition to ones specified via `dynamicFilesFrom`.
+      paths = [
+        "/tmp/restic-backup-temp"
+      ];
+      #
+      # A script that produces a list of files to back up.  The
+      # results of this command are given to the '--files-from'
+      # option. The result is merged with paths specified via `paths`.
+      # dynamicFilesFrom = "find /home/matt/git -type d -name .git";
+      #
+      # Patterns to exclude when backing up. See
+      #   https://restic.readthedocs.io/en/latest/040_backup.html#excluding-files
+      # for details on syntax.
+      exclude = [];
+
+      # A script that must run before starting the backup process.
+      #
+      # The nushell program is passed to `nu -c` inside a single-quoted shell string, so it
+      # must not contain any apostrophe: the previous comment "cluster's" closed the quote
+      # early and broke the whole command. Keep it apostrophe-free.
+      # The rsync invocation is kept on one line because nushell does not treat a trailing
+      # backslash as a line continuation.
+      # The staging directory is created first: `backupCleanupCommand` deletes it after every
+      # run and rsync only creates the last path component of its destination.
+      backupPrepareCommand = ''
+        ${pkgs.nushell}/bin/nu -c '
+          let pve_nodes = [
+            # nodes of the proxmox cluster
+            "um560"
+            "gtr5"
+            "s500plus"
+
+            # others
+            "kana"
+          ]
+
+          mkdir /tmp/restic-backup-temp
+
+          pve_nodes | each {|it|
+            rsync -avz -e "ssh -i ${sshKeyPath}" $"($it):/var/lib/vz" $"/tmp/restic-backup-temp/($it)"
+          }
+        '
+      '';
+      # A script that must run after finishing the backup process.
+      backupCleanupCommand = "rm -rf /tmp/restic-backup-temp";
+
+      # Extra extended options to be passed to the restic --option flag.
+      # extraOptions = [];
+
+      # Extra arguments passed to restic backup.
+      # extraBackupArgs = [
+      #   "--exclude-file=/etc/restic/excludes-list"
+      # ];
+
+      # repository = "/mnt/backup-hdd"; # backup to a local directory
+      # When to run the backup. See {manpage}`systemd.timer(5)` for details.
+      timerConfig = {
+        OnCalendar = "01:30";
+        RandomizedDelaySec = "1h";
+      };
+      # A list of options (--keep-* et al.) for 'restic forget --prune',
+      # to automatically prune old snapshots.
+      # The 'forget' command is run *after* the 'backup' command, so
+      # keep that in mind when constructing the --keep-* options.
+      pruneOpts = [
+        "--keep-daily 3"
+        "--keep-weekly 3"
+        "--keep-monthly 3"
+        "--keep-yearly 3"
+      ];
+    };
+  };
+}
@@ -0,0 +1,97 @@
+{config, ...}: {
+  # SFTPGo file server: WebDAV, web UI and REST API bound to localhost, plus Prometheus telemetry.
+  # Read SFTPGO_DEFAULT_ADMIN_USERNAME and SFTPGO_DEFAULT_ADMIN_PASSWORD from a file
+  systemd.services.sftpgo.serviceConfig.EnvironmentFile = config.age.secrets."sftpgo.env".path;
+
+  services.sftpgo = {
+    enable = true;
+    user = "sftpgo";
+    dataDir = "/var/lib/sftpgo";
+    extraArgs = ["--log-level" "info"];
+    # https://github.com/drakkan/sftpgo/blob/2.5.x/docs/full-configuration.md
+    settings = {
+      common = {
+        # Auto-blocking policy for SFTPGo; it helps to prevent DoS (Denial of Service) and brute force password guessing.
+        # NOTE: the key is `enabled` (not `enable`) in the configuration reference linked above.
+        defender = {
+          enabled = true;
+        };
+      };
+      # Where to store sftpgo's data
+      data_provider = {
+        driver = "sqlite";
+        name = "sftpgo.db";
+        password_hashing = {
+          algo = "argon2id";
+          # options for argon2id hashing algorithm.
+          # The memory and iterations parameters control the computational cost of hashing the password.
+          argon2_options = {
+            memory = 65536; # KiB
+            iterations = 2; # The number of iterations over the memory.
+            parallelism = 2; # The number of threads (or lanes) used by the algorithm.
+          };
+        };
+        password_validation = {
+          # What Entropy Value Should I Use?
+          # somewhere in the 50-70 range seems "reasonable".
+          # https://github.com/wagslane/go-password-validator#what-entropy-value-should-i-use
+          admins.min_entropy = 60;
+          users.min_entropy = 60;
+        };
+        # Cache passwords in memory to avoid hashing the same password multiple times (hashing is expensive).
+        password_caching = true;
+        # create the default admin user via environment variables
+        # SFTPGO_DEFAULT_ADMIN_USERNAME and SFTPGO_DEFAULT_ADMIN_PASSWORD
+        create_default_admin = true;
+      };
+
+      # WebDAV is a popular protocol for file sharing, better than CIFS/SMB, NFS, etc.
+      # it's safe to use WebDAV over HTTPS on public networks.
+      webdavd.bindings = [
+        {
+          address = "127.0.0.1";
+          port = 3303;
+        }
+      ];
+      # HTTP Server provides a simple web interface to manage the server.
+      httpd.bindings = [
+        {
+          address = "127.0.0.1";
+          enable_https = false;
+          port = 3302;
+          # NOTE(review): upstream docs say this header is only trusted from `proxy_allowed` addresses - confirm.
+          client_ip_proxy_header = "X-Forwarded-For";
+          # a basic built-in web interface that allows you to manage users,
+          # virtual folders, admins and connections.
+          # url: http://127.0.0.1:3302/web/admin
+          enable_web_admin = true;
+          # A basic front-end web interface for your users.
+          # It allows end-users to browse and manage their files and change their credentials.
+          enable_web_client = true;
+          enable_rest_api = true;
+        }
+      ];
+      # prometheus metrics
+      telemetry = {
+        bind_port = 10000;
+        bind_address = "0.0.0.0";
+        # auth_user_file = "";
+      };
+      # multi-factor authentication settings
+      mfa.totp = [
+        {
+          # Unique configuration name, not visible to the authentication apps.
+          # Should not be changed after the first user has been created.
+          name = "SFTPGo";
+          # Name of the issuing Organization/Company
+          issuer = "SFTPGo";
+          # Algorithm to use for HMAC
+          # Currently Google Authenticator app on iPhone seems to only support sha1
+          algo = "sha1";
+        }
+      ];
+      # SMTP configuration enables SFTPGo email sending capabilities
+      # smtp = {};
+    };
+  };
+}
@@ -0,0 +1,117 @@
+{
+  config,
+  myvars,
+  ...
+}: let
+  dataDir = "/var/lib/transmission"; # service home; the download/incomplete/watch dirs live below it
+  name = "transmission"; # used as both the service user and group
+in {
+  # the headless Transmission BitTorrent daemon
+  # https://github.com/NixOS/nixpkgs/blob/nixos-23.11/nixos/modules/services/torrent/transmission.nix
+  # https://wiki.archlinux.org/title/transmission
+  services.transmission = {
+    enable = true;
+    user = name;
+    group = name;
+    home = dataDir;
+    downloadDirPermissions = "0770";
+
+    # Whether to enable tweaking of kernel parameters to open many more connections at the same time.
+    # Note that you may also want to increase peer-limit-global.
+    # And be aware that these settings are quite aggressive and might not suit your regular desktop use.
+    # For instance, SSH sessions may time out more easily.
+    performanceNetParameters = true;
+
+    # Path to a JSON file to be merged with the settings.
+    # Useful to merge a file which is better kept out of the Nix store to set secret config parameters like `rpc-password`.
+    credentialsFile = config.age.secrets."transmission-credentials.json".path;
+
+    # Whether to open the RPC port in the firewall.
+    openRPCPort = false;
+    openPeerPorts = true;
+
+    # https://github.com/transmission/transmission/blob/main/docs/Editing-Configuration-Files.md
+    settings = {
+      # 0 = None, 1 = Critical, 2 = Error, 3 = Warn, 4 = Info, 5 = Debug, 6 = Trace;
+      message-level = 3;
+
+      # Encryption may help get around some ISP filtering,
+      # but at the cost of slightly higher CPU use.
+      # 0 = Prefer unencrypted connections,
+      # 1 = Prefer encrypted connections,
+      # 2 = Require encrypted connections; default = 1
+      encryption = 2;
+
+      # rpc = Web Interface
+      rpc-port = 9091;
+      rpc-bind-address = "127.0.0.1";
+      anti-brute-force-enabled = true;
+      # After this amount of failed authentication attempts is surpassed,
+      # the RPC server will deny any further authentication attempts until it is restarted.
+      # This is not tracked per IP but in total.
+      anti-brute-force-threshold = 20;
+      rpc-authentication-required = true;
+
+      # Comma-delimited list of IP addresses.
+      # Wildcards allowed using '*'. Example: "127.0.0.*,192.168.*.*",
+      rpc-whitelist-enabled = true;
+      rpc-whitelist = "127.0.0.*,192.168.*.*";
+      # Comma-delimited list of domain names.
+      # Wildcards allowed using '*'. Example: "*.foo.org,example.com",
+      rpc-host-whitelist-enabled = true;
+      rpc-host-whitelist = "*.writefor.fun,localhost,192.168.5.*";
+      # Login name for the RPC interface; the matching password should come from credentialsFile.
+      rpc-username = myvars.username;
+      # rpc-password = "test"; # you'd better use the credentialsFile for this.
+
+      incomplete-dir-enabled = true;
+      incomplete-dir = "${dataDir}/incomplete";
+      download-dir = "${dataDir}/downloads";
+
+      # Watch a directory for torrent files and add them to transmission.
+      watch-dir-enabled = false;
+      watch-dir = "${dataDir}/watch";
+      # Whether to enable Micro Transport Protocol (µTP).
+      utp-enabled = true;
+      # Executable to be run at torrent completion.
+      script-torrent-done-enabled = false;
+      # script-torrent-done-filename = "/path/to/script";
+
+      # Enable Local Peer Discovery (LPD).
+      lpd-enabled = true;
+      # The peer port to listen for incoming connections.
+      peer-port = 51413;
+      # Enable UPnP or NAT-PMP to forward a port through your firewall(NAT).
+      # https://github.com/transmission/transmission/blob/main/docs/Port-Forwarding-Guide.md
+      port-forwarding-enabled = true;
+
+      # "normal" speed limits
+      speed-limit-down-enabled = true;
+      speed-limit-down = 30000; # KB/s
+      speed-limit-up-enabled = true;
+      speed-limit-up = 500; # KB/s
+      upload-slots-per-torrent = 8;
+
+      # Start torrents as soon as they are added
+      start-added-torrents = true;
+
+      # Queuing
+      # When true, Transmission will only download
+      # download-queue-size non-stalled torrents at once.
+      download-queue-enabled = true;
+      download-queue-size = 5;
+
+      # When true, torrents that have not shared data for
+      # queue-stalled-minutes are treated as 'stalled'
+      # and are not counted against the download-queue-size
+      # and seed-queue-size limits.
+      queue-stalled-enabled = true;
+      queue-stalled-minutes = 60;
+
+      # When true, Transmission will only seed seed-queue-size
+      # non-stalled torrents at once.
+      seed-queue-enabled = true;
+      seed-queue-size = 10;
+    };
+  };
+}
@@ -0,0 +1,12 @@
+{
+  # Uptime Kuma; NixOS module source:
+  # https://github.com/NixOS/nixpkgs/blob/nixos-23.11/nixos/modules/services/monitoring/uptime-kuma.nix
+  services.uptime-kuma.enable = true;
+  # Settings named after Uptime Kuma's environment variables, see
+  # https://github.com/louislam/uptime-kuma/wiki/Environment-Variables
+  services.uptime-kuma.settings = {
+    UPTIME_KUMA_HOST = "127.0.0.1";
+    UPTIME_KUMA_PORT = "3350";
+    DATA_DIR = "/var/lib/uptime-kuma/";
+  };
+}