feat: migrate services to aqua

2026-07-14 08:42:40 +02:00 · 2024-07-31 11:11:14 +08:00
parent 80e0bcf031
commit b671c05db9
38 changed files with 11 additions and 8 deletions
@@ -1,97 +0,0 @@
-{
-  pkgs,
-  config,
-  wallpapers,
-  ...
-}: let
-  hostCommonConfig = ''
-    encode zstd gzip
-    tls ${../../certs/ecc-server.crt} ${config.age.secrets."certs/ecc-server.key".path} {
-      protocols tls1.3 tls1.3
-      curves x25519 secp384r1 secp521r1
-    }
-  '';
-in {
-  services.caddy = {
-    enable = true;
-    # Reload Caddy instead of restarting it when configuration file changes.
-    enableReload = true;
-    user = "caddy"; # User account under which caddy runs.
-    dataDir = "/var/lib/caddy";
-    logDir = "/var/log/caddy";
-
-    # Additional lines of configuration appended to the global config section of the Caddyfile.
-    # Refer to https://caddyserver.com/docs/caddyfile/options#global-options for details on supported values.
-    globalConfig = ''
-      http_port    80
-      https_port   443
-      auto_https   disable_certs
-    '';
-
-    # Dashboard
-    virtualHosts."home.writefor.fun".extraConfig = ''
-      ${hostCommonConfig}
-      reverse_proxy http://localhost:4401
-    '';
-
-    # https://caddyserver.com/docs/caddyfile/directives/file_server
-    virtualHosts."file.writefor.fun".extraConfig = ''
-      root * /var/lib/caddy/fileserver/
-      ${hostCommonConfig}
-      file_server browse {
-        hide .git
-        precompressed zstd br gzip
-      }
-    '';
-
-    virtualHosts."git.writefor.fun".extraConfig = ''
-      encode zstd gzip
-      reverse_proxy http://localhost:3301
-    '';
-    virtualHosts."sftpgo.writefor.fun".extraConfig = ''
-      encode zstd gzip
-      reverse_proxy http://localhost:3302
-    '';
-    virtualHosts."webdav.writefor.fun".extraConfig = ''
-      encode zstd gzip
-      reverse_proxy http://localhost:3303
-    '';
-    virtualHosts."transmission.writefor.fun".extraConfig = ''
-      encode zstd gzip
-      reverse_proxy http://localhost:9091
-    '';
-
-    # Monitoring
-    virtualHosts."uptime-kuma.writefor.fun".extraConfig = ''
-      encode zstd gzip
-      reverse_proxy http://localhost:3350
-    '';
-    virtualHosts."grafana.writefor.fun".extraConfig = ''
-      encode zstd gzip
-      reverse_proxy http://localhost:3351
-    '';
-    virtualHosts."prometheus.writefor.fun".extraConfig = ''
-      encode zstd gzip
-      reverse_proxy http://localhost:9090
-    '';
-    virtualHosts."alertmanager.writefor.fun".extraConfig = ''
-      encode zstd gzip
-      reverse_proxy http://localhost:9093
-    '';
-  };
-  networking.firewall.allowedTCPPorts = [80 443];
-
-  # Create Directories
-  systemd.tmpfiles.rules = [
-    "d /var/lib/caddy/fileserver/ 0755 caddy caddy"
-    # directory for virtual machine's images
-    "d /var/lib/caddy/fileserver/vms 0755 caddy caddy"
-  ];
-
-  # Sync all my wallpapers into /var/lib/caddy/fileserver/wallpapers,
-  # so that they are served by the file.writefor.fun file server.
-  system.activationScripts.installCaddyWallpapers = ''
-    mkdir -p /var/lib/caddy/fileserver/wallpapers
-    ${pkgs.rsync}/bin/rsync -avz --chmod=D2755,F644 ${wallpapers}/ /var/lib/caddy/fileserver/wallpapers/
-  '';
-}
@@ -1,3 +0,0 @@
-{mylib, ...}: {
-  imports = mylib.scanPaths ./.;
-}
@@ -1,88 +0,0 @@
-{pkgs, ...}: let
-in {
-  # https://github.com/NixOS/nixpkgs/blob/nixos-24.05/nixos/modules/services/misc/gitea.nix
-  services.gitea = {
-    enable = true;
-    user = "gitea";
-    group = "gitea";
-    stateDir = "/var/lib/gitea";
-    appName = "Ryan Yin's Gitea Service";
-    lfs.enable = true;
-    # Enable a timer that runs gitea dump to generate backup-files of the current gitea database and repositories.
-    dump = {
-      enable = false;
-      interval = "hourly";
-      file = "gitea-dump";
-      type = "tar.zst";
-    };
-    # Path to a file containing the SMTP password.
-    # mailerPasswordFile = "";
-    settings = {
-      server = {
-        SSH_PORT = 2222;
-        PROTOCOL = "http";
-        HTTP_PORT = 3301;
-        HTTP_ADDR = "127.0.0.1";
-        DOMAIN = "git.writefor.fun";
-      };
-      # one of "Trace", "Debug", "Info", "Warn", "Error", "Critical"
-      log.LEVEL = "Info";
-      session.COOKIE_SECURE = false;
-      service.DISABLE_REGISTRATION = true;
-
-      # "cron.sync_external_users" = {
-      #   RUN_AT_START = true;
-      #   SCHEDULE = "@every 24h";
-      #   UPDATE_EXISTING = true;
-      # };
-      mailer = {
-        ENABLED = true;
-        MAILER_TYPE = "sendmail";
-        FROM = "do-not-reply@writefor.fun";
-        SENDMAIL_PATH = "${pkgs.system-sendmail}/bin/sendmail";
-      };
-      other = {
-        SHOW_FOOTER_VERSION = false;
-      };
-    };
-    database = {
-      type = "sqlite3";
-      # create a local database automatically.
-      createDatabase = true;
-    };
-  };
-
-  # services.gitea-actions-runner.instances."default" = {
-  #   enable = true;
-  #   name = "default";
-  #   labels = [
-  #     # provide a debian base with nodejs for actions
-  #     "debian-latest:docker://node:18-bullseye"
-  #     # fake the ubuntu name, because node provides no ubuntu builds
-  #     "ubuntu-latest:docker://node:18-bullseye"
-  #     # provide native execution on the host
-  #     "native:host"
-  #   ];
-  #   gitea = "http://git.writefor.fun";
-  #   # Path to an environment file,
-  #   # containing the TOKEN environment variable,
-  #   # that holds a token to register at the configured Gitea instance.
-  #   tokenFile = "xxx"; # use agenix for secrets.
-  #   # Configuration for act_runner daemon.
-  #   # For an example configuration, see:
-  #   #  https://gitea.com/gitea/act_runner/src/branch/main/internal/pkg/config/config.example.yaml
-  #   settings = {};
-  #   # List of packages, that are available to actions,
-  #   # when the runner is configured with a host execution label.
-  #   hostPackages = with pkgs; [
-  #     bash
-  #     coreutils
-  #     curl
-  #     gawk
-  #     gitMinimal
-  #     gnused
-  #     nodejs
-  #     wget
-  #   ];
-  # };
-}
@@ -1,20 +0,0 @@
-apiVersion: 1
-
-providers:
-  # <string> a unique provider name. Required
-  - name: "Dashboards"
-    # <int> Org id. Default to 1
-    orgId: 1
-    # <string> provider type. Default to 'file'
-    type: file
-    # <bool> disable dashboard deletion
-    disableDeletion: false
-    # <int> how often Grafana will scan for changed dashboards
-    updateIntervalSeconds: 20
-    # <bool> allow updating provisioned dashboards from the UI
-    allowUiUpdates: false
-    options:
-      # <string, required> path to dashboard files on disk. Required when using the 'file' type
-      path: /etc/grafana/dashboards
-      # <bool> use folder names from filesystem to create folders in Grafana
-      foldersFromFilesStructure: true
@@ -1,10 +0,0 @@
-# Grafana Dashboards
-
-## Homelab
-
-1. https://grafana.com/grafana/dashboards/1860-node-exporter-full/
-2. https://grafana.com/grafana/dashboards/9578-alertmanager/
-
-## Kubernetes
-
-1. https://github.com/dotdc/grafana-dashboards-kubernetes/
@@ -1,21 +0,0 @@
-# https://grafana.com/docs/grafana/latest/datasources/prometheus/
-apiVersion: 1
-
-datasources:
-  - name: prometheus-homelab
-    type: prometheus
-    access: proxy
-    # Access mode - proxy (server in the UI) or direct (browser in the UI).
-    url: http://localhost:9090
-    jsonData:
-      httpMethod: POST
-      manageAlerts: true
-      prometheusType: Prometheus
-      prometheusVersion: 2.49.0
-      cacheLevel: "High"
-      disableRecordingRules: false
-      # As of Grafana 10, the Prometheus data source can be configured to query live dashboards
-      # incrementally, instead of re-querying the entire duration on each dashboard refresh.
-      # Increasing the duration of the incrementalQueryOverlapWindow will increase the size of every incremental query,
-      # but might be helpful for instances that have inconsistent results for recent data.
-      incrementalQueryOverlapWindow: 10m
@@ -1,52 +0,0 @@
-{
-  config,
-  myvars,
-  ...
-}: {
-  # Grafana instance bound to 127.0.0.1:3351 (plain HTTP); data sources and
-  # dashboards are provisioned declaratively from the files next to this module.
-  services.grafana = {
-    enable = true;
-    dataDir = "/var/lib/grafana";
-    # DeclarativePlugins = with pkgs.grafanaPlugins; [ grafana-piechart-panel ];
-    settings = {
-      server = {
-        http_addr = "127.0.0.1";
-        http_port = 3351;
-        protocol = "http";
-        # Public domain name of this instance; substituted into root_url below.
-        domain = "grafana.writefor.fun";
-        # Whether to serve Grafana from the sub path contained in root_url.
-        serve_from_sub_path = false;
-        # Full public URL, assembled from the protocol, domain and http_port values above.
-        # Add the subpath here if serve_from_sub_path is true.
-        # NOTE(review): this expands to a URL on port 3351, not the reverse proxy's
-        # port - confirm this is the URL users actually reach Grafana on.
-        root_url = "%(protocol)s://%(domain)s:%(http_port)s/";
-        # Redirect to correct domain if the host header does not match the domain.
-        # Prevents DNS rebinding attacks. Disabled here.
-        enforce_domain = false;
-        read_timeout = "180s";
-        # Enable HTTP compression, this can improve transfer speed and bandwidth utilization.
-        enable_gzip = true;
-        # Cdn for accelerating loading of frontend assets.
-        # cdn_url = "https://cdn.jsdelivr.net/npm/grafana@7.5.5";
-      };
-
-      security = {
-        admin_user = myvars.username;
-        admin_email = myvars.useremail;
-        # Use file provider to read the admin password from a file.
-        # https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/#file-provider
-        admin_password = "$__file{${config.age.secrets."grafana-admin-password".path}}";
-      };
-      users = {
-        allow_sign_up = false;
-        # home_page = "";
-        default_theme = "dark";
-      };
-    };
-
-    # Declaratively provision Grafana's data sources and dashboards.
-    # Grafana's own alerting rules are not used; Prometheus Alertmanager is used instead.
-    # https://grafana.com/docs/grafana/latest/administration/provisioning/#data-sources
-    provision = {
-      datasources.path = ./datasources.yml;
-      dashboards.path = ./dashboards.yml;
-    };
-  };
-
-  # Dashboard files, installed to /etc/grafana/dashboards.
-  environment.etc."grafana/dashboards".source = ./dashboards;
-}
@@ -1 +0,0 @@
-# Homepage for my Homelab
@@ -1,8 +0,0 @@
---
- About Me:
-    - Blog:
-        - abbr: Blog
-          href: https://thiscute.world/
-    - Github:
-        - abbr: GH
-          href: https://github.com/ryan4yin
@@ -1,3 +0,0 @@
-# kana-docker:
-#   socket: /var/run/docker.sock
-#
@@ -1,6 +0,0 @@
-# https://gethomepage.dev/latest/configs/kubernetes/
-
-# uses the default kubeconfig to access the cluster
-# read kubeconfig from $KUBECONFIG or $HOME/.kube/config
-# mode: default
-mode: disabled
@@ -1,75 +0,0 @@
---
-# For configuration options and examples, please see:
-# https://gethomepage.dev/latest/configs/services
-
- KubeVirt 虚拟化集群:
-    - KubeVirt-Shoryu:
-        icon: si-kubevirt
-        description: "CPU: R7-5825U / MEM: 64G / DISK: 1T"
-        href: http://grafana.writefor.fun/d/rYdddlPWk/node-exporter-full?orgId=1&var-node=192.168.5.181:9100
-        siteMonitor: http://grafana.writefor.fun/d/rYdddlPWk/node-exporter-full?orgId=1&var-node=192.168.5.181:9100
-    - KubeVirt-Shushou:
-        icon: si-kubevirt
-        description: "CPU: R9-5900HX / MEM: 64G / DISK: 1T"
-        href: http://grafana.writefor.fun/d/rYdddlPWk/node-exporter-full?orgId=1&var-node=192.168.5.182:9100
-        siteMonitor: http://grafana.writefor.fun/d/rYdddlPWk/node-exporter-full?orgId=1&var-node=192.168.5.182:9100
-    - KubeVirt-Youko:
-        icon: si-kubevirt
-        description: "CPU: R5-5625U / MEM: 32G / DISK: 512G+4T*2"
-        href: http://grafana.writefor.fun/d/rYdddlPWk/node-exporter-full?orgId=1&var-node=192.168.5.183:9100
-        siteMonitor: http://grafana.writefor.fun/d/rYdddlPWk/node-exporter-full?orgId=1&var-node=192.168.5.183:9100
-    - LongHorn-Storage:
-        icon: si-longhorn
-        href: http://longhorn.writefor.fun/
-    - Victoria-Metrics:
-        icon: si-victoriametrics
-        href: http://vm.writefor.fun/
-    - KubeVirt-Grafana:
-        icon: si-grafana
-        href: http://k8s-grafana.writefor.fun/
-
- Homelab Monitoring:
-    - Grafana:
-        icon: si-grafana
-        href: http://grafana.writefor.fun
-        description: Data visualised on dashboards
-        siteMonitor: http://grafana.writefor.fun
-    - Prometheus Dashboard:
-        icon: si-prometheus
-        href: http://prometheus.writefor.fun
-        description: Monitoring - Prometheus
-        siteMonitor: http://prometheus.writefor.fun
-    - Uptime Kuma:
-        icon: si-uptimekuma
-        href: http://uptime-kuma.writefor.fun
-        description: Uptime Checking
-        siteMonitor: http://uptime-kuma.writefor.fun
-
- Homelab Applications:
-    # Service properties are nested one level below the service name,
-    # matching the other groups in this file.
-    - SFTPGO:
-        icon: sftpgo.png
-        href: "http://sftpgo.writefor.fun/web/admin/folders"
-        description: WebDAV & SFTP server
-        siteMonitor: http://sftpgo.writefor.fun/
-# - Kubernetes Monitoring:
-#     # TODO: Update this
-#     - Emby:
-#       icon: emby.png
-#       href: "http://emby.home/"
-#       description: Media server
-#       namespace: media # The kubernetes namespace the app resides in
-#       app: emby # The name of the deployed app
-#
-#     - Element Chat:
-#         icon: matrix-light.png
-#         href: https://chat.example.com
-#         description: Matrix Synapse Powered Chat
-#         app: matrix-element
-#         namespace: comms
-#         pod-selector: >-
-#           app.kubernetes.io/instance in (
-#               matrix-element,
-#               matrix-media-repo,
-#               matrix-media-repo-postgresql,
-#               matrix-synapse
-#           )
@@ -1,82 +0,0 @@
---
-# For configuration options and examples, please see:
-# https://gethomepage.dev/latest/configs/settings
-
-title: Ryan Yin's Homelab
-base: https://home.writefor.fun/
-favicon: https://thiscute.world/favicon.ico
-
-# https://developer.mozilla.org/en-US/docs/Web/Manifest/start_url
-# Used by some browsers to determine the start page of the web application
-startUrl: https://home.writefor.fun/
-
-language: zh
-
-# Define shared API provider options and secrets here.
-# You can then pass provider instead of apiKey in your widget configuration.
-providers:
-  # API keys are read from environment variables. The placeholder must be written
-  # without any spaces inside the double braces, otherwise it is not substituted.
-  # NOTE(review): openweathermap reuses the WeatherAPI key variable - confirm this is intended.
-  openweathermap: {{HOMEPAGE_VAR_WEATHERAPI_APIKEY}}
-  weatherapi: {{HOMEPAGE_VAR_WEATHERAPI_APIKEY}}
-
-background:
-  image: https://file.writefor.fun/wallpapers/rolling-girls.png
-  blur: sm # sm, "", md, xl... see https://tailwindcss.com/docs/backdrop-blur
-  saturate: 90 # 0, 50, 100... see https://tailwindcss.com/docs/backdrop-saturate
-  brightness: 90 # 0, 50, 75... see https://tailwindcss.com/docs/backdrop-brightness
-  opacity: 85 # 0-100
-
-theme: dark # or light
-
-# Supported colors are:
-# slate, gray, zinc, neutral, stone, amber,
-# yellow, lime, green, emerald, teal, cyan,
-# sky, blue, indigo, violet, purple, fuchsia, pink, rose, red, white
-color: indigo
-
-# make all cards in a row the same height.
-useEqualHeights: true
-
-# Groups and its layout
-# Groups Name should match the name defined in your services.yaml or widgets.yaml
-layout:
-  KubeVirt 虚拟化集群:
-    icon: si-kubevirt
-    tab: First
-
-  Group A:
-    initiallyCollapsed: true # collapsed by default
-    tab: First
-    style: row
-    columns: 4
-
-  Second Service Group:
-    useEqualHeights: true # overrides global setting
-    tab: Second
-    columns: 4
-
-  Third Service Group:
-    tab: Third
-    style: row
-
-  Bookmark Group on Fourth Tab:
-    tab: Fourth
-
-  Service Group on every Tab:
-    style: row
-    columns: 4
-
-# https://gethomepage.dev/latest/configs/services/#icons
-# iconStyle: theme # optional, defaults to gradient
-
-# Typing in homepage to quick search
-quicklaunch:
-  searchDescriptions: true
-  hideInternetSearch: true
-  showSearchSuggestions: true
-  hideVisitURL: true
-
-# Show docker stats
-showStats: true
-
-hideErrors: false
@@ -1,21 +0,0 @@
-# TODO: add access to kubernetes cluster
-# - kubernetes:
-#     cluster:
-#       show: true
-#       cpu: true
-#       memory: true
-#       showLabel: true
-#       label: "cluster"
-#     nodes:
-#       show: true
-#       cpu: true
-#       memory: true
-#       showLabel: true
-# - resources:
-#     backend: resources
-#     expanded: true
-#     cpu: true
-#     memory: true
- search:
-    provider: google
-    target: _blank
@@ -1,25 +0,0 @@
-{pkgs, ...}: let
-  configDir = "/var/lib/homepage-dashboard";
-in {
-  # https://github.com/NixOS/nixpkgs/blob/nixos-unstable/nixos/modules/services/misc/homepage-dashboard.nix
-  services.homepage-dashboard = {
-    enable = true;
-    listenPort = 4401;
-    openFirewall = false;
-  };
-  systemd.services.homepage-dashboard.environment = {
-    HOMEPAGE_CONFIG_DIR = configDir;
-
-    # 1. The value of env var HOMEPAGE_VAR_XXX will replace {{HOMEPAGE_VAR_XXX}} in any config
-    # HOMEPAGE_VAR_XXX_APIKEY = "myapikey";
-    # 2. The value of env var HOMEPAGE_FILE_XXX must be a file path,
-    # the contents of which will be used to replace {{HOMEPAGE_FILE_XXX}} in any config
-  };
-  # Install the homepage-dashboard configuration files
-  system.activationScripts.installHomepageDashboardConfig = ''
-    mkdir -p ${configDir}
-    ${pkgs.rsync}/bin/rsync -avz --chmod=D2755,F600 ${./config}/ ${configDir}/
-
-    ${pkgs.systemdMinimal}/bin/systemctl restart homepage-dashboard
-  '';
-}
@@ -1,6 +0,0 @@
-# Prometheus & Alertmanager
-
-## Alert Rules
-
- [awesome-prometheus-alerts](https://github.com/samber/awesome-prometheus-alerts): Collection of
-  Prometheus alerting rules
@@ -1,13 +0,0 @@
-groups:
-  - name: EmbeddedExporter
-
-    rules:
-      - alert: CorednsPanicCount
-        expr: "increase(coredns_panics_total[1m]) > 0"
-        for: 0m
-        labels:
-          severity: critical
-        annotations:
-          summary: CoreDNS Panic Count (instance {{ $labels.instance }})
-          description:
-            "Number of CoreDNS panics encountered\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
@@ -1,162 +0,0 @@
-groups:
-  - name: EmbeddedExporter
-
-    rules:
-      - alert: EtcdInsufficientMembers
-        expr: "count(etcd_server_id) % 2 == 0"
-        for: 0m
-        labels:
-          severity: critical
-        annotations:
-          summary: Etcd insufficient Members (instance {{ $labels.instance }})
-          description:
-            "Etcd cluster should have an odd number of members\n  VALUE = {{ $value }}\n  LABELS =
-            {{ $labels }}"
-
-      - alert: EtcdNoLeader
-        expr: "etcd_server_has_leader == 0"
-        for: 0m
-        labels:
-          severity: critical
-        annotations:
-          summary: Etcd no Leader (instance {{ $labels.instance }})
-          description:
-            "Etcd cluster have no leader\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: EtcdHighNumberOfLeaderChanges
-        expr: "increase(etcd_server_leader_changes_seen_total[10m]) > 2"
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary: Etcd high number of leader changes (instance {{ $labels.instance }})
-          description:
-            "Etcd leader changed more than 2 times during 10 minutes\n  VALUE = {{ $value
-            }}\n  LABELS = {{ $labels }}"
-
-      - alert: EtcdHighNumberOfFailedGrpcRequests
-        expr:
-          'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[1m])) BY (grpc_service, grpc_method)
-          / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.01'
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Etcd high number of failed GRPC requests (instance {{ $labels.instance }})
-          description:
-            "More than 1% GRPC request failure detected in Etcd\n  VALUE = {{ $value }}\n  LABELS =
-            {{ $labels }}"
-
-      - alert: EtcdHighNumberOfFailedGrpcRequests
-        expr:
-          'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[1m])) BY (grpc_service, grpc_method)
-          / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.05'
-        for: 2m
-        labels:
-          severity: critical
-        annotations:
-          summary: Etcd high number of failed GRPC requests (instance {{ $labels.instance }})
-          description:
-            "More than 5% GRPC request failure detected in Etcd\n  VALUE = {{ $value }}\n  LABELS =
-            {{ $labels }}"
-
-      - alert: EtcdGrpcRequestsSlow
-        expr:
-          'histogram_quantile(0.99,
-          sum(rate(grpc_server_handling_seconds_bucket{grpc_type="unary"}[1m])) by (grpc_service,
-          grpc_method, le)) > 0.15'
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Etcd GRPC requests slow (instance {{ $labels.instance }})
-          description:
-            "GRPC requests slowing down, 99th percentile is over 0.15s\n  VALUE = {{ $value
-            }}\n  LABELS = {{ $labels }}"
-
-      - alert: EtcdHighNumberOfFailedHttpRequests
-        expr:
-          "sum(rate(etcd_http_failed_total[1m])) BY (method) /
-          sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01"
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Etcd high number of failed HTTP requests (instance {{ $labels.instance }})
-          description:
-            "More than 1% HTTP failure detected in Etcd\n  VALUE = {{ $value }}\n  LABELS = {{
-            $labels }}"
-
-      - alert: EtcdHighNumberOfFailedHttpRequests
-        expr:
-          "sum(rate(etcd_http_failed_total[1m])) BY (method) /
-          sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05"
-        for: 2m
-        labels:
-          severity: critical
-        annotations:
-          summary: Etcd high number of failed HTTP requests (instance {{ $labels.instance }})
-          description:
-            "More than 5% HTTP failure detected in Etcd\n  VALUE = {{ $value }}\n  LABELS = {{
-            $labels }}"
-
-      - alert: EtcdHttpRequestsSlow
-        expr:
-          "histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[1m])) > 0.15"
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Etcd HTTP requests slow (instance {{ $labels.instance }})
-          description:
-            "HTTP requests slowing down, 99th percentile is over 0.15s\n  VALUE = {{ $value
-            }}\n  LABELS = {{ $labels }}"
-
-      - alert: EtcdMemberCommunicationSlow
-        expr:
-          "histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) >
-          0.15"
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Etcd member communication slow (instance {{ $labels.instance }})
-          description:
-            "Etcd member communication slowing down, 99th percentile is over 0.15s\n  VALUE = {{
-            $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: EtcdHighNumberOfFailedProposals
-        expr: "increase(etcd_server_proposals_failed_total[1h]) > 5"
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Etcd high number of failed proposals (instance {{ $labels.instance }})
-          description:
-            "Etcd server got more than 5 failed proposals past hour\n  VALUE = {{ $value
-            }}\n  LABELS = {{ $labels }}"
-
-      - alert: EtcdHighFsyncDurations
-        expr:
-          "histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5"
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Etcd high fsync durations (instance {{ $labels.instance }})
-          description:
-            "Etcd WAL fsync duration increasing, 99th percentile is over 0.5s\n  VALUE = {{ $value
-            }}\n  LABELS = {{ $labels }}"
-
-      - alert: EtcdHighCommitDurations
-        expr:
-          "histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) >
-          0.25"
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Etcd high commit durations (instance {{ $labels.instance }})
-          description:
-            "Etcd commit duration increasing, 99th percentile is over 0.25s\n  VALUE = {{ $value
-            }}\n  LABELS = {{ $labels }}"
@@ -1,123 +0,0 @@
-groups:
-  - name: EmbeddedExporter
-
-    rules:
-      - alert: IstioKubernetesGatewayAvailabilityDrop
-        expr:
-          'min(kube_deployment_status_replicas_available{deployment="istio-ingressgateway",
-          namespace="istio-system"}) without (instance, pod) < 2'
-        for: 1m
-        labels:
-          severity: warning
-        annotations:
-          summary: Istio Kubernetes gateway availability drop (instance {{ $labels.instance }})
-          description:
-            "Gateway pods have dropped. Inbound traffic will likely be affected.\n  VALUE = {{
-            $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: IstioPilotHighTotalRequestRate
-        expr: "sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5"
-        for: 1m
-        labels:
-          severity: warning
-        annotations:
-          summary: Istio Pilot high total request rate (instance {{ $labels.instance }})
-          description:
-            "Number of Istio Pilot push errors is too high (> 5%). Envoy sidecars might have
-            outdated configuration.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: IstioMixerPrometheusDispatchesLow
-        expr: 'sum(rate(mixer_runtime_dispatches_total{adapter=~"prometheus"}[1m])) < 180'
-        for: 1m
-        labels:
-          severity: warning
-        annotations:
-          summary: Istio Mixer Prometheus dispatches low (instance {{ $labels.instance }})
-          description:
-            "Number of Mixer dispatches to Prometheus is too low. Istio metrics might not be being
-            exported properly.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: IstioHighTotalRequestRate
-        expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) > 1000'
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Istio high total request rate (instance {{ $labels.instance }})
-          description:
-            "Global request rate in the service mesh is unusually high.\n  VALUE = {{ $value
-            }}\n  LABELS = {{ $labels }}"
-
-      - alert: IstioLowTotalRequestRate
-        expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) < 100'
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Istio low total request rate (instance {{ $labels.instance }})
-          description:
-            "Global request rate in the service mesh is unusually low.\n  VALUE = {{ $value
-            }}\n  LABELS = {{ $labels }}"
-
-      # Fires when more than 5% of destination-reported requests over the last 5 minutes
-      # returned a 4xx response code, sustained for 1 minute.
-      - alert: IstioHigh4xxErrorRate
-        expr:
-          'sum(rate(istio_requests_total{reporter="destination", response_code=~"4.*"}[5m])) /
-          sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5'
-        for: 1m
-        labels:
-          severity: warning
-        annotations:
-          summary: Istio high 4xx error rate (instance {{ $labels.instance }})
-          description:
-            "High percentage of HTTP 4xx responses in Istio (> 5%).\n  VALUE = {{ $value
-            }}\n  LABELS = {{ $labels }}"
-
-      - alert: IstioHigh5xxErrorRate
-        expr:
-          'sum(rate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) /
-          sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5'
-        for: 1m
-        labels:
-          severity: warning
-        annotations:
-          summary: Istio high 5xx error rate (instance {{ $labels.instance }})
-          description:
-            "High percentage of HTTP 5xx responses in Istio (> 5%).\n  VALUE = {{ $value
-            }}\n  LABELS = {{ $labels }}"
-
-      - alert: IstioHighRequestLatency
-        expr:
-          'rate(istio_request_duration_milliseconds_sum{reporter="destination"}[1m]) /
-          rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 100'
-        for: 1m
-        labels:
-          severity: warning
-        annotations:
-          summary: Istio high request latency (instance {{ $labels.instance }})
-          description:
-            "Istio average requests execution is longer than 100ms.\n  VALUE = {{ $value
-            }}\n  LABELS = {{ $labels }}"
-
-      - alert: IstioLatency99Percentile
-        expr:
-          "histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by
-          (destination_canonical_service, destination_workload_namespace, source_canonical_service,
-          source_workload_namespace, le)) > 1000"
-        for: 1m
-        labels:
-          severity: warning
-        annotations:
-          summary: Istio latency 99 percentile (instance {{ $labels.instance }})
-          description:
-            "Istio 1% slowest requests are longer than 1000ms.\n  VALUE = {{ $value }}\n  LABELS =
-            {{ $labels }}"
-
-      - alert: IstioPilotDuplicateEntry
-        expr: "sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0"
-        for: 0m
-        labels:
-          severity: critical
-        annotations:
-          summary: Istio Pilot Duplicate Entry (instance {{ $labels.instance }})
-          description:
-            "Istio pilot duplicate entry error.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
@@ -1,435 +0,0 @@
-groups:
-  - name: KubestateExporter
-
-    rules:
-      - alert: KubernetesNodeNotReady
-        expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
-        for: 10m
-        labels:
-          severity: critical
-        annotations:
-          summary: Kubernetes Node not ready (node {{ $labels.node }})
-          description:
-            "Node {{ $labels.node }} has been unready for a long time\n  VALUE = {{ $value
-            }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesNodeMemoryPressure
-        expr: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1'
-        for: 2m
-        labels:
-          severity: critical
-        annotations:
-          summary: Kubernetes memory pressure (node {{ $labels.node }})
-          description:
-            "Node {{ $labels.node }} has MemoryPressure condition\n  VALUE = {{ $value }}\n  LABELS
-            = {{ $labels }}"
-
-      - alert: KubernetesNodeDiskPressure
-        expr: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1'
-        for: 2m
-        labels:
-          severity: critical
-        annotations:
-          summary: Kubernetes disk pressure (node {{ $labels.node }})
-          description:
-            "Node {{ $labels.node }} has DiskPressure condition\n  VALUE = {{ $value }}\n  LABELS =
-            {{ $labels }}"
-
-      - alert: KubernetesNodeNetworkUnavailable
-        expr: 'kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1'
-        for: 2m
-        labels:
-          severity: critical
-        annotations:
-          summary: Kubernetes Node network unavailable (instance {{ $labels.instance }})
-          description:
-            "Node {{ $labels.node }} has NetworkUnavailable condition\n  VALUE = {{ $value
-            }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesNodeOutOfPodCapacity
-        expr:
-          'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node)
-          (0 * kube_pod_info{pod_template_hash=""})) / sum by (node)
-          (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Kubernetes Node out of pod capacity (instance {{ $labels.instance }})
-          description:
-            "Node {{ $labels.node }} is out of pod capacity\n  VALUE = {{ $value }}\n  LABELS = {{
-            $labels }}"
-
-      - alert: KubernetesContainerOomKiller
-        expr:
-          '(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total
-          offset 10m >= 1) and ignoring (reason)
-          min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m])
-          == 1'
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary:
-            Kubernetes container oom killer ({{ $labels.namespace }}/{{ $labels.pod }}:{{
-            $labels.container }})
-          description:
-            "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has
-            been OOMKilled {{ $value }} times in the last 10 minutes.\n  VALUE = {{ $value
-            }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesJobFailed
-        expr: "kube_job_status_failed > 0"
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary: Kubernetes Job failed ({{ $labels.namespace }}/{{ $labels.job_name }})
-          description:
-            "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete\n  VALUE = {{
-            $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesCronjobSuspended
-        expr: "kube_cronjob_spec_suspend != 0"
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary: Kubernetes CronJob suspended ({{ $labels.namespace }}/{{ $labels.cronjob }})
-          description:
-            "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n  VALUE = {{ $value
-            }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesPersistentvolumeclaimPending
-        expr: 'kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1'
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary:
-            Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{
-            $labels.persistentvolumeclaim }})
-          description:
-            "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is
-            pending\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesVolumeOutOfDiskSpace
-        expr:
-          "kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10"
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Kubernetes Volume out of disk space (instance {{ $labels.instance }})
-          description:
-            "Volume is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesVolumeFullInFourDays
-        expr: "predict_linear(kubelet_volume_stats_available_bytes[6h:5m], 4 * 24 * 3600) < 0"
-        for: 0m
-        labels:
-          severity: critical
-        annotations:
-          summary: Kubernetes Volume full in four days (instance {{ $labels.instance }})
-          description:
-            "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to
-            fill up within four days. Predicted available bytes in four days: {{ $value | humanize
-            }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesPersistentvolumeError
-        expr:
-          'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0'
-        for: 0m
-        labels:
-          severity: critical
-        annotations:
-          summary:
-            Kubernetes PersistentVolume error (persistent volume {{ $labels.persistentvolume
-            }})
-          description:
-            "Persistent volume {{ $labels.persistentvolume }} is in bad state\n  VALUE = {{ $value
-            }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesStatefulsetDown
-        expr: "kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0"
-        for: 1m
-        labels:
-          severity: critical
-        annotations:
-          summary: Kubernetes StatefulSet down ({{ $labels.namespace }}/{{ $labels.statefulset }})
-          description:
-            "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down\n  VALUE = {{
-            $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesHpaScaleInability
-        expr:
-          'kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} ==
-          1'
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Kubernetes HPA scale inability (instance {{ $labels.instance }})
-          description:
-            "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to
-            scale\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesHpaMetricsUnavailability
-        expr:
-          'kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"}
-          == 1'
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary: Kubernetes HPA metrics unavailability (instance {{ $labels.instance }})
-          description:
-            "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect
-            metrics\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesHpaScaleMaximum
-        expr:
-          "kube_horizontalpodautoscaler_status_desired_replicas >=
-          kube_horizontalpodautoscaler_spec_max_replicas"
-        for: 2m
-        labels:
-          severity: info
-        annotations:
-          summary: Kubernetes HPA scale maximum (instance {{ $labels.instance }})
-          description:
-            "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum
-            number of desired pods\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesHpaUnderutilized
-        expr:
-          "max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) ==
-          kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler) > 3"
-        for: 0m
-        labels:
-          severity: info
-        annotations:
-          summary: Kubernetes HPA underutilized (instance {{ $labels.instance }})
-          description:
-            "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is constantly at
-            minimum replicas for 50% of the time. Potential cost saving here.\n  VALUE = {{ $value
-            }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesPodNotHealthy
-        expr: 'sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0'
-        for: 15m
-        labels:
-          severity: critical
-        annotations:
-          summary: Kubernetes Pod not healthy ({{ $labels.namespace }}/{{ $labels.pod }})
-          description:
-            "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for
-            longer than 15 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesPodCrashLooping
-        expr: "increase(kube_pod_container_status_restarts_total[1m]) > 3"
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Kubernetes pod crash looping ({{ $labels.namespace }}/{{ $labels.pod }})
-          description:
-            "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping\n  VALUE = {{ $value
-            }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesReplicasetReplicasMismatch
-        expr: "kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas"
-        for: 10m
-        labels:
-          severity: warning
-        annotations:
-          summary:
-            Kubernetes ReplicaSet mismatch ({{ $labels.namespace }}/{{ $labels.replicaset }})
-          description:
-            "ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch\n  VALUE
-            = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesDeploymentReplicasMismatch
-        expr: "kube_deployment_spec_replicas != kube_deployment_status_replicas_available"
-        for: 10m
-        labels:
-          severity: warning
-        annotations:
-          summary:
-            Kubernetes Deployment replicas mismatch ({{ $labels.namespace }}/{{ $labels.deployment
-            }})
-          description:
-            "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch\n  VALUE
-            = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesStatefulsetReplicasMismatch
-        expr: "kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas"
-        for: 10m
-        labels:
-          severity: warning
-        annotations:
-          summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})
-          description:
-            "StatefulSet does not match the expected number of replicas.\n  VALUE = {{ $value
-            }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesDeploymentGenerationMismatch
-        expr: "kube_deployment_status_observed_generation != kube_deployment_metadata_generation"
-        for: 10m
-        labels:
-          severity: critical
-        annotations:
-          summary:
-            Kubernetes Deployment generation mismatch ({{ $labels.namespace }}/{{ $labels.deployment
-            }})
-          description:
-            "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been
-            rolled back.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesStatefulsetGenerationMismatch
-        expr: "kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation"
-        for: 10m
-        labels:
-          severity: critical
-        annotations:
-          summary:
-            Kubernetes StatefulSet generation mismatch ({{ $labels.namespace }}/{{
-            $labels.statefulset }})
-          description:
-            "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not
-            been rolled back.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesStatefulsetUpdateNotRolledOut
-        expr:
-          "max without (revision) (kube_statefulset_status_current_revision unless
-          kube_statefulset_status_update_revision) * (kube_statefulset_replicas !=
-          kube_statefulset_status_replicas_updated)"
-        for: 10m
-        labels:
-          severity: warning
-        annotations:
-          summary:
-            Kubernetes StatefulSet update not rolled out ({{ $labels.namespace }}/{{
-            $labels.statefulset }})
-          description:
-            "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been
-            rolled out.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesDaemonsetRolloutStuck
-        expr:
-          "kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100
-          < 100 or kube_daemonset_status_desired_number_scheduled -
-          kube_daemonset_status_current_number_scheduled > 0"
-        for: 10m
-        labels:
-          severity: warning
-        annotations:
-          summary:
-            Kubernetes DaemonSet rollout stuck ({{ $labels.namespace }}/{{ $labels.daemonset }})
-          description:
-            "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not
-            scheduled or not ready\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesDaemonsetMisscheduled
-        expr: "kube_daemonset_status_number_misscheduled > 0"
-        for: 1m
-        labels:
-          severity: critical
-        annotations:
-          summary:
-            Kubernetes DaemonSet misscheduled ({{ $labels.namespace }}/{{ $labels.daemonset }})
-          description:
-            "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running
-            where they are not supposed to run\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesCronjobTooLong
-        expr: "time() - kube_cronjob_next_schedule_time > 3600"
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary: Kubernetes CronJob too long ({{ $labels.namespace }}/{{ $labels.cronjob }})
-          description:
-            "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to
-            complete.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesJobSlowCompletion
-        expr: "kube_job_spec_completions - kube_job_status_succeeded - kube_job_status_failed > 0"
-        for: 12h
-        labels:
-          severity: critical
-        annotations:
-          summary: Kubernetes job slow completion ({{ $labels.namespace }}/{{ $labels.job_name }})
-          description:
-            "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in
-            time.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesApiServerErrors
-        expr:
-          'sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) /
-          sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3'
-        for: 2m
-        labels:
-          severity: critical
-        annotations:
-          summary: Kubernetes API server errors (instance {{ $labels.instance }})
-          description:
-            "Kubernetes API server is experiencing high error rate\n  VALUE = {{ $value }}\n  LABELS
-            = {{ $labels }}"
-
-      - alert: KubernetesApiClientErrors
-        expr:
-          '(sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) /
-          sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1'
-        for: 2m
-        labels:
-          severity: critical
-        annotations:
-          summary: Kubernetes API client errors (instance {{ $labels.instance }})
-          description:
-            "Kubernetes API client is experiencing high error rate\n  VALUE = {{ $value }}\n  LABELS
-            = {{ $labels }}"
-
-      - alert: KubernetesClientCertificateExpiresNextWeek
-        expr:
-          'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and
-          histogram_quantile(0.01, sum by (job, le)
-          (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) <
-          7*24*60*60'
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }})
-          description:
-            "A client certificate used to authenticate to the apiserver is expiring next
-            week.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesClientCertificateExpiresSoon
-        expr:
-          'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and
-          histogram_quantile(0.01, sum by (job, le)
-          (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) <
-          24*60*60'
-        for: 0m
-        labels:
-          severity: critical
-        annotations:
-          summary: Kubernetes client certificate expires soon (instance {{ $labels.instance }})
-          description:
-            "A client certificate used to authenticate to the apiserver is expiring in less than
-            24 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: KubernetesApiServerLatency
-        expr:
-          'histogram_quantile(0.99,
-          sum(rate(apiserver_request_duration_seconds_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}
-          [10m])) WITHOUT (instance, resource)) > 1'
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Kubernetes API server latency (instance {{ $labels.instance }})
-          description:
-            "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{
-            $labels.verb }} {{ $labels.resource }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels
-            }}"
@@ -1,508 +0,0 @@
-groups:
-  - name: NodeExporter
-
-    rules:
-      - alert: HostOutOfMemory
-        expr:
-          '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance)
-          group_left (nodename) node_uname_info{nodename=~".+"}'
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host out of memory (instance {{ $labels.instance }})
-          description:
-            "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels
-            }}"
-
-      - alert: HostMemoryUnderMemoryPressure
-        expr:
-          '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename)
-          node_uname_info{nodename=~".+"}'
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host memory under memory pressure (instance {{ $labels.instance }})
-          description:
-            "The node is under heavy memory pressure. High rate of major page faults\n  VALUE = {{
-            $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostMemoryIsUnderutilized
-        expr:
-          '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes *
-          100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-        for: 1w
-        labels:
-          severity: info
-        annotations:
-          summary: Host Memory is underutilized (instance {{ $labels.instance }})
-          description:
-            "Node memory usage has been < 20% for 1 week. Consider reducing memory space. (instance
-            {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostUnusualNetworkThroughputIn
-        expr:
-          '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) *
-          on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-        for: 5m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host unusual network throughput in (instance {{ $labels.instance }})
-          description:
-            "Host network interfaces are probably receiving too much data (> 100 MB/s)\n  VALUE = {{
-            $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostUnusualNetworkThroughputOut
-        expr:
-          '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) *
-          on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-        for: 5m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host unusual network throughput out (instance {{ $labels.instance }})
-          description:
-            "Host network interfaces are probably sending too much data (> 100 MB/s)\n  VALUE = {{
-            $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostUnusualDiskReadRate
-        expr:
-          '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) *
-          on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-        for: 5m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host unusual disk read rate (instance {{ $labels.instance }})
-          description:
-            "Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}\n  LABELS =
-            {{ $labels }}"
-
-      - alert: HostUnusualDiskWriteRate
-        expr:
-          '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) *
-          on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host unusual disk write rate (instance {{ $labels.instance }})
-          description:
-            "Disk is probably writing too much data (> 50 MB/s)\n  VALUE = {{ $value }}\n  LABELS =
-            {{ $labels }}"
-
-      - alert: HostOutOfDiskSpace
-        expr:
-          '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance,
-          device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename)
-          node_uname_info{nodename=~".+"}'
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host out of disk space (instance {{ $labels.instance }})
-          description:
-            "Disk is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostDiskWillFillIn24Hours
-        expr:
-          '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance,
-          device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 *
-          3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) *
-          on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
-          description:
-            "Filesystem is predicted to run out of space within the next 24 hours at current write
-            rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostOutOfInodes
-        expr:
-          '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"}
-          * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) *
-          on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host out of inodes (instance {{ $labels.instance }})
-          description:
-            "Disk is almost running out of available inodes (< 10% left)\n  VALUE = {{ $value
-            }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostFilesystemDeviceError
-        expr: "node_filesystem_device_error == 1"
-        for: 0m
-        labels:
-          severity: critical
-        annotations:
-          summary: Host filesystem device error (instance {{ $labels.instance }})
-          description:
-            "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }}
-            filesystem\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostInodesWillFillIn24Hours
-        expr:
-          '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"}
-          * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 *
-          3600) < 0 and ON (instance, device, mountpoint)
-          node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename)
-          node_uname_info{nodename=~".+"}'
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
-          description:
-            "Filesystem is predicted to run out of inodes within the next 24 hours at current write
-            rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostUnusualDiskReadLatency
-        expr:
-          '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m])
-          > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left
-          (nodename) node_uname_info{nodename=~".+"}'
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host unusual disk read latency (instance {{ $labels.instance }})
-          description:
-            "Disk latency is growing (read operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS =
-            {{ $labels }}"
-
-      - alert: HostUnusualDiskWriteLatency
-        expr:
-          '(rate(node_disk_write_time_seconds_total[1m]) /
-          rate(node_disk_writes_completed_total[1m]) > 0.1 and
-          rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename)
-          node_uname_info{nodename=~".+"}'
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host unusual disk write latency (instance {{ $labels.instance }})
-          description:
-            "Disk latency is growing (write operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS =
-            {{ $labels }}"
-
-      - alert: HostHighCpuLoad
-        expr:
-          '(sum by (instance) (avg by (mode, instance)
-          (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left
-          (nodename) node_uname_info{nodename=~".+"}'
-        for: 10m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host high CPU load (instance {{ $labels.instance }})
-          description: "CPU load is > 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostCpuIsUnderutilized
-        expr:
-          '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance)
-          group_left (nodename) node_uname_info{nodename=~".+"}'
-        for: 1w
-        labels:
-          severity: info
-        annotations:
-          summary: Host CPU is underutilized (instance {{ $labels.instance }})
-          description:
-            "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n  VALUE = {{
-            $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostCpuStealNoisyNeighbor
-        expr:
-          '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) *
-          on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
-          description:
-            "CPU steal is > 10%. A noisy neighbor is degrading VM performance or a spot instance may
-            be out of credit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostCpuHighIowait
-        expr:
-          '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) *
-          on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host CPU high iowait (instance {{ $labels.instance }})
-          description:
-            "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n  VALUE = {{
-            $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostUnusualDiskIo
-        expr:
-          '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename)
-          node_uname_info{nodename=~".+"}'
-        for: 5m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host unusual disk IO (instance {{ $labels.instance }})
-          description:
-            "Time spent in IO is too high on {{ $labels.instance }}. Check storage for
-            issues.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostContextSwitching
-        expr:
-          '((rate(node_context_switches_total[5m])) / (count without(cpu, mode)
-          (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename)
-          node_uname_info{nodename=~".+"}'
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host context switching (instance {{ $labels.instance }})
-          description:
-            "Context switching is growing on the node (> 10000 / CPU / s)\n  VALUE = {{ $value
-            }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostSwapIsFillingUp
-        expr:
-          '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) *
-          on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host swap is filling up (instance {{ $labels.instance }})
-          description: "Swap is filling up (>80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostSystemdServiceCrashed
-        expr:
-          '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename)
-          node_uname_info{nodename=~".+"}'
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host systemd service crashed (instance {{ $labels.instance }})
-          description: "systemd service crashed\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostPhysicalComponentTooHot
-        expr:
-          '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor)
-          node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename)
-          node_uname_info{nodename=~".+"}'
-        for: 5m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host physical component too hot (instance {{ $labels.instance }})
-          description:
-            "Physical hardware component too hot\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostNodeOvertemperatureAlarm
-        expr:
-          '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename)
-          node_uname_info{nodename=~".+"}'
-        for: 0m
-        labels:
-          severity: critical
-        annotations:
-          summary: Host node overtemperature alarm (instance {{ $labels.instance }})
-          description:
-            "Physical node temperature alarm triggered\n  VALUE = {{ $value }}\n  LABELS = {{
-            $labels }}"
-
-      - alert: HostRaidArrayGotInactive
-        expr:
-          '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename)
-          node_uname_info{nodename=~".+"}'
-        for: 0m
-        labels:
-          severity: critical
-        annotations:
-          summary: Host RAID array got inactive (instance {{ $labels.instance }})
-          description:
-            "RAID array {{ $labels.device }} is in a degraded state due to one or more disk
-            failures. The number of spare drives is insufficient to fix the issue
-            automatically.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostRaidDiskFailure
-        expr:
-          '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename)
-          node_uname_info{nodename=~".+"}'
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host RAID disk failure (instance {{ $labels.instance }})
-          description:
-            "At least one device in RAID array on {{ $labels.instance }} failed. Array {{
-            $labels.md_device }} needs attention and possibly a disk swap\n  VALUE = {{ $value
-            }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostKernelVersionDeviations
-        expr:
-          '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release",
-          "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename)
-          node_uname_info{nodename=~".+"}'
-        for: 6h
-        labels:
-          severity: warning
-        annotations:
-          summary: Host kernel version deviations (instance {{ $labels.instance }})
-          description:
-            "Different kernel versions are running\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostOomKillDetected
-        expr:
-          '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename)
-          node_uname_info{nodename=~".+"}'
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host OOM kill detected (instance {{ $labels.instance }})
-          description: "OOM kill detected\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostEdacCorrectableErrorsDetected
-        expr:
-          '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left
-          (nodename) node_uname_info{nodename=~".+"}'
-        for: 0m
-        labels:
-          severity: info
-        annotations:
-          summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
-          # The window named in the description must match the [1m] range used in expr.
-          description:
-            "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory
-            errors reported by EDAC in the last minute.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostEdacUncorrectableErrorsDetected
-        expr:
-          '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename)
-          node_uname_info{nodename=~".+"}'
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
-          # expr tests the raw counter, so $value is its current total, not a per-window count.
-          description:
-            "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory
-            errors reported by EDAC.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostNetworkReceiveErrors
-        expr:
-          '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m])
-          > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host Network Receive Errors (instance {{ $labels.instance }})
-          description:
-            "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf
-            \"%.0f\" $value }} receive errors in the last two minutes.\n  VALUE = {{ $value
-            }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostNetworkTransmitErrors
-        expr:
-          '(rate(node_network_transmit_errs_total[2m]) /
-          rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename)
-          node_uname_info{nodename=~".+"}'
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host Network Transmit Errors (instance {{ $labels.instance }})
-          description:
-            "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf
-            \"%.0f\" $value }} transmit errors in the last two minutes.\n  VALUE = {{ $value
-            }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostNetworkInterfaceSaturated
-        expr:
-          '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) +
-          rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) /
-          node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) *
-          on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-        for: 1m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host Network Interface Saturated (instance {{ $labels.instance }})
-          description:
-            "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting
-            overloaded.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostNetworkBondDegraded
-        expr:
-          '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename)
-          node_uname_info{nodename=~".+"}'
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host Network Bond Degraded (instance {{ $labels.instance }})
-          description:
-            "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n  VALUE = {{
-            $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostConntrackLimit
-        expr:
-          '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance)
-          group_left (nodename) node_uname_info{nodename=~".+"}'
-        for: 5m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host conntrack limit (instance {{ $labels.instance }})
-          description:
-            "The number of conntrack is approaching limit\n  VALUE = {{ $value }}\n  LABELS = {{
-            $labels }}"
-
-      - alert: HostClockSkew
-        expr:
-          '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or
-          (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) *
-          on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-        for: 10m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host clock skew (instance {{ $labels.instance }})
-          description:
-            "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this
-            host.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostClockNotSynchronising
-        expr:
-          '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) *
-          on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host clock not synchronising (instance {{ $labels.instance }})
-          description:
-            "Clock not synchronising. Ensure NTP is configured on this host.\n  VALUE = {{ $value
-            }}\n  LABELS = {{ $labels }}"
-
-      - alert: HostRequiresReboot
-        expr:
-          '(node_reboot_required > 0) * on(instance) group_left (nodename)
-          node_uname_info{nodename=~".+"}'
-        for: 4h
-        labels:
-          severity: info
-        annotations:
-          summary: Host requires reboot (instance {{ $labels.instance }})
-          description:
-            "{{ $labels.instance }} requires a reboot.\n  VALUE = {{ $value }}\n  LABELS = {{
-            $labels }}"
@@ -1,157 +0,0 @@
-{
-  config,
-  myvars,
-  ...
-}: {
-  # https://prometheus.io/docs/prometheus/latest/configuration/configuration/
-  services.prometheus = {
-    enable = true;
-    checkConfig = true;
-    listenAddress = "127.0.0.1";
-    port = 9090;
-    webExternalUrl = "http://prometheus.writefor.fun";
-
-    extraFlags = ["--storage.tsdb.retention.time=45d"];
-    # Directory below /var/lib to store Prometheus metrics data.
-    stateDir = "prometheus2";
-
-    # Reload prometheus when configuration file changes (instead of restart).
-    enableReload = true;
-    # https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_read
-    # remoteRead = [];
-
-    # Rules are read from these files.
-    # https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/
-    #
-    # Prometheus supports two types of rules which may be configured
-    # and then evaluated at regular intervals:
-    #   1. Recording rules
-    #      Recording rules allow you to precompute frequently needed or computationally
-    #      expensive expressions and save their result as a new set of time series.
-    #      Querying the precomputed result will then often be much faster than executing the original expression.
-    #      This is especially useful for dashboards, which need to query the same expression repeatedly every time they refresh.
-    #   2. Alerting rules
-    #      Alerting rules allow you to define alert conditions based on Prometheus expression language expressions
-    #      and to send notifications about firing alerts to an external service.
-    ruleFiles = [
-      ./alert_rules/node-exporter.yml
-      ./alert_rules/kubestate-exporter.yml
-      ./alert_rules/etcd_embedded-exporter.yml
-      ./alert_rules/istio_embedded-exporter.yml
-      ./alert_rules/coredns_embedded-exporter.yml
-
-      # ./recording_rules.yml
-    ];
-
-    # specifies a set of targets and parameters describing how to scrape metrics from them.
-    # https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config
-    scrapeConfigs = [
-      # --- Hosts --- #
-      {
-        job_name = "node-exporter";
-        scrape_interval = "30s";
-        metrics_path = "/metrics";
-        static_configs = [
-          {
-            # All my NixOS hosts.
-            targets =
-              map (addr: "${addr.ipv4}:9100")
-              (builtins.attrValues myvars.networking.hostsAddr);
-            labels.type = "node";
-          }
-        ];
-      }
-
-      # --- Homelab Applications --- #
-
-      {
-        job_name = "dnsmasq-exporter";
-        scrape_interval = "30s";
-        metrics_path = "/metrics";
-        static_configs = [
-          {
-            targets = ["${myvars.networking.hostsAddr.suzi.ipv4}:9153"];
-            labels.type = "app";
-            labels.app = "dnsmasq";
-          }
-        ];
-      }
-
-      {
-        job_name = "v2ray-exporter";
-        scrape_interval = "30s";
-        metrics_path = "/metrics";
-        static_configs = [
-          {
-            targets = ["${myvars.networking.hostsAddr.rakushun.ipv4}:9153"];
-            labels.type = "app";
-            labels.app = "v2ray";
-          }
-        ];
-      }
-
-      {
-        job_name = "sftpgo-embedded-exporter";
-        scrape_interval = "30s";
-        metrics_path = "/metrics";
-        static_configs = [
-          {
-            targets = ["${myvars.networking.hostsAddr.rakushun.ipv4}:10000"];
-            labels.type = "app";
-            labels.app = "sftpgo"; # label by the scraped app so these series are not filed under app="v2ray"
-          }
-        ];
-      }
-    ];
-
-    # specifies Alertmanager instances the Prometheus server sends alerts to
-    # https://prometheus.io/docs/prometheus/latest/configuration/configuration/#alertmanager_config
-    alertmanagers = [{static_configs = [{targets = ["localhost:9093"];}];}];
-  };
-
-  services.prometheus.alertmanager = {
-    enable = true;
-    listenAddress = "127.0.0.1";
-    port = 9093;
-    webExternalUrl = "http://alertmanager.writefor.fun";
-    logLevel = "info";
-
-    environmentFile = config.age.secrets."alertmanager.env".path;
-    configuration = {
-      global = {
-        # The smarthost and SMTP sender used for mail notifications.
-        smtp_smarthost = "smtp.qq.com:465";
-        smtp_from = "$SMTP_SENDER_EMAIL";
-        smtp_auth_username = "$SMTP_AUTH_USERNAME";
-        smtp_auth_password = "$SMTP_AUTH_PASSWORD";
-        # smtp.qq.com:465 supports SSL (implicit TLS) only, so we must not require STARTTLS here.
-        # https://service.mail.qq.com/detail/0/310
-        smtp_require_tls = false;
-      };
-      route = {
-        receiver = "default";
-        routes = [
-          {
-            group_by = ["instance"]; # NOTE(review): no visible scrape job or rule sets a `host` label; confirm
-            group_wait = "5m";
-            group_interval = "5m";
-            repeat_interval = "4h";
-            receiver = "default";
-          }
-        ];
-      };
-      receivers = [
-        {
-          name = "default";
-          email_configs = [
-            {
-              to = "ryan4yin@linux.com";
-              # Whether to notify about resolved alerts.
-              send_resolved = true;
-            }
-          ];
-        }
-      ];
-    };
-  };
-}
@@ -1,79 +0,0 @@
-{pkgs, ...}: let
-  passwordFile = "/etc/agenix/restic-password";
-  sshKeyPath = "/etc/agenix/ssh-key-for-restic-backup";
-  rcloneConfigFile = "/etc/agenix/rclone-conf-for-restic-backup";
-in {
-  # https://github.com/NixOS/nixpkgs/blob/nixos-24.05/nixos/modules/services/backup/restic.nix
-  services.restic.backups = {
-    homelab-backup = {
-      inherit passwordFile;
-      initialize = true; # Initialize the repository if it doesn't exist.
-      repository = "rclone:smb-downloads:/Downloads/kubevirt-backup/"; # backup to a rclone remote
-
-      # rclone related
-      # rcloneOptions = {
-      #   bwlimit = "100M";  # Limit the bandwidth used by rclone.
-      # };
-      inherit rcloneConfigFile;
-
-      # Which local paths to backup, in addition to ones specified via `dynamicFilesFrom`.
-      paths = [
-        "/tmp/restic-backup-temp"
-      ];
-      #
-      # A script that produces a list of files to back up.  The
-      # results of this command are given to the '--files-from'
-      # option. The result is merged with paths specified via `paths`.
-      # dynamicFilesFrom = "find /home/matt/git -type d -name .git";
-      #
-      # Patterns to exclude when backing up. See
-      #   https://restic.readthedocs.io/en/latest/040_backup.html#excluding-files
-      # for details on syntax.
-      exclude = [];
-
-      # Runs before the backup: rsync each KubeVirt node's /persistent tree into the staging dir listed in `paths`.
-      backupPrepareCommand = ''
-        ${pkgs.nushell}/bin/nu -c '
-          let kubevirt_nodes = [
-            "kubevirt-shoryu"
-            "kubevirt-shushou"
-            "kubevirt-youko"
-          ]
-
-          kubevirt_nodes | each {|it|
-            rsync -avz \
-            -e "ssh -i ${sshKeyPath}"  \
-            $"($it):/persistent/" $"/tmp/restic-backup-temp/($it)"
-          }
-        '
-      '';
-      # A script that must run after finishing the backup process.
-      backupCleanupCommand = "rm -rf /tmp/restic-backup-temp";
-
-      # Extra extended options to be passed to the restic --option flag.
-      # extraOptions = [];
-
-      # Extra arguments passed to restic backup.
-      # extraBackupArgs = [
-      #   "--exclude-file=/etc/restic/excludes-list"
-      # ];
-
-      # repository = "/mnt/backup-hdd"; # backup to a local directory
-      # When to run the backup. See {manpage}`systemd.timer(5)` for details.
-      timerConfig = {
-        OnCalendar = "01:30";
-        RandomizedDelaySec = "1h";
-      };
-      # A list of options (--keep-* et al.) for 'restic forget --prune',
-      # to automatically prune old snapshots.
-      # The 'forget' command is run *after* the 'backup' command, so
-      # keep that in mind when constructing the --keep-* options.
-      pruneOpts = [
-        "--keep-daily 3"
-        "--keep-weekly 3"
-        "--keep-monthly 3"
-        "--keep-yearly 3"
-      ];
-    };
-  };
-}
@@ -1,97 +0,0 @@
-{config, ...}: {
-  # Read SFTPGO_DEFAULT_ADMIN_USERNAME and SFTPGO_DEFAULT_ADMIN_PASSWORD from a file
-  systemd.services.sftpgo.serviceConfig.EnvironmentFile = config.age.secrets."sftpgo.env".path;
-
-  services.sftpgo = {
-    enable = true;
-    user = "sftpgo";
-    dataDir = "/var/lib/sftpgo";
-    extraArgs = [
-      "--log-level"
-      "info"
-    ];
-    # https://github.com/drakkan/sftpgo/blob/2.5.x/docs/full-configuration.md
-    settings = {
-      common = {
-        # Auto-blocking policy for SFTPGo and thus helps to prevent DoS (Denial of Service) and brute force password guessing.
-        defender = {
-          enable = true;
-        };
-      };
-      # Where to store SFTPGo's data
-      data_provider = {
-        driver = "sqlite";
-        name = "sftpgo.db";
-        password_hashing = {
-          algo = "argon2id";
-          # options for argon2id hashing algorithm.
-          # The memory and iterations parameters control the computational cost of hashing the password.
-          argon2_options = {
-            memory = 65536; # KiB
-            iterations = 2; # The number of iterations over the memory.
-            parallelism = 2; # The number of threads (or lanes) used by the algorithm.
-          };
-        };
-        password_validation = {
-          # What Entropy Value Should I Use?
-          # somewhere in the 50-70 range seems "reasonable".
-          # https://github.com/wagslane/go-password-validator#what-entropy-value-should-i-use
-          admins.min_entropy = 60;
-          users.min_entropy = 60;
-        };
-        # Cache passwords in memory to avoid hashing the same password multiple times (hashing is expensive).
-        password_caching = true;
-        # create the default admin user via environment variables
-        # SFTPGO_DEFAULT_ADMIN_USERNAME and SFTPGO_DEFAULT_ADMIN_PASSWORD
-        create_default_admin = true;
-      };
-
-      # WebDAV is a popular protocol for file sharing, better than CIFS/SMB, NFS, etc.
-      # It's safe to use WebDAV over HTTPS on public networks.
-      webdavd.bindings = [
-        {
-          address = "127.0.0.1";
-          port = 3303;
-        }
-      ];
-      # HTTP Server provides a simple web interface to manage the server.
-      httpd.bindings = [
-        {
-          address = "127.0.0.1";
-          enable_https = false;
-          port = 3302;
-          client_ip_proxy_header = "X-Forwarded-For";
-          # a basic built-in web interface that allows you to manage users,
-          # virtual folders, admins and connections.
-          # url: http://127.0.0.1:3302/web/admin
-          enable_web_admin = true;
-          # A basic front-end web interface for your users.
-          # It allows end-users to browse and manage their files and change their credentials.
-          enable_web_client = true;
-          enable_rest_api = true;
-        }
-      ];
-      # prometheus metrics
-      telemetry = {
-        bind_port = 10000;
-        bind_address = "0.0.0.0";
-        # auth_user_file = "";
-      };
-      # multi-factor authentication settings
-      mfa.totp = [
-        {
-          # Unique configuration name, not visible to the authentication apps.
-          # Should not be changed after the first user has been created.
-          name = "SFTPGo";
-          # Name of the issuing Organization/Company
-          issuer = "SFTPGo";
-          # Algorithm to use for HMAC
-          # Currently Google Authenticator app on iPhone seems to only support sha1
-          algo = "sha1";
-        }
-      ];
-      # SMTP configuration enables SFTPGo email sending capabilities
-      # smtp = {};
-    };
-  };
-}
@@ -1,117 +0,0 @@
-{
-  config,
-  myvars,
-  ...
-}: let
-  dataDir = "/var/lib/transmission";
-  name = "transmission";
-in {
-  # the headless Transmission BitTorrent daemon
-  # https://github.com/NixOS/nixpkgs/blob/nixos-24.05/nixos/modules/services/torrent/transmission.nix
-  # https://wiki.archlinux.org/title/transmission
-  services.transmission = {
-    enable = true;
-    user = name;
-    group = name;
-    home = dataDir;
-    downloadDirPermissions = "0770";
-
-    # Whether to enable tweaking of kernel parameters to open many more connections at the same time.
-    # Note that you may also want to increase peer-limit-global.
-    # And be aware that these settings are quite aggressive and might not suit your regular desktop use.
-    # For instance, SSH sessions may time out more easily.
-    performanceNetParameters = true;
-
-    # Path to a JSON file to be merged with the settings.
-    # Useful to merge a file which is better kept out of the Nix store to set secret config parameters like `rpc-password`.
-    credentialsFile = config.age.secrets."transmission-credentials.json".path;
-
-    # Whether to open the RPC port in the firewall.
-    openRPCPort = false;
-    openPeerPorts = true;
-
-    # https://github.com/transmission/transmission/blob/main/docs/Editing-Configuration-Files.md
-    settings = {
-      # 0 = None, 1 = Critical, 2 = Error, 3 = Warn, 4 = Info, 5 = Debug, 6 = Trace;
-      message-level = 3;
-
-      # Encryption may help get around some ISP filtering,
-      # but at the cost of slightly higher CPU use.
-      # 0 = Prefer unencrypted connections,
-      # 1 = Prefer encrypted connections,
-      # 2 = Require encrypted connections (default = 1)
-      encryption = 2;
-
-      # rpc = Web Interface
-      rpc-port = 9091;
-      rpc-bind-address = "127.0.0.1";
-      anti-brute-force-enabled = true;
-      # After this amount of failed authentication attempts is surpassed,
-      # the RPC server will deny any further authentication attempts until it is restarted.
-      # This is not tracked per IP but in total.
-      anti-brute-force-threshold = 20;
-      rpc-authentication-required = true;
-
-      # Comma-delimited list of IP addresses.
-      # Wildcards allowed using '*'. Example: "127.0.0.*,192.168.*.*",
-      rpc-whitelist-enabled = true;
-      rpc-whitelist = "127.0.0.*,192.168.*.*";
-      # Comma-delimited list of domain names.
-      # Wildcards allowed using '*'. Example: "*.foo.org,example.com",
-      rpc-host-whitelist-enabled = true;
-      rpc-host-whitelist = "*.writefor.fun,localhost,192.168.5.*";
-      rpc-user = myvars.username;
-      rpc-username = myvars.username;
-      # rpc-password = "test"; # you'd better use the credentialsFile for this.
-
-      incomplete-dir-enabled = true;
-      incomplete-dir = "${dataDir}/incomplete";
-      download-dir = "${dataDir}/downloads";
-
-      # Watch a directory for torrent files and add them to transmission.
-      watch-dir-enabled = false;
-      watch-dir = "${dataDir}/watch";
-      # Whether to enable Micro Transport Protocol (µTP).
-      utp-enabled = true;
-      # Executable to be run at torrent completion.
-      script-torrent-done-enabled = false;
-      # script-torrent-done-filename = "/path/to/script";
-
-      # Enable Local Peer Discovery (LPD).
-      lpd-enabled = true;
-      # The peer port to listen for incoming connections.
-      peer-port = 51413;
-      # Enable UPnP or NAT-PMP to forward a port through your firewall (NAT).
-      # https://github.com/transmission/transmission/blob/main/docs/Port-Forwarding-Guide.md
-      port-forwarding-enabled = true;
-
-      # "normal" speed limits
-      speed-limit-down-enabled = true;
-      speed-limit-down = 30000; # KB/s
-      speed-limit-up-enabled = true;
-      speed-limit-up = 500; # KB/s
-      upload-slots-per-torrent = 8;
-
-      # Start torrents as soon as they are added
-      start-added-torrents = true;
-
-      # Queuing
-      # When true, Transmission will only download
-      # download-queue-size non-stalled torrents at once.
-      download-queue-enabled = true;
-      download-queue-size = 5;
-
-      # When true, torrents that have not shared data for
-      # queue-stalled-minutes are treated as 'stalled'
-      # and are not counted against the download-queue-size
-      # and seed-queue-size limits.
-      queue-stalled-enabled = true;
-      queue-stalled-minutes = 60;
-
-      # When true, Transmission will only seed seed-queue-size
-      # non-stalled torrents at once.
-      seed-queue-enabled = true;
-      seed-queue-size = 10;
-    };
-  };
-}
@@ -1,12 +0,0 @@
-{
-  # https://github.com/NixOS/nixpkgs/blob/nixos-24.05/nixos/modules/services/monitoring/uptime-kuma.nix
-  services.uptime-kuma = {
-    enable = true;
-    # https://github.com/louislam/uptime-kuma/wiki/Environment-Variables
-    settings = {
-      "UPTIME_KUMA_HOST" = "127.0.0.1";
-      "UPTIME_KUMA_PORT" = "3350";
-      "DATA_DIR" = "/var/lib/uptime-kuma/";
-    };
-  };
-}