feat: migrate services to aqua

This commit is contained in:
Ryan Yin
2024-07-31 11:11:14 +08:00
parent 80e0bcf031
commit b671c05db9
38 changed files with 11 additions and 8 deletions
-97
View File
@@ -1,97 +0,0 @@
{
  pkgs,
  config,
  wallpapers,
  ...
}: let
  # Directives shared by the sites that terminate TLS themselves: response
  # compression plus the ECC server certificate, with the private key path
  # taken from config.age.secrets.
  tlsSiteDefaults = ''
    encode zstd gzip
    tls ${../../certs/ecc-server.crt} ${config.age.secrets."certs/ecc-server.key".path} {
      protocols tls1.3 tls1.3
      curves x25519 secp384r1 secp521r1
    }
  '';

  # Site body for a compressed reverse proxy to a service listening on localhost.
  proxyToLocalPort = port: ''
    encode zstd gzip
    reverse_proxy http://localhost:${toString port}
  '';
in {
  services.caddy = {
    enable = true;
    # Reload Caddy instead of restarting it when the configuration file changes.
    enableReload = true;
    # User account under which caddy runs.
    user = "caddy";
    dataDir = "/var/lib/caddy";
    logDir = "/var/log/caddy";

    # Lines appended to the global options section of the Caddyfile.
    # See https://caddyserver.com/docs/caddyfile/options#global-options for supported values.
    globalConfig = ''
      http_port    80
      https_port   443
      auto_https   disable_certs
    '';

    virtualHosts = {
      # Dashboard
      "home.writefor.fun".extraConfig = ''
        ${tlsSiteDefaults}
        reverse_proxy http://localhost:4401
      '';

      # Static file server
      # https://caddyserver.com/docs/caddyfile/directives/file_server
      "file.writefor.fun".extraConfig = ''
        root * /var/lib/caddy/fileserver/
        ${tlsSiteDefaults}
        file_server browse {
          hide .git
          precompressed zstd br gzip
        }
      '';

      # Applications
      "git.writefor.fun".extraConfig = proxyToLocalPort 3301;
      "sftpgo.writefor.fun".extraConfig = proxyToLocalPort 3302;
      "webdav.writefor.fun".extraConfig = proxyToLocalPort 3303;
      "transmission.writefor.fun".extraConfig = proxyToLocalPort 9091;

      # Monitoring
      "uptime-kuma.writefor.fun".extraConfig = proxyToLocalPort 3350;
      "grafana.writefor.fun".extraConfig = proxyToLocalPort 3351;
      "prometheus.writefor.fun".extraConfig = proxyToLocalPort 9090;
      "alertmanager.writefor.fun".extraConfig = proxyToLocalPort 9093;
    };
  };

  networking.firewall.allowedTCPPorts = [80 443];

  # Directories served by the file server.
  systemd.tmpfiles.rules = [
    "d /var/lib/caddy/fileserver/ 0755 caddy caddy"
    # directory for virtual machine's images
    "d /var/lib/caddy/fileserver/vms 0755 caddy caddy"
  ];

  # On activation, sync all wallpapers into /var/lib/caddy/fileserver/wallpapers
  # so that they are reachable through the file server.
  system.activationScripts.installCaddyWallpapers = ''
    mkdir -p /var/lib/caddy/fileserver/wallpapers
    ${pkgs.rsync}/bin/rsync -avz --chmod=D2755,F644 ${wallpapers}/ /var/lib/caddy/fileserver/wallpapers/
  '';
}
@@ -1,3 +0,0 @@
# Sets this module's `imports` to whatever `mylib.scanPaths` returns for the
# current directory.
# NOTE(review): presumably scanPaths lists the sibling modules/sub-directories
# so they do not have to be imported one by one — confirm against mylib.
{mylib, ...}: {
imports = mylib.scanPaths ./.;
}
-88
View File
@@ -1,88 +0,0 @@
# Self-hosted Gitea instance bound to 127.0.0.1:3301, backed by a local
# SQLite database, with registration disabled and outgoing mail via sendmail.
{pkgs, ...}: {
  # https://github.com/NixOS/nixpkgs/blob/nixos-24.05/nixos/modules/services/misc/gitea.nix
  services.gitea = {
    enable = true;
    user = "gitea";
    group = "gitea";
    stateDir = "/var/lib/gitea";
    appName = "Ryan Yin's Gitea Service";
    lfs.enable = true;

    # Timer that runs `gitea dump` to generate backup files of the current
    # gitea database and repositories. Currently disabled; the remaining
    # values only take effect once `enable` is switched on.
    dump = {
      enable = false;
      interval = "hourly";
      file = "gitea-dump";
      type = "tar.zst";
    };

    # Path to a file containing the SMTP password.
    # mailerPasswordFile = "";

    settings = {
      server = {
        SSH_PORT = 2222;
        PROTOCOL = "http";
        HTTP_PORT = 3301;
        # Listen on loopback only.
        HTTP_ADDR = "127.0.0.1";
        DOMAIN = "git.writefor.fun";
      };
      # one of "Trace", "Debug", "Info", "Warn", "Error", "Critical"
      log.LEVEL = "Info";
      session.COOKIE_SECURE = false;
      service.DISABLE_REGISTRATION = true;
      # "cron.sync_external_users" = {
      #   RUN_AT_START = true;
      #   SCHEDULE = "@every 24h";
      #   UPDATE_EXISTING = true;
      # };
      mailer = {
        ENABLED = true;
        # `MAILER_TYPE` was deprecated in Gitea 1.18 in favour of `PROTOCOL`.
        PROTOCOL = "sendmail";
        FROM = "do-not-reply@writefor.fun";
        SENDMAIL_PATH = "${pkgs.system-sendmail}/bin/sendmail";
      };
      other = {
        SHOW_FOOTER_VERSION = false;
      };
    };

    database = {
      type = "sqlite3";
      # create a local database automatically.
      createDatabase = true;
    };
  };

  # services.gitea-actions-runner.instances."default" = {
  #   enable = true;
  #   name = "default";
  #   labels = [
  #     # provide a debian base with nodejs for actions
  #     "debian-latest:docker://node:18-bullseye"
  #     # fake the ubuntu name, because node provides no ubuntu builds
  #     "ubuntu-latest:docker://node:18-bullseye"
  #     # provide native execution on the host
  #     "native:host"
  #   ];
  #   gitea = "http://git.writefor.fun";
  #   # Path to an environment file,
  #   # containing the TOKEN environment variable,
  #   # that holds a token to register at the configured Gitea instance.
  #   tokenFile = "xxx"; # use agenix for secrets.
  #   # Configuration for act_runner daemon.
  #   # For an example configuration, see:
  #   # https://gitea.com/gitea/act_runner/src/branch/main/internal/pkg/config/config.example.yaml
  #   settings = {};
  #   # List of packages, that are available to actions,
  #   # when the runner is configured with a host execution label.
  #   hostPackages = with pkgs; [
  #     bash
  #     coreutils
  #     curl
  #     gawk
  #     gitMinimal
  #     gnused
  #     nodejs
  #     wget
  #   ];
  # };
}
@@ -1,20 +0,0 @@
apiVersion: 1
providers:
# <string> a unique provider name. Required
- name: "Dashboards"
# <int> Org id. Default to 1
orgId: 1
# <string> provider type. Default to 'file'
type: file
# <bool> disable dashboard deletion
disableDeletion: false
# <int> how often Grafana will scan for changed dashboards
updateIntervalSeconds: 20
# <bool> allow updating provisioned dashboards from the UI
allowUiUpdates: false
options:
# <string, required> path to dashboard files on disk. Required when using the 'file' type
path: /etc/grafana/dashboards
# <bool> use folder names from filesystem to create folders in Grafana
foldersFromFilesStructure: true
@@ -1,10 +0,0 @@
# Grafana Dashboards
## Homelab
1. https://grafana.com/grafana/dashboards/1860-node-exporter-full/
2. https://grafana.com/grafana/dashboards/9578-alertmanager/
## Kubernetes
1. https://github.com/dotdc/grafana-dashboards-kubernetes/
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -1,21 +0,0 @@
# https://grafana.com/docs/grafana/latest/datasources/prometheus/
apiVersion: 1
datasources:
- name: prometheus-homelab
type: prometheus
access: proxy
# Access mode - proxy (server in the UI) or direct (browser in the UI).
url: http://localhost:9090
jsonData:
httpMethod: POST
manageAlerts: true
prometheusType: Prometheus
prometheusVersion: 2.49.0
cacheLevel: "High"
disableRecordingRules: false
# As of Grafana 10, the Prometheus data source can be configured to query live dashboards
# incrementally, instead of re-querying the entire duration on each dashboard refresh.
# Increasing the duration of the incrementalQueryOverlapWindow will increase the size of every incremental query,
# but might be helpful for instances that have inconsistent results for recent data.
incrementalQueryOverlapWindow: 10m
@@ -1,52 +0,0 @@
# Grafana bound to 127.0.0.1:3351, with data sources and dashboards
# provisioned declaratively from the files next to this module.
{
  config,
  myvars,
  ...
}: {
  services.grafana = {
    enable = true;
    dataDir = "/var/lib/grafana";
    # DeclarativePlugins = with pkgs.grafanaPlugins; [ grafana-piechart-panel ];
    settings = {
      server = {
        http_addr = "127.0.0.1";
        http_port = 3351;
        protocol = "http";
        # Public host name; must match the name the service is reached under.
        # (Was misspelled "grafana.writefo.fun".)
        domain = "grafana.writefor.fun";
        serve_from_sub_path = false;
        # Add subpath to the root_url if serve_from_sub_path is true
        # NOTE(review): this expands to a URL containing port 3351 — confirm
        # that is the intended external URL.
        root_url = "%(protocol)s://%(domain)s:%(http_port)s/";
        # Redirect to correct domain if the host header does not match the domain.
        # Prevents DNS rebinding attacks. Currently turned off.
        enforce_domain = false;
        read_timeout = "180s";
        # Enable HTTP compression, this can improve transfer speed and bandwidth utilization.
        enable_gzip = true;
        # CDN for accelerating loading of frontend assets.
        # cdn_url = "https://cdn.jsdelivr.net/npm/grafana@7.5.5";
      };

      security = {
        admin_user = myvars.username;
        admin_email = myvars.useremail;
        # Use file provider to read the admin password from a file.
        # https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/#file-provider
        admin_password = "$__file{${config.age.secrets."grafana-admin-password".path}}";
      };

      users = {
        allow_sign_up = false;
        # home_page = "";
        default_theme = "dark";
      };
    };

    # Declaratively provision Grafana's data sources, dashboards, and alerting rules.
    # Grafana's own alerting rules are not used here; Prometheus Alertmanager is used instead.
    # https://grafana.com/docs/grafana/latest/administration/provisioning/#data-sources
    provision = {
      datasources.path = ./datasources.yml;
      dashboards.path = ./dashboards.yml;
    };
  };

  # Expose the dashboard JSON files under /etc/grafana/dashboards.
  environment.etc."grafana/dashboards".source = ./dashboards;
}
@@ -1 +0,0 @@
# Homepage for my Homelab
@@ -1,8 +0,0 @@
---
- About Me:
- Blog:
- abbr: Blog
href: https://thiscute.world/
- Github:
- abbr: GH
href: https://github.com/ryan4yin
@@ -1,3 +0,0 @@
# kana-docker:
# socket: /var/run/docker.sock
#
@@ -1,6 +0,0 @@
# https://gethomepage.dev/latest/configs/kubernetes/
# uses the default kubeconfig to access the cluster
# read kubeconfig from $KUBECONFIG or $HOME/.kube/config
# mode: default
mode: disabled
@@ -1,75 +0,0 @@
---
# For configuration options and examples, please see:
# https://gethomepage.dev/latest/configs/services
- KubeVirt 虚拟化集群:
- KubeVirt-Shoryu:
icon: si-kubevirt
description: "CPU: R7-5825U / MEM: 64G / DISK: 1T"
href: http://grafana.writefor.fun/d/rYdddlPWk/node-exporter-full?orgId=1&var-node=192.168.5.181:9100
siteMonitor: http://grafana.writefor.fun/d/rYdddlPWk/node-exporter-full?orgId=1&var-node=192.168.5.181:9100
- KubeVirt-Shushou:
icon: si-kubevirt
description: "CPU: R9-5900HX / MEM: 64G / DISK: 1T"
href: http://grafana.writefor.fun/d/rYdddlPWk/node-exporter-full?orgId=1&var-node=192.168.5.182:9100
siteMonitor: http://grafana.writefor.fun/d/rYdddlPWk/node-exporter-full?orgId=1&var-node=192.168.5.182:9100
- KubeVirt-Youko:
icon: si-kubevirt
description: "CPU: R5-5625U / MEM: 32G / DISK: 512G+4T*2"
href: http://grafana.writefor.fun/d/rYdddlPWk/node-exporter-full?orgId=1&var-node=192.168.5.183:9100
siteMonitor: http://grafana.writefor.fun/d/rYdddlPWk/node-exporter-full?orgId=1&var-node=192.168.5.183:9100
- LongHorn-Storage:
icon: si-longhorn
href: http://longhorn.writefor.fun/
- Victoria-Metrics:
icon: si-victoriametrics
href: http://vm.writefor.fun/
- KubeVirt-Grafana:
icon: si-grafana
href: http://k8s-grafana.writefor.fun/
- Homelab Monitoring:
- Grafana:
icon: si-grafana
href: http://grafana.writefor.fun
description: Data visualised on dashboards
siteMonitor: http://grafana.writefor.fun
- Prometheus Dashboard:
icon: si-prometheus
href: http://prometheus.writefor.fun
description: Monitoring - Prometheus
siteMonitor: http://prometheus.writefor.fun
- Uptime Kuma:
icon: si-uptimekuma
href: http://uptime-kuma.writefor.fun
description: Uptime Checking
siteMonitor: http://uptime-kuma.writefor.fun
- Homelab Applications:
- SFTPGO:
icon: sftpgo.png
href: "http://sftpgo.writefor.fun/web/admin/folders"
description: WebDAV & SFTP server
siteMonitor: http://sftpgo.writefor.fun/
# - Kubernetes Monitoring:
# # TODO: Update this
# - Emby:
# icon: emby.png
# href: "http://emby.home/"
# description: Media server
# namespace: media # The kubernetes namespace the app resides in
# app: emby # The name of the deployed app
#
# - Element Chat:
# icon: matrix-light.png
# href: https://chat.example.com
# description: Matrix Synapse Powered Chat
# app: matrix-element
# namespace: comms
# pod-selector: >-
# app.kubernetes.io/instance in (
# matrix-element,
# matrix-media-repo,
# matrix-media-repo-postgresql,
# matrix-synapse
# )
@@ -1,82 +0,0 @@
---
# For configuration options and examples, please see:
# https://gethomepage.dev/latest/configs/settings
title: Ryan Yin's Homelab
base: https://home.writefor.fun/
favicon: https://thiscute.world/favicon.ico
# https://developer.mozilla.org/en-US/docs/Web/Manifest/start_url
# Used by some browsers to determine the start page of the web application
startUrl: https://home.writefor.fun/
language: zh
# Define shared API provider options and secrets here,
# You can then pass provider instead of apiKey in your widget configuration.
providers:
  # API keys are injected from environment variables: the literal text
  # {{HOMEPAGE_VAR_XXX}} is replaced by the value of that variable, so the
  # braces must stay adjacent. The values are quoted so that YAML tooling does
  # not turn them into nested flow mappings ("{ { ... } }").
  # NOTE(review): both providers reference the WEATHERAPI key, as before —
  # confirm whether openweathermap should use its own variable.
  openweathermap: "{{HOMEPAGE_VAR_WEATHERAPI_APIKEY}}"
  weatherapi: "{{HOMEPAGE_VAR_WEATHERAPI_APIKEY}}"
background:
image: https://file.writefor.fun/wallpapers/rolling-girls.png
blur: sm # sm, "", md, xl... see https://tailwindcss.com/docs/backdrop-blur
saturate: 90 # 0, 50, 100... see https://tailwindcss.com/docs/backdrop-saturate
brightness: 90 # 0, 50, 75... see https://tailwindcss.com/docs/backdrop-brightness
opacity: 85 # 0-100
theme: dark # or light
# Supported colors are:
# slate, gray, zinc, neutral, stone, amber,
# yellow, lime, green, emerald, teal, cyan,
# sky, blue, indigo, violet, purple, fuchsia, pink, rose, red, white
color: indigo
# make all cards in a row the same height.
useEqualHeights: true
# Groups and its layout
# Groups Name should match the name defined in your services.yaml or widgets.yaml
layout:
KubeVirt 虚拟化集群:
icon: si-kubevirt
tab: First
Group A:
initiallyCollapsed: true # collapsed by default
tab: First
style: row
columns: 4
Second Service Group:
useEqualHeights: true # overrides global setting
tab: Second
columns: 4
Third Service Group:
tab: Third
style: row
Bookmark Group on Fourth Tab:
tab: Fourth
Service Group on every Tab:
style: row
columns: 4
# https://gethomepage.dev/latest/configs/services/#icons
# iconStyle: theme # optional, defaults to gradient
# Typing in homepage to quick search
quicklaunch:
searchDescriptions: true
hideInternetSearch: true
showSearchSuggestions: true
hideVisitURL: true
# Show docker stats
showStats: true
hideErrors: false
@@ -1,21 +0,0 @@
# TODO: add access to kubernetes cluster
# - kubernetes:
# cluster:
# show: true
# cpu: true
# memory: true
# showLabel: true
# label: "cluster"
# nodes:
# show: true
# cpu: true
# memory: true
# showLabel: true
# - resources:
# backend: resources
# expanded: true
# cpu: true
# memory: true
- search:
provider: google
target: _blank
@@ -1,25 +0,0 @@
{pkgs, ...}: let
  # Directory that HOMEPAGE_CONFIG_DIR points at and that the activation
  # script below fills with the files from ./config.
  homepageConfigDir = "/var/lib/homepage-dashboard";

  rsync = "${pkgs.rsync}/bin/rsync";
  systemctl = "${pkgs.systemdMinimal}/bin/systemctl";
in {
  # https://github.com/NixOS/nixpkgs/blob/nixos-unstable/nixos/modules/services/misc/homepage-dashboard.nix
  services.homepage-dashboard = {
    enable = true;
    listenPort = 4401;
    openFirewall = false;
  };

  systemd.services.homepage-dashboard.environment = {
    HOMEPAGE_CONFIG_DIR = homepageConfigDir;
    # 1. The value of env var HOMEPAGE_VAR_XXX will replace {{HOMEPAGE_VAR_XXX}} in any config
    # HOMEPAGE_VAR_XXX_APIKEY = "myapikey";
    # 2. The value of env var HOMEPAGE_FILE_XXX must be a file path,
    #    the contents of which will be used to replace {{HOMEPAGE_FILE_XXX}} in any config
  };

  # Copy the configuration files from ./config into the config directory,
  # then restart the service.
  system.activationScripts.installHomepageDashboardConfig = ''
    mkdir -p ${homepageConfigDir}
    ${rsync} -avz --chmod=D2755,F600 ${./config}/ ${homepageConfigDir}/
    ${systemctl} restart homepage-dashboard
  '';
}
@@ -1,6 +0,0 @@
# Prometheus & Alertmanager
## Alert Rules
- [awesome-prometheus-alerts](https://github.com/samber/awesome-prometheus-alerts): Collection of
Prometheus alerting rules
@@ -1,13 +0,0 @@
groups:
- name: EmbeddedExporter
rules:
- alert: CorednsPanicCount
expr: "increase(coredns_panics_total[1m]) > 0"
for: 0m
labels:
severity: critical
annotations:
summary: CoreDNS Panic Count (instance {{ $labels.instance }})
description:
"Number of CoreDNS panics encountered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
@@ -1,162 +0,0 @@
groups:
- name: EmbeddedExporter
rules:
- alert: EtcdInsufficientMembers
expr: "count(etcd_server_id) % 2 == 0"
for: 0m
labels:
severity: critical
annotations:
summary: Etcd insufficient Members (instance {{ $labels.instance }})
description:
"Etcd cluster should have an odd number of members\n VALUE = {{ $value }}\n LABELS =
{{ $labels }}"
- alert: EtcdNoLeader
expr: "etcd_server_has_leader == 0"
for: 0m
labels:
severity: critical
annotations:
summary: Etcd no Leader (instance {{ $labels.instance }})
description:
"Etcd cluster have no leader\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EtcdHighNumberOfLeaderChanges
expr: "increase(etcd_server_leader_changes_seen_total[10m]) > 2"
for: 0m
labels:
severity: warning
annotations:
summary: Etcd high number of leader changes (instance {{ $labels.instance }})
description:
"Etcd leader changed more than 2 times during 10 minutes\n VALUE = {{ $value
}}\n LABELS = {{ $labels }}"
- alert: EtcdHighNumberOfFailedGrpcRequests
expr:
'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[1m])) BY (grpc_service, grpc_method)
/ sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.01'
for: 2m
labels:
severity: warning
annotations:
summary: Etcd high number of failed GRPC requests (instance {{ $labels.instance }})
description:
"More than 1% GRPC request failure detected in Etcd\n VALUE = {{ $value }}\n LABELS =
{{ $labels }}"
- alert: EtcdHighNumberOfFailedGrpcRequests
expr:
'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[1m])) BY (grpc_service, grpc_method)
/ sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.05'
for: 2m
labels:
severity: critical
annotations:
summary: Etcd high number of failed GRPC requests (instance {{ $labels.instance }})
description:
"More than 5% GRPC request failure detected in Etcd\n VALUE = {{ $value }}\n LABELS =
{{ $labels }}"
- alert: EtcdGrpcRequestsSlow
expr:
'histogram_quantile(0.99,
sum(rate(grpc_server_handling_seconds_bucket{grpc_type="unary"}[1m])) by (grpc_service,
grpc_method, le)) > 0.15'
for: 2m
labels:
severity: warning
annotations:
summary: Etcd GRPC requests slow (instance {{ $labels.instance }})
description:
"GRPC requests slowing down, 99th percentile is over 0.15s\n VALUE = {{ $value
}}\n LABELS = {{ $labels }}"
- alert: EtcdHighNumberOfFailedHttpRequests
expr:
"sum(rate(etcd_http_failed_total[1m])) BY (method) /
sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01"
for: 2m
labels:
severity: warning
annotations:
summary: Etcd high number of failed HTTP requests (instance {{ $labels.instance }})
description:
"More than 1% HTTP failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{
$labels }}"
- alert: EtcdHighNumberOfFailedHttpRequests
expr:
"sum(rate(etcd_http_failed_total[1m])) BY (method) /
sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05"
for: 2m
labels:
severity: critical
annotations:
summary: Etcd high number of failed HTTP requests (instance {{ $labels.instance }})
description:
"More than 5% HTTP failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{
$labels }}"
- alert: EtcdHttpRequestsSlow
expr:
"histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[1m])) > 0.15"
for: 2m
labels:
severity: warning
annotations:
summary: Etcd HTTP requests slow (instance {{ $labels.instance }})
description:
"HTTP requests slowing down, 99th percentile is over 0.15s\n VALUE = {{ $value
}}\n LABELS = {{ $labels }}"
- alert: EtcdMemberCommunicationSlow
expr:
"histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) >
0.15"
for: 2m
labels:
severity: warning
annotations:
summary: Etcd member communication slow (instance {{ $labels.instance }})
description:
"Etcd member communication slowing down, 99th percentile is over 0.15s\n VALUE = {{
$value }}\n LABELS = {{ $labels }}"
- alert: EtcdHighNumberOfFailedProposals
expr: "increase(etcd_server_proposals_failed_total[1h]) > 5"
for: 2m
labels:
severity: warning
annotations:
summary: Etcd high number of failed proposals (instance {{ $labels.instance }})
description:
"Etcd server got more than 5 failed proposals past hour\n VALUE = {{ $value
}}\n LABELS = {{ $labels }}"
- alert: EtcdHighFsyncDurations
expr:
"histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5"
for: 2m
labels:
severity: warning
annotations:
summary: Etcd high fsync durations (instance {{ $labels.instance }})
description:
"Etcd WAL fsync duration increasing, 99th percentile is over 0.5s\n VALUE = {{ $value
}}\n LABELS = {{ $labels }}"
- alert: EtcdHighCommitDurations
expr:
"histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) >
0.25"
for: 2m
labels:
severity: warning
annotations:
summary: Etcd high commit durations (instance {{ $labels.instance }})
description:
"Etcd commit duration increasing, 99th percentile is over 0.25s\n VALUE = {{ $value
}}\n LABELS = {{ $labels }}"
@@ -1,123 +0,0 @@
groups:
- name: EmbeddedExporter
rules:
- alert: IstioKubernetesGatewayAvailabilityDrop
expr:
'min(kube_deployment_status_replicas_available{deployment="istio-ingressgateway",
namespace="istio-system"}) without (instance, pod) < 2'
for: 1m
labels:
severity: warning
annotations:
summary: Istio Kubernetes gateway availability drop (instance {{ $labels.instance }})
description:
"Gateway pods have dropped. Inbound traffic will likely be affected.\n VALUE = {{
$value }}\n LABELS = {{ $labels }}"
- alert: IstioPilotHighTotalRequestRate
expr: "sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5"
for: 1m
labels:
severity: warning
annotations:
summary: Istio Pilot high total request rate (instance {{ $labels.instance }})
description:
"Number of Istio Pilot push errors is too high (> 5%). Envoy sidecars might have
outdated configuration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: IstioMixerPrometheusDispatchesLow
expr: 'sum(rate(mixer_runtime_dispatches_total{adapter=~"prometheus"}[1m])) < 180'
for: 1m
labels:
severity: warning
annotations:
summary: Istio Mixer Prometheus dispatches low (instance {{ $labels.instance }})
description:
"Number of Mixer dispatches to Prometheus is too low. Istio metrics might not be being
exported properly.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: IstioHighTotalRequestRate
expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) > 1000'
for: 2m
labels:
severity: warning
annotations:
summary: Istio high total request rate (instance {{ $labels.instance }})
description:
"Global request rate in the service mesh is unusually high.\n VALUE = {{ $value
}}\n LABELS = {{ $labels }}"
- alert: IstioLowTotalRequestRate
expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) < 100'
for: 2m
labels:
severity: warning
annotations:
summary: Istio low total request rate (instance {{ $labels.instance }})
description:
"Global request rate in the service mesh is unusually low.\n VALUE = {{ $value
}}\n LABELS = {{ $labels }}"
# Share of destination-reported requests answered with a 4xx code over the
# last 5 minutes, in percent; fires above 5% sustained for 1 minute.
- alert: IstioHigh4xxErrorRate
  expr:
    'sum(rate(istio_requests_total{reporter="destination", response_code=~"4.*"}[5m])) /
    sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5'
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: Istio high 4xx error rate (instance {{ $labels.instance }})
    # The text previously said "5xx", copied from the sibling 5xx rule.
    description:
      "High percentage of HTTP 4xx responses in Istio (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: IstioHigh5xxErrorRate
expr:
'sum(rate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) /
sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5'
for: 1m
labels:
severity: warning
annotations:
summary: Istio high 5xx error rate (instance {{ $labels.instance }})
description:
"High percentage of HTTP 5xx responses in Istio (> 5%).\n VALUE = {{ $value
}}\n LABELS = {{ $labels }}"
- alert: IstioHighRequestLatency
expr:
'rate(istio_request_duration_milliseconds_sum{reporter="destination"}[1m]) /
rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 100'
for: 1m
labels:
severity: warning
annotations:
summary: Istio high request latency (instance {{ $labels.instance }})
description:
"Istio average requests execution is longer than 100ms.\n VALUE = {{ $value
}}\n LABELS = {{ $labels }}"
- alert: IstioLatency99Percentile
expr:
"histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by
(destination_canonical_service, destination_workload_namespace, source_canonical_service,
source_workload_namespace, le)) > 1000"
for: 1m
labels:
severity: warning
annotations:
summary: Istio latency 99 percentile (instance {{ $labels.instance }})
description:
"Istio 1% slowest requests are longer than 1000ms.\n VALUE = {{ $value }}\n LABELS =
{{ $labels }}"
- alert: IstioPilotDuplicateEntry
expr: "sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0"
for: 0m
labels:
severity: critical
annotations:
summary: Istio Pilot Duplicate Entry (instance {{ $labels.instance }})
description:
"Istio pilot duplicate entry error.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
@@ -1,435 +0,0 @@
groups:
- name: KubestateExporter
rules:
# Fires when the Ready/true condition series of a node has been 0 for 10 minutes.
- alert: KubernetesNodeNotReady
  expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
  for: 10m
  labels:
    severity: critical
  annotations:
    # Summary previously read "Node ready", the opposite of what is alerted on.
    summary: Kubernetes Node not ready (node {{ $labels.node }})
    description:
      "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesNodeMemoryPressure
expr: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1'
for: 2m
labels:
severity: critical
annotations:
summary: Kubernetes memory pressure (node {{ $labels.node }})
description:
"Node {{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS
= {{ $labels }}"
- alert: KubernetesNodeDiskPressure
expr: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1'
for: 2m
labels:
severity: critical
annotations:
summary: Kubernetes disk pressure (node {{ $labels.node }})
description:
"Node {{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS =
{{ $labels }}"
- alert: KubernetesNodeNetworkUnavailable
expr: 'kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1'
for: 2m
labels:
severity: critical
annotations:
summary: Kubernetes Node network unavailable (instance {{ $labels.instance }})
description:
"Node {{ $labels.node }} has NetworkUnavailable condition\n VALUE = {{ $value
}}\n LABELS = {{ $labels }}"
- alert: KubernetesNodeOutOfPodCapacity
expr:
'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node)
(0 * kube_pod_info{pod_template_hash=""})) / sum by (node)
(kube_node_status_allocatable{resource="pods"}) * 100 > 90'
for: 2m
labels:
severity: warning
annotations:
summary: Kubernetes Node out of pod capacity (instance {{ $labels.instance }})
description:
"Node {{ $labels.node }} is out of pod capacity\n VALUE = {{ $value }}\n LABELS = {{
$labels }}"
- alert: KubernetesContainerOomKiller
expr:
'(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total
offset 10m >= 1) and ignoring (reason)
min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m])
== 1'
for: 0m
labels:
severity: warning
annotations:
summary:
Kubernetes container oom killer ({{ $labels.namespace }}/{{ $labels.pod }}:{{
$labels.container }})
description:
"Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has
been OOMKilled {{ $value }} times in the last 10 minutes.\n VALUE = {{ $value
}}\n LABELS = {{ $labels }}"
- alert: KubernetesJobFailed
expr: "kube_job_status_failed > 0"
for: 0m
labels:
severity: warning
annotations:
summary: Kubernetes Job failed ({{ $labels.namespace }}/{{ $labels.job_name }})
description:
"Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete\n VALUE = {{
$value }}\n LABELS = {{ $labels }}"
- alert: KubernetesCronjobSuspended
expr: "kube_cronjob_spec_suspend != 0"
for: 0m
labels:
severity: warning
annotations:
summary: Kubernetes CronJob suspended ({{ $labels.namespace }}/{{ $labels.cronjob }})
description:
"CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n VALUE = {{ $value
}}\n LABELS = {{ $labels }}"
- alert: KubernetesPersistentvolumeclaimPending
expr: 'kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1'
for: 2m
labels:
severity: warning
annotations:
summary:
Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{
$labels.persistentvolumeclaim }})
description:
"PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is
pending\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesVolumeOutOfDiskSpace
expr:
"kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10"
for: 2m
labels:
severity: warning
annotations:
summary: Kubernetes Volume out of disk space (instance {{ $labels.instance }})
description:
"Volume is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesVolumeFullInFourDays
expr: "predict_linear(kubelet_volume_stats_available_bytes[6h:5m], 4 * 24 * 3600) < 0"
for: 0m
labels:
severity: critical
annotations:
summary: Kubernetes Volume full in four days (instance {{ $labels.instance }})
description:
"Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to
fill up within four days. Currently {{ $value | humanize }}% is available.\n VALUE = {{
$value }}\n LABELS = {{ $labels }}"
# Fires immediately when a PersistentVolume is reported in phase Failed or Pending.
- alert: KubernetesPersistentvolumeError
  expr:
    'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    # Summary previously duplicated the PersistentVolumeClaim-pending rule;
    # it now names the volume, using the same label as the description.
    summary: Kubernetes PersistentVolume error ({{ $labels.persistentvolume }})
    description:
      "Persistent volume {{ $labels.persistentvolume }} is in bad state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetDown
expr: "kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0"
for: 1m
labels:
severity: critical
annotations:
summary: Kubernetes StatefulSet down ({{ $labels.namespace }}/{{ $labels.statefulset }})
description:
"StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down\n VALUE = {{
$value }}\n LABELS = {{ $labels }}"
- alert: KubernetesHpaScaleInability
expr:
'kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} ==
1'
for: 2m
labels:
severity: warning
annotations:
summary: Kubernetes HPA scale inability (instance {{ $labels.instance }})
description:
"HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to
scale\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesHpaMetricsUnavailability
expr:
'kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"}
== 1'
for: 0m
labels:
severity: warning
annotations:
summary: Kubernetes HPA metrics unavailability (instance {{ $labels.instance }})
description:
"HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect
metrics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesHpaScaleMaximum
expr:
"kube_horizontalpodautoscaler_status_desired_replicas >=
kube_horizontalpodautoscaler_spec_max_replicas"
for: 2m
labels:
severity: info
annotations:
summary: Kubernetes HPA scale maximum (instance {{ $labels.instance }})
description:
"HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum
number of desired pods\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesHpaUnderutilized
expr:
"max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) ==
kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler) > 3"
for: 0m
labels:
severity: info
annotations:
summary: Kubernetes HPA underutilized (instance {{ $labels.instance }})
description:
"HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is constantly at
minimum replicas for 50% of the time. Potential cost saving here.\n VALUE = {{ $value
}}\n LABELS = {{ $labels }}"
- alert: KubernetesPodNotHealthy
expr: 'sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0'
for: 15m
labels:
severity: critical
annotations:
summary: Kubernetes Pod not healthy ({{ $labels.namespace }}/{{ $labels.pod }})
description:
"Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for
longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesPodCrashLooping
expr: "increase(kube_pod_container_status_restarts_total[1m]) > 3"
for: 2m
labels:
severity: warning
annotations:
summary: Kubernetes pod crash looping ({{ $labels.namespace }}/{{ $labels.pod }})
description:
"Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping\n VALUE = {{ $value
}}\n LABELS = {{ $labels }}"
# Fires when a ReplicaSet's ready replica count has differed from its spec for 10 minutes.
- alert: KubernetesReplicasetReplicasMismatch
  expr: "kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas"
  for: 10m
  labels:
    severity: warning
  annotations:
    summary:
      Kubernetes ReplicaSet mismatch ({{ $labels.namespace }}/{{ $labels.replicaset }})
    description:
      "ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch\n VALUE
      = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDeploymentReplicasMismatch
expr: "kube_deployment_spec_replicas != kube_deployment_status_replicas_available"
for: 10m
labels:
severity: warning
annotations:
summary:
Kubernetes Deployment replicas mismatch ({{ $labels.namespace }}/{{ $labels.deployment
}})
description:
"Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch\n VALUE
= {{ $value }}\n LABELS = {{ $labels }}"
# Fires when a StatefulSet's ready replicas have differed from its current replicas
# for 10 minutes. The annotations identify the StatefulSet by namespace/name, matching
# the sibling Deployment/ReplicaSet/DaemonSet alerts.
- alert: KubernetesStatefulsetReplicasMismatch
  expr: "kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas"
  for: 10m
  labels:
    severity: warning
  annotations:
    summary:
      Kubernetes StatefulSet replicas mismatch ({{ $labels.namespace }}/{{
      $labels.statefulset }})
    description:
      "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} does not match the
      expected number of replicas.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDeploymentGenerationMismatch
expr: "kube_deployment_status_observed_generation != kube_deployment_metadata_generation"
for: 10m
labels:
severity: critical
annotations:
summary:
Kubernetes Deployment generation mismatch ({{ $labels.namespace }}/{{ $labels.deployment
}})
description:
"Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been
rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetGenerationMismatch
expr: "kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation"
for: 10m
labels:
severity: critical
annotations:
summary:
Kubernetes StatefulSet generation mismatch ({{ $labels.namespace }}/{{
$labels.statefulset }})
description:
"StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not
been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetUpdateNotRolledOut
expr:
"max without (revision) (kube_statefulset_status_current_revision unless
kube_statefulset_status_update_revision) * (kube_statefulset_replicas !=
kube_statefulset_status_replicas_updated)"
for: 10m
labels:
severity: warning
annotations:
summary:
Kubernetes StatefulSet update not rolled out ({{ $labels.namespace }}/{{
$labels.statefulset }})
description:
"StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been
rolled out.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDaemonsetRolloutStuck
expr:
"kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100
< 100 or kube_daemonset_status_desired_number_scheduled -
kube_daemonset_status_current_number_scheduled > 0"
for: 10m
labels:
severity: warning
annotations:
summary:
Kubernetes DaemonSet rollout stuck ({{ $labels.namespace }}/{{ $labels.daemonset }})
description:
"Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not
scheduled or not ready\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDaemonsetMisscheduled
expr: "kube_daemonset_status_number_misscheduled > 0"
for: 1m
labels:
severity: critical
annotations:
summary:
Kubernetes DaemonSet misscheduled ({{ $labels.namespace }}/{{ $labels.daemonset }})
description:
"Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running
where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesCronjobTooLong
expr: "time() - kube_cronjob_next_schedule_time > 3600"
for: 0m
labels:
severity: warning
annotations:
summary: Kubernetes CronJob too long ({{ $labels.namespace }}/{{ $labels.cronjob }})
description:
"CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to
complete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesJobSlowCompletion
expr: "kube_job_spec_completions - kube_job_status_succeeded - kube_job_status_failed > 0"
for: 12h
labels:
severity: critical
annotations:
summary: Kubernetes job slow completion ({{ $labels.namespace }}/{{ $labels.job_name }})
description:
"Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in
time.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesApiServerErrors
expr:
'sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) /
sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3'
for: 2m
labels:
severity: critical
annotations:
summary: Kubernetes API server errors (instance {{ $labels.instance }})
description:
"Kubernetes API server is experiencing high error rate\n VALUE = {{ $value }}\n LABELS
= {{ $labels }}"
- alert: KubernetesApiClientErrors
expr:
'(sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) /
sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1'
for: 2m
labels:
severity: critical
annotations:
summary: Kubernetes API client errors (instance {{ $labels.instance }})
description:
"Kubernetes API client is experiencing high error rate\n VALUE = {{ $value }}\n LABELS
= {{ $labels }}"
- alert: KubernetesClientCertificateExpiresNextWeek
expr:
'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and
histogram_quantile(0.01, sum by (job, le)
(rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) <
7*24*60*60'
for: 0m
labels:
severity: warning
annotations:
summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }})
description:
"A client certificate used to authenticate to the apiserver is expiring next
week.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesClientCertificateExpiresSoon
expr:
'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and
histogram_quantile(0.01, sum by (job, le)
(rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) <
24*60*60'
for: 0m
labels:
severity: critical
annotations:
summary: Kubernetes client certificate expires soon (instance {{ $labels.instance }})
description:
"A client certificate used to authenticate to the apiserver is expiring in less than
24.0 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesApiServerLatency
expr:
'histogram_quantile(0.99,
sum(rate(apiserver_request_duration_seconds_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}
[10m])) WITHOUT (instance, resource)) > 1'
for: 2m
labels:
severity: warning
annotations:
summary: Kubernetes API server latency (instance {{ $labels.instance }})
description:
"Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{
$labels.verb }} {{ $labels.resource }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels
}}"
@@ -1,508 +0,0 @@
groups:
- name: NodeExporter
rules:
- alert: HostOutOfMemory
expr:
'(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance)
group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{ $labels.instance }})
description:
"Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels
}}"
- alert: HostMemoryUnderMemoryPressure
expr:
'(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename)
node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host memory under memory pressure (instance {{ $labels.instance }})
description:
"The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{
$value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryIsUnderutilized
expr:
'(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes *
100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 1w
labels:
severity: info
annotations:
summary: Host Memory is underutilized (instance {{ $labels.instance }})
description:
"Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{
$labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputIn
expr:
'(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) *
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }})
description:
"Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{
$value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputOut
expr:
'(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) *
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }})
description:
"Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{
$value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadRate
expr:
'(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) *
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk read rate (instance {{ $labels.instance }})
description:
"Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS =
{{ $labels }}"
- alert: HostUnusualDiskWriteRate
expr:
'(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) *
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write rate (instance {{ $labels.instance }})
description:
"Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS =
{{ $labels }}"
- alert: HostOutOfDiskSpace
expr:
'((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance,
device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename)
node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description:
"Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostDiskWillFillIn24Hours
expr:
'((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance,
device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 *
3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) *
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
description:
"Filesystem is predicted to run out of space within the next 24 hours at current write
rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfInodes
expr:
'(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"}
* 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) *
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of inodes (instance {{ $labels.instance }})
description:
"Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value
}}\n LABELS = {{ $labels }}"
- alert: HostFilesystemDeviceError
expr: "node_filesystem_device_error == 1"
for: 0m
labels:
severity: critical
annotations:
summary: Host filesystem device error (instance {{ $labels.instance }})
description:
"{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }}
filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostInodesWillFillIn24Hours
expr:
'(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"}
* 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 *
3600) < 0 and ON (instance, device, mountpoint)
node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename)
node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
description:
"Filesystem is predicted to run out of inodes within the next 24 hours at current write
rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadLatency
expr:
'(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m])
> 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left
(nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk read latency (instance {{ $labels.instance }})
description:
"Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS =
{{ $labels }}"
- alert: HostUnusualDiskWriteLatency
expr:
'(rate(node_disk_write_time_seconds_total[1m]) /
rate(node_disk_writes_completed_total[1m]) > 0.1 and
rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename)
node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write latency (instance {{ $labels.instance }})
description:
"Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS =
{{ $labels }}"
- alert: HostHighCpuLoad
expr:
'(sum by (instance) (avg by (mode, instance)
(rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left
(nodename) node_uname_info{nodename=~".+"}'
for: 10m
labels:
severity: warning
annotations:
summary: Host high CPU load (instance {{ $labels.instance }})
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuIsUnderutilized
expr:
'(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance)
group_left (nodename) node_uname_info{nodename=~".+"}'
for: 1w
labels:
severity: info
annotations:
summary: Host CPU is underutilized (instance {{ $labels.instance }})
description:
"CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{
$value }}\n LABELS = {{ $labels }}"
- alert: HostCpuStealNoisyNeighbor
expr:
'(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) *
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
description:
"CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may
be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuHighIowait
expr:
'(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) *
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU high iowait (instance {{ $labels.instance }})
description:
"CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{
$value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskIo
expr:
'(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename)
node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk IO (instance {{ $labels.instance }})
description:
"Time spent in IO is too high on {{ $labels.instance }}. Check storage for
issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostContextSwitching
expr:
'((rate(node_context_switches_total[5m])) / (count without(cpu, mode)
(node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename)
node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host context switching (instance {{ $labels.instance }})
description:
"Context switching is growing on the node (> 10000 / CPU / s)\n VALUE = {{ $value
}}\n LABELS = {{ $labels }}"
- alert: HostSwapIsFillingUp
expr:
'((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) *
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host swap is filling up (instance {{ $labels.instance }})
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSystemdServiceCrashed
expr:
'(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename)
node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host systemd service crashed (instance {{ $labels.instance }})
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostPhysicalComponentTooHot
expr:
'((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor)
node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename)
node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host physical component too hot (instance {{ $labels.instance }})
description:
"Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNodeOvertemperatureAlarm
expr:
'(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename)
node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: critical
annotations:
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
description:
"Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{
$labels }}"
- alert: HostRaidArrayGotInactive
expr:
'(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename)
node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: critical
annotations:
summary: Host RAID array got inactive (instance {{ $labels.instance }})
description:
"RAID array {{ $labels.device }} is in a degraded state due to one or more disk
failures. The number of spare drives is insufficient to fix the issue
automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRaidDiskFailure
expr:
'(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename)
node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host RAID disk failure (instance {{ $labels.instance }})
description:
"At least one device in RAID array on {{ $labels.instance }} failed. Array {{
$labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value
}}\n LABELS = {{ $labels }}"
# Fires when the scraped hosts report more than one distinct kernel version
# (major.minor.patch extracted from node_uname_info's `release` label) for 6 hours.
# The count() result carries no labels, so it must not be joined with node_uname_info
# on `instance`: that join never matches and the alert could never fire. For the same
# reason the summary does not reference a per-instance label.
- alert: HostKernelVersionDeviations
  expr:
    'count(sum(label_replace(node_uname_info, "kernel", "$1", "release",
    "([0-9]+\\.[0-9]+\\.[0-9]+).*")) by (kernel)) > 1'
  for: 6h
  labels:
    severity: warning
  annotations:
    summary: Host kernel version deviations
    description:
      "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOomKillDetected
expr:
'(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename)
node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host OOM kill detected (instance {{ $labels.instance }})
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires as soon as the EDAC correctable-error counter increases within a 1-minute window.
# The description states the same window that the expression evaluates.
- alert: HostEdacCorrectableErrorsDetected
  expr:
    '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left
    (nodename) node_uname_info{nodename=~".+"}'
  for: 0m
  labels:
    severity: info
  annotations:
    summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
    description:
      "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory
      errors reported by EDAC in the last minute.\n VALUE = {{ $value }}\n LABELS = {{
      $labels }}"
# Fires whenever the EDAC uncorrectable-error counter is non-zero.
# The expression reads the raw cumulative counter (no rate/increase window), so the
# description reports a running total rather than a per-interval count.
- alert: HostEdacUncorrectableErrorsDetected
  expr:
    '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename)
    node_uname_info{nodename=~".+"}'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
    description:
      "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory
      errors reported by EDAC in total (cumulative counter).\n VALUE = {{ $value }}\n LABELS = {{
      $labels }}"
# Fires when more than 1% of received packets on an interface are errors
# (2-minute rates), sustained for 2 minutes.
# $value is the error/packet ratio, so it is rendered as a percentage rather than
# as an absolute error count.
- alert: HostNetworkReceiveErrors
  expr:
    '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m])
    > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host Network Receive Errors (instance {{ $labels.instance }})
    description:
      "Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio of
      {{ $value | humanizePercentage }} over the last two minutes.\n VALUE = {{ $value
      }}\n LABELS = {{ $labels }}"
# Fires when more than 1% of transmitted packets on an interface are errors
# (2-minute rates), sustained for 2 minutes.
# $value is the error/packet ratio, so it is rendered as a percentage rather than
# as an absolute error count.
- alert: HostNetworkTransmitErrors
  expr:
    '(rate(node_network_transmit_errs_total[2m]) /
    rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename)
    node_uname_info{nodename=~".+"}'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host Network Transmit Errors (instance {{ $labels.instance }})
    description:
      "Host {{ $labels.instance }} interface {{ $labels.device }} has a transmit error ratio of
      {{ $value | humanizePercentage }} over the last two minutes.\n VALUE = {{ $value
      }}\n LABELS = {{ $labels }}"
- alert: HostNetworkInterfaceSaturated
expr:
'((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) +
rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) /
node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) *
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 1m
labels:
severity: warning
annotations:
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
description:
"The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting
overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkBondDegraded
expr:
'((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename)
node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
description:
"Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{
$value }}\n LABELS = {{ $labels }}"
- alert: HostConntrackLimit
expr:
'(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance)
group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host conntrack limit (instance {{ $labels.instance }})
description:
"The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{
$labels }}"
- alert: HostClockSkew
expr:
'((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or
(node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) *
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 10m
labels:
severity: warning
annotations:
summary: Host clock skew (instance {{ $labels.instance }})
description:
"Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this
host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockNotSynchronising
expr:
'(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) *
on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host clock not synchronising (instance {{ $labels.instance }})
description:
"Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value
}}\n LABELS = {{ $labels }}"
- alert: HostRequiresReboot
expr:
'(node_reboot_required > 0) * on(instance) group_left (nodename)
node_uname_info{nodename=~".+"}'
for: 4h
labels:
severity: info
annotations:
summary: Host requires reboot (instance {{ $labels.instance }})
description:
"{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{
$labels }}"
@@ -1,157 +0,0 @@
{
config,
myvars,
...
}: {
# https://prometheus.io/docs/prometheus/latest/configuration/configuration/
services.prometheus = {
enable = true;
checkConfig = true;
listenAddress = "127.0.0.1";
port = 9090;
webExternalUrl = "http://prometheus.writefor.fun";
extraFlags = ["--storage.tsdb.retention.time=45d"];
# Directory below /var/lib to store Prometheus metrics data.
stateDir = "prometheus2";
# Reload prometheus when configuration file changes (instead of restart).
enableReload = true;
# https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_read
# remoteRead = [];
# Rules are read from these files.
# https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/
#
# Prometheus supports two types of rules which may be configured
# and then evaluated at regular intervals:
# 1. Recording rules
# Recording rules allow you to precompute frequently needed or computationally
# expensive expressions and save their result as a new set of time series.
# Querying the precomputed result will then often be much faster than executing the original expression.
# This is especially useful for dashboards, which need to query the same expression repeatedly every time they refresh.
# 2. Alerting rules
# Alerting rules allow you to define alert conditions based on Prometheus expression language expressions
# and to send notifications about firing alerts to an external service.
ruleFiles = [
./alert_rules/node-exporter.yml
./alert_rules/kubestate-exporter.yml
./alert_rules/etcd_embedded-exporter.yml
./alert_rules/istio_embedded-exporter.yml
./alert_rules/coredns_embedded-exporter.yml
# ./recording_rules.yml
];
# specifies a set of targets and parameters describing how to scrape metrics from them.
# https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config
scrapeConfigs = [
# --- Hosts --- #
{
job_name = "node-exporter";
scrape_interval = "30s";
metrics_path = "/metrics";
static_configs = [
{
# All my NixOS hosts.
targets =
map (addr: "${addr.ipv4}:9100")
(builtins.attrValues myvars.networking.hostsAddr);
labels.type = "node";
}
];
}
# --- Homelab Applications --- #
{
job_name = "dnsmasq-exporter";
scrape_interval = "30s";
metrics_path = "/metrics";
static_configs = [
{
targets = ["${myvars.networking.hostsAddr.suzi.ipv4}:9153"];
labels.type = "app";
labels.app = "dnsmasq";
}
];
}
{
job_name = "v2ray-exporter";
scrape_interval = "30s";
metrics_path = "/metrics";
static_configs = [
{
targets = ["${myvars.networking.hostsAddr.rakushun.ipv4}:9153"];
labels.type = "app";
labels.app = "v2ray";
}
];
}
# Scrape job for the sftpgo metrics endpoint on rakushun (port 10000, /metrics).
# The `app` label must identify sftpgo; it was previously copy-pasted as "v2ray",
# which made these series indistinguishable from the v2ray-exporter job by label.
{
  job_name = "sftpgo-embedded-exporter";
  scrape_interval = "30s";
  metrics_path = "/metrics";
  static_configs = [
    {
      targets = ["${myvars.networking.hostsAddr.rakushun.ipv4}:10000"];
      labels.type = "app";
      labels.app = "sftpgo";
    }
  ];
}
];
# specifies Alertmanager instances the Prometheus server sends alerts to
# https://prometheus.io/docs/prometheus/latest/configuration/configuration/#alertmanager_config
alertmanagers = [{static_configs = [{targets = ["localhost:9093"];}];}];
};
services.prometheus.alertmanager = {
enable = true;
listenAddress = "127.0.0.1";
port = 9093;
webExternalUrl = "http://alertmanager.writefor.fun";
logLevel = "info";
environmentFile = config.age.secrets."alertmanager.env".path;
configuration = {
global = {
# The smarthost and SMTP sender used for mail notifications.
smtp_smarthost = "smtp.qq.com:465";
smtp_from = "$SMTP_SENDER_EMAIL";
smtp_auth_username = "$SMTP_AUTH_USERNAME";
smtp_auth_password = "$SMTP_AUTH_PASSWORD";
# smtp.qq.com:465 only supports implicit SSL/TLS (no STARTTLS), so the STARTTLS requirement must be disabled here.
# https://service.mail.qq.com/detail/0/310
smtp_require_tls = false;
};
# Alertmanager routing tree: the root route sends everything to the "default" receiver,
# and its single child route (which declares no matchers) sets grouping and timing.
route = {
receiver = "default";
routes = [
{
# NOTE(review): none of the alert rules visible in this file set a `host` label
# (they carry `instance` / `nodename`) — confirm this is the intended grouping key.
group_by = ["host"];
# Wait 5m before sending the first notification for a new group.
group_wait = "5m";
# Wait 5m before notifying about new alerts added to an already-notified group.
group_interval = "5m";
# Re-send a still-firing notification every 4h.
repeat_interval = "4h";
receiver = "default";
}
];
};
receivers = [
{
name = "default";
email_configs = [
{
to = "ryan4yin@linux.com";
# Whether to notify about resolved alerts.
send_resolved = true;
}
];
}
];
};
};
}
-79
View File
@@ -1,79 +0,0 @@
{pkgs, ...}: let
passwordFile = "/etc/agenix/restic-password";
sshKeyPath = "/etc/agenix/ssh-key-for-restic-backup";
rcloneConfigFile = "/etc/agenix/rclone-conf-for-restic-backup";
in {
# https://github.com/NixOS/nixpkgs/blob/nixos-24.05/nixos/modules/services/backup/restic.nix
services.restic.backups = {
homelab-backup = {
inherit passwordFile;
initialize = true; # Initialize the repository if it doesn't exist.
repository = "rclone:smb-downloads:/Downloads/kubevirt-backup/"; # backup to a rclone remote
# rclone related
# rcloneOptions = {
# bwlimit = "100M"; # Limit the bandwidth used by rclone.
# };
inherit rcloneConfigFile;
# Which local paths to backup, in addition to ones specified via `dynamicFilesFrom`.
paths = [
"/tmp/restic-backup-temp"
];
#
# A script that produces a list of files to back up. The
# results of this command are given to the '--files-from'
# option. The result is merged with paths specified via `paths`.
# dynamicFilesFrom = "find /home/matt/git -type d -name .git";
#
# Patterns to exclude when backing up. See
# https://restic.readthedocs.io/en/latest/040_backup.html#excluding-files
# for details on syntax.
exclude = [];
# A script that must run before starting the backup process.
# It pulls each KubeVirt node's persistent directory over SSH (rsync) into the
# staging directory listed in `paths`, one sub-directory per node.
#
# The staging directory is created explicitly first: rsync only creates the last
# component of the destination path, and backupCleanupCommand removes the staging
# directory after every run.
# The rsync invocation is kept on a single line so it does not depend on
# line-continuation handling inside the nushell script.
# NOTE(review): the remote path was spelled "/perissitent/"; corrected to
# "/persistent/" — confirm this matches the directory on the KubeVirt nodes.
backupPrepareCommand = ''
  ${pkgs.nushell}/bin/nu -c '
    let kubevirt_nodes = [
      "kubevirt-shoryu"
      "kubevirt-shushou"
      "kubevirt-youko"
    ]
    mkdir /tmp/restic-backup-temp
    kubevirt_nodes | each {|it|
      rsync -avz -e "ssh -i ${sshKeyPath}" $"($it):/persistent/" $"/tmp/restic-backup-temp/($it)"
    }
  '
'';
# A script that must run after finishing the backup process.
backupCleanupCommand = "rm -rf /tmp/restic-backup-temp";
# Extra extended options to be passed to the restic --option flag.
# extraOptions = [];
# Extra arguments passed to restic backup.
# extraBackupArgs = [
# "--exclude-file=/etc/restic/excludes-list"
# ];
# repository = "/mnt/backup-hdd"; # backup to a local directory
# When to run the backup. See {manpage}`systemd.timer(5)` for details.
timerConfig = {
OnCalendar = "01:30";
RandomizedDelaySec = "1h";
};
# A list of options (--keep-* et al.) for 'restic forget --prune',
# to automatically prune old snapshots.
# The 'forget' command is run *after* the 'backup' command, so
# keep that in mind when constructing the --keep-* options.
pruneOpts = [
"--keep-daily 3"
"--keep-weekly 3"
"--keep-monthly 3"
"--keep-yearly 3"
];
};
};
}
-97
View File
@@ -1,97 +0,0 @@
# NixOS module that enables SFTPGo with its WebDAV endpoint and web UI bound to
# loopback ports, plus a telemetry (metrics) listener.
{config, ...}: {
  # Read SFTPGO_DEFAULT_ADMIN_USERNAME and SFTPGO_DEFAULT_ADMIN_PASSWORD from a file
  # (the path comes from the `sftpgo.env` entry of `config.age.secrets`).
  systemd.services.sftpgo.serviceConfig.EnvironmentFile = config.age.secrets."sftpgo.env".path;

  services.sftpgo = {
    enable = true;
    user = "sftpgo";
    dataDir = "/var/lib/sftpgo";
    extraArgs = [
      "--log-level"
      "info"
    ];
    # https://github.com/drakkan/sftpgo/blob/2.5.x/docs/full-configuration.md
    settings = {
      common = {
        # Auto-blocking policy for SFTPGo; helps to prevent DoS (Denial of Service)
        # and brute force password guessing.
        defender = {
          # The SFTPGo key is `enabled`; an `enable` key is not read by SFTPGo,
          # which leaves the defender at its default (disabled).
          enabled = true;
        };
      };
      # Where to store sftpgo's data
      data_provider = {
        driver = "sqlite";
        name = "sftpgo.db";
        password_hashing = {
          algo = "argon2id";
          # options for argon2id hashing algorithm.
          # The memory and iterations parameters control the computational cost of hashing the password.
          argon2_options = {
            memory = 65536; # KiB
            iterations = 2; # The number of iterations over the memory.
            parallelism = 2; # The number of threads (or lanes) used by the algorithm.
          };
        };
        password_validation = {
          # What Entropy Value Should I Use?
          # somewhere in the 50-70 range seems "reasonable".
          # https://github.com/wagslane/go-password-validator#what-entropy-value-should-i-use
          admins.min_entropy = 60;
          users.min_entropy = 60;
        };
        # Cache passwords in memory to avoid hashing the same password multiple times (hashing is costly).
        password_caching = true;
        # create the default admin user via environment variables
        # SFTPGO_DEFAULT_ADMIN_USERNAME and SFTPGO_DEFAULT_ADMIN_PASSWORD
        create_default_admin = true;
      };
      # WebDAV is a popular protocol for file sharing, better than CIFS/SMB, NFS, etc.
      # It's safe to use WebDAV over HTTPS on public networks.
      webdavd.bindings = [
        {
          address = "127.0.0.1";
          port = 3303;
          # This binding only listens on loopback, so every request arrives from
          # 127.0.0.1. Take the real client address from the proxy header so the
          # defender counts failures per client instead of against 127.0.0.1.
          # The header is honoured only for peers listed in `proxy_allowed`.
          # NOTE(review): presumes the front-end proxy sets X-Forwarded-For — confirm.
          client_ip_proxy_header = "X-Forwarded-For";
          proxy_allowed = ["127.0.0.1"];
        }
      ];
      # HTTP Server provides a simple web interface to manage the server.
      httpd.bindings = [
        {
          address = "127.0.0.1";
          enable_https = false;
          port = 3302;
          client_ip_proxy_header = "X-Forwarded-For";
          # SFTPGo silently ignores `client_ip_proxy_header` unless the connecting
          # address is listed here; the only possible peer of a loopback binding
          # is 127.0.0.1.
          proxy_allowed = ["127.0.0.1"];
          # a basic built-in web interface that allows you to manage users,
          # virtual folders, admins and connections.
          # url (with this binding): http://127.0.0.1:3302/web/admin
          enable_web_admin = true;
          # A basic front-end web interface for your users.
          # It allows end-users to browse and manage their files and change their credentials.
          enable_web_client = true;
          enable_rest_api = true;
        }
      ];
      # prometheus metrics
      telemetry = {
        bind_port = 10000;
        # NOTE(review): listens on all interfaces and no auth_user_file is set;
        # confirm the port is restricted by the firewall.
        bind_address = "0.0.0.0";
        # auth_user_file = "";
      };
      # multi-factor authentication settings
      mfa.totp = [
        {
          # Unique configuration name, not visible to the authentication apps.
          # Should not be changed after the first user has been created.
          name = "SFTPGo";
          # Name of the issuing Organization/Company
          issuer = "SFTPGo";
          # Algorithm to use for HMAC
          # Currently Google Authenticator app on iPhone seems to only support sha1
          algo = "sha1";
        }
      ];
      # SMTP configuration enables SFTPGo email sending capabilities
      # smtp = {};
    };
  };
}
-117
View File
@@ -1,117 +0,0 @@
# NixOS module that enables the headless Transmission BitTorrent daemon with
# its RPC (web) interface bound to loopback.
{
  config,
  myvars,
  ...
}: let
  # Home directory of the daemon; the download/incomplete/watch directories
  # below are all derived from it.
  dataDir = "/var/lib/transmission";
  # Used for both the service user and the service group.
  name = "transmission";
in {
  # the headless Transmission BitTorrent daemon
  # https://github.com/NixOS/nixpkgs/blob/nixos-24.05/nixos/modules/services/torrent/transmission.nix
  # https://wiki.archlinux.org/title/transmission
  services.transmission = {
    enable = true;
    user = name;
    group = name;
    home = dataDir;
    downloadDirPermissions = "0770";
    # Whether to enable tweaking of kernel parameters to open many more connections at the same time.
    # Note that you may also want to increase peer-limit-global.
    # And be aware that these settings are quite aggressive and might not suit your regular desktop use.
    # For instance, SSH sessions may time out more easily.
    performanceNetParameters = true;
    # Path to a JSON file to be merged with the settings.
    # Useful to merge a file which is better kept out of the Nix store to set secret config parameters like `rpc-password`.
    credentialsFile = config.age.secrets."transmission-credentials.json".path;
    # Whether to open the RPC port in the firewall.
    openRPCPort = false;
    openPeerPorts = true;
    # https://github.com/transmission/transmission/blob/main/docs/Editing-Configuration-Files.md
    settings = {
      # 0 = None, 1 = Critical, 2 = Error, 3 = Warn, 4 = Info, 5 = Debug, 6 = Trace;
      message-level = 3;
      # Encryption may help get around some ISP filtering,
      # but at the cost of slightly higher CPU use.
      # 0 = Prefer unencrypted connections,
      # 1 = Prefer encrypted connections,
      # 2 = Require encrypted connections; default = 1)
      encryption = 2;
      # rpc = Web Interface
      rpc-port = 9091;
      rpc-bind-address = "127.0.0.1";
      anti-brute-force-enabled = true;
      # After this amount of failed authentication attempts is surpassed,
      # the RPC server will deny any further authentication attempts until it is restarted.
      # This is not tracked per IP but in total.
      anti-brute-force-threshold = 20;
      rpc-authentication-required = true;
      # Comma-delimited list of IP addresses.
      # Wildcards allowed using '*'. Example: "127.0.0.*,192.168.*.*",
      rpc-whitelist-enabled = true;
      rpc-whitelist = "127.0.0.*,192.168.*.*";
      # Comma-delimited list of domain names.
      # Wildcards allowed using '*'. Example: "*.foo.org,example.com",
      rpc-host-whitelist-enabled = true;
      rpc-host-whitelist = "*.writefor.fun,localhost,192.168.5.*";
      # `rpc-username` is the key Transmission documents for the RPC login
      # name; there is no `rpc-user` setting, so only this one is set.
      rpc-username = myvars.username;
      # rpc-password = "test"; # you'd better use the credentialsFile for this.
      incomplete-dir-enabled = true;
      incomplete-dir = "${dataDir}/incomplete";
      download-dir = "${dataDir}/downloads";
      # Watch a directory for torrent files and add them to transmission.
      watch-dir-enabled = false;
      watch-dir = "${dataDir}/watch";
      # Whether to enable Micro Transport Protocol (µTP).
      utp-enabled = true;
      # Executable to be run at torrent completion.
      script-torrent-done-enabled = false;
      # script-torrent-done-filename = "/path/to/script";
      # Enable Local Peer Discovery (LPD).
      lpd-enabled = true;
      # The peer port to listen for incoming connections.
      peer-port = 51413;
      # Enable UPnP or NAT-PMP to forward a port through your firewall (NAT).
      # https://github.com/transmission/transmission/blob/main/docs/Port-Forwarding-Guide.md
      port-forwarding-enabled = true;
      # "normal" speed limits
      speed-limit-down-enabled = true;
      speed-limit-down = 30000; # KB/s
      speed-limit-up-enabled = true;
      speed-limit-up = 500; # KB/s
      upload-slots-per-torrent = 8;
      # Start torrents as soon as they are added
      start-added-torrents = true;
      # Queuing
      # When true, Transmission will only download
      # download-queue-size non-stalled torrents at once.
      download-queue-enabled = true;
      download-queue-size = 5;
      # When true, torrents that have not shared data for
      # queue-stalled-minutes are treated as 'stalled'
      # and are not counted against the download-queue-size
      # and seed-queue-size limits.
      queue-stalled-enabled = true;
      queue-stalled-minutes = 60;
      # When true, Transmission will only seed seed-queue-size
      # non-stalled torrents at once.
      seed-queue-enabled = true;
      seed-queue-size = 10;
    };
  };
}
-12
View File
@@ -1,12 +0,0 @@
# NixOS module that turns on the Uptime Kuma monitoring service and pins its
# listen address, port and data directory through environment settings.
{
  # Module reference:
  # https://github.com/NixOS/nixpkgs/blob/nixos-24.05/nixos/modules/services/monitoring/uptime-kuma.nix
  services.uptime-kuma.enable = true;

  # Values passed to Uptime Kuma as environment variables, see
  # https://github.com/louislam/uptime-kuma/wiki/Environment-Variables
  services.uptime-kuma.settings = {
    # Storage location for the service's data.
    DATA_DIR = "/var/lib/uptime-kuma/";
    # Listen on loopback only.
    UPTIME_KUMA_HOST = "127.0.0.1";
    UPTIME_KUMA_PORT = "3350";
  };
}