mirror of
https://github.com/ryan4yin/nix-config.git
synced 2026-03-24 01:41:00 +01:00
feat: migrate from prometheus to victoriametrics (#171)
* feat: migrate from prometheus to victoriametrics * fix: duplicated declaration * fix: victoriametrics - isSystemUser * fix: import promTypes * fix: vmalert * fix(victoriametrics): cli args
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
# Prometheus & Alertmanager
|
||||
# Monitoring & Alerting
|
||||
|
||||
## Alert Rules
|
||||
|
||||
47
hosts/idols-aquamarine/monitoring/alertmanager.nix
Normal file
47
hosts/idols-aquamarine/monitoring/alertmanager.nix
Normal file
@@ -0,0 +1,47 @@
|
||||
# Alertmanager: receives alerts on 127.0.0.1:9093 and forwards them by e-mail.
{config, ...}: let
  # Name of the only receiver; used by the root route, its child route and
  # the receiver definition itself.
  defaultReceiver = "default";
in {
  services.prometheus.alertmanager = {
    enable = true;
    logLevel = "info";

    listenAddress = "127.0.0.1";
    port = 9093;
    webExternalUrl = "http://alertmanager.writefor.fun";

    # Environment file taken from the "alertmanager.env" age secret.
    # NOTE(review): the `$SMTP_*` placeholders below are presumably filled in
    # from this file -- confirm against the secret's contents.
    environmentFile = config.age.secrets."alertmanager.env".path;

    configuration = {
      # The smarthost and SMTP sender used for mail notifications.
      global = {
        smtp_smarthost = "smtp.qq.com:465";
        smtp_from = "$SMTP_SENDER_EMAIL";
        smtp_auth_username = "$SMTP_AUTH_USERNAME";
        smtp_auth_password = "$SMTP_AUTH_PASSWORD";
        # smtp.qq.com:465 only supports SSL, so the TLS requirement is
        # disabled here.
        # https://service.mail.qq.com/detail/0/310
        smtp_require_tls = false;
      };

      route = {
        receiver = defaultReceiver;
        routes = [
          {
            receiver = defaultReceiver;
            # Alerts are grouped per `host` label.
            group_by = ["host"];
            group_wait = "5m";
            group_interval = "5m";
            repeat_interval = "4h";
          }
        ];
      };

      receivers = [
        {
          name = defaultReceiver;
          email_configs = [
            {
              to = "ryan4yin@linux.com";
              # Whether to notify about resolved alerts.
              send_resolved = true;
            }
          ];
        }
      ];
    };
  };
}
|
||||
7
hosts/idols-aquamarine/monitoring/default.nix
Normal file
7
hosts/idols-aquamarine/monitoring/default.nix
Normal file
@@ -0,0 +1,7 @@
|
||||
# Entry point of the monitoring stack: pulls in the local VictoriaMetrics
# module (./module), its host-specific settings and the Alertmanager settings.
# No module argument is used here, so none is requested by name.
{...}: {
  imports = [
    ./module
    ./victoriametrics.nix
    ./alertmanager.nix
  ];
}
|
||||
195
hosts/idols-aquamarine/monitoring/module/default.nix
Normal file
195
hosts/idols-aquamarine/monitoring/module/default.nix
Normal file
@@ -0,0 +1,195 @@
|
||||
# Based on
|
||||
# - https://github.com/NixOS/nixpkgs/blob/nixos-24.05/nixos/modules/services/databases/victoriametrics.nix
|
||||
# - https://github.com/NixOS/nixpkgs/blob/nixos-24.05/nixos/modules/services/monitoring/prometheus/default.nix
|
||||
{
|
||||
config,
|
||||
pkgs,
|
||||
lib,
|
||||
...
|
||||
}:
|
||||
with lib; let
|
||||
cfg = config.services.my-victoriametrics;
|
||||
yaml = pkgs.formats.yaml {};
|
||||
|
||||
promTypes = import ./promTypes.nix {inherit lib;};
|
||||
|
||||
bindAddr = "${cfg.listenAddress}:${builtins.toString cfg.port}";
|
||||
workingDir = "/var/lib/" + cfg.stateDir;
|
||||
|
||||
generatedPrometheusYml = yaml.generate "prometheus.yml" scrapeConfig;
|
||||
|
||||
# This becomes the main config file for VictoriaMetrics's `-promscrape.config`
|
||||
# https://docs.victoriametrics.com/vmagent/#how-to-collect-metrics-in-prometheus-format
|
||||
scrapeConfig = {
|
||||
global = filterValidPrometheus cfg.globalConfig;
|
||||
scrape_configs = filterValidPrometheus cfg.scrapeConfigs;
|
||||
};
|
||||
|
||||
filterValidPrometheus = filterAttrsListRecursive (n: v: !(n == "_module" || v == null));
|
||||
# Recursively walk `x` through nested attribute sets and lists, keeping only
# the attributes for which `pred name value` is true. Kept values are filtered
# in turn; anything that is neither an attribute set nor a list is returned
# unchanged.
filterAttrsListRecursive = pred: x: let
  recurse = filterAttrsListRecursive pred;
in
  if isList x
  then map recurse x
  else if isAttrs x
  then mapAttrs (_: recurse) (filterAttrs pred x)
  else x;
|
||||
in {
|
||||
options.services.my-victoriametrics = {
|
||||
# Text passed to mkEnableOption for the generated `enable` option description.
enable = mkEnableOption "VictoriaMetrics, a time series database, long-term remote storage for Prometheus";
|
||||
package = mkPackageOption pkgs "victoriametrics" {};
|
||||
|
||||
port = mkOption {
|
||||
type = types.port;
|
||||
default = 8428;
|
||||
description = ''
|
||||
Port to listen on.
|
||||
'';
|
||||
};
|
||||
|
||||
listenAddress = mkOption {
|
||||
type = types.str;
|
||||
default = "0.0.0.0";
|
||||
description = ''
|
||||
Address to listen on for the http API.
|
||||
'';
|
||||
};
|
||||
|
||||
stateDir = mkOption {
|
||||
type = types.str;
|
||||
default = "victoriametrics2";
|
||||
description = ''
|
||||
Directory below `/var/lib` to store VictoriaMetrics metrics data.
|
||||
This directory will be created automatically using systemd's StateDirectory mechanism.
|
||||
'';
|
||||
};
|
||||
|
||||
retentionTime = mkOption {
|
||||
type = types.nullOr types.str;
|
||||
default = null;
|
||||
example = "15d";
|
||||
description = ''
|
||||
How long to retain samples in storage.
|
||||
The minimum retentionPeriod is 24h or 1d. See also -retentionFilter
|
||||
The following optional suffixes are supported: s (second), h (hour), d (day), w (week), y (year).
|
||||
If suffix isn't set, then the duration is counted in months (default 1)
|
||||
'';
|
||||
};
|
||||
|
||||
globalConfig = mkOption {
|
||||
type = promTypes.globalConfig;
|
||||
default = {};
|
||||
description = ''
|
||||
Parameters that are valid in all configuration contexts. They
|
||||
also serve as defaults for other configuration sections
|
||||
'';
|
||||
};
|
||||
|
||||
scrapeConfigs = mkOption {
|
||||
type = types.listOf promTypes.scrape_config;
|
||||
default = [];
|
||||
description = ''
|
||||
A list of scrape configurations.
|
||||
See docs: <https://docs.victoriametrics.com/vmagent/#how-to-collect-metrics-in-prometheus-format>
|
||||
'';
|
||||
};
|
||||
|
||||
extraFlags = mkOption {
|
||||
type = types.listOf types.str;
|
||||
default = [];
|
||||
description = ''
|
||||
Extra options to pass to VictoriaMetrics. See the README:
|
||||
<https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md>
|
||||
or {command}`victoriametrics -help` for more
|
||||
information.
|
||||
'';
|
||||
};
|
||||
};
|
||||
config = lib.mkIf cfg.enable {
|
||||
users.groups.victoriametrics = {};
|
||||
users.users.victoriametrics = {
|
||||
description = "victoriametrics daemon user";
|
||||
isSystemUser = true; # required when uid is null
|
||||
group = "victoriametrics";
|
||||
};
|
||||
|
||||
systemd.services.my-victoriametrics = {
|
||||
description = "VictoriaMetrics time series database";
|
||||
wantedBy = ["multi-user.target"];
|
||||
after = ["network.target"];
|
||||
|
||||
startLimitBurst = 5;
|
||||
serviceConfig = {
|
||||
# Command line of the VictoriaMetrics server.
# `-retentionPeriod` is emitted only when `retentionTime` is set: the option
# is `nullOr str` with a null default, and interpolating null into a string
# aborts evaluation. When it is unset the line collapses to a bare
# continuation and VictoriaMetrics falls back to its own default retention.
ExecStart = ''
  ${cfg.package}/bin/victoria-metrics \
    -storageDataPath=${workingDir} \
    -httpListenAddr=${bindAddr} \
    ${lib.optionalString (cfg.retentionTime != null) "-retentionPeriod=${cfg.retentionTime}"} \
    -promscrape.config=${generatedPrometheusYml} \
    ${lib.escapeShellArgs cfg.extraFlags}
'';
|
||||
RestartSec = 1;
|
||||
User = "victoriametrics";
|
||||
Restart = "on-failure";
|
||||
RuntimeDirectory = "victoriametrics";
|
||||
RuntimeDirectoryMode = "0700";
|
||||
WorkingDirectory = workingDir;
|
||||
StateDirectory = cfg.stateDir;
|
||||
StateDirectoryMode = "0700";
|
||||
|
||||
# Increase the limit to avoid errors like 'too many open files' when merging small parts
|
||||
LimitNOFILE = 1048576;
|
||||
|
||||
# Hardening
|
||||
AmbientCapabilities = lib.mkIf (cfg.port < 1024) ["CAP_NET_BIND_SERVICE"];
|
||||
CapabilityBoundingSet =
|
||||
if (cfg.port < 1024)
|
||||
then ["CAP_NET_BIND_SERVICE"]
|
||||
else [""];
|
||||
DeviceAllow = ["/dev/null rw"];
|
||||
DevicePolicy = "strict";
|
||||
LockPersonality = true;
|
||||
MemoryDenyWriteExecute = true;
|
||||
NoNewPrivileges = true;
|
||||
PrivateDevices = true;
|
||||
PrivateTmp = true;
|
||||
PrivateUsers = true;
|
||||
ProtectClock = true;
|
||||
ProtectControlGroups = true;
|
||||
ProtectHome = true;
|
||||
ProtectHostname = true;
|
||||
ProtectKernelLogs = true;
|
||||
ProtectKernelModules = true;
|
||||
ProtectKernelTunables = true;
|
||||
ProtectProc = "invisible";
|
||||
ProtectSystem = "full";
|
||||
RemoveIPC = true;
|
||||
RestrictAddressFamilies = ["AF_INET" "AF_INET6" "AF_UNIX"];
|
||||
RestrictNamespaces = true;
|
||||
RestrictRealtime = true;
|
||||
RestrictSUIDSGID = true;
|
||||
SystemCallArchitectures = "native";
|
||||
SystemCallFilter = ["@system-service" "~@privileged"];
|
||||
};
|
||||
|
||||
postStart = lib.mkBefore ''
|
||||
until ${lib.getBin pkgs.curl}/bin/curl -s -o /dev/null http://${bindAddr}/ping; do
|
||||
sleep 1;
|
||||
done
|
||||
'';
|
||||
};
|
||||
};
|
||||
}
|
||||
1334
hosts/idols-aquamarine/monitoring/module/promTypes.nix
Normal file
1334
hosts/idols-aquamarine/monitoring/module/promTypes.nix
Normal file
File diff suppressed because it is too large
Load Diff
132
hosts/idols-aquamarine/monitoring/victoriametrics.nix
Normal file
132
hosts/idols-aquamarine/monitoring/victoriametrics.nix
Normal file
@@ -0,0 +1,132 @@
|
||||
{
|
||||
lib,
|
||||
myvars,
|
||||
...
|
||||
}: {
|
||||
# Workaround for victoriametrics to store data in another place
# https://www.freedesktop.org/software/systemd/man/latest/tmpfiles.d.html#Type
systemd.tmpfiles.rules = [
  # Type `d` (create if missing), not `D`: per tmpfiles.d(5), `D` additionally
  # removes the directory's contents whenever systemd-tmpfiles runs with
  # `--remove`, which would wipe the stored metrics.
  "d /data/apps/victoriametrics 0751 victoriametrics victoriametrics - -"
  # `L+` (re)creates the symlink, replacing whatever already exists at
  # /var/lib/victoriametrics.
  "L+ /var/lib/victoriametrics - - - - /data/apps/victoriametrics"
];
|
||||
|
||||
# https://victoriametrics.io/docs/victoriametrics/latest/configuration/configuration/
|
||||
services.my-victoriametrics = {
|
||||
enable = true;
|
||||
listenAddress = "127.0.0.1";
|
||||
port = 9090;
|
||||
retentionTime = "30d";
|
||||
|
||||
extraFlags = [
|
||||
# Allowed percent of system memory VictoriaMetrics caches may occupy.
|
||||
"-memory.allowedPercent=50"
|
||||
];
|
||||
# Directory below /var/lib to store victoriametrics metrics data.
|
||||
stateDir = "victoriametrics";
|
||||
|
||||
# specifies a set of targets and parameters describing how to scrape metrics from them.
|
||||
# https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config
|
||||
scrapeConfigs = let
  # Build one scrape job with a single static target, scraped every 30s
  # from /metrics, carrying the given label set.
  mkJob = name: target: labels: {
    job_name = name;
    scrape_interval = "30s";
    metrics_path = "/metrics";
    static_configs = [
      {
        targets = [target];
        inherit labels;
      }
    ];
  };

  # Job for an application exporter: the target is the IPv4 address of
  # `host` (looked up in myvars.networking.hostsAddr) plus `port`.
  mkAppJob = {
    job,
    app,
    host,
    port,
  }:
    mkJob job "${myvars.networking.hostsAddr.${host}.ipv4}:${toString port}" {
      type = "app";
      inherit app host;
    };

  # --- Homelab Applications --- #
  appJobs = map mkAppJob [
    {
      job = "dnsmasq-exporter";
      app = "dnsmasq";
      host = "suzi";
      port = 9153;
    }
    {
      job = "v2ray-exporter";
      app = "v2ray";
      host = "aquamarine";
      port = 9153;
    }
    {
      job = "postgres-exporter";
      app = "postgresql";
      host = "aquamarine";
      port = 9187;
    }
    {
      job = "sftpgo-embedded-exporter";
      app = "sftpgo";
      host = "aquamarine";
      port = 10000;
    }
  ];

  # --- Hosts --- #
  # One node-exporter job (port 9100) per entry of
  # myvars.networking.hostsAddr, i.e. all my NixOS hosts.
  nodeJobs =
    lib.mapAttrsToList
    (hostname: addr:
      mkJob "node-exporter-${hostname}" "${addr.ipv4}:9100" {
        type = "node";
        host = hostname;
      })
    myvars.networking.hostsAddr;
in
  appJobs ++ nodeJobs;
|
||||
};
|
||||
|
||||
services.vmalert = {
|
||||
enable = true;
|
||||
settings = {
|
||||
"datasource.url" = "http://localhost:9090";
|
||||
"notifier.url" = ["http://localhost:9093"]; # alertmanager's api
|
||||
|
||||
# Whether to disable long-lived connections to the datasource.
|
||||
"datasource.disableKeepAlive" = true;
|
||||
# Whether to avoid stripping sensitive information such as auth headers or passwords
|
||||
# from URLs in log messages or UI and exported metrics.
|
||||
"datasource.showURL" = false;
|
||||
rule = [
|
||||
./alert_rules/node-exporter.yml
|
||||
./alert_rules/kubestate-exporter.yml
|
||||
./alert_rules/etcd_embedded-exporter.yml
|
||||
./alert_rules/istio_embedded-exporter.yml
|
||||
./alert_rules/coredns_embedded-exporter.yml
|
||||
];
|
||||
};
|
||||
};
|
||||
}
|
||||
@@ -1,13 +0,0 @@
|
||||
groups:
|
||||
- name: EmbeddedExporter
|
||||
|
||||
rules:
|
||||
- alert: CorednsPanicCount
|
||||
expr: "increase(coredns_panics_total[1m]) > 0"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: CoreDNS Panic Count (instance {{ $labels.instance }})
|
||||
description:
|
||||
"Number of CoreDNS panics encountered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
@@ -1,192 +0,0 @@
|
||||
{
|
||||
lib,
|
||||
config,
|
||||
myvars,
|
||||
...
|
||||
}: {
|
||||
# Workaround for prometheus to store data in another place
|
||||
# https://www.freedesktop.org/software/systemd/man/latest/tmpfiles.d.html#Type
|
||||
systemd.tmpfiles.rules = [
|
||||
"D /data/apps/prometheus2 0751 prometheus prometheus - -"
|
||||
"L+ /var/lib/prometheus2 - - - - /data/apps/prometheus2"
|
||||
];
|
||||
|
||||
# https://prometheus.io/docs/prometheus/latest/configuration/configuration/
|
||||
services.prometheus = {
|
||||
enable = true;
|
||||
checkConfig = true;
|
||||
listenAddress = "127.0.0.1";
|
||||
port = 9090;
|
||||
webExternalUrl = "http://prometheus.writefor.fun";
|
||||
|
||||
extraFlags = [
|
||||
"--storage.tsdb.retention.time=30d"
|
||||
# https://prometheus.io/docs/prometheus/latest/storage/#remote-storage-integrations
|
||||
"--web.enable-remote-write-receiver"
|
||||
];
|
||||
# Directory below /var/lib to store Prometheus metrics data.
|
||||
stateDir = "prometheus2";
|
||||
|
||||
# Reload prometheus when configuration file changes (instead of restart).
|
||||
enableReload = true;
|
||||
# https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_read
|
||||
# remoteRead = [];
|
||||
|
||||
# Rules are read from these files.
|
||||
# https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/
|
||||
#
|
||||
# Prometheus supports two types of rules which may be configured
|
||||
# and then evaluated at regular intervals:
|
||||
# 1. Recording rules
|
||||
# Recording rules allow you to precompute frequently needed or computationally
|
||||
# expensive expressions and save their result as a new set of time series.
|
||||
# Querying the precomputed result will then often be much faster than executing the original expression.
|
||||
# This is especially useful for dashboards, which need to query the same expression repeatedly every time they refresh.
|
||||
# 2. Alerting rules
|
||||
# Alerting rules allow you to define alert conditions based on Prometheus expression language expressions
|
||||
# and to send notifications about firing alerts to an external service.
|
||||
ruleFiles = [
|
||||
./alert_rules/node-exporter.yml
|
||||
./alert_rules/kubestate-exporter.yml
|
||||
./alert_rules/etcd_embedded-exporter.yml
|
||||
./alert_rules/istio_embedded-exporter.yml
|
||||
./alert_rules/coredns_embedded-exporter.yml
|
||||
|
||||
# ./recording_rules.yml
|
||||
];
|
||||
|
||||
# specifies a set of targets and parameters describing how to scrape metrics from them.
|
||||
# https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config
|
||||
scrapeConfigs =
|
||||
[
|
||||
# --- Homelab Applications --- #
|
||||
|
||||
{
|
||||
job_name = "dnsmasq-exporter";
|
||||
scrape_interval = "30s";
|
||||
metrics_path = "/metrics";
|
||||
static_configs = [
|
||||
{
|
||||
targets = ["${myvars.networking.hostsAddr.suzi.ipv4}:9153"];
|
||||
labels.type = "app";
|
||||
labels.app = "dnsmasq";
|
||||
labels.host = "suzi";
|
||||
}
|
||||
];
|
||||
}
|
||||
|
||||
{
|
||||
job_name = "v2ray-exporter";
|
||||
scrape_interval = "30s";
|
||||
metrics_path = "/metrics";
|
||||
static_configs = [
|
||||
{
|
||||
targets = ["${myvars.networking.hostsAddr.aquamarine.ipv4}:9153"];
|
||||
labels.type = "app";
|
||||
labels.app = "v2ray";
|
||||
labels.host = "aquamarine";
|
||||
}
|
||||
];
|
||||
}
|
||||
{
|
||||
job_name = "postgres-exporter";
|
||||
scrape_interval = "30s";
|
||||
metrics_path = "/metrics";
|
||||
static_configs = [
|
||||
{
|
||||
targets = ["${myvars.networking.hostsAddr.aquamarine.ipv4}:9187"];
|
||||
labels.type = "app";
|
||||
labels.app = "postgresql";
|
||||
labels.host = "aquamarine";
|
||||
}
|
||||
];
|
||||
}
|
||||
{
|
||||
job_name = "sftpgo-embedded-exporter";
|
||||
scrape_interval = "30s";
|
||||
metrics_path = "/metrics";
|
||||
static_configs = [
|
||||
{
|
||||
targets = ["${myvars.networking.hostsAddr.aquamarine.ipv4}:10000"];
|
||||
labels.type = "app";
|
||||
labels.app = "sftpgo";
|
||||
labels.host = "aquamarine";
|
||||
}
|
||||
];
|
||||
}
|
||||
]
|
||||
# --- Hosts --- #
|
||||
++ (
|
||||
lib.attrsets.foldlAttrs
|
||||
(acc: hostname: addr:
|
||||
acc
|
||||
++ [
|
||||
{
|
||||
job_name = "node-exporter-${hostname}";
|
||||
scrape_interval = "30s";
|
||||
metrics_path = "/metrics";
|
||||
static_configs = [
|
||||
{
|
||||
# All my NixOS hosts.
|
||||
targets = ["${addr.ipv4}:9100"];
|
||||
labels.type = "node";
|
||||
labels.host = hostname;
|
||||
}
|
||||
];
|
||||
}
|
||||
])
|
||||
[]
|
||||
myvars.networking.hostsAddr
|
||||
);
|
||||
|
||||
# specifies Alertmanager instances the Prometheus server sends alerts to
|
||||
# https://prometheus.io/docs/prometheus/latest/configuration/configuration/#alertmanager_config
|
||||
alertmanagers = [{static_configs = [{targets = ["localhost:9093"];}];}];
|
||||
};
|
||||
|
||||
services.prometheus.alertmanager = {
|
||||
enable = true;
|
||||
listenAddress = "127.0.0.1";
|
||||
port = 9093;
|
||||
webExternalUrl = "http://alertmanager.writefor.fun";
|
||||
logLevel = "info";
|
||||
|
||||
environmentFile = config.age.secrets."alertmanager.env".path;
|
||||
configuration = {
|
||||
global = {
|
||||
# The smarthost and SMTP sender used for mail notifications.
|
||||
smtp_smarthost = "smtp.qq.com:465";
|
||||
smtp_from = "$SMTP_SENDER_EMAIL";
|
||||
smtp_auth_username = "$SMTP_AUTH_USERNAME";
|
||||
smtp_auth_password = "$SMTP_AUTH_PASSWORD";
|
||||
# smtp.qq.com:465 support SSL only, so we need to disable TLS here.
|
||||
# https://service.mail.qq.com/detail/0/310
|
||||
smtp_require_tls = false;
|
||||
};
|
||||
route = {
|
||||
receiver = "default";
|
||||
routes = [
|
||||
{
|
||||
group_by = ["host"];
|
||||
group_wait = "5m";
|
||||
group_interval = "5m";
|
||||
repeat_interval = "4h";
|
||||
receiver = "default";
|
||||
}
|
||||
];
|
||||
};
|
||||
receivers = [
|
||||
{
|
||||
name = "default";
|
||||
email_configs = [
|
||||
{
|
||||
to = "ryan4yin@linux.com";
|
||||
# Whether to notify about resolved alerts.
|
||||
send_resolved = true;
|
||||
}
|
||||
];
|
||||
}
|
||||
];
|
||||
};
|
||||
};
|
||||
}
|
||||
Reference in New Issue
Block a user