feat: migrate from prometheus to victoriametrics (#171)

* feat: migrate from prometheus to victoriametrics

* fix: duplicated declaration

* fix: victoriametrics - isSystemUser

* fix: import promTypes

* fix: vmalert

* fix(victoriametrics): cli args
This commit is contained in:
Ryan Yin
2024-10-23 23:22:08 +08:00
committed by GitHub
parent 0fb0601a24
commit 34072df760
12 changed files with 1716 additions and 206 deletions

View File

@@ -1,4 +1,4 @@
# Prometheus & Alertmanager
# Monitoring & Alerting
## Alert Rules

View File

@@ -0,0 +1,47 @@
# Alertmanager instance that mails every alert to a single receiver.
{config, ...}: let
  # Name shared by the root route, its child route and the only receiver.
  receiverName = "default";

  # Global SMTP settings used for mail notifications.
  # The `$SMTP_*` placeholders are not Nix variables; presumably they are
  # substituted at service start from `environmentFile` below — confirm
  # against the NixOS alertmanager module.
  smtpSettings = {
    smtp_smarthost = "smtp.qq.com:465";
    smtp_from = "$SMTP_SENDER_EMAIL";
    smtp_auth_username = "$SMTP_AUTH_USERNAME";
    smtp_auth_password = "$SMTP_AUTH_PASSWORD";
    # smtp.qq.com:465 supports SSL only, so requiring TLS is turned off here.
    # https://service.mail.qq.com/detail/0/310
    smtp_require_tls = false;
  };

  # Root route plus one child route that batches notifications per host.
  routing = {
    receiver = receiverName;
    routes = [
      {
        receiver = receiverName;
        group_by = ["host"];
        group_wait = "5m";
        group_interval = "5m";
        repeat_interval = "4h";
      }
    ];
  };

  # The single e-mail receiver.
  mailReceiver = {
    name = receiverName;
    email_configs = [
      {
        to = "ryan4yin@linux.com";
        # Also send a mail when an alert is resolved.
        send_resolved = true;
      }
    ];
  };
in {
  services.prometheus.alertmanager = {
    enable = true;
    listenAddress = "127.0.0.1";
    port = 9093;
    webExternalUrl = "http://alertmanager.writefor.fun";
    logLevel = "info";
    # Secret file (managed via `config.age.secrets`) holding the SMTP values.
    environmentFile = config.age.secrets."alertmanager.env".path;
    configuration = {
      global = smtpSettings;
      route = routing;
      receivers = [mailReceiver];
    };
  };
}

View File

@@ -0,0 +1,7 @@
# Entry point of the monitoring stack: pulls in the custom module under
# ./module together with the VictoriaMetrics and Alertmanager configuration.
# No module argument is used here, so none is requested; the `...` pattern
# still accepts whatever the caller passes (including `mylib`).
{...}: {
  imports = [
    ./module
    ./victoriametrics.nix
    ./alertmanager.nix
  ];
}

View File

@@ -0,0 +1,195 @@
# Based on
# - https://github.com/NixOS/nixpkgs/blob/nixos-24.05/nixos/modules/services/databases/victoriametrics.nix
# - https://github.com/NixOS/nixpkgs/blob/nixos-24.05/nixos/modules/services/monitoring/prometheus/default.nix
{
config,
pkgs,
lib,
...
}:
with lib; let
# Shorthand for this module's option values.
cfg = config.services.my-victoriametrics;
# Option types for the Prometheus-style scrape configuration.
promTypes = import ./promTypes.nix {inherit lib;};
# YAML generator used to render the scrape configuration file.
yaml = pkgs.formats.yaml {};
# `host:port` the HTTP API binds to (also polled by the unit's postStart).
bindAddr = cfg.listenAddress + ":" + toString cfg.port;
# Absolute data directory below /var/lib.
workingDir = "/var/lib/${cfg.stateDir}";
# This becomes the main config file for VictoriaMetrics's `-promscrape.config`
# https://docs.victoriametrics.com/vmagent/#how-to-collect-metrics-in-prometheus-format
# Both sections are first stripped of null values and `_module` attributes.
scrapeConfig = mapAttrs (_section: filterValidPrometheus) {
  global = cfg.globalConfig;
  scrape_configs = cfg.scrapeConfigs;
};
generatedPrometheusYml = yaml.generate "prometheus.yml" scrapeConfig;
# Drops every attribute that is null or named `_module`, at any nesting depth.
filterValidPrometheus = filterAttrsListRecursive (n: v: n != "_module" && v != null);
# Recursively filters attribute sets with `pred name value`.
# - attribute sets: keep the attributes accepted by `pred`, then recurse into
#   the kept values;
# - lists: recurse into every element;
# - anything else: returned unchanged.
filterAttrsListRecursive = pred: x:
  if isList x
  then map (filterAttrsListRecursive pred) x
  else if isAttrs x
  then mapAttrs (_name: filterAttrsListRecursive pred) (filterAttrs pred x)
  else x;
in {
# Options of the custom VictoriaMetrics service. Unlike a plain database
# module it also accepts Prometheus-style scrape settings, which are rendered
# into the file passed to `-promscrape.config`.
options.services.my-victoriametrics = {
  enable = mkEnableOption "VictoriaMetrics, a time series database, long-term remote storage for Prometheus";
  package = mkPackageOption pkgs "victoriametrics" {};
  port = mkOption {
    type = types.port;
    default = 8428;
    description = ''
      Port to listen on.
    '';
  };
  listenAddress = mkOption {
    type = types.str;
    default = "0.0.0.0";
    description = ''
      Address to listen on for the http API.
    '';
  };
  # NOTE(review): the `2` suffix looks inherited from the Prometheus module's
  # `prometheus2` default — confirm it is intended.
  stateDir = mkOption {
    type = types.str;
    default = "victoriametrics2";
    description = ''
      Directory below `/var/lib` to store VictoriaMetrics metrics data.
      This directory will be created automatically using systemd's StateDirectory mechanism.
    '';
  };
  retentionTime = mkOption {
    type = types.nullOr types.str;
    default = null;
    example = "15d";
    description = ''
      How long to retain samples in storage.
      The minimum retentionPeriod is 24h or 1d. See also -retentionFilter
      The following optional suffixes are supported: s (second), h (hour), d (day), w (week), y (year).
      If suffix isn't set, then the duration is counted in months (default 1)
    '';
  };
  globalConfig = mkOption {
    type = promTypes.globalConfig;
    default = {};
    description = ''
      Parameters that are valid in all configuration contexts. They
      also serve as defaults for other configuration sections
    '';
  };
  scrapeConfigs = mkOption {
    type = types.listOf promTypes.scrape_config;
    default = [];
    description = ''
      A list of scrape configurations.
      See docs: <https://docs.victoriametrics.com/vmagent/#how-to-collect-metrics-in-prometheus-format>
    '';
  };
  extraFlags = mkOption {
    type = types.listOf types.str;
    default = [];
    description = ''
      Extra options to pass to VictoriaMetrics. See the README:
      <https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md>
      or {command}`victoriametrics -help` for more
      information.
    '';
  };
};
# Implementation: a dedicated system user/group and a hardened systemd unit
# that runs VictoriaMetrics with the generated scrape configuration.
config = lib.mkIf cfg.enable {
  users.groups.victoriametrics = {};
  users.users.victoriametrics = {
    description = "victoriametrics daemon user";
    isSystemUser = true; # required when uid is null
    group = "victoriametrics";
  };
  systemd.services.my-victoriametrics = {
    description = "VictoriaMetrics time series database";
    wantedBy = ["multi-user.target"];
    after = ["network.target"];
    startLimitBurst = 5;
    serviceConfig = {
      # `retentionTime` defaults to null, and interpolating null into a string
      # is an evaluation error. The `-retentionPeriod` flag is therefore only
      # emitted when a value is configured; otherwise the line stays empty and
      # the binary's own default applies.
      ExecStart = ''
        ${cfg.package}/bin/victoria-metrics \
        -storageDataPath=${workingDir} \
        -httpListenAddr=${bindAddr} \
        ${lib.optionalString (cfg.retentionTime != null) "-retentionPeriod=${cfg.retentionTime}"} \
        -promscrape.config=${generatedPrometheusYml} \
        ${lib.escapeShellArgs cfg.extraFlags}
      '';
      RestartSec = 1;
      User = "victoriametrics";
      Restart = "on-failure";
      RuntimeDirectory = "victoriametrics";
      RuntimeDirectoryMode = "0700";
      WorkingDirectory = workingDir;
      StateDirectory = cfg.stateDir;
      StateDirectoryMode = "0700";
      # Increase the limit to avoid errors like 'too many open files' when merging small parts
      LimitNOFILE = 1048576;
      # Hardening
      # The bind capability is granted only for privileged ports (< 1024);
      # otherwise the bounding set is emptied.
      AmbientCapabilities = lib.mkIf (cfg.port < 1024) ["CAP_NET_BIND_SERVICE"];
      CapabilityBoundingSet =
        if (cfg.port < 1024)
        then ["CAP_NET_BIND_SERVICE"]
        else [""];
      DeviceAllow = ["/dev/null rw"];
      DevicePolicy = "strict";
      LockPersonality = true;
      MemoryDenyWriteExecute = true;
      NoNewPrivileges = true;
      PrivateDevices = true;
      PrivateTmp = true;
      PrivateUsers = true;
      ProtectClock = true;
      ProtectControlGroups = true;
      ProtectHome = true;
      ProtectHostname = true;
      ProtectKernelLogs = true;
      ProtectKernelModules = true;
      ProtectKernelTunables = true;
      ProtectProc = "invisible";
      ProtectSystem = "full";
      RemoveIPC = true;
      RestrictAddressFamilies = ["AF_INET" "AF_INET6" "AF_UNIX"];
      RestrictNamespaces = true;
      RestrictRealtime = true;
      RestrictSUIDSGID = true;
      SystemCallArchitectures = "native";
      SystemCallFilter = ["@system-service" "~@privileged"];
    };
    # Block until the HTTP API answers on /ping, polling once per second.
    postStart = lib.mkBefore ''
      until ${lib.getBin pkgs.curl}/bin/curl -s -o /dev/null http://${bindAddr}/ping; do
        sleep 1;
      done
    '';
  };
};
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,132 @@
{
lib,
myvars,
...
}: {
# Workaround to keep the VictoriaMetrics data outside of /var/lib:
# create the real directory under /data and symlink the state directory to it.
# https://www.freedesktop.org/software/systemd/man/latest/tmpfiles.d.html#Type
systemd.tmpfiles.rules = let
  dataDir = "/data/apps/victoriametrics";
in [
  # Real data directory, owned by the service user.
  "D ${dataDir} 0751 victoriametrics victoriametrics - -"
  # /var/lib/victoriametrics becomes a symlink to the data directory.
  "L+ /var/lib/victoriametrics - - - - ${dataDir}"
];
# https://docs.victoriametrics.com/
# Single-node VictoriaMetrics that scrapes the homelab exporters itself.
services.my-victoriametrics = {
  enable = true;
  listenAddress = "127.0.0.1";
  port = 9090;
  retentionTime = "30d";
  extraFlags = [
    # Allowed percent of system memory VictoriaMetrics caches may occupy.
    "-memory.allowedPercent=50"
  ];
  # Directory below /var/lib to store victoriametrics metrics data.
  stateDir = "victoriametrics";
  # Targets to scrape and how to scrape them (Prometheus scrape_config format).
  # https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config
  scrapeConfigs = let
    hosts = myvars.networking.hostsAddr;

    # Settings shared by every job: one static target, scraped every 30s.
    mkJob = job_name: target: labels: {
      inherit job_name;
      scrape_interval = "30s";
      metrics_path = "/metrics";
      static_configs = [
        {
          targets = [target];
          inherit labels;
        }
      ];
    };

    # Job for an application exporter listening on `port` of `host`.
    appJob = job_name: app: host: port:
      mkJob job_name "${hosts.${host}.ipv4}:${toString port}" {
        type = "app";
        inherit app host;
      };

    # Job for the node exporter (port 9100) of one host.
    nodeJob = hostname: addr:
      mkJob "node-exporter-${hostname}" "${addr.ipv4}:9100" {
        type = "node";
        host = hostname;
      };
  in
    # --- Homelab Applications --- #
    [
      (appJob "dnsmasq-exporter" "dnsmasq" "suzi" 9153)
      (appJob "v2ray-exporter" "v2ray" "aquamarine" 9153)
      (appJob "postgres-exporter" "postgresql" "aquamarine" 9187)
      (appJob "sftpgo-embedded-exporter" "sftpgo" "aquamarine" 10000)
    ]
    # --- Hosts --- #
    # One job per entry of `hostsAddr` (all my NixOS hosts).
    ++ lib.mapAttrsToList nodeJob hosts;
};
# vmalert evaluates the alerting rules against VictoriaMetrics and forwards
# firing alerts to Alertmanager.
services.vmalert = let
  datasourceUrl = "http://localhost:9090";
  alertmanagerUrl = "http://localhost:9093"; # alertmanager's api
  rulesDir = ./alert_rules;
  ruleFile = name: rulesDir + "/${name}.yml";
in {
  enable = true;
  settings = {
    "datasource.url" = datasourceUrl;
    "notifier.url" = [alertmanagerUrl];
    # Whether to disable long-lived connections to the datasource.
    "datasource.disableKeepAlive" = true;
    # Whether to avoid stripping sensitive information such as auth headers or passwords
    # from URLs in log messages or UI and exported metrics.
    "datasource.showURL" = false;
    rule = map ruleFile [
      "node-exporter"
      "kubestate-exporter"
      "etcd_embedded-exporter"
      "istio_embedded-exporter"
      "coredns_embedded-exporter"
    ];
  };
};
}

View File

@@ -1,13 +0,0 @@
# Alert rules based on the metrics CoreDNS exports itself.
groups:
  - name: EmbeddedExporter
    rules:
      # Fires as soon as the panic counter increased within the last minute
      # (`for: 0m` means there is no pending period).
      - alert: CorednsPanicCount
        expr: 'increase(coredns_panics_total[1m]) > 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: CoreDNS Panic Count (instance {{ $labels.instance }})
          description: "Number of CoreDNS panics encountered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View File

@@ -1,192 +0,0 @@
{
lib,
config,
myvars,
...
}: {
# Workaround to keep the Prometheus data outside of /var/lib:
# create the real directory under /data and symlink the state directory to it.
# https://www.freedesktop.org/software/systemd/man/latest/tmpfiles.d.html#Type
systemd.tmpfiles.rules = let
  dataDir = "/data/apps/prometheus2";
in [
  # Real data directory, owned by the service user.
  "D ${dataDir} 0751 prometheus prometheus - -"
  # /var/lib/prometheus2 becomes a symlink to the data directory.
  "L+ /var/lib/prometheus2 - - - - ${dataDir}"
];
# Prometheus server: scrapes the homelab exporters, evaluates the alert rules
# and hands firing alerts to the local Alertmanager.
# https://prometheus.io/docs/prometheus/latest/configuration/configuration/
# NOTE: the value must stay a plain attribute set literal, because
# `services.prometheus.alertmanager` is defined separately further down.
services.prometheus = {
  enable = true;
  checkConfig = true;
  listenAddress = "127.0.0.1";
  port = 9090;
  webExternalUrl = "http://prometheus.writefor.fun";
  extraFlags = [
    "--storage.tsdb.retention.time=30d"
    # https://prometheus.io/docs/prometheus/latest/storage/#remote-storage-integrations
    "--web.enable-remote-write-receiver"
  ];
  # Directory below /var/lib to store Prometheus metrics data.
  stateDir = "prometheus2";
  # Reload prometheus when configuration file changes (instead of restart).
  enableReload = true;
  # https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_read
  # remoteRead = [];
  # Rules are read from these files.
  # https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/
  #
  # Prometheus evaluates two kinds of rules at regular intervals:
  #   1. Recording rules precompute frequently needed or expensive expressions
  #      and store the result as a new set of time series, so that dashboards
  #      which repeat the same query on every refresh can read the cheap,
  #      precomputed series instead.
  #   2. Alerting rules define alert conditions as expressions and send
  #      notifications about firing alerts to an external service.
  ruleFiles = let
    rulesDir = ./alert_rules;
  in
    map (name: rulesDir + "/${name}.yml") [
      "node-exporter"
      "kubestate-exporter"
      "etcd_embedded-exporter"
      "istio_embedded-exporter"
      "coredns_embedded-exporter"
      # disabled: ./recording_rules.yml
    ];
  # Targets to scrape and how to scrape them.
  # https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config
  scrapeConfigs = let
    hosts = myvars.networking.hostsAddr;

    # Settings shared by every job: one static target, scraped every 30s.
    mkJob = job_name: target: labels: {
      inherit job_name;
      scrape_interval = "30s";
      metrics_path = "/metrics";
      static_configs = [
        {
          targets = [target];
          inherit labels;
        }
      ];
    };

    # Job for an application exporter listening on `port` of `host`.
    appJob = job_name: app: host: port:
      mkJob job_name "${hosts.${host}.ipv4}:${toString port}" {
        type = "app";
        inherit app host;
      };

    # Job for the node exporter (port 9100) of one host.
    nodeJob = hostname: addr:
      mkJob "node-exporter-${hostname}" "${addr.ipv4}:9100" {
        type = "node";
        host = hostname;
      };
  in
    # --- Homelab Applications --- #
    [
      (appJob "dnsmasq-exporter" "dnsmasq" "suzi" 9153)
      (appJob "v2ray-exporter" "v2ray" "aquamarine" 9153)
      (appJob "postgres-exporter" "postgresql" "aquamarine" 9187)
      (appJob "sftpgo-embedded-exporter" "sftpgo" "aquamarine" 10000)
    ]
    # --- Hosts --- #
    # One job per entry of `hostsAddr` (all my NixOS hosts).
    ++ lib.mapAttrsToList nodeJob hosts;
  # Alertmanager instances the Prometheus server sends alerts to.
  # https://prometheus.io/docs/prometheus/latest/configuration/configuration/#alertmanager_config
  alertmanagers = [
    {
      static_configs = [
        {
          targets = ["localhost:9093"];
        }
      ];
    }
  ];
};
# Alertmanager instance that mails every alert to a single receiver.
services.prometheus.alertmanager = let
  # Name shared by the root route, its child route and the only receiver.
  receiverName = "default";

  # Global SMTP settings used for mail notifications.
  # The `$SMTP_*` placeholders are not Nix variables; presumably they are
  # substituted at service start from `environmentFile` below — confirm
  # against the NixOS alertmanager module.
  smtpSettings = {
    smtp_smarthost = "smtp.qq.com:465";
    smtp_from = "$SMTP_SENDER_EMAIL";
    smtp_auth_username = "$SMTP_AUTH_USERNAME";
    smtp_auth_password = "$SMTP_AUTH_PASSWORD";
    # smtp.qq.com:465 supports SSL only, so requiring TLS is turned off here.
    # https://service.mail.qq.com/detail/0/310
    smtp_require_tls = false;
  };

  # Root route plus one child route that batches notifications per host.
  routing = {
    receiver = receiverName;
    routes = [
      {
        receiver = receiverName;
        group_by = ["host"];
        group_wait = "5m";
        group_interval = "5m";
        repeat_interval = "4h";
      }
    ];
  };

  # The single e-mail receiver.
  mailReceiver = {
    name = receiverName;
    email_configs = [
      {
        to = "ryan4yin@linux.com";
        # Also send a mail when an alert is resolved.
        send_resolved = true;
      }
    ];
  };
in {
  enable = true;
  listenAddress = "127.0.0.1";
  port = 9093;
  webExternalUrl = "http://alertmanager.writefor.fun";
  logLevel = "info";
  # Secret file (managed via `config.age.secrets`) holding the SMTP values.
  environmentFile = config.age.secrets."alertmanager.env".path;
  configuration = {
    global = smtpSettings;
    route = routing;
    receivers = [mailReceiver];
  };
};
}