From 34072df760bdd75c57a5f78d161a7bd1beee48ed Mon Sep 17 00:00:00 2001 From: Ryan Yin Date: Wed, 23 Oct 2024 23:22:08 +0800 Subject: [PATCH] feat: migrate from prometheus to victoriametrics (#171) * feat: migrate from prometheus to victoriametrics * fix: duplicated declaration * fix: victoriametrics - isSystemUser * fix: import promTypes * fix: vmalert * fix(victoriametrics): cli args --- .../{prometheus => monitoring}/README.md | 2 +- .../alert_rules/etcd_embedded-exporter.yml | 0 .../alert_rules/istio_embedded-exporter.yml | 0 .../alert_rules/kubestate-exporter.yml | 0 .../alert_rules/node-exporter.yml | 0 .../monitoring/alertmanager.nix | 47 + hosts/idols-aquamarine/monitoring/default.nix | 7 + .../monitoring/module/default.nix | 195 +++ .../monitoring/module/promTypes.nix | 1334 +++++++++++++++++ .../monitoring/victoriametrics.nix | 132 ++ .../alert_rules/coredns_embedded-exporter.yml | 13 - hosts/idols-aquamarine/prometheus/default.nix | 192 --- 12 files changed, 1716 insertions(+), 206 deletions(-) rename hosts/idols-aquamarine/{prometheus => monitoring}/README.md (83%) rename hosts/idols-aquamarine/{prometheus => monitoring}/alert_rules/etcd_embedded-exporter.yml (100%) rename hosts/idols-aquamarine/{prometheus => monitoring}/alert_rules/istio_embedded-exporter.yml (100%) rename hosts/idols-aquamarine/{prometheus => monitoring}/alert_rules/kubestate-exporter.yml (100%) rename hosts/idols-aquamarine/{prometheus => monitoring}/alert_rules/node-exporter.yml (100%) create mode 100644 hosts/idols-aquamarine/monitoring/alertmanager.nix create mode 100644 hosts/idols-aquamarine/monitoring/default.nix create mode 100644 hosts/idols-aquamarine/monitoring/module/default.nix create mode 100644 hosts/idols-aquamarine/monitoring/module/promTypes.nix create mode 100644 hosts/idols-aquamarine/monitoring/victoriametrics.nix delete mode 100644 hosts/idols-aquamarine/prometheus/alert_rules/coredns_embedded-exporter.yml delete mode 100644 hosts/idols-aquamarine/prometheus/default.nix diff --git a/hosts/idols-aquamarine/prometheus/README.md b/hosts/idols-aquamarine/monitoring/README.md similarity index 83% rename from hosts/idols-aquamarine/prometheus/README.md rename to hosts/idols-aquamarine/monitoring/README.md index 2abb15a7..e7c050c5 100644 --- a/hosts/idols-aquamarine/prometheus/README.md +++ b/hosts/idols-aquamarine/monitoring/README.md @@ -1,4 +1,4 @@ -# Prometheus & Alertmanager +# Monitoring & Alerting ## Alert Rules diff --git a/hosts/idols-aquamarine/prometheus/alert_rules/etcd_embedded-exporter.yml b/hosts/idols-aquamarine/monitoring/alert_rules/etcd_embedded-exporter.yml similarity index 100% rename from hosts/idols-aquamarine/prometheus/alert_rules/etcd_embedded-exporter.yml rename to hosts/idols-aquamarine/monitoring/alert_rules/etcd_embedded-exporter.yml diff --git a/hosts/idols-aquamarine/prometheus/alert_rules/istio_embedded-exporter.yml b/hosts/idols-aquamarine/monitoring/alert_rules/istio_embedded-exporter.yml similarity index 100% rename from hosts/idols-aquamarine/prometheus/alert_rules/istio_embedded-exporter.yml rename to hosts/idols-aquamarine/monitoring/alert_rules/istio_embedded-exporter.yml diff --git a/hosts/idols-aquamarine/prometheus/alert_rules/kubestate-exporter.yml b/hosts/idols-aquamarine/monitoring/alert_rules/kubestate-exporter.yml similarity index 100% rename from hosts/idols-aquamarine/prometheus/alert_rules/kubestate-exporter.yml rename to hosts/idols-aquamarine/monitoring/alert_rules/kubestate-exporter.yml diff --git a/hosts/idols-aquamarine/prometheus/alert_rules/node-exporter.yml b/hosts/idols-aquamarine/monitoring/alert_rules/node-exporter.yml similarity index 100% rename from hosts/idols-aquamarine/prometheus/alert_rules/node-exporter.yml rename to hosts/idols-aquamarine/monitoring/alert_rules/node-exporter.yml diff --git a/hosts/idols-aquamarine/monitoring/alertmanager.nix b/hosts/idols-aquamarine/monitoring/alertmanager.nix new file mode 100644 index 00000000..f8b1ce3a --- /dev/null +++ b/hosts/idols-aquamarine/monitoring/alertmanager.nix @@ -0,0 +1,47 @@ +{config, ...}: { + services.prometheus.alertmanager = { + enable = true; + listenAddress = "127.0.0.1"; + port = 9093; + webExternalUrl = "http://alertmanager.writefor.fun"; + logLevel = "info"; + + environmentFile = config.age.secrets."alertmanager.env".path; + configuration = { + global = { + # The smarthost and SMTP sender used for mail notifications. + smtp_smarthost = "smtp.qq.com:465"; + smtp_from = "$SMTP_SENDER_EMAIL"; + smtp_auth_username = "$SMTP_AUTH_USERNAME"; + smtp_auth_password = "$SMTP_AUTH_PASSWORD"; + # smtp.qq.com:465 support SSL only, so we need to disable TLS here. + # https://service.mail.qq.com/detail/0/310 + smtp_require_tls = false; + }; + route = { + receiver = "default"; + routes = [ + { + group_by = ["host"]; + group_wait = "5m"; + group_interval = "5m"; + repeat_interval = "4h"; + receiver = "default"; + } + ]; + }; + receivers = [ + { + name = "default"; + email_configs = [ + { + to = "ryan4yin@linux.com"; + # Whether to notify about resolved alerts. + send_resolved = true; + } + ]; + } + ]; + }; + }; +} diff --git a/hosts/idols-aquamarine/monitoring/default.nix b/hosts/idols-aquamarine/monitoring/default.nix new file mode 100644 index 00000000..408c0ada --- /dev/null +++ b/hosts/idols-aquamarine/monitoring/default.nix @@ -0,0 +1,7 @@ +{mylib, ...}: { + imports = [ + ./module + ./victoriametrics.nix + ./alertmanager.nix + ]; +} diff --git a/hosts/idols-aquamarine/monitoring/module/default.nix b/hosts/idols-aquamarine/monitoring/module/default.nix new file mode 100644 index 00000000..3aaf5b74 --- /dev/null +++ b/hosts/idols-aquamarine/monitoring/module/default.nix @@ -0,0 +1,195 @@ +# Based on +# - https://github.com/NixOS/nixpkgs/blob/nixos-24.05/nixos/modules/services/databases/victoriametrics.nix +# - https://github.com/NixOS/nixpkgs/blob/nixos-24.05/nixos/modules/services/monitoring/prometheus/default.nix +{ + config, + pkgs, + lib, + ... +}: +with lib; let + cfg = config.services.my-victoriametrics; + yaml = pkgs.formats.yaml {}; + + promTypes = import ./promTypes.nix {inherit lib;}; + + bindAddr = "${cfg.listenAddress}:${builtins.toString cfg.port}"; + workingDir = "/var/lib/" + cfg.stateDir; + + generatedPrometheusYml = yaml.generate "prometheus.yml" scrapeConfig; + + # This becomes the main config file for VictoriaMetrics's `-promscrape.config` + # https://docs.victoriametrics.com/vmagent/#how-to-collect-metrics-in-prometheus-format + scrapeConfig = { + global = filterValidPrometheus cfg.globalConfig; + scrape_configs = filterValidPrometheus cfg.scrapeConfigs; + }; + + filterValidPrometheus = filterAttrsListRecursive (n: v: !(n == "_module" || v == null)); + filterAttrsListRecursive = pred: x: + if isAttrs x + then + listToAttrs + ( + concatMap + ( + name: let + v = x.${name}; + in + if pred name v + then [ + (nameValuePair name (filterAttrsListRecursive pred v)) + ] + else [] + ) + (attrNames x) + ) + else if isList x + then map (filterAttrsListRecursive pred) x + else x; +in { + options.services.my-victoriametrics = { + enable = mkEnableOption "VictoriaMetrics, a time series database, long-term remote storage for victoriametrics"; + package = mkPackageOption pkgs "victoriametrics" {}; + + port = mkOption { + type = types.port; + default = 8428; + description = '' + Port to listen on. + ''; + }; + + listenAddress = mkOption { + type = types.str; + default = "0.0.0.0"; + description = '' + Address to listen on for the http API. + ''; + }; + + stateDir = mkOption { + type = types.str; + default = "victoriametrics2"; + description = '' + Directory below `/var/lib` to store VictoriaMetrics metrics data. + This directory will be created automatically using systemd's StateDirectory mechanism. + ''; + }; + + retentionTime = mkOption { + type = types.nullOr types.str; + default = null; + example = "15d"; + description = '' + How long to retain samples in storage. + The minimum retentionPeriod is 24h or 1d. See also -retentionFilter + The following optional suffixes are supported: s (second), h (hour), d (day), w (week), y (year). + If suffix isn't set, then the duration is counted in months (default 1) + ''; + }; + + globalConfig = mkOption { + type = promTypes.globalConfig; + default = {}; + description = '' + Parameters that are valid in all configuration contexts. They + also serve as defaults for other configuration sections + ''; + }; + + scrapeConfigs = mkOption { + type = types.listOf promTypes.scrape_config; + default = []; + description = '' + A list of scrape configurations. + See docs: + ''; + }; + + extraFlags = mkOption { + type = types.listOf types.str; + default = []; + description = '' + Extra options to pass to VictoriaMetrics. See the README: + + or {command}`victoriametrics -help` for more + information. + ''; + }; + }; + config = lib.mkIf cfg.enable { + users.groups.victoriametrics = {}; + users.users.victoriametrics = { + description = "victoriametrics daemon user"; + isSystemUser = true; # required when uid is null + group = "victoriametrics"; + }; + + systemd.services.my-victoriametrics = { + description = "VictoriaMetrics time series database"; + wantedBy = ["multi-user.target"]; + after = ["network.target"]; + + startLimitBurst = 5; + serviceConfig = { + ExecStart = '' + ${cfg.package}/bin/victoria-metrics \ + -storageDataPath=${workingDir} \ + -httpListenAddr=${bindAddr} \ + -retentionPeriod=${cfg.retentionTime} \ + -promscrape.config=${generatedPrometheusYml} \ + ${lib.escapeShellArgs cfg.extraFlags} + ''; + RestartSec = 1; + User = "victoriametrics"; + Restart = "on-failure"; + RuntimeDirectory = "victoriametrics"; + RuntimeDirectoryMode = "0700"; + WorkingDirectory = workingDir; + StateDirectory = cfg.stateDir; + StateDirectoryMode = "0700"; + + # Increase the limit to avoid errors like 'too many open files' when merging small parts + LimitNOFILE = 1048576; + + # Hardening + AmbientCapabilities = lib.mkIf (cfg.port < 1024) ["CAP_NET_BIND_SERVICE"]; + CapabilityBoundingSet = + if (cfg.port < 1024) + then ["CAP_NET_BIND_SERVICE"] + else [""]; + DeviceAllow = ["/dev/null rw"]; + DevicePolicy = "strict"; + LockPersonality = true; + MemoryDenyWriteExecute = true; + NoNewPrivileges = true; + PrivateDevices = true; + PrivateTmp = true; + PrivateUsers = true; + ProtectClock = true; + ProtectControlGroups = true; + ProtectHome = true; + ProtectHostname = true; + ProtectKernelLogs = true; + ProtectKernelModules = true; + ProtectKernelTunables = true; + ProtectProc = "invisible"; + ProtectSystem = "full"; + RemoveIPC = true; + RestrictAddressFamilies = ["AF_INET" "AF_INET6" "AF_UNIX"]; + RestrictNamespaces = true; + RestrictRealtime = true; + RestrictSUIDSGID = true; + SystemCallArchitectures = "native"; + SystemCallFilter = ["@system-service" "~@privileged"]; + }; + + postStart = lib.mkBefore '' + until ${lib.getBin pkgs.curl}/bin/curl -s -o /dev/null http://${bindAddr}/ping; do + sleep 1; + done + ''; + }; + }; +} diff --git a/hosts/idols-aquamarine/monitoring/module/promTypes.nix b/hosts/idols-aquamarine/monitoring/module/promTypes.nix new file mode 100644 index 00000000..e13e71f8 --- /dev/null +++ b/hosts/idols-aquamarine/monitoring/module/promTypes.nix @@ -0,0 +1,1334 @@ +{lib}: +with lib; let + # + # Config types: helper functions + # + mkDefOpt = type: defaultStr: description: + mkOpt type (description + + '' + + Defaults to ````${defaultStr}```` in prometheus + when set to `null`. + ''); + + mkOpt = type: description: + mkOption { + type = types.nullOr type; + default = null; + description = description; + }; + + mkSdConfigModule = extraOptions: + types.submodule { + options = + { + basic_auth = mkOpt promTypes.basic_auth '' + Optional HTTP basic authentication information. + ''; + + authorization = + mkOpt + (types.submodule { + options = { + type = mkDefOpt types.str "Bearer" '' + Sets the authentication type. + ''; + + credentials = mkOpt types.str '' + Sets the credentials. It is mutually exclusive with `credentials_file`. + ''; + + credentials_file = mkOpt types.str '' + Sets the credentials to the credentials read from the configured file. + It is mutually exclusive with `credentials`. + ''; + }; + }) '' + Optional `Authorization` header configuration. + ''; + + oauth2 = mkOpt promtypes.oauth2 '' + Optional OAuth 2.0 configuration. + Cannot be used at the same time as basic_auth or authorization. + ''; + + proxy_url = mkOpt types.str '' + Optional proxy URL. + ''; + + follow_redirects = mkDefOpt types.bool "true" '' + Configure whether HTTP requests follow HTTP 3xx redirects. + ''; + + tls_config = mkOpt promTypes.tls_config '' + TLS configuration. + ''; + } + // extraOptions; + }; + + # + # Config types: general + # + + promTypes.globalConfig = types.submodule { + options = { + scrape_interval = mkDefOpt types.str "1m" '' + How frequently to scrape targets by default. + ''; + + scrape_timeout = mkDefOpt types.str "10s" '' + How long until a scrape request times out. + ''; + + evaluation_interval = mkDefOpt types.str "1m" '' + How frequently to evaluate rules by default. + ''; + + external_labels = mkOpt (types.attrsOf types.str) '' + The labels to add to any time series or alerts when + communicating with external systems (federation, remote + storage, Alertmanager). + ''; + }; + }; + + promTypes.basic_auth = types.submodule { + options = { + username = mkOption { + type = types.str; + description = '' + HTTP username + ''; + }; + password = mkOpt types.str "HTTP password"; + password_file = mkOpt types.str "HTTP password file"; + }; + }; + + promTypes.tls_config = types.submodule { + options = { + ca_file = mkOpt types.str '' + CA certificate to validate API server certificate with. + ''; + + cert_file = mkOpt types.str '' + Certificate file for client cert authentication to the server. + ''; + + key_file = mkOpt types.str '' + Key file for client cert authentication to the server. + ''; + + server_name = mkOpt types.str '' + ServerName extension to indicate the name of the server. + http://tools.ietf.org/html/rfc4366#section-3.1 + ''; + + insecure_skip_verify = mkOpt types.bool '' + Disable validation of the server certificate. + ''; + }; + }; + + promtypes.oauth2 = types.submodule { + options = { + client_id = mkOpt types.str '' + OAuth client ID. + ''; + + client_secret = mkOpt types.str '' + OAuth client secret. + ''; + + client_secret_file = mkOpt types.str '' + Read the client secret from a file. It is mutually exclusive with `client_secret`. + ''; + + scopes = mkOpt (types.listOf types.str) '' + Scopes for the token request. + ''; + + token_url = mkOpt types.str '' + The URL to fetch the token from. + ''; + + endpoint_params = mkOpt (types.attrsOf types.str) '' + Optional parameters to append to the token URL. + ''; + }; + }; + + promTypes.scrape_config = types.submodule { + options = { + authorization = mkOption { + type = types.nullOr types.attrs; + default = null; + description = '' + Sets the `Authorization` header on every scrape request with the configured credentials. + ''; + }; + job_name = mkOption { + type = types.str; + description = '' + The job name assigned to scraped metrics by default. + ''; + }; + scrape_interval = mkOpt types.str '' + How frequently to scrape targets from this job. Defaults to the + globally configured default. + ''; + + scrape_timeout = mkOpt types.str '' + Per-target timeout when scraping this job. Defaults to the + globally configured default. + ''; + + metrics_path = mkDefOpt types.str "/metrics" '' + The HTTP resource path on which to fetch metrics from targets. + ''; + + honor_labels = mkDefOpt types.bool "false" '' + Controls how Prometheus handles conflicts between labels + that are already present in scraped data and labels that + Prometheus would attach server-side ("job" and "instance" + labels, manually configured target labels, and labels + generated by service discovery implementations). + + If honor_labels is set to "true", label conflicts are + resolved by keeping label values from the scraped data and + ignoring the conflicting server-side labels. + + If honor_labels is set to "false", label conflicts are + resolved by renaming conflicting labels in the scraped data + to "exported_\" (for example + "exported_instance", "exported_job") and then attaching + server-side labels. This is useful for use cases such as + federation, where all labels specified in the target should + be preserved. + ''; + + honor_timestamps = mkDefOpt types.bool "true" '' + honor_timestamps controls whether Prometheus respects the timestamps present + in scraped data. + + If honor_timestamps is set to `true`, the timestamps of the metrics exposed + by the target will be used. + + If honor_timestamps is set to `false`, the timestamps of the metrics exposed + by the target will be ignored. + ''; + + scheme = mkDefOpt (types.enum ["http" "https"]) "http" '' + The URL scheme with which to fetch metrics from targets. + ''; + + params = mkOpt (types.attrsOf (types.listOf types.str)) '' + Optional HTTP URL parameters. + ''; + + basic_auth = mkOpt promTypes.basic_auth '' + Sets the `Authorization` header on every scrape request with the + configured username and password. + password and password_file are mutually exclusive. + ''; + + bearer_token = mkOpt types.str '' + Sets the `Authorization` header on every scrape request with + the configured bearer token. It is mutually exclusive with + {option}`bearer_token_file`. + ''; + + bearer_token_file = mkOpt types.str '' + Sets the `Authorization` header on every scrape request with + the bearer token read from the configured file. It is mutually + exclusive with {option}`bearer_token`. + ''; + + tls_config = mkOpt promTypes.tls_config '' + Configures the scrape request's TLS settings. + ''; + + proxy_url = mkOpt types.str '' + Optional proxy URL. + ''; + + azure_sd_configs = mkOpt (types.listOf promTypes.azure_sd_config) '' + List of Azure service discovery configurations. + ''; + + consul_sd_configs = mkOpt (types.listOf promTypes.consul_sd_config) '' + List of Consul service discovery configurations. + ''; + + digitalocean_sd_configs = mkOpt (types.listOf promTypes.digitalocean_sd_config) '' + List of DigitalOcean service discovery configurations. + ''; + + docker_sd_configs = mkOpt (types.listOf promTypes.docker_sd_config) '' + List of Docker service discovery configurations. + ''; + + dockerswarm_sd_configs = mkOpt (types.listOf promTypes.dockerswarm_sd_config) '' + List of Docker Swarm service discovery configurations. + ''; + + dns_sd_configs = mkOpt (types.listOf promTypes.dns_sd_config) '' + List of DNS service discovery configurations. + ''; + + ec2_sd_configs = mkOpt (types.listOf promTypes.ec2_sd_config) '' + List of EC2 service discovery configurations. + ''; + + eureka_sd_configs = mkOpt (types.listOf promTypes.eureka_sd_config) '' + List of Eureka service discovery configurations. + ''; + + file_sd_configs = mkOpt (types.listOf promTypes.file_sd_config) '' + List of file service discovery configurations. + ''; + + gce_sd_configs = mkOpt (types.listOf promTypes.gce_sd_config) '' + List of Google Compute Engine service discovery configurations. + + See [the relevant Prometheus configuration docs](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#gce_sd_config) + for more detail. + ''; + + hetzner_sd_configs = mkOpt (types.listOf promTypes.hetzner_sd_config) '' + List of Hetzner service discovery configurations. + ''; + + http_sd_configs = mkOpt (types.listOf promTypes.http_sd_config) '' + List of HTTP service discovery configurations. + ''; + + kubernetes_sd_configs = mkOpt (types.listOf promTypes.kubernetes_sd_config) '' + List of Kubernetes service discovery configurations. + ''; + + kuma_sd_configs = mkOpt (types.listOf promTypes.kuma_sd_config) '' + List of Kuma service discovery configurations. + ''; + + lightsail_sd_configs = mkOpt (types.listOf promTypes.lightsail_sd_config) '' + List of Lightsail service discovery configurations. + ''; + + linode_sd_configs = mkOpt (types.listOf promTypes.linode_sd_config) '' + List of Linode service discovery configurations. + ''; + + marathon_sd_configs = mkOpt (types.listOf promTypes.marathon_sd_config) '' + List of Marathon service discovery configurations. + ''; + + nerve_sd_configs = mkOpt (types.listOf promTypes.nerve_sd_config) '' + List of AirBnB's Nerve service discovery configurations. + ''; + + openstack_sd_configs = mkOpt (types.listOf promTypes.openstack_sd_config) '' + List of OpenStack service discovery configurations. + ''; + + puppetdb_sd_configs = mkOpt (types.listOf promTypes.puppetdb_sd_config) '' + List of PuppetDB service discovery configurations. + ''; + + scaleway_sd_configs = mkOpt (types.listOf promTypes.scaleway_sd_config) '' + List of Scaleway service discovery configurations. + ''; + + serverset_sd_configs = mkOpt (types.listOf promTypes.serverset_sd_config) '' + List of Zookeeper Serverset service discovery configurations. + ''; + + triton_sd_configs = mkOpt (types.listOf promTypes.triton_sd_config) '' + List of Triton Serverset service discovery configurations. + ''; + + uyuni_sd_configs = mkOpt (types.listOf promTypes.uyuni_sd_config) '' + List of Uyuni Serverset service discovery configurations. + ''; + + static_configs = mkOpt (types.listOf promTypes.static_config) '' + List of labeled target groups for this job. + ''; + + relabel_configs = mkOpt (types.listOf promTypes.relabel_config) '' + List of relabel configurations. + ''; + + metric_relabel_configs = mkOpt (types.listOf promTypes.relabel_config) '' + List of metric relabel configurations. + ''; + + body_size_limit = mkDefOpt types.str "0" '' + An uncompressed response body larger than this many bytes will cause the + scrape to fail. 0 means no limit. Example: 100MB. + This is an experimental feature, this behaviour could + change or be removed in the future. + ''; + + sample_limit = mkDefOpt types.int "0" '' + Per-scrape limit on number of scraped samples that will be accepted. + If more than this number of samples are present after metric relabelling + the entire scrape will be treated as failed. 0 means no limit. + ''; + + label_limit = mkDefOpt types.int "0" '' + Per-scrape limit on number of labels that will be accepted for a sample. If + more than this number of labels are present post metric-relabeling, the + entire scrape will be treated as failed. 0 means no limit. + ''; + + label_name_length_limit = mkDefOpt types.int "0" '' + Per-scrape limit on length of labels name that will be accepted for a sample. + If a label name is longer than this number post metric-relabeling, the entire + scrape will be treated as failed. 0 means no limit. + ''; + + label_value_length_limit = mkDefOpt types.int "0" '' + Per-scrape limit on length of labels value that will be accepted for a sample. + If a label value is longer than this number post metric-relabeling, the + entire scrape will be treated as failed. 0 means no limit. + ''; + + target_limit = mkDefOpt types.int "0" '' + Per-scrape config limit on number of unique targets that will be + accepted. If more than this number of targets are present after target + relabeling, Prometheus will mark the targets as failed without scraping them. + 0 means no limit. This is an experimental feature, this behaviour could + change in the future. + ''; + }; + }; + + # + # Config types: service discovery + # + + # For this one, the docs actually define all types needed to use mkSdConfigModule, but a bunch + # of them are marked with 'currently not support by Azure' so we don't bother adding them in + # here. + promTypes.azure_sd_config = types.submodule { + options = { + environment = mkDefOpt types.str "AzurePublicCloud" '' + The Azure environment. + ''; + + authentication_method = mkDefOpt (types.enum ["OAuth" "ManagedIdentity"]) "OAuth" '' + The authentication method, either OAuth or ManagedIdentity. + See https://docs.microsoft.com/en-us/azure/active-directory/managed-identities-azure-resources/overview + ''; + + subscription_id = mkOption { + type = types.str; + description = '' + The subscription ID. + ''; + }; + + tenant_id = mkOpt types.str '' + Optional tenant ID. Only required with authentication_method OAuth. + ''; + + client_id = mkOpt types.str '' + Optional client ID. Only required with authentication_method OAuth. + ''; + + client_secret = mkOpt types.str '' + Optional client secret. Only required with authentication_method OAuth. + ''; + + refresh_interval = mkDefOpt types.str "300s" '' + Refresh interval to re-read the instance list. + ''; + + port = mkDefOpt types.port "80" '' + The port to scrape metrics from. If using the public IP + address, this must instead be specified in the relabeling + rule. + ''; + + proxy_url = mkOpt types.str '' + Optional proxy URL. + ''; + + follow_redirects = mkDefOpt types.bool "true" '' + Configure whether HTTP requests follow HTTP 3xx redirects. + ''; + + tls_config = mkOpt promTypes.tls_config '' + TLS configuration. + ''; + }; + }; + + promTypes.consul_sd_config = mkSdConfigModule { + server = mkDefOpt types.str "localhost:8500" '' + Consul server to query. + ''; + + token = mkOpt types.str "Consul token"; + + datacenter = mkOpt types.str "Consul datacenter"; + + scheme = mkDefOpt types.str "http" "Consul scheme"; + + username = mkOpt types.str "Consul username"; + + password = mkOpt types.str "Consul password"; + + tls_config = mkOpt promTypes.tls_config '' + Configures the Consul request's TLS settings. + ''; + + services = mkOpt (types.listOf types.str) '' + A list of services for which targets are retrieved. + ''; + + tags = mkOpt (types.listOf types.str) '' + An optional list of tags used to filter nodes for a given + service. Services must contain all tags in the list. + ''; + + node_meta = mkOpt (types.attrsOf types.str) '' + Node metadata used to filter nodes for a given service. + ''; + + tag_separator = mkDefOpt types.str "," '' + The string by which Consul tags are joined into the tag label. + ''; + + allow_stale = mkOpt types.bool '' + Allow stale Consul results + (see ). + + Will reduce load on Consul. + ''; + + refresh_interval = mkDefOpt types.str "30s" '' + The time after which the provided names are refreshed. + + On large setup it might be a good idea to increase this value + because the catalog will change all the time. + ''; + }; + + promTypes.digitalocean_sd_config = mkSdConfigModule { + port = mkDefOpt types.port "80" '' + The port to scrape metrics from. + ''; + + refresh_interval = mkDefOpt types.str "60s" '' + The time after which the droplets are refreshed. + ''; + }; + + mkDockerSdConfigModule = extraOptions: + mkSdConfigModule ({ + host = mkOption { + type = types.str; + description = '' + Address of the Docker daemon. + ''; + }; + + port = mkDefOpt types.port "80" '' + The port to scrape metrics from, when `role` is nodes, and for discovered + tasks and services that don't have published ports. + ''; + + filters = + mkOpt + (types.listOf (types.submodule { + options = { + name = mkOption { + type = types.str; + description = '' + Name of the filter. The available filters are listed in the upstream documentation: + Services: + Tasks: + Nodes: + ''; + }; + values = mkOption { + type = types.str; + description = '' + Value for the filter. + ''; + }; + }; + })) '' + Optional filters to limit the discovery process to a subset of available resources. + ''; + + refresh_interval = mkDefOpt types.str "60s" '' + The time after which the containers are refreshed. + ''; + } + // extraOptions); + + promTypes.docker_sd_config = mkDockerSdConfigModule { + host_networking_host = mkDefOpt types.str "localhost" '' + The host to use if the container is in host networking mode. + ''; + }; + + promTypes.dockerswarm_sd_config = mkDockerSdConfigModule { + role = mkOption { + type = types.enum ["services" "tasks" "nodes"]; + description = '' + Role of the targets to retrieve. Must be `services`, `tasks`, or `nodes`. + ''; + }; + }; + + promTypes.dns_sd_config = types.submodule { + options = { + names = mkOption { + type = types.listOf types.str; + description = '' + A list of DNS SRV record names to be queried. + ''; + }; + + type = mkDefOpt (types.enum ["SRV" "A" "AAAA"]) "SRV" '' + The type of DNS query to perform. One of SRV, A, or AAAA. + ''; + + port = mkOpt types.port '' + The port number used if the query type is not SRV. + ''; + + refresh_interval = mkDefOpt types.str "30s" '' + The time after which the provided names are refreshed. + ''; + }; + }; + + promTypes.ec2_sd_config = types.submodule { + options = { + region = mkOption { + type = types.str; + description = '' + The AWS Region. If blank, the region from the instance metadata is used. + ''; + }; + endpoint = mkOpt types.str '' + Custom endpoint to be used. + ''; + + access_key = mkOpt types.str '' + The AWS API key id. If blank, the environment variable + `AWS_ACCESS_KEY_ID` is used. + ''; + + secret_key = mkOpt types.str '' + The AWS API key secret. If blank, the environment variable + `AWS_SECRET_ACCESS_KEY` is used. + ''; + + profile = mkOpt types.str '' + Named AWS profile used to connect to the API. + ''; + + role_arn = mkOpt types.str '' + AWS Role ARN, an alternative to using AWS API keys. + ''; + + refresh_interval = mkDefOpt types.str "60s" '' + Refresh interval to re-read the instance list. + ''; + + port = mkDefOpt types.port "80" '' + The port to scrape metrics from. If using the public IP + address, this must instead be specified in the relabeling + rule. + ''; + + filters = + mkOpt + (types.listOf (types.submodule { + options = { + name = mkOption { + type = types.str; + description = '' + See [this list](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_DescribeInstances.html) + for the available filters. + ''; + }; + + values = mkOption { + type = types.listOf types.str; + default = []; + description = '' + Value of the filter. + ''; + }; + }; + })) '' + Filters can be used optionally to filter the instance list by other criteria. + ''; + }; + }; + + promTypes.eureka_sd_config = mkSdConfigModule { + server = mkOption { + type = types.str; + description = '' + The URL to connect to the Eureka server. + ''; + }; + }; + + promTypes.file_sd_config = types.submodule { + options = { + files = mkOption { + type = types.listOf types.str; + description = '' + Patterns for files from which target groups are extracted. Refer + to the Prometheus documentation for permitted filename patterns + and formats. + ''; + }; + + refresh_interval = mkDefOpt types.str "5m" '' + Refresh interval to re-read the files. + ''; + }; + }; + + promTypes.gce_sd_config = types.submodule { + options = { + # Use `mkOption` instead of `mkOpt` for project and zone because they are + # required configuration values for `gce_sd_config`. + project = mkOption { + type = types.str; + description = '' + The GCP Project. + ''; + }; + + zone = mkOption { + type = types.str; + description = '' + The zone of the scrape targets. If you need multiple zones use multiple + gce_sd_configs. + ''; + }; + + filter = mkOpt types.str '' + Filter can be used optionally to filter the instance list by other + criteria Syntax of this filter string is described here in the filter + query parameter section: . + ''; + + refresh_interval = mkDefOpt types.str "60s" '' + Refresh interval to re-read the cloud instance list. + ''; + + port = mkDefOpt types.port "80" '' + The port to scrape metrics from. If using the public IP address, this + must instead be specified in the relabeling rule. + ''; + + tag_separator = mkDefOpt types.str "," '' + The tag separator used to separate concatenated GCE instance network tags. + + See the GCP documentation on network tags for more information: + + ''; + }; + }; + + promTypes.hetzner_sd_config = mkSdConfigModule { + role = mkOption { + type = types.enum ["robot" "hcloud"]; + description = '' + The Hetzner role of entities that should be discovered. + One of `robot` or `hcloud`. + ''; + }; + + port = mkDefOpt types.port "80" '' + The port to scrape metrics from. + ''; + + refresh_interval = mkDefOpt types.str "60s" '' + The time after which the servers are refreshed. + ''; + }; + + promTypes.http_sd_config = types.submodule { + options = { + url = mkOption { + type = types.str; + description = '' + URL from which the targets are fetched. + ''; + }; + + refresh_interval = mkDefOpt types.str "60s" '' + Refresh interval to re-query the endpoint. + ''; + + basic_auth = mkOpt promTypes.basic_auth '' + Authentication information used to authenticate to the API server. + password and password_file are mutually exclusive. + ''; + + proxy_url = mkOpt types.str '' + Optional proxy URL. + ''; + + follow_redirects = mkDefOpt types.bool "true" '' + Configure whether HTTP requests follow HTTP 3xx redirects. + ''; + + tls_config = mkOpt promTypes.tls_config '' + Configures the scrape request's TLS settings. + ''; + }; + }; + + promTypes.kubernetes_sd_config = mkSdConfigModule { + api_server = mkOpt types.str '' + The API server addresses. If left empty, Prometheus is assumed to run inside + of the cluster and will discover API servers automatically and use the pod's + CA certificate and bearer token file at /var/run/secrets/kubernetes.io/serviceaccount/. + ''; + + role = mkOption { + type = types.enum ["endpoints" "service" "pod" "node" "ingress"]; + description = '' + The Kubernetes role of entities that should be discovered. + One of endpoints, service, pod, node, or ingress. + ''; + }; + + kubeconfig_file = mkOpt types.str '' + Optional path to a kubeconfig file. + Note that api_server and kube_config are mutually exclusive. + ''; + + namespaces = + mkOpt + ( + types.submodule { + options = { + names = mkOpt (types.listOf types.str) '' + Namespace name. + ''; + }; + } + ) '' + Optional namespace discovery. If omitted, all namespaces are used. + ''; + + selectors = + mkOpt + ( + types.listOf ( + types.submodule { + options = { + role = mkOption { + type = types.str; + description = '' + Selector role + ''; + }; + + label = mkOpt types.str '' + Selector label + ''; + + field = mkOpt types.str '' + Selector field + ''; + }; + } + ) + ) '' + Optional label and field selectors to limit the discovery process to a subset of available resources. + See https://kubernetes.io/docs/concepts/overview/working-with-objects/field-selectors/ + and https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ to learn more about the possible + filters that can be used. Endpoints role supports pod, service and endpoints selectors, other roles + only support selectors matching the role itself (e.g. node role can only contain node selectors). + + Note: When making decision about using field/label selector make sure that this + is the best approach - it will prevent Prometheus from reusing single list/watch + for all scrape configs. This might result in a bigger load on the Kubernetes API, + because per each selector combination there will be additional LIST/WATCH. On the other hand, + if you just want to monitor small subset of pods in large cluster it's recommended to use selectors. + Decision, if selectors should be used or not depends on the particular situation. + ''; + }; + + promTypes.kuma_sd_config = mkSdConfigModule { + server = mkOption { + type = types.str; + description = '' + Address of the Kuma Control Plane's MADS xDS server. + ''; + }; + + refresh_interval = mkDefOpt types.str "30s" '' + The time to wait between polling update requests. + ''; + + fetch_timeout = mkDefOpt types.str "2m" '' + The time after which the monitoring assignments are refreshed. + ''; + }; + + promTypes.lightsail_sd_config = types.submodule { + options = { + region = mkOpt types.str '' + The AWS region. If blank, the region from the instance metadata is used. + ''; + + endpoint = mkOpt types.str '' + Custom endpoint to be used. + ''; + + access_key = mkOpt types.str '' + The AWS API keys. If blank, the environment variable `AWS_ACCESS_KEY_ID` is used. + ''; + + secret_key = mkOpt types.str '' + The AWS API keys. If blank, the environment variable `AWS_SECRET_ACCESS_KEY` is used. + ''; + + profile = mkOpt types.str '' + Named AWS profile used to connect to the API. + ''; + + role_arn = mkOpt types.str '' + AWS Role ARN, an alternative to using AWS API keys. + ''; + + refresh_interval = mkDefOpt types.str "60s" '' + Refresh interval to re-read the instance list. + ''; + + port = mkDefOpt types.port "80" '' + The port to scrape metrics from. If using the public IP address, this must + instead be specified in the relabeling rule. + ''; + }; + }; + + promTypes.linode_sd_config = mkSdConfigModule { + port = mkDefOpt types.port "80" '' + The port to scrape metrics from. + ''; + + tag_separator = mkDefOpt types.str "," '' + The string by which Linode Instance tags are joined into the tag label. + ''; + + refresh_interval = mkDefOpt types.str "60s" '' + The time after which the linode instances are refreshed. + ''; + }; + + promTypes.marathon_sd_config = mkSdConfigModule { + servers = mkOption { + type = types.listOf types.str; + description = '' + List of URLs to be used to contact Marathon servers. You need to provide at least one server URL. + ''; + }; + + refresh_interval = mkDefOpt types.str "30s" '' + Polling interval. + ''; + + auth_token = mkOpt types.str '' + Optional authentication information for token-based authentication: + + It is mutually exclusive with `auth_token_file` and other authentication mechanisms. + ''; + + auth_token_file = mkOpt types.str '' + Optional authentication information for token-based authentication: + + It is mutually exclusive with `auth_token` and other authentication mechanisms. + ''; + }; + + promTypes.nerve_sd_config = types.submodule { + options = { + servers = mkOption { + type = types.listOf types.str; + description = '' + The Zookeeper servers. + ''; + }; + + paths = mkOption { + type = types.listOf types.str; + description = '' + Paths can point to a single service, or the root of a tree of services. + ''; + }; + + timeout = mkDefOpt types.str "10s" '' + Timeout value. + ''; + }; + }; + + promTypes.openstack_sd_config = types.submodule { + options = let + userDescription = '' + username is required if using Identity V2 API. Consult with your provider's + control panel to discover your account's username. In Identity V3, either + userid or a combination of username and domain_id or domain_name are needed. + ''; + + domainDescription = '' + At most one of domain_id and domain_name must be provided if using username + with Identity V3. Otherwise, either are optional. + ''; + + projectDescription = '' + The project_id and project_name fields are optional for the Identity V2 API. + Some providers allow you to specify a project_name instead of the project_id. + Some require both. Your provider's authentication policies will determine + how these fields influence authentication. + ''; + + applicationDescription = '' + The application_credential_id or application_credential_name fields are + required if using an application credential to authenticate. Some providers + allow you to create an application credential to authenticate rather than a + password. + ''; + in { + role = mkOption { + type = types.str; + description = '' + The OpenStack role of entities that should be discovered. + ''; + }; + + region = mkOption { + type = types.str; + description = '' + The OpenStack Region. + ''; + }; + + identity_endpoint = mkOpt types.str '' + identity_endpoint specifies the HTTP endpoint that is required to work with + the Identity API of the appropriate version. While it's ultimately needed by + all of the identity services, it will often be populated by a provider-level + function. + ''; + + username = mkOpt types.str userDescription; + userid = mkOpt types.str userDescription; + + password = mkOpt types.str '' + password for the Identity V2 and V3 APIs. Consult with your provider's + control panel to discover your account's preferred method of authentication. + ''; + + domain_name = mkOpt types.str domainDescription; + domain_id = mkOpt types.str domainDescription; + + project_name = mkOpt types.str projectDescription; + project_id = mkOpt types.str projectDescription; + + application_credential_name = mkOpt types.str applicationDescription; + application_credential_id = mkOpt types.str applicationDescription; + + application_credential_secret = mkOpt types.str '' + The application_credential_secret field is required if using an application + credential to authenticate. + ''; + + all_tenants = mkDefOpt types.bool "false" '' + Whether the service discovery should list all instances for all projects. + It is only relevant for the 'instance' role and usually requires admin permissions. + ''; + + refresh_interval = mkDefOpt types.str "60s" '' + Refresh interval to re-read the instance list. + ''; + + port = mkDefOpt types.port "80" '' + The port to scrape metrics from. If using the public IP address, this must + instead be specified in the relabeling rule. + ''; + + availability = mkDefOpt (types.enum ["public" "admin" "internal"]) "public" '' + The availability of the endpoint to connect to. Must be one of public, admin or internal. + ''; + + tls_config = mkOpt promTypes.tls_config '' + TLS configuration. + ''; + }; + }; + + promTypes.puppetdb_sd_config = mkSdConfigModule { + url = mkOption { + type = types.str; + description = '' + The URL of the PuppetDB root query endpoint. + ''; + }; + + query = mkOption { + type = types.str; + description = '' + Puppet Query Language (PQL) query. Only resources are supported. + https://puppet.com/docs/puppetdb/latest/api/query/v4/pql.html + ''; + }; + + include_parameters = mkDefOpt types.bool "false" '' + Whether to include the parameters as meta labels. + Due to the differences between parameter types and Prometheus labels, + some parameters might not be rendered. The format of the parameters might + also change in future releases. + + Note: Enabling this exposes parameters in the Prometheus UI and API. Make sure + that you don't have secrets exposed as parameters if you enable this. + ''; + + refresh_interval = mkDefOpt types.str "60s" '' + Refresh interval to re-read the resources list. + ''; + + port = mkDefOpt types.port "80" '' + The port to scrape metrics from. + ''; + }; + + promTypes.scaleway_sd_config = types.submodule { + options = { + access_key = mkOption { + type = types.str; + description = '' + Access key to use. https://console.scaleway.com/project/credentials + ''; + }; + + secret_key = mkOpt types.str '' + Secret key to use when listing targets. https://console.scaleway.com/project/credentials + It is mutually exclusive with `secret_key_file`. + ''; + + secret_key_file = mkOpt types.str '' + Sets the secret key with the credentials read from the configured file. + It is mutually exclusive with `secret_key`. + ''; + + project_id = mkOption { + type = types.str; + description = '' + Project ID of the targets. + ''; + }; + + role = mkOption { + type = types.enum ["instance" "baremetal"]; + description = '' + Role of the targets to retrieve. Must be `instance` or `baremetal`. + ''; + }; + + port = mkDefOpt types.port "80" '' + The port to scrape metrics from. + ''; + + api_url = mkDefOpt types.str "https://api.scaleway.com" '' + API URL to use when doing the server listing requests. + ''; + + zone = mkDefOpt types.str "fr-par-1" '' + Zone is the availability zone of your targets (e.g. fr-par-1). + ''; + + name_filter = mkOpt types.str '' + Specify a name filter (works as a LIKE) to apply on the server listing request. + ''; + + tags_filter = mkOpt (types.listOf types.str) '' + Specify a tag filter (a server needs to have all defined tags to be listed) to apply on the server listing request. + ''; + + refresh_interval = mkDefOpt types.str "60s" '' + Refresh interval to re-read the managed targets list. + ''; + + proxy_url = mkOpt types.str '' + Optional proxy URL. + ''; + + follow_redirects = mkDefOpt types.bool "true" '' + Configure whether HTTP requests follow HTTP 3xx redirects. + ''; + + tls_config = mkOpt promTypes.tls_config '' + TLS configuration. + ''; + }; + }; + + # These are exactly the same. + promTypes.serverset_sd_config = promTypes.nerve_sd_config; + + promTypes.triton_sd_config = types.submodule { + options = { + account = mkOption { + type = types.str; + description = '' + The account to use for discovering new targets. + ''; + }; + + role = mkDefOpt (types.enum ["container" "cn"]) "container" '' + The type of targets to discover, can be set to: + - "container" to discover virtual machines (SmartOS zones, lx/KVM/bhyve branded zones) running on Triton + - "cn" to discover compute nodes (servers/global zones) making up the Triton infrastructure + ''; + + dns_suffix = mkOption { + type = types.str; + description = '' + The DNS suffix which should be applied to target. + ''; + }; + + endpoint = mkOption { + type = types.str; + description = '' + The Triton discovery endpoint (e.g. `cmon.us-east-3b.triton.zone`). This is + often the same value as dns_suffix. + ''; + }; + + groups = mkOpt (types.listOf types.str) '' + A list of groups for which targets are retrieved, only supported when targeting the `container` role. + If omitted all containers owned by the requesting account are scraped. + ''; + + port = mkDefOpt types.port "9163" '' + The port to use for discovery and metric scraping. + ''; + + refresh_interval = mkDefOpt types.str "60s" '' + The interval which should be used for refreshing targets. + ''; + + version = mkDefOpt types.int "1" '' + The Triton discovery API version. + ''; + + tls_config = mkOpt promTypes.tls_config '' + TLS configuration. + ''; + }; + }; + + promTypes.uyuni_sd_config = mkSdConfigModule { + server = mkOption { + type = types.str; + description = '' + The URL to connect to the Uyuni server. + ''; + }; + + username = mkOption { + type = types.str; + description = '' + Credentials are used to authenticate the requests to Uyuni API. + ''; + }; + + password = mkOption { + type = types.str; + description = '' + Credentials are used to authenticate the requests to Uyuni API. + ''; + }; + + entitlement = mkDefOpt types.str "monitoring_entitled" '' + The entitlement string to filter eligible systems. + ''; + + separator = mkDefOpt types.str "," '' + The string by which Uyuni group names are joined into the groups label + ''; + + refresh_interval = mkDefOpt types.str "60s" '' + Refresh interval to re-read the managed targets list. + ''; + }; + + promTypes.static_config = types.submodule { + options = { + targets = mkOption { + type = types.listOf types.str; + description = '' + The targets specified by the target group. + ''; + }; + labels = mkOption { + type = types.attrsOf types.str; + default = {}; + description = '' + Labels assigned to all metrics scraped from the targets. + ''; + }; + }; + }; + + # + # Config types: relabling + # + + promTypes.relabel_config = types.submodule { + options = { + source_labels = mkOpt (types.listOf types.str) '' + The source labels select values from existing labels. Their content + is concatenated using the configured separator and matched against + the configured regular expression. + ''; + + separator = mkDefOpt types.str ";" '' + Separator placed between concatenated source label values. + ''; + + target_label = mkOpt types.str '' + Label to which the resulting value is written in a replace action. + It is mandatory for replace actions. + ''; + + regex = mkDefOpt types.str "(.*)" '' + Regular expression against which the extracted value is matched. + ''; + + modulus = mkOpt types.int '' + Modulus to take of the hash of the source label values. + ''; + + replacement = mkDefOpt types.str "$1" '' + Replacement value against which a regex replace is performed if the + regular expression matches. + ''; + + action = mkDefOpt (types.enum ["replace" "lowercase" "uppercase" "keep" "drop" "hashmod" "labelmap" "labeldrop" "labelkeep"]) "replace" '' + Action to perform based on regex matching. + ''; + }; + }; +in + promTypes diff --git a/hosts/idols-aquamarine/monitoring/victoriametrics.nix b/hosts/idols-aquamarine/monitoring/victoriametrics.nix new file mode 100644 index 00000000..b068e0c1 --- /dev/null +++ b/hosts/idols-aquamarine/monitoring/victoriametrics.nix @@ -0,0 +1,132 @@ +{ + lib, + myvars, + ... +}: { + # Workaround for victoriametrics to store data in another place + # https://www.freedesktop.org/software/systemd/man/latest/tmpfiles.d.html#Type + systemd.tmpfiles.rules = [ + "D /data/apps/victoriametrics 0751 victoriametrics victoriametrics - -" + "L+ /var/lib/victoriametrics - - - - /data/apps/victoriametrics" + ]; + + # https://victoriametrics.io/docs/victoriametrics/latest/configuration/configuration/ + services.my-victoriametrics = { + enable = true; + listenAddress = "127.0.0.1"; + port = 9090; + retentionTime = "30d"; + + extraFlags = [ + # Allowed percent of system memory VictoriaMetrics caches may occupy. + "-memory.allowedPercent=50" + ]; + # Directory below /var/lib to store victoriametrics metrics data. + stateDir = "victoriametrics"; + + # specifies a set of targets and parameters describing how to scrape metrics from them. + # https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config + scrapeConfigs = + [ + # --- Homelab Applications --- # + + { + job_name = "dnsmasq-exporter"; + scrape_interval = "30s"; + metrics_path = "/metrics"; + static_configs = [ + { + targets = ["${myvars.networking.hostsAddr.suzi.ipv4}:9153"]; + labels.type = "app"; + labels.app = "dnsmasq"; + labels.host = "suzi"; + } + ]; + } + + { + job_name = "v2ray-exporter"; + scrape_interval = "30s"; + metrics_path = "/metrics"; + static_configs = [ + { + targets = ["${myvars.networking.hostsAddr.aquamarine.ipv4}:9153"]; + labels.type = "app"; + labels.app = "v2ray"; + labels.host = "aquamarine"; + } + ]; + } + { + job_name = "postgres-exporter"; + scrape_interval = "30s"; + metrics_path = "/metrics"; + static_configs = [ + { + targets = ["${myvars.networking.hostsAddr.aquamarine.ipv4}:9187"]; + labels.type = "app"; + labels.app = "postgresql"; + labels.host = "aquamarine"; + } + ]; + } + { + job_name = "sftpgo-embedded-exporter"; + scrape_interval = "30s"; + metrics_path = "/metrics"; + static_configs = [ + { + targets = ["${myvars.networking.hostsAddr.aquamarine.ipv4}:10000"]; + labels.type = "app"; + labels.app = "sftpgo"; + labels.host = "aquamarine"; + } + ]; + } + ] + # --- Hosts --- # + ++ ( + lib.attrsets.foldlAttrs + (acc: hostname: addr: + acc + ++ [ + { + job_name = "node-exporter-${hostname}"; + scrape_interval = "30s"; + metrics_path = "/metrics"; + static_configs = [ + { + # All my NixOS hosts. + targets = ["${addr.ipv4}:9100"]; + labels.type = "node"; + labels.host = hostname; + } + ]; + } + ]) + [] + myvars.networking.hostsAddr + ); + }; + + services.vmalert = { + enable = true; + settings = { + "datasource.url" = "http://localhost:9090"; + "notifier.url" = ["http://localhost:9093"]; # alertmanager's api + + # Whether to disable long-lived connections to the datasource. + "datasource.disableKeepAlive" = true; + # Whether to avoid stripping sensitive information such as auth headers or passwords + # from URLs in log messages or UI and exported metrics. + "datasource.showURL" = false; + rule = [ + ./alert_rules/node-exporter.yml + ./alert_rules/kubestate-exporter.yml + ./alert_rules/etcd_embedded-exporter.yml + ./alert_rules/istio_embedded-exporter.yml + ./alert_rules/coredns_embedded-exporter.yml + ]; + }; + }; +} diff --git a/hosts/idols-aquamarine/prometheus/alert_rules/coredns_embedded-exporter.yml b/hosts/idols-aquamarine/prometheus/alert_rules/coredns_embedded-exporter.yml deleted file mode 100644 index 7eb89261..00000000 --- a/hosts/idols-aquamarine/prometheus/alert_rules/coredns_embedded-exporter.yml +++ /dev/null @@ -1,13 +0,0 @@ -groups: - - name: EmbeddedExporter - - rules: - - alert: CorednsPanicCount - expr: "increase(coredns_panics_total[1m]) > 0" - for: 0m - labels: - severity: critical - annotations: - summary: CoreDNS Panic Count (instance {{ $labels.instance }}) - description: - "Number of CoreDNS panics encountered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/hosts/idols-aquamarine/prometheus/default.nix b/hosts/idols-aquamarine/prometheus/default.nix deleted file mode 100644 index 9ba1a7c3..00000000 --- a/hosts/idols-aquamarine/prometheus/default.nix +++ /dev/null @@ -1,192 +0,0 @@ -{ - lib, - config, - myvars, - ... -}: { - # Workaround for prometheus to store data in another place - # https://www.freedesktop.org/software/systemd/man/latest/tmpfiles.d.html#Type - systemd.tmpfiles.rules = [ - "D /data/apps/prometheus2 0751 prometheus prometheus - -" - "L+ /var/lib/prometheus2 - - - - /data/apps/prometheus2" - ]; - - # https://prometheus.io/docs/prometheus/latest/configuration/configuration/ - services.prometheus = { - enable = true; - checkConfig = true; - listenAddress = "127.0.0.1"; - port = 9090; - webExternalUrl = "http://prometheus.writefor.fun"; - - extraFlags = [ - "--storage.tsdb.retention.time=30d" - # https://prometheus.io/docs/prometheus/latest/storage/#remote-storage-integrations - "--web.enable-remote-write-receiver" - ]; - # Directory below /var/lib to store Prometheus metrics data. - stateDir = "prometheus2"; - - # Reload prometheus when configuration file changes (instead of restart). - enableReload = true; - # https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_read - # remoteRead = []; - - # Rules are read from these files. - # https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/ - # - # Prometheus supports two types of rules which may be configured - # and then evaluated at regular intervals: - # 1. Recording rules - # Recording rules allow you to precompute frequently needed or computationally - # expensive expressions and save their result as a new set of time series. - # Querying the precomputed result will then often be much faster than executing the original expression. - # This is especially useful for dashboards, which need to query the same expression repeatedly every time they refresh. - # 2. Alerting rules - # Alerting rules allow you to define alert conditions based on Prometheus expression language expressions - # and to send notifications about firing alerts to an external service. - ruleFiles = [ - ./alert_rules/node-exporter.yml - ./alert_rules/kubestate-exporter.yml - ./alert_rules/etcd_embedded-exporter.yml - ./alert_rules/istio_embedded-exporter.yml - ./alert_rules/coredns_embedded-exporter.yml - - # ./recording_rules.yml - ]; - - # specifies a set of targets and parameters describing how to scrape metrics from them. - # https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config - scrapeConfigs = - [ - # --- Homelab Applications --- # - - { - job_name = "dnsmasq-exporter"; - scrape_interval = "30s"; - metrics_path = "/metrics"; - static_configs = [ - { - targets = ["${myvars.networking.hostsAddr.suzi.ipv4}:9153"]; - labels.type = "app"; - labels.app = "dnsmasq"; - labels.host = "suzi"; - } - ]; - } - - { - job_name = "v2ray-exporter"; - scrape_interval = "30s"; - metrics_path = "/metrics"; - static_configs = [ - { - targets = ["${myvars.networking.hostsAddr.aquamarine.ipv4}:9153"]; - labels.type = "app"; - labels.app = "v2ray"; - labels.host = "aquamarine"; - } - ]; - } - { - job_name = "postgres-exporter"; - scrape_interval = "30s"; - metrics_path = "/metrics"; - static_configs = [ - { - targets = ["${myvars.networking.hostsAddr.aquamarine.ipv4}:9187"]; - labels.type = "app"; - labels.app = "postgresql"; - labels.host = "aquamarine"; - } - ]; - } - { - job_name = "sftpgo-embedded-exporter"; - scrape_interval = "30s"; - metrics_path = "/metrics"; - static_configs = [ - { - targets = ["${myvars.networking.hostsAddr.aquamarine.ipv4}:10000"]; - labels.type = "app"; - labels.app = "sftpgo"; - labels.host = "aquamarine"; - } - ]; - } - ] - # --- Hosts --- # - ++ ( - lib.attrsets.foldlAttrs - (acc: hostname: addr: - acc - ++ [ - { - job_name = "node-exporter-${hostname}"; - scrape_interval = "30s"; - metrics_path = "/metrics"; - static_configs = [ - { - # All my NixOS hosts. - targets = ["${addr.ipv4}:9100"]; - labels.type = "node"; - labels.host = hostname; - } - ]; - } - ]) - [] - myvars.networking.hostsAddr - ); - - # specifies Alertmanager instances the Prometheus server sends alerts to - # https://prometheus.io/docs/prometheus/latest/configuration/configuration/#alertmanager_config - alertmanagers = [{static_configs = [{targets = ["localhost:9093"];}];}]; - }; - - services.prometheus.alertmanager = { - enable = true; - listenAddress = "127.0.0.1"; - port = 9093; - webExternalUrl = "http://alertmanager.writefor.fun"; - logLevel = "info"; - - environmentFile = config.age.secrets."alertmanager.env".path; - configuration = { - global = { - # The smarthost and SMTP sender used for mail notifications. - smtp_smarthost = "smtp.qq.com:465"; - smtp_from = "$SMTP_SENDER_EMAIL"; - smtp_auth_username = "$SMTP_AUTH_USERNAME"; - smtp_auth_password = "$SMTP_AUTH_PASSWORD"; - # smtp.qq.com:465 support SSL only, so we need to disable TLS here. - # https://service.mail.qq.com/detail/0/310 - smtp_require_tls = false; - }; - route = { - receiver = "default"; - routes = [ - { - group_by = ["host"]; - group_wait = "5m"; - group_interval = "5m"; - repeat_interval = "4h"; - receiver = "default"; - } - ]; - }; - receivers = [ - { - name = "default"; - email_configs = [ - { - to = "ryan4yin@linux.com"; - # Whether to notify about resolved alerts. - send_resolved = true; - } - ]; - } - ]; - }; - }; -}