From 2961a9591a20303d3f080a462a5609f6df90790b Mon Sep 17 00:00:00 2001
From: Ryan Yin
Date: Tue, 23 Sep 2025 14:31:48 +0800
Subject: [PATCH] feat: add recording rules

---
 hosts/idols-aquamarine/monitoring/README.md      |   8 +-
 hosts/idols-aquamarine/monitoring/alert.nix      |   1 +
 .../monitoring/alert_rules/README.md             |  24 +++
 .../monitoring/alert_rules/general.yml           |  57 ++++++
 .../monitoring/alert_rules/kubernetes.yml        | 120 ++++++++++++
 .../monitoring/recording_rules/README.md         |  20 ++
 .../monitoring/recording_rules/k8s.yml           | 149 +++++++++++++++
 .../recording_rules/node-exporter.yml            | 128 ++++++++++++++
 8 files changed, 504 insertions(+), 3 deletions(-)
 create mode 100644 hosts/idols-aquamarine/monitoring/alert_rules/README.md
 create mode 100644 hosts/idols-aquamarine/monitoring/alert_rules/general.yml
 create mode 100644 hosts/idols-aquamarine/monitoring/alert_rules/kubernetes.yml
 create mode 100644 hosts/idols-aquamarine/monitoring/recording_rules/README.md
 create mode 100644 hosts/idols-aquamarine/monitoring/recording_rules/k8s.yml
 create mode 100644 hosts/idols-aquamarine/monitoring/recording_rules/node-exporter.yml

diff --git a/hosts/idols-aquamarine/monitoring/README.md b/hosts/idols-aquamarine/monitoring/README.md
index e7c050c5..d4ed16d2 100644
--- a/hosts/idols-aquamarine/monitoring/README.md
+++ b/hosts/idols-aquamarine/monitoring/README.md
@@ -1,6 +1,8 @@
 # Monitoring & Alerting
 
-## Alert Rules
+## Alert Rules & Recording Rules
 
-- [awesome-prometheus-alerts](https://github.com/samber/awesome-prometheus-alerts): Collection of
-  Prometheus alerting rules
+- [awesome-prometheus-alerts](https://github.com/samber/awesome-prometheus-alerts)
+  - Collection of Prometheus alerting rules.
+- [victoria-metrics-k8s-stack/files/rules](https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-k8s-stack/files/rules/generated)
+  - Alert rules and recording rules used by kube-prometheus-stack.
diff --git a/hosts/idols-aquamarine/monitoring/alert.nix b/hosts/idols-aquamarine/monitoring/alert.nix
index 3a5c3d36..0013ac1b 100644
--- a/hosts/idols-aquamarine/monitoring/alert.nix
+++ b/hosts/idols-aquamarine/monitoring/alert.nix
@@ -16,6 +16,7 @@
       # Path to the files with alerting and/or recording rules.
       rule = [
         "${./alert_rules}/*.yml"
+        "${./recording_rules}/*.yml"
       ];
       # https://docs.victoriametrics.com/victoriametrics/vmalert/#link-to-alert-source
       # Set these two args to generate the correct `.GeneratorURL`
diff --git a/hosts/idols-aquamarine/monitoring/alert_rules/README.md b/hosts/idols-aquamarine/monitoring/alert_rules/README.md
new file mode 100644
index 00000000..40e55cb1
--- /dev/null
+++ b/hosts/idols-aquamarine/monitoring/alert_rules/README.md
@@ -0,0 +1,24 @@
+# Alert Rules
+
+Alert rules define the conditions, scope, and actions for generating alerts from monitored
+signals such as metrics or logs. When a rule's conditions are met for a resource within its
+scope, the system fires an alert instance for that condition. Each rule specifies the data to
+monitor, the trigger threshold, and the resulting actions, such as notifying specific
+receivers or running automated tasks.
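+
+A minimal sketch of such a rule (illustrative only: the `InstanceDown` name, the `up == 0`
+expression, and the 5m threshold are assumptions, not one of the rules in this directory):
+
+```yaml
+groups:
+  - name: example.rules
+    rules:
+      - alert: InstanceDown
+        # Illustrative only: fire once a scrape target has been unreachable for 5 minutes.
+        expr: up == 0
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Instance {{ $labels.instance }} is down."
+```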
diff --git a/hosts/idols-aquamarine/monitoring/alert_rules/general.yml b/hosts/idols-aquamarine/monitoring/alert_rules/general.yml
new file mode 100644
index 00000000..61789d24
--- /dev/null
+++ b/hosts/idols-aquamarine/monitoring/alert_rules/general.yml
@@ -0,0 +1,57 @@
+groups:
+  - name: general.rules
+    rules:
+      - alert: TargetDown
+        annotations:
+          description:
+            '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in
+            {{ $labels.namespace }} namespace are down.'
+          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
+          summary: One or more targets are unreachable.
+        expr:
+          100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) BY (cluster, job,
+          namespace, service)) > 10
+        for: 10m
+        labels:
+          severity: warning
+      - alert: Watchdog
+        annotations:
+          description: 'This is an alert meant to ensure that the entire alerting pipeline is
+            functional.
+
+            This alert is always firing, therefore it should always be firing in Alertmanager
+
+            and always fire against a receiver. There are integrations with various notification
+
+            mechanisms that send a notification when this alert is not firing. For example the
+
+            "DeadMansSnitch" integration in PagerDuty.'
+          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
+          summary:
+            An alert that should always be firing to certify that Alertmanager is working properly.
+        expr: vector(1)
+        labels:
+          severity: none
+      - alert: InfoInhibitor
+        annotations:
+          description: 'This is an alert that is used to inhibit info alerts.
+
+            By themselves, the info-level alerts are sometimes very noisy, but they are relevant
+            when combined with
+
+            other alerts.
+
+            This alert fires whenever there''s a severity="info" alert, and stops firing when
+            another alert with a
+
+            severity of ''warning'' or ''critical'' starts firing on the same namespace.
+
+            This alert should be routed to a null receiver and configured to inhibit alerts with
+            severity="info".'
+          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor
+          summary: Info-level alert inhibition.
+        expr:
+          ALERTS{severity = "info"} == 1 unless on(namespace) ALERTS{alertname != "InfoInhibitor",
+          severity =~ "warning|critical", alertstate="firing"} == 1
+        labels:
+          severity: none
diff --git a/hosts/idols-aquamarine/monitoring/alert_rules/kubernetes.yml b/hosts/idols-aquamarine/monitoring/alert_rules/kubernetes.yml
new file mode 100644
index 00000000..327171e6
--- /dev/null
+++ b/hosts/idols-aquamarine/monitoring/alert_rules/kubernetes.yml
@@ -0,0 +1,120 @@
+groups:
+  - name: kubernetes-resources
+    rules:
+      - alert: KubeCPUOvercommit
+        annotations:
+          description:
+            Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Pods by {{
+            $value }} CPU shares and cannot tolerate node failure.
+          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
+          summary: Cluster has overcommitted CPU resource requests.
+        expr: |-
+          sum(namespace_cpu:kube_pod_container_resource_requests:sum{job="kube-state-metrics",}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
+          and
+          (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
+        for: 10m
+        labels:
+          severity: warning
+      - alert: KubeMemoryOvercommit
+        annotations:
+          description:
+            Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Pods by {{
+            $value | humanize }} bytes and cannot tolerate node failure.
+          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
+          summary: Cluster has overcommitted memory resource requests.
+        expr: |-
+          sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
+          and
+          (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
+        for: 10m
+        labels:
+          severity: warning
+      - alert: KubeCPUQuotaOvercommit
+        annotations:
+          description:
+            Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Namespaces.
+          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit
+          summary: Cluster has overcommitted CPU resource requests.
+        expr: |-
+          sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by (cluster)
+          /
+          sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by (cluster)
+          > 1.5
+        for: 5m
+        labels:
+          severity: warning
+      - alert: KubeMemoryQuotaOvercommit
+        annotations:
+          description:
+            Cluster {{ $labels.cluster }} has overcommitted memory resource requests for
+            Namespaces.
+          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit
+          summary: Cluster has overcommitted memory resource requests.
+        expr: |-
+          sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by (cluster)
+          /
+          sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)
+          > 1.5
+        for: 5m
+        labels:
+          severity: warning
+      - alert: KubeQuotaAlmostFull
+        annotations:
+          description:
+            Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{
+            $labels.resource }} quota.
+          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull
+          summary: Namespace quota is going to be full.
+        expr: |-
+          kube_resourcequota{job="kube-state-metrics", type="used"}
+            / ignoring(instance, job, type)
+          (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
+          > 0.9 < 1
+        for: 15m
+        labels:
+          severity: info
+      - alert: KubeQuotaFullyUsed
+        annotations:
+          description:
+            Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{
+            $labels.resource }} quota.
+          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused
+          summary: Namespace quota is fully used.
+        expr: |-
+          kube_resourcequota{job="kube-state-metrics", type="used"}
+            / ignoring(instance, job, type)
+          (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
+          == 1
+        for: 15m
+        labels:
+          severity: info
+      - alert: KubeQuotaExceeded
+        annotations:
+          description:
+            Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{
+            $labels.resource }} quota.
+          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded
+          summary: Namespace quota has exceeded the limits.
+        expr: |-
+          kube_resourcequota{job="kube-state-metrics", type="used"}
+            / ignoring(instance, job, type)
+          (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
+          > 1
+        for: 15m
+        labels:
+          severity: warning
+      - alert: CPUThrottlingHigh
+        annotations:
+          description:
+            "{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace
+            }} for container {{ $labels.container }} in pod {{ $labels.pod }}."
+          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh
+          summary: Processes experience elevated CPU throttling.
+        expr: |-
+          sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (cluster, container, pod, namespace)
+          /
+          sum(increase(container_cpu_cfs_periods_total{}[5m])) by (cluster, container, pod, namespace)
+          > ( 25 / 100 )
+        for: 15m
+        labels:
+          severity: info
diff --git a/hosts/idols-aquamarine/monitoring/recording_rules/README.md b/hosts/idols-aquamarine/monitoring/recording_rules/README.md
new file mode 100644
index 00000000..9af3baf9
--- /dev/null
+++ b/hosts/idols-aquamarine/monitoring/recording_rules/README.md
@@ -0,0 +1,20 @@
+# Recording Rules
+
+Recording rules are predefined queries, often complex or computationally expensive, that are
+evaluated periodically and whose results are written back as new, pre-computed time series.
+
+Dashboards and alerts can then query the cheap recorded series instead of re-evaluating the
+original expression, which speeds up queries and reduces load on the metrics backend.
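+
+For illustration, a minimal recording rule might look like this (a sketch; the rule name and
+expression are assumptions, though they mirror the node-exporter rules in this directory):
+
+```yaml
+groups:
+  - name: example.rules
+    rules:
+      # Illustrative: pre-compute the per-instance CPU usage rate once per
+      # evaluation interval; dashboards query the recorded series by name.
+      - record: instance:node_cpu:rate:sum
+        expr: sum(rate(node_cpu_seconds_total{mode!="idle"}[3m])) by (instance)
+```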
diff --git a/hosts/idols-aquamarine/monitoring/recording_rules/k8s.yml b/hosts/idols-aquamarine/monitoring/recording_rules/k8s.yml
new file mode 100644
index 00000000..ca3d4a4a
--- /dev/null
+++ b/hosts/idols-aquamarine/monitoring/recording_rules/k8s.yml
@@ -0,0 +1,149 @@
+groups:
+  - name: k8s.rules
+    rules:
+      - expr: |-
+          sum by (cluster, namespace, pod, container) (
+            irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
+          ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
+            1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
+          )
+        record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
+      - expr: |-
+          container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
+          * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
+            max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
+          )
+        record: node_namespace_pod_container:container_memory_working_set_bytes
+      - expr: |-
+          container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
+          * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
+            max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
+          )
+        record: node_namespace_pod_container:container_memory_rss
+      - expr: |-
+          container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
+          * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
+            max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
+          )
+        record: node_namespace_pod_container:container_memory_cache
+      - expr: |-
+          container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
+          * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
+            max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
+          )
+        record: node_namespace_pod_container:container_memory_swap
+      - expr: |-
+          kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
+          group_left() max by (namespace, pod, cluster) (
+            (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
+          )
+        record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests
+      - expr: |-
+          sum by (namespace, cluster) (
+            sum by (namespace, pod, cluster) (
+              max by (namespace, pod, container, cluster) (
+                kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"}
+              ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
+                kube_pod_status_phase{phase=~"Pending|Running"} == 1
+              )
+            )
+          )
+        record: namespace_memory:kube_pod_container_resource_requests:sum
+      - expr: |-
+          kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
+          group_left() max by (namespace, pod, cluster) (
+            (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
+          )
+        record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests
+      - expr: |-
+          sum by (namespace, cluster) (
+            sum by (namespace, pod, cluster) (
+              max by (namespace, pod, container, cluster) (
+                kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"}
+              ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
+                kube_pod_status_phase{phase=~"Pending|Running"} == 1
+              )
+            )
+          )
+        record: namespace_cpu:kube_pod_container_resource_requests:sum
+      - expr: |-
+          kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
+          group_left() max by (namespace, pod, cluster) (
+            (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
+          )
+        record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits
+      - expr: |-
+          sum by (namespace, cluster) (
+            sum by (namespace, pod, cluster) (
+              max by (namespace, pod, container, cluster) (
+                kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}
+              ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
+                kube_pod_status_phase{phase=~"Pending|Running"} == 1
+              )
+            )
+          )
+        record: namespace_memory:kube_pod_container_resource_limits:sum
+      - expr: |-
+          kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
+          group_left() max by (namespace, pod, cluster) (
+            (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
+          )
+        record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits
+      - expr: |-
+          sum by (namespace, cluster) (
+            sum by (namespace, pod, cluster) (
+              max by (namespace, pod, container, cluster) (
+                kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}
+              ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
+                kube_pod_status_phase{phase=~"Pending|Running"} == 1
+              )
+            )
+          )
+        record: namespace_cpu:kube_pod_container_resource_limits:sum
+      - expr: |-
+          max by (cluster, namespace, workload, pod) (
+            label_replace(
+              label_replace(
+                kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
+                "replicaset", "$1", "owner_name", "(.*)"
+              ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (
+                1, max by (replicaset, namespace, owner_name) (
+                  kube_replicaset_owner{job="kube-state-metrics"}
+                )
+              ),
+              "workload", "$1", "owner_name", "(.*)"
+            )
+          )
+        labels:
+          workload_type: deployment
+        record: namespace_workload_pod:kube_pod_owner:relabel
+      - expr: |-
+          max by (cluster, namespace, workload, pod) (
+            label_replace(
+              kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
+              "workload", "$1", "owner_name", "(.*)"
+            )
+          )
+        labels:
+          workload_type: daemonset
+        record: namespace_workload_pod:kube_pod_owner:relabel
+      - expr: |-
+          max by (cluster, namespace, workload, pod) (
+            label_replace(
+              kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
+              "workload", "$1", "owner_name", "(.*)"
+            )
+          )
+        labels:
+          workload_type: statefulset
+        record: namespace_workload_pod:kube_pod_owner:relabel
+      - expr: |-
+          max by (cluster, namespace, workload, pod) (
+            label_replace(
+              kube_pod_owner{job="kube-state-metrics", owner_kind="Job"},
+              "workload", "$1", "owner_name", "(.*)"
+            )
+          )
+        labels:
+          workload_type: job
+        record: namespace_workload_pod:kube_pod_owner:relabel
diff --git a/hosts/idols-aquamarine/monitoring/recording_rules/node-exporter.yml b/hosts/idols-aquamarine/monitoring/recording_rules/node-exporter.yml
new file mode 100644
index 00000000..86d2933b
--- /dev/null
+++ b/hosts/idols-aquamarine/monitoring/recording_rules/node-exporter.yml
@@ -0,0 +1,128 @@
+groups:
+  - name: kube-prometheus-node-recording.rules
+    rules:
+      - expr:
+          sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY
+          (instance)
+        record: instance:node_cpu:rate:sum
+      - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
+        record: instance:node_network_receive_bytes:rate:sum
+      - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
+        record: instance:node_network_transmit_bytes:rate:sum
+      - expr:
+          sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT
+          (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance,
+          cpu)) BY (instance)
+        record: instance:node_cpu:ratio
+      - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
+        record: cluster:node_cpu:sum_rate5m
+      - expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
+        record: cluster:node_cpu:ratio
+
+  - name: node-exporter.rules
+    rules:
+      - expr: |-
+          count without (cpu, mode) (
+            node_cpu_seconds_total{job="node-exporter",mode="idle"}
+          )
+        record: instance:node_num_cpu:sum
+      - expr: |-
+          1 - avg without (cpu) (
+            sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m]))
+          )
+        record: instance:node_cpu_utilisation:rate5m
+      - expr: |-
+          (
+            node_load1{job="node-exporter"}
+          /
+            instance:node_num_cpu:sum{job="node-exporter"}
+          )
+        record: instance:node_load1_per_cpu:ratio
+      - expr: |-
+          1 - (
+            (
+              node_memory_MemAvailable_bytes{job="node-exporter"}
+              or
+              (
+                node_memory_Buffers_bytes{job="node-exporter"}
+                +
+                node_memory_Cached_bytes{job="node-exporter"}
+                +
+                node_memory_MemFree_bytes{job="node-exporter"}
+                +
+                node_memory_Slab_bytes{job="node-exporter"}
+              )
+            )
+          /
+            node_memory_MemTotal_bytes{job="node-exporter"}
+          )
+        record: instance:node_memory_utilisation:ratio
+      - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
+        record: instance:node_vmstat_pgmajfault:rate5m
+      - expr:
+          rate(node_disk_io_time_seconds_total{job="node-exporter",
+          device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
+        record: instance_device:node_disk_io_time_seconds:rate5m
+      - expr:
+          rate(node_disk_io_time_weighted_seconds_total{job="node-exporter",
+          device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
+        record: instance_device:node_disk_io_time_weighted_seconds:rate5m
+      - expr: |-
+          sum without (device) (
+            rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m])
+          )
+        record: instance:node_network_receive_bytes_excluding_lo:rate5m
+      - expr: |-
+          sum without (device) (
+            rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m])
+          )
+        record: instance:node_network_transmit_bytes_excluding_lo:rate5m
+      - expr: |-
+          sum without (device) (
+            rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m])
+          )
+        record: instance:node_network_receive_drop_excluding_lo:rate5m
+      - expr: |-
+          sum without (device) (
+            rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m])
+          )
+        record: instance:node_network_transmit_drop_excluding_lo:rate5m
+
+  - name: node.rules
+    rules:
+      - expr: |-
+          topk by(cluster, namespace, pod) (1,
+            max by (cluster, node, namespace, pod) (
+              label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
+          ))
+        record: "node_namespace_pod:kube_pod_info:"
+      - expr: |-
+          count by (cluster, node) (
+            node_cpu_seconds_total{mode="idle",job="node-exporter"}
+            * on (namespace, pod) group_left(node)
+            topk by(namespace, pod) (1, node_namespace_pod:kube_pod_info:)
+          )
+        record: node:node_num_cpu:sum
+      - expr: |-
+          sum(
+            node_memory_MemAvailable_bytes{job="node-exporter"} or
+            (
+              node_memory_Buffers_bytes{job="node-exporter"} +
+              node_memory_Cached_bytes{job="node-exporter"} +
+              node_memory_MemFree_bytes{job="node-exporter"} +
+              node_memory_Slab_bytes{job="node-exporter"}
+            )
+          ) by (cluster)
+        record: :node_memory_MemAvailable_bytes:sum
+      - expr: |-
+          avg by (cluster, node) (
+            sum without (mode) (
+              rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal",job="node-exporter"}[5m])
+            )
+          )
+        record: node:node_cpu_utilization:ratio_rate5m
+      - expr: |-
+          avg by (cluster) (
+            node:node_cpu_utilization:ratio_rate5m
+          )
+        record: cluster:node_cpu:ratio_rate5m