feat: Grafana Dashboards & VMAlert (#224)

* chore: grafana - remove useless dashboards
* fix: alertmanager - metrics
* fix: victoria-metrics - job filter
* feat: add recording rules
* fix: grafana - add or update uid for all dashboards
* fix: vmalert - remoteWrite
This commit is contained in:
Ryan Yin
2025-09-26 19:10:32 +08:00
committed by GitHub
27 changed files with 1835 additions and 4666 deletions
@@ -73,11 +73,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": true,
"colors": [
"#299c46",
"#7eb26d",
"#d44a3a"
],
"colors": ["#299c46", "#7eb26d", "#d44a3a"],
"datasource": "${DS_PROMETHEUS}",
"format": "none",
"gauge": {
@@ -156,11 +152,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
"datasource": "${DS_PROMETHEUS}",
"description": "start time of the process",
"format": "dateTimeFromNow",
@@ -239,11 +231,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"colors": ["rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)"],
"datasource": "${DS_PROMETHEUS}",
"format": "decbytes",
"gauge": {
@@ -322,11 +310,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"colors": ["rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)"],
"datasource": "${DS_PROMETHEUS}",
"format": "decbytes",
"gauge": {
@@ -405,11 +389,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"colors": ["rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)"],
"datasource": "${DS_PROMETHEUS}",
"format": "decbytes",
"gauge": {
@@ -488,11 +468,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
"datasource": "${DS_PROMETHEUS}",
"format": "none",
"gauge": {
@@ -864,11 +840,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
"datasource": "${DS_PROMETHEUS}",
"format": "bytes",
"gauge": {
@@ -945,11 +917,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
"datasource": "${DS_PROMETHEUS}",
"format": "bytes",
"gauge": {
@@ -1026,11 +994,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
"datasource": "${DS_PROMETHEUS}",
"format": "bytes",
"gauge": {
@@ -1107,11 +1071,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
"datasource": "${DS_PROMETHEUS}",
"format": "bytes",
"gauge": {
@@ -1189,11 +1149,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
"datasource": "${DS_PROMETHEUS}",
"decimals": 1,
"format": "bytes",
@@ -1271,11 +1227,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
"datasource": "${DS_PROMETHEUS}",
"format": "none",
"gauge": {
@@ -1352,11 +1304,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
"datasource": "${DS_PROMETHEUS}",
"format": "none",
"gauge": {
@@ -1433,11 +1381,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
"datasource": "${DS_PROMETHEUS}",
"format": "none",
"gauge": {
@@ -1514,11 +1458,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
"datasource": "${DS_PROMETHEUS}",
"format": "none",
"gauge": {
@@ -2944,11 +2884,7 @@
"refresh": "10s",
"schemaVersion": 19,
"style": "dark",
"tags": [
"postgres",
"db",
"stats"
],
"tags": ["postgres", "db", "stats"],
"templating": {
"list": [
{
@@ -3136,32 +3072,11 @@
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
"refresh_intervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"],
"time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"]
},
"timezone": "",
"title": "PostgreSQL Database",
"uid": "000000039",
"uid": "postgresql-database",
"version": 1
}
}
@@ -11139,6 +11139,6 @@
},
"timezone": "",
"title": "Alertmanager",
"uid": "eea-9_sik",
"uid": "alertmanager",
"version": 27
}
@@ -23262,7 +23262,7 @@
},
"timezone": "browser",
"title": "Node Exporter Full",
"uid": "rYdddlPWk",
"uid": "node-exporter-full",
"version": 87,
"weekStart": ""
}
@@ -853,19 +853,11 @@
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
]
"refresh_intervals": ["30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"]
},
"timezone": "",
"title": "Istio Wasm Extension Dashboard",
"uid": "istio-wasm-extension",
"version": 1,
"weekStart": ""
}
@@ -114,9 +114,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"mean"
],
"calcs": ["mean"],
"fields": "",
"values": false
},
@@ -196,9 +194,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"mean"
],
"calcs": ["mean"],
"fields": "",
"values": false
},
@@ -275,9 +271,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"mean"
],
"calcs": ["mean"],
"fields": "",
"values": false
},
@@ -354,9 +348,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"mean"
],
"calcs": ["mean"],
"fields": "",
"values": false
},
@@ -433,9 +425,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
@@ -511,9 +501,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
@@ -589,9 +577,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
@@ -667,9 +653,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
@@ -745,9 +729,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
@@ -823,9 +805,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
@@ -901,9 +881,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
@@ -979,9 +957,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
@@ -1329,9 +1305,7 @@
"footer": {
"countRows": false,
"fields": "",
"reducer": [
"sum"
],
"reducer": ["sum"],
"show": false
},
"showHeader": true
@@ -1466,9 +1440,7 @@
"cellHeight": "sm",
"footer": {
"show": false,
"reducer": [
"sum"
],
"reducer": ["sum"],
"countRows": false,
"fields": ""
}
@@ -1832,30 +1804,12 @@
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
"refresh_intervals": ["30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"],
"time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"]
},
"timezone": "browser",
"title": "Istio Mesh Dashboard",
"uid": "istio-mesh",
"version": 1,
"weekStart": ""
}
}
@@ -1574,30 +1574,12 @@
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
"refresh_intervals": ["30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"],
"time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"]
},
"timezone": "",
"title": "Istio Performance Dashboard",
"uid": "istio-performance",
"version": 1,
"weekStart": ""
}
@@ -123,9 +123,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
@@ -197,9 +195,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
@@ -398,9 +394,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"mean"
],
"calcs": ["mean"],
"fields": "",
"values": false
},
@@ -478,9 +472,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
@@ -552,9 +544,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
@@ -753,9 +743,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"mean"
],
"calcs": ["mean"],
"fields": "",
"values": false
},
@@ -3368,28 +3356,12 @@
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
"refresh_intervals": ["5m", "15m", "30m", "1h", "2h", "1d"],
"time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"]
},
"timezone": "",
"title": "Istio Service Dashboard",
"uid": "istio-service",
"version": 1,
"weekStart": ""
}
@@ -123,9 +123,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
@@ -206,9 +204,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"mean"
],
"calcs": ["mean"],
"fields": "",
"values": false
},
@@ -405,9 +401,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"mean"
],
"calcs": ["mean"],
"fields": "",
"values": false
},
@@ -485,9 +479,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"mean"
],
"calcs": ["mean"],
"fields": "",
"values": false
},
@@ -3040,28 +3032,12 @@
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
"refresh_intervals": ["5m", "15m", "30m", "1h", "2h", "1d"],
"time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"]
},
"timezone": "",
"title": "Istio Workload Dashboard",
"uid": "istio-workload",
"version": 1,
"weekStart": ""
}
File diff suppressed because it is too large Load Diff
@@ -1,458 +1,434 @@
{
"graphTooltip": 1,
"panels": [
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 1,
"panels": [ ],
"title": "Process",
"type": "row"
"graphTooltip": 1,
"panels": [
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "Version number of each running instance",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
}
}
},
"gridPos": {
"h": 8,
"w": 8,
"x": 0,
"y": 1
},
"id": 2,
"interval": "5s",
"options": {
"legend": {
"calcs": [
"last",
"max"
],
"displayMode": "table"
}
},
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(istio_build{component=\"ztunnel\"}) by (tag)",
"legendFormat": "Version ({{tag}})"
}
],
"title": "Ztunnel Versions",
"type": "timeseries"
"id": 1,
"panels": [],
"title": "Process",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "Memory usage of each running instance",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
},
"unit": "bytes"
}
},
"gridPos": {
"h": 8,
"w": 8,
"x": 8,
"y": 1
},
"id": 3,
"interval": "5s",
"options": {
"legend": {
"calcs": [
"last",
"max"
],
"displayMode": "table"
}
},
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (\n container_memory_working_set_bytes{container=\"istio-proxy\", pod=~\"ztunnel-.*\"}\n)",
"legendFormat": "Container ({{pod}})"
}
],
"title": "Memory Usage",
"type": "timeseries"
"description": "Version number of each running instance",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
}
}
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "CPU usage of each running instance",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
}
}
},
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"y": 1
},
"id": 4,
"interval": "5s",
"options": {
"legend": {
"calcs": [
"last",
"max"
],
"displayMode": "table"
}
},
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (\n irate(\n container_cpu_usage_seconds_total{container=\"istio-proxy\", pod=~\"ztunnel-.*\"}\n [$__rate_interval])\n)",
"legendFormat": "Container ({{pod}})"
}
],
"title": "CPU Usage",
"type": "timeseries"
"gridPos": {
"h": 8,
"w": 8,
"x": 0,
"y": 1
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 9
},
"id": 5,
"panels": [ ],
"title": "Network",
"type": "row"
"id": 2,
"interval": "5s",
"options": {
"legend": {
"calcs": ["last", "max"],
"displayMode": "table"
}
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "Connections opened and closed per instance",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
},
"unit": "cps"
}
},
"gridPos": {
"h": 8,
"w": 8,
"x": 0,
"y": 10
},
"id": 6,
"interval": "5s",
"options": {
"legend": {
"calcs": [
"last",
"max"
],
"displayMode": "table"
}
},
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (\n rate(\n istio_tcp_connections_opened_total{pod=~\"ztunnel-.*\"}\n [$__rate_interval])\n)",
"legendFormat": "Opened ({{pod}})"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "-sum by (pod) (\n rate(\n istio_tcp_connections_closed_total{pod=~\"ztunnel-.*\"}\n [$__rate_interval])\n)",
"legendFormat": "Closed ({{pod}})"
}
],
"title": "Connections",
"type": "timeseries"
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(istio_build{component=\"ztunnel\"}) by (tag)",
"legendFormat": "Version ({{tag}})"
}
],
"title": "Ztunnel Versions",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "Bytes sent and received per instance",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
},
"unit": "Bps"
}
},
"gridPos": {
"h": 8,
"w": 8,
"x": 8,
"y": 10
},
"id": 7,
"interval": "5s",
"options": {
"legend": {
"calcs": [
"last",
"max"
],
"displayMode": "table"
}
},
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (\n rate(\n istio_tcp_sent_bytes_total{pod=~\"ztunnel-.*\"}\n [$__rate_interval])\n)",
"legendFormat": "Sent ({{pod}})"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (\n rate(\n istio_tcp_received_bytes_total{pod=~\"ztunnel-.*\"}\n [$__rate_interval])\n)",
"legendFormat": "Received ({{pod}})"
}
],
"title": "Bytes Transmitted",
"type": "timeseries"
"description": "Memory usage of each running instance",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
},
"unit": "bytes"
}
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "DNS queries received per instance",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
},
"unit": "qps"
}
},
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"y": 10
},
"id": 8,
"interval": "5s",
"options": {
"legend": {
"calcs": [
"last",
"max"
],
"displayMode": "table"
}
},
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (\n rate(\n istio_dns_requests_total{pod=~\"ztunnel-.*\"}\n [$__rate_interval])\n)",
"legendFormat": "Request ({{pod}})"
}
],
"title": "DNS Request",
"type": "timeseries"
"gridPos": {
"h": 8,
"w": 8,
"x": 8,
"y": 1
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 18
},
"id": 9,
"panels": [ ],
"title": "Operations",
"type": "row"
"id": 3,
"interval": "5s",
"options": {
"legend": {
"calcs": ["last", "max"],
"displayMode": "table"
}
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "Count of XDS connection terminations.\nThis will typically spike every 30min for each instance.\n",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
}
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 19
},
"id": 10,
"interval": "5s",
"options": {
"legend": {
"calcs": [
"last",
"max"
],
"displayMode": "table"
}
},
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (\n rate(\n istio_xds_connection_terminations_total{pod=~\"ztunnel-.*\"}\n [$__rate_interval])\n)",
"legendFormat": "XDS Connection Terminations ({{pod}})"
}
],
"title": "XDS",
"type": "timeseries"
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (\n container_memory_working_set_bytes{container=\"istio-proxy\", pod=~\"ztunnel-.*\"}\n)",
"legendFormat": "Container ({{pod}})"
}
],
"title": "Memory Usage",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "CPU usage of each running instance",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
}
}
},
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"y": 1
},
"id": 4,
"interval": "5s",
"options": {
"legend": {
"calcs": ["last", "max"],
"displayMode": "table"
}
},
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (\n irate(\n container_cpu_usage_seconds_total{container=\"istio-proxy\", pod=~\"ztunnel-.*\"}\n [$__rate_interval])\n)",
"legendFormat": "Container ({{pod}})"
}
],
"title": "CPU Usage",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 9
},
"id": 5,
"panels": [],
"title": "Network",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "Connections opened and closed per instance",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
},
"unit": "cps"
}
},
"gridPos": {
"h": 8,
"w": 8,
"x": 0,
"y": 10
},
"id": 6,
"interval": "5s",
"options": {
"legend": {
"calcs": ["last", "max"],
"displayMode": "table"
}
},
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (\n rate(\n istio_tcp_connections_opened_total{pod=~\"ztunnel-.*\"}\n [$__rate_interval])\n)",
"legendFormat": "Opened ({{pod}})"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "-sum by (pod) (\n rate(\n istio_tcp_connections_closed_total{pod=~\"ztunnel-.*\"}\n [$__rate_interval])\n)",
"legendFormat": "Closed ({{pod}})"
}
],
"title": "Connections",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "Bytes sent and received per instance",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
},
"unit": "Bps"
}
},
"gridPos": {
"h": 8,
"w": 8,
"x": 8,
"y": 10
},
"id": 7,
"interval": "5s",
"options": {
"legend": {
"calcs": ["last", "max"],
"displayMode": "table"
}
},
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (\n rate(\n istio_tcp_sent_bytes_total{pod=~\"ztunnel-.*\"}\n [$__rate_interval])\n)",
"legendFormat": "Sent ({{pod}})"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (\n rate(\n istio_tcp_received_bytes_total{pod=~\"ztunnel-.*\"}\n [$__rate_interval])\n)",
"legendFormat": "Received ({{pod}})"
}
],
"title": "Bytes Transmitted",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "DNS queries received per instance",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
},
"unit": "qps"
}
},
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"y": 10
},
"id": 8,
"interval": "5s",
"options": {
"legend": {
"calcs": ["last", "max"],
"displayMode": "table"
}
},
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (\n rate(\n istio_dns_requests_total{pod=~\"ztunnel-.*\"}\n [$__rate_interval])\n)",
"legendFormat": "Request ({{pod}})"
}
],
"title": "DNS Request",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 18
},
"id": 9,
"panels": [],
"title": "Operations",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "Count of XDS connection terminations.\nThis will typically spike every 30min for each instance.\n",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
}
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 19
},
"id": 10,
"interval": "5s",
"options": {
"legend": {
"calcs": ["last", "max"],
"displayMode": "table"
}
},
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (\n rate(\n istio_xds_connection_terminations_total{pod=~\"ztunnel-.*\"}\n [$__rate_interval])\n)",
"legendFormat": "XDS Connection Terminations ({{pod}})"
}
],
"title": "XDS",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "Count of active and pending proxies managed by each instance.\nPending is expected to converge to zero.\n",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
}
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 19
},
"id": 11,
"interval": "5s",
"options": {
"legend": {
"calcs": ["last", "max"],
"displayMode": "table"
}
},
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (workload_manager_active_proxy_count{pod=~\"ztunnel-.*\"})",
"legendFormat": "Active Proxies ({{pod}})"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (workload_manager_pending_proxy_count{pod=~\"ztunnel-.*\"})",
"legendFormat": "Pending Proxies ({{pod}})"
}
],
"title": "Workload Manager",
"type": "timeseries"
}
],
"refresh": "15s",
"schemaVersion": 39,
"templating": {
"list": [
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "Count of active and pending proxies managed by each instance.\nPending is expected to converge to zero.\n",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
}
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 19
},
"id": 11,
"interval": "5s",
"options": {
"legend": {
"calcs": [
"last",
"max"
],
"displayMode": "table"
}
},
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (workload_manager_active_proxy_count{pod=~\"ztunnel-.*\"})",
"legendFormat": "Active Proxies ({{pod}})"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (workload_manager_pending_proxy_count{pod=~\"ztunnel-.*\"})",
"legendFormat": "Pending Proxies ({{pod}})"
}
],
"title": "Workload Manager",
"type": "timeseries"
"name": "datasource",
"query": "prometheus",
"type": "datasource"
}
],
"refresh": "15s",
"schemaVersion": 39,
"templating": {
"list": [
{
"name": "datasource",
"query": "prometheus",
"type": "datasource"
}
]
},
"time": {
"from": "now-30m",
"to": "now"
},
"timezone": "utc",
"title": "Istio Ztunnel Dashboard",
"uid": "12c58766acc81a1c835dd5059eaf2741"
]
},
"time": {
"from": "now-30m",
"to": "now"
},
"timezone": "utc",
"title": "Istio Ztunnel Dashboard",
"uid": "istio-ztunnel"
}
File diff suppressed because it is too large Load Diff
@@ -4572,11 +4572,7 @@
"refresh": "1m",
"schemaVersion": 26,
"style": "dark",
"tags": [
"kubevirt",
"kubevirt-control-plane",
"sig-scale"
],
"tags": ["kubevirt", "kubevirt-control-plane", "sig-scale"],
"templating": {
"list": [
{
@@ -5165,32 +5161,11 @@
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
"refresh_intervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"],
"time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"]
},
"timezone": "UTC",
"title": "KubeVirt / Control Plane",
"uid": "V1Qq_IBM_za0",
"uid": "kubevirt-control-plane",
"version": 3
}
}
@@ -1157,6 +1157,6 @@
},
"timezone": "utc",
"title": "Loki / Chunks",
"uid": "chunks",
"uid": "loki-chunks",
"version": 0
}
@@ -720,6 +720,6 @@
},
"timezone": "utc",
"title": "Loki / Deletion",
"uid": "deletion",
"uid": "loki-deletion",
"version": 0
}
@@ -1032,6 +1032,6 @@
},
"timezone": "utc",
"title": "Loki / Logs",
"uid": "logs",
"uid": "loki-logs",
"version": 0
}
@@ -6701,6 +6701,6 @@
},
"timezone": "utc",
"title": "Loki / Operational",
"uid": "operational",
"uid": "loki-operational",
"version": 0
}
@@ -1464,6 +1464,6 @@
},
"timezone": "utc",
"title": "Loki / Retention",
"uid": "retention",
"uid": "loki-retention",
"version": 0
}
@@ -6469,12 +6469,12 @@
"type": "prometheus",
"uid": "$ds"
},
"definition": "label_values(vm_app_version{version=~\"victoria-metrics-.*\"}, job)",
"definition": "label_values(vm_app_version{}, job)",
"includeAll": false,
"name": "job",
"options": [],
"query": {
"query": "label_values(vm_app_version{version=~\"victoria-metrics-.*\"}, job)",
"query": "label_values(vm_app_version{}, job)",
"refId": "VictoriaMetrics-job-Variable-Query"
},
"refresh": 1,
@@ -6542,7 +6542,7 @@
},
"timezone": "",
"title": "VictoriaMetrics - single-node",
"uid": "wNf0q_kZk",
"uid": "victoriametrics-single-node",
"version": 1,
"weekStart": "",
"gnetId": 10229
+5 -3
View File
@@ -1,6 +1,8 @@
# Monitoring & Alerting
## Alert Rules
## Alert Rules & Recording Rules
- [awesome-prometheus-alerts](https://github.com/samber/awesome-prometheus-alerts): Collection of
Prometheus alerting rules
- [awesome-prometheus-alerts](https://github.com/samber/awesome-prometheus-alerts)
- Collection of Prometheus alerting rules.
- [victoria-metrics-k8s-stack/files/rules](https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-k8s-stack/files/rules/generated)
- Alert Rules & Recording Rules used by kube-prometheus-stack.
+7 -2
View File
@@ -1,12 +1,16 @@
{ config, lib, ... }:
{ config, ... }:
{
services.vmalert = {
# https://docs.victoriametrics.com/victoriametrics/vmalert/
services.vmalert.instances."homelab" = {
enable = true;
settings = {
"httpListenAddr" = "127.0.0.1:8880";
"datasource.url" = "http://localhost:9090";
"notifier.url" = [ "http://localhost:9093" ]; # alertmanager's api
# Recording rules results are persisted via remote write.
"remoteWrite.url" = "http://localhost:9090";
"remoteRead.url" = "http://localhost:9090";
# Whether to disable long-lived connections to the datasource.
"datasource.disableKeepAlive" = true;
@@ -16,6 +20,7 @@
# Path to the files with alerting and/or recording rules.
rule = [
"${./alert_rules}/*.yml"
"${./recoding_rules}/*.yml"
];
# https://docs.victoriametrics.com/victoriametrics/vmalert/#link-to-alert-source
# Set these two args to generate the correct `.GeneratorURL`
@@ -0,0 +1,8 @@
# Alert Rules
Alert rules are configurations that define conditions, scope, and actions for generating alerts from
monitored signals, such as metrics, logs, or activity. When an alert rule's defined conditions are
met for a specific resource within its scope, the system generates a triggered alert, which is the
actual instance of the condition being met. These rules specify the data to monitor, the trigger
threshold, and the resulting actions, like sending notifications to specific receivers or performing
automated tasks.
@@ -0,0 +1,57 @@
groups:
- name: general.rules
rules:
- alert: TargetDown
annotations:
description:
'{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in
{{ $labels.namespace }} namespace are down.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
summary: One or more targets are unreachable.
expr:
100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) BY (cluster, job,
namespace, service)) > 10
for: 10m
labels:
severity: warning
- alert: Watchdog
annotations:
description: 'This is an alert meant to ensure that the entire alerting pipeline is
functional.
This alert is always firing, therefore it should always be firing in Alertmanager
and always fire against a receiver. There are integrations with various notification
mechanisms that send a notification when this alert is not firing. For example the
"DeadMansSnitch" integration in PagerDuty.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
summary:
An alert that should always be firing to certify that Alertmanager is working properly.
expr: vector(1)
labels:
severity: none
- alert: InfoInhibitor
annotations:
description: 'This is an alert that is used to inhibit info alerts.
By themselves, the info-level alerts are sometimes very noisy, but they are relevant
when combined with
other alerts.
This alert fires whenever there''s a severity="info" alert, and stops firing when
another alert with a
severity of ''warning'' or ''critical'' starts firing on the same namespace.
This alert should be routed to a null receiver and configured to inhibit alerts with
severity="info".'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor
summary: Info-level alert inhibition.
expr:
ALERTS{severity = "info"} == 1 unless on(namespace) ALERTS{alertname != "InfoInhibitor",
severity =~ "warning|critical", alertstate="firing"} == 1
labels:
severity: none
@@ -0,0 +1,120 @@
groups:
- name: kubernetes-resources
rules:
# KubeCPUOvercommit: warning when, for 10 minutes, the summed pod CPU requests per
# cluster exceed the allocatable CPU that would remain without the largest node
# (sum - max of kube_node_status_allocatable), provided that remainder is positive.
# The recorded series namespace_cpu:kube_pod_container_resource_requests:sum is
# aggregated by (namespace, cluster) only and therefore carries no `job` label;
# it must be selected without a job matcher or the left-hand side is always empty.
- alert: KubeCPUOvercommit
annotations:
description:
Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Pods by {{
$value }} CPU shares and cannot tolerate node failure.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
summary: Cluster has overcommitted CPU resource requests.
expr: |-
sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
and
(sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
for: 10m
labels:
severity: warning
# KubeMemoryOvercommit: warning when, for 10 minutes, the summed pod memory requests
# per cluster exceed the allocatable memory that would remain without the largest
# node (sum - max of kube_node_status_allocatable), provided that remainder is positive.
- alert: KubeMemoryOvercommit
annotations:
description:
Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Pods by {{
$value | humanize }} bytes and cannot tolerate node failure.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
summary: Cluster has overcommitted memory resource requests.
expr: |-
sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
and
(sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
for: 10m
labels:
severity: warning
# KubeCPUQuotaOvercommit: warning when, for 5 minutes, the sum of hard CPU quotas
# (resource "cpu" or "requests.cpu", taking the min across the two per quota) is
# more than 1.5x the cluster's allocatable CPU.
- alert: KubeCPUQuotaOvercommit
annotations:
description:
Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Namespaces.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit
summary: Cluster has overcommitted CPU resource requests.
expr: |-
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by (cluster)
/
sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by (cluster)
> 1.5
for: 5m
labels:
severity: warning
# KubeMemoryQuotaOvercommit: warning when, for 5 minutes, the sum of hard memory
# quotas (resource "memory" or "requests.memory", taking the min across the two per
# quota) is more than 1.5x the cluster's allocatable memory.
- alert: KubeMemoryQuotaOvercommit
annotations:
description:
Cluster {{ $labels.cluster }} has overcommitted memory resource requests for
Namespaces.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit
summary: Cluster has overcommitted memory resource requests.
expr: |-
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by (cluster)
/
sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)
> 1.5
for: 5m
labels:
severity: warning
# KubeQuotaAlmostFull: info when, for 15 minutes, used/hard for a resource quota is
# strictly between 0.9 and 1. Quotas whose hard value is not > 0 are dropped from
# the denominator, so they never produce a result.
- alert: KubeQuotaAlmostFull
annotations:
description:
Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{
$labels.resource }} quota.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull
summary: Namespace quota is going to be full.
expr: |-
kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
> 0.9 < 1
for: 15m
labels:
severity: info
# KubeQuotaFullyUsed: info when, for 15 minutes, used/hard for a resource quota is
# exactly 1. Quotas whose hard value is not > 0 are dropped from the denominator.
- alert: KubeQuotaFullyUsed
annotations:
description:
Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{
$labels.resource }} quota.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused
summary: Namespace quota is fully used.
expr: |-
kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
== 1
for: 15m
labels:
severity: info
# KubeQuotaExceeded: warning when, for 15 minutes, used/hard for a resource quota is
# greater than 1. Quotas whose hard value is not > 0 are dropped from the denominator.
- alert: KubeQuotaExceeded
annotations:
description:
Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{
$labels.resource }} quota.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded
summary: Namespace quota has exceeded the limits.
expr: |-
kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
> 1
for: 15m
labels:
severity: warning
# CPUThrottlingHigh: info when, for 15 minutes, the 5m increase of throttled CFS
# periods divided by the 5m increase of all CFS periods exceeds 25% for a
# (cluster, container, pod, namespace). Only the numerator filters out series
# with an empty `container` label.
- alert: CPUThrottlingHigh
annotations:
description:
"{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace
}} for container {{ $labels.container }} in pod {{ $labels.pod }}."
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh
summary: Processes experience elevated CPU throttling.
expr: |-
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (cluster, container, pod, namespace)
/
sum(increase(container_cpu_cfs_periods_total{}[5m])) by (cluster, container, pod, namespace)
> ( 25 / 100 )
for: 15m
labels:
severity: info
@@ -0,0 +1,7 @@
# Recording Rules
Recording rules are pre-defined queries, often complex or computationally expensive, that are
evaluated periodically to create new, pre-computed time series metrics.
These rules store the results in a metric backend, significantly speeding up queries for dashboards
and other alerts, and reducing system load by avoiding the re-computation of data.
@@ -0,0 +1,149 @@
groups:
# k8s.rules: pre-computed per-container usage, per-pod/namespace resource
# requests and limits, and pod -> workload ownership mappings.
- name: k8s.rules
rules:
# Per-container CPU usage: irate over 5m of the cAdvisor CPU counter, summed by
# (cluster, namespace, pod, container), then joined with kube_pod_info to attach
# the `node` label. topk(1) keeps one kube_pod_info series per pod for the join.
- expr: |-
sum by (cluster, namespace, pod, container) (
irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
# Container working-set memory with the `node` label attached via kube_pod_info.
- expr: |-
container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_working_set_bytes
# Container RSS memory with the `node` label attached via kube_pod_info.
- expr: |-
container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_rss
# Container page-cache memory with the `node` label attached via kube_pod_info.
- expr: |-
container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_cache
# Container swap memory with the `node` label attached via kube_pod_info.
- expr: |-
container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_swap
# Memory requests per container, kept only for pods whose phase is Pending or Running.
- expr: |-
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
group_left() max by (namespace, pod, cluster) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests
# Memory requests of Pending/Running pods summed per (namespace, cluster).
# The result carries only the `namespace` and `cluster` labels.
- expr: |-
sum by (namespace, cluster) (
sum by (namespace, pod, cluster) (
max by (namespace, pod, container, cluster) (
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"}
) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_memory:kube_pod_container_resource_requests:sum
# CPU requests per container, kept only for pods whose phase is Pending or Running.
- expr: |-
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
group_left() max by (namespace, pod, cluster) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests
# CPU requests of Pending/Running pods summed per (namespace, cluster).
# The result carries only the `namespace` and `cluster` labels.
- expr: |-
sum by (namespace, cluster) (
sum by (namespace, pod, cluster) (
max by (namespace, pod, container, cluster) (
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"}
) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_cpu:kube_pod_container_resource_requests:sum
# Memory limits per container, kept only for pods whose phase is Pending or Running.
- expr: |-
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
group_left() max by (namespace, pod, cluster) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits
# Memory limits of Pending/Running pods summed per (namespace, cluster).
- expr: |-
sum by (namespace, cluster) (
sum by (namespace, pod, cluster) (
max by (namespace, pod, container, cluster) (
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}
) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_memory:kube_pod_container_resource_limits:sum
# CPU limits per container, kept only for pods whose phase is Pending or Running.
- expr: |-
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
group_left() max by (namespace, pod, cluster) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits
# CPU limits of Pending/Running pods summed per (namespace, cluster).
- expr: |-
sum by (namespace, cluster) (
sum by (namespace, pod, cluster) (
max by (namespace, pod, container, cluster) (
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}
) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_cpu:kube_pod_container_resource_limits:sum
# Pods owned by a ReplicaSet: copy the pod's owner_name into `replicaset`, join with
# kube_replicaset_owner to pick up the ReplicaSet's own owner_name, and expose that
# as `workload`. The result is tagged workload_type=deployment.
- expr: |-
max by (cluster, namespace, workload, pod) (
label_replace(
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
"replicaset", "$1", "owner_name", "(.*)"
) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (
1, max by (replicaset, namespace, owner_name) (
kube_replicaset_owner{job="kube-state-metrics"}
)
),
"workload", "$1", "owner_name", "(.*)"
)
)
labels:
workload_type: deployment
record: namespace_workload_pod:kube_pod_owner:relabel
# Pods owned by a DaemonSet: owner_name is copied into `workload`.
- expr: |-
max by (cluster, namespace, workload, pod) (
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
"workload", "$1", "owner_name", "(.*)"
)
)
labels:
workload_type: daemonset
record: namespace_workload_pod:kube_pod_owner:relabel
# Pods owned by a StatefulSet: owner_name is copied into `workload`.
- expr: |-
max by (cluster, namespace, workload, pod) (
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
"workload", "$1", "owner_name", "(.*)"
)
)
labels:
workload_type: statefulset
record: namespace_workload_pod:kube_pod_owner:relabel
# Pods owned by a Job: owner_name is copied into `workload`.
- expr: |-
max by (cluster, namespace, workload, pod) (
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="Job"},
"workload", "$1", "owner_name", "(.*)"
)
)
labels:
workload_type: job
record: namespace_workload_pod:kube_pod_owner:relabel
@@ -0,0 +1,128 @@
groups:
# kube-prometheus-node-recording.rules: per-instance and whole-fleet CPU and
# network rates. None of these selectors filter on `job`.
- name: kube-prometheus-node-recording.rules
rules:
# Busy CPU (all modes except idle, iowait, steal): 3m rate summed per instance.
- expr:
sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY
(instance)
record: instance:node_cpu:rate:sum
# Network bytes received: 3m rate summed per instance.
- expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
record: instance:node_network_receive_bytes:rate:sum
# Network bytes transmitted: 3m rate summed per instance.
- expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
record: instance:node_network_transmit_bytes:rate:sum
# Busy CPU 5m rate (summed over cpu and mode) divided by the number of distinct
# (instance, cpu) pairs for that instance.
- expr:
sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT
(cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance,
cpu)) BY (instance)
record: instance:node_cpu:ratio
# Busy CPU 5m rate summed over every series (single unlabelled result).
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
record: cluster:node_cpu:sum_rate5m
# The total above divided by the number of distinct (instance, cpu) pairs.
- expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
record: cluster:node_cpu:ratio
# node-exporter.rules: per-instance CPU, load, memory, disk and network series
# derived from metrics with job="node-exporter".
- name: node-exporter.rules
rules:
# CPU count per instance: number of idle-mode node_cpu_seconds_total series.
- expr: |-
count without (cpu, mode) (
node_cpu_seconds_total{job="node-exporter",mode="idle"}
)
record: instance:node_num_cpu:sum
# CPU utilisation: 1 minus the per-CPU average of the 5m idle+iowait+steal rate.
- expr: |-
1 - avg without (cpu) (
sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m]))
)
record: instance:node_cpu_utilisation:rate5m
# 1-minute load average divided by the recorded CPU count.
- expr: |-
(
node_load1{job="node-exporter"}
/
instance:node_num_cpu:sum{job="node-exporter"}
)
record: instance:node_load1_per_cpu:ratio
# Memory utilisation: 1 - available/total. Where MemAvailable is absent, the `or`
# branch substitutes Buffers + Cached + MemFree + Slab.
- expr: |-
1 - (
(
node_memory_MemAvailable_bytes{job="node-exporter"}
or
(
node_memory_Buffers_bytes{job="node-exporter"}
+
node_memory_Cached_bytes{job="node-exporter"}
+
node_memory_MemFree_bytes{job="node-exporter"}
+
node_memory_Slab_bytes{job="node-exporter"}
)
)
/
node_memory_MemTotal_bytes{job="node-exporter"}
)
record: instance:node_memory_utilisation:ratio
# 5m rate of major page faults.
- expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
record: instance:node_vmstat_pgmajfault:rate5m
# 5m rate of disk I/O time, limited to devices matching the regex below.
- expr:
rate(node_disk_io_time_seconds_total{job="node-exporter",
device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_seconds:rate5m
# 5m rate of weighted disk I/O time, limited to the same device regex.
- expr:
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter",
device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
# Received bytes: 5m rate summed over all devices except "lo".
- expr: |-
sum without (device) (
rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_receive_bytes_excluding_lo:rate5m
# Transmitted bytes: 5m rate summed over all devices except "lo".
- expr: |-
sum without (device) (
rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_transmit_bytes_excluding_lo:rate5m
# Dropped received packets: 5m rate summed over all devices except "lo".
- expr: |-
sum without (device) (
rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_receive_drop_excluding_lo:rate5m
# Dropped transmitted packets: 5m rate summed over all devices except "lo".
- expr: |-
sum without (device) (
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_transmit_drop_excluding_lo:rate5m
# node.rules: node-level aggregates keyed by `cluster` and `node`.
- name: node.rules
rules:
# One kube_pod_info-derived series per (cluster, namespace, pod), keeping `node`.
# The label_replace copies `pod` onto itself, so it does not change the label value.
- expr: |-
topk by(cluster, namespace, pod) (1,
max by (cluster, node, namespace, pod) (
label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
))
record: "node_namespace_pod:kube_pod_info:"
# CPU count per (cluster, node): idle-mode CPU series joined to the pod-info
# series above to obtain `node`.
# NOTE(review): this join matches on (namespace, pod) only, while the series it
# joins against is keyed by (cluster, namespace, pod) — confirm that omitting
# `cluster` here is intended and that node-exporter series carry namespace/pod labels.
- expr: |-
count by (cluster, node) (
node_cpu_seconds_total{mode="idle",job="node-exporter"}
* on (namespace, pod) group_left(node)
topk by(namespace, pod) (1, node_namespace_pod:kube_pod_info:)
)
record: node:node_num_cpu:sum
# Available memory summed per cluster. Where MemAvailable is absent, the `or`
# branch substitutes Buffers + Cached + MemFree + Slab.
- expr: |-
sum(
node_memory_MemAvailable_bytes{job="node-exporter"} or
(
node_memory_Buffers_bytes{job="node-exporter"} +
node_memory_Cached_bytes{job="node-exporter"} +
node_memory_MemFree_bytes{job="node-exporter"} +
node_memory_Slab_bytes{job="node-exporter"}
)
) by (cluster)
record: :node_memory_MemAvailable_bytes:sum
# Busy CPU (not idle/iowait/steal) 5m rate, summed over modes and then averaged
# by (cluster, node).
- expr: |-
avg by (cluster, node) (
sum without (mode) (
rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal",job="node-exporter"}[5m])
)
)
record: node:node_cpu_utilization:ratio_rate5m
# Per-cluster average of the per-node utilisation recorded above.
- expr: |-
avg by (cluster) (
node:node_cpu_utilization:ratio_rate5m
)
record: cluster:node_cpu:ratio_rate5m
@@ -50,6 +50,8 @@
labels.type = "app";
labels.app = "dnsmasq";
labels.host = "suzi";
labels.env = "homelab";
labels.cluster = "homelab";
}
];
}
@@ -64,6 +66,8 @@
labels.type = "app";
labels.app = "v2ray";
labels.host = "aquamarine";
labels.env = "homelab";
labels.cluster = "homelab";
}
];
}
@@ -77,6 +81,8 @@
labels.type = "app";
labels.app = "postgresql";
labels.host = "aquamarine";
labels.env = "homelab";
labels.cluster = "homelab";
}
];
}
@@ -90,6 +96,39 @@
labels.type = "app";
labels.app = "sftpgo";
labels.host = "aquamarine";
labels.env = "homelab";
labels.cluster = "homelab";
}
];
}
{
# Scrape job for Alertmanager's own /metrics endpoint on localhost:9093, every 30s.
# Labels follow the same type/app/host/env/cluster scheme as the other app jobs.
job_name = "alertmanager-embedded-exporter";
scrape_interval = "30s";
metrics_path = "/metrics";
static_configs = [
{
targets = [ "localhost:9093" ];
labels.type = "app";
labels.app = "alertmanager";
labels.host = "aquamarine";
labels.env = "homelab";
labels.cluster = "homelab";
}
];
}
{
# Scrape job for the metrics backend's own /metrics endpoint, every 30s.
job_name = "victoriametrics-embedded-exporter";
scrape_interval = "30s";
metrics_path = "/metrics";
static_configs = [
{
# Scrape VictoriaMetrics itself.
# NOTE(review): upstream VictoriaMetrics listens on :8428 by default; this
# presumably relies on the service being configured to listen on 9090 — confirm.
targets = [ "localhost:9090" ];
labels.type = "app";
labels.app = "victoriametrics";
labels.host = "aquamarine";
labels.env = "homelab";
labels.cluster = "homelab";
}
];
}
@@ -109,6 +148,8 @@
targets = [ "${addr.ipv4}:9100" ];
labels.type = "node";
labels.host = hostname;
labels.env = "homelab";
labels.cluster = "homelab";
}
];
}