feat: Grafana Dashboards & VMAlert (#224)

* chore: grafana - remove useless dashboards
* fix: alertmanager - metrics
* fix: victoria-metrics - job filter
* feat: add recording rules
* fix: grafana - add or update uid for all dashboards
* fix: vmalert - remoteWrite
This commit is contained in:
Ryan Yin
2025-09-26 19:10:32 +08:00
committed by GitHub
27 changed files with 1835 additions and 4666 deletions

View File

@@ -73,11 +73,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": true,
"colors": [
"#299c46",
"#7eb26d",
"#d44a3a"
],
"colors": ["#299c46", "#7eb26d", "#d44a3a"],
"datasource": "${DS_PROMETHEUS}",
"format": "none",
"gauge": {
@@ -156,11 +152,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
"datasource": "${DS_PROMETHEUS}",
"description": "start time of the process",
"format": "dateTimeFromNow",
@@ -239,11 +231,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"colors": ["rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)"],
"datasource": "${DS_PROMETHEUS}",
"format": "decbytes",
"gauge": {
@@ -322,11 +310,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"colors": ["rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)"],
"datasource": "${DS_PROMETHEUS}",
"format": "decbytes",
"gauge": {
@@ -405,11 +389,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"colors": ["rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)"],
"datasource": "${DS_PROMETHEUS}",
"format": "decbytes",
"gauge": {
@@ -488,11 +468,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
"datasource": "${DS_PROMETHEUS}",
"format": "none",
"gauge": {
@@ -864,11 +840,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
"datasource": "${DS_PROMETHEUS}",
"format": "bytes",
"gauge": {
@@ -945,11 +917,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
"datasource": "${DS_PROMETHEUS}",
"format": "bytes",
"gauge": {
@@ -1026,11 +994,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
"datasource": "${DS_PROMETHEUS}",
"format": "bytes",
"gauge": {
@@ -1107,11 +1071,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
"datasource": "${DS_PROMETHEUS}",
"format": "bytes",
"gauge": {
@@ -1189,11 +1149,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
"datasource": "${DS_PROMETHEUS}",
"decimals": 1,
"format": "bytes",
@@ -1271,11 +1227,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
"datasource": "${DS_PROMETHEUS}",
"format": "none",
"gauge": {
@@ -1352,11 +1304,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
"datasource": "${DS_PROMETHEUS}",
"format": "none",
"gauge": {
@@ -1433,11 +1381,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
"datasource": "${DS_PROMETHEUS}",
"format": "none",
"gauge": {
@@ -1514,11 +1458,7 @@
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
"datasource": "${DS_PROMETHEUS}",
"format": "none",
"gauge": {
@@ -2944,11 +2884,7 @@
"refresh": "10s",
"schemaVersion": 19,
"style": "dark",
"tags": [
"postgres",
"db",
"stats"
],
"tags": ["postgres", "db", "stats"],
"templating": {
"list": [
{
@@ -3136,32 +3072,11 @@
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
"refresh_intervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"],
"time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"]
},
"timezone": "",
"title": "PostgreSQL Database",
"uid": "000000039",
"uid": "postgresql-database",
"version": 1
}
}

View File

@@ -11139,6 +11139,6 @@
},
"timezone": "",
"title": "Alertmanager",
"uid": "eea-9_sik",
"uid": "alertmanager",
"version": 27
}

View File

@@ -23262,7 +23262,7 @@
},
"timezone": "browser",
"title": "Node Exporter Full",
"uid": "rYdddlPWk",
"uid": "node-exporter-full",
"version": 87,
"weekStart": ""
}

View File

@@ -853,19 +853,11 @@
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
]
"refresh_intervals": ["30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"]
},
"timezone": "",
"title": "Istio Wasm Extension Dashboard",
"uid": "istio-wasm-extension",
"version": 1,
"weekStart": ""
}

View File

@@ -114,9 +114,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"mean"
],
"calcs": ["mean"],
"fields": "",
"values": false
},
@@ -196,9 +194,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"mean"
],
"calcs": ["mean"],
"fields": "",
"values": false
},
@@ -275,9 +271,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"mean"
],
"calcs": ["mean"],
"fields": "",
"values": false
},
@@ -354,9 +348,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"mean"
],
"calcs": ["mean"],
"fields": "",
"values": false
},
@@ -433,9 +425,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
@@ -511,9 +501,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
@@ -589,9 +577,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
@@ -667,9 +653,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
@@ -745,9 +729,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
@@ -823,9 +805,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
@@ -901,9 +881,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
@@ -979,9 +957,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
@@ -1329,9 +1305,7 @@
"footer": {
"countRows": false,
"fields": "",
"reducer": [
"sum"
],
"reducer": ["sum"],
"show": false
},
"showHeader": true
@@ -1466,9 +1440,7 @@
"cellHeight": "sm",
"footer": {
"show": false,
"reducer": [
"sum"
],
"reducer": ["sum"],
"countRows": false,
"fields": ""
}
@@ -1832,30 +1804,12 @@
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
"refresh_intervals": ["30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"],
"time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"]
},
"timezone": "browser",
"title": "Istio Mesh Dashboard",
"uid": "istio-mesh",
"version": 1,
"weekStart": ""
}
}

View File

@@ -1574,30 +1574,12 @@
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
"refresh_intervals": ["30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"],
"time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"]
},
"timezone": "",
"title": "Istio Performance Dashboard",
"uid": "istio-performance",
"version": 1,
"weekStart": ""
}

View File

@@ -123,9 +123,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
@@ -197,9 +195,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
@@ -398,9 +394,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"mean"
],
"calcs": ["mean"],
"fields": "",
"values": false
},
@@ -478,9 +472,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
@@ -552,9 +544,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
@@ -753,9 +743,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"mean"
],
"calcs": ["mean"],
"fields": "",
"values": false
},
@@ -3368,28 +3356,12 @@
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
"refresh_intervals": ["5m", "15m", "30m", "1h", "2h", "1d"],
"time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"]
},
"timezone": "",
"title": "Istio Service Dashboard",
"uid": "istio-service",
"version": 1,
"weekStart": ""
}

View File

@@ -123,9 +123,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
@@ -206,9 +204,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"mean"
],
"calcs": ["mean"],
"fields": "",
"values": false
},
@@ -405,9 +401,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"mean"
],
"calcs": ["mean"],
"fields": "",
"values": false
},
@@ -485,9 +479,7 @@
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"mean"
],
"calcs": ["mean"],
"fields": "",
"values": false
},
@@ -3040,28 +3032,12 @@
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
"refresh_intervals": ["5m", "15m", "30m", "1h", "2h", "1d"],
"time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"]
},
"timezone": "",
"title": "Istio Workload Dashboard",
"uid": "istio-workload",
"version": 1,
"weekStart": ""
}

View File

@@ -1,458 +1,434 @@
{
"graphTooltip": 1,
"panels": [
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 1,
"panels": [ ],
"title": "Process",
"type": "row"
"graphTooltip": 1,
"panels": [
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "Version number of each running instance",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
}
}
},
"gridPos": {
"h": 8,
"w": 8,
"x": 0,
"y": 1
},
"id": 2,
"interval": "5s",
"options": {
"legend": {
"calcs": [
"last",
"max"
],
"displayMode": "table"
}
},
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(istio_build{component=\"ztunnel\"}) by (tag)",
"legendFormat": "Version ({{tag}})"
}
],
"title": "Ztunnel Versions",
"type": "timeseries"
"id": 1,
"panels": [],
"title": "Process",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "Memory usage of each running instance",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
},
"unit": "bytes"
}
},
"gridPos": {
"h": 8,
"w": 8,
"x": 8,
"y": 1
},
"id": 3,
"interval": "5s",
"options": {
"legend": {
"calcs": [
"last",
"max"
],
"displayMode": "table"
}
},
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (\n container_memory_working_set_bytes{container=\"istio-proxy\", pod=~\"ztunnel-.*\"}\n)",
"legendFormat": "Container ({{pod}})"
}
],
"title": "Memory Usage",
"type": "timeseries"
"description": "Version number of each running instance",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
}
}
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "CPU usage of each running instance",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
}
}
},
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"y": 1
},
"id": 4,
"interval": "5s",
"options": {
"legend": {
"calcs": [
"last",
"max"
],
"displayMode": "table"
}
},
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (\n irate(\n container_cpu_usage_seconds_total{container=\"istio-proxy\", pod=~\"ztunnel-.*\"}\n [$__rate_interval])\n)",
"legendFormat": "Container ({{pod}})"
}
],
"title": "CPU Usage",
"type": "timeseries"
"gridPos": {
"h": 8,
"w": 8,
"x": 0,
"y": 1
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 9
},
"id": 5,
"panels": [ ],
"title": "Network",
"type": "row"
"id": 2,
"interval": "5s",
"options": {
"legend": {
"calcs": ["last", "max"],
"displayMode": "table"
}
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "Connections opened and closed per instance",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
},
"unit": "cps"
}
},
"gridPos": {
"h": 8,
"w": 8,
"x": 0,
"y": 10
},
"id": 6,
"interval": "5s",
"options": {
"legend": {
"calcs": [
"last",
"max"
],
"displayMode": "table"
}
},
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (\n rate(\n istio_tcp_connections_opened_total{pod=~\"ztunnel-.*\"}\n [$__rate_interval])\n)",
"legendFormat": "Opened ({{pod}})"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "-sum by (pod) (\n rate(\n istio_tcp_connections_closed_total{pod=~\"ztunnel-.*\"}\n [$__rate_interval])\n)",
"legendFormat": "Closed ({{pod}})"
}
],
"title": "Connections",
"type": "timeseries"
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(istio_build{component=\"ztunnel\"}) by (tag)",
"legendFormat": "Version ({{tag}})"
}
],
"title": "Ztunnel Versions",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "Bytes sent and received per instance",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
},
"unit": "Bps"
}
},
"gridPos": {
"h": 8,
"w": 8,
"x": 8,
"y": 10
},
"id": 7,
"interval": "5s",
"options": {
"legend": {
"calcs": [
"last",
"max"
],
"displayMode": "table"
}
},
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (\n rate(\n istio_tcp_sent_bytes_total{pod=~\"ztunnel-.*\"}\n [$__rate_interval])\n)",
"legendFormat": "Sent ({{pod}})"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (\n rate(\n istio_tcp_received_bytes_total{pod=~\"ztunnel-.*\"}\n [$__rate_interval])\n)",
"legendFormat": "Received ({{pod}})"
}
],
"title": "Bytes Transmitted",
"type": "timeseries"
"description": "Memory usage of each running instance",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
},
"unit": "bytes"
}
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "DNS queries received per instance",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
},
"unit": "qps"
}
},
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"y": 10
},
"id": 8,
"interval": "5s",
"options": {
"legend": {
"calcs": [
"last",
"max"
],
"displayMode": "table"
}
},
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (\n rate(\n istio_dns_requests_total{pod=~\"ztunnel-.*\"}\n [$__rate_interval])\n)",
"legendFormat": "Request ({{pod}})"
}
],
"title": "DNS Request",
"type": "timeseries"
"gridPos": {
"h": 8,
"w": 8,
"x": 8,
"y": 1
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 18
},
"id": 9,
"panels": [ ],
"title": "Operations",
"type": "row"
"id": 3,
"interval": "5s",
"options": {
"legend": {
"calcs": ["last", "max"],
"displayMode": "table"
}
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "Count of XDS connection terminations.\nThis will typically spike every 30min for each instance.\n",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
}
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 19
},
"id": 10,
"interval": "5s",
"options": {
"legend": {
"calcs": [
"last",
"max"
],
"displayMode": "table"
}
},
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (\n rate(\n istio_xds_connection_terminations_total{pod=~\"ztunnel-.*\"}\n [$__rate_interval])\n)",
"legendFormat": "XDS Connection Terminations ({{pod}})"
}
],
"title": "XDS",
"type": "timeseries"
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (\n container_memory_working_set_bytes{container=\"istio-proxy\", pod=~\"ztunnel-.*\"}\n)",
"legendFormat": "Container ({{pod}})"
}
],
"title": "Memory Usage",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "CPU usage of each running instance",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
}
}
},
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"y": 1
},
"id": 4,
"interval": "5s",
"options": {
"legend": {
"calcs": ["last", "max"],
"displayMode": "table"
}
},
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (\n irate(\n container_cpu_usage_seconds_total{container=\"istio-proxy\", pod=~\"ztunnel-.*\"}\n [$__rate_interval])\n)",
"legendFormat": "Container ({{pod}})"
}
],
"title": "CPU Usage",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 9
},
"id": 5,
"panels": [],
"title": "Network",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "Connections opened and closed per instance",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
},
"unit": "cps"
}
},
"gridPos": {
"h": 8,
"w": 8,
"x": 0,
"y": 10
},
"id": 6,
"interval": "5s",
"options": {
"legend": {
"calcs": ["last", "max"],
"displayMode": "table"
}
},
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (\n rate(\n istio_tcp_connections_opened_total{pod=~\"ztunnel-.*\"}\n [$__rate_interval])\n)",
"legendFormat": "Opened ({{pod}})"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "-sum by (pod) (\n rate(\n istio_tcp_connections_closed_total{pod=~\"ztunnel-.*\"}\n [$__rate_interval])\n)",
"legendFormat": "Closed ({{pod}})"
}
],
"title": "Connections",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "Bytes sent and received per instance",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
},
"unit": "Bps"
}
},
"gridPos": {
"h": 8,
"w": 8,
"x": 8,
"y": 10
},
"id": 7,
"interval": "5s",
"options": {
"legend": {
"calcs": ["last", "max"],
"displayMode": "table"
}
},
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (\n rate(\n istio_tcp_sent_bytes_total{pod=~\"ztunnel-.*\"}\n [$__rate_interval])\n)",
"legendFormat": "Sent ({{pod}})"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (\n rate(\n istio_tcp_received_bytes_total{pod=~\"ztunnel-.*\"}\n [$__rate_interval])\n)",
"legendFormat": "Received ({{pod}})"
}
],
"title": "Bytes Transmitted",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "DNS queries received per instance",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
},
"unit": "qps"
}
},
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"y": 10
},
"id": 8,
"interval": "5s",
"options": {
"legend": {
"calcs": ["last", "max"],
"displayMode": "table"
}
},
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (\n rate(\n istio_dns_requests_total{pod=~\"ztunnel-.*\"}\n [$__rate_interval])\n)",
"legendFormat": "Request ({{pod}})"
}
],
"title": "DNS Request",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 18
},
"id": 9,
"panels": [],
"title": "Operations",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "Count of XDS connection terminations.\nThis will typically spike every 30min for each instance.\n",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
}
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 19
},
"id": 10,
"interval": "5s",
"options": {
"legend": {
"calcs": ["last", "max"],
"displayMode": "table"
}
},
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (\n rate(\n istio_xds_connection_terminations_total{pod=~\"ztunnel-.*\"}\n [$__rate_interval])\n)",
"legendFormat": "XDS Connection Terminations ({{pod}})"
}
],
"title": "XDS",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "Count of active and pending proxies managed by each instance.\nPending is expected to converge to zero.\n",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
}
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 19
},
"id": 11,
"interval": "5s",
"options": {
"legend": {
"calcs": ["last", "max"],
"displayMode": "table"
}
},
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (workload_manager_active_proxy_count{pod=~\"ztunnel-.*\"})",
"legendFormat": "Active Proxies ({{pod}})"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (workload_manager_pending_proxy_count{pod=~\"ztunnel-.*\"})",
"legendFormat": "Pending Proxies ({{pod}})"
}
],
"title": "Workload Manager",
"type": "timeseries"
}
],
"refresh": "15s",
"schemaVersion": 39,
"templating": {
"list": [
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "Count of active and pending proxies managed by each instance.\nPending is expected to converge to zero.\n",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"gradientMode": "hue",
"showPoints": "never"
}
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 19
},
"id": 11,
"interval": "5s",
"options": {
"legend": {
"calcs": [
"last",
"max"
],
"displayMode": "table"
}
},
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (workload_manager_active_proxy_count{pod=~\"ztunnel-.*\"})",
"legendFormat": "Active Proxies ({{pod}})"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (pod) (workload_manager_pending_proxy_count{pod=~\"ztunnel-.*\"})",
"legendFormat": "Pending Proxies ({{pod}})"
}
],
"title": "Workload Manager",
"type": "timeseries"
"name": "datasource",
"query": "prometheus",
"type": "datasource"
}
],
"refresh": "15s",
"schemaVersion": 39,
"templating": {
"list": [
{
"name": "datasource",
"query": "prometheus",
"type": "datasource"
}
]
},
"time": {
"from": "now-30m",
"to": "now"
},
"timezone": "utc",
"title": "Istio Ztunnel Dashboard",
"uid": "12c58766acc81a1c835dd5059eaf2741"
]
},
"time": {
"from": "now-30m",
"to": "now"
},
"timezone": "utc",
"title": "Istio Ztunnel Dashboard",
"uid": "istio-ztunnel"
}

View File

@@ -4572,11 +4572,7 @@
"refresh": "1m",
"schemaVersion": 26,
"style": "dark",
"tags": [
"kubevirt",
"kubevirt-control-plane",
"sig-scale"
],
"tags": ["kubevirt", "kubevirt-control-plane", "sig-scale"],
"templating": {
"list": [
{
@@ -5165,32 +5161,11 @@
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
"refresh_intervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"],
"time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"]
},
"timezone": "UTC",
"title": "KubeVirt / Control Plane",
"uid": "V1Qq_IBM_za0",
"uid": "kubevirt-control-plane",
"version": 3
}
}

View File

@@ -1157,6 +1157,6 @@
},
"timezone": "utc",
"title": "Loki / Chunks",
"uid": "chunks",
"uid": "loki-chunks",
"version": 0
}

View File

@@ -720,6 +720,6 @@
},
"timezone": "utc",
"title": "Loki / Deletion",
"uid": "deletion",
"uid": "loki-deletion",
"version": 0
}

View File

@@ -1032,6 +1032,6 @@
},
"timezone": "utc",
"title": "Loki / Logs",
"uid": "logs",
"uid": "loki-logs",
"version": 0
}

View File

@@ -6701,6 +6701,6 @@
},
"timezone": "utc",
"title": "Loki / Operational",
"uid": "operational",
"uid": "loki-operational",
"version": 0
}

View File

@@ -1464,6 +1464,6 @@
},
"timezone": "utc",
"title": "Loki / Retention",
"uid": "retention",
"uid": "loki-retention",
"version": 0
}

View File

@@ -6469,12 +6469,12 @@
"type": "prometheus",
"uid": "$ds"
},
"definition": "label_values(vm_app_version{version=~\"victoria-metrics-.*\"}, job)",
"definition": "label_values(vm_app_version{}, job)",
"includeAll": false,
"name": "job",
"options": [],
"query": {
"query": "label_values(vm_app_version{version=~\"victoria-metrics-.*\"}, job)",
"query": "label_values(vm_app_version{}, job)",
"refId": "VictoriaMetrics-job-Variable-Query"
},
"refresh": 1,
@@ -6542,7 +6542,7 @@
},
"timezone": "",
"title": "VictoriaMetrics - single-node",
"uid": "wNf0q_kZk",
"uid": "victoriametrics-single-node",
"version": 1,
"weekStart": "",
"gnetId": 10229

View File

@@ -1,6 +1,8 @@
# Monitoring & Alerting
## Alert Rules
## Alert Rules & Recording Rules
- [awesome-prometheus-alerts](https://github.com/samber/awesome-prometheus-alerts): Collection of
Prometheus alerting rules
- [awesome-prometheus-alerts](https://github.com/samber/awesome-prometheus-alerts)
- Collection of Prometheus alerting rules.
- [victoria-metrics-k8s-stack/files/rules](https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-k8s-stack/files/rules/generated)
- Alert Rules & Recording Rules used by kube-prometheus-stack.

View File

@@ -1,12 +1,16 @@
{ config, lib, ... }:
{ config, ... }:
{
services.vmalert = {
# https://docs.victoriametrics.com/victoriametrics/vmalert/
services.vmalert.instances."homelab" = {
enable = true;
settings = {
"httpListenAddr" = "127.0.0.1:8880";
"datasource.url" = "http://localhost:9090";
"notifier.url" = [ "http://localhost:9093" ]; # alertmanager's api
# Recording rules results are persisted via remote write.
"remoteWrite.url" = "http://localhost:9090";
"remoteRead.url" = "http://localhost:9090";
# Whether to disable long-lived connections to the datasource.
"datasource.disableKeepAlive" = true;
@@ -16,6 +20,7 @@
# Path to the files with alerting and/or recording rules.
rule = [
"${./alert_rules}/*.yml"
"${./recoding_rules}/*.yml"
];
# https://docs.victoriametrics.com/victoriametrics/vmalert/#link-to-alert-source
# Set this two args to generate the correct `.GeneratorURL`

View File

@@ -0,0 +1,8 @@
# Alert Rules
Alert rules are configurations that define conditions, scope, and actions for generating alerts from
monitored signals, such as metrics, logs, or activity. When an alert rule's defined conditions are
met for a specific resource within its scope, the system generates a triggered alert, which is the
actual instance of the condition being met. These rules specify the data to monitor, the trigger
threshold, and the resulting actions, like sending notifications to specific receivers or performing
automated tasks.

View File

@@ -0,0 +1,57 @@
groups:
- name: general.rules
rules:
- alert: TargetDown
annotations:
description:
'{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in
{{ $labels.namespace }} namespace are down.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
summary: One or more targets are unreachable.
expr:
100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) BY (cluster, job,
namespace, service)) > 10
for: 10m
labels:
severity: warning
- alert: Watchdog
annotations:
description: 'This is an alert meant to ensure that the entire alerting pipeline is
functional.
This alert is always firing, therefore it should always be firing in Alertmanager
and always fire against a receiver. There are integrations with various notification
mechanisms that send a notification when this alert is not firing. For example the
"DeadMansSnitch" integration in PagerDuty.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
summary:
An alert that should always be firing to certify that Alertmanager is working properly.
expr: vector(1)
labels:
severity: none
- alert: InfoInhibitor
annotations:
description: 'This is an alert that is used to inhibit info alerts.
By themselves, the info-level alerts are sometimes very noisy, but they are relevant
when combined with
other alerts.
This alert fires whenever there''s a severity="info" alert, and stops firing when
another alert with a
severity of ''warning'' or ''critical'' starts firing on the same namespace.
This alert should be routed to a null receiver and configured to inhibit alerts with
severity="info".'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor
summary: Info-level alert inhibition.
expr:
ALERTS{severity = "info"} == 1 unless on(namespace) ALERTS{alertname != "InfoInhibitor",
severity =~ "warning|critical", alertstate="firing"} == 1
labels:
severity: none

View File

@@ -0,0 +1,120 @@
# Cluster capacity and ResourceQuota alerting rules
# (upstream: kube-prometheus "kubernetes-resources").
groups:
  - name: kubernetes-resources
    rules:
      # Total CPU requests exceed the capacity the cluster would retain after
      # losing its largest node (n-1 headroom check).
      - alert: KubeCPUOvercommit
        annotations:
          description:
            Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Pods by {{
            $value }} CPU shares and cannot tolerate node failure.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
          summary: Cluster has overcommitted CPU resource requests.
        expr: |-
          sum(namespace_cpu:kube_pod_container_resource_requests:sum{job="kube-state-metrics",}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
          and
          (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
        for: 10m
        labels:
          severity: warning
      # Same n-1 headroom check, but for memory requests.
      - alert: KubeMemoryOvercommit
        annotations:
          description:
            Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Pods by {{
            $value | humanize }} bytes and cannot tolerate node failure.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
          summary: Cluster has overcommitted memory resource requests.
        expr: |-
          sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
          and
          (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
        for: 10m
        labels:
          severity: warning
      # Sum of hard namespace CPU quotas exceeds 150% of allocatable CPU.
      - alert: KubeCPUQuotaOvercommit
        annotations:
          description:
            Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Namespaces.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit
          summary: Cluster has overcommitted CPU resource requests.
        expr: |-
          sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by (cluster)
            /
          sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by (cluster)
            > 1.5
        for: 5m
        labels:
          severity: warning
      # Sum of hard namespace memory quotas exceeds 150% of allocatable memory.
      - alert: KubeMemoryQuotaOvercommit
        annotations:
          description:
            Cluster {{ $labels.cluster }} has overcommitted memory resource requests for
            Namespaces.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit
          summary: Cluster has overcommitted memory resource requests.
        expr: |-
          sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by (cluster)
            /
          sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)
            > 1.5
        for: 5m
        labels:
          severity: warning
      # Quota usage between 90% and 100% of the hard limit.
      - alert: KubeQuotaAlmostFull
        annotations:
          description:
            Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{
            $labels.resource }} quota.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull
          summary: Namespace quota is going to be full.
        expr: |-
          kube_resourcequota{job="kube-state-metrics", type="used"}
            / ignoring(instance, job, type)
          (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
            > 0.9 < 1
        for: 15m
        labels:
          severity: info
      # Quota exactly 100% used.
      - alert: KubeQuotaFullyUsed
        annotations:
          description:
            Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{
            $labels.resource }} quota.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused
          summary: Namespace quota is fully used.
        expr: |-
          kube_resourcequota{job="kube-state-metrics", type="used"}
            / ignoring(instance, job, type)
          (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
            == 1
        for: 15m
        labels:
          severity: info
      # Quota usage above 100% of the hard limit.
      - alert: KubeQuotaExceeded
        annotations:
          description:
            Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{
            $labels.resource }} quota.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded
          summary: Namespace quota has exceeded the limits.
        expr: |-
          kube_resourcequota{job="kube-state-metrics", type="used"}
            / ignoring(instance, job, type)
          (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
            > 1
        for: 15m
        labels:
          severity: warning
      # More than 25% of a container's CFS scheduling periods were throttled
      # over the last 5 minutes.
      - alert: CPUThrottlingHigh
        annotations:
          description:
            "{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace
            }} for container {{ $labels.container }} in pod {{ $labels.pod }}."
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh
          summary: Processes experience elevated CPU throttling.
        expr: |-
          sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (cluster, container, pod, namespace)
            /
          sum(increase(container_cpu_cfs_periods_total{}[5m])) by (cluster, container, pod, namespace)
            > ( 25 / 100 )
        for: 15m
        labels:
          severity: info

View File

@@ -0,0 +1,7 @@
# Recording Rules
Recording rules are pre-defined queries — often complex or computationally expensive — that are
evaluated periodically to produce new, pre-computed time series.
The results are stored back in the metrics backend, which significantly speeds up dashboard and
alert queries and reduces system load by avoiding repeated re-computation of the same data.

View File

@@ -0,0 +1,149 @@
# Recording rules for per-container resource usage and pod -> workload
# ownership (upstream: kube-prometheus "k8s.rules").
groups:
  - name: k8s.rules
    rules:
      # Per-container CPU usage rate, joined with kube_pod_info to attach the
      # node label; topk(1, ...) deduplicates the pod-info join series.
      - expr: |-
          sum by (cluster, namespace, pod, container) (
            irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
          ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
            1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
          )
        record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
      # Per-container working-set memory, with node label attached.
      - expr: |-
          container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
          * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
            max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
          )
        record: node_namespace_pod_container:container_memory_working_set_bytes
      # Per-container RSS, with node label attached.
      - expr: |-
          container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
          * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
            max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
          )
        record: node_namespace_pod_container:container_memory_rss
      # Per-container page-cache memory, with node label attached.
      - expr: |-
          container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
          * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
            max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
          )
        record: node_namespace_pod_container:container_memory_cache
      # Per-container swap usage, with node label attached.
      - expr: |-
          container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
          * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
            max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
          )
        record: node_namespace_pod_container:container_memory_swap
      # Memory requests of containers whose pod is Pending or Running.
      - expr: |-
          kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
          group_left() max by (namespace, pod, cluster) (
            (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
          )
        record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests
      # Namespace-level sum of memory requests (active pods only).
      - expr: |-
          sum by (namespace, cluster) (
            sum by (namespace, pod, cluster) (
              max by (namespace, pod, container, cluster) (
                kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"}
              ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
                kube_pod_status_phase{phase=~"Pending|Running"} == 1
              )
            )
          )
        record: namespace_memory:kube_pod_container_resource_requests:sum
      # CPU requests of containers whose pod is Pending or Running.
      - expr: |-
          kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
          group_left() max by (namespace, pod, cluster) (
            (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
          )
        record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests
      # Namespace-level sum of CPU requests (active pods only); consumed by the
      # KubeCPUOvercommit alert.
      - expr: |-
          sum by (namespace, cluster) (
            sum by (namespace, pod, cluster) (
              max by (namespace, pod, container, cluster) (
                kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"}
              ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
                kube_pod_status_phase{phase=~"Pending|Running"} == 1
              )
            )
          )
        record: namespace_cpu:kube_pod_container_resource_requests:sum
      # Memory limits of containers whose pod is Pending or Running.
      - expr: |-
          kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
          group_left() max by (namespace, pod, cluster) (
            (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
          )
        record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits
      # Namespace-level sum of memory limits (active pods only).
      - expr: |-
          sum by (namespace, cluster) (
            sum by (namespace, pod, cluster) (
              max by (namespace, pod, container, cluster) (
                kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}
              ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
                kube_pod_status_phase{phase=~"Pending|Running"} == 1
              )
            )
          )
        record: namespace_memory:kube_pod_container_resource_limits:sum
      # CPU limits of containers whose pod is Pending or Running.
      - expr: |-
          kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
          group_left() max by (namespace, pod, cluster) (
            (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
          )
        record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits
      # Namespace-level sum of CPU limits (active pods only).
      - expr: |-
          sum by (namespace, cluster) (
            sum by (namespace, pod, cluster) (
              max by (namespace, pod, container, cluster) (
                kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}
              ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
                kube_pod_status_phase{phase=~"Pending|Running"} == 1
              )
            )
          )
        record: namespace_cpu:kube_pod_container_resource_limits:sum
      # Map each pod to its owning Deployment by following the
      # pod -> ReplicaSet -> Deployment ownership chain.
      - expr: |-
          max by (cluster, namespace, workload, pod) (
            label_replace(
              label_replace(
                kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
                "replicaset", "$1", "owner_name", "(.*)"
              ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (
                1, max by (replicaset, namespace, owner_name) (
                  kube_replicaset_owner{job="kube-state-metrics"}
                )
              ),
              "workload", "$1", "owner_name", "(.*)"
            )
          )
        labels:
          workload_type: deployment
        record: namespace_workload_pod:kube_pod_owner:relabel
      # Map each pod to its owning DaemonSet.
      - expr: |-
          max by (cluster, namespace, workload, pod) (
            label_replace(
              kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
              "workload", "$1", "owner_name", "(.*)"
            )
          )
        labels:
          workload_type: daemonset
        record: namespace_workload_pod:kube_pod_owner:relabel
      # Map each pod to its owning StatefulSet.
      - expr: |-
          max by (cluster, namespace, workload, pod) (
            label_replace(
              kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
              "workload", "$1", "owner_name", "(.*)"
            )
          )
        labels:
          workload_type: statefulset
        record: namespace_workload_pod:kube_pod_owner:relabel
      # Map each pod to its owning Job.
      - expr: |-
          max by (cluster, namespace, workload, pod) (
            label_replace(
              kube_pod_owner{job="kube-state-metrics", owner_kind="Job"},
              "workload", "$1", "owner_name", "(.*)"
            )
          )
        labels:
          workload_type: job
        record: namespace_workload_pod:kube_pod_owner:relabel

View File

@@ -0,0 +1,128 @@
# Node-level recording rules (upstream: kube-prometheus node recording rules
# plus the node-exporter mixin).
groups:
  - name: kube-prometheus-node-recording.rules
    rules:
      # Per-instance busy-CPU rate (every mode except idle/iowait/steal).
      - expr:
          sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY
          (instance)
        record: instance:node_cpu:rate:sum
      # Per-instance network throughput, received and transmitted.
      - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
        record: instance:node_network_receive_bytes:rate:sum
      - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
        record: instance:node_network_transmit_bytes:rate:sum
      # Per-instance CPU utilisation: busy time divided by the CPU count.
      - expr:
          sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT
          (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance,
          cpu)) BY (instance)
        record: instance:node_cpu:ratio
      # Cluster-wide busy-CPU rate, and the same normalised by total CPU count.
      - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
        record: cluster:node_cpu:sum_rate5m
      - expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
        record: cluster:node_cpu:ratio
  - name: node-exporter.rules
    rules:
      # Number of CPUs per instance (one "idle" series exists per CPU).
      - expr: |-
          count without (cpu, mode) (
            node_cpu_seconds_total{job="node-exporter",mode="idle"}
          )
        record: instance:node_num_cpu:sum
      # Fraction of CPU time spent busy (1 minus idle/iowait/steal).
      - expr: |-
          1 - avg without (cpu) (
            sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m]))
          )
        record: instance:node_cpu_utilisation:rate5m
      # 1-minute load average per CPU.
      - expr: |-
          (
            node_load1{job="node-exporter"}
          /
            instance:node_num_cpu:sum{job="node-exporter"}
          )
        record: instance:node_load1_per_cpu:ratio
      # Memory utilisation; the "or" branch approximates MemAvailable as
      # Buffers+Cached+Free+Slab on kernels that do not export it.
      - expr: |-
          1 - (
            (
              node_memory_MemAvailable_bytes{job="node-exporter"}
              or
              (
                node_memory_Buffers_bytes{job="node-exporter"}
                +
                node_memory_Cached_bytes{job="node-exporter"}
                +
                node_memory_MemFree_bytes{job="node-exporter"}
                +
                node_memory_Slab_bytes{job="node-exporter"}
              )
            )
          /
            node_memory_MemTotal_bytes{job="node-exporter"}
          )
        record: instance:node_memory_utilisation:ratio
      # Major page-fault rate (memory pressure indicator).
      - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
        record: instance:node_vmstat_pgmajfault:rate5m
      # Disk I/O saturation, restricted to physical block devices by regex.
      - expr:
          rate(node_disk_io_time_seconds_total{job="node-exporter",
          device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
        record: instance_device:node_disk_io_time_seconds:rate5m
      - expr:
          rate(node_disk_io_time_weighted_seconds_total{job="node-exporter",
          device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
        record: instance_device:node_disk_io_time_weighted_seconds:rate5m
      # Network bytes and drops per instance, excluding the loopback interface.
      - expr: |-
          sum without (device) (
            rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m])
          )
        record: instance:node_network_receive_bytes_excluding_lo:rate5m
      - expr: |-
          sum without (device) (
            rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m])
          )
        record: instance:node_network_transmit_bytes_excluding_lo:rate5m
      - expr: |-
          sum without (device) (
            rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m])
          )
        record: instance:node_network_receive_drop_excluding_lo:rate5m
      - expr: |-
          sum without (device) (
            rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m])
          )
        record: instance:node_network_transmit_drop_excluding_lo:rate5m
  - name: node.rules
    rules:
      # Canonical pod -> node mapping, deduplicated with topk(1, ...).
      - expr: |-
          topk by(cluster, namespace, pod) (1,
            max by (cluster, node, namespace, pod) (
              label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
          ))
        record: "node_namespace_pod:kube_pod_info:"
      # CPU count per node, attributed through the pod -> node mapping above.
      - expr: |-
          count by (cluster, node) (
            node_cpu_seconds_total{mode="idle",job="node-exporter"}
            * on (namespace, pod) group_left(node)
            topk by(namespace, pod) (1, node_namespace_pod:kube_pod_info:)
          )
        record: node:node_num_cpu:sum
      # Cluster-wide available memory (same MemAvailable fallback as above).
      - expr: |-
          sum(
            node_memory_MemAvailable_bytes{job="node-exporter"} or
            (
              node_memory_Buffers_bytes{job="node-exporter"} +
              node_memory_Cached_bytes{job="node-exporter"} +
              node_memory_MemFree_bytes{job="node-exporter"} +
              node_memory_Slab_bytes{job="node-exporter"}
            )
          ) by (cluster)
        record: :node_memory_MemAvailable_bytes:sum
      # Average per-node CPU utilisation.
      - expr: |-
          avg by (cluster, node) (
            sum without (mode) (
              rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal",job="node-exporter"}[5m])
            )
          )
        record: node:node_cpu_utilization:ratio_rate5m
      # Cluster-wide average CPU utilisation.
      - expr: |-
          avg by (cluster) (
            node:node_cpu_utilization:ratio_rate5m
          )
        record: cluster:node_cpu:ratio_rate5m

View File

@@ -50,6 +50,8 @@
labels.type = "app";
labels.app = "dnsmasq";
labels.host = "suzi";
labels.env = "homelab";
labels.cluster = "homelab";
}
];
}
@@ -64,6 +66,8 @@
labels.type = "app";
labels.app = "v2ray";
labels.host = "aquamarine";
labels.env = "homelab";
labels.cluster = "homelab";
}
];
}
@@ -77,6 +81,8 @@
labels.type = "app";
labels.app = "postgresql";
labels.host = "aquamarine";
labels.env = "homelab";
labels.cluster = "homelab";
}
];
}
@@ -90,6 +96,39 @@
labels.type = "app";
labels.app = "sftpgo";
labels.host = "aquamarine";
labels.env = "homelab";
labels.cluster = "homelab";
}
];
}
{
job_name = "alertmanager-embedded-exporter";
scrape_interval = "30s";
metrics_path = "/metrics";
static_configs = [
{
targets = [ "localhost:9093" ];
labels.type = "app";
labels.app = "alertmanager";
labels.host = "aquamarine";
labels.env = "homelab";
labels.cluster = "homelab";
}
];
}
{
job_name = "victoriametrics-embedded-exporter";
scrape_interval = "30s";
metrics_path = "/metrics";
static_configs = [
{
# scrape vm itself
targets = [ "localhost:9090" ];
labels.type = "app";
labels.app = "victoriametrics";
labels.host = "aquamarine";
labels.env = "homelab";
labels.cluster = "homelab";
}
];
}
@@ -109,6 +148,8 @@
targets = [ "${addr.ipv4}:9100" ];
labels.type = "node";
labels.host = hostname;
labels.env = "homelab";
labels.cluster = "homelab";
}
];
}