feat: Grafana Dashboards & VMAlert (#224)

* chore: grafana - remove useless dashboards * fix: alertmanager - metrics * fix: victoria-metrics - job filter * feat: add recording rules * fix: grafana - add or update uid for all dashboards * fix: vmalert - remoteWrite
2026-05-25 00:50:01 +02:00 · 2025-09-26 19:10:32 +08:00
parent 3ac0cea3cc 38f9a3e1bb
commit 5e727543c1
27 changed files with 1835 additions and 4666 deletions
@@ -73,11 +73,7 @@
      "cacheTimeout": null,
      "colorBackground": false,
      "colorValue": true,
-      "colors": [
-        "#299c46",
-        "#7eb26d",
-        "#d44a3a"
-      ],
+      "colors": ["#299c46", "#7eb26d", "#d44a3a"],
      "datasource": "${DS_PROMETHEUS}",
      "format": "none",
      "gauge": {
@@ -156,11 +152,7 @@
      "cacheTimeout": null,
      "colorBackground": false,
      "colorValue": false,
-      "colors": [
-        "#299c46",
-        "rgba(237, 129, 40, 0.89)",
-        "#d44a3a"
-      ],
+      "colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
      "datasource": "${DS_PROMETHEUS}",
      "description": "start time of the process",
      "format": "dateTimeFromNow",
@@ -239,11 +231,7 @@
      "cacheTimeout": null,
      "colorBackground": false,
      "colorValue": false,
-      "colors": [
-        "rgba(245, 54, 54, 0.9)",
-        "rgba(237, 129, 40, 0.89)",
-        "rgba(50, 172, 45, 0.97)"
-      ],
+      "colors": ["rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)"],
      "datasource": "${DS_PROMETHEUS}",
      "format": "decbytes",
      "gauge": {
@@ -322,11 +310,7 @@
      "cacheTimeout": null,
      "colorBackground": false,
      "colorValue": false,
-      "colors": [
-        "rgba(245, 54, 54, 0.9)",
-        "rgba(237, 129, 40, 0.89)",
-        "rgba(50, 172, 45, 0.97)"
-      ],
+      "colors": ["rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)"],
      "datasource": "${DS_PROMETHEUS}",
      "format": "decbytes",
      "gauge": {
@@ -405,11 +389,7 @@
      "cacheTimeout": null,
      "colorBackground": false,
      "colorValue": false,
-      "colors": [
-        "rgba(245, 54, 54, 0.9)",
-        "rgba(237, 129, 40, 0.89)",
-        "rgba(50, 172, 45, 0.97)"
-      ],
+      "colors": ["rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)"],
      "datasource": "${DS_PROMETHEUS}",
      "format": "decbytes",
      "gauge": {
@@ -488,11 +468,7 @@
      "cacheTimeout": null,
      "colorBackground": false,
      "colorValue": false,
-      "colors": [
-        "#299c46",
-        "rgba(237, 129, 40, 0.89)",
-        "#d44a3a"
-      ],
+      "colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
      "datasource": "${DS_PROMETHEUS}",
      "format": "none",
      "gauge": {
@@ -864,11 +840,7 @@
      "cacheTimeout": null,
      "colorBackground": false,
      "colorValue": false,
-      "colors": [
-        "#299c46",
-        "rgba(237, 129, 40, 0.89)",
-        "#d44a3a"
-      ],
+      "colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
      "datasource": "${DS_PROMETHEUS}",
      "format": "bytes",
      "gauge": {
@@ -945,11 +917,7 @@
      "cacheTimeout": null,
      "colorBackground": false,
      "colorValue": false,
-      "colors": [
-        "#299c46",
-        "rgba(237, 129, 40, 0.89)",
-        "#d44a3a"
-      ],
+      "colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
      "datasource": "${DS_PROMETHEUS}",
      "format": "bytes",
      "gauge": {
@@ -1026,11 +994,7 @@
      "cacheTimeout": null,
      "colorBackground": false,
      "colorValue": false,
-      "colors": [
-        "#299c46",
-        "rgba(237, 129, 40, 0.89)",
-        "#d44a3a"
-      ],
+      "colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
      "datasource": "${DS_PROMETHEUS}",
      "format": "bytes",
      "gauge": {
@@ -1107,11 +1071,7 @@
      "cacheTimeout": null,
      "colorBackground": false,
      "colorValue": false,
-      "colors": [
-        "#299c46",
-        "rgba(237, 129, 40, 0.89)",
-        "#d44a3a"
-      ],
+      "colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
      "datasource": "${DS_PROMETHEUS}",
      "format": "bytes",
      "gauge": {
@@ -1189,11 +1149,7 @@
      "cacheTimeout": null,
      "colorBackground": false,
      "colorValue": false,
-      "colors": [
-        "#299c46",
-        "rgba(237, 129, 40, 0.89)",
-        "#d44a3a"
-      ],
+      "colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
      "datasource": "${DS_PROMETHEUS}",
      "decimals": 1,
      "format": "bytes",
@@ -1271,11 +1227,7 @@
      "cacheTimeout": null,
      "colorBackground": false,
      "colorValue": false,
-      "colors": [
-        "#299c46",
-        "rgba(237, 129, 40, 0.89)",
-        "#d44a3a"
-      ],
+      "colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
      "datasource": "${DS_PROMETHEUS}",
      "format": "none",
      "gauge": {
@@ -1352,11 +1304,7 @@
      "cacheTimeout": null,
      "colorBackground": false,
      "colorValue": false,
-      "colors": [
-        "#299c46",
-        "rgba(237, 129, 40, 0.89)",
-        "#d44a3a"
-      ],
+      "colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
      "datasource": "${DS_PROMETHEUS}",
      "format": "none",
      "gauge": {
@@ -1433,11 +1381,7 @@
      "cacheTimeout": null,
      "colorBackground": false,
      "colorValue": false,
-      "colors": [
-        "#299c46",
-        "rgba(237, 129, 40, 0.89)",
-        "#d44a3a"
-      ],
+      "colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
      "datasource": "${DS_PROMETHEUS}",
      "format": "none",
      "gauge": {
@@ -1514,11 +1458,7 @@
      "cacheTimeout": null,
      "colorBackground": false,
      "colorValue": false,
-      "colors": [
-        "#299c46",
-        "rgba(237, 129, 40, 0.89)",
-        "#d44a3a"
-      ],
+      "colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
      "datasource": "${DS_PROMETHEUS}",
      "format": "none",
      "gauge": {
@@ -2944,11 +2884,7 @@
  "refresh": "10s",
  "schemaVersion": 19,
  "style": "dark",
-  "tags": [
-    "postgres",
-    "db",
-    "stats"
-  ],
+  "tags": ["postgres", "db", "stats"],
  "templating": {
    "list": [
      {
@@ -3136,32 +3072,11 @@
    "to": "now"
  },
  "timepicker": {
-    "refresh_intervals": [
-      "5s",
-      "10s",
-      "30s",
-      "1m",
-      "5m",
-      "15m",
-      "30m",
-      "1h",
-      "2h",
-      "1d"
-    ],
-    "time_options": [
-      "5m",
-      "15m",
-      "1h",
-      "6h",
-      "12h",
-      "24h",
-      "2d",
-      "7d",
-      "30d"
-    ]
+    "refresh_intervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"],
+    "time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"]
  },
  "timezone": "",
  "title": "PostgreSQL Database",
-  "uid": "000000039",
+  "uid": "postgresql-database",
  "version": 1
-}
+}
@@ -11139,6 +11139,6 @@
  },
  "timezone": "",
  "title": "Alertmanager",
-  "uid": "eea-9_sik",
+  "uid": "alertmanager",
  "version": 27
 }
@@ -23262,7 +23262,7 @@
  },
  "timezone": "browser",
  "title": "Node Exporter Full",
-  "uid": "rYdddlPWk",
+  "uid": "node-exporter-full",
  "version": 87,
  "weekStart": ""
 }
@@ -853,19 +853,11 @@
    "to": "now"
  },
  "timepicker": {
-    "refresh_intervals": [
-      "30s",
-      "1m",
-      "5m",
-      "15m",
-      "30m",
-      "1h",
-      "2h",
-      "1d"
-    ]
+    "refresh_intervals": ["30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"]
  },
  "timezone": "",
  "title": "Istio Wasm Extension Dashboard",
+  "uid": "istio-wasm-extension",
  "version": 1,
  "weekStart": ""
 }
@@ -114,9 +114,7 @@
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
-          "calcs": [
-            "mean"
-          ],
+          "calcs": ["mean"],
          "fields": "",
          "values": false
        },
@@ -196,9 +194,7 @@
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
-          "calcs": [
-            "mean"
-          ],
+          "calcs": ["mean"],
          "fields": "",
          "values": false
        },
@@ -275,9 +271,7 @@
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
-          "calcs": [
-            "mean"
-          ],
+          "calcs": ["mean"],
          "fields": "",
          "values": false
        },
@@ -354,9 +348,7 @@
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
-          "calcs": [
-            "mean"
-          ],
+          "calcs": ["mean"],
          "fields": "",
          "values": false
        },
@@ -433,9 +425,7 @@
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
+          "calcs": ["lastNotNull"],
          "fields": "",
          "values": false
        },
@@ -511,9 +501,7 @@
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
+          "calcs": ["lastNotNull"],
          "fields": "",
          "values": false
        },
@@ -589,9 +577,7 @@
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
+          "calcs": ["lastNotNull"],
          "fields": "",
          "values": false
        },
@@ -667,9 +653,7 @@
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
+          "calcs": ["lastNotNull"],
          "fields": "",
          "values": false
        },
@@ -745,9 +729,7 @@
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
+          "calcs": ["lastNotNull"],
          "fields": "",
          "values": false
        },
@@ -823,9 +805,7 @@
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
+          "calcs": ["lastNotNull"],
          "fields": "",
          "values": false
        },
@@ -901,9 +881,7 @@
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
+          "calcs": ["lastNotNull"],
          "fields": "",
          "values": false
        },
@@ -979,9 +957,7 @@
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
+          "calcs": ["lastNotNull"],
          "fields": "",
          "values": false
        },
@@ -1329,9 +1305,7 @@
        "footer": {
          "countRows": false,
          "fields": "",
-          "reducer": [
-            "sum"
-          ],
+          "reducer": ["sum"],
          "show": false
        },
        "showHeader": true
@@ -1466,9 +1440,7 @@
        "cellHeight": "sm",
        "footer": {
          "show": false,
-          "reducer": [
-            "sum"
-          ],
+          "reducer": ["sum"],
          "countRows": false,
          "fields": ""
        }
@@ -1832,30 +1804,12 @@
    "to": "now"
  },
  "timepicker": {
-    "refresh_intervals": [
-      "30s",
-      "1m",
-      "5m",
-      "15m",
-      "30m",
-      "1h",
-      "2h",
-      "1d"
-    ],
-    "time_options": [
-      "5m",
-      "15m",
-      "1h",
-      "6h",
-      "12h",
-      "24h",
-      "2d",
-      "7d",
-      "30d"
-    ]
+    "refresh_intervals": ["30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"],
+    "time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"]
  },
  "timezone": "browser",
  "title": "Istio Mesh Dashboard",
+  "uid": "istio-mesh",
  "version": 1,
  "weekStart": ""
-}
+}
@@ -1574,30 +1574,12 @@
    "to": "now"
  },
  "timepicker": {
-    "refresh_intervals": [
-      "30s",
-      "1m",
-      "5m",
-      "15m",
-      "30m",
-      "1h",
-      "2h",
-      "1d"
-    ],
-    "time_options": [
-      "5m",
-      "15m",
-      "1h",
-      "6h",
-      "12h",
-      "24h",
-      "2d",
-      "7d",
-      "30d"
-    ]
+    "refresh_intervals": ["30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"],
+    "time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"]
  },
  "timezone": "",
  "title": "Istio Performance Dashboard",
+  "uid": "istio-performance",
  "version": 1,
  "weekStart": ""
 }
@@ -123,9 +123,7 @@
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
+          "calcs": ["lastNotNull"],
          "fields": "",
          "values": false
        },
@@ -197,9 +195,7 @@
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
+          "calcs": ["lastNotNull"],
          "fields": "",
          "values": false
        },
@@ -398,9 +394,7 @@
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
-          "calcs": [
-            "mean"
-          ],
+          "calcs": ["mean"],
          "fields": "",
          "values": false
        },
@@ -478,9 +472,7 @@
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
+          "calcs": ["lastNotNull"],
          "fields": "",
          "values": false
        },
@@ -552,9 +544,7 @@
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
+          "calcs": ["lastNotNull"],
          "fields": "",
          "values": false
        },
@@ -753,9 +743,7 @@
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
-          "calcs": [
-            "mean"
-          ],
+          "calcs": ["mean"],
          "fields": "",
          "values": false
        },
@@ -3368,28 +3356,12 @@
    "to": "now"
  },
  "timepicker": {
-    "refresh_intervals": [
-      "5m",
-      "15m",
-      "30m",
-      "1h",
-      "2h",
-      "1d"
-    ],
-    "time_options": [
-      "5m",
-      "15m",
-      "1h",
-      "6h",
-      "12h",
-      "24h",
-      "2d",
-      "7d",
-      "30d"
-    ]
+    "refresh_intervals": ["5m", "15m", "30m", "1h", "2h", "1d"],
+    "time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"]
  },
  "timezone": "",
  "title": "Istio Service Dashboard",
+  "uid": "istio-service",
  "version": 1,
  "weekStart": ""
 }
@@ -123,9 +123,7 @@
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
+          "calcs": ["lastNotNull"],
          "fields": "",
          "values": false
        },
@@ -206,9 +204,7 @@
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
-          "calcs": [
-            "mean"
-          ],
+          "calcs": ["mean"],
          "fields": "",
          "values": false
        },
@@ -405,9 +401,7 @@
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
-          "calcs": [
-            "mean"
-          ],
+          "calcs": ["mean"],
          "fields": "",
          "values": false
        },
@@ -485,9 +479,7 @@
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
-          "calcs": [
-            "mean"
-          ],
+          "calcs": ["mean"],
          "fields": "",
          "values": false
        },
@@ -3040,28 +3032,12 @@
    "to": "now"
  },
  "timepicker": {
-    "refresh_intervals": [
-      "5m",
-      "15m",
-      "30m",
-      "1h",
-      "2h",
-      "1d"
-    ],
-    "time_options": [
-      "5m",
-      "15m",
-      "1h",
-      "6h",
-      "12h",
-      "24h",
-      "2d",
-      "7d",
-      "30d"
-    ]
+    "refresh_intervals": ["5m", "15m", "30m", "1h", "2h", "1d"],
+    "time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"]
  },
  "timezone": "",
  "title": "Istio Workload Dashboard",
+  "uid": "istio-workload",
  "version": 1,
  "weekStart": ""
 }
@@ -1,458 +1,434 @@
 {
-   "graphTooltip": 1,
-   "panels": [
-      {
-         "collapsed": false,
-         "gridPos": {
-            "h": 1,
-            "w": 24,
-            "x": 0,
-            "y": 0
-         },
-         "id": 1,
-         "panels": [ ],
-         "title": "Process",
-         "type": "row"
+  "graphTooltip": 1,
+  "panels": [
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 0
      },
-      {
-         "datasource": {
-            "type": "datasource",
-            "uid": "-- Mixed --"
-         },
-         "description": "Version number of each running instance",
-         "fieldConfig": {
-            "defaults": {
-               "custom": {
-                  "fillOpacity": 10,
-                  "gradientMode": "hue",
-                  "showPoints": "never"
-               }
-            }
-         },
-         "gridPos": {
-            "h": 8,
-            "w": 8,
-            "x": 0,
-            "y": 1
-         },
-         "id": 2,
-         "interval": "5s",
-         "options": {
-            "legend": {
-               "calcs": [
-                  "last",
-                  "max"
-               ],
-               "displayMode": "table"
-            }
-         },
-         "pluginVersion": "v11.0.0",
-         "targets": [
-            {
-               "datasource": {
-                  "type": "prometheus",
-                  "uid": "$datasource"
-               },
-               "expr": "sum(istio_build{component=\"ztunnel\"}) by (tag)",
-               "legendFormat": "Version ({{tag}})"
-            }
-         ],
-         "title": "Ztunnel Versions",
-         "type": "timeseries"
+      "id": 1,
+      "panels": [],
+      "title": "Process",
+      "type": "row"
+    },
+    {
+      "datasource": {
+        "type": "datasource",
+        "uid": "-- Mixed --"
      },
-      {
-         "datasource": {
-            "type": "datasource",
-            "uid": "-- Mixed --"
-         },
-         "description": "Memory usage of each running instance",
-         "fieldConfig": {
-            "defaults": {
-               "custom": {
-                  "fillOpacity": 10,
-                  "gradientMode": "hue",
-                  "showPoints": "never"
-               },
-               "unit": "bytes"
-            }
-         },
-         "gridPos": {
-            "h": 8,
-            "w": 8,
-            "x": 8,
-            "y": 1
-         },
-         "id": 3,
-         "interval": "5s",
-         "options": {
-            "legend": {
-               "calcs": [
-                  "last",
-                  "max"
-               ],
-               "displayMode": "table"
-            }
-         },
-         "pluginVersion": "v11.0.0",
-         "targets": [
-            {
-               "datasource": {
-                  "type": "prometheus",
-                  "uid": "$datasource"
-               },
-               "expr": "sum by (pod) (\n  container_memory_working_set_bytes{container=\"istio-proxy\", pod=~\"ztunnel-.*\"}\n)",
-               "legendFormat": "Container ({{pod}})"
-            }
-         ],
-         "title": "Memory Usage",
-         "type": "timeseries"
+      "description": "Version number of each running instance",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "fillOpacity": 10,
+            "gradientMode": "hue",
+            "showPoints": "never"
+          }
+        }
      },
-      {
-         "datasource": {
-            "type": "datasource",
-            "uid": "-- Mixed --"
-         },
-         "description": "CPU usage of each running instance",
-         "fieldConfig": {
-            "defaults": {
-               "custom": {
-                  "fillOpacity": 10,
-                  "gradientMode": "hue",
-                  "showPoints": "never"
-               }
-            }
-         },
-         "gridPos": {
-            "h": 8,
-            "w": 8,
-            "x": 16,
-            "y": 1
-         },
-         "id": 4,
-         "interval": "5s",
-         "options": {
-            "legend": {
-               "calcs": [
-                  "last",
-                  "max"
-               ],
-               "displayMode": "table"
-            }
-         },
-         "pluginVersion": "v11.0.0",
-         "targets": [
-            {
-               "datasource": {
-                  "type": "prometheus",
-                  "uid": "$datasource"
-               },
-               "expr": "sum by (pod) (\n  irate(\n    container_cpu_usage_seconds_total{container=\"istio-proxy\", pod=~\"ztunnel-.*\"}\n  [$__rate_interval])\n)",
-               "legendFormat": "Container ({{pod}})"
-            }
-         ],
-         "title": "CPU Usage",
-         "type": "timeseries"
+      "gridPos": {
+        "h": 8,
+        "w": 8,
+        "x": 0,
+        "y": 1
      },
-      {
-         "collapsed": false,
-         "gridPos": {
-            "h": 1,
-            "w": 24,
-            "x": 0,
-            "y": 9
-         },
-         "id": 5,
-         "panels": [ ],
-         "title": "Network",
-         "type": "row"
+      "id": 2,
+      "interval": "5s",
+      "options": {
+        "legend": {
+          "calcs": ["last", "max"],
+          "displayMode": "table"
+        }
      },
-      {
-         "datasource": {
-            "type": "datasource",
-            "uid": "-- Mixed --"
-         },
-         "description": "Connections opened and closed per instance",
-         "fieldConfig": {
-            "defaults": {
-               "custom": {
-                  "fillOpacity": 10,
-                  "gradientMode": "hue",
-                  "showPoints": "never"
-               },
-               "unit": "cps"
-            }
-         },
-         "gridPos": {
-            "h": 8,
-            "w": 8,
-            "x": 0,
-            "y": 10
-         },
-         "id": 6,
-         "interval": "5s",
-         "options": {
-            "legend": {
-               "calcs": [
-                  "last",
-                  "max"
-               ],
-               "displayMode": "table"
-            }
-         },
-         "pluginVersion": "v11.0.0",
-         "targets": [
-            {
-               "datasource": {
-                  "type": "prometheus",
-                  "uid": "$datasource"
-               },
-               "expr": "sum by (pod) (\n  rate(\n    istio_tcp_connections_opened_total{pod=~\"ztunnel-.*\"}\n  [$__rate_interval])\n)",
-               "legendFormat": "Opened ({{pod}})"
-            },
-            {
-               "datasource": {
-                  "type": "prometheus",
-                  "uid": "$datasource"
-               },
-               "expr": "-sum by (pod) (\n  rate(\n    istio_tcp_connections_closed_total{pod=~\"ztunnel-.*\"}\n  [$__rate_interval])\n)",
-               "legendFormat": "Closed ({{pod}})"
-            }
-         ],
-         "title": "Connections",
-         "type": "timeseries"
+      "pluginVersion": "v11.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "$datasource"
+          },
+          "expr": "sum(istio_build{component=\"ztunnel\"}) by (tag)",
+          "legendFormat": "Version ({{tag}})"
+        }
+      ],
+      "title": "Ztunnel Versions",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "datasource",
+        "uid": "-- Mixed --"
      },
-      {
-         "datasource": {
-            "type": "datasource",
-            "uid": "-- Mixed --"
-         },
-         "description": "Bytes sent and received per instance",
-         "fieldConfig": {
-            "defaults": {
-               "custom": {
-                  "fillOpacity": 10,
-                  "gradientMode": "hue",
-                  "showPoints": "never"
-               },
-               "unit": "Bps"
-            }
-         },
-         "gridPos": {
-            "h": 8,
-            "w": 8,
-            "x": 8,
-            "y": 10
-         },
-         "id": 7,
-         "interval": "5s",
-         "options": {
-            "legend": {
-               "calcs": [
-                  "last",
-                  "max"
-               ],
-               "displayMode": "table"
-            }
-         },
-         "pluginVersion": "v11.0.0",
-         "targets": [
-            {
-               "datasource": {
-                  "type": "prometheus",
-                  "uid": "$datasource"
-               },
-               "expr": "sum by (pod) (\n  rate(\n    istio_tcp_sent_bytes_total{pod=~\"ztunnel-.*\"}\n  [$__rate_interval])\n)",
-               "legendFormat": "Sent ({{pod}})"
-            },
-            {
-               "datasource": {
-                  "type": "prometheus",
-                  "uid": "$datasource"
-               },
-               "expr": "sum by (pod) (\n  rate(\n    istio_tcp_received_bytes_total{pod=~\"ztunnel-.*\"}\n  [$__rate_interval])\n)",
-               "legendFormat": "Received ({{pod}})"
-            }
-         ],
-         "title": "Bytes Transmitted",
-         "type": "timeseries"
+      "description": "Memory usage of each running instance",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "fillOpacity": 10,
+            "gradientMode": "hue",
+            "showPoints": "never"
+          },
+          "unit": "bytes"
+        }
      },
-      {
-         "datasource": {
-            "type": "datasource",
-            "uid": "-- Mixed --"
-         },
-         "description": "DNS queries received per instance",
-         "fieldConfig": {
-            "defaults": {
-               "custom": {
-                  "fillOpacity": 10,
-                  "gradientMode": "hue",
-                  "showPoints": "never"
-               },
-               "unit": "qps"
-            }
-         },
-         "gridPos": {
-            "h": 8,
-            "w": 8,
-            "x": 16,
-            "y": 10
-         },
-         "id": 8,
-         "interval": "5s",
-         "options": {
-            "legend": {
-               "calcs": [
-                  "last",
-                  "max"
-               ],
-               "displayMode": "table"
-            }
-         },
-         "pluginVersion": "v11.0.0",
-         "targets": [
-            {
-               "datasource": {
-                  "type": "prometheus",
-                  "uid": "$datasource"
-               },
-               "expr": "sum by (pod) (\n  rate(\n    istio_dns_requests_total{pod=~\"ztunnel-.*\"}\n  [$__rate_interval])\n)",
-               "legendFormat": "Request ({{pod}})"
-            }
-         ],
-         "title": "DNS Request",
-         "type": "timeseries"
+      "gridPos": {
+        "h": 8,
+        "w": 8,
+        "x": 8,
+        "y": 1
      },
-      {
-         "collapsed": false,
-         "gridPos": {
-            "h": 1,
-            "w": 24,
-            "x": 0,
-            "y": 18
-         },
-         "id": 9,
-         "panels": [ ],
-         "title": "Operations",
-         "type": "row"
+      "id": 3,
+      "interval": "5s",
+      "options": {
+        "legend": {
+          "calcs": ["last", "max"],
+          "displayMode": "table"
+        }
      },
-      {
-         "datasource": {
-            "type": "datasource",
-            "uid": "-- Mixed --"
-         },
-         "description": "Count of XDS connection terminations.\nThis will typically spike every 30min for each instance.\n",
-         "fieldConfig": {
-            "defaults": {
-               "custom": {
-                  "fillOpacity": 10,
-                  "gradientMode": "hue",
-                  "showPoints": "never"
-               }
-            }
-         },
-         "gridPos": {
-            "h": 8,
-            "w": 12,
-            "x": 0,
-            "y": 19
-         },
-         "id": 10,
-         "interval": "5s",
-         "options": {
-            "legend": {
-               "calcs": [
-                  "last",
-                  "max"
-               ],
-               "displayMode": "table"
-            }
-         },
-         "pluginVersion": "v11.0.0",
-         "targets": [
-            {
-               "datasource": {
-                  "type": "prometheus",
-                  "uid": "$datasource"
-               },
-               "expr": "sum by (pod) (\n  rate(\n    istio_xds_connection_terminations_total{pod=~\"ztunnel-.*\"}\n  [$__rate_interval])\n)",
-               "legendFormat": "XDS Connection Terminations ({{pod}})"
-            }
-         ],
-         "title": "XDS",
-         "type": "timeseries"
+      "pluginVersion": "v11.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "$datasource"
+          },
+          "expr": "sum by (pod) (\n  container_memory_working_set_bytes{container=\"istio-proxy\", pod=~\"ztunnel-.*\"}\n)",
+          "legendFormat": "Container ({{pod}})"
+        }
+      ],
+      "title": "Memory Usage",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "datasource",
+        "uid": "-- Mixed --"
      },
+      "description": "CPU usage of each running instance",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "fillOpacity": 10,
+            "gradientMode": "hue",
+            "showPoints": "never"
+          }
+        }
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 8,
+        "x": 16,
+        "y": 1
+      },
+      "id": 4,
+      "interval": "5s",
+      "options": {
+        "legend": {
+          "calcs": ["last", "max"],
+          "displayMode": "table"
+        }
+      },
+      "pluginVersion": "v11.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "$datasource"
+          },
+          "expr": "sum by (pod) (\n  irate(\n    container_cpu_usage_seconds_total{container=\"istio-proxy\", pod=~\"ztunnel-.*\"}\n  [$__rate_interval])\n)",
+          "legendFormat": "Container ({{pod}})"
+        }
+      ],
+      "title": "CPU Usage",
+      "type": "timeseries"
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 9
+      },
+      "id": 5,
+      "panels": [],
+      "title": "Network",
+      "type": "row"
+    },
+    {
+      "datasource": {
+        "type": "datasource",
+        "uid": "-- Mixed --"
+      },
+      "description": "Connections opened and closed per instance",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "fillOpacity": 10,
+            "gradientMode": "hue",
+            "showPoints": "never"
+          },
+          "unit": "cps"
+        }
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 8,
+        "x": 0,
+        "y": 10
+      },
+      "id": 6,
+      "interval": "5s",
+      "options": {
+        "legend": {
+          "calcs": ["last", "max"],
+          "displayMode": "table"
+        }
+      },
+      "pluginVersion": "v11.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "$datasource"
+          },
+          "expr": "sum by (pod) (\n  rate(\n    istio_tcp_connections_opened_total{pod=~\"ztunnel-.*\"}\n  [$__rate_interval])\n)",
+          "legendFormat": "Opened ({{pod}})"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "$datasource"
+          },
+          "expr": "-sum by (pod) (\n  rate(\n    istio_tcp_connections_closed_total{pod=~\"ztunnel-.*\"}\n  [$__rate_interval])\n)",
+          "legendFormat": "Closed ({{pod}})"
+        }
+      ],
+      "title": "Connections",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "datasource",
+        "uid": "-- Mixed --"
+      },
+      "description": "Bytes sent and received per instance",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "fillOpacity": 10,
+            "gradientMode": "hue",
+            "showPoints": "never"
+          },
+          "unit": "Bps"
+        }
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 8,
+        "x": 8,
+        "y": 10
+      },
+      "id": 7,
+      "interval": "5s",
+      "options": {
+        "legend": {
+          "calcs": ["last", "max"],
+          "displayMode": "table"
+        }
+      },
+      "pluginVersion": "v11.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "$datasource"
+          },
+          "expr": "sum by (pod) (\n  rate(\n    istio_tcp_sent_bytes_total{pod=~\"ztunnel-.*\"}\n  [$__rate_interval])\n)",
+          "legendFormat": "Sent ({{pod}})"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "$datasource"
+          },
+          "expr": "sum by (pod) (\n  rate(\n    istio_tcp_received_bytes_total{pod=~\"ztunnel-.*\"}\n  [$__rate_interval])\n)",
+          "legendFormat": "Received ({{pod}})"
+        }
+      ],
+      "title": "Bytes Transmitted",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "datasource",
+        "uid": "-- Mixed --"
+      },
+      "description": "DNS queries received per instance",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "fillOpacity": 10,
+            "gradientMode": "hue",
+            "showPoints": "never"
+          },
+          "unit": "qps"
+        }
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 8,
+        "x": 16,
+        "y": 10
+      },
+      "id": 8,
+      "interval": "5s",
+      "options": {
+        "legend": {
+          "calcs": ["last", "max"],
+          "displayMode": "table"
+        }
+      },
+      "pluginVersion": "v11.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "$datasource"
+          },
+          "expr": "sum by (pod) (\n  rate(\n    istio_dns_requests_total{pod=~\"ztunnel-.*\"}\n  [$__rate_interval])\n)",
+          "legendFormat": "Request ({{pod}})"
+        }
+      ],
+      "title": "DNS Request",
+      "type": "timeseries"
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 18
+      },
+      "id": 9,
+      "panels": [],
+      "title": "Operations",
+      "type": "row"
+    },
+    {
+      "datasource": {
+        "type": "datasource",
+        "uid": "-- Mixed --"
+      },
+      "description": "Count of XDS connection terminations.\nThis will typically spike every 30min for each instance.\n",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "fillOpacity": 10,
+            "gradientMode": "hue",
+            "showPoints": "never"
+          }
+        }
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 19
+      },
+      "id": 10,
+      "interval": "5s",
+      "options": {
+        "legend": {
+          "calcs": ["last", "max"],
+          "displayMode": "table"
+        }
+      },
+      "pluginVersion": "v11.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "$datasource"
+          },
+          "expr": "sum by (pod) (\n  rate(\n    istio_xds_connection_terminations_total{pod=~\"ztunnel-.*\"}\n  [$__rate_interval])\n)",
+          "legendFormat": "XDS Connection Terminations ({{pod}})"
+        }
+      ],
+      "title": "XDS",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "datasource",
+        "uid": "-- Mixed --"
+      },
+      "description": "Count of active and pending proxies managed by each instance.\nPending is expected to converge to zero.\n",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "fillOpacity": 10,
+            "gradientMode": "hue",
+            "showPoints": "never"
+          }
+        }
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 19
+      },
+      "id": 11,
+      "interval": "5s",
+      "options": {
+        "legend": {
+          "calcs": ["last", "max"],
+          "displayMode": "table"
+        }
+      },
+      "pluginVersion": "v11.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "$datasource"
+          },
+          "expr": "sum by (pod) (workload_manager_active_proxy_count{pod=~\"ztunnel-.*\"})",
+          "legendFormat": "Active Proxies ({{pod}})"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "$datasource"
+          },
+          "expr": "sum by (pod) (workload_manager_pending_proxy_count{pod=~\"ztunnel-.*\"})",
+          "legendFormat": "Pending Proxies ({{pod}})"
+        }
+      ],
+      "title": "Workload Manager",
+      "type": "timeseries"
+    }
+  ],
+  "refresh": "15s",
+  "schemaVersion": 39,
+  "templating": {
+    "list": [
      {
-         "datasource": {
-            "type": "datasource",
-            "uid": "-- Mixed --"
-         },
-         "description": "Count of active and pending proxies managed by each instance.\nPending is expected to converge to zero.\n",
-         "fieldConfig": {
-            "defaults": {
-               "custom": {
-                  "fillOpacity": 10,
-                  "gradientMode": "hue",
-                  "showPoints": "never"
-               }
-            }
-         },
-         "gridPos": {
-            "h": 8,
-            "w": 12,
-            "x": 12,
-            "y": 19
-         },
-         "id": 11,
-         "interval": "5s",
-         "options": {
-            "legend": {
-               "calcs": [
-                  "last",
-                  "max"
-               ],
-               "displayMode": "table"
-            }
-         },
-         "pluginVersion": "v11.0.0",
-         "targets": [
-            {
-               "datasource": {
-                  "type": "prometheus",
-                  "uid": "$datasource"
-               },
-               "expr": "sum by (pod) (workload_manager_active_proxy_count{pod=~\"ztunnel-.*\"})",
-               "legendFormat": "Active Proxies ({{pod}})"
-            },
-            {
-               "datasource": {
-                  "type": "prometheus",
-                  "uid": "$datasource"
-               },
-               "expr": "sum by (pod) (workload_manager_pending_proxy_count{pod=~\"ztunnel-.*\"})",
-               "legendFormat": "Pending Proxies ({{pod}})"
-            }
-         ],
-         "title": "Workload Manager",
-         "type": "timeseries"
+        "name": "datasource",
+        "query": "prometheus",
+        "type": "datasource"
      }
-   ],
-   "refresh": "15s",
-   "schemaVersion": 39,
-   "templating": {
-      "list": [
-         {
-            "name": "datasource",
-            "query": "prometheus",
-            "type": "datasource"
-         }
-      ]
-   },
-   "time": {
-      "from": "now-30m",
-      "to": "now"
-   },
-   "timezone": "utc",
-   "title": "Istio Ztunnel Dashboard",
-   "uid": "12c58766acc81a1c835dd5059eaf2741"
+    ]
+  },
+  "time": {
+    "from": "now-30m",
+    "to": "now"
+  },
+  "timezone": "utc",
+  "title": "Istio Ztunnel Dashboard",
+  "uid": "istio-ztunnel"
 }
@@ -4572,11 +4572,7 @@
  "refresh": "1m",
  "schemaVersion": 26,
  "style": "dark",
-  "tags": [
-    "kubevirt",
-    "kubevirt-control-plane",
-    "sig-scale"
-  ],
+  "tags": ["kubevirt", "kubevirt-control-plane", "sig-scale"],
  "templating": {
    "list": [
      {
@@ -5165,32 +5161,11 @@
    "to": "now"
  },
  "timepicker": {
-    "refresh_intervals": [
-      "5s",
-      "10s",
-      "30s",
-      "1m",
-      "5m",
-      "15m",
-      "30m",
-      "1h",
-      "2h",
-      "1d"
-    ],
-    "time_options": [
-      "5m",
-      "15m",
-      "1h",
-      "6h",
-      "12h",
-      "24h",
-      "2d",
-      "7d",
-      "30d"
-    ]
+    "refresh_intervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"],
+    "time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"]
  },
  "timezone": "UTC",
  "title": "KubeVirt / Control Plane",
-  "uid": "V1Qq_IBM_za0",
+  "uid": "kubevirt-control-plane",
  "version": 3
-}
+}
@@ -1157,6 +1157,6 @@
  },
  "timezone": "utc",
  "title": "Loki / Chunks",
-  "uid": "chunks",
+  "uid": "loki-chunks",
  "version": 0
 }
@@ -720,6 +720,6 @@
  },
  "timezone": "utc",
  "title": "Loki / Deletion",
-  "uid": "deletion",
+  "uid": "loki-deletion",
  "version": 0
 }
@@ -1032,6 +1032,6 @@
  },
  "timezone": "utc",
  "title": "Loki / Logs",
-  "uid": "logs",
+  "uid": "loki-logs",
  "version": 0
 }
@@ -6701,6 +6701,6 @@
  },
  "timezone": "utc",
  "title": "Loki / Operational",
-  "uid": "operational",
+  "uid": "loki-operational",
  "version": 0
 }
@@ -1464,6 +1464,6 @@
  },
  "timezone": "utc",
  "title": "Loki / Retention",
-  "uid": "retention",
+  "uid": "loki-retention",
  "version": 0
 }
@@ -6469,12 +6469,12 @@
          "type": "prometheus",
          "uid": "$ds"
        },
-        "definition": "label_values(vm_app_version{version=~\"victoria-metrics-.*\"}, job)",
+        "definition": "label_values(vm_app_version{}, job)",
        "includeAll": false,
        "name": "job",
        "options": [],
        "query": {
-          "query": "label_values(vm_app_version{version=~\"victoria-metrics-.*\"}, job)",
+          "query": "label_values(vm_app_version{}, job)",
          "refId": "VictoriaMetrics-job-Variable-Query"
        },
        "refresh": 1,
@@ -6542,7 +6542,7 @@
  },
  "timezone": "",
  "title": "VictoriaMetrics - single-node",
-  "uid": "wNf0q_kZk",
+  "uid": "victoriametrics-single-node",
  "version": 1,
  "weekStart": "",
  "gnetId": 10229
@@ -1,6 +1,8 @@
 # Monitoring & Alerting

-## Alert Rules
+## Alert Rules & Recording Rules

- [awesome-prometheus-alerts](https://github.com/samber/awesome-prometheus-alerts): Collection of
-  Prometheus alerting rules
+- [awesome-prometheus-alerts](https://github.com/samber/awesome-prometheus-alerts)
+  - Collection of Prometheus alerting rules.
+- [victoria-metrics-k8s-stack/files/rules](https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-k8s-stack/files/rules/generated)
+  - Alert Rules & Recording Rules used by kube-prometheus-stack.
@@ -1,12 +1,16 @@
-{ config, lib, ... }:
+{ config, ... }:
 {
-  services.vmalert = {
+  # https://docs.victoriametrics.com/victoriametrics/vmalert/
+  services.vmalert.instances."homelab" = {
    enable = true;
    settings = {
      "httpListenAddr" = "127.0.0.1:8880";

      "datasource.url" = "http://localhost:9090";
      "notifier.url" = [ "http://localhost:9093" ]; # alertmanager's api
+      # Recording rule results and alert state are persisted via remote write; remote read restores alert state on restart.
+      "remoteWrite.url" = "http://localhost:9090";
+      "remoteRead.url" = "http://localhost:9090";

      # Whether to disable long-lived connections to the datasource.
      "datasource.disableKeepAlive" = true;
@@ -16,6 +20,7 @@
      # Path to the files with alerting and/or recording rules.
      rule = [
        "${./alert_rules}/*.yml"
+        "${./recoding_rules}/*.yml"
      ];
      # https://docs.victoriametrics.com/victoriametrics/vmalert/#link-to-alert-source
      # Set this two args to generate the correct `.GeneratorURL`
@@ -0,0 +1,8 @@
+# Alert Rules
+
+Alert rules are configurations that define conditions, scope, and actions for generating alerts from
+monitored signals, such as metrics, logs, or activity. When an alert rule's defined conditions are
+met for a specific resource within its scope, the system generates a triggered alert, which is the
+actual instance of the condition being met. These rules specify the data to monitor, the trigger
+threshold, and the resulting actions, like sending notifications to specific receivers or performing
+automated tasks.
@@ -0,0 +1,57 @@
+groups:
+  - name: general.rules
+    rules:
+      - alert: TargetDown
+        annotations:
+          description:
+            '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in
+            {{ $labels.namespace }} namespace are down.'
+          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
+          summary: One or more targets are unreachable.
+        expr:
+          100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) BY (cluster, job,
+          namespace, service)) > 10
+        for: 10m
+        labels:
+          severity: warning
+      - alert: Watchdog
+        annotations:
+          description: 'This is an alert meant to ensure that the entire alerting pipeline is
+            functional.
+
+            This alert is always firing, therefore it should always be firing in Alertmanager
+
+            and always fire against a receiver. There are integrations with various notification
+
+            mechanisms that send a notification when this alert is not firing. For example the
+
+            "DeadMansSnitch" integration in PagerDuty.'
+          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
+          summary:
+            An alert that should always be firing to certify that Alertmanager is working properly.
+        expr: vector(1)
+        labels:
+          severity: none
+      - alert: InfoInhibitor
+        annotations:
+          description: 'This is an alert that is used to inhibit info alerts.
+
+            By themselves, the info-level alerts are sometimes very noisy, but they are relevant
+            when combined with
+
+            other alerts.
+
+            This alert fires whenever there''s a severity="info" alert, and stops firing when
+            another alert with a
+
+            severity of ''warning'' or ''critical'' starts firing on the same namespace.
+
+            This alert should be routed to a null receiver and configured to inhibit alerts with
+            severity="info".'
+          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor
+          summary: Info-level alert inhibition.
+        expr:
+          ALERTS{severity = "info"} == 1 unless on(namespace) ALERTS{alertname != "InfoInhibitor",
+          severity =~ "warning|critical", alertstate="firing"} == 1
+        labels:
+          severity: none
@@ -0,0 +1,120 @@
+groups:
+  - name: kubernetes-resources
+    rules:
+      - alert: KubeCPUOvercommit
+        annotations:
+          description:
+            Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Pods by {{
+            $value }} CPU shares and cannot tolerate node failure.
+          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
+          summary: Cluster has overcommitted CPU resource requests.
+        expr: |-
+          sum(namespace_cpu:kube_pod_container_resource_requests:sum{job="kube-state-metrics",}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
+          and
+          (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
+        for: 10m
+        labels:
+          severity: warning
+      - alert: KubeMemoryOvercommit
+        annotations:
+          description:
+            Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Pods by {{
+            $value | humanize }} bytes and cannot tolerate node failure.
+          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
+          summary: Cluster has overcommitted memory resource requests.
+        expr: |-
+          sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
+          and
+          (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
+        for: 10m
+        labels:
+          severity: warning
+      - alert: KubeCPUQuotaOvercommit
+        annotations:
+          description:
+            Cluster {{ $labels.cluster }}  has overcommitted CPU resource requests for Namespaces.
+          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit
+          summary: Cluster has overcommitted CPU resource requests.
+        expr: |-
+          sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by (cluster)
+            /
+          sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by (cluster)
+            > 1.5
+        for: 5m
+        labels:
+          severity: warning
+      - alert: KubeMemoryQuotaOvercommit
+        annotations:
+          description:
+            Cluster {{ $labels.cluster }}  has overcommitted memory resource requests for
+            Namespaces.
+          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit
+          summary: Cluster has overcommitted memory resource requests.
+        expr: |-
+          sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by (cluster)
+            /
+          sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)
+            > 1.5
+        for: 5m
+        labels:
+          severity: warning
+      - alert: KubeQuotaAlmostFull
+        annotations:
+          description:
+            Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{
+            $labels.resource }} quota.
+          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull
+          summary: Namespace quota is going to be full.
+        expr: |-
+          kube_resourcequota{job="kube-state-metrics", type="used"}
+            / ignoring(instance, job, type)
+          (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
+            > 0.9 < 1
+        for: 15m
+        labels:
+          severity: info
+      - alert: KubeQuotaFullyUsed
+        annotations:
+          description:
+            Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{
+            $labels.resource }} quota.
+          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused
+          summary: Namespace quota is fully used.
+        expr: |-
+          kube_resourcequota{job="kube-state-metrics", type="used"}
+            / ignoring(instance, job, type)
+          (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
+            == 1
+        for: 15m
+        labels:
+          severity: info
+      - alert: KubeQuotaExceeded
+        annotations:
+          description:
+            Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{
+            $labels.resource }} quota.
+          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded
+          summary: Namespace quota has exceeded the limits.
+        expr: |-
+          kube_resourcequota{job="kube-state-metrics", type="used"}
+            / ignoring(instance, job, type)
+          (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
+            > 1
+        for: 15m
+        labels:
+          severity: warning
+      - alert: CPUThrottlingHigh
+        annotations:
+          description:
+            "{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace
+            }} for container {{ $labels.container }} in pod {{ $labels.pod }}."
+          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh
+          summary: Processes experience elevated CPU throttling.
+        expr: |-
+          sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (cluster, container, pod, namespace)
+            /
+          sum(increase(container_cpu_cfs_periods_total{}[5m])) by (cluster, container, pod, namespace)
+            > ( 25 / 100 )
+        for: 15m
+        labels:
+          severity: info
@@ -0,0 +1,7 @@
+# Recording Rules
+
+Recording rules are pre-defined queries, often complex or computationally expensive, that are
+evaluated periodically to create new, pre-computed time series metrics.
+
+These rules store the results in a metric backend, significantly speeding up queries for dashboards
+and other alerts, and reducing system load by avoiding the re-computation of data.
@@ -0,0 +1,149 @@
+groups:
+  - name: k8s.rules
+    rules:
+      - expr: |-
+          sum by (cluster, namespace, pod, container) (
+            irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
+          ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
+            1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
+          )
+        record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
+      - expr: |-
+          container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
+          * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
+            max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
+          )
+        record: node_namespace_pod_container:container_memory_working_set_bytes
+      - expr: |-
+          container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
+          * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
+            max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
+          )
+        record: node_namespace_pod_container:container_memory_rss
+      - expr: |-
+          container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
+          * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
+            max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
+          )
+        record: node_namespace_pod_container:container_memory_cache
+      - expr: |-
+          container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
+          * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
+            max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
+          )
+        record: node_namespace_pod_container:container_memory_swap
+      - expr: |-
+          kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"}  * on (namespace, pod, cluster)
+          group_left() max by (namespace, pod, cluster) (
+            (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
+          )
+        record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests
+      - expr: |-
+          sum by (namespace, cluster) (
+              sum by (namespace, pod, cluster) (
+                  max by (namespace, pod, container, cluster) (
+                    kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"}
+                  ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
+                    kube_pod_status_phase{phase=~"Pending|Running"} == 1
+                  )
+              )
+          )
+        record: namespace_memory:kube_pod_container_resource_requests:sum
+      - expr: |-
+          kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"}  * on (namespace, pod, cluster)
+          group_left() max by (namespace, pod, cluster) (
+            (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
+          )
+        record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests
+      - expr: |-
+          sum by (namespace, cluster) (
+              sum by (namespace, pod, cluster) (
+                  max by (namespace, pod, container, cluster) (
+                    kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"}
+                  ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
+                    kube_pod_status_phase{phase=~"Pending|Running"} == 1
+                  )
+              )
+          )
+        record: namespace_cpu:kube_pod_container_resource_requests:sum
+      - expr: |-
+          kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}  * on (namespace, pod, cluster)
+          group_left() max by (namespace, pod, cluster) (
+            (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
+          )
+        record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits
+      - expr: |-
+          sum by (namespace, cluster) (
+              sum by (namespace, pod, cluster) (
+                  max by (namespace, pod, container, cluster) (
+                    kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}
+                  ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
+                    kube_pod_status_phase{phase=~"Pending|Running"} == 1
+                  )
+              )
+          )
+        record: namespace_memory:kube_pod_container_resource_limits:sum
+      - expr: |-
+          kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}  * on (namespace, pod, cluster)
+          group_left() max by (namespace, pod, cluster) (
+            (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
+            )
+        record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits
+      - expr: |-
+          sum by (namespace, cluster) (
+              sum by (namespace, pod, cluster) (
+                  max by (namespace, pod, container, cluster) (
+                    kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}
+                  ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
+                    kube_pod_status_phase{phase=~"Pending|Running"} == 1
+                  )
+              )
+          )
+        record: namespace_cpu:kube_pod_container_resource_limits:sum
+      - expr: |-
+          max by (cluster, namespace, workload, pod) (
+            label_replace(
+              label_replace(
+                kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
+                "replicaset", "$1", "owner_name", "(.*)"
+              ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (
+                1, max by (replicaset, namespace, owner_name) (
+                  kube_replicaset_owner{job="kube-state-metrics"}
+                )
+              ),
+              "workload", "$1", "owner_name", "(.*)"
+            )
+          )
+        labels:
+          workload_type: deployment
+        record: namespace_workload_pod:kube_pod_owner:relabel
+      - expr: |-
+          max by (cluster, namespace, workload, pod) (
+            label_replace(
+              kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
+              "workload", "$1", "owner_name", "(.*)"
+            )
+          )
+        labels:
+          workload_type: daemonset
+        record: namespace_workload_pod:kube_pod_owner:relabel
+      - expr: |-
+          max by (cluster, namespace, workload, pod) (
+            label_replace(
+              kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
+              "workload", "$1", "owner_name", "(.*)"
+            )
+          )
+        labels:
+          workload_type: statefulset
+        record: namespace_workload_pod:kube_pod_owner:relabel
+      - expr: |-
+          max by (cluster, namespace, workload, pod) (
+            label_replace(
+              kube_pod_owner{job="kube-state-metrics", owner_kind="Job"},
+              "workload", "$1", "owner_name", "(.*)"
+            )
+          )
+        labels:
+          workload_type: job
+        record: namespace_workload_pod:kube_pod_owner:relabel
@@ -0,0 +1,128 @@
+groups:
+  - name: kube-prometheus-node-recording.rules # per-instance and cluster-wide CPU/network aggregates; selectors here carry no job filter
+    rules:
+      - expr:
+          sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY
+          (instance)
+        record: instance:node_cpu:rate:sum # 3m rate of CPU seconds outside idle/iowait/steal, summed per instance
+      - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
+        record: instance:node_network_receive_bytes:rate:sum # 3m rate of received bytes, summed per instance
+      - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
+        record: instance:node_network_transmit_bytes:rate:sum # 3m rate of transmitted bytes, summed per instance
+      - expr:
+          sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT
+          (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance,
+          cpu)) BY (instance)
+        record: instance:node_cpu:ratio # 5m busy-CPU rate divided by the number of (instance, cpu) pairs of that instance
+      - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
+        record: cluster:node_cpu:sum_rate5m # 5m busy-CPU rate summed over all matching series (no grouping labels)
+      - expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
+        record: cluster:node_cpu:ratio # cluster:node_cpu:sum_rate5m divided by the total number of (instance, cpu) pairs
+
+  - name: node-exporter.rules # per-instance aggregates; every selector in this group is pinned to job="node-exporter"
+    rules:
+      - expr: |-
+          count without (cpu, mode) (
+            node_cpu_seconds_total{job="node-exporter",mode="idle"}
+          )
+        record: instance:node_num_cpu:sum # number of mode="idle" CPU series left after dropping the cpu and mode labels
+      - expr: |-
+          1 - avg without (cpu) (
+            sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m]))
+          )
+        record: instance:node_cpu_utilisation:rate5m # 1 minus the per-CPU average of the summed idle/iowait/steal 5m rate
+      - expr: |-
+          (
+            node_load1{job="node-exporter"}
+          /
+            instance:node_num_cpu:sum{job="node-exporter"}
+          )
+        record: instance:node_load1_per_cpu:ratio # node_load1 divided by instance:node_num_cpu:sum
+      - expr: |-
+          1 - (
+            (
+              node_memory_MemAvailable_bytes{job="node-exporter"}
+              or
+              (
+                node_memory_Buffers_bytes{job="node-exporter"}
+                +
+                node_memory_Cached_bytes{job="node-exporter"}
+                +
+                node_memory_MemFree_bytes{job="node-exporter"}
+                +
+                node_memory_Slab_bytes{job="node-exporter"}
+              )
+            )
+          /
+            node_memory_MemTotal_bytes{job="node-exporter"}
+          )
+        record: instance:node_memory_utilisation:ratio # 1 - available/MemTotal; Buffers+Cached+MemFree+Slab is used where MemAvailable has no series
+      - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
+        record: instance:node_vmstat_pgmajfault:rate5m # 5m per-second rate of node_vmstat_pgmajfault
+      - expr:
+          rate(node_disk_io_time_seconds_total{job="node-exporter",
+          device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
+        record: instance_device:node_disk_io_time_seconds:rate5m # 5m rate of I/O time, only for devices matching the device regex in the expr
+      - expr:
+          rate(node_disk_io_time_weighted_seconds_total{job="node-exporter",
+          device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
+        record: instance_device:node_disk_io_time_weighted_seconds:rate5m # 5m rate of weighted I/O time, same device regex as the unweighted rule
+      - expr: |-
+          sum without (device) (
+            rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m])
+          )
+        record: instance:node_network_receive_bytes_excluding_lo:rate5m # 5m rate of received bytes summed across devices, device "lo" excluded
+      - expr: |-
+          sum without (device) (
+            rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m])
+          )
+        record: instance:node_network_transmit_bytes_excluding_lo:rate5m # 5m rate of transmitted bytes summed across devices, device "lo" excluded
+      - expr: |-
+          sum without (device) (
+            rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m])
+          )
+        record: instance:node_network_receive_drop_excluding_lo:rate5m # 5m rate of receive drops summed across devices, device "lo" excluded
+      - expr: |-
+          sum without (device) (
+            rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m])
+          )
+        record: instance:node_network_transmit_drop_excluding_lo:rate5m # 5m rate of transmit drops summed across devices, device "lo" excluded
+
+  - name: node.rules # aggregates grouped by the cluster and node labels
+    rules:
+      - expr: |-
+          topk by(cluster, namespace, pod) (1,
+            max by (cluster, node, namespace, pod) (
+              label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
+          ))
+        record: "node_namespace_pod:kube_pod_info:" # one series per (cluster, namespace, pod) carrying its node label; the label_replace copies pod onto itself
+      - expr: |-
+          count by (cluster, node) (
+            node_cpu_seconds_total{mode="idle",job="node-exporter"}
+            * on (namespace, pod) group_left(node)
+            topk by(namespace, pod) (1, node_namespace_pod:kube_pod_info:)
+          )
+        record: node:node_num_cpu:sum # idle-mode CPU series counted per (cluster, node); NOTE(review): the join matches on (namespace, pod) without cluster, unlike the topk in the rule above - confirm
+      - expr: |-
+          sum(
+            node_memory_MemAvailable_bytes{job="node-exporter"} or
+            (
+              node_memory_Buffers_bytes{job="node-exporter"} +
+              node_memory_Cached_bytes{job="node-exporter"} +
+              node_memory_MemFree_bytes{job="node-exporter"} +
+              node_memory_Slab_bytes{job="node-exporter"}
+            )
+          ) by (cluster)
+        record: :node_memory_MemAvailable_bytes:sum # available memory summed per cluster; Buffers+Cached+MemFree+Slab is used where MemAvailable has no series
+      - expr: |-
+          avg by (cluster, node) (
+            sum without (mode) (
+              rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal",job="node-exporter"}[5m])
+            )
+          )
+        record: node:node_cpu_utilization:ratio_rate5m # 5m rate of CPU seconds outside idle/iowait/steal, averaged per (cluster, node)
+      - expr: |-
+          avg by (cluster) (
+            node:node_cpu_utilization:ratio_rate5m
+          )
+        record: cluster:node_cpu:ratio_rate5m # average of node:node_cpu_utilization:ratio_rate5m per cluster
@@ -50,6 +50,8 @@
              labels.type = "app";
              labels.app = "dnsmasq";
              labels.host = "suzi";
+              labels.env = "homelab";
+              labels.cluster = "homelab";
            }
          ];
        }
@@ -64,6 +66,8 @@
              labels.type = "app";
              labels.app = "v2ray";
              labels.host = "aquamarine";
+              labels.env = "homelab";
+              labels.cluster = "homelab";
            }
          ];
        }
@@ -77,6 +81,8 @@
              labels.type = "app";
              labels.app = "postgresql";
              labels.host = "aquamarine";
+              labels.env = "homelab";
+              labels.cluster = "homelab";
            }
          ];
        }
@@ -90,6 +96,39 @@
              labels.type = "app";
              labels.app = "sftpgo";
              labels.host = "aquamarine";
+              labels.env = "homelab";
+              labels.cluster = "homelab";
+            }
+          ];
+        }
+        {
+          job_name = "alertmanager-embedded-exporter"; # scrape job for the single static target below
+          scrape_interval = "30s";
+          metrics_path = "/metrics";
+          static_configs = [
+            {
+              targets = [ "localhost:9093" ]; # presumably Alertmanager's own listener on this host - confirm the port
+              labels.type = "app";
+              labels.app = "alertmanager";
+              labels.host = "aquamarine";
+              labels.env = "homelab";
+              labels.cluster = "homelab"; # same env/cluster values as the other scrape targets in this file
+            }
+          ];
+        }
+        {
+          job_name = "victoriametrics-embedded-exporter";
+          scrape_interval = "30s";
+          metrics_path = "/metrics";
+          static_configs = [
+            {
+              # Self-scrape: VictoriaMetrics collects its own /metrics endpoint. NOTE(review): port 9090 must match the listen address configured elsewhere - confirm.
+              targets = [ "localhost:9090" ];
+              labels.type = "app";
+              labels.app = "victoriametrics";
+              labels.host = "aquamarine";
+              labels.env = "homelab";
+              labels.cluster = "homelab";
            }
          ];
        }
@@ -109,6 +148,8 @@
                targets = [ "${addr.ipv4}:9100" ];
                labels.type = "node";
                labels.host = hostname;
+                labels.env = "homelab";
+                labels.cluster = "homelab";
              }
            ];
          }