From 7f112010c2600da3fb8aa41929251a5938f19a78 Mon Sep 17 00:00:00 2001 From: Ryan Yin Date: Thu, 11 Sep 2025 13:48:01 +0800 Subject: [PATCH] feat: update alert rules --- .../dashboards/databases/cloudnative-pg.json | 278 ++++++++++-------- .../alert_rules/argocd-exporter.yml | 25 ++ ...mbedded-exporter.yml => etcd-exporter.yml} | 2 +- .../alert_rules/fluxcd-exporter.yml | 53 ++++ ...bedded-exporter.yml => istio-exporter.yml} | 2 +- ...er.yml => kube-state-metrics-exporter.yml} | 2 +- .../monitoring/alert_rules/loki-exporter.yaml | 52 ++++ .../alert_rules/postgres-exporter.yml | 262 +++++++++++++++++ 8 files changed, 546 insertions(+), 130 deletions(-) create mode 100644 hosts/idols-aquamarine/monitoring/alert_rules/argocd-exporter.yml rename hosts/idols-aquamarine/monitoring/alert_rules/{etcd_embedded-exporter.yml => etcd-exporter.yml} (99%) create mode 100644 hosts/idols-aquamarine/monitoring/alert_rules/fluxcd-exporter.yml rename hosts/idols-aquamarine/monitoring/alert_rules/{istio_embedded-exporter.yml => istio-exporter.yml} (99%) rename hosts/idols-aquamarine/monitoring/alert_rules/{kubestate-exporter.yml => kube-state-metrics-exporter.yml} (99%) create mode 100644 hosts/idols-aquamarine/monitoring/alert_rules/loki-exporter.yaml create mode 100644 hosts/idols-aquamarine/monitoring/alert_rules/postgres-exporter.yml diff --git a/hosts/idols-aquamarine/grafana/dashboards/databases/cloudnative-pg.json b/hosts/idols-aquamarine/grafana/dashboards/databases/cloudnative-pg.json index 10ce4969..73724bcd 100644 --- a/hosts/idols-aquamarine/grafana/dashboards/databases/cloudnative-pg.json +++ b/hosts/idols-aquamarine/grafana/dashboards/databases/cloudnative-pg.json @@ -139,7 +139,7 @@ }, "id": 676, "options": { - "alertInstanceLabelFilter": "{namespace=~\"$namespace\",pod=~\"$instances\"}", + "alertInstanceLabelFilter": "{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "alertName": "", "dashboardAlerts": false, "folder": "", @@ -341,7 +341,7 @@ 
"uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "(max(cnpg_pg_replication_streaming_replicas{namespace=~\"$namespace\", pod=~\"$instances\"}) - sum(cnpg_pg_replication_is_wal_receiver_up{namespace=~\"$namespace\", pod=~\"$instances\"})) + (clamp_max(max(cnpg_pg_replication_streaming_replicas{namespace=~\"$namespace\", pod=~\"$instances\"}), 1) - 1)", + "expr": "(max(cnpg_pg_replication_streaming_replicas{namespace=~\"$namespace\", cluster=\"$cluster\", pod=~\"$instances\"}) - sum(cnpg_pg_replication_is_wal_receiver_up{namespace=~\"$namespace\", cluster=\"$cluster\", pod=~\"$instances\"})) + (clamp_max(max(cnpg_pg_replication_streaming_replicas{namespace=~\"$namespace\", cluster=\"$cluster\", pod=~\"$instances\"}), 1) - 1)", "legendFormat": "Replication", "range": true, "refId": "A" @@ -463,7 +463,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "max(cnpg_pg_replication_lag{namespace=~\"$namespace\",pod=~\"$instances\"}) + max(cnpg_pg_stat_replication_write_lag_seconds{namespace=~\"$namespace\",pod=~\"$instances\"}) + max(cnpg_pg_stat_replication_flush_lag_seconds{namespace=~\"$namespace\",pod=~\"$instances\"}) + max(cnpg_pg_stat_replication_replay_lag_seconds{namespace=~\"$namespace\",pod=~\"$instances\"})", + "expr": "max(cnpg_pg_replication_lag{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}) + max(cnpg_pg_stat_replication_write_lag_seconds{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}) + max(cnpg_pg_stat_replication_flush_lag_seconds{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}) + max(cnpg_pg_stat_replication_replay_lag_seconds{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"})", "hide": false, "instant": false, "legendFormat": "Lag", @@ -610,7 +610,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "max((max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace=\"$namespace\", 
persistentvolumeclaim=~\"$instances\"} / kubelet_volume_stats_capacity_bytes{namespace=\"$namespace\", persistentvolumeclaim=~\"$instances\"}))) OR (max by(persistentvolumeclaim) (kubelet_volume_stats_inodes_used{namespace=\"$namespace\", persistentvolumeclaim=~\"$instances\"} / kubelet_volume_stats_inodes{namespace=\"$namespace\", persistentvolumeclaim=~\"$instances\"})))", + "expr": "max((max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace=\"$namespace\", cluster=\"$cluster\", persistentvolumeclaim=~\"$instances\"} / kubelet_volume_stats_capacity_bytes{namespace=\"$namespace\", cluster=\"$cluster\", persistentvolumeclaim=~\"$instances\"}))) OR (max by(persistentvolumeclaim) (kubelet_volume_stats_inodes_used{namespace=\"$namespace\", cluster=\"$cluster\", persistentvolumeclaim=~\"$instances\"} / kubelet_volume_stats_inodes{namespace=\"$namespace\", cluster=\"$cluster\", persistentvolumeclaim=~\"$instances\"})))", "hide": false, "legendFormat": "Storage", "range": true, @@ -676,7 +676,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "max(cnpg_pg_postmaster_start_time{namespace=~\"$namespace\",pod=~\"$instances\"})*1000", + "expr": "max(cnpg_pg_postmaster_start_time{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"})*1000", "format": "time_series", "hide": false, "instant": true, @@ -745,7 +745,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(rate(cnpg_pg_stat_database_xact_commit{namespace=~\"$namespace\",pod=~\"$instances\"}[$__interval])) + sum(rate(cnpg_pg_stat_database_xact_rollback{namespace=~\"$namespace\",pod=~\"$instances\"}[$__interval]))", + "expr": "sum(rate(cnpg_pg_stat_database_xact_commit{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}[$__interval])) + sum(rate(cnpg_pg_stat_database_xact_rollback{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}[$__interval]))", "interval": "", "legendFormat": "TPS", "range": true, @@ -833,7 +833,7 @@ 
"uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=\"$namespace\", pod=~\"$instances\"}) / sum(kube_pod_container_resource_requests{job=\"kube-state-metrics\", namespace=\"$namespace\", resource=\"cpu\", pod=~\"$instances\"})", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=\"$namespace\", cluster=\"$cluster\", pod=~\"$instances\"}) / sum(kube_pod_container_resource_requests{job=\"kube-state-metrics\", namespace=\"$namespace\", cluster=\"$cluster\", resource=\"cpu\", pod=~\"$instances\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -921,7 +921,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum(container_memory_working_set_bytes{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", namespace=\"$namespace\",container!=\"\", image!=\"\", pod=~\"$instances\"}) / sum(max by(pod) (kube_pod_container_resource_requests{job=\"kube-state-metrics\", namespace=\"$namespace\", resource=\"memory\", pod=~\"$instances\"}))", + "expr": "sum(container_memory_working_set_bytes{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", namespace=\"$namespace\", cluster=\"$cluster\",container!=\"\", image!=\"\", pod=~\"$instances\"}) / sum(max by(pod) (kube_pod_container_resource_requests{job=\"kube-state-metrics\", namespace=\"$namespace\", cluster=\"$cluster\", resource=\"memory\", pod=~\"$instances\"}))", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -1000,7 +1000,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "max(cnpg_pg_replication_lag{namespace=~\"$namespace\",pod=~\"$instances\"})", + "expr": "max(cnpg_pg_replication_lag{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"})", "instant": true, "legendFormat": "__auto", "range": false, @@ -1076,7 +1076,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": 
"max(cnpg_pg_stat_replication_write_lag_seconds{namespace=~\"$namespace\",pod=~\"$instances\"})", + "expr": "max(cnpg_pg_stat_replication_write_lag_seconds{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"})", "legendFormat": "__auto", "range": true, "refId": "A" @@ -1149,7 +1149,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace=\"$namespace\", persistentvolumeclaim=~\"$instances\"} / kubelet_volume_stats_capacity_bytes{namespace=\"$namespace\", persistentvolumeclaim=~\"$instances\"}))", + "expr": "max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace=\"$namespace\", cluster=\"$cluster\", persistentvolumeclaim=~\"$instances\"} / kubelet_volume_stats_capacity_bytes{namespace=\"$namespace\", cluster=\"$cluster\", persistentvolumeclaim=~\"$instances\"}))", "format": "time_series", "instant": true, "interval": "", @@ -1164,7 +1164,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace=\"$namespace\", persistentvolumeclaim=~\"(${instances})-wal\"} / kubelet_volume_stats_capacity_bytes{namespace=\"$namespace\", persistentvolumeclaim=~\"(${instances})-wal\"}))", + "expr": "max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace=\"$namespace\", cluster=\"$cluster\", persistentvolumeclaim=~\"(${instances})-wal\"} / kubelet_volume_stats_capacity_bytes{namespace=\"$namespace\", cluster=\"$cluster\", persistentvolumeclaim=~\"(${instances})-wal\"}))", "format": "time_series", "instant": true, "interval": "", @@ -1179,7 +1179,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "max(\n sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes{namespace=\"$namespace\", persistentvolumeclaim=~\"(${instances})-tbs.*\"}) \n /\n sum by (namespace,persistentvolumeclaim) 
(kubelet_volume_stats_capacity_bytes{namespace=\"$namespace\", persistentvolumeclaim=~\"(${instances})-tbs.*\"}) \n *\n on(namespace, persistentvolumeclaim) group_left(volume)\n kube_pod_spec_volumes_persistentvolumeclaims_info{pod=~\"$instances\"}\n)", + "expr": "max(\n sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes{namespace=\"$namespace\", cluster=\"$cluster\", persistentvolumeclaim=~\"(${instances})-tbs.*\"}) \n /\n sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes{namespace=\"$namespace\", cluster=\"$cluster\", persistentvolumeclaim=~\"(${instances})-tbs.*\"}) \n *\n on(namespace, persistentvolumeclaim) group_left(volume)\n kube_pod_spec_volumes_persistentvolumeclaims_info{namespace=\"$namespace\", cluster=\"$cluster\", pod=~\"$instances\"}\n)", "hide": false, "instant": true, "legendFormat": "Tablespaces (max)", @@ -1287,7 +1287,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "-(time() - max(cnpg_collector_last_available_backup_timestamp{namespace=\"$namespace\",pod=~\"$instances\"}))", + "expr": "-(time() - max(cnpg_collector_last_available_backup_timestamp{namespace=\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}))", "instant": true, "legendFormat": "__auto", "range": false, @@ -1411,7 +1411,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "(sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{ namespace=\"$namespace\", pod=~\"$instances\"}) / sum(kube_pod_container_resource_requests{job=\"kube-state-metrics\", namespace=\"$namespace\", resource=\"cpu\", pod=~\"$instances\"}))", + "expr": "(sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{ namespace=\"$namespace\", cluster=\"$cluster\", pod=~\"$instances\"}) / sum(kube_pod_container_resource_requests{job=\"kube-state-metrics\", namespace=\"$namespace\", cluster=\"$cluster\", resource=\"cpu\", pod=~\"$instances\"}))", "hide": false, "legendFormat": "CPU", "range": 
true, @@ -1423,7 +1423,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "(sum(container_memory_working_set_bytes{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", namespace=\"$namespace\",container!=\"\", image!=\"\", pod=~\"$instances\"}) / sum(max by(pod) (kube_pod_container_resource_requests{job=\"kube-state-metrics\", namespace=\"$namespace\", resource=\"memory\", pod=~\"$instances\"})))", + "expr": "(sum(container_memory_working_set_bytes{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", namespace=\"$namespace\", cluster=\"$cluster\",container!=\"\", image!=\"\", pod=~\"$instances\"}) / sum(max by(pod) (kube_pod_container_resource_requests{job=\"kube-state-metrics\", namespace=\"$namespace\", cluster=\"$cluster\", resource=\"memory\", pod=~\"$instances\"})))", "hide": false, "instant": false, "legendFormat": "Memory", @@ -1436,7 +1436,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": " (max(sum by (pod) (cnpg_backends_total{namespace=~\"$namespace\", pod=~\"$instances\"}) / sum by (pod) (cnpg_pg_settings_setting{name=\"max_connections\", namespace=~\"$namespace\", pod=~\"$instances\"})))", + "expr": " (max(sum by (pod) (cnpg_backends_total{namespace=~\"$namespace\", cluster=\"$cluster\", pod=~\"$instances\"}) / sum by (pod) (cnpg_pg_settings_setting{name=\"max_connections\", namespace=~\"$namespace\", cluster=\"$cluster\", pod=~\"$instances\"})))", "hide": false, "instant": false, "legendFormat": "Connections", @@ -1526,7 +1526,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "max((1 - cnpg_pg_replication_in_recovery{namespace=~\"$namespace\",pod=~\"$instances\"}) * (time() - timestamp(cnpg_pg_stat_archiver_seconds_since_last_archival{namespace=~\"$namespace\",pod=~\"$instances\"}) +\ncnpg_pg_stat_archiver_seconds_since_last_archival{namespace=~\"$namespace\",pod=~\"$instances\"}))", + "expr": "max((1 - cnpg_pg_replication_in_recovery{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}) * 
(time() - timestamp(cnpg_pg_stat_archiver_seconds_since_last_archival{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}) +\ncnpg_pg_stat_archiver_seconds_since_last_archival{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}))", "format": "time_series", "instant": true, "interval": "", @@ -1596,7 +1596,7 @@ }, "editorMode": "builder", "exemplar": false, - "expr": "cnpg_collector_postgres_version{namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "cnpg_collector_postgres_version{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "format": "table", "hide": false, "instant": true, @@ -1677,7 +1677,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "max(cnpg_pg_stat_replication_flush_lag_seconds{namespace=~\"$namespace\",pod=~\"$instances\"})", + "expr": "max(cnpg_pg_stat_replication_flush_lag_seconds{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"})", "legendFormat": "__auto", "range": true, "refId": "A" @@ -1752,7 +1752,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "max(cnpg_pg_stat_replication_replay_lag_seconds{namespace=~\"$namespace\",pod=~\"$instances\"})", + "expr": "max(cnpg_pg_stat_replication_replay_lag_seconds{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"})", "legendFormat": "__auto", "range": true, "refId": "A" @@ -1924,7 +1924,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "time() - max(cnpg_collector_last_available_backup_timestamp{namespace=\"$namespace\", pod=~\"$instances\"})", + "expr": "time() - max(cnpg_collector_last_available_backup_timestamp{namespace=\"$namespace\", cluster=\"$cluster\", pod=~\"$instances\"})", "legendFormat": "Backups", "range": true, "refId": "BACKUPS" @@ -2095,7 +2095,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "max((1 - cnpg_pg_replication_in_recovery{namespace=~\"$namespace\", pod=~\"$instances\"}) * (time() - 
timestamp(cnpg_pg_stat_archiver_seconds_since_last_archival{namespace=~\"$namespace\", pod=~\"$instances\"}) +\ncnpg_pg_stat_archiver_seconds_since_last_archival{namespace=~\"$namespace\", pod=~\"$instances\"}))", + "expr": "max((1 - cnpg_pg_replication_in_recovery{namespace=~\"$namespace\", cluster=\"$cluster\", pod=~\"$instances\"}) * (time() - timestamp(cnpg_pg_stat_archiver_seconds_since_last_archival{namespace=~\"$namespace\", cluster=\"$cluster\", pod=~\"$instances\"}) +\ncnpg_pg_stat_archiver_seconds_since_last_archival{namespace=~\"$namespace\", cluster=\"$cluster\", pod=~\"$instances\"}))", "hide": false, "instant": false, "legendFormat": "WAL", @@ -2195,7 +2195,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum(kube_pod_status_ready{namespace=\"$operatorNamespace\", pod=~\"cloudnative-pg.+|cnpg-controller-manager.+\", condition=\"true\"})", + "expr": "sum(kube_pod_status_ready{namespace=\"$operatorNamespace\", cluster=\"$cluster\", pod=~\"cloudnative-pg.+|cnpg-controller-manager.+\", condition=\"true\"})", "hide": false, "instant": true, "legendFormat": "Operator Status", @@ -2331,7 +2331,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "clamp_max(max(controller_runtime_reconcile_total{namespace=~\"$operatorNamespace\", result=\"error\", controller=\"backup\"}), 1)", + "expr": "clamp_max(max(controller_runtime_reconcile_total{namespace=~\"$operatorNamespace\", cluster=\"$cluster\", result=\"error\", controller=\"backup\"}), 1)", "hide": true, "legendFormat": "__auto", "range": true, @@ -2343,7 +2343,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "clamp_max(max(controller_runtime_reconcile_total{namespace=~\"$operatorNamespace\", result=\"error\", controller=\"cluster\"}), 1)", + "expr": "clamp_max(max(controller_runtime_reconcile_total{namespace=~\"$operatorNamespace\", cluster=\"$cluster\", result=\"error\", controller=\"cluster\"}), 1)", "hide": true, "legendFormat": "__auto", "range": true, @@ 
-2355,7 +2355,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "clamp_max(max(controller_runtime_reconcile_total{namespace=~\"$operatorNamespace\", result=\"error\", controller=\"pooler\"}), 1)", + "expr": "clamp_max(max(controller_runtime_reconcile_total{namespace=~\"$operatorNamespace\", cluster=\"$cluster\", result=\"error\", controller=\"pooler\"}), 1)", "hide": true, "legendFormat": "__auto", "range": true, @@ -2367,7 +2367,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "clamp_max(max(controller_runtime_reconcile_total{namespace=~\"$operatorNamespace\", result=\"error\", controller=~\"scheduledbackup|scheduled-backup\"}), 1)", + "expr": "clamp_max(max(controller_runtime_reconcile_total{namespace=~\"$operatorNamespace\", cluster=\"$cluster\", result=\"error\", controller=~\"scheduledbackup|scheduled-backup\"}), 1)", "hide": true, "legendFormat": "__auto", "range": true, @@ -2452,7 +2452,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=\"$namespace\", pod=~\"$instances\"})", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=\"$namespace\", cluster=\"$cluster\", pod=~\"$instances\"})", "hide": false, "interval": "", "legendFormat": "Total", @@ -2527,7 +2527,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(container_memory_working_set_bytes{pod=~\"$instances\", namespace=\"$namespace\", container!=\"\", image!=\"\"})", + "expr": "sum(container_memory_working_set_bytes{pod=~\"$instances\", namespace=\"$namespace\", cluster=\"$cluster\", container!=\"\", image!=\"\"})", "hide": false, "interval": "", "legendFormat": "Total", @@ -2603,7 +2603,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "cnpg_pg_database_size_bytes{namespace=\"$namespace\", pod=~\"$instances\"}", + "expr": "cnpg_pg_database_size_bytes{namespace=\"$namespace\", cluster=\"$cluster\", 
pod=~\"$instances\"}", "format": "table", "instant": false, "legendFormat": "__auto", @@ -2708,7 +2708,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "max(cnpg_collector_first_recoverability_point{namespace=~\"$namespace\",pod=~\"$instances\"})*1000", + "expr": "max(cnpg_collector_first_recoverability_point{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"})*1000", "format": "time_series", "instant": true, "interval": "", @@ -2774,7 +2774,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -2815,7 +2815,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -2855,7 +2855,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -2895,7 +2895,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -2935,7 +2935,7 
@@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -2975,7 +2975,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -3016,7 +3016,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -3056,7 +3056,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -3096,7 +3096,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -3137,7 +3137,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": 
"kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -3221,7 +3221,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "min(kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\",pod=~\"$instances\"})", + "expr": "min(kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"})", "instant": true, "interval": "", "legendFormat": "", @@ -3308,7 +3308,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "1 - cnpg_pg_replication_in_recovery{namespace=~\"$namespace\",pod=~\"$instances\"} + cnpg_pg_replication_is_wal_receiver_up{namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "1 - cnpg_pg_replication_in_recovery{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"} + cnpg_pg_replication_is_wal_receiver_up{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -3381,7 +3381,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "cnpg_pg_replication_streaming_replicas{namespace=~\"$namespace\", pod=~\"$instances\"}", + "expr": "cnpg_pg_replication_streaming_replicas{namespace=~\"$namespace\", cluster=\"$cluster\", pod=~\"$instances\"}", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -3451,7 +3451,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "kube_pod_info{namespace=~\"$namespace\", pod=~\"$instances\"} * on(node,instance) group_left(label_topology_kubernetes_io_zone) kube_node_labels", + "expr": "kube_pod_info{namespace=~\"$namespace\", cluster=\"$cluster\", pod=~\"$instances\"} * on(node,instance) group_left(label_topology_kubernetes_io_zone) kube_node_labels{cluster=\"$cluster\"}", 
"format": "table", "instant": true, "interval": "", @@ -3549,7 +3549,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum by (pod) (cnpg_backends_total{namespace=~\"$namespace\", pod=~\"$instances\"})", + "expr": "sum by (pod) (cnpg_backends_total{namespace=~\"$namespace\", cluster=\"$cluster\", pod=~\"$instances\"})", "instant": false, "interval": "", "legendFormat": "-", @@ -3628,7 +3628,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "100 * sum by (pod) (cnpg_backends_total{namespace=~\"$namespace\", pod=~\"$instances\"}) / sum by (pod) (cnpg_pg_settings_setting{name=\"max_connections\", namespace=~\"$namespace\", pod=~\"$instances\"})", + "expr": "100 * sum by (pod) (cnpg_backends_total{namespace=~\"$namespace\", cluster=\"$cluster\", pod=~\"$instances\"}) / sum by (pod) (cnpg_pg_settings_setting{name=\"max_connections\", namespace=~\"$namespace\", cluster=\"$cluster\", pod=~\"$instances\"})", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -3707,7 +3707,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "max by (pod) (cnpg_pg_database_xid_age{namespace=~\"$namespace\", pod=~\"$instances\"})", + "expr": "max by (pod) (cnpg_pg_database_xid_age{namespace=~\"$namespace\", cluster=\"$cluster\", pod=~\"$instances\"})", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -3775,7 +3775,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "cnpg_pg_postmaster_start_time{namespace=~\"$namespace\", pod=~\"$instances\"}*1000", + "expr": "cnpg_pg_postmaster_start_time{namespace=~\"$namespace\", cluster=\"$cluster\", pod=~\"$instances\"}*1000", "format": "time_series", "hide": false, "instant": true, @@ -3847,7 +3847,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "cnpg_collector_postgres_version{namespace=~\"$namespace\", pod=~\"$instances\"}", + "expr": "cnpg_collector_postgres_version{namespace=~\"$namespace\", cluster=\"$cluster\", pod=~\"$instances\"}", "format": "table", "hide": false, "instant": 
true, @@ -3903,7 +3903,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -3944,7 +3944,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -3984,7 +3984,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -4024,7 +4024,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -4064,7 +4064,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -4104,7 +4104,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": 
"kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -4144,7 +4144,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -4184,7 +4184,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -4225,7 +4225,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "kube_pod_container_status_ready{container=\"postgres\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -4289,7 +4289,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "cnpg_pg_settings_setting{name=\"max_connections\",namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "cnpg_pg_settings_setting{name=\"max_connections\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -4355,7 +4355,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "max by (pod) 
(cnpg_pg_settings_setting{name=\"shared_buffers\",namespace=~\"$namespace\",pod=~\"$instances\"}) * max by (pod) (cnpg_pg_settings_setting{name=\"block_size\",namespace=~\"$namespace\",pod=~\"$instances\"})", + "expr": "max by (pod) (cnpg_pg_settings_setting{name=\"shared_buffers\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}) * max by (pod) (cnpg_pg_settings_setting{name=\"block_size\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"})", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -4421,7 +4421,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "max by (pod) (cnpg_pg_settings_setting{name=\"effective_cache_size\",namespace=~\"$namespace\",pod=~\"$instances\"}) * max by (pod) (cnpg_pg_settings_setting{name=\"block_size\",namespace=~\"$namespace\",pod=~\"$instances\"})", + "expr": "max by (pod) (cnpg_pg_settings_setting{name=\"effective_cache_size\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}) * max by (pod) (cnpg_pg_settings_setting{name=\"block_size\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"})", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -4487,7 +4487,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "cnpg_pg_settings_setting{name=\"work_mem\",namespace=~\"$namespace\",pod=~\"$instances\"} * 1024", + "expr": "cnpg_pg_settings_setting{name=\"work_mem\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"} * 1024", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -4552,7 +4552,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "cnpg_pg_settings_setting{name=\"maintenance_work_mem\",namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "cnpg_pg_settings_setting{name=\"maintenance_work_mem\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -4617,7 +4617,7 @@ "uid": "${DS_PROMETHEUS}" }, 
"exemplar": true, - "expr": "cnpg_pg_settings_setting{name=\"random_page_cost\",namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "cnpg_pg_settings_setting{name=\"random_page_cost\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -4682,7 +4682,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "cnpg_pg_settings_setting{name=\"seq_page_cost\",namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "cnpg_pg_settings_setting{name=\"seq_page_cost\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "instant": true, "interval": "", "legendFormat": "{{pod}}", @@ -4749,7 +4749,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "cnpg_pg_settings_setting{namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "cnpg_pg_settings_setting{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "format": "table", "instant": true, "interval": "", @@ -4939,7 +4939,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~\"$instances\", namespace=~\"$namespace\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~\"$instances\", namespace=~\"$namespace\", cluster=\"$cluster\"}) by (pod)", "format": "time_series", "interval": "", "intervalFactor": 2, @@ -5116,7 +5116,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "sum(container_memory_working_set_bytes{pod=~\"$instances\", namespace=\"$namespace\", container!=\"\", image!=\"\"}) by (pod)", + "expr": "sum(container_memory_working_set_bytes{pod=~\"$instances\", namespace=\"$namespace\", cluster=\"$cluster\", container!=\"\", image!=\"\"}) by (pod)", "format": "time_series", "interval": "", "intervalFactor": 2, @@ -5211,7 +5211,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": 
"sum(cnpg_backends_total{namespace=~\"$namespace\",pod=~\"$instances\"}) by (pod)", + "expr": "sum(cnpg_backends_total{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}) by (pod)", "hide": false, "interval": "", "legendFormat": "total ({{pod}})", @@ -5223,7 +5223,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "sum(cnpg_backends_total{namespace=~\"$namespace\",pod=~\"$instances\"}) by (state, pod)", + "expr": "sum(cnpg_backends_total{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}) by (state, pod)", "interval": "", "legendFormat": "{{state}} ({{pod}})", "refId": "A" @@ -5318,7 +5318,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(rate(cnpg_pg_stat_database_xact_commit{namespace=~\"$namespace\",pod=~\"$instances\"}[5m])) by (pod)", + "expr": "sum(rate(cnpg_pg_stat_database_xact_commit{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}[5m])) by (pod)", "interval": "", "legendFormat": "committed ({{pod}})", "range": true, @@ -5330,7 +5330,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "sum(rate(cnpg_pg_stat_database_xact_rollback{namespace=~\"$namespace\",pod=~\"$instances\"}[5m])) by (pod)", + "expr": "sum(rate(cnpg_pg_stat_database_xact_rollback{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}[5m])) by (pod)", "hide": false, "interval": "", "legendFormat": "rolled back ({{pod}})", @@ -5427,7 +5427,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "max by (pod) (cnpg_backends_max_tx_duration_seconds{namespace=~\"$namespace\",pod=~\"$instances\"})", + "expr": "max by (pod) (cnpg_backends_max_tx_duration_seconds{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"})", "interval": "", "legendFormat": "{{pod}}", "refId": "A" @@ -5522,7 +5522,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "rate(cnpg_pg_stat_database_deadlocks{datname=\"\",namespace=~\"$namespace\",pod=~\"$instances\"}[5m])", + "expr": 
"rate(cnpg_pg_stat_database_deadlocks{datname=\"\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}[5m])", "hide": false, "instant": false, "interval": "", @@ -5619,7 +5619,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "cnpg_backends_waiting_total{namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "cnpg_backends_waiting_total{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "interval": "", "legendFormat": "{{pod}}", "refId": "A" @@ -5704,7 +5704,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace=\"$namespace\", persistentvolumeclaim=~\"$instances\"} / kubelet_volume_stats_capacity_bytes{namespace=\"$namespace\", persistentvolumeclaim=~\"$instances\"})", + "expr": "max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace=\"$namespace\", cluster=\"$cluster\", persistentvolumeclaim=~\"$instances\"} / kubelet_volume_stats_capacity_bytes{namespace=\"$namespace\", cluster=\"$cluster\", persistentvolumeclaim=~\"$instances\"})", "format": "time_series", "interval": "", "legendFormat": "{{persistentvolumeclaim}}", @@ -5717,7 +5717,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace=\"$namespace\", persistentvolumeclaim=~\"(${instances})-wal\"} / kubelet_volume_stats_capacity_bytes{namespace=\"$namespace\", persistentvolumeclaim=~\"(${instances})-wal\"})", + "expr": "max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace=\"$namespace\", cluster=\"$cluster\", persistentvolumeclaim=~\"(${instances})-wal\"} / kubelet_volume_stats_capacity_bytes{namespace=\"$namespace\", cluster=\"$cluster\", persistentvolumeclaim=~\"(${instances})-wal\"})", "format": "time_series", "interval": "", "legendFormat": "{{persistentvolumeclaim}}", @@ -5792,7 +5792,7 @@ "uid": 
"${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "max by(persistentvolumeclaim) (kubelet_volume_stats_inodes_used{namespace=\"$namespace\", persistentvolumeclaim=~\"$instances\"} / kubelet_volume_stats_inodes{namespace=\"$namespace\", persistentvolumeclaim=~\"$instances\"})", + "expr": "max by(persistentvolumeclaim) (kubelet_volume_stats_inodes_used{namespace=\"$namespace\", cluster=\"$cluster\", persistentvolumeclaim=~\"$instances\"} / kubelet_volume_stats_inodes{namespace=\"$namespace\", cluster=\"$cluster\", persistentvolumeclaim=~\"$instances\"})", "format": "time_series", "interval": "", "legendFormat": "{{persistentvolumeclaim}}", @@ -5805,7 +5805,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "max by(persistentvolumeclaim) (kubelet_volume_stats_inodes_used{namespace=\"$namespace\", persistentvolumeclaim=~\"(${instances})-wal\"} / kubelet_volume_stats_inodes{namespace=\"$namespace\", persistentvolumeclaim=~\"(${instances})-wal\"})", + "expr": "max by(persistentvolumeclaim) (kubelet_volume_stats_inodes_used{namespace=\"$namespace\", cluster=\"$cluster\", persistentvolumeclaim=~\"(${instances})-wal\"} / kubelet_volume_stats_inodes{namespace=\"$namespace\", cluster=\"$cluster\", persistentvolumeclaim=~\"(${instances})-wal\"})", "format": "time_series", "interval": "", "legendFormat": "{{persistentvolumeclaim}}", @@ -5880,7 +5880,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes{namespace=\"$namespace\", persistentvolumeclaim=~\"(${instances})-tbs.*\"}) \n/\nsum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes{namespace=\"$namespace\", persistentvolumeclaim=~\"(${instances})-tbs.*\"}) \n*\non(namespace, persistentvolumeclaim) group_left(volume,pod)\nkube_pod_spec_volumes_persistentvolumeclaims_info{pod=~\"$instances\"}", + "expr": "sum by (namespace,persistentvolumeclaim) 
(kubelet_volume_stats_used_bytes{namespace=\"$namespace\", cluster=\"$cluster\", persistentvolumeclaim=~\"(${instances})-tbs.*\"}) \n/\nsum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes{namespace=\"$namespace\", cluster=\"$cluster\", persistentvolumeclaim=~\"(${instances})-tbs.*\"}) \n*\non(namespace, persistentvolumeclaim) group_left(volume,pod)\nkube_pod_spec_volumes_persistentvolumeclaims_info{namespace=\"$namespace\", cluster=\"$cluster\", pod=~\"$instances\"}", "format": "time_series", "interval": "", "legendFormat": "{{volume}}-{{pod}}", @@ -5978,7 +5978,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(rate(cnpg_pg_stat_database_tup_deleted{datname=\"\",namespace=~\"$namespace\",pod=~\"$instances\"}[5m]))", + "expr": "sum(rate(cnpg_pg_stat_database_tup_deleted{datname=\"\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}[5m]))", "interval": "", "legendFormat": "deleted", "range": true, @@ -5991,7 +5991,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(rate(cnpg_pg_stat_database_tup_inserted{datname=\"\",namespace=~\"$namespace\",pod=~\"$instances\"}[5m]))", + "expr": "sum(rate(cnpg_pg_stat_database_tup_inserted{datname=\"\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}[5m]))", "hide": false, "interval": "", "legendFormat": "inserted", @@ -6005,7 +6005,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(rate(cnpg_pg_stat_database_tup_fetched{datname=\"\",namespace=~\"$namespace\",pod=~\"$instances\"}[5m]))", + "expr": "sum(rate(cnpg_pg_stat_database_tup_fetched{datname=\"\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}[5m]))", "hide": false, "interval": "", "legendFormat": "fetched", @@ -6019,7 +6019,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(rate(cnpg_pg_stat_database_tup_returned{datname=\"\",namespace=~\"$namespace\",pod=~\"$instances\"}[5m]))", + "expr": 
"sum(rate(cnpg_pg_stat_database_tup_returned{datname=\"\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}[5m]))", "hide": false, "interval": "", "legendFormat": "returned", @@ -6033,7 +6033,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(rate(cnpg_pg_stat_database_tup_updated{datname=\"\",namespace=~\"$namespace\",pod=~\"$instances\"}[5m]))", + "expr": "sum(rate(cnpg_pg_stat_database_tup_updated{datname=\"\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}[5m]))", "hide": false, "interval": "", "legendFormat": "updated", @@ -6130,7 +6130,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "rate(cnpg_pg_stat_database_blks_hit{datname=\"\",namespace=~\"$namespace\",pod=~\"$instances\"}[5m])", + "expr": "rate(cnpg_pg_stat_database_blks_hit{datname=\"\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}[5m])", "interval": "", "legendFormat": "hit ({{pod}})", "range": true, @@ -6143,7 +6143,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "rate(cnpg_pg_stat_database_blks_read{datname=\"\",namespace=~\"$namespace\",pod=~\"$instances\"}[5m])", + "expr": "rate(cnpg_pg_stat_database_blks_read{datname=\"\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}[5m])", "hide": false, "interval": "", "legendFormat": "read ({{pod}})", @@ -6238,7 +6238,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "max by (datname) (cnpg_pg_database_size_bytes{datname!~\"template.*\",datname!=\"postgres\",namespace=~\"$namespace\",pod=~\"$instances\"})", + "expr": "max by (datname) (cnpg_pg_database_size_bytes{datname!~\"template.*\",datname!=\"postgres\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"})", "interval": "", "legendFormat": " {{pod}}: {{datname}}", "range": true, @@ -6334,7 +6334,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "rate(cnpg_pg_stat_database_temp_bytes{datname=\"\",namespace=~\"$namespace\",pod=~\"$instances\"}[5m])", + 
"expr": "rate(cnpg_pg_stat_database_temp_bytes{datname=\"\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}[5m])", "instant": false, "interval": "", "legendFormat": "{{pod}}", @@ -6454,7 +6454,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "cnpg_collector_pg_wal_archive_status{value=\"ready\",namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "cnpg_collector_pg_wal_archive_status{value=\"ready\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "interval": "", "legendFormat": "ready ({{pod}})", "refId": "A" @@ -6465,7 +6465,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "cnpg_collector_pg_wal_archive_status{value=\"done\",namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "cnpg_collector_pg_wal_archive_status{value=\"done\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "hide": false, "interval": "", "legendFormat": "done ({{pod}})", @@ -6560,7 +6560,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "rate(cnpg_pg_stat_archiver_archived_count{namespace=~\"$namespace\",pod=~\"$instances\"}[5m])", + "expr": "rate(cnpg_pg_stat_archiver_archived_count{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}[5m])", "interval": "", "legendFormat": "archived ({{pod}})", "refId": "A" @@ -6571,7 +6571,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "rate(cnpg_pg_stat_archiver_failed_count{namespace=~\"$namespace\",pod=~\"$instances\"}[5m])", + "expr": "rate(cnpg_pg_stat_archiver_failed_count{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}[5m])", "hide": false, "interval": "", "legendFormat": "failed ({{pod}})", @@ -6668,7 +6668,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "cnpg_pg_stat_archiver_seconds_since_last_archival{namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "cnpg_pg_stat_archiver_seconds_since_last_archival{namespace=~\"$namespace\", 
cluster=\"$cluster\",pod=~\"$instances\"}", "interval": "", "legendFormat": "age ({{pod}})", "refId": "A" @@ -6762,7 +6762,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "cnpg_collector_pg_wal{pod=~\"$instances\", namespace=~\"$namespace\", value=\"count\"}", + "expr": "cnpg_collector_pg_wal{pod=~\"$instances\", namespace=~\"$namespace\", cluster=\"$cluster\", value=\"count\"}", "instant": false, "legendFormat": "{{pod}}", "range": true, @@ -6887,7 +6887,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "cnpg_pg_replication_lag{namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "cnpg_pg_replication_lag{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "instant": false, "interval": "", "legendFormat": "{{pod}}", @@ -6984,7 +6984,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "cnpg_pg_stat_replication_write_lag_seconds{namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "cnpg_pg_stat_replication_write_lag_seconds{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "instant": false, "interval": "", "legendFormat": "{{pod}} -> {{application_name}}", @@ -7081,7 +7081,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "cnpg_pg_stat_replication_flush_lag_seconds{namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "cnpg_pg_stat_replication_flush_lag_seconds{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "instant": false, "interval": "", "legendFormat": "{{pod}} -> {{application_name}}", @@ -7179,7 +7179,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "cnpg_pg_stat_replication_replay_lag_seconds{namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "cnpg_pg_stat_replication_replay_lag_seconds{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "interval": "", "legendFormat": "{{pod}} -> {{application_name}}", "range": true, @@ -7304,7 +7304,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": 
"cnpg_collector_collection_duration_seconds{namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "cnpg_collector_collection_duration_seconds{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "interval": "", "legendFormat": "", "refId": "A" @@ -7411,7 +7411,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "cnpg_collector_last_collection_error{namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "cnpg_collector_last_collection_error{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "interval": "", "legendFormat": "{{pod}}", "refId": "A" @@ -7531,7 +7531,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "cnpg_collector_first_recoverability_point{namespace=~\"$namespace\",pod=~\"$instances\"}*1000 > 0", + "expr": "cnpg_collector_first_recoverability_point{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}*1000 > 0", "format": "time_series", "interval": "", "legendFormat": "{{pod}}", @@ -7654,7 +7654,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "{__name__=~\"cnpg_pg_stat_(bgwriter|checkpointer)_checkpoints_req\",namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "{__name__=~\"cnpg_pg_stat_(bgwriter|checkpointer)_checkpoints_req\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "format": "time_series", "hide": false, "instant": false, @@ -7669,7 +7669,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "{__name__=~\"cnpg_pg_stat_(bgwriter|checkpointer)_checkpoints_timed\",namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "{__name__=~\"cnpg_pg_stat_(bgwriter|checkpointer)_checkpoints_timed\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "format": "time_series", "interval": "", "intervalFactor": 1, @@ -7768,7 +7768,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": 
"{__name__=~\"cnpg_pg_stat_(bgwriter_checkpoint|checkpointer)_write_time\",namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "{__name__=~\"cnpg_pg_stat_(bgwriter_checkpoint|checkpointer)_write_time\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "format": "time_series", "hide": false, "instant": false, @@ -7783,7 +7783,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "{__name__=~\"cnpg_pg_stat_(bgwriter_checkpoint|checkpointer)_sync_time\",namespace=~\"$namespace\",pod=~\"$instances\"}", + "expr": "{__name__=~\"cnpg_pg_stat_(bgwriter_checkpoint|checkpointer)_sync_time\",namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$instances\"}", "format": "time_series", "interval": "", "intervalFactor": 1, @@ -7906,7 +7906,7 @@ { "disableTextWrap": false, "exemplar": false, - "expr": "max(cnpg_pg_extensions_update_available{pod=~\"$instances\", namespace=~\"$namespace\"}) by (datname, extname, default_version, installed_version)", + "expr": "max(cnpg_pg_extensions_update_available{pod=~\"$instances\", namespace=~\"$namespace\", cluster=\"$cluster\"}) by (datname, extname, default_version, installed_version)", "format": "table", "fullMetaSearch": false, "includeNullMetadata": true, @@ -8050,7 +8050,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(kube_pod_status_ready{namespace=\"$operatorNamespace\", pod=~\"cloudnative-pg.+|cnpg-controller-manager.+\", condition=\"true\"})", + "expr": "sum(kube_pod_status_ready{namespace=\"$operatorNamespace\", cluster=\"$cluster\", pod=~\"cloudnative-pg.+|cnpg-controller-manager.+\", condition=\"true\"})", "hide": false, "instant": true, "legendFormat": "Ready Operator Pods", @@ -8149,7 +8149,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "max(controller_runtime_reconcile_total{namespace=~\"$operatorNamespace\", result=\"error\", controller=\"cluster\"})", + "expr": "max(controller_runtime_reconcile_total{namespace=~\"$operatorNamespace\", cluster=\"$cluster\", 
result=\"error\", controller=\"cluster\"})", "hide": false, "instant": true, "legendFormat": "Cluster Reconcile Errors", @@ -8248,7 +8248,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "max(controller_runtime_reconcile_total{namespace=~\"$operatorNamespace\", result=\"error\", controller=\"backup\"})", + "expr": "max(controller_runtime_reconcile_total{namespace=~\"$operatorNamespace\", cluster=\"$cluster\", result=\"error\", controller=\"backup\"})", "hide": false, "instant": true, "legendFormat": "Backup Reconcile Errors", @@ -8347,7 +8347,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "max(controller_runtime_reconcile_total{namespace=~\"$operatorNamespace\", result=\"error\", controller=~\"scheduledbackup|scheduled-backup\"})", + "expr": "max(controller_runtime_reconcile_total{namespace=~\"$operatorNamespace\", cluster=\"$cluster\", result=\"error\", controller=~\"scheduledbackup|scheduled-backup\"})", "hide": false, "instant": true, "legendFormat": "Scheduled Backup Reconcile Errors", @@ -8446,7 +8446,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "max(controller_runtime_reconcile_total{namespace=~\"$operatorNamespace\", result=\"error\", controller=\"pooler\"})", + "expr": "max(controller_runtime_reconcile_total{namespace=~\"$operatorNamespace\", cluster=\"$cluster\", result=\"error\", controller=\"pooler\"})", "hide": false, "instant": true, "legendFormat": "Pooler Reconcile Errors", @@ -8563,7 +8563,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum(kube_pod_status_ready{namespace=\"$operatorNamespace\", pod=~\"cloudnative-pg.+|cnpg-controller-manager.+\", condition=\"true\"})", + "expr": "sum(kube_pod_status_ready{namespace=\"$operatorNamespace\", cluster=\"$cluster\", pod=~\"cloudnative-pg.+|cnpg-controller-manager.+\", condition=\"true\"})", "hide": false, "instant": false, "legendFormat": "Ready Operator Pods", @@ -8692,7 +8692,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": 
"max(controller_runtime_reconcile_total{namespace=~\"$operatorNamespace\", result=\"error\", controller=\"cluster\"})", + "expr": "max(controller_runtime_reconcile_total{namespace=~\"$operatorNamespace\", cluster=\"$cluster\", result=\"error\", controller=\"cluster\"})", "hide": false, "legendFormat": "Cluster Reconcile Errors", "range": true, @@ -8820,7 +8820,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "max(controller_runtime_reconcile_total{namespace=~\"$operatorNamespace\", result=\"error\", controller=\"backup\"})", + "expr": "max(controller_runtime_reconcile_total{namespace=~\"$operatorNamespace\", cluster=\"$cluster\", result=\"error\", controller=\"backup\"})", "hide": false, "legendFormat": "Backup Reconcile Errors", "range": true, @@ -8949,7 +8949,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "max(controller_runtime_reconcile_total{namespace=~\"$operatorNamespace\", result=\"error\", controller=~\"scheduledbackup|scheduled-backup\"})", + "expr": "max(controller_runtime_reconcile_total{namespace=~\"$operatorNamespace\", cluster=\"$cluster\", result=\"error\", controller=~\"scheduledbackup|scheduled-backup\"})", "hide": false, "instant": false, "legendFormat": "Scheduled Backup Reconcile Errors", @@ -9078,7 +9078,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "max(controller_runtime_reconcile_total{namespace=~\"$operatorNamespace\", result=\"error\", controller=\"pooler\"})", + "expr": "max(controller_runtime_reconcile_total{namespace=~\"$operatorNamespace\", cluster=\"$cluster\", result=\"error\", controller=\"pooler\"})", "hide": false, "legendFormat": "Pooler Reconcile Errors", "range": true, @@ -9124,7 +9124,31 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "definition": "label_values(controller_runtime_webhook_requests_total{webhook=\"/mutate-postgresql-cnpg-io-v1-cluster\"},namespace)", + "definition": "cnpg_collector_up", + "description": "Kubernetes cluster identifier", + "hide": 0, + 
"includeAll": false, + "label": "K8s Cluster", + "multi": false, + "name": "cluster", + "options": [], + "query": { + "query": "cnpg_collector_up", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "/cluster=\"(?[^\"]+)/g", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(controller_runtime_webhook_requests_total{webhook=\"/mutate-postgresql-cnpg-io-v1-cluster\", cluster=\"$cluster\"},namespace)", "description": "Namespace where the CNPG operator is located", "hide": 0, "includeAll": false, @@ -9134,7 +9158,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(controller_runtime_webhook_requests_total{webhook=\"/mutate-postgresql-cnpg-io-v1-cluster\"},namespace)", + "query": "label_values(controller_runtime_webhook_requests_total{webhook=\"/mutate-postgresql-cnpg-io-v1-cluster\", cluster=\"$cluster\"},namespace)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 2, @@ -9149,7 +9173,7 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "definition": "cnpg_collector_up", + "definition": "cnpg_collector_up{cluster=\"$cluster\"}", "description": "Namespace where the database cluster is located", "hide": 0, "includeAll": false, @@ -9158,7 +9182,7 @@ "name": "namespace", "options": [], "query": { - "query": "cnpg_collector_up", + "query": "cnpg_collector_up{cluster=\"$cluster\"}", "refId": "StandardVariableQuery" }, "refresh": 2, @@ -9173,7 +9197,7 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "definition": "cnpg_collector_up{namespace=~\"$namespace\"}", + "definition": "cnpg_collector_up{namespace=~\"$namespace\", cluster=\"$cluster\"}", "description": "CNPG Cluster", "hide": 0, "includeAll": false, @@ -9182,7 +9206,7 @@ "name": "cnpg_cluster", "options": [], "query": { - "query": "cnpg_collector_up{namespace=~\"$namespace\"}", + "query": 
"cnpg_collector_up{namespace=~\"$namespace\", cluster=\"$cluster\"}", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 2, @@ -9198,7 +9222,7 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "definition": "cnpg_collector_up{namespace=~\"$namespace\",pod=~\"$cnpg_cluster-([1-9][0-9]*)$\"}", + "definition": "cnpg_collector_up{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$cnpg_cluster-([1-9][0-9]*)$\"}", "description": "Database cluster instances", "hide": 0, "includeAll": true, @@ -9208,7 +9232,7 @@ "options": [], "query": { "qryType": 4, - "query": "cnpg_collector_up{namespace=~\"$namespace\",pod=~\"$cnpg_cluster-([1-9][0-9]*)$\"}", + "query": "cnpg_collector_up{namespace=~\"$namespace\", cluster=\"$cluster\",pod=~\"$cnpg_cluster-([1-9][0-9]*)$\"}", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 2, diff --git a/hosts/idols-aquamarine/monitoring/alert_rules/argocd-exporter.yml b/hosts/idols-aquamarine/monitoring/alert_rules/argocd-exporter.yml new file mode 100644 index 00000000..a83673ee --- /dev/null +++ b/hosts/idols-aquamarine/monitoring/alert_rules/argocd-exporter.yml @@ -0,0 +1,25 @@ +groups: + - name: ArgoCD Exporter + + rules: + - alert: ArgocdServiceNotSynced + expr: 'argocd_app_info{sync_status!="Synced"} != 0' + for: 15m + labels: + severity: warning + annotations: + summary: ArgoCD service not synced (instance {{ $labels.instance }}) + description: + "Service {{ $labels.name }} run by argo is currently not in sync.\n VALUE = {{ $value + }}\n LABELS = {{ $labels }}" + + - alert: ArgocdServiceUnhealthy + expr: 'argocd_app_info{health_status!="Healthy"} != 0' + for: 15m + labels: + severity: warning + annotations: + summary: ArgoCD service unhealthy (instance {{ $labels.instance }}) + description: + "Service {{ $labels.name }} run by argo is currently not healthy.\n VALUE = {{ $value + }}\n LABELS = {{ $labels }}" diff --git a/hosts/idols-aquamarine/monitoring/alert_rules/etcd_embedded-exporter.yml 
b/hosts/idols-aquamarine/monitoring/alert_rules/etcd-exporter.yml similarity index 99% rename from hosts/idols-aquamarine/monitoring/alert_rules/etcd_embedded-exporter.yml rename to hosts/idols-aquamarine/monitoring/alert_rules/etcd-exporter.yml index dff630f9..9225b9cd 100644 --- a/hosts/idols-aquamarine/monitoring/alert_rules/etcd_embedded-exporter.yml +++ b/hosts/idols-aquamarine/monitoring/alert_rules/etcd-exporter.yml @@ -1,5 +1,5 @@ groups: - - name: EmbeddedExporter + - name: Etcd Exporter rules: - alert: EtcdInsufficientMembers diff --git a/hosts/idols-aquamarine/monitoring/alert_rules/fluxcd-exporter.yml b/hosts/idols-aquamarine/monitoring/alert_rules/fluxcd-exporter.yml new file mode 100644 index 00000000..b051d845 --- /dev/null +++ b/hosts/idols-aquamarine/monitoring/alert_rules/fluxcd-exporter.yml @@ -0,0 +1,53 @@ +groups: + - name: FluxCD Exporter + + rules: + - alert: FluxKustomizationFailure + expr: 'gotk_resource_info{ready="False", customresource_kind="Kustomization"} > 0' + for: 15m + labels: + severity: warning + annotations: + summary: Flux Kustomization Failure (instance {{ $labels.instance }}) + description: + "The {{ $labels.customresource_kind }} '{{ $labels.name }}' in namespace {{ + $labels.exported_namespace }} is marked as not ready.\n VALUE = {{ $value }}\n LABELS + = {{ $labels }}" + + - alert: FluxHelmreleaseFailure + expr: 'gotk_resource_info{ready="False", customresource_kind="HelmRelease"} > 0' + for: 15m + labels: + severity: warning + annotations: + summary: Flux HelmRelease Failure (instance {{ $labels.instance }}) + description: + "The {{ $labels.customresource_kind }} '{{ $labels.name }}' in namespace {{ + $labels.exported_namespace }} is marked as not ready.\n VALUE = {{ $value }}\n LABELS + = {{ $labels }}" + + - alert: FluxSourceIssue + expr: + 'gotk_resource_info{ready="False", + customresource_kind=~"GitRepository|HelmRepository|Bucket|OCIRepository"} > 0' + for: 15m + labels: + severity: warning + annotations: + summary: 
Flux Source Issue (instance {{ $labels.instance }}) + description: + "Flux source {{ $labels.customresource_kind }} '{{ $labels.name }}' has + issue(s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: FluxImageIssue + expr: + 'gotk_resource_info{ready="False", + customresource_kind=~"ImagePolicy|ImageRepository|ImageUpdateAutomation"} > 0' + for: 15m + labels: + severity: warning + annotations: + summary: Flux Image Issue (instance {{ $labels.instance }}) + description: + "The {{ $labels.customresource_kind }} '{{ $labels.name }}' is marked as not + ready.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/hosts/idols-aquamarine/monitoring/alert_rules/istio_embedded-exporter.yml b/hosts/idols-aquamarine/monitoring/alert_rules/istio-exporter.yml similarity index 99% rename from hosts/idols-aquamarine/monitoring/alert_rules/istio_embedded-exporter.yml rename to hosts/idols-aquamarine/monitoring/alert_rules/istio-exporter.yml index 63b81d21..8a2ff838 100644 --- a/hosts/idols-aquamarine/monitoring/alert_rules/istio_embedded-exporter.yml +++ b/hosts/idols-aquamarine/monitoring/alert_rules/istio-exporter.yml @@ -1,5 +1,5 @@ groups: - - name: EmbeddedExporter + - name: Istio Exporter rules: - alert: IstioKubernetesGatewayAvailabilityDrop diff --git a/hosts/idols-aquamarine/monitoring/alert_rules/kubestate-exporter.yml b/hosts/idols-aquamarine/monitoring/alert_rules/kube-state-metrics-exporter.yml similarity index 99% rename from hosts/idols-aquamarine/monitoring/alert_rules/kubestate-exporter.yml rename to hosts/idols-aquamarine/monitoring/alert_rules/kube-state-metrics-exporter.yml index f9241089..9a839f15 100644 --- a/hosts/idols-aquamarine/monitoring/alert_rules/kubestate-exporter.yml +++ b/hosts/idols-aquamarine/monitoring/alert_rules/kube-state-metrics-exporter.yml @@ -1,5 +1,5 @@ groups: - - name: KubestateExporter + - name: kube-state-metrics Exporter rules: - alert: KubernetesNodeNotReady diff --git 
a/hosts/idols-aquamarine/monitoring/alert_rules/loki-exporter.yaml b/hosts/idols-aquamarine/monitoring/alert_rules/loki-exporter.yaml new file mode 100644 index 00000000..bcfe5126 --- /dev/null +++ b/hosts/idols-aquamarine/monitoring/alert_rules/loki-exporter.yaml @@ -0,0 +1,52 @@ +groups: + - name: Loki Exporter + + rules: + - alert: LokiProcessTooManyRestarts + expr: 'changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2' + for: 0m + labels: + severity: warning + annotations: + summary: Loki process too many restarts (instance {{ $labels.instance }}) + description: + "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ + $value }}\n LABELS = {{ $labels }}" + + - alert: LokiRequestErrors + expr: + '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by + (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by + (namespace, job, route) > 10' + for: 15m + labels: + severity: critical + annotations: + summary: Loki request errors (instance {{ $labels.instance }}) + description: + "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ + $value }}\n LABELS = {{ $labels }}" + + - alert: LokiRequestPanic + expr: "sum(increase(loki_panic_total[10m])) by (namespace, job) > 0" + for: 5m + labels: + severity: critical + annotations: + summary: Loki request panic (instance {{ $labels.instance }}) + description: + "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of + panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: LokiRequestLatency + expr: + '(histogram_quantile(0.99, + sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1' + for: 5m + labels: + severity: critical + annotations: + summary: Loki request latency (instance {{ $labels.instance }}) + description: + "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s + 99th percentile latency\n 
VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/hosts/idols-aquamarine/monitoring/alert_rules/postgres-exporter.yml b/hosts/idols-aquamarine/monitoring/alert_rules/postgres-exporter.yml new file mode 100644 index 00000000..75d86a6b --- /dev/null +++ b/hosts/idols-aquamarine/monitoring/alert_rules/postgres-exporter.yml @@ -0,0 +1,262 @@ +groups: + - name: PostgresExporter + + rules: + - alert: PostgresqlDown + expr: "pg_up == 0" + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql down (instance {{ $labels.instance }}) + description: + "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlRestarted + expr: "time() - pg_postmaster_start_time_seconds < 60" + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql restarted (instance {{ $labels.instance }}) + description: "Postgresql restarted\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlExporterError + expr: "pg_exporter_last_scrape_error > 0" + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql exporter error (instance {{ $labels.instance }}) + description: + "Postgresql exporter is showing errors. 
A query may be buggy in query.yaml\n VALUE = {{ + $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlTableNotAutoVacuumed + expr: + "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + + pg_stat_user_tables_n_tup_hot_upd) > on (instance, job, server) group_left pg_settings_autovacuum_vacuum_threshold) and (time() + - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10" + for: 0m + labels: + severity: warning + annotations: + summary: Postgresql table not auto vacuumed (instance {{ $labels.instance }}) + description: + "Table {{ $labels.relname }} has not been auto vacuumed for 10 days\n VALUE = {{ $value + }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlTableNotAutoAnalyzed + expr: + "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + + pg_stat_user_tables_n_tup_hot_upd) > on (instance, job, server) group_left pg_settings_autovacuum_analyze_threshold) and (time() + - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10" + for: 0m + labels: + severity: warning + annotations: + summary: Postgresql table not auto analyzed (instance {{ $labels.instance }}) + description: + "Table {{ $labels.relname }} has not been auto analyzed for 10 days\n VALUE = {{ $value + }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlTooManyConnections + expr: + "sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) + (pg_settings_max_connections * 0.8)" + for: 2m + labels: + severity: warning + annotations: + summary: Postgresql too many connections (instance {{ $labels.instance }}) + description: + "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS + = {{ $labels }}" + + - alert: PostgresqlNotEnoughConnections + expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5' + for: 2m + labels: + severity: critical + annotations: + summary: Postgresql not enough connections (instance {{ $labels.instance }}) + description: + "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value + }}\n LABELS = {{ 
$labels }}" + + - alert: PostgresqlDeadLocks + expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5' + for: 0m + labels: + severity: warning + annotations: + summary: Postgresql dead locks (instance {{ $labels.instance }}) + description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlHighRollbackRate + expr: + 'sum by (namespace,datname) + ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / + ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > + 0.02' + for: 0m + labels: + severity: warning + annotations: + summary: Postgresql high rollback rate (instance {{ $labels.instance }}) + description: + "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value + }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlCommitRateLow + expr: + 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5' + for: 2m + labels: + severity: critical + annotations: + summary: Postgresql commit rate low (instance {{ $labels.instance }}) + description: + "Postgresql seems to be processing very few transactions\n VALUE = {{ $value + }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlLowXidConsumption + expr: "rate(pg_txid_current[1m]) < 5" + for: 2m + labels: + severity: warning + annotations: + summary: Postgresql low XID consumption (instance {{ $labels.instance }}) + description: + "Postgresql seems to be consuming transaction IDs very slowly\n VALUE = {{ $value + }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlHighRateStatementTimeout + expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3' + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql high rate statement timeout (instance {{ $labels.instance }}) + description: + "Postgres 
transactions showing high rate of statement timeouts\n VALUE = {{ $value + }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlHighRateDeadlock + expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1' + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql high rate deadlock (instance {{ $labels.instance }}) + description: + "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlUnusedReplicationSlot + expr: "pg_replication_slots_active == 0" + for: 1m + labels: + severity: warning + annotations: + summary: Postgresql unused replication slot (instance {{ $labels.instance }}) + description: "Unused Replication Slots\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlTooManyDeadTuples + expr: + "((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + + pg_stat_user_tables_n_dead_tup)) >= 0.1" + for: 2m + labels: + severity: warning + annotations: + summary: Postgresql too many dead tuples (instance {{ $labels.instance }}) + description: + "PostgreSQL dead tuples is too large\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlConfigurationChanged + expr: + '{__name__=~"pg_settings_.*"} != ON(__name__, instance) + {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} + OFFSET 5m' + for: 0m + labels: + severity: info + annotations: + summary: Postgresql configuration changed (instance {{ $labels.instance }}) + description: + "Postgres Database configuration change has occurred\n VALUE = {{ $value }}\n LABELS = + {{ $labels }}" + + - alert: PostgresqlSslCompressionActive + expr: "sum(pg_stat_ssl_compression) > 0" + for: 0m + labels: + 
severity: critical + annotations: + summary: Postgresql SSL compression active (instance {{ $labels.instance }}) + description: + "Database allows connections with SSL compression enabled. This may add significant + jitter in replication delay. Replicas should turn off SSL compression via + `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlTooManyLocksAcquired + expr: + "(sum by (instance, job, server) (pg_locks_count) / on (instance, job, server) + (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20" + for: 2m + labels: + severity: critical + annotations: + summary: Postgresql too many locks acquired (instance {{ $labels.instance }}) + description: + "Too many locks acquired on the database. If this alert happens frequently, we may need + to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value + }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlBloatIndexHigh(>80%) + expr: + "pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)" + for: 1h + labels: + severity: warning + annotations: + summary: Postgresql bloat index high (> 80%) (instance {{ $labels.instance }}) + description: + "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX + CONCURRENTLY {{ $labels.idxname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlBloatTableHigh(>80%) + expr: + "pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)" + for: 1h + labels: + severity: warning + annotations: + summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }}) + description: + "The table {{ $labels.relname }} is bloated. 
You should execute `VACUUM {{ + $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlInvalidIndex + expr: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}' + for: 6h + labels: + severity: warning + annotations: + summary: Postgresql invalid index (instance {{ $labels.instance }}) + description: + "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You + should execute `DROP INDEX {{ $labels.indexrelname }};`\n VALUE = {{ $value + }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlReplicationLag + expr: "pg_replication_lag_seconds > 5" + for: 30s + labels: + severity: warning + annotations: + summary: Postgresql replication lag (instance {{ $labels.instance }}) + description: + "The PostgreSQL replication lag is high (> 5s)\n VALUE = {{ $value }}\n LABELS = {{ + $labels }}"