feat: monitoring - grafana dashboards, alertmanager alerting rules

Ryan Yin
2024-02-18 00:29:34 +08:00
parent b75e9d6abe
commit ef1fb417ad
15 changed files with 35988 additions and 2 deletions

@@ -0,0 +1,20 @@
apiVersion: 1
providers:
  # <string> a unique provider name. Required
  - name: 'Dashboards'
    # <int> Org ID. Defaults to 1
    orgId: 1
    # <string> provider type. Defaults to 'file'
    type: file
    # <bool> disable dashboard deletion
    disableDeletion: false
    # <int> how often Grafana will scan for changed dashboards
    updateIntervalSeconds: 20
    # <bool> allow updating provisioned dashboards from the UI
    allowUiUpdates: false
    options:
      # <string, required> path to dashboard files on disk. Required when using the 'file' type
      path: /etc/grafana/dashboards
      # <bool> use folder names from filesystem to create folders in Grafana
      foldersFromFilesStructure: true
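
With `foldersFromFilesStructure: true`, each subdirectory under the provider path becomes a Grafana folder. A hypothetical layout (folder and file names are illustrative, not part of this commit):

/etc/grafana/dashboards/
├── node/
│   └── node-exporter-full.json
└── alertmanager/
    └── alertmanager.json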

@@ -0,0 +1,5 @@
# Grafana Dashboards
1. https://grafana.com/grafana/dashboards/1860-node-exporter-full/
2. https://grafana.com/grafana/dashboards/9578-alertmanager/
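
The dashboard JSON can also be fetched directly from grafana.com (URL pattern assumed from common usage; substitute a concrete revision number for `<rev>`): `https://grafana.com/api/dashboards/1860/revisions/<rev>/download`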

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,21 @@
# https://grafana.com/docs/grafana/latest/datasources/prometheus/
apiVersion: 1
datasources:
- name: prometheus-homelab
type: prometheus
access: proxy
# Access mode - proxy (server in the UI) or direct (browser in the UI).
url: http://localhost:9090
jsonData:
httpMethod: POST
manageAlerts: true
prometheusType: Prometheus
prometheusVersion: 2.49.0
cacheLevel: 'High'
disableRecordingRules: false
# As of Grafana 10, the Prometheus data source can be configured to query live dashboards
# incrementally, instead of re-querying the entire duration on each dashboard refresh.
# Increasing the duration of the incrementalQueryOverlapWindow will increase the size of every incremental query,
# but might be helpful for instances that have inconsistent results for recent data.
incrementalQueryOverlapWindow: 10m

@@ -49,4 +49,6 @@
      dashboards.path = ./dashboards.yml;
    };
  };

  environment.etc."grafana/dashboards".source = ./dashboards;
}
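
For context, a minimal sketch of how the provisioning files above might be wired together in this module (only `dashboards.path` and the `environment.etc` line appear in the hunk above; the rest, including `datasources.path`, is an assumption based on the NixOS `services.grafana.provision` options):

{ ... }:
{
  services.grafana = {
    enable = true;
    provision = {
      enable = true;
      # dashboard provider definition (dashboards.yml above)
      dashboards.path = ./dashboards.yml;
      # prometheus datasource definition (datasources.yml above); assumed option
      datasources.path = ./datasources.yml;
    };
  };
  # the provider's `path` points at /etc/grafana/dashboards, so ship the dashboard JSON there
  environment.etc."grafana/dashboards".source = ./dashboards;
}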

@@ -0,0 +1,7 @@
# Prometheus & Alertmanager
## Alert Rules
- [awesome-prometheus-alerts](https://github.com/samber/awesome-prometheus-alerts): Collection of Prometheus alerting rules
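
Rule files can be syntax-checked before deployment with `promtool check rules <file>` (`promtool` ships with Prometheus).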

@@ -0,0 +1,14 @@
groups:
  - name: EmbeddedExporter
    rules:
      - alert: CorednsPanicCount
        expr: 'increase(coredns_panics_total[1m]) > 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: CoreDNS Panic Count (instance {{ $labels.instance }})
          description: "Number of CoreDNS panics encountered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

@@ -0,0 +1,122 @@
groups:
  - name: EmbeddedExporter
    rules:
      - alert: EtcdInsufficientMembers
        expr: 'count(etcd_server_id) % 2 == 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Etcd insufficient Members (instance {{ $labels.instance }})
          description: "Etcd cluster should have an odd number of members\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: EtcdNoLeader
        expr: 'etcd_server_has_leader == 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Etcd no Leader (instance {{ $labels.instance }})
          description: "Etcd cluster has no leader\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: EtcdHighNumberOfLeaderChanges
        expr: 'increase(etcd_server_leader_changes_seen_total[10m]) > 2'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Etcd high number of leader changes (instance {{ $labels.instance }})
          description: "Etcd leader changed more than 2 times in the last 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: EtcdHighNumberOfFailedGrpcRequests
        expr: 'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.01'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Etcd high number of failed GRPC requests (instance {{ $labels.instance }})
          description: "More than 1% GRPC request failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: EtcdHighNumberOfFailedGrpcRequests
        expr: 'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.05'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Etcd high number of failed GRPC requests (instance {{ $labels.instance }})
          description: "More than 5% GRPC request failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: EtcdGrpcRequestsSlow
        expr: 'histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{grpc_type="unary"}[1m])) by (grpc_service, grpc_method, le)) > 0.15'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Etcd GRPC requests slow (instance {{ $labels.instance }})
          description: "GRPC requests slowing down, 99th percentile is over 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: EtcdHighNumberOfFailedHttpRequests
        expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Etcd high number of failed HTTP requests (instance {{ $labels.instance }})
          description: "More than 1% HTTP failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: EtcdHighNumberOfFailedHttpRequests
        expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Etcd high number of failed HTTP requests (instance {{ $labels.instance }})
          description: "More than 5% HTTP failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: EtcdHttpRequestsSlow
        expr: 'histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[1m])) > 0.15'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Etcd HTTP requests slow (instance {{ $labels.instance }})
          description: "HTTP requests slowing down, 99th percentile is over 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: EtcdMemberCommunicationSlow
        expr: 'histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) > 0.15'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Etcd member communication slow (instance {{ $labels.instance }})
          description: "Etcd member communication slowing down, 99th percentile is over 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: EtcdHighNumberOfFailedProposals
        expr: 'increase(etcd_server_proposals_failed_total[1h]) > 5'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Etcd high number of failed proposals (instance {{ $labels.instance }})
          description: "Etcd server got more than 5 failed proposals in the past hour\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: EtcdHighFsyncDurations
        expr: 'histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Etcd high fsync durations (instance {{ $labels.instance }})
          description: "Etcd WAL fsync duration increasing, 99th percentile is over 0.5s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: EtcdHighCommitDurations
        expr: 'histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) > 0.25'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Etcd high commit durations (instance {{ $labels.instance }})
          description: "Etcd commit duration increasing, 99th percentile is over 0.25s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

@@ -0,0 +1,95 @@
groups:
  - name: EmbeddedExporter
    rules:
      - alert: IstioKubernetesGatewayAvailabilityDrop
        expr: 'min(kube_deployment_status_replicas_available{deployment="istio-ingressgateway", namespace="istio-system"}) without (instance, pod) < 2'
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Istio Kubernetes gateway availability drop (instance {{ $labels.instance }})
          description: "The number of available gateway pods has dropped. Inbound traffic will likely be affected.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: IstioPilotHighTotalRequestRate
        expr: 'sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5'
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Istio Pilot high total request rate (instance {{ $labels.instance }})
          description: "Number of Istio Pilot push errors is too high (> 5%). Envoy sidecars might have outdated configuration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: IstioMixerPrometheusDispatchesLow
        expr: 'sum(rate(mixer_runtime_dispatches_total{adapter=~"prometheus"}[1m])) < 180'
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Istio Mixer Prometheus dispatches low (instance {{ $labels.instance }})
          description: "Number of Mixer dispatches to Prometheus is too low. Istio metrics might not be exported properly.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: IstioHighTotalRequestRate
        expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) > 1000'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Istio high total request rate (instance {{ $labels.instance }})
          description: "Global request rate in the service mesh is unusually high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: IstioLowTotalRequestRate
        expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) < 100'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Istio low total request rate (instance {{ $labels.instance }})
          description: "Global request rate in the service mesh is unusually low.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: IstioHigh4xxErrorRate
        expr: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"4.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5'
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Istio high 4xx error rate (instance {{ $labels.instance }})
          description: "High percentage of HTTP 4xx responses in Istio (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: IstioHigh5xxErrorRate
        expr: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5'
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Istio high 5xx error rate (instance {{ $labels.instance }})
          description: "High percentage of HTTP 5xx responses in Istio (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: IstioHighRequestLatency
        expr: 'rate(istio_request_duration_milliseconds_sum{reporter="destination"}[1m]) / rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 100'
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Istio high request latency (instance {{ $labels.instance }})
          description: "Istio average request duration is longer than 100ms.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: IstioLatency99Percentile
        expr: 'histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1000'
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Istio latency 99 percentile (instance {{ $labels.instance }})
          description: "The slowest 1% of Istio requests are taking longer than 1000ms.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: IstioPilotDuplicateEntry
        expr: 'sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Istio Pilot Duplicate Entry (instance {{ $labels.instance }})
          description: "Istio Pilot duplicate entry error.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

@@ -0,0 +1,311 @@
groups:
  - name: KubestateExporter
    rules:
      - alert: KubernetesNodeNotReady
        expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes Node not ready (node {{ $labels.node }})
          description: "Node {{ $labels.node }} has been unready for more than 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesNodeMemoryPressure
        expr: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes memory pressure (node {{ $labels.node }})
          description: "Node {{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesNodeDiskPressure
        expr: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes disk pressure (node {{ $labels.node }})
          description: "Node {{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesNodeNetworkUnavailable
        expr: 'kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes Node network unavailable (instance {{ $labels.instance }})
          description: "Node {{ $labels.node }} has NetworkUnavailable condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesNodeOutOfPodCapacity
        expr: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes Node out of pod capacity (instance {{ $labels.instance }})
          description: "Node {{ $labels.node }} is out of pod capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesContainerOomKiller
        expr: '(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes container oom killer ({{ $labels.namespace }}/{{ $labels.pod }}:{{ $labels.container }})
          description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesJobFailed
        expr: 'kube_job_status_failed > 0'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes Job failed ({{ $labels.namespace }}/{{ $labels.job_name }})
          description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesCronjobSuspended
        expr: 'kube_cronjob_spec_suspend != 0'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes CronJob suspended ({{ $labels.namespace }}/{{ $labels.cronjob }})
          description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesPersistentvolumeclaimPending
        expr: 'kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }})
          description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesVolumeOutOfDiskSpace
        expr: 'kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes Volume out of disk space (instance {{ $labels.instance }})
          description: "Volume is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesVolumeFullInFourDays
        expr: 'predict_linear(kubelet_volume_stats_available_bytes[6h:5m], 4 * 24 * 3600) < 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes Volume full in four days (instance {{ $labels.instance }})
          description: "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days at the current write rate.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesPersistentvolumeError
        expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes PersistentVolume error ({{ $labels.persistentvolume }})
          description: "Persistent volume {{ $labels.persistentvolume }} is in a bad state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesStatefulsetDown
        expr: 'kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes StatefulSet down ({{ $labels.namespace }}/{{ $labels.statefulset }})
          description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesHpaScaleInability
        expr: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes HPA scale inability (instance {{ $labels.instance }})
          description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to scale\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesHpaMetricsUnavailability
        expr: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes HPA metrics unavailability (instance {{ $labels.instance }})
          description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect metrics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesHpaScaleMaximum
        expr: 'kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas'
        for: 2m
        labels:
          severity: info
        annotations:
          summary: Kubernetes HPA scale maximum (instance {{ $labels.instance }})
          description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit the maximum number of desired pods\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesHpaUnderutilized
        expr: 'max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler) > 3'
        for: 0m
        labels:
          severity: info
        annotations:
          summary: Kubernetes HPA underutilized (instance {{ $labels.instance }})
          description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is constantly at minimum replicas for 50% of the time. Potential cost saving here.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesPodNotHealthy
        expr: 'sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0'
        for: 15m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes Pod not healthy ({{ $labels.namespace }}/{{ $labels.pod }})
          description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesPodCrashLooping
        expr: 'increase(kube_pod_container_status_restarts_total[1m]) > 3'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes pod crash looping ({{ $labels.namespace }}/{{ $labels.pod }})
          description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesReplicasetReplicasMismatch
        expr: 'kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes ReplicaSet mismatch ({{ $labels.namespace }}/{{ $labels.replicaset }})
          description: "ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesDeploymentReplicasMismatch
        expr: 'kube_deployment_spec_replicas != kube_deployment_status_replicas_available'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes Deployment replicas mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
          description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesStatefulsetReplicasMismatch
        expr: 'kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})
          description: "StatefulSet does not match the expected number of replicas.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesDeploymentGenerationMismatch
        expr: 'kube_deployment_status_observed_generation != kube_deployment_metadata_generation'
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes Deployment generation mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
          description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesStatefulsetGenerationMismatch
        expr: 'kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation'
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes StatefulSet generation mismatch ({{ $labels.namespace }}/{{ $labels.statefulset }})
          description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesStatefulsetUpdateNotRolledOut
        expr: 'max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes StatefulSet update not rolled out ({{ $labels.namespace }}/{{ $labels.statefulset }})
          description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesDaemonsetRolloutStuck
        expr: 'kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes DaemonSet rollout stuck ({{ $labels.namespace }}/{{ $labels.daemonset }})
          description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled or not ready\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesDaemonsetMisscheduled
        expr: 'kube_daemonset_status_number_misscheduled > 0'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes DaemonSet misscheduled ({{ $labels.namespace }}/{{ $labels.daemonset }})
          description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesCronjobTooLong
        expr: 'time() - kube_cronjob_next_schedule_time > 3600'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes CronJob too long ({{ $labels.namespace }}/{{ $labels.cronjob }})
          description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesJobSlowCompletion
        expr: 'kube_job_spec_completions - kube_job_status_succeeded - kube_job_status_failed > 0'
        for: 12h
        labels:
          severity: critical
        annotations:
          summary: Kubernetes job slow completion ({{ $labels.namespace }}/{{ $labels.job_name }})
          description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesApiServerErrors
        expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes API server errors (instance {{ $labels.instance }})
          description: "Kubernetes API server is experiencing a high error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesApiClientErrors
        expr: '(sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes API client errors (instance {{ $labels.instance }})
          description: "Kubernetes API client is experiencing a high error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesClientCertificateExpiresNextWeek
        expr: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }})
          description: "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesClientCertificateExpiresSoon
        expr: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes client certificate expires soon (instance {{ $labels.instance }})
          description: "A client certificate used to authenticate to the apiserver is expiring in less than 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: KubernetesApiServerLatency
        expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) > 1'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes API server latency (instance {{ $labels.instance }})
          description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

@@ -0,0 +1,347 @@
groups:
  - name: NodeExporter
    rules:
      - alert: HostOutOfMemory
        expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of memory (instance {{ $labels.instance }})
          description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostMemoryUnderMemoryPressure
        expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host memory under memory pressure (instance {{ $labels.instance }})
          description: "The node is under heavy memory pressure, with a high rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostMemoryIsUnderutilized
        expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 1w
        labels:
          severity: info
        annotations:
          summary: Host Memory is underutilized (instance {{ $labels.instance }})
          description: "Node memory usage has been < 20% for 1 week. Consider reducing the allocated memory.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostUnusualNetworkThroughputIn
        expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual network throughput in (instance {{ $labels.instance }})
          description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostUnusualNetworkThroughputOut
        expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual network throughput out (instance {{ $labels.instance }})
          description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostUnusualDiskReadRate
        expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk read rate (instance {{ $labels.instance }})
          description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostUnusualDiskWriteRate
        expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk write rate (instance {{ $labels.instance }})
          description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostOutOfDiskSpace
        expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of disk space (instance {{ $labels.instance }})
          description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostDiskWillFillIn24Hours
        expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
          description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostOutOfInodes
        expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of inodes (instance {{ $labels.instance }})
          description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostFilesystemDeviceError
        expr: 'node_filesystem_device_error == 1'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Host filesystem device error (instance {{ $labels.instance }})
          description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostInodesWillFillIn24Hours
        expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
          description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostUnusualDiskReadLatency
        expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk read latency (instance {{ $labels.instance }})
          description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostUnusualDiskWriteLatency
        expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk write latency (instance {{ $labels.instance }})
          description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostHighCpuLoad
        expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Host high CPU load (instance {{ $labels.instance }})
          description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostCpuIsUnderutilized
        expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 1w
        labels:
          severity: info
        annotations:
          summary: Host CPU is underutilized (instance {{ $labels.instance }})
          description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostCpuStealNoisyNeighbor
        expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
          description: "CPU steal is > 10%. A noisy neighbor is degrading VM performance, or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostCpuHighIowait
        expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host CPU high iowait (instance {{ $labels.instance }})
          description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostUnusualDiskIo
        expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk IO (instance {{ $labels.instance }})
          description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostContextSwitching
        expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host context switching (instance {{ $labels.instance }})
          description: "Context switching is growing on the node (> 10000 / CPU / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostSwapIsFillingUp
        expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host swap is filling up (instance {{ $labels.instance }})
          description: "Swap is filling up (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostSystemdServiceCrashed
        expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host systemd service crashed (instance {{ $labels.instance }})
          description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostPhysicalComponentTooHot
        expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host physical component too hot (instance {{ $labels.instance }})
          description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostNodeOvertemperatureAlarm
        expr: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Host node overtemperature alarm (instance {{ $labels.instance }})
          description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostRaidArrayGotInactive
        expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Host RAID array got inactive (instance {{ $labels.instance }})
          description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostRaidDiskFailure
        expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host RAID disk failure (instance {{ $labels.instance }})
          description: "At least one device in the RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostKernelVersionDeviations
        expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 6h
        labels:
          severity: warning
        annotations:
          summary: Host kernel version deviations (instance {{ $labels.instance }})
          description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostOomKillDetected
        expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host OOM kill detected (instance {{ $labels.instance }})
          description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostEdacCorrectableErrorsDetected
        expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 0m
        labels:
          severity: info
        annotations:
          summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
          description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last minute.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostEdacUncorrectableErrorsDetected
        expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
          description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostNetworkReceiveErrors
        expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host Network Receive Errors (instance {{ $labels.instance }})
          description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostNetworkTransmitErrors
        expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host Network Transmit Errors (instance {{ $labels.instance }})
          description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostNetworkInterfaceSaturated
        expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Host Network Interface Saturated (instance {{ $labels.instance }})
          description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostNetworkBondDegraded
        expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host Network Bond Degraded (instance {{ $labels.instance }})
          description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostConntrackLimit
        expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host conntrack limit (instance {{ $labels.instance }})
          description: "The number of conntrack entries is approaching the limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostClockSkew
        expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Host clock skew (instance {{ $labels.instance }})
          description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostClockNotSynchronising
        expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host clock not synchronising (instance {{ $labels.instance }})
          description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostRequiresReboot
        expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
        for: 4h
        labels:
          severity: info
        annotations:
          summary: Host requires reboot (instance {{ $labels.instance }})
          description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

@@ -34,8 +34,13 @@
  # Alerting rules allow you to define alert conditions based on Prometheus expression language expressions
  # and to send notifications about firing alerts to an external service.
  ruleFiles = [
    ./alerting_rules.yml
    ./alert_rules/node-exporter.yml
    ./alert_rules/kubestate-exporter.yml
    ./alert_rules/etcd_embedded-exporter.yml
    ./alert_rules/istio_embedded-exporter.yml
    ./alert_rules/coredns_embedded-exporter.yml
    # ./recording_rules.yml
  ];
  # specifies a set of targets and parameters describing how to scrape metrics from them.
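
For context, a minimal sketch of the surrounding module (the `services.prometheus.ruleFiles` and `alertmanagers` option names are standard NixOS options; the Alertmanager target shown is an assumption for illustration, not part of this hunk):

{ ... }:
{
  services.prometheus = {
    enable = true;
    # ruleFiles entries are Nix paths: each file is copied into the Nix store
    # and referenced from the generated prometheus.yml
    ruleFiles = [ ./alert_rules/node-exporter.yml /* ... */ ];
    # assumed: deliver firing alerts to a local Alertmanager instance
    alertmanagers = [{
      static_configs = [{ targets = [ "localhost:9093" ]; }];
    }];
  };
}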