feat(observability): add cluster heartbeat dead-man switch alerts

ClusterMetricsSilent: fires if no kubelet metrics for >10m (catches vmagent outages).
ClusterAPIServerDown: fires if apiserver scrape fails for >5m.
Replaces the silenced KubeControllerManagerDown/KubeSchedulerDown alerts, which never fire on managed K8s.
This commit is contained in:
Daniel Sy 2026-06-22 11:05:43 +02:00
parent eda2812d47
commit 7a6f96a8b4
Signed by untrusted user: danielsy
GPG key ID: 1F39A8BBCD2EE3D3

View file

@ -93,3 +93,27 @@ spec:
summary: "Node memory >90% on {{ $labels.cluster_environment }}" summary: "Node memory >90% on {{ $labels.cluster_environment }}"
description: "Node memory utilization above 90% for 10 minutes." description: "Node memory utilization above 90% for 10 minutes."
value: "{{ $value | humanizePercentage }}" value: "{{ $value | humanizePercentage }}"
# Rule group: cluster-level heartbeat ("dead-man switch") alerts.
# These detect a whole cluster going quiet, as opposed to a single
# component misbehaving.
- name: cluster-health
  rules:
    # Fires when a cluster that was reporting kubelet `up` samples stops
    # reporting them entirely.
    - alert: ClusterMetricsSilent
      # Why not `count(...) by (cluster_environment) < 1`: count() only
      # emits a group when at least one series exists, so its value is
      # always >= 1 and that comparison can never match. A silent cluster
      # simply disappears from the result instead of dropping to 0.
      #
      # Branch 1: every cluster_environment seen at any point in the last
      #   day, minus the ones that still have a current sample. Both sides
      #   are aggregated to the single cluster_environment label so that
      #   `unless` can match them. The 1d window bounds how long the alert
      #   keeps firing for a cluster that never comes back.
      # Branch 2: absent() cannot enumerate label values, so dev is named
      #   explicitly; this keeps dev covered even after the 1d window has
      #   aged out. absent() copies the equality matchers into the result,
      #   so $labels.cluster_environment is still populated.
      expr: |
        (
          count by (cluster_environment) (count_over_time(up{job="kubelet"}[1d]))
          unless
          count by (cluster_environment) (up{job="kubelet"})
        )
        or
        absent(up{job="kubelet", cluster_environment="dev"})
      for: 10m
      labels:
        severity: critical
      annotations:
        summary: "Cluster {{ $labels.cluster_environment }} stopped sending metrics"
        description: "No kubelet metrics received from cluster {{ $labels.cluster_environment }} for over 10 minutes. Either vmagent is dead or the cluster is unreachable."
    # Fires when the apiserver target is still being scraped but the
    # scrape reports failure (up == 0) for 5 minutes.
    - alert: ClusterAPIServerDown
      # The `=~".+"` matcher restricts the alert to series that carry a
      # non-empty cluster_environment label, so the annotations below
      # always have a value to render.
      expr: |
        up{job="apiserver", cluster_environment=~".+"} == 0
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "API server down on {{ $labels.cluster_environment }}"
        description: "Kubernetes API server scrape is failing on cluster {{ $labels.cluster_environment }}."