feat(observability): ✨ add cluster heartbeat dead-man switch alerts
ClusterMetricsSilent: fires if no kubelet metrics for >10m (catches vmagent outages). ClusterAPIServerDown: fires if apiserver scrape fails for >5m. Replaces silenced KubeControllerManagerDown/KubeSchedulerDown which never fire on managed K8s.
This commit is contained in:
parent
eda2812d47
commit
7a6f96a8b4
1 changed files with 24 additions and 0 deletions
|
|
@ -93,3 +93,27 @@ spec:
|
|||
summary: "Node memory >90% on {{ $labels.cluster_environment }}"
|
||||
description: "Node memory utilization above 90% for 10 minutes."
|
||||
value: "{{ $value | humanizePercentage }}"
|
||||
|
||||
# Rule group holding the cluster heartbeat ("dead-man switch") alerts.
- name: cluster-health
|
||||
rules:
|
||||
# Dead-man switch: fires when a cluster that was reporting kubelet `up`
# samples stops reporting them altogether (metrics agent down, cluster
# unreachable, remote-write broken, ...).
- alert: ClusterMetricsSilent
  # Why not `count(up{job="kubelet"}) by (cluster_environment) < 1`?
  # An aggregation over zero series yields *no* sample (not 0), so that
  # comparison can never be true and silent clusters would go unnoticed.
  #
  # Instead:
  #   1. take every cluster_environment that produced a kubelet `up` sample
  #      at any point in the last day,
  #   2. drop (`unless`) those that still have a sample right now.
  # What remains is exactly the set of clusters that went silent. Both sides
  # are aggregated to the single `cluster_environment` label so `unless`
  # matches on it and the annotations below can render it.
  #
  # The `absent()` fallback keeps the original guarantee for `dev`: it still
  # fires when dev has had no data for longer than the 1d lookback (or never
  # reported at all). It is aggregated to the same label set so dev never
  # raises two differently-labelled alerts.
  #
  # NOTE(review): for non-dev clusters the alert resolves by itself once the
  # outage exceeds the 1d lookback, and a deliberately decommissioned cluster
  # will alert for up to 1d. Widen the range or add per-cluster absent()
  # clauses if that is not acceptable.
  expr: |
    (
      count by (cluster_environment) (count_over_time(up{job="kubelet"}[1d]))
      unless
      count by (cluster_environment) (up{job="kubelet"})
    )
    or
    count by (cluster_environment) (absent(up{job="kubelet", cluster_environment="dev"}))
  # Require 10 minutes of continuous silence before paging.
  for: 10m
  labels:
    severity: critical
  annotations:
    summary: "Cluster {{ $labels.cluster_environment }} stopped sending metrics"
    description: "No kubelet metrics received from cluster {{ $labels.cluster_environment }} for over 10 minutes. Either vmagent is dead or the cluster is unreachable."
|
||||
|
||||
# Pages when the Kubernetes API server scrape keeps failing.
- alert: ClusterAPIServerDown
  # `up == 0` selects apiserver targets whose most recent scrape failed.
  # The `cluster_environment=~".+"` matcher keeps only series that carry a
  # non-empty cluster_environment label, which the annotations interpolate.
  expr: |
    up{job="apiserver", cluster_environment=~".+"} == 0
  # The failure has to persist for 5 minutes before the alert fires.
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: "API server down on {{ $labels.cluster_environment }}"
    description: "Kubernetes API server scrape is failing on cluster {{ $labels.cluster_environment }}."
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue