feat(observability): ✨ add cluster heartbeat dead-man switch alerts
ClusterMetricsSilent: fires if no kubelet metrics for >10m (catches vmagent outages). ClusterAPIServerDown: fires if apiserver scrape fails for >5m. Replaces silenced KubeControllerManagerDown/KubeSchedulerDown which never fire on managed K8s.
This commit is contained in:
parent
eda2812d47
commit
7a6f96a8b4
1 changed files with 24 additions and 0 deletions
|
|
@ -93,3 +93,27 @@ spec:
|
||||||
summary: "Node memory >90% on {{ $labels.cluster_environment }}"
|
summary: "Node memory >90% on {{ $labels.cluster_environment }}"
|
||||||
description: "Node memory utilization above 90% for 10 minutes."
|
description: "Node memory utilization above 90% for 10 minutes."
|
||||||
value: "{{ $value | humanizePercentage }}"
|
value: "{{ $value | humanizePercentage }}"
|
||||||
|
|
||||||
|
- name: cluster-health
|
||||||
|
rules:
|
||||||
|
# Dead-man switch: fires when a cluster stops delivering kubelet `up` samples.
#
# The expression has two parts joined by `or`:
#   1. Every cluster_environment that reported kubelet `up` samples at some
#      point in the last 24h, minus (`unless`) the ones reporting right now.
#      This covers all clusters dynamically, without listing them by name.
#   2. An absent() backstop for "dev", which keeps firing when the outage
#      outlasts the 24h lookback window or "dev" has never reported at all.
#
# The earlier clause `count(up{job="kubelet"}) by (cluster_environment) < 1`
# was removed because it could never be true: count() yields no series for an
# empty input and a value >= 1 otherwise, so only the hard-coded "dev" clause
# was effective.
- alert: ClusterMetricsSilent
  expr: |
    (
      count by (cluster_environment) (count_over_time(up{job="kubelet"}[1d]))
      unless
      count by (cluster_environment) (up{job="kubelet"})
    )
    or
    absent(up{job="kubelet", cluster_environment="dev"})
  for: 10m
  labels:
    severity: critical
  annotations:
    summary: "Cluster {{ $labels.cluster_environment }} stopped sending metrics"
    description: "No kubelet metrics received from cluster {{ $labels.cluster_environment }} for over 10 minutes. Either vmagent is dead or the cluster is unreachable."
|
||||||
|
|
||||||
|
# Critical alert: the `up` series for job="apiserver" (with a non-empty
# cluster_environment label) has had the value 0 for 5 minutes, i.e. the
# API server scrape is failing. The `== 0` comparison only matches series
# that exist, so a target that vanishes entirely does not trigger this rule.
- alert: ClusterAPIServerDown
|
||||||
|
expr: |
|
||||||
|
up{job="apiserver", cluster_environment=~".+"} == 0
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "API server down on {{ $labels.cluster_environment }}"
|
||||||
|
description: "Kubernetes API server scrape is failing on cluster {{ $labels.cluster_environment }}."
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue