feat(observability): ✨ add cluster heartbeat dead-man switch alerts

ClusterMetricsSilent: fires if no kubelet metrics for >10m (catches vmagent outages). ClusterAPIServerDown: fires if apiserver scrape fails for >5m. Replaces silenced KubeControllerManagerDown/KubeSchedulerDown which never fire on managed K8s.
2026-06-22 11:05:43 +02:00 · 2026-06-22 11:05:43 +02:00 · 7a6f96a8b4
commit 7a6f96a8b4
parent eda2812d47
1 changed files with 24 additions and 0 deletions
--- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml
+++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml
@ -93,3 +93,27 @@ spec:
            summary: "Node memory >90% on {{ $labels.cluster_environment }}"
            description: "Node memory utilization above 90% for 10 minutes."
            value: "{{ $value | humanizePercentage }}"
    - name: cluster-health
      rules:
        # Dead-man switch: fires when a cluster stops delivering kubelet `up`
        # samples (per the description: vmagent is dead or the cluster is
        # unreachable).
        #
        # Branch 1 - every cluster_environment that produced kubelet `up`
        #   samples at some point in the last 24h but has none right now.
        #   A plain `count(...) by (...) < 1` cannot express this: aggregating
        #   an empty vector yields no series at all, and any group that does
        #   exist has a count of at least 1, so the comparison never matches.
        #   Trade-off: a cluster that is removed on purpose keeps alerting
        #   until its last sample ages out of the 24h window.
        # Branch 2 - fallback for the explicitly named `dev` cluster; it keeps
        #   firing after the 24h window of branch 1 has expired. When both
        #   branches match `dev`, `or` keeps only the branch-1 series because
        #   the label sets are identical.
        - alert: ClusterMetricsSilent
          expr: |
            (
              count by (cluster_environment) (
                present_over_time(up{job="kubelet"}[1d])
              )
              unless
              count by (cluster_environment) (up{job="kubelet"})
            )
            or
            absent(up{job="kubelet", cluster_environment="dev"})
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: "Cluster {{ $labels.cluster_environment }} stopped sending metrics"
            description: "No kubelet metrics received from cluster {{ $labels.cluster_environment }} for over 10 minutes. Either vmagent is dead or the cluster is unreachable."
        # Fires when an `apiserver` scrape target has reported `up == 0`
        # continuously for 5 minutes.
        # The `cluster_environment=~".+"` matcher restricts the rule to series
        # that carry a non-empty `cluster_environment` label, which the
        # annotations below interpolate.
        # NOTE: the comparison only yields results for series that exist, so
        # this rule stays silent when the `up{job="apiserver"}` series is
        # missing altogether.
        - alert: ClusterAPIServerDown
          expr: |
            up{job="apiserver", cluster_environment=~".+"} == 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "API server down on {{ $labels.cluster_environment }}"
            description: "Kubernetes API server scrape is failing on cluster {{ $labels.cluster_environment }}."