diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml
index cb0f1e3..2cce6a3 100644
--- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml
+++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml
@@ -93,3 +93,40 @@ spec:
             summary: "Node memory >90% on {{ $labels.cluster_environment }}"
             description: "Node memory utilization above 90% for 10 minutes."
             value: "{{ $value | humanizePercentage }}"
+
+    # Cluster-level liveness alerts, keyed by the cluster_environment label.
+    - name: cluster-health
+      rules:
+        # Fires when a cluster that reported kubelet "up" samples within the
+        # last day has none now. count() over an empty vector returns no
+        # series (never 0), so a "count(...) < 1" test cannot detect silence;
+        # the known-recent set is compared against the current set instead.
+        # The absent() clause keeps "dev" covered even if it has been silent
+        # for longer than the lookback window.
+        - alert: ClusterMetricsSilent
+          expr: |
+            (
+              count(last_over_time(up{job="kubelet"}[1d])) by (cluster_environment)
+              unless
+              count(up{job="kubelet"}) by (cluster_environment)
+            )
+            or on (cluster_environment)
+            absent(up{job="kubelet", cluster_environment="dev"})
+          for: 10m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Cluster {{ $labels.cluster_environment }} stopped sending metrics"
+            description: "No kubelet metrics received from cluster {{ $labels.cluster_environment }} for over 10 minutes. Either vmagent is dead or the cluster is unreachable."
+
+        # Fires when the apiserver scrape target reports up == 0 for 5 minutes;
+        # the =~".+" matcher skips series without a cluster_environment label.
+        - alert: ClusterAPIServerDown
+          expr: |
+            up{job="apiserver", cluster_environment=~".+"} == 0
+          for: 5m
+          labels:
+            severity: critical
+          annotations:
+            summary: "API server down on {{ $labels.cluster_environment }}"
+            description: "Kubernetes API server scrape is failing on cluster {{ $labels.cluster_environment }}."