# ClusterMetricsSilent: fires when no kubelet metrics have been received for more than 10 minutes (catches vmagent outages).
# ClusterAPIServerDown: fires when the apiserver scrape has been failing for more than 5 minutes.
# Together they replace the silenced KubeControllerManagerDown/KubeSchedulerDown alerts, which never fire on managed Kubernetes.
# File listing metadata: 119 lines, 4.9 KiB, YAML
---
# VMRule (VictoriaMetrics operator) holding the EDP platform alerting rules.
# Annotations reference the `cluster_environment` label to name the affected
# cluster, so every expression keeps that label on its result.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: edp-platform-alerts
  namespace: observability
spec:
  groups:
    # Availability of core platform services and workloads.
    - name: platform-health
      rules:
        # Fires when the summed `up` of the Forgejo job is below 1 per cluster.
        # A `sum` over no series returns nothing, so this stays quiet if the
        # `up` series disappears entirely.
        # NOTE(review): confirm whether that case needs separate coverage.
        - alert: ForgejoDown
          expr: sum by(cluster_environment) (up{job="forgejo-server-http"}) < 1
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "Forgejo is down on {{ $labels.cluster_environment }}"
            description: "Forgejo server has been unreachable for more than 1 minute in cluster {{ $labels.cluster_environment }}."

        # Ratio of 5xx request rate to total request rate per cluster, > 5%.
        - alert: IngressHighErrorRate
          expr: |
            sum by(cluster_environment) (rate(nginx_ingress_controller_requests{status=~"5.."}[5m]))
            / sum by(cluster_environment) (rate(nginx_ingress_controller_requests[5m])) > 0.05
          for: 5m
          labels:
            severity: major
          annotations:
            summary: "High ingress 5xx rate on {{ $labels.cluster_environment }}"
            description: "More than 5% of ingress requests are returning 5xx errors for over 5 minutes."
            value: "{{ $value | humanizePercentage }}"

        # Fires per node whose Ready/"true" condition series has value 0.
        - alert: NodeNotReady
          expr: kube_node_status_condition{condition="Ready", status="true"} == 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Node {{ $labels.node }} not ready on {{ $labels.cluster_environment }}"
            description: "Node {{ $labels.node }} has been in NotReady state for more than 5 minutes."

        # Per-second restart rate over 15m multiplied by the 900 s window,
        # i.e. the approximate number of restarts in that window.
        - alert: PodCrashLooping
          expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 15 > 3
          for: 5m
          labels:
            severity: major
          annotations:
            summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} crash-looping on {{ $labels.cluster_environment }}"
            description: "Pod has restarted more than 3 times in the last 15 minutes."

    # Persistent volume fill level, computed as 1 - available/capacity.
    - name: storage
      rules:
        # Warning tier. The condition is also true above 90%, so this alert
        # fires together with PVCUsageCritical on nearly full volumes.
        - alert: PVCUsageHigh
          expr: |
            1 - (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.80
          for: 5m
          labels:
            severity: major
          annotations:
            summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} usage >80%"
            description: "PVC usage is at {{ $value | humanizePercentage }} on {{ $labels.cluster_environment }}."
            value: "{{ $value | humanizePercentage }}"

        # Critical tier: same ratio with a 90% threshold.
        - alert: PVCUsageCritical
          expr: |
            1 - (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.90
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} usage >90%"
            description: "PVC is almost full at {{ $value | humanizePercentage }} on {{ $labels.cluster_environment }}. Immediate action required."
            value: "{{ $value | humanizePercentage }}"

    # Node-level CPU and memory saturation.
    - name: resources
      rules:
        # 1 minus the average idle-mode CPU rate, per instance and cluster.
        - alert: NodeCPUHigh
          expr: |
            1 - avg by(instance, cluster_environment) (rate(node_cpu_seconds_total{mode="idle"}[5m])) > 0.85
          for: 15m
          labels:
            severity: major
          annotations:
            summary: "Node {{ $labels.instance }} CPU >85% on {{ $labels.cluster_environment }}"
            description: "Node CPU utilization has been above 85% for 15 minutes."
            value: "{{ $value | humanizePercentage }}"

        # 1 - MemAvailable/MemTotal. The expression is not aggregated, so it
        # yields one series per node; the summary names the node through the
        # `instance` label, matching NodeCPUHigh.
        - alert: NodeMemoryHigh
          expr: |
            1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.90
          for: 10m
          labels:
            severity: major
          annotations:
            summary: "Node {{ $labels.instance }} memory >90% on {{ $labels.cluster_environment }}"
            description: "Node memory utilization above 90% for 10 minutes."
            value: "{{ $value | humanizePercentage }}"

    # Cluster-wide liveness of the metrics pipeline and the control plane.
    - name: cluster-health
      rules:
        # Detects a cluster whose kubelet `up` series have vanished.
        #
        # `count(...) < 1` cannot express this: an aggregation over an empty
        # vector returns no sample at all, so the comparison never matches.
        # Instead, the first branch takes every cluster_environment that had
        # kubelet `up` samples within the last day and removes those that
        # still have them now. A cluster that stays silent drops out of the
        # 1d window and the alert resolves; widen the window if that is too
        # short.
        #
        # The second branch covers `dev` even when it has no samples inside
        # the window (e.g. it never reported). `absent()` returns the labels
        # of its equality matchers, so the result is reduced to
        # cluster_environment only, letting `or` de-duplicate it against the
        # first branch.
        - alert: ClusterMetricsSilent
          expr: |
            (
              count by (cluster_environment) (count_over_time(up{job="kubelet"}[1d]))
              unless
              count by (cluster_environment) (up{job="kubelet"})
            )
            or
            count by (cluster_environment) (absent(up{job="kubelet", cluster_environment="dev"}))
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: "Cluster {{ $labels.cluster_environment }} stopped sending metrics"
            description: "No kubelet metrics received from cluster {{ $labels.cluster_environment }} for over 10 minutes. Either vmagent is dead or the cluster is unreachable."

        # Fires per apiserver target whose scrape reports `up == 0`. The
        # `=~".+"` matcher restricts this to series that carry a non-empty
        # cluster_environment label.
        - alert: ClusterAPIServerDown
          expr: |
            up{job="apiserver", cluster_environment=~".+"} == 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "API server down on {{ $labels.cluster_environment }}"
            description: "Kubernetes API server scrape is failing on cluster {{ $labels.cluster_environment }}."