---
# VMRule (VictoriaMetrics operator) holding the platform alerting rules.
# Every rule reports the originating cluster through the `cluster_environment`
# label, which the annotation templates below read from `$labels`.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: edp-platform-alerts
  namespace: observability
spec:
  groups:
    # Availability of core platform services and workloads.
    - name: platform-health
      rules:
        # Fires when the summed `up` of the Forgejo HTTP job is below 1 for a
        # cluster, i.e. no Forgejo target is being scraped successfully.
        # NOTE(review): if the `up` series vanishes entirely (target removed
        # from discovery) this expression returns nothing and cannot fire.
        - alert: ForgejoDown
          expr: sum by(cluster_environment) (up{job="forgejo-server-http"}) < 1
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "Forgejo is down on {{ $labels.cluster_environment }}"
            description: "Forgejo server has been unreachable for more than 1 minute in cluster {{ $labels.cluster_environment }}."

        # Ratio of 5xx ingress requests to all ingress requests, per cluster,
        # over a 5-minute rate window; fires above 5%.
        - alert: IngressHighErrorRate
          expr: |
            sum by(cluster_environment) (rate(nginx_ingress_controller_requests{status=~"5.."}[5m]))
            / sum by(cluster_environment) (rate(nginx_ingress_controller_requests[5m])) > 0.05
          for: 5m
          labels:
            severity: major
          annotations:
            summary: "High ingress 5xx rate on {{ $labels.cluster_environment }}"
            description: "More than 5% of ingress requests are returning 5xx errors for over 5 minutes."
            value: "{{ $value | humanizePercentage }}"

        # The Ready/true condition series equals 0 when the node is not Ready.
        - alert: NodeNotReady
          expr: kube_node_status_condition{condition="Ready", status="true"} == 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Node {{ $labels.node }} not ready on {{ $labels.cluster_environment }}"
            description: "Node {{ $labels.node }} has been in NotReady state for more than 5 minutes."

        # Per-second restart rate scaled by 60 * 15 seconds gives the number
        # of restarts over the 15-minute window; fires above 3.
        - alert: PodCrashLooping
          expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 15 > 3
          for: 5m
          labels:
            severity: major
          annotations:
            summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} crash-looping on {{ $labels.cluster_environment }}"
            description: "Pod has restarted more than 3 times in the last 15 minutes."

    # Persistent volume fill level, computed as 1 - available / capacity.
    # NOTE(review): above 90% both PVCUsageHigh and PVCUsageCritical fire for
    # the same volume; confirm an Alertmanager inhibit rule handles this.
    - name: storage
      rules:
        - alert: PVCUsageHigh
          expr: |
            1 - (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.80
          for: 5m
          labels:
            severity: major
          annotations:
            summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} usage >80%"
            description: "PVC usage is at {{ $value | humanizePercentage }} on {{ $labels.cluster_environment }}."
            value: "{{ $value | humanizePercentage }}"

        - alert: PVCUsageCritical
          expr: |
            1 - (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.90
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} usage >90%"
            description: "PVC is almost full at {{ $value | humanizePercentage }} on {{ $labels.cluster_environment }}. Immediate action required."
            value: "{{ $value | humanizePercentage }}"

    # Node-level CPU and memory saturation.
    - name: resources
      rules:
        # 1 minus the average idle-mode CPU rate per instance = busy fraction.
        - alert: NodeCPUHigh
          expr: |
            1 - avg by(instance, cluster_environment) (rate(node_cpu_seconds_total{mode="idle"}[5m])) > 0.85
          for: 15m
          labels:
            severity: major
          annotations:
            summary: "Node {{ $labels.instance }} CPU >85% on {{ $labels.cluster_environment }}"
            description: "Node CPU utilization has been above 85% for 15 minutes."
            value: "{{ $value | humanizePercentage }}"

        # 1 - MemAvailable / MemTotal = used fraction. The expression is not
        # aggregated, so the source series labels stay available to the
        # templates; the summary names the node like NodeCPUHigh does.
        # NOTE(review): assumes these series carry an `instance` label, as
        # node_cpu_seconds_total does above - confirm.
        - alert: NodeMemoryHigh
          expr: |
            1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.90
          for: 10m
          labels:
            severity: major
          annotations:
            summary: "Node {{ $labels.instance }} memory >90% on {{ $labels.cluster_environment }}"
            description: "Node memory utilization above 90% for 10 minutes."
            value: "{{ $value | humanizePercentage }}"

    # Whole-cluster reachability.
    - name: cluster-health
      rules:
        # Detects a cluster whose kubelet `up` series have gone missing.
        # First branch: clusters that had at least one kubelet sample within
        # the last hour but have none right now. (A plain `count(...) < 1`
        # can never match: count over existing series is always >= 1, and a
        # cluster with no series produces no result at all.)
        # Because of the 1h lookback this branch stops matching once the last
        # sample is older than one hour; widen the window to keep the alert
        # firing longer.
        # Second branch: absent() only covers the hard-coded "dev" cluster,
        # but keeps firing for as long as its series are missing.
        - alert: ClusterMetricsSilent
          expr: |
            (
              count by (cluster_environment) (present_over_time(up{job="kubelet"}[1h]))
              unless
              count by (cluster_environment) (up{job="kubelet"})
            )
            or
            absent(up{job="kubelet", cluster_environment="dev"})
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: "Cluster {{ $labels.cluster_environment }} stopped sending metrics"
            description: "No kubelet metrics received from cluster {{ $labels.cluster_environment }} for over 10 minutes. Either vmagent is dead or the cluster is unreachable."

        # The `=~".+"` matcher restricts the rule to series that carry a
        # non-empty cluster_environment label.
        - alert: ClusterAPIServerDown
          expr: |
            up{job="apiserver", cluster_environment=~".+"} == 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "API server down on {{ $labels.cluster_environment }}"
            description: "Kubernetes API server scrape is failing on cluster {{ $labels.cluster_environment }}."