---
# VMRule with the platform alerts, organised in three groups:
#   - platform-health: Forgejo availability, ingress 5xx ratio, node readiness,
#     pod restart loops
#   - storage: PVC usage at two thresholds (80% and 90%)
#   - resources: node CPU and memory utilisation
# Annotation values are Go templates that reference the labels and value of
# the firing series.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: edp-platform-alerts
  namespace: observability
spec:
  groups:
    - name: platform-health
      rules:
        # Fires when the number of healthy Forgejo scrape targets, summed per
        # cluster_environment, drops below 1.
        # NOTE(review): if no `up` series exists for this job at all, the
        # expression returns an empty result and the alert cannot fire.
        # Consider a companion absent()-based rule; confirm desired behaviour.
        - alert: ForgejoDown
          expr: sum by(cluster_environment) (up{job="forgejo-server-http"}) < 1
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "Forgejo is down on {{ $labels.cluster_environment }}"
            description: "Forgejo server has been unreachable for more than 1 minute in cluster {{ $labels.cluster_environment }}."

        # Ratio of 5xx request rate to total request rate per
        # cluster_environment, over a 5-minute window; threshold is 5%.
        - alert: IngressHighErrorRate
          expr: |
            sum by(cluster_environment) (rate(nginx_ingress_controller_requests{status=~"5.."}[5m]))
            /
            sum by(cluster_environment) (rate(nginx_ingress_controller_requests[5m]))
            > 0.05
          for: 5m
          labels:
            severity: major
          annotations:
            summary: "High ingress 5xx rate on {{ $labels.cluster_environment }}"
            description: "More than 5% of ingress requests are returning 5xx errors for over 5 minutes."
            value: "{{ $value | humanizePercentage }}"

        # Selects the Ready/status="true" series and fires while its value is 0.
        - alert: NodeNotReady
          expr: kube_node_status_condition{condition="Ready", status="true"} == 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Node {{ $labels.node }} not ready on {{ $labels.cluster_environment }}"
            description: "Node {{ $labels.node }} has been in NotReady state for more than 5 minutes."

        # Per-second restart rate over 15m scaled by the window length in
        # seconds (60 * 15), i.e. an estimate of restarts within the window.
        - alert: PodCrashLooping
          expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 15 > 3
          for: 5m
          labels:
            severity: major
          annotations:
            summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} crash-looping on {{ $labels.cluster_environment }}"
            description: "Pod has restarted more than 3 times in the last 15 minutes."

    - name: storage
      rules:
        # Used fraction = 1 - available / capacity; warning threshold at 80%.
        # NOTE(review): any volume above 90% satisfies this rule as well as
        # PVCUsageCritical, so both alerts fire unless the alert router
        # inhibits the lower severity; verify that is intended.
        - alert: PVCUsageHigh
          expr: |
            1 - (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.80
          for: 5m
          labels:
            severity: major
          annotations:
            summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} usage >80%"
            description: "PVC usage is at {{ $value | humanizePercentage }} on {{ $labels.cluster_environment }}."
            value: "{{ $value | humanizePercentage }}"

        # Same used-fraction expression as PVCUsageHigh; critical threshold at 90%.
        - alert: PVCUsageCritical
          expr: |
            1 - (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.90
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} usage >90%"
            description: "PVC is almost full at {{ $value | humanizePercentage }} on {{ $labels.cluster_environment }}. Immediate action required."
            value: "{{ $value | humanizePercentage }}"

    - name: resources
      rules:
        # Busy fraction = 1 - average idle rate, averaged over all idle-mode
        # series of an instance (presumably one per CPU core).
        - alert: NodeCPUHigh
          expr: |
            1 - avg by(instance, cluster_environment) (rate(node_cpu_seconds_total{mode="idle"}[5m])) > 0.85
          for: 15m
          labels:
            severity: major
          annotations:
            summary: "Node {{ $labels.instance }} CPU >85% on {{ $labels.cluster_environment }}"
            description: "Node CPU utilization has been above 85% for 15 minutes."
            value: "{{ $value | humanizePercentage }}"

        # Used fraction = 1 - MemAvailable / MemTotal. The expression is not
        # aggregated, so one alert fires per node; the summary therefore names
        # the instance, matching NodeCPUHigh.
        - alert: NodeMemoryHigh
          expr: |
            1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.90
          for: 10m
          labels:
            severity: major
          annotations:
            summary: "Node {{ $labels.instance }} memory >90% on {{ $labels.cluster_environment }}"
            description: "Node memory utilization above 90% for 10 minutes."
            value: "{{ $value | humanizePercentage }}"