feat(observability): comprehensive platform alert rules

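Replace ad-hoc forgejo/disk alerts with a structured VMRule (edp-platform-alerts); note that the former forgejo-s3-backup failure alert is dropped without a replacement. The new groups cover: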
- platform-health: ForgejoDown, IngressHighErrorRate, NodeNotReady, PodCrashLooping
- storage: PVCUsageHigh (>80%), PVCUsageCritical (>90%)
- resources: NodeCPUHigh (>85%), NodeMemoryHigh (>90%)
This commit is contained in:
Daniel Sy 2026-06-19 16:43:21 +02:00
parent 70939149ea
commit 3141b7bd6c
Signed by untrusted user: danielsy
GPG key ID: 1F39A8BBCD2EE3D3

View file

@@ -1,40 +1,95 @@
apiVersion: operator.victoriametrics.com/v1beta1 apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule kind: VMRule
metadata: metadata:
name: forgejo-alerts name: edp-platform-alerts
namespace: observability namespace: observability
spec: spec:
groups: groups:
- name: forgejo - name: platform-health
rules: rules:
- alert: forgejo down - alert: ForgejoDown
expr: sum by(cluster_environment) (up{pod=~"forgejo-server-.*"}) < 1 expr: sum by(cluster_environment) (up{job="forgejo-server-http"}) < 1
for: 30s for: 1m
labels: labels:
severity: critical severity: critical
job: "{{ $labels.job }}"
annotations: annotations:
value: "{{ $value }}" summary: "Forgejo is down on {{ $labels.cluster_environment }}"
description: 'forgejo is down in cluster environment {{ $labels.cluster_environment }}' description: "Forgejo server has been unreachable for more than 1 minute in cluster {{ $labels.cluster_environment }}."
- name: forgejo-backup
rules: - alert: IngressHighErrorRate
- alert: forgejo s3 backup job failed expr: |
expr: max by(cluster_environment) (kube_job_status_failed{job_name=~"forgejo-s3-backup-.*"}) != 0 sum by(cluster_environment) (rate(nginx_ingress_controller_requests{status=~"5.."}[5m]))
for: 30s / sum by(cluster_environment) (rate(nginx_ingress_controller_requests[5m])) > 0.05
labels: for: 5m
severity: critical
job: "{{ $labels.job }}"
annotations:
value: "{{ $value }}"
description: 'forgejo s3 backup job failed in cluster environment {{ $labels.cluster_environment }}'
- name: disk-consumption-high
rules:
- alert: disk consumption high
expr: 1-(kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.6
for: 30s
labels: labels:
severity: major severity: major
job: "{{ $labels.job }}"
annotations: annotations:
value: "{{ $value }}" summary: "High ingress 5xx rate on {{ $labels.cluster_environment }}"
description: 'disk consumption of pvc {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is high in cluster environment {{ $labels.cluster_environment }}' description: "More than 5% of ingress requests are returning 5xx errors for over 5 minutes."
value: "{{ $value | humanizePercentage }}"
# Fires when the kube_node_status_condition series for condition="Ready",
# status="true" has been 0 for 5 minutes, i.e. the node is not reporting Ready.
- alert: NodeNotReady
  expr: 'kube_node_status_condition{condition="Ready", status="true"} == 0'
  for: 5m
  labels:
    severity: critical
  annotations:
    description: 'Node {{ $labels.node }} has been in NotReady state for more than 5 minutes.'
    summary: 'Node {{ $labels.node }} not ready on {{ $labels.cluster_environment }}'
# Fires when a container's restart counter has grown by more than 3 over the
# trailing 15 minutes, and that condition has held for 5 minutes.
- alert: PodCrashLooping
  # increase() expresses "restarts within 15m" directly, instead of scaling a
  # per-second rate() back up by the window length (* 60 * 15).
  expr: increase(kube_pod_container_status_restarts_total[15m]) > 3
  for: 5m
  labels:
    severity: major
  annotations:
    summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} crash-looping on {{ $labels.cluster_environment }}"
    # The restart counter is tracked per container, so name the container.
    # NOTE(review): assumes the series carries a `container` label — confirm.
    description: "Container {{ $labels.container }} of pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted more than 3 times in the last 15 minutes."
    # Observed restart count, exposed like the `value` annotation on sibling rules.
    value: "{{ $value | humanize }}"
# Storage group: PVC fill-level alerts computed from the kubelet volume stats.
# Used fraction = 1 - available_bytes / capacity_bytes.
# NOTE(review): a PVC above 90% satisfies both thresholds, so PVCUsageHigh and
# PVCUsageCritical fire together — confirm this is intended or inhibited downstream.
- name: storage
  rules:
    # Warning tier: more than 80% used for 5 minutes.
    - alert: PVCUsageHigh
      expr: |
        1 - (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.80
      for: 5m
      labels:
        severity: major
      annotations:
        value: '{{ $value | humanizePercentage }}'
        summary: 'PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} usage >80%'
        description: 'PVC usage is at {{ $value | humanizePercentage }} on {{ $labels.cluster_environment }}.'
    # Critical tier: more than 90% used for 5 minutes.
    - alert: PVCUsageCritical
      expr: |
        1 - (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.90
      for: 5m
      labels:
        severity: critical
      annotations:
        value: '{{ $value | humanizePercentage }}'
        summary: 'PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} usage >90%'
        description: 'PVC is almost full at {{ $value | humanizePercentage }} on {{ $labels.cluster_environment }}. Immediate action required.'
# Resources group: node-level CPU and memory saturation alerts.
- name: resources
  rules:
    # Busy CPU fraction = 1 - average idle-mode CPU rate, grouped per
    # instance and cluster_environment; must exceed 85% for 15 minutes.
    - alert: NodeCPUHigh
      expr: |
        1 - avg by(instance, cluster_environment) (rate(node_cpu_seconds_total{mode="idle"}[5m])) > 0.85
      for: 15m
      labels:
        severity: major
      annotations:
        summary: "Node {{ $labels.instance }} CPU >85% on {{ $labels.cluster_environment }}"
        description: "Node CPU utilization has been above 85% for 15 minutes."
        value: "{{ $value | humanizePercentage }}"
    # Used memory fraction = 1 - MemAvailable / MemTotal; must exceed 90%
    # for 10 minutes. The expression is not aggregated, so it yields one
    # result per source series.
    - alert: NodeMemoryHigh
      expr: |
        1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.90
      for: 10m
      labels:
        severity: major
      annotations:
        # Name the affected node, matching NodeCPUHigh; without it the alert
        # does not say which node is short on memory.
        # NOTE(review): assumes node_memory_* series carry the same `instance`
        # label that NodeCPUHigh groups by — confirm.
        summary: "Node {{ $labels.instance }} memory >90% on {{ $labels.cluster_environment }}"
        description: "Node memory utilization above 90% for 10 minutes."
        value: "{{ $value | humanizePercentage }}"