feat(observability): 📊 add backup failure alerting rules

VMRule alerts for forgejo-s3-backup and secrets-backup CronJobs:
- BackupCronJobNotScheduled (>26h since last run)
- BackupCronJobNeverScheduled (never ran)
- BackupJobFailed (job failed)
- BackupJobTooSlow (running >5min)

Ref: IPCEICIS-9313
Ref: IPCEICIS-2810
This commit is contained in:
Daniel Sy 2026-06-08 15:07:06 +02:00
parent b087dac0f1
commit 053acd7596
Signed by untrusted user: danielsy
GPG key ID: 1F39A8BBCD2EE3D3

View file

@ -0,0 +1,78 @@
---
# Alerting rules for the forgejo-s3-backup and secrets-backup CronJobs in the
# "gitea" namespace. All expressions read kube-state-metrics series
# (kube_cronjob_* / kube_job_*).
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: backup-alerts
  namespace: observability
spec:
  groups:
    # Did the CronJob controller create a Job recently (or ever)?
    - name: backup-schedule-staleness
      rules:
        # Fires when the last schedule timestamp is older than 26 hours
        # (26 * 3600 seconds) for at least 5 minutes. $value is the age of
        # the last schedule in seconds.
        - alert: BackupCronJobNotScheduled
          expr: |
            time() - kube_cronjob_status_last_schedule_time{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"}
            > 26 * 3600
          for: 5m
          labels:
            severity: critical
            cronjob: "{{ $labels.cronjob }}"
          annotations:
            value: "{{ $value | humanizeDuration }}"
            description: >-
              CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been
              scheduled for over 26 hours in cluster {{ $labels.cluster_environment }}.
              Last schedule was {{ $value | humanizeDuration }} ago.
            summary: "Backup CronJob {{ $labels.cronjob }} is stale"
        # Fires when the CronJob object exists but has no positive
        # last-schedule timestamp. kube-state-metrics only exports
        # kube_cronjob_status_last_schedule_time once
        # .status.lastScheduleTime is set, so a never-scheduled CronJob has
        # no such series at all; comparing that metric with 0 would never
        # match. Starting from kube_cronjob_info and removing every CronJob
        # that has a timestamp > 0 covers both the "series absent" and the
        # "series is 0" case.
        # NOTE(review): a freshly created CronJob also matches until its
        # first run; confirm 30m is an acceptable grace period for that.
        - alert: BackupCronJobNeverScheduled
          expr: |
            kube_cronjob_info{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"}
            unless on (cluster_environment, namespace, cronjob)
            (
              kube_cronjob_status_last_schedule_time{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"}
              > 0
            )
          for: 30m
          labels:
            severity: critical
            cronjob: "{{ $labels.cronjob }}"
          annotations:
            description: >-
              CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has never been
              scheduled in cluster {{ $labels.cluster_environment }}.
            summary: "Backup CronJob {{ $labels.cronjob }} never ran"
    # Did a Job created by one of the backup CronJobs report failed pods?
    - name: backup-job-failures
      rules:
        # Aggregates to one series per cluster/namespace/job and fires when
        # the failed-pod count is above zero for 30 seconds. $value is that
        # count.
        - alert: BackupJobFailed
          expr: |
            max by(cluster_environment, namespace, job_name) (
              kube_job_status_failed{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"}
            ) > 0
          for: 30s
          labels:
            severity: critical
            job_name: "{{ $labels.job_name }}"
          annotations:
            value: "{{ $value }}"
            description: >-
              Backup job {{ $labels.namespace }}/{{ $labels.job_name }} has
              {{ $value }} failed pod(s) in cluster {{ $labels.cluster_environment }}.
            summary: "Backup job {{ $labels.job_name }} failed"
    # Is a backup Job still running long after it started?
    - name: backup-job-duration
      rules:
        # Fires when a Job started more than 300 seconds ago AND still has
        # active pods, sustained for 1 minute. The `and` keeps the left-hand
        # value, so $value is the elapsed run time in seconds.
        - alert: BackupJobTooSlow
          expr: |
            (
              time() - kube_job_status_start_time{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"}
            ) > 300
            and
            kube_job_status_active{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"} > 0
          for: 1m
          labels:
            severity: major
            job_name: "{{ $labels.job_name }}"
          annotations:
            value: "{{ $value | humanizeDuration }}"
            description: >-
              Backup job {{ $labels.namespace }}/{{ $labels.job_name }} has been
              running for {{ $value | humanizeDuration }} (threshold: 5m)
              in cluster {{ $labels.cluster_environment }}. This may indicate a
              hung process or connectivity issue.
            summary: "Backup job {{ $labels.job_name }} running too long"