---
# Alerting rules for the backup CronJobs "forgejo-s3-backup" and
# "secrets-backup" in the "gitea" namespace. The rules cover four conditions:
# a stale schedule, a schedule that never happened, failed Jobs, and Jobs that
# stay active for too long.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: backup-alerts
  namespace: observability
spec:
  groups:
    - name: backup-schedule-staleness
      rules:
        # Fires when more than 26 hours have passed since the CronJob's last
        # schedule time. The alert value is the elapsed time in seconds,
        # which the annotations render with humanizeDuration.
        - alert: BackupCronJobNotScheduled
          expr: |
            time()
              - kube_cronjob_status_last_schedule_time{
                  cronjob=~"forgejo-s3-backup|secrets-backup",
                  namespace="gitea"
                }
              > 26 * 3600
          for: 5m
          labels:
            severity: critical
            cronjob: "{{ $labels.cronjob }}"
          annotations:
            value: "{{ $value | humanizeDuration }}"
            description: >-
              CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not
              been scheduled for over 26 hours in cluster
              {{ $labels.cluster_environment }}. Last schedule was
              {{ $value | humanizeDuration }} ago.
            summary: "Backup CronJob {{ $labels.cronjob }} is stale"

        # Fires when a backup CronJob exists but has never been scheduled.
        # kube-state-metrics only exports
        # kube_cronjob_status_last_schedule_time once status.lastScheduleTime
        # is set, so a never-scheduled CronJob has no such series at all and
        # a bare "== 0" comparison can never match it. The second branch
        # therefore selects CronJobs that have a kube_cronjob_info series but
        # no last-schedule series. The "== 0" branch is kept so that an
        # explicit zero timestamp still alerts.
        - alert: BackupCronJobNeverScheduled
          expr: |
            (
              kube_cronjob_status_last_schedule_time{
                cronjob=~"forgejo-s3-backup|secrets-backup",
                namespace="gitea"
              } == 0
            )
            or on(cluster_environment, namespace, cronjob)
            (
              kube_cronjob_info{
                cronjob=~"forgejo-s3-backup|secrets-backup",
                namespace="gitea"
              }
              unless on(cluster_environment, namespace, cronjob)
              kube_cronjob_status_last_schedule_time{
                cronjob=~"forgejo-s3-backup|secrets-backup",
                namespace="gitea"
              }
            )
          for: 30m
          labels:
            severity: critical
            cronjob: "{{ $labels.cronjob }}"
          annotations:
            description: >-
              CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has never
              been scheduled in cluster {{ $labels.cluster_environment }}.
            summary: "Backup CronJob {{ $labels.cronjob }} never ran"

    - name: backup-job-failures
      rules:
        # Fires when any Job created by the backup CronJobs reports failed
        # pods. max by(...) collapses duplicate series for the same Job into
        # one alert per cluster/namespace/job.
        - alert: BackupJobFailed
          expr: |
            max by(cluster_environment, namespace, job_name) (
              kube_job_status_failed{
                job_name=~"forgejo-s3-backup-.*|secrets-backup-.*",
                namespace="gitea"
              }
            ) > 0
          for: 30s
          labels:
            severity: critical
            job_name: "{{ $labels.job_name }}"
          annotations:
            value: "{{ $value }}"
            description: >-
              Backup job {{ $labels.namespace }}/{{ $labels.job_name }} has
              {{ $value }} failed pod(s) in cluster
              {{ $labels.cluster_environment }}.
            summary: "Backup job {{ $labels.job_name }} failed"

    - name: backup-job-duration
      rules:
        # Fires when a backup Job started more than 300 seconds ago and still
        # has active pods. The alert value is the left-hand side of "and",
        # i.e. the running time in seconds.
        - alert: BackupJobTooSlow
          expr: |
            (
              time()
                - kube_job_status_start_time{
                    job_name=~"forgejo-s3-backup-.*|secrets-backup-.*",
                    namespace="gitea"
                  }
            ) > 300
            and
            kube_job_status_active{
              job_name=~"forgejo-s3-backup-.*|secrets-backup-.*",
              namespace="gitea"
            } > 0
          for: 1m
          labels:
            severity: major
            job_name: "{{ $labels.job_name }}"
          annotations:
            value: "{{ $value | humanizeDuration }}"
            description: >-
              Backup job {{ $labels.namespace }}/{{ $labels.job_name }} has
              been running for {{ $value | humanizeDuration }} (threshold: 5m)
              in cluster {{ $labels.cluster_environment }}. This may indicate
              a hung process or connectivity issue.
            summary: "Backup job {{ $labels.job_name }} running too long"