diff --git a/template/stacks/observability/victoria-k8s-stack/manifests/backup-alerts.yaml b/template/stacks/observability/victoria-k8s-stack/manifests/backup-alerts.yaml
new file mode 100644
index 0000000..259a2bf
--- /dev/null
+++ b/template/stacks/observability/victoria-k8s-stack/manifests/backup-alerts.yaml
@@ -0,0 +1,93 @@
+# Alerting rules for the forgejo-s3-backup and secrets-backup CronJobs in the
+# "gitea" namespace, built on kube-state-metrics CronJob/Job series.
+apiVersion: operator.victoriametrics.com/v1beta1
+kind: VMRule
+metadata:
+  name: backup-alerts
+  namespace: observability
+spec:
+  groups:
+    # CronJob-level checks: are the backup CronJobs still being scheduled?
+    - name: backup-schedule-staleness
+      rules:
+        # Fires when the last recorded schedule time is more than 26 hours old.
+        - alert: BackupCronJobNotScheduled
+          expr: |
+            time() - kube_cronjob_status_last_schedule_time{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"}
+              > 26 * 3600
+          for: 5m
+          labels:
+            severity: critical
+            cronjob: "{{ $labels.cronjob }}"
+          annotations:
+            value: "{{ $value | humanizeDuration }}"
+            description: >-
+              CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been
+              scheduled for over 26 hours in cluster {{ $labels.cluster_environment }}.
+              Last schedule was {{ $value | humanizeDuration }} ago.
+            summary: "Backup CronJob {{ $labels.cronjob }} is stale"
+
+        # Fires when the CronJob object exists but has no last-schedule series.
+        # kube-state-metrics only exports kube_cronjob_status_last_schedule_time
+        # once .status.lastScheduleTime is set, so the series is absent (not 0)
+        # for a CronJob that never ran; "unless" detects the missing series.
+        - alert: BackupCronJobNeverScheduled
+          expr: |
+            kube_cronjob_created{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"}
+              unless on(cluster_environment, namespace, cronjob)
+            kube_cronjob_status_last_schedule_time{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"}
+          for: 30m
+          labels:
+            severity: critical
+            cronjob: "{{ $labels.cronjob }}"
+          annotations:
+            description: >-
+              CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has never been
+              scheduled in cluster {{ $labels.cluster_environment }}.
+            summary: "Backup CronJob {{ $labels.cronjob }} never ran"
+
+    # Job-level check: did a Job created by one of the backup CronJobs fail?
+    - name: backup-job-failures
+      rules:
+        # Fires when a matching Job reports at least one failed pod.
+        # max by() collapses duplicate series to one per cluster/namespace/job.
+        - alert: BackupJobFailed
+          expr: |
+            max by(cluster_environment, namespace, job_name) (
+              kube_job_status_failed{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"}
+            ) > 0
+          for: 30s
+          labels:
+            severity: critical
+            job_name: "{{ $labels.job_name }}"
+          annotations:
+            value: "{{ $value }}"
+            description: >-
+              Backup job {{ $labels.namespace }}/{{ $labels.job_name }} has
+              {{ $value }} failed pod(s) in cluster {{ $labels.cluster_environment }}.
+            summary: "Backup job {{ $labels.job_name }} failed"
+
+    # Job-level check: is a backup Job running longer than expected?
+    - name: backup-job-duration
+      rules:
+        # Fires when a Job started more than 300 seconds ago and still has
+        # active pods; $value is the elapsed time since the Job's start time.
+        - alert: BackupJobTooSlow
+          expr: |
+            (
+              time() - kube_job_status_start_time{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"}
+            ) > 300
+            and
+            kube_job_status_active{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"} > 0
+          for: 1m
+          labels:
+            severity: major
+            job_name: "{{ $labels.job_name }}"
+          annotations:
+            value: "{{ $value | humanizeDuration }}"
+            description: >-
+              Backup job {{ $labels.namespace }}/{{ $labels.job_name }} has been
+              running for {{ $value | humanizeDuration }} (threshold: 5m)
+              in cluster {{ $labels.cluster_environment }}. This may indicate a
+              hung process or connectivity issue.
+            summary: "Backup job {{ $labels.job_name }} running too long"