Automated upload for dev.t09.de
This commit is contained in:
parent
9bbcf4efca
commit
95deeef6a0
7 changed files with 86 additions and 27 deletions
|
|
@ -0,0 +1,78 @@
|
|||
---
# VMRule defining alerts for the backup CronJobs in the "gitea" namespace
# (forgejo-s3-backup and secrets-backup): schedule staleness, job failures
# and overly long job runtimes.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: backup-alerts
  namespace: observability
spec:
  groups:
    # Alerts about the CronJobs not being scheduled at all / recently.
    - name: backup-schedule-staleness
      rules:
        # Fires when the last schedule time is more than 26 hours
        # (26 * 3600 seconds) in the past, sustained for 5 minutes.
        - alert: BackupCronJobNotScheduled
          expr: |
            time() - kube_cronjob_status_last_schedule_time{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"}
            > 26 * 3600
          for: 5m
          labels:
            severity: critical
            cronjob: "{{ $labels.cronjob }}"
          annotations:
            value: "{{ $value | humanizeDuration }}"
            description: >-
              CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been
              scheduled for over 26 hours in cluster {{ $labels.cluster_environment }}.
              Last schedule was {{ $value | humanizeDuration }} ago.
            summary: "Backup CronJob {{ $labels.cronjob }} is stale"

        # Fires when a backup CronJob exists but has no last-schedule-time
        # series at all, sustained for 30 minutes.
        #
        # kube-state-metrics only exports
        # kube_cronjob_status_last_schedule_time once the CronJob has been
        # scheduled at least once; before that the series is absent rather
        # than 0, so a "== 0" comparison can never match. Absence is detected
        # instead by taking kube_cronjob_info (present for every CronJob) and
        # removing the CronJobs that do have a last schedule time.
        - alert: BackupCronJobNeverScheduled
          expr: |
            kube_cronjob_info{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"}
            unless on(cluster_environment, namespace, cronjob)
            kube_cronjob_status_last_schedule_time{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"}
          for: 30m
          labels:
            severity: critical
            cronjob: "{{ $labels.cronjob }}"
          annotations:
            description: >-
              CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has never been
              scheduled in cluster {{ $labels.cluster_environment }}.
            summary: "Backup CronJob {{ $labels.cronjob }} never ran"

    # Alerts about individual backup Jobs reporting failed pods.
    - name: backup-job-failures
      rules:
        # Fires when any matching Job reports a failed-pod count above zero
        # for 30 seconds; $value is that count.
        - alert: BackupJobFailed
          expr: |
            max by(cluster_environment, namespace, job_name) (
              kube_job_status_failed{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"}
            ) > 0
          for: 30s
          labels:
            severity: critical
            job_name: "{{ $labels.job_name }}"
          annotations:
            value: "{{ $value }}"
            description: >-
              Backup job {{ $labels.namespace }}/{{ $labels.job_name }} has
              {{ $value }} failed pod(s) in cluster {{ $labels.cluster_environment }}.
            summary: "Backup job {{ $labels.job_name }} failed"

    # Alerts about backup Jobs that are still active long after starting.
    - name: backup-job-duration
      rules:
        # Fires when a matching Job started more than 300 seconds ago and
        # still has active pods, sustained for 1 minute; $value is the
        # elapsed time since the Job's start time.
        - alert: BackupJobTooSlow
          expr: |
            (
              time() - kube_job_status_start_time{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"}
            ) > 300
            and
            kube_job_status_active{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"} > 0
          for: 1m
          labels:
            severity: major
            job_name: "{{ $labels.job_name }}"
          annotations:
            value: "{{ $value | humanizeDuration }}"
            description: >-
              Backup job {{ $labels.namespace }}/{{ $labels.job_name }} has been
              running for {{ $value | humanizeDuration }} (threshold: 5m)
              in cluster {{ $labels.cluster_environment }}. This may indicate a
              hung process or connectivity issue.
            summary: "Backup job {{ $labels.job_name }} running too long"
|
||||
Loading…
Add table
Add a link
Reference in a new issue