diff --git a/otc/dev.t09.de/stacks/ci-sizer/sizer-receiver/deployment.yaml b/otc/dev.t09.de/stacks/ci-sizer/sizer-receiver/deployment.yaml index 6870fb3..3cbfb4c 100644 --- a/otc/dev.t09.de/stacks/ci-sizer/sizer-receiver/deployment.yaml +++ b/otc/dev.t09.de/stacks/ci-sizer/sizer-receiver/deployment.yaml @@ -62,7 +62,7 @@ spec: - name: RECEIVER_SESSION_TTL value: "12h" - name: RECEIVER_ALLOWED_ORG - value: "giteaAdmin" + value: "DevFW-CICD" - name: RECEIVER_CPU_SIZING_MODE value: "observe" - name: RECEIVER_MEMORY_QOS diff --git a/otc/dev.t09.de/stacks/ci-sizer/sizer-receiver/secret.yaml b/otc/dev.t09.de/stacks/ci-sizer/sizer-receiver/secret.yaml deleted file mode 100644 index ac8a37c..0000000 --- a/otc/dev.t09.de/stacks/ci-sizer/sizer-receiver/secret.yaml +++ /dev/null @@ -1,9 +0,0 @@ -apiVersion: v1 -kind: Secret -metadata: - name: sizer-oidc-client - labels: - app: sizer-receiver -type: Opaque -stringData: - client-secret: "73eda9068bd00dfe67d29f087b5540cb1cd82cc1dd2ac0f838558ac8bbcfcb3a" diff --git a/otc/dev.t09.de/stacks/core/dex.yaml b/otc/dev.t09.de/stacks/core/dex.yaml index b67aa7d..5da98f5 100644 --- a/otc/dev.t09.de/stacks/core/dex.yaml +++ b/otc/dev.t09.de/stacks/core/dex.yaml @@ -27,6 +27,3 @@ spec: - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances targetRevision: HEAD ref: values - - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances - targetRevision: HEAD - path: "otc/dev.t09.de/stacks/core/dex/manifests" diff --git a/otc/dev.t09.de/stacks/core/dex/manifests/dex-sizer-client.yaml b/otc/dev.t09.de/stacks/core/dex/manifests/dex-sizer-client.yaml deleted file mode 100644 index 884df64..0000000 --- a/otc/dev.t09.de/stacks/core/dex/manifests/dex-sizer-client.yaml +++ /dev/null @@ -1,8 +0,0 @@ -apiVersion: v1 -kind: Secret -metadata: - name: dex-sizer-client - namespace: dex -type: Opaque -stringData: - clientSecret: "73eda9068bd00dfe67d29f087b5540cb1cd82cc1dd2ac0f838558ac8bbcfcb3a" diff --git 
a/otc/dev.t09.de/stacks/forgejo/forgejo-server/values.yaml b/otc/dev.t09.de/stacks/forgejo/forgejo-server/values.yaml index f2e3fb2..a8a173e 100644 --- a/otc/dev.t09.de/stacks/forgejo/forgejo-server/values.yaml +++ b/otc/dev.t09.de/stacks/forgejo/forgejo-server/values.yaml @@ -1,3 +1,4 @@ + # We use recreate to make sure only one instance with one version is running, because Forgejo might break or data gets inconsistant. strategy: type: Recreate @@ -173,8 +174,10 @@ service: image: pullPolicy: "IfNotPresent" - # DB has v15a/v15b migrations from workflow-webhook build. - # Using that image until a proper v15+ EDP release is cut. - fullOverride: edp.buildth.ing/devfw-cicd/edp-forgejo:workflow-webhook-20260305 + # Overrides the image tag whose default is the chart appVersion. + #tag: "8.0.3" + # Adds -rootless suffix to image name + # rootless: true + fullOverride: edp.buildth.ing/devfw-cicd/edp-forgejo:14.0.2-edp1-rootless forgejo: {} diff --git a/otc/dev.t09.de/stacks/garm/garm/values.yaml b/otc/dev.t09.de/stacks/garm/garm/values.yaml index bce1f62..827e495 100644 --- a/otc/dev.t09.de/stacks/garm/garm/values.yaml +++ b/otc/dev.t09.de/stacks/garm/garm/values.yaml @@ -26,9 +26,7 @@ credentials: image: repository: edp.buildth.ing/devfw-cicd/garm-forgejo - # NOTE: v0.1.7-forgejo-23 has exec format error (wrong arch build) - # Rolled back to -22 until -23 is rebuilt for amd64 - tag: v0.1.7-forgejo-22 + tag: v0.1.7-forgejo-23 providerConfig: edgeConnect: diff --git a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/backup-alerts.yaml b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/backup-alerts.yaml new file mode 100644 index 0000000..259a2bf --- /dev/null +++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/backup-alerts.yaml @@ -0,0 +1,78 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + name: backup-alerts + namespace: observability +spec: + groups: + - name: backup-schedule-staleness + 
rules: + - alert: BackupCronJobNotScheduled + expr: | + time() - kube_cronjob_status_last_schedule_time{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"} + > 26 * 3600 + for: 5m + labels: + severity: critical + cronjob: "{{ $labels.cronjob }}" + annotations: + value: "{{ $value | humanizeDuration }}" + description: >- + CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been + scheduled for over 26 hours in cluster {{ $labels.cluster_environment }}. + Last schedule was {{ $value | humanizeDuration }} ago. + summary: "Backup CronJob {{ $labels.cronjob }} is stale" + + - alert: BackupCronJobNeverScheduled + expr: | + kube_cronjob_created{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"} + unless on(cluster_environment, namespace, cronjob) kube_cronjob_status_last_schedule_time{namespace="gitea"} + for: 30m + labels: + severity: critical + cronjob: "{{ $labels.cronjob }}" + annotations: + description: >- + CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has never been + scheduled in cluster {{ $labels.cluster_environment }}. + summary: "Backup CronJob {{ $labels.cronjob }} never ran" + + - name: backup-job-failures + rules: + - alert: BackupJobFailed + expr: | + max by(cluster_environment, namespace, job_name) ( + kube_job_status_failed{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"} + ) > 0 + for: 30s + labels: + severity: critical + job_name: "{{ $labels.job_name }}" + annotations: + value: "{{ $value }}" + description: >- + Backup job {{ $labels.namespace }}/{{ $labels.job_name }} has + {{ $value }} failed pod(s) in cluster {{ $labels.cluster_environment }}. 
+ summary: "Backup job {{ $labels.job_name }} failed" + + - name: backup-job-duration + rules: + - alert: BackupJobTooSlow + expr: | + ( + time() - kube_job_status_start_time{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"} + ) > 300 + and + kube_job_status_active{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"} > 0 + for: 1m + labels: + severity: major + job_name: "{{ $labels.job_name }}" + annotations: + value: "{{ $value | humanizeDuration }}" + description: >- + Backup job {{ $labels.namespace }}/{{ $labels.job_name }} has been + running for {{ $value | humanizeDuration }} (threshold: 5m) + in cluster {{ $labels.cluster_environment }}. This may indicate a + hung process or connectivity issue. + summary: "Backup job {{ $labels.job_name }} running too long" # BackupJobTooSlow: matches backup jobs whose start time is more than 300s in the past while kube_job_status_active is still > 0, held for 1m