From a52a6691a8fb36155d6d01ed72dd0fc9865528ac Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Fri, 19 Jun 2026 09:51:42 +0200 Subject: [PATCH 01/29] =?UTF-8?q?fix(observability):=20=F0=9F=90=9B=20add?= =?UTF-8?q?=20prune=20+=20RespectIgnoreDifferences=20to=20o12y=20syncPolic?= =?UTF-8?q?y?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix CRD bootstrap deadlock on victoria-metrics-k8s-stack ArgoCD app. Adds prune: true and RespectIgnoreDifferences=true to prevent sync failures when CRs are applied before CRDs are established. --- otc/dev.t09.de/stacks/observability/victoria-k8s-stack.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack.yaml b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack.yaml index 3011a2f..0ff2853 100644 --- a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack.yaml +++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack.yaml @@ -9,10 +9,12 @@ spec: project: default syncPolicy: automated: + prune: true selfHeal: true syncOptions: - CreateNamespace=true - ServerSideApply=true + - RespectIgnoreDifferences=true destination: name: in-cluster namespace: observability From 29c0a59734cbba01b85413e9f6772a6eaec929a1 Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Fri, 19 Jun 2026 09:56:19 +0200 Subject: [PATCH 02/29] =?UTF-8?q?fix(observability):=20=F0=9F=90=9B=20add?= =?UTF-8?q?=20SkipDryRunOnMissingResource=20to=20o12y=20syncOptions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VLSingle CRD missing at sync time — ArgoCD pre-validates all resources before applying any, causing 'synchronization tasks not valid' on CRs whose CRDs are created by the operator in the same sync wave. SkipDryRunOnMissingResource=true bypasses dry-run for missing CRDs, unblocking the CRD bootstrap deadlock. 
--- otc/dev.t09.de/stacks/observability/victoria-k8s-stack.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack.yaml b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack.yaml index 0ff2853..d7599b9 100644 --- a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack.yaml +++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack.yaml @@ -15,6 +15,7 @@ spec: - CreateNamespace=true - ServerSideApply=true - RespectIgnoreDifferences=true + - SkipDryRunOnMissingResource=true destination: name: in-cluster namespace: observability From ef4a1d7ce23d12d51fbee92697318df30d32b403 Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Fri, 19 Jun 2026 09:58:50 +0200 Subject: [PATCH 03/29] =?UTF-8?q?fix(observability):=20=F0=9F=90=9B=20disa?= =?UTF-8?q?ble=20crds.cleanup=20hook=20in=20victoria-metrics-operator?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-upgrade cleanup hook uses bitnami/kubectl and spawns on every ArgoCD sync. Dev cluster nodes are at 99% CPU / pod limit — hook pod cannot be scheduled, blocking the entire sync indefinitely. Disabling cleanup.enabled prevents the hook Job from being created. CRD cleanup is safe to skip on a fresh bootstrap where no old CRDs exist. 
--- .../stacks/observability/victoria-k8s-stack/values.yaml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/values.yaml b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/values.yaml index d407910..ca0b671 100644 --- a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/values.yaml +++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/values.yaml @@ -28,10 +28,7 @@ victoria-metrics-operator: crds: plain: true cleanup: - enabled: true - image: - repository: bitnami/kubectl - pullPolicy: IfNotPresent + enabled: false # disabled: cleanup hook can't schedule on resource-constrained nodes (Insufficient cpu / Too many pods) serviceMonitor: enabled: true operator: From d83945413d5eaac583d2aee0ec5f061f0cddf257 Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Fri, 19 Jun 2026 10:20:13 +0200 Subject: [PATCH 04/29] =?UTF-8?q?fix(observability):=20=F0=9F=90=9B=20chan?= =?UTF-8?q?ge=20VLSingle=20=E2=86=92=20VLogs=20in=20victorialogs=20manifes?= =?UTF-8?q?t?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Chart 0.48.1 / operator v0.58.0 uses VLogs CRD for VictoriaLogs, not VLSingle. The VLSingle kind was introduced in a newer operator version and is not registered in this chart release. Changing to VLogs which has identical spec fields (retentionPeriod, removePvcAfterDelete, storage, storageMetadata, resources all supported). 
--- .../observability/victoria-k8s-stack/manifests/vlogs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vlogs.yaml b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vlogs.yaml index 8657ac8..72e13d1 100644 --- a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vlogs.yaml +++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vlogs.yaml @@ -1,5 +1,5 @@ apiVersion: operator.victoriametrics.com/v1beta1 -kind: VLSingle +kind: VLogs metadata: name: victorialogs namespace: observability From 369961a940b84298a8772fda64e0ba2f6868c76d Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Fri, 19 Jun 2026 10:44:25 +0200 Subject: [PATCH 05/29] =?UTF-8?q?fix(observability):=20=F0=9F=90=9B=20enab?= =?UTF-8?q?le=20vmagent,=20fix=20grafana=20auth,=20disable=20vmauth=20on?= =?UTF-8?q?=20dev?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Enable VMAgent (was disabled → no metrics scraped) - Remove disable_login from Grafana config; add security block so operator can auth via API - Disable VMAuth (invalid trailing-dot hostname o12y.observability.; not needed on dev) --- .../observability/grafana-operator/manifests/grafana.yaml | 4 +++- .../stacks/observability/victoria-k8s-stack/values.yaml | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/otc/dev.t09.de/stacks/observability/grafana-operator/manifests/grafana.yaml b/otc/dev.t09.de/stacks/observability/grafana-operator/manifests/grafana.yaml index 1e8b038..17d6046 100644 --- a/otc/dev.t09.de/stacks/observability/grafana-operator/manifests/grafana.yaml +++ b/otc/dev.t09.de/stacks/observability/grafana-operator/manifests/grafana.yaml @@ -35,8 +35,10 @@ spec: server: root_url: "https://grafana.dev.t09.de" auth: - disable_login: "true" disable_login_form: "true" + security: + admin_user: admin + admin_password: admin auth.generic_oauth: 
enabled: "true" name: Forgejo diff --git a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/values.yaml b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/values.yaml index ca0b671..9751113 100644 --- a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/values.yaml +++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/values.yaml @@ -673,7 +673,7 @@ vmalert: vmauth: # -- Enable VMAuth CR - enabled: true + enabled: false # -- VMAuth annotations annotations: {} # -- (object) Full spec for VMAuth CRD. Allowed values described [here](https://docs.victoriametrics.com/operator/api#vmauthspec) @@ -696,7 +696,7 @@ vmauth: vmagent: # -- Create VMAgent CR - enabled: false + enabled: true # -- VMAgent annotations annotations: {} # -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec) From 59eed97263992d1c705a3d9e72251199794eba47 Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Fri, 19 Jun 2026 11:41:20 +0200 Subject: [PATCH 06/29] =?UTF-8?q?fix(observability-client):=20=F0=9F=90=9B?= =?UTF-8?q?=20fix=20remote=20write=20URL=20and=20add=20missing=20manifests?= =?UTF-8?q?=20dir?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix broken remote write URL: o12y.observability. 
→ o12y.observability.buildth.ing - Create manifests/ directory with .gitkeep for ArgoCD source path --- .../observability-client/vm-client-stack/manifests/.gitkeep | 0 .../stacks/observability-client/vm-client-stack/values.yaml | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/.gitkeep diff --git a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/.gitkeep b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml index f85a786..9224a46 100644 --- a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml +++ b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml @@ -778,7 +778,7 @@ vmagent: # -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec) additionalRemoteWrites: # [] - - url: https://o12y.observability./api/v1/write + - url: https://o12y.observability.buildth.ing/api/v1/write basicAuth: username: name: simple-user-secret From 32e998df5b1b4b937dd72c31e985beebaf9bad85 Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Fri, 19 Jun 2026 12:35:18 +0200 Subject: [PATCH 07/29] =?UTF-8?q?fix(forgejo):=20=E2=8F=B1=EF=B8=8F=20incr?= =?UTF-8?q?ease=20s3-backup=20activeDeadlineSeconds=201350=E2=86=927200?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous 22.5m deadline caused DeadlineExceeded on 2026-06-19 when rclone sync took >22m (vs 13-16s prior days). Likely triggered by significant new data in OBS bucket. 2h window accommodates large incremental syncs while BackupJobTooSlow alert still fires at 5m. 
--- .../forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml | 4 ++-- .../forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml | 4 ++-- .../forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml | 4 ++-- .../forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/otc/benchmark.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml b/otc/benchmark.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml index ed54cb0..12883a9 100644 --- a/otc/benchmark.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml +++ b/otc/benchmark.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml @@ -11,8 +11,8 @@ spec: startingDeadlineSeconds: 600 # 10 minutes jobTemplate: spec: - # 60 min until backup - 10 min start - (backoffLimit * activeDeadlineSeconds) - some time sync buffer - activeDeadlineSeconds: 1350 + # 2h window: handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m + activeDeadlineSeconds: 7200 backoffLimit: 2 ttlSecondsAfterFinished: 259200 # template: diff --git a/otc/dev.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml b/otc/dev.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml index de14801..d313b18 100644 --- a/otc/dev.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml +++ b/otc/dev.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml @@ -11,8 +11,8 @@ spec: startingDeadlineSeconds: 600 # 10 minutes jobTemplate: spec: - # 60 min until backup - 10 min start - (backoffLimit * activeDeadlineSeconds) - some time sync buffer - activeDeadlineSeconds: 1350 + # 2h window: handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m + activeDeadlineSeconds: 7200 backoffLimit: 2 ttlSecondsAfterFinished: 259200 # 
template: diff --git a/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml b/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml index 71f1649..7226bd2 100644 --- a/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml +++ b/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml @@ -11,8 +11,8 @@ spec: startingDeadlineSeconds: 600 # 10 minutes jobTemplate: spec: - # 60 min until backup - 10 min start - (backoffLimit * activeDeadlineSeconds) - some time sync buffer - activeDeadlineSeconds: 1350 + # 2h window: handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m + activeDeadlineSeconds: 7200 backoffLimit: 2 ttlSecondsAfterFinished: 259200 # template: diff --git a/otc/observability.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml b/otc/observability.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml index 842a7cc..a1caaae 100644 --- a/otc/observability.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml +++ b/otc/observability.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml @@ -11,8 +11,8 @@ spec: startingDeadlineSeconds: 600 # 10 minutes jobTemplate: spec: - # 60 min until backup - 10 min start - (backoffLimit * activeDeadlineSeconds) - some time sync buffer - activeDeadlineSeconds: 1350 + # 2h window: handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m + activeDeadlineSeconds: 7200 backoffLimit: 2 ttlSecondsAfterFinished: 259200 # template: From 0316eefa43c725484f31a0d6248f4a3a0175c737 Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Fri, 19 Jun 2026 12:42:04 +0200 Subject: [PATCH 08/29] =?UTF-8?q?fix(observability):=20=F0=9F=90=9B=20disa?= 
=?UTF-8?q?ble=20false-positive=20control-plane=20alerts=20and=20fix=20emp?= =?UTF-8?q?ty=20cluster=5Fenvironment=20label?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hub defaultRules groups kubernetesSystemControllerManager, kubeScheduler, and kubernetesSystemScheduler used wrong key 'enabled: false' — chart expects 'create: false'. This caused KubeControllerManagerDown/KubeSchedulerDown to fire as false positives because OTC CCE managed k8s does not expose control plane for scraping. Dev local vmagent had empty externalLabels, so backup-alert rules evaluated by local vmalert had no cluster_environment label on kube_job_status_failed metrics. Added cluster_environment=dev to match what the vm-client-stack vmagent adds for hub shipping. --- .../stacks/observability/victoria-k8s-stack/values.yaml | 3 ++- .../stacks/observability/victoria-k8s-stack/values.yaml | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/values.yaml b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/values.yaml index 9751113..e7bffbc 100644 --- a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/values.yaml +++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/values.yaml @@ -708,7 +708,8 @@ vmagent: port: "8429" selectAllByDefault: true scrapeInterval: 20s - externalLabels: {} + externalLabels: + cluster_environment: "dev" # For multi-cluster setups it is useful to use "cluster" label to identify the metrics source. 
# For example: # cluster: cluster-name diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/values.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/values.yaml index 5bb9361..c535829 100644 --- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/values.yaml +++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/values.yaml @@ -201,13 +201,13 @@ defaultRules: enabled: true rules: {} kubernetesSystemControllerManager: - enabled: false + create: false rules: {} kubeScheduler: - enabled: false + create: false rules: {} kubernetesSystemScheduler: - enabled: false + create: false rules: {} kubeStateMetrics: enabled: true From c2528f6f693ee3bb38f2a2ae322618f151589910 Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Fri, 19 Jun 2026 12:47:34 +0200 Subject: [PATCH 09/29] =?UTF-8?q?feat(observability):=20=E2=9C=A8=20add=20?= =?UTF-8?q?platform=20grafana=20dashboard=20CRs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add forgejo.yaml: Forgejo app dashboard (grafana.com ID 17802) - Add argocd-operational.yaml: ArgoCD operational dashboard (grafana.com ID 19993) - Add cronjob-monitoring.yaml: CronJob/backup monitoring dashboard (grafana.com ID 14279) - Add platform-overview.yaml: custom EDP Platform Overview inline dashboard (platform health, forgejo stats, resource usage, backup status rows) - Fix victoria-logs.yaml: replace broken URL with grafanaCom ID 22698 --- .../manifests/argocd-operational.yaml | 11 + .../manifests/cronjob-monitoring.yaml | 11 + .../grafana-operator/manifests/forgejo.yaml | 11 + .../manifests/platform-overview.yaml | 227 ++++++++++++++++++ .../manifests/victoria-logs.yaml | 4 +- 5 files changed, 263 insertions(+), 1 deletion(-) create mode 100644 otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd-operational.yaml create mode 100644 
otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml create mode 100644 otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/forgejo.yaml create mode 100644 otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd-operational.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd-operational.yaml new file mode 100644 index 0000000..b3fa256 --- /dev/null +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd-operational.yaml @@ -0,0 +1,11 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: argocd-operational +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + grafanaCom: + id: 19993 + revision: 2 diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml new file mode 100644 index 0000000..e77eb20 --- /dev/null +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml @@ -0,0 +1,11 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: cronjob-monitoring +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + grafanaCom: + id: 14279 + revision: 1 diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/forgejo.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/forgejo.yaml new file mode 100644 index 0000000..cf32e5e --- /dev/null +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/forgejo.yaml @@ -0,0 +1,11 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard 
+metadata: + name: forgejo +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + grafanaCom: + id: 17802 + revision: 1 diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml new file mode 100644 index 0000000..aa8be4c --- /dev/null +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml @@ -0,0 +1,227 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: platform-overview +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + json: | + { + "annotations": {"list": []}, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "title": "Platform Health", + "type": "row" + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": { + "defaults": { + "mappings": [{"options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}, "type": "value"}], + "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 0, "y": 1}, + "title": "Forgejo", + "type": "stat", + "targets": [{"expr": "sum(up{job=\"forgejo-server-http\", cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": { + "defaults": { + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 3}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 4, "y": 1}, + "title": "Ingress 5xx (5m)", + "type": "stat", + "targets": [{"expr": 
"sum(rate(nginx_ingress_controller_requests{status=~\"5..\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 8, "y": 1}, + "title": "Failed Jobs (24h)", + "type": "stat", + "targets": [{"expr": "sum(kube_job_status_failed{namespace=\"gitea\", cluster_environment=~\"$cluster_environment\"}) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.7}, {"color": "red", "value": 0.85}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 12, "y": 1}, + "title": "Cluster CPU Usage", + "type": "stat", + "targets": [{"expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.7}, {"color": "red", "value": 0.85}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 16, "y": 1}, + "title": "Cluster Memory Usage", + "type": "stat", + "targets": [{"expr": "1 - sum(node_memory_MemAvailable_bytes{cluster_environment=~\"$cluster_environment\"}) / sum(node_memory_MemTotal_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "thresholds": {"mode": "absolute", 
"steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.6}, {"color": "red", "value": 0.8}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 1}, + "title": "Max PVC Usage", + "type": "stat", + "targets": [{"expr": "max(1 - kubelet_volume_stats_available_bytes{cluster_environment=~\"$cluster_environment\"} / kubelet_volume_stats_capacity_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 5}, + "title": "Forgejo", + "type": "row" + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 0, "y": 6}, + "title": "Repositories", + "type": "stat", + "targets": [{"expr": "gitea_repositories{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 4, "y": 6}, + "title": "Users", + "type": "stat", + "targets": [{"expr": "gitea_users{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 8, "y": 6}, + "title": "Organizations", + "type": "stat", + "targets": [{"expr": "gitea_organizations{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 12, "y": 6}, + "title": "Open Issues", + "type": "stat", + "targets": [{"expr": "gitea_issues_open{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + 
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 16, "y": 6}, + "title": "Webhooks", + "type": "stat", + "targets": [{"expr": "gitea_webhooks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 6}, + "title": "Mirrors", + "type": "stat", + "targets": [{"expr": "gitea_mirrors{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 10}, + "title": "Resources", + "type": "row" + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 11}, + "title": "Node CPU Usage", + "type": "timeseries", + "targets": [{"expr": "1 - rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m])", "legendFormat": "{{instance}}"}] + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 11}, + "title": "PVC Usage by Claim", + "type": "timeseries", + "targets": [{"expr": "1 - (kubelet_volume_stats_available_bytes{cluster_environment=~\"$cluster_environment\"} / kubelet_volume_stats_capacity_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": "{{namespace}}/{{persistentvolumeclaim}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 19}, + "title": "Backups", + "type": "row" + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": {"defaults": {"unit": "s", 
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}}}, + "gridPos": {"h": 4, "w": 8, "x": 0, "y": 20}, + "title": "Time Since Last Backup Schedule", + "type": "stat", + "targets": [{"expr": "time() - kube_cronjob_status_last_schedule_time{cronjob=~\"forgejo-s3-backup|secrets-backup\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cronjob}} ({{cluster_environment}})"}] + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": {"defaults": {"unit": "s"}}, + "gridPos": {"h": 4, "w": 8, "x": 8, "y": 20}, + "title": "Backup Job Duration (Last 7d)", + "type": "timeseries", + "targets": [{"expr": "kube_job_status_completion_time{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"} - kube_job_status_start_time{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{job_name}}"}], + "options": {"legend": {"displayMode": "table"}} + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}}}, + "gridPos": {"h": 4, "w": 8, "x": 16, "y": 20}, + "title": "Failed Backup Jobs (Active)", + "type": "stat", + "targets": [{"expr": "sum by(cluster_environment, job_name) (kube_job_status_failed{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"})", "legendFormat": "{{job_name}} ({{cluster_environment}})"}] + } + ], + "schemaVersion": 39, + "tags": ["edp", "platform", "overview"], + "templating": { + "list": [ + { + "current": {"selected": true, "text": "All", "value": "$__all"}, + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "definition": "label_values(up, 
cluster_environment)", + "includeAll": true, + "multi": true, + "name": "cluster_environment", + "query": "label_values(up, cluster_environment)", + "type": "query" + } + ] + }, + "time": {"from": "now-6h", "to": "now"}, + "title": "EDP Platform Overview", + "uid": "edp-platform-overview" + } diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/victoria-logs.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/victoria-logs.yaml index 4018fbd..819dec7 100644 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/victoria-logs.yaml +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/victoria-logs.yaml @@ -6,4 +6,6 @@ spec: instanceSelector: matchLabels: dashboards: "grafana" - url: "https://raw.githubusercontent.com/VictoriaMetrics/VictoriaMetrics/refs/heads/master/dashboards/vm/victorialogs.json" + grafanaCom: + id: 22698 + revision: 1 From 949529eb5c3d693306e9916c15724195fc91040a Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Fri, 19 Jun 2026 12:50:20 +0200 Subject: [PATCH 10/29] =?UTF-8?q?feat(observability):=20=E2=9C=A8=20add=20?= =?UTF-8?q?cluster=5Fenvironment=20dropdown=20to=20Forgejo=20and=20platfor?= =?UTF-8?q?m-overview=20dashboards?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace grafanaCom import (17802) with custom inline Forgejo dashboard containing cluster_environment query variable (refresh=2, label=Environment) - Add label, refresh=2, sort=1 to platform-overview cluster_environment variable - ArgoCD (19993) and CronJob (14279) remain grafanaCom imports (acceptable) --- .../grafana-operator/manifests/forgejo.yaml | 179 +++++++++++++++++- .../manifests/platform-overview.yaml | 3 + 2 files changed, 179 insertions(+), 3 deletions(-) diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/forgejo.yaml 
b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/forgejo.yaml index cf32e5e..bf566a5 100644 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/forgejo.yaml +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/forgejo.yaml @@ -6,6 +6,179 @@ spec: instanceSelector: matchLabels: dashboards: "grafana" - grafanaCom: - id: 17802 - revision: 1 + json: | + { + "annotations": {"list": []}, + "editable": true, + "graphTooltip": 1, + "panels": [ + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "title": "Forgejo Health", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"mappings": [{"options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}, "type": "value"}], "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]}}}, + "gridPos": {"h": 4, "w": 4, "x": 0, "y": 1}, + "title": "Status", + "type": "stat", + "targets": [{"expr": "up{job=\"forgejo-server-http\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 4, "y": 1}, + "title": "Version", + "type": "stat", + "targets": [{"expr": "gitea_build_info{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{version}}"}], + "options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "name"} + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 8, "y": 1}, + "title": "Repositories", + "type": "stat", + "targets": [{"expr": "gitea_repositories{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": 
{"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 12, "y": 1}, + "title": "Users", + "type": "stat", + "targets": [{"expr": "gitea_users{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 16, "y": 1}, + "title": "Organizations", + "type": "stat", + "targets": [{"expr": "gitea_organizations{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 1}, + "title": "Teams", + "type": "stat", + "targets": [{"expr": "gitea_teams{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 5}, + "title": "Activity", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 6, "x": 0, "y": 6}, + "title": "Open Issues", + "type": "stat", + "targets": [{"expr": "gitea_issues_open{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 6, "x": 6, "y": 6}, + "title": "Closed Issues", + "type": "stat", + "targets": [{"expr": "gitea_issues_closed{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 6, "x": 12, "y": 6}, + "title": "Webhooks", + "type": "stat", + "targets": [{"expr": "gitea_webhooks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": 
{"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 6, "x": 18, "y": 6}, + "title": "Hook Tasks", + "type": "stat", + "targets": [{"expr": "gitea_hooktasks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 10}, + "title": "Content & Auth", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 0, "y": 11}, + "title": "Stars", + "type": "stat", + "targets": [{"expr": "gitea_stars{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 4, "y": 11}, + "title": "Watches", + "type": "stat", + "targets": [{"expr": "gitea_watches{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 8, "y": 11}, + "title": "Releases", + "type": "stat", + "targets": [{"expr": "gitea_releases{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 12, "y": 11}, + "title": "Mirrors", + "type": "stat", + "targets": [{"expr": "gitea_mirrors{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 16, "y": 11}, + "title": "Public Keys", + "type": "stat", + "targets": [{"expr": "gitea_publickeys{cluster_environment=~\"$cluster_environment\"}", "legendFormat": 
"{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 11}, + "title": "OAuth Apps", + "type": "stat", + "targets": [{"expr": "gitea_oauths{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + } + ], + "schemaVersion": 39, + "tags": ["edp", "forgejo", "gitea"], + "templating": { + "list": [ + { + "current": {"selected": true, "text": "All", "value": "$__all"}, + "datasource": {"type": "prometheus"}, + "definition": "label_values(gitea_repositories, cluster_environment)", + "includeAll": true, + "multi": true, + "name": "cluster_environment", + "label": "Environment", + "query": "label_values(gitea_repositories, cluster_environment)", + "refresh": 2, + "type": "query" + } + ] + }, + "time": {"from": "now-6h", "to": "now"}, + "title": "Forgejo", + "uid": "edp-forgejo" + } diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml index aa8be4c..d4102fb 100644 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml @@ -216,7 +216,10 @@ spec: "includeAll": true, "multi": true, "name": "cluster_environment", + "label": "Environment", "query": "label_values(up, cluster_environment)", + "refresh": 2, + "sort": 1, "type": "query" } ] From 91db8038e68e1dbdfb8fab012863777d9b055106 Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Fri, 19 Jun 2026 13:02:35 +0200 Subject: [PATCH 11/29] =?UTF-8?q?feat(observability):=20=E2=9C=A8=20custom?= =?UTF-8?q?=20ArgoCD=20dashboard=20with=20cluster=5Fenvironment=20filter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
.../manifests/garm-scrape.yaml | 2 +- .../manifests/argocd-operational.yaml | 133 +++++++++++++++++- .../manifests/garm-scrape.yaml | 2 +- 3 files changed, 132 insertions(+), 5 deletions(-) diff --git a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml index 6fc8de6..4b5807e 100644 --- a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml +++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml @@ -11,4 +11,4 @@ spec: matchLabels: app.kubernetes.io/name: garm endpoints: - - port: metrics + - port: http diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd-operational.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd-operational.yaml index b3fa256..f37cf03 100644 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd-operational.yaml +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd-operational.yaml @@ -6,6 +6,133 @@ spec: instanceSelector: matchLabels: dashboards: "grafana" - grafanaCom: - id: 19993 - revision: 2 + json: | + { + "annotations": {"list": []}, + "editable": true, + "graphTooltip": 1, + "panels": [ + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "title": "Application Status", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}}, + "gridPos": {"h": 4, "w": 4, "x": 0, "y": 1}, + "title": "Total Apps", + "type": "stat", + "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\"}) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode":
"absolute", "steps": [{"color": "green", "value": null}]}}}, + "gridPos": {"h": 4, "w": 4, "x": 4, "y": 1}, + "title": "Healthy", + "type": "stat", + "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", health_status=\"Healthy\"}) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}]}}}, + "gridPos": {"h": 4, "w": 4, "x": 8, "y": 1}, + "title": "Degraded", + "type": "stat", + "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", health_status=\"Degraded\"}) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}}, + "gridPos": {"h": 4, "w": 4, "x": 12, "y": 1}, + "title": "Synced", + "type": "stat", + "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", sync_status=\"Synced\"}) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "yellow", "value": null}]}}}, + "gridPos": {"h": 4, "w": 4, "x": 16, "y": 1}, + "title": "OutOfSync", + "type": "stat", + "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", sync_status=\"OutOfSync\"}) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "orange", "value": null}]}}}, + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 1}, + "title": "Progressing", + "type": "stat", + "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", health_status=\"Progressing\"}) or vector(0)", "legendFormat": ""}] + }, + {
+ "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 5}, + "title": "Application Details", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": { + "defaults": {"custom": {"filterable": true}}, + "overrides": [ + {"matcher": {"id": "byName", "options": "Health"}, "properties": [{"id": "custom.cellOptions", "value": {"type": "color-text"}}, {"id": "mappings", "value": [{"options": {"Healthy": {"color": "green", "text": "Healthy"}, "Degraded": {"color": "red", "text": "Degraded"}, "Progressing": {"color": "yellow", "text": "Progressing"}, "Missing": {"color": "purple", "text": "Missing"}}, "type": "value"}]}]}, + {"matcher": {"id": "byName", "options": "Sync"}, "properties": [{"id": "custom.cellOptions", "value": {"type": "color-text"}}, {"id": "mappings", "value": [{"options": {"Synced": {"color": "green", "text": "Synced"}, "OutOfSync": {"color": "orange", "text": "OutOfSync"}}, "type": "value"}]}]} + ] + }, + "gridPos": {"h": 12, "w": 24, "x": 0, "y": 6}, + "title": "All Applications", + "type": "table", + "targets": [{"expr": "argocd_app_info{cluster_environment=~\"$cluster_environment\"}", "format": "table", "instant": true, "legendFormat": ""}], + "transformations": [ + {"id": "filterFieldsByName", "options": {"include": {"names": ["cluster_environment", "name", "dest_namespace", "health_status", "sync_status", "repo"]}}}, + {"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "name": "Application", "dest_namespace": "Namespace", "health_status": "Health", "sync_status": "Sync", "repo": "Repository"}}} + ] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 18}, + "title": "Sync Activity", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "ops"}}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 19}, + "title": "Sync Operations (rate)", + "type": "timeseries", + "targets": [{"expr": 
"sum(rate(argocd_app_sync_total{cluster_environment=~\"$cluster_environment\"}[5m])) by (name, phase)", "legendFormat": "{{name}} ({{phase}})"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "ops"}}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 19}, + "title": "Reconciliation Rate", + "type": "timeseries", + "targets": [{"expr": "sum(rate(argocd_app_reconcile_count{cluster_environment=~\"$cluster_environment\"}[5m])) by (namespace)", "legendFormat": "{{namespace}}"}] + } + ], + "schemaVersion": 39, + "tags": ["edp", "argocd", "gitops"], + "templating": { + "list": [ + { + "current": {"selected": true, "text": "All", "value": "$__all"}, + "datasource": {"type": "prometheus"}, + "definition": "label_values(argocd_app_info, cluster_environment)", + "includeAll": true, + "multi": true, + "name": "cluster_environment", + "label": "Environment", + "query": "label_values(argocd_app_info, cluster_environment)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": {"from": "now-6h", "to": "now"}, + "title": "ArgoCD Operations", + "uid": "edp-argocd-ops" + } diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml index a4c6119..f73afa8 100644 --- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml +++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml @@ -10,4 +10,4 @@ spec: matchLabels: app.kubernetes.io/name: garm endpoints: - - port: metrics + - port: http From 6ea1e798d2592c1b627636808928c3628f3ba389 Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Fri, 19 Jun 2026 13:06:19 +0200 Subject: [PATCH 12/29] =?UTF-8?q?fix(observability):=20=F0=9F=90=9B=20add?= =?UTF-8?q?=20missing=20manifests=20to=20instance=20stacks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit - backup-alerts.yaml → observability.buildth.ing victoria-k8s-stack - forgejo-scrape.yaml → dev.t09.de vm-client-stack --- .../manifests/forgejo-scrape.yaml | 15 ++++ .../manifests/backup-alerts.yaml | 78 +++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml create mode 100644 otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/backup-alerts.yaml diff --git a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml new file mode 100644 index 0000000..aecf517 --- /dev/null +++ b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml @@ -0,0 +1,15 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: forgejo + namespace: observability +spec: + namespaceSelector: + matchNames: + - gitea + selector: + matchLabels: + app.kubernetes.io/name: forgejo + endpoints: + - port: http + path: /metrics diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/backup-alerts.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/backup-alerts.yaml new file mode 100644 index 0000000..259a2bf --- /dev/null +++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/backup-alerts.yaml @@ -0,0 +1,78 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + name: backup-alerts + namespace: observability +spec: + groups: + - name: backup-schedule-staleness + rules: + - alert: BackupCronJobNotScheduled + expr: | + time() - kube_cronjob_status_last_schedule_time{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"} + > 26 * 3600 + for: 5m + labels: + severity: critical + cronjob: "{{ $labels.cronjob }}" + 
annotations: + value: "{{ $value | humanizeDuration }}" + description: >- + CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been + scheduled for over 26 hours in cluster {{ $labels.cluster_environment }}. + Last schedule was {{ $value | humanizeDuration }} ago. + summary: "Backup CronJob {{ $labels.cronjob }} is stale" + + - alert: BackupCronJobNeverScheduled + expr: | + kube_cronjob_info{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"} + unless on(cluster_environment, namespace, cronjob) kube_cronjob_status_last_schedule_time{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"} + for: 30m + labels: + severity: critical + cronjob: "{{ $labels.cronjob }}" + annotations: + description: >- + CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has never been + scheduled in cluster {{ $labels.cluster_environment }}. + summary: "Backup CronJob {{ $labels.cronjob }} never ran" + + - name: backup-job-failures + rules: + - alert: BackupJobFailed + expr: | + max by(cluster_environment, namespace, job_name) ( + kube_job_status_failed{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"} + ) > 0 + for: 30s + labels: + severity: critical + job_name: "{{ $labels.job_name }}" + annotations: + value: "{{ $value }}" + description: >- + Backup job {{ $labels.namespace }}/{{ $labels.job_name }} has + {{ $value }} failed pod(s) in cluster {{ $labels.cluster_environment }}. + summary: "Backup job {{ $labels.job_name }} failed" + + - name: backup-job-duration + rules: + - alert: BackupJobTooSlow + expr: | + ( + time() - kube_job_status_start_time{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"} + ) > 300 + and + kube_job_status_active{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"} > 0 + for: 1m + labels: + severity: major + job_name: "{{ $labels.job_name }}" + annotations: + value: "{{ $value | humanizeDuration }}" + description: >- + Backup job {{ $labels.namespace }}/{{ $labels.job_name }} has been + running for {{ $value | humanizeDuration }} (threshold: 5m) + in cluster {{ $labels.cluster_environment }}.
This may indicate a + hung process or connectivity issue. + summary: "Backup job {{ $labels.job_name }} running too long" From 076b2a16c9129f5f53bdb139f9d1abeb78e99143 Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Fri, 19 Jun 2026 13:11:32 +0200 Subject: [PATCH 13/29] =?UTF-8?q?fix(observability):=20=F0=9F=90=9B=20fix?= =?UTF-8?q?=20datasource=20UIDs,=20replace=20cronjob=20dashboard,=20add=20?= =?UTF-8?q?GARM?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove all ${DS_VICTORIAMETRICS} uid refs from platform-overview; use type-only datasource so grafana-operator resolves default prometheus DS - Replace grafanaCom id:14279 cronjob dashboard with inline custom version supporting cluster_environment variable (dev/edp/observability) - Add new GARM runners dashboard (edp-garm) ready for when GARM metrics are scraped; uses or vector(0) guards so panels show 0 not empty Note: cluster_environment values confirmed as dev/edp/observability (no benchmark). GARM metrics not yet present in VictoriaMetrics (0 series found). 
--- .../manifests/cronjob-monitoring.yaml | 97 ++++++++++++++- .../grafana-operator/manifests/garm.yaml | 116 ++++++++++++++++++ .../manifests/platform-overview.yaml | 36 +++--- 3 files changed, 228 insertions(+), 21 deletions(-) create mode 100644 otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/garm.yaml diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml index e77eb20..5b5eeac 100644 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml @@ -6,6 +6,97 @@ spec: instanceSelector: matchLabels: dashboards: "grafana" - grafanaCom: - id: 14279 - revision: 1 + json: | + { + "annotations": {"list": []}, + "editable": true, + "graphTooltip": 1, + "panels": [ + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "title": "Backup Job Status", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}}}, + "gridPos": {"h": 5, "w": 12, "x": 0, "y": 1}, + "title": "Time Since Last Schedule", + "type": "stat", + "targets": [{"expr": "time() - kube_cronjob_status_last_schedule_time{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cronjob}} ({{cluster_environment}})"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}}}, + "gridPos": {"h": 5, "w": 12, "x": 12, "y": 1}, + "title": "Failed Jobs (Active)", + "type": "stat", + "targets": 
[{"expr": "sum by(cluster_environment, job_name) (kube_job_status_failed{cluster_environment=~\"$cluster_environment\"}) > 0", "legendFormat": "{{job_name}} ({{cluster_environment}})"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 6}, + "title": "CronJob Overview", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"custom": {"filterable": true}}, "overrides": [{"matcher": {"id": "byName", "options": "Suspended"}, "properties": [{"id": "mappings", "value": [{"options": {"0": {"text": "No", "color": "green"}, "1": {"text": "YES", "color": "red"}}, "type": "value"}]}]}]}, + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 7}, + "title": "All CronJobs", + "type": "table", + "targets": [ + {"expr": "kube_cronjob_info{cluster_environment=~\"$cluster_environment\"}", "format": "table", "instant": true, "refId": "A"} + ], + "transformations": [ + {"id": "filterFieldsByName", "options": {"include": {"names": ["cluster_environment", "cronjob", "namespace", "schedule"]}}}, + {"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "cronjob": "CronJob", "namespace": "Namespace", "schedule": "Schedule"}}} + ] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 15}, + "title": "Job History", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16}, + "title": "Job Completions (24h)", + "type": "timeseries", + "targets": [{"expr": "sum(kube_job_status_succeeded{cluster_environment=~\"$cluster_environment\"}) by (job_name, cluster_environment)", "legendFormat": "{{job_name}} ({{cluster_environment}})"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "color": {"mode": "palette-classic"}}}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16}, + "title": "Job Failures (24h)", + "type": "timeseries", + "targets": 
[{"expr": "sum(kube_job_status_failed{cluster_environment=~\"$cluster_environment\"}) by (job_name, cluster_environment)", "legendFormat": "{{job_name}} ({{cluster_environment}})"}] + } + ], + "schemaVersion": 39, + "tags": ["edp", "backup", "cronjob"], + "templating": { + "list": [ + { + "current": {"selected": true, "text": "All", "value": "$__all"}, + "datasource": {"type": "prometheus"}, + "definition": "label_values(kube_cronjob_info, cluster_environment)", + "includeAll": true, + "multi": true, + "name": "cluster_environment", + "label": "Environment", + "query": "label_values(kube_cronjob_info, cluster_environment)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": {"from": "now-24h", "to": "now"}, + "title": "CronJob & Backup Monitoring", + "uid": "edp-cronjobs" + } diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/garm.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/garm.yaml new file mode 100644 index 0000000..9e01a51 --- /dev/null +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/garm.yaml @@ -0,0 +1,116 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: garm +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + json: | + { + "annotations": {"list": []}, + "editable": true, + "graphTooltip": 1, + "panels": [ + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "title": "GARM Runner Status", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}}, + "gridPos": {"h": 5, "w": 6, "x": 0, "y": 1}, + "title": "Total Runners", + "type": "stat", + "targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\"}) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": {"type": 
"prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}}, + "gridPos": {"h": 5, "w": 6, "x": 6, "y": 1}, + "title": "Idle Runners", + "type": "stat", + "targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\", status=\"idle\"}) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "yellow", "value": null}]}}}, + "gridPos": {"h": 5, "w": 6, "x": 12, "y": 1}, + "title": "Creating", + "type": "stat", + "targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\", status=\"creating\"}) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}]}}}, + "gridPos": {"h": 5, "w": 6, "x": 18, "y": 1}, + "title": "Errors", + "type": "stat", + "targets": [{"expr": "sum(rate(garm_runner_errors_total{cluster_environment=~\"$cluster_environment\"}[5m])) or vector(0)", "legendFormat": ""}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 6}, + "title": "GitHub API Rate Limits", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "min": 0}}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 7}, + "title": "Rate Limit Remaining", + "type": "timeseries", + "targets": [{"expr": "garm_github_rate_limit_remaining{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "ops"}}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 7}, + "title": "Runner Operations Rate", + "type": "timeseries", + "targets": [{"expr": 
"sum(rate(garm_runner_operations_total{cluster_environment=~\"$cluster_environment\"}[5m])) by (cluster_environment)", "legendFormat": "{{cluster_environment}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 15}, + "title": "Runner Details", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"custom": {"filterable": true}}}, + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 16}, + "title": "Runner Pool Status", + "type": "table", + "targets": [{"expr": "garm_runner_status{cluster_environment=~\"$cluster_environment\"}", "format": "table", "instant": true}], + "transformations": [ + {"id": "filterFieldsByName", "options": {"include": {"names": ["cluster_environment", "name", "status", "pool_owner", "pool_type", "provider"]}}}, + {"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "name": "Runner", "status": "Status", "pool_owner": "Pool Owner", "pool_type": "Type", "provider": "Provider"}}} + ] + } + ], + "schemaVersion": 39, + "tags": ["edp", "garm", "ci-cd", "runners"], + "templating": { + "list": [ + { + "current": {"selected": true, "text": "All", "value": "$__all"}, + "datasource": {"type": "prometheus"}, + "definition": "label_values(garm_runner_status, cluster_environment)", + "includeAll": true, + "multi": true, + "name": "cluster_environment", + "label": "Environment", + "query": "label_values(garm_runner_status, cluster_environment)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": {"from": "now-6h", "to": "now"}, + "title": "GARM Runners", + "uid": "edp-garm" + } diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml index d4102fb..ac099d0 100644 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml +++ 
b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml @@ -21,7 +21,7 @@ spec: "type": "row" }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": { "defaults": { "mappings": [{"options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}, "type": "value"}], @@ -34,7 +34,7 @@ spec: "targets": [{"expr": "sum(up{job=\"forgejo-server-http\", cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}] }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": { "defaults": { "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 3}]} @@ -46,7 +46,7 @@ spec: "targets": [{"expr": "sum(rate(nginx_ingress_controller_requests{status=~\"5..\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": ""}] }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": { "defaults": { "unit": "short", @@ -59,7 +59,7 @@ spec: "targets": [{"expr": "sum(kube_job_status_failed{namespace=\"gitea\", cluster_environment=~\"$cluster_environment\"}) or vector(0)", "legendFormat": ""}] }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": { "defaults": { "unit": "percentunit", @@ -72,7 +72,7 @@ spec: "targets": [{"expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": ""}] }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": { "defaults": { "unit": "percentunit", @@ -85,7 +85,7 @@ spec: "targets": [{"expr": "1 - 
sum(node_memory_MemAvailable_bytes{cluster_environment=~\"$cluster_environment\"}) / sum(node_memory_MemTotal_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}] }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": { "defaults": { "unit": "percentunit", @@ -104,7 +104,7 @@ spec: "type": "row" }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": {"defaults": {"unit": "short"}}, "gridPos": {"h": 4, "w": 4, "x": 0, "y": 6}, "title": "Repositories", @@ -112,7 +112,7 @@ spec: "targets": [{"expr": "gitea_repositories{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": {"defaults": {"unit": "short"}}, "gridPos": {"h": 4, "w": 4, "x": 4, "y": 6}, "title": "Users", @@ -120,7 +120,7 @@ spec: "targets": [{"expr": "gitea_users{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": {"defaults": {"unit": "short"}}, "gridPos": {"h": 4, "w": 4, "x": 8, "y": 6}, "title": "Organizations", @@ -128,7 +128,7 @@ spec: "targets": [{"expr": "gitea_organizations{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": {"defaults": {"unit": "short"}}, "gridPos": {"h": 4, "w": 4, "x": 12, "y": 6}, "title": "Open Issues", @@ -136,7 +136,7 @@ spec: "targets": [{"expr": "gitea_issues_open{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] }, { - "datasource": {"type": 
"prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": {"defaults": {"unit": "short"}}, "gridPos": {"h": 4, "w": 4, "x": 16, "y": 6}, "title": "Webhooks", @@ -144,7 +144,7 @@ spec: "targets": [{"expr": "gitea_webhooks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": {"defaults": {"unit": "short"}}, "gridPos": {"h": 4, "w": 4, "x": 20, "y": 6}, "title": "Mirrors", @@ -158,7 +158,7 @@ spec: "type": "row" }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}}, "gridPos": {"h": 8, "w": 12, "x": 0, "y": 11}, "title": "Node CPU Usage", @@ -166,7 +166,7 @@ spec: "targets": [{"expr": "1 - rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m])", "legendFormat": "{{instance}}"}] }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}}, "gridPos": {"h": 8, "w": 12, "x": 12, "y": 11}, "title": "PVC Usage by Claim", @@ -180,7 +180,7 @@ spec: "type": "row" }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}}}, "gridPos": {"h": 4, "w": 8, "x": 0, "y": 20}, "title": "Time Since Last Backup Schedule", @@ -188,7 +188,7 @@ spec: "targets": [{"expr": "time() - kube_cronjob_status_last_schedule_time{cronjob=~\"forgejo-s3-backup|secrets-backup\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": 
"{{cronjob}} ({{cluster_environment}})"}] }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": {"defaults": {"unit": "s"}}, "gridPos": {"h": 4, "w": 8, "x": 8, "y": 20}, "title": "Backup Job Duration (Last 7d)", @@ -197,7 +197,7 @@ spec: "options": {"legend": {"displayMode": "table"}} }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}}}, "gridPos": {"h": 4, "w": 8, "x": 16, "y": 20}, "title": "Failed Backup Jobs (Active)", @@ -211,7 +211,7 @@ spec: "list": [ { "current": {"selected": true, "text": "All", "value": "$__all"}, - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "definition": "label_values(up, cluster_environment)", "includeAll": true, "multi": true, From 238ef71630a9dbde930d119b9d9d9efa2efe6527 Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Fri, 19 Jun 2026 13:23:37 +0200 Subject: [PATCH 14/29] =?UTF-8?q?fix(observability):=20=F0=9F=90=9B=20fix?= =?UTF-8?q?=20remote=20write=20URL=20and=20add=20manifests=20for=20benchma?= =?UTF-8?q?rk=20+=20edp=20clients?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix broken remote write URL (o12y.observability./ → o12y.observability.buildth.ing/) - Create manifests/ dirs with .gitkeep for benchmark.t09.de and edp.buildth.ing - Copy forgejo-scrape.yaml VMServiceScrape manifest to both instances --- .../vm-client-stack/manifests/.gitkeep | 0 .../vm-client-stack/manifests/forgejo-scrape.yaml | 15 +++++++++++++++ .../vm-client-stack/values.yaml | 2 +- .../vm-client-stack/manifests/.gitkeep | 0 .../vm-client-stack/manifests/forgejo-scrape.yaml | 15 +++++++++++++++ .../vm-client-stack/values.yaml | 2 +- 6 files changed, 
32 insertions(+), 2 deletions(-) create mode 100644 otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/manifests/.gitkeep create mode 100644 otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml create mode 100644 otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/manifests/.gitkeep create mode 100644 otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml diff --git a/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/manifests/.gitkeep b/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/manifests/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml b/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml new file mode 100644 index 0000000..aecf517 --- /dev/null +++ b/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml @@ -0,0 +1,15 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: forgejo + namespace: observability +spec: + namespaceSelector: + matchNames: + - gitea + selector: + matchLabels: + app.kubernetes.io/name: forgejo + endpoints: + - port: http + path: /metrics diff --git a/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/values.yaml b/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/values.yaml index dde927b..4bc089d 100644 --- a/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/values.yaml +++ b/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/values.yaml @@ -778,7 +778,7 @@ vmagent: # -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec) additionalRemoteWrites: # [] - - url: https://o12y.observability./api/v1/write + - url: 
https://o12y.observability.buildth.ing/api/v1/write basicAuth: username: name: simple-user-secret diff --git a/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/manifests/.gitkeep b/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/manifests/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml b/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml new file mode 100644 index 0000000..aecf517 --- /dev/null +++ b/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml @@ -0,0 +1,15 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: forgejo + namespace: observability +spec: + namespaceSelector: + matchNames: + - gitea + selector: + matchLabels: + app.kubernetes.io/name: forgejo + endpoints: + - port: http + path: /metrics diff --git a/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/values.yaml b/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/values.yaml index 4e1c079..255e9e5 100644 --- a/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/values.yaml +++ b/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/values.yaml @@ -778,7 +778,7 @@ vmagent: # -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec) additionalRemoteWrites: # [] - - url: https://o12y.observability./api/v1/write + - url: https://o12y.observability.buildth.ing/api/v1/write basicAuth: username: name: simple-user-secret From bcf583a0556af1b1c88246d128e2709a2f9c706e Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Fri, 19 Jun 2026 13:32:13 +0200 Subject: [PATCH 15/29] =?UTF-8?q?fix(observability):=20=F0=9F=90=9B=20fix?= =?UTF-8?q?=20Vector=20log=20shipping=20URL=20on=20all=20clusters?= MIME-Version: 
1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restores missing '.buildth.ing' domain segment in Vector elasticsearch endpoint for benchmark, dev, and edp instances. Template source uses {{{ .Env.DOMAIN_O12Y }}} (correct) — instances were mis-hydrated, omitting the TLD suffix. --- .../stacks/observability-client/vector/values.yaml | 2 +- otc/dev.t09.de/stacks/observability-client/vector/values.yaml | 2 +- .../stacks/observability-client/vector/values.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/otc/benchmark.t09.de/stacks/observability-client/vector/values.yaml b/otc/benchmark.t09.de/stacks/observability-client/vector/values.yaml index 3fb5e53..2393b1a 100644 --- a/otc/benchmark.t09.de/stacks/observability-client/vector/values.yaml +++ b/otc/benchmark.t09.de/stacks/observability-client/vector/values.yaml @@ -48,7 +48,7 @@ customConfig: type: elasticsearch inputs: [parser] endpoints: - - https://o12y.observability./insert/elasticsearch/ + - https://o12y.observability.buildth.ing/insert/elasticsearch/ auth: strategy: basic user: ${VECTOR_USER} diff --git a/otc/dev.t09.de/stacks/observability-client/vector/values.yaml b/otc/dev.t09.de/stacks/observability-client/vector/values.yaml index c0644cf..4d7458a 100644 --- a/otc/dev.t09.de/stacks/observability-client/vector/values.yaml +++ b/otc/dev.t09.de/stacks/observability-client/vector/values.yaml @@ -48,7 +48,7 @@ customConfig: type: elasticsearch inputs: [parser] endpoints: - - https://o12y.observability./insert/elasticsearch/ + - https://o12y.observability.buildth.ing/insert/elasticsearch/ auth: strategy: basic user: ${VECTOR_USER} diff --git a/otc/edp.buildth.ing/stacks/observability-client/vector/values.yaml b/otc/edp.buildth.ing/stacks/observability-client/vector/values.yaml index 7b30cdc..2fefacd 100644 --- a/otc/edp.buildth.ing/stacks/observability-client/vector/values.yaml +++ b/otc/edp.buildth.ing/stacks/observability-client/vector/values.yaml @@ -48,7 
+48,7 @@ customConfig: type: elasticsearch inputs: [parser] endpoints: - - https://o12y.observability./insert/elasticsearch/ + - https://o12y.observability.buildth.ing/insert/elasticsearch/ auth: strategy: basic user: ${VECTOR_USER} From b6fbd3f6eb92cdb394fe86ef0ddf5c7c6cbd2b3f Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Fri, 19 Jun 2026 13:34:08 +0200 Subject: [PATCH 16/29] =?UTF-8?q?feat(observability):=20=E2=9C=A8=20add=20?= =?UTF-8?q?VictoriaLogs=20log=20panels=20to=20platform,=20forgejo,=20argoc?= =?UTF-8?q?d=20dashboards?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../manifests/argocd-operational.yaml | 14 ++++++++++++ .../grafana-operator/manifests/forgejo.yaml | 22 +++++++++++++++++++ .../manifests/platform-overview.yaml | 14 ++++++++++++ 3 files changed, 50 insertions(+) diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd-operational.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd-operational.yaml index f37cf03..e8e51a2 100644 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd-operational.yaml +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd-operational.yaml @@ -111,6 +111,20 @@ spec: "title": "Reconciliation Rate", "type": "timeseries", "targets": [{"expr": "sum(rate(argocd_app_reconcile_count{cluster_environment=~\"$cluster_environment\"}[5m])) by (namespace)", "legendFormat": "{{namespace}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 27}, + "title": "ArgoCD Logs", + "type": "row" + }, + { + "datasource": {"type": "victoriametrics-logs-datasource"}, + "gridPos": {"h": 10, "w": 24, "x": 0, "y": 28}, + "title": "ArgoCD Logs", + "type": "logs", + "targets": [{"expr": "{cluster_environment=~\"$cluster_environment\", kubernetes.namespace=\"argocd\"}", "refId": "A"}], + "options": {"showTime": true, 
"showLabels": true, "wrapLogMessage": true, "enableLogDetails": true, "sortOrder": "Descending"} } ], "schemaVersion": 39, diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/forgejo.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/forgejo.yaml index bf566a5..606b601 100644 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/forgejo.yaml +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/forgejo.yaml @@ -158,6 +158,28 @@ spec: "title": "OAuth Apps", "type": "stat", "targets": [{"expr": "gitea_oauths{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 15}, + "title": "Forgejo Logs", + "type": "row" + }, + { + "datasource": {"type": "victoriametrics-logs-datasource"}, + "gridPos": {"h": 10, "w": 12, "x": 0, "y": 16}, + "title": "Forgejo Server Logs", + "type": "logs", + "targets": [{"expr": "{cluster_environment=~\"$cluster_environment\", kubernetes.namespace=\"gitea\"}", "refId": "A"}], + "options": {"showTime": true, "showLabels": true, "wrapLogMessage": true, "enableLogDetails": true, "sortOrder": "Descending"} + }, + { + "datasource": {"type": "victoriametrics-logs-datasource"}, + "gridPos": {"h": 10, "w": 12, "x": 12, "y": 16}, + "title": "Forgejo Errors", + "type": "logs", + "targets": [{"expr": "{cluster_environment=~\"$cluster_environment\", kubernetes.namespace=\"gitea\"} error OR Error OR ERROR OR panic", "refId": "A"}], + "options": {"showTime": true, "showLabels": true, "wrapLogMessage": true, "enableLogDetails": true, "sortOrder": "Descending"} } ], "schemaVersion": 39, diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml index 
ac099d0..eab6c9f 100644 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml @@ -203,6 +203,20 @@ spec: "title": "Failed Backup Jobs (Active)", "type": "stat", "targets": [{"expr": "sum by(cluster_environment, job_name) (kube_job_status_failed{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"})", "legendFormat": "{{job_name}} ({{cluster_environment}})"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 24}, + "title": "Logs", + "type": "row" + }, + { + "datasource": {"type": "victoriametrics-logs-datasource"}, + "gridPos": {"h": 10, "w": 24, "x": 0, "y": 25}, + "title": "Recent Errors (all namespaces)", + "type": "logs", + "targets": [{"expr": "{cluster_environment=~\"$cluster_environment\"} error OR Error OR ERROR OR panic OR PANIC", "refId": "A"}], + "options": {"showTime": true, "showLabels": true, "showCommonLabels": false, "wrapLogMessage": true, "prettifyLogMessage": false, "enableLogDetails": true, "sortOrder": "Descending", "dedupStrategy": "none"} } ], "schemaVersion": 39, From 7f5c680e19b4e35eec48e703f1eea44f36705824 Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Fri, 19 Jun 2026 13:36:15 +0200 Subject: [PATCH 17/29] =?UTF-8?q?fix(observability):=20=F0=9F=90=9B=20enab?= =?UTF-8?q?le=20GARM=20unauthenticated=20metrics=20+=20ArgoCD=20metrics=20?= =?UTF-8?q?on=20all=20instances?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - GARM dev.t09.de: set garm.metrics.disableAuth=true to unblock Prometheus scraping (was 401) - ArgoCD dev.t09.de: add controller/server/repoServer/applicationSet metrics blocks - ArgoCD edp.buildth.ing: add controller/server/repoServer/applicationSet metrics blocks - ArgoCD benchmark.t09.de: add controller/server/repoServer/applicationSet metrics blocks - 
observability.buildth.ing already had metrics enabled (no change needed) --- .../stacks/core/argocd/values.yaml | 24 +++++++++++++++++++ otc/dev.t09.de/stacks/core/argocd/values.yaml | 24 +++++++++++++++++++ otc/dev.t09.de/stacks/garm/garm/values.yaml | 3 +++ .../stacks/core/argocd/values.yaml | 24 +++++++++++++++++++ 4 files changed, 75 insertions(+) diff --git a/otc/benchmark.t09.de/stacks/core/argocd/values.yaml b/otc/benchmark.t09.de/stacks/core/argocd/values.yaml index a6521b0..1591cc9 100644 --- a/otc/benchmark.t09.de/stacks/core/argocd/values.yaml +++ b/otc/benchmark.t09.de/stacks/core/argocd/values.yaml @@ -35,6 +35,30 @@ configs: tls: certificates: +controller: + metrics: + enabled: true + serviceMonitor: + enabled: false + +server: + metrics: + enabled: true + serviceMonitor: + enabled: false + +repoServer: + metrics: + enabled: true + serviceMonitor: + enabled: false + +applicationSet: + metrics: + enabled: true + serviceMonitor: + enabled: false + notifications: enabled: false diff --git a/otc/dev.t09.de/stacks/core/argocd/values.yaml b/otc/dev.t09.de/stacks/core/argocd/values.yaml index dd5b83d..cb856f0 100644 --- a/otc/dev.t09.de/stacks/core/argocd/values.yaml +++ b/otc/dev.t09.de/stacks/core/argocd/values.yaml @@ -35,6 +35,30 @@ configs: tls: certificates: +controller: + metrics: + enabled: true + serviceMonitor: + enabled: false + +server: + metrics: + enabled: true + serviceMonitor: + enabled: false + +repoServer: + metrics: + enabled: true + serviceMonitor: + enabled: false + +applicationSet: + metrics: + enabled: true + serviceMonitor: + enabled: false + notifications: enabled: false diff --git a/otc/dev.t09.de/stacks/garm/garm/values.yaml b/otc/dev.t09.de/stacks/garm/garm/values.yaml index 5baed69..41fc84c 100644 --- a/otc/dev.t09.de/stacks/garm/garm/values.yaml +++ b/otc/dev.t09.de/stacks/garm/garm/values.yaml @@ -41,5 +41,8 @@ providerConfig: sidecarImage: edp.buildth.ing/devfw-cicd/ci-sizer-collector:0.0.4 garm: + metrics: + enable: true + 
disableAuth: true logging: logLevel: info diff --git a/otc/edp.buildth.ing/stacks/core/argocd/values.yaml b/otc/edp.buildth.ing/stacks/core/argocd/values.yaml index 019dc65..c1bde64 100644 --- a/otc/edp.buildth.ing/stacks/core/argocd/values.yaml +++ b/otc/edp.buildth.ing/stacks/core/argocd/values.yaml @@ -35,6 +35,30 @@ configs: tls: certificates: +controller: + metrics: + enabled: true + serviceMonitor: + enabled: false + +server: + metrics: + enabled: true + serviceMonitor: + enabled: false + +repoServer: + metrics: + enabled: true + serviceMonitor: + enabled: false + +applicationSet: + metrics: + enabled: true + serviceMonitor: + enabled: false + notifications: enabled: false From 4591ee7b144d7bad8c30eda360d8e89200121501 Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Fri, 19 Jun 2026 14:46:35 +0200 Subject: [PATCH 18/29] =?UTF-8?q?feat(observability):=20=F0=9F=97=82?= =?UTF-8?q?=EF=B8=8F=20organize=20dashboards=20into=20Grafana=20folders?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Assigns folder field to all GrafanaDashboard CRs: - EDP / Overview: platform-overview - EDP / Applications: forgejo, argocd-operational, garm, argocd - EDP / Operations: cronjob-monitoring, ingress-nginx, victoria-logs --- .../grafana-operator/manifests/argocd-operational.yaml | 1 + .../stacks/observability/grafana-operator/manifests/argocd.yaml | 1 + .../grafana-operator/manifests/cronjob-monitoring.yaml | 1 + .../stacks/observability/grafana-operator/manifests/forgejo.yaml | 1 + .../stacks/observability/grafana-operator/manifests/garm.yaml | 1 + .../observability/grafana-operator/manifests/ingress-nginx.yaml | 1 + .../grafana-operator/manifests/platform-overview.yaml | 1 + .../observability/grafana-operator/manifests/victoria-logs.yaml | 1 + 8 files changed, 8 insertions(+) diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd-operational.yaml 
b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd-operational.yaml index e8e51a2..9130b42 100644 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd-operational.yaml +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd-operational.yaml @@ -6,6 +6,7 @@ spec: instanceSelector: matchLabels: dashboards: "grafana" + folder: "EDP / Applications" json: | { "annotations": {"list": []}, diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd.yaml index b348ff7..2b81b2b 100644 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd.yaml +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd.yaml @@ -6,4 +6,5 @@ spec: instanceSelector: matchLabels: dashboards: "grafana" + folder: "EDP / Applications" url: "https://raw.githubusercontent.com/argoproj/argo-cd/refs/heads/master/examples/dashboard.json" diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml index 5b5eeac..ddcc883 100644 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml @@ -6,6 +6,7 @@ spec: instanceSelector: matchLabels: dashboards: "grafana" + folder: "EDP / Operations" json: | { "annotations": {"list": []}, diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/forgejo.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/forgejo.yaml index 606b601..ec40751 100644 --- 
a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/forgejo.yaml +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/forgejo.yaml @@ -6,6 +6,7 @@ spec: instanceSelector: matchLabels: dashboards: "grafana" + folder: "EDP / Applications" json: | { "annotations": {"list": []}, diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/garm.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/garm.yaml index 9e01a51..2a23e20 100644 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/garm.yaml +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/garm.yaml @@ -6,6 +6,7 @@ spec: instanceSelector: matchLabels: dashboards: "grafana" + folder: "EDP / Applications" json: | { "annotations": {"list": []}, diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/ingress-nginx.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/ingress-nginx.yaml index c13d6a2..077edd8 100644 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/ingress-nginx.yaml +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/ingress-nginx.yaml @@ -6,4 +6,5 @@ spec: instanceSelector: matchLabels: dashboards: "grafana" + folder: "EDP / Operations" url: "https://raw.githubusercontent.com/adinhodovic/ingress-nginx-mixin/refs/heads/main/dashboards_out/ingress-nginx-overview.json" diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml index eab6c9f..ffce4e2 100644 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml +++ 
b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml @@ -6,6 +6,7 @@ spec: instanceSelector: matchLabels: dashboards: "grafana" + folder: "EDP / Overview" json: | { "annotations": {"list": []}, diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/victoria-logs.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/victoria-logs.yaml index 819dec7..c44c474 100644 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/victoria-logs.yaml +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/victoria-logs.yaml @@ -6,6 +6,7 @@ spec: instanceSelector: matchLabels: dashboards: "grafana" + folder: "EDP / Operations" grafanaCom: id: 22698 revision: 1 From b1a00d0395d298e46ed679b6564a3548516c025f Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Fri, 19 Jun 2026 15:28:03 +0200 Subject: [PATCH 19/29] =?UTF-8?q?fix(observability):=20=F0=9F=90=9B=20add?= =?UTF-8?q?=20missing=20simple-user-secret=20to=20hub=20observability=20st?= =?UTF-8?q?ack?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hub's VMUser (vmauth.yaml) references simple-user-secret via passwordRef, but the Secret was never added to the hub's manifests. Without this Secret, the VM operator cannot reconcile the VMUser into the vmauth config, causing ALL requests to fall through to the unauthorizedUser catch-all (vmsingle). Result: Vector log shipping to VictoriaLogs was broken — vmauth routed /insert/elasticsearch/_bulk to vmsingle instead of vlogs-victorialogs. 
--- .../victoria-k8s-stack/manifests/simple-user-secret.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/simple-user-secret.yaml diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/simple-user-secret.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/simple-user-secret.yaml new file mode 100644 index 0000000..7013863 --- /dev/null +++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/simple-user-secret.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: Secret +metadata: + name: simple-user-secret + namespace: observability +type: Opaque +data: + username: c2ltcGxlLXVzZXI= + password: c3g1Z0M3b29XYVdPT0R3RA== From 8488de0c6f8ec81192b2e9bd7c530f48ddc5bc49 Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Fri, 19 Jun 2026 15:45:48 +0200 Subject: [PATCH 20/29] =?UTF-8?q?fix(observability):=20=F0=9F=90=9B=20use?= =?UTF-8?q?=20plaintext=20password=20in=20hub=20VMUser=20to=20unblock=20op?= =?UTF-8?q?erator=20reconciliation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hub VMUser was using passwordRef pointing to simple-user-secret, but that Secret was not present in the cluster (only exists in git now via the previous commit). VM operator skips VMUser reconciliation when passwordRef cannot resolve, leaving vmauth with only the unauthorizedUser catch-all (vmsingle). Switching to inline password ensures immediate operator reconciliation without waiting for Secret deployment. The simple-user-secret.yaml manifest is kept for Vector's credential reference. 
--- .../observability/victoria-k8s-stack/manifests/vmauth.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml index 5759093..374511d 100644 --- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml +++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml @@ -5,9 +5,7 @@ metadata: namespace: observability spec: username: simple-user - passwordRef: - key: password - name: simple-user-secret + password: sx5gC7ooWaWOODwD targetRefs: - static: url: http://vmsingle-o12y:8429 From f3931dc5509a2b86a8772e2388cd9a1c83e083a7 Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Fri, 19 Jun 2026 16:07:06 +0200 Subject: [PATCH 21/29] =?UTF-8?q?fix(observability):=20=F0=9F=90=9B=20add?= =?UTF-8?q?=20ArgoCD=20+=20GARM=20VMServiceScrapes=20to=20dev=20client=20s?= =?UTF-8?q?tack?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../vm-client-stack/manifests/argocd-scrape.yaml | 14 ++++++++++++++ .../vm-client-stack/manifests/garm-scrape.yaml | 15 +++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/argocd-scrape.yaml create mode 100644 otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/garm-scrape.yaml diff --git a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/argocd-scrape.yaml b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/argocd-scrape.yaml new file mode 100644 index 0000000..710145a --- /dev/null +++ b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/argocd-scrape.yaml @@ -0,0 +1,14 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: argocd + 
namespace: observability +spec: + namespaceSelector: + matchNames: + - argocd + selector: + matchLabels: + app.kubernetes.io/part-of: argocd + endpoints: + - port: metrics diff --git a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/garm-scrape.yaml b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/garm-scrape.yaml new file mode 100644 index 0000000..9904e86 --- /dev/null +++ b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/garm-scrape.yaml @@ -0,0 +1,15 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: garm + namespace: observability +spec: + namespaceSelector: + matchNames: + - garm + selector: + matchLabels: + app.kubernetes.io/name: garm + endpoints: + - port: http + path: /metrics From 0a249820de5c682267ec7f73ebbd70b41f6a0a49 Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Fri, 19 Jun 2026 16:11:09 +0200 Subject: [PATCH 22/29] =?UTF-8?q?fix(observability):=20=F0=9F=90=9B=20fix?= =?UTF-8?q?=20ArgoCD=20scrape=20port=20name=20http-metrics=20not=20metrics?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../vm-client-stack/manifests/argocd-scrape.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/argocd-scrape.yaml b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/argocd-scrape.yaml index 710145a..2e9248f 100644 --- a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/argocd-scrape.yaml +++ b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/argocd-scrape.yaml @@ -11,4 +11,4 @@ spec: matchLabels: app.kubernetes.io/part-of: argocd endpoints: - - port: metrics + - port: http-metrics From 23edd5d6b4883999c581a66e9b2040fbeb529696 Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Fri, 19 Jun 2026 16:33:04 +0200 Subject: [PATCH 23/29] 
=?UTF-8?q?feat(observability):=20=E2=9C=A8=20add=20?= =?UTF-8?q?read=20routes=20to=20vmauth=20for=20metrics=20and=20logs=20quer?= =?UTF-8?q?ies?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../victoria-k8s-stack/manifests/vmauth.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml index 374511d..e1de2c6 100644 --- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml +++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml @@ -10,6 +10,12 @@ spec: - static: url: http://vmsingle-o12y:8429 paths: ["/api/v1/write"] + - static: + url: http://vmsingle-o12y:8429 + paths: ["/api/v1/.*"] - static: url: http://vlogs-victorialogs:9428 - paths: ["/insert/elasticsearch/.*"] \ No newline at end of file + paths: ["/insert/elasticsearch/.*"] + - static: + url: http://vlogs-victorialogs:9428 + paths: ["/select/.*"] \ No newline at end of file From 70939149ea29154a2a155c4d7068b43d087f3770 Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Fri, 19 Jun 2026 16:37:33 +0200 Subject: [PATCH 24/29] =?UTF-8?q?feat(observability):=20=E2=9C=A8=20add=20?= =?UTF-8?q?read=20routes=20to=20vmauth=20for=20dev.t09.de=20instance?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../victoria-k8s-stack/manifests/vmauth.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml index 5759093..a4f0368 100644 --- a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml +++ 
b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml @@ -12,6 +12,12 @@ spec: - static: url: http://vmsingle-o12y:8429 paths: ["/api/v1/write"] + - static: + url: http://vmsingle-o12y:8429 + paths: ["/api/v1/.*"] - static: url: http://vlogs-victorialogs:9428 - paths: ["/insert/elasticsearch/.*"] \ No newline at end of file + paths: ["/insert/elasticsearch/.*"] + - static: + url: http://vlogs-victorialogs:9428 + paths: ["/select/.*"] \ No newline at end of file From 3141b7bd6c97ffe21ed1a3e258f649c74e79109e Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Fri, 19 Jun 2026 16:43:21 +0200 Subject: [PATCH 25/29] =?UTF-8?q?feat(observability):=20=E2=9C=A8=20compre?= =?UTF-8?q?hensive=20platform=20alert=20rules?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace ad-hoc forgejo/disk alerts with structured VMRule covering: - platform-health: ForgejoDown, IngressHighErrorRate, NodeNotReady, PodCrashLooping - storage: PVCUsageHigh (>80%), PVCUsageCritical (>90%) - resources: NodeCPUHigh (>85%), NodeMemoryHigh (>90%) --- .../victoria-k8s-stack/manifests/alerts.yaml | 109 +++++++++++++----- 1 file changed, 82 insertions(+), 27 deletions(-) diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml index 110ee7e..cb0f1e3 100644 --- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml +++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml @@ -1,40 +1,95 @@ apiVersion: operator.victoriametrics.com/v1beta1 kind: VMRule metadata: - name: forgejo-alerts + name: edp-platform-alerts namespace: observability spec: groups: - - name: forgejo + - name: platform-health rules: - - alert: forgejo down - expr: sum by(cluster_environment) (up{pod=~"forgejo-server-.*"}) < 1 - for: 30s + - alert: 
ForgejoDown + expr: sum by(cluster_environment) (up{job="forgejo-server-http"}) < 1 + for: 1m labels: severity: critical - job: "{{ $labels.job }}" annotations: - value: "{{ $value }}" - description: 'forgejo is down in cluster environment {{ $labels.cluster_environment }}' - - name: forgejo-backup - rules: - - alert: forgejo s3 backup job failed - expr: max by(cluster_environment) (kube_job_status_failed{job_name=~"forgejo-s3-backup-.*"}) != 0 - for: 30s - labels: - severity: critical - job: "{{ $labels.job }}" - annotations: - value: "{{ $value }}" - description: 'forgejo s3 backup job failed in cluster environment {{ $labels.cluster_environment }}' - - name: disk-consumption-high - rules: - - alert: disk consumption high - expr: 1-(kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.6 - for: 30s + summary: "Forgejo is down on {{ $labels.cluster_environment }}" + description: "Forgejo server has been unreachable for more than 1 minute in cluster {{ $labels.cluster_environment }}." + + - alert: IngressHighErrorRate + expr: | + sum by(cluster_environment) (rate(nginx_ingress_controller_requests{status=~"5.."}[5m])) + / sum by(cluster_environment) (rate(nginx_ingress_controller_requests[5m])) > 0.05 + for: 5m labels: severity: major - job: "{{ $labels.job }}" annotations: - value: "{{ $value }}" - description: 'disk consumption of pvc {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is high in cluster environment {{ $labels.cluster_environment }}' + summary: "High ingress 5xx rate on {{ $labels.cluster_environment }}" + description: "More than 5% of ingress requests are returning 5xx errors for over 5 minutes." 
+ value: "{{ $value | humanizePercentage }}" + + - alert: NodeNotReady + expr: kube_node_status_condition{condition="Ready", status="true"} == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Node {{ $labels.node }} not ready on {{ $labels.cluster_environment }}" + description: "Node {{ $labels.node }} has been in NotReady state for more than 5 minutes." + + - alert: PodCrashLooping + expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 15 > 3 + for: 5m + labels: + severity: major + annotations: + summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} crash-looping on {{ $labels.cluster_environment }}" + description: "Pod has restarted more than 3 times in the last 15 minutes." + + - name: storage + rules: + - alert: PVCUsageHigh + expr: | + 1 - (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.80 + for: 5m + labels: + severity: major + annotations: + summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} usage >80%" + description: "PVC usage is at {{ $value | humanizePercentage }} on {{ $labels.cluster_environment }}." + value: "{{ $value | humanizePercentage }}" + + - alert: PVCUsageCritical + expr: | + 1 - (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.90 + for: 5m + labels: + severity: critical + annotations: + summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} usage >90%" + description: "PVC is almost full at {{ $value | humanizePercentage }} on {{ $labels.cluster_environment }}. Immediate action required." + value: "{{ $value | humanizePercentage }}" + + - name: resources + rules: + - alert: NodeCPUHigh + expr: | + 1 - avg by(instance, cluster_environment) (rate(node_cpu_seconds_total{mode="idle"}[5m])) > 0.85 + for: 15m + labels: + severity: major + annotations: + summary: "Node {{ $labels.instance }} CPU >85% on {{ $labels.cluster_environment }}" + description: "Node CPU utilization has been above 85% for 15 minutes." 
+ value: "{{ $value | humanizePercentage }}" + + - alert: NodeMemoryHigh + expr: | + 1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.90 + for: 10m + labels: + severity: major + annotations: + summary: "Node memory >90% on {{ $labels.cluster_environment }}" + description: "Node memory utilization above 90% for 10 minutes." + value: "{{ $value | humanizePercentage }}" From 01c41c93799ddc5bb02342de5ecef9f67b8dc417 Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Mon, 22 Jun 2026 10:34:58 +0200 Subject: [PATCH 26/29] =?UTF-8?q?fix(observability):=20=F0=9F=90=9B=20use?= =?UTF-8?q?=20cluster=5Fenvironment=20as=20global=20clusterLabel=20for=20d?= =?UTF-8?q?efault=20dashboards?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Default Victoria Metrics k8s dashboards were filtering on 'cluster' label which only contained 'observability'. Our metrics use 'cluster_environment' label which contains the actual cluster values: dev, edp, observability. 
--- .../stacks/observability/victoria-k8s-stack/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/values.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/values.yaml index c535829..4868e3a 100644 --- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/values.yaml +++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/values.yaml @@ -1,6 +1,6 @@ global: # -- Cluster label to use for dashboards and rules - clusterLabel: cluster + clusterLabel: cluster_environment # -- Global license configuration license: key: "" From 3ed3487e972d2509256d2cf78a6b0b1cdf59e04c Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Mon, 22 Jun 2026 10:40:43 +0200 Subject: [PATCH 27/29] =?UTF-8?q?fix(observability):=20=F0=9F=90=9B=20hard?= =?UTF-8?q?en=20vmagent=20liveness=20probe=20failureThreshold=2010?= =?UTF-8?q?=E2=86=923?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Silent outage for 72h went undetected due to lenient probe. Add startupProbe (failureThreshold=30) to allow slow starts. --- .../vm-client-stack/values.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml index 9224a46..06930b0 100644 --- a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml +++ b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml @@ -801,6 +801,20 @@ vmagent: # Do not store original labels in vmagent's memory by default. This reduces the amount of memory used by vmagent # but makes vmagent debugging UI less informative. 
See: https://docs.victoriametrics.com/vmagent/#relabel-debug promscrape.dropOriginalLabels: "true" + # Harden liveness probe: default failureThreshold=10 masked a 72h silent outage + livenessProbe: + httpGet: + path: /health + port: http + failureThreshold: 3 + periodSeconds: 5 + timeoutSeconds: 5 + startupProbe: + httpGet: + path: /health + port: http + failureThreshold: 30 + periodSeconds: 5 # -- (object) VMAgent ingress configuration ingress: enabled: false From eda2812d47d59918492e44bc48539c52d77579a8 Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Mon, 22 Jun 2026 10:45:49 +0200 Subject: [PATCH 28/29] =?UTF-8?q?fix(observability):=20=F0=9F=94=87=20sile?= =?UTF-8?q?nce=20managed-K8s=20false=20alerts=20+=20bump=20backup=20deadli?= =?UTF-8?q?ne=20to=204h?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Disable kubernetesSystemControllerManager, kubeScheduler, kubernetesSystemScheduler alert rules on dev, benchmark, edp clusters (unreachable on managed K8s) - Bump forgejo s3 backup activeDeadlineSeconds 7200→14400 (2h→4h) across all instances; deadline hit Jun 20-21 on heavy sync --- .../forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml | 2 +- .../stacks/observability-client/vm-client-stack/values.yaml | 6 +++--- .../forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml | 2 +- .../stacks/observability-client/vm-client-stack/values.yaml | 6 +++--- .../forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml | 2 +- .../stacks/observability-client/vm-client-stack/values.yaml | 6 +++--- .../forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml | 2 +- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/otc/benchmark.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml b/otc/benchmark.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml index 12883a9..b99a903 100644 --- 
a/otc/benchmark.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml +++ b/otc/benchmark.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml @@ -12,7 +12,7 @@ spec: jobTemplate: spec: # 2h window: handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m - activeDeadlineSeconds: 7200 + activeDeadlineSeconds: 14400 backoffLimit: 2 ttlSecondsAfterFinished: 259200 # template: diff --git a/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/values.yaml b/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/values.yaml index 4bc089d..7f6dd00 100644 --- a/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/values.yaml +++ b/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/values.yaml @@ -201,13 +201,13 @@ defaultRules: create: true rules: {} kubernetesSystemControllerManager: - create: true + create: false rules: {} kubeScheduler: - create: true + create: false rules: {} kubernetesSystemScheduler: - create: true + create: false rules: {} kubeStateMetrics: create: true diff --git a/otc/dev.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml b/otc/dev.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml index d313b18..a8e236f 100644 --- a/otc/dev.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml +++ b/otc/dev.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml @@ -12,7 +12,7 @@ spec: jobTemplate: spec: # 2h window: handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m - activeDeadlineSeconds: 7200 + activeDeadlineSeconds: 14400 backoffLimit: 2 ttlSecondsAfterFinished: 259200 # template: diff --git a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml index 06930b0..c6d6b3a 100644 --- 
a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml +++ b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml @@ -201,13 +201,13 @@ defaultRules: create: true rules: {} kubernetesSystemControllerManager: - create: true + create: false rules: {} kubeScheduler: - create: true + create: false rules: {} kubernetesSystemScheduler: - create: true + create: false rules: {} kubeStateMetrics: create: true diff --git a/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml b/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml index 7226bd2..dd51f5b 100644 --- a/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml +++ b/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml @@ -12,7 +12,7 @@ spec: jobTemplate: spec: # 2h window: handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m - activeDeadlineSeconds: 7200 + activeDeadlineSeconds: 14400 backoffLimit: 2 ttlSecondsAfterFinished: 259200 # template: diff --git a/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/values.yaml b/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/values.yaml index 255e9e5..a7ba239 100644 --- a/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/values.yaml +++ b/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/values.yaml @@ -201,13 +201,13 @@ defaultRules: create: true rules: {} kubernetesSystemControllerManager: - create: true + create: false rules: {} kubeScheduler: - create: true + create: false rules: {} kubernetesSystemScheduler: - create: true + create: false rules: {} kubeStateMetrics: create: true diff --git a/otc/observability.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml 
b/otc/observability.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml index a1caaae..9f86064 100644 --- a/otc/observability.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml +++ b/otc/observability.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml @@ -12,7 +12,7 @@ spec: jobTemplate: spec: # 2h window: handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m - activeDeadlineSeconds: 7200 + activeDeadlineSeconds: 14400 backoffLimit: 2 ttlSecondsAfterFinished: 259200 # template: From 7a6f96a8b4f80b1ccbd8e5b1b1d312e103ec453f Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Mon, 22 Jun 2026 11:05:43 +0200 Subject: [PATCH 29/29] =?UTF-8?q?feat(observability):=20=E2=9C=A8=20add=20?= =?UTF-8?q?cluster=20heartbeat=20dead-man=20switch=20alerts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ClusterMetricsSilent: fires if no kubelet metrics for >10m (catches vmagent outages). ClusterAPIServerDown: fires if apiserver scrape fails for >5m. Replaces silenced KubeControllerManagerDown/KubeSchedulerDown which never fire on managed K8s. --- .../victoria-k8s-stack/manifests/alerts.yaml | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml index cb0f1e3..2cce6a3 100644 --- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml +++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml @@ -93,3 +93,27 @@ spec: summary: "Node memory >90% on {{ $labels.cluster_environment }}" description: "Node memory utilization above 90% for 10 minutes." 
value: "{{ $value | humanizePercentage }}" + + - name: cluster-health + rules: + - alert: ClusterMetricsSilent # dead-man switch: any cluster seen in the last 24h that is silent now, plus dev by name + expr: | + (count by (cluster_environment) (last_over_time(up{job="kubelet"}[1d])) + unless count by (cluster_environment) (up{job="kubelet"})) + or sum by (cluster_environment) (absent(up{job="kubelet", cluster_environment="dev"})) + for: 10m + labels: + severity: critical + annotations: + summary: "Cluster {{ $labels.cluster_environment }} stopped sending metrics" + description: "No kubelet metrics received from cluster {{ $labels.cluster_environment }} for over 10 minutes. Either vmagent is dead or the cluster is unreachable." + + - alert: ClusterAPIServerDown + expr: | + up{job="apiserver", cluster_environment=~".+"} == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "API server down on {{ $labels.cluster_environment }}" + description: "Kubernetes API server scrape is failing on cluster {{ $labels.cluster_environment }}."