42 changed files with 66 additions and 1238 deletions
--- a/otc/benchmark.t09.de/stacks/core/argocd/values.yaml
+++ b/otc/benchmark.t09.de/stacks/core/argocd/values.yaml
@ -35,30 +35,6 @@ configs:
  tls:
    certificates:
 controller:
  metrics:
    enabled: true
    serviceMonitor:
      enabled: false
 server:
  metrics:
    enabled: true
    serviceMonitor:
      enabled: false
 repoServer:
  metrics:
    enabled: true
    serviceMonitor:
      enabled: false
 applicationSet:
  metrics:
    enabled: true
    serviceMonitor:
      enabled: false
 notifications:
  enabled: false
--- a/otc/benchmark.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml
+++ b/otc/benchmark.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml
@ -11,8 +11,8 @@ spec:
  startingDeadlineSeconds: 600 # 10 minutes
  jobTemplate:
    spec:
-      # 2h window: handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m
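+      # Time budget: 60 min until the next backup - 10 min start deadline - (backoffLimit * activeDeadlineSeconds) - a small time-sync buffer. NOTE(review): activeDeadlineSeconds bounds the whole Job including retries, not each attempt - confirm this arithmetic.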
-      activeDeadlineSeconds: 14400
+      activeDeadlineSeconds: 1350
      backoffLimit: 2
      ttlSecondsAfterFinished: 259200 #
      template:
--- a/otc/benchmark.t09.de/stacks/observability-client/vector/values.yaml
+++ b/otc/benchmark.t09.de/stacks/observability-client/vector/values.yaml
@ -48,7 +48,7 @@ customConfig:
      type: elasticsearch
      inputs: [parser]
      endpoints: 
-        - https://o12y.observability.buildth.ing/insert/elasticsearch/
+        - https://o12y.observability.buildth.ing/insert/elasticsearch/
      auth:
        strategy: basic
        user: ${VECTOR_USER}
--- a/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/manifests/.gitkeep
+++ b/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/manifests/.gitkeep
--- a/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml
+++ b/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml
@ -1,15 +0,0 @@
 apiVersion: operator.victoriametrics.com/v1beta1
 kind: VMServiceScrape
 metadata:
  name: forgejo
  namespace: observability
 spec:
  namespaceSelector:
    matchNames:
      - gitea
  selector:
    matchLabels:
      app.kubernetes.io/name: forgejo
  endpoints:
    - port: http
      path: /metrics
--- a/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/values.yaml
+++ b/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/values.yaml
@ -201,13 +201,13 @@ defaultRules:
      create: true
      rules: {}
    kubernetesSystemControllerManager:
-      create: false
+      create: true
      rules: {}
    kubeScheduler:
-      create: false
+      create: true
      rules: {}
    kubernetesSystemScheduler:
-      create: false
+      create: true
      rules: {}
    kubeStateMetrics:
      create: true
@ -778,7 +778,7 @@ vmagent:
  # -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec)
  additionalRemoteWrites:
    # []
-    - url: https://o12y.observability.buildth.ing/api/v1/write
+    - url: https://o12y.observability.buildth.ing/api/v1/write
      basicAuth:
        username:
          name: simple-user-secret
--- a/otc/dev.t09.de/stacks/core/argocd/values.yaml
+++ b/otc/dev.t09.de/stacks/core/argocd/values.yaml
@ -35,30 +35,6 @@ configs:
  tls:
    certificates:
 controller:
  metrics:
    enabled: true
    serviceMonitor:
      enabled: false
 server:
  metrics:
    enabled: true
    serviceMonitor:
      enabled: false
 repoServer:
  metrics:
    enabled: true
    serviceMonitor:
      enabled: false
 applicationSet:
  metrics:
    enabled: true
    serviceMonitor:
      enabled: false
 notifications:
  enabled: false
--- a/otc/dev.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml
+++ b/otc/dev.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml
@ -11,8 +11,8 @@ spec:
  startingDeadlineSeconds: 600 # 10 minutes
  jobTemplate:
    spec:
-      # 2h window: handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m
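+      # Time budget: 60 min until the next backup - 10 min start deadline - (backoffLimit * activeDeadlineSeconds) - a small time-sync buffer. NOTE(review): activeDeadlineSeconds bounds the whole Job including retries, not each attempt - confirm this arithmetic.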
-      activeDeadlineSeconds: 14400
+      activeDeadlineSeconds: 1350
      backoffLimit: 2
      ttlSecondsAfterFinished: 259200 #
      template:
--- a/otc/dev.t09.de/stacks/garm/garm/values.yaml
+++ b/otc/dev.t09.de/stacks/garm/garm/values.yaml
@ -41,8 +41,5 @@ providerConfig:
      sidecarImage: edp.buildth.ing/devfw-cicd/ci-sizer-collector:0.0.4
 garm:
  metrics:
    enable: true
    disableAuth: true
  logging:
    logLevel: info
--- a/otc/dev.t09.de/stacks/observability-client/vector/values.yaml
+++ b/otc/dev.t09.de/stacks/observability-client/vector/values.yaml
@ -48,7 +48,7 @@ customConfig:
      type: elasticsearch
      inputs: [parser]
      endpoints: 
-        - https://o12y.observability.buildth.ing/insert/elasticsearch/
+        - https://o12y.observability.buildth.ing/insert/elasticsearch/
      auth:
        strategy: basic
        user: ${VECTOR_USER}
--- a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/.gitkeep
+++ b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/.gitkeep
--- a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/argocd-scrape.yaml
+++ b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/argocd-scrape.yaml
@ -1,14 +0,0 @@
 apiVersion: operator.victoriametrics.com/v1beta1
 kind: VMServiceScrape
 metadata:
  name: argocd
  namespace: observability
 spec:
  namespaceSelector:
    matchNames:
      - argocd
  selector:
    matchLabels:
      app.kubernetes.io/part-of: argocd
  endpoints:
    - port: http-metrics
--- a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml
+++ b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml
@ -1,15 +0,0 @@
 apiVersion: operator.victoriametrics.com/v1beta1
 kind: VMServiceScrape
 metadata:
  name: forgejo
  namespace: observability
 spec:
  namespaceSelector:
    matchNames:
      - gitea
  selector:
    matchLabels:
      app.kubernetes.io/name: forgejo
  endpoints:
    - port: http
      path: /metrics
--- a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/garm-scrape.yaml
+++ b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/garm-scrape.yaml
@ -1,15 +0,0 @@
 apiVersion: operator.victoriametrics.com/v1beta1
 kind: VMServiceScrape
 metadata:
  name: garm
  namespace: observability
 spec:
  namespaceSelector:
    matchNames:
      - garm
  selector:
    matchLabels:
      app.kubernetes.io/name: garm
  endpoints:
    - port: http
      path: /metrics
--- a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml
+++ b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml
@ -201,13 +201,13 @@ defaultRules:
      create: true
      rules: {}
    kubernetesSystemControllerManager:
-      create: false
+      create: true
      rules: {}
    kubeScheduler:
-      create: false
+      create: true
      rules: {}
    kubernetesSystemScheduler:
-      create: false
+      create: true
      rules: {}
    kubeStateMetrics:
      create: true
@ -778,7 +778,7 @@ vmagent:
  # -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec)
  additionalRemoteWrites:
    # []
-    - url: https://o12y.observability.buildth.ing/api/v1/write
+    - url: https://o12y.observability.buildth.ing/api/v1/write
      basicAuth:
        username:
          name: simple-user-secret
@ -801,20 +801,6 @@ vmagent:
      # Do not store original labels in vmagent's memory by default. This reduces the amount of memory used by vmagent
      # but makes vmagent debugging UI less informative. See: https://docs.victoriametrics.com/vmagent/#relabel-debug
      promscrape.dropOriginalLabels: "true"
    # Harden liveness probe: default failureThreshold=10 masked a 72h silent outage
    livenessProbe:
      httpGet:
        path: /health
        port: http
      failureThreshold: 3
      periodSeconds: 5
      timeoutSeconds: 5
    startupProbe:
      httpGet:
        path: /health
        port: http
      failureThreshold: 30
      periodSeconds: 5
  # -- (object) VMAgent ingress configuration
  ingress:
    enabled: false
--- a/otc/dev.t09.de/stacks/observability/grafana-operator/manifests/grafana.yaml
+++ b/otc/dev.t09.de/stacks/observability/grafana-operator/manifests/grafana.yaml
@ -35,10 +35,8 @@ spec:
    server:
      root_url: "https://grafana.dev.t09.de"
    auth:
      disable_login: "true"
      disable_login_form: "true"
    security:
      admin_user: admin
      admin_password: admin
    auth.generic_oauth:
      enabled: "true"
      name: Forgejo
--- a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack.yaml
+++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack.yaml
@ -9,13 +9,10 @@ spec:
  project: default
  syncPolicy:
    automated:
      prune: true
      selfHeal: true
    syncOptions:
      - CreateNamespace=true
      - ServerSideApply=true
      - RespectIgnoreDifferences=true
      - SkipDryRunOnMissingResource=true
  destination:
    name: in-cluster
    namespace: observability
--- a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml
+++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml
@ -11,4 +11,4 @@ spec:
    matchLabels:
      app.kubernetes.io/name: garm
  endpoints:
-    - port: http
+    - port: metrics
--- a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vlogs.yaml
+++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vlogs.yaml
@ -1,5 +1,5 @@
 apiVersion: operator.victoriametrics.com/v1beta1
-kind: VLogs
+kind: VLSingle
 metadata:
  name: victorialogs
  namespace: observability
--- a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml
+++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml
@ -12,12 +12,6 @@ spec:
    - static:
        url: http://vmsingle-o12y:8429
      paths: ["/api/v1/write"]
    - static:
        url: http://vmsingle-o12y:8429
      paths: ["/api/v1/.*"]
    - static:
        url: http://vlogs-victorialogs:9428
      paths: ["/insert/elasticsearch/.*"]
    - static:
        url: http://vlogs-victorialogs:9428
      paths: ["/select/.*"]
--- a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/values.yaml
+++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/values.yaml
@ -28,7 +28,10 @@ victoria-metrics-operator:
  crds:
    plain: true
    cleanup:
-      enabled: false  # disabled: cleanup hook can't schedule on resource-constrained nodes (Insufficient cpu / Too many pods)
+      enabled: true
      image:
        repository: bitnami/kubectl
        pullPolicy: IfNotPresent
  serviceMonitor:
    enabled: true
  operator:
@ -673,7 +676,7 @@ vmalert:
 vmauth:
  # -- Enable VMAuth CR
-  enabled: false
+  enabled: true
  # -- VMAuth annotations
  annotations: {}
  # -- (object) Full spec for VMAuth CRD. Allowed values described [here](https://docs.victoriametrics.com/operator/api#vmauthspec)
@ -696,7 +699,7 @@ vmauth:
 vmagent:
  # -- Create VMAgent CR
-  enabled: true
+  enabled: false
  # -- VMAgent annotations
  annotations: {}
  # -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec)
@ -708,8 +711,7 @@ vmagent:
    port: "8429"
    selectAllByDefault: true
    scrapeInterval: 20s
-    externalLabels:
+    externalLabels: {}
      cluster_environment: "dev"
      # For multi-cluster setups it is useful to use "cluster" label to identify the metrics source.
      # For example:
      # cluster: cluster-name
--- a/otc/edp.buildth.ing/stacks/core/argocd/values.yaml
+++ b/otc/edp.buildth.ing/stacks/core/argocd/values.yaml
@ -35,30 +35,6 @@ configs:
  tls:
    certificates:
 controller:
  metrics:
    enabled: true
    serviceMonitor:
      enabled: false
 server:
  metrics:
    enabled: true
    serviceMonitor:
      enabled: false
 repoServer:
  metrics:
    enabled: true
    serviceMonitor:
      enabled: false
 applicationSet:
  metrics:
    enabled: true
    serviceMonitor:
      enabled: false
 notifications:
  enabled: false
--- a/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml
+++ b/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml
@ -11,8 +11,8 @@ spec:
  startingDeadlineSeconds: 600 # 10 minutes
  jobTemplate:
    spec:
-      # 2h window: handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m
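+      # Time budget: 60 min until the next backup - 10 min start deadline - (backoffLimit * activeDeadlineSeconds) - a small time-sync buffer. NOTE(review): activeDeadlineSeconds bounds the whole Job including retries, not each attempt - confirm this arithmetic.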
-      activeDeadlineSeconds: 14400
+      activeDeadlineSeconds: 1350
      backoffLimit: 2
      ttlSecondsAfterFinished: 259200 #
      template:
--- a/otc/edp.buildth.ing/stacks/observability-client/vector/values.yaml
+++ b/otc/edp.buildth.ing/stacks/observability-client/vector/values.yaml
@ -48,7 +48,7 @@ customConfig:
      type: elasticsearch
      inputs: [parser]
      endpoints: 
-        - https://o12y.observability.buildth.ing/insert/elasticsearch/
+        - https://o12y.observability.buildth.ing/insert/elasticsearch/
      auth:
        strategy: basic
        user: ${VECTOR_USER}
--- a/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/manifests/.gitkeep
+++ b/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/manifests/.gitkeep
--- a/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml
+++ b/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml
@ -1,15 +0,0 @@
 apiVersion: operator.victoriametrics.com/v1beta1
 kind: VMServiceScrape
 metadata:
  name: forgejo
  namespace: observability
 spec:
  namespaceSelector:
    matchNames:
      - gitea
  selector:
    matchLabels:
      app.kubernetes.io/name: forgejo
  endpoints:
    - port: http
      path: /metrics
--- a/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/values.yaml
+++ b/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/values.yaml
@ -201,13 +201,13 @@ defaultRules:
      create: true
      rules: {}
    kubernetesSystemControllerManager:
-      create: false
+      create: true
      rules: {}
    kubeScheduler:
-      create: false
+      create: true
      rules: {}
    kubernetesSystemScheduler:
-      create: false
+      create: true
      rules: {}
    kubeStateMetrics:
      create: true
@ -778,7 +778,7 @@ vmagent:
  # -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec)
  additionalRemoteWrites:
    # []
-    - url: https://o12y.observability.buildth.ing/api/v1/write
+    - url: https://o12y.observability.buildth.ing/api/v1/write
      basicAuth:
        username:
          name: simple-user-secret
--- a/otc/observability.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml
+++ b/otc/observability.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml
@ -11,8 +11,8 @@ spec:
  startingDeadlineSeconds: 600 # 10 minutes
  jobTemplate:
    spec:
-      # 2h window: handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m
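+      # Time budget: 60 min until the next backup - 10 min start deadline - (backoffLimit * activeDeadlineSeconds) - a small time-sync buffer. NOTE(review): activeDeadlineSeconds bounds the whole Job including retries, not each attempt - confirm this arithmetic.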
-      activeDeadlineSeconds: 14400
+      activeDeadlineSeconds: 1350
      backoffLimit: 2
      ttlSecondsAfterFinished: 259200 #
      template:
--- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd-operational.yaml
+++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd-operational.yaml
@ -1,153 +0,0 @@
 apiVersion: grafana.integreatly.org/v1beta1
 kind: GrafanaDashboard
 metadata:
  name: argocd-operational
 spec:
  instanceSelector:
    matchLabels:
      dashboards: "grafana"
  folder: "EDP / Applications"
  json: |
    {
      "annotations": {"list": []},
      "editable": true,
      "graphTooltip": 1,
      "panels": [
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
          "title": "Application Status",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}},
          "gridPos": {"h": 4, "w": 4, "x": 0, "y": 1},
          "title": "Total Apps",
          "type": "stat",
          "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}},
          "gridPos": {"h": 4, "w": 4, "x": 4, "y": 1},
          "title": "Healthy",
          "type": "stat",
          "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", health_status=\"Healthy\"})", "legendFormat": ""}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}]}}},
          "gridPos": {"h": 4, "w": 4, "x": 8, "y": 1},
          "title": "Degraded",
          "type": "stat",
          "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", health_status=\"Degraded\"}) or vector(0)", "legendFormat": ""}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}},
          "gridPos": {"h": 4, "w": 4, "x": 12, "y": 1},
          "title": "Synced",
          "type": "stat",
          "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", sync_status=\"Synced\"})", "legendFormat": ""}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "yellow", "value": null}]}}},
          "gridPos": {"h": 4, "w": 4, "x": 16, "y": 1},
          "title": "OutOfSync",
          "type": "stat",
          "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", sync_status=\"OutOfSync\"}) or vector(0)", "legendFormat": ""}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "orange", "value": null}]}}},
          "gridPos": {"h": 4, "w": 4, "x": 20, "y": 1},
          "title": "Progressing",
          "type": "stat",
          "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", health_status=\"Progressing\"}) or vector(0)", "legendFormat": ""}]
        },
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 5},
          "title": "Application Details",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {
            "defaults": {"custom": {"filterable": true}},
            "overrides": [
              {"matcher": {"id": "byName", "options": "Health"}, "properties": [{"id": "custom.cellOptions", "value": {"type": "color-text"}}, {"id": "mappings", "value": [{"options": {"Healthy": {"color": "green", "text": "Healthy"}, "Degraded": {"color": "red", "text": "Degraded"}, "Progressing": {"color": "yellow", "text": "Progressing"}, "Missing": {"color": "purple", "text": "Missing"}}, "type": "value"}]}]},
              {"matcher": {"id": "byName", "options": "Sync"}, "properties": [{"id": "custom.cellOptions", "value": {"type": "color-text"}}, {"id": "mappings", "value": [{"options": {"Synced": {"color": "green", "text": "Synced"}, "OutOfSync": {"color": "orange", "text": "OutOfSync"}}, "type": "value"}]}]}
            ]
          },
          "gridPos": {"h": 12, "w": 24, "x": 0, "y": 6},
          "title": "All Applications",
          "type": "table",
          "targets": [{"expr": "argocd_app_info{cluster_environment=~\"$cluster_environment\"}", "format": "table", "instant": true, "legendFormat": ""}],
          "transformations": [
            {"id": "filterFieldsByName", "options": {"include": {"names": ["cluster_environment", "name", "dest_namespace", "health_status", "sync_status", "repo"]}}},
            {"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "name": "Application", "dest_namespace": "Namespace", "health_status": "Health", "sync_status": "Sync", "repo": "Repository"}}}
          ]
        },
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 18},
          "title": "Sync Activity",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "ops"}},
          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 19},
          "title": "Sync Operations (rate)",
          "type": "timeseries",
          "targets": [{"expr": "sum(rate(argocd_app_sync_total{cluster_environment=~\"$cluster_environment\"}[5m])) by (name, phase)", "legendFormat": "{{name}} ({{phase}})"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "ops"}},
          "gridPos": {"h": 8, "w": 12, "x": 12, "y": 19},
          "title": "Reconciliation Rate",
          "type": "timeseries",
          "targets": [{"expr": "sum(rate(argocd_app_reconcile_count{cluster_environment=~\"$cluster_environment\"}[5m])) by (namespace)", "legendFormat": "{{namespace}}"}]
        },
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 27},
          "title": "ArgoCD Logs",
          "type": "row"
        },
        {
          "datasource": {"type": "victoriametrics-logs-datasource"},
          "gridPos": {"h": 10, "w": 24, "x": 0, "y": 28},
          "title": "ArgoCD Logs",
          "type": "logs",
          "targets": [{"expr": "{cluster_environment=~\"$cluster_environment\", kubernetes.namespace=\"argocd\"}", "refId": "A"}],
          "options": {"showTime": true, "showLabels": true, "wrapLogMessage": true, "enableLogDetails": true, "sortOrder": "Descending"}
        }
      ],
      "schemaVersion": 39,
      "tags": ["edp", "argocd", "gitops"],
      "templating": {
        "list": [
          {
            "current": {"selected": true, "text": "All", "value": "$__all"},
            "datasource": {"type": "prometheus"},
            "definition": "label_values(argocd_app_info, cluster_environment)",
            "includeAll": true,
            "multi": true,
            "name": "cluster_environment",
            "label": "Environment",
            "query": "label_values(argocd_app_info, cluster_environment)",
            "refresh": 2,
            "sort": 1,
            "type": "query"
          }
        ]
      },
      "time": {"from": "now-6h", "to": "now"},
      "title": "ArgoCD Operations",
      "uid": "edp-argocd-ops"
    }
--- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd.yaml
+++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd.yaml
@ -6,5 +6,4 @@ spec:
  instanceSelector:
    matchLabels:
      dashboards: "grafana"
  folder: "EDP / Applications"
  url: "https://raw.githubusercontent.com/argoproj/argo-cd/refs/heads/master/examples/dashboard.json"
--- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml
+++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml
@ -1,103 +0,0 @@
 apiVersion: grafana.integreatly.org/v1beta1
 kind: GrafanaDashboard
 metadata:
  name: cronjob-monitoring
 spec:
  instanceSelector:
    matchLabels:
      dashboards: "grafana"
  folder: "EDP / Operations"
  json: |
    {
      "annotations": {"list": []},
      "editable": true,
      "graphTooltip": 1,
      "panels": [
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
          "title": "Backup Job Status",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}}},
          "gridPos": {"h": 5, "w": 12, "x": 0, "y": 1},
          "title": "Time Since Last Schedule",
          "type": "stat",
          "targets": [{"expr": "time() - kube_cronjob_status_last_schedule_time{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cronjob}} ({{cluster_environment}})"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}}},
          "gridPos": {"h": 5, "w": 12, "x": 12, "y": 1},
          "title": "Failed Jobs (Active)",
          "type": "stat",
          "targets": [{"expr": "sum by(cluster_environment, job_name) (kube_job_status_failed{cluster_environment=~\"$cluster_environment\"}) > 0", "legendFormat": "{{job_name}} ({{cluster_environment}})"}]
        },
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 6},
          "title": "CronJob Overview",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"custom": {"filterable": true}}, "overrides": [{"matcher": {"id": "byName", "options": "Suspended"}, "properties": [{"id": "mappings", "value": [{"options": {"0": {"text": "No", "color": "green"}, "1": {"text": "YES", "color": "red"}}, "type": "value"}]}]}]},
          "gridPos": {"h": 8, "w": 24, "x": 0, "y": 7},
          "title": "All CronJobs",
          "type": "table",
          "targets": [
            {"expr": "kube_cronjob_info{cluster_environment=~\"$cluster_environment\"}", "format": "table", "instant": true, "refId": "A"}
          ],
          "transformations": [
            {"id": "filterFieldsByName", "options": {"include": {"names": ["cluster_environment", "cronjob", "namespace", "schedule"]}}},
            {"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "cronjob": "CronJob", "namespace": "Namespace", "schedule": "Schedule"}}}
          ]
        },
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 15},
          "title": "Job History",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
          "title": "Job Completions (24h)",
          "type": "timeseries",
          "targets": [{"expr": "sum(kube_job_status_succeeded{cluster_environment=~\"$cluster_environment\"}) by (job_name, cluster_environment)", "legendFormat": "{{job_name}} ({{cluster_environment}})"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short", "color": {"mode": "palette-classic"}}},
          "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
          "title": "Job Failures (24h)",
          "type": "timeseries",
          "targets": [{"expr": "sum(kube_job_status_failed{cluster_environment=~\"$cluster_environment\"}) by (job_name, cluster_environment)", "legendFormat": "{{job_name}} ({{cluster_environment}})"}]
        }
      ],
      "schemaVersion": 39,
      "tags": ["edp", "backup", "cronjob"],
      "templating": {
        "list": [
          {
            "current": {"selected": true, "text": "All", "value": "$__all"},
            "datasource": {"type": "prometheus"},
            "definition": "label_values(kube_cronjob_info, cluster_environment)",
            "includeAll": true,
            "multi": true,
            "name": "cluster_environment",
            "label": "Environment",
            "query": "label_values(kube_cronjob_info, cluster_environment)",
            "refresh": 2,
            "sort": 1,
            "type": "query"
          }
        ]
      },
      "time": {"from": "now-24h", "to": "now"},
      "title": "CronJob & Backup Monitoring",
      "uid": "edp-cronjobs"
    }
--- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/forgejo.yaml
+++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/forgejo.yaml
@ -1,207 +0,0 @@
 apiVersion: grafana.integreatly.org/v1beta1
 kind: GrafanaDashboard
 metadata:
  name: forgejo
 spec:
  instanceSelector:
    matchLabels:
      dashboards: "grafana"
  folder: "EDP / Applications"
  json: |
    {
      "annotations": {"list": []},
      "editable": true,
      "graphTooltip": 1,
      "panels": [
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
          "title": "Forgejo Health",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"mappings": [{"options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}, "type": "value"}], "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]}}},
          "gridPos": {"h": 4, "w": 4, "x": 0, "y": 1},
          "title": "Status",
          "type": "stat",
          "targets": [{"expr": "up{job=\"forgejo-server-http\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 4, "x": 4, "y": 1},
          "title": "Version",
          "type": "stat",
          "targets": [{"expr": "gitea_build_info{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{version}}"}],
          "options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "name"}
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 4, "x": 8, "y": 1},
          "title": "Repositories",
          "type": "stat",
          "targets": [{"expr": "gitea_repositories{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 4, "x": 12, "y": 1},
          "title": "Users",
          "type": "stat",
          "targets": [{"expr": "gitea_users{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 4, "x": 16, "y": 1},
          "title": "Organizations",
          "type": "stat",
          "targets": [{"expr": "gitea_organizations{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 4, "x": 20, "y": 1},
          "title": "Teams",
          "type": "stat",
          "targets": [{"expr": "gitea_teams{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 5},
          "title": "Activity",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 6, "x": 0, "y": 6},
          "title": "Open Issues",
          "type": "stat",
          "targets": [{"expr": "gitea_issues_open{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 6, "x": 6, "y": 6},
          "title": "Closed Issues",
          "type": "stat",
          "targets": [{"expr": "gitea_issues_closed{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 6, "x": 12, "y": 6},
          "title": "Webhooks",
          "type": "stat",
          "targets": [{"expr": "gitea_webhooks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 6, "x": 18, "y": 6},
          "title": "Hook Tasks",
          "type": "stat",
          "targets": [{"expr": "gitea_hooktasks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 10},
          "title": "Content & Auth",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 4, "x": 0, "y": 11},
          "title": "Stars",
          "type": "stat",
          "targets": [{"expr": "gitea_stars{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 4, "x": 4, "y": 11},
          "title": "Watches",
          "type": "stat",
          "targets": [{"expr": "gitea_watches{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 4, "x": 8, "y": 11},
          "title": "Releases",
          "type": "stat",
          "targets": [{"expr": "gitea_releases{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 4, "x": 12, "y": 11},
          "title": "Mirrors",
          "type": "stat",
          "targets": [{"expr": "gitea_mirrors{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 4, "x": 16, "y": 11},
          "title": "Public Keys",
          "type": "stat",
          "targets": [{"expr": "gitea_publickeys{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 4, "x": 20, "y": 11},
          "title": "OAuth Apps",
          "type": "stat",
          "targets": [{"expr": "gitea_oauths{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 15},
          "title": "Forgejo Logs",
          "type": "row"
        },
        {
          "datasource": {"type": "victoriametrics-logs-datasource"},
          "gridPos": {"h": 10, "w": 12, "x": 0, "y": 16},
          "title": "Forgejo Server Logs",
          "type": "logs",
          "targets": [{"expr": "{cluster_environment=~\"$cluster_environment\", kubernetes.namespace=\"gitea\"}", "refId": "A"}],
          "options": {"showTime": true, "showLabels": true, "wrapLogMessage": true, "enableLogDetails": true, "sortOrder": "Descending"}
        },
        {
          "datasource": {"type": "victoriametrics-logs-datasource"},
          "gridPos": {"h": 10, "w": 12, "x": 12, "y": 16},
          "title": "Forgejo Errors",
          "type": "logs",
          "targets": [{"expr": "{cluster_environment=~\"$cluster_environment\", kubernetes.namespace=\"gitea\"} error OR Error OR ERROR OR panic", "refId": "A"}],
          "options": {"showTime": true, "showLabels": true, "wrapLogMessage": true, "enableLogDetails": true, "sortOrder": "Descending"}
        }
      ],
      "schemaVersion": 39,
      "tags": ["edp", "forgejo", "gitea"],
      "templating": {
        "list": [
          {
            "current": {"selected": true, "text": "All", "value": "$__all"},
            "datasource": {"type": "prometheus"},
            "definition": "label_values(gitea_repositories, cluster_environment)",
            "includeAll": true,
            "multi": true,
            "name": "cluster_environment",
            "label": "Environment",
            "query": "label_values(gitea_repositories, cluster_environment)",
            "refresh": 2,
            "type": "query"
          }
        ]
      },
      "time": {"from": "now-6h", "to": "now"},
      "title": "Forgejo",
      "uid": "edp-forgejo"
    }
--- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/garm.yaml
+++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/garm.yaml
@ -1,117 +0,0 @@
 apiVersion: grafana.integreatly.org/v1beta1
 kind: GrafanaDashboard
 metadata:
  name: garm
 spec:
  instanceSelector:
    matchLabels:
      dashboards: "grafana"
  folder: "EDP / Applications"
  json: |
    {
      "annotations": {"list": []},
      "editable": true,
      "graphTooltip": 1,
      "panels": [
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
          "title": "GARM Runner Status",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}},
          "gridPos": {"h": 5, "w": 6, "x": 0, "y": 1},
          "title": "Total Runners",
          "type": "stat",
          "targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\"}) or vector(0)", "legendFormat": ""}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}},
          "gridPos": {"h": 5, "w": 6, "x": 6, "y": 1},
          "title": "Idle Runners",
          "type": "stat",
          "targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\", status=\"idle\"}) or vector(0)", "legendFormat": ""}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "yellow", "value": null}]}}},
          "gridPos": {"h": 5, "w": 6, "x": 12, "y": 1},
          "title": "Creating",
          "type": "stat",
          "targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\", status=\"creating\"}) or vector(0)", "legendFormat": ""}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}]}}},
          "gridPos": {"h": 5, "w": 6, "x": 18, "y": 1},
          "title": "Errors",
          "type": "stat",
          "targets": [{"expr": "sum(rate(garm_runner_errors_total{cluster_environment=~\"$cluster_environment\"}[5m])) or vector(0)", "legendFormat": ""}]
        },
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 6},
          "title": "GitHub API Rate Limits",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short", "min": 0}},
          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 7},
          "title": "Rate Limit Remaining",
          "type": "timeseries",
          "targets": [{"expr": "garm_github_rate_limit_remaining{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "ops"}},
          "gridPos": {"h": 8, "w": 12, "x": 12, "y": 7},
          "title": "Runner Operations Rate",
          "type": "timeseries",
          "targets": [{"expr": "sum(rate(garm_runner_operations_total{cluster_environment=~\"$cluster_environment\"}[5m])) by (cluster_environment)", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 15},
          "title": "Runner Details",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"custom": {"filterable": true}}},
          "gridPos": {"h": 8, "w": 24, "x": 0, "y": 16},
          "title": "Runner Pool Status",
          "type": "table",
          "targets": [{"expr": "garm_runner_status{cluster_environment=~\"$cluster_environment\"}", "format": "table", "instant": true}],
          "transformations": [
            {"id": "filterFieldsByName", "options": {"include": {"names": ["cluster_environment", "name", "status", "pool_owner", "pool_type", "provider"]}}},
            {"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "name": "Runner", "status": "Status", "pool_owner": "Pool Owner", "pool_type": "Type", "provider": "Provider"}}}
          ]
        }
      ],
      "schemaVersion": 39,
      "tags": ["edp", "garm", "ci-cd", "runners"],
      "templating": {
        "list": [
          {
            "current": {"selected": true, "text": "All", "value": "$__all"},
            "datasource": {"type": "prometheus"},
            "definition": "label_values(garm_runner_status, cluster_environment)",
            "includeAll": true,
            "multi": true,
            "name": "cluster_environment",
            "label": "Environment",
            "query": "label_values(garm_runner_status, cluster_environment)",
            "refresh": 2,
            "sort": 1,
            "type": "query"
          }
        ]
      },
      "time": {"from": "now-6h", "to": "now"},
      "title": "GARM Runners",
      "uid": "edp-garm"
    }
--- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/ingress-nginx.yaml
+++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/ingress-nginx.yaml
@ -6,5 +6,4 @@ spec:
  instanceSelector:
    matchLabels:
      dashboards: "grafana"
  folder: "EDP / Operations"
  url: "https://raw.githubusercontent.com/adinhodovic/ingress-nginx-mixin/refs/heads/main/dashboards_out/ingress-nginx-overview.json"
--- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml
+++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml
@ -1,245 +0,0 @@
 apiVersion: grafana.integreatly.org/v1beta1
 kind: GrafanaDashboard
 metadata:
  name: platform-overview
 spec:
  instanceSelector:
    matchLabels:
      dashboards: "grafana"
  folder: "EDP / Overview"
  json: |
    {
      "annotations": {"list": []},
      "editable": true,
      "fiscalYearStartMonth": 0,
      "graphTooltip": 1,
      "links": [],
      "panels": [
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
          "title": "Platform Health",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {
            "defaults": {
              "mappings": [{"options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}, "type": "value"}],
              "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]}
            }
          },
          "gridPos": {"h": 4, "w": 4, "x": 0, "y": 1},
          "title": "Forgejo",
          "type": "stat",
          "targets": [{"expr": "sum(up{job=\"forgejo-server-http\", cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {
            "defaults": {
              "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 3}]}
            }
          },
          "gridPos": {"h": 4, "w": 4, "x": 4, "y": 1},
          "title": "Ingress 5xx (5m)",
          "type": "stat",
          "targets": [{"expr": "sum(rate(nginx_ingress_controller_requests{status=~\"5..\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": ""}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {
            "defaults": {
              "unit": "short",
              "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}
            }
          },
          "gridPos": {"h": 4, "w": 4, "x": 8, "y": 1},
          "title": "Failed Jobs (24h)",
          "type": "stat",
          "targets": [{"expr": "sum(kube_job_status_failed{namespace=\"gitea\", cluster_environment=~\"$cluster_environment\"}) or vector(0)", "legendFormat": ""}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {
            "defaults": {
              "unit": "percentunit",
              "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.7}, {"color": "red", "value": 0.85}]}
            }
          },
          "gridPos": {"h": 4, "w": 4, "x": 12, "y": 1},
          "title": "Cluster CPU Usage",
          "type": "stat",
          "targets": [{"expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": ""}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {
            "defaults": {
              "unit": "percentunit",
              "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.7}, {"color": "red", "value": 0.85}]}
            }
          },
          "gridPos": {"h": 4, "w": 4, "x": 16, "y": 1},
          "title": "Cluster Memory Usage",
          "type": "stat",
          "targets": [{"expr": "1 - sum(node_memory_MemAvailable_bytes{cluster_environment=~\"$cluster_environment\"}) / sum(node_memory_MemTotal_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {
            "defaults": {
              "unit": "percentunit",
              "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.6}, {"color": "red", "value": 0.8}]}
            }
          },
          "gridPos": {"h": 4, "w": 4, "x": 20, "y": 1},
          "title": "Max PVC Usage",
          "type": "stat",
          "targets": [{"expr": "max(1 - kubelet_volume_stats_available_bytes{cluster_environment=~\"$cluster_environment\"} / kubelet_volume_stats_capacity_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}]
        },
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 5},
          "title": "Forgejo",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 4, "x": 0, "y": 6},
          "title": "Repositories",
          "type": "stat",
          "targets": [{"expr": "gitea_repositories{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 4, "x": 4, "y": 6},
          "title": "Users",
          "type": "stat",
          "targets": [{"expr": "gitea_users{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 4, "x": 8, "y": 6},
          "title": "Organizations",
          "type": "stat",
          "targets": [{"expr": "gitea_organizations{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 4, "x": 12, "y": 6},
          "title": "Open Issues",
          "type": "stat",
          "targets": [{"expr": "gitea_issues_open{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 4, "x": 16, "y": 6},
          "title": "Webhooks",
          "type": "stat",
          "targets": [{"expr": "gitea_webhooks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 4, "x": 20, "y": 6},
          "title": "Mirrors",
          "type": "stat",
          "targets": [{"expr": "gitea_mirrors{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 10},
          "title": "Resources",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}},
          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 11},
          "title": "Node CPU Usage",
          "type": "timeseries",
          "targets": [{"expr": "1 - rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m])", "legendFormat": "{{instance}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}},
          "gridPos": {"h": 8, "w": 12, "x": 12, "y": 11},
          "title": "PVC Usage by Claim",
          "type": "timeseries",
          "targets": [{"expr": "1 - (kubelet_volume_stats_available_bytes{cluster_environment=~\"$cluster_environment\"} / kubelet_volume_stats_capacity_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": "{{namespace}}/{{persistentvolumeclaim}}"}]
        },
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 19},
          "title": "Backups",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}}},
          "gridPos": {"h": 4, "w": 8, "x": 0, "y": 20},
          "title": "Time Since Last Backup Schedule",
          "type": "stat",
          "targets": [{"expr": "time() - kube_cronjob_status_last_schedule_time{cronjob=~\"forgejo-s3-backup|secrets-backup\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cronjob}} ({{cluster_environment}})"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "s"}},
          "gridPos": {"h": 4, "w": 8, "x": 8, "y": 20},
          "title": "Backup Job Duration (Last 7d)",
          "type": "timeseries",
          "targets": [{"expr": "kube_job_status_completion_time{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"} - kube_job_status_start_time{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{job_name}}"}],
          "options": {"legend": {"displayMode": "table"}}
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}}},
          "gridPos": {"h": 4, "w": 8, "x": 16, "y": 20},
          "title": "Failed Backup Jobs (Active)",
          "type": "stat",
          "targets": [{"expr": "sum by(cluster_environment, job_name) (kube_job_status_failed{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"})", "legendFormat": "{{job_name}} ({{cluster_environment}})"}]
        },
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 24},
          "title": "Logs",
          "type": "row"
        },
        {
          "datasource": {"type": "victoriametrics-logs-datasource"},
          "gridPos": {"h": 10, "w": 24, "x": 0, "y": 25},
          "title": "Recent Errors (all namespaces)",
          "type": "logs",
          "targets": [{"expr": "{cluster_environment=~\"$cluster_environment\"} error OR Error OR ERROR OR panic OR PANIC", "refId": "A"}],
          "options": {"showTime": true, "showLabels": true, "showCommonLabels": false, "wrapLogMessage": true, "prettifyLogMessage": false, "enableLogDetails": true, "sortOrder": "Descending", "dedupStrategy": "none"}
        }
      ],
      "schemaVersion": 39,
      "tags": ["edp", "platform", "overview"],
      "templating": {
        "list": [
          {
            "current": {"selected": true, "text": "All", "value": "$__all"},
            "datasource": {"type": "prometheus"},
            "definition": "label_values(up, cluster_environment)",
            "includeAll": true,
            "multi": true,
            "name": "cluster_environment",
            "label": "Environment",
            "query": "label_values(up, cluster_environment)",
            "refresh": 2,
            "sort": 1,
            "type": "query"
          }
        ]
      },
      "time": {"from": "now-6h", "to": "now"},
      "title": "EDP Platform Overview",
      "uid": "edp-platform-overview"
    }
--- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/victoria-logs.yaml
+++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/victoria-logs.yaml
@ -6,7 +6,4 @@ spec:
  instanceSelector:
    matchLabels:
      dashboards: "grafana"
-  folder: "EDP / Operations"
+  url: "https://raw.githubusercontent.com/VictoriaMetrics/VictoriaMetrics/refs/heads/master/dashboards/vm/victorialogs.json"
  grafanaCom:
    id: 22698
    revision: 1
--- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml
+++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml
@ -1,119 +1,40 @@
 apiVersion: operator.victoriametrics.com/v1beta1
 kind: VMRule
 metadata:
-  name: edp-platform-alerts
+  name: forgejo-alerts
  namespace: observability
 spec:
  groups:
-    - name: platform-health
+    - name: forgejo
      rules:
-        - alert: ForgejoDown
+        - alert: forgejo down
-          expr: sum by(cluster_environment) (up{job="forgejo-server-http"}) < 1
+          expr: sum by(cluster_environment) (up{pod=~"forgejo-server-.*"}) < 1
-          for: 1m
+          for: 30s
          labels:
            severity: critical
            job:  "{{ $labels.job }}"
          annotations:
-            summary: "Forgejo is down on {{ $labels.cluster_environment }}"
+            value: "{{ $value }}"
-            description: "Forgejo server has been unreachable for more than 1 minute in cluster {{ $labels.cluster_environment }}."
+            description: 'forgejo is down in cluster environment {{ $labels.cluster_environment }}'
-
+    - name: forgejo-backup
        - alert: IngressHighErrorRate
          expr: |
            sum by(cluster_environment) (rate(nginx_ingress_controller_requests{status=~"5.."}[5m]))
            / sum by(cluster_environment) (rate(nginx_ingress_controller_requests[5m])) > 0.05
          for: 5m
          labels:
            severity: major
          annotations:
            summary: "High ingress 5xx rate on {{ $labels.cluster_environment }}"
            description: "More than 5% of ingress requests are returning 5xx errors for over 5 minutes."
            value: "{{ $value | humanizePercentage }}"
        - alert: NodeNotReady
          expr: kube_node_status_condition{condition="Ready", status="true"} == 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Node {{ $labels.node }} not ready on {{ $labels.cluster_environment }}"
            description: "Node {{ $labels.node }} has been in NotReady state for more than 5 minutes."
        - alert: PodCrashLooping
          expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 15 > 3
          for: 5m
          labels:
            severity: major
          annotations:
            summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} crash-looping on {{ $labels.cluster_environment }}"
            description: "Pod has restarted more than 3 times in the last 15 minutes."
    - name: storage
      rules:
-        - alert: PVCUsageHigh
+        - alert: forgejo s3 backup job failed
-          expr: |
+          expr: max by(cluster_environment) (kube_job_status_failed{job_name=~"forgejo-s3-backup-.*"}) != 0
-            1 - (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.80
+          for: 30s
          for: 5m
          labels:
            severity: major
          annotations:
            summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} usage >80%"
            description: "PVC usage is at {{ $value | humanizePercentage }} on {{ $labels.cluster_environment }}."
            value: "{{ $value | humanizePercentage }}"
        - alert: PVCUsageCritical
          expr: |
            1 - (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.90
          for: 5m
          labels:
            severity: critical
            job:  "{{ $labels.job }}"
          annotations:
-            summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} usage >90%"
+            value: "{{ $value }}"
-            description: "PVC is almost full at {{ $value | humanizePercentage }} on {{ $labels.cluster_environment }}. Immediate action required."
+            description: 'forgejo s3 backup job failed in cluster environment {{ $labels.cluster_environment }}'
-            value: "{{ $value | humanizePercentage }}"
+    - name: disk-consumption-high
    - name: resources
      rules:
-        - alert: NodeCPUHigh
+        - alert: disk consumption high
-          expr: |
+          expr: 1-(kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.6
-            1 - avg by(instance, cluster_environment) (rate(node_cpu_seconds_total{mode="idle"}[5m])) > 0.85
+          for: 30s
          for: 15m
          labels:
            severity: major
            job:  "{{ $labels.job }}"
          annotations:
-            summary: "Node {{ $labels.instance }} CPU >85% on {{ $labels.cluster_environment }}"
+            value: "{{ $value }}"
-            description: "Node CPU utilization has been above 85% for 15 minutes."
+            description: 'disk consumption of pvc {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is high in cluster environment {{ $labels.cluster_environment }}'
            value: "{{ $value | humanizePercentage }}"
        - alert: NodeMemoryHigh
          expr: |
            1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.90
          for: 10m
          labels:
            severity: major
          annotations:
            summary: "Node memory >90% on {{ $labels.cluster_environment }}"
            description: "Node memory utilization above 90% for 10 minutes."
            value: "{{ $value | humanizePercentage }}"
    - name: cluster-health
      rules:
        - alert: ClusterMetricsSilent
          expr: |
            count(up{job="kubelet"}) by (cluster_environment) < 1
            or
            absent(up{job="kubelet", cluster_environment="dev"})
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: "Cluster {{ $labels.cluster_environment }} stopped sending metrics"
            description: "No kubelet metrics received from cluster {{ $labels.cluster_environment }} for over 10 minutes. Either vmagent is dead or the cluster is unreachable."
        - alert: ClusterAPIServerDown
          expr: |
            up{job="apiserver", cluster_environment=~".+"} == 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "API server down on {{ $labels.cluster_environment }}"
            description: "Kubernetes API server scrape is failing on cluster {{ $labels.cluster_environment }}."
--- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/backup-alerts.yaml
+++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/backup-alerts.yaml
@ -1,78 +0,0 @@
 apiVersion: operator.victoriametrics.com/v1beta1
 kind: VMRule
 metadata:
  name: backup-alerts
  namespace: observability
 spec:
  groups:
    - name: backup-schedule-staleness
      # Detects backup CronJobs in the gitea namespace that are no longer being
      # scheduled, or never were.
      rules:
        # Fires when the last schedule time of a backup CronJob is more than
        # 26 hours in the past; $value is that age in seconds.
        - alert: BackupCronJobNotScheduled
          expr: |
            time() - kube_cronjob_status_last_schedule_time{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"}
            > 26 * 3600
          for: 5m
          labels:
            severity: critical
            cronjob: "{{ $labels.cronjob }}"
          annotations:
            value: "{{ $value | humanizeDuration }}"
            description: >-
              CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been
              scheduled for over 26 hours in cluster {{ $labels.cluster_environment }}.
              Last schedule was {{ $value | humanizeDuration }} ago.
            summary: "Backup CronJob {{ $labels.cronjob }} is stale"
        # Fires when a backup CronJob exists but has never been scheduled.
        # kube-state-metrics omits kube_cronjob_status_last_schedule_time while
        # a CronJob has no lastScheduleTime, so `== 0` alone never matches.
        # The second branch catches that case: a CronJob that is known via
        # kube_cronjob_info but has no last-schedule series at all.
        # The `== 0` branch is kept for exporters that do report a zero value.
        # A freshly created CronJob also matches until its first run.
        - alert: BackupCronJobNeverScheduled
          expr: |
            (
              kube_cronjob_status_last_schedule_time{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"}
              == 0
            )
            or
            (
              kube_cronjob_info{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"}
              unless on (cluster_environment, namespace, cronjob)
              kube_cronjob_status_last_schedule_time{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"}
            )
          for: 30m
          labels:
            severity: critical
            cronjob: "{{ $labels.cronjob }}"
          annotations:
            description: >-
              CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has never been
              scheduled in cluster {{ $labels.cluster_environment }}.
            summary: "Backup CronJob {{ $labels.cronjob }} never ran"
    - name: backup-job-failures
      # Reports backup Jobs in the gitea namespace that have failed pods.
      rules:
        # max by (...) collapses the failed-pod gauge to one series per
        # cluster_environment / namespace / job_name; the rule matches when
        # that value is above zero, so $value is the failed pod count.
        - alert: BackupJobFailed
          expr: |
            max by(cluster_environment, namespace, job_name) (
              kube_job_status_failed{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"}
            ) > 0
          # Short hold time: the condition only has to persist for 30 seconds.
          for: 30s
          labels:
            severity: critical
            job_name: "{{ $labels.job_name }}"
          annotations:
            value: "{{ $value }}"
            description: >-
              Backup job {{ $labels.namespace }}/{{ $labels.job_name }} has
              {{ $value }} failed pod(s) in cluster {{ $labels.cluster_environment }}.
            summary: "Backup job {{ $labels.job_name }} failed"
    - name: backup-job-duration
      # Reports backup Jobs in the gitea namespace that are still running long
      # after they started.
      rules:
        # Left side: seconds elapsed since the Job's start time, kept only when
        # above 300 (5 minutes). `and` then keeps only Jobs that still have
        # active pods. $value is the elapsed time from the left side.
        - alert: BackupJobTooSlow
          expr: |
            (
              time() - kube_job_status_start_time{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"}
            ) > 300
            and
            kube_job_status_active{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"} > 0
          # The condition must hold for a further minute before the alert fires.
          for: 1m
          labels:
            severity: major
            job_name: "{{ $labels.job_name }}"
          annotations:
            value: "{{ $value | humanizeDuration }}"
            description: >-
              Backup job {{ $labels.namespace }}/{{ $labels.job_name }} has been
              running for {{ $value | humanizeDuration }} (threshold: 5m)
              in cluster {{ $labels.cluster_environment }}. This may indicate a
              hung process or connectivity issue.
            summary: "Backup job {{ $labels.job_name }} running too long"
--- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml
+++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml
@ -10,4 +10,4 @@ spec:
    matchLabels:
      app.kubernetes.io/name: garm
  endpoints:
-    - port: http
+    - port: metrics
--- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/simple-user-secret.yaml
+++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/simple-user-secret.yaml
@ -1,9 +0,0 @@
 # Opaque Secret holding a username/password pair in the observability namespace.
 # NOTE(review): presumably the target of the VMUser `passwordRef`
 # (name: simple-user-secret, key: password) - confirm against vmauth.yaml.
 apiVersion: v1
 kind: Secret
 metadata:
  name: simple-user-secret
  namespace: observability
 type: Opaque
 # SECURITY: values under `data` are base64-encoded, not encrypted. Committing
 # them to version control exposes the credential to anyone with repository
 # access; rotate it and source it from a secret store instead.
 data:
  username: c2ltcGxlLXVzZXI=
  password: c3g1Z0M3b29XYVdPT0R3RA==
--- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml
+++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml
@ -5,17 +5,13 @@ metadata:
  namespace: observability
 spec:
  username: simple-user
-  password: sx5gC7ooWaWOODwD
+  passwordRef:
    key: password
    name: simple-user-secret
  targetRefs:
    - static:
        url: http://vmsingle-o12y:8429
      paths: ["/api/v1/write"]
    - static:
        url: http://vmsingle-o12y:8429
      paths: ["/api/v1/.*"]
    - static:
        url: http://vlogs-victorialogs:9428
      paths: ["/insert/elasticsearch/.*"]
    - static:
        url: http://vlogs-victorialogs:9428
      paths: ["/select/.*"]
--- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/values.yaml
+++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/values.yaml
@ -1,6 +1,6 @@
 global:
  # -- Cluster label to use for dashboards and rules
-  clusterLabel: cluster_environment
+  clusterLabel: cluster
  # -- Global license configuration
  license:
    key: ""
@ -201,13 +201,13 @@ defaultRules:
      enabled: true
      rules: {}
    kubernetesSystemControllerManager:
-      create: false
+      enabled: false
      rules: {}
    kubeScheduler:
-      create: false
+      enabled: false
      rules: {}
    kubernetesSystemScheduler:
-      create: false
+      enabled: false
      rules: {}
    kubeStateMetrics:
      enabled: true