diff --git a/otc/benchmark.t09.de/stacks/core/argocd/values.yaml b/otc/benchmark.t09.de/stacks/core/argocd/values.yaml index 1591cc9..a6521b0 100644 --- a/otc/benchmark.t09.de/stacks/core/argocd/values.yaml +++ b/otc/benchmark.t09.de/stacks/core/argocd/values.yaml @@ -35,30 +35,6 @@ configs: tls: certificates: -controller: - metrics: - enabled: true - serviceMonitor: - enabled: false - -server: - metrics: - enabled: true - serviceMonitor: - enabled: false - -repoServer: - metrics: - enabled: true - serviceMonitor: - enabled: false - -applicationSet: - metrics: - enabled: true - serviceMonitor: - enabled: false - notifications: enabled: false diff --git a/otc/benchmark.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml b/otc/benchmark.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml index b99a903..ed54cb0 100644 --- a/otc/benchmark.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml +++ b/otc/benchmark.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml @@ -11,8 +11,8 @@ spec: startingDeadlineSeconds: 600 # 10 minutes jobTemplate: spec: - # 2h window: handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m - activeDeadlineSeconds: 14400 + # 60 min until backup - 10 min start - (backoffLimit * activeDeadlineSeconds) - some time sync buffer + activeDeadlineSeconds: 1350 backoffLimit: 2 ttlSecondsAfterFinished: 259200 # template: diff --git a/otc/benchmark.t09.de/stacks/observability-client/vector/values.yaml b/otc/benchmark.t09.de/stacks/observability-client/vector/values.yaml index 2393b1a..3fb5e53 100644 --- a/otc/benchmark.t09.de/stacks/observability-client/vector/values.yaml +++ b/otc/benchmark.t09.de/stacks/observability-client/vector/values.yaml @@ -48,7 +48,7 @@ customConfig: type: elasticsearch inputs: [parser] endpoints: - - https://o12y.observability.buildth.ing/insert/elasticsearch/ + - https://o12y.observability./insert/elasticsearch/ auth: strategy: basic user: ${VECTOR_USER} diff --git a/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/manifests/.gitkeep b/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/manifests/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml b/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml deleted file mode 100644 index aecf517..0000000 --- a/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: operator.victoriametrics.com/v1beta1 -kind: VMServiceScrape -metadata: - name: forgejo - namespace: observability -spec: - namespaceSelector: - matchNames: - - gitea - selector: - matchLabels: - app.kubernetes.io/name: forgejo - endpoints: - - port: http - path: /metrics diff --git a/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/values.yaml b/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/values.yaml index 7f6dd00..dde927b 100644 --- a/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/values.yaml +++ b/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/values.yaml @@ -201,13 +201,13 @@ defaultRules: create: true rules: {} kubernetesSystemControllerManager: - create: false + create: true rules: {} kubeScheduler: - create: false + create: true rules: {} kubernetesSystemScheduler: - create: false + create: true rules: {} kubeStateMetrics: create: true @@ -778,7 +778,7 @@ vmagent: # -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec) additionalRemoteWrites: # [] - - url: https://o12y.observability.buildth.ing/api/v1/write + - url: https://o12y.observability./api/v1/write basicAuth: username: name: simple-user-secret diff --git a/otc/dev.t09.de/stacks/core/argocd/values.yaml b/otc/dev.t09.de/stacks/core/argocd/values.yaml index cb856f0..dd5b83d 100644 --- a/otc/dev.t09.de/stacks/core/argocd/values.yaml +++ b/otc/dev.t09.de/stacks/core/argocd/values.yaml @@ -35,30 +35,6 @@ configs: tls: certificates: -controller: - metrics: - enabled: true - serviceMonitor: - enabled: false - -server: - metrics: - enabled: true - serviceMonitor: - enabled: false - -repoServer: - metrics: - enabled: true - serviceMonitor: - enabled: false - -applicationSet: - metrics: - enabled: true - serviceMonitor: - enabled: false - notifications: enabled: false diff --git a/otc/dev.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml b/otc/dev.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml index a8e236f..de14801 100644 --- a/otc/dev.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml +++ b/otc/dev.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml @@ -11,8 +11,8 @@ spec: startingDeadlineSeconds: 600 # 10 minutes jobTemplate: spec: - # 2h window: handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m - activeDeadlineSeconds: 14400 + # 60 min until backup - 10 min start - (backoffLimit * activeDeadlineSeconds) - some time sync buffer + activeDeadlineSeconds: 1350 backoffLimit: 2 ttlSecondsAfterFinished: 259200 # template: diff --git a/otc/dev.t09.de/stacks/garm/garm/values.yaml b/otc/dev.t09.de/stacks/garm/garm/values.yaml index 41fc84c..5baed69 100644 --- a/otc/dev.t09.de/stacks/garm/garm/values.yaml +++ b/otc/dev.t09.de/stacks/garm/garm/values.yaml @@ -41,8 +41,5 @@ providerConfig: sidecarImage: edp.buildth.ing/devfw-cicd/ci-sizer-collector:0.0.4 garm: - metrics: - enable: true - disableAuth: true logging: logLevel: info diff --git a/otc/dev.t09.de/stacks/observability-client/vector/values.yaml b/otc/dev.t09.de/stacks/observability-client/vector/values.yaml index 4d7458a..c0644cf 100644 --- a/otc/dev.t09.de/stacks/observability-client/vector/values.yaml +++ b/otc/dev.t09.de/stacks/observability-client/vector/values.yaml @@ -48,7 +48,7 @@ customConfig: type: elasticsearch inputs: [parser] endpoints: - - https://o12y.observability.buildth.ing/insert/elasticsearch/ + - https://o12y.observability./insert/elasticsearch/ auth: strategy: basic user: ${VECTOR_USER} diff --git a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/.gitkeep b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/argocd-scrape.yaml b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/argocd-scrape.yaml deleted file mode 100644 index 2e9248f..0000000 --- a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/argocd-scrape.yaml +++ /dev/null @@ -1,14 +0,0 @@ -apiVersion: operator.victoriametrics.com/v1beta1 -kind: VMServiceScrape -metadata: - name: argocd - namespace: observability -spec: - namespaceSelector: - matchNames: - - argocd - selector: - matchLabels: - app.kubernetes.io/part-of: argocd - endpoints: - - port: http-metrics diff --git a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml deleted file mode 100644 index aecf517..0000000 --- a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: operator.victoriametrics.com/v1beta1 -kind: VMServiceScrape -metadata: - name: forgejo - namespace: observability -spec: - namespaceSelector: - matchNames: - - gitea - selector: - matchLabels: - app.kubernetes.io/name: forgejo - endpoints: - - port: http - path: /metrics diff --git a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/garm-scrape.yaml b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/garm-scrape.yaml deleted file mode 100644 index 9904e86..0000000 --- a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/garm-scrape.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: operator.victoriametrics.com/v1beta1 -kind: VMServiceScrape -metadata: - name: garm - namespace: observability -spec: - namespaceSelector: - matchNames: - - garm - selector: - matchLabels: - app.kubernetes.io/name: garm - endpoints: - - port: http - path: /metrics diff --git a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml index c6d6b3a..f85a786 100644 --- a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml +++ b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml @@ -201,13 +201,13 @@ defaultRules: create: true rules: {} kubernetesSystemControllerManager: - create: false + create: true rules: {} kubeScheduler: - create: false + create: true rules: {} kubernetesSystemScheduler: - create: false + create: true rules: {} kubeStateMetrics: create: true @@ -778,7 +778,7 @@ vmagent: # -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec) additionalRemoteWrites: # [] - - url: https://o12y.observability.buildth.ing/api/v1/write + - url: https://o12y.observability./api/v1/write basicAuth: username: name: simple-user-secret @@ -801,20 +801,6 @@ vmagent: # Do not store original labels in vmagent's memory by default. This reduces the amount of memory used by vmagent # but makes vmagent debugging UI less informative. See: https://docs.victoriametrics.com/vmagent/#relabel-debug promscrape.dropOriginalLabels: "true" - # Harden liveness probe: default failureThreshold=10 masked a 72h silent outage - livenessProbe: - httpGet: - path: /health - port: http - failureThreshold: 3 - periodSeconds: 5 - timeoutSeconds: 5 - startupProbe: - httpGet: - path: /health - port: http - failureThreshold: 30 - periodSeconds: 5 # -- (object) VMAgent ingress configuration ingress: enabled: false diff --git a/otc/dev.t09.de/stacks/observability/grafana-operator/manifests/grafana.yaml b/otc/dev.t09.de/stacks/observability/grafana-operator/manifests/grafana.yaml index 17d6046..1e8b038 100644 --- a/otc/dev.t09.de/stacks/observability/grafana-operator/manifests/grafana.yaml +++ b/otc/dev.t09.de/stacks/observability/grafana-operator/manifests/grafana.yaml @@ -35,10 +35,8 @@ spec: server: root_url: "https://grafana.dev.t09.de" auth: + disable_login: "true" disable_login_form: "true" - security: - admin_user: admin - admin_password: admin auth.generic_oauth: enabled: "true" name: Forgejo diff --git a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack.yaml b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack.yaml index d7599b9..3011a2f 100644 --- a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack.yaml +++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack.yaml @@ -9,13 +9,10 @@ spec: project: default syncPolicy: automated: - prune: true selfHeal: true syncOptions: - CreateNamespace=true - ServerSideApply=true - - RespectIgnoreDifferences=true - - SkipDryRunOnMissingResource=true destination: name: in-cluster namespace: observability diff --git a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml index 4b5807e..6fc8de6 100644 --- a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml +++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml @@ -11,4 +11,4 @@ spec: matchLabels: app.kubernetes.io/name: garm endpoints: - - port: http + - port: metrics diff --git a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vlogs.yaml b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vlogs.yaml index 72e13d1..8657ac8 100644 --- a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vlogs.yaml +++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vlogs.yaml @@ -1,5 +1,5 @@ apiVersion: operator.victoriametrics.com/v1beta1 -kind: VLogs +kind: VLSingle metadata: name: victorialogs namespace: observability diff --git a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml index a4f0368..5759093 100644 --- a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml +++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml @@ -12,12 +12,6 @@ spec: - static: url: http://vmsingle-o12y:8429 paths: ["/api/v1/write"] - - static: - url: http://vmsingle-o12y:8429 - paths: ["/api/v1/.*"] - static: url: http://vlogs-victorialogs:9428 - paths: ["/insert/elasticsearch/.*"] - - static: - url: http://vlogs-victorialogs:9428 - paths: ["/select/.*"] \ No newline at end of file + paths: ["/insert/elasticsearch/.*"] \ No newline at end of file diff --git a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/values.yaml b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/values.yaml index e7bffbc..d407910 100644 --- a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/values.yaml +++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/values.yaml @@ -28,7 +28,10 @@ victoria-metrics-operator: crds: plain: true cleanup: - enabled: false # disabled: cleanup hook can't schedule on resource-constrained nodes (Insufficient cpu / Too many pods) + enabled: true + image: + repository: bitnami/kubectl + pullPolicy: IfNotPresent serviceMonitor: enabled: true operator: @@ -673,7 +676,7 @@ vmalert: vmauth: # -- Enable VMAuth CR - enabled: false + enabled: true # -- VMAuth annotations annotations: {} # -- (object) Full spec for VMAuth CRD. Allowed values described [here](https://docs.victoriametrics.com/operator/api#vmauthspec) @@ -696,7 +699,7 @@ vmauth: vmagent: # -- Create VMAgent CR - enabled: true + enabled: false # -- VMAgent annotations annotations: {} # -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec) @@ -708,8 +711,7 @@ vmagent: port: "8429" selectAllByDefault: true scrapeInterval: 20s - externalLabels: - cluster_environment: "dev" + externalLabels: {} # For multi-cluster setups it is useful to use "cluster" label to identify the metrics source. # For example: # cluster: cluster-name diff --git a/otc/edp.buildth.ing/stacks/core/argocd/values.yaml b/otc/edp.buildth.ing/stacks/core/argocd/values.yaml index c1bde64..019dc65 100644 --- a/otc/edp.buildth.ing/stacks/core/argocd/values.yaml +++ b/otc/edp.buildth.ing/stacks/core/argocd/values.yaml @@ -35,30 +35,6 @@ configs: tls: certificates: -controller: - metrics: - enabled: true - serviceMonitor: - enabled: false - -server: - metrics: - enabled: true - serviceMonitor: - enabled: false - -repoServer: - metrics: - enabled: true - serviceMonitor: - enabled: false - -applicationSet: - metrics: - enabled: true - serviceMonitor: - enabled: false - notifications: enabled: false diff --git a/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml b/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml index dd51f5b..71f1649 100644 --- a/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml +++ b/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml @@ -11,8 +11,8 @@ spec: startingDeadlineSeconds: 600 # 10 minutes jobTemplate: spec: - # 2h window: handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m - activeDeadlineSeconds: 14400 + # 60 min until backup - 10 min start - (backoffLimit * activeDeadlineSeconds) - some time sync buffer + activeDeadlineSeconds: 1350 backoffLimit: 2 ttlSecondsAfterFinished: 259200 # template: diff --git a/otc/edp.buildth.ing/stacks/observability-client/vector/values.yaml b/otc/edp.buildth.ing/stacks/observability-client/vector/values.yaml index 2fefacd..7b30cdc 100644 --- a/otc/edp.buildth.ing/stacks/observability-client/vector/values.yaml +++ b/otc/edp.buildth.ing/stacks/observability-client/vector/values.yaml @@ -48,7 +48,7 @@ customConfig: type: elasticsearch inputs: [parser] endpoints: - - https://o12y.observability.buildth.ing/insert/elasticsearch/ + - https://o12y.observability./insert/elasticsearch/ auth: strategy: basic user: ${VECTOR_USER} diff --git a/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/manifests/.gitkeep b/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/manifests/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml b/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml deleted file mode 100644 index aecf517..0000000 --- a/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: operator.victoriametrics.com/v1beta1 -kind: VMServiceScrape -metadata: - name: forgejo - namespace: observability -spec: - namespaceSelector: - matchNames: - - gitea - selector: - matchLabels: - app.kubernetes.io/name: forgejo - endpoints: - - port: http - path: /metrics diff --git a/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/values.yaml b/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/values.yaml index a7ba239..4e1c079 100644 --- a/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/values.yaml +++ b/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/values.yaml @@ -201,13 +201,13 @@ defaultRules: create: true rules: {} kubernetesSystemControllerManager: - create: false + create: true rules: {} kubeScheduler: - create: false + create: true rules: {} kubernetesSystemScheduler: - create: false + create: true rules: {} kubeStateMetrics: create: true @@ -778,7 +778,7 @@ vmagent: # -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec) additionalRemoteWrites: # [] - - url: https://o12y.observability.buildth.ing/api/v1/write + - url: https://o12y.observability./api/v1/write basicAuth: username: name: simple-user-secret diff --git a/otc/observability.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml b/otc/observability.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml index 9f86064..842a7cc 100644 --- a/otc/observability.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml +++ b/otc/observability.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml @@ -11,8 +11,8 @@ spec: startingDeadlineSeconds: 600 # 10 minutes jobTemplate: spec: - # 2h window: handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m - activeDeadlineSeconds: 14400 + # 60 min until backup - 10 min start - (backoffLimit * activeDeadlineSeconds) - some time sync buffer + activeDeadlineSeconds: 1350 backoffLimit: 2 ttlSecondsAfterFinished: 259200 # template: diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd-operational.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd-operational.yaml deleted file mode 100644 index 9130b42..0000000 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd-operational.yaml +++ /dev/null @@ -1,153 +0,0 @@ -apiVersion: grafana.integreatly.org/v1beta1 -kind: GrafanaDashboard -metadata: - name: argocd-operational -spec: - instanceSelector: - matchLabels: - dashboards: "grafana" - folder: "EDP / Applications" - json: | - { - "annotations": {"list": []}, - "editable": true, - "graphTooltip": 1, - "panels": [ - { - "collapsed": false, - "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, - "title": "Application Status", - "type": "row" - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}}, - "gridPos": {"h": 4, "w": 4, "x": 0, "y": 1}, - "title": "Total Apps", - "type": "stat", - "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}}, - "gridPos": {"h": 4, "w": 4, "x": 4, "y": 1}, - "title": "Healthy", - "type": "stat", - "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", health_status=\"Healthy\"})", "legendFormat": ""}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}]}}}, - "gridPos": {"h": 4, "w": 4, "x": 8, "y": 1}, - "title": "Degraded", - "type": "stat", - "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", health_status=\"Degraded\"}) or vector(0)", "legendFormat": ""}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}}, - "gridPos": {"h": 4, "w": 4, "x": 12, "y": 1}, - "title": "Synced", - "type": "stat", - "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", sync_status=\"Synced\"})", "legendFormat": ""}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "yellow", "value": null}]}}}, - "gridPos": {"h": 4, "w": 4, "x": 16, "y": 1}, - "title": "OutOfSync", - "type": "stat", - "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", sync_status=\"OutOfSync\"}) or vector(0)", "legendFormat": ""}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "orange", "value": null}]}}}, - "gridPos": {"h": 4, "w": 4, "x": 20, "y": 1}, - "title": "Progressing", - "type": "stat", - "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", health_status=\"Progressing\"}) or vector(0)", "legendFormat": ""}] - }, - { - "collapsed": false, - "gridPos": {"h": 1, "w": 24, "x": 0, "y": 5}, - "title": "Application Details", - "type": "row" - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": { - "defaults": {"custom": {"filterable": true}}, - "overrides": [ - {"matcher": {"id": "byName", "options": "Health"}, "properties": [{"id": "custom.cellOptions", "value": {"type": "color-text"}}, {"id": "mappings", "value": [{"options": {"Healthy": {"color": "green", "text": "Healthy"}, "Degraded": {"color": "red", "text": "Degraded"}, "Progressing": {"color": "yellow", "text": "Progressing"}, "Missing": {"color": "purple", "text": "Missing"}}, "type": "value"}]}]}, - {"matcher": {"id": "byName", "options": "Sync"}, "properties": [{"id": "custom.cellOptions", "value": {"type": "color-text"}}, {"id": "mappings", "value": [{"options": {"Synced": {"color": "green", "text": "Synced"}, "OutOfSync": {"color": "orange", "text": "OutOfSync"}}, "type": "value"}]}]} - ] - }, - "gridPos": {"h": 12, "w": 24, "x": 0, "y": 6}, - "title": "All Applications", - "type": "table", - "targets": [{"expr": "argocd_app_info{cluster_environment=~\"$cluster_environment\"}", "format": "table", "instant": true, "legendFormat": ""}], - "transformations": [ - {"id": "filterFieldsByName", "options": {"include": {"names": ["cluster_environment", "name", "dest_namespace", "health_status", "sync_status", "repo"]}}}, - {"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "name": "Application", "dest_namespace": "Namespace", "health_status": "Health", "sync_status": "Sync", "repo": "Repository"}}} - ] - }, - { - "collapsed": false, - "gridPos": {"h": 1, "w": 24, "x": 0, "y": 18}, - "title": "Sync Activity", - "type": "row" - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "ops"}}, - "gridPos": {"h": 8, "w": 12, "x": 0, "y": 19}, - "title": "Sync Operations (rate)", - "type": "timeseries", - "targets": [{"expr": "sum(rate(argocd_app_sync_total{cluster_environment=~\"$cluster_environment\"}[5m])) by (name, phase)", "legendFormat": "{{name}} ({{phase}})"}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "ops"}}, - "gridPos": {"h": 8, "w": 12, "x": 12, "y": 19}, - "title": "Reconciliation Rate", - "type": "timeseries", - "targets": [{"expr": "sum(rate(argocd_app_reconcile_count{cluster_environment=~\"$cluster_environment\"}[5m])) by (namespace)", "legendFormat": "{{namespace}}"}] - }, - { - "collapsed": false, - "gridPos": {"h": 1, "w": 24, "x": 0, "y": 27}, - "title": "ArgoCD Logs", - "type": "row" - }, - { - "datasource": {"type": "victoriametrics-logs-datasource"}, - "gridPos": {"h": 10, "w": 24, "x": 0, "y": 28}, - "title": "ArgoCD Logs", - "type": "logs", - "targets": [{"expr": "{cluster_environment=~\"$cluster_environment\", kubernetes.namespace=\"argocd\"}", "refId": "A"}], - "options": {"showTime": true, "showLabels": true, "wrapLogMessage": true, "enableLogDetails": true, "sortOrder": "Descending"} - } - ], - "schemaVersion": 39, - "tags": ["edp", "argocd", "gitops"], - "templating": { - "list": [ - { - "current": {"selected": true, "text": "All", "value": "$__all"}, - "datasource": {"type": "prometheus"}, - "definition": "label_values(argocd_app_info, cluster_environment)", - "includeAll": true, - "multi": true, - "name": "cluster_environment", - "label": "Environment", - "query": "label_values(argocd_app_info, cluster_environment)", - "refresh": 2, - "sort": 1, - "type": "query" - } - ] - }, - "time": {"from": "now-6h", "to": "now"}, - "title": "ArgoCD Operations", - "uid": "edp-argocd-ops" - } diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd.yaml index 2b81b2b..b348ff7 100644 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd.yaml +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd.yaml @@ -6,5 +6,4 @@ spec: instanceSelector: matchLabels: dashboards: "grafana" - folder: "EDP / Applications" url: "https://raw.githubusercontent.com/argoproj/argo-cd/refs/heads/master/examples/dashboard.json" diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml deleted file mode 100644 index ddcc883..0000000 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml +++ /dev/null @@ -1,103 +0,0 @@ -apiVersion: grafana.integreatly.org/v1beta1 -kind: GrafanaDashboard -metadata: - name: cronjob-monitoring -spec: - instanceSelector: - matchLabels: - dashboards: "grafana" - folder: "EDP / Operations" - json: | - { - "annotations": {"list": []}, - "editable": true, - "graphTooltip": 1, - "panels": [ - { - "collapsed": false, - "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, - "title": "Backup Job Status", - "type": "row" - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}}}, - "gridPos": {"h": 5, "w": 12, "x": 0, "y": 1}, - "title": "Time Since Last Schedule", - "type": "stat", - "targets": [{"expr": "time() - kube_cronjob_status_last_schedule_time{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cronjob}} ({{cluster_environment}})"}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}}}, - "gridPos": {"h": 5, "w": 12, "x": 12, "y": 1}, - "title": "Failed Jobs (Active)", - "type": "stat", - "targets": [{"expr": "sum by(cluster_environment, job_name) (kube_job_status_failed{cluster_environment=~\"$cluster_environment\"}) > 0", "legendFormat": "{{job_name}} ({{cluster_environment}})"}] - }, - { - "collapsed": false, - "gridPos": {"h": 1, "w": 24, "x": 0, "y": 6}, - "title": "CronJob Overview", - "type": "row" - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"custom": {"filterable": true}}, "overrides": [{"matcher": {"id": "byName", "options": "Suspended"}, "properties": [{"id": "mappings", "value": [{"options": {"0": {"text": "No", "color": "green"}, "1": {"text": "YES", "color": "red"}}, "type": "value"}]}]}]}, - "gridPos": {"h": 8, "w": 24, "x": 0, "y": 7}, - "title": "All CronJobs", - "type": "table", - "targets": [ - {"expr": "kube_cronjob_info{cluster_environment=~\"$cluster_environment\"}", "format": "table", "instant": true, "refId": "A"} - ], - "transformations": [ - {"id": "filterFieldsByName", "options": {"include": {"names": ["cluster_environment", "cronjob", "namespace", "schedule"]}}}, - {"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "cronjob": "CronJob", "namespace": "Namespace", "schedule": "Schedule"}}} - ] - }, - { - "collapsed": false, - "gridPos": {"h": 1, "w": 24, "x": 0, "y": 15}, - "title": "Job History", - "type": "row" - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short"}}, - "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16}, - "title": "Job Completions (24h)", - "type": "timeseries", - "targets": [{"expr": "sum(kube_job_status_succeeded{cluster_environment=~\"$cluster_environment\"}) by (job_name, cluster_environment)", "legendFormat": "{{job_name}} ({{cluster_environment}})"}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short", "color": {"mode": "palette-classic"}}}, - "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16}, - "title": "Job Failures (24h)", - "type": "timeseries", - "targets": [{"expr": "sum(kube_job_status_failed{cluster_environment=~\"$cluster_environment\"}) by (job_name, cluster_environment)", "legendFormat": "{{job_name}} ({{cluster_environment}})"}] - } - ], - "schemaVersion": 39, - "tags": ["edp", "backup", "cronjob"], - "templating": { - "list": [ - { - "current": {"selected": true, "text": "All", "value": "$__all"}, - "datasource": {"type": "prometheus"}, - "definition": "label_values(kube_cronjob_info, cluster_environment)", - "includeAll": true, - "multi": true, - "name": "cluster_environment", - "label": "Environment", - "query": "label_values(kube_cronjob_info, cluster_environment)", - "refresh": 2, - "sort": 1, - "type": "query" - } - ] - }, - "time": {"from": "now-24h", "to": "now"}, - "title": "CronJob & Backup Monitoring", - "uid": "edp-cronjobs" - } diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/forgejo.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/forgejo.yaml deleted file mode 100644 index ec40751..0000000 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/forgejo.yaml +++ /dev/null @@ -1,207 +0,0 @@ -apiVersion: grafana.integreatly.org/v1beta1 -kind: GrafanaDashboard -metadata: - name: forgejo -spec: - instanceSelector: - matchLabels: - dashboards: "grafana" - folder: "EDP / Applications" - json: | - { - "annotations": {"list": []}, - "editable": true, - "graphTooltip": 1, - "panels": [ - { - "collapsed": false, - "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, - "title": "Forgejo Health", - "type": "row" - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"mappings": [{"options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}, "type": "value"}], "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]}}}, - "gridPos": {"h": 4, "w": 4, "x": 0, "y": 1}, - "title": "Status", - "type": "stat", - "targets": [{"expr": "up{job=\"forgejo-server-http\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short"}}, - "gridPos": {"h": 4, "w": 4, "x": 4, "y": 1}, - "title": "Version", - "type": "stat", - "targets": [{"expr": "gitea_build_info{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{version}}"}], - "options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "name"} - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short"}}, - "gridPos": {"h": 4, "w": 4, "x": 8, "y": 1}, - "title": "Repositories", - "type": "stat", - "targets": [{"expr": "gitea_repositories{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short"}}, - "gridPos": {"h": 4, "w": 4, "x": 12, "y": 1}, - "title": "Users", - "type": "stat", - "targets": [{"expr": "gitea_users{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short"}}, - "gridPos": {"h": 4, "w": 4, "x": 16, "y": 1}, - "title": "Organizations", - "type": "stat", - "targets": [{"expr": "gitea_organizations{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short"}}, - "gridPos": {"h": 4, "w": 4, "x": 20, "y": 1}, - "title": "Teams", - "type": "stat", - "targets": [{"expr": "gitea_teams{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] - }, - { - "collapsed": false, - "gridPos": {"h": 1, "w": 24, "x": 0, "y": 5}, - "title": "Activity", - "type": "row" - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short"}}, - "gridPos": {"h": 4, "w": 6, "x": 0, "y": 6}, - "title": "Open Issues", - "type": "stat", - "targets": [{"expr": "gitea_issues_open{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short"}}, - "gridPos": {"h": 4, "w": 6, "x": 6, "y": 6}, - "title": "Closed Issues", - "type": "stat", - "targets": [{"expr": "gitea_issues_closed{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short"}}, - "gridPos": {"h": 4, "w": 6, "x": 12, "y": 6}, - "title": "Webhooks", - "type": "stat", - "targets": [{"expr": "gitea_webhooks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short"}}, - "gridPos": {"h": 4, "w": 6, "x": 18, "y": 6}, - "title": "Hook Tasks", - "type": "stat", - "targets": [{"expr": "gitea_hooktasks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] - }, - { - "collapsed": false, - "gridPos": {"h": 1, "w": 24, "x": 0, "y": 10}, - "title": "Content & Auth", - "type": "row" - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short"}}, - "gridPos": {"h": 4, "w": 4, "x": 0, "y": 11}, - "title": "Stars", - "type": "stat", - "targets": [{"expr": "gitea_stars{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short"}}, - "gridPos": {"h": 4, "w": 4, "x": 4, "y": 11}, - "title": "Watches", - "type": "stat", - "targets": [{"expr": "gitea_watches{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short"}}, - "gridPos": {"h": 4, "w": 4, "x": 8, "y": 11}, - "title": "Releases", - "type": "stat", - "targets": [{"expr": "gitea_releases{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short"}}, - "gridPos": {"h": 4, "w": 4, "x": 12, "y": 11}, - "title": "Mirrors", - "type": "stat", - "targets": [{"expr": "gitea_mirrors{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short"}}, - "gridPos": {"h": 4, "w": 4, "x": 16, "y": 11}, - "title": "Public Keys", - "type": "stat", - "targets": [{"expr": "gitea_publickeys{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short"}}, - "gridPos": {"h": 4, "w": 4, "x": 20, "y": 11}, - "title": "OAuth Apps", - "type": "stat", - "targets": [{"expr": "gitea_oauths{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] - }, - { - "collapsed": false, - "gridPos": {"h": 1, "w": 24, "x": 0, "y": 15}, - "title": "Forgejo Logs", - "type": "row" - }, - { - "datasource": {"type": "victoriametrics-logs-datasource"}, - "gridPos": {"h": 10, "w": 12, "x": 0, "y": 16}, - "title": "Forgejo Server Logs", - "type": "logs", - "targets": [{"expr": "{cluster_environment=~\"$cluster_environment\", kubernetes.namespace=\"gitea\"}", "refId": "A"}], - "options": {"showTime": true, "showLabels": true, "wrapLogMessage": true, "enableLogDetails": true, "sortOrder": "Descending"} - }, - { - "datasource": {"type": "victoriametrics-logs-datasource"}, - "gridPos": {"h": 10, "w": 12, "x": 12, "y": 16}, - "title": "Forgejo Errors", - "type": "logs", - "targets": [{"expr": "{cluster_environment=~\"$cluster_environment\", kubernetes.namespace=\"gitea\"} error OR Error OR ERROR OR panic", "refId": "A"}], - "options": {"showTime": true, "showLabels": true, "wrapLogMessage": true, "enableLogDetails": true, "sortOrder": "Descending"} - } - ], - "schemaVersion": 39, - "tags": ["edp", "forgejo", "gitea"], - "templating": { - "list": [ - { - "current": {"selected": true, "text": "All", "value": "$__all"}, - "datasource": {"type": "prometheus"}, - "definition": "label_values(gitea_repositories, cluster_environment)", - "includeAll": true, - "multi": true, - "name": "cluster_environment", - "label": "Environment", - "query": "label_values(gitea_repositories, cluster_environment)", - "refresh": 2, - "type": "query" - } - ] - }, - "time": {"from": "now-6h", "to": "now"}, - "title": "Forgejo", - "uid": "edp-forgejo" - } diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/garm.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/garm.yaml deleted file mode 100644 index 2a23e20..0000000 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/garm.yaml +++ /dev/null @@ -1,117 +0,0 @@ -apiVersion: grafana.integreatly.org/v1beta1 -kind: GrafanaDashboard -metadata: - name: garm -spec: - instanceSelector: - matchLabels: - dashboards: "grafana" - folder: "EDP / Applications" - json: | - { - "annotations": {"list": []}, - "editable": true, - "graphTooltip": 1, - "panels": [ - { - "collapsed": false, - "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, - "title": "GARM Runner Status", - "type": "row" - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}}, - "gridPos": {"h": 5, "w": 6, "x": 0, "y": 1}, - "title": "Total Runners", - "type": "stat", - "targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\"}) or vector(0)", "legendFormat": ""}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}}, - "gridPos": {"h": 5, "w": 6, "x": 6, "y": 1}, - "title": "Idle Runners", - "type": "stat", - "targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\", status=\"idle\"}) or vector(0)", "legendFormat": ""}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "yellow", "value": null}]}}}, - "gridPos": {"h": 5, "w": 6, "x": 12, "y": 1}, - "title": "Creating", - "type": "stat", - "targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\", status=\"creating\"}) or vector(0)", "legendFormat": ""}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}]}}}, - "gridPos": {"h": 5, "w": 6, "x": 18, "y": 1}, - "title": "Errors", - "type": "stat", - "targets": [{"expr": "sum(rate(garm_runner_errors_total{cluster_environment=~\"$cluster_environment\"}[5m])) or vector(0)", "legendFormat": ""}] - }, - { - "collapsed": false, - "gridPos": {"h": 1, "w": 24, "x": 0, "y": 6}, - "title": "GitHub API Rate Limits", - "type": "row" - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short", "min": 0}}, - "gridPos": {"h": 8, "w": 12, "x": 0, "y": 7}, - "title": "Rate Limit Remaining", - "type": "timeseries", - "targets": [{"expr": "garm_github_rate_limit_remaining{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "ops"}}, - "gridPos": {"h": 8, "w": 12, "x": 12, "y": 7}, - "title": "Runner Operations Rate", - "type": "timeseries", - "targets": [{"expr": "sum(rate(garm_runner_operations_total{cluster_environment=~\"$cluster_environment\"}[5m])) by (cluster_environment)", "legendFormat": "{{cluster_environment}}"}] - }, - { - "collapsed": false, - "gridPos": {"h": 1, "w": 24, "x": 0, "y": 15}, - "title": "Runner Details", - "type": "row" - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"custom": {"filterable": true}}}, - "gridPos": {"h": 8, "w": 24, "x": 0, "y": 16}, - "title": "Runner Pool Status", - "type": "table", - "targets": [{"expr": "garm_runner_status{cluster_environment=~\"$cluster_environment\"}", "format": "table", "instant": true}], - "transformations": [ - {"id": "filterFieldsByName", "options": {"include": {"names": ["cluster_environment", "name", "status", "pool_owner", "pool_type", "provider"]}}}, - {"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "name": "Runner", "status": "Status", "pool_owner": "Pool Owner", "pool_type": "Type", "provider": "Provider"}}} - ] - } - ], - "schemaVersion": 39, - "tags": ["edp", "garm", "ci-cd", "runners"], - "templating": { - "list": [ - { - "current": {"selected": true, "text": "All", "value": "$__all"}, - "datasource": {"type": "prometheus"}, - "definition": "label_values(garm_runner_status, cluster_environment)", - "includeAll": true, - "multi": true, - "name": "cluster_environment", - "label": "Environment", - "query": "label_values(garm_runner_status, cluster_environment)", - "refresh": 2, - "sort": 1, - "type": "query" - } - ] - }, - "time": {"from": "now-6h", "to": "now"}, - "title": "GARM Runners", - "uid": "edp-garm" - } diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/ingress-nginx.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/ingress-nginx.yaml index 077edd8..c13d6a2 100644 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/ingress-nginx.yaml +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/ingress-nginx.yaml @@ -6,5 +6,4 @@ spec: instanceSelector: matchLabels: dashboards: "grafana" - folder: "EDP / Operations" url: "https://raw.githubusercontent.com/adinhodovic/ingress-nginx-mixin/refs/heads/main/dashboards_out/ingress-nginx-overview.json" diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml deleted file mode 100644 index ffce4e2..0000000 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml +++ /dev/null @@ -1,245 +0,0 @@ -apiVersion: grafana.integreatly.org/v1beta1 -kind: GrafanaDashboard -metadata: - name: platform-overview -spec: - instanceSelector: - matchLabels: - dashboards: "grafana" - folder: "EDP / Overview" - json: | - { - "annotations": {"list": []}, - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 1, - "links": [], - "panels": [ - { - "collapsed": false, - "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, - "title": "Platform Health", - "type": "row" - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": { - "defaults": { - "mappings": [{"options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}, "type": "value"}], - "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]} - } - }, - "gridPos": {"h": 4, "w": 4, "x": 0, "y": 1}, - "title": "Forgejo", - "type": "stat", - "targets": [{"expr": "sum(up{job=\"forgejo-server-http\", cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": { - "defaults": { - "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 3}]} - } - }, - "gridPos": {"h": 4, "w": 4, "x": 4, "y": 1}, - "title": "Ingress 5xx (5m)", - "type": "stat", - "targets": [{"expr": "sum(rate(nginx_ingress_controller_requests{status=~\"5..\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": ""}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": { - "defaults": { - "unit": "short", - "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]} - } - }, - "gridPos": {"h": 4, "w": 4, "x": 8, "y": 1}, - "title": "Failed Jobs (24h)", - "type": "stat", - "targets": [{"expr": "sum(kube_job_status_failed{namespace=\"gitea\", cluster_environment=~\"$cluster_environment\"}) or vector(0)", "legendFormat": ""}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": { - "defaults": { - "unit": "percentunit", - "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.7}, {"color": "red", "value": 0.85}]} - } - }, - "gridPos": {"h": 4, "w": 4, "x": 12, "y": 1}, - "title": "Cluster CPU Usage", - "type": "stat", - "targets": [{"expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": ""}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": { - "defaults": { - "unit": "percentunit", - "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.7}, {"color": "red", "value": 0.85}]} - } - }, - "gridPos": {"h": 4, "w": 4, "x": 16, "y": 1}, - "title": "Cluster Memory Usage", - "type": "stat", - "targets": [{"expr": "1 - sum(node_memory_MemAvailable_bytes{cluster_environment=~\"$cluster_environment\"}) / sum(node_memory_MemTotal_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": { - "defaults": { - "unit": "percentunit", - "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.6}, {"color": "red", "value": 0.8}]} - } - }, - "gridPos": {"h": 4, "w": 4, "x": 20, "y": 1}, - "title": "Max PVC Usage", - "type": "stat", - "targets": [{"expr": "max(1 - kubelet_volume_stats_available_bytes{cluster_environment=~\"$cluster_environment\"} / kubelet_volume_stats_capacity_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}] - }, - { - "collapsed": false, - "gridPos": {"h": 1, "w": 24, "x": 0, "y": 5}, - "title": "Forgejo", - "type": "row" - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short"}}, - "gridPos": {"h": 4, "w": 4, "x": 0, "y": 6}, - "title": "Repositories", - "type": "stat", - "targets": [{"expr": "gitea_repositories{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short"}}, - "gridPos": {"h": 4, "w": 4, "x": 4, "y": 6}, - "title": "Users", - "type": "stat", - "targets": [{"expr": "gitea_users{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short"}}, - "gridPos": {"h": 4, "w": 4, "x": 8, "y": 6}, - "title": "Organizations", - "type": "stat", - "targets": [{"expr": "gitea_organizations{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short"}}, - "gridPos": {"h": 4, "w": 4, "x": 12, "y": 6}, - "title": "Open Issues", - "type": "stat", - "targets": [{"expr": "gitea_issues_open{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short"}}, - "gridPos": {"h": 4, "w": 4, "x": 16, "y": 6}, - "title": "Webhooks", - "type": "stat", - "targets": [{"expr": "gitea_webhooks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short"}}, - "gridPos": {"h": 4, "w": 4, "x": 20, "y": 6}, - "title": "Mirrors", - "type": "stat", - "targets": [{"expr": "gitea_mirrors{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] - }, - { - "collapsed": false, - "gridPos": {"h": 1, "w": 24, "x": 0, "y": 10}, - "title": "Resources", - "type": "row" - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}}, - "gridPos": {"h": 8, "w": 12, "x": 0, "y": 11}, - "title": "Node CPU Usage", - "type": "timeseries", - "targets": [{"expr": "1 - rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m])", "legendFormat": "{{instance}}"}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}}, - "gridPos": {"h": 8, "w": 12, "x": 12, "y": 11}, - "title": "PVC Usage by Claim", - "type": "timeseries", - "targets": [{"expr": "1 - (kubelet_volume_stats_available_bytes{cluster_environment=~\"$cluster_environment\"} / kubelet_volume_stats_capacity_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": "{{namespace}}/{{persistentvolumeclaim}}"}] - }, - { - "collapsed": false, - "gridPos": {"h": 1, "w": 24, "x": 0, "y": 19}, - "title": "Backups", - "type": "row" - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}}}, - "gridPos": {"h": 4, "w": 8, "x": 0, "y": 20}, - "title": "Time Since Last Backup Schedule", - "type": "stat", - "targets": [{"expr": "time() - kube_cronjob_status_last_schedule_time{cronjob=~\"forgejo-s3-backup|secrets-backup\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cronjob}} ({{cluster_environment}})"}] - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "s"}}, - "gridPos": {"h": 4, "w": 8, "x": 8, "y": 20}, - "title": "Backup Job Duration (Last 7d)", - "type": "timeseries", - "targets": [{"expr": "kube_job_status_completion_time{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"} - kube_job_status_start_time{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{job_name}}"}], - "options": {"legend": {"displayMode": "table"}} - }, - { - "datasource": {"type": "prometheus"}, - "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}}}, - "gridPos": {"h": 4, "w": 8, "x": 16, "y": 20}, - "title": "Failed Backup Jobs (Active)", - "type": "stat", - "targets": [{"expr": "sum by(cluster_environment, job_name) (kube_job_status_failed{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"})", "legendFormat": "{{job_name}} ({{cluster_environment}})"}] - }, - { - "collapsed": false, - "gridPos": {"h": 1, "w": 24, "x": 0, "y": 24}, - "title": "Logs", - "type": "row" - }, - { - "datasource": {"type": "victoriametrics-logs-datasource"}, - "gridPos": {"h": 10, "w": 24, "x": 0, "y": 25}, - "title": "Recent Errors (all namespaces)", - "type": "logs", - "targets": [{"expr": "{cluster_environment=~\"$cluster_environment\"} error OR Error OR ERROR OR panic OR PANIC", "refId": "A"}], - "options": {"showTime": true, "showLabels": true, "showCommonLabels": false, "wrapLogMessage": true, "prettifyLogMessage": false, "enableLogDetails": true, "sortOrder": "Descending", "dedupStrategy": "none"} - } - ], - "schemaVersion": 39, - "tags": ["edp", "platform", "overview"], - "templating": { - "list": [ - { - "current": {"selected": true, "text": "All", "value": "$__all"}, - "datasource": {"type": "prometheus"}, - "definition": "label_values(up, cluster_environment)", - "includeAll": true, - "multi": true, - "name": "cluster_environment", - "label": "Environment", - "query": "label_values(up, cluster_environment)", - "refresh": 2, - "sort": 1, - "type": "query" - } - ] - }, - "time": {"from": "now-6h", "to": "now"}, - "title": "EDP Platform Overview", - "uid": "edp-platform-overview" - } diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/victoria-logs.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/victoria-logs.yaml index c44c474..4018fbd 100644 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/victoria-logs.yaml +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/victoria-logs.yaml @@ -6,7 +6,4 @@ spec: instanceSelector: matchLabels: dashboards: "grafana" - folder: "EDP / Operations" - grafanaCom: - id: 22698 - revision: 1 + url: "https://raw.githubusercontent.com/VictoriaMetrics/VictoriaMetrics/refs/heads/master/dashboards/vm/victorialogs.json" diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml index 2cce6a3..110ee7e 100644 --- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml +++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml @@ -1,119 +1,40 @@ apiVersion: operator.victoriametrics.com/v1beta1 kind: VMRule metadata: - name: edp-platform-alerts + name: forgejo-alerts namespace: observability spec: groups: - - name: platform-health + - name: forgejo rules: - - alert: ForgejoDown - expr: sum by(cluster_environment) (up{job="forgejo-server-http"}) < 1 - for: 1m + - alert: forgejo down + expr: sum by(cluster_environment) (up{pod=~"forgejo-server-.*"}) < 1 + for: 30s labels: severity: critical + job: "{{ $labels.job }}" annotations: - summary: "Forgejo is down on {{ $labels.cluster_environment }}" - description: "Forgejo server has been unreachable for more than 1 minute in cluster {{ $labels.cluster_environment }}." - - - alert: IngressHighErrorRate - expr: | - sum by(cluster_environment) (rate(nginx_ingress_controller_requests{status=~"5.."}[5m])) - / sum by(cluster_environment) (rate(nginx_ingress_controller_requests[5m])) > 0.05 - for: 5m - labels: - severity: major - annotations: - summary: "High ingress 5xx rate on {{ $labels.cluster_environment }}" - description: "More than 5% of ingress requests are returning 5xx errors for over 5 minutes." - value: "{{ $value | humanizePercentage }}" - - - alert: NodeNotReady - expr: kube_node_status_condition{condition="Ready", status="true"} == 0 - for: 5m - labels: - severity: critical - annotations: - summary: "Node {{ $labels.node }} not ready on {{ $labels.cluster_environment }}" - description: "Node {{ $labels.node }} has been in NotReady state for more than 5 minutes." - - - alert: PodCrashLooping - expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 15 > 3 - for: 5m - labels: - severity: major - annotations: - summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} crash-looping on {{ $labels.cluster_environment }}" - description: "Pod has restarted more than 3 times in the last 15 minutes." - - - name: storage + value: "{{ $value }}" + description: 'forgejo is down in cluster environment {{ $labels.cluster_environment }}' + - name: forgejo-backup rules: - - alert: PVCUsageHigh - expr: | - 1 - (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.80 - for: 5m - labels: - severity: major - annotations: - summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} usage >80%" - description: "PVC usage is at {{ $value | humanizePercentage }} on {{ $labels.cluster_environment }}." - value: "{{ $value | humanizePercentage }}" - - - alert: PVCUsageCritical - expr: | - 1 - (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.90 - for: 5m + - alert: forgejo s3 backup job failed + expr: max by(cluster_environment) (kube_job_status_failed{job_name=~"forgejo-s3-backup-.*"}) != 0 + for: 30s labels: severity: critical + job: "{{ $labels.job }}" annotations: - summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} usage >90%" - description: "PVC is almost full at {{ $value | humanizePercentage }} on {{ $labels.cluster_environment }}. Immediate action required." - value: "{{ $value | humanizePercentage }}" - - - name: resources + value: "{{ $value }}" + description: 'forgejo s3 backup job failed in cluster environment {{ $labels.cluster_environment }}' + - name: disk-consumption-high rules: - - alert: NodeCPUHigh - expr: | - 1 - avg by(instance, cluster_environment) (rate(node_cpu_seconds_total{mode="idle"}[5m])) > 0.85 - for: 15m + - alert: disk consumption high + expr: 1-(kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.6 + for: 30s labels: severity: major + job: "{{ $labels.job }}" annotations: - summary: "Node {{ $labels.instance }} CPU >85% on {{ $labels.cluster_environment }}" - description: "Node CPU utilization has been above 85% for 15 minutes." - value: "{{ $value | humanizePercentage }}" - - - alert: NodeMemoryHigh - expr: | - 1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.90 - for: 10m - labels: - severity: major - annotations: - summary: "Node memory >90% on {{ $labels.cluster_environment }}" - description: "Node memory utilization above 90% for 10 minutes." - value: "{{ $value | humanizePercentage }}" - - - name: cluster-health - rules: - - alert: ClusterMetricsSilent - expr: | - count(up{job="kubelet"}) by (cluster_environment) < 1 - or - absent(up{job="kubelet", cluster_environment="dev"}) - for: 10m - labels: - severity: critical - annotations: - summary: "Cluster {{ $labels.cluster_environment }} stopped sending metrics" - description: "No kubelet metrics received from cluster {{ $labels.cluster_environment }} for over 10 minutes. Either vmagent is dead or the cluster is unreachable." - - - alert: ClusterAPIServerDown - expr: | - up{job="apiserver", cluster_environment=~".+"} == 0 - for: 5m - labels: - severity: critical - annotations: - summary: "API server down on {{ $labels.cluster_environment }}" - description: "Kubernetes API server scrape is failing on cluster {{ $labels.cluster_environment }}." + value: "{{ $value }}" + description: 'disk consumption of pvc {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is high in cluster environment {{ $labels.cluster_environment }}' diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/backup-alerts.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/backup-alerts.yaml deleted file mode 100644 index 259a2bf..0000000 --- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/backup-alerts.yaml +++ /dev/null @@ -1,78 +0,0 @@ -apiVersion: operator.victoriametrics.com/v1beta1 -kind: VMRule -metadata: - name: backup-alerts - namespace: observability -spec: - groups: - - name: backup-schedule-staleness - rules: - - alert: BackupCronJobNotScheduled - expr: | - time() - kube_cronjob_status_last_schedule_time{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"} - > 26 * 3600 - for: 5m - labels: - severity: critical - cronjob: "{{ $labels.cronjob }}" - annotations: - value: "{{ $value | humanizeDuration }}" - description: >- - CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been - scheduled for over 26 hours in cluster {{ $labels.cluster_environment }}. - Last schedule was {{ $value | humanizeDuration }} ago. - summary: "Backup CronJob {{ $labels.cronjob }} is stale" - - - alert: BackupCronJobNeverScheduled - expr: | - kube_cronjob_status_last_schedule_time{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"} - == 0 - for: 30m - labels: - severity: critical - cronjob: "{{ $labels.cronjob }}" - annotations: - description: >- - CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has never been - scheduled in cluster {{ $labels.cluster_environment }}. - summary: "Backup CronJob {{ $labels.cronjob }} never ran" - - - name: backup-job-failures - rules: - - alert: BackupJobFailed - expr: | - max by(cluster_environment, namespace, job_name) ( - kube_job_status_failed{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"} - ) > 0 - for: 30s - labels: - severity: critical - job_name: "{{ $labels.job_name }}" - annotations: - value: "{{ $value }}" - description: >- - Backup job {{ $labels.namespace }}/{{ $labels.job_name }} has - {{ $value }} failed pod(s) in cluster {{ $labels.cluster_environment }}. - summary: "Backup job {{ $labels.job_name }} failed" - - - name: backup-job-duration - rules: - - alert: BackupJobTooSlow - expr: | - ( - time() - kube_job_status_start_time{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"} - ) > 300 - and - kube_job_status_active{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"} > 0 - for: 1m - labels: - severity: major - job_name: "{{ $labels.job_name }}" - annotations: - value: "{{ $value | humanizeDuration }}" - description: >- - Backup job {{ $labels.namespace }}/{{ $labels.job_name }} has been - running for {{ $value | humanizeDuration }} (threshold: 5m) - in cluster {{ $labels.cluster_environment }}. This may indicate a - hung process or connectivity issue. - summary: "Backup job {{ $labels.job_name }} running too long" diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml index f73afa8..a4c6119 100644 --- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml +++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml @@ -10,4 +10,4 @@ spec: matchLabels: app.kubernetes.io/name: garm endpoints: - - port: http + - port: metrics diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/simple-user-secret.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/simple-user-secret.yaml deleted file mode 100644 index 7013863..0000000 --- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/simple-user-secret.yaml +++ /dev/null @@ -1,9 +0,0 @@ -apiVersion: v1 -kind: Secret -metadata: - name: simple-user-secret - namespace: observability -type: Opaque -data: - username: c2ltcGxlLXVzZXI= - password: c3g1Z0M3b29XYVdPT0R3RA== diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml index e1de2c6..5759093 100644 --- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml +++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml @@ -5,17 +5,13 @@ metadata: namespace: observability spec: username: simple-user - password: sx5gC7ooWaWOODwD + passwordRef: + key: password + name: simple-user-secret targetRefs: - static: url: http://vmsingle-o12y:8429 paths: ["/api/v1/write"] - - static: - url: http://vmsingle-o12y:8429 - paths: ["/api/v1/.*"] - static: url: http://vlogs-victorialogs:9428 - paths: ["/insert/elasticsearch/.*"] - - static: - url: http://vlogs-victorialogs:9428 - paths: ["/select/.*"] \ No newline at end of file + paths: ["/insert/elasticsearch/.*"] \ No newline at end of file diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/values.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/values.yaml index 4868e3a..5bb9361 100644 --- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/values.yaml +++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/values.yaml @@ -1,6 +1,6 @@ global: # -- Cluster label to use for dashboards and rules - clusterLabel: cluster_environment + clusterLabel: cluster # -- Global license configuration license: key: "" @@ -201,13 +201,13 @@ defaultRules: enabled: true rules: {} kubernetesSystemControllerManager: - create: false + enabled: false rules: {} kubeScheduler: - create: false + enabled: false rules: {} kubernetesSystemScheduler: - create: false + enabled: false rules: {} kubeStateMetrics: enabled: true