diff --git a/otc/benchmark.t09.de/stacks/core/argocd/values.yaml b/otc/benchmark.t09.de/stacks/core/argocd/values.yaml index a6521b0..1591cc9 100644 --- a/otc/benchmark.t09.de/stacks/core/argocd/values.yaml +++ b/otc/benchmark.t09.de/stacks/core/argocd/values.yaml @@ -35,6 +35,30 @@ configs: tls: certificates: +controller: + metrics: + enabled: true + serviceMonitor: + enabled: false + +server: + metrics: + enabled: true + serviceMonitor: + enabled: false + +repoServer: + metrics: + enabled: true + serviceMonitor: + enabled: false + +applicationSet: + metrics: + enabled: true + serviceMonitor: + enabled: false + notifications: enabled: false diff --git a/otc/benchmark.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml b/otc/benchmark.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml index ed54cb0..b99a903 100644 --- a/otc/benchmark.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml +++ b/otc/benchmark.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml @@ -11,8 +11,8 @@ spec: startingDeadlineSeconds: 600 # 10 minutes jobTemplate: spec: - # 60 min until backup - 10 min start - (backoffLimit * activeDeadlineSeconds) - some time sync buffer - activeDeadlineSeconds: 1350 + # 4h window (14400 s): handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m + activeDeadlineSeconds: 14400 backoffLimit: 2 ttlSecondsAfterFinished: 259200 # template: diff --git a/otc/benchmark.t09.de/stacks/observability-client/vector/values.yaml b/otc/benchmark.t09.de/stacks/observability-client/vector/values.yaml index 3fb5e53..2393b1a 100644 --- a/otc/benchmark.t09.de/stacks/observability-client/vector/values.yaml +++ b/otc/benchmark.t09.de/stacks/observability-client/vector/values.yaml @@ -48,7 +48,7 @@ customConfig: type: elasticsearch inputs: [parser] endpoints: - - https://o12y.observability./insert/elasticsearch/ + - 
https://o12y.observability.buildth.ing/insert/elasticsearch/ auth: strategy: basic user: ${VECTOR_USER} diff --git a/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/manifests/.gitkeep b/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/manifests/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml b/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml new file mode 100644 index 0000000..aecf517 --- /dev/null +++ b/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml @@ -0,0 +1,15 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: forgejo + namespace: observability +spec: + namespaceSelector: + matchNames: + - gitea + selector: + matchLabels: + app.kubernetes.io/name: forgejo + endpoints: + - port: http + path: /metrics diff --git a/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/values.yaml b/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/values.yaml index dde927b..7f6dd00 100644 --- a/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/values.yaml +++ b/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/values.yaml @@ -201,13 +201,13 @@ defaultRules: create: true rules: {} kubernetesSystemControllerManager: - create: true + create: false rules: {} kubeScheduler: - create: true + create: false rules: {} kubernetesSystemScheduler: - create: true + create: false rules: {} kubeStateMetrics: create: true @@ -778,7 +778,7 @@ vmagent: # -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec) additionalRemoteWrites: # [] - - url: https://o12y.observability./api/v1/write + - url: https://o12y.observability.buildth.ing/api/v1/write basicAuth: username: 
name: simple-user-secret diff --git a/otc/dev.t09.de/stacks/core/argocd/values.yaml b/otc/dev.t09.de/stacks/core/argocd/values.yaml index dd5b83d..cb856f0 100644 --- a/otc/dev.t09.de/stacks/core/argocd/values.yaml +++ b/otc/dev.t09.de/stacks/core/argocd/values.yaml @@ -35,6 +35,30 @@ configs: tls: certificates: +controller: + metrics: + enabled: true + serviceMonitor: + enabled: false + +server: + metrics: + enabled: true + serviceMonitor: + enabled: false + +repoServer: + metrics: + enabled: true + serviceMonitor: + enabled: false + +applicationSet: + metrics: + enabled: true + serviceMonitor: + enabled: false + notifications: enabled: false diff --git a/otc/dev.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml b/otc/dev.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml index de14801..a8e236f 100644 --- a/otc/dev.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml +++ b/otc/dev.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml @@ -11,8 +11,8 @@ spec: startingDeadlineSeconds: 600 # 10 minutes jobTemplate: spec: - # 60 min until backup - 10 min start - (backoffLimit * activeDeadlineSeconds) - some time sync buffer - activeDeadlineSeconds: 1350 + # 4h window (14400 s): handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m + activeDeadlineSeconds: 14400 backoffLimit: 2 ttlSecondsAfterFinished: 259200 # template: diff --git a/otc/dev.t09.de/stacks/garm/garm/values.yaml b/otc/dev.t09.de/stacks/garm/garm/values.yaml index 5baed69..41fc84c 100644 --- a/otc/dev.t09.de/stacks/garm/garm/values.yaml +++ b/otc/dev.t09.de/stacks/garm/garm/values.yaml @@ -41,5 +41,8 @@ providerConfig: sidecarImage: edp.buildth.ing/devfw-cicd/ci-sizer-collector:0.0.4 garm: + metrics: + enable: true + disableAuth: true logging: logLevel: info diff --git a/otc/dev.t09.de/stacks/observability-client/vector/values.yaml 
b/otc/dev.t09.de/stacks/observability-client/vector/values.yaml index c0644cf..4d7458a 100644 --- a/otc/dev.t09.de/stacks/observability-client/vector/values.yaml +++ b/otc/dev.t09.de/stacks/observability-client/vector/values.yaml @@ -48,7 +48,7 @@ customConfig: type: elasticsearch inputs: [parser] endpoints: - - https://o12y.observability./insert/elasticsearch/ + - https://o12y.observability.buildth.ing/insert/elasticsearch/ auth: strategy: basic user: ${VECTOR_USER} diff --git a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/.gitkeep b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/argocd-scrape.yaml b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/argocd-scrape.yaml new file mode 100644 index 0000000..2e9248f --- /dev/null +++ b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/argocd-scrape.yaml @@ -0,0 +1,14 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: argocd + namespace: observability +spec: + namespaceSelector: + matchNames: + - argocd + selector: + matchLabels: + app.kubernetes.io/part-of: argocd + endpoints: + - port: http-metrics diff --git a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml new file mode 100644 index 0000000..aecf517 --- /dev/null +++ b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml @@ -0,0 +1,15 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: forgejo + namespace: observability +spec: + namespaceSelector: + matchNames: + - gitea + selector: + matchLabels: + app.kubernetes.io/name: forgejo + endpoints: + - port: http + path: /metrics diff --git 
a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/garm-scrape.yaml b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/garm-scrape.yaml new file mode 100644 index 0000000..9904e86 --- /dev/null +++ b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/garm-scrape.yaml @@ -0,0 +1,15 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: garm + namespace: observability +spec: + namespaceSelector: + matchNames: + - garm + selector: + matchLabels: + app.kubernetes.io/name: garm + endpoints: + - port: http + path: /metrics diff --git a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml index f85a786..c6d6b3a 100644 --- a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml +++ b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml @@ -201,13 +201,13 @@ defaultRules: create: true rules: {} kubernetesSystemControllerManager: - create: true + create: false rules: {} kubeScheduler: - create: true + create: false rules: {} kubernetesSystemScheduler: - create: true + create: false rules: {} kubeStateMetrics: create: true @@ -778,7 +778,7 @@ vmagent: # -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec) additionalRemoteWrites: # [] - - url: https://o12y.observability./api/v1/write + - url: https://o12y.observability.buildth.ing/api/v1/write basicAuth: username: name: simple-user-secret @@ -801,6 +801,20 @@ vmagent: # Do not store original labels in vmagent's memory by default. This reduces the amount of memory used by vmagent # but makes vmagent debugging UI less informative. 
See: https://docs.victoriametrics.com/vmagent/#relabel-debug promscrape.dropOriginalLabels: "true" + # Harden liveness probe: default failureThreshold=10 masked a 72h silent outage + livenessProbe: + httpGet: + path: /health + port: http + failureThreshold: 3 + periodSeconds: 5 + timeoutSeconds: 5 + startupProbe: + httpGet: + path: /health + port: http + failureThreshold: 30 + periodSeconds: 5 # -- (object) VMAgent ingress configuration ingress: enabled: false diff --git a/otc/dev.t09.de/stacks/observability/grafana-operator/manifests/grafana.yaml b/otc/dev.t09.de/stacks/observability/grafana-operator/manifests/grafana.yaml index 1e8b038..17d6046 100644 --- a/otc/dev.t09.de/stacks/observability/grafana-operator/manifests/grafana.yaml +++ b/otc/dev.t09.de/stacks/observability/grafana-operator/manifests/grafana.yaml @@ -35,8 +35,10 @@ spec: server: root_url: "https://grafana.dev.t09.de" auth: - disable_login: "true" disable_login_form: "true" + security: + admin_user: admin + admin_password: admin auth.generic_oauth: enabled: "true" name: Forgejo diff --git a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack.yaml b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack.yaml index 3011a2f..d7599b9 100644 --- a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack.yaml +++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack.yaml @@ -9,10 +9,13 @@ spec: project: default syncPolicy: automated: + prune: true selfHeal: true syncOptions: - CreateNamespace=true - ServerSideApply=true + - RespectIgnoreDifferences=true + - SkipDryRunOnMissingResource=true destination: name: in-cluster namespace: observability diff --git a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml index 6fc8de6..4b5807e 100644 --- a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml +++ 
b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml @@ -11,4 +11,4 @@ spec: matchLabels: app.kubernetes.io/name: garm endpoints: - - port: metrics + - port: http diff --git a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vlogs.yaml b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vlogs.yaml index 8657ac8..72e13d1 100644 --- a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vlogs.yaml +++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vlogs.yaml @@ -1,5 +1,5 @@ apiVersion: operator.victoriametrics.com/v1beta1 -kind: VLSingle +kind: VLogs metadata: name: victorialogs namespace: observability diff --git a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml index 5759093..a4f0368 100644 --- a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml +++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml @@ -12,6 +12,12 @@ spec: - static: url: http://vmsingle-o12y:8429 paths: ["/api/v1/write"] + - static: + url: http://vmsingle-o12y:8429 + paths: ["/api/v1/.*"] - static: url: http://vlogs-victorialogs:9428 - paths: ["/insert/elasticsearch/.*"] \ No newline at end of file + paths: ["/insert/elasticsearch/.*"] + - static: + url: http://vlogs-victorialogs:9428 + paths: ["/select/.*"] \ No newline at end of file diff --git a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/values.yaml b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/values.yaml index d407910..e7bffbc 100644 --- a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/values.yaml +++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/values.yaml @@ -28,10 +28,7 @@ victoria-metrics-operator: crds: plain: true cleanup: - enabled: true - image: - repository: bitnami/kubectl - pullPolicy: IfNotPresent + enabled: false # disabled: cleanup 
hook can't schedule on resource-constrained nodes (Insufficient cpu / Too many pods) serviceMonitor: enabled: true operator: @@ -676,7 +673,7 @@ vmalert: vmauth: # -- Enable VMAuth CR - enabled: true + enabled: false # -- VMAuth annotations annotations: {} # -- (object) Full spec for VMAuth CRD. Allowed values described [here](https://docs.victoriametrics.com/operator/api#vmauthspec) @@ -699,7 +696,7 @@ vmauth: vmagent: # -- Create VMAgent CR - enabled: false + enabled: true # -- VMAgent annotations annotations: {} # -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec) @@ -711,7 +708,8 @@ vmagent: port: "8429" selectAllByDefault: true scrapeInterval: 20s - externalLabels: {} + externalLabels: + cluster_environment: "dev" # For multi-cluster setups it is useful to use "cluster" label to identify the metrics source. # For example: # cluster: cluster-name diff --git a/otc/edp.buildth.ing/stacks/core/argocd/values.yaml b/otc/edp.buildth.ing/stacks/core/argocd/values.yaml index 019dc65..c1bde64 100644 --- a/otc/edp.buildth.ing/stacks/core/argocd/values.yaml +++ b/otc/edp.buildth.ing/stacks/core/argocd/values.yaml @@ -35,6 +35,30 @@ configs: tls: certificates: +controller: + metrics: + enabled: true + serviceMonitor: + enabled: false + +server: + metrics: + enabled: true + serviceMonitor: + enabled: false + +repoServer: + metrics: + enabled: true + serviceMonitor: + enabled: false + +applicationSet: + metrics: + enabled: true + serviceMonitor: + enabled: false + notifications: enabled: false diff --git a/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml b/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml index 71f1649..dd51f5b 100644 --- a/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml +++ 
b/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml @@ -11,8 +11,8 @@ spec: startingDeadlineSeconds: 600 # 10 minutes jobTemplate: spec: - # 60 min until backup - 10 min start - (backoffLimit * activeDeadlineSeconds) - some time sync buffer - activeDeadlineSeconds: 1350 + # 4h window (14400 s): handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m + activeDeadlineSeconds: 14400 backoffLimit: 2 ttlSecondsAfterFinished: 259200 # template: diff --git a/otc/edp.buildth.ing/stacks/observability-client/vector/values.yaml b/otc/edp.buildth.ing/stacks/observability-client/vector/values.yaml index 7b30cdc..2fefacd 100644 --- a/otc/edp.buildth.ing/stacks/observability-client/vector/values.yaml +++ b/otc/edp.buildth.ing/stacks/observability-client/vector/values.yaml @@ -48,7 +48,7 @@ customConfig: type: elasticsearch inputs: [parser] endpoints: - - https://o12y.observability./insert/elasticsearch/ + - https://o12y.observability.buildth.ing/insert/elasticsearch/ auth: strategy: basic user: ${VECTOR_USER} diff --git a/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/manifests/.gitkeep b/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/manifests/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml b/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml new file mode 100644 index 0000000..aecf517 --- /dev/null +++ b/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml @@ -0,0 +1,15 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: forgejo + namespace: observability +spec: + namespaceSelector: + matchNames: + - gitea + selector: + matchLabels: + app.kubernetes.io/name: forgejo + endpoints: + - port: http + path: /metrics diff --git 
a/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/values.yaml b/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/values.yaml index 4e1c079..a7ba239 100644 --- a/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/values.yaml +++ b/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/values.yaml @@ -201,13 +201,13 @@ defaultRules: create: true rules: {} kubernetesSystemControllerManager: - create: true + create: false rules: {} kubeScheduler: - create: true + create: false rules: {} kubernetesSystemScheduler: - create: true + create: false rules: {} kubeStateMetrics: create: true @@ -778,7 +778,7 @@ vmagent: # -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec) additionalRemoteWrites: # [] - - url: https://o12y.observability./api/v1/write + - url: https://o12y.observability.buildth.ing/api/v1/write basicAuth: username: name: simple-user-secret diff --git a/otc/observability.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml b/otc/observability.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml index 842a7cc..9f86064 100644 --- a/otc/observability.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml +++ b/otc/observability.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml @@ -11,8 +11,8 @@ spec: startingDeadlineSeconds: 600 # 10 minutes jobTemplate: spec: - # 60 min until backup - 10 min start - (backoffLimit * activeDeadlineSeconds) - some time sync buffer - activeDeadlineSeconds: 1350 + # 4h window (14400 s): handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m + activeDeadlineSeconds: 14400 backoffLimit: 2 ttlSecondsAfterFinished: 259200 # template: diff --git 
a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd-operational.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd-operational.yaml new file mode 100644 index 0000000..9130b42 --- /dev/null +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd-operational.yaml @@ -0,0 +1,153 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: argocd-operational +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + folder: "EDP / Applications" + json: | + { + "annotations": {"list": []}, + "editable": true, + "graphTooltip": 1, + "panels": [ + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "title": "Application Status", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}}, + "gridPos": {"h": 4, "w": 4, "x": 0, "y": 1}, + "title": "Total Apps", + "type": "stat", + "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}}, + "gridPos": {"h": 4, "w": 4, "x": 4, "y": 1}, + "title": "Healthy", + "type": "stat", + "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", health_status=\"Healthy\"})", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}]}}}, + "gridPos": {"h": 4, "w": 4, "x": 8, "y": 1}, + "title": "Degraded", + "type": "stat", + "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", 
health_status=\"Degraded\"}) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}}, + "gridPos": {"h": 4, "w": 4, "x": 12, "y": 1}, + "title": "Synced", + "type": "stat", + "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", sync_status=\"Synced\"})", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "yellow", "value": null}]}}}, + "gridPos": {"h": 4, "w": 4, "x": 16, "y": 1}, + "title": "OutOfSync", + "type": "stat", + "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", sync_status=\"OutOfSync\"}) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "orange", "value": null}]}}}, + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 1}, + "title": "Progressing", + "type": "stat", + "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", health_status=\"Progressing\"}) or vector(0)", "legendFormat": ""}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 5}, + "title": "Application Details", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": { + "defaults": {"custom": {"filterable": true}}, + "overrides": [ + {"matcher": {"id": "byName", "options": "Health"}, "properties": [{"id": "custom.cellOptions", "value": {"type": "color-text"}}, {"id": "mappings", "value": [{"options": {"Healthy": {"color": "green", "text": "Healthy"}, "Degraded": {"color": "red", "text": "Degraded"}, "Progressing": {"color": "yellow", "text": "Progressing"}, "Missing": {"color": "purple", "text": "Missing"}}, "type": 
"value"}]}]}, + {"matcher": {"id": "byName", "options": "Sync"}, "properties": [{"id": "custom.cellOptions", "value": {"type": "color-text"}}, {"id": "mappings", "value": [{"options": {"Synced": {"color": "green", "text": "Synced"}, "OutOfSync": {"color": "orange", "text": "OutOfSync"}}, "type": "value"}]}]} + ] + }, + "gridPos": {"h": 12, "w": 24, "x": 0, "y": 6}, + "title": "All Applications", + "type": "table", + "targets": [{"expr": "argocd_app_info{cluster_environment=~\"$cluster_environment\"}", "format": "table", "instant": true, "legendFormat": ""}], + "transformations": [ + {"id": "filterFieldsByName", "options": {"include": {"names": ["cluster_environment", "name", "dest_namespace", "health_status", "sync_status", "repo"]}}}, + {"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "name": "Application", "dest_namespace": "Namespace", "health_status": "Health", "sync_status": "Sync", "repo": "Repository"}}} + ] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 18}, + "title": "Sync Activity", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "ops"}}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 19}, + "title": "Sync Operations (rate)", + "type": "timeseries", + "targets": [{"expr": "sum(rate(argocd_app_sync_total{cluster_environment=~\"$cluster_environment\"}[5m])) by (name, phase)", "legendFormat": "{{name}} ({{phase}})"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "ops"}}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 19}, + "title": "Reconciliation Rate", + "type": "timeseries", + "targets": [{"expr": "sum(rate(argocd_app_reconcile_count{cluster_environment=~\"$cluster_environment\"}[5m])) by (namespace)", "legendFormat": "{{namespace}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 27}, + "title": "ArgoCD Logs", + "type": "row" + }, + { + "datasource": {"type": 
"victoriametrics-logs-datasource"}, + "gridPos": {"h": 10, "w": 24, "x": 0, "y": 28}, + "title": "ArgoCD Logs", + "type": "logs", + "targets": [{"expr": "{cluster_environment=~\"$cluster_environment\", kubernetes.namespace=\"argocd\"}", "refId": "A"}], + "options": {"showTime": true, "showLabels": true, "wrapLogMessage": true, "enableLogDetails": true, "sortOrder": "Descending"} + } + ], + "schemaVersion": 39, + "tags": ["edp", "argocd", "gitops"], + "templating": { + "list": [ + { + "current": {"selected": true, "text": "All", "value": "$__all"}, + "datasource": {"type": "prometheus"}, + "definition": "label_values(argocd_app_info, cluster_environment)", + "includeAll": true, + "multi": true, + "name": "cluster_environment", + "label": "Environment", + "query": "label_values(argocd_app_info, cluster_environment)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": {"from": "now-6h", "to": "now"}, + "title": "ArgoCD Operations", + "uid": "edp-argocd-ops" + } diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd.yaml index b348ff7..2b81b2b 100644 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd.yaml +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd.yaml @@ -6,4 +6,5 @@ spec: instanceSelector: matchLabels: dashboards: "grafana" + folder: "EDP / Applications" url: "https://raw.githubusercontent.com/argoproj/argo-cd/refs/heads/master/examples/dashboard.json" diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml new file mode 100644 index 0000000..ddcc883 --- /dev/null +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml 
@@ -0,0 +1,103 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: cronjob-monitoring +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + folder: "EDP / Operations" + json: | + { + "annotations": {"list": []}, + "editable": true, + "graphTooltip": 1, + "panels": [ + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "title": "Backup Job Status", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}}}, + "gridPos": {"h": 5, "w": 12, "x": 0, "y": 1}, + "title": "Time Since Last Schedule", + "type": "stat", + "targets": [{"expr": "time() - kube_cronjob_status_last_schedule_time{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cronjob}} ({{cluster_environment}})"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}}}, + "gridPos": {"h": 5, "w": 12, "x": 12, "y": 1}, + "title": "Failed Jobs (Active)", + "type": "stat", + "targets": [{"expr": "sum by(cluster_environment, job_name) (kube_job_status_failed{cluster_environment=~\"$cluster_environment\"}) > 0", "legendFormat": "{{job_name}} ({{cluster_environment}})"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 6}, + "title": "CronJob Overview", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"custom": {"filterable": true}}, "overrides": [{"matcher": {"id": "byName", "options": "Suspended"}, "properties": [{"id": "mappings", "value": [{"options": {"0": {"text": "No", "color": "green"}, "1": {"text": "YES", "color": "red"}}, "type": "value"}]}]}]}, + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 
7}, + "title": "All CronJobs", + "type": "table", + "targets": [ + {"expr": "kube_cronjob_info{cluster_environment=~\"$cluster_environment\"}", "format": "table", "instant": true, "refId": "A"} + ], + "transformations": [ + {"id": "filterFieldsByName", "options": {"include": {"names": ["cluster_environment", "cronjob", "namespace", "schedule"]}}}, + {"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "cronjob": "CronJob", "namespace": "Namespace", "schedule": "Schedule"}}} + ] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 15}, + "title": "Job History", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16}, + "title": "Job Completions (24h)", + "type": "timeseries", + "targets": [{"expr": "sum(kube_job_status_succeeded{cluster_environment=~\"$cluster_environment\"}) by (job_name, cluster_environment)", "legendFormat": "{{job_name}} ({{cluster_environment}})"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "color": {"mode": "palette-classic"}}}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16}, + "title": "Job Failures (24h)", + "type": "timeseries", + "targets": [{"expr": "sum(kube_job_status_failed{cluster_environment=~\"$cluster_environment\"}) by (job_name, cluster_environment)", "legendFormat": "{{job_name}} ({{cluster_environment}})"}] + } + ], + "schemaVersion": 39, + "tags": ["edp", "backup", "cronjob"], + "templating": { + "list": [ + { + "current": {"selected": true, "text": "All", "value": "$__all"}, + "datasource": {"type": "prometheus"}, + "definition": "label_values(kube_cronjob_info, cluster_environment)", + "includeAll": true, + "multi": true, + "name": "cluster_environment", + "label": "Environment", + "query": "label_values(kube_cronjob_info, cluster_environment)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + 
"time": {"from": "now-24h", "to": "now"}, + "title": "CronJob & Backup Monitoring", + "uid": "edp-cronjobs" + } diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/forgejo.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/forgejo.yaml new file mode 100644 index 0000000..ec40751 --- /dev/null +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/forgejo.yaml @@ -0,0 +1,207 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: forgejo +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + folder: "EDP / Applications" + json: | + { + "annotations": {"list": []}, + "editable": true, + "graphTooltip": 1, + "panels": [ + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "title": "Forgejo Health", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"mappings": [{"options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}, "type": "value"}], "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]}}}, + "gridPos": {"h": 4, "w": 4, "x": 0, "y": 1}, + "title": "Status", + "type": "stat", + "targets": [{"expr": "up{job=\"forgejo-server-http\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 4, "y": 1}, + "title": "Version", + "type": "stat", + "targets": [{"expr": "gitea_build_info{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{version}}"}], + "options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "name"} + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 8, "y": 1}, + "title": 
"Repositories", + "type": "stat", + "targets": [{"expr": "gitea_repositories{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 12, "y": 1}, + "title": "Users", + "type": "stat", + "targets": [{"expr": "gitea_users{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 16, "y": 1}, + "title": "Organizations", + "type": "stat", + "targets": [{"expr": "gitea_organizations{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 1}, + "title": "Teams", + "type": "stat", + "targets": [{"expr": "gitea_teams{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 5}, + "title": "Activity", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 6, "x": 0, "y": 6}, + "title": "Open Issues", + "type": "stat", + "targets": [{"expr": "gitea_issues_open{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 6, "x": 6, "y": 6}, + "title": "Closed Issues", + "type": "stat", + "targets": [{"expr": "gitea_issues_closed{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, 
+ "gridPos": {"h": 4, "w": 6, "x": 12, "y": 6}, + "title": "Webhooks", + "type": "stat", + "targets": [{"expr": "gitea_webhooks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 6, "x": 18, "y": 6}, + "title": "Hook Tasks", + "type": "stat", + "targets": [{"expr": "gitea_hooktasks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 10}, + "title": "Content & Auth", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 0, "y": 11}, + "title": "Stars", + "type": "stat", + "targets": [{"expr": "gitea_stars{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 4, "y": 11}, + "title": "Watches", + "type": "stat", + "targets": [{"expr": "gitea_watches{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 8, "y": 11}, + "title": "Releases", + "type": "stat", + "targets": [{"expr": "gitea_releases{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 12, "y": 11}, + "title": "Mirrors", + "type": "stat", + "targets": [{"expr": "gitea_mirrors{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + 
"fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 16, "y": 11}, + "title": "Public Keys", + "type": "stat", + "targets": [{"expr": "gitea_publickeys{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 11}, + "title": "OAuth Apps", + "type": "stat", + "targets": [{"expr": "gitea_oauths{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 15}, + "title": "Forgejo Logs", + "type": "row" + }, + { + "datasource": {"type": "victoriametrics-logs-datasource"}, + "gridPos": {"h": 10, "w": 12, "x": 0, "y": 16}, + "title": "Forgejo Server Logs", + "type": "logs", + "targets": [{"expr": "{cluster_environment=~\"$cluster_environment\", kubernetes.namespace=\"gitea\"}", "refId": "A"}], + "options": {"showTime": true, "showLabels": true, "wrapLogMessage": true, "enableLogDetails": true, "sortOrder": "Descending"} + }, + { + "datasource": {"type": "victoriametrics-logs-datasource"}, + "gridPos": {"h": 10, "w": 12, "x": 12, "y": 16}, + "title": "Forgejo Errors", + "type": "logs", + "targets": [{"expr": "{cluster_environment=~\"$cluster_environment\", kubernetes.namespace=\"gitea\"} error OR Error OR ERROR OR panic", "refId": "A"}], + "options": {"showTime": true, "showLabels": true, "wrapLogMessage": true, "enableLogDetails": true, "sortOrder": "Descending"} + } + ], + "schemaVersion": 39, + "tags": ["edp", "forgejo", "gitea"], + "templating": { + "list": [ + { + "current": {"selected": true, "text": "All", "value": "$__all"}, + "datasource": {"type": "prometheus"}, + "definition": "label_values(gitea_repositories, cluster_environment)", + "includeAll": true, + "multi": true, + "name": "cluster_environment", + "label": "Environment", + "query": 
"label_values(gitea_repositories, cluster_environment)", + "refresh": 2, + "type": "query" + } + ] + }, + "time": {"from": "now-6h", "to": "now"}, + "title": "Forgejo", + "uid": "edp-forgejo" + } diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/garm.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/garm.yaml new file mode 100644 index 0000000..2a23e20 --- /dev/null +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/garm.yaml @@ -0,0 +1,117 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: garm +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + folder: "EDP / Applications" + json: | + { + "annotations": {"list": []}, + "editable": true, + "graphTooltip": 1, + "panels": [ + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "title": "GARM Runner Status", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}}, + "gridPos": {"h": 5, "w": 6, "x": 0, "y": 1}, + "title": "Total Runners", + "type": "stat", + "targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\"}) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}}, + "gridPos": {"h": 5, "w": 6, "x": 6, "y": 1}, + "title": "Idle Runners", + "type": "stat", + "targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\", status=\"idle\"}) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "yellow", "value": null}]}}}, + "gridPos": 
{"h": 5, "w": 6, "x": 12, "y": 1}, + "title": "Creating", + "type": "stat", + "targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\", status=\"creating\"}) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}]}}}, + "gridPos": {"h": 5, "w": 6, "x": 18, "y": 1}, + "title": "Errors", + "type": "stat", + "targets": [{"expr": "sum(rate(garm_runner_errors_total{cluster_environment=~\"$cluster_environment\"}[5m])) or vector(0)", "legendFormat": ""}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 6}, + "title": "GitHub API Rate Limits", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "min": 0}}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 7}, + "title": "Rate Limit Remaining", + "type": "timeseries", + "targets": [{"expr": "garm_github_rate_limit_remaining{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "ops"}}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 7}, + "title": "Runner Operations Rate", + "type": "timeseries", + "targets": [{"expr": "sum(rate(garm_runner_operations_total{cluster_environment=~\"$cluster_environment\"}[5m])) by (cluster_environment)", "legendFormat": "{{cluster_environment}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 15}, + "title": "Runner Details", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"custom": {"filterable": true}}}, + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 16}, + "title": "Runner Pool Status", + "type": "table", + "targets": [{"expr": "garm_runner_status{cluster_environment=~\"$cluster_environment\"}", "format": "table", "instant": 
true}], + "transformations": [ + {"id": "filterFieldsByName", "options": {"include": {"names": ["cluster_environment", "name", "status", "pool_owner", "pool_type", "provider"]}}}, + {"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "name": "Runner", "status": "Status", "pool_owner": "Pool Owner", "pool_type": "Type", "provider": "Provider"}}} + ] + } + ], + "schemaVersion": 39, + "tags": ["edp", "garm", "ci-cd", "runners"], + "templating": { + "list": [ + { + "current": {"selected": true, "text": "All", "value": "$__all"}, + "datasource": {"type": "prometheus"}, + "definition": "label_values(garm_runner_status, cluster_environment)", + "includeAll": true, + "multi": true, + "name": "cluster_environment", + "label": "Environment", + "query": "label_values(garm_runner_status, cluster_environment)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": {"from": "now-6h", "to": "now"}, + "title": "GARM Runners", + "uid": "edp-garm" + } diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/ingress-nginx.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/ingress-nginx.yaml index c13d6a2..077edd8 100644 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/ingress-nginx.yaml +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/ingress-nginx.yaml @@ -6,4 +6,5 @@ spec: instanceSelector: matchLabels: dashboards: "grafana" + folder: "EDP / Operations" url: "https://raw.githubusercontent.com/adinhodovic/ingress-nginx-mixin/refs/heads/main/dashboards_out/ingress-nginx-overview.json" diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml new file mode 100644 index 0000000..ffce4e2 --- /dev/null +++ 
b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml @@ -0,0 +1,245 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: platform-overview +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + folder: "EDP / Overview" + json: | + { + "annotations": {"list": []}, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "title": "Platform Health", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": { + "defaults": { + "mappings": [{"options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}, "type": "value"}], + "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 0, "y": 1}, + "title": "Forgejo", + "type": "stat", + "targets": [{"expr": "sum(up{job=\"forgejo-server-http\", cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": { + "defaults": { + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 3}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 4, "y": 1}, + "title": "Ingress 5xx (5m)", + "type": "stat", + "targets": [{"expr": "sum(rate(nginx_ingress_controller_requests{status=~\"5..\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 8, "y": 1}, + "title": "Failed Jobs (24h)", + "type": "stat", + "targets": [{"expr": 
"sum(kube_job_status_failed{namespace=\"gitea\", cluster_environment=~\"$cluster_environment\"}) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.7}, {"color": "red", "value": 0.85}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 12, "y": 1}, + "title": "Cluster CPU Usage", + "type": "stat", + "targets": [{"expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.7}, {"color": "red", "value": 0.85}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 16, "y": 1}, + "title": "Cluster Memory Usage", + "type": "stat", + "targets": [{"expr": "1 - sum(node_memory_MemAvailable_bytes{cluster_environment=~\"$cluster_environment\"}) / sum(node_memory_MemTotal_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.6}, {"color": "red", "value": 0.8}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 1}, + "title": "Max PVC Usage", + "type": "stat", + "targets": [{"expr": "max(1 - kubelet_volume_stats_available_bytes{cluster_environment=~\"$cluster_environment\"} / kubelet_volume_stats_capacity_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 5}, + "title": "Forgejo", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + 
"fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 0, "y": 6}, + "title": "Repositories", + "type": "stat", + "targets": [{"expr": "gitea_repositories{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 4, "y": 6}, + "title": "Users", + "type": "stat", + "targets": [{"expr": "gitea_users{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 8, "y": 6}, + "title": "Organizations", + "type": "stat", + "targets": [{"expr": "gitea_organizations{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 12, "y": 6}, + "title": "Open Issues", + "type": "stat", + "targets": [{"expr": "gitea_issues_open{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 16, "y": 6}, + "title": "Webhooks", + "type": "stat", + "targets": [{"expr": "gitea_webhooks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 6}, + "title": "Mirrors", + "type": "stat", + "targets": [{"expr": "gitea_mirrors{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 10}, + "title": "Resources", + "type": 
"row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 11}, + "title": "Node CPU Usage", + "type": "timeseries", + "targets": [{"expr": "1 - avg by(instance, cluster_environment) (rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": "{{instance}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 11}, + "title": "PVC Usage by Claim", + "type": "timeseries", + "targets": [{"expr": "1 - (kubelet_volume_stats_available_bytes{cluster_environment=~\"$cluster_environment\"} / kubelet_volume_stats_capacity_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": "{{namespace}}/{{persistentvolumeclaim}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 19}, + "title": "Backups", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}}}, + "gridPos": {"h": 4, "w": 8, "x": 0, "y": 20}, + "title": "Time Since Last Backup Schedule", + "type": "stat", + "targets": [{"expr": "time() - kube_cronjob_status_last_schedule_time{cronjob=~\"forgejo-s3-backup|secrets-backup\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cronjob}} ({{cluster_environment}})"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "s"}}, + "gridPos": {"h": 4, "w": 8, "x": 8, "y": 20}, + "title": "Backup Job Duration (Last 7d)", + "type": "timeseries", + "targets": [{"expr": "kube_job_status_completion_time{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"} -
kube_job_status_start_time{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{job_name}}"}], + "options": {"legend": {"displayMode": "table"}} + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}}}, + "gridPos": {"h": 4, "w": 8, "x": 16, "y": 20}, + "title": "Failed Backup Jobs (Active)", + "type": "stat", + "targets": [{"expr": "sum by(cluster_environment, job_name) (kube_job_status_failed{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"})", "legendFormat": "{{job_name}} ({{cluster_environment}})"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 24}, + "title": "Logs", + "type": "row" + }, + { + "datasource": {"type": "victoriametrics-logs-datasource"}, + "gridPos": {"h": 10, "w": 24, "x": 0, "y": 25}, + "title": "Recent Errors (all namespaces)", + "type": "logs", + "targets": [{"expr": "{cluster_environment=~\"$cluster_environment\"} error OR Error OR ERROR OR panic OR PANIC", "refId": "A"}], + "options": {"showTime": true, "showLabels": true, "showCommonLabels": false, "wrapLogMessage": true, "prettifyLogMessage": false, "enableLogDetails": true, "sortOrder": "Descending", "dedupStrategy": "none"} + } + ], + "schemaVersion": 39, + "tags": ["edp", "platform", "overview"], + "templating": { + "list": [ + { + "current": {"selected": true, "text": "All", "value": "$__all"}, + "datasource": {"type": "prometheus"}, + "definition": "label_values(up, cluster_environment)", + "includeAll": true, + "multi": true, + "name": "cluster_environment", + "label": "Environment", + "query": "label_values(up, cluster_environment)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": {"from": "now-6h", "to": "now"}, + "title": "EDP Platform Overview", + "uid": 
"edp-platform-overview" + } diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/victoria-logs.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/victoria-logs.yaml index 4018fbd..c44c474 100644 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/victoria-logs.yaml +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/victoria-logs.yaml @@ -6,4 +6,7 @@ spec: instanceSelector: matchLabels: dashboards: "grafana" - url: "https://raw.githubusercontent.com/VictoriaMetrics/VictoriaMetrics/refs/heads/master/dashboards/vm/victorialogs.json" + folder: "EDP / Operations" + grafanaCom: + id: 22698 + revision: 1 diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml index 110ee7e..2cce6a3 100644 --- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml +++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml @@ -1,40 +1,119 @@ apiVersion: operator.victoriametrics.com/v1beta1 kind: VMRule metadata: - name: forgejo-alerts + name: edp-platform-alerts namespace: observability spec: groups: - - name: forgejo + - name: platform-health rules: - - alert: forgejo down - expr: sum by(cluster_environment) (up{pod=~"forgejo-server-.*"}) < 1 - for: 30s + - alert: ForgejoDown + expr: sum by(cluster_environment) (up{job="forgejo-server-http"}) < 1 + for: 1m labels: severity: critical - job: "{{ $labels.job }}" annotations: - value: "{{ $value }}" - description: 'forgejo is down in cluster environment {{ $labels.cluster_environment }}' - - name: forgejo-backup - rules: - - alert: forgejo s3 backup job failed - expr: max by(cluster_environment) (kube_job_status_failed{job_name=~"forgejo-s3-backup-.*"}) != 0 - for: 30s - labels: - severity: 
critical - job: "{{ $labels.job }}" - annotations: - value: "{{ $value }}" - description: 'forgejo s3 backup job failed in cluster environment {{ $labels.cluster_environment }}' - - name: disk-consumption-high - rules: - - alert: disk consumption high - expr: 1-(kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.6 - for: 30s + summary: "Forgejo is down on {{ $labels.cluster_environment }}" + description: "Forgejo server has been unreachable for more than 1 minute in cluster {{ $labels.cluster_environment }}." + + - alert: IngressHighErrorRate + expr: | + sum by(cluster_environment) (rate(nginx_ingress_controller_requests{status=~"5.."}[5m])) + / sum by(cluster_environment) (rate(nginx_ingress_controller_requests[5m])) > 0.05 + for: 5m labels: severity: major - job: "{{ $labels.job }}" annotations: - value: "{{ $value }}" - description: 'disk consumption of pvc {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is high in cluster environment {{ $labels.cluster_environment }}' + summary: "High ingress 5xx rate on {{ $labels.cluster_environment }}" + description: "More than 5% of ingress requests are returning 5xx errors for over 5 minutes." + value: "{{ $value | humanizePercentage }}" + + - alert: NodeNotReady + expr: kube_node_status_condition{condition="Ready", status="true"} == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Node {{ $labels.node }} not ready on {{ $labels.cluster_environment }}" + description: "Node {{ $labels.node }} has been in NotReady state for more than 5 minutes." + + - alert: PodCrashLooping + expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 15 > 3 + for: 5m + labels: + severity: major + annotations: + summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} crash-looping on {{ $labels.cluster_environment }}" + description: "Pod has restarted more than 3 times in the last 15 minutes." 
+ + - name: storage + rules: + - alert: PVCUsageHigh + expr: | + 1 - (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.80 + for: 5m + labels: + severity: major + annotations: + summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} usage >80%" + description: "PVC usage is at {{ $value | humanizePercentage }} on {{ $labels.cluster_environment }}." + value: "{{ $value | humanizePercentage }}" + + - alert: PVCUsageCritical + expr: | + 1 - (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.90 + for: 5m + labels: + severity: critical + annotations: + summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} usage >90%" + description: "PVC is almost full at {{ $value | humanizePercentage }} on {{ $labels.cluster_environment }}. Immediate action required." + value: "{{ $value | humanizePercentage }}" + + - name: resources + rules: + - alert: NodeCPUHigh + expr: | + 1 - avg by(instance, cluster_environment) (rate(node_cpu_seconds_total{mode="idle"}[5m])) > 0.85 + for: 15m + labels: + severity: major + annotations: + summary: "Node {{ $labels.instance }} CPU >85% on {{ $labels.cluster_environment }}" + description: "Node CPU utilization has been above 85% for 15 minutes." + value: "{{ $value | humanizePercentage }}" + + - alert: NodeMemoryHigh + expr: | + 1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.90 + for: 10m + labels: + severity: major + annotations: + summary: "Node {{ $labels.instance }} memory >90% on {{ $labels.cluster_environment }}" + description: "Node memory utilization above 90% for 10 minutes."
+            value: "{{ $value | humanizePercentage }}"
+
+    - name: cluster-health  # detects whole clusters going quiet, not individual targets
+      rules:
+        - alert: ClusterMetricsSilent  # first branch: clusters seen within the last day but absent now; second branch: `dev` missing entirely
+          expr: |
+            count by (cluster_environment) (count_over_time(up{job="kubelet"}[1d])) unless count by (cluster_environment) (up{job="kubelet"})
+            or
+            absent(up{job="kubelet", cluster_environment="dev"})
+          for: 10m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Cluster {{ $labels.cluster_environment }} stopped sending metrics"
+            description: "No kubelet metrics received from cluster {{ $labels.cluster_environment }} for over 10 minutes. Either vmagent is dead or the cluster is unreachable."
+
+        - alert: ClusterAPIServerDown  # the =~".+" matcher restricts this to series that carry a cluster_environment label
+          expr: |
+            up{job="apiserver", cluster_environment=~".+"} == 0
+          for: 5m
+          labels:
+            severity: critical
+          annotations:
+            summary: "API server down on {{ $labels.cluster_environment }}"
+            description: "Kubernetes API server scrape is failing on cluster {{ $labels.cluster_environment }}."
diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/backup-alerts.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/backup-alerts.yaml
new file mode 100644
index 0000000..259a2bf
--- /dev/null
+++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/backup-alerts.yaml
@@ -0,0 +1,78 @@
+apiVersion: operator.victoriametrics.com/v1beta1
+kind: VMRule
+metadata:
+  name: backup-alerts
+  namespace: observability
+spec:
+  groups:
+    - name: backup-schedule-staleness  # alerts on the backup CronJobs' schedule timestamps
+      rules:
+        - alert: BackupCronJobNotScheduled  # $value is the age of the last schedule in seconds
+          expr: |
+            time() - kube_cronjob_status_last_schedule_time{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"}
+            > 26 * 3600
+          for: 5m
+          labels:
+            severity: critical
+            cronjob: "{{ $labels.cronjob }}"
+          annotations:
+            value: "{{ $value | humanizeDuration }}"
+            description: >-
+              CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been
+              scheduled for over 26 hours in cluster {{ $labels.cluster_environment }}.
+              Last schedule was {{ $value | humanizeDuration }} ago.
+            summary: "Backup CronJob {{ $labels.cronjob }} is stale"
+
+        - alert: BackupCronJobNeverScheduled  # CronJob object exists but exposes no last-schedule series at all
+          expr: |
+            kube_cronjob_info{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"}
+            unless on(cluster_environment, namespace, cronjob) kube_cronjob_status_last_schedule_time
+          for: 30m
+          labels:
+            severity: critical
+            cronjob: "{{ $labels.cronjob }}"
+          annotations:
+            description: >-
+              CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has never been
+              scheduled in cluster {{ $labels.cluster_environment }}.
+            summary: "Backup CronJob {{ $labels.cronjob }} never ran"
+
+    - name: backup-job-failures  # failed pods on Jobs spawned by the backup CronJobs
+      rules:
+        - alert: BackupJobFailed  # max by(...) collapses duplicate series to one alert per job
+          expr: |
+            max by(cluster_environment, namespace, job_name) (
+              kube_job_status_failed{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"}
+            ) > 0
+          for: 30s
+          labels:
+            severity: critical
+            job_name: "{{ $labels.job_name }}"
+          annotations:
+            value: "{{ $value }}"
+            description: >-
+              Backup job {{ $labels.namespace }}/{{ $labels.job_name }} has
+              {{ $value }} failed pod(s) in cluster {{ $labels.cluster_environment }}.
+            summary: "Backup job {{ $labels.job_name }} failed"
+
+    - name: backup-job-duration  # long-running backup Jobs that are still active
+      rules:
+        - alert: BackupJobTooSlow  # 300 s threshold is the "5m" quoted in the description; $value is the runtime in seconds
+          expr: |
+            (
+              time() - kube_job_status_start_time{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"}
+            ) > 300
+            and
+            kube_job_status_active{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"} > 0
+          for: 1m
+          labels:
+            severity: major
+            job_name: "{{ $labels.job_name }}"
+          annotations:
+            value: "{{ $value | humanizeDuration }}"
+            description: >-
+              Backup job {{ $labels.namespace }}/{{ $labels.job_name }} has been
+              running for {{ $value | humanizeDuration }} (threshold: 5m)
+              in cluster {{ $labels.cluster_environment }}. This may indicate a
+              hung process or connectivity issue.
+            summary: "Backup job {{ $labels.job_name }} running too long"
diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml
index a4c6119..f73afa8 100644
--- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml
+++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml
@@ -10,4 +10,4 @@ spec:
     matchLabels:
       app.kubernetes.io/name: garm
   endpoints:
-    - port: metrics
+    - port: http  # NOTE(review): must equal a named port on the garm Service — confirm
diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/simple-user-secret.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/simple-user-secret.yaml
new file mode 100644
index 0000000..7013863
--- /dev/null
+++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/simple-user-secret.yaml
@@ -0,0 +1,9 @@
+apiVersion: v1
+kind: Secret
+metadata:
+  name: simple-user-secret
+  namespace: observability
+type: Opaque
+data:  # NOTE(review): base64 is an encoding, not encryption; these credentials are readable from VCS — rotate and source them from a secret store
+  username: c2ltcGxlLXVzZXI=
+  password: c3g1Z0M3b29XYVdPT0R3RA==
diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml
index 5759093..e1de2c6 100644
--- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml
+++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml
@@ -5,13 +5,19 @@ metadata:
   namespace: observability
 spec:
   username: simple-user
   passwordRef:
     key: password
     name: simple-user-secret
   targetRefs:
     - static:
         url: http://vmsingle-o12y:8429
       paths: ["/api/v1/write"]
+    - static:
+        url: http://vmsingle-o12y:8429
+      paths: ["/api/v1/.*"]  # NOTE(review): this regex also covers /api/v1/write and any /api/v1/admin/... path — confirm this user should reach them
     - static:
         url: http://vlogs-victorialogs:9428
-      paths: ["/insert/elasticsearch/.*"]
\ No newline at end of file
+      paths: ["/insert/elasticsearch/.*"]
+    - static:
+        url: http://vlogs-victorialogs:9428
+      paths: ["/select/.*"]  # read-side route on the same backend as the insert route above
diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/values.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/values.yaml
index 5bb9361..4868e3a 100644
--- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/values.yaml
+++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/values.yaml
@@ -1,6 +1,6 @@
 global:
   # -- Cluster label to use for dashboards and rules
-  clusterLabel: cluster
+  clusterLabel: cluster_environment
   # -- Global license configuration
   license:
     key: ""
@@ -201,13 +201,13 @@ defaultRules:
       enabled: true
       rules: {}
     kubernetesSystemControllerManager:
-      enabled: false
+      create: false  # NOTE(review): neighbouring groups in this file still use `enabled:` — confirm which key the chart reads
       rules: {}
     kubeScheduler:
-      enabled: false
+      create: false
       rules: {}
     kubernetesSystemScheduler:
-      enabled: false
+      create: false
       rules: {}
     kubeStateMetrics:
       enabled: true