From 3bc8a7444bcabb3ca4ba71205f63be60468c6d3c Mon Sep 17 00:00:00 2001 From: Daniel Sy Date: Fri, 19 Jun 2026 12:58:06 +0200 Subject: [PATCH] =?UTF-8?q?feat(observability):=20=E2=9C=A8=20add=20dashbo?= =?UTF-8?q?ards,=20scrape=20configs,=20and=20fix=20victoria-logs=20to=20te?= =?UTF-8?q?mplate?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add new Grafana dashboard CRs to grafana-operator/manifests: - platform-overview, forgejo, argocd-operational, cronjob-monitoring Fix victoria-logs dashboard to use grafana.com marketplace (id: 22698) instead of raw GitHub URL Add hub-side scrape configs to victoria-k8s-stack/manifests: - argocd-scrape, garm-scrape, coredns-scrape, ci-sustainability-rules Add client-side forgejo VMServiceScrape to observability-client/vm-client-stack/manifests Enable ArgoCD metrics endpoints in core/argocd/values.yaml (required by argocd-scrape) --- template/stacks/core/argocd/values.yaml | 16 ++ .../manifests/forgejo-scrape.yaml | 15 ++ .../manifests/argocd-operational.yaml | 11 + .../manifests/cronjob-monitoring.yaml | 11 + .../grafana-operator/manifests/forgejo.yaml | 184 ++++++++++++++ .../manifests/platform-overview.yaml | 230 ++++++++++++++++++ .../manifests/victoria-logs.yaml | 4 +- .../manifests/argocd-scrape.yaml | 13 + .../manifests/ci-sustainability-rules.yaml | 61 +++++ .../manifests/coredns-scrape.yaml | 30 +++ .../manifests/garm-scrape.yaml | 13 + 11 files changed, 587 insertions(+), 1 deletion(-) create mode 100644 template/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml create mode 100644 template/stacks/observability/grafana-operator/manifests/argocd-operational.yaml create mode 100644 template/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml create mode 100644 template/stacks/observability/grafana-operator/manifests/forgejo.yaml create mode 100644 template/stacks/observability/grafana-operator/manifests/platform-overview.yaml create 
mode 100644 template/stacks/observability/victoria-k8s-stack/manifests/argocd-scrape.yaml create mode 100644 template/stacks/observability/victoria-k8s-stack/manifests/ci-sustainability-rules.yaml create mode 100644 template/stacks/observability/victoria-k8s-stack/manifests/coredns-scrape.yaml create mode 100644 template/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml diff --git a/template/stacks/core/argocd/values.yaml b/template/stacks/core/argocd/values.yaml index d197745..f2495ec 100644 --- a/template/stacks/core/argocd/values.yaml +++ b/template/stacks/core/argocd/values.yaml @@ -40,3 +40,19 @@ notifications: dex: enabled: false + +controller: + metrics: + enabled: true + +server: + metrics: + enabled: true + +repoServer: + metrics: + enabled: true + +applicationSet: + metrics: + enabled: true diff --git a/template/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml b/template/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml new file mode 100644 index 0000000..aecf517 --- /dev/null +++ b/template/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml @@ -0,0 +1,15 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: forgejo + namespace: observability +spec: + namespaceSelector: + matchNames: + - gitea + selector: + matchLabels: + app.kubernetes.io/name: forgejo + endpoints: + - port: http + path: /metrics diff --git a/template/stacks/observability/grafana-operator/manifests/argocd-operational.yaml b/template/stacks/observability/grafana-operator/manifests/argocd-operational.yaml new file mode 100644 index 0000000..b3fa256 --- /dev/null +++ b/template/stacks/observability/grafana-operator/manifests/argocd-operational.yaml @@ -0,0 +1,11 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: argocd-operational +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + grafanaCom: + id: 19993 + 
revision: 2 diff --git a/template/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml b/template/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml new file mode 100644 index 0000000..e77eb20 --- /dev/null +++ b/template/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml @@ -0,0 +1,11 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: cronjob-monitoring +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + grafanaCom: + id: 14279 + revision: 1 diff --git a/template/stacks/observability/grafana-operator/manifests/forgejo.yaml b/template/stacks/observability/grafana-operator/manifests/forgejo.yaml new file mode 100644 index 0000000..bf566a5 --- /dev/null +++ b/template/stacks/observability/grafana-operator/manifests/forgejo.yaml @@ -0,0 +1,184 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: forgejo +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + json: | + { + "annotations": {"list": []}, + "editable": true, + "graphTooltip": 1, + "panels": [ + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "title": "Forgejo Health", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"mappings": [{"options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}, "type": "value"}], "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]}}}, + "gridPos": {"h": 4, "w": 4, "x": 0, "y": 1}, + "title": "Status", + "type": "stat", + "targets": [{"expr": "up{job=\"forgejo-server-http\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 4, "y": 1}, + "title": "Version", + "type": "stat", + 
"targets": [{"expr": "gitea_build_info{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{version}}"}], + "options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "name"} + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 8, "y": 1}, + "title": "Repositories", + "type": "stat", + "targets": [{"expr": "gitea_repositories{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 12, "y": 1}, + "title": "Users", + "type": "stat", + "targets": [{"expr": "gitea_users{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 16, "y": 1}, + "title": "Organizations", + "type": "stat", + "targets": [{"expr": "gitea_organizations{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 1}, + "title": "Teams", + "type": "stat", + "targets": [{"expr": "gitea_teams{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 5}, + "title": "Activity", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 6, "x": 0, "y": 6}, + "title": "Open Issues", + "type": "stat", + "targets": [{"expr": "gitea_issues_open{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": 
{"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 6, "x": 6, "y": 6}, + "title": "Closed Issues", + "type": "stat", + "targets": [{"expr": "gitea_issues_closed{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 6, "x": 12, "y": 6}, + "title": "Webhooks", + "type": "stat", + "targets": [{"expr": "gitea_webhooks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 6, "x": 18, "y": 6}, + "title": "Hook Tasks", + "type": "stat", + "targets": [{"expr": "gitea_hooktasks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 10}, + "title": "Content & Auth", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 0, "y": 11}, + "title": "Stars", + "type": "stat", + "targets": [{"expr": "gitea_stars{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 4, "y": 11}, + "title": "Watches", + "type": "stat", + "targets": [{"expr": "gitea_watches{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 8, "y": 11}, + "title": "Releases", + "type": "stat", + "targets": [{"expr": "gitea_releases{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + 
"datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 12, "y": 11}, + "title": "Mirrors", + "type": "stat", + "targets": [{"expr": "gitea_mirrors{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 16, "y": 11}, + "title": "Public Keys", + "type": "stat", + "targets": [{"expr": "gitea_publickeys{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 11}, + "title": "OAuth Apps", + "type": "stat", + "targets": [{"expr": "gitea_oauths{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + } + ], + "schemaVersion": 39, + "tags": ["edp", "forgejo", "gitea"], + "templating": { + "list": [ + { + "current": {"selected": true, "text": "All", "value": "$__all"}, + "datasource": {"type": "prometheus"}, + "definition": "label_values(gitea_repositories, cluster_environment)", + "includeAll": true, + "multi": true, + "name": "cluster_environment", + "label": "Environment", + "query": "label_values(gitea_repositories, cluster_environment)", + "refresh": 2, + "type": "query" + } + ] + }, + "time": {"from": "now-6h", "to": "now"}, + "title": "Forgejo", + "uid": "edp-forgejo" + } diff --git a/template/stacks/observability/grafana-operator/manifests/platform-overview.yaml b/template/stacks/observability/grafana-operator/manifests/platform-overview.yaml new file mode 100644 index 0000000..d4102fb --- /dev/null +++ b/template/stacks/observability/grafana-operator/manifests/platform-overview.yaml @@ -0,0 +1,230 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: platform-overview +spec: + 
instanceSelector: + matchLabels: + dashboards: "grafana" + json: | + { + "annotations": {"list": []}, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "title": "Platform Health", + "type": "row" + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": { + "defaults": { + "mappings": [{"options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}, "type": "value"}], + "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 0, "y": 1}, + "title": "Forgejo", + "type": "stat", + "targets": [{"expr": "min(up{job=\"forgejo-server-http\", cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": { + "defaults": { + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 3}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 4, "y": 1}, + "title": "Ingress 5xx (5m)", + "type": "stat", + "targets": [{"expr": "sum(rate(nginx_ingress_controller_requests{status=~\"5..\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 8, "y": 1}, + "title": "Failed Jobs (24h)", + "type": "stat", + "targets": [{"expr": "sum(max_over_time(kube_job_status_failed{namespace=\"gitea\", cluster_environment=~\"$cluster_environment\"}[24h])) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus", "uid":
"${DS_VICTORIAMETRICS}"}, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.7}, {"color": "red", "value": 0.85}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 12, "y": 1}, + "title": "Cluster CPU Usage", + "type": "stat", + "targets": [{"expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.7}, {"color": "red", "value": 0.85}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 16, "y": 1}, + "title": "Cluster Memory Usage", + "type": "stat", + "targets": [{"expr": "1 - sum(node_memory_MemAvailable_bytes{cluster_environment=~\"$cluster_environment\"}) / sum(node_memory_MemTotal_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.6}, {"color": "red", "value": 0.8}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 1}, + "title": "Max PVC Usage", + "type": "stat", + "targets": [{"expr": "max(1 - kubelet_volume_stats_available_bytes{cluster_environment=~\"$cluster_environment\"} / kubelet_volume_stats_capacity_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 5}, + "title": "Forgejo", + "type": "row" + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 
4, "w": 4, "x": 0, "y": 6}, + "title": "Repositories", + "type": "stat", + "targets": [{"expr": "gitea_repositories{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 4, "y": 6}, + "title": "Users", + "type": "stat", + "targets": [{"expr": "gitea_users{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 8, "y": 6}, + "title": "Organizations", + "type": "stat", + "targets": [{"expr": "gitea_organizations{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 12, "y": 6}, + "title": "Open Issues", + "type": "stat", + "targets": [{"expr": "gitea_issues_open{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 16, "y": 6}, + "title": "Webhooks", + "type": "stat", + "targets": [{"expr": "gitea_webhooks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 6}, + "title": "Mirrors", + "type": "stat", + "targets": [{"expr": "gitea_mirrors{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "collapsed": 
false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 10}, + "title": "Resources", + "type": "row" + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 11}, + "title": "Node CPU Usage", + "type": "timeseries", + "targets": [{"expr": "1 - avg by(instance, cluster_environment) (rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": "{{instance}}"}] + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 11}, + "title": "PVC Usage by Claim", + "type": "timeseries", + "targets": [{"expr": "1 - (kubelet_volume_stats_available_bytes{cluster_environment=~\"$cluster_environment\"} / kubelet_volume_stats_capacity_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": "{{namespace}}/{{persistentvolumeclaim}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 19}, + "title": "Backups", + "type": "row" + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}}}, + "gridPos": {"h": 4, "w": 8, "x": 0, "y": 20}, + "title": "Time Since Last Backup Schedule", + "type": "stat", + "targets": [{"expr": "time() - kube_cronjob_status_last_schedule_time{cronjob=~\"forgejo-s3-backup|secrets-backup\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cronjob}} ({{cluster_environment}})"}] + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": {"defaults": {"unit": "s"}}, + "gridPos": {"h": 4, "w": 8, "x": 8, "y": 20}, + "title": "Backup Job Duration (Last 7d)", +
"type": "timeseries", + "targets": [{"expr": "kube_job_status_completion_time{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"} - kube_job_status_start_time{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{job_name}}"}], + "options": {"legend": {"displayMode": "table"}}, "timeFrom": "7d" + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}}}, + "gridPos": {"h": 4, "w": 8, "x": 16, "y": 20}, + "title": "Failed Backup Jobs (Active)", + "type": "stat", + "targets": [{"expr": "sum by(cluster_environment, job_name) (kube_job_status_failed{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"})", "legendFormat": "{{job_name}} ({{cluster_environment}})"}] + } + ], + "schemaVersion": 39, + "tags": ["edp", "platform", "overview"], + "templating": { + "list": [{"name": "DS_VICTORIAMETRICS", "label": "Datasource", "type": "datasource", "query": "prometheus"}, + { + "current": {"selected": true, "text": "All", "value": "$__all"}, + "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "definition": "label_values(up, cluster_environment)", + "includeAll": true, + "multi": true, + "name": "cluster_environment", + "label": "Environment", + "query": "label_values(up, cluster_environment)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": {"from": "now-6h", "to": "now"}, + "title": "EDP Platform Overview", + "uid": "edp-platform-overview" + } diff --git a/template/stacks/observability/grafana-operator/manifests/victoria-logs.yaml b/template/stacks/observability/grafana-operator/manifests/victoria-logs.yaml index 4018fbd..819dec7 100644 --- a/template/stacks/observability/grafana-operator/manifests/victoria-logs.yaml +++ b/template/stacks/observability/grafana-operator/manifests/victoria-logs.yaml @@ -6,4 +6,6 @@
spec: instanceSelector: matchLabels: dashboards: "grafana" - url: "https://raw.githubusercontent.com/VictoriaMetrics/VictoriaMetrics/refs/heads/master/dashboards/vm/victorialogs.json" + grafanaCom: + id: 22698 + revision: 1 diff --git a/template/stacks/observability/victoria-k8s-stack/manifests/argocd-scrape.yaml b/template/stacks/observability/victoria-k8s-stack/manifests/argocd-scrape.yaml new file mode 100644 index 0000000..0517321 --- /dev/null +++ b/template/stacks/observability/victoria-k8s-stack/manifests/argocd-scrape.yaml @@ -0,0 +1,13 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: argocd +spec: + namespaceSelector: + matchNames: + - argocd + selector: + matchLabels: + app.kubernetes.io/part-of: argocd + endpoints: + - port: http-metrics diff --git a/template/stacks/observability/victoria-k8s-stack/manifests/ci-sustainability-rules.yaml b/template/stacks/observability/victoria-k8s-stack/manifests/ci-sustainability-rules.yaml new file mode 100644 index 0000000..2290b99 --- /dev/null +++ b/template/stacks/observability/victoria-k8s-stack/manifests/ci-sustainability-rules.yaml @@ -0,0 +1,61 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + name: ci-sustainability +spec: + groups: + - name: ci.sustainability.daily + interval: 5m + rules: + - record: ci:cpu_seconds:increase1d + expr: | + sum by(namespace, cluster) ( + increase(container_cpu_usage_seconds_total{ + namespace=~"gitea|garm", + pod=~"forgejo-runner.*|garm-.*", + container!="" + }[1d]) + ) + - record: ci:memory_bytes_seconds:avg1d + expr: | + avg_over_time( + sum by(namespace, cluster) ( + container_memory_working_set_bytes{ + namespace=~"gitea|garm", + pod=~"forgejo-runner.*|garm-.*", + container!="" + } + )[1d:5m] + ) + - record: ci:pod_count:avg1d + expr: | + avg_over_time( + count by(namespace, cluster) ( + kube_pod_info{ + namespace=~"gitea|garm", + pod=~"forgejo-runner.*|garm-.*" + } + )[1d:5m] + ) + - record: 
ci:pod_creations:increase1d + expr: | + sum by(namespace, cluster) ( + changes(kube_pod_start_time{ + namespace=~"gitea|garm", + pod=~"forgejo-runner.*|garm-.*" + }[1d]) + ) + - name: ci.sustainability.cluster + interval: 5m + rules: + - record: cluster:cpu_seconds:rate5m + expr: | + sum by(cluster) ( + rate(node_cpu_seconds_total{mode!="idle"}[5m]) + ) + - record: cluster:memory_used_bytes:sum + expr: | + sum by(cluster) ( + node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes + ) + diff --git a/template/stacks/observability/victoria-k8s-stack/manifests/coredns-scrape.yaml b/template/stacks/observability/victoria-k8s-stack/manifests/coredns-scrape.yaml new file mode 100644 index 0000000..77cef00 --- /dev/null +++ b/template/stacks/observability/victoria-k8s-stack/manifests/coredns-scrape.yaml @@ -0,0 +1,30 @@ +apiVersion: v1 +kind: Service +metadata: + name: coredns-metrics + namespace: kube-system + labels: + k8s-app: coredns-metrics +spec: + clusterIP: None + selector: + k8s-app: coredns + ports: + - name: metrics + port: 9153 + targetPort: 9153 + protocol: TCP +--- +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: coredns +spec: + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + k8s-app: coredns-metrics + endpoints: + - port: metrics diff --git a/template/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml b/template/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml new file mode 100644 index 0000000..f73afa8 --- /dev/null +++ b/template/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml @@ -0,0 +1,13 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: garm +spec: + namespaceSelector: + matchNames: + - garm + selector: + matchLabels: + app.kubernetes.io/name: garm + endpoints: + - port: http