fix(observability): 🐛 fix datasource UIDs, replace cronjob dashboard, add GARM

- Remove all ${DS_VICTORIAMETRICS} uid refs from platform-overview; use
  type-only datasource so grafana-operator resolves default prometheus DS
- Replace grafanaCom id:14279 cronjob dashboard with inline custom version
  supporting cluster_environment variable (dev/edp/observability)
- Add new GARM runners dashboard (edp-garm) ready for when GARM metrics
are scraped; the stat panels use `or vector(0)` guards so they show 0
instead of "No data" (timeseries and table panels stay empty until then)

Note: cluster_environment values confirmed as dev/edp/observability (no benchmark).
GARM metrics not yet present in VictoriaMetrics (0 series found).
This commit is contained in:
Daniel Sy 2026-06-19 13:11:32 +02:00
parent 6ea1e798d2
commit 076b2a16c9
Signed by untrusted user: danielsy
GPG key ID: 1F39A8BBCD2EE3D3
3 changed files with 228 additions and 21 deletions

View file

@ -6,6 +6,97 @@ spec:
instanceSelector:
matchLabels:
dashboards: "grafana"
grafanaCom:
id: 14279
revision: 1
# Inline JSON model for the "CronJob & Backup Monitoring" dashboard
# (uid: edp-cronjobs). All panel queries filter on the cluster_environment
# template variable, whose options come from kube_cronjob_info.
# The "All CronJobs" table multiplies kube_cronjob_info (info series carry
# the value 1 in kube-state-metrics) by kube_cronjob_spec_suspend, so the
# value column holds the suspend flag. That column is renamed to
# "Suspended", which is the column the No/YES value mapping targets.
# max by (...) on the right-hand side keeps the join one-to-many even if
# the suspend series is scraped more than once.
json: |
  {
    "annotations": {"list": []},
    "editable": true,
    "graphTooltip": 1,
    "panels": [
      {
        "collapsed": false,
        "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
        "title": "Backup Job Status",
        "type": "row"
      },
      {
        "datasource": {"type": "prometheus"},
        "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}}},
        "gridPos": {"h": 5, "w": 12, "x": 0, "y": 1},
        "title": "Time Since Last Schedule",
        "type": "stat",
        "targets": [{"expr": "time() - kube_cronjob_status_last_schedule_time{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cronjob}} ({{cluster_environment}})"}]
      },
      {
        "datasource": {"type": "prometheus"},
        "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}}},
        "gridPos": {"h": 5, "w": 12, "x": 12, "y": 1},
        "title": "Failed Jobs (Active)",
        "type": "stat",
        "targets": [{"expr": "sum by(cluster_environment, job_name) (kube_job_status_failed{cluster_environment=~\"$cluster_environment\"}) > 0", "legendFormat": "{{job_name}} ({{cluster_environment}})"}]
      },
      {
        "collapsed": false,
        "gridPos": {"h": 1, "w": 24, "x": 0, "y": 6},
        "title": "CronJob Overview",
        "type": "row"
      },
      {
        "datasource": {"type": "prometheus"},
        "fieldConfig": {"defaults": {"custom": {"filterable": true}}, "overrides": [{"matcher": {"id": "byName", "options": "Suspended"}, "properties": [{"id": "mappings", "value": [{"options": {"0": {"text": "No", "color": "green"}, "1": {"text": "YES", "color": "red"}}, "type": "value"}]}]}]},
        "gridPos": {"h": 8, "w": 24, "x": 0, "y": 7},
        "title": "All CronJobs",
        "type": "table",
        "targets": [
          {"expr": "kube_cronjob_info{cluster_environment=~\"$cluster_environment\"} * on (cluster_environment, namespace, cronjob) group_left () max by (cluster_environment, namespace, cronjob) (kube_cronjob_spec_suspend{cluster_environment=~\"$cluster_environment\"})", "format": "table", "instant": true, "refId": "A"}
        ],
        "transformations": [
          {"id": "filterFieldsByName", "options": {"include": {"names": ["cluster_environment", "cronjob", "namespace", "schedule", "Value"]}}},
          {"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "cronjob": "CronJob", "namespace": "Namespace", "schedule": "Schedule", "Value": "Suspended"}}}
        ]
      },
      {
        "collapsed": false,
        "gridPos": {"h": 1, "w": 24, "x": 0, "y": 15},
        "title": "Job History",
        "type": "row"
      },
      {
        "datasource": {"type": "prometheus"},
        "fieldConfig": {"defaults": {"unit": "short"}},
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
        "title": "Job Completions (24h)",
        "type": "timeseries",
        "targets": [{"expr": "sum(kube_job_status_succeeded{cluster_environment=~\"$cluster_environment\"}) by (job_name, cluster_environment)", "legendFormat": "{{job_name}} ({{cluster_environment}})"}]
      },
      {
        "datasource": {"type": "prometheus"},
        "fieldConfig": {"defaults": {"unit": "short", "color": {"mode": "palette-classic"}}},
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
        "title": "Job Failures (24h)",
        "type": "timeseries",
        "targets": [{"expr": "sum(kube_job_status_failed{cluster_environment=~\"$cluster_environment\"}) by (job_name, cluster_environment)", "legendFormat": "{{job_name}} ({{cluster_environment}})"}]
      }
    ],
    "schemaVersion": 39,
    "tags": ["edp", "backup", "cronjob"],
    "templating": {
      "list": [
        {
          "current": {"selected": true, "text": "All", "value": "$__all"},
          "datasource": {"type": "prometheus"},
          "definition": "label_values(kube_cronjob_info, cluster_environment)",
          "includeAll": true,
          "multi": true,
          "name": "cluster_environment",
          "label": "Environment",
          "query": "label_values(kube_cronjob_info, cluster_environment)",
          "refresh": 2,
          "sort": 1,
          "type": "query"
        }
      ]
    },
    "time": {"from": "now-24h", "to": "now"},
    "title": "CronJob & Backup Monitoring",
    "uid": "edp-cronjobs"
  }

View file

@ -0,0 +1,116 @@
# GrafanaDashboard for the "GARM Runners" dashboard (uid: edp-garm).
# Every panel query and the template variable filter on the
# cluster_environment label. The four stat panels append `or vector(0)` so
# they render 0 rather than "No data" while no GARM series exist.
# The Errors stat is green at 0 and turns red from 0.001 errors/s, which is
# below the roughly 1/300 rate a single error produces inside the 5m window.
# NOTE(review): "Idle Runners" filters on status="idle"; GARM may report
# idle/active state under a separate runner_status label. Confirm once the
# metrics are actually scraped.
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
  name: garm
spec:
  instanceSelector:
    matchLabels:
      dashboards: "grafana"
  json: |
    {
      "annotations": {"list": []},
      "editable": true,
      "graphTooltip": 1,
      "panels": [
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
          "title": "GARM Runner Status",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}},
          "gridPos": {"h": 5, "w": 6, "x": 0, "y": 1},
          "title": "Total Runners",
          "type": "stat",
          "targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\"}) or vector(0)", "legendFormat": ""}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}},
          "gridPos": {"h": 5, "w": 6, "x": 6, "y": 1},
          "title": "Idle Runners",
          "type": "stat",
          "targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\", status=\"idle\"}) or vector(0)", "legendFormat": ""}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "yellow", "value": null}]}}},
          "gridPos": {"h": 5, "w": 6, "x": 12, "y": 1},
          "title": "Creating",
          "type": "stat",
          "targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\", status=\"creating\"}) or vector(0)", "legendFormat": ""}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 0.001}]}}},
          "gridPos": {"h": 5, "w": 6, "x": 18, "y": 1},
          "title": "Errors",
          "type": "stat",
          "targets": [{"expr": "sum(rate(garm_runner_errors_total{cluster_environment=~\"$cluster_environment\"}[5m])) or vector(0)", "legendFormat": ""}]
        },
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 6},
          "title": "GitHub API Rate Limits",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short", "min": 0}},
          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 7},
          "title": "Rate Limit Remaining",
          "type": "timeseries",
          "targets": [{"expr": "garm_github_rate_limit_remaining{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "ops"}},
          "gridPos": {"h": 8, "w": 12, "x": 12, "y": 7},
          "title": "Runner Operations Rate",
          "type": "timeseries",
          "targets": [{"expr": "sum(rate(garm_runner_operations_total{cluster_environment=~\"$cluster_environment\"}[5m])) by (cluster_environment)", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 15},
          "title": "Runner Details",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"custom": {"filterable": true}}},
          "gridPos": {"h": 8, "w": 24, "x": 0, "y": 16},
          "title": "Runner Pool Status",
          "type": "table",
          "targets": [{"expr": "garm_runner_status{cluster_environment=~\"$cluster_environment\"}", "format": "table", "instant": true}],
          "transformations": [
            {"id": "filterFieldsByName", "options": {"include": {"names": ["cluster_environment", "name", "status", "pool_owner", "pool_type", "provider"]}}},
            {"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "name": "Runner", "status": "Status", "pool_owner": "Pool Owner", "pool_type": "Type", "provider": "Provider"}}}
          ]
        }
      ],
      "schemaVersion": 39,
      "tags": ["edp", "garm", "ci-cd", "runners"],
      "templating": {
        "list": [
          {
            "current": {"selected": true, "text": "All", "value": "$__all"},
            "datasource": {"type": "prometheus"},
            "definition": "label_values(garm_runner_status, cluster_environment)",
            "includeAll": true,
            "multi": true,
            "name": "cluster_environment",
            "label": "Environment",
            "query": "label_values(garm_runner_status, cluster_environment)",
            "refresh": 2,
            "sort": 1,
            "type": "query"
          }
        ]
      },
      "time": {"from": "now-6h", "to": "now"},
      "title": "GARM Runners",
      "uid": "edp-garm"
    }

View file

@ -21,7 +21,7 @@ spec:
"type": "row"
},
{
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
"datasource": {"type": "prometheus"},
"fieldConfig": {
"defaults": {
"mappings": [{"options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}, "type": "value"}],
@ -34,7 +34,7 @@ spec:
"targets": [{"expr": "sum(up{job=\"forgejo-server-http\", cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}]
},
{
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
"datasource": {"type": "prometheus"},
"fieldConfig": {
"defaults": {
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 3}]}
@ -46,7 +46,7 @@ spec:
"targets": [{"expr": "sum(rate(nginx_ingress_controller_requests{status=~\"5..\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": ""}]
},
{
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
"datasource": {"type": "prometheus"},
"fieldConfig": {
"defaults": {
"unit": "short",
@ -59,7 +59,7 @@ spec:
"targets": [{"expr": "sum(kube_job_status_failed{namespace=\"gitea\", cluster_environment=~\"$cluster_environment\"}) or vector(0)", "legendFormat": ""}]
},
{
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
"datasource": {"type": "prometheus"},
"fieldConfig": {
"defaults": {
"unit": "percentunit",
@ -72,7 +72,7 @@ spec:
"targets": [{"expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": ""}]
},
{
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
"datasource": {"type": "prometheus"},
"fieldConfig": {
"defaults": {
"unit": "percentunit",
@ -85,7 +85,7 @@ spec:
"targets": [{"expr": "1 - sum(node_memory_MemAvailable_bytes{cluster_environment=~\"$cluster_environment\"}) / sum(node_memory_MemTotal_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}]
},
{
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
"datasource": {"type": "prometheus"},
"fieldConfig": {
"defaults": {
"unit": "percentunit",
@ -104,7 +104,7 @@ spec:
"type": "row"
},
{
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short"}},
"gridPos": {"h": 4, "w": 4, "x": 0, "y": 6},
"title": "Repositories",
@ -112,7 +112,7 @@ spec:
"targets": [{"expr": "gitea_repositories{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
},
{
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short"}},
"gridPos": {"h": 4, "w": 4, "x": 4, "y": 6},
"title": "Users",
@ -120,7 +120,7 @@ spec:
"targets": [{"expr": "gitea_users{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
},
{
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short"}},
"gridPos": {"h": 4, "w": 4, "x": 8, "y": 6},
"title": "Organizations",
@ -128,7 +128,7 @@ spec:
"targets": [{"expr": "gitea_organizations{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
},
{
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short"}},
"gridPos": {"h": 4, "w": 4, "x": 12, "y": 6},
"title": "Open Issues",
@ -136,7 +136,7 @@ spec:
"targets": [{"expr": "gitea_issues_open{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
},
{
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short"}},
"gridPos": {"h": 4, "w": 4, "x": 16, "y": 6},
"title": "Webhooks",
@ -144,7 +144,7 @@ spec:
"targets": [{"expr": "gitea_webhooks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
},
{
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short"}},
"gridPos": {"h": 4, "w": 4, "x": 20, "y": 6},
"title": "Mirrors",
@ -158,7 +158,7 @@ spec:
"type": "row"
},
{
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 11},
"title": "Node CPU Usage",
@ -166,7 +166,7 @@ spec:
"targets": [{"expr": "1 - rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m])", "legendFormat": "{{instance}}"}]
},
{
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 11},
"title": "PVC Usage by Claim",
@ -180,7 +180,7 @@ spec:
"type": "row"
},
{
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}}},
"gridPos": {"h": 4, "w": 8, "x": 0, "y": 20},
"title": "Time Since Last Backup Schedule",
@ -188,7 +188,7 @@ spec:
"targets": [{"expr": "time() - kube_cronjob_status_last_schedule_time{cronjob=~\"forgejo-s3-backup|secrets-backup\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cronjob}} ({{cluster_environment}})"}]
},
{
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "s"}},
"gridPos": {"h": 4, "w": 8, "x": 8, "y": 20},
"title": "Backup Job Duration (Last 7d)",
@ -197,7 +197,7 @@ spec:
"options": {"legend": {"displayMode": "table"}}
},
{
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}}},
"gridPos": {"h": 4, "w": 8, "x": 16, "y": 20},
"title": "Failed Backup Jobs (Active)",
@ -211,7 +211,7 @@ spec:
"list": [
{
"current": {"selected": true, "text": "All", "value": "$__all"},
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
"datasource": {"type": "prometheus"},
"definition": "label_values(up, cluster_environment)",
"includeAll": true,
"multi": true,