fix(observability): 🐛 fix datasource UIDs, replace cronjob dashboard, add GARM
- Remove all ${DS_VICTORIAMETRICS} uid refs from platform-overview; use
type-only datasource so grafana-operator resolves default prometheus DS
- Replace grafanaCom id:14279 cronjob dashboard with inline custom version
supporting cluster_environment variable (dev/edp/observability)
- Add new GARM runners dashboard (edp-garm), ready for when GARM metrics
are scraped; the stat panels use `or vector(0)` guards so they show 0 instead of being empty
Note: cluster_environment values confirmed as dev/edp/observability (no benchmark).
GARM metrics not yet present in VictoriaMetrics (0 series found).
This commit is contained in:
parent
6ea1e798d2
commit
076b2a16c9
3 changed files with 228 additions and 21 deletions
|
|
@ -6,6 +6,97 @@ spec:
|
|||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
grafanaCom:
|
||||
id: 14279
|
||||
revision: 1
|
||||
json: |
|
||||
{
  "uid": "edp-cronjobs",
  "title": "CronJob & Backup Monitoring",
  "tags": ["edp", "backup", "cronjob"],
  "schemaVersion": 39,
  "editable": true,
  "graphTooltip": 1,
  "time": {"from": "now-24h", "to": "now"},
  "annotations": {"list": []},
  "templating": {
    "list": [
      {
        "type": "query",
        "name": "cluster_environment",
        "label": "Environment",
        "datasource": {"type": "prometheus"},
        "definition": "label_values(kube_cronjob_info, cluster_environment)",
        "query": "label_values(kube_cronjob_info, cluster_environment)",
        "includeAll": true,
        "multi": true,
        "current": {"selected": true, "text": "All", "value": "$__all"},
        "refresh": 2,
        "sort": 1
      }
    ]
  },
  "panels": [
    {
      "type": "row",
      "title": "Backup Job Status",
      "collapsed": false,
      "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}
    },
    {
      "type": "stat",
      "title": "Time Since Last Schedule",
      "gridPos": {"h": 5, "w": 12, "x": 0, "y": 1},
      "datasource": {"type": "prometheus"},
      "fieldConfig": {
        "defaults": {
          "unit": "s",
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {"color": "green", "value": null},
              {"color": "yellow", "value": 86400},
              {"color": "red", "value": 172800}
            ]
          }
        }
      },
      "targets": [
        {
          "expr": "time() - kube_cronjob_status_last_schedule_time{cluster_environment=~\"$cluster_environment\"}",
          "legendFormat": "{{cronjob}} ({{cluster_environment}})"
        }
      ]
    },
    {
      "type": "stat",
      "title": "Failed Jobs (Active)",
      "gridPos": {"h": 5, "w": 12, "x": 12, "y": 1},
      "datasource": {"type": "prometheus"},
      "fieldConfig": {
        "defaults": {
          "unit": "short",
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {"color": "green", "value": null},
              {"color": "red", "value": 1}
            ]
          }
        }
      },
      "targets": [
        {
          "expr": "sum by(cluster_environment, job_name) (kube_job_status_failed{cluster_environment=~\"$cluster_environment\"}) > 0",
          "legendFormat": "{{job_name}} ({{cluster_environment}})"
        }
      ]
    },
    {
      "type": "row",
      "title": "CronJob Overview",
      "collapsed": false,
      "gridPos": {"h": 1, "w": 24, "x": 0, "y": 6}
    },
    {
      "type": "table",
      "title": "All CronJobs",
      "gridPos": {"h": 8, "w": 24, "x": 0, "y": 7},
      "datasource": {"type": "prometheus"},
      "fieldConfig": {
        "defaults": {"custom": {"filterable": true}},
        "overrides": [
          {
            "matcher": {"id": "byName", "options": "Suspended"},
            "properties": [
              {
                "id": "mappings",
                "value": [
                  {
                    "type": "value",
                    "options": {
                      "0": {"text": "No", "color": "green"},
                      "1": {"text": "YES", "color": "red"}
                    }
                  }
                ]
              }
            ]
          }
        ]
      },
      "targets": [
        {
          "refId": "A",
          "expr": "kube_cronjob_info{cluster_environment=~\"$cluster_environment\"}",
          "format": "table",
          "instant": true
        }
      ],
      "transformations": [
        {
          "id": "filterFieldsByName",
          "options": {"include": {"names": ["cluster_environment", "cronjob", "namespace", "schedule"]}}
        },
        {
          "id": "organize",
          "options": {
            "renameByName": {
              "cluster_environment": "Environment",
              "cronjob": "CronJob",
              "namespace": "Namespace",
              "schedule": "Schedule"
            }
          }
        }
      ]
    },
    {
      "type": "row",
      "title": "Job History",
      "collapsed": false,
      "gridPos": {"h": 1, "w": 24, "x": 0, "y": 15}
    },
    {
      "type": "timeseries",
      "title": "Job Completions (24h)",
      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
      "datasource": {"type": "prometheus"},
      "fieldConfig": {"defaults": {"unit": "short"}},
      "targets": [
        {
          "expr": "sum(kube_job_status_succeeded{cluster_environment=~\"$cluster_environment\"}) by (job_name, cluster_environment)",
          "legendFormat": "{{job_name}} ({{cluster_environment}})"
        }
      ]
    },
    {
      "type": "timeseries",
      "title": "Job Failures (24h)",
      "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
      "datasource": {"type": "prometheus"},
      "fieldConfig": {
        "defaults": {
          "unit": "short",
          "color": {"mode": "palette-classic"}
        }
      },
      "targets": [
        {
          "expr": "sum(kube_job_status_failed{cluster_environment=~\"$cluster_environment\"}) by (job_name, cluster_environment)",
          "legendFormat": "{{job_name}} ({{cluster_environment}})"
        }
      ]
    }
  ]
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,116 @@
|
|||
# GARM runners dashboard (uid "edp-garm"), delivered as a grafana-operator
# GrafanaDashboard resource with the dashboard JSON inlined below.
#
# NOTE(review): the garm_* metric and label names used in the queries are
# unverified -- the introducing commit states that no GARM series were present
# in VictoriaMetrics yet. Confirm them once GARM metrics are scraped.
#
# Stat panels append `or vector(0)` so they render 0 rather than staying empty
# while no series exist. The "Errors" stat is green at 0 and turns red once the
# 5m error rate reaches 0.001/s (roughly below a single error per 5m window),
# so a guarded 0 is not shown as an alarm.
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
  name: garm
spec:
  instanceSelector:
    matchLabels:
      dashboards: "grafana"
  json: |
    {
      "annotations": {"list": []},
      "editable": true,
      "graphTooltip": 1,
      "panels": [
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
          "title": "GARM Runner Status",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}},
          "gridPos": {"h": 5, "w": 6, "x": 0, "y": 1},
          "title": "Total Runners",
          "type": "stat",
          "targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\"}) or vector(0)", "legendFormat": "", "refId": "A"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}},
          "gridPos": {"h": 5, "w": 6, "x": 6, "y": 1},
          "title": "Idle Runners",
          "type": "stat",
          "targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\", status=\"idle\"}) or vector(0)", "legendFormat": "", "refId": "A"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "yellow", "value": null}]}}},
          "gridPos": {"h": 5, "w": 6, "x": 12, "y": 1},
          "title": "Creating",
          "type": "stat",
          "targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\", status=\"creating\"}) or vector(0)", "legendFormat": "", "refId": "A"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 0.001}]}}},
          "gridPos": {"h": 5, "w": 6, "x": 18, "y": 1},
          "title": "Errors",
          "type": "stat",
          "targets": [{"expr": "sum(rate(garm_runner_errors_total{cluster_environment=~\"$cluster_environment\"}[5m])) or vector(0)", "legendFormat": "", "refId": "A"}]
        },
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 6},
          "title": "GitHub API Rate Limits",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short", "min": 0}},
          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 7},
          "title": "Rate Limit Remaining",
          "type": "timeseries",
          "targets": [{"expr": "garm_github_rate_limit_remaining{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}", "refId": "A"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "ops"}},
          "gridPos": {"h": 8, "w": 12, "x": 12, "y": 7},
          "title": "Runner Operations Rate",
          "type": "timeseries",
          "targets": [{"expr": "sum(rate(garm_runner_operations_total{cluster_environment=~\"$cluster_environment\"}[5m])) by (cluster_environment)", "legendFormat": "{{cluster_environment}}", "refId": "A"}]
        },
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 15},
          "title": "Runner Details",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"custom": {"filterable": true}}},
          "gridPos": {"h": 8, "w": 24, "x": 0, "y": 16},
          "title": "Runner Pool Status",
          "type": "table",
          "targets": [{"expr": "garm_runner_status{cluster_environment=~\"$cluster_environment\"}", "format": "table", "instant": true, "refId": "A"}],
          "transformations": [
            {"id": "filterFieldsByName", "options": {"include": {"names": ["cluster_environment", "name", "status", "pool_owner", "pool_type", "provider"]}}},
            {"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "name": "Runner", "status": "Status", "pool_owner": "Pool Owner", "pool_type": "Type", "provider": "Provider"}}}
          ]
        }
      ],
      "schemaVersion": 39,
      "tags": ["edp", "garm", "ci-cd", "runners"],
      "templating": {
        "list": [
          {
            "current": {"selected": true, "text": "All", "value": "$__all"},
            "datasource": {"type": "prometheus"},
            "definition": "label_values(garm_runner_status, cluster_environment)",
            "includeAll": true,
            "multi": true,
            "name": "cluster_environment",
            "label": "Environment",
            "query": "label_values(garm_runner_status, cluster_environment)",
            "refresh": 2,
            "sort": 1,
            "type": "query"
          }
        ]
      },
      "time": {"from": "now-6h", "to": "now"},
      "title": "GARM Runners",
      "uid": "edp-garm"
    }
|
||||
|
|
@ -21,7 +21,7 @@ spec:
|
|||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
|
||||
"datasource": {"type": "prometheus"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [{"options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}, "type": "value"}],
|
||||
|
|
@ -34,7 +34,7 @@ spec:
|
|||
"targets": [{"expr": "sum(up{job=\"forgejo-server-http\", cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}]
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
|
||||
"datasource": {"type": "prometheus"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 3}]}
|
||||
|
|
@ -46,7 +46,7 @@ spec:
|
|||
"targets": [{"expr": "sum(rate(nginx_ingress_controller_requests{status=~\"5..\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": ""}]
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
|
||||
"datasource": {"type": "prometheus"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
|
|
@ -59,7 +59,7 @@ spec:
|
|||
"targets": [{"expr": "sum(kube_job_status_failed{namespace=\"gitea\", cluster_environment=~\"$cluster_environment\"}) or vector(0)", "legendFormat": ""}]
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
|
||||
"datasource": {"type": "prometheus"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percentunit",
|
||||
|
|
@ -72,7 +72,7 @@ spec:
|
|||
"targets": [{"expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": ""}]
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
|
||||
"datasource": {"type": "prometheus"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percentunit",
|
||||
|
|
@ -85,7 +85,7 @@ spec:
|
|||
"targets": [{"expr": "1 - sum(node_memory_MemAvailable_bytes{cluster_environment=~\"$cluster_environment\"}) / sum(node_memory_MemTotal_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}]
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
|
||||
"datasource": {"type": "prometheus"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percentunit",
|
||||
|
|
@ -104,7 +104,7 @@ spec:
|
|||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
|
||||
"datasource": {"type": "prometheus"},
|
||||
"fieldConfig": {"defaults": {"unit": "short"}},
|
||||
"gridPos": {"h": 4, "w": 4, "x": 0, "y": 6},
|
||||
"title": "Repositories",
|
||||
|
|
@ -112,7 +112,7 @@ spec:
|
|||
"targets": [{"expr": "gitea_repositories{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
|
||||
"datasource": {"type": "prometheus"},
|
||||
"fieldConfig": {"defaults": {"unit": "short"}},
|
||||
"gridPos": {"h": 4, "w": 4, "x": 4, "y": 6},
|
||||
"title": "Users",
|
||||
|
|
@ -120,7 +120,7 @@ spec:
|
|||
"targets": [{"expr": "gitea_users{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
|
||||
"datasource": {"type": "prometheus"},
|
||||
"fieldConfig": {"defaults": {"unit": "short"}},
|
||||
"gridPos": {"h": 4, "w": 4, "x": 8, "y": 6},
|
||||
"title": "Organizations",
|
||||
|
|
@ -128,7 +128,7 @@ spec:
|
|||
"targets": [{"expr": "gitea_organizations{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
|
||||
"datasource": {"type": "prometheus"},
|
||||
"fieldConfig": {"defaults": {"unit": "short"}},
|
||||
"gridPos": {"h": 4, "w": 4, "x": 12, "y": 6},
|
||||
"title": "Open Issues",
|
||||
|
|
@ -136,7 +136,7 @@ spec:
|
|||
"targets": [{"expr": "gitea_issues_open{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
|
||||
"datasource": {"type": "prometheus"},
|
||||
"fieldConfig": {"defaults": {"unit": "short"}},
|
||||
"gridPos": {"h": 4, "w": 4, "x": 16, "y": 6},
|
||||
"title": "Webhooks",
|
||||
|
|
@ -144,7 +144,7 @@ spec:
|
|||
"targets": [{"expr": "gitea_webhooks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
|
||||
"datasource": {"type": "prometheus"},
|
||||
"fieldConfig": {"defaults": {"unit": "short"}},
|
||||
"gridPos": {"h": 4, "w": 4, "x": 20, "y": 6},
|
||||
"title": "Mirrors",
|
||||
|
|
@ -158,7 +158,7 @@ spec:
|
|||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
|
||||
"datasource": {"type": "prometheus"},
|
||||
"fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}},
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 11},
|
||||
"title": "Node CPU Usage",
|
||||
|
|
@ -166,7 +166,7 @@ spec:
|
|||
"targets": [{"expr": "1 - rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m])", "legendFormat": "{{instance}}"}]
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
|
||||
"datasource": {"type": "prometheus"},
|
||||
"fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}},
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 11},
|
||||
"title": "PVC Usage by Claim",
|
||||
|
|
@ -180,7 +180,7 @@ spec:
|
|||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
|
||||
"datasource": {"type": "prometheus"},
|
||||
"fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}}},
|
||||
"gridPos": {"h": 4, "w": 8, "x": 0, "y": 20},
|
||||
"title": "Time Since Last Backup Schedule",
|
||||
|
|
@ -188,7 +188,7 @@ spec:
|
|||
"targets": [{"expr": "time() - kube_cronjob_status_last_schedule_time{cronjob=~\"forgejo-s3-backup|secrets-backup\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cronjob}} ({{cluster_environment}})"}]
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
|
||||
"datasource": {"type": "prometheus"},
|
||||
"fieldConfig": {"defaults": {"unit": "s"}},
|
||||
"gridPos": {"h": 4, "w": 8, "x": 8, "y": 20},
|
||||
"title": "Backup Job Duration (Last 7d)",
|
||||
|
|
@ -197,7 +197,7 @@ spec:
|
|||
"options": {"legend": {"displayMode": "table"}}
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
|
||||
"datasource": {"type": "prometheus"},
|
||||
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}}},
|
||||
"gridPos": {"h": 4, "w": 8, "x": 16, "y": 20},
|
||||
"title": "Failed Backup Jobs (Active)",
|
||||
|
|
@ -211,7 +211,7 @@ spec:
|
|||
"list": [
|
||||
{
|
||||
"current": {"selected": true, "text": "All", "value": "$__all"},
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"},
|
||||
"datasource": {"type": "prometheus"},
|
||||
"definition": "label_values(up, cluster_environment)",
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue