diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml index e77eb20..5b5eeac 100644 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml @@ -6,6 +6,97 @@ spec: instanceSelector: matchLabels: dashboards: "grafana" - grafanaCom: - id: 14279 - revision: 1 + json: | + { + "annotations": {"list": []}, + "editable": true, + "graphTooltip": 1, + "panels": [ + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "title": "Backup Job Status", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}}}, + "gridPos": {"h": 5, "w": 12, "x": 0, "y": 1}, + "title": "Time Since Last Schedule", + "type": "stat", + "targets": [{"expr": "time() - kube_cronjob_status_last_schedule_time{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cronjob}} ({{cluster_environment}})"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}}}, + "gridPos": {"h": 5, "w": 12, "x": 12, "y": 1}, + "title": "Failed Jobs (Active)", + "type": "stat", + "targets": [{"expr": "sum by(cluster_environment, job_name) (kube_job_status_failed{cluster_environment=~\"$cluster_environment\"}) > 0", "legendFormat": "{{job_name}} ({{cluster_environment}})"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 6}, + "title": "CronJob Overview", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"custom": {"filterable": true}}, "overrides": [{"matcher": {"id": "byName", "options": "Suspended"}, "properties": [{"id": "mappings", "value": [{"options": {"0": {"text": "No", "color": "green"}, "1": {"text": "YES", "color": "red"}}, "type": "value"}]}]}]}, + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 7}, + "title": "All CronJobs", + "type": "table", + "targets": [ + {"expr": "kube_cronjob_info{cluster_environment=~\"$cluster_environment\"}", "format": "table", "instant": true, "refId": "A"} + ], + "transformations": [ + {"id": "filterFieldsByName", "options": {"include": {"names": ["cluster_environment", "cronjob", "namespace", "schedule"]}}}, + {"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "cronjob": "CronJob", "namespace": "Namespace", "schedule": "Schedule"}}} + ] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 15}, + "title": "Job History", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16}, + "title": "Job Completions (24h)", + "type": "timeseries", + "targets": [{"expr": "sum(kube_job_status_succeeded{cluster_environment=~\"$cluster_environment\"}) by (job_name, cluster_environment)", "legendFormat": "{{job_name}} ({{cluster_environment}})"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "color": {"mode": "palette-classic"}}}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16}, + "title": "Job Failures (24h)", + "type": "timeseries", + "targets": [{"expr": "sum(kube_job_status_failed{cluster_environment=~\"$cluster_environment\"}) by (job_name, cluster_environment)", "legendFormat": "{{job_name}} ({{cluster_environment}})"}] + } + ], + "schemaVersion": 39, + "tags": ["edp", "backup", "cronjob"], + "templating": { + "list": [ + { + "current": {"selected": true, "text": "All", "value": "$__all"}, + "datasource": {"type": "prometheus"}, + "definition": "label_values(kube_cronjob_info, cluster_environment)", + "includeAll": true, + "multi": true, + "name": "cluster_environment", + "label": "Environment", + "query": "label_values(kube_cronjob_info, cluster_environment)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": {"from": "now-24h", "to": "now"}, + "title": "CronJob & Backup Monitoring", + "uid": "edp-cronjobs" + } diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/garm.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/garm.yaml new file mode 100644 index 0000000..9e01a51 --- /dev/null +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/garm.yaml @@ -0,0 +1,116 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: garm +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + json: | + { + "annotations": {"list": []}, + "editable": true, + "graphTooltip": 1, + "panels": [ + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "title": "GARM Runner Status", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}}, + "gridPos": {"h": 5, "w": 6, "x": 0, "y": 1}, + "title": "Total Runners", + "type": "stat", + "targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\"}) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}}, + "gridPos": {"h": 5, "w": 6, "x": 6, "y": 1}, + "title": "Idle Runners", + "type": "stat", + "targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\", status=\"idle\"}) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "yellow", "value": null}]}}}, + "gridPos": {"h": 5, "w": 6, "x": 12, "y": 1}, + "title": "Creating", + "type": "stat", + "targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\", status=\"creating\"}) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}]}}}, + "gridPos": {"h": 5, "w": 6, "x": 18, "y": 1}, + "title": "Errors", + "type": "stat", + "targets": [{"expr": "sum(rate(garm_runner_errors_total{cluster_environment=~\"$cluster_environment\"}[5m])) or vector(0)", "legendFormat": ""}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 6}, + "title": "GitHub API Rate Limits", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "min": 0}}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 7}, + "title": "Rate Limit Remaining", + "type": "timeseries", + "targets": [{"expr": "garm_github_rate_limit_remaining{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "ops"}}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 7}, + "title": "Runner Operations Rate", + "type": "timeseries", + "targets": [{"expr": "sum(rate(garm_runner_operations_total{cluster_environment=~\"$cluster_environment\"}[5m])) by (cluster_environment)", "legendFormat": "{{cluster_environment}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 15}, + "title": "Runner Details", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"custom": {"filterable": true}}}, + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 16}, + "title": "Runner Pool Status", + "type": "table", + "targets": [{"expr": "garm_runner_status{cluster_environment=~\"$cluster_environment\"}", "format": "table", "instant": true}], + "transformations": [ + {"id": "filterFieldsByName", "options": {"include": {"names": ["cluster_environment", "name", "status", "pool_owner", "pool_type", "provider"]}}}, + {"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "name": "Runner", "status": "Status", "pool_owner": "Pool Owner", "pool_type": "Type", "provider": "Provider"}}} + ] + } + ], + "schemaVersion": 39, + "tags": ["edp", "garm", "ci-cd", "runners"], + "templating": { + "list": [ + { + "current": {"selected": true, "text": "All", "value": "$__all"}, + "datasource": {"type": "prometheus"}, + "definition": "label_values(garm_runner_status, cluster_environment)", + "includeAll": true, + "multi": true, + "name": "cluster_environment", + "label": "Environment", + "query": "label_values(garm_runner_status, cluster_environment)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": {"from": "now-6h", "to": "now"}, + "title": "GARM Runners", + "uid": "edp-garm" + } diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml index d4102fb..ac099d0 100644 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml @@ -21,7 +21,7 @@ spec: "type": "row" }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": { "defaults": { "mappings": [{"options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}, "type": "value"}], @@ -34,7 +34,7 @@ spec: "targets": [{"expr": "sum(up{job=\"forgejo-server-http\", cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}] }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": { "defaults": { "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 3}]} @@ -46,7 +46,7 @@ spec: "targets": [{"expr": "sum(rate(nginx_ingress_controller_requests{status=~\"5..\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": ""}] }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": { "defaults": { "unit": "short", @@ -59,7 +59,7 @@ spec: "targets": [{"expr": "sum(kube_job_status_failed{namespace=\"gitea\", cluster_environment=~\"$cluster_environment\"}) or vector(0)", "legendFormat": ""}] }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": { "defaults": { "unit": "percentunit", @@ -72,7 +72,7 @@ spec: "targets": [{"expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": ""}] }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": { "defaults": { "unit": "percentunit", @@ -85,7 +85,7 @@ spec: "targets": [{"expr": "1 - sum(node_memory_MemAvailable_bytes{cluster_environment=~\"$cluster_environment\"}) / sum(node_memory_MemTotal_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}] }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": { "defaults": { "unit": "percentunit", @@ -104,7 +104,7 @@ spec: "type": "row" }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": {"defaults": {"unit": "short"}}, "gridPos": {"h": 4, "w": 4, "x": 0, "y": 6}, "title": "Repositories", @@ -112,7 +112,7 @@ spec: "targets": [{"expr": "gitea_repositories{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": {"defaults": {"unit": "short"}}, "gridPos": {"h": 4, "w": 4, "x": 4, "y": 6}, "title": "Users", @@ -120,7 +120,7 @@ spec: "targets": [{"expr": "gitea_users{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": {"defaults": {"unit": "short"}}, "gridPos": {"h": 4, "w": 4, "x": 8, "y": 6}, "title": "Organizations", @@ -128,7 +128,7 @@ spec: "targets": [{"expr": "gitea_organizations{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": {"defaults": {"unit": "short"}}, "gridPos": {"h": 4, "w": 4, "x": 12, "y": 6}, "title": "Open Issues", @@ -136,7 +136,7 @@ spec: "targets": [{"expr": "gitea_issues_open{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": {"defaults": {"unit": "short"}}, "gridPos": {"h": 4, "w": 4, "x": 16, "y": 6}, "title": "Webhooks", @@ -144,7 +144,7 @@ spec: "targets": [{"expr": "gitea_webhooks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": {"defaults": {"unit": "short"}}, "gridPos": {"h": 4, "w": 4, "x": 20, "y": 6}, "title": "Mirrors", @@ -158,7 +158,7 @@ spec: "type": "row" }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}}, "gridPos": {"h": 8, "w": 12, "x": 0, "y": 11}, "title": "Node CPU Usage", @@ -166,7 +166,7 @@ spec: "targets": [{"expr": "1 - rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m])", "legendFormat": "{{instance}}"}] }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}}, "gridPos": {"h": 8, "w": 12, "x": 12, "y": 11}, "title": "PVC Usage by Claim", @@ -180,7 +180,7 @@ spec: "type": "row" }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}}}, "gridPos": {"h": 4, "w": 8, "x": 0, "y": 20}, "title": "Time Since Last Backup Schedule", @@ -188,7 +188,7 @@ spec: "targets": [{"expr": "time() - kube_cronjob_status_last_schedule_time{cronjob=~\"forgejo-s3-backup|secrets-backup\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cronjob}} ({{cluster_environment}})"}] }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": {"defaults": {"unit": "s"}}, "gridPos": {"h": 4, "w": 8, "x": 8, "y": 20}, "title": "Backup Job Duration (Last 7d)", @@ -197,7 +197,7 @@ spec: "options": {"legend": {"displayMode": "table"}} }, { - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}}}, "gridPos": {"h": 4, "w": 8, "x": 16, "y": 20}, "title": "Failed Backup Jobs (Active)", @@ -211,7 +211,7 @@ spec: "list": [ { "current": {"selected": true, "text": "All", "value": "$__all"}, - "datasource": {"type": "prometheus", "uid": "${DS_VICTORIAMETRICS}"}, + "datasource": {"type": "prometheus"}, "definition": "label_values(up, cluster_environment)", "includeAll": true, "multi": true,