# GrafanaDashboard resource carrying the "EDP Platform Overview" dashboard
# (uid: edp-platform-overview).
#
# spec.instanceSelector selects Grafana instances by the label
# `dashboards: grafana`.
# spec.json is a literal block scalar holding the dashboard model as strict
# JSON. JSON has no comment syntax, so all notes about the dashboard live in
# this header.
#
# Layout (24-column grid):
#   - Platform Health : Forgejo up/down, ingress 5xx rate, failed jobs,
#                       cluster CPU / memory usage, max PVC usage
#   - Forgejo         : repositories, users, organizations, open issues,
#                       webhooks, mirrors (gitea_* metrics)
#   - Resources       : per-node CPU usage, per-claim PVC usage
#   - Backups         : time since last schedule, job duration, failed jobs
#
# Every query filters on cluster_environment=~"$cluster_environment"; the
# variable is multi-select with an "All" option and is populated from
# label_values(up, cluster_environment).
#
# Query notes:
#   - "Node CPU Usage" averages the idle rate per instance, because
#     node_cpu_seconds_total exposes one series per CPU core; without the
#     aggregation every core would be drawn as a separate line carrying the
#     same {{instance}} legend.
#   - "Ingress 5xx (5m)" and "Failed Jobs (24h)" end in `or vector(0)` so the
#     stat shows 0 instead of "No data" when no matching series exist.
#   - NOTE(review): "Failed Jobs (24h)" reads the current value of
#     kube_job_status_failed with no 24h range selector; confirm the "24h" in
#     the title matches how long failed Jobs are retained in the cluster.
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
  name: platform-overview
spec:
  instanceSelector:
    matchLabels:
      dashboards: "grafana"
  json: |
    {
      "annotations": {"list": []},
      "editable": true,
      "fiscalYearStartMonth": 0,
      "graphTooltip": 1,
      "links": [],
      "panels": [
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
          "title": "Platform Health",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {
            "defaults": {
              "mappings": [{"options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}, "type": "value"}],
              "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]}
            }
          },
          "gridPos": {"h": 4, "w": 4, "x": 0, "y": 1},
          "title": "Forgejo",
          "type": "stat",
          "targets": [{"expr": "sum(up{job=\"forgejo-server-http\", cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {
            "defaults": {
              "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 3}]}
            }
          },
          "gridPos": {"h": 4, "w": 4, "x": 4, "y": 1},
          "title": "Ingress 5xx (5m)",
          "type": "stat",
          "targets": [{"expr": "sum(rate(nginx_ingress_controller_requests{status=~\"5..\", cluster_environment=~\"$cluster_environment\"}[5m])) or vector(0)", "legendFormat": ""}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {
            "defaults": {
              "unit": "short",
              "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}
            }
          },
          "gridPos": {"h": 4, "w": 4, "x": 8, "y": 1},
          "title": "Failed Jobs (24h)",
          "type": "stat",
          "targets": [{"expr": "sum(kube_job_status_failed{namespace=\"gitea\", cluster_environment=~\"$cluster_environment\"}) or vector(0)", "legendFormat": ""}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {
            "defaults": {
              "unit": "percentunit",
              "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.7}, {"color": "red", "value": 0.85}]}
            }
          },
          "gridPos": {"h": 4, "w": 4, "x": 12, "y": 1},
          "title": "Cluster CPU Usage",
          "type": "stat",
          "targets": [{"expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": ""}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {
            "defaults": {
              "unit": "percentunit",
              "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.7}, {"color": "red", "value": 0.85}]}
            }
          },
          "gridPos": {"h": 4, "w": 4, "x": 16, "y": 1},
          "title": "Cluster Memory Usage",
          "type": "stat",
          "targets": [{"expr": "1 - sum(node_memory_MemAvailable_bytes{cluster_environment=~\"$cluster_environment\"}) / sum(node_memory_MemTotal_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {
            "defaults": {
              "unit": "percentunit",
              "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.6}, {"color": "red", "value": 0.8}]}
            }
          },
          "gridPos": {"h": 4, "w": 4, "x": 20, "y": 1},
          "title": "Max PVC Usage",
          "type": "stat",
          "targets": [{"expr": "max(1 - kubelet_volume_stats_available_bytes{cluster_environment=~\"$cluster_environment\"} / kubelet_volume_stats_capacity_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}]
        },
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 5},
          "title": "Forgejo",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 4, "x": 0, "y": 6},
          "title": "Repositories",
          "type": "stat",
          "targets": [{"expr": "gitea_repositories{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 4, "x": 4, "y": 6},
          "title": "Users",
          "type": "stat",
          "targets": [{"expr": "gitea_users{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 4, "x": 8, "y": 6},
          "title": "Organizations",
          "type": "stat",
          "targets": [{"expr": "gitea_organizations{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 4, "x": 12, "y": 6},
          "title": "Open Issues",
          "type": "stat",
          "targets": [{"expr": "gitea_issues_open{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 4, "x": 16, "y": 6},
          "title": "Webhooks",
          "type": "stat",
          "targets": [{"expr": "gitea_webhooks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 4, "x": 20, "y": 6},
          "title": "Mirrors",
          "type": "stat",
          "targets": [{"expr": "gitea_mirrors{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 10},
          "title": "Resources",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}},
          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 11},
          "title": "Node CPU Usage",
          "type": "timeseries",
          "targets": [{"expr": "1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": "{{instance}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}},
          "gridPos": {"h": 8, "w": 12, "x": 12, "y": 11},
          "title": "PVC Usage by Claim",
          "type": "timeseries",
          "targets": [{"expr": "1 - (kubelet_volume_stats_available_bytes{cluster_environment=~\"$cluster_environment\"} / kubelet_volume_stats_capacity_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": "{{namespace}}/{{persistentvolumeclaim}}"}]
        },
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 19},
          "title": "Backups",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}}},
          "gridPos": {"h": 4, "w": 8, "x": 0, "y": 20},
          "title": "Time Since Last Backup Schedule",
          "type": "stat",
          "targets": [{"expr": "time() - kube_cronjob_status_last_schedule_time{cronjob=~\"forgejo-s3-backup|secrets-backup\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cronjob}} ({{cluster_environment}})"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "s"}},
          "gridPos": {"h": 4, "w": 8, "x": 8, "y": 20},
          "title": "Backup Job Duration (Last 7d)",
          "type": "timeseries",
          "targets": [{"expr": "kube_job_status_completion_time{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"} - kube_job_status_start_time{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{job_name}}"}],
          "options": {"legend": {"displayMode": "table"}}
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}}},
          "gridPos": {"h": 4, "w": 8, "x": 16, "y": 20},
          "title": "Failed Backup Jobs (Active)",
          "type": "stat",
          "targets": [{"expr": "sum by(cluster_environment, job_name) (kube_job_status_failed{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"})", "legendFormat": "{{job_name}} ({{cluster_environment}})"}]
        }
      ],
      "schemaVersion": 39,
      "tags": ["edp", "platform", "overview"],
      "templating": {
        "list": [
          {
            "current": {"selected": true, "text": "All", "value": "$__all"},
            "datasource": {"type": "prometheus"},
            "definition": "label_values(up, cluster_environment)",
            "includeAll": true,
            "multi": true,
            "name": "cluster_environment",
            "label": "Environment",
            "query": "label_values(up, cluster_environment)",
            "refresh": 2,
            "sort": 1,
            "type": "query"
          }
        ]
      },
      "time": {"from": "now-6h", "to": "now"},
      "title": "EDP Platform Overview",
      "uid": "edp-platform-overview"
    }