stacks-instances/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml

244 lines
11 KiB
YAML

# GrafanaDashboard "EDP Platform Overview" (uid: edp-platform-overview).
#
# Rows, top to bottom: Platform Health, Forgejo, Resources, Backups, Logs.
# Every query is scoped by the multi-value `$cluster_environment` template
# variable, which is filled from `label_values(up, cluster_environment)`.
#
# Query conventions used below:
#   * Forgejo up/down uses min(up) so the 0 -> DOWN / 1 -> UP value mapping
#     still applies when more than one target or environment is selected
#     (a sum would report 2+ or hide a single target that is down).
#   * Single-value stat queries that can legitimately have no series fall
#     back to `or vector(0)` so the panel shows 0 instead of "No data".
#   * Node CPU is averaged per (cluster_environment, instance); the raw
#     metric has one series per CPU core, the legend is per instance.
#   * In the logs query the OR-ed words are parenthesised: LogsQL gives AND
#     a higher priority than OR, so without parentheses the stream filter
#     would only apply to the first word.
#
# NOTE(review): "Failed Jobs (24h)" evaluates an instant query; nothing in
# the expression enforces the 24h window named in the title - confirm intent.
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
  name: platform-overview
spec:
  # Only Grafana instances carrying the label dashboards=grafana are matched.
  instanceSelector:
    matchLabels:
      dashboards: "grafana"
  # Dashboard model as a literal block scalar. The content must remain strict
  # JSON, so no comments can be placed inside it.
  json: |
    {
      "annotations": {"list": []},
      "editable": true,
      "fiscalYearStartMonth": 0,
      "graphTooltip": 1,
      "links": [],
      "panels": [
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
          "title": "Platform Health",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {
            "defaults": {
              "mappings": [{"options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}, "type": "value"}],
              "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]}
            }
          },
          "gridPos": {"h": 4, "w": 4, "x": 0, "y": 1},
          "title": "Forgejo",
          "type": "stat",
          "targets": [{"expr": "min(up{job=\"forgejo-server-http\", cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {
            "defaults": {
              "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 3}]}
            }
          },
          "gridPos": {"h": 4, "w": 4, "x": 4, "y": 1},
          "title": "Ingress 5xx (5m)",
          "type": "stat",
          "targets": [{"expr": "sum(rate(nginx_ingress_controller_requests{status=~\"5..\", cluster_environment=~\"$cluster_environment\"}[5m])) or vector(0)", "legendFormat": ""}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {
            "defaults": {
              "unit": "short",
              "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}
            }
          },
          "gridPos": {"h": 4, "w": 4, "x": 8, "y": 1},
          "title": "Failed Jobs (24h)",
          "type": "stat",
          "targets": [{"expr": "sum(kube_job_status_failed{namespace=\"gitea\", cluster_environment=~\"$cluster_environment\"}) or vector(0)", "legendFormat": ""}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {
            "defaults": {
              "unit": "percentunit",
              "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.7}, {"color": "red", "value": 0.85}]}
            }
          },
          "gridPos": {"h": 4, "w": 4, "x": 12, "y": 1},
          "title": "Cluster CPU Usage",
          "type": "stat",
          "targets": [{"expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": ""}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {
            "defaults": {
              "unit": "percentunit",
              "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.7}, {"color": "red", "value": 0.85}]}
            }
          },
          "gridPos": {"h": 4, "w": 4, "x": 16, "y": 1},
          "title": "Cluster Memory Usage",
          "type": "stat",
          "targets": [{"expr": "1 - sum(node_memory_MemAvailable_bytes{cluster_environment=~\"$cluster_environment\"}) / sum(node_memory_MemTotal_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {
            "defaults": {
              "unit": "percentunit",
              "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.6}, {"color": "red", "value": 0.8}]}
            }
          },
          "gridPos": {"h": 4, "w": 4, "x": 20, "y": 1},
          "title": "Max PVC Usage",
          "type": "stat",
          "targets": [{"expr": "max(1 - kubelet_volume_stats_available_bytes{cluster_environment=~\"$cluster_environment\"} / kubelet_volume_stats_capacity_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}]
        },
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 5},
          "title": "Forgejo",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 4, "x": 0, "y": 6},
          "title": "Repositories",
          "type": "stat",
          "targets": [{"expr": "gitea_repositories{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 4, "x": 4, "y": 6},
          "title": "Users",
          "type": "stat",
          "targets": [{"expr": "gitea_users{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 4, "x": 8, "y": 6},
          "title": "Organizations",
          "type": "stat",
          "targets": [{"expr": "gitea_organizations{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 4, "x": 12, "y": 6},
          "title": "Open Issues",
          "type": "stat",
          "targets": [{"expr": "gitea_issues_open{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 4, "x": 16, "y": 6},
          "title": "Webhooks",
          "type": "stat",
          "targets": [{"expr": "gitea_webhooks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 4, "w": 4, "x": 20, "y": 6},
          "title": "Mirrors",
          "type": "stat",
          "targets": [{"expr": "gitea_mirrors{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
        },
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 10},
          "title": "Resources",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}},
          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 11},
          "title": "Node CPU Usage",
          "type": "timeseries",
          "targets": [{"expr": "1 - avg by (cluster_environment, instance) (rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": "{{instance}}"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}},
          "gridPos": {"h": 8, "w": 12, "x": 12, "y": 11},
          "title": "PVC Usage by Claim",
          "type": "timeseries",
          "targets": [{"expr": "1 - (kubelet_volume_stats_available_bytes{cluster_environment=~\"$cluster_environment\"} / kubelet_volume_stats_capacity_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": "{{namespace}}/{{persistentvolumeclaim}}"}]
        },
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 19},
          "title": "Backups",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}}},
          "gridPos": {"h": 4, "w": 8, "x": 0, "y": 20},
          "title": "Time Since Last Backup Schedule",
          "type": "stat",
          "targets": [{"expr": "time() - kube_cronjob_status_last_schedule_time{cronjob=~\"forgejo-s3-backup|secrets-backup\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cronjob}} ({{cluster_environment}})"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "s"}},
          "gridPos": {"h": 4, "w": 8, "x": 8, "y": 20},
          "title": "Backup Job Duration (Last 7d)",
          "type": "timeseries",
          "targets": [{"expr": "kube_job_status_completion_time{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"} - kube_job_status_start_time{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{job_name}}"}],
          "options": {"legend": {"displayMode": "table"}}
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}}},
          "gridPos": {"h": 4, "w": 8, "x": 16, "y": 20},
          "title": "Failed Backup Jobs (Active)",
          "type": "stat",
          "targets": [{"expr": "sum by(cluster_environment, job_name) (kube_job_status_failed{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"})", "legendFormat": "{{job_name}} ({{cluster_environment}})"}]
        },
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 24},
          "title": "Logs",
          "type": "row"
        },
        {
          "datasource": {"type": "victoriametrics-logs-datasource"},
          "gridPos": {"h": 10, "w": 24, "x": 0, "y": 25},
          "title": "Recent Errors (all namespaces)",
          "type": "logs",
          "targets": [{"expr": "{cluster_environment=~\"$cluster_environment\"} (error OR Error OR ERROR OR panic OR PANIC)", "refId": "A"}],
          "options": {"showTime": true, "showLabels": true, "showCommonLabels": false, "wrapLogMessage": true, "prettifyLogMessage": false, "enableLogDetails": true, "sortOrder": "Descending", "dedupStrategy": "none"}
        }
      ],
      "schemaVersion": 39,
      "tags": ["edp", "platform", "overview"],
      "templating": {
        "list": [
          {
            "current": {"selected": true, "text": "All", "value": "$__all"},
            "datasource": {"type": "prometheus"},
            "definition": "label_values(up, cluster_environment)",
            "includeAll": true,
            "multi": true,
            "name": "cluster_environment",
            "label": "Environment",
            "query": "label_values(up, cluster_environment)",
            "refresh": 2,
            "sort": 1,
            "type": "query"
          }
        ]
      },
      "time": {"from": "now-6h", "to": "now"},
      "title": "EDP Platform Overview",
      "uid": "edp-platform-overview"
    }