# stacks-instances/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml

# GrafanaDashboard "cronjob-monitoring": CronJob & Backup Monitoring (uid: edp-cronjobs).
#
# Custom resource of the grafana.integreatly.org/v1beta1 API. The dashboard itself is
# carried verbatim as JSON in `spec.json`; JSON has no comment syntax, so the overview
# lives here.
---
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
  name: cronjob-monitoring
spec:
  # Selects the Grafana instance(s) by label; the value is quoted so it stays a string.
  instanceSelector:
    matchLabels:
      dashboards: "grafana"
  # Dashboard layout (24-column grid, see each panel's gridPos):
  #   Row "Backup Job Status"
  #     - "Time Since Last Schedule" (stat, seconds): time() minus
  #       kube_cronjob_status_last_schedule_time; yellow from 86400 s (1 day),
  #       red from 172800 s (2 days).
  #     - "Failed Jobs (Active)" (stat): jobs whose summed kube_job_status_failed is > 0;
  #       red from 1.
  #   Row "CronJob Overview"
  #     - "All CronJobs" (table): query A (kube_cronjob_info) supplies the label columns,
  #       query B (kube_cronjob_spec_suspend, aggregated to the environment/namespace/
  #       cronjob key) supplies the suspend flag. The "merge" transformation joins both
  #       frames on their shared columns, and "Value #B" is renamed to "Suspended" so the
  #       byName override (0 -> "No", 1 -> "YES") has a column to apply to.
  #   Row "Job History"
  #     - "Job Completions (24h)" / "Job Failures (24h)" (timeseries): summed
  #       kube_job_status_succeeded / kube_job_status_failed per job. The queries carry
  #       no range of their own; "24h" matches the default time range (now-24h .. now).
  #
  # Every query is filtered by the multi-select variable $cluster_environment
  # (label "Environment"), populated from
  # label_values(kube_cronjob_info, cluster_environment) with "All" selected by default.
  json: |
    {
      "annotations": {"list": []},
      "editable": true,
      "graphTooltip": 1,
      "panels": [
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
          "title": "Backup Job Status",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}}},
          "gridPos": {"h": 5, "w": 12, "x": 0, "y": 1},
          "title": "Time Since Last Schedule",
          "type": "stat",
          "targets": [{"expr": "time() - kube_cronjob_status_last_schedule_time{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cronjob}} ({{cluster_environment}})"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}}},
          "gridPos": {"h": 5, "w": 12, "x": 12, "y": 1},
          "title": "Failed Jobs (Active)",
          "type": "stat",
          "targets": [{"expr": "sum by(cluster_environment, job_name) (kube_job_status_failed{cluster_environment=~\"$cluster_environment\"}) > 0", "legendFormat": "{{job_name}} ({{cluster_environment}})"}]
        },
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 6},
          "title": "CronJob Overview",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"custom": {"filterable": true}}, "overrides": [{"matcher": {"id": "byName", "options": "Suspended"}, "properties": [{"id": "mappings", "value": [{"options": {"0": {"text": "No", "color": "green"}, "1": {"text": "YES", "color": "red"}}, "type": "value"}]}]}]},
          "gridPos": {"h": 8, "w": 24, "x": 0, "y": 7},
          "title": "All CronJobs",
          "type": "table",
          "targets": [
            {"expr": "kube_cronjob_info{cluster_environment=~\"$cluster_environment\"}", "format": "table", "instant": true, "refId": "A"},
            {"expr": "max by (cluster_environment, namespace, cronjob) (kube_cronjob_spec_suspend{cluster_environment=~\"$cluster_environment\"})", "format": "table", "instant": true, "refId": "B"}
          ],
          "transformations": [
            {"id": "merge", "options": {}},
            {"id": "filterFieldsByName", "options": {"include": {"names": ["cluster_environment", "cronjob", "namespace", "schedule", "Value #B"]}}},
            {"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "cronjob": "CronJob", "namespace": "Namespace", "schedule": "Schedule", "Value #B": "Suspended"}}}
          ]
        },
        {
          "collapsed": false,
          "gridPos": {"h": 1, "w": 24, "x": 0, "y": 15},
          "title": "Job History",
          "type": "row"
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short"}},
          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
          "title": "Job Completions (24h)",
          "type": "timeseries",
          "targets": [{"expr": "sum(kube_job_status_succeeded{cluster_environment=~\"$cluster_environment\"}) by (job_name, cluster_environment)", "legendFormat": "{{job_name}} ({{cluster_environment}})"}]
        },
        {
          "datasource": {"type": "prometheus"},
          "fieldConfig": {"defaults": {"unit": "short", "color": {"mode": "palette-classic"}}},
          "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
          "title": "Job Failures (24h)",
          "type": "timeseries",
          "targets": [{"expr": "sum(kube_job_status_failed{cluster_environment=~\"$cluster_environment\"}) by (job_name, cluster_environment)", "legendFormat": "{{job_name}} ({{cluster_environment}})"}]
        }
      ],
      "schemaVersion": 39,
      "tags": ["edp", "backup", "cronjob"],
      "templating": {
        "list": [
          {
            "current": {"selected": true, "text": "All", "value": "$__all"},
            "datasource": {"type": "prometheus"},
            "definition": "label_values(kube_cronjob_info, cluster_environment)",
            "includeAll": true,
            "multi": true,
            "name": "cluster_environment",
            "label": "Environment",
            "query": "label_values(kube_cronjob_info, cluster_environment)",
            "refresh": 2,
            "sort": 1,
            "type": "query"
          }
        ]
      },
      "time": {"from": "now-24h", "to": "now"},
      "title": "CronJob & Backup Monitoring",
      "uid": "edp-cronjobs"
    }