Compare commits

..

No commits in common. "main" and "benchmark-cluster-2026-06-19-canonical" have entirely different histories.

42 changed files with 66 additions and 1238 deletions

View file

@ -35,30 +35,6 @@ configs:
tls: tls:
certificates: certificates:
controller:
metrics:
enabled: true
serviceMonitor:
enabled: false
server:
metrics:
enabled: true
serviceMonitor:
enabled: false
repoServer:
metrics:
enabled: true
serviceMonitor:
enabled: false
applicationSet:
metrics:
enabled: true
serviceMonitor:
enabled: false
notifications: notifications:
enabled: false enabled: false

View file

@ -11,8 +11,8 @@ spec:
startingDeadlineSeconds: 600 # 10 minutes startingDeadlineSeconds: 600 # 10 minutes
jobTemplate: jobTemplate:
spec: spec:
# 2h window: handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m # 60 min until backup - 10 min start - (backoffLimit * activeDeadlineSeconds) - some time sync buffer
activeDeadlineSeconds: 14400 activeDeadlineSeconds: 1350
backoffLimit: 2 backoffLimit: 2
ttlSecondsAfterFinished: 259200 # ttlSecondsAfterFinished: 259200 #
template: template:

View file

@ -48,7 +48,7 @@ customConfig:
type: elasticsearch type: elasticsearch
inputs: [parser] inputs: [parser]
endpoints: endpoints:
- https://o12y.observability.buildth.ing/insert/elasticsearch/ - https://o12y.observability./insert/elasticsearch/
auth: auth:
strategy: basic strategy: basic
user: ${VECTOR_USER} user: ${VECTOR_USER}

View file

@ -1,15 +0,0 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: forgejo
namespace: observability
spec:
namespaceSelector:
matchNames:
- gitea
selector:
matchLabels:
app.kubernetes.io/name: forgejo
endpoints:
- port: http
path: /metrics

View file

@ -201,13 +201,13 @@ defaultRules:
create: true create: true
rules: {} rules: {}
kubernetesSystemControllerManager: kubernetesSystemControllerManager:
create: false create: true
rules: {} rules: {}
kubeScheduler: kubeScheduler:
create: false create: true
rules: {} rules: {}
kubernetesSystemScheduler: kubernetesSystemScheduler:
create: false create: true
rules: {} rules: {}
kubeStateMetrics: kubeStateMetrics:
create: true create: true
@ -778,7 +778,7 @@ vmagent:
# -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec) # -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec)
additionalRemoteWrites: additionalRemoteWrites:
# [] # []
- url: https://o12y.observability.buildth.ing/api/v1/write - url: https://o12y.observability./api/v1/write
basicAuth: basicAuth:
username: username:
name: simple-user-secret name: simple-user-secret

View file

@ -35,30 +35,6 @@ configs:
tls: tls:
certificates: certificates:
controller:
metrics:
enabled: true
serviceMonitor:
enabled: false
server:
metrics:
enabled: true
serviceMonitor:
enabled: false
repoServer:
metrics:
enabled: true
serviceMonitor:
enabled: false
applicationSet:
metrics:
enabled: true
serviceMonitor:
enabled: false
notifications: notifications:
enabled: false enabled: false

View file

@ -11,8 +11,8 @@ spec:
startingDeadlineSeconds: 600 # 10 minutes startingDeadlineSeconds: 600 # 10 minutes
jobTemplate: jobTemplate:
spec: spec:
# 2h window: handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m # 60 min until backup - 10 min start - (backoffLimit * activeDeadlineSeconds) - some time sync buffer
activeDeadlineSeconds: 14400 activeDeadlineSeconds: 1350
backoffLimit: 2 backoffLimit: 2
ttlSecondsAfterFinished: 259200 # ttlSecondsAfterFinished: 259200 #
template: template:

View file

@ -41,8 +41,5 @@ providerConfig:
sidecarImage: edp.buildth.ing/devfw-cicd/ci-sizer-collector:0.0.4 sidecarImage: edp.buildth.ing/devfw-cicd/ci-sizer-collector:0.0.4
garm: garm:
metrics:
enable: true
disableAuth: true
logging: logging:
logLevel: info logLevel: info

View file

@ -48,7 +48,7 @@ customConfig:
type: elasticsearch type: elasticsearch
inputs: [parser] inputs: [parser]
endpoints: endpoints:
- https://o12y.observability.buildth.ing/insert/elasticsearch/ - https://o12y.observability./insert/elasticsearch/
auth: auth:
strategy: basic strategy: basic
user: ${VECTOR_USER} user: ${VECTOR_USER}

View file

@ -1,14 +0,0 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: argocd
namespace: observability
spec:
namespaceSelector:
matchNames:
- argocd
selector:
matchLabels:
app.kubernetes.io/part-of: argocd
endpoints:
- port: http-metrics

View file

@ -1,15 +0,0 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: forgejo
namespace: observability
spec:
namespaceSelector:
matchNames:
- gitea
selector:
matchLabels:
app.kubernetes.io/name: forgejo
endpoints:
- port: http
path: /metrics

View file

@ -1,15 +0,0 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: garm
namespace: observability
spec:
namespaceSelector:
matchNames:
- garm
selector:
matchLabels:
app.kubernetes.io/name: garm
endpoints:
- port: http
path: /metrics

View file

@ -201,13 +201,13 @@ defaultRules:
create: true create: true
rules: {} rules: {}
kubernetesSystemControllerManager: kubernetesSystemControllerManager:
create: false create: true
rules: {} rules: {}
kubeScheduler: kubeScheduler:
create: false create: true
rules: {} rules: {}
kubernetesSystemScheduler: kubernetesSystemScheduler:
create: false create: true
rules: {} rules: {}
kubeStateMetrics: kubeStateMetrics:
create: true create: true
@ -778,7 +778,7 @@ vmagent:
# -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec) # -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec)
additionalRemoteWrites: additionalRemoteWrites:
# [] # []
- url: https://o12y.observability.buildth.ing/api/v1/write - url: https://o12y.observability./api/v1/write
basicAuth: basicAuth:
username: username:
name: simple-user-secret name: simple-user-secret
@ -801,20 +801,6 @@ vmagent:
# Do not store original labels in vmagent's memory by default. This reduces the amount of memory used by vmagent # Do not store original labels in vmagent's memory by default. This reduces the amount of memory used by vmagent
# but makes vmagent debugging UI less informative. See: https://docs.victoriametrics.com/vmagent/#relabel-debug # but makes vmagent debugging UI less informative. See: https://docs.victoriametrics.com/vmagent/#relabel-debug
promscrape.dropOriginalLabels: "true" promscrape.dropOriginalLabels: "true"
# Harden liveness probe: default failureThreshold=10 masked a 72h silent outage
livenessProbe:
httpGet:
path: /health
port: http
failureThreshold: 3
periodSeconds: 5
timeoutSeconds: 5
startupProbe:
httpGet:
path: /health
port: http
failureThreshold: 30
periodSeconds: 5
# -- (object) VMAgent ingress configuration # -- (object) VMAgent ingress configuration
ingress: ingress:
enabled: false enabled: false

View file

@ -35,10 +35,8 @@ spec:
server: server:
root_url: "https://grafana.dev.t09.de" root_url: "https://grafana.dev.t09.de"
auth: auth:
disable_login: "true"
disable_login_form: "true" disable_login_form: "true"
security:
admin_user: admin
admin_password: admin
auth.generic_oauth: auth.generic_oauth:
enabled: "true" enabled: "true"
name: Forgejo name: Forgejo

View file

@ -9,13 +9,10 @@ spec:
project: default project: default
syncPolicy: syncPolicy:
automated: automated:
prune: true
selfHeal: true selfHeal: true
syncOptions: syncOptions:
- CreateNamespace=true - CreateNamespace=true
- ServerSideApply=true - ServerSideApply=true
- RespectIgnoreDifferences=true
- SkipDryRunOnMissingResource=true
destination: destination:
name: in-cluster name: in-cluster
namespace: observability namespace: observability

View file

@ -11,4 +11,4 @@ spec:
matchLabels: matchLabels:
app.kubernetes.io/name: garm app.kubernetes.io/name: garm
endpoints: endpoints:
- port: http - port: metrics

View file

@ -1,5 +1,5 @@
apiVersion: operator.victoriametrics.com/v1beta1 apiVersion: operator.victoriametrics.com/v1beta1
kind: VLogs kind: VLSingle
metadata: metadata:
name: victorialogs name: victorialogs
namespace: observability namespace: observability

View file

@ -12,12 +12,6 @@ spec:
- static: - static:
url: http://vmsingle-o12y:8429 url: http://vmsingle-o12y:8429
paths: ["/api/v1/write"] paths: ["/api/v1/write"]
- static:
url: http://vmsingle-o12y:8429
paths: ["/api/v1/.*"]
- static: - static:
url: http://vlogs-victorialogs:9428 url: http://vlogs-victorialogs:9428
paths: ["/insert/elasticsearch/.*"] paths: ["/insert/elasticsearch/.*"]
- static:
url: http://vlogs-victorialogs:9428
paths: ["/select/.*"]

View file

@ -28,7 +28,10 @@ victoria-metrics-operator:
crds: crds:
plain: true plain: true
cleanup: cleanup:
enabled: false # disabled: cleanup hook can't schedule on resource-constrained nodes (Insufficient cpu / Too many pods) enabled: true
image:
repository: bitnami/kubectl
pullPolicy: IfNotPresent
serviceMonitor: serviceMonitor:
enabled: true enabled: true
operator: operator:
@ -673,7 +676,7 @@ vmalert:
vmauth: vmauth:
# -- Enable VMAuth CR # -- Enable VMAuth CR
enabled: false enabled: true
# -- VMAuth annotations # -- VMAuth annotations
annotations: {} annotations: {}
# -- (object) Full spec for VMAuth CRD. Allowed values described [here](https://docs.victoriametrics.com/operator/api#vmauthspec) # -- (object) Full spec for VMAuth CRD. Allowed values described [here](https://docs.victoriametrics.com/operator/api#vmauthspec)
@ -696,7 +699,7 @@ vmauth:
vmagent: vmagent:
# -- Create VMAgent CR # -- Create VMAgent CR
enabled: true enabled: false
# -- VMAgent annotations # -- VMAgent annotations
annotations: {} annotations: {}
# -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec) # -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec)
@ -708,8 +711,7 @@ vmagent:
port: "8429" port: "8429"
selectAllByDefault: true selectAllByDefault: true
scrapeInterval: 20s scrapeInterval: 20s
externalLabels: externalLabels: {}
cluster_environment: "dev"
# For multi-cluster setups it is useful to use "cluster" label to identify the metrics source. # For multi-cluster setups it is useful to use "cluster" label to identify the metrics source.
# For example: # For example:
# cluster: cluster-name # cluster: cluster-name

View file

@ -35,30 +35,6 @@ configs:
tls: tls:
certificates: certificates:
controller:
metrics:
enabled: true
serviceMonitor:
enabled: false
server:
metrics:
enabled: true
serviceMonitor:
enabled: false
repoServer:
metrics:
enabled: true
serviceMonitor:
enabled: false
applicationSet:
metrics:
enabled: true
serviceMonitor:
enabled: false
notifications: notifications:
enabled: false enabled: false

View file

@ -11,8 +11,8 @@ spec:
startingDeadlineSeconds: 600 # 10 minutes startingDeadlineSeconds: 600 # 10 minutes
jobTemplate: jobTemplate:
spec: spec:
# 2h window: handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m # 60 min until backup - 10 min start - (backoffLimit * activeDeadlineSeconds) - some time sync buffer
activeDeadlineSeconds: 14400 activeDeadlineSeconds: 1350
backoffLimit: 2 backoffLimit: 2
ttlSecondsAfterFinished: 259200 # ttlSecondsAfterFinished: 259200 #
template: template:

View file

@ -48,7 +48,7 @@ customConfig:
type: elasticsearch type: elasticsearch
inputs: [parser] inputs: [parser]
endpoints: endpoints:
- https://o12y.observability.buildth.ing/insert/elasticsearch/ - https://o12y.observability./insert/elasticsearch/
auth: auth:
strategy: basic strategy: basic
user: ${VECTOR_USER} user: ${VECTOR_USER}

View file

@ -1,15 +0,0 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: forgejo
namespace: observability
spec:
namespaceSelector:
matchNames:
- gitea
selector:
matchLabels:
app.kubernetes.io/name: forgejo
endpoints:
- port: http
path: /metrics

View file

@ -201,13 +201,13 @@ defaultRules:
create: true create: true
rules: {} rules: {}
kubernetesSystemControllerManager: kubernetesSystemControllerManager:
create: false create: true
rules: {} rules: {}
kubeScheduler: kubeScheduler:
create: false create: true
rules: {} rules: {}
kubernetesSystemScheduler: kubernetesSystemScheduler:
create: false create: true
rules: {} rules: {}
kubeStateMetrics: kubeStateMetrics:
create: true create: true
@ -778,7 +778,7 @@ vmagent:
# -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec) # -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec)
additionalRemoteWrites: additionalRemoteWrites:
# [] # []
- url: https://o12y.observability.buildth.ing/api/v1/write - url: https://o12y.observability./api/v1/write
basicAuth: basicAuth:
username: username:
name: simple-user-secret name: simple-user-secret

View file

@ -11,8 +11,8 @@ spec:
startingDeadlineSeconds: 600 # 10 minutes startingDeadlineSeconds: 600 # 10 minutes
jobTemplate: jobTemplate:
spec: spec:
# 2h window: handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m # 60 min until backup - 10 min start - (backoffLimit * activeDeadlineSeconds) - some time sync buffer
activeDeadlineSeconds: 14400 activeDeadlineSeconds: 1350
backoffLimit: 2 backoffLimit: 2
ttlSecondsAfterFinished: 259200 # ttlSecondsAfterFinished: 259200 #
template: template:

View file

@ -1,153 +0,0 @@
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
name: argocd-operational
spec:
instanceSelector:
matchLabels:
dashboards: "grafana"
folder: "EDP / Applications"
json: |
{
"annotations": {"list": []},
"editable": true,
"graphTooltip": 1,
"panels": [
{
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
"title": "Application Status",
"type": "row"
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}},
"gridPos": {"h": 4, "w": 4, "x": 0, "y": 1},
"title": "Total Apps",
"type": "stat",
"targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}},
"gridPos": {"h": 4, "w": 4, "x": 4, "y": 1},
"title": "Healthy",
"type": "stat",
"targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", health_status=\"Healthy\"})", "legendFormat": ""}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}]}}},
"gridPos": {"h": 4, "w": 4, "x": 8, "y": 1},
"title": "Degraded",
"type": "stat",
"targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", health_status=\"Degraded\"}) or vector(0)", "legendFormat": ""}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}},
"gridPos": {"h": 4, "w": 4, "x": 12, "y": 1},
"title": "Synced",
"type": "stat",
"targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", sync_status=\"Synced\"})", "legendFormat": ""}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "yellow", "value": null}]}}},
"gridPos": {"h": 4, "w": 4, "x": 16, "y": 1},
"title": "OutOfSync",
"type": "stat",
"targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", sync_status=\"OutOfSync\"}) or vector(0)", "legendFormat": ""}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "orange", "value": null}]}}},
"gridPos": {"h": 4, "w": 4, "x": 20, "y": 1},
"title": "Progressing",
"type": "stat",
"targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", health_status=\"Progressing\"}) or vector(0)", "legendFormat": ""}]
},
{
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 5},
"title": "Application Details",
"type": "row"
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {
"defaults": {"custom": {"filterable": true}},
"overrides": [
{"matcher": {"id": "byName", "options": "Health"}, "properties": [{"id": "custom.cellOptions", "value": {"type": "color-text"}}, {"id": "mappings", "value": [{"options": {"Healthy": {"color": "green", "text": "Healthy"}, "Degraded": {"color": "red", "text": "Degraded"}, "Progressing": {"color": "yellow", "text": "Progressing"}, "Missing": {"color": "purple", "text": "Missing"}}, "type": "value"}]}]},
{"matcher": {"id": "byName", "options": "Sync"}, "properties": [{"id": "custom.cellOptions", "value": {"type": "color-text"}}, {"id": "mappings", "value": [{"options": {"Synced": {"color": "green", "text": "Synced"}, "OutOfSync": {"color": "orange", "text": "OutOfSync"}}, "type": "value"}]}]}
]
},
"gridPos": {"h": 12, "w": 24, "x": 0, "y": 6},
"title": "All Applications",
"type": "table",
"targets": [{"expr": "argocd_app_info{cluster_environment=~\"$cluster_environment\"}", "format": "table", "instant": true, "legendFormat": ""}],
"transformations": [
{"id": "filterFieldsByName", "options": {"include": {"names": ["cluster_environment", "name", "dest_namespace", "health_status", "sync_status", "repo"]}}},
{"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "name": "Application", "dest_namespace": "Namespace", "health_status": "Health", "sync_status": "Sync", "repo": "Repository"}}}
]
},
{
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 18},
"title": "Sync Activity",
"type": "row"
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "ops"}},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 19},
"title": "Sync Operations (rate)",
"type": "timeseries",
"targets": [{"expr": "sum(rate(argocd_app_sync_total{cluster_environment=~\"$cluster_environment\"}[5m])) by (name, phase)", "legendFormat": "{{name}} ({{phase}})"}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "ops"}},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 19},
"title": "Reconciliation Rate",
"type": "timeseries",
"targets": [{"expr": "sum(rate(argocd_app_reconcile_count{cluster_environment=~\"$cluster_environment\"}[5m])) by (namespace)", "legendFormat": "{{namespace}}"}]
},
{
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 27},
"title": "ArgoCD Logs",
"type": "row"
},
{
"datasource": {"type": "victoriametrics-logs-datasource"},
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 28},
"title": "ArgoCD Logs",
"type": "logs",
"targets": [{"expr": "{cluster_environment=~\"$cluster_environment\", kubernetes.namespace=\"argocd\"}", "refId": "A"}],
"options": {"showTime": true, "showLabels": true, "wrapLogMessage": true, "enableLogDetails": true, "sortOrder": "Descending"}
}
],
"schemaVersion": 39,
"tags": ["edp", "argocd", "gitops"],
"templating": {
"list": [
{
"current": {"selected": true, "text": "All", "value": "$__all"},
"datasource": {"type": "prometheus"},
"definition": "label_values(argocd_app_info, cluster_environment)",
"includeAll": true,
"multi": true,
"name": "cluster_environment",
"label": "Environment",
"query": "label_values(argocd_app_info, cluster_environment)",
"refresh": 2,
"sort": 1,
"type": "query"
}
]
},
"time": {"from": "now-6h", "to": "now"},
"title": "ArgoCD Operations",
"uid": "edp-argocd-ops"
}

View file

@ -6,5 +6,4 @@ spec:
instanceSelector: instanceSelector:
matchLabels: matchLabels:
dashboards: "grafana" dashboards: "grafana"
folder: "EDP / Applications"
url: "https://raw.githubusercontent.com/argoproj/argo-cd/refs/heads/master/examples/dashboard.json" url: "https://raw.githubusercontent.com/argoproj/argo-cd/refs/heads/master/examples/dashboard.json"

View file

@ -1,103 +0,0 @@
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
name: cronjob-monitoring
spec:
instanceSelector:
matchLabels:
dashboards: "grafana"
folder: "EDP / Operations"
json: |
{
"annotations": {"list": []},
"editable": true,
"graphTooltip": 1,
"panels": [
{
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
"title": "Backup Job Status",
"type": "row"
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}}},
"gridPos": {"h": 5, "w": 12, "x": 0, "y": 1},
"title": "Time Since Last Schedule",
"type": "stat",
"targets": [{"expr": "time() - kube_cronjob_status_last_schedule_time{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cronjob}} ({{cluster_environment}})"}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}}},
"gridPos": {"h": 5, "w": 12, "x": 12, "y": 1},
"title": "Failed Jobs (Active)",
"type": "stat",
"targets": [{"expr": "sum by(cluster_environment, job_name) (kube_job_status_failed{cluster_environment=~\"$cluster_environment\"}) > 0", "legendFormat": "{{job_name}} ({{cluster_environment}})"}]
},
{
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 6},
"title": "CronJob Overview",
"type": "row"
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"custom": {"filterable": true}}, "overrides": [{"matcher": {"id": "byName", "options": "Suspended"}, "properties": [{"id": "mappings", "value": [{"options": {"0": {"text": "No", "color": "green"}, "1": {"text": "YES", "color": "red"}}, "type": "value"}]}]}]},
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 7},
"title": "All CronJobs",
"type": "table",
"targets": [
{"expr": "kube_cronjob_info{cluster_environment=~\"$cluster_environment\"}", "format": "table", "instant": true, "refId": "A"}
],
"transformations": [
{"id": "filterFieldsByName", "options": {"include": {"names": ["cluster_environment", "cronjob", "namespace", "schedule"]}}},
{"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "cronjob": "CronJob", "namespace": "Namespace", "schedule": "Schedule"}}}
]
},
{
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 15},
"title": "Job History",
"type": "row"
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short"}},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
"title": "Job Completions (24h)",
"type": "timeseries",
"targets": [{"expr": "sum(kube_job_status_succeeded{cluster_environment=~\"$cluster_environment\"}) by (job_name, cluster_environment)", "legendFormat": "{{job_name}} ({{cluster_environment}})"}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short", "color": {"mode": "palette-classic"}}},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
"title": "Job Failures (24h)",
"type": "timeseries",
"targets": [{"expr": "sum(kube_job_status_failed{cluster_environment=~\"$cluster_environment\"}) by (job_name, cluster_environment)", "legendFormat": "{{job_name}} ({{cluster_environment}})"}]
}
],
"schemaVersion": 39,
"tags": ["edp", "backup", "cronjob"],
"templating": {
"list": [
{
"current": {"selected": true, "text": "All", "value": "$__all"},
"datasource": {"type": "prometheus"},
"definition": "label_values(kube_cronjob_info, cluster_environment)",
"includeAll": true,
"multi": true,
"name": "cluster_environment",
"label": "Environment",
"query": "label_values(kube_cronjob_info, cluster_environment)",
"refresh": 2,
"sort": 1,
"type": "query"
}
]
},
"time": {"from": "now-24h", "to": "now"},
"title": "CronJob & Backup Monitoring",
"uid": "edp-cronjobs"
}

View file

@ -1,207 +0,0 @@
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
name: forgejo
spec:
instanceSelector:
matchLabels:
dashboards: "grafana"
folder: "EDP / Applications"
json: |
{
"annotations": {"list": []},
"editable": true,
"graphTooltip": 1,
"panels": [
{
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
"title": "Forgejo Health",
"type": "row"
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"mappings": [{"options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}, "type": "value"}], "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]}}},
"gridPos": {"h": 4, "w": 4, "x": 0, "y": 1},
"title": "Status",
"type": "stat",
"targets": [{"expr": "up{job=\"forgejo-server-http\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short"}},
"gridPos": {"h": 4, "w": 4, "x": 4, "y": 1},
"title": "Version",
"type": "stat",
"targets": [{"expr": "gitea_build_info{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{version}}"}],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "name"}
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short"}},
"gridPos": {"h": 4, "w": 4, "x": 8, "y": 1},
"title": "Repositories",
"type": "stat",
"targets": [{"expr": "gitea_repositories{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short"}},
"gridPos": {"h": 4, "w": 4, "x": 12, "y": 1},
"title": "Users",
"type": "stat",
"targets": [{"expr": "gitea_users{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short"}},
"gridPos": {"h": 4, "w": 4, "x": 16, "y": 1},
"title": "Organizations",
"type": "stat",
"targets": [{"expr": "gitea_organizations{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short"}},
"gridPos": {"h": 4, "w": 4, "x": 20, "y": 1},
"title": "Teams",
"type": "stat",
"targets": [{"expr": "gitea_teams{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
},
{
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 5},
"title": "Activity",
"type": "row"
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short"}},
"gridPos": {"h": 4, "w": 6, "x": 0, "y": 6},
"title": "Open Issues",
"type": "stat",
"targets": [{"expr": "gitea_issues_open{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short"}},
"gridPos": {"h": 4, "w": 6, "x": 6, "y": 6},
"title": "Closed Issues",
"type": "stat",
"targets": [{"expr": "gitea_issues_closed{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short"}},
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 6},
"title": "Webhooks",
"type": "stat",
"targets": [{"expr": "gitea_webhooks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short"}},
"gridPos": {"h": 4, "w": 6, "x": 18, "y": 6},
"title": "Hook Tasks",
"type": "stat",
"targets": [{"expr": "gitea_hooktasks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
},
{
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 10},
"title": "Content & Auth",
"type": "row"
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short"}},
"gridPos": {"h": 4, "w": 4, "x": 0, "y": 11},
"title": "Stars",
"type": "stat",
"targets": [{"expr": "gitea_stars{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short"}},
"gridPos": {"h": 4, "w": 4, "x": 4, "y": 11},
"title": "Watches",
"type": "stat",
"targets": [{"expr": "gitea_watches{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short"}},
"gridPos": {"h": 4, "w": 4, "x": 8, "y": 11},
"title": "Releases",
"type": "stat",
"targets": [{"expr": "gitea_releases{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short"}},
"gridPos": {"h": 4, "w": 4, "x": 12, "y": 11},
"title": "Mirrors",
"type": "stat",
"targets": [{"expr": "gitea_mirrors{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short"}},
"gridPos": {"h": 4, "w": 4, "x": 16, "y": 11},
"title": "Public Keys",
"type": "stat",
"targets": [{"expr": "gitea_publickeys{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short"}},
"gridPos": {"h": 4, "w": 4, "x": 20, "y": 11},
"title": "OAuth Apps",
"type": "stat",
"targets": [{"expr": "gitea_oauths{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
},
{
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 15},
"title": "Forgejo Logs",
"type": "row"
},
{
"datasource": {"type": "victoriametrics-logs-datasource"},
"gridPos": {"h": 10, "w": 12, "x": 0, "y": 16},
"title": "Forgejo Server Logs",
"type": "logs",
"targets": [{"expr": "{cluster_environment=~\"$cluster_environment\", kubernetes.namespace=\"gitea\"}", "refId": "A"}],
"options": {"showTime": true, "showLabels": true, "wrapLogMessage": true, "enableLogDetails": true, "sortOrder": "Descending"}
},
{
"datasource": {"type": "victoriametrics-logs-datasource"},
"gridPos": {"h": 10, "w": 12, "x": 12, "y": 16},
"title": "Forgejo Errors",
"type": "logs",
"targets": [{"expr": "{cluster_environment=~\"$cluster_environment\", kubernetes.namespace=\"gitea\"} error OR Error OR ERROR OR panic", "refId": "A"}],
"options": {"showTime": true, "showLabels": true, "wrapLogMessage": true, "enableLogDetails": true, "sortOrder": "Descending"}
}
],
"schemaVersion": 39,
"tags": ["edp", "forgejo", "gitea"],
"templating": {
"list": [
{
"current": {"selected": true, "text": "All", "value": "$__all"},
"datasource": {"type": "prometheus"},
"definition": "label_values(gitea_repositories, cluster_environment)",
"includeAll": true,
"multi": true,
"name": "cluster_environment",
"label": "Environment",
"query": "label_values(gitea_repositories, cluster_environment)",
"refresh": 2,
"type": "query"
}
]
},
"time": {"from": "now-6h", "to": "now"},
"title": "Forgejo",
"uid": "edp-forgejo"
}

View file

@ -1,117 +0,0 @@
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
name: garm
spec:
instanceSelector:
matchLabels:
dashboards: "grafana"
folder: "EDP / Applications"
json: |
{
"annotations": {"list": []},
"editable": true,
"graphTooltip": 1,
"panels": [
{
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
"title": "GARM Runner Status",
"type": "row"
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}},
"gridPos": {"h": 5, "w": 6, "x": 0, "y": 1},
"title": "Total Runners",
"type": "stat",
"targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\"}) or vector(0)", "legendFormat": ""}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}},
"gridPos": {"h": 5, "w": 6, "x": 6, "y": 1},
"title": "Idle Runners",
"type": "stat",
"targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\", status=\"idle\"}) or vector(0)", "legendFormat": ""}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "yellow", "value": null}]}}},
"gridPos": {"h": 5, "w": 6, "x": 12, "y": 1},
"title": "Creating",
"type": "stat",
"targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\", status=\"creating\"}) or vector(0)", "legendFormat": ""}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}]}}},
"gridPos": {"h": 5, "w": 6, "x": 18, "y": 1},
"title": "Errors",
"type": "stat",
"targets": [{"expr": "sum(rate(garm_runner_errors_total{cluster_environment=~\"$cluster_environment\"}[5m])) or vector(0)", "legendFormat": ""}]
},
{
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 6},
"title": "GitHub API Rate Limits",
"type": "row"
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short", "min": 0}},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 7},
"title": "Rate Limit Remaining",
"type": "timeseries",
"targets": [{"expr": "garm_github_rate_limit_remaining{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "ops"}},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 7},
"title": "Runner Operations Rate",
"type": "timeseries",
"targets": [{"expr": "sum(rate(garm_runner_operations_total{cluster_environment=~\"$cluster_environment\"}[5m])) by (cluster_environment)", "legendFormat": "{{cluster_environment}}"}]
},
{
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 15},
"title": "Runner Details",
"type": "row"
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"custom": {"filterable": true}}},
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 16},
"title": "Runner Pool Status",
"type": "table",
"targets": [{"expr": "garm_runner_status{cluster_environment=~\"$cluster_environment\"}", "format": "table", "instant": true}],
"transformations": [
{"id": "filterFieldsByName", "options": {"include": {"names": ["cluster_environment", "name", "status", "pool_owner", "pool_type", "provider"]}}},
{"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "name": "Runner", "status": "Status", "pool_owner": "Pool Owner", "pool_type": "Type", "provider": "Provider"}}}
]
}
],
"schemaVersion": 39,
"tags": ["edp", "garm", "ci-cd", "runners"],
"templating": {
"list": [
{
"current": {"selected": true, "text": "All", "value": "$__all"},
"datasource": {"type": "prometheus"},
"definition": "label_values(garm_runner_status, cluster_environment)",
"includeAll": true,
"multi": true,
"name": "cluster_environment",
"label": "Environment",
"query": "label_values(garm_runner_status, cluster_environment)",
"refresh": 2,
"sort": 1,
"type": "query"
}
]
},
"time": {"from": "now-6h", "to": "now"},
"title": "GARM Runners",
"uid": "edp-garm"
}

View file

@ -6,5 +6,4 @@ spec:
instanceSelector: instanceSelector:
matchLabels: matchLabels:
dashboards: "grafana" dashboards: "grafana"
folder: "EDP / Operations"
url: "https://raw.githubusercontent.com/adinhodovic/ingress-nginx-mixin/refs/heads/main/dashboards_out/ingress-nginx-overview.json" url: "https://raw.githubusercontent.com/adinhodovic/ingress-nginx-mixin/refs/heads/main/dashboards_out/ingress-nginx-overview.json"

View file

@ -1,245 +0,0 @@
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
name: platform-overview
spec:
instanceSelector:
matchLabels:
dashboards: "grafana"
folder: "EDP / Overview"
json: |
{
"annotations": {"list": []},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"links": [],
"panels": [
{
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
"title": "Platform Health",
"type": "row"
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {
"defaults": {
"mappings": [{"options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}, "type": "value"}],
"thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]}
}
},
"gridPos": {"h": 4, "w": 4, "x": 0, "y": 1},
"title": "Forgejo",
"type": "stat",
"targets": [{"expr": "sum(up{job=\"forgejo-server-http\", cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {
"defaults": {
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 3}]}
}
},
"gridPos": {"h": 4, "w": 4, "x": 4, "y": 1},
"title": "Ingress 5xx (5m)",
"type": "stat",
"targets": [{"expr": "sum(rate(nginx_ingress_controller_requests{status=~\"5..\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": ""}]
},
{
  "datasource": {"type": "prometheus"},
  "fieldConfig": {
    "defaults": {
      "unit": "short",
      "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}
    }
  },
  "gridPos": {"h": 4, "w": 4, "x": 8, "y": 1},
  "title": "Failed Jobs (gitea)",
  "type": "stat",
  "targets": [{"expr": "sum(kube_job_status_failed{namespace=\"gitea\", cluster_environment=~\"$cluster_environment\"}) or vector(0)", "legendFormat": ""}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {
"defaults": {
"unit": "percentunit",
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.7}, {"color": "red", "value": 0.85}]}
}
},
"gridPos": {"h": 4, "w": 4, "x": 12, "y": 1},
"title": "Cluster CPU Usage",
"type": "stat",
"targets": [{"expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": ""}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {
"defaults": {
"unit": "percentunit",
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.7}, {"color": "red", "value": 0.85}]}
}
},
"gridPos": {"h": 4, "w": 4, "x": 16, "y": 1},
"title": "Cluster Memory Usage",
"type": "stat",
"targets": [{"expr": "1 - sum(node_memory_MemAvailable_bytes{cluster_environment=~\"$cluster_environment\"}) / sum(node_memory_MemTotal_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {
"defaults": {
"unit": "percentunit",
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.6}, {"color": "red", "value": 0.8}]}
}
},
"gridPos": {"h": 4, "w": 4, "x": 20, "y": 1},
"title": "Max PVC Usage",
"type": "stat",
"targets": [{"expr": "max(1 - kubelet_volume_stats_available_bytes{cluster_environment=~\"$cluster_environment\"} / kubelet_volume_stats_capacity_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}]
},
{
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 5},
"title": "Forgejo",
"type": "row"
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short"}},
"gridPos": {"h": 4, "w": 4, "x": 0, "y": 6},
"title": "Repositories",
"type": "stat",
"targets": [{"expr": "gitea_repositories{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short"}},
"gridPos": {"h": 4, "w": 4, "x": 4, "y": 6},
"title": "Users",
"type": "stat",
"targets": [{"expr": "gitea_users{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short"}},
"gridPos": {"h": 4, "w": 4, "x": 8, "y": 6},
"title": "Organizations",
"type": "stat",
"targets": [{"expr": "gitea_organizations{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short"}},
"gridPos": {"h": 4, "w": 4, "x": 12, "y": 6},
"title": "Open Issues",
"type": "stat",
"targets": [{"expr": "gitea_issues_open{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short"}},
"gridPos": {"h": 4, "w": 4, "x": 16, "y": 6},
"title": "Webhooks",
"type": "stat",
"targets": [{"expr": "gitea_webhooks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short"}},
"gridPos": {"h": 4, "w": 4, "x": 20, "y": 6},
"title": "Mirrors",
"type": "stat",
"targets": [{"expr": "gitea_mirrors{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
},
{
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 10},
"title": "Resources",
"type": "row"
},
{
  "datasource": {"type": "prometheus"},
  "fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}},
  "gridPos": {"h": 8, "w": 12, "x": 0, "y": 11},
  "title": "Node CPU Usage",
  "type": "timeseries",
  "targets": [{"expr": "1 - avg by (cluster_environment, instance) (rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": "{{instance}}"}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 11},
"title": "PVC Usage by Claim",
"type": "timeseries",
"targets": [{"expr": "1 - (kubelet_volume_stats_available_bytes{cluster_environment=~\"$cluster_environment\"} / kubelet_volume_stats_capacity_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": "{{namespace}}/{{persistentvolumeclaim}}"}]
},
{
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 19},
"title": "Backups",
"type": "row"
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}}},
"gridPos": {"h": 4, "w": 8, "x": 0, "y": 20},
"title": "Time Since Last Backup Schedule",
"type": "stat",
"targets": [{"expr": "time() - kube_cronjob_status_last_schedule_time{cronjob=~\"forgejo-s3-backup|secrets-backup\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cronjob}} ({{cluster_environment}})"}]
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "s"}},
"gridPos": {"h": 4, "w": 8, "x": 8, "y": 20},
"title": "Backup Job Duration (Last 7d)",
"type": "timeseries",
"targets": [{"expr": "kube_job_status_completion_time{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"} - kube_job_status_start_time{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{job_name}}"}],
"options": {"legend": {"displayMode": "table"}}
},
{
"datasource": {"type": "prometheus"},
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}}},
"gridPos": {"h": 4, "w": 8, "x": 16, "y": 20},
"title": "Failed Backup Jobs (Active)",
"type": "stat",
"targets": [{"expr": "sum by(cluster_environment, job_name) (kube_job_status_failed{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"})", "legendFormat": "{{job_name}} ({{cluster_environment}})"}]
},
{
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 24},
"title": "Logs",
"type": "row"
},
{
"datasource": {"type": "victoriametrics-logs-datasource"},
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 25},
"title": "Recent Errors (all namespaces)",
"type": "logs",
"targets": [{"expr": "{cluster_environment=~\"$cluster_environment\"} error OR Error OR ERROR OR panic OR PANIC", "refId": "A"}],
"options": {"showTime": true, "showLabels": true, "showCommonLabels": false, "wrapLogMessage": true, "prettifyLogMessage": false, "enableLogDetails": true, "sortOrder": "Descending", "dedupStrategy": "none"}
}
],
"schemaVersion": 39,
"tags": ["edp", "platform", "overview"],
"templating": {
"list": [
{
"current": {"selected": true, "text": "All", "value": "$__all"},
"datasource": {"type": "prometheus"},
"definition": "label_values(up, cluster_environment)",
"includeAll": true,
"multi": true,
"name": "cluster_environment",
"label": "Environment",
"query": "label_values(up, cluster_environment)",
"refresh": 2,
"sort": 1,
"type": "query"
}
]
},
"time": {"from": "now-6h", "to": "now"},
"title": "EDP Platform Overview",
"uid": "edp-platform-overview"
}

View file

@ -6,7 +6,4 @@ spec:
instanceSelector: instanceSelector:
matchLabels: matchLabels:
dashboards: "grafana" dashboards: "grafana"
folder: "EDP / Operations" url: "https://raw.githubusercontent.com/VictoriaMetrics/VictoriaMetrics/refs/heads/master/dashboards/vm/victorialogs.json"
grafanaCom:
id: 22698
revision: 1

View file

@ -1,119 +1,40 @@
apiVersion: operator.victoriametrics.com/v1beta1 apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule kind: VMRule
metadata: metadata:
name: edp-platform-alerts name: forgejo-alerts
namespace: observability namespace: observability
spec: spec:
groups: groups:
- name: platform-health - name: forgejo
rules: rules:
- alert: ForgejoDown - alert: forgejo down
expr: sum by(cluster_environment) (up{job="forgejo-server-http"}) < 1 expr: sum by(cluster_environment) (up{pod=~"forgejo-server-.*"}) < 1
for: 1m for: 30s
labels: labels:
severity: critical severity: critical
job: "{{ $labels.job }}"
annotations: annotations:
summary: "Forgejo is down on {{ $labels.cluster_environment }}" value: "{{ $value }}"
description: "Forgejo server has been unreachable for more than 1 minute in cluster {{ $labels.cluster_environment }}." description: 'forgejo is down in cluster environment {{ $labels.cluster_environment }}'
- name: forgejo-backup
- alert: IngressHighErrorRate
expr: |
sum by(cluster_environment) (rate(nginx_ingress_controller_requests{status=~"5.."}[5m]))
/ sum by(cluster_environment) (rate(nginx_ingress_controller_requests[5m])) > 0.05
for: 5m
labels:
severity: major
annotations:
summary: "High ingress 5xx rate on {{ $labels.cluster_environment }}"
description: "More than 5% of ingress requests are returning 5xx errors for over 5 minutes."
value: "{{ $value | humanizePercentage }}"
- alert: NodeNotReady
expr: kube_node_status_condition{condition="Ready", status="true"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Node {{ $labels.node }} not ready on {{ $labels.cluster_environment }}"
description: "Node {{ $labels.node }} has been in NotReady state for more than 5 minutes."
- alert: PodCrashLooping
expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 15 > 3
for: 5m
labels:
severity: major
annotations:
summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} crash-looping on {{ $labels.cluster_environment }}"
description: "Pod has restarted more than 3 times in the last 15 minutes."
- name: storage
rules: rules:
- alert: PVCUsageHigh - alert: forgejo s3 backup job failed
expr: | expr: max by(cluster_environment) (kube_job_status_failed{job_name=~"forgejo-s3-backup-.*"}) != 0
1 - (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.80 for: 30s
for: 5m
labels:
severity: major
annotations:
summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} usage >80%"
description: "PVC usage is at {{ $value | humanizePercentage }} on {{ $labels.cluster_environment }}."
value: "{{ $value | humanizePercentage }}"
- alert: PVCUsageCritical
expr: |
1 - (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.90
for: 5m
labels: labels:
severity: critical severity: critical
job: "{{ $labels.job }}"
annotations: annotations:
summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} usage >90%" value: "{{ $value }}"
description: "PVC is almost full at {{ $value | humanizePercentage }} on {{ $labels.cluster_environment }}. Immediate action required." description: 'forgejo s3 backup job failed in cluster environment {{ $labels.cluster_environment }}'
value: "{{ $value | humanizePercentage }}" - name: disk-consumption-high
- name: resources
rules: rules:
- alert: NodeCPUHigh - alert: disk consumption high
expr: | expr: 1-(kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.6
1 - avg by(instance, cluster_environment) (rate(node_cpu_seconds_total{mode="idle"}[5m])) > 0.85 for: 30s
for: 15m
labels: labels:
severity: major severity: major
job: "{{ $labels.job }}"
annotations: annotations:
summary: "Node {{ $labels.instance }} CPU >85% on {{ $labels.cluster_environment }}" value: "{{ $value }}"
description: "Node CPU utilization has been above 85% for 15 minutes." description: 'disk consumption of pvc {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is high in cluster environment {{ $labels.cluster_environment }}'
value: "{{ $value | humanizePercentage }}"
# Fires when a node's memory utilization (1 - MemAvailable / MemTotal) stays
# above 90% for 10 minutes. The division is a one-to-one vector match, so the
# per-node labels (including `instance`) are preserved on the result and can
# be used in the annotations below.
- alert: NodeMemoryHigh
  expr: |
    1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.90
  for: 10m
  labels:
    severity: major
  annotations:
    # Name the affected node, matching the NodeCPUHigh summary format.
    summary: "Node {{ $labels.instance }} memory >90% on {{ $labels.cluster_environment }}"
    description: "Node memory utilization above 90% for 10 minutes."
    value: "{{ $value | humanizePercentage }}"
- name: cluster-health
rules:
# Fires when a cluster environment stops delivering kubelet `up` series.
#
# The previous first branch, `count(up{job="kubelet"}) by (cluster_environment) < 1`,
# could never be true: count() only yields a group when at least one series
# exists, so its value is always >= 1. A silent cluster produces *no* group at
# all, which a comparison cannot see.
#
# Instead, compare "environments seen at any point in the last day" with
# "environments seen right now" and alert on the difference. The explicit
# absent() check for `dev` is kept so that environment is still covered when
# it has been silent for longer than the 1d lookback.
- alert: ClusterMetricsSilent
  expr: |
    (
      count by (cluster_environment) (count_over_time(up{job="kubelet"}[1d]))
      unless
      count by (cluster_environment) (up{job="kubelet"})
    )
    or
    absent(up{job="kubelet", cluster_environment="dev"})
  for: 10m
  labels:
    severity: critical
  annotations:
    summary: "Cluster {{ $labels.cluster_environment }} stopped sending metrics"
    description: "No kubelet metrics received from cluster {{ $labels.cluster_environment }} for over 10 minutes. Either vmagent is dead or the cluster is unreachable."
- alert: ClusterAPIServerDown
expr: |
up{job="apiserver", cluster_environment=~".+"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "API server down on {{ $labels.cluster_environment }}"
description: "Kubernetes API server scrape is failing on cluster {{ $labels.cluster_environment }}."

View file

@ -1,78 +0,0 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: backup-alerts
namespace: observability
spec:
groups:
- name: backup-schedule-staleness
rules:
- alert: BackupCronJobNotScheduled
expr: |
time() - kube_cronjob_status_last_schedule_time{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"}
> 26 * 3600
for: 5m
labels:
severity: critical
cronjob: "{{ $labels.cronjob }}"
annotations:
value: "{{ $value | humanizeDuration }}"
description: >-
CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been
scheduled for over 26 hours in cluster {{ $labels.cluster_environment }}.
Last schedule was {{ $value | humanizeDuration }} ago.
summary: "Backup CronJob {{ $labels.cronjob }} is stale"
# Fires when a backup CronJob exists but has never been scheduled.
#
# kube-state-metrics exports kube_cronjob_status_last_schedule_time only once
# the CronJob has a status.lastScheduleTime; for a CronJob that never ran the
# series is missing rather than 0, so a bare `== 0` comparison does not match.
# The first branch therefore selects CronJobs that are known via
# kube_cronjob_info but have no last-schedule series. The original `== 0`
# comparison is kept as a second branch for exporters that do report 0.
- alert: BackupCronJobNeverScheduled
  expr: |
    (
      kube_cronjob_info{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"}
      unless on(cluster_environment, namespace, cronjob)
      kube_cronjob_status_last_schedule_time{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"}
    )
    or
    (
      kube_cronjob_status_last_schedule_time{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"}
      == 0
    )
  # 30m grace period so a freshly created CronJob does not alert immediately.
  for: 30m
  labels:
    severity: critical
    cronjob: "{{ $labels.cronjob }}"
  annotations:
    description: >-
      CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has never been
      scheduled in cluster {{ $labels.cluster_environment }}.
    summary: "Backup CronJob {{ $labels.cronjob }} never ran"
- name: backup-job-failures
rules:
- alert: BackupJobFailed
expr: |
max by(cluster_environment, namespace, job_name) (
kube_job_status_failed{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"}
) > 0
for: 30s
labels:
severity: critical
job_name: "{{ $labels.job_name }}"
annotations:
value: "{{ $value }}"
description: >-
Backup job {{ $labels.namespace }}/{{ $labels.job_name }} has
{{ $value }} failed pod(s) in cluster {{ $labels.cluster_environment }}.
summary: "Backup job {{ $labels.job_name }} failed"
- name: backup-job-duration
rules:
- alert: BackupJobTooSlow
expr: |
(
time() - kube_job_status_start_time{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"}
) > 300
and
kube_job_status_active{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"} > 0
for: 1m
labels:
severity: major
job_name: "{{ $labels.job_name }}"
annotations:
value: "{{ $value | humanizeDuration }}"
description: >-
Backup job {{ $labels.namespace }}/{{ $labels.job_name }} has been
running for {{ $value | humanizeDuration }} (threshold: 5m)
in cluster {{ $labels.cluster_environment }}. This may indicate a
hung process or connectivity issue.
summary: "Backup job {{ $labels.job_name }} running too long"

View file

@ -10,4 +10,4 @@ spec:
matchLabels: matchLabels:
app.kubernetes.io/name: garm app.kubernetes.io/name: garm
endpoints: endpoints:
- port: http - port: metrics

View file

@ -1,9 +0,0 @@
# Opaque Secret holding the basic-auth credentials (username/password keys)
# for "simple-user" in the observability namespace.
#
# NOTE(review): the values under `data` are only base64-encoded, not
# encrypted; anyone who can read this manifest can decode the credential.
# Consider sourcing it from a secret manager / sealed secret instead of a
# plain manifest, and rotating the password — TODO confirm with the owners.
apiVersion: v1
kind: Secret
metadata:
name: simple-user-secret
namespace: observability
type: Opaque
data:
# base64 of the account name
username: c2ltcGxlLXVzZXI=
# base64 of the account password (sensitive)
password: c3g1Z0M3b29XYVdPT0R3RA==

View file

@ -5,17 +5,13 @@ metadata:
namespace: observability namespace: observability
spec: spec:
username: simple-user username: simple-user
password: sx5gC7ooWaWOODwD passwordRef:
key: password
name: simple-user-secret
targetRefs: targetRefs:
- static: - static:
url: http://vmsingle-o12y:8429 url: http://vmsingle-o12y:8429
paths: ["/api/v1/write"] paths: ["/api/v1/write"]
- static:
url: http://vmsingle-o12y:8429
paths: ["/api/v1/.*"]
- static: - static:
url: http://vlogs-victorialogs:9428 url: http://vlogs-victorialogs:9428
paths: ["/insert/elasticsearch/.*"] paths: ["/insert/elasticsearch/.*"]
- static:
url: http://vlogs-victorialogs:9428
paths: ["/select/.*"]

View file

@ -1,6 +1,6 @@
global: global:
# -- Cluster label to use for dashboards and rules # -- Cluster label to use for dashboards and rules
clusterLabel: cluster_environment clusterLabel: cluster
# -- Global license configuration # -- Global license configuration
license: license:
key: "" key: ""
@ -201,13 +201,13 @@ defaultRules:
enabled: true enabled: true
rules: {} rules: {}
kubernetesSystemControllerManager: kubernetesSystemControllerManager:
create: false enabled: false
rules: {} rules: {}
kubeScheduler: kubeScheduler:
create: false enabled: false
rules: {} rules: {}
kubernetesSystemScheduler: kubernetesSystemScheduler:
create: false enabled: false
rules: {} rules: {}
kubeStateMetrics: kubeStateMetrics:
enabled: true enabled: true