Compare commits
No commits in common. "main" and "benchmark-cluster-2026-06-19-canonical" have entirely different histories.
main
...
benchmark-
42 changed files with 66 additions and 1238 deletions
|
|
@ -35,30 +35,6 @@ configs:
|
||||||
tls:
|
tls:
|
||||||
certificates:
|
certificates:
|
||||||
|
|
||||||
controller:
|
|
||||||
metrics:
|
|
||||||
enabled: true
|
|
||||||
serviceMonitor:
|
|
||||||
enabled: false
|
|
||||||
|
|
||||||
server:
|
|
||||||
metrics:
|
|
||||||
enabled: true
|
|
||||||
serviceMonitor:
|
|
||||||
enabled: false
|
|
||||||
|
|
||||||
repoServer:
|
|
||||||
metrics:
|
|
||||||
enabled: true
|
|
||||||
serviceMonitor:
|
|
||||||
enabled: false
|
|
||||||
|
|
||||||
applicationSet:
|
|
||||||
metrics:
|
|
||||||
enabled: true
|
|
||||||
serviceMonitor:
|
|
||||||
enabled: false
|
|
||||||
|
|
||||||
notifications:
|
notifications:
|
||||||
enabled: false
|
enabled: false
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -11,8 +11,8 @@ spec:
|
||||||
startingDeadlineSeconds: 600 # 10 minutes
|
startingDeadlineSeconds: 600 # 10 minutes
|
||||||
jobTemplate:
|
jobTemplate:
|
||||||
spec:
|
spec:
|
||||||
# 2h window: handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m
|
# 60 min until backup - 10 min start - (backoffLimit * activeDeadlineSeconds) - some time sync buffer
|
||||||
activeDeadlineSeconds: 14400
|
activeDeadlineSeconds: 1350
|
||||||
backoffLimit: 2
|
backoffLimit: 2
|
||||||
ttlSecondsAfterFinished: 259200 #
|
ttlSecondsAfterFinished: 259200 #
|
||||||
template:
|
template:
|
||||||
|
|
|
||||||
|
|
@ -48,7 +48,7 @@ customConfig:
|
||||||
type: elasticsearch
|
type: elasticsearch
|
||||||
inputs: [parser]
|
inputs: [parser]
|
||||||
endpoints:
|
endpoints:
|
||||||
- https://o12y.observability.buildth.ing/insert/elasticsearch/
|
- https://o12y.observability./insert/elasticsearch/
|
||||||
auth:
|
auth:
|
||||||
strategy: basic
|
strategy: basic
|
||||||
user: ${VECTOR_USER}
|
user: ${VECTOR_USER}
|
||||||
|
|
|
||||||
|
|
@ -1,15 +0,0 @@
|
||||||
apiVersion: operator.victoriametrics.com/v1beta1
|
|
||||||
kind: VMServiceScrape
|
|
||||||
metadata:
|
|
||||||
name: forgejo
|
|
||||||
namespace: observability
|
|
||||||
spec:
|
|
||||||
namespaceSelector:
|
|
||||||
matchNames:
|
|
||||||
- gitea
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app.kubernetes.io/name: forgejo
|
|
||||||
endpoints:
|
|
||||||
- port: http
|
|
||||||
path: /metrics
|
|
||||||
|
|
@ -201,13 +201,13 @@ defaultRules:
|
||||||
create: true
|
create: true
|
||||||
rules: {}
|
rules: {}
|
||||||
kubernetesSystemControllerManager:
|
kubernetesSystemControllerManager:
|
||||||
create: false
|
create: true
|
||||||
rules: {}
|
rules: {}
|
||||||
kubeScheduler:
|
kubeScheduler:
|
||||||
create: false
|
create: true
|
||||||
rules: {}
|
rules: {}
|
||||||
kubernetesSystemScheduler:
|
kubernetesSystemScheduler:
|
||||||
create: false
|
create: true
|
||||||
rules: {}
|
rules: {}
|
||||||
kubeStateMetrics:
|
kubeStateMetrics:
|
||||||
create: true
|
create: true
|
||||||
|
|
@ -778,7 +778,7 @@ vmagent:
|
||||||
# -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec)
|
# -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec)
|
||||||
additionalRemoteWrites:
|
additionalRemoteWrites:
|
||||||
# []
|
# []
|
||||||
- url: https://o12y.observability.buildth.ing/api/v1/write
|
- url: https://o12y.observability./api/v1/write
|
||||||
basicAuth:
|
basicAuth:
|
||||||
username:
|
username:
|
||||||
name: simple-user-secret
|
name: simple-user-secret
|
||||||
|
|
|
||||||
|
|
@ -35,30 +35,6 @@ configs:
|
||||||
tls:
|
tls:
|
||||||
certificates:
|
certificates:
|
||||||
|
|
||||||
controller:
|
|
||||||
metrics:
|
|
||||||
enabled: true
|
|
||||||
serviceMonitor:
|
|
||||||
enabled: false
|
|
||||||
|
|
||||||
server:
|
|
||||||
metrics:
|
|
||||||
enabled: true
|
|
||||||
serviceMonitor:
|
|
||||||
enabled: false
|
|
||||||
|
|
||||||
repoServer:
|
|
||||||
metrics:
|
|
||||||
enabled: true
|
|
||||||
serviceMonitor:
|
|
||||||
enabled: false
|
|
||||||
|
|
||||||
applicationSet:
|
|
||||||
metrics:
|
|
||||||
enabled: true
|
|
||||||
serviceMonitor:
|
|
||||||
enabled: false
|
|
||||||
|
|
||||||
notifications:
|
notifications:
|
||||||
enabled: false
|
enabled: false
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -11,8 +11,8 @@ spec:
|
||||||
startingDeadlineSeconds: 600 # 10 minutes
|
startingDeadlineSeconds: 600 # 10 minutes
|
||||||
jobTemplate:
|
jobTemplate:
|
||||||
spec:
|
spec:
|
||||||
# 2h window: handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m
|
# 60 min until backup - 10 min start - (backoffLimit * activeDeadlineSeconds) - some time sync buffer
|
||||||
activeDeadlineSeconds: 14400
|
activeDeadlineSeconds: 1350
|
||||||
backoffLimit: 2
|
backoffLimit: 2
|
||||||
ttlSecondsAfterFinished: 259200 #
|
ttlSecondsAfterFinished: 259200 #
|
||||||
template:
|
template:
|
||||||
|
|
|
||||||
|
|
@ -41,8 +41,5 @@ providerConfig:
|
||||||
sidecarImage: edp.buildth.ing/devfw-cicd/ci-sizer-collector:0.0.4
|
sidecarImage: edp.buildth.ing/devfw-cicd/ci-sizer-collector:0.0.4
|
||||||
|
|
||||||
garm:
|
garm:
|
||||||
metrics:
|
|
||||||
enable: true
|
|
||||||
disableAuth: true
|
|
||||||
logging:
|
logging:
|
||||||
logLevel: info
|
logLevel: info
|
||||||
|
|
|
||||||
|
|
@ -48,7 +48,7 @@ customConfig:
|
||||||
type: elasticsearch
|
type: elasticsearch
|
||||||
inputs: [parser]
|
inputs: [parser]
|
||||||
endpoints:
|
endpoints:
|
||||||
- https://o12y.observability.buildth.ing/insert/elasticsearch/
|
- https://o12y.observability./insert/elasticsearch/
|
||||||
auth:
|
auth:
|
||||||
strategy: basic
|
strategy: basic
|
||||||
user: ${VECTOR_USER}
|
user: ${VECTOR_USER}
|
||||||
|
|
|
||||||
|
|
@ -1,14 +0,0 @@
|
||||||
apiVersion: operator.victoriametrics.com/v1beta1
|
|
||||||
kind: VMServiceScrape
|
|
||||||
metadata:
|
|
||||||
name: argocd
|
|
||||||
namespace: observability
|
|
||||||
spec:
|
|
||||||
namespaceSelector:
|
|
||||||
matchNames:
|
|
||||||
- argocd
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app.kubernetes.io/part-of: argocd
|
|
||||||
endpoints:
|
|
||||||
- port: http-metrics
|
|
||||||
|
|
@ -1,15 +0,0 @@
|
||||||
apiVersion: operator.victoriametrics.com/v1beta1
|
|
||||||
kind: VMServiceScrape
|
|
||||||
metadata:
|
|
||||||
name: forgejo
|
|
||||||
namespace: observability
|
|
||||||
spec:
|
|
||||||
namespaceSelector:
|
|
||||||
matchNames:
|
|
||||||
- gitea
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app.kubernetes.io/name: forgejo
|
|
||||||
endpoints:
|
|
||||||
- port: http
|
|
||||||
path: /metrics
|
|
||||||
|
|
@ -1,15 +0,0 @@
|
||||||
apiVersion: operator.victoriametrics.com/v1beta1
|
|
||||||
kind: VMServiceScrape
|
|
||||||
metadata:
|
|
||||||
name: garm
|
|
||||||
namespace: observability
|
|
||||||
spec:
|
|
||||||
namespaceSelector:
|
|
||||||
matchNames:
|
|
||||||
- garm
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app.kubernetes.io/name: garm
|
|
||||||
endpoints:
|
|
||||||
- port: http
|
|
||||||
path: /metrics
|
|
||||||
|
|
@ -201,13 +201,13 @@ defaultRules:
|
||||||
create: true
|
create: true
|
||||||
rules: {}
|
rules: {}
|
||||||
kubernetesSystemControllerManager:
|
kubernetesSystemControllerManager:
|
||||||
create: false
|
create: true
|
||||||
rules: {}
|
rules: {}
|
||||||
kubeScheduler:
|
kubeScheduler:
|
||||||
create: false
|
create: true
|
||||||
rules: {}
|
rules: {}
|
||||||
kubernetesSystemScheduler:
|
kubernetesSystemScheduler:
|
||||||
create: false
|
create: true
|
||||||
rules: {}
|
rules: {}
|
||||||
kubeStateMetrics:
|
kubeStateMetrics:
|
||||||
create: true
|
create: true
|
||||||
|
|
@ -778,7 +778,7 @@ vmagent:
|
||||||
# -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec)
|
# -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec)
|
||||||
additionalRemoteWrites:
|
additionalRemoteWrites:
|
||||||
# []
|
# []
|
||||||
- url: https://o12y.observability.buildth.ing/api/v1/write
|
- url: https://o12y.observability./api/v1/write
|
||||||
basicAuth:
|
basicAuth:
|
||||||
username:
|
username:
|
||||||
name: simple-user-secret
|
name: simple-user-secret
|
||||||
|
|
@ -801,20 +801,6 @@ vmagent:
|
||||||
# Do not store original labels in vmagent's memory by default. This reduces the amount of memory used by vmagent
|
# Do not store original labels in vmagent's memory by default. This reduces the amount of memory used by vmagent
|
||||||
# but makes vmagent debugging UI less informative. See: https://docs.victoriametrics.com/vmagent/#relabel-debug
|
# but makes vmagent debugging UI less informative. See: https://docs.victoriametrics.com/vmagent/#relabel-debug
|
||||||
promscrape.dropOriginalLabels: "true"
|
promscrape.dropOriginalLabels: "true"
|
||||||
# Harden liveness probe: default failureThreshold=10 masked a 72h silent outage
|
|
||||||
livenessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /health
|
|
||||||
port: http
|
|
||||||
failureThreshold: 3
|
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 5
|
|
||||||
startupProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /health
|
|
||||||
port: http
|
|
||||||
failureThreshold: 30
|
|
||||||
periodSeconds: 5
|
|
||||||
# -- (object) VMAgent ingress configuration
|
# -- (object) VMAgent ingress configuration
|
||||||
ingress:
|
ingress:
|
||||||
enabled: false
|
enabled: false
|
||||||
|
|
|
||||||
|
|
@ -35,10 +35,8 @@ spec:
|
||||||
server:
|
server:
|
||||||
root_url: "https://grafana.dev.t09.de"
|
root_url: "https://grafana.dev.t09.de"
|
||||||
auth:
|
auth:
|
||||||
|
disable_login: "true"
|
||||||
disable_login_form: "true"
|
disable_login_form: "true"
|
||||||
security:
|
|
||||||
admin_user: admin
|
|
||||||
admin_password: admin
|
|
||||||
auth.generic_oauth:
|
auth.generic_oauth:
|
||||||
enabled: "true"
|
enabled: "true"
|
||||||
name: Forgejo
|
name: Forgejo
|
||||||
|
|
|
||||||
|
|
@ -9,13 +9,10 @@ spec:
|
||||||
project: default
|
project: default
|
||||||
syncPolicy:
|
syncPolicy:
|
||||||
automated:
|
automated:
|
||||||
prune: true
|
|
||||||
selfHeal: true
|
selfHeal: true
|
||||||
syncOptions:
|
syncOptions:
|
||||||
- CreateNamespace=true
|
- CreateNamespace=true
|
||||||
- ServerSideApply=true
|
- ServerSideApply=true
|
||||||
- RespectIgnoreDifferences=true
|
|
||||||
- SkipDryRunOnMissingResource=true
|
|
||||||
destination:
|
destination:
|
||||||
name: in-cluster
|
name: in-cluster
|
||||||
namespace: observability
|
namespace: observability
|
||||||
|
|
|
||||||
|
|
@ -11,4 +11,4 @@ spec:
|
||||||
matchLabels:
|
matchLabels:
|
||||||
app.kubernetes.io/name: garm
|
app.kubernetes.io/name: garm
|
||||||
endpoints:
|
endpoints:
|
||||||
- port: http
|
- port: metrics
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
apiVersion: operator.victoriametrics.com/v1beta1
|
apiVersion: operator.victoriametrics.com/v1beta1
|
||||||
kind: VLogs
|
kind: VLSingle
|
||||||
metadata:
|
metadata:
|
||||||
name: victorialogs
|
name: victorialogs
|
||||||
namespace: observability
|
namespace: observability
|
||||||
|
|
|
||||||
|
|
@ -12,12 +12,6 @@ spec:
|
||||||
- static:
|
- static:
|
||||||
url: http://vmsingle-o12y:8429
|
url: http://vmsingle-o12y:8429
|
||||||
paths: ["/api/v1/write"]
|
paths: ["/api/v1/write"]
|
||||||
- static:
|
|
||||||
url: http://vmsingle-o12y:8429
|
|
||||||
paths: ["/api/v1/.*"]
|
|
||||||
- static:
|
- static:
|
||||||
url: http://vlogs-victorialogs:9428
|
url: http://vlogs-victorialogs:9428
|
||||||
paths: ["/insert/elasticsearch/.*"]
|
paths: ["/insert/elasticsearch/.*"]
|
||||||
- static:
|
|
||||||
url: http://vlogs-victorialogs:9428
|
|
||||||
paths: ["/select/.*"]
|
|
||||||
|
|
@ -28,7 +28,10 @@ victoria-metrics-operator:
|
||||||
crds:
|
crds:
|
||||||
plain: true
|
plain: true
|
||||||
cleanup:
|
cleanup:
|
||||||
enabled: false # disabled: cleanup hook can't schedule on resource-constrained nodes (Insufficient cpu / Too many pods)
|
enabled: true
|
||||||
|
image:
|
||||||
|
repository: bitnami/kubectl
|
||||||
|
pullPolicy: IfNotPresent
|
||||||
serviceMonitor:
|
serviceMonitor:
|
||||||
enabled: true
|
enabled: true
|
||||||
operator:
|
operator:
|
||||||
|
|
@ -673,7 +676,7 @@ vmalert:
|
||||||
|
|
||||||
vmauth:
|
vmauth:
|
||||||
# -- Enable VMAuth CR
|
# -- Enable VMAuth CR
|
||||||
enabled: false
|
enabled: true
|
||||||
# -- VMAuth annotations
|
# -- VMAuth annotations
|
||||||
annotations: {}
|
annotations: {}
|
||||||
# -- (object) Full spec for VMAuth CRD. Allowed values described [here](https://docs.victoriametrics.com/operator/api#vmauthspec)
|
# -- (object) Full spec for VMAuth CRD. Allowed values described [here](https://docs.victoriametrics.com/operator/api#vmauthspec)
|
||||||
|
|
@ -696,7 +699,7 @@ vmauth:
|
||||||
|
|
||||||
vmagent:
|
vmagent:
|
||||||
# -- Create VMAgent CR
|
# -- Create VMAgent CR
|
||||||
enabled: true
|
enabled: false
|
||||||
# -- VMAgent annotations
|
# -- VMAgent annotations
|
||||||
annotations: {}
|
annotations: {}
|
||||||
# -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec)
|
# -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec)
|
||||||
|
|
@ -708,8 +711,7 @@ vmagent:
|
||||||
port: "8429"
|
port: "8429"
|
||||||
selectAllByDefault: true
|
selectAllByDefault: true
|
||||||
scrapeInterval: 20s
|
scrapeInterval: 20s
|
||||||
externalLabels:
|
externalLabels: {}
|
||||||
cluster_environment: "dev"
|
|
||||||
# For multi-cluster setups it is useful to use "cluster" label to identify the metrics source.
|
# For multi-cluster setups it is useful to use "cluster" label to identify the metrics source.
|
||||||
# For example:
|
# For example:
|
||||||
# cluster: cluster-name
|
# cluster: cluster-name
|
||||||
|
|
|
||||||
|
|
@ -35,30 +35,6 @@ configs:
|
||||||
tls:
|
tls:
|
||||||
certificates:
|
certificates:
|
||||||
|
|
||||||
controller:
|
|
||||||
metrics:
|
|
||||||
enabled: true
|
|
||||||
serviceMonitor:
|
|
||||||
enabled: false
|
|
||||||
|
|
||||||
server:
|
|
||||||
metrics:
|
|
||||||
enabled: true
|
|
||||||
serviceMonitor:
|
|
||||||
enabled: false
|
|
||||||
|
|
||||||
repoServer:
|
|
||||||
metrics:
|
|
||||||
enabled: true
|
|
||||||
serviceMonitor:
|
|
||||||
enabled: false
|
|
||||||
|
|
||||||
applicationSet:
|
|
||||||
metrics:
|
|
||||||
enabled: true
|
|
||||||
serviceMonitor:
|
|
||||||
enabled: false
|
|
||||||
|
|
||||||
notifications:
|
notifications:
|
||||||
enabled: false
|
enabled: false
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -11,8 +11,8 @@ spec:
|
||||||
startingDeadlineSeconds: 600 # 10 minutes
|
startingDeadlineSeconds: 600 # 10 minutes
|
||||||
jobTemplate:
|
jobTemplate:
|
||||||
spec:
|
spec:
|
||||||
# 2h window: handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m
|
# 60 min until backup - 10 min start - (backoffLimit * activeDeadlineSeconds) - some time sync buffer
|
||||||
activeDeadlineSeconds: 14400
|
activeDeadlineSeconds: 1350
|
||||||
backoffLimit: 2
|
backoffLimit: 2
|
||||||
ttlSecondsAfterFinished: 259200 #
|
ttlSecondsAfterFinished: 259200 #
|
||||||
template:
|
template:
|
||||||
|
|
|
||||||
|
|
@ -48,7 +48,7 @@ customConfig:
|
||||||
type: elasticsearch
|
type: elasticsearch
|
||||||
inputs: [parser]
|
inputs: [parser]
|
||||||
endpoints:
|
endpoints:
|
||||||
- https://o12y.observability.buildth.ing/insert/elasticsearch/
|
- https://o12y.observability./insert/elasticsearch/
|
||||||
auth:
|
auth:
|
||||||
strategy: basic
|
strategy: basic
|
||||||
user: ${VECTOR_USER}
|
user: ${VECTOR_USER}
|
||||||
|
|
|
||||||
|
|
@ -1,15 +0,0 @@
|
||||||
apiVersion: operator.victoriametrics.com/v1beta1
|
|
||||||
kind: VMServiceScrape
|
|
||||||
metadata:
|
|
||||||
name: forgejo
|
|
||||||
namespace: observability
|
|
||||||
spec:
|
|
||||||
namespaceSelector:
|
|
||||||
matchNames:
|
|
||||||
- gitea
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app.kubernetes.io/name: forgejo
|
|
||||||
endpoints:
|
|
||||||
- port: http
|
|
||||||
path: /metrics
|
|
||||||
|
|
@ -201,13 +201,13 @@ defaultRules:
|
||||||
create: true
|
create: true
|
||||||
rules: {}
|
rules: {}
|
||||||
kubernetesSystemControllerManager:
|
kubernetesSystemControllerManager:
|
||||||
create: false
|
create: true
|
||||||
rules: {}
|
rules: {}
|
||||||
kubeScheduler:
|
kubeScheduler:
|
||||||
create: false
|
create: true
|
||||||
rules: {}
|
rules: {}
|
||||||
kubernetesSystemScheduler:
|
kubernetesSystemScheduler:
|
||||||
create: false
|
create: true
|
||||||
rules: {}
|
rules: {}
|
||||||
kubeStateMetrics:
|
kubeStateMetrics:
|
||||||
create: true
|
create: true
|
||||||
|
|
@ -778,7 +778,7 @@ vmagent:
|
||||||
# -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec)
|
# -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec)
|
||||||
additionalRemoteWrites:
|
additionalRemoteWrites:
|
||||||
# []
|
# []
|
||||||
- url: https://o12y.observability.buildth.ing/api/v1/write
|
- url: https://o12y.observability./api/v1/write
|
||||||
basicAuth:
|
basicAuth:
|
||||||
username:
|
username:
|
||||||
name: simple-user-secret
|
name: simple-user-secret
|
||||||
|
|
|
||||||
|
|
@ -11,8 +11,8 @@ spec:
|
||||||
startingDeadlineSeconds: 600 # 10 minutes
|
startingDeadlineSeconds: 600 # 10 minutes
|
||||||
jobTemplate:
|
jobTemplate:
|
||||||
spec:
|
spec:
|
||||||
# 2h window: handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m
|
# 60 min until backup - 10 min start - (backoffLimit * activeDeadlineSeconds) - some time sync buffer
|
||||||
activeDeadlineSeconds: 14400
|
activeDeadlineSeconds: 1350
|
||||||
backoffLimit: 2
|
backoffLimit: 2
|
||||||
ttlSecondsAfterFinished: 259200 #
|
ttlSecondsAfterFinished: 259200 #
|
||||||
template:
|
template:
|
||||||
|
|
|
||||||
|
|
@ -1,153 +0,0 @@
|
||||||
apiVersion: grafana.integreatly.org/v1beta1
|
|
||||||
kind: GrafanaDashboard
|
|
||||||
metadata:
|
|
||||||
name: argocd-operational
|
|
||||||
spec:
|
|
||||||
instanceSelector:
|
|
||||||
matchLabels:
|
|
||||||
dashboards: "grafana"
|
|
||||||
folder: "EDP / Applications"
|
|
||||||
json: |
|
|
||||||
{
|
|
||||||
"annotations": {"list": []},
|
|
||||||
"editable": true,
|
|
||||||
"graphTooltip": 1,
|
|
||||||
"panels": [
|
|
||||||
{
|
|
||||||
"collapsed": false,
|
|
||||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
|
|
||||||
"title": "Application Status",
|
|
||||||
"type": "row"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 0, "y": 1},
|
|
||||||
"title": "Total Apps",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 4, "y": 1},
|
|
||||||
"title": "Healthy",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", health_status=\"Healthy\"})", "legendFormat": ""}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}]}}},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 8, "y": 1},
|
|
||||||
"title": "Degraded",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", health_status=\"Degraded\"}) or vector(0)", "legendFormat": ""}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 12, "y": 1},
|
|
||||||
"title": "Synced",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", sync_status=\"Synced\"})", "legendFormat": ""}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "yellow", "value": null}]}}},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 16, "y": 1},
|
|
||||||
"title": "OutOfSync",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", sync_status=\"OutOfSync\"}) or vector(0)", "legendFormat": ""}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "orange", "value": null}]}}},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 20, "y": 1},
|
|
||||||
"title": "Progressing",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", health_status=\"Progressing\"}) or vector(0)", "legendFormat": ""}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"collapsed": false,
|
|
||||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 5},
|
|
||||||
"title": "Application Details",
|
|
||||||
"type": "row"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {"custom": {"filterable": true}},
|
|
||||||
"overrides": [
|
|
||||||
{"matcher": {"id": "byName", "options": "Health"}, "properties": [{"id": "custom.cellOptions", "value": {"type": "color-text"}}, {"id": "mappings", "value": [{"options": {"Healthy": {"color": "green", "text": "Healthy"}, "Degraded": {"color": "red", "text": "Degraded"}, "Progressing": {"color": "yellow", "text": "Progressing"}, "Missing": {"color": "purple", "text": "Missing"}}, "type": "value"}]}]},
|
|
||||||
{"matcher": {"id": "byName", "options": "Sync"}, "properties": [{"id": "custom.cellOptions", "value": {"type": "color-text"}}, {"id": "mappings", "value": [{"options": {"Synced": {"color": "green", "text": "Synced"}, "OutOfSync": {"color": "orange", "text": "OutOfSync"}}, "type": "value"}]}]}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"gridPos": {"h": 12, "w": 24, "x": 0, "y": 6},
|
|
||||||
"title": "All Applications",
|
|
||||||
"type": "table",
|
|
||||||
"targets": [{"expr": "argocd_app_info{cluster_environment=~\"$cluster_environment\"}", "format": "table", "instant": true, "legendFormat": ""}],
|
|
||||||
"transformations": [
|
|
||||||
{"id": "filterFieldsByName", "options": {"include": {"names": ["cluster_environment", "name", "dest_namespace", "health_status", "sync_status", "repo"]}}},
|
|
||||||
{"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "name": "Application", "dest_namespace": "Namespace", "health_status": "Health", "sync_status": "Sync", "repo": "Repository"}}}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"collapsed": false,
|
|
||||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 18},
|
|
||||||
"title": "Sync Activity",
|
|
||||||
"type": "row"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "ops"}},
|
|
||||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 19},
|
|
||||||
"title": "Sync Operations (rate)",
|
|
||||||
"type": "timeseries",
|
|
||||||
"targets": [{"expr": "sum(rate(argocd_app_sync_total{cluster_environment=~\"$cluster_environment\"}[5m])) by (name, phase)", "legendFormat": "{{name}} ({{phase}})"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "ops"}},
|
|
||||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 19},
|
|
||||||
"title": "Reconciliation Rate",
|
|
||||||
"type": "timeseries",
|
|
||||||
"targets": [{"expr": "sum(rate(argocd_app_reconcile_count{cluster_environment=~\"$cluster_environment\"}[5m])) by (namespace)", "legendFormat": "{{namespace}}"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"collapsed": false,
|
|
||||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 27},
|
|
||||||
"title": "ArgoCD Logs",
|
|
||||||
"type": "row"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "victoriametrics-logs-datasource"},
|
|
||||||
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 28},
|
|
||||||
"title": "ArgoCD Logs",
|
|
||||||
"type": "logs",
|
|
||||||
"targets": [{"expr": "{cluster_environment=~\"$cluster_environment\", kubernetes.namespace=\"argocd\"}", "refId": "A"}],
|
|
||||||
"options": {"showTime": true, "showLabels": true, "wrapLogMessage": true, "enableLogDetails": true, "sortOrder": "Descending"}
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"schemaVersion": 39,
|
|
||||||
"tags": ["edp", "argocd", "gitops"],
|
|
||||||
"templating": {
|
|
||||||
"list": [
|
|
||||||
{
|
|
||||||
"current": {"selected": true, "text": "All", "value": "$__all"},
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"definition": "label_values(argocd_app_info, cluster_environment)",
|
|
||||||
"includeAll": true,
|
|
||||||
"multi": true,
|
|
||||||
"name": "cluster_environment",
|
|
||||||
"label": "Environment",
|
|
||||||
"query": "label_values(argocd_app_info, cluster_environment)",
|
|
||||||
"refresh": 2,
|
|
||||||
"sort": 1,
|
|
||||||
"type": "query"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"time": {"from": "now-6h", "to": "now"},
|
|
||||||
"title": "ArgoCD Operations",
|
|
||||||
"uid": "edp-argocd-ops"
|
|
||||||
}
|
|
||||||
|
|
@ -6,5 +6,4 @@ spec:
|
||||||
instanceSelector:
|
instanceSelector:
|
||||||
matchLabels:
|
matchLabels:
|
||||||
dashboards: "grafana"
|
dashboards: "grafana"
|
||||||
folder: "EDP / Applications"
|
|
||||||
url: "https://raw.githubusercontent.com/argoproj/argo-cd/refs/heads/master/examples/dashboard.json"
|
url: "https://raw.githubusercontent.com/argoproj/argo-cd/refs/heads/master/examples/dashboard.json"
|
||||||
|
|
|
||||||
|
|
@ -1,103 +0,0 @@
|
||||||
apiVersion: grafana.integreatly.org/v1beta1
|
|
||||||
kind: GrafanaDashboard
|
|
||||||
metadata:
|
|
||||||
name: cronjob-monitoring
|
|
||||||
spec:
|
|
||||||
instanceSelector:
|
|
||||||
matchLabels:
|
|
||||||
dashboards: "grafana"
|
|
||||||
folder: "EDP / Operations"
|
|
||||||
json: |
|
|
||||||
{
|
|
||||||
"annotations": {"list": []},
|
|
||||||
"editable": true,
|
|
||||||
"graphTooltip": 1,
|
|
||||||
"panels": [
|
|
||||||
{
|
|
||||||
"collapsed": false,
|
|
||||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
|
|
||||||
"title": "Backup Job Status",
|
|
||||||
"type": "row"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}}},
|
|
||||||
"gridPos": {"h": 5, "w": 12, "x": 0, "y": 1},
|
|
||||||
"title": "Time Since Last Schedule",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "time() - kube_cronjob_status_last_schedule_time{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cronjob}} ({{cluster_environment}})"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}}},
|
|
||||||
"gridPos": {"h": 5, "w": 12, "x": 12, "y": 1},
|
|
||||||
"title": "Failed Jobs (Active)",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "sum by(cluster_environment, job_name) (kube_job_status_failed{cluster_environment=~\"$cluster_environment\"}) > 0", "legendFormat": "{{job_name}} ({{cluster_environment}})"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"collapsed": false,
|
|
||||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 6},
|
|
||||||
"title": "CronJob Overview",
|
|
||||||
"type": "row"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"custom": {"filterable": true}}, "overrides": [{"matcher": {"id": "byName", "options": "Suspended"}, "properties": [{"id": "mappings", "value": [{"options": {"0": {"text": "No", "color": "green"}, "1": {"text": "YES", "color": "red"}}, "type": "value"}]}]}]},
|
|
||||||
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 7},
|
|
||||||
"title": "All CronJobs",
|
|
||||||
"type": "table",
|
|
||||||
"targets": [
|
|
||||||
{"expr": "kube_cronjob_info{cluster_environment=~\"$cluster_environment\"}", "format": "table", "instant": true, "refId": "A"}
|
|
||||||
],
|
|
||||||
"transformations": [
|
|
||||||
{"id": "filterFieldsByName", "options": {"include": {"names": ["cluster_environment", "cronjob", "namespace", "schedule"]}}},
|
|
||||||
{"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "cronjob": "CronJob", "namespace": "Namespace", "schedule": "Schedule"}}}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"collapsed": false,
|
|
||||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 15},
|
|
||||||
"title": "Job History",
|
|
||||||
"type": "row"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short"}},
|
|
||||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
|
|
||||||
"title": "Job Completions (24h)",
|
|
||||||
"type": "timeseries",
|
|
||||||
"targets": [{"expr": "sum(kube_job_status_succeeded{cluster_environment=~\"$cluster_environment\"}) by (job_name, cluster_environment)", "legendFormat": "{{job_name}} ({{cluster_environment}})"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short", "color": {"mode": "palette-classic"}}},
|
|
||||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
|
|
||||||
"title": "Job Failures (24h)",
|
|
||||||
"type": "timeseries",
|
|
||||||
"targets": [{"expr": "sum(kube_job_status_failed{cluster_environment=~\"$cluster_environment\"}) by (job_name, cluster_environment)", "legendFormat": "{{job_name}} ({{cluster_environment}})"}]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"schemaVersion": 39,
|
|
||||||
"tags": ["edp", "backup", "cronjob"],
|
|
||||||
"templating": {
|
|
||||||
"list": [
|
|
||||||
{
|
|
||||||
"current": {"selected": true, "text": "All", "value": "$__all"},
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"definition": "label_values(kube_cronjob_info, cluster_environment)",
|
|
||||||
"includeAll": true,
|
|
||||||
"multi": true,
|
|
||||||
"name": "cluster_environment",
|
|
||||||
"label": "Environment",
|
|
||||||
"query": "label_values(kube_cronjob_info, cluster_environment)",
|
|
||||||
"refresh": 2,
|
|
||||||
"sort": 1,
|
|
||||||
"type": "query"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"time": {"from": "now-24h", "to": "now"},
|
|
||||||
"title": "CronJob & Backup Monitoring",
|
|
||||||
"uid": "edp-cronjobs"
|
|
||||||
}
|
|
||||||
|
|
@ -1,207 +0,0 @@
|
||||||
apiVersion: grafana.integreatly.org/v1beta1
|
|
||||||
kind: GrafanaDashboard
|
|
||||||
metadata:
|
|
||||||
name: forgejo
|
|
||||||
spec:
|
|
||||||
instanceSelector:
|
|
||||||
matchLabels:
|
|
||||||
dashboards: "grafana"
|
|
||||||
folder: "EDP / Applications"
|
|
||||||
json: |
|
|
||||||
{
|
|
||||||
"annotations": {"list": []},
|
|
||||||
"editable": true,
|
|
||||||
"graphTooltip": 1,
|
|
||||||
"panels": [
|
|
||||||
{
|
|
||||||
"collapsed": false,
|
|
||||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
|
|
||||||
"title": "Forgejo Health",
|
|
||||||
"type": "row"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"mappings": [{"options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}, "type": "value"}], "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]}}},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 0, "y": 1},
|
|
||||||
"title": "Status",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "up{job=\"forgejo-server-http\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short"}},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 4, "y": 1},
|
|
||||||
"title": "Version",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "gitea_build_info{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{version}}"}],
|
|
||||||
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "name"}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short"}},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 8, "y": 1},
|
|
||||||
"title": "Repositories",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "gitea_repositories{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short"}},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 12, "y": 1},
|
|
||||||
"title": "Users",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "gitea_users{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short"}},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 16, "y": 1},
|
|
||||||
"title": "Organizations",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "gitea_organizations{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short"}},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 20, "y": 1},
|
|
||||||
"title": "Teams",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "gitea_teams{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"collapsed": false,
|
|
||||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 5},
|
|
||||||
"title": "Activity",
|
|
||||||
"type": "row"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short"}},
|
|
||||||
"gridPos": {"h": 4, "w": 6, "x": 0, "y": 6},
|
|
||||||
"title": "Open Issues",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "gitea_issues_open{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short"}},
|
|
||||||
"gridPos": {"h": 4, "w": 6, "x": 6, "y": 6},
|
|
||||||
"title": "Closed Issues",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "gitea_issues_closed{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short"}},
|
|
||||||
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 6},
|
|
||||||
"title": "Webhooks",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "gitea_webhooks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short"}},
|
|
||||||
"gridPos": {"h": 4, "w": 6, "x": 18, "y": 6},
|
|
||||||
"title": "Hook Tasks",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "gitea_hooktasks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"collapsed": false,
|
|
||||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 10},
|
|
||||||
"title": "Content & Auth",
|
|
||||||
"type": "row"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short"}},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 0, "y": 11},
|
|
||||||
"title": "Stars",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "gitea_stars{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short"}},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 4, "y": 11},
|
|
||||||
"title": "Watches",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "gitea_watches{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short"}},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 8, "y": 11},
|
|
||||||
"title": "Releases",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "gitea_releases{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short"}},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 12, "y": 11},
|
|
||||||
"title": "Mirrors",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "gitea_mirrors{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short"}},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 16, "y": 11},
|
|
||||||
"title": "Public Keys",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "gitea_publickeys{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short"}},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 20, "y": 11},
|
|
||||||
"title": "OAuth Apps",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "gitea_oauths{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"collapsed": false,
|
|
||||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 15},
|
|
||||||
"title": "Forgejo Logs",
|
|
||||||
"type": "row"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "victoriametrics-logs-datasource"},
|
|
||||||
"gridPos": {"h": 10, "w": 12, "x": 0, "y": 16},
|
|
||||||
"title": "Forgejo Server Logs",
|
|
||||||
"type": "logs",
|
|
||||||
"targets": [{"expr": "{cluster_environment=~\"$cluster_environment\", kubernetes.namespace=\"gitea\"}", "refId": "A"}],
|
|
||||||
"options": {"showTime": true, "showLabels": true, "wrapLogMessage": true, "enableLogDetails": true, "sortOrder": "Descending"}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "victoriametrics-logs-datasource"},
|
|
||||||
"gridPos": {"h": 10, "w": 12, "x": 12, "y": 16},
|
|
||||||
"title": "Forgejo Errors",
|
|
||||||
"type": "logs",
|
|
||||||
"targets": [{"expr": "{cluster_environment=~\"$cluster_environment\", kubernetes.namespace=\"gitea\"} error OR Error OR ERROR OR panic", "refId": "A"}],
|
|
||||||
"options": {"showTime": true, "showLabels": true, "wrapLogMessage": true, "enableLogDetails": true, "sortOrder": "Descending"}
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"schemaVersion": 39,
|
|
||||||
"tags": ["edp", "forgejo", "gitea"],
|
|
||||||
"templating": {
|
|
||||||
"list": [
|
|
||||||
{
|
|
||||||
"current": {"selected": true, "text": "All", "value": "$__all"},
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"definition": "label_values(gitea_repositories, cluster_environment)",
|
|
||||||
"includeAll": true,
|
|
||||||
"multi": true,
|
|
||||||
"name": "cluster_environment",
|
|
||||||
"label": "Environment",
|
|
||||||
"query": "label_values(gitea_repositories, cluster_environment)",
|
|
||||||
"refresh": 2,
|
|
||||||
"type": "query"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"time": {"from": "now-6h", "to": "now"},
|
|
||||||
"title": "Forgejo",
|
|
||||||
"uid": "edp-forgejo"
|
|
||||||
}
|
|
||||||
|
|
@ -1,117 +0,0 @@
|
||||||
apiVersion: grafana.integreatly.org/v1beta1
|
|
||||||
kind: GrafanaDashboard
|
|
||||||
metadata:
|
|
||||||
name: garm
|
|
||||||
spec:
|
|
||||||
instanceSelector:
|
|
||||||
matchLabels:
|
|
||||||
dashboards: "grafana"
|
|
||||||
folder: "EDP / Applications"
|
|
||||||
json: |
|
|
||||||
{
|
|
||||||
"annotations": {"list": []},
|
|
||||||
"editable": true,
|
|
||||||
"graphTooltip": 1,
|
|
||||||
"panels": [
|
|
||||||
{
|
|
||||||
"collapsed": false,
|
|
||||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
|
|
||||||
"title": "GARM Runner Status",
|
|
||||||
"type": "row"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}},
|
|
||||||
"gridPos": {"h": 5, "w": 6, "x": 0, "y": 1},
|
|
||||||
"title": "Total Runners",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\"}) or vector(0)", "legendFormat": ""}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}},
|
|
||||||
"gridPos": {"h": 5, "w": 6, "x": 6, "y": 1},
|
|
||||||
"title": "Idle Runners",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\", status=\"idle\"}) or vector(0)", "legendFormat": ""}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "yellow", "value": null}]}}},
|
|
||||||
"gridPos": {"h": 5, "w": 6, "x": 12, "y": 1},
|
|
||||||
"title": "Creating",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\", status=\"creating\"}) or vector(0)", "legendFormat": ""}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}]}}},
|
|
||||||
"gridPos": {"h": 5, "w": 6, "x": 18, "y": 1},
|
|
||||||
"title": "Errors",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "sum(rate(garm_runner_errors_total{cluster_environment=~\"$cluster_environment\"}[5m])) or vector(0)", "legendFormat": ""}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"collapsed": false,
|
|
||||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 6},
|
|
||||||
"title": "GitHub API Rate Limits",
|
|
||||||
"type": "row"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short", "min": 0}},
|
|
||||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 7},
|
|
||||||
"title": "Rate Limit Remaining",
|
|
||||||
"type": "timeseries",
|
|
||||||
"targets": [{"expr": "garm_github_rate_limit_remaining{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "ops"}},
|
|
||||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 7},
|
|
||||||
"title": "Runner Operations Rate",
|
|
||||||
"type": "timeseries",
|
|
||||||
"targets": [{"expr": "sum(rate(garm_runner_operations_total{cluster_environment=~\"$cluster_environment\"}[5m])) by (cluster_environment)", "legendFormat": "{{cluster_environment}}"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"collapsed": false,
|
|
||||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 15},
|
|
||||||
"title": "Runner Details",
|
|
||||||
"type": "row"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"custom": {"filterable": true}}},
|
|
||||||
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 16},
|
|
||||||
"title": "Runner Pool Status",
|
|
||||||
"type": "table",
|
|
||||||
"targets": [{"expr": "garm_runner_status{cluster_environment=~\"$cluster_environment\"}", "format": "table", "instant": true}],
|
|
||||||
"transformations": [
|
|
||||||
{"id": "filterFieldsByName", "options": {"include": {"names": ["cluster_environment", "name", "status", "pool_owner", "pool_type", "provider"]}}},
|
|
||||||
{"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "name": "Runner", "status": "Status", "pool_owner": "Pool Owner", "pool_type": "Type", "provider": "Provider"}}}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"schemaVersion": 39,
|
|
||||||
"tags": ["edp", "garm", "ci-cd", "runners"],
|
|
||||||
"templating": {
|
|
||||||
"list": [
|
|
||||||
{
|
|
||||||
"current": {"selected": true, "text": "All", "value": "$__all"},
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"definition": "label_values(garm_runner_status, cluster_environment)",
|
|
||||||
"includeAll": true,
|
|
||||||
"multi": true,
|
|
||||||
"name": "cluster_environment",
|
|
||||||
"label": "Environment",
|
|
||||||
"query": "label_values(garm_runner_status, cluster_environment)",
|
|
||||||
"refresh": 2,
|
|
||||||
"sort": 1,
|
|
||||||
"type": "query"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"time": {"from": "now-6h", "to": "now"},
|
|
||||||
"title": "GARM Runners",
|
|
||||||
"uid": "edp-garm"
|
|
||||||
}
|
|
||||||
|
|
@ -6,5 +6,4 @@ spec:
|
||||||
instanceSelector:
|
instanceSelector:
|
||||||
matchLabels:
|
matchLabels:
|
||||||
dashboards: "grafana"
|
dashboards: "grafana"
|
||||||
folder: "EDP / Operations"
|
|
||||||
url: "https://raw.githubusercontent.com/adinhodovic/ingress-nginx-mixin/refs/heads/main/dashboards_out/ingress-nginx-overview.json"
|
url: "https://raw.githubusercontent.com/adinhodovic/ingress-nginx-mixin/refs/heads/main/dashboards_out/ingress-nginx-overview.json"
|
||||||
|
|
|
||||||
|
|
@ -1,245 +0,0 @@
|
||||||
apiVersion: grafana.integreatly.org/v1beta1
|
|
||||||
kind: GrafanaDashboard
|
|
||||||
metadata:
|
|
||||||
name: platform-overview
|
|
||||||
spec:
|
|
||||||
instanceSelector:
|
|
||||||
matchLabels:
|
|
||||||
dashboards: "grafana"
|
|
||||||
folder: "EDP / Overview"
|
|
||||||
json: |
|
|
||||||
{
|
|
||||||
"annotations": {"list": []},
|
|
||||||
"editable": true,
|
|
||||||
"fiscalYearStartMonth": 0,
|
|
||||||
"graphTooltip": 1,
|
|
||||||
"links": [],
|
|
||||||
"panels": [
|
|
||||||
{
|
|
||||||
"collapsed": false,
|
|
||||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
|
|
||||||
"title": "Platform Health",
|
|
||||||
"type": "row"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"mappings": [{"options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}, "type": "value"}],
|
|
||||||
"thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 0, "y": 1},
|
|
||||||
"title": "Forgejo",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "sum(up{job=\"forgejo-server-http\", cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 3}]}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 4, "y": 1},
|
|
||||||
"title": "Ingress 5xx (5m)",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "sum(rate(nginx_ingress_controller_requests{status=~\"5..\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": ""}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"unit": "short",
|
|
||||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 8, "y": 1},
|
|
||||||
"title": "Failed Jobs (24h)",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "sum(kube_job_status_failed{namespace=\"gitea\", cluster_environment=~\"$cluster_environment\"}) or vector(0)", "legendFormat": ""}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"unit": "percentunit",
|
|
||||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.7}, {"color": "red", "value": 0.85}]}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 12, "y": 1},
|
|
||||||
"title": "Cluster CPU Usage",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": ""}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"unit": "percentunit",
|
|
||||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.7}, {"color": "red", "value": 0.85}]}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 16, "y": 1},
|
|
||||||
"title": "Cluster Memory Usage",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "1 - sum(node_memory_MemAvailable_bytes{cluster_environment=~\"$cluster_environment\"}) / sum(node_memory_MemTotal_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"unit": "percentunit",
|
|
||||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.6}, {"color": "red", "value": 0.8}]}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 20, "y": 1},
|
|
||||||
"title": "Max PVC Usage",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "max(1 - kubelet_volume_stats_available_bytes{cluster_environment=~\"$cluster_environment\"} / kubelet_volume_stats_capacity_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"collapsed": false,
|
|
||||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 5},
|
|
||||||
"title": "Forgejo",
|
|
||||||
"type": "row"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short"}},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 0, "y": 6},
|
|
||||||
"title": "Repositories",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "gitea_repositories{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short"}},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 4, "y": 6},
|
|
||||||
"title": "Users",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "gitea_users{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short"}},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 8, "y": 6},
|
|
||||||
"title": "Organizations",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "gitea_organizations{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short"}},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 12, "y": 6},
|
|
||||||
"title": "Open Issues",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "gitea_issues_open{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short"}},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 16, "y": 6},
|
|
||||||
"title": "Webhooks",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "gitea_webhooks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short"}},
|
|
||||||
"gridPos": {"h": 4, "w": 4, "x": 20, "y": 6},
|
|
||||||
"title": "Mirrors",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "gitea_mirrors{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"collapsed": false,
|
|
||||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 10},
|
|
||||||
"title": "Resources",
|
|
||||||
"type": "row"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}},
|
|
||||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 11},
|
|
||||||
"title": "Node CPU Usage",
|
|
||||||
"type": "timeseries",
|
|
||||||
"targets": [{"expr": "1 - rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m])", "legendFormat": "{{instance}}"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}},
|
|
||||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 11},
|
|
||||||
"title": "PVC Usage by Claim",
|
|
||||||
"type": "timeseries",
|
|
||||||
"targets": [{"expr": "1 - (kubelet_volume_stats_available_bytes{cluster_environment=~\"$cluster_environment\"} / kubelet_volume_stats_capacity_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": "{{namespace}}/{{persistentvolumeclaim}}"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"collapsed": false,
|
|
||||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 19},
|
|
||||||
"title": "Backups",
|
|
||||||
"type": "row"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}}},
|
|
||||||
"gridPos": {"h": 4, "w": 8, "x": 0, "y": 20},
|
|
||||||
"title": "Time Since Last Backup Schedule",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "time() - kube_cronjob_status_last_schedule_time{cronjob=~\"forgejo-s3-backup|secrets-backup\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cronjob}} ({{cluster_environment}})"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "s"}},
|
|
||||||
"gridPos": {"h": 4, "w": 8, "x": 8, "y": 20},
|
|
||||||
"title": "Backup Job Duration (Last 7d)",
|
|
||||||
"type": "timeseries",
|
|
||||||
"targets": [{"expr": "kube_job_status_completion_time{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"} - kube_job_status_start_time{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{job_name}}"}],
|
|
||||||
"options": {"legend": {"displayMode": "table"}}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}}},
|
|
||||||
"gridPos": {"h": 4, "w": 8, "x": 16, "y": 20},
|
|
||||||
"title": "Failed Backup Jobs (Active)",
|
|
||||||
"type": "stat",
|
|
||||||
"targets": [{"expr": "sum by(cluster_environment, job_name) (kube_job_status_failed{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"})", "legendFormat": "{{job_name}} ({{cluster_environment}})"}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"collapsed": false,
|
|
||||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 24},
|
|
||||||
"title": "Logs",
|
|
||||||
"type": "row"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "victoriametrics-logs-datasource"},
|
|
||||||
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 25},
|
|
||||||
"title": "Recent Errors (all namespaces)",
|
|
||||||
"type": "logs",
|
|
||||||
"targets": [{"expr": "{cluster_environment=~\"$cluster_environment\"} error OR Error OR ERROR OR panic OR PANIC", "refId": "A"}],
|
|
||||||
"options": {"showTime": true, "showLabels": true, "showCommonLabels": false, "wrapLogMessage": true, "prettifyLogMessage": false, "enableLogDetails": true, "sortOrder": "Descending", "dedupStrategy": "none"}
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"schemaVersion": 39,
|
|
||||||
"tags": ["edp", "platform", "overview"],
|
|
||||||
"templating": {
|
|
||||||
"list": [
|
|
||||||
{
|
|
||||||
"current": {"selected": true, "text": "All", "value": "$__all"},
|
|
||||||
"datasource": {"type": "prometheus"},
|
|
||||||
"definition": "label_values(up, cluster_environment)",
|
|
||||||
"includeAll": true,
|
|
||||||
"multi": true,
|
|
||||||
"name": "cluster_environment",
|
|
||||||
"label": "Environment",
|
|
||||||
"query": "label_values(up, cluster_environment)",
|
|
||||||
"refresh": 2,
|
|
||||||
"sort": 1,
|
|
||||||
"type": "query"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"time": {"from": "now-6h", "to": "now"},
|
|
||||||
"title": "EDP Platform Overview",
|
|
||||||
"uid": "edp-platform-overview"
|
|
||||||
}
|
|
||||||
|
|
@ -6,7 +6,4 @@ spec:
|
||||||
instanceSelector:
|
instanceSelector:
|
||||||
matchLabels:
|
matchLabels:
|
||||||
dashboards: "grafana"
|
dashboards: "grafana"
|
||||||
folder: "EDP / Operations"
|
url: "https://raw.githubusercontent.com/VictoriaMetrics/VictoriaMetrics/refs/heads/master/dashboards/vm/victorialogs.json"
|
||||||
grafanaCom:
|
|
||||||
id: 22698
|
|
||||||
revision: 1
|
|
||||||
|
|
|
||||||
|
|
@ -1,119 +1,40 @@
|
||||||
apiVersion: operator.victoriametrics.com/v1beta1
|
apiVersion: operator.victoriametrics.com/v1beta1
|
||||||
kind: VMRule
|
kind: VMRule
|
||||||
metadata:
|
metadata:
|
||||||
name: edp-platform-alerts
|
name: forgejo-alerts
|
||||||
namespace: observability
|
namespace: observability
|
||||||
spec:
|
spec:
|
||||||
groups:
|
groups:
|
||||||
- name: platform-health
|
- name: forgejo
|
||||||
rules:
|
rules:
|
||||||
- alert: ForgejoDown
|
- alert: forgejo down
|
||||||
expr: sum by(cluster_environment) (up{job="forgejo-server-http"}) < 1
|
expr: sum by(cluster_environment) (up{pod=~"forgejo-server-.*"}) < 1
|
||||||
for: 1m
|
for: 30s
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
|
job: "{{ $labels.job }}"
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Forgejo is down on {{ $labels.cluster_environment }}"
|
value: "{{ $value }}"
|
||||||
description: "Forgejo server has been unreachable for more than 1 minute in cluster {{ $labels.cluster_environment }}."
|
description: 'forgejo is down in cluster environment {{ $labels.cluster_environment }}'
|
||||||
|
- name: forgejo-backup
|
||||||
- alert: IngressHighErrorRate
|
|
||||||
expr: |
|
|
||||||
sum by(cluster_environment) (rate(nginx_ingress_controller_requests{status=~"5.."}[5m]))
|
|
||||||
/ sum by(cluster_environment) (rate(nginx_ingress_controller_requests[5m])) > 0.05
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: major
|
|
||||||
annotations:
|
|
||||||
summary: "High ingress 5xx rate on {{ $labels.cluster_environment }}"
|
|
||||||
description: "More than 5% of ingress requests are returning 5xx errors for over 5 minutes."
|
|
||||||
value: "{{ $value | humanizePercentage }}"
|
|
||||||
|
|
||||||
- alert: NodeNotReady
|
|
||||||
expr: kube_node_status_condition{condition="Ready", status="true"} == 0
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
annotations:
|
|
||||||
summary: "Node {{ $labels.node }} not ready on {{ $labels.cluster_environment }}"
|
|
||||||
description: "Node {{ $labels.node }} has been in NotReady state for more than 5 minutes."
|
|
||||||
|
|
||||||
- alert: PodCrashLooping
|
|
||||||
expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 15 > 3
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: major
|
|
||||||
annotations:
|
|
||||||
summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} crash-looping on {{ $labels.cluster_environment }}"
|
|
||||||
description: "Pod has restarted more than 3 times in the last 15 minutes."
|
|
||||||
|
|
||||||
- name: storage
|
|
||||||
rules:
|
rules:
|
||||||
- alert: PVCUsageHigh
|
- alert: forgejo s3 backup job failed
|
||||||
expr: |
|
expr: max by(cluster_environment) (kube_job_status_failed{job_name=~"forgejo-s3-backup-.*"}) != 0
|
||||||
1 - (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.80
|
for: 30s
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: major
|
|
||||||
annotations:
|
|
||||||
summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} usage >80%"
|
|
||||||
description: "PVC usage is at {{ $value | humanizePercentage }} on {{ $labels.cluster_environment }}."
|
|
||||||
value: "{{ $value | humanizePercentage }}"
|
|
||||||
|
|
||||||
- alert: PVCUsageCritical
|
|
||||||
expr: |
|
|
||||||
1 - (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.90
|
|
||||||
for: 5m
|
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
|
job: "{{ $labels.job }}"
|
||||||
annotations:
|
annotations:
|
||||||
summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} usage >90%"
|
value: "{{ $value }}"
|
||||||
description: "PVC is almost full at {{ $value | humanizePercentage }} on {{ $labels.cluster_environment }}. Immediate action required."
|
description: 'forgejo s3 backup job failed in cluster environment {{ $labels.cluster_environment }}'
|
||||||
value: "{{ $value | humanizePercentage }}"
|
- name: disk-consumption-high
|
||||||
|
|
||||||
- name: resources
|
|
||||||
rules:
|
rules:
|
||||||
- alert: NodeCPUHigh
|
- alert: disk consumption high
|
||||||
expr: |
|
expr: 1-(kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.6
|
||||||
1 - avg by(instance, cluster_environment) (rate(node_cpu_seconds_total{mode="idle"}[5m])) > 0.85
|
for: 30s
|
||||||
for: 15m
|
|
||||||
labels:
|
labels:
|
||||||
severity: major
|
severity: major
|
||||||
|
job: "{{ $labels.job }}"
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Node {{ $labels.instance }} CPU >85% on {{ $labels.cluster_environment }}"
|
value: "{{ $value }}"
|
||||||
description: "Node CPU utilization has been above 85% for 15 minutes."
|
description: 'disk consumption of pvc {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is high in cluster environment {{ $labels.cluster_environment }}'
|
||||||
value: "{{ $value | humanizePercentage }}"
|
|
||||||
|
|
||||||
- alert: NodeMemoryHigh
|
|
||||||
expr: |
|
|
||||||
1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.90
|
|
||||||
for: 10m
|
|
||||||
labels:
|
|
||||||
severity: major
|
|
||||||
annotations:
|
|
||||||
summary: "Node memory >90% on {{ $labels.cluster_environment }}"
|
|
||||||
description: "Node memory utilization above 90% for 10 minutes."
|
|
||||||
value: "{{ $value | humanizePercentage }}"
|
|
||||||
|
|
||||||
- name: cluster-health
|
|
||||||
rules:
|
|
||||||
- alert: ClusterMetricsSilent
|
|
||||||
expr: |
|
|
||||||
count(up{job="kubelet"}) by (cluster_environment) < 1
|
|
||||||
or
|
|
||||||
absent(up{job="kubelet", cluster_environment="dev"})
|
|
||||||
for: 10m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
annotations:
|
|
||||||
summary: "Cluster {{ $labels.cluster_environment }} stopped sending metrics"
|
|
||||||
description: "No kubelet metrics received from cluster {{ $labels.cluster_environment }} for over 10 minutes. Either vmagent is dead or the cluster is unreachable."
|
|
||||||
|
|
||||||
- alert: ClusterAPIServerDown
|
|
||||||
expr: |
|
|
||||||
up{job="apiserver", cluster_environment=~".+"} == 0
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
annotations:
|
|
||||||
summary: "API server down on {{ $labels.cluster_environment }}"
|
|
||||||
description: "Kubernetes API server scrape is failing on cluster {{ $labels.cluster_environment }}."
|
|
||||||
|
|
|
||||||
|
|
@ -1,78 +0,0 @@
|
||||||
apiVersion: operator.victoriametrics.com/v1beta1
|
|
||||||
kind: VMRule
|
|
||||||
metadata:
|
|
||||||
name: backup-alerts
|
|
||||||
namespace: observability
|
|
||||||
spec:
|
|
||||||
groups:
|
|
||||||
- name: backup-schedule-staleness
|
|
||||||
rules:
|
|
||||||
- alert: BackupCronJobNotScheduled
|
|
||||||
expr: |
|
|
||||||
time() - kube_cronjob_status_last_schedule_time{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"}
|
|
||||||
> 26 * 3600
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
cronjob: "{{ $labels.cronjob }}"
|
|
||||||
annotations:
|
|
||||||
value: "{{ $value | humanizeDuration }}"
|
|
||||||
description: >-
|
|
||||||
CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been
|
|
||||||
scheduled for over 26 hours in cluster {{ $labels.cluster_environment }}.
|
|
||||||
Last schedule was {{ $value | humanizeDuration }} ago.
|
|
||||||
summary: "Backup CronJob {{ $labels.cronjob }} is stale"
|
|
||||||
|
|
||||||
- alert: BackupCronJobNeverScheduled
|
|
||||||
expr: |
|
|
||||||
kube_cronjob_status_last_schedule_time{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"}
|
|
||||||
== 0
|
|
||||||
for: 30m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
cronjob: "{{ $labels.cronjob }}"
|
|
||||||
annotations:
|
|
||||||
description: >-
|
|
||||||
CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has never been
|
|
||||||
scheduled in cluster {{ $labels.cluster_environment }}.
|
|
||||||
summary: "Backup CronJob {{ $labels.cronjob }} never ran"
|
|
||||||
|
|
||||||
- name: backup-job-failures
|
|
||||||
rules:
|
|
||||||
- alert: BackupJobFailed
|
|
||||||
expr: |
|
|
||||||
max by(cluster_environment, namespace, job_name) (
|
|
||||||
kube_job_status_failed{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"}
|
|
||||||
) > 0
|
|
||||||
for: 30s
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
job_name: "{{ $labels.job_name }}"
|
|
||||||
annotations:
|
|
||||||
value: "{{ $value }}"
|
|
||||||
description: >-
|
|
||||||
Backup job {{ $labels.namespace }}/{{ $labels.job_name }} has
|
|
||||||
{{ $value }} failed pod(s) in cluster {{ $labels.cluster_environment }}.
|
|
||||||
summary: "Backup job {{ $labels.job_name }} failed"
|
|
||||||
|
|
||||||
- name: backup-job-duration
|
|
||||||
rules:
|
|
||||||
- alert: BackupJobTooSlow
|
|
||||||
expr: |
|
|
||||||
(
|
|
||||||
time() - kube_job_status_start_time{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"}
|
|
||||||
) > 300
|
|
||||||
and
|
|
||||||
kube_job_status_active{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"} > 0
|
|
||||||
for: 1m
|
|
||||||
labels:
|
|
||||||
severity: major
|
|
||||||
job_name: "{{ $labels.job_name }}"
|
|
||||||
annotations:
|
|
||||||
value: "{{ $value | humanizeDuration }}"
|
|
||||||
description: >-
|
|
||||||
Backup job {{ $labels.namespace }}/{{ $labels.job_name }} has been
|
|
||||||
running for {{ $value | humanizeDuration }} (threshold: 5m)
|
|
||||||
in cluster {{ $labels.cluster_environment }}. This may indicate a
|
|
||||||
hung process or connectivity issue.
|
|
||||||
summary: "Backup job {{ $labels.job_name }} running too long"
|
|
||||||
|
|
@ -10,4 +10,4 @@ spec:
|
||||||
matchLabels:
|
matchLabels:
|
||||||
app.kubernetes.io/name: garm
|
app.kubernetes.io/name: garm
|
||||||
endpoints:
|
endpoints:
|
||||||
- port: http
|
- port: metrics
|
||||||
|
|
|
||||||
|
|
@ -1,9 +0,0 @@
|
||||||
apiVersion: v1
|
|
||||||
kind: Secret
|
|
||||||
metadata:
|
|
||||||
name: simple-user-secret
|
|
||||||
namespace: observability
|
|
||||||
type: Opaque
|
|
||||||
data:
|
|
||||||
username: c2ltcGxlLXVzZXI=
|
|
||||||
password: c3g1Z0M3b29XYVdPT0R3RA==
|
|
||||||
|
|
@ -5,17 +5,13 @@ metadata:
|
||||||
namespace: observability
|
namespace: observability
|
||||||
spec:
|
spec:
|
||||||
username: simple-user
|
username: simple-user
|
||||||
password: sx5gC7ooWaWOODwD
|
passwordRef:
|
||||||
|
key: password
|
||||||
|
name: simple-user-secret
|
||||||
targetRefs:
|
targetRefs:
|
||||||
- static:
|
- static:
|
||||||
url: http://vmsingle-o12y:8429
|
url: http://vmsingle-o12y:8429
|
||||||
paths: ["/api/v1/write"]
|
paths: ["/api/v1/write"]
|
||||||
- static:
|
|
||||||
url: http://vmsingle-o12y:8429
|
|
||||||
paths: ["/api/v1/.*"]
|
|
||||||
- static:
|
- static:
|
||||||
url: http://vlogs-victorialogs:9428
|
url: http://vlogs-victorialogs:9428
|
||||||
paths: ["/insert/elasticsearch/.*"]
|
paths: ["/insert/elasticsearch/.*"]
|
||||||
- static:
|
|
||||||
url: http://vlogs-victorialogs:9428
|
|
||||||
paths: ["/select/.*"]
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
global:
|
global:
|
||||||
# -- Cluster label to use for dashboards and rules
|
# -- Cluster label to use for dashboards and rules
|
||||||
clusterLabel: cluster_environment
|
clusterLabel: cluster
|
||||||
# -- Global license configuration
|
# -- Global license configuration
|
||||||
license:
|
license:
|
||||||
key: ""
|
key: ""
|
||||||
|
|
@ -201,13 +201,13 @@ defaultRules:
|
||||||
enabled: true
|
enabled: true
|
||||||
rules: {}
|
rules: {}
|
||||||
kubernetesSystemControllerManager:
|
kubernetesSystemControllerManager:
|
||||||
create: false
|
enabled: false
|
||||||
rules: {}
|
rules: {}
|
||||||
kubeScheduler:
|
kubeScheduler:
|
||||||
create: false
|
enabled: false
|
||||||
rules: {}
|
rules: {}
|
||||||
kubernetesSystemScheduler:
|
kubernetesSystemScheduler:
|
||||||
create: false
|
enabled: false
|
||||||
rules: {}
|
rules: {}
|
||||||
kubeStateMetrics:
|
kubeStateMetrics:
|
||||||
enabled: true
|
enabled: true
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue