feat(observability): add sustainability metrics, Kepler, 6-month retention, GARM scrape

This commit is contained in:
Martin McCaffery 2026-06-02 15:51:26 +01:00
parent bbdca11f00
commit b5594a8017
No known key found for this signature in database
GPG key ID: 7C4D0F375BCEE533
6 changed files with 141 additions and 1 deletions

View file

@ -0,0 +1,29 @@
---
# Argo CD Application that installs the Kepler Helm chart into the
# `observability` namespace of the in-cluster destination.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: kepler
  namespace: argocd
  labels:
    env: dev
spec:
  project: default
  destination:
    name: in-cluster
    namespace: observability
  syncPolicy:
    automated:
      # Automatically re-sync when the live state drifts from the desired state.
      selfHeal: true
    syncOptions:
      # Let Argo CD create the destination namespace if it does not exist.
      - CreateNamespace=true
    retry:
      # A negative limit means failed syncs are retried without an upper bound.
      limit: -1
  sources:
    # Source 1: the Kepler chart itself, pinned to a fixed chart version.
    - chart: kepler
      repoURL: https://sustainable-computing-io.github.io/kepler-helm-chart
      targetRevision: "0.6.1"
      helm:
        valueFiles:
          # `$values` resolves to the Git source declared with `ref: values`.
          - $values/otc/observability.buildth.ing/stacks/observability-client/kepler/values.yaml
    # Source 2: Git repository that only provides the values file.
    # NOTE(review): tracks HEAD, so value changes roll out without a pin.
    - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances
      targetRevision: HEAD
      ref: values

View file

@ -0,0 +1,10 @@
---
# Helm values overriding the Kepler chart defaults.

# Do not mount the host's /usr/src into the Kepler pods.
# NOTE(review): meaning inferred from the key name - confirm against the chart.
canMount:
  usrSrc: false

# The chart's own ServiceMonitor stays disabled.
# NOTE(review): presumably because scraping is done by the VictoriaMetrics
# `kepler` VMServiceScrape instead - confirm.
serviceMonitor:
  enabled: false

# Extra environment variables for the Kepler container. Values are quoted
# so they stay strings rather than YAML booleans or integers.
extraEnvVars:
  ENABLE_GPU: 'false'
  ENABLE_EBPF_CGROUPID: 'true'
  KEPLER_LOG_LEVEL: '1'

View file

@ -0,0 +1,75 @@
---
# Recording rules that pre-aggregate resource and energy usage of the CI
# workloads (Forgejo runners and GARM) alongside cluster-wide totals, so the
# CI share of the cluster footprint can be derived from the recorded series.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: ci-sustainability
spec:
  groups:
    # Trailing one-day aggregates for CI pods, evaluated every 5 minutes.
    - name: ci.sustainability.daily
      interval: 5m
      rules:
        # CPU seconds consumed by CI containers over the last day.
        # `container!=""` drops the pod-level cgroup series so usage is not
        # counted twice.
        - record: ci:cpu_seconds:increase1d
          expr: |
            sum by(namespace, cluster) (
              increase(container_cpu_usage_seconds_total{
                namespace=~"gitea|garm",
                pod=~"forgejo-runner.*|garm-.*",
                container!=""
              }[1d])
            )
        # One-day average (5m subquery resolution) of the summed working-set
        # memory of CI containers.
        # NOTE(review): the record name says `bytes_seconds`, but the value is
        # an average in bytes, not a bytes*seconds integral.
        - record: ci:memory_bytes_seconds:avg1d
          expr: |
            avg_over_time(
              sum by(namespace, cluster) (
                container_memory_working_set_bytes{
                  namespace=~"gitea|garm",
                  pod=~"forgejo-runner.*|garm-.*",
                  container!=""
                }
              )[1d:5m]
            )
        # One-day average (5m subquery resolution) of the number of CI pods.
        - record: ci:pod_count:avg1d
          expr: |
            avg_over_time(
              count by(namespace, cluster) (
                kube_pod_info{
                  namespace=~"gitea|garm",
                  pod=~"forgejo-runner.*|garm-.*"
                }
              )[1d:5m]
            )
        # Number of CI pods started during the last day.
        # NOTE(review): a pod's start time is constant, so this presumably
        # relies on MetricsQL `changes()` counting the first sample of a new
        # series as a change; with Prometheus semantics it would yield 0.
        # Confirm against the evaluating datasource.
        - record: ci:pod_creations:increase1d
          expr: |
            sum by(namespace, cluster) (
              changes(kube_pod_start_time{
                namespace=~"gitea|garm",
                pod=~"forgejo-runner.*|garm-.*"
              }[1d])
            )
    # Cluster-wide baselines used as the denominator for the CI share.
    - name: ci.sustainability.cluster
      interval: 5m
      rules:
        # Non-idle CPU time per second, summed over all nodes and cores.
        - record: cluster:cpu_seconds:rate5m
          expr: |
            sum by(cluster) (
              rate(node_cpu_seconds_total{mode!="idle"}[5m])
            )
        # Memory in use per cluster: total minus available bytes.
        - record: cluster:memory_used_bytes:sum
          expr: |
            sum by(cluster) (
              node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes
            )
    # Energy figures derived from Kepler metrics.
    - name: ci.sustainability.energy
      interval: 5m
      rules:
        # Joules attributed to CI containers over the last day.
        # Kepler labels workloads with `container_namespace` / `pod_name`
        # rather than `namespace` / `pod`. The `pod_name` filter mirrors the
        # pod filter of the other `ci:` rules so that non-runner pods in the
        # same namespaces are not counted as CI energy.
        - record: ci:joules:increase1d
          expr: |
            sum by(container_namespace, cluster) (
              increase(kepler_container_joules_total{
                container_namespace=~"gitea|garm",
                pod_name=~"forgejo-runner.*|garm-.*"
              }[1d])
            )
        # Total node power draw per cluster (joules per second, i.e. watts).
        - record: cluster:joules:rate5m
          expr: |
            sum by(cluster) (
              rate(kepler_node_joules_total[5m])
            )

View file

@ -0,0 +1,13 @@
---
# Scrape configuration for GARM: selects Services labelled
# `app.kubernetes.io/name: garm` in the `garm` namespace and scrapes the
# port named `metrics` on each of them.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
  name: garm
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: garm
  namespaceSelector:
    matchNames:
      - garm
  endpoints:
    # NOTE(review): assumes the GARM Service exposes a port named `metrics`.
    - port: metrics

View file

@ -0,0 +1,13 @@
---
# Scrape configuration for Kepler: selects Services labelled
# `app.kubernetes.io/name: kepler` in the `observability` namespace and
# scrapes the port named `http` on each of them.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
  name: kepler
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: kepler
  namespaceSelector:
    matchNames:
      - observability
  endpoints:
    # NOTE(review): assumes the Kepler Service exposes a port named `http`.
    - port: http

View file

@ -283,7 +283,7 @@ vmsingle:
spec:
port: "8429"
# -- Data retention period. Possible units character: h(ours), d(ays), w(eeks), y(ears), if no unit character specified - month. The minimum retention period is 24h. See these [docs](https://docs.victoriametrics.com/single-server-victoriametrics/#retention)
retentionPeriod: "1"
retentionPeriod: "6"
replicaCount: 1
extraArgs: {}
storageMetadata: