From b5594a8017952e180485fb3a8623a09c7b587ac7 Mon Sep 17 00:00:00 2001
From: Martin McCaffery
Date: Tue, 2 Jun 2026 15:51:26 +0100
Subject: [PATCH] feat(observability): add sustainability metrics, Kepler,
 6-month retention, GARM scrape

---
Review note: hunk sizes and the diffstat reflect the reviewed manifests; the
abbreviated blob ids on the "index" lines of new files were not regenerated.

 .../stacks/observability-client/kepler.yaml   | 31 +++++++
 .../observability-client/kepler/values.yaml   | 12 +++
 .../manifests/ci-sustainability-rules.yaml    | 85 +++++++++++++++++++
 .../manifests/garm-scrape.yaml                | 14 +++
 .../manifests/kepler-scrape.yaml              | 14 +++
 .../victoria-k8s-stack/values.yaml            |  2 +-
 6 files changed, 157 insertions(+), 1 deletion(-)
 create mode 100644 otc/observability.buildth.ing/stacks/observability-client/kepler.yaml
 create mode 100644 otc/observability.buildth.ing/stacks/observability-client/kepler/values.yaml
 create mode 100644 otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/ci-sustainability-rules.yaml
 create mode 100644 otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml
 create mode 100644 otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/kepler-scrape.yaml

diff --git a/otc/observability.buildth.ing/stacks/observability-client/kepler.yaml b/otc/observability.buildth.ing/stacks/observability-client/kepler.yaml
new file mode 100644
index 0000000..288718e
--- /dev/null
+++ b/otc/observability.buildth.ing/stacks/observability-client/kepler.yaml
@@ -0,0 +1,31 @@
+# Argo CD Application installing the Kepler Helm chart into the
+# `observability` namespace; values come from the stacks-instances repo.
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: kepler
+  namespace: argocd
+  labels:
+    env: dev
+spec:
+  project: default
+  syncPolicy:
+    automated:
+      selfHeal: true
+    syncOptions:
+      - CreateNamespace=true
+    retry:
+      limit: -1
+  destination:
+    name: in-cluster
+    namespace: observability
+  sources:
+    - chart: kepler
+      repoURL: https://sustainable-computing-io.github.io/kepler-helm-chart
+      targetRevision: 0.6.1
+      helm:
+        valueFiles:
+          - $values/otc/observability.buildth.ing/stacks/observability-client/kepler/values.yaml
+    - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances
+      targetRevision: HEAD
+      ref: values
diff --git a/otc/observability.buildth.ing/stacks/observability-client/kepler/values.yaml b/otc/observability.buildth.ing/stacks/observability-client/kepler/values.yaml
new file mode 100644
index 0000000..90fa6e4
--- /dev/null
+++ b/otc/observability.buildth.ing/stacks/observability-client/kepler/values.yaml
@@ -0,0 +1,12 @@
+canMount:
+  usrSrc: false
+
+# Scraping is handled by the VMServiceScrape `kepler` added in this patch,
+# so the chart's own ServiceMonitor stays disabled.
+serviceMonitor:
+  enabled: false
+
+extraEnvVars:
+  ENABLE_GPU: "false"
+  ENABLE_EBPF_CGROUPID: "true"
+  KEPLER_LOG_LEVEL: "1"
diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/ci-sustainability-rules.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/ci-sustainability-rules.yaml
new file mode 100644
index 0000000..0108b14
--- /dev/null
+++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/ci-sustainability-rules.yaml
@@ -0,0 +1,85 @@
+# Recording rules that pre-aggregate CI runner usage (gitea/garm namespaces),
+# cluster-wide node usage and Kepler energy counters.
+apiVersion: operator.victoriametrics.com/v1beta1
+kind: VMRule
+metadata:
+  name: ci-sustainability
+spec:
+  groups:
+    - name: ci.sustainability.daily
+      interval: 5m
+      rules:
+        - record: ci:cpu_seconds:increase1d
+          expr: |
+            sum by(namespace, cluster) (
+              increase(container_cpu_usage_seconds_total{
+                namespace=~"gitea|garm",
+                pod=~"forgejo-runner.*|garm-.*",
+                container!=""
+              }[1d])
+            )
+        # NOTE: despite "bytes_seconds" in the name, the value is the 1d average
+        # of the summed working-set bytes, not a byte-seconds integral.
+        - record: ci:memory_bytes_seconds:avg1d
+          expr: |
+            avg_over_time(
+              sum by(namespace, cluster) (
+                container_memory_working_set_bytes{
+                  namespace=~"gitea|garm",
+                  pod=~"forgejo-runner.*|garm-.*",
+                  container!=""
+                }
+              )[1d:5m]
+            )
+        - record: ci:pod_count:avg1d
+          expr: |
+            avg_over_time(
+              count by(namespace, cluster) (
+                kube_pod_info{
+                  namespace=~"gitea|garm",
+                  pod=~"forgejo-runner.*|garm-.*"
+                }
+              )[1d:5m]
+            )
+        # NOTE(review): a pod's start time normally never changes, so this
+        # presumably relies on changes() counting the first sample of a newly
+        # appeared series - confirm against the VictoriaMetrics version in use.
+        - record: ci:pod_creations:increase1d
+          expr: |
+            sum by(namespace, cluster) (
+              changes(kube_pod_start_time{
+                namespace=~"gitea|garm",
+                pod=~"forgejo-runner.*|garm-.*"
+              }[1d])
+            )
+    - name: ci.sustainability.cluster
+      interval: 5m
+      rules:
+        - record: cluster:cpu_seconds:rate5m
+          expr: |
+            sum by(cluster) (
+              rate(node_cpu_seconds_total{mode!="idle"}[5m])
+            )
+        - record: cluster:memory_used_bytes:sum
+          expr: |
+            sum by(cluster) (
+              node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes
+            )
+    - name: ci.sustainability.energy
+      interval: 5m
+      rules:
+        # Limited to the same runner pods as the ci.sustainability.daily rules so
+        # energy and CPU/memory figures describe the same workload.
+        - record: ci:joules:increase1d
+          expr: |
+            sum by(container_namespace, cluster) (
+              increase(kepler_container_joules_total{
+                container_namespace=~"gitea|garm",
+                pod_name=~"forgejo-runner.*|garm-.*"
+              }[1d])
+            )
+        - record: cluster:joules:rate5m
+          expr: |
+            sum by(cluster) (
+              rate(kepler_node_joules_total[5m])
+            )
diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml
new file mode 100644
index 0000000..a4c6119
--- /dev/null
+++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml
@@ -0,0 +1,14 @@
+# Scrape the "metrics" port of Services labelled app.kubernetes.io/name=garm.
+apiVersion: operator.victoriametrics.com/v1beta1
+kind: VMServiceScrape
+metadata:
+  name: garm
+spec:
+  namespaceSelector:
+    matchNames:
+      - garm
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: garm
+  endpoints:
+    - port: metrics
diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/kepler-scrape.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/kepler-scrape.yaml
new file mode 100644
index 0000000..3cdbc1d
--- /dev/null
+++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/kepler-scrape.yaml
@@ -0,0 +1,14 @@
+# Scrape the "http" port of Services labelled app.kubernetes.io/name=kepler.
+apiVersion: operator.victoriametrics.com/v1beta1
+kind: VMServiceScrape
+metadata:
+  name: kepler
+spec:
+  namespaceSelector:
+    matchNames:
+      - observability
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: kepler
+  endpoints:
+    - port: http
diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/values.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/values.yaml
index bd22879..5bb9361 100644
--- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/values.yaml
+++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/values.yaml
@@ -283,7 +283,7 @@ vmsingle:
   spec:
     port: "8429"
     # -- Data retention period. Possible units character: h(ours), d(ays), w(eeks), y(ears), if no unit character specified - month. The minimum retention period is 24h. See these [docs](https://docs.victoriametrics.com/single-server-victoriametrics/#retention)
-    retentionPeriod: "1"
+    retentionPeriod: "6"
     replicaCount: 1
     extraArgs: {}
     storageMetadata: