feat(observability): add sustainability metrics, Kepler, 6-month retention, GARM scrape
This commit is contained in:
parent
bbdca11f00
commit
b5594a8017
6 changed files with 141 additions and 1 deletions
|
|
@ -0,0 +1,29 @@
|
||||||
|
---
# Argo CD Application that installs the Kepler Helm chart into the
# `observability` namespace of the `in-cluster` destination.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: kepler
  namespace: argocd
  labels:
    env: dev
spec:
  project: default
  destination:
    name: in-cluster
    namespace: observability
  sources:
    # Chart source; its values file is resolved through the `$values` ref
    # declared by the second source.
    - repoURL: https://sustainable-computing-io.github.io/kepler-helm-chart
      chart: kepler
      targetRevision: 0.6.1
      helm:
        valueFiles:
          - $values/otc/observability.buildth.ing/stacks/observability-client/kepler/values.yaml
    # Git repository exposed to the chart source under the name `values`.
    # NOTE(review): HEAD is a floating revision; value changes roll out on
    # every sync.
    - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances
      targetRevision: HEAD
      ref: values
  syncPolicy:
    automated:
      selfHeal: true
    syncOptions:
      - CreateNamespace=true
    retry:
      # A negative limit leaves the number of sync retries uncapped.
      limit: -1
|
||||||
|
|
@ -0,0 +1,10 @@
|
||||||
|
---
# Helm value overrides for the Kepler chart.

# NOTE(review): presumably turns off the chart's host /usr/src mount — confirm
# against the chart's values reference.
canMount:
  usrSrc: false

# The chart-managed ServiceMonitor is switched off; this commit adds a separate
# VMServiceScrape named `kepler` for scraping.
serviceMonitor:
  enabled: false

# Environment variables handed to Kepler. Values are quoted so they stay
# strings rather than YAML booleans/integers.
extraEnvVars:
  ENABLE_GPU: "false"
  ENABLE_EBPF_CGROUPID: "true"
  KEPLER_LOG_LEVEL: "1"
|
||||||
|
|
@ -0,0 +1,75 @@
|
||||||
|
---
# Recording rules that pre-aggregate CI (Forgejo runner / GARM) and
# cluster-wide resource and energy usage for sustainability reporting.
# Every group is evaluated every 5 minutes.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: ci-sustainability
spec:
  groups:
    # Trailing one-day aggregates for CI pods in the gitea and garm namespaces.
    - name: ci.sustainability.daily
      interval: 5m
      rules:
        # CPU seconds consumed by CI containers over the trailing day.
        - record: ci:cpu_seconds:increase1d
          expr: |
            sum by(namespace, cluster) (
              increase(container_cpu_usage_seconds_total{
                namespace=~"gitea|garm",
                pod=~"forgejo-runner.*|garm-.*",
                container!=""
              }[1d])
            )
        # One-day average of the summed working-set bytes of CI containers,
        # sampled at 5m resolution. Despite the record name this is an average
        # in bytes, not a byte-seconds integral; the name is kept so existing
        # consumers keep working.
        - record: ci:memory_bytes_seconds:avg1d
          expr: |
            avg_over_time(
              sum by(namespace, cluster) (
                container_memory_working_set_bytes{
                  namespace=~"gitea|garm",
                  pod=~"forgejo-runner.*|garm-.*",
                  container!=""
                }
              )[1d:5m]
            )
        # One-day average of the number of CI pods, sampled at 5m resolution.
        - record: ci:pod_count:avg1d
          expr: |
            avg_over_time(
              count by(namespace, cluster) (
                kube_pod_info{
                  namespace=~"gitea|garm",
                  pod=~"forgejo-runner.*|garm-.*"
                }
              )[1d:5m]
            )
        # Number of CI pods started during the trailing day.
        # kube_pod_start_time is constant for a given pod series, so
        # changes() over it stays at 0 and never counts newly created pods.
        # Instead, take each pod's last reported start timestamp within the
        # window and count the pods whose start lies inside the last 24h.
        - record: ci:pod_creations:increase1d
          expr: |
            count by(namespace, cluster) (
              last_over_time(kube_pod_start_time{
                namespace=~"gitea|garm",
                pod=~"forgejo-runner.*|garm-.*"
              }[1d]) > (time() - 86400)
            )
    # Cluster-wide resource usage.
    - name: ci.sustainability.cluster
      interval: 5m
      rules:
        # Non-idle CPU seconds per second, summed over all nodes of a cluster.
        - record: cluster:cpu_seconds:rate5m
          expr: |
            sum by(cluster) (
              rate(node_cpu_seconds_total{mode!="idle"}[5m])
            )
        # Memory in use (total minus available), summed per cluster.
        - record: cluster:memory_used_bytes:sum
          expr: |
            sum by(cluster) (
              node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes
            )
    # Energy figures derived from Kepler metrics.
    - name: ci.sustainability.energy
      interval: 5m
      rules:
        # Joules attributed to CI containers over the trailing day.
        # The pod_name filter mirrors the pod regex of the CPU/memory rules so
        # that non-runner workloads in the same namespaces are not counted.
        # NOTE(review): relies on Kepler's `pod_name` label — confirm it is
        # present on kepler_container_joules_total in the deployed version.
        - record: ci:joules:increase1d
          expr: |
            sum by(container_namespace, cluster) (
              increase(kepler_container_joules_total{
                container_namespace=~"gitea|garm",
                pod_name=~"forgejo-runner.*|garm-.*"
              }[1d])
            )
        # Node-level joules per second, summed per cluster.
        - record: cluster:joules:rate5m
          expr: |
            sum by(cluster) (
              rate(kepler_node_joules_total[5m])
            )
|
||||||
|
|
@ -0,0 +1,13 @@
|
||||||
|
---
# Scrape configuration for GARM: targets Services labelled
# app.kubernetes.io/name=garm in the `garm` namespace on their `metrics` port.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
  name: garm
spec:
  endpoints:
    - port: metrics
  selector:
    matchLabels:
      app.kubernetes.io/name: garm
  namespaceSelector:
    matchNames:
      - garm
|
||||||
|
|
@ -0,0 +1,13 @@
|
||||||
|
---
# Scrape configuration for Kepler: targets Services labelled
# app.kubernetes.io/name=kepler in the `observability` namespace on their
# `http` port.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
  name: kepler
spec:
  endpoints:
    - port: http
  selector:
    matchLabels:
      app.kubernetes.io/name: kepler
  namespaceSelector:
    matchNames:
      - observability
|
||||||
|
|
@ -283,7 +283,7 @@ vmsingle:
|
||||||
spec:
|
spec:
|
||||||
port: "8429"
|
port: "8429"
|
||||||
# -- Data retention period. Possible units character: h(ours), d(ays), w(eeks), y(ears), if no unit character specified - month. The minimum retention period is 24h. See these [docs](https://docs.victoriametrics.com/single-server-victoriametrics/#retention)
|
# -- Data retention period. Possible units character: h(ours), d(ays), w(eeks), y(ears), if no unit character specified - month. The minimum retention period is 24h. See these [docs](https://docs.victoriametrics.com/single-server-victoriametrics/#retention)
|
||||||
retentionPeriod: "1"
|
retentionPeriod: "6"
|
||||||
replicaCount: 1
|
replicaCount: 1
|
||||||
extraArgs: {}
|
extraArgs: {}
|
||||||
storageMetadata:
|
storageMetadata:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue