feat(observability): add sustainability metrics, Kepler, 6-month retention, GARM scrape
This commit is contained in:
parent
bbdca11f00
commit
b5594a8017
6 changed files with 141 additions and 1 deletions
|
|
@ -0,0 +1,29 @@
|
|||
---
# Argo CD Application that deploys the Kepler Helm chart to the
# `observability` namespace of the `in-cluster` destination.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: kepler
  namespace: argocd
  labels:
    env: dev
spec:
  project: default
  destination:
    name: in-cluster
    namespace: observability
  sources:
    # Source 1: the chart itself. Its values file is addressed through the
    # `$values` reference declared by the second source.
    - chart: kepler
      repoURL: https://sustainable-computing-io.github.io/kepler-helm-chart
      targetRevision: 0.6.1
      helm:
        valueFiles:
          - $values/otc/observability.buildth.ing/stacks/observability-client/kepler/values.yaml
    # Source 2: Git repository exposed under the name `values`.
    - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances
      targetRevision: HEAD
      ref: values
  syncPolicy:
    # Only selfHeal is enabled here; `prune` is not set.
    automated:
      selfHeal: true
    syncOptions:
      - CreateNamespace=true
    # Per the Argo CD docs a negative limit means unlimited retry attempts.
    retry:
      limit: -1
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
---
# Value overrides for the Kepler Helm chart.

# The chart-provided ServiceMonitor is switched off.
serviceMonitor:
  enabled: false

# NOTE(review): presumably controls mounting the host's /usr/src into the
# Kepler pod - confirm against the chart's values reference.
canMount:
  usrSrc: false

# Extra environment variables for Kepler. Every value is quoted so it is
# passed as a string rather than a YAML boolean or integer.
extraEnvVars:
  KEPLER_LOG_LEVEL: "1"
  ENABLE_GPU: "false"
  ENABLE_EBPF_CGROUPID: "true"
|
||||
|
|
@ -0,0 +1,75 @@
|
|||
---
# Recording rules for CI sustainability reporting. Three groups, each
# evaluated every 5 minutes:
#   - ci.sustainability.daily:   1-day rollups for CI pods in gitea / garm
#   - ci.sustainability.cluster: cluster-wide CPU and memory usage
#   - ci.sustainability.energy:  Kepler energy counters
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: ci-sustainability
spec:
  groups:
    - name: ci.sustainability.daily
      interval: 5m
      rules:
        # CPU seconds consumed over the last day by matching CI containers,
        # summed per namespace and cluster.
        - record: ci:cpu_seconds:increase1d
          expr: |
            sum by(namespace, cluster) (
              increase(container_cpu_usage_seconds_total{
                namespace=~"gitea|garm",
                pod=~"forgejo-runner.*|garm-.*",
                container!=""
              }[1d])
            )
        # One-day average (5m subquery step) of the summed working-set bytes.
        # NOTE(review): the record name says "bytes_seconds" but the
        # expression yields an average in bytes - confirm the intended unit.
        - record: ci:memory_bytes_seconds:avg1d
          expr: |
            avg_over_time(
              sum by(namespace, cluster) (
                container_memory_working_set_bytes{
                  namespace=~"gitea|garm",
                  pod=~"forgejo-runner.*|garm-.*",
                  container!=""
                }
              )[1d:5m]
            )
        # One-day average (5m subquery step) of the number of matching pods.
        - record: ci:pod_count:avg1d
          expr: |
            avg_over_time(
              count by(namespace, cluster) (
                kube_pod_info{
                  namespace=~"gitea|garm",
                  pod=~"forgejo-runner.*|garm-.*"
                }
              )[1d:5m]
            )
        # Sum of changes() over kube_pod_start_time in the last day.
        # NOTE(review): this only counts pod creations if changes() treats
        # the first sample of a new series as a change - verify against the
        # MetricsQL documentation for the deployed VictoriaMetrics version.
        - record: ci:pod_creations:increase1d
          expr: |
            sum by(namespace, cluster) (
              changes(kube_pod_start_time{
                namespace=~"gitea|garm",
                pod=~"forgejo-runner.*|garm-.*"
              }[1d])
            )
    - name: ci.sustainability.cluster
      interval: 5m
      rules:
        # Per-cluster rate of CPU seconds spent in any mode except idle.
        - record: cluster:cpu_seconds:rate5m
          expr: |
            sum by(cluster) (
              rate(node_cpu_seconds_total{mode!="idle"}[5m])
            )
        # Per-cluster used memory, computed as MemTotal minus MemAvailable.
        - record: cluster:memory_used_bytes:sum
          expr: |
            sum by(cluster) (
              node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes
            )
    - name: ci.sustainability.energy
      interval: 5m
      rules:
        # Increase of the Kepler container joule counter over the last day
        # for the gitea / garm namespaces.
        - record: ci:joules:increase1d
          expr: |
            sum by(container_namespace, cluster) (
              increase(kepler_container_joules_total{
                container_namespace=~"gitea|garm"
              }[1d])
            )
        # Per-cluster rate of the Kepler node joule counter.
        # NOTE(review): confirm that the deployed Kepler version exports a
        # metric named kepler_node_joules_total.
        - record: cluster:joules:rate5m
          expr: |
            sum by(cluster) (
              rate(kepler_node_joules_total[5m])
            )
|
||||
|
|
@ -0,0 +1,13 @@
|
|||
---
# Scrape configuration for Services labelled `app.kubernetes.io/name: garm`
# in the `garm` namespace, using the Service port named `metrics`.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
  name: garm
spec:
  endpoints:
    # No path or auth is configured here, so operator defaults apply.
    # NOTE(review): confirm GARM serves metrics on this port without auth.
    - port: metrics
  selector:
    matchLabels:
      app.kubernetes.io/name: garm
  namespaceSelector:
    matchNames:
      - garm
|
||||
|
|
@ -0,0 +1,13 @@
|
|||
---
# Scrape configuration for Services labelled `app.kubernetes.io/name: kepler`
# in the `observability` namespace, using the Service port named `http`.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
  name: kepler
spec:
  endpoints:
    - port: http
  selector:
    matchLabels:
      app.kubernetes.io/name: kepler
  namespaceSelector:
    matchNames:
      - observability
|
||||
|
|
@ -283,7 +283,7 @@ vmsingle:
|
|||
spec:
|
||||
port: "8429"
|
||||
# -- Data retention period. Possible unit characters: h(ours), d(ays), w(eeks), y(ears); if no unit character is specified, the value is interpreted as months. The minimum retention period is 24h. See these [docs](https://docs.victoriametrics.com/single-server-victoriametrics/#retention)
|
||||
retentionPeriod: "1"
|
||||
retentionPeriod: "6"
|
||||
replicaCount: 1
|
||||
extraArgs: {}
|
||||
storageMetadata:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue