From 3ed3487e972d2509256d2cf78a6b0b1cdf59e04c Mon Sep 17 00:00:00 2001
From: Daniel Sy
Date: Mon, 22 Jun 2026 10:40:43 +0200
Subject: [PATCH] =?UTF-8?q?fix(observability):=20=F0=9F=90=9B=20harden=20v?=
 =?UTF-8?q?magent=20liveness=20probe=20failureThreshold=2010=E2=86=923?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Silent outage for 72h went undetected due to lenient probe.
Add startupProbe (failureThreshold=30) to allow slow starts.
---
NOTE(review): line breaks and YAML indentation were reconstructed from a
whitespace-collapsed copy of this patch. The probes are assumed to sit under
vmagent.spec, as siblings of extraArgs; confirm against values.yaml before
applying.

 .../vm-client-stack/values.yaml                    | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml
index 9224a46..06930b0 100644
--- a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml
+++ b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml
@@ -801,6 +801,20 @@ vmagent:
       # Do not store original labels in vmagent's memory by default. This reduces the amount of memory used by vmagent
       # but makes vmagent debugging UI less informative. See: https://docs.victoriametrics.com/vmagent/#relabel-debug
       promscrape.dropOriginalLabels: "true"
+    # Harden liveness probe: default failureThreshold=10 masked a 72h silent outage
+    livenessProbe:
+      httpGet:
+        path: /health
+        port: http
+      failureThreshold: 3
+      periodSeconds: 5
+      timeoutSeconds: 5
+    startupProbe:
+      httpGet:
+        path: /health
+        port: http
+      failureThreshold: 30
+      periodSeconds: 5
   # -- (object) VMAgent ingress configuration
   ingress:
     enabled: false