diff --git a/otc/benchmark.t09.de/edfbuilder.yaml b/otc/benchmark.t09.de/edfbuilder.yaml new file mode 100644 index 0000000..1d105ce --- /dev/null +++ b/otc/benchmark.t09.de/edfbuilder.yaml @@ -0,0 +1,24 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: edfbuilder + namespace: argocd + labels: + env: dev + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + destination: + name: in-cluster + namespace: argocd + source: + path: "otc/benchmark.t09.de/registry" + repoURL: "https://edp.buildth.ing/DevFW-CICD/stacks-instances" + targetRevision: HEAD + project: default + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true diff --git a/otc/benchmark.t09.de/registry/ci-sizer.yaml b/otc/benchmark.t09.de/registry/ci-sizer.yaml new file mode 100644 index 0000000..953c8c1 --- /dev/null +++ b/otc/benchmark.t09.de/registry/ci-sizer.yaml @@ -0,0 +1,24 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: ci-sizer-reg + namespace: argocd + labels: + env: dev + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + destination: + name: in-cluster + namespace: argocd + source: + path: "otc/benchmark.t09.de/stacks/ci-sizer" + repoURL: "https://edp.buildth.ing/DevFW-CICD/stacks-instances" + targetRevision: HEAD + project: default + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true diff --git a/otc/benchmark.t09.de/registry/coder.yaml b/otc/benchmark.t09.de/registry/coder.yaml new file mode 100644 index 0000000..2c36d8d --- /dev/null +++ b/otc/benchmark.t09.de/registry/coder.yaml @@ -0,0 +1,24 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: coder-reg + namespace: argocd + labels: + env: dev + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + destination: + name: in-cluster + namespace: argocd + source: + path: "otc/benchmark.t09.de/stacks/coder" + repoURL: 
"https://edp.buildth.ing/DevFW-CICD/stacks-instances" + targetRevision: HEAD + project: default + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true diff --git a/otc/benchmark.t09.de/registry/core.yaml b/otc/benchmark.t09.de/registry/core.yaml new file mode 100644 index 0000000..7a9b64c --- /dev/null +++ b/otc/benchmark.t09.de/registry/core.yaml @@ -0,0 +1,24 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: core + namespace: argocd + labels: + env: dev + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + destination: + name: in-cluster + namespace: argocd + source: + path: "otc/benchmark.t09.de/stacks/core" + repoURL: "https://edp.buildth.ing/DevFW-CICD/stacks-instances" + targetRevision: HEAD + project: default + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true diff --git a/otc/benchmark.t09.de/registry/docs.yaml b/otc/benchmark.t09.de/registry/docs.yaml new file mode 100644 index 0000000..9d88777 --- /dev/null +++ b/otc/benchmark.t09.de/registry/docs.yaml @@ -0,0 +1,24 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: docs-reg + namespace: argocd + labels: + env: dev + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + destination: + name: in-cluster + namespace: argocd + source: + path: argocd-stack + repoURL: "https://edp.buildth.ing/DevFW-CICD/website-and-documentation" + targetRevision: HEAD + project: default + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true diff --git a/otc/benchmark.t09.de/registry/forgejo.yaml b/otc/benchmark.t09.de/registry/forgejo.yaml new file mode 100644 index 0000000..2442409 --- /dev/null +++ b/otc/benchmark.t09.de/registry/forgejo.yaml @@ -0,0 +1,24 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: forgejo + namespace: argocd + labels: + env: dev + finalizers: + - 
resources-finalizer.argocd.argoproj.io +spec: + destination: + name: in-cluster + namespace: argocd + source: + path: "otc/benchmark.t09.de/stacks/forgejo" + repoURL: "https://edp.buildth.ing/DevFW-CICD/stacks-instances" + targetRevision: HEAD + project: default + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true diff --git a/otc/benchmark.t09.de/registry/garm.yaml b/otc/benchmark.t09.de/registry/garm.yaml new file mode 100644 index 0000000..1e44b8b --- /dev/null +++ b/otc/benchmark.t09.de/registry/garm.yaml @@ -0,0 +1,24 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: garm-reg + namespace: argocd + labels: + env: dev + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + destination: + name: in-cluster + namespace: argocd + source: + path: "otc/benchmark.t09.de/stacks/garm" + repoURL: "https://edp.buildth.ing/DevFW-CICD/stacks-instances" + targetRevision: HEAD + project: default + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true diff --git a/otc/benchmark.t09.de/registry/observability-client.yaml b/otc/benchmark.t09.de/registry/observability-client.yaml new file mode 100644 index 0000000..1ca1b3e --- /dev/null +++ b/otc/benchmark.t09.de/registry/observability-client.yaml @@ -0,0 +1,24 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: observability-client + namespace: argocd + labels: + env: dev + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + destination: + name: in-cluster + namespace: argocd + source: + path: "otc/benchmark.t09.de/stacks/observability-client" + repoURL: "https://edp.buildth.ing/DevFW-CICD/stacks-instances" + targetRevision: HEAD + project: default + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true diff --git a/otc/benchmark.t09.de/registry/observability.yaml b/otc/benchmark.t09.de/registry/observability.yaml new file mode 100644 
index 0000000..e5473d3 --- /dev/null +++ b/otc/benchmark.t09.de/registry/observability.yaml @@ -0,0 +1,24 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: observability + namespace: argocd + labels: + env: dev + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + destination: + name: in-cluster + namespace: argocd + source: + path: "otc/benchmark.t09.de/stacks/observability" + repoURL: "https://edp.buildth.ing/DevFW-CICD/stacks-instances" + targetRevision: HEAD + project: default + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true diff --git a/otc/benchmark.t09.de/registry/otc.yaml b/otc/benchmark.t09.de/registry/otc.yaml new file mode 100644 index 0000000..dbba541 --- /dev/null +++ b/otc/benchmark.t09.de/registry/otc.yaml @@ -0,0 +1,24 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: otc + namespace: argocd + labels: + env: dev + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + destination: + name: in-cluster + namespace: argocd + source: + path: "otc/benchmark.t09.de/stacks/otc" + repoURL: "https://edp.buildth.ing/DevFW-CICD/stacks-instances" + targetRevision: HEAD + project: default + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true diff --git a/otc/benchmark.t09.de/registry/terralist.yaml b/otc/benchmark.t09.de/registry/terralist.yaml new file mode 100644 index 0000000..3ef37d1 --- /dev/null +++ b/otc/benchmark.t09.de/registry/terralist.yaml @@ -0,0 +1,24 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: terralist-reg + namespace: argocd + labels: + env: dev + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + destination: + name: in-cluster + namespace: argocd + source: + path: "otc/benchmark.t09.de/stacks/terralist" + repoURL: "https://edp.buildth.ing/DevFW-CICD/stacks-instances" + targetRevision: HEAD + project: default + syncPolicy: + automated: + 
prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true diff --git a/otc/benchmark.t09.de/stacks/ci-sizer/gitlab-webhook.yaml b/otc/benchmark.t09.de/stacks/ci-sizer/gitlab-webhook.yaml new file mode 100644 index 0000000..f876092 --- /dev/null +++ b/otc/benchmark.t09.de/stacks/ci-sizer/gitlab-webhook.yaml @@ -0,0 +1,29 @@ +# Optional: GitLab CI integration +# Only hydrate this app for clusters that run GitLab Runner. +# For Forgejo/GitHub-only deployments, omit this app from stacks-instances. +# See: ci-sizer/docs/deployment-modes.md +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: gitlab-sizer-webhook + namespace: argocd + labels: + env: dev + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + project: default + syncPolicy: + automated: + selfHeal: true + syncOptions: + - CreateNamespace=true + retry: + limit: -1 + destination: + name: in-cluster + namespace: ci-sizer + source: + repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances + targetRevision: HEAD + path: "otc/benchmark.t09.de/stacks/ci-sizer/gitlab-webhook" diff --git a/otc/benchmark.t09.de/stacks/ci-sizer/gitlab-webhook/certificates.yaml b/otc/benchmark.t09.de/stacks/ci-sizer/gitlab-webhook/certificates.yaml new file mode 100644 index 0000000..ee1fece --- /dev/null +++ b/otc/benchmark.t09.de/stacks/ci-sizer/gitlab-webhook/certificates.yaml @@ -0,0 +1,27 @@ +# Self-signed Issuer for webhook TLS. +# For production, replace with a ClusterIssuer backed by a real CA. +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: selfsigned-issuer +spec: + selfSigned: {} +--- +# cert-manager Certificate for the webhook TLS. +# The resulting Secret (gitlab-sizer-webhook-tls) is mounted into the webhook pod. +# cert-manager also injects the CA into the MutatingWebhookConfiguration via the +# cert-manager.io/inject-ca-from annotation. 
+apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: gitlab-sizer-webhook-cert +spec: + secretName: gitlab-sizer-webhook-tls + issuerRef: + name: selfsigned-issuer + kind: Issuer + dnsNames: + - gitlab-sizer-webhook.ci-sizer.svc + - gitlab-sizer-webhook.ci-sizer.svc.cluster.local + duration: 8760h + renewBefore: 720h diff --git a/otc/benchmark.t09.de/stacks/ci-sizer/gitlab-webhook/deployment.yaml b/otc/benchmark.t09.de/stacks/ci-sizer/gitlab-webhook/deployment.yaml new file mode 100644 index 0000000..0b99859 --- /dev/null +++ b/otc/benchmark.t09.de/stacks/ci-sizer/gitlab-webhook/deployment.yaml @@ -0,0 +1,141 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: gitlab-sizer-webhook +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: gitlab-sizer-webhook +rules: + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: gitlab-sizer-webhook +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: gitlab-sizer-webhook +subjects: + - kind: ServiceAccount + name: gitlab-sizer-webhook + namespace: ci-sizer +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: gitlab-sizer-webhook + labels: + app: gitlab-sizer-webhook +spec: + replicas: 2 + selector: + matchLabels: + app: gitlab-sizer-webhook + template: + metadata: + labels: + app: gitlab-sizer-webhook + spec: + serviceAccountName: gitlab-sizer-webhook + securityContext: + runAsNonRoot: true + runAsUser: 65534 + runAsGroup: 65534 + seccompProfile: + type: RuntimeDefault + containers: + - name: webhook + image: edp.buildth.ing/devfw-cicd/gitlab-webhook-edge-connect:latest + imagePullPolicy: Always + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + ports: + - containerPort: 8443 + protocol: TCP + args: + - --listen-addr=:8443 + - 
--tls-cert-file=/etc/webhook/tls/tls.crt + - --tls-key-file=/etc/webhook/tls/tls.key + - --sizer-url=http://sizer-receiver.ci-sizer.svc:8080 + - --sizer-sidecar-image=edp.buildth.ing/devfw-cicd/ci-sizer-collector:latest + env: + - name: WEBHOOK_SIZER_READ_TOKEN + valueFrom: + secretKeyRef: + name: gitlab-sizer-webhook-tokens + key: sizer-read-token + - name: WEBHOOK_SIZER_PUSH_TOKEN + valueFrom: + secretKeyRef: + name: gitlab-sizer-webhook-tokens + key: sizer-push-token + - name: HTTP_PROXY + valueFrom: + configMapKeyRef: + name: gitlab-sizer-webhook-config + key: HTTP_PROXY + optional: true + - name: HTTPS_PROXY + valueFrom: + configMapKeyRef: + name: gitlab-sizer-webhook-config + key: HTTPS_PROXY + optional: true + - name: NO_PROXY + valueFrom: + configMapKeyRef: + name: gitlab-sizer-webhook-config + key: NO_PROXY + optional: true + volumeMounts: + - name: webhook-tls + mountPath: /etc/webhook/tls + readOnly: true + livenessProbe: + httpGet: + path: /healthz + port: 8443 + scheme: HTTPS + initialDelaySeconds: 5 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /healthz + port: 8443 + scheme: HTTPS + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 200m + memory: 128Mi + volumes: + - name: webhook-tls + secret: + secretName: gitlab-sizer-webhook-tls +--- +apiVersion: v1 +kind: Service +metadata: + name: gitlab-sizer-webhook + labels: + app: gitlab-sizer-webhook +spec: + selector: + app: gitlab-sizer-webhook + ports: + - port: 443 + targetPort: 8443 + protocol: TCP diff --git a/otc/benchmark.t09.de/stacks/ci-sizer/gitlab-webhook/webhook-config.yaml b/otc/benchmark.t09.de/stacks/ci-sizer/gitlab-webhook/webhook-config.yaml new file mode 100644 index 0000000..72aea4a --- /dev/null +++ b/otc/benchmark.t09.de/stacks/ci-sizer/gitlab-webhook/webhook-config.yaml @@ -0,0 +1,30 @@ +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: gitlab-sizer-webhook + 
annotations: + cert-manager.io/inject-ca-from: ci-sizer/gitlab-sizer-webhook-cert +webhooks: + - name: gitlab-sizer-webhook.ci-sizer.svc + admissionReviewVersions: ["v1"] + sideEffects: NoneOnDryRun + failurePolicy: Ignore + timeoutSeconds: 5 + reinvocationPolicy: Never + clientConfig: + service: + name: gitlab-sizer-webhook + namespace: ci-sizer + path: /mutate + rules: + - apiGroups: [""] + apiVersions: ["v1"] + operations: ["CREATE"] + resources: ["pods"] + namespaceSelector: + matchLabels: + ci-sizer.devfw.io/watch: "true" + objectSelector: + matchExpressions: + - key: job.runner.gitlab.com/pod + operator: Exists diff --git a/otc/benchmark.t09.de/stacks/ci-sizer/sizer-receiver.yaml b/otc/benchmark.t09.de/stacks/ci-sizer/sizer-receiver.yaml new file mode 100644 index 0000000..a1623f9 --- /dev/null +++ b/otc/benchmark.t09.de/stacks/ci-sizer/sizer-receiver.yaml @@ -0,0 +1,29 @@ +# Required: CI Sizer receiver +# Always deploy this — it stores metrics and computes sizing recommendations. +# Works standalone or with GARM (Forgejo/GitHub) and/or GitLab webhook. 
+# See: ci-sizer/docs/deployment-modes.md +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: sizer-receiver + namespace: argocd + labels: + env: dev + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + project: default + syncPolicy: + automated: + selfHeal: true + syncOptions: + - CreateNamespace=true + retry: + limit: -1 + destination: + name: in-cluster + namespace: ci-sizer + source: + repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances + targetRevision: HEAD + path: "otc/benchmark.t09.de/stacks/ci-sizer/sizer-receiver" diff --git a/otc/benchmark.t09.de/stacks/ci-sizer/sizer-receiver/deployment.yaml b/otc/benchmark.t09.de/stacks/ci-sizer/sizer-receiver/deployment.yaml new file mode 100644 index 0000000..fc78147 --- /dev/null +++ b/otc/benchmark.t09.de/stacks/ci-sizer/sizer-receiver/deployment.yaml @@ -0,0 +1,126 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: sizer-receiver + labels: + app: sizer-receiver +spec: + strategy: + type: Recreate + replicas: 1 + selector: + matchLabels: + app: sizer-receiver + template: + metadata: + labels: + app: sizer-receiver + spec: + securityContext: + fsGroup: 65534 + containers: + - name: receiver + image: edp.buildth.ing/devfw-cicd/ci-sizer-receiver:latest + imagePullPolicy: Always + args: + - --db=/data/metrics.db + ports: + - name: http + containerPort: 8080 + protocol: TCP + env: + - name: RECEIVER_READ_TOKEN + valueFrom: + secretKeyRef: + name: sizer-tokens + key: read-token + - name: RECEIVER_HMAC_KEY + valueFrom: + secretKeyRef: + name: sizer-tokens + key: hmac-key + - name: GARM_URL + value: "http://garm.garm.svc:80" + - name: GARM_USER + value: "admin" + - name: GARM_PASSWORD + valueFrom: + secretKeyRef: + name: garm-fixed-credentials + key: admin_password + - name: RECEIVER_OIDC_ISSUER + value: "https://dex.benchmark.t09.de" + - name: RECEIVER_OIDC_CLIENT_ID + value: "ci-sizer" + - name: RECEIVER_OIDC_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: 
sizer-oidc-client + key: client-secret + - name: RECEIVER_OIDC_REDIRECT_URI + value: "https://sizer.benchmark.t09.de/ui/callback" + - name: RECEIVER_SESSION_TTL + value: "12h" + - name: RECEIVER_ALLOWED_ORG + value: "giteaAdmin" + - name: RECEIVER_CPU_SIZING_MODE + value: "observe" + - name: RECEIVER_MEMORY_QOS + value: "guaranteed" + volumeMounts: + - name: data + mountPath: /data + livenessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 30 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 2 + periodSeconds: 10 + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 128Mi + volumes: + - name: data + persistentVolumeClaim: + claimName: sizer-receiver-data +--- +apiVersion: v1 +kind: Service +metadata: + name: sizer-receiver + labels: + app: sizer-receiver +spec: + selector: + app: sizer-receiver + ports: + - name: http + port: 8080 + targetPort: http + protocol: TCP +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: sizer-receiver-data + labels: + app: sizer-receiver + annotations: + everest.io/disk-volume-type: GPSSD +spec: + storageClassName: csi-disk + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi diff --git a/otc/benchmark.t09.de/stacks/ci-sizer/sizer-receiver/ingress.yaml b/otc/benchmark.t09.de/stacks/ci-sizer/sizer-receiver/ingress.yaml new file mode 100644 index 0000000..79d90f3 --- /dev/null +++ b/otc/benchmark.t09.de/stacks/ci-sizer/sizer-receiver/ingress.yaml @@ -0,0 +1,36 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + cert-manager.io/cluster-issuer: main + + name: sizer-receiver + namespace: ci-sizer +spec: + ingressClassName: nginx + rules: + - host: sizer.benchmark.t09.de + http: + paths: + - backend: + service: + name: sizer-receiver + port: + number: 8080 + path: / + pathType: Prefix + - host: ci-sizer.benchmark.t09.de + 
http: + paths: + - backend: + service: + name: sizer-receiver + port: + number: 8080 + path: / + pathType: Prefix + tls: + - hosts: + - sizer.benchmark.t09.de + secretName: sizer-receiver-tls diff --git a/otc/benchmark.t09.de/stacks/coder/coder.yaml b/otc/benchmark.t09.de/stacks/coder/coder.yaml new file mode 100644 index 0000000..f40d6a6 --- /dev/null +++ b/otc/benchmark.t09.de/stacks/coder/coder.yaml @@ -0,0 +1,32 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: coder + namespace: argocd + labels: + env: dev +spec: + project: default + syncPolicy: + automated: + selfHeal: true + syncOptions: + - CreateNamespace=true + retry: + limit: -1 + destination: + name: in-cluster + namespace: coder + sources: + - repoURL: https://helm.coder.com/v2 + chart: coder + targetRevision: 2.28.3 + helm: + valueFiles: + - $values/otc/benchmark.t09.de/stacks/coder/coder/values.yaml + - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances + targetRevision: HEAD + ref: values + - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances + targetRevision: HEAD + path: "otc/benchmark.t09.de/stacks/coder/coder/manifests" diff --git a/otc/benchmark.t09.de/stacks/coder/coder/manifests/postgres.yaml b/otc/benchmark.t09.de/stacks/coder/coder/manifests/postgres.yaml new file mode 100644 index 0000000..cae4b97 --- /dev/null +++ b/otc/benchmark.t09.de/stacks/coder/coder/manifests/postgres.yaml @@ -0,0 +1,38 @@ +--- +apiVersion: postgresql.cnpg.io/v1 +kind: Cluster +metadata: + name: coder-db + namespace: coder +spec: + instances: 1 + primaryUpdateStrategy: unsupervised + resources: + requests: + memory: "1Gi" + cpu: "1" + limits: + memory: "1Gi" + cpu: "1" + managed: + roles: + - name: coder + createdb: true + login: true + passwordSecret: + name: coder-db-user + storage: + size: 10Gi + storageClass: csi-disk +--- +apiVersion: postgresql.cnpg.io/v1 +kind: Database +metadata: + name: coder + namespace: coder +spec: + cluster: + name: coder-db + name: coder + 
owner: coder +--- diff --git a/otc/benchmark.t09.de/stacks/coder/coder/values.yaml b/otc/benchmark.t09.de/stacks/coder/coder/values.yaml new file mode 100644 index 0000000..eef7ac4 --- /dev/null +++ b/otc/benchmark.t09.de/stacks/coder/coder/values.yaml @@ -0,0 +1,61 @@ +coder: + # You can specify any environment variables you'd like to pass to Coder + # here. Coder consumes environment variables listed in + # `coder server --help`, and these environment variables are also passed + # to the workspace provisioner (so you can consume them in your Terraform + # templates for auth keys etc.). + # + # Please keep in mind that you should not set `CODER_HTTP_ADDRESS`, + # `CODER_TLS_ENABLE`, `CODER_TLS_CERT_FILE` or `CODER_TLS_KEY_FILE` as + # they are already set by the Helm chart and will cause conflicts. + env: + - name: CODER_ACCESS_URL + value: https://coder.benchmark.t09.de + - name: CODER_PG_CONNECTION_URL + valueFrom: + secretKeyRef: + # You'll need to create a secret called coder-db-user whose `url` key + # holds your Postgres connection URL like: + # postgres://coder:password@postgres:5432/coder?sslmode=disable + name: coder-db-user + key: url + # For production deployments, we recommend configuring your own GitHub + # OAuth2 provider and disabling the default one. + - name: CODER_OAUTH2_GITHUB_DEFAULT_PROVIDER_ENABLE + value: "false" + - name: EDGE_CONNECT_ENDPOINT + valueFrom: + secretKeyRef: + name: edge-credential + key: endpoint + - name: EDGE_CONNECT_USERNAME + valueFrom: + secretKeyRef: + name: edge-credential + key: username + - name: EDGE_CONNECT_PASSWORD + valueFrom: + secretKeyRef: + name: edge-credential + key: password + + # (Optional) For production deployments the access URL should be set. + # If you're just trying Coder, access the dashboard via the service IP. 
+ # - name: CODER_ACCESS_URL + # value: "https://coder.example.com" + + #tls: + # secretNames: + # - my-tls-secret-name + service: + type: ClusterIP + + ingress: + enable: true + className: nginx + host: coder.benchmark.t09.de + annotations: + cert-manager.io/cluster-issuer: main + tls: + enable: true + secretName: coder-tls-secret diff --git a/otc/benchmark.t09.de/stacks/core/argocd.yaml b/otc/benchmark.t09.de/stacks/core/argocd.yaml new file mode 100644 index 0000000..33d9a7d --- /dev/null +++ b/otc/benchmark.t09.de/stacks/core/argocd.yaml @@ -0,0 +1,35 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: argocd + namespace: argocd + labels: + env: dev +spec: + project: default + syncPolicy: + automated: + selfHeal: true + syncOptions: + - CreateNamespace=true + retry: + limit: -1 + destination: + name: in-cluster + namespace: argocd + sources: + - repoURL: https://github.com/argoproj/argo-helm.git + path: charts/argo-cd + # TODO: RIRE Can be updated when https://github.com/argoproj/argo-cd/issues/20790 is fixed and merged + # As logout make problems, it is suggested to switch from path based routing to an own argocd domain, + # similar to the CNOE amazon reference implementation and in our case, Forgejo + targetRevision: argo-cd-9.4.6 + helm: + valueFiles: + - $values/otc/benchmark.t09.de/stacks/core/argocd/values.yaml + - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances + targetRevision: HEAD + ref: values + - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances + targetRevision: HEAD + path: "otc/benchmark.t09.de/stacks/core/argocd/manifests" diff --git a/otc/benchmark.t09.de/stacks/core/argocd/manifests/argocd-server-ingress.yaml b/otc/benchmark.t09.de/stacks/core/argocd/manifests/argocd-server-ingress.yaml new file mode 100644 index 0000000..1c7f405 --- /dev/null +++ b/otc/benchmark.t09.de/stacks/core/argocd/manifests/argocd-server-ingress.yaml @@ -0,0 +1,27 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress 
+metadata: + annotations: + nginx.ingress.kubernetes.io/backend-protocol: HTTP + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + cert-manager.io/cluster-issuer: main + + name: argocd-server + namespace: argocd +spec: + ingressClassName: nginx + rules: + - host: argocd.benchmark.t09.de + http: + paths: + - backend: + service: + name: argocd-server + port: + number: 80 + path: / + pathType: Prefix + tls: + - hosts: + - argocd.benchmark.t09.de + secretName: argocd-net-tls diff --git a/otc/benchmark.t09.de/stacks/core/argocd/values.yaml b/otc/benchmark.t09.de/stacks/core/argocd/values.yaml new file mode 100644 index 0000000..1591cc9 --- /dev/null +++ b/otc/benchmark.t09.de/stacks/core/argocd/values.yaml @@ -0,0 +1,66 @@ +global: + domain: argocd.benchmark.t09.de + +configs: + params: + server.insecure: true + cm: + oidc.config: | + name: FORGEJO + issuer: https://dex.benchmark.t09.de + clientID: controller-argocd-dex + clientSecret: $dex-argo-client:clientSecret + requestedScopes: + - openid + - profile + - email + - groups + application.resourceTrackingMethod: annotation + timeout.reconciliation: 60s + resource.exclusions: | + - apiGroups: + - "*" + kinds: + - ProviderConfigUsage + - apiGroups: + - cilium.io + kinds: + - CiliumIdentity + clusters: + - "*" + url: https://argocd.benchmark.t09.de + rbac: + policy.csv: 'g, DevFW, role:admin' + + tls: + certificates: + +controller: + metrics: + enabled: true + serviceMonitor: + enabled: false + +server: + metrics: + enabled: true + serviceMonitor: + enabled: false + +repoServer: + metrics: + enabled: true + serviceMonitor: + enabled: false + +applicationSet: + metrics: + enabled: true + serviceMonitor: + enabled: false + +notifications: + enabled: false + +dex: + enabled: false diff --git a/otc/benchmark.t09.de/stacks/core/cloudnative-pg.yaml b/otc/benchmark.t09.de/stacks/core/cloudnative-pg.yaml new file mode 100644 index 0000000..aae0345 --- /dev/null +++ b/otc/benchmark.t09.de/stacks/core/cloudnative-pg.yaml @@ 
-0,0 +1,30 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: cloudnative-pg + namespace: argocd + labels: + env: dev +spec: + project: default + syncPolicy: + automated: + selfHeal: true + syncOptions: + - CreateNamespace=true + - ServerSideApply=true + retry: + limit: -1 + destination: + name: in-cluster + namespace: cloudnative-pg + sources: + - repoURL: https://cloudnative-pg.github.io/charts + chart: cloudnative-pg + targetRevision: 0.26.1 + helm: + valueFiles: + - $values/otc/benchmark.t09.de/stacks/core/cloudnative-pg/values.yaml + - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances + targetRevision: HEAD + ref: values diff --git a/otc/benchmark.t09.de/stacks/core/cloudnative-pg/values.yaml b/otc/benchmark.t09.de/stacks/core/cloudnative-pg/values.yaml new file mode 100644 index 0000000..cfebbfc --- /dev/null +++ b/otc/benchmark.t09.de/stacks/core/cloudnative-pg/values.yaml @@ -0,0 +1 @@ +# No need for values here. diff --git a/otc/benchmark.t09.de/stacks/core/dex.yaml b/otc/benchmark.t09.de/stacks/core/dex.yaml new file mode 100644 index 0000000..bb58b24 --- /dev/null +++ b/otc/benchmark.t09.de/stacks/core/dex.yaml @@ -0,0 +1,29 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: dex + namespace: argocd + labels: + env: dev +spec: + project: default + syncPolicy: + automated: + selfHeal: true + syncOptions: + - CreateNamespace=true + retry: + limit: -1 + destination: + name: in-cluster + namespace: dex + sources: + - repoURL: https://charts.dexidp.io + chart: dex + targetRevision: 0.23.0 + helm: + valueFiles: + - $values/otc/benchmark.t09.de/stacks/core/dex/values.yaml + - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances + targetRevision: HEAD + ref: values diff --git a/otc/benchmark.t09.de/stacks/core/dex/values.yaml b/otc/benchmark.t09.de/stacks/core/dex/values.yaml new file mode 100644 index 0000000..76b8450 --- /dev/null +++ b/otc/benchmark.t09.de/stacks/core/dex/values.yaml @@ -0,0 
+1,86 @@ +ingress: + enabled: true + className: nginx + annotations: + cert-manager.io/cluster-issuer: main + hosts: + - host: dex.benchmark.t09.de + paths: + - path: / + pathType: Prefix + tls: + - hosts: + - dex.benchmark.t09.de + secretName: dex-cert + +envVars: + - name: FORGEJO_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: dex-forgejo-client + key: clientSecret + - name: FORGEJO_CLIENT_ID + valueFrom: + secretKeyRef: + name: dex-forgejo-client + key: clientID + - name: OIDC_DEX_GRAFANA_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: dex-grafana-client + key: clientSecret + - name: OIDC_DEX_ARGO_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: dex-argo-client + key: clientSecret + - name: FORGEJO_RUNNER_SIZER_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: dex-sizer-client + key: clientSecret + - name: LOG_LEVEL + value: debug + +config: + # Set it to a valid URL + issuer: https://dex.benchmark.t09.de + + # See https://dexidp.io/docs/storage/ for more options + storage: + type: memory + + oauth2: + skipApprovalScreen: true + alwaysShowLoginScreen: false + + connectors: + - type: gitea + id: gitea + name: Forgejo + config: + clientID: "$FORGEJO_CLIENT_ID" + clientSecret: "$FORGEJO_CLIENT_SECRET" + redirectURI: https://dex.benchmark.t09.de/callback + baseURL: https://edp.buildth.ing + # loadAllGroups: true + orgs: + - name: DevFW + enablePasswordDB: false + + staticClients: + - id: controller-argocd-dex + name: ArgoCD Client + redirectURIs: + - "https://argocd.benchmark.t09.de/auth/callback" + secretEnv: "OIDC_DEX_ARGO_CLIENT_SECRET" + - id: grafana + redirectURIs: + - "https://grafana.benchmark.t09.de/login/generic_oauth" + name: "Grafana" + secretEnv: "OIDC_DEX_GRAFANA_CLIENT_SECRET" + - id: ci-sizer + name: "CI Sizer" + redirectURIs: + - "https://sizer.benchmark.t09.de/ui/callback" + secretEnv: "FORGEJO_RUNNER_SIZER_CLIENT_SECRET" diff --git a/otc/dev.t09.de/stacks/garm/sizer-receiver.yaml b/otc/benchmark.t09.de/stacks/forgejo/forgejo-runner.yaml 
similarity index 76% rename from otc/dev.t09.de/stacks/garm/sizer-receiver.yaml rename to otc/benchmark.t09.de/stacks/forgejo/forgejo-runner.yaml index 1425cc6..5889ae5 100644 --- a/otc/dev.t09.de/stacks/garm/sizer-receiver.yaml +++ b/otc/benchmark.t09.de/stacks/forgejo/forgejo-runner.yaml @@ -1,7 +1,7 @@ apiVersion: argoproj.io/v1alpha1 kind: Application metadata: - name: sizer-receiver + name: forgejo-runner namespace: argocd labels: env: dev @@ -17,9 +17,8 @@ spec: retry: limit: -1 destination: - name: in-cluster - namespace: garm + server: "https://kubernetes.default.svc" source: repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances targetRevision: HEAD - path: "otc/dev.t09.de/stacks/garm/sizer-receiver" + path: "otc/benchmark.t09.de/stacks/forgejo/forgejo-runner" diff --git a/otc/benchmark.t09.de/stacks/forgejo/forgejo-runner/dind-docker.yaml b/otc/benchmark.t09.de/stacks/forgejo/forgejo-runner/dind-docker.yaml new file mode 100644 index 0000000..fa1ab7e --- /dev/null +++ b/otc/benchmark.t09.de/stacks/forgejo/forgejo-runner/dind-docker.yaml @@ -0,0 +1,104 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app: forgejo-runner + name: forgejo-runner + namespace: gitea +spec: + # Multiple replicas mean that if one is busy, another can pick up jobs. 
+ replicas: 3 + selector: + matchLabels: + app: forgejo-runner + strategy: {} + template: + metadata: + creationTimestamp: null + labels: + app: forgejo-runner + spec: + restartPolicy: Always + volumes: + - name: docker-certs + emptyDir: {} + - name: runner-data + emptyDir: {} + # Initialise our configuration file using offline registration + # https://forgejo.org/docs/v1.21/admin/actions/#offline-registration + initContainers: + - name: runner-register + image: code.forgejo.org/forgejo/runner:12.6.4 + command: + - "sh" + - "-c" + - | + forgejo-runner \ + register \ + --no-interactive \ + --token ${RUNNER_SECRET} \ + --name ${RUNNER_NAME} \ + --instance ${FORGEJO_INSTANCE_URL} \ + --labels docker:docker://node:24-bookworm,ubuntu-22.04:docker://ghcr.io/catthehacker/ubuntu:act-22.04,ubuntu-latest:docker://ghcr.io/catthehacker/ubuntu:act-24.04,ubuntu-24.04:docker://ghcr.io/catthehacker/ubuntu:act-24.04 + env: + - name: RUNNER_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: RUNNER_SECRET + valueFrom: + secretKeyRef: + name: forgejo-runner-token + key: token + - name: FORGEJO_INSTANCE_URL + value: https://benchmark.t09.de + volumeMounts: + - name: runner-data + mountPath: /data + containers: + - name: runner + image: code.forgejo.org/forgejo/runner:12.6.4 + command: + - "sh" + - "-c" + - | + while ! 
nc -z 127.0.0.1 2376 config.yml ; + sed -i -e "s|privileged: .*|privileged: true|" config.yml + sed -i -e "s|network: .*|network: host|" config.yml ; + sed -i -e "s|^ envs:$$| envs:\n DOCKER_HOST: tcp://127.0.0.1:2376\n DOCKER_TLS_VERIFY: 1\n DOCKER_CERT_PATH: /certs/client|" config.yml ; + sed -i -e "s|^ options:| options: -v /certs/client:/certs/client|" config.yml ; + sed -i -e "s| valid_volumes: \[\]$$| valid_volumes:\n - /certs/client|" config.yml ; + /bin/forgejo-runner --config config.yml daemon + securityContext: + allowPrivilegeEscalation: true + privileged: true + readOnlyRootFilesystem: false + runAsGroup: 0 + runAsNonRoot: false + runAsUser: 0 + env: + - name: DOCKER_HOST + value: tcp://localhost:2376 + - name: DOCKER_CERT_PATH + value: /certs/client + - name: DOCKER_TLS_VERIFY + value: "1" + volumeMounts: + - name: docker-certs + mountPath: /certs + - name: runner-data + mountPath: /data + - name: daemon + image: docker:28.0.4-dind + env: + - name: DOCKER_TLS_CERTDIR + value: /certs + securityContext: + privileged: true + volumeMounts: + - name: docker-certs + mountPath: /certs diff --git a/otc/benchmark.t09.de/stacks/forgejo/forgejo-server.yaml b/otc/benchmark.t09.de/stacks/forgejo/forgejo-server.yaml new file mode 100644 index 0000000..17e91c5 --- /dev/null +++ b/otc/benchmark.t09.de/stacks/forgejo/forgejo-server.yaml @@ -0,0 +1,32 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: forgejo-server + namespace: argocd + labels: + env: dev +spec: + project: default + syncPolicy: + automated: + selfHeal: true + syncOptions: + - CreateNamespace=true + retry: + limit: -1 + destination: + name: in-cluster + namespace: gitea + sources: + - repoURL: https://code.forgejo.org/forgejo-helm/forgejo-helm.git + path: . 
+ targetRevision: v16.2.0 + helm: + valueFiles: + - $values/otc/benchmark.t09.de/stacks/forgejo/forgejo-server/values.yaml + - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances + targetRevision: HEAD + ref: values + - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances + targetRevision: HEAD + path: "otc/benchmark.t09.de/stacks/forgejo/forgejo-server/manifests" \ No newline at end of file diff --git a/otc/benchmark.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-ingress.yaml b/otc/benchmark.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-ingress.yaml new file mode 100644 index 0000000..e850f89 --- /dev/null +++ b/otc/benchmark.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-ingress.yaml @@ -0,0 +1,27 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + nginx.ingress.kubernetes.io/proxy-body-size: 5120m + cert-manager.io/cluster-issuer: main + + name: forgejo-server + namespace: gitea +spec: + ingressClassName: nginx + rules: + - host: benchmark.t09.de + http: + paths: + - backend: + service: + name: forgejo-server-http + port: + number: 3000 + path: / + pathType: Prefix + tls: + - hosts: + - benchmark.t09.de + secretName: forgejo-net-tls diff --git a/otc/benchmark.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml b/otc/benchmark.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml new file mode 100644 index 0000000..12883a9 --- /dev/null +++ b/otc/benchmark.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml @@ -0,0 +1,91 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + name: forgejo-s3-backup + namespace: gitea +spec: + schedule: "0 1 * * *" + concurrencyPolicy: "Forbid" + successfulJobsHistoryLimit: 5 + failedJobsHistoryLimit: 5 + startingDeadlineSeconds: 600 # 10 minutes + jobTemplate: + spec: + # 2h window: handles large incremental syncs after repo growth 
or OBS slowness; BackupJobTooSlow alert fires at 5m + activeDeadlineSeconds: 7200 + backoffLimit: 2 + ttlSecondsAfterFinished: 259200 # 3 days + template: + spec: + containers: + - name: rclone + image: rclone/rclone:1.70 + imagePullPolicy: IfNotPresent + env: + - name: SOURCE_BUCKET + valueFrom: + secretKeyRef: + name: forgejo-cloud-credentials + key: bucket-name + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: forgejo-cloud-credentials + key: access-key + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: forgejo-cloud-credentials + key: secret-key + volumeMounts: + - name: rclone-config + mountPath: /config/rclone + readOnly: true + - name: backup-dir + mountPath: /backup + readOnly: false + command: + - /bin/sh + - -c + - | + rclone sync source:/${SOURCE_BUCKET} /backup -v --ignore-checksum + restartPolicy: OnFailure + volumes: + - name: rclone-config + secret: + secretName: forgejo-s3-backup + - name: backup-dir + persistentVolumeClaim: + claimName: s3-backup +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: s3-backup + namespace: gitea + annotations: + everest.io/disk-volume-type: GPSSD + everest.io/crypt-key-id: ac5a45e8-c705-445e-8026-e643e3f2525d +spec: + storageClassName: csi-disk + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 500Gi +--- +apiVersion: v1 +kind: Secret +metadata: + name: forgejo-s3-backup + namespace: gitea +type: Opaque +stringData: + rclone.conf: | + [source] + type = s3 + provider = HuaweiOBS + env_auth = true + endpoint = obs.eu-de.otc.t-systems.com + region = eu-de + acl = private diff --git a/otc/benchmark.t09.de/stacks/forgejo/forgejo-server/values.yaml b/otc/benchmark.t09.de/stacks/forgejo/forgejo-server/values.yaml new file mode 100644 index 0000000..df16dee --- /dev/null +++ b/otc/benchmark.t09.de/stacks/forgejo/forgejo-server/values.yaml @@ -0,0 +1,180 @@ + +# We use recreate to make sure only one instance with one version is running, because Forgejo might break or
data gets inconsistent. +strategy: + type: Recreate + +redis-cluster: + enabled: false + +redis: + enabled: false + +postgresql: + enabled: false + +postgresql-ha: + enabled: false + +persistence: + enabled: true + size: 200Gi + storageClass: csi-disk + annotations: + everest.io/crypt-key-id: ac5a45e8-c705-445e-8026-e643e3f2525d + everest.io/disk-volume-type: GPSSD + +test: + enabled: false + +deployment: + env: + - name: SSL_CERT_DIR + value: /etc/ssl/forgejo + +extraVolumeMounts: + - mountPath: /etc/ssl/forgejo + name: custom-database-certs-volume + readOnly: true + +extraVolumes: + - name: custom-database-certs-volume + secret: + secretName: custom-database-certs + +gitea: + metrics: + enabled: true + serviceMonitor: + enabled: true + additionalConfigFromEnvs: + - name: FORGEJO__storage__MINIO_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: forgejo-cloud-credentials + key: access-key + - name: FORGEJO__storage__MINIO_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: forgejo-cloud-credentials + key: secret-key + - name: FORGEJO__queue__CONN_STR + valueFrom: + secretKeyRef: + name: redis-forgejo-cloud-credentials + key: connection-string + - name: FORGEJO__session__PROVIDER_CONFIG + valueFrom: + secretKeyRef: + name: redis-forgejo-cloud-credentials + key: connection-string + - name: FORGEJO__cache__HOST + valueFrom: + secretKeyRef: + name: redis-forgejo-cloud-credentials + key: connection-string + - name: FORGEJO__database__HOST + valueFrom: + secretKeyRef: + name: postgres-forgejo-cloud-credentials + key: host_port + - name: FORGEJO__database__NAME + valueFrom: + secretKeyRef: + name: postgres-forgejo-cloud-credentials + key: database + - name: FORGEJO__database__USER + valueFrom: + secretKeyRef: + name: postgres-forgejo-cloud-credentials + key: username + - name: FORGEJO__database__PASSWD + valueFrom: + secretKeyRef: + name: postgres-forgejo-cloud-credentials + key: password + # Either 'elasticsearch' or 'bleve' (go in memory search engine) + - name:
FORGEJO__indexer__ISSUE_INDEXER_TYPE + valueFrom: + secretKeyRef: + name: elasticsearch-cloud-credentials + key: type + - name: FORGEJO__indexer__ISSUE_INDEXER_CONN_STR + valueFrom: + secretKeyRef: + name: elasticsearch-cloud-credentials + key: connection-string + - name: FORGEJO__indexer__ISSUE_INDEXER_ENABLED + valueFrom: + secretKeyRef: + name: elasticsearch-cloud-credentials + key: enabled + - name: FORGEJO__mailer__PASSWD + valueFrom: + secretKeyRef: + name: email-user-credentials + key: connection-string + + admin: + existingSecret: gitea-credential + + config: + APP_NAME: 'EDP' + APP_SLOGAN: 'Build your thing in minutes' + storage: + MINIO_ENDPOINT: obs.eu-de.otc.t-systems.com:443 + STORAGE_TYPE: minio + MINIO_LOCATION: eu-de + MINIO_BUCKET: "edp-forgejo-non-prod-benchmark" + MINIO_USE_SSL: true + + queue: + TYPE: redis + + session: + PROVIDER: redis + + cache: + ENABLED: true + ADAPTER: redis + + service: + DISABLE_REGISTRATION: true + ENABLE_NOTIFY_MAIL: true + + other: + SHOW_FOOTER_VERSION: false + SHOW_FOOTER_TEMPLATE_LOAD_TIME: false + + database: + DB_TYPE: postgres + SSL_MODE: verify-ca + + server: + DOMAIN: 'benchmark.t09.de' + ROOT_URL: 'https://benchmark.t09.de:443' + + mailer: + ENABLED: true + USER: ipcei-cis-devfw@mms-support.de + PROTOCOL: smtps + FROM: '"IPCEI CIS DevFW" ' + SMTP_ADDR: mail.mms-support.de + SMTP_PORT: 465 + +service: + ssh: + type: LoadBalancer + nodePort: 32222 + externalTrafficPolicy: Cluster + annotations: + kubernetes.io/elb.id: db60c1a9-312c-42b7-847b-781d950a0e7a + +image: + pullPolicy: "IfNotPresent" + # Overrides the image tag whose default is the chart appVersion. 
+ #tag: "8.0.3" + # Adds -rootless suffix to image name + # rootless: true + fullOverride: edp.buildth.ing/devfw-cicd/edp-forgejo:workflow-webhook-20260305 + +forgejo: {} diff --git a/otc/benchmark.t09.de/stacks/garm/garm.yaml b/otc/benchmark.t09.de/stacks/garm/garm.yaml new file mode 100644 index 0000000..05bb67c --- /dev/null +++ b/otc/benchmark.t09.de/stacks/garm/garm.yaml @@ -0,0 +1,33 @@ +# Default: Forgejo/GitHub Actions runner manager +# Deploys GARM with the ci-sizer provider for automatic sizing + collector injection. +# For GitLab-only deployments, omit this and use gitlab-webhook instead. +# See: ci-sizer/docs/deployment-modes.md +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: garm + namespace: argocd + labels: + env: dev +spec: + project: default + syncPolicy: + automated: + selfHeal: true + syncOptions: + - CreateNamespace=true + retry: + limit: -1 + destination: + name: in-cluster + namespace: garm + sources: + - repoURL: https://edp.buildth.ing/DevFW-CICD/garm-helm + path: charts/garm + targetRevision: v0.0.17 + helm: + valueFiles: + - $values/otc/benchmark.t09.de/stacks/garm/garm/values.yaml + - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances + targetRevision: HEAD + ref: values diff --git a/otc/benchmark.t09.de/stacks/garm/garm/values.yaml b/otc/benchmark.t09.de/stacks/garm/garm/values.yaml new file mode 100644 index 0000000..347f792 --- /dev/null +++ b/otc/benchmark.t09.de/stacks/garm/garm/values.yaml @@ -0,0 +1,51 @@ +ingress: + enabled: true + className: nginx + annotations: + cert-manager.io/cluster-issuer: main + nginx.ingress.kubernetes.io/backend-protocol: HTTP + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + hosts: + - host: garm.benchmark.t09.de + paths: + - path: / + pathType: Prefix + tls: + - secretName: garm-net-tls + hosts: + - garm.benchmark.t09.de + +# Credentials and Secrets +credentials: + edgeConnect: + existingSecretName: "edge-credential" + gitea: + url: "https://benchmark.t09.de" 
# Required + db: + existingSecretName: garm-fixed-credentials + +image: + repository: edp.buildth.ing/devfw-cicd/garm-forgejo + tag: v0.1.7-forgejo-22 + +providerConfig: + edgeConnect: + organization: edp2 + region: EU + edgeConnectUrl: "https://hub.apps.edge.platform.mg3.mdb.osc.live" + cloudlet: + name: Hamburg + organization: TelekomOP + edgeConnectK8s: + pendingTimeout: "5m" + sizer: + sidecarImage: edp.buildth.ing/devfw-cicd/ci-sizer-collector:0.9.7 + sidecarPushEndpoint: https://sizer.benchmark.t09.de/api/v1/metrics + baseUrl: "https://sizer.benchmark.t09.de" + readToken: + existingSecretName: sizer-tokens + # key/mountPath/fileName default sanely in garm-helm ≥v0.0.17 + +garm: + logging: + logLevel: info diff --git a/otc/benchmark.t09.de/stacks/observability-client/metrics-server.yaml b/otc/benchmark.t09.de/stacks/observability-client/metrics-server.yaml new file mode 100644 index 0000000..454a0b7 --- /dev/null +++ b/otc/benchmark.t09.de/stacks/observability-client/metrics-server.yaml @@ -0,0 +1,29 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: metrics-server + namespace: argocd + labels: + env: dev +spec: + project: default + syncPolicy: + automated: + selfHeal: true + syncOptions: + - CreateNamespace=true + retry: + limit: -1 + destination: + name: in-cluster + namespace: observability + sources: + - chart: metrics-server + repoURL: https://kubernetes-sigs.github.io/metrics-server/ + targetRevision: 3.12.2 + helm: + valueFiles: + - $values/otc/benchmark.t09.de/stacks/observability-client/metrics-server/values.yaml + - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances + targetRevision: HEAD + ref: values diff --git a/otc/benchmark.t09.de/stacks/observability-client/metrics-server/values.yaml b/otc/benchmark.t09.de/stacks/observability-client/metrics-server/values.yaml new file mode 100644 index 0000000..e96ba41 --- /dev/null +++ b/otc/benchmark.t09.de/stacks/observability-client/metrics-server/values.yaml @@ -0,0 +1,4 
@@ +metrics: + enabled: true +serviceMonitor: + enabled: true diff --git a/otc/benchmark.t09.de/stacks/observability-client/vector.yaml b/otc/benchmark.t09.de/stacks/observability-client/vector.yaml new file mode 100644 index 0000000..a56dbe8 --- /dev/null +++ b/otc/benchmark.t09.de/stacks/observability-client/vector.yaml @@ -0,0 +1,29 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: vector + namespace: argocd + labels: + env: dev +spec: + project: default + syncPolicy: + automated: + selfHeal: true + syncOptions: + - CreateNamespace=true + retry: + limit: -1 + destination: + name: in-cluster + namespace: observability + sources: + - chart: vector + repoURL: https://helm.vector.dev + targetRevision: 0.43.0 + helm: + valueFiles: + - $values/otc/benchmark.t09.de/stacks/observability-client/vector/values.yaml + - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances + targetRevision: HEAD + ref: values diff --git a/otc/benchmark.t09.de/stacks/observability-client/vector/values.yaml b/otc/benchmark.t09.de/stacks/observability-client/vector/values.yaml new file mode 100644 index 0000000..2393b1a --- /dev/null +++ b/otc/benchmark.t09.de/stacks/observability-client/vector/values.yaml @@ -0,0 +1,68 @@ +# -- Enable deployment of vector +role: Agent +dataDir: /vector-data-dir +resources: {} +args: + - -w + - --config-dir + - /etc/vector/ +env: + - name: VECTOR_USER + valueFrom: + secretKeyRef: + name: simple-user-secret + key: username + - name: VECTOR_PASSWORD + valueFrom: + secretKeyRef: + name: simple-user-secret + key: password +containerPorts: + - name: prom-exporter + containerPort: 9090 + protocol: TCP +service: + enabled: false +customConfig: + data_dir: /vector-data-dir + api: + enabled: false + address: 0.0.0.0:8686 + playground: true + sources: + k8s: + type: kubernetes_logs + internal_metrics: + type: internal_metrics + transforms: + parser: + type: remap + inputs: [k8s] + source: | + ._msg = parse_json(.message) ?? 
.message + del(.message) + # Add the cluster environment to the log event + .cluster_environment = "benchmark" + sinks: + vlogs: + type: elasticsearch + inputs: [parser] + endpoints: + - https://o12y.observability.buildth.ing/insert/elasticsearch/ + auth: + strategy: basic + user: ${VECTOR_USER} + password: ${VECTOR_PASSWORD} + mode: bulk + api_version: v8 + compression: gzip + healthcheck: + enabled: false + request: + headers: + AccountID: "0" + ProjectID: "0" + query: + _msg_field: _msg + _time_field: _time + _stream_fields: cluster_environment,kubernetes.container_name,kubernetes.namespace \ No newline at end of file diff --git a/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack.yaml b/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack.yaml new file mode 100644 index 0000000..bcc2fbc --- /dev/null +++ b/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack.yaml @@ -0,0 +1,30 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: vm-client + namespace: argocd + labels: + env: dev +spec: + project: default + syncPolicy: + automated: + selfHeal: true + syncOptions: + - CreateNamespace=true + destination: + name: in-cluster + namespace: observability + sources: + - chart: victoria-metrics-k8s-stack + repoURL: https://victoriametrics.github.io/helm-charts/ + targetRevision: 0.48.1 + helm: + valueFiles: + - $values/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/values.yaml + - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances + targetRevision: HEAD + ref: values + - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances + targetRevision: HEAD + path: "otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/manifests" diff --git a/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/manifests/.gitkeep b/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/manifests/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git 
a/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml b/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml new file mode 100644 index 0000000..aecf517 --- /dev/null +++ b/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml @@ -0,0 +1,15 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: forgejo + namespace: observability +spec: + namespaceSelector: + matchNames: + - gitea + selector: + matchLabels: + app.kubernetes.io/name: forgejo + endpoints: + - port: http + path: /metrics diff --git a/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/values.yaml b/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/values.yaml new file mode 100644 index 0000000..4bc089d --- /dev/null +++ b/otc/benchmark.t09.de/stacks/observability-client/vm-client-stack/values.yaml @@ -0,0 +1,1288 @@ +global: + # -- Cluster label to use for dashboards and rules + clusterLabel: cluster + # -- Global license configuration + license: + key: "" + keyRef: {} + # name: secret-license + # key: license + cluster: + # -- K8s cluster domain suffix, uses for building storage pods' FQDN. Details are [here](https://kubernetes.io/docs/tasks/administer-cluster/dns-custom-nameservers/) + dnsDomain: cluster.local. + +# -- Override chart name +nameOverride: "" +# -- Resource full name override +fullnameOverride: "" +# -- Tenant to use for Grafana datasources and remote write +tenant: "0" +# -- If this chart is used in "Argocd" with "releaseName" field then +# VMServiceScrapes couldn't select the proper services. +# For correct working need set value 'argocdReleaseOverride=$ARGOCD_APP_NAME' +argocdReleaseOverride: "" + +# -- VictoriaMetrics Operator dependency chart configuration. More values can be found [here](https://docs.victoriametrics.com/helm/victoriametrics-operator#parameters). 
Also checkout [here](https://docs.victoriametrics.com/operator/vars) possible ENV variables to configure operator behaviour +victoria-metrics-operator: + enabled: true + crds: + plain: true + cleanup: + enabled: true + image: + repository: bitnami/kubectl + pullPolicy: IfNotPresent + serviceMonitor: + enabled: true + operator: + # -- By default, operator converts prometheus-operator objects. + disable_prometheus_converter: false + # group pinguin added the admissionWebhooks value according to https://docs.victoriametrics.com/helm/victoriametrics-k8s-stack/#argocd-issues + admissionWebhooks: + certManager: + enabled: true + +defaultDashboards: + # -- Enable custom dashboards installation + enabled: false + defaultTimezone: utc + labels: {} + annotations: {} + grafanaOperator: + # -- Create dashboards as CRDs (requires grafana-operator to be installed) + enabled: false + spec: + instanceSelector: + matchLabels: + dashboards: grafana + allowCrossNamespaceImport: false + # -- Create dashboards as ConfigMap despite dependency it requires is not installed + dashboards: + victoriametrics-vmalert: + enabled: true + victoriametrics-operator: + enabled: true + # -- In ArgoCD using client-side apply this dashboard reaches annotations size limit and causes k8s issues without server side apply + # See [this issue](https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-k8s-stack#metadataannotations-too-long-must-have-at-most-262144-bytes-on-dashboards) + node-exporter-full: + enabled: true + +# -- Create default rules for monitoring the cluster +defaultRules: + # -- Labels, which are used for grouping results of the queries. 
Note that these labels are joined with `.Values.global.clusterLabel` + additionalGroupByLabels: [] + create: true + + # -- Common properties for VMRule groups + group: + spec: + # -- Optional HTTP URL parameters added to each rule request + params: {} + + # -- Common properties for all VMRules + rule: + spec: + # -- Additional labels for all VMRules + labels: {} + # -- Additional annotations for all VMRules + annotations: {} + + # -- Common properties for VMRules alerts + alerting: + spec: + # -- Additional labels for VMRule alerts + labels: {} + # -- Additional annotations for VMRule alerts + annotations: {} + + # -- Common properties for VMRules recording rules + recording: + spec: + # -- Additional labels for VMRule recording rules + labels: {} + # -- Additional annotations for VMRule recording rules + annotations: {} + + # -- Per rule properties + rules: {} + # CPUThrottlingHigh: + # create: true + # spec: + # for: 15m + # labels: + # severity: critical + # -- Rule group properties + groups: + etcd: + create: true + # -- Common properties for all rules in a group + rules: {} + # spec: + # annotations: + # dashboard: https://example.com/dashboard/1 + general: + create: true + rules: {} + k8sContainerCpuLimits: + create: true + rules: {} + k8sContainerCpuRequests: + create: true + rules: {} + k8sContainerCpuUsageSecondsTotal: + create: true + rules: {} + k8sContainerMemoryLimits: + create: true + rules: {} + k8sContainerMemoryRequests: + create: true + rules: {} + k8sContainerMemoryRss: + create: true + rules: {} + k8sContainerMemoryCache: + create: true + rules: {} + k8sContainerMemoryWorkingSetBytes: + create: true + rules: {} + k8sContainerMemorySwap: + create: true + rules: {} + k8sPodOwner: + create: true + rules: {} + k8sContainerResource: + create: true + rules: {} + kubeApiserver: + create: true + rules: {} + kubeApiserverAvailability: + create: true + rules: {} + kubeApiserverBurnrate: + create: true + rules: {} + kubeApiserverHistogram: + create: true + 
rules: {} + kubeApiserverSlos: + create: true + rules: {} + kubelet: + create: true + rules: {} + kubePrometheusGeneral: + create: true + rules: {} + kubePrometheusNodeRecording: + create: true + rules: {} + kubernetesApps: + create: true + rules: {} + targetNamespace: ".*" + kubernetesResources: + create: true + rules: {} + kubernetesStorage: + create: true + rules: {} + targetNamespace: ".*" + kubernetesSystem: + create: true + rules: {} + kubernetesSystemKubelet: + create: true + rules: {} + kubernetesSystemApiserver: + create: true + rules: {} + kubernetesSystemControllerManager: + create: true + rules: {} + kubeScheduler: + create: true + rules: {} + kubernetesSystemScheduler: + create: true + rules: {} + kubeStateMetrics: + create: true + rules: {} + nodeNetwork: + create: true + rules: {} + node: + create: true + rules: {} + vmagent: + create: true + rules: {} + vmsingle: + create: true + rules: {} + vmcluster: + create: true + rules: {} + vmHealth: + create: true + rules: {} + vmoperator: + create: true + rules: {} + alertmanager: + create: true + rules: {} + + # -- Runbook url prefix for default rules + runbookUrl: https://runbooks.prometheus-operator.dev/runbooks + + # -- Labels for default rules + labels: {} + # -- Annotations for default rules + annotations: {} + +# -- Provide custom recording or alerting rules to be deployed into the cluster. 
+additionalVictoriaMetricsMap: +# rule-name: +# groups: +# - name: my_group +# rules: +# - record: my_record +# expr: 100 * my_record + +external: + grafana: + # -- External Grafana host + host: "" + # -- External Grafana datasource name + datasource: VictoriaMetrics + # -- External VM read and write URLs + vm: + read: + url: "" + # bearerTokenSecret: + # name: dbaas-read-access-token + # key: bearerToken + write: + url: "" + # bearerTokenSecret: + # name: dbaas-read-access-token + # key: bearerToken + +# Configures vmsingle params +vmsingle: + # -- VMSingle annotations + annotations: {} + # -- Create VMSingle CR + enabled: false + # -- Full spec for VMSingle CRD. Allowed values describe [here](https://docs.victoriametrics.com/operator/api#vmsinglespec) + spec: + port: "8429" + # -- Data retention period. Possible units character: h(ours), d(ays), w(eeks), y(ears), if no unit character specified - month. The minimum retention period is 24h. See these [docs](https://docs.victoriametrics.com/single-server-victoriametrics/#retention) + retentionPeriod: "1" + replicaCount: 1 + extraArgs: {} + storage: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 20Gi + ingress: + # -- Enable deployment of ingress for server component + enabled: false + # -- Ingress annotations + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + # -- Ingress extra labels + labels: {} + # -- Ingress default path + path: "" + # -- Ingress path type + pathType: Prefix + # -- Ingress controller class name + ingressClassName: "" + + # -- Array of host objects + hosts: [] + # - vmsingle.domain.com + # -- Extra paths to prepend to every host configuration. This is useful when working with annotation based services. 
+ extraPaths: [] + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + + # -- Array of TLS objects + tls: [] + # - secretName: vmsingle-ingress-tls + # hosts: + # - vmsingle.domain.com + +vmcluster: + # -- Create VMCluster CR + enabled: false + # -- VMCluster annotations + annotations: {} + # -- Full spec for VMCluster CRD. Allowed values described [here](https://docs.victoriametrics.com/operator/api#vmclusterspec) + spec: + # -- Data retention period. Possible units character: h(ours), d(ays), w(eeks), y(ears), if no unit character specified - month. The minimum retention period is 24h. See these [docs](https://docs.victoriametrics.com/single-server-victoriametrics/#retention) + retentionPeriod: "1" + replicationFactor: 2 + vmstorage: + replicaCount: 2 + storageDataPath: /vm-data + storage: + volumeClaimTemplate: + spec: + resources: + requests: + storage: 10Gi + resources: + {} + # limits: + # cpu: "1" + # memory: 1500Mi + vmselect: + # -- Set this value to false to disable VMSelect + enabled: true + port: "8481" + replicaCount: 2 + cacheMountPath: /select-cache + extraArgs: {} + storage: + volumeClaimTemplate: + spec: + resources: + requests: + storage: 2Gi + resources: + {} + # limits: + # cpu: "1" + # memory: "1000Mi" + # requests: + # cpu: "0.5" + # memory: "500Mi" + vminsert: + # -- Set this value to false to disable VMInsert + enabled: true + port: "8480" + replicaCount: 2 + extraArgs: {} + resources: + {} + # limits: + # cpu: "1" + # memory: 1000Mi + # requests: + # cpu: "0.5" + # memory: "500Mi" + + ingress: + storage: + # -- Enable deployment of ingress for server component + enabled: false + + # -- Ingress annotations + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + + # -- Ingress extra labels + labels: {} + + # -- Ingress controller class name + ingressClassName: "" + + # -- Ingress path type + pathType: Prefix + + # -- Ingress default path + path: 
"" + + # -- Array of host objects + hosts: [] + # - vmstorage.domain.com + + # -- Extra paths to prepend to every host configuration. This is useful when working with annotation based services. + extraPaths: [] + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + + # -- Array of TLS objects + tls: [] + # - secretName: vmstorage-ingress-tls + # hosts: + # - vmstorage.domain.com + select: + # -- Enable deployment of ingress for server component + enabled: false + + # -- Ingress annotations + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + + # -- Ingress extra labels + labels: {} + + # -- Ingress controller class name + ingressClassName: "" + + # -- Ingress path type + pathType: Prefix + + # -- Ingress default path + path: '{{ dig "extraArgs" "http.pathPrefix" "/" .Values.vmcluster.spec.vmselect }}' + + # -- Array of host objects + hosts: [] + # - vmselect.domain.com + # -- Extra paths to prepend to every host configuration. This is useful when working with annotation based services. + extraPaths: [] + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + + # -- Array of TLS objects + tls: [] + # - secretName: vmselect-ingress-tls + # hosts: + # - vmselect.domain.com + insert: + # -- Enable deployment of ingress for server component + enabled: false + + # -- Ingress annotations + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + + # -- Ingress extra labels + labels: {} + + # -- Ingress controller class name + ingressClassName: "" + + # -- Ingress path type + pathType: Prefix + + # -- Ingress default path + path: '{{ dig "extraArgs" "http.pathPrefix" "/" .Values.vmcluster.spec.vminsert }}' + + # -- Array of host objects + hosts: [] + # - vminsert.domain.com + # -- Extra paths to prepend to every host configuration. 
This is useful when working with annotation based services. + extraPaths: [] + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + + # -- Array of TLS objects + tls: [] + # - secretName: vminsert-ingress-tls + # hosts: + # - vminsert.domain.com + +alertmanager: + # -- Create VMAlertmanager CR + enabled: false + # -- Alertmanager annotations + annotations: {} + # -- (object) Full spec for VMAlertmanager CRD. Allowed values described [here](https://docs.victoriametrics.com/operator/api#vmalertmanagerspec) + spec: + replicaCount: 1 + port: "9093" + selectAllByDefault: true + image: + tag: v0.28.1 + externalURL: "" + routePrefix: / + + # -- (string) If this one defined, it will be used for alertmanager configuration and config parameter will be ignored + configSecret: "" + # -- + # @raw + # enable storing .Values.alertmanager.config in VMAlertmanagerConfig instead of k8s Secret. + # Note: VMAlertmanagerConfig and plain Alertmanager config structures are not equal. + # If you're migrating existing config, please make sure that `.Values.alertmanager.config`: + # - with `useManagedConfig: false` has structure described [here](https://prometheus.io/docs/alerting/latest/configuration/). + # - with `useManagedConfig: true` has structure described [here](https://docs.victoriametrics.com/operator/api/#vmalertmanagerconfig). 
+ useManagedConfig: false + # -- (object) Alertmanager configuration + config: + route: + receiver: "blackhole" + # group_by: ["alertgroup", "job"] + # group_wait: 30s + # group_interval: 5m + # repeat_interval: 12h + # routes: + # + # # Duplicate code_owner routes to teams + # # These will send alerts to team channels but continue + # # processing through the rest of the tree to handled by on-call + # - matchers: + # - code_owner_channel!="" + # - severity=~"info|warning|critical" + # group_by: ["code_owner_channel", "alertgroup", "job"] + # receiver: slack-code-owners + # + # # Standard on-call routes + # - matchers: + # - severity=~"info|warning|critical" + # receiver: slack-monitoring + # continue: true + # + # inhibit_rules: + # - target_matchers: + # - severity=~"warning|info" + # source_matchers: + # - severity=critical + # equal: + # - cluster + # - namespace + # - alertname + # - target_matchers: + # - severity=info + # source_matchers: + # - severity=warning + # equal: + # - cluster + # - namespace + # - alertname + # - target_matchers: + # - severity=info + # source_matchers: + # - alertname=InfoInhibitor + # equal: + # - cluster + # - namespace + + receivers: + - name: blackhole + # - name: "slack-monitoring" + # slack_configs: + # - channel: "#channel" + # send_resolved: true + # title: '{{ template "slack.monzo.title" . }}' + # icon_emoji: '{{ template "slack.monzo.icon_emoji" . }}' + # color: '{{ template "slack.monzo.color" . }}' + # text: '{{ template "slack.monzo.text" . }}' + # actions: + # - type: button + # text: "Runbook :green_book:" + # url: "{{ (index .Alerts 0).Annotations.runbook_url }}" + # - type: button + # text: "Query :mag:" + # url: "{{ (index .Alerts 0).GeneratorURL }}" + # - type: button + # text: "Dashboard :grafana:" + # url: "{{ (index .Alerts 0).Annotations.dashboard }}" + # - type: button + # text: "Silence :no_bell:" + # url: '{{ template "__alert_silence_link" . 
}}' + # - type: button + # text: '{{ template "slack.monzo.link_button_text" . }}' + # url: "{{ .CommonAnnotations.link_url }}" + # - name: slack-code-owners + # slack_configs: + # - channel: "#{{ .CommonLabels.code_owner_channel }}" + # send_resolved: true + # title: '{{ template "slack.monzo.title" . }}' + # icon_emoji: '{{ template "slack.monzo.icon_emoji" . }}' + # color: '{{ template "slack.monzo.color" . }}' + # text: '{{ template "slack.monzo.text" . }}' + # actions: + # - type: button + # text: "Runbook :green_book:" + # url: "{{ (index .Alerts 0).Annotations.runbook }}" + # - type: button + # text: "Query :mag:" + # url: "{{ (index .Alerts 0).GeneratorURL }}" + # - type: button + # text: "Dashboard :grafana:" + # url: "{{ (index .Alerts 0).Annotations.dashboard }}" + # - type: button + # text: "Silence :no_bell:" + # url: '{{ template "__alert_silence_link" . }}' + # - type: button + # text: '{{ template "slack.monzo.link_button_text" . }}' + # url: "{{ .CommonAnnotations.link_url }}" + # + # -- Better alert templates for [slack source](https://gist.github.com/milesbxf/e2744fc90e9c41b47aa47925f8ff6512) + monzoTemplate: + enabled: true + + # -- (object) Extra alert templates + templateFiles: + {} + # template_1.tmpl: |- + # {{ define "hello" -}} + # hello, Victoria! + # {{- end }} + # template_2.tmpl: "" + + # -- (object) Alertmanager ingress configuration + ingress: + enabled: false + # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName + # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress + # ingressClassName: nginx + # Values can be templated + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: '{{ .Values.alertmanager.spec.routePrefix | default "/" }}' + pathType: Prefix + + hosts: + - alertmanager.domain.com + # -- Extra paths to prepend to every host configuration. 
This is useful when working with annotation based services. + extraPaths: [] + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + tls: [] + # - secretName: alertmanager-ingress-tls + # hosts: + # - alertmanager.domain.com + +vmalert: + # -- VMAlert annotations + annotations: {} + # -- Create VMAlert CR + enabled: false + + # -- Controls whether VMAlert should use VMAgent or VMInsert as a target for remotewrite + remoteWriteVMAgent: false + # -- (object) Full spec for VMAlert CRD. Allowed values described [here](https://docs.victoriametrics.com/operator/api#vmalertspec) + spec: + port: "8080" + selectAllByDefault: true + evaluationInterval: 20s + extraArgs: + http.pathPrefix: "/" + + # External labels to add to all generated recording rules and alerts + externalLabels: {} + + # -- (object) Extra VMAlert annotation templates + templateFiles: + {} + # template_1.tmpl: |- + # {{ define "hello" -}} + # hello, Victoria! + # {{- end }} + # template_2.tmpl: "" + + # -- Allows to configure static notifiers, discover notifiers via Consul and DNS, + # see specification [here](https://docs.victoriametrics.com/vmalert/#notifier-configuration-file). + # This configuration will be created as separate secret and mounted to VMAlert pod. 
+ additionalNotifierConfigs: {} + # dns_sd_configs: + # - names: + # - my.domain.com + # type: 'A' + # port: 9093 + # -- (object) VMAlert ingress config + ingress: + enabled: false + # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName + # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress + # ingressClassName: nginx + # Values can be templated + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: "" + pathType: Prefix + + hosts: + - vmalert.domain.com + # -- Extra paths to prepend to every host configuration. This is useful when working with annotation based services. + extraPaths: [] + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + tls: [] + # - secretName: vmalert-ingress-tls + # hosts: + # - vmalert.domain.com + +vmauth: + # -- Enable VMAuth CR + enabled: false + # -- VMAuth annotations + annotations: {} + # -- (object) Full spec for VMAuth CRD. 
Allowed values described [here](https://docs.victoriametrics.com/operator/api#vmauthspec) + # It's possible to use given below predefined variables in spec: + # * `{{ .vm.read }}` - parsed vmselect, vmsingle or external.vm.read URL + # * `{{ .vm.write }}` - parsed vminsert, vmsingle or external.vm.write URL + spec: + port: "8427" + ingress: + class_name: nginx + annotations: + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + cert-manager.io/cluster-issuer: main + host: o12y.benchmark.t09.de + tlsHosts: + - o12y.benchmark.t09.de + tlsSecretName: vmauth-tls-secret + unauthorizedUserAccessSpec: {} + selectAllByDefault: true + +vmagent: + # -- Create VMAgent CR + enabled: true + # -- VMAgent annotations + annotations: {} + # -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec) + additionalRemoteWrites: + # [] + - url: https://o12y.observability.buildth.ing/api/v1/write + basicAuth: + username: + name: simple-user-secret + key: username + password: + name: simple-user-secret + key: password + # -- (object) Full spec for VMAgent CRD. Allowed values described [here](https://docs.victoriametrics.com/operator/api#vmagentspec) + spec: + port: "8429" + selectAllByDefault: true + scrapeInterval: 20s + externalLabels: + cluster_environment: "benchmark" + # For multi-cluster setups it is useful to use "cluster" label to identify the metrics source. + # For example: + # cluster: cluster-name + extraArgs: + promscrape.streamParse: "true" + # Do not store original labels in vmagent's memory by default. This reduces the amount of memory used by vmagent + # but makes vmagent debugging UI less informative. 
See: https://docs.victoriametrics.com/vmagent/#relabel-debug + promscrape.dropOriginalLabels: "true" + # -- (object) VMAgent ingress configuration + ingress: + enabled: false + # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName + # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress + # ingressClassName: nginx + # Values can be templated + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: "" + pathType: Prefix + + hosts: + - vmagent.domain.com + extraPaths: [] + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + tls: [] + # - secretName: vmagent-ingress-tls + # hosts: + # - vmagent.domain.com + +defaultDatasources: + grafanaOperator: + # -- Create datasources as CRDs (requires grafana-operator to be installed) + enabled: false + annotations: {} + spec: + instanceSelector: + matchLabels: + dashboards: grafana + allowCrossNamespaceImport: false + victoriametrics: + # -- Create per replica prometheus compatible datasource + perReplica: false + # -- List of prometheus compatible datasource configurations. + # VM `url` will be added to each of them in templates. + datasources: + - name: VictoriaMetrics + type: prometheus + access: proxy + isDefault: true + - name: VictoriaMetrics (DS) + isDefault: false + access: proxy + type: victoriametrics-metrics-datasource + version: "0.15.1" + # -- List of alertmanager datasources. + # Alertmanager generated `url` will be added to each datasource in template if alertmanager is enabled + alertmanager: + # -- Create per replica alertmanager compatible datasource + perReplica: false + datasources: + - name: Alertmanager + access: proxy + jsonData: + implementation: prometheus + # -- Configure additional grafana datasources (passed through tpl). 
+  # Check [here](http://docs.grafana.org/administration/provisioning/#datasources) for details
+  extra:
+    - name: victoria-logs
+      access: proxy
+      type: victoriametrics-logs-datasource  # Grafana plugin ID; the display name "VictoriaLogs" is not a resolvable datasource type
+      url: http://vlogs-victorialogs:9428
+      version: 1
+
+# -- Grafana dependency chart configuration. For possible values refer [here](https://github.com/grafana/helm-charts/tree/main/charts/grafana#configuration)
+grafana:
+  enabled: false
+  # all values for grafana helm chart can be specified here
+  persistence:
+    enabled: true
+    type: pvc
+    storageClassName: "default"
+  sidecar:
+    datasources:
+      enabled: true
+      initDatasources: true
+      label: grafana_datasource
+    dashboards:
+      provider:
+        name: default
+        orgid: 1
+      folder: /var/lib/grafana/dashboards
+      defaultFolderName: default
+      enabled: true
+      multicluster: false
+
+  # -- Create datasource configmap even if grafana deployment has been disabled
+  forceDeployDatasource: false
+
+  # Uncomment the block below, if you want to enable VictoriaMetrics Datasource in Grafana:
+  # Note that Grafana will need internet access to install the datasource plugin.
+  #
+  # plugins:
+  #   - victoriametrics-metrics-datasource
+
+  ingress:
+    enabled: false
+    # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName
+    # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress
+    # ingressClassName: nginx
+    # Values can be templated
+    annotations:
+      {}
+      # kubernetes.io/ingress.class: nginx
+      # kubernetes.io/tls-acme: "true"
+    labels: {}
+    path: /
+    pathType: Prefix
+
+    hosts:
+      - grafana.domain.com
+    # -- Extra paths to prepend to every host configuration. This is useful when working with annotation based services.
+ extraPaths: [] + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + tls: [] + # - secretName: grafana-ingress-tls + # hosts: + # - grafana.domain.com + + # -- Grafana VM scrape config + vmScrape: + # whether we should create a service scrape resource for grafana + enabled: true + + # -- [Scrape configuration](https://docs.victoriametrics.com/operator/api#vmservicescrapespec) for Grafana + spec: + selector: + matchLabels: + app.kubernetes.io/name: '{{ include "grafana.name" .Subcharts.grafana }}' + endpoints: + - port: '{{ .Values.grafana.service.portName }}' + +# -- prometheus-node-exporter dependency chart configuration. For possible values check [here](https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus-node-exporter/values.yaml) +prometheus-node-exporter: + enabled: true + + # all values for prometheus-node-exporter helm chart can be specified here + service: + # Add the 'node-exporter' label to be used by serviceMonitor to match standard common usage in rules and grafana dashboards + # + labels: + jobLabel: node-exporter + extraArgs: + - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/.+)($|/) + - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|erofs|sysfs|tracefs)$ + # -- Node Exporter VM scrape config + vmScrape: + # whether we should create a service scrape resource for node-exporter + enabled: true + + # -- [Scrape configuration](https://docs.victoriametrics.com/operator/api#vmservicescrapespec) for Node Exporter + spec: + jobLabel: jobLabel + selector: + matchLabels: + app.kubernetes.io/name: '{{ include "prometheus-node-exporter.name" (index .Subcharts "prometheus-node-exporter") }}' + endpoints: + - port: metrics + metricRelabelConfigs: + - action: drop + 
source_labels: [mountpoint] + regex: "/var/lib/kubelet/pods.+" +# -- kube-state-metrics dependency chart configuration. For possible values check [here](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-state-metrics/values.yaml) +kube-state-metrics: + enabled: true + # -- [Scrape configuration](https://docs.victoriametrics.com/operator/api#vmservicescrapespec) for Kube State Metrics + vmScrape: + enabled: true + spec: + selector: + matchLabels: + app.kubernetes.io/name: '{{ include "kube-state-metrics.name" (index .Subcharts "kube-state-metrics") }}' + app.kubernetes.io/instance: '{{ include "vm.release" . }}' + endpoints: + - port: http + honorLabels: true + metricRelabelConfigs: + - action: labeldrop + regex: (uid|container_id|image_id) + jobLabel: app.kubernetes.io/name + +# -- Component scraping the kubelets +kubelet: + enabled: true + vmScrapes: + # -- Enable scraping /metrics/cadvisor from kubelet's service + cadvisor: + enabled: true + spec: + path: /metrics/cadvisor + # -- Enable scraping /metrics/probes from kubelet's service + probes: + enabled: true + spec: + path: /metrics/probes + # -- Enabled scraping /metrics/resource from kubelet's service + resources: + enabled: true + spec: + path: /metrics/resource + kubelet: + spec: {} + # -- Spec for VMNodeScrape CRD is [here](https://docs.victoriametrics.com/operator/api.html#vmnodescrapespec) + vmScrape: + kind: VMNodeScrape + spec: + scheme: "https" + honorLabels: true + interval: "30s" + scrapeTimeout: "5s" + tlsConfig: + insecureSkipVerify: true + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + # drop high cardinality label and useless metrics for cadvisor and kubelet + metricRelabelConfigs: + - action: labeldrop + regex: (uid) + - action: labeldrop + regex: (id|name) + - action: drop + source_labels: [__name__] + regex: 
(rest_client_request_duration_seconds_bucket|rest_client_request_duration_seconds_sum|rest_client_request_duration_seconds_count) + relabelConfigs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - sourceLabels: [__metrics_path__] + targetLabel: metrics_path + - targetLabel: job + replacement: kubelet + # ignore timestamps of cadvisor's metrics by default + # more info here https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4697#issuecomment-1656540535 + honorTimestamps: false +# Component scraping the kube api server +kubeApiServer: + # -- Enable Kube Api Server metrics scraping + enabled: true + # -- Spec for VMServiceScrape CRD is [here](https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec) + vmScrape: + spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + # bearerTokenSecret: + # key: "" + port: https + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + serverName: kubernetes + jobLabel: component + namespaceSelector: + matchNames: + - default + selector: + matchLabels: + component: apiserver + provider: kubernetes + +# Component scraping the kube controller manager +kubeControllerManager: + # -- Enable kube controller manager metrics scraping + enabled: true + + # -- If your kube controller manager is not deployed as a pod, specify IPs it can be found on + endpoints: [] + # - 10.141.4.22 + # - 10.141.4.23 + # - 10.141.4.24 + + # If using kubeControllerManager.endpoints only the port and targetPort are used + service: + # -- Create service for kube controller manager metrics scraping + enabled: true + # -- Kube controller manager service port + port: 10257 + # -- Kube controller manager service target port + targetPort: 10257 + # -- Kube controller manager service pod selector + selector: + component: kube-controller-manager + + # -- Spec for VMServiceScrape CRD is 
[here](https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec) + vmScrape: + spec: + jobLabel: jobLabel + namespaceSelector: + matchNames: + - kube-system + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + # bearerTokenSecret: + # key: "" + port: http-metrics + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + serverName: kubernetes + +# Component scraping kubeDns. Use either this or coreDns +kubeDns: + # -- Enabled KubeDNS metrics scraping + enabled: false + service: + # -- Create Service for KubeDNS metrics + enabled: false + # -- KubeDNS service ports + ports: + dnsmasq: + port: 10054 + targetPort: 10054 + skydns: + port: 10055 + targetPort: 10055 + # -- KubeDNS service pods selector + selector: + k8s-app: kube-dns + # -- Spec for VMServiceScrape CRD is [here](https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec) + vmScrape: + spec: + jobLabel: jobLabel + namespaceSelector: + matchNames: [kube-system] + endpoints: + - port: http-metrics-dnsmasq + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + - port: http-metrics-skydns + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + +# Component scraping coreDns. 
Use either this or kubeDns +coreDns: + # -- Enabled CoreDNS metrics scraping + enabled: true + service: + # -- Create service for CoreDNS metrics + enabled: true + # -- CoreDNS service port + port: 9153 + # -- CoreDNS service target port + targetPort: 9153 + # -- CoreDNS service pod selector + selector: + k8s-app: kube-dns + + # -- Spec for VMServiceScrape CRD is [here](https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec) + vmScrape: + spec: + jobLabel: jobLabel + namespaceSelector: + matchNames: [kube-system] + endpoints: + - port: http-metrics + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + +# Component scraping etcd +kubeEtcd: + # -- Enabled KubeETCD metrics scraping + enabled: true + + # -- If your etcd is not deployed as a pod, specify IPs it can be found on + endpoints: [] + # - 10.141.4.22 + # - 10.141.4.23 + # - 10.141.4.24 + + # Etcd service. If using kubeEtcd.endpoints only the port and targetPort are used + service: + # -- Enable service for ETCD metrics scraping + enabled: true + # -- ETCD service port + port: 2379 + # -- ETCD service target port + targetPort: 2379 + # -- ETCD service pods selector + selector: + component: etcd + + # -- Spec for VMServiceScrape CRD is [here](https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec) + vmScrape: + spec: + jobLabel: jobLabel + namespaceSelector: + matchNames: [kube-system] + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + # bearerTokenSecret: + # key: "" + port: http-metrics + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + +# Component scraping kube scheduler +kubeScheduler: + # -- Enable KubeScheduler metrics scraping + enabled: true + + # -- If your kube scheduler is not deployed as a pod, specify IPs it can be found on + endpoints: [] + # - 10.141.4.22 + # - 10.141.4.23 + # - 10.141.4.24 + + # If using kubeScheduler.endpoints only the port and targetPort are used + 
service: + # -- Enable service for KubeScheduler metrics scrape + enabled: true + # -- KubeScheduler service port + port: 10259 + # -- KubeScheduler service target port + targetPort: 10259 + # -- KubeScheduler service pod selector + selector: + component: kube-scheduler + + # -- Spec for VMServiceScrape CRD is [here](https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec) + vmScrape: + spec: + jobLabel: jobLabel + namespaceSelector: + matchNames: [kube-system] + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + # bearerTokenSecret: + # key: "" + port: http-metrics + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + +# Component scraping kube proxy +kubeProxy: + # -- Enable kube proxy metrics scraping + enabled: false + + # -- If your kube proxy is not deployed as a pod, specify IPs it can be found on + endpoints: [] + # - 10.141.4.22 + # - 10.141.4.23 + # - 10.141.4.24 + + service: + # -- Enable service for kube proxy metrics scraping + enabled: true + # -- Kube proxy service port + port: 10249 + # -- Kube proxy service target port + targetPort: 10249 + # -- Kube proxy service pod selector + selector: + k8s-app: kube-proxy + + # -- Spec for VMServiceScrape CRD is [here](https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec) + vmScrape: + spec: + jobLabel: jobLabel + namespaceSelector: + matchNames: [kube-system] + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + # bearerTokenSecret: + # key: "" + port: http-metrics + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + +# -- Add extra objects dynamically to this chart +extraObjects: [] + diff --git a/otc/benchmark.t09.de/stacks/observability/grafana-operator.yaml b/otc/benchmark.t09.de/stacks/observability/grafana-operator.yaml new file mode 100644 index 0000000..6c208d5 --- /dev/null +++ 
b/otc/benchmark.t09.de/stacks/observability/grafana-operator.yaml
@@ -0,0 +1,25 @@
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: grafana-operator
+  namespace: argocd
+  labels:
+    env: dev
+spec:
+  project: default
+  syncPolicy:
+    automated:
+      selfHeal: true
+    syncOptions:
+      - CreateNamespace=true
+      - ServerSideApply=true
+  destination:
+    name: in-cluster
+    namespace: observability
+  sources:
+    - chart: grafana-operator
+      repoURL: ghcr.io/grafana/helm-charts
+      targetRevision: v5.18.0
+    - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances
+      targetRevision: HEAD
+      path: "otc/benchmark.t09.de/stacks/observability/grafana-operator/manifests"  # directory holding the Grafana CR and GrafanaDashboard manifests added in this patch
diff --git a/otc/benchmark.t09.de/stacks/observability/grafana-operator/manifests/argocd.yaml b/otc/benchmark.t09.de/stacks/observability/grafana-operator/manifests/argocd.yaml
new file mode 100644
index 0000000..b348ff7
--- /dev/null
+++ b/otc/benchmark.t09.de/stacks/observability/grafana-operator/manifests/argocd.yaml
@@ -0,0 +1,9 @@
+apiVersion: grafana.integreatly.org/v1beta1
+kind: GrafanaDashboard
+metadata:
+  name: argocd
+spec:
+  instanceSelector:
+    matchLabels:
+      dashboards: "grafana"  # same label the Grafana CR in grafana.yaml carries
+  url: "https://raw.githubusercontent.com/argoproj/argo-cd/refs/heads/master/examples/dashboard.json"  # NOTE(review): points at a branch head, so the dashboard is not pinned to a revision
diff --git a/otc/benchmark.t09.de/stacks/observability/grafana-operator/manifests/grafana.yaml b/otc/benchmark.t09.de/stacks/observability/grafana-operator/manifests/grafana.yaml
new file mode 100644
index 0000000..0989872
--- /dev/null
+++ b/otc/benchmark.t09.de/stacks/observability/grafana-operator/manifests/grafana.yaml
@@ -0,0 +1,75 @@
+apiVersion: grafana.integreatly.org/v1beta1
+kind: Grafana
+metadata:
+  name: grafana
+  labels:
+    dashboards: "grafana"  # label the GrafanaDashboard instanceSelector entries in this directory match on
+spec:
+  persistentVolumeClaim:
+    metadata:
+      annotations:
+        everest.io/disk-volume-type: GPSSD
+        everest.io/crypt-key-id: ac5a45e8-c705-445e-8026-e643e3f2525d
+    spec:
+      storageClassName: csi-disk
+      accessModes:
+        - ReadWriteOnce
+      resources:
+        requests:
+          storage: 10Gi
+  deployment:
+    spec:
+      template:
+        spec:
+          containers:
+            - name: grafana
+              env:
+                - name: OAUTH_CLIENT_SECRET  # read below as $__env{OAUTH_CLIENT_SECRET} in auth.generic_oauth
+                  valueFrom:
+                    secretKeyRef:
+                      key: clientSecret
+                      name: dex-grafana-client
+  config:
+    log.console:
+      level: debug  # NOTE(review): debug-level console logging; confirm it is meant to stay enabled
+    server:
+      root_url: "https://grafana.benchmark.t09.de"
+    auth:
+      disable_login: "true"
+      disable_login_form: "true"
+    auth.generic_oauth:
+      enabled: "true"
+      name: Forgejo
+      allow_sign_up: "true"
+      use_refresh_token: "true"
+      client_id: grafana
+      client_secret: $__env{OAUTH_CLIENT_SECRET}
+      scopes: openid email profile offline_access groups
+      auth_url: https://dex.benchmark.t09.de/auth
+      token_url: https://dex.benchmark.t09.de/token
+      api_url: https://dex.benchmark.t09.de/userinfo
+      redirect_uri: https://grafana.benchmark.t09.de/login/generic_oauth
+      role_attribute_path: "contains(groups[*], 'DevFW') && 'GrafanaAdmin' || 'None'"  # members of group DevFW map to GrafanaAdmin, everyone else to None
+      allow_assign_grafana_admin: "true"
+  ingress:
+    metadata:
+      annotations:
+        cert-manager.io/cluster-issuer: main
+        nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
+    spec:
+      ingressClassName: nginx
+      rules:
+        - host: grafana.benchmark.t09.de
+          http:
+            paths:
+              - backend:
+                  service:
+                    name: grafana-service
+                    port:
+                      number: 3000
+                path: /
+                pathType: Prefix
+      tls:
+        - hosts:
+            - grafana.benchmark.t09.de
+          secretName: grafana-net-tls
diff --git a/otc/benchmark.t09.de/stacks/observability/grafana-operator/manifests/ingress-nginx.yaml b/otc/benchmark.t09.de/stacks/observability/grafana-operator/manifests/ingress-nginx.yaml
new file mode 100644
index 0000000..c13d6a2
--- /dev/null
+++ b/otc/benchmark.t09.de/stacks/observability/grafana-operator/manifests/ingress-nginx.yaml
@@ -0,0 +1,9 @@
+apiVersion: grafana.integreatly.org/v1beta1
+kind: GrafanaDashboard
+metadata:
+  name: ingress-nginx
+spec:
+  instanceSelector:
+    matchLabels:
+      dashboards: "grafana"  # same label the Grafana CR in grafana.yaml carries
+  url: "https://raw.githubusercontent.com/adinhodovic/ingress-nginx-mixin/refs/heads/main/dashboards_out/ingress-nginx-overview.json"  # NOTE(review): points at a branch head, so the dashboard is not pinned to a revision
diff --git a/otc/benchmark.t09.de/stacks/observability/grafana-operator/manifests/victoria-logs.yaml b/otc/benchmark.t09.de/stacks/observability/grafana-operator/manifests/victoria-logs.yaml
new file mode 100644
index 0000000..4018fbd
--- /dev/null
+++ b/otc/benchmark.t09.de/stacks/observability/grafana-operator/manifests/victoria-logs.yaml
@@ -0,0 +1,9 @@
+apiVersion: grafana.integreatly.org/v1beta1
+kind: GrafanaDashboard
+metadata:
+  name: victoria-logs
+spec:
+  instanceSelector:
+    matchLabels:
+      dashboards: "grafana"  # same label the Grafana CR in grafana.yaml carries
+  url: "https://raw.githubusercontent.com/VictoriaMetrics/VictoriaMetrics/refs/heads/master/dashboards/vm/victorialogs.json"  # NOTE(review): points at a branch head, so the dashboard is not pinned to a revision
diff --git a/otc/benchmark.t09.de/stacks/observability/victoria-k8s-stack.yaml b/otc/benchmark.t09.de/stacks/observability/victoria-k8s-stack.yaml
new file mode 100644
index 0000000..3a6506f
--- /dev/null
+++ b/otc/benchmark.t09.de/stacks/observability/victoria-k8s-stack.yaml
@@ -0,0 +1,31 @@
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: o12y
+  namespace: argocd
+  labels:
+    env: dev
+spec:
+  project: default
+  syncPolicy:
+    automated:
+      selfHeal: true
+    syncOptions:
+      - CreateNamespace=true
+      - ServerSideApply=true
+  destination:
+    name: in-cluster
+    namespace: observability
+  sources:
+    - chart: victoria-metrics-k8s-stack
+      repoURL: https://victoriametrics.github.io/helm-charts/
+      targetRevision: 0.48.1
+      helm:
+        valueFiles:
+          - $values/otc/benchmark.t09.de/stacks/observability/victoria-k8s-stack/values.yaml  # $values is the source entry below that sets `ref: values`
+    - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances
+      targetRevision: HEAD
+      ref: values
+    - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances
+      targetRevision: HEAD
+      path: "otc/benchmark.t09.de/stacks/observability/victoria-k8s-stack/manifests"  # directory holding alerts.yaml, vlogs.yaml and vmauth.yaml from this patch
diff --git a/otc/benchmark.t09.de/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml b/otc/benchmark.t09.de/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml
new file mode 100644
index 0000000..110ee7e
--- /dev/null
+++ b/otc/benchmark.t09.de/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml
@@ -0,0 +1,40 @@
+apiVersion: operator.victoriametrics.com/v1beta1
+kind: VMRule
+metadata:
+  name: forgejo-alerts
+  namespace: observability
+spec:
+  groups:
+    - name: forgejo
+      rules:
+        - alert: forgejo down
+          expr: sum by(cluster_environment) (up{pod=~"forgejo-server-.*"}) < 1
+          for: 30s
+          labels:
+            severity: critical
+            job: "{{ $labels.job }}"  # NOTE(review): expr keeps only cluster_environment, so $labels.job is presumably empty here; confirm
+          annotations:
+            value: "{{ $value }}"
+            description: 'forgejo is down in cluster environment {{ $labels.cluster_environment }}'
+    - name: forgejo-backup
+      rules:
+        - alert: forgejo s3 backup job failed
+          expr: max by(cluster_environment) (kube_job_status_failed{job_name=~"forgejo-s3-backup-.*"}) != 0
+          for: 30s
+          labels:
+            severity: critical
+            job: "{{ $labels.job }}"  # NOTE(review): expr keeps only cluster_environment, so $labels.job is presumably empty here; confirm
+          annotations:
+            value: "{{ $value }}"
+            description: 'forgejo s3 backup job failed in cluster environment {{ $labels.cluster_environment }}'
+    - name: disk-consumption-high
+      rules:
+        - alert: disk consumption high
+          expr: 1-(kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.6  # fires once a volume is more than 60% used
+          for: 30s
+          labels:
+            severity: major  # NOTE(review): the other rules in this file use `critical`; confirm `major` is routed by the receiving Alertmanager
+            job: "{{ $labels.job }}"
+          annotations:
+            value: "{{ $value }}"
+            description: 'disk consumption of pvc {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is high in cluster environment {{ $labels.cluster_environment }}'
diff --git a/otc/benchmark.t09.de/stacks/observability/victoria-k8s-stack/manifests/vlogs.yaml b/otc/benchmark.t09.de/stacks/observability/victoria-k8s-stack/manifests/vlogs.yaml
new file mode 100644
index 0000000..2247375
--- /dev/null
+++ b/otc/benchmark.t09.de/stacks/observability/victoria-k8s-stack/manifests/vlogs.yaml
@@ -0,0 +1,26 @@
+apiVersion: operator.victoriametrics.com/v1beta1
+kind: VLogs
+metadata:
+  name: victorialogs
+  namespace: observability
+spec:
+  retentionPeriod: "12"  # NOTE(review): no unit suffix; presumably counted in months, confirm against the VLogs spec
+  removePvcAfterDelete: true
+  storageMetadata:
+    annotations:
+      everest.io/crypt-key-id: ac5a45e8-c705-445e-8026-e643e3f2525d
+      everest.io/disk-volume-type: GPSSD
+  storage:
+    storageClassName: csi-disk
+    accessModes:
+      - ReadWriteOnce
+    resources:
+      requests:
+        storage: 50Gi
+  resources:
+    requests:
+      memory: 500Mi
+      cpu: 500m
+    limits:
+      memory: 10Gi
+      cpu: 2
diff --git a/otc/benchmark.t09.de/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml b/otc/benchmark.t09.de/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml
new file mode 100644
index 0000000..5759093
--- /dev/null
+++ b/otc/benchmark.t09.de/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml
@@ -0,0 +1,17 @@
+apiVersion: operator.victoriametrics.com/v1beta1
+kind: VMUser
+metadata:
+  name: simple-user
+  namespace: observability
+spec:
+  username: simple-user
+  passwordRef:
+    key: password
+    name: simple-user-secret
+  targetRefs:
+    - static:
+        url: http://vmsingle-o12y:8429
+      paths: ["/api/v1/write"]  # metrics remote-write path
+    - static:
+        url: http://vlogs-victorialogs:9428
+      paths: ["/insert/elasticsearch/.*"]  # log ingestion paths on the VLogs instance
diff --git a/otc/benchmark.t09.de/stacks/observability/victoria-k8s-stack/values.yaml b/otc/benchmark.t09.de/stacks/observability/victoria-k8s-stack/values.yaml
new file mode 100644
index 0000000..999f596
--- /dev/null
+++ b/otc/benchmark.t09.de/stacks/observability/victoria-k8s-stack/values.yaml
@@ -0,0 +1,1230 @@
+global:
+  # -- Cluster label to use for dashboards and rules
+  clusterLabel: cluster
+  # -- Global license configuration
+  license:
+    key: ""
+    keyRef: {}
+    # name: secret-license
+    # key: license
+  cluster:
+    # -- K8s cluster domain suffix, used for building storage pods' FQDN. Details are [here](https://kubernetes.io/docs/tasks/administer-cluster/dns-custom-nameservers/)
+    dnsDomain: cluster.local.
+ +# -- Override chart name +nameOverride: "" +# -- Resource full name override +fullnameOverride: "o12y" +# -- Tenant to use for Grafana datasources and remote write +tenant: "0" +# -- If this chart is used in "Argocd" with "releaseName" field then +# VMServiceScrapes couldn't select the proper services. +# For correct working need set value 'argocdReleaseOverride=$ARGOCD_APP_NAME' +argocdReleaseOverride: "o12y" + +# -- VictoriaMetrics Operator dependency chart configuration. More values can be found [here](https://docs.victoriametrics.com/helm/victoriametrics-operator#parameters). Also checkout [here](https://docs.victoriametrics.com/operator/vars) possible ENV variables to configure operator behaviour +victoria-metrics-operator: + enabled: true + crds: + plain: true + cleanup: + enabled: true + image: + repository: bitnami/kubectl + pullPolicy: IfNotPresent + serviceMonitor: + enabled: true + operator: + # -- By default, operator converts prometheus-operator objects. + disable_prometheus_converter: false + # group pinguin added the admissionWebhooks value according to https://docs.victoriametrics.com/helm/victoriametrics-k8s-stack/#argocd-issues + admissionWebhooks: + certManager: + enabled: true + +defaultDashboards: + # -- Enable custom dashboards installation + enabled: true + defaultTimezone: utc + labels: {} + annotations: {} + grafanaOperator: + # -- Create dashboards as CRDs (requires grafana-operator to be installed) + enabled: true + spec: + instanceSelector: + matchLabels: + dashboards: grafana + allowCrossNamespaceImport: false + # -- Create dashboards as ConfigMap despite dependency it requires is not installed + dashboards: + victoriametrics-vmalert: + enabled: true + victoriametrics-operator: + enabled: true + # -- In ArgoCD using client-side apply this dashboard reaches annotations size limit and causes k8s issues without server side apply + # See [this 
issue](https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-k8s-stack#metadataannotations-too-long-must-have-at-most-262144-bytes-on-dashboards) + node-exporter-full: + enabled: true + +# -- Create default rules for monitoring the cluster +defaultRules: + # -- Labels, which are used for grouping results of the queries. Note that these labels are joined with `.Values.global.clusterLabel` + additionalGroupByLabels: [] + create: true + + # -- Common properties for VMRule groups + group: + spec: + # -- Optional HTTP URL parameters added to each rule request + params: {} + + # -- Common properties for all VMRules + rule: + spec: + # -- Additional labels for all VMRules + labels: {} + # -- Additional annotations for all VMRules + annotations: {} + + # -- Common properties for VMRules alerts + alerting: + spec: + # -- Additional labels for VMRule alerts + labels: {} + # -- Additional annotations for VMRule alerts + annotations: {} + + # -- Common properties for VMRules recording rules + recording: + spec: + # -- Additional labels for VMRule recording rules + labels: {} + # -- Additional annotations for VMRule recording rules + annotations: {} + + # -- Per rule properties + rules: {} + # CPUThrottlingHigh: + # create: true + # spec: + # for: 15m + # labels: + # severity: critical + # -- Rule group properties + groups: + etcd: + create: true + # -- Common properties for all rules in a group + rules: {} + # spec: + # annotations: + # dashboard: https://example.com/dashboard/1 + general: + create: true + rules: {} + k8sContainerCpuLimits: + create: true + rules: {} + k8sContainerCpuRequests: + create: true + rules: {} + k8sContainerCpuUsageSecondsTotal: + create: true + rules: {} + k8sContainerMemoryLimits: + create: true + rules: {} + k8sContainerMemoryRequests: + create: true + rules: {} + k8sContainerMemoryRss: + create: true + rules: {} + k8sContainerMemoryCache: + create: true + rules: {} + k8sContainerMemoryWorkingSetBytes: + create: true + 
rules: {} + k8sContainerMemorySwap: + create: true + rules: {} + k8sPodOwner: + create: true + rules: {} + k8sContainerResource: + create: true + rules: {} + kubeApiserver: + create: true + rules: {} + kubeApiserverAvailability: + create: true + rules: {} + kubeApiserverBurnrate: + create: true + rules: {} + kubeApiserverHistogram: + create: true + rules: {} + kubeApiserverSlos: + create: true + rules: {} + kubelet: + create: true + rules: {} + kubePrometheusGeneral: + create: true + rules: {} + kubePrometheusNodeRecording: + create: true + rules: {} + kubernetesApps: + create: true + rules: {} + targetNamespace: ".*" + kubernetesResources: + create: true + rules: {} + kubernetesStorage: + create: true + rules: {} + targetNamespace: ".*" + kubernetesSystem: + create: true + rules: {} + kubernetesSystemKubelet: + create: true + rules: {} + kubernetesSystemApiserver: + create: true + rules: {} + kubernetesSystemControllerManager: + create: false + rules: {} + kubeScheduler: + create: false + rules: {} + kubernetesSystemScheduler: + create: false + rules: {} + kubeStateMetrics: + create: true + rules: {} + nodeNetwork: + create: true + rules: {} + node: + create: true + rules: {} + vmagent: + create: true + rules: {} + vmsingle: + create: true + rules: {} + vmcluster: + create: true + rules: {} + vmHealth: + create: true + rules: {} + vmoperator: + create: true + rules: {} + alertmanager: + create: true + rules: {} + + # -- Runbook url prefix for default rules + runbookUrl: https://runbooks.prometheus-operator.dev/runbooks + + # -- Labels for default rules + labels: {} + # -- Annotations for default rules + annotations: {} + +# -- Provide custom recording or alerting rules to be deployed into the cluster. 
+additionalVictoriaMetricsMap: +# rule-name: +# groups: +# - name: my_group +# rules: +# - record: my_record +# expr: 100 * my_record + +external: + grafana: + # -- External Grafana host + host: "" + # -- External Grafana datasource name + datasource: VictoriaMetrics + # -- External VM read and write URLs + vm: + read: + url: "" + # bearerTokenSecret: + # name: dbaas-read-access-token + # key: bearerToken + write: + url: "" + # bearerTokenSecret: + # name: dbaas-read-access-token + # key: bearerToken + +# Configures vmsingle params +vmsingle: + # -- VMSingle annotations + annotations: {} + # -- Create VMSingle CR + enabled: true + # -- Full spec for VMSingle CRD. Allowed values described [here](https://docs.victoriametrics.com/operator/api#vmsinglespec) + spec: + port: "8429" + # -- Data retention period. Possible units character: h(ours), d(ays), w(eeks), y(ears), if no unit character specified - month. The minimum retention period is 24h. See these [docs](https://docs.victoriametrics.com/single-server-victoriametrics/#retention) + retentionPeriod: "1" + replicaCount: 1 + extraArgs: {} + storageMetadata: + annotations: + everest.io/crypt-key-id: ac5a45e8-c705-445e-8026-e643e3f2525d + everest.io/disk-volume-type: GPSSD + storage: + storageClassName: csi-disk + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 20Gi + ingress: + # -- Enable deployment of ingress for server component + enabled: false + # -- Ingress annotations + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + # -- Ingress extra labels + labels: {} + # -- Ingress default path + path: "" + # -- Ingress path type + pathType: Prefix + # -- Ingress controller class name + ingressClassName: "" + + # -- Array of host objects + hosts: [] + # - vmsingle.domain.com + # -- Extra paths to prepend to every host configuration. This is useful when working with annotation based services. 
+ extraPaths: [] + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + + # -- Array of TLS objects + tls: [] + # - secretName: vmsingle-ingress-tls + # hosts: + # - vmsingle.domain.com + +vmcluster: + # -- Create VMCluster CR + enabled: false + # -- VMCluster annotations + annotations: {} + # -- Full spec for VMCluster CRD. Allowed values described [here](https://docs.victoriametrics.com/operator/api#vmclusterspec) + spec: + # -- Data retention period. Possible units character: h(ours), d(ays), w(eeks), y(ears), if no unit character specified - month. The minimum retention period is 24h. See these [docs](https://docs.victoriametrics.com/single-server-victoriametrics/#retention) + retentionPeriod: "1" + replicationFactor: 2 + vmstorage: + replicaCount: 2 + storageDataPath: /vm-data + storage: + volumeClaimTemplate: + spec: + resources: + requests: + storage: 10Gi + resources: + {} + # limits: + # cpu: "1" + # memory: 1500Mi + vmselect: + # -- Set this value to false to disable VMSelect + enabled: true + port: "8481" + replicaCount: 2 + cacheMountPath: /select-cache + extraArgs: {} + storage: + volumeClaimTemplate: + spec: + resources: + requests: + storage: 2Gi + resources: + {} + # limits: + # cpu: "1" + # memory: "1000Mi" + # requests: + # cpu: "0.5" + # memory: "500Mi" + vminsert: + # -- Set this value to false to disable VMInsert + enabled: true + port: "8480" + replicaCount: 2 + extraArgs: {} + resources: + {} + # limits: + # cpu: "1" + # memory: 1000Mi + # requests: + # cpu: "0.5" + # memory: "500Mi" + + ingress: + storage: + # -- Enable deployment of ingress for server component + enabled: false + + # -- Ingress annotations + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + + # -- Ingress extra labels + labels: {} + + # -- Ingress controller class name + ingressClassName: "" + + # -- Ingress path type + pathType: Prefix + + # -- Ingress default path + path: 
"" + + # -- Array of host objects + hosts: [] + # - vmstorage.domain.com + + # -- Extra paths to prepend to every host configuration. This is useful when working with annotation based services. + extraPaths: [] + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + + # -- Array of TLS objects + tls: [] + # - secretName: vmstorage-ingress-tls + # hosts: + # - vmstorage.domain.com + select: + # -- Enable deployment of ingress for server component + enabled: false + + # -- Ingress annotations + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + + # -- Ingress extra labels + labels: {} + + # -- Ingress controller class name + ingressClassName: "" + + # -- Ingress path type + pathType: Prefix + + # -- Ingress default path + path: '{{ dig "extraArgs" "http.pathPrefix" "/" .Values.vmcluster.spec.vmselect }}' + + # -- Array of host objects + hosts: [] + # - vmselect.domain.com + # -- Extra paths to prepend to every host configuration. This is useful when working with annotation based services. + extraPaths: [] + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + + # -- Array of TLS objects + tls: [] + # - secretName: vmselect-ingress-tls + # hosts: + # - vmselect.domain.com + insert: + # -- Enable deployment of ingress for server component + enabled: false + + # -- Ingress annotations + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + + # -- Ingress extra labels + labels: {} + + # -- Ingress controller class name + ingressClassName: "" + + # -- Ingress path type + pathType: Prefix + + # -- Ingress default path + path: '{{ dig "extraArgs" "http.pathPrefix" "/" .Values.vmcluster.spec.vminsert }}' + + # -- Array of host objects + hosts: [] + # - vminsert.domain.com + # -- Extra paths to prepend to every host configuration. 
This is useful when working with annotation based services. + extraPaths: [] + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + + # -- Array of TLS objects + tls: [] + # - secretName: vminsert-ingress-tls + # hosts: + # - vminsert.domain.com + +alertmanager: + # -- Create VMAlertmanager CR + enabled: true + # -- Alertmanager annotations + annotations: {} + # -- (object) Full spec for VMAlertmanager CRD. Allowed values described [here](https://docs.victoriametrics.com/operator/api#vmalertmanagerspec) + spec: + replicaCount: 1 + port: "9093" + selectAllByDefault: true + image: + tag: v0.28.1 + externalURL: "" + routePrefix: / + + # -- (string) If this one defined, it will be used for alertmanager configuration and config parameter will be ignored + configSecret: "" + # -- + # @raw + # enable storing .Values.alertmanager.config in VMAlertmanagerConfig instead of k8s Secret. + # Note: VMAlertmanagerConfig and plain Alertmanager config structures are not equal. + # If you're migrating existing config, please make sure that `.Values.alertmanager.config`: + # - with `useManagedConfig: false` has structure described [here](https://prometheus.io/docs/alerting/latest/configuration/). + # - with `useManagedConfig: true` has structure described [here](https://docs.victoriametrics.com/operator/api/#vmalertmanagerconfig). 
+ useManagedConfig: true + # -- (object) Alertmanager configuration + config: + route: + receiver: "blackhole" + routes: + - matchers: + - severity=~"critical|major" + receiver: outlook + receivers: + - name: blackhole + - name: outlook + email_configs: + - smarthost: 'mail.mms-support.de:465' + auth_username: 'ipcei-cis-devfw@mms-support.de' + auth_password: + name: email-user-credentials + key: connection-string + from: '"IPCEI CIS DevFW" ' + to: 'f9f9953a.mg.telekom.de@de.teams.ms' + headers: + subject: 'Grafana Mail Alerts' + require_tls: false + + # -- Better alert templates for [slack source](https://gist.github.com/milesbxf/e2744fc90e9c41b47aa47925f8ff6512) + monzoTemplate: + enabled: true + + # -- (object) Extra alert templates + templateFiles: + {} + # template_1.tmpl: |- + # {{ define "hello" -}} + # hello, Victoria! + # {{- end }} + # template_2.tmpl: "" + + # -- (object) Alertmanager ingress configuration + ingress: + enabled: false + # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName + # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress + # ingressClassName: nginx + # Values can be templated + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: '{{ .Values.alertmanager.spec.routePrefix | default "/" }}' + pathType: Prefix + + hosts: + - alertmanager.domain.com + # -- Extra paths to prepend to every host configuration. This is useful when working with annotation based services. 
+ extraPaths: [] + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + tls: [] + # - secretName: alertmanager-ingress-tls + # hosts: + # - alertmanager.domain.com + +vmalert: + # -- VMAlert annotations + annotations: {} + # -- Create VMAlert CR + enabled: true + + # -- Controls whether VMAlert should use VMAgent or VMInsert as a target for remotewrite + remoteWriteVMAgent: false + # -- (object) Full spec for VMAlert CRD. Allowed values described [here](https://docs.victoriametrics.com/operator/api#vmalertspec) + spec: + port: "8080" + selectAllByDefault: true + evaluationInterval: 20s + extraArgs: + http.pathPrefix: "/" + + # External labels to add to all generated recording rules and alerts + externalLabels: {} + + # -- (object) Extra VMAlert annotation templates + templateFiles: + {} + # template_1.tmpl: |- + # {{ define "hello" -}} + # hello, Victoria! + # {{- end }} + # template_2.tmpl: "" + + # -- Allows to configure static notifiers, discover notifiers via Consul and DNS, + # see specification [here](https://docs.victoriametrics.com/vmalert/#notifier-configuration-file). + # This configuration will be created as separate secret and mounted to VMAlert pod. + additionalNotifierConfigs: {} + # dns_sd_configs: + # - names: + # - my.domain.com + # type: 'A' + # port: 9093 + # -- (object) VMAlert ingress config + ingress: + enabled: false + # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName + # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress + # ingressClassName: nginx + # Values can be templated + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: "" + pathType: Prefix + + hosts: + - vmalert.domain.com + # -- Extra paths to prepend to every host configuration. 
This is useful when working with annotation based services. + extraPaths: [] + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + tls: [] + # - secretName: vmalert-ingress-tls + # hosts: + # - vmalert.domain.com + +vmauth: + # -- Enable VMAuth CR + enabled: true + # -- VMAuth annotations + annotations: {} + # -- (object) Full spec for VMAuth CRD. Allowed values described [here](https://docs.victoriametrics.com/operator/api#vmauthspec) + # It's possible to use given below predefined variables in spec: + # * `{{ .vm.read }}` - parsed vmselect, vmsingle or external.vm.read URL + # * `{{ .vm.write }}` - parsed vminsert, vmsingle or external.vm.write URL + spec: + port: "8427" + ingress: + class_name: nginx + annotations: + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + cert-manager.io/cluster-issuer: main + host: o12y.observability. + tlsHosts: + - o12y.observability. + tlsSecretName: vmauth-tls-secret + unauthorizedUserAccessSpec: {} + selectAllByDefault: true + +vmagent: + # -- Create VMAgent CR + enabled: false + # -- VMAgent annotations + annotations: {} + # -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec) + additionalRemoteWrites: + [] + #- url: http://some-remote-write/api/v1/write + # -- (object) Full spec for VMAgent CRD. Allowed values described [here](https://docs.victoriametrics.com/operator/api#vmagentspec) + spec: + port: "8429" + selectAllByDefault: true + scrapeInterval: 20s + externalLabels: {} + # For multi-cluster setups it is useful to use "cluster" label to identify the metrics source. + # For example: + # cluster: cluster-name + extraArgs: + promscrape.streamParse: "true" + # Do not store original labels in vmagent's memory by default. This reduces the amount of memory used by vmagent + # but makes vmagent debugging UI less informative. 
See: https://docs.victoriametrics.com/vmagent/#relabel-debug + promscrape.dropOriginalLabels: "true" + # -- (object) VMAgent ingress configuration + ingress: + enabled: false + # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName + # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress + # ingressClassName: nginx + # Values can be templated + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: "" + pathType: Prefix + + hosts: + - vmagent.domain.com + extraPaths: [] + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + tls: [] + # - secretName: vmagent-ingress-tls + # hosts: + # - vmagent.domain.com + +defaultDatasources: + grafanaOperator: + # -- Create datasources as CRDs (requires grafana-operator to be installed) + enabled: true + annotations: {} + spec: + plugins: + - name: victoriametrics-metrics-datasource + version: 0.16.0 + - name: victoriametrics-logs-datasource + version: 0.17.0 + instanceSelector: + matchLabels: + dashboards: grafana + allowCrossNamespaceImport: false + victoriametrics: + # -- Create per replica prometheus compatible datasource + perReplica: false + # -- List of prometheus compatible datasource configurations. + # VM `url` will be added to each of them in templates. + datasources: + - name: VictoriaMetrics + type: prometheus + access: proxy + isDefault: true + - name: VictoriaMetrics (DS) + isDefault: false + access: proxy + type: victoriametrics-metrics-datasource + version: "0.15.1" + # -- List of alertmanager datasources. 
+ # Alertmanager generated `url` will be added to each datasource in template if alertmanager is enabled + alertmanager: + # -- Create per replica alertmanager compatible datasource + perReplica: false + datasources: + - name: Alertmanager + access: proxy + jsonData: + implementation: prometheus + # -- Configure additional grafana datasources (passed through tpl). + # Check [here](http://docs.grafana.org/administration/provisioning/#datasources) for details + extra: + - name: VictoriaLogs + access: proxy + type: victoriametrics-logs-datasource + url: http://vlogs-victorialogs:9428 + version: 0.18.0 + +# -- Grafana dependency chart configuration. For possible values refer [here](https://github.com/grafana/helm-charts/tree/main/charts/grafana#configuration) +grafana: + enabled: false + # all values for grafana helm chart can be specified here + persistence: + enabled: false + type: pvc + storageClassName: "default" + grafana.ini: + # auth: + # login_maximum_inactive_lifetime_duration: 0 + # login_maximum_lifetime_duration: 0 + security: + disable_brute_force_login_protection: true + sidecar: + datasources: + enabled: true + initDatasources: true + label: grafana_datasource + dashboards: + provider: + name: default + orgid: 1 + folder: /var/lib/grafana/dashboards + defaultFolderName: default + enabled: true + multicluster: false + + # dashboards: + # default: + # victoria-logs: + # url: "https://raw.githubusercontent.com/VictoriaMetrics/VictoriaMetrics/refs/heads/master/dashboards/vm/victorialogs.json" + # victoria-logs-explorer: + # url: "https://grafana.com/api/dashboards/22759/revisions/6/download" + # ingress-nginx: + # url: "https://raw.githubusercontent.com/adinhodovic/ingress-nginx-mixin/refs/heads/main/dashboards_out/ingress-nginx-overview.json" + # argocd: + # url: "https://raw.githubusercontent.com/argoproj/argo-cd/refs/heads/master/examples/dashboard.json" + + # -- Create datasource configmap even if grafana deployment has been disabled + 
forceDeployDatasource: true + + # Uncomment the block below, if you want to enable VictoriaMetrics Datasource in Grafana: + # Note that Grafana will need internet access to install the datasource plugin. + + plugins: + - victoriametrics-metrics-datasource + - victoriametrics-logs-datasource + + ingress: + enabled: false + # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName + # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress + # ingressClassName: nginx + # Values can be templated + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: / + pathType: Prefix + + hosts: + - grafana.domain.com + # -- Extra paths to prepend to every host configuration. This is useful when working with annotation based services. + extraPaths: [] + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + tls: [] + # - secretName: grafana-ingress-tls + # hosts: + # - grafana.domain.com + + # -- Grafana VM scrape config + vmScrape: + # whether we should create a service scrape resource for grafana + enabled: true + + # -- [Scrape configuration](https://docs.victoriametrics.com/operator/api#vmservicescrapespec) for Grafana + spec: + selector: + matchLabels: + app.kubernetes.io/name: '{{ include "grafana.name" .Subcharts.grafana }}' + endpoints: + - port: '{{ .Values.grafana.service.portName }}' + +# -- prometheus-node-exporter dependency chart configuration. 
For possible values check [here](https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus-node-exporter/values.yaml) +prometheus-node-exporter: + enabled: true + + # all values for prometheus-node-exporter helm chart can be specified here + service: + # Add the 'node-exporter' label to be used by serviceMonitor to match standard common usage in rules and grafana dashboards + # + labels: + jobLabel: node-exporter + extraArgs: + - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/.+)($|/) + - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|erofs|sysfs|tracefs)$ + # -- Node Exporter VM scrape config + vmScrape: + # whether we should create a service scrape resource for node-exporter + enabled: true + + # -- [Scrape configuration](https://docs.victoriametrics.com/operator/api#vmservicescrapespec) for Node Exporter + spec: + jobLabel: jobLabel + selector: + matchLabels: + app.kubernetes.io/name: '{{ include "prometheus-node-exporter.name" (index .Subcharts "prometheus-node-exporter") }}' + endpoints: + - port: metrics + metricRelabelConfigs: + - action: drop + source_labels: [mountpoint] + regex: "/var/lib/kubelet/pods.+" +# -- kube-state-metrics dependency chart configuration. For possible values check [here](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-state-metrics/values.yaml) +kube-state-metrics: + enabled: true + # -- [Scrape configuration](https://docs.victoriametrics.com/operator/api#vmservicescrapespec) for Kube State Metrics + vmScrape: + enabled: true + spec: + selector: + matchLabels: + app.kubernetes.io/name: '{{ include "kube-state-metrics.name" (index .Subcharts "kube-state-metrics") }}' + app.kubernetes.io/instance: '{{ include "vm.release" . 
}}' + endpoints: + - port: http + honorLabels: true + metricRelabelConfigs: + - action: labeldrop + regex: (uid|container_id|image_id) + jobLabel: app.kubernetes.io/name + +# -- Component scraping the kubelets +kubelet: + enabled: true + vmScrapes: + # -- Enable scraping /metrics/cadvisor from kubelet's service + cadvisor: + enabled: true + spec: + path: /metrics/cadvisor + # -- Enable scraping /metrics/probes from kubelet's service + probes: + enabled: true + spec: + path: /metrics/probes + # -- Enable scraping /metrics/resource from kubelet's service + resources: + enabled: true + spec: + path: /metrics/resource + kubelet: + spec: {} + # -- Spec for VMNodeScrape CRD is [here](https://docs.victoriametrics.com/operator/api.html#vmnodescrapespec) + vmScrape: + kind: VMNodeScrape + spec: + scheme: "https" + honorLabels: true + interval: "30s" + scrapeTimeout: "5s" + tlsConfig: + insecureSkipVerify: true + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + # drop high cardinality label and useless metrics for cadvisor and kubelet + metricRelabelConfigs: + - action: labeldrop + regex: (uid) + - action: labeldrop + regex: (id|name) + - action: drop + source_labels: [__name__] + regex: (rest_client_request_duration_seconds_bucket|rest_client_request_duration_seconds_sum|rest_client_request_duration_seconds_count) + relabelConfigs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - sourceLabels: [__metrics_path__] + targetLabel: metrics_path + - targetLabel: job + replacement: kubelet + # ignore timestamps of cadvisor's metrics by default + # more info here https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4697#issuecomment-1656540535 + honorTimestamps: false +# Component scraping the kube api server +kubeApiServer: + # -- Enable Kube Api Server metrics scraping + enabled: true + # -- Spec for VMServiceScrape CRD is 
[here](https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec) + vmScrape: + spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + # bearerTokenSecret: + # key: "" + port: https + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + serverName: kubernetes + jobLabel: component + namespaceSelector: + matchNames: + - default + selector: + matchLabels: + component: apiserver + provider: kubernetes + +# Component scraping the kube controller manager +kubeControllerManager: + # -- Enable kube controller manager metrics scraping + enabled: false + + # -- If your kube controller manager is not deployed as a pod, specify IPs it can be found on + endpoints: [] + # - 10.141.4.22 + # - 10.141.4.23 + # - 10.141.4.24 + + # If using kubeControllerManager.endpoints only the port and targetPort are used + service: + # -- Create service for kube controller manager metrics scraping + enabled: true + # -- Kube controller manager service port + port: 10257 + # -- Kube controller manager service target port + targetPort: 10257 + # -- Kube controller manager service pod selector + selector: + component: kube-controller-manager + + # -- Spec for VMServiceScrape CRD is [here](https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec) + vmScrape: + spec: + jobLabel: jobLabel + namespaceSelector: + matchNames: + - kube-system + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + # bearerTokenSecret: + # key: "" + port: http-metrics + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + serverName: kubernetes + +# Component scraping kubeDns. 
Use either this or coreDns +kubeDns: + # -- Enable KubeDNS metrics scraping + enabled: false + service: + # -- Create Service for KubeDNS metrics + enabled: false + # -- KubeDNS service ports + ports: + dnsmasq: + port: 10054 + targetPort: 10054 + skydns: + port: 10055 + targetPort: 10055 + # -- KubeDNS service pods selector + selector: + k8s-app: kube-dns + # -- Spec for VMServiceScrape CRD is [here](https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec) + vmScrape: + spec: + jobLabel: jobLabel + namespaceSelector: + matchNames: [kube-system] + endpoints: + - port: http-metrics-dnsmasq + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + - port: http-metrics-skydns + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + +# Component scraping coreDns. Use either this or kubeDns +coreDns: + # -- Enable CoreDNS metrics scraping + enabled: true + service: + # -- Create service for CoreDNS metrics + enabled: true + # -- CoreDNS service port + port: 9153 + # -- CoreDNS service target port + targetPort: 9153 + # -- CoreDNS service pod selector + selector: + k8s-app: kube-dns + + # -- Spec for VMServiceScrape CRD is [here](https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec) + vmScrape: + spec: + jobLabel: jobLabel + namespaceSelector: + matchNames: [kube-system] + endpoints: + - port: http-metrics + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + +# Component scraping etcd +kubeEtcd: + # -- Enable KubeETCD metrics scraping + enabled: true + + # -- If your etcd is not deployed as a pod, specify IPs it can be found on + endpoints: [] + # - 10.141.4.22 + # - 10.141.4.23 + # - 10.141.4.24 + + # Etcd service. 
If using kubeEtcd.endpoints only the port and targetPort are used + service: + # -- Enable service for ETCD metrics scraping + enabled: true + # -- ETCD service port + port: 2379 + # -- ETCD service target port + targetPort: 2379 + # -- ETCD service pods selector + selector: + component: etcd + + # -- Spec for VMServiceScrape CRD is [here](https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec) + vmScrape: + spec: + jobLabel: jobLabel + namespaceSelector: + matchNames: [kube-system] + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + # bearerTokenSecret: + # key: "" + port: http-metrics + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + +# Component scraping kube scheduler +kubeScheduler: + # -- Enable KubeScheduler metrics scraping + enabled: false + + # -- If your kube scheduler is not deployed as a pod, specify IPs it can be found on + endpoints: [] + # - 10.141.4.22 + # - 10.141.4.23 + # - 10.141.4.24 + + # If using kubeScheduler.endpoints only the port and targetPort are used + service: + # -- Enable service for KubeScheduler metrics scrape + enabled: true + # -- KubeScheduler service port + port: 10259 + # -- KubeScheduler service target port + targetPort: 10259 + # -- KubeScheduler service pod selector + selector: + component: kube-scheduler + + # -- Spec for VMServiceScrape CRD is [here](https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec) + vmScrape: + spec: + jobLabel: jobLabel + namespaceSelector: + matchNames: [kube-system] + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + # bearerTokenSecret: + # key: "" + port: http-metrics + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + +# Component scraping kube proxy +kubeProxy: + # -- Enable kube proxy metrics scraping + enabled: false + + # -- If your kube proxy is not deployed as a pod, specify IPs it can be found on 
+ endpoints: [] + # - 10.141.4.22 + # - 10.141.4.23 + # - 10.141.4.24 + + service: + # -- Enable service for kube proxy metrics scraping + enabled: true + # -- Kube proxy service port + port: 10249 + # -- Kube proxy service target port + targetPort: 10249 + # -- Kube proxy service pod selector + selector: + k8s-app: kube-proxy + + # -- Spec for VMServiceScrape CRD is [here](https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec) + vmScrape: + spec: + jobLabel: jobLabel + namespaceSelector: + matchNames: [kube-system] + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + # bearerTokenSecret: + # key: "" + port: http-metrics + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + +# -- Add extra objects dynamically to this chart +extraObjects: [] + diff --git a/otc/benchmark.t09.de/stacks/otc/cert-manager/manifests/clusterissuer.yaml b/otc/benchmark.t09.de/stacks/otc/cert-manager/manifests/clusterissuer.yaml new file mode 100644 index 0000000..73d0b7f --- /dev/null +++ b/otc/benchmark.t09.de/stacks/otc/cert-manager/manifests/clusterissuer.yaml @@ -0,0 +1,14 @@ +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: main +spec: + acme: + email: admin@think-ahead.tech + server: https://acme-v02.api.letsencrypt.org/directory + privateKeySecretRef: + name: cluster-issuer-account-key + solvers: + - http01: + ingress: + ingressClassName: nginx diff --git a/otc/benchmark.t09.de/stacks/otc/cert-manager/values.yaml b/otc/benchmark.t09.de/stacks/otc/cert-manager/values.yaml new file mode 100644 index 0000000..a0b2211 --- /dev/null +++ b/otc/benchmark.t09.de/stacks/otc/cert-manager/values.yaml @@ -0,0 +1,4 @@ +crds: + enabled: true + +replicaCount: 1 diff --git a/otc/benchmark.t09.de/stacks/otc/cert-manger.yaml b/otc/benchmark.t09.de/stacks/otc/cert-manger.yaml new file mode 100644 index 0000000..2c93d4c --- /dev/null +++ b/otc/benchmark.t09.de/stacks/otc/cert-manger.yaml @@ 
-0,0 +1,32 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: cert-manager + namespace: argocd + labels: + env: dev +spec: + project: default + syncPolicy: + automated: + selfHeal: true + syncOptions: + - CreateNamespace=true + retry: + limit: -1 + destination: + name: in-cluster + namespace: cert-manager + sources: + - chart: cert-manager + repoURL: https://charts.jetstack.io + targetRevision: v1.17.2 + helm: + valueFiles: + - $values/otc/benchmark.t09.de/stacks/otc/cert-manager/values.yaml + - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances + targetRevision: HEAD + ref: values + - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances + targetRevision: HEAD + path: "otc/benchmark.t09.de/stacks/otc/cert-manager/manifests" diff --git a/otc/benchmark.t09.de/stacks/otc/ingress-nginx.yaml b/otc/benchmark.t09.de/stacks/otc/ingress-nginx.yaml new file mode 100644 index 0000000..33d6d7b --- /dev/null +++ b/otc/benchmark.t09.de/stacks/otc/ingress-nginx.yaml @@ -0,0 +1,29 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: ingress-nginx + namespace: argocd + labels: + env: dev +spec: + project: default + syncPolicy: + automated: + selfHeal: true + syncOptions: + - CreateNamespace=true + retry: + limit: -1 + destination: + name: in-cluster + namespace: ingress-nginx + sources: + - repoURL: https://github.com/kubernetes/ingress-nginx.git + path: charts/ingress-nginx + targetRevision: helm-chart-4.12.1 + helm: + valueFiles: + - $values/otc/benchmark.t09.de/stacks/otc/ingress-nginx/values.yaml + - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances + targetRevision: HEAD + ref: values diff --git a/otc/benchmark.t09.de/stacks/otc/ingress-nginx/values.yaml b/otc/benchmark.t09.de/stacks/otc/ingress-nginx/values.yaml new file mode 100644 index 0000000..ec2d3aa --- /dev/null +++ b/otc/benchmark.t09.de/stacks/otc/ingress-nginx/values.yaml @@ -0,0 +1,31 @@ +controller: + updateStrategy: + type: RollingUpdate + 
rollingUpdate: + maxUnavailable: 1 + + service: + annotations: + kubernetes.io/elb.class: union + kubernetes.io/elb.port: '80' + kubernetes.io/elb.id: db60c1a9-312c-42b7-847b-781d950a0e7a + kubernetes.io/elb.ip: 164.30.20.78 + + ingressClassResource: + name: nginx + + # added for idpbuilder + allowSnippetAnnotations: true + + # added for idpbuilder + config: + proxy-buffer-size: 32k + use-forwarded-headers: "true" + + # monitoring nginx + metrics: + enabled: true + serviceMonitor: + additionalLabels: + release: "ingress-nginx" + enabled: true diff --git a/otc/benchmark.t09.de/stacks/otc/storageclass.yaml b/otc/benchmark.t09.de/stacks/otc/storageclass.yaml new file mode 100644 index 0000000..bf46764 --- /dev/null +++ b/otc/benchmark.t09.de/stacks/otc/storageclass.yaml @@ -0,0 +1,25 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: storageclass + namespace: argocd + labels: + example: otc + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + destination: + namespace: default + server: "https://kubernetes.default.svc" + source: + repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances + targetRevision: HEAD + path: "otc/benchmark.t09.de/stacks/otc/storageclass" + project: default + syncPolicy: + automated: + selfHeal: true + syncOptions: + - CreateNamespace=true + retry: + limit: -1 diff --git a/otc/benchmark.t09.de/stacks/otc/storageclass/storageclass.yaml b/otc/benchmark.t09.de/stacks/otc/storageclass/storageclass.yaml new file mode 100644 index 0000000..038bf24 --- /dev/null +++ b/otc/benchmark.t09.de/stacks/otc/storageclass/storageclass.yaml @@ -0,0 +1,18 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + annotations: + storageclass.kubernetes.io/is-default-class: "true" + labels: + kubernetes.io/cluster-service: "true" + name: default +parameters: + kubernetes.io/description: "" + kubernetes.io/hw:passthrough: "true" + kubernetes.io/storagetype: BS + kubernetes.io/volumetype: SATA +
kubernetes.io/zone: eu-de-02 +provisioner: flexvolume-huawei.com/fuxivol +reclaimPolicy: Delete +volumeBindingMode: Immediate +allowVolumeExpansion: true \ No newline at end of file diff --git a/otc/benchmark.t09.de/stacks/terralist/terralist.yaml b/otc/benchmark.t09.de/stacks/terralist/terralist.yaml new file mode 100644 index 0000000..83afc42 --- /dev/null +++ b/otc/benchmark.t09.de/stacks/terralist/terralist.yaml @@ -0,0 +1,30 @@ +# helm upgrade --install --create-namespace --namespace terralist terralist oci://ghcr.io/terralist/helm-charts/terralist -f terralist-values.yaml +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: terralist + namespace: argocd + labels: + env: dev +spec: + project: default + syncPolicy: + automated: + selfHeal: true + syncOptions: + - CreateNamespace=true + retry: + limit: -1 + destination: + name: in-cluster + namespace: terralist + sources: + - repoURL: https://github.com/terralist/helm-charts + path: charts/terralist + targetRevision: terralist-0.8.1 + helm: + valueFiles: + - $values/otc/benchmark.t09.de/stacks/terralist/terralist/values.yaml + - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances + targetRevision: HEAD + ref: values diff --git a/otc/benchmark.t09.de/stacks/terralist/terralist/values.yaml b/otc/benchmark.t09.de/stacks/terralist/terralist/values.yaml new file mode 100644 index 0000000..95af42f --- /dev/null +++ b/otc/benchmark.t09.de/stacks/terralist/terralist/values.yaml @@ -0,0 +1,87 @@ +controllers: + main: + strategy: Recreate + containers: + app: + env: + - name: TERRALIST_OAUTH_PROVIDER + value: oidc + - name: TERRALIST_OI_CLIENT_ID + valueFrom: + secretKeyRef: + name: oidc-credentials + key: client-id + - name: TERRALIST_OI_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: oidc-credentials + key: client-secret + - name: TERRALIST_OI_AUTHORIZE_URL + valueFrom: + secretKeyRef: + name: oidc-credentials + key: authorize-url + - name: TERRALIST_OI_TOKEN_URL + valueFrom: + 
secretKeyRef: + name: oidc-credentials + key: token-url + - name: TERRALIST_OI_USERINFO_URL + valueFrom: + secretKeyRef: + name: oidc-credentials + key: userinfo-url + - name: TERRALIST_OI_SCOPE + valueFrom: + secretKeyRef: + name: oidc-credentials + key: scope + - name: TERRALIST_TOKEN_SIGNING_SECRET + valueFrom: + secretKeyRef: + name: terralist-secret + key: token-signing-secret + - name: TERRALIST_COOKIE_SECRET + valueFrom: + secretKeyRef: + name: terralist-secret + key: cookie-secret + - name: TERRALIST_URL + value: https://terralist.benchmark.t09.de + - name: TERRALIST_SQLITE_PATH + value: /data/db.sqlite + - name: TERRALIST_LOCAL_STORE + value: /data/modules + - name: TERRALIST_PROVIDERS_ANONYMOUS_READ + value: "true" + +ingress: + main: + enabled: true + className: nginx + annotations: + cert-manager.io/cluster-issuer: main + hosts: + - host: terralist.benchmark.t09.de + paths: + - path: / + pathType: Prefix + service: + identifier: main + port: http + tls: + - hosts: + - terralist.benchmark.t09.de + secretName: terralist-tls-secret + +persistence: + data: + enabled: true + accessMode: ReadWriteOnce + size: 10Gi + retain: false + storageClass: "csi-disk" + annotations: + everest.io/disk-volume-type: GPSSD + globalMounts: + - path: /data diff --git a/otc/dev.t09.de/registry/ci-sizer.yaml b/otc/dev.t09.de/registry/ci-sizer.yaml new file mode 100644 index 0000000..58df27e --- /dev/null +++ b/otc/dev.t09.de/registry/ci-sizer.yaml @@ -0,0 +1,24 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: ci-sizer-reg + namespace: argocd + labels: + env: dev + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + destination: + name: in-cluster + namespace: argocd + source: + path: "otc/dev.t09.de/stacks/ci-sizer" + repoURL: "https://edp.buildth.ing/DevFW-CICD/stacks-instances" + targetRevision: HEAD + project: default + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true diff --git 
a/otc/dev.t09.de/stacks/ci-sizer/gitlab-webhook.yaml b/otc/dev.t09.de/stacks/ci-sizer/gitlab-webhook.yaml new file mode 100644 index 0000000..c02e1cc --- /dev/null +++ b/otc/dev.t09.de/stacks/ci-sizer/gitlab-webhook.yaml @@ -0,0 +1,29 @@ +# Optional: GitLab CI integration +# Only hydrate this app for clusters that run GitLab Runner. +# For Forgejo/GitHub-only deployments, omit this app from stacks-instances. +# See: ci-sizer/docs/deployment-modes.md +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: gitlab-sizer-webhook + namespace: argocd + labels: + env: dev + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + project: default + syncPolicy: + automated: + selfHeal: true + syncOptions: + - CreateNamespace=true + retry: + limit: -1 + destination: + name: in-cluster + namespace: ci-sizer + source: + repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances + targetRevision: HEAD + path: "otc/dev.t09.de/stacks/ci-sizer/gitlab-webhook" diff --git a/otc/dev.t09.de/stacks/ci-sizer/gitlab-webhook/certificates.yaml b/otc/dev.t09.de/stacks/ci-sizer/gitlab-webhook/certificates.yaml new file mode 100644 index 0000000..ee1fece --- /dev/null +++ b/otc/dev.t09.de/stacks/ci-sizer/gitlab-webhook/certificates.yaml @@ -0,0 +1,27 @@ +# Self-signed Issuer for webhook TLS. +# For production, replace with a ClusterIssuer backed by a real CA. +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: selfsigned-issuer +spec: + selfSigned: {} +--- +# cert-manager Certificate for the webhook TLS. +# The resulting Secret (gitlab-sizer-webhook-tls) is mounted into the webhook pod. +# cert-manager also injects the CA into the MutatingWebhookConfiguration via the +# cert-manager.io/inject-ca-from annotation. 
+apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: gitlab-sizer-webhook-cert +spec: + secretName: gitlab-sizer-webhook-tls + issuerRef: + name: selfsigned-issuer + kind: Issuer + dnsNames: + - gitlab-sizer-webhook.ci-sizer.svc + - gitlab-sizer-webhook.ci-sizer.svc.cluster.local + duration: 8760h + renewBefore: 720h diff --git a/otc/dev.t09.de/stacks/ci-sizer/gitlab-webhook/deployment.yaml b/otc/dev.t09.de/stacks/ci-sizer/gitlab-webhook/deployment.yaml new file mode 100644 index 0000000..0b99859 --- /dev/null +++ b/otc/dev.t09.de/stacks/ci-sizer/gitlab-webhook/deployment.yaml @@ -0,0 +1,141 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: gitlab-sizer-webhook +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: gitlab-sizer-webhook +rules: + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: gitlab-sizer-webhook +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: gitlab-sizer-webhook +subjects: + - kind: ServiceAccount + name: gitlab-sizer-webhook + namespace: ci-sizer +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: gitlab-sizer-webhook + labels: + app: gitlab-sizer-webhook +spec: + replicas: 2 + selector: + matchLabels: + app: gitlab-sizer-webhook + template: + metadata: + labels: + app: gitlab-sizer-webhook + spec: + serviceAccountName: gitlab-sizer-webhook + securityContext: + runAsNonRoot: true + runAsUser: 65534 + runAsGroup: 65534 + seccompProfile: + type: RuntimeDefault + containers: + - name: webhook + image: edp.buildth.ing/devfw-cicd/gitlab-webhook-edge-connect:latest + imagePullPolicy: Always + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + ports: + - containerPort: 8443 + protocol: TCP + args: + - --listen-addr=:8443 + - --tls-cert-file=/etc/webhook/tls/tls.crt + - 
--tls-key-file=/etc/webhook/tls/tls.key + - --sizer-url=http://sizer-receiver.ci-sizer.svc:8080 + - --sizer-sidecar-image=edp.buildth.ing/devfw-cicd/ci-sizer-collector:latest + env: + - name: WEBHOOK_SIZER_READ_TOKEN + valueFrom: + secretKeyRef: + name: gitlab-sizer-webhook-tokens + key: sizer-read-token + - name: WEBHOOK_SIZER_PUSH_TOKEN + valueFrom: + secretKeyRef: + name: gitlab-sizer-webhook-tokens + key: sizer-push-token + - name: HTTP_PROXY + valueFrom: + configMapKeyRef: + name: gitlab-sizer-webhook-config + key: HTTP_PROXY + optional: true + - name: HTTPS_PROXY + valueFrom: + configMapKeyRef: + name: gitlab-sizer-webhook-config + key: HTTPS_PROXY + optional: true + - name: NO_PROXY + valueFrom: + configMapKeyRef: + name: gitlab-sizer-webhook-config + key: NO_PROXY + optional: true + volumeMounts: + - name: webhook-tls + mountPath: /etc/webhook/tls + readOnly: true + livenessProbe: + httpGet: + path: /healthz + port: 8443 + scheme: HTTPS + initialDelaySeconds: 5 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /healthz + port: 8443 + scheme: HTTPS + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 200m + memory: 128Mi + volumes: + - name: webhook-tls + secret: + secretName: gitlab-sizer-webhook-tls +--- +apiVersion: v1 +kind: Service +metadata: + name: gitlab-sizer-webhook + labels: + app: gitlab-sizer-webhook +spec: + selector: + app: gitlab-sizer-webhook + ports: + - port: 443 + targetPort: 8443 + protocol: TCP diff --git a/otc/dev.t09.de/stacks/ci-sizer/gitlab-webhook/webhook-config.yaml b/otc/dev.t09.de/stacks/ci-sizer/gitlab-webhook/webhook-config.yaml new file mode 100644 index 0000000..72aea4a --- /dev/null +++ b/otc/dev.t09.de/stacks/ci-sizer/gitlab-webhook/webhook-config.yaml @@ -0,0 +1,30 @@ +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: gitlab-sizer-webhook + annotations: + cert-manager.io/inject-ca-from: 
ci-sizer/gitlab-sizer-webhook-cert +webhooks: + - name: gitlab-sizer-webhook.ci-sizer.svc + admissionReviewVersions: ["v1"] + sideEffects: NoneOnDryRun + failurePolicy: Ignore + timeoutSeconds: 5 + reinvocationPolicy: Never + clientConfig: + service: + name: gitlab-sizer-webhook + namespace: ci-sizer + path: /mutate + rules: + - apiGroups: [""] + apiVersions: ["v1"] + operations: ["CREATE"] + resources: ["pods"] + namespaceSelector: + matchLabels: + ci-sizer.devfw.io/watch: "true" + objectSelector: + matchExpressions: + - key: job.runner.gitlab.com/pod + operator: Exists diff --git a/otc/dev.t09.de/stacks/ci-sizer/sizer-receiver.yaml b/otc/dev.t09.de/stacks/ci-sizer/sizer-receiver.yaml new file mode 100644 index 0000000..1f56541 --- /dev/null +++ b/otc/dev.t09.de/stacks/ci-sizer/sizer-receiver.yaml @@ -0,0 +1,29 @@ +# Required: CI Sizer receiver +# Always deploy this — it stores metrics and computes sizing recommendations. +# Works standalone or with GARM (Forgejo/GitHub) and/or GitLab webhook. 
+# See: ci-sizer/docs/deployment-modes.md +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: sizer-receiver + namespace: argocd + labels: + env: dev + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + project: default + syncPolicy: + automated: + selfHeal: true + syncOptions: + - CreateNamespace=true + retry: + limit: -1 + destination: + name: in-cluster + namespace: ci-sizer + source: + repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances + targetRevision: HEAD + path: "otc/dev.t09.de/stacks/ci-sizer/sizer-receiver" diff --git a/otc/edp.buildth.ing/stacks/garm/optimiser-receiver/deployment.yaml b/otc/dev.t09.de/stacks/ci-sizer/sizer-receiver/deployment.yaml similarity index 51% rename from otc/edp.buildth.ing/stacks/garm/optimiser-receiver/deployment.yaml rename to otc/dev.t09.de/stacks/ci-sizer/sizer-receiver/deployment.yaml index f7e366b..3cbfb4c 100644 --- a/otc/edp.buildth.ing/stacks/garm/optimiser-receiver/deployment.yaml +++ b/otc/dev.t09.de/stacks/ci-sizer/sizer-receiver/deployment.yaml @@ -1,22 +1,27 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: optimiser-receiver + name: sizer-receiver labels: - app: optimiser-receiver + app: sizer-receiver spec: + strategy: + type: Recreate replicas: 1 selector: matchLabels: - app: optimiser-receiver + app: sizer-receiver template: metadata: labels: - app: optimiser-receiver + app: sizer-receiver spec: + securityContext: + fsGroup: 65534 containers: - name: receiver - image: edp.buildth.ing/devfw-cicd/forgejo-runner-optimiser-receiver:0.0.3 + image: edp.buildth.ing/devfw-cicd/ci-sizer-receiver:latest + imagePullPolicy: Always args: - --db=/data/metrics.db ports: @@ -27,13 +32,41 @@ spec: - name: RECEIVER_READ_TOKEN valueFrom: secretKeyRef: - name: optimiser-tokens + name: sizer-tokens key: read-token - name: RECEIVER_HMAC_KEY valueFrom: secretKeyRef: - name: optimiser-tokens + name: sizer-tokens key: hmac-key + - name: GARM_URL + value: "http://garm.garm.svc:80" + - 
name: GARM_USER + value: "admin" + - name: GARM_PASSWORD + valueFrom: + secretKeyRef: + name: garm-fixed-credentials + key: admin_password + - name: RECEIVER_OIDC_ISSUER + value: "https://dex.dev.t09.de" + - name: RECEIVER_OIDC_CLIENT_ID + value: "ci-sizer" + - name: RECEIVER_OIDC_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: sizer-oidc-client + key: client-secret + - name: RECEIVER_OIDC_REDIRECT_URI + value: "https://sizer.dev.t09.de/ui/callback" + - name: RECEIVER_SESSION_TTL + value: "12h" + - name: RECEIVER_ALLOWED_ORG + value: "DevFW-CICD" + - name: RECEIVER_CPU_SIZING_MODE + value: "observe" + - name: RECEIVER_MEMORY_QOS + value: "guaranteed" volumeMounts: - name: data mountPath: /data @@ -59,17 +92,17 @@ spec: volumes: - name: data persistentVolumeClaim: - claimName: optimiser-receiver-data + claimName: sizer-receiver-data --- apiVersion: v1 kind: Service metadata: - name: optimiser-receiver + name: sizer-receiver labels: - app: optimiser-receiver + app: sizer-receiver spec: selector: - app: optimiser-receiver + app: sizer-receiver ports: - name: http port: 8080 @@ -79,9 +112,9 @@ spec: apiVersion: v1 kind: PersistentVolumeClaim metadata: - name: optimiser-receiver-data + name: sizer-receiver-data labels: - app: optimiser-receiver + app: sizer-receiver annotations: everest.io/disk-volume-type: GPSSD spec: diff --git a/otc/dev.t09.de/stacks/ci-sizer/sizer-receiver/ingress.yaml b/otc/dev.t09.de/stacks/ci-sizer/sizer-receiver/ingress.yaml new file mode 100644 index 0000000..1bd81a9 --- /dev/null +++ b/otc/dev.t09.de/stacks/ci-sizer/sizer-receiver/ingress.yaml @@ -0,0 +1,37 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + cert-manager.io/cluster-issuer: main + + name: sizer-receiver + namespace: ci-sizer +spec: + ingressClassName: nginx + rules: + - host: sizer.dev.t09.de + http: + paths: + - backend: + service: + name: sizer-receiver + port: + number: 8080 + path: / +
pathType: Prefix + - host: ci-sizer.dev.t09.de + http: + paths: + - backend: + service: + name: sizer-receiver + port: + number: 8080 + path: / + pathType: Prefix + tls: + - hosts: + - sizer.dev.t09.de + - ci-sizer.dev.t09.de + secretName: sizer-receiver-tls diff --git a/otc/dev.t09.de/stacks/ci-sizer/sizer-receiver/secret.yaml b/otc/dev.t09.de/stacks/ci-sizer/sizer-receiver/secret.yaml new file mode 100644 index 0000000..ac8a37c --- /dev/null +++ b/otc/dev.t09.de/stacks/ci-sizer/sizer-receiver/secret.yaml @@ -0,0 +1,10 @@ +# SECURITY: plaintext OIDC client secret committed to Git. Rotate it and load it from a secret store; the value must equal clientSecret in Secret dex-sizer-client (namespace dex). +apiVersion: v1 +kind: Secret +metadata: + name: sizer-oidc-client + labels: + app: sizer-receiver +type: Opaque +stringData: + client-secret: "73eda9068bd00dfe67d29f087b5540cb1cd82cc1dd2ac0f838558ac8bbcfcb3a" diff --git a/otc/dev.t09.de/stacks/core/argocd/values.yaml b/otc/dev.t09.de/stacks/core/argocd/values.yaml index dd5b83d..cb856f0 100644 --- a/otc/dev.t09.de/stacks/core/argocd/values.yaml +++ b/otc/dev.t09.de/stacks/core/argocd/values.yaml @@ -35,6 +35,30 @@ configs: tls: certificates: +controller: + metrics: + enabled: true + serviceMonitor: + enabled: false + +server: + metrics: + enabled: true + serviceMonitor: + enabled: false + +repoServer: + metrics: + enabled: true + serviceMonitor: + enabled: false + +applicationSet: + metrics: + enabled: true + serviceMonitor: + enabled: false + notifications: enabled: false diff --git a/otc/dev.t09.de/stacks/core/dex.yaml b/otc/dev.t09.de/stacks/core/dex.yaml index 5da98f5..b67aa7d 100644 --- a/otc/dev.t09.de/stacks/core/dex.yaml +++ b/otc/dev.t09.de/stacks/core/dex.yaml @@ -27,3 +27,6 @@ spec: - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances targetRevision: HEAD ref: values + - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances + targetRevision: HEAD + path: "otc/dev.t09.de/stacks/core/dex/manifests" diff --git a/otc/dev.t09.de/stacks/core/dex/manifests/dex-sizer-client.yaml b/otc/dev.t09.de/stacks/core/dex/manifests/dex-sizer-client.yaml new file mode 100644 index 0000000..884df64 --- /dev/null
+++ b/otc/dev.t09.de/stacks/core/dex/manifests/dex-sizer-client.yaml @@ -0,0 +1,9 @@ +# SECURITY: plaintext OIDC client secret committed to Git. Rotate it and load it from a secret store; the value must equal client-secret in Secret sizer-oidc-client read by sizer-receiver. +apiVersion: v1 +kind: Secret +metadata: + name: dex-sizer-client + namespace: dex +type: Opaque +stringData: + clientSecret: "73eda9068bd00dfe67d29f087b5540cb1cd82cc1dd2ac0f838558ac8bbcfcb3a" diff --git a/otc/dev.t09.de/stacks/core/dex/values.yaml b/otc/dev.t09.de/stacks/core/dex/values.yaml index 8a2a79d..6f4955b 100644 --- a/otc/dev.t09.de/stacks/core/dex/values.yaml +++ b/otc/dev.t09.de/stacks/core/dex/values.yaml @@ -34,6 +34,11 @@ envVars: secretKeyRef: name: dex-argo-client key: clientSecret + - name: FORGEJO_RUNNER_SIZER_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: dex-sizer-client + key: clientSecret - name: LOG_LEVEL value: debug @@ -74,3 +79,8 @@ config: - "https://grafana.dev.t09.de/login/generic_oauth" name: "Grafana" secretEnv: "OIDC_DEX_GRAFANA_CLIENT_SECRET" + - id: ci-sizer + name: "CI Sizer" + redirectURIs: + - "https://sizer.dev.t09.de/ui/callback" + secretEnv: "FORGEJO_RUNNER_SIZER_CLIENT_SECRET" diff --git a/otc/dev.t09.de/stacks/core/secrets-backup.yaml b/otc/dev.t09.de/stacks/core/secrets-backup.yaml new file mode 100644 index 0000000..1f33c8d --- /dev/null +++ b/otc/dev.t09.de/stacks/core/secrets-backup.yaml @@ -0,0 +1,23 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: secrets-backup + namespace: argocd + labels: + env: dev +spec: + project: default + syncPolicy: + automated: + selfHeal: true + syncOptions: + - CreateNamespace=true + retry: + limit: -1 + destination: + name: in-cluster + namespace: gitea + sources: + - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances + targetRevision: HEAD + path: "otc/dev.t09.de/stacks/core/secrets-backup/manifests" diff --git a/otc/dev.t09.de/stacks/core/secrets-backup/manifests/secrets-backup-cronjob.yaml b/otc/dev.t09.de/stacks/core/secrets-backup/manifests/secrets-backup-cronjob.yaml new file mode 100644 index 0000000..5ea260d --- /dev/null +++
b/otc/dev.t09.de/stacks/core/secrets-backup/manifests/secrets-backup-cronjob.yaml @@ -0,0 +1,107 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: secrets-backup + namespace: gitea +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: secrets-backup-reader +rules: + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "list"] + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["get", "list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: secrets-backup-reader +subjects: + - kind: ServiceAccount + name: secrets-backup + namespace: gitea +roleRef: + kind: ClusterRole + name: secrets-backup-reader + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: secrets-backup + namespace: gitea +spec: + schedule: "30 3 * * *" + concurrencyPolicy: "Forbid" + successfulJobsHistoryLimit: 5 + failedJobsHistoryLimit: 5 + startingDeadlineSeconds: 600 # 10 minutes + jobTemplate: + spec: + activeDeadlineSeconds: 900 + backoffLimit: 2 + ttlSecondsAfterFinished: 259200 + template: + spec: + serviceAccountName: secrets-backup + containers: + - name: secrets-backup + image: edp.buildth.ing/devfw-cicd/secrets-backup:1.0.1 + imagePullPolicy: IfNotPresent + env: + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: forgejo-cloud-credentials + key: access-key + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: forgejo-cloud-credentials + key: secret-key + - name: SOURCE_BUCKET + valueFrom: + secretKeyRef: + name: forgejo-cloud-credentials + key: bucket-name + - name: OBS_ENDPOINT + value: "obs.eu-de.otc.t-systems.com" + command: + - /bin/sh + - -c + - | + set -euo pipefail + + TIMESTAMP=$(date +%Y%m%d-%H%M%S) + BACKUP_DIR="/tmp/secrets-backup-${TIMESTAMP}" + NAMESPACES="argocd cert-manager external-secrets" + + mkdir -p "${BACKUP_DIR}" + + echo "=== Exporting secrets from critical namespaces ===" + for NS in ${NAMESPACES}; do 
+ echo "Exporting namespace: ${NS}" + kubectl get secrets -n "${NS}" \ + -o json \ + --field-selector type!=kubernetes.io/service-account-token \ + > "${BACKUP_DIR}/${NS}-secrets.json" + done + + echo "=== Creating compressed archive ===" + ARCHIVE="${BACKUP_DIR}/secrets-backup-${TIMESTAMP}.tar.gz" + tar -czf "${ARCHIVE}" -C "${BACKUP_DIR}" \ + $(ls "${BACKUP_DIR}"/*.json 2>/dev/null | xargs -n1 basename) + + echo "=== Uploading to OBS (SSE-KMS encryption at rest) ===" + aws s3 cp "${ARCHIVE}" \ + "s3://${SOURCE_BUCKET}/cluster-secrets-backup/${TIMESTAMP}/secrets-backup.tar.gz" \ + --endpoint-url "https://${OBS_ENDPOINT}" + + echo "=== Cleanup ===" + rm -rf "${BACKUP_DIR}" + echo "Backup completed: ${TIMESTAMP}" + restartPolicy: OnFailure diff --git a/otc/dev.t09.de/stacks/forgejo/forgejo-runner/dind-docker.yaml b/otc/dev.t09.de/stacks/forgejo/forgejo-runner/dind-docker.yaml index 093a819..bcdb719 100644 --- a/otc/dev.t09.de/stacks/forgejo/forgejo-runner/dind-docker.yaml +++ b/otc/dev.t09.de/stacks/forgejo/forgejo-runner/dind-docker.yaml @@ -7,7 +7,7 @@ metadata: namespace: gitea spec: # Two replicas means that if one is busy, the other can pick up jobs. 
- replicas: 0 + replicas: 3 selector: matchLabels: app: forgejo-runner diff --git a/otc/dev.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-ingress.yaml b/otc/dev.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-ingress.yaml index bcefb1d..8e5146a 100644 --- a/otc/dev.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-ingress.yaml +++ b/otc/dev.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-ingress.yaml @@ -3,7 +3,7 @@ kind: Ingress metadata: annotations: nginx.ingress.kubernetes.io/force-ssl-redirect: "true" - nginx.ingress.kubernetes.io/proxy-body-size: 512m + nginx.ingress.kubernetes.io/proxy-body-size: 5120m cert-manager.io/cluster-issuer: main name: forgejo-server diff --git a/otc/dev.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml b/otc/dev.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml index 1251a81..d313b18 100644 --- a/otc/dev.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml +++ b/otc/dev.t09.de/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml @@ -11,8 +11,8 @@ spec: startingDeadlineSeconds: 600 # 10 minutes jobTemplate: spec: - # 60 min until backup - 10 min start - (backoffLimit * activeDeadlineSeconds) - some time sync buffer - activeDeadlineSeconds: 1350 + # 2h window: handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m + activeDeadlineSeconds: 7200 backoffLimit: 2 ttlSecondsAfterFinished: 259200 # template: @@ -72,7 +72,7 @@ spec: - ReadWriteOnce resources: requests: - storage: 100Gi + storage: 500Gi --- apiVersion: v1 kind: Secret diff --git a/otc/dev.t09.de/stacks/forgejo/forgejo-server/values.yaml b/otc/dev.t09.de/stacks/forgejo/forgejo-server/values.yaml index 4c35c43..bf6a0fd 100644 --- a/otc/dev.t09.de/stacks/forgejo/forgejo-server/values.yaml +++ b/otc/dev.t09.de/stacks/forgejo/forgejo-server/values.yaml @@ -137,6 +137,9 @@ gitea: ENABLED: true ADAPTER: redis + 
security: + GLOBAL_TWO_FACTOR_REQUIREMENT: admin + service: DISABLE_REGISTRATION: true ENABLE_NOTIFY_MAIL: true @@ -171,10 +174,9 @@ service: image: pullPolicy: "IfNotPresent" - # Overrides the image tag whose default is the chart appVersion. - #tag: "8.0.3" - # Adds -rootless suffix to image name - # rootless: true + # DB has v15a/v15b migrations from workflow-webhook build. + # Using that image until a proper v15+ EDP release is cut. + # DO NOT revert — automated upload will break the DB schema. fullOverride: edp.buildth.ing/devfw-cicd/edp-forgejo:workflow-webhook-20260305 forgejo: {} diff --git a/otc/dev.t09.de/stacks/garm/garm.yaml b/otc/dev.t09.de/stacks/garm/garm.yaml index 43c7d4e..a0bbd69 100644 --- a/otc/dev.t09.de/stacks/garm/garm.yaml +++ b/otc/dev.t09.de/stacks/garm/garm.yaml @@ -1,3 +1,7 @@ +# Default: Forgejo/GitHub Actions runner manager +# Deploys GARM with the ci-sizer provider for automatic sizing + collector injection. +# For GitLab-only deployments, omit this and use gitlab-webhook instead. 
+# See: ci-sizer/docs/deployment-modes.md apiVersion: argoproj.io/v1alpha1 kind: Application metadata: @@ -20,7 +24,7 @@ spec: sources: - repoURL: https://edp.buildth.ing/DevFW-CICD/garm-helm path: charts/garm - targetRevision: v0.0.12 + targetRevision: v0.0.16 helm: valueFiles: - $values/otc/dev.t09.de/stacks/garm/garm/values.yaml diff --git a/otc/dev.t09.de/stacks/garm/garm/values.yaml b/otc/dev.t09.de/stacks/garm/garm/values.yaml index eebfcf1..41fc84c 100644 --- a/otc/dev.t09.de/stacks/garm/garm/values.yaml +++ b/otc/dev.t09.de/stacks/garm/garm/values.yaml @@ -26,7 +26,7 @@ credentials: image: repository: edp.buildth.ing/devfw-cicd/garm-forgejo - tag: v0.1.7-forgejo-3 + tag: v0.1.7-forgejo-24 providerConfig: edgeConnect: @@ -38,12 +38,11 @@ providerConfig: organization: TelekomOP edgeConnectK8s: sizer: - sidecarImage: edp.buildth.ing/devfw-cicd/forgejo-runner-sizer-collector:latest - sidecarPushEndpoint: https://sizer.dev.t09.de/api/v1/metrics - baseUrl: "https://sizer.dev.t09.de" - readToken: - existingSecretName: sizer-tokens + sidecarImage: edp.buildth.ing/devfw-cicd/ci-sizer-collector:0.0.4 garm: + metrics: + enable: true + disableAuth: true logging: - logLevel: debug + logLevel: info diff --git a/otc/dev.t09.de/stacks/observability-client/vector/values.yaml b/otc/dev.t09.de/stacks/observability-client/vector/values.yaml index c0644cf..4d7458a 100644 --- a/otc/dev.t09.de/stacks/observability-client/vector/values.yaml +++ b/otc/dev.t09.de/stacks/observability-client/vector/values.yaml @@ -48,7 +48,7 @@ customConfig: type: elasticsearch inputs: [parser] endpoints: - - https://o12y.observability./insert/elasticsearch/ + - https://o12y.observability.buildth.ing/insert/elasticsearch/ auth: strategy: basic user: ${VECTOR_USER} diff --git a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/.gitkeep b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git 
a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/argocd-scrape.yaml b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/argocd-scrape.yaml new file mode 100644 index 0000000..2e9248f --- /dev/null +++ b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/argocd-scrape.yaml @@ -0,0 +1,14 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: argocd + namespace: observability +spec: + namespaceSelector: + matchNames: + - argocd + selector: + matchLabels: + app.kubernetes.io/part-of: argocd + endpoints: + - port: http-metrics diff --git a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml new file mode 100644 index 0000000..aecf517 --- /dev/null +++ b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml @@ -0,0 +1,15 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: forgejo + namespace: observability +spec: + namespaceSelector: + matchNames: + - gitea + selector: + matchLabels: + app.kubernetes.io/name: forgejo + endpoints: + - port: http + path: /metrics diff --git a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/garm-scrape.yaml b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/garm-scrape.yaml new file mode 100644 index 0000000..9904e86 --- /dev/null +++ b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/manifests/garm-scrape.yaml @@ -0,0 +1,15 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: garm + namespace: observability +spec: + namespaceSelector: + matchNames: + - garm + selector: + matchLabels: + app.kubernetes.io/name: garm + endpoints: + - port: http + path: /metrics diff --git a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml 
b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml index f85a786..9224a46 100644 --- a/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml +++ b/otc/dev.t09.de/stacks/observability-client/vm-client-stack/values.yaml @@ -778,7 +778,7 @@ vmagent: # -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec) additionalRemoteWrites: # [] - - url: https://o12y.observability./api/v1/write + - url: https://o12y.observability.buildth.ing/api/v1/write basicAuth: username: name: simple-user-secret diff --git a/otc/dev.t09.de/stacks/observability/grafana-operator/manifests/grafana.yaml b/otc/dev.t09.de/stacks/observability/grafana-operator/manifests/grafana.yaml index 1e8b038..17d6046 100644 --- a/otc/dev.t09.de/stacks/observability/grafana-operator/manifests/grafana.yaml +++ b/otc/dev.t09.de/stacks/observability/grafana-operator/manifests/grafana.yaml @@ -35,8 +35,10 @@ spec: server: root_url: "https://grafana.dev.t09.de" auth: - disable_login: "true" disable_login_form: "true" + security: + admin_user: admin + admin_password: admin auth.generic_oauth: enabled: "true" name: Forgejo diff --git a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack.yaml b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack.yaml index 3011a2f..d7599b9 100644 --- a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack.yaml +++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack.yaml @@ -9,10 +9,13 @@ spec: project: default syncPolicy: automated: + prune: true selfHeal: true syncOptions: - CreateNamespace=true - ServerSideApply=true + - RespectIgnoreDifferences=true + - SkipDryRunOnMissingResource=true destination: name: in-cluster namespace: observability diff --git a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/argocd-scrape.yaml b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/argocd-scrape.yaml new file mode 100644 
index 0000000..2e9248f --- /dev/null +++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/argocd-scrape.yaml @@ -0,0 +1,14 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: argocd + namespace: observability +spec: + namespaceSelector: + matchNames: + - argocd + selector: + matchLabels: + app.kubernetes.io/part-of: argocd + endpoints: + - port: http-metrics diff --git a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/backup-alerts.yaml b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/backup-alerts.yaml new file mode 100644 index 0000000..259a2bf --- /dev/null +++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/backup-alerts.yaml @@ -0,0 +1,78 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + name: backup-alerts + namespace: observability +spec: + groups: + - name: backup-schedule-staleness + rules: + - alert: BackupCronJobNotScheduled + expr: | + time() - kube_cronjob_status_last_schedule_time{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"} + > 26 * 3600 + for: 5m + labels: + severity: critical + cronjob: "{{ $labels.cronjob }}" + annotations: + value: "{{ $value | humanizeDuration }}" + description: >- + CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been + scheduled for over 26 hours in cluster {{ $labels.cluster_environment }}. + Last schedule was {{ $value | humanizeDuration }} ago. + summary: "Backup CronJob {{ $labels.cronjob }} is stale" + + - alert: BackupCronJobNeverScheduled + expr: | + kube_cronjob_status_last_schedule_time{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"} + == 0 + for: 30m + labels: + severity: critical + cronjob: "{{ $labels.cronjob }}" + annotations: + description: >- + CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has never been + scheduled in cluster {{ $labels.cluster_environment }}. 
+ summary: "Backup CronJob {{ $labels.cronjob }} never ran" + + - name: backup-job-failures + rules: + - alert: BackupJobFailed + expr: | + max by(cluster_environment, namespace, job_name) ( + kube_job_status_failed{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"} + ) > 0 + for: 30s + labels: + severity: critical + job_name: "{{ $labels.job_name }}" + annotations: + value: "{{ $value }}" + description: >- + Backup job {{ $labels.namespace }}/{{ $labels.job_name }} has + {{ $value }} failed pod(s) in cluster {{ $labels.cluster_environment }}. + summary: "Backup job {{ $labels.job_name }} failed" + + - name: backup-job-duration + rules: + - alert: BackupJobTooSlow + expr: | + ( + time() - kube_job_status_start_time{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"} + ) > 300 + and + kube_job_status_active{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"} > 0 + for: 1m + labels: + severity: major + job_name: "{{ $labels.job_name }}" + annotations: + value: "{{ $value | humanizeDuration }}" + description: >- + Backup job {{ $labels.namespace }}/{{ $labels.job_name }} has been + running for {{ $value | humanizeDuration }} (threshold: 5m) + in cluster {{ $labels.cluster_environment }}. This may indicate a + hung process or connectivity issue. 
+ summary: "Backup job {{ $labels.job_name }} running too long" diff --git a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/coredns-scrape.yaml b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/coredns-scrape.yaml new file mode 100644 index 0000000..d4814dc --- /dev/null +++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/coredns-scrape.yaml @@ -0,0 +1,14 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: coredns + namespace: observability +spec: + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + k8s-app: kube-dns + endpoints: + - port: metrics diff --git a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/forgejo-scrape.yaml b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/forgejo-scrape.yaml new file mode 100644 index 0000000..aecf517 --- /dev/null +++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/forgejo-scrape.yaml @@ -0,0 +1,15 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: forgejo + namespace: observability +spec: + namespaceSelector: + matchNames: + - gitea + selector: + matchLabels: + app.kubernetes.io/name: forgejo + endpoints: + - port: http + path: /metrics diff --git a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml new file mode 100644 index 0000000..4b5807e --- /dev/null +++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml @@ -0,0 +1,14 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: garm + namespace: observability +spec: + namespaceSelector: + matchNames: + - garm + selector: + matchLabels: + app.kubernetes.io/name: garm + endpoints: + - port: http diff --git 
a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml index 5759093..a4f0368 100644 --- a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml +++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml @@ -12,6 +12,12 @@ spec: - static: url: http://vmsingle-o12y:8429 paths: ["/api/v1/write"] + - static: + url: http://vmsingle-o12y:8429 + paths: ["/api/v1/.*"] - static: url: http://vlogs-victorialogs:9428 - paths: ["/insert/elasticsearch/.*"] \ No newline at end of file + paths: ["/insert/elasticsearch/.*"] + - static: + url: http://vlogs-victorialogs:9428 + paths: ["/select/.*"] \ No newline at end of file diff --git a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/values.yaml b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/values.yaml index d407910..e7bffbc 100644 --- a/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/values.yaml +++ b/otc/dev.t09.de/stacks/observability/victoria-k8s-stack/values.yaml @@ -28,10 +28,7 @@ victoria-metrics-operator: crds: plain: true cleanup: - enabled: true - image: - repository: bitnami/kubectl - pullPolicy: IfNotPresent + enabled: false # disabled: cleanup hook can't schedule on resource-constrained nodes (Insufficient cpu / Too many pods) serviceMonitor: enabled: true operator: @@ -676,7 +673,7 @@ vmalert: vmauth: # -- Enable VMAuth CR - enabled: true + enabled: false # -- VMAuth annotations annotations: {} # -- (object) Full spec for VMAuth CRD. 
Allowed values described [here](https://docs.victoriametrics.com/operator/api#vmauthspec) @@ -699,7 +696,7 @@ vmauth: vmagent: # -- Create VMAgent CR - enabled: false + enabled: true # -- VMAgent annotations annotations: {} # -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec) @@ -711,7 +708,8 @@ vmagent: port: "8429" selectAllByDefault: true scrapeInterval: 20s - externalLabels: {} + externalLabels: + cluster_environment: "dev" # For multi-cluster setups it is useful to use "cluster" label to identify the metrics source. # For example: # cluster: cluster-name diff --git a/otc/edp.buildth.ing/stacks/core/argocd/values.yaml b/otc/edp.buildth.ing/stacks/core/argocd/values.yaml index 019dc65..c1bde64 100644 --- a/otc/edp.buildth.ing/stacks/core/argocd/values.yaml +++ b/otc/edp.buildth.ing/stacks/core/argocd/values.yaml @@ -35,6 +35,30 @@ configs: tls: certificates: +controller: + metrics: + enabled: true + serviceMonitor: + enabled: false + +server: + metrics: + enabled: true + serviceMonitor: + enabled: false + +repoServer: + metrics: + enabled: true + serviceMonitor: + enabled: false + +applicationSet: + metrics: + enabled: true + serviceMonitor: + enabled: false + notifications: enabled: false diff --git a/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-ingress.yaml b/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-ingress.yaml index 8203a51..e5d71d6 100644 --- a/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-ingress.yaml +++ b/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-ingress.yaml @@ -3,7 +3,7 @@ kind: Ingress metadata: annotations: nginx.ingress.kubernetes.io/force-ssl-redirect: "true" - nginx.ingress.kubernetes.io/proxy-body-size: 512m + nginx.ingress.kubernetes.io/proxy-body-size: 5120m cert-manager.io/cluster-issuer: main name: forgejo-server diff --git 
a/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml b/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml index cc153d1..7226bd2 100644 --- a/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml +++ b/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml @@ -11,8 +11,8 @@ spec: startingDeadlineSeconds: 600 # 10 minutes jobTemplate: spec: - # 60 min until backup - 10 min start - (backoffLimit * activeDeadlineSeconds) - some time sync buffer - activeDeadlineSeconds: 1350 + # 2h window: handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m + activeDeadlineSeconds: 7200 backoffLimit: 2 ttlSecondsAfterFinished: 259200 # template: @@ -72,7 +72,7 @@ spec: - ReadWriteOnce resources: requests: - storage: 100Gi + storage: 500Gi --- apiVersion: v1 kind: Secret diff --git a/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/values.yaml b/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/values.yaml index 2b64cca..c9e7a8a 100644 --- a/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/values.yaml +++ b/otc/edp.buildth.ing/stacks/forgejo/forgejo-server/values.yaml @@ -137,6 +137,9 @@ gitea: ENABLED: true ADAPTER: redis + security: + GLOBAL_TWO_FACTOR_REQUIREMENT: admin + service: DISABLE_REGISTRATION: true ENABLE_NOTIFY_MAIL: true @@ -177,4 +180,4 @@ image: # rootless: true fullOverride: observability.buildth.ing/devfw-cicd/edp-forgejo:14.0.2-edp1-rootless -forgejo: {} \ No newline at end of file +forgejo: {} diff --git a/otc/edp.buildth.ing/stacks/garm/garm.yaml b/otc/edp.buildth.ing/stacks/garm/garm.yaml index ab493b2..1a44c7c 100644 --- a/otc/edp.buildth.ing/stacks/garm/garm.yaml +++ b/otc/edp.buildth.ing/stacks/garm/garm.yaml @@ -20,7 +20,7 @@ spec: sources: - repoURL: https://edp.buildth.ing/DevFW-CICD/garm-helm path: charts/garm - targetRevision: v0.0.7 + targetRevision: 
v0.0.11 helm: valueFiles: - $values/otc/edp.buildth.ing/stacks/garm/garm/values.yaml diff --git a/otc/edp.buildth.ing/stacks/garm/garm/values.yaml b/otc/edp.buildth.ing/stacks/garm/garm/values.yaml index 7ad8f26..7c4eccc 100644 --- a/otc/edp.buildth.ing/stacks/garm/garm/values.yaml +++ b/otc/edp.buildth.ing/stacks/garm/garm/values.yaml @@ -26,7 +26,7 @@ credentials: image: repository: observability.buildth.ing/devfw-cicd/garm-forgejo - tag: v0.1.7-forgejo-1 + tag: v0.1.7-forgejo-21 providerConfig: edgeConnect: @@ -36,6 +36,9 @@ providerConfig: cloudlet: name: Hamburg organization: TelekomOP + edgeConnectK8s: + sizer: + sidecarImage: edp.buildth.ing/devfw-cicd/ci-sizer-collector:0.0.4 garm: logging: diff --git a/otc/edp.buildth.ing/stacks/garm/optimiser-receiver/ingress.yaml b/otc/edp.buildth.ing/stacks/garm/optimiser-receiver/ingress.yaml deleted file mode 100644 index aa6ac34..0000000 --- a/otc/edp.buildth.ing/stacks/garm/optimiser-receiver/ingress.yaml +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - annotations: - nginx.ingress.kubernetes.io/force-ssl-redirect: "true" - cert-manager.io/cluster-issuer: main - - name: optimiser-receiver - namespace: garm -spec: - ingressClassName: nginx - rules: - - host: optimiser.edp.buildth.ing - http: - paths: - - backend: - service: - name: optimiser-receiver - port: - number: 8080 - path: / - pathType: Prefix - tls: - - hosts: - - optimiser.edp.buildth.ing - secretName: optimiser-receiver-tls diff --git a/otc/edp.buildth.ing/stacks/garm/optimiser-receiver.yaml b/otc/edp.buildth.ing/stacks/garm/sizer-receiver.yaml similarity index 84% rename from otc/edp.buildth.ing/stacks/garm/optimiser-receiver.yaml rename to otc/edp.buildth.ing/stacks/garm/sizer-receiver.yaml index 4fd45cf..a382e6a 100644 --- a/otc/edp.buildth.ing/stacks/garm/optimiser-receiver.yaml +++ b/otc/edp.buildth.ing/stacks/garm/sizer-receiver.yaml @@ -1,7 +1,7 @@ apiVersion: argoproj.io/v1alpha1 kind: Application 
metadata: - name: optimiser-receiver + name: sizer-receiver namespace: argocd labels: env: dev @@ -22,4 +22,4 @@ spec: source: repoURL: https://observability.buildth.ing/DevFW-CICD/stacks-instances targetRevision: HEAD - path: "otc/edp.buildth.ing/stacks/garm/optimiser-receiver" + path: "otc/edp.buildth.ing/stacks/garm/sizer-receiver" diff --git a/otc/dev.t09.de/stacks/garm/sizer-receiver/deployment.yaml b/otc/edp.buildth.ing/stacks/garm/sizer-receiver/deployment.yaml similarity index 90% rename from otc/dev.t09.de/stacks/garm/sizer-receiver/deployment.yaml rename to otc/edp.buildth.ing/stacks/garm/sizer-receiver/deployment.yaml index 91a1049..2d3eeaa 100644 --- a/otc/dev.t09.de/stacks/garm/sizer-receiver/deployment.yaml +++ b/otc/edp.buildth.ing/stacks/garm/sizer-receiver/deployment.yaml @@ -5,8 +5,6 @@ metadata: labels: app: sizer-receiver spec: - strategy: - type: Recreate replicas: 1 selector: matchLabels: @@ -18,8 +16,7 @@ spec: spec: containers: - name: receiver - image: edp.buildth.ing/devfw-cicd/forgejo-runner-sizer-receiver:latest - imagePullPolicy: Always + image: edp.buildth.ing/devfw-cicd/ci-sizer-receiver:0.0.4 args: - --db=/data/metrics.db ports: @@ -37,6 +34,10 @@ spec: secretKeyRef: name: sizer-tokens key: hmac-key + - name: RECEIVER_CPU_SIZING_MODE + value: "observe" + - name: RECEIVER_MEMORY_QOS + value: "guaranteed" volumeMounts: - name: data mountPath: /data diff --git a/otc/dev.t09.de/stacks/garm/sizer-receiver/ingress.yaml b/otc/edp.buildth.ing/stacks/garm/sizer-receiver/ingress.yaml similarity index 88% rename from otc/dev.t09.de/stacks/garm/sizer-receiver/ingress.yaml rename to otc/edp.buildth.ing/stacks/garm/sizer-receiver/ingress.yaml index bc50d98..3fcc484 100644 --- a/otc/dev.t09.de/stacks/garm/sizer-receiver/ingress.yaml +++ b/otc/edp.buildth.ing/stacks/garm/sizer-receiver/ingress.yaml @@ -10,7 +10,7 @@ metadata: spec: ingressClassName: nginx rules: - - host: sizer.dev.t09.de + - host: sizer.edp.buildth.ing http: paths: - backend: @@ 
-22,5 +22,5 @@ spec: pathType: Prefix tls: - hosts: - - sizer.dev.t09.de + - sizer.edp.buildth.ing secretName: sizer-receiver-tls diff --git a/otc/edp.buildth.ing/stacks/observability-client/vector/values.yaml b/otc/edp.buildth.ing/stacks/observability-client/vector/values.yaml index 7b30cdc..2fefacd 100644 --- a/otc/edp.buildth.ing/stacks/observability-client/vector/values.yaml +++ b/otc/edp.buildth.ing/stacks/observability-client/vector/values.yaml @@ -48,7 +48,7 @@ customConfig: type: elasticsearch inputs: [parser] endpoints: - - https://o12y.observability./insert/elasticsearch/ + - https://o12y.observability.buildth.ing/insert/elasticsearch/ auth: strategy: basic user: ${VECTOR_USER} diff --git a/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/manifests/.gitkeep b/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/manifests/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml b/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml new file mode 100644 index 0000000..aecf517 --- /dev/null +++ b/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml @@ -0,0 +1,15 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: forgejo + namespace: observability +spec: + namespaceSelector: + matchNames: + - gitea + selector: + matchLabels: + app.kubernetes.io/name: forgejo + endpoints: + - port: http + path: /metrics diff --git a/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/values.yaml b/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/values.yaml index 4e1c079..255e9e5 100644 --- a/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/values.yaml +++ b/otc/edp.buildth.ing/stacks/observability-client/vm-client-stack/values.yaml @@ -778,7 +778,7 @@ vmagent: # -- Remote write 
configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec) additionalRemoteWrites: # [] - - url: https://o12y.observability./api/v1/write + - url: https://o12y.observability.buildth.ing/api/v1/write basicAuth: username: name: simple-user-secret diff --git a/otc/observability.buildth.ing/stacks/core/argocd.yaml b/otc/observability.buildth.ing/stacks/core/argocd.yaml index 57fe99b..32f51a9 100644 --- a/otc/observability.buildth.ing/stacks/core/argocd.yaml +++ b/otc/observability.buildth.ing/stacks/core/argocd.yaml @@ -12,6 +12,7 @@ spec: selfHeal: true syncOptions: - CreateNamespace=true + - ServerSideApply=true retry: limit: -1 destination: @@ -23,7 +24,7 @@ spec: # TODO: RIRE Can be updated when https://github.com/argoproj/argo-cd/issues/20790 is fixed and merged # As logout make problems, it is suggested to switch from path based routing to an own argocd domain, # similar to the CNOE amazon reference implementation and in our case, Forgejo - targetRevision: argo-cd-9.4.6 + targetRevision: argo-cd-9.5.17 helm: valueFiles: - $values/otc/observability.buildth.ing/stacks/core/argocd/values.yaml diff --git a/otc/observability.buildth.ing/stacks/core/argocd/values.yaml b/otc/observability.buildth.ing/stacks/core/argocd/values.yaml index 04b0f2b..182d8ca 100644 --- a/otc/observability.buildth.ing/stacks/core/argocd/values.yaml +++ b/otc/observability.buildth.ing/stacks/core/argocd/values.yaml @@ -35,6 +35,30 @@ configs: tls: certificates: +controller: + metrics: + enabled: true + serviceMonitor: + enabled: false + +server: + metrics: + enabled: true + serviceMonitor: + enabled: false + +repoServer: + metrics: + enabled: true + serviceMonitor: + enabled: false + +applicationSet: + metrics: + enabled: true + serviceMonitor: + enabled: false + notifications: enabled: false diff --git a/otc/observability.buildth.ing/stacks/core/cloudnative-pg.yaml 
b/otc/observability.buildth.ing/stacks/core/cloudnative-pg.yaml index 0642dd0..be7ebcc 100644 --- a/otc/observability.buildth.ing/stacks/core/cloudnative-pg.yaml +++ b/otc/observability.buildth.ing/stacks/core/cloudnative-pg.yaml @@ -21,7 +21,7 @@ spec: sources: - repoURL: https://cloudnative-pg.github.io/charts chart: cloudnative-pg - targetRevision: 0.26.1 + targetRevision: 0.28.2 helm: valueFiles: - $values/otc/observability.buildth.ing/stacks/core/cloudnative-pg/values.yaml diff --git a/otc/observability.buildth.ing/stacks/core/dex.yaml b/otc/observability.buildth.ing/stacks/core/dex.yaml index e24fe03..4e76f2d 100644 --- a/otc/observability.buildth.ing/stacks/core/dex.yaml +++ b/otc/observability.buildth.ing/stacks/core/dex.yaml @@ -20,7 +20,7 @@ spec: sources: - repoURL: https://charts.dexidp.io chart: dex - targetRevision: 0.23.0 + targetRevision: 0.24.1 helm: valueFiles: - $values/otc/observability.buildth.ing/stacks/core/dex/values.yaml diff --git a/otc/observability.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml b/otc/observability.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml index 581f2d2..a1caaae 100644 --- a/otc/observability.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml +++ b/otc/observability.buildth.ing/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml @@ -11,8 +11,8 @@ spec: startingDeadlineSeconds: 600 # 10 minutes jobTemplate: spec: - # 60 min until backup - 10 min start - (backoffLimit * activeDeadlineSeconds) - some time sync buffer - activeDeadlineSeconds: 1350 + # 2h window: handles large incremental syncs after repo growth or OBS slowness; BackupJobTooSlow alert fires at 5m + activeDeadlineSeconds: 7200 backoffLimit: 2 ttlSecondsAfterFinished: 259200 # template: @@ -72,7 +72,7 @@ spec: - ReadWriteOnce resources: requests: - storage: 100Gi + storage: 500Gi --- apiVersion: v1 kind: Secret diff --git 
a/otc/observability.buildth.ing/stacks/observability-client/metrics-server.yaml b/otc/observability.buildth.ing/stacks/observability-client/metrics-server.yaml index 286ba67..80a405b 100644 --- a/otc/observability.buildth.ing/stacks/observability-client/metrics-server.yaml +++ b/otc/observability.buildth.ing/stacks/observability-client/metrics-server.yaml @@ -20,7 +20,7 @@ spec: sources: - chart: metrics-server repoURL: https://kubernetes-sigs.github.io/metrics-server/ - targetRevision: 3.12.2 + targetRevision: 3.13.1 helm: valueFiles: - $values/otc/observability.buildth.ing/stacks/observability-client/metrics-server/values.yaml diff --git a/otc/observability.buildth.ing/stacks/observability-client/vector.yaml b/otc/observability.buildth.ing/stacks/observability-client/vector.yaml index c66556e..ab888de 100644 --- a/otc/observability.buildth.ing/stacks/observability-client/vector.yaml +++ b/otc/observability.buildth.ing/stacks/observability-client/vector.yaml @@ -20,7 +20,7 @@ spec: sources: - chart: vector repoURL: https://helm.vector.dev - targetRevision: 0.43.0 + targetRevision: 0.52.0 helm: valueFiles: - $values/otc/observability.buildth.ing/stacks/observability-client/vector/values.yaml diff --git a/otc/observability.buildth.ing/stacks/observability-client/vector/values.yaml b/otc/observability.buildth.ing/stacks/observability-client/vector/values.yaml index 4905c71..042df5e 100644 --- a/otc/observability.buildth.ing/stacks/observability-client/vector/values.yaml +++ b/otc/observability.buildth.ing/stacks/observability-client/vector/values.yaml @@ -28,7 +28,6 @@ customConfig: api: enabled: false address: 0.0.0.0:8686 - playground: true sources: k8s: type: kubernetes_logs @@ -47,12 +46,8 @@ customConfig: vlogs: type: elasticsearch inputs: [parser] - endpoints: - - https://o12y.observability./insert/elasticsearch/ - auth: - strategy: basic - user: ${VECTOR_USER} - password: ${VECTOR_PASSWORD} + endpoints: + - 
http://vlogs-victorialogs.observability.svc:9428/insert/elasticsearch/ mode: bulk api_version: v8 compression: gzip diff --git a/otc/observability.buildth.ing/stacks/observability-client/vm-client-stack.yaml b/otc/observability.buildth.ing/stacks/observability-client/vm-client-stack.yaml index 673c087..ffbb931 100644 --- a/otc/observability.buildth.ing/stacks/observability-client/vm-client-stack.yaml +++ b/otc/observability.buildth.ing/stacks/observability-client/vm-client-stack.yaml @@ -12,13 +12,14 @@ spec: selfHeal: true syncOptions: - CreateNamespace=true + - ServerSideApply=true destination: name: in-cluster namespace: observability sources: - chart: victoria-metrics-k8s-stack repoURL: https://victoriametrics.github.io/helm-charts/ - targetRevision: 0.48.1 + targetRevision: 0.81.0 helm: valueFiles: - $values/otc/observability.buildth.ing/stacks/observability-client/vm-client-stack/values.yaml diff --git a/otc/observability.buildth.ing/stacks/observability-client/vm-client-stack/manifests/.gitkeep b/otc/observability.buildth.ing/stacks/observability-client/vm-client-stack/manifests/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/otc/observability.buildth.ing/stacks/observability-client/vm-client-stack/values.yaml b/otc/observability.buildth.ing/stacks/observability-client/vm-client-stack/values.yaml index 7bd29da..8784dcc 100644 --- a/otc/observability.buildth.ing/stacks/observability-client/vm-client-stack/values.yaml +++ b/otc/observability.buildth.ing/stacks/observability-client/vm-client-stack/values.yaml @@ -70,8 +70,8 @@ defaultDashboards: # -- Create default rules for monitoring the cluster defaultRules: # -- Labels, which are used for grouping results of the queries. 
Note that these labels are joined with `.Values.global.clusterLabel` - additionalGroupByLabels: [] - create: true + extraGroupByLabels: [] + enabled: true # -- Common properties for VMRule groups group: @@ -114,127 +114,127 @@ defaultRules: # -- Rule group properties groups: etcd: - create: true + enabled: true # -- Common properties for all rules in a group rules: {} # spec: # annotations: # dashboard: https://example.com/dashboard/1 general: - create: true + enabled: true rules: {} k8sContainerCpuLimits: - create: true + enabled: true rules: {} k8sContainerCpuRequests: - create: true + enabled: true rules: {} k8sContainerCpuUsageSecondsTotal: - create: true + enabled: true rules: {} k8sContainerMemoryLimits: - create: true + enabled: true rules: {} k8sContainerMemoryRequests: - create: true + enabled: true rules: {} k8sContainerMemoryRss: - create: true + enabled: true rules: {} k8sContainerMemoryCache: - create: true + enabled: true rules: {} k8sContainerMemoryWorkingSetBytes: - create: true + enabled: true rules: {} k8sContainerMemorySwap: - create: true + enabled: true rules: {} k8sPodOwner: - create: true + enabled: true rules: {} k8sContainerResource: - create: true + enabled: true rules: {} kubeApiserver: - create: true + enabled: true rules: {} kubeApiserverAvailability: - create: true + enabled: true rules: {} kubeApiserverBurnrate: - create: true + enabled: true rules: {} kubeApiserverHistogram: - create: true + enabled: true rules: {} kubeApiserverSlos: - create: true + enabled: true rules: {} kubelet: - create: true + enabled: true rules: {} kubePrometheusGeneral: - create: true + enabled: true rules: {} kubePrometheusNodeRecording: - create: true + enabled: true rules: {} kubernetesApps: - create: true + enabled: true rules: {} targetNamespace: ".*" kubernetesResources: - create: true + enabled: true rules: {} kubernetesStorage: - create: true + enabled: true rules: {} targetNamespace: ".*" kubernetesSystem: - create: true + enabled: true rules: {} 
kubernetesSystemKubelet: - create: true + enabled: true rules: {} kubernetesSystemApiserver: - create: true + enabled: true rules: {} kubernetesSystemControllerManager: - create: true + enabled: true rules: {} kubeScheduler: - create: true + enabled: true rules: {} kubernetesSystemScheduler: - create: true + enabled: true rules: {} kubeStateMetrics: - create: true + enabled: true rules: {} nodeNetwork: - create: true + enabled: true rules: {} node: - create: true + enabled: true rules: {} vmagent: - create: true + enabled: true rules: {} vmsingle: - create: true + enabled: true rules: {} vmcluster: - create: true + enabled: true rules: {} vmHealth: - create: true + enabled: true rules: {} vmoperator: - create: true + enabled: true rules: {} alertmanager: - create: true + enabled: true rules: {} # -- Runbook url prefix for default rules @@ -778,24 +778,15 @@ vmagent: # -- Remote write configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec) additionalRemoteWrites: # [] - - url: https://o12y.observability./api/v1/write - basicAuth: - username: - name: simple-user-secret - key: username - password: - name: simple-user-secret - key: password + - url: http://vmsingle-o12y.observability.svc:8429/api/v1/write # -- (object) Full spec for VMAgent CRD. Allowed values described [here](https://docs.victoriametrics.com/operator/api#vmagentspec) spec: port: "8429" selectAllByDefault: true scrapeInterval: 20s - externalLabels: + externalLabels: cluster_environment: "observability" - # For multi-cluster setups it is useful to use "cluster" label to identify the metrics source. - # For example: - # cluster: cluster-name + cluster: observability extraArgs: promscrape.streamParse: "true" # Do not store original labels in vmagent's memory by default. 
This reduces the amount of memory used by vmagent @@ -843,7 +834,6 @@ defaultDatasources: allowCrossNamespaceImport: false victoriametrics: # -- Create per replica prometheus compatible datasource - perReplica: false # -- List of prometheus compatible datasource configurations. # VM `url` will be added to each of them in templates. datasources: @@ -860,7 +850,6 @@ defaultDatasources: # Alertmanager generated `url` will be added to each datasource in template if alertmanager is enabled alertmanager: # -- Create per replica alertmanager compatible datasource - perReplica: false datasources: - name: Alertmanager access: proxy @@ -979,6 +968,11 @@ prometheus-node-exporter: - action: drop source_labels: [mountpoint] regex: "/var/lib/kubelet/pods.+" + - action: replace + source_labels: [__name__, instance] + regex: "node_uname_info;([^:]+):.+" + target_label: nodename + replacement: "$1" # -- kube-state-metrics dependency chart configuration. For possible values check [here](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-state-metrics/values.yaml) kube-state-metrics: enabled: true diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator.yaml index 69b4f6b..4a27771 100644 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator.yaml +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator.yaml @@ -19,7 +19,7 @@ spec: sources: - chart: grafana-operator repoURL: ghcr.io/grafana/helm-charts - targetRevision: v5.18.0 + targetRevision: 5.23.0 - repoURL: https://edp.buildth.ing/DevFW-CICD/stacks-instances targetRevision: HEAD path: "otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests" diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd-operational.yaml 
b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd-operational.yaml new file mode 100644 index 0000000..9130b42 --- /dev/null +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd-operational.yaml @@ -0,0 +1,153 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: argocd-operational +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + folder: "EDP / Applications" + json: | + { + "annotations": {"list": []}, + "editable": true, + "graphTooltip": 1, + "panels": [ + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "title": "Application Status", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}}, + "gridPos": {"h": 4, "w": 4, "x": 0, "y": 1}, + "title": "Total Apps", + "type": "stat", + "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}}, + "gridPos": {"h": 4, "w": 4, "x": 4, "y": 1}, + "title": "Healthy", + "type": "stat", + "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", health_status=\"Healthy\"})", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}]}}}, + "gridPos": {"h": 4, "w": 4, "x": 8, "y": 1}, + "title": "Degraded", + "type": "stat", + "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", health_status=\"Degraded\"}) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + 
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}}, + "gridPos": {"h": 4, "w": 4, "x": 12, "y": 1}, + "title": "Synced", + "type": "stat", + "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", sync_status=\"Synced\"})", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "yellow", "value": null}]}}}, + "gridPos": {"h": 4, "w": 4, "x": 16, "y": 1}, + "title": "OutOfSync", + "type": "stat", + "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", sync_status=\"OutOfSync\"}) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "orange", "value": null}]}}}, + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 1}, + "title": "Progressing", + "type": "stat", + "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", health_status=\"Progressing\"}) or vector(0)", "legendFormat": ""}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 5}, + "title": "Application Details", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": { + "defaults": {"custom": {"filterable": true}}, + "overrides": [ + {"matcher": {"id": "byName", "options": "Health"}, "properties": [{"id": "custom.cellOptions", "value": {"type": "color-text"}}, {"id": "mappings", "value": [{"options": {"Healthy": {"color": "green", "text": "Healthy"}, "Degraded": {"color": "red", "text": "Degraded"}, "Progressing": {"color": "yellow", "text": "Progressing"}, "Missing": {"color": "purple", "text": "Missing"}}, "type": "value"}]}]}, + {"matcher": {"id": "byName", "options": "Sync"}, "properties": [{"id": "custom.cellOptions", "value": 
{"type": "color-text"}}, {"id": "mappings", "value": [{"options": {"Synced": {"color": "green", "text": "Synced"}, "OutOfSync": {"color": "orange", "text": "OutOfSync"}}, "type": "value"}]}]} + ] + }, + "gridPos": {"h": 12, "w": 24, "x": 0, "y": 6}, + "title": "All Applications", + "type": "table", + "targets": [{"expr": "argocd_app_info{cluster_environment=~\"$cluster_environment\"}", "format": "table", "instant": true, "legendFormat": ""}], + "transformations": [ + {"id": "filterFieldsByName", "options": {"include": {"names": ["cluster_environment", "name", "dest_namespace", "health_status", "sync_status", "repo"]}}}, + {"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "name": "Application", "dest_namespace": "Namespace", "health_status": "Health", "sync_status": "Sync", "repo": "Repository"}}} + ] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 18}, + "title": "Sync Activity", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "ops"}}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 19}, + "title": "Sync Operations (rate)", + "type": "timeseries", + "targets": [{"expr": "sum(rate(argocd_app_sync_total{cluster_environment=~\"$cluster_environment\"}[5m])) by (name, phase)", "legendFormat": "{{name}} ({{phase}})"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "ops"}}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 19}, + "title": "Reconciliation Rate", + "type": "timeseries", + "targets": [{"expr": "sum(rate(argocd_app_reconcile_count{cluster_environment=~\"$cluster_environment\"}[5m])) by (namespace)", "legendFormat": "{{namespace}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 27}, + "title": "ArgoCD Logs", + "type": "row" + }, + { + "datasource": {"type": "victoriametrics-logs-datasource"}, + "gridPos": {"h": 10, "w": 24, "x": 0, "y": 28}, + "title": "ArgoCD Logs", + "type": 
"logs", + "targets": [{"expr": "{cluster_environment=~\"$cluster_environment\", kubernetes.namespace=\"argocd\"}", "refId": "A"}], + "options": {"showTime": true, "showLabels": true, "wrapLogMessage": true, "enableLogDetails": true, "sortOrder": "Descending"} + } + ], + "schemaVersion": 39, + "tags": ["edp", "argocd", "gitops"], + "templating": { + "list": [ + { + "current": {"selected": true, "text": "All", "value": "$__all"}, + "datasource": {"type": "prometheus"}, + "definition": "label_values(argocd_app_info, cluster_environment)", + "includeAll": true, + "multi": true, + "name": "cluster_environment", + "label": "Environment", + "query": "label_values(argocd_app_info, cluster_environment)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": {"from": "now-6h", "to": "now"}, + "title": "ArgoCD Operations", + "uid": "edp-argocd-ops" + } diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd.yaml index b348ff7..2b81b2b 100644 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd.yaml +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/argocd.yaml @@ -6,4 +6,5 @@ spec: instanceSelector: matchLabels: dashboards: "grafana" + folder: "EDP / Applications" url: "https://raw.githubusercontent.com/argoproj/argo-cd/refs/heads/master/examples/dashboard.json" diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml new file mode 100644 index 0000000..ddcc883 --- /dev/null +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml @@ -0,0 +1,103 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: 
cronjob-monitoring +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + folder: "EDP / Operations" + json: | + { + "annotations": {"list": []}, + "editable": true, + "graphTooltip": 1, + "panels": [ + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "title": "Backup Job Status", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}}}, + "gridPos": {"h": 5, "w": 12, "x": 0, "y": 1}, + "title": "Time Since Last Schedule", + "type": "stat", + "targets": [{"expr": "time() - kube_cronjob_status_last_schedule_time{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cronjob}} ({{cluster_environment}})"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}}}, + "gridPos": {"h": 5, "w": 12, "x": 12, "y": 1}, + "title": "Failed Jobs (Active)", + "type": "stat", + "targets": [{"expr": "sum by(cluster_environment, job_name) (kube_job_status_failed{cluster_environment=~\"$cluster_environment\"}) > 0", "legendFormat": "{{job_name}} ({{cluster_environment}})"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 6}, + "title": "CronJob Overview", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"custom": {"filterable": true}}, "overrides": [{"matcher": {"id": "byName", "options": "Suspended"}, "properties": [{"id": "mappings", "value": [{"options": {"0": {"text": "No", "color": "green"}, "1": {"text": "YES", "color": "red"}}, "type": "value"}]}]}]}, + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 7}, + "title": "All CronJobs", + "type": "table", + "targets": [ + {"expr": 
"kube_cronjob_info{cluster_environment=~\"$cluster_environment\"}", "format": "table", "instant": true, "refId": "A"} + ], + "transformations": [ + {"id": "filterFieldsByName", "options": {"include": {"names": ["cluster_environment", "cronjob", "namespace", "schedule"]}}}, + {"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "cronjob": "CronJob", "namespace": "Namespace", "schedule": "Schedule"}}} + ] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 15}, + "title": "Job History", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16}, + "title": "Job Completions (24h)", + "type": "timeseries", + "targets": [{"expr": "sum(kube_job_status_succeeded{cluster_environment=~\"$cluster_environment\"}) by (job_name, cluster_environment)", "legendFormat": "{{job_name}} ({{cluster_environment}})"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "color": {"mode": "palette-classic"}}}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16}, + "title": "Job Failures (24h)", + "type": "timeseries", + "targets": [{"expr": "sum(kube_job_status_failed{cluster_environment=~\"$cluster_environment\"}) by (job_name, cluster_environment)", "legendFormat": "{{job_name}} ({{cluster_environment}})"}] + } + ], + "schemaVersion": 39, + "tags": ["edp", "backup", "cronjob"], + "templating": { + "list": [ + { + "current": {"selected": true, "text": "All", "value": "$__all"}, + "datasource": {"type": "prometheus"}, + "definition": "label_values(kube_cronjob_info, cluster_environment)", + "includeAll": true, + "multi": true, + "name": "cluster_environment", + "label": "Environment", + "query": "label_values(kube_cronjob_info, cluster_environment)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": {"from": "now-24h", "to": "now"}, + "title": "CronJob & Backup 
Monitoring", + "uid": "edp-cronjobs" + } diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/forgejo.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/forgejo.yaml new file mode 100644 index 0000000..ec40751 --- /dev/null +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/forgejo.yaml @@ -0,0 +1,207 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: forgejo +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + folder: "EDP / Applications" + json: | + { + "annotations": {"list": []}, + "editable": true, + "graphTooltip": 1, + "panels": [ + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "title": "Forgejo Health", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"mappings": [{"options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}, "type": "value"}], "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]}}}, + "gridPos": {"h": 4, "w": 4, "x": 0, "y": 1}, + "title": "Status", + "type": "stat", + "targets": [{"expr": "up{job=\"forgejo-server-http\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 4, "y": 1}, + "title": "Version", + "type": "stat", + "targets": [{"expr": "gitea_build_info{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{version}}"}], + "options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "name"} + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 8, "y": 1}, + "title": "Repositories", + "type": "stat", + "targets": [{"expr": 
"gitea_repositories{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 12, "y": 1}, + "title": "Users", + "type": "stat", + "targets": [{"expr": "gitea_users{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 16, "y": 1}, + "title": "Organizations", + "type": "stat", + "targets": [{"expr": "gitea_organizations{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 1}, + "title": "Teams", + "type": "stat", + "targets": [{"expr": "gitea_teams{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 5}, + "title": "Activity", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 6, "x": 0, "y": 6}, + "title": "Open Issues", + "type": "stat", + "targets": [{"expr": "gitea_issues_open{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 6, "x": 6, "y": 6}, + "title": "Closed Issues", + "type": "stat", + "targets": [{"expr": "gitea_issues_closed{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 6, "x": 12, "y": 6}, + 
"title": "Webhooks", + "type": "stat", + "targets": [{"expr": "gitea_webhooks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 6, "x": 18, "y": 6}, + "title": "Hook Tasks", + "type": "stat", + "targets": [{"expr": "gitea_hooktasks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 10}, + "title": "Content & Auth", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 0, "y": 11}, + "title": "Stars", + "type": "stat", + "targets": [{"expr": "gitea_stars{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 4, "y": 11}, + "title": "Watches", + "type": "stat", + "targets": [{"expr": "gitea_watches{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 8, "y": 11}, + "title": "Releases", + "type": "stat", + "targets": [{"expr": "gitea_releases{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 12, "y": 11}, + "title": "Mirrors", + "type": "stat", + "targets": [{"expr": "gitea_mirrors{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + 
"gridPos": {"h": 4, "w": 4, "x": 16, "y": 11}, + "title": "Public Keys", + "type": "stat", + "targets": [{"expr": "gitea_publickeys{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 11}, + "title": "OAuth Apps", + "type": "stat", + "targets": [{"expr": "gitea_oauths{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 15}, + "title": "Forgejo Logs", + "type": "row" + }, + { + "datasource": {"type": "victoriametrics-logs-datasource"}, + "gridPos": {"h": 10, "w": 12, "x": 0, "y": 16}, + "title": "Forgejo Server Logs", + "type": "logs", + "targets": [{"expr": "{cluster_environment=~\"$cluster_environment\", kubernetes.namespace=\"gitea\"}", "refId": "A"}], + "options": {"showTime": true, "showLabels": true, "wrapLogMessage": true, "enableLogDetails": true, "sortOrder": "Descending"} + }, + { + "datasource": {"type": "victoriametrics-logs-datasource"}, + "gridPos": {"h": 10, "w": 12, "x": 12, "y": 16}, + "title": "Forgejo Errors", + "type": "logs", + "targets": [{"expr": "{cluster_environment=~\"$cluster_environment\", kubernetes.namespace=\"gitea\"} error OR Error OR ERROR OR panic", "refId": "A"}], + "options": {"showTime": true, "showLabels": true, "wrapLogMessage": true, "enableLogDetails": true, "sortOrder": "Descending"} + } + ], + "schemaVersion": 39, + "tags": ["edp", "forgejo", "gitea"], + "templating": { + "list": [ + { + "current": {"selected": true, "text": "All", "value": "$__all"}, + "datasource": {"type": "prometheus"}, + "definition": "label_values(gitea_repositories, cluster_environment)", + "includeAll": true, + "multi": true, + "name": "cluster_environment", + "label": "Environment", + "query": "label_values(gitea_repositories, cluster_environment)", + 
"refresh": 2, + "type": "query" + } + ] + }, + "time": {"from": "now-6h", "to": "now"}, + "title": "Forgejo", + "uid": "edp-forgejo" + } diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/garm.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/garm.yaml new file mode 100644 index 0000000..2a23e20 --- /dev/null +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/garm.yaml @@ -0,0 +1,117 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: garm +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + folder: "EDP / Applications" + json: | + { + "annotations": {"list": []}, + "editable": true, + "graphTooltip": 1, + "panels": [ + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "title": "GARM Runner Status", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}}, + "gridPos": {"h": 5, "w": 6, "x": 0, "y": 1}, + "title": "Total Runners", + "type": "stat", + "targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\"}) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}}, + "gridPos": {"h": 5, "w": 6, "x": 6, "y": 1}, + "title": "Idle Runners", + "type": "stat", + "targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\", status=\"idle\"}) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "yellow", "value": null}]}}}, + "gridPos": {"h": 5, "w": 6, "x": 12, "y": 1}, + "title": "Creating", + 
"type": "stat", + "targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\", status=\"creating\"}) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}]}}}, + "gridPos": {"h": 5, "w": 6, "x": 18, "y": 1}, + "title": "Errors", + "type": "stat", + "targets": [{"expr": "sum(rate(garm_runner_errors_total{cluster_environment=~\"$cluster_environment\"}[5m])) or vector(0)", "legendFormat": ""}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 6}, + "title": "GitHub API Rate Limits", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "min": 0}}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 7}, + "title": "Rate Limit Remaining", + "type": "timeseries", + "targets": [{"expr": "garm_github_rate_limit_remaining{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "ops"}}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 7}, + "title": "Runner Operations Rate", + "type": "timeseries", + "targets": [{"expr": "sum(rate(garm_runner_operations_total{cluster_environment=~\"$cluster_environment\"}[5m])) by (cluster_environment)", "legendFormat": "{{cluster_environment}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 15}, + "title": "Runner Details", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"custom": {"filterable": true}}}, + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 16}, + "title": "Runner Pool Status", + "type": "table", + "targets": [{"expr": "garm_runner_status{cluster_environment=~\"$cluster_environment\"}", "format": "table", "instant": true}], + "transformations": [ + {"id": "filterFieldsByName", 
"options": {"include": {"names": ["cluster_environment", "name", "status", "pool_owner", "pool_type", "provider"]}}}, + {"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "name": "Runner", "status": "Status", "pool_owner": "Pool Owner", "pool_type": "Type", "provider": "Provider"}}} + ] + } + ], + "schemaVersion": 39, + "tags": ["edp", "garm", "ci-cd", "runners"], + "templating": { + "list": [ + { + "current": {"selected": true, "text": "All", "value": "$__all"}, + "datasource": {"type": "prometheus"}, + "definition": "label_values(garm_runner_status, cluster_environment)", + "includeAll": true, + "multi": true, + "name": "cluster_environment", + "label": "Environment", + "query": "label_values(garm_runner_status, cluster_environment)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": {"from": "now-6h", "to": "now"}, + "title": "GARM Runners", + "uid": "edp-garm" + } diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/grafana.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/grafana.yaml index f51be7b..8e186c2 100644 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/grafana.yaml +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/grafana.yaml @@ -5,6 +5,9 @@ metadata: labels: dashboards: "grafana" spec: + version: "12.4.0" + client: + useKubeAuth: true persistentVolumeClaim: metadata: annotations: @@ -37,6 +40,18 @@ spec: auth: disable_login: "true" disable_login_form: "true" + auth.jwt: + enabled: "true" + header_name: Authorization + username_claim: sub + email_claim: sub + auto_sign_up: "true" + role_attribute_strict: "true" + role_attribute_path: "contains(sub, 'system:serviceaccount:observability:grafana-operator') && 'GrafanaAdmin' || 'None'" + jwk_set_url: "https://kubernetes.default.svc:443/openid/v1/jwks" + jwk_set_bearer_token_file: 
/var/run/secrets/kubernetes.io/serviceaccount/token + tls_client_ca: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + expect_claims: '{"aud": ["operator.grafana.com"]}' auth.generic_oauth: enabled: "true" name: Forgejo diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/ingress-nginx.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/ingress-nginx.yaml index c13d6a2..077edd8 100644 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/ingress-nginx.yaml +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/ingress-nginx.yaml @@ -6,4 +6,5 @@ spec: instanceSelector: matchLabels: dashboards: "grafana" + folder: "EDP / Operations" url: "https://raw.githubusercontent.com/adinhodovic/ingress-nginx-mixin/refs/heads/main/dashboards_out/ingress-nginx-overview.json" diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml new file mode 100644 index 0000000..ffce4e2 --- /dev/null +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/platform-overview.yaml @@ -0,0 +1,245 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: platform-overview +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + folder: "EDP / Overview" + json: | + { + "annotations": {"list": []}, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "title": "Platform Health", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": { + "defaults": { + "mappings": [{"options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}, "type": "value"}], + "thresholds": 
{"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 0, "y": 1}, + "title": "Forgejo", + "type": "stat", + "targets": [{"expr": "sum(up{job=\"forgejo-server-http\", cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": { + "defaults": { + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 3}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 4, "y": 1}, + "title": "Ingress 5xx (5m)", + "type": "stat", + "targets": [{"expr": "sum(rate(nginx_ingress_controller_requests{status=~\"5..\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 8, "y": 1}, + "title": "Failed Jobs (24h)", + "type": "stat", + "targets": [{"expr": "sum(kube_job_status_failed{namespace=\"gitea\", cluster_environment=~\"$cluster_environment\"}) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.7}, {"color": "red", "value": 0.85}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 12, "y": 1}, + "title": "Cluster CPU Usage", + "type": "stat", + "targets": [{"expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": 
null}, {"color": "yellow", "value": 0.7}, {"color": "red", "value": 0.85}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 16, "y": 1}, + "title": "Cluster Memory Usage", + "type": "stat", + "targets": [{"expr": "1 - sum(node_memory_MemAvailable_bytes{cluster_environment=~\"$cluster_environment\"}) / sum(node_memory_MemTotal_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.6}, {"color": "red", "value": 0.8}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 1}, + "title": "Max PVC Usage", + "type": "stat", + "targets": [{"expr": "max(1 - kubelet_volume_stats_available_bytes{cluster_environment=~\"$cluster_environment\"} / kubelet_volume_stats_capacity_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 5}, + "title": "Forgejo", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 0, "y": 6}, + "title": "Repositories", + "type": "stat", + "targets": [{"expr": "gitea_repositories{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 4, "y": 6}, + "title": "Users", + "type": "stat", + "targets": [{"expr": "gitea_users{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 8, "y": 6}, + "title": "Organizations", + "type": "stat", + "targets": [{"expr": 
"gitea_organizations{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 12, "y": 6}, + "title": "Open Issues", + "type": "stat", + "targets": [{"expr": "gitea_issues_open{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 16, "y": 6}, + "title": "Webhooks", + "type": "stat", + "targets": [{"expr": "gitea_webhooks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 6}, + "title": "Mirrors", + "type": "stat", + "targets": [{"expr": "gitea_mirrors{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 10}, + "title": "Resources", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 11}, + "title": "Node CPU Usage", + "type": "timeseries", + "targets": [{"expr": "1 - rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m])", "legendFormat": "{{instance}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 11}, + "title": "PVC Usage by Claim", + "type": "timeseries", + "targets": [{"expr": "1 - (kubelet_volume_stats_available_bytes{cluster_environment=~\"$cluster_environment\"} / 
kubelet_volume_stats_capacity_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": "{{namespace}}/{{persistentvolumeclaim}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 19}, + "title": "Backups", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}}}, + "gridPos": {"h": 4, "w": 8, "x": 0, "y": 20}, + "title": "Time Since Last Backup Schedule", + "type": "stat", + "targets": [{"expr": "time() - kube_cronjob_status_last_schedule_time{cronjob=~\"forgejo-s3-backup|secrets-backup\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cronjob}} ({{cluster_environment}})"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "s"}}, + "gridPos": {"h": 4, "w": 8, "x": 8, "y": 20}, + "title": "Backup Job Duration (Last 7d)", + "type": "timeseries", + "targets": [{"expr": "kube_job_status_completion_time{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"} - kube_job_status_start_time{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{job_name}}"}], + "options": {"legend": {"displayMode": "table"}} + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}}}, + "gridPos": {"h": 4, "w": 8, "x": 16, "y": 20}, + "title": "Failed Backup Jobs (Active)", + "type": "stat", + "targets": [{"expr": "sum by(cluster_environment, job_name) (kube_job_status_failed{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"})", "legendFormat": "{{job_name}} ({{cluster_environment}})"}] + 
}, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 24}, + "title": "Logs", + "type": "row" + }, + { + "datasource": {"type": "victoriametrics-logs-datasource"}, + "gridPos": {"h": 10, "w": 24, "x": 0, "y": 25}, + "title": "Recent Errors (all namespaces)", + "type": "logs", + "targets": [{"expr": "{cluster_environment=~\"$cluster_environment\"} error OR Error OR ERROR OR panic OR PANIC", "refId": "A"}], + "options": {"showTime": true, "showLabels": true, "showCommonLabels": false, "wrapLogMessage": true, "prettifyLogMessage": false, "enableLogDetails": true, "sortOrder": "Descending", "dedupStrategy": "none"} + } + ], + "schemaVersion": 39, + "tags": ["edp", "platform", "overview"], + "templating": { + "list": [ + { + "current": {"selected": true, "text": "All", "value": "$__all"}, + "datasource": {"type": "prometheus"}, + "definition": "label_values(up, cluster_environment)", + "includeAll": true, + "multi": true, + "name": "cluster_environment", + "label": "Environment", + "query": "label_values(up, cluster_environment)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": {"from": "now-6h", "to": "now"}, + "title": "EDP Platform Overview", + "uid": "edp-platform-overview" + } diff --git a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/victoria-logs.yaml b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/victoria-logs.yaml index 4018fbd..c44c474 100644 --- a/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/victoria-logs.yaml +++ b/otc/observability.buildth.ing/stacks/observability/grafana-operator/manifests/victoria-logs.yaml @@ -6,4 +6,7 @@ spec: instanceSelector: matchLabels: dashboards: "grafana" - url: "https://raw.githubusercontent.com/VictoriaMetrics/VictoriaMetrics/refs/heads/master/dashboards/vm/victorialogs.json" + folder: "EDP / Operations" + grafanaCom: + id: 22698 + revision: 1 diff --git 
a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack.yaml index e38414f..a236b2c 100644 --- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack.yaml +++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack.yaml @@ -19,7 +19,7 @@ spec: sources: - chart: victoria-metrics-k8s-stack repoURL: https://victoriametrics.github.io/helm-charts/ - targetRevision: 0.48.1 + targetRevision: 0.81.0 helm: valueFiles: - $values/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/values.yaml diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml index 110ee7e..cb0f1e3 100644 --- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml +++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml @@ -1,40 +1,95 @@ apiVersion: operator.victoriametrics.com/v1beta1 kind: VMRule metadata: - name: forgejo-alerts + name: edp-platform-alerts namespace: observability spec: groups: - - name: forgejo + - name: platform-health rules: - - alert: forgejo down - expr: sum by(cluster_environment) (up{pod=~"forgejo-server-.*"}) < 1 - for: 30s + - alert: ForgejoDown + expr: sum by(cluster_environment) (up{job="forgejo-server-http"}) < 1 + for: 1m labels: severity: critical - job: "{{ $labels.job }}" annotations: - value: "{{ $value }}" - description: 'forgejo is down in cluster environment {{ $labels.cluster_environment }}' - - name: forgejo-backup - rules: - - alert: forgejo s3 backup job failed - expr: max by(cluster_environment) (kube_job_status_failed{job_name=~"forgejo-s3-backup-.*"}) != 0 - for: 30s - labels: - severity: critical - job: "{{ $labels.job }}" - annotations: - value: "{{ $value }}" - description: 'forgejo s3 backup job 
failed in cluster environment {{ $labels.cluster_environment }}' - - name: disk-consumption-high - rules: - - alert: disk consumption high - expr: 1-(kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.6 - for: 30s + summary: "Forgejo is down on {{ $labels.cluster_environment }}" + description: "Forgejo server has been unreachable for more than 1 minute in cluster {{ $labels.cluster_environment }}." + + - alert: IngressHighErrorRate + expr: | + sum by(cluster_environment) (rate(nginx_ingress_controller_requests{status=~"5.."}[5m])) + / sum by(cluster_environment) (rate(nginx_ingress_controller_requests[5m])) > 0.05 + for: 5m labels: severity: major - job: "{{ $labels.job }}" annotations: - value: "{{ $value }}" - description: 'disk consumption of pvc {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is high in cluster environment {{ $labels.cluster_environment }}' + summary: "High ingress 5xx rate on {{ $labels.cluster_environment }}" + description: "More than 5% of ingress requests are returning 5xx errors for over 5 minutes." + value: "{{ $value | humanizePercentage }}" + + - alert: NodeNotReady + expr: kube_node_status_condition{condition="Ready", status="true"} == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Node {{ $labels.node }} not ready on {{ $labels.cluster_environment }}" + description: "Node {{ $labels.node }} has been in NotReady state for more than 5 minutes." + + - alert: PodCrashLooping + expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 15 > 3 + for: 5m + labels: + severity: major + annotations: + summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} crash-looping on {{ $labels.cluster_environment }}" + description: "Pod has restarted more than 3 times in the last 15 minutes." 
+ + - name: storage + rules: + - alert: PVCUsageHigh + expr: | + 1 - (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.80 + for: 5m + labels: + severity: major + annotations: + summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} usage >80%" + description: "PVC usage is at {{ $value | humanizePercentage }} on {{ $labels.cluster_environment }}." + value: "{{ $value | humanizePercentage }}" + + - alert: PVCUsageCritical + expr: | + 1 - (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.90 + for: 5m + labels: + severity: critical + annotations: + summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} usage >90%" + description: "PVC is almost full at {{ $value | humanizePercentage }} on {{ $labels.cluster_environment }}. Immediate action required." + value: "{{ $value | humanizePercentage }}" + + - name: resources + rules: + - alert: NodeCPUHigh + expr: | + 1 - avg by(instance, cluster_environment) (rate(node_cpu_seconds_total{mode="idle"}[5m])) > 0.85 + for: 15m + labels: + severity: major + annotations: + summary: "Node {{ $labels.instance }} CPU >85% on {{ $labels.cluster_environment }}" + description: "Node CPU utilization has been above 85% for 15 minutes." + value: "{{ $value | humanizePercentage }}" + + - alert: NodeMemoryHigh + expr: | + 1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.90 + for: 10m + labels: + severity: major + annotations: + summary: "Node {{ $labels.instance }} memory >90% on {{ $labels.cluster_environment }}" + description: "Node memory utilization above 90% for 10 minutes." 
+ value: "{{ $value | humanizePercentage }}" diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/argocd-scrape.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/argocd-scrape.yaml new file mode 100644 index 0000000..0517321 --- /dev/null +++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/argocd-scrape.yaml @@ -0,0 +1,13 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: argocd +spec: + namespaceSelector: + matchNames: + - argocd + selector: + matchLabels: + app.kubernetes.io/part-of: argocd + endpoints: + - port: http-metrics diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/backup-alerts.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/backup-alerts.yaml new file mode 100644 index 0000000..259a2bf --- /dev/null +++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/backup-alerts.yaml @@ -0,0 +1,78 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + name: backup-alerts + namespace: observability +spec: + groups: + - name: backup-schedule-staleness + rules: + - alert: BackupCronJobNotScheduled + expr: | + time() - kube_cronjob_status_last_schedule_time{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"} + > 26 * 3600 + for: 5m + labels: + severity: critical + cronjob: "{{ $labels.cronjob }}" + annotations: + value: "{{ $value | humanizeDuration }}" + description: >- + CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been + scheduled for over 26 hours in cluster {{ $labels.cluster_environment }}. + Last schedule was {{ $value | humanizeDuration }} ago. 
+ summary: "Backup CronJob {{ $labels.cronjob }} is stale" + + - alert: BackupCronJobNeverScheduled + expr: | + kube_cronjob_status_last_schedule_time{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"} + == 0 + for: 30m + labels: + severity: critical + cronjob: "{{ $labels.cronjob }}" + annotations: + description: >- + CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has never been + scheduled in cluster {{ $labels.cluster_environment }}. + summary: "Backup CronJob {{ $labels.cronjob }} never ran" + + - name: backup-job-failures + rules: + - alert: BackupJobFailed + expr: | + max by(cluster_environment, namespace, job_name) ( + kube_job_status_failed{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"} + ) > 0 + for: 30s + labels: + severity: critical + job_name: "{{ $labels.job_name }}" + annotations: + value: "{{ $value }}" + description: >- + Backup job {{ $labels.namespace }}/{{ $labels.job_name }} has + {{ $value }} failed pod(s) in cluster {{ $labels.cluster_environment }}. + summary: "Backup job {{ $labels.job_name }} failed" + + - name: backup-job-duration + rules: + - alert: BackupJobTooSlow + expr: | + ( + time() - kube_job_status_start_time{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"} + ) > 300 + and + kube_job_status_active{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"} > 0 + for: 1m + labels: + severity: major + job_name: "{{ $labels.job_name }}" + annotations: + value: "{{ $value | humanizeDuration }}" + description: >- + Backup job {{ $labels.namespace }}/{{ $labels.job_name }} has been + running for {{ $value | humanizeDuration }} (threshold: 5m) + in cluster {{ $labels.cluster_environment }}. This may indicate a + hung process or connectivity issue. 
+ summary: "Backup job {{ $labels.job_name }} running too long" diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/ci-sustainability-rules.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/ci-sustainability-rules.yaml new file mode 100644 index 0000000..2290b99 --- /dev/null +++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/ci-sustainability-rules.yaml @@ -0,0 +1,61 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + name: ci-sustainability +spec: + groups: + - name: ci.sustainability.daily + interval: 5m + rules: + - record: ci:cpu_seconds:increase1d + expr: | + sum by(namespace, cluster) ( + increase(container_cpu_usage_seconds_total{ + namespace=~"gitea|garm", + pod=~"forgejo-runner.*|garm-.*", + container!="" + }[1d]) + ) + - record: ci:memory_bytes_seconds:avg1d + expr: | + avg_over_time( + sum by(namespace, cluster) ( + container_memory_working_set_bytes{ + namespace=~"gitea|garm", + pod=~"forgejo-runner.*|garm-.*", + container!="" + } + )[1d:5m] + ) + - record: ci:pod_count:avg1d + expr: | + avg_over_time( + count by(namespace, cluster) ( + kube_pod_info{ + namespace=~"gitea|garm", + pod=~"forgejo-runner.*|garm-.*" + } + )[1d:5m] + ) + - record: ci:pod_creations:increase1d + expr: | + sum by(namespace, cluster) ( + changes(kube_pod_start_time{ + namespace=~"gitea|garm", + pod=~"forgejo-runner.*|garm-.*" + }[1d]) + ) + - name: ci.sustainability.cluster + interval: 5m + rules: + - record: cluster:cpu_seconds:rate5m + expr: | + sum by(cluster) ( + rate(node_cpu_seconds_total{mode!="idle"}[5m]) + ) + - record: cluster:memory_used_bytes:sum + expr: | + sum by(cluster) ( + node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes + ) + diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/coredns-scrape.yaml 
b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/coredns-scrape.yaml new file mode 100644 index 0000000..77cef00 --- /dev/null +++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/coredns-scrape.yaml @@ -0,0 +1,30 @@ +apiVersion: v1 +kind: Service +metadata: + name: coredns-metrics + namespace: kube-system + labels: + k8s-app: coredns-metrics +spec: + clusterIP: None + selector: + k8s-app: coredns + ports: + - name: metrics + port: 9153 + targetPort: 9153 + protocol: TCP +--- +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: coredns +spec: + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + k8s-app: coredns-metrics + endpoints: + - port: metrics diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml new file mode 100644 index 0000000..f73afa8 --- /dev/null +++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml @@ -0,0 +1,13 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: garm +spec: + namespaceSelector: + matchNames: + - garm + selector: + matchLabels: + app.kubernetes.io/name: garm + endpoints: + - port: http diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/simple-user-secret.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/simple-user-secret.yaml new file mode 100644 index 0000000..7013863 --- /dev/null +++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/simple-user-secret.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: Secret +metadata: + name: simple-user-secret + namespace: observability +type: Opaque +data: + username: c2ltcGxlLXVzZXI= + password: c3g1Z0M3b29XYVdPT0R3RA== 
diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml index 5759093..e1de2c6 100644 --- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml +++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml @@ -5,13 +5,19 @@ metadata: namespace: observability spec: username: simple-user passwordRef: key: password name: simple-user-secret targetRefs: - static: url: http://vmsingle-o12y:8429 paths: ["/api/v1/write"] + - static: + url: http://vmsingle-o12y:8429 + paths: ["/api/v1/.*"] - static: url: http://vlogs-victorialogs:9428 - paths: ["/insert/elasticsearch/.*"] \ No newline at end of file + paths: ["/insert/elasticsearch/.*"] + - static: + url: http://vlogs-victorialogs:9428 + paths: ["/select/.*"] \ No newline at end of file diff --git a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/values.yaml b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/values.yaml index cdb96a9..c535829 100644 --- a/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/values.yaml +++ b/otc/observability.buildth.ing/stacks/observability/victoria-k8s-stack/values.yaml @@ -70,8 +70,8 @@ defaultDashboards: # -- Create default rules for monitoring the cluster defaultRules: # -- Labels, which are used for grouping results of the queries. 
Note that these labels are joined with `.Values.global.clusterLabel` - additionalGroupByLabels: [] - create: true + extraGroupByLabels: [] + enabled: true # -- Common properties for VMRule groups group: @@ -114,91 +114,91 @@ defaultRules: # -- Rule group properties groups: etcd: - create: true + enabled: true # -- Common properties for all rules in a group rules: {} # spec: # annotations: # dashboard: https://example.com/dashboard/1 general: - create: true + enabled: true rules: {} k8sContainerCpuLimits: - create: true + enabled: true rules: {} k8sContainerCpuRequests: - create: true + enabled: true rules: {} k8sContainerCpuUsageSecondsTotal: - create: true + enabled: true rules: {} k8sContainerMemoryLimits: - create: true + enabled: true rules: {} k8sContainerMemoryRequests: - create: true + enabled: true rules: {} k8sContainerMemoryRss: - create: true + enabled: true rules: {} k8sContainerMemoryCache: - create: true + enabled: true rules: {} k8sContainerMemoryWorkingSetBytes: - create: true + enabled: true rules: {} k8sContainerMemorySwap: - create: true + enabled: true rules: {} k8sPodOwner: - create: true + enabled: true rules: {} k8sContainerResource: - create: true + enabled: true rules: {} kubeApiserver: - create: true + enabled: true rules: {} kubeApiserverAvailability: - create: true + enabled: true rules: {} kubeApiserverBurnrate: - create: true + enabled: true rules: {} kubeApiserverHistogram: - create: true + enabled: true rules: {} kubeApiserverSlos: - create: true + enabled: true rules: {} kubelet: - create: true + enabled: true rules: {} kubePrometheusGeneral: - create: true + enabled: true rules: {} kubePrometheusNodeRecording: - create: true + enabled: true rules: {} kubernetesApps: - create: true + enabled: true rules: {} targetNamespace: ".*" kubernetesResources: - create: true + enabled: true rules: {} kubernetesStorage: - create: true + enabled: true rules: {} targetNamespace: ".*" kubernetesSystem: - create: true + enabled: true rules: {} 
kubernetesSystemKubelet: - create: true + enabled: true rules: {} kubernetesSystemApiserver: - create: true + enabled: true rules: {} kubernetesSystemControllerManager: create: false @@ -210,31 +210,31 @@ defaultRules: create: false rules: {} kubeStateMetrics: - create: true + enabled: true rules: {} nodeNetwork: - create: true + enabled: true rules: {} node: - create: true + enabled: true rules: {} vmagent: - create: true + enabled: true rules: {} vmsingle: - create: true + enabled: true rules: {} vmcluster: - create: true + enabled: true rules: {} vmHealth: - create: true + enabled: true rules: {} vmoperator: - create: true + enabled: true rules: {} alertmanager: - create: true + enabled: true rules: {} # -- Runbook url prefix for default rules @@ -283,7 +283,7 @@ vmsingle: spec: port: "8429" # -- Data retention period. Possible units character: h(ours), d(ays), w(eeks), y(ears), if no unit character specified - month. The minimum retention period is 24h. See these [docs](https://docs.victoriametrics.com/single-server-victoriametrics/#retention) - retentionPeriod: "1" + retentionPeriod: "6" replicaCount: 1 extraArgs: {} storageMetadata: @@ -711,10 +711,8 @@ vmagent: port: "8429" selectAllByDefault: true scrapeInterval: 20s - externalLabels: {} - # For multi-cluster setups it is useful to use "cluster" label to identify the metrics source. - # For example: - # cluster: cluster-name + externalLabels: + cluster: observability extraArgs: promscrape.streamParse: "true" # Do not store original labels in vmagent's memory by default. This reduces the amount of memory used by vmagent @@ -767,7 +765,6 @@ defaultDatasources: allowCrossNamespaceImport: false victoriametrics: # -- Create per replica prometheus compatible datasource - perReplica: false # -- List of prometheus compatible datasource configurations. # VM `url` will be added to each of them in templates. 
datasources: @@ -784,7 +781,6 @@ defaultDatasources: # Alertmanager generated `url` will be added to each datasource in template if alertmanager is enabled alertmanager: # -- Create per replica alertmanager compatible datasource - perReplica: false datasources: - name: Alertmanager access: proxy @@ -921,6 +917,11 @@ prometheus-node-exporter: - action: drop source_labels: [mountpoint] regex: "/var/lib/kubelet/pods.+" + - action: replace + source_labels: [__name__, instance] + regex: "node_uname_info;([^:]+):.+" + target_label: nodename + replacement: "$1" # -- kube-state-metrics dependency chart configuration. For possible values check [here](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-state-metrics/values.yaml) kube-state-metrics: enabled: true diff --git a/otc/observability.buildth.ing/stacks/otc/cert-manger.yaml b/otc/observability.buildth.ing/stacks/otc/cert-manger.yaml index 3ee7573..ea4125e 100644 --- a/otc/observability.buildth.ing/stacks/otc/cert-manger.yaml +++ b/otc/observability.buildth.ing/stacks/otc/cert-manger.yaml @@ -20,7 +20,7 @@ spec: sources: - chart: cert-manager repoURL: https://charts.jetstack.io - targetRevision: v1.17.2 + targetRevision: v1.19.5 helm: valueFiles: - $values/otc/observability.buildth.ing/stacks/otc/cert-manager/values.yaml diff --git a/otc/observability.buildth.ing/stacks/otc/ingress-nginx.yaml b/otc/observability.buildth.ing/stacks/otc/ingress-nginx.yaml index db06173..3ddf6a2 100644 --- a/otc/observability.buildth.ing/stacks/otc/ingress-nginx.yaml +++ b/otc/observability.buildth.ing/stacks/otc/ingress-nginx.yaml @@ -20,7 +20,7 @@ spec: sources: - repoURL: https://github.com/kubernetes/ingress-nginx.git path: charts/ingress-nginx - targetRevision: helm-chart-4.12.1 + targetRevision: helm-chart-4.15.1 helm: valueFiles: - $values/otc/observability.buildth.ing/stacks/otc/ingress-nginx/values.yaml