diff --git a/.forgejo/workflows/build-secrets-backup-image.yml b/.forgejo/workflows/build-secrets-backup-image.yml
new file mode 100644
index 0000000..4f5185f
--- /dev/null
+++ b/.forgejo/workflows/build-secrets-backup-image.yml
@@ -0,0 +1,47 @@
+# Builds the secrets-backup image from build/secrets-backup/ and pushes it to
+# the edp.buildth.ing package registry (version tag plus `latest`).
+name: Build secrets-backup image
+
+on:
+  push:
+    paths:
+      - 'build/secrets-backup/Dockerfile'
+    branches:
+      - main
+  workflow_dispatch:
+
+jobs:
+  build-and-push:
+    runs-on: ubuntu-latest
+    env:
+      # Single source of truth for the image coordinates. Keep VERSION in sync
+      # with the tag referenced by the secrets-backup CronJob manifest.
+      IMAGE: edp.buildth.ing/devfw-cicd/secrets-backup
+      VERSION: '1.0.1'
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Log in to registry
+        env:
+          # Hand the token to the shell as an environment variable instead of
+          # expanding it into the script text, so quotes, `$` or backticks in
+          # the secret cannot break or alter the command.
+          PACKAGES_TOKEN: ${{ secrets.PACKAGES_TOKEN }}
+        run: |
+          printf '%s' "${PACKAGES_TOKEN}" | \
+            docker login edp.buildth.ing \
+              -u "${{ env.FORGEJO_REPOSITORY_OWNER }}" \
+              --password-stdin
+
+      - name: Build image
+        run: |
+          docker build \
+            -t "${IMAGE}:${VERSION}" \
+            -t "${IMAGE}:latest" \
+            build/secrets-backup/
+
+      - name: Push image
+        run: |
+          docker push "${IMAGE}:${VERSION}"
+          docker push "${IMAGE}:latest"
diff --git a/build/secrets-backup/Dockerfile b/build/secrets-backup/Dockerfile
new file mode 100644
index 0000000..fa560d7
--- /dev/null
+++ b/build/secrets-backup/Dockerfile
@@ -0,0 +1,3 @@
+FROM alpine/k8s:1.32.0
+# No extra packages needed — kubectl and aws CLI are bundled in alpine/k8s
+# OBS SSE-KMS handles encryption at rest; no openssl required
diff --git a/template/registry/ci-sizer.yaml b/template/registry/ci-sizer.yaml
new file mode 100644
index 0000000..ce415d6
--- /dev/null
+++ b/template/registry/ci-sizer.yaml
@@ -0,0 +1,24 @@
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: ci-sizer-reg
+  namespace: argocd
+  labels:
+    env: dev
+  finalizers:
+    - resources-finalizer.argocd.argoproj.io
+spec:
+  destination:
+    name: in-cluster
+    namespace: argocd
+  source:
+    path: "{{{ .Env.CLIENT_REPO_ID }}}/{{{ .Env.DOMAIN }}}/stacks/ci-sizer"
+    repoURL: "https://{{{ .Env.CLIENT_REPO_DOMAIN }}}/{{{ .Env.CLIENT_REPO_ORG_NAME }}}"
+    targetRevision: HEAD
+  
project: default + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true diff --git a/template/registry/coder.yaml b/template/registry/coder.yaml new file mode 100644 index 0000000..40cfffb --- /dev/null +++ b/template/registry/coder.yaml @@ -0,0 +1,24 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: coder-reg + namespace: argocd + labels: + env: dev + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + destination: + name: in-cluster + namespace: argocd + source: + path: "{{{ .Env.CLIENT_REPO_ID }}}/{{{ .Env.DOMAIN }}}/stacks/coder" + repoURL: "https://{{{ .Env.CLIENT_REPO_DOMAIN }}}/{{{ .Env.CLIENT_REPO_ORG_NAME }}}" + targetRevision: HEAD + project: default + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true diff --git a/template/registry/docs.yaml b/template/registry/docs.yaml new file mode 100644 index 0000000..9d88777 --- /dev/null +++ b/template/registry/docs.yaml @@ -0,0 +1,24 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: docs-reg + namespace: argocd + labels: + env: dev + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + destination: + name: in-cluster + namespace: argocd + source: + path: argocd-stack + repoURL: "https://edp.buildth.ing/DevFW-CICD/website-and-documentation" + targetRevision: HEAD + project: default + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true diff --git a/template/registry/garm.yaml b/template/registry/garm.yaml new file mode 100644 index 0000000..3b9a08c --- /dev/null +++ b/template/registry/garm.yaml @@ -0,0 +1,24 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: garm-reg + namespace: argocd + labels: + env: dev + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + destination: + name: in-cluster + namespace: argocd + source: + path: "{{{ .Env.CLIENT_REPO_ID }}}/{{{ .Env.DOMAIN 
}}}/stacks/garm" + repoURL: "https://{{{ .Env.CLIENT_REPO_DOMAIN }}}/{{{ .Env.CLIENT_REPO_ORG_NAME }}}" + targetRevision: HEAD + project: default + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true diff --git a/template/registry/terralist.yaml b/template/registry/terralist.yaml new file mode 100644 index 0000000..167345d --- /dev/null +++ b/template/registry/terralist.yaml @@ -0,0 +1,24 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: terralist-reg + namespace: argocd + labels: + env: dev + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + destination: + name: in-cluster + namespace: argocd + source: + path: "{{{ .Env.CLIENT_REPO_ID }}}/{{{ .Env.DOMAIN }}}/stacks/terralist" + repoURL: "https://{{{ .Env.CLIENT_REPO_DOMAIN }}}/{{{ .Env.CLIENT_REPO_ORG_NAME }}}" + targetRevision: HEAD + project: default + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true diff --git a/template/stacks/ci-sizer/gitlab-webhook.yaml b/template/stacks/ci-sizer/gitlab-webhook.yaml new file mode 100644 index 0000000..c92cae5 --- /dev/null +++ b/template/stacks/ci-sizer/gitlab-webhook.yaml @@ -0,0 +1,29 @@ +# Optional: GitLab CI integration +# Only hydrate this app for clusters that run GitLab Runner. +# For Forgejo/GitHub-only deployments, omit this app from stacks-instances. 
+# See: ci-sizer/docs/deployment-modes.md +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: gitlab-sizer-webhook + namespace: argocd + labels: + env: dev + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + project: default + syncPolicy: + automated: + selfHeal: true + syncOptions: + - CreateNamespace=true + retry: + limit: -1 + destination: + name: in-cluster + namespace: ci-sizer + source: + repoURL: https://{{{ .Env.CLIENT_REPO_DOMAIN }}}/{{{ .Env.CLIENT_REPO_ORG_NAME }}} + targetRevision: HEAD + path: "{{{ .Env.CLIENT_REPO_ID }}}/{{{ .Env.DOMAIN }}}/stacks/ci-sizer/gitlab-webhook" diff --git a/template/stacks/ci-sizer/gitlab-webhook/certificates.yaml b/template/stacks/ci-sizer/gitlab-webhook/certificates.yaml new file mode 100644 index 0000000..ee1fece --- /dev/null +++ b/template/stacks/ci-sizer/gitlab-webhook/certificates.yaml @@ -0,0 +1,27 @@ +# Self-signed Issuer for webhook TLS. +# For production, replace with a ClusterIssuer backed by a real CA. +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: selfsigned-issuer +spec: + selfSigned: {} +--- +# cert-manager Certificate for the webhook TLS. +# The resulting Secret (gitlab-sizer-webhook-tls) is mounted into the webhook pod. +# cert-manager also injects the CA into the MutatingWebhookConfiguration via the +# cert-manager.io/inject-ca-from annotation. 
+apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: gitlab-sizer-webhook-cert +spec: + secretName: gitlab-sizer-webhook-tls + issuerRef: + name: selfsigned-issuer + kind: Issuer + dnsNames: + - gitlab-sizer-webhook.ci-sizer.svc + - gitlab-sizer-webhook.ci-sizer.svc.cluster.local + duration: 8760h + renewBefore: 720h diff --git a/template/stacks/ci-sizer/gitlab-webhook/deployment.yaml b/template/stacks/ci-sizer/gitlab-webhook/deployment.yaml new file mode 100644 index 0000000..0b99859 --- /dev/null +++ b/template/stacks/ci-sizer/gitlab-webhook/deployment.yaml @@ -0,0 +1,141 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: gitlab-sizer-webhook +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: gitlab-sizer-webhook +rules: + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: gitlab-sizer-webhook +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: gitlab-sizer-webhook +subjects: + - kind: ServiceAccount + name: gitlab-sizer-webhook + namespace: ci-sizer +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: gitlab-sizer-webhook + labels: + app: gitlab-sizer-webhook +spec: + replicas: 2 + selector: + matchLabels: + app: gitlab-sizer-webhook + template: + metadata: + labels: + app: gitlab-sizer-webhook + spec: + serviceAccountName: gitlab-sizer-webhook + securityContext: + runAsNonRoot: true + runAsUser: 65534 + runAsGroup: 65534 + seccompProfile: + type: RuntimeDefault + containers: + - name: webhook + image: edp.buildth.ing/devfw-cicd/gitlab-webhook-edge-connect:latest + imagePullPolicy: Always + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + ports: + - containerPort: 8443 + protocol: TCP + args: + - --listen-addr=:8443 + - --tls-cert-file=/etc/webhook/tls/tls.crt + - 
--tls-key-file=/etc/webhook/tls/tls.key + - --sizer-url=http://sizer-receiver.ci-sizer.svc:8080 + - --sizer-sidecar-image=edp.buildth.ing/devfw-cicd/ci-sizer-collector:latest + env: + - name: WEBHOOK_SIZER_READ_TOKEN + valueFrom: + secretKeyRef: + name: gitlab-sizer-webhook-tokens + key: sizer-read-token + - name: WEBHOOK_SIZER_PUSH_TOKEN + valueFrom: + secretKeyRef: + name: gitlab-sizer-webhook-tokens + key: sizer-push-token + - name: HTTP_PROXY + valueFrom: + configMapKeyRef: + name: gitlab-sizer-webhook-config + key: HTTP_PROXY + optional: true + - name: HTTPS_PROXY + valueFrom: + configMapKeyRef: + name: gitlab-sizer-webhook-config + key: HTTPS_PROXY + optional: true + - name: NO_PROXY + valueFrom: + configMapKeyRef: + name: gitlab-sizer-webhook-config + key: NO_PROXY + optional: true + volumeMounts: + - name: webhook-tls + mountPath: /etc/webhook/tls + readOnly: true + livenessProbe: + httpGet: + path: /healthz + port: 8443 + scheme: HTTPS + initialDelaySeconds: 5 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /healthz + port: 8443 + scheme: HTTPS + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 200m + memory: 128Mi + volumes: + - name: webhook-tls + secret: + secretName: gitlab-sizer-webhook-tls +--- +apiVersion: v1 +kind: Service +metadata: + name: gitlab-sizer-webhook + labels: + app: gitlab-sizer-webhook +spec: + selector: + app: gitlab-sizer-webhook + ports: + - port: 443 + targetPort: 8443 + protocol: TCP diff --git a/template/stacks/ci-sizer/gitlab-webhook/webhook-config.yaml b/template/stacks/ci-sizer/gitlab-webhook/webhook-config.yaml new file mode 100644 index 0000000..72aea4a --- /dev/null +++ b/template/stacks/ci-sizer/gitlab-webhook/webhook-config.yaml @@ -0,0 +1,30 @@ +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: gitlab-sizer-webhook + annotations: + cert-manager.io/inject-ca-from: ci-sizer/gitlab-sizer-webhook-cert 
+webhooks: + - name: gitlab-sizer-webhook.ci-sizer.svc + admissionReviewVersions: ["v1"] + sideEffects: NoneOnDryRun + failurePolicy: Ignore + timeoutSeconds: 5 + reinvocationPolicy: Never + clientConfig: + service: + name: gitlab-sizer-webhook + namespace: ci-sizer + path: /mutate + rules: + - apiGroups: [""] + apiVersions: ["v1"] + operations: ["CREATE"] + resources: ["pods"] + namespaceSelector: + matchLabels: + ci-sizer.devfw.io/watch: "true" + objectSelector: + matchExpressions: + - key: job.runner.gitlab.com/pod + operator: Exists diff --git a/template/stacks/ci-sizer/sizer-receiver.yaml b/template/stacks/ci-sizer/sizer-receiver.yaml new file mode 100644 index 0000000..1fc7279 --- /dev/null +++ b/template/stacks/ci-sizer/sizer-receiver.yaml @@ -0,0 +1,29 @@ +# Required: CI Sizer receiver +# Always deploy this — it stores metrics and computes sizing recommendations. +# Works standalone or with GARM (Forgejo/GitHub) and/or GitLab webhook. +# See: ci-sizer/docs/deployment-modes.md +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: sizer-receiver + namespace: argocd + labels: + env: dev + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + project: default + syncPolicy: + automated: + selfHeal: true + syncOptions: + - CreateNamespace=true + retry: + limit: -1 + destination: + name: in-cluster + namespace: ci-sizer + source: + repoURL: https://{{{ .Env.CLIENT_REPO_DOMAIN }}}/{{{ .Env.CLIENT_REPO_ORG_NAME }}} + targetRevision: HEAD + path: "{{{ .Env.CLIENT_REPO_ID }}}/{{{ .Env.DOMAIN }}}/stacks/ci-sizer/sizer-receiver" diff --git a/template/stacks/ci-sizer/sizer-receiver/deployment.yaml b/template/stacks/ci-sizer/sizer-receiver/deployment.yaml new file mode 100644 index 0000000..91d0bce --- /dev/null +++ b/template/stacks/ci-sizer/sizer-receiver/deployment.yaml @@ -0,0 +1,128 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: sizer-receiver + labels: + app: sizer-receiver +spec: + strategy: + type: Recreate + replicas: 1 + 
selector: + matchLabels: + app: sizer-receiver + template: + metadata: + labels: + app: sizer-receiver + spec: + securityContext: + fsGroup: 65534 + containers: + - name: receiver + image: edp.buildth.ing/devfw-cicd/ci-sizer-receiver:latest + imagePullPolicy: Always + args: + - --db=/data/metrics.db + ports: + - name: http + containerPort: 8080 + protocol: TCP + env: + - name: RECEIVER_READ_TOKEN + valueFrom: + secretKeyRef: + name: sizer-tokens + key: read-token + - name: RECEIVER_HMAC_KEY + valueFrom: + secretKeyRef: + name: sizer-tokens + key: hmac-key +{{{- if index .Env "DOMAIN_GITEA" }}} + - name: GARM_URL + value: "http://garm.garm.svc:80" + - name: GARM_USER + value: "admin" + - name: GARM_PASSWORD + valueFrom: + secretKeyRef: + name: garm-fixed-credentials + key: admin_password + {{{- end }}} + - name: RECEIVER_OIDC_ISSUER + value: "https://dex.{{{ .Env.DOMAIN }}}" + - name: RECEIVER_OIDC_CLIENT_ID + value: "ci-sizer" + - name: RECEIVER_OIDC_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: sizer-oidc-client + key: client-secret + - name: RECEIVER_OIDC_REDIRECT_URI + value: "https://sizer.{{{ .Env.DOMAIN }}}/ui/callback" + - name: RECEIVER_SESSION_TTL + value: "12h" + - name: RECEIVER_ALLOWED_ORG + value: "{{{ getenv "SIZER_ALLOWED_ORG" "DevFW-CICD" }}}" + - name: RECEIVER_CPU_SIZING_MODE + value: "observe" + - name: RECEIVER_MEMORY_QOS + value: "guaranteed" + volumeMounts: + - name: data + mountPath: /data + livenessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 30 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 2 + periodSeconds: 10 + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 128Mi + volumes: + - name: data + persistentVolumeClaim: + claimName: sizer-receiver-data +--- +apiVersion: v1 +kind: Service +metadata: + name: sizer-receiver + labels: + app: sizer-receiver +spec: + selector: + app: sizer-receiver + ports: + - name: http + port: 8080 
+      targetPort: http
+      protocol: TCP
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: sizer-receiver-data
+  labels:
+    app: sizer-receiver
+  annotations:
+    everest.io/disk-volume-type: GPSSD
+spec:
+  storageClassName: csi-disk
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 1Gi
diff --git a/template/stacks/ci-sizer/sizer-receiver/ingress.yaml b/template/stacks/ci-sizer/sizer-receiver/ingress.yaml
new file mode 100644
index 0000000..94e77c3
--- /dev/null
+++ b/template/stacks/ci-sizer/sizer-receiver/ingress.yaml
@@ -0,0 +1,45 @@
+# Exposes the sizer-receiver Service on sizer.<domain> and ci-sizer.<domain>.
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  annotations:
+    nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
+    cert-manager.io/cluster-issuer: main
+{{{ if eq .Env.CLUSTER_TYPE "osc" }}}
+    dns.gardener.cloud/class: garden
+    # List every host from spec.rules so each one gets a DNS record.
+    dns.gardener.cloud/dnsnames: sizer.{{{ .Env.DOMAIN }}},ci-sizer.{{{ .Env.DOMAIN }}}
+    dns.gardener.cloud/ttl: "600"
+{{{ end }}}
+  name: sizer-receiver
+  namespace: ci-sizer
+spec:
+  ingressClassName: nginx
+  rules:
+    - host: sizer.{{{ .Env.DOMAIN }}}
+      http:
+        paths:
+          - backend:
+              service:
+                name: sizer-receiver
+                port:
+                  number: 8080
+            path: /
+            pathType: Prefix
+    - host: ci-sizer.{{{ .Env.DOMAIN }}}
+      http:
+        paths:
+          - backend:
+              service:
+                name: sizer-receiver
+                port:
+                  number: 8080
+            path: /
+            pathType: Prefix
+  tls:
+    # Every host in rules must also be listed here; a host missing from tls is
+    # not included in the certificate requested for sizer-receiver-tls.
+    - hosts:
+        - sizer.{{{ .Env.DOMAIN }}}
+        - ci-sizer.{{{ .Env.DOMAIN }}}
+      secretName: sizer-receiver-tls
diff --git a/template/stacks/coder/coder.yaml b/template/stacks/coder/coder.yaml
new file mode 100644
index 0000000..a0eaa9c
--- /dev/null
+++ b/template/stacks/coder/coder.yaml
@@ -0,0 +1,32 @@
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: coder
+  namespace: argocd
+  labels:
+    env: dev
+spec:
+  project: default
+  syncPolicy:
+    automated:
+      selfHeal: true
+    syncOptions:
+      - CreateNamespace=true
+    retry:
+      limit: -1
+  destination:
+    name: in-cluster
+    namespace: coder
+  sources:
+    - repoURL: https://helm.coder.com/v2
+      chart: coder
+      targetRevision: 2.28.3
+      helm:
+        
valueFiles: + - $values/{{{ .Env.CLIENT_REPO_ID }}}/{{{ .Env.DOMAIN }}}/stacks/coder/coder/values.yaml + - repoURL: https://{{{ .Env.CLIENT_REPO_DOMAIN }}}/{{{ .Env.CLIENT_REPO_ORG_NAME }}} + targetRevision: HEAD + ref: values + - repoURL: https://{{{ .Env.CLIENT_REPO_DOMAIN }}}/{{{ .Env.CLIENT_REPO_ORG_NAME }}} + targetRevision: HEAD + path: "{{{ .Env.CLIENT_REPO_ID }}}/{{{ .Env.DOMAIN }}}/stacks/coder/coder/manifests" diff --git a/template/stacks/coder/coder/manifests/postgres.yaml b/template/stacks/coder/coder/manifests/postgres.yaml new file mode 100644 index 0000000..cae4b97 --- /dev/null +++ b/template/stacks/coder/coder/manifests/postgres.yaml @@ -0,0 +1,38 @@ +--- +apiVersion: postgresql.cnpg.io/v1 +kind: Cluster +metadata: + name: coder-db + namespace: coder +spec: + instances: 1 + primaryUpdateStrategy: unsupervised + resources: + requests: + memory: "1Gi" + cpu: "1" + limits: + memory: "1Gi" + cpu: "1" + managed: + roles: + - name: coder + createdb: true + login: true + passwordSecret: + name: coder-db-user + storage: + size: 10Gi + storageClass: csi-disk +--- +apiVersion: postgresql.cnpg.io/v1 +kind: Database +metadata: + name: coder + namespace: coder +spec: + cluster: + name: coder-db + name: coder + owner: coder +--- diff --git a/template/stacks/coder/coder/values.yaml b/template/stacks/coder/coder/values.yaml new file mode 100644 index 0000000..df4334e --- /dev/null +++ b/template/stacks/coder/coder/values.yaml @@ -0,0 +1,61 @@ +coder: + # You can specify any environment variables you'd like to pass to Coder + # here. Coder consumes environment variables listed in + # `coder server --help`, and these environment variables are also passed + # to the workspace provisioner (so you can consume them in your Terraform + # templates for auth keys etc.). 
+ # + # Please keep in mind that you should not set `CODER_HTTP_ADDRESS`, + # `CODER_TLS_ENABLE`, `CODER_TLS_CERT_FILE` or `CODER_TLS_KEY_FILE` as + # they are already set by the Helm chart and will cause conflicts. + env: + - name: CODER_ACCESS_URL + value: https://coder.{{{ .Env.DOMAIN_GITEA }}} + - name: CODER_PG_CONNECTION_URL + valueFrom: + secretKeyRef: + # The coder-db-user secret must provide a "url" key holding the + # Postgres connection URL, e.g.: + # postgres://coder:password@postgres:5432/coder?sslmode=disable + name: coder-db-user + key: url + # For production deployments, we recommend configuring your own GitHub + # OAuth2 provider and disabling the default one. + - name: CODER_OAUTH2_GITHUB_DEFAULT_PROVIDER_ENABLE + value: "false" + - name: EDGE_CONNECT_ENDPOINT + valueFrom: + secretKeyRef: + name: edge-credential + key: endpoint + - name: EDGE_CONNECT_USERNAME + valueFrom: + secretKeyRef: + name: edge-credential + key: username + - name: EDGE_CONNECT_PASSWORD + valueFrom: + secretKeyRef: + name: edge-credential + key: password + + # (Optional) For production deployments the access URL should be set. + # If you're just trying Coder, access the dashboard via the service IP. 
+ # - name: CODER_ACCESS_URL + # value: "https://coder.example.com" + + #tls: + # secretNames: + # - my-tls-secret-name + service: + type: ClusterIP + + ingress: + enable: true + className: nginx + host: coder.{{{ .Env.DOMAIN_GITEA }}} + annotations: + cert-manager.io/cluster-issuer: main + tls: + enable: true + secretName: coder-tls-secret diff --git a/template/stacks/core/argocd.yaml b/template/stacks/core/argocd.yaml index cb1e886..1b6df68 100644 --- a/template/stacks/core/argocd.yaml +++ b/template/stacks/core/argocd.yaml @@ -18,12 +18,12 @@ spec: name: in-cluster namespace: argocd sources: - - repoURL: https://{{{ .Env.CLIENT_REPO_DOMAIN }}}/DevFW-CICD/argocd-helm.git + - repoURL: https://github.com/argoproj/argo-helm.git path: charts/argo-cd # TODO: RIRE Can be updated when https://github.com/argoproj/argo-cd/issues/20790 is fixed and merged # As logout make problems, it is suggested to switch from path based routing to an own argocd domain, # similar to the CNOE amazon reference implementation and in our case, Forgejo - targetRevision: argo-cd-7.8.28-depends + targetRevision: argo-cd-9.4.6 helm: valueFiles: - $values/{{{ .Env.CLIENT_REPO_ID }}}/{{{ .Env.DOMAIN }}}/stacks/core/argocd/values.yaml @@ -32,4 +32,4 @@ spec: ref: values - repoURL: https://{{{ .Env.CLIENT_REPO_DOMAIN }}}/{{{ .Env.CLIENT_REPO_ORG_NAME }}} targetRevision: HEAD - path: "{{{ .Env.CLIENT_REPO_ID }}}/{{{ .Env.DOMAIN }}}/stacks/core/argocd/manifests" \ No newline at end of file + path: "{{{ .Env.CLIENT_REPO_ID }}}/{{{ .Env.DOMAIN }}}/stacks/core/argocd/manifests" diff --git a/template/stacks/core/argocd/values.yaml b/template/stacks/core/argocd/values.yaml index dfb7f96..f2495ec 100644 --- a/template/stacks/core/argocd/values.yaml +++ b/template/stacks/core/argocd/values.yaml @@ -5,6 +5,16 @@ configs: params: server.insecure: true cm: + oidc.config: | + name: FORGEJO + issuer: https://{{{ .Env.DOMAIN_DEX }}} + clientID: controller-argocd-dex + clientSecret: $dex-argo-client:clientSecret + 
requestedScopes: + - openid + - profile + - email + - groups application.resourceTrackingMethod: annotation timeout.reconciliation: 60s resource.exclusions: | @@ -18,10 +28,9 @@ configs: - CiliumIdentity clusters: - "*" - accounts.provider-argocd: apiKey url: https://{{{ .Env.DOMAIN_ARGOCD }}} rbac: - policy.csv: 'g, provider-argocd, role:admin' + policy.csv: 'g, DevFW, role:admin' tls: certificates: @@ -31,3 +40,19 @@ notifications: dex: enabled: false + +controller: + metrics: + enabled: true + +server: + metrics: + enabled: true + +repoServer: + metrics: + enabled: true + +applicationSet: + metrics: + enabled: true diff --git a/template/stacks/core/cloudnative-pg.yaml b/template/stacks/core/cloudnative-pg.yaml new file mode 100644 index 0000000..fa832d9 --- /dev/null +++ b/template/stacks/core/cloudnative-pg.yaml @@ -0,0 +1,30 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: cloudnative-pg + namespace: argocd + labels: + env: dev +spec: + project: default + syncPolicy: + automated: + selfHeal: true + syncOptions: + - CreateNamespace=true + - ServerSideApply=true + retry: + limit: -1 + destination: + name: in-cluster + namespace: cloudnative-pg + sources: + - repoURL: https://cloudnative-pg.github.io/charts + chart: cloudnative-pg + targetRevision: 0.26.1 + helm: + valueFiles: + - $values/{{{ .Env.CLIENT_REPO_ID }}}/{{{ .Env.DOMAIN }}}/stacks/core/cloudnative-pg/values.yaml + - repoURL: https://{{{ .Env.CLIENT_REPO_DOMAIN }}}/{{{ .Env.CLIENT_REPO_ORG_NAME }}} + targetRevision: HEAD + ref: values diff --git a/template/stacks/core/cloudnative-pg/values.yaml b/template/stacks/core/cloudnative-pg/values.yaml new file mode 100644 index 0000000..cfebbfc --- /dev/null +++ b/template/stacks/core/cloudnative-pg/values.yaml @@ -0,0 +1 @@ +# No need for values here. 
diff --git a/template/stacks/core/dex.yaml b/template/stacks/core/dex.yaml new file mode 100644 index 0000000..d41c0bf --- /dev/null +++ b/template/stacks/core/dex.yaml @@ -0,0 +1,29 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: dex + namespace: argocd + labels: + env: dev +spec: + project: default + syncPolicy: + automated: + selfHeal: true + syncOptions: + - CreateNamespace=true + retry: + limit: -1 + destination: + name: in-cluster + namespace: dex + sources: + - repoURL: https://charts.dexidp.io + chart: dex + targetRevision: 0.23.0 + helm: + valueFiles: + - $values/{{{ .Env.CLIENT_REPO_ID }}}/{{{ .Env.DOMAIN }}}/stacks/core/dex/values.yaml + - repoURL: https://{{{ .Env.CLIENT_REPO_DOMAIN }}}/{{{ .Env.CLIENT_REPO_ORG_NAME }}} + targetRevision: HEAD + ref: values diff --git a/template/stacks/core/dex/values.yaml b/template/stacks/core/dex/values.yaml new file mode 100644 index 0000000..1802203 --- /dev/null +++ b/template/stacks/core/dex/values.yaml @@ -0,0 +1,86 @@ +ingress: + enabled: true + className: nginx + annotations: + cert-manager.io/cluster-issuer: main + hosts: + - host: {{{ .Env.DOMAIN_DEX }}} + paths: + - path: / + pathType: Prefix + tls: + - hosts: + - {{{ .Env.DOMAIN_DEX }}} + secretName: dex-cert + +envVars: + - name: FORGEJO_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: dex-forgejo-client + key: clientSecret + - name: FORGEJO_CLIENT_ID + valueFrom: + secretKeyRef: + name: dex-forgejo-client + key: clientID + - name: OIDC_DEX_GRAFANA_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: dex-grafana-client + key: clientSecret + - name: OIDC_DEX_ARGO_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: dex-argo-client + key: clientSecret + - name: FORGEJO_RUNNER_SIZER_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: dex-sizer-client + key: clientSecret + - name: LOG_LEVEL + value: debug + +config: + # Set it to a valid URL + issuer: https://{{{ .Env.DOMAIN_DEX }}} + + # See https://dexidp.io/docs/storage/ for more 
options
+  storage:
+    type: memory
+
+  oauth2:
+    skipApprovalScreen: true
+    alwaysShowLoginScreen: false
+
+  connectors:
+    - type: gitea
+      id: gitea
+      name: Forgejo
+      config:
+        clientID: "$FORGEJO_CLIENT_ID"
+        clientSecret: "$FORGEJO_CLIENT_SECRET"
+        redirectURI: https://{{{ .Env.DOMAIN_DEX }}}/callback
+        baseURL: https://edp.buildth.ing
+        # loadAllGroups: true
+        orgs:
+          - name: DevFW
+  enablePasswordDB: false
+
+  staticClients:
+    - id: controller-argocd-dex
+      name: ArgoCD Client
+      redirectURIs:
+        - "https://{{{ .Env.DOMAIN_ARGOCD }}}/auth/callback"
+      secretEnv: "OIDC_DEX_ARGO_CLIENT_SECRET"
+    - id: grafana
+      redirectURIs:
+        - "https://{{{ .Env.DOMAIN_GRAFANA }}}/login/generic_oauth"
+      name: "Grafana"
+      secretEnv: "OIDC_DEX_GRAFANA_CLIENT_SECRET"
+    - id: ci-sizer
+      name: "CI Sizer"
+      redirectURIs:
+        - "https://sizer.{{{ .Env.DOMAIN }}}/ui/callback"
+      secretEnv: "FORGEJO_RUNNER_SIZER_CLIENT_SECRET"
diff --git a/template/stacks/core/secrets-backup.yaml b/template/stacks/core/secrets-backup.yaml
new file mode 100644
index 0000000..08947ae
--- /dev/null
+++ b/template/stacks/core/secrets-backup.yaml
@@ -0,0 +1,23 @@
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: secrets-backup
+  namespace: argocd
+  labels:
+    env: dev
+spec:
+  project: default
+  syncPolicy:
+    automated:
+      selfHeal: true
+    syncOptions:
+      - CreateNamespace=true
+    retry:
+      limit: -1
+  destination:
+    name: in-cluster
+    namespace: gitea
+  sources:
+    - repoURL: https://{{{ .Env.CLIENT_REPO_DOMAIN }}}/{{{ .Env.CLIENT_REPO_ORG_NAME }}}
+      targetRevision: HEAD
+      path: "{{{ .Env.CLIENT_REPO_ID }}}/{{{ .Env.DOMAIN }}}/stacks/core/secrets-backup/manifests"
diff --git a/template/stacks/core/secrets-backup/manifests/secrets-backup-cronjob.yaml b/template/stacks/core/secrets-backup/manifests/secrets-backup-cronjob.yaml
new file mode 100644
index 0000000..5ea260d
--- /dev/null
+++ b/template/stacks/core/secrets-backup/manifests/secrets-backup-cronjob.yaml
@@ -0,0 +1,139 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: secrets-backup
+  namespace: gitea
+---
+# Read-only access to Secrets. Defined once as a ClusterRole so it can be reused,
+# but only ever granted per namespace through the RoleBindings below.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: secrets-backup-reader
+rules:
+  - apiGroups: [""]
+    resources: ["secrets"]
+    verbs: ["get", "list"]
+---
+# One RoleBinding per namespace listed in NAMESPACES of the backup script; keep
+# both lists in sync. Binding the ClusterRole through a RoleBinding limits the
+# grant to that namespace instead of every Secret in the cluster.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: secrets-backup-reader
+  namespace: argocd
+subjects:
+  - kind: ServiceAccount
+    name: secrets-backup
+    namespace: gitea
+roleRef:
+  kind: ClusterRole
+  name: secrets-backup-reader
+  apiGroup: rbac.authorization.k8s.io
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: secrets-backup-reader
+  namespace: cert-manager
+subjects:
+  - kind: ServiceAccount
+    name: secrets-backup
+    namespace: gitea
+roleRef:
+  kind: ClusterRole
+  name: secrets-backup-reader
+  apiGroup: rbac.authorization.k8s.io
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: secrets-backup-reader
+  namespace: external-secrets
+subjects:
+  - kind: ServiceAccount
+    name: secrets-backup
+    namespace: gitea
+roleRef:
+  kind: ClusterRole
+  name: secrets-backup-reader
+  apiGroup: rbac.authorization.k8s.io
+---
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: secrets-backup
+  namespace: gitea
+spec:
+  schedule: "30 3 * * *"
+  concurrencyPolicy: "Forbid"
+  successfulJobsHistoryLimit: 5
+  failedJobsHistoryLimit: 5
+  startingDeadlineSeconds: 600 # 10 minutes
+  jobTemplate:
+    spec:
+      activeDeadlineSeconds: 900
+      backoffLimit: 2
+      ttlSecondsAfterFinished: 259200
+      template:
+        spec:
+          serviceAccountName: secrets-backup
+          containers:
+            - name: secrets-backup
+              image: edp.buildth.ing/devfw-cicd/secrets-backup:1.0.1
+              imagePullPolicy: IfNotPresent
+              env:
+                - name: AWS_ACCESS_KEY_ID
+                  valueFrom:
+                    secretKeyRef:
+                      name: forgejo-cloud-credentials
+                      key: access-key
+                - name: AWS_SECRET_ACCESS_KEY
+                  valueFrom:
+                    secretKeyRef:
+                      name: forgejo-cloud-credentials
+                      key: secret-key
+                - name: SOURCE_BUCKET
+                  valueFrom:
+                    secretKeyRef:
+                      name: forgejo-cloud-credentials
+                      key: bucket-name
+                - name: OBS_ENDPOINT
+                  value: "obs.eu-de.otc.t-systems.com"
+              command:
+                - /bin/sh
+                - -c
+                - |
+                  set -euo pipefail
+
+                  TIMESTAMP=$(date +%Y%m%d-%H%M%S)
+                  BACKUP_DIR="/tmp/secrets-backup-${TIMESTAMP}"
+                  # Must match the namespaces that have a secrets-backup-reader RoleBinding.
+                  NAMESPACES="argocd cert-manager external-secrets"
+
+                  mkdir -p "${BACKUP_DIR}"
+
+                  echo "=== Exporting secrets from critical namespaces ==="
+                  for NS in ${NAMESPACES}; do
+                    echo "Exporting namespace: ${NS}"
+                    kubectl get secrets -n "${NS}" \
+                      -o json \
+                      --field-selector 
type!=kubernetes.io/service-account-token \ + > "${BACKUP_DIR}/${NS}-secrets.json" + done + + echo "=== Creating compressed archive ===" + ARCHIVE="${BACKUP_DIR}/secrets-backup-${TIMESTAMP}.tar.gz" + tar -czf "${ARCHIVE}" -C "${BACKUP_DIR}" \ + $(ls "${BACKUP_DIR}"/*.json 2>/dev/null | xargs -n1 basename) + + echo "=== Uploading to OBS (SSE-KMS encryption at rest) ===" + aws s3 cp "${ARCHIVE}" \ + "s3://${SOURCE_BUCKET}/cluster-secrets-backup/${TIMESTAMP}/secrets-backup.tar.gz" \ + --endpoint-url "https://${OBS_ENDPOINT}" + + echo "=== Cleanup ===" + rm -rf "${BACKUP_DIR}" + echo "Backup completed: ${TIMESTAMP}" + restartPolicy: OnFailure diff --git a/template/stacks/forgejo/forgejo-runner/dind-docker.yaml b/template/stacks/forgejo/forgejo-runner/dind-docker.yaml index d9b6cfb..e89d99f 100644 --- a/template/stacks/forgejo/forgejo-runner/dind-docker.yaml +++ b/template/stacks/forgejo/forgejo-runner/dind-docker.yaml @@ -28,7 +28,7 @@ spec: # https://forgejo.org/docs/v1.21/admin/actions/#offline-registration initContainers: - name: runner-register - image: code.forgejo.org/forgejo/runner:6.4.0 + image: code.forgejo.org/forgejo/runner:12.6.4 command: - "sh" - "-c" @@ -39,7 +39,7 @@ spec: --token ${RUNNER_SECRET} \ --name ${RUNNER_NAME} \ --instance ${FORGEJO_INSTANCE_URL} \ - --labels docker:docker://node:20-bookworm,ubuntu-22.04:docker://ghcr.io/catthehacker/ubuntu:act-22.04,ubuntu-latest:docker://ghcr.io/catthehacker/ubuntu:act-22.04 + --labels docker:docker://node:24-bookworm,ubuntu-22.04:docker://ghcr.io/catthehacker/ubuntu:act-22.04,ubuntu-latest:docker://ghcr.io/catthehacker/ubuntu:act-24.04,ubuntu-24.04:docker://ghcr.io/catthehacker/ubuntu:act-24.04 env: - name: RUNNER_NAME valueFrom: @@ -57,8 +57,8 @@ spec: mountPath: /data containers: - name: runner - image: code.forgejo.org/forgejo/runner:6.4.0 - command: + image: code.forgejo.org/forgejo/runner:12.6.4 + command: - "sh" - "-c" - | diff --git a/template/stacks/forgejo/forgejo-server.yaml 
b/template/stacks/forgejo/forgejo-server.yaml index 249976a..ee8b2b2 100644 --- a/template/stacks/forgejo/forgejo-server.yaml +++ b/template/stacks/forgejo/forgejo-server.yaml @@ -18,15 +18,9 @@ spec: name: in-cluster namespace: gitea sources: - - repoURL: https://{{{ .Env.CLIENT_REPO_DOMAIN }}}/DevFW-CICD/forgejo-helm.git + - repoURL: https://code.forgejo.org/forgejo-helm/forgejo-helm.git path: . - # first check out the desired version (example v9.0.0): https://code.forgejo.org/forgejo-helm/forgejo-helm/src/tag/v9.0.0/Chart.yaml - # (note that the chart version is not the same as the forgejo application version, which is specified in the above Chart.yaml file) - # then use the devops pipeline and select development, forgejo and the desired version (example v9.0.0): - # https://{{{ .Env.CLIENT_REPO_DOMAIN }}}/DevFW-CICD/devops-pipelines/actions?workflow=update-helm-depends.yaml&actor=0&status=0 - # finally update the desired version here and include "-depends", it is created by the devops pipeline. - # why do we have an added "-depends" tag? 
it resolves rate limitings when downloading helm OCI dependencies - targetRevision: v12.0.0-depends + targetRevision: v16.2.0 helm: valueFiles: - $values/{{{ .Env.CLIENT_REPO_ID }}}/{{{ .Env.DOMAIN }}}/stacks/forgejo/forgejo-server/values.yaml diff --git a/template/stacks/forgejo/forgejo-server/manifests/forgejo-ingress.yaml b/template/stacks/forgejo/forgejo-server/manifests/forgejo-ingress.yaml index 1caab08..d1d503f 100644 --- a/template/stacks/forgejo/forgejo-server/manifests/forgejo-ingress.yaml +++ b/template/stacks/forgejo/forgejo-server/manifests/forgejo-ingress.yaml @@ -3,7 +3,7 @@ kind: Ingress metadata: annotations: nginx.ingress.kubernetes.io/force-ssl-redirect: "true" - nginx.ingress.kubernetes.io/proxy-body-size: 512m + nginx.ingress.kubernetes.io/proxy-body-size: 5120m cert-manager.io/cluster-issuer: main {{{ if eq .Env.CLUSTER_TYPE "osc" }}} dns.gardener.cloud/class: garden diff --git a/template/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml b/template/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml index e5ea7df..30cadc6 100644 --- a/template/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml +++ b/template/stacks/forgejo/forgejo-server/manifests/forgejo-s3-backup-cronjob.yaml @@ -5,50 +5,58 @@ metadata: namespace: gitea spec: schedule: "0 1 * * *" + concurrencyPolicy: "Forbid" + successfulJobsHistoryLimit: 5 + failedJobsHistoryLimit: 5 + startingDeadlineSeconds: 600 # 10 minutes jobTemplate: spec: + # 4h window: bumped from 2h after Jun 20-21 deadline hit on heavy sync; BackupJobTooSlow alert fires at 5m + activeDeadlineSeconds: 14400 + backoffLimit: 2 + ttlSecondsAfterFinished: 259200 # 3 days template: spec: containers: - - name: rclone - image: rclone/rclone:1.70 - imagePullPolicy: IfNotPresent - env: - - name: SOURCE_BUCKET - valueFrom: - secretKeyRef: - name: forgejo-cloud-credentials - key: bucket-name - - name: AWS_ACCESS_KEY_ID - valueFrom: - secretKeyRef: - name:
forgejo-cloud-credentials - key: access-key - - name: AWS_SECRET_ACCESS_KEY - valueFrom: - secretKeyRef: - name: forgejo-cloud-credentials - key: secret-key - volumeMounts: - - name: rclone-config - mountPath: /config/rclone - readOnly: true - - name: backup-dir - mountPath: /backup - readOnly: false - command: - - /bin/sh - - -c - - | - rclone sync source:/${SOURCE_BUCKET}/packages /backup -v --ignore-checksum + - name: rclone + image: rclone/rclone:1.70 + imagePullPolicy: IfNotPresent + env: + - name: SOURCE_BUCKET + valueFrom: + secretKeyRef: + name: forgejo-cloud-credentials + key: bucket-name + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: forgejo-cloud-credentials + key: access-key + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: forgejo-cloud-credentials + key: secret-key + volumeMounts: + - name: rclone-config + mountPath: /config/rclone + readOnly: true + - name: backup-dir + mountPath: /backup + readOnly: false + command: + - /bin/sh + - -c + - | + rclone sync source:/${SOURCE_BUCKET} /backup -v --ignore-checksum restartPolicy: OnFailure volumes: - - name: rclone-config - secret: - secretName: forgejo-s3-backup - - name: backup-dir - persistentVolumeClaim: - claimName: s3-backup + - name: rclone-config + secret: + secretName: forgejo-s3-backup + - name: backup-dir + persistentVolumeClaim: + claimName: s3-backup --- apiVersion: v1 kind: PersistentVolumeClaim @@ -56,7 +64,7 @@ metadata: name: s3-backup namespace: gitea annotations: - everest.io/disk-volume-type: SATA + everest.io/disk-volume-type: GPSSD everest.io/crypt-key-id: {{{ .Env.PVC_KMS_KEY_ID }}} spec: storageClassName: csi-disk @@ -64,7 +72,7 @@ spec: - ReadWriteOnce resources: requests: - storage: 50Gi + storage: 500Gi --- apiVersion: v1 kind: Secret diff --git a/template/stacks/forgejo/forgejo-server/values.yaml b/template/stacks/forgejo/forgejo-server/values.yaml index a7d7335..38ae176 100644 --- a/template/stacks/forgejo/forgejo-server/values.yaml +++ 
b/template/stacks/forgejo/forgejo-server/values.yaml @@ -1,15 +1,5 @@ -# This is only used for deploying older versions of infra-catalogue where the bucket name is not an output of the terragrunt modules -{{{- define "BUCKET_NAME" -}}} -{{{- if (getenv "FORGEJO_BUCKET_NAME") -}}} -{{{ getenv "FORGEJO_BUCKET_NAME" }}} -{{{- else -}}} -edp-forgejo-{{{ getenv "CLUSTER_ENVIRONMENT" }}} -{{{- end -}}} -{{{- end -}}} - - -# We use recreate to make sure only one instance with one version is running, because Forgejo might break or data gets inconsistant. +# We use recreate to make sure only one instance with one version is running, because Forgejo might break or data gets inconsistent. strategy: type: Recreate @@ -31,7 +21,7 @@ persistence: storageClass: csi-disk annotations: everest.io/crypt-key-id: {{{ .Env.PVC_KMS_KEY_ID }}} - everest.io/disk-volume-type: SATA + everest.io/disk-volume-type: GPSSD test: enabled: false @@ -134,7 +124,7 @@ gitea: MINIO_ENDPOINT: obs.eu-de.otc.t-systems.com:443 STORAGE_TYPE: minio MINIO_LOCATION: eu-de - MINIO_BUCKET: "{{{ template "BUCKET_NAME" }}}" + MINIO_BUCKET: "{{{ getenv "FORGEJO_BUCKET_NAME" }}}" MINIO_USE_SSL: true queue: @@ -147,8 +137,12 @@ gitea: ENABLED: true ADAPTER: redis + security: + GLOBAL_TWO_FACTOR_REQUIREMENT: admin + service: DISABLE_REGISTRATION: true + ENABLE_NOTIFY_MAIL: true other: SHOW_FOOTER_VERSION: false @@ -176,7 +170,7 @@ service: nodePort: 32222 externalTrafficPolicy: Cluster annotations: - kubernetes.io/elb.id: {{{ .Env.LOADBALANCER_ID }}} + kubernetes.io/elb.id: {{{ .Env.LOADBALANCER_ID }}} image: pullPolicy: "IfNotPresent" @@ -184,19 +178,6 @@ image: #tag: "8.0.3" # Adds -rootless suffix to image name # rootless: true - #fullOverride: {{{ getenv "CLIENT_REPO_DOMAIN" }}}/devfw-cicd/edp-forgejo:v1.1.0-edp-v11.0.3 - fullOverride: {{{ getenv "CLIENT_REPO_DOMAIN" }}}/devfw-cicd/edp-forgejo:osctest + fullOverride: {{{ getenv "CLIENT_REPO_DOMAIN" }}}/devfw-cicd/edp-forgejo:{{{ .Env.FORGEJO_IMAGE_TAG }}}
-forgejo: - runner: - enabled: true - image: - tag: latest - # replicas: 3 - config: - runner: - labels: - - docker:docker://node:16-bullseye - - self-hosted:docker://ghcr.io/catthehacker/ubuntu:act-22.04 - - ubuntu-22.04:docker://ghcr.io/catthehacker/ubuntu:act-22.04 - - ubuntu-latest:docker://ghcr.io/catthehacker/ubuntu:act-22.04 +forgejo: {} diff --git a/template/stacks/garm/garm.yaml b/template/stacks/garm/garm.yaml new file mode 100644 index 0000000..53f2972 --- /dev/null +++ b/template/stacks/garm/garm.yaml @@ -0,0 +1,33 @@ +# Default: Forgejo/GitHub Actions runner manager +# Deploys GARM with the ci-sizer provider for automatic sizing + collector injection. +# For GitLab-only deployments, omit this and use gitlab-webhook instead. +# See: ci-sizer/docs/deployment-modes.md +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: garm + namespace: argocd + labels: + env: dev +spec: + project: default + syncPolicy: + automated: + selfHeal: true + syncOptions: + - CreateNamespace=true + retry: + limit: -1 + destination: + name: in-cluster + namespace: garm + sources: + - repoURL: https://edp.buildth.ing/DevFW-CICD/garm-helm + path: charts/garm + targetRevision: v0.0.17 + helm: + valueFiles: + - $values/{{{ .Env.CLIENT_REPO_ID }}}/{{{ .Env.DOMAIN }}}/stacks/garm/garm/values.yaml + - repoURL: https://{{{ .Env.CLIENT_REPO_DOMAIN }}}/{{{ .Env.CLIENT_REPO_ORG_NAME }}} + targetRevision: HEAD + ref: values diff --git a/template/stacks/garm/garm/values.yaml b/template/stacks/garm/garm/values.yaml new file mode 100644 index 0000000..fb30bda --- /dev/null +++ b/template/stacks/garm/garm/values.yaml @@ -0,0 +1,45 @@ +ingress: + enabled: true + className: nginx + annotations: + cert-manager.io/cluster-issuer: main + nginx.ingress.kubernetes.io/backend-protocol: HTTP + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + hosts: + - host: garm.{{{ .Env.DOMAIN_GITEA }}} + paths: + - path: / + pathType: Prefix + tls: + - secretName: garm-net-tls + hosts: + - 
garm.{{{ .Env.DOMAIN_GITEA }}} + +# Credentials and Secrets +credentials: + edgeConnect: + existingSecretName: "edge-credential" + gitea: + url: "https://{{{ .Env.DOMAIN_GITEA }}}" # Required + db: + existingSecretName: garm-fixed-credentials + +image: + repository: {{{ .Env.CLIENT_REPO_DOMAIN }}}/devfw-cicd/garm-forgejo + tag: v0.1.7-forgejo-24 + +providerConfig: + edgeConnect: + organization: edp2 + region: EU + edgeConnectUrl: "https://hub.apps.edge.platform.mg3.mdb.osc.live" + cloudlet: + name: Hamburg + organization: TelekomOP + edgeConnectK8s: + sizer: + sidecarImage: edp.buildth.ing/devfw-cicd/ci-sizer-collector:0.9.7 + +garm: + logging: + logLevel: info diff --git a/template/stacks/observability-client/vm-client-stack/manifests/argocd-scrape.yaml b/template/stacks/observability-client/vm-client-stack/manifests/argocd-scrape.yaml new file mode 100644 index 0000000..2e9248f --- /dev/null +++ b/template/stacks/observability-client/vm-client-stack/manifests/argocd-scrape.yaml @@ -0,0 +1,14 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: argocd + namespace: observability +spec: + namespaceSelector: + matchNames: + - argocd + selector: + matchLabels: + app.kubernetes.io/part-of: argocd + endpoints: + - port: http-metrics diff --git a/template/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml b/template/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml new file mode 100644 index 0000000..aecf517 --- /dev/null +++ b/template/stacks/observability-client/vm-client-stack/manifests/forgejo-scrape.yaml @@ -0,0 +1,15 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: forgejo + namespace: observability +spec: + namespaceSelector: + matchNames: + - gitea + selector: + matchLabels: + app.kubernetes.io/name: forgejo + endpoints: + - port: http + path: /metrics diff --git 
a/template/stacks/observability-client/vm-client-stack/manifests/garm-scrape.yaml b/template/stacks/observability-client/vm-client-stack/manifests/garm-scrape.yaml new file mode 100644 index 0000000..9904e86 --- /dev/null +++ b/template/stacks/observability-client/vm-client-stack/manifests/garm-scrape.yaml @@ -0,0 +1,15 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: garm + namespace: observability +spec: + namespaceSelector: + matchNames: + - garm + selector: + matchLabels: + app.kubernetes.io/name: garm + endpoints: + - port: http + path: /metrics diff --git a/template/stacks/observability-client/vm-client-stack/manifests/simple-user-secret.yaml b/template/stacks/observability-client/vm-client-stack/manifests/simple-user-secret.yaml deleted file mode 100644 index f13b0b6..0000000 --- a/template/stacks/observability-client/vm-client-stack/manifests/simple-user-secret.yaml +++ /dev/null @@ -1,9 +0,0 @@ -apiVersion: v1 -kind: Secret -metadata: - name: simple-user-secret - namespace: observability -type: Opaque -stringData: - username: simple-user - password: simple-password diff --git a/template/stacks/observability-client/vm-client-stack/values.yaml b/template/stacks/observability-client/vm-client-stack/values.yaml index 33afb8d..93521d2 100644 --- a/template/stacks/observability-client/vm-client-stack/values.yaml +++ b/template/stacks/observability-client/vm-client-stack/values.yaml @@ -201,13 +201,13 @@ defaultRules: create: true rules: {} kubernetesSystemControllerManager: - create: true + create: false rules: {} kubeScheduler: - create: true + create: false rules: {} kubernetesSystemScheduler: - create: true + create: false rules: {} kubeStateMetrics: create: true @@ -801,6 +801,20 @@ vmagent: # Do not store original labels in vmagent's memory by default. This reduces the amount of memory used by vmagent # but makes vmagent debugging UI less informative. 
See: https://docs.victoriametrics.com/vmagent/#relabel-debug promscrape.dropOriginalLabels: "true" + # Harden liveness probe: default failureThreshold=10 masked a 72h silent outage + livenessProbe: + httpGet: + path: /health + port: http + failureThreshold: 3 + periodSeconds: 5 + timeoutSeconds: 5 + startupProbe: + httpGet: + path: /health + port: http + failureThreshold: 30 + periodSeconds: 5 # -- (object) VMAgent ingress configuration ingress: enabled: false diff --git a/template/stacks/observability/grafana-operator/manifests/argocd-operational.yaml b/template/stacks/observability/grafana-operator/manifests/argocd-operational.yaml new file mode 100644 index 0000000..9130b42 --- /dev/null +++ b/template/stacks/observability/grafana-operator/manifests/argocd-operational.yaml @@ -0,0 +1,153 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: argocd-operational +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + folder: "EDP / Applications" + json: | + { + "annotations": {"list": []}, + "editable": true, + "graphTooltip": 1, + "panels": [ + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "title": "Application Status", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}}, + "gridPos": {"h": 4, "w": 4, "x": 0, "y": 1}, + "title": "Total Apps", + "type": "stat", + "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}}, + "gridPos": {"h": 4, "w": 4, "x": 4, "y": 1}, + "title": "Healthy", + "type": "stat", + "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", 
health_status=\"Healthy\"})", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}]}}}, + "gridPos": {"h": 4, "w": 4, "x": 8, "y": 1}, + "title": "Degraded", + "type": "stat", + "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", health_status=\"Degraded\"}) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}}, + "gridPos": {"h": 4, "w": 4, "x": 12, "y": 1}, + "title": "Synced", + "type": "stat", + "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", sync_status=\"Synced\"})", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "yellow", "value": null}]}}}, + "gridPos": {"h": 4, "w": 4, "x": 16, "y": 1}, + "title": "OutOfSync", + "type": "stat", + "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", sync_status=\"OutOfSync\"}) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "orange", "value": null}]}}}, + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 1}, + "title": "Progressing", + "type": "stat", + "targets": [{"expr": "count(argocd_app_info{cluster_environment=~\"$cluster_environment\", health_status=\"Progressing\"}) or vector(0)", "legendFormat": ""}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 5}, + "title": "Application Details", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": { + "defaults": {"custom": {"filterable": true}}, + 
"overrides": [ + {"matcher": {"id": "byName", "options": "Health"}, "properties": [{"id": "custom.cellOptions", "value": {"type": "color-text"}}, {"id": "mappings", "value": [{"options": {"Healthy": {"color": "green", "text": "Healthy"}, "Degraded": {"color": "red", "text": "Degraded"}, "Progressing": {"color": "yellow", "text": "Progressing"}, "Missing": {"color": "purple", "text": "Missing"}}, "type": "value"}]}]}, + {"matcher": {"id": "byName", "options": "Sync"}, "properties": [{"id": "custom.cellOptions", "value": {"type": "color-text"}}, {"id": "mappings", "value": [{"options": {"Synced": {"color": "green", "text": "Synced"}, "OutOfSync": {"color": "orange", "text": "OutOfSync"}}, "type": "value"}]}]} + ] + }, + "gridPos": {"h": 12, "w": 24, "x": 0, "y": 6}, + "title": "All Applications", + "type": "table", + "targets": [{"expr": "argocd_app_info{cluster_environment=~\"$cluster_environment\"}", "format": "table", "instant": true, "legendFormat": ""}], + "transformations": [ + {"id": "filterFieldsByName", "options": {"include": {"names": ["cluster_environment", "name", "dest_namespace", "health_status", "sync_status", "repo"]}}}, + {"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "name": "Application", "dest_namespace": "Namespace", "health_status": "Health", "sync_status": "Sync", "repo": "Repository"}}} + ] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 18}, + "title": "Sync Activity", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "ops"}}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 19}, + "title": "Sync Operations (rate)", + "type": "timeseries", + "targets": [{"expr": "sum(rate(argocd_app_sync_total{cluster_environment=~\"$cluster_environment\"}[5m])) by (name, phase)", "legendFormat": "{{name}} ({{phase}})"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "ops"}}, + "gridPos": {"h": 8, "w": 12, 
"x": 12, "y": 19}, + "title": "Reconciliation Rate", + "type": "timeseries", + "targets": [{"expr": "sum(rate(argocd_app_reconcile_count{cluster_environment=~\"$cluster_environment\"}[5m])) by (namespace)", "legendFormat": "{{namespace}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 27}, + "title": "ArgoCD Logs", + "type": "row" + }, + { + "datasource": {"type": "victoriametrics-logs-datasource"}, + "gridPos": {"h": 10, "w": 24, "x": 0, "y": 28}, + "title": "ArgoCD Logs", + "type": "logs", + "targets": [{"expr": "{cluster_environment=~\"$cluster_environment\", kubernetes.namespace=\"argocd\"}", "refId": "A"}], + "options": {"showTime": true, "showLabels": true, "wrapLogMessage": true, "enableLogDetails": true, "sortOrder": "Descending"} + } + ], + "schemaVersion": 39, + "tags": ["edp", "argocd", "gitops"], + "templating": { + "list": [ + { + "current": {"selected": true, "text": "All", "value": "$__all"}, + "datasource": {"type": "prometheus"}, + "definition": "label_values(argocd_app_info, cluster_environment)", + "includeAll": true, + "multi": true, + "name": "cluster_environment", + "label": "Environment", + "query": "label_values(argocd_app_info, cluster_environment)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": {"from": "now-6h", "to": "now"}, + "title": "ArgoCD Operations", + "uid": "edp-argocd-ops" + } diff --git a/template/stacks/observability/grafana-operator/manifests/argocd.yaml b/template/stacks/observability/grafana-operator/manifests/argocd.yaml index b348ff7..2b81b2b 100644 --- a/template/stacks/observability/grafana-operator/manifests/argocd.yaml +++ b/template/stacks/observability/grafana-operator/manifests/argocd.yaml @@ -6,4 +6,5 @@ spec: instanceSelector: matchLabels: dashboards: "grafana" + folder: "EDP / Applications" url: "https://raw.githubusercontent.com/argoproj/argo-cd/refs/heads/master/examples/dashboard.json" diff --git 
a/template/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml b/template/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml new file mode 100644 index 0000000..ddcc883 --- /dev/null +++ b/template/stacks/observability/grafana-operator/manifests/cronjob-monitoring.yaml @@ -0,0 +1,103 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: cronjob-monitoring +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + folder: "EDP / Operations" + json: | + { + "annotations": {"list": []}, + "editable": true, + "graphTooltip": 1, + "panels": [ + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "title": "Backup Job Status", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}}}, + "gridPos": {"h": 5, "w": 12, "x": 0, "y": 1}, + "title": "Time Since Last Schedule", + "type": "stat", + "targets": [{"expr": "time() - kube_cronjob_status_last_schedule_time{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cronjob}} ({{cluster_environment}})"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}}}, + "gridPos": {"h": 5, "w": 12, "x": 12, "y": 1}, + "title": "Failed Jobs (Active)", + "type": "stat", + "targets": [{"expr": "sum by(cluster_environment, job_name) (kube_job_status_failed{cluster_environment=~\"$cluster_environment\"}) > 0", "legendFormat": "{{job_name}} ({{cluster_environment}})"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 6}, + "title": "CronJob Overview", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": 
{"defaults": {"custom": {"filterable": true}}, "overrides": [{"matcher": {"id": "byName", "options": "Suspended"}, "properties": [{"id": "mappings", "value": [{"options": {"0": {"text": "No", "color": "green"}, "1": {"text": "YES", "color": "red"}}, "type": "value"}]}]}]}, + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 7}, + "title": "All CronJobs", + "type": "table", + "targets": [ + {"expr": "kube_cronjob_info{cluster_environment=~\"$cluster_environment\"}", "format": "table", "instant": true, "refId": "A"} + ], + "transformations": [ + {"id": "filterFieldsByName", "options": {"include": {"names": ["cluster_environment", "cronjob", "namespace", "schedule"]}}}, + {"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "cronjob": "CronJob", "namespace": "Namespace", "schedule": "Schedule"}}} + ] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 15}, + "title": "Job History", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16}, + "title": "Job Completions (24h)", + "type": "timeseries", + "targets": [{"expr": "sum(kube_job_status_succeeded{cluster_environment=~\"$cluster_environment\"}) by (job_name, cluster_environment)", "legendFormat": "{{job_name}} ({{cluster_environment}})"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "color": {"mode": "palette-classic"}}}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16}, + "title": "Job Failures (24h)", + "type": "timeseries", + "targets": [{"expr": "sum(kube_job_status_failed{cluster_environment=~\"$cluster_environment\"}) by (job_name, cluster_environment)", "legendFormat": "{{job_name}} ({{cluster_environment}})"}] + } + ], + "schemaVersion": 39, + "tags": ["edp", "backup", "cronjob"], + "templating": { + "list": [ + { + "current": {"selected": true, "text": "All", "value": "$__all"}, + "datasource": {"type": 
"prometheus"}, + "definition": "label_values(kube_cronjob_info, cluster_environment)", + "includeAll": true, + "multi": true, + "name": "cluster_environment", + "label": "Environment", + "query": "label_values(kube_cronjob_info, cluster_environment)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": {"from": "now-24h", "to": "now"}, + "title": "CronJob & Backup Monitoring", + "uid": "edp-cronjobs" + } diff --git a/template/stacks/observability/grafana-operator/manifests/forgejo.yaml b/template/stacks/observability/grafana-operator/manifests/forgejo.yaml new file mode 100644 index 0000000..ec40751 --- /dev/null +++ b/template/stacks/observability/grafana-operator/manifests/forgejo.yaml @@ -0,0 +1,207 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: forgejo +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + folder: "EDP / Applications" + json: | + { + "annotations": {"list": []}, + "editable": true, + "graphTooltip": 1, + "panels": [ + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "title": "Forgejo Health", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"mappings": [{"options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}, "type": "value"}], "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]}}}, + "gridPos": {"h": 4, "w": 4, "x": 0, "y": 1}, + "title": "Status", + "type": "stat", + "targets": [{"expr": "up{job=\"forgejo-server-http\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 4, "y": 1}, + "title": "Version", + "type": "stat", + "targets": [{"expr": "gitea_build_info{cluster_environment=~\"$cluster_environment\"}", "legendFormat": 
"{{version}}"}], + "options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "name"} + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 8, "y": 1}, + "title": "Repositories", + "type": "stat", + "targets": [{"expr": "gitea_repositories{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 12, "y": 1}, + "title": "Users", + "type": "stat", + "targets": [{"expr": "gitea_users{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 16, "y": 1}, + "title": "Organizations", + "type": "stat", + "targets": [{"expr": "gitea_organizations{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 1}, + "title": "Teams", + "type": "stat", + "targets": [{"expr": "gitea_teams{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 5}, + "title": "Activity", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 6, "x": 0, "y": 6}, + "title": "Open Issues", + "type": "stat", + "targets": [{"expr": "gitea_issues_open{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 6, "x": 6, "y": 6}, + "title": "Closed Issues", + 
"type": "stat", + "targets": [{"expr": "gitea_issues_closed{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 6, "x": 12, "y": 6}, + "title": "Webhooks", + "type": "stat", + "targets": [{"expr": "gitea_webhooks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 6, "x": 18, "y": 6}, + "title": "Hook Tasks", + "type": "stat", + "targets": [{"expr": "gitea_hooktasks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 10}, + "title": "Content & Auth", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 0, "y": 11}, + "title": "Stars", + "type": "stat", + "targets": [{"expr": "gitea_stars{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 4, "y": 11}, + "title": "Watches", + "type": "stat", + "targets": [{"expr": "gitea_watches{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 8, "y": 11}, + "title": "Releases", + "type": "stat", + "targets": [{"expr": "gitea_releases{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 
4, "x": 12, "y": 11}, + "title": "Mirrors", + "type": "stat", + "targets": [{"expr": "gitea_mirrors{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 16, "y": 11}, + "title": "Public Keys", + "type": "stat", + "targets": [{"expr": "gitea_publickeys{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 11}, + "title": "OAuth Apps", + "type": "stat", + "targets": [{"expr": "gitea_oauths{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 15}, + "title": "Forgejo Logs", + "type": "row" + }, + { + "datasource": {"type": "victoriametrics-logs-datasource"}, + "gridPos": {"h": 10, "w": 12, "x": 0, "y": 16}, + "title": "Forgejo Server Logs", + "type": "logs", + "targets": [{"expr": "{cluster_environment=~\"$cluster_environment\", kubernetes.namespace=\"gitea\"}", "refId": "A"}], + "options": {"showTime": true, "showLabels": true, "wrapLogMessage": true, "enableLogDetails": true, "sortOrder": "Descending"} + }, + { + "datasource": {"type": "victoriametrics-logs-datasource"}, + "gridPos": {"h": 10, "w": 12, "x": 12, "y": 16}, + "title": "Forgejo Errors", + "type": "logs", + "targets": [{"expr": "{cluster_environment=~\"$cluster_environment\", kubernetes.namespace=\"gitea\"} error OR Error OR ERROR OR panic", "refId": "A"}], + "options": {"showTime": true, "showLabels": true, "wrapLogMessage": true, "enableLogDetails": true, "sortOrder": "Descending"} + } + ], + "schemaVersion": 39, + "tags": ["edp", "forgejo", "gitea"], + "templating": { + "list": [ + { + "current": {"selected": true, "text": "All", "value": 
"$__all"}, + "datasource": {"type": "prometheus"}, + "definition": "label_values(gitea_repositories, cluster_environment)", + "includeAll": true, + "multi": true, + "name": "cluster_environment", + "label": "Environment", + "query": "label_values(gitea_repositories, cluster_environment)", + "refresh": 2, + "type": "query" + } + ] + }, + "time": {"from": "now-6h", "to": "now"}, + "title": "Forgejo", + "uid": "edp-forgejo" + } diff --git a/template/stacks/observability/grafana-operator/manifests/garm.yaml b/template/stacks/observability/grafana-operator/manifests/garm.yaml new file mode 100644 index 0000000..2a23e20 --- /dev/null +++ b/template/stacks/observability/grafana-operator/manifests/garm.yaml @@ -0,0 +1,117 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: garm +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + folder: "EDP / Applications" + json: | + { + "annotations": {"list": []}, + "editable": true, + "graphTooltip": 1, + "panels": [ + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "title": "GARM Runner Status", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}}, + "gridPos": {"h": 5, "w": 6, "x": 0, "y": 1}, + "title": "Total Runners", + "type": "stat", + "targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\"}) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}}, + "gridPos": {"h": 5, "w": 6, "x": 6, "y": 1}, + "title": "Idle Runners", + "type": "stat", + "targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\", status=\"idle\"}) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": 
{"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "yellow", "value": null}]}}}, + "gridPos": {"h": 5, "w": 6, "x": 12, "y": 1}, + "title": "Creating", + "type": "stat", + "targets": [{"expr": "count(garm_runner_status{cluster_environment=~\"$cluster_environment\", status=\"creating\"}) or vector(0)", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}]}}}, + "gridPos": {"h": 5, "w": 6, "x": 18, "y": 1}, + "title": "Errors", + "type": "stat", + "targets": [{"expr": "sum(rate(garm_runner_errors_total{cluster_environment=~\"$cluster_environment\"}[5m])) or vector(0)", "legendFormat": ""}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 6}, + "title": "GitHub API Rate Limits", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "min": 0}}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 7}, + "title": "Rate Limit Remaining", + "type": "timeseries", + "targets": [{"expr": "garm_github_rate_limit_remaining{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "ops"}}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 7}, + "title": "Runner Operations Rate", + "type": "timeseries", + "targets": [{"expr": "sum(rate(garm_runner_operations_total{cluster_environment=~\"$cluster_environment\"}[5m])) by (cluster_environment)", "legendFormat": "{{cluster_environment}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 15}, + "title": "Runner Details", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"custom": {"filterable": true}}}, + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 16}, + 
"title": "Runner Pool Status", + "type": "table", + "targets": [{"expr": "garm_runner_status{cluster_environment=~\"$cluster_environment\"}", "format": "table", "instant": true}], + "transformations": [ + {"id": "filterFieldsByName", "options": {"include": {"names": ["cluster_environment", "name", "status", "pool_owner", "pool_type", "provider"]}}}, + {"id": "organize", "options": {"renameByName": {"cluster_environment": "Environment", "name": "Runner", "status": "Status", "pool_owner": "Pool Owner", "pool_type": "Type", "provider": "Provider"}}} + ] + } + ], + "schemaVersion": 39, + "tags": ["edp", "garm", "ci-cd", "runners"], + "templating": { + "list": [ + { + "current": {"selected": true, "text": "All", "value": "$__all"}, + "datasource": {"type": "prometheus"}, + "definition": "label_values(garm_runner_status, cluster_environment)", + "includeAll": true, + "multi": true, + "name": "cluster_environment", + "label": "Environment", + "query": "label_values(garm_runner_status, cluster_environment)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": {"from": "now-6h", "to": "now"}, + "title": "GARM Runners", + "uid": "edp-garm" + } diff --git a/template/stacks/observability/grafana-operator/manifests/grafana.yaml b/template/stacks/observability/grafana-operator/manifests/grafana.yaml index 1c47357..41f32db 100644 --- a/template/stacks/observability/grafana-operator/manifests/grafana.yaml +++ b/template/stacks/observability/grafana-operator/manifests/grafana.yaml @@ -8,7 +8,8 @@ spec: persistentVolumeClaim: metadata: annotations: - everest.io/disk-volume-type: SATA + everest.io/disk-volume-type: GPSSD + everest.io/crypt-key-id: {{{ .Env.PVC_KMS_KEY_ID }}} spec: storageClassName: csi-disk accessModes: @@ -16,6 +17,40 @@ spec: resources: requests: storage: 10Gi + deployment: + spec: + template: + spec: + containers: + - name: grafana + env: + - name: OAUTH_CLIENT_SECRET + valueFrom: + secretKeyRef: + key: clientSecret + name: dex-grafana-client + 
config: + log.console: + level: debug + server: + root_url: "https://{{{ .Env.DOMAIN_GRAFANA }}}" + auth: + disable_login: "true" + disable_login_form: "true" + auth.generic_oauth: + enabled: "true" + name: Forgejo + allow_sign_up: "true" + use_refresh_token: "true" + client_id: grafana + client_secret: $__env{OAUTH_CLIENT_SECRET} + scopes: openid email profile offline_access groups + auth_url: https://{{{ .Env.DOMAIN_DEX }}}/auth + token_url: https://{{{ .Env.DOMAIN_DEX }}}/token + api_url: https://{{{ .Env.DOMAIN_DEX }}}/userinfo + redirect_uri: https://{{{ .Env.DOMAIN_GRAFANA }}}/login/generic_oauth + role_attribute_path: "contains(groups[*], 'DevFW') && 'GrafanaAdmin' || 'None'" + allow_assign_grafana_admin: "true" ingress: metadata: annotations: @@ -24,7 +59,7 @@ spec: spec: ingressClassName: nginx rules: - - host: grafana.{{{ .Env.DOMAIN }}} + - host: {{{ .Env.DOMAIN_GRAFANA }}} http: paths: - backend: @@ -36,5 +71,5 @@ spec: pathType: Prefix tls: - hosts: - - grafana.{{{ .Env.DOMAIN }}} + - {{{ .Env.DOMAIN_GRAFANA }}} secretName: grafana-net-tls diff --git a/template/stacks/observability/grafana-operator/manifests/ingress-nginx.yaml b/template/stacks/observability/grafana-operator/manifests/ingress-nginx.yaml index c13d6a2..077edd8 100644 --- a/template/stacks/observability/grafana-operator/manifests/ingress-nginx.yaml +++ b/template/stacks/observability/grafana-operator/manifests/ingress-nginx.yaml @@ -6,4 +6,5 @@ spec: instanceSelector: matchLabels: dashboards: "grafana" + folder: "EDP / Operations" url: "https://raw.githubusercontent.com/adinhodovic/ingress-nginx-mixin/refs/heads/main/dashboards_out/ingress-nginx-overview.json" diff --git a/template/stacks/observability/grafana-operator/manifests/platform-overview.yaml b/template/stacks/observability/grafana-operator/manifests/platform-overview.yaml new file mode 100644 index 0000000..ffce4e2 --- /dev/null +++ b/template/stacks/observability/grafana-operator/manifests/platform-overview.yaml @@ -0,0 +1,245 
@@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: platform-overview +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + folder: "EDP / Overview" + json: | + { + "annotations": {"list": []}, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "title": "Platform Health", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": { + "defaults": { + "mappings": [{"options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}, "type": "value"}], + "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 0, "y": 1}, + "title": "Forgejo", + "type": "stat", + "targets": [{"expr": "sum(up{job=\"forgejo-server-http\", cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": { + "defaults": { + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 3}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 4, "y": 1}, + "title": "Ingress 5xx (5m)", + "type": "stat", + "targets": [{"expr": "sum(rate(nginx_ingress_controller_requests{status=~\"5..\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 8, "y": 1}, + "title": "Failed Jobs (24h)", + "type": "stat", + "targets": [{"expr": "sum(kube_job_status_failed{namespace=\"gitea\", cluster_environment=~\"$cluster_environment\"}) or vector(0)", "legendFormat": ""}] + }, + { + 
"datasource": {"type": "prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.7}, {"color": "red", "value": 0.85}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 12, "y": 1}, + "title": "Cluster CPU Usage", + "type": "stat", + "targets": [{"expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m]))", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.7}, {"color": "red", "value": 0.85}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 16, "y": 1}, + "title": "Cluster Memory Usage", + "type": "stat", + "targets": [{"expr": "1 - sum(node_memory_MemAvailable_bytes{cluster_environment=~\"$cluster_environment\"}) / sum(node_memory_MemTotal_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.6}, {"color": "red", "value": 0.8}]} + } + }, + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 1}, + "title": "Max PVC Usage", + "type": "stat", + "targets": [{"expr": "max(1 - kubelet_volume_stats_available_bytes{cluster_environment=~\"$cluster_environment\"} / kubelet_volume_stats_capacity_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": ""}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 5}, + "title": "Forgejo", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 0, "y": 6}, + "title": "Repositories", + "type": "stat", + "targets": 
[{"expr": "gitea_repositories{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 4, "y": 6}, + "title": "Users", + "type": "stat", + "targets": [{"expr": "gitea_users{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 8, "y": 6}, + "title": "Organizations", + "type": "stat", + "targets": [{"expr": "gitea_organizations{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 12, "y": 6}, + "title": "Open Issues", + "type": "stat", + "targets": [{"expr": "gitea_issues_open{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 16, "y": 6}, + "title": "Webhooks", + "type": "stat", + "targets": [{"expr": "gitea_webhooks{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short"}}, + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 6}, + "title": "Mirrors", + "type": "stat", + "targets": [{"expr": "gitea_mirrors{cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cluster_environment}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 10}, + "title": "Resources", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}}, + "gridPos": {"h": 8, 
"w": 12, "x": 0, "y": 11}, + "title": "Node CPU Usage", + "type": "timeseries", + "targets": [{"expr": "1 - rate(node_cpu_seconds_total{mode=\"idle\", cluster_environment=~\"$cluster_environment\"}[5m])", "legendFormat": "{{instance}}"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 11}, + "title": "PVC Usage by Claim", + "type": "timeseries", + "targets": [{"expr": "1 - (kubelet_volume_stats_available_bytes{cluster_environment=~\"$cluster_environment\"} / kubelet_volume_stats_capacity_bytes{cluster_environment=~\"$cluster_environment\"})", "legendFormat": "{{namespace}}/{{persistentvolumeclaim}}"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 19}, + "title": "Backups", + "type": "row" + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}}}, + "gridPos": {"h": 4, "w": 8, "x": 0, "y": 20}, + "title": "Time Since Last Backup Schedule", + "type": "stat", + "targets": [{"expr": "time() - kube_cronjob_status_last_schedule_time{cronjob=~\"forgejo-s3-backup|secrets-backup\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{cronjob}} ({{cluster_environment}})"}] + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "s"}}, + "gridPos": {"h": 4, "w": 8, "x": 8, "y": 20}, + "title": "Backup Job Duration (Last 7d)", + "type": "timeseries", + "targets": [{"expr": "kube_job_status_completion_time{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"} - kube_job_status_start_time{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"}", "legendFormat": "{{job_name}}"}], + "options": {"legend": 
{"displayMode": "table"}} + }, + { + "datasource": {"type": "prometheus"}, + "fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 1}]}}}, + "gridPos": {"h": 4, "w": 8, "x": 16, "y": 20}, + "title": "Failed Backup Jobs (Active)", + "type": "stat", + "targets": [{"expr": "sum by(cluster_environment, job_name) (kube_job_status_failed{job_name=~\"forgejo-s3-backup.*|secrets-backup.*\", cluster_environment=~\"$cluster_environment\"})", "legendFormat": "{{job_name}} ({{cluster_environment}})"}] + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 24}, + "title": "Logs", + "type": "row" + }, + { + "datasource": {"type": "victoriametrics-logs-datasource"}, + "gridPos": {"h": 10, "w": 24, "x": 0, "y": 25}, + "title": "Recent Errors (all namespaces)", + "type": "logs", + "targets": [{"expr": "{cluster_environment=~\"$cluster_environment\"} error OR Error OR ERROR OR panic OR PANIC", "refId": "A"}], + "options": {"showTime": true, "showLabels": true, "showCommonLabels": false, "wrapLogMessage": true, "prettifyLogMessage": false, "enableLogDetails": true, "sortOrder": "Descending", "dedupStrategy": "none"} + } + ], + "schemaVersion": 39, + "tags": ["edp", "platform", "overview"], + "templating": { + "list": [ + { + "current": {"selected": true, "text": "All", "value": "$__all"}, + "datasource": {"type": "prometheus"}, + "definition": "label_values(up, cluster_environment)", + "includeAll": true, + "multi": true, + "name": "cluster_environment", + "label": "Environment", + "query": "label_values(up, cluster_environment)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": {"from": "now-6h", "to": "now"}, + "title": "EDP Platform Overview", + "uid": "edp-platform-overview" + } diff --git a/template/stacks/observability/grafana-operator/manifests/victoria-logs.yaml b/template/stacks/observability/grafana-operator/manifests/victoria-logs.yaml 
index 4018fbd..c44c474 100644 --- a/template/stacks/observability/grafana-operator/manifests/victoria-logs.yaml +++ b/template/stacks/observability/grafana-operator/manifests/victoria-logs.yaml @@ -6,4 +6,7 @@ spec: instanceSelector: matchLabels: dashboards: "grafana" - url: "https://raw.githubusercontent.com/VictoriaMetrics/VictoriaMetrics/refs/heads/master/dashboards/vm/victorialogs.json" + folder: "EDP / Operations" + grafanaCom: + id: 22698 + revision: 1 diff --git a/template/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml b/template/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml index f884bd9..2cce6a3 100644 --- a/template/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml +++ b/template/stacks/observability/victoria-k8s-stack/manifests/alerts.yaml @@ -1,18 +1,119 @@ apiVersion: operator.victoriametrics.com/v1beta1 kind: VMRule metadata: - name: forgejo-alerts + name: edp-platform-alerts namespace: observability spec: groups: - - name: forgejo + - name: platform-health rules: - - alert: forgejo down - expr: sum by(cluster_environment) (up{pod=~"forgejo-server-.*"}) < 1 - for: 30s + - alert: ForgejoDown + expr: sum by(cluster_environment) (up{job="forgejo-server-http"}) < 1 + for: 1m + labels: + severity: critical + annotations: + summary: "Forgejo is down on {{ $labels.cluster_environment }}" + description: "Forgejo server has been unreachable for more than 1 minute in cluster {{ $labels.cluster_environment }}." 
+ + - alert: IngressHighErrorRate + expr: | + sum by(cluster_environment) (rate(nginx_ingress_controller_requests{status=~"5.."}[5m])) + / sum by(cluster_environment) (rate(nginx_ingress_controller_requests[5m])) > 0.05 + for: 5m labels: severity: major - job: "{{ $labels.job }}" annotations: - value: "{{ $value }}" - description: 'forgejo is down in cluster environment {{ $labels.cluster_environment }}' + summary: "High ingress 5xx rate on {{ $labels.cluster_environment }}" + description: "More than 5% of ingress requests are returning 5xx errors for over 5 minutes." + value: "{{ $value | humanizePercentage }}" + + - alert: NodeNotReady + expr: kube_node_status_condition{condition="Ready", status="true"} == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Node {{ $labels.node }} not ready on {{ $labels.cluster_environment }}" + description: "Node {{ $labels.node }} has been in NotReady state for more than 5 minutes." + + - alert: PodCrashLooping + expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 15 > 3 + for: 5m + labels: + severity: major + annotations: + summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} crash-looping on {{ $labels.cluster_environment }}" + description: "Pod has restarted more than 3 times in the last 15 minutes." + + - name: storage + rules: + - alert: PVCUsageHigh + expr: | + 1 - (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.80 + for: 5m + labels: + severity: major + annotations: + summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} usage >80%" + description: "PVC usage is at {{ $value | humanizePercentage }} on {{ $labels.cluster_environment }}." 
+ value: "{{ $value | humanizePercentage }}" + + - alert: PVCUsageCritical + expr: | + 1 - (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.90 + for: 5m + labels: + severity: critical + annotations: + summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} usage >90%" + description: "PVC is almost full at {{ $value | humanizePercentage }} on {{ $labels.cluster_environment }}. Immediate action required." + value: "{{ $value | humanizePercentage }}" + + - name: resources + rules: + - alert: NodeCPUHigh + expr: | + 1 - avg by(instance, cluster_environment) (rate(node_cpu_seconds_total{mode="idle"}[5m])) > 0.85 + for: 15m + labels: + severity: major + annotations: + summary: "Node {{ $labels.instance }} CPU >85% on {{ $labels.cluster_environment }}" + description: "Node CPU utilization has been above 85% for 15 minutes." + value: "{{ $value | humanizePercentage }}" + + - alert: NodeMemoryHigh + expr: | + 1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.90 + for: 10m + labels: + severity: major + annotations: + summary: "Node memory >90% on {{ $labels.cluster_environment }}" + description: "Node memory utilization above 90% for 10 minutes." + value: "{{ $value | humanizePercentage }}" + + - name: cluster-health + rules: + - alert: ClusterMetricsSilent # count() yields no sample for a group without series, so "< 1" can never match; instead flag every environment seen within the last day that reports nothing now + expr: | + (count by(cluster_environment) (last_over_time(up{job="kubelet"}[1d])) unless count by(cluster_environment) (up{job="kubelet"})) + or + absent(up{job="kubelet", cluster_environment="dev"}) + for: 10m + labels: + severity: critical + annotations: + summary: "Cluster {{ $labels.cluster_environment }} stopped sending metrics" + description: "No kubelet metrics received from cluster {{ $labels.cluster_environment }} for over 10 minutes. Either vmagent is dead or the cluster is unreachable." 
+ + - alert: ClusterAPIServerDown + expr: | + up{job="apiserver", cluster_environment=~".+"} == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "API server down on {{ $labels.cluster_environment }}" + description: "Kubernetes API server scrape is failing on cluster {{ $labels.cluster_environment }}." diff --git a/template/stacks/observability/victoria-k8s-stack/manifests/argocd-scrape.yaml b/template/stacks/observability/victoria-k8s-stack/manifests/argocd-scrape.yaml new file mode 100644 index 0000000..0517321 --- /dev/null +++ b/template/stacks/observability/victoria-k8s-stack/manifests/argocd-scrape.yaml @@ -0,0 +1,13 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: argocd +spec: + namespaceSelector: + matchNames: + - argocd + selector: + matchLabels: + app.kubernetes.io/part-of: argocd + endpoints: + - port: http-metrics diff --git a/template/stacks/observability/victoria-k8s-stack/manifests/backup-alerts.yaml b/template/stacks/observability/victoria-k8s-stack/manifests/backup-alerts.yaml new file mode 100644 index 0000000..259a2bf --- /dev/null +++ b/template/stacks/observability/victoria-k8s-stack/manifests/backup-alerts.yaml @@ -0,0 +1,78 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + name: backup-alerts + namespace: observability +spec: + groups: + - name: backup-schedule-staleness + rules: + - alert: BackupCronJobNotScheduled + expr: | + time() - kube_cronjob_status_last_schedule_time{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"} + > 26 * 3600 + for: 5m + labels: + severity: critical + cronjob: "{{ $labels.cronjob }}" + annotations: + value: "{{ $value | humanizeDuration }}" + description: >- + CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been + scheduled for over 26 hours in cluster {{ $labels.cluster_environment }}. + Last schedule was {{ $value | humanizeDuration }} ago. 
+ summary: "Backup CronJob {{ $labels.cronjob }} is stale" + + - alert: BackupCronJobNeverScheduled # kube-state-metrics exposes no last_schedule_time series until the first run, so "== 0" never matches; alert on the series being absent for a known CronJob instead + expr: | + kube_cronjob_info{cronjob=~"forgejo-s3-backup|secrets-backup", namespace="gitea"} + unless on(cluster_environment, namespace, cronjob) kube_cronjob_status_last_schedule_time{namespace="gitea"} + for: 30m + labels: + severity: critical + cronjob: "{{ $labels.cronjob }}" + annotations: + description: >- + CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has never been + scheduled in cluster {{ $labels.cluster_environment }}. + summary: "Backup CronJob {{ $labels.cronjob }} never ran" + + - name: backup-job-failures + rules: + - alert: BackupJobFailed + expr: | + max by(cluster_environment, namespace, job_name) ( + kube_job_status_failed{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"} + ) > 0 + for: 30s + labels: + severity: critical + job_name: "{{ $labels.job_name }}" + annotations: + value: "{{ $value }}" + description: >- + Backup job {{ $labels.namespace }}/{{ $labels.job_name }} has + {{ $value }} failed pod(s) in cluster {{ $labels.cluster_environment }}. + summary: "Backup job {{ $labels.job_name }} failed" + + - name: backup-job-duration + rules: + - alert: BackupJobTooSlow + expr: | + ( + time() - kube_job_status_start_time{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"} + ) > 300 + and + kube_job_status_active{job_name=~"forgejo-s3-backup-.*|secrets-backup-.*", namespace="gitea"} > 0 + for: 1m + labels: + severity: major + job_name: "{{ $labels.job_name }}" + annotations: + value: "{{ $value | humanizeDuration }}" + description: >- + Backup job {{ $labels.namespace }}/{{ $labels.job_name }} has been + running for {{ $value | humanizeDuration }} (threshold: 5m) + in cluster {{ $labels.cluster_environment }}. This may indicate a + hung process or connectivity issue. 
+ summary: "Backup job {{ $labels.job_name }} running too long" diff --git a/template/stacks/observability/victoria-k8s-stack/manifests/ci-sustainability-rules.yaml b/template/stacks/observability/victoria-k8s-stack/manifests/ci-sustainability-rules.yaml new file mode 100644 index 0000000..2290b99 --- /dev/null +++ b/template/stacks/observability/victoria-k8s-stack/manifests/ci-sustainability-rules.yaml @@ -0,0 +1,61 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + name: ci-sustainability +spec: + groups: # every aggregation keeps cluster_environment (the label the alert rules in this stack group by) so environments are not merged; cluster is retained for existing consumers + - name: ci.sustainability.daily + interval: 5m + rules: + - record: ci:cpu_seconds:increase1d + expr: | + sum by(namespace, cluster, cluster_environment) ( + increase(container_cpu_usage_seconds_total{ + namespace=~"gitea|garm", + pod=~"forgejo-runner.*|garm-.*", + container!="" + }[1d]) + ) + - record: ci:memory_bytes_seconds:avg1d + expr: | + avg_over_time( + sum by(namespace, cluster, cluster_environment) ( + container_memory_working_set_bytes{ + namespace=~"gitea|garm", + pod=~"forgejo-runner.*|garm-.*", + container!="" + } + )[1d:5m] + ) + - record: ci:pod_count:avg1d + expr: | + avg_over_time( + count by(namespace, cluster, cluster_environment) ( + kube_pod_info{ + namespace=~"gitea|garm", + pod=~"forgejo-runner.*|garm-.*" + } + )[1d:5m] + ) + - record: ci:pod_creations:increase1d + expr: | + sum by(namespace, cluster, cluster_environment) ( + changes(kube_pod_start_time{ + namespace=~"gitea|garm", + pod=~"forgejo-runner.*|garm-.*" + }[1d]) + ) + - name: ci.sustainability.cluster + interval: 5m + rules: + - record: cluster:cpu_seconds:rate5m + expr: | + sum by(cluster, cluster_environment) ( + rate(node_cpu_seconds_total{mode!="idle"}[5m]) + ) + - record: cluster:memory_used_bytes:sum + expr: | + sum by(cluster, cluster_environment) ( + node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes + ) + diff --git a/template/stacks/observability/victoria-k8s-stack/manifests/coredns-scrape.yaml b/template/stacks/observability/victoria-k8s-stack/manifests/coredns-scrape.yaml new file mode 100644 index 0000000..77cef00 --- /dev/null +++ 
b/template/stacks/observability/victoria-k8s-stack/manifests/coredns-scrape.yaml @@ -0,0 +1,30 @@ +apiVersion: v1 +kind: Service +metadata: + name: coredns-metrics + namespace: kube-system + labels: + k8s-app: coredns-metrics +spec: + clusterIP: None + selector: + k8s-app: coredns + ports: + - name: metrics + port: 9153 + targetPort: 9153 + protocol: TCP +--- +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: coredns +spec: + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + k8s-app: coredns-metrics + endpoints: + - port: metrics diff --git a/template/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml b/template/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml new file mode 100644 index 0000000..f73afa8 --- /dev/null +++ b/template/stacks/observability/victoria-k8s-stack/manifests/garm-scrape.yaml @@ -0,0 +1,13 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: garm +spec: + namespaceSelector: + matchNames: + - garm + selector: + matchLabels: + app.kubernetes.io/name: garm + endpoints: + - port: http diff --git a/template/stacks/observability/victoria-k8s-stack/manifests/vlogs.yaml b/template/stacks/observability/victoria-k8s-stack/manifests/vlogs.yaml index c74f8d5..2fec1ef 100644 --- a/template/stacks/observability/victoria-k8s-stack/manifests/vlogs.yaml +++ b/template/stacks/observability/victoria-k8s-stack/manifests/vlogs.yaml @@ -9,7 +9,7 @@ spec: storageMetadata: annotations: everest.io/crypt-key-id: {{{ .Env.PVC_KMS_KEY_ID }}} - everest.io/disk-volume-type: SATA + everest.io/disk-volume-type: GPSSD storage: storageClassName: csi-disk accessModes: diff --git a/template/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml b/template/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml index 2ea5d76..e1de2c6 100644 --- a/template/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml +++ 
b/template/stacks/observability/victoria-k8s-stack/manifests/vmauth.yaml @@ -5,11 +5,19 @@ metadata: namespace: observability spec: username: simple-user - password: simple-password + passwordRef: # NOTE(review): credential is read from a Secret instead of being committed in plaintext; the Secret must be provisioned separately and the previously committed value rotated + name: vmuser-simple-user-credentials + key: password targetRefs: - static: url: http://vmsingle-o12y:8429 paths: ["/api/v1/write"] + - static: + url: http://vmsingle-o12y:8429 + paths: ["/api/v1/.*"] - static: url: http://vlogs-victorialogs:9428 paths: ["/insert/elasticsearch/.*"] + - static: + url: http://vlogs-victorialogs:9428 + paths: ["/select/.*"] \ No newline at end of file diff --git a/template/stacks/observability/victoria-k8s-stack/values.yaml b/template/stacks/observability/victoria-k8s-stack/values.yaml index 78c705d..ba1fb74 100644 --- a/template/stacks/observability/victoria-k8s-stack/values.yaml +++ b/template/stacks/observability/victoria-k8s-stack/values.yaml @@ -1,6 +1,6 @@ global: # -- Cluster label to use for dashboards and rules - clusterLabel: cluster + clusterLabel: cluster_environment # -- Global license configuration license: key: "" @@ -201,13 +201,13 @@ defaultRules: create: true rules: {} kubernetesSystemControllerManager: - create: true + create: false rules: {} kubeScheduler: - create: true + create: false rules: {} kubernetesSystemScheduler: - create: true + create: false rules: {} kubeStateMetrics: create: true @@ -289,7 +289,7 @@ vmsingle: storageMetadata: annotations: everest.io/crypt-key-id: {{{ .Env.PVC_KMS_KEY_ID }}} - everest.io/disk-volume-type: SATA + everest.io/disk-volume-type: GPSSD storage: storageClassName: csi-disk accessModes: @@ -351,7 +351,7 @@ vmcluster: spec: resources: requests: - storage: 10Gi + storage: 10Gi resources: {} # limits: @@ -538,108 +538,30 @@ alertmanager: # If you're migrating existing config, please make sure that `.Values.alertmanager.config`: # - with `useManagedConfig: false` has structure described [here](https://prometheus.io/docs/alerting/latest/configuration/). 
# - with `useManagedConfig: true` has structure described [here](https://docs.victoriametrics.com/operator/api/#vmalertmanagerconfig). - useManagedConfig: false + useManagedConfig: true # -- (object) Alertmanager configuration config: route: receiver: "blackhole" - # group_by: ["alertgroup", "job"] - # group_wait: 30s - # group_interval: 5m - # repeat_interval: 12h - # routes: - # - # # Duplicate code_owner routes to teams - # # These will send alerts to team channels but continue - # # processing through the rest of the tree to handled by on-call - # - matchers: - # - code_owner_channel!="" - # - severity=~"info|warning|critical" - # group_by: ["code_owner_channel", "alertgroup", "job"] - # receiver: slack-code-owners - # - # # Standard on-call routes - # - matchers: - # - severity=~"info|warning|critical" - # receiver: slack-monitoring - # continue: true - # - # inhibit_rules: - # - target_matchers: - # - severity=~"warning|info" - # source_matchers: - # - severity=critical - # equal: - # - cluster - # - namespace - # - alertname - # - target_matchers: - # - severity=info - # source_matchers: - # - severity=warning - # equal: - # - cluster - # - namespace - # - alertname - # - target_matchers: - # - severity=info - # source_matchers: - # - alertname=InfoInhibitor - # equal: - # - cluster - # - namespace - + routes: + - matchers: + - severity=~"critical|major" + receiver: outlook receivers: - name: blackhole - # - name: "slack-monitoring" - # slack_configs: - # - channel: "#channel" - # send_resolved: true - # title: '{{ template "slack.monzo.title" . }}' - # icon_emoji: '{{ template "slack.monzo.icon_emoji" . }}' - # color: '{{ template "slack.monzo.color" . }}' - # text: '{{ template "slack.monzo.text" . 
}}' - # actions: - # - type: button - # text: "Runbook :green_book:" - # url: "{{ (index .Alerts 0).Annotations.runbook_url }}" - # - type: button - # text: "Query :mag:" - # url: "{{ (index .Alerts 0).GeneratorURL }}" - # - type: button - # text: "Dashboard :grafana:" - # url: "{{ (index .Alerts 0).Annotations.dashboard }}" - # - type: button - # text: "Silence :no_bell:" - # url: '{{ template "__alert_silence_link" . }}' - # - type: button - # text: '{{ template "slack.monzo.link_button_text" . }}' - # url: "{{ .CommonAnnotations.link_url }}" - # - name: slack-code-owners - # slack_configs: - # - channel: "#{{ .CommonLabels.code_owner_channel }}" - # send_resolved: true - # title: '{{ template "slack.monzo.title" . }}' - # icon_emoji: '{{ template "slack.monzo.icon_emoji" . }}' - # color: '{{ template "slack.monzo.color" . }}' - # text: '{{ template "slack.monzo.text" . }}' - # actions: - # - type: button - # text: "Runbook :green_book:" - # url: "{{ (index .Alerts 0).Annotations.runbook }}" - # - type: button - # text: "Query :mag:" - # url: "{{ (index .Alerts 0).GeneratorURL }}" - # - type: button - # text: "Dashboard :grafana:" - # url: "{{ (index .Alerts 0).Annotations.dashboard }}" - # - type: button - # text: "Silence :no_bell:" - # url: '{{ template "__alert_silence_link" . }}' - # - type: button - # text: '{{ template "slack.monzo.link_button_text" . 
}}' - # url: "{{ .CommonAnnotations.link_url }}" - # + - name: outlook + email_configs: + - smarthost: 'mail.mms-support.de:465' + auth_username: 'ipcei-cis-devfw@mms-support.de' + auth_password: + name: email-user-credentials + key: connection-string + from: '"IPCEI CIS DevFW" ' + to: 'f9f9953a.mg.telekom.de@de.teams.ms' + headers: + subject: 'Grafana Mail Alerts' + require_tls: false + # -- Better alert templates for [slack source](https://gist.github.com/milesbxf/e2744fc90e9c41b47aa47925f8ff6512) monzoTemplate: enabled: true @@ -870,7 +792,7 @@ defaultDatasources: implementation: prometheus # -- Configure additional grafana datasources (passed through tpl). # Check [here](http://docs.grafana.org/administration/provisioning/#datasources) for details - extra: + extra: - name: VictoriaLogs access: proxy type: victoriametrics-logs-datasource @@ -921,7 +843,7 @@ grafana: # Uncomment the block below, if you want to enable VictoriaMetrics Datasource in Grafana: # Note that Grafana will need internet access to install the datasource plugin. 
- + plugins: - victoriametrics-metrics-datasource - victoriametrics-logs-datasource @@ -1098,7 +1020,7 @@ kubeApiServer: # Component scraping the kube controller manager kubeControllerManager: # -- Enable kube controller manager metrics scraping - enabled: true + enabled: false # -- If your kube controller manager is not deployed as a pod, specify IPs it can be found on endpoints: [] @@ -1231,7 +1153,7 @@ kubeEtcd: # Component scraping kube scheduler kubeScheduler: # -- Enable KubeScheduler metrics scraping - enabled: true + enabled: false # -- If your kube scheduler is not deployed as a pod, specify IPs it can be found on endpoints: [] diff --git a/template/stacks/otc/ingress-nginx.yaml b/template/stacks/otc/ingress-nginx.yaml index d240304..cb58d5d 100644 --- a/template/stacks/otc/ingress-nginx.yaml +++ b/template/stacks/otc/ingress-nginx.yaml @@ -18,9 +18,9 @@ spec: name: in-cluster namespace: ingress-nginx sources: - - repoURL: https://{{{ .Env.CLIENT_REPO_DOMAIN }}}/DevFW-CICD/ingress-nginx-helm.git + - repoURL: https://github.com/kubernetes/ingress-nginx.git path: charts/ingress-nginx - targetRevision: helm-chart-4.12.4-depends + targetRevision: helm-chart-4.12.1 helm: valueFiles: - $values/{{{ .Env.CLIENT_REPO_ID }}}/{{{ .Env.DOMAIN }}}/stacks/otc/ingress-nginx/values.yaml diff --git a/template/stacks/otc/storageclass/storageclass.yaml b/template/stacks/otc/storageclass/storageclass.yaml index 038bf24..0f59c35 100644 --- a/template/stacks/otc/storageclass/storageclass.yaml +++ b/template/stacks/otc/storageclass/storageclass.yaml @@ -13,6 +13,6 @@ parameters: kubernetes.io/volumetype: SATA kubernetes.io/zone: eu-de-02 provisioner: flexvolume-huawei.com/fuxivol -reclaimPolicy: Delete +reclaimPolicy: {{{ getenv "STORAGE_RECLAIM_POLICY" "Retain" }}} volumeBindingMode: Immediate allowVolumeExpansion: true \ No newline at end of file diff --git a/template/stacks/terralist/terralist.yaml b/template/stacks/terralist/terralist.yaml new file mode 100644 index 
0000000..77126f8 --- /dev/null +++ b/template/stacks/terralist/terralist.yaml @@ -0,0 +1,30 @@ +# helm upgrade --install --create-namespace --namespace terralist terralist oci://ghcr.io/terralist/helm-charts/terralist -f terralist-values.yaml +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: terralist + namespace: argocd + labels: + env: dev +spec: + project: default + syncPolicy: + automated: + selfHeal: true + syncOptions: + - CreateNamespace=true + retry: + limit: -1 + destination: + name: in-cluster + namespace: terralist + sources: + - repoURL: https://github.com/terralist/helm-charts + path: charts/terralist + targetRevision: terralist-0.8.1 + helm: + valueFiles: + - $values/{{{ .Env.CLIENT_REPO_ID }}}/{{{ .Env.DOMAIN }}}/stacks/terralist/terralist/values.yaml + - repoURL: https://{{{ .Env.CLIENT_REPO_DOMAIN }}}/{{{ .Env.CLIENT_REPO_ORG_NAME }}} + targetRevision: HEAD + ref: values diff --git a/template/stacks/terralist/terralist/values.yaml b/template/stacks/terralist/terralist/values.yaml new file mode 100644 index 0000000..096db37 --- /dev/null +++ b/template/stacks/terralist/terralist/values.yaml @@ -0,0 +1,87 @@ +controllers: + main: + strategy: Recreate + containers: + app: + env: + - name: TERRALIST_OAUTH_PROVIDER + value: oidc + - name: TERRALIST_OI_CLIENT_ID + valueFrom: + secretKeyRef: + name: oidc-credentials + key: client-id + - name: TERRALIST_OI_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: oidc-credentials + key: client-secret + - name: TERRALIST_OI_AUTHORIZE_URL + valueFrom: + secretKeyRef: + name: oidc-credentials + key: authorize-url + - name: TERRALIST_OI_TOKEN_URL + valueFrom: + secretKeyRef: + name: oidc-credentials + key: token-url + - name: TERRALIST_OI_USERINFO_URL + valueFrom: + secretKeyRef: + name: oidc-credentials + key: userinfo-url + - name: TERRALIST_OI_SCOPE + valueFrom: + secretKeyRef: + name: oidc-credentials + key: scope + - name: TERRALIST_TOKEN_SIGNING_SECRET + valueFrom: + secretKeyRef: + name: 
terralist-secret + key: token-signing-secret + - name: TERRALIST_COOKIE_SECRET + valueFrom: + secretKeyRef: + name: terralist-secret + key: cookie-secret + - name: TERRALIST_URL + value: https://terralist.{{{ .Env.DOMAIN_GITEA }}} + - name: TERRALIST_SQLITE_PATH + value: /data/db.sqlite + - name: TERRALIST_LOCAL_STORE + value: /data/modules + - name: TERRALIST_PROVIDERS_ANONYMOUS_READ + value: "true" + +ingress: + main: + enabled: true + className: nginx + annotations: + cert-manager.io/cluster-issuer: main + hosts: + - host: terralist.{{{ .Env.DOMAIN_GITEA }}} + paths: + - path: / + pathType: Prefix + service: + identifier: main + port: http + tls: + - hosts: + - terralist.{{{ .Env.DOMAIN_GITEA }}} + secretName: terralist-tls-secret + +persistence: + data: + enabled: true + accessMode: ReadWriteOnce + size: 10Gi + retain: false + storageClass: "csi-disk" + annotations: + everest.io/disk-volume-type: GPSSD + globalMounts: + - path: /data