From 009e6ce927aa2d6d5629935e872e1d467a7293b3 Mon Sep 17 00:00:00 2001
From: juvdiaz
Date: Mon, 25 May 2026 21:50:53 -0600
Subject: [PATCH] Add observability platform stack

---
# Reviewer notes. This free-text region (after the "---" marker, before the
# first "diff --git" line) is skipped when the patch is applied.
#
# bootstrap/platform/main.tf
#   * kubernetes_namespace_v1.monitoring creates the namespace named by
#     var.observability.namespace and is declared to depend on the
#     openebs_hostpath_retain storage class.
#   * helm_release.loki, .mimir, .promtail and .prometheus_stack all install
#     into var.observability.namespace with create_namespace = false and
#     wait = true. loki and mimir depend on the namespace resource, promtail
#     depends on loki, prometheus_stack depends on loki and mimir.
#   * loki: deploymentMode "SingleBinary", filesystem storage, one singleBinary
#     replica; read/write/backend replicas are 0 and gateway, chunksCache,
#     resultsCache, lokiCanary and test are disabled. Retention comes from
#     var.observability.loki.retention_period.
#   * mimir: multitenancy is disabled, ingester ring replication_factor is 1,
#     every component that sets "replicas" uses 1, and zoneAwareReplication is
#     disabled for alertmanager, ingester and store_gateway.
#   * promtail pushes to http://loki.<namespace>.svc:3100/loki/api/v1/push.
#   * prometheus_stack: remoteWrite points at
#     http://mimir-nginx.<namespace>.svc/api/v1/push only when
#     var.observability.prometheus.remote_write_mimir_enabled is true, and is
#     an empty list otherwise. Grafana receives "Loki" and "Mimir" as
#     additional, non-default data sources.
#   * Every volume requested by these releases uses
#     var.openebs.retain_storage_class.
#
# bootstrap/platform/variables.tf
#   * Adds variable "observability": an object type with a namespace plus
#     per-chart repository/chart/version and storage settings, and a default
#     value for each field.
#   * The last hunk removes one empty line (apparently the file's trailing
#     blank line).
#
# NOTE(review): line breaks and leading indentation below were restored from a
# whitespace-collapsed copy. Two-space HCL indentation is assumed for context
# lines, and the blank context line in the last variables.tf hunk is assumed to
# precede "default = {}" -- confirm against the target files, or apply with a
# whitespace-tolerant option.
 bootstrap/platform/main.tf | 372 ++++++++++++++++++++++++++++++++
 bootstrap/platform/variables.tf | 75 ++++++-
 2 files changed, 446 insertions(+), 1 deletion(-)

diff --git a/bootstrap/platform/main.tf b/bootstrap/platform/main.tf
index 7139c3a..81b9966 100644
--- a/bootstrap/platform/main.tf
+++ b/bootstrap/platform/main.tf
@@ -231,6 +231,14 @@ resource "kubernetes_storage_class_v1" "openebs_hostpath_retain" {
   allow_volume_expansion = true
 }
 
+resource "kubernetes_namespace_v1" "monitoring" {
+  depends_on = [kubernetes_storage_class_v1.openebs_hostpath_retain]
+
+  metadata {
+    name = var.observability.namespace
+  }
+}
+
 resource "helm_release" "argocd" {
   depends_on = [helm_release.openebs]
   name = "argocd"
@@ -328,6 +336,370 @@ EOT
   }
 }
 
+resource "helm_release" "loki" {
+  depends_on = [kubernetes_namespace_v1.monitoring]
+  name = "loki"
+  repository = var.observability.loki.repository
+  chart = var.observability.loki.chart
+  version = var.observability.loki.version
+  namespace = var.observability.namespace
+  create_namespace = false
+  timeout = 900
+  wait = true
+
+  values = [
+    yamlencode({
+      deploymentMode = "SingleBinary"
+      loki = {
+        auth_enabled = false
+        commonConfig = {
+          replication_factor = 1
+        }
+        storage = {
+          type = "filesystem"
+        }
+        schemaConfig = {
+          configs = [
+            {
+              from = "2024-04-01"
+              store = "tsdb"
+              object_store = "filesystem"
+              schema = "v13"
+              index = {
+                prefix = "loki_index_"
+                period = "24h"
+              }
+            }
+          ]
+        }
+        limits_config = {
+          retention_period = var.observability.loki.retention_period
+        }
+        compactor = {
+          retention_enabled = true
+          delete_request_store = "filesystem"
+          working_directory = "/var/loki/compactor"
+        }
+      }
+      singleBinary = {
+        replicas = 1
+        affinity = {}
+        persistence = {
+          enabled = true
+          whenScaled = "Retain"
+          whenDeleted = "Retain"
+          enableStatefulSetAutoDeletePVC = false
+          storageClass = var.openebs.retain_storage_class
+          size = var.observability.loki.storage_size
+        }
+        resources = {
+          requests = {
+            cpu = "50m"
+            memory = "256Mi"
+          }
+          limits = {
+            memory = "768Mi"
+          }
+        }
+      }
+      read = {
+        replicas = 0
+      }
+      write = {
+        replicas = 0
+      }
+      backend = {
+        replicas = 0
+      }
+      gateway = {
+        enabled = false
+      }
+      chunksCache = {
+        enabled = false
+      }
+      resultsCache = {
+        enabled = false
+      }
+      lokiCanary = {
+        enabled = false
+      }
+      test = {
+        enabled = false
+      }
+    })
+  ]
+}
+
+resource "helm_release" "mimir" {
+  depends_on = [kubernetes_namespace_v1.monitoring]
+  name = "mimir"
+  repository = var.observability.mimir.repository
+  chart = var.observability.mimir.chart
+  version = var.observability.mimir.version
+  namespace = var.observability.namespace
+  create_namespace = false
+  timeout = 1200
+  wait = true
+
+  values = [
+    yamlencode({
+      mimir = {
+        structuredConfig = {
+          multitenancy_enabled = false
+          ingester = {
+            ring = {
+              replication_factor = 1
+            }
+          }
+        }
+      }
+      alertmanager = {
+        persistentVolume = {
+          storageClass = var.openebs.retain_storage_class
+          size = var.observability.mimir.alertmanager_storage_size
+        }
+        zoneAwareReplication = {
+          enabled = false
+        }
+      }
+      ingester = {
+        replicas = 1
+        persistentVolume = {
+          storageClass = var.openebs.retain_storage_class
+          size = var.observability.mimir.ingester_storage_size
+        }
+        resources = {
+          requests = {
+            cpu = "100m"
+            memory = "512Mi"
+          }
+          limits = {
+            memory = "1Gi"
+          }
+        }
+        zoneAwareReplication = {
+          enabled = false
+        }
+      }
+      store_gateway = {
+        replicas = 1
+        persistentVolume = {
+          storageClass = var.openebs.retain_storage_class
+          size = var.observability.mimir.store_gateway_storage_size
+        }
+        zoneAwareReplication = {
+          enabled = false
+        }
+      }
+      compactor = {
+        replicas = 1
+        persistentVolume = {
+          storageClass = var.openebs.retain_storage_class
+          size = var.observability.mimir.compactor_storage_size
+        }
+      }
+      distributor = {
+        replicas = 1
+      }
+      querier = {
+        replicas = 1
+      }
+      query_frontend = {
+        replicas = 1
+      }
+      query_scheduler = {
+        replicas = 1
+      }
+      ruler = {
+        replicas = 1
+      }
+      minio = {
+        persistence = {
+          storageClass = var.openebs.retain_storage_class
+          size = var.observability.mimir.minio_storage_size
+        }
+        resources = {
+          requests = {
+            cpu = "50m"
+            memory = "128Mi"
+          }
+          limits = {
+            memory = "512Mi"
+          }
+        }
+      }
+      nginx = {
+        replicas = 1
+      }
+      gateway = {
+        enabled = false
+      }
+      rollout_operator = {
+        enabled = false
+      }
+    })
+  ]
+}
+
+resource "helm_release" "promtail" {
+  depends_on = [helm_release.loki]
+  name = "promtail"
+  repository = var.observability.promtail.repository
+  chart = var.observability.promtail.chart
+  version = var.observability.promtail.version
+  namespace = var.observability.namespace
+  create_namespace = false
+  timeout = 600
+  wait = true
+
+  values = [
+    yamlencode({
+      config = {
+        clients = [
+          {
+            url = "http://loki.${var.observability.namespace}.svc:3100/loki/api/v1/push"
+          }
+        ]
+      }
+      resources = {
+        requests = {
+          cpu = "25m"
+          memory = "64Mi"
+        }
+        limits = {
+          memory = "128Mi"
+        }
+      }
+    })
+  ]
+}
+
+resource "helm_release" "prometheus_stack" {
+  depends_on = [helm_release.loki, helm_release.mimir]
+  name = "prometheus-stack"
+  repository = var.observability.prometheus.repository
+  chart = var.observability.prometheus.chart
+  version = var.observability.prometheus.version
+  namespace = var.observability.namespace
+  create_namespace = false
+  timeout = 1200
+  wait = true
+
+  values = [
+    yamlencode({
+      kubeControllerManager = {
+        enabled = false
+      }
+      kubeEtcd = {
+        enabled = false
+      }
+      kubeProxy = {
+        enabled = false
+      }
+      kubeScheduler = {
+        enabled = false
+      }
+      prometheusOperator = {
+        admissionWebhooks = {
+          enabled = false
+        }
+        resources = {
+          requests = {
+            cpu = "50m"
+            memory = "128Mi"
+          }
+          limits = {
+            memory = "384Mi"
+          }
+        }
+      }
+      alertmanager = {
+        alertmanagerSpec = {
+          storage = {
+            volumeClaimTemplate = {
+              spec = {
+                storageClassName = var.openebs.retain_storage_class
+                accessModes = ["ReadWriteOnce"]
+                resources = {
+                  requests = {
+                    storage = var.observability.prometheus.alertmanager_storage_size
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+      prometheus = {
+        prometheusSpec = {
+          retention = var.observability.prometheus.retention
+          resources = {
+            requests = {
+              cpu = "100m"
+              memory = "512Mi"
+            }
+            limits = {
+              memory = "1Gi"
+            }
+          }
+          remoteWrite = var.observability.prometheus.remote_write_mimir_enabled ? [
+            {
+              url = "http://mimir-nginx.${var.observability.namespace}.svc/api/v1/push"
+            }
+          ] : []
+          storageSpec = {
+            volumeClaimTemplate = {
+              spec = {
+                storageClassName = var.openebs.retain_storage_class
+                accessModes = ["ReadWriteOnce"]
+                resources = {
+                  requests = {
+                    storage = var.observability.prometheus.storage_size
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+      grafana = {
+        persistence = {
+          enabled = true
+          type = "sts"
+          storageClassName = var.openebs.retain_storage_class
+          accessModes = ["ReadWriteOnce"]
+          size = var.observability.prometheus.grafana_storage_size
+        }
+        additionalDataSources = [
+          {
+            name = "Loki"
+            type = "loki"
+            access = "proxy"
+            url = "http://loki.${var.observability.namespace}.svc:3100"
+            isDefault = false
+          },
+          {
+            name = "Mimir"
+            type = "prometheus"
+            access = "proxy"
+            url = "http://mimir-nginx.${var.observability.namespace}.svc/prometheus"
+            isDefault = false
+          }
+        ]
+        resources = {
+          requests = {
+            cpu = "50m"
+            memory = "128Mi"
+          }
+          limits = {
+            memory = "384Mi"
+          }
+        }
+      }
+    })
+  ]
+}
+
 resource "helm_release" "extra_tools" {
   for_each = var.extra_helm_releases
 
diff --git a/bootstrap/platform/variables.tf b/bootstrap/platform/variables.tf
index 5d7a7bb..ef9fb1a 100644
--- a/bootstrap/platform/variables.tf
+++ b/bootstrap/platform/variables.tf
@@ -76,6 +76,80 @@ variable "argocd" {
   }
 }
 
+variable "observability" {
+  type = object({
+    namespace = string
+    prometheus = object({
+      repository = string
+      chart = string
+      version = string
+      retention = string
+      storage_size = string
+      alertmanager_storage_size = string
+      grafana_storage_size = string
+      remote_write_mimir_enabled = bool
+    })
+    loki = object({
+      repository = string
+      chart = string
+      version = string
+      storage_size = string
+      retention_period = string
+    })
+    promtail = object({
+      repository = string
+      chart = string
+      version = string
+    })
+    mimir = object({
+      repository = string
+      chart = string
+      version = string
+      minio_storage_size = string
+      alertmanager_storage_size = string
+      ingester_storage_size = string
+      store_gateway_storage_size = string
+      compactor_storage_size = string
+    })
+  })
+
+  default = {
+    namespace = "monitoring"
+    prometheus = {
+      repository = "https://prometheus-community.github.io/helm-charts"
+      chart = "kube-prometheus-stack"
+      version = "85.3.3"
+      retention = "7d"
+      storage_size = "15Gi"
+      alertmanager_storage_size = "1Gi"
+      grafana_storage_size = "2Gi"
+      remote_write_mimir_enabled = true
+    }
+    loki = {
+      repository = "https://grafana.github.io/helm-charts"
+      chart = "loki"
+      version = "7.0.0"
+      storage_size = "10Gi"
+      retention_period = "168h"
+    }
+    promtail = {
+      repository = "https://grafana.github.io/helm-charts"
+      chart = "promtail"
+      version = "6.17.1"
+    }
+    mimir = {
+      repository = "https://grafana.github.io/helm-charts"
+      chart = "mimir-distributed"
+      version = "5.8.0"
+      minio_storage_size = "10Gi"
+      alertmanager_storage_size = "1Gi"
+      ingester_storage_size = "4Gi"
+      store_gateway_storage_size = "4Gi"
+      compactor_storage_size = "4Gi"
+    }
+  }
+}
+
 variable "extra_helm_releases" {
   type = map(object({
     repository = string
@@ -90,4 +164,3 @@ variable "extra_helm_releases" {
 
   default = {}
 }
-