diff --git a/README.md b/README.md
index 6a34c53..2f6b7c4 100644
--- a/README.md
+++ b/README.md
@@ -225,6 +225,15 @@ OpenEBS hostpath PVs are node-local. Move workloads only after their storage and
 edge path are ready on the target node. Gitea is outside Kubernetes and is moved
 by changing the Raspberry Pi Docker install target instead.
 
+The Prometheus stack control workloads are pinned to Pimox worker nodes by the
+default `prometheus_stack_node_selector` (`homelab.dev/node-role=app` and
+`homelab.dev/storage=nvme`). Because the Prometheus, Alertmanager, and Grafana
+PVCs use retained local OpenEBS volumes, moving an existing install off the
+Debian control plane requires discarding those PVCs. Run
+`./lab.sh move-prometheus-stack-workers` from the Debian host to destroy only
+the existing `prometheus-stack` Helm release, delete its retained PVC/PV objects,
+and recreate the stack on the worker selector.
+
 The website and demos NodePorts are reachable from the OCI jump box through the
 Raspberry Pi Tailscale interface. `bootstrap/cluster` installs a persistent
 `homelab-tailscale-nodeport.service` on the configured worker to restore the
diff --git a/bootstrap/platform/main.tf b/bootstrap/platform/main.tf
index 9cc4e24..d84795e 100644
--- a/bootstrap/platform/main.tf
+++ b/bootstrap/platform/main.tf
@@ -104,6 +104,8 @@ EOT
     local.metallb_ip_address_pool_manifest,
     var.metallb.l2_advertisement_enabled ? local.metallb_l2_advertisement_manifest : "",
   ]))
+
+  prometheus_stack_node_selector = var.prometheus_stack_node_selector
 }
 
 resource "helm_release" "calico_crds" {
@@ -1102,6 +1104,7 @@ resource "helm_release" "prometheus_stack" {
         enabled = false
       }
       prometheusOperator = {
+        nodeSelector = local.prometheus_stack_node_selector
         tls = {
           enabled = false
         }
@@ -1120,6 +1123,7 @@ resource "helm_release" "prometheus_stack" {
       }
       alertmanager = {
         alertmanagerSpec = {
+          nodeSelector = local.prometheus_stack_node_selector
           storage = {
             volumeClaimTemplate = {
               spec = {
@@ -1137,7 +1141,8 @@ resource "helm_release" "prometheus_stack" {
       }
       prometheus = {
         prometheusSpec = {
-          retention = var.observability.prometheus.retention
+          nodeSelector = local.prometheus_stack_node_selector
+          retention    = var.observability.prometheus.retention
           resources = {
             requests = {
               cpu = "100m"
@@ -1168,6 +1173,7 @@ resource "helm_release" "prometheus_stack" {
         }
       }
       grafana = {
+        nodeSelector = local.prometheus_stack_node_selector
         persistence = {
           enabled = true
           type    = "sts"
@@ -1205,6 +1211,9 @@ resource "helm_release" "prometheus_stack" {
           }
         }
       }
+      "kube-state-metrics" = {
+        nodeSelector = local.prometheus_stack_node_selector
+      }
     })
   ]
 }
diff --git a/bootstrap/platform/variables.tf b/bootstrap/platform/variables.tf
index 430d5a1..e9cbefa 100644
--- a/bootstrap/platform/variables.tf
+++ b/bootstrap/platform/variables.tf
@@ -210,6 +210,15 @@ variable "observability" {
   }
 }
 
+variable "prometheus_stack_node_selector" {
+  description = "Node selector applied to kube-prometheus-stack control workloads so they stay off the control plane."
+  type        = map(string)
+  default = {
+    "homelab.dev/node-role" = "app"
+    "homelab.dev/storage"   = "nvme"
+  }
+}
+
 variable "extra_helm_releases" {
   type = map(object({
     repository = string
diff --git a/lab.sh b/lab.sh
index 7a01964..27698ef 100755
--- a/lab.sh
+++ b/lab.sh
@@ -174,6 +174,51 @@ adopt_apps_existing_resources() {
     "demos-static"
 }
 
+# Delete the prometheus-stack PVCs and their retained PV objects in a namespace.
+#
+# Arguments:
+#   $1 - namespace to clean up (default: monitoring).
+# Returns:
+#   0 when matching objects were deleted or none existed; 1 when the cluster
+#   could not be queried or a delete failed. Listing errors are fatal on purpose:
+#   treating them as "nothing to delete" would leave the old node-local claims
+#   in place for the recreated stack.
+delete_prometheus_stack_storage() {
+  local namespace="${1:-monitoring}"
+  # Matches claim names created for the Prometheus, Alertmanager, and Grafana workloads.
+  local pattern='(^|-)prometheus-stack-(prometheus|alertmanager|grafana)(-|$)|^prometheus-prometheus-stack|^alertmanager-prometheus-stack|^storage-prometheus-stack-grafana'
+  local pvc_list
+  local pv_list
+  local pvc_names
+  local pv_names
+
+  if ! pvc_list="$(kubectl --kubeconfig "${KUBECONFIG_PATH}" -n "${namespace}" get pvc -o name)"; then
+    echo "Failed to list PVCs in ${namespace}; not deleting any prometheus-stack storage." >&2
+    return 1
+  fi
+  # Each output row is "<pv name>\t<claim name>" for PVs whose claimRef is in the namespace.
+  if ! pv_list="$(kubectl --kubeconfig "${KUBECONFIG_PATH}" get pv \
+    -o jsonpath='{range .items[?(@.spec.claimRef.namespace=="'"${namespace}"'")]}{.metadata.name}{"\t"}{.spec.claimRef.name}{"\n"}{end}')"; then
+    echo "Failed to list PVs; not deleting any prometheus-stack storage." >&2
+    return 1
+  fi
+
+  pvc_names="$(awk -F/ -v pattern="${pattern}" '$2 ~ pattern {print $2}' <<<"${pvc_list}")"
+  pv_names="$(awk -v pattern="${pattern}" '$2 ~ pattern {print $1}' <<<"${pv_list}")"
+
+  if [[ -n "${pvc_names}" ]]; then
+    echo "Deleting old prometheus-stack PVCs in ${namespace}; saved Prometheus, Alertmanager, and Grafana data will be discarded..."
+    printf '%s\n' "${pvc_names}" |
+      xargs -r kubectl --kubeconfig "${KUBECONFIG_PATH}" -n "${namespace}" delete pvc --wait=true --timeout=180s || return 1
+  fi
+
+  if [[ -n "${pv_names}" ]]; then
+    echo "Deleting old prometheus-stack retained PV objects..."
+    printf '%s\n' "${pv_names}" |
+      xargs -r kubectl --kubeconfig "${KUBECONFIG_PATH}" delete pv --wait=false || return 1
+  fi
+}
+
 run_tofu_stack() {
   local stack="$1"
   local -a apply_args=(-auto-approve)
@@ -192,6 +237,37 @@ run_tofu_stack() {
   tofu -chdir="${REPO_ROOT}/${stack}" apply "${apply_args[@]}"
 }
 
+# Recreate the prometheus-stack Helm release on the worker node selector.
+#
+# Destroys only helm_release.prometheus_stack, deletes its PVC/PV objects, then
+# re-applies bootstrap/platform. Existing monitoring data is discarded.
+# Set LAB_MONITORING_NAMESPACE when the stack is not in "monitoring".
+move_prometheus_stack_workers() {
+  local stack="bootstrap/platform"
+  local namespace="${LAB_MONITORING_NAMESPACE:-monitoring}"
+
+  require_debian_server "move-prometheus-stack-workers"
+
+  export TF_VAR_kubeconfig_path="${TF_VAR_kubeconfig_path:-${KUBECONFIG_PATH}}"
+  export KUBECONFIG="${TF_VAR_kubeconfig_path}"
+
+  # tofu is handed TF_VAR_kubeconfig_path while the kubectl calls use KUBECONFIG_PATH;
+  # refuse to destroy the release via one kubeconfig and delete volumes via another.
+  if [[ "${TF_VAR_kubeconfig_path}" != "${KUBECONFIG_PATH}" ]]; then
+    echo "TF_VAR_kubeconfig_path (${TF_VAR_kubeconfig_path}) differs from KUBECONFIG_PATH (${KUBECONFIG_PATH}); refusing to move prometheus-stack." >&2
+    return 1
+  fi
+
+  echo "Moving prometheus-stack off the control plane. Existing prometheus-stack PVC data will be deleted."
+  tofu -chdir="${REPO_ROOT}/${stack}" init || return 1
+  adopt_platform_existing_resources
+  tofu -chdir="${REPO_ROOT}/${stack}" destroy -target=helm_release.prometheus_stack -auto-approve || return 1
+  # Stop before re-applying if the old volumes could not be removed.
+  delete_prometheus_stack_storage "${namespace}" || return 1
+  tofu -chdir="${REPO_ROOT}/${stack}" apply -auto-approve || return 1
+  kubectl --kubeconfig "${KUBECONFIG_PATH}" -n "${namespace}" get pods -o wide
+}
+
 truthy() {
   case "${1,,}" in
   1 | true | yes | on)
@@ -2761,6 +2837,9 @@ case "${1:-}" in
 install-gitea-runner)
   install_gitea_runner "${2:-}"
   ;;
+move-prometheus-stack-workers)
+  move_prometheus_stack_workers
+  ;;
 openwrt)
   openwrt
   ;;
@@ -2768,7 +2847,7 @@ case "${1:-}" in
   nuke
   ;;
 *)
-  echo "Usage: $0 {up|rebuild-cluster|apps|deploy-gitea|bootstrap-gitea-repo|backup-gitea|drill-gitea-restore|install-gitea-runner|openwrt|nuke}"
+  echo "Usage: $0 {up|rebuild-cluster|apps|deploy-gitea|bootstrap-gitea-repo|backup-gitea|drill-gitea-restore|install-gitea-runner|move-prometheus-stack-workers|openwrt|nuke}"
   exit 1
   ;;
 esac