Compare commits

..

2 Commits

Author SHA1 Message Date
juvdiaz 89fa756305 Label worker nodes before Prometheus migration
Homelab Main / deploy (push) Successful in 1m30s Details
2026-05-28 12:49:34 -06:00
juvdiaz b0a2c44bbb Move Prometheus stack to worker nodes 2026-05-28 12:46:54 -06:00
4 changed files with 123 additions and 4 deletions

View File

@ -210,6 +210,8 @@ duplicate those PV manifests when you want storage on another node.
`bootstrap/cluster` labels nodes with homelab placement metadata:
- `node-role.kubernetes.io/worker=worker` on every worker so `kubectl get nodes`
shows `worker` instead of `<none>` in the ROLES column
- `homelab.dev/node-role=control-plane` and `homelab.dev/storage=local` on the
Debian control plane
- `homelab.dev/node-role=edge-app` and `homelab.dev/storage=local` on the
@ -225,6 +227,15 @@ OpenEBS hostpath PVs are node-local. Move workloads only after their storage and
edge path are ready on the target node. Gitea is outside Kubernetes and is moved
by changing the Raspberry Pi Docker install target instead.
The Prometheus stack control workloads (Prometheus Operator, Prometheus,
Alertmanager, Grafana, and kube-state-metrics) are pinned to Pimox worker nodes
by the default `prometheus_stack_node_selector` (`homelab.dev/node-role=app`
and `homelab.dev/storage=nvme`). Because the Prometheus, Alertmanager, and Grafana
PVCs use retained local OpenEBS volumes, moving an existing install off the
Debian control plane requires discarding those PVCs. Run
`./lab.sh move-prometheus-stack-workers` from the Debian host to label existing
worker nodes, destroy only the existing `prometheus-stack` Helm release, delete
its retained PVC/PV objects, and recreate the stack on the worker selector.
The website and demos NodePorts are reachable from the OCI jump box through the
Raspberry Pi Tailscale interface. `bootstrap/cluster` installs a persistent
`homelab-tailscale-nodeport.service` on the configured worker to restore the

View File

@ -104,6 +104,8 @@ EOT
local.metallb_ip_address_pool_manifest,
var.metallb.l2_advertisement_enabled ? local.metallb_l2_advertisement_manifest : "",
]))
prometheus_stack_node_selector = var.prometheus_stack_node_selector
}
resource "helm_release" "calico_crds" {
@ -1102,6 +1104,7 @@ resource "helm_release" "prometheus_stack" {
enabled = false
}
prometheusOperator = {
nodeSelector = local.prometheus_stack_node_selector
tls = {
enabled = false
}
@ -1120,6 +1123,7 @@ resource "helm_release" "prometheus_stack" {
}
alertmanager = {
alertmanagerSpec = {
nodeSelector = local.prometheus_stack_node_selector
storage = {
volumeClaimTemplate = {
spec = {
@ -1137,6 +1141,7 @@ resource "helm_release" "prometheus_stack" {
}
prometheus = {
prometheusSpec = {
nodeSelector = local.prometheus_stack_node_selector
retention = var.observability.prometheus.retention
resources = {
requests = {
@ -1168,6 +1173,7 @@ resource "helm_release" "prometheus_stack" {
}
}
grafana = {
nodeSelector = local.prometheus_stack_node_selector
persistence = {
enabled = true
type = "sts"
@ -1205,6 +1211,9 @@ resource "helm_release" "prometheus_stack" {
}
}
}
"kube-state-metrics" = {
nodeSelector = local.prometheus_stack_node_selector
}
})
]
}

View File

@ -210,6 +210,15 @@ variable "observability" {
}
}
# Label set a node must carry to host the kube-prometheus-stack workloads.
# The default matches nodes labelled homelab.dev/node-role=app and
# homelab.dev/storage=nvme; override it to place the stack elsewhere.
variable "prometheus_stack_node_selector" {
  type        = map(string)
  description = "Node selector applied to kube-prometheus-stack control workloads so they stay off the control plane."

  default = {
    "homelab.dev/node-role" = "app"
    "homelab.dev/storage"   = "nvme"
  }
}
variable "extra_helm_releases" {
type = map(object({
repository = string

96
lab.sh
View File

@ -174,6 +174,74 @@ adopt_apps_existing_resources() {
"demos-static"
}
#######################################
# Apply homelab placement labels to every node that already exists in the
# cluster, then verify that at least one node matches the selector the
# Prometheus stack is pinned to.
#
# Labelling rules:
#   - control-plane node: homelab.dev/node-role=control-plane,
#     homelab.dev/storage=local (no worker role label)
#   - every other node:   node-role.kubernetes.io/worker=worker
#   - pimox-worker-*:     additionally homelab.dev/node-role=app,
#     homelab.dev/storage=nvme
#   - Raspberry Pi node:  additionally homelab.dev/node-role=edge-app,
#     homelab.dev/storage=local
#
# Globals:
#   KUBECONFIG_PATH              (read) kubeconfig passed to kubectl
#   LAB_CONTROL_PLANE_NODE_NAME  (read) control-plane node, default "debian"
#   LAB_RASPBERRY_NODE_NAME      (read) Raspberry Pi node, default "raspberry"
# Arguments: none
# Outputs:   progress message on stdout, errors on stderr
# Exits:     terminates the script with status 1 (exit, not return) when the
#            node list cannot be read or no node matches the selector
#######################################
ensure_homelab_node_labels() {
  local control_plane_node="${LAB_CONTROL_PLANE_NODE_NAME:-debian}"
  local raspberry_node="${LAB_RASPBERRY_NODE_NAME:-raspberry}"
  local prometheus_selector="homelab.dev/node-role=app,homelab.dev/storage=nvme"
  local node
  local node_names
  local target_nodes

  echo "Applying homelab labels to existing Kubernetes nodes..."

  # Capture the node list up front: feeding the loop from a process
  # substitution would discard kubectl's exit status, so a failed lookup
  # would silently skip all labelling.
  if ! node_names="$(kubectl --kubeconfig "${KUBECONFIG_PATH}" get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')"; then
    echo "Unable to list Kubernetes nodes; refusing to move prometheus-stack." >&2
    exit 1
  fi

  while IFS= read -r node; do
    [[ -n "${node}" ]] || continue

    # The control plane gets its own labels and never the worker role.
    if [[ "${node}" == "${control_plane_node}" ]]; then
      kubectl --kubeconfig "${KUBECONFIG_PATH}" label node "${node}" \
        homelab.dev/node-role=control-plane \
        homelab.dev/storage=local \
        --overwrite
      continue
    fi

    kubectl --kubeconfig "${KUBECONFIG_PATH}" label node "${node}" \
      node-role.kubernetes.io/worker=worker \
      --overwrite

    if [[ "${node}" == pimox-worker-* ]]; then
      kubectl --kubeconfig "${KUBECONFIG_PATH}" label node "${node}" \
        homelab.dev/node-role=app \
        homelab.dev/storage=nvme \
        --overwrite
    elif [[ "${node}" == "${raspberry_node}" ]]; then
      kubectl --kubeconfig "${KUBECONFIG_PATH}" label node "${node}" \
        homelab.dev/node-role=edge-app \
        homelab.dev/storage=local \
        --overwrite
    fi
  done <<<"${node_names}"

  # Refuse to continue when nothing could host the stack after labelling.
  target_nodes="$(kubectl --kubeconfig "${KUBECONFIG_PATH}" get nodes -l "${prometheus_selector}" -o name)"
  if [[ -z "${target_nodes}" ]]; then
    echo "No nodes match ${prometheus_selector}; refusing to move prometheus-stack." >&2
    exit 1
  fi
}
#######################################
# Delete the PVCs and PV objects that belong to the prometheus-stack release
# (Prometheus, Alertmanager and Grafana). The data they hold is discarded.
#
# Globals:
#   KUBECONFIG_PATH (read) kubeconfig passed to kubectl
# Arguments:
#   $1 - namespace holding the PVCs (default "monitoring")
# Outputs:
#   progress messages and kubectl output on stdout
#######################################
delete_prometheus_stack_storage() {
  local namespace="${1:-monitoring}"
  # Extended regex matched against claim names; only claims that look like
  # they belong to the prometheus-stack release are selected.
  local pattern='(^|-)prometheus-stack-(prometheus|alertmanager|grafana)(-|$)|^prometheus-prometheus-stack|^alertmanager-prometheus-stack|^storage-prometheus-stack-grafana'
  local pvc_names
  local pv_names

  # "-o name" prints "<kind>/<name>"; keep the name part of matching claims.
  # stderr from the lookup is discarded.
  pvc_names="$(kubectl --kubeconfig "${KUBECONFIG_PATH}" -n "${namespace}" get pvc -o name 2>/dev/null |
    awk -F/ -v pattern="${pattern}" '$2 ~ pattern {print $2}')"

  # PVs are selected through their claimRef (namespace + claim name). The
  # list is captured BEFORE the PVCs are deleted.
  pv_names="$(kubectl --kubeconfig "${KUBECONFIG_PATH}" get pv \
    -o jsonpath='{range .items[?(@.spec.claimRef.namespace=="'"${namespace}"'")]}{.metadata.name}{"\t"}{.spec.claimRef.name}{"\n"}{end}' 2>/dev/null |
    awk -v pattern="${pattern}" '$2 ~ pattern {print $1}')"

  if [[ -n "${pvc_names}" ]]; then
    echo "Deleting old prometheus-stack PVCs in ${namespace}; saved Prometheus, Alertmanager, and Grafana data will be discarded..."
    printf '%s\n' "${pvc_names}" |
      xargs -r kubectl --kubeconfig "${KUBECONFIG_PATH}" -n "${namespace}" delete pvc --wait=true --timeout=180s
  fi

  if [[ -n "${pv_names}" ]]; then
    echo "Deleting old prometheus-stack retained PV objects..."
    # The PV list is a snapshot taken before the PVC deletion above, so a PV
    # may already be gone by now (e.g. reclaimed once its claim was removed).
    # Tolerate that instead of failing the migration on a NotFound error.
    printf '%s\n' "${pv_names}" |
      xargs -r kubectl --kubeconfig "${KUBECONFIG_PATH}" delete pv --wait=false --ignore-not-found=true
  fi
}
run_tofu_stack() {
local stack="$1"
local -a apply_args=(-auto-approve)
@ -192,6 +260,25 @@ run_tofu_stack() {
tofu -chdir="${REPO_ROOT}/${stack}" apply "${apply_args[@]}"
}
#######################################
# Relocate the prometheus-stack Helm release onto worker nodes.
#
# Sequence: label nodes, init the platform stack, adopt existing resources,
# destroy only helm_release.prometheus_stack, delete its PVC/PV objects,
# re-apply the platform stack, then list the pods in the monitoring namespace.
# Existing prometheus-stack PVC data is deleted.
#
# Globals:
#   REPO_ROOT                 (read)  repository root containing the stack
#   KUBECONFIG_PATH           (read)  kubeconfig used for kubectl and as the
#                                     default for TF_VAR_kubeconfig_path
#   LAB_MONITORING_NAMESPACE  (read)  namespace, default "monitoring"
#   TF_VAR_kubeconfig_path    (exported; keeps a pre-set non-empty value)
#   KUBECONFIG                (exported; set to TF_VAR_kubeconfig_path)
# Arguments: none
#######################################
move_prometheus_stack_workers() {
  local monitoring_ns="${LAB_MONITORING_NAMESPACE:-monitoring}"
  local platform_stack="bootstrap/platform"
  local -a tofu_cmd

  require_debian_server "move-prometheus-stack-workers"

  # Fall back to the lab kubeconfig only when the caller has not set one.
  : "${TF_VAR_kubeconfig_path:=${KUBECONFIG_PATH}}"
  export TF_VAR_kubeconfig_path
  export KUBECONFIG="${TF_VAR_kubeconfig_path}"

  printf '%s\n' "Moving prometheus-stack off the control plane. Existing prometheus-stack PVC data will be deleted."

  ensure_homelab_node_labels

  # Every tofu invocation below targets the same stack directory.
  tofu_cmd=(tofu -chdir="${REPO_ROOT}/${platform_stack}")

  "${tofu_cmd[@]}" init
  adopt_platform_existing_resources
  "${tofu_cmd[@]}" destroy -target=helm_release.prometheus_stack -auto-approve
  delete_prometheus_stack_storage "${monitoring_ns}"
  "${tofu_cmd[@]}" apply -auto-approve

  kubectl --kubeconfig "${KUBECONFIG_PATH}" -n "${monitoring_ns}" get pods -o wide
}
truthy() {
case "${1,,}" in
1 | true | yes | on)
@ -604,8 +691,8 @@ write_cluster_worker_var_file() {
LAB_RASPBERRY_USER="${LAB_RASPBERRY_USER:-jv}" \
LAB_RASPBERRY_NODE_NAME="${LAB_RASPBERRY_NODE_NAME:-raspberry}" \
LAB_RASPBERRY_SSH_KEY_PATH="${LAB_RASPBERRY_SSH_KEY_PATH:-/home/jv/.ssh/id_ed25519}" \
LAB_RASPBERRY_NODE_LABELS_JSON="${LAB_RASPBERRY_NODE_LABELS_JSON:-{\"homelab.dev/node-role\":\"edge-app\",\"homelab.dev/storage\":\"local\"}}" \
LAB_PIMOX_WORKER_NODE_LABELS_JSON="${LAB_PIMOX_WORKER_NODE_LABELS_JSON:-{\"homelab.dev/node-role\":\"app\",\"homelab.dev/storage\":\"nvme\"}}" \
LAB_RASPBERRY_NODE_LABELS_JSON="${LAB_RASPBERRY_NODE_LABELS_JSON:-{\"node-role.kubernetes.io/worker\":\"worker\",\"homelab.dev/node-role\":\"edge-app\",\"homelab.dev/storage\":\"local\"}}" \
LAB_PIMOX_WORKER_NODE_LABELS_JSON="${LAB_PIMOX_WORKER_NODE_LABELS_JSON:-{\"node-role.kubernetes.io/worker\":\"worker\",\"homelab.dev/node-role\":\"app\",\"homelab.dev/storage\":\"nvme\"}}" \
python3 - "${spec_file}" "${var_file}" <<'PY'
import json
import os
@ -2761,6 +2848,9 @@ case "${1:-}" in
install-gitea-runner)
install_gitea_runner "${2:-}"
;;
move-prometheus-stack-workers)
move_prometheus_stack_workers
;;
openwrt)
openwrt
;;
@ -2768,7 +2858,7 @@ case "${1:-}" in
nuke
;;
*)
echo "Usage: $0 {up|rebuild-cluster|apps|deploy-gitea|bootstrap-gitea-repo|backup-gitea|drill-gitea-restore|install-gitea-runner|openwrt|nuke}"
echo "Usage: $0 {up|rebuild-cluster|apps|deploy-gitea|bootstrap-gitea-repo|backup-gitea|drill-gitea-restore|install-gitea-runner|move-prometheus-stack-workers|openwrt|nuke}"
exit 1
;;
esac