Add Pimox cluster rebuild path
Homelab Main / deploy (push) Failing after 2s Details

This commit is contained in:
juvdiaz 2026-05-27 14:31:06 -06:00
parent 041e607fed
commit f8d2286558
5 changed files with 134 additions and 39 deletions

View File

@ -164,16 +164,31 @@ hostname.
For Pimox on Orange Pi 5 Plus, `./lab.sh up` can create the Debian 13 arm64
template and worker VM clones automatically. Defaults are intentionally tied to
the observed host: Pimox SSH host `192.168.100.80`, bridge `vmbr0`, template VMID
`9000` on `local` storage, worker VMIDs starting at `9010`, and worker clone
storage `nvme_thin_pool`. Details and override variables are in
`9000` on `local` storage, two 4 GiB worker VMs starting at VMID `9010`, CPU
affinities `4-5` and `6-7`, and worker clone storage `nvme_thin_pool`. Details
and override variables are in
`bootstrap/provisioning/README.md`.
Worker indexes are stable. Index `1` maps to VMID `9010`, node name
`pimox-worker-01`, and worker key `pimox01`; index `2` maps to VMID `9011`, and
so on. `LAB_PIMOX_SKIP_WORKER_INDEXES=1` leaves the already-created first slot
unmanaged while allowing higher indexes to be automated.
so on. `LAB_PIMOX_SKIP_WORKER_INDEXES=1` leaves the first slot unmanaged while
allowing higher indexes to be automated.
Add entries to `bootstrap/cluster/variables.tf` or a `.tfvars` file:
Run a full cluster rebuild from the Debian server with:
```bash
./lab.sh rebuild-cluster
```
That path preserves external Raspberry Pi Gitea, rebuilds the Pimox template
with 2 cores, 4 GiB memory, and CPU affinity `4-5`, replaces two Pimox worker
VMs with 2 cores, 4 GiB memory, and affinities `4-5` and `6-7`, and joins those
workers to the Kubernetes cluster. The Raspberry Pi worker is excluded by
default while it hosts external Gitea.
To opt the Raspberry Pi back into the Kubernetes cluster, either set
`LAB_INCLUDE_RASPBERRY_WORKER=true` or add entries to
`bootstrap/cluster/variables.tf` (or to a `.tfvars` file):
```hcl
worker_nodes = {
@ -459,11 +474,12 @@ for platform or provisioning updates.
## Destructive Rebuilds
`./lab.sh nuke` resets kubeadm, containerd runtime state, CNI files, Calico
links, iptables rules, local OpenTofu state, and configured worker nodes. It does
not delete retained data under `/var/openebs/local`.
links, iptables rules, and local OpenTofu state. It does not delete retained data
under `/var/openebs/local`.
For multi-node labs, set `WORKER_SSH_TARGETS` to a space-separated list of SSH
targets. For a single-node rebuild, set it to an empty string.
targets. It defaults to an empty string so the Raspberry Pi Gitea host is not
cleaned unless you explicitly include it.
## Website App

View File

@ -59,25 +59,13 @@ variable "worker_nodes" {
ssh_key_path = string
}))
default = {
raspberrypi = {
host = "192.168.100.89"
user = "jv"
node_name = "raspberry"
ssh_key_path = "/home/jv/.ssh/id_ed25519"
}
}
default = {}
}
variable "worker_node_labels" {
type = map(map(string))
default = {
raspberrypi = {
"homelab.dev/node-role" = "edge-app"
"homelab.dev/storage" = "local"
}
}
default = {}
}
variable "tailscale_nodeport_access" {

View File

@ -108,21 +108,26 @@ networking, virtio-scsi disk, `vmbr0`, `local` template storage, 1 socket with
`TF_VAR_pimox_template_cpu_affinity` if the Orange Pi template layout changes.
`./lab.sh up` also creates or reuses worker clones after the template exists. It
defaults to one worker, VMID `9010`, names like `pimox-worker-01`, deterministic
locally administered MAC addresses, 1 socket with 2 cores, 4 GiB RAM,
Orange Pi 5 high-speed CPU affinity pairs `4-5` and `6-7`,
`nvme_thin_pool` clone storage, and qemu-guest-agent IP discovery. New workers
are full clones created with
defaults to two workers, VMIDs `9010` and `9011`, names like
`pimox-worker-01`, deterministic locally administered MAC addresses, 1 socket
with 2 cores, 4 GiB RAM, Orange Pi 5 high-speed CPU affinity pairs `4-5` and
`6-7`, `nvme_thin_pool` clone storage, and qemu-guest-agent IP discovery. New
workers are full clones created with
`qm clone --storage`, so the template can remain on `local` while worker disks
land on the NVMe thin pool. The pipeline refuses `LAB_PIMOX_WORKER_STORAGE=local`
so only the template VM lives on local storage. Useful overrides:
land on the NVMe thin pool. Set `LAB_PIMOX_WORKER_REPLACE_EXISTING=true` to
destroy and recreate existing worker VMs from the current template. The pipeline
refuses `LAB_PIMOX_WORKER_STORAGE=local` so only the template VM lives on local
storage. Useful overrides:
```bash
./lab.sh rebuild-cluster
LAB_PIMOX_PIPELINE=false ./lab.sh up
LAB_PIMOX_TEMPLATE_REPLACE_EXISTING=true ./lab.sh up
LAB_PIMOX_WORKER_COUNT=0 ./lab.sh up
LAB_PIMOX_WORKER_COUNT=2 ./lab.sh up
LAB_PIMOX_WORKER_BASE_VMID=9020 ./lab.sh up
LAB_PIMOX_WORKER_STORAGE=nvme_thin_pool ./lab.sh up
LAB_PIMOX_WORKER_REPLACE_EXISTING=true ./lab.sh up
LAB_PIMOX_WORKER_CPU_AFFINITIES="4-5 6-7" ./lab.sh up
LAB_PIMOX_HOST=192.168.100.80 LAB_PIMOX_BRIDGE=vmbr0 ./lab.sh up
```

View File

@ -173,7 +173,7 @@ if ! ip link show "${self.triggers.bridge}" >/dev/null 2>&1; then
fi
if sudo "$qm_cmd" status "$vmid" >/dev/null 2>&1; then
if sudo "$qm_cmd" config "$vmid" | grep -q '^template: 1$'; then
if sudo "$qm_cmd" config "$vmid" | grep -q '^template: 1$' && [ "$replace_existing" != "true" ]; then
sudo "$qm_cmd" set "$vmid" --agent enabled=1
exit 0
fi

102
lab.sh
View File

@ -263,7 +263,8 @@ ensure_pimox_worker_node() {
local timeout_seconds="${17}"
local qm_bin="${18}"
local worker_storage="${19}"
local worker_cpu_affinity="${20}"
local worker_replace_existing="${20}"
local worker_cpu_affinity="${21}"
local padded
local vmid
local worker_key
@ -284,9 +285,26 @@ ensure_pimox_worker_node() {
echo "VM ${vmid} exists as a template; refusing to reuse it as worker ${worker_name}." >&2
exit 1
fi
if truthy "${worker_replace_existing}"; then
echo "Replacing existing Pimox worker VM ${vmid} (${worker_name}) before cloning from template ${template_vmid}..."
pimox_ssh "${pimox_host}" "${pimox_user}" "${pimox_key}" "set -eu
sudo '${qm_bin}' stop '${vmid}' >/dev/null 2>&1 || true
elapsed=0
while [ \"\$elapsed\" -lt 300 ]; do
if sudo '${qm_bin}' status '${vmid}' | grep -q 'status: stopped'; then
break
fi
sleep 5
elapsed=\$((elapsed + 5))
done
sudo '${qm_bin}' destroy '${vmid}' --purge 1 >/dev/null 2>&1 || sudo '${qm_bin}' destroy '${vmid}'"
else
pimox_ssh "${pimox_host}" "${pimox_user}" "${pimox_key}" "sudo '${qm_bin}' set '${vmid}' --agent enabled=1 --sockets 1 --cores '${worker_cores}' --memory '${worker_memory}' --affinity '${worker_cpu_affinity}'
if sudo '${qm_bin}' status '${vmid}' | grep -q 'status: stopped'; then sudo '${qm_bin}' start '${vmid}'; fi"
else
fi
fi
if ! pimox_ssh "${pimox_host}" "${pimox_user}" "${pimox_key}" "sudo '${qm_bin}' status '${vmid}' >/dev/null 2>&1"; then
pimox_ssh "${pimox_host}" "${pimox_user}" "${pimox_key}" "set -eu
if ! ip link show '${bridge}' >/dev/null 2>&1; then
echo 'Pimox bridge ${bridge} does not exist. Refusing to change Orange Pi networking.' >&2
@ -325,7 +343,7 @@ write_cluster_worker_var_file() {
local spec_file="$1"
local var_file="$2"
LAB_INCLUDE_RASPBERRY_WORKER="${LAB_INCLUDE_RASPBERRY_WORKER:-true}" \
LAB_INCLUDE_RASPBERRY_WORKER="${LAB_INCLUDE_RASPBERRY_WORKER:-false}" \
LAB_RASPBERRY_HOST="${LAB_RASPBERRY_HOST:-192.168.100.89}" \
LAB_RASPBERRY_USER="${LAB_RASPBERRY_USER:-jv}" \
LAB_RASPBERRY_NODE_NAME="${LAB_RASPBERRY_NODE_NAME:-raspberry}" \
@ -385,17 +403,21 @@ run_pimox_pipeline() {
local bridge="${LAB_PIMOX_BRIDGE:-${TF_VAR_pimox_template_bridge:-vmbr0}}"
local template_vmid="${LAB_PIMOX_TEMPLATE_VMID:-${TF_VAR_pimox_template_vmid:-9000}}"
local template_name="${LAB_PIMOX_TEMPLATE_NAME:-${TF_VAR_pimox_template_name:-debian13-arm64-k8s-template}}"
local template_cores="${LAB_PIMOX_TEMPLATE_CORES:-${TF_VAR_pimox_template_cores:-2}}"
local template_memory="${LAB_PIMOX_TEMPLATE_MEMORY:-${TF_VAR_pimox_template_memory:-4096}}"
local template_cpu_affinity="${LAB_PIMOX_TEMPLATE_CPU_AFFINITY:-${TF_VAR_pimox_template_cpu_affinity:-4-5}}"
local template_replace_existing="${LAB_PIMOX_TEMPLATE_REPLACE_EXISTING:-${TF_VAR_pimox_template_replace_existing:-false}}"
local provisioning_interface
local worker_count="${LAB_PIMOX_WORKER_COUNT:-1}"
local worker_count="${LAB_PIMOX_WORKER_COUNT:-2}"
local worker_base_vmid="${LAB_PIMOX_WORKER_BASE_VMID:-9010}"
local worker_name_prefix="${LAB_PIMOX_WORKER_NAME_PREFIX:-pimox-worker}"
local worker_node_prefix="${LAB_PIMOX_WORKER_NODE_PREFIX:-pimox-worker}"
local worker_key_prefix="${LAB_PIMOX_WORKER_KEY_PREFIX:-pimox}"
local worker_skip_indexes="${LAB_PIMOX_SKIP_WORKER_INDEXES:-1}"
local worker_skip_indexes="${LAB_PIMOX_SKIP_WORKER_INDEXES:-}"
local worker_cores="${LAB_PIMOX_WORKER_CORES:-2}"
local worker_memory="${LAB_PIMOX_WORKER_MEMORY:-4096}"
local worker_cpu_affinities="${LAB_PIMOX_WORKER_CPU_AFFINITIES:-4-5 6-7}"
local worker_replace_existing="${LAB_PIMOX_WORKER_REPLACE_EXISTING:-false}"
local worker_storage="${LAB_PIMOX_WORKER_STORAGE:-${TF_VAR_pimox_worker_storage:-nvme_thin_pool}}"
local worker_user="${LAB_PIMOX_WORKER_USER:-jv}"
local worker_key_path="${LAB_PIMOX_WORKER_SSH_KEY_PATH:-/home/jv/.ssh/id_ed25519}"
@ -406,6 +428,7 @@ run_pimox_pipeline() {
local index
local readiness_output
local readiness_status
local template_cpu_count
local worker_cpu_affinity
if disabled_value "${mode}"; then
@ -420,6 +443,30 @@ run_pimox_pipeline() {
echo "LAB_PIMOX_WORKER_COUNT must be a non-negative integer." >&2
exit 1
fi
if ! [[ "${template_cores}" =~ ^[0-9]+$ && "${worker_cores}" =~ ^[0-9]+$ ]]; then
echo "LAB_PIMOX_TEMPLATE_CORES and LAB_PIMOX_WORKER_CORES must be positive integers." >&2
exit 1
fi
if ! [[ "${template_memory}" =~ ^[0-9]+$ && "${worker_memory}" =~ ^[0-9]+$ ]]; then
echo "LAB_PIMOX_TEMPLATE_MEMORY and LAB_PIMOX_WORKER_MEMORY must be positive integer MiB values." >&2
exit 1
fi
if ((template_cores == 0 || worker_cores == 0 || template_memory == 0 || worker_memory == 0)); then
echo "Pimox template and worker CPU and memory values must be greater than zero." >&2
exit 1
fi
if ! template_cpu_count="$(cpuset_cpu_count "${template_cpu_affinity}")"; then
echo "Invalid Pimox template CPU affinity '${template_cpu_affinity}'. Use CPU IDs or ranges, such as 4-5." >&2
exit 1
fi
if ((template_cpu_count != template_cores)); then
echo "Pimox template uses ${template_cores} cores but affinity '${template_cpu_affinity}' contains ${template_cpu_count} CPUs." >&2
exit 1
fi
if ! truthy "${worker_replace_existing}" && ! disabled_value "${worker_replace_existing}"; then
echo "LAB_PIMOX_WORKER_REPLACE_EXISTING must be true or false." >&2
exit 1
fi
if ! [[ "${worker_storage}" =~ ^[A-Za-z0-9_.:-]+$ ]]; then
echo "LAB_PIMOX_WORKER_STORAGE must be a valid Pimox storage identifier." >&2
exit 1
@ -469,6 +516,9 @@ fi" 2>&1)"
export TF_VAR_pimox_template_bridge="${bridge}"
export TF_VAR_pimox_template_vmid="${template_vmid}"
export TF_VAR_pimox_template_name="${template_name}"
export TF_VAR_pimox_template_cores="${template_cores}"
export TF_VAR_pimox_template_memory="${template_memory}"
export TF_VAR_pimox_template_cpu_affinity="${template_cpu_affinity}"
export TF_VAR_pimox_template_replace_existing="${template_replace_existing}"
export TF_VAR_pimox_template_builder_enabled="${TF_VAR_pimox_template_builder_enabled:-true}"
export TF_VAR_pimox_template_build_ssh_key_path="${TF_VAR_pimox_template_build_ssh_key_path:-${worker_key_path}}"
@ -519,6 +569,7 @@ fi" 2>&1)"
"${timeout_seconds}" \
"${qm_bin}" \
"${worker_storage}" \
"${worker_replace_existing}" \
"${worker_cpu_affinity}"
done
@ -2239,6 +2290,36 @@ up() {
echo "Deployment successfully completed."
}
#######################################
# Tear down and rebuild the cluster in one pass.
#
# Seeds the LAB_* / WORKER_SSH_TARGETS environment with rebuild defaults
# (only where the caller left a variable unset or empty), exports them so
# the stages below and their child processes see them, then runs the
# stages in a fixed order: nuke, Pimox pipeline, OpenWrt pipeline, the
# cluster and platform OpenTofu stacks, apps, and the edge OpenTofu stack.
#
# Globals (written and exported):
#   LAB_INCLUDE_RASPBERRY_WORKER, LAB_PIMOX_PIPELINE,
#   LAB_PIMOX_TEMPLATE_REPLACE_EXISTING, LAB_PIMOX_TEMPLATE_CORES,
#   LAB_PIMOX_TEMPLATE_MEMORY, LAB_PIMOX_TEMPLATE_CPU_AFFINITY,
#   LAB_PIMOX_WORKER_COUNT, LAB_PIMOX_SKIP_WORKER_INDEXES,
#   LAB_PIMOX_WORKER_REPLACE_EXISTING, LAB_PIMOX_WORKER_CORES,
#   LAB_PIMOX_WORKER_MEMORY, LAB_PIMOX_WORKER_CPU_AFFINITIES,
#   WORKER_SSH_TARGETS
# Arguments: none
# Outputs:   progress messages on stdout
#######################################
rebuild_cluster() {
  require_debian_server "rebuild-cluster"

  # ':=' assigns the default when the variable is unset or empty, so any
  # non-empty value supplied by the caller is left untouched.
  : "${LAB_INCLUDE_RASPBERRY_WORKER:=false}"
  : "${LAB_PIMOX_PIPELINE:=true}"
  : "${LAB_PIMOX_TEMPLATE_REPLACE_EXISTING:=true}"
  : "${LAB_PIMOX_TEMPLATE_CORES:=2}"
  : "${LAB_PIMOX_TEMPLATE_MEMORY:=4096}"
  : "${LAB_PIMOX_TEMPLATE_CPU_AFFINITY:=4-5}"
  : "${LAB_PIMOX_WORKER_COUNT:=2}"
  # Empty default: the variable becomes set (to "") even if it was unset.
  : "${LAB_PIMOX_SKIP_WORKER_INDEXES:=}"
  : "${LAB_PIMOX_WORKER_REPLACE_EXISTING:=true}"
  : "${LAB_PIMOX_WORKER_CORES:=2}"
  : "${LAB_PIMOX_WORKER_MEMORY:=4096}"
  : "${LAB_PIMOX_WORKER_CPU_AFFINITIES:=4-5 6-7}"
  # Empty default: rebuild_cluster itself names no worker SSH targets.
  : "${WORKER_SSH_TARGETS:=}"

  export \
    LAB_INCLUDE_RASPBERRY_WORKER \
    LAB_PIMOX_PIPELINE \
    LAB_PIMOX_TEMPLATE_REPLACE_EXISTING \
    LAB_PIMOX_TEMPLATE_CORES \
    LAB_PIMOX_TEMPLATE_MEMORY \
    LAB_PIMOX_TEMPLATE_CPU_AFFINITY \
    LAB_PIMOX_WORKER_COUNT \
    LAB_PIMOX_SKIP_WORKER_INDEXES \
    LAB_PIMOX_WORKER_REPLACE_EXISTING \
    LAB_PIMOX_WORKER_CORES \
    LAB_PIMOX_WORKER_MEMORY \
    LAB_PIMOX_WORKER_CPU_AFFINITIES \
    WORKER_SSH_TARGETS

  echo "Rebuilding the Kubernetes cluster without touching external Raspberry Pi Gitea..."

  # Stage order is significant: teardown first, then VM provisioning,
  # then the OpenTofu stacks with the app deployment in between.
  nuke
  run_pimox_pipeline
  run_openwrt_pipeline
  run_tofu_stack "bootstrap/cluster"
  run_tofu_stack "bootstrap/platform"
  apps
  run_tofu_stack "bootstrap/edge"

  echo "Cluster rebuild successfully completed."
}
nuke() {
local worker_ssh_targets
local worker_targets
@ -2247,7 +2328,7 @@ nuke() {
require_debian_server "nuke"
echo "Brutally nuking the homelab infrastructure..."
worker_ssh_targets="${WORKER_SSH_TARGETS-jv@192.168.100.89}"
worker_ssh_targets="${WORKER_SSH_TARGETS-}"
read -r -a worker_targets <<< "${worker_ssh_targets}"
echo "--> Terminating local OpenTofu tasks..."
@ -2364,6 +2445,9 @@ EOF
rm -f "${BUILDX_CONFIG}" || true
echo "--> Deleting OpenTofu tracking state files..."
rm -rf "${REPO_ROOT}"/bootstrap/provisioning/terraform.tfstate*
rm -f "${REPO_ROOT}"/bootstrap/provisioning/.terraform.tfstate.lock.info
rm -rf "${REPO_ROOT}"/bootstrap/provisioning/.terraform/
rm -rf "${REPO_ROOT}"/bootstrap/cluster/terraform.tfstate*
rm -f "${REPO_ROOT}"/bootstrap/cluster/.terraform.tfstate.lock.info
rm -rf "${REPO_ROOT}"/bootstrap/cluster/.terraform/
@ -2384,6 +2468,9 @@ case "${1:-}" in
up)
up
;;
rebuild-cluster)
rebuild_cluster
;;
apps)
apps
;;
@ -2406,8 +2493,7 @@ case "${1:-}" in
nuke
;;
*)
echo "Usage: $0 {up|apps|deploy-gitea|bootstrap-gitea-repo|backup-gitea|drill-gitea-restore|install-gitea-runner|nuke}"
echo "Usage: $0 {up|rebuild-cluster|apps|deploy-gitea|bootstrap-gitea-repo|backup-gitea|drill-gitea-restore|install-gitea-runner|nuke}"
exit 1
;;
esac