diff --git a/README.md b/README.md index ec2130d..f063436 100644 --- a/README.md +++ b/README.md @@ -164,16 +164,31 @@ hostname. For Pimox on Orange Pi 5 Plus, `./lab.sh up` can create the Debian 13 arm64 template and worker VM clones automatically. Defaults are intentionally tied to the observed host: Pimox SSH host `192.168.100.80`, bridge `vmbr0`, template VMID -`9000` on `local` storage, worker VMIDs starting at `9010`, and worker clone -storage `nvme_thin_pool`. Details and override variables are in +`9000` on `local` storage, two 4 GiB worker VMs starting at VMID `9010`, CPU +affinities `4-5` and `6-7`, and worker clone storage `nvme_thin_pool`. Details +and override variables are in `bootstrap/provisioning/README.md`. Worker indexes are stable. Index `1` maps to VMID `9010`, node name `pimox-worker-01`, and worker key `pimox01`; index `2` maps to VMID `9011`, and -so on. `LAB_PIMOX_SKIP_WORKER_INDEXES=1` leaves the already-created first slot -unmanaged while allowing higher indexes to be automated. +so on. `LAB_PIMOX_SKIP_WORKER_INDEXES=1` leaves the first slot unmanaged while +allowing higher indexes to be automated. -Add entries to `bootstrap/cluster/variables.tf` or a `.tfvars` file: +Run a full cluster rebuild from the Debian server with: + +```bash +./lab.sh rebuild-cluster +``` + +That path preserves external Raspberry Pi Gitea, rebuilds the Pimox template +with 2 cores, 4 GiB memory, and CPU affinity `4-5`, replaces two Pimox worker +VMs with 2 cores, 4 GiB memory, and affinities `4-5` and `6-7`, and joins those +workers to the Kubernetes cluster. The Raspberry Pi worker is excluded by +default while it hosts external Gitea. + +To opt the Raspberry Pi back into the Kubernetes cluster, set +`LAB_INCLUDE_RASPBERRY_WORKER=true` or add entries to +`bootstrap/cluster/variables.tf` or a `.tfvars` file: ```hcl worker_nodes = { @@ -459,11 +474,12 @@ for platform or provisioning updates. ## Destructive Rebuilds `./lab.sh nuke` resets kubeadm, containerd runtime state, CNI files, Calico -links, iptables rules, local OpenTofu state, and configured worker nodes. It does -not delete retained data under `/var/openebs/local`. +links, iptables rules, and local OpenTofu state. It does not delete retained data +under `/var/openebs/local`. For multi-node labs, set `WORKER_SSH_TARGETS` to a space-separated list of SSH -targets. For a single-node rebuild, set it to an empty string. +targets. It defaults to an empty string so the Raspberry Pi Gitea host is not +cleaned unless you explicitly include it. ## Website App diff --git a/bootstrap/cluster/variables.tf b/bootstrap/cluster/variables.tf index 6a5eccb..c4bb0bf 100644 --- a/bootstrap/cluster/variables.tf +++ b/bootstrap/cluster/variables.tf @@ -59,25 +59,13 @@ variable "worker_nodes" { ssh_key_path = string })) - default = { - raspberrypi = { - host = "192.168.100.89" - user = "jv" - node_name = "raspberry" - ssh_key_path = "/home/jv/.ssh/id_ed25519" - } - } + default = {} } variable "worker_node_labels" { type = map(map(string)) - default = { - raspberrypi = { - "homelab.dev/node-role" = "edge-app" - "homelab.dev/storage" = "local" - } - } + default = {} } variable "tailscale_nodeport_access" { diff --git a/bootstrap/provisioning/README.md b/bootstrap/provisioning/README.md index c6d552c..358a4c2 100644 --- a/bootstrap/provisioning/README.md +++ b/bootstrap/provisioning/README.md @@ -108,21 +108,26 @@ networking, virtio-scsi disk, `vmbr0`, `local` template storage, 1 socket with `TF_VAR_pimox_template_cpu_affinity` if the Orange Pi template layout changes. `./lab.sh up` also creates or reuses worker clones after the template exists. It -defaults to one worker, VMID `9010`, names like `pimox-worker-01`, deterministic -locally administered MAC addresses, 1 socket with 2 cores, 4 GiB RAM, -Orange Pi 5 high-speed CPU affinity pairs `4-5` and `6-7`, -`nvme_thin_pool` clone storage, and qemu-guest-agent IP discovery. New workers -are full clones created with +defaults to two workers, VMIDs `9010` and `9011`, names like +`pimox-worker-01`, deterministic locally administered MAC addresses, 1 socket +with 2 cores, 4 GiB RAM, Orange Pi 5 high-speed CPU affinity pairs `4-5` and +`6-7`, `nvme_thin_pool` clone storage, and qemu-guest-agent IP discovery. New +workers are full clones created with `qm clone --storage`, so the template can remain on `local` while worker disks -land on the NVMe thin pool. The pipeline refuses `LAB_PIMOX_WORKER_STORAGE=local` -so only the template VM lives on local storage. Useful overrides: +land on the NVMe thin pool. Set `LAB_PIMOX_WORKER_REPLACE_EXISTING=true` to +destroy and recreate existing worker VMs from the current template. The pipeline +refuses `LAB_PIMOX_WORKER_STORAGE=local` so only the template VM lives on local +storage. Useful overrides: ```bash +./lab.sh rebuild-cluster LAB_PIMOX_PIPELINE=false ./lab.sh up +LAB_PIMOX_TEMPLATE_REPLACE_EXISTING=true ./lab.sh up LAB_PIMOX_WORKER_COUNT=0 ./lab.sh up LAB_PIMOX_WORKER_COUNT=2 ./lab.sh up LAB_PIMOX_WORKER_BASE_VMID=9020 ./lab.sh up LAB_PIMOX_WORKER_STORAGE=nvme_thin_pool ./lab.sh up +LAB_PIMOX_WORKER_REPLACE_EXISTING=true ./lab.sh up LAB_PIMOX_WORKER_CPU_AFFINITIES="4-5 6-7" ./lab.sh up LAB_PIMOX_HOST=192.168.100.80 LAB_PIMOX_BRIDGE=vmbr0 ./lab.sh up ``` diff --git a/bootstrap/provisioning/main.tf b/bootstrap/provisioning/main.tf index 97a2229..afb58e5 100644 --- a/bootstrap/provisioning/main.tf +++ b/bootstrap/provisioning/main.tf @@ -173,7 +173,7 @@ if ! ip link show "${self.triggers.bridge}" >/dev/null 2>&1; then fi if sudo "$qm_cmd" status "$vmid" >/dev/null 2>&1; then - if sudo "$qm_cmd" config "$vmid" | grep -q '^template: 1$'; then + if sudo "$qm_cmd" config "$vmid" | grep -q '^template: 1$' && [ "$replace_existing" != "true" ]; then sudo "$qm_cmd" set "$vmid" --agent enabled=1 exit 0 fi diff --git a/lab.sh b/lab.sh index 4a6bedd..1e1e6e4 100755 --- a/lab.sh +++ b/lab.sh @@ -263,7 +263,8 @@ ensure_pimox_worker_node() { local timeout_seconds="${17}" local qm_bin="${18}" local worker_storage="${19}" - local worker_cpu_affinity="${20}" + local worker_replace_existing="${20}" + local worker_cpu_affinity="${21}" local padded local vmid local worker_key @@ -284,9 +285,26 @@ ensure_pimox_worker_node() { echo "VM ${vmid} exists as a template; refusing to reuse it as worker ${worker_name}." >&2 exit 1 fi - pimox_ssh "${pimox_host}" "${pimox_user}" "${pimox_key}" "sudo '${qm_bin}' set '${vmid}' --agent enabled=1 --sockets 1 --cores '${worker_cores}' --memory '${worker_memory}' --affinity '${worker_cpu_affinity}' + if truthy "${worker_replace_existing}"; then + echo "Replacing existing Pimox worker VM ${vmid} (${worker_name}) before cloning from template ${template_vmid}..." + pimox_ssh "${pimox_host}" "${pimox_user}" "${pimox_key}" "set -eu +sudo '${qm_bin}' stop '${vmid}' >/dev/null 2>&1 || true +elapsed=0 +while [ \"\$elapsed\" -lt 300 ]; do + if sudo '${qm_bin}' status '${vmid}' | grep -q 'status: stopped'; then + break + fi + sleep 5 + elapsed=\$((elapsed + 5)) +done +sudo '${qm_bin}' destroy '${vmid}' --purge 1 >/dev/null 2>&1 || sudo '${qm_bin}' destroy '${vmid}'" + else + pimox_ssh "${pimox_host}" "${pimox_user}" "${pimox_key}" "sudo '${qm_bin}' set '${vmid}' --agent enabled=1 --sockets 1 --cores '${worker_cores}' --memory '${worker_memory}' --affinity '${worker_cpu_affinity}' if sudo '${qm_bin}' status '${vmid}' | grep -q 'status: stopped'; then sudo '${qm_bin}' start '${vmid}'; fi" - else + fi + fi + + if ! pimox_ssh "${pimox_host}" "${pimox_user}" "${pimox_key}" "sudo '${qm_bin}' status '${vmid}' >/dev/null 2>&1"; then pimox_ssh "${pimox_host}" "${pimox_user}" "${pimox_key}" "set -eu if ! ip link show '${bridge}' >/dev/null 2>&1; then echo 'Pimox bridge ${bridge} does not exist. Refusing to change Orange Pi networking.' >&2 @@ -325,7 +343,7 @@ write_cluster_worker_var_file() { local spec_file="$1" local var_file="$2" - LAB_INCLUDE_RASPBERRY_WORKER="${LAB_INCLUDE_RASPBERRY_WORKER:-true}" \ + LAB_INCLUDE_RASPBERRY_WORKER="${LAB_INCLUDE_RASPBERRY_WORKER:-false}" \ LAB_RASPBERRY_HOST="${LAB_RASPBERRY_HOST:-192.168.100.89}" \ LAB_RASPBERRY_USER="${LAB_RASPBERRY_USER:-jv}" \ LAB_RASPBERRY_NODE_NAME="${LAB_RASPBERRY_NODE_NAME:-raspberry}" \ @@ -385,17 +403,21 @@ run_pimox_pipeline() { local bridge="${LAB_PIMOX_BRIDGE:-${TF_VAR_pimox_template_bridge:-vmbr0}}" local template_vmid="${LAB_PIMOX_TEMPLATE_VMID:-${TF_VAR_pimox_template_vmid:-9000}}" local template_name="${LAB_PIMOX_TEMPLATE_NAME:-${TF_VAR_pimox_template_name:-debian13-arm64-k8s-template}}" + local template_cores="${LAB_PIMOX_TEMPLATE_CORES:-${TF_VAR_pimox_template_cores:-2}}" + local template_memory="${LAB_PIMOX_TEMPLATE_MEMORY:-${TF_VAR_pimox_template_memory:-4096}}" + local template_cpu_affinity="${LAB_PIMOX_TEMPLATE_CPU_AFFINITY:-${TF_VAR_pimox_template_cpu_affinity:-4-5}}" local template_replace_existing="${LAB_PIMOX_TEMPLATE_REPLACE_EXISTING:-${TF_VAR_pimox_template_replace_existing:-false}}" local provisioning_interface - local worker_count="${LAB_PIMOX_WORKER_COUNT:-1}" + local worker_count="${LAB_PIMOX_WORKER_COUNT:-2}" local worker_base_vmid="${LAB_PIMOX_WORKER_BASE_VMID:-9010}" local worker_name_prefix="${LAB_PIMOX_WORKER_NAME_PREFIX:-pimox-worker}" local worker_node_prefix="${LAB_PIMOX_WORKER_NODE_PREFIX:-pimox-worker}" local worker_key_prefix="${LAB_PIMOX_WORKER_KEY_PREFIX:-pimox}" - local worker_skip_indexes="${LAB_PIMOX_SKIP_WORKER_INDEXES:-1}" + local worker_skip_indexes="${LAB_PIMOX_SKIP_WORKER_INDEXES:-}" local worker_cores="${LAB_PIMOX_WORKER_CORES:-2}" local worker_memory="${LAB_PIMOX_WORKER_MEMORY:-4096}" local worker_cpu_affinities="${LAB_PIMOX_WORKER_CPU_AFFINITIES:-4-5 6-7}" + local worker_replace_existing="${LAB_PIMOX_WORKER_REPLACE_EXISTING:-false}" local worker_storage="${LAB_PIMOX_WORKER_STORAGE:-${TF_VAR_pimox_worker_storage:-nvme_thin_pool}}" local worker_user="${LAB_PIMOX_WORKER_USER:-jv}" local worker_key_path="${LAB_PIMOX_WORKER_SSH_KEY_PATH:-/home/jv/.ssh/id_ed25519}" @@ -406,6 +428,7 @@ run_pimox_pipeline() { local index local readiness_output local readiness_status + local template_cpu_count local worker_cpu_affinity if disabled_value "${mode}"; then @@ -420,6 +443,30 @@ run_pimox_pipeline() { echo "LAB_PIMOX_WORKER_COUNT must be a non-negative integer." >&2 exit 1 fi + if ! [[ "${template_cores}" =~ ^[0-9]+$ && "${worker_cores}" =~ ^[0-9]+$ ]]; then + echo "LAB_PIMOX_TEMPLATE_CORES and LAB_PIMOX_WORKER_CORES must be positive integers." >&2 + exit 1 + fi + if ! [[ "${template_memory}" =~ ^[0-9]+$ && "${worker_memory}" =~ ^[0-9]+$ ]]; then + echo "LAB_PIMOX_TEMPLATE_MEMORY and LAB_PIMOX_WORKER_MEMORY must be positive integer MiB values." >&2 + exit 1 + fi + if ((template_cores == 0 || worker_cores == 0 || template_memory == 0 || worker_memory == 0)); then + echo "Pimox template and worker CPU and memory values must be greater than zero." >&2 + exit 1 + fi + if ! template_cpu_count="$(cpuset_cpu_count "${template_cpu_affinity}")"; then + echo "Invalid Pimox template CPU affinity '${template_cpu_affinity}'. Use CPU IDs or ranges, such as 4-5." >&2 + exit 1 + fi + if ((template_cpu_count != template_cores)); then + echo "Pimox template uses ${template_cores} cores but affinity '${template_cpu_affinity}' contains ${template_cpu_count} CPUs." >&2 + exit 1 + fi + if ! truthy "${worker_replace_existing}" && ! disabled_value "${worker_replace_existing}"; then + echo "LAB_PIMOX_WORKER_REPLACE_EXISTING must be true or false." >&2 + exit 1 + fi if ! [[ "${worker_storage}" =~ ^[A-Za-z0-9_.:-]+$ ]]; then echo "LAB_PIMOX_WORKER_STORAGE must be a valid Pimox storage identifier." >&2 exit 1 @@ -469,6 +516,9 @@ fi" 2>&1)" export TF_VAR_pimox_template_bridge="${bridge}" export TF_VAR_pimox_template_vmid="${template_vmid}" export TF_VAR_pimox_template_name="${template_name}" + export TF_VAR_pimox_template_cores="${template_cores}" + export TF_VAR_pimox_template_memory="${template_memory}" + export TF_VAR_pimox_template_cpu_affinity="${template_cpu_affinity}" export TF_VAR_pimox_template_replace_existing="${template_replace_existing}" export TF_VAR_pimox_template_builder_enabled="${TF_VAR_pimox_template_builder_enabled:-true}" export TF_VAR_pimox_template_build_ssh_key_path="${TF_VAR_pimox_template_build_ssh_key_path:-${worker_key_path}}" @@ -519,6 +569,7 @@ fi" 2>&1)" "${timeout_seconds}" \ "${qm_bin}" \ "${worker_storage}" \ + "${worker_replace_existing}" \ "${worker_cpu_affinity}" done @@ -2239,6 +2290,36 @@ up() { echo "Deployment successfully completed." } +rebuild_cluster() { + require_debian_server "rebuild-cluster" + + export LAB_INCLUDE_RASPBERRY_WORKER="${LAB_INCLUDE_RASPBERRY_WORKER:-false}" + export LAB_PIMOX_PIPELINE="${LAB_PIMOX_PIPELINE:-true}" + export LAB_PIMOX_TEMPLATE_REPLACE_EXISTING="${LAB_PIMOX_TEMPLATE_REPLACE_EXISTING:-true}" + export LAB_PIMOX_TEMPLATE_CORES="${LAB_PIMOX_TEMPLATE_CORES:-2}" + export LAB_PIMOX_TEMPLATE_MEMORY="${LAB_PIMOX_TEMPLATE_MEMORY:-4096}" + export LAB_PIMOX_TEMPLATE_CPU_AFFINITY="${LAB_PIMOX_TEMPLATE_CPU_AFFINITY:-4-5}" + export LAB_PIMOX_WORKER_COUNT="${LAB_PIMOX_WORKER_COUNT:-2}" + export LAB_PIMOX_SKIP_WORKER_INDEXES="${LAB_PIMOX_SKIP_WORKER_INDEXES:-}" + export LAB_PIMOX_WORKER_REPLACE_EXISTING="${LAB_PIMOX_WORKER_REPLACE_EXISTING:-true}" + export LAB_PIMOX_WORKER_CORES="${LAB_PIMOX_WORKER_CORES:-2}" + export LAB_PIMOX_WORKER_MEMORY="${LAB_PIMOX_WORKER_MEMORY:-4096}" + export LAB_PIMOX_WORKER_CPU_AFFINITIES="${LAB_PIMOX_WORKER_CPU_AFFINITIES:-4-5 6-7}" + export WORKER_SSH_TARGETS="${WORKER_SSH_TARGETS:-}" + + echo "Rebuilding the Kubernetes cluster without touching external Raspberry Pi Gitea..." + + nuke + run_pimox_pipeline + run_openwrt_pipeline + run_tofu_stack "bootstrap/cluster" + run_tofu_stack "bootstrap/platform" + apps + run_tofu_stack "bootstrap/edge" + + echo "Cluster rebuild successfully completed." +} + nuke() { local worker_ssh_targets local worker_targets @@ -2247,7 +2328,7 @@ nuke() { require_debian_server "nuke" echo "Brutally nuking the homelab infrastructure..." - worker_ssh_targets="${WORKER_SSH_TARGETS-jv@192.168.100.89}" + worker_ssh_targets="${WORKER_SSH_TARGETS-}" read -r -a worker_targets <<< "${worker_ssh_targets}" echo "--> Terminating local OpenTofu tasks..." @@ -2364,6 +2445,9 @@ EOF rm -f "${BUILDX_CONFIG}" || true echo "--> Deleting OpenTofu tracking state files..." + rm -rf "${REPO_ROOT}"/bootstrap/provisioning/terraform.tfstate* + rm -f "${REPO_ROOT}"/bootstrap/provisioning/.terraform.tfstate.lock.info + rm -rf "${REPO_ROOT}"/bootstrap/provisioning/.terraform/ rm -rf "${REPO_ROOT}"/bootstrap/cluster/terraform.tfstate* rm -f "${REPO_ROOT}"/bootstrap/cluster/.terraform.tfstate.lock.info rm -rf "${REPO_ROOT}"/bootstrap/cluster/.terraform/ @@ -2384,6 +2468,9 @@ case "${1:-}" in up) up ;; + rebuild-cluster) + rebuild_cluster + ;; apps) apps ;; @@ -2406,8 +2493,7 @@ case "${1:-}" in nuke ;; *) - echo "Usage: $0 {up|apps|deploy-gitea|bootstrap-gitea-repo|backup-gitea|drill-gitea-restore|install-gitea-runner|nuke}" + echo "Usage: $0 {up|rebuild-cluster|apps|deploy-gitea|bootstrap-gitea-repo|backup-gitea|drill-gitea-restore|install-gitea-runner|nuke}" exit 1 ;; esac -