Add Pimox cluster rebuild path
Homelab Main / deploy (push) Failing after 2s
Details
Homelab Main / deploy (push) Failing after 2s
Details
This commit is contained in:
parent
041e607fed
commit
f8d2286558
32
README.md
32
README.md
|
|
@ -164,16 +164,31 @@ hostname.
|
|||
For Pimox on Orange Pi 5 Plus, `./lab.sh up` can create the Debian 13 arm64
|
||||
template and worker VM clones automatically. Defaults are intentionally tied to
|
||||
the observed host: Pimox SSH host `192.168.100.80`, bridge `vmbr0`, template VMID
|
||||
`9000` on `local` storage, worker VMIDs starting at `9010`, and worker clone
|
||||
storage `nvme_thin_pool`. Details and override variables are in
|
||||
`9000` on `local` storage, two 4 GiB worker VMs starting at VMID `9010`, CPU
|
||||
affinities `4-5` and `6-7`, and worker clone storage `nvme_thin_pool`. Details
|
||||
and override variables are in
|
||||
`bootstrap/provisioning/README.md`.
|
||||
|
||||
Worker indexes are stable. Index `1` maps to VMID `9010`, node name
|
||||
`pimox-worker-01`, and worker key `pimox01`; index `2` maps to VMID `9011`, and
|
||||
so on. `LAB_PIMOX_SKIP_WORKER_INDEXES=1` leaves the already-created first slot
|
||||
unmanaged while allowing higher indexes to be automated.
|
||||
so on. `LAB_PIMOX_SKIP_WORKER_INDEXES=1` leaves the first slot unmanaged while
|
||||
allowing higher indexes to be automated.
|
||||
|
||||
Add entries to `bootstrap/cluster/variables.tf` or a `.tfvars` file:
|
||||
Run a full cluster rebuild from the Debian server with:
|
||||
|
||||
```bash
|
||||
./lab.sh rebuild-cluster
|
||||
```
|
||||
|
||||
That path preserves the external Gitea instance on the Raspberry Pi, rebuilds the Pimox template
|
||||
with 2 cores, 4 GiB memory, and CPU affinity `4-5`, replaces two Pimox worker
|
||||
VMs with 2 cores, 4 GiB memory, and affinities `4-5` and `6-7`, and joins those
|
||||
workers to the Kubernetes cluster. The Raspberry Pi worker is excluded by
|
||||
default while it hosts external Gitea.
|
||||
|
||||
To opt the Raspberry Pi back into the Kubernetes cluster, set
|
||||
`LAB_INCLUDE_RASPBERRY_WORKER=true`, or add entries to either
|
||||
`bootstrap/cluster/variables.tf` or a `.tfvars` file:
|
||||
|
||||
```hcl
|
||||
worker_nodes = {
|
||||
|
|
@ -459,11 +474,12 @@ for platform or provisioning updates.
|
|||
## Destructive Rebuilds
|
||||
|
||||
`./lab.sh nuke` resets kubeadm, containerd runtime state, CNI files, Calico
|
||||
links, iptables rules, local OpenTofu state, and configured worker nodes. It does
|
||||
not delete retained data under `/var/openebs/local`.
|
||||
links, iptables rules, and local OpenTofu state. It does not delete retained data
|
||||
under `/var/openebs/local`.
|
||||
|
||||
For multi-node labs, set `WORKER_SSH_TARGETS` to a space-separated list of SSH
|
||||
targets. For a single-node rebuild, set it to an empty string.
|
||||
targets. It defaults to an empty string so the Raspberry Pi Gitea host is not
|
||||
cleaned unless you explicitly include it.
|
||||
|
||||
## Website App
|
||||
|
||||
|
|
|
|||
|
|
@ -59,25 +59,13 @@ variable "worker_nodes" {
|
|||
ssh_key_path = string
|
||||
}))
|
||||
|
||||
default = {
|
||||
raspberrypi = {
|
||||
host = "192.168.100.89"
|
||||
user = "jv"
|
||||
node_name = "raspberry"
|
||||
ssh_key_path = "/home/jv/.ssh/id_ed25519"
|
||||
}
|
||||
}
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "worker_node_labels" {
|
||||
type = map(map(string))
|
||||
|
||||
default = {
|
||||
raspberrypi = {
|
||||
"homelab.dev/node-role" = "edge-app"
|
||||
"homelab.dev/storage" = "local"
|
||||
}
|
||||
}
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "tailscale_nodeport_access" {
|
||||
|
|
|
|||
|
|
@ -108,21 +108,26 @@ networking, virtio-scsi disk, `vmbr0`, `local` template storage, 1 socket with
|
|||
`TF_VAR_pimox_template_cpu_affinity` if the Orange Pi template layout changes.
|
||||
|
||||
`./lab.sh up` also creates or reuses worker clones after the template exists. It
|
||||
defaults to one worker, VMID `9010`, names like `pimox-worker-01`, deterministic
|
||||
locally administered MAC addresses, 1 socket with 2 cores, 4 GiB RAM,
|
||||
Orange Pi 5 high-speed CPU affinity pairs `4-5` and `6-7`,
|
||||
`nvme_thin_pool` clone storage, and qemu-guest-agent IP discovery. New workers
|
||||
are full clones created with
|
||||
defaults to two workers, VMIDs `9010` and `9011`, names like
|
||||
`pimox-worker-01`, deterministic locally administered MAC addresses, 1 socket
|
||||
with 2 cores, 4 GiB RAM, Orange Pi 5 high-speed CPU affinity pairs `4-5` and
|
||||
`6-7`, `nvme_thin_pool` clone storage, and qemu-guest-agent IP discovery. New
|
||||
workers are full clones created with
|
||||
`qm clone --storage`, so the template can remain on `local` while worker disks
|
||||
land on the NVMe thin pool. The pipeline refuses `LAB_PIMOX_WORKER_STORAGE=local`
|
||||
so only the template VM lives on local storage. Useful overrides:
|
||||
land on the NVMe thin pool. Set `LAB_PIMOX_WORKER_REPLACE_EXISTING=true` to
|
||||
destroy and recreate existing worker VMs from the current template. The pipeline
|
||||
refuses `LAB_PIMOX_WORKER_STORAGE=local` so only the template VM lives on local
|
||||
storage. Useful overrides:
|
||||
|
||||
```bash
|
||||
./lab.sh rebuild-cluster
|
||||
LAB_PIMOX_PIPELINE=false ./lab.sh up
|
||||
LAB_PIMOX_TEMPLATE_REPLACE_EXISTING=true ./lab.sh up
|
||||
LAB_PIMOX_WORKER_COUNT=0 ./lab.sh up
|
||||
LAB_PIMOX_WORKER_COUNT=2 ./lab.sh up
|
||||
LAB_PIMOX_WORKER_BASE_VMID=9020 ./lab.sh up
|
||||
LAB_PIMOX_WORKER_STORAGE=nvme_thin_pool ./lab.sh up
|
||||
LAB_PIMOX_WORKER_REPLACE_EXISTING=true ./lab.sh up
|
||||
LAB_PIMOX_WORKER_CPU_AFFINITIES="4-5 6-7" ./lab.sh up
|
||||
LAB_PIMOX_HOST=192.168.100.80 LAB_PIMOX_BRIDGE=vmbr0 ./lab.sh up
|
||||
```
|
||||
|
|
|
|||
|
|
@ -173,7 +173,7 @@ if ! ip link show "${self.triggers.bridge}" >/dev/null 2>&1; then
|
|||
fi
|
||||
|
||||
if sudo "$qm_cmd" status "$vmid" >/dev/null 2>&1; then
|
||||
if sudo "$qm_cmd" config "$vmid" | grep -q '^template: 1$'; then
|
||||
if sudo "$qm_cmd" config "$vmid" | grep -q '^template: 1$' && [ "$replace_existing" != "true" ]; then
|
||||
sudo "$qm_cmd" set "$vmid" --agent enabled=1
|
||||
exit 0
|
||||
fi
|
||||
|
|
|
|||
104
lab.sh
104
lab.sh
|
|
@ -263,7 +263,8 @@ ensure_pimox_worker_node() {
|
|||
local timeout_seconds="${17}"
|
||||
local qm_bin="${18}"
|
||||
local worker_storage="${19}"
|
||||
local worker_cpu_affinity="${20}"
|
||||
local worker_replace_existing="${20}"
|
||||
local worker_cpu_affinity="${21}"
|
||||
local padded
|
||||
local vmid
|
||||
local worker_key
|
||||
|
|
@ -284,9 +285,26 @@ ensure_pimox_worker_node() {
|
|||
echo "VM ${vmid} exists as a template; refusing to reuse it as worker ${worker_name}." >&2
|
||||
exit 1
|
||||
fi
|
||||
pimox_ssh "${pimox_host}" "${pimox_user}" "${pimox_key}" "sudo '${qm_bin}' set '${vmid}' --agent enabled=1 --sockets 1 --cores '${worker_cores}' --memory '${worker_memory}' --affinity '${worker_cpu_affinity}'
|
||||
if truthy "${worker_replace_existing}"; then
|
||||
echo "Replacing existing Pimox worker VM ${vmid} (${worker_name}) before cloning from template ${template_vmid}..."
|
||||
pimox_ssh "${pimox_host}" "${pimox_user}" "${pimox_key}" "set -eu
|
||||
sudo '${qm_bin}' stop '${vmid}' >/dev/null 2>&1 || true
|
||||
elapsed=0
|
||||
while [ \"\$elapsed\" -lt 300 ]; do
|
||||
if sudo '${qm_bin}' status '${vmid}' | grep -q 'status: stopped'; then
|
||||
break
|
||||
fi
|
||||
sleep 5
|
||||
elapsed=\$((elapsed + 5))
|
||||
done
|
||||
sudo '${qm_bin}' destroy '${vmid}' --purge 1 >/dev/null 2>&1 || sudo '${qm_bin}' destroy '${vmid}'"
|
||||
else
|
||||
pimox_ssh "${pimox_host}" "${pimox_user}" "${pimox_key}" "sudo '${qm_bin}' set '${vmid}' --agent enabled=1 --sockets 1 --cores '${worker_cores}' --memory '${worker_memory}' --affinity '${worker_cpu_affinity}'
|
||||
if sudo '${qm_bin}' status '${vmid}' | grep -q 'status: stopped'; then sudo '${qm_bin}' start '${vmid}'; fi"
|
||||
else
|
||||
fi
|
||||
fi
|
||||
|
||||
if ! pimox_ssh "${pimox_host}" "${pimox_user}" "${pimox_key}" "sudo '${qm_bin}' status '${vmid}' >/dev/null 2>&1"; then
|
||||
pimox_ssh "${pimox_host}" "${pimox_user}" "${pimox_key}" "set -eu
|
||||
if ! ip link show '${bridge}' >/dev/null 2>&1; then
|
||||
echo 'Pimox bridge ${bridge} does not exist. Refusing to change Orange Pi networking.' >&2
|
||||
|
|
@ -325,7 +343,7 @@ write_cluster_worker_var_file() {
|
|||
local spec_file="$1"
|
||||
local var_file="$2"
|
||||
|
||||
LAB_INCLUDE_RASPBERRY_WORKER="${LAB_INCLUDE_RASPBERRY_WORKER:-true}" \
|
||||
LAB_INCLUDE_RASPBERRY_WORKER="${LAB_INCLUDE_RASPBERRY_WORKER:-false}" \
|
||||
LAB_RASPBERRY_HOST="${LAB_RASPBERRY_HOST:-192.168.100.89}" \
|
||||
LAB_RASPBERRY_USER="${LAB_RASPBERRY_USER:-jv}" \
|
||||
LAB_RASPBERRY_NODE_NAME="${LAB_RASPBERRY_NODE_NAME:-raspberry}" \
|
||||
|
|
@ -385,17 +403,21 @@ run_pimox_pipeline() {
|
|||
local bridge="${LAB_PIMOX_BRIDGE:-${TF_VAR_pimox_template_bridge:-vmbr0}}"
|
||||
local template_vmid="${LAB_PIMOX_TEMPLATE_VMID:-${TF_VAR_pimox_template_vmid:-9000}}"
|
||||
local template_name="${LAB_PIMOX_TEMPLATE_NAME:-${TF_VAR_pimox_template_name:-debian13-arm64-k8s-template}}"
|
||||
local template_cores="${LAB_PIMOX_TEMPLATE_CORES:-${TF_VAR_pimox_template_cores:-2}}"
|
||||
local template_memory="${LAB_PIMOX_TEMPLATE_MEMORY:-${TF_VAR_pimox_template_memory:-4096}}"
|
||||
local template_cpu_affinity="${LAB_PIMOX_TEMPLATE_CPU_AFFINITY:-${TF_VAR_pimox_template_cpu_affinity:-4-5}}"
|
||||
local template_replace_existing="${LAB_PIMOX_TEMPLATE_REPLACE_EXISTING:-${TF_VAR_pimox_template_replace_existing:-false}}"
|
||||
local provisioning_interface
|
||||
local worker_count="${LAB_PIMOX_WORKER_COUNT:-1}"
|
||||
local worker_count="${LAB_PIMOX_WORKER_COUNT:-2}"
|
||||
local worker_base_vmid="${LAB_PIMOX_WORKER_BASE_VMID:-9010}"
|
||||
local worker_name_prefix="${LAB_PIMOX_WORKER_NAME_PREFIX:-pimox-worker}"
|
||||
local worker_node_prefix="${LAB_PIMOX_WORKER_NODE_PREFIX:-pimox-worker}"
|
||||
local worker_key_prefix="${LAB_PIMOX_WORKER_KEY_PREFIX:-pimox}"
|
||||
local worker_skip_indexes="${LAB_PIMOX_SKIP_WORKER_INDEXES:-1}"
|
||||
local worker_skip_indexes="${LAB_PIMOX_SKIP_WORKER_INDEXES:-}"
|
||||
local worker_cores="${LAB_PIMOX_WORKER_CORES:-2}"
|
||||
local worker_memory="${LAB_PIMOX_WORKER_MEMORY:-4096}"
|
||||
local worker_cpu_affinities="${LAB_PIMOX_WORKER_CPU_AFFINITIES:-4-5 6-7}"
|
||||
local worker_replace_existing="${LAB_PIMOX_WORKER_REPLACE_EXISTING:-false}"
|
||||
local worker_storage="${LAB_PIMOX_WORKER_STORAGE:-${TF_VAR_pimox_worker_storage:-nvme_thin_pool}}"
|
||||
local worker_user="${LAB_PIMOX_WORKER_USER:-jv}"
|
||||
local worker_key_path="${LAB_PIMOX_WORKER_SSH_KEY_PATH:-/home/jv/.ssh/id_ed25519}"
|
||||
|
|
@ -406,6 +428,7 @@ run_pimox_pipeline() {
|
|||
local index
|
||||
local readiness_output
|
||||
local readiness_status
|
||||
local template_cpu_count
|
||||
local worker_cpu_affinity
|
||||
|
||||
if disabled_value "${mode}"; then
|
||||
|
|
@ -420,6 +443,30 @@ run_pimox_pipeline() {
|
|||
echo "LAB_PIMOX_WORKER_COUNT must be a non-negative integer." >&2
|
||||
exit 1
|
||||
fi
|
||||
if ! [[ "${template_cores}" =~ ^[0-9]+$ && "${worker_cores}" =~ ^[0-9]+$ ]]; then
|
||||
echo "LAB_PIMOX_TEMPLATE_CORES and LAB_PIMOX_WORKER_CORES must be positive integers." >&2
|
||||
exit 1
|
||||
fi
|
||||
if ! [[ "${template_memory}" =~ ^[0-9]+$ && "${worker_memory}" =~ ^[0-9]+$ ]]; then
|
||||
echo "LAB_PIMOX_TEMPLATE_MEMORY and LAB_PIMOX_WORKER_MEMORY must be positive integer MiB values." >&2
|
||||
exit 1
|
||||
fi
|
||||
if ((template_cores == 0 || worker_cores == 0 || template_memory == 0 || worker_memory == 0)); then
|
||||
echo "Pimox template and worker CPU and memory values must be greater than zero." >&2
|
||||
exit 1
|
||||
fi
|
||||
if ! template_cpu_count="$(cpuset_cpu_count "${template_cpu_affinity}")"; then
|
||||
echo "Invalid Pimox template CPU affinity '${template_cpu_affinity}'. Use CPU IDs or ranges, such as 4-5." >&2
|
||||
exit 1
|
||||
fi
|
||||
if ((template_cpu_count != template_cores)); then
|
||||
echo "Pimox template uses ${template_cores} cores but affinity '${template_cpu_affinity}' contains ${template_cpu_count} CPUs." >&2
|
||||
exit 1
|
||||
fi
|
||||
if ! truthy "${worker_replace_existing}" && ! disabled_value "${worker_replace_existing}"; then
|
||||
echo "LAB_PIMOX_WORKER_REPLACE_EXISTING must be true or false." >&2
|
||||
exit 1
|
||||
fi
|
||||
if ! [[ "${worker_storage}" =~ ^[A-Za-z0-9_.:-]+$ ]]; then
|
||||
echo "LAB_PIMOX_WORKER_STORAGE must be a valid Pimox storage identifier." >&2
|
||||
exit 1
|
||||
|
|
@ -469,6 +516,9 @@ fi" 2>&1)"
|
|||
export TF_VAR_pimox_template_bridge="${bridge}"
|
||||
export TF_VAR_pimox_template_vmid="${template_vmid}"
|
||||
export TF_VAR_pimox_template_name="${template_name}"
|
||||
export TF_VAR_pimox_template_cores="${template_cores}"
|
||||
export TF_VAR_pimox_template_memory="${template_memory}"
|
||||
export TF_VAR_pimox_template_cpu_affinity="${template_cpu_affinity}"
|
||||
export TF_VAR_pimox_template_replace_existing="${template_replace_existing}"
|
||||
export TF_VAR_pimox_template_builder_enabled="${TF_VAR_pimox_template_builder_enabled:-true}"
|
||||
export TF_VAR_pimox_template_build_ssh_key_path="${TF_VAR_pimox_template_build_ssh_key_path:-${worker_key_path}}"
|
||||
|
|
@ -519,6 +569,7 @@ fi" 2>&1)"
|
|||
"${timeout_seconds}" \
|
||||
"${qm_bin}" \
|
||||
"${worker_storage}" \
|
||||
"${worker_replace_existing}" \
|
||||
"${worker_cpu_affinity}"
|
||||
done
|
||||
|
||||
|
|
@ -2239,6 +2290,36 @@ up() {
|
|||
echo "Deployment successfully completed."
|
||||
}
|
||||
|
||||
#######################################
# Entry point for the `rebuild-cluster` subcommand.
#
# Seeds rebuild-oriented defaults for the Pimox / Raspberry Pi knobs, exports
# them so every helper invoked afterwards inherits the same values, then runs
# the teardown and provisioning helpers in a fixed order. A value that is
# already set and non-empty in the caller's environment always wins over the
# default chosen here.
#
# Globals:   LAB_INCLUDE_RASPBERRY_WORKER, LAB_PIMOX_*, WORKER_SSH_TARGETS
#            (defaulted when unset/empty, then exported)
# Arguments: none
# Outputs:   progress banners on stdout
#######################################
rebuild_cluster() {
  # NOTE(review): presumably aborts when not running on the Debian server;
  # the helper is defined outside this block, so confirm there.
  require_debian_server "rebuild-cluster"

  # ":=" assigns only when the variable is unset or empty, which matches the
  # "${VAR:-default}" fallbacks this function has always applied.
  : "${LAB_INCLUDE_RASPBERRY_WORKER:=false}"
  : "${LAB_PIMOX_PIPELINE:=true}"
  : "${LAB_PIMOX_TEMPLATE_REPLACE_EXISTING:=true}"
  : "${LAB_PIMOX_TEMPLATE_CORES:=2}"
  : "${LAB_PIMOX_TEMPLATE_MEMORY:=4096}"
  : "${LAB_PIMOX_TEMPLATE_CPU_AFFINITY:=4-5}"
  : "${LAB_PIMOX_WORKER_COUNT:=2}"
  : "${LAB_PIMOX_SKIP_WORKER_INDEXES:=}"
  : "${LAB_PIMOX_WORKER_REPLACE_EXISTING:=true}"
  : "${LAB_PIMOX_WORKER_CORES:=2}"
  : "${LAB_PIMOX_WORKER_MEMORY:=4096}"
  : "${LAB_PIMOX_WORKER_CPU_AFFINITIES:=4-5 6-7}"
  : "${WORKER_SSH_TARGETS:=}"

  # Publish the resolved settings to the environment in one go.
  export \
    LAB_INCLUDE_RASPBERRY_WORKER \
    LAB_PIMOX_PIPELINE \
    LAB_PIMOX_TEMPLATE_REPLACE_EXISTING \
    LAB_PIMOX_TEMPLATE_CORES \
    LAB_PIMOX_TEMPLATE_MEMORY \
    LAB_PIMOX_TEMPLATE_CPU_AFFINITY \
    LAB_PIMOX_WORKER_COUNT \
    LAB_PIMOX_SKIP_WORKER_INDEXES \
    LAB_PIMOX_WORKER_REPLACE_EXISTING \
    LAB_PIMOX_WORKER_CORES \
    LAB_PIMOX_WORKER_MEMORY \
    LAB_PIMOX_WORKER_CPU_AFFINITIES \
    WORKER_SSH_TARGETS

  echo "Rebuilding the Kubernetes cluster without touching external Raspberry Pi Gitea..."

  # Tear down first, then rebuild: Pimox VMs, OpenWrt, and the OpenTofu
  # stacks (cluster, platform), followed by apps and finally the edge stack.
  nuke
  run_pimox_pipeline
  run_openwrt_pipeline
  run_tofu_stack "bootstrap/cluster"
  run_tofu_stack "bootstrap/platform"
  apps
  run_tofu_stack "bootstrap/edge"

  echo "Cluster rebuild successfully completed."
}
|
||||
|
||||
nuke() {
|
||||
local worker_ssh_targets
|
||||
local worker_targets
|
||||
|
|
@ -2247,7 +2328,7 @@ nuke() {
|
|||
require_debian_server "nuke"
|
||||
|
||||
echo "Brutally nuking the homelab infrastructure..."
|
||||
worker_ssh_targets="${WORKER_SSH_TARGETS-jv@192.168.100.89}"
|
||||
worker_ssh_targets="${WORKER_SSH_TARGETS-}"
|
||||
read -r -a worker_targets <<< "${worker_ssh_targets}"
|
||||
|
||||
echo "--> Terminating local OpenTofu tasks..."
|
||||
|
|
@ -2364,6 +2445,9 @@ EOF
|
|||
rm -f "${BUILDX_CONFIG}" || true
|
||||
|
||||
echo "--> Deleting OpenTofu tracking state files..."
|
||||
rm -rf "${REPO_ROOT}"/bootstrap/provisioning/terraform.tfstate*
|
||||
rm -f "${REPO_ROOT}"/bootstrap/provisioning/.terraform.tfstate.lock.info
|
||||
rm -rf "${REPO_ROOT}"/bootstrap/provisioning/.terraform/
|
||||
rm -rf "${REPO_ROOT}"/bootstrap/cluster/terraform.tfstate*
|
||||
rm -f "${REPO_ROOT}"/bootstrap/cluster/.terraform.tfstate.lock.info
|
||||
rm -rf "${REPO_ROOT}"/bootstrap/cluster/.terraform/
|
||||
|
|
@ -2384,6 +2468,9 @@ case "${1:-}" in
|
|||
up)
|
||||
up
|
||||
;;
|
||||
rebuild-cluster)
|
||||
rebuild_cluster
|
||||
;;
|
||||
apps)
|
||||
apps
|
||||
;;
|
||||
|
|
@ -2406,8 +2493,7 @@ case "${1:-}" in
|
|||
nuke
|
||||
;;
|
||||
*)
|
||||
echo "Usage: $0 {up|apps|deploy-gitea|bootstrap-gitea-repo|backup-gitea|drill-gitea-restore|install-gitea-runner|nuke}"
|
||||
echo "Usage: $0 {up|rebuild-cluster|apps|deploy-gitea|bootstrap-gitea-repo|backup-gitea|drill-gitea-restore|install-gitea-runner|nuke}"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue