Improve Pimox worker boot diagnostics
Homelab Main / deploy (push) Has been cancelled Details

This commit is contained in:
juvdiaz 2026-05-27 16:35:46 -06:00
parent 14b784ef9a
commit 8a55d14542
1 changed file with 59 additions and 7 deletions

62
lab.sh
View File

@ -120,7 +120,7 @@ pimox_guest_ipv4() {
local key_path="$3" local key_path="$3"
local vmid="$4" local vmid="$4"
local ip_prefix="$5" local ip_prefix="$5"
local qm_bin="${LAB_PIMOX_QM_BIN:-/usr/sbin/qm}" local qm_bin="${6:-${LAB_PIMOX_QM_BIN:-/usr/sbin/qm}}"
guest_json="$(pimox_ssh "${host}" "${user}" "${key_path}" "sudo '${qm_bin}' guest cmd '${vmid}' network-get-interfaces" 2>/dev/null || true)" guest_json="$(pimox_ssh "${host}" "${user}" "${key_path}" "sudo '${qm_bin}' guest cmd '${vmid}' network-get-interfaces" 2>/dev/null || true)"
if [[ -z "${guest_json}" ]]; then if [[ -z "${guest_json}" ]]; then
@ -153,6 +153,22 @@ sys.exit(1)
PY PY
} }
#######################################
# Best-effort diagnostics dump for a Pimox worker VM.
#
# Hands a small script to pimox_ssh (presumably executed on the Pimox host
# over SSH -- confirm against pimox_ssh's definition) that prints:
#   1. `qm status` for the VM,
#   2. the agent/boot/network/disk lines of `qm config`,
#   3. the guest agent's `network-get-interfaces` reply.
#
# Arguments:
#   $1 - Pimox host
#   $2 - user passed to pimox_ssh
#   $3 - key path passed to pimox_ssh
#   $4 - VM id
#   $5 - (optional) path to the qm binary; falls back to LAB_PIMOX_QM_BIN,
#        then /usr/sbin/qm
# Outputs:
#   Everything pimox_ssh writes to stdout is redirected to stderr, so
#   callers that capture this function's stdout are not polluted.
# Returns:
#   Always 0; a failing pimox_ssh is deliberately ignored because this is
#   only called to add context to an error that is already being reported.
#######################################
pimox_worker_vm_debug() {
  local host="$1"
  local user="$2"
  local key_path="$3"
  local vmid="$4"
  # Same fallback chain the sibling helpers use, so a caller that omits the
  # fifth argument does not end up running "sudo '' status ...".
  local qm_bin="${5:-${LAB_PIMOX_QM_BIN:-/usr/sbin/qm}}"

  # `set +e` keeps the script going after a failing qm call so every section
  # is still printed; `|| true` after grep covers the "no matching config
  # line" case.
  pimox_ssh "${host}" "${user}" "${key_path}" "set +e
echo 'Pimox VM ${vmid} status:'
sudo '${qm_bin}' status '${vmid}'
echo 'Pimox VM ${vmid} config summary:'
sudo '${qm_bin}' config '${vmid}' | grep -E '^(agent|boot|net0|scsi0|virtio0|sata0|ide0|ide2|efidisk0):' || true
echo 'Pimox VM ${vmid} guest-agent network-get-interfaces:'
sudo '${qm_bin}' guest cmd '${vmid}' network-get-interfaces" >&2 || true
}
wait_for_pimox_guest_ssh() { wait_for_pimox_guest_ssh() {
local host="$1" local host="$1"
local user="$2" local user="$2"
@ -162,20 +178,56 @@ wait_for_pimox_guest_ssh() {
local guest_key_path="$6" local guest_key_path="$6"
local ip_prefix="$7" local ip_prefix="$7"
local timeout_seconds="$8" local timeout_seconds="$8"
local qm_bin="${9:-${LAB_PIMOX_QM_BIN:-/usr/sbin/qm}}"
local deadline local deadline
local elapsed
local guest_ip local guest_ip
local ip_filter_description
local last_guest_ip=""
local last_ssh_output=""
local next_log
local ssh_output
ip_filter_description="matching prefix ${ip_prefix}"
if [[ -z "${ip_prefix}" ]]; then
ip_filter_description="that is not loopback or link-local"
fi
deadline=$((SECONDS + timeout_seconds)) deadline=$((SECONDS + timeout_seconds))
next_log="${SECONDS}"
while ((SECONDS < deadline)); do while ((SECONDS < deadline)); do
guest_ip="$(pimox_guest_ipv4 "${host}" "${user}" "${key_path}" "${vmid}" "${ip_prefix}" || true)" guest_ip="$(pimox_guest_ipv4 "${host}" "${user}" "${key_path}" "${vmid}" "${ip_prefix}" "${qm_bin}" || true)"
if [[ -n "${guest_ip}" ]] && if [[ -n "${guest_ip}" ]]; then
ssh -i "${guest_key_path}" -o BatchMode=yes -o ConnectTimeout=8 -o StrictHostKeyChecking=accept-new "${guest_user}@${guest_ip}" true >/dev/null 2>&1; then last_guest_ip="${guest_ip}"
if ssh_output="$(ssh -i "${guest_key_path}" -o BatchMode=yes -o ConnectTimeout=8 -o StrictHostKeyChecking=accept-new "${guest_user}@${guest_ip}" true 2>&1)"; then
printf '%s\n' "${guest_ip}" printf '%s\n' "${guest_ip}"
return 0 return 0
fi fi
last_ssh_output="${ssh_output}"
fi
if ((SECONDS >= next_log)); then
elapsed=$((timeout_seconds - (deadline - SECONDS)))
if [[ -n "${last_guest_ip}" ]]; then
echo "Waiting for SSH to worker VM ${vmid} at ${last_guest_ip} as ${guest_user} (${elapsed}s elapsed)..." >&2
else
echo "Waiting for worker VM ${vmid} to report an IPv4 address ${ip_filter_description} through qemu-guest-agent (${elapsed}s elapsed)..." >&2
fi
next_log=$((SECONDS + 60))
fi
sleep 10 sleep 10
done done
if [[ -n "${last_guest_ip}" ]]; then
echo "Worker VM ${vmid} reported guest IP ${last_guest_ip}, but SSH as ${guest_user} never became reachable." >&2
if [[ -n "${last_ssh_output}" ]]; then
echo "Last SSH failure: ${last_ssh_output}" >&2
fi
else
echo "Worker VM ${vmid} did not report an IPv4 address ${ip_filter_description} through qemu-guest-agent." >&2
fi
pimox_worker_vm_debug "${host}" "${user}" "${key_path}" "${vmid}" "${qm_bin}"
return 1 return 1
} }
@ -358,7 +410,7 @@ sudo '${qm_bin}' set '${vmid}' --onboot 1
sudo '${qm_bin}' start '${vmid}'" sudo '${qm_bin}' start '${vmid}'"
fi fi
if ! guest_ip="$(wait_for_pimox_guest_ssh "${pimox_host}" "${pimox_user}" "${pimox_key}" "${vmid}" "${worker_user}" "${worker_key_path}" "${ip_prefix}" "${timeout_seconds}")"; then if ! guest_ip="$(wait_for_pimox_guest_ssh "${pimox_host}" "${pimox_user}" "${pimox_key}" "${vmid}" "${worker_user}" "${worker_key_path}" "${ip_prefix}" "${timeout_seconds}" "${qm_bin}")"; then
echo "Timed out waiting for worker VM ${vmid} (${worker_name}) to report a reachable guest IP." >&2 echo "Timed out waiting for worker VM ${vmid} (${worker_name}) to report a reachable guest IP." >&2
exit 1 exit 1
fi fi