From 8a55d14542eb2253285d999da352dec92968b595 Mon Sep 17 00:00:00 2001 From: juvdiaz Date: Wed, 27 May 2026 16:35:46 -0600 Subject: [PATCH] Improve Pimox worker boot diagnostics --- lab.sh | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 59 insertions(+), 7 deletions(-) diff --git a/lab.sh b/lab.sh index afbe15e..b8bd730 100755 --- a/lab.sh +++ b/lab.sh @@ -120,7 +120,7 @@ pimox_guest_ipv4() { local key_path="$3" local vmid="$4" local ip_prefix="$5" - local qm_bin="${LAB_PIMOX_QM_BIN:-/usr/sbin/qm}" + local qm_bin="${6:-${LAB_PIMOX_QM_BIN:-/usr/sbin/qm}}" guest_json="$(pimox_ssh "${host}" "${user}" "${key_path}" "sudo '${qm_bin}' guest cmd '${vmid}' network-get-interfaces" 2>/dev/null || true)" if [[ -z "${guest_json}" ]]; then @@ -153,6 +153,22 @@ sys.exit(1) PY } +pimox_worker_vm_debug() { + local host="$1" + local user="$2" + local key_path="$3" + local vmid="$4" + local qm_bin="$5" + + pimox_ssh "${host}" "${user}" "${key_path}" "set +e +echo 'Pimox VM ${vmid} status:' +sudo '${qm_bin}' status '${vmid}' +echo 'Pimox VM ${vmid} config summary:' +sudo '${qm_bin}' config '${vmid}' | grep -E '^(agent|boot|net0|scsi0|virtio0|sata0|ide0|ide2|efidisk0):' || true +echo 'Pimox VM ${vmid} guest-agent network-get-interfaces:' +sudo '${qm_bin}' guest cmd '${vmid}' network-get-interfaces" >&2 || true +} + wait_for_pimox_guest_ssh() { local host="$1" local user="$2" @@ -162,20 +178,56 @@ wait_for_pimox_guest_ssh() { local guest_key_path="$6" local ip_prefix="$7" local timeout_seconds="$8" + local qm_bin="${9:-${LAB_PIMOX_QM_BIN:-/usr/sbin/qm}}" local deadline + local elapsed local guest_ip + local ip_filter_description + local last_guest_ip="" + local last_ssh_output="" + local next_log + local ssh_output + + ip_filter_description="matching prefix ${ip_prefix}" + if [[ -z "${ip_prefix}" ]]; then + ip_filter_description="that is not loopback or link-local" + fi deadline=$((SECONDS + timeout_seconds)) + next_log="${SECONDS}" while ((SECONDS < deadline)); do - guest_ip="$(pimox_guest_ipv4 "${host}" "${user}" "${key_path}" "${vmid}" "${ip_prefix}" || true)" - if [[ -n "${guest_ip}" ]] && - ssh -i "${guest_key_path}" -o BatchMode=yes -o ConnectTimeout=8 -o StrictHostKeyChecking=accept-new "${guest_user}@${guest_ip}" true >/dev/null 2>&1; then - printf '%s\n' "${guest_ip}" - return 0 + guest_ip="$(pimox_guest_ipv4 "${host}" "${user}" "${key_path}" "${vmid}" "${ip_prefix}" "${qm_bin}" || true)" + if [[ -n "${guest_ip}" ]]; then + last_guest_ip="${guest_ip}" + if ssh_output="$(ssh -i "${guest_key_path}" -o BatchMode=yes -o ConnectTimeout=8 -o StrictHostKeyChecking=accept-new "${guest_user}@${guest_ip}" true 2>&1)"; then + printf '%s\n' "${guest_ip}" + return 0 + fi + last_ssh_output="${ssh_output}" + fi + + if ((SECONDS >= next_log)); then + elapsed=$((timeout_seconds - (deadline - SECONDS))) + if [[ -n "${last_guest_ip}" ]]; then + echo "Waiting for SSH to worker VM ${vmid} at ${last_guest_ip} as ${guest_user} (${elapsed}s elapsed)..." >&2 + else + echo "Waiting for worker VM ${vmid} to report an IPv4 address ${ip_filter_description} through qemu-guest-agent (${elapsed}s elapsed)..." >&2 + fi + next_log=$((SECONDS + 60)) fi sleep 10 done + if [[ -n "${last_guest_ip}" ]]; then + echo "Worker VM ${vmid} reported guest IP ${last_guest_ip}, but SSH as ${guest_user} never became reachable." >&2 + if [[ -n "${last_ssh_output}" ]]; then + echo "Last SSH failure: ${last_ssh_output}" >&2 + fi + else + echo "Worker VM ${vmid} did not report an IPv4 address ${ip_filter_description} through qemu-guest-agent." >&2 + fi + pimox_worker_vm_debug "${host}" "${user}" "${key_path}" "${vmid}" "${qm_bin}" + return 1 } @@ -358,7 +410,7 @@ sudo '${qm_bin}' set '${vmid}' --onboot 1 sudo '${qm_bin}' start '${vmid}'" fi - if ! guest_ip="$(wait_for_pimox_guest_ssh "${pimox_host}" "${pimox_user}" "${pimox_key}" "${vmid}" "${worker_user}" "${worker_key_path}" "${ip_prefix}" "${timeout_seconds}")"; then + if ! guest_ip="$(wait_for_pimox_guest_ssh "${pimox_host}" "${pimox_user}" "${pimox_key}" "${vmid}" "${worker_user}" "${worker_key_path}" "${ip_prefix}" "${timeout_seconds}" "${qm_bin}")"; then echo "Timed out waiting for worker VM ${vmid} (${worker_name}) to report a reachable guest IP." >&2 exit 1 fi