#!/usr/bin/env bash set -euo pipefail REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" BUILDX_CONFIG="/tmp/buildx-config.toml" KUBECONFIG_PATH="${KUBECONFIG_PATH:-${TF_VAR_kubeconfig_path:-/home/jv/.kube/config}}" trap 'rm -f "${BUILDX_CONFIG}"' EXIT require_debian_server() { local command_name="$1" local os_id="" if [[ "$(uname -s)" != "Linux" ]]; then echo "Refusing to run '${command_name}' from this machine. Run it on the Debian homelab server." >&2 exit 1 fi if [[ -r /etc/os-release ]]; then os_id="$(awk -F= '$1 == "ID" {gsub(/"/, "", $2); print $2; exit}' /etc/os-release)" fi if [[ "${os_id}" != "debian" ]]; then echo "Refusing to run '${command_name}' on ${os_id:-unknown OS}. Run it on the Debian homelab server." >&2 exit 1 fi } run_tofu_stack() { local stack="$1" local -a apply_args=(-auto-approve) if [[ "${stack}" == "bootstrap/cluster" && -n "${LAB_CLUSTER_VAR_FILE:-}" ]]; then apply_args+=("-var-file=${LAB_CLUSTER_VAR_FILE}") fi tofu -chdir="${REPO_ROOT}/${stack}" init tofu -chdir="${REPO_ROOT}/${stack}" apply "${apply_args[@]}" } truthy() { case "${1,,}" in 1 | true | yes | on) return 0 ;; *) return 1 ;; esac } disabled_value() { case "${1,,}" in 0 | false | no | off | disabled) return 0 ;; *) return 1 ;; esac } ensure_python3() { if command -v python3 >/dev/null 2>&1; then return 0 fi sudo apt-get update sudo apt-get install -y --no-install-recommends python3 } detect_route_interface() { local target="$1" ip route get "${target}" 2>/dev/null | awk ' { for (i = 1; i <= NF; i++) { if ($i == "dev") { print $(i + 1) exit } } } ' } pimox_ssh() { local host="$1" local user="$2" local key_path="$3" shift 3 ssh -i "${key_path}" -o BatchMode=yes -o ConnectTimeout=10 -o StrictHostKeyChecking=accept-new "${user}@${host}" "$@" } pimox_guest_ipv4() { local guest_json local host="$1" local user="$2" local key_path="$3" local vmid="$4" local ip_prefix="$5" local qm_bin="${LAB_PIMOX_QM_BIN:-/usr/sbin/qm}" guest_json="$(pimox_ssh "${host}" "${user}" 
"${key_path}" "sudo '${qm_bin}' guest cmd '${vmid}' network-get-interfaces" 2>/dev/null || true)" if [[ -z "${guest_json}" ]]; then return 1 fi GUEST_JSON="${guest_json}" python3 - "${ip_prefix}" <<'PY' import json import os import sys prefix = sys.argv[1] try: interfaces = json.loads(os.environ.get("GUEST_JSON", "")) except Exception: sys.exit(1) for iface in interfaces or []: for address in iface.get("ip-addresses") or []: if address.get("ip-address-type") != "ipv4": continue ip = address.get("ip-address", "") if not ip or ip.startswith(("127.", "169.254.")): continue if prefix and not ip.startswith(prefix): continue print(ip) sys.exit(0) sys.exit(1) PY } wait_for_pimox_guest_ssh() { local host="$1" local user="$2" local key_path="$3" local vmid="$4" local guest_user="$5" local guest_key_path="$6" local ip_prefix="$7" local timeout_seconds="$8" local deadline local guest_ip deadline=$((SECONDS + timeout_seconds)) while ((SECONDS < deadline)); do guest_ip="$(pimox_guest_ipv4 "${host}" "${user}" "${key_path}" "${vmid}" "${ip_prefix}" || true)" if [[ -n "${guest_ip}" ]] && ssh -i "${guest_key_path}" -o BatchMode=yes -o ConnectTimeout=8 -o StrictHostKeyChecking=accept-new "${guest_user}@${guest_ip}" true >/dev/null 2>&1; then printf '%s\n' "${guest_ip}" return 0 fi sleep 10 done return 1 } pimox_generated_mac() { local vmid="$1" printf '02:68:10:%02x:%02x:%02x\n' \ $(((vmid >> 16) & 255)) \ $(((vmid >> 8) & 255)) \ $((vmid & 255)) } ensure_pimox_worker_node() { local index="$1" local spec_file="$2" local pimox_host="$3" local pimox_user="$4" local pimox_key="$5" local template_vmid="$6" local bridge="$7" local worker_base_vmid="$8" local worker_name_prefix="$9" local worker_node_prefix="${10}" local worker_key_prefix="${11}" local worker_cores="${12}" local worker_memory="${13}" local worker_user="${14}" local worker_key_path="${15}" local ip_prefix="${16}" local timeout_seconds="${17}" local qm_bin="${18}" local padded local vmid local worker_key local worker_name 
local node_name local mac local guest_ip printf -v padded '%02d' "${index}" vmid=$((worker_base_vmid + index - 1)) worker_key="${worker_key_prefix}${padded}" worker_name="${worker_name_prefix}-${padded}" node_name="${worker_node_prefix}-${padded}" mac="$(pimox_generated_mac "${vmid}")" if pimox_ssh "${pimox_host}" "${pimox_user}" "${pimox_key}" "sudo '${qm_bin}' status '${vmid}' >/dev/null 2>&1"; then if pimox_ssh "${pimox_host}" "${pimox_user}" "${pimox_key}" "sudo '${qm_bin}' config '${vmid}' | grep -q '^template: 1$'"; then echo "VM ${vmid} exists as a template; refusing to reuse it as worker ${worker_name}." >&2 exit 1 fi pimox_ssh "${pimox_host}" "${pimox_user}" "${pimox_key}" "sudo '${qm_bin}' set '${vmid}' --agent enabled=1 if sudo '${qm_bin}' status '${vmid}' | grep -q 'status: stopped'; then sudo '${qm_bin}' start '${vmid}'; fi" else pimox_ssh "${pimox_host}" "${pimox_user}" "${pimox_key}" "set -eu if ! ip link show '${bridge}' >/dev/null 2>&1; then echo 'Pimox bridge ${bridge} does not exist. Refusing to change Orange Pi networking.' >&2 exit 1 fi sudo '${qm_bin}' clone '${template_vmid}' '${vmid}' --name '${worker_name}' --full 1 sudo '${qm_bin}' set '${vmid}' --agent enabled=1 sudo '${qm_bin}' set '${vmid}' --cores '${worker_cores}' --memory '${worker_memory}' sudo '${qm_bin}' set '${vmid}' --net0 'virtio=${mac},bridge=${bridge}' sudo '${qm_bin}' set '${vmid}' --boot 'order=scsi0;net0' sudo '${qm_bin}' set '${vmid}' --onboot 1 sudo '${qm_bin}' start '${vmid}'" fi if ! guest_ip="$(wait_for_pimox_guest_ssh "${pimox_host}" "${pimox_user}" "${pimox_key}" "${vmid}" "${worker_user}" "${worker_key_path}" "${ip_prefix}" "${timeout_seconds}")"; then echo "Timed out waiting for worker VM ${vmid} (${worker_name}) to report a reachable guest IP." 
>&2 exit 1 fi printf '%s\t%s\t%s\t%s\t%s\n' "${worker_key}" "${guest_ip}" "${worker_user}" "${node_name}" "${worker_key_path}" >>"${spec_file}" } write_cluster_worker_var_file() { local spec_file="$1" local var_file="$2" LAB_INCLUDE_RASPBERRY_WORKER="${LAB_INCLUDE_RASPBERRY_WORKER:-true}" \ LAB_RASPBERRY_HOST="${LAB_RASPBERRY_HOST:-192.168.100.89}" \ LAB_RASPBERRY_USER="${LAB_RASPBERRY_USER:-jv}" \ LAB_RASPBERRY_NODE_NAME="${LAB_RASPBERRY_NODE_NAME:-raspberry}" \ LAB_RASPBERRY_SSH_KEY_PATH="${LAB_RASPBERRY_SSH_KEY_PATH:-/home/jv/.ssh/id_ed25519}" \ python3 - "${spec_file}" "${var_file}" <<'PY' import json import os import sys spec_file, var_file = sys.argv[1:3] nodes = {} if os.environ["LAB_INCLUDE_RASPBERRY_WORKER"].lower() not in {"0", "false", "no", "off", "disabled"}: nodes["raspberrypi"] = { "host": os.environ["LAB_RASPBERRY_HOST"], "user": os.environ["LAB_RASPBERRY_USER"], "node_name": os.environ["LAB_RASPBERRY_NODE_NAME"], "ssh_key_path": os.environ["LAB_RASPBERRY_SSH_KEY_PATH"], } with open(spec_file, encoding="utf-8") as handle: for line in handle: line = line.rstrip("\n") if not line: continue key, host, user, node_name, ssh_key_path = line.split("\t") nodes[key] = { "host": host, "user": user, "node_name": node_name, "ssh_key_path": ssh_key_path, } with open(var_file, "w", encoding="utf-8") as handle: json.dump({"worker_nodes": nodes}, handle, indent=2) handle.write("\n") PY } run_pimox_pipeline() { local mode="${LAB_PIMOX_PIPELINE:-auto}" local pimox_host="${LAB_PIMOX_HOST:-${TF_VAR_pimox_host:-192.168.100.80}}" local pimox_user="${LAB_PIMOX_USER:-${TF_VAR_pimox_user:-jv}}" local pimox_key="${LAB_PIMOX_SSH_KEY_PATH:-${TF_VAR_pimox_ssh_key_path:-/home/jv/.ssh/id_ed25519}}" local qm_bin="${LAB_PIMOX_QM_BIN:-${TF_VAR_pimox_qm_bin:-/usr/sbin/qm}}" local bridge="${LAB_PIMOX_BRIDGE:-${TF_VAR_pimox_template_bridge:-vmbr0}}" local template_vmid="${LAB_PIMOX_TEMPLATE_VMID:-${TF_VAR_pimox_template_vmid:-9000}}" local 
template_name="${LAB_PIMOX_TEMPLATE_NAME:-${TF_VAR_pimox_template_name:-debian13-arm64-k8s-template}}" local template_replace_existing="${LAB_PIMOX_TEMPLATE_REPLACE_EXISTING:-${TF_VAR_pimox_template_replace_existing:-false}}" local provisioning_interface local worker_count="${LAB_PIMOX_WORKER_COUNT:-1}" local worker_base_vmid="${LAB_PIMOX_WORKER_BASE_VMID:-9010}" local worker_name_prefix="${LAB_PIMOX_WORKER_NAME_PREFIX:-pimox-worker}" local worker_node_prefix="${LAB_PIMOX_WORKER_NODE_PREFIX:-pimox-worker}" local worker_key_prefix="${LAB_PIMOX_WORKER_KEY_PREFIX:-pimox}" local worker_cores="${LAB_PIMOX_WORKER_CORES:-2}" local worker_memory="${LAB_PIMOX_WORKER_MEMORY:-2048}" local worker_user="${LAB_PIMOX_WORKER_USER:-jv}" local worker_key_path="${LAB_PIMOX_WORKER_SSH_KEY_PATH:-/home/jv/.ssh/id_ed25519}" local ip_prefix="${LAB_PIMOX_GUEST_IP_PREFIX:-192.168.100.}" local timeout_seconds="${LAB_PIMOX_GUEST_TIMEOUT_SECONDS:-3600}" local spec_file="${REPO_ROOT}/.lab/pimox-workers.tsv" local var_file="${REPO_ROOT}/.lab/cluster-workers.auto.tfvars.json" local index local readiness_output local readiness_status if disabled_value "${mode}"; then return 0 fi if [[ "${mode}" == "auto" && -n "${LAB_PIMOX_WORKER_COUNT+x}" ]]; then mode="true" fi if ! [[ "${worker_count}" =~ ^[0-9]+$ ]]; then echo "LAB_PIMOX_WORKER_COUNT must be a non-negative integer." >&2 exit 1 fi set +e readiness_output="$(pimox_ssh "${pimox_host}" "${pimox_user}" "${pimox_key}" "set -eu if ! { command -v qm >/dev/null 2>&1 || [ -x '${qm_bin}' ]; }; then echo 'qm was not found in PATH and ${qm_bin} is not executable' exit 1 fi if ! ip link show '${bridge}' >/dev/null 2>&1; then echo 'bridge ${bridge} was not found' exit 1 fi if ! sudo -n true >/dev/null 2>&1; then echo 'passwordless sudo is not available for ${pimox_user}' exit 1 fi" 2>&1)" readiness_status=$? 
set -e if ((readiness_status != 0)); then if [[ "${mode}" == "auto" ]]; then echo "Skipping Pimox automation because ${pimox_user}@${pimox_host} with bridge ${bridge} is not ready." return 0 fi echo "Pimox automation requested, but ${pimox_user}@${pimox_host} is not ready: ${readiness_output}" >&2 exit 1 fi ensure_python3 provisioning_interface="${TF_VAR_provisioning_interface:-${LAB_PROVISIONING_INTERFACE:-$(detect_route_interface "${pimox_host}")}}" if [[ -z "${provisioning_interface}" ]]; then echo "Could not detect the Debian interface used to reach ${pimox_host}; set LAB_PROVISIONING_INTERFACE." >&2 exit 1 fi export TF_VAR_provisioning_interface="${provisioning_interface}" export TF_VAR_pimox_host="${pimox_host}" export TF_VAR_pimox_user="${pimox_user}" export TF_VAR_pimox_ssh_key_path="${pimox_key}" export TF_VAR_pimox_qm_bin="${qm_bin}" export TF_VAR_pimox_template_bridge="${bridge}" export TF_VAR_pimox_template_vmid="${template_vmid}" export TF_VAR_pimox_template_name="${template_name}" export TF_VAR_pimox_template_replace_existing="${template_replace_existing}" export TF_VAR_pimox_template_builder_enabled="${TF_VAR_pimox_template_builder_enabled:-true}" export TF_VAR_pimox_template_build_ssh_key_path="${TF_VAR_pimox_template_build_ssh_key_path:-${worker_key_path}}" export TF_VAR_pimox_template_build_user="${TF_VAR_pimox_template_build_user:-${worker_user}}" export TF_VAR_pimox_template_guest_ip_prefix="${TF_VAR_pimox_template_guest_ip_prefix:-${ip_prefix}}" export TF_VAR_pimox_template_build_timeout_seconds="${TF_VAR_pimox_template_build_timeout_seconds:-${timeout_seconds}}" echo "Preparing Pimox provisioning and Debian worker template on ${pimox_host} without changing Orange Pi host networking..." run_tofu_stack "bootstrap/provisioning" if ((worker_count == 0)); then return 0 fi if ! 
pimox_ssh "${pimox_host}" "${pimox_user}" "${pimox_key}" "sudo '${qm_bin}' config '${template_vmid}' | grep -q '^template: 1$'"; then echo "Template VM ${template_vmid} is not available as a Pimox template after provisioning." >&2 exit 1 fi pimox_ssh "${pimox_host}" "${pimox_user}" "${pimox_key}" "sudo '${qm_bin}' set '${template_vmid}' --agent enabled=1" mkdir -p "${REPO_ROOT}/.lab" : >"${spec_file}" for ((index = 1; index <= worker_count; index++)); do ensure_pimox_worker_node \ "${index}" \ "${spec_file}" \ "${pimox_host}" \ "${pimox_user}" \ "${pimox_key}" \ "${template_vmid}" \ "${bridge}" \ "${worker_base_vmid}" \ "${worker_name_prefix}" \ "${worker_node_prefix}" \ "${worker_key_prefix}" \ "${worker_cores}" \ "${worker_memory}" \ "${worker_user}" \ "${worker_key_path}" \ "${ip_prefix}" \ "${timeout_seconds}" \ "${qm_bin}" done write_cluster_worker_var_file "${spec_file}" "${var_file}" export LAB_CLUSTER_VAR_FILE="${var_file}" } cleanup_calico_links() { ip link show | awk -F: '/^[0-9]+: cali/ {print $2}' | cut -d@ -f1 | xargs -r -n1 sudo ip link delete 2>/dev/null || true sudo ip link delete vxlan.calico 2>/dev/null || true sudo ip link delete tunl0 2>/dev/null || true sudo ip link delete cni0 2>/dev/null || true sudo ip link delete kube-ipvs0 2>/dev/null || true ip netns list | awk '/^(cni-|calico)/ {print $1}' | xargs -r -n1 sudo ip netns delete 2>/dev/null || true } cleanup_iptables() { sudo iptables -F || true sudo iptables -X || true sudo iptables -t nat -F || true sudo iptables -t nat -X || true sudo iptables -t mangle -F || true sudo iptables -t mangle -X || true sudo iptables -t raw -F || true sudo iptables -t raw -X || true if command -v ipvsadm >/dev/null 2>&1; then sudo ipvsadm --clear || true fi } cleanup_calico_runtime_files() { local path for path in /run/calico /var/run/calico; do if sudo test -e "${path}"; then sudo find "${path}" -path '*/cgroup*' -prune -o -mindepth 1 -exec rm -rf -- {} + 2>/dev/null || true sudo rmdir "${path}" 2>/dev/null 
|| true
    fi
  done
}

# Remove the homelab resolved drop-in, restore the saved /etc/resolv.conf when
# a backup exists, and restart systemd-resolved (best effort).
restore_node_dns() {
  sudo rm -f /etc/systemd/resolved.conf.d/homelab-k8s.conf

  if sudo test -e /etc/resolv.conf.homelab-k8s-backup; then
    sudo rm -f /etc/resolv.conf
    sudo mv /etc/resolv.conf.homelab-k8s-backup /etc/resolv.conf
  fi

  sudo systemctl restart systemd-resolved 2>/dev/null || true
}

# Best-effort unmount of everything below the kubelet, containerd and Calico
# directories. A forced unmount is tried first, then a lazy one.
cleanup_mounts() {
  # Both loop variables are function-local so nothing leaks into the caller.
  local mountpoint
  local mount_root

  if command -v findmnt >/dev/null 2>&1; then
    # Reverse-sorted so nested mount points are unmounted before their parents.
    while IFS= read -r mountpoint; do
      sudo umount -f "${mountpoint}" 2>/dev/null || sudo umount -l "${mountpoint}" 2>/dev/null || true
    done < <(
      for mount_root in /var/lib/kubelet /var/lib/containerd /run/calico /run/calico/cgroup /var/run/calico /var/run/calico/cgroup; do
        findmnt -Rno TARGET "${mount_root}" 2>/dev/null || true
      done | sort -ru
    )
  fi

  # Second pass: try every directory below the kubelet pods tree.
  while IFS= read -r mountpoint; do
    sudo umount -f "${mountpoint}" 2>/dev/null || sudo umount -l "${mountpoint}" 2>/dev/null || true
  done < <(find /var/lib/kubelet/pods -mindepth 2 -maxdepth 5 -type d 2>/dev/null || true)

  sudo umount -f /var/lib/containerd/srun/* 2>/dev/null || sudo umount -l /var/lib/containerd/srun/* 2>/dev/null || true
}

# Reset this node: kubeadm reset, stop kubelet/containerd, unmount and delete
# Kubernetes, CNI and containerd state, clean networking and DNS, then start
# containerd again.
cleanup_node() {
  sudo kubeadm reset --force || true
  sudo systemctl stop kubelet 2>/dev/null || true
  sudo systemctl stop containerd 2>/dev/null || true
  sudo killall containerd-shim-runc-v2 2>/dev/null || true

  cleanup_mounts

  sudo rm -rf \
    /etc/kubernetes/ \
    /var/lib/etcd/ \
    /var/lib/kubelet/ \
    /var/lib/cni/ \
    /etc/cni/net.d \
    /run/flannel \
    /var/lib/calico \
    /var/log/calico \
    /var/lib/containerd/* \
    /run/containerd/* \
    /etc/containerd/certs.d \
    /etc/containerd/config.toml

  cleanup_calico_runtime_files
  sudo rm -f /opt/cni/bin/calico /opt/cni/bin/calico-ipam
  cleanup_iptables
  cleanup_calico_links
  restore_node_dns

  sudo mkdir -p /etc/containerd/certs.d
  sudo systemctl reset-failed kubelet containerd 2>/dev/null || true
  sudo systemctl start containerd 2>/dev/null || true
}

# Print the registry host[:port] taken from the php-website image reference in
# apps/website/web-app.yaml; exits 1 when no such reference is found.
website_registry_endpoint() {
  local image

  image="$(awk '$1 == "image:" && $2 ~ /php-website/ {print $2; exit}' "${REPO_ROOT}/apps/website/web-app.yaml")"
  if [[ -z "${image}" || "${image}" != */* ]]; then
    echo "Could not determine website registry endpoint from apps/website/web-app.yaml" >&2
    exit 1
  fi

  printf '%s\n' "${image%%/*}"
}

# Same as website_registry_endpoint, for the demos-static image reference.
demos_registry_endpoint() {
  local image

  image="$(awk '$1 == "image:" && $2 ~ /demos-static/ {print $2; exit}' "${REPO_ROOT}/apps/demos-static/web-app.yaml")"
  if [[ -z "${image}" || "${image}" != */* ]]; then
    echo "Could not determine demos registry endpoint from apps/demos-static/web-app.yaml" >&2
    exit 1
  fi

  printf '%s\n' "${image%%/*}"
}

# Print one SHA-256 digest covering every file under apps/website.
website_source_hash() {
  (
    cd "${REPO_ROOT}"
    find apps/website -type f -print0 | sort -z | xargs -0 sha256sum | sha256sum | awk '{print $1}'
  )
}

# Print one SHA-256 digest covering every file under apps/demos-static.
demos_source_hash() {
  (
    cd "${REPO_ROOT}"
    find apps/demos-static -type f -print0 | sort -z | xargs -0 sha256sum | sha256sum | awk '{print $1}'
  )
}

# Return 0 when the registry serves a manifest for repository:tag over HTTP.
# Arguments: $1 registry endpoint, $2 repository, $3 tag.
# Returns 1 when curl is not installed.
registry_image_exists() {
  local registry_endpoint="$1"
  local repository="$2"
  local tag="$3"
  local accept_header

  if ! command -v curl >/dev/null 2>&1; then
    return 1
  fi

  accept_header="application/vnd.oci.image.index.v1+json, application/vnd.oci.image.manifest.v1+json, application/vnd.docker.distribution.manifest.list.v2+json, application/vnd.docker.distribution.manifest.v2+json"
  curl -fsS \
    -H "Accept: ${accept_header}" \
    "http://${registry_endpoint}/v2/${repository}/manifests/${tag}" >/dev/null
}

# Print the value stored for a key in a key=value state file; prints nothing
# when the key or the file is missing.
# Arguments: $1 state file, $2 key.
image_state_value() {
  local state_file="$1"
  local key="$2"

  awk -F= -v key="${key}" '$1 == key {print substr($0, index($0, "=") + 1); exit}' "${state_file}" 2>/dev/null || true
}

# Return 0 when the saved state matches the hash, platforms and image reference
# and the registry still serves php-website:latest.
# Arguments: $1 state file, $2 source hash, $3 platforms, $4 image ref, $5 registry endpoint.
website_image_is_current() {
  local state_file="$1"
  local source_hash="$2"
  local platforms="$3"
  local image_ref="$4"
  local registry_endpoint="$5"
  local saved_hash
  local saved_platforms
  local saved_image

  [[ -f "${state_file}" ]] || return 1

  saved_hash="$(image_state_value "${state_file}" source_hash)"
  saved_platforms="$(image_state_value "${state_file}" platforms)"
  saved_image="$(image_state_value "${state_file}" image)"

  [[ "${saved_hash}" == "${source_hash}" ]] || return 1
  [[ "${saved_platforms}" == "${platforms}" ]] || return 1
  [[ "${saved_image}" == "${image_ref}" ]] || return 1

  registry_image_exists "${registry_endpoint}" php-website latest
}

# Return 0 when the saved state matches the hash, platforms and image reference
# and the registry still serves demos-static:latest.
# Arguments: $1 state file, $2 source hash, $3 platforms, $4 image ref, $5 registry endpoint.
demos_image_is_current() {
  local state_file="$1"
  local source_hash="$2"
  local platforms="$3"
  local image_ref="$4"
  local registry_endpoint="$5"
  local saved_hash
  local saved_platforms
  local saved_image

  [[ -f "${state_file}" ]] || return 1

  saved_hash="$(image_state_value "${state_file}" source_hash)"
  saved_platforms="$(image_state_value "${state_file}" platforms)"
  saved_image="$(image_state_value "${state_file}" image)"

  [[ "${saved_hash}" == "${source_hash}" ]] || return 1
  [[ "${saved_platforms}" == "${platforms}" ]] || return 1
  [[ "${saved_image}" == "${image_ref}" ]] || return 1

  registry_image_exists "${registry_endpoint}" demos-static latest
}

# Write the website build state (source_hash, platforms, image) to the state file.
# Arguments: $1 state file, $2 source hash, $3 platforms, $4 image ref.
write_website_image_state() {
  local state_file="$1"
  local source_hash="$2"
  local platforms="$3"
  local image_ref="$4"

  mkdir -p "$(dirname "${state_file}")"
  {
    printf 'source_hash=%s\n' "${source_hash}"
    printf 'platforms=%s\n' "${platforms}"
    printf 'image=%s\n' "${image_ref}"
  } > "${state_file}"
}

# Write the demos build state (source_hash, platforms, image) to the state file.
# Arguments: $1 state file, $2 source hash, $3 platforms, $4 image ref.
write_demos_image_state() {
  local state_file="$1"
  local source_hash="$2"
  local platforms="$3"
  local image_ref="$4"

  mkdir -p "$(dirname "${state_file}")"
  {
    printf 'source_hash=%s\n' "${source_hash}"
    printf 'platforms=%s\n' "${platforms}"
    printf 'image=%s\n' "${image_ref}"
  } > "${state_file}"
}

# Print the free space in MiB of the filesystem holding the nearest existing
# ancestor of the given path.
path_available_mb() {
  local path="$1"

  # The loop condition is completed by the text that follows this line.
  while [[ ! \
-e "${path}" && "${path}" != "/" ]]; do
    path="$(dirname "${path}")"
  done

  df -Pm "${path}" | awk 'NR == 2 {print $4}'
}

# Print Docker's data root, or /var/lib/docker when `docker info` fails.
docker_root_dir() {
  docker info --format '{{.DockerRootDir}}' 2>/dev/null || printf '/var/lib/docker\n'
}

# Best-effort removal of the lab-builder buildx instance and all unused Docker
# build cache, images and containers.
prune_unused_docker_build_data() {
  docker buildx rm lab-builder 2>/dev/null || true
  docker rm -f buildx_buildkit_lab-builder0 2>/dev/null || true
  docker builder prune -af 2>/dev/null || true
  docker system prune -af 2>/dev/null || true
}

# Ensure Docker's data root has at least DOCKER_BUILD_MIN_FREE_MB (default 4096)
# MiB free. Prunes unused build data once when below the threshold and exits 1
# if that does not free enough space.
ensure_docker_build_space() {
  local docker_root
  local free_mb
  local min_free_mb

  min_free_mb="${DOCKER_BUILD_MIN_FREE_MB:-4096}"
  docker_root="$(docker_root_dir)"
  free_mb="$(path_available_mb "${docker_root}")"

  if (( free_mb >= min_free_mb )); then
    return 0
  fi

  echo "Docker data root ${docker_root} has ${free_mb}MiB free; pruning unused Docker build data..."
  prune_unused_docker_build_data

  free_mb="$(path_available_mb "${docker_root}")"
  if (( free_mb < min_free_mb )); then
    echo "Docker data root ${docker_root} still has only ${free_mb}MiB free after cleanup." >&2
    echo "Free space there or move Docker's data-root to a larger filesystem such as /home before building." >&2
    echo "Override the threshold with DOCKER_BUILD_MIN_FREE_MB if this host can build with less space." >&2
    exit 1
  fi
}

# Register QEMU emulators, write the BuildKit registry configuration and
# recreate the lab-builder buildx instance.
# Arguments: $1 - registry endpoint to allow over plain HTTP.
prepare_buildx_builder() {
  local registry_endpoint="$1"

  docker run --rm --privileged multiarch/qemu-user-static --reset -p yes

  # The configuration is written to BUILDX_CONFIG from an unquoted
  # here-document so that ${registry_endpoint} is expanded.
  cat >"${BUILDX_CONFIG}" <<EOF
[registry."${registry_endpoint}"]
http = true
insecure = true

[registry."127.0.0.1:30500"]
http = true
insecure = true

[registry."localhost:30500"]
http = true
insecure = true
EOF

  docker buildx rm lab-builder 2>/dev/null || true
  docker buildx create --name lab-builder --driver docker-container --driver-opt network=host --config "${BUILDX_CONFIG}" --use
  docker buildx inspect --bootstrap
}

# Print Argo CD diagnostics for one application; every command is best effort.
dump_argocd_debug() {
  local app="$1"

  kubectl --kubeconfig "${KUBECONFIG}" -n argocd get application "${app}" -o yaml || true
  kubectl --kubeconfig "${KUBECONFIG}" -n argocd describe application "${app}" || true
  kubectl --kubeconfig "${KUBECONFIG}" -n argocd get pods -o wide || true
  kubectl --kubeconfig "${KUBECONFIG}" -n argocd logs deployment/argocd-repo-server --tail=120 || true
  kubectl --kubeconfig "${KUBECONFIG}" -n argocd logs statefulset/argocd-application-controller --tail=120 || true
}

# Print resources, PVCs, pod descriptions and the last 80 events of a namespace.
dump_namespace_debug() {
  local namespace="$1"

  kubectl --kubeconfig "${KUBECONFIG}" -n "${namespace}" get all -o wide || true
  kubectl --kubeconfig "${KUBECONFIG}" -n "${namespace}" get pvc -o wide || true
  kubectl --kubeconfig "${KUBECONFIG}" -n "${namespace}" describe pods || true
  kubectl --kubeconfig "${KUBECONFIG}" -n "${namespace}" get events --sort-by=.lastTimestamp 2>/dev/null | tail -80 || true
}

# Poll every 5 seconds until the namespace exists; on timeout dump Argo CD
# diagnostics and exit 1.
# Arguments: $1 namespace, $2 Argo CD app name, $3 timeout in seconds.
wait_for_namespace() {
  local namespace="$1"
  local app="$2"
  local timeout_seconds="$3"
  local elapsed=0

  until kubectl --kubeconfig "${KUBECONFIG}" get namespace "${namespace}" >/dev/null 2>&1; do
    if ((elapsed >= timeout_seconds)); then
      echo "Timed out waiting for namespace ${namespace} from Argo CD app ${app}" >&2
      dump_argocd_debug "${app}"
      exit 1
    fi
    sleep 5
    elapsed=$((elapsed + 5))
  done
}

# Poll every 5 seconds until <kind>/<name> exists in the namespace; on timeout
# dump diagnostics and exit 1.
# Arguments: $1 namespace, $2 kind, $3 name, $4 Argo CD app name, $5 timeout in seconds.
wait_for_namespaced_resource() {
  local namespace="$1"
  local kind="$2"
  local name="$3"
  local app="$4"
  local timeout_seconds="$5"
  local elapsed=0
until kubectl --kubeconfig "${KUBECONFIG}" -n "${namespace}" get "${kind}/${name}" >/dev/null 2>&1; do if ((elapsed >= timeout_seconds)); then echo "Timed out waiting for ${kind}/${name} in namespace ${namespace} from Argo CD app ${app}" >&2 dump_argocd_debug "${app}" kubectl --kubeconfig "${KUBECONFIG}" -n "${namespace}" get events --sort-by=.lastTimestamp 2>/dev/null | tail -80 || true exit 1 fi sleep 5 elapsed=$((elapsed + 5)) done } wait_for_deployment_ready() { local namespace="$1" local deployment="$2" local app="$3" local timeout_seconds="$4" local desired_replicas local ready_replicas local elapsed=0 desired_replicas="$(kubectl --kubeconfig "${KUBECONFIG}" -n "${namespace}" get deployment "${deployment}" -o jsonpath='{.spec.replicas}' 2>/dev/null || true)" desired_replicas="${desired_replicas:-1}" until ready_replicas="$(kubectl --kubeconfig "${KUBECONFIG}" -n "${namespace}" get deployment "${deployment}" -o jsonpath='{.status.readyReplicas}' 2>/dev/null)"; \ (( ${ready_replicas:-0} >= desired_replicas )); do if ((elapsed >= timeout_seconds)); then echo "Timed out waiting for deployment/${deployment} in namespace ${namespace} to have ${desired_replicas} ready replicas" >&2 dump_argocd_debug "${app}" dump_namespace_debug "${namespace}" exit 1 fi sleep 5 elapsed=$((elapsed + 5)) done } apply_gitea_bootstrap_manifests() { kubectl --kubeconfig "${KUBECONFIG}" apply -f "${REPO_ROOT}/apps/gitea/namespace.yaml" kubectl --kubeconfig "${KUBECONFIG}" apply -f "${REPO_ROOT}/apps/gitea/storage.yaml" kubectl --kubeconfig "${KUBECONFIG}" apply -f "${REPO_ROOT}/apps/gitea/service.yaml" kubectl --kubeconfig "${KUBECONFIG}" apply -f "${REPO_ROOT}/apps/gitea/deployment.yaml" wait_for_namespace gitea-system gitea 300 wait_for_namespaced_resource gitea-system deployment gitea gitea 300 wait_for_deployment_ready gitea-system gitea gitea 300 } install_gitea_backup_timer() { local backup_script="/usr/local/sbin/homelab-gitea-backup.sh" sudo tee "${backup_script}" >/dev/null 
</dev/null 2>&1; then echo "kubectl is required for Gitea backups." >&2 exit 1 fi pod="\$(kubectl --kubeconfig "\${KUBECONFIG_PATH}" -n "\${GITEA_NAMESPACE}" get pods \ -l "\${GITEA_SELECTOR}" \ --field-selector=status.phase=Running \ -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)" if [[ -z "\${pod}" ]]; then echo "Skipping Gitea backup: no running Gitea pod found." exit 0 fi timestamp="\$(date -u +%Y%m%dT%H%M%SZ)" tmp_archive="\$(mktemp "/tmp/gitea-\${timestamp}.XXXXXX.zip")" backup_archive="\${GITEA_BACKUP_DIR}/gitea-\${timestamp}.zip" cleanup() { rm -f "\${tmp_archive}" kubectl --kubeconfig "\${KUBECONFIG_PATH}" -n "\${GITEA_NAMESPACE}" exec "\${pod}" -c "\${GITEA_CONTAINER}" -- rm -f "\${REMOTE_ARCHIVE}" >/dev/null 2>&1 || true } trap cleanup EXIT kubectl --kubeconfig "\${KUBECONFIG_PATH}" -n "\${GITEA_NAMESPACE}" exec "\${pod}" -c "\${GITEA_CONTAINER}" -- rm -f "\${REMOTE_ARCHIVE}" >/dev/null 2>&1 || true kubectl --kubeconfig "\${KUBECONFIG_PATH}" -n "\${GITEA_NAMESPACE}" exec "\${pod}" -c "\${GITEA_CONTAINER}" -- \ sh -c 'mkdir -p /data/git/repositories && chown git:git /data/git /data/git/repositories' kubectl --kubeconfig "\${KUBECONFIG_PATH}" -n "\${GITEA_NAMESPACE}" exec "\${pod}" -c "\${GITEA_CONTAINER}" -- \ su-exec git gitea dump -c /data/gitea/conf/app.ini --file "\${REMOTE_ARCHIVE}" kubectl --kubeconfig "\${KUBECONFIG_PATH}" -n "\${GITEA_NAMESPACE}" cp -c "\${GITEA_CONTAINER}" \ "\${GITEA_NAMESPACE}/\${pod}:\${REMOTE_ARCHIVE}" "\${tmp_archive}" sudo mkdir -p "\${GITEA_BACKUP_DIR}" sudo install -m 0640 -o root -g root "\${tmp_archive}" "\${backup_archive}" sudo find "\${GITEA_BACKUP_DIR}" -type f -name 'gitea-*.zip' -mtime +"\${GITEA_BACKUP_RETENTION_DAYS}" -delete echo "Created \${backup_archive}" BACKUP_SCRIPT_EOT sudo chmod 0755 "${backup_script}" sudo tee /etc/systemd/system/homelab-gitea-backup.service >/dev/null <<'SERVICE_EOT' [Unit] Description=Back up in-cluster Gitea to Debian host storage After=network-online.target 
Wants=network-online.target [Service] Type=oneshot ExecStart=/usr/local/sbin/homelab-gitea-backup.sh SERVICE_EOT sudo tee /etc/systemd/system/homelab-gitea-backup.timer >/dev/null <<'TIMER_EOT' [Unit] Description=Run daily Homelab Gitea backups [Timer] OnCalendar=*-*-* 02:35:00 RandomizedDelaySec=20m Persistent=true [Install] WantedBy=timers.target TIMER_EOT sudo systemctl daemon-reload sudo systemctl enable --now homelab-gitea-backup.timer >/dev/null } backup_gitea() { require_debian_server "backup-gitea" export KUBECONFIG="${KUBECONFIG_PATH}" install_gitea_backup_timer sudo /usr/local/sbin/homelab-gitea-backup.sh } install_gitea_runner() { local runner_arch local runner_home="${GITEA_RUNNER_HOME:-/home/jv/.local/share/gitea-runner/my-homelab-configs}" local runner_instance="${GITEA_RUNNER_INSTANCE_URL:-https://lab2025.duckdns.org/git/}" local runner_labels="${GITEA_RUNNER_LABELS:-homelab-debian:host}" local runner_name="${GITEA_RUNNER_NAME:-homelab-debian-my-homelab-configs}" local runner_token="${GITEA_RUNNER_REGISTRATION_TOKEN:-${1:-}}" local runner_user="${GITEA_RUNNER_USER:-jv}" local runner_version="${GITEA_ACT_RUNNER_VERSION:-0.2.11}" local missing_packages=() require_debian_server "install-gitea-runner" case "$(dpkg --print-architecture)" in amd64) runner_arch="linux-amd64" ;; arm64) runner_arch="linux-arm64" ;; *) echo "Unsupported Debian architecture: $(dpkg --print-architecture)" >&2 exit 1 ;; esac for package in ca-certificates curl git nodejs python3; do if ! 
dpkg-query -W -f='${Status}' "$package" 2>/dev/null | grep -q "install ok installed"; then missing_packages+=("$package") fi done if [[ ${#missing_packages[@]} -gt 0 ]]; then sudo apt-get update sudo apt-get install -y --no-install-recommends "${missing_packages[@]}" fi sudo curl -fsSL \ -o /usr/local/bin/act_runner \ "https://gitea.com/gitea/act_runner/releases/download/v${runner_version}/act_runner-${runner_version}-${runner_arch}" sudo chmod 0755 /usr/local/bin/act_runner sudo chown root:root /usr/local/bin/act_runner sudo -u "${runner_user}" mkdir -p "${runner_home}" if [[ ! -f "${runner_home}/.runner" ]]; then if [[ -z "${runner_token}" ]]; then echo "Set GITEA_RUNNER_REGISTRATION_TOKEN to the repository-level runner token from Gitea." >&2 exit 1 fi sudo -u "${runner_user}" env \ HOME="/home/${runner_user}" \ GITEA_RUNNER_HOME="${runner_home}" \ GITEA_RUNNER_INSTANCE_URL="${runner_instance}" \ GITEA_RUNNER_REGISTRATION_TOKEN="${runner_token}" \ GITEA_RUNNER_NAME="${runner_name}" \ GITEA_RUNNER_LABELS="${runner_labels}" \ bash -lc 'cd "${GITEA_RUNNER_HOME}" && /usr/local/bin/act_runner register --no-interactive --instance "${GITEA_RUNNER_INSTANCE_URL}" --token "${GITEA_RUNNER_REGISTRATION_TOKEN}" --name "${GITEA_RUNNER_NAME}" --labels "${GITEA_RUNNER_LABELS}"' else echo "Existing runner registration found at ${runner_home}/.runner; keeping it." fi sudo tee /etc/systemd/system/homelab-gitea-runner.service >/dev/null </dev/null sudo systemctl status homelab-gitea-runner.service --no-pager -l } recreate_pods_for_selector() { local namespace="$1" local selector="$2" local app="$3" if ! 
kubectl --kubeconfig "${KUBECONFIG}" -n "${namespace}" delete pod -l "${selector}" --ignore-not-found --wait=true --timeout=120s; then
    echo "Failed to recreate pods matching ${selector} in namespace ${namespace}" >&2
    dump_argocd_debug "${app}"
    dump_namespace_debug "${namespace}"
    exit 1
  fi
}

# Request a hard refresh of an Argo CD application by patching its refresh annotation.
refresh_argocd_application() {
  local app="$1"

  kubectl --kubeconfig "${KUBECONFIG}" patch application "${app}" -n argocd --type merge -p '{"metadata":{"annotations":{"argocd.argoproj.io/refresh":"hard"}}}' >/dev/null
}

# Deploy the applications: bootstrap Gitea, apply the apps stack, wait for the
# in-cluster registry, rebuild and push the website and demos images only when
# their sources, platforms or image reference changed, then restart and wait
# for the affected deployments.
apps() {
  local buildx_builder_ready=false
  local demos_image_built=false
  local demos_image_ref
  local demos_image_state_file
  local demos_platforms
  local demos_registry_endpoint
  local demos_source_hash
  local registry_endpoint
  local website_image_built=false
  local website_image_ref
  local website_image_state_file
  local website_platforms
  local website_source_hash

  require_debian_server "apps"

  # Several locals share their name with the helper function that fills them;
  # variables and functions live in separate namespaces, so the calls still
  # resolve to the functions.
  registry_endpoint="$(website_registry_endpoint)"
  demos_registry_endpoint="$(demos_registry_endpoint)"
  demos_image_ref="${registry_endpoint}/demos-static:latest"
  demos_image_state_file="${REPO_ROOT}/.lab/demos-static-image.state"
  demos_platforms="${DEMOS_IMAGE_PLATFORMS:-linux/arm64}"
  demos_source_hash="$(demos_source_hash)"
  website_image_ref="${registry_endpoint}/php-website:latest"
  website_image_state_file="${REPO_ROOT}/.lab/php-website-image.state"
  website_platforms="${WEBSITE_IMAGE_PLATFORMS:-linux/arm64}"
  website_source_hash="$(website_source_hash)"

  export TF_VAR_registry_endpoint="${TF_VAR_registry_endpoint:-${registry_endpoint}}"
  export TF_VAR_kubeconfig_path="${TF_VAR_kubeconfig_path:-${KUBECONFIG_PATH}}"
  export KUBECONFIG="${TF_VAR_kubeconfig_path}"

  if [[ "${TF_VAR_registry_endpoint}" != "${registry_endpoint}" ]]; then
    echo "TF_VAR_registry_endpoint must match apps/website/web-app.yaml (${registry_endpoint})" >&2
    exit 1
  fi

  if [[ "${demos_registry_endpoint}" != "${registry_endpoint}" ]]; then
    echo "apps/demos-static/web-app.yaml registry endpoint (${demos_registry_endpoint}) must match apps/website/web-app.yaml (${registry_endpoint})" >&2
    exit 1
  fi

  echo "Deploying homelab applications..."
  apply_gitea_bootstrap_manifests
  run_tofu_stack "bootstrap/apps"

  refresh_argocd_application container-registry
  refresh_argocd_application demos-static
  refresh_argocd_application gitea
  refresh_argocd_application website-production

  # The registry has to be up before images can be checked or pushed.
  wait_for_namespace container-registry container-registry 300
  wait_for_namespaced_resource container-registry deployment local-registry container-registry 300
  wait_for_deployment_ready container-registry local-registry container-registry 300

  if website_image_is_current "${website_image_state_file}" "${website_source_hash}" "${website_platforms}" "${website_image_ref}" "${registry_endpoint}"; then
    echo "Website image ${website_image_ref} is already current (${website_source_hash}); skipping build."
  else
    echo "Building website image ${website_image_ref} for ${website_platforms} (${website_source_hash})..."
    ensure_docker_build_space
    if [[ "${buildx_builder_ready}" != "true" ]]; then
      prepare_buildx_builder "${registry_endpoint}"
      buildx_builder_ready=true
    fi
    docker buildx build \
      --network host \
      --platform "${website_platforms}" \
      --provenance=false \
      --sbom=false \
      --label "dev.homelab.website.source-hash=${website_source_hash}" \
      -t "${website_image_ref}" \
      -f "${REPO_ROOT}/apps/website/Dockerfile" \
      "${REPO_ROOT}/apps/website/" \
      --push
    website_image_built=true
  fi

  if demos_image_is_current "${demos_image_state_file}" "${demos_source_hash}" "${demos_platforms}" "${demos_image_ref}" "${registry_endpoint}"; then
    echo "Demos image ${demos_image_ref} is already current (${demos_source_hash}); skipping build."
  else
    echo "Building demos image ${demos_image_ref} for ${demos_platforms} (${demos_source_hash})..."
    ensure_docker_build_space
    # The builder is prepared at most once per run.
    if [[ "${buildx_builder_ready}" != "true" ]]; then
      prepare_buildx_builder "${registry_endpoint}"
      buildx_builder_ready=true
    fi
    docker buildx build \
      --network host \
      --platform "${demos_platforms}" \
      --provenance=false \
      --sbom=false \
      --label "dev.homelab.demos.source-hash=${demos_source_hash}" \
      -t "${demos_image_ref}" \
      -f "${REPO_ROOT}/apps/demos-static/Dockerfile" \
      "${REPO_ROOT}/apps/demos-static/" \
      --push
    demos_image_built=true
  fi

  refresh_argocd_application website-production
  wait_for_namespace website-production website-production 300
  wait_for_namespaced_resource website-production deployment php-website-deployment website-production 300
  if [[ "${website_image_built}" == "true" ]]; then
    recreate_pods_for_selector website-production app=php-website website-production
  else
    echo "Skipping website pod restart because the image did not change."
  fi
  wait_for_deployment_ready website-production php-website-deployment website-production 300
  # State is recorded only after the deployment became ready with the new image.
  if [[ "${website_image_built}" == "true" ]]; then
    write_website_image_state "${website_image_state_file}" "${website_source_hash}" "${website_platforms}" "${website_image_ref}"
  fi

  refresh_argocd_application demos-static
  wait_for_namespace demos-static demos-static 300
  wait_for_namespaced_resource demos-static deployment demos-static demos-static 300
  if [[ "${demos_image_built}" == "true" ]]; then
    recreate_pods_for_selector demos-static app=demos-static demos-static
  else
    echo "Skipping demos pod restart because the image did not change."
  fi
  wait_for_deployment_ready demos-static demos-static demos-static 300
  if [[ "${demos_image_built}" == "true" ]]; then
    write_demos_image_state "${demos_image_state_file}" "${demos_source_hash}" "${demos_platforms}" "${demos_image_ref}"
  fi

  echo "Application deployment successfully completed."
}

# Bring the homelab up; refuses to run anywhere but the Debian server.
up() {
  require_debian_server "up"

  echo "Deploying the homelab infrastructure..."
run_pimox_pipeline run_tofu_stack "bootstrap/cluster" run_tofu_stack "bootstrap/platform" install_gitea_backup_timer apps run_tofu_stack "bootstrap/edge" echo "Deployment successfully completed." } nuke() { local worker_ssh_targets local worker_targets local target require_debian_server "nuke" echo "Brutally nuking the homelab infrastructure..." worker_ssh_targets="${WORKER_SSH_TARGETS-jv@192.168.100.89}" read -r -a worker_targets <<< "${worker_ssh_targets}" echo "--> Terminating local OpenTofu tasks..." killall tofu terraform 2>/dev/null || true echo "--> Eviscerating local Kubernetes components..." cleanup_node sudo rm -f "${KUBECONFIG_PATH}" for target in "${worker_targets[@]}"; do echo "--> Eviscerating remote Kubernetes components (${target})..." if ! ssh -o ConnectTimeout=5 "${target}" "bash -s" <<'EOF' set -euo pipefail cleanup_calico_links() { ip link show | awk -F: '/^[0-9]+: cali/ {print $2}' | cut -d@ -f1 | xargs -r -n1 sudo ip link delete 2>/dev/null || true sudo ip link delete vxlan.calico 2>/dev/null || true sudo ip link delete tunl0 2>/dev/null || true sudo ip link delete cni0 2>/dev/null || true sudo ip link delete kube-ipvs0 2>/dev/null || true ip netns list | awk '/^(cni-|calico)/ {print $1}' | xargs -r -n1 sudo ip netns delete 2>/dev/null || true } cleanup_iptables() { sudo iptables -F || true sudo iptables -X || true sudo iptables -t nat -F || true sudo iptables -t nat -X || true sudo iptables -t mangle -F || true sudo iptables -t mangle -X || true sudo iptables -t raw -F || true sudo iptables -t raw -X || true if command -v ipvsadm >/dev/null 2>&1; then sudo ipvsadm --clear || true fi } cleanup_calico_runtime_files() { local path for path in /run/calico /var/run/calico; do if sudo test -e "${path}"; then sudo find "${path}" -path '*/cgroup*' -prune -o -mindepth 1 -exec rm -rf -- {} + 2>/dev/null || true sudo rmdir "${path}" 2>/dev/null || true fi done } restore_node_dns() { sudo rm -f /etc/systemd/resolved.conf.d/homelab-k8s.conf if sudo test 
-e /etc/resolv.conf.homelab-k8s-backup; then sudo rm -f /etc/resolv.conf sudo mv /etc/resolv.conf.homelab-k8s-backup /etc/resolv.conf fi sudo systemctl restart systemd-resolved 2>/dev/null || true } cleanup_mounts() { if command -v findmnt >/dev/null 2>&1; then local mount_root while IFS= read -r mountpoint; do sudo umount -f "${mountpoint}" 2>/dev/null || sudo umount -l "${mountpoint}" 2>/dev/null || true done < <( for mount_root in /var/lib/kubelet /var/lib/containerd /run/calico /run/calico/cgroup /var/run/calico /var/run/calico/cgroup; do findmnt -Rno TARGET "${mount_root}" 2>/dev/null || true done | sort -ru ) fi while IFS= read -r mountpoint; do sudo umount -f "${mountpoint}" 2>/dev/null || sudo umount -l "${mountpoint}" 2>/dev/null || true done < <(find /var/lib/kubelet/pods -mindepth 2 -maxdepth 5 -type d 2>/dev/null || true) sudo umount -f /var/lib/containerd/srun/* 2>/dev/null || sudo umount -l /var/lib/containerd/srun/* 2>/dev/null || true } sudo kubeadm reset --force || true sudo systemctl stop kubelet 2>/dev/null || true sudo systemctl stop containerd 2>/dev/null || true sudo killall containerd-shim-runc-v2 2>/dev/null || true cleanup_mounts sudo rm -rf \ /etc/kubernetes/ \ /var/lib/etcd/ \ /var/lib/kubelet/ \ /var/lib/cni/ \ /etc/cni/net.d \ /run/flannel \ /var/lib/calico \ /var/log/calico \ /var/lib/containerd/* \ /run/containerd/* \ /etc/containerd/certs.d \ /etc/containerd/config.toml cleanup_calico_runtime_files sudo rm -f /opt/cni/bin/calico /opt/cni/bin/calico-ipam cleanup_iptables cleanup_calico_links restore_node_dns sudo mkdir -p /etc/containerd/certs.d sudo systemctl reset-failed kubelet containerd 2>/dev/null || true sudo systemctl start containerd 2>/dev/null || true EOF then echo "Remote cleanup failed for ${target}; not deleting OpenTofu state." 
>&2 exit 1 fi done docker buildx rm lab-builder 2>/dev/null || true docker rm -f buildx_buildkit_lab-builder0 2>/dev/null || true rm -f "${BUILDX_CONFIG}" || true echo "--> Deleting OpenTofu tracking state files..." rm -rf "${REPO_ROOT}"/bootstrap/cluster/terraform.tfstate* rm -f "${REPO_ROOT}"/bootstrap/cluster/.terraform.tfstate.lock.info rm -rf "${REPO_ROOT}"/bootstrap/cluster/.terraform/ rm -rf "${REPO_ROOT}"/bootstrap/platform/terraform.tfstate* rm -f "${REPO_ROOT}"/bootstrap/platform/.terraform.tfstate.lock.info rm -rf "${REPO_ROOT}"/bootstrap/platform/.terraform/ rm -rf "${REPO_ROOT}"/bootstrap/apps/terraform.tfstate* rm -f "${REPO_ROOT}"/bootstrap/apps/.terraform.tfstate.lock.info rm -rf "${REPO_ROOT}"/bootstrap/apps/.terraform/ rm -rf "${REPO_ROOT}"/bootstrap/edge/terraform.tfstate* rm -f "${REPO_ROOT}"/bootstrap/edge/.terraform.tfstate.lock.info rm -rf "${REPO_ROOT}"/bootstrap/edge/.terraform/ echo "Destruction complete. Retained data under /var/openebs/local was left intact." } case "${1:-}" in up) up ;; apps) apps ;; backup-gitea) backup_gitea ;; install-gitea-runner) install_gitea_runner "${2:-}" ;; nuke) nuke ;; *) echo "Usage: $0 {up|apps|backup-gitea|install-gitea-runner|nuke}" exit 1 ;; esac