#!/usr/bin/env bash
#
# Homelab Kubernetes bootstrap / teardown driver.
#
# Usage: <this script> {up|nuke}
#   up    Prepare a docker buildx builder, apply the OpenTofu stacks under
#         bootstrap/, build and push the website image, and wait for the
#         Argo CD managed workloads to become ready.
#   nuke  Reset Kubernetes state on this node and on every worker reachable
#         over ssh, then delete the local OpenTofu state files.
#
# Environment:
#   KUBECONFIG_PATH           kubeconfig location. Falls back to
#                             TF_VAR_kubeconfig_path, then
#                             /home/jv/.kube/config.
#   TF_VAR_registry_endpoint  Optional. When set it must equal the registry
#                             host parsed from apps/website/web-app.yaml.
#   WORKER_SSH_TARGETS        Whitespace-separated ssh targets cleaned by
#                             `nuke`. Defaults to jv@192.168.100.89; set it
#                             to the empty string to skip remote cleanup.
set -euo pipefail

# Assign first, mark readonly afterwards, so a failing `cd` is not masked.
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BUILDX_CONFIG="/tmp/buildx-config.toml"
KUBECONFIG_PATH="${KUBECONFIG_PATH:-${TF_VAR_kubeconfig_path:-/home/jv/.kube/config}}"
readonly REPO_ROOT BUILDX_CONFIG KUBECONFIG_PATH

# Functions shipped to worker nodes by `nuke` (serialised with `declare -f`).
# Every function that cleanup_node calls, directly or indirectly, must be
# listed here, and none of them may depend on variables from this script.
readonly -a REMOTE_CLEANUP_FUNCTIONS=(
  cleanup_calico_links
  cleanup_iptables
  cleanup_calico_runtime_files
  restore_node_dns
  umount_force_or_lazy
  cleanup_mounts
  cleanup_node
)

# The buildx config file is only needed while `up` runs; remove it on any exit.
trap 'rm -f "${BUILDX_CONFIG}"' EXIT

# Run `tofu init` followed by `tofu apply -auto-approve` for one stack.
# Arguments: $1 - stack directory relative to REPO_ROOT
run_tofu_stack() {
  local stack="$1"

  tofu -chdir="${REPO_ROOT}/${stack}" init
  tofu -chdir="${REPO_ROOT}/${stack}" apply -auto-approve
}

# Best-effort removal of network links and network namespaces left behind by
# the CNI. Every step tolerates failure.
cleanup_calico_links() {
  local link

  # Interfaces whose name starts with "cali"; cut drops the "@peer" suffix.
  ip link show \
    | awk -F: '/^[0-9]+: cali/ {print $2}' \
    | cut -d@ -f1 \
    | xargs -r -n1 sudo ip link delete 2>/dev/null || true

  for link in vxlan.calico tunl0 cni0 kube-ipvs0; do
    sudo ip link delete "${link}" 2>/dev/null || true
  done

  ip netns list \
    | awk '/^(cni-|calico)/ {print $1}' \
    | xargs -r -n1 sudo ip netns delete 2>/dev/null || true
}

# Flush all rules (-F) and delete user-defined chains (-X) in the filter,
# nat, mangle and raw tables, then clear IPVS when ipvsadm is installed.
cleanup_iptables() {
  local table

  for table in filter nat mangle raw; do
    sudo iptables -t "${table}" -F || true
    sudo iptables -t "${table}" -X || true
  done

  if command -v ipvsadm >/dev/null 2>&1; then
    sudo ipvsadm --clear || true
  fi
}

# Remove the contents of the Calico runtime directories while pruning any
# path matching */cgroup*, then remove the directory itself if it is empty.
cleanup_calico_runtime_files() {
  local path

  for path in /run/calico /var/run/calico; do
    if sudo test -e "${path}"; then
      # -mindepth is a global find option, so it is placed before the tests.
      sudo find "${path}" -mindepth 1 -path '*/cgroup*' -prune -o \
        -exec rm -rf -- {} + 2>/dev/null || true
      sudo rmdir "${path}" 2>/dev/null || true
    fi
  done
}

# Remove the homelab systemd-resolved drop-in, restore the backed-up
# /etc/resolv.conf when a backup exists, and restart systemd-resolved.
restore_node_dns() {
  sudo rm -f /etc/systemd/resolved.conf.d/homelab-k8s.conf

  if sudo test -e /etc/resolv.conf.homelab-k8s-backup; then
    sudo rm -f /etc/resolv.conf
    sudo mv /etc/resolv.conf.homelab-k8s-backup /etc/resolv.conf
  fi

  sudo systemctl restart systemd-resolved 2>/dev/null || true
}

# Unmount the given targets: forced first, lazy as a fallback. Never fails.
# Arguments: $@ - mount points
umount_force_or_lazy() {
  sudo umount -f "$@" 2>/dev/null \
    || sudo umount -l "$@" 2>/dev/null \
    || true
}

# Best-effort unmount of everything mounted below the kubelet, containerd
# and Calico directories.
cleanup_mounts() {
  local mount_root
  local mountpoint

  if command -v findmnt >/dev/null 2>&1; then
    # Reverse-sorted and de-duplicated so nested mounts are released before
    # their parents.
    while IFS= read -r mountpoint; do
      umount_force_or_lazy "${mountpoint}"
    done < <(
      for mount_root in \
        /var/lib/kubelet \
        /var/lib/containerd \
        /run/calico \
        /run/calico/cgroup \
        /var/run/calico \
        /var/run/calico/cgroup; do
        findmnt -Rno TARGET "${mount_root}" 2>/dev/null || true
      done | sort -ru
    )
  fi

  # Second pass that does not rely on findmnt: try every directory two to
  # five levels below the kubelet pods directory.
  # NOTE(review): this find runs without sudo - confirm the invoking user can
  # read /var/lib/kubelet/pods, otherwise the pass silently does nothing.
  while IFS= read -r mountpoint; do
    umount_force_or_lazy "${mountpoint}"
  done < <(find /var/lib/kubelet/pods -mindepth 2 -maxdepth 5 -type d 2>/dev/null || true)

  # NOTE(review): the glob is expanded by the invoking shell before sudo
  # runs - confirm that user can list /var/lib/containerd/srun.
  umount_force_or_lazy /var/lib/containerd/srun/*
}

# Reset Kubernetes on this node: kubeadm reset, stop services, unmount,
# delete state directories, clean networking, restore DNS, and start
# containerd again. The same function is executed on worker nodes by `nuke`.
cleanup_node() {
  sudo kubeadm reset --force || true
  sudo systemctl stop kubelet 2>/dev/null || true
  sudo systemctl stop containerd 2>/dev/null || true
  sudo killall containerd-shim-runc-v2 2>/dev/null || true

  cleanup_mounts

  # NOTE(review): the /var/lib/containerd/* and /run/containerd/* globs are
  # expanded before sudo runs - confirm the invoking user can list them.
  sudo rm -rf \
    /etc/kubernetes/ \
    /var/lib/etcd/ \
    /var/lib/kubelet/ \
    /var/lib/cni/ \
    /etc/cni/net.d \
    /run/flannel \
    /var/lib/calico \
    /var/log/calico \
    /var/lib/containerd/* \
    /run/containerd/* \
    /etc/containerd/certs.d \
    /etc/containerd/config.toml

  cleanup_calico_runtime_files
  sudo rm -f /opt/cni/bin/calico /opt/cni/bin/calico-ipam
  cleanup_iptables
  cleanup_calico_links
  restore_node_dns

  sudo mkdir -p /etc/containerd/certs.d
  sudo systemctl reset-failed kubelet containerd 2>/dev/null || true
  sudo systemctl start containerd 2>/dev/null || true
}

# Print the script that `nuke` runs on each worker: strict mode, the shared
# cleanup function definitions, and finally the cleanup_node call.
# Returns: non-zero if any listed function is not defined.
remote_cleanup_script() {
  printf '%s\n' 'set -euo pipefail'
  declare -f "${REMOTE_CLEANUP_FUNCTIONS[@]}" || return 1
  # The call is the last line, so once it starts executing nothing is left
  # on the remote shell's stdin for child processes to consume.
  printf '%s\n' 'cleanup_node'
}

# Print the registry host (everything before the first "/") of the first
# "image:" entry in apps/website/web-app.yaml whose value matches
# php-website. Exits 1 when no such image with a registry prefix is found.
website_registry_endpoint() {
  local image

  image="$(awk '$1 == "image:" && $2 ~ /php-website/ {print $2; exit}' "${REPO_ROOT}/apps/website/web-app.yaml")"
  if [[ -z "${image}" || "${image}" != */* ]]; then
    echo "Could not determine website registry endpoint from apps/website/web-app.yaml" >&2
    exit 1
  fi

  printf '%s\n' "${image%%/*}"
}

# Dump Argo CD state useful for diagnosing a stuck application. Best effort.
# Globals:   KUBECONFIG (read; exported by up)
# Arguments: $1 - Argo CD application name
dump_argocd_debug() {
  local app="$1"

  kubectl --kubeconfig "${KUBECONFIG}" -n argocd get application "${app}" -o yaml || true
  kubectl --kubeconfig "${KUBECONFIG}" -n argocd describe application "${app}" || true
  kubectl --kubeconfig "${KUBECONFIG}" -n argocd get pods -o wide || true
  kubectl --kubeconfig "${KUBECONFIG}" -n argocd logs deployment/argocd-repo-server --tail=120 || true
  kubectl --kubeconfig "${KUBECONFIG}" -n argocd logs statefulset/argocd-application-controller --tail=120 || true
}

# Dump workloads, PVCs, pod descriptions and recent events of a namespace.
# Best effort.
# Globals:   KUBECONFIG (read)
# Arguments: $1 - namespace
dump_namespace_debug() {
  local namespace="$1"

  kubectl --kubeconfig "${KUBECONFIG}" -n "${namespace}" get all -o wide || true
  kubectl --kubeconfig "${KUBECONFIG}" -n "${namespace}" get pvc -o wide || true
  kubectl --kubeconfig "${KUBECONFIG}" -n "${namespace}" describe pods || true
  kubectl --kubeconfig "${KUBECONFIG}" -n "${namespace}" get events --sort-by=.lastTimestamp 2>/dev/null | tail -80 || true
}

# Poll every 5 seconds until the namespace exists; on timeout dump Argo CD
# debug output and exit 1.
# Globals:   KUBECONFIG (read)
# Arguments: $1 - namespace, $2 - Argo CD app name, $3 - timeout in seconds
wait_for_namespace() {
  local namespace="$1"
  local app="$2"
  local timeout_seconds="$3"
  local elapsed=0

  until kubectl --kubeconfig "${KUBECONFIG}" get namespace "${namespace}" >/dev/null 2>&1; do
    if ((elapsed >= timeout_seconds)); then
      echo "Timed out waiting for namespace ${namespace} from Argo CD app ${app}" >&2
      dump_argocd_debug "${app}"
      exit 1
    fi
    sleep 5
    elapsed=$((elapsed + 5))
  done
}

# Poll every 5 seconds until <kind>/<name> exists in the namespace; on
# timeout dump Argo CD debug output plus recent events and exit 1.
# Globals:   KUBECONFIG (read)
# Arguments: $1 - namespace, $2 - resource kind, $3 - resource name,
#            $4 - Argo CD app name, $5 - timeout in seconds
wait_for_namespaced_resource() {
  local namespace="$1"
  local kind="$2"
  local name="$3"
  local app="$4"
  local timeout_seconds="$5"
  local elapsed=0

  until kubectl --kubeconfig "${KUBECONFIG}" -n "${namespace}" get "${kind}/${name}" >/dev/null 2>&1; do
    if ((elapsed >= timeout_seconds)); then
      echo "Timed out waiting for ${kind}/${name} in namespace ${namespace} from Argo CD app ${app}" >&2
      dump_argocd_debug "${app}"
      kubectl --kubeconfig "${KUBECONFIG}" -n "${namespace}" get events --sort-by=.lastTimestamp 2>/dev/null | tail -80 || true
      exit 1
    fi
    sleep 5
    elapsed=$((elapsed + 5))
  done
}

# Poll every 5 seconds until the deployment reports at least as many ready
# replicas as .spec.replicas (read once up front, defaulting to 1 when it
# cannot be read); on timeout dump debug output and exit 1.
# Globals:   KUBECONFIG (read)
# Arguments: $1 - namespace, $2 - deployment name, $3 - Argo CD app name,
#            $4 - timeout in seconds
wait_for_deployment_ready() {
  local namespace="$1"
  local deployment="$2"
  local app="$3"
  local timeout_seconds="$4"
  local desired_replicas
  local ready_replicas
  local elapsed=0

  desired_replicas="$(kubectl --kubeconfig "${KUBECONFIG}" -n "${namespace}" get deployment "${deployment}" -o jsonpath='{.spec.replicas}' 2>/dev/null || true)"
  desired_replicas="${desired_replicas:-1}"

  until
    # A failed or empty query counts as zero ready replicas.
    ready_replicas="$(kubectl --kubeconfig "${KUBECONFIG}" -n "${namespace}" get deployment "${deployment}" -o jsonpath='{.status.readyReplicas}' 2>/dev/null)" || true
    (( ${ready_replicas:-0} >= desired_replicas ))
  do
    if ((elapsed >= timeout_seconds)); then
      echo "Timed out waiting for deployment/${deployment} in namespace ${namespace} to have ${desired_replicas} ready replicas" >&2
      dump_argocd_debug "${app}"
      dump_namespace_debug "${namespace}"
      exit 1
    fi
    sleep 5
    elapsed=$((elapsed + 5))
  done
}

# Delete the pods matching a label selector and wait (up to 120s) for the
# deletion to finish; on failure dump debug output and exit 1.
# Globals:   KUBECONFIG (read)
# Arguments: $1 - namespace, $2 - label selector, $3 - Argo CD app name
recreate_pods_for_selector() {
  local namespace="$1"
  local selector="$2"
  local app="$3"

  if ! kubectl --kubeconfig "${KUBECONFIG}" -n "${namespace}" delete pod -l "${selector}" --ignore-not-found --wait=true --timeout=120s; then
    echo "Failed to recreate pods matching ${selector} in namespace ${namespace}" >&2
    dump_argocd_debug "${app}"
    dump_namespace_debug "${namespace}"
    exit 1
  fi
}

# Set the argocd.argoproj.io/refresh=hard annotation on an Argo CD
# application.
# Globals:   KUBECONFIG (read)
# Arguments: $1 - Argo CD application name
refresh_argocd_application() {
  local app="$1"

  kubectl --kubeconfig "${KUBECONFIG}" patch application "${app}" -n argocd --type merge -p '{"metadata":{"annotations":{"argocd.argoproj.io/refresh":"hard"}}}' >/dev/null
}

# Deploy the homelab: builder setup, OpenTofu stacks, image build/push and
# readiness waits.
# Globals: exports TF_VAR_registry_endpoint, TF_VAR_kubeconfig_path and
#          KUBECONFIG; writes BUILDX_CONFIG.
up() {
  local registry_endpoint

  registry_endpoint="$(website_registry_endpoint)"
  export TF_VAR_registry_endpoint="${TF_VAR_registry_endpoint:-${registry_endpoint}}"
  export TF_VAR_kubeconfig_path="${TF_VAR_kubeconfig_path:-${KUBECONFIG_PATH}}"
  export KUBECONFIG="${TF_VAR_kubeconfig_path}"

  if [[ "${TF_VAR_registry_endpoint}" != "${registry_endpoint}" ]]; then
    echo "TF_VAR_registry_endpoint must match apps/website/web-app.yaml (${registry_endpoint})" >&2
    exit 1
  fi

  echo "Deploying the homelab infrastructure..."

  docker run --rm --privileged multiarch/qemu-user-static --reset -p yes

  # Write the buildx registry configuration. The heredoc delimiter is
  # unquoted on purpose so ${registry_endpoint} is expanded.
  cat > "${BUILDX_CONFIG}" <<EOF
[registry."${registry_endpoint}"]
  http = true
  insecure = true

[registry."127.0.0.1:30500"]
  http = true
  insecure = true

[registry."localhost:30500"]
  http = true
  insecure = true
EOF

  docker buildx rm lab-builder 2>/dev/null || true
  docker buildx create --name lab-builder --driver docker-container --driver-opt network=host --config "${BUILDX_CONFIG}" --use
  docker buildx inspect --bootstrap

  run_tofu_stack "bootstrap/cluster"
  run_tofu_stack "bootstrap/platform"
  run_tofu_stack "bootstrap/apps"

  refresh_argocd_application container-registry
  refresh_argocd_application gitea
  refresh_argocd_application website-production

  # The registry has to be ready before the image can be pushed to it.
  wait_for_namespace container-registry container-registry 300
  wait_for_namespaced_resource container-registry deployment local-registry container-registry 300
  wait_for_deployment_ready container-registry local-registry container-registry 300

  docker buildx build \
    --network host \
    --platform linux/amd64,linux/arm64 \
    -t "${registry_endpoint}/php-website:latest" \
    -f "${REPO_ROOT}/apps/website/Dockerfile" \
    "${REPO_ROOT}/apps/website/" \
    --push

  refresh_argocd_application website-production
  wait_for_namespace website-production website-production 300
  wait_for_namespaced_resource website-production deployment php-website-deployment website-production 300
  recreate_pods_for_selector website-production app=php-website website-production
  wait_for_deployment_ready website-production php-website-deployment website-production 300

  echo "Deployment successfully completed."
}

# Tear everything down: clean this node, clean each worker over ssh, remove
# the buildx builder and delete the local OpenTofu state. Exits 1 without
# touching the OpenTofu state if a remote cleanup fails.
# Globals: WORKER_SSH_TARGETS (read), KUBECONFIG_PATH, REPO_ROOT,
#          BUILDX_CONFIG
nuke() {
  local worker_ssh_targets
  local -a worker_targets=()
  local target
  local remote_script
  local stack

  echo "Brutally nuking the homelab infrastructure..."

  # "-" rather than ":-": an explicitly empty value means "no workers".
  worker_ssh_targets="${WORKER_SSH_TARGETS-jv@192.168.100.89}"
  read -r -a worker_targets <<< "${worker_ssh_targets}"

  echo "--> Terminating local OpenTofu tasks..."
  killall tofu terraform 2>/dev/null || true

  echo "--> Eviscerating local Kubernetes components..."
  cleanup_node
  sudo rm -f "${KUBECONFIG_PATH}"

  # Workers run exactly the functions used locally, so the two cannot drift.
  remote_script="$(remote_cleanup_script)"

  # The ${arr[@]+...} form keeps an empty list from tripping `set -u` on
  # bash releases older than 4.4.
  for target in ${worker_targets[@]+"${worker_targets[@]}"}; do
    echo "--> Eviscerating remote Kubernetes components (${target})..."
    if ! ssh -o ConnectTimeout=5 "${target}" "bash -s" <<< "${remote_script}"; then
      echo "Remote cleanup failed for ${target}; not deleting OpenTofu state." >&2
      exit 1
    fi
  done

  docker buildx rm lab-builder 2>/dev/null || true
  docker rm -f buildx_buildkit_lab-builder0 2>/dev/null || true
  rm -f "${BUILDX_CONFIG}" || true

  echo "--> Deleting OpenTofu tracking state files..."
  for stack in cluster platform apps; do
    rm -rf "${REPO_ROOT}"/bootstrap/"${stack}"/terraform.tfstate*
    rm -f "${REPO_ROOT}"/bootstrap/"${stack}"/.terraform.tfstate.lock.info
    rm -rf "${REPO_ROOT}"/bootstrap/"${stack}"/.terraform/
  done

  echo "Destruction complete. Retained data under /var/openebs/local was left intact."
}

case "${1:-}" in
  up)
    up
    ;;
  nuke)
    nuke
    ;;
  *)
    echo "Usage: $0 {up|nuke}" >&2
    exit 1
    ;;
esac