Add durable platform workload scheduling
Homelab Main / deploy (push) Successful in 1m14s Details

This commit is contained in:
juvdiaz 2026-05-29 09:44:43 -06:00
parent 240a55e826
commit 47c018b6dc
5 changed files with 241 additions and 32 deletions

View File

@ -212,12 +212,12 @@ duplicate those PV manifests when you want storage on another node.
- `node-role.kubernetes.io/worker=worker` on every worker so `kubectl get nodes` - `node-role.kubernetes.io/worker=worker` on every worker so `kubectl get nodes`
shows `worker` instead of `<none>` in the ROLES column shows `worker` instead of `<none>` in the ROLES column
- `homelab.dev/node-role=control-plane` and `homelab.dev/storage=local` on the - `homelab.dev/node-role=control-plane`, `homelab.dev/storage=local`, and
Debian control plane `homelab.dev/workload-class=control-plane` on the Debian control plane
- `homelab.dev/node-role=edge-app` and `homelab.dev/storage=local` on the - `homelab.dev/node-role=edge-app`, `homelab.dev/storage=local`, and
Raspberry Pi worker `homelab.dev/workload-class=edge` on the Raspberry Pi worker
- `homelab.dev/node-role=app` and `homelab.dev/storage=nvme` on automated Pimox - `homelab.dev/node-role=app`, `homelab.dev/storage=nvme`, and
worker clones `homelab.dev/workload-class=platform` on automated Pimox worker clones
Override `control_plane_node_labels`, `worker_node_labels`, Override `control_plane_node_labels`, `worker_node_labels`,
`LAB_RASPBERRY_NODE_LABELS_JSON`, or `LAB_PIMOX_WORKER_NODE_LABELS_JSON` when `LAB_RASPBERRY_NODE_LABELS_JSON`, or `LAB_PIMOX_WORKER_NODE_LABELS_JSON` when
@ -227,14 +227,18 @@ OpenEBS hostpath PVs are node-local. Move workloads only after their storage and
edge path are ready on the target node. Gitea is outside Kubernetes and is moved edge path are ready on the target node. Gitea is outside Kubernetes and is moved
by changing the Raspberry Pi Docker install target instead. by changing the Raspberry Pi Docker install target instead.
The Prometheus stack control workloads are pinned to Pimox worker nodes by the The stateless platform controllers are pinned to Pimox worker nodes through
default `prometheus_stack_node_selector` (`homelab.dev/node-role=app` and `homelab.dev/workload-class=platform` and include hostname topology spread plus
`homelab.dev/storage=nvme`). Because the Prometheus, Alertmanager, and Grafana preferred pod anti-affinity so future Argo CD, Kyverno, Prometheus operator, and
PVCs use retained local OpenEBS volumes, moving an existing install off the kube-state-metrics scheduling does not collapse onto the first worker that joins.
Debian control plane requires discarding those PVCs. Run PVC-backed monitoring StatefulSets are intentionally treated separately because
their retained OpenEBS hostpath volumes are node-local. Run
`./lab.sh move-prometheus-stack-workers` from the Debian host to label existing `./lab.sh move-prometheus-stack-workers` from the Debian host to label existing
worker nodes, destroy only the existing `prometheus-stack` Helm release, delete worker nodes, destroy only the existing `prometheus-stack` Helm release, delete
its retained PVC/PV objects, and recreate the stack on the worker selector. its retained PVC/PV objects, and recreate the stack on the worker selector when
you intentionally accept losing that monitoring data. A planned monitoring data
migration should be handled as a separate maintenance task with backup,
delete/recreate or storage migration steps, and post-restore checks.
The website and demos NodePorts are reachable from the OCI jump box through the The website and demos NodePorts are reachable from the OCI jump box through the
Raspberry Pi Tailscale interface. `bootstrap/cluster` installs a persistent Raspberry Pi Tailscale interface. `bootstrap/cluster` installs a persistent

View File

@ -8,6 +8,7 @@ variable "control_plane_node_labels" {
default = { default = {
"homelab.dev/node-role" = "control-plane" "homelab.dev/node-role" = "control-plane"
"homelab.dev/storage" = "local" "homelab.dev/storage" = "local"
"homelab.dev/workload-class" = "control-plane"
} }
} }

View File

@ -105,15 +105,169 @@ EOT
var.metallb.l2_advertisement_enabled ? local.metallb_l2_advertisement_manifest : "", var.metallb.l2_advertisement_enabled ? local.metallb_l2_advertisement_manifest : "",
])) ]))
platform_topology_key = "kubernetes.io/hostname"
prometheus_stack_node_selector = var.prometheus_stack_node_selector prometheus_stack_node_selector = var.prometheus_stack_node_selector
argocd_node_selector = { argocd_node_selector = {
"kubernetes.io/os" = "linux" "kubernetes.io/os" = "linux"
"homelab.dev/node-role" = "app" "homelab.dev/workload-class" = "platform"
} }
kyverno_node_selector = { kyverno_node_selector = {
"kubernetes.io/os" = "linux" "kubernetes.io/os" = "linux"
"homelab.dev/node-role" = "app" "homelab.dev/workload-class" = "platform"
} }
# Pod label value for each Argo CD component, keyed by a short component
# identifier. Each value is used as the "app.kubernetes.io/name" label in
# local.argocd_component_match_labels, and the keys are the ones the
# helm_release values look up (application_set, controller, dex, ...).
# NOTE(review): the values presumably match the names the Argo CD chart
# renders for its pods — confirm against the deployed pod labels.
argocd_component_label_values = {
application_set = "argocd-applicationset-controller"
controller = "argocd-application-controller"
dex = "argocd-dex-server"
notifications = "argocd-notifications-controller"
redis = "argocd-redis"
repo_server = "argocd-repo-server"
server = "argocd-server"
}
# Per-component matchLabels map: for every entry in
# local.argocd_component_label_values, produce a single-label selector on
# "app.kubernetes.io/name" under the same component key.
argocd_component_match_labels = {
for component, name in local.argocd_component_label_values : component => {
"app.kubernetes.io/name" = name
}
}
# Per-component affinity object containing one preferred (soft) pod
# anti-affinity term with weight 100. The term selects pods carrying the
# component's own labels and uses local.platform_topology_key as the topology
# domain, so the scheduler prefers not to co-locate pods of the same component.
# NOTE(review): the selector matches only the same component, so this does not
# by itself push *different* Argo CD components onto different nodes — confirm
# that per-component separation is the intent.
argocd_component_affinity = {
for component, labels in local.argocd_component_match_labels : component => {
podAntiAffinity = {
preferredDuringSchedulingIgnoredDuringExecution = [
{
weight = 100
podAffinityTerm = {
labelSelector = {
matchLabels = labels
}
topologyKey = local.platform_topology_key
}
},
]
}
}
}
# Per-component topologySpreadConstraints list holding a single constraint:
# maxSkew 1 across local.platform_topology_key domains for pods matching the
# component's labels. whenUnsatisfiable is "ScheduleAnyway", so the constraint
# is a scheduling preference and never blocks a pod from being placed.
argocd_component_topology_spread_constraints = {
for component, labels in local.argocd_component_match_labels : component => [
{
maxSkew = 1
topologyKey = local.platform_topology_key
whenUnsatisfiable = "ScheduleAnyway"
labelSelector = {
matchLabels = labels
}
},
]
}
# Pod label value for each Kyverno controller. The keys use the same camelCase
# names as the controller sections of the Kyverno helm_release values, which
# index the derived maps by these keys. Each value is used as the
# "app.kubernetes.io/component" label in local.kyverno_component_match_labels.
# NOTE(review): values presumably match the component label the Kyverno chart
# sets on its pods — confirm against the deployed pod labels.
kyverno_component_label_values = {
admissionController = "admission-controller"
backgroundController = "background-controller"
cleanupController = "cleanup-controller"
reportsController = "reports-controller"
}
# Per-controller matchLabels map: for every entry in
# local.kyverno_component_label_values, produce a single-label selector on
# "app.kubernetes.io/component" under the same controller key.
kyverno_component_match_labels = {
for component, name in local.kyverno_component_label_values : component => {
"app.kubernetes.io/component" = name
}
}
# Per-controller pod anti-affinity body (no enclosing "podAntiAffinity" key,
# because the Kyverno helm_release values assign each entry directly to a
# controller's podAntiAffinity setting). Each entry is one preferred (soft)
# term with weight 100 that selects pods with the controller's own labels and
# uses local.platform_topology_key as the topology domain.
# NOTE(review): the visible Kyverno values set replicas = 1 per controller, so
# a same-controller selector has no other pod to repel until replicas are
# raised — confirm this is meant as forward-looking configuration.
kyverno_component_pod_anti_affinity = {
for component, labels in local.kyverno_component_match_labels : component => {
preferredDuringSchedulingIgnoredDuringExecution = [
{
weight = 100
podAffinityTerm = {
labelSelector = {
matchLabels = labels
}
topologyKey = local.platform_topology_key
}
},
]
}
}
# Per-controller topologySpreadConstraints list holding a single constraint:
# maxSkew 1 across local.platform_topology_key domains for pods matching the
# controller's labels. whenUnsatisfiable is "ScheduleAnyway", so the constraint
# is a scheduling preference and never blocks a pod from being placed.
kyverno_component_topology_spread_constraints = {
for component, labels in local.kyverno_component_match_labels : component => [
{
maxSkew = 1
topologyKey = local.platform_topology_key
whenUnsatisfiable = "ScheduleAnyway"
labelSelector = {
matchLabels = labels
}
},
]
}
# Label selector used to identify Prometheus operator pods in the affinity and
# topology-spread settings derived from it.
# NOTE(review): both values are hard-coded; "prometheus-stack" presumably has
# to equal the Helm release name — confirm if the release is ever renamed.
prometheus_operator_match_labels = {
app = "kube-prometheus-stack-operator"
release = "prometheus-stack"
}
# Label selector used to identify kube-state-metrics pods in the affinity and
# topology-spread settings derived from it.
# NOTE(review): the instance value is hard-coded and presumably has to equal
# the Helm release name — confirm if the release is ever renamed.
kube_state_metrics_match_labels = {
"app.kubernetes.io/instance" = "prometheus-stack"
"app.kubernetes.io/name" = "kube-state-metrics"
}
# Affinity object for the Prometheus operator: one preferred (soft) pod
# anti-affinity term with weight 100 that selects pods matching
# local.prometheus_operator_match_labels and uses local.platform_topology_key
# as the topology domain. Being "preferred", it never prevents scheduling.
prometheus_operator_affinity = {
podAntiAffinity = {
preferredDuringSchedulingIgnoredDuringExecution = [
{
weight = 100
podAffinityTerm = {
labelSelector = {
matchLabels = local.prometheus_operator_match_labels
}
topologyKey = local.platform_topology_key
}
},
]
}
}
# Affinity object for kube-state-metrics: one preferred (soft) pod
# anti-affinity term with weight 100 that selects pods matching
# local.kube_state_metrics_match_labels and uses local.platform_topology_key
# as the topology domain. Being "preferred", it never prevents scheduling.
kube_state_metrics_affinity = {
podAntiAffinity = {
preferredDuringSchedulingIgnoredDuringExecution = [
{
weight = 100
podAffinityTerm = {
labelSelector = {
matchLabels = local.kube_state_metrics_match_labels
}
topologyKey = local.platform_topology_key
}
},
]
}
}
# topologySpreadConstraints list for the Prometheus operator: a single
# constraint with maxSkew 1 across local.platform_topology_key domains for pods
# matching local.prometheus_operator_match_labels. "ScheduleAnyway" keeps it a
# preference rather than a hard requirement.
prometheus_operator_topology_spread_constraints = [
{
maxSkew = 1
topologyKey = local.platform_topology_key
whenUnsatisfiable = "ScheduleAnyway"
labelSelector = {
matchLabels = local.prometheus_operator_match_labels
}
},
]
# topologySpreadConstraints list for kube-state-metrics: a single constraint
# with maxSkew 1 across local.platform_topology_key domains for pods matching
# local.kube_state_metrics_match_labels. "ScheduleAnyway" keeps it a
# preference rather than a hard requirement.
kube_state_metrics_topology_spread_constraints = [
{
maxSkew = 1
topologyKey = local.platform_topology_key
whenUnsatisfiable = "ScheduleAnyway"
labelSelector = {
matchLabels = local.kube_state_metrics_match_labels
}
},
]
} }
resource "helm_release" "calico_crds" { resource "helm_release" "calico_crds" {
@ -678,6 +832,41 @@ resource "helm_release" "argocd" {
global = { global = {
nodeSelector = local.argocd_node_selector nodeSelector = local.argocd_node_selector
} }
applicationSet = {
nodeSelector = local.argocd_node_selector
affinity = local.argocd_component_affinity.application_set
topologySpreadConstraints = local.argocd_component_topology_spread_constraints.application_set
}
controller = {
nodeSelector = local.argocd_node_selector
affinity = local.argocd_component_affinity.controller
topologySpreadConstraints = local.argocd_component_topology_spread_constraints.controller
}
dex = {
nodeSelector = local.argocd_node_selector
affinity = local.argocd_component_affinity.dex
topologySpreadConstraints = local.argocd_component_topology_spread_constraints.dex
}
notifications = {
nodeSelector = local.argocd_node_selector
affinity = local.argocd_component_affinity.notifications
topologySpreadConstraints = local.argocd_component_topology_spread_constraints.notifications
}
redis = {
nodeSelector = local.argocd_node_selector
affinity = local.argocd_component_affinity.redis
topologySpreadConstraints = local.argocd_component_topology_spread_constraints.redis
}
repoServer = {
nodeSelector = local.argocd_node_selector
affinity = local.argocd_component_affinity.repo_server
topologySpreadConstraints = local.argocd_component_topology_spread_constraints.repo_server
}
server = {
nodeSelector = local.argocd_node_selector
affinity = local.argocd_component_affinity.server
topologySpreadConstraints = local.argocd_component_topology_spread_constraints.server
}
}) })
] ]
} }
@ -789,7 +978,9 @@ resource "helm_release" "kyverno" {
} }
admissionController = { admissionController = {
nodeSelector = local.kyverno_node_selector nodeSelector = local.kyverno_node_selector
podAntiAffinity = local.kyverno_component_pod_anti_affinity.admissionController
replicas = 1 replicas = 1
topologySpreadConstraints = local.kyverno_component_topology_spread_constraints.admissionController
resources = { resources = {
requests = { requests = {
cpu = "50m" cpu = "50m"
@ -802,7 +993,9 @@ resource "helm_release" "kyverno" {
} }
backgroundController = { backgroundController = {
nodeSelector = local.kyverno_node_selector nodeSelector = local.kyverno_node_selector
podAntiAffinity = local.kyverno_component_pod_anti_affinity.backgroundController
replicas = 1 replicas = 1
topologySpreadConstraints = local.kyverno_component_topology_spread_constraints.backgroundController
resources = { resources = {
requests = { requests = {
cpu = "25m" cpu = "25m"
@ -815,7 +1008,9 @@ resource "helm_release" "kyverno" {
} }
cleanupController = { cleanupController = {
nodeSelector = local.kyverno_node_selector nodeSelector = local.kyverno_node_selector
podAntiAffinity = local.kyverno_component_pod_anti_affinity.cleanupController
replicas = 1 replicas = 1
topologySpreadConstraints = local.kyverno_component_topology_spread_constraints.cleanupController
resources = { resources = {
requests = { requests = {
cpu = "10m" cpu = "10m"
@ -828,7 +1023,9 @@ resource "helm_release" "kyverno" {
} }
reportsController = { reportsController = {
nodeSelector = local.kyverno_node_selector nodeSelector = local.kyverno_node_selector
podAntiAffinity = local.kyverno_component_pod_anti_affinity.reportsController
replicas = 1 replicas = 1
topologySpreadConstraints = local.kyverno_component_topology_spread_constraints.reportsController
resources = { resources = {
requests = { requests = {
cpu = "25m" cpu = "25m"
@ -1134,7 +1331,9 @@ resource "helm_release" "prometheus_stack" {
enabled = false enabled = false
} }
prometheusOperator = { prometheusOperator = {
affinity = local.prometheus_operator_affinity
nodeSelector = local.prometheus_stack_node_selector nodeSelector = local.prometheus_stack_node_selector
topologySpreadConstraints = local.prometheus_operator_topology_spread_constraints
tls = { tls = {
enabled = false enabled = false
} }
@ -1242,7 +1441,9 @@ resource "helm_release" "prometheus_stack" {
} }
} }
"kube-state-metrics" = { "kube-state-metrics" = {
affinity = local.kube_state_metrics_affinity
nodeSelector = local.prometheus_stack_node_selector nodeSelector = local.prometheus_stack_node_selector
topologySpreadConstraints = local.kube_state_metrics_topology_spread_constraints
} }
}) })
] ]

View File

@ -214,8 +214,8 @@ variable "prometheus_stack_node_selector" {
description = "Node selector applied to kube-prometheus-stack control workloads so they stay off the control plane." description = "Node selector applied to kube-prometheus-stack control workloads so they stay off the control plane."
type = map(string) type = map(string)
default = { default = {
"homelab.dev/node-role" = "app" "kubernetes.io/os" = "linux"
"homelab.dev/storage" = "nvme" "homelab.dev/workload-class" = "platform"
} }
} }

7
lab.sh
View File

@ -189,6 +189,7 @@ ensure_homelab_node_labels() {
kubectl --kubeconfig "${KUBECONFIG_PATH}" label node "${node}" \ kubectl --kubeconfig "${KUBECONFIG_PATH}" label node "${node}" \
homelab.dev/node-role=control-plane \ homelab.dev/node-role=control-plane \
homelab.dev/storage=local \ homelab.dev/storage=local \
homelab.dev/workload-class=control-plane \
--overwrite --overwrite
continue continue
fi fi
@ -201,11 +202,13 @@ ensure_homelab_node_labels() {
kubectl --kubeconfig "${KUBECONFIG_PATH}" label node "${node}" \ kubectl --kubeconfig "${KUBECONFIG_PATH}" label node "${node}" \
homelab.dev/node-role=app \ homelab.dev/node-role=app \
homelab.dev/storage=nvme \ homelab.dev/storage=nvme \
homelab.dev/workload-class=platform \
--overwrite --overwrite
elif [[ "${node}" == "${raspberry_node}" ]]; then elif [[ "${node}" == "${raspberry_node}" ]]; then
kubectl --kubeconfig "${KUBECONFIG_PATH}" label node "${node}" \ kubectl --kubeconfig "${KUBECONFIG_PATH}" label node "${node}" \
homelab.dev/node-role=edge-app \ homelab.dev/node-role=edge-app \
homelab.dev/storage=local \ homelab.dev/storage=local \
homelab.dev/workload-class=edge \
--overwrite --overwrite
fi fi
done < <(kubectl --kubeconfig "${KUBECONFIG_PATH}" get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}') done < <(kubectl --kubeconfig "${KUBECONFIG_PATH}" get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')
@ -691,8 +694,8 @@ write_cluster_worker_var_file() {
LAB_RASPBERRY_USER="${LAB_RASPBERRY_USER:-jv}" \ LAB_RASPBERRY_USER="${LAB_RASPBERRY_USER:-jv}" \
LAB_RASPBERRY_NODE_NAME="${LAB_RASPBERRY_NODE_NAME:-raspberry}" \ LAB_RASPBERRY_NODE_NAME="${LAB_RASPBERRY_NODE_NAME:-raspberry}" \
LAB_RASPBERRY_SSH_KEY_PATH="${LAB_RASPBERRY_SSH_KEY_PATH:-/home/jv/.ssh/id_ed25519}" \ LAB_RASPBERRY_SSH_KEY_PATH="${LAB_RASPBERRY_SSH_KEY_PATH:-/home/jv/.ssh/id_ed25519}" \
LAB_RASPBERRY_NODE_LABELS_JSON="${LAB_RASPBERRY_NODE_LABELS_JSON:-{\"node-role.kubernetes.io/worker\":\"worker\",\"homelab.dev/node-role\":\"edge-app\",\"homelab.dev/storage\":\"local\"}}" \ LAB_RASPBERRY_NODE_LABELS_JSON="${LAB_RASPBERRY_NODE_LABELS_JSON:-{\"node-role.kubernetes.io/worker\":\"worker\",\"homelab.dev/node-role\":\"edge-app\",\"homelab.dev/storage\":\"local\",\"homelab.dev/workload-class\":\"edge\"}}" \
LAB_PIMOX_WORKER_NODE_LABELS_JSON="${LAB_PIMOX_WORKER_NODE_LABELS_JSON:-{\"node-role.kubernetes.io/worker\":\"worker\",\"homelab.dev/node-role\":\"app\",\"homelab.dev/storage\":\"nvme\"}}" \ LAB_PIMOX_WORKER_NODE_LABELS_JSON="${LAB_PIMOX_WORKER_NODE_LABELS_JSON:-{\"node-role.kubernetes.io/worker\":\"worker\",\"homelab.dev/node-role\":\"app\",\"homelab.dev/storage\":\"nvme\",\"homelab.dev/workload-class\":\"platform\"}}" \
python3 - "${spec_file}" "${var_file}" <<'PY' python3 - "${spec_file}" "${var_file}" <<'PY'
import json import json
import os import os