Add durable platform workload scheduling
Homelab Main / deploy (push) Successful in 1m14s
Details
Homelab Main / deploy (push) Successful in 1m14s
Details
This commit is contained in:
parent
240a55e826
commit
47c018b6dc
28
README.md
28
README.md
|
|
@@ -212,12 +212,12 @@ duplicate those PV manifests when you want storage on another node.
|
|||
|
||||
- `node-role.kubernetes.io/worker=worker` on every worker so `kubectl get nodes`
|
||||
shows `worker` instead of `<none>` in the ROLES column
|
||||
- `homelab.dev/node-role=control-plane` and `homelab.dev/storage=local` on the
|
||||
Debian control plane
|
||||
- `homelab.dev/node-role=edge-app` and `homelab.dev/storage=local` on the
|
||||
Raspberry Pi worker
|
||||
- `homelab.dev/node-role=app` and `homelab.dev/storage=nvme` on automated Pimox
|
||||
worker clones
|
||||
- `homelab.dev/node-role=control-plane`, `homelab.dev/storage=local`, and
|
||||
`homelab.dev/workload-class=control-plane` on the Debian control plane
|
||||
- `homelab.dev/node-role=edge-app`, `homelab.dev/storage=local`, and
|
||||
`homelab.dev/workload-class=edge` on the Raspberry Pi worker
|
||||
- `homelab.dev/node-role=app`, `homelab.dev/storage=nvme`, and
|
||||
`homelab.dev/workload-class=platform` on automated Pimox worker clones
|
||||
|
||||
Override `control_plane_node_labels`, `worker_node_labels`,
|
||||
`LAB_RASPBERRY_NODE_LABELS_JSON`, or `LAB_PIMOX_WORKER_NODE_LABELS_JSON` when
|
||||
|
|
@@ -227,14 +227,18 @@ OpenEBS hostpath PVs are node-local. Move workloads only after their storage and
|
|||
edge path are ready on the target node. Gitea is outside Kubernetes and is moved
|
||||
by changing the Raspberry Pi Docker install target instead.
|
||||
|
||||
The Prometheus stack control workloads are pinned to Pimox worker nodes by the
|
||||
default `prometheus_stack_node_selector` (`homelab.dev/node-role=app` and
|
||||
`homelab.dev/storage=nvme`). Because the Prometheus, Alertmanager, and Grafana
|
||||
PVCs use retained local OpenEBS volumes, moving an existing install off the
|
||||
Debian control plane requires discarding those PVCs. Run
|
||||
The stateless platform controllers are pinned to Pimox worker nodes through
|
||||
`homelab.dev/workload-class=platform` and include hostname topology spread plus
|
||||
preferred pod anti-affinity so future Argo CD, Kyverno, Prometheus operator, and
|
||||
kube-state-metrics replicas are spread across workers on a best-effort basis instead of all collapsing onto the first worker that joins.
|
||||
PVC-backed monitoring StatefulSets are intentionally treated separately because
|
||||
their retained OpenEBS hostpath volumes are node-local. Run
|
||||
`./lab.sh move-prometheus-stack-workers` from the Debian host to label existing
|
||||
worker nodes, destroy only the existing `prometheus-stack` Helm release, delete
|
||||
its retained PVC/PV objects, and recreate the stack on the worker selector.
|
||||
its retained PVC/PV objects, and recreate the stack on the worker selector when
|
||||
you intentionally accept losing that monitoring data. A planned monitoring data
|
||||
migration should be handled as a separate maintenance task with backup,
|
||||
delete/recreate or storage migration steps, and post-restore checks.
|
||||
|
||||
The website and demos NodePorts are reachable from the OCI jump box through the
|
||||
Raspberry Pi Tailscale interface. `bootstrap/cluster` installs a persistent
|
||||
|
|
|
|||
|
|
@@ -8,6 +8,7 @@ variable "control_plane_node_labels" {
|
|||
default = {
|
||||
"homelab.dev/node-role" = "control-plane"
|
||||
"homelab.dev/storage" = "local"
|
||||
"homelab.dev/workload-class" = "control-plane"
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@@ -105,15 +105,169 @@ EOT
|
|||
var.metallb.l2_advertisement_enabled ? local.metallb_l2_advertisement_manifest : "",
|
||||
]))
|
||||
|
||||
platform_topology_key = "kubernetes.io/hostname"
|
||||
prometheus_stack_node_selector = var.prometheus_stack_node_selector
|
||||
argocd_node_selector = {
|
||||
"kubernetes.io/os" = "linux"
|
||||
"homelab.dev/node-role" = "app"
|
||||
"homelab.dev/workload-class" = "platform"
|
||||
}
|
||||
kyverno_node_selector = {
|
||||
"kubernetes.io/os" = "linux"
|
||||
"homelab.dev/node-role" = "app"
|
||||
"homelab.dev/workload-class" = "platform"
|
||||
}
|
||||
|
||||
argocd_component_label_values = {
|
||||
application_set = "argocd-applicationset-controller"
|
||||
controller = "argocd-application-controller"
|
||||
dex = "argocd-dex-server"
|
||||
notifications = "argocd-notifications-controller"
|
||||
redis = "argocd-redis"
|
||||
repo_server = "argocd-repo-server"
|
||||
server = "argocd-server"
|
||||
}
|
||||
|
||||
argocd_component_match_labels = {
|
||||
for component, name in local.argocd_component_label_values : component => {
|
||||
"app.kubernetes.io/name" = name
|
||||
}
|
||||
}
|
||||
|
||||
argocd_component_affinity = {
|
||||
for component, labels in local.argocd_component_match_labels : component => {
|
||||
podAntiAffinity = {
|
||||
preferredDuringSchedulingIgnoredDuringExecution = [
|
||||
{
|
||||
weight = 100
|
||||
podAffinityTerm = {
|
||||
labelSelector = {
|
||||
matchLabels = labels
|
||||
}
|
||||
topologyKey = local.platform_topology_key
|
||||
}
|
||||
},
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
argocd_component_topology_spread_constraints = {
|
||||
for component, labels in local.argocd_component_match_labels : component => [
|
||||
{
|
||||
maxSkew = 1
|
||||
topologyKey = local.platform_topology_key
|
||||
whenUnsatisfiable = "ScheduleAnyway"
|
||||
labelSelector = {
|
||||
matchLabels = labels
|
||||
}
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
kyverno_component_label_values = {
|
||||
admissionController = "admission-controller"
|
||||
backgroundController = "background-controller"
|
||||
cleanupController = "cleanup-controller"
|
||||
reportsController = "reports-controller"
|
||||
}
|
||||
|
||||
kyverno_component_match_labels = {
|
||||
for component, name in local.kyverno_component_label_values : component => {
|
||||
"app.kubernetes.io/component" = name
|
||||
}
|
||||
}
|
||||
|
||||
kyverno_component_pod_anti_affinity = {
|
||||
for component, labels in local.kyverno_component_match_labels : component => {
|
||||
preferredDuringSchedulingIgnoredDuringExecution = [
|
||||
{
|
||||
weight = 100
|
||||
podAffinityTerm = {
|
||||
labelSelector = {
|
||||
matchLabels = labels
|
||||
}
|
||||
topologyKey = local.platform_topology_key
|
||||
}
|
||||
},
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
kyverno_component_topology_spread_constraints = {
|
||||
for component, labels in local.kyverno_component_match_labels : component => [
|
||||
{
|
||||
maxSkew = 1
|
||||
topologyKey = local.platform_topology_key
|
||||
whenUnsatisfiable = "ScheduleAnyway"
|
||||
labelSelector = {
|
||||
matchLabels = labels
|
||||
}
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
prometheus_operator_match_labels = {
|
||||
app = "kube-prometheus-stack-operator"
|
||||
release = "prometheus-stack"
|
||||
}
|
||||
|
||||
kube_state_metrics_match_labels = {
|
||||
"app.kubernetes.io/instance" = "prometheus-stack"
|
||||
"app.kubernetes.io/name" = "kube-state-metrics"
|
||||
}
|
||||
|
||||
prometheus_operator_affinity = {
|
||||
podAntiAffinity = {
|
||||
preferredDuringSchedulingIgnoredDuringExecution = [
|
||||
{
|
||||
weight = 100
|
||||
podAffinityTerm = {
|
||||
labelSelector = {
|
||||
matchLabels = local.prometheus_operator_match_labels
|
||||
}
|
||||
topologyKey = local.platform_topology_key
|
||||
}
|
||||
},
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
kube_state_metrics_affinity = {
|
||||
podAntiAffinity = {
|
||||
preferredDuringSchedulingIgnoredDuringExecution = [
|
||||
{
|
||||
weight = 100
|
||||
podAffinityTerm = {
|
||||
labelSelector = {
|
||||
matchLabels = local.kube_state_metrics_match_labels
|
||||
}
|
||||
topologyKey = local.platform_topology_key
|
||||
}
|
||||
},
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
prometheus_operator_topology_spread_constraints = [
|
||||
{
|
||||
maxSkew = 1
|
||||
topologyKey = local.platform_topology_key
|
||||
whenUnsatisfiable = "ScheduleAnyway"
|
||||
labelSelector = {
|
||||
matchLabels = local.prometheus_operator_match_labels
|
||||
}
|
||||
},
|
||||
]
|
||||
|
||||
kube_state_metrics_topology_spread_constraints = [
|
||||
{
|
||||
maxSkew = 1
|
||||
topologyKey = local.platform_topology_key
|
||||
whenUnsatisfiable = "ScheduleAnyway"
|
||||
labelSelector = {
|
||||
matchLabels = local.kube_state_metrics_match_labels
|
||||
}
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
resource "helm_release" "calico_crds" {
|
||||
|
|
@@ -678,6 +832,41 @@ resource "helm_release" "argocd" {
|
|||
global = {
|
||||
nodeSelector = local.argocd_node_selector
|
||||
}
|
||||
applicationSet = {
|
||||
nodeSelector = local.argocd_node_selector
|
||||
affinity = local.argocd_component_affinity.application_set
|
||||
topologySpreadConstraints = local.argocd_component_topology_spread_constraints.application_set
|
||||
}
|
||||
controller = {
|
||||
nodeSelector = local.argocd_node_selector
|
||||
affinity = local.argocd_component_affinity.controller
|
||||
topologySpreadConstraints = local.argocd_component_topology_spread_constraints.controller
|
||||
}
|
||||
dex = {
|
||||
nodeSelector = local.argocd_node_selector
|
||||
affinity = local.argocd_component_affinity.dex
|
||||
topologySpreadConstraints = local.argocd_component_topology_spread_constraints.dex
|
||||
}
|
||||
notifications = {
|
||||
nodeSelector = local.argocd_node_selector
|
||||
affinity = local.argocd_component_affinity.notifications
|
||||
topologySpreadConstraints = local.argocd_component_topology_spread_constraints.notifications
|
||||
}
|
||||
redis = {
|
||||
nodeSelector = local.argocd_node_selector
|
||||
affinity = local.argocd_component_affinity.redis
|
||||
topologySpreadConstraints = local.argocd_component_topology_spread_constraints.redis
|
||||
}
|
||||
repoServer = {
|
||||
nodeSelector = local.argocd_node_selector
|
||||
affinity = local.argocd_component_affinity.repo_server
|
||||
topologySpreadConstraints = local.argocd_component_topology_spread_constraints.repo_server
|
||||
}
|
||||
server = {
|
||||
nodeSelector = local.argocd_node_selector
|
||||
affinity = local.argocd_component_affinity.server
|
||||
topologySpreadConstraints = local.argocd_component_topology_spread_constraints.server
|
||||
}
|
||||
})
|
||||
]
|
||||
}
|
||||
|
|
@@ -789,7 +978,9 @@ resource "helm_release" "kyverno" {
|
|||
}
|
||||
admissionController = {
|
||||
nodeSelector = local.kyverno_node_selector
|
||||
podAntiAffinity = local.kyverno_component_pod_anti_affinity.admissionController
|
||||
replicas = 1
|
||||
topologySpreadConstraints = local.kyverno_component_topology_spread_constraints.admissionController
|
||||
resources = {
|
||||
requests = {
|
||||
cpu = "50m"
|
||||
|
|
@@ -802,7 +993,9 @@ resource "helm_release" "kyverno" {
|
|||
}
|
||||
backgroundController = {
|
||||
nodeSelector = local.kyverno_node_selector
|
||||
podAntiAffinity = local.kyverno_component_pod_anti_affinity.backgroundController
|
||||
replicas = 1
|
||||
topologySpreadConstraints = local.kyverno_component_topology_spread_constraints.backgroundController
|
||||
resources = {
|
||||
requests = {
|
||||
cpu = "25m"
|
||||
|
|
@@ -815,7 +1008,9 @@ resource "helm_release" "kyverno" {
|
|||
}
|
||||
cleanupController = {
|
||||
nodeSelector = local.kyverno_node_selector
|
||||
podAntiAffinity = local.kyverno_component_pod_anti_affinity.cleanupController
|
||||
replicas = 1
|
||||
topologySpreadConstraints = local.kyverno_component_topology_spread_constraints.cleanupController
|
||||
resources = {
|
||||
requests = {
|
||||
cpu = "10m"
|
||||
|
|
@@ -828,7 +1023,9 @@ resource "helm_release" "kyverno" {
|
|||
}
|
||||
reportsController = {
|
||||
nodeSelector = local.kyverno_node_selector
|
||||
podAntiAffinity = local.kyverno_component_pod_anti_affinity.reportsController
|
||||
replicas = 1
|
||||
topologySpreadConstraints = local.kyverno_component_topology_spread_constraints.reportsController
|
||||
resources = {
|
||||
requests = {
|
||||
cpu = "25m"
|
||||
|
|
@@ -1134,7 +1331,9 @@ resource "helm_release" "prometheus_stack" {
|
|||
enabled = false
|
||||
}
|
||||
prometheusOperator = {
|
||||
affinity = local.prometheus_operator_affinity
|
||||
nodeSelector = local.prometheus_stack_node_selector
|
||||
topologySpreadConstraints = local.prometheus_operator_topology_spread_constraints
|
||||
tls = {
|
||||
enabled = false
|
||||
}
|
||||
|
|
@@ -1242,7 +1441,9 @@ resource "helm_release" "prometheus_stack" {
|
|||
}
|
||||
}
|
||||
"kube-state-metrics" = {
|
||||
affinity = local.kube_state_metrics_affinity
|
||||
nodeSelector = local.prometheus_stack_node_selector
|
||||
topologySpreadConstraints = local.kube_state_metrics_topology_spread_constraints
|
||||
}
|
||||
})
|
||||
]
|
||||
|
|
|
|||
|
|
@@ -214,8 +214,8 @@ variable "prometheus_stack_node_selector" {
|
|||
description = "Node selector applied to kube-prometheus-stack control workloads so they stay off the control plane."
|
||||
type = map(string)
|
||||
default = {
|
||||
"homelab.dev/node-role" = "app"
|
||||
"homelab.dev/storage" = "nvme"
|
||||
"kubernetes.io/os" = "linux"
|
||||
"homelab.dev/workload-class" = "platform"
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
7
lab.sh
7
lab.sh
|
|
@@ -189,6 +189,7 @@ ensure_homelab_node_labels() {
|
|||
kubectl --kubeconfig "${KUBECONFIG_PATH}" label node "${node}" \
|
||||
homelab.dev/node-role=control-plane \
|
||||
homelab.dev/storage=local \
|
||||
homelab.dev/workload-class=control-plane \
|
||||
--overwrite
|
||||
continue
|
||||
fi
|
||||
|
|
@@ -201,11 +202,13 @@ ensure_homelab_node_labels() {
|
|||
kubectl --kubeconfig "${KUBECONFIG_PATH}" label node "${node}" \
|
||||
homelab.dev/node-role=app \
|
||||
homelab.dev/storage=nvme \
|
||||
homelab.dev/workload-class=platform \
|
||||
--overwrite
|
||||
elif [[ "${node}" == "${raspberry_node}" ]]; then
|
||||
kubectl --kubeconfig "${KUBECONFIG_PATH}" label node "${node}" \
|
||||
homelab.dev/node-role=edge-app \
|
||||
homelab.dev/storage=local \
|
||||
homelab.dev/workload-class=edge \
|
||||
--overwrite
|
||||
fi
|
||||
done < <(kubectl --kubeconfig "${KUBECONFIG_PATH}" get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')
|
||||
|
|
@@ -691,8 +694,8 @@ write_cluster_worker_var_file() {
|
|||
LAB_RASPBERRY_USER="${LAB_RASPBERRY_USER:-jv}" \
|
||||
LAB_RASPBERRY_NODE_NAME="${LAB_RASPBERRY_NODE_NAME:-raspberry}" \
|
||||
LAB_RASPBERRY_SSH_KEY_PATH="${LAB_RASPBERRY_SSH_KEY_PATH:-/home/jv/.ssh/id_ed25519}" \
|
||||
LAB_RASPBERRY_NODE_LABELS_JSON="${LAB_RASPBERRY_NODE_LABELS_JSON:-{\"node-role.kubernetes.io/worker\":\"worker\",\"homelab.dev/node-role\":\"edge-app\",\"homelab.dev/storage\":\"local\"}}" \
|
||||
LAB_PIMOX_WORKER_NODE_LABELS_JSON="${LAB_PIMOX_WORKER_NODE_LABELS_JSON:-{\"node-role.kubernetes.io/worker\":\"worker\",\"homelab.dev/node-role\":\"app\",\"homelab.dev/storage\":\"nvme\"}}" \
|
||||
LAB_RASPBERRY_NODE_LABELS_JSON="${LAB_RASPBERRY_NODE_LABELS_JSON:-{\"node-role.kubernetes.io/worker\":\"worker\",\"homelab.dev/node-role\":\"edge-app\",\"homelab.dev/storage\":\"local\",\"homelab.dev/workload-class\":\"edge\"}}" \
|
||||
LAB_PIMOX_WORKER_NODE_LABELS_JSON="${LAB_PIMOX_WORKER_NODE_LABELS_JSON:-{\"node-role.kubernetes.io/worker\":\"worker\",\"homelab.dev/node-role\":\"app\",\"homelab.dev/storage\":\"nvme\",\"homelab.dev/workload-class\":\"platform\"}}" \
|
||||
python3 - "${spec_file}" "${var_file}" <<'PY'
|
||||
import json
|
||||
import os
|
||||
|
|
|
|||
Loading…
Reference in New Issue