Move Prometheus stack to worker nodes
This commit is contained in:
parent
072bcfdbc8
commit
b0a2c44bbb
|
|
@ -225,6 +225,15 @@ OpenEBS hostpath PVs are node-local. Move workloads only after their storage and
|
|||
edge path are ready on the target node. Gitea is outside Kubernetes and is moved
|
||||
by changing the Raspberry Pi Docker install target instead.
|
||||
|
||||
The Prometheus stack control workloads are pinned to Pimox worker nodes by the
default `prometheus_stack_node_selector` (`homelab.dev/node-role=app` and
`homelab.dev/storage=nvme`). Because the Prometheus, Alertmanager, and Grafana
PVCs use retained local OpenEBS volumes, moving an existing install off the
Debian control plane requires discarding those PVCs. Run
`./lab.sh move-prometheus-stack-workers` from the Debian host to destroy only
the existing `prometheus-stack` Helm release, delete its retained PVC/PV objects,
and recreate the stack on the worker selector.
|
||||
|
||||
The website and demos NodePorts are reachable from the OCI jump box through the
|
||||
Raspberry Pi Tailscale interface. `bootstrap/cluster` installs a persistent
|
||||
`homelab-tailscale-nodeport.service` on the configured worker to restore the
|
||||
|
|
|
|||
|
|
@ -104,6 +104,8 @@ EOT
|
|||
local.metallb_ip_address_pool_manifest,
|
||||
var.metallb.l2_advertisement_enabled ? local.metallb_l2_advertisement_manifest : "",
|
||||
]))
|
||||
|
||||
prometheus_stack_node_selector = var.prometheus_stack_node_selector
|
||||
}
|
||||
|
||||
resource "helm_release" "calico_crds" {
|
||||
|
|
@ -1102,6 +1104,7 @@ resource "helm_release" "prometheus_stack" {
|
|||
enabled = false
|
||||
}
|
||||
prometheusOperator = {
|
||||
nodeSelector = local.prometheus_stack_node_selector
|
||||
tls = {
|
||||
enabled = false
|
||||
}
|
||||
|
|
@ -1120,6 +1123,7 @@ resource "helm_release" "prometheus_stack" {
|
|||
}
|
||||
alertmanager = {
|
||||
alertmanagerSpec = {
|
||||
nodeSelector = local.prometheus_stack_node_selector
|
||||
storage = {
|
||||
volumeClaimTemplate = {
|
||||
spec = {
|
||||
|
|
@ -1137,6 +1141,7 @@ resource "helm_release" "prometheus_stack" {
|
|||
}
|
||||
prometheus = {
|
||||
prometheusSpec = {
|
||||
nodeSelector = local.prometheus_stack_node_selector
|
||||
retention = var.observability.prometheus.retention
|
||||
resources = {
|
||||
requests = {
|
||||
|
|
@ -1168,6 +1173,7 @@ resource "helm_release" "prometheus_stack" {
|
|||
}
|
||||
}
|
||||
grafana = {
|
||||
nodeSelector = local.prometheus_stack_node_selector
|
||||
persistence = {
|
||||
enabled = true
|
||||
type = "sts"
|
||||
|
|
@ -1205,6 +1211,9 @@ resource "helm_release" "prometheus_stack" {
|
|||
}
|
||||
}
|
||||
}
|
||||
"kube-state-metrics" = {
|
||||
nodeSelector = local.prometheus_stack_node_selector
|
||||
}
|
||||
})
|
||||
]
|
||||
}
|
||||
|
|
|
|||
|
|
@ -210,6 +210,15 @@ variable "observability" {
|
|||
}
|
||||
}
|
||||
|
||||
# Node labels used as the nodeSelector for the kube-prometheus-stack
# workloads. Override the default to schedule the stack onto different nodes.
variable "prometheus_stack_node_selector" {
  description = "Node selector applied to kube-prometheus-stack control workloads so they stay off the control plane."
  type        = map(string)
  default = {
    "homelab.dev/node-role" = "app"
    "homelab.dev/storage"   = "nvme"
  }
}
|
||||
|
||||
variable "extra_helm_releases" {
|
||||
type = map(object({
|
||||
repository = string
|
||||
|
|
|
|||
48
lab.sh
48
lab.sh
|
|
@ -174,6 +174,31 @@ adopt_apps_existing_resources() {
|
|||
"demos-static"
|
||||
}
|
||||
|
||||
#######################################
# Delete the PVCs, and the PV objects referencing them, whose claim names
# belong to the prometheus-stack Prometheus, Alertmanager, or Grafana
# workloads. The data held by those claims is discarded.
# Globals:   KUBECONFIG_PATH (read)
# Arguments: $1 - namespace that holds the claims (default: monitoring)
# Outputs:   one progress line on stdout per deletion step that runs
# Returns:   exit status of the last command executed
#######################################
delete_prometheus_stack_storage() {
  local namespace="${1:-monitoring}"
  # Extended regex matched against claim names.
  local pattern='(^|-)prometheus-stack-(prometheus|alertmanager|grafana)(-|$)|^prometheus-prometheus-stack|^alertmanager-prometheus-stack|^storage-prometheus-stack-grafana'
  local -a kube=(kubectl --kubeconfig "${KUBECONFIG_PATH}")
  local pv_query
  local pvc_names
  local pv_names

  # Emits "<pv name>\t<claim name>" rows for every PV whose claimRef points
  # into the target namespace.
  pv_query='{range .items[?(@.spec.claimRef.namespace=="'"${namespace}"'")]}{.metadata.name}{"\t"}{.spec.claimRef.name}{"\n"}{end}'

  # Split each `-o name` line on "/" and keep the second field when it
  # matches. stderr from the listing is discarded.
  pvc_names="$("${kube[@]}" -n "${namespace}" get pvc -o name 2>/dev/null |
    awk -F/ -v pattern="${pattern}" '$2 ~ pattern {print $2}')"

  # Collect the PV names before any claim is deleted, selecting on the claim
  # name recorded in each PV's claimRef.
  pv_names="$("${kube[@]}" get pv -o jsonpath="${pv_query}" 2>/dev/null |
    awk -v pattern="${pattern}" '$2 ~ pattern {print $1}')"

  if [[ -n "${pvc_names}" ]]; then
    echo "Deleting old prometheus-stack PVCs in ${namespace}; saved Prometheus, Alertmanager, and Grafana data will be discarded..."
    # One kubectl call for all claims; waits up to 180s for them to go away.
    printf '%s\n' "${pvc_names}" |
      xargs -r "${kube[@]}" -n "${namespace}" delete pvc --wait=true --timeout=180s
  fi

  if [[ -n "${pv_names}" ]]; then
    echo "Deleting old prometheus-stack retained PV objects..."
    # PV deletion is requested without waiting for completion.
    printf '%s\n' "${pv_names}" |
      xargs -r "${kube[@]}" delete pv --wait=false
  fi
}
|
||||
|
||||
run_tofu_stack() {
|
||||
local stack="$1"
|
||||
local -a apply_args=(-auto-approve)
|
||||
|
|
@ -192,6 +217,24 @@ run_tofu_stack() {
|
|||
tofu -chdir="${REPO_ROOT}/${stack}" apply "${apply_args[@]}"
|
||||
}
|
||||
|
||||
#######################################
# Destroy the prometheus-stack Helm release, delete its stored data, and
# re-apply the bootstrap/platform stack so the release is created again.
# Globals:   REPO_ROOT, KUBECONFIG_PATH, LAB_MONITORING_NAMESPACE (read)
#            TF_VAR_kubeconfig_path, KUBECONFIG (exported)
# Arguments: none
# Outputs:   a data-loss warning, tool output, and a final pod listing
# Returns:   exit status of the last command executed
#######################################
move_prometheus_stack_workers() {
  local stack="bootstrap/platform"
  local namespace="${LAB_MONITORING_NAMESPACE:-monitoring}"
  local -a tofu_cmd

  require_debian_server "move-prometheus-stack-workers"

  # Keep a caller-supplied TF_VAR_kubeconfig_path; otherwise fall back to
  # KUBECONFIG_PATH. KUBECONFIG is then exported with the same value.
  export TF_VAR_kubeconfig_path="${TF_VAR_kubeconfig_path:-${KUBECONFIG_PATH}}"
  export KUBECONFIG="${TF_VAR_kubeconfig_path}"

  tofu_cmd=(tofu -chdir="${REPO_ROOT}/${stack}")

  echo "Moving prometheus-stack off the control plane. Existing prometheus-stack PVC data will be deleted."
  "${tofu_cmd[@]}" init
  adopt_platform_existing_resources
  # Only the Helm release is targeted by the destroy; storage is removed
  # separately before the full apply.
  "${tofu_cmd[@]}" destroy -target=helm_release.prometheus_stack -auto-approve
  delete_prometheus_stack_storage "${namespace}"
  "${tofu_cmd[@]}" apply -auto-approve
  # List the namespace's pods with node placement for the operator to check.
  kubectl --kubeconfig "${KUBECONFIG_PATH}" -n "${namespace}" get pods -o wide
}
|
||||
|
||||
truthy() {
|
||||
case "${1,,}" in
|
||||
1 | true | yes | on)
|
||||
|
|
@ -2761,6 +2804,9 @@ case "${1:-}" in
|
|||
install-gitea-runner)
|
||||
install_gitea_runner "${2:-}"
|
||||
;;
|
||||
move-prometheus-stack-workers)
|
||||
move_prometheus_stack_workers
|
||||
;;
|
||||
openwrt)
|
||||
openwrt
|
||||
;;
|
||||
|
|
@ -2768,7 +2814,7 @@ case "${1:-}" in
|
|||
nuke
|
||||
;;
|
||||
*)
|
||||
echo "Usage: $0 {up|rebuild-cluster|apps|deploy-gitea|bootstrap-gitea-repo|backup-gitea|drill-gitea-restore|install-gitea-runner|openwrt|nuke}"
|
||||
echo "Usage: $0 {up|rebuild-cluster|apps|deploy-gitea|bootstrap-gitea-repo|backup-gitea|drill-gitea-restore|install-gitea-runner|move-prometheus-stack-workers|openwrt|nuke}"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
|
|
|||
Loading…
Reference in New Issue