# my-homelab-configs/bootstrap/platform/main.tf
#
# 727 lines
# 20 KiB
# HCL

# Toolchain requirements: minimum Terraform CLI version plus the provider
# plugins (and their version constraints) that this configuration relies on.
terraform {
  required_version = ">= 1.0"

  required_providers {
    # Manages the helm_release resources in this file.
    helm = {
      source  = "hashicorp/helm"
      version = "~> 2.12"
    }

    # Manages native Kubernetes objects (storage class, namespace).
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "~> 2.26"
    }

    # Provides null_resource, used here to run local-exec shell scripts.
    null = {
      source  = "hashicorp/null"
      version = "~> 3.2"
    }
  }
}
# Kubernetes provider: cluster connection settings are read from the
# kubeconfig file located at var.kubeconfig_path.
provider "kubernetes" {
  config_path = var.kubeconfig_path
}
# Helm provider: talks to the same cluster as the kubernetes provider by
# pointing at the same kubeconfig file (var.kubeconfig_path).
provider "helm" {
  kubernetes {
    config_path = var.kubeconfig_path
  }
}
# Installs the "crd.projectcalico.org.v1" chart as release "calico-crds".
# Repository, chart version and namespace all come from var.calico, the same
# object the "calico" release in this file reads them from.
resource "helm_release" "calico_crds" {
  name       = "calico-crds"
  chart      = "crd.projectcalico.org.v1"
  repository = var.calico.repository
  version    = var.calico.version

  namespace        = var.calico.namespace
  create_namespace = true
}
# Pre-install cleanup for the "calico" Helm release.
#
# The script lists secrets labelled owner=helm,name=calico in the Calico
# namespace together with their "status" label, and deletes every secret whose
# status is pending-install, pending-upgrade, pending-rollback, failed or
# uninstalling. helm_release.calico depends on this resource, so the cleanup
# runs before that release is applied.
resource "null_resource" "calico_helm_recovery" {
  depends_on = [helm_release.calico_crds]

  triggers = {
    # timestamp() yields a new value on every run, so this resource is
    # replaced (and the script re-executed) on each apply.
    always = timestamp()

    # Values the provisioner reads back through self.triggers.
    release_name    = "calico"
    namespace       = var.calico.namespace
    kubeconfig_path = var.kubeconfig_path
  }

  provisioner "local-exec" {
    interpreter = ["/bin/bash", "-lc"]

    # The heredoc body is passed to bash verbatim; it is kept at column 0.
    command = <<EOT
set -euo pipefail
kubectl --kubeconfig "${self.triggers.kubeconfig_path}" -n "${self.triggers.namespace}" get secrets \
-l "owner=helm,name=${self.triggers.release_name}" \
-o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.metadata.labels.status}{"\n"}{end}' 2>/dev/null |
while IFS=$'\t' read -r secret status; do
case "$status" in
pending-install|pending-upgrade|pending-rollback|failed|uninstalling)
kubectl --kubeconfig "${self.triggers.kubeconfig_path}" -n "${self.triggers.namespace}" delete secret "$secret"
;;
esac
done
EOT
  }
}
# Installs the tigera-operator chart as release "calico" and configures the
# Calico Installation (VXLAN pool on var.pod_network_cidr, BGP disabled).
# Runs after null_resource.calico_helm_recovery.
resource "helm_release" "calico" {
  depends_on = [null_resource.calico_helm_recovery]

  name       = "calico"
  chart      = "tigera-operator"
  repository = var.calico.repository
  version    = var.calico.version

  namespace        = var.calico.namespace
  create_namespace = true

  # Terraform does not wait for the release's workloads here (wait = false);
  # null_resource.calico_ready performs the readiness checks afterwards.
  wait            = false
  timeout         = 900
  cleanup_on_fail = true

  values = [
    yamlencode({
      # NOTE(review): presumably disabled because the CRDs are installed by
      # the separate "calico-crds" release in this file - confirm.
      manageCRDs = false

      # Restrict scheduling to one named Linux node.
      nodeSelector = {
        "kubernetes.io/hostname" = var.calico_operator_node_name
        "kubernetes.io/os"       = "linux"
      }

      # Optional chart components that are turned off.
      apiServer = { enabled = false }
      goldmane  = { enabled = false }
      whisker   = { enabled = false }

      installation = {
        controlPlaneReplicas = 1
        cni                  = { type = "Calico" }

        calicoNetwork = {
          bgp = "Disabled"

          nodeAddressAutodetectionV4 = {
            cidrs      = var.calico_node_address_autodetection_cidrs
            firstFound = false
          }

          ipPools = [
            { cidr = var.pod_network_cidr, encapsulation = "VXLAN" },
          ]
        }
      }
    })
  ]
}
# Readiness gate for Calico.
#
# After helm_release.calico is applied, the script:
#   1. waits for the tigera-operator deployment to exist and roll out,
#   2. waits for the calico-node daemonset and calico-kube-controllers
#      deployment to exist in calico-system and roll out,
#   3. waits for every node to report condition Ready.
# On a timeout or any failing command it prints nodes, pods, recent events and
# operator/daemonset/deployment details to help debugging.
resource "null_resource" "calico_ready" {
  depends_on = [helm_release.calico]

  # The script is re-run whenever one of these inputs changes.
  triggers = {
    calico_version                          = var.calico.version
    pod_network_cidr                        = var.pod_network_cidr
    calico_node_address_autodetection_cidrs = join(",", var.calico_node_address_autodetection_cidrs)
    kubeconfig_path                         = var.kubeconfig_path
  }

  provisioner "local-exec" {
    interpreter = ["/bin/bash", "-lc"]

    # wait_for_resource takes: kind, namespace, name, timeout in seconds.
    # The heredoc body is passed to bash verbatim; it is kept at column 0.
    command = <<EOT
set -euo pipefail
dump_calico_debug() {
kubectl --kubeconfig "${self.triggers.kubeconfig_path}" get nodes -o wide || true
kubectl --kubeconfig "${self.triggers.kubeconfig_path}" get pods -A -o wide || true
for ns in tigera-operator calico-system kube-system; do
kubectl --kubeconfig "${self.triggers.kubeconfig_path}" -n "$ns" get events --sort-by=.lastTimestamp 2>/dev/null | tail -80 || true
done
kubectl --kubeconfig "${self.triggers.kubeconfig_path}" -n tigera-operator describe deployment tigera-operator || true
kubectl --kubeconfig "${self.triggers.kubeconfig_path}" -n tigera-operator logs deployment/tigera-operator --tail=160 || true
kubectl --kubeconfig "${self.triggers.kubeconfig_path}" -n calico-system describe daemonset calico-node || true
kubectl --kubeconfig "${self.triggers.kubeconfig_path}" -n calico-system describe deployment calico-kube-controllers || true
}
wait_for_resource() {
kind="$1"
namespace="$2"
name="$3"
timeout_seconds="$4"
elapsed=0
until kubectl --kubeconfig "${self.triggers.kubeconfig_path}" -n "$namespace" get "$kind/$name" >/dev/null 2>&1; do
if [ "$elapsed" -ge "$timeout_seconds" ]; then
echo "Timed out waiting for $kind/$name in namespace $namespace" >&2
dump_calico_debug
exit 1
fi
sleep 5
elapsed=$((elapsed + 5))
done
}
trap dump_calico_debug ERR
wait_for_resource deployment tigera-operator tigera-operator 300
kubectl --kubeconfig "${self.triggers.kubeconfig_path}" -n tigera-operator rollout status deployment/tigera-operator --timeout=300s
wait_for_resource daemonset calico-system calico-node 600
wait_for_resource deployment calico-system calico-kube-controllers 600
kubectl --kubeconfig "${self.triggers.kubeconfig_path}" -n calico-system rollout status daemonset/calico-node --timeout=600s
kubectl --kubeconfig "${self.triggers.kubeconfig_path}" -n calico-system rollout status deployment/calico-kube-controllers --timeout=600s
kubectl --kubeconfig "${self.triggers.kubeconfig_path}" wait --for=condition=Ready nodes --all --timeout=600s
EOT
  }
}
# Installs the OpenEBS chart once Calico is ready. The LVM, ZFS and Mayastor
# engines as well as the bundled loki and alloy sub-charts are switched off.
resource "helm_release" "openebs" {
  depends_on = [null_resource.calico_ready]

  name       = "openebs"
  chart      = "openebs"
  repository = var.openebs.repository
  version    = var.openebs.version

  namespace        = var.openebs.namespace
  create_namespace = true
  timeout          = 600

  values = [
    yamlencode({
      engines = {
        local = {
          lvm = { enabled = false }
          zfs = { enabled = false }
        }
        replicated = {
          mayastor = { enabled = false }
        }
      }

      # Bundled sub-charts that are not deployed through this release.
      alloy = { enabled = false }
      loki  = { enabled = false }
    })
  ]
}
# Non-default StorageClass backed by the "openebs.io/local" provisioner in
# hostpath mode. Volumes are kept after claim deletion (reclaim policy Retain)
# and are bound only when a consumer appears (WaitForFirstConsumer).
resource "kubernetes_storage_class_v1" "openebs_hostpath_retain" {
  depends_on = [helm_release.openebs]

  metadata {
    name = var.openebs.retain_storage_class

    annotations = {
      "storageclass.kubernetes.io/is-default-class" = "false"
      "openebs.io/cas-type"                         = "local"

      # YAML list of name/value settings: hostpath storage rooted at
      # var.openebs.base_path.
      "cas.openebs.io/config" = yamlencode([
        { name = "StorageType", value = "hostpath" },
        { name = "BasePath", value = var.openebs.base_path },
      ])
    }
  }

  storage_provisioner    = "openebs.io/local"
  volume_binding_mode    = "WaitForFirstConsumer"
  reclaim_policy         = "Retain"
  allow_volume_expansion = true
}
# Namespace that hosts the observability releases (loki, mimir, promtail,
# prometheus-stack). It is created only after the retain storage class exists.
resource "kubernetes_namespace_v1" "monitoring" {
  depends_on = [kubernetes_storage_class_v1.openebs_hostpath_retain]

  metadata {
    name = var.observability.namespace
  }
}
# Installs the argo-cd chart with chart defaults (no values overrides),
# after the OpenEBS release has been applied.
resource "helm_release" "argocd" {
  depends_on = [helm_release.openebs]

  name       = "argocd"
  chart      = "argo-cd"
  repository = var.argocd.repository
  version    = var.argocd.version

  namespace        = var.argocd.namespace
  create_namespace = true
  timeout          = 600
}
# Readiness gate for Argo CD: waits for the applications.argoproj.io CRD to be
# Established and for the repo-server, server and application-controller
# workloads to finish rolling out.
resource "null_resource" "argocd_ready" {
  depends_on = [helm_release.argocd]

  # Re-run the checks when the chart version, namespace or kubeconfig changes.
  triggers = {
    version         = var.argocd.version
    namespace       = var.argocd.namespace
    kubeconfig_path = var.kubeconfig_path
  }

  provisioner "local-exec" {
    interpreter = ["/bin/bash", "-lc"]

    # The heredoc body is passed to bash verbatim; it is kept at column 0.
    command = <<EOT
set -euo pipefail
kubectl --kubeconfig "${self.triggers.kubeconfig_path}" wait --for=condition=Established --timeout=180s crd/applications.argoproj.io
kubectl --kubeconfig "${self.triggers.kubeconfig_path}" -n "${self.triggers.namespace}" rollout status deployment/argocd-repo-server --timeout=300s
kubectl --kubeconfig "${self.triggers.kubeconfig_path}" -n "${self.triggers.namespace}" rollout status deployment/argocd-server --timeout=300s
kubectl --kubeconfig "${self.triggers.kubeconfig_path}" -n "${self.triggers.namespace}" rollout status statefulset/argocd-application-controller --timeout=300s
EOT
  }
}
# Registers the GitOps repository with Argo CD.
#
# Inputs (read back in the script through self.triggers):
#   kubeconfig_path - kubeconfig used for every kubectl call
#   namespace       - Argo CD namespace
#   secret_name     - name of the repository secret to create/update
#   repo_url        - repository URL (http(s)://, ssh://, or scp-like user@host:path)
#   ssh_key_path    - local path of the SSH private key (SSH remotes only)
#
# Behaviour:
#   * http(s) URL: creates/updates a repository secret holding only type and
#     url, labels it argocd.argoproj.io/secret-type=repository, and stops.
#   * SSH URL: scans the host's SSH keys, merges them into the
#     argocd-ssh-known-hosts-cm ConfigMap, then creates/updates the repository
#     secret including sshPrivateKey and labels it.
#
# Fixes relative to the previous script:
#   * A port given as ssh://user@host:PORT/... is now passed to ssh-keyscan
#     (it used to be discarded, so port 22 was always scanned).
#   * An empty scan result aborts with an explicit error instead of silently
#     continuing (or silently aborting) with stderr discarded.
#   * Host names are no longer hashed (-H): hashed entries are salted randomly,
#     so "sort -u" could never de-duplicate them across runs.
#   * Existing and scanned entries are joined with a separator line so an
#     existing value without a trailing newline cannot fuse with a new entry.
#
# Limitation (unchanged): bracketed IPv6 literals in the URL are not parsed.
resource "null_resource" "argocd_private_repo" {
  depends_on = [null_resource.argocd_ready]

  triggers = {
    kubeconfig_path = var.kubeconfig_path
    namespace       = var.argocd.namespace
    secret_name     = var.argocd.repo_secret_name
    repo_url        = var.gitops_repo_url
    ssh_key_path    = var.gitops_ssh_key_path
  }

  provisioner "local-exec" {
    interpreter = ["/bin/bash", "-lc"]

    # Inside the heredoc, "$$" followed by a brace is the Terraform escape that
    # yields a literal shell parameter expansion.
    command = <<EOT
set -euo pipefail
repo_url="${self.triggers.repo_url}"

# HTTP(S) remotes need neither known_hosts handling nor a private key.
case "$${repo_url}" in
  http://*|https://*)
    kubectl --kubeconfig "${self.triggers.kubeconfig_path}" -n "${self.triggers.namespace}" create secret generic "${self.triggers.secret_name}" \
      --from-literal=type=git \
      --from-literal=url="${self.triggers.repo_url}" \
      --dry-run=client -o yaml | kubectl --kubeconfig "${self.triggers.kubeconfig_path}" apply -f -
    kubectl --kubeconfig "${self.triggers.kubeconfig_path}" -n "${self.triggers.namespace}" label secret "${self.triggers.secret_name}" \
      argocd.argoproj.io/secret-type=repository --overwrite
    exit 0
    ;;
esac

# Reduce the URL to host[:suffix]: drop the scheme, the user part and the path.
repo_target="$${repo_url#ssh://}"
repo_target="$${repo_target#*@}"
repo_target="$${repo_target%%/*}"
repo_host="$${repo_target%%:*}"
if [ -z "$${repo_host}" ]; then
  echo "Could not determine GitOps SSH host from $${repo_url}" >&2
  exit 1
fi

# Only ssh:// URLs carry a port after the colon; in scp-like syntax
# (user@host:org/repo) the colon introduces the path instead.
repo_port=""
case "$${repo_url}" in
  ssh://*)
    case "$${repo_target}" in
      *:*) repo_port="$${repo_target##*:}" ;;
    esac
    ;;
esac

known_hosts_file="$(mktemp)"
known_hosts_scanned="$(mktemp)"
known_hosts_sorted="$(mktemp)"
trap 'rm -f "$${known_hosts_file}" "$${known_hosts_scanned}" "$${known_hosts_sorted}"' EXIT

# Start from the entries already stored in the cluster (best effort).
kubectl --kubeconfig "${self.triggers.kubeconfig_path}" -n "${self.triggers.namespace}" get configmap argocd-ssh-known-hosts-cm \
  -o jsonpath='{.data.ssh_known_hosts}' > "$${known_hosts_file}" 2>/dev/null || true

# Scan into a separate file so an empty result can be detected. The exit
# status is ignored on purpose: the emptiness check below is authoritative.
ssh-keyscan -p "$${repo_port:-22}" "$${repo_host}" > "$${known_hosts_scanned}" 2>/dev/null || true
if [ ! -s "$${known_hosts_scanned}" ]; then
  echo "ssh-keyscan returned no host keys for $${repo_host} (port $${repo_port:-22})" >&2
  exit 1
fi

# Merge, drop empty lines, and de-duplicate.
{ cat "$${known_hosts_file}"; echo; cat "$${known_hosts_scanned}"; } | sed '/^$/d' | sort -u > "$${known_hosts_sorted}"

kubectl --kubeconfig "${self.triggers.kubeconfig_path}" -n "${self.triggers.namespace}" create configmap argocd-ssh-known-hosts-cm \
  --from-file=ssh_known_hosts="$${known_hosts_sorted}" \
  --dry-run=client -o yaml | kubectl --kubeconfig "${self.triggers.kubeconfig_path}" apply -f -

kubectl --kubeconfig "${self.triggers.kubeconfig_path}" -n "${self.triggers.namespace}" create secret generic "${self.triggers.secret_name}" \
  --from-literal=type=git \
  --from-literal=url="${self.triggers.repo_url}" \
  --from-file=sshPrivateKey="${self.triggers.ssh_key_path}" \
  --dry-run=client -o yaml | kubectl --kubeconfig "${self.triggers.kubeconfig_path}" apply -f -
kubectl --kubeconfig "${self.triggers.kubeconfig_path}" -n "${self.triggers.namespace}" label secret "${self.triggers.secret_name}" \
  argocd.argoproj.io/secret-type=repository --overwrite
EOT
  }
}
# Loki in SingleBinary mode with filesystem storage, one replica, and a
# persistent volume on the retain storage class. The read/write/backend
# targets are scaled to zero and gateway, caches, canary and chart test are
# disabled.
resource "helm_release" "loki" {
  depends_on = [kubernetes_namespace_v1.monitoring]

  name       = "loki"
  chart      = var.observability.loki.chart
  repository = var.observability.loki.repository
  version    = var.observability.loki.version

  # The namespace is managed by kubernetes_namespace_v1.monitoring.
  namespace        = var.observability.namespace
  create_namespace = false

  wait    = true
  timeout = 900

  values = [
    yamlencode({
      deploymentMode = "SingleBinary"

      loki = {
        auth_enabled = false
        commonConfig = { replication_factor = 1 }
        storage      = { type = "filesystem" }

        schemaConfig = {
          configs = [
            {
              from         = "2024-04-01"
              store        = "tsdb"
              object_store = "filesystem"
              schema       = "v13"
              index = {
                prefix = "loki_index_"
                period = "24h"
              }
            },
          ]
        }

        # Retention: the period comes from a variable and the compactor is
        # configured with retention enabled.
        limits_config = {
          retention_period = var.observability.loki.retention_period
        }
        compactor = {
          retention_enabled    = true
          delete_request_store = "filesystem"
          working_directory    = "/var/loki/compactor"
        }
      }

      singleBinary = {
        replicas = 1
        affinity = {}

        persistence = {
          enabled                        = true
          storageClass                   = var.openebs.retain_storage_class
          size                           = var.observability.loki.storage_size
          whenScaled                     = "Retain"
          whenDeleted                    = "Retain"
          enableStatefulSetAutoDeletePVC = false
        }

        resources = {
          requests = { cpu = "50m", memory = "256Mi" }
          limits   = { memory = "768Mi" }
        }
      }

      # Targets that are scaled to zero in this deployment mode.
      read    = { replicas = 0 }
      write   = { replicas = 0 }
      backend = { replicas = 0 }

      # Optional components that are switched off.
      gateway      = { enabled = false }
      chunksCache  = { enabled = false }
      resultsCache = { enabled = false }
      lokiCanary   = { enabled = false }
      test         = { enabled = false }
    })
  ]
}
# Mimir with a single replica per component, multitenancy and zone-aware
# replication disabled, and every persistent volume placed on the retain
# storage class. The nginx front end stays at one replica; gateway and
# rollout_operator are disabled.
resource "helm_release" "mimir" {
  depends_on = [kubernetes_namespace_v1.monitoring]

  name       = "mimir"
  chart      = var.observability.mimir.chart
  repository = var.observability.mimir.repository
  version    = var.observability.mimir.version

  # The namespace is managed by kubernetes_namespace_v1.monitoring.
  namespace        = var.observability.namespace
  create_namespace = false

  wait    = true
  timeout = 1200

  values = [
    yamlencode({
      mimir = {
        structuredConfig = {
          multitenancy_enabled = false
          ingester = {
            ring = { replication_factor = 1 }
          }
        }
      }

      # --- Components with persistent volumes ---
      alertmanager = {
        persistentVolume = {
          storageClass = var.openebs.retain_storage_class
          size         = var.observability.mimir.alertmanager_storage_size
        }
        zoneAwareReplication = { enabled = false }
      }

      ingester = {
        replicas = 1
        persistentVolume = {
          storageClass = var.openebs.retain_storage_class
          size         = var.observability.mimir.ingester_storage_size
        }
        resources = {
          requests = { cpu = "100m", memory = "512Mi" }
          limits   = { memory = "1Gi" }
        }
        zoneAwareReplication = { enabled = false }
      }

      store_gateway = {
        replicas = 1
        persistentVolume = {
          storageClass = var.openebs.retain_storage_class
          size         = var.observability.mimir.store_gateway_storage_size
        }
        zoneAwareReplication = { enabled = false }
      }

      compactor = {
        replicas = 1
        persistentVolume = {
          storageClass = var.openebs.retain_storage_class
          size         = var.observability.mimir.compactor_storage_size
        }
      }

      minio = {
        persistence = {
          storageClass = var.openebs.retain_storage_class
          size         = var.observability.mimir.minio_storage_size
        }
        resources = {
          requests = { cpu = "50m", memory = "128Mi" }
          limits   = { memory = "512Mi" }
        }
      }

      # --- Components configured with a replica count only ---
      distributor     = { replicas = 1 }
      querier         = { replicas = 1 }
      query_frontend  = { replicas = 1 }
      query_scheduler = { replicas = 1 }
      ruler           = { replicas = 1 }
      nginx           = { replicas = 1 }

      # --- Disabled components ---
      gateway          = { enabled = false }
      rollout_operator = { enabled = false }
    })
  ]
}
# Promtail, configured with a single client that pushes to the in-cluster
# "loki" service on port 3100 in the observability namespace.
resource "helm_release" "promtail" {
  depends_on = [helm_release.loki]

  name       = "promtail"
  chart      = var.observability.promtail.chart
  repository = var.observability.promtail.repository
  version    = var.observability.promtail.version

  # The namespace is managed by kubernetes_namespace_v1.monitoring.
  namespace        = var.observability.namespace
  create_namespace = false

  wait    = true
  timeout = 600

  values = [
    yamlencode({
      config = {
        clients = [
          { url = "http://loki.${var.observability.namespace}.svc:3100/loki/api/v1/push" },
        ]
      }

      resources = {
        requests = { cpu = "25m", memory = "64Mi" }
        limits   = { memory = "128Mi" }
      }
    })
  ]
}
# Prometheus stack (operator, Prometheus, Alertmanager, Grafana) installed
# after Loki and Mimir. Persistent data lives on the retain storage class,
# Prometheus can optionally remote-write to Mimir, and Grafana receives Loki
# and Mimir as additional (non-default) data sources.
resource "helm_release" "prometheus_stack" {
  depends_on = [helm_release.loki, helm_release.mimir]

  name       = "prometheus-stack"
  chart      = var.observability.prometheus.chart
  repository = var.observability.prometheus.repository
  version    = var.observability.prometheus.version

  # The namespace is managed by kubernetes_namespace_v1.monitoring.
  namespace        = var.observability.namespace
  create_namespace = false

  wait    = true
  timeout = 1200

  values = [
    yamlencode({
      # Control-plane component monitors that are switched off.
      kubeControllerManager = { enabled = false }
      kubeEtcd              = { enabled = false }
      kubeProxy             = { enabled = false }
      kubeScheduler         = { enabled = false }

      prometheusOperator = {
        tls               = { enabled = false }
        admissionWebhooks = { enabled = false }
        resources = {
          requests = { cpu = "50m", memory = "128Mi" }
          limits   = { memory = "384Mi" }
        }
      }

      alertmanager = {
        alertmanagerSpec = {
          storage = {
            volumeClaimTemplate = {
              spec = {
                storageClassName = var.openebs.retain_storage_class
                accessModes      = ["ReadWriteOnce"]
                resources = {
                  requests = {
                    storage = var.observability.prometheus.alertmanager_storage_size
                  }
                }
              }
            }
          }
        }
      }

      prometheus = {
        prometheusSpec = {
          retention = var.observability.prometheus.retention

          resources = {
            requests = { cpu = "100m", memory = "512Mi" }
            limits   = { memory = "1Gi" }
          }

          # One remote-write target (the mimir-nginx service) when enabled,
          # otherwise an empty list.
          remoteWrite = var.observability.prometheus.remote_write_mimir_enabled ? [
            { url = "http://mimir-nginx.${var.observability.namespace}.svc/api/v1/push" }
          ] : []

          storageSpec = {
            volumeClaimTemplate = {
              spec = {
                storageClassName = var.openebs.retain_storage_class
                accessModes      = ["ReadWriteOnce"]
                resources = {
                  requests = {
                    storage = var.observability.prometheus.storage_size
                  }
                }
              }
            }
          }
        }
      }

      grafana = {
        persistence = {
          enabled          = true
          type             = "sts"
          storageClassName = var.openebs.retain_storage_class
          accessModes      = ["ReadWriteOnce"]
          size             = var.observability.prometheus.grafana_storage_size
        }

        additionalDataSources = [
          {
            name      = "Loki"
            type      = "loki"
            access    = "proxy"
            url       = "http://loki.${var.observability.namespace}.svc:3100"
            isDefault = false
          },
          {
            name      = "Mimir"
            type      = "prometheus"
            access    = "proxy"
            url       = "http://mimir-nginx.${var.observability.namespace}.svc/prometheus"
            isDefault = false
          },
        ]

        resources = {
          requests = { cpu = "50m", memory = "128Mi" }
          limits   = { memory = "384Mi" }
        }
      }
    })
  ]
}
# Generic, data-driven Helm releases: one release per entry of
# var.extra_helm_releases, keyed (and named) by the map key. All of them are
# installed after Calico is ready.
resource "helm_release" "extra_tools" {
  for_each   = var.extra_helm_releases
  depends_on = [null_resource.calico_ready]

  name       = each.key
  chart      = each.value.chart
  repository = each.value.repository

  # An empty version string means "no version constraint" (null).
  version = each.value.version == "" ? null : each.value.version

  namespace        = each.value.namespace
  create_namespace = each.value.create_namespace
  timeout          = each.value.timeout

  # An empty values_yaml contributes no values document at all.
  values = each.value.values_yaml == "" ? [] : [each.value.values_yaml]

  # One "set" block per key/value pair in set_values.
  dynamic "set" {
    for_each = each.value.set_values

    content {
      name  = set.key
      value = set.value
    }
  }
}