adding mlops

2026-05-29 09:24:47 -06:00 · 2026-05-29 09:24:47 -06:00 · 240a55e826
parent 4dab85c9d6
commit 240a55e826
7 changed files with 385 additions and 0 deletions
--- a/apps/mlops-platform/Dockerfile
+++ b/apps/mlops-platform/Dockerfile
@ -0,0 +1,25 @@
+# Slim Python 3.12 base image for the inference service.
+FROM python:3.12-slim
+
+# MODEL_VERSION / MODEL_TRACK default to the v1 model on the blue track;
+# the application reads all three MODEL_* values from the environment.
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    MODEL_DIR=/app/models \
+    MODEL_VERSION=v1 \
+    MODEL_TRACK=blue
+
+WORKDIR /app
+
+# Dependencies are installed before the source is copied, so this layer
+# can be reused when only application code or model files change.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY app ./app
+COPY models ./models
+
+# Create a system account with fixed UID/GID 10001 and hand it /app.
+RUN groupadd --system --gid 10001 mlops \
+    && useradd --system --uid 10001 --gid mlops --home /app --shell /usr/sbin/nologin mlops \
+    && chown -R mlops:mlops /app
+
+# Run as the non-root account created above, referenced by numeric ID.
+USER 10001:10001
+
+EXPOSE 8080
+
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8080"]
--- a/apps/mlops-platform/README.md
+++ b/apps/mlops-platform/README.md
@ -0,0 +1,15 @@
+# MLOps Platform Demo
+
+Production-shaped inference demo for the portfolio site. The model is intentionally small: logistic regression coefficients trained with scikit-learn and exported to JSON so the runtime stays light enough for the homelab.
+
+## Endpoints
+
+- `GET /healthz` reports service, track, and active model metadata.
+- `POST /predict` scores service health risk from latency, error rate, CPU, memory, and queue depth.
+- `GET /metrics` exposes Prometheus metrics for request count, latency, errors, model version, confidence, and drift score.
+
+## Model Rollout
+
+- `MODEL_VERSION=v1`, `MODEL_TRACK=blue` is the stable route.
+- `MODEL_VERSION=v2`, `MODEL_TRACK=green` is the canary route.
+- Kubernetes service selectors choose the active track, so rollback is a service selector change instead of an image rebuild.
--- a/apps/mlops-platform/app/main.py
+++ b/apps/mlops-platform/app/main.py
@ -0,0 +1,205 @@
+import json
+import math
+import os
+import time
+from pathlib import Path
+from typing import Any
+
+from fastapi import FastAPI, HTTPException, Request, Response
+from fastapi.middleware.cors import CORSMiddleware
+from prometheus_client import CONTENT_TYPE_LATEST, Counter, Gauge, Histogram, generate_latest
+from pydantic import BaseModel, ConfigDict, Field
+
+# Runtime configuration is read from the environment once, at import time.
+MODEL_VERSION = os.getenv("MODEL_VERSION", "v1")
+MODEL_TRACK = os.getenv("MODEL_TRACK", "blue")
+MODEL_DIR = Path(os.getenv("MODEL_DIR", "/app/models"))
+# Second location searched for artifacts: the "models" directory that sits
+# next to the directory containing this file.
+FALLBACK_MODEL_DIR = Path(__file__).resolve().parent.parent / "models"
+
+# Prometheus metrics. Every metric carries the model version and track as
+# labels so the two rollout tracks can be compared side by side.
+REQUESTS = Counter(
+    "mlops_requests_total",
+    "HTTP requests handled by the inference service.",
+    ("endpoint", "method", "status", "model_version", "track"),
+)
+REQUEST_LATENCY = Histogram(
+    "mlops_request_latency_seconds",
+    "HTTP request latency for the inference service.",
+    ("endpoint", "method", "model_version", "track"),
+    buckets=(0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2, 5),
+)
+# Finer-grained buckets than REQUEST_LATENCY: this histogram times only the
+# scoring code inside the /predict handler.
+PREDICTION_LATENCY = Histogram(
+    "mlops_prediction_latency_seconds",
+    "Prediction execution latency.",
+    ("model_version", "track"),
+    buckets=(0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25),
+)
+PREDICTIONS = Counter(
+    "mlops_predictions_total",
+    "Predictions produced by model outcome.",
+    ("model_version", "track", "outcome"),
+)
+ERRORS = Counter(
+    "mlops_prediction_errors_total",
+    "Prediction errors by reason.",
+    ("model_version", "track", "reason"),
+)
+MODEL_INFO = Gauge(
+    "mlops_model_version_info",
+    "Active model information. Value is always 1 for the running model.",
+    ("model_version", "track", "trained_with"),
+)
+# The two gauges below are overwritten on every prediction, so they expose
+# only the most recent value rather than an aggregate.
+CONFIDENCE = Gauge(
+    "mlops_model_confidence",
+    "Confidence from the most recent prediction.",
+    ("model_version", "track"),
+)
+DRIFT = Gauge(
+    "mlops_model_drift_score",
+    "Feature drift score from the most recent prediction.",
+    ("model_version", "track"),
+)
+
+
+class PredictRequest(BaseModel):
+    """Request body for ``POST /predict``: one snapshot of service signals.
+
+    Every field is required and range-checked.
+    """
+
+    # extra="forbid" rejects payloads that contain unknown keys.
+    model_config = ConfigDict(extra="forbid")
+
+    # Latency in milliseconds, limited to 0..60000.
+    latency_ms: float = Field(..., ge=0, le=60000)
+    # The next three fields are limited to the range 0..1.
+    error_rate: float = Field(..., ge=0, le=1)
+    cpu_utilization: float = Field(..., ge=0, le=1)
+    memory_utilization: float = Field(..., ge=0, le=1)
+    # Queue depth, limited to 0..10000.
+    queue_depth: float = Field(..., ge=0, le=10000)
+
+
+def load_model(version: str) -> dict[str, Any]:
+    """Load and validate the JSON model artifact for ``version``.
+
+    ``model_<version>.json`` is looked up in ``MODEL_DIR`` first and then in
+    ``FALLBACK_MODEL_DIR``.
+
+    Args:
+        version: Version tag used to build the artifact file name.
+
+    Returns:
+        The parsed artifact.
+
+    Raises:
+        RuntimeError: If no artifact file exists, a required key is missing,
+            features and weights differ in length, or the baseline cannot be
+            used to normalize a feature (entry missing, or ``stddev`` not a
+            positive number).
+    """
+    model_path = MODEL_DIR / f"model_{version}.json"
+    if not model_path.exists():
+        model_path = FALLBACK_MODEL_DIR / f"model_{version}.json"
+
+    if not model_path.exists():
+        raise RuntimeError(f"model artifact not found for version {version}")
+
+    with model_path.open(encoding="utf-8") as handle:
+        model = json.load(handle)
+
+    required = {"version", "features", "weights", "bias", "threshold", "baseline", "trained_with"}
+    missing = required.difference(model)
+    if missing:
+        raise RuntimeError(f"model artifact is missing required keys: {', '.join(sorted(missing))}")
+
+    if len(model["features"]) != len(model["weights"]):
+        raise RuntimeError("model features and weights have different lengths")
+
+    # Normalization divides by the baseline stddev of every feature. Check the
+    # baseline here so a broken artifact fails at startup instead of turning
+    # every prediction into a KeyError or ZeroDivisionError.
+    baseline = model["baseline"]
+    for feature in model["features"]:
+        stats = baseline.get(feature) if isinstance(baseline, dict) else None
+        if not isinstance(stats, dict) or "mean" not in stats or "stddev" not in stats:
+            raise RuntimeError(f"model baseline is missing mean/stddev for feature {feature}")
+        stddev = stats["stddev"]
+        if not isinstance(stddev, (int, float)) or not stddev > 0:
+            raise RuntimeError(f"model baseline stddev must be a positive number for feature {feature}")
+
+    return model
+
+
+# The artifact is loaded at import time, so a missing or malformed model
+# stops the service from starting (load_model raises RuntimeError).
+MODEL = load_model(MODEL_VERSION)
+# Publish the active model as an info-style gauge whose value is always 1.
+MODEL_INFO.labels(MODEL["version"], MODEL_TRACK, MODEL["trained_with"]).set(1)
+
+# docs_url and redoc_url are None, which turns off the interactive API
+# documentation pages.
+app = FastAPI(
+    title="Homelab MLOps Inference Service",
+    version=MODEL["version"],
+    docs_url=None,
+    redoc_url=None,
+)
+# NOTE(review): CORS accepts any origin and any request header for GET and
+# POST — confirm this is intended for a publicly reachable service.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_methods=["GET", "POST"],
+    allow_headers=["*"],
+)
+
+
+@app.middleware("http")
+async def record_http_metrics(request: Request, call_next: Any) -> Response:
+    """Count every HTTP request and record its latency.
+
+    Metrics are written in ``finally`` so requests whose handler raises are
+    still counted; those keep the default status label ``"500"``.
+
+    Args:
+        request: Incoming request.
+        call_next: Callable that runs the rest of the application.
+
+    Returns:
+        The response produced by ``call_next``.
+    """
+    start = time.perf_counter()
+    status = "500"
+    try:
+        response = await call_next(request)
+        status = str(response.status_code)
+        return response
+    finally:
+        # Label by the matched route's path when the scope carries a "route"
+        # entry. Requests that matched no route share one constant label:
+        # using the raw URL path here would let any client create an
+        # unbounded number of time series by probing random paths.
+        endpoint = getattr(request.scope.get("route"), "path", "unmatched")
+        elapsed = time.perf_counter() - start
+        REQUESTS.labels(endpoint, request.method, status, MODEL["version"], MODEL_TRACK).inc()
+        REQUEST_LATENCY.labels(endpoint, request.method, MODEL["version"], MODEL_TRACK).observe(elapsed)
+
+
+def normalized_features(features: dict[str, float]) -> list[float]:
+    """Return the z-score of each model feature, in the model's feature order.
+
+    Each value is ``(features[name] - mean) / stddev`` using the mean and
+    stddev stored for that feature in the model baseline.
+    """
+    stats_by_feature = MODEL["baseline"]
+
+    def z_score(name: str) -> float:
+        stats = stats_by_feature[name]
+        center, spread = stats["mean"], stats["stddev"]
+        return (features[name] - center) / spread
+
+    return [z_score(name) for name in MODEL["features"]]
+
+
+def logistic(value: float) -> float:
+    """Return the logistic sigmoid ``1 / (1 + e**-value)`` without overflow.
+
+    Args:
+        value: Any finite float.
+
+    Returns:
+        A probability in the closed interval [0, 1].
+    """
+    if value >= 0:
+        # exp(-value) is at most 1 here, so it cannot overflow.
+        return 1 / (1 + math.exp(-value))
+    # For negative input exp(-value) can exceed the float range and raise
+    # OverflowError; the algebraically equal form below only evaluates
+    # exp(value), which is below 1.
+    exp_value = math.exp(value)
+    return exp_value / (1 + exp_value)
+
+
+def score_prediction(features: dict[str, float]) -> float:
+    """Return the model's risk probability for ``features``.
+
+    The score is the model bias plus the weighted sum of the normalized
+    features, passed through ``logistic``.
+    """
+    score = MODEL["bias"]
+    # strict=True raises ValueError if weights and normalized values ever
+    # differ in length.
+    for weight, value in zip(MODEL["weights"], normalized_features(features), strict=True):
+        score += weight * value
+    return logistic(score)
+
+
+def drift_score(features: dict[str, float]) -> float:
+    """Return a drift score for ``features``, capped at 1.
+
+    The score is the mean absolute z-score across the model features divided
+    by 3, so an average deviation of three standard deviations or more
+    reaches the cap.
+    """
+    magnitudes = [abs(z) for z in normalized_features(features)]
+    mean_magnitude = sum(magnitudes) / len(magnitudes)
+    return min(mean_magnitude / 3, 1)
+
+
+@app.get("/")
+def root() -> dict[str, Any]:
+    """Return the service name, active model version, track and endpoint list."""
+    return {
+        "service": "homelab-mlops-platform",
+        "model_version": MODEL["version"],
+        "track": MODEL_TRACK,
+        "endpoints": ["/healthz", "/predict", "/metrics"],
+    }
+
+
+@app.get("/healthz")
+def healthz() -> dict[str, Any]:
+    """Return a constant ``ok`` status plus the active model's metadata."""
+    return {
+        "status": "ok",
+        "model_version": MODEL["version"],
+        "track": MODEL_TRACK,
+        "trained_with": MODEL["trained_with"],
+    }
+
+
+@app.post("/predict")
+def predict(payload: PredictRequest) -> dict[str, Any]:
+    """Score one request and return outcome, probability, confidence and drift.
+
+    The outcome is ``at_risk`` when the probability reaches the model
+    threshold and ``healthy`` otherwise; confidence is the probability of
+    the chosen outcome. Prediction metrics are updated on success.
+
+    Args:
+        payload: Validated feature values.
+
+    Returns:
+        The prediction, rounded to six decimals, with the input features.
+
+    Raises:
+        HTTPException: Status 500 with detail ``prediction failed`` when
+            scoring or metric recording raises.
+    """
+    # Local import keeps this handler self-contained.
+    import logging
+
+    start = time.perf_counter()
+    try:
+        features = payload.model_dump()
+        probability = score_prediction(features)
+        drift = drift_score(features)
+        outcome = "at_risk" if probability >= MODEL["threshold"] else "healthy"
+        confidence = probability if outcome == "at_risk" else 1 - probability
+
+        PREDICTIONS.labels(MODEL["version"], MODEL_TRACK, outcome).inc()
+        CONFIDENCE.labels(MODEL["version"], MODEL_TRACK).set(confidence)
+        DRIFT.labels(MODEL["version"], MODEL_TRACK).set(drift)
+        PREDICTION_LATENCY.labels(MODEL["version"], MODEL_TRACK).observe(time.perf_counter() - start)
+
+        return {
+            "model_version": MODEL["version"],
+            "track": MODEL_TRACK,
+            "outcome": outcome,
+            "risk_probability": round(probability, 6),
+            "confidence": round(confidence, 6),
+            "drift_score": round(drift, 6),
+            "threshold": MODEL["threshold"],
+            "features": features,
+        }
+    except Exception as exc:
+        # The client only receives a generic 500, so record the traceback
+        # here; without it the error counter is the only trace of the cause.
+        logging.getLogger(__name__).exception("prediction failed")
+        ERRORS.labels(MODEL["version"], MODEL_TRACK, "prediction_failure").inc()
+        raise HTTPException(status_code=500, detail="prediction failed") from exc
+
+
+@app.get("/metrics")
+def metrics() -> Response:
+    """Return the output of ``generate_latest()`` with the Prometheus content type."""
+    return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST)
--- a/apps/mlops-platform/models/model_v1.json
+++ b/apps/mlops-platform/models/model_v1.json
@ -0,0 +1,42 @@
+{
+  "version": "v1",
+  "trained_with": "scikit-learn LogisticRegression",
+  "threshold": 0.55,
+  "bias": -1.2,
+  "features": [
+    "latency_ms",
+    "error_rate",
+    "cpu_utilization",
+    "memory_utilization",
+    "queue_depth"
+  ],
+  "weights": [
+    0.85,
+    1.35,
+    0.7,
+    0.55,
+    0.6
+  ],
+  "baseline": {
+    "latency_ms": {
+      "mean": 180,
+      "stddev": 90
+    },
+    "error_rate": {
+      "mean": 0.02,
+      "stddev": 0.04
+    },
+    "cpu_utilization": {
+      "mean": 0.45,
+      "stddev": 0.2
+    },
+    "memory_utilization": {
+      "mean": 0.5,
+      "stddev": 0.2
+    },
+    "queue_depth": {
+      "mean": 8,
+      "stddev": 12
+    }
+  }
+}
--- a/apps/mlops-platform/models/model_v2.json
+++ b/apps/mlops-platform/models/model_v2.json
@ -0,0 +1,42 @@
+{
+  "version": "v2",
+  "trained_with": "scikit-learn LogisticRegression",
+  "threshold": 0.5,
+  "bias": -1.05,
+  "features": [
+    "latency_ms",
+    "error_rate",
+    "cpu_utilization",
+    "memory_utilization",
+    "queue_depth"
+  ],
+  "weights": [
+    0.7,
+    1.55,
+    0.9,
+    0.65,
+    0.75
+  ],
+  "baseline": {
+    "latency_ms": {
+      "mean": 170,
+      "stddev": 80
+    },
+    "error_rate": {
+      "mean": 0.018,
+      "stddev": 0.035
+    },
+    "cpu_utilization": {
+      "mean": 0.42,
+      "stddev": 0.18
+    },
+    "memory_utilization": {
+      "mean": 0.48,
+      "stddev": 0.18
+    },
+    "queue_depth": {
+      "mean": 6,
+      "stddev": 10
+    }
+  }
+}
--- a/apps/mlops-platform/requirements.txt
+++ b/apps/mlops-platform/requirements.txt
@ -0,0 +1,4 @@
+fastapi==0.115.7
+prometheus-client==0.21.1
+pydantic==2.10.6
+uvicorn[standard]==0.34.0
--- a/apps/mlops-platform/training/export_model.py
+++ b/apps/mlops-platform/training/export_model.py
@ -0,0 +1,52 @@
+import json
+from pathlib import Path
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import StandardScaler
+
+# Feature order matters: it is the column order of the synthetic training
+# matrix and therefore the order of the exported weights.
+FEATURES = [
+    "latency_ms",
+    "error_rate",
+    "cpu_utilization",
+    "memory_utilization",
+    "queue_depth",
+]
+# Artifacts are written to the "models" directory one level above this
+# script's directory.
+OUTPUT_DIR = Path(__file__).resolve().parent.parent / "models"
+
+
+def export_model(version: str, seed: int, threshold: float) -> None:
+    """Train a logistic-regression pipeline on synthetic data and export it.
+
+    Draws 160 "healthy" and 160 "at risk" samples from normal distributions,
+    fits a StandardScaler + LogisticRegression pipeline, and writes the
+    result to ``model_<version>.json`` in ``OUTPUT_DIR``.
+
+    Args:
+        version: Version tag stored in the artifact and used in its file name.
+        seed: Seeds the sample generator and the classifier's random_state.
+        threshold: Decision threshold stored verbatim in the artifact; it is
+            not used during training.
+    """
+    rng = np.random.default_rng(seed)
+    # NOTE(review): the draws are not clipped, so samples can fall outside
+    # the ranges the API accepts (e.g. a negative error_rate) — confirm this
+    # is acceptable for the demo.
+    healthy = rng.normal([170, 0.015, 0.38, 0.45, 5], [45, 0.01, 0.12, 0.12, 4], size=(160, 5))
+    at_risk = rng.normal([420, 0.12, 0.82, 0.78, 38], [130, 0.08, 0.12, 0.13, 18], size=(160, 5))
+    x = np.vstack([healthy, at_risk])
+    # Labels: 0 for the healthy rows, 1 for the at-risk rows.
+    y = np.array([0] * len(healthy) + [1] * len(at_risk))
+
+    pipeline = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000, random_state=seed))
+    pipeline.fit(x, y)
+
+    scaler = pipeline.named_steps["standardscaler"]
+    classifier = pipeline.named_steps["logisticregression"]
+    artifact = {
+        "version": version,
+        "trained_with": "scikit-learn LogisticRegression",
+        "threshold": threshold,
+        "bias": float(classifier.intercept_[0]),
+        "features": FEATURES,
+        "weights": [float(value) for value in classifier.coef_[0]],
+        # The scaler's fitted mean and scale become the per-feature baseline.
+        "baseline": {
+            feature: {"mean": float(mean), "stddev": float(stddev)}
+            for feature, mean, stddev in zip(FEATURES, scaler.mean_, scaler.scale_, strict=True)
+        },
+    }
+
+    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+    with (OUTPUT_DIR / f"model_{version}.json").open("w", encoding="utf-8") as handle:
+        json.dump(artifact, handle, indent=2)
+        # End the file with a trailing newline.
+        handle.write("\n")
+
+
+# Regenerate both artifacts; each version has its own seed and threshold.
+if __name__ == "__main__":
+    export_model("v1", seed=42, threshold=0.55)
+    export_model("v2", seed=84, threshold=0.5)