From 51145454c767390aafb0d6d801093bdbcd6d81a0 Mon Sep 17 00:00:00 2001
From: Sawyer
Date: Mon, 4 Nov 2024 13:31:02 -0500
Subject: [PATCH] Add Kubernetes autoscaler with configuration and state
 management

---
 k8s-autoscaler/k8s_autoscaler/api/__init__.py |   0
 k8s-autoscaler/k8s_autoscaler/api/routes.py   | 135 ++++++++++
 k8s-autoscaler/k8s_autoscaler/config.py       |  47 ++++
 k8s-autoscaler/k8s_autoscaler/dependencies.py |  26 ++
 k8s-autoscaler/k8s_autoscaler/kubernetes.py   |  71 +++++
 k8s-autoscaler/k8s_autoscaler/main.py         | 254 +++---------------
 k8s-autoscaler/k8s_autoscaler/types.py        |  24 ++
 k8s-autoscaler/k8s_autoscaler/vllm.py         |  73 +++++
 k8s-autoscaler/poetry.lock                    |  36 ++-
 k8s-autoscaler/pyproject.toml                 |   1 +
 10 files changed, 443 insertions(+), 224 deletions(-)
 create mode 100644 k8s-autoscaler/k8s_autoscaler/api/__init__.py
 create mode 100644 k8s-autoscaler/k8s_autoscaler/api/routes.py
 create mode 100644 k8s-autoscaler/k8s_autoscaler/config.py
 create mode 100644 k8s-autoscaler/k8s_autoscaler/dependencies.py
 create mode 100644 k8s-autoscaler/k8s_autoscaler/kubernetes.py
 create mode 100644 k8s-autoscaler/k8s_autoscaler/types.py
 create mode 100644 k8s-autoscaler/k8s_autoscaler/vllm.py

diff --git a/k8s-autoscaler/k8s_autoscaler/api/__init__.py b/k8s-autoscaler/k8s_autoscaler/api/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/k8s-autoscaler/k8s_autoscaler/api/routes.py b/k8s-autoscaler/k8s_autoscaler/api/routes.py
new file mode 100644
index 0000000..4f7437a
--- /dev/null
+++ b/k8s-autoscaler/k8s_autoscaler/api/routes.py
@@ -0,0 +1,135 @@
+from fastapi import (
+    APIRouter,
+    Request,
+    Response,
+    BackgroundTasks,
+    HTTPException,
+    status,
+    Depends,
+)
+from fastapi.responses import StreamingResponse, JSONResponse
+import httpx
+import logging
+import time
+from typing import AsyncGenerator
+from ..types import AutoscalerState, PodPhase
+from ..config import Settings
+from ..kubernetes import KubeCommand
+from ..vllm import VLLMManager
+from ..dependencies import get_settings, get_state, get_kube, get_vllm_manager
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter()
+
+
+async def stream_response(response: httpx.Response) -> AsyncGenerator[bytes, None]:
+    """Stream response content."""
+    try:
+        async for chunk in response.aiter_bytes():
+            yield chunk
+    except httpx.HTTPError as e:
+        logger.error(f"Error streaming response: {e}")
+        raise HTTPException(status_code=502, detail="Error streaming from vLLM service")
+
+
+@router.get("/health")
+async def health_check(
+    kube: KubeCommand = Depends(get_kube), state: AutoscalerState = Depends(get_state)
+):
+    """Health check endpoint."""
+    phase = await kube.get_pod_phase()
+    current_replicas, desired_replicas = await kube.get_replicas()
+    return {
+        "status": "healthy",
+        "vllm_status": phase,
+        "vllm_running": phase == PodPhase.RUNNING,
+        "current_replicas": current_replicas,
+        "desired_replicas": desired_replicas,
+        "last_activity": time.strftime(
+            "%Y-%m-%d %H:%M:%S", time.localtime(state.last_activity)
+        ),
+    }
+
+
+@router.post("/scale/{replicas}")
+async def scale(
+    replicas: int,
+    background_tasks: BackgroundTasks,
+    kube: KubeCommand = Depends(get_kube),
+    vllm_manager: VLLMManager = Depends(get_vllm_manager),
+) -> JSONResponse:
+    """Manually scale the vLLM deployment."""
+    if replicas < 0:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail="Replica count must be non-negative",
+        )
+
+    if not await kube.scale_deployment(replicas):
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail="Failed to scale deployment",
+        )
+
+    if replicas > 0:
+        vllm_manager.reset_inactivity_timer(background_tasks)
+
+    return JSONResponse(
+        status_code=status.HTTP_200_OK,
+        content={"message": f"Scaling deployment to {replicas} replicas"},
+    )
+
+
+@router.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE"])
+async def proxy_request(
+    request: Request,
+    path: str,
+    background_tasks: BackgroundTasks,
+    settings: Settings = Depends(get_settings),
+    state: AutoscalerState = Depends(get_state),
+    vllm_manager: VLLMManager = Depends(get_vllm_manager),
+) -> StreamingResponse:
+    """Proxy requests to vLLM service, handling activation as needed."""
+    try:
+        if not await vllm_manager.ensure_running():
+            raise HTTPException(
+                status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+                detail=f"vLLM service activation failed after {settings.activation_timeout}s",
+            )
+
+        vllm_manager.reset_inactivity_timer(background_tasks)
+
+        if not state.http_client:
+            raise HTTPException(
+                status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+                detail="HTTP client not initialized",
+            )
+
+        # Forward the request to vLLM
+        url = f"{settings.vllm_url_base}/{path}"
+        headers = dict(request.headers)
+        headers.pop("host", None)  # Remove host header to avoid conflicts
+
+        vllm_response = await state.http_client.request(
+            method=request.method,
+            url=url,
+            headers=headers,
+            content=await request.body(),
+            params=request.query_params,
+        )
+
+        return StreamingResponse(
+            stream_response(vllm_response),
+            status_code=vllm_response.status_code,
+            headers=dict(vllm_response.headers),
+        )
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Error processing request: {e}", exc_info=True)
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail="Internal server error",
+        )
diff --git a/k8s-autoscaler/k8s_autoscaler/config.py b/k8s-autoscaler/k8s_autoscaler/config.py
new file mode 100644
index 0000000..4097765
--- /dev/null
+++ b/k8s-autoscaler/k8s_autoscaler/config.py
@@ -0,0 +1,47 @@
+from pydantic_settings import BaseSettings, SettingsConfigDict
+from pydantic import Field
+from functools import cached_property
+
+
+class Settings(BaseSettings):
+    """Application settings with validation and documentation."""
+
+    model_config = SettingsConfigDict(
+        env_file=".env", env_file_encoding="utf-8", extra="ignore"
+    )
+
+    vllm_service_host: str = Field(
+        default="vllm-svc", description="Hostname of the vLLM service"
+    )
+    vllm_service_port: str = Field(
+        default="8000", description="Port of the vLLM service"
+    )
+    vllm_deployment: str = Field(
+        default="vllm", description="Name of the vLLM deployment"
+    )
+    kubernetes_namespace: str = Field(
+        default="default", description="Kubernetes namespace for the vLLM deployment"
+    )
+    inactivity_timeout: int = Field(
+        default=900,
+        description="Timeout in seconds before scaling down due to inactivity",
+        gt=0,
+    )
+    activation_timeout: int = Field(
+        default=120,
+        description="Timeout in seconds while waiting for vLLM to become ready",
+        gt=0,
+    )
+    proxy_timeout: float = Field(
+        default=30.0, description="Timeout in seconds for proxy requests", gt=0
+    )
+
+    @cached_property
+    def vllm_url_base(self) -> str:
+        """Base URL for the vLLM service."""
+        return f"http://{self.vllm_service_host}:{self.vllm_service_port}"
+
+    @cached_property
+    def kubectl_base_cmd(self) -> str:
+        """Base kubectl command with namespace."""
+        return f"kubectl -n {self.kubernetes_namespace}"
diff --git a/k8s-autoscaler/k8s_autoscaler/dependencies.py b/k8s-autoscaler/k8s_autoscaler/dependencies.py
new file mode 100644
index 0000000..360e09c
--- /dev/null
+++ b/k8s-autoscaler/k8s_autoscaler/dependencies.py
@@ -0,0 +1,26 @@
+# k8s_autoscaler/dependencies.py
+from fastapi import Depends, Request
+from .config import Settings
+from .types import AutoscalerState
+from .kubernetes import KubeCommand
+from .vllm import VLLMManager
+
+
+def get_settings(request: Request) -> Settings:
+    return request.app.state.settings
+
+
+def get_state(request: Request) -> AutoscalerState:
+    return request.app.state.state
+
+
+def get_kube(settings: Settings = Depends(get_settings)) -> KubeCommand:
+    return KubeCommand(settings)
+
+
+def get_vllm_manager(
+    settings: Settings = Depends(get_settings),
+    state: AutoscalerState = Depends(get_state),
+    kube: KubeCommand = Depends(get_kube),
+) -> VLLMManager:
+    return VLLMManager(settings, state, kube)
diff --git a/k8s-autoscaler/k8s_autoscaler/kubernetes.py b/k8s-autoscaler/k8s_autoscaler/kubernetes.py
new file mode 100644
index 0000000..07cdec3
--- /dev/null
+++ b/k8s-autoscaler/k8s_autoscaler/kubernetes.py
@@ -0,0 +1,71 @@
+import asyncio
+import subprocess
+import logging
+from .types import PodPhase
+from .config import Settings
+
+logger = logging.getLogger(__name__)
+
+
+class KubeCommand:
+    """Kubectl command builder and executor."""
+
+    def __init__(self, settings: Settings):
+        self.settings = settings
+
+    async def execute(self, cmd: str) -> tuple[bool, str]:
+        """Execute a kubectl command and return success status and output."""
+        full_cmd = f"{self.settings.kubectl_base_cmd} {cmd}"
+        try:
+            process = await asyncio.create_subprocess_shell(
+                full_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+            )
+            stdout, stderr = await process.communicate()
+
+            success = process.returncode == 0
+            output = stdout.decode().strip() if success else stderr.decode().strip()
+            if not success:
+                logger.error(f"kubectl command failed: {output}")
+            return success, output
+        except Exception as e:
+            logger.error(f"kubectl command failed: {e}")
+            return False, str(e)
+
+    async def get_pod_phase(self) -> PodPhase:
+        """Get the phase of the vLLM pod."""
+        success, output = await self.execute(
+            "get pods -l app=vllm -o jsonpath='{.items[0].status.phase}'"
+        )
+        try:
+            return PodPhase(output) if success and output else PodPhase.UNKNOWN
+        except ValueError:
+            logger.warning(f"Unknown pod phase: {output}")
+            return PodPhase.UNKNOWN
+
+    async def scale_deployment(self, replicas: int) -> bool:
+        """Scale vLLM deployment to specified replicas."""
+        if replicas < 0:
+            logger.error(f"Invalid replica count: {replicas}")
+            return False
+
+        success, output = await self.execute(
+            f"scale deployment {self.settings.vllm_deployment} --replicas={replicas}"
+        )
+        if success:
+            logger.info(f"Successfully scaled deployment to {replicas} replicas")
+        return success
+
+    async def get_replicas(self) -> tuple[int, int]:
+        """Get current and desired replica counts."""
+        cmd = (
+            f"get deployment {self.settings.vllm_deployment} "
+            "-o jsonpath='{.status.replicas} {.spec.replicas}'"
+        )
+        success, output = await self.execute(cmd)
+        if success and output:
+            try:
+                current, desired = map(int, output.split())
+                return current, desired
+            except ValueError:
+                logger.error(f"Failed to parse replica counts: {output}")
+        return -1, -1
diff --git a/k8s-autoscaler/k8s_autoscaler/main.py b/k8s-autoscaler/k8s_autoscaler/main.py
index 34d9f3a..6ae12af 100644
--- a/k8s-autoscaler/k8s_autoscaler/main.py
+++ b/k8s-autoscaler/k8s_autoscaler/main.py
@@ -1,15 +1,11 @@
-from fastapi import FastAPI, Response, BackgroundTasks, HTTPException
-from fastapi.responses import StreamingResponse
+from contextlib import asynccontextmanager
+from fastapi import FastAPI
 import httpx
-import asyncio
-import subprocess
-from asyncio.subprocess import Process
 import logging
-import time
-import os
-from typing import Optional, TypedDict, Literal, AsyncGenerator, cast
-from dataclasses import dataclass
-from contextlib import asynccontextmanager
+from .config import Settings
+from .types import AutoscalerState
+from .api import routes
+import asyncio
 
 # Configure logging
 logging.basicConfig(
@@ -17,227 +13,39 @@
 )
 logger = logging.getLogger(__name__)
 
-# Type definitions
-PodPhase = Literal["Pending", "Running", "Succeeded", "Failed", "Unknown"]
-
-
-class CommandResult(TypedDict):
-    success: bool
-    output: str
-    error: str
-
-
-@dataclass
-class Config:
-    vllm_service_host: str
-    vllm_service_port: str
-    vllm_deployment: str
-    kubernetes_namespace: str
-    inactivity_timeout: int
-    activation_timeout: int
-
-
-# Load configuration from environment
-config = Config(
-    vllm_service_host=os.getenv("VLLM_SERVICE_HOST", "vllm-svc"),
-    vllm_service_port=os.getenv("VLLM_SERVICE_PORT", "8000"),
-    vllm_deployment=os.getenv("VLLM_DEPLOYMENT_NAME", "vllm"),
-    kubernetes_namespace=os.getenv("KUBERNETES_NAMESPACE", "default"),
-    inactivity_timeout=int(os.getenv("INACTIVITY_TIMEOUT", "900")),
-    activation_timeout=int(os.getenv("ACTIVATION_TIMEOUT", "120")),
-)
-
-
-# Global state
-class AutoscalerState:
-    def __init__(self):
-        self.last_activity: float = time.time()
-        self.shutdown_task: Optional[asyncio.Task] = None
-        self.http_client: Optional[httpx.AsyncClient] = None
-
-
-state = AutoscalerState()
-
-
-async def execute_command(cmd: str) -> CommandResult:
-    """Execute a shell command and return structured result."""
-    try:
-        process: Process = await asyncio.create_subprocess_shell(
-            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
-        )
-        stdout, stderr = await process.communicate()
-
-        return CommandResult(
-            success=process.returncode == 0,
-            output=stdout.decode().strip(),
-            error=stderr.decode().strip(),
-        )
-    except Exception as e:
-        logger.error(f"Failed to execute command: {str(e)}")
-        return CommandResult(success=False, output="", error=str(e))
-
-
-async def check_vllm_status() -> tuple[bool, PodPhase]:
-    """Check if VLLM deployment is running and ready."""
-    cmd = f"kubectl get pods -n {config.kubernetes_namespace} -l app=vllm -o jsonpath='{{.items[0].status.phase}}'"
-    result = await execute_command(cmd)
-
-    if not result["success"]:
-        return False, "Unknown"
-
-    phase = result["output"]
-    if not phase:
-        return False, "Unknown"
-
-    return phase == "Running", cast(PodPhase, phase)
-
-
-async def scale_vllm(replicas: int) -> bool:
-    """Scale VLLM deployment to specified number of replicas."""
-    cmd = f"kubectl scale deployment -n {config.kubernetes_namespace} {config.vllm_deployment} --replicas={replicas}"
-    result = await execute_command(cmd)
-
-    if not result["success"]:
-        logger.error(f"Failed to scale VLLM: {result['error']}")
-
-    return result["success"]
-
-
-async def wait_for_vllm_ready() -> bool:
-    """Wait for VLLM to become ready within timeout period."""
-    start_time = time.time()
-    while time.time() - start_time < config.activation_timeout:
-        is_running, phase = await check_vllm_status()
-        if is_running:
-            return True
-        logger.info(f"Waiting for VLLM to be ready. Current phase: {phase}")
Current phase: {phase}") - await asyncio.sleep(2) - return False - - -async def monitor_inactivity(): - """Monitor for inactivity and scale down when timeout is reached.""" - try: - while True: - await asyncio.sleep(60) # Check every minute - if time.time() - state.last_activity > config.inactivity_timeout: - logger.info( - f"Inactivity timeout of {config.inactivity_timeout}s reached, scaling down VLLM" - ) - if await scale_vllm(0): - logger.info("VLLM scaled down successfully") - else: - logger.error("Failed to scale down VLLM") - break - except Exception as e: - logger.error(f"Error in inactivity monitor: {str(e)}") - finally: - state.shutdown_task = None - - -def reset_inactivity_timer(background_tasks: BackgroundTasks): - """Reset the inactivity timer and start monitoring if needed.""" - state.last_activity = time.time() - - if state.shutdown_task is None: - state.shutdown_task = asyncio.create_task(monitor_inactivity()) - background_tasks.add_task(lambda: state.shutdown_task) - @asynccontextmanager -async def get_http_client(): - """Get or create HTTP client.""" - if state.http_client is None: - state.http_client = httpx.AsyncClient(timeout=30.0) - try: - yield state.http_client - finally: - pass # Keep client alive for reuse - - -async def stream_response(response: httpx.Response) -> AsyncGenerator[bytes, None]: - """Stream response content.""" - async for chunk in response.aiter_bytes(): - yield chunk - - -app = FastAPI(title="VLLM Autoscaler") - +async def lifespan(app: FastAPI): + # Startup + settings = Settings() + state = AutoscalerState() + state.http_client = httpx.AsyncClient(timeout=settings.proxy_timeout) -@app.on_event("startup") -async def startup_event(): - """Initialize HTTP client on startup.""" - state.http_client = httpx.AsyncClient(timeout=30.0) + # Store in app state for access in dependencies + app.state.settings = settings + app.state.state = state + yield -@app.on_event("shutdown") -async def shutdown_event(): - """Clean up resources on shutdown.""" + # Shutdown if state.http_client: await state.http_client.aclose() + if state.shutdown_task: + state.shutdown_task.cancel() + try: + await state.shutdown_task + except asyncio.CancelledError: + pass + + +app = FastAPI( + title="vLLM Autoscaler", + description="Autoscaler and proxy for vLLM deployments in Kubernetes", + version="1.0.0", + lifespan=lifespan, +) - -@app.get("/health") -async def health_check(): - """Health check endpoint.""" - is_running, phase = await check_vllm_status() - return {"status": "healthy", "vllm_status": phase, "vllm_running": is_running} - - -@app.get("/{path:path}") -async def proxy_request( - path: str, - response: Response, - background_tasks: BackgroundTasks, - raw_query_string: str = "", -) -> StreamingResponse: - """Proxy requests to VLLM service, handling activation as needed.""" - try: - # Check if VLLM is running - is_running, phase = await check_vllm_status() - if not is_running: - logger.info( - f"VLLM not running (phase: {phase}), starting activation sequence" - ) - - # Scale up VLLM - if not await scale_vllm(1): - raise HTTPException( - status_code=503, detail="Failed to activate VLLM service" - ) - - # Wait for VLLM to become ready - if not await wait_for_vllm_ready(): - raise HTTPException( - status_code=503, - detail=f"VLLM service activation timeout after {config.activation_timeout}s", - ) - - logger.info("VLLM activation completed successfully") - - # Reset inactivity timer - reset_inactivity_timer(background_tasks) - - # Proxy the request to VLLM - query = 
f"?{raw_query_string}" if raw_query_string else "" - vllm_url = f"http://{config.vllm_service_host}:{config.vllm_service_port}/{path}{query}" - - async with get_http_client() as client: - vllm_response = await client.get(vllm_url) - - # Create streaming response - return StreamingResponse( - stream_response(vllm_response), - status_code=vllm_response.status_code, - headers=dict(vllm_response.headers), - ) - - except HTTPException: - raise - except Exception as e: - logger.error(f"Error processing request: {str(e)}", exc_info=True) - raise HTTPException(status_code=500, detail="Internal server error") - +app.include_router(routes.router) if __name__ == "__main__": import uvicorn diff --git a/k8s-autoscaler/k8s_autoscaler/types.py b/k8s-autoscaler/k8s_autoscaler/types.py new file mode 100644 index 0000000..49e001b --- /dev/null +++ b/k8s-autoscaler/k8s_autoscaler/types.py @@ -0,0 +1,24 @@ +from enum import Enum +from dataclasses import dataclass, field +import asyncio +from typing import Optional, ClassVar +import httpx +import time + + +class PodPhase(str, Enum): + PENDING = "Pending" + RUNNING = "Running" + SUCCEEDED = "Succeeded" + FAILED = "Failed" + UNKNOWN = "Unknown" + + +@dataclass +class AutoscalerState: + """Global state management for the autoscaler.""" + + last_activity: float = field(default_factory=time.time) + shutdown_task: Optional[asyncio.Task] = None + http_client: Optional[httpx.AsyncClient] = None + scaling_lock: ClassVar[asyncio.Lock] = asyncio.Lock() diff --git a/k8s-autoscaler/k8s_autoscaler/vllm.py b/k8s-autoscaler/k8s_autoscaler/vllm.py new file mode 100644 index 0000000..79b867f --- /dev/null +++ b/k8s-autoscaler/k8s_autoscaler/vllm.py @@ -0,0 +1,73 @@ +import asyncio +import logging +import time +from fastapi import BackgroundTasks +from .types import PodPhase, AutoscalerState +from .config import Settings +from .kubernetes import KubeCommand + +logger = logging.getLogger(__name__) + + +class VLLMManager: + """Manager for vLLM deployment operations.""" + + def __init__(self, settings: Settings, state: AutoscalerState, kube: KubeCommand): + self.settings = settings + self.state = state + self.kube = kube + + async def ensure_running(self) -> bool: + """Ensure vLLM is running, scaling up if necessary.""" + async with self.state.scaling_lock: + phase = await self.kube.get_pod_phase() + if phase == PodPhase.RUNNING: + return True + + logger.info(f"vLLM not running (phase: {phase}), scaling up") + if not await self.kube.scale_deployment(1): + return False + + return await self._wait_until_ready() + + async def _wait_until_ready(self) -> bool: + """Wait for vLLM to become ready.""" + start_time = time.time() + while time.time() - start_time < self.settings.activation_timeout: + phase = await self.kube.get_pod_phase() + if phase == PodPhase.RUNNING: + return True + if phase == PodPhase.FAILED: + logger.error("Pod failed to start") + return False + await asyncio.sleep(2) + logger.error("Timeout waiting for pod to become ready") + return False + + async def monitor_inactivity(self): + """Monitor for inactivity and scale down when timeout is reached.""" + try: + while True: + await asyncio.sleep(60) + current_replicas, _ = await self.kube.get_replicas() + if ( + time.time() - self.state.last_activity + > self.settings.inactivity_timeout + and current_replicas > 0 + ): + logger.info( + f"Scaling down vLLM after {self.settings.inactivity_timeout}s inactivity" + ) + await self.kube.scale_deployment(0) + break + except Exception as e: + logger.error(f"Error in inactivity 
monitor: {e}") + finally: + self.state.shutdown_task = None + + def reset_inactivity_timer(self, background_tasks: BackgroundTasks): + """Reset inactivity timer and ensure monitoring task is running.""" + self.state.last_activity = time.time() + if self.state.shutdown_task is None: + self.state.shutdown_task = asyncio.create_task(self.monitor_inactivity()) + background_tasks.add_task(lambda: self.state.shutdown_task) diff --git a/k8s-autoscaler/poetry.lock b/k8s-autoscaler/poetry.lock index b59ecf4..39e55c4 100644 --- a/k8s-autoscaler/poetry.lock +++ b/k8s-autoscaler/poetry.lock @@ -298,6 +298,40 @@ files = [ [package.dependencies] typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" +[[package]] +name = "pydantic-settings" +version = "2.6.0" +description = "Settings management using Pydantic" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pydantic_settings-2.6.0-py3-none-any.whl", hash = "sha256:4a819166f119b74d7f8c765196b165f95cc7487ce58ea27dec8a5a26be0970e0"}, + {file = "pydantic_settings-2.6.0.tar.gz", hash = "sha256:44a1804abffac9e6a30372bb45f6cafab945ef5af25e66b1c634c01dd39e0188"}, +] + +[package.dependencies] +pydantic = ">=2.7.0" +python-dotenv = ">=0.21.0" + +[package.extras] +azure-key-vault = ["azure-identity (>=1.16.0)", "azure-keyvault-secrets (>=4.8.0)"] +toml = ["tomli (>=2.0.1)"] +yaml = ["pyyaml (>=6.0.1)"] + +[[package]] +name = "python-dotenv" +version = "1.0.1" +description = "Read key-value pairs from a .env file and set them as environment variables" +optional = false +python-versions = ">=3.8" +files = [ + {file = "python-dotenv-1.0.1.tar.gz", hash = "sha256:e324ee90a023d808f1959c46bcbc04446a10ced277783dc6ee09987c37ec10ca"}, + {file = "python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a"}, +] + +[package.extras] +cli = ["click (>=5.0)"] + [[package]] name = "sniffio" version = "1.3.1" @@ -359,4 +393,4 @@ standard = ["colorama (>=0.4)", "httptools (>=0.5.0)", "python-dotenv (>=0.13)", [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "42286b323abdc5a2444aa3078fb3f01192955ab30de19fadbb8ad3f28f70a7b2" +content-hash = "01aa266e919a16630b7e43ec5df02dde3d50d1cae3b7c66bc6f26f56cbedf26b" diff --git a/k8s-autoscaler/pyproject.toml b/k8s-autoscaler/pyproject.toml index 021bfcf..6f4b149 100644 --- a/k8s-autoscaler/pyproject.toml +++ b/k8s-autoscaler/pyproject.toml @@ -10,6 +10,7 @@ python = "^3.10" fastapi = "^0.115.3" httpx = "^0.27.2" uvicorn = "^0.32.0" +pydantic-settings = "^2.6.0" [build-system]