diff --git a/README.md b/README.md index 479320b..edb3021 100644 --- a/README.md +++ b/README.md @@ -76,6 +76,8 @@ Below is an example config.json: "ProxyTargetPort": "18081", "Command": "/opt/llama.cpp/llama-server", "Args": "-m /opt/Gemma-27B-v1_Q4km.gguf -c 8192 -ngl 100 -t 4 --port 18081", + "HealthcheckCommand": "curl --fail http://localhost:18081/", + "HealthcheckIntervalMilliseconds": 200, "RestartOnConnectionFailure": false, "ResourceRequirements": { "VRAM-GPU-1": 20000, diff --git a/main.go b/main.go index 6645714..924249c 100644 --- a/main.go +++ b/main.go @@ -26,17 +26,19 @@ type Config struct { } type ServiceConfig struct { - Name string - ListenPort string - ProxyTargetHost string - ProxyTargetPort string - Command string - Args string - LogFilePath string - Workdir string - ShutDownAfterInactivitySeconds time.Duration - RestartOnConnectionFailure bool - ResourceRequirements map[string]int `json:"ResourceRequirements"` + Name string + ListenPort string + ProxyTargetHost string + ProxyTargetPort string + Command string + Args string + LogFilePath string + Workdir string + HealthcheckCommand string + HealthcheckIntervalMilliseconds time.Duration + ShutDownAfterInactivitySeconds time.Duration + RestartOnConnectionFailure bool + ResourceRequirements map[string]int `json:"ResourceRequirements"` } type RunningService struct { manageMutex *sync.Mutex @@ -268,8 +270,8 @@ func startService(serviceConfig ServiceConfig) (net.Conn, error) { delete(resourceManager.runningServices, serviceConfig.Name) return nil, fmt.Errorf("failed to run command \"%s %s\"", serviceConfig.Command, serviceConfig.Args) } + performHealthCheck(serviceConfig) var serviceConnection = connectWithWaiting(serviceConfig.ProxyTargetHost, serviceConfig.ProxyTargetPort, serviceConfig.Name, 120*time.Second) - time.Sleep(2 * time.Second) //TODO: replace with a custom callback runningService.cmd = cmd @@ -291,6 +293,32 @@ func startService(serviceConfig ServiceConfig) (net.Conn, error) { return 
serviceConnection, nil } +// performHealthCheck blocks until serviceConfig.HealthcheckCommand, run via "sh -c", exits with code 0, +// waiting HealthcheckIntervalMilliseconds milliseconds between attempts. +// It returns immediately when no HealthcheckCommand is configured. +func performHealthCheck(serviceConfig ServiceConfig) { + if serviceConfig.HealthcheckCommand == "" { + return + } + // Fall back to 200ms when the interval is unset or negative; sleeping for <= 0 would respawn the command in a tight loop. + interval := serviceConfig.HealthcheckIntervalMilliseconds * time.Millisecond + if interval <= 0 { + interval = 200 * time.Millisecond + } + + log.Printf("[%s] Running healthcheck command \"%s\"", serviceConfig.Name, serviceConfig.HealthcheckCommand) + for { + cmd := exec.Command("sh", "-c", serviceConfig.HealthcheckCommand) + if err := cmd.Run(); err == nil { + log.Printf("[%s] Healthcheck \"%s\" returned exit code 0, healthcheck completed", serviceConfig.Name, serviceConfig.HealthcheckCommand) + return + } + // ExitCode reports -1 when the command never started or was killed by a signal. + log.Printf("[%s] Healthcheck \"%s\" returned exit code %d, trying again in %dms", serviceConfig.Name, serviceConfig.HealthcheckCommand, cmd.ProcessState.ExitCode(), interval/time.Millisecond) + time.Sleep(interval) + } +} + func connectToService(serviceConfig ServiceConfig) net.Conn { log.Printf("[%s] Opening new service connection to %s:%s", serviceConfig.Name, serviceConfig.ProxyTargetHost, serviceConfig.ProxyTargetPort) serviceConn, err := net.Dial("tcp", net.JoinHostPort(serviceConfig.ProxyTargetHost, serviceConfig.ProxyTargetPort)) diff --git a/test-server/healthcheck-immediate-listen-start.json b/test-server/healthcheck-immediate-listen-start.json index 8085181..3e7a685 100644 --- a/test-server/healthcheck-immediate-listen-start.json +++ b/test-server/healthcheck-immediate-listen-start.json @@ -6,7 +6,9 @@ "ProxyTargetHost": "localhost", "ProxyTargetPort": "12002", "Command": "test-server/test-server", - "Args": "-p 12002 -healthcheck-port 2012 -sleep-before-listening-for-healthcheck 3s -startup-duration 5s" + "Args": "-p 12002 -healthcheck-port 2012 -sleep-before-listening-for-healthcheck 3s -startup-duration 5s", + "HealthcheckCommand": "curl --fail http://localhost:2012", + "HealthcheckIntervalMilliseconds": 200 } ] } \ No newline at end of file diff --git a/test-server/healthcheck-immediate-startup-delayed-healthcheck.json 
b/test-server/healthcheck-immediate-startup-delayed-healthcheck.json index 6e2e304..d0799d8 100644 --- a/test-server/healthcheck-immediate-startup-delayed-healthcheck.json +++ b/test-server/healthcheck-immediate-startup-delayed-healthcheck.json @@ -6,7 +6,9 @@ "ProxyTargetHost": "localhost", "ProxyTargetPort": "12003", "Command": "test-server/test-server", - "Args": "-p 12003 -healthcheck-port 2013 -sleep-before-listening-for-healthcheck 3s -startup-duration 5s" + "Args": "-p 12003 -healthcheck-port 2013 -sleep-before-listening-for-healthcheck 3s -startup-duration 5s", + "HealthcheckCommand": "curl --fail http://localhost:2013", + "HealthcheckIntervalMilliseconds": 200 } ] } \ No newline at end of file diff --git a/test-server/healthcheck-immediate-startup.json b/test-server/healthcheck-immediate-startup.json index eb90c65..48e105c 100644 --- a/test-server/healthcheck-immediate-startup.json +++ b/test-server/healthcheck-immediate-startup.json @@ -6,7 +6,9 @@ "ProxyTargetHost": "localhost", "ProxyTargetPort": "12004", "Command": "test-server/test-server", - "Args": "-p 12004 -healthcheck-port 2014" + "Args": "-p 12004 -healthcheck-port 2014", + "HealthcheckCommand": "curl --fail http://localhost:2014", + "HealthcheckIntervalMilliseconds": 200 } ] } \ No newline at end of file diff --git a/test-server/healthcheck.json b/test-server/healthcheck.json index 3a3d5ce..5450952 100644 --- a/test-server/healthcheck.json +++ b/test-server/healthcheck.json @@ -6,7 +6,9 @@ "ProxyTargetHost": "localhost", "ProxyTargetPort": "12001", "Command": "test-server/test-server", - "Args": "-p 12001 -healthcheck-port 2011 -sleep-before-listening 10s -sleep-before-listening-for-healthcheck 3s -startup-duration 5s" + "Args": "-p 12001 -healthcheck-port 2011 -sleep-before-listening 10s -sleep-before-listening-for-healthcheck 3s -startup-duration 5s", + "HealthcheckCommand": "curl --fail http://localhost:2011", + "HealthcheckIntervalMilliseconds": 200 } ] } \ No newline at end of file