From 0e1742b9ab0536852afb31af5b3a781e3cb88471 Mon Sep 17 00:00:00 2001
From: Martin Treml <martin.treml@symflower.com>
Date: Mon, 24 Jun 2024 11:40:59 +0200
Subject: [PATCH 1/3] Introduce "runtime" parameter with "local" as default to
 run evaluations on the executing machine directly

Part of #198
---
 cmd/eval-dev-quality/cmd/evaluate.go | 77 +++++++++++++++++-----------
 1 file changed, 46 insertions(+), 31 deletions(-)

diff --git a/cmd/eval-dev-quality/cmd/evaluate.go b/cmd/eval-dev-quality/cmd/evaluate.go
index bafd9577..a5e339ad 100644
--- a/cmd/eval-dev-quality/cmd/evaluate.go
+++ b/cmd/eval-dev-quality/cmd/evaluate.go
@@ -68,6 +68,9 @@ type Evaluate struct {
 	// NoDisqualification indicates that models are not to be disqualified if they fail to solve basic language tasks.
 	NoDisqualification bool `long:"no-disqualification" description:"By default, models that cannot solve basic language tasks are disqualified for more complex tasks. Overwriting this behavior runs all tasks regardless."`
 
+	// Runtime indicates if the evaluation is run locally or inside a container.
+	Runtime string `long:"runtime" description:"The runtime which will be used for the evaluation." default:"local" choice:"local"`
+
 	// logger holds the logger of the command.
 	logger *log.Logger
 	// timestamp holds the timestamp of the command execution.
@@ -356,37 +359,11 @@ func (command *Evaluate) Execute(args []string) (err error) {
 		command.logger.Panic("ERROR: empty evaluation context")
 	}
 
-	// Install required tools for the basic evaluation.
-	if err := tools.InstallEvaluation(command.logger, command.InstallToolsPath); err != nil {
-		command.logger.Panicf("ERROR: %s", err)
-	}
-
-	assessments, totalScore := evaluate.Evaluate(evaluationContext)
-
-	assessmentsPerModel := assessments.CollapseByModel()
-	if err := (report.Markdown{
-		DateTime: command.timestamp,
-		Version:  evaluate.Version,
-
-		CSVPath:       "./evaluation.csv",
-		LogPath:       "./evaluation.log",
-		ModelLogsPath: ".",
-		SVGPath:       "./categories.svg",
-
-		AssessmentPerModel: assessmentsPerModel,
-		TotalScore:         totalScore,
-	}).WriteToFile(filepath.Join(command.ResultPath, "README.md")); err != nil {
-		command.logger.Panicf("ERROR: %s", err)
-	}
-
-	_ = assessmentsPerModel.WalkByScore(func(model model.Model, assessment metrics.Assessments, score uint64) (err error) {
-		command.logger.Printf("Evaluation score for %q (%q): cost=%.2f, %s", model.ID(), assessment.Category(totalScore).ID, model.Cost(), assessment)
-
-		return nil
-	})
-
-	if err := writeCSVs(command.ResultPath, assessments); err != nil {
-		command.logger.Panicf("ERROR: %s", err)
+	switch command.Runtime {
+	case "local":
+		return command.evaluateLocal(evaluationContext)
+	default:
+		command.logger.Panicf("ERROR: unknown runtime")
 	}
 
 	return nil
@@ -427,3 +404,41 @@ func writeCSVs(resultPath string, assessments *report.AssessmentStore) (err erro
 
 	return nil
 }
+
+// evaluateLocal executes the evaluation on the current system.
+func (command *Evaluate) evaluateLocal(evaluationContext *evaluate.Context) (err error) {
+	// Install required tools for the basic evaluation.
+	if err := tools.InstallEvaluation(command.logger, command.InstallToolsPath); err != nil {
+		command.logger.Panicf("ERROR: %s", err)
+	}
+
+	assessments, totalScore := evaluate.Evaluate(evaluationContext)
+
+	assessmentsPerModel := assessments.CollapseByModel()
+	if err := (report.Markdown{
+		DateTime: command.timestamp,
+		Version:  evaluate.Version,
+
+		CSVPath:       "./evaluation.csv",
+		LogPath:       "./evaluation.log",
+		ModelLogsPath: ".",
+		SVGPath:       "./categories.svg",
+
+		AssessmentPerModel: assessmentsPerModel,
+		TotalScore:         totalScore,
+	}).WriteToFile(filepath.Join(command.ResultPath, "README.md")); err != nil {
+		command.logger.Panicf("ERROR: %s", err)
+	}
+
+	_ = assessmentsPerModel.WalkByScore(func(model model.Model, assessment metrics.Assessments, score uint64) (err error) {
+		command.logger.Printf("Evaluation score for %q (%q): cost=%.2f, %s", model.ID(), assessment.Category(totalScore).ID, model.Cost(), assessment)
+
+		return nil
+	})
+
+	if err := writeCSVs(command.ResultPath, assessments); err != nil {
+		command.logger.Panicf("ERROR: %s", err)
+	}
+
+	return nil
+}

From 91d5e0933268d09b84255520aa2b62a9778d70f2 Mon Sep 17 00:00:00 2001
From: Martin Treml <martin.treml@symflower.com>
Date: Mon, 24 Jun 2024 13:20:03 +0200
Subject: [PATCH 2/3] Introduce the "docker" runtime to run an evaluation
 within a docker-container

Part of #198
---
 cmd/eval-dev-quality/cmd/evaluate.go | 121 +++++++++++++++++++++------
 util/exec.go                         |  36 ++++++++
 util/exec_test.go                    |  59 +++++++++++++
 3 files changed, 192 insertions(+), 24 deletions(-)

diff --git a/cmd/eval-dev-quality/cmd/evaluate.go b/cmd/eval-dev-quality/cmd/evaluate.go
index a5e339ad..d811393d 100644
--- a/cmd/eval-dev-quality/cmd/evaluate.go
+++ b/cmd/eval-dev-quality/cmd/evaluate.go
@@ -1,7 +1,9 @@
 package cmd
 
 import (
+	"context"
 	"os"
+	"os/exec"
 	"path/filepath"
 	"slices"
 	"sort"
@@ -69,7 +71,7 @@ type Evaluate struct {
 	NoDisqualification bool `long:"no-disqualification" description:"By default, models that cannot solve basic language tasks are disqualified for more complex tasks. Overwriting this behavior runs all tasks regardless."`
 
 	// Runtime indicates if the evaluation is run locally or inside a container.
-	Runtime string `long:"runtime" description:"The runtime which will be used for the evaluation." default:"local" choice:"local"`
+	Runtime string `long:"runtime" description:"The runtime which will be used for the evaluation." default:"local" choice:"local" choice:"docker"`
 
 	// logger holds the logger of the command.
 	logger *log.Logger
@@ -97,29 +99,6 @@ func (command *Evaluate) Initialize(args []string) (evaluationContext *evaluate.
 	}()
 	evaluationContext = &evaluate.Context{}
 
-	// Setup evaluation result directory.
-	{
-		command.ResultPath = strings.ReplaceAll(command.ResultPath, "%datetime%", command.timestamp.Format("2006-01-02-15:04:05")) // REMARK Use a datetime format with a dash, so directories can be easily marked because they are only one group.
-		uniqueResultPath, err := util.UniqueDirectory(command.ResultPath)
-		if err != nil {
-			command.logger.Panicf("ERROR: %s", err)
-		}
-		command.ResultPath = uniqueResultPath
-		evaluationContext.ResultPath = uniqueResultPath
-		command.logger.Printf("Writing results to %s", command.ResultPath)
-	}
-
-	// Initialize logging within result directory.
-	{
-		log, logClose, err := log.WithFile(command.logger, filepath.Join(command.ResultPath, "evaluation.log"))
-		if err != nil {
-			command.logger.Panicf("ERROR: %s", err)
-		}
-		cleanup = logClose
-		command.logger = log
-		evaluationContext.Log = log
-	}
-
 	// Check and validate common options.
 	{
 		if command.InstallToolsPath == "" {
@@ -157,6 +136,12 @@ func (command *Evaluate) Initialize(args []string) (evaluationContext *evaluate.
 		}
 		evaluationContext.Runs = command.Runs
 
+		if command.Runtime == "docker" {
+			if _, err := exec.LookPath("docker"); err != nil {
+				command.logger.Panic("docker runtime could not be found")
+			}
+		}
+
 		evaluationContext.NoDisqualification = command.NoDisqualification
 	}
 
@@ -173,6 +158,29 @@ func (command *Evaluate) Initialize(args []string) (evaluationContext *evaluate.
 		evaluationContext.TestdataPath = testdataPath
 	}
 
+	// Setup evaluation result directory.
+	{
+		command.ResultPath = strings.ReplaceAll(command.ResultPath, "%datetime%", command.timestamp.Format("2006-01-02-15:04:05")) // REMARK Use a datetime format with a dash, so directories can be easily marked because they are only one group.
+		uniqueResultPath, err := util.UniqueDirectory(command.ResultPath)
+		if err != nil {
+			command.logger.Panicf("ERROR: %s", err)
+		}
+		command.ResultPath = uniqueResultPath
+		evaluationContext.ResultPath = uniqueResultPath
+		command.logger.Printf("Writing results to %s", command.ResultPath)
+	}
+
+	// Initialize logging within result directory.
+	{
+		log, logClose, err := log.WithFile(command.logger, filepath.Join(command.ResultPath, "evaluation.log"))
+		if err != nil {
+			command.logger.Panicf("ERROR: %s", err)
+		}
+		cleanup = logClose
+		command.logger = log
+		evaluationContext.Log = log
+	}
+
 	// Register custom OpenAI API providers and models.
 	{
 		customProviders := map[string]*openaiapi.Provider{}
@@ -362,6 +370,8 @@ func (command *Evaluate) Execute(args []string) (err error) {
 	switch command.Runtime {
 	case "local":
 		return command.evaluateLocal(evaluationContext)
+	case "docker":
+		return command.evaluateDocker(evaluationContext)
 	default:
 		command.logger.Panicf("ERROR: unknown runtime")
 	}
@@ -442,3 +452,66 @@ func (command *Evaluate) evaluateLocal(evaluationContext *evaluate.Context) (err
 
 	return nil
 }
+
+// evaluateDocker executes the evaluation for each model inside a docker container.
+func (command *Evaluate) evaluateDocker(ctx *evaluate.Context) (err error) {
+	// Filter all the args to pass them onto the container.
+	args := util.FilterArgs(os.Args[2:], []string{
+		"--runtime",
+		"--model",
+		"--result-path",
+	})
+
+	// Iterate over each model and start the container.
+	for _, model := range ctx.Models {
+		// We are skipping ollama models until we fully support pulling. https://github.com/symflower/eval-dev-quality/issues/100.
+		if ctx.ProviderForModel[model].ID() == "ollama" {
+			command.logger.Print("Skipping unsupported ollama model with docker runtime")
+
+			continue
+		}
+
+		// Resolve the absolute result path, which is bind-mounted into the container; each model writes into its own subfolder of it.
+		resultPath, err := filepath.Abs(command.ResultPath)
+		if err != nil {
+			return err
+		}
+		// Set permission 777 so the non-root docker image is able to store its results inside the result path.
+		if err := os.Chmod(resultPath, 0777); err != nil {
+			return err
+		}
+
+		// Commands regarding the docker runtime.
+		dockerCommand := []string{
+			"docker",
+			"run",
+			"-v", // bind volume
+			resultPath + ":/home/ubuntu/evaluation",
+			"--rm", // automatically remove container after it finished
+			"ghcr.io/symflower/eval-dev-quality:v0.5.0",
+		}
+
+		// Commands for the evaluation to run inside the container.
+		evaluationCommand := []string{
+			"eval-dev-quality",
+			"evaluate",
+			"--model",
+			model.ID(),
+			"--result-path",
+			"/home/ubuntu/evaluation/" + model.ID(),
+		}
+
+		cmd := append(dockerCommand, evaluationCommand...)
+		cmd = append(cmd, args...)
+
+		commandOutput, err := util.CommandWithResult(context.Background(), command.logger, &util.Command{
+			Command: cmd,
+		})
+		if err != nil {
+			return pkgerrors.WithMessage(pkgerrors.WithStack(err), commandOutput)
+		}
+
+	}
+
+	return nil
+}
diff --git a/util/exec.go b/util/exec.go
index 9803c2ea..779900e4 100644
--- a/util/exec.go
+++ b/util/exec.go
@@ -54,3 +54,39 @@ func CommandWithResult(ctx context.Context, logger *log.Logger, command *Command
 
 	return writer.String(), nil
 }
+
+// FilterArgs returns args without the ignored flags and their values, splitting "--flag=value" arguments into separate "--flag" and "value" entries.
+func FilterArgs(args []string, ignored []string) (filtered []string) {
+	filterMap := map[string]bool{}
+	for _, v := range ignored {
+		filterMap[v] = true
+	}
+
+	// Resolve args with equals sign.
+	var resolvedArgs []string
+	for _, v := range args {
+		if strings.HasPrefix(v, "--") && strings.Contains(v, "=") {
+			resolvedArgs = append(resolvedArgs, strings.SplitN(v, "=", 2)...)
+		} else {
+			resolvedArgs = append(resolvedArgs, v)
+		}
+	}
+
+	skip := false
+	for _, v := range resolvedArgs {
+		if skip && strings.HasPrefix(v, "--") {
+			skip = false
+		}
+		if filterMap[v] {
+			skip = true
+		}
+
+		if skip {
+			continue
+		}
+
+		filtered = append(filtered, v)
+	}
+
+	return filtered
+}
diff --git a/util/exec_test.go b/util/exec_test.go
index 489fa7cb..ec633bc4 100644
--- a/util/exec_test.go
+++ b/util/exec_test.go
@@ -32,3 +32,62 @@ func TestCommandWithResultTimeout(t *testing.T) {
 	assert.Error(t, err)
 	assert.Less(t, duration.Seconds(), 5.0)
 }
+
+func TestFilterArgs(t *testing.T) {
+	type testCase struct {
+		Name string
+
+		Args    []string
+		Ignored []string
+
+		ExpectedFiltered []string
+	}
+
+	validate := func(t *testing.T, tc *testCase) {
+		t.Run(tc.Name, func(t *testing.T) {
+			actualFiltered := FilterArgs(tc.Args, tc.Ignored)
+
+			assert.Equal(t, tc.ExpectedFiltered, actualFiltered)
+		})
+	}
+
+	validate(t, &testCase{
+		Name: "Filter arguments",
+
+		Args: []string{
+			"--runtime",
+			"abc",
+			"--runs",
+			"5",
+		},
+		Ignored: []string{
+			"--runtime",
+		},
+
+		ExpectedFiltered: []string{
+			"--runs",
+			"5",
+		},
+	})
+
+	validate(t, &testCase{
+		Name: "Filter arguments with equals sign",
+
+		Args: []string{
+			"--runtime=abc",
+			"--runs=5",
+			"--foo",
+			"bar",
+		},
+		Ignored: []string{
+			"--runtime",
+		},
+
+		ExpectedFiltered: []string{
+			"--runs",
+			"5",
+			"--foo",
+			"bar",
+		},
+	})
+}

From c49a891e9994b5c0f655d91e3cd03e8b4fab6682 Mon Sep 17 00:00:00 2001
From: Martin Treml <martin.treml@symflower.com>
Date: Wed, 26 Jun 2024 11:51:07 +0200
Subject: [PATCH 3/3] Introduce "runtime-image" parameter to specify the
 container image to use for a different runtime

If none is specified, we will always use the tagged version defined within eval-dev-quality.

Part of #198
---
 cmd/eval-dev-quality/cmd/evaluate.go | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/cmd/eval-dev-quality/cmd/evaluate.go b/cmd/eval-dev-quality/cmd/evaluate.go
index d811393d..73db67dd 100644
--- a/cmd/eval-dev-quality/cmd/evaluate.go
+++ b/cmd/eval-dev-quality/cmd/evaluate.go
@@ -72,6 +72,8 @@ type Evaluate struct {
 
 	// Runtime indicates if the evaluation is run locally or inside a container.
 	Runtime string `long:"runtime" description:"The runtime which will be used for the evaluation." default:"local" choice:"local" choice:"docker"`
+	// RuntimeImage determines the container image used for any container runtime.
+	RuntimeImage string `long:"runtime-image" description:"The container image to use for the evaluation." default:""`
 
 	// logger holds the logger of the command.
 	logger *log.Logger
@@ -142,6 +144,10 @@ func (command *Evaluate) Initialize(args []string) (evaluationContext *evaluate.
 			}
 		}
 
+		if command.RuntimeImage == "" {
+			command.RuntimeImage = "ghcr.io/symflower/eval-dev-quality:v" + evaluate.Version
+		}
+
 		evaluationContext.NoDisqualification = command.NoDisqualification
 	}
 
@@ -488,7 +494,7 @@ func (command *Evaluate) evaluateDocker(ctx *evaluate.Context) (err error) {
 			"-v", // bind volume
 			resultPath + ":/home/ubuntu/evaluation",
 			"--rm", // automatically remove container after it finished
-			"ghcr.io/symflower/eval-dev-quality:v0.5.0",
+			command.RuntimeImage,
 		}
 
 		// Commands for the evaluation to run inside the container.