From 0e1742b9ab0536852afb31af5b3a781e3cb88471 Mon Sep 17 00:00:00 2001 From: Martin Treml Date: Mon, 24 Jun 2024 11:40:59 +0200 Subject: [PATCH 1/3] Introduce "runtime" parameter with "local" as default to run evaluations on the executing machine directly Part of #198 --- cmd/eval-dev-quality/cmd/evaluate.go | 77 +++++++++++++++++----------- 1 file changed, 46 insertions(+), 31 deletions(-) diff --git a/cmd/eval-dev-quality/cmd/evaluate.go b/cmd/eval-dev-quality/cmd/evaluate.go index bafd9577..a5e339ad 100644 --- a/cmd/eval-dev-quality/cmd/evaluate.go +++ b/cmd/eval-dev-quality/cmd/evaluate.go @@ -68,6 +68,9 @@ type Evaluate struct { // NoDisqualification indicates that models are not to be disqualified if they fail to solve basic language tasks. NoDisqualification bool `long:"no-disqualification" description:"By default, models that cannot solve basic language tasks are disqualified for more complex tasks. Overwriting this behavior runs all tasks regardless."` + // Runtime indicates if the evaluation is run locally or inside a container. + Runtime string `long:"runtime" description:"The runtime which will be used for the evaluation." default:"local" choice:"local"` + // logger holds the logger of the command. logger *log.Logger // timestamp holds the timestamp of the command execution. @@ -356,37 +359,11 @@ func (command *Evaluate) Execute(args []string) (err error) { command.logger.Panic("ERROR: empty evaluation context") } - // Install required tools for the basic evaluation. 
- if err := tools.InstallEvaluation(command.logger, command.InstallToolsPath); err != nil { - command.logger.Panicf("ERROR: %s", err) - } - - assessments, totalScore := evaluate.Evaluate(evaluationContext) - - assessmentsPerModel := assessments.CollapseByModel() - if err := (report.Markdown{ - DateTime: command.timestamp, - Version: evaluate.Version, - - CSVPath: "./evaluation.csv", - LogPath: "./evaluation.log", - ModelLogsPath: ".", - SVGPath: "./categories.svg", - - AssessmentPerModel: assessmentsPerModel, - TotalScore: totalScore, - }).WriteToFile(filepath.Join(command.ResultPath, "README.md")); err != nil { - command.logger.Panicf("ERROR: %s", err) - } - - _ = assessmentsPerModel.WalkByScore(func(model model.Model, assessment metrics.Assessments, score uint64) (err error) { - command.logger.Printf("Evaluation score for %q (%q): cost=%.2f, %s", model.ID(), assessment.Category(totalScore).ID, model.Cost(), assessment) - - return nil - }) - - if err := writeCSVs(command.ResultPath, assessments); err != nil { - command.logger.Panicf("ERROR: %s", err) + switch command.Runtime { + case "local": + return command.evaluateLocal(evaluationContext) + default: + command.logger.Panicf("ERROR: unknown runtime") } return nil @@ -427,3 +404,41 @@ func writeCSVs(resultPath string, assessments *report.AssessmentStore) (err erro return nil } + +// evaluateLocal executes the evaluation on the current system. +func (command *Evaluate) evaluateLocal(evaluationContext *evaluate.Context) (err error) { + // Install required tools for the basic evaluation. 
+ if err := tools.InstallEvaluation(command.logger, command.InstallToolsPath); err != nil { + command.logger.Panicf("ERROR: %s", err) + } + + assessments, totalScore := evaluate.Evaluate(evaluationContext) + + assessmentsPerModel := assessments.CollapseByModel() + if err := (report.Markdown{ + DateTime: command.timestamp, + Version: evaluate.Version, + + CSVPath: "./evaluation.csv", + LogPath: "./evaluation.log", + ModelLogsPath: ".", + SVGPath: "./categories.svg", + + AssessmentPerModel: assessmentsPerModel, + TotalScore: totalScore, + }).WriteToFile(filepath.Join(command.ResultPath, "README.md")); err != nil { + command.logger.Panicf("ERROR: %s", err) + } + + _ = assessmentsPerModel.WalkByScore(func(model model.Model, assessment metrics.Assessments, score uint64) (err error) { + command.logger.Printf("Evaluation score for %q (%q): cost=%.2f, %s", model.ID(), assessment.Category(totalScore).ID, model.Cost(), assessment) + + return nil + }) + + if err := writeCSVs(command.ResultPath, assessments); err != nil { + command.logger.Panicf("ERROR: %s", err) + } + + return nil +} From 91d5e0933268d09b84255520aa2b62a9778d70f2 Mon Sep 17 00:00:00 2001 From: Martin Treml Date: Mon, 24 Jun 2024 13:20:03 +0200 Subject: [PATCH 2/3] Introduce the "docker" runtime to run an evaluation within a docker-container Part of #198 --- cmd/eval-dev-quality/cmd/evaluate.go | 121 +++++++++++++++++++++------ util/exec.go | 36 ++++++++ util/exec_test.go | 59 +++++++++++++ 3 files changed, 192 insertions(+), 24 deletions(-) diff --git a/cmd/eval-dev-quality/cmd/evaluate.go b/cmd/eval-dev-quality/cmd/evaluate.go index a5e339ad..d811393d 100644 --- a/cmd/eval-dev-quality/cmd/evaluate.go +++ b/cmd/eval-dev-quality/cmd/evaluate.go @@ -1,7 +1,9 @@ package cmd import ( + "context" "os" + "os/exec" "path/filepath" "slices" "sort" @@ -69,7 +71,7 @@ type Evaluate struct { NoDisqualification bool `long:"no-disqualification" description:"By default, models that cannot solve basic language tasks are 
disqualified for more complex tasks. Overwriting this behavior runs all tasks regardless."` // Runtime indicates if the evaluation is run locally or inside a container. - Runtime string `long:"runtime" description:"The runtime which will be used for the evaluation." default:"local" choice:"local"` + Runtime string `long:"runtime" description:"The runtime which will be used for the evaluation." default:"local" choice:"local" choice:"docker"` // logger holds the logger of the command. logger *log.Logger @@ -97,29 +99,6 @@ func (command *Evaluate) Initialize(args []string) (evaluationContext *evaluate. }() evaluationContext = &evaluate.Context{} - // Setup evaluation result directory. - { - command.ResultPath = strings.ReplaceAll(command.ResultPath, "%datetime%", command.timestamp.Format("2006-01-02-15:04:05")) // REMARK Use a datetime format with a dash, so directories can be easily marked because they are only one group. - uniqueResultPath, err := util.UniqueDirectory(command.ResultPath) - if err != nil { - command.logger.Panicf("ERROR: %s", err) - } - command.ResultPath = uniqueResultPath - evaluationContext.ResultPath = uniqueResultPath - command.logger.Printf("Writing results to %s", command.ResultPath) - } - - // Initialize logging within result directory. - { - log, logClose, err := log.WithFile(command.logger, filepath.Join(command.ResultPath, "evaluation.log")) - if err != nil { - command.logger.Panicf("ERROR: %s", err) - } - cleanup = logClose - command.logger = log - evaluationContext.Log = log - } - // Check and validate common options. { if command.InstallToolsPath == "" { @@ -157,6 +136,12 @@ func (command *Evaluate) Initialize(args []string) (evaluationContext *evaluate. 
} evaluationContext.Runs = command.Runs + if command.Runtime == "docker" { + if _, err := exec.LookPath("docker"); err != nil { + command.logger.Panic("docker runtime could not be found") + } + } + evaluationContext.NoDisqualification = command.NoDisqualification } @@ -173,6 +158,29 @@ func (command *Evaluate) Initialize(args []string) (evaluationContext *evaluate. evaluationContext.TestdataPath = testdataPath } + // Setup evaluation result directory. + { + command.ResultPath = strings.ReplaceAll(command.ResultPath, "%datetime%", command.timestamp.Format("2006-01-02-15:04:05")) // REMARK Use a datetime format with a dash, so directories can be easily marked because they are only one group. + uniqueResultPath, err := util.UniqueDirectory(command.ResultPath) + if err != nil { + command.logger.Panicf("ERROR: %s", err) + } + command.ResultPath = uniqueResultPath + evaluationContext.ResultPath = uniqueResultPath + command.logger.Printf("Writing results to %s", command.ResultPath) + } + + // Initialize logging within result directory. + { + log, logClose, err := log.WithFile(command.logger, filepath.Join(command.ResultPath, "evaluation.log")) + if err != nil { + command.logger.Panicf("ERROR: %s", err) + } + cleanup = logClose + command.logger = log + evaluationContext.Log = log + } + // Register custom OpenAI API providers and models. { customProviders := map[string]*openaiapi.Provider{} @@ -362,6 +370,8 @@ func (command *Evaluate) Execute(args []string) (err error) { switch command.Runtime { case "local": return command.evaluateLocal(evaluationContext) + case "docker": + return command.evaluateDocker(evaluationContext) default: command.logger.Panicf("ERROR: unknown runtime") } @@ -442,3 +452,66 @@ func (command *Evaluate) evaluateLocal(evaluationContext *evaluate.Context) (err return nil } + +// evaluateDocker executes the evaluation for each model inside a docker container. 
+func (command *Evaluate) evaluateDocker(ctx *evaluate.Context) (err error) { + // Filter all the args to pass them onto the container. + args := util.FilterArgs(os.Args[2:], []string{ + "--runtime", + "--model", + "--result-path", + }) + + // Iterate over each model and start the container. + for _, model := range ctx.Models { + // We are skipping ollama models until we fully support pulling. https://github.com/symflower/eval-dev-quality/issues/100. + if ctx.ProviderForModel[model].ID() == "ollama" { + command.logger.Print("Skipping unsupported ollama model with docker runtime") + + continue + } + + // Create for each model a dedicated subfolder inside the results path. + resultPath, err := filepath.Abs(command.ResultPath) + if err != nil { + return err + } + // Set permission 777 so the non-root docker image is able to store its results inside the result path. + if err := os.Chmod(resultPath, 0777); err != nil { + return err + } + + // Commands regarding the docker runtime. + dockerCommand := []string{ + "docker", + "run", + "-v", // bind volume + resultPath + ":/home/ubuntu/evaluation", + "--rm", // automatically remove container after it finished + "ghcr.io/symflower/eval-dev-quality:v0.5.0", + } + + // Commands for the evaluation to run inside the container. + evaluationCommand := []string{ + "eval-dev-quality", + "evaluate", + "--model", + model.ID(), + "--result-path", + "/home/ubuntu/evaluation/" + model.ID(), + } + + cmd := append(dockerCommand, evaluationCommand...) + cmd = append(cmd, args...) 
+ + commandOutput, err := util.CommandWithResult(context.Background(), command.logger, &util.Command{ + Command: cmd, + }) + if err != nil { + return pkgerrors.WithMessage(pkgerrors.WithStack(err), commandOutput) + } + + } + + return nil +} diff --git a/util/exec.go b/util/exec.go index 9803c2ea..779900e4 100644 --- a/util/exec.go +++ b/util/exec.go @@ -54,3 +54,39 @@ func CommandWithResult(ctx context.Context, logger *log.Logger, command *Command return writer.String(), nil } + +// FilterArgs parses args and removes the ignored ones. +func FilterArgs(args []string, ignored []string) (filtered []string) { + filterMap := map[string]bool{} + for _, v := range ignored { + filterMap[v] = true + } + + // Resolve args with equals sign. + var resolvedArgs []string + for _, v := range args { + if strings.HasPrefix(v, "--") && strings.Contains(v, "=") { + resolvedArgs = append(resolvedArgs, strings.SplitN(v, "=", 2)...) + } else { + resolvedArgs = append(resolvedArgs, v) + } + } + + skip := false + for _, v := range resolvedArgs { + if skip && strings.HasPrefix(v, "--") { + skip = false + } + if filterMap[v] { + skip = true + } + + if skip { + continue + } + + filtered = append(filtered, v) + } + + return filtered +} diff --git a/util/exec_test.go b/util/exec_test.go index 489fa7cb..ec633bc4 100644 --- a/util/exec_test.go +++ b/util/exec_test.go @@ -32,3 +32,62 @@ func TestCommandWithResultTimeout(t *testing.T) { assert.Error(t, err) assert.Less(t, duration.Seconds(), 5.0) } + +func TestFilterArgs(t *testing.T) { + type testCase struct { + Name string + + Args []string + Ignored []string + + ExpectedFiltered []string + } + + validate := func(t *testing.T, tc *testCase) { + t.Run(tc.Name, func(t *testing.T) { + actualFiltered := FilterArgs(tc.Args, tc.Ignored) + + assert.Equal(t, tc.ExpectedFiltered, actualFiltered) + }) + } + + validate(t, &testCase{ + Name: "Filter arguments", + + Args: []string{ + "--runtime", + "abc", + "--runs", + "5", + }, + Ignored: []string{ + 
"--runtime", + }, + + ExpectedFiltered: []string{ + "--runs", + "5", + }, + }) + + validate(t, &testCase{ + Name: "Filter arguments with equals sign", + + Args: []string{ + "--runtime=abc", + "--runs=5", + "--foo", + "bar", + }, + Ignored: []string{ + "--runtime", + }, + + ExpectedFiltered: []string{ + "--runs", + "5", + "--foo", + "bar", + }, + }) +} From c49a891e9994b5c0f655d91e3cd03e8b4fab6682 Mon Sep 17 00:00:00 2001 From: Martin Treml Date: Wed, 26 Jun 2024 11:51:07 +0200 Subject: [PATCH 3/3] Introduce "runtime-image" parameter to specify the container image to use for a different runtime If none is specified we will always use the tagged version defined within eval-dev-quality Part of #198 --- cmd/eval-dev-quality/cmd/evaluate.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cmd/eval-dev-quality/cmd/evaluate.go b/cmd/eval-dev-quality/cmd/evaluate.go index d811393d..73db67dd 100644 --- a/cmd/eval-dev-quality/cmd/evaluate.go +++ b/cmd/eval-dev-quality/cmd/evaluate.go @@ -72,6 +72,8 @@ type Evaluate struct { // Runtime indicates if the evaluation is run locally or inside a container. Runtime string `long:"runtime" description:"The runtime which will be used for the evaluation." default:"local" choice:"local" choice:"docker"` + // RuntimeImage determines the container image used for any container runtime. + RuntimeImage string `long:"runtime-image" description:"The container image to use for the evaluation." default:""` // logger holds the logger of the command. logger *log.Logger @@ -142,6 +144,10 @@ func (command *Evaluate) Initialize(args []string) (evaluationContext *evaluate. 
} } + if command.RuntimeImage == "" { + command.RuntimeImage = "ghcr.io/symflower/eval-dev-quality:v" + evaluate.Version + } + evaluationContext.NoDisqualification = command.NoDisqualification } @@ -488,7 +494,7 @@ func (command *Evaluate) evaluateDocker(ctx *evaluate.Context) (err error) { "-v", // bind volume resultPath + ":/home/ubuntu/evaluation", "--rm", // automatically remove container after it finished - "ghcr.io/symflower/eval-dev-quality:v0.5.0", + command.RuntimeImage, } // Commands for the evaluation to run inside the container.