From 04d5cf220bd5946a34cff38d7e9c2433a0509096 Mon Sep 17 00:00:00 2001 From: Rui Azevedo Date: Wed, 26 Jun 2024 18:11:53 +0100 Subject: [PATCH] Log LLM queries and responses directly to files, to debug the evaluation logic Part of #204 --- cmd/eval-dev-quality/cmd/evaluate_test.go | 27 ++++++++ evaluate/evaluate.go | 1 + evaluate/task/task-code-repair.go | 18 ++++- evaluate/task/task-write-test.go | 20 +++++- model/llm/llm.go | 22 +++++- model/llm/llm_test.go | 81 +++++++++++++++++++++++ task/task.go | 4 ++ 7 files changed, 164 insertions(+), 9 deletions(-) diff --git a/cmd/eval-dev-quality/cmd/evaluate_test.go b/cmd/eval-dev-quality/cmd/evaluate_test.go index ad488f3d..55d2f5de 100644 --- a/cmd/eval-dev-quality/cmd/evaluate_test.go +++ b/cmd/eval-dev-quality/cmd/evaluate_test.go @@ -14,6 +14,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/zimmski/osutil" + "github.com/zimmski/osutil/bytesutil" "github.com/symflower/eval-dev-quality/evaluate" "github.com/symflower/eval-dev-quality/evaluate/metrics" @@ -592,6 +593,19 @@ func TestEvaluateExecute(t *testing.T) { filepath.Join("result-directory", "models-summed.csv"): nil, filepath.Join("result-directory", "README.md"): nil, filepath.Join("result-directory", string(evaluatetask.IdentifierWriteTests), "ollama_"+model.CleanModelNameForFileSystem(providertesting.OllamaTestModel), "golang", "golang", "plain.log"): nil, + filepath.Join("result-directory", string(evaluatetask.IdentifierWriteTests), "ollama_"+model.CleanModelNameForFileSystem(providertesting.OllamaTestModel), "golang", "golang", "plain-plain-1.md"): func(t *testing.T, filePath, data string) { + assert.Contains(t, data, "# Query") + assert.Contains(t, data, bytesutil.StringTrimIndentations(` + `+"```"+`golang + package plain + + func plain() { + return // This does not do anything but it gives us a line to cover. 
+ } + `+"```"+` + `)) + assert.Contains(t, data, "# Response") + }, }, }) } @@ -640,6 +654,19 @@ func TestEvaluateExecute(t *testing.T) { filepath.Join("result-directory", "models-summed.csv"): nil, filepath.Join("result-directory", "README.md"): nil, filepath.Join("result-directory", string(evaluatetask.IdentifierWriteTests), "custom-ollama_"+model.CleanModelNameForFileSystem(providertesting.OllamaTestModel), "golang", "golang", "plain.log"): nil, + filepath.Join("result-directory", string(evaluatetask.IdentifierWriteTests), "custom-ollama_"+model.CleanModelNameForFileSystem(providertesting.OllamaTestModel), "golang", "golang", "plain-plain-1.md"): func(t *testing.T, filePath, data string) { + assert.Contains(t, data, "# Query") + assert.Contains(t, data, bytesutil.StringTrimIndentations(` + `+"```"+`golang + package plain + + func plain() { + return // This does not do anything but it gives us a line to cover. + } + `+"```"+` + `)) + assert.Contains(t, data, "# Response") + }, }, }) } diff --git a/evaluate/evaluate.go b/evaluate/evaluate.go index 2c708980..c2ad370d 100644 --- a/evaluate/evaluate.go +++ b/evaluate/evaluate.go @@ -125,6 +125,7 @@ func Evaluate(ctx *Context) (assessments *report.AssessmentStore, totalScore uin ctx.Log.Panicf("ERROR: unable to reset temporary repository path: %s", err) } + task.SetCurrentRun(rm + 1) assessment, ps, err := task.Run(temporaryRepository) assessments.Add(model, language, repositoryPath, taskIdentifier, assessment) if err != nil { diff --git a/evaluate/task/task-code-repair.go b/evaluate/task/task-code-repair.go index 76e1ccfc..64ba24b0 100644 --- a/evaluate/task/task-code-repair.go +++ b/evaluate/task/task-code-repair.go @@ -3,6 +3,7 @@ package task import ( "os" "path/filepath" + "strconv" "strings" pkgerrors "github.com/pkg/errors" @@ -23,6 +24,9 @@ type TaskCodeRepair struct { // Model holds the model which the task should be evaluated. Model model.Model + // CurrentRun holds the current run being performed. 
+ CurrentRun uint + // Logger holds the logger for this tasks. Logger *log.Logger } @@ -53,7 +57,8 @@ func (t *TaskCodeRepair) Identifier() evaltask.Identifier { // Run performs source code repairing in a repository with compilation errors. // This task requires the repository to consist of multiple packages, with each containing one faulty implementation file and a corresponding test file. func (t *TaskCodeRepair) Run(repository evaltask.Repository) (repositoryAssessment metrics.Assessments, problems []error, err error) { - log, logClose, err := log.WithFile(t.Logger, filepath.Join(t.ResultPath, string(t.Identifier()), model.CleanModelNameForFileSystem(t.Model.ID()), t.Language.ID(), repository.Name()+".log")) + logFilePath := filepath.Join(t.ResultPath, string(t.Identifier()), model.CleanModelNameForFileSystem(t.Model.ID()), t.Language.ID()) + log, logClose, err := log.WithFile(t.Logger, filepath.Join(logFilePath, repository.Name()+".log")) if err != nil { return nil, nil, err } @@ -86,11 +91,13 @@ func (t *TaskCodeRepair) Run(repository evaltask.Repository) (repositoryAssessme return nil, nil, err } + fileName := strings.TrimSuffix(sourceFile, filepath.Ext(sourceFile)) ctx := evaltask.Context{ Language: t.Language, - RepositoryPath: packagePath, - FilePath: sourceFile, + RepositoryPath: packagePath, + FilePath: sourceFile, + QueryResponseFilePath: filepath.Join(logFilePath, repository.Name()+"-"+fileName+"-"+strconv.Itoa(int(t.CurrentRun))+".md"), Arguments: &TaskArgumentsCodeRepair{ Mistakes: mistakes, @@ -156,3 +163,8 @@ func (t *TaskCodeRepair) unpackCodeRepairPackage(fileLogger *log.Logger, package return sourceFilePath, mistakes, nil } + +// SetCurrentRun sets the current run being performed. 
+func (t *TaskCodeRepair) SetCurrentRun(run uint) { + t.CurrentRun = run +} diff --git a/evaluate/task/task-write-test.go b/evaluate/task/task-write-test.go index f7fb1f55..56dc1b35 100644 --- a/evaluate/task/task-write-test.go +++ b/evaluate/task/task-write-test.go @@ -2,6 +2,8 @@ package task import ( "path/filepath" + "strconv" + "strings" pkgerrors "github.com/pkg/errors" "github.com/symflower/eval-dev-quality/evaluate/metrics" @@ -21,6 +23,9 @@ type TaskWriteTests struct { // Model holds the model which the task should be evaluated. Model model.Model + // CurrentRun holds the current run being performed. + CurrentRun uint + // Logger holds the logger for this tasks. Logger *log.Logger } @@ -46,7 +51,8 @@ func (t *TaskWriteTests) Identifier() evaltask.Identifier { func (t *TaskWriteTests) Run(repository evaltask.Repository) (repositoryAssessment metrics.Assessments, problems []error, err error) { dataPath := repository.DataPath() - log, logClose, err := log.WithFile(t.Logger, filepath.Join(t.ResultPath, string(t.Identifier()), model.CleanModelNameForFileSystem(t.Model.ID()), t.Language.ID(), repository.Name()+".log")) + logFilePath := filepath.Join(t.ResultPath, string(t.Identifier()), model.CleanModelNameForFileSystem(t.Model.ID()), t.Language.ID()) + log, logClose, err := log.WithFile(t.Logger, filepath.Join(logFilePath, repository.Name()+".log")) if err != nil { return nil, nil, err } @@ -68,11 +74,14 @@ func (t *TaskWriteTests) Run(repository evaltask.Repository) (repositoryAssessme t.Logger.Panicf("ERROR: unable to reset temporary repository path: %s", err) } + fileNameWithExtension := filepath.Base(filePath) + fileName := strings.TrimSuffix(fileNameWithExtension, filepath.Ext(fileNameWithExtension)) ctx := evaltask.Context{ Language: t.Language, - RepositoryPath: dataPath, - FilePath: filePath, + RepositoryPath: dataPath, + FilePath: filePath, + QueryResponseFilePath: filepath.Join(logFilePath, 
repository.Name()+"-"+fileName+"-"+strconv.Itoa(int(t.CurrentRun))+".md"), Logger: log, } @@ -102,3 +111,8 @@ func (t *TaskWriteTests) Run(repository evaltask.Repository) (repositoryAssessme return repositoryAssessment, problems, nil } + +// SetCurrentRun sets the current run being performed. +func (t *TaskWriteTests) SetCurrentRun(run uint) { + t.CurrentRun = run +} diff --git a/model/llm/llm.go b/model/llm/llm.go index c69a412b..43e24ce0 100644 --- a/model/llm/llm.go +++ b/model/llm/llm.go @@ -193,7 +193,7 @@ func (m *Model) generateTestsForFile(ctx task.Context) (assessment metrics.Asses return nil, err } - response, duration, err := m.query(ctx.Logger, request) + response, duration, err := m.query(ctx, ctx.Logger, request) if err != nil { return nil, pkgerrors.WithStack(err) } @@ -221,7 +221,7 @@ func (m *Model) generateTestsForFile(ctx task.Context) (assessment metrics.Asses return assessment, nil } -func (m *Model) query(log *log.Logger, request string) (response string, duration time.Duration, err error) { +func (m *Model) query(ctx task.Context, log *log.Logger, request string) (response string, duration time.Duration, err error) { if err := retry.Do( func() error { log.Printf("Querying model %q with:\n%s", m.ID(), string(bytesutil.PrefixLines([]byte(request), []byte("\t")))) @@ -246,9 +246,25 @@ func (m *Model) query(log *log.Logger, request string) (response string, duratio return "", 0, err } + // Store the query and the response in a file. + if err = writeQueryAndResponseToFile(ctx.QueryResponseFilePath, request, response); err != nil { + return "", 0, err + } + return response, duration, nil } +// writeQueryAndResponseToFile writes the query and model response to a file. 
+func writeQueryAndResponseToFile(filePath string, query string, response string) (err error) { + content := "# Query\n" + query + "\n# Response\n" + response + + if err = os.WriteFile(filePath, []byte(content), 0644); err != nil { + return pkgerrors.WithStack(err) + } + + return nil +} + // repairSourceCodeFile queries the model to repair a source code with compilation error. func (m *Model) repairSourceCodeFile(ctx task.Context, codeRepairArguments *evaluatetask.TaskArgumentsCodeRepair) (assessment metrics.Assessments, err error) { assessment = map[metrics.AssessmentKey]uint64{} @@ -276,7 +292,7 @@ func (m *Model) repairSourceCodeFile(ctx task.Context, codeRepairArguments *eval return nil, err } - response, duration, err := m.query(ctx.Logger, request) + response, duration, err := m.query(ctx, ctx.Logger, request) if err != nil { return nil, pkgerrors.WithStack(err) } diff --git a/model/llm/llm_test.go b/model/llm/llm_test.go index c09942fb..88c7ae1c 100644 --- a/model/llm/llm_test.go +++ b/model/llm/llm_test.go @@ -341,3 +341,84 @@ func TestLLMCodeRepairSourceFilePrompt(t *testing.T) { `), }) } + +func TestWriteQueryAndResponseToFile(t *testing.T) { + temporaryPath := t.TempDir() + filepath := filepath.Join(temporaryPath, "mistakes-importMissing-1.md") + + query := bytesutil.StringTrimIndentations(` + Given the following Go code file "/path/to/foobar.go" with package "foobar" and a list of compilation errors, modify the code such that the errors are resolved. + The response must contain only the source code and nothing else. 
+ + ` + "```" + `golang + package foobar + func foobar(i int) int + return i + 1 + } + ` + "```" + ` + + The list of compilation errors is the following: + - /path/to/foobar.go:3:1: expected 'IDENT', found 'func' + - /path/to/foobar.go: syntax error: non-declaration statement outside function body + - /path/to/foobar.go: missing return + `) + response := bytesutil.StringTrimIndentations(` + ` + "```" + ` + package com.eval; + public class OpeningBracketMissing { + public static int openingBracketMissing(int x) { + if (x > 0) { + return 1; + } + if (x < 0) { + return -1; + } + return 0; + } + } + ` + "```" + ` + `) + + err := writeQueryAndResponseToFile(filepath, query, response) + require.NoError(t, err) + + expectedFileContent := bytesutil.StringTrimIndentations(` + # Query + Given the following Go code file "/path/to/foobar.go" with package "foobar" and a list of compilation errors, modify the code such that the errors are resolved. + The response must contain only the source code and nothing else. 
+ + ` + "```" + `golang + package foobar + func foobar(i int) int + return i + 1 + } + ` + "```" + ` + + The list of compilation errors is the following: + - /path/to/foobar.go:3:1: expected 'IDENT', found 'func' + - /path/to/foobar.go: syntax error: non-declaration statement outside function body + - /path/to/foobar.go: missing return + + # Response + ` + "```" + ` + package com.eval; + public class OpeningBracketMissing { + public static int openingBracketMissing(int x) { + if (x > 0) { + return 1; + } + if (x < 0) { + return -1; + } + return 0; + } + } + ` + "```" + ` + `) + + data, err := os.ReadFile(filepath) + require.NoError(t, err) + actualFileContent := string(data) + + assert.Equal(t, expectedFileContent, actualFileContent) +} diff --git a/task/task.go b/task/task.go index 8df261ae..8c34a455 100644 --- a/task/task.go +++ b/task/task.go @@ -25,6 +25,8 @@ type Context struct { RepositoryPath string // FilePath holds the path the file under test relative to the repository path. FilePath string + // QueryResponseFilePath holds the file path where the query and response are written to. + QueryResponseFilePath string // Arguments holds extra data that can be used in a query prompt. Arguments any @@ -40,6 +42,8 @@ type Task interface { // Run runs a task in a given repository. Run(repository Repository) (assessments metrics.Assessments, problems []error, err error) + // SetCurrentRun sets the current run being performed. + SetCurrentRun(run uint) } // Repository defines a repository to be evaluated.