From 04d5cf220bd5946a34cff38d7e9c2433a0509096 Mon Sep 17 00:00:00 2001
From: Rui Azevedo <rui.azevedo@symflower.com>
Date: Wed, 26 Jun 2024 18:11:53 +0100
Subject: [PATCH] Log LLM queries and responses directly to files, to debug the
 evaluation logic

Part of #204
---
 cmd/eval-dev-quality/cmd/evaluate_test.go | 27 ++++++++
 evaluate/evaluate.go                      |  1 +
 evaluate/task/task-code-repair.go         | 18 ++++-
 evaluate/task/task-write-test.go          | 20 +++++-
 model/llm/llm.go                          | 22 +++++-
 model/llm/llm_test.go                     | 81 +++++++++++++++++++++++
 task/task.go                              |  4 ++
 7 files changed, 164 insertions(+), 9 deletions(-)

diff --git a/cmd/eval-dev-quality/cmd/evaluate_test.go b/cmd/eval-dev-quality/cmd/evaluate_test.go
index ad488f3d..55d2f5de 100644
--- a/cmd/eval-dev-quality/cmd/evaluate_test.go
+++ b/cmd/eval-dev-quality/cmd/evaluate_test.go
@@ -14,6 +14,7 @@ import (
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 	"github.com/zimmski/osutil"
+	"github.com/zimmski/osutil/bytesutil"
 
 	"github.com/symflower/eval-dev-quality/evaluate"
 	"github.com/symflower/eval-dev-quality/evaluate/metrics"
@@ -592,6 +593,19 @@ func TestEvaluateExecute(t *testing.T) {
 						filepath.Join("result-directory", "models-summed.csv"): nil,
 						filepath.Join("result-directory", "README.md"):         nil,
 						filepath.Join("result-directory", string(evaluatetask.IdentifierWriteTests), "ollama_"+model.CleanModelNameForFileSystem(providertesting.OllamaTestModel), "golang", "golang", "plain.log"): nil,
+						filepath.Join("result-directory", string(evaluatetask.IdentifierWriteTests), "ollama_"+model.CleanModelNameForFileSystem(providertesting.OllamaTestModel), "golang", "golang", "plain-plain-1.md"): func(t *testing.T, filePath, data string) {
+							assert.Contains(t, data, "# Query")
+							assert.Contains(t, data, bytesutil.StringTrimIndentations(`
+								`+"```"+`golang
+								package plain
+
+								func plain() {
+									return // This does not do anything but it gives us a line to cover.
+								}
+								`+"```"+`
+							`))
+							assert.Contains(t, data, "# Response")
+						},
 					},
 				})
 			}
@@ -640,6 +654,19 @@ func TestEvaluateExecute(t *testing.T) {
 						filepath.Join("result-directory", "models-summed.csv"): nil,
 						filepath.Join("result-directory", "README.md"):         nil,
 						filepath.Join("result-directory", string(evaluatetask.IdentifierWriteTests), "custom-ollama_"+model.CleanModelNameForFileSystem(providertesting.OllamaTestModel), "golang", "golang", "plain.log"): nil,
+						filepath.Join("result-directory", string(evaluatetask.IdentifierWriteTests), "custom-ollama_"+model.CleanModelNameForFileSystem(providertesting.OllamaTestModel), "golang", "golang", "plain-plain-1.md"): func(t *testing.T, filePath, data string) {
+							assert.Contains(t, data, "# Query")
+							assert.Contains(t, data, bytesutil.StringTrimIndentations(`
+								`+"```"+`golang
+								package plain
+
+								func plain() {
+									return // This does not do anything but it gives us a line to cover.
+								}
+								`+"```"+`
+							`))
+							assert.Contains(t, data, "# Response")
+						},
 					},
 				})
 			}
diff --git a/evaluate/evaluate.go b/evaluate/evaluate.go
index 2c708980..c2ad370d 100644
--- a/evaluate/evaluate.go
+++ b/evaluate/evaluate.go
@@ -125,6 +125,7 @@ func Evaluate(ctx *Context) (assessments *report.AssessmentStore, totalScore uin
 									ctx.Log.Panicf("ERROR: unable to reset temporary repository path: %s", err)
 								}
 
+								task.SetCurrentRun(rm + 1)
 								assessment, ps, err := task.Run(temporaryRepository)
 								assessments.Add(model, language, repositoryPath, taskIdentifier, assessment)
 								if err != nil {
diff --git a/evaluate/task/task-code-repair.go b/evaluate/task/task-code-repair.go
index 76e1ccfc..64ba24b0 100644
--- a/evaluate/task/task-code-repair.go
+++ b/evaluate/task/task-code-repair.go
@@ -3,6 +3,7 @@ package task
 import (
 	"os"
 	"path/filepath"
+	"strconv"
 	"strings"
 
 	pkgerrors "github.com/pkg/errors"
@@ -23,6 +24,9 @@ type TaskCodeRepair struct {
 	// Model holds the model which the task should be evaluated.
 	Model model.Model
 
+	// CurrentRun holds the current run being performed.
+	CurrentRun uint
+
 	// Logger holds the logger for this tasks.
 	Logger *log.Logger
 }
@@ -53,7 +57,8 @@ func (t *TaskCodeRepair) Identifier() evaltask.Identifier {
 // Run performs source code repairing in a repository with compilation errors.
 // This task requires the repository to consist of multiple packages, with each containing one faulty implementation file and a corresponding test file.
 func (t *TaskCodeRepair) Run(repository evaltask.Repository) (repositoryAssessment metrics.Assessments, problems []error, err error) {
-	log, logClose, err := log.WithFile(t.Logger, filepath.Join(t.ResultPath, string(t.Identifier()), model.CleanModelNameForFileSystem(t.Model.ID()), t.Language.ID(), repository.Name()+".log"))
+	logFilePath := filepath.Join(t.ResultPath, string(t.Identifier()), model.CleanModelNameForFileSystem(t.Model.ID()), t.Language.ID())
+	log, logClose, err := log.WithFile(t.Logger, filepath.Join(logFilePath, repository.Name()+".log"))
 	if err != nil {
 		return nil, nil, err
 	}
@@ -86,11 +91,13 @@ func (t *TaskCodeRepair) Run(repository evaltask.Repository) (repositoryAssessme
 			return nil, nil, err
 		}
 
+		fileName := strings.TrimSuffix(sourceFile, filepath.Ext(sourceFile))
 		ctx := evaltask.Context{
 			Language: t.Language,
 
-			RepositoryPath: packagePath,
-			FilePath:       sourceFile,
+			RepositoryPath:        packagePath,
+			FilePath:              sourceFile,
+			QueryResponseFilePath: filepath.Join(logFilePath, repository.Name()+"-"+fileName+"-"+strconv.Itoa(int(t.CurrentRun))+".md"),
 
 			Arguments: &TaskArgumentsCodeRepair{
 				Mistakes: mistakes,
@@ -156,3 +163,8 @@ func (t *TaskCodeRepair) unpackCodeRepairPackage(fileLogger *log.Logger, package
 
 	return sourceFilePath, mistakes, nil
 }
+
+// SetCurrentRun sets the current run being performed.
+func (t *TaskCodeRepair) SetCurrentRun(run uint) {
+	t.CurrentRun = run
+}
diff --git a/evaluate/task/task-write-test.go b/evaluate/task/task-write-test.go
index f7fb1f55..56dc1b35 100644
--- a/evaluate/task/task-write-test.go
+++ b/evaluate/task/task-write-test.go
@@ -2,6 +2,8 @@ package task
 
 import (
 	"path/filepath"
+	"strconv"
+	"strings"
 
 	pkgerrors "github.com/pkg/errors"
 	"github.com/symflower/eval-dev-quality/evaluate/metrics"
@@ -21,6 +23,9 @@ type TaskWriteTests struct {
 	// Model holds the model which the task should be evaluated.
 	Model model.Model
 
+	// CurrentRun holds the current run being performed.
+	CurrentRun uint
+
 	// Logger holds the logger for this tasks.
 	Logger *log.Logger
 }
@@ -46,7 +51,8 @@ func (t *TaskWriteTests) Identifier() evaltask.Identifier {
 func (t *TaskWriteTests) Run(repository evaltask.Repository) (repositoryAssessment metrics.Assessments, problems []error, err error) {
 	dataPath := repository.DataPath()
 
-	log, logClose, err := log.WithFile(t.Logger, filepath.Join(t.ResultPath, string(t.Identifier()), model.CleanModelNameForFileSystem(t.Model.ID()), t.Language.ID(), repository.Name()+".log"))
+	logFilePath := filepath.Join(t.ResultPath, string(t.Identifier()), model.CleanModelNameForFileSystem(t.Model.ID()), t.Language.ID())
+	log, logClose, err := log.WithFile(t.Logger, filepath.Join(logFilePath, repository.Name()+".log"))
 	if err != nil {
 		return nil, nil, err
 	}
@@ -68,11 +74,14 @@ func (t *TaskWriteTests) Run(repository evaltask.Repository) (repositoryAssessme
 			t.Logger.Panicf("ERROR: unable to reset temporary repository path: %s", err)
 		}
 
+		fileNameWithExtension := filepath.Base(filePath)
+		fileName := strings.TrimSuffix(fileNameWithExtension, filepath.Ext(fileNameWithExtension))
 		ctx := evaltask.Context{
 			Language: t.Language,
 
-			RepositoryPath: dataPath,
-			FilePath:       filePath,
+			RepositoryPath:        dataPath,
+			FilePath:              filePath,
+			QueryResponseFilePath: filepath.Join(logFilePath, repository.Name()+"-"+fileName+"-"+strconv.Itoa(int(t.CurrentRun))+".md"),
 
 			Logger: log,
 		}
@@ -102,3 +111,8 @@ func (t *TaskWriteTests) Run(repository evaltask.Repository) (repositoryAssessme
 
 	return repositoryAssessment, problems, nil
 }
+
+// SetCurrentRun sets the current run being performed.
+func (t *TaskWriteTests) SetCurrentRun(run uint) {
+	t.CurrentRun = run
+}
diff --git a/model/llm/llm.go b/model/llm/llm.go
index c69a412b..43e24ce0 100644
--- a/model/llm/llm.go
+++ b/model/llm/llm.go
@@ -193,7 +193,7 @@ func (m *Model) generateTestsForFile(ctx task.Context) (assessment metrics.Asses
 		return nil, err
 	}
 
-	response, duration, err := m.query(ctx.Logger, request)
+	response, duration, err := m.query(ctx, ctx.Logger, request)
 	if err != nil {
 		return nil, pkgerrors.WithStack(err)
 	}
@@ -221,7 +221,7 @@ func (m *Model) generateTestsForFile(ctx task.Context) (assessment metrics.Asses
 	return assessment, nil
 }
 
-func (m *Model) query(log *log.Logger, request string) (response string, duration time.Duration, err error) {
+func (m *Model) query(ctx task.Context, log *log.Logger, request string) (response string, duration time.Duration, err error) {
 	if err := retry.Do(
 		func() error {
 			log.Printf("Querying model %q with:\n%s", m.ID(), string(bytesutil.PrefixLines([]byte(request), []byte("\t"))))
@@ -246,9 +246,25 @@ func (m *Model) query(log *log.Logger, request string) (response string, duratio
 		return "", 0, err
 	}
 
+	// Store the query and the response in a file.
+	if err = writeQueryAndResponseToFile(ctx.QueryResponseFilePath, request, response); err != nil {
+		return "", 0, err
+	}
+
 	return response, duration, nil
 }
 
+// writeQueryAndResponseToFile writes the query under a "# Query" heading followed by the model response under a "# Response" heading to the file at the given path, creating the file if necessary and overwriting any previous content.
+func writeQueryAndResponseToFile(filePath string, query string, response string) (err error) {
+	content := "# Query\n" + query + "\n# Response\n" + response
+
+	if err = os.WriteFile(filePath, []byte(content), 0644); err != nil {
+		return pkgerrors.WithStack(err)
+	}
+
+	return nil
+}
+
 // repairSourceCodeFile queries the model to repair a source code with compilation error.
 func (m *Model) repairSourceCodeFile(ctx task.Context, codeRepairArguments *evaluatetask.TaskArgumentsCodeRepair) (assessment metrics.Assessments, err error) {
 	assessment = map[metrics.AssessmentKey]uint64{}
@@ -276,7 +292,7 @@ func (m *Model) repairSourceCodeFile(ctx task.Context, codeRepairArguments *eval
 		return nil, err
 	}
 
-	response, duration, err := m.query(ctx.Logger, request)
+	response, duration, err := m.query(ctx, ctx.Logger, request)
 	if err != nil {
 		return nil, pkgerrors.WithStack(err)
 	}
diff --git a/model/llm/llm_test.go b/model/llm/llm_test.go
index c09942fb..88c7ae1c 100644
--- a/model/llm/llm_test.go
+++ b/model/llm/llm_test.go
@@ -341,3 +341,84 @@ func TestLLMCodeRepairSourceFilePrompt(t *testing.T) {
 		`),
 	})
 }
+
+// TestWriteQueryAndResponseToFile checks that a query and a response are written to a file under the "# Query" and "# Response" headings.
+func TestWriteQueryAndResponseToFile(t *testing.T) {
+	temporaryPath := t.TempDir()
+	filePath := filepath.Join(temporaryPath, "mistakes-importMissing-1.md")
+
+	query := bytesutil.StringTrimIndentations(`
+		Given the following Go code file "/path/to/foobar.go" with package "foobar" and a list of compilation errors, modify the code such that the errors are resolved.
+		The response must contain only the source code and nothing else.
+
+		` + "```" + `golang
+		package foobar
+		func foobar(i int) int
+			return i + 1
+		}
+		` + "```" + `
+
+		The list of compilation errors is the following:
+		- /path/to/foobar.go:3:1: expected 'IDENT', found 'func'
+		- /path/to/foobar.go: syntax error: non-declaration statement outside function body
+		- /path/to/foobar.go: missing return
+	`)
+	response := bytesutil.StringTrimIndentations(`
+		` + "```" + `
+		package com.eval;
+		public class OpeningBracketMissing {
+			public static int openingBracketMissing(int x) {
+				if (x > 0) {
+					return 1;
+				}
+				if (x < 0) {
+					return -1;
+				}
+				return 0;
+			}
+		}
+		` + "```" + `
+	`)
+
+	err := writeQueryAndResponseToFile(filePath, query, response)
+	require.NoError(t, err)
+
+	expectedFileContent := bytesutil.StringTrimIndentations(`
+		# Query
+		Given the following Go code file "/path/to/foobar.go" with package "foobar" and a list of compilation errors, modify the code such that the errors are resolved.
+		The response must contain only the source code and nothing else.
+
+		` + "```" + `golang
+		package foobar
+		func foobar(i int) int
+			return i + 1
+		}
+		` + "```" + `
+
+		The list of compilation errors is the following:
+		- /path/to/foobar.go:3:1: expected 'IDENT', found 'func'
+		- /path/to/foobar.go: syntax error: non-declaration statement outside function body
+		- /path/to/foobar.go: missing return
+
+		# Response
+		` + "```" + `
+		package com.eval;
+		public class OpeningBracketMissing {
+			public static int openingBracketMissing(int x) {
+				if (x > 0) {
+					return 1;
+				}
+				if (x < 0) {
+					return -1;
+				}
+				return 0;
+			}
+		}
+		` + "```" + `
+	`)
+
+	data, err := os.ReadFile(filePath)
+	require.NoError(t, err)
+
+	assert.Equal(t, expectedFileContent, string(data))
+}
diff --git a/task/task.go b/task/task.go
index 8df261ae..8c34a455 100644
--- a/task/task.go
+++ b/task/task.go
@@ -25,6 +25,8 @@ type Context struct {
 	RepositoryPath string
 	// FilePath holds the path the file under test relative to the repository path.
 	FilePath string
+	// QueryResponseFilePath holds the path of the file where the query and the model response are written to.
+	QueryResponseFilePath string
 
 	// Arguments holds extra data that can be used in a query prompt.
 	Arguments any
@@ -40,6 +42,8 @@ type Task interface {
 
 	// Run runs a task in a given repository.
 	Run(repository Repository) (assessments metrics.Assessments, problems []error, err error)
+	// SetCurrentRun sets the current run being performed.
+	SetCurrentRun(run uint)
 }
 
 // Repository defines a repository to be evaluated.