Skip to content

Commit

Permalink
Log LLM queries and responses directly to files, to debug the evaluat…
Browse files Browse the repository at this point in the history
…ion logic

Part of #204
  • Loading branch information
ruiAzevedo19 committed Jun 26, 2024
1 parent 10fc840 commit 04d5cf2
Show file tree
Hide file tree
Showing 7 changed files with 164 additions and 9 deletions.
27 changes: 27 additions & 0 deletions cmd/eval-dev-quality/cmd/evaluate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/zimmski/osutil"
"github.com/zimmski/osutil/bytesutil"

"github.com/symflower/eval-dev-quality/evaluate"
"github.com/symflower/eval-dev-quality/evaluate/metrics"
Expand Down Expand Up @@ -592,6 +593,19 @@ func TestEvaluateExecute(t *testing.T) {
filepath.Join("result-directory", "models-summed.csv"): nil,
filepath.Join("result-directory", "README.md"): nil,
filepath.Join("result-directory", string(evaluatetask.IdentifierWriteTests), "ollama_"+model.CleanModelNameForFileSystem(providertesting.OllamaTestModel), "golang", "golang", "plain.log"): nil,
filepath.Join("result-directory", string(evaluatetask.IdentifierWriteTests), "ollama_"+model.CleanModelNameForFileSystem(providertesting.OllamaTestModel), "golang", "golang", "plain-plain-1.md"): func(t *testing.T, filePath, data string) {
assert.Contains(t, data, "# Query")
assert.Contains(t, data, bytesutil.StringTrimIndentations(`
`+"```"+`golang
package plain
func plain() {
return // This does not do anything but it gives us a line to cover.
}
`+"```"+`
`))
assert.Contains(t, data, "# Response")
},
},
})
}
Expand Down Expand Up @@ -640,6 +654,19 @@ func TestEvaluateExecute(t *testing.T) {
filepath.Join("result-directory", "models-summed.csv"): nil,
filepath.Join("result-directory", "README.md"): nil,
filepath.Join("result-directory", string(evaluatetask.IdentifierWriteTests), "custom-ollama_"+model.CleanModelNameForFileSystem(providertesting.OllamaTestModel), "golang", "golang", "plain.log"): nil,
filepath.Join("result-directory", string(evaluatetask.IdentifierWriteTests), "custom-ollama_"+model.CleanModelNameForFileSystem(providertesting.OllamaTestModel), "golang", "golang", "plain-plain-1.md"): func(t *testing.T, filePath, data string) {
assert.Contains(t, data, "# Query")
assert.Contains(t, data, bytesutil.StringTrimIndentations(`
`+"```"+`golang
package plain
func plain() {
return // This does not do anything but it gives us a line to cover.
}
`+"```"+`
`))
assert.Contains(t, data, "# Response")
},
},
})
}
Expand Down
1 change: 1 addition & 0 deletions evaluate/evaluate.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ func Evaluate(ctx *Context) (assessments *report.AssessmentStore, totalScore uin
ctx.Log.Panicf("ERROR: unable to reset temporary repository path: %s", err)
}

task.SetCurrentRun(rm + 1)
assessment, ps, err := task.Run(temporaryRepository)
assessments.Add(model, language, repositoryPath, taskIdentifier, assessment)
if err != nil {
Expand Down
18 changes: 15 additions & 3 deletions evaluate/task/task-code-repair.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package task
import (
"os"
"path/filepath"
"strconv"
"strings"

pkgerrors "github.com/pkg/errors"
Expand All @@ -23,6 +24,9 @@ type TaskCodeRepair struct {
// Model holds the model which the task should be evaluated.
Model model.Model

// CurrentRun holds the current run being performed.
CurrentRun uint

// Logger holds the logger for this task.
Logger *log.Logger
}
Expand Down Expand Up @@ -53,7 +57,8 @@ func (t *TaskCodeRepair) Identifier() evaltask.Identifier {
// Run performs source code repairing in a repository with compilation errors.
// This task requires the repository to consist of multiple packages, with each containing one faulty implementation file and a corresponding test file.
func (t *TaskCodeRepair) Run(repository evaltask.Repository) (repositoryAssessment metrics.Assessments, problems []error, err error) {
log, logClose, err := log.WithFile(t.Logger, filepath.Join(t.ResultPath, string(t.Identifier()), model.CleanModelNameForFileSystem(t.Model.ID()), t.Language.ID(), repository.Name()+".log"))
logFilePath := filepath.Join(t.ResultPath, string(t.Identifier()), model.CleanModelNameForFileSystem(t.Model.ID()), t.Language.ID())
log, logClose, err := log.WithFile(t.Logger, filepath.Join(logFilePath, repository.Name()+".log"))
if err != nil {
return nil, nil, err
}
Expand Down Expand Up @@ -86,11 +91,13 @@ func (t *TaskCodeRepair) Run(repository evaltask.Repository) (repositoryAssessme
return nil, nil, err
}

fileName := strings.TrimSuffix(sourceFile, filepath.Ext(sourceFile))
ctx := evaltask.Context{
Language: t.Language,

RepositoryPath: packagePath,
FilePath: sourceFile,
RepositoryPath: packagePath,
FilePath: sourceFile,
QueryResponseFilePath: filepath.Join(logFilePath, repository.Name()+"-"+fileName+"-"+strconv.Itoa(int(t.CurrentRun))+".md"),

Arguments: &TaskArgumentsCodeRepair{
Mistakes: mistakes,
Expand Down Expand Up @@ -156,3 +163,8 @@ func (t *TaskCodeRepair) unpackCodeRepairPackage(fileLogger *log.Logger, package

return sourceFilePath, mistakes, nil
}

// SetCurrentRun sets the current run being performed.
// The value is stored in "CurrentRun".
func (t *TaskCodeRepair) SetCurrentRun(run uint) {
t.CurrentRun = run
}
20 changes: 17 additions & 3 deletions evaluate/task/task-write-test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ package task

import (
"path/filepath"
"strconv"
"strings"

pkgerrors "github.com/pkg/errors"
"github.com/symflower/eval-dev-quality/evaluate/metrics"
Expand All @@ -21,6 +23,9 @@ type TaskWriteTests struct {
// Model holds the model which the task should be evaluated.
Model model.Model

// CurrentRun holds the current run being performed.
CurrentRun uint

// Logger holds the logger for this task.
Logger *log.Logger
}
Expand All @@ -46,7 +51,8 @@ func (t *TaskWriteTests) Identifier() evaltask.Identifier {
func (t *TaskWriteTests) Run(repository evaltask.Repository) (repositoryAssessment metrics.Assessments, problems []error, err error) {
dataPath := repository.DataPath()

log, logClose, err := log.WithFile(t.Logger, filepath.Join(t.ResultPath, string(t.Identifier()), model.CleanModelNameForFileSystem(t.Model.ID()), t.Language.ID(), repository.Name()+".log"))
logFilePath := filepath.Join(t.ResultPath, string(t.Identifier()), model.CleanModelNameForFileSystem(t.Model.ID()), t.Language.ID())
log, logClose, err := log.WithFile(t.Logger, filepath.Join(logFilePath, repository.Name()+".log"))
if err != nil {
return nil, nil, err
}
Expand All @@ -68,11 +74,14 @@ func (t *TaskWriteTests) Run(repository evaltask.Repository) (repositoryAssessme
t.Logger.Panicf("ERROR: unable to reset temporary repository path: %s", err)
}

fileNameWithExtension := filepath.Base(filePath)
fileName := strings.TrimSuffix(fileNameWithExtension, filepath.Ext(fileNameWithExtension))
ctx := evaltask.Context{
Language: t.Language,

RepositoryPath: dataPath,
FilePath: filePath,
RepositoryPath: dataPath,
FilePath: filePath,
QueryResponseFilePath: filepath.Join(logFilePath, repository.Name()+"-"+fileName+"-"+strconv.Itoa(int(t.CurrentRun))+".md"),

Logger: log,
}
Expand Down Expand Up @@ -102,3 +111,8 @@ func (t *TaskWriteTests) Run(repository evaltask.Repository) (repositoryAssessme

return repositoryAssessment, problems, nil
}

// SetCurrentRun sets the current run being performed.
// The value is stored in "CurrentRun".
func (t *TaskWriteTests) SetCurrentRun(run uint) {
t.CurrentRun = run
}
22 changes: 19 additions & 3 deletions model/llm/llm.go
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ func (m *Model) generateTestsForFile(ctx task.Context) (assessment metrics.Asses
return nil, err
}

response, duration, err := m.query(ctx.Logger, request)
response, duration, err := m.query(ctx, ctx.Logger, request)
if err != nil {
return nil, pkgerrors.WithStack(err)
}
Expand Down Expand Up @@ -221,7 +221,7 @@ func (m *Model) generateTestsForFile(ctx task.Context) (assessment metrics.Asses
return assessment, nil
}

func (m *Model) query(log *log.Logger, request string) (response string, duration time.Duration, err error) {
func (m *Model) query(ctx task.Context, log *log.Logger, request string) (response string, duration time.Duration, err error) {
if err := retry.Do(
func() error {
log.Printf("Querying model %q with:\n%s", m.ID(), string(bytesutil.PrefixLines([]byte(request), []byte("\t"))))
Expand All @@ -246,9 +246,25 @@ func (m *Model) query(log *log.Logger, request string) (response string, duratio
return "", 0, err
}

// Store the query and the response in a file.
if err = writeQueryAndResponseToFile(ctx.QueryResponseFilePath, request, response); err != nil {
return "", 0, err
}

return response, duration, nil
}

// writeQueryAndResponseToFile stores a query and the model's response to it in the file at "filePath".
// The query is placed under a "# Query" heading and the response under a "# Response" heading.
// Any error of writing the file is returned with a stack trace attached.
func writeQueryAndResponseToFile(filePath string, query string, response string) (err error) {
	data := []byte("# Query\n" + query + "\n# Response\n" + response)

	err = os.WriteFile(filePath, data, 0644)
	if err == nil {
		return nil
	}

	return pkgerrors.WithStack(err)
}

// repairSourceCodeFile queries the model to repair a source code with compilation error.
func (m *Model) repairSourceCodeFile(ctx task.Context, codeRepairArguments *evaluatetask.TaskArgumentsCodeRepair) (assessment metrics.Assessments, err error) {
assessment = map[metrics.AssessmentKey]uint64{}
Expand Down Expand Up @@ -276,7 +292,7 @@ func (m *Model) repairSourceCodeFile(ctx task.Context, codeRepairArguments *eval
return nil, err
}

response, duration, err := m.query(ctx.Logger, request)
response, duration, err := m.query(ctx, ctx.Logger, request)
if err != nil {
return nil, pkgerrors.WithStack(err)
}
Expand Down
81 changes: 81 additions & 0 deletions model/llm/llm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -341,3 +341,84 @@ func TestLLMCodeRepairSourceFilePrompt(t *testing.T) {
`),
})
}

// TestWriteQueryAndResponseToFile validates that "writeQueryAndResponseToFile" writes the query under a "# Query" heading followed by the response under a "# Response" heading.
func TestWriteQueryAndResponseToFile(t *testing.T) {
// The file is written into a test-owned temporary directory.
temporaryPath := t.TempDir()
// NOTE(review): this local variable shadows the "filepath" identifier used on the right-hand side for the remainder of the function.
filepath := filepath.Join(temporaryPath, "mistakes-importMissing-1.md")

// The query is a code-repair prompt containing a fenced source block and a list of compilation errors.
query := bytesutil.StringTrimIndentations(`
Given the following Go code file "/path/to/foobar.go" with package "foobar" and a list of compilation errors, modify the code such that the errors are resolved.
The response must contain only the source code and nothing else.
` + "```" + `golang
package foobar
func foobar(i int) int
return i + 1
}
` + "```" + `
The list of compilation errors is the following:
- /path/to/foobar.go:3:1: expected 'IDENT', found 'func'
- /path/to/foobar.go: syntax error: non-declaration statement outside function body
- /path/to/foobar.go: missing return
`)
// The response is a fenced code block. Its content is unrelated to the query, as only the writing of the file is validated.
response := bytesutil.StringTrimIndentations(`
` + "```" + `
package com.eval;
public class OpeningBracketMissing {
public static int openingBracketMissing(int x) {
if (x > 0) {
return 1;
}
if (x < 0) {
return -1;
}
return 0;
}
}
` + "```" + `
`)

err := writeQueryAndResponseToFile(filepath, query, response)
require.NoError(t, err)

// The expected content is the query and the response, each preceded by its heading.
expectedFileContent := bytesutil.StringTrimIndentations(`
# Query
Given the following Go code file "/path/to/foobar.go" with package "foobar" and a list of compilation errors, modify the code such that the errors are resolved.
The response must contain only the source code and nothing else.
` + "```" + `golang
package foobar
func foobar(i int) int
return i + 1
}
` + "```" + `
The list of compilation errors is the following:
- /path/to/foobar.go:3:1: expected 'IDENT', found 'func'
- /path/to/foobar.go: syntax error: non-declaration statement outside function body
- /path/to/foobar.go: missing return
# Response
` + "```" + `
package com.eval;
public class OpeningBracketMissing {
public static int openingBracketMissing(int x) {
if (x > 0) {
return 1;
}
if (x < 0) {
return -1;
}
return 0;
}
}
` + "```" + `
`)

// Read the file back to compare its full content with the expectation.
data, err := os.ReadFile(filepath)
require.NoError(t, err)
actualFileContent := string(data)

assert.Equal(t, expectedFileContent, actualFileContent)
}
4 changes: 4 additions & 0 deletions task/task.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ type Context struct {
RepositoryPath string
// FilePath holds the path of the file under test relative to the repository path.
FilePath string
// QueryResponseFilePath holds the file path where query responses are written to.
QueryResponseFilePath string

// Arguments holds extra data that can be used in a query prompt.
Arguments any
Expand All @@ -40,6 +42,8 @@ type Task interface {

// Run runs a task in a given repository.
Run(repository Repository) (assessments metrics.Assessments, problems []error, err error)
// SetCurrentRun sets the current run being performed.
SetCurrentRun(run uint)
}

// Repository defines a repository to be evaluated.
Expand Down

0 comments on commit 04d5cf2

Please sign in to comment.