Skip to content

Commit

Permalink
Dump the assessments to the evaluation CSV right after running a task…
Browse files Browse the repository at this point in the history
…, to avoid losing information if the evaluation errors

Part of #237
  • Loading branch information
ruiAzevedo19 committed Jul 4, 2024
1 parent 156e23c commit a9b9c14
Show file tree
Hide file tree
Showing 6 changed files with 246 additions and 26 deletions.
9 changes: 0 additions & 9 deletions cmd/eval-dev-quality/cmd/evaluate.go
Original file line number Diff line number Diff line change
Expand Up @@ -629,15 +629,6 @@ func (command *Evaluate) evaluateKubernetes(ctx *evaluate.Context) (err error) {

// writeCSVs writes the various CSV reports to disk.
func writeCSVs(resultPath string, assessments *report.AssessmentStore) (err error) {
// Write the "evaluation.csv" containing all data.
csv, err := report.GenerateCSV(assessments)
if err != nil {
return pkgerrors.Wrap(err, "could not create evaluation.csv summary")
}
if err := os.WriteFile(filepath.Join(resultPath, "evaluation.csv"), []byte(csv), 0644); err != nil {
return pkgerrors.Wrap(err, "could not write evaluation.csv summary")
}

// Write the "models-summed.csv" containing the summary per model.
byModel := assessments.CollapseByModel()
csvByModel, err := report.GenerateCSV(byModel)
Expand Down
61 changes: 44 additions & 17 deletions cmd/eval-dev-quality/cmd/evaluate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -731,27 +731,54 @@ func TestEvaluateExecute(t *testing.T) {
filepath.Join("result-directory", "evaluation.csv"): func(t *testing.T, filePath, data string) {
actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{
metrics.Assessments{
metrics.AssessmentKeyCoverage: 30,
metrics.AssessmentKeyFilesExecuted: 3,
metrics.AssessmentKeyResponseNoError: 3,
metrics.AssessmentKeyResponseNoExcess: 3,
metrics.AssessmentKeyResponseWithCode: 3,
metrics.AssessmentKeyCoverage: 10,
metrics.AssessmentKeyFilesExecuted: 1,
metrics.AssessmentKeyResponseNoError: 1,
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseWithCode: 1,
},
metrics.Assessments{
metrics.AssessmentKeyCoverage: 30,
metrics.AssessmentKeyFilesExecuted: 3,
metrics.AssessmentKeyResponseNoError: 3,
metrics.AssessmentKeyResponseNoExcess: 3,
metrics.AssessmentKeyResponseWithCode: 3,
metrics.AssessmentKeyCoverage: 10,
metrics.AssessmentKeyFilesExecuted: 1,
metrics.AssessmentKeyResponseNoError: 1,
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseWithCode: 1,
},
}, []uint64{42, 42})
metrics.Assessments{
metrics.AssessmentKeyCoverage: 10,
metrics.AssessmentKeyFilesExecuted: 1,
metrics.AssessmentKeyResponseNoError: 1,
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseWithCode: 1,
},
metrics.Assessments{
metrics.AssessmentKeyCoverage: 10,
metrics.AssessmentKeyFilesExecuted: 1,
metrics.AssessmentKeyResponseNoError: 1,
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseWithCode: 1,
},
metrics.Assessments{
metrics.AssessmentKeyCoverage: 10,
metrics.AssessmentKeyFilesExecuted: 1,
metrics.AssessmentKeyResponseNoError: 1,
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseWithCode: 1,
},
metrics.Assessments{
metrics.AssessmentKeyCoverage: 10,
metrics.AssessmentKeyFilesExecuted: 1,
metrics.AssessmentKeyResponseNoError: 1,
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseWithCode: 1,
},
}, []uint64{14, 14, 14, 14, 14, 14})
// Assert non-deterministic behavior.
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(762))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(762))
assert.Greater(t, actualAssessments[1][metrics.AssessmentKeyProcessingTime], uint64(0))
assert.Equal(t, actualAssessments[1][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(762))
assert.Equal(t, actualAssessments[1][metrics.AssessmentKeyResponseCharacterCount], uint64(762))
for _, assessment := range actualAssessments {
assert.Greater(t, assessment[metrics.AssessmentKeyProcessingTime], uint64(0))
assert.Equal(t, assessment[metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254))
assert.Equal(t, assessment[metrics.AssessmentKeyResponseCharacterCount], uint64(254))
}
},
filepath.Join("result-directory", "evaluation.log"): func(t *testing.T, filePath, data string) {
assert.Contains(t, data, "Run 1/3")
Expand Down
6 changes: 6 additions & 0 deletions evaluate/evaluate.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ func Evaluate(ctx *Context) (assessments *report.AssessmentStore, totalScore uin
// Ensure we report metrics for every model even if they are excluded.
assessments = report.NewAssessmentStore()
problemsPerModel := map[string][]error{}
// Write the evaluation CSV header so it's only written once.
report.WriteEvaluationHeader(ctx.ResultPath)

{
// Create temporary repositories for each language so the repository is copied only once per language.
Expand Down Expand Up @@ -145,6 +147,8 @@ func Evaluate(ctx *Context) (assessments *report.AssessmentStore, totalScore uin
modelSucceededBasicChecksOfLanguage[model][language] = true
}
assessments.AddAssessmentPerTask(model, language, repositoryPath, assessment)
// Write the task assessment to the evaluation CSV file.
report.WriteEvaluationRecord(ctx.ResultPath, model, language, temporaryRepository.Name(), assessment)
}
})
}
Expand Down Expand Up @@ -249,6 +253,8 @@ func Evaluate(ctx *Context) (assessments *report.AssessmentStore, totalScore uin
ctx.Log.Printf("ERROR: Model %q encountered a hard error for language %q, repository %q: %+v", modelID, languageID, repositoryPath, err)
}
assessments.AddAssessmentPerTask(model, language, repositoryPath, assessment)
// Write the task assessment to the evaluation CSV file.
report.WriteEvaluationRecord(ctx.ResultPath, model, language, temporaryRepository.Name(), assessment)
}
})
}
Expand Down
33 changes: 33 additions & 0 deletions evaluate/evaluate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,8 @@ func TestEvaluate(t *testing.T) {
Before: func(t *testing.T, logger *log.Logger, resultPath string) {
// Set up mocks, when test is running.
mockedModel.MockCapabilityWriteTests.On("WriteTests", mock.Anything).Return(nil, ErrEmptyResponseFromModel)
mockedModel.MockModel.On("Name").Return("Empty Response Model")
mockedModel.MockModel.On("Cost").Return(0.0001)
},

Context: &Context{
Expand Down Expand Up @@ -211,6 +213,7 @@ func TestEvaluate(t *testing.T) {
ExpectedTotalScore: 2,
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
filepath.Join(string(evaluatetask.IdentifierWriteTests), mockedModel.ID(), "golang", "golang", "plain.log"): nil,
filepath.Join("evaluation.csv"): nil,
},
})
}
Expand Down Expand Up @@ -266,6 +269,7 @@ func TestEvaluate(t *testing.T) {
filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): func(t *testing.T, filePath, data string) {
assert.Contains(t, data, ErrEmptyResponseFromModel.Error())
},
filepath.Join("evaluation.csv"): nil,
},
})
}
Expand Down Expand Up @@ -332,6 +336,7 @@ func TestEvaluate(t *testing.T) {
filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): func(t *testing.T, filePath, data string) {
assert.Contains(t, data, "Attempt 1/3: "+ErrEmptyResponseFromModel.Error())
},
filepath.Join("evaluation.csv"): nil,
},
})
}
Expand Down Expand Up @@ -397,6 +402,7 @@ func TestEvaluate(t *testing.T) {
filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): func(t *testing.T, filePath, data string) {
assert.Contains(t, data, "DONE 0 tests, 1 error")
},
filepath.Join("evaluation.csv"): nil,
},
})
}
Expand Down Expand Up @@ -428,6 +434,8 @@ func TestEvaluate(t *testing.T) {
languageGolang := &golang.Language{}
mockedModelID := "mocked-generation-model"
mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID)
mockedModel.MockModel.On("Name").Return("Mocked Generation Model")
mockedModel.MockModel.On("Cost").Return(0.0001)

validate(t, &testCase{
Name: "Problems of previous runs shouldn't cancel successive runs",
Expand Down Expand Up @@ -517,13 +525,16 @@ func TestEvaluate(t *testing.T) {
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil,
filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "next.log"): nil,
filepath.Join("evaluation.csv"): nil,
},
})
}
{
languageGolang := &golang.Language{}
mockedModelID := "mocked-generation-model"
mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID)
mockedModel.MockModel.On("Name").Return("Mocked Generation Model")
mockedModel.MockModel.On("Cost").Return(0.0001)

validate(t, &testCase{
Name: "Solving basic checks once is enough",
Expand Down Expand Up @@ -612,13 +623,16 @@ func TestEvaluate(t *testing.T) {
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil,
filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "next.log"): nil,
filepath.Join("evaluation.csv"): nil,
},
})
}
{
languageGolang := &golang.Language{}
mockedModelID := "mocked-generation-model"
mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID)
mockedModel.MockModel.On("Name").Return("Mocked Generation Model")
mockedModel.MockModel.On("Cost").Return(0.0001)

validate(t, &testCase{
Name: "Never solving basic checks leads to exclusion",
Expand Down Expand Up @@ -672,6 +686,7 @@ func TestEvaluate(t *testing.T) {
ExpectedTotalScore: 0,
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil,
filepath.Join("evaluation.csv"): nil,
},
})
}
Expand All @@ -684,6 +699,9 @@ func TestEvaluate(t *testing.T) {
languageGolang := &golang.Language{}
mockedModelID := "mocked-generation-model"
mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID)
mockedModel.MockModel.On("Name").Return("Mocked Generation Model")
mockedModel.MockModel.On("Cost").Return(0.0001)

repositoryPath := filepath.Join("golang", "plain")
validate(t, &testCase{
Name: "Interleaved",
Expand Down Expand Up @@ -736,6 +754,7 @@ func TestEvaluate(t *testing.T) {
ExpectedTotalScore: 6,
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil,
filepath.Join("evaluation.csv"): nil,
},
ExpectedOutputValidate: func(t *testing.T, output string, resultPath string) {
assert.Contains(t, output, "Run 1/3")
Expand All @@ -751,6 +770,9 @@ func TestEvaluate(t *testing.T) {
languageGolang := &golang.Language{}
mockedModelID := "mocked-generation-model"
mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID)
mockedModel.MockModel.On("Name").Return("Mocked Generation Model")
mockedModel.MockModel.On("Cost").Return(0.0001)

repositoryPath := filepath.Join("golang", "plain")
validate(t, &testCase{
Name: "Sequential",
Expand Down Expand Up @@ -803,6 +825,7 @@ func TestEvaluate(t *testing.T) {
ExpectedTotalScore: 6,
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil,
filepath.Join("evaluation.csv"): nil,
},
ExpectedOutputValidate: func(t *testing.T, output string, resultPath string) {
assert.Contains(t, output, "Run 1/3 for model")
Expand All @@ -826,6 +849,8 @@ func TestEvaluate(t *testing.T) {
languageGolang := &golang.Language{}
mockedModelID := "testing-provider/testing-model"
mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID)
mockedModel.MockModel.On("Name").Return("Testing Model")
mockedModel.MockModel.On("Cost").Return(0.0001)
mockedProviderID := "testing-provider"
mockedProvider := providertesting.NewMockProviderNamedWithModels(t, mockedProviderID, []model.Model{mockedModel})
mockedLoader := providertesting.NewMockLoader(t)
Expand Down Expand Up @@ -900,6 +925,7 @@ func TestEvaluate(t *testing.T) {
ExpectedTotalScore: 6,
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil,
filepath.Join("evaluation.csv"): nil,
},
})
}
Expand All @@ -908,6 +934,8 @@ func TestEvaluate(t *testing.T) {
languageGolang := &golang.Language{}
mockedModelID := "testing-provider/testing-model"
mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID)
mockedModel.MockModel.On("Name").Return("Testing Model")
mockedModel.MockModel.On("Cost").Return(0.0001)
mockedProviderID := "testing-provider"
mockedProvider := providertesting.NewMockProviderNamedWithModels(t, mockedProviderID, []model.Model{mockedModel})
mockedLoader := providertesting.NewMockLoader(t)
Expand Down Expand Up @@ -980,6 +1008,7 @@ func TestEvaluate(t *testing.T) {
ExpectedTotalScore: 6,
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil,
filepath.Join("evaluation.csv"): nil,
},
})
}
Expand All @@ -989,6 +1018,9 @@ func TestEvaluate(t *testing.T) {
languageGolang := &golang.Language{}
mockedModelID := "testing-provider/testing-model"
mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID)
mockedModel.MockModel.On("Name").Return("Testing Model")
mockedModel.MockModel.On("Cost").Return(0.0001)

repositoryPath := filepath.Join("golang", "plain")

validate(t, &testCase{
Expand Down Expand Up @@ -1041,6 +1073,7 @@ func TestEvaluate(t *testing.T) {
ExpectedTotalScore: 2,
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil,
filepath.Join("evaluation.csv"): nil,
},
})
}
Expand Down
55 changes: 55 additions & 0 deletions evaluate/report/csv.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ package report
import (
"cmp"
"encoding/csv"
"os"
"path/filepath"
"slices"
"strconv"
"strings"
Expand Down Expand Up @@ -88,3 +90,56 @@ func (a AssessmentPerModel) Rows() (rows [][]string) {

return rows
}

// EvaluationHeader returns the CSV header for the evaluation CSV.
// The fixed columns come first, followed by one column per entry of "metrics.AllAssessmentKeysStrings".
func EvaluationHeader() (header []string) {
	header = []string{"model-id", "model-name", "cost", "language", "repository", "task", "score"}
	header = append(header, metrics.AllAssessmentKeysStrings...)

	return header
}

// WriteEvaluationHeader writes only the evaluation CSV header to the "evaluation.csv" file inside "resultPath".
// The file is written with "os.WriteFile", so an already existing file is truncated and afterwards contains just the header row.
// It returns a stack-annotated error if encoding the header or writing the file fails.
func WriteEvaluationHeader(resultPath string) (err error) {
	var out strings.Builder
	writer := csv.NewWriter(&out)

	if err := writer.Write(EvaluationHeader()); err != nil {
		return pkgerrors.WithStack(err)
	}
	// Writes are buffered, so flush and then surface any error that happened during a previous write or the flush itself.
	writer.Flush()
	if err := writer.Error(); err != nil {
		return pkgerrors.WithStack(err)
	}

	if err := os.WriteFile(filepath.Join(resultPath, "evaluation.csv"), []byte(out.String()), 0644); err != nil {
		return pkgerrors.WithStack(err)
	}

	return nil
}

// WriteEvaluationRecord appends the assessments of a task run to the "evaluation.csv" file inside "resultPath".
// One CSV row is written per task of "assessmentsPerTask", ordered by task identifier so the output is deterministic.
// Each row holds the model ID, model name, model cost, language ID, repository name, task identifier and score, followed by the assessment's own CSV columns.
// The file is opened in append mode and created if it does not exist yet.
// It returns a stack-annotated error if encoding a row, opening, writing or closing the file fails.
func WriteEvaluationRecord(resultPath string, model model.Model, language language.Language, repositoryName string, assessmentsPerTask map[task.Identifier]metrics.Assessments) (err error) {
	var out strings.Builder
	writer := csv.NewWriter(&out)

	// Map iteration order is random, so sort the task identifiers before writing.
	tasks := maps.Keys(assessmentsPerTask)
	slices.SortStableFunc(tasks, func(a, b task.Identifier) int {
		return cmp.Compare(a, b)
	})

	for _, taskIdentifier := range tasks {
		assessment := assessmentsPerTask[taskIdentifier]
		row := append([]string{model.ID(), model.Name(), strconv.FormatFloat(model.Cost(), 'f', -1, 64), language.ID(), repositoryName, string(taskIdentifier), strconv.FormatUint(uint64(assessment.Score()), 10)}, assessment.StringCSV()...)
		if err := writer.Write(row); err != nil {
			return pkgerrors.WithStack(err)
		}
	}
	// Writes are buffered, so flush and then surface any error that happened during a previous write or the flush itself.
	writer.Flush()
	if err := writer.Error(); err != nil {
		return pkgerrors.WithStack(err)
	}

	evaluationFile, err := os.OpenFile(filepath.Join(resultPath, "evaluation.csv"), os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
	if err != nil {
		return pkgerrors.WithStack(err)
	}
	defer func() {
		// A failing close can mean the appended data never reached the disk, so report it unless an earlier error is already being returned.
		if closeErr := evaluationFile.Close(); closeErr != nil && err == nil {
			err = pkgerrors.WithStack(closeErr)
		}
	}()

	if _, err := evaluationFile.WriteString(out.String()); err != nil {
		return pkgerrors.WithStack(err)
	}

	return nil
}

Loading

0 comments on commit a9b9c14

Please sign in to comment.