Skip to content

Commit

Permalink
Dump the assessments to the evaluation CSV right after running a task…
Browse files Browse the repository at this point in the history
…, to avoid losing information if the evaluation errors

Part of #237
  • Loading branch information
ruiAzevedo19 committed Jul 5, 2024
1 parent 62f9feb commit e09fa1c
Show file tree
Hide file tree
Showing 7 changed files with 243 additions and 141 deletions.
9 changes: 0 additions & 9 deletions cmd/eval-dev-quality/cmd/evaluate.go
Original file line number Diff line number Diff line change
Expand Up @@ -639,15 +639,6 @@ func (command *Evaluate) evaluateKubernetes(ctx *evaluate.Context) (err error) {

// writeCSVs writes the various CSV reports to disk.
func writeCSVs(resultPath string, assessments *report.AssessmentStore) (err error) {
// Write the "evaluation.csv" containing all data.
csv, err := report.GenerateCSV(assessments)
if err != nil {
return pkgerrors.Wrap(err, "could not create evaluation.csv summary")
}
if err := os.WriteFile(filepath.Join(resultPath, "evaluation.csv"), []byte(csv), 0644); err != nil {
return pkgerrors.Wrap(err, "could not write evaluation.csv summary")
}

// Write the "models-summed.csv" containing the summary per model.
byModel := assessments.CollapseByModel()
csvByModel, err := report.GenerateCSV(byModel)
Expand Down
61 changes: 44 additions & 17 deletions cmd/eval-dev-quality/cmd/evaluate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -731,27 +731,54 @@ func TestEvaluateExecute(t *testing.T) {
filepath.Join("result-directory", "evaluation.csv"): func(t *testing.T, filePath, data string) {
actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{
metrics.Assessments{
metrics.AssessmentKeyCoverage: 30,
metrics.AssessmentKeyFilesExecuted: 3,
metrics.AssessmentKeyResponseNoError: 3,
metrics.AssessmentKeyResponseNoExcess: 3,
metrics.AssessmentKeyResponseWithCode: 3,
metrics.AssessmentKeyCoverage: 10,
metrics.AssessmentKeyFilesExecuted: 1,
metrics.AssessmentKeyResponseNoError: 1,
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseWithCode: 1,
},
metrics.Assessments{
metrics.AssessmentKeyCoverage: 30,
metrics.AssessmentKeyFilesExecuted: 3,
metrics.AssessmentKeyResponseNoError: 3,
metrics.AssessmentKeyResponseNoExcess: 3,
metrics.AssessmentKeyResponseWithCode: 3,
metrics.AssessmentKeyCoverage: 10,
metrics.AssessmentKeyFilesExecuted: 1,
metrics.AssessmentKeyResponseNoError: 1,
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseWithCode: 1,
},
}, []uint64{42, 42})
metrics.Assessments{
metrics.AssessmentKeyCoverage: 10,
metrics.AssessmentKeyFilesExecuted: 1,
metrics.AssessmentKeyResponseNoError: 1,
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseWithCode: 1,
},
metrics.Assessments{
metrics.AssessmentKeyCoverage: 10,
metrics.AssessmentKeyFilesExecuted: 1,
metrics.AssessmentKeyResponseNoError: 1,
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseWithCode: 1,
},
metrics.Assessments{
metrics.AssessmentKeyCoverage: 10,
metrics.AssessmentKeyFilesExecuted: 1,
metrics.AssessmentKeyResponseNoError: 1,
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseWithCode: 1,
},
metrics.Assessments{
metrics.AssessmentKeyCoverage: 10,
metrics.AssessmentKeyFilesExecuted: 1,
metrics.AssessmentKeyResponseNoError: 1,
metrics.AssessmentKeyResponseNoExcess: 1,
metrics.AssessmentKeyResponseWithCode: 1,
},
}, []uint64{14, 14, 14, 14, 14, 14})
// Assert non-deterministic behavior.
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(762))
assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(762))
assert.Greater(t, actualAssessments[1][metrics.AssessmentKeyProcessingTime], uint64(0))
assert.Equal(t, actualAssessments[1][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(762))
assert.Equal(t, actualAssessments[1][metrics.AssessmentKeyResponseCharacterCount], uint64(762))
for _, assessment := range actualAssessments {
assert.Greater(t, assessment[metrics.AssessmentKeyProcessingTime], uint64(0))
assert.Equal(t, assessment[metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254))
assert.Equal(t, assessment[metrics.AssessmentKeyResponseCharacterCount], uint64(254))
}
},
filepath.Join("result-directory", "evaluation.log"): func(t *testing.T, filePath, data string) {
assert.Contains(t, data, "Run 1/3")
Expand Down
13 changes: 13 additions & 0 deletions evaluate/evaluate.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,15 @@ func Evaluate(ctx *Context) (assessments *report.AssessmentStore, totalScore uin
// Ensure we report metrics for every model even if they are excluded.
assessments = report.NewAssessmentStore()
problemsPerModel := map[string][]error{}
// Write the evaluation CSV header so it's only written once.
file, err := os.OpenFile(filepath.Join(ctx.ResultPath, "evaluation.csv"), os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
ctx.Log.Panicf("ERROR: unable to create evaluation CSV file: %+v", err)
}
evaluationFile, err := report.NewEvaluationFile(file)
if err != nil {
ctx.Log.Panicf("ERROR: %+v", err)
}

{
// Create temporary repositories for each language so the repository is copied only once per language.
Expand Down Expand Up @@ -145,6 +154,8 @@ func Evaluate(ctx *Context) (assessments *report.AssessmentStore, totalScore uin
modelSucceededBasicChecksOfLanguage[model][language] = true
}
assessments.AddAssessmentPerTask(model, language, repositoryPath, assessment)
// Write the task assessment to the evaluation CSV file.
evaluationFile.WriteEvaluationRecord(model, language, temporaryRepository.Name(), assessment)
}
})
}
Expand Down Expand Up @@ -249,6 +260,8 @@ func Evaluate(ctx *Context) (assessments *report.AssessmentStore, totalScore uin
ctx.Log.Printf("ERROR: Model %q encountered a hard error for language %q, repository %q: %+v", modelID, languageID, repositoryPath, err)
}
assessments.AddAssessmentPerTask(model, language, repositoryPath, assessment)
// Write the task assessment to the evaluation CSV file.
evaluationFile.WriteEvaluationRecord(model, language, temporaryRepository.Name(), assessment)
}
})
}
Expand Down
33 changes: 24 additions & 9 deletions evaluate/evaluate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ func TestEvaluate(t *testing.T) {

{
languageGolang := &golang.Language{}
mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, "empty-response-model")
mockedModel := modeltesting.NewMockCapabilityWriteTestsNamedWithCost(t, "empty-response-model", "Empty Response Model", 0.0001)
repositoryPath := filepath.Join("golang", "plain")

validate(t, &testCase{
Expand Down Expand Up @@ -211,6 +211,7 @@ func TestEvaluate(t *testing.T) {
ExpectedTotalScore: 2,
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
filepath.Join(string(evaluatetask.IdentifierWriteTests), mockedModel.ID(), "golang", "golang", "plain.log"): nil,
filepath.Join("evaluation.csv"): nil,
},
})
}
Expand Down Expand Up @@ -266,6 +267,7 @@ func TestEvaluate(t *testing.T) {
filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): func(t *testing.T, filePath, data string) {
assert.Contains(t, data, ErrEmptyResponseFromModel.Error())
},
filepath.Join("evaluation.csv"): nil,
},
})
}
Expand Down Expand Up @@ -332,6 +334,7 @@ func TestEvaluate(t *testing.T) {
filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): func(t *testing.T, filePath, data string) {
assert.Contains(t, data, "Attempt 1/3: "+ErrEmptyResponseFromModel.Error())
},
filepath.Join("evaluation.csv"): nil,
},
})
}
Expand Down Expand Up @@ -397,6 +400,7 @@ func TestEvaluate(t *testing.T) {
filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): func(t *testing.T, filePath, data string) {
assert.Contains(t, data, "DONE 0 tests, 1 error")
},
filepath.Join("evaluation.csv"): nil,
},
})
}
Expand Down Expand Up @@ -427,7 +431,7 @@ func TestEvaluate(t *testing.T) {
{
languageGolang := &golang.Language{}
mockedModelID := "mocked-generation-model"
mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID)
mockedModel := modeltesting.NewMockCapabilityWriteTestsNamedWithCost(t, mockedModelID, "Mocked Generation Model", 0.0001)

validate(t, &testCase{
Name: "Problems of previous runs shouldn't cancel successive runs",
Expand Down Expand Up @@ -517,13 +521,14 @@ func TestEvaluate(t *testing.T) {
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil,
filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "next.log"): nil,
filepath.Join("evaluation.csv"): nil,
},
})
}
{
languageGolang := &golang.Language{}
mockedModelID := "mocked-generation-model"
mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID)
mockedModel := modeltesting.NewMockCapabilityWriteTestsNamedWithCost(t, mockedModelID, "Mocked Generation Model", 0.0001)

validate(t, &testCase{
Name: "Solving basic checks once is enough",
Expand Down Expand Up @@ -612,13 +617,14 @@ func TestEvaluate(t *testing.T) {
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil,
filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "next.log"): nil,
filepath.Join("evaluation.csv"): nil,
},
})
}
{
languageGolang := &golang.Language{}
mockedModelID := "mocked-generation-model"
mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID)
mockedModel := modeltesting.NewMockCapabilityWriteTestsNamedWithCost(t, mockedModelID, "Mocked Generation Model", 0.0001)

validate(t, &testCase{
Name: "Never solving basic checks leads to exclusion",
Expand Down Expand Up @@ -672,6 +678,7 @@ func TestEvaluate(t *testing.T) {
ExpectedTotalScore: 0,
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil,
filepath.Join("evaluation.csv"): nil,
},
})
}
Expand All @@ -683,7 +690,8 @@ func TestEvaluate(t *testing.T) {
{
languageGolang := &golang.Language{}
mockedModelID := "mocked-generation-model"
mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID)
mockedModel := modeltesting.NewMockCapabilityWriteTestsNamedWithCost(t, mockedModelID, "Mocked Generation Model", 0.0001)

repositoryPath := filepath.Join("golang", "plain")
validate(t, &testCase{
Name: "Interleaved",
Expand Down Expand Up @@ -736,6 +744,7 @@ func TestEvaluate(t *testing.T) {
ExpectedTotalScore: 6,
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil,
filepath.Join("evaluation.csv"): nil,
},
ExpectedOutputValidate: func(t *testing.T, output string, resultPath string) {
assert.Contains(t, output, "Run 1/3")
Expand All @@ -750,7 +759,8 @@ func TestEvaluate(t *testing.T) {
{
languageGolang := &golang.Language{}
mockedModelID := "mocked-generation-model"
mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID)
mockedModel := modeltesting.NewMockCapabilityWriteTestsNamedWithCost(t, mockedModelID, "Mocked Generation Model", 0.0001)

repositoryPath := filepath.Join("golang", "plain")
validate(t, &testCase{
Name: "Sequential",
Expand Down Expand Up @@ -803,6 +813,7 @@ func TestEvaluate(t *testing.T) {
ExpectedTotalScore: 6,
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil,
filepath.Join("evaluation.csv"): nil,
},
ExpectedOutputValidate: func(t *testing.T, output string, resultPath string) {
assert.Contains(t, output, "Run 1/3 for model")
Expand All @@ -825,7 +836,7 @@ func TestEvaluate(t *testing.T) {
// Setup provider and model mocking.
languageGolang := &golang.Language{}
mockedModelID := "testing-provider/testing-model"
mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID)
mockedModel := modeltesting.NewMockCapabilityWriteTestsNamedWithCost(t, mockedModelID, "Testing Model", 0.0001)
mockedProviderID := "testing-provider"
mockedProvider := providertesting.NewMockProviderNamedWithModels(t, mockedProviderID, []model.Model{mockedModel})
mockedLoader := providertesting.NewMockLoader(t)
Expand Down Expand Up @@ -900,14 +911,15 @@ func TestEvaluate(t *testing.T) {
ExpectedTotalScore: 6,
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil,
filepath.Join("evaluation.csv"): nil,
},
})
}
{
// Setup provider and model mocking.
languageGolang := &golang.Language{}
mockedModelID := "testing-provider/testing-model"
mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID)
mockedModel := modeltesting.NewMockCapabilityWriteTestsNamedWithCost(t, mockedModelID, "Testing Model", 0.0001)
mockedProviderID := "testing-provider"
mockedProvider := providertesting.NewMockProviderNamedWithModels(t, mockedProviderID, []model.Model{mockedModel})
mockedLoader := providertesting.NewMockLoader(t)
Expand Down Expand Up @@ -980,6 +992,7 @@ func TestEvaluate(t *testing.T) {
ExpectedTotalScore: 6,
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil,
filepath.Join("evaluation.csv"): nil,
},
})
}
Expand All @@ -988,7 +1001,8 @@ func TestEvaluate(t *testing.T) {
// Setup provider and model mocking.
languageGolang := &golang.Language{}
mockedModelID := "testing-provider/testing-model"
mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID)
mockedModel := modeltesting.NewMockCapabilityWriteTestsNamedWithCost(t, mockedModelID, "Testing Model", 0.0001)

repositoryPath := filepath.Join("golang", "plain")

validate(t, &testCase{
Expand Down Expand Up @@ -1041,6 +1055,7 @@ func TestEvaluate(t *testing.T) {
ExpectedTotalScore: 2,
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil,
filepath.Join("evaluation.csv"): nil,
},
})
}
Expand Down
78 changes: 57 additions & 21 deletions evaluate/report/csv.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package report
import (
"cmp"
"encoding/csv"
"io"
"slices"
"strconv"
"strings"
Expand Down Expand Up @@ -44,27 +45,6 @@ func GenerateCSV(formatter CSVFormatter) (csvData string, err error) {
return out.String(), nil
}

// Header returns the CSV column names of the assessment store: the fixed identifying columns followed by "metrics.AllAssessmentKeysStrings".
func (a *AssessmentStore) Header() (header []string) {
	header = []string{
		"model-id",
		"model-name",
		"cost",
		"language",
		"repository",
		"task",
		"score",
	}
	header = append(header, metrics.AllAssessmentKeysStrings...)

	return header
}

// Rows returns one CSV row for every assessment of the store, in the order the store is walked.
func (a *AssessmentStore) Rows() (rows [][]string) {
	appendRow := func(m model.Model, l language.Language, repositoryPath string, taskIdentifier task.Identifier, assessment metrics.Assessments) (err error) {
		assessmentValues := assessment.StringCSV()
		score := strconv.FormatUint(uint64(assessment.Score()), 10)
		cost := strconv.FormatFloat(m.Cost(), 'f', -1, 64)

		// The identifying columns come first, followed by the assessment values.
		row := []string{m.ID(), m.Name(), cost, l.ID(), repositoryPath, string(taskIdentifier), score}
		rows = append(rows, append(row, assessmentValues...))

		return nil
	}

	// The callback itself always returns nil; the error result of the walk is discarded.
	_ = a.Walk(appendRow)

	return rows
}

// Header returns the header description as a CSV row.
func (a AssessmentPerModel) Header() (header []string) {
return append([]string{"model-id", "model-name", "cost", "score"}, metrics.AllAssessmentKeysStrings...)
Expand All @@ -88,3 +68,59 @@ func (a AssessmentPerModel) Rows() (rows [][]string) {

return rows
}

// EvaluationFile wraps the writer that receives the evaluation CSV.
// "NewEvaluationFile" writes the CSV header to the writer, and "WriteEvaluationRecord" appends the rows of a task's assessments to it.
type EvaluationFile struct {
// Writer is the destination the evaluation CSV is written to.
// It is embedded, so its "Write" method is also available directly on the evaluation file.
io.Writer
}

// NewEvaluationFile wraps the given writer in an evaluation file and immediately writes the evaluation CSV header to it.
// If the header cannot be encoded or written, an error is returned and no evaluation file is handed out.
func NewEvaluationFile(writer io.Writer) (evaluationFile *EvaluationFile, err error) {
	// Encode the header in memory first so the writer receives it with a single write.
	var buffer strings.Builder
	csvWriter := csv.NewWriter(&buffer)
	if err = csvWriter.Write(evaluationHeader()); err != nil {
		return nil, pkgerrors.WithStack(err)
	}
	csvWriter.Flush()

	if _, err = writer.Write([]byte(buffer.String())); err != nil {
		return nil, pkgerrors.WithStack(err)
	}

	return &EvaluationFile{Writer: writer}, nil
}

// WriteEvaluationRecord writes the assessments of a task into the evaluation CSV, one row per task identifier.
// The rows are sorted by task identifier so the output does not depend on the map iteration order.
// All rows are encoded in memory first and then handed to the underlying writer with a single write.
// An error is returned if a row cannot be encoded, in which case nothing is written, or if the underlying writer fails.
func (e *EvaluationFile) WriteEvaluationRecord(model model.Model, language language.Language, repositoryName string, assessmentsPerTask map[task.Identifier]metrics.Assessments) (err error) {
	var out strings.Builder
	csvWriter := csv.NewWriter(&out)

	taskIdentifiers := maps.Keys(assessmentsPerTask)
	slices.SortStableFunc(taskIdentifiers, func(a, b task.Identifier) int {
		return cmp.Compare(a, b)
	})

	for _, taskIdentifier := range taskIdentifiers {
		assessment := assessmentsPerTask[taskIdentifier]
		row := append([]string{model.ID(), model.Name(), strconv.FormatFloat(model.Cost(), 'f', -1, 64), language.ID(), repositoryName, string(taskIdentifier), strconv.FormatUint(uint64(assessment.Score()), 10)}, assessment.StringCSV()...)
		if err := csvWriter.Write(row); err != nil {
			return pkgerrors.WithStack(err)
		}
	}

	// "Flush" does not return an error itself, so any buffered write failure must be queried explicitly.
	csvWriter.Flush()
	if err := csvWriter.Error(); err != nil {
		return pkgerrors.WithStack(err)
	}

	if _, err := e.Writer.Write([]byte(out.String())); err != nil {
		return pkgerrors.WithStack(err)
	}

	return nil
}

// evaluationHeader returns the column names of the evaluation CSV: the fixed identifying columns followed by "metrics.AllAssessmentKeysStrings".
func evaluationHeader() (header []string) {
	header = []string{
		"model-id",
		"model-name",
		"cost",
		"language",
		"repository",
		"task",
		"score",
	}
	header = append(header, metrics.AllAssessmentKeysStrings...)

	return header
}
Loading

0 comments on commit e09fa1c

Please sign in to comment.