From 8c9af43bf00adeadf9cad0798eb0d0bfaf36bd69 Mon Sep 17 00:00:00 2001 From: Rui Azevedo Date: Thu, 4 Jul 2024 07:52:52 +0100 Subject: [PATCH 1/2] Dump the assessments to the evaluation CSV right after running a task, to avoid losing information if the evaluation errors Part of #237 --- cmd/eval-dev-quality/cmd/evaluate.go | 9 -- cmd/eval-dev-quality/cmd/evaluate_test.go | 61 +++++--- evaluate/evaluate.go | 14 ++ evaluate/evaluate_test.go | 33 +++-- evaluate/report/csv.go | 78 +++++++--- evaluate/report/csv_test.go | 164 +++++++++++----------- model/testing/helper.go | 8 ++ 7 files changed, 226 insertions(+), 141 deletions(-) diff --git a/cmd/eval-dev-quality/cmd/evaluate.go b/cmd/eval-dev-quality/cmd/evaluate.go index 07b01a41..e1d8d7ef 100644 --- a/cmd/eval-dev-quality/cmd/evaluate.go +++ b/cmd/eval-dev-quality/cmd/evaluate.go @@ -639,15 +639,6 @@ func (command *Evaluate) evaluateKubernetes(ctx *evaluate.Context) (err error) { // WriteCSVs writes the various CSV reports to disk. func writeCSVs(resultPath string, assessments *report.AssessmentStore) (err error) { - // Write the "evaluation.csv" containing all data. - csv, err := report.GenerateCSV(assessments) - if err != nil { - return pkgerrors.Wrap(err, "could not create evaluation.csv summary") - } - if err := os.WriteFile(filepath.Join(resultPath, "evaluation.csv"), []byte(csv), 0644); err != nil { - return pkgerrors.Wrap(err, "could not write evaluation.csv summary") - } - // Write the "models-summed.csv" containing the summary per model. 
byModel := assessments.CollapseByModel() csvByModel, err := report.GenerateCSV(byModel) diff --git a/cmd/eval-dev-quality/cmd/evaluate_test.go b/cmd/eval-dev-quality/cmd/evaluate_test.go index 51a7fb2b..83778a89 100644 --- a/cmd/eval-dev-quality/cmd/evaluate_test.go +++ b/cmd/eval-dev-quality/cmd/evaluate_test.go @@ -731,27 +731,54 @@ func TestEvaluateExecute(t *testing.T) { filepath.Join("result-directory", "evaluation.csv"): func(t *testing.T, filePath, data string) { actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{ metrics.Assessments{ - metrics.AssessmentKeyCoverage: 30, - metrics.AssessmentKeyFilesExecuted: 3, - metrics.AssessmentKeyResponseNoError: 3, - metrics.AssessmentKeyResponseNoExcess: 3, - metrics.AssessmentKeyResponseWithCode: 3, + metrics.AssessmentKeyCoverage: 10, + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyResponseNoExcess: 1, + metrics.AssessmentKeyResponseWithCode: 1, }, metrics.Assessments{ - metrics.AssessmentKeyCoverage: 30, - metrics.AssessmentKeyFilesExecuted: 3, - metrics.AssessmentKeyResponseNoError: 3, - metrics.AssessmentKeyResponseNoExcess: 3, - metrics.AssessmentKeyResponseWithCode: 3, + metrics.AssessmentKeyCoverage: 10, + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyResponseNoExcess: 1, + metrics.AssessmentKeyResponseWithCode: 1, }, - }, []uint64{42, 42}) + metrics.Assessments{ + metrics.AssessmentKeyCoverage: 10, + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyResponseNoExcess: 1, + metrics.AssessmentKeyResponseWithCode: 1, + }, + metrics.Assessments{ + metrics.AssessmentKeyCoverage: 10, + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyResponseNoExcess: 1, + metrics.AssessmentKeyResponseWithCode: 1, + }, + metrics.Assessments{ + 
metrics.AssessmentKeyCoverage: 10, + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyResponseNoExcess: 1, + metrics.AssessmentKeyResponseWithCode: 1, + }, + metrics.Assessments{ + metrics.AssessmentKeyCoverage: 10, + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyResponseNoExcess: 1, + metrics.AssessmentKeyResponseWithCode: 1, + }, + }, []uint64{14, 14, 14, 14, 14, 14}) // Assert non-deterministic behavior. - assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(762)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(762)) - assert.Greater(t, actualAssessments[1][metrics.AssessmentKeyProcessingTime], uint64(0)) - assert.Equal(t, actualAssessments[1][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(762)) - assert.Equal(t, actualAssessments[1][metrics.AssessmentKeyResponseCharacterCount], uint64(762)) + for _, assessment := range actualAssessments { + assert.Greater(t, assessment[metrics.AssessmentKeyProcessingTime], uint64(0)) + assert.Equal(t, assessment[metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254)) + assert.Equal(t, assessment[metrics.AssessmentKeyResponseCharacterCount], uint64(254)) + } }, filepath.Join("result-directory", "evaluation.log"): func(t *testing.T, filePath, data string) { assert.Contains(t, data, "Run 1/3") diff --git a/evaluate/evaluate.go b/evaluate/evaluate.go index 1d8d69ba..412b32bb 100644 --- a/evaluate/evaluate.go +++ b/evaluate/evaluate.go @@ -74,6 +74,16 @@ func Evaluate(ctx *Context) (assessments *report.AssessmentStore, totalScore uin // Ensure we report metrics for every model even if they are excluded. 
assessments = report.NewAssessmentStore() problemsPerModel := map[string][]error{} + // Write the evaluation CSV header so it's only written once. + evaluationCSVFile, err := os.OpenFile(filepath.Join(ctx.ResultPath, "evaluation.csv"), os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + ctx.Log.Panicf("ERROR: unable to create evaluation CSV file: %+v", err) + } + defer evaluationCSVFile.Close() + evaluationFile, err := report.NewEvaluationFile(evaluationCSVFile) + if err != nil { + ctx.Log.Panicf("ERROR: %+v", err) + } { // Create temporary repositories for each language so the repository is copied only once per language. @@ -145,6 +155,8 @@ func Evaluate(ctx *Context) (assessments *report.AssessmentStore, totalScore uin modelSucceededBasicChecksOfLanguage[model][language] = true } assessments.AddAssessmentPerTask(model, language, repositoryPath, assessment) + // Write the task assessment to the evaluation CSV file. + evaluationFile.WriteEvaluationRecord(model, language, temporaryRepository.Name(), assessment) } }) } @@ -249,6 +261,8 @@ func Evaluate(ctx *Context) (assessments *report.AssessmentStore, totalScore uin ctx.Log.Printf("ERROR: Model %q encountered a hard error for language %q, repository %q: %+v", modelID, languageID, repositoryPath, err) } assessments.AddAssessmentPerTask(model, language, repositoryPath, assessment) + // Write the task assessment to the evaluation CSV file. 
+ evaluationFile.WriteEvaluationRecord(model, language, temporaryRepository.Name(), assessment) } }) } diff --git a/evaluate/evaluate_test.go b/evaluate/evaluate_test.go index 26f0bcd0..a2847dbb 100644 --- a/evaluate/evaluate_test.go +++ b/evaluate/evaluate_test.go @@ -171,7 +171,7 @@ func TestEvaluate(t *testing.T) { { languageGolang := &golang.Language{} - mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, "empty-response-model") + mockedModel := modeltesting.NewMockCapabilityWriteTestsNamedWithCost(t, "empty-response-model", "Empty Response Model", 0.0001) repositoryPath := filepath.Join("golang", "plain") validate(t, &testCase{ @@ -211,6 +211,7 @@ func TestEvaluate(t *testing.T) { ExpectedTotalScore: 2, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(evaluatetask.IdentifierWriteTests), mockedModel.ID(), "golang", "golang", "plain.log"): nil, + filepath.Join("evaluation.csv"): nil, }, }) } @@ -266,6 +267,7 @@ func TestEvaluate(t *testing.T) { filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): func(t *testing.T, filePath, data string) { assert.Contains(t, data, ErrEmptyResponseFromModel.Error()) }, + filepath.Join("evaluation.csv"): nil, }, }) } @@ -332,6 +334,7 @@ func TestEvaluate(t *testing.T) { filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): func(t *testing.T, filePath, data string) { assert.Contains(t, data, "Attempt 1/3: "+ErrEmptyResponseFromModel.Error()) }, + filepath.Join("evaluation.csv"): nil, }, }) } @@ -397,6 +400,7 @@ func TestEvaluate(t *testing.T) { filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): func(t *testing.T, filePath, data string) { assert.Contains(t, data, "DONE 0 tests, 1 error") }, + 
filepath.Join("evaluation.csv"): nil, }, }) } @@ -427,7 +431,7 @@ func TestEvaluate(t *testing.T) { { languageGolang := &golang.Language{} mockedModelID := "mocked-generation-model" - mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) + mockedModel := modeltesting.NewMockCapabilityWriteTestsNamedWithCost(t, mockedModelID, "Mocked Generation Model", 0.0001) validate(t, &testCase{ Name: "Problems of previous runs shouldn't cancel successive runs", @@ -517,13 +521,14 @@ func TestEvaluate(t *testing.T) { ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil, filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "next.log"): nil, + filepath.Join("evaluation.csv"): nil, }, }) } { languageGolang := &golang.Language{} mockedModelID := "mocked-generation-model" - mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) + mockedModel := modeltesting.NewMockCapabilityWriteTestsNamedWithCost(t, mockedModelID, "Mocked Generation Model", 0.0001) validate(t, &testCase{ Name: "Solving basic checks once is enough", @@ -612,13 +617,14 @@ func TestEvaluate(t *testing.T) { ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil, filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "next.log"): nil, + filepath.Join("evaluation.csv"): nil, }, }) } { languageGolang := &golang.Language{} mockedModelID := "mocked-generation-model" - mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) + mockedModel := 
modeltesting.NewMockCapabilityWriteTestsNamedWithCost(t, mockedModelID, "Mocked Generation Model", 0.0001) validate(t, &testCase{ Name: "Never solving basic checks leads to exclusion", @@ -672,6 +678,7 @@ func TestEvaluate(t *testing.T) { ExpectedTotalScore: 0, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil, + filepath.Join("evaluation.csv"): nil, }, }) } @@ -683,7 +690,8 @@ func TestEvaluate(t *testing.T) { { languageGolang := &golang.Language{} mockedModelID := "mocked-generation-model" - mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) + mockedModel := modeltesting.NewMockCapabilityWriteTestsNamedWithCost(t, mockedModelID, "Mocked Generation Model", 0.0001) + repositoryPath := filepath.Join("golang", "plain") validate(t, &testCase{ Name: "Interleaved", @@ -736,6 +744,7 @@ func TestEvaluate(t *testing.T) { ExpectedTotalScore: 6, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil, + filepath.Join("evaluation.csv"): nil, }, ExpectedOutputValidate: func(t *testing.T, output string, resultPath string) { assert.Contains(t, output, "Run 1/3") @@ -750,7 +759,8 @@ func TestEvaluate(t *testing.T) { { languageGolang := &golang.Language{} mockedModelID := "mocked-generation-model" - mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) + mockedModel := modeltesting.NewMockCapabilityWriteTestsNamedWithCost(t, mockedModelID, "Mocked Generation Model", 0.0001) + repositoryPath := filepath.Join("golang", "plain") validate(t, &testCase{ Name: "Sequential", @@ -803,6 +813,7 @@ func TestEvaluate(t *testing.T) { ExpectedTotalScore: 6, ExpectedResultFiles: 
map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil, + filepath.Join("evaluation.csv"): nil, }, ExpectedOutputValidate: func(t *testing.T, output string, resultPath string) { assert.Contains(t, output, "Run 1/3 for model") @@ -825,7 +836,7 @@ func TestEvaluate(t *testing.T) { // Setup provider and model mocking. languageGolang := &golang.Language{} mockedModelID := "testing-provider/testing-model" - mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) + mockedModel := modeltesting.NewMockCapabilityWriteTestsNamedWithCost(t, mockedModelID, "Testing Model", 0.0001) mockedProviderID := "testing-provider" mockedProvider := providertesting.NewMockProviderNamedWithModels(t, mockedProviderID, []model.Model{mockedModel}) mockedLoader := providertesting.NewMockLoader(t) @@ -900,6 +911,7 @@ func TestEvaluate(t *testing.T) { ExpectedTotalScore: 6, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil, + filepath.Join("evaluation.csv"): nil, }, }) } @@ -907,7 +919,7 @@ func TestEvaluate(t *testing.T) { // Setup provider and model mocking. 
languageGolang := &golang.Language{} mockedModelID := "testing-provider/testing-model" - mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) + mockedModel := modeltesting.NewMockCapabilityWriteTestsNamedWithCost(t, mockedModelID, "Testing Model", 0.0001) mockedProviderID := "testing-provider" mockedProvider := providertesting.NewMockProviderNamedWithModels(t, mockedProviderID, []model.Model{mockedModel}) mockedLoader := providertesting.NewMockLoader(t) @@ -980,6 +992,7 @@ func TestEvaluate(t *testing.T) { ExpectedTotalScore: 6, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil, + filepath.Join("evaluation.csv"): nil, }, }) } @@ -988,7 +1001,8 @@ func TestEvaluate(t *testing.T) { // Setup provider and model mocking. languageGolang := &golang.Language{} mockedModelID := "testing-provider/testing-model" - mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) + mockedModel := modeltesting.NewMockCapabilityWriteTestsNamedWithCost(t, mockedModelID, "Testing Model", 0.0001) + repositoryPath := filepath.Join("golang", "plain") validate(t, &testCase{ @@ -1041,6 +1055,7 @@ func TestEvaluate(t *testing.T) { ExpectedTotalScore: 2, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil, + filepath.Join("evaluation.csv"): nil, }, }) } diff --git a/evaluate/report/csv.go b/evaluate/report/csv.go index 0088a712..81ece6b7 100644 --- a/evaluate/report/csv.go +++ b/evaluate/report/csv.go @@ -3,6 +3,7 @@ package report import ( "cmp" "encoding/csv" + "io" "slices" "strconv" "strings" @@ -44,27 +45,6 @@ func GenerateCSV(formatter CSVFormatter) (csvData string, err error) { return 
out.String(), nil } -// Header returns the header description as a CSV row. -func (a *AssessmentStore) Header() (header []string) { - return append([]string{"model-id", "model-name", "cost", "language", "repository", "task", "score"}, metrics.AllAssessmentKeysStrings...) -} - -// Rows returns all data as CSV rows. -func (a *AssessmentStore) Rows() (rows [][]string) { - _ = a.Walk(func(m model.Model, l language.Language, r string, t task.Identifier, a metrics.Assessments) (err error) { - metrics := a.StringCSV() - score := a.Score() - cost := m.Cost() - - row := append([]string{m.ID(), m.Name(), strconv.FormatFloat(cost, 'f', -1, 64), l.ID(), r, string(t), strconv.FormatUint(uint64(score), 10)}, metrics...) - rows = append(rows, row) - - return nil - }) - - return rows -} - // Header returns the header description as a CSV row. func (a AssessmentPerModel) Header() (header []string) { return append([]string{"model-id", "model-name", "cost", "score"}, metrics.AllAssessmentKeysStrings...) @@ -88,3 +68,59 @@ func (a AssessmentPerModel) Rows() (rows [][]string) { return rows } + +// EvaluationFile holds the evaluation CSV file writer. +type EvaluationFile struct { + // Holds the writer where the evaluation CSV is written to. + io.Writer +} + +// NewEvaluationFile initializes an evaluation file and writes the corresponding CSV header. +func NewEvaluationFile(writer io.Writer) (evaluationFile *EvaluationFile, err error) { + evaluationFile = &EvaluationFile{ + Writer: writer, + } + + var out strings.Builder + csv := csv.NewWriter(&out) + + if err := csv.Write(evaluationHeader()); err != nil { + return nil, pkgerrors.WithStack(err) + } + csv.Flush() + + if _, err = evaluationFile.Writer.Write([]byte(out.String())); err != nil { + return nil, pkgerrors.WithStack(err) + } + + return evaluationFile, nil +} + +// WriteEvaluationRecord writes the assessments of a task into the evaluation CSV. 
+func (e *EvaluationFile) WriteEvaluationRecord(model model.Model, language language.Language, repositoryName string, assessmentsPerTask map[task.Identifier]metrics.Assessments) (err error) { + var out strings.Builder + csv := csv.NewWriter(&out) + + tasks := maps.Keys(assessmentsPerTask) + slices.SortStableFunc(tasks, func(a, b task.Identifier) int { + return cmp.Compare(a, b) + }) + + for _, task := range tasks { + assessment := assessmentsPerTask[task] + row := append([]string{model.ID(), model.Name(), strconv.FormatFloat(model.Cost(), 'f', -1, 64), language.ID(), repositoryName, string(task), strconv.FormatUint(uint64(assessment.Score()), 10)}, assessment.StringCSV()...) + csv.Write(row) + } + csv.Flush() + + if _, err := e.Writer.Write([]byte(out.String())); err != nil { + return pkgerrors.WithStack(err) + } + + return nil +} + +// evaluationHeader returns the CSV header for the evaluation CSV. +func evaluationHeader() (header []string) { + return append([]string{"model-id", "model-name", "cost", "language", "repository", "task", "score"}, metrics.AllAssessmentKeysStrings...) 
+} diff --git a/evaluate/report/csv_test.go b/evaluate/report/csv_test.go index f3c968ce..4195387d 100644 --- a/evaluate/report/csv_test.go +++ b/evaluate/report/csv_test.go @@ -1,102 +1,20 @@ package report import ( + "strings" "testing" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "github.com/zimmski/osutil/bytesutil" "github.com/symflower/eval-dev-quality/evaluate/metrics" - metricstesting "github.com/symflower/eval-dev-quality/evaluate/metrics/testing" evaluatetask "github.com/symflower/eval-dev-quality/evaluate/task" languagetesting "github.com/symflower/eval-dev-quality/language/testing" modeltesting "github.com/symflower/eval-dev-quality/model/testing" + "github.com/symflower/eval-dev-quality/task" ) -func TestGenerateCSVForAssessmentPerModelPerLanguagePerRepository(t *testing.T) { - type testCase struct { - Name string - - Assessments metricstesting.AssessmentTuples - - ExpectedString string - } - - validate := func(t *testing.T, tc *testCase) { - t.Run(tc.Name, func(t *testing.T) { - assessmentStore := assessmentTuplesToStore(tc.Assessments) - - actualString, err := GenerateCSV(assessmentStore) - assert.NoError(t, err) - - assert.Equal(t, bytesutil.StringTrimIndentations(tc.ExpectedString), actualString) - }) - } - - validate(t, &testCase{ - Name: "Single Empty Model", - - Assessments: metricstesting.AssessmentTuples{ - &metricstesting.AssessmentTuple{ - Model: modeltesting.NewMockModelNamedWithCosts(t, "some-model", "Some Model", 0), - Language: languagetesting.NewMockLanguageNamed(t, "some-language"), - RepositoryPath: "some-repository", - Task: evaluatetask.IdentifierWriteTests, - Assessment: metrics.NewAssessments(), - }, - }, - - ExpectedString: ` - model-id,model-name,cost,language,repository,task,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code - some-model,Some 
Model,0,some-language,some-repository,write-tests,0,0,0,0,0,0,0,0,0 - `, - }) - validate(t, &testCase{ - Name: "Multiple Models", - - Assessments: metricstesting.AssessmentTuples{ - &metricstesting.AssessmentTuple{ - Model: modeltesting.NewMockModelNamedWithCosts(t, "some-model-a", "Some Model A", 0.0001), - Language: languagetesting.NewMockLanguageNamed(t, "some-language"), - RepositoryPath: "some-repository", - Task: evaluatetask.IdentifierWriteTests, - Assessment: metrics.Assessments{ - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 50, - metrics.AssessmentKeyResponseCharacterCount: 100, - metrics.AssessmentKeyCoverage: 1, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyResponseNoError: 3, - metrics.AssessmentKeyResponseNoExcess: 4, - metrics.AssessmentKeyResponseWithCode: 5, - metrics.AssessmentKeyProcessingTime: 200, - }, - }, - &metricstesting.AssessmentTuple{ - Model: modeltesting.NewMockModelNamedWithCosts(t, "some-model-b", "Some Model B", 0.0005), - Language: languagetesting.NewMockLanguageNamed(t, "some-language"), - RepositoryPath: "some-repository", - Task: evaluatetask.IdentifierWriteTests, - Assessment: metrics.Assessments{ - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 100, - metrics.AssessmentKeyResponseCharacterCount: 200, - metrics.AssessmentKeyCoverage: 1, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyResponseNoError: 3, - metrics.AssessmentKeyResponseNoExcess: 4, - metrics.AssessmentKeyResponseWithCode: 5, - metrics.AssessmentKeyProcessingTime: 300, - }, - }, - }, - - ExpectedString: ` - model-id,model-name,cost,language,repository,task,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code - some-model-a,Some Model A,0.0001,some-language,some-repository,write-tests,15,1,2,50,200,100,3,4,5 - some-model-b,Some Model 
B,0.0005,some-language,some-repository,write-tests,15,1,2,100,300,200,3,4,5 - `, - }) -} - func TestGenerateCSVForAssessmentPerModel(t *testing.T) { type testCase struct { Name string @@ -160,3 +78,79 @@ func TestGenerateCSVForAssessmentPerModel(t *testing.T) { `, }) } + +func TestNewEvaluationFile(t *testing.T) { + var file strings.Builder + _, err := NewEvaluationFile(&file) + require.NoError(t, err) + + actualEvaluationFileContent := file.String() + require.NoError(t, err) + + expectedEvaluationFileContent := bytesutil.StringTrimIndentations(` + model-id,model-name,cost,language,repository,task,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code + `) + + assert.Equal(t, expectedEvaluationFileContent, string(actualEvaluationFileContent)) +} + +func TestWriteEvaluationRecord(t *testing.T) { + type testCase struct { + Name string + + Assessments map[task.Identifier]metrics.Assessments + + ExpectedCSV string + } + + validate := func(t *testing.T, tc *testCase) { + t.Run(tc.Name, func(t *testing.T) { + var file strings.Builder + evaluationFile, err := NewEvaluationFile(&file) + require.NoError(t, err) + + modelMock := modeltesting.NewMockModelNamedWithCosts(t, "mocked-model", "Mocked Model", 0.0001) + languageMock := languagetesting.NewMockLanguageNamed(t, "golang") + + err = evaluationFile.WriteEvaluationRecord(modelMock, languageMock, "golang/plain", tc.Assessments) + require.NoError(t, err) + + assert.Equal(t, bytesutil.StringTrimIndentations(tc.ExpectedCSV), file.String()) + }) + } + + validate(t, &testCase{ + Name: "Single task with empty assessments", + + Assessments: map[task.Identifier]metrics.Assessments{ + evaluatetask.IdentifierWriteTests: metrics.NewAssessments(), + }, + + ExpectedCSV: ` + 
model-id,model-name,cost,language,repository,task,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code + mocked-model,Mocked Model,0.0001,golang,golang/plain,write-tests,0,0,0,0,0,0,0,0,0 + `, + }) + validate(t, &testCase{ + Name: "Multiple tasks with assessments", + + Assessments: map[task.Identifier]metrics.Assessments{ + evaluatetask.IdentifierWriteTests: metrics.Assessments{ + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyCoverage: 0, + }, + evaluatetask.IdentifierWriteTestsSymflowerFix: metrics.Assessments{ + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyCoverage: 10, + }, + }, + + ExpectedCSV: ` + model-id,model-name,cost,language,repository,task,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code + mocked-model,Mocked Model,0.0001,golang,golang/plain,write-tests,2,0,1,0,0,0,1,0,0 + mocked-model,Mocked Model,0.0001,golang,golang/plain,write-tests-symflower-fix,12,10,1,0,0,0,1,0,0 + `, + }) +} diff --git a/model/testing/helper.go b/model/testing/helper.go index f66a808c..1a7a5da6 100644 --- a/model/testing/helper.go +++ b/model/testing/helper.go @@ -69,6 +69,14 @@ func NewMockCapabilityWriteTestsNamed(t *testing.T, id string) *MockModelCapabil } } +// NewMockCapabilityWriteTestsNamedWithCost returns a new named mocked model with costs. 
+func NewMockCapabilityWriteTestsNamedWithCost(t *testing.T, id string, name string, cost float64) *MockModelCapabilityWriteTests { + return &MockModelCapabilityWriteTests{ + MockModel: NewMockModelNamedWithCosts(t, id, name, cost), + MockCapabilityWriteTests: NewMockCapabilityWriteTests(t), + } +} + // MockModelCapabilityRepairCode holds a mock implementing the "Model" and the "CapabilityRepairCode" interface. type MockModelCapabilityRepairCode struct { *MockModel From 50c27245f384dfb3e8d83ec4cf15392949bdc1de Mon Sep 17 00:00:00 2001 From: Rui Azevedo Date: Mon, 8 Jul 2024 08:33:06 +0100 Subject: [PATCH 2/2] Generate the "models-summed.csv" and "language-summed.csv" files based on the "evaluation.csv" file Part of #237 --- cmd/eval-dev-quality/cmd/evaluate.go | 29 +- evaluate/report/collection.go | 23 - evaluate/report/collection_test.go | 123 ----- evaluate/report/csv.go | 275 ++++++++--- evaluate/report/csv_test.go | 661 +++++++++++++++++++++++++-- 5 files changed, 859 insertions(+), 252 deletions(-) diff --git a/cmd/eval-dev-quality/cmd/evaluate.go b/cmd/eval-dev-quality/cmd/evaluate.go index e1d8d7ef..b1ebe626 100644 --- a/cmd/eval-dev-quality/cmd/evaluate.go +++ b/cmd/eval-dev-quality/cmd/evaluate.go @@ -437,7 +437,7 @@ func (command *Evaluate) evaluateLocal(evaluationContext *evaluate.Context) (err return nil }) - if err := writeCSVs(command.ResultPath, assessments); err != nil { + if err := report.WriteCSVs(command.ResultPath); err != nil { command.logger.Panicf("ERROR: %s", err) } @@ -636,30 +636,3 @@ func (command *Evaluate) evaluateKubernetes(ctx *evaluate.Context) (err error) { return nil } - -// WriteCSVs writes the various CSV reports to disk. -func writeCSVs(resultPath string, assessments *report.AssessmentStore) (err error) { - // Write the "models-summed.csv" containing the summary per model. 
- byModel := assessments.CollapseByModel() - csvByModel, err := report.GenerateCSV(byModel) - if err != nil { - return pkgerrors.Wrap(err, "could not create models-summed.csv summary") - } - if err := os.WriteFile(filepath.Join(resultPath, "models-summed.csv"), []byte(csvByModel), 0644); err != nil { - return pkgerrors.Wrap(err, "could not write models-summed.csv summary") - } - - // Write the individual "language-summed.csv" containing the summary per model per language. - byLanguage := assessments.CollapseByLanguage() - for language, modelsByLanguage := range byLanguage { - csvByLanguage, err := report.GenerateCSV(modelsByLanguage) - if err != nil { - return pkgerrors.Wrap(err, "could not create "+language.ID()+"-summed.csv summary") - } - if err := os.WriteFile(filepath.Join(resultPath, language.ID()+"-summed.csv"), []byte(csvByLanguage), 0644); err != nil { - return pkgerrors.Wrap(err, "could not write "+language.ID()+"-summed.csv summary") - } - } - - return nil -} diff --git a/evaluate/report/collection.go b/evaluate/report/collection.go index 33760a20..dc739133 100644 --- a/evaluate/report/collection.go +++ b/evaluate/report/collection.go @@ -13,9 +13,6 @@ import ( "github.com/symflower/eval-dev-quality/task" ) -// AssessmentPerLanguagePerModel holds a collection of assessments per language and model. -type AssessmentPerLanguagePerModel map[language.Language]AssessmentPerModel - // AssessmentPerModel holds a collection of assessments per model. type AssessmentPerModel map[model.Model]metrics.Assessments @@ -133,23 +130,3 @@ func (a *AssessmentStore) CollapseByModel() AssessmentPerModel { return perModel } - -// CollapseByLanguage returns all assessments aggregated per language and model. 
-func (a *AssessmentStore) CollapseByLanguage() AssessmentPerLanguagePerModel { - assessments := AssessmentPerLanguagePerModel{} - _ = a.Walk(func(m model.Model, l language.Language, r string, t task.Identifier, a metrics.Assessments) (err error) { - if _, ok := assessments[l]; !ok { - assessments[l] = map[model.Model]metrics.Assessments{} - } - - if _, ok := assessments[l][m]; !ok { - assessments[l][m] = metrics.NewAssessments() - } - - assessments[l][m].Add(a) - - return nil - }) - - return assessments -} diff --git a/evaluate/report/collection_test.go b/evaluate/report/collection_test.go index d779c0d8..fb324ddd 100644 --- a/evaluate/report/collection_test.go +++ b/evaluate/report/collection_test.go @@ -388,129 +388,6 @@ func TestAssessmentCollapseByModel(t *testing.T) { }) } -func TestAssessmentCollapseByLanguage(t *testing.T) { - type testCase struct { - Name string - - Assessments metricstesting.AssessmentTuples - - ExpectedAssessmentPerLanguagePerModel AssessmentPerLanguagePerModel - } - - validate := func(t *testing.T, tc *testCase) { - t.Run(tc.Name, func(t *testing.T) { - assessmentStore := assessmentTuplesToStore(tc.Assessments) - - actualAssessmentPerLanguagePerModel := assessmentStore.CollapseByLanguage() - - assert.Equal(t, tc.ExpectedAssessmentPerLanguagePerModel, actualAssessmentPerLanguagePerModel) - }) - } - - modelA := modeltesting.NewMockCapabilityWriteTestsNamed(t, "some-model-a") - modelB := modeltesting.NewMockCapabilityWriteTestsNamed(t, "some-model-b") - languageA := languagetesting.NewMockLanguageNamed(t, "some-language-a") - languageB := languagetesting.NewMockLanguageNamed(t, "some-language-b") - - validate(t, &testCase{ - Name: "Collapse", - - Assessments: metricstesting.AssessmentTuples{ - &metricstesting.AssessmentTuple{ - Model: modelA, - Language: languageA, - RepositoryPath: "some-repository-a", - Task: evaluatetask.IdentifierWriteTests, - Assessment: metrics.Assessments{ - metrics.AssessmentKeyResponseNoExcess: 1, - }, - }, - 
&metricstesting.AssessmentTuple{ - Model: modelA, - Language: languageA, - RepositoryPath: "some-repository-b", - Task: evaluatetask.IdentifierWriteTests, - Assessment: metrics.Assessments{ - metrics.AssessmentKeyResponseNoExcess: 2, - }, - }, - &metricstesting.AssessmentTuple{ - Model: modelA, - Language: languageB, - RepositoryPath: "some-repository-a", - Task: evaluatetask.IdentifierWriteTests, - Assessment: metrics.Assessments{ - metrics.AssessmentKeyResponseNoExcess: 3, - }, - }, - &metricstesting.AssessmentTuple{ - Model: modelA, - Language: languageB, - RepositoryPath: "some-repository-b", - Task: evaluatetask.IdentifierWriteTests, - Assessment: metrics.Assessments{ - metrics.AssessmentKeyResponseNoExcess: 4, - }, - }, - &metricstesting.AssessmentTuple{ - Model: modelB, - Language: languageA, - RepositoryPath: "some-repository-a", - Task: evaluatetask.IdentifierWriteTests, - Assessment: metrics.Assessments{ - metrics.AssessmentKeyResponseNoExcess: 5, - }, - }, - &metricstesting.AssessmentTuple{ - Model: modelB, - Language: languageA, - RepositoryPath: "some-repository-b", - Task: evaluatetask.IdentifierWriteTests, - Assessment: metrics.Assessments{ - metrics.AssessmentKeyResponseNoExcess: 6, - }, - }, - &metricstesting.AssessmentTuple{ - Model: modelB, - Language: languageB, - RepositoryPath: "some-repository-a", - Task: evaluatetask.IdentifierWriteTests, - Assessment: metrics.Assessments{ - metrics.AssessmentKeyResponseNoExcess: 7, - }, - }, - &metricstesting.AssessmentTuple{ - Model: modelB, - Language: languageB, - RepositoryPath: "some-repository-b", - Task: evaluatetask.IdentifierWriteTests, - Assessment: metrics.Assessments{ - metrics.AssessmentKeyResponseNoExcess: 8, - }, - }, - }, - - ExpectedAssessmentPerLanguagePerModel: AssessmentPerLanguagePerModel{ - languageA: map[model.Model]metrics.Assessments{ - modelA: { - metrics.AssessmentKeyResponseNoExcess: 3, - }, - modelB: { - metrics.AssessmentKeyResponseNoExcess: 11, - }, - }, - languageB: 
map[model.Model]metrics.Assessments{ - modelA: { - metrics.AssessmentKeyResponseNoExcess: 7, - }, - modelB: { - metrics.AssessmentKeyResponseNoExcess: 15, - }, - }, - }, - }) -} - func assessmentTuplesToStore(at metricstesting.AssessmentTuples) (store *AssessmentStore) { store = NewAssessmentStore() for _, a := range at { diff --git a/evaluate/report/csv.go b/evaluate/report/csv.go index 81ece6b7..17e44615 100644 --- a/evaluate/report/csv.go +++ b/evaluate/report/csv.go @@ -4,6 +4,8 @@ import ( "cmp" "encoding/csv" "io" + "os" + "path/filepath" "slices" "strconv" "strings" @@ -25,102 +27,273 @@ type CSVFormatter interface { Rows() (rows [][]string) } -// GenerateCSV returns the whole CSV as string. -func GenerateCSV(formatter CSVFormatter) (csvData string, err error) { - var out strings.Builder - csv := csv.NewWriter(&out) +// EvaluationFile holds the evaluation CSV file writer. +type EvaluationFile struct { + // Holds the writer where the evaluation CSV is written to. + io.Writer +} - if err := csv.Write(formatter.Header()); err != nil { - return "", pkgerrors.WithStack(err) +// NewEvaluationFile initializes an evaluation file and writes the corresponding CSV header. +func NewEvaluationFile(writer io.Writer) (evaluationFile *EvaluationFile, err error) { + evaluationFile = &EvaluationFile{ + Writer: writer, } - for _, row := range formatter.Rows() { - if err := csv.Write(row); err != nil { - return "", pkgerrors.WithStack(err) - } + csv := csv.NewWriter(writer) + + if err := csv.Write(evaluationHeader()); err != nil { + return nil, pkgerrors.WithStack(err) } + csv.Flush() + + return evaluationFile, nil +} + +// WriteEvaluationRecord writes the assessments of a task into the evaluation CSV. 
+func (e *EvaluationFile) WriteEvaluationRecord(model model.Model, language language.Language, repositoryName string, assessmentsPerTask map[task.Identifier]metrics.Assessments) (err error) { + csv := csv.NewWriter(e.Writer) + + tasks := maps.Keys(assessmentsPerTask) + slices.SortStableFunc(tasks, func(a, b task.Identifier) int { + return cmp.Compare(a, b) + }) + for _, task := range tasks { + assessment := assessmentsPerTask[task] + row := append([]string{model.ID(), model.Name(), strconv.FormatFloat(model.Cost(), 'f', -1, 64), language.ID(), repositoryName, string(task), strconv.FormatUint(uint64(assessment.Score()), 10)}, assessment.StringCSV()...) + csv.Write(row) + } csv.Flush() - return out.String(), nil + return nil +} + +// evaluationHeader returns the CSV header for the evaluation CSV. +func evaluationHeader() (header []string) { + return append([]string{"model-id", "model-name", "cost", "language", "repository", "task", "score"}, metrics.AllAssessmentKeysStrings...) +} + +// EvaluationRecord holds a line of the evaluation CSV. +type EvaluationRecord struct { + // ModelID holds the model id. + ModelID string + // ModelName holds the model name. + ModelName string + // ModelCost holds the model cost. + ModelCost float64 + + // LanguageID holds the language id. + LanguageID string + + // Assessments holds the assessments of an entry. + Assessments metrics.Assessments +} + +// Clone clones an evaluation record. +func (e *EvaluationRecord) Clone() (new *EvaluationRecord) { + new = &EvaluationRecord{} + + new.ModelID = e.ModelID + new.ModelName = e.ModelName + new.ModelCost = e.ModelCost + new.LanguageID = e.LanguageID + new.Assessments = metrics.Merge(e.Assessments, nil) + + return new +} + +// EvaluationRecords holds all the evaluation records. +type EvaluationRecords []*EvaluationRecord + +// EvaluationRecordsPerModel holds the collection of evaluation records per model. 
+type EvaluationRecordsPerModel map[string]*EvaluationRecord + +// GroupByModel groups the evaluation records by model. +func (e EvaluationRecords) GroupByModel() EvaluationRecordsPerModel { + perModel := map[string]*EvaluationRecord{} + + for _, record := range e { + _, ok := perModel[record.ModelID] + if !ok { + perModel[record.ModelID] = record.Clone() + } else { + r := perModel[record.ModelID] + r.Assessments = metrics.Merge(r.Assessments, record.Assessments) + } + } + + return perModel } // Header returns the header description as a CSV row. -func (a AssessmentPerModel) Header() (header []string) { +func (EvaluationRecordsPerModel) Header() (header []string) { return append([]string{"model-id", "model-name", "cost", "score"}, metrics.AllAssessmentKeysStrings...) } // Rows returns all data as CSV rows. -func (a AssessmentPerModel) Rows() (rows [][]string) { - models := maps.Keys(a) - slices.SortStableFunc(models, func(a, b model.Model) int { - return cmp.Compare(a.ID(), b.ID()) +func (e EvaluationRecordsPerModel) Rows() (rows [][]string) { + models := maps.Keys(e) + slices.SortStableFunc(models, func(a, b string) int { + return cmp.Compare(a, b) }) for _, model := range models { - metrics := a[model].StringCSV() - score := a[model].Score() - cost := model.Cost() + record := e[model] + metrics := record.Assessments.StringCSV() + score := record.Assessments.Score() + modelCost := record.ModelCost - row := append([]string{model.ID(), model.Name(), strconv.FormatFloat(cost, 'f', -1, 64), strconv.FormatUint(uint64(score), 10)}, metrics...) + row := append([]string{record.ModelID, record.ModelName, strconv.FormatFloat(modelCost, 'f', -1, 64), strconv.FormatUint(uint64(score), 10)}, metrics...) rows = append(rows, row) } return rows } -// EvaluationFile holds the evaluation CSV file writer. -type EvaluationFile struct { - // Holds the writer where the evaluation CSV is written to. 
- io.Writer
+// EvaluationRecordsPerLanguagePerModel holds the collection of evaluation records per language and model.
+type EvaluationRecordsPerLanguagePerModel map[string]EvaluationRecordsPerModel
+
+// GroupByLanguageAndModel groups the evaluation records by language and model.
+func (e EvaluationRecords) GroupByLanguageAndModel() EvaluationRecordsPerLanguagePerModel {
+	perLanguageAndModel := map[string]EvaluationRecordsPerModel{}
+
+	for _, record := range e {
+		perModel, ok := perLanguageAndModel[record.LanguageID]
+		if !ok {
+			perLanguageAndModel[record.LanguageID] = EvaluationRecordsPerModel{
+				record.ModelID: record,
+			}
+		} else {
+			_, ok := perModel[record.ModelID]
+			if !ok {
+				perModel[record.ModelID] = record.Clone()
+			} else {
+				perModel[record.ModelID].Assessments = metrics.Merge(perModel[record.ModelID].Assessments, record.Assessments)
+			}
+		}
+	}
+
+	return perLanguageAndModel
 }

-// NewEvaluationFile initializes an evaluation file and writes the corresponding CSV header.
-func NewEvaluationFile(writer io.Writer) (evaluationFile *EvaluationFile, err error) {
-	evaluationFile = &EvaluationFile{
-		Writer: writer,
+// loadEvaluationRecords reads and returns the evaluation records from the evaluation CSV file.
+func loadEvaluationRecords(evaluationFilePath string) (evaluationRecords EvaluationRecords, err error) {
+	evaluationFile, err := os.Open(evaluationFilePath)
+	if err != nil {
+		return nil, pkgerrors.WithStack(err)
 	}
+	defer evaluationFile.Close()

-	var out strings.Builder
-	csv := csv.NewWriter(&out)
+	reader := csv.NewReader(evaluationFile)

-	if err := csv.Write(evaluationHeader()); err != nil {
+	// Check if the evaluation CSV header is correct.
+ if header, err := reader.Read(); err != nil { + return nil, pkgerrors.Wrap(err, "found error while reading evaluation file") + } else if strings.Join(header, ",") != strings.Join(evaluationHeader(), ",") { + return nil, pkgerrors.WithStack(pkgerrors.Errorf("expected header %+v\nfound header %+v", evaluationHeader(), header)) + } + + // Read the raw records from the evaluation CSV file. + records, err := reader.ReadAll() + if err != nil { return nil, pkgerrors.WithStack(err) } - csv.Flush() - if _, err = evaluationFile.Writer.Write([]byte(out.String())); err != nil { + // Convert the raw records into assessments that can be easily manipulated. + evaluationRecords = EvaluationRecords{} + for _, record := range records { + evaluationRecord, err := convertRawRecordToEvaluationRecord(record) + if err != nil { + return nil, err + } + evaluationRecords = append(evaluationRecords, evaluationRecord) + } + + return evaluationRecords, nil +} + +// convertRawRecordToEvaluationRecord converts a raw CSV record into an evaluation record. +func convertRawRecordToEvaluationRecord(raw []string) (record *EvaluationRecord, err error) { + assessments := metrics.NewAssessments() + + modelID := raw[0] + modelName := raw[1] + modelCost, err := strconv.ParseFloat(raw[2], 64) + if err != nil { return nil, pkgerrors.WithStack(err) } - return evaluationFile, nil + languageID := raw[3] + + rawMetrics := raw[7:] + for i, assessementKey := range metrics.AllAssessmentKeysStrings { + metric, err := strconv.ParseUint(rawMetrics[i], 10, 64) + if err != nil { + return nil, pkgerrors.WithStack(err) + } + + assessments[metrics.AssessmentKey(assessementKey)] = metric + } + + return &EvaluationRecord{ + ModelID: modelID, + ModelName: modelName, + ModelCost: modelCost, + + LanguageID: languageID, + + Assessments: assessments, + }, nil } -// WriteEvaluationRecord writes the assessments of a task into the evaluation CSV. 
-func (e *EvaluationFile) WriteEvaluationRecord(model model.Model, language language.Language, repositoryName string, assessmentsPerTask map[task.Identifier]metrics.Assessments) (err error) { +// generateCSV returns the whole CSV as string. +func generateCSV(formatter CSVFormatter) (csvData string, err error) { var out strings.Builder csv := csv.NewWriter(&out) - tasks := maps.Keys(assessmentsPerTask) - slices.SortStableFunc(tasks, func(a, b task.Identifier) int { - return cmp.Compare(a, b) - }) + if err := csv.Write(formatter.Header()); err != nil { + return "", pkgerrors.WithStack(err) + } - for _, task := range tasks { - assessment := assessmentsPerTask[task] - row := append([]string{model.ID(), model.Name(), strconv.FormatFloat(model.Cost(), 'f', -1, 64), language.ID(), repositoryName, string(task), strconv.FormatUint(uint64(assessment.Score()), 10)}, assessment.StringCSV()...) - csv.Write(row) + for _, row := range formatter.Rows() { + if err := csv.Write(row); err != nil { + return "", pkgerrors.WithStack(err) + } } + csv.Flush() - if _, err := e.Writer.Write([]byte(out.String())); err != nil { - return pkgerrors.WithStack(err) + return out.String(), nil +} + +// WriteCSVs writes the various CSV reports to disk. +func WriteCSVs(resultPath string) (err error) { + evaluationRecords, err := loadEvaluationRecords(filepath.Join(resultPath, "evaluation.csv")) + if err != nil { + return err } - return nil -} + // Write the "models-summed.csv" containing the summary per model. + perModel := evaluationRecords.GroupByModel() + csvByModel, err := generateCSV(perModel) + if err != nil { + return pkgerrors.Wrap(err, "could not create models-summed.csv summary") + } + if err := os.WriteFile(filepath.Join(resultPath, "models-summed.csv"), []byte(csvByModel), 0644); err != nil { + return pkgerrors.Wrap(err, "could not write models-summed.csv summary") + } -// evaluationHeader returns the CSV header for the evaluation CSV. 
-func evaluationHeader() (header []string) { - return append([]string{"model-id", "model-name", "cost", "language", "repository", "task", "score"}, metrics.AllAssessmentKeysStrings...) + // Write the individual "language-summed.csv" containing the summary per model per language. + perLanguage := evaluationRecords.GroupByLanguageAndModel() + for language, modelsByLanguage := range perLanguage { + csvByLanguage, err := generateCSV(modelsByLanguage) + if err != nil { + return pkgerrors.Wrap(err, "could not create "+language+"-summed.csv summary") + } + if err := os.WriteFile(filepath.Join(resultPath, language+"-summed.csv"), []byte(csvByLanguage), 0644); err != nil { + return pkgerrors.Wrap(err, "could not write "+language+"-summed.csv summary") + } + } + + return nil } diff --git a/evaluate/report/csv_test.go b/evaluate/report/csv_test.go index 4195387d..b8b6873f 100644 --- a/evaluate/report/csv_test.go +++ b/evaluate/report/csv_test.go @@ -1,12 +1,16 @@ package report import ( + "os" + "path/filepath" "strings" "testing" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/zimmski/osutil" "github.com/zimmski/osutil/bytesutil" + "golang.org/x/exp/maps" "github.com/symflower/eval-dev-quality/evaluate/metrics" evaluatetask "github.com/symflower/eval-dev-quality/evaluate/task" @@ -19,14 +23,14 @@ func TestGenerateCSVForAssessmentPerModel(t *testing.T) { type testCase struct { Name string - Assessments AssessmentPerModel + CSVFormatter CSVFormatter ExpectedString string } validate := func(t *testing.T, tc *testCase) { t.Run(tc.Name, func(t *testing.T) { - actualString, err := GenerateCSV(tc.Assessments) + actualString, err := generateCSV(tc.CSVFormatter) assert.NoError(t, err) assert.Equal(t, bytesutil.StringTrimIndentations(tc.ExpectedString), actualString) @@ -34,47 +38,65 @@ func TestGenerateCSVForAssessmentPerModel(t *testing.T) { } validate(t, &testCase{ - Name: "Single Empty Model", + Name: "Single empty model", - Assessments: 
AssessmentPerModel{ - modeltesting.NewMockModelNamedWithCosts(t, "some-model", "Some Model", 0): {}, + CSVFormatter: EvaluationRecordsPerModel{ + "some-model-a": &EvaluationRecord{ + ModelID: "some-model-a", + ModelName: "Some Model A", + ModelCost: 0.0001, + LanguageID: "golang", + Assessments: metrics.NewAssessments(), + }, }, ExpectedString: ` model-id,model-name,cost,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code - some-model,Some Model,0,0,0,0,0,0,0,0,0,0 + some-model-a,Some Model A,0.0001,0,0,0,0,0,0,0,0,0 `, }) validate(t, &testCase{ - Name: "Multiple Models", + Name: "Multiple models with assessments", - Assessments: AssessmentPerModel{ - modeltesting.NewMockModelNamedWithCosts(t, "some-model-a", "Some Model A", 0.0001): { - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 50, - metrics.AssessmentKeyResponseCharacterCount: 100, - metrics.AssessmentKeyCoverage: 1, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyResponseNoError: 3, - metrics.AssessmentKeyResponseNoExcess: 4, - metrics.AssessmentKeyResponseWithCode: 5, - metrics.AssessmentKeyProcessingTime: 200, + CSVFormatter: EvaluationRecordsPerModel{ + "some-model-a": &EvaluationRecord{ + ModelID: "some-model-a", + ModelName: "Some Model A", + ModelCost: 0.0001, + LanguageID: "golang", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 50, + metrics.AssessmentKeyResponseCharacterCount: 100, + metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyResponseNoError: 3, + metrics.AssessmentKeyResponseNoExcess: 4, + metrics.AssessmentKeyResponseWithCode: 5, + metrics.AssessmentKeyProcessingTime: 200, + }, }, - modeltesting.NewMockModelNamedWithCosts(t, "some-model-b", "Some Model B", 0.0005): { - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 100, - 
metrics.AssessmentKeyResponseCharacterCount: 200, - metrics.AssessmentKeyCoverage: 1, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyResponseNoError: 3, - metrics.AssessmentKeyResponseNoExcess: 4, - metrics.AssessmentKeyResponseWithCode: 5, - metrics.AssessmentKeyProcessingTime: 300, + "some-model-b": &EvaluationRecord{ + ModelID: "some-model-b", + ModelName: "Some Model B", + ModelCost: 0.0003, + LanguageID: "java", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 100, + metrics.AssessmentKeyResponseCharacterCount: 200, + metrics.AssessmentKeyCoverage: 6, + metrics.AssessmentKeyFilesExecuted: 7, + metrics.AssessmentKeyResponseNoError: 8, + metrics.AssessmentKeyResponseNoExcess: 9, + metrics.AssessmentKeyResponseWithCode: 10, + metrics.AssessmentKeyProcessingTime: 400, + }, }, }, ExpectedString: ` model-id,model-name,cost,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code some-model-a,Some Model A,0.0001,15,1,2,50,200,100,3,4,5 - some-model-b,Some Model B,0.0005,15,1,2,100,300,200,3,4,5 + some-model-b,Some Model B,0.0003,40,6,7,100,400,200,8,9,10 `, }) } @@ -154,3 +176,588 @@ func TestWriteEvaluationRecord(t *testing.T) { `, }) } + +func TestLoadEvaluationRecords(t *testing.T) { + type testCase struct { + Name string + + Before func(resultPath string) + + ExpectedEvaluationRecords EvaluationRecords + ExpectedErr func(err error) + } + + validate := func(t *testing.T, tc *testCase) { + t.Run(tc.Name, func(t *testing.T) { + resultPath := t.TempDir() + + if tc.Before != nil { + tc.Before(resultPath) + } + + actualAssessments, actualErr := loadEvaluationRecords(filepath.Join(resultPath, "evaluation.csv")) + + if tc.ExpectedErr != nil { + tc.ExpectedErr(actualErr) + } else { + assert.NoError(t, actualErr) + assert.Equal(t, tc.ExpectedEvaluationRecords, actualAssessments) + } + }) + } + + 
validate(t, &testCase{ + Name: "Evaluation file does not exist", + + ExpectedErr: func(err error) { + if osutil.IsWindows() { + assert.ErrorContains(t, err, "The system cannot find the file specified") + } else { + assert.ErrorContains(t, err, "no such file or directory") + } + }, + }) + validate(t, &testCase{ + Name: "Evaluation file exists but it is empty", + + Before: func(resultPath string) { + file, err := os.Create(filepath.Join(resultPath, "evaluation.csv")) + require.NoError(t, err) + defer file.Close() + }, + + ExpectedErr: func(err error) { + assert.ErrorContains(t, err, "found error while reading evaluation file") + }, + }) + validate(t, &testCase{ + Name: "Evaluation file exists but with the wrong header", + + Before: func(resultPath string) { + header := bytesutil.StringTrimIndentations(` + model-id,model-name,cost + `) + require.NoError(t, os.WriteFile(filepath.Join(resultPath, "evaluation.csv"), []byte(header), 0644)) + }, + + ExpectedErr: func(err error) { + assert.ErrorContains(t, err, "found header [model-id model-name cost]") + }, + }) + validate(t, &testCase{ + Name: "Single assessment", + + Before: func(resultPath string) { + fileContent := bytesutil.StringTrimIndentations(` + model-id,model-name,cost,language,repository,task,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code + openrouter/anthropic/claude-1.2,Claude 1.2,0.0001,golang,golang/light,write-tests,982,750,18,70179,720571,71195,115,49,50 + `) + require.NoError(t, os.WriteFile(filepath.Join(resultPath, "evaluation.csv"), []byte(fileContent), 0644)) + }, + + ExpectedEvaluationRecords: EvaluationRecords{ + &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0001, + LanguageID: "golang", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 750, + metrics.AssessmentKeyFilesExecuted: 18, + 
metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 70179, + metrics.AssessmentKeyProcessingTime: 720571, + metrics.AssessmentKeyResponseCharacterCount: 71195, + metrics.AssessmentKeyResponseNoError: 115, + metrics.AssessmentKeyResponseNoExcess: 49, + metrics.AssessmentKeyResponseWithCode: 50, + }, + }, + }, + }) + validate(t, &testCase{ + Name: "Multiple assessments", + + Before: func(resultPath string) { + fileContent := bytesutil.StringTrimIndentations(` + model-id,model-name,cost,language,repository,task,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code + openrouter/anthropic/claude-1.2,Claude 1.2,0.0001,golang,golang/light,write-tests,982,750,18,70179,720571,71195,115,49,50 + openrouter/anthropic/claude-1.2,Claude 1.2,0.0002,golang,golang/plain,write-tests,37,20,2,441,11042,523,5,5,5 + `) + require.NoError(t, os.WriteFile(filepath.Join(resultPath, "evaluation.csv"), []byte(fileContent), 0644)) + }, + + ExpectedEvaluationRecords: EvaluationRecords{ + &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0001, + LanguageID: "golang", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 750, + metrics.AssessmentKeyFilesExecuted: 18, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 70179, + metrics.AssessmentKeyProcessingTime: 720571, + metrics.AssessmentKeyResponseCharacterCount: 71195, + metrics.AssessmentKeyResponseNoError: 115, + metrics.AssessmentKeyResponseNoExcess: 49, + metrics.AssessmentKeyResponseWithCode: 50, + }, + }, + &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0002, + LanguageID: "golang", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 20, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 441, + 
metrics.AssessmentKeyProcessingTime: 11042, + metrics.AssessmentKeyResponseCharacterCount: 523, + metrics.AssessmentKeyResponseNoError: 5, + metrics.AssessmentKeyResponseNoExcess: 5, + metrics.AssessmentKeyResponseWithCode: 5, + }, + }, + }, + }) +} + +func TestEvaluationRecordsGroupByModel(t *testing.T) { + type testCase struct { + Name string + + EvaluationRecords EvaluationRecords + + ExpectedEvaluationRecords map[string]*EvaluationRecord + } + + validate := func(t *testing.T, tc *testCase) { + t.Run(tc.Name, func(t *testing.T) { + actualEvaluationRecords := tc.EvaluationRecords.GroupByModel() + + assert.ElementsMatch(t, maps.Keys(tc.ExpectedEvaluationRecords), maps.Keys(actualEvaluationRecords)) + + for modelID, expectedRecord := range tc.ExpectedEvaluationRecords { + actualRecord := actualEvaluationRecords[modelID] + assert.Equal(t, expectedRecord, actualRecord) + assert.Truef(t, expectedRecord.Assessments.Equal(actualRecord.Assessments), "model:%s\nexpected:%s\nactual:%s", modelID, tc.ExpectedEvaluationRecords, actualEvaluationRecords) + } + }) + } + + validate(t, &testCase{ + Name: "Single record", + + EvaluationRecords: EvaluationRecords{ + &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0001, + LanguageID: "golang", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, + metrics.AssessmentKeyProcessingTime: 4, + metrics.AssessmentKeyResponseCharacterCount: 5, + metrics.AssessmentKeyResponseNoError: 6, + metrics.AssessmentKeyResponseNoExcess: 7, + metrics.AssessmentKeyResponseWithCode: 8, + }, + }, + }, + ExpectedEvaluationRecords: map[string]*EvaluationRecord{ + "openrouter/anthropic/claude-1.2": &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0001, + LanguageID: "golang", + Assessments: metrics.Assessments{ + 
metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, + metrics.AssessmentKeyProcessingTime: 4, + metrics.AssessmentKeyResponseCharacterCount: 5, + metrics.AssessmentKeyResponseNoError: 6, + metrics.AssessmentKeyResponseNoExcess: 7, + metrics.AssessmentKeyResponseWithCode: 8, + }, + }, + }, + }) + validate(t, &testCase{ + Name: "Multiple records", + + EvaluationRecords: EvaluationRecords{ + &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0001, + LanguageID: "golang", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, + metrics.AssessmentKeyProcessingTime: 4, + metrics.AssessmentKeyResponseCharacterCount: 5, + metrics.AssessmentKeyResponseNoError: 6, + metrics.AssessmentKeyResponseNoExcess: 7, + metrics.AssessmentKeyResponseWithCode: 8, + }, + }, + &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0002, + LanguageID: "golang", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, + metrics.AssessmentKeyProcessingTime: 4, + metrics.AssessmentKeyResponseCharacterCount: 5, + metrics.AssessmentKeyResponseNoError: 6, + metrics.AssessmentKeyResponseNoExcess: 7, + metrics.AssessmentKeyResponseWithCode: 8, + }, + }, + &EvaluationRecord{ + ModelID: "ollama/codeqwen:latest", + ModelName: "Code Qwen", + ModelCost: 0.0003, + LanguageID: "java", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, + metrics.AssessmentKeyProcessingTime: 4, + metrics.AssessmentKeyResponseCharacterCount: 5, + 
metrics.AssessmentKeyResponseNoError: 6, + metrics.AssessmentKeyResponseNoExcess: 7, + metrics.AssessmentKeyResponseWithCode: 8, + }, + }, + }, + ExpectedEvaluationRecords: map[string]*EvaluationRecord{ + "openrouter/anthropic/claude-1.2": &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0001, + LanguageID: "golang", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 2, + metrics.AssessmentKeyFilesExecuted: 4, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 6, + metrics.AssessmentKeyProcessingTime: 8, + metrics.AssessmentKeyResponseCharacterCount: 10, + metrics.AssessmentKeyResponseNoError: 12, + metrics.AssessmentKeyResponseNoExcess: 14, + metrics.AssessmentKeyResponseWithCode: 16, + }, + }, + "ollama/codeqwen:latest": &EvaluationRecord{ + ModelID: "ollama/codeqwen:latest", + ModelName: "Code Qwen", + ModelCost: 0.0003, + LanguageID: "java", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, + metrics.AssessmentKeyProcessingTime: 4, + metrics.AssessmentKeyResponseCharacterCount: 5, + metrics.AssessmentKeyResponseNoError: 6, + metrics.AssessmentKeyResponseNoExcess: 7, + metrics.AssessmentKeyResponseWithCode: 8, + }, + }, + }, + }) +} + +func TestEvaluationRecordsGroupByLanguageAndModel(t *testing.T) { + type testCase struct { + Name string + + EvaluationRecords EvaluationRecords + + ExpectedEvaluationRecordsPerLanguagePerModel EvaluationRecordsPerLanguagePerModel + } + + validate := func(t *testing.T, tc *testCase) { + t.Run(tc.Name, func(t *testing.T) { + actualEvaluationRecordsPerLanguagePerModel := tc.EvaluationRecords.GroupByLanguageAndModel() + + assert.Equal(t, tc.ExpectedEvaluationRecordsPerLanguagePerModel, actualEvaluationRecordsPerLanguagePerModel) + }) + } + + validate(t, &testCase{ + Name: "Single record without assessments", + + 
EvaluationRecords: EvaluationRecords{ + &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0001, + LanguageID: "golang", + Assessments: metrics.NewAssessments(), + }, + }, + + ExpectedEvaluationRecordsPerLanguagePerModel: EvaluationRecordsPerLanguagePerModel{ + "golang": EvaluationRecordsPerModel{ + "openrouter/anthropic/claude-1.2": &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0001, + LanguageID: "golang", + Assessments: metrics.NewAssessments(), + }, + }, + }, + }) + validate(t, &testCase{ + Name: "Multiple records", + + EvaluationRecords: EvaluationRecords{ + &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0001, + LanguageID: "golang", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, + metrics.AssessmentKeyProcessingTime: 4, + metrics.AssessmentKeyResponseCharacterCount: 5, + metrics.AssessmentKeyResponseNoError: 6, + metrics.AssessmentKeyResponseNoExcess: 7, + metrics.AssessmentKeyResponseWithCode: 8, + }, + }, + &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0001, + LanguageID: "golang", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, + metrics.AssessmentKeyProcessingTime: 4, + metrics.AssessmentKeyResponseCharacterCount: 5, + metrics.AssessmentKeyResponseNoError: 6, + metrics.AssessmentKeyResponseNoExcess: 7, + metrics.AssessmentKeyResponseWithCode: 8, + }, + }, + &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0001, + LanguageID: "java", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 1, + 
metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, + metrics.AssessmentKeyProcessingTime: 4, + metrics.AssessmentKeyResponseCharacterCount: 5, + metrics.AssessmentKeyResponseNoError: 6, + metrics.AssessmentKeyResponseNoExcess: 7, + metrics.AssessmentKeyResponseWithCode: 8, + }, + }, + &EvaluationRecord{ + ModelID: "ollama/codeqwen:latest", + ModelName: "Code Qwen", + ModelCost: 0.0003, + LanguageID: "golang", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, + metrics.AssessmentKeyProcessingTime: 4, + metrics.AssessmentKeyResponseCharacterCount: 5, + metrics.AssessmentKeyResponseNoError: 6, + metrics.AssessmentKeyResponseNoExcess: 7, + metrics.AssessmentKeyResponseWithCode: 8, + }, + }, + &EvaluationRecord{ + ModelID: "ollama/codeqwen:latest", + ModelName: "Code Qwen", + ModelCost: 0.0003, + LanguageID: "java", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, + metrics.AssessmentKeyProcessingTime: 4, + metrics.AssessmentKeyResponseCharacterCount: 5, + metrics.AssessmentKeyResponseNoError: 6, + metrics.AssessmentKeyResponseNoExcess: 7, + metrics.AssessmentKeyResponseWithCode: 8, + }, + }, + }, + + ExpectedEvaluationRecordsPerLanguagePerModel: EvaluationRecordsPerLanguagePerModel{ + "golang": EvaluationRecordsPerModel{ + "openrouter/anthropic/claude-1.2": &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0001, + LanguageID: "golang", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 2, + metrics.AssessmentKeyFilesExecuted: 4, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 6, + metrics.AssessmentKeyProcessingTime: 8, + metrics.AssessmentKeyResponseCharacterCount: 10, + 
metrics.AssessmentKeyResponseNoError: 12, + metrics.AssessmentKeyResponseNoExcess: 14, + metrics.AssessmentKeyResponseWithCode: 16, + }, + }, + "ollama/codeqwen:latest": &EvaluationRecord{ + ModelID: "ollama/codeqwen:latest", + ModelName: "Code Qwen", + ModelCost: 0.0003, + LanguageID: "golang", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, + metrics.AssessmentKeyProcessingTime: 4, + metrics.AssessmentKeyResponseCharacterCount: 5, + metrics.AssessmentKeyResponseNoError: 6, + metrics.AssessmentKeyResponseNoExcess: 7, + metrics.AssessmentKeyResponseWithCode: 8, + }, + }, + }, + "java": EvaluationRecordsPerModel{ + "openrouter/anthropic/claude-1.2": &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0001, + LanguageID: "java", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, + metrics.AssessmentKeyProcessingTime: 4, + metrics.AssessmentKeyResponseCharacterCount: 5, + metrics.AssessmentKeyResponseNoError: 6, + metrics.AssessmentKeyResponseNoExcess: 7, + metrics.AssessmentKeyResponseWithCode: 8, + }, + }, + "ollama/codeqwen:latest": &EvaluationRecord{ + ModelID: "ollama/codeqwen:latest", + ModelName: "Code Qwen", + ModelCost: 0.0003, + LanguageID: "java", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, + metrics.AssessmentKeyProcessingTime: 4, + metrics.AssessmentKeyResponseCharacterCount: 5, + metrics.AssessmentKeyResponseNoError: 6, + metrics.AssessmentKeyResponseNoExcess: 7, + metrics.AssessmentKeyResponseWithCode: 8, + }, + }, + }, + }, + }) + +} + +func TestWriteCSVs(t *testing.T) { + type testCase struct { + Name string + + FileName 
string + + ExpectedFileContent string + } + + resultPath := t.TempDir() + + evaluationFilePath := filepath.Join(resultPath, "evaluation.csv") + evaluationFileContent := bytesutil.StringTrimIndentations(` + model-id,model-name,cost,language,repository,task,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code + openrouter/anthropic/claude-2.0,Claude 2.0,0.001,golang,golang/light,write-tests,24,1,2,3,4,5,6,7,8 + openrouter/anthropic/claude-2.0,Claude 2.0,0.001,golang,golang/plain,write-tests,24,1,2,3,4,5,6,7,8 + openrouter/anthropic/claude-2.0,Claude 2.0,0.001,java,java/light,write-tests,69,10,11,12,13,14,15,16,17 + openrouter/anthropic/claude-2.0,Claude 2.0,0.001,java,java/plain,write-tests,69,10,11,12,13,14,15,16,17 + openrouter/anthropic/claude-3-sonnet,Claude 3 Sonnet,0.003,golang,golang/light,write-tests,21,8,7,6,5,4,3,2,1 + openrouter/anthropic/claude-3-sonnet,Claude 3 Sonnet,0.003,golang,golang/plain,write-tests,21,8,7,6,5,4,3,2,1 + openrouter/anthropic/claude-3-sonnet,Claude 3 Sonnet,0.003,java,java/light,write-tests,69,10,11,12,13,14,15,16,17 + openrouter/anthropic/claude-3-sonnet,Claude 3 Sonnet,0.003,java,java/plain,write-tests,69,10,11,12,13,14,15,16,17 + openrouter/openai/gpt-4,GPT 4,0.005,golang,golang/light,write-tests,24,1,2,3,4,5,6,7,8 + openrouter/openai/gpt-4,GPT 4,0.005,golang,golang/plain,write-tests,24,1,2,3,4,5,6,7,8 + openrouter/openai/gpt-4,GPT 4,0.005,java,java/light,write-tests,24,1,2,3,4,5,6,7,8 + openrouter/openai/gpt-4,GPT 4,0.005,java,java/plain,write-tests,24,1,2,3,4,5,6,7,8 + `) + require.NoError(t, os.WriteFile(evaluationFilePath, []byte(evaluationFileContent), 0644)) + + err := WriteCSVs(resultPath) + require.NoError(t, err) + + validate := func(t *testing.T, tc *testCase) { + t.Run(tc.Name, func(t *testing.T) { + summedFilePath := filepath.Join(resultPath, tc.FileName) + + _, err = os.Stat(summedFilePath) + 
require.NoError(t, err) + + actualSummedFileContent, err := os.ReadFile(summedFilePath) + require.NoError(t, err) + + assert.Equal(t, bytesutil.StringTrimIndentations(tc.ExpectedFileContent), string(actualSummedFileContent)) + }) + } + + validate(t, &testCase{ + Name: "Models summed", + + FileName: "models-summed.csv", + + ExpectedFileContent: ` + model-id,model-name,cost,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code + openrouter/anthropic/claude-2.0,Claude 2.0,0.001,186,22,26,30,34,38,42,46,50 + openrouter/anthropic/claude-3-sonnet,Claude 3 Sonnet,0.003,180,36,36,36,36,36,36,36,36 + openrouter/openai/gpt-4,GPT 4,0.005,96,4,8,12,16,20,24,28,32 + `, + }) + validate(t, &testCase{ + Name: "Golang summed", + + FileName: "golang-summed.csv", + + ExpectedFileContent: ` + model-id,model-name,cost,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code + openrouter/anthropic/claude-2.0,Claude 2.0,0.001,48,2,4,6,8,10,12,14,16 + openrouter/anthropic/claude-3-sonnet,Claude 3 Sonnet,0.003,42,16,14,12,10,8,6,4,2 + openrouter/openai/gpt-4,GPT 4,0.005,48,2,4,6,8,10,12,14,16 + `, + }) + validate(t, &testCase{ + Name: "Java summed", + + FileName: "java-summed.csv", + + ExpectedFileContent: ` + model-id,model-name,cost,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code + openrouter/anthropic/claude-2.0,Claude 2.0,0.001,138,20,22,24,26,28,30,32,34 + openrouter/anthropic/claude-3-sonnet,Claude 3 Sonnet,0.003,138,20,22,24,26,28,30,32,34 + openrouter/openai/gpt-4,GPT 4,0.005,48,2,4,6,8,10,12,14,16 + `, + }) +}