diff --git a/cmd/eval-dev-quality/cmd/evaluate.go b/cmd/eval-dev-quality/cmd/evaluate.go index 1d3f5460..9421169b 100644 --- a/cmd/eval-dev-quality/cmd/evaluate.go +++ b/cmd/eval-dev-quality/cmd/evaluate.go @@ -629,15 +629,6 @@ func (command *Evaluate) evaluateKubernetes(ctx *evaluate.Context) (err error) { // WriteCSVs writes the various CSV reports to disk. func writeCSVs(resultPath string, assessments *report.AssessmentStore) (err error) { - // Write the "evaluation.csv" containing all data. - csv, err := report.GenerateCSV(assessments) - if err != nil { - return pkgerrors.Wrap(err, "could not create evaluation.csv summary") - } - if err := os.WriteFile(filepath.Join(resultPath, "evaluation.csv"), []byte(csv), 0644); err != nil { - return pkgerrors.Wrap(err, "could not write evaluation.csv summary") - } - // Write the "models-summed.csv" containing the summary per model. byModel := assessments.CollapseByModel() csvByModel, err := report.GenerateCSV(byModel) diff --git a/cmd/eval-dev-quality/cmd/evaluate_test.go b/cmd/eval-dev-quality/cmd/evaluate_test.go index f941ecdd..b36152b1 100644 --- a/cmd/eval-dev-quality/cmd/evaluate_test.go +++ b/cmd/eval-dev-quality/cmd/evaluate_test.go @@ -731,27 +731,54 @@ func TestEvaluateExecute(t *testing.T) { filepath.Join("result-directory", "evaluation.csv"): func(t *testing.T, filePath, data string) { actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{ metrics.Assessments{ - metrics.AssessmentKeyCoverage: 30, - metrics.AssessmentKeyFilesExecuted: 3, - metrics.AssessmentKeyResponseNoError: 3, - metrics.AssessmentKeyResponseNoExcess: 3, - metrics.AssessmentKeyResponseWithCode: 3, + metrics.AssessmentKeyCoverage: 10, + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyResponseNoExcess: 1, + metrics.AssessmentKeyResponseWithCode: 1, }, metrics.Assessments{ - metrics.AssessmentKeyCoverage: 30, - metrics.AssessmentKeyFilesExecuted: 3, - 
metrics.AssessmentKeyResponseNoError: 3, - metrics.AssessmentKeyResponseNoExcess: 3, - metrics.AssessmentKeyResponseWithCode: 3, + metrics.AssessmentKeyCoverage: 10, + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyResponseNoExcess: 1, + metrics.AssessmentKeyResponseWithCode: 1, }, - }, []uint64{42, 42}) + metrics.Assessments{ + metrics.AssessmentKeyCoverage: 10, + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyResponseNoExcess: 1, + metrics.AssessmentKeyResponseWithCode: 1, + }, + metrics.Assessments{ + metrics.AssessmentKeyCoverage: 10, + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyResponseNoExcess: 1, + metrics.AssessmentKeyResponseWithCode: 1, + }, + metrics.Assessments{ + metrics.AssessmentKeyCoverage: 10, + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyResponseNoExcess: 1, + metrics.AssessmentKeyResponseWithCode: 1, + }, + metrics.Assessments{ + metrics.AssessmentKeyCoverage: 10, + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyResponseNoExcess: 1, + metrics.AssessmentKeyResponseWithCode: 1, + }, + }, []uint64{14, 14, 14, 14, 14, 14}) // Assert non-deterministic behavior. 
- assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(762)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(762)) - assert.Greater(t, actualAssessments[1][metrics.AssessmentKeyProcessingTime], uint64(0)) - assert.Equal(t, actualAssessments[1][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(762)) - assert.Equal(t, actualAssessments[1][metrics.AssessmentKeyResponseCharacterCount], uint64(762)) + for _, assessment := range actualAssessments { + assert.Greater(t, assessment[metrics.AssessmentKeyProcessingTime], uint64(0)) + assert.Equal(t, assessment[metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254)) + assert.Equal(t, assessment[metrics.AssessmentKeyResponseCharacterCount], uint64(254)) + } }, filepath.Join("result-directory", "evaluation.log"): func(t *testing.T, filePath, data string) { assert.Contains(t, data, "Run 1/3") diff --git a/evaluate/evaluate.go b/evaluate/evaluate.go index 1d8d69ba..aeb1dd47 100644 --- a/evaluate/evaluate.go +++ b/evaluate/evaluate.go @@ -74,6 +74,8 @@ func Evaluate(ctx *Context) (assessments *report.AssessmentStore, totalScore uin // Ensure we report metrics for every model even if they are excluded. assessments = report.NewAssessmentStore() problemsPerModel := map[string][]error{} + // Write the evaluation CSV header so it's only written once. + report.WriteEvaluationHeader(ctx.ResultPath) { // Create temporary repositories for each language so the repository is copied only once per language. @@ -145,6 +147,8 @@ func Evaluate(ctx *Context) (assessments *report.AssessmentStore, totalScore uin modelSucceededBasicChecksOfLanguage[model][language] = true } assessments.AddAssessmentPerTask(model, language, repositoryPath, assessment) + // Write the task assessment to the evaluation CSV file. 
+ report.WriteEvaluationRecord(ctx.ResultPath, model, language, temporaryRepository.Name(), assessment) } }) } @@ -249,6 +253,8 @@ func Evaluate(ctx *Context) (assessments *report.AssessmentStore, totalScore uin ctx.Log.Printf("ERROR: Model %q encountered a hard error for language %q, repository %q: %+v", modelID, languageID, repositoryPath, err) } assessments.AddAssessmentPerTask(model, language, repositoryPath, assessment) + // Write the task assessment to the evaluation CSV file. + report.WriteEvaluationRecord(ctx.ResultPath, model, language, temporaryRepository.Name(), assessment) } }) } diff --git a/evaluate/evaluate_test.go b/evaluate/evaluate_test.go index 26f0bcd0..c47c6eb0 100644 --- a/evaluate/evaluate_test.go +++ b/evaluate/evaluate_test.go @@ -180,6 +180,8 @@ func TestEvaluate(t *testing.T) { Before: func(t *testing.T, logger *log.Logger, resultPath string) { // Set up mocks, when test is running. mockedModel.MockCapabilityWriteTests.On("WriteTests", mock.Anything).Return(nil, ErrEmptyResponseFromModel) + mockedModel.MockModel.On("Name").Return("Empty Response Model") + mockedModel.MockModel.On("Cost").Return(0.0001) }, Context: &Context{ @@ -211,6 +213,7 @@ func TestEvaluate(t *testing.T) { ExpectedTotalScore: 2, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(evaluatetask.IdentifierWriteTests), mockedModel.ID(), "golang", "golang", "plain.log"): nil, + filepath.Join("evaluation.csv"): nil, }, }) } @@ -266,6 +269,7 @@ func TestEvaluate(t *testing.T) { filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): func(t *testing.T, filePath, data string) { assert.Contains(t, data, ErrEmptyResponseFromModel.Error()) }, + filepath.Join("evaluation.csv"): nil, }, }) } @@ -332,6 +336,7 @@ func TestEvaluate(t *testing.T) { filepath.Join(string(evaluatetask.IdentifierWriteTests), 
evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): func(t *testing.T, filePath, data string) { assert.Contains(t, data, "Attempt 1/3: "+ErrEmptyResponseFromModel.Error()) }, + filepath.Join("evaluation.csv"): nil, }, }) } @@ -397,6 +402,7 @@ func TestEvaluate(t *testing.T) { filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): func(t *testing.T, filePath, data string) { assert.Contains(t, data, "DONE 0 tests, 1 error") }, + filepath.Join("evaluation.csv"): nil, }, }) } @@ -428,6 +434,8 @@ func TestEvaluate(t *testing.T) { languageGolang := &golang.Language{} mockedModelID := "mocked-generation-model" mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) + mockedModel.MockModel.On("Name").Return("Mocked Generation Model") + mockedModel.MockModel.On("Cost").Return(0.0001) validate(t, &testCase{ Name: "Problems of previous runs shouldn't cancel successive runs", @@ -517,6 +525,7 @@ func TestEvaluate(t *testing.T) { ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil, filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "next.log"): nil, + filepath.Join("evaluation.csv"): nil, }, }) } @@ -524,6 +533,8 @@ func TestEvaluate(t *testing.T) { languageGolang := &golang.Language{} mockedModelID := "mocked-generation-model" mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) + mockedModel.MockModel.On("Name").Return("Mocked Generation Model") + mockedModel.MockModel.On("Cost").Return(0.0001) validate(t, &testCase{ Name: "Solving basic checks once is enough", @@ -612,6 +623,7 @@ func TestEvaluate(t *testing.T) { ExpectedResultFiles: 
map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil, filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "next.log"): nil, + filepath.Join("evaluation.csv"): nil, }, }) } @@ -619,6 +631,8 @@ func TestEvaluate(t *testing.T) { languageGolang := &golang.Language{} mockedModelID := "mocked-generation-model" mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) + mockedModel.MockModel.On("Name").Return("Mocked Generation Model") + mockedModel.MockModel.On("Cost").Return(0.0001) validate(t, &testCase{ Name: "Never solving basic checks leads to exclusion", @@ -672,6 +686,7 @@ func TestEvaluate(t *testing.T) { ExpectedTotalScore: 0, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil, + filepath.Join("evaluation.csv"): nil, }, }) } @@ -684,6 +699,9 @@ func TestEvaluate(t *testing.T) { languageGolang := &golang.Language{} mockedModelID := "mocked-generation-model" mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) + mockedModel.MockModel.On("Name").Return("Mocked Generation Model") + mockedModel.MockModel.On("Cost").Return(0.0001) + repositoryPath := filepath.Join("golang", "plain") validate(t, &testCase{ Name: "Interleaved", @@ -736,6 +754,7 @@ func TestEvaluate(t *testing.T) { ExpectedTotalScore: 6, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil, + filepath.Join("evaluation.csv"): nil, }, ExpectedOutputValidate: func(t *testing.T, 
output string, resultPath string) { assert.Contains(t, output, "Run 1/3") @@ -751,6 +770,9 @@ func TestEvaluate(t *testing.T) { languageGolang := &golang.Language{} mockedModelID := "mocked-generation-model" mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) + mockedModel.MockModel.On("Name").Return("Mocked Generation Model") + mockedModel.MockModel.On("Cost").Return(0.0001) + repositoryPath := filepath.Join("golang", "plain") validate(t, &testCase{ Name: "Sequential", @@ -803,6 +825,7 @@ func TestEvaluate(t *testing.T) { ExpectedTotalScore: 6, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil, + filepath.Join("evaluation.csv"): nil, }, ExpectedOutputValidate: func(t *testing.T, output string, resultPath string) { assert.Contains(t, output, "Run 1/3 for model") @@ -826,6 +849,8 @@ func TestEvaluate(t *testing.T) { languageGolang := &golang.Language{} mockedModelID := "testing-provider/testing-model" mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) + mockedModel.MockModel.On("Name").Return("Testing Model") + mockedModel.MockModel.On("Cost").Return(0.0001) mockedProviderID := "testing-provider" mockedProvider := providertesting.NewMockProviderNamedWithModels(t, mockedProviderID, []model.Model{mockedModel}) mockedLoader := providertesting.NewMockLoader(t) @@ -900,6 +925,7 @@ func TestEvaluate(t *testing.T) { ExpectedTotalScore: 6, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil, + filepath.Join("evaluation.csv"): nil, }, }) } @@ -908,6 +934,8 @@ func TestEvaluate(t *testing.T) { languageGolang := &golang.Language{} mockedModelID := 
"testing-provider/testing-model" mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) + mockedModel.MockModel.On("Name").Return("Testing Model") + mockedModel.MockModel.On("Cost").Return(0.0001) mockedProviderID := "testing-provider" mockedProvider := providertesting.NewMockProviderNamedWithModels(t, mockedProviderID, []model.Model{mockedModel}) mockedLoader := providertesting.NewMockLoader(t) @@ -980,6 +1008,7 @@ func TestEvaluate(t *testing.T) { ExpectedTotalScore: 6, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil, + filepath.Join("evaluation.csv"): nil, }, }) } @@ -989,6 +1018,9 @@ func TestEvaluate(t *testing.T) { languageGolang := &golang.Language{} mockedModelID := "testing-provider/testing-model" mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) + mockedModel.MockModel.On("Name").Return("Testing Model") + mockedModel.MockModel.On("Cost").Return(0.0001) + repositoryPath := filepath.Join("golang", "plain") validate(t, &testCase{ @@ -1041,6 +1073,7 @@ func TestEvaluate(t *testing.T) { ExpectedTotalScore: 2, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil, + filepath.Join("evaluation.csv"): nil, }, }) } diff --git a/evaluate/report/csv.go b/evaluate/report/csv.go index 0088a712..e10de5b9 100644 --- a/evaluate/report/csv.go +++ b/evaluate/report/csv.go @@ -3,6 +3,8 @@ package report import ( "cmp" "encoding/csv" + "os" + "path/filepath" "slices" "strconv" "strings" @@ -44,27 +46,6 @@ func GenerateCSV(formatter CSVFormatter) (csvData string, err error) { return out.String(), nil } -// Header returns the header description as a CSV row. 
-func (a *AssessmentStore) Header() (header []string) { - return append([]string{"model-id", "model-name", "cost", "language", "repository", "task", "score"}, metrics.AllAssessmentKeysStrings...) -} - -// Rows returns all data as CSV rows. -func (a *AssessmentStore) Rows() (rows [][]string) { - _ = a.Walk(func(m model.Model, l language.Language, r string, t task.Identifier, a metrics.Assessments) (err error) { - metrics := a.StringCSV() - score := a.Score() - cost := m.Cost() - - row := append([]string{m.ID(), m.Name(), strconv.FormatFloat(cost, 'f', -1, 64), l.ID(), r, string(t), strconv.FormatUint(uint64(score), 10)}, metrics...) - rows = append(rows, row) - - return nil - }) - - return rows -} - // Header returns the header description as a CSV row. func (a AssessmentPerModel) Header() (header []string) { return append([]string{"model-id", "model-name", "cost", "score"}, metrics.AllAssessmentKeysStrings...) @@ -88,3 +69,55 @@ func (a AssessmentPerModel) Rows() (rows [][]string) { return rows } + +// EvaluationHeader returns the CSV header for the evaluation CSV. +func EvaluationHeader() (header []string) { + return append([]string{"model-id", "model-name", "cost", "language", "repository", "task", "score"}, metrics.AllAssessmentKeysStrings...) +} + +// WriteEvaluationHeader writes the header to the evaluation CSV file. +func WriteEvaluationHeader(resultPath string) (err error) { + var out strings.Builder + csv := csv.NewWriter(&out) + + if err := csv.Write(EvaluationHeader()); err != nil { + return pkgerrors.WithStack(err) + } + csv.Flush() + + if err = os.WriteFile(filepath.Join(resultPath, "evaluation.csv"), []byte(out.String()), 0644); err != nil { + return pkgerrors.WithStack(err) + } + + return nil +} + +// WriteEvaluationRecord writes the assessments of a task into the evaluation CSV.
+func WriteEvaluationRecord(resultPath string, model model.Model, language language.Language, repositoryName string, assessmentsPerTask map[task.Identifier]metrics.Assessments) (err error) { + var out strings.Builder + csv := csv.NewWriter(&out) + + tasks := maps.Keys(assessmentsPerTask) + slices.SortStableFunc(tasks, func(a, b task.Identifier) int { + return cmp.Compare(a, b) + }) + + for _, task := range tasks { + assessment := assessmentsPerTask[task] + row := append([]string{model.ID(), model.Name(), strconv.FormatFloat(model.Cost(), 'f', -1, 64), language.ID(), repositoryName, string(task), strconv.FormatUint(uint64(assessment.Score()), 10)}, assessment.StringCSV()...) + csv.Write(row) + } + csv.Flush() + + evaluationFile, err := os.OpenFile(filepath.Join(resultPath, "evaluation.csv"), os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + return pkgerrors.WithStack(err) + } + defer evaluationFile.Close() + + if _, err := evaluationFile.WriteString(out.String()); err != nil { + return pkgerrors.WithStack(err) + } + + return nil +} diff --git a/evaluate/report/csv_test.go b/evaluate/report/csv_test.go index f3c968ce..c756d8bb 100644 --- a/evaluate/report/csv_test.go +++ b/evaluate/report/csv_test.go @@ -1,102 +1,21 @@ package report import ( + "os" + "path/filepath" "testing" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "github.com/zimmski/osutil/bytesutil" "github.com/symflower/eval-dev-quality/evaluate/metrics" - metricstesting "github.com/symflower/eval-dev-quality/evaluate/metrics/testing" evaluatetask "github.com/symflower/eval-dev-quality/evaluate/task" languagetesting "github.com/symflower/eval-dev-quality/language/testing" modeltesting "github.com/symflower/eval-dev-quality/model/testing" + "github.com/symflower/eval-dev-quality/task" ) -func TestGenerateCSVForAssessmentPerModelPerLanguagePerRepository(t *testing.T) { - type testCase struct { - Name string - - Assessments metricstesting.AssessmentTuples - - 
ExpectedString string - } - - validate := func(t *testing.T, tc *testCase) { - t.Run(tc.Name, func(t *testing.T) { - assessmentStore := assessmentTuplesToStore(tc.Assessments) - - actualString, err := GenerateCSV(assessmentStore) - assert.NoError(t, err) - - assert.Equal(t, bytesutil.StringTrimIndentations(tc.ExpectedString), actualString) - }) - } - - validate(t, &testCase{ - Name: "Single Empty Model", - - Assessments: metricstesting.AssessmentTuples{ - &metricstesting.AssessmentTuple{ - Model: modeltesting.NewMockModelNamedWithCosts(t, "some-model", "Some Model", 0), - Language: languagetesting.NewMockLanguageNamed(t, "some-language"), - RepositoryPath: "some-repository", - Task: evaluatetask.IdentifierWriteTests, - Assessment: metrics.NewAssessments(), - }, - }, - - ExpectedString: ` - model-id,model-name,cost,language,repository,task,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code - some-model,Some Model,0,some-language,some-repository,write-tests,0,0,0,0,0,0,0,0,0 - `, - }) - validate(t, &testCase{ - Name: "Multiple Models", - - Assessments: metricstesting.AssessmentTuples{ - &metricstesting.AssessmentTuple{ - Model: modeltesting.NewMockModelNamedWithCosts(t, "some-model-a", "Some Model A", 0.0001), - Language: languagetesting.NewMockLanguageNamed(t, "some-language"), - RepositoryPath: "some-repository", - Task: evaluatetask.IdentifierWriteTests, - Assessment: metrics.Assessments{ - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 50, - metrics.AssessmentKeyResponseCharacterCount: 100, - metrics.AssessmentKeyCoverage: 1, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyResponseNoError: 3, - metrics.AssessmentKeyResponseNoExcess: 4, - metrics.AssessmentKeyResponseWithCode: 5, - metrics.AssessmentKeyProcessingTime: 200, - }, - }, - &metricstesting.AssessmentTuple{ - Model: modeltesting.NewMockModelNamedWithCosts(t, 
"some-model-b", "Some Model B", 0.0005), - Language: languagetesting.NewMockLanguageNamed(t, "some-language"), - RepositoryPath: "some-repository", - Task: evaluatetask.IdentifierWriteTests, - Assessment: metrics.Assessments{ - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 100, - metrics.AssessmentKeyResponseCharacterCount: 200, - metrics.AssessmentKeyCoverage: 1, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyResponseNoError: 3, - metrics.AssessmentKeyResponseNoExcess: 4, - metrics.AssessmentKeyResponseWithCode: 5, - metrics.AssessmentKeyProcessingTime: 300, - }, - }, - }, - - ExpectedString: ` - model-id,model-name,cost,language,repository,task,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code - some-model-a,Some Model A,0.0001,some-language,some-repository,write-tests,15,1,2,50,200,100,3,4,5 - some-model-b,Some Model B,0.0005,some-language,some-repository,write-tests,15,1,2,100,300,200,3,4,5 - `, - }) -} - func TestGenerateCSVForAssessmentPerModel(t *testing.T) { type testCase struct { Name string @@ -160,3 +79,122 @@ func TestGenerateCSVForAssessmentPerModel(t *testing.T) { `, }) } + +func TestWriteEvaluationHeader(t *testing.T) { + resultPath := t.TempDir() + + WriteEvaluationHeader(resultPath) + + _, err := os.Stat(filepath.Join(resultPath, "evaluation.csv")) + require.NoError(t, err) + + actualHeader, err := os.ReadFile(filepath.Join(resultPath, "evaluation.csv")) + require.NoError(t, err) + + expectedHeader := bytesutil.StringTrimIndentations(` + model-id,model-name,cost,language,repository,task,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code + `) + + assert.Equal(t, expectedHeader, string(actualHeader)) +} + +func TestWriteEvaluationRecord(t *testing.T) { + type testCase struct { + Name string + + 
Before func(resultPath string) + + Assessments map[task.Identifier]metrics.Assessments + + ExpectedCSV string + } + + validate := func(t *testing.T, tc *testCase) { + t.Run(tc.Name, func(t *testing.T) { + resultPath := t.TempDir() + evaluationCSVFilePath := filepath.Join(resultPath, "evaluation.csv") + + modelMock := modeltesting.NewMockModelNamedWithCosts(t, "mocked-model", "Mocked Model", 0.0001) + languageMock := languagetesting.NewMockLanguageNamed(t, "golang") + + if tc.Before != nil { + tc.Before(resultPath) + } + + err := WriteEvaluationRecord(resultPath, modelMock, languageMock, "golang/plain", tc.Assessments) + require.NoError(t, err) + + _, err = os.Stat(evaluationCSVFilePath) + require.NoError(t, err) + + actualCSV, err := os.ReadFile(evaluationCSVFilePath) + require.NoError(t, err) + + assert.Equal(t, bytesutil.StringTrimIndentations(tc.ExpectedCSV), string(actualCSV)) + }) + } + + validate(t, &testCase{ + Name: "Evaluation file does not exist", + + Assessments: map[task.Identifier]metrics.Assessments{ + evaluatetask.IdentifierWriteTests: metrics.Assessments{ + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyCoverage: 10, + }, + evaluatetask.IdentifierWriteTestsSymflowerFix: metrics.Assessments{ + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyCoverage: 10, + }, + }, + + ExpectedCSV: ` + mocked-model,Mocked Model,0.0001,golang,golang/plain,write-tests,12,10,1,0,0,0,1,0,0 + mocked-model,Mocked Model,0.0001,golang,golang/plain,write-tests-symflower-fix,12,10,1,0,0,0,1,0,0 + `, + }) + validate(t, &testCase{ + Name: "Single task with empty assessments", + + Before: func(resultPath string) { + WriteEvaluationHeader(resultPath) + }, + + Assessments: map[task.Identifier]metrics.Assessments{ + evaluatetask.IdentifierWriteTests: metrics.NewAssessments(), + }, + + ExpectedCSV: ` + 
model-id,model-name,cost,language,repository,task,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code + mocked-model,Mocked Model,0.0001,golang,golang/plain,write-tests,0,0,0,0,0,0,0,0,0 + `, + }) + validate(t, &testCase{ + Name: "Multiple tasks with assessments", + + Before: func(resultPath string) { + WriteEvaluationHeader(resultPath) + }, + + Assessments: map[task.Identifier]metrics.Assessments{ + evaluatetask.IdentifierWriteTests: metrics.Assessments{ + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyCoverage: 0, + }, + evaluatetask.IdentifierWriteTestsSymflowerFix: metrics.Assessments{ + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyCoverage: 10, + }, + }, + + ExpectedCSV: ` + model-id,model-name,cost,language,repository,task,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code + mocked-model,Mocked Model,0.0001,golang,golang/plain,write-tests,2,0,1,0,0,0,1,0,0 + mocked-model,Mocked Model,0.0001,golang,golang/plain,write-tests-symflower-fix,12,10,1,0,0,0,1,0,0 + `, + }) +}