diff --git a/cmd/eval-dev-quality/cmd/evaluate.go b/cmd/eval-dev-quality/cmd/evaluate.go index e1d8d7ef..b1ebe626 100644 --- a/cmd/eval-dev-quality/cmd/evaluate.go +++ b/cmd/eval-dev-quality/cmd/evaluate.go @@ -437,7 +437,7 @@ func (command *Evaluate) evaluateLocal(evaluationContext *evaluate.Context) (err return nil }) - if err := writeCSVs(command.ResultPath, assessments); err != nil { + if err := report.WriteCSVs(command.ResultPath); err != nil { command.logger.Panicf("ERROR: %s", err) } @@ -636,30 +636,3 @@ func (command *Evaluate) evaluateKubernetes(ctx *evaluate.Context) (err error) { return nil } - -// WriteCSVs writes the various CSV reports to disk. -func writeCSVs(resultPath string, assessments *report.AssessmentStore) (err error) { - // Write the "models-summed.csv" containing the summary per model. - byModel := assessments.CollapseByModel() - csvByModel, err := report.GenerateCSV(byModel) - if err != nil { - return pkgerrors.Wrap(err, "could not create models-summed.csv summary") - } - if err := os.WriteFile(filepath.Join(resultPath, "models-summed.csv"), []byte(csvByModel), 0644); err != nil { - return pkgerrors.Wrap(err, "could not write models-summed.csv summary") - } - - // Write the individual "language-summed.csv" containing the summary per model per language. 
- byLanguage := assessments.CollapseByLanguage() - for language, modelsByLanguage := range byLanguage { - csvByLanguage, err := report.GenerateCSV(modelsByLanguage) - if err != nil { - return pkgerrors.Wrap(err, "could not create "+language.ID()+"-summed.csv summary") - } - if err := os.WriteFile(filepath.Join(resultPath, language.ID()+"-summed.csv"), []byte(csvByLanguage), 0644); err != nil { - return pkgerrors.Wrap(err, "could not write "+language.ID()+"-summed.csv summary") - } - } - - return nil -} diff --git a/evaluate/report/collection.go b/evaluate/report/collection.go index 33760a20..dc739133 100644 --- a/evaluate/report/collection.go +++ b/evaluate/report/collection.go @@ -13,9 +13,6 @@ import ( "github.com/symflower/eval-dev-quality/task" ) -// AssessmentPerLanguagePerModel holds a collection of assessments per language and model. -type AssessmentPerLanguagePerModel map[language.Language]AssessmentPerModel - // AssessmentPerModel holds a collection of assessments per model. type AssessmentPerModel map[model.Model]metrics.Assessments @@ -133,23 +130,3 @@ func (a *AssessmentStore) CollapseByModel() AssessmentPerModel { return perModel } - -// CollapseByLanguage returns all assessments aggregated per language and model. 
-func (a *AssessmentStore) CollapseByLanguage() AssessmentPerLanguagePerModel { - assessments := AssessmentPerLanguagePerModel{} - _ = a.Walk(func(m model.Model, l language.Language, r string, t task.Identifier, a metrics.Assessments) (err error) { - if _, ok := assessments[l]; !ok { - assessments[l] = map[model.Model]metrics.Assessments{} - } - - if _, ok := assessments[l][m]; !ok { - assessments[l][m] = metrics.NewAssessments() - } - - assessments[l][m].Add(a) - - return nil - }) - - return assessments -} diff --git a/evaluate/report/collection_test.go b/evaluate/report/collection_test.go index d779c0d8..fb324ddd 100644 --- a/evaluate/report/collection_test.go +++ b/evaluate/report/collection_test.go @@ -388,129 +388,6 @@ func TestAssessmentCollapseByModel(t *testing.T) { }) } -func TestAssessmentCollapseByLanguage(t *testing.T) { - type testCase struct { - Name string - - Assessments metricstesting.AssessmentTuples - - ExpectedAssessmentPerLanguagePerModel AssessmentPerLanguagePerModel - } - - validate := func(t *testing.T, tc *testCase) { - t.Run(tc.Name, func(t *testing.T) { - assessmentStore := assessmentTuplesToStore(tc.Assessments) - - actualAssessmentPerLanguagePerModel := assessmentStore.CollapseByLanguage() - - assert.Equal(t, tc.ExpectedAssessmentPerLanguagePerModel, actualAssessmentPerLanguagePerModel) - }) - } - - modelA := modeltesting.NewMockCapabilityWriteTestsNamed(t, "some-model-a") - modelB := modeltesting.NewMockCapabilityWriteTestsNamed(t, "some-model-b") - languageA := languagetesting.NewMockLanguageNamed(t, "some-language-a") - languageB := languagetesting.NewMockLanguageNamed(t, "some-language-b") - - validate(t, &testCase{ - Name: "Collapse", - - Assessments: metricstesting.AssessmentTuples{ - &metricstesting.AssessmentTuple{ - Model: modelA, - Language: languageA, - RepositoryPath: "some-repository-a", - Task: evaluatetask.IdentifierWriteTests, - Assessment: metrics.Assessments{ - metrics.AssessmentKeyResponseNoExcess: 1, - }, - }, - 
&metricstesting.AssessmentTuple{ - Model: modelA, - Language: languageA, - RepositoryPath: "some-repository-b", - Task: evaluatetask.IdentifierWriteTests, - Assessment: metrics.Assessments{ - metrics.AssessmentKeyResponseNoExcess: 2, - }, - }, - &metricstesting.AssessmentTuple{ - Model: modelA, - Language: languageB, - RepositoryPath: "some-repository-a", - Task: evaluatetask.IdentifierWriteTests, - Assessment: metrics.Assessments{ - metrics.AssessmentKeyResponseNoExcess: 3, - }, - }, - &metricstesting.AssessmentTuple{ - Model: modelA, - Language: languageB, - RepositoryPath: "some-repository-b", - Task: evaluatetask.IdentifierWriteTests, - Assessment: metrics.Assessments{ - metrics.AssessmentKeyResponseNoExcess: 4, - }, - }, - &metricstesting.AssessmentTuple{ - Model: modelB, - Language: languageA, - RepositoryPath: "some-repository-a", - Task: evaluatetask.IdentifierWriteTests, - Assessment: metrics.Assessments{ - metrics.AssessmentKeyResponseNoExcess: 5, - }, - }, - &metricstesting.AssessmentTuple{ - Model: modelB, - Language: languageA, - RepositoryPath: "some-repository-b", - Task: evaluatetask.IdentifierWriteTests, - Assessment: metrics.Assessments{ - metrics.AssessmentKeyResponseNoExcess: 6, - }, - }, - &metricstesting.AssessmentTuple{ - Model: modelB, - Language: languageB, - RepositoryPath: "some-repository-a", - Task: evaluatetask.IdentifierWriteTests, - Assessment: metrics.Assessments{ - metrics.AssessmentKeyResponseNoExcess: 7, - }, - }, - &metricstesting.AssessmentTuple{ - Model: modelB, - Language: languageB, - RepositoryPath: "some-repository-b", - Task: evaluatetask.IdentifierWriteTests, - Assessment: metrics.Assessments{ - metrics.AssessmentKeyResponseNoExcess: 8, - }, - }, - }, - - ExpectedAssessmentPerLanguagePerModel: AssessmentPerLanguagePerModel{ - languageA: map[model.Model]metrics.Assessments{ - modelA: { - metrics.AssessmentKeyResponseNoExcess: 3, - }, - modelB: { - metrics.AssessmentKeyResponseNoExcess: 11, - }, - }, - languageB: 
map[model.Model]metrics.Assessments{ - modelA: { - metrics.AssessmentKeyResponseNoExcess: 7, - }, - modelB: { - metrics.AssessmentKeyResponseNoExcess: 15, - }, - }, - }, - }) -} - func assessmentTuplesToStore(at metricstesting.AssessmentTuples) (store *AssessmentStore) { store = NewAssessmentStore() for _, a := range at { diff --git a/evaluate/report/csv.go b/evaluate/report/csv.go index 81ece6b7..17e44615 100644 --- a/evaluate/report/csv.go +++ b/evaluate/report/csv.go @@ -4,6 +4,8 @@ import ( "cmp" "encoding/csv" "io" + "os" + "path/filepath" "slices" "strconv" "strings" @@ -25,102 +27,273 @@ type CSVFormatter interface { Rows() (rows [][]string) } -// GenerateCSV returns the whole CSV as string. -func GenerateCSV(formatter CSVFormatter) (csvData string, err error) { - var out strings.Builder - csv := csv.NewWriter(&out) +// EvaluationFile holds the evaluation CSV file writer. +type EvaluationFile struct { + // Holds the writer where the evaluation CSV is written to. + io.Writer +} - if err := csv.Write(formatter.Header()); err != nil { - return "", pkgerrors.WithStack(err) +// NewEvaluationFile initializes an evaluation file and writes the corresponding CSV header. +func NewEvaluationFile(writer io.Writer) (evaluationFile *EvaluationFile, err error) { + evaluationFile = &EvaluationFile{ + Writer: writer, } - for _, row := range formatter.Rows() { - if err := csv.Write(row); err != nil { - return "", pkgerrors.WithStack(err) - } + csv := csv.NewWriter(writer) + + if err := csv.Write(evaluationHeader()); err != nil { + return nil, pkgerrors.WithStack(err) } + csv.Flush() + + return evaluationFile, nil +} + +// WriteEvaluationRecord writes the assessments of a task into the evaluation CSV. 
+func (e *EvaluationFile) WriteEvaluationRecord(model model.Model, language language.Language, repositoryName string, assessmentsPerTask map[task.Identifier]metrics.Assessments) (err error) { + csv := csv.NewWriter(e.Writer) + + tasks := maps.Keys(assessmentsPerTask) + slices.SortStableFunc(tasks, func(a, b task.Identifier) int { + return cmp.Compare(a, b) + }) + for _, task := range tasks { + assessment := assessmentsPerTask[task] + row := append([]string{model.ID(), model.Name(), strconv.FormatFloat(model.Cost(), 'f', -1, 64), language.ID(), repositoryName, string(task), strconv.FormatUint(uint64(assessment.Score()), 10)}, assessment.StringCSV()...) + csv.Write(row) + } csv.Flush() - return out.String(), nil + return nil +} + +// evaluationHeader returns the CSV header for the evaluation CSV. +func evaluationHeader() (header []string) { + return append([]string{"model-id", "model-name", "cost", "language", "repository", "task", "score"}, metrics.AllAssessmentKeysStrings...) +} + +// EvaluationRecord holds a line of the evaluation CSV. +type EvaluationRecord struct { + // ModelID holds the model id. + ModelID string + // ModelName holds the model name. + ModelName string + // ModelCost holds the model cost. + ModelCost float64 + + // LanguageID holds the language id. + LanguageID string + + // Assessments holds the assessments of an entry. + Assessments metrics.Assessments +} + +// Clone clones an evaluation record. +func (e *EvaluationRecord) Clone() (new *EvaluationRecord) { + new = &EvaluationRecord{} + + new.ModelID = e.ModelID + new.ModelName = e.ModelName + new.ModelCost = e.ModelCost + new.LanguageID = e.LanguageID + new.Assessments = metrics.Merge(e.Assessments, nil) + + return new +} + +// EvaluationRecords holds all the evaluation records. +type EvaluationRecords []*EvaluationRecord + +// EvaluationRecordsPerModel holds the collection of evaluation records per model. 
+type EvaluationRecordsPerModel map[string]*EvaluationRecord + +// GroupByModel groups the evaluation records by model. +func (e EvaluationRecords) GroupByModel() EvaluationRecordsPerModel { + perModel := map[string]*EvaluationRecord{} + + for _, record := range e { + _, ok := perModel[record.ModelID] + if !ok { + perModel[record.ModelID] = record.Clone() + } else { + r := perModel[record.ModelID] + r.Assessments = metrics.Merge(r.Assessments, record.Assessments) + } + } + + return perModel } // Header returns the header description as a CSV row. -func (a AssessmentPerModel) Header() (header []string) { +func (EvaluationRecordsPerModel) Header() (header []string) { return append([]string{"model-id", "model-name", "cost", "score"}, metrics.AllAssessmentKeysStrings...) } // Rows returns all data as CSV rows. -func (a AssessmentPerModel) Rows() (rows [][]string) { - models := maps.Keys(a) - slices.SortStableFunc(models, func(a, b model.Model) int { - return cmp.Compare(a.ID(), b.ID()) +func (e EvaluationRecordsPerModel) Rows() (rows [][]string) { + models := maps.Keys(e) + slices.SortStableFunc(models, func(a, b string) int { + return cmp.Compare(a, b) }) for _, model := range models { - metrics := a[model].StringCSV() - score := a[model].Score() - cost := model.Cost() + record := e[model] + metrics := record.Assessments.StringCSV() + score := record.Assessments.Score() + modelCost := record.ModelCost - row := append([]string{model.ID(), model.Name(), strconv.FormatFloat(cost, 'f', -1, 64), strconv.FormatUint(uint64(score), 10)}, metrics...) + row := append([]string{record.ModelID, record.ModelName, strconv.FormatFloat(modelCost, 'f', -1, 64), strconv.FormatUint(uint64(score), 10)}, metrics...) rows = append(rows, row) } return rows } -// EvaluationFile holds the evaluation CSV file writer. -type EvaluationFile struct { - // Holds the writer where the evaluation CSV is written to. 
- io.Writer +// EvaluationRecordsPerLanguagePerModel holds the collection of evaluation records per language and model. +type EvaluationRecordsPerLanguagePerModel map[string]EvaluationRecordsPerModel + +// GroupByLanguageAndModel groups the evaluation records by language and model. +func (e EvaluationRecords) GroupByLanguageAndModel() EvaluationRecordsPerLanguagePerModel { + perLanguageAndModel := map[string]EvaluationRecordsPerModel{} + + for _, record := range e { + perModel, ok := perLanguageAndModel[record.LanguageID] + if !ok { + perLanguageAndModel[record.LanguageID] = EvaluationRecordsPerModel{ + record.ModelID: record, + } + } else { + _, ok := perModel[record.ModelID] + if !ok { + perModel[record.ModelID] = record.Clone() + } else { + perModel[record.ModelID].Assessments = metrics.Merge(perModel[record.ModelID].Assessments, record.Assessments) + } + } + } + + return perLanguageAndModel } -// NewEvaluationFile initializes an evaluation file and writes the corresponding CSV header. -func NewEvaluationFile(writer io.Writer) (evaluationFile *EvaluationFile, err error) { - evaluationFile = &EvaluationFile{ - Writer: writer, +// loadEvaluationRecords reads and returns the evaluation records from the evaluation CSV file. +func loadEvaluationRecords(evaluationFilePath string) (evaluationRecords EvaluationRecords, err error) { + evaluationFile, err := os.Open(evaluationFilePath) + if err != nil { + return nil, pkgerrors.WithStack(err) + } + defer evaluationFile.Close() - var out strings.Builder - csv := csv.NewWriter(&out) + reader := csv.NewReader(evaluationFile) - if err := csv.Write(evaluationHeader()); err != nil { + // Check if the evaluation CSV header is correct. 
+ if header, err := reader.Read(); err != nil { + return nil, pkgerrors.Wrap(err, "found error while reading evaluation file") + } else if strings.Join(header, ",") != strings.Join(evaluationHeader(), ",") { + return nil, pkgerrors.WithStack(pkgerrors.Errorf("expected header %+v\nfound header %+v", evaluationHeader(), header)) + } + + // Read the raw records from the evaluation CSV file. + records, err := reader.ReadAll() + if err != nil { return nil, pkgerrors.WithStack(err) } - csv.Flush() - if _, err = evaluationFile.Writer.Write([]byte(out.String())); err != nil { + // Convert the raw records into assessments that can be easily manipulated. + evaluationRecords = EvaluationRecords{} + for _, record := range records { + evaluationRecord, err := convertRawRecordToEvaluationRecord(record) + if err != nil { + return nil, err + } + evaluationRecords = append(evaluationRecords, evaluationRecord) + } + + return evaluationRecords, nil +} + +// convertRawRecordToEvaluationRecord converts a raw CSV record into an evaluation record. +func convertRawRecordToEvaluationRecord(raw []string) (record *EvaluationRecord, err error) { + assessments := metrics.NewAssessments() + + modelID := raw[0] + modelName := raw[1] + modelCost, err := strconv.ParseFloat(raw[2], 64) + if err != nil { return nil, pkgerrors.WithStack(err) } - return evaluationFile, nil + languageID := raw[3] + + rawMetrics := raw[7:] + for i, assessementKey := range metrics.AllAssessmentKeysStrings { + metric, err := strconv.ParseUint(rawMetrics[i], 10, 64) + if err != nil { + return nil, pkgerrors.WithStack(err) + } + + assessments[metrics.AssessmentKey(assessementKey)] = metric + } + + return &EvaluationRecord{ + ModelID: modelID, + ModelName: modelName, + ModelCost: modelCost, + + LanguageID: languageID, + + Assessments: assessments, + }, nil } -// WriteEvaluationRecord writes the assessments of a task into the evaluation CSV. 
-func (e *EvaluationFile) WriteEvaluationRecord(model model.Model, language language.Language, repositoryName string, assessmentsPerTask map[task.Identifier]metrics.Assessments) (err error) { +// generateCSV returns the whole CSV as string. +func generateCSV(formatter CSVFormatter) (csvData string, err error) { var out strings.Builder csv := csv.NewWriter(&out) - tasks := maps.Keys(assessmentsPerTask) - slices.SortStableFunc(tasks, func(a, b task.Identifier) int { - return cmp.Compare(a, b) - }) + if err := csv.Write(formatter.Header()); err != nil { + return "", pkgerrors.WithStack(err) + } - for _, task := range tasks { - assessment := assessmentsPerTask[task] - row := append([]string{model.ID(), model.Name(), strconv.FormatFloat(model.Cost(), 'f', -1, 64), language.ID(), repositoryName, string(task), strconv.FormatUint(uint64(assessment.Score()), 10)}, assessment.StringCSV()...) - csv.Write(row) + for _, row := range formatter.Rows() { + if err := csv.Write(row); err != nil { + return "", pkgerrors.WithStack(err) + } } + csv.Flush() - if _, err := e.Writer.Write([]byte(out.String())); err != nil { - return pkgerrors.WithStack(err) + return out.String(), nil +} + +// WriteCSVs writes the various CSV reports to disk. +func WriteCSVs(resultPath string) (err error) { + evaluationRecords, err := loadEvaluationRecords(filepath.Join(resultPath, "evaluation.csv")) + if err != nil { + return err } - return nil -} + // Write the "models-summed.csv" containing the summary per model. + perModel := evaluationRecords.GroupByModel() + csvByModel, err := generateCSV(perModel) + if err != nil { + return pkgerrors.Wrap(err, "could not create models-summed.csv summary") + } + if err := os.WriteFile(filepath.Join(resultPath, "models-summed.csv"), []byte(csvByModel), 0644); err != nil { + return pkgerrors.Wrap(err, "could not write models-summed.csv summary") + } -// evaluationHeader returns the CSV header for the evaluation CSV. 
-func evaluationHeader() (header []string) { - return append([]string{"model-id", "model-name", "cost", "language", "repository", "task", "score"}, metrics.AllAssessmentKeysStrings...) + // Write the individual "language-summed.csv" containing the summary per model per language. + perLanguage := evaluationRecords.GroupByLanguageAndModel() + for language, modelsByLanguage := range perLanguage { + csvByLanguage, err := generateCSV(modelsByLanguage) + if err != nil { + return pkgerrors.Wrap(err, "could not create "+language+"-summed.csv summary") + } + if err := os.WriteFile(filepath.Join(resultPath, language+"-summed.csv"), []byte(csvByLanguage), 0644); err != nil { + return pkgerrors.Wrap(err, "could not write "+language+"-summed.csv summary") + } + } + + return nil } diff --git a/evaluate/report/csv_test.go b/evaluate/report/csv_test.go index 4195387d..a1e97796 100644 --- a/evaluate/report/csv_test.go +++ b/evaluate/report/csv_test.go @@ -1,12 +1,16 @@ package report import ( + "os" + "path/filepath" "strings" "testing" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/zimmski/osutil" "github.com/zimmski/osutil/bytesutil" + "golang.org/x/exp/maps" "github.com/symflower/eval-dev-quality/evaluate/metrics" evaluatetask "github.com/symflower/eval-dev-quality/evaluate/task" @@ -19,14 +23,14 @@ func TestGenerateCSVForAssessmentPerModel(t *testing.T) { type testCase struct { Name string - Assessments AssessmentPerModel + CSVFormatter CSVFormatter ExpectedString string } validate := func(t *testing.T, tc *testCase) { t.Run(tc.Name, func(t *testing.T) { - actualString, err := GenerateCSV(tc.Assessments) + actualString, err := generateCSV(tc.CSVFormatter) assert.NoError(t, err) assert.Equal(t, bytesutil.StringTrimIndentations(tc.ExpectedString), actualString) @@ -34,47 +38,65 @@ func TestGenerateCSVForAssessmentPerModel(t *testing.T) { } validate(t, &testCase{ - Name: "Single Empty Model", + Name: "Single empty model", - Assessments: 
AssessmentPerModel{ - modeltesting.NewMockModelNamedWithCosts(t, "some-model", "Some Model", 0): {}, + CSVFormatter: EvaluationRecordsPerModel{ + "some-model-a": &EvaluationRecord{ + ModelID: "some-model-a", + ModelName: "Some Model A", + ModelCost: 0.0001, + LanguageID: "golang", + Assessments: metrics.NewAssessments(), + }, }, ExpectedString: ` model-id,model-name,cost,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code - some-model,Some Model,0,0,0,0,0,0,0,0,0,0 + some-model-a,Some Model A,0.0001,0,0,0,0,0,0,0,0,0 `, }) validate(t, &testCase{ - Name: "Multiple Models", + Name: "Multiple models with assessments", - Assessments: AssessmentPerModel{ - modeltesting.NewMockModelNamedWithCosts(t, "some-model-a", "Some Model A", 0.0001): { - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 50, - metrics.AssessmentKeyResponseCharacterCount: 100, - metrics.AssessmentKeyCoverage: 1, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyResponseNoError: 3, - metrics.AssessmentKeyResponseNoExcess: 4, - metrics.AssessmentKeyResponseWithCode: 5, - metrics.AssessmentKeyProcessingTime: 200, + CSVFormatter: EvaluationRecordsPerModel{ + "some-model-a": &EvaluationRecord{ + ModelID: "some-model-a", + ModelName: "Some Model A", + ModelCost: 0.0001, + LanguageID: "golang", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 50, + metrics.AssessmentKeyResponseCharacterCount: 100, + metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyResponseNoError: 3, + metrics.AssessmentKeyResponseNoExcess: 4, + metrics.AssessmentKeyResponseWithCode: 5, + metrics.AssessmentKeyProcessingTime: 200, + }, }, - modeltesting.NewMockModelNamedWithCosts(t, "some-model-b", "Some Model B", 0.0005): { - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 100, - 
metrics.AssessmentKeyResponseCharacterCount: 200, - metrics.AssessmentKeyCoverage: 1, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyResponseNoError: 3, - metrics.AssessmentKeyResponseNoExcess: 4, - metrics.AssessmentKeyResponseWithCode: 5, - metrics.AssessmentKeyProcessingTime: 300, + "some-model-b": &EvaluationRecord{ + ModelID: "some-model-b", + ModelName: "Some Model B", + ModelCost: 0.0003, + LanguageID: "java", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 100, + metrics.AssessmentKeyResponseCharacterCount: 200, + metrics.AssessmentKeyCoverage: 6, + metrics.AssessmentKeyFilesExecuted: 7, + metrics.AssessmentKeyResponseNoError: 8, + metrics.AssessmentKeyResponseNoExcess: 9, + metrics.AssessmentKeyResponseWithCode: 10, + metrics.AssessmentKeyProcessingTime: 400, + }, }, }, ExpectedString: ` model-id,model-name,cost,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code some-model-a,Some Model A,0.0001,15,1,2,50,200,100,3,4,5 - some-model-b,Some Model B,0.0005,15,1,2,100,300,200,3,4,5 + some-model-b,Some Model B,0.0003,40,6,7,100,400,200,8,9,10 `, }) } @@ -154,3 +176,590 @@ func TestWriteEvaluationRecord(t *testing.T) { `, }) } + +func TestLoadEvaluationRecords(t *testing.T) { + type testCase struct { + Name string + + Before func(resultPath string) + + ExpectedEvaluationRecords EvaluationRecords + ExpectedErr func(err error) + } + + validate := func(t *testing.T, tc *testCase) { + t.Run(tc.Name, func(t *testing.T) { + resultPath := t.TempDir() + + if tc.Before != nil { + tc.Before(resultPath) + } + + actualAssessments, actualErr := loadEvaluationRecords(filepath.Join(resultPath, "evaluation.csv")) + + if tc.ExpectedErr != nil { + tc.ExpectedErr(actualErr) + } else { + assert.NoError(t, actualErr) + assert.Equal(t, tc.ExpectedEvaluationRecords, actualAssessments) + } + }) + } + + 
validate(t, &testCase{ + Name: "Evaluation file does not exist", + + ExpectedErr: func(err error) { + if osutil.IsWindows() { + assert.ErrorContains(t, err, "The system cannot find the file specified") + } else { + assert.ErrorContains(t, err, "no such file or directory") + } + }, + }) + validate(t, &testCase{ + Name: "Evaluation file exists but it is empty", + + Before: func(resultPath string) { + file, err := os.Create(filepath.Join(resultPath, "evaluation.csv")) + require.NoError(t, err) + defer file.Close() + }, + + ExpectedErr: func(err error) { + assert.ErrorContains(t, err, "found error while reading evaluation file") + }, + }) + validate(t, &testCase{ + Name: "Evaluation file exists but with the wrong header", + + Before: func(resultPath string) { + header := bytesutil.StringTrimIndentations(` + model-id,model-name,cost + `) + require.NoError(t, os.WriteFile(filepath.Join(resultPath, "evaluation.csv"), []byte(header), 0644)) + }, + + ExpectedErr: func(err error) { + assert.ErrorContains(t, err, "found header [model-id model-name cost]") + }, + }) + validate(t, &testCase{ + Name: "Single assessment", + + Before: func(resultPath string) { + fileContent := bytesutil.StringTrimIndentations(` + model-id,model-name,cost,language,repository,task,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code + openrouter/anthropic/claude-1.2,Claude 1.2,0.0001,golang,golang/light,write-tests,982,750,18,70179,720571,71195,115,49,50 + `) + require.NoError(t, os.WriteFile(filepath.Join(resultPath, "evaluation.csv"), []byte(fileContent), 0644)) + }, + + ExpectedEvaluationRecords: EvaluationRecords{ + &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0001, + LanguageID: "golang", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 750, + metrics.AssessmentKeyFilesExecuted: 18, + 
metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 70179, + metrics.AssessmentKeyProcessingTime: 720571, + metrics.AssessmentKeyResponseCharacterCount: 71195, + metrics.AssessmentKeyResponseNoError: 115, + metrics.AssessmentKeyResponseNoExcess: 49, + metrics.AssessmentKeyResponseWithCode: 50, + }, + }, + }, + }) + validate(t, &testCase{ + Name: "Multiple assessments", + + Before: func(resultPath string) { + fileContent := bytesutil.StringTrimIndentations(` + model-id,model-name,cost,language,repository,task,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code + openrouter/anthropic/claude-1.2,Claude 1.2,0.0001,golang,golang/light,write-tests,982,750,18,70179,720571,71195,115,49,50 + openrouter/anthropic/claude-1.2,Claude 1.2,0.0002,golang,golang/plain,write-tests,37,20,2,441,11042,523,5,5,5 + `) + require.NoError(t, os.WriteFile(filepath.Join(resultPath, "evaluation.csv"), []byte(fileContent), 0644)) + }, + + ExpectedEvaluationRecords: EvaluationRecords{ + &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0001, + LanguageID: "golang", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 750, + metrics.AssessmentKeyFilesExecuted: 18, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 70179, + metrics.AssessmentKeyProcessingTime: 720571, + metrics.AssessmentKeyResponseCharacterCount: 71195, + metrics.AssessmentKeyResponseNoError: 115, + metrics.AssessmentKeyResponseNoExcess: 49, + metrics.AssessmentKeyResponseWithCode: 50, + }, + }, + &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0002, + LanguageID: "golang", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 20, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 441, + 
metrics.AssessmentKeyProcessingTime: 11042, + metrics.AssessmentKeyResponseCharacterCount: 523, + metrics.AssessmentKeyResponseNoError: 5, + metrics.AssessmentKeyResponseNoExcess: 5, + metrics.AssessmentKeyResponseWithCode: 5, + }, + }, + }, + }) +} + +func TestEvaluationRecordsGroupByModel(t *testing.T) { + type testCase struct { + Name string + + EvaluationRecords EvaluationRecords + + ExpectedEvaluationRecords map[string]*EvaluationRecord + } + + validate := func(t *testing.T, tc *testCase) { + t.Run(tc.Name, func(t *testing.T) { + actualEvaluationRecords := tc.EvaluationRecords.GroupByModel() + + expected, actual := maps.Clone(tc.ExpectedEvaluationRecords), maps.Clone(actualEvaluationRecords) + + assert.ElementsMatch(t, maps.Keys(expected), maps.Keys(actual)) + + for modelID, expectedRecord := range expected { + actualRecord := actual[modelID] + assert.Equal(t, expectedRecord, actualRecord) + assert.Truef(t, expectedRecord.Assessments.Equal(actualRecord.Assessments), "model:%s\nexpected:%s\nactual:%s", modelID, expected, actual) + } + }) + } + + validate(t, &testCase{ + Name: "Single record", + + EvaluationRecords: EvaluationRecords{ + &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0001, + LanguageID: "golang", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, + metrics.AssessmentKeyProcessingTime: 4, + metrics.AssessmentKeyResponseCharacterCount: 5, + metrics.AssessmentKeyResponseNoError: 6, + metrics.AssessmentKeyResponseNoExcess: 7, + metrics.AssessmentKeyResponseWithCode: 8, + }, + }, + }, + ExpectedEvaluationRecords: map[string]*EvaluationRecord{ + "openrouter/anthropic/claude-1.2": &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0001, + LanguageID: "golang", + Assessments: metrics.Assessments{ + 
metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, + metrics.AssessmentKeyProcessingTime: 4, + metrics.AssessmentKeyResponseCharacterCount: 5, + metrics.AssessmentKeyResponseNoError: 6, + metrics.AssessmentKeyResponseNoExcess: 7, + metrics.AssessmentKeyResponseWithCode: 8, + }, + }, + }, + }) + validate(t, &testCase{ + Name: "Multiple records", + + EvaluationRecords: EvaluationRecords{ + &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0001, + LanguageID: "golang", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, + metrics.AssessmentKeyProcessingTime: 4, + metrics.AssessmentKeyResponseCharacterCount: 5, + metrics.AssessmentKeyResponseNoError: 6, + metrics.AssessmentKeyResponseNoExcess: 7, + metrics.AssessmentKeyResponseWithCode: 8, + }, + }, + &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0002, + LanguageID: "golang", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, + metrics.AssessmentKeyProcessingTime: 4, + metrics.AssessmentKeyResponseCharacterCount: 5, + metrics.AssessmentKeyResponseNoError: 6, + metrics.AssessmentKeyResponseNoExcess: 7, + metrics.AssessmentKeyResponseWithCode: 8, + }, + }, + &EvaluationRecord{ + ModelID: "ollama/codeqwen:latest", + ModelName: "Code Qwen", + ModelCost: 0.0003, + LanguageID: "java", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, + metrics.AssessmentKeyProcessingTime: 4, + metrics.AssessmentKeyResponseCharacterCount: 5, + 
metrics.AssessmentKeyResponseNoError: 6, + metrics.AssessmentKeyResponseNoExcess: 7, + metrics.AssessmentKeyResponseWithCode: 8, + }, + }, + }, + ExpectedEvaluationRecords: map[string]*EvaluationRecord{ + "openrouter/anthropic/claude-1.2": &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0001, + LanguageID: "golang", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 2, + metrics.AssessmentKeyFilesExecuted: 4, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 6, + metrics.AssessmentKeyProcessingTime: 8, + metrics.AssessmentKeyResponseCharacterCount: 10, + metrics.AssessmentKeyResponseNoError: 12, + metrics.AssessmentKeyResponseNoExcess: 14, + metrics.AssessmentKeyResponseWithCode: 16, + }, + }, + "ollama/codeqwen:latest": &EvaluationRecord{ + ModelID: "ollama/codeqwen:latest", + ModelName: "Code Qwen", + ModelCost: 0.0003, + LanguageID: "java", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, + metrics.AssessmentKeyProcessingTime: 4, + metrics.AssessmentKeyResponseCharacterCount: 5, + metrics.AssessmentKeyResponseNoError: 6, + metrics.AssessmentKeyResponseNoExcess: 7, + metrics.AssessmentKeyResponseWithCode: 8, + }, + }, + }, + }) +} + +func TestEvaluationRecordsGroupByLanguageAndModel(t *testing.T) { + type testCase struct { + Name string + + EvaluationRecords EvaluationRecords + + ExpectedEvaluationRecordsPerLanguagePerModel EvaluationRecordsPerLanguagePerModel + } + + validate := func(t *testing.T, tc *testCase) { + t.Run(tc.Name, func(t *testing.T) { + actualEvaluationRecordsPerLanguagePerModel := tc.EvaluationRecords.GroupByLanguageAndModel() + + assert.Equal(t, tc.ExpectedEvaluationRecordsPerLanguagePerModel, actualEvaluationRecordsPerLanguagePerModel) + }) + } + + validate(t, &testCase{ + Name: "Single record without assessments", + + 
EvaluationRecords: EvaluationRecords{ + &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0001, + LanguageID: "golang", + Assessments: metrics.NewAssessments(), + }, + }, + + ExpectedEvaluationRecordsPerLanguagePerModel: EvaluationRecordsPerLanguagePerModel{ + "golang": EvaluationRecordsPerModel{ + "openrouter/anthropic/claude-1.2": &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0001, + LanguageID: "golang", + Assessments: metrics.NewAssessments(), + }, + }, + }, + }) + validate(t, &testCase{ + Name: "Multiple records", + + EvaluationRecords: EvaluationRecords{ + &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0001, + LanguageID: "golang", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, + metrics.AssessmentKeyProcessingTime: 4, + metrics.AssessmentKeyResponseCharacterCount: 5, + metrics.AssessmentKeyResponseNoError: 6, + metrics.AssessmentKeyResponseNoExcess: 7, + metrics.AssessmentKeyResponseWithCode: 8, + }, + }, + &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0001, + LanguageID: "golang", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, + metrics.AssessmentKeyProcessingTime: 4, + metrics.AssessmentKeyResponseCharacterCount: 5, + metrics.AssessmentKeyResponseNoError: 6, + metrics.AssessmentKeyResponseNoExcess: 7, + metrics.AssessmentKeyResponseWithCode: 8, + }, + }, + &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0001, + LanguageID: "java", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 1, + 
metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, + metrics.AssessmentKeyProcessingTime: 4, + metrics.AssessmentKeyResponseCharacterCount: 5, + metrics.AssessmentKeyResponseNoError: 6, + metrics.AssessmentKeyResponseNoExcess: 7, + metrics.AssessmentKeyResponseWithCode: 8, + }, + }, + &EvaluationRecord{ + ModelID: "ollama/codeqwen:latest", + ModelName: "Code Qwen", + ModelCost: 0.0003, + LanguageID: "golang", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, + metrics.AssessmentKeyProcessingTime: 4, + metrics.AssessmentKeyResponseCharacterCount: 5, + metrics.AssessmentKeyResponseNoError: 6, + metrics.AssessmentKeyResponseNoExcess: 7, + metrics.AssessmentKeyResponseWithCode: 8, + }, + }, + &EvaluationRecord{ + ModelID: "ollama/codeqwen:latest", + ModelName: "Code Qwen", + ModelCost: 0.0003, + LanguageID: "java", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, + metrics.AssessmentKeyProcessingTime: 4, + metrics.AssessmentKeyResponseCharacterCount: 5, + metrics.AssessmentKeyResponseNoError: 6, + metrics.AssessmentKeyResponseNoExcess: 7, + metrics.AssessmentKeyResponseWithCode: 8, + }, + }, + }, + + ExpectedEvaluationRecordsPerLanguagePerModel: EvaluationRecordsPerLanguagePerModel{ + "golang": EvaluationRecordsPerModel{ + "openrouter/anthropic/claude-1.2": &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0001, + LanguageID: "golang", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 2, + metrics.AssessmentKeyFilesExecuted: 4, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 6, + metrics.AssessmentKeyProcessingTime: 8, + metrics.AssessmentKeyResponseCharacterCount: 10, + 
metrics.AssessmentKeyResponseNoError: 12, + metrics.AssessmentKeyResponseNoExcess: 14, + metrics.AssessmentKeyResponseWithCode: 16, + }, + }, + "ollama/codeqwen:latest": &EvaluationRecord{ + ModelID: "ollama/codeqwen:latest", + ModelName: "Code Qwen", + ModelCost: 0.0003, + LanguageID: "golang", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, + metrics.AssessmentKeyProcessingTime: 4, + metrics.AssessmentKeyResponseCharacterCount: 5, + metrics.AssessmentKeyResponseNoError: 6, + metrics.AssessmentKeyResponseNoExcess: 7, + metrics.AssessmentKeyResponseWithCode: 8, + }, + }, + }, + "java": EvaluationRecordsPerModel{ + "openrouter/anthropic/claude-1.2": &EvaluationRecord{ + ModelID: "openrouter/anthropic/claude-1.2", + ModelName: "Claude 1.2", + ModelCost: 0.0001, + LanguageID: "java", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, + metrics.AssessmentKeyProcessingTime: 4, + metrics.AssessmentKeyResponseCharacterCount: 5, + metrics.AssessmentKeyResponseNoError: 6, + metrics.AssessmentKeyResponseNoExcess: 7, + metrics.AssessmentKeyResponseWithCode: 8, + }, + }, + "ollama/codeqwen:latest": &EvaluationRecord{ + ModelID: "ollama/codeqwen:latest", + ModelName: "Code Qwen", + ModelCost: 0.0003, + LanguageID: "java", + Assessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, + metrics.AssessmentKeyProcessingTime: 4, + metrics.AssessmentKeyResponseCharacterCount: 5, + metrics.AssessmentKeyResponseNoError: 6, + metrics.AssessmentKeyResponseNoExcess: 7, + metrics.AssessmentKeyResponseWithCode: 8, + }, + }, + }, + }, + }) + +} + +func TestWriteCSVs(t *testing.T) { + type testCase struct { + Name string + + FileName 
string + + ExpectedFileContent string + } + + resultPath := t.TempDir() + + evaluationFilePath := filepath.Join(resultPath, "evaluation.csv") + evaluationFileContent := bytesutil.StringTrimIndentations(` + model-id,model-name,cost,language,repository,task,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code + openrouter/anthropic/claude-2.0,Claude 2.0,0.001,golang,golang/light,write-tests,24,1,2,3,4,5,6,7,8 + openrouter/anthropic/claude-2.0,Claude 2.0,0.001,golang,golang/plain,write-tests,24,1,2,3,4,5,6,7,8 + openrouter/anthropic/claude-2.0,Claude 2.0,0.001,java,java/light,write-tests,69,10,11,12,13,14,15,16,17 + openrouter/anthropic/claude-2.0,Claude 2.0,0.001,java,java/plain,write-tests,69,10,11,12,13,14,15,16,17 + openrouter/anthropic/claude-3-sonnet,Claude 3 Sonnet,0.003,golang,golang/light,write-tests,21,8,7,6,5,4,3,2,1 + openrouter/anthropic/claude-3-sonnet,Claude 3 Sonnet,0.003,golang,golang/plain,write-tests,21,8,7,6,5,4,3,2,1 + openrouter/anthropic/claude-3-sonnet,Claude 3 Sonnet,0.003,java,java/light,write-tests,69,10,11,12,13,14,15,16,17 + openrouter/anthropic/claude-3-sonnet,Claude 3 Sonnet,0.003,java,java/plain,write-tests,69,10,11,12,13,14,15,16,17 + openrouter/openai/gpt-4,GPT 4,0.005,golang,golang/light,write-tests,24,1,2,3,4,5,6,7,8 + openrouter/openai/gpt-4,GPT 4,0.005,golang,golang/plain,write-tests,24,1,2,3,4,5,6,7,8 + openrouter/openai/gpt-4,GPT 4,0.005,java,java/light,write-tests,24,1,2,3,4,5,6,7,8 + openrouter/openai/gpt-4,GPT 4,0.005,java,java/plain,write-tests,24,1,2,3,4,5,6,7,8 + `) + require.NoError(t, os.WriteFile(evaluationFilePath, []byte(evaluationFileContent), 0644)) + + err := WriteCSVs(resultPath) + require.NoError(t, err) + + validate := func(t *testing.T, tc *testCase) { + t.Run(tc.Name, func(t *testing.T) { + summedFilePath := filepath.Join(resultPath, tc.FileName) + + _, err = os.Stat(summedFilePath) + 
require.NoError(t, err) + + actualSummedFileContent, err := os.ReadFile(summedFilePath) + require.NoError(t, err) + + assert.Equal(t, bytesutil.StringTrimIndentations(tc.ExpectedFileContent), string(actualSummedFileContent)) + }) + } + + validate(t, &testCase{ + Name: "Models summed", + + FileName: "models-summed.csv", + + ExpectedFileContent: ` + model-id,model-name,cost,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code + openrouter/anthropic/claude-2.0,Claude 2.0,0.001,186,22,26,30,34,38,42,46,50 + openrouter/anthropic/claude-3-sonnet,Claude 3 Sonnet,0.003,180,36,36,36,36,36,36,36,36 + openrouter/openai/gpt-4,GPT 4,0.005,96,4,8,12,16,20,24,28,32 + `, + }) + validate(t, &testCase{ + Name: "Golang summed", + + FileName: "golang-summed.csv", + + ExpectedFileContent: ` + model-id,model-name,cost,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code + openrouter/anthropic/claude-2.0,Claude 2.0,0.001,48,2,4,6,8,10,12,14,16 + openrouter/anthropic/claude-3-sonnet,Claude 3 Sonnet,0.003,42,16,14,12,10,8,6,4,2 + openrouter/openai/gpt-4,GPT 4,0.005,48,2,4,6,8,10,12,14,16 + `, + }) + validate(t, &testCase{ + Name: "Java summed", + + FileName: "java-summed.csv", + + ExpectedFileContent: ` + model-id,model-name,cost,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code + openrouter/anthropic/claude-2.0,Claude 2.0,0.001,138,20,22,24,26,28,30,32,34 + openrouter/anthropic/claude-3-sonnet,Claude 3 Sonnet,0.003,138,20,22,24,26,28,30,32,34 + openrouter/openai/gpt-4,GPT 4,0.005,48,2,4,6,8,10,12,14,16 + `, + }) +}