From aa44c7ad71ea94f2025c635808fd62a580926fcb Mon Sep 17 00:00:00 2001 From: Rui Azevedo Date: Wed, 10 Jul 2024 07:22:44 +0100 Subject: [PATCH] Remove all occurrences of model costs and human-readable names, since they will be handled afterwards with tooling Part of #237 --- cmd/eval-dev-quality/cmd/evaluate.go | 5 +- cmd/eval-dev-quality/cmd/evaluate_test.go | 4 +- evaluate/evaluate_test.go | 18 +-- evaluate/report/csv.go | 25 ++-- evaluate/report/csv_test.go | 136 +++++++--------------- model/llm/llm.go | 33 ------ model/model.go | 5 - model/symflower/symflower.go | 10 -- model/testing/Model_mock_gen.go | 36 ------ model/testing/helper.go | 18 --- provider/ollama/ollama.go | 2 +- provider/openrouter/openrouter.go | 74 ++---------- 12 files changed, 73 insertions(+), 293 deletions(-) diff --git a/cmd/eval-dev-quality/cmd/evaluate.go b/cmd/eval-dev-quality/cmd/evaluate.go index 6d590b77..fdc714f1 100644 --- a/cmd/eval-dev-quality/cmd/evaluate.go +++ b/cmd/eval-dev-quality/cmd/evaluate.go @@ -232,8 +232,7 @@ func (command *Evaluate) Initialize(args []string) (evaluationContext *evaluate. command.logger.Panicf("ERROR: unknown custom provider %q for model %q", providerID, model) } - modelName := strings.Split(model, provider.ProviderModelSeparator) - modelProvider.AddModel(llm.NewNamedModelWithCost(modelProvider, model, modelName[len(modelName)-1], 0)) + modelProvider.AddModel(llm.NewModel(modelProvider, model)) } } @@ -433,7 +432,7 @@ func (command *Evaluate) evaluateLocal(evaluationContext *evaluate.Context) (err } _ = assessmentsPerModel.WalkByScore(func(model model.Model, assessment metrics.Assessments, score uint64) (err error) { - command.logger.Printf("Evaluation score for %q (%q): cost=%.2f, %s", model.ID(), assessment.Category(totalScore).ID, model.Cost(), assessment) + command.logger.Printf("Evaluation score for %q (%q): %s", model.ID(), assessment.Category(totalScore).ID, assessment) return nil }) diff --git a/cmd/eval-dev-quality/cmd/evaluate_test.go b/cmd/eval-dev-quality/cmd/evaluate_test.go index 6d033e78..a390b6dd 100644 --- a/cmd/eval-dev-quality/cmd/evaluate_test.go +++ b/cmd/eval-dev-quality/cmd/evaluate_test.go @@ -60,7 +60,7 @@ var extractMetricsLogsMatch = extractMetricsMatch(regexp.MustCompile(`score=(\d+ // extractMetricsCSVMatch is a regular expression to extract metrics from CSV rows. // REMARK The cost is not match as a group since it's just a model property that we carry along for informational purposes. -var extractMetricsCSVMatch = extractMetricsMatch(regexp.MustCompile(`(?:\d+(?:\.\d+)?,)?(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+)`)) +var extractMetricsCSVMatch = extractMetricsMatch(regexp.MustCompile(`(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+)`)) // extractMetrics extracts multiple assessment metrics from the given string according to a given regular expression. func extractMetrics(t *testing.T, regex extractMetricsMatch, data string) (assessments []metrics.Assessments, scores []uint64) { @@ -532,7 +532,7 @@ func TestEvaluateExecute(t *testing.T) { }, ExpectedOutputValidate: func(t *testing.T, output string, resultPath string) { - assert.Regexp(t, `Evaluation score for "symflower/symbolic-execution" \("code-no-excess"\): cost=0.00, score=28, coverage=20, files-executed=2, files-executed-maximum-reachable=2, generate-tests-for-file-character-count=508, processing-time=\d+, response-character-count=508, response-no-error=2, response-no-excess=2, response-with-code=2`, output) + assert.Regexp(t, `Evaluation score for "symflower/symbolic-execution" \("code-no-excess"\): score=28, coverage=20, files-executed=2, files-executed-maximum-reachable=2, generate-tests-for-file-character-count=508, processing-time=\d+, response-character-count=508, response-no-error=2, response-no-excess=2, response-with-code=2`, output) assert.Equal(t, 1, strings.Count(output, "Evaluation score for")) }, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ diff --git a/evaluate/evaluate_test.go b/evaluate/evaluate_test.go index 3c9440e6..fa0ab8cb 100644 --- a/evaluate/evaluate_test.go +++ b/evaluate/evaluate_test.go @@ -171,7 +171,7 @@ func TestEvaluate(t *testing.T) { { languageGolang := &golang.Language{} - mockedModel := modeltesting.NewMockCapabilityWriteTestsNamedWithCost(t, "empty-response-model", "Empty Response Model", 0.0001) + mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, "empty-response-model") repositoryPath := filepath.Join("golang", "plain") validate(t, &testCase{ @@ -443,7 +443,7 @@ func TestEvaluate(t *testing.T) { { languageGolang := &golang.Language{} mockedModelID := "mocked-generation-model" - mockedModel := modeltesting.NewMockCapabilityWriteTestsNamedWithCost(t, mockedModelID, "Mocked Generation Model", 0.0001) + mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) validate(t, &testCase{ Name: "Problems of previous runs shouldn't cancel successive runs", @@ -544,7 +544,7 @@ func TestEvaluate(t *testing.T) { { languageGolang := &golang.Language{} mockedModelID := "mocked-generation-model" - mockedModel := modeltesting.NewMockCapabilityWriteTestsNamedWithCost(t, mockedModelID, "Mocked Generation Model", 0.0001) + mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) validate(t, &testCase{ Name: "Solving basic checks once is enough", @@ -644,7 +644,7 @@ func TestEvaluate(t *testing.T) { { languageGolang := &golang.Language{} mockedModelID := "mocked-generation-model" - mockedModel := modeltesting.NewMockCapabilityWriteTestsNamedWithCost(t, mockedModelID, "Mocked Generation Model", 0.0001) + mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) validate(t, &testCase{ Name: "Never solving basic checks leads to exclusion", @@ -714,7 +714,7 @@ func TestEvaluate(t *testing.T) { { languageGolang := &golang.Language{} mockedModelID := "mocked-generation-model" - mockedModel := modeltesting.NewMockCapabilityWriteTestsNamedWithCost(t, mockedModelID, "Mocked Generation Model", 0.0001) + mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) repositoryPath := filepath.Join("golang", "plain") validate(t, &testCase{ @@ -785,7 +785,7 @@ func TestEvaluate(t *testing.T) { { languageGolang := &golang.Language{} mockedModelID := "mocked-generation-model" - mockedModel := modeltesting.NewMockCapabilityWriteTestsNamedWithCost(t, mockedModelID, "Mocked Generation Model", 0.0001) + mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) repositoryPath := filepath.Join("golang", "plain") validate(t, &testCase{ @@ -864,7 +864,7 @@ func TestEvaluate(t *testing.T) { // Setup provider and model mocking. languageGolang := &golang.Language{} mockedModelID := "testing-provider/testing-model" - mockedModel := modeltesting.NewMockCapabilityWriteTestsNamedWithCost(t, mockedModelID, "Testing Model", 0.0001) + mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) mockedProviderID := "testing-provider" mockedProvider := providertesting.NewMockProviderNamedWithModels(t, mockedProviderID, []model.Model{mockedModel}) mockedLoader := providertesting.NewMockLoader(t) @@ -949,7 +949,7 @@ func TestEvaluate(t *testing.T) { // Setup provider and model mocking. languageGolang := &golang.Language{} mockedModelID := "testing-provider/testing-model" - mockedModel := modeltesting.NewMockCapabilityWriteTestsNamedWithCost(t, mockedModelID, "Testing Model", 0.0001) + mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) mockedProviderID := "testing-provider" mockedProvider := providertesting.NewMockProviderNamedWithModels(t, mockedProviderID, []model.Model{mockedModel}) mockedLoader := providertesting.NewMockLoader(t) @@ -1033,7 +1033,7 @@ func TestEvaluate(t *testing.T) { // Setup provider and model mocking. languageGolang := &golang.Language{} mockedModelID := "testing-provider/testing-model" - mockedModel := modeltesting.NewMockCapabilityWriteTestsNamedWithCost(t, mockedModelID, "Testing Model", 0.0001) + mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) repositoryPath := filepath.Join("golang", "plain") diff --git a/evaluate/report/csv.go b/evaluate/report/csv.go index 17e44615..d60d4871 100644 --- a/evaluate/report/csv.go +++ b/evaluate/report/csv.go @@ -60,7 +60,7 @@ func (e *EvaluationFile) WriteEvaluationRecord(model model.Model, language langu for _, task := range tasks { assessment := assessmentsPerTask[task] - row := append([]string{model.ID(), model.Name(), strconv.FormatFloat(model.Cost(), 'f', -1, 64), language.ID(), repositoryName, string(task), strconv.FormatUint(uint64(assessment.Score()), 10)}, assessment.StringCSV()...) + row := append([]string{model.ID(), language.ID(), repositoryName, string(task), strconv.FormatUint(uint64(assessment.Score()), 10)}, assessment.StringCSV()...) csv.Write(row) } csv.Flush() @@ -70,17 +70,13 @@ func (e *EvaluationFile) WriteEvaluationRecord(model model.Model, language langu // evaluationHeader returns the CSV header for the evaluation CSV. func evaluationHeader() (header []string) { - return append([]string{"model-id", "model-name", "cost", "language", "repository", "task", "score"}, metrics.AllAssessmentKeysStrings...) + return append([]string{"model-id", "language", "repository", "task", "score"}, metrics.AllAssessmentKeysStrings...) } // EvaluationRecord holds a line of the evaluation CSV. type EvaluationRecord struct { // ModelID holds the model id. ModelID string - // ModelName holds the model name. - ModelName string - // ModelCost holds the model cost. - ModelCost float64 // LanguageID holds the language id. LanguageID string @@ -94,8 +90,6 @@ func (e *EvaluationRecord) Clone() (new *EvaluationRecord) { new = &EvaluationRecord{} new.ModelID = e.ModelID - new.ModelName = e.ModelName - new.ModelCost = e.ModelCost new.LanguageID = e.LanguageID new.Assessments = metrics.Merge(e.Assessments, nil) @@ -127,7 +121,7 @@ func (e EvaluationRecords) GroupByModel() EvaluationRecordsPerModel { // Header returns the header description as a CSV row. func (EvaluationRecordsPerModel) Header() (header []string) { - return append([]string{"model-id", "model-name", "cost", "score"}, metrics.AllAssessmentKeysStrings...) + return append([]string{"model-id", "score"}, metrics.AllAssessmentKeysStrings...) } // Rows returns all data as CSV rows. @@ -141,9 +135,8 @@ func (e EvaluationRecordsPerModel) Rows() (rows [][]string) { record := e[model] metrics := record.Assessments.StringCSV() score := record.Assessments.Score() - modelCost := record.ModelCost - row := append([]string{record.ModelID, record.ModelName, strconv.FormatFloat(modelCost, 'f', -1, 64), strconv.FormatUint(uint64(score), 10)}, metrics...) + row := append([]string{record.ModelID, strconv.FormatUint(uint64(score), 10)}, metrics...) rows = append(rows, row) } @@ -217,15 +210,13 @@ func convertRawRecordToEvaluationRecord(raw []string) (record *EvaluationRecord, assessments := metrics.NewAssessments() modelID := raw[0] - modelName := raw[1] - modelCost, err := strconv.ParseFloat(raw[2], 64) if err != nil { return nil, pkgerrors.WithStack(err) } - languageID := raw[3] + languageID := raw[1] - rawMetrics := raw[7:] + rawMetrics := raw[5:] for i, assessementKey := range metrics.AllAssessmentKeysStrings { metric, err := strconv.ParseUint(rawMetrics[i], 10, 64) if err != nil { @@ -236,9 +227,7 @@ func convertRawRecordToEvaluationRecord(raw []string) (record *EvaluationRecord, } return &EvaluationRecord{ - ModelID: modelID, - ModelName: modelName, - ModelCost: modelCost, + ModelID: modelID, LanguageID: languageID, diff --git a/evaluate/report/csv_test.go b/evaluate/report/csv_test.go index 4e65f9c5..dcd4931b 100644 --- a/evaluate/report/csv_test.go +++ b/evaluate/report/csv_test.go @@ -43,16 +43,14 @@ func TestGenerateCSVForAssessmentPerModel(t *testing.T) { CSVFormatter: EvaluationRecordsPerModel{ "some-model-a": &EvaluationRecord{ ModelID: "some-model-a", - ModelName: "Some Model A", - ModelCost: 0.0001, LanguageID: "golang", Assessments: metrics.NewAssessments(), }, }, ExpectedString: ` - model-id,model-name,cost,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code - some-model-a,Some Model A,0.0001,0,0,0,0,0,0,0,0,0,0 + model-id,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code + some-model-a,0,0,0,0,0,0,0,0,0,0 `, }) validate(t, &testCase{ @@ -61,8 +59,6 @@ func TestGenerateCSVForAssessmentPerModel(t *testing.T) { CSVFormatter: EvaluationRecordsPerModel{ "some-model-a": &EvaluationRecord{ ModelID: "some-model-a", - ModelName: "Some Model A", - ModelCost: 0.0001, LanguageID: "golang", Assessments: metrics.Assessments{ metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 50, @@ -78,8 +74,6 @@ func TestGenerateCSVForAssessmentPerModel(t *testing.T) { }, "some-model-b": &EvaluationRecord{ ModelID: "some-model-b", - ModelName: "Some Model B", - ModelCost: 0.0003, LanguageID: "java", Assessments: metrics.Assessments{ metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 100, @@ -96,9 +90,9 @@ func TestGenerateCSVForAssessmentPerModel(t *testing.T) { }, ExpectedString: ` - model-id,model-name,cost,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code - some-model-a,Some Model A,0.0001,15,1,2,2,50,200,100,3,4,5 - some-model-b,Some Model B,0.0003,40,6,7,7,100,400,200,8,9,10 + model-id,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code + some-model-a,15,1,2,2,50,200,100,3,4,5 + some-model-b,40,6,7,7,100,400,200,8,9,10 `, }) } @@ -112,7 +106,7 @@ func TestNewEvaluationFile(t *testing.T) { require.NoError(t, err) expectedEvaluationFileContent := bytesutil.StringTrimIndentations(` - model-id,model-name,cost,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code + model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code `) assert.Equal(t, expectedEvaluationFileContent, string(actualEvaluationFileContent)) @@ -133,7 +127,7 @@ func TestWriteEvaluationRecord(t *testing.T) { evaluationFile, err := NewEvaluationFile(&file) require.NoError(t, err) - modelMock := modeltesting.NewMockModelNamedWithCosts(t, "mocked-model", "Mocked Model", 0.0001) + modelMock := modeltesting.NewMockModelNamed(t, "mocked-model") languageMock := languagetesting.NewMockLanguageNamed(t, "golang") err = evaluationFile.WriteEvaluationRecord(modelMock, languageMock, "golang/plain", tc.Assessments) @@ -151,8 +145,8 @@ func TestWriteEvaluationRecord(t *testing.T) { }, ExpectedCSV: ` - model-id,model-name,cost,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code - mocked-model,Mocked Model,0.0001,golang,golang/plain,write-tests,0,0,0,0,0,0,0,0,0,0 + model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code + mocked-model,golang,golang/plain,write-tests,0,0,0,0,0,0,0,0,0,0 `, }) validate(t, &testCase{ @@ -174,9 +168,9 @@ func TestWriteEvaluationRecord(t *testing.T) { }, ExpectedCSV: ` - model-id,model-name,cost,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code - mocked-model,Mocked Model,0.0001,golang,golang/plain,write-tests,2,0,1,1,0,0,0,1,0,0 - mocked-model,Mocked Model,0.0001,golang,golang/plain,write-tests-symflower-fix,12,10,1,1,0,0,0,1,0,0 + model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code + mocked-model,golang,golang/plain,write-tests,2,0,1,1,0,0,0,1,0,0 + mocked-model,golang,golang/plain,write-tests-symflower-fix,12,10,1,1,0,0,0,1,0,0 `, }) } @@ -239,13 +233,13 @@ func TestLoadEvaluationRecords(t *testing.T) { Before: func(resultPath string) { header := bytesutil.StringTrimIndentations(` - model-id,model-name,cost + model-id `) require.NoError(t, os.WriteFile(filepath.Join(resultPath, "evaluation.csv"), []byte(header), 0644)) }, ExpectedErr: func(err error) { - assert.ErrorContains(t, err, "found header [model-id model-name cost]") + assert.ErrorContains(t, err, "found header [model-id]") }, }) validate(t, &testCase{ @@ -253,8 +247,8 @@ func TestLoadEvaluationRecords(t *testing.T) { Before: func(resultPath string) { fileContent := bytesutil.StringTrimIndentations(` - model-id,model-name,cost,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code - openrouter/anthropic/claude-1.2,Claude 1.2,0.0001,golang,golang/light,write-tests,982,750,18,18,70179,720571,71195,115,49,50 + model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code + openrouter/anthropic/claude-1.2,golang,golang/light,write-tests,982,750,18,18,70179,720571,71195,115,49,50 `) require.NoError(t, os.WriteFile(filepath.Join(resultPath, "evaluation.csv"), []byte(fileContent), 0644)) }, @@ -262,8 +256,6 @@ func TestLoadEvaluationRecords(t *testing.T) { ExpectedEvaluationRecords: EvaluationRecords{ &EvaluationRecord{ ModelID: "openrouter/anthropic/claude-1.2", - ModelName: "Claude 1.2", - ModelCost: 0.0001, LanguageID: "golang", Assessments: metrics.Assessments{ metrics.AssessmentKeyCoverage: 750, @@ -284,9 +276,9 @@ func TestLoadEvaluationRecords(t *testing.T) { Before: func(resultPath string) { fileContent := bytesutil.StringTrimIndentations(` - model-id,model-name,cost,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code - openrouter/anthropic/claude-1.2,Claude 1.2,0.0001,golang,golang/light,write-tests,982,750,18,18,70179,720571,71195,115,49,50 - openrouter/anthropic/claude-1.2,Claude 1.2,0.0002,golang,golang/plain,write-tests,37,20,2,2,441,11042,523,5,5,5 + model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code + openrouter/anthropic/claude-1.2,golang,golang/light,write-tests,982,750,18,18,70179,720571,71195,115,49,50 + openrouter/anthropic/claude-1.2,golang,golang/plain,write-tests,37,20,2,2,441,11042,523,5,5,5 `) require.NoError(t, os.WriteFile(filepath.Join(resultPath, "evaluation.csv"), []byte(fileContent), 0644)) }, @@ -294,8 +286,6 @@ func TestLoadEvaluationRecords(t *testing.T) { ExpectedEvaluationRecords: EvaluationRecords{ &EvaluationRecord{ ModelID: "openrouter/anthropic/claude-1.2", - ModelName: "Claude 1.2", - ModelCost: 0.0001, LanguageID: "golang", Assessments: metrics.Assessments{ metrics.AssessmentKeyCoverage: 750, @@ -311,8 +301,6 @@ func TestLoadEvaluationRecords(t *testing.T) { }, &EvaluationRecord{ ModelID: "openrouter/anthropic/claude-1.2", - ModelName: "Claude 1.2", - ModelCost: 0.0002, LanguageID: "golang", Assessments: metrics.Assessments{ metrics.AssessmentKeyCoverage: 20, @@ -359,8 +347,6 @@ func TestEvaluationRecordsGroupByModel(t *testing.T) { EvaluationRecords: EvaluationRecords{ &EvaluationRecord{ ModelID: "openrouter/anthropic/claude-1.2", - ModelName: "Claude 1.2", - ModelCost: 0.0001, LanguageID: "golang", Assessments: metrics.Assessments{ metrics.AssessmentKeyCoverage: 1, @@ -378,8 +364,6 @@ func TestEvaluationRecordsGroupByModel(t *testing.T) { ExpectedEvaluationRecords: map[string]*EvaluationRecord{ "openrouter/anthropic/claude-1.2": &EvaluationRecord{ ModelID: "openrouter/anthropic/claude-1.2", - ModelName: "Claude 1.2", - ModelCost: 0.0001, LanguageID: "golang", Assessments: metrics.Assessments{ metrics.AssessmentKeyCoverage: 1, @@ -401,8 +385,6 @@ func TestEvaluationRecordsGroupByModel(t *testing.T) { EvaluationRecords: EvaluationRecords{ &EvaluationRecord{ ModelID: "openrouter/anthropic/claude-1.2", - ModelName: "Claude 1.2", - ModelCost: 0.0001, LanguageID: "golang", Assessments: metrics.Assessments{ metrics.AssessmentKeyCoverage: 1, @@ -418,8 +400,6 @@ func TestEvaluationRecordsGroupByModel(t *testing.T) { }, &EvaluationRecord{ ModelID: "openrouter/anthropic/claude-1.2", - ModelName: "Claude 1.2", - ModelCost: 0.0002, LanguageID: "golang", Assessments: metrics.Assessments{ metrics.AssessmentKeyCoverage: 1, @@ -435,8 +415,6 @@ func TestEvaluationRecordsGroupByModel(t *testing.T) { }, &EvaluationRecord{ ModelID: "ollama/codeqwen:latest", - ModelName: "Code Qwen", - ModelCost: 0.0003, LanguageID: "java", Assessments: metrics.Assessments{ metrics.AssessmentKeyCoverage: 1, @@ -454,8 +432,6 @@ func TestEvaluationRecordsGroupByModel(t *testing.T) { ExpectedEvaluationRecords: map[string]*EvaluationRecord{ "openrouter/anthropic/claude-1.2": &EvaluationRecord{ ModelID: "openrouter/anthropic/claude-1.2", - ModelName: "Claude 1.2", - ModelCost: 0.0001, LanguageID: "golang", Assessments: metrics.Assessments{ metrics.AssessmentKeyCoverage: 2, @@ -471,8 +447,6 @@ func TestEvaluationRecordsGroupByModel(t *testing.T) { }, "ollama/codeqwen:latest": &EvaluationRecord{ ModelID: "ollama/codeqwen:latest", - ModelName: "Code Qwen", - ModelCost: 0.0003, LanguageID: "java", Assessments: metrics.Assessments{ metrics.AssessmentKeyCoverage: 1, @@ -513,8 +487,6 @@ func TestEvaluationRecordsGroupByLanguageAndModel(t *testing.T) { EvaluationRecords: EvaluationRecords{ &EvaluationRecord{ ModelID: "openrouter/anthropic/claude-1.2", - ModelName: "Claude 1.2", - ModelCost: 0.0001, LanguageID: "golang", Assessments: metrics.NewAssessments(), }, @@ -524,8 +496,6 @@ func TestEvaluationRecordsGroupByLanguageAndModel(t *testing.T) { "golang": EvaluationRecordsPerModel{ "openrouter/anthropic/claude-1.2": &EvaluationRecord{ ModelID: "openrouter/anthropic/claude-1.2", - ModelName: "Claude 1.2", - ModelCost: 0.0001, LanguageID: "golang", Assessments: metrics.NewAssessments(), }, @@ -538,8 +508,6 @@ func TestEvaluationRecordsGroupByLanguageAndModel(t *testing.T) { EvaluationRecords: EvaluationRecords{ &EvaluationRecord{ ModelID: "openrouter/anthropic/claude-1.2", - ModelName: "Claude 1.2", - ModelCost: 0.0001, LanguageID: "golang", Assessments: metrics.Assessments{ metrics.AssessmentKeyCoverage: 1, @@ -555,8 +523,6 @@ func TestEvaluationRecordsGroupByLanguageAndModel(t *testing.T) { }, &EvaluationRecord{ ModelID: "openrouter/anthropic/claude-1.2", - ModelName: "Claude 1.2", - ModelCost: 0.0001, LanguageID: "golang", Assessments: metrics.Assessments{ metrics.AssessmentKeyCoverage: 1, @@ -572,8 +538,6 @@ func TestEvaluationRecordsGroupByLanguageAndModel(t *testing.T) { }, &EvaluationRecord{ ModelID: "openrouter/anthropic/claude-1.2", - ModelName: "Claude 1.2", - ModelCost: 0.0001, LanguageID: "java", Assessments: metrics.Assessments{ metrics.AssessmentKeyCoverage: 1, @@ -589,8 +553,6 @@ func TestEvaluationRecordsGroupByLanguageAndModel(t *testing.T) { }, &EvaluationRecord{ ModelID: "ollama/codeqwen:latest", - ModelName: "Code Qwen", - ModelCost: 0.0003, LanguageID: "golang", Assessments: metrics.Assessments{ metrics.AssessmentKeyCoverage: 1, @@ -606,8 +568,6 @@ func TestEvaluationRecordsGroupByLanguageAndModel(t *testing.T) { }, &EvaluationRecord{ ModelID: "ollama/codeqwen:latest", - ModelName: "Code Qwen", - ModelCost: 0.0003, LanguageID: "java", Assessments: metrics.Assessments{ metrics.AssessmentKeyCoverage: 1, @@ -627,8 +587,6 @@ func TestEvaluationRecordsGroupByLanguageAndModel(t *testing.T) { "golang": EvaluationRecordsPerModel{ "openrouter/anthropic/claude-1.2": &EvaluationRecord{ ModelID: "openrouter/anthropic/claude-1.2", - ModelName: "Claude 1.2", - ModelCost: 0.0001, LanguageID: "golang", Assessments: metrics.Assessments{ metrics.AssessmentKeyCoverage: 2, @@ -644,8 +602,6 @@ func TestEvaluationRecordsGroupByLanguageAndModel(t *testing.T) { }, "ollama/codeqwen:latest": &EvaluationRecord{ ModelID: "ollama/codeqwen:latest", - ModelName: "Code Qwen", - ModelCost: 0.0003, LanguageID: "golang", Assessments: metrics.Assessments{ metrics.AssessmentKeyCoverage: 1, @@ -663,8 +619,6 @@ func TestEvaluationRecordsGroupByLanguageAndModel(t *testing.T) { "java": EvaluationRecordsPerModel{ "openrouter/anthropic/claude-1.2": &EvaluationRecord{ ModelID: "openrouter/anthropic/claude-1.2", - ModelName: "Claude 1.2", - ModelCost: 0.0001, LanguageID: "java", Assessments: metrics.Assessments{ metrics.AssessmentKeyCoverage: 1, @@ -680,8 +634,6 @@ func TestEvaluationRecordsGroupByLanguageAndModel(t *testing.T) { }, "ollama/codeqwen:latest": &EvaluationRecord{ ModelID: "ollama/codeqwen:latest", - ModelName: "Code Qwen", - ModelCost: 0.0003, LanguageID: "java", Assessments: metrics.Assessments{ metrics.AssessmentKeyCoverage: 1, @@ -714,19 +666,19 @@ func TestWriteCSVs(t *testing.T) { evaluationFilePath := filepath.Join(resultPath, "evaluation.csv") evaluationFileContent := bytesutil.StringTrimIndentations(` - model-id,model-name,cost,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code - openrouter/anthropic/claude-2.0,Claude 2.0,0.001,golang,golang/light,write-tests,24,1,2,2,3,4,5,6,7,8 - openrouter/anthropic/claude-2.0,Claude 2.0,0.001,golang,golang/plain,write-tests,24,1,2,2,3,4,5,6,7,8 - openrouter/anthropic/claude-2.0,Claude 2.0,0.001,java,java/light,write-tests,69,10,11,11,12,13,14,15,16,17 - openrouter/anthropic/claude-2.0,Claude 2.0,0.001,java,java/plain,write-tests,69,10,11,11,12,13,14,15,16,17 - openrouter/anthropic/claude-3-sonnet,Claude 3 Sonnet,0.003,golang,golang/light,write-tests,21,8,7,7,6,5,4,3,2,1 - openrouter/anthropic/claude-3-sonnet,Claude 3 Sonnet,0.003,golang,golang/plain,write-tests,21,8,7,7,6,5,4,3,2,1 - openrouter/anthropic/claude-3-sonnet,Claude 3 Sonnet,0.003,java,java/light,write-tests,69,10,11,11,12,13,14,15,16,17 - openrouter/anthropic/claude-3-sonnet,Claude 3 Sonnet,0.003,java,java/plain,write-tests,69,10,11,11,12,13,14,15,16,17 - openrouter/openai/gpt-4,GPT 4,0.005,golang,golang/light,write-tests,24,1,2,2,3,4,5,6,7,8 - openrouter/openai/gpt-4,GPT 4,0.005,golang,golang/plain,write-tests,24,1,2,2,3,4,5,6,7,8 - openrouter/openai/gpt-4,GPT 4,0.005,java,java/light,write-tests,24,1,2,2,3,4,5,6,7,8 - openrouter/openai/gpt-4,GPT 4,0.005,java,java/plain,write-tests,24,1,2,2,3,4,5,6,7,8 + model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code + openrouter/anthropic/claude-2.0,golang,golang/light,write-tests,24,1,2,2,3,4,5,6,7,8 + openrouter/anthropic/claude-2.0,golang,golang/plain,write-tests,24,1,2,2,3,4,5,6,7,8 + openrouter/anthropic/claude-2.0,java,java/light,write-tests,69,10,11,11,12,13,14,15,16,17 + openrouter/anthropic/claude-2.0,java,java/plain,write-tests,69,10,11,11,12,13,14,15,16,17 + openrouter/anthropic/claude-3-sonnet,golang,golang/light,write-tests,21,8,7,7,6,5,4,3,2,1 + openrouter/anthropic/claude-3-sonnet,golang,golang/plain,write-tests,21,8,7,7,6,5,4,3,2,1 + openrouter/anthropic/claude-3-sonnet,java,java/light,write-tests,69,10,11,11,12,13,14,15,16,17 + openrouter/anthropic/claude-3-sonnet,java,java/plain,write-tests,69,10,11,11,12,13,14,15,16,17 + openrouter/openai/gpt-4,golang,golang/light,write-tests,24,1,2,2,3,4,5,6,7,8 + openrouter/openai/gpt-4,golang,golang/plain,write-tests,24,1,2,2,3,4,5,6,7,8 + openrouter/openai/gpt-4,java,java/light,write-tests,24,1,2,2,3,4,5,6,7,8 + openrouter/openai/gpt-4,java,java/plain,write-tests,24,1,2,2,3,4,5,6,7,8 `) require.NoError(t, os.WriteFile(evaluationFilePath, []byte(evaluationFileContent), 0644)) @@ -753,10 +705,10 @@ func TestWriteCSVs(t *testing.T) { FileName: "models-summed.csv", ExpectedFileContent: ` - model-id,model-name,cost,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code - openrouter/anthropic/claude-2.0,Claude 2.0,0.001,186,22,26,26,30,34,38,42,46,50 - openrouter/anthropic/claude-3-sonnet,Claude 3 Sonnet,0.003,180,36,36,36,36,36,36,36,36,36 - openrouter/openai/gpt-4,GPT 4,0.005,96,4,8,8,12,16,20,24,28,32 + model-id,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code + openrouter/anthropic/claude-2.0,186,22,26,26,30,34,38,42,46,50 + openrouter/anthropic/claude-3-sonnet,180,36,36,36,36,36,36,36,36,36 + openrouter/openai/gpt-4,96,4,8,8,12,16,20,24,28,32 `, }) validate(t, &testCase{ @@ -765,10 +717,10 @@ func TestWriteCSVs(t *testing.T) { FileName: "golang-summed.csv", ExpectedFileContent: ` - model-id,model-name,cost,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code - openrouter/anthropic/claude-2.0,Claude 2.0,0.001,48,2,4,4,6,8,10,12,14,16 - openrouter/anthropic/claude-3-sonnet,Claude 3 Sonnet,0.003,42,16,14,14,12,10,8,6,4,2 - openrouter/openai/gpt-4,GPT 4,0.005,48,2,4,4,6,8,10,12,14,16 + model-id,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code + openrouter/anthropic/claude-2.0,48,2,4,4,6,8,10,12,14,16 + openrouter/anthropic/claude-3-sonnet,42,16,14,14,12,10,8,6,4,2 + openrouter/openai/gpt-4,48,2,4,4,6,8,10,12,14,16 `, }) validate(t, &testCase{ @@ -777,10 +729,10 @@ func TestWriteCSVs(t *testing.T) { FileName: "java-summed.csv", ExpectedFileContent: ` - model-id,model-name,cost,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code - openrouter/anthropic/claude-2.0,Claude 2.0,0.001,138,20,22,22,24,26,28,30,32,34 - openrouter/anthropic/claude-3-sonnet,Claude 3 Sonnet,0.003,138,20,22,22,24,26,28,30,32,34 - openrouter/openai/gpt-4,GPT 4,0.005,48,2,4,4,6,8,10,12,14,16 + model-id,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code + openrouter/anthropic/claude-2.0,138,20,22,22,24,26,28,30,32,34 + openrouter/anthropic/claude-3-sonnet,138,20,22,22,24,26,28,30,32,34 + openrouter/openai/gpt-4,48,2,4,4,6,8,10,12,14,16 `, }) } diff --git a/model/llm/llm.go b/model/llm/llm.go index 9f3c451c..c9844f18 100644 --- a/model/llm/llm.go +++ b/model/llm/llm.go @@ -27,14 +27,9 @@ type Model struct { provider provider.Query // model holds the identifier for the LLM model. model string - // name holds the name for the LLM model. - name string // queryAttempts holds the number of query attempts to perform when a model request errors in the process of solving a task. queryAttempts uint - - // cost holds the cost of a model - cost float64 } // NewModel returns an LLM model corresponding to the given identifier which is queried via the given provider. @@ -47,19 +42,6 @@ func NewModel(provider provider.Query, modelIdentifier string) *Model { } } -// NewNamedModelWithCost returns an LLM model corresponding to the given identifier which is queried via the given provider, and with name and pricing information. -func NewNamedModelWithCost(provider provider.Query, modelIdentifier string, name string, cost float64) *Model { - return &Model{ - provider: provider, - model: modelIdentifier, - name: name, - - queryAttempts: 1, - - cost: cost, - } -} - // llmSourceFilePromptContext is the context for template for generating an LLM test generation prompt. type llmSourceFilePromptContext struct { // Language holds the programming language name. @@ -137,11 +119,6 @@ func (m *Model) ID() (id string) { return m.model } -// Name returns the name of this model. -func (m *Model) Name() (name string) { - return m.name -} - var _ model.CapabilityWriteTests = (*Model)(nil) // WriteTests generates test files for the given implementation file in a repository. @@ -272,16 +249,6 @@ func (m *Model) RepairCode(ctx model.Context) (assessment metrics.Assessments, e return assessment, nil } -// Cost returns the cost of the model. -func (m *Model) Cost() (cost float64) { - return m.cost -} - -// SetCost sets the cost of a model. -func (m *Model) SetCost(cost float64) { - m.cost = cost -} - var _ model.SetQueryAttempts = (*Model)(nil) // SetQueryAttempts sets the number of query attempts to perform when a model request errors in the process of solving a task. diff --git a/model/model.go b/model/model.go index bcf2fde1..1c6c33cd 100644 --- a/model/model.go +++ b/model/model.go @@ -11,11 +11,6 @@ import ( type Model interface { // ID returns the unique ID of this model. ID() (id string) - // Name returns the human-readable name of this model. - Name() (name string) - - // Cost returns the cost of a model in US dollars. - Cost() (cost float64) } // Context holds the data needed by a model for running a task. diff --git a/model/symflower/symflower.go b/model/symflower/symflower.go index f3963cee..02337312 100644 --- a/model/symflower/symflower.go +++ b/model/symflower/symflower.go @@ -31,11 +31,6 @@ func (m *Model) ID() (id string) { return "symflower" + provider.ProviderModelSeparator + "symbolic-execution" } -// Name returns the name of this model. -func (m *Model) Name() (name string) { - return "Symbolic Execution" -} - var _ model.CapabilityWriteTests = (*Model)(nil) // generateTestsForFile generates test files for the given implementation file in a repository. @@ -97,8 +92,3 @@ func countCharactersOfGeneratedFiles(repositoryPath string, filePaths []string) return count, nil } - -// Cost returns the cost of the model. -func (m *Model) Cost() (cost float64) { - return 0 -} diff --git a/model/testing/Model_mock_gen.go b/model/testing/Model_mock_gen.go index 962558d4..04057324 100644 --- a/model/testing/Model_mock_gen.go +++ b/model/testing/Model_mock_gen.go @@ -9,24 +9,6 @@ type MockModel struct { mock.Mock } -// Cost provides a mock function with given fields: -func (_m *MockModel) Cost() float64 { - ret := _m.Called() - - if len(ret) == 0 { - panic("no return value specified for Cost") - } - - var r0 float64 - if rf, ok := ret.Get(0).(func() float64); ok { - r0 = rf() - } else { - r0 = ret.Get(0).(float64) - } - - return r0 -} - // ID provides a mock function with given fields: func (_m *MockModel) ID() string { ret := _m.Called() @@ -45,24 +27,6 @@ func (_m *MockModel) ID() string { return r0 } -// Name provides a mock function with given fields: -func (_m *MockModel) Name() string { - ret := _m.Called() - - if len(ret) == 0 { - panic("no return value specified for Name") - } - - var r0 string - if rf, ok := ret.Get(0).(func() string); ok { - r0 = rf() - } else { - r0 = ret.Get(0).(string) - } - - return r0 -} - // NewMockModel creates a new instance of MockModel. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. // The first argument is typically a *testing.T value. func NewMockModel(t interface { diff --git a/model/testing/helper.go b/model/testing/helper.go index 1a7a5da6..e984c45d 100644 --- a/model/testing/helper.go +++ b/model/testing/helper.go @@ -19,16 +19,6 @@ func NewMockModelNamed(t *testing.T, id string) *MockModel { return m } -// NewMockModelNamed returns a new named mocked model with cost information. -func NewMockModelNamedWithCosts(t *testing.T, id string, name string, cost float64) *MockModel { - m := NewMockModel(t) - m.On("ID").Return(id).Maybe() - m.On("Name").Return(name).Maybe() - m.On("Cost").Return(cost).Maybe() - - return m -} - // RegisterGenerateSuccess registers a mock call for successful generation. func (m *MockCapabilityWriteTests) RegisterGenerateSuccess(t *testing.T, filePath string, fileContent string, assessment metrics.Assessments) *mock.Call { return m.On("WriteTests", mock.Anything).Return(assessment, nil).Run(func(args mock.Arguments) { @@ -69,14 +59,6 @@ func NewMockCapabilityWriteTestsNamed(t *testing.T, id string) *MockModelCapabil } } -// NewMockCapabilityWriteTestsNamedWithCost returns a new named mocked model with costs. -func NewMockCapabilityWriteTestsNamedWithCost(t *testing.T, id string, name string, cost float64) *MockModelCapabilityWriteTests { - return &MockModelCapabilityWriteTests{ - MockModel: NewMockModelNamedWithCosts(t, id, name, cost), - MockCapabilityWriteTests: NewMockCapabilityWriteTests(t), - } -} - // MockModelCapabilityRepairCode holds a mock implementing the "Model" and the "CapabilityRepairCode" interface. type MockModelCapabilityRepairCode struct { *MockModel diff --git a/provider/ollama/ollama.go b/provider/ollama/ollama.go index 17c7fa07..103b6df9 100644 --- a/provider/ollama/ollama.go +++ b/provider/ollama/ollama.go @@ -72,7 +72,7 @@ func (p *Provider) Models() (models []model.Model, err error) { models = make([]model.Model, len(ms)) for i, modelName := range ms { - models[i] = llm.NewNamedModelWithCost(p, p.ID()+provider.ProviderModelSeparator+modelName, modelName, 0) + models[i] = llm.NewModel(p, p.ID()+provider.ProviderModelSeparator+modelName) } return models, nil diff --git a/provider/openrouter/openrouter.go b/provider/openrouter/openrouter.go index 32ddb5c9..7fcef65a 100644 --- a/provider/openrouter/openrouter.go +++ b/provider/openrouter/openrouter.go @@ -2,11 +2,7 @@ package openrouter import ( "context" - "encoding/json" "errors" - "io" - "net/http" - "strconv" "strings" "time" @@ -85,49 +81,16 @@ type Pricing struct { // Models returns which models are available to be queried via this provider. func (p *Provider) Models() (models []model.Model, err error) { - responseModels, err := providerModels(p.baseURL + "/models") - if err != nil { - return nil, err - } - - models = make([]model.Model, len(responseModels.Models)) - for i, model := range responseModels.Models { - cost, err := sumModelCosts(model) - if err != nil { - return nil, err - } - models[i] = llm.NewNamedModelWithCost(p, p.ID()+provider.ProviderModelSeparator+model.ID, model.Name, cost) - } - - return models, nil -} - -// providerModels returns the provider's list of models given the URL to fetch the models. -func providerModels(url string) (models ModelsList, err error) { - request, err := http.NewRequest("GET", url, nil) - if err != nil { - return ModelsList{}, pkgerrors.WithStack(err) - } - request.Header.Set("Accept", "application/json") + client := p.client() - client := &http.Client{} - var responseBody []byte + var responseModels openai.ModelsList if err := retry.Do( // Query available models with a retry logic cause "openrouter.ai" has failed us in the past. func() error { - response, err := client.Do(request) - if err != nil { - return pkgerrors.WithStack(err) - } - defer response.Body.Close() - - if response.StatusCode != http.StatusOK { - return pkgerrors.Errorf("received status code %d when querying provider models", response.StatusCode) - } - - responseBody, err = io.ReadAll(response.Body) + ms, err := client.ListModels(context.Background()) if err != nil { return pkgerrors.WithStack(err) } + responseModels = ms return nil }, @@ -136,38 +99,17 @@ func providerModels(url string) (models ModelsList, err error) { retry.DelayType(retry.BackOffDelay), retry.LastErrorOnly(true), ); err != nil { - return ModelsList{}, err + return nil, err } - if err = json.Unmarshal(responseBody, &models); err != nil { - return ModelsList{}, pkgerrors.WithStack(err) + models = make([]model.Model, len(responseModels.Models)) + for i, model := range responseModels.Models { + models[i] = llm.NewModel(p, p.ID()+provider.ProviderModelSeparator+model.ID) } return models, nil } -// sumModelCosts sums the different costs of a model. -func sumModelCosts(model Model) (cost float64, err error) { - prompt, err := strconv.ParseFloat(strings.TrimSpace(model.Pricing.Prompt), 64) - if err != nil { - return 0, pkgerrors.WithStack(err) - } - completion, err := strconv.ParseFloat(strings.TrimSpace(model.Pricing.Completion), 64) - if err != nil { - return 0, pkgerrors.WithStack(err) - } - request, err := strconv.ParseFloat(strings.TrimSpace(model.Pricing.Request), 64) - if err != nil { - return 0, pkgerrors.WithStack(err) - } - image, err := strconv.ParseFloat(strings.TrimSpace(model.Pricing.Image), 64) - if err != nil { - return 0, pkgerrors.WithStack(err) - } - - return prompt + completion + request + image, nil -} - var _ provider.InjectToken = (*Provider)(nil) // SetToken sets a potential token to be used in case the provider needs to authenticate a remote API.