Extract model names to obtain a human-readable name for each model

Closes #206
symflower · Jun 26, 2024 · 8b70010
1 parent 0af4eab
commit 8b70010
Show file tree

Hide file tree

Showing 10 changed files with 64 additions and 26 deletions.
diff --git a/cmd/eval-dev-quality/cmd/evaluate.go b/cmd/eval-dev-quality/cmd/evaluate.go
@@ -196,7 +196,8 @@ func (command *Evaluate) Initialize(args []string) (evaluationContext *evaluate.
 				command.logger.Panicf("ERROR: unknown custom provider %q for model %q", providerID, model)
 			}
 
-			modelProvider.AddModel(llm.NewModel(modelProvider, model))
+			modelName := strings.Split(model, provider.ProviderModelSeparator)
+			modelProvider.AddModel(llm.NewNamedModelWithCost(modelProvider, model, modelName[len(modelName)-1], 0))
 		}
 	}
 

diff --git a/evaluate/report/csv.go b/evaluate/report/csv.go
@@ -46,7 +46,7 @@ func GenerateCSV(formatter CSVFormatter) (csvData string, err error) {
 
 // Header returns the header description as a CSV row.
 func (a *AssessmentStore) Header() (header []string) {
-	return append([]string{"model", "cost", "language", "repository", "task", "score"}, metrics.AllAssessmentKeysStrings...)
+	return append([]string{"model-id", "model-name", "cost", "language", "repository", "task", "score"}, metrics.AllAssessmentKeysStrings...)
 }
 
 // Rows returns all data as CSV rows.
@@ -56,7 +56,7 @@ func (a *AssessmentStore) Rows() (rows [][]string) {
 		score := a.Score()
 		cost := m.Cost()
 
-		row := append([]string{m.ID(), strconv.FormatFloat(cost, 'f', -1, 64), l.ID(), r, string(t), strconv.FormatUint(uint64(score), 10)}, metrics...)
+		row := append([]string{m.ID(), m.Name(), strconv.FormatFloat(cost, 'f', -1, 64), l.ID(), r, string(t), strconv.FormatUint(uint64(score), 10)}, metrics...)
 		rows = append(rows, row)
 
 		return nil
@@ -67,7 +67,7 @@ func (a *AssessmentStore) Rows() (rows [][]string) {
 
 // Header returns the header description as a CSV row.
 func (a AssessmentPerModel) Header() (header []string) {
-	return append([]string{"model", "cost", "score"}, metrics.AllAssessmentKeysStrings...)
+	return append([]string{"model-id", "model-name", "cost", "score"}, metrics.AllAssessmentKeysStrings...)
 }
 
 // Rows returns all data as CSV rows.
@@ -82,7 +82,7 @@ func (a AssessmentPerModel) Rows() (rows [][]string) {
 		score := a[model].Score()
 		cost := model.Cost()
 
-		row := append([]string{model.ID(), strconv.FormatFloat(cost, 'f', -1, 64), strconv.FormatUint(uint64(score), 10)}, metrics...)
+		row := append([]string{model.ID(), model.Name(), strconv.FormatFloat(cost, 'f', -1, 64), strconv.FormatUint(uint64(score), 10)}, metrics...)
 		rows = append(rows, row)
 	}
 

diff --git a/evaluate/report/csv_test.go b/evaluate/report/csv_test.go
@@ -38,7 +38,7 @@ func TestGenerateCSVForAssessmentPerModelPerLanguagePerRepository(t *testing.T)
 
 		Assessments: metricstesting.AssessmentTuples{
 			&metricstesting.AssessmentTuple{
-				Model:          modeltesting.NewMockModelNamedWithCosts(t, "some-model", 0),
+				Model:          modeltesting.NewMockModelNamedWithCosts(t, "some-model", "Some Model", 0),
 				Language:       languagetesting.NewMockLanguageNamed(t, "some-language"),
 				RepositoryPath: "some-repository",
 				Task:           evaluatetask.IdentifierWriteTests,
@@ -47,16 +47,16 @@ func TestGenerateCSVForAssessmentPerModelPerLanguagePerRepository(t *testing.T)
 		},
 
 		ExpectedString: `
-			model,cost,language,repository,task,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code
-			some-model,0,some-language,some-repository,write-tests,0,0,0,0,0,0,0,0,0
+			model-id,model-name,cost,language,repository,task,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code
+			some-model,Some Model,0,some-language,some-repository,write-tests,0,0,0,0,0,0,0,0,0
 		`,
 	})
 	validate(t, &testCase{
 		Name: "Multiple Models",
 
 		Assessments: metricstesting.AssessmentTuples{
 			&metricstesting.AssessmentTuple{
-				Model:          modeltesting.NewMockModelNamedWithCosts(t, "some-model-a", 0.0001),
+				Model:          modeltesting.NewMockModelNamedWithCosts(t, "some-model-a", "Some Model A", 0.0001),
 				Language:       languagetesting.NewMockLanguageNamed(t, "some-language"),
 				RepositoryPath: "some-repository",
 				Task:           evaluatetask.IdentifierWriteTests,
@@ -72,7 +72,7 @@ func TestGenerateCSVForAssessmentPerModelPerLanguagePerRepository(t *testing.T)
 				},
 			},
 			&metricstesting.AssessmentTuple{
-				Model:          modeltesting.NewMockModelNamedWithCosts(t, "some-model-b", 0.0005),
+				Model:          modeltesting.NewMockModelNamedWithCosts(t, "some-model-b", "Some Model B", 0.0005),
 				Language:       languagetesting.NewMockLanguageNamed(t, "some-language"),
 				RepositoryPath: "some-repository",
 				Task:           evaluatetask.IdentifierWriteTests,
@@ -90,9 +90,9 @@ func TestGenerateCSVForAssessmentPerModelPerLanguagePerRepository(t *testing.T)
 		},
 
 		ExpectedString: `
-			model,cost,language,repository,task,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code
-			some-model-a,0.0001,some-language,some-repository,write-tests,15,1,2,50,200,100,3,4,5
-			some-model-b,0.0005,some-language,some-repository,write-tests,15,1,2,100,300,200,3,4,5
+			model-id,model-name,cost,language,repository,task,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code
+			some-model-a,Some Model A,0.0001,some-language,some-repository,write-tests,15,1,2,50,200,100,3,4,5
+			some-model-b,Some Model B,0.0005,some-language,some-repository,write-tests,15,1,2,100,300,200,3,4,5
 		`,
 	})
 }
@@ -119,19 +119,19 @@ func TestGenerateCSVForAssessmentPerModel(t *testing.T) {
 		Name: "Single Empty Model",
 
 		Assessments: AssessmentPerModel{
-			modeltesting.NewMockModelNamedWithCosts(t, "some-model", 0): {},
+			modeltesting.NewMockModelNamedWithCosts(t, "some-model", "Some Model", 0): {},
 		},
 
 		ExpectedString: `
-			model,cost,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code
-			some-model,0,0,0,0,0,0,0,0,0,0
+			model-id,model-name,cost,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code
+			some-model,Some Model,0,0,0,0,0,0,0,0,0,0
 		`,
 	})
 	validate(t, &testCase{
 		Name: "Multiple Models",
 
 		Assessments: AssessmentPerModel{
-			modeltesting.NewMockModelNamedWithCosts(t, "some-model-a", 0.0001): {
+			modeltesting.NewMockModelNamedWithCosts(t, "some-model-a", "Some Model A", 0.0001): {
 				metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 50,
 				metrics.AssessmentKeyResponseCharacterCount:             100,
 				metrics.AssessmentKeyCoverage:                           1,
@@ -141,7 +141,7 @@ func TestGenerateCSVForAssessmentPerModel(t *testing.T) {
 				metrics.AssessmentKeyResponseWithCode:                   5,
 				metrics.AssessmentKeyProcessingTime:                     200,
 			},
-			modeltesting.NewMockModelNamedWithCosts(t, "some-model-b", 0.0005): {
+			modeltesting.NewMockModelNamedWithCosts(t, "some-model-b", "Some Model B", 0.0005): {
 				metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 100,
 				metrics.AssessmentKeyResponseCharacterCount:             200,
 				metrics.AssessmentKeyCoverage:                           1,
@@ -154,9 +154,9 @@ func TestGenerateCSVForAssessmentPerModel(t *testing.T) {
 		},
 
 		ExpectedString: `
-			model,cost,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code
-			some-model-a,0.0001,15,1,2,50,200,100,3,4,5
-			some-model-b,0.0005,15,1,2,100,300,200,3,4,5
+			model-id,model-name,cost,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code
+			some-model-a,Some Model A,0.0001,15,1,2,50,200,100,3,4,5
+			some-model-b,Some Model B,0.0005,15,1,2,100,300,200,3,4,5
 		`,
 	})
 }
diff --git a/model/llm/llm.go b/model/llm/llm.go
@@ -28,6 +28,8 @@ type Model struct {
 	provider provider.Query
 	// model holds the identifier for the LLM model.
 	model string
+	// name holds the name for the LLM model.
+	name string
 
 	// queryAttempts holds the number of query attempts to perform when a model request errors in the process of solving a task.
 	queryAttempts uint
@@ -46,11 +48,12 @@ func NewModel(provider provider.Query, modelIdentifier string) *Model {
 	}
 }
 
-// NewModelWithCost returns an LLM model corresponding to the given identifier which is queried via the given provider, and with pricing information.
-func NewModelWithCost(provider provider.Query, modelIdentifier string, cost float64) *Model {
+// NewNamedModelWithCost returns an LLM model corresponding to the given identifier which is queried via the given provider, and with name and pricing information.
+func NewNamedModelWithCost(provider provider.Query, modelIdentifier string, name string, cost float64) *Model {
 	return &Model{
 		provider: provider,
 		model:    modelIdentifier,
+		name:     name,
 
 		queryAttempts: 1,
 
@@ -135,6 +138,11 @@ func (m *Model) ID() (id string) {
 	return m.model
 }
 
+// Name returns the name of this model.
+func (m *Model) Name() (name string) {
+	return m.name
+}
+
 // IsTaskSupported returns whether the model supports the given task or not.
 func (m *Model) IsTaskSupported(taskIdentifier task.Identifier) (isSupported bool) {
 	switch taskIdentifier {

diff --git a/model/model.go b/model/model.go
@@ -11,6 +11,8 @@ import (
 type Model interface {
 	// ID returns the unique ID of this model.
 	ID() (id string)
+	// Name returns the human-readable name of this model.
+	Name() (name string)
 
 	// IsTaskSupported returns whether the model supports the given task or not.
 	IsTaskSupported(taskIdentifier task.Identifier) (isSupported bool)

diff --git a/model/symflower/symflower.go b/model/symflower/symflower.go
@@ -33,6 +33,11 @@ func (m *Model) ID() (id string) {
 	return "symflower" + provider.ProviderModelSeparator + "symbolic-execution"
 }
 
+// Name returns the name of this model.
+func (m *Model) Name() (name string) {
+	return "Symbolic Execution"
+}
+
 // IsTaskSupported returns whether the model supports the given task or not.
 func (m *Model) IsTaskSupported(taskIdentifier task.Identifier) (isSupported bool) {
 	switch taskIdentifier {

diff --git a/model/testing/Model_mock_gen.go b/model/testing/Model_mock_gen.go
diff --git a/model/testing/helper.go b/model/testing/helper.go
@@ -20,9 +20,10 @@ func NewMockModelNamed(t *testing.T, id string) *MockModel {
 }
 
 // NewMockModelNamedWithCosts returns a new named mocked model with cost information.
-func NewMockModelNamedWithCosts(t *testing.T, id string, cost float64) *MockModel {
+func NewMockModelNamedWithCosts(t *testing.T, id string, name string, cost float64) *MockModel {
 	m := NewMockModel(t)
 	m.On("ID").Return(id).Maybe()
+	m.On("Name").Return(name).Maybe()
 	m.On("Cost").Return(cost).Maybe()
 
 	return m

diff --git a/provider/ollama/ollama.go b/provider/ollama/ollama.go
@@ -72,7 +72,7 @@ func (p *Provider) Models() (models []model.Model, err error) {
 
 	models = make([]model.Model, len(ms))
 	for i, modelName := range ms {
-		models[i] = llm.NewModel(p, p.ID()+provider.ProviderModelSeparator+modelName)
+		models[i] = llm.NewNamedModelWithCost(p, p.ID()+provider.ProviderModelSeparator+modelName, modelName, 0)
 	}
 
 	return models, nil

diff --git a/provider/openrouter/openrouter.go b/provider/openrouter/openrouter.go
@@ -64,6 +64,9 @@ type ModelsList struct {
 type Model struct {
 	// ID holds the model id.
 	ID string `json:"id"`
+	// Name holds the model name.
+	Name string `json:"name"`
+
 	// Pricing holds the pricing information of a model.
 	Pricing Pricing `json:"pricing"`
 }
@@ -93,7 +96,7 @@ func (p *Provider) Models() (models []model.Model, err error) {
 		if err != nil {
 			return nil, err
 		}
-		models[i] = llm.NewModelWithCost(p, p.ID()+provider.ProviderModelSeparator+model.ID, cost)
+		models[i] = llm.NewNamedModelWithCost(p, p.ID()+provider.ProviderModelSeparator+model.ID, model.Name, cost)
 	}
 
 	return models, nil