Skip to content

Commit

Permalink
Store the task identifier and the repository name when reading the ev…
Browse files Browse the repository at this point in the history
…aluation CSV file, since they are part of an evaluation record

Part of #237
  • Loading branch information
ruiAzevedo19 committed Jul 9, 2024
1 parent c07ed10 commit 1bddab2
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 43 deletions.
57 changes: 41 additions & 16 deletions evaluate/report/csv.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,40 +85,59 @@ type EvaluationRecord struct {
// LanguageID holds the language id.
LanguageID string

// RepositoryName holds the name of a repository.
RepositoryName string
// Task holds the task identifier.
Task string

// Assessments holds the assessments of an entry.
Assessments metrics.Assessments
}

// Clone clones an evaluation record.
func (e *EvaluationRecord) Clone() (new *EvaluationRecord) {
new = &EvaluationRecord{}
// EvaluationRecordSummary holds a subset of an evaluation record.
type EvaluationRecordSummary struct {
// ModelID holds the model id.
ModelID string
// ModelName holds the model name.
ModelName string
// ModelCost holds the model cost.
ModelCost float64

// LanguageID holds the language id.
LanguageID string

new.ModelID = e.ModelID
new.ModelName = e.ModelName
new.ModelCost = e.ModelCost
new.LanguageID = e.LanguageID
new.Assessments = metrics.Merge(e.Assessments, nil)
// Assessments holds the assessments of an entry.
Assessments metrics.Assessments
}

return new
// NewEvaluationRecordSummary derives a summary from the given evaluation record.
// Only the model ID, model name, model cost and language ID are carried over, together with the assessments.
// The assessments are obtained through "metrics.Merge" with a "nil" second argument
// (presumably yielding a map that is independent of the source record - confirm against "metrics.Merge").
func NewEvaluationRecordSummary(evaluationRecord *EvaluationRecord) (subset *EvaluationRecordSummary) {
	subset = &EvaluationRecordSummary{}

	subset.ModelID = evaluationRecord.ModelID
	subset.ModelName = evaluationRecord.ModelName
	subset.ModelCost = evaluationRecord.ModelCost

	subset.LanguageID = evaluationRecord.LanguageID

	subset.Assessments = metrics.Merge(evaluationRecord.Assessments, nil)

	return subset
}

// EvaluationRecords holds all the evaluation records.
type EvaluationRecords []*EvaluationRecord

// EvaluationRecordsPerModel holds the collection of evaluation records per model.
type EvaluationRecordsPerModel map[string]*EvaluationRecord
type EvaluationRecordsPerModel map[string]*EvaluationRecordSummary

// GroupByModel groups the evaluation records by model.
func (e EvaluationRecords) GroupByModel() EvaluationRecordsPerModel {
perModel := map[string]*EvaluationRecord{}
perModel := map[string]*EvaluationRecordSummary{}

for _, record := range e {
_, ok := perModel[record.ModelID]
if !ok {
perModel[record.ModelID] = record.Clone()
perModel[record.ModelID] = NewEvaluationRecordSummary(record)
} else {
r := perModel[record.ModelID]
r.Assessments = metrics.Merge(r.Assessments, record.Assessments)
r.Assessments.Add(record.Assessments)
}
}

Expand Down Expand Up @@ -161,14 +180,14 @@ func (e EvaluationRecords) GroupByLanguageAndModel() EvaluationRecordsPerLanguag
perModel, ok := perLanguageAndModel[record.LanguageID]
if !ok {
perLanguageAndModel[record.LanguageID] = EvaluationRecordsPerModel{
record.ModelID: record,
record.ModelID: NewEvaluationRecordSummary(record),
}
} else {
_, ok := perModel[record.ModelID]
if !ok {
perModel[record.ModelID] = record.Clone()
perModel[record.ModelID] = NewEvaluationRecordSummary(record)
} else {
perModel[record.ModelID].Assessments = metrics.Merge(perModel[record.ModelID].Assessments, record.Assessments)
perModel[record.ModelID].Assessments.Add(record.Assessments)
}
}
}
Expand Down Expand Up @@ -225,6 +244,9 @@ func convertRawRecordToEvaluationRecord(raw []string) (record *EvaluationRecord,

languageID := raw[3]

repositoryName := raw[4]
task := raw[5]

rawMetrics := raw[7:]
for i, assessementKey := range metrics.AllAssessmentKeysStrings {
metric, err := strconv.ParseUint(rawMetrics[i], 10, 64)
Expand All @@ -242,6 +264,9 @@ func convertRawRecordToEvaluationRecord(raw []string) (record *EvaluationRecord,

LanguageID: languageID,

RepositoryName: repositoryName,
Task: task,

Assessments: assessments,
}, nil
}
Expand Down
60 changes: 33 additions & 27 deletions evaluate/report/csv_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ func TestGenerateCSVForAssessmentPerModel(t *testing.T) {
Name: "Single empty model",

CSVFormatter: EvaluationRecordsPerModel{
"some-model-a": &EvaluationRecord{
"some-model-a": &EvaluationRecordSummary{
ModelID: "some-model-a",
ModelName: "Some Model A",
ModelCost: 0.0001,
Expand All @@ -59,7 +59,7 @@ func TestGenerateCSVForAssessmentPerModel(t *testing.T) {
Name: "Multiple models with assessments",

CSVFormatter: EvaluationRecordsPerModel{
"some-model-a": &EvaluationRecord{
"some-model-a": &EvaluationRecordSummary{
ModelID: "some-model-a",
ModelName: "Some Model A",
ModelCost: 0.0001,
Expand All @@ -75,7 +75,7 @@ func TestGenerateCSVForAssessmentPerModel(t *testing.T) {
metrics.AssessmentKeyProcessingTime: 200,
},
},
"some-model-b": &EvaluationRecord{
"some-model-b": &EvaluationRecordSummary{
ModelID: "some-model-b",
ModelName: "Some Model B",
ModelCost: 0.0003,
Expand Down Expand Up @@ -257,10 +257,12 @@ func TestLoadEvaluationRecords(t *testing.T) {

ExpectedEvaluationRecords: EvaluationRecords{
&EvaluationRecord{
ModelID: "openrouter/anthropic/claude-1.2",
ModelName: "Claude 1.2",
ModelCost: 0.0001,
LanguageID: "golang",
ModelID: "openrouter/anthropic/claude-1.2",
ModelName: "Claude 1.2",
ModelCost: 0.0001,
LanguageID: "golang",
RepositoryName: "golang/light",
Task: "write-tests",
Assessments: metrics.Assessments{
metrics.AssessmentKeyCoverage: 750,
metrics.AssessmentKeyFilesExecuted: 18,
Expand All @@ -281,17 +283,19 @@ func TestLoadEvaluationRecords(t *testing.T) {
fileContent := bytesutil.StringTrimIndentations(`
model-id,model-name,cost,language,repository,task,score,coverage,files-executed,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code
openrouter/anthropic/claude-1.2,Claude 1.2,0.0001,golang,golang/light,write-tests,982,750,18,70179,720571,71195,115,49,50
openrouter/anthropic/claude-1.2,Claude 1.2,0.0002,golang,golang/plain,write-tests,37,20,2,441,11042,523,5,5,5
openrouter/anthropic/claude-1.2,Claude 1.2,0.0002,golang,golang/plain,transpile,37,20,2,441,11042,523,5,5,5
`)
require.NoError(t, os.WriteFile(filepath.Join(resultPath, "evaluation.csv"), []byte(fileContent), 0644))
},

ExpectedEvaluationRecords: EvaluationRecords{
&EvaluationRecord{
ModelID: "openrouter/anthropic/claude-1.2",
ModelName: "Claude 1.2",
ModelCost: 0.0001,
LanguageID: "golang",
ModelID: "openrouter/anthropic/claude-1.2",
ModelName: "Claude 1.2",
ModelCost: 0.0001,
LanguageID: "golang",
RepositoryName: "golang/light",
Task: "write-tests",
Assessments: metrics.Assessments{
metrics.AssessmentKeyCoverage: 750,
metrics.AssessmentKeyFilesExecuted: 18,
Expand All @@ -304,10 +308,12 @@ func TestLoadEvaluationRecords(t *testing.T) {
},
},
&EvaluationRecord{
ModelID: "openrouter/anthropic/claude-1.2",
ModelName: "Claude 1.2",
ModelCost: 0.0002,
LanguageID: "golang",
ModelID: "openrouter/anthropic/claude-1.2",
ModelName: "Claude 1.2",
ModelCost: 0.0002,
LanguageID: "golang",
RepositoryName: "golang/plain",
Task: "transpile",
Assessments: metrics.Assessments{
metrics.AssessmentKeyCoverage: 20,
metrics.AssessmentKeyFilesExecuted: 2,
Expand All @@ -329,7 +335,7 @@ func TestEvaluationRecordsGroupByModel(t *testing.T) {

EvaluationRecords EvaluationRecords

ExpectedEvaluationRecords map[string]*EvaluationRecord
ExpectedEvaluationRecords map[string]*EvaluationRecordSummary
}

validate := func(t *testing.T, tc *testCase) {
Expand Down Expand Up @@ -367,8 +373,8 @@ func TestEvaluationRecordsGroupByModel(t *testing.T) {
},
},
},
ExpectedEvaluationRecords: map[string]*EvaluationRecord{
"openrouter/anthropic/claude-1.2": &EvaluationRecord{
ExpectedEvaluationRecords: map[string]*EvaluationRecordSummary{
"openrouter/anthropic/claude-1.2": &EvaluationRecordSummary{
ModelID: "openrouter/anthropic/claude-1.2",
ModelName: "Claude 1.2",
ModelCost: 0.0001,
Expand Down Expand Up @@ -439,8 +445,8 @@ func TestEvaluationRecordsGroupByModel(t *testing.T) {
},
},
},
ExpectedEvaluationRecords: map[string]*EvaluationRecord{
"openrouter/anthropic/claude-1.2": &EvaluationRecord{
ExpectedEvaluationRecords: map[string]*EvaluationRecordSummary{
"openrouter/anthropic/claude-1.2": &EvaluationRecordSummary{
ModelID: "openrouter/anthropic/claude-1.2",
ModelName: "Claude 1.2",
ModelCost: 0.0001,
Expand All @@ -456,7 +462,7 @@ func TestEvaluationRecordsGroupByModel(t *testing.T) {
metrics.AssessmentKeyResponseWithCode: 16,
},
},
"ollama/codeqwen:latest": &EvaluationRecord{
"ollama/codeqwen:latest": &EvaluationRecordSummary{
ModelID: "ollama/codeqwen:latest",
ModelName: "Code Qwen",
ModelCost: 0.0003,
Expand Down Expand Up @@ -508,7 +514,7 @@ func TestEvaluationRecordsGroupByLanguageAndModel(t *testing.T) {

ExpectedEvaluationRecordsPerLanguagePerModel: EvaluationRecordsPerLanguagePerModel{
"golang": EvaluationRecordsPerModel{
"openrouter/anthropic/claude-1.2": &EvaluationRecord{
"openrouter/anthropic/claude-1.2": &EvaluationRecordSummary{
ModelID: "openrouter/anthropic/claude-1.2",
ModelName: "Claude 1.2",
ModelCost: 0.0001,
Expand Down Expand Up @@ -606,7 +612,7 @@ func TestEvaluationRecordsGroupByLanguageAndModel(t *testing.T) {

ExpectedEvaluationRecordsPerLanguagePerModel: EvaluationRecordsPerLanguagePerModel{
"golang": EvaluationRecordsPerModel{
"openrouter/anthropic/claude-1.2": &EvaluationRecord{
"openrouter/anthropic/claude-1.2": &EvaluationRecordSummary{
ModelID: "openrouter/anthropic/claude-1.2",
ModelName: "Claude 1.2",
ModelCost: 0.0001,
Expand All @@ -622,7 +628,7 @@ func TestEvaluationRecordsGroupByLanguageAndModel(t *testing.T) {
metrics.AssessmentKeyResponseWithCode: 16,
},
},
"ollama/codeqwen:latest": &EvaluationRecord{
"ollama/codeqwen:latest": &EvaluationRecordSummary{
ModelID: "ollama/codeqwen:latest",
ModelName: "Code Qwen",
ModelCost: 0.0003,
Expand All @@ -640,7 +646,7 @@ func TestEvaluationRecordsGroupByLanguageAndModel(t *testing.T) {
},
},
"java": EvaluationRecordsPerModel{
"openrouter/anthropic/claude-1.2": &EvaluationRecord{
"openrouter/anthropic/claude-1.2": &EvaluationRecordSummary{
ModelID: "openrouter/anthropic/claude-1.2",
ModelName: "Claude 1.2",
ModelCost: 0.0001,
Expand All @@ -656,7 +662,7 @@ func TestEvaluationRecordsGroupByLanguageAndModel(t *testing.T) {
metrics.AssessmentKeyResponseWithCode: 8,
},
},
"ollama/codeqwen:latest": &EvaluationRecord{
"ollama/codeqwen:latest": &EvaluationRecordSummary{
ModelID: "ollama/codeqwen:latest",
ModelName: "Code Qwen",
ModelCost: 0.0003,
Expand Down

0 comments on commit 1bddab2

Please sign in to comment.