Skip to content

Commit

Permalink
Task for code transpilation, so models can transpile Go code to Java and back
Browse files Browse the repository at this point in the history

Closes #201
  • Loading branch information
ruiAzevedo19 authored and ahumenberger committed Jul 23, 2024
1 parent f0262c7 commit 08ad491
Show file tree
Hide file tree
Showing 15 changed files with 1,031 additions and 44 deletions.
1 change: 1 addition & 0 deletions .mockery.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ packages:
Model:
CapabilityWriteTests:
CapabilityRepairCode:
CapabilityTranspile:
github.com/symflower/eval-dev-quality/provider:
interfaces:
Loader:
Expand Down
24 changes: 12 additions & 12 deletions cmd/eval-dev-quality/cmd/report_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,24 +17,24 @@ import (
)

var claudeEvaluationCSVFileContent = bytesutil.StringTrimIndentations(`
openrouter/anthropic/claude-2.0,golang,golang/light,write-tests,1,1,1,1,1,1,1,1,1,1
openrouter/anthropic/claude-2.0,golang,golang/plain,write-tests,2,2,2,2,2,2,2,2,2,2
openrouter/anthropic/claude-2.0,java,java/light,write-tests,3,3,3,3,3,3,3,3,3,3
openrouter/anthropic/claude-2.0,java,java/plain,write-tests,4,4,4,4,4,4,4,4,4,4
openrouter/anthropic/claude-2.0,golang,golang/light,write-tests,1,1,1,1,1,1,1,1,1,1,1
openrouter/anthropic/claude-2.0,golang,golang/plain,write-tests,2,2,2,2,2,2,2,2,2,2,2
openrouter/anthropic/claude-2.0,java,java/light,write-tests,3,3,3,3,3,3,3,3,3,3,3
openrouter/anthropic/claude-2.0,java,java/plain,write-tests,4,4,4,4,4,4,4,4,4,4,4
`)

var gemmaEvaluationCSVFileContent = bytesutil.StringTrimIndentations(`
openrouter/google/gemma-7b-it,golang,golang/light,write-tests,5,5,5,5,5,5,5,5,5,5
openrouter/google/gemma-7b-it,golang,golang/plain,write-tests,6,6,6,6,6,6,6,6,6,6
openrouter/google/gemma-7b-it,java,java/light,write-tests,7,7,7,7,7,7,7,7,7,7
openrouter/google/gemma-7b-it,java,java/plain,write-tests,8,8,8,8,8,8,8,8,8,8
openrouter/google/gemma-7b-it,golang,golang/light,write-tests,5,5,5,5,5,5,5,5,5,5,5
openrouter/google/gemma-7b-it,golang,golang/plain,write-tests,6,6,6,6,6,6,6,6,6,6,6
openrouter/google/gemma-7b-it,java,java/light,write-tests,7,7,7,7,7,7,7,7,7,7,7
openrouter/google/gemma-7b-it,java,java/plain,write-tests,8,8,8,8,8,8,8,8,8,8,8
`)

var gpt4EvaluationCSVFileContent = bytesutil.StringTrimIndentations(`
openrouter/openai/gpt-4,golang,golang/light,write-tests,9,9,9,9,9,9,9,9,9,9
openrouter/openai/gpt-4,golang,golang/plain,write-tests,10,10,10,10,10,10,10,10,10,10
openrouter/openai/gpt-4,java,java/light,write-tests,11,11,11,11,11,11,11,11,11,11
openrouter/openai/gpt-4,java,java/plain,write-tests,12,12,12,12,12,12,12,12,12,12
openrouter/openai/gpt-4,golang,golang/light,write-tests,9,9,9,9,9,9,9,9,9,9,9
openrouter/openai/gpt-4,golang,golang/plain,write-tests,10,10,10,10,10,10,10,10,10,10,10
openrouter/openai/gpt-4,java,java/light,write-tests,11,11,11,11,11,11,11,11,11,11,11
openrouter/openai/gpt-4,java,java/plain,write-tests,12,12,12,12,12,12,12,12,12,12,12
`)

// validateMarkdownLinks checks if the Markdown report data contains all the links to other relevant report files.
Expand Down
4 changes: 4 additions & 0 deletions evaluate/metrics/assessment.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ var (
// AssessmentKeyCoverage counts execution coverage objects.
AssessmentKeyCoverage = RegisterAssessmentKey("coverage", 10)

// AssessmentKeyTestsPassing holds the percentage of passing tests.
AssessmentKeyTestsPassing = RegisterAssessmentKey("tests-passing", 10)

// AssessmentKeyResponseCharacterCount counts the number of characters of a response.
AssessmentKeyResponseCharacterCount = RegisterAssessmentKey("response-character-count", 0)
// AssessmentKeyGenerateTestsForFileCharacterCount counts the number of characters of a generated test file.
Expand Down Expand Up @@ -167,6 +170,7 @@ func CombineWithSymflowerFixAssessments(model Assessments, fixed Assessments) (c
combined[AssessmentKeyResponseNoError] = model[AssessmentKeyResponseNoError]
combined[AssessmentKeyResponseNoExcess] = model[AssessmentKeyResponseNoExcess]
combined[AssessmentKeyResponseWithCode] = model[AssessmentKeyResponseWithCode]
combined[AssessmentKeyTestsPassing] = fixed[AssessmentKeyTestsPassing]

return combined
}
7 changes: 5 additions & 2 deletions evaluate/metrics/assessment_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ func TestAssessmentString(t *testing.T) {

Assessment: NewAssessments(),

ExpectedString: "score=0, coverage=0, files-executed=0, files-executed-maximum-reachable=0, generate-tests-for-file-character-count=0, processing-time=0, response-character-count=0, response-no-error=0, response-no-excess=0, response-with-code=0",
ExpectedString: "score=0, coverage=0, files-executed=0, files-executed-maximum-reachable=0, generate-tests-for-file-character-count=0, processing-time=0, response-character-count=0, response-no-error=0, response-no-excess=0, response-with-code=0, tests-passing=0",
})

validate(t, &testCase{
Expand All @@ -153,9 +153,10 @@ func TestAssessmentString(t *testing.T) {
AssessmentKeyResponseNoExcess: 4,
AssessmentKeyResponseWithCode: 5,
AssessmentKeyProcessingTime: 200,
AssessmentKeyTestsPassing: 70,
},

ExpectedString: "score=15, coverage=1, files-executed=2, files-executed-maximum-reachable=2, generate-tests-for-file-character-count=50, processing-time=200, response-character-count=100, response-no-error=3, response-no-excess=4, response-with-code=5",
ExpectedString: "score=85, coverage=1, files-executed=2, files-executed-maximum-reachable=2, generate-tests-for-file-character-count=50, processing-time=200, response-character-count=100, response-no-error=3, response-no-excess=4, response-with-code=5, tests-passing=70",
})
}

Expand Down Expand Up @@ -310,6 +311,7 @@ func TestCombineModelAndSymflowerFixAssessments(t *testing.T) {
AssessmentKeyProcessingTime: uint64(100),
AssessmentKeyCoverage: 10,
AssessmentKeyResponseNoError: 1,
AssessmentKeyTestsPassing: 100,
},

ExpectedAssessments: Assessments{
Expand All @@ -321,6 +323,7 @@ func TestCombineModelAndSymflowerFixAssessments(t *testing.T) {
AssessmentKeyResponseNoError: 0,
AssessmentKeyResponseWithCode: 1,
AssessmentKeyResponseNoExcess: 1,
AssessmentKeyTestsPassing: 100,
},
})
}
66 changes: 36 additions & 30 deletions evaluate/report/csv_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ func TestNewEvaluationFile(t *testing.T) {
require.NoError(t, err)

expectedEvaluationFileContent := bytesutil.StringTrimIndentations(`
model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code
model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing
`)

assert.Equal(t, expectedEvaluationFileContent, string(actualEvaluationFileContent))
Expand Down Expand Up @@ -67,8 +67,8 @@ func TestWriteEvaluationRecord(t *testing.T) {
},

ExpectedCSV: `
model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code
mocked-model,golang,golang/plain,write-tests,0,0,0,0,0,0,0,0,0,0
model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing
mocked-model,golang,golang/plain,write-tests,0,0,0,0,0,0,0,0,0,0,0
`,
})
validate(t, &testCase{
Expand All @@ -90,9 +90,9 @@ func TestWriteEvaluationRecord(t *testing.T) {
},

ExpectedCSV: `
model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code
mocked-model,golang,golang/plain,write-tests,2,0,1,1,0,0,0,1,0,0
mocked-model,golang,golang/plain,write-tests-symflower-fix,12,10,1,1,0,0,0,1,0,0
model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing
mocked-model,golang,golang/plain,write-tests,2,0,1,1,0,0,0,1,0,0,0
mocked-model,golang,golang/plain,write-tests-symflower-fix,12,10,1,1,0,0,0,1,0,0,0
`,
})
}
Expand Down Expand Up @@ -225,37 +225,37 @@ func TestEvaluationFileWriteLines(t *testing.T) {
Name: "No records",

ExpectedEvaluationFile: `
model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code
model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing
`,
})
validate(t, &testCase{
Name: "Single record",

RawRecords: [][]string{
[]string{"modelA", "golang", "golang/light", "write-tests", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"},
[]string{"modelA", "golang", "golang/light", "write-tests", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"},
},

ExpectedEvaluationFile: `
model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code
modelA,golang,golang/light,write-tests,1,1,1,1,1,1,1,1,1,1
model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing
modelA,golang,golang/light,write-tests,1,1,1,1,1,1,1,1,1,1,1
`,
})
validate(t, &testCase{
Name: "Multiple records",

RawRecords: [][]string{
[]string{"modelA", "golang", "golang/light", "write-tests", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"},
[]string{"modelA", "golang", "golang/plain", "write-tests", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2"},
[]string{"modelA", "java", "java/light", "write-tests", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3"},
[]string{"modelA", "java", "java/plain", "write-tests", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4"},
[]string{"modelA", "golang", "golang/light", "write-tests", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"},
[]string{"modelA", "golang", "golang/plain", "write-tests", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2"},
[]string{"modelA", "java", "java/light", "write-tests", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3"},
[]string{"modelA", "java", "java/plain", "write-tests", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4"},
},

ExpectedEvaluationFile: `
model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code
modelA,golang,golang/light,write-tests,1,1,1,1,1,1,1,1,1,1
modelA,golang,golang/plain,write-tests,2,2,2,2,2,2,2,2,2,2
modelA,java,java/light,write-tests,3,3,3,3,3,3,3,3,3,3
modelA,java,java/plain,write-tests,4,4,4,4,4,4,4,4,4,4
model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing
modelA,golang,golang/light,write-tests,1,1,1,1,1,1,1,1,1,1,1
modelA,golang,golang/plain,write-tests,2,2,2,2,2,2,2,2,2,2,2
modelA,java,java/light,write-tests,3,3,3,3,3,3,3,3,3,3,3
modelA,java,java/plain,write-tests,4,4,4,4,4,4,4,4,4,4,4
`,
})
}
Expand Down Expand Up @@ -347,7 +347,7 @@ func TestAssessmentFromRecord(t *testing.T) {
validate(t, &testCase{
Name: "Valid assessments",

Record: []string{"1", "2", "3", "4", "5", "6", "7", "8", "9"},
Record: []string{"1", "2", "3", "4", "5", "6", "7", "8", "9", "10"},

ExpectedAssessments: metrics.Assessments{
metrics.AssessmentKeyCoverage: 1,
Expand All @@ -359,6 +359,7 @@ func TestAssessmentFromRecord(t *testing.T) {
metrics.AssessmentKeyResponseNoError: 7,
metrics.AssessmentKeyResponseNoExcess: 8,
metrics.AssessmentKeyResponseWithCode: 9,
metrics.AssessmentKeyTestsPassing: 10,
},
})
}
Expand All @@ -385,7 +386,7 @@ func TestRecordsToAssessmentsPerModel(t *testing.T) {
Name: "Single record",

Records: [][]string{
[]string{"modelA", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"},
[]string{"modelA", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"},
},

ExpectedAssessmentsPerModel: map[string]metrics.Assessments{
Expand All @@ -399,16 +400,17 @@ func TestRecordsToAssessmentsPerModel(t *testing.T) {
metrics.AssessmentKeyResponseNoError: 7,
metrics.AssessmentKeyResponseNoExcess: 8,
metrics.AssessmentKeyResponseWithCode: 9,
metrics.AssessmentKeyTestsPassing: 10,
},
},
})
validate(t, &testCase{
Name: "Multiple records from the same model",

Records: [][]string{
[]string{"modelA", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"},
[]string{"modelA", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"},
[]string{"modelA", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"},
[]string{"modelA", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"},
[]string{"modelA", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"},
[]string{"modelA", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"},
},

ExpectedAssessmentsPerModel: map[string]metrics.Assessments{
Expand All @@ -422,19 +424,20 @@ func TestRecordsToAssessmentsPerModel(t *testing.T) {
metrics.AssessmentKeyResponseNoError: 21,
metrics.AssessmentKeyResponseNoExcess: 24,
metrics.AssessmentKeyResponseWithCode: 27,
metrics.AssessmentKeyTestsPassing: 30,
},
},
})
validate(t, &testCase{
Name: "Multiple records from different models",

Records: [][]string{
[]string{"modelA", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"},
[]string{"modelA", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"},
[]string{"modelA", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"},
[]string{"modelB", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"},
[]string{"modelB", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"},
[]string{"modelC", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"},
[]string{"modelA", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"},
[]string{"modelA", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"},
[]string{"modelA", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"},
[]string{"modelB", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"},
[]string{"modelB", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"},
[]string{"modelC", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"},
},

ExpectedAssessmentsPerModel: map[string]metrics.Assessments{
Expand All @@ -448,6 +451,7 @@ func TestRecordsToAssessmentsPerModel(t *testing.T) {
metrics.AssessmentKeyResponseNoError: 21,
metrics.AssessmentKeyResponseNoExcess: 24,
metrics.AssessmentKeyResponseWithCode: 27,
metrics.AssessmentKeyTestsPassing: 30,
},
"modelB": metrics.Assessments{
metrics.AssessmentKeyCoverage: 2,
Expand All @@ -459,6 +463,7 @@ func TestRecordsToAssessmentsPerModel(t *testing.T) {
metrics.AssessmentKeyResponseNoError: 14,
metrics.AssessmentKeyResponseNoExcess: 16,
metrics.AssessmentKeyResponseWithCode: 18,
metrics.AssessmentKeyTestsPassing: 20,
},
"modelC": metrics.Assessments{
metrics.AssessmentKeyCoverage: 1,
Expand All @@ -470,6 +475,7 @@ func TestRecordsToAssessmentsPerModel(t *testing.T) {
metrics.AssessmentKeyResponseNoError: 7,
metrics.AssessmentKeyResponseNoExcess: 8,
metrics.AssessmentKeyResponseWithCode: 9,
metrics.AssessmentKeyTestsPassing: 10,
},
},
})
Expand Down
Loading

0 comments on commit 08ad491

Please sign in to comment.