Skip to content

Commit

Permalink
Task for code transpilation, so models can transpile Go code to Java …
Browse files Browse the repository at this point in the history
…and back

Closes #201
  • Loading branch information
ruiAzevedo19 authored and ahumenberger committed Jul 19, 2024
1 parent af201b1 commit eb2a51d
Show file tree
Hide file tree
Showing 15 changed files with 1,012 additions and 33 deletions.
1 change: 1 addition & 0 deletions .mockery.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ packages:
Model:
CapabilityWriteTests:
CapabilityRepairCode:
CapabilityTranspile:
github.com/symflower/eval-dev-quality/provider:
interfaces:
Loader:
Expand Down
24 changes: 12 additions & 12 deletions cmd/eval-dev-quality/cmd/report_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,24 +16,24 @@ import (
)

var claudeEvaluationCSVFileContent = bytesutil.StringTrimIndentations(`
openrouter/anthropic/claude-2.0,golang,golang/light,write-tests,1,1,1,1,1,1,1,1,1,1
openrouter/anthropic/claude-2.0,golang,golang/plain,write-tests,2,2,2,2,2,2,2,2,2,2
openrouter/anthropic/claude-2.0,java,java/light,write-tests,3,3,3,3,3,3,3,3,3,3
openrouter/anthropic/claude-2.0,java,java/plain,write-tests,4,4,4,4,4,4,4,4,4,4
openrouter/anthropic/claude-2.0,golang,golang/light,write-tests,1,1,1,1,1,1,1,1,1,1,1
openrouter/anthropic/claude-2.0,golang,golang/plain,write-tests,2,2,2,2,2,2,2,2,2,2,2
openrouter/anthropic/claude-2.0,java,java/light,write-tests,3,3,3,3,3,3,3,3,3,3,3
openrouter/anthropic/claude-2.0,java,java/plain,write-tests,4,4,4,4,4,4,4,4,4,4,4
`)

var gemmaEvaluationCSVFileContent = bytesutil.StringTrimIndentations(`
openrouter/google/gemma-7b-it,golang,golang/light,write-tests,5,5,5,5,5,5,5,5,5,5
openrouter/google/gemma-7b-it,golang,golang/plain,write-tests,6,6,6,6,6,6,6,6,6,6
openrouter/google/gemma-7b-it,java,java/light,write-tests,7,7,7,7,7,7,7,7,7,7
openrouter/google/gemma-7b-it,java,java/plain,write-tests,8,8,8,8,8,8,8,8,8,8
openrouter/google/gemma-7b-it,golang,golang/light,write-tests,5,5,5,5,5,5,5,5,5,5,5
openrouter/google/gemma-7b-it,golang,golang/plain,write-tests,6,6,6,6,6,6,6,6,6,6,6
openrouter/google/gemma-7b-it,java,java/light,write-tests,7,7,7,7,7,7,7,7,7,7,7
openrouter/google/gemma-7b-it,java,java/plain,write-tests,8,8,8,8,8,8,8,8,8,8,8
`)

var gpt4EvaluationCSVFileContent = bytesutil.StringTrimIndentations(`
openrouter/openai/gpt-4,golang,golang/light,write-tests,9,9,9,9,9,9,9,9,9,9
openrouter/openai/gpt-4,golang,golang/plain,write-tests,10,10,10,10,10,10,10,10,10,10
openrouter/openai/gpt-4,java,java/light,write-tests,11,11,11,11,11,11,11,11,11,11
openrouter/openai/gpt-4,java,java/plain,write-tests,12,12,12,12,12,12,12,12,12,12
openrouter/openai/gpt-4,golang,golang/light,write-tests,9,9,9,9,9,9,9,9,9,9,9
openrouter/openai/gpt-4,golang,golang/plain,write-tests,10,10,10,10,10,10,10,10,10,10,10
openrouter/openai/gpt-4,java,java/light,write-tests,11,11,11,11,11,11,11,11,11,11,11
openrouter/openai/gpt-4,java,java/plain,write-tests,12,12,12,12,12,12,12,12,12,12,12
`)

func TestReportExecute(t *testing.T) {
Expand Down
4 changes: 4 additions & 0 deletions evaluate/metrics/assessment.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ var (
// AssessmentKeyCoverage counts execution coverage objects.
AssessmentKeyCoverage = RegisterAssessmentKey("coverage", 10)

// AssessmentKeyTestsPassing holds the percentage of passing tests.
AssessmentKeyTestsPassing = RegisterAssessmentKey("tests-passing", 10)

// AssessmentKeyResponseCharacterCount counts the number of characters of a response.
AssessmentKeyResponseCharacterCount = RegisterAssessmentKey("response-character-count", 0)
// AssessmentKeyGenerateTestsForFileCharacterCount counts the number of characters of a generated test file.
Expand Down Expand Up @@ -167,6 +170,7 @@ func CombineWithSymflowerFixAssessments(model Assessments, fixed Assessments) (c
combined[AssessmentKeyResponseNoError] = model[AssessmentKeyResponseNoError]
combined[AssessmentKeyResponseNoExcess] = model[AssessmentKeyResponseNoExcess]
combined[AssessmentKeyResponseWithCode] = model[AssessmentKeyResponseWithCode]
combined[AssessmentKeyTestsPassing] = fixed[AssessmentKeyTestsPassing]

return combined
}
7 changes: 5 additions & 2 deletions evaluate/metrics/assessment_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ func TestAssessmentString(t *testing.T) {

Assessment: NewAssessments(),

ExpectedString: "score=0, coverage=0, files-executed=0, files-executed-maximum-reachable=0, generate-tests-for-file-character-count=0, processing-time=0, response-character-count=0, response-no-error=0, response-no-excess=0, response-with-code=0",
ExpectedString: "score=0, coverage=0, files-executed=0, files-executed-maximum-reachable=0, generate-tests-for-file-character-count=0, processing-time=0, response-character-count=0, response-no-error=0, response-no-excess=0, response-with-code=0, tests-passing=0",
})

validate(t, &testCase{
Expand All @@ -153,9 +153,10 @@ func TestAssessmentString(t *testing.T) {
AssessmentKeyResponseNoExcess: 4,
AssessmentKeyResponseWithCode: 5,
AssessmentKeyProcessingTime: 200,
AssessmentKeyTestsPassing: 70,
},

ExpectedString: "score=15, coverage=1, files-executed=2, files-executed-maximum-reachable=2, generate-tests-for-file-character-count=50, processing-time=200, response-character-count=100, response-no-error=3, response-no-excess=4, response-with-code=5",
ExpectedString: "score=85, coverage=1, files-executed=2, files-executed-maximum-reachable=2, generate-tests-for-file-character-count=50, processing-time=200, response-character-count=100, response-no-error=3, response-no-excess=4, response-with-code=5, tests-passing=70",
})
}

Expand Down Expand Up @@ -310,6 +311,7 @@ func TestCombineModelAndSymflowerFixAssessments(t *testing.T) {
AssessmentKeyProcessingTime: uint64(100),
AssessmentKeyCoverage: 10,
AssessmentKeyResponseNoError: 1,
AssessmentKeyTestsPassing: 100,
},

ExpectedAssessments: Assessments{
Expand All @@ -321,6 +323,7 @@ func TestCombineModelAndSymflowerFixAssessments(t *testing.T) {
AssessmentKeyResponseNoError: 0,
AssessmentKeyResponseWithCode: 1,
AssessmentKeyResponseNoExcess: 1,
AssessmentKeyTestsPassing: 100,
},
})
}
38 changes: 19 additions & 19 deletions evaluate/report/csv_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ func TestNewEvaluationFile(t *testing.T) {
require.NoError(t, err)

expectedEvaluationFileContent := bytesutil.StringTrimIndentations(`
model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code
model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing
`)

assert.Equal(t, expectedEvaluationFileContent, string(actualEvaluationFileContent))
Expand Down Expand Up @@ -66,8 +66,8 @@ func TestWriteEvaluationRecord(t *testing.T) {
},

ExpectedCSV: `
model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code
mocked-model,golang,golang/plain,write-tests,0,0,0,0,0,0,0,0,0,0
model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing
mocked-model,golang,golang/plain,write-tests,0,0,0,0,0,0,0,0,0,0,0
`,
})
validate(t, &testCase{
Expand All @@ -89,9 +89,9 @@ func TestWriteEvaluationRecord(t *testing.T) {
},

ExpectedCSV: `
model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code
mocked-model,golang,golang/plain,write-tests,2,0,1,1,0,0,0,1,0,0
mocked-model,golang,golang/plain,write-tests-symflower-fix,12,10,1,1,0,0,0,1,0,0
model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing
mocked-model,golang,golang/plain,write-tests,2,0,1,1,0,0,0,1,0,0,0
mocked-model,golang,golang/plain,write-tests-symflower-fix,12,10,1,1,0,0,0,1,0,0,0
`,
})
}
Expand Down Expand Up @@ -224,37 +224,37 @@ func TestEvaluationFileWriteLines(t *testing.T) {
Name: "No records",

ExpectedEvaluationFile: `
model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code
model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing
`,
})
validate(t, &testCase{
Name: "Single record",

RawRecords: [][]string{
[]string{"modelA", "golang", "golang/light", "write-tests", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"},
[]string{"modelA", "golang", "golang/light", "write-tests", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"},
},

ExpectedEvaluationFile: `
model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code
modelA,golang,golang/light,write-tests,1,1,1,1,1,1,1,1,1,1
model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing
modelA,golang,golang/light,write-tests,1,1,1,1,1,1,1,1,1,1,1
`,
})
validate(t, &testCase{
Name: "Multiple records",

RawRecords: [][]string{
[]string{"modelA", "golang", "golang/light", "write-tests", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"},
[]string{"modelA", "golang", "golang/plain", "write-tests", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2"},
[]string{"modelA", "java", "java/light", "write-tests", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3"},
[]string{"modelA", "java", "java/plain", "write-tests", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4"},
[]string{"modelA", "golang", "golang/light", "write-tests", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"},
[]string{"modelA", "golang", "golang/plain", "write-tests", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2"},
[]string{"modelA", "java", "java/light", "write-tests", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3"},
[]string{"modelA", "java", "java/plain", "write-tests", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4"},
},

ExpectedEvaluationFile: `
model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code
modelA,golang,golang/light,write-tests,1,1,1,1,1,1,1,1,1,1
modelA,golang,golang/plain,write-tests,2,2,2,2,2,2,2,2,2,2
modelA,java,java/light,write-tests,3,3,3,3,3,3,3,3,3,3
modelA,java,java/plain,write-tests,4,4,4,4,4,4,4,4,4,4
model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing
modelA,golang,golang/light,write-tests,1,1,1,1,1,1,1,1,1,1,1
modelA,golang,golang/plain,write-tests,2,2,2,2,2,2,2,2,2,2,2
modelA,java,java/light,write-tests,3,3,3,3,3,3,3,3,3,3,3
modelA,java,java/plain,write-tests,4,4,4,4,4,4,4,4,4,4,4
`,
})
}
Expand Down
193 changes: 193 additions & 0 deletions evaluate/task/task-transpile.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
package task

import (
"context"
"errors"
"fmt"
"os"
"path/filepath"
"strings"

pkgerrors "github.com/pkg/errors"
"github.com/symflower/eval-dev-quality/evaluate/metrics"
"github.com/symflower/eval-dev-quality/language"
"github.com/symflower/eval-dev-quality/language/golang"
"github.com/symflower/eval-dev-quality/language/java"
"github.com/symflower/eval-dev-quality/log"
"github.com/symflower/eval-dev-quality/model"
evaltask "github.com/symflower/eval-dev-quality/task"
)

// TaskTranspile holds the transpilation task.
// The type has no fields: everything a run needs is handed over through the "evaltask.Context" given to "Run".
type TaskTranspile struct{}

// TaskArgumentsTranspile holds extra arguments to be used in a query prompt.
// "TaskTranspile.Run" passes a pointer to this struct as the "Arguments" of the model context.
type TaskArgumentsTranspile struct {
	// OriginLanguage holds the language we are transpiling from.
	OriginLanguage language.Language
	// OriginFilePath holds the path for the file containing the source code we want to transpile.
	// "TaskTranspile.Run" sets it to a path that starts with the "implementation" directory of the package.
	OriginFilePath string
}

// Ensure at compile time that "*TaskTranspile" implements the "evaltask.Task" interface.
var _ evaltask.Task = (*TaskTranspile)(nil)

// Identifier returns the transpilation task identifier, i.e. "IdentifierTranspile".
func (t *TaskTranspile) Identifier() evaltask.Identifier {
	return IdentifierTranspile
}

// Run transpiles code between languages and runs predefined tests to check if the transpilation was successful.
// Every non-hidden directory directly inside the repository's data path is treated as one package to transpile.
// The result holds two assessments: one for the plain model response and one for the response after an optional "symflower fix" repair.
// Problems that concern a single package are collected in "problems", while a malformed package or a missing response time measurement aborts the whole run with an error.
func (t *TaskTranspile) Run(ctx evaltask.Context) (repositoryAssessment map[evaltask.Identifier]metrics.Assessments, problems []error, err error) {
	transpiler, supported := ctx.Model.(model.CapabilityTranspile)
	if !supported {
		message := fmt.Sprintf("%q does not support %q", ctx.Model.ID(), string(t.Identifier()))

		return nil, nil, pkgerrors.Wrap(evaltask.ErrTaskUnsupportedByModel, message)
	}

	taskLogger, err := newTaskLogger(ctx, t)
	if err != nil {
		return nil, nil, err
	}
	// The closure reads the named result, so the logger is finalized with whatever problems are returned.
	defer func() {
		taskLogger.finalize(problems)
	}()

	// Collect the package directories of the repository.
	entries, err := os.ReadDir(ctx.Repository.DataPath())
	if err != nil {
		return nil, nil, pkgerrors.WithStack(err)
	}
	var packagePaths []string
	for _, entry := range entries {
		if !entry.IsDir() || strings.HasPrefix(entry.Name(), ".") { // Only directories that are not hidden are packages.
			continue
		}

		packagePaths = append(packagePaths, entry.Name())
	}

	reachableFiles := uint64(len(packagePaths))
	modelTotal := metrics.NewAssessments()
	fixedTotal := metrics.NewAssessments()
	modelTotal[metrics.AssessmentKeyFilesExecutedMaximumReachable] = reachableFiles
	fixedTotal[metrics.AssessmentKeyFilesExecutedMaximumReachable] = reachableFiles

	for _, packagePath := range packagePaths {
		// Both names refer to the same assessments until "symflower fix" actually improves a failing result.
		forFile := metrics.NewAssessments()
		forFileFixed := forFile

		if err := ctx.Repository.Reset(ctx.Logger); err != nil {
			ctx.Logger.Panicf("ERROR: unable to reset temporary repository path: %s", err)
		}

		// The origin is Java when the target language is Go, and Go for every other target language.
		var originLanguage language.Language
		switch ctx.Language.(type) {
		case *golang.Language:
			originLanguage = &java.Language{}
		default:
			originLanguage = &golang.Language{}
		}

		originFilePath, stubFilePath, err := t.unpackTranspilerPackage(ctx, taskLogger.Logger, originLanguage, packagePath)
		if err != nil {
			return nil, nil, err
		}

		// packageDirectory resolves the directory of the current package inside the repository.
		packageDirectory := func() string {
			return filepath.Join(ctx.Repository.DataPath(), packagePath)
		}

		request := model.Context{
			Language: ctx.Language,

			RepositoryPath: packageDirectory(),
			FilePath:       stubFilePath,

			Arguments: &TaskArgumentsTranspile{
				OriginLanguage: originLanguage,
				OriginFilePath: originFilePath,
			},

			Logger: taskLogger.Logger,
		}
		assessments, err := transpiler.Transpile(request)
		if err != nil {
			problems = append(problems, pkgerrors.WithMessage(err, originFilePath))

			continue
		}
		if assessments[metrics.AssessmentKeyProcessingTime] == 0 {
			return nil, nil, pkgerrors.Errorf("no model response time measurement present for %q at repository %q", ctx.Model.ID(), ctx.Repository.Name())
		}
		forFile.Add(assessments)
		forFile.Award(metrics.AssessmentKeyResponseNoError)

		testResult, ps, err := ctx.Language.ExecuteTests(taskLogger.Logger, packageDirectory())
		problems = append(problems, ps...)
		if err == nil {
			passingPercentage := testResult.PassingTestsPercentage()
			taskLogger.Printf("Executes tests with %d percent tests passing", passingPercentage)
			forFile.Award(metrics.AssessmentKeyFilesExecuted)
			forFile.AwardPoints(metrics.AssessmentKeyTestsPassing, uint64(passingPercentage))
		} else {
			problems = append(problems, pkgerrors.WithMessage(err, originFilePath))

			// Run "symflower fix" if the model response fails to execute, but not for an execution timeout because then the code itself is correct.
			// Currently we only support Go for "symflower fix".
			if !errors.Is(err, context.DeadlineExceeded) && ctx.Language.ID() == "golang" {
				fixedTestResult, processingTime, fixProblems, fixErr := ExecuteWithSymflowerFix(ctx, taskLogger.Logger, packageDirectory())
				problems = append(problems, fixProblems...)
				if fixErr != nil {
					problems = append(problems, fixErr)
				} else {
					passingPercentage := fixedTestResult.PassingTestsPercentage()
					taskLogger.Printf("Executes tests with %d percent tests passing after \"symflower fix\"", passingPercentage)

					// Symflower was able to fix a failure so now update the assessment with the improved results.
					fixAssessments := metrics.NewAssessments()
					fixAssessments[metrics.AssessmentKeyProcessingTime] = processingTime
					fixAssessments.Award(metrics.AssessmentKeyFilesExecuted)
					fixAssessments.AwardPoints(metrics.AssessmentKeyTestsPassing, uint64(passingPercentage))

					forFileFixed = metrics.CombineWithSymflowerFixAssessments(forFile, fixAssessments)
				}
			}
		}

		modelTotal.Add(forFile)
		fixedTotal.Add(forFileFixed)
	}

	return map[evaltask.Identifier]metrics.Assessments{
		IdentifierTranspile:             modelTotal,
		IdentifierTranspileSymflowerFix: fixedTotal,
	}, problems, nil
}

// unpackTranspilerPackage checks if the testdata repository for the transpilation task is well-formed and returns the path to the implementation file and also the path to the file that holds the stub.
// A well-formed package has an "implementation" directory for which the origin language reports exactly one file, and that file must not end in the origin language's default test file suffix.
// The returned origin file path is the reported file joined onto "implementation", and the stub file path is whatever "packageSourceFile" returns for the package directory and the target language.
// On any error both paths are returned empty.
func (t *TaskTranspile) unpackTranspilerPackage(ctx evaltask.Context, fileLogger *log.Logger, originLanguage language.Language, packagePath string) (originFilePath string, stubFilePath string, err error) {
	packagePathAbsolute := filepath.Join(ctx.Repository.DataPath(), packagePath)

	// Check if the package path has a directory called "implementation" with a source file in the language to transpile from.
	files, err := originLanguage.Files(fileLogger, filepath.Join(packagePathAbsolute, "implementation"))
	if err != nil {
		return "", "", pkgerrors.WithStack(err)
	}
	if len(files) != 1 {
		return "", "", pkgerrors.Errorf("package %q in repository %q must have an \"implementation\" directory with just one %s source file to transpile", packagePath, ctx.Repository.Name(), originLanguage.Name())
	}
	if strings.HasSuffix(files[0], originLanguage.DefaultTestFileSuffix()) {
		// Report the offending file itself: the named result "originFilePath" is not assigned yet at this point and would always print as an empty string.
		return "", "", pkgerrors.Errorf("package %q in repository %q must have an \"implementation\" directory with only a %s source file, but found a test file %q", packagePath, ctx.Repository.Name(), originLanguage.Name(), files[0])
	}
	originFilePath = filepath.Join("implementation", files[0])

	stubFilePath, err = packageSourceFile(fileLogger, packagePathAbsolute, ctx.Language)
	if err != nil {
		return "", "", err
	}

	return originFilePath, stubFilePath, nil
}
Loading

0 comments on commit eb2a51d

Please sign in to comment.