Task for code transpilation, so models can transpile Go code to Java and back

Closes #201
symflower · Jul 18, 2024 · 2be1f87 · 2be1f87
1 parent 0a2a04f
commit 2be1f87
Show file tree

Hide file tree

Showing 14 changed files with 1,000 additions and 21 deletions.
diff --git a/.mockery.yml b/.mockery.yml
@@ -15,6 +15,7 @@ packages:
       Model:
       CapabilityWriteTests:
       CapabilityRepairCode:
+      CapabilityTranspile:
   github.com/symflower/eval-dev-quality/provider:
     interfaces:
       Loader:

diff --git a/evaluate/metrics/assessment.go b/evaluate/metrics/assessment.go
@@ -44,6 +44,9 @@ var (
 	// AssessmentKeyCoverage counts execution coverage objects.
 	AssessmentKeyCoverage = RegisterAssessmentKey("coverage", 10)
 
+	// AssessmentKeyTestsPassing holds the percentage of passing tests.
+	AssessmentKeyTestsPassing = RegisterAssessmentKey("tests-passing", 10)
+
 	// AssessmentKeyResponseCharacterCount counts the number of characters of a response.
 	AssessmentKeyResponseCharacterCount = RegisterAssessmentKey("response-character-count", 0)
 	// AssessmentKeyGenerateTestsForFileCharacterCount counts the number of characters of a generated test file.
@@ -167,6 +170,7 @@ func CombineWithSymflowerFixAssessments(model Assessments, fixed Assessments) (c
 	combined[AssessmentKeyResponseNoError] = model[AssessmentKeyResponseNoError]
 	combined[AssessmentKeyResponseNoExcess] = model[AssessmentKeyResponseNoExcess]
 	combined[AssessmentKeyResponseWithCode] = model[AssessmentKeyResponseWithCode]
+	combined[AssessmentKeyTestsPassing] = fixed[AssessmentKeyTestsPassing]
 
 	return combined
 }
diff --git a/evaluate/metrics/assessment_test.go b/evaluate/metrics/assessment_test.go
@@ -137,7 +137,7 @@ func TestAssessmentString(t *testing.T) {
 
 		Assessment: NewAssessments(),
 
-		ExpectedString: "score=0, coverage=0, files-executed=0, files-executed-maximum-reachable=0, generate-tests-for-file-character-count=0, processing-time=0, response-character-count=0, response-no-error=0, response-no-excess=0, response-with-code=0",
+		ExpectedString: "score=0, coverage=0, files-executed=0, files-executed-maximum-reachable=0, generate-tests-for-file-character-count=0, processing-time=0, response-character-count=0, response-no-error=0, response-no-excess=0, response-with-code=0, tests-passing=0",
 	})
 
 	validate(t, &testCase{
@@ -153,9 +153,10 @@ func TestAssessmentString(t *testing.T) {
 			AssessmentKeyResponseNoExcess:                   4,
 			AssessmentKeyResponseWithCode:                   5,
 			AssessmentKeyProcessingTime:                     200,
+			AssessmentKeyTestsPassing:                       70,
 		},
 
-		ExpectedString: "score=15, coverage=1, files-executed=2, files-executed-maximum-reachable=2, generate-tests-for-file-character-count=50, processing-time=200, response-character-count=100, response-no-error=3, response-no-excess=4, response-with-code=5",
+		ExpectedString: "score=85, coverage=1, files-executed=2, files-executed-maximum-reachable=2, generate-tests-for-file-character-count=50, processing-time=200, response-character-count=100, response-no-error=3, response-no-excess=4, response-with-code=5, tests-passing=70",
 	})
 }
 
@@ -310,6 +311,7 @@ func TestCombineModelAndSymflowerFixAssessments(t *testing.T) {
 			AssessmentKeyProcessingTime:  uint64(100),
 			AssessmentKeyCoverage:        10,
 			AssessmentKeyResponseNoError: 1,
+			AssessmentKeyTestsPassing:    100,
 		},
 
 		ExpectedAssessments: Assessments{
@@ -321,6 +323,7 @@ func TestCombineModelAndSymflowerFixAssessments(t *testing.T) {
 			AssessmentKeyResponseNoError:                    0,
 			AssessmentKeyResponseWithCode:                   1,
 			AssessmentKeyResponseNoExcess:                   1,
+			AssessmentKeyTestsPassing:                       100,
 		},
 	})
 }
diff --git a/evaluate/report/csv_test.go b/evaluate/report/csv_test.go
@@ -27,7 +27,7 @@ func TestNewEvaluationFile(t *testing.T) {
 	require.NoError(t, err)
 
 	expectedEvaluationFileContent := bytesutil.StringTrimIndentations(`
-		model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code
+		model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing
 	`)
 
 	assert.Equal(t, expectedEvaluationFileContent, string(actualEvaluationFileContent))
@@ -66,8 +66,8 @@ func TestWriteEvaluationRecord(t *testing.T) {
 		},
 
 		ExpectedCSV: `
-			model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code
-			mocked-model,golang,golang/plain,write-tests,0,0,0,0,0,0,0,0,0,0
+			model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing
+			mocked-model,golang,golang/plain,write-tests,0,0,0,0,0,0,0,0,0,0,0
 		`,
 	})
 	validate(t, &testCase{
@@ -89,9 +89,9 @@ func TestWriteEvaluationRecord(t *testing.T) {
 		},
 
 		ExpectedCSV: `
-			model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code
-			mocked-model,golang,golang/plain,write-tests,2,0,1,1,0,0,0,1,0,0
-			mocked-model,golang,golang/plain,write-tests-symflower-fix,12,10,1,1,0,0,0,1,0,0
+			model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing
+			mocked-model,golang,golang/plain,write-tests,2,0,1,1,0,0,0,1,0,0,0
+			mocked-model,golang,golang/plain,write-tests-symflower-fix,12,10,1,1,0,0,0,1,0,0,0
 		`,
 	})
 }
@@ -224,37 +224,37 @@ func TestEvaluationFileWriteLines(t *testing.T) {
 		Name: "No records",
 
 		ExpectedEvaluationFile: `
-			model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code
+			model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing
 		`,
 	})
 	validate(t, &testCase{
 		Name: "Single record",
 
 		RawRecords: [][]string{
-			[]string{"modelA", "golang", "golang/light", "write-tests", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"},
+			[]string{"modelA", "golang", "golang/light", "write-tests", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"},
 		},
 
 		ExpectedEvaluationFile: `
-			model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code
-			modelA,golang,golang/light,write-tests,1,1,1,1,1,1,1,1,1,1
+			model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing
+			modelA,golang,golang/light,write-tests,1,1,1,1,1,1,1,1,1,1,1
 		`,
 	})
 	validate(t, &testCase{
 		Name: "Multiple records",
 
 		RawRecords: [][]string{
-			[]string{"modelA", "golang", "golang/light", "write-tests", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"},
-			[]string{"modelA", "golang", "golang/plain", "write-tests", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2"},
-			[]string{"modelA", "java", "java/light", "write-tests", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3"},
-			[]string{"modelA", "java", "java/plain", "write-tests", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4"},
+			[]string{"modelA", "golang", "golang/light", "write-tests", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"},
+			[]string{"modelA", "golang", "golang/plain", "write-tests", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2"},
+			[]string{"modelA", "java", "java/light", "write-tests", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3"},
+			[]string{"modelA", "java", "java/plain", "write-tests", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4"},
 		},
 
 		ExpectedEvaluationFile: `
-			model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code
-			modelA,golang,golang/light,write-tests,1,1,1,1,1,1,1,1,1,1
-			modelA,golang,golang/plain,write-tests,2,2,2,2,2,2,2,2,2,2
-			modelA,java,java/light,write-tests,3,3,3,3,3,3,3,3,3,3
-			modelA,java,java/plain,write-tests,4,4,4,4,4,4,4,4,4,4
+			model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code,tests-passing
+			modelA,golang,golang/light,write-tests,1,1,1,1,1,1,1,1,1,1,1
+			modelA,golang,golang/plain,write-tests,2,2,2,2,2,2,2,2,2,2,2
+			modelA,java,java/light,write-tests,3,3,3,3,3,3,3,3,3,3,3
+			modelA,java,java/plain,write-tests,4,4,4,4,4,4,4,4,4,4,4
 		`,
 	})
 }

diff --git a/evaluate/task/task-transpile.go b/evaluate/task/task-transpile.go
@@ -0,0 +1,193 @@
+package task
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+
+	pkgerrors "github.com/pkg/errors"
+	"github.com/symflower/eval-dev-quality/evaluate/metrics"
+	"github.com/symflower/eval-dev-quality/language"
+	"github.com/symflower/eval-dev-quality/language/golang"
+	"github.com/symflower/eval-dev-quality/language/java"
+	"github.com/symflower/eval-dev-quality/log"
+	"github.com/symflower/eval-dev-quality/model"
+	evaltask "github.com/symflower/eval-dev-quality/task"
+)
+
+// TaskTranspile holds the transpilation task.
+// The type has no fields, everything a run needs is handed over through the task context.
+type TaskTranspile struct{}
+
+// TaskArgumentsTranspile holds extra arguments to be used in a query prompt.
+// The transpilation task passes it to the model as the "Arguments" of the model context.
+type TaskArgumentsTranspile struct {
+	// OriginLanguage holds the language we are transpiling from.
+	OriginLanguage language.Language
+	// OriginFilePath holds the path for the file containing the source code we want to transpile.
+	// The transpilation task sets it to a path that starts with the "implementation" directory, i.e. relative to the package directory.
+	OriginFilePath string
+}
+
+// Ensure at compile time that "TaskTranspile" implements the task interface.
+var _ evaltask.Task = (*TaskTranspile)(nil)
+
+// Identifier returns the transpilation task identifier, which is always "IdentifierTranspile".
+func (t *TaskTranspile) Identifier() evaltask.Identifier {
+	return IdentifierTranspile
+}
+
+// Run transpiles code between languages and runs predefined tests to check if the transpilation was successful.
+// Every non-hidden directory of the repository is handled as one package to transpile. The result holds the assessments of the plain model responses and the assessments after additionally applying "symflower fix". Failures of single packages are collected as problems, while a returned error aborts the whole task.
+func (t *TaskTranspile) Run(ctx evaltask.Context) (repositoryAssessment map[evaltask.Identifier]metrics.Assessments, problems []error, err error) {
+	modelCapability, ok := ctx.Model.(model.CapabilityTranspile)
+	if !ok {
+		return nil, nil, pkgerrors.Wrap(evaltask.ErrTaskUnsupportedByModel, fmt.Sprintf("%q does not support %q", ctx.Model.ID(), string(t.Identifier())))
+	}
+
+	taskLogger, err := newTaskLogger(ctx, t)
+	if err != nil {
+		return nil, nil, err
+	}
+	defer func() {
+		taskLogger.finalize(problems)
+	}()
+
+	var packagePaths []string
+	files, err := os.ReadDir(ctx.Repository.DataPath())
+	if err != nil {
+		return nil, nil, pkgerrors.WithStack(err)
+	}
+	for _, file := range files {
+		if file.IsDir() && !strings.HasPrefix(file.Name(), ".") { // Ignore hidden directories.
+			packagePaths = append(packagePaths, file.Name())
+		}
+	}
+
+	// passingPercentage returns the share of passing tests in percent. It multiplies before dividing because an integer division of passing by total tests truncates every partial result to zero, and it reports zero percent if there are no tests at all instead of dividing by zero.
+	passingPercentage := func(testsPass uint64, testsTotal uint64) uint64 {
+		if testsTotal == 0 {
+			return 0
+		}
+
+		return testsPass * 100 / testsTotal
+	}
+
+	modelAssessments := metrics.NewAssessments()
+	withSymflowerAssessments := metrics.NewAssessments()
+
+	maximumReachableFiles := uint64(len(packagePaths))
+	modelAssessments[metrics.AssessmentKeyFilesExecutedMaximumReachable] = maximumReachableFiles
+	withSymflowerAssessments[metrics.AssessmentKeyFilesExecutedMaximumReachable] = maximumReachableFiles
+
+	for _, packagePath := range packagePaths {
+		modelAssessmentsForFile := metrics.NewAssessments()
+		withSymflowerAssessmentsForFile := modelAssessmentsForFile // The symflower assessment tracks how the model result can be improved in case of a failure, so just link to the model assessment until a failure actually happens.
+
+		if err := ctx.Repository.Reset(ctx.Logger); err != nil {
+			ctx.Logger.Panicf("ERROR: unable to reset temporary repository path: %s", err)
+		}
+		packagePathAbsolute := filepath.Join(ctx.Repository.DataPath(), packagePath)
+
+		// The origin language is the counterpart of the language we transpile to: Java when the target is Go, and Go otherwise.
+		var originLanguage language.Language
+		if _, ok := ctx.Language.(*golang.Language); ok {
+			originLanguage = &java.Language{}
+		} else {
+			originLanguage = &golang.Language{}
+		}
+
+		originFilePath, stubFilePath, err := t.unpackTranspilerPackage(ctx, taskLogger.Logger, originLanguage, packagePath)
+		if err != nil {
+			return nil, nil, err
+		}
+
+		modelContext := model.Context{
+			Language: ctx.Language,
+
+			RepositoryPath: packagePathAbsolute,
+			FilePath:       stubFilePath,
+
+			Arguments: &TaskArgumentsTranspile{
+				OriginLanguage: originLanguage,
+				OriginFilePath: originFilePath,
+			},
+
+			Logger: taskLogger.Logger,
+		}
+		assessments, err := modelCapability.Transpile(modelContext)
+		if err != nil {
+			problems = append(problems, pkgerrors.WithMessage(err, originFilePath))
+
+			continue
+		}
+		if assessments[metrics.AssessmentKeyProcessingTime] == 0 {
+			return nil, nil, pkgerrors.Errorf("no model response time measurement present for %q at repository %q", ctx.Model.ID(), ctx.Repository.Name())
+		}
+		modelAssessmentsForFile.Add(assessments)
+		modelAssessmentsForFile.Award(metrics.AssessmentKeyResponseNoError)
+
+		testResult, ps, err := ctx.Language.ExecuteTests(taskLogger.Logger, packagePathAbsolute)
+		problems = append(problems, ps...)
+		if err != nil {
+			problems = append(problems, pkgerrors.WithMessage(err, originFilePath))
+
+			// If there is an execution timeout do not run "symflower fix" because the code itself is correct.
+			if errors.Is(err, context.DeadlineExceeded) {
+				modelAssessments.Add(modelAssessmentsForFile)
+				withSymflowerAssessments.Add(withSymflowerAssessmentsForFile)
+
+				continue
+			}
+
+			// Run "symflower fix" if the model response fails to execute.
+			if ctx.Language.ID() == "golang" { // Currently we only support Go for "symflower fix".
+				withSymflowerFixTestResult, processingTime, ps, err := ExecuteWithSymflowerFix(ctx, taskLogger.Logger, packagePathAbsolute)
+				problems = append(problems, ps...)
+				if err != nil {
+					problems = append(problems, err)
+
+					modelAssessments.Add(modelAssessmentsForFile)
+					withSymflowerAssessments.Add(withSymflowerAssessmentsForFile)
+
+					continue
+				}
+
+				testsPassing := passingPercentage(uint64(withSymflowerFixTestResult.TestsPass), uint64(withSymflowerFixTestResult.TestsTotal))
+				taskLogger.Printf("Executes tests with %d percent tests passing after \"symflower fix\"", testsPassing)
+
+				// Symflower was able to fix a failure so now update the assessment with the improved results.
+				withSymflowerFixAssessments := metrics.NewAssessments()
+				withSymflowerFixAssessments[metrics.AssessmentKeyProcessingTime] = processingTime
+				withSymflowerFixAssessments.Award(metrics.AssessmentKeyFilesExecuted)
+				withSymflowerFixAssessments.AwardPoints(metrics.AssessmentKeyTestsPassing, testsPassing)
+
+				withSymflowerAssessmentsForFile = metrics.CombineWithSymflowerFixAssessments(modelAssessmentsForFile, withSymflowerFixAssessments)
+			}
+		} else {
+			testsPassing := passingPercentage(uint64(testResult.TestsPass), uint64(testResult.TestsTotal))
+			taskLogger.Printf("Executes tests with %d percent tests passing", testsPassing)
+			modelAssessmentsForFile.Award(metrics.AssessmentKeyFilesExecuted)
+			modelAssessmentsForFile.AwardPoints(metrics.AssessmentKeyTestsPassing, testsPassing)
+		}
+
+		modelAssessments.Add(modelAssessmentsForFile)
+		withSymflowerAssessments.Add(withSymflowerAssessmentsForFile)
+	}
+
+	repositoryAssessment = map[evaltask.Identifier]metrics.Assessments{
+		IdentifierTranspile:             modelAssessments,
+		IdentifierTranspileSymflowerFix: withSymflowerAssessments,
+	}
+
+	return repositoryAssessment, problems, nil
+}
+
+// unpackTranspilerPackage checks if the testdata repository for the transpilation task is well-formed and returns the path to the implementation file and also the path to the file that holds the stub.
+// The implementation file path is returned with the "implementation" directory as prefix, i.e. relative to the package directory. An error is returned if the "implementation" directory does not hold exactly one source file of the origin language, if that file is a test file, or if the stub file cannot be determined.
+func (t *TaskTranspile) unpackTranspilerPackage(ctx evaltask.Context, fileLogger *log.Logger, originLanguage language.Language, packagePath string) (originFilePath string, stubFilePath string, err error) {
+	packagePathAbsolute := filepath.Join(ctx.Repository.DataPath(), packagePath)
+	// Check if the package path has a directory called "implementation" with a source file in the language to transpile from.
+	files, err := originLanguage.Files(fileLogger, filepath.Join(packagePathAbsolute, "implementation"))
+	if err != nil {
+		return "", "", pkgerrors.WithStack(err)
+	}
+	if len(files) != 1 {
+		return "", "", pkgerrors.Errorf("package %q in repository %q must have an \"implementation\" directory with just one %s source file to transpile", packagePath, ctx.Repository.Name(), originLanguage.Name())
+	}
+	if strings.HasSuffix(files[0], originLanguage.DefaultTestFileSuffix()) {
+		// Report the file that was found, the origin file path result is not set yet at this point.
+		return "", "", pkgerrors.Errorf("package %q in repository %q must have an \"implementation\" directory with only a %s source file, but found a test file %q", packagePath, ctx.Repository.Name(), originLanguage.Name(), files[0])
+	}
+	originFilePath = filepath.Join("implementation", files[0])
+
+	stubFilePath, err = packageSourceFile(fileLogger, packagePathAbsolute, ctx.Language)
+	if err != nil {
+		return "", "", err
+	}
+
+	return originFilePath, stubFilePath, nil
+}