Skip to content

Commit

Permalink
Merge pull request #337 from symflower/track-runs-in-csv
Browse files (browse the repository at this point in the history)
Write run count to CSV report
  • Loading branch information
zimmski authored Oct 2, 2024
2 parents 8433021 + 1fb8cdf commit b3f4415
Show file tree
Hide file tree
Showing 18 changed files with 5,831 additions and 5,406 deletions.
10 changes: 9 additions & 1 deletion cmd/eval-dev-quality/cmd/evaluate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ type extractMetricsMatch *regexp.Regexp
var extractMetricsLogsMatch = extractMetricsMatch(regexp.MustCompile(`score=(\d+), coverage=(\d+), files-executed=(\d+), files-executed-maximum-reachable=(\d+), generate-tests-for-file-character-count=(\d+), processing-time=(\d+), response-character-count=(\d+), response-no-error=(\d+), response-no-excess=(\d+), response-with-code=(\d+)`))

// extractMetricsCSVMatch is a regular expression to extract metrics from CSV rows (the leading run-count column is matched but not captured).
var extractMetricsCSVMatch = extractMetricsMatch(regexp.MustCompile(`(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+)`))
var extractMetricsCSVMatch = extractMetricsMatch(regexp.MustCompile(`\d+,(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+)`))

// extractMetrics extracts multiple assessment metrics from the given string according to a given regular expression.
func extractMetrics(t *testing.T, regex extractMetricsMatch, data string) (assessments []metrics.Assessments, scores []uint64) {
Expand Down Expand Up @@ -643,6 +643,14 @@ func TestEvaluateExecute(t *testing.T) {
filepath.Join("result-directory", "categories.svg"): nil,
filepath.Join("result-directory", "config.json"): nil,
filepath.Join("result-directory", "evaluation.csv"): func(t *testing.T, filePath, data string) {
// Check if the runs are written to the CSV file.
assert.Contains(t, data, "golang,"+filepath.Join("golang", "plain")+",write-tests,1")
assert.Contains(t, data, "golang,"+filepath.Join("golang", "plain")+",write-tests,2")
assert.Contains(t, data, "golang,"+filepath.Join("golang", "plain")+",write-tests,3")
assert.Contains(t, data, "golang,"+filepath.Join("golang", "plain")+",write-tests-symflower-fix,1")
assert.Contains(t, data, "golang,"+filepath.Join("golang", "plain")+",write-tests-symflower-fix,2")
assert.Contains(t, data, "golang,"+filepath.Join("golang", "plain")+",write-tests-symflower-fix,3")

actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{
metrics.Assessments{
metrics.AssessmentKeyCoverage: 10,
Expand Down
24 changes: 12 additions & 12 deletions cmd/eval-dev-quality/cmd/report_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,24 +17,24 @@ import (
)

var claudeEvaluationCSVFileContent = bytesutil.StringTrimIndentations(`
openrouter/anthropic/claude-2.0,golang,golang/light,write-tests,1,1,1,1,1,1,1,1,1,1,1
openrouter/anthropic/claude-2.0,golang,golang/plain,write-tests,2,2,2,2,2,2,2,2,2,2,2
openrouter/anthropic/claude-2.0,java,java/light,write-tests,3,3,3,3,3,3,3,3,3,3,3
openrouter/anthropic/claude-2.0,java,java/plain,write-tests,4,4,4,4,4,4,4,4,4,4,4
openrouter/anthropic/claude-2.0,golang,golang/light,write-tests,1,1,1,1,1,1,1,1,1,1,1,1
openrouter/anthropic/claude-2.0,golang,golang/plain,write-tests,1,2,2,2,2,2,2,2,2,2,2,2
openrouter/anthropic/claude-2.0,java,java/light,write-tests,1,3,3,3,3,3,3,3,3,3,3,3
openrouter/anthropic/claude-2.0,java,java/plain,write-tests,1,4,4,4,4,4,4,4,4,4,4,4
`)

var gemmaEvaluationCSVFileContent = bytesutil.StringTrimIndentations(`
openrouter/google/gemma-7b-it,golang,golang/light,write-tests,5,5,5,5,5,5,5,5,5,5,5
openrouter/google/gemma-7b-it,golang,golang/plain,write-tests,6,6,6,6,6,6,6,6,6,6,6
openrouter/google/gemma-7b-it,java,java/light,write-tests,7,7,7,7,7,7,7,7,7,7,7
openrouter/google/gemma-7b-it,java,java/plain,write-tests,8,8,8,8,8,8,8,8,8,8,8
openrouter/google/gemma-7b-it,golang,golang/light,write-tests,1,5,5,5,5,5,5,5,5,5,5,5
openrouter/google/gemma-7b-it,golang,golang/plain,write-tests,1,6,6,6,6,6,6,6,6,6,6,6
openrouter/google/gemma-7b-it,java,java/light,write-tests,1,7,7,7,7,7,7,7,7,7,7,7
openrouter/google/gemma-7b-it,java,java/plain,write-tests,1,8,8,8,8,8,8,8,8,8,8,8
`)

var gpt4EvaluationCSVFileContent = bytesutil.StringTrimIndentations(`
openrouter/openai/gpt-4,golang,golang/light,write-tests,9,9,9,9,9,9,9,9,9,9,9
openrouter/openai/gpt-4,golang,golang/plain,write-tests,10,10,10,10,10,10,10,10,10,10,10
openrouter/openai/gpt-4,java,java/light,write-tests,11,11,11,11,11,11,11,11,11,11,11
openrouter/openai/gpt-4,java,java/plain,write-tests,12,12,12,12,12,12,12,12,12,12,12
openrouter/openai/gpt-4,golang,golang/light,write-tests,1,9,9,9,9,9,9,9,9,9,9,9
openrouter/openai/gpt-4,golang,golang/plain,write-tests,1,10,10,10,10,10,10,10,10,10,10,10
openrouter/openai/gpt-4,java,java/light,write-tests,1,11,11,11,11,11,11,11,11,11,11,11
openrouter/openai/gpt-4,java,java/plain,write-tests,1,12,12,12,12,12,12,12,12,12,12,12
`)

// validateMarkdownLinks checks if the Markdown report data contains all the links to other relevant report files.
Expand Down
2 changes: 2 additions & 0 deletions docs/reports/v0.6/evaluation-by-language-score.csv
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ openrouter/nousresearch/nous-hermes-yi-34b,8206.0,3274.0,4919.0,13.0
openrouter/openai/gpt-4-turbo,38422.0,7213.0,18170.0,13039.0
openrouter/openai/gpt-4o,39005.0,6656.0,17900.0,14449.0
openrouter/openai/gpt-4o-mini,39441.0,7182.0,17859.0,14400.0
openrouter/openai/o1-mini,40089.0,8058.0,18099.0,13932.0
openrouter/openai/o1-preview,40806.0,8009.0,18119.0,14678.0
openrouter/openchat/openchat-8b,5376.0,3198.0,2167.0,11.0
openrouter/perplexity/llama-3-sonar-large-32k-chat,35051.0,6853.0,15929.0,12269.0
openrouter/perplexity/llama-3-sonar-small-32k-chat,7338.0,977.0,2167.0,4194.0
Expand Down
6 changes: 6 additions & 0 deletions docs/reports/v0.6/evaluation-by-language.csv
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,12 @@ openrouter/openai/gpt-4o,ruby,14449.0,11520.0,174.0,175.0,108779.0,620681.0,1108
openrouter/openai/gpt-4o-mini,golang,7182.0,3610.0,157.0,175.0,101802.0,721566.0,103576.0,175.0,175.0,175.0,2890.0
openrouter/openai/gpt-4o-mini,java,17859.0,14820.0,174.0,175.0,172655.0,853504.0,174755.0,175.0,175.0,175.0,2340.0
openrouter/openai/gpt-4o-mini,ruby,14400.0,11350.0,175.0,175.0,123933.0,638322.0,126033.0,175.0,175.0,175.0,2350.0
openrouter/openai/o1-mini,golang,8058.0,4470.0,173.0,175.0,137214.0,1601540.0,139456.0,175.0,175.0,175.0,2890.0
openrouter/openai/o1-mini,java,18099.0,15060.0,174.0,175.0,170329.0,1360139.0,172429.0,175.0,175.0,175.0,2340.0
openrouter/openai/o1-mini,ruby,13932.0,11130.0,167.0,175.0,121220.0,1600540.0,123320.0,175.0,175.0,175.0,2110.0
openrouter/openai/o1-preview,golang,8009.0,4420.0,174.0,175.0,143127.0,4178626.0,145361.0,175.0,175.0,175.0,2890.0
openrouter/openai/o1-preview,java,18119.0,15080.0,174.0,175.0,204023.0,4149955.0,206123.0,175.0,175.0,175.0,2340.0
openrouter/openai/o1-preview,ruby,14678.0,11710.0,173.0,175.0,135021.0,3891526.0,137121.0,175.0,175.0,175.0,2270.0
openrouter/openchat/openchat-8b,golang,3198.0,1030.0,69.0,175.0,95117.0,639275.0,116099.0,173.0,87.0,169.0,1670.0
openrouter/openchat/openchat-8b,java,2167.0,0.0,46.0,55.0,21833.0,129744.0,26831.0,54.0,14.0,53.0,2000.0
openrouter/openchat/openchat-8b,ruby,11.0,0.0,0.0,5.0,1113.0,11076.0,2327.0,5.0,1.0,5.0,0.0
Expand Down
2 changes: 2 additions & 0 deletions docs/reports/v0.6/evaluation-by-model.csv
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ openrouter/nousresearch/nous-hermes-yi-34b,Nous: Hermes 2 Yi 34B,1.44e-06,8206.0
openrouter/openai/gpt-4-turbo,OpenAI: GPT-4 Turbo,4e-05,38422.0,30050.0,507.0,525.0,370196.0,4884796.0,376394.0,525.0,525.0,525.0,6290.0
openrouter/openai/gpt-4o,OpenAI: GPT-4o,2e-05,39005.0,29520.0,493.0,524.0,347236.0,2230436.0,353722.0,524.0,524.0,524.0,7420.0
openrouter/openai/gpt-4o-mini,OpenAI: GPT-4o-mini,7.5e-07,39441.0,29780.0,506.0,525.0,398390.0,2213392.0,404364.0,525.0,525.0,525.0,7580.0
openrouter/openai/o1-mini,,,40089.0,30660.0,514.0,525.0,428763.0,4562219.0,435205.0,525.0,525.0,525.0,7340.0
openrouter/openai/o1-preview,,,40806.0,31210.0,521.0,525.0,482171.0,12220107.0,488605.0,525.0,525.0,525.0,7500.0
openrouter/openchat/openchat-8b,OpenChat 3.6 8B,1.1e-07,5376.0,1030.0,115.0,235.0,118063.0,780095.0,145257.0,232.0,102.0,227.0,3670.0
openrouter/perplexity/llama-3-sonar-large-32k-chat,Perplexity: Llama3 Sonar 70B,2e-06,35051.0,26960.0,483.0,525.0,332540.0,3794568.0,336998.0,523.0,522.0,523.0,6040.0
openrouter/perplexity/llama-3-sonar-small-32k-chat,Perplexity: Llama3 Sonar 8B,4e-07,7338.0,2230.0,146.0,355.0,356068.0,1319498.0,391432.0,353.0,188.0,351.0,4070.0
Expand Down
2 changes: 2 additions & 0 deletions docs/reports/v0.6/evaluation-by-symflower-fix.csv
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ openrouter/nousresearch/nous-hermes-yi-34b,3064.0,3845.0,66.0,87.0
openrouter/openai/gpt-4-turbo,7003.0,7790.0,153.0,170.0
openrouter/openai/gpt-4o,6446.0,6895.0,139.0,148.0
openrouter/openai/gpt-4o-mini,6972.0,7707.0,152.0,167.0
openrouter/openai/o1-mini,7848.0,7899.0,168.0,169.0
openrouter/openai/o1-preview,7799.0,7799.0,169.0,169.0
openrouter/openchat/openchat-8b,3071.0,4574.0,66.0,99.0
openrouter/perplexity/llama-3-sonar-large-32k-chat,6643.0,7234.0,153.0,164.0
openrouter/perplexity/llama-3-sonar-small-32k-chat,768.0,1414.0,16.0,32.0
Expand Down
18 changes: 18 additions & 0 deletions docs/reports/v0.6/evaluation-by-task-by-language.csv
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,24 @@ openrouter/openai/gpt-4o-mini,transpile,ruby,2400.0,0.0,50.0,50.0,14099.0,128161
openrouter/openai/gpt-4o-mini,write-tests,golang,4072.0,3610.0,102.0,120.0,87076.0,604092.0,88276.0,120.0,120.0,120.0,0.0
openrouter/openai/gpt-4o-mini,write-tests,java,15299.0,14820.0,119.0,120.0,154516.0,715868.0,155956.0,120.0,120.0,120.0,0.0
openrouter/openai/gpt-4o-mini,write-tests,ruby,11830.0,11350.0,120.0,120.0,109215.0,501052.0,110655.0,120.0,120.0,120.0,0.0
openrouter/openai/o1-mini,code-repair,golang,210.0,0.0,5.0,5.0,638.0,19018.0,708.0,5.0,5.0,5.0,190.0
openrouter/openai/o1-mini,code-repair,java,160.0,0.0,5.0,5.0,997.0,18818.0,1057.0,5.0,5.0,5.0,140.0
openrouter/openai/o1-mini,code-repair,ruby,170.0,0.0,5.0,5.0,620.0,18691.0,680.0,5.0,5.0,5.0,150.0
openrouter/openai/o1-mini,transpile,golang,2900.0,0.0,50.0,50.0,13901.0,268383.0,14533.0,50.0,50.0,50.0,2700.0
openrouter/openai/o1-mini,transpile,java,2400.0,0.0,50.0,50.0,20245.0,235603.0,20845.0,50.0,50.0,50.0,2200.0
openrouter/openai/o1-mini,transpile,ruby,2158.0,0.0,48.0,50.0,15300.0,450293.0,15900.0,50.0,50.0,50.0,1960.0
openrouter/openai/o1-mini,write-tests,golang,4948.0,4470.0,118.0,120.0,122675.0,1314139.0,124215.0,120.0,120.0,120.0,0.0
openrouter/openai/o1-mini,write-tests,java,15539.0,15060.0,119.0,120.0,149087.0,1105718.0,150527.0,120.0,120.0,120.0,0.0
openrouter/openai/o1-mini,write-tests,ruby,11604.0,11130.0,114.0,120.0,105300.0,1131556.0,106740.0,120.0,120.0,120.0,0.0
openrouter/openai/o1-preview,code-repair,golang,210.0,0.0,5.0,5.0,638.0,40148.0,704.0,5.0,5.0,5.0,190.0
openrouter/openai/o1-preview,code-repair,java,160.0,0.0,5.0,5.0,1144.0,46459.0,1204.0,5.0,5.0,5.0,140.0
openrouter/openai/o1-preview,code-repair,ruby,170.0,0.0,5.0,5.0,605.0,38370.0,665.0,5.0,5.0,5.0,150.0
openrouter/openai/o1-preview,transpile,golang,2900.0,0.0,50.0,50.0,14603.0,664631.0,15199.0,50.0,50.0,50.0,2700.0
openrouter/openai/o1-preview,transpile,java,2400.0,0.0,50.0,50.0,20103.0,730686.0,20703.0,50.0,50.0,50.0,2200.0
openrouter/openai/o1-preview,transpile,ruby,2320.0,0.0,50.0,50.0,14325.0,873917.0,14925.0,50.0,50.0,50.0,2120.0
openrouter/openai/o1-preview,write-tests,golang,4899.0,4420.0,119.0,120.0,127886.0,3473847.0,129458.0,120.0,120.0,120.0,0.0
openrouter/openai/o1-preview,write-tests,java,15559.0,15080.0,119.0,120.0,182776.0,3372810.0,184216.0,120.0,120.0,120.0,0.0
openrouter/openai/o1-preview,write-tests,ruby,12188.0,11710.0,118.0,120.0,120091.0,2979239.0,121531.0,120.0,120.0,120.0,0.0
openrouter/openchat/openchat-8b,code-repair,golang,127.0,0.0,3.0,5.0,555.0,8171.0,1469.0,5.0,4.0,5.0,110.0
openrouter/openchat/openchat-8b,transpile,golang,1708.0,0.0,31.0,50.0,14404.0,100334.0,16674.0,49.0,22.0,46.0,1560.0
openrouter/openchat/openchat-8b,transpile,java,2155.0,0.0,46.0,50.0,19325.0,115551.0,23075.0,49.0,12.0,48.0,2000.0
Expand Down
2 changes: 2 additions & 0 deletions docs/reports/v0.6/evaluation-by-task-score.csv
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ openrouter/nousresearch/nous-hermes-yi-34b,8206.0,4474.0,3394.0,338.0
openrouter/openai/gpt-4-turbo,38422.0,31472.0,6410.0,540.0
openrouter/openai/gpt-4o,39005.0,30926.0,7539.0,540.0
openrouter/openai/gpt-4o-mini,39441.0,31201.0,7700.0,540.0
openrouter/openai/o1-mini,40089.0,32091.0,7458.0,540.0
openrouter/openai/o1-preview,40806.0,32646.0,7620.0,540.0
openrouter/openchat/openchat-8b,5376.0,1386.0,3863.0,127.0
openrouter/perplexity/llama-3-sonar-large-32k-chat,35051.0,28366.0,6179.0,506.0
openrouter/perplexity/llama-3-sonar-small-32k-chat,7338.0,2856.0,4165.0,317.0
Expand Down
6 changes: 6 additions & 0 deletions docs/reports/v0.6/evaluation-by-task.csv
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,12 @@ openrouter/openai/gpt-4o,write-tests,30926.0,29520.0,329.0,359.0,296375.0,185876
openrouter/openai/gpt-4o-mini,code-repair,540.0,0.0,15.0,15.0,2200.0,25907.0,2390.0,15.0,15.0,15.0,480.0
openrouter/openai/gpt-4o-mini,transpile,7700.0,0.0,150.0,150.0,45383.0,366473.0,47087.0,150.0,150.0,150.0,7100.0
openrouter/openai/gpt-4o-mini,write-tests,31201.0,29780.0,341.0,360.0,350807.0,1821012.0,354887.0,360.0,360.0,360.0,0.0
openrouter/openai/o1-mini,code-repair,540.0,0.0,15.0,15.0,2255.0,56527.0,2445.0,15.0,15.0,15.0,480.0
openrouter/openai/o1-mini,transpile,7458.0,0.0,148.0,150.0,49446.0,954279.0,51278.0,150.0,150.0,150.0,6860.0
openrouter/openai/o1-mini,write-tests,32091.0,30660.0,351.0,360.0,377062.0,3551413.0,381482.0,360.0,360.0,360.0,0.0
openrouter/openai/o1-preview,code-repair,540.0,0.0,15.0,15.0,2387.0,124977.0,2573.0,15.0,15.0,15.0,480.0
openrouter/openai/o1-preview,transpile,7620.0,0.0,150.0,150.0,49031.0,2269234.0,50827.0,150.0,150.0,150.0,7020.0
openrouter/openai/o1-preview,write-tests,32646.0,31210.0,356.0,360.0,430753.0,9825896.0,435205.0,360.0,360.0,360.0,0.0
openrouter/openchat/openchat-8b,code-repair,127.0,0.0,3.0,5.0,555.0,8171.0,1469.0,5.0,4.0,5.0,110.0
openrouter/openchat/openchat-8b,transpile,3863.0,0.0,77.0,100.0,33729.0,215885.0,39749.0,98.0,34.0,94.0,3560.0
openrouter/openchat/openchat-8b,write-tests,1386.0,1030.0,35.0,130.0,83779.0,556039.0,104039.0,129.0,64.0,128.0,0.0
Expand Down
30 changes: 30 additions & 0 deletions docs/reports/v0.6/evaluation-cases.csv
Original file line number Diff line number Diff line change
Expand Up @@ -731,6 +731,36 @@ openrouter/openai/gpt-4o-mini,write-tests,ruby,120.0
openrouter/openai/gpt-4o-mini,write-tests-symflower-fix,golang,120.0
openrouter/openai/gpt-4o-mini,write-tests-symflower-fix,java,120.0
openrouter/openai/gpt-4o-mini,write-tests-symflower-fix,ruby,120.0
openrouter/openai/o1-mini,code-repair,golang,5.0
openrouter/openai/o1-mini,code-repair,java,5.0
openrouter/openai/o1-mini,code-repair,ruby,5.0
openrouter/openai/o1-mini,transpile,golang,50.0
openrouter/openai/o1-mini,transpile,java,50.0
openrouter/openai/o1-mini,transpile,ruby,50.0
openrouter/openai/o1-mini,transpile-symflower-fix,golang,50.0
openrouter/openai/o1-mini,transpile-symflower-fix,java,50.0
openrouter/openai/o1-mini,transpile-symflower-fix,ruby,50.0
openrouter/openai/o1-mini,write-tests,golang,120.0
openrouter/openai/o1-mini,write-tests,java,120.0
openrouter/openai/o1-mini,write-tests,ruby,120.0
openrouter/openai/o1-mini,write-tests-symflower-fix,golang,120.0
openrouter/openai/o1-mini,write-tests-symflower-fix,java,120.0
openrouter/openai/o1-mini,write-tests-symflower-fix,ruby,120.0
openrouter/openai/o1-preview,code-repair,golang,5.0
openrouter/openai/o1-preview,code-repair,java,5.0
openrouter/openai/o1-preview,code-repair,ruby,5.0
openrouter/openai/o1-preview,transpile,golang,50.0
openrouter/openai/o1-preview,transpile,java,50.0
openrouter/openai/o1-preview,transpile,ruby,50.0
openrouter/openai/o1-preview,transpile-symflower-fix,golang,50.0
openrouter/openai/o1-preview,transpile-symflower-fix,java,50.0
openrouter/openai/o1-preview,transpile-symflower-fix,ruby,50.0
openrouter/openai/o1-preview,write-tests,golang,120.0
openrouter/openai/o1-preview,write-tests,java,120.0
openrouter/openai/o1-preview,write-tests,ruby,120.0
openrouter/openai/o1-preview,write-tests-symflower-fix,golang,120.0
openrouter/openai/o1-preview,write-tests-symflower-fix,java,120.0
openrouter/openai/o1-preview,write-tests-symflower-fix,ruby,120.0
openrouter/openchat/openchat-8b,code-repair,golang,5.0
openrouter/openchat/openchat-8b,transpile,golang,50.0
openrouter/openchat/openchat-8b,transpile,java,50.0
Expand Down
Loading

0 comments on commit b3f4415

Please sign in to comment.