diff --git a/dev/.documenter-siteinfo.json b/dev/.documenter-siteinfo.json index ae1a6b4a3..a562fd335 100644 --- a/dev/.documenter-siteinfo.json +++ b/dev/.documenter-siteinfo.json @@ -1 +1 @@ -{"documenter":{"julia_version":"1.10.4","generation_timestamp":"2024-08-07T08:36:18","documenter_version":"1.5.0"}} \ No newline at end of file +{"documenter":{"julia_version":"1.10.4","generation_timestamp":"2024-08-12T20:24:34","documenter_version":"1.5.0"}} \ No newline at end of file diff --git a/dev/examples/compare_paid_vs_local/index.html b/dev/examples/compare_paid_vs_local/index.html index 92daf2f79..dc5d72e68 100644 --- a/dev/examples/compare_paid_vs_local/index.html +++ b/dev/examples/compare_paid_vs_local/index.html @@ -89,4 +89,4 @@ rename(_, names(_) .|> unscrub_string) end # markdown_table(output, String) |> clipboard -markdown_table(output)
+markdown_table(output)
| Model | Elapsed | Elapsed Median | Score | Score Median | Count Zero Score | Count Full Score | Is Paid |
| --- | --- | --- | --- | --- | --- | --- | --- |
| claude-3-5-sonnet-20240620 | 6.3 | 6.3 | 86.0 | 100.0 | 5.0 | 179.0 | true |
| claude-3-opus-20240229 | 20.5 | 20.5 | 83.0 | 90.0 | 1.0 | 161.0 | true |
| claude-3-sonnet-20240229 | 8.7 | 8.7 | 79.0 | 95.0 | 15.0 | 161.0 | true |
| codestral-2405 | 1.9 | 1.9 | 78.0 | 95.0 | 16.0 | 146.0 | true |
| mistralai/Mixtral-8x22B-Instruct-v0.1 | 14.1 | 14.1 | 77.6 | 90.0 | 5.0 | 151.0 | false |
| gpt-4o-2024-08-06 | 4.7 | 4.7 | 77.3 | 90.0 | 12.0 | 155.0 | true |
| meta-llama/Llama-3-70b-chat-hf | 4.3 | 4.3 | 76.8 | 88.3 | 0.0 | 160.0 | false |
| gpt-4-turbo-2024-04-09 | 10.9 | 10.9 | 74.9 | 90.0 | 22.0 | 146.0 | true |
| gpt-4-1106-preview | 22.4 | 22.4 | 74.4 | 90.0 | 19.0 | 142.0 | true |
| claude-3-haiku-20240307 | 4.0 | 4.0 | 74.1 | 84.2 | 4.0 | 125.0 | true |
| mistral-large-2407 | 11.3 | 11.3 | 73.6 | 83.1 | 15.0 | 137.0 | true |
| gpt-4o-mini-2024-07-18 | 5.2 | 5.2 | 73.5 | 86.7 | 19.0 | 137.0 | true |
| gpt-4-0125-preview | 30.2 | 30.2 | 73.1 | 88.8 | 26.0 | 140.0 | true |
| gpt-4o-2024-05-13 | 4.3 | 4.3 | 72.2 | 86.7 | 21.0 | 122.0 | true |
| deepseek-coder | 13.0 | 13.0 | 71.6 | 83.3 | 39.0 | 115.0 | true |
| deepseek-chat | 17.9 | 17.9 | 71.3 | 80.6 | 30.0 | 138.0 | true |
| mistral-large-2402 | 8.6 | 8.6 | 71.1 | 80.0 | 5.0 | 103.0 | true |
| meta-llama/Llama-3-8b-chat-hf | 1.5 | 1.5 | 67.7 | 66.7 | 5.0 | 70.0 | false |
| claude-2.1 | 10.1 | 10.1 | 67.3 | 75.0 | 27.0 | 116.0 | true |
| microsoft/WizardLM-2-8x22B | 34.7 | 34.7 | 62.7 | 60.0 | 33.0 | 118.0 | false |
| gpt-3.5-turbo-0125 | 1.2 | 1.2 | 62.1 | 67.1 | 62.0 | 95.0 | true |
| phind-codellama:34b-v2 | 37.1 | 37.1 | 61.8 | 62.5 | 36.0 | 58.0 | false |
| mistral-medium | 18.1 | 18.1 | 60.8 | 60.0 | 22.0 | 90.0 | true |
| mistral-small-2402 | 5.0 | 5.0 | 60.2 | 55.0 | 15.0 | 84.0 | true |
| mistral-small | 5.9 | 5.9 | 60.1 | 55.0 | 27.0 | 76.0 | true |
| magicoder:7b-s-cl-q6_K | 15.6 | 15.6 | 59.9 | 60.0 | 18.0 | 35.0 | false |
| gpt-3.5-turbo-1106 | 2.1 | 2.1 | 58.4 | 62.5 | 82.0 | 97.0 | true |
| codellama:13b-instruct-q4KM | 3.2 | 3.2 | 56.4 | 54.6 | 56.0 | 61.0 | false |
| deepseek-coder:33b-instruct-q4KM | 46.7 | 46.7 | 55.0 | 50.0 | 62.0 | 68.0 | false |
| magicoder | 12.8 | 12.8 | 53.7 | 50.0 | 49.0 | 52.0 | false |
| nous-hermes2:34b-yi-q4KM | 56.8 | 56.8 | 50.7 | 50.0 | 78.0 | 56.0 | false |
| accounts/fireworks/models/dbrx-instruct | 3.7 | 3.7 | 50.0 | 50.0 | 121.0 | 75.0 | false |
| codellama:13b-instruct | 18.1 | 18.1 | 50.0 | 50.0 | 65.0 | 44.0 | false |
| openchat:7b-v3.5-1210-q4KM | 14.4 | 14.4 | 49.4 | 50.0 | 48.0 | 23.0 | false |
| openhermes2.5-mistral | 12.9 | 12.9 | 48.9 | 50.0 | 55.0 | 27.0 | false |
| starling-lm:latest | 13.7 | 13.7 | 48.4 | 50.0 | 58.0 | 26.0 | false |
| codellama:7b-instruct-q4KM | 2.1 | 2.1 | 47.8 | 50.0 | 95.0 | 38.0 | false |
| mistral-tiny | 4.6 | 4.6 | 46.9 | 50.0 | 75.0 | 42.0 | true |
| yi:34b-chat | 43.9 | 43.9 | 45.6 | 50.0 | 45.0 | 34.0 | false |
| mistral:7b-instruct-v0.2-q6_K | 21.7 | 21.7 | 45.4 | 50.0 | 44.0 | 23.0 | false |
| mistral:7b-instruct-v0.2-q4_0 | 12.4 | 12.4 | 44.3 | 50.0 | 75.0 | 32.0 | false |
| mistral:7b-instruct-v0.2-q4KM | 15.6 | 15.6 | 42.6 | 50.0 | 71.0 | 23.0 | false |
| gpt-3.5-turbo | 3.6 | 3.6 | 42.3 | 50.0 | 132.0 | 54.0 | true |
| codellama:34b-instruct-q4KM | 7.5 | 7.5 | 39.7 | 50.0 | 127.0 | 35.0 | false |
| codellama:70b-instruct-q4KM | 16.3 | 16.3 | 36.4 | 0.0 | 179.0 | 58.0 | false |
| gemini-1.0-pro-latest | 4.2 | 4.2 | 35.9 | 50.0 | 76.0 | 9.0 | true |
| solar:10.7b-instruct-v1-q4KM | 18.8 | 18.8 | 35.2 | 50.0 | 107.0 | 10.0 | false |
| mistral:7b-instruct-q4KM | 13.9 | 13.9 | 34.8 | 50.0 | 80.0 | 0.0 | false |
| codellama:70b-instruct-q2_K | 11.2 | 11.2 | 29.8 | 0.0 | 198.0 | 29.0 | false |
| llama2 | 17.1 | 17.1 | 26.5 | 25.0 | 131.0 | 0.0 | false |
| gemma:7b-instruct-q6_K | 20.9 | 20.9 | 25.9 | 25.0 | 147.0 | 2.0 | false |
| orca2:13b | 20.1 | 20.1 | 23.1 | 0.0 | 166.0 | 11.0 | false |
| stablelm-zephyr | 9.9 | 9.9 | 15.4 | 0.0 | 192.0 | 1.0 | false |
| dolphin-phi:2.7b-v2.6-q6_K | 8.9 | 8.9 | 14.9 | 0.0 | 188.0 | 0.0 | false |
| codellama:13b-python | 12.5 | 12.5 | 12.8 | 0.0 | 155.0 | 0.0 | false |
| phi:2.7b-chat-v2-q6_K | 13.0 | 13.0 | 8.9 | 0.0 | 222.0 | 0.0 | false |

This page was generated using Literate.jl.

diff --git a/dev/examples/summarize_results_local/index.html b/dev/examples/summarize_results_local/index.html index b709667a0..40b7c00a9 100644 --- a/dev/examples/summarize_results_local/index.html +++ b/dev/examples/summarize_results_local/index.html @@ -218,4 +218,4 @@ rename(_, names(_) .|> unscrub_string) end # markdown_table(output, String) |> clipboard -markdown_table(output)
+markdown_table(output)
| Model | Prompt Label | Elapsed | Elapsed Median | Score Avg | Score Median | Cnt | Point Per Second |
| --- | --- | --- | --- | --- | --- | --- | --- |
| codellama:13b-instruct-q4KM | JuliaExpertAsk | 2.0 | 1.9 | 63.4 | 75.0 | 70.0 | 32.1 |
| codellama:7b-instruct-q4KM | InJulia | 2.0 | 2.0 | 57.7 | 55.0 | 70.0 | 29.1 |
| codellama:7b-instruct-q4KM | JuliaExpertAsk | 1.2 | 0.9 | 33.1 | 0.0 | 70.0 | 26.5 |
| codellama:7b-instruct-q4KM | JuliaRecapTask | 2.6 | 2.5 | 60.4 | 60.0 | 70.0 | 23.3 |
| codellama:7b-instruct-q4KM | JuliaExpertCoTTask | 1.6 | 1.4 | 32.6 | 0.0 | 70.0 | 20.6 |
| codellama:13b-instruct-q4KM | InJulia | 3.6 | 3.4 | 67.6 | 61.2 | 70.0 | 18.7 |
| codellama:7b-instruct-q4KM | JuliaRecapCoTTask | 3.0 | 2.7 | 55.4 | 50.0 | 70.0 | 18.3 |
| codellama:13b-instruct-q4KM | JuliaExpertCoTTask | 2.7 | 2.3 | 42.8 | 50.0 | 70.0 | 15.9 |
| codellama:13b-instruct-q4KM | JuliaRecapTask | 3.9 | 3.6 | 55.8 | 50.0 | 70.0 | 14.2 |
| codellama:13b-instruct-q4KM | JuliaRecapCoTTask | 3.9 | 4.1 | 52.5 | 50.0 | 70.0 | 13.5 |
| codellama:34b-instruct-q4KM | JuliaExpertAsk | 6.3 | 5.8 | 53.0 | 50.0 | 70.0 | 8.4 |
| mistral:7b-instruct-v0.2-q4_0 | JuliaExpertAsk | 5.8 | 5.6 | 40.3 | 50.0 | 70.0 | 6.9 |
| openchat:7b-v3.5-1210-q4KM | JuliaExpertAsk | 7.8 | 6.9 | 51.0 | 50.0 | 55.0 | 6.5 |
| codellama:34b-instruct-q4KM | InJulia | 8.1 | 7.9 | 50.1 | 50.0 | 70.0 | 6.2 |
| mistral:7b-instruct-v0.2-q4KM | JuliaExpertAsk | 8.1 | 6.9 | 48.4 | 50.0 | 70.0 | 6.0 |
| openhermes2.5-mistral | JuliaExpertAsk | 8.7 | 8.9 | 50.7 | 52.5 | 58.0 | 5.8 |
| starling-lm:latest | JuliaExpertAsk | 9.9 | 9.8 | 55.5 | 50.0 | 58.0 | 5.6 |
| magicoder | InJulia | 11.0 | 9.6 | 60.8 | 60.0 | 57.0 | 5.6 |
| codellama:34b-instruct-q4KM | JuliaExpertCoTTask | 6.4 | 6.4 | 34.8 | 25.0 | 70.0 | 5.4 |
| magicoder | JuliaExpertAsk | 9.8 | 8.7 | 50.2 | 50.0 | 58.0 | 5.1 |
| codellama:13b-instruct | JuliaExpertAsk | 10.4 | 8.5 | 51.1 | 50.0 | 58.0 | 4.9 |
| mistral:7b-instruct-q4KM | JuliaExpertAsk | 7.7 | 7.4 | 37.3 | 50.0 | 57.0 | 4.9 |
| openhermes2.5-mistral | InJulia | 10.8 | 9.8 | 49.6 | 50.0 | 58.0 | 4.6 |
| starling-lm:latest | InJulia | 11.1 | 11.1 | 51.1 | 50.0 | 58.0 | 4.6 |
| openchat:7b-v3.5-1210-q4KM | InJulia | 11.9 | 11.7 | 50.9 | 50.0 | 55.0 | 4.3 |
| magicoder:7b-s-cl-q6_K | InJulia | 14.6 | 15.3 | 62.2 | 55.0 | 42.0 | 4.3 |
| magicoder | JuliaRecapCoTTask | 13.4 | 12.0 | 56.6 | 50.0 | 57.0 | 4.2 |
| magicoder | JuliaRecapTask | 13.4 | 10.9 | 56.3 | 50.0 | 57.0 | 4.2 |
| codellama:34b-instruct-q4KM | JuliaRecapTask | 7.6 | 6.2 | 31.6 | 0.0 | 70.0 | 4.2 |
| magicoder:7b-s-cl-q6_K | JuliaExpertAsk | 14.6 | 14.7 | 60.1 | 58.1 | 42.0 | 4.1 |
| magicoder:7b-s-cl-q6_K | JuliaRecapTask | 16.1 | 16.5 | 65.2 | 60.0 | 42.0 | 4.1 |
| mistral:7b-instruct-v0.2-q6_K | JuliaExpertAsk | 10.0 | 8.2 | 39.8 | 50.0 | 42.0 | 4.0 |
| mistral:7b-instruct-v0.2-q4_0 | InJulia | 12.1 | 11.3 | 47.4 | 50.0 | 70.0 | 3.9 |
| mistral:7b-instruct-q4KM | InJulia | 10.3 | 10.0 | 37.3 | 50.0 | 57.0 | 3.6 |
| magicoder:7b-s-cl-q6_K | JuliaRecapCoTTask | 16.9 | 16.4 | 59.4 | 55.6 | 42.0 | 3.5 |
| openhermes2.5-mistral | JuliaRecapTask | 15.0 | 14.7 | 51.3 | 50.0 | 58.0 | 3.4 |
| magicoder:7b-s-cl-q6_K | JuliaExpertCoTTask | 16.0 | 16.5 | 52.8 | 58.1 | 42.0 | 3.3 |
| starling-lm:latest | JuliaRecapTask | 16.0 | 13.6 | 52.6 | 50.0 | 58.0 | 3.3 |
| codellama:34b-instruct-q4KM | JuliaRecapCoTTask | 8.8 | 7.4 | 29.0 | 12.5 | 70.0 | 3.3 |
| codellama:13b-instruct | InJulia | 16.4 | 14.7 | 53.2 | 50.0 | 58.0 | 3.2 |
| mistral:7b-instruct-v0.2-q4_0 | JuliaExpertCoTTask | 13.2 | 12.9 | 42.8 | 50.0 | 70.0 | 3.2 |
| codellama:70b-instruct-q2_K | JuliaRecapTask | 11.7 | 9.5 | 37.4 | 25.0 | 70.0 | 3.2 |
| llama2 | JuliaExpertAsk | 9.8 | 9.1 | 31.4 | 50.0 | 59.0 | 3.2 |
| openhermes2.5-mistral | JuliaExpertCoTTask | 16.6 | 16.0 | 51.9 | 50.0 | 57.0 | 3.1 |
| starling-lm:latest | JuliaRecapCoTTask | 14.8 | 13.2 | 46.1 | 50.0 | 58.0 | 3.1 |
| openhermes2.5-mistral | JuliaRecapCoTTask | 13.3 | 13.5 | 40.9 | 50.0 | 58.0 | 3.1 |
| openchat:7b-v3.5-1210-q4KM | JuliaRecapTask | 17.2 | 15.8 | 52.9 | 50.0 | 55.0 | 3.1 |
| mistral:7b-instruct-v0.2-q4KM | InJulia | 14.1 | 13.9 | 41.8 | 50.0 | 70.0 | 3.0 |
| mistral:7b-instruct-v0.2-q4_0 | JuliaRecapCoTTask | 14.8 | 14.2 | 43.8 | 50.0 | 70.0 | 3.0 |
| mistral:7b-instruct-v0.2-q4_0 | JuliaRecapTask | 16.2 | 15.4 | 47.3 | 50.0 | 70.0 | 2.9 |
| openchat:7b-v3.5-1210-q4KM | JuliaRecapCoTTask | 16.9 | 15.7 | 49.1 | 50.0 | 55.0 | 2.9 |
| solar:10.7b-instruct-v1-q4KM | JuliaExpertAsk | 13.0 | 12.5 | 36.9 | 50.0 | 57.0 | 2.8 |
| codellama:70b-instruct-q2_K | JuliaExpertCoTTask | 9.1 | 8.4 | 25.5 | 0.0 | 70.0 | 2.8 |
| magicoder | JuliaExpertCoTTask | 16.2 | 15.1 | 44.7 | 37.5 | 58.0 | 2.8 |
| orca2:13b | InJulia | 11.4 | 10.5 | 31.3 | 25.0 | 57.0 | 2.7 |
| codellama:70b-instruct-q4KM | InJulia | 16.5 | 14.7 | 43.9 | 50.0 | 70.0 | 2.7 |
| codellama:70b-instruct-q2_K | InJulia | 13.0 | 10.9 | 34.6 | 25.0 | 70.0 | 2.7 |
| solar:10.7b-instruct-v1-q4KM | InJulia | 17.4 | 15.7 | 43.9 | 50.0 | 57.0 | 2.5 |
| stablelm-zephyr | JuliaExpertAsk | 6.3 | 6.6 | 15.6 | 0.0 | 57.0 | 2.5 |
| mistral:7b-instruct-v0.2-q4KM | JuliaExpertCoTTask | 16.4 | 15.9 | 40.5 | 50.0 | 70.0 | 2.5 |
| codellama:70b-instruct-q2_K | JuliaRecapCoTTask | 12.3 | 9.7 | 30.0 | 0.0 | 70.0 | 2.4 |
| codellama:13b-instruct | JuliaRecapTask | 21.9 | 20.6 | 53.0 | 50.0 | 58.0 | 2.4 |
| gemma:7b-instruct-q6_K | JuliaExpertAsk | 10.5 | 6.2 | 25.3 | 25.0 | 70.0 | 2.4 |
| dolphin-phi:2.7b-v2.6-q6_K | JuliaExpertAsk | 6.8 | 6.3 | 16.1 | 0.0 | 56.0 | 2.4 |
| openchat:7b-v3.5-1210-q4KM | JuliaExpertCoTTask | 18.2 | 17.9 | 43.1 | 50.0 | 55.0 | 2.4 |
| codellama:70b-instruct-q4KM | JuliaRecapTask | 17.9 | 14.1 | 42.4 | 37.5 | 70.0 | 2.4 |
| phind-codellama:34b-v2 | JuliaExpertAsk | 29.5 | 27.7 | 68.1 | 66.7 | 57.0 | 2.3 |
| codellama:13b-instruct | JuliaRecapCoTTask | 21.7 | 20.8 | 48.5 | 50.0 | 58.0 | 2.2 |
| codellama:13b-instruct | JuliaExpertCoTTask | 20.0 | 19.3 | 44.5 | 50.0 | 58.0 | 2.2 |
| mistral:7b-instruct-v0.2-q6_K | InJulia | 19.4 | 17.1 | 43.2 | 50.0 | 42.0 | 2.2 |
| starling-lm:latest | JuliaExpertCoTTask | 16.6 | 16.2 | 36.8 | 50.0 | 58.0 | 2.2 |
| codellama:70b-instruct-q2_K | JuliaExpertAsk | 9.8 | 8.8 | 21.3 | 0.0 | 70.0 | 2.2 |
| mistral:7b-instruct-q4KM | JuliaExpertCoTTask | 16.3 | 15.8 | 35.0 | 25.0 | 57.0 | 2.1 |
| mistral:7b-instruct-v0.2-q4KM | JuliaRecapTask | 20.5 | 18.7 | 44.0 | 50.0 | 70.0 | 2.1 |
| codellama:70b-instruct-q4KM | JuliaRecapCoTTask | 16.7 | 12.8 | 35.2 | 0.0 | 70.0 | 2.1 |
| codellama:70b-instruct-q4KM | JuliaExpertCoTTask | 14.8 | 13.3 | 30.8 | 0.0 | 70.0 | 2.1 |
| mistral:7b-instruct-v0.2-q4KM | JuliaRecapCoTTask | 18.9 | 17.9 | 38.5 | 50.0 | 70.0 | 2.0 |
| yi:34b-chat | JuliaExpertAsk | 26.1 | 22.8 | 52.7 | 52.5 | 58.0 | 2.0 |
| mistral:7b-instruct-v0.2-q6_K | JuliaExpertCoTTask | 23.8 | 25.1 | 47.5 | 50.0 | 42.0 | 2.0 |
| mistral:7b-instruct-q4KM | JuliaRecapTask | 16.7 | 15.9 | 33.0 | 25.0 | 55.0 | 2.0 |
| codellama:70b-instruct-q4KM | JuliaExpertAsk | 15.7 | 13.3 | 29.9 | 0.0 | 70.0 | 1.9 |
| solar:10.7b-instruct-v1-q4KM | JuliaRecapCoTTask | 19.7 | 19.1 | 36.7 | 50.0 | 57.0 | 1.9 |
| solar:10.7b-instruct-v1-q4KM | JuliaRecapTask | 21.3 | 21.0 | 38.9 | 50.0 | 57.0 | 1.8 |
| mistral:7b-instruct-v0.2-q6_K | JuliaRecapCoTTask | 26.9 | 24.7 | 48.2 | 50.0 | 42.0 | 1.8 |
| phind-codellama:34b-v2 | InJulia | 33.2 | 34.3 | 59.0 | 61.2 | 57.0 | 1.8 |
| dolphin-phi:2.7b-v2.6-q6_K | JuliaRecapTask | 9.5 | 9.3 | 16.3 | 0.0 | 56.0 | 1.7 |
| llama2 | InJulia | 15.3 | 13.9 | 26.4 | 25.0 | 59.0 | 1.7 |
| mistral:7b-instruct-v0.2-q6_K | JuliaRecapTask | 28.3 | 27.2 | 48.6 | 50.0 | 42.0 | 1.7 |
| mistral:7b-instruct-q4KM | JuliaRecapCoTTask | 18.7 | 17.6 | 31.5 | 50.0 | 55.0 | 1.7 |
| phind-codellama:34b-v2 | JuliaRecapCoTTask | 37.1 | 36.9 | 59.8 | 61.2 | 57.0 | 1.6 |
| stablelm-zephyr | JuliaRecapTask | 12.1 | 8.3 | 19.2 | 0.0 | 57.0 | 1.6 |
| stablelm-zephyr | InJulia | 8.5 | 6.6 | 13.3 | 0.0 | 57.0 | 1.6 |
| phind-codellama:34b-v2 | JuliaRecapTask | 41.1 | 40.6 | 62.0 | 61.2 | 57.0 | 1.5 |
| dolphin-phi:2.7b-v2.6-q6_K | JuliaExpertCoTTask | 8.1 | 8.0 | 12.2 | 0.0 | 56.0 | 1.5 |
| dolphin-phi:2.7b-v2.6-q6_K | JuliaRecapCoTTask | 9.4 | 8.9 | 14.1 | 0.0 | 56.0 | 1.5 |
| orca2:13b | JuliaExpertAsk | 11.0 | 9.2 | 16.5 | 0.0 | 57.0 | 1.5 |
| dolphin-phi:2.7b-v2.6-q6_K | InJulia | 10.6 | 9.4 | 15.6 | 0.0 | 56.0 | 1.5 |
| stablelm-zephyr | JuliaRecapCoTTask | 11.4 | 8.8 | 16.5 | 0.0 | 57.0 | 1.4 |
| llama2 | JuliaExpertCoTTask | 18.9 | 17.3 | 27.2 | 25.0 | 59.0 | 1.4 |
| gemma:7b-instruct-q6_K | JuliaRecapCoTTask | 25.7 | 25.0 | 34.9 | 50.0 | 70.0 | 1.4 |
| phind-codellama:34b-v2 | JuliaExpertCoTTask | 44.6 | 46.5 | 60.1 | 66.7 | 57.0 | 1.3 |
| codellama:13b-python | JuliaRecapCoTTask | 9.5 | 6.4 | 12.4 | 0.0 | 42.0 | 1.3 |
| llama2 | JuliaRecapCoTTask | 19.3 | 19.2 | 25.0 | 25.0 | 59.0 | 1.3 |
| codellama:13b-python | JuliaExpertAsk | 10.4 | 7.9 | 13.3 | 0.0 | 44.0 | 1.3 |
| nous-hermes2:34b-yi-q4KM | InJulia | 52.0 | 45.5 | 61.8 | 60.0 | 67.0 | 1.2 |
| gemma:7b-instruct-q6_K | InJulia | 19.7 | 20.3 | 22.6 | 25.0 | 70.0 | 1.1 |
| gemma:7b-instruct-q6_K | JuliaRecapTask | 24.8 | 23.6 | 26.9 | 25.0 | 70.0 | 1.1 |
| phi:2.7b-chat-v2-q6_K | JuliaExpertCoTTask | 9.3 | 5.7 | 9.9 | 0.0 | 55.0 | 1.1 |
| stablelm-zephyr | JuliaExpertCoTTask | 11.4 | 9.6 | 12.2 | 0.0 | 57.0 | 1.1 |
| nous-hermes2:34b-yi-q4KM | JuliaExpertAsk | 35.9 | 32.9 | 37.4 | 50.0 | 67.0 | 1.0 |
| codellama:13b-python | JuliaExpertCoTTask | 12.8 | 12.9 | 13.3 | 0.0 | 43.0 | 1.0 |
| llama2 | JuliaRecapTask | 22.1 | 22.2 | 22.4 | 0.0 | 59.0 | 1.0 |
| orca2:13b | JuliaExpertCoTTask | 23.9 | 23.1 | 24.2 | 0.0 | 57.0 | 1.0 |
| yi:34b-chat | JuliaRecapTask | 50.8 | 48.8 | 47.6 | 50.0 | 58.0 | 0.9 |
| yi:34b-chat | JuliaExpertCoTTask | 42.1 | 40.6 | 39.2 | 25.0 | 58.0 | 0.9 |
| yi:34b-chat | JuliaRecapCoTTask | 49.5 | 45.6 | 44.0 | 50.0 | 57.0 | 0.9 |
| solar:10.7b-instruct-v1-q4KM | JuliaExpertCoTTask | 22.5 | 22.4 | 19.7 | 0.0 | 58.0 | 0.9 |
| yi:34b-chat | InJulia | 51.1 | 48.6 | 44.5 | 50.0 | 58.0 | 0.9 |
| codellama:13b-python | JuliaRecapTask | 16.3 | 10.9 | 13.9 | 0.0 | 43.0 | 0.9 |
| nous-hermes2:34b-yi-q4KM | JuliaRecapTask | 67.6 | 61.9 | 56.6 | 50.0 | 65.0 | 0.8 |
| gemma:7b-instruct-q6_K | JuliaExpertCoTTask | 23.8 | 23.3 | 19.9 | 25.0 | 70.0 | 0.8 |
| phi:2.7b-chat-v2-q6_K | JuliaRecapCoTTask | 12.1 | 10.8 | 9.9 | 0.0 | 55.0 | 0.8 |
| codellama:13b-python | InJulia | 13.7 | 12.5 | 11.0 | 0.0 | 44.0 | 0.8 |
| orca2:13b | JuliaRecapCoTTask | 26.7 | 25.4 | 21.5 | 0.0 | 57.0 | 0.8 |
| phi:2.7b-chat-v2-q6_K | JuliaRecapTask | 13.6 | 12.9 | 10.8 | 0.0 | 55.0 | 0.8 |
| orca2:13b | JuliaRecapTask | 27.5 | 24.8 | 21.9 | 0.0 | 57.0 | 0.8 |
| nous-hermes2:34b-yi-q4KM | JuliaRecapCoTTask | 58.5 | 60.0 | 46.1 | 50.0 | 65.0 | 0.8 |
| nous-hermes2:34b-yi-q4KM | JuliaExpertCoTTask | 70.4 | 65.5 | 51.4 | 55.0 | 67.0 | 0.7 |
| phi:2.7b-chat-v2-q6_K | JuliaExpertAsk | 14.6 | 13.4 | 8.1 | 0.0 | 55.0 | 0.6 |
| phi:2.7b-chat-v2-q6_K | InJulia | 15.7 | 17.8 | 6.0 | 0.0 | 55.0 | 0.4 |

This page was generated using Literate.jl.

diff --git a/dev/examples/summarize_results_paid/index.html b/dev/examples/summarize_results_paid/index.html index 60cc8c947..982d2b46b 100644 --- a/dev/examples/summarize_results_paid/index.html +++ b/dev/examples/summarize_results_paid/index.html @@ -200,4 +200,4 @@ leftjoin(average_, on = :name) @orderby -:AverageScore end -markdown_table(output)
+markdown_table(output)
| name | claude-2.1 | claude-3-5-sonnet-20240620 | claude-3-haiku-20240307 | claude-3-opus-20240229 | claude-3-sonnet-20240229 | codestral-2405 | deepseek-chat | deepseek-coder | gemini-1.0-pro-latest | gpt-3.5-turbo | gpt-3.5-turbo-0125 | gpt-3.5-turbo-1106 | gpt-4-0125-preview | gpt-4-1106-preview | gpt-4-turbo-2024-04-09 | gpt-4o-2024-05-13 | gpt-4o-2024-08-06 | gpt-4o-mini-2024-07-18 | mistral-large-2402 | mistral-large-2407 | mistral-medium | mistral-small | mistral-small-2402 | mistral-tiny | AverageScore |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| FloatWithUnits | 62.0 | 97.5 | 98.0 | 100.0 | 100.0 | 98.0 | 100.0 | 100.0 | 57.0 | 76.0 | 91.5 | 80.0 | 60.5 | 72.0 | 78.5 | 93.5 | 99.5 | 96.5 | 99.5 | 100.0 | 98.0 | 70.0 | 100.0 | 80.2 | 87.8 |
| timezone_bumper | 82.1 | 100.0 | 98.1 | 99.7 | 95.5 | 89.5 | 100.0 | 100.0 | 39.9 | 48.0 | 77.4 | 79.2 | 90.0 | 90.0 | 94.8 | 95.0 | 98.5 | 99.1 | 96.4 | 100.0 | 97.0 | 76.6 | 78.1 | 62.0 | 87.0 |
| clean_column | 100.0 | 97.3 | 89.8 | 100.0 | 96.4 | 92.3 | 78.4 | 71.2 | 41.5 | 35.5 | 66.7 | 69.8 | 88.8 | 90.5 | 90.0 | 89.3 | 87.4 | 88.0 | 91.6 | 92.0 | 81.0 | 84.6 | 99.7 | 80.8 | 83.4 |
| keeponlynames | 90.1 | 91.6 | 65.0 | 85.3 | 94.9 | 95.4 | 88.4 | 74.4 | 54.0 | 50.8 | 80.6 | 74.2 | 90.9 | 91.0 | 86.2 | 77.5 | 78.7 | 80.9 | 98.7 | 89.4 | 66.2 | 76.6 | 67.9 | 51.0 | 79.2 |
| wrap_string | 93.8 | 94.8 | 77.2 | 64.5 | 70.2 | 88.0 | 81.7 | 82.5 | 32.6 | 64.0 | 50.1 | 55.3 | 94.9 | 97.8 | 94.6 | 97.0 | 94.6 | 94.3 | 71.9 | 94.5 | 84.7 | 68.0 | 68.6 | 48.3 | 77.7 |
| countmodelrows | 58.0 | 100.0 | 82.6 | 98.8 | 94.8 | 84.4 | 67.2 | 60.7 | 36.6 | 52.8 | 75.7 | 56.2 | 97.4 | 98.4 | 89.3 | 89.0 | 95.4 | 75.5 | 78.6 | 90.2 | 79.0 | 67.2 | 61.7 | 53.2 | 76.8 |
| weatherdataanalyzer | 74.1 | 85.0 | 93.3 | 86.8 | 86.8 | 89.3 | 93.0 | 83.8 | 26.5 | 35.2 | 64.2 | 59.0 | 85.4 | 85.0 | 81.0 | 67.4 | 73.5 | 76.5 | 86.0 | 54.6 | 85.4 | 55.4 | 52.6 | 56.8 | 72.4 |
| add_yearmonth | 53.8 | 88.5 | 86.2 | 92.0 | 81.0 | 62.5 | 71.2 | 62.5 | 35.8 | 33.0 | 67.6 | 65.2 | 78.6 | 72.8 | 75.9 | 68.0 | 74.9 | 67.2 | 72.2 | 71.2 | 48.0 | 62.2 | 40.2 | 33.2 | 65.2 |
| event_scheduler | 86.5 | 84.4 | 76.6 | 90.2 | 77.2 | 56.8 | 76.0 | 82.4 | 37.8 | 29.0 | 44.4 | 42.8 | 87.9 | 66.6 | 82.5 | 73.8 | 67.7 | 37.5 | 57.3 | 32.8 | 36.0 | 59.0 | 38.7 | 37.2 | 60.9 |
| ispersonal | 52.0 | 62.0 | 69.0 | 54.0 | 72.0 | 90.0 | 61.0 | 84.0 | 16.0 | 43.0 | 72.0 | 68.6 | 54.3 | 56.0 | 66.5 | 62.0 | 66.3 | 94.0 | 67.2 | 57.0 | 35.0 | 48.0 | 48.0 | 29.5 | 59.5 |
| audi_filter | 38.0 | 93.0 | 56.0 | 93.0 | 63.8 | 59.5 | 47.0 | 57.8 | 28.1 | 27.0 | 55.0 | 58.0 | 47.5 | 58.0 | 49.0 | 56.2 | 81.0 | 78.8 | 58.0 | 92.0 | 43.0 | 48.5 | 44.8 | 27.0 | 56.7 |
| extractjuliacode | 56.4 | 63.3 | 60.4 | 65.4 | 48.2 | 47.9 | 41.3 | 48.6 | 36.4 | 41.0 | 43.6 | 48.4 | 54.5 | 48.7 | 56.1 | 52.5 | 50.4 | 45.3 | 44.1 | 63.8 | 31.8 | 52.2 | 50.4 | 30.1 | 49.2 |
| qanda_extractor | 73.5 | 63.7 | 62.3 | 68.0 | 65.5 | 57.0 | 43.3 | 26.7 | 26.2 | 31.7 | 35.5 | 36.7 | 56.7 | 53.3 | 49.3 | 45.3 | 50.2 | 54.7 | 46.8 | 31.0 | 38.7 | 44.7 | 55.8 | 36.0 | 48.0 |
| pig_latinify | 30.6 | 79.8 | 34.6 | 67.1 | 57.0 | 56.5 | 49.0 | 67.1 | 18.7 | 24.7 | 39.8 | 23.1 | 54.7 | 61.4 | 60.1 | 54.2 | 54.8 | 48.0 | 33.6 | 61.7 | 27.8 | 28.8 | 31.6 | 33.1 | 45.7 |

This page was generated using Literate.jl.

diff --git a/dev/examples/summarize_results_prompts/index.html b/dev/examples/summarize_results_prompts/index.html index c845c7855..bb5774bae 100644 --- a/dev/examples/summarize_results_prompts/index.html +++ b/dev/examples/summarize_results_prompts/index.html @@ -84,4 +84,4 @@ "score_median" => "Median Score (Max 100 pts)") end # markdown_table(output, String) |> clipboard -markdown_table(output)
+markdown_table(output)
| Prompt Template | Elapsed (s, average) | Elapsed (s, median) | Avg. Score (Max 100 pts) | Median Score (Max 100 pts) |
| --- | --- | --- | --- | --- |
| InJulia | 13.2 | 8.8 | 58.5 | 58.3 |
| JuliaExpertAsk | 9.2 | 5.7 | 57.0 | 55.0 |
| JuliaRecapTask | 15.6 | 10.6 | 55.4 | 55.0 |
| JuliaExpertCoTTask | 14.2 | 9.3 | 53.5 | 55.0 |
| JuliaRecapCoTTask | 15.1 | 10.6 | 52.5 | 50.0 |

This page was generated using Literate.jl.

diff --git a/dev/examples/summarize_results_test_cases/index.html b/dev/examples/summarize_results_test_cases/index.html index 4b46526b5..d2af0b51a 100644 --- a/dev/examples/summarize_results_test_cases/index.html +++ b/dev/examples/summarize_results_test_cases/index.html @@ -270,4 +270,4 @@ end return strip(wrapped_text) end -

+

Winning Paid Model: "gpt-4-1106-preview" with average score 97.8 (Full score: 14/25, Zero score: 0/25)

Winning Locally-hosted Model: "claude-2.1" with average score 92.8 (Full score: 12/25, Zero score: 0/25)


This page was generated using Literate.jl.

diff --git a/dev/examples/summarize_results_test_cases_waitlist/index.html b/dev/examples/summarize_results_test_cases_waitlist/index.html index c6b83b8c7..3d8a44b92 100644 --- a/dev/examples/summarize_results_test_cases_waitlist/index.html +++ b/dev/examples/summarize_results_test_cases_waitlist/index.html @@ -155,4 +155,4 @@ return middle(m[1], m[2]) end end -

+

Winning Paid Model: "gpt-4-0125-preview" with average score 81.3 (Full score: 1/25, Zero score: 1/25)


This page was generated using Literate.jl.

diff --git a/dev/frequently_asked_questions/index.html b/dev/frequently_asked_questions/index.html index beb62dc71..da77619f5 100644 --- a/dev/frequently_asked_questions/index.html +++ b/dev/frequently_asked_questions/index.html @@ -1,2 +1,2 @@ -F.A.Q. · JuliaLLMLeaderboard.jl

+F.A.Q. · JuliaLLMLeaderboard.jl

Frequently Asked Questions

What are the so-whats?

The docs contain only limited guidance and commentary because they are generated automatically (and, hence, content can move around slightly). For the resulting insights, see the associated blog posts!

Want to add a new model?

In the short term, we don't foresee adding more models, unless there is some transformative new option that runs on consumer-grade hardware.

If you want to add a benchmark for a specific model, submit your evals in a PR. We'll review it and, if it looks good, merge it.

The expectations for a successful PR are:

  • the model is publicly available and the submission can be verified
  • you have executed at least 5 different samples for each of the 5 basic prompt templates (see examples/code_gen_benchmark.jl for the list of templates) and for each test case
  • i.e., 14 * 5 * 5 = 350 evaluations and conversations are to be submitted in the PR (a sketch of such a run follows below)
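
For orientation, a minimal sketch of what such a submission run could look like (the model name is a placeholder; the keyword arguments follow the run_benchmark reference and the Getting Started example):

using JuliaLLMLeaderboard
fn_definitions = find_definitons("code_generation")   # all test-case definition.toml files (assumes the repo root as the working directory)
evals = run_benchmark(; fn_definitions,
    models = ["my-new-model"],                         # placeholder -- your model's name
    prompt_labels = ["JuliaExpertCoTTask", "JuliaExpertAsk", "InJulia", "JuliaRecapTask", "JuliaRecapCoTTask"],
    num_samples = 5, auto_save = true, verbose = true,
    device = "Apple-MacBook-Pro-M1",                   # describe your hardware
    http_kwargs = (; readtimeout = 150))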

What’s Next?

We'd like to add more tests and, potentially, also other types of tests (eg, code questions).

It would be good to grow the number of prompt templates tested, as those are more versatile.

diff --git a/dev/getting_started/index.html b/dev/getting_started/index.html index 1521ca76e..756c663cc 100644 --- a/dev/getting_started/index.html +++ b/dev/getting_started/index.html @@ -37,4 +37,4 @@ num_samples = 1, http_kwargs = (; readtimeout = 150)); # You can then easily score each of these evaluation runs -scores = score_evals.(evals)

+scores = score_evals.(evals)

Create Your Analysis

To inspect individual model answers and their associated scores, see examples/inspect_results.jl or examples/debugging_results.jl.

To compare different models, see examples/summarize_results_paid.jl.

Run an Experiment

Want to run some experiments and save the results? Check out examples/experiment_hyperparameter_scan.jl for finding the optimal temperature and top_p!
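
A hedged sketch of such a scan (the grid values are illustrative; api_kwargs, experiment and save_dir are documented keywords of run_benchmark):

fn_definitions = [joinpath("code_generation", "utility_functions", "event_scheduler", "definition.toml")]
for temperature in (0.1, 0.5, 0.9), top_p in (0.5, 0.9, 1.0)
    run_benchmark(; fn_definitions, models = ["gpt-3.5-turbo-1106"],
        prompt_labels = ["JuliaExpertAsk"],
        api_kwargs = (; temperature, top_p),    # forwarded to the model API
        experiment = "hyperparameter-scan",     # keeps these runs separate from the main benchmark
        save_dir = "temp", num_samples = 1)
end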

Contributing Results

  1. Run Your Evaluation: Choose your model and prompt, and run the test.
  2. Save Results: Store both the conversation and the evaluation (see the sketch after this list).
  3. Open a PR: Include the part of the code snippet you changed in the PR comments. We generally require 1-2 independent verifications of your result or at least 3 samples for each combination (for validity).
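
A condensed, hedged sketch of steps 1-2, adapted from the evaluate_1shot example in the Reference (the model and device values are illustrative):

using JuliaLLMLeaderboard, PromptingTools
fn_definition = joinpath("code_generation", "utility_functions", "event_scheduler", "definition.toml")
d = load_definition(fn_definition)
# Step 1: run your evaluation (return_all=true keeps the whole conversation)
msg = aigenerate(:JuliaExpertAsk; ask = d["code_generation"]["prompt"], model = "gpt4t", return_all = true)
# Step 2: score it and save both the conversation and the evaluation next to the definition
evaluate_1shot(; conversation = msg, fn_definition, definition = d["code_generation"],
    model = "gpt4t", prompt_label = "JuliaExpertAsk", schema = "-",
    device = "Apple-MacBook-Pro-M1", auto_save = true)
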
diff --git a/dev/index.html b/dev/index.html index 698fc0613..7d96e0d69 100644 --- a/dev/index.html +++ b/dev/index.html @@ -1,2 +1,2 @@ -Home · JuliaLLMLeaderboard.jl

+Home · JuliaLLMLeaderboard.jl

JuliaLLMLeaderboard

Documentation for Julia LLM Leaderboard.

Introduction

Welcome to the Julia Code Generation Benchmark Repository!

This project is designed for the Julia community to compare the code generation capabilities of various AI models. Unlike academic benchmarks, our focus is practicality and simplicity: "Generate code, run it, and see if it works(-ish)."

This repository aims to understand how different AI models and prompting strategies perform at generating syntactically correct Julia code, in order to guide users in choosing the best model for their needs.

Itchy fingers? Open the Results section or just run your own benchmark with run_benchmark() (eg, examples/code_gen_benchmark.jl).

First Steps

To get started with benchmarking, see the Getting Started section, or simply continue to the Results.

Feedback and Improvements

We highly value community input. If you have suggestions or ideas for improvement, please open an issue. All contributions are welcome!

diff --git a/dev/methodology/index.html b/dev/methodology/index.html index d0e86f160..cdc290121 100644 --- a/dev/methodology/index.html +++ b/dev/methodology/index.html @@ -1,2 +1,2 @@ -Methodology · JuliaLLMLeaderboard.jl

+Methodology · JuliaLLMLeaderboard.jl

Automated Evaluation Methodology

Each model's and prompt's performance is evaluated based on several criteria:

  1. Parsing: Does the generated code parse correctly in Julia?
  2. Execution: Can the code execute without errors?
  3. Unit Tests: Do the included unit tests pass?
  4. Example Runs: Does the code run in a provided example scenario?

At the moment, all criteria are weighted equally and each test case can earn a maximum of 100 points.

If the generated code passes all criteria, it gets 100/100 points.

If it fails one criterion (eg, all unit tests fail), it gets 75/100 points.

If it fails two criteria (eg, it runs, but all examples and unit tests are broken), it gets 50/100 points, and so on.
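
For illustration, the score_eval helper documented in the Reference takes the four criteria directly (booleans for parsing/execution, 0-1 success ratios for tests/examples); a hedged sketch of the arithmetic:

using JuliaLLMLeaderboard
score_eval(true, true, 10 / 10, 3 / 3)   # all four criteria met        -> 100 points
score_eval(true, true, 0 / 10, 3 / 3)    # all unit tests fail          -> ~75 points
score_eval(true, false, 0 / 10, 0 / 3)   # parses but does not execute  -> ~25 points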

Definition.toml

Each test case is defined in a definition.toml file with the structure described in Anatomy of definition.toml.

We chose the TOML format because it is human-readable and easy to edit in a text editor or on GitHub.

Repo Structure / Naming Convention

To enhance transparency and reproducibility, we save all conversations and evaluations in a nested folder structure.

Folder Convention:

  • Definitions are saved in nested folders following the format code_generation/category/test_case_name/definition.toml
  • Evaluation results are saved in nested sub-folders, keyed by the model:
    • Evaluation result: code_generation/category/test_case_name/model/evaluation__PROMPT__STRATEGY__TIMESTAMP.json
    • Conversation: code_generation/category/test_case_name/model/conversation__PROMPT__STRATEGY__TIMESTAMP.json

You can load any conversation with PromptingTools.load_conversation() and display it with edit or preview depending on your IDE/preference.

You can load any evaluation with JSON3.read and score it with score_eval.
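
A hedged sketch of that workflow (the model folder and file names below are hypothetical, but follow the convention above):

using JuliaLLMLeaderboard, JSON3, PromptingTools
base = joinpath("code_generation", "utility_functions", "event_scheduler", "gpt4t")   # hypothetical model folder
conv = PromptingTools.load_conversation(joinpath(base, "conversation__JuliaExpertAsk__1SHOT__20231201_120000.json"))
preview(conv)    # render the conversation as markdown in the REPL
ev = JSON3.read(joinpath(base, "evaluation__JuliaExpertAsk__1SHOT__20231201_120000.json"))
score_eval(ev)   # 0-100 points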

diff --git a/dev/objects.inv b/dev/objects.inv index 317d6b39b..84eff8b1a 100644 Binary files a/dev/objects.inv and b/dev/objects.inv differ diff --git a/dev/reference/index.html b/dev/reference/index.html index 5fa154b42..8ddd4fc0c 100644 --- a/dev/reference/index.html +++ b/dev/reference/index.html @@ -1,5 +1,5 @@ -Reference · JuliaLLMLeaderboard.jl

Reference

InteractiveUtils.editFunction
InteractiveUtils.edit(conversation::AbstractVector{<:PT.AbstractMessage}, bookmark::Int=-1)

Opens the conversation in a preview window formatted as markdown (In VSCode, right click on the tab and select "Open Preview" to format it nicely).

See also: preview (for rendering as markdown in REPL)

source
JuliaLLMLeaderboard.evaluate_1shotMethod
evaluate_1shot(; conversation, fn_definition, definition, model, prompt_label, schema, parameters::NamedTuple=NamedTuple(), device="UNKNOWN", timestamp=timestamp_now(), version_pt=string(pkgversion(PromptingTools)), prompt_strategy="1SHOT", verbose::Bool=false,
+Reference · JuliaLLMLeaderboard.jl

Reference

InteractiveUtils.editFunction
InteractiveUtils.edit(conversation::AbstractVector{<:PT.AbstractMessage}, bookmark::Int=-1)

Opens the conversation in a preview window formatted as markdown (In VSCode, right click on the tab and select "Open Preview" to format it nicely).

See also: preview (for rendering as markdown in REPL)

source
JuliaLLMLeaderboard.evaluate_1shotMethod
evaluate_1shot(; conversation, fn_definition, definition, model, prompt_label, schema, parameters::NamedTuple=NamedTuple(), device="UNKNOWN", timestamp=timestamp_now(), version_pt=string(pkgversion(PromptingTools)), prompt_strategy="1SHOT", verbose::Bool=false,
 auto_save::Bool=true, save_dir::AbstractString=dirname(fn_definition), experiment::AbstractString="",
 execution_timeout::Int=60, capture_stdout::Bool=true)

Runs evaluation for a single test case (parse, execute, run examples, run unit tests), including saving the files.

If auto_save=true, it saves the following files

  • <model-name>/evaluation__PROMPTABC__1SHOT__TIMESTAMP.json
  • <model-name>/conversation__PROMPTABC__1SHOT__TIMESTAMP.json

into a sub-folder of where the definition file was stored.

Keyword Arguments

  • conversation: the conversation to evaluate (vector of messages), eg, from aigenerate when return_all=true
  • fn_definition: path to the definition file (eg, joinpath("code_generation", "utility_functions", "event_scheduler", "definition.toml"))
  • definition: the test case definition dict loaded from the definition file. It's subset to only the relevant keys for code generation, eg, definition=load_definition(fn_definition)["code_generation"]
  • model: the model name, eg, model="gpt4t"
  • prompt_label: the prompt label, eg, prompt_label="JuliaExpertAsk"
  • schema: the schema used for the prompt, eg, schema="-" or schema="OllamaManagedSchema()"
  • parameters: the parameters used for the generation like temperature or top_p, eg, parameters=(; top_p=0.9)
  • device: the device used for the generation, eg, device="Apple-MacBook-Pro-M1"
  • timestamp: the timestamp used for the generation. Defaults to timestamp=timestamp_now() which is like "20231201_120000"
  • version_pt: the version of PromptingTools used for the generation, eg, version_pt="0.1.0"
  • prompt_strategy: the prompt strategy used for the generation, eg, prompt_strategy="1SHOT". Fixed for now!
  • verbose: if verbose=true, it will print out more information about the evaluation process, eg, the evaluation errors
  • auto_save: if auto_save=true, it will save the evaluation and conversation files into a sub-folder of where the definition file was stored.
  • save_dir: the directory where the evaluation and conversation files are saved. Defaults to dirname(fn_definition).
  • experiment: the experiment name, eg, experiment="my_experiment" (eg, when you're doing a parameter search). Defaults to "" for standard benchmark run.
  • execution_timeout: the timeout for the AICode code execution in seconds. Defaults to 60s.
  • capture_stdout: if capture_stdout=true, AICode will capture the stdout of the code execution. Set to false if you're evaluating with multithreading (stdout capture is not thread-safe). Defaults to true to avoid polluting the benchmark.
  • remove_tests: if remove_tests=true, AICode will remove any @testset blocks and unit tests from the main code definition (shields against the model inadvertently defining wrong unit tests).

Examples

using JuliaLLMLeaderboard
 using PromptingTools
@@ -10,7 +10,7 @@
 msg = aigenerate(:JuliaExpertAsk; ask=d["code_generation"]["prompt"], model="gpt4t", return_all=true)
 
 # Try evaluating it -- auto_save=false so as not to pollute our benchmark
-evals = evaluate_1shot(; conversation=msg, fn_definition, definition=d["code_generation"], model="gpt4t", prompt_label="JuliaExpertAsk", timestamp=timestamp_now(), device="Apple-MacBook-Pro-M1", schema="-", prompt_strategy="1SHOT", verbose=true, auto_save=false)
source
JuliaLLMLeaderboard.load_evalsMethod
load_evals(base_dir::AbstractString; score::Bool=true, max_history::Int=5, new_columns::Vector{Symbol}=Symbol[], kwargs...)

Loads all evaluation JSONs from a given directory into a DataFrame, one evaluation per row. The directory is searched recursively, and all files starting with the prefix evaluation__ are loaded.

Keyword Arguments

  • score::Bool=true: If score=true, the function will also call score_eval on the resulting DataFrame.
  • max_history::Int=5: Only max_history most recent evaluations are loaded. If max_history=0, all evaluations are loaded.

Returns: DataFrame

Note: It loads a fixed set of columns (set in a local variable eval_cols), so if you added some new columns, you'll need to pass them to new_columns::Vector{Symbol} argument.

source
JuliaLLMLeaderboard.previewMethod
preview(conversation::AbstractVector{<:PT.AbstractMessage})

Render a conversation, which is a vector of AbstractMessage objects, as a single markdown-formatted string. Each message is rendered individually and concatenated with separators for clear readability.

This function is particularly useful for displaying the flow of a conversation in a structured and readable format. It leverages the PT.preview method for individual messages to create a cohesive view of the entire conversation.

Arguments

  • conversation::AbstractVector{<:PT.AbstractMessage}: A vector of messages representing the conversation.

Returns

  • String: A markdown-formatted string representing the entire conversation.

Example

conversation = [
+evals = evaluate_1shot(; conversation=msg, fn_definition, definition=d["code_generation"], model="gpt4t", prompt_label="JuliaExpertAsk", timestamp=timestamp_now(), device="Apple-MacBook-Pro-M1", schema="-", prompt_strategy="1SHOT", verbose=true, auto_save=false)
source
JuliaLLMLeaderboard.load_evalsMethod
load_evals(base_dir::AbstractString; score::Bool=true, max_history::Int=5, new_columns::Vector{Symbol}=Symbol[], kwargs...)

Loads all evaluation JSONs from a given directory into a DataFrame, one evaluation per row. The directory is searched recursively, and all files starting with the prefix evaluation__ are loaded.

Keyword Arguments

  • score::Bool=true: If score=true, the function will also call score_eval on the resulting DataFrame.
  • max_history::Int=5: Only max_history most recent evaluations are loaded. If max_history=0, all evaluations are loaded.

Returns: DataFrame

Note: It loads a fixed set of columns (set in a local variable eval_cols), so if you added some new columns, you'll need to pass them to new_columns::Vector{Symbol} argument.

source
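
A hedged usage sketch (not part of the docstring; the directory is illustrative and assumes the repository root as the working directory):

using JuliaLLMLeaderboard
df = load_evals("code_generation"; max_history = 0)   # load and score all saved evaluation runs
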
JuliaLLMLeaderboard.previewMethod
preview(conversation::AbstractVector{<:PT.AbstractMessage})

Render a conversation, which is a vector of AbstractMessage objects, as a single markdown-formatted string. Each message is rendered individually and concatenated with separators for clear readability.

This function is particularly useful for displaying the flow of a conversation in a structured and readable format. It leverages the PT.preview method for individual messages to create a cohesive view of the entire conversation.

Arguments

  • conversation::AbstractVector{<:PT.AbstractMessage}: A vector of messages representing the conversation.

Returns

  • String: A markdown-formatted string representing the entire conversation.

Example

conversation = [
     PT.SystemMessage("Welcome"),
     PT.UserMessage("Hello"),
     PT.AIMessage("Hi, how can I help you?")
@@ -23,9 +23,9 @@
 ---
 # AI Message
 Hi, how can I help you?
----
source
JuliaLLMLeaderboard.previewMethod
preview(msg::PT.AbstractMessage)

Render a single AbstractMessage as a markdown-formatted string, highlighting the role of the message sender and the content of the message.

This function identifies the type of the message (User, Data, System, AI, or Unknown) and formats it with a header indicating the sender's role, followed by the content of the message. The output is suitable for nicer rendering, especially in REPL or markdown environments.

Arguments

  • msg::PT.AbstractMessage: The message to be rendered.

Returns

  • String: A markdown-formatted string representing the message.

Example

msg = PT.UserMessage("Hello, world!")
+---
source
JuliaLLMLeaderboard.previewMethod
preview(msg::PT.AbstractMessage)

Render a single AbstractMessage as a markdown-formatted string, highlighting the role of the message sender and the content of the message.

This function identifies the type of the message (User, Data, System, AI, or Unknown) and formats it with a header indicating the sender's role, followed by the content of the message. The output is suitable for nicer rendering, especially in REPL or markdown environments.

Arguments

  • msg::PT.AbstractMessage: The message to be rendered.

Returns

  • String: A markdown-formatted string representing the message.

Example

msg = PT.UserMessage("Hello, world!")
 println(PT.preview(msg))

This will output:

# User Message
-Hello, world!
source
JuliaLLMLeaderboard.run_benchmarkMethod
run_benchmark(; fn_definitions::Vector{<:AbstractString}=find_definitons(joinpath(@__DIR__, "..", "code_generation")),
 models::Vector{String}=["gpt-3.5-turbo-1106"], model_suffix::String="", prompt_labels::Vector{<:AbstractString}=["JuliaExpertCoTTask", "JuliaExpertAsk", "InJulia", "AsIs", "JuliaRecapTask", "JuliaRecapCoTTask"],
 api_kwargs::NamedTuple=NamedTuple(), http_kwargs::NamedTuple=(; readtimeout=300),
 experiment::AbstractString="", save_dir::AbstractString="", auto_save::Bool=true, verbose::Union{Int,Bool}=true, device::AbstractString="-",
@@ -35,7 +35,7 @@
     experiment="my-first-run", save_dir="temp", auto_save=true, verbose=true, device="Apple-MacBook-Pro-M1",
     num_samples=1);
 
-# not using `schema_lookup` as it's not needed for OpenAI models

Or if you want only one test case use: fn_definitions = [joinpath("code_generation", "utility_functions", "event_scheduler", "definition.toml")]

source
JuliaLLMLeaderboard.run_code_blocks_additiveMethod
run_code_blocks_additive(cb::AICode, code_blocks::AbstractVector{<:AbstractString};
+# not using `schema_lookup` as it's not needed for OpenAI models

Or if you want only one test case use: fn_definitions = [joinpath("code_generation", "utility_functions", "event_scheduler", "definition.toml")]

source
JuliaLLMLeaderboard.run_code_blocks_additiveMethod
run_code_blocks_additive(cb::AICode, code_blocks::AbstractVector{<:AbstractString};
     verbose::Bool = false,
     setup_code::AbstractString = "", teardown_code::AbstractString = "",
     capture_stdout::Bool = true, execution_timeout::Int = 60)

Runner for the additional code_blocks (either unit tests or examples); returns the count of examples executed without an error thrown.

code_blocks should be a vector of strings, each of which is a valid Julia expression that can be evaluated without an error thrown. Each successful run (no error thrown) is counted as a successful example.

Keyword Arguments

  • verbose=true will provide more information about the test failures.
  • setup_code is a string that will be prepended to each code block before it's evaluated. Useful for setting up the environment/test objects.
  • teardown_code is a string that will be appended to each code block before it's evaluated. Useful for cleaning up the environment/test objects.
  • capture_stdout is a boolean whether to capture the stdout of the code execution. Set to false if you're evaluating with multithreading (stdout capture is not thread-safe).
  • execution_timeout is the timeout for the AICode code execution in seconds. Defaults to 60s.

Returns

  • count_successful: the number of examples that were executed without an error thrown.

Example

using JuliaLLMLeaderboard: run_code_blocks
@@ -44,14 +44,14 @@
 cb = AICode("mysum(a,b)=a+b")
 code = "mysum(1,2)"
 run_code_blocks(cb, [code])
-# Output: 1 (= 1 example executed without an error thrown)
source
JuliaLLMLeaderboard.run_code_mainMethod
run_code_main(msg::PT.AIMessage; verbose::Bool = true, function_name::AbstractString = "",
+# Output: 1 (= 1 example executed without an error thrown)
source
JuliaLLMLeaderboard.run_code_mainMethod
run_code_main(msg::PT.AIMessage; verbose::Bool = true, function_name::AbstractString = "",
     prefix::String = "",
     execution_timeout::Int = 60,
     capture_stdout::Bool = true,
-    expression_transform::Symbol = :remove_all_tests)

Runs the code block in the message msg and returns the result as an AICode object.

Logic:

  • Always execute with a timeout
  • Always execute in a "safe mode" (inside a custom module, safe_eval=true)
  • Skip any package imports or environment changes (skip_unsafe=true)
  • Skip invalid/broken lines (skip_invalid=true)
  • Remove any unit tests (expression_transform=:remove_all_tests), because the model might have added some without being asked explicitly
  • First, evaluate the code block as a whole, and if it fails, try to extract the function definition and evaluate it separately (fallback)
source
JuliaLLMLeaderboard.score_evalMethod
score_eval(parsed, executed, unit_tests_success_ratio, examples_success_ratio; max_points::Int=100)

Score the evaluation result by distributing max_points equally across the available criteria.

Example

df=@rtransform df :score = score_eval(:parsed, :executed, :unit_tests_passed / :unit_tests_count, :examples_executed / :examples_count)
source
JuliaLLMLeaderboard.score_evalMethod
score_eval(eval::AbstractDict; max_points::Int=100)
+    expression_transform::Symbol = :remove_all_tests)

Runs the code block in the message msg and returns the result as an AICode object.

Logic:

  • Always execute with a timeout
  • Always execute in a "safe mode" (inside a custom module, safe_eval=true)
  • Skip any package imports or environment changes (skip_unsafe=true)
  • Skip invalid/broken lines (skip_invalid=true)
  • Remove any unit tests (expression_transform=:remove_all_tests), because the model might have added some without being asked explicitly
  • First, evaluate the code block as a whole, and if it fails, try to extract the function definition and evaluate it separately (fallback)
source
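
A hedged usage sketch (not part of the docstring; the prompt is illustrative, and the success field comes from PromptingTools' AICode):

using JuliaLLMLeaderboard, PromptingTools
msg = aigenerate(:JuliaExpertAsk; ask = "Write a function add_one(x) that returns x + 1", model = "gpt4t")
cb = run_code_main(msg; verbose = true, execution_timeout = 60)
cb.success   # true if the extracted code parsed and executed without an error
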
JuliaLLMLeaderboard.score_evalMethod
score_eval(parsed, executed, unit_tests_success_ratio, examples_success_ratio; max_points::Int=100)

Score the evaluation result by distributing max_points equally across the available criteria.

Example

df=@rtransform df :score = score_eval(:parsed, :executed, :unit_tests_passed / :unit_tests_count, :examples_executed / :examples_count)
source
JuliaLLMLeaderboard.score_evalMethod
score_eval(eval::AbstractDict; max_points::Int=100)
 
-score_eval(parsed, executed, unit_tests_success_ratio, examples_success_ratio; max_points::Int=100)

Scores the evaluation result eval by distributing max_points equally across the available criteria. Alternatively, you can provide the individual scores as arguments (see above) with values in the 0-1 range.

Eg, if all 4 criteria are available, each will be worth 25% of points:

  • parsed (25% if true)
  • executed (25% if true)
  • unit_tests (25% if all unit tests passed)
  • examples (25% if all examples executed without an error thrown)
source
JuliaLLMLeaderboard.timestamp_nowMethod

Provides the current timestamp in the format yyyymmddHHMMSS. If `addrandom` is true, a random number between 100 and 999 is appended to avoid filename collisions.

source
JuliaLLMLeaderboard.tmapreduceMethod
tmapreduce(f, op, itr; tasks_per_thread::Int = 2, kwargs...)

A parallelized version of the mapreduce function leveraging multi-threading.

The function f is applied to each element of itr, and then the results are reduced using an associative two-argument function op.

Arguments

  • f: A function to apply to each element of itr.
  • op: An associative two-argument reduction function.
  • itr: An iterable collection of data.

Keyword Arguments

  • tasks_per_thread::Int = 2: The number of tasks spawned per thread. Determines the granularity of parallelism.
  • kwargs...: Additional keyword arguments to pass to the inner mapreduce calls.

Implementation Details

The function divides itr into chunks, spawning tasks for processing each chunk in parallel. The size of each chunk is determined by tasks_per_thread and the number of available threads (nthreads). The results from each task are then aggregated using the op function.
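
Below is a minimal sketch of that chunking strategy, adapted from the pattern in the referenced blog post; it is not necessarily the package's exact implementation.

using Base.Threads: nthreads, @spawn

function tmapreduce_sketch(f, op, itr; tasks_per_thread::Int = 2, kwargs...)
    # Aim for roughly tasks_per_thread tasks per available thread
    chunk_size = max(1, length(itr) ÷ (tasks_per_thread * nthreads()))
    tasks = map(Iterators.partition(itr, chunk_size)) do chunk
        # Each chunk is reduced on its own task, so chunks run in parallel
        @spawn mapreduce(f, op, chunk; kwargs...)
    end
    # Combine the per-chunk results with the same reduction operator
    mapreduce(fetch, op, tasks)
end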

Notes

This implementation serves as a general replacement for older patterns. The goal is to introduce this function or a version of it to base Julia in the future.

Example

using Base.Threads: nthreads, @spawn
result = tmapreduce(x -> x^2, +, 1:10)

The above example squares each number in the range 1 through 10 and then sums them up in parallel.

Source: Julia Blog post

source
JuliaLLMLeaderboard.validate_definition - Method
validate_definition(definition::AbstractDict; evaluate::Bool=true, verbose::Bool=true)

Validates the definition.toml file for the code generation benchmark.

Returns true if the definition is valid.

Keyword Arguments

  • evaluate: whether to evaluate the definition. Defaults to true.
  • verbose: whether to print progress during the evaluation. Defaults to true.
  • kwargs: additional keyword arguments passed to the code parsing function (PT.AICode).

Example

fn_definition = joinpath("code_generation", "utility_functions", "event_scheduler", "definition.toml")
 definition = load_definition(fn_definition)
 validate_definition(definition)
# output: true
source
diff --git a/dev/test_definitions/index.html b/dev/test_definitions/index.html index 20a685bcf..b8edefb56 100644 --- a/dev/test_definitions/index.html +++ b/dev/test_definitions/index.html @@ -1,2 +1,2 @@
Test Definitions · JuliaLLMLeaderboard.jl

Develop Your Test Case

All test cases are defined in definition.toml files with the structure described below.

Folder Structure

Definitions are saved in the following file path: code_generation/category/test_case_name/definition.toml.

Anatomy of definition.toml

Required fields in definition.toml include:

  • name: Corresponding to the file path.
  • contributor: The creator of the test case (and their collaborators).
  • criteria: The evaluation criteria (eg, parsing, execution, unit_tests, examples).
  • prompt: The problem statement or task.
  • version: The version of the test case. Starts at "1.0".
  • examples: Example scenarios for testing, provided as a vector of executable statements using the function name (eg, my_function(1, 2)).
  • unit_tests: Tests to validate the code, provided as a vector of @test X == Z statements.
  • imports: Packages that are made available to the model (to avoid failures due to a failed dependency).
  • reference_solution: A reference solution to the problem, provided as a string of Julia code (no code fences).

There are several optional fields:

  • examples_setup: Code to run before each example eval, provided as a string of Julia code (no code fences). Used to set up any variables or functions needed for the examples.
  • examples_teardown: Code to run after each example eval, provided as a string of Julia code (no code fences). Used to clean up any variables or functions used by the examples.
  • unit_tests_setup: Code to run before each unit test eval, provided as a string of Julia code (no code fences). Used to set up any variables or functions needed for the unit tests.
  • unit_tests_teardown: Code to run after each unit test eval, provided as a string of Julia code (no code fences). Used to clean up any variables or functions used by the unit tests.

These optional fields make it easier to re-use code across the examples and unit tests.

See an example in examples/create_definition.jl.
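
For orientation, a hypothetical definition.toml might look like the sketch below. The field names follow the lists above, but every value is illustrative only; use examples/create_definition.jl and the existing definitions in the repository as the canonical reference.

# Hypothetical test case; all values are placeholders, not a real benchmark case
name = "add_two"
contributor = "your_github_handle"
criteria = ["parsing", "execution", "unit_tests", "examples"]
prompt = "Write a function `add_two(x)` that returns x + 2."
version = "1.0"
examples = ["add_two(1)", "add_two(-5)"]
unit_tests = ["@test add_two(1) == 3", "@test add_two(0) == 2"]
imports = ["Test"]
reference_solution = "add_two(x) = x + 2"
# Optional fields such as examples_setup or unit_tests_setup can be added as strings of Julia code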

You can validate your test case definitions with validate_definition.

Feedback and Improvements

We highly value community input. If you have suggestions or ideas for improvement, please open an issue. All contributions are welcome!
