diff --git a/src/autora/doc/pipelines/main.py b/src/autora/doc/pipelines/main.py
index d01c407..69c1f55 100644
--- a/src/autora/doc/pipelines/main.py
+++ b/src/autora/doc/pipelines/main.py
@@ -3,13 +3,11 @@
 from timeit import default_timer as timer
 from typing import Dict, List, Tuple
 
-import nltk
 import torch
 import typer
-from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
-from nltk.translate.meteor_score import single_meteor_score
 
 from autora.doc.classes.EvalResult import EvalResult
+from autora.doc.pipelines.metrics import eval_bleu_meteor
 from autora.doc.runtime.predict_hf import Predictor
 from autora.doc.runtime.prompts import PROMPTS, PromptIds
 from autora.doc.util import get_prompts_from_file
@@ -22,33 +20,6 @@
 logger = logging.getLogger(__name__)
 
 
-def evaluate_documentation(predictions: List[str], references: List[str]) -> Tuple[float, float]:
-    nltk.download("wordnet")
-
-    # Tokenize references
-    tokenized_references = [ref.split() for ref in references]
-    # Currently there is only 1 prediction for 1 reference, need to avg in future
-    tokenized_predictions = [pred.split() if pred else [] for pred in predictions]
-
-    # Calculate BLEU score with smoothing function
-    # SmoothingFunction().method1 is used to avoid zero scores for n-grams not found in the reference.
-    bleu = corpus_bleu(
-        # Wrap each reference list in another list
-        [[tokenized_ref] for tokenized_ref in tokenized_references],
-        tokenized_predictions,
-        smoothing_function=SmoothingFunction().method1,
-    )
-
-    # Calculate METEOR scores
-    meteor_scores = [
-        single_meteor_score(tokenized_ref, tokenized_pred)
-        for tokenized_ref, tokenized_pred in zip(tokenized_references, tokenized_predictions)
-    ]
-    meteor = sum(meteor_scores) / len(predictions) if predictions else 0
-
-    return (bleu, meteor)
-
-
 @app.command(help="Evaluate a model for code-to-documentation generation for all prompts in the prompts_file")
 def eval_prompts(
     data_file: str = typer.Argument(..., help="JSONL Data file to evaluate on"),
@@ -143,7 +114,7 @@ def eval_prompt(
     timer_start = timer()
     predictions = pred.predict(prompt, inputs, **param_dict)
     timer_end = timer()
-    bleu, meteor = evaluate_documentation(predictions, labels)
+    bleu, meteor = eval_bleu_meteor(predictions, labels)
     pred_time = timer_end - timer_start
     mlflow.log_metric("prediction_time/doc", pred_time / (len(inputs)))
     for i in range(len(inputs)):
diff --git a/src/autora/doc/pipelines/metrics.py b/src/autora/doc/pipelines/metrics.py
new file mode 100644
index 0000000..92bdf4c
--- /dev/null
+++ b/src/autora/doc/pipelines/metrics.py
@@ -0,0 +1,32 @@
+from typing import List, Tuple
+
+import nltk
+from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
+from nltk.translate.meteor_score import single_meteor_score
+
+
+def eval_bleu_meteor(predictions: List[str], references: List[str]) -> Tuple[float, float]:
+    nltk.download("wordnet")
+
+    # Tokenize references
+    tokenized_references = [ref.split() for ref in references]
+    # Currently there is only 1 prediction for 1 reference, need to avg in future
+    tokenized_predictions = [pred.split() if pred else [] for pred in predictions]
+
+    # Calculate BLEU score with smoothing function
+    # SmoothingFunction().method1 is used to avoid zero scores for n-grams not found in the reference.
+    bleu = corpus_bleu(
+        # Wrap each reference list in another list
+        [[tokenized_ref] for tokenized_ref in tokenized_references],
+        tokenized_predictions,
+        smoothing_function=SmoothingFunction().method1,
+    )
+
+    # Calculate METEOR scores
+    meteor_scores = [
+        single_meteor_score(tokenized_ref, tokenized_pred)
+        for tokenized_ref, tokenized_pred in zip(tokenized_references, tokenized_predictions)
+    ]
+    meteor = sum(meteor_scores) / len(predictions) if predictions else 0
+
+    return (bleu, meteor)
diff --git a/tests/test_main.py b/tests/test_main.py
index 2437be1..13be933 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -1,11 +1,8 @@
 from pathlib import Path
-from typing import Dict, List
-
-import jsonlines
-import pytest
+from typing import List
 
 from autora.doc.classes.EvalResult import EvalResult
-from autora.doc.pipelines.main import eval, eval_prompts, evaluate_documentation, generate, import_data
+from autora.doc.pipelines.main import eval, eval_prompts, generate, import_data
 from autora.doc.runtime.prompts import PromptIds
 
 # dummy HF model for testing
@@ -20,55 +17,6 @@ def test_predict() -> None:
     assert len(output) > 0, "Expected non-empty output"
 
 
-def test_evaluation() -> None:
-    # Test Case: Meteor and Bleu scores are close to 1
-    data = Path(__file__).parent.joinpath("../data/sweetpea/data.jsonl").resolve()
-    with jsonlines.open(data) as reader:
-        items = [item for item in reader]
-        labels = [item["output"] for item in items]
-        predictions = [item["output"] for item in items]
-
-    bleu, meteor = evaluate_documentation(predictions, labels)
-    assert bleu == pytest.approx(1, 0.01), f"BLEU Score is {bleu}"
-    assert meteor == pytest.approx(1, 0.01), f"METEOR Score is {meteor}"
-
-
-def test_extra_token_in_prediction() -> None:
-    # Test Case bleu score should be less due to brevity penalty and meteor is robust to small mistakes
-    labels = ["this is a test"]
-    predictions = ["this is a test extra"]
-    bleu, meteor = evaluate_documentation(predictions, labels)
-    assert 0.6 <= bleu <= 0.8, f"BLEU Score is {bleu}"
-    assert 0.8 <= meteor <= 1, f"METEOR Score is {meteor}"
-
-
-def test_missing_token_in_prediction() -> None:
-    # bleu score is less, meteor is higher
-    labels = ["this is a test"]
-    predictions = ["this is a"]
-    bleu, meteor = evaluate_documentation(predictions, labels)
-    assert 0.4 <= bleu <= 0.6, f"BLEU Score is {bleu}"
-    assert 0.6 <= meteor <= 0.8, f"METEOR Score is {meteor}"
-
-
-def test_completely_different_tokens() -> None:
-    # both scores are less, as no common tokens
-    labels = ["this is a test"]
-    predictions = ["completely different sentence"]
-    bleu, meteor = evaluate_documentation(predictions, labels)
-    assert bleu <= 0.1, f"BLEU Score is {bleu}"
-    assert meteor <= 0.1, f"METEOR Score is {meteor}"
-
-
-def test_partially_matching_tokens() -> None:
-    # As ngrams arent matching because of extra token within, BLEU score is very less. Meteor gives a good score only.
-    labels = ["this is a test"]
-    predictions = ["this is a different test"]
-    bleu, meteor = evaluate_documentation(predictions, labels)
-    assert 0.25 <= bleu <= 0.4, f"BLEU Score is {bleu}"
-    assert 0.8 <= meteor <= 0.95, f"METEOR Score is {meteor}"
-
-
 def test_generate() -> None:
     python_file = __file__
     output = Path("output.txt")
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
new file mode 100644
index 0000000..693b93a
--- /dev/null
+++ b/tests/test_metrics.py
@@ -0,0 +1,55 @@
+from pathlib import Path
+
+import jsonlines
+import pytest
+
+from autora.doc.pipelines.metrics import eval_bleu_meteor
+
+
+def test_evaluation() -> None:
+    # Test Case: Meteor and Bleu scores are close to 1
+    data = Path(__file__).parent.joinpath("../data/sweetpea/data.jsonl").resolve()
+    with jsonlines.open(data) as reader:
+        items = [item for item in reader]
+        labels = [item["output"] for item in items]
+        predictions = [item["output"] for item in items]
+
+    bleu, meteor = eval_bleu_meteor(predictions, labels)
+    assert bleu == pytest.approx(1, 0.01), f"BLEU Score is {bleu}"
+    assert meteor == pytest.approx(1, 0.01), f"METEOR Score is {meteor}"
+
+
+def test_extra_token_in_prediction() -> None:
+    # Test Case bleu score should be less due to brevity penalty and meteor is robust to small mistakes
+    labels = ["this is a test"]
+    predictions = ["this is a test extra"]
+    bleu, meteor = eval_bleu_meteor(predictions, labels)
+    assert 0.6 <= bleu <= 0.8, f"BLEU Score is {bleu}"
+    assert 0.8 <= meteor <= 1, f"METEOR Score is {meteor}"
+
+
+def test_missing_token_in_prediction() -> None:
+    # bleu score is less, meteor is higher
+    labels = ["this is a test"]
+    predictions = ["this is a"]
+    bleu, meteor = eval_bleu_meteor(predictions, labels)
+    assert 0.4 <= bleu <= 0.6, f"BLEU Score is {bleu}"
+    assert 0.6 <= meteor <= 0.8, f"METEOR Score is {meteor}"
+
+
+def test_completely_different_tokens() -> None:
+    # both scores are less, as no common tokens
+    labels = ["this is a test"]
+    predictions = ["completely different sentence"]
+    bleu, meteor = eval_bleu_meteor(predictions, labels)
+    assert bleu <= 0.1, f"BLEU Score is {bleu}"
+    assert meteor <= 0.1, f"METEOR Score is {meteor}"
+
+
+def test_partially_matching_tokens() -> None:
+    # As ngrams arent matching because of extra token within, BLEU score is very less. Meteor gives a good score only.
+    labels = ["this is a test"]
+    predictions = ["this is a different test"]
+    bleu, meteor = eval_bleu_meteor(predictions, labels)
+    assert 0.25 <= bleu <= 0.4, f"BLEU Score is {bleu}"
+    assert 0.8 <= meteor <= 0.95, f"METEOR Score is {meteor}"
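Usage note (not part of the patch): a minimal sketch of calling the relocated helper directly, as the existing tests do. The example strings below are illustrative assumptions; the first call downloads the NLTK wordnet corpus via nltk.download("wordnet").

    from autora.doc.pipelines.metrics import eval_bleu_meteor

    # Illustrative inputs: parallel lists of predicted and reference documentation strings.
    predictions = ["this is a generated docstring"]
    references = ["this is a reference docstring"]

    # Returns (BLEU, METEOR) as floats; triggers the wordnet download on first use.
    bleu, meteor = eval_bleu_meteor(predictions, references)
    print(f"BLEU: {bleu:.3f}, METEOR: {meteor:.3f}")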