diff --git a/.mypy.ini b/.mypy.ini
index d9e2214..6e719c2 100644
--- a/.mypy.ini
+++ b/.mypy.ini
@@ -8,3 +8,6 @@ ignore_missing_imports = True

 [mypy-mlflow.*]
 ignore_missing_imports = True
+
+[mypy-nltk.*]
+ignore_missing_imports = True
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index cc58e8e..fe08b75 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -78,7 +78,7 @@ repos:
         # supported by your project here, or alternatively use
         # pre-commit's default_language_version, see
         # https://pre-commit.com/#top_level-default_language_version
-        language_version: python3.10
+        language_version: python3
diff --git a/azureml/conda.yml b/azureml/conda.yml
index fab3656..fa4aa98 100644
--- a/azureml/conda.yml
+++ b/azureml/conda.yml
@@ -14,5 +14,6 @@ dependencies:
   - transformers>=4.35.2
   - xformers
   - scipy
+  - nltk
   # This works, while installing from pytorch and cuda from conda does not
   - torch==2.0.1
diff --git a/pyproject.toml b/pyproject.toml
index e1c4382..7ce597f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,7 @@ dependencies = [
    # This works, while installing from pytorch and cuda from conda does not",
    "torch==2.0.1",
    "transformers>=4.35.2",
+   "nltk",
 ]

 # On a mac, install optional dependencies with `pip install '.[dev]'` (include the single quotes)
diff --git a/src/autora/doc/pipelines/main.py b/src/autora/doc/pipelines/main.py
index a553153..d303bf5 100644
--- a/src/autora/doc/pipelines/main.py
+++ b/src/autora/doc/pipelines/main.py
@@ -1,10 +1,13 @@
 import itertools
 import logging
 from timeit import default_timer as timer
-from typing import List
+from typing import List, Tuple

+import nltk
 import torch
 import typer
+from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
+from nltk.translate.meteor_score import single_meteor_score

 from autora.doc.runtime.predict_hf import Predictor
 from autora.doc.runtime.prompts import INSTR, SYS, InstructionPrompts, SystemPrompts
@@ -17,6 +20,33 @@ logger = logging.getLogger(__name__)


+def evaluate_documentation(predictions: List[List[str]], references: List[str]) -> Tuple[float, float]:
+    nltk.download("wordnet")
+
+    # Tokenize references
+    tokenized_references = [ref.split() for ref in references]
+    # Currently there is only 1 prediction for 1 reference, need to avg in future
+    tokenized_predictions = [pred[0].split() if pred else [] for pred in predictions]
+
+    # Calculate BLEU score with smoothing function
+    # SmoothingFunction().method1 is used to avoid zero scores for n-grams not found in the reference.
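+    # Note: corpus_bleu expects, for each hypothesis, a list of alternative reference
+    # token lists, which is why each single reference is wrapped in its own list below.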
+    bleu = corpus_bleu(
+        # Wrap each reference list in another list
+        [[tokenized_ref] for tokenized_ref in tokenized_references],
+        tokenized_predictions,
+        smoothing_function=SmoothingFunction().method1,
+    )
+
+    # Calculate METEOR scores
+    meteor_scores = [
+        single_meteor_score(tokenized_ref, tokenized_pred)
+        for tokenized_ref, tokenized_pred in zip(tokenized_references, tokenized_predictions)
+    ]
+    meteor = sum(meteor_scores) / len(predictions) if predictions else 0
+
+    return (bleu, meteor)
+
+
 @app.command(help="Evaluate model on a data file")
 def eval(
     data_file: str = typer.Argument(..., help="JSONL Data file to evaluate on"),
@@ -55,6 +85,8 @@ def eval(
     pred = Predictor(model_path)
     timer_start = timer()
     predictions = pred.predict(sys_prompt, instr_prompt, inputs, **param_dict)
+    bleu, meteor = evaluate_documentation(predictions, labels)
+
     timer_end = timer()
     pred_time = timer_end - timer_start
     mlflow.log_metric("prediction_time/doc", pred_time / (len(inputs)))
@@ -63,6 +95,8 @@ def eval(
         mlflow.log_text(inputs[i], f"input_{i}.py")
         for j in range(len(predictions[i])):
             mlflow.log_text(predictions[i][j], f"prediction_{i}_{j}.txt")
+    mlflow.log_text(str(bleu), "bleu_score.txt")
+    mlflow.log_text(str(meteor), "meteor_score.txt")

     # flatten predictions for counting tokens
     predictions_flat = list(itertools.chain.from_iterable(predictions))
@@ -70,6 +104,8 @@ def eval(
     total_tokens = sum([len(token) for token in tokens])
     mlflow.log_metric("total_tokens", total_tokens)
     mlflow.log_metric("tokens/sec", total_tokens / pred_time)
+    mlflow.log_metric("bleu_score", round(bleu, 5))
+    mlflow.log_metric("meteor_score", round(meteor, 5))
     return predictions


diff --git a/tests/test_main.py b/tests/test_main.py
index f5a283e..a1eed5f 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -1,6 +1,9 @@
 from pathlib import Path

-from autora.doc.pipelines.main import eval, generate, import_data
+import jsonlines
+import pytest
+
+from autora.doc.pipelines.main import eval, evaluate_documentation, generate, import_data
 from autora.doc.runtime.prompts import InstructionPrompts, SystemPrompts

 # dummy HF model for testing
@@ -15,6 +18,55 @@ def test_predict() -> None:
     assert len(output[0]) > 0, "Expected non-empty output"


+def test_evaluation() -> None:
+    # Test case: predictions identical to references, so METEOR and BLEU scores are close to 1
+    data = Path(__file__).parent.joinpath("../data/sweetpea/data.jsonl").resolve()
+    with jsonlines.open(data) as reader:
+        items = [item for item in reader]
+        labels = [item["output"] for item in items]
+        predictions = [[item["output"]] for item in items]
+
+    bleu, meteor = evaluate_documentation(predictions, labels)
+    assert bleu == pytest.approx(1, 0.01), f"BLEU Score is {bleu}"
+    assert meteor == pytest.approx(1, 0.01), f"METEOR Score is {meteor}"
+
+
+def test_extra_token_in_prediction() -> None:
+    # An extra token lowers BLEU's n-gram precision; METEOR is robust to small mistakes
+    labels = ["this is a test"]
+    predictions = [["this is a test extra"]]
+    bleu, meteor = evaluate_documentation(predictions, labels)
+    assert 0.6 <= bleu <= 0.8, f"BLEU Score is {bleu}"
+    assert 0.8 <= meteor <= 1, f"METEOR Score is {meteor}"
+
+
+def test_missing_token_in_prediction() -> None:
+    # A missing token triggers BLEU's brevity penalty; METEOR stays higher
+    labels = ["this is a test"]
+    predictions = [["this is a"]]
+    bleu, meteor = evaluate_documentation(predictions, labels)
+    assert 0.4 <= bleu <= 0.6, f"BLEU Score is {bleu}"
+    assert 0.6 <= meteor <= 0.8, f"METEOR Score is {meteor}"
+
+
+def test_completely_different_tokens() -> None:
+    # With no tokens in common, both scores are close to zero
+    labels = ["this is a test"]
+    predictions = [["completely different sentence"]]
+    bleu, meteor = evaluate_documentation(predictions, labels)
+    assert bleu <= 0.1, f"BLEU Score is {bleu}"
+    assert meteor <= 0.1, f"METEOR Score is {meteor}"
+
+
+def test_partially_matching_tokens() -> None:
+    # An inserted token breaks most n-gram matches, so BLEU is low; METEOR still rewards the unigram overlap
+    labels = ["this is a test"]
+    predictions = [["this is a different test"]]
+    bleu, meteor = evaluate_documentation(predictions, labels)
+    assert 0.25 <= bleu <= 0.4, f"BLEU Score is {bleu}"
+    assert 0.8 <= meteor <= 0.95, f"METEOR Score is {meteor}"
+
+
 def test_generate() -> None:
     python_file = __file__
     output = Path("output.txt")
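For reference, a minimal sketch of how the new evaluate_documentation helper can be exercised; the example strings are illustrative (not taken from the repository's data), and the first call downloads the WordNet data that METEOR needs:

    from autora.doc.pipelines.main import evaluate_documentation

    labels = ["this is a test"]          # one reference string per input
    predictions = [["this is a test"]]   # one list of generated docs per input
    bleu, meteor = evaluate_documentation(predictions, labels)
    print(f"BLEU: {bleu:.3f}, METEOR: {meteor:.3f}")  # both near 1.0 for an exact match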