diff --git a/README.md b/README.md
index 6db068f..127c2f4 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,9 @@
 [![Template](https://img.shields.io/badge/Template-LINCC%20Frameworks%20Python%20Project%20Template-brightgreen)](https://lincc-ppt.readthedocs.io/en/latest/)
 [![PyPI](https://img.shields.io/pypi/v/autora-doc?color=blue&logo=pypi&logoColor=white)](https://pypi.org/project/autora-doc/)
-[![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/AutoResearch/autodoc/smoke-test.yml)](https://github.com/AutoResearch/autodoc/actions/workflows/smoke-test.yml)
+
+
+[![GitHub Workflow Status](https://github.com/autoresearch/autodoc/actions/workflows/smoke-test.yml/badge.svg)](https://github.com/AutoResearch/autodoc/actions/workflows/smoke-test.yml)
 [![codecov](https://codecov.io/gh/AutoResearch/autodoc/branch/main/graph/badge.svg)](https://codecov.io/gh/AutoResearch/autodoc)
 [![Read the Docs](https://img.shields.io/readthedocs/autora-doc)](https://autora-doc.readthedocs.io/)
@@ -76,7 +78,7 @@ az storage blob upload --account-name --container > --file
 Prediction
 
 ```sh
-az ml job create -f azureml/predict.yml --set display_name="Test prediction job" --web
+az ml job create -f azureml/eval.yml --set display_name="Test prediction job" --web
 ```
 
 Notes:
diff --git a/azureml/predict.yml b/azureml/eval.yml
similarity index 87%
rename from azureml/predict.yml
rename to azureml/eval.yml
index 7f888b4..a2f72b6 100644
--- a/azureml/predict.yml
+++ b/azureml/eval.yml
@@ -1,5 +1,10 @@
 $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json
-command: python -m autora.doc.pipelines.main predict ${{inputs.data_dir}}/data.jsonl ${{inputs.model_dir}}/llama-2-7b-chat-hf
+command: >
+  python -m autora.doc.pipelines.main eval
+  ${{inputs.data_dir}}/data.jsonl
+  ${{inputs.model_dir}}/llama-2-7b-chat-hf
+  SYS_1
+  INSTR_SWEETP_1
 code: ../src
 inputs:
   data_dir:
diff --git a/azureml/generate.yml b/azureml/generate.yml
new file mode 100644
index 0000000..c7df113
--- /dev/null
+++ b/azureml/generate.yml
@@ -0,0 +1,18 @@
+$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json
+command: >
+  python -m autora.doc.pipelines.main generate
+  --model-path ${{inputs.model_dir}}/llama-2-7b-chat-hf
+  --output ./outputs/output.txt
+  autora/doc/pipelines/main.py
+code: ../src
+inputs:
+  model_dir:
+    type: uri_folder
+    path: azureml://datastores/workspaceblobstore/paths/base_models
+environment:
+  image: mcr.microsoft.com/azureml/curated/acpt-pytorch-2.0-cuda11.7:21
+  conda_file: conda.yml
+display_name: autodoc_prediction
+compute: azureml:v100cluster
+experiment_name: autodoc_prediction
+description: |
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index e6db6a3..422c8ff 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,11 +16,11 @@ classifiers = [
 ]
 dynamic = ["version"]
 dependencies = [
-    "transformers>=4.35.2",
     "typer",
     "scipy",
     # This works, while installing from pytorch and cuda from conda does not",
    "torch==2.0.1",
+    "transformers>=4.35.2",
 ]
 
 # On a mac, install optional dependencies with `pip install '.[dev]'` (include the single quotes)
@@ -42,17 +42,17 @@ dev = [
     "nbsphinx", # Used to integrate Python notebooks into Sphinx documentation
     "ipython", # Also used in building notebooks into Sphinx
     "matplotlib", # Used in sample notebook intro_notebook.ipynb
-    "numpy", # Used in sample notebook intro_notebook.ipynb
     "ipykernel",
 ]
 train = [
+    "jsonlines",
     "mlflow",
-    "azureml-mlflow",
+]
+azure = [
     "azureml-core",
-    "jsonlines",
+    "azureml-mlflow",
 ]
-
-train_cuda = [
+cuda = [
     "bitsandbytes>=0.41.2.post2",
     "accelerate>=0.24.1",
     "xformers",
@@ -98,3 +98,6 @@ include = ["src/autora"]
 
 [tool.hatch.build.targets.wheel]
 packages = ["src/autora"]
+
+[project.scripts]
+autodoc = "autora.doc.pipelines.main:app"
\ No newline at end of file
diff --git a/src/autora/doc/example_module.py b/src/autora/doc/example_module.py
deleted file mode 100644
index f76e837..0000000
--- a/src/autora/doc/example_module.py
+++ /dev/null
@@ -1,23 +0,0 @@
-"""An example module containing simplistic functions."""
-
-
-def greetings() -> str:
-    """A friendly greeting for a future friend.
-
-    Returns
-    -------
-    str
-        A typical greeting from a software engineer.
-    """
-    return "Hello from LINCC-Frameworks!"
-
-
-def meaning() -> int:
-    """The meaning of life, the universe, and everything.
-
-    Returns
-    -------
-    int
-        The meaning of life.
-    """
-    return 42
diff --git a/src/autora/doc/pipelines/main.py b/src/autora/doc/pipelines/main.py
index 292c8ff..de7e906 100644
--- a/src/autora/doc/pipelines/main.py
+++ b/src/autora/doc/pipelines/main.py
@@ -1,12 +1,12 @@
 import logging
 from timeit import default_timer as timer
+from typing import List
 
-import jsonlines
-import mlflow
 import torch
 import typer
 
 from autora.doc.runtime.predict_hf import Predictor
+from autora.doc.runtime.prompts import INSTR, SYS, InstructionPrompts, SystemPrompts
 
 app = typer.Typer()
 logging.basicConfig(
@@ -15,21 +15,18 @@
 )
 logger = logging.getLogger(__name__)
 
-# TODO: organize the system and instruction prompts into a separate module
-SYS = """You are a technical documentation writer. You always write clear, concise, and accurate documentation for
-    scientific experiments. Your documentation focuses on the experiment's purpose, procedure, and results. Therefore,
-    details about specific python functions, packages, or libraries are not necessary. Your readers are experimental
-    scientists.
-"""
-instr = """Please generate high-level two paragraph documentation for the following experiment. The first paragraph
-    should explain the purpose and the second one the procedure, but don't use the word 'Paragraph'"""
 
+@app.command()
+def eval(data_file: str, model_path: str, sys_id: SystemPrompts, instruc_id: InstructionPrompts) -> List[str]:
+    import jsonlines
+    import mlflow
 
+    mlflow.autolog()
 
-@app.command()
-def predict(data_file: str, model_path: str) -> None:
     run = mlflow.active_run()
+    sys_prompt = SYS[sys_id]
+    instr_prompt = INSTR[instruc_id]
     if run is None:
         run = mlflow.start_run()
     with run:
@@ -37,7 +34,6 @@ def predict(data_file: str, model_path: str) -> None:
         logger.info(f"running predict with {data_file}")
         logger.info(f"model path: {model_path}")
-        # predictions = []
         with jsonlines.open(data_file) as reader:
             items = [item for item in reader]
             inputs = [item["instruction"] for item in items]
@@ -45,7 +41,7 @@ def predict(data_file: str, model_path: str) -> None:
 
         pred = Predictor(model_path)
         timer_start = timer()
-        predictions = pred.predict(SYS, instr, inputs)
+        predictions = pred.predict(sys_prompt, instr_prompt, inputs)
         timer_end = timer()
         pred_time = timer_end - timer_start
         mlflow.log_metric("prediction_time/doc", pred_time / (len(inputs)))
@@ -58,6 +54,27 @@ def predict(data_file: str, model_path: str) -> None:
         total_tokens = sum([len(token) for token in tokens])
         mlflow.log_metric("total_tokens", total_tokens)
         mlflow.log_metric("tokens/sec", total_tokens / pred_time)
+    return predictions
+
+
+@app.command()
+def generate(
+    python_file: str,
+    model_path: str = "meta-llama/llama-2-7b-chat-hf",
+    output: str = "output.txt",
+    sys_id: SystemPrompts = SystemPrompts.SYS_1,
+    instruc_id: InstructionPrompts = InstructionPrompts.INSTR_SWEETP_1,
+) -> None:
+    with open(python_file, "r") as f:
+        inputs = [f.read()]
+    sys_prompt = SYS[sys_id]
+    instr_prompt = INSTR[instruc_id]
+    pred = Predictor(model_path)
+    predictions = pred.predict(sys_prompt, instr_prompt, inputs)
+    assert len(predictions) == 1, f"Expected only one output, got {len(predictions)}"
+    logger.info(f"Writing output to {output}")
+    with open(output, "w") as f:
+        f.write(predictions[0])
 
 
 @app.command()
@@ -68,5 +85,4 @@ def import_model(model_name: str) -> None:
 
 if __name__ == "__main__":
     logger.info(f"Torch version: {torch.__version__} , Cuda available: {torch.cuda.is_available()}")
-    mlflow.autolog()
     app()
diff --git a/src/autora/doc/runtime/predict_hf.py b/src/autora/doc/runtime/predict_hf.py
index ba3e59d..23c484e 100644
--- a/src/autora/doc/runtime/predict_hf.py
+++ b/src/autora/doc/runtime/predict_hf.py
@@ -3,24 +3,22 @@
 
 import torch
 import transformers
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from autora.doc.runtime.prompts import LLAMA2_INST_CLOSE, TEMP_LLAMA2
 
 logger = logging.getLogger(__name__)
 
 
 class Predictor:
     def __init__(self, model_path: str):
-        # Load the model in 4bit quantization for faster inference on smaller GPUs
-        bnb_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_use_double_quant=True,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_compute_dtype=torch.bfloat16,
-        )
+        config = self.get_config()
+
         logger.info(f"Loading model from {model_path}")
         self.tokenizer = AutoTokenizer.from_pretrained(model_path)
         self.model = AutoModelForCausalLM.from_pretrained(
-            model_path, quantization_config=bnb_config, device_map="auto"
+            model_path,
+            **config,
         )
         logger.info("Model loaded")
         self.pipeline = transformers.pipeline(
@@ -30,18 +28,8 @@ def __init__(self, model_path: str):
         )
 
     def predict(self, sys: str, instr: str, inputs: List[str]) -> List[str]:
-        # Standard Llama2 template
-        template = f"""
-[INST]<<SYS>>
-{sys}
-
-{instr}
-
-[INPUT]
-[/INST]
-"""
         logger.info(f"Generating {len(inputs)} predictions")
-        prompts = [template.replace("[INPUT]", input) for input in inputs]
+        prompts = [TEMP_LLAMA2.format(sys=sys, instr=instr, input=input) for input in inputs]
         # TODO: Make these parameters configurable
         sequences = self.pipeline(
             prompts,
@@ -51,13 +39,38 @@ def predict(self, sys: str, instr: str, inputs: List[str]) -> List[str]:
             top_k=40,
             num_return_sequences=1,
             eos_token_id=self.tokenizer.eos_token_id,
-            max_length=1000,
+            max_length=2048,
         )
-        results = [sequence[0]["generated_text"] for sequence in sequences]
+        results = [Predictor.trim_prompt(sequence[0]["generated_text"]) for sequence in sequences]
         logger.info(f"Generated {len(results)} results")
         return results
 
+    @staticmethod
+    def trim_prompt(output: str) -> str:
+        marker = output.find(LLAMA2_INST_CLOSE)
+        if marker == -1:
+            logger.warning(f"Could not find end of prompt marker '{LLAMA2_INST_CLOSE}' in '{output}'")
+            return output
+        return output[marker + len(LLAMA2_INST_CLOSE) :]
+
     def tokenize(self, input: List[str]) -> Dict[str, List[List[int]]]:
         tokens: Dict[str, List[List[int]]] = self.tokenizer(input)
         return tokens
+
+    def get_config(self) -> Dict[str, str]:
+        if torch.cuda.is_available():
+            from transformers import BitsAndBytesConfig
+
+            # Load the model in 4bit quantization for faster inference on smaller GPUs
+            return {
+                "quantization_config": BitsAndBytesConfig(
+                    load_in_4bit=True,
+                    bnb_4bit_use_double_quant=True,
+                    bnb_4bit_quant_type="nf4",
+                    bnb_4bit_compute_dtype=torch.bfloat16,
+                ),
+                "device_map": "auto",
+            }
+        else:
+            return {}
diff --git a/src/autora/doc/runtime/prompts.py b/src/autora/doc/runtime/prompts.py
new file mode 100644
index 0000000..75019fc
--- /dev/null
+++ b/src/autora/doc/runtime/prompts.py
@@ -0,0 +1,36 @@
+from enum import Enum
+
+LLAMA2_INST_CLOSE = "[/INST]\n"
+
+# Standard Llama2 template
+TEMP_LLAMA2 = """
+[INST]<<SYS>>
+{sys}
+
+{instr}
+
+{input}
+[/INST]
+"""
+
+
+SYS_1 = """You are a technical documentation writer. You always write clear, concise, and accurate documentation for
+scientific experiments. Your documentation focuses on the experiment's purpose, procedure, and results. Therefore,
+details about specific python functions, packages, or libraries are not necessary. Your readers are experimental
+scientists.
+"""
+
+INSTR_SWEETP_1 = """Please generate high-level two paragraph documentation for the following experiment. The first
+paragraph should explain the purpose and the second one the procedure, but don't use the word 'Paragraph'"""
+
+
+class SystemPrompts(str, Enum):
+    SYS_1 = "SYS_1"
+
+
+class InstructionPrompts(str, Enum):
+    INSTR_SWEETP_1 = "INSTR_SWEETP_1"
+
+
+SYS = {SystemPrompts.SYS_1: SYS_1}
+INSTR = {InstructionPrompts.INSTR_SWEETP_1: INSTR_SWEETP_1}
diff --git a/tests/test.py b/tests/test.py
deleted file mode 100644
index a578227..0000000
--- a/tests/test.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from autora.doc import example_module
-
-
-def test_greetings() -> None:
-    """Verify the output of the `greetings` function"""
-    output = example_module.greetings()
-    assert output == "Hello from LINCC-Frameworks!"
-
-
-def test_meaning() -> None:
-    """Verify the output of the `meaning` function"""
-    output = example_module.meaning()
-    assert output == 42
diff --git a/tests/test_main.py b/tests/test_main.py
new file mode 100644
index 0000000..3e67bab
--- /dev/null
+++ b/tests/test_main.py
@@ -0,0 +1,25 @@
+from pathlib import Path
+
+from autora.doc.pipelines.main import eval, generate
+from autora.doc.runtime.prompts import InstructionPrompts, SystemPrompts
+
+# dummy HF model for testing
+TEST_HF_MODEL = "hf-internal-testing/tiny-random-FalconForCausalLM"
+
+
+def test_predict() -> None:
+    data = Path(__file__).parent.joinpath("../data/data.jsonl").resolve()
+    outputs = eval(str(data), TEST_HF_MODEL, SystemPrompts.SYS_1, InstructionPrompts.INSTR_SWEETP_1)
+    assert len(outputs) == 3, "Expected 3 outputs"
+    for output in outputs:
+        assert len(output) > 0, "Expected non-empty output"
+
+
+def test_generate() -> None:
+    python_file = __file__
+    output = Path("output.txt")
+    output.unlink(missing_ok=True)
+    generate(python_file, TEST_HF_MODEL, str(output), SystemPrompts.SYS_1, InstructionPrompts.INSTR_SWEETP_1)
+    assert output.exists(), f"Expected output file {output} to exist"
+    with open(str(output), "r") as f:
+        assert len(f.read()) > 0, f"Expected non-empty output file {output}"
diff --git a/tests/test_predict_hf.py b/tests/test_predict_hf.py
new file mode 100644
index 0000000..9ad2c47
--- /dev/null
+++ b/tests/test_predict_hf.py
@@ -0,0 +1,16 @@
+from autora.doc.runtime.predict_hf import Predictor
+
+
+def test_trim_prompt() -> None:
+    """Verify the output of the `trim_prompt` function"""
+    no_marker = "Generated text with no marker"
+    output = Predictor.trim_prompt(no_marker)
+    assert output == no_marker
+
+    with_marker = """
+The prompt is here
+[/INST]
+output
+"""
+    output = Predictor.trim_prompt(with_marker)
+    assert output == "output\n"