diff --git a/example/rlaif/llama2-70b.ipynb b/example/rlaif/llama2-70b.ipynb
new file mode 100644
index 0000000..ddbe139
--- /dev/null
+++ b/example/rlaif/llama2-70b.ipynb
@@ -0,0 +1,204 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Llama 2 70B 4-bit Inference\n",
+    "\n",
+    "## Setup\n",
+    "- EC2 `g5.12xlarge` instance (4x A10G, 96 GB total GPU memory)\n",
+    "- Deep Learning AMI GPU PyTorch 2.0.1 (Ubuntu 20.04) 20230827\n",
+    "  - `nvcc --version`: 12.1\n",
+    "- EBS: 300 GB\n",
+    "- Python: 3.10.12\n",
+    "- torch: 2.2.0.dev20230911+cu121\n",
+    "\n",
+    "## Installation\n",
+    "```\n",
+    "conda create -n 0911a python=3.10\n",
+    "source activate 0911a\n",
+    "pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121\n",
+    "pip3 install transformers accelerate bitsandbytes\n",
+    "huggingface-cli login\n",
+    "```\n",
+    "\n",
+    "## Inference script"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/conda/envs/0911a/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "from transformers import (\n",
+    "    AutoTokenizer,\n",
+    "    AutoModelForCausalLM,\n",
+    "    pipeline,\n",
+    "    BitsAndBytesConfig,\n",
+    ")\n",
+    "import torch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Downloading (…)okenizer_config.json: 100%|██████████| 776/776 [00:00<00:00, 5.38MB/s]\n",
+      "Downloading tokenizer.model: 100%|██████████| 500k/500k [00:00<00:00, 16.3MB/s]\n",
+      "Downloading (…)/main/tokenizer.json: 100%|██████████| 1.84M/1.84M [00:00<00:00, 6.84MB/s]\n",
+      "Downloading (…)cial_tokens_map.json: 100%|██████████| 414/414 [00:00<00:00, 3.37MB/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "name = \"meta-llama/Llama-2-70b-chat-hf\"\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(name)\n",
+    "tokenizer.pad_token_id = tokenizer.eos_token_id  # for open-ended generation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Downloading (…)lve/main/config.json: 100%|██████████| 614/614 [00:00<00:00, 4.70MB/s]\n",
+      "Downloading (…)fetensors.index.json: 100%|██████████| 66.7k/66.7k [00:00<00:00, 160MB/s]\n",
+      "Downloading (…)of-00015.safetensors: 100%|██████████| 9.85G/9.85G [00:43<00:00, 225MB/s]\n",
+      "Downloading (…)of-00015.safetensors: 100%|██████████| 9.80G/9.80G [00:43<00:00, 224MB/s]\n",
+      "Downloading (…)of-00015.safetensors: 100%|██████████| 9.97G/9.97G [00:43<00:00, 231MB/s]\n",
+      "Downloading (…)of-00015.safetensors: 100%|██████████| 9.80G/9.80G [00:41<00:00, 234MB/s]\n",
+      "Downloading (…)of-00015.safetensors: 100%|██████████| 9.80G/9.80G [00:41<00:00, 235MB/s]\n",
+      "Downloading (…)of-00015.safetensors: 100%|██████████| 9.80G/9.80G [00:44<00:00, 218MB/s]\n",
+      "Downloading (…)of-00015.safetensors: 100%|██████████| 9.97G/9.97G [00:43<00:00, 229MB/s]\n",
+      "Downloading (…)of-00015.safetensors: 100%|██████████| 9.80G/9.80G [01:00<00:00, 162MB/s]\n",
+      "Downloading (…)of-00015.safetensors: 100%|██████████| 9.80G/9.80G [01:14<00:00, 131MB/s]\n",
+      "Downloading (…)of-00015.safetensors: 100%|██████████| 9.80G/9.80G [01:14<00:00, 131MB/s]\n",
+      "Downloading (…)of-00015.safetensors: 100%|██████████| 9.97G/9.97G [01:16<00:00, 131MB/s]\n",
+      "Downloading (…)of-00015.safetensors: 100%|██████████| 9.80G/9.80G [01:15<00:00, 131MB/s]\n",
+      "Downloading (…)of-00015.safetensors: 100%|██████████| 9.80G/9.80G [01:14<00:00, 131MB/s]\n",
+      "Downloading (…)of-00015.safetensors: 100%|██████████| 9.50G/9.50G [01:12<00:00, 131MB/s]\n",
+      "Downloading (…)of-00015.safetensors: 100%|██████████| 524M/524M [00:03<00:00, 137MB/s]\n",
+      "Downloading shards: 100%|██████████| 15/15 [13:37<00:00, 54.48s/it]\n",
+      "Loading checkpoint shards: 100%|██████████| 15/15 [00:18<00:00, 1.21s/it]\n",
+      "Downloading (…)neration_config.json: 100%|██████████| 188/188 [00:00<00:00, 1.32MB/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "bnb_config = BitsAndBytesConfig(\n",
+    "    load_in_4bit=True,\n",
+    "    bnb_4bit_quant_type=\"nf4\",\n",
+    "    bnb_4bit_compute_dtype=torch.float16,\n",
+    "    bnb_4bit_use_double_quant=True,\n",
+    ")\n",
+    "model = AutoModelForCausalLM.from_pretrained(\n",
+    "    name,\n",
+    "    quantization_config=bnb_config,\n",
+    "    device_map=\"auto\",\n",
+    "    trust_remote_code=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers\n",
+      "pip install xformers.\n"
+     ]
+    }
+   ],
+   "source": [
+    "generation_pipe = pipeline(\n",
+    "    \"text-generation\",\n",
+    "    model=model,\n",
+    "    tokenizer=tokenizer,\n",
+    "    trust_remote_code=True,\n",
+    "    device_map=\"auto\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "who is jeff bezos?\n",
+      "Jeff Bezos is an American technology and retail entrepreneur, and the founder, chairman, and CEO of Amazon, the world's largest online retailer. He is widely recognized as one of the most successful entrepreneurs of our time, and has been named the richest person in the world by Forbes magazine for several years in a row.\n",
+      "Bezos was born in 1964 in Albuquerque, New Mexico, and grew up in Houston, Texas. He graduated from Princeton University in 1986 with a degree in electrical engineering and computer science. After working on Wall Street for several years, he left to start Amazon in 1994, initially operating the company out of his garage.\n",
+      "Under Bezos' leadership, Amazon has grown from a small online bookstore to a global retail giant, selling a wide range of products including electronics, clothing, home goods, and more. The company has also expanded into new areas such as cloud computing, advertising, and media production.\n",
+      "Bezos is known for his focus on customer satisfaction and his long-term approach to building and growing Amazon. He has also\n"
+     ]
+    }
+   ],
+   "source": [
+    "text = \"who is jeff bezos?\"  # prompt goes here\n",
+    "\n",
+    "sequences = generation_pipe(\n",
+    "    text,\n",
+    "    max_length=256,\n",
+    "    pad_token_id=tokenizer.pad_token_id,\n",
+    "    eos_token_id=tokenizer.eos_token_id,\n",
+    "    do_sample=True,\n",
+    "    top_k=10,\n",
+    "    temperature=0.4,\n",
+    "    top_p=0.9\n",
+    ")\n",
+    "\n",
+    "print(sequences[0][\"generated_text\"])"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "0911a",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
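
A possible follow-up to the last cell: `Llama-2-70b-chat-hf` was fine-tuned on the `[INST] ... [/INST]` chat template (with an optional `<<SYS>>` system prompt), so wrapping the raw question in that format tends to give better-behaved chat answers than a bare string. The sketch below reuses `generation_pipe` and `tokenizer` from the cells above; `build_llama2_chat_prompt` is a hypothetical helper, and `max_new_tokens` stands in for `max_length` so the 256-token budget applies only to the generated continuation.

```
def build_llama2_chat_prompt(user_message: str,
                             system_prompt: str = "You are a helpful assistant.") -> str:
    # Hypothetical helper: Llama-2-chat prompt template. The leading <s> BOS token
    # is added by the tokenizer, so it is omitted here.
    return f"[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{user_message} [/INST]"

prompt = build_llama2_chat_prompt("who is jeff bezos?")

sequences = generation_pipe(
    prompt,
    max_new_tokens=256,  # bounds the generated tokens only, not prompt + output
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
    do_sample=True,
    top_k=10,
    temperature=0.4,
    top_p=0.9,
)

print(sequences[0]["generated_text"])
```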