-
Notifications
You must be signed in to change notification settings - Fork 289
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(colabs): add llamaindex local models nb (#542)
- Loading branch information
1 parent
333d338
commit 4df2e4b
Showing
1 changed file
with
265 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,265 @@ | ||
{ | ||
"nbformat": 4, | ||
"nbformat_minor": 0, | ||
"metadata": { | ||
"colab": { | ||
"private_outputs": true, | ||
"provenance": [], | ||
"gpuType": "T4" | ||
}, | ||
"kernelspec": { | ||
"name": "python3", | ||
"display_name": "Python 3" | ||
}, | ||
"language_info": { | ||
"name": "python" | ||
}, | ||
"accelerator": "GPU" | ||
}, | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"**TL;DR:** Build a RAG application using llamaindex and local models (embedding + LLM), with [weave](https://wandb.github.io/weave/) for LLM observability" | ||
], | ||
"metadata": { | ||
"id": "CPP-IZhwh-m8" | ||
} | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"## 📦 Packages and Basic Setup\n", | ||
"---" | ||
], | ||
"metadata": { | ||
"id": "AWiGTYivdRo2" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "_rh_Fvi4L9aw" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"%%capture\n", | ||
"!wget https://controlroom.jurassicoutpost.com/app/uploads/2016/05/JurassicPark-Final.pdf\n", | ||
"!pip install -qU llama-index-callbacks-wandb\n", | ||
"!pip install -qU llama-index-llms-huggingface\n", | ||
"!pip install -qU llama-index-readers-file pymupdf\n", | ||
"!pip install -qU llama-index-embeddings-huggingface\n", | ||
"!pip install -qU weave ml-collections accelerate" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"import wandb\n", | ||
"import weave\n", | ||
"from llama_index.callbacks.wandb import WandbCallbackHandler\n", | ||
"\n", | ||
"wandb.login()\n", | ||
"weave.init(\"llamaindex-weave-jurassic-qna\")\n", | ||
"wandb_callback = WandbCallbackHandler(\n", | ||
" run_args={\"project\": \"llamaindex-weave-jurassic-qna\"}\n", | ||
")" | ||
], | ||
"metadata": { | ||
"id": "OXPCU-xUiVyJ" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# @title ⚙️ Configuration\n", | ||
"import ml_collections\n", | ||
"\n", | ||
"from llama_index.core import Settings\n", | ||
"\n", | ||
"\n", | ||
"def get_config() -> ml_collections.ConfigDict:\n", | ||
" config = ml_collections.ConfigDict()\n", | ||
" config.model: str = \"Writer/camel-5b-hf\" # @param {type: \"string\"}\n", | ||
" config.embedding_model: str = \"BAAI/bge-small-en-v1.5\" # @param {type: \"string\"}\n", | ||
" config.fetch_index_from_wandb: bool = True # @param {type: \"boolean\"}\n", | ||
" config.wandb_entity: str = \"sauravmaheshkar\" # @param {type: \"string\"}\n", | ||
"\n", | ||
" return config\n", | ||
"\n", | ||
"\n", | ||
"config = get_config()" | ||
], | ||
"metadata": { | ||
"cellView": "form", | ||
"id": "48tGZgo-PDTN" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"## 💿 The Dataset\n", | ||
"---\n", | ||
"\n", | ||
"In this example, we'll use the original Jurassic Park screenplay to act as our dataset." | ||
], | ||
"metadata": { | ||
"id": "s8MsVv_jdVS9" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"from llama_index.core import Document\n", | ||
"from llama_index.readers.file import PyMuPDFReader\n", | ||
"\n", | ||
"documents = PyMuPDFReader().load(\n", | ||
" file_path=\"/content/JurassicPark-Final.pdf\", metadata=True\n", | ||
")\n", | ||
"\n", | ||
"doc_text = \"\\n\\n\".join([d.get_content() for d in documents])\n", | ||
"docs = [Document(text=doc_text)]" | ||
], | ||
"metadata": { | ||
"id": "F2AtWCxVNCEF" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"## ✍️ Model Architecture & Training\n", | ||
"---\n", | ||
"\n", | ||
"Since we're using all local models in this example, we'll have to use our own embedding model and LLM. In this particular example we'll use \"`BAAI/bge-small-en-v1.5`\" as our local embedding model and \"`Writer/camel-5b-hf`\" as the local LLM." | ||
], | ||
"metadata": { | ||
"id": "T5vxYGJFdod8" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"%%capture\n", | ||
"from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n", | ||
"\n", | ||
"Settings.embed_model = HuggingFaceEmbedding(model_name=config.embedding_model)" | ||
], | ||
"metadata": { | ||
"id": "TjsSQ94rNsYd" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"%%capture\n", | ||
"import torch\n", | ||
"from llama_index.core import PromptTemplate\n", | ||
"from llama_index.llms.huggingface import HuggingFaceLLM\n", | ||
"\n", | ||
"query_wrapper_prompt = PromptTemplate(\n", | ||
" \"Below is an instruction that describes a task. \"\n", | ||
" \"Write a response that appropriately completes the request.\\n\\n\"\n", | ||
" \"### Instruction:\\n{query_str}\\n\\n### Response:\"\n", | ||
")\n", | ||
"\n", | ||
"Settings.llm = HuggingFaceLLM(\n", | ||
" context_window=2048,\n", | ||
" max_new_tokens=256,\n", | ||
" generate_kwargs={\"do_sample\": False},\n", | ||
" query_wrapper_prompt=query_wrapper_prompt,\n", | ||
" tokenizer_name=config.model,\n", | ||
" model_name=config.model,\n", | ||
" device_map=\"auto\",\n", | ||
" tokenizer_kwargs={\"max_length\": 2048},\n", | ||
" model_kwargs={\"torch_dtype\": torch.float16},\n", | ||
")" | ||
], | ||
"metadata": { | ||
"id": "8XZVcHcmP879" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"## 🗂 Creating an Index\n", | ||
"---\n", | ||
"\n", | ||
"Based on the value you set for `config.fetch_index_from_wandb` we can either create our own index, or simply download the index stored as an artifact." | ||
], | ||
"metadata": { | ||
"id": "x-UeU3n_d9jI" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"from llama_index.core import VectorStoreIndex\n", | ||
"\n", | ||
"if not config.fetch_index_from_wandb:\n", | ||
" index = VectorStoreIndex.from_documents(documents)\n", | ||
" wandb_callback.persist_index(index, index_name=\"camel-5b-hf-index\")" | ||
], | ||
"metadata": { | ||
"id": "brWxtrjuQpi5" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"from llama_index.core import load_index_from_storage\n", | ||
"\n", | ||
"if config.fetch_index_from_wandb:\n", | ||
" storage_context = wandb_callback.load_storage_context(\n", | ||
" artifact_url=\"sauravmaheshkar/llamaindex-local-models-index/camel-5b-hf-index:v0\"\n", | ||
" )\n", | ||
"\n", | ||
" # Load the index and initialize a query engine\n", | ||
" index = load_index_from_storage(\n", | ||
" storage_context,\n", | ||
" )" | ||
], | ||
"metadata": { | ||
"id": "q8Gg6xzrcf8y" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"query_engine = index.as_query_engine()\n", | ||
"response = query_engine.query(\"Are Velociraptors pack hunters ?\")\n", | ||
"print(response, sep=\"\\n\")" | ||
], | ||
"metadata": { | ||
"id": "r0nOPBNlQ5jg" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"wandb_callback.finish()" | ||
], | ||
"metadata": { | ||
"id": "acv-Uia8U0BB" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
} | ||
] | ||
} |