diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index 8d22aeb..2c21fae 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -25,7 +25,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v3
with:
- python-version: '3.11'
+ python-version: '3.12'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
diff --git a/.github/workflows/python-test.yml b/.github/workflows/python-test.yml
index 4e4ef83..bd04b22 100644
--- a/.github/workflows/python-test.yml
+++ b/.github/workflows/python-test.yml
@@ -15,10 +15,10 @@ jobs:
steps:
- uses: actions/checkout@v3
- - name: Set up Python 3.11
+ - name: Set up Python 3.12
uses: actions/setup-python@v3
with:
- python-version: "3.11"
+ python-version: "3.12"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
diff --git a/Dockerfile b/Dockerfile
index b5d47ab..5246f1b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,6 @@
FROM nvidia/cuda:12.5.1-devel-ubuntu22.04
-ARG PYTHON_VERSION=3.11
+ARG PYTHON_VERSION=3.12
ARG http_proxy
ARG https_proxy
diff --git a/Install.md b/Install.md
index 2e17f11..e88b5c9 100644
--- a/Install.md
+++ b/Install.md
@@ -65,7 +65,7 @@ MoE-PEFT: NVIDIA CUDA initialized successfully.
git clone https://github.com/TUDB-Labs/MoE-PEFT
cd moe_peft
# Optional but recommended
-conda create -n moe_peft python=3.11
+conda create -n moe_peft python=3.12
conda activate moe_peft
# Install requirements
pip3 install -r requirements.txt --upgrade
@@ -116,7 +116,7 @@ MoE-PEFT: NVIDIA CUDA initialized successfully.
git clone https://github.com/TUDB-Labs/MoE-PEFT
cd moe_peft
# Optional but recommended
-conda create -n moe_peft python=3.11
+conda create -n moe_peft python=3.12
conda activate moe_peft
# Install requirements (CUDA 12.1)
pip3 install torch==2.3.1 --index-url https://download.pytorch.org/whl/cu121
@@ -164,7 +164,7 @@ MoE-PEFT: NVIDIA CUDA initialized successfully.
git clone https://github.com/TUDB-Labs/MoE-PEFT
cd moe_peft
# Optional but recommended
-conda create -n moe_peft python=3.11
+conda create -n moe_peft python=3.12
conda activate moe_peft
# Install requirements
pip3 install -r requirements.txt --upgrade
diff --git a/generate.py b/generate.py
index ea90992..34e954f 100644
--- a/generate.py
+++ b/generate.py
@@ -56,9 +56,9 @@ def main(
)
for prompt in output[adapter_name]:
- print(f"\n{'='*10}\n")
+ print(f"\n{'=' * 10}\n")
print(prompt)
- print(f"\n{'='*10}\n")
+ print(f"\n{'=' * 10}\n")
if __name__ == "__main__":
diff --git a/misc/finetune-demo.ipynb b/misc/finetune-demo.ipynb
index 66ee3ba..80b4e96 100644
--- a/misc/finetune-demo.ipynb
+++ b/misc/finetune-demo.ipynb
@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# MoE-PEFT: An Efficient LLM Fine-Tuning Factory Optimized for MoE PEFT\n",
+ "# MoE-PEFT: An Efficient LLM Fine-Tuning Factory for Mixture of Expert (MoE) Parameter-Efficient Fine-Tuning.\n",
"[![](https://github.com/TUDB-Labs/MoE-PEFT/actions/workflows/python-test.yml/badge.svg)](https://github.com/TUDB-Labs/MoE-PEFT/actions/workflows/python-test.yml)\n",
"[![](https://img.shields.io/github/stars/TUDB-Labs/MoE-PEFT?logo=GitHub&style=flat)](https://github.com/TUDB-Labs/MoE-PEFT/stargazers)\n",
"[![](https://img.shields.io/github/v/release/TUDB-Labs/MoE-PEFT?logo=Github)](https://github.com/TUDB-Labs/MoE-PEFT/releases/latest)\n",
@@ -12,13 +12,15 @@
"[![](https://img.shields.io/docker/v/mikecovlee/moe_peft?logo=Docker&label=docker)](https://hub.docker.com/r/mikecovlee/moe_peft/tags)\n",
"[![](https://img.shields.io/github/license/TUDB-Labs/MoE-PEFT)](http://www.apache.org/licenses/LICENSE-2.0)\n",
"\n",
- "MoE-PEFT is an open-source *LLMOps* framework built on [m-LoRA](https://github.com/TUDB-Labs/mLoRA) developed by the [IDs Lab](https://ids-lab-asia.github.io) at Sichuan University. It is designed for high-throughput fine-tuning, evaluation, and inference of Large Language Models (LLMs) using techniques such as LoRA, DoRA, MixLoRA, and others. Key features of MoE-PEFT include:\n",
+ "MoE-PEFT is an open-source *LLMOps* framework built on [m-LoRA](https://github.com/TUDB-Labs/mLoRA). It is designed for high-throughput fine-tuning, evaluation, and inference of Large Language Models (LLMs) using techniques such as MoE + Others (like LoRA, DoRA). Key features of MoE-PEFT include:\n",
"\n",
- "- Concurrent fine-tuning of multiple adapters with a shared pre-trained model.\n",
+ "- Concurrent fine-tuning, evaluation, and inference of multiple adapters with a shared pre-trained model.\n",
+ "\n",
+ "- **MoE PEFT** optimization, mainly for [MixLoRA](https://github.com/TUDB-Labs/MixLoRA) and other MoLE implementation.\n",
"\n",
"- Support for multiple PEFT algorithms and various pre-trained models.\n",
"\n",
- "- MoE PEFT optimization, mainly for [MixLoRA](https://github.com/TUDB-Labs/MixLoRA).\n",
+ "- Seamless integration with the [HuggingFace](https://huggingface.co) ecosystem.\n",
"\n",
"## About this notebook\n",
"\n",
@@ -84,12 +86,18 @@
"metadata": {},
"outputs": [],
"source": [
- "lora_config = moe_peft.LoraConfig(\n",
+ "lora_config = moe_peft.adapter_factory(\n",
+ " peft_type=\"LORA\",\n",
" adapter_name=\"lora_0\",\n",
- " lora_r_=32,\n",
- " lora_alpha_=64,\n",
- " lora_dropout_=0.05,\n",
- " target_modules_={\"q_proj\": True, \"k_proj\": True, \"v_proj\": True, \"o_proj\": True},\n",
+ " r=8,\n",
+ " lora_alpha=16,\n",
+ " lora_dropout=0.05,\n",
+ " target_modules=[\n",
+ " \"q_proj\",\n",
+ " \"k_proj\",\n",
+ " \"v_proj\",\n",
+ " \"o_proj\",\n",
+ " ],\n",
")\n",
"\n",
"model.init_adapter(lora_config)\n",
@@ -149,7 +157,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.9"
+ "version": "3.12.7"
}
},
"nbformat": 4,
diff --git a/misc/inference-demo.ipynb b/misc/inference-demo.ipynb
new file mode 100644
index 0000000..b48dfc0
--- /dev/null
+++ b/misc/inference-demo.ipynb
@@ -0,0 +1,424 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# MoE-PEFT: An Efficient LLM Fine-Tuning Factory for Mixture of Expert (MoE) Parameter-Efficient Fine-Tuning.\n",
+ "[![](https://github.com/TUDB-Labs/MoE-PEFT/actions/workflows/python-test.yml/badge.svg)](https://github.com/TUDB-Labs/MoE-PEFT/actions/workflows/python-test.yml)\n",
+ "[![](https://img.shields.io/github/stars/TUDB-Labs/MoE-PEFT?logo=GitHub&style=flat)](https://github.com/TUDB-Labs/MoE-PEFT/stargazers)\n",
+ "[![](https://img.shields.io/github/v/release/TUDB-Labs/MoE-PEFT?logo=Github)](https://github.com/TUDB-Labs/MoE-PEFT/releases/latest)\n",
+ "[![](https://img.shields.io/pypi/v/moe_peft?logo=pypi)](https://pypi.org/project/moe_peft/)\n",
+ "[![](https://img.shields.io/docker/v/mikecovlee/moe_peft?logo=Docker&label=docker)](https://hub.docker.com/r/mikecovlee/moe_peft/tags)\n",
+ "[![](https://img.shields.io/github/license/TUDB-Labs/MoE-PEFT)](http://www.apache.org/licenses/LICENSE-2.0)\n",
+ "\n",
+ "MoE-PEFT is an open-source *LLMOps* framework built on [m-LoRA](https://github.com/TUDB-Labs/mLoRA). It is designed for high-throughput fine-tuning, evaluation, and inference of Large Language Models (LLMs) using techniques such as MoE + Others (like LoRA, DoRA). Key features of MoE-PEFT include:\n",
+ "\n",
+ "- Concurrent fine-tuning, evaluation, and inference of multiple adapters with a shared pre-trained model.\n",
+ "\n",
+ "- **MoE PEFT** optimization, mainly for [MixLoRA](https://github.com/TUDB-Labs/MixLoRA) and other MoLE implementation.\n",
+ "\n",
+ "- Support for multiple PEFT algorithms and various pre-trained models.\n",
+ "\n",
+ "- Seamless integration with the [HuggingFace](https://huggingface.co) ecosystem.\n",
+ "\n",
+ "## About this notebook\n",
+ "\n",
+ "This is a simple jupiter notebook for showcasing the basic process of building chatbot with TinyLLaMA."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Clone and install MoE-PEFT"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "! pip uninstall torchvision torchaudio -y\n",
+ "! pip install moe_peft"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Loading the model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "365a13c3d8654e51ad894b8459a5297c",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import torch\n",
+ "\n",
+ "import moe_peft\n",
+ "\n",
+ "base_model = \"google/gemma-2-2b-it\"\n",
+ "\n",
+ "model = moe_peft.LLMModel.from_pretrained(\n",
+ " base_model,\n",
+ " device=moe_peft.executor.default_device_name(),\n",
+ " load_dtype=torch.bfloat16,\n",
+ ")\n",
+ "tokenizer = moe_peft.Tokenizer(base_model)\n",
+ "\n",
+ "model.init_adapter(moe_peft.AdapterConfig(adapter_name=\"default\"))\n",
+ "\n",
+ "gen_config = moe_peft.GenerateConfig(adapter_name=\"default\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Build a chatbot"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from IPython.display import Markdown\n",
+ "import textwrap\n",
+ "\n",
+ "\n",
+ "def display_chat(prompt, text):\n",
+ " formatted_prompt = (\n",
+ " \"🙋♂️\" + prompt + \"
\"\n",
+ " )\n",
+ " text = text.replace(\"•\", \" *\")\n",
+ " text = textwrap.indent(text, \"> \", predicate=lambda _: True)\n",
+ " formatted_text = \"🤖\\n\\n\" + text + \"\\n\"\n",
+ " return Markdown(formatted_prompt + formatted_text)\n",
+ "\n",
+ "\n",
+ "def to_markdown(text):\n",
+ " text = text.replace(\"•\", \" *\")\n",
+ " return Markdown(textwrap.indent(text, \"> \", predicate=lambda _: True))\n",
+ "\n",
+ "\n",
+ "class ChatState:\n",
+ " \"\"\"\n",
+ " Manages the conversation history for a turn-based chatbot\n",
+ " Follows the turn-based conversation guidelines for the Gemma family of models\n",
+ " documented at https://ai.google.dev/gemma/docs/formatting\n",
+ " \"\"\"\n",
+ "\n",
+ " __START_TURN_USER__ = \"user\\n\"\n",
+ " __START_TURN_MODEL__ = \"model\\n\"\n",
+ " __END_TURN__ = \"\"\n",
+ "\n",
+ " def __init__(\n",
+ " self,\n",
+ " model: moe_peft.LLMModel,\n",
+ " tokenizer: moe_peft.Tokenizer,\n",
+ " gen_config: moe_peft.GenerateConfig,\n",
+ " system: str = \"\",\n",
+ " ):\n",
+ " \"\"\"\n",
+ " Initializes the chat state.\n",
+ "\n",
+ " Args:\n",
+ " model: The language model to use for generating responses.\n",
+ " system: (Optional) System instructions or bot description.\n",
+ " \"\"\"\n",
+ " self.model = model\n",
+ " self.tokenizer = tokenizer\n",
+ " self.gen_config = gen_config\n",
+ " self.system = system\n",
+ " self.history = []\n",
+ "\n",
+ " def add_to_history_as_user(self, message):\n",
+ " \"\"\"\n",
+ " Adds a user message to the history with start/end turn markers.\n",
+ " \"\"\"\n",
+ " self.history.append(\n",
+ " self.__START_TURN_USER__ + message + self.__END_TURN__ + \"\\n\"\n",
+ " )\n",
+ "\n",
+ " def add_to_history_as_model(self, message):\n",
+ " \"\"\"\n",
+ " Adds a model response to the history with start/end turn markers.\n",
+ " \"\"\"\n",
+ " self.history.append(self.__START_TURN_MODEL__ + message)\n",
+ "\n",
+ " def get_history(self):\n",
+ " \"\"\"\n",
+ " Returns the entire chat history as a single string.\n",
+ " \"\"\"\n",
+ " return \"\".join([*self.history])\n",
+ "\n",
+ " def get_full_prompt(self):\n",
+ " \"\"\"\n",
+ " Builds the prompt for the language model, including history and system description.\n",
+ " \"\"\"\n",
+ " prompt = self.get_history() + self.__START_TURN_MODEL__\n",
+ " if len(self.system) > 0:\n",
+ " prompt = self.system + \"\\n\" + prompt\n",
+ " return prompt\n",
+ "\n",
+ " def send_message(self, message):\n",
+ " \"\"\"\n",
+ " Handles sending a user message and getting a model response.\n",
+ "\n",
+ " Args:\n",
+ " message: The user's message.\n",
+ "\n",
+ " Returns:\n",
+ " The model's response.\n",
+ " \"\"\"\n",
+ " self.add_to_history_as_user(message)\n",
+ " prompt = self.get_full_prompt()\n",
+ " self.gen_config.prompts = [prompt]\n",
+ " response = moe_peft.generate(\n",
+ " self.model, self.tokenizer, [self.gen_config], max_gen_len=2048\n",
+ " )[self.gen_config.adapter_name][0]\n",
+ " result = response.replace(prompt, \"\").replace(\n",
+ " self.__END_TURN__, \"\"\n",
+ " ) # Extract only the new response\n",
+ " self.add_to_history_as_model(result)\n",
+ " return result"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Chat with the model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/markdown": [
+       "🙋♂️Tell me, in a few words, how to compute all prime numbers up to 1000?\n",
+       "\n",
+       "🤖\n",
+ "\n",
+ "> Sieve of Eratosthenes.\n",
+ "> \n",
+ "> **Explanation:** This is an efficient algorithm for finding prime numbers. \n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+       "<IPython.core.display.Markdown object>"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "chat = ChatState(model, tokenizer, gen_config)\n",
+ "message = \"Tell me, in a few words, how to compute all prime numbers up to 1000?\"\n",
+ "display_chat(message, chat.send_message(message))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/markdown": [
+       "🙋♂️Now in Python! No numpy, please!\n",
+       "\n",
+       "🤖\n",
+ "\n",
+ "> ```python\n",
+       "> def sieve_of_eratosthenes(n):\n",
+       ">     \"\"\"Returns a list of prime numbers up to n.\"\"\"\n",
+       ">     primes = [True] * (n + 1)\n",
+       ">     primes[0] = primes[1] = False\n",
+       "> \n",
+       ">     for i in range(2, int(n ** 0.5) + 1):\n",
+       ">         if primes[i]:\n",
+       ">             for j in range(i*i, n + 1, i):\n",
+       ">                 primes[j] = False\n",
+       "> \n",
+       ">     return [i for i, is_prime in enumerate(primes) if is_prime]\n",
+       "> \n",
+       "> print(sieve_of_eratosthenes(1000))\n",
+ "> ```\n",
+ "> \n",
+ "> \n",
+ "> **Explanation:**\n",
+ "> \n",
+ "> 1. **Initialization:** A boolean list `primes` is created with size `n+1`, representing potential primes from 0 to n. Initially, both 0 and 1 are marked as non-primes.\n",
+ "> 2. **Iteration:** The loop starts from 2 up to the square root of `n`. We only need to check divisors up to the square root because any composite number has a prime factor less than or equal to its square root. \n",
+ "> 3. **Marking Non-Primes:** For each prime `i`, its multiples starting from `i*i` are marked as non-primes in the `primes` array.\n",
+ "> 4. **Returning Primes:** Finally, we create a new list by filtering the `primes` array using list comprehension, keeping only those indices where `primes[i]` is True, indicating that the corresponding index corresponds to a prime number. \n",
+ "> \n",
+ "> \n",
+ "> \n",
+ "> Let me know if you have any further questions or need additional explanations.\n",
+ ""
+ ],
+ "text/plain": [
+       "<IPython.core.display.Markdown object>"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "message = \"Now in Python! No numpy, please!\"\n",
+ "display_chat(message, chat.send_message(message))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/markdown": [
+       "🙋♂️Thank you, it works! Can you explain the code in French?\n",
+       "\n",
+       "🤖\n",
+ "\n",
+ "> La fonction `sieve_of_eratosthenes(n)` renvoie une liste de nombres premiers jusqu'à `n`.\n",
+ "> \n",
+ "> ### Explication:\n",
+ "> La fonction utilise la méthode de Sieve d’Erathostène pour trouver les nombres premiers. Voici comment ça marche: \n",
+ "> \n",
+ "> 1. **Initialisation**: \n",
+ "> - On crée une liste booléenne `primes` de taille `n+1` représentant des nombres potentiels qui sont premiers. Les éléments initiaux sont tous définis comme `True`. Nous assignons les valeurs `False` à 0 et 1 car ils ne sont pas premiers.\n",
+ "> \n",
+ "> 2. **Itération**: \n",
+ "> - On commence par le nombre 2 jusqu'au carré root de `n` (inclusif). Cela signifie que nous allons vérifier seulement les diviseurs potentiels jusqu'à la racine carrée de `n`. \n",
+ "> 3. **Marquer les non-premiers**: \n",
+ "> - Pour chaque nombre premier `i`, on vérifie ses multiples (commençant par `i*i`) en plaçant les valeurs de `primes[j]` à `False`.\n",
+ "> \n",
+ "> \n",
+ "> 4. **Retourner les nombres premiers**: \n",
+ "> - Enfin, un nouvel array est construit en filtrant la liste `primes` en utilisant la syntaxe de liste comprehension. On sélectionne uniquement les indices correspondant à des nombres premiers où `primes[i]` est `True`. \n",
+ "> \n",
+ "> \n",
+ "> \n",
+ "> \n",
+ "> \n",
+ "> \n",
+ "> \n",
+ "> J'espère que cela vous aide à comprendre la fonction plus clairement! Si vous avez des questions supplémentaires, n'hésitez pas à les poser. \n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+       "<IPython.core.display.Markdown object>"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "message = \"Thank you, it works! Can you explain the code in French?\"\n",
+ "display_chat(message, chat.send_message(message))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/markdown": [
+       "🙋♂️Great! Now add those explanations as comments in the code.\n",
+       "\n",
+       "🤖\n",
+ "\n",
+ "> ```python\n",
+ "> def sieve_of_eratosthenes(n):\n",
+       ">     \"\"\"Returns a list of prime numbers up to n.\"\"\"\n",
+       ">     #Initialize a boolean list 'primes' with size n+1 to represent all possible prime numbers.\n",
+       ">     primes = [True] * (n + 1)\n",
+       ">     primes[0] = primes[1] = False  # 0 and 1 are not primes.\n",
+       "> \n",
+       ">     # Iterate through the integers from 2 up to the square root of n.\n",
+       ">     for i in range(2, int(n ** 0.5) + 1):\n",
+       ">         if primes[i]:\n",
+       ">             # If the current integer 'i' is prime\n",
+       ">             # Mark all multiples of 'i' as non-prime by setting their corresponding value in 'primes' to False\n",
+       ">             for j in range(i * i, n + 1, i):\n",
+       ">                 primes[j] = False\n",
+       "> \n",
+       ">     #Filter the 'primes' list to obtain a list of prime numbers\n",
+       ">     return [i for i, is_prime in enumerate(primes) if is_prime]\n",
+ "> \n",
+ "> print(sieve_of_eratosthenes(1000))\n",
+ "> \n",
+ "> ```\n",
+ "> \n",
+ "> \n",
+ "> \n",
+ "> \n",
+ "> Let me know if you would like further details or clarifications! 😄 \n",
+ ""
+ ],
+ "text/plain": [
+       "<IPython.core.display.Markdown object>"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "message = \"Great! Now add those explanations as comments in the code.\"\n",
+ "display_chat(message, chat.send_message(message))"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "moe_peft",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
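Reviewer note: the `ChatState` cell above assembles prompts with Gemma's documented turn markers (https://ai.google.dev/gemma/docs/formatting). A minimal, self-contained sketch of the string `get_full_prompt` produces after one user turn — illustrative only, using the same control tokens:

```python
# Hypothetical illustration of the prompt layout ChatState builds.
START_USER = "<start_of_turn>user\n"
START_MODEL = "<start_of_turn>model\n"
END_TURN = "<end_of_turn>"

history = [START_USER + "Tell me a joke." + END_TURN + "\n"]
prompt = "".join(history) + START_MODEL
print(prompt)
# <start_of_turn>user
# Tell me a joke.<end_of_turn>
# <start_of_turn>model
```

The model's reply is then stripped of the prompt prefix and the `<end_of_turn>` token before being appended back to the history, as `send_message` does above.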
diff --git a/misc/mixlora-demo.ipynb b/misc/mixlora-demo.ipynb
new file mode 100644
index 0000000..b828021
--- /dev/null
+++ b/misc/mixlora-demo.ipynb
@@ -0,0 +1,166 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# MoE-PEFT: An Efficient LLM Fine-Tuning Factory for Mixture of Expert (MoE) Parameter-Efficient Fine-Tuning.\n",
+ "[![](https://github.com/TUDB-Labs/MoE-PEFT/actions/workflows/python-test.yml/badge.svg)](https://github.com/TUDB-Labs/MoE-PEFT/actions/workflows/python-test.yml)\n",
+ "[![](https://img.shields.io/github/stars/TUDB-Labs/MoE-PEFT?logo=GitHub&style=flat)](https://github.com/TUDB-Labs/MoE-PEFT/stargazers)\n",
+ "[![](https://img.shields.io/github/v/release/TUDB-Labs/MoE-PEFT?logo=Github)](https://github.com/TUDB-Labs/MoE-PEFT/releases/latest)\n",
+ "[![](https://img.shields.io/pypi/v/moe_peft?logo=pypi)](https://pypi.org/project/moe_peft/)\n",
+ "[![](https://img.shields.io/docker/v/mikecovlee/moe_peft?logo=Docker&label=docker)](https://hub.docker.com/r/mikecovlee/moe_peft/tags)\n",
+ "[![](https://img.shields.io/github/license/TUDB-Labs/MoE-PEFT)](http://www.apache.org/licenses/LICENSE-2.0)\n",
+ "\n",
+ "MoE-PEFT is an open-source *LLMOps* framework built on [m-LoRA](https://github.com/TUDB-Labs/mLoRA). It is designed for high-throughput fine-tuning, evaluation, and inference of Large Language Models (LLMs) using techniques such as MoE + Others (like LoRA, DoRA). Key features of MoE-PEFT include:\n",
+ "\n",
+ "- Concurrent fine-tuning, evaluation, and inference of multiple adapters with a shared pre-trained model.\n",
+ "\n",
+ "- **MoE PEFT** optimization, mainly for [MixLoRA](https://github.com/TUDB-Labs/MixLoRA) and other MoLE implementation.\n",
+ "\n",
+ "- Support for multiple PEFT algorithms and various pre-trained models.\n",
+ "\n",
+ "- Seamless integration with the [HuggingFace](https://huggingface.co) ecosystem.\n",
+ "\n",
+ "## About this notebook\n",
+ "\n",
+ "This is a simple jupiter notebook for showcasing the basic process of building MixLoRA MoE model from TinyLLaMA by fine-tuning with dummy data."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Clone and install MoE-PEFT"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "! pip uninstall torchvision torchaudio -y\n",
+ "! pip install moe_peft"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Loading the base model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "\n",
+ "import moe_peft\n",
+ "\n",
+ "moe_peft.setup_logging(\"INFO\")\n",
+ "\n",
+ "base_model = \"TinyLlama/TinyLlama_v1.1\"\n",
+ "\n",
+ "model = moe_peft.LLMModel.from_pretrained(\n",
+ " base_model,\n",
+ " device=moe_peft.executor.default_device_name(),\n",
+ " load_dtype=torch.bfloat16,\n",
+ ")\n",
+ "tokenizer = moe_peft.Tokenizer(base_model)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Training a dummy LoRA adapter"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lora_config = moe_peft.adapter_factory(\n",
+ " peft_type=\"MIXLORA\",\n",
+ " adapter_name=\"mixlora_0\",\n",
+ " r=8,\n",
+ " lora_alpha=16,\n",
+ " lora_dropout=0.05,\n",
+ " target_modules=[\n",
+ " \"up_proj\",\n",
+ " \"down_proj\",\n",
+ " \"gate_proj\",\n",
+ " ],\n",
+ " routing_strategy=\"mixlora\",\n",
+ " num_experts=6,\n",
+ ")\n",
+ "\n",
+ "model.init_adapter(lora_config)\n",
+ "\n",
+ "train_config = moe_peft.TrainConfig(\n",
+ " adapter_name=\"mixlora_0\",\n",
+ " data_path=\"TUDB-Labs/Dummy-MoE-PEFT\",\n",
+ " num_epochs=10,\n",
+ " batch_size=16,\n",
+ " micro_batch_size=8,\n",
+ " learning_rate=1e-4,\n",
+ ")\n",
+ "\n",
+ "moe_peft.train(model=model, tokenizer=tokenizer, configs=[train_config])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Validate the effectiveness of LoRA adapter"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "generate_config = moe_peft.GenerateConfig(\n",
+ " adapter_name=\"mixlora_0\",\n",
+ " prompts=[\"Could you provide an introduction to MoE-PEFT?\"],\n",
+ " stop_token=\"\\n\",\n",
+ ")\n",
+ "\n",
+ "output = moe_peft.generate(\n",
+ " model=model, tokenizer=tokenizer, configs=[generate_config], max_gen_len=128\n",
+ ")\n",
+ "\n",
+ "print(output[\"mixlora_0\"][0])"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "moe_peft",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
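Reviewer note: the notebook above trains with `routing_strategy="mixlora"` and `num_experts=6`, i.e. six LoRA experts per targeted FFN projection. To sanity-check the tuned adapter against the bare base model, the same pattern as `tests/dummy_train_mixlora.py` (added later in this diff) batches both through a single `generate` call — a condensed sketch reusing the notebook's `model` and `tokenizer`:

```python
# Register a pass-through "default" adapter so the base model can be queried,
# then generate with both adapters in one batch (per tests/dummy_train_mixlora.py).
model.init_adapter(moe_peft.AdapterConfig(adapter_name="default"))

prompt = "Could you provide an introduction to MoE-PEFT?"
configs = [
    moe_peft.GenerateConfig(adapter_name="mixlora_0", prompts=[prompt], stop_token="\n"),
    moe_peft.GenerateConfig(adapter_name="default", prompts=[prompt], stop_token="\n"),
]
outputs = moe_peft.generate(
    model=model, tokenizer=tokenizer, configs=configs, max_gen_len=128
)
for name, texts in outputs.items():
    print(f"{name}: {texts[0]}")
```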
diff --git a/moe_peft.py b/moe_peft.py
index 1f08794..a70f7eb 100644
--- a/moe_peft.py
+++ b/moe_peft.py
@@ -212,12 +212,12 @@ def inference(
cache_implementation=args.cache_implementation,
stream_callback=callback,
)
- print(f"\n{'='*10}\n")
+ print(f"\n{'=' * 10}\n")
print(f"PROMPT: {input_raw}")
for adapter_name, output in outputs.items():
print(f"{adapter_name} OUTPUT:")
print(output[0])
- print(f"\n{'='*10}\n")
+ print(f"\n{'=' * 10}\n")
# Main Function
diff --git a/moe_peft/__init__.py b/moe_peft/__init__.py
index a659493..ab8e116 100644
--- a/moe_peft/__init__.py
+++ b/moe_peft/__init__.py
@@ -1,3 +1,4 @@
+from .adapters import adapter_factory
from .common import (
AdapterConfig,
LLMBatchConfig,
@@ -36,6 +37,7 @@
"LLMModelInput",
"AdapterConfig",
"LoraConfig",
+ "adapter_factory",
"TrainTask",
"Dispatcher",
"EvaluateConfig",
diff --git a/moe_peft/adapters/__init__.py b/moe_peft/adapters/__init__.py
index 7c21615..8fa73cf 100644
--- a/moe_peft/adapters/__init__.py
+++ b/moe_peft/adapters/__init__.py
@@ -62,6 +62,13 @@ def lora_config_factory(config: Dict[str, any]) -> LoraConfig:
return config_class.from_config(config).check()
+def adapter_factory(peft_type: str, adapter_name: str, **kwargs) -> LoraConfig:
+ kwargs["peft_type"] = peft_type
+ config = lora_config_factory(kwargs)
+ config.adapter_name = adapter_name
+ return config
+
+
def router_loss_factory(config: MixLoraConfig) -> torch.nn.Module:
if config.routing_strategy_ not in router_loss_dict:
return None
@@ -101,4 +108,5 @@ def moe_layer_factory(
"lora_config_factory",
"router_loss_factory",
"moe_layer_factory",
+ "adapter_factory",
]
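Reviewer note: `adapter_factory` is a thin wrapper — it injects `peft_type` into the keyword arguments, delegates to `lora_config_factory`, and stamps `adapter_name` onto the resulting config. A sketch of the equivalence, assuming the kwargs used elsewhere in this diff:

```python
import moe_peft
from moe_peft.adapters import lora_config_factory

# Public entry point added by this diff:
config_a = moe_peft.adapter_factory(
    peft_type="LORA",
    adapter_name="lora_0",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

# ...which is shorthand for the lower-level factory call:
kwargs = {
    "peft_type": "LORA",
    "r": 8,
    "lora_alpha": 16,
    "lora_dropout": 0.05,
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
}
config_b = lora_config_factory(kwargs)
config_b.adapter_name = "lora_0"
```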
diff --git a/moe_peft/evaluator.py b/moe_peft/evaluator.py
index 369bfcc..07a0b0c 100644
--- a/moe_peft/evaluator.py
+++ b/moe_peft/evaluator.py
@@ -184,7 +184,7 @@ def _compute_metrcis(model, current_configs, sequence_lengths, batch_labels, out
router_statistic_[idx] += val
for idx, val in enumerate(router_statistic_):
logging.info(
- f"{config.adapter_name}: expert {idx}, load = {val/32}"
+ f"{config.adapter_name}: expert {idx}, load = {val / 32}"
)
batch_size = logits.shape[0]
diff --git a/pyproject.toml b/pyproject.toml
index 5b929f2..7c21987 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "moe_peft"
-version = "2.0.0"
+version = "2.0.1"
description = "An Efficient LLM Fine-Tuning Factory Optimized for MoE PEFT"
readme = "README.md"
requires-python = ">=3.8"
@@ -14,11 +14,11 @@ classifiers = [
"Operating System :: OS Independent",
]
dependencies = [
- "torch>=2.3.0,<2.5.0",
+ "torch>=2.4.0,<2.6.0",
"datasets",
"evaluate",
"accelerate",
- "transformers>=4.44.0,<4.46.0",
+ "transformers>=4.44.0,<4.47.0",
"sentencepiece",
"huggingface_hub",
"scikit-learn",
diff --git a/requirements.txt b/requirements.txt
index 88c514b..97dfa58 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,14 +1,14 @@
-torch>=2.3.0,<2.5.0
+torch>=2.4.0,<2.6.0
datasets
evaluate
accelerate
-transformers>=4.44.0,<4.46.0
+transformers>=4.44.0,<4.47.0
sentencepiece
huggingface_hub
scikit-learn
tiktoken
mixlora>=0.2.2,<0.3.0
-gradio==4.38.1
+gradio
peft==0.11.1
pandas
fire
diff --git a/tests/dummy_train.py b/tests/dummy_train.py
index bb6b1b8..9900064 100644
--- a/tests/dummy_train.py
+++ b/tests/dummy_train.py
@@ -5,10 +5,11 @@
def main(
- base_model: str,
+ base_model: str = "TinyLlama/TinyLlama_v1.1",
adapter_name: str = "lora_0",
train_data: str = "TUDB-Labs/Dummy-MoE-PEFT",
test_prompt: str = "Could you provide an introduction to MoE-PEFT?",
+ save_path: str = None,
):
moe_peft.setup_logging("INFO")
@@ -19,17 +20,18 @@ def main(
)
tokenizer = moe_peft.Tokenizer(base_model)
- lora_config = moe_peft.LoraConfig(
+ lora_config = moe_peft.adapter_factory(
+ peft_type="LORA",
adapter_name=adapter_name,
- lora_r_=32,
- lora_alpha_=64,
- lora_dropout_=0.05,
- target_modules_={
- "q_proj": True,
- "k_proj": True,
- "v_proj": True,
- "o_proj": True,
- },
+ r=8,
+ lora_alpha=16,
+ lora_dropout=0.05,
+ target_modules=[
+ "q_proj",
+ "k_proj",
+ "v_proj",
+ "o_proj",
+ ],
)
train_config = moe_peft.TrainConfig(
@@ -44,6 +46,10 @@ def main(
with moe_peft.executors.no_cache():
model.init_adapter(lora_config)
moe_peft.train(model=model, tokenizer=tokenizer, configs=[train_config])
+ if save_path:
+ moe_peft.trainer.save_adapter_weight(
+ model=model, config=train_config, path=save_path
+ )
lora_config, lora_weight = model.unload_adapter(adapter_name)
generate_configs = [
@@ -69,12 +75,12 @@ def main(
max_gen_len=128,
)
- print(f"\n{'='*10}\n")
+ print(f"\n{'=' * 10}\n")
print(f"PROMPT: {test_prompt}\n")
for adapter_name, output in outputs.items():
print(f"{adapter_name} OUTPUT:")
print(f"{output[0]}\n")
- print(f"\n{'='*10}\n")
+ print(f"\n{'=' * 10}\n")
if __name__ == "__main__":
diff --git a/tests/dummy_train_mixlora.py b/tests/dummy_train_mixlora.py
new file mode 100644
index 0000000..57b80f0
--- /dev/null
+++ b/tests/dummy_train_mixlora.py
@@ -0,0 +1,89 @@
+import fire
+import torch
+
+import moe_peft
+import moe_peft.adapters
+
+
+def main(
+ base_model: str = "TinyLlama/TinyLlama_v1.1",
+ adapter_name: str = "mixlora_0",
+ train_data: str = "TUDB-Labs/Dummy-MoE-PEFT",
+ test_prompt: str = "Could you provide an introduction to MoE-PEFT?",
+ save_path: str = None,
+):
+ moe_peft.setup_logging("INFO")
+
+ model: moe_peft.LLMModel = moe_peft.LLMModel.from_pretrained(
+ base_model,
+ device=moe_peft.executor.default_device_name(),
+ load_dtype=torch.bfloat16,
+ )
+ tokenizer = moe_peft.Tokenizer(base_model)
+
+ lora_config = moe_peft.adapter_factory(
+ peft_type="MIXLORA",
+ adapter_name=adapter_name,
+ r=8,
+ lora_alpha=16,
+ lora_dropout=0.05,
+ target_modules=[
+ "up_proj",
+ "down_proj",
+ "gate_proj",
+ ],
+ routing_strategy="mixlora",
+ num_experts=6,
+ )
+
+ train_config = moe_peft.TrainConfig(
+ adapter_name=adapter_name,
+ data_path=train_data,
+ num_epochs=10,
+ batch_size=16,
+ micro_batch_size=8,
+ learning_rate=1e-4,
+ )
+
+ with moe_peft.executors.no_cache():
+ model.init_adapter(lora_config)
+ moe_peft.train(model=model, tokenizer=tokenizer, configs=[train_config])
+ if save_path:
+ moe_peft.trainer.save_adapter_weight(
+ model=model, config=train_config, path=save_path
+ )
+ lora_config, lora_weight = model.unload_adapter(adapter_name)
+
+ generate_configs = [
+ moe_peft.GenerateConfig(
+ adapter_name=adapter_name,
+ prompts=[test_prompt],
+ stop_token="\n",
+ ),
+ moe_peft.GenerateConfig(
+ adapter_name="default",
+ prompts=[test_prompt],
+ stop_token="\n",
+ ),
+ ]
+
+ with moe_peft.executors.no_cache():
+ model.init_adapter(lora_config, lora_weight)
+ model.init_adapter(moe_peft.AdapterConfig(adapter_name="default"))
+ outputs = moe_peft.generate(
+ model=model,
+ tokenizer=tokenizer,
+ configs=generate_configs,
+ max_gen_len=128,
+ )
+
+ print(f"\n{'=' * 10}\n")
+ print(f"PROMPT: {test_prompt}\n")
+ for adapter_name, output in outputs.items():
+ print(f"{adapter_name} OUTPUT:")
+ print(f"{output[0]}\n")
+ print(f"\n{'=' * 10}\n")
+
+
+if __name__ == "__main__":
+ fire.Fire(main)