diff --git a/colabs/peft/llama_img.png b/colabs/peft/llama_img.png
new file mode 100644
index 00000000..868e5c4b
Binary files /dev/null and b/colabs/peft/llama_img.png differ
diff --git a/colabs/peft/llama_token_cls.ipynb b/colabs/peft/llama_token_cls.ipynb
new file mode 100644
index 00000000..55d68a90
--- /dev/null
+++ b/colabs/peft/llama_token_cls.ipynb
@@ -0,0 +1,445 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\"Open\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "![](llama_img.png)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 📦 Packages and Basic Setup\n",
+    "---\n",
+    "\n",
+    "To run this notebook you'll need two secrets named `W&B` and `HF_TOKEN`. Also, in the configuration section, change `wandb_entity` to your W&B username or team."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%capture\n",
+    "!pip install -q -U bitsandbytes datasets evaluate ml-collections seqeval wandb\n",
+    "!pip install -q git+https://github.com/huggingface/peft.git"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import evaluate\n",
+    "import numpy as np\n",
+    "from transformers import AutoTokenizer\n",
+    "from datasets import ClassLabel, load_dataset\n",
+    "from transformers import TrainingArguments, Trainer\n",
+    "from peft import get_peft_model, LoraConfig, TaskType\n",
+    "from transformers import DataCollatorForTokenClassification"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import wandb\n",
+    "wandb.login()"
+   ]
+  },
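+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The training step further below pushes checkpoints to the Hugging Face Hub (`push_to_hub=True`), so the `HF_TOKEN` secret also has to be turned into an actual login. The next cell is a minimal sketch that assumes you're running in Colab and named the secrets exactly `W&B` and `HF_TOKEN` as described above; skip it if you prefer the interactive prompts."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# A sketch: read the Colab secrets created above and authenticate non-interactively.\n",
+    "# Assumes Colab secrets named exactly \"W&B\" and \"HF_TOKEN\".\n",
+    "from google.colab import userdata  # Colab-only secrets helper\n",
+    "from huggingface_hub import login\n",
+    "\n",
+    "login(token=userdata.get(\"HF_TOKEN\"))  # needed later because push_to_hub=True\n",
+    "\n",
+    "# The W&B key can be wired in the same way instead of the prompt above:\n",
+    "# wandb.login(key=userdata.get(\"W&B\"))"
+   ]
+  },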
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# @title ⚙️ Configuration\n",
+    "\n",
+    "import ml_collections\n",
+    "\n",
+    "def get_config() -> ml_collections.ConfigDict:\n",
+    "    config = ml_collections.ConfigDict()\n",
+    "    config.model = \"unsloth/llama-2-7b-bnb-4bit\" # @param {type: \"string\"}\n",
+    "    config.lora_r = 4 # @param {type: \"number\"}\n",
+    "    config.lora_alpha = 32 # @param {type: \"number\"}\n",
+    "    config.lora_dropout = 0.1 # @param {type: \"number\"}\n",
+    "    config.max_length = 32 # @param {type: \"number\"}\n",
+    "    config.batch_size = 16 # @param {type: \"number\"}\n",
+    "    config.num_epochs = 5 # @param {type: \"number\"}\n",
+    "    config.learning_rate = 1e-3 # @param {type: \"number\"}\n",
+    "    config.dataset = \"conll2003\" # @param {type: \"string\"}\n",
+    "    config.wandb_entity = None # @param {type: \"string\"}\n",
+    "    return config\n",
+    "\n",
+    "config = get_config()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import wandb\n",
+    "\n",
+    "wandb.init(\n",
+    "    project=\"Llama-NER\",\n",
+    "    job_type=\"train\",\n",
+    "    group=config.model,\n",
+    "    config=config.to_dict(),\n",
+    "    entity=config.wandb_entity,\n",
+    ")\n",
+    "\n",
+    "os.environ[\"WANDB_WATCH\"] = \"false\"\n",
+    "os.environ[\"WANDB_LOG_MODEL\"] = \"false\"\n",
+    "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 💿 The Dataset\n",
+    "---"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%capture\n",
+    "ds = load_dataset(\n",
+    "    config.dataset,\n",
+    "    cache_dir=\"/cache/\",\n",
+    ")\n",
+    "\n",
+    "seqeval = evaluate.load(\"seqeval\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "column_names = ds[\"train\"].column_names\n",
+    "features = ds[\"train\"].features\n",
+    "\n",
+    "text_column_name = \"tokens\"\n",
+    "label_column_name = \"ner_tags\"\n",
+    "\n",
+    "label_list = features[label_column_name].feature.names\n",
+    "label2id = {label: i for i, label in enumerate(label_list)}\n",
+    "id2label = {i: label for i, label in enumerate(label_list)}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 🖖 Utility Functions\n",
+    "---"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def compute_metrics(p):\n",
+    "    predictions, labels = p\n",
+    "    predictions = np.argmax(predictions, axis=2)\n",
+    "\n",
+    "    true_predictions = [\n",
+    "        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]\n",
+    "        for prediction, label in zip(predictions, labels)\n",
+    "    ]\n",
+    "    true_labels = [\n",
+    "        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]\n",
+    "        for prediction, label in zip(predictions, labels)\n",
+    "    ]\n",
+    "\n",
+    "    results = seqeval.compute(predictions=true_predictions, references=true_labels)\n",
+    "    return {\n",
+    "        \"precision\": results[\"overall_precision\"],\n",
+    "        \"recall\": results[\"overall_recall\"],\n",
+    "        \"f1\": results[\"overall_f1\"],\n",
+    "        \"accuracy\": results[\"overall_accuracy\"],\n",
+    "    }"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 🏠 Model Architecture\n",
+    "---"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Implementing `LlamaForTokenClassification`\n",
+    "\n",
+    "[Source: @KoichiYasuoka](https://github.com/huggingface/transformers/issues/26521#issuecomment-1868284434)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%capture\n",
+    "from typing import List, Optional, Tuple, Union\n",
+    "import torch\n",
+    "from torch import nn\n",
+    "from transformers.modeling_outputs import TokenClassifierOutput\n",
+    "from transformers.file_utils import add_start_docstrings_to_model_forward\n",
+    "from transformers.models.llama.modeling_llama import LlamaModel, LlamaPreTrainedModel, LLAMA_INPUTS_DOCSTRING\n",
+    "\n",
+    "class LlamaForTokenClassification(LlamaPreTrainedModel):\n",
+    "    def __init__(self, config):\n",
+    "        super().__init__(config)\n",
+    "        self.num_labels = config.num_labels\n",
+    "        self.model = LlamaModel(config)\n",
+    "        if hasattr(config, \"classifier_dropout\") and config.classifier_dropout is not None:\n",
+    "            classifier_dropout = config.classifier_dropout\n",
+    "        elif hasattr(config, \"hidden_dropout\") and config.hidden_dropout is not None:\n",
+    "            classifier_dropout = config.hidden_dropout\n",
+    "        else:\n",
+    "            classifier_dropout = 0.1\n",
+    "        self.dropout = nn.Dropout(classifier_dropout)\n",
+    "        self.classifier = nn.Linear(config.hidden_size, config.num_labels)\n",
+    "\n",
+    "        # Initialize weights and apply final processing\n",
+    "        self.post_init()\n",
+    "\n",
+    "    def get_input_embeddings(self):\n",
+    "        return self.model.embed_tokens\n",
+    "\n",
+    "    def set_input_embeddings(self, value):\n",
+    "        self.model.embed_tokens = value\n",
+    "\n",
+    "    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)\n",
+    "    def forward(\n",
+    "        self,\n",
+    "        input_ids: Optional[torch.LongTensor] = None,\n",
+    "        attention_mask: Optional[torch.Tensor] = None,\n",
+    "        position_ids: Optional[torch.LongTensor] = None,\n",
+    "        past_key_values: Optional[List[torch.FloatTensor]] = None,\n",
+    "        inputs_embeds: Optional[torch.FloatTensor] = None,\n",
+    "        labels: Optional[torch.LongTensor] = None,\n",
+    "        use_cache: Optional[bool] = None,\n",
+    "        output_attentions: Optional[bool] = None,\n",
+    "        output_hidden_states: Optional[bool] = None,\n",
+    "        return_dict: Optional[bool] = None,\n",
+    "    ) -> Union[Tuple, TokenClassifierOutput]:\n",
+    "\n",
+    "        return_dict = return_dict if return_dict is not None else self.config.use_return_dict\n",
+    "\n",
+    "        transformer_outputs = self.model(\n",
+    "            input_ids,\n",
+    "            attention_mask=attention_mask,\n",
+    "            position_ids=position_ids,\n",
+    "            past_key_values=past_key_values,\n",
+    "            inputs_embeds=inputs_embeds,\n",
+    "            use_cache=use_cache,\n",
+    "            output_attentions=output_attentions,\n",
+    "            output_hidden_states=output_hidden_states,\n",
+    "            return_dict=return_dict,\n",
+    "        )\n",
+    "\n",
+    "        hidden_states = transformer_outputs[0]\n",
+    "        hidden_states = self.dropout(hidden_states)\n",
+    "        logits = self.classifier(hidden_states)\n",
+    "\n",
+    "        loss = None\n",
+    "        if labels is not None:\n",
+    "            labels = labels.to(logits.device)\n",
+    "            loss_fct = nn.CrossEntropyLoss()\n",
+    "            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))\n",
+    "\n",
+    "        if not return_dict:\n",
+    "            output = (logits,) + transformer_outputs[2:]\n",
+    "            return ((loss,) + output) if loss is not None else output\n",
+    "\n",
+    "        return TokenClassifierOutput(\n",
+    "            loss=loss,\n",
+    "            logits=logits,\n",
+    "            hidden_states=transformer_outputs.hidden_states,\n",
+    "            attentions=transformer_outputs.attentions\n",
+    "        )\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(config.model)\n",
+    "\n",
+    "model = LlamaForTokenClassification.from_pretrained(\n",
+    "    config.model,\n",
+    "    num_labels=len(label_list),\n",
+    "    id2label=id2label,\n",
+    "    label2id=label2id,\n",
+    "    cache_dir=\"/cache/\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Process Dataset for Token Classification"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def tokenize_and_align_labels(examples):\n",
+    "    tokenized_inputs = tokenizer(examples[\"tokens\"], is_split_into_words=True, padding='longest', max_length=config.max_length, truncation=True)\n",
+    "\n",
+    "    labels = []\n",
+    "    for i, label in enumerate(examples[\"ner_tags\"]):\n",
+    "        word_ids = tokenized_inputs.word_ids(batch_index=i) # Map tokens to their respective word.\n",
+    "        previous_word_idx = None\n",
+    "        label_ids = []\n",
+    "        for word_idx in word_ids: # Set the special tokens to -100.\n",
+    "            if word_idx is None:\n",
+    "                label_ids.append(-100)\n",
+    "            elif word_idx != previous_word_idx: # Only label the first token of a given word.\n",
+    "                label_ids.append(label[word_idx])\n",
+    "            else:\n",
+    "                label_ids.append(-100)\n",
+    "            previous_word_idx = word_idx\n",
+    "        labels.append(label_ids)\n",
+    "\n",
+    "    tokenized_inputs[\"labels\"] = labels\n",
+    "    return tokenized_inputs\n",
+    "\n",
+    "tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)\n",
+    "data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Obtaining 🤗 PEFT Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "peft_config = LoraConfig(\n",
+    "    task_type=TaskType.TOKEN_CLS,\n",
+    "    inference_mode=False,\n",
+    "    r=config.lora_r,\n",
+    "    lora_alpha=config.lora_alpha,\n",
+    "    lora_dropout=config.lora_dropout\n",
+    ")\n",
+    "\n",
+    "model = get_peft_model(model, peft_config)\n",
+    "model.print_trainable_parameters()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## ✍️ Training\n",
+    "---"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "training_args = TrainingArguments(\n",
+    "    output_dir=\"unsloth-llama-2-7b-bnb-4bit-conll2003\",\n",
+    "    learning_rate=config.learning_rate,\n",
+    "    gradient_accumulation_steps=2,\n",
+    "    per_device_train_batch_size=config.batch_size,\n",
+    "    per_device_eval_batch_size=config.batch_size,\n",
+    "    num_train_epochs=config.num_epochs,\n",
+    "    logging_steps=100,\n",
+    "    weight_decay=0.01,\n",
+    "    evaluation_strategy=\"epoch\",\n",
+    "    save_strategy=\"epoch\",\n",
+    "    report_to=[\"wandb\"],\n",
+    "    optim=\"paged_adamw_8bit\",\n",
+    "    load_best_model_at_end=True,\n",
+    "    push_to_hub=True,\n",
+    ")\n",
+    "\n",
+    "trainer = Trainer(\n",
+    "    model=model,\n",
+    "    args=training_args,\n",
+    "    train_dataset=tokenized_ds[\"train\"],\n",
+    "    eval_dataset=tokenized_ds[\"test\"],\n",
+    "    tokenizer=tokenizer,\n",
+    "    data_collator=data_collator,\n",
+    "    compute_metrics=compute_metrics,\n",
+    ")\n",
+    "\n",
+    "train_results = trainer.train()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "wandb.config.train_results = train_results\n",
+    "wandb.finish()"
+   ]
+  },
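+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Quick Inference Check (Optional)\n",
+    "\n",
+    "A minimal sketch of tagging a single sentence with the fine-tuned model. It assumes `model`, `tokenizer`, and `label_list` from the cells above are still in memory; the example sentence is purely illustrative."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# A sketch: tag one example sentence with the fine-tuned model.\n",
+    "# Assumes `model`, `tokenizer`, and `label_list` from the cells above are still in memory.\n",
+    "words = [\"John\", \"lives\", \"in\", \"New\", \"York\"]  # illustrative input, not from the dataset\n",
+    "inputs = tokenizer(words, is_split_into_words=True, return_tensors=\"pt\").to(model.device)\n",
+    "\n",
+    "with torch.no_grad():\n",
+    "    logits = model(**inputs).logits\n",
+    "pred_ids = logits.argmax(dim=-1)[0].tolist()\n",
+    "\n",
+    "# Print one prediction per word (first sub-token only, mirroring the label alignment above).\n",
+    "previous_word_idx = None\n",
+    "for word_idx, pred in zip(inputs.word_ids(0), pred_ids):\n",
+    "    if word_idx is not None and word_idx != previous_word_idx:\n",
+    "        print(words[word_idx], \"->\", label_list[pred])\n",
+    "    previous_word_idx = word_idx"
+   ]
+  },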
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 📚 References\n",
+    "\n",
+    "* GitHub: [`4AI/LS-LLaMA`](https://github.com/4AI/LS-LLaMA)\n",
+    "* [Alpaca + Llama 7b example by `@unslothai`](https://colab.research.google.com/drive/1lBzz5KeZJKXjvivbYvmGarix9Ao6Wxe5?usp=sharing)"
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "include_colab_link": true,
+   "provenance": [],
+   "toc_visible": true
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}