From a263c4ba834221fb47f64873b1bb454f8ded59b2 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Wed, 29 May 2024 16:03:38 +0200 Subject: [PATCH] clean up --- colabs/peft/llama_token_cls.ipynb | 946 ++++++++++++++---------------- 1 file changed, 442 insertions(+), 504 deletions(-) diff --git a/colabs/peft/llama_token_cls.ipynb b/colabs/peft/llama_token_cls.ipynb index 2dd44d58..55d68a90 100644 --- a/colabs/peft/llama_token_cls.ipynb +++ b/colabs/peft/llama_token_cls.ipynb @@ -1,507 +1,445 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [], - "collapsed_sections": [ - "qQZXezFvGvgg", - "BhOagCyPTiwO", - "wV-xDETxODhA", - "u3sWpjEuRpwV" - ], - "toc_visible": true, - "gpuType": "T4", - "private_outputs": true, - "cell_execution_strategy": "setup", - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - }, - "accelerator": "GPU" + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Open\n" + ] }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "source": [ - "![](llama_img.png)" - ], - "metadata": { - "id": "jY57rlNA8rn_" - } - }, - { - "cell_type": "markdown", - "source": [ - "## 📦 Packages and Basic Setup\n", - "---\n", - "\n", - "To run the notebooks you'll need two secrets named `W&B` and `HF_TOKEN`. Also, in the configuration section change the `wandb_entity` to your username/workspace." - ], - "metadata": { - "id": "guHo1NzrGc33" - } - }, - { - "cell_type": "code", - "source": [ - "%%capture\n", - "!pip install -q -U bitsandbytes datasets evaluate ml-collections seqeval wandb\n", - "!pip install -q git+https://github.com/huggingface/peft.git" - ], - "metadata": { - "id": "YoK6kYdVEjmI" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zIMX8IAYFytA" - }, - "outputs": [], - "source": [ - "import evaluate\n", - "import numpy as np\n", - "from transformers import AutoTokenizer\n", - "from datasets import ClassLabel, load_dataset\n", - "from transformers import TrainingArguments, Trainer\n", - "from peft import get_peft_model, LoraConfig, TaskType\n", - "from transformers import DataCollatorForTokenClassification" - ] - }, - { - "cell_type": "code", - "source": [ - "import wandb\n", - "wandb.login()" - ], - "metadata": { - "id": "uV4ifIYHTxsa" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# @title ⚙️ Configuration\n", - "\n", - "import ml_collections\n", - "\n", - "def get_config() -> ml_collections.ConfigDict:\n", - " config = ml_collections.ConfigDict()\n", - " config.model = \"unsloth/llama-2-7b-bnb-4bit\" # @param {type: \"string\"}\n", - " config.lora_r = 4 # @param {type: \"number\"}\n", - " config.lora_alpha = 32 # @param {type: \"number\"}\n", - " config.lora_dropout = 0.1 # @param {type: \"number\"}\n", - " config.max_length = 32 # @param {type: \"number\"}\n", - " config.batch_size = 16 # @param {type: \"number\"}\n", - " config.num_epochs = 5 # @param {type: \"number\"}\n", - " config.learning_rate = 1e-3 # @param {type: \"number\"}\n", - " config.dataset = \"conll2003\" # @param {type: \"string\"}\n", - " config.wandb_entity = None # @param {type: \"string\"}\n", - " return config\n", - "\n", - "config = get_config()" - ], - "metadata": { - "cellView": "form", 
- "id": "xGpvXtooGe5c" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "import os\n", - "import wandb\n", - "\n", - "wandb.init(\n", - " project=\"Llama-NER\",\n", - " job_type=\"train\",\n", - " group=config.model,\n", - " config = config.to_dict(),\n", - " entity=config.wandb_entity,\n", - ")\n", - "\n", - "os.environ[\"WANDB_WATCH\"]=\"false\"\n", - "os.environ[\"WANDB_LOG_MODEL\"]=\"false\"\n", - "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"" - ], - "metadata": { - "id": "unwtlHx-UFSY" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## 💿 The Dataset\n", - "---" - ], - "metadata": { - "id": "qQZXezFvGvgg" - } - }, - { - "cell_type": "code", - "source": [ - "%%capture\n", - "ds = load_dataset(\n", - " config.dataset,\n", - " cache_dir=\"/cache/\",\n", - ")\n", - "\n", - "seqeval = evaluate.load(\"seqeval\")" - ], - "metadata": { - "id": "N4L9dLPJGynQ" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "column_names = ds[\"train\"].column_names\n", - "features = ds[\"train\"].features\n", - "\n", - "text_column_name = \"tokens\"\n", - "label_column_name = \"ner_tags\"\n", - "\n", - "label_list = features[label_column_name].feature.names\n", - "label2id = {i: i for i in range(len(label_list))}\n", - "id2label = {v: k for k, v in label2id.items()}" - ], - "metadata": { - "id": "WfyZrcMQH9gu" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## 🖖 Utility Functions\n", - "---" - ], - "metadata": { - "id": "BhOagCyPTiwO" - } - }, - { - "cell_type": "code", - "source": [ - "def compute_metrics(p):\n", - " predictions, labels = p\n", - " predictions = np.argmax(predictions, axis=2)\n", - "\n", - " true_predictions = [\n", - " [label_list[p] for (p, l) in zip(prediction, label) if l != -100]\n", - " for prediction, label in zip(predictions, labels)\n", - " ]\n", - " true_labels = [\n", - " [label_list[l] for (p, l) in zip(prediction, label) if l != -100]\n", - " for prediction, label in zip(predictions, labels)\n", - " ]\n", - "\n", - " results = seqeval.compute(predictions=true_predictions, references=true_labels)\n", - " return {\n", - " \"precision\": results[\"overall_precision\"],\n", - " \"recall\": results[\"overall_recall\"],\n", - " \"f1\": results[\"overall_f1\"],\n", - " \"accuracy\": results[\"overall_accuracy\"],\n", - " }" - ], - "metadata": { - "id": "f03prfWbTxsK" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## 🏠 Model Architecture\n", - "---" - ], - "metadata": { - "id": "_fPQvJkJLV1B" - } - }, - { - "cell_type": "markdown", - "source": [ - "### Implementating `LlamaForTokenClassification`\n", - "\n", - "[Source: @KoichiYasuoka](https://github.com/huggingface/transformers/issues/26521#issuecomment-1868284434)" - ], - "metadata": { - "id": "wV-xDETxODhA" - } - }, - { - "cell_type": "code", - "source": [ - "%%capture\n", - "from typing import List, Optional, Tuple, Union\n", - "import torch\n", - "from torch import nn\n", - "from transformers.modeling_outputs import TokenClassifierOutput\n", - "from transformers.file_utils import add_start_docstrings_to_model_forward\n", - "from transformers.models.llama.modeling_llama import LlamaModel, LlamaPreTrainedModel, LLAMA_INPUTS_DOCSTRING\n", - "\n", - "class LlamaForTokenClassification(LlamaPreTrainedModel):\n", - " def __init__(self, config):\n", - " super().__init__(config)\n", - " 
self.num_labels = config.num_labels\n", - " self.model = LlamaModel(config)\n", - " if hasattr(config, \"classifier_dropout\") and config.classifier_dropout is not None:\n", - " classifier_dropout = config.classifier_dropout\n", - " elif hasattr(config, \"hidden_dropout\") and config.hidden_dropout is not None:\n", - " classifier_dropout = config.hidden_dropout\n", - " else:\n", - " classifier_dropout = 0.1\n", - " self.dropout = nn.Dropout(classifier_dropout)\n", - " self.classifier = nn.Linear(config.hidden_size, config.num_labels)\n", - "\n", - " # Initialize weights and apply final processing\n", - " self.post_init()\n", - "\n", - " def get_input_embeddings(self):\n", - " return self.model.embed_tokens\n", - "\n", - " def set_input_embeddings(self, value):\n", - " self.model.embed_tokens = value\n", - "\n", - " @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)\n", - " def forward(\n", - " self,\n", - " input_ids: Optional[torch.LongTensor] = None,\n", - " attention_mask: Optional[torch.Tensor] = None,\n", - " position_ids: Optional[torch.LongTensor] = None,\n", - " past_key_values: Optional[List[torch.FloatTensor]] = None,\n", - " inputs_embeds: Optional[torch.FloatTensor] = None,\n", - " labels: Optional[torch.LongTensor] = None,\n", - " use_cache: Optional[bool] = None,\n", - " output_attentions: Optional[bool] = None,\n", - " output_hidden_states: Optional[bool] = None,\n", - " return_dict: Optional[bool] = None,\n", - " ) -> Union[Tuple, TokenClassifierOutput]:\n", - "\n", - " return_dict = return_dict if return_dict is not None else self.config.use_return_dict\n", - "\n", - " transformer_outputs = self.model(\n", - " input_ids,\n", - " attention_mask=attention_mask,\n", - " position_ids=position_ids,\n", - " past_key_values=past_key_values,\n", - " inputs_embeds=inputs_embeds,\n", - " use_cache=use_cache,\n", - " output_attentions=output_attentions,\n", - " output_hidden_states=output_hidden_states,\n", - " return_dict=return_dict,\n", - " )\n", - "\n", - " hidden_states = transformer_outputs[0]\n", - " hidden_states = self.dropout(hidden_states)\n", - " logits = self.classifier(hidden_states)\n", - "\n", - " loss = None\n", - " if labels is not None:\n", - " labels = labels.to(logits.device)\n", - " loss_fct = nn.CrossEntropyLoss()\n", - " loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))\n", - "\n", - " if not return_dict:\n", - " output = (logits,) + transformer_outputs[2:]\n", - " return ((loss,) + output) if loss is not None else output\n", - "\n", - " return TokenClassifierOutput(\n", - " loss=loss,\n", - " logits=logits,\n", - " hidden_states=transformer_outputs.hidden_states,\n", - " attentions=transformer_outputs.attentions\n", - " )\n", - "\n", - "tokenizer = AutoTokenizer.from_pretrained(config.model)\n", - "\n", - "model = LlamaForTokenClassification.from_pretrained(\n", - " config.model,\n", - " num_labels=len(label_list),\n", - " id2label=id2label,\n", - " label2id=label2id,\n", - " cache_dir=\"/cache/\",\n", - ")" - ], - "metadata": { - "id": "O7fqyqpnMY3m" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "### Process Dataset for Token Classification" - ], - "metadata": { - "id": "u3sWpjEuRpwV" - } - }, - { - "cell_type": "code", - "source": [ - "def tokenize_and_align_labels(examples):\n", - " tokenized_inputs = tokenizer(examples[\"tokens\"], is_split_into_words=True, padding='longest', max_length=config.max_length, truncation=True)\n", - "\n", - " labels = []\n", - " for i, label in 
enumerate(examples[f\"ner_tags\"]):\n", - " word_ids = tokenized_inputs.word_ids(batch_index=i) # Map tokens to their respective word.\n", - " previous_word_idx = None\n", - " label_ids = []\n", - " for word_idx in word_ids: # Set the special tokens to -100.\n", - " if word_idx is None:\n", - " label_ids.append(-100)\n", - " elif word_idx != previous_word_idx: # Only label the first token of a given word.\n", - " label_ids.append(label[word_idx])\n", - " else:\n", - " label_ids.append(-100)\n", - " previous_word_idx = word_idx\n", - " labels.append(label_ids)\n", - "\n", - " tokenized_inputs[\"labels\"] = labels\n", - " return tokenized_inputs\n", - "\n", - "tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)\n", - "data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)" - ], - "metadata": { - "id": "rvlLnwEHRsOF" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "### Obtaining 🤗 PEFT Model" - ], - "metadata": { - "id": "Xi8EfJTNQZ50" - } - }, - { - "cell_type": "code", - "source": [ - "peft_config = LoraConfig(\n", - " task_type=TaskType.TOKEN_CLS,\n", - " inference_mode=False,\n", - " r=config.lora_r,\n", - " lora_alpha=config.lora_alpha,\n", - " lora_dropout=config.lora_dropout\n", - ")\n", - "\n", - "model = get_peft_model(model, peft_config)\n", - "model.print_trainable_parameters()" - ], - "metadata": { - "id": "7CrzHHifLYym" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## ✍️ Training\n", - "---" - ], - "metadata": { - "id": "3pcAz6LISPEy" - } - }, - { - "cell_type": "code", - "source": [ - "training_args = TrainingArguments(\n", - " output_dir=\"unsloth-llama-2-7b-bnb-4bit-conll2003\",\n", - " learning_rate=config.learning_rate,\n", - " gradient_accumulation_steps=2,\n", - " per_device_train_batch_size=config.batch_size,\n", - " per_device_eval_batch_size=config.batch_size,\n", - " num_train_epochs=config.num_epochs,\n", - " logging_steps=100,\n", - " weight_decay=0.01,\n", - " evaluation_strategy=\"epoch\",\n", - " save_strategy=\"epoch\",\n", - " report_to=[\"wandb\"],\n", - " optim=\"paged_adamw_8bit\",\n", - " load_best_model_at_end=True,\n", - " push_to_hub=True,\n", - ")\n", - "\n", - "trainer = Trainer(\n", - " model=model,\n", - " args=training_args,\n", - " train_dataset=tokenized_ds[\"train\"],\n", - " eval_dataset=tokenized_ds[\"test\"],\n", - " tokenizer=tokenizer,\n", - " data_collator=data_collator,\n", - " compute_metrics=compute_metrics,\n", - ")\n", - "\n", - "train_results = trainer.train()" - ], - "metadata": { - "id": "XzWxRp27SQz9" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "wandb.config.train_results = train_results\n", - "wandb.finish()" - ], - "metadata": { - "id": "7rORnBvUVea1" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## 📚 References\n", - "\n", - "* Github: [`4AI/LS-LLaMA`](https://github.com/4AI/LS-LLaMA)\n", - "* [Alpaca + Llama 7b example by `@unslothai`](https://colab.research.google.com/drive/1lBzz5KeZJKXjvivbYvmGarix9Ao6Wxe5?usp=sharing)" - ], - "metadata": { - "id": "GYHkmefyPTOQ" - } - } - ] + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](llama_img.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 📦 Packages and Basic Setup\n", + "---\n", + "\n", + "To run the notebooks you'll need two secrets named `W&B` and `HF_TOKEN`. 
Also, in the configuration section change the `wandb_entity` to your username/workspace." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "!pip install -q -U bitsandbytes datasets evaluate ml-collections seqeval wandb\n", + "!pip install -q git+https://github.com/huggingface/peft.git" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import evaluate\n", + "import numpy as np\n", + "from transformers import AutoTokenizer\n", + "from datasets import ClassLabel, load_dataset\n", + "from transformers import TrainingArguments, Trainer\n", + "from peft import get_peft_model, LoraConfig, TaskType\n", + "from transformers import DataCollatorForTokenClassification" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import wandb\n", + "wandb.login()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# @title ⚙️ Configuration\n", + "\n", + "import ml_collections\n", + "\n", + "def get_config() -> ml_collections.ConfigDict:\n", + " config = ml_collections.ConfigDict()\n", + " config.model = \"unsloth/llama-2-7b-bnb-4bit\" # @param {type: \"string\"}\n", + " config.lora_r = 4 # @param {type: \"number\"}\n", + " config.lora_alpha = 32 # @param {type: \"number\"}\n", + " config.lora_dropout = 0.1 # @param {type: \"number\"}\n", + " config.max_length = 32 # @param {type: \"number\"}\n", + " config.batch_size = 16 # @param {type: \"number\"}\n", + " config.num_epochs = 5 # @param {type: \"number\"}\n", + " config.learning_rate = 1e-3 # @param {type: \"number\"}\n", + " config.dataset = \"conll2003\" # @param {type: \"string\"}\n", + " config.wandb_entity = None # @param {type: \"string\"}\n", + " return config\n", + "\n", + "config = get_config()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import wandb\n", + "\n", + "wandb.init(\n", + " project=\"Llama-NER\",\n", + " job_type=\"train\",\n", + " group=config.model,\n", + " config = config.to_dict(),\n", + " entity=config.wandb_entity,\n", + ")\n", + "\n", + "os.environ[\"WANDB_WATCH\"]=\"false\"\n", + "os.environ[\"WANDB_LOG_MODEL\"]=\"false\"\n", + "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 💿 The Dataset\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "ds = load_dataset(\n", + " config.dataset,\n", + " cache_dir=\"/cache/\",\n", + ")\n", + "\n", + "seqeval = evaluate.load(\"seqeval\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "column_names = ds[\"train\"].column_names\n", + "features = ds[\"train\"].features\n", + "\n", + "text_column_name = \"tokens\"\n", + "label_column_name = \"ner_tags\"\n", + "\n", + "label_list = features[label_column_name].feature.names\n", + "label2id = {i: i for i in range(len(label_list))}\n", + "id2label = {v: k for k, v in label2id.items()}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 🖖 Utility Functions\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def compute_metrics(p):\n", + " predictions, labels = p\n", + 
" predictions = np.argmax(predictions, axis=2)\n", + "\n", + " true_predictions = [\n", + " [label_list[p] for (p, l) in zip(prediction, label) if l != -100]\n", + " for prediction, label in zip(predictions, labels)\n", + " ]\n", + " true_labels = [\n", + " [label_list[l] for (p, l) in zip(prediction, label) if l != -100]\n", + " for prediction, label in zip(predictions, labels)\n", + " ]\n", + "\n", + " results = seqeval.compute(predictions=true_predictions, references=true_labels)\n", + " return {\n", + " \"precision\": results[\"overall_precision\"],\n", + " \"recall\": results[\"overall_recall\"],\n", + " \"f1\": results[\"overall_f1\"],\n", + " \"accuracy\": results[\"overall_accuracy\"],\n", + " }" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 🏠 Model Architecture\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Implementating `LlamaForTokenClassification`\n", + "\n", + "[Source: @KoichiYasuoka](https://github.com/huggingface/transformers/issues/26521#issuecomment-1868284434)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "from typing import List, Optional, Tuple, Union\n", + "import torch\n", + "from torch import nn\n", + "from transformers.modeling_outputs import TokenClassifierOutput\n", + "from transformers.file_utils import add_start_docstrings_to_model_forward\n", + "from transformers.models.llama.modeling_llama import LlamaModel, LlamaPreTrainedModel, LLAMA_INPUTS_DOCSTRING\n", + "\n", + "class LlamaForTokenClassification(LlamaPreTrainedModel):\n", + " def __init__(self, config):\n", + " super().__init__(config)\n", + " self.num_labels = config.num_labels\n", + " self.model = LlamaModel(config)\n", + " if hasattr(config, \"classifier_dropout\") and config.classifier_dropout is not None:\n", + " classifier_dropout = config.classifier_dropout\n", + " elif hasattr(config, \"hidden_dropout\") and config.hidden_dropout is not None:\n", + " classifier_dropout = config.hidden_dropout\n", + " else:\n", + " classifier_dropout = 0.1\n", + " self.dropout = nn.Dropout(classifier_dropout)\n", + " self.classifier = nn.Linear(config.hidden_size, config.num_labels)\n", + "\n", + " # Initialize weights and apply final processing\n", + " self.post_init()\n", + "\n", + " def get_input_embeddings(self):\n", + " return self.model.embed_tokens\n", + "\n", + " def set_input_embeddings(self, value):\n", + " self.model.embed_tokens = value\n", + "\n", + " @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)\n", + " def forward(\n", + " self,\n", + " input_ids: Optional[torch.LongTensor] = None,\n", + " attention_mask: Optional[torch.Tensor] = None,\n", + " position_ids: Optional[torch.LongTensor] = None,\n", + " past_key_values: Optional[List[torch.FloatTensor]] = None,\n", + " inputs_embeds: Optional[torch.FloatTensor] = None,\n", + " labels: Optional[torch.LongTensor] = None,\n", + " use_cache: Optional[bool] = None,\n", + " output_attentions: Optional[bool] = None,\n", + " output_hidden_states: Optional[bool] = None,\n", + " return_dict: Optional[bool] = None,\n", + " ) -> Union[Tuple, TokenClassifierOutput]:\n", + "\n", + " return_dict = return_dict if return_dict is not None else self.config.use_return_dict\n", + "\n", + " transformer_outputs = self.model(\n", + " input_ids,\n", + " attention_mask=attention_mask,\n", + " position_ids=position_ids,\n", + " past_key_values=past_key_values,\n", + " inputs_embeds=inputs_embeds,\n", + 
" use_cache=use_cache,\n", + " output_attentions=output_attentions,\n", + " output_hidden_states=output_hidden_states,\n", + " return_dict=return_dict,\n", + " )\n", + "\n", + " hidden_states = transformer_outputs[0]\n", + " hidden_states = self.dropout(hidden_states)\n", + " logits = self.classifier(hidden_states)\n", + "\n", + " loss = None\n", + " if labels is not None:\n", + " labels = labels.to(logits.device)\n", + " loss_fct = nn.CrossEntropyLoss()\n", + " loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))\n", + "\n", + " if not return_dict:\n", + " output = (logits,) + transformer_outputs[2:]\n", + " return ((loss,) + output) if loss is not None else output\n", + "\n", + " return TokenClassifierOutput(\n", + " loss=loss,\n", + " logits=logits,\n", + " hidden_states=transformer_outputs.hidden_states,\n", + " attentions=transformer_outputs.attentions\n", + " )\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(config.model)\n", + "\n", + "model = LlamaForTokenClassification.from_pretrained(\n", + " config.model,\n", + " num_labels=len(label_list),\n", + " id2label=id2label,\n", + " label2id=label2id,\n", + " cache_dir=\"/cache/\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Process Dataset for Token Classification" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def tokenize_and_align_labels(examples):\n", + " tokenized_inputs = tokenizer(examples[\"tokens\"], is_split_into_words=True, padding='longest', max_length=config.max_length, truncation=True)\n", + "\n", + " labels = []\n", + " for i, label in enumerate(examples[f\"ner_tags\"]):\n", + " word_ids = tokenized_inputs.word_ids(batch_index=i) # Map tokens to their respective word.\n", + " previous_word_idx = None\n", + " label_ids = []\n", + " for word_idx in word_ids: # Set the special tokens to -100.\n", + " if word_idx is None:\n", + " label_ids.append(-100)\n", + " elif word_idx != previous_word_idx: # Only label the first token of a given word.\n", + " label_ids.append(label[word_idx])\n", + " else:\n", + " label_ids.append(-100)\n", + " previous_word_idx = word_idx\n", + " labels.append(label_ids)\n", + "\n", + " tokenized_inputs[\"labels\"] = labels\n", + " return tokenized_inputs\n", + "\n", + "tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)\n", + "data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Obtaining 🤗 PEFT Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "peft_config = LoraConfig(\n", + " task_type=TaskType.TOKEN_CLS,\n", + " inference_mode=False,\n", + " r=config.lora_r,\n", + " lora_alpha=config.lora_alpha,\n", + " lora_dropout=config.lora_dropout\n", + ")\n", + "\n", + "model = get_peft_model(model, peft_config)\n", + "model.print_trainable_parameters()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ✍️ Training\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "training_args = TrainingArguments(\n", + " output_dir=\"unsloth-llama-2-7b-bnb-4bit-conll2003\",\n", + " learning_rate=config.learning_rate,\n", + " gradient_accumulation_steps=2,\n", + " per_device_train_batch_size=config.batch_size,\n", + " per_device_eval_batch_size=config.batch_size,\n", + " 
num_train_epochs=config.num_epochs,\n", + " logging_steps=100,\n", + " weight_decay=0.01,\n", + " evaluation_strategy=\"epoch\",\n", + " save_strategy=\"epoch\",\n", + " report_to=[\"wandb\"],\n", + " optim=\"paged_adamw_8bit\",\n", + " load_best_model_at_end=True,\n", + " push_to_hub=True,\n", + ")\n", + "\n", + "trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " train_dataset=tokenized_ds[\"train\"],\n", + " eval_dataset=tokenized_ds[\"test\"],\n", + " tokenizer=tokenizer,\n", + " data_collator=data_collator,\n", + " compute_metrics=compute_metrics,\n", + ")\n", + "\n", + "train_results = trainer.train()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "wandb.config.train_results = train_results\n", + "wandb.finish()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 📚 References\n", + "\n", + "* Github: [`4AI/LS-LLaMA`](https://github.com/4AI/LS-LLaMA)\n", + "* [Alpaca + Llama 7b example by `@unslothai`](https://colab.research.google.com/drive/1lBzz5KeZJKXjvivbYvmGarix9Ao6Wxe5?usp=sharing)" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "include_colab_link": true, + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 }
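
The setup cell in the patched notebook asks for two Colab secrets, `W&B` and `HF_TOKEN`, but only `wandb.login()` is called and the Hugging Face login needed for `push_to_hub=True` stays implicit. A minimal sketch of wiring both secrets in, assuming the Colab secrets panel and the secret names quoted in the setup text:

```python
# Sketch only, assuming Colab and the secret names from the setup section
# ("W&B" for the wandb API key, "HF_TOKEN" for the Hugging Face token).
from google.colab import userdata
import wandb
import huggingface_hub

wandb.login(key=userdata.get("W&B"))                    # wandb API key from the Colab secret
huggingface_hub.login(token=userdata.get("HF_TOKEN"))   # needed later for push_to_hub=True
```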
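
The dataset cell builds `label2id`/`id2label` as integer-to-integer maps, which is enough here because `compute_metrics` indexes `label_list` directly. For reference, a sketch of the name-keyed mapping that `transformers` token-classification examples usually pass to the model config, built from the same CoNLL-2003 `ClassLabel` feature:

```python
# Sketch: name-based label maps for the model config (CoNLL-2003 tags such as
# "B-PER", "I-ORG"). Same information as the notebook's integer maps, but
# human-readable in the pushed config/model card.
label_list = ds["train"].features["ner_tags"].feature.names
id2label = {i: name for i, name in enumerate(label_list)}
label2id = {name: i for i, name in id2label.items()}
```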
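
The model cell loads the pre-quantized `unsloth/llama-2-7b-bnb-4bit` checkpoint and the `TrainingArguments` use `paged_adamw_8bit`, but the quantization setup itself is never spelled out. A hedged sketch of the explicit alternative, loading a stock Llama-2 checkpoint in 4-bit and preparing it for LoRA; the checkpoint name and `device_map` choice are illustrative assumptions, not part of the patch:

```python
# Sketch: explicit 4-bit loading with bitsandbytes instead of a pre-quantized
# checkpoint. Assumes the notebook's LlamaForTokenClassification class,
# label_list, id2label and label2id are already defined.
import torch
from transformers import BitsAndBytesConfig
from peft import prepare_model_for_kbit_training

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = LlamaForTokenClassification.from_pretrained(
    "meta-llama/Llama-2-7b-hf",     # assumption: any Llama-2 base checkpoint
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
    quantization_config=bnb_config,
    device_map="auto",
)
# Freeze base weights and upcast norm layers: standard prep before get_peft_model.
model = prepare_model_for_kbit_training(model)
```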
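
The notebook trains and logs metrics but never runs the fine-tuned model on a sentence. A small sanity-check sketch, assuming the notebook's `model`, `tokenizer`, and `label_list` are still in scope; the sentence is the first CoNLL-2003 training example:

```python
# Sketch: tag one pre-tokenized sentence and print the prediction for the first
# sub-token of each word (mirroring the -100 alignment used during training).
import torch

words = ["EU", "rejects", "German", "call", "to", "boycott", "British", "lamb", "."]
enc = tokenizer(words, is_split_into_words=True, return_tensors="pt").to(model.device)

with torch.no_grad():
    logits = model(**enc).logits        # shape (1, seq_len, num_labels)

pred_ids = logits.argmax(dim=-1)[0].tolist()
word_ids = enc.word_ids(0)              # None marks special tokens
seen = set()
for tok_idx, w_idx in enumerate(word_ids):
    if w_idx is not None and w_idx not in seen:
        seen.add(w_idx)
        print(f"{words[w_idx]:>10} -> {label_list[pred_ids[tok_idx]]}")
```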