[Chat] support session-based training #4313

Merged · 1 commit · Jul 28, 2023
87 changes: 87 additions & 0 deletions applications/Chat/coati/dataset/conversation.py
@@ -0,0 +1,87 @@
# Copyright 2023 lm-sys@FastChat
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import dataclasses
from enum import Enum, auto
from typing import List


class SeparatorStyle(Enum):
    ADD_EOS_TOKEN = auto()


@dataclasses.dataclass
class Conversation:
    system: str
    roles: List[str]
    messages: List[List[str]]
    offset: int
    sep_style: SeparatorStyle = SeparatorStyle.ADD_EOS_TOKEN
    sep: str = "</s>"

    skip_next: bool = False

    def get_prompt(self):
        if self.sep_style == SeparatorStyle.ADD_EOS_TOKEN:
            ret = self.system
            for role, message in self.messages:
                if message:
                    ret += role + ": " + message + self.sep
                else:
                    ret += role + ": "
            return ret
        else:
            raise ValueError(f"Invalid style: {self.sep_style}")

    def append_message(self, role, message):
        self.messages.append([role, message])

    def to_gradio_chatbot(self):
        ret = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                ret.append([msg, None])
            else:
                ret[-1][-1] = msg
        return ret

    def copy(self):
        return Conversation(system=self.system,
                            roles=self.roles,
                            messages=[[x, y] for x, y in self.messages],
                            offset=self.offset,
                            sep_style=self.sep_style,
                            sep=self.sep)

    def dict(self):
        return {
            "system": self.system,
            "roles": self.roles,
            "messages": self.messages,
            "offset": self.offset,
            "sep": self.sep
        }


conv = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
    roles=("Human", "Assistant"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.ADD_EOS_TOKEN,
    sep="</s>",
)

default_conversation = conv
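
# Example: appending ("Human", "Hi") and ("Assistant", None) to a copy of
# `default_conversation` makes get_prompt() return
# "<system prompt>Human: Hi</s>Assistant: ", ready for the model to complete.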
97 changes: 87 additions & 10 deletions applications/Chat/coati/dataset/sft_dataset.py
@@ -15,7 +15,7 @@
import copy
import random
from dataclasses import dataclass, field
-from typing import Callable, Dict, Sequence
+from typing import Callable, Dict, List, Sequence, Tuple

import torch
import torch.distributed as dist
@@ -25,11 +25,21 @@

from colossalai.logging import get_dist_logger

from .conversation import default_conversation
from .utils import is_rank_0, jload

# The following is a template prompt for a 4-round conversation.
"""
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.

Human: xxx</s>Assistant: xxx</s>Human: xxx</s>Assistant: xxx</s>Human: xxx</s>Assistant: xxx</s>Human: xxx</s>Assistant: xxx</s>
"""
# Note that we compute the loss only on the assistant's answer tokens.
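# For example, in "Human: Hi</s>Assistant: Hello!</s>", only the tokens of
# "Hello!</s>" keep their labels; every other position is set to IGNORE_INDEX.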

logger = get_dist_logger()

IGNORE_INDEX = -100
DEFAULT_EOS_TOKEN = "</s>"
PROMPT_DICT = {
    "prompt_input":
        ("Below is an instruction that describes a task, paired with an input that provides further context. "
@@ -107,6 +117,61 @@ def preprocess(
    return dict(input_ids=input_ids, labels=labels)


def preprocess_conversation(sources: List[List[Dict]], tokenizer: transformers.PreTrainedTokenizer,
                            max_length: int) -> Dict:
    """Preprocess the conversation data by tokenizing."""
    conversations = []
    intermediates = []
    for source in sources:
        header = f"{default_conversation.system}"
        conversation, intermediate = _add_speaker_and_signal(header, source)
        conversations.append(conversation)
        intermediates.append(intermediate)

    conversations_tokenized = _tokenize_fn(conversations, tokenizer, max_length)
    input_ids = conversations_tokenized["input_ids"]
    targets = copy.deepcopy(input_ids)

    assert len(targets) == len(intermediates)
    for target, inters in zip(targets, intermediates):
        mask = torch.zeros_like(target, dtype=torch.bool)
        for inter in inters:
            tokenized = _tokenize_fn(inter, tokenizer, max_length)
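
            # inter = [start, end]: `start` is the conversation up to and including
            # "Assistant: ", while `end` additionally contains the assistant's reply
            # and the EOS token. The difference between their tokenized lengths
            # delimits the answer span (start_idx is shifted back by one token to be
            # robust at the tokenization boundary); only that span keeps its labels.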

            start_idx = tokenized["input_ids"][0].size(0) - 1
            end_idx = tokenized["input_ids"][1].size(0)

            mask[start_idx:end_idx] = True
        target[~mask] = IGNORE_INDEX

    return dict(input_ids=input_ids, labels=targets)


def _add_speaker_and_signal(header: str,
                            source: List[Dict],
                            get_conversation: bool = True) -> Tuple[str, List[List[str]]]:
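    """Add speaker roles and EOS separators to the conversation.

    Returns the full conversation string and, for each assistant ("gpt") turn,
    a [start, end] pair of prefix strings that is later tokenized to locate
    the answer tokens for loss masking.
    """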
    END_SIGNAL = DEFAULT_EOS_TOKEN
    conversation = header
    intermediate = []
    for sentence in source:
        from_str = sentence["from"]
        if from_str.lower() == "human":
            from_str = default_conversation.roles[0]
        elif from_str.lower() == "gpt":
            from_str = default_conversation.roles[1]
        else:
            from_str = 'unknown'

        value = from_str + ": " + sentence["value"] + END_SIGNAL
        if sentence["from"].lower() == "gpt":
            start = conversation + from_str + ": "
            end = conversation + value
            intermediate.append([start, end])
        if get_conversation:
            conversation += value
    return conversation, intermediate
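

# Illustration (assuming the default "Human"/"Assistant" roles): the source
# [{"from": "human", "value": "Hi"}, {"from": "gpt", "value": "Hello!"}]
# yields "<system>Human: Hi</s>Assistant: Hello!</s>" together with one
# [start, end] pair whose tokenized lengths delimit "Hello!</s>" for the loss.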


class SupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning."""

@@ -125,15 +190,27 @@ def __init__(self,
        list_data_dict = list_data_dict[:max_datasets_size]

        logger.info("Formatting inputs...")
-        prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]
-        sources = [
-            prompt_input.format_map(example) if example.get("input", "") != "" else prompt_no_input.format_map(example)
-            for example in list_data_dict
-        ]
-        targets = [f"{example['output']}{tokenizer.eos_token}" for example in list_data_dict]
-
-        logger.info("Tokenizing inputs... This may take some time...")
-        data_dict = preprocess(sources, targets, tokenizer, max_length)
+        if "conversations" not in list_data_dict[0]:
+            prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]
+            sources = [
+                prompt_input.format_map(example)
+                if example.get("input", "") != "" else prompt_no_input.format_map(example) for example in list_data_dict
+            ]
+            targets = [f"{example['output']}{tokenizer.eos_token}" for example in list_data_dict]
+
+            if is_rank_0():
+                logger.info("Tokenizing inputs... This may take some time...")
+
+            data_dict = preprocess(sources, targets, tokenizer, max_length)
+        else:
+            if is_rank_0():
+                logger.info("Tokenizing inputs... This may take some time...")
+
+            sources = [conv["conversations"] for conv in list_data_dict]
+            data_dict = preprocess_conversation(sources, tokenizer, max_length)

        if is_rank_0():
            logger.info("Tokenizing finish.")

        self.input_ids = data_dict["input_ids"]
        self.labels = data_dict["labels"]
44 changes: 44 additions & 0 deletions applications/Chat/examples/README.md
@@ -6,6 +6,7 @@
- [Table of Contents](#table-of-contents)
- [Install requirements](#install-requirements)
- [Supervised datasets collection](#supervised-datasets-collection)
- [Conversation dataset generation](#conversation-dataset-generation)
- [Stage1 - Supervised instructs tuning](#stage1---supervised-instructs-tuning)
- [Arg List](#arg-list)
- [Stage2 - Training reward model](#stage2---training-reward-model)
@@ -45,6 +46,49 @@ The following pic shows how we collected the data.
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/data-collect.png" width=500/>
</p>

### Conversation dataset generation

To further improve the model's ability to handle multi-turn conversations, the training data needs to include multi-turn samples. However, the InstructWild and Alpaca datasets consist of single-turn conversations only, and their organization is not suitable for storing multi-turn conversations. Therefore, besides converting those datasets, we also include multi-turn conversation datasets such as ShareGPT and transform them into the training format supported by ColossalChat.

A sample in the conversation dataset should have the following fields:

* `type` (str, optional): The type of the data sample.
* `language` (str, optional): The language of the data sample.
* `dataset` (str, optional): The dataset the data sample originates from.
* `conversations` (list, compulsory): Conversation content of the data sample.
* `id` (int, optional): The ID of the data sample.

A simple example:
```json
{
    "type": "instruction",
    "language": "English",
    "dataset": "Alpaca",
    "conversations": [
        {
            "from": "human",
            "value": "Give three tips for staying healthy."
        },
        {
            "from": "gpt",
            "value": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."
        }
    ],
    "id": 1
}
```

> **NOTE:** Only the key `conversations` is compulsory for training; the other keys serve as metadata. The number of turns in `conversations` can vary.

You can run `examples/generate_conversation_dataset.py` to generate a conversation dataset supported by ColossalChat:

```bash
python generate_conversation_dataset.py \
    --dataset "All" \
    --save_path "/path/to/dataset"
```
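
As a quick sanity check, a minimal sketch along the following lines (assuming the converted dataset was saved as `dataset.json`) can verify that every sample carries the compulsory key:

```python
import json

# Load the converted dataset and check each sample for the compulsory field.
with open("dataset.json", encoding="utf-8") as f:
    samples = json.load(f)

for sample in samples:
    assert "conversations" in sample, f"sample {sample.get('id')} lacks 'conversations'"
    for turn in sample["conversations"]:
        assert turn["from"] in ("human", "gpt"), f"unexpected speaker: {turn['from']}"
```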

## Stage1 - Supervised instructs tuning

Stage1 is supervised instruction fine-tuning, which uses the datasets mentioned earlier to fine-tune the model.
79 changes: 79 additions & 0 deletions applications/Chat/examples/generate_conversation_dataset.py
@@ -0,0 +1,79 @@
import argparse
import json

from datasets import load_dataset


def generate_alpaca():
    # Datasets with the same format ("instruction", "input", "output") as Alpaca
    # can be converted into one-round conversations.
    conversation_dataset = []
    dataset = load_dataset("tatsu-lab/alpaca", split="train")

    instructions = dataset["instruction"]
    inputs = dataset["input"]
    outputs = dataset["output"]

    assert len(instructions) == len(inputs) == len(outputs)

    for idx in range(len(instructions)):
        human_utterance = instructions[idx] + "\n\n" + inputs[idx] if inputs[idx] else instructions[idx]
        human = {"from": "human", "value": human_utterance}

        gpt_utterance = outputs[idx]
        gpt = {"from": "gpt", "value": gpt_utterance}

        conversation = dict(type="instruction", language="English", dataset="Alpaca", conversations=[human, gpt])
        conversation_dataset.append(conversation)

    return conversation_dataset


def generate_sharegpt():
    # ShareGPT data requires less processing.
    conversation_dataset = []
    dataset = load_dataset("anon8231489123/ShareGPT_Vicuna_unfiltered",
                           data_files="ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json",
                           split="train")

    conversations = dataset["conversations"]

    for idx in range(len(conversations)):
        for conv in conversations[idx]:
            # We don't need the markdown and text values.
            del conv["markdown"]
            del conv["text"]

        conversation = dict(type="conversation",
                            language="Multilingual",
                            dataset="ShareGPT",
                            conversations=conversations[idx])
        conversation_dataset.append(conversation)

    return conversation_dataset


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset',
                        type=str,
                        default="All",
                        choices=["Alpaca", "ShareGPT", "All"],
                        help="which dataset to convert, All will combine Alpaca and ShareGPT")
    parser.add_argument('--save_path', type=str, default="dataset.json", help="path to save the converted dataset")
    args = parser.parse_args()

    conversation_dataset = []

    if args.dataset == "Alpaca":
        conversation_dataset.extend(generate_alpaca())
    elif args.dataset == "ShareGPT":
        conversation_dataset.extend(generate_sharegpt())
    else:
        conversation_dataset.extend(generate_alpaca())
        conversation_dataset.extend(generate_sharegpt())

    for idx, sample in enumerate(conversation_dataset):
        sample["id"] = idx + 1

    with open(args.save_path, mode='w') as f:
        json.dump(conversation_dataset, f, indent=4, default=str, ensure_ascii=False)
8 changes: 3 additions & 5 deletions applications/Chat/examples/train_sft.py
@@ -74,8 +74,8 @@ def train(args):
            padding_side="right",
            use_fast=False,
        )
-        tokenizer.eos_token = '<\s>'
-        tokenizer.pad_token = tokenizer.unk_token
+        tokenizer.eos_token = '</s>'
+        tokenizer.pad_token = tokenizer.eos_token
    else:
        raise ValueError(f'Unsupported model "{args.model}"')

@@ -153,9 +153,7 @@ def train(args):
                                           optim,
                                           num_warmup_steps=math.ceil(max_steps * 0.03),
                                           num_training_steps=max_steps)
-    strategy_dict = strategy.prepare(
-        dict(model=model, optimizer=optim, lr_scheduler=lr_scheduler)
-    )
+    strategy_dict = strategy.prepare(dict(model=model, optimizer=optim, lr_scheduler=lr_scheduler))
    model = strategy_dict['model']
    optim = strategy_dict['optimizer']
    lr_scheduler = strategy_dict['lr_scheduler']