feat: update yml chat_template to specify dataset field (#2001) [skip ci]

* feat: update yml chat_template to specify dataset field * feat: replace sharegpt references with chat_template
axolotl-ai-cloud · Oct 29, 2024 · 8c3a727 · 8c3a727
1 parent 107b67b
commit 8c3a727
Show file tree

Hide file tree

Showing 7 changed files with 26 additions and 13 deletions.
diff --git a/devtools/dev_sharegpt.yml → devtools/dev_chat_template.yml b/devtools/dev_sharegpt.yml → devtools/dev_chat_template.yml
@@ -7,8 +7,8 @@ load_in_8bit: true
 load_in_4bit: false
 
 datasets:
-  - path: philschmid/guanaco-sharegpt-style
-    type: sharegpt
+  - path: fozziethebeat/alpaca_messages_2k_test
+    type: chat_template
     shards: 10
 val_set_size: 0
 output_dir: temp_debug/axolotl_outputs/model

diff --git a/docs/debugging.qmd b/docs/debugging.qmd
@@ -51,12 +51,12 @@ While debugging it's helpful to simplify your test scenario as much as possible.
 
 ### Background
 
-The below example shows how to configure VSCode to debug data preprocessing of the `sharegpt` format.  This is the format used when you have the following in your axolotl config:
+The below example shows how to configure VSCode to debug data preprocessing of the `chat_template` format.  This is the format used when you have the following in your axolotl config:
 
 ```yaml
 datasets:
-  - path: <path to your sharegpt formatted dataset> # example on HF Hub: philschmid/guanaco-sharegpt-style
-    type: sharegpt
+  - path: <path to your chat_template formatted dataset> # example on HF Hub: fozziethebeat/alpaca_messages_2k_test
+    type: chat_template
 ```
 
 >[!Important]
@@ -83,20 +83,20 @@ If you are developing on a remote host, you can easily use VSCode to debug remotely.
 
 The easiest way to get started is to modify the [.vscode/launch.json](../.vscode/launch.json) file in this project.  This is just an example configuration, so you may need to modify or copy it to suit your needs.
 
-For example, to mimic the command `cd devtools && CUDA_VISIBLE_DEVICES=0 accelerate launch -m axolotl.cli.train dev_sharegpt.yml`, you would use the below configuration[^1].  Note that we add additional flags that override the axolotl config and incorporate the tips above (see the comments). We also set the working directory to `devtools` and set the `env` variable `HF_HOME` to a temporary folder that is later partially deleted.  This is because we want to delete the HF dataset cache before each run in order to ensure that the data preprocessing code is run from scratch.
+For example, to mimic the command `cd devtools && CUDA_VISIBLE_DEVICES=0 accelerate launch -m axolotl.cli.train dev_chat_template.yml`, you would use the below configuration[^1].  Note that we add additional flags that override the axolotl config and incorporate the tips above (see the comments). We also set the working directory to `devtools` and set the `env` variable `HF_HOME` to a temporary folder that is later partially deleted.  This is because we want to delete the HF dataset cache before each run in order to ensure that the data preprocessing code is run from scratch.
 
 ```jsonc
 // .vscode/launch.json
 {
     "version": "0.2.0",
     "configurations": [
         {
-            "name": "Debug axolotl prompt - sharegpt",
+            "name": "Debug axolotl prompt - chat_template",
             "type": "python",
             "module": "accelerate.commands.launch",
             "request": "launch",
             "args": [
-                "-m", "axolotl.cli.train", "dev_sharegpt.yml",
+                "-m", "axolotl.cli.train", "dev_chat_template.yml",
                 // The flags below simplify debugging by overriding the axolotl config
                 // with the debugging tips above.  Modify as needed.
                 "--dataset_processes=1",      // limits data preprocessing to one process
@@ -240,6 +240,6 @@ style="border-radius: 10px; display: block; margin: auto;" width="560" height="3
 </div>
 <br>
 
-[^1]: The config actually mimics the command `CUDA_VISIBLE_DEVICES=0 python -m accelerate.commands.launch -m axolotl.cli.train devtools/sharegpt.yml`, but this is the same thing.
+[^1]: The config actually mimics the command `CUDA_VISIBLE_DEVICES=0 python -m accelerate.commands.launch -m axolotl.cli.train devtools/dev_chat_template.yml`, but this is the same thing.
 
 [^2]: Many of the below flags are recommended best practices by Nvidia when using nvidia-container-toolkit.  You can read more about these flags [here](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html).
diff --git a/examples/deepseek-v2/qlora-fsdp-2_5.yaml b/examples/deepseek-v2/qlora-fsdp-2_5.yaml
@@ -16,7 +16,10 @@ chat_template: deepseek_v2
 datasets:
   - path: mlabonne/FineTome-100k
     type: chat_template
-    split: train
+    split: train[:20%]
+    field_messages: conversations
+    message_field_role: from
+    message_field_content: value
 
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.0

diff --git a/examples/gemma2/qlora.yml b/examples/gemma2/qlora.yml
@@ -11,8 +11,11 @@ chat_template: gemma
 datasets:
   - path: cgato/SlimOrcaDedupCleaned
     type: chat_template
-    chat_template: gemma
     drop_system_message: true
+    field_messages: conversations
+    message_field_role: from
+    message_field_content: value
+
 val_set_size: 0.0
 output_dir: ./outputs/out
 

diff --git a/examples/jamba/qlora_fsdp_large.yaml b/examples/jamba/qlora_fsdp_large.yaml
@@ -4,11 +4,15 @@ tokenizer_type: AutoTokenizer
 load_in_4bit: true
 strict: false
 use_tensorboard: true
+chat_template: jamba
 datasets:
   - path: cgato/SlimOrcaDedupCleaned
     type: chat_template
-    chat_template: jamba
     drop_system_message: true
+    field_messages: conversations
+    message_field_role: from
+    message_field_content: value
+
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.0
 output_dir: jamba-large-fsdp-qlora-ft

diff --git a/examples/llama-3/fft-8b-liger-fsdp.yaml b/examples/llama-3/fft-8b-liger-fsdp.yaml
@@ -14,6 +14,10 @@ datasets:
   - path: mlabonne/FineTome-100k
     type: chat_template
     split: train[:20%]
+    field_messages: conversations
+    message_field_role: from
+    message_field_content: value
+
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.02
 output_dir: ./outputs/out

diff --git a/examples/phi/lora-3.5.yaml b/examples/phi/lora-3.5.yaml
@@ -10,7 +10,6 @@ chat_template: phi_3
 datasets:
   - path: fozziethebeat/alpaca_messages_2k_test
     type: chat_template
-    chat_template: phi_3
     field_messages: messages
     message_field_role: role
     message_field_content: content