###############################################
##
## Trainer settings are kept minimal
## with checkpoint / model saving disabled,
## as this tutorial focuses only on datasets
##
## It only runs 10 steps, enough to prove that
## the dataset configs are valid
##
## See the full `config-example.yaml` for more
## details on the trainer/model configs
##
###############################################
trainer:
  max_steps: 10
  # Reasonable batch size, for a more realistic it/s rate
  target_batch_size: 32
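  # (how the target batch size is reached, eg. via gradient accumulation
  # across devices, depends on the trainer setup - see config-example.yaml)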

model:
  load_model: ../model/L6-D512-neox-init.pth
  ctx_len: 1024
  lr_init: 3e-4

  # BPTT learning allows you to run the trainer against datasets
  # larger than the training context length
  bptt_learning: true
  bptt_learning_range: -1
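  # (a bptt_learning_range of -1 means no range limit, ie. BPTT is applied
  # across the full data sample)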

########################################
## Dataset settings
########################################
data:
  # data_path for the prebuilt dataset, using HF `load_from_disk()`
  #
  # Use this if you have built your own dataset and saved it with `save_to_disk()`
  # with source left as null. Otherwise, configure this to the directory in which
  # the dataset will be built and tokenized by the huggingface dataset process.
  data_path: ../datapath/neox/enwiki_10k_2048/

  # Otherwise, provide the source path, which is used as the huggingface dataset path;
  # this will be used to populate the data_path
  #
  # Use one of the following
  # - huggingface dataset
  # - Directory path to a directory containing dataset files
  # - Path to a single dataset file
  # - huggingface dataset mode (ie: text, csv, etc - use source_data_dir to configure the path then)
  # - null
  #
  # If source is disabled, all other params, except data_path, are ignored
  source: "teven/enwiki_10k"
  # source: text
  # source: /home/ubuntu/RWKV-LM-LoRA/dataset-text/enwik8.txt

  # # Additional source dataset params, used to grab subsets of the dataset
  # source_dataset_params:
  #   language: en

  # # Use source_data_dir if you are using source=text/json/etc
  # # this should be relative to the trainer script path
  # source_data_dir: null

  # Tokenizer to use: either the inbuilt 'neox' or 'world' tokenizer
  # If using a custom tokenizer, provide the tokenizer file path
  # ---
  tokenizer: neox
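  # For a custom tokenizer, point this at your tokenizer file instead
  # (hypothetical path shown below - adjust to wherever your tokenizer lives)
  # tokenizer: ../tokenizer/custom-tokenizer.json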

  # Minimum / Maximum token size of the dataset samples to use
  # useful for filtering out small noisy data samples from large datasets
  # (eg. removal of small articles of less than 512 tokens from wikipedia)
  #
  # This is ignored, if set to -1
  min_token_size: 64
  max_token_size: -1
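  # For example, to apply the wikipedia filter described above:
  # min_token_size: 512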

  # Rechunking of the text dataset; this is done only when source is set as 'text'
  # and will merge the various sentences into larger chunks, up to the target size
  #
  # Defaults to 4096
  #
  # This is ignored, if source is not set as text
  # This is ignored, if set to zero
  # ---
  text_rechunk_size: 2048

  # Apply text rechunk to the dataset, even if it's not a 'text' source
  # This is done only after dataset filtering, and if source is not 'text'
  # ---
  text_rechunk_force: true

  # Custom text column to use, useful for datasets with alternative training column labels
  # This is checked before multi column merging, default is null (disabled)
  # eg: 'code'
  # ---
  custom_text_key: 'text'

  # Multi Column merging process, whose default setting is used to support and merge
  # "instruction", "input", "output" datasets. To disable, set multi_column_keys to []
  #
  # A minimum of 2 columns, with non-empty data, is required for the merge to occur
  # If no match is found, this will fall back to the default prompt/completion or text column,
  # or throw an error if the default fallback is not found
  # ---
  # multi_column_keys: ['instruction', 'input', 'output']
  # multi_column_prefix: ['Instruction:\n', 'Input:\n', 'Output:\n']
  # multi_column_train_mask: [true, false, true]
  # multi_column_separator: '\n\n'
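  #
  # With the example settings above, a merged sample would look roughly like:
  #   Instruction:\n{instruction}\n\nInput:\n{input}\n\nOutput:\n{output}
  # with the train mask excluding the 'input' segment from the loss
  # (a sketch of the expected merge output, based on the prefixes/separator above)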

  # If processing prompt/completion jsonl pairs, the prompt is masked by default
  # use this flag to disable this default behaviour
  # ---
  # disable_prompt_completion_mask: false

  # After loading the dataset, split out the test data used for validation
  # This process is skipped if the dataset includes a test split
  # This process is skipped if set to zero
  test_split: 0.01
  test_split_shuffle: false
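  # (test_split: 0.01 holds out 1% of the samples for validation; enable
  # test_split_shuffle if your dataset is ordered and you want a randomized split)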