###############################################
##
## Trainer settings are kept minimal
## with checkpoint / model saving disabled,
## as this tutorial focuses only on datasets
##
## It only runs 10 steps, enough to prove that
## the dataset configs are valid
##
## See the full `config-example.yaml` for more
## details on the trainer/model configs
##
###############################################
trainer:
  max_steps: 10
  # Reasonable batch size, for a more realistic it/s rate
  target_batch_size: 32
########################################
## Training model settings
########################################
model:
  # Model to start the finetune/training process from
  load_model: ../model/L6-D512-V20259-init.pth
  # Context length to use for the training process
  # the larger the number (and batch size), the larger the VRAM usage
  #
  # Note that if the data sample context length is larger than the ctx_len,
  # its training process will be split into ctx_len sized chunks.
  #
  # This allows training on extremely large context lengths (eg. 100k),
  # without eating up too much VRAM, by keeping the training context length
  # to a reasonable number suitable for the current GPU setup
  ctx_len: 4096
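  # For example (simple arithmetic on the chunking described above):
  # a 10,000 token data sample with ctx_len 4096 would be trained as
  # chunks of 4096 + 4096 + 1808 tokens.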
  # Data samples will be cut down to the respective max ctx_len_cutoffs
  # values if they are larger than ctx_len. If the data sample is larger than
  # the largest len_cutoff, the remaining data will be discarded
  #
  # Leave it as a blank array to disable the feature
  ctx_len_cutoffs: []
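  # A hypothetical illustration (not part of the original config): capping
  # oversized samples at 8192 or 16384 tokens could look like
  # ctx_len_cutoffs: [8192, 16384]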
  # Experimental settings, number of tokens to skip in the data sample
  # prefix, for the respective cutoff length. Used to speed up the process
  #
  # Leave it as a blank array to disable the feature
  ctx_len_warmup_steps: []
  # Learning rate of the training process
  # ---
  # Initial learning rate of the process
  lr_init: 6e-4
  # Final learning rate after the learning rate period
  # learning rate will stay at the final value from then onwards
  #
  # NOTE: lr_final / lr_period does not work with warmup_steps
  # and will be ignored (or replaced) with the warmup_steps logic instead
  lr_final: 2e-4
  # Number of epochs to reduce the learning rate from lr_init to lr_final
  # 1 means a single epoch (so lr would be lr_final from epoch 2 onwards)
  # 0 means lr_final will apply immediately
  # -1 means we take the current max_step / max_epoch as the period
  lr_period: 1
  # lr_period type if it is set, defaults to epoch
  lr_period_type: epoch
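  # With the values above, the schedule reads as: start at 6e-4, decay towards
  # 2e-4 over the first epoch, then hold at 2e-4 for the rest of the run
  # (a worked reading of the settings above, not an extra option).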
  # We disable bptt / limit bptt_learning_range to 1, to ensure high throughput within a multi-gpu setup
  # (by skipping some synchronization code). Additionally, bptt learning should not be triggered
  # anyway, as the data sample should be within ctx size 99% of the time
  bptt_learning: true
  bptt_learning_range: 1
  # Adam optimizer settings
  # You probably want to leave this alone, unless you know what you are doing
  beta1: 0.9
  beta2: 0.99
  adam_eps: 1.0e-08
  weight_decay: 0.01
  # torch.set_float32_matmul_precision, used to optimize operations with tensor cores
  # this should be set to null for non-CUDA GPUs
  torch_set_float32_matmul_precision: 'high'
  # torch_set_float32_matmul_precision: null
  # various other settings you probably should leave alone
  grad_cp: true
  warmup_steps: -1
  layerwise_lr: true
  dim_att: null
  dim_ffn: null
data:
  # dataset_path for the prebuilt dataset, using HF `load_from_disk()`
  #
  # Use this if you have built your own dataset and saved it with `save_to_disk()`
  # with source left as null. Otherwise configure this to a directory where the
  # dataset will be built and tokenized by the huggingface dataset process.
  data_path: ../datapath/musnet/
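  # A minimal sketch of the `save_to_disk()` mechanism the comment above refers to
  # (an assumption about using the `datasets` library directly; normally the trainer
  # builds and saves this path for you when `source` is configured, and the exact
  # columns/tokenization it expects are not covered here):
  #
  #   from datasets import load_dataset
  #   ds = load_dataset("breadlicker45/musenet-encoders-40k")
  #   ds.save_to_disk("../datapath/musnet/")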
  # Otherwise provide the source path, which is used as the huggingface dataset path;
  # this will be used to populate the dataset_path
  #
  # Use either of the following
  # - huggingface dataset name
  # - Directory path to a directory containing dataset files
  # - Path to a single dataset file
  # - huggingface dataset mode (ie: text, csv, etc - use data_dir to configure the path then)
  # - null
  #
  # If source is disabled, all other params, except data_path, are ignored
  source: "breadlicker45/musenet-encoders-40k"
  # source: text
  # source: /home/ubuntu/RWKV-LM-LoRA/dataset-text/enwik8.txt
  # Use data_dir, if you are using source=text/json/etc
  # this should be relative to the trainer script path
  source_data_dir: null
  # After loading the dataset, split out test data used for validation.
  # This process is skipped if the dataset includes a test split
  # This process is skipped if set to zero
  test_split: 0.005
  test_split_shuffle: true
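  # With test_split: 0.005, roughly 0.5% of the loaded samples are held out
  # for validation (a worked reading of the fraction above).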
  # Tokenizer to use, use either the inbuilt 'neox', or 'world' tokenizer
  # If using a custom tokenizer, provide the tokenizer file path
  # ---
  tokenizer: "breadlicker45/muse-tokenizer2"
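  # A hedged sketch of how you could inspect this tokenizer before training
  # (assumption, not part of the original config: it is a standard Hugging Face
  # tokenizer repo loadable with `transformers`):
  #
  #   from transformers import AutoTokenizer
  #   tok = AutoTokenizer.from_pretrained("breadlicker45/muse-tokenizer2")
  #   print(tok.vocab_size)  # compare against the model vocab (the V20259 in load_model suggests 20259)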
  # Minimum / Maximum token size of the dataset to use
  # useful for filtering out small noisy data samples from large datasets
  # (eg. removal of small articles of less than 512 tokens from wikipedia)
  #
  # This is ignored, if set to -1
  min_token_size: -1
  max_token_size: -1
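  # A hypothetical illustration (not part of the original config): the
  # wikipedia-style filter from the comment above would be
  # min_token_size: 512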
  # # Rechunking of text dataset, this is done only when source is set as 'text'
  # # and will merge the various sentences into larger chunks, up to the target size
  # #
  # # Defaults to 4096
  # #
  # # This is ignored, if source is not set as text
  # # This is ignored, if set to zero
  # # ---
  # text_rechunk_size: 4096
  # # Apply text rechunk to the dataset, even if it's not a 'text' source
  # # This is done only after dataset filtering, and if source is not 'text'
  # # ---
  # text_rechunk_force: true
  # Custom text column to use, useful for datasets with alternative training column labels
  # This is checked before multi column merging, default is null (disabled)
  # eg: 'code'
  # ---
  custom_text_key: 'bing'
  # Multi Column merging process, default setting is used to support and merge
  # "instruction", "input", "output" datasets. To disable, set multi_column_keys to []
  #
  # A minimum of 2 columns, with non-empty data, is required for the merge to occur
  # If no match is found, this will fall back to the default prompt/completion or text column,
  # or throw an error if the default fallback is not found
  # ---
  # multi_column_keys: ['instruction', 'input', 'output']
  # multi_column_prefix: ['Instruction:\n', 'Input:\n', 'Output:\n']
  # multi_column_masking: [false, true, false]
  # multi_column_seperator: '\n\n'
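  # Roughly speaking (an illustration inferred from the prefixes/separator above,
  # not an exact spec of the merge), an instruction/input/output row would be
  # merged into a single training sample along the lines of:
  #
  #   Instruction:
  #   <instruction text>
  #
  #   Input:
  #   <input text>
  #
  #   Output:
  #   <output text>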
  # If processing prompt/completion jsonl pairs, the prompt is masked by default
  # use this flag to disable this default behaviour
  # ---
  # disable_prompt_mask: false
# Path to the current checkpoint to continue training from
# Set this to the last checkpoint after the first run
# (if it crashes and you want to resume)
# ckpt_path: ../checkpoint/Echo-B-1B4-enwiki/epoch=0-step=2500.ckpt
ckpt_path: null