###############################################
##
## Trainer settings are kept minimal
## with checkpoint / model saving disabled,
## as this tutorial focuses only on datasets
##
## It only runs 10 steps, enough to prove that
## the dataset configs are valid
##
## See the full `config-example.yaml` for more
## details on the trainer/model configs
##
###############################################
trainer:
  max_steps: 10
  # Reasonable batch size, for a more realistic it/s rate
  target_batch_size: 32
########################################
## Training model settings
########################################
model:
  # Model to start the finetune/training process from
  load_model: ../model/L6-D512-V20259-init.pth
  # Context length to use for the training process
  # the larger the number (and batch size), the larger the VRAM usage
  #
  # Note that if the data sample context length is larger than the ctx_len,
  # its training process will be split into ctx_len sized chunks.
  #
  # This allows training on extremely large context lengths (eg. 100k),
  # without eating up too much VRAM, by keeping the training context length
  # to a reasonable number suitable for the current GPU setup
  ctx_len: 4096
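  # For example (simple arithmetic on the chunking described above):
  # a 10,000 token data sample with ctx_len 4096 would be trained as
  # chunks of 4096 + 4096 + 1808 tokens.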
  # Data samples will be cut down to the respective max ctx_len_cutoffs
  # values if they are larger than ctx_len. If the data sample is larger than
  # the largest len_cutoff, the remaining data will be discarded
  #
  # Leave it as a blank array to disable the feature
  ctx_len_cutoffs: []
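  # A hypothetical illustration (not part of the original config): capping
  # oversized samples at 8192 or 16384 tokens could look like
  # ctx_len_cutoffs: [8192, 16384]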
  # Experimental settings, number of tokens to skip in the data sample
  # prefix, for the respective cutoff length. Used to speed up the process
  #
  # Leave it as a blank array to disable the feature
  ctx_len_warmup_steps: []
  # Learning rate of the training process
  # ---
  # Initial learning rate of the process
  lr_init: 6e-4
  # Final learning rate after the learning rate period
  # learning rate will stay at the final value from then onwards
  #
  # NOTE: lr_final / lr_period does not work with warmup_steps
  # and will be ignored (or replaced) with the warmup_steps logic instead
  lr_final: 2e-4
  # Number of epochs to reduce the learning rate from lr_init to lr_final
  # 1 means a single epoch (so lr would be lr_final from epoch 2 onwards)
  # 0 means lr_final will apply immediately
  # -1 means we take the current max_step / max_epoch as the period
  lr_period: 1
  # lr_period type if it is set, defaults to epoch
  lr_period_type: epoch
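  # With the values above, the schedule reads as: start at 6e-4, decay towards
  # 2e-4 over the first epoch, then hold at 2e-4 for the rest of the run
  # (a worked reading of the settings above, not an extra option).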
  # We disable bptt / limit bptt_learning_range to 1, to ensure high throughput within a multi-gpu setup
  # (by skipping some synchronization code). Additionally, bptt learning should not be triggered
  # anyway, as the data sample should be within ctx size 99% of the time
  bptt_learning: true
  bptt_learning_range: 1
  # Adam optimizer settings
  # You probably want to leave this alone, unless you know what you are doing
  beta1: 0.9
  beta2: 0.99
  adam_eps: 1.0e-08
  weight_decay: 0.01
  # torch.set_float32_matmul_precision, used to optimize operations with tensor cores
  # this should be set to null for non-CUDA GPUs
  torch_set_float32_matmul_precision: 'high'
  # torch_set_float32_matmul_precision: null
  # various other settings you probably should leave alone
  grad_cp: true
  warmup_steps: -1
  layerwise_lr: true
  dim_att: null
  dim_ffn: null
data:
  # dataset_path for the prebuilt dataset, using HF `load_from_disk()`
  #
  # Use this if you have built your own dataset and saved it with `save_to_disk()`
  # with source left as null. Otherwise configure this to a directory where the
  # dataset will be built and tokenized by the huggingface dataset process.
  data_path: ../datapath/musnet/
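  # A minimal sketch of the `save_to_disk()` mechanism the comment above refers to
  # (an assumption about using the `datasets` library directly; normally the trainer
  # builds and saves this path for you when `source` is configured, and the exact
  # columns/tokenization it expects are not covered here):
  #
  #   from datasets import load_dataset
  #   ds = load_dataset("breadlicker45/musenet-encoders-40k")
  #   ds.save_to_disk("../datapath/musnet/")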
  # Otherwise provide the source path, which is used as the huggingface dataset path;
  # this will be used to populate the dataset_path
  #
  # Use either of the following
  # - huggingface dataset name
  # - Directory path to a directory containing dataset files
  # - Path to a single dataset file
  # - huggingface dataset mode (ie: text, csv, etc - use data_dir to configure the path then)
  # - null
  #
  # If source is disabled, all other params, except data_path, are ignored
  source: "breadlicker45/musenet-encoders-40k"
  # source: text
  # source: /home/ubuntu/RWKV-LM-LoRA/dataset-text/enwik8.txt
  # Use data_dir, if you are using source=text/json/etc
  # this should be relative to the trainer script path
  source_data_dir: null
  # After loading the dataset, split out test data used for validation.
  # This process is skipped if the dataset includes a test split
  # This process is skipped if set to zero
  test_split: 0.005
  test_split_shuffle: true
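  # With test_split: 0.005, roughly 0.5% of the loaded samples are held out
  # for validation (a worked reading of the fraction above).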
  # Tokenizer to use, use either the inbuilt 'neox', or 'world' tokenizer
  # If using a custom tokenizer, provide the tokenizer file path
  # ---
  tokenizer: "breadlicker45/muse-tokenizer2"
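  # A hedged sketch of how you could inspect this tokenizer before training
  # (assumption, not part of the original config: it is a standard Hugging Face
  # tokenizer repo loadable with `transformers`):
  #
  #   from transformers import AutoTokenizer
  #   tok = AutoTokenizer.from_pretrained("breadlicker45/muse-tokenizer2")
  #   print(tok.vocab_size)  # compare against the model vocab (the V20259 in load_model suggests 20259)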
  # Minimum / Maximum token size of the dataset to use
  # useful for filtering out small noisy data samples from large datasets
  # (eg. removal of small articles of less than 512 tokens from wikipedia)
  #
  # This is ignored, if set to -1
  min_token_size: -1
  max_token_size: -1
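  # A hypothetical illustration (not part of the original config): the
  # wikipedia-style filter from the comment above would be
  # min_token_size: 512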
  # # Rechunking of text dataset, this is done only when source is set as 'text'
  # # and will merge the various sentences into larger chunks, up to the target size
  # #
  # # Defaults to 4096
  # #
  # # This is ignored, if source is not set as text
  # # This is ignored, if set to zero
  # # ---
  # text_rechunk_size: 4096
  # # Apply text rechunk to the dataset, even if it's not a 'text' source
  # # This is done only after dataset filtering, and if source is not 'text'
  # # ---
  # text_rechunk_force: true
  # Custom text column to use, useful for datasets with alternative training column labels
  # This is checked before multi column merging, default is null (disabled)
  # eg: 'code'
  # ---
  custom_text_key: 'bing'
  # Multi Column merging process, default setting is used to support and merge
  # "instruction", "input", "output" datasets. To disable, set multi_column_keys to []
  #
  # A minimum of 2 columns, with non-empty data, is required for the merge to occur
  # If no match is found, this will fall back to the default prompt/completion or text column,
  # or throw an error if the default fallback is not found
  # ---
  # multi_column_keys: ['instruction', 'input', 'output']
  # multi_column_prefix: ['Instruction:\n', 'Input:\n', 'Output:\n']
  # multi_column_masking: [false, true, false]
  # multi_column_seperator: '\n\n'
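  # Roughly speaking (an illustration inferred from the prefixes/separator above,
  # not an exact spec of the merge), an instruction/input/output row would be
  # merged into a single training sample along the lines of:
  #
  #   Instruction:
  #   <instruction text>
  #
  #   Input:
  #   <input text>
  #
  #   Output:
  #   <output text>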
  # If processing prompt/completion jsonl pairs, the prompt is masked by default
  # use this flag to disable this default behaviour
  # ---
  # disable_prompt_mask: false
# Path to the current checkpoint to continue training from
# Set this to the last checkpoint after the first run
# (if it crashes and you want to resume)
# ckpt_path: ../checkpoint/Echo-B-1B4-enwiki/epoch=0-step=2500.ckpt
ckpt_path: null