###############################################
##
## Trainer settings are kept minimal
## with checkpoint / model saving disabled,
## as this tutorial focuses only on datasets
##
## It only runs 10 steps, enough to prove that
## the dataset configs are valid
##
## See the full `config-example.yaml` for more
## details on the trainer/model configs
##
###############################################
trainer:
  max_steps: 10
  # Reasonable batch size, for a more realistic it/s rate
  target_batch_size: 32
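  # (how the target batch size is reached, eg. via gradient accumulation
  # across devices, depends on the trainer setup - see config-example.yaml)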

model:
  load_model: ../model/L6-D512-neox-init.pth
  ctx_len: 1024
  lr_init: 3e-4

  # BPTT learning allows you to run the trainer against datasets
  # larger than the training context length
  bptt_learning: true
  bptt_learning_range: -1
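  # (a bptt_learning_range of -1 means no range limit, ie. BPTT is applied
  # across the full data sample)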

########################################
## Dataset settings
########################################
data:
  # data_path for the prebuilt dataset, using HF `load_from_disk()`
  #
  # Use this if you have built your own dataset and saved it with `save_to_disk()`
  # with source left as null. Otherwise, configure this to the directory in which
  # the dataset will be built and tokenized by the huggingface dataset process.
  data_path: ../datapath/neox/enwiki_10k_2048/

  # Otherwise, provide the source path, which is used as the huggingface dataset path;
  # this will be used to populate the data_path
  #
  # Use one of the following
  # - huggingface dataset
  # - Directory path to a directory containing dataset files
  # - Path to a single dataset file
  # - huggingface dataset mode (ie: text, csv, etc - use source_data_dir to configure the path then)
  # - null
  #
  # If source is disabled, all other params, except data_path, are ignored
  source: "teven/enwiki_10k"
  # source: text
  # source: /home/ubuntu/RWKV-LM-LoRA/dataset-text/enwik8.txt

  # # Additional source dataset params, used to grab subsets of the dataset
  # source_dataset_params:
  #   language: en

  # # Use source_data_dir if you are using source=text/json/etc
  # # this should be relative to the trainer script path
  # source_data_dir: null

  # Tokenizer to use: either the inbuilt 'neox' or 'world' tokenizer
  # If using a custom tokenizer, provide the tokenizer file path
  # ---
  tokenizer: neox
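  # For a custom tokenizer, point this at your tokenizer file instead
  # (hypothetical path shown below - adjust to wherever your tokenizer lives)
  # tokenizer: ../tokenizer/custom-tokenizer.json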

  # Minimum / Maximum token size of the dataset samples to use
  # useful for filtering out small noisy data samples from large datasets
  # (eg. removal of small articles of less than 512 tokens from wikipedia)
  #
  # This is ignored, if set to -1
  min_token_size: 64
  max_token_size: -1
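  # For example, to apply the wikipedia filter described above:
  # min_token_size: 512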

  # Rechunking of the text dataset; this is done only when source is set as 'text'
  # and will merge the various sentences into larger chunks, up to the target size
  #
  # Defaults to 4096
  #
  # This is ignored, if source is not set as text
  # This is ignored, if set to zero
  # ---
  text_rechunk_size: 2048

  # Apply text rechunk to the dataset, even if it's not a 'text' source
  # This is done only after dataset filtering, and if source is not 'text'
  # ---
  text_rechunk_force: true

  # Custom text column to use, useful for datasets with alternative training column labels
  # This is checked before multi column merging, default is null (disabled)
  # eg: 'code'
  # ---
  custom_text_key: 'text'

  # Multi Column merging process, whose default setting is used to support and merge
  # "instruction", "input", "output" datasets. To disable, set multi_column_keys to []
  #
  # A minimum of 2 columns, with non-empty data, is required for the merge to occur
  # If no match is found, this will fall back to the default prompt/completion or text column,
  # or throw an error if the default fallback is not found
  # ---
  # multi_column_keys: ['instruction', 'input', 'output']
  # multi_column_prefix: ['Instruction:\n', 'Input:\n', 'Output:\n']
  # multi_column_train_mask: [true, false, true]
  # multi_column_separator: '\n\n'
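  #
  # With the example settings above, a merged sample would look roughly like:
  #   Instruction:\n{instruction}\n\nInput:\n{input}\n\nOutput:\n{output}
  # with the train mask excluding the 'input' segment from the loss
  # (a sketch of the expected merge output, based on the prefixes/separator above)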

  # If processing prompt/completion jsonl pairs, the prompt is masked by default
  # use this flag to disable this default behaviour
  # ---
  # disable_prompt_completion_mask: false

  # After loading the dataset, split out the test data used for validation
  # This process is skipped if the dataset includes a test split
  # This process is skipped if set to zero
  test_split: 0.01
  test_split_shuffle: false
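  # (test_split: 0.01 holds out 1% of the samples for validation; enable
  # test_split_shuffle if your dataset is ordered and you want a randomized split)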