support training multiple epochs in sft.
Temporarily use all entries in the dataset as the training dataset
(i.e., no eval).
llauraa23 committed Jan 10, 2024
1 parent 878f44e commit 1428aba
Showing 3 changed files with 38 additions and 19 deletions.
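In short, this commit adds a num_train_epochs field to RLHFConfig and threads it through to the SFT trainer, while temporarily training on the whole dataset with no eval split. A minimal sketch of the user-facing knob (the import path is assumed from the file layout in this diff; the values mirror the example script below):

from pykoi.rlhf.config import RLHFConfig  # assumed import path, based on pykoi/rlhf/config.py

config = RLHFConfig(
    base_model_path="mistralai/Mistral-7B-Instruct-v0.1",
    dataset_type="local_csv",
    dataset_name="data/chapter22_trnvalfromseed_data_processed.csv",
    num_train_epochs=20,  # new field in this commit (default: 5)
    max_steps=-1,         # keep negative so the epoch count limits training
)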
9 changes: 8 additions & 1 deletion example/rlhf/supervised_finetuning_d2l.py
@@ -29,9 +29,16 @@
from peft import LoraConfig
config = RLHFConfig(base_model_path="mistralai/Mistral-7B-Instruct-v0.1",
dataset_type="local_csv", dataset_name="data/chapter22_trnvalfromseed_data_processed.csv",
train_test_split_ratio=0.1,
train_test_split_ratio=0, # ratio for the test set. DH: TODO: COMBINE TRAIN AND EVAL
max_seq_length=896,
per_device_eval_batch_size = 1,
log_freq=20,
# dh: NOTE: one epoch iterates the dataset once, so log_freq=20 means logging after every 20 entries when the training batch size is 1
# (i.e., log_freq = 20 steps ~= 0.12 epoch when the dataset has 166 entries).
save_freq=40000,
num_train_epochs=20,
max_steps=-1, # if a positive number is given, it will override num_train_epochs
device_map="auto",
lora_config_rl = LoraConfig(
r=512,
lora_alpha=1024,
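As a quick sanity check on the log_freq comment above, a small sketch of the step arithmetic, assuming a per-device training batch size of 1 and no gradient accumulation (dataset size and log_freq are the values quoted in the comment):

dataset_size = 166        # entries after loading split='train[:10%]'
train_batch_size = 1      # assumed per-device training batch size
steps_per_epoch = dataset_size // train_batch_size  # 166 optimizer steps per epoch

log_freq = 20
print(f"log_freq = {log_freq} steps ~= {log_freq / steps_per_epoch:.2f} epoch")  # ~0.12 epoch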
1 change: 1 addition & 0 deletions pykoi/rlhf/config.py
@@ -120,6 +120,7 @@ class RLHFConfig:
default="./rlhf_checkpoints",
metadata={"help": "Output directory for all model weights."},
)
num_train_epochs: Optional[int] = field(default=5, metadata={"help": "Number of supervised fine-tuning epochs."})
log_freq: Optional[int] = field(default=1, metadata={"help": "Logging frequency."})
eval_freq: Optional[int] = field(
default=1000, metadata={"help": "Evaluation frequency."}
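Once this field is forwarded into the TrainingArguments built in supervised_finetuning.py (see the change below), the usual Hugging Face precedence applies: a positive max_steps overrides num_train_epochs. A minimal sketch with illustrative values (not read from RLHFConfig):

from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="./rlhf_checkpoints",
    num_train_epochs=20,  # honored while max_steps stays at its default of -1
    max_steps=-1,         # any positive value here would override num_train_epochs
    logging_steps=20,
    save_steps=40000,
)
print(args.num_train_epochs, args.max_steps)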
47 changes: 29 additions & 18 deletions pykoi/rlhf/supervised_finetuning.py
@@ -84,6 +84,7 @@ def __init__(self, rlhf_config: RLHFConfig, enable_telemetry: bool = True) -> None:
weight_decay=self._rlhf_config.weight_decay,
run_name="step1_supervised_finetuning",
ddp_find_unused_parameters=False,
num_train_epochs=self._rlhf_config.num_train_epochs,
)
self.model = AutoModelForCausalLM.from_pretrained(
self._rlhf_config.base_model_path,
@@ -257,9 +258,14 @@ def create_datasets(self, tokenizer, args):
)
dataset = Dataset.from_dict(my_data_pd)
elif args.dataset_type == "local_csv":
dataset = load_dataset("csv", data_files=args.dataset_name)
dataset = dataset[args.split] # Convert DatasetDict to Dataset
dataset2 = load_dataset("csv", data_files=args.dataset_name, split='train[:10%]')
## this way will load 1660 entries
# dataset = load_dataset("csv", data_files=args.dataset_name)
# dataset = dataset[args.split] # Convert DatasetDict to Dataset

# this way will load 166 entries

dataset = load_dataset("csv", data_files=args.dataset_name, split='train[:10%]')

elif args.dataset_type == "huggingface":
dataset = load_dataset(
args.dataset_name,
@@ -275,29 +281,34 @@ def create_datasets(self, tokenizer, args):
"No (supported) data files or dataset script found"
f" {args.dataset_type}"
)

dataset = dataset.train_test_split(
test_size=args.train_test_split_ratio, seed=args.seed
)

# dh: temp change. No test set
# dataset = dataset.train_test_split(
# test_size=args.train_test_split_ratio, seed=args.seed
# )
print(
f"Size of the train set: {len(dataset['train'])}. "
f" Size of the validation set: {len(dataset['test'])}"
f"Size of the train set: {len(dataset)}. "
#f"Size of the train set: {len(dataset['train'])}. "
#f" Size of the validation set: {len(dataset['test'])}"
)

train_dataset = ConstantLengthDataset(
tokenizer,
dataset["train"],
dataset,
#dataset["train"], #dh: temp change. No test set
formatting_func=self.prepare_d2l_text,
infinite=True,
seq_length=args.max_seq_length,
# chars_per_token=chars_per_token,
)
eval_dataset = ConstantLengthDataset(
tokenizer,
dataset["test"],
formatting_func=self.prepare_d2l_text,
infinite=False,
seq_length=args.max_seq_length,
# chars_per_token=chars_per_token,
)
# temp change: no test set
# eval_dataset = ConstantLengthDataset(
# tokenizer,
# dataset["test"],
# formatting_func=self.prepare_d2l_text,
# infinite=False,
# seq_length=args.max_seq_length,
# # chars_per_token=chars_per_token,
# )
eval_dataset = None
return {"train": train_dataset, "eval": eval_dataset}
