diff --git a/example/rlhf/supervised_finetuning_d2l.py b/example/rlhf/supervised_finetuning_d2l.py
index 0e50a85..172d1f3 100644
--- a/example/rlhf/supervised_finetuning_d2l.py
+++ b/example/rlhf/supervised_finetuning_d2l.py
@@ -29,9 +29,16 @@ from peft import LoraConfig
 config = RLHFConfig(base_model_path="mistralai/Mistral-7B-Instruct-v0.1",
                     dataset_type="local_csv",
                     dataset_name="data/chapter22_trnvalfromseed_data_processed.csv",
-                    train_test_split_ratio=0.1,
+                    train_test_split_ratio=0,  # ratio for the test set; TODO(dh): combine train and eval
                     max_seq_length=896,
                     per_device_eval_batch_size = 1,
+                    log_freq=20,
+                    # NOTE(dh): one epoch iterates the dataset once, so log_freq=20 logs every 20 entries
+                    # when the training batch size is 1 (i.e., log_freq = 0.12 epoch for a 166-entry dataset).
+                    save_freq=40000,
+                    num_train_epochs=20,
+                    max_steps=-1,  # a positive value overrides num_train_epochs
+                    device_map="auto",
                     lora_config_rl = LoraConfig(
                         r=512,
                         lora_alpha=1024,
diff --git a/pykoi/rlhf/config.py b/pykoi/rlhf/config.py
index c34d68e..10d7184 100644
--- a/pykoi/rlhf/config.py
+++ b/pykoi/rlhf/config.py
@@ -120,6 +120,7 @@ class RLHFConfig:
         default="./rlhf_checkpoints",
         metadata={"help": "Output directory for all model weights."},
     )
+    num_train_epochs: Optional[int] = field(default=5, metadata={"help": "Number of supervised fine-tuning epochs."})
     log_freq: Optional[int] = field(default=1, metadata={"help": "Logging frequency."})
     eval_freq: Optional[int] = field(
         default=1000, metadata={"help": "Evaluation frequency."}
diff --git a/pykoi/rlhf/supervised_finetuning.py b/pykoi/rlhf/supervised_finetuning.py
index c5e8ed6..27c8369 100644
--- a/pykoi/rlhf/supervised_finetuning.py
+++ b/pykoi/rlhf/supervised_finetuning.py
@@ -84,6 +84,7 @@ def __init__(self, rlhf_config: RLHFConfig, enable_telemetry: bool = True) -> None:
             weight_decay=self._rlhf_config.weight_decay,
             run_name="step1_supervised_finetuning",
             ddp_find_unused_parameters=False,
+            num_train_epochs=self._rlhf_config.num_train_epochs,
         )
         self.model = AutoModelForCausalLM.from_pretrained(
             self._rlhf_config.base_model_path,
@@ -257,9 +258,14 @@ def create_datasets(self, tokenizer, args):
             )
             dataset = Dataset.from_dict(my_data_pd)
         elif args.dataset_type == "local_csv":
-            dataset = load_dataset("csv", data_files=args.dataset_name)
-            dataset = dataset[args.split] # Convert DatasetDict to Dataset
-            dataset2 = load_dataset("csv", data_files=args.dataset_name, split='train[:10%]')
+            # Loading the full CSV this way yields 1660 entries:
+            # dataset = load_dataset("csv", data_files=args.dataset_name)
+            # dataset = dataset[args.split]  # Convert DatasetDict to Dataset
+
+            # Loading only the first 10% yields 166 entries:
+
+            dataset = load_dataset("csv", data_files=args.dataset_name, split='train[:10%]')
+
         elif args.dataset_type == "huggingface":
             dataset = load_dataset(
                 args.dataset_name,
@@ -275,29 +281,34 @@ def create_datasets(self, tokenizer, args):
                 "No (supported) data files or dataset script found"
                 f" {args.dataset_type}"
             )
-
-        dataset = dataset.train_test_split(
-            test_size=args.train_test_split_ratio, seed=args.seed
-        )
+
+        # NOTE(dh): temporary change, no test set.
+        # dataset = dataset.train_test_split(
+        #     test_size=args.train_test_split_ratio, seed=args.seed
+        # )
         print(
-            f"Size of the train set: {len(dataset['train'])}. "
-            f" Size of the validation set: {len(dataset['test'])}"
+            f"Size of the train set: {len(dataset)}. "
+            # f"Size of the train set: {len(dataset['train'])}. "
+            # f" Size of the validation set: {len(dataset['test'])}"
         )

         train_dataset = ConstantLengthDataset(
             tokenizer,
-            dataset["train"],
+            dataset,
+            # dataset["train"],  # NOTE(dh): temporary change, no test set
             formatting_func=self.prepare_d2l_text,
             infinite=True,
             seq_length=args.max_seq_length,
             # chars_per_token=chars_per_token,
         )
-        eval_dataset = ConstantLengthDataset(
-            tokenizer,
-            dataset["test"],
-            formatting_func=self.prepare_d2l_text,
-            infinite=False,
-            seq_length=args.max_seq_length,
-            # chars_per_token=chars_per_token,
-        )
+        # Temporary change: no test set.
+        # eval_dataset = ConstantLengthDataset(
+        #     tokenizer,
+        #     dataset["test"],
+        #     formatting_func=self.prepare_d2l_text,
+        #     infinite=False,
+        #     seq_length=args.max_seq_length,
+        #     # chars_per_token=chars_per_token,
+        # )
+        eval_dataset = None
         return {"train": train_dataset, "eval": eval_dataset}