SFT for D2L. DPO #101

Closed
wants to merge 11 commits
Support training multiple epochs in SFT.
Temporarily use all entries in the dataset as the training set (i.e., no eval).
llauraa23 committed Jan 10, 2024
commit 1428aba89ece4b947224715e2d58e0da34ec53d2
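
This commit threads a new num_train_epochs setting through RLHFConfig and into the SFT trainer, and temporarily trains on every entry of the dataset with no held-out eval split. A minimal usage sketch of the intended call pattern; the pykoi.rlhf import path, the SupervisedFinetuning class name, and the train_and_save call are assumptions drawn from pykoi's example scripts, not from this diff:

from pykoi.rlhf import RLHFConfig, SupervisedFinetuning  # assumed public import path

config = RLHFConfig(
    base_model_path="mistralai/Mistral-7B-Instruct-v0.1",
    dataset_type="local_csv",
    dataset_name="data/chapter22_trnvalfromseed_data_processed.csv",
    num_train_epochs=20,  # new field introduced in this commit
    max_steps=-1,         # keep epoch-based stopping; a positive value would override it
)
trainer = SupervisedFinetuning(config)
trainer.train_and_save("./models/rlhf_step1_sft")  # assumed entry point and output path
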
example/rlhf/supervised_finetuning_d2l.py (9 changes: 8 additions & 1 deletion)
@@ -29,9 +29,16 @@
 from peft import LoraConfig
 config = RLHFConfig(base_model_path="mistralai/Mistral-7B-Instruct-v0.1",
 dataset_type="local_csv", dataset_name="data/chapter22_trnvalfromseed_data_processed.csv",
-train_test_split_ratio=0.1,
+train_test_split_ratio=0,  # ratio for the test set. DH: TODO: COMBINE TRAIN AND EVAL
 max_seq_length=896,
+per_device_eval_batch_size=1,
+log_freq=20,
+# dh: NOTE: one epoch iterates the dataset once, so log_freq=20 means iterating 20 entries
+# when the training batch size is 1 (i.e., log_freq = 0.12 epoch for a dataset with 166 entries).
+save_freq=40000,
+num_train_epochs=20,
+max_steps=-1,  # if a positive number is given, it will override num_train_epochs
 device_map="auto",
 lora_config_rl = LoraConfig(
 r=512,
 lora_alpha=1024,
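
The log_freq note in the hunk above is just a ratio of logging steps to steps per epoch. Spelled out under the commit's own assumptions (166 training entries, per-device train batch size 1, no gradient accumulation); note that ConstantLengthDataset packs examples into fixed-length chunks, so the real step count can differ somewhat:

# Back-of-the-envelope arithmetic for the log_freq comment above.
dataset_size = 166        # entries in the 10% CSV slice (from the diff comments)
train_batch_size = 1      # assumed per-device train batch size
steps_per_epoch = dataset_size // train_batch_size   # 166 optimizer steps per epoch
log_freq = 20
print(f"logging every {log_freq / steps_per_epoch:.2f} epoch")  # ~0.12 epoch
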
pykoi/rlhf/config.py (1 change: 1 addition & 0 deletions)
@@ -120,6 +120,7 @@ class RLHFConfig:
 default="./rlhf_checkpoints",
 metadata={"help": "Output directory for all model weights."},
 )
+num_train_epochs: Optional[int] = field(default=5, metadata={"help": "Supervised fine-tuning training epochs."})
 log_freq: Optional[int] = field(default=1, metadata={"help": "Logging frequency."})
 eval_freq: Optional[int] = field(
 default=1000, metadata={"help": "Evaluation frequency."}
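
The new field only takes effect once it reaches Hugging Face's TrainingArguments, which is what the first hunk of the next file does. A condensed sketch of that wiring; the mapping of log_freq and save_freq onto logging_steps and save_steps is assumed from the config names, not shown in this diff:

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./rlhf_checkpoints",
    num_train_epochs=20,  # rlhf_config.num_train_epochs, added in this commit
    max_steps=-1,         # in the HF Trainer, a positive max_steps overrides num_train_epochs
    logging_steps=20,     # assumed to carry log_freq
    save_steps=40000,     # assumed to carry save_freq
    run_name="step1_supervised_finetuning",
    ddp_find_unused_parameters=False,
)
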
pykoi/rlhf/supervised_finetuning.py (47 changes: 29 additions & 18 deletions)
@@ -84,6 +84,7 @@ def __init__(self, rlhf_config: RLHFConfig, enable_telemetry: bool = True) -> None:
 weight_decay=self._rlhf_config.weight_decay,
 run_name="step1_supervised_finetuning",
 ddp_find_unused_parameters=False,
+num_train_epochs=self._rlhf_config.num_train_epochs,
 )
 self.model = AutoModelForCausalLM.from_pretrained(
 self._rlhf_config.base_model_path,
@@ -257,9 +258,14 @@ def create_datasets(self, tokenizer, args):
 )
 dataset = Dataset.from_dict(my_data_pd)
 elif args.dataset_type == "local_csv":
-dataset = load_dataset("csv", data_files=args.dataset_name)
-dataset = dataset[args.split]  # Convert DatasetDict to Dataset
-dataset2 = load_dataset("csv", data_files=args.dataset_name, split='train[:10%]')
+## this way will load 1660 entries
+# dataset = load_dataset("csv", data_files=args.dataset_name)
+# dataset = dataset[args.split]  # Convert DatasetDict to Dataset
+
+# this way will load 166 entries
+
+dataset = load_dataset("csv", data_files=args.dataset_name, split='train[:10%]')
+
 elif args.dataset_type == "huggingface":
 dataset = load_dataset(
 args.dataset_name,
@@ -275,29 +281,34 @@ def create_datasets(self, tokenizer, args):
 "No (supported) data files or dataset script found"
 f" {args.dataset_type}"
 )
-
-dataset = dataset.train_test_split(
-test_size=args.train_test_split_ratio, seed=args.seed
-)
+
+# dh: temp change. No test set
+# dataset = dataset.train_test_split(
+# test_size=args.train_test_split_ratio, seed=args.seed
+# )
 print(
-f"Size of the train set: {len(dataset['train'])}. "
-f" Size of the validation set: {len(dataset['test'])}"
+f"Size of the train set: {len(dataset)}. "
+#f"Size of the train set: {len(dataset['train'])}. "
+#f" Size of the validation set: {len(dataset['test'])}"
 )

 train_dataset = ConstantLengthDataset(
 tokenizer,
-dataset["train"],
+dataset,
+#dataset["train"],  #dh: temp change. No test set
 formatting_func=self.prepare_d2l_text,
 infinite=True,
 seq_length=args.max_seq_length,
 # chars_per_token=chars_per_token,
 )
-eval_dataset = ConstantLengthDataset(
-tokenizer,
-dataset["test"],
-formatting_func=self.prepare_d2l_text,
-infinite=False,
-seq_length=args.max_seq_length,
-# chars_per_token=chars_per_token,
-)
+# temp change: no test set
+# eval_dataset = ConstantLengthDataset(
+# tokenizer,
+# dataset["test"],
+# formatting_func=self.prepare_d2l_text,
+# infinite=False,
+# seq_length=args.max_seq_length,
+# # chars_per_token=chars_per_token,
+# )
+eval_dataset = None
 return {"train": train_dataset, "eval": eval_dataset}
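
Taken together, the create_datasets changes replace the DatasetDict load plus train_test_split with a 10% slice taken at load time, and drop the eval set entirely. A standalone sketch of the two loading paths; the CSV path and the 1660/166 counts come from the comments in the diff, and the exact numbers depend on the file:

from datasets import load_dataset

csv_path = "data/chapter22_trnvalfromseed_data_processed.csv"

# Previous behaviour: load the whole CSV as a DatasetDict, take its "train"
# split, then carve out a validation set with train_test_split().
full = load_dataset("csv", data_files=csv_path)["train"]  # ~1660 rows per the old comment
splits = full.train_test_split(test_size=0.1, seed=0)     # {"train": ..., "test": ...}

# This commit: slice at load time and keep only the first 10% of rows, all of
# which go to training; create_datasets now returns the eval entry as None.
subset = load_dataset("csv", data_files=csv_path, split="train[:10%]")  # ~166 rows

print(len(full), len(splits["train"]), len(subset))
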