diff --git a/example/rlhf/demo_rl.py b/example/rlhf/demo_rl.py
index 0ac2346..91bce9a 100644
--- a/example/rlhf/demo_rl.py
+++ b/example/rlhf/demo_rl.py
@@ -13,11 +13,11 @@
 # use huggingface sft and reward model
 config = pykoi.RLHFConfig(
-    base_model_path="elinas/llama-7b-hf-transformers-4.29",  # "elinas/llama-7b-hf-transformers-4.29",
+    base_model_path="models/rlhf_step1_sft",  # "elinas/llama-7b-hf-transformers-4.29",
     dataset_type="huggingface",
     dataset_name="goldmermaid/stack_exchange_rank_10k_dataset",
     dataset_subset_rl="data",
-    reward_model_path="cambioml/rlhf-reward-model",
+    reward_model_path="models/rlhf_step2_rw/",  # "cambioml/rlhf_reward_model",
     save_freq=1,
     ppo_batch_size=32,
     ppo_epochs=4,
diff --git a/example/rlhf/demo_rw_finetuning.py b/example/rlhf/demo_rw_finetuning.py
index 69cb541..c6913c5 100644
--- a/example/rlhf/demo_rw_finetuning.py
+++ b/example/rlhf/demo_rw_finetuning.py
@@ -26,6 +26,6 @@
 # run reward model finetuning
 # config = pykoi.RLHFConfig(dataset_type="local_db")
-config = pykoi.RLHFConfig()
+config = pykoi.RLHFConfig(reward_model_path="databricks/dolly-v2-3b")
 rlhf_step2_rft = pykoi.RewardFinetuning(config)
 rlhf_step2_rft.train_and_save("./models/rlhf_step2_rw")
diff --git a/example/rlhf/supervised_finetuning_demo.py b/example/rlhf/supervised_finetuning_demo.py
index 8596e46..bfc29a3 100644
--- a/example/rlhf/supervised_finetuning_demo.py
+++ b/example/rlhf/supervised_finetuning_demo.py
@@ -25,6 +25,6 @@
 print("My local database has {} samples in total".format(my_data_pd.shape[0]))
 
 # run supervised finetuning
-config = pykoi.RLHFConfig(base_model_path="elinas/llama-7b-hf-transformers-4.29", dataset_type="local_db")
+config = pykoi.RLHFConfig(base_model_path="databricks/dolly-v2-3b", dataset_type="local_db")
 rlhf_step1_sft = pykoi.SupervisedFinetuning(config)
 rlhf_step1_sft.train_and_save("./models/rlhf_step1_sft")
diff --git a/pykoi/rlhf/rl_finetuning.py b/pykoi/rlhf/rl_finetuning.py
index c3f105a..009cd14 100644
--- a/pykoi/rlhf/rl_finetuning.py
+++ b/pykoi/rlhf/rl_finetuning.py
@@ -27,6 +27,9 @@
 )
 from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer
 from trl.core import LengthSampler
+from huggingface_hub import hf_hub_download
+from transformers import AutoModelForCausalLM
+from peft import PeftModel, PeftConfig, AutoPeftModelForCausalLM
 from pykoi.telemetry.telemetry import Telemetry
 from pykoi.telemetry.events import (
     RLStartEvent,
@@ -76,12 +79,56 @@ def __init__(self,
             rlhf_config.reward_model_path
         )
         self.reward_dataset = self.create_dataset(self.reward_tokenizer)
-        self.reward_model = AutoModelForSequenceClassification.from_pretrained(
-            rlhf_config.reward_model_path,
-            num_labels=1,
-            load_in_8bit=True,
-            device_map={"": Accelerator().local_process_index},
+        # self.reward_model = AutoModelForSequenceClassification.from_pretrained(
+        #     rlhf_config.reward_model_path,
+        #     num_labels=1,
+        #     load_in_8bit=True,
+        #     device_map={"": Accelerator().local_process_index},
+        # )
+
+        reward_model_path = rlhf_config.reward_model_path
+
+        try:
+            # If there is a trained peft adapter in the hub, load its config.
+            remote_adapter_config_reward = hf_hub_download(reward_model_path, "adapter_config.json")
+        except Exception:
+            remote_adapter_config_reward = None
+
+        local_adapter_present_reward = os.path.exists(
+            os.path.join(reward_model_path, "adapter_config.json")
+        )
+
+        # Load the trained peft adapter config
+        if local_adapter_present_reward:
+            trained_adapter_config_reward = PeftConfig.from_pretrained(reward_model_path)
+        else:
+            trained_adapter_config_reward = PeftConfig.from_pretrained(remote_adapter_config_reward)
+
+        ## Load the pretrained base model
+        pretrained_kwargs_reward = {
+            "num_labels": 1,
+            "load_in_8bit": False,  # True,
+            "device_map": {"": Accelerator().local_process_index},
+        }  # TODO: ADD
+        pretrained_model_reward = AutoModelForSequenceClassification.from_pretrained(
+            trained_adapter_config_reward.base_model_name_or_path,
+            **pretrained_kwargs_reward
         )
+        ## TODO: LOAD MERGED BASE MODEL FROM STEP 2
+
+        # Load the peft model by combining the base model with the trained adapter.
+        # The reward adapter is loaded frozen (is_trainable=False) and merged into the base weights.
+        reward_model = PeftModel.from_pretrained(pretrained_model_reward, reward_model_path, is_trainable=False)
+        self.reward_model = reward_model.merge_and_unload()
+        print("\nTrained peft adapter loaded for reward model\n")
+        # pad_token_id has to be set explicitly, otherwise batched inference fails with
+        # "Cannot handle batch sizes > 1 if no padding token is defined". See
+        # https://stackoverflow.com/questions/68084302/assertionerror-cannot-handle-batch-sizes-1-if-no-padding-token-is-defined
+        self.reward_model.config.pad_token_id = self.reward_tokenizer.pad_token_id
+
         self.reward_kwargs = {
             "top_k": None,
             "function_to_apply": "none",
@@ -102,12 +149,93 @@ def __init__(self,
         ## Load the base model and tokenizer and define the PPO Trainer for RL
         self.base_tokenizer = self.create_tokenizer(rlhf_config.base_model_path)
         self.base_dataset = self.create_dataset(self.base_tokenizer)
-        self.base_model = AutoModelForCausalLMWithValueHead.from_pretrained(
-            rlhf_config.base_model_path,
-            load_in_8bit=rlhf_config.load_in_8bit,
-            device_map={"": Accelerator().local_process_index},
-            peft_config=rlhf_config.lora_config_rl,
+
+        pretrained_model_name_or_path = rlhf_config.base_model_path
+        # NOTE/TODO: the peft config is inferred directly from the pretrained adapter, so
+        # rlhf_config.lora_config_rl (used in the previous implementation) is ignored. Do we
+        # want to use it instead, i.e. merge the adapter into the base model and then attach
+        # a fresh peft adapter?
+
+        pretrained_kwargs = {
+            "load_in_8bit": rlhf_config.load_in_8bit,
+            "device_map": {"": Accelerator().local_process_index},
+        }
+
+        assert isinstance(pretrained_model_name_or_path, str), "The `pretrained_model_path` should be a string."
+        try:
+            # If there is a trained peft adapter in the hub, load its config.
+            remote_adapter_config = hf_hub_download(pretrained_model_name_or_path, "adapter_config.json")
+        except Exception:
+            remote_adapter_config = None
+
+        local_adapter_present = os.path.exists(
+            os.path.join(pretrained_model_name_or_path, "adapter_config.json")
+        )
+
+        # Load the trained peft adapter config
+        if local_adapter_present:
+            trained_adapter_config = PeftConfig.from_pretrained(pretrained_model_name_or_path)
+        else:
+            trained_adapter_config = PeftConfig.from_pretrained(remote_adapter_config)
+
+        # Load the pretrained base model
+        pretrained_model = AutoModelForCausalLM.from_pretrained(
+            trained_adapter_config.base_model_name_or_path,
+            **pretrained_kwargs
         )
+
+        # Load the peft model by combining the base model with the trained adapter
+        is_trainable = True  # TODO: if following the merge-then-train-new-adapter flow, this adapter should not be trainable!
+        pretrained_model = PeftModel.from_pretrained(pretrained_model, pretrained_model_name_or_path, is_trainable=is_trainable)
+        print("\nTrained peft adapter loaded for policy model\n")
+
+        # Alternatively, a peft model could be loaded directly from a local path
+        # (see https://huggingface.co/docs/peft/quicktour), but this did not work here.  # TODO: DELETE
+        # peft_model = AutoPeftModelForCausalLM.from_pretrained(pretrained_model_name_or_path)
+
+        # Add a value head to the pretrained peft model to create the policy network.
+        is_peft_model = isinstance(pretrained_model, PeftModel)
+        trl_model_args = {}  # args for the value head
+        # TODO: the v_head weights are initialized with v_head_init_strategy="random" by default;
+        # trl also supports initialization with "norm".
+        model = AutoModelForCausalLMWithValueHead(pretrained_model, **trl_model_args)
+        # TODO: 1. the value head has requires_grad=False and is not on CUDA; check whether the
+        # code below fixes this. 2. PeftModel.print_trainable_parameters() returns ... and None.
+
+        # For backward compatibility with AutoModelForCausalLMWithValueHead: is_peft_model has to
+        # be set, otherwise calling model.state_dict() will fail.
+        model.is_peft_model = is_peft_model
+        # For backward compatibility
+        model.is_sequential_parallel = True
+        model.current_device = Accelerator().local_process_index
+        reward_adapter = None  # TODO: consider adding a reward adapter here?
+        if is_peft_model and reward_adapter is not None:
+            model.add_and_load_reward_modeling_adapter(reward_adapter)
+            model.supports_rm_adapter = True
+        else:
+            model.supports_rm_adapter = False
+
+        # Move v_head to the model's first device and register a hook, mirroring
+        # AutoModelForCausalLMWithValueHead.post_init().
+        # TODO: is register_forward_hook necessary? The outputs should already be on CUDA.
+        first_device = list(set(model.pretrained_model.hf_device_map.values()))[0]
+        model.v_head = model.v_head.to(first_device)
+
+        def set_device_hook(module, input, outputs):
+            new_output = ()
+            for output in outputs:
+                if isinstance(output, torch.Tensor):
+                    new_output += (output.to(first_device),)
+                else:
+                    new_output += (output,)
+            return new_output
+
+        model.register_forward_hook(set_device_hook)
+        self.base_model = model
+        # self.base_model = AutoModelForCausalLMWithValueHead.from_pretrained(
+        #     rlhf_config.base_model_path,
+        #     load_in_8bit=rlhf_config.load_in_8bit,
+        #     device_map={"": Accelerator().local_process_index},
+        #     peft_config=rlhf_config.lora_config_rl,
+        # )
         self.ppo_trainer = PPOTrainer(
             config=self.ppo_config,
             model=self.base_model,
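
For review, the reward-model change in rl_finetuning.py boils down to the standard peft load-and-merge pattern. A minimal standalone sketch of that pattern, assuming "./models/rlhf_step2_rw" is the adapter directory written by the step-2 demo (the path is illustrative and not part of this diff's code):

from peft import PeftConfig, PeftModel
from transformers import AutoModelForSequenceClassification

adapter_path = "./models/rlhf_step2_rw"  # assumed: adapter saved by demo_rw_finetuning.py

# Read the adapter config to find out which base model it was trained on.
peft_config = PeftConfig.from_pretrained(adapter_path)

# Load that base model, attach the frozen adapter, then fold the adapter weights
# into the base weights so downstream code sees a plain transformers model.
base_model = AutoModelForSequenceClassification.from_pretrained(
    peft_config.base_model_name_or_path, num_labels=1
)
reward_model = PeftModel.from_pretrained(base_model, adapter_path, is_trainable=False)
reward_model = reward_model.merge_and_unload()

The policy model follows the same pattern, except its adapter is left attached and trainable so PPO can keep updating it under the value head.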
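
Taken together, the demo changes chain the three RLHF steps so each stage consumes the previous stage's output. A condensed sketch of that flow, with the argument values copied from the demos above (the step-3 training call is omitted because it is not shown in this diff):

import pykoi

# Step 1: supervised fine-tuning on the local feedback database
sft_config = pykoi.RLHFConfig(base_model_path="databricks/dolly-v2-3b", dataset_type="local_db")
pykoi.SupervisedFinetuning(sft_config).train_and_save("./models/rlhf_step1_sft")

# Step 2: reward-model fine-tuning, starting from the same base model
rw_config = pykoi.RLHFConfig(reward_model_path="databricks/dolly-v2-3b")
pykoi.RewardFinetuning(rw_config).train_and_save("./models/rlhf_step2_rw")

# Step 3: PPO fine-tuning against the artifacts produced by steps 1 and 2
rl_config = pykoi.RLHFConfig(
    base_model_path="models/rlhf_step1_sft",
    dataset_type="huggingface",
    dataset_name="goldmermaid/stack_exchange_rank_10k_dataset",
    dataset_subset_rl="data",
    reward_model_path="models/rlhf_step2_rw/",
    save_freq=1,
    ppo_batch_size=32,
    ppo_epochs=4,
)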