From e29fa3ed842339dfb768c719705d08c6109a0486 Mon Sep 17 00:00:00 2001
From: Michael Schuster
Date: Fri, 1 Mar 2024 18:10:50 +0800
Subject: [PATCH 01/26] Add llm lora

---
 llm-lora-finetuning/.dockerignore             |    7 +
 llm-lora-finetuning/README.md                 |    0
 llm-lora-finetuning/configs/feature.yaml      |    0
 llm-lora-finetuning/finetune/adapter.py       |  333 ++++
 llm-lora-finetuning/finetune/adapter_v2.py    |  333 ++++
 llm-lora-finetuning/finetune/full.py          |  349 ++++
 llm-lora-finetuning/finetune/lora.py          |  363 +++++
 llm-lora-finetuning/generate/adapter.py       |  119 ++
 llm-lora-finetuning/generate/adapter_v2.py    |  119 ++
 llm-lora-finetuning/generate/base.py          |  193 +++
 llm-lora-finetuning/generate/full.py          |  115 ++
 llm-lora-finetuning/generate/lora.py          |  140 ++
 llm-lora-finetuning/generate/sequentially.py  |  231 +++
 llm-lora-finetuning/generate/tp.py            |  225 +++
 llm-lora-finetuning/lit_gpt/__init__.py       |   27 +
 llm-lora-finetuning/lit_gpt/adapter.py        |  168 ++
 llm-lora-finetuning/lit_gpt/adapter_v2.py     |  224 +++
 llm-lora-finetuning/lit_gpt/args.py           |   81 +
 llm-lora-finetuning/lit_gpt/config.py         | 1447 +++++++++++++++++
 llm-lora-finetuning/lit_gpt/lora.py           |  737 +++++++++
 llm-lora-finetuning/lit_gpt/model.py          |  390 +++++
 llm-lora-finetuning/lit_gpt/packed_dataset.py |  239 +++
 llm-lora-finetuning/lit_gpt/rmsnorm.py        |   34 +
 llm-lora-finetuning/lit_gpt/tokenizer.py      |  109 ++
 llm-lora-finetuning/lit_gpt/utils.py          |  379 +++++
 llm-lora-finetuning/pipelines/__init__.py     |   16 +
 llm-lora-finetuning/pipelines/finetuning.py   |   25 +
 llm-lora-finetuning/pipelines/merge.py        |   23 +
 llm-lora-finetuning/requirements.txt          |   19 +
 llm-lora-finetuning/run.py                    |  114 ++
 .../scripts/convert_hf_checkpoint.py          |  356 ++++
 .../scripts/convert_lit_checkpoint.py         |  272 ++++
 .../scripts/convert_pretrained_checkpoint.py  |   78 +
 llm-lora-finetuning/scripts/download.py       |   97 ++
 llm-lora-finetuning/scripts/merge_lora.py     |   83 +
 llm-lora-finetuning/scripts/prepare_alpaca.py |  151 ++
 llm-lora-finetuning/scripts/prepare_csv.py    |  139 ++
 llm-lora-finetuning/scripts/prepare_dolly.py  |  144 ++
 llm-lora-finetuning/scripts/prepare_flan.py   |  232 +++
 llm-lora-finetuning/scripts/prepare_lima.py   |  168 ++
 .../scripts/prepare_longform.py               |  136 ++
 .../scripts/prepare_openwebtext.py            |   81 +
 .../scripts/prepare_redpajama.py              |  166 ++
 .../scripts/prepare_slimpajama.py             |   63 +
 .../scripts/prepare_starcoder.py              |   74 +
 llm-lora-finetuning/steps/__init__.py         |   16 +
 llm-lora-finetuning/steps/finetune.py         |   43 +
 llm-lora-finetuning/steps/merge.py            |   28 +
 48 files changed, 8886 insertions(+)
 create mode 100644 llm-lora-finetuning/.dockerignore
 create mode 100644 llm-lora-finetuning/README.md
 create mode 100644 llm-lora-finetuning/configs/feature.yaml
 create mode 100644 llm-lora-finetuning/finetune/adapter.py
 create mode 100644 llm-lora-finetuning/finetune/adapter_v2.py
 create mode 100644 llm-lora-finetuning/finetune/full.py
 create mode 100644 llm-lora-finetuning/finetune/lora.py
 create mode 100644 llm-lora-finetuning/generate/adapter.py
 create mode 100644 llm-lora-finetuning/generate/adapter_v2.py
 create mode 100644 llm-lora-finetuning/generate/base.py
 create mode 100644 llm-lora-finetuning/generate/full.py
 create mode 100644 llm-lora-finetuning/generate/lora.py
 create mode 100644 llm-lora-finetuning/generate/sequentially.py
 create mode 100644 llm-lora-finetuning/generate/tp.py
 create mode 100644 llm-lora-finetuning/lit_gpt/__init__.py
 create mode 100644 llm-lora-finetuning/lit_gpt/adapter.py
 create mode 100644 llm-lora-finetuning/lit_gpt/adapter_v2.py
 create mode 100644 llm-lora-finetuning/lit_gpt/args.py
 create mode 100644 llm-lora-finetuning/lit_gpt/config.py
 create mode 100644 llm-lora-finetuning/lit_gpt/lora.py
 create mode 100644 llm-lora-finetuning/lit_gpt/model.py
 create mode 100644 llm-lora-finetuning/lit_gpt/packed_dataset.py
 create mode 100644 llm-lora-finetuning/lit_gpt/rmsnorm.py
 create mode 100644 llm-lora-finetuning/lit_gpt/tokenizer.py
 create mode 100644 llm-lora-finetuning/lit_gpt/utils.py
 create mode 100644 llm-lora-finetuning/pipelines/__init__.py
 create mode 100644 llm-lora-finetuning/pipelines/finetuning.py
 create mode 100644 llm-lora-finetuning/pipelines/merge.py
 create mode 100644 llm-lora-finetuning/requirements.txt
 create mode 100644 llm-lora-finetuning/run.py
 create mode 100644 llm-lora-finetuning/scripts/convert_hf_checkpoint.py
 create mode 100644 llm-lora-finetuning/scripts/convert_lit_checkpoint.py
 create mode 100644 llm-lora-finetuning/scripts/convert_pretrained_checkpoint.py
 create mode 100644 llm-lora-finetuning/scripts/download.py
 create mode 100644 llm-lora-finetuning/scripts/merge_lora.py
 create mode 100644 llm-lora-finetuning/scripts/prepare_alpaca.py
 create mode 100644 llm-lora-finetuning/scripts/prepare_csv.py
 create mode 100644 llm-lora-finetuning/scripts/prepare_dolly.py
 create mode 100644 llm-lora-finetuning/scripts/prepare_flan.py
 create mode 100644 llm-lora-finetuning/scripts/prepare_lima.py
 create mode 100644 llm-lora-finetuning/scripts/prepare_longform.py
 create mode 100644 llm-lora-finetuning/scripts/prepare_openwebtext.py
 create mode 100644 llm-lora-finetuning/scripts/prepare_redpajama.py
 create mode 100644 llm-lora-finetuning/scripts/prepare_slimpajama.py
 create mode 100644 llm-lora-finetuning/scripts/prepare_starcoder.py
 create mode 100644 llm-lora-finetuning/steps/__init__.py
 create mode 100644 llm-lora-finetuning/steps/finetune.py
 create mode 100644 llm-lora-finetuning/steps/merge.py

diff --git a/llm-lora-finetuning/.dockerignore b/llm-lora-finetuning/.dockerignore
new file mode 100644
index 00000000..4a37fa74
--- /dev/null
+++ b/llm-lora-finetuning/.dockerignore
@@ -0,0 +1,7 @@
+*
+!/pipelines/**
+!/steps/**
+!/lit_gpt/**
+!/generate/**
+!/scripts/**
+!/finetune/**
\ No newline at end of file
diff --git a/llm-lora-finetuning/README.md b/llm-lora-finetuning/README.md
new file mode 100644
index 00000000..e69de29b
diff --git a/llm-lora-finetuning/configs/feature.yaml b/llm-lora-finetuning/configs/feature.yaml
new file mode 100644
index 00000000..e69de29b
diff --git a/llm-lora-finetuning/finetune/adapter.py b/llm-lora-finetuning/finetune/adapter.py
new file mode 100644
index 00000000..862e2333
--- /dev/null
+++ b/llm-lora-finetuning/finetune/adapter.py
@@ -0,0 +1,333 @@
+# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
+import dataclasses +import os +import sys +import time +from pathlib import Path +from typing import Dict, List, Literal, Optional, Tuple + +import lightning as L +import torch +from lightning.fabric.loggers import CSVLogger +from lightning.fabric.plugins import BitsandbytesPrecision +from lightning.fabric.strategies import FSDPStrategy +from lightning.fabric.utilities import ThroughputMonitor + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +from generate.base import generate +from lit_gpt.adapter import GPT, Block, Config, adapter_filter, mark_only_adapter_as_trainable +from lit_gpt.args import EvalArgs, IOArgs, TrainArgs +from lit_gpt.tokenizer import Tokenizer +from lit_gpt.utils import ( + CLI, + check_valid_checkpoint_dir, + chunked_cross_entropy, + get_default_supported_precision, + load_checkpoint, + num_parameters, +) +from scripts.prepare_alpaca import generate_prompt + + +def setup( + precision: Optional[str] = None, + quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8-training"]] = None, + devices: int = 1, + io: IOArgs = IOArgs( + train_data_dir=Path("data/alpaca"), + val_data_dir=Path("data/alpaca"), + checkpoint_dir=Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + out_dir=Path("out/adapter/alpaca"), + ), + train: TrainArgs = TrainArgs( + save_interval=1000, + log_interval=1, + global_batch_size=64, + micro_batch_size=4, + lr_warmup_steps=100, + epochs=5, + epoch_size=50000, + learning_rate=1e-3, + max_seq_length=None, + ), + eval: EvalArgs = EvalArgs(interval=600, max_new_tokens=100, max_iters=100), +) -> None: + print(locals()) + precision = precision or get_default_supported_precision(training=True) + + plugins = None + if quantize is not None and quantize.startswith("bnb."): + if "mixed" in precision: + raise ValueError("Quantization and mixed precision is not supported.") + dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + plugins = BitsandbytesPrecision(quantize[4:], dtype) + precision = None + + if devices > 1: + if quantize: + raise NotImplementedError( + "Quantization is currently not supported for multi-GPU training. Please set devices=1 when using the" + " --quantize flag." 
+ ) + strategy = FSDPStrategy( + auto_wrap_policy={Block}, + activation_checkpointing_policy={Block}, + state_dict_type="full", + limit_all_gathers=True, + cpu_offload=False, + ) + else: + strategy = "auto" + + logger = CSVLogger(io.out_dir.parent, io.out_dir.name, flush_logs_every_n_steps=train.log_interval) + fabric = L.Fabric(devices=devices, strategy=strategy, precision=precision, loggers=logger, plugins=plugins) + fabric.launch(main, devices, Config.from_name(name=io.checkpoint_dir.name), io, train, eval) + + +def main(fabric: L.Fabric, devices: int, config: Config, io: IOArgs, train: TrainArgs, eval: EvalArgs) -> None: + validate_args(io, train, eval) + + steps_per_epoch = train.epoch_size // devices // train.batch_size(devices) + lr_max_steps = train.epochs * steps_per_epoch + + check_valid_checkpoint_dir(io.checkpoint_dir) + + fabric.seed_everything(1337) # same seed for every process to init model (FSDP) + + if fabric.global_rank == 0: + os.makedirs(io.out_dir, exist_ok=True) + + train_data = torch.load(io.train_data_dir / "train.pt") + val_data = torch.load(io.val_data_dir / "test.pt") + + checkpoint_path = io.checkpoint_dir / "lit_model.pth" + fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}") + with fabric.init_module(empty_init=(devices > 1)): + model = GPT(config) + mark_only_adapter_as_trainable(model) + + fabric.print(f"Number of trainable parameters: {num_parameters(model, requires_grad=True):,}") + fabric.print(f"Number of non trainable parameters: {num_parameters(model, requires_grad=False):,}") + + model = fabric.setup_module(model) + + trainable_params = [p for p in model.parameters() if p.requires_grad] + if isinstance(fabric.strategy.precision, BitsandbytesPrecision): + import bitsandbytes as bnb + + optimizer_cls = bnb.optim.PagedAdamW + else: + optimizer_cls = torch.optim.AdamW + optimizer = optimizer_cls( + trainable_params, lr=train.learning_rate, weight_decay=train.weight_decay, betas=(train.beta1, train.beta2) + ) + optimizer = fabric.setup_optimizers(optimizer) + scheduler = get_lr_scheduler(optimizer, warmup_steps=train.lr_warmup_steps, max_steps=lr_max_steps) + + # strict=False because missing keys due to Adapter weights not contained in state dict + load_checkpoint(fabric, model, checkpoint_path, strict=False) + + fabric.seed_everything(1337 + fabric.global_rank) + + train_time = time.perf_counter() + fit(fabric, model, optimizer, scheduler, train_data, val_data, devices, io, train, eval) + fabric.print(f"Training time: {(time.perf_counter()-train_time):.2f}s") + if fabric.device.type == "cuda": + fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") + + # Save the final checkpoint at the end of training + save_path = io.out_dir / "lit_model_adapter_finetuned.pth" + save_adapter_checkpoint(fabric, model, save_path) + + +def fit( + fabric: L.Fabric, + model: GPT, + optimizer: torch.optim.Optimizer, + scheduler: torch.optim.lr_scheduler, + train_data: List[Dict], + val_data: List[Dict], + devices: int, + io: IOArgs, + train: TrainArgs, + eval: EvalArgs, +) -> None: + tokenizer = Tokenizer(io.checkpoint_dir) + longest_seq_length, longest_seq_ix = get_longest_seq_length(train_data) + model.max_seq_length = min(longest_seq_length, train.max_seq_length or float("inf")) + fabric.print( + f"The longest sequence length in the train data is {longest_seq_length}, the model's maximum sequence length is" + f" {model.max_seq_length} and context length is {model.config.block_size}" + ) + + validate(fabric, model, 
val_data, tokenizer, dataclasses.replace(eval, max_iters=2), train) # sanity check + + throughput = ThroughputMonitor(fabric, window_size=50) + step_count = 0 + total_lengths = 0 + total_t0 = time.perf_counter() + + for iter_num in range(1, train.max_iters(devices) + 1): + iter_t0 = time.perf_counter() + + input_ids, targets = get_batch( + fabric, train_data, train.micro_batch_size, train.max_seq_length, longest_seq_ix if iter_num == 1 else None + ) + + is_accumulating = iter_num % train.gradient_accumulation_iters(devices) != 0 + with fabric.no_backward_sync(model, enabled=is_accumulating): + logits = model(input_ids, lm_head_chunk_size=128) + # shift the targets such that output n predicts token n+1 + logits[-1] = logits[-1][..., :-1, :] + loss = chunked_cross_entropy(logits, targets[..., 1:]) + fabric.backward(loss / train.gradient_accumulation_iters(devices)) + + if not is_accumulating: + optimizer.step() + optimizer.zero_grad() + scheduler.step() + step_count += 1 + + total_lengths += input_ids.numel() + if iter_num % train.log_interval == 0: + loss_item = loss.item() # expensive device-to-host synchronization + t1 = time.perf_counter() + throughput.update( + time=t1 - total_t0, batches=iter_num, samples=iter_num * train.micro_batch_size, lengths=total_lengths + ) + throughput.compute_and_log(step=iter_num) + fabric.print( + f"iter {iter_num} | step {step_count}: loss {loss_item:.4f}, iter time:" + f" {(t1 - iter_t0) * 1000:.2f} ms{' (optimizer.step)' if not is_accumulating else ''}" + ) + + if not is_accumulating and step_count % eval.interval == 0: + t0 = time.perf_counter() + val_loss = validate(fabric, model, val_data, tokenizer, eval, train) + t1 = time.perf_counter() - t0 + fabric.print(f"iter {iter_num}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f} ms") + fabric.barrier() + if not is_accumulating and step_count % train.save_interval == 0: + checkpoint_path = io.out_dir / f"iter-{iter_num:06d}-ckpt.pth" + save_adapter_checkpoint(fabric, model, checkpoint_path) + + +# the adapter "kv cache" cannot be initialized under `inference_mode` +@torch.no_grad() +def validate( + fabric: L.Fabric, model: GPT, val_data: List[Dict], tokenizer: Tokenizer, eval: EvalArgs, train: TrainArgs +) -> torch.Tensor: + fabric.print("Validating ...") + model.eval() + losses = torch.zeros(eval.max_iters) + for k in range(eval.max_iters): + input_ids, targets = get_batch(fabric, val_data, train.micro_batch_size, train.max_seq_length) + logits = model(input_ids) + losses[k] = chunked_cross_entropy(logits[..., :-1, :], targets[..., 1:], chunk_size=0) + val_loss = losses.mean() + + # produce an example: + instruction = "Recommend a movie for me to watch during the weekend and explain the reason." 
+ fabric.print(instruction) + sample = {"instruction": instruction, "input": ""} + prompt = generate_prompt(sample) + encoded = tokenizer.encode(prompt, device=fabric.device) + with fabric.init_tensor(): + # do not set `max_seq_length=max_returned_token` because memory is not a concern here + model.set_kv_cache(batch_size=1) + output = generate( + model, encoded, max_returned_tokens=len(encoded) + eval.max_new_tokens, temperature=0.8, eos_id=tokenizer.eos_id + ) + model.clear_kv_cache() + output = tokenizer.decode(output) + fabric.print(output) + + model.train() + return val_loss + + +def get_batch( + fabric: L.Fabric, + data: List[Dict], + micro_batch_size: int, + max_seq_length: Optional[int], + longest_seq_ix: Optional[int] = None, +) -> Tuple[torch.Tensor, torch.Tensor]: + ix = torch.randint(len(data), (micro_batch_size,)) + if longest_seq_ix is not None: + # force the longest sample at the beginning so potential OOMs happen right away + ix[0] = longest_seq_ix + + input_ids = [data[i]["input_ids"].type(torch.int64) for i in ix] + labels = [data[i]["labels"].type(torch.int64) for i in ix] + + # this could be `longest_seq_length` to have a fixed size for all batches + max_len = max(len(s) for s in input_ids) + + def pad_right(x, pad_id): + # pad right based on the longest sequence + n = max_len - len(x) + return torch.cat((x, torch.full((n,), pad_id, dtype=x.dtype))) + + x = torch.stack([pad_right(x, pad_id=0) for x in input_ids]) + y = torch.stack([pad_right(x, pad_id=-1) for x in labels]) + + # Truncate if needed + if max_seq_length: + x = x[:, :max_seq_length] + y = y[:, :max_seq_length] + + if fabric.device.type == "cuda" and x.device.type == "cpu": + x, y = fabric.to_device((x.pin_memory(), y.pin_memory())) + else: + x, y = fabric.to_device((x, y)) + return x, y + + +def get_lr_scheduler(optimizer, warmup_steps: int, max_steps: int): + # linear warmup followed by cosine annealing + scheduler1 = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: step / warmup_steps) + scheduler2 = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=(max_steps - warmup_steps)) + return torch.optim.lr_scheduler.SequentialLR(optimizer, [scheduler1, scheduler2], milestones=[warmup_steps]) + + +def get_longest_seq_length(data: List[Dict]) -> Tuple[int, int]: + # find out the minimum max_seq_length required during fine-tuning (saves memory!) + lengths = [len(d["input_ids"]) for d in data] + longest_seq_length = max(lengths) + longest_seq_ix = lengths.index(longest_seq_length) + return longest_seq_length, longest_seq_ix + + +def save_adapter_checkpoint(fabric: L.Fabric, model: torch.nn.Module, file_path: Path) -> None: + fabric.print(f"Saving adapter weights to {str(file_path)!r}") + fabric.save(file_path, {"model": model}, filter={"model": adapter_filter}) + + +def validate_args(io: IOArgs, train: TrainArgs, eval: EvalArgs) -> None: + issues = [] + unsupported = [(train, ["max_tokens", "max_norm"])] + for args, names in unsupported: + for name in names: + if getattr(args, name) is not None: + issues.append(f"{__file__} doesn't support the {name!r} argument. This is set in {args}") + required = [ + (io, ["checkpoint_dir", "train_data_dir", "val_data_dir"]), + (train, ["epoch_size", "epochs"]), + (eval, ["max_new_tokens"]), + ] + for args, names in required: + for name in names: + if getattr(args, name) is None: + issues.append(f"{__file__} requires the {name!r} argument. 
This is set in {args}") + if issues: + raise ValueError("\n".join(issues)) + + +if __name__ == "__main__": + torch.set_float32_matmul_precision("high") + + CLI(setup) diff --git a/llm-lora-finetuning/finetune/adapter_v2.py b/llm-lora-finetuning/finetune/adapter_v2.py new file mode 100644 index 00000000..8b5d0347 --- /dev/null +++ b/llm-lora-finetuning/finetune/adapter_v2.py @@ -0,0 +1,333 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. +import dataclasses +import os +import sys +import time +from pathlib import Path +from typing import Dict, List, Literal, Optional, Tuple + +import lightning as L +import torch +from lightning.fabric.loggers import CSVLogger +from lightning.fabric.plugins import BitsandbytesPrecision +from lightning.fabric.strategies import FSDPStrategy +from lightning.fabric.utilities import ThroughputMonitor + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +from generate.base import generate +from lit_gpt.adapter_v2 import GPT, Block, Config, adapter_filter, mark_only_adapter_v2_as_trainable +from lit_gpt.args import EvalArgs, IOArgs, TrainArgs +from lit_gpt.tokenizer import Tokenizer +from lit_gpt.utils import ( + CLI, + check_valid_checkpoint_dir, + chunked_cross_entropy, + get_default_supported_precision, + load_checkpoint, + num_parameters, +) +from scripts.prepare_alpaca import generate_prompt + + +def setup( + precision: Optional[str] = None, + quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8-training"]] = None, + devices: int = 1, + io: IOArgs = IOArgs( + train_data_dir=Path("data/alpaca"), + val_data_dir=Path("data/alpaca"), + checkpoint_dir=Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + out_dir=Path("out/adapter_v2/alpaca"), + ), + train: TrainArgs = TrainArgs( + save_interval=1000, + log_interval=1, + global_batch_size=128, + micro_batch_size=2, + lr_warmup_steps=100, + epochs=5, + epoch_size=50000, + learning_rate=1e-3, + max_seq_length=None, + ), + eval: EvalArgs = EvalArgs(interval=600, max_new_tokens=100, max_iters=100), +) -> None: + print(locals()) + precision = precision or get_default_supported_precision(training=True) + + plugins = None + if quantize is not None and quantize.startswith("bnb."): + if "mixed" in precision: + raise ValueError("Quantization and mixed precision is not supported.") + dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + plugins = BitsandbytesPrecision(quantize[4:], dtype) + precision = None + + if devices > 1: + if quantize: + raise NotImplementedError( + "Quantization is currently not supported for multi-GPU training. Please set devices=1 when using the" + " --quantize flag." 
+ ) + strategy = FSDPStrategy( + auto_wrap_policy={Block}, + activation_checkpointing_policy={Block}, + state_dict_type="full", + limit_all_gathers=True, + cpu_offload=False, + ) + else: + strategy = "auto" + + logger = CSVLogger(io.out_dir.parent, io.out_dir.name, flush_logs_every_n_steps=train.log_interval) + fabric = L.Fabric(devices=devices, strategy=strategy, precision=precision, loggers=logger, plugins=plugins) + fabric.launch(main, devices, Config.from_name(name=io.checkpoint_dir.name), io, train, eval) + + +def main(fabric: L.Fabric, devices: int, config: Config, io: IOArgs, train: TrainArgs, eval: EvalArgs) -> None: + validate_args(io, train, eval) + + steps_per_epoch = train.epoch_size // devices // train.batch_size(devices) + lr_max_steps = train.epochs * steps_per_epoch + + check_valid_checkpoint_dir(io.checkpoint_dir) + + fabric.seed_everything(1337) # same seed for every process to init model (FSDP) + + if fabric.global_rank == 0: + os.makedirs(io.out_dir, exist_ok=True) + + train_data = torch.load(io.train_data_dir / "train.pt") + val_data = torch.load(io.val_data_dir / "test.pt") + + checkpoint_path = io.checkpoint_dir / "lit_model.pth" + fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}") + with fabric.init_module(empty_init=(devices > 1)): + model = GPT(config) + mark_only_adapter_v2_as_trainable(model) + + fabric.print(f"Number of trainable parameters: {num_parameters(model, requires_grad=True):,}") + fabric.print(f"Number of non trainable parameters: {num_parameters(model, requires_grad=False):,}") + + model = fabric.setup_module(model) + + trainable_params = [p for p in model.parameters() if p.requires_grad] + if isinstance(fabric.strategy.precision, BitsandbytesPrecision): + import bitsandbytes as bnb + + optimizer_cls = bnb.optim.PagedAdamW + else: + optimizer_cls = torch.optim.AdamW + optimizer = optimizer_cls( + trainable_params, lr=train.learning_rate, weight_decay=train.weight_decay, betas=(train.beta1, train.beta2) + ) + optimizer = fabric.setup_optimizers(optimizer) + scheduler = get_lr_scheduler(optimizer, warmup_steps=train.lr_warmup_steps, max_steps=lr_max_steps) + + # strict=False because missing keys due to Adapter weights not contained in state dict + load_checkpoint(fabric, model, checkpoint_path, strict=False) + + fabric.seed_everything(1337 + fabric.global_rank) + + train_time = time.perf_counter() + fit(fabric, model, optimizer, scheduler, train_data, val_data, devices, io, train, eval) + fabric.print(f"Training time: {(time.perf_counter()-train_time):.2f}s") + if fabric.device.type == "cuda": + fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") + + # Save the final checkpoint at the end of training + save_path = io.out_dir / "lit_model_adapter_finetuned.pth" + save_adapter_v2_checkpoint(fabric, model, save_path) + + +def fit( + fabric: L.Fabric, + model: GPT, + optimizer: torch.optim.Optimizer, + scheduler: torch.optim.lr_scheduler, + train_data: List[Dict], + val_data: List[Dict], + devices: int, + io: IOArgs, + train: TrainArgs, + eval: EvalArgs, +) -> None: + tokenizer = Tokenizer(io.checkpoint_dir) + longest_seq_length, longest_seq_ix = get_longest_seq_length(train_data) + model.max_seq_length = min(longest_seq_length, train.max_seq_length or float("inf")) + fabric.print( + f"The longest sequence length in the train data is {longest_seq_length}, the model's maximum sequence length is" + f" {model.max_seq_length} and context length is {model.config.block_size}" + ) + + validate(fabric, model, 
val_data, tokenizer, dataclasses.replace(eval, max_iters=2), train) # sanity check + + throughput = ThroughputMonitor(fabric, window_size=50) + step_count = 0 + total_lengths = 0 + total_t0 = time.perf_counter() + + for iter_num in range(1, train.max_iters(devices) + 1): + iter_t0 = time.perf_counter() + + input_ids, targets = get_batch( + fabric, train_data, train.micro_batch_size, train.max_seq_length, longest_seq_ix if iter_num == 1 else None + ) + + is_accumulating = iter_num % train.gradient_accumulation_iters(devices) != 0 + with fabric.no_backward_sync(model, enabled=is_accumulating): + logits = model(input_ids, lm_head_chunk_size=128) + # shift the targets such that output n predicts token n+1 + logits[-1] = logits[-1][..., :-1, :] + loss = chunked_cross_entropy(logits, targets[..., 1:]) + fabric.backward(loss / train.gradient_accumulation_iters(devices)) + + if not is_accumulating: + optimizer.step() + optimizer.zero_grad() + scheduler.step() + step_count += 1 + + total_lengths += input_ids.numel() + if iter_num % train.log_interval == 0: + loss_item = loss.item() # expensive device-to-host synchronization + t1 = time.perf_counter() + throughput.update( + time=t1 - total_t0, batches=iter_num, samples=iter_num * train.micro_batch_size, lengths=total_lengths + ) + throughput.compute_and_log(step=iter_num) + fabric.print( + f"iter {iter_num} | step {step_count}: loss {loss_item:.4f}, iter time:" + f" {(t1 - iter_t0) * 1000:.2f} ms{' (optimizer.step)' if not is_accumulating else ''}" + ) + + if not is_accumulating and step_count % eval.interval == 0: + t0 = time.perf_counter() + val_loss = validate(fabric, model, val_data, tokenizer, eval, train) + t1 = time.perf_counter() - t0 + fabric.print(f"iter {iter_num}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f} ms") + fabric.barrier() + if not is_accumulating and step_count % train.save_interval == 0: + checkpoint_path = io.out_dir / f"iter-{iter_num:06d}-ckpt.pth" + save_adapter_v2_checkpoint(fabric, model, checkpoint_path) + + +# the adapter "kv cache" cannot be initialized under `inference_mode` +@torch.no_grad() +def validate( + fabric: L.Fabric, model: GPT, val_data: List[Dict], tokenizer: Tokenizer, eval: EvalArgs, train: TrainArgs +) -> torch.Tensor: + fabric.print("Validating ...") + model.eval() + losses = torch.zeros(eval.max_iters) + for k in range(eval.max_iters): + input_ids, targets = get_batch(fabric, val_data, train.micro_batch_size, train.max_seq_length) + logits = model(input_ids) + losses[k] = chunked_cross_entropy(logits[..., :-1, :], targets[..., 1:], chunk_size=0) + val_loss = losses.mean() + + # produce an example: + instruction = "Recommend a movie for me to watch during the weekend and explain the reason." 
+ fabric.print(instruction) + sample = {"instruction": instruction, "input": ""} + prompt = generate_prompt(sample) + encoded = tokenizer.encode(prompt, device=fabric.device) + with fabric.init_tensor(): + # do not set `max_seq_length=max_returned_token` because memory is not a concern here + model.set_kv_cache(batch_size=1) + output = generate( + model, encoded, max_returned_tokens=len(encoded) + eval.max_new_tokens, temperature=0.8, eos_id=tokenizer.eos_id + ) + model.clear_kv_cache() + output = tokenizer.decode(output) + fabric.print(output) + + model.train() + return val_loss + + +def get_batch( + fabric: L.Fabric, + data: List[Dict], + micro_batch_size: int, + max_seq_length: Optional[int], + longest_seq_ix: Optional[int] = None, +) -> Tuple[torch.Tensor, torch.Tensor]: + ix = torch.randint(len(data), (micro_batch_size,)) + if longest_seq_ix is not None: + # force the longest sample at the beginning so potential OOMs happen right away + ix[0] = longest_seq_ix + + input_ids = [data[i]["input_ids"].type(torch.int64) for i in ix] + labels = [data[i]["labels"].type(torch.int64) for i in ix] + + # this could be `longest_seq_length` to have a fixed size for all batches + max_len = max(len(s) for s in input_ids) + + def pad_right(x, pad_id): + # pad right based on the longest sequence + n = max_len - len(x) + return torch.cat((x, torch.full((n,), pad_id, dtype=x.dtype))) + + x = torch.stack([pad_right(x, pad_id=0) for x in input_ids]) + y = torch.stack([pad_right(x, pad_id=-1) for x in labels]) + + # Truncate if needed + if max_seq_length: + x = x[:, :max_seq_length] + y = y[:, :max_seq_length] + + if fabric.device.type == "cuda" and x.device.type == "cpu": + x, y = fabric.to_device((x.pin_memory(), y.pin_memory())) + else: + x, y = fabric.to_device((x, y)) + return x, y + + +def get_lr_scheduler(optimizer, warmup_steps: int, max_steps: int): + # linear warmup followed by cosine annealing + scheduler1 = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: step / warmup_steps) + scheduler2 = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=(max_steps - warmup_steps)) + return torch.optim.lr_scheduler.SequentialLR(optimizer, [scheduler1, scheduler2], milestones=[warmup_steps]) + + +def get_longest_seq_length(data: List[Dict]) -> Tuple[int, int]: + # find out the minimum max_seq_length required during fine-tuning (saves memory!) + lengths = [len(d["input_ids"]) for d in data] + longest_seq_length = max(lengths) + longest_seq_ix = lengths.index(longest_seq_length) + return longest_seq_length, longest_seq_ix + + +def save_adapter_v2_checkpoint(fabric: L.Fabric, model: torch.nn.Module, file_path: Path) -> None: + fabric.print(f"Saving adapter v2 weights to {str(file_path)!r}") + fabric.save(file_path, {"model": model}, filter={"model": adapter_filter}) + + +def validate_args(io: IOArgs, train: TrainArgs, eval: EvalArgs) -> None: + issues = [] + unsupported = [(train, ["max_tokens", "max_norm"])] + for args, names in unsupported: + for name in names: + if getattr(args, name) is not None: + issues.append(f"{__file__} doesn't support the {name!r} argument. This is set in {args}") + required = [ + (io, ["checkpoint_dir", "train_data_dir", "val_data_dir"]), + (train, ["epoch_size", "epochs"]), + (eval, ["max_new_tokens"]), + ] + for args, names in required: + for name in names: + if getattr(args, name) is None: + issues.append(f"{__file__} requires the {name!r} argument. 
This is set in {args}") + if issues: + raise ValueError("\n".join(issues)) + + +if __name__ == "__main__": + torch.set_float32_matmul_precision("high") + + CLI(setup) diff --git a/llm-lora-finetuning/finetune/full.py b/llm-lora-finetuning/finetune/full.py new file mode 100644 index 00000000..52b4a47d --- /dev/null +++ b/llm-lora-finetuning/finetune/full.py @@ -0,0 +1,349 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. +import dataclasses +import math +import os +import sys +import time +from pathlib import Path +from typing import Dict, List, Optional, Tuple, Union + +import lightning as L +import torch +from lightning.fabric.loggers import CSVLogger +from lightning.fabric.strategies import FSDPStrategy +from torchmetrics import RunningMean + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +from generate.base import generate +from lit_gpt.args import EvalArgs, IOArgs, TrainArgs +from lit_gpt.model import GPT, Block, Config +from lit_gpt.tokenizer import Tokenizer +from lit_gpt.utils import ( + CLI, + check_valid_checkpoint_dir, + chunked_cross_entropy, + get_default_supported_precision, + load_checkpoint, + num_parameters, +) +from scripts.prepare_alpaca import generate_prompt + + +def setup( + precision: Optional[str] = None, + devices: int = 1, + resume: Union[bool, Path] = False, + io: IOArgs = IOArgs( + train_data_dir=Path("data/alpaca"), + val_data_dir=Path("data/alpaca"), + checkpoint_dir=Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + out_dir=Path("out/full/alpaca"), + ), + train: TrainArgs = TrainArgs( + save_interval=1000, + log_interval=1, + global_batch_size=64, + micro_batch_size=1, + lr_warmup_steps=100, + epochs=5, + epoch_size=50000, + learning_rate=3e-3, + max_seq_length=None, + ), + eval: EvalArgs = EvalArgs(interval=600, max_new_tokens=100, max_iters=100), +) -> None: + print(locals()) + precision = precision or get_default_supported_precision(training=True) + + if devices > 1: + strategy = FSDPStrategy( + auto_wrap_policy={Block}, + activation_checkpointing_policy={Block}, + state_dict_type="full", + limit_all_gathers=True, + cpu_offload=False, + ) + else: + strategy = "auto" + + logger = CSVLogger(io.out_dir.parent, io.out_dir.name, flush_logs_every_n_steps=train.log_interval) + fabric = L.Fabric(devices=devices, strategy=strategy, precision=precision, loggers=logger) + fabric.launch(main, devices, resume, Config.from_name(name=io.checkpoint_dir.name), io, train, eval) + + +def main( + fabric: L.Fabric, + devices: int, + resume: Union[bool, Path], + config: Config, + io: IOArgs, + train: TrainArgs, + eval: EvalArgs, +) -> None: + validate_args(io, train, eval) + + steps_per_epoch = train.epoch_size // devices // train.batch_size(devices) + lr_max_steps = train.epochs * steps_per_epoch + + check_valid_checkpoint_dir(io.checkpoint_dir) + + fabric.seed_everything(1337) # same seed for every process to init model (FSDP) + + if fabric.global_rank == 0: + os.makedirs(io.out_dir, exist_ok=True) + + train_data = torch.load(io.train_data_dir / "train.pt") + val_data = torch.load(io.val_data_dir / "test.pt") + + checkpoint_path = io.checkpoint_dir / "lit_model.pth" + fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}") + with fabric.init_module(empty_init=(devices > 1)): + model = GPT(config) + + fabric.print(f"Number of trainable parameters: {num_parameters(model, requires_grad=True):,}") + + model = fabric.setup(model) + 
optimizer = torch.optim.AdamW( + model.parameters(), lr=train.learning_rate, weight_decay=train.weight_decay, betas=(train.beta1, train.beta2) + ) + optimizer = fabric.setup_optimizers(optimizer) + scheduler = get_lr_scheduler(optimizer, warmup_steps=train.lr_warmup_steps, max_steps=lr_max_steps) + state = {"model": model, "optimizer": optimizer, "scheduler": scheduler, "iter_num": 0, "step_count": 0} + + if resume is True: + resume = max(io.out_dir.glob("*.pth"), key=(lambda p: int(p.name.split("-")[1]))) + if resume: + fabric.print(f"Resuming training from {resume}") + fabric.load(resume, state) + else: + load_checkpoint(fabric, state["model"], checkpoint_path) + + fabric.seed_everything(1337 + fabric.global_rank) + + train_time = time.perf_counter() + fit(fabric, state, train_data, val_data, devices, resume, io, train, eval) + fabric.print(f"Training time: {(time.perf_counter()-train_time):.2f}s") + if fabric.device.type == "cuda": + fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") + + # Save the final checkpoint at the end of training + fabric.save(io.out_dir / "lit_model_finetuned.pth", {"model": state["model"]}) + + +def fit( + fabric: L.Fabric, + state: Dict, + train_data: List[Dict], + val_data: List[Dict], + devices: int, + resume: Union[bool, Path], + io: IOArgs, + train: TrainArgs, + eval: EvalArgs, +) -> None: + model = state["model"] + optimizer = state["optimizer"] + scheduler = state["scheduler"] + tokenizer = Tokenizer(io.checkpoint_dir) + longest_seq_length, longest_seq_ix = get_longest_seq_length(train_data) + model.max_seq_length = min(longest_seq_length, train.max_seq_length or float("inf")) + fabric.print( + f"The longest sequence length in the train data is {longest_seq_length}, the model's maximum sequence length is" + f" {model.max_seq_length} and context length is {model.config.block_size}" + ) + + validate(fabric, model, val_data, tokenizer, dataclasses.replace(eval, max_iters=2), train) # sanity check + initial_iter = state["iter_num"] + + # resume data loader state by fast-forwarding through all seen batches + if resume: + resume_t0 = time.perf_counter() + for resume_iter in range(initial_iter): + get_batch(fabric, train_data, None) + if resume_iter % 1000 == 0: + fabric.print(f"Resuming dataset: {resume_iter} / {initial_iter}") + fabric.barrier() + fabric.print( + f"Resuming data loader finished. Took {time.perf_counter() - resume_t0:.1f} seconds to reach iteration" + f" {initial_iter}." 
+ ) + + running_loss = RunningMean(window=train.gradient_accumulation_iters(devices), sync_on_compute=False).to( + fabric.device + ) + fabric.barrier() + + for state["iter_num"] in range(state["iter_num"] + 1, train.max_iters(devices) + 1): + iter_t0 = time.perf_counter() + + input_ids, targets = get_batch( + fabric, + train_data, + train.micro_batch_size, + train.max_seq_length, + longest_seq_ix if state["iter_num"] == 1 else None, + ) + + is_accumulating = state["iter_num"] % train.gradient_accumulation_iters(devices) != 0 + with fabric.no_backward_sync(model, enabled=is_accumulating): + logits = model(input_ids) + # shift the targets such that output n predicts token n+1 + loss = chunked_cross_entropy(logits[..., :-1, :], targets[..., 1:]) + fabric.backward(loss / train.gradient_accumulation_iters(devices)) + + running_loss.update(loss.detach()) + + if not is_accumulating: + optimizer.step() + optimizer.zero_grad() + scheduler.step() + state["step_count"] += 1 + + if state["iter_num"] % train.log_interval == 0: + loss = running_loss.compute().item() # expensive device-to-host synchronization + t1 = time.perf_counter() + metrics = { + "loss": loss, + "iter": state["iter_num"], + "step": state["step_count"], + "iter_time": t1 - iter_t0, + "tokens": state["iter_num"] * train.micro_batch_size * model.config.block_size, + "total_tokens": ( + state["iter_num"] * train.micro_batch_size * model.config.block_size * fabric.world_size + ), + # TODO: log learning rate + } + fabric.print( + f"iter {metrics['iter']} | step {metrics['step']}: loss {metrics['loss']:.4f}, iter time:" + f" {metrics['iter_time'] * 1000:.2f} ms{' (optimizer.step)' if not is_accumulating else ''}" + ) + fabric.log_dict(metrics, step=state["iter_num"]) + + if not is_accumulating and state["step_count"] % eval.interval == 0: + t0 = time.perf_counter() + val_loss = validate(fabric, model, val_data, tokenizer, eval, train) + t1 = time.perf_counter() - t0 + fabric.print(f"iter {state['iter_num']}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f} ms") + metrics = {"val_loss": val_loss, "val_ppl": math.exp(val_loss)} + fabric.log_dict(metrics, step=state["iter_num"]) + fabric.barrier() + if not is_accumulating and state["step_count"] % train.save_interval == 0: + checkpoint_path = io.out_dir / f"step-{state['step_count']:06d}.pth" + fabric.print(f"Saving checkpoint to {str(checkpoint_path)!r}") + fabric.save(checkpoint_path, state) + + +# FSDP has issues with `inference_mode` +@torch.no_grad() +def validate( + fabric: L.Fabric, model: GPT, val_data: List[Dict], tokenizer: Tokenizer, eval: EvalArgs, train: TrainArgs +) -> torch.Tensor: + fabric.print("Validating ...") + model.eval() + losses = torch.zeros(eval.max_iters) + for k in range(eval.max_iters): + input_ids, targets = get_batch(fabric, val_data, train.micro_batch_size, train.max_seq_length) + logits = model(input_ids) + losses[k] = chunked_cross_entropy(logits[..., :-1, :], targets[..., 1:], chunk_size=0) + val_loss = losses.mean() + + # produce an example: + instruction = "Recommend a movie for me to watch during the weekend and explain the reason." 
+ fabric.print(instruction) + sample = {"instruction": instruction, "input": ""} + prompt = generate_prompt(sample) + encoded = tokenizer.encode(prompt, device=fabric.device) + with fabric.init_tensor(): + # do not set `max_seq_length=max_returned_token` because memory is not a concern here + model.set_kv_cache(batch_size=1) + output = generate( + model, encoded, max_returned_tokens=len(encoded) + eval.max_new_tokens, temperature=0.8, eos_id=tokenizer.eos_id + ) + model.clear_kv_cache() + output = tokenizer.decode(output) + fabric.print(output) + + model.train() + return val_loss + + +def get_batch( + fabric: L.Fabric, + data: List[Dict], + micro_batch_size: int, + max_seq_length: Optional[int], + longest_seq_ix: Optional[int] = None, +) -> Tuple[torch.Tensor, torch.Tensor]: + ix = torch.randint(len(data), (micro_batch_size,)) + if longest_seq_ix is not None: + # force the longest sample at the beginning so potential OOMs happen right away + ix[0] = longest_seq_ix + + input_ids = [data[i]["input_ids"].type(torch.int64) for i in ix] + labels = [data[i]["labels"].type(torch.int64) for i in ix] + + # this could be `longest_seq_length` to have a fixed size for all batches + max_len = max(len(s) for s in input_ids) + + def pad_right(x, pad_id): + # pad right based on the longest sequence + n = max_len - len(x) + return torch.cat((x, torch.full((n,), pad_id, dtype=x.dtype))) + + x = torch.stack([pad_right(x, pad_id=0) for x in input_ids]) + y = torch.stack([pad_right(x, pad_id=-1) for x in labels]) + + # Truncate if needed + if max_seq_length: + x = x[:, :max_seq_length] + y = y[:, :max_seq_length] + + if fabric.device.type == "cuda" and x.device.type == "cpu": + x, y = fabric.to_device((x.pin_memory(), y.pin_memory())) + else: + x, y = fabric.to_device((x, y)) + return x, y + + +def get_lr_scheduler(optimizer, warmup_steps: int, max_steps: int): + # linear warmup followed by cosine annealing + scheduler1 = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: step / warmup_steps) + scheduler2 = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=(max_steps - warmup_steps)) + return torch.optim.lr_scheduler.SequentialLR(optimizer, [scheduler1, scheduler2], milestones=[warmup_steps]) + + +def get_longest_seq_length(data: List[Dict]) -> Tuple[int, int]: + # find out the minimum max_seq_length required during fine-tuning (saves memory!) + lengths = [len(d["input_ids"]) for d in data] + longest_seq_length = max(lengths) + longest_seq_ix = lengths.index(longest_seq_length) + return longest_seq_length, longest_seq_ix + + +def validate_args(io: IOArgs, train: TrainArgs, eval: EvalArgs) -> None: + issues = [] + unsupported = [(train, ["max_tokens", "max_norm"])] + for args, names in unsupported: + for name in names: + if getattr(args, name) is not None: + issues.append(f"{__file__} doesn't support the {name!r} argument. This is set in {args}") + required = [ + (io, ["checkpoint_dir", "train_data_dir", "val_data_dir"]), + (train, ["epoch_size", "epochs"]), + (eval, ["max_new_tokens"]), + ] + for args, names in required: + for name in names: + if getattr(args, name) is None: + issues.append(f"{__file__} requires the {name!r} argument. 
This is set in {args}") + if issues: + raise ValueError("\n".join(issues)) + + +if __name__ == "__main__": + torch.set_float32_matmul_precision("high") + + CLI(setup) diff --git a/llm-lora-finetuning/finetune/lora.py b/llm-lora-finetuning/finetune/lora.py new file mode 100644 index 00000000..086322fb --- /dev/null +++ b/llm-lora-finetuning/finetune/lora.py @@ -0,0 +1,363 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. +import dataclasses +import os +import sys +import time +from pathlib import Path +from typing import Dict, List, Literal, Optional, Tuple + +import lightning as L +import torch +from lightning.fabric.loggers import CSVLogger +from lightning.fabric.plugins import BitsandbytesPrecision +from lightning.fabric.strategies import FSDPStrategy +from lightning.fabric.utilities import ThroughputMonitor + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +from generate.base import generate +from lit_gpt.args import EvalArgs, IOArgs, TrainArgs +from lit_gpt.lora import GPT, Block, Config, lora_filter, mark_only_lora_as_trainable +from lit_gpt.tokenizer import Tokenizer +from lit_gpt.utils import ( + CLI, + check_valid_checkpoint_dir, + chunked_cross_entropy, + get_default_supported_precision, + load_checkpoint, + num_parameters, +) +from scripts.prepare_alpaca import generate_prompt + + +def setup( + precision: Optional[str] = None, + quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8-training"]] = None, + devices: int = 1, + lora_r: int = 8, + lora_alpha: int = 16, + lora_dropout: float = 0.05, + lora_query: bool = True, + lora_key: bool = False, + lora_value: bool = True, + lora_projection: bool = False, + lora_mlp: bool = False, + lora_head: bool = False, + io: IOArgs = IOArgs( + train_data_dir=Path("data/alpaca"), + val_data_dir=Path("data/alpaca"), + checkpoint_dir=Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + out_dir=Path("out/lora/alpaca"), + ), + train: TrainArgs = TrainArgs( + save_interval=1000, + log_interval=1, + global_batch_size=128, + micro_batch_size=4, + lr_warmup_steps=100, + epochs=5, + epoch_size=50000, + learning_rate=3e-4, + max_seq_length=None, + ), + eval: EvalArgs = EvalArgs(interval=100, max_new_tokens=100, max_iters=100), +) -> None: + print(locals()) + precision = precision or get_default_supported_precision(training=True) + + plugins = None + if quantize is not None and quantize.startswith("bnb."): + if "mixed" in precision: + raise ValueError("Quantization and mixed precision is not supported.") + dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + plugins = BitsandbytesPrecision(quantize[4:], dtype) + precision = None + + if devices > 1: + if quantize: + raise NotImplementedError( + "Quantization is currently not supported for multi-GPU training. Please set devices=1 when using the" + " --quantize flag." 
+ ) + strategy = FSDPStrategy( + auto_wrap_policy={Block}, + activation_checkpointing_policy={Block}, + state_dict_type="full", + limit_all_gathers=True, + cpu_offload=False, + ) + else: + strategy = "auto" + + logger = CSVLogger(io.out_dir.parent, io.out_dir.name, flush_logs_every_n_steps=train.log_interval) + fabric = L.Fabric(devices=devices, strategy=strategy, precision=precision, loggers=logger, plugins=plugins) + + if not any((lora_query, lora_key, lora_value, lora_projection, lora_mlp, lora_head)): + fabric.print("Warning: all LoRA layers are disabled!") + fabric.launch( + main, + devices, + Config.from_name( + name=io.checkpoint_dir.name, + r=lora_r, + alpha=lora_alpha, + dropout=lora_dropout, + to_query=lora_query, + to_key=lora_key, + to_value=lora_value, + to_projection=lora_projection, + to_mlp=lora_mlp, + to_head=lora_head, + ), + io, + train, + eval, + ) + + +def main(fabric: L.Fabric, devices: int, config: Config, io: IOArgs, train: TrainArgs, eval: EvalArgs) -> None: + validate_args(io, train, eval) + + steps_per_epoch = train.epoch_size // devices // train.batch_size(devices) + lr_max_steps = train.epochs * steps_per_epoch + + check_valid_checkpoint_dir(io.checkpoint_dir) + + fabric.seed_everything(1337) # same seed for every process to init model (FSDP) + + if fabric.global_rank == 0: + os.makedirs(io.out_dir, exist_ok=True) + + train_data = torch.load(io.train_data_dir / "train.pt") + val_data = torch.load(io.val_data_dir / "test.pt") + + checkpoint_path = io.checkpoint_dir / "lit_model.pth" + fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}") + with fabric.init_module(empty_init=(devices > 1)): + model = GPT(config) + mark_only_lora_as_trainable(model) + + fabric.print(f"Number of trainable parameters: {num_parameters(model, requires_grad=True):,}") + fabric.print(f"Number of non trainable parameters: {num_parameters(model, requires_grad=False):,}") + + model = fabric.setup_module(model) + + trainable_params = [p for p in model.parameters() if p.requires_grad] + if isinstance(fabric.strategy.precision, BitsandbytesPrecision): + import bitsandbytes as bnb + + optimizer_cls = bnb.optim.PagedAdamW + else: + optimizer_cls = torch.optim.AdamW + optimizer = optimizer_cls( + trainable_params, lr=train.learning_rate, weight_decay=train.weight_decay, betas=(train.beta1, train.beta2) + ) + optimizer = fabric.setup_optimizers(optimizer) + scheduler = get_lr_scheduler(optimizer, warmup_steps=train.lr_warmup_steps, max_steps=lr_max_steps) + + # strict=False because missing keys due to LoRA weights not contained in state dict + load_checkpoint(fabric, model, checkpoint_path, strict=False) + + fabric.seed_everything(1337 + fabric.global_rank) + + train_time = time.perf_counter() + fit(fabric, model, optimizer, scheduler, train_data, val_data, devices, io, train, eval) + fabric.print(f"Training time: {(time.perf_counter()-train_time):.2f}s") + if fabric.device.type == "cuda": + fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") + + # Save the final LoRA checkpoint at the end of training + save_path = io.out_dir / "lit_model_lora_finetuned.pth" + save_lora_checkpoint(fabric, model, save_path) + + +def fit( + fabric: L.Fabric, + model: GPT, + optimizer: torch.optim.Optimizer, + scheduler: torch.optim.lr_scheduler, + train_data: List[Dict], + val_data: List[Dict], + devices: int, + io: IOArgs, + train: TrainArgs, + eval: EvalArgs, +) -> None: + tokenizer = Tokenizer(io.checkpoint_dir) + longest_seq_length, longest_seq_ix = 
get_longest_seq_length(train_data) + model.max_seq_length = min(longest_seq_length, train.max_seq_length or float("inf")) + fabric.print( + f"The longest sequence length in the train data is {longest_seq_length}, the model's maximum sequence length is" + f" {model.max_seq_length} and context length is {model.config.block_size}" + ) + + validate(fabric, model, val_data, tokenizer, dataclasses.replace(eval, max_iters=2), train) # sanity check + + throughput = ThroughputMonitor(fabric, window_size=50) + step_count = 0 + total_lengths = 0 + total_t0 = time.perf_counter() + + for iter_num in range(1, train.max_iters(devices) + 1): + iter_t0 = time.perf_counter() + + input_ids, targets = get_batch( + fabric, train_data, train.micro_batch_size, train.max_seq_length, longest_seq_ix if iter_num == 1 else None + ) + + is_accumulating = iter_num % train.gradient_accumulation_iters(devices) != 0 + with fabric.no_backward_sync(model, enabled=is_accumulating): + logits = model(input_ids, lm_head_chunk_size=128) + # shift the targets such that output n predicts token n+1 + logits[-1] = logits[-1][..., :-1, :] + loss = chunked_cross_entropy(logits, targets[..., 1:]) + fabric.backward(loss / train.gradient_accumulation_iters(devices)) + + if not is_accumulating: + optimizer.step() + optimizer.zero_grad() + scheduler.step() + step_count += 1 + + total_lengths += input_ids.numel() + if iter_num % train.log_interval == 0: + loss_item = loss.item() # expensive device-to-host synchronization + t1 = time.perf_counter() + throughput.update( + time=t1 - total_t0, batches=iter_num, samples=iter_num * train.micro_batch_size, lengths=total_lengths + ) + throughput.compute_and_log(step=iter_num) + fabric.print( + f"iter {iter_num} | step {step_count}: loss {loss_item:.4f}, iter time:" + f" {(t1 - iter_t0) * 1000:.2f} ms{' (optimizer.step)' if not is_accumulating else ''}" + ) + + if not is_accumulating and step_count % eval.interval == 0: + t0 = time.perf_counter() + val_loss = validate(fabric, model, val_data, tokenizer, eval, train) + t1 = time.perf_counter() - t0 + fabric.print(f"iter {iter_num}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f} ms") + fabric.barrier() + if not is_accumulating and step_count % train.save_interval == 0: + checkpoint_path = io.out_dir / f"iter-{iter_num:06d}-ckpt.pth" + save_lora_checkpoint(fabric, model, checkpoint_path) + + +# FSDP has issues with `inference_mode` +@torch.no_grad() +def validate( + fabric: L.Fabric, model: GPT, val_data: List[Dict], tokenizer: Tokenizer, eval: EvalArgs, train: TrainArgs +) -> torch.Tensor: + fabric.print("Validating ...") + model.eval() + losses = torch.zeros(eval.max_iters) + for k in range(eval.max_iters): + input_ids, targets = get_batch(fabric, val_data, train.micro_batch_size, train.max_seq_length) + logits = model(input_ids) + losses[k] = chunked_cross_entropy(logits[..., :-1, :], targets[..., 1:], chunk_size=0) + val_loss = losses.mean() + + # produce an example: + instruction = "Recommend a movie for me to watch during the weekend and explain the reason." 
+ fabric.print(instruction) + sample = {"instruction": instruction, "input": ""} + prompt = generate_prompt(sample) + encoded = tokenizer.encode(prompt, device=fabric.device) + with fabric.init_tensor(): + # do not set `max_seq_length=max_returned_token` because memory is not a concern here + model.set_kv_cache(batch_size=1) + output = generate( + model, encoded, max_returned_tokens=len(encoded) + eval.max_new_tokens, temperature=0.8, eos_id=tokenizer.eos_id + ) + model.clear_kv_cache() + output = tokenizer.decode(output) + fabric.print(output) + + model.train() + return val_loss + + +def get_batch( + fabric: L.Fabric, + data: List[Dict], + micro_batch_size: int, + max_seq_length: Optional[int], + longest_seq_ix: Optional[int] = None, +) -> Tuple[torch.Tensor, torch.Tensor]: + ix = torch.randint(len(data), (micro_batch_size,)) + if longest_seq_ix is not None: + # force the longest sample at the beginning so potential OOMs happen right away + ix[0] = longest_seq_ix + + input_ids = [data[i]["input_ids"].type(torch.int64) for i in ix] + labels = [data[i]["labels"].type(torch.int64) for i in ix] + + # this could be `longest_seq_length` to have a fixed size for all batches + max_len = max(len(s) for s in input_ids) + + def pad_right(x, pad_id): + # pad right based on the longest sequence + n = max_len - len(x) + return torch.cat((x, torch.full((n,), pad_id, dtype=x.dtype))) + + x = torch.stack([pad_right(x, pad_id=0) for x in input_ids]) + y = torch.stack([pad_right(x, pad_id=-1) for x in labels]) + + # Truncate if needed + if max_seq_length: + x = x[:, :max_seq_length] + y = y[:, :max_seq_length] + + if fabric.device.type == "cuda" and x.device.type == "cpu": + x, y = fabric.to_device((x.pin_memory(), y.pin_memory())) + else: + x, y = fabric.to_device((x, y)) + return x, y + + +def get_lr_scheduler(optimizer, warmup_steps: int, max_steps: int): + # linear warmup followed by cosine annealing + scheduler1 = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: step / warmup_steps) + scheduler2 = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=(max_steps - warmup_steps)) + return torch.optim.lr_scheduler.SequentialLR(optimizer, [scheduler1, scheduler2], milestones=[warmup_steps]) + + +def get_longest_seq_length(data: List[Dict]) -> Tuple[int, int]: + # find out the minimum max_seq_length required during fine-tuning (saves memory!) + lengths = [len(d["input_ids"]) for d in data] + longest_seq_length = max(lengths) + longest_seq_ix = lengths.index(longest_seq_length) + return longest_seq_length, longest_seq_ix + + +def save_lora_checkpoint(fabric: L.Fabric, model: torch.nn.Module, file_path: Path) -> None: + fabric.print(f"Saving LoRA weights to {str(file_path)!r}") + fabric.save(file_path, {"model": model}, filter={"model": lora_filter}) + + +def validate_args(io: IOArgs, train: TrainArgs, eval: EvalArgs) -> None: + issues = [] + unsupported = [(train, ["max_tokens", "max_norm"])] + for args, names in unsupported: + for name in names: + if getattr(args, name) is not None: + issues.append(f"{__file__} doesn't support the {name!r} argument. This is set in {args}") + required = [ + (io, ["checkpoint_dir", "train_data_dir", "val_data_dir"]), + (train, ["epoch_size", "epochs"]), + (eval, ["max_new_tokens"]), + ] + for args, names in required: + for name in names: + if getattr(args, name) is None: + issues.append(f"{__file__} requires the {name!r} argument. 
This is set in {args}") + if issues: + raise ValueError("\n".join(issues)) + + +if __name__ == "__main__": + torch.set_float32_matmul_precision("high") + + CLI(setup) diff --git a/llm-lora-finetuning/generate/adapter.py b/llm-lora-finetuning/generate/adapter.py new file mode 100644 index 00000000..15e5df51 --- /dev/null +++ b/llm-lora-finetuning/generate/adapter.py @@ -0,0 +1,119 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +import sys +import time +from pathlib import Path +from typing import Literal, Optional + +import lightning as L +import torch +from lightning.fabric.plugins import BitsandbytesPrecision + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +from generate.base import generate +from lit_gpt import Tokenizer +from lit_gpt.adapter import GPT, Config +from lit_gpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision, lazy_load +from scripts.prepare_alpaca import generate_prompt + + +def main( + prompt: str = "What food do llamas eat?", + input: str = "", + adapter_path: Path = Path("out/adapter/alpaca/lit_model_adapter_finetuned.pth"), + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"]] = None, + max_new_tokens: int = 100, + top_k: Optional[int] = 200, + temperature: float = 0.8, + precision: Optional[str] = None, +) -> None: + """Generates a response based on a given instruction and an optional input. + This script will only work with checkpoints from the instruction-tuned GPT-Adapter model. + See `finetune/adapter.py`. + + Args: + prompt: The prompt/instruction (Alpaca style). + input: Optional input (Alpaca style). + adapter_path: Path to the checkpoint with trained adapter weights, which are the output of + `finetune/adapter.py`. + checkpoint_dir: The path to the checkpoint folder with pretrained GPT weights. + quantize: Whether to quantize the model and using which method: + - bnb.nf4, bnb.nf4-dq, bnb.fp4, bnb.fp4-dq: 4-bit quantization from bitsandbytes + - bnb.int8: 8-bit quantization from bitsandbytes + for more details, see https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/quantize.md + max_new_tokens: The number of generation steps to take. + top_k: The number of top most probable tokens to consider in the sampling process. + temperature: A value controlling the randomness of the sampling process. Higher values result in more random + samples. + precision: Indicates the Fabric precision setting to use. 
+ """ + precision = precision or get_default_supported_precision(training=False) + + plugins = None + if quantize is not None and quantize.startswith("bnb."): + if "mixed" in precision: + raise ValueError("Quantization and mixed precision is not supported.") + dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + plugins = BitsandbytesPrecision(quantize[4:], dtype) + precision = None + + fabric = L.Fabric(devices=1, precision=precision, plugins=plugins) + fabric.launch() + + check_valid_checkpoint_dir(checkpoint_dir) + + config = Config.from_json(checkpoint_dir / "lit_config.json") + + checkpoint_path = checkpoint_dir / "lit_model.pth" + + tokenizer = Tokenizer(checkpoint_dir) + sample = {"instruction": prompt, "input": input} + prompt = generate_prompt(sample) + encoded = tokenizer.encode(prompt, device=fabric.device) + prompt_length = encoded.size(0) + max_returned_tokens = prompt_length + max_new_tokens + + fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr) + t0 = time.perf_counter() + with fabric.init_module(empty_init=True): + model = GPT(config) + fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + with fabric.init_tensor(): + # set the max_seq_length to limit the memory usage to what we need + model.max_seq_length = max_returned_tokens + # enable the kv cache + model.set_kv_cache(batch_size=1) + model.eval() + + t0 = time.perf_counter() + checkpoint = lazy_load(checkpoint_path) + adapter_checkpoint = lazy_load(adapter_path) + checkpoint.update(adapter_checkpoint.get("model", adapter_checkpoint)) + model.load_state_dict(checkpoint) + fabric.print(f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + + model = fabric.setup(model) + + L.seed_everything(1234) + t0 = time.perf_counter() + y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id) + t = time.perf_counter() - t0 + + output = tokenizer.decode(y) + output = output.split("### Response:")[1].strip() + fabric.print(output) + + tokens_generated = y.size(0) - prompt_length + fabric.print(f"\n\nTime for inference: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", file=sys.stderr) + if fabric.device.type == "cuda": + fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", file=sys.stderr) + + +if __name__ == "__main__": + torch.set_float32_matmul_precision("high") + + CLI(main) diff --git a/llm-lora-finetuning/generate/adapter_v2.py b/llm-lora-finetuning/generate/adapter_v2.py new file mode 100644 index 00000000..c799a0ea --- /dev/null +++ b/llm-lora-finetuning/generate/adapter_v2.py @@ -0,0 +1,119 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
+ +import sys +import time +from pathlib import Path +from typing import Literal, Optional + +import lightning as L +import torch +from lightning.fabric.plugins import BitsandbytesPrecision + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +from generate.base import generate +from lit_gpt import Tokenizer +from lit_gpt.adapter_v2 import GPT, Config +from lit_gpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision, lazy_load +from scripts.prepare_alpaca import generate_prompt + + +def main( + prompt: str = "What food do llamas eat?", + input: str = "", + adapter_path: Path = Path("out/adapter_v2/alpaca/lit_model_adapter_finetuned.pth"), + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"]] = None, + max_new_tokens: int = 100, + top_k: Optional[int] = 200, + temperature: float = 0.8, + precision: Optional[str] = None, +) -> None: + """Generates a response based on a given instruction and an optional input. + This script will only work with checkpoints from the instruction-tuned GPT-AdapterV2 model. + See `finetune/adapter_v2.py`. + + Args: + prompt: The prompt/instruction (Alpaca style). + input: Optional input (Alpaca style). + adapter_path: Path to the checkpoint with trained adapter weights, which are the output of + `finetune/adapter_v2.py`. + checkpoint_dir: The path to the checkpoint folder with pretrained GPT weights. + quantize: Whether to quantize the model and using which method: + - bnb.nf4, bnb.nf4-dq, bnb.fp4, bnb.fp4-dq: 4-bit quantization from bitsandbytes + - bnb.int8: 8-bit quantization from bitsandbytes + for more details, see https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/quantize.md + max_new_tokens: The number of generation steps to take. + top_k: The number of top most probable tokens to consider in the sampling process. + temperature: A value controlling the randomness of the sampling process. Higher values result in more random + samples. + precision: Indicates the Fabric precision setting to use. 
+ """ + precision = precision or get_default_supported_precision(training=False) + + plugins = None + if quantize is not None and quantize.startswith("bnb."): + if "mixed" in precision: + raise ValueError("Quantization and mixed precision is not supported.") + dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + plugins = BitsandbytesPrecision(quantize[4:], dtype) + precision = None + + fabric = L.Fabric(devices=1, precision=precision, plugins=plugins) + fabric.launch() + + check_valid_checkpoint_dir(checkpoint_dir) + + config = Config.from_json(checkpoint_dir / "lit_config.json") + + checkpoint_path = checkpoint_dir / "lit_model.pth" + + tokenizer = Tokenizer(checkpoint_dir) + sample = {"instruction": prompt, "input": input} + prompt = generate_prompt(sample) + encoded = tokenizer.encode(prompt, device=fabric.device) + prompt_length = encoded.size(0) + max_returned_tokens = prompt_length + max_new_tokens + + fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr) + t0 = time.perf_counter() + with fabric.init_module(empty_init=True): + model = GPT(config) + fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + with fabric.init_tensor(): + # set the max_seq_length to limit the memory usage to what we need + model.max_seq_length = max_returned_tokens + # enable the kv cache + model.set_kv_cache(batch_size=1) + model.eval() + + t0 = time.perf_counter() + checkpoint = lazy_load(checkpoint_path) + adapter_checkpoint = lazy_load(adapter_path) + checkpoint.update(adapter_checkpoint.get("model", adapter_checkpoint)) + model.load_state_dict(checkpoint) + fabric.print(f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + + model = fabric.setup(model) + + L.seed_everything(1234) + t0 = time.perf_counter() + y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id) + t = time.perf_counter() - t0 + + output = tokenizer.decode(y) + output = output.split("### Response:")[1].strip() + fabric.print(output) + + tokens_generated = y.size(0) - prompt_length + fabric.print(f"\n\nTime for inference: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", file=sys.stderr) + if fabric.device.type == "cuda": + fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", file=sys.stderr) + + +if __name__ == "__main__": + torch.set_float32_matmul_precision("high") + + CLI(main) diff --git a/llm-lora-finetuning/generate/base.py b/llm-lora-finetuning/generate/base.py new file mode 100644 index 00000000..3cf75715 --- /dev/null +++ b/llm-lora-finetuning/generate/base.py @@ -0,0 +1,193 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
+ +import sys +import time +from pathlib import Path +from typing import Any, Literal, Optional + +import lightning as L +import torch +import torch._dynamo.config +import torch._inductor.config +from lightning.fabric.plugins import BitsandbytesPrecision + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +from lit_gpt import GPT, Config, Tokenizer +from lit_gpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision, load_checkpoint + + +def multinomial_num_samples_1(probs: torch.Tensor) -> torch.Tensor: + if torch._dynamo.is_compiling(): + # Faster alternative to `torch.multinomial(probs, num_samples=1)` that is also CUDAGraph friendly + distribution = torch.empty_like(probs).exponential_(1) + return torch.argmax(probs / distribution, dim=-1, keepdim=True) + return torch.multinomial(probs, num_samples=1) + + +def sample(logits: torch.Tensor, temperature: float = 1.0, top_k: Optional[int] = None) -> torch.Tensor: + logits = logits[0, -1] + # optionally crop the logits to only the top k options + if top_k is not None: + v, i = torch.topk(logits, min(top_k, logits.size(-1))) + # do not use `torch.where` as in nanogpt because it will repeat top-k collisions + logits = torch.full_like(logits, float("-inf")).scatter_(-1, i, v) + # optionally scale the logits and sample from a probability distribution + if temperature > 0.0: + probs = torch.nn.functional.softmax(logits / temperature, dim=-1) + return multinomial_num_samples_1(probs) + return torch.argmax(logits, dim=-1, keepdim=True) + + +def next_token(model: GPT, input_pos: torch.Tensor, x: torch.Tensor, **kwargs: Any) -> torch.Tensor: + logits = model(x, input_pos) + next = sample(logits, **kwargs) + return next.to(dtype=x.dtype) + + +@torch.inference_mode() +def generate( + model: GPT, + prompt: torch.Tensor, + max_returned_tokens: int, + *, + temperature: float = 1.0, + top_k: Optional[int] = None, + eos_id: Optional[int] = None, +) -> torch.Tensor: + """Takes a conditioning sequence (prompt) as input and continues to generate as many tokens as requested. + + The implementation of this function is modified from A. Karpathy's nanoGPT. + + Args: + model: The model to use. + prompt: Tensor of shape (T) with indices of the prompt sequence. + max_returned_tokens: The maximum number of tokens to return (given plus generated). + temperature: Scales the predicted logits by 1 / temperature. + top_k: If specified, only sample among the tokens with the k highest probabilities. + eos_id: If specified, stop generating any more token once the token is triggered. + """ + T = prompt.size(0) + assert max_returned_tokens > T + if model.max_seq_length < max_returned_tokens - 1: + # rolling the kv cache based on the `input_pos` value would be necessary. However, doing so would introduce a + # data dependency on the `input_pos` tensor and impact model compilation. 
Since this setting is uncommon, we do + # not support it to avoid negatively impacting the overall speed + raise NotImplementedError(f"max_seq_length {model.max_seq_length} needs to be >= {max_returned_tokens - 1}") + + device = prompt.device + tokens = [prompt] + input_pos = torch.tensor([T], device=device) + token = next_token( + model, torch.arange(0, T, device=device), prompt.view(1, -1), temperature=temperature, top_k=top_k + ).clone() + tokens.append(token) + for _ in range(2, max_returned_tokens - T + 1): + token = next_token(model, input_pos, token.view(1, -1), temperature=temperature, top_k=top_k).clone() + tokens.append(token) + if token == eos_id: + break + input_pos = input_pos.add_(1) + return torch.cat(tokens) + + +@torch.inference_mode() +def main( + prompt: str = "What food do llamas eat?", + *, + num_samples: int = 1, + max_new_tokens: int = 50, + top_k: Optional[int] = 200, + temperature: float = 0.8, + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"]] = None, + precision: Optional[str] = None, + compile: bool = False, +) -> None: + """Generates text samples based on a pre-trained model and tokenizer. + + Args: + prompt: The prompt string to use for generating the samples. + num_samples: The number of text samples to generate. + max_new_tokens: The number of generation steps to take. + top_k: The number of top most probable tokens to consider in the sampling process. + temperature: A value controlling the randomness of the sampling process. Higher values result in more random + samples. + checkpoint_dir: The checkpoint directory to load. + quantize: Whether to quantize the model and using which method: + - bnb.nf4, bnb.nf4-dq, bnb.fp4, bnb.fp4-dq: 4-bit quantization from bitsandbytes + - bnb.int8: 8-bit quantization from bitsandbytes + for more details, see https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/quantize.md + precision: Indicates the Fabric precision setting to use. + compile: Whether to compile the model. 
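+
+ Example (illustrative invocation; the checkpoint directory is the default above and is
+ expected to have been prepared beforehand, e.g. with `scripts/download.py` and
+ `scripts/convert_hf_checkpoint.py`):
+     python generate/base.py --prompt "Hello, my name is" --num_samples 2 --max_new_tokens 50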
+ """ + precision = precision or get_default_supported_precision(training=False) + + plugins = None + if quantize is not None and quantize.startswith("bnb."): + if "mixed" in precision: + raise ValueError("Quantization and mixed precision is not supported.") + dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + plugins = BitsandbytesPrecision(quantize[4:], dtype) + precision = None + + fabric = L.Fabric(devices=1, precision=precision, plugins=plugins) + + check_valid_checkpoint_dir(checkpoint_dir) + + config = Config.from_json(checkpoint_dir / "lit_config.json") + + checkpoint_path = checkpoint_dir / "lit_model.pth" + + tokenizer = Tokenizer(checkpoint_dir) + encoded = tokenizer.encode(prompt, device=fabric.device) + prompt_length = encoded.size(0) + max_returned_tokens = prompt_length + max_new_tokens + + fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr) + t0 = time.perf_counter() + with fabric.init_module(empty_init=True): + model = GPT(config) + fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + with fabric.init_tensor(): + # set the max_seq_length to limit the memory usage to what we need + model.max_seq_length = max_returned_tokens + # enable the kv cache + model.set_kv_cache(batch_size=1) + model.eval() + + if compile: + torch._dynamo.config.automatic_dynamic_shapes = True + torch._inductor.config.triton.unique_kernel_names = True + torch._inductor.config.coordinate_descent_tuning = True + global next_token + next_token = torch.compile(next_token, mode="reduce-overhead") + + model = fabric.setup_module(model) + + t0 = time.perf_counter() + load_checkpoint(fabric, model, checkpoint_path) + fabric.print(f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + + L.seed_everything(1234) + for i in range(num_samples): + t0 = time.perf_counter() + y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id) + t = time.perf_counter() - t0 + for block in model.transformer.h: + block.attn.kv_cache.reset_parameters() + fabric.print(tokenizer.decode(y)) + tokens_generated = y.size(0) - prompt_length + fabric.print( + f"Time for inference {i + 1}: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", file=sys.stderr + ) + if fabric.device.type == "cuda": + fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", file=sys.stderr) + + +if __name__ == "__main__": + torch.set_float32_matmul_precision("high") + + CLI(main) diff --git a/llm-lora-finetuning/generate/full.py b/llm-lora-finetuning/generate/full.py new file mode 100644 index 00000000..ca1554e4 --- /dev/null +++ b/llm-lora-finetuning/generate/full.py @@ -0,0 +1,115 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
+ +import sys +import time +from pathlib import Path +from typing import Literal, Optional + +import lightning as L +import torch +from lightning.fabric.plugins import BitsandbytesPrecision + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +from generate.base import generate +from lit_gpt import GPT, Config, Tokenizer +from lit_gpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision, load_checkpoint +from scripts.prepare_alpaca import generate_prompt + + +def main( + prompt: str = "What food do llamas eat?", + input: str = "", + finetuned_path: Path = Path("out/full/alpaca/lit_model_finetuned.pth"), + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"]] = None, + max_new_tokens: int = 100, + top_k: Optional[int] = 200, + temperature: float = 0.8, + precision: Optional[str] = None, +) -> None: + """Generates a response based on a given instruction and an optional input. + This script will only work with checkpoints from the instruction-tuned GPT model. + See `finetune/full.py`. + + Args: + prompt: The prompt/instruction (Alpaca style). + input: Optional input (Alpaca style). + finetuned_path: Path to the checkpoint with trained weights, which are the output of + `finetune/full.py`. + checkpoint_dir: The path to the checkpoint folder with pretrained GPT weights. + quantize: Whether to quantize the model and using which method: + - bnb.nf4, bnb.nf4-dq, bnb.fp4, bnb.fp4-dq: 4-bit quantization from bitsandbytes + - bnb.int8: 8-bit quantization from bitsandbytes + for more details, see https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/quantize.md + max_new_tokens: The number of generation steps to take. + top_k: The number of top most probable tokens to consider in the sampling process. + temperature: A value controlling the randomness of the sampling process. Higher values result in more random + samples. + precision: Indicates the Fabric precision setting to use. 
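+
+ Note that, unlike the adapter/LoRA scripts, the model weights themselves are loaded from
+ `finetuned_path`, while `checkpoint_dir` still provides the tokenizer files and `lit_config.json`.
+ Example (illustrative, using the defaults above):
+     python generate/full.py --prompt "What food do llamas eat?" \
+         --finetuned_path out/full/alpaca/lit_model_finetuned.pth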
+ """ + precision = precision or get_default_supported_precision(training=False) + + plugins = None + if quantize is not None and quantize.startswith("bnb."): + if "mixed" in precision: + raise ValueError("Quantization and mixed precision is not supported.") + dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + plugins = BitsandbytesPrecision(quantize[4:], dtype) + precision = None + + fabric = L.Fabric(devices=1, precision=precision, plugins=plugins) + fabric.launch() + + check_valid_checkpoint_dir(checkpoint_dir) + + config = Config.from_json(checkpoint_dir / "lit_config.json") + + checkpoint_path = finetuned_path + + tokenizer = Tokenizer(checkpoint_dir) + sample = {"instruction": prompt, "input": input} + prompt = generate_prompt(sample) + encoded = tokenizer.encode(prompt, device=fabric.device) + prompt_length = encoded.size(0) + max_returned_tokens = prompt_length + max_new_tokens + + fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr) + t0 = time.perf_counter() + with fabric.init_module(empty_init=True): + model = GPT(config) + fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + with fabric.init_tensor(): + # set the max_seq_length to limit the memory usage to what we need + model.max_seq_length = max_returned_tokens + # enable the kv cache + model.set_kv_cache(batch_size=1) + model.eval() + + model = fabric.setup(model) + + t0 = time.perf_counter() + load_checkpoint(fabric, model, checkpoint_path) + fabric.print(f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + + L.seed_everything(1234) + t0 = time.perf_counter() + y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id) + t = time.perf_counter() - t0 + + output = tokenizer.decode(y) + output = output.split("### Response:")[1].strip() + fabric.print(output) + + tokens_generated = y.size(0) - prompt_length + fabric.print(f"\n\nTime for inference: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", file=sys.stderr) + if fabric.device.type == "cuda": + fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", file=sys.stderr) + + +if __name__ == "__main__": + torch.set_float32_matmul_precision("high") + + CLI(main) diff --git a/llm-lora-finetuning/generate/lora.py b/llm-lora-finetuning/generate/lora.py new file mode 100644 index 00000000..006b75ba --- /dev/null +++ b/llm-lora-finetuning/generate/lora.py @@ -0,0 +1,140 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
+ +import sys +import time +from pathlib import Path +from typing import Literal, Optional + +import lightning as L +import torch +from lightning.fabric.plugins import BitsandbytesPrecision + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +from generate.base import generate +from lit_gpt import Tokenizer +from lit_gpt.lora import GPT, Config, merge_lora_weights +from lit_gpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision, lazy_load +from scripts.prepare_alpaca import generate_prompt + + +def main( + prompt: str = "What food do llamas eat?", + input: str = "", + lora_path: Path = Path("out/lora/alpaca/lit_model_lora_finetuned.pth"), + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"]] = None, + max_new_tokens: int = 100, + top_k: Optional[int] = 200, + temperature: float = 0.8, + precision: Optional[str] = None, + lora_r: int = 8, + lora_alpha: int = 16, + lora_dropout: float = 0.05, + lora_query: bool = True, + lora_key: bool = False, + lora_value: bool = True, + lora_projection: bool = False, + lora_mlp: bool = False, + lora_head: bool = False, +) -> None: + """Generates a response based on a given instruction and an optional input. + This script will only work with checkpoints from the instruction-tuned GPT-LoRA model. + See `finetune/lora.py`. + + Args: + prompt: The prompt/instruction (Alpaca style). + input: Optional input (Alpaca style). + lora_path: Path to the checkpoint with trained adapter weights, which are the output of + `finetune/lora.py`. + checkpoint_dir: The path to the checkpoint folder with pretrained GPT weights. + quantize: Whether to quantize the model and using which method: + - bnb.nf4, bnb.nf4-dq, bnb.fp4, bnb.fp4-dq: 4-bit quantization from bitsandbytes + - bnb.int8: 8-bit quantization from bitsandbytes + for more details, see https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/quantize.md + max_new_tokens: The number of generation steps to take. + top_k: The number of top most probable tokens to consider in the sampling process. + temperature: A value controlling the randomness of the sampling process. Higher values result in more random + samples. + precision: Indicates the Fabric precision setting to use. 
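+ lora_r: The LoRA rank used during fine-tuning; must match the value passed to `finetune/lora.py`
+ so that the checkpoint keys line up.
+ lora_alpha: The LoRA alpha (scaling) value used during fine-tuning.
+ lora_dropout: The LoRA dropout value used during fine-tuning.
+ lora_query, lora_key, lora_value, lora_projection, lora_mlp, lora_head: Which submodules had
+ LoRA applied during fine-tuning; these flags must also match the fine-tuning configuration.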
+ """ + precision = precision or get_default_supported_precision(training=False) + + plugins = None + if quantize is not None and quantize.startswith("bnb."): + if "mixed" in precision: + raise ValueError("Quantization and mixed precision is not supported.") + dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + plugins = BitsandbytesPrecision(quantize[4:], dtype) + precision = None + + fabric = L.Fabric(devices=1, precision=precision, plugins=plugins) + fabric.launch() + + check_valid_checkpoint_dir(checkpoint_dir) + + config = Config.from_json( + checkpoint_dir / "lit_config.json", + r=lora_r, + alpha=lora_alpha, + dropout=lora_dropout, + to_query=lora_query, + to_key=lora_key, + to_value=lora_value, + to_projection=lora_projection, + to_mlp=lora_mlp, + to_head=lora_head, + ) + + checkpoint_path = checkpoint_dir / "lit_model.pth" + + tokenizer = Tokenizer(checkpoint_dir) + sample = {"instruction": prompt, "input": input} + prompt = generate_prompt(sample) + encoded = tokenizer.encode(prompt, device=fabric.device) + prompt_length = encoded.size(0) + max_returned_tokens = prompt_length + max_new_tokens + + fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr) + t0 = time.perf_counter() + with fabric.init_module(empty_init=True): + model = GPT(config) + fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + with fabric.init_tensor(): + # set the max_seq_length to limit the memory usage to what we need + model.max_seq_length = max_returned_tokens + # enable the kv cache + model.set_kv_cache(batch_size=1) + model.eval() + + t0 = time.perf_counter() + checkpoint = lazy_load(checkpoint_path) + lora_checkpoint = lazy_load(lora_path) + checkpoint.update(lora_checkpoint.get("model", lora_checkpoint)) + model.load_state_dict(checkpoint) + fabric.print(f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + + merge_lora_weights(model) + model = fabric.setup(model) + + L.seed_everything(1234) + t0 = time.perf_counter() + y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id) + t = time.perf_counter() - t0 + + output = tokenizer.decode(y) + output = output.split("### Response:")[1].strip() + fabric.print(output) + + tokens_generated = y.size(0) - prompt_length + fabric.print(f"\n\nTime for inference: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", file=sys.stderr) + if fabric.device.type == "cuda": + fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", file=sys.stderr) + + +if __name__ == "__main__": + torch.set_float32_matmul_precision("high") + + CLI(main) diff --git a/llm-lora-finetuning/generate/sequentially.py b/llm-lora-finetuning/generate/sequentially.py new file mode 100644 index 00000000..cd1b1942 --- /dev/null +++ b/llm-lora-finetuning/generate/sequentially.py @@ -0,0 +1,231 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
+ +import itertools +import logging +import re +import sys +import time +from collections import OrderedDict +from functools import partial +from pathlib import Path +from typing import Literal, Optional + +import lightning as L +import torch +from lightning.fabric.accelerators import CUDAAccelerator +from lightning.fabric.plugins import BitsandbytesPrecision +from lightning.fabric.utilities.init import _materialize_meta_tensors +from typing_extensions import Type + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +import generate.base as generate_base +from lit_gpt import GPT, Config, Tokenizer +from lit_gpt.model import Block, build_mask_cache +from lit_gpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision + + +@torch.inference_mode() +def sequential(model: GPT, root: torch.device, max_seq_length: int, devices: int): + if model.config.n_layer % devices: + # TODO: support smarter partitioning schemes + raise NotImplementedError( + f"Only balanced partitioning is implemented: n_layer={model.config.n_layer}, devices {devices}" + ) + layers_per_rank = model.config.n_layer // devices + # dictates where each block should be instantiated + mapping = layer_to_device(model, chunk_on=Block, chunk_size=layers_per_rank) + + # materialize each block on the appropriate device + for path, target_index in mapping.items(): + submodule = model.get_submodule(path) + target_device = torch.device(root.type, target_index) + print(f"Moving {path!r} to {target_device}", file=sys.stderr) + # submodules loaded by the checkpoint will be on CPU (if no quantization). move them + replace_device(submodule, replace=torch.device("cpu"), by=target_device) + # in case the checkpoint was partial, materialize leftover metas + _materialize_meta_tensors(submodule, target_device) + # and build the kv cache + submodule.attn.kv_cache = submodule.attn.build_kv_cache(1, max_seq_length, model.cos.size(-1), target_device) + # rebuild odd ends + with root: + model.max_seq_length = max_seq_length + # the rope cache which is on meta device + model.cos, model.sin = model.rope_cache() + # the mask cache which cannot be created with `set_kv_cache` because that will set it for all layers + model.mask_cache = build_mask_cache(max_seq_length) + # and everything that is not a block in the root + _materialize_meta_tensors(model, root) + replace_device(model, replace=torch.device("cpu"), by=root) + + if devices > 1: + # install hooks to move layer inputs/output between devices + for layer_num, (path, target_index) in enumerate(mapping.items()): + submodule = model.get_submodule(path) + if layer_num >= layers_per_rank: + # we need to move the block input on the boundaries between devices + # and also on every non-root device because the RoPE and mask cache is shared + # TODO: the second case could be optimized and then we would only need this hook for + # `layer_num in [layers_per_rank * i - 1 for i in range(1, devices + 1)]` + target_device = torch.device(root.type, target_index) + submodule.register_forward_pre_hook(partial(move_block_input, target_device)) + if layer_num == model.config.n_layer - 1: + submodule.register_forward_hook(partial(move_block_output, root)) + + return model + + +def layer_to_device( + module: torch.nn.Module, chunk_on: Type[torch.nn.Module], chunk_size: int +) -> "OrderedDict[str, int]": + """Create a mapping from layer (block) to device.""" + # this assumes that the definition order is the same as the execution 
order + hits = [name for name, submodule in module.named_modules() if isinstance(submodule, chunk_on)] + return OrderedDict((name, i // chunk_size) for i, name in enumerate(hits)) + + +def move_block_input(device: torch.device, module: torch.nn.Module, ins): + """``forward_pre_hook`` to move a Block's input before forward.""" + # during inference, none of the inputs are None: x, cos, sin, mask, input_pos + return tuple(t.to(device) for t in ins) + + +def move_block_output(device: torch.device, module: torch.nn.Module, ins, outs) -> torch.Tensor: + """``forward_hook`` to move a Block's output after forward.""" + return outs.to(device) + + +def replace_device(module: torch.nn.Module, replace: torch.device, by: torch.device) -> torch.nn.Module: + for name, submodule in module.named_modules(): + tensors = dict( + itertools.chain(submodule.named_parameters(recurse=False), submodule.named_buffers(recurse=False)) + ) + if not tensors: + continue + devices = {t.device for t in tensors.values()} + if len(devices) != 1: + # since this is using `submodule.to`, different devices in the same submodule is a problem + path_to_device = {f"{name}.{p}": t.device for p, t in tensors.items()} + raise ValueError(f"Found multiple devices: {path_to_device}") + if devices.pop() == replace: + submodule.to(by) + return module + + +@torch.inference_mode() +def main( + prompt: str = "What food do llamas eat?", + *, + num_samples: int = 1, + max_new_tokens: int = 50, + top_k: Optional[int] = 200, + temperature: float = 0.8, + checkpoint_dir: Path = Path("checkpoints/mistralai/Mistral-7B-Instruct-v0.1"), + quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq"]] = None, + precision: Optional[str] = None, + compile: bool = False, +) -> None: + """Generates text samples based on a pre-trained model and tokenizer. + + Args: + prompt: The prompt string to use for generating the samples. + num_samples: The number of text samples to generate. + max_new_tokens: The number of generation steps to take. + top_k: The number of top most probable tokens to consider in the sampling process. + temperature: A value controlling the randomness of the sampling process. Higher values result in more random + samples. + checkpoint_dir: The checkpoint directory to load. + quantize: Whether to quantize the model and using which method: + - bnb.nf4, bnb.nf4-dq, bnb.fp4, bnb.fp4-dq: 4-bit quantization from bitsandbytes + for more details, see https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/quantize.md + precision: Indicates the Fabric precision setting to use. + compile: Whether to compile the model. 
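+
+ Note: the transformer blocks are partitioned evenly across all visible CUDA devices, so
+ `n_layer` must be divisible by the device count. Example (illustrative; restricting the run
+ to two GPUs via `CUDA_VISIBLE_DEVICES` is an assumption about the environment, not something
+ this script sets itself):
+     CUDA_VISIBLE_DEVICES=0,1 python generate/sequentially.py \
+         --checkpoint_dir checkpoints/mistralai/Mistral-7B-Instruct-v0.1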
+ """ + precision = precision or get_default_supported_precision(training=False) + + plugins = None + if quantize is not None: + if compile: + raise NotImplementedError # untested + if "mixed" in precision: + raise ValueError("Quantization and mixed precision is not supported.") + dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + plugins = BitsandbytesPrecision(quantize[4:], dtype) + precision = None + + fabric = L.Fabric(devices=1, precision=precision, accelerator="cuda", plugins=plugins) + + total_devices = CUDAAccelerator.auto_device_count() + print(f"Using {total_devices} devices", file=sys.stderr) + + check_valid_checkpoint_dir(checkpoint_dir) + + config = Config.from_json(checkpoint_dir / "lit_config.json") + + checkpoint_path = checkpoint_dir / "lit_model.pth" + + tokenizer = Tokenizer(checkpoint_dir) + encoded = tokenizer.encode(prompt, device=fabric.device) + prompt_length = encoded.size(0) + max_returned_tokens = prompt_length + max_new_tokens + + print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr) + t0 = time.perf_counter() + # cannot use `init_module` because if bitsandbytes is used, the Linear layers will be replaced + # which means that the weights will get quantized on cuda:0 on checkpoint load. we need to load and then convert + # still, use init_tensor for the precision + with fabric.init_tensor(), torch.device("meta"): + model = GPT(config) + print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + + t0 = time.perf_counter() + state_dict = torch.load(str(checkpoint_path), mmap=True, map_location="cpu") + # TODO: this assumes that the model fits on CPU. Use lazy_load and make the materialization checkpoint aware + model.load_state_dict(state_dict, assign=True) + print(f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + + model = fabric.setup_module(model, move_to_device=False) + + t0 = time.perf_counter() + model = sequential(model, fabric.device, max_returned_tokens, total_devices) + print(f"Time to sequential-ize the model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + + if compile: + # TODO: raises an internal compile AssertionError caused by fabric.strategy.precision.forward_context + raise NotImplementedError + # silence developer warning on nightly builds + # https://github.com/pytorch/pytorch/blob/v2.2.0-rc5/torch/_inductor/ir.py#L4166 + pattern = re.compile(".*DeviceCopy in input program.*") + logging.getLogger("torch._inductor.utils").addFilter(lambda record: not pattern.search(record.getMessage())) + torch._dynamo.config.automatic_dynamic_shapes = True + torch._inductor.config.triton.unique_kernel_names = True + torch._inductor.config.coordinate_descent_tuning = True + # cannot use cudagraphs because it doesn't support multiple device indices + # https://github.com/pytorch/pytorch/blob/v2.2.0-rc5/torch/_inductor/compile_fx.py#L371-L375 + generate_base.next_token = torch.compile(generate_base.next_token) + + L.seed_everything(1234) + for i in range(num_samples): + t0 = time.perf_counter() + y = generate_base.generate( + model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id + ) + t = time.perf_counter() - t0 + for block in model.transformer.h: + block.attn.kv_cache.reset_parameters() + print(tokenizer.decode(y)) + tokens_generated = y.size(0) - prompt_length + print( + f"Time for inference {i + 1}: {t:.02f} sec total, 
{tokens_generated / t:.02f} tokens/sec", file=sys.stderr + ) + print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", file=sys.stderr) + + +if __name__ == "__main__": + torch.set_float32_matmul_precision("high") + + logging.getLogger("lightning.fabric.plugins.precision.bitsandbytes").setLevel(logging.DEBUG) + + CLI(main) diff --git a/llm-lora-finetuning/generate/tp.py b/llm-lora-finetuning/generate/tp.py new file mode 100644 index 00000000..abd93cc1 --- /dev/null +++ b/llm-lora-finetuning/generate/tp.py @@ -0,0 +1,225 @@ +"""Tensor-parallel implementation adapted from https://github.com/pytorch-labs/gpt-fast/blob/14df27/tp.py""" + +import logging +import sys +import time +from functools import partial +from pathlib import Path +from typing import Literal, Optional, Union + +import lightning as L +import torch +import torch._dynamo.config +import torch._inductor.config +from lightning.fabric.plugins import BitsandbytesPrecision +from lightning.fabric.utilities import rank_zero_only +from torch.distributed._functional_collectives import all_reduce + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +import generate.base as generate_base +from lit_gpt import GPT, Config, Tokenizer +from lit_gpt.model import CausalSelfAttention, GptNeoxMLP, LLaMAMLP, LLaMAMoE +from lit_gpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision + + +def tensor_parallel_linear(fabric: L.Fabric, linear: torch.nn.Linear, style: str) -> None: + world_size = fabric.world_size + dim, attr = {"colwise": (0, "out_features"), "rowwise": (1, "in_features")}[style] + size = getattr(linear, attr) + if size % world_size != 0: + raise ValueError( + f"This linear's {attr} value ({size}) is not evenly divisible by the world size ({world_size})" + ) + + shard = torch.tensor_split(linear.weight, world_size, dim=dim)[fabric.global_rank] + # overwrite `.data` instead of recreating the parameter for quantization (bitsandbytes) support. 
+ # the bitsandbytes linear classes use custom `torch.nn.Parameter` subclasses + linear.weight.data = shard + setattr(linear, attr, shard.size(dim)) + + if linear.bias is not None and dim == 0: + shard = torch.tensor_split(linear.bias, world_size)[fabric.global_rank] + linear.bias = torch.nn.Parameter(shard, requires_grad=linear.bias.requires_grad) + + +def tensor_parallel_mlp(fabric: L.Fabric, mlp: Union[GptNeoxMLP, LLaMAMLP, LLaMAMoE]) -> None: + if isinstance(mlp, LLaMAMLP): + tensor_parallel_linear(fabric, mlp.fc_1, "colwise") + tensor_parallel_linear(fabric, mlp.fc_2, "colwise") + tensor_parallel_linear(fabric, mlp.proj, "rowwise") + mlp.register_forward_hook(partial(all_reduce_output, fabric.world_size)) + elif isinstance(mlp, GptNeoxMLP): + tensor_parallel_linear(fabric, mlp.fc, "colwise") + tensor_parallel_linear(fabric, mlp.proj, "rowwise") + mlp.register_forward_hook(partial(all_reduce_output, fabric.world_size)) + elif isinstance(mlp, LLaMAMoE): + # we use expert slicing across ranks, alternatively, we could create a expert parallelism group + # when the number of experts is a multiple of the world size + for expert in mlp.experts: + tensor_parallel_mlp(fabric, expert) + else: + raise NotImplementedError + + +def tensor_parallel_attn(fabric: L.Fabric, attn: CausalSelfAttention) -> None: + tensor_parallel_linear(fabric, attn.attn, "colwise") + tensor_parallel_linear(fabric, attn.proj, "rowwise") + attn.register_forward_hook(partial(all_reduce_output, fabric.world_size)) + + +def all_reduce_output(world_size: int, module: torch.nn.Module, ins, outs) -> torch.Tensor: + return all_reduce(outs, "sum", list(range(world_size))) + + +def tensor_parallel(fabric: L.Fabric, model: GPT) -> GPT: + for block in model.transformer.h: + tensor_parallel_mlp(fabric, block.mlp) + tensor_parallel_attn(fabric, block.attn) + + # update the config values to the shard sizes + # this is only relevant for `tensor_parallel_attn`, but it needs to run only once + world_size = fabric.world_size + attrs = ["n_head", "n_embd", "n_query_groups"] + for attr in attrs: + size = getattr(model.config, attr) + if size % world_size != 0: + raise ValueError(f"This {attr} value ({size}) is not evenly divisible by the world size ({world_size})") + setattr(model.config, attr, size // world_size) + + return model + + +@torch.inference_mode() +def main( + prompt: str = "What food do llamas eat?", + *, + num_samples: int = 1, + max_new_tokens: int = 50, + top_k: Optional[int] = 200, + temperature: float = 0.8, + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq"]] = None, + precision: Optional[str] = None, + compile: bool = False, +) -> None: + """Generates text samples based on a pre-trained model and tokenizer. + + Args: + prompt: The prompt string to use for generating the samples. + num_samples: The number of text samples to generate. + max_new_tokens: The number of generation steps to take. + top_k: The number of top most probable tokens to consider in the sampling process. + temperature: A value controlling the randomness of the sampling process. Higher values result in more random + samples. + checkpoint_dir: The checkpoint directory to load. 
+ quantize: Whether to quantize the model and using which method: + - bnb.nf4, bnb.nf4-dq, bnb.fp4, bnb.fp4-dq: 4-bit quantization from bitsandbytes + for more details, see https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/quantize.md + precision: Indicates the Fabric precision setting to use. + compile: Whether to compile the model. + """ + precision = precision or get_default_supported_precision(training=False) + + plugins = None + if quantize is not None: + if compile: + raise NotImplementedError # untested + if "mixed" in precision: + raise ValueError("Quantization and mixed precision is not supported.") + dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + plugins = BitsandbytesPrecision(quantize[4:], dtype) + precision = None + + # set "ddp" as the strategy for the launching functionality, but there's no data-parallelism + fabric = L.Fabric(devices="auto", strategy="ddp", precision=precision, plugins=plugins) + fabric.launch() + + check_valid_checkpoint_dir(checkpoint_dir) + + config = Config.from_json(checkpoint_dir / "lit_config.json") + + model_file = "lit_model.pth" + checkpoint_path = checkpoint_dir / model_file + + tokenizer = Tokenizer(checkpoint_dir) + encoded = tokenizer.encode(prompt, device=fabric.device) + prompt_length = encoded.size(0) + max_returned_tokens = prompt_length + max_new_tokens + + fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr) + t0 = time.perf_counter() + # cannot use `init_module` because if bitsandbytes is used, the Linear layers will be replaced + # which means that the weights will get quantized on cuda:0 on checkpoint load. we need to load and then convert + # still, use init_tensor for the precision + with fabric.init_tensor(), torch.device("meta"): + model = GPT(config) + fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + + # sequentially do: load the checkpoint on CPU -> quantize -> apply tp -> move to device + # so that the CPU RAM doesn't OOM with larger models + for rank in range(fabric.world_size): + if fabric.global_rank == rank: + t0 = time.perf_counter() + state_dict = torch.load(str(checkpoint_path), mmap=True, map_location="cpu") + model.load_state_dict(state_dict, assign=True) + print(f"[{rank}] Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + + # cannot use `.setup_module` because it will wrap with DDP + model = fabric._precision.convert_module(model) + + t0 = time.perf_counter() + model = tensor_parallel(fabric, model) + print( + f"[{rank}] Time to tensor-parallelize the model: {time.perf_counter() - t0:.02f} seconds.", + file=sys.stderr, + ) + + with fabric.init_tensor(): + # set the max_seq_length to limit the memory usage to what we need + model.max_seq_length = max_returned_tokens + # the rope cache which is on meta device + model.cos, model.sin = model.rope_cache() + # enable the kv cache + model.set_kv_cache(batch_size=1) + model.eval() + + t0 = time.perf_counter() + model = fabric.to_device(model) + print(f"[{rank}] Time to move the model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + fabric.barrier() + + if compile: + torch._dynamo.config.automatic_dynamic_shapes = True + torch._inductor.config.triton.unique_kernel_names = True + torch._inductor.config.coordinate_descent_tuning = True + generate_base.next_token = torch.compile(generate_base.next_token, mode="reduce-overhead") + + L.seed_everything(1234) + for 
i in range(num_samples): + t0 = time.perf_counter() + y = generate_base.generate( + model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id + ) + t = time.perf_counter() - t0 + for block in model.transformer.h: + block.attn.kv_cache.reset_parameters() + fabric.print(tokenizer.decode(y)) + tokens_generated = y.size(0) - prompt_length + fabric.print( + f"Time for inference {i + 1}: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", file=sys.stderr + ) + if fabric.device.type == "cuda": + fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", file=sys.stderr) + + +if __name__ == "__main__": + torch.set_float32_matmul_precision("high") + + bnb_logger = logging.getLogger("lightning.fabric.plugins.precision.bitsandbytes") + bnb_logger.setLevel(logging.DEBUG) + bnb_logger.debug = rank_zero_only(bnb_logger.debug) + + CLI(main) diff --git a/llm-lora-finetuning/lit_gpt/__init__.py b/llm-lora-finetuning/lit_gpt/__init__.py new file mode 100644 index 00000000..856e7cd6 --- /dev/null +++ b/llm-lora-finetuning/lit_gpt/__init__.py @@ -0,0 +1,27 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +import re +import logging + +from lit_gpt.model import GPT +from lit_gpt.config import Config +from lit_gpt.tokenizer import Tokenizer + +from lightning_utilities.core.imports import RequirementCache + +_LIGHTNING_AVAILABLE = RequirementCache("lightning>=2.2.0.dev0") +if not bool(_LIGHTNING_AVAILABLE): + raise ImportError( + "Lit-GPT requires lightning nightly. Please run:\n" + f" pip uninstall -y lightning; pip install -r requirements.txt\n{str(_LIGHTNING_AVAILABLE)}" + ) + +# Suppress excessive warnings, see https://github.com/pytorch/pytorch/issues/111632 +pattern = re.compile(".*Profiler function .* will be ignored") +logging.getLogger("torch._dynamo.variables.torch").addFilter(lambda record: not pattern.search(record.getMessage())) + +# Avoid printing state-dict profiling output at the WARNING level when saving a checkpoint +logging.getLogger("torch.distributed.fsdp._optim_utils").disabled = True +logging.getLogger("torch.distributed.fsdp._debug_utils").disabled = True + +__all__ = ["GPT", "Config", "Tokenizer"] diff --git a/llm-lora-finetuning/lit_gpt/adapter.py b/llm-lora-finetuning/lit_gpt/adapter.py new file mode 100644 index 00000000..044b75d5 --- /dev/null +++ b/llm-lora-finetuning/lit_gpt/adapter.py @@ -0,0 +1,168 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
+ +"""Implementation of the paper: + +LLaMA-Adapter: Efficient Fine-tuning of Language Models with Zero-init Attention +https://arxiv.org/abs/2303.16199 + +Port for Lit-GPT +""" + +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +from typing_extensions import Self + +from lit_gpt.config import Config as BaseConfig +from lit_gpt.model import GPT as BaseModel +from lit_gpt.model import Block as BaseBlock +from lit_gpt.model import CausalSelfAttention as BaseCausalSelfAttention + + +@dataclass +class Config(BaseConfig): + adapter_prompt_length: int = 10 + adapter_start_layer: int = 2 + + +class GPT(BaseModel): + """The implementation is identical to `lit_gpt.model.GPT` with the exception that + the `Block` saves the layer index and passes it down to the attention layer.""" + + def __init__(self, config: Config) -> None: + nn.Module.__init__(self) + assert config.padded_vocab_size is not None + self.config = config + + self.lm_head = nn.Linear(config.n_embd, config.padded_vocab_size, bias=config.lm_head_bias) + self.transformer = nn.ModuleDict( + dict( + wte=nn.Embedding(config.padded_vocab_size, config.n_embd), + h=nn.ModuleList(Block(config, i) for i in range(config.n_layer)), + ln_f=config.norm_class(config.n_embd, eps=config.norm_eps), + ) + ) + self.max_seq_length = self.config.block_size + self.mask_cache: Optional[torch.Tensor] = None + + def forward( + self, idx: torch.Tensor, input_pos: Optional[torch.Tensor] = None, lm_head_chunk_size: int = 0 + ) -> Union[torch.Tensor, List[torch.Tensor]]: + T = idx.size(1) + if self.max_seq_length < T: + raise ValueError(f"Cannot forward sequence of length {T}, max seq length is only {self.max_seq_length}.") + + if input_pos is not None: # use the kv cache + cos = self.cos.index_select(0, input_pos) + sin = self.sin.index_select(0, input_pos) + if self.mask_cache is None: + raise TypeError("You need to call `gpt.set_kv_cache()`") + mask = self.mask_cache.index_select(2, input_pos) + else: + cos = self.cos[:T] + sin = self.sin[:T] + mask = None + + x = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd) + for block in self.transformer.h: + x = block(x, cos, sin, mask, input_pos) + x = self.transformer.ln_f(x) + if lm_head_chunk_size > 0: + # chunk the lm head logits to reduce the peak memory used by autograd + return [self.lm_head(x_i) for x_i in x.split(lm_head_chunk_size, dim=1)] + return self.lm_head(x) # (b, t, vocab_size) + + @classmethod + def from_name(cls, name: str, **kwargs: Any) -> Self: + return cls(Config.from_name(name, **kwargs)) + + def _init_weights(self, module: nn.Module) -> None: + """Meant to be used with `gpt.apply(gpt._init_weights)`. 
Unused method left for completeness.""" + super()._init_weights(module) + if isinstance(module, CausalSelfAttention): + module.reset_parameters() + + +class Block(BaseBlock): + """The implementation is identical to `lit_gpt.model.Block` with the exception that + we replace the attention layer where adaption is implemented.""" + + def __init__(self, config: Config, block_idx: int) -> None: + # Skip the parent class __init__ altogether and replace it to avoid useless allocations + nn.Module.__init__(self) + self.norm_1 = config.norm_class(config.n_embd, eps=config.norm_eps) + self.attn = CausalSelfAttention(config, block_idx) + if not config.shared_attention_norm: + self.norm_2 = config.norm_class(config.n_embd, eps=config.norm_eps) + self.mlp = config.mlp_class(config) + + self.config = config + + +class CausalSelfAttention(BaseCausalSelfAttention): + """A modification of `lit_gpt.model.CausalSelfAttention` that adds the attention + over the adaption prompt.""" + + def __init__(self, config: Config, block_idx: int) -> None: + super().__init__(config) + if block_idx >= config.adapter_start_layer: + # adapter embedding layer + self.adapter_wte = nn.Embedding(config.adapter_prompt_length, config.n_embd) + # gate for adaption + self.gating_factor = torch.nn.Parameter(torch.zeros(1, 1, config.n_head, 1)) + # kv cache for inference + self.adapter_kv_cache: Optional[Tuple[torch.Tensor, torch.Tensor]] = None + self.block_idx = block_idx + + def scaled_dot_product_attention( + self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, mask: Optional[torch.Tensor] = None + ) -> torch.Tensor: + y = super().scaled_dot_product_attention(q, k, v, mask) + if self.block_idx < self.config.adapter_start_layer: + return y + + aT = self.config.adapter_prompt_length + if self.adapter_kv_cache is not None: + # since this uses the wte weights as the prefix and the kv cache is only used during inference, ak and av + # are the same every call + ak, av = self.adapter_kv_cache + else: + prefix = self.adapter_wte.weight.reshape(1, aT, self.config.n_embd) + aqkv = self.attn(prefix) + q_per_kv = self.config.n_head // self.config.n_query_groups + aqkv = aqkv.view(1, aT, self.config.n_query_groups, q_per_kv + 2, self.config.head_size) + aqkv = aqkv.permute(0, 2, 3, 1, 4) + _, ak, av = aqkv.split((q_per_kv, 1, 1), dim=2) + if self.config.n_query_groups != 1: + # for MHA this is a no-op + ak = ak.repeat_interleave(q_per_kv, dim=2) + av = av.repeat_interleave(q_per_kv, dim=2) + ak = ak.view(1, -1, aT, self.config.head_size) # (1, nh_ak, aT, hs) + av = av.view(1, -1, aT, self.config.head_size) # (1, nh_av, aT, hs) + self.adapter_kv_cache = (ak, av) + + T = q.size(2) + amask = torch.ones(T, aT, dtype=torch.bool, device=q.device) + ay = super().scaled_dot_product_attention(q, ak, av, amask) + return y + self.gating_factor * ay + + def reset_parameters(self) -> None: + torch.nn.init.zeros_(self.gating_factor) + + def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None: + """For compatibility with older checkpoints.""" + if (key := prefix + "gating_factor") in state_dict and state_dict[key].size(1) == self.config.n_head: + state_dict[key] = state_dict[key].permute(0, 2, 1, 3) + super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) + + +def mark_only_adapter_as_trainable(model: GPT) -> None: + """Sets `requires_grad=False` for all non-adapter weights.""" + for name, param in model.named_parameters(): + param.requires_grad = adapter_filter(name, param) + + +def 
adapter_filter(key: str, value: Any) -> bool: + return "adapter_wte" in key or "gating_factor" in key diff --git a/llm-lora-finetuning/lit_gpt/adapter_v2.py b/llm-lora-finetuning/lit_gpt/adapter_v2.py new file mode 100644 index 00000000..51b826a2 --- /dev/null +++ b/llm-lora-finetuning/lit_gpt/adapter_v2.py @@ -0,0 +1,224 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +"""Implementation of the paper: + +LLaMA-Adapter V2: Parameter-Efficient Visual Instruction Model +https://arxiv.org/abs/2304.15010 + +Port for Lit-GPT +""" + +from dataclasses import dataclass +from typing import Any, Dict, Optional, Tuple, Type + +import torch +import torch.nn as nn +from typing_extensions import Self + +import lit_gpt +from lit_gpt.adapter import GPT as BaseModel +from lit_gpt.adapter import Block as BaseBlock +from lit_gpt.adapter import CausalSelfAttention as BaseCausalSelfAttention +from lit_gpt.adapter import Config as BaseConfig +from lit_gpt.model import KVCache +from lit_gpt.utils import map_old_state_dict_weights + + +@dataclass +class Config(BaseConfig): + @property + def mlp_class(self) -> Type: + return getattr(lit_gpt.adapter_v2, self._mlp_class) + + +def adapter_filter(key: str, value: Any) -> bool: + adapter_substrings = ( + # regular adapter v1 parameters + "adapter_wte", + "gating_factor", + # adapter v2: new bias and scale used in Linear + "adapter_scale", + "adapter_bias", + # adapter v2: Norm parameters are now trainable + "norm_1", + "norm_2", + "ln_f", + ) + return any(s in key for s in adapter_substrings) + + +class AdapterV2Linear(torch.nn.Module): + def __init__(self, in_features: int, out_features: int, **kwargs) -> None: + super().__init__() + self.linear = torch.nn.Linear(in_features, out_features, **kwargs) + self.adapter_bias = torch.nn.Parameter(torch.zeros(out_features), requires_grad=False) + self.adapter_scale = torch.nn.Parameter(torch.ones(out_features), requires_grad=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.adapter_scale * (self.linear(x) + self.adapter_bias) + + def reset_parameters(self) -> None: + nn.init.zeros_(self.adapter_bias) + nn.init.ones_(self.adapter_scale) + + +class GPT(BaseModel): + def __init__(self, config: Config) -> None: + # Skip the parent class __init__ altogether and replace it to avoid useless allocations + nn.Module.__init__(self) + assert config.padded_vocab_size is not None + self.config = config + + self.lm_head = AdapterV2Linear(config.n_embd, config.padded_vocab_size, bias=config.lm_head_bias) + self.transformer = nn.ModuleDict( + dict( + wte=nn.Embedding(config.padded_vocab_size, config.n_embd), + h=nn.ModuleList(Block(config, i) for i in range(config.n_layer)), + ln_f=config.norm_class(config.n_embd, eps=config.norm_eps), + ) + ) + self.max_seq_length = self.config.block_size + self.mask_cache: Optional[torch.Tensor] = None + + @classmethod + def from_name(cls, name: str, **kwargs: Any) -> Self: + return cls(Config.from_name(name, **kwargs)) + + def _init_weights(self, module: nn.Module) -> None: + """Meant to be used with `gpt.apply(gpt._init_weights)`. 
Unused method left for completeness.""" + super()._init_weights(module) + if isinstance(module, AdapterV2Linear): + module.reset_parameters() + + def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None: + """For compatibility with base checkpoints.""" + mapping = {"lm_head.weight": "lm_head.linear.weight", "lm_head.bias": "lm_head.linear.bias"} + state_dict = map_old_state_dict_weights(state_dict, mapping, prefix) + super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) + + +class Block(BaseBlock): + """The implementation is identical to `lit_gpt.model.Block` with the exception that + we replace the attention layer where adaption is implemented.""" + + def __init__(self, config: Config, block_idx: int) -> None: + # Skip the parent class __init__ altogether and replace it to avoid useless allocations + nn.Module.__init__(self) + self.norm_1 = config.norm_class(config.n_embd, eps=config.norm_eps) + self.attn = CausalSelfAttention(config, block_idx) + if not config.shared_attention_norm: + self.norm_2 = config.norm_class(config.n_embd, eps=config.norm_eps) + self.mlp = config.mlp_class(config) + + self.config = config + + +class CausalSelfAttention(BaseCausalSelfAttention): + """A modification of `lit_gpt.adapter.CausalSelfAttention` that uses the Adapter V2 Linear class""" + + def __init__(self, config: Config, block_idx: int) -> None: + # Skip the parent class __init__ altogether and replace it to avoid useless allocations + nn.Module.__init__(self) + shape = (config.n_head + 2 * config.n_query_groups) * config.head_size + # key, query, value projections for all heads, but in a batch + self.attn = AdapterV2Linear(in_features=config.n_embd, out_features=shape, bias=config.bias) + # output projection + # if `head_size` is explicitly specified in the config, `n_emd` might not be equal to `head_size * n_head` + self.proj = AdapterV2Linear(config.head_size * config.n_head, config.n_embd, bias=config.bias) + # disabled by default + self.kv_cache: Optional[KVCache] = None + + if block_idx >= config.adapter_start_layer: + # adapter embedding layer + self.adapter_wte = nn.Embedding(config.adapter_prompt_length, config.n_embd) + # gate for adaption + self.gating_factor = torch.nn.Parameter(torch.zeros(1, 1, config.n_head, 1)) + # kv cache for inference + self.adapter_kv_cache: Optional[Tuple[torch.Tensor, torch.Tensor]] = None + self.block_idx = block_idx + + self.config = config + + def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None: + """For compatibility with base checkpoints.""" + mapping = { + "attn.weight": "attn.linear.weight", + "attn.bias": "attn.linear.bias", + "proj.weight": "proj.linear.weight", + "proj.bias": "proj.linear.bias", + } + state_dict = map_old_state_dict_weights(state_dict, mapping, prefix) + # For compatibility with older checkpoints + if (key := prefix + "gating_factor") in state_dict and state_dict[key].size(1) == self.config.n_head: + state_dict[key] = state_dict[key].permute(0, 2, 1, 3) + super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) + + +class GptNeoxMLP(lit_gpt.model.GptNeoxMLP): + def __init__(self, config: Config) -> None: + nn.Module.__init__(self) + self.fc = AdapterV2Linear(config.n_embd, config.intermediate_size, bias=config.bias) + self.proj = AdapterV2Linear(config.intermediate_size, config.n_embd, bias=config.bias) + + self.config = config + + def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None: 
+ """For compatibility with base checkpoints.""" + mapping = { + "fc.weight": "fc.linear.weight", + "fc.bias": "fc.linear.bias", + "proj.weight": "proj.linear.weight", + "proj.bias": "proj.linear.bias", + } + state_dict = map_old_state_dict_weights(state_dict, mapping, prefix) + super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) + + +class LLaMAMLP(lit_gpt.model.LLaMAMLP): + def __init__(self, config: Config) -> None: + nn.Module.__init__(self) + self.fc_1 = AdapterV2Linear(config.n_embd, config.intermediate_size, bias=config.bias) + self.fc_2 = AdapterV2Linear(config.n_embd, config.intermediate_size, bias=config.bias) + self.proj = AdapterV2Linear(config.intermediate_size, config.n_embd, bias=config.bias) + + def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None: + """For compatibility with base checkpoints.""" + mapping = { + "fc_1.weight": "fc_1.linear.weight", + "fc_1.bias": "fc_1.linear.bias", + "fc_2.weight": "fc_2.linear.weight", + "fc_2.bias": "fc_2.linear.bias", + "proj.weight": "proj.linear.weight", + "proj.bias": "proj.linear.bias", + } + state_dict = map_old_state_dict_weights(state_dict, mapping, prefix) + super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) + + +class GemmaMLP(LLaMAMLP): + def forward(self, x: torch.Tensor) -> torch.Tensor: + x_fc_1 = self.fc_1(x) + x_fc_2 = self.fc_2(x) + x = torch.nn.functional.gelu(x_fc_1) * x_fc_2 + return self.proj(x) + + +class LLaMAMoE(lit_gpt.model.LLaMAMoE): + def __init__(self, config: Config) -> None: + nn.Module.__init__(self) + self.gate = AdapterV2Linear(config.n_embd, config.n_expert, bias=False) + self.experts = nn.ModuleList(LLaMAMLP(config) for _ in range(config.n_expert)) + + self.config = config + + def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None: + """For compatibility with base checkpoints.""" + mapping = {"gate.weight": "gate.linear.weight"} + state_dict = map_old_state_dict_weights(state_dict, mapping, prefix) + super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) + + +def mark_only_adapter_v2_as_trainable(model: GPT) -> None: + """Sets requires_grad=False for all non-adapter weights""" + for name, param in model.named_parameters(): + param.requires_grad = adapter_filter(name, param) diff --git a/llm-lora-finetuning/lit_gpt/args.py b/llm-lora-finetuning/lit_gpt/args.py new file mode 100644 index 00000000..62217076 --- /dev/null +++ b/llm-lora-finetuning/lit_gpt/args.py @@ -0,0 +1,81 @@ +from dataclasses import dataclass +from pathlib import Path +from typing import Optional + + +@dataclass +class TrainArgs: + """Training related arguments""" + + save_interval: int = 1000 + """Number of optimizer steps between checkpoints""" + log_interval: int = 1 + """Number of iterations between logging calls""" + global_batch_size: int = 64 + """Number of samples between optimizer steps across data-parallel ranks""" + micro_batch_size: int = 4 + """Number of samples per data-parallel rank""" + lr_warmup_steps: int = 100 + """Number of iterations with learning rate warmup active""" + epochs: Optional[int] = None + """Number of epochs to run""" + epoch_size: Optional[int] = None + """Size of the epoch""" + # TODO: pretrain/tinyllama is the only script using `max_tokens` explicitly. replace it with epoch_size*epochs? + max_tokens: Optional[int] = None + """Total number of tokens to train on""" + max_seq_length: Optional[int] = None + """Limits the length of samples. 
Off by default""" + + # Optimization args + learning_rate: float = 1e-3 + weight_decay: float = 0.02 + beta1: float = 0.9 + beta2: float = 0.95 + max_norm: Optional[float] = None + min_lr: float = 6e-5 + + def max_iters(self, devices: int) -> int: + """Number of iterations""" + max_iters = self.epochs * self.epoch_size // devices // self.micro_batch_size + assert max_iters > 0 + return max_iters + + def gradient_accumulation_iters(self, devices: int) -> int: + """Number of iterations between gradient synchronizations""" + gradient_accumulation_iters = self.batch_size(devices) // self.micro_batch_size + assert gradient_accumulation_iters > 0 + return gradient_accumulation_iters + + def batch_size(self, devices: int) -> int: + """Number of samples between optimizer steps per data-parallel rank""" + batch_size = self.global_batch_size // devices + assert batch_size > 0 + return batch_size + + +@dataclass +class EvalArgs: + """Evaluation related arguments""" + + interval: int = 600 + """Number of optimizer steps between evaluation calls""" + max_new_tokens: Optional[int] = None + """Number of tokens to generate""" + max_iters: int = 100 + """Number of iterations""" + + +@dataclass +class IOArgs: + """Inputs and outputs related arguments""" + + # Optional because pretrain/tinyllama hardcodes the path + train_data_dir: Optional[Path] = Path("data/alpaca") + """Where to read training data from""" + val_data_dir: Optional[Path] = None + """Where to read validation data from""" + checkpoint_dir: Optional[Path] = None + """Where to read weights and tokenizer data from""" + out_dir: Path = Path("out/adapter/alpaca") + """Where to save artifacts""" diff --git a/llm-lora-finetuning/lit_gpt/config.py b/llm-lora-finetuning/lit_gpt/config.py new file mode 100644 index 00000000..4c73dc6b --- /dev/null +++ b/llm-lora-finetuning/lit_gpt/config.py @@ -0,0 +1,1447 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
+ +import json +from copy import deepcopy +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Literal, Optional, Type, Union + +import torch +from typing_extensions import Self + +import lit_gpt.model +from lit_gpt.utils import find_multiple + + +@dataclass +class Config: + name: str = "" + hf_config: dict = field(default_factory=dict) + scale_embeddings: bool = False + block_size: int = 4096 + vocab_size: int = 50254 + padding_multiple: int = 512 + padded_vocab_size: Optional[int] = None + n_layer: int = 16 + n_head: int = 32 + head_size: Optional[int] = None + n_embd: int = 4096 + rotary_percentage: float = 0.25 + parallel_residual: bool = True + bias: bool = True + lm_head_bias: bool = False + # to use multi-head attention (MHA), set this to `n_head` (default) + # to use multi-query attention (MQA), set this to 1 + # to use grouped-query attention (GQA), set this to a value in between + # Example with `n_head=4` + # ┌───┐┌───┐┌───┐┌───┐ ┌───┐ ┌───┐ ┌───┐ + # │ v ││ v ││ v ││ v │ │ v │ │ v │ │ v │ + # └───┘└───┘└───┘└───┘ └───┘ └───┘ └───┘ + # │ │ │ │ │ │ │ + # ┌───┐┌───┐┌───┐┌───┐ ┌───┐ ┌───┐ ┌───┐ + # │ k ││ k ││ k ││ k │ │ k │ │ k │ │ k │ + # └───┘└───┘└───┘└───┘ └───┘ └───┘ └───┘ + # │ │ │ │ ┌──┴──┐ ┌──┴──┐ ┌────┬──┴─┬────┐ + # ┌───┐┌───┐┌───┐┌───┐ ┌───┐┌───┐┌───┐┌───┐ ┌───┐┌───┐┌───┐┌───┐ + # │ q ││ q ││ q ││ q │ │ q ││ q ││ q ││ q │ │ q ││ q ││ q ││ q │ + # └───┘└───┘└───┘└───┘ └───┘└───┘└───┘└───┘ └───┘└───┘└───┘└───┘ + # ◀──────────────────▶ ◀──────────────────▶ ◀──────────────────▶ + # MHA GQA MQA + # n_query_groups=4 n_query_groups=2 n_query_groups=1 + # + # credit https://arxiv.org/pdf/2305.13245.pdf + n_query_groups: Optional[int] = None + shared_attention_norm: bool = False + _norm_class: Literal["LayerNorm", "RMSNorm"] = "LayerNorm" + norm_eps: float = 1e-5 + _mlp_class: Literal["GptNeoxMLP", "LLaMAMLP", "GemmaMLP", "LLaMAMoE"] = "GptNeoxMLP" + gelu_approximate: str = "none" + intermediate_size: Optional[int] = None + rope_condense_ratio: int = 1 + rope_base: int = 10000 + n_expert: int = 0 + n_expert_per_token: int = 0 + + def __post_init__(self): + if not self.name: + self.name = self.hf_config.get("name", self.name) + + if self.head_size is None: + assert self.n_embd % self.n_head == 0 + self.head_size = self.n_embd // self.n_head + + # vocab size should be a power of 2 to be optimal on hardware. 
compute the closest value + if self.padded_vocab_size is None: + self.padded_vocab_size = find_multiple(self.vocab_size, self.padding_multiple) + else: + # vocab size shouldn't be larger than padded vocab size + self.vocab_size = min(self.vocab_size, self.padded_vocab_size) + + # compute the number of query groups + if self.n_query_groups is not None: + assert self.n_head % self.n_query_groups == 0 + else: + self.n_query_groups = self.n_head + + # compute the intermediate size for MLP if not set + if self.intermediate_size is None: + if self._mlp_class == "LLaMAMLP": + raise ValueError("The config needs to set the `intermediate_size`") + self.intermediate_size = 4 * self.n_embd + + self.rope_n_elem = int(self.rotary_percentage * self.head_size) + + @classmethod + def from_name(cls, name: str, **kwargs: Any) -> Self: + if name not in name_to_config: + # search through all `config['hf_config']['name']` + try: + conf_dict = next(config for config in configs if name == config["hf_config"]["name"]) + except StopIteration: + raise ValueError(f"{name!r} is not a supported config name") + else: + conf_dict = name_to_config[name] + + conf_dict = conf_dict.copy() + if "condense_ratio" in kwargs: # legacy name + kwargs["rope_condense_ratio"] = kwargs.pop("condense_ratio") + conf_dict.update(kwargs) + return cls(**conf_dict) + + @classmethod + def from_json(cls, path: Union[str, Path], **kwargs: Any) -> Self: + with open(path, encoding="utf-8") as fp: + json_kwargs = json.load(fp) + if "condense_ratio" in json_kwargs: # legacy name + json_kwargs["rope_condense_ratio"] = json_kwargs.pop("condense_ratio") + if "condense_ratio" in kwargs: # legacy name + kwargs["rope_condense_ratio"] = kwargs.pop("condense_ratio") + if "org" in json_kwargs: # legacy name + json_kwargs["hf_config"] = {"name": json_kwargs["name"], "org": json_kwargs.pop("org")} + if "org" in kwargs: # legacy name + kwargs["hf_config"] = {"name": kwargs.get("name", json_kwargs["name"]), "org": kwargs.pop("org")} + json_kwargs.update(kwargs) + return cls(**json_kwargs) + + @classmethod + def from_checkpoint(cls, path: Path, **kwargs: Any) -> Self: + """Automatically load `lit_config.json` and if it doesn't exist - a matching config from `lit_gpt/config.py`.""" + if (config_path := path / "lit_config.json").is_file(): + return cls.from_json(config_path, **kwargs) + if (model_name := path.name) in name_to_config: + return cls.from_name(model_name, **kwargs) + raise FileNotFoundError(f"For {str(path)!r} neither 'lit_config.json' nor matching config exists.") + + @property + def mlp_class(self) -> Type: + # `self._mlp_class` cannot be the type to keep the config json serializable + return getattr(lit_gpt.model, self._mlp_class) + + @property + def norm_class(self) -> Type: + # `self._norm_class` cannot be the type to keep the config json serializable + if self._norm_class == "RMSNorm": + from functools import partial + + from lit_gpt.rmsnorm import RMSNorm + + return partial(RMSNorm, add_unit_offset="Gemma" in self.name) + return getattr(torch.nn, self._norm_class) + + +######################## +# Stability AI StableLM +######################## +configs = [ + # https://huggingface.co/stabilityai/stablelm-base-alpha-3b/blob/main/config.json + dict(name="stablelm-base-alpha-3b", hf_config=dict(org="stabilityai", name="stablelm-base-alpha-3b")), + # https://huggingface.co/stabilityai/stablelm-base-alpha-7b/blob/main/config.json + dict( + name="stablelm-base-alpha-7b", + hf_config=dict(org="stabilityai", name="stablelm-base-alpha-7b"), + n_head=48, 
+ n_embd=6144, + padding_multiple=256, + ), + # https://huggingface.co/stabilityai/stablelm-tuned-alpha-3b/blob/main/config.json + dict(name="stablelm-tuned-alpha-3b", hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-3b"), n_head=32), + # https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b/blob/main/config.json + dict( + name="stablelm-tuned-alpha-7b", + hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-7b"), + n_head=48, + n_embd=6144, + padding_multiple=256, + ), + # https://huggingface.co/stabilityai/stablelm-zephyr-3b/blob/main/config.json + dict( + name="stablelm-zephyr-3b", + hf_config=dict(org="stabilityai", name="stablelm-zephyr-3b"), + padded_vocab_size=50304, + n_layer=32, + n_head=32, + n_embd=2560, + parallel_residual=False, + bias=False, + _mlp_class="LLaMAMLP", + intermediate_size=6912, + ), +] + +#################### +# EleutherAI Pythia +#################### +pythia = [ + # https://huggingface.co/EleutherAI/pythia-14m/blob/main/config.json + dict( + name="pythia-14m", + hf_config=dict(org="EleutherAI", name="pythia-14m"), + block_size=512, + n_layer=6, + n_embd=128, + n_head=4, + padding_multiple=128, + ), + # https://huggingface.co/EleutherAI/pythia-31m/blob/main/config.json + dict( + name="pythia-31m", + hf_config=dict(org="EleutherAI", name="pythia-31m"), + block_size=1024, + n_layer=6, + n_embd=256, + n_head=8, + padding_multiple=128, + ), + # https://huggingface.co/EleutherAI/pythia-70m/blob/main/config.json + dict( + name="pythia-70m", + hf_config=dict(org="EleutherAI", name="pythia-70m"), + block_size=2048, + n_layer=6, + n_embd=512, + n_head=8, + padding_multiple=128, + ), + # https://huggingface.co/EleutherAI/pythia-160m/blob/main/config.json + dict( + name="pythia-160m", + hf_config=dict(org="EleutherAI", name="pythia-160m"), + block_size=2048, + n_layer=12, + n_embd=768, + n_head=12, + padding_multiple=128, + ), + # https://huggingface.co/EleutherAI/pythia-410m/blob/main/config.json + dict( + name="pythia-410m", + hf_config=dict(org="EleutherAI", name="pythia-410m"), + block_size=2048, + n_layer=24, + n_embd=1024, + n_head=16, + padding_multiple=128, + ), + # https://huggingface.co/EleutherAI/pythia-1b/blob/main/config.json + dict( + name="pythia-1b", + hf_config=dict(org="EleutherAI", name="pythia-1b"), + block_size=2048, + n_embd=2048, + n_head=8, + padding_multiple=128, + ), + # https://huggingface.co/EleutherAI/pythia-1.4b/blob/main/config.json + dict( + name="pythia-1.4b", + hf_config=dict(org="EleutherAI", name="pythia-1.4b"), + block_size=2048, + n_layer=24, + n_embd=2048, + n_head=16, + padding_multiple=128, + ), + # https://huggingface.co/EleutherAI/pythia-2.8b/blob/main/config.json + dict( + name="pythia-2.8b", + hf_config=dict(org="EleutherAI", name="pythia-2.8b"), + block_size=2048, + n_layer=32, + n_embd=2560, + padding_multiple=128, + ), + # https://huggingface.co/EleutherAI/pythia-6.9b/blob/main/config.json + dict( + name="pythia-6.9b", + hf_config=dict(org="EleutherAI", name="pythia-6.9b"), + block_size=2048, + n_layer=32, + padding_multiple=256, + ), + # https://huggingface.co/EleutherAI/pythia-12b/blob/main/config.json + dict( + name="pythia-12b", + hf_config=dict(org="EleutherAI", name="pythia-12b"), + block_size=2048, + n_layer=36, + n_embd=5120, + n_head=40, + ), +] +configs.extend(pythia) +for c in pythia: + # "pythia-14m" and "pythia-31m" don't have deduped version + if c["name"] in ("pythia-14m", "pythia-31m"): + continue + copy = deepcopy(c) + copy["name"] = f"{c['name']}-deduped" + copy["hf_config"]["name"] 
= f"{c['hf_config']['name']}-deduped" + configs.append(copy) + + +################### +# databricks Dolly +################### +dolly = [ + # https://huggingface.co/databricks/dolly-v2-3b/blob/main/config.json + dict( + name="dolly-v2-3b", + hf_config=dict(org="databricks", name="dolly-v2-3b"), + block_size=2048, + n_layer=32, + n_embd=2560, + padded_vocab_size=50280, + ), + # https://huggingface.co/databricks/dolly-v2-7b/blob/main/config.json + dict( + name="dolly-v2-7b", + hf_config=dict(org="databricks", name="dolly-v2-7b"), + block_size=2048, + n_layer=32, + padded_vocab_size=50280, + ), + # https://huggingface.co/databricks/dolly-v2-12b/blob/main/config.json + dict( + name="dolly-v2-12b", + hf_config=dict(org="databricks", name="dolly-v2-12b"), + block_size=2048, + n_layer=36, + n_embd=5120, + n_head=40, + padded_vocab_size=50280, + ), +] +configs.extend(dolly) + + +#################################### +# togethercomputer RedPajama INCITE +#################################### +redpajama_incite = [ + # https://huggingface.co/togethercomputer/RedPajama-INCITE-Base-3B-v1/blob/main/config.json + dict( + name="RedPajama-INCITE-{}-3B-v1", + hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-{}-3B-v1"), + block_size=2048, + n_layer=32, + n_embd=2560, + padding_multiple=256, + rotary_percentage=1.0, + parallel_residual=False, + ), + # https://huggingface.co/togethercomputer/RedPajama-INCITE-7B-Base/blob/main/config.json + dict( + name="RedPajama-INCITE-7B-{}", + hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-7B-{}"), + block_size=2048, + n_layer=32, + padding_multiple=256, + rotary_percentage=1.0, + parallel_residual=False, + ), + # this redirects to the checkpoint above. kept for those who had the old weights already downloaded + dict( + name="RedPajama-INCITE-{}-7B-v0.1", + hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-{}-7B-v0.1"), + block_size=2048, + n_layer=32, + padding_multiple=256, + rotary_percentage=1.0, + parallel_residual=False, + ), +] +for c in redpajama_incite: + for kind in ("Base", "Chat", "Instruct"): + copy = deepcopy(c) + copy["name"] = c["name"].format(kind) + copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) + configs.append(copy) + + +################# +# TII UAE Falcon +################# +falcon = [ + # https://huggingface.co/tiiuae/falcon-7b/blob/main/config.json + dict( + name="falcon-7b{}", + hf_config=dict(org="tiiuae", name="falcon-7b{}"), + block_size=2048, + vocab_size=65024, + padded_vocab_size=65024, + n_layer=32, + n_head=71, + n_embd=4544, + rotary_percentage=1.0, + n_query_groups=1, + bias=False, + # this is not in the config, but in the original model implementation, only for this config + shared_attention_norm=True, + ), + # https://huggingface.co/tiiuae/falcon-40b/blob/main/config.json + dict( + name="falcon-40b{}", + hf_config=dict(org="tiiuae", name="falcon-40b{}"), + block_size=2048, + vocab_size=65024, + padded_vocab_size=65024, + n_layer=60, + n_head=128, + n_embd=8192, + rotary_percentage=1.0, + n_query_groups=8, + bias=False, + ), +] +for c in falcon: + for kind in ("", "-instruct"): + copy = deepcopy(c) + copy["name"] = c["name"].format(kind) + copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) + configs.append(copy) + +# https://huggingface.co/tiiuae/falcon-180b/blob/main/config.json +falcon180b = dict( + name="falcon-180B{}", + hf_config=dict(org="tiiuae", name="falcon-180B{}"), + block_size=2048, + vocab_size=65024, + padded_vocab_size=65024, + n_layer=80, + n_head=232, + 
n_embd=14848, + rotary_percentage=1.0, + n_query_groups=8, + bias=False, +) + +for kind in ("", "-chat"): + copy = deepcopy(falcon180b) + copy["name"] = falcon180b["name"].format(kind) + copy["hf_config"]["name"] = falcon180b["hf_config"]["name"].format(kind) + configs.append(copy) + + +############################# +# OpenLM Research Open LLaMA +############################# +open_LLaMA = [ + # https://huggingface.co/openlm-research/open_llama_3b/blob/main/config.json + dict( + name="open_llama_3b", + hf_config=dict(org="openlm-research", name="open_llama_3b"), + block_size=2048, + vocab_size=32000, + padding_multiple=64, + n_layer=26, + n_embd=3200, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-6, + _mlp_class="LLaMAMLP", + intermediate_size=8640, + ), + # https://huggingface.co/openlm-research/open_llama_7b/blob/main/config.json + dict( + name="open_llama_7b", + hf_config=dict(org="openlm-research", name="open_llama_7b"), + block_size=2048, + vocab_size=32000, + padding_multiple=64, + n_layer=32, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-6, + _mlp_class="LLaMAMLP", + intermediate_size=11008, + ), + # https://huggingface.co/openlm-research/open_llama_13b/blob/main/config.json + dict( + name="open_llama_13b", + hf_config=dict(org="openlm-research", name="open_llama_13b"), + block_size=2048, + vocab_size=32000, + padding_multiple=64, + n_layer=40, + n_head=40, + n_embd=5120, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-6, + _mlp_class="LLaMAMLP", + intermediate_size=13824, + ), +] +configs.extend(open_LLaMA) + + +############### +# LMSYS Vicuna +############### +vicuna = [ + # https://huggingface.co/lmsys/vicuna-7b-v1.3/blob/main/config.json + dict( + name="vicuna-7b-v1.3", + hf_config=dict(org="lmsys", name="vicuna-7b-v1.3"), + block_size=2048, + vocab_size=32000, + padding_multiple=64, + n_layer=32, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-6, + _mlp_class="LLaMAMLP", + intermediate_size=11008, + ), + # https://huggingface.co/lmsys/vicuna-13b-v1.3/blob/main/config.json + dict( + name="vicuna-13b-v1.3", + hf_config=dict(org="lmsys", name="vicuna-13b-v1.3"), + block_size=2048, + vocab_size=32000, + padding_multiple=64, + n_layer=40, + n_head=40, + n_embd=5120, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-6, + _mlp_class="LLaMAMLP", + intermediate_size=13824, + ), + # https://huggingface.co/lmsys/vicuna-33b-v1.3/blob/main/config.json + dict( + name="vicuna-33b-v1.3", + hf_config=dict(org="lmsys", name="vicuna-33b-v1.3"), + block_size=2048, + vocab_size=32000, + padding_multiple=64, + n_layer=60, + n_head=52, + n_embd=6656, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-6, + _mlp_class="LLaMAMLP", + intermediate_size=17920, + ), + # https://huggingface.co/lmsys/vicuna-7b-v1.5/blob/main/config.json + dict( + name="vicuna-7b-v1.5", + hf_config=dict(org="lmsys", name="vicuna-7b-v1.5"), + vocab_size=32000, + padding_multiple=64, + n_layer=32, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + _mlp_class="LLaMAMLP", + intermediate_size=11008, + ), + # https://huggingface.co/lmsys/vicuna-7b-v1.5-16k/blob/main/config.json + dict( + name="vicuna-7b-v1.5-16k", + hf_config=dict(org="lmsys", 
name="vicuna-7b-v1.5-16k"), + block_size=16384, + vocab_size=32000, + padding_multiple=64, + n_layer=32, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + _mlp_class="LLaMAMLP", + intermediate_size=11008, + rope_condense_ratio=4, + ), + # https://huggingface.co/lmsys/vicuna-13b-v1.5/blob/main/config.json + dict( + name="vicuna-13b-v1.5", + hf_config=dict(org="lmsys", name="vicuna-13b-v1.5"), + vocab_size=32000, + padding_multiple=64, + n_layer=40, + n_head=40, + n_embd=5120, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + _mlp_class="LLaMAMLP", + intermediate_size=13824, + ), + # https://huggingface.co/lmsys/vicuna-13b-v1.5-16k/blob/main/config.json + dict( + name="vicuna-13b-v1.5-16k", + hf_config=dict(org="lmsys", name="vicuna-13b-v1.5-16k"), + block_size=16384, + vocab_size=32000, + padding_multiple=64, + n_layer=40, + n_head=40, + n_embd=5120, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + _mlp_class="LLaMAMLP", + intermediate_size=13824, + rope_condense_ratio=4, + ), +] +configs.extend(vicuna) + + +################# +# LMSYS LongChat +################# +long_chat = [ + # https://huggingface.co/lmsys/longchat-7b-16k/blob/main/config.json + dict( + name="longchat-7b-16k", + hf_config=dict(org="lmsys", name="longchat-7b-16k"), + block_size=16384, + vocab_size=32000, + padding_multiple=64, + n_layer=32, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-6, + _mlp_class="LLaMAMLP", + intermediate_size=11008, + rope_condense_ratio=8, + ), + # https://huggingface.co/lmsys/longchat-13b-16k/blob/main/config.json + dict( + name="longchat-13b-16k", + hf_config=dict(org="lmsys", name="longchat-13b-16k"), + block_size=16384, + vocab_size=32000, + padding_multiple=64, + n_layer=40, + n_head=40, + n_embd=5120, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-6, + _mlp_class="LLaMAMLP", + intermediate_size=13824, + rope_condense_ratio=8, + ), +] +configs.extend(long_chat) + + +###################### +# NousResearch Hermes +###################### +nous_research = [ + # https://huggingface.co/NousResearch/Nous-Hermes-llama-2-7b/blob/main/config.json + dict( + name="Nous-Hermes-llama-2-7b", + hf_config=dict(org="NousResearch", name="Nous-Hermes-llama-2-7b"), + padded_vocab_size=32000, + n_layer=32, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-05, + _mlp_class="LLaMAMLP", + intermediate_size=11008, + ), + # https://huggingface.co/NousResearch/Nous-Hermes-13B/blob/main/config.json + dict( + name="Nous-Hermes-13b", + hf_config=dict(org="NousResearch", name="Nous-Hermes-13b"), + block_size=2048, + vocab_size=32000, + padded_vocab_size=32001, + n_layer=40, + n_head=40, + n_embd=5120, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-6, + _mlp_class="LLaMAMLP", + intermediate_size=13824, + ), + # https://huggingface.co/NousResearch/Nous-Hermes-Llama2-13b + dict( + name="Nous-Hermes-Llama2-13b", + hf_config=dict(org="NousResearch", name="Nous-Hermes-Llama2-13b"), + vocab_size=32000, + padded_vocab_size=32032, + n_layer=40, + n_head=40, + n_embd=5120, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-05, + _mlp_class="LLaMAMLP", + intermediate_size=13824, + ), +] +configs.extend(nous_research) + + 
+############### +# Meta LLaMA 2 +############### +llama_2 = [ + # https://huggingface.co/meta-llama/Llama-2-7b-hf/blob/main/config.json + dict( + name="Llama-2-7b{}-hf", + hf_config=dict(org="meta-llama", name="Llama-2-7b{}-hf"), + vocab_size=32000, + padding_multiple=64, + n_layer=32, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + _mlp_class="LLaMAMLP", + intermediate_size=11008, + ), + # https://huggingface.co/meta-llama/Llama-2-13b-hf/blob/main/config.json + dict( + name="Llama-2-13b{}-hf", + hf_config=dict(org="meta-llama", name="Llama-2-13b{}-hf"), + vocab_size=32000, + padding_multiple=64, + n_layer=40, + n_head=40, + n_embd=5120, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + _mlp_class="LLaMAMLP", + intermediate_size=13824, + ), + # https://huggingface.co/meta-llama/Llama-2-70b-hf/blob/main/config.json + dict( + name="Llama-2-70b{}-hf", + hf_config=dict(org="meta-llama", name="Llama-2-70b{}-hf"), + vocab_size=32000, + padding_multiple=64, + n_layer=80, + n_head=64, + n_embd=8192, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + _mlp_class="LLaMAMLP", + intermediate_size=28672, + ), +] +for c in llama_2: + for kind in ("", "-chat"): + copy = deepcopy(c) + copy["name"] = c["name"].format(kind) + copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) + configs.append(copy) + + +############### +# Google Gemma +############### +gemma = [ + # https://huggingface.co/google/gemma-2b/blob/main/config.json + dict( + name="Gemma-2b", + hf_config=dict(org="google", name="gemma-2b"), + scale_embeddings=True, + vocab_size=256000, + padding_multiple=64, + n_embd=2048, + n_layer=18, + n_head=8, + n_query_groups=1, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + _mlp_class="GemmaMLP", + intermediate_size=16384, + ), + # https://huggingface.co/google/gemma-7b/blob/main/config.json + dict( + name="Gemma-7b", + hf_config=dict(org="google", name="gemma-7b"), + scale_embeddings=True, + vocab_size=256000, + padding_multiple=64, + n_embd=3072, + n_layer=28, + n_head=16, + head_size=256, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + _mlp_class="GemmaMLP", + intermediate_size=24576, + ), +] +configs.extend(gemma) +for c in gemma: + copy = deepcopy(c) + copy["name"] = f"{c['name']}-it" + copy["hf_config"]["name"] = f"{c['hf_config']['name']}-it" + configs.append(copy) + + +########################## +# Stability AI FreeWilly2 +########################## +freewilly_2 = [ + # https://huggingface.co/stabilityai/FreeWilly2/blob/main/config.json + dict( + name="FreeWilly2", + hf_config=dict(org="stabilityai", name="FreeWilly2"), + vocab_size=32000, + padding_multiple=64, + n_layer=80, + n_head=64, + n_embd=8192, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + _mlp_class="LLaMAMLP", + intermediate_size=28672, + ) +] +configs.extend(freewilly_2) + + +################## +# Meta Code Llama +################## +code_llama = [ + # https://huggingface.co/codellama/CodeLlama-7b-hf/blob/main/config.json + dict( + name="CodeLlama-7b-hf", + hf_config=dict(org="codellama", name="CodeLlama-7b-hf"), + block_size=16384, + vocab_size=32016, + padding_multiple=16, + n_layer=32, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-05, + _mlp_class="LLaMAMLP", + 
intermediate_size=11008, + rope_base=1000000, + ), + # https://huggingface.co/codellama/CodeLlama-13b-hf/blob/main/config.json + dict( + name="CodeLlama-13b-hf", + hf_config=dict(org="codellama", name="CodeLlama-13b-hf"), + block_size=16384, + vocab_size=32016, + padding_multiple=16, + n_layer=40, + n_head=40, + n_embd=5120, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-05, + _mlp_class="LLaMAMLP", + intermediate_size=13824, + rope_base=1000000, + ), + # https://huggingface.co/codellama/CodeLlama-34b-hf/blob/main/config.json + dict( + name="CodeLlama-34b-hf", + hf_config=dict(org="codellama", name="CodeLlama-34b-hf"), + block_size=16384, + vocab_size=32000, + padded_vocab_size=32000, + n_layer=48, + n_head=64, + n_embd=8192, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-05, + _mlp_class="LLaMAMLP", + intermediate_size=22016, + rope_base=1000000, + ), + # https://huggingface.co/codellama/CodeLlama-70b-hf/blob/main/config.json + dict( + name="CodeLlama-70b-hf", + hf_config=dict(org="codellama", name="CodeLlama-70b-hf"), + block_size=16384, + vocab_size=32016, + padding_multiple=16, + n_layer=80, + n_head=64, + n_embd=8192, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-05, + _mlp_class="LLaMAMLP", + intermediate_size=28672, + rope_base=1000000, + ), + # https://huggingface.co/codellama/CodeLlama-7b-Python-hf/blob/main/config.json + dict( + name="CodeLlama-7b-Python-hf", + hf_config=dict(org="codellama", name="CodeLlama-7b-Python-hf"), + block_size=16384, + vocab_size=32000, + padded_vocab_size=32000, + n_layer=32, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-05, + _mlp_class="LLaMAMLP", + intermediate_size=11008, + rope_base=1000000, + ), + # https://huggingface.co/codellama/CodeLlama-13b-Python-hf/blob/main/config.json + dict( + name="CodeLlama-13b-Python-hf", + hf_config=dict(org="codellama", name="CodeLlama-13b-Python-hf"), + block_size=16384, + vocab_size=32000, + padded_vocab_size=32000, + n_layer=40, + n_head=40, + n_embd=5120, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-05, + _mlp_class="LLaMAMLP", + intermediate_size=13824, + rope_base=1000000, + ), + # https://huggingface.co/codellama/CodeLlama-34b-Python-hf/blob/main/config.json + dict( + name="CodeLlama-34b-Python-hf", + hf_config=dict(org="codellama", name="CodeLlama-34b-Python-hf"), + block_size=16384, + vocab_size=32000, + padded_vocab_size=32000, + n_layer=48, + n_head=64, + n_embd=8192, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-05, + _mlp_class="LLaMAMLP", + intermediate_size=22016, + rope_base=1000000, + ), + # https://huggingface.co/codellama/CodeLlama-70b-Python-hf/blob/main/config.json + dict( + name="CodeLlama-70b-Python-hf", + hf_config=dict(org="codellama", name="CodeLlama-70b-Python-hf"), + block_size=16384, + vocab_size=32016, + padding_multiple=16, + n_layer=80, + n_head=64, + n_embd=8192, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-05, + _mlp_class="LLaMAMLP", + intermediate_size=28672, + rope_base=1000000, + ), + # https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf/blob/main/config.json + dict( + 
name="CodeLlama-7b-Instruct-hf", + hf_config=dict(org="codellama", name="CodeLlama-7b-Instruct-hf"), + block_size=16384, + vocab_size=32016, + padding_multiple=16, + n_layer=32, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-05, + _mlp_class="LLaMAMLP", + intermediate_size=11008, + rope_base=1000000, + ), + # https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf/blob/main/config.json + dict( + name="CodeLlama-13b-Instruct-hf", + hf_config=dict(org="codellama", name="CodeLlama-13b-Instruct-hf"), + block_size=2048, + vocab_size=32016, + padding_multiple=16, + n_layer=40, + n_head=40, + n_embd=5120, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-05, + _mlp_class="LLaMAMLP", + intermediate_size=13824, + rope_base=1000000, + ), + # https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf/blob/main/config.json + dict( + name="CodeLlama-34b-Instruct-hf", + hf_config=dict(org="codellama", name="CodeLlama-34b-Instruct-hf"), + block_size=16384, + vocab_size=32000, + padded_vocab_size=32000, + n_layer=48, + n_head=64, + n_embd=8192, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-05, + _mlp_class="LLaMAMLP", + intermediate_size=22016, + rope_base=1000000, + ), + # https://huggingface.co/codellama/CodeLlama-70b-Instruct-hf/blob/main/config.json + dict( + name="CodeLlama-70b-Instruct-hf", + hf_config=dict(org="codellama", name="CodeLlama-70b-Instruct-hf"), + block_size=16384, + vocab_size=32016, + padding_multiple=16, + n_layer=80, + n_head=64, + n_embd=8192, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-05, + _mlp_class="LLaMAMLP", + intermediate_size=28672, + rope_base=1000000, + ), +] +configs.extend(code_llama) + + +######################## +# garage-bAInd Platypus +######################## +platypus = [ + # https://huggingface.co/garage-bAInd/Platypus-30B/blob/main/config.json + dict( + name="Platypus-30B", + hf_config=dict(org="garage-bAInd", name="Platypus-30B"), + block_size=2048, + padded_vocab_size=32000, + n_layer=60, + n_head=52, + n_embd=6656, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-06, + _mlp_class="LLaMAMLP", + intermediate_size=17920, + ), + # https://huggingface.co/garage-bAInd/Platypus2-7B/blob/main/config.json + dict( + name="Platypus2-7B", + hf_config=dict(org="garage-bAInd", name="Platypus2-7B"), + padded_vocab_size=32000, + n_layer=32, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-05, + _mlp_class="LLaMAMLP", + intermediate_size=11008, + ), + # https://huggingface.co/garage-bAInd/Platypus2-13B/blob/main/config.json + dict( + name="Platypus2-13B", + hf_config=dict(org="garage-bAInd", name="Platypus2-13B"), + padded_vocab_size=32000, + n_layer=40, + n_head=40, + n_embd=5120, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-05, + _mlp_class="LLaMAMLP", + intermediate_size=13824, + ), + # https://huggingface.co/garage-bAInd/Platypus2-70B/blob/main/config.json + dict( + name="Platypus2-70B", + hf_config=dict(org="garage-bAInd", name="Platypus2-70B"), + padded_vocab_size=32000, + n_layer=80, + n_head=64, + n_embd=8192, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + _mlp_class="LLaMAMLP", + 
intermediate_size=28672, + ), + # https://huggingface.co/garage-bAInd/Camel-Platypus2-13B/blob/main/config.json + dict( + name="Camel-Platypus2-13B", + hf_config=dict(org="garage-bAInd", name="Camel-Platypus2-13B"), + padded_vocab_size=32000, + n_layer=40, + n_head=40, + n_embd=5120, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + _mlp_class="LLaMAMLP", + intermediate_size=13824, + ), + # https://huggingface.co/garage-bAInd/Camel-Platypus2-70B/blob/main/config.json + dict( + name="Camel-Platypus2-70B", + hf_config=dict(org="garage-bAInd", name="Camel-Platypus2-70B"), + padded_vocab_size=32000, + n_layer=80, + n_head=64, + n_embd=8192, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + _mlp_class="LLaMAMLP", + intermediate_size=28672, + ), + # https://huggingface.co/garage-bAInd/Stable-Platypus2-13B/blob/main/config.json + dict( + name="Stable-Platypus2-13B", + hf_config=dict(org="garage-bAInd", name="Stable-Platypus2-13B"), + padded_vocab_size=32000, + n_layer=40, + n_head=40, + n_embd=5120, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + _mlp_class="LLaMAMLP", + intermediate_size=13824, + ), + # https://huggingface.co/garage-bAInd/Platypus2-70B-instruct/blob/main/config.json + dict( + name="Platypus2-70B-instruct", + hf_config=dict(org="garage-bAInd", name="Platypus2-70B-instruct"), + padded_vocab_size=32000, + n_layer=80, + n_head=64, + n_embd=8192, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + _mlp_class="LLaMAMLP", + intermediate_size=28672, + ), +] +configs.extend(platypus) + + +########################## +# Stability AI StableCode +########################## +stablecode = [ + # https://huggingface.co/stabilityai/stablecode-completion-alpha-3b/blob/main/config.json + dict( + name="stablecode-completion-alpha-3b", + hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b"), + block_size=16384, + vocab_size=49152, + n_layer=32, + n_embd=2560, + ), + # https://huggingface.co/stabilityai/stablecode-completion-alpha-3b-4k/blob/main/config.json + dict( + name="stablecode-completion-alpha-3b-4k", + hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b-4k"), + vocab_size=49152, + n_layer=32, + n_embd=2560, + ), + # https://huggingface.co/stabilityai/stablecode-instruct-alpha-3b/blob/main/config.json + dict( + name="stablecode-instruct-alpha-3b", + hf_config=dict(org="stabilityai", name="stablecode-instruct-alpha-3b"), + vocab_size=49152, + n_layer=32, + n_embd=2560, + ), +] +configs.extend(stablecode) + + +################################## +# togethercomputer LLaMA-2-7B-32K +################################## +together_llama2_32k = [ + # https://huggingface.co/togethercomputer/LLaMA-2-7B-32K/blob/main/config.json + dict( + name="LLaMA-2-7B-32K", + hf_config=dict(org="togethercomputer", name="LLaMA-2-7B-32K"), + vocab_size=32000, + padding_multiple=64, + n_layer=32, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + _mlp_class="LLaMAMLP", + intermediate_size=11008, + rope_condense_ratio=8, + ) +] +configs.extend(together_llama2_32k) + + +################ +# Microsoft Phi +################ +phi = [ + # https://huggingface.co/microsoft/phi-1_5/blob/main/config.json + dict( + name="phi-1_5", + hf_config=dict(org="microsoft", name="phi-1_5"), + vocab_size=50257, + padded_vocab_size=51200, + block_size=2048, + 
n_embd=2048, + n_layer=24, + rotary_percentage=0.5, # 32 / (n_embd / n_head) = 32 / 64 + shared_attention_norm=True, + lm_head_bias=True, + gelu_approximate="tanh", + ), + # https://huggingface.co/microsoft/phi-2/blob/main/config.json + dict( + name="phi-2", + hf_config=dict(org="microsoft", name="phi-2"), + vocab_size=50257, + padded_vocab_size=51200, + block_size=2048, + n_embd=2560, + n_layer=32, + rotary_percentage=0.4, # 32 / (n_embd / n_head) = 32 / 80 + shared_attention_norm=True, + lm_head_bias=True, + gelu_approximate="tanh", + ), +] +configs.extend(phi) + + +############# +# Mistral AI +############# +mistral = [ + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + dict( + name="Mistral-7B-{}v0.1", + hf_config=dict(org="mistralai", name="Mistral-7B-{}v0.1"), + padded_vocab_size=32000, + block_size=4096, # should be 32768 but sliding window attention is not implemented + n_layer=32, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-05, + _mlp_class="LLaMAMLP", + intermediate_size=14336, + ), + # https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/blob/main/config.json + dict( + name="Mixtral-8x7B-{}v0.1", + hf_config=dict(org="mistralai", name="Mixtral-8x7B-{}v0.1"), + padded_vocab_size=32000, + block_size=32768, + n_layer=32, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-05, + _mlp_class="LLaMAMoE", + intermediate_size=14336, + rope_base=1000000, + n_expert=8, + n_expert_per_token=2, + ), +] +for c in mistral: + for kind in ("", "Instruct-"): + copy = deepcopy(c) + copy["name"] = c["name"].format(kind) + copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) + configs.append(copy) +configs.append( + # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/blob/main/config.json + dict( + name="Mistral-7B-Instruct-v0.2", + hf_config=dict(org="mistralai", name="Mistral-7B-Instruct-v0.2"), + padded_vocab_size=32000, + block_size=32768, + n_layer=32, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-05, + _mlp_class="LLaMAMLP", + intermediate_size=14336, + ) +) + + +############ +# TinyLlama +############ +tiny_llama = [ + dict( + name="tiny-llama-1.1b{}", + hf_config=dict(org="TinyLlama", name="TinyLlama-1.1B{}"), + block_size=2048, + vocab_size=32000, + padding_multiple=64, + n_layer=22, + n_head=32, + n_embd=2048, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", # original TinyLlama uses FusedRMSNorm + norm_eps=1e-5, + _mlp_class="LLaMAMLP", + intermediate_size=5632, + n_query_groups=4, + ) +] +for c in tiny_llama: + for kind, hf_postfix in (("", "-intermediate-step-1431k-3T"), ("-chat", "-Chat-v1.0")): + copy = deepcopy(c) + copy["name"] = c["name"].format(kind) + copy["hf_config"]["name"] = c["hf_config"]["name"].format(hf_postfix) + configs.append(copy) + + +########################## +# Trelis Function Calling +########################## +llama_2_function_calling = [ + # https://huggingface.co/Trelis/Llama-2-7b-chat-hf-function-calling-v2/blob/main/config.json + dict( + name="Llama-2-7b-chat-hf-function-calling-v2", + hf_config=dict(org="Trelis", name="Llama-2-7b-chat-hf-function-calling-v2"), + padding_multiple=64, + n_layer=32, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + _mlp_class="LLaMAMLP", + intermediate_size=11008, + 
norm_eps=1e-6, + block_size=4096, + vocab_size=32000, + n_head=32, + n_embd=4096, + rope_base=10000, + ) +] + +configs.extend(llama_2_function_calling) + +name_to_config = {config["name"]: config for config in configs} diff --git a/llm-lora-finetuning/lit_gpt/lora.py b/llm-lora-finetuning/lit_gpt/lora.py new file mode 100644 index 00000000..bfc7adc1 --- /dev/null +++ b/llm-lora-finetuning/lit_gpt/lora.py @@ -0,0 +1,737 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +# Derived from https://github.com/microsoft/LoRA +# ------------------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. +# ------------------------------------------------------------------------------------------ + +r""" + Low Ranking Adaptation for LLMs scheme. + + ┌───────────────────┐ + ┆ h ┆ + └───────────────────┘ + ▲ + | + + + / \ + ┌─────────────────┐ ╭───────────────╮ Matrix initialization: + ┆ ┆ \ B / B = 0 + ┆ pretrained ┆ \ r*d / A = N(0, sigma^2) + ┆ weights ┆ ╰─────────╯ + ┆ ┆ | r | r - rank + ┆ W e R^(d*d) ┆ | ◀─────▶ | + ┆ ┆ ╭─────────╮ + └─────────────────┘ / A \ + ▲ / d*r \ + \ ╰───────────────╯ + \ ▲ + \ / + \ / + ┌───────────────────┐ + ┆ x ┆ + └───────────────────┘ + +With LoRA (Low Ranking Adaptation: https://arxiv.org/abs/2106.09685) instead of learning weights of size d*d, +we can freeze the pretrained weights and instead learn two matrices of size d*r and r*d (they will store weight updates +for the pretrained weights): the number of parameters in this case will be reduced drastically (depending on the rank of +course) yet after multiplication of matrices d*r and r*d we will get a matrix d*d which we can sum with frozen +pretrained weights and thus fine-tune the model. + +The goal of this approach is to move weight updates into a separate matrix which is decomposed with +two matrices of a lower rank. +""" + +import math +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Type, Union + +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing_extensions import Self + +import lit_gpt +from lit_gpt.config import Config as BaseConfig +from lit_gpt.model import GPT as BaseModel +from lit_gpt.model import Block as BaseBlock +from lit_gpt.model import CausalSelfAttention as BaseCausalSelfAttention +from lit_gpt.model import KVCache +from lit_gpt.utils import map_old_state_dict_weights + + +class LoRALayer(nn.Module): + def __init__(self, r: int, lora_alpha: int, lora_dropout: float): + """Store LoRA specific attributes in a class. + + Args: + r: rank of the weight update matrices. To make sense of using LoRA the rank should be smaller than the rank of + the weights of the model. 
The rank can be as low as 1: https://arxiv.org/pdf/2106.09685.pdf (section 7.2) + lora_alpha: alpha is needed for scaling updates as alpha/r + "This scaling helps to reduce the need to retune hyperparameters when we vary r" + https://arxiv.org/pdf/2106.09685.pdf (section 4.1) + lora_dropout: dropout that is applied on the input in the LoRA branch (before multiplying by matrix A) + """ + super().__init__() + assert r >= 0 + self.r = r + self.lora_alpha = lora_alpha + # Optional dropout + if lora_dropout > 0.0: + self.lora_dropout = nn.Dropout(p=lora_dropout) + else: + self.lora_dropout = lambda x: x + # Mark the weight as unmerged + self.merged = False + + +class LoRALinear(LoRALayer): + # LoRA implemented in a dense layer + def __init__( + self, + # ↓ this part is for pretrained weights + in_features: int, + out_features: int, + # ↓ the remaining part is for LoRA + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + **kwargs: Any, + ): + """LoRA wrapper around linear class. + + This class has three weight matrices: + 1. Pretrained weights are stored as `self.linear.weight` + 2. LoRA A matrix as `self.lora_A` + 3. LoRA B matrix as `self.lora_B` + Only LoRA's A and B matrices are updated, pretrained weights stay frozen. + + Args: + in_features: number of input features of the pretrained weights + out_features: number of output features of the pretrained weights + r: rank of the weight update matrices. To make sense of using LoRA the rank should be smaller than the rank of + the weights of the model. The rank can be as low as 1: https://arxiv.org/pdf/2106.09685.pdf (section 7.2) + lora_alpha: alpha is needed for scaling updates as alpha/r + "This scaling helps to reduce the need to retune hyperparameters when we vary r" + https://arxiv.org/pdf/2106.09685.pdf (section 4.1) + lora_dropout: dropout that is applied on the input in the LoRA branch (before multiplying by matrix A) + """ + super().__init__(r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout) + self.linear = torch.nn.Linear(in_features, out_features, **kwargs) + + # Actual trainable parameters + if r > 0: + self.lora_A = nn.Parameter(torch.zeros((r, in_features))) + self.lora_B = nn.Parameter(torch.zeros((out_features, r))) + self.scaling = self.lora_alpha / self.r + self.reset_parameters() + + def reset_parameters(self) -> None: + """Reset all the weights, even including pretrained ones.""" + if hasattr(self, "lora_A"): + # initialize A the same way as the default for nn.Linear and B to zero + # Wondering why 'a' is equal to math.sqrt(5)?: https://github.com/pytorch/pytorch/issues/15314 + nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5)) + nn.init.zeros_(self.lora_B) + + def get_lora_AB(self) -> torch.Tensor: + """Return merged lora_A and lora_B matrices with the same shape as the pretrained weights.""" + return (self.lora_B @ self.lora_A) * self.scaling + + def merge(self) -> None: + """Merges the LoRA weights into the full-rank weights (W = W + delta_W).""" + if self.r > 0 and not self.merged: + pretrained_dtype = self.linear.weight.data.dtype + lora_data = self.get_lora_AB() + # if the pretrained weights and LoRA weights are of the same dtype - simply sum them + if pretrained_dtype == lora_data.dtype: + self.linear.weight.data += lora_data + # if only the pretrained are in quantized form - dequantize, sum with LoRA and quantize the result + elif pretrained_dtype == torch.uint8: + import bitsandbytes as bnb + + weight = self.linear.weight + # dequantize the pretrained weights + weight_data = 
bnb.functional.dequantize_4bit(weight.data, weight.quant_state).to(lora_data.dtype) + # add pretrained and LoRA weights + weight_data += lora_data + # assign updated weights and quantize by moving to CUDA device + self.linear.weight = bnb.nn.Params4bit(weight_data, requires_grad=False, **weight.__dict__) + self.linear.weight.cuda(weight.device) + else: + raise NotImplementedError( + f"Cannot merge the pretrained weights of type {pretrained_dtype}" + f" and LoRA weights of type {lora_data.dtype}" + ) + + self.merged = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # if weights are merged or rank is less or equal to zero (LoRA is disabled) - it's only a regular nn.Linear forward pass; + # otherwise in addition do the forward pass with LoRA weights and add it's output to the output from pretrained weights + pretrained = self.linear(x) + if self.r == 0 or self.merged: + return pretrained + lora = (self.lora_dropout(x) @ self.lora_A.transpose(0, 1) @ self.lora_B.transpose(0, 1)) * self.scaling + return pretrained + lora + + +class LoRAQKVLinear(LoRALinear): + # LoRA implemented in a dense layer + def __init__( + self, + # ↓ this part is for pretrained weights + in_features: int, + out_features: int, + # ↓ the remaining part is for LoRA + n_head: int, + n_query_groups: int, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + enable_lora: Union[bool, Tuple[bool, bool, bool]] = False, + **kwargs: Any, + ): + """LoRA wrapper around linear class that is used for calculation of q, k and v matrices. + + This class has three weight matrices: + 1. Pretrained weights are stored as `self.linear.weight` + 2. LoRA A matrix as `self.lora_A` + 3. LoRA B matrix as `self.lora_B` + Only LoRA's A and B matrices are updated, pretrained weights stay frozen. + + Args: + in_features: number of input features of the pretrained weights + out_features: number of output features of the pretrained weights + n_head: number of attention heads + n_query_groups: number of query groups (see diagram in `lit_gpt/config.py`) + r: rank of the weight update matrices. To make sense of using LoRA the rank should be smaller than the rank of + the weights of the model. The rank can be as low as 1: https://arxiv.org/pdf/2106.09685.pdf (section 7.2) + lora_alpha: alpha is needed for scaling updates as alpha/r + "This scaling helps to reduce the need to retune hyperparameters when we vary r" + https://arxiv.org/pdf/2106.09685.pdf (section 4.1) + lora_dropout: dropout that is applied on the input in the LoRA branch (before multiplying by matrix A) + enable_lora: MergeLinear class is for attention mechanism where qkv are calculated with a single weight matrix. If we + don't want to apply LoRA we can set it as False. 
For example if we want to apply LoRA only to `query` + and `value` but keep `key` without weight updates we should pass `[True, False, True]` + """ + super(LoRALinear, self).__init__(r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout) + self.linear = torch.nn.Linear(in_features, out_features, **kwargs) + self.n_head = n_head + self.n_query_groups = n_query_groups + if isinstance(enable_lora, bool): + enable_lora = [enable_lora] * 3 + assert len(enable_lora) == 3 + self.enable_lora = enable_lora + + # Actual trainable parameters + # To better understand initialization let's imagine that we have such parameters: + # ⚬ in_features: 128 (embeddings_size) + # ⚬ out_features: 384 (3 * embedding_size) + # ⚬ r: 2 + # ⚬ enable_lora: [True, False, True] + if r > 0 and any(enable_lora): + self.lora_A = nn.Parameter(torch.zeros((r * sum(enable_lora), in_features))) # (4, 128) + enable_q, enable_k, enable_v = enable_lora + self.kv_embd_size = self.linear.in_features // (n_head // n_query_groups) + # qkv_shapes will be used to split a tensor with weights correctly + qkv_shapes = ( + self.linear.in_features * enable_q, + self.kv_embd_size * enable_k, + self.kv_embd_size * enable_v, + ) + self.qkv_shapes = [s for s in qkv_shapes if s] + self.lora_B = nn.Parameter(torch.zeros(sum(self.qkv_shapes), r)) # (256, 2)) + # Notes about shapes above + # - self.lora_A has shape (4, 128): 4 because rank is 2 and LoRA is applied only to two matrices; + # 128 is the input size of the x (embedding size). (4, 128) and not (128, 4) because later on in + # F.linear function weights are automatically transposed. In addition conv1d requires channels to + # be before seq length + # - self.lora_B has shape (256, 2): 256 because LoRA is applied only to two matrices, so the output is + # 128*2; 2 tells to have two channels per group for group convolution + + # Scaling: + # This balances the pretrained model`s knowledge and the new task-specific adaptation + # https://lightning.ai/pages/community/tutorial/lora-llm/ + # So, set alpha to 1.0 to fully add LoRA. If the LoRA seems to have too much effect (i.e., overfitted), set + # alpha to lower value. If the LoRA seems to have too little effect, set alpha to higher than 1.0. You can + # tune these values to your needs. This value can be even slightly greater than 1.0! + # https://github.com/cloneofsimo/lora + self.scaling = self.lora_alpha / self.r + + # Compute the indices + # Indices are needed to properly pad weight updates with zeros in `zero_pad` method. + q_per_kv = self.n_head // self.n_query_groups + total_qkv = q_per_kv + 2 + head_size = out_features // (self.n_query_groups * total_qkv) + ind = range(out_features) + self.lora_ind = [] + if enable_q: + q_ind = [x for x in ind if (x // head_size) % total_qkv < total_qkv - 2] + self.lora_ind.extend(q_ind) + if enable_k: + k_ind = [x for x in ind if (x // head_size) % total_qkv == total_qkv - 2] + self.lora_ind.extend(k_ind) + if enable_v: + v_ind = [x for x in ind if (x // head_size) % total_qkv == total_qkv - 1] + self.lora_ind.extend(v_ind) + self.reset_parameters() + + def zero_pad(self, x: torch.Tensor) -> torch.Tensor: + """Properly pad weight updates with zeros. 
+ + If, based on `self.enable_lora`, we want to fine-tune queries and values, but not keys, + then the weights update should be: + + [[ΔW,ΔW,ΔW, ..., 0,0,0, ..., ΔW,ΔW,ΔW,], + [....................................], + [ΔW,ΔW,ΔW, ..., 0,0,0, ..., ΔW,ΔW,ΔW,]] + ↑ ↑ ↑ + ________________________________________ + | query | key | value | + ---------------------------------------- + For Llama2's GQA support, Q, K, and V weights are interleaved, so that weights for grouped + queries are adjacent to their associated key and value weights. + For example, suppose we have n_head = 12 with 3 query groups. + Then along the embedding dimension the interleaved weights would look like + + [Q, Q, Q, Q, K, V, Q, Q, Q, Q, K, V, Q, Q, Q, Q, K, V], + + where each Q, K, and V has size head_size. + + In this case, the previously-described weight update applies separately to each + individual block, so the update will take the form + + [[ΔW,ΔW,ΔW, ..., 0,0,0, ..., ΔW,ΔW,ΔW, ΔW,ΔW,ΔW, ..., 0,0,0, ..., ΔW,ΔW,ΔW, ...], + [.............................................................................], + [ΔW,ΔW,ΔW, ..., 0,0,0, ..., ΔW,ΔW,ΔW, ΔW,ΔW,ΔW, ..., 0,0,0, ..., ΔW,ΔW,ΔW, ...]] + ↑ ↑ ↑ ↑ ↑ ↑ + ________________________________________________________________________________ + | q block 1 | k block 1 | v block 1 | q block 2 | k block 2 | v block 2 | ... + -------------------------------------------------------------------------------- + Note that in the above diagram, the size of each q block will equal q_per_kv + times the size of each k and v block. + + Args: + x: tensor with weights update that will be padded with zeros if necessary + + Returns: + A tensor with weight updates and zeros for deselected q, k or v + """ + # we need to do zero padding only if LoRA is disabled for one of QKV matrices + if all(self.enable_lora): + return x + + # Let's image that: + # ⚬ input x has shape (64, 64, 256): (batch_size, sequence_length, embeddings_size) + # ⚬ embeddings_size: 128 + # ⚬ self.linear.out_features: 384 (3 * embeddings_size) + # ⚬ enable_lora: [True, False, True] + # Then x has embeddings_size of 256 (2 * 128 as enable_lora only for query and value, not keys) and expected + # embeddings_size is 384 (self.linear.out_features), so that means that we need to pad from 256 to 384 with zeros, but + # only for key updates (this is where self.lora_ind comes in handy) + # Note: double transpose (in the beginning and in the end) is basically a guard for two-dimensional tensors + # for example when we want to merge/unmerge LoRA weights and pretrained weights + x = x.transpose(0, 1) + result = x.new_zeros((*x.shape[:-1], self.linear.out_features)) # (64, 64, 384) + result = result.view(-1, self.linear.out_features) # (4096, 384) + result = result.index_copy( + 1, torch.tensor(self.lora_ind, device=result.device), x.reshape(-1, sum(self.qkv_shapes)) + ) # (4096, 256) + return result.view((*x.shape[:-1], self.linear.out_features)).transpose(0, 1) # (64, 64, 384) + + def conv1d(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: + """An extension of the `torch.nn.functional.conv1d` function with a logic specific to grouped queries. + + If the number of heads is equal to the number of query groups - grouped queries are disabled + (see scheme in `lit_gpt/config.py:Config`). 
In this case the combined QKV matrix consists of equally sized + query, key and value parts, which means we can utilize `groups` argument from `conv1d`: with this argument the + input and weight matrices will be splitted in equally sized parts and applied separately (like having multiple + conv layers side by side). + + Otherwise QKV matrix consists of unequally sized parts and thus we have to split input and weight matrices manually, + apply each part of the weight matrix to the corresponding input's part and concatenate the result. + + Args: + input: input matrix of shape (B, C, T) + weight: weight matrix of shape (C_output, rank, 1). + "C_output" is defined as a sum of embedding sizes for each enabled LoRA layer (see init method of the class). + + Returns: + A tensor with a shape (B, C_output, T) + + """ + if self.n_head == self.n_query_groups: + return F.conv1d(input, weight, groups=sum(self.enable_lora)) # (B, C_output, T) + + # Notation: + # ⚬ N: number of enabled LoRA layers (self.enable_lora) + # ⚬ C_output': embeddings size for each LoRA layer (not equal in size) + # ⚬ r: rank of all LoRA layers (equal in size) + + input_splitted = input.chunk(sum(self.enable_lora), dim=1) # N * (B, C // N, T) + weight_splitted = weight.split(self.qkv_shapes) # N * (C_output', r, 1) + return torch.cat( + [F.conv1d(a, b) for a, b in zip(input_splitted, weight_splitted)], dim=1 # (B, C_output', T) + ) # (B, C_output, T) + + def get_lora_AB(self) -> torch.Tensor: + """Return merged lora_A and lora_B matrices with the same shape as the pretrained weights.""" + # Let's assume that: + # ⚬ self.linear.weight.data: (384, 128) or (3 * embedding_size, embedding_size) + # ⚬ self.lora_A.data: (4, 128) + # ⚬ self.lora_B.data: (256, 2) + lora = self.conv1d( + self.lora_A.data.unsqueeze(0), # (4, 128) -> (1, 4, 128) + self.lora_B.data.unsqueeze(-1), # (256, 2) -> (256, 2, 1) + ).squeeze( + 0 + ) # (1, 4, 128) @ (256, 2, 1) -> (1, 256, 128) -> (256, 128) + return self.zero_pad(lora * self.scaling) # (256, 128) after zero_pad (384, 128) + + def merge(self) -> None: + """Merges the LoRA weights into the full-rank weights (W = W + delta_W).""" + if self.r > 0 and any(self.enable_lora) and not self.merged: + super().merge() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Do the forward pass. + + If LoRA's weights are merged with pretrained ones then it's a simple matrix multiplication. + If not, then multiply pretrained weights with input, apply LoRA on input and do summation. 
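+
+        Schematically (a simplified view of the code below; the real implementation
+        also transposes the activations for ``conv1d`` and uses the module attributes
+        ``self.lora_dropout``, ``self.lora_A`` and ``self.lora_B``):
+
+            pretrained = self.linear(x)
+            lora = self.zero_pad(conv1d(dropout(x) @ lora_A.T, lora_B)) * self.scaling
+            output = pretrained + lora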
+ + Args: + x: input tensor of shape (batch_size, context_length, embedding_size) + + Returns: + Output tensor of shape (batch_size, context_length, 3 * embedding_size) + """ + + # Let's assume that: + # ⚬ x: (64, 64, 128) or (batch_size, context_length, embedding_size) + # ⚬ self.linear.weight: (384, 128) or (3 * embedding_size, embedding_size) + # ⚬ self.lora_A.data: (4, 128) + # ⚬ self.lora_B.data: (256, 2) + + # if weights are merged or LoRA is disabled (r <= 0 or all `enable_lora` are False) - it's only a regular nn.Linear forward pass; + # otherwise in addition do the forward pass with LoRA weights and add it's output to the output from pretrained weights + pretrained = self.linear(x) + if self.r == 0 or not any(self.enable_lora) or self.merged: + return pretrained + after_A = F.linear(self.lora_dropout(x), self.lora_A) # (64, 64, 128) @ (4, 128) -> (64, 64, 4) + # For F.conv1d: + # ⚬ input: input tensor of shape (mini-batch, in_channels, iW) + # ⚬ weight: filters of shape (out_channels, in_channels/groups, kW) + after_B = self.conv1d( + after_A.transpose(-2, -1), # (64, 64, 4) -> (64, 4, 64) + self.lora_B.unsqueeze(-1), # (256, 2) -> (256, 2, 1) + ).transpose( + -2, -1 + ) # (64, 4, 64) @ (256, 2, 1) -> (64, 256, 64) -> (64, 64, 256) + lora = self.zero_pad(after_B) * self.scaling # (64, 64, 256) after zero_pad (64, 64, 384) + return pretrained + lora + + +def mark_only_lora_as_trainable(model: nn.Module, bias: str = "none") -> None: + """Freeze all modules except LoRA's and depending on 'bias' value unfreezes bias weights. + + Args: + model: model with LoRA layers + bias: + ``"none"``: all bias weights will be frozen, + ``"lora_only"``: only bias weight for LoRA layers will be unfrozen, + ``"all"``: all bias weights will be unfrozen. + + Raises: + NotImplementedError: if `bias` not in ["none", "lora_only", "all"] + """ + # freeze all layers except LoRA's + for n, p in model.named_parameters(): + if "lora_" not in n: + p.requires_grad = False + + # depending on the `bias` value unfreeze bias weights + if bias == "none": + return + if bias == "all": + for n, p in model.named_parameters(): + if "bias" in n: + p.requires_grad = True + elif bias == "lora_only": + for m in model.modules(): + if isinstance(m, LoRALayer) and hasattr(m, "bias") and m.bias is not None: + m.bias.requires_grad = True + else: + raise NotImplementedError + + +def lora_filter(key: str, value: Any) -> bool: + return "lora_" in key + + +@dataclass +class Config(BaseConfig): + """ + Args: + r: rank of the weight update matrices. To make sense of using LoRA the rank should be smaller than the rank of + the weights of the model. 
The rank can be as low as 1: https://arxiv.org/pdf/2106.09685.pdf (section 7.2) + alpha: alpha is needed for scaling updates as alpha/r + "This scaling helps to reduce the need to retune hyperparameters when we vary r" + https://arxiv.org/pdf/2106.09685.pdf (section 4.1) + dropout: dropout that is applied on the input in the LoRA branch (before multiplying by matrix A) + to_*: either apply LoRA to the specified weights or not + """ + + r: int = 0 + alpha: int = 1 + dropout: float = 0.0 + to_query: bool = False + to_key: bool = False + to_value: bool = False + to_projection: bool = False + to_mlp: bool = False + to_head: bool = False + + @property + def mlp_class(self) -> Type: + return getattr(lit_gpt.lora, self._mlp_class) + + +class GPT(BaseModel): + def __init__(self, config: Config) -> None: + nn.Module.__init__(self) + assert config.padded_vocab_size is not None + self.config = config + + self.lm_head = LoRALinear( + config.n_embd, + config.padded_vocab_size, + bias=config.lm_head_bias, + r=(config.r if config.to_head else 0), + lora_alpha=config.alpha, + lora_dropout=config.dropout, + ) + self.transformer = nn.ModuleDict( + dict( + wte=nn.Embedding(config.padded_vocab_size, config.n_embd), + h=nn.ModuleList(Block(config) for _ in range(config.n_layer)), + ln_f=config.norm_class(config.n_embd, eps=config.norm_eps), + ) + ) + self.max_seq_length = self.config.block_size + self.mask_cache: Optional[torch.Tensor] = None + + def forward( + self, idx: torch.Tensor, input_pos: Optional[torch.Tensor] = None, lm_head_chunk_size: int = 0 + ) -> Union[torch.Tensor, List[torch.Tensor]]: + T = idx.size(1) + if self.max_seq_length < T: + raise ValueError(f"Cannot forward sequence of length {T}, max seq length is only {self.max_seq_length}.") + + if input_pos is not None: # use the kv cache + cos = self.cos.index_select(0, input_pos) + sin = self.sin.index_select(0, input_pos) + if self.mask_cache is None: + raise TypeError("You need to call `gpt.set_kv_cache()`") + mask = self.mask_cache.index_select(2, input_pos) + else: + cos = self.cos[:T] + sin = self.sin[:T] + mask = None + + x = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd) + for block in self.transformer.h: + x = block(x, cos, sin, mask, input_pos) + x = self.transformer.ln_f(x) + if lm_head_chunk_size > 0: + # chunk the lm head logits to reduce the peak memory used by autograd + return [self.lm_head(x_i) for x_i in x.split(lm_head_chunk_size, dim=1)] + return self.lm_head(x) # (B, T, vocab_size) + + @classmethod + def from_name(cls, name: str, **kwargs: Any) -> Self: + return cls(Config.from_name(name, **kwargs)) + + def _init_weights(self, module: nn.Module) -> None: + """Meant to be used with `gpt.apply(gpt._init_weights)`. 
Unused method left for completeness.""" + super()._init_weights(module) + if isinstance(module, LoRALinear): + module.reset_parameters() + + def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None: + """For compatibility with base checkpoints.""" + mapping = {"lm_head.weight": "lm_head.linear.weight", "lm_head.bias": "lm_head.linear.bias"} + state_dict = map_old_state_dict_weights(state_dict, mapping, prefix) + super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) + + +class Block(BaseBlock): + def __init__(self, config: Config) -> None: + nn.Module.__init__(self) + self.norm_1 = config.norm_class(config.n_embd, eps=config.norm_eps) + self.attn = CausalSelfAttention(config) + if not config.shared_attention_norm: + self.norm_2 = config.norm_class(config.n_embd, eps=config.norm_eps) + self.mlp = config.mlp_class(config) + + self.config = config + + +class CausalSelfAttention(BaseCausalSelfAttention): + def __init__(self, config: Config) -> None: + # Skip the parent class __init__ altogether and replace it to avoid + # useless allocations + nn.Module.__init__(self) + shape = (config.n_head + 2 * config.n_query_groups) * config.head_size + # key, query, value projections for all heads, but in a batch + self.attn = LoRAQKVLinear( + in_features=config.n_embd, + out_features=shape, + r=config.r, + lora_alpha=config.alpha, + lora_dropout=config.dropout, + enable_lora=(config.to_query, config.to_key, config.to_value), + bias=config.bias, + # for MQA/GQA support + n_head=config.n_head, + n_query_groups=config.n_query_groups, + ) + # output projection + # if `head_size` is explicitly specified in the config, `n_emd` might not be equal to `head_size * n_head` + self.proj = LoRALinear( + config.head_size * config.n_head, + config.n_embd, + bias=config.bias, + r=(config.r if config.to_projection else 0), + lora_alpha=config.alpha, + lora_dropout=config.dropout, + ) + # disabled by default + self.kv_cache: Optional[KVCache] = None + + self.config = config + + def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None: + """For compatibility with base checkpoints.""" + mapping = { + "attn.weight": "attn.linear.weight", + "attn.bias": "attn.linear.bias", + "proj.weight": "proj.linear.weight", + "proj.bias": "proj.linear.bias", + } + state_dict = map_old_state_dict_weights(state_dict, mapping, prefix) + super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) + + +class GptNeoxMLP(lit_gpt.model.GptNeoxMLP): + def __init__(self, config: Config) -> None: + nn.Module.__init__(self) + self.fc = LoRALinear( + config.n_embd, + config.intermediate_size, + bias=config.bias, + r=(config.r if config.to_mlp else 0), + lora_alpha=config.alpha, + lora_dropout=config.dropout, + ) + self.proj = LoRALinear( + config.intermediate_size, + config.n_embd, + bias=config.bias, + r=(config.r if config.to_mlp else 0), + lora_alpha=config.alpha, + lora_dropout=config.dropout, + ) + + self.config = config + + def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None: + """For compatibility with base checkpoints.""" + mapping = { + "fc.weight": "fc.linear.weight", + "fc.bias": "fc.linear.bias", + "proj.weight": "proj.linear.weight", + "proj.bias": "proj.linear.bias", + } + state_dict = map_old_state_dict_weights(state_dict, mapping, prefix) + super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) + + +class LLaMAMLP(lit_gpt.model.LLaMAMLP): + def __init__(self, config: Config) -> 
None: + nn.Module.__init__(self) + self.fc_1 = LoRALinear( + config.n_embd, + config.intermediate_size, + bias=config.bias, + r=(config.r if config.to_mlp else 0), + lora_alpha=config.alpha, + lora_dropout=config.dropout, + ) + self.fc_2 = LoRALinear( + config.n_embd, + config.intermediate_size, + bias=config.bias, + r=(config.r if config.to_mlp else 0), + lora_alpha=config.alpha, + lora_dropout=config.dropout, + ) + self.proj = LoRALinear( + config.intermediate_size, + config.n_embd, + bias=config.bias, + r=(config.r if config.to_mlp else 0), + lora_alpha=config.alpha, + lora_dropout=config.dropout, + ) + + def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None: + """For compatibility with base checkpoints.""" + mapping = { + "fc_1.weight": "fc_1.linear.weight", + "fc_1.bias": "fc_1.linear.bias", + "fc_2.weight": "fc_2.linear.weight", + "fc_2.bias": "fc_2.linear.bias", + "proj.weight": "proj.linear.weight", + "proj.bias": "proj.linear.bias", + } + state_dict = map_old_state_dict_weights(state_dict, mapping, prefix) + super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) + + +class GemmaMLP(LLaMAMLP): + def forward(self, x: torch.Tensor) -> torch.Tensor: + x_fc_1 = self.fc_1(x) + x_fc_2 = self.fc_2(x) + x = torch.nn.functional.gelu(x_fc_1) * x_fc_2 + return self.proj(x) + + +class LLaMAMoE(lit_gpt.model.LLaMAMoE): + def __init__(self, config: Config) -> None: + nn.Module.__init__(self) + self.gate = LoRALinear( + config.n_embd, + config.n_expert, + bias=False, + r=(config.r if config.to_mlp else 0), + lora_alpha=config.alpha, + lora_dropout=config.dropout, + ) + self.experts = nn.ModuleList(LLaMAMLP(config) for _ in range(config.n_expert)) + + self.config = config + + def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None: + """For compatibility with base checkpoints.""" + mapping = {"gate.weight": "gate.linear.weight"} + state_dict = map_old_state_dict_weights(state_dict, mapping, prefix) + super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) + + +def merge_lora_weights(model: GPT) -> None: + """Merge LoRA weights into the full-rank weights to speed up inference.""" + for module in model.modules(): + if isinstance(module, LoRALinear): + module.merge() diff --git a/llm-lora-finetuning/lit_gpt/model.py b/llm-lora-finetuning/lit_gpt/model.py new file mode 100644 index 00000000..ed33664f --- /dev/null +++ b/llm-lora-finetuning/lit_gpt/model.py @@ -0,0 +1,390 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +"""Full definition of a decoder-only transformer-based language model, all of it in this single file. + +Based on the nanoGPT implementation: https://github.com/karpathy/nanoGPT and +https://github.com/EleutherAI/gpt-neox/tree/main/megatron/model. 
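+
+A minimal usage sketch (illustrative only; "pythia-70m" is merely an example of a config
+name that ``Config.from_name`` may know about, and the weights below are randomly
+initialized rather than loaded from a checkpoint):
+
+    import torch
+    from lit_gpt.model import GPT
+
+    model = GPT.from_name("pythia-70m")
+    idx = torch.randint(0, model.config.padded_vocab_size, (1, 8))
+    logits = model(idx)  # (1, 8, padded_vocab_size)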
+""" + +import math +from typing import Any, Optional, Tuple + +import torch +import torch.nn as nn +from typing_extensions import Self + +from lit_gpt.config import Config + + +class GPT(nn.Module): + def __init__(self, config: Config) -> None: + super().__init__() + assert config.padded_vocab_size is not None + self.config = config + + self.lm_head = nn.Linear(config.n_embd, config.padded_vocab_size, bias=config.lm_head_bias) + self.transformer = nn.ModuleDict( + dict( + wte=nn.Embedding(config.padded_vocab_size, config.n_embd), + h=nn.ModuleList(Block(config) for _ in range(config.n_layer)), + ln_f=config.norm_class(config.n_embd, eps=config.norm_eps), + ) + ) + self.max_seq_length = self.config.block_size + self.mask_cache: Optional[torch.Tensor] = None + + @property + def max_seq_length(self) -> int: + return self._max_seq_length + + @max_seq_length.setter + def max_seq_length(self, value: int) -> None: + """ + When doing inference, the sequences used might be shorter than the model's context length. + This allows setting a smaller number to avoid allocating unused memory + """ + if value > self.config.block_size: + raise ValueError(f"Cannot attend to {value}, block size is only {self.config.block_size}") + self._max_seq_length = value + if not hasattr(self, "cos"): + # first call + cos, sin = self.rope_cache() + self.register_buffer("cos", cos, persistent=False) + self.register_buffer("sin", sin, persistent=False) + # override + elif value != self.cos.size(0): + self.cos, self.sin = self.rope_cache(device=self.cos.device) + # the mask and kv cache size will get updated on `set_kv_cache`. we cannot update it here because we don't know + # if the kv cache is expected + + def reset_parameters(self) -> None: + # Trigger resetting the rope-cache + self.cos, self.sin = self.rope_cache() + + def _init_weights(self, module: nn.Module) -> None: + """Meant to be used with `gpt.apply(gpt._init_weights)`.""" + if isinstance(module, nn.Linear): + torch.nn.init.normal_(module.weight, mean=0.0, std=0.02) + if module.bias is not None: + torch.nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + torch.nn.init.normal_(module.weight, mean=0.0, std=0.02) + + def forward(self, idx: torch.Tensor, input_pos: Optional[torch.Tensor] = None) -> torch.Tensor: + T = idx.size(1) + if self.max_seq_length < T: + raise ValueError(f"Cannot forward sequence of length {T}, max seq length is only {self.max_seq_length}.") + + if input_pos is not None: # use the kv cache + cos = self.cos.index_select(0, input_pos) + sin = self.sin.index_select(0, input_pos) + if self.mask_cache is None: + raise TypeError("You need to call `gpt.set_kv_cache()`") + mask = self.mask_cache.index_select(2, input_pos) + else: + cos = self.cos[:T] + sin = self.sin[:T] + mask = None + + x = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd) + if self.config.scale_embeddings: + x = x * (self.config.n_embd**0.5) + + for block in self.transformer.h: + x = block(x, cos, sin, mask, input_pos) + x = self.transformer.ln_f(x) + return self.lm_head(x) # (b, t, vocab_size) + + @classmethod + def from_name(cls, name: str, **kwargs: Any) -> Self: + return cls(Config.from_name(name, **kwargs)) + + def rope_cache(self, device: Optional[torch.device] = None) -> Tuple[torch.Tensor, torch.Tensor]: + return build_rope_cache( + seq_len=self.max_seq_length, + n_elem=self.config.rope_n_elem, + device=device, + condense_ratio=self.config.rope_condense_ratio, + base=self.config.rope_base, + ) + + def set_kv_cache( + self, + 
batch_size: int, + rope_cache_length: Optional[int] = None, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ) -> None: + if rope_cache_length is None: + rope_cache_length = self.cos.size(-1) + max_seq_length = self.max_seq_length + + # initialize the kv cache for all blocks + for block in self.transformer.h: + block.attn.kv_cache = block.attn.build_kv_cache( + batch_size, max_seq_length, rope_cache_length, device, dtype + ) + + if self.mask_cache is None or self.mask_cache.size(3) != max_seq_length: + # passing `attn_mask` to SDPA disables the flash implementation. since we only need the mask + # for the kv-cache support (only during inference), we only create it in that situation + self.mask_cache = build_mask_cache(max_seq_length, device) + + def clear_kv_cache(self) -> None: + self.mask_cache = None + for block in self.transformer.h: + block.attn.kv_cache = None + + +class Block(nn.Module): + def __init__(self, config: Config) -> None: + super().__init__() + self.norm_1 = config.norm_class(config.n_embd, eps=config.norm_eps) + self.attn = CausalSelfAttention(config) + self.norm_2 = None if config.shared_attention_norm else config.norm_class(config.n_embd, eps=config.norm_eps) + self.mlp = config.mlp_class(config) + + self.config = config + + def forward( + self, + x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + mask: Optional[torch.Tensor] = None, + input_pos: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + n_1 = self.norm_1(x) + h = self.attn(n_1, cos, sin, mask, input_pos) + if self.config.parallel_residual: + n_2 = n_1 if self.config.shared_attention_norm else self.norm_2(x) + x = self.mlp(n_2) + h + x + else: + if self.config.shared_attention_norm: + raise NotImplementedError( + "No checkpoint amongst the ones we support uses this configuration" + " (non-parallel residual and shared attention norm)." 
+ ) + x = h + x + x = self.mlp(self.norm_2(x)) + x + return x + + +class CausalSelfAttention(nn.Module): + def __init__(self, config: Config) -> None: + super().__init__() + shape = (config.n_head + 2 * config.n_query_groups) * config.head_size + # key, query, value projections for all heads, but in a batch + self.attn = nn.Linear(config.n_embd, shape, bias=config.bias) + # output projection + # if `head_size` is explicitly specified in the config, `n_emd` might not be equal to `head_size * n_head` + self.proj = nn.Linear(config.head_size * config.n_head, config.n_embd, bias=config.bias) + # disabled by default + self.kv_cache: Optional[KVCache] = None + + self.config = config + + def forward( + self, + x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + mask: Optional[torch.Tensor] = None, + input_pos: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd) + + qkv = self.attn(x) + + # assemble into a number of query groups to support MHA, MQA and GQA together (see `config.n_query_groups`) + q_per_kv = self.config.n_head // self.config.n_query_groups + total_qkv = q_per_kv + 2 # each group has 1+ queries, 1 key, and 1 value + qkv = qkv.view(B, T, self.config.n_query_groups, total_qkv, self.config.head_size) + qkv = qkv.permute(0, 2, 3, 1, 4) # (B, n_query_groups, total_qkv, T, hs) + + # split batched computation into three + q, k, v = qkv.split((q_per_kv, 1, 1), dim=2) + + # maybe repeat k and v if for the non multi-head attention cases + # training: flash attention requires it + # inference: multi-query would require a full kv cache so avoid it to limit its memory usage + if self.config.n_query_groups != self.config.n_head and (input_pos is None or self.config.n_query_groups != 1): + k = k.expand(B, self.config.n_query_groups, q_per_kv, T, self.config.head_size) + v = v.expand(B, self.config.n_query_groups, q_per_kv, T, self.config.head_size) + + q = q.reshape(B, -1, T, self.config.head_size) # (B, nh_q, T, hs) + k = k.reshape(B, -1, T, self.config.head_size) # (B, nh_k, T, hs) + v = v.reshape(B, -1, T, self.config.head_size) # (B, nh_v, T, hs) + + q_roped = apply_rope(q[..., : self.config.rope_n_elem], cos, sin) + k_roped = apply_rope(k[..., : self.config.rope_n_elem], cos, sin) + q = torch.cat((q_roped, q[..., self.config.rope_n_elem :]), dim=-1) + k = torch.cat((k_roped, k[..., self.config.rope_n_elem :]), dim=-1) + + if input_pos is not None: + if not isinstance(self.kv_cache, KVCache): + raise TypeError("You need to call `gpt.set_kv_cache()`") + k, v = self.kv_cache(input_pos, k, v) + + y = self.scaled_dot_product_attention(q, k, v, mask) + + y = y.reshape(B, T, self.config.head_size * self.config.n_head) # re-assemble all head outputs side by side + + # output projection + return self.proj(y) + + def scaled_dot_product_attention( + self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, mask: Optional[torch.Tensor] = None + ) -> torch.Tensor: + scale = 1.0 / math.sqrt(self.config.head_size) + y = torch.nn.functional.scaled_dot_product_attention( + q, k, v, attn_mask=mask, dropout_p=0.0, scale=scale, is_causal=mask is None + ) + return y.transpose(1, 2) + + def build_kv_cache( + self, + batch_size: int, + max_seq_length: int, + rope_cache_length: Optional[int] = None, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ) -> "KVCache": + heads = 1 if self.config.n_query_groups == 1 else self.config.n_head + v_shape = (batch_size, heads, max_seq_length, 
self.config.head_size) + if rope_cache_length is None: + if self.config.rotary_percentage != 1.0: + raise TypeError("Please pass the `rope_cache_length=gpt.cos.size(-1)` value") + k_shape = v_shape + else: + k_shape = ( + batch_size, + heads, + max_seq_length, + rope_cache_length + self.config.head_size - self.config.rope_n_elem, + ) + return KVCache(k_shape, v_shape, device=device, dtype=dtype) + + +class GptNeoxMLP(nn.Module): + def __init__(self, config: Config) -> None: + super().__init__() + self.fc = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias) + self.proj = nn.Linear(config.intermediate_size, config.n_embd, bias=config.bias) + + self.config = config + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.fc(x) + x = torch.nn.functional.gelu(x, approximate=self.config.gelu_approximate) + return self.proj(x) + + +class LLaMAMLP(nn.Module): + def __init__(self, config: Config) -> None: + super().__init__() + self.fc_1 = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias) + self.fc_2 = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias) + self.proj = nn.Linear(config.intermediate_size, config.n_embd, bias=config.bias) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x_fc_1 = self.fc_1(x) + x_fc_2 = self.fc_2(x) + x = torch.nn.functional.silu(x_fc_1) * x_fc_2 + return self.proj(x) + + +class GemmaMLP(LLaMAMLP): + def forward(self, x: torch.Tensor) -> torch.Tensor: + x_fc_1 = self.fc_1(x) + x_fc_2 = self.fc_2(x) + x = torch.nn.functional.gelu(x_fc_1) * x_fc_2 + return self.proj(x) + + +class LLaMAMoE(nn.Module): + def __init__(self, config: Config) -> None: + super().__init__() + self.gate = nn.Linear(config.n_embd, config.n_expert, bias=False) + self.experts = nn.ModuleList(LLaMAMLP(config) for _ in range(config.n_expert)) + + self.config = config + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Derived from: https://github.com/mistralai/mistral-src/blob/b46d6/moe_one_file_ref.py#L203-L219 + See also figure 1 in https://arxiv.org/abs/2211.15841 + """ + B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd) + x = x.view(-1, C) # (B*T, C) + router = self.gate(x) # (B*T, n_expert) + probs, indices = torch.topk(router, self.config.n_expert_per_token) # (B*T, n_expert_per_token) + probs = probs.softmax(dim=1, dtype=torch.float).to(dtype=x.dtype) + masks = indices.unsqueeze(-1) == torch.arange(self.config.n_expert, device=x.device) + masks = masks.permute(2, 0, 1) # (n_expert, B*T, n_expert_per_token) + y = torch.zeros_like(x) # (B*T, C) + for mask, expert in zip(masks, self.experts): + token_idx, expert_idx = torch.where(mask) + y[token_idx] += probs[token_idx, expert_idx, None] * expert(x[token_idx]) + return y.view(B, T, C) + + +def build_rope_cache( + seq_len: int, n_elem: int, device: Optional[torch.device] = None, base: int = 10000, condense_ratio: int = 1 +) -> Tuple[torch.Tensor, torch.Tensor]: + """Enhanced Transformer with Rotary Position Embedding. + + Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/ + transformers/rope/__init__.py. MIT License: + https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license. 
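+
+    A quick shape check (illustrative only): the two returned tensors have shape
+    ``(seq_len, n_elem)`` and are broadcast against the query/key heads by ``apply_rope``
+    defined below:
+
+        cos, sin = build_rope_cache(seq_len=8, n_elem=16)
+        q = torch.randn(1, 4, 8, 16)       # (B, n_head, T, head_size)
+        q_roped = apply_rope(q, cos, sin)  # same shape, with positions 0..7 rotated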
+ """ + # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$ + theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, device=device).float() / n_elem)) + + # Create position indexes `[0, 1, ..., seq_len - 1]` + seq_idx = torch.arange(seq_len, device=device) / condense_ratio + + # Calculate the product of position index and $\theta_i$ + idx_theta = torch.outer(seq_idx, theta).repeat(1, 2) + + return torch.cos(idx_theta), torch.sin(idx_theta) + + +def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor: + head_size = x.size(-1) + x1 = x[..., : head_size // 2] # (B, nh, T, hs/2) + x2 = x[..., head_size // 2 :] # (B, nh, T, hs/2) + rotated = torch.cat((-x2, x1), dim=-1) # (B, nh, T, hs) + roped = (x * cos) + (rotated * sin) + return roped.to(dtype=x.dtype) + + +class KVCache(nn.Module): + def __init__( + self, + k_shape: Tuple[int, int, int, int], + v_shape: Tuple[int, int, int, int], + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ) -> None: + super().__init__() + self.register_buffer("k", torch.zeros(k_shape, device=device, dtype=dtype), persistent=False) + self.register_buffer("v", torch.zeros(v_shape, device=device, dtype=dtype), persistent=False) + + def forward(self, input_pos: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + # move the buffer to the activation dtype for when AMP is used + self.k = self.k.to(k.dtype) + self.v = self.v.to(v.dtype) + # update the cache + k = self.k.index_copy_(2, input_pos, k) + v = self.v.index_copy_(2, input_pos, v) + return k, v + + def reset_parameters(self) -> None: + torch.nn.init.zeros_(self.k) + torch.nn.init.zeros_(self.v) + + +def build_mask_cache(max_seq_length: int, device: Optional[torch.device] = None) -> torch.Tensor: + ones = torch.ones((max_seq_length, max_seq_length), device=device, dtype=torch.bool) + return torch.tril(ones).unsqueeze(0).unsqueeze(0) diff --git a/llm-lora-finetuning/lit_gpt/packed_dataset.py b/llm-lora-finetuning/lit_gpt/packed_dataset.py new file mode 100644 index 00000000..2b5b3d6d --- /dev/null +++ b/llm-lora-finetuning/lit_gpt/packed_dataset.py @@ -0,0 +1,239 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
+
+# Very loosely inspired by indexed_dataset in Fairseq, Megatron
+# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/data/indexed_dataset.py
+
+
+import os
+import random
+import struct
+
+import numpy as np
+import torch
+from torch.utils.data import IterableDataset, get_worker_info
+
+dtypes = {1: np.uint8, 2: np.int8, 3: np.int16, 4: np.int32, 5: np.int64, 6: np.float32, 7: np.float64, 8: np.uint16}
+
+
+def code(dtype):
+    for k in dtypes:
+        if dtypes[k] == dtype:
+            return k
+    raise ValueError(dtype)
+
+
+HDR_MAGIC = b"LITPKDS"
+HDR_SIZE = 24  # bytes
+
+
+class PackedDataset(IterableDataset):
+    def __init__(
+        self, filenames, n_chunks, block_size, seed=12345, shuffle=True, wrap=False, num_processes=1, process_rank=0
+    ):
+        self._filenames = filenames
+        self._n_chunks = n_chunks
+        self._block_size = block_size
+        self._seed = seed
+        self._shuffle = shuffle
+        self._wrap = wrap
+        self._num_processes = num_processes
+        self._process_rank = process_rank
+
+    def __iter__(self):
+        worker_info = get_worker_info()
+        num_workers = worker_info.num_workers if worker_info is not None else 1
+        worker_id = worker_info.id if worker_info is not None else 0
+        num_shards = num_workers * self._num_processes
+        shard_id = self._process_rank * num_workers + worker_id
+
+        max_num_files = len(self._filenames) // num_shards * num_shards
+        filenames = self._filenames[shard_id:max_num_files:num_shards]
+
+        return PackedDatasetIterator(
+            filenames=filenames,
+            n_chunks=self._n_chunks,
+            block_size=self._block_size,
+            seed=self._seed,
+            shuffle=self._shuffle,
+            wrap=self._wrap,
+        )
+
+
+class PackedDatasetBuilder(object):
+    def __init__(self, outdir, prefix, chunk_size, sep_token, dtype="auto", vocab_size=None):
+        if dtype == "auto":
+            if vocab_size is None:
+                raise ValueError("vocab_size cannot be None when dtype='auto'")
+            if vocab_size is not None and vocab_size < 65500:
+                self._dtype = np.uint16
+            else:
+                self._dtype = np.int32
+        else:
+            self._dtype = dtype
+        self._counter = 0
+        self._chunk_size = chunk_size
+        self._outdir = outdir
+        self._prefix = prefix
+        self._sep_token = sep_token
+        self._arr = np.zeros(self._chunk_size, dtype=self._dtype)
+        self._arr.fill(self._sep_token)
+        self._idx = 0
+        self._version = 1
+        self._filenames = []
+
+    def _write_chunk(self):
+        filename = f"{self._prefix}_{self._counter:010d}.bin"
+        filename = os.path.join(self._outdir, filename)
+
+        with open(filename, "wb") as f:
+            f.write(HDR_MAGIC)
+            f.write(struct.pack("<Q", self._version))
+            f.write(struct.pack("<B", code(self._dtype)))
+            f.write(struct.pack("<Q", self._chunk_size))
+            f.write(self._arr.tobytes(order="C"))
+
+        self._filenames.append(filename)
+        self._counter += 1
+        self._arr.fill(self._sep_token)
+        self._idx = 0
+
+    @property
+    def dtype(self):
+        return self._dtype
+
+    @property
+    def filenames(self):
+        return self._filenames.copy()
+
+    def add_array(self, arr):
+        while self._idx + arr.shape[0] > self._chunk_size:
+            part_len = self._chunk_size - self._idx
+            self._arr[self._idx : self._idx + part_len] = arr[:part_len]
+            self._write_chunk()
+            arr = arr[part_len:]
+
+        arr_len = arr.shape[0]
+        self._arr[self._idx : self._idx + arr_len] = arr
+        self._idx += arr_len
+
+    def write_reminder(self):
+        self._write_chunk()
+
+
+class PackedDatasetIterator:
+    def __init__(self, filenames, n_chunks, block_size, seed, shuffle, wrap):
+        self._seed = seed
+        self._shuffle = shuffle
+        self._rng = np.random.default_rng(seed) if shuffle else None
+        self._block_idxs = None
+
+        self._wrap = wrap
+
+        # TODO: instead of filenames, we could have a single text stream
+        # (or text file) with the sequence of all files to be
+        # fetched/loaded.
+        self._filenames = filenames
+        self._file_idx = 0
+
+        self._n_chunks = n_chunks
+
+        self._dtype = None
+        self._block_size = block_size
+        self._n_blocks = None
+
+        self._mmaps = []
+        self._buffers = []
+
+        self._block_idxs = []
+        self._curr_idx = 0
+
+        self._load_n_chunks()
+
+    def _read_header(self, path):
+        with open(path, "rb") as f:
+            magic = f.read(len(HDR_MAGIC))
+            assert magic == HDR_MAGIC, "File doesn't match expected format."
+            version = struct.unpack("<Q", f.read(8))
+            assert version == (1,)
+            (dtype,) = struct.unpack("<B", f.read(1))
+            dtype = dtypes[dtype]
+            (chunk_size,) = struct.unpack("<Q", f.read(8))
+        return dtype, chunk_size
+
+    def _close_mmaps(self):
+        for mmap in self._mmaps:
+            mmap._mmap.close()
+
+    def _load_n_chunks(self):
+        self._close_mmaps()
+        self._mmaps = []
+        self._buffers = []
+
+        if self._n_chunks > len(self._filenames[self._file_idx :]):
+            if not self._wrap:
+                raise StopIteration
+            self._file_idx = 0
+
+        for i in range(self._n_chunks):
+            filename = self._filenames[self._file_idx + i]
+            if self._dtype is None:
+                self._dtype, self._chunk_size = self._read_header(filename)
+                self._n_blocks = self._chunk_size // self._block_size
+            # TODO: check header matches with previous files
+            mmap = np.memmap(filename, mode="r", order="C", offset=HDR_SIZE)
+            self._mmaps.append(mmap)
+            self._buffers.append(memoryview(mmap))
+
+        self._file_idx += self._n_chunks
+        n_all_blocks = self._n_chunks * self._n_blocks
+
+        self._block_idxs = self._rng.permutation(n_all_blocks) if self._shuffle else range(n_all_blocks)
+
+        self._curr_idx = 0
+
+    def __del__(self):
+        self._close_mmaps()
+        del self._mmaps
+        del self._buffers
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        if self._curr_idx >= len(self._block_idxs):
+            self._load_n_chunks()
+            # TODO: trigger fetching next next n_chunks if remote
+        block_idx = self._block_idxs[self._curr_idx]
+        chunk_id = block_idx // self._n_blocks
+        buffer = self._buffers[chunk_id]
+        elem_id = (block_idx % self._n_blocks) * self._block_size
+        offset = np.dtype(self._dtype).itemsize * elem_id
+        arr = np.frombuffer(buffer, dtype=self._dtype, count=self._block_size, offset=offset)
+        self._curr_idx += 1
+        return torch.from_numpy(arr.astype(np.int64))
+
+
+class CombinedDataset(IterableDataset):
+    def __init__(self, datasets, seed, weights=None):
+        self._seed = seed
+        self._datasets = datasets
+        self._weights = weights
+        n_datasets = len(datasets)
+        if weights is None:
+            self._weights = [1 / n_datasets] * n_datasets
+        else:
+            self._weights = [w / sum(weights) for w in weights]
+
+    def __iter__(self):
+        return CombinedDatasetIterator(self._datasets, self._seed, self._weights)
+
+
+class CombinedDatasetIterator:
+    def __init__(self, datasets, seed, weights):
+        self._datasets = [iter(el) for el in datasets]
+        self._weights = weights
+        self._rng = random.Random(seed)
+
+    def __next__(self):
+        (dataset,) = self._rng.choices(self._datasets, weights=self._weights, k=1)
+        return next(dataset)
diff --git a/llm-lora-finetuning/lit_gpt/rmsnorm.py b/llm-lora-finetuning/lit_gpt/rmsnorm.py
new file mode 100644
index 00000000..dcaab677
--- /dev/null
+++ b/llm-lora-finetuning/lit_gpt/rmsnorm.py
@@ -0,0 +1,34 @@
+# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
+
+import torch
+
+
+class RMSNorm(torch.nn.Module):
+    """Root Mean Square Layer Normalization.
+
+    Derived from https://github.com/bzhangGo/rmsnorm/blob/master/rmsnorm_torch.py. BSD 3-Clause License:
+    https://github.com/bzhangGo/rmsnorm/blob/master/LICENSE.
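+
+    In short, and matching ``forward`` below: ``y = x / sqrt(mean(x ** 2) + eps) * weight``,
+    i.e. activations are rescaled by their root mean square instead of being mean-centred
+    as in LayerNorm. A tiny numerical sanity check (illustrative only):
+
+        norm = RMSNorm(size=4)
+        x = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
+        y = norm(x)  # roughly x / sqrt(7.5), since mean(x ** 2) == 7.5 and weight starts at 1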
+ """ + + def __init__(self, size: int, dim: int = -1, eps: float = 1e-6, add_unit_offset: bool = False) -> None: + super().__init__() + self.weight = torch.nn.Parameter(torch.ones(size)) + self.eps = eps + self.dim = dim + self.add_unit_offset = add_unit_offset + + def forward(self, x: torch.Tensor) -> torch.Tensor: + dtype = x.dtype + x = x.float() + # NOTE: the original RMSNorm paper implementation is not equivalent + norm_x = torch.mean(x * x, dim=self.dim, keepdim=True) + x_normed = x * torch.rsqrt(norm_x + self.eps) + x_normed = x_normed.to(dtype=dtype) + if self.add_unit_offset: + # Gemma model requires a unit offset + # https://github.com/google/gemma_pytorch/blob/main/gemma/model.py#L176 + return x_normed * (1 + self.weight) + return x_normed * self.weight + + def reset_parameters(self) -> None: + torch.nn.init.ones_(self.weight) diff --git a/llm-lora-finetuning/lit_gpt/tokenizer.py b/llm-lora-finetuning/lit_gpt/tokenizer.py new file mode 100644 index 00000000..3a6758eb --- /dev/null +++ b/llm-lora-finetuning/lit_gpt/tokenizer.py @@ -0,0 +1,109 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +import json +from pathlib import Path +from typing import Optional, Union + +import torch + + +class Tokenizer: + def __init__(self, checkpoint_dir: Union[Path, str]) -> None: + checkpoint_dir = Path(checkpoint_dir) + if not checkpoint_dir.exists(): + raise NotADirectoryError(f"The checkpoint directory does not exist: {str(checkpoint_dir)}") + + self.use_bos = self.check_if_bos_token_used(checkpoint_dir) + self.bos_id = None + self.eos_id = None + + # some checkpoints have both files, `.model` takes precedence + if (vocabulary_path := checkpoint_dir / "tokenizer.model").is_file(): + from sentencepiece import SentencePieceProcessor + + self.processor = SentencePieceProcessor(model_file=str(vocabulary_path)) + self.backend = "sentencepiece" + self.bos_id = self.processor.bos_id() + self.eos_id = self.processor.eos_id() + + elif (vocabulary_path := checkpoint_dir / "tokenizer.json").is_file(): + from tokenizers import Tokenizer as HFTokenizer + + self.processor = HFTokenizer.from_file(str(vocabulary_path)) + self.backend = "huggingface" + + if (special_tokens_path := checkpoint_dir / "tokenizer_config.json").is_file(): + with open(special_tokens_path) as fp: + config = json.load(fp) + bos_token = config.get("bos_token") + self.bos_id = self.token_to_id(bos_token) if bos_token is not None else None + eos_token = config.get("eos_token") + self.eos_id = self.token_to_id(eos_token) if eos_token is not None else None + if (special_tokens_path := checkpoint_dir / "generation_config.json").is_file(): + with open(special_tokens_path) as fp: + config = json.load(fp) + if self.bos_id is None: + self.bos_id = config.get("bos_token_id") + if self.eos_id is None: + self.eos_id = config.get("eos_token_id") + else: + raise NotImplementedError + + @property + def vocab_size(self) -> int: + if self.backend == "huggingface": + return self.processor.get_vocab_size(with_added_tokens=False) + if self.backend == "sentencepiece": + return self.processor.vocab_size() + raise RuntimeError + + def token_to_id(self, token: str) -> int: + if self.backend == "huggingface": + id_ = self.processor.token_to_id(token) + elif self.backend == "sentencepiece": + id_ = self.processor.piece_to_id(token) + else: + raise RuntimeError + if id_ is None: + raise ValueError(f"token {token!r} not found in the collection.") + return id_ + + def check_if_bos_token_used(self, checkpoint_dir: Path) -> 
bool: + if not (tokenizer_config_path := checkpoint_dir / "tokenizer_config.json").is_file(): + return False + with open(tokenizer_config_path) as fp: + config = json.load(fp) + if any(config.get(check, False) for check in ("add_bos_token", "add_prefix_space")): + return True + # for examples that also use the Llama tokenizer, but do not have or set add_bos_token to True. + # ex: https://huggingface.co/stabilityai/StableBeluga2/blob/main/tokenizer_config.json#L2 + return config.get("add_bos_token") is None and config.get("tokenizer_class") == "LlamaTokenizer" + + def encode( + self, + string: str, + device: Optional[torch.device] = None, + bos: Optional[bool] = None, + eos: bool = False, + max_length: int = -1, + ) -> torch.Tensor: + if self.backend == "huggingface": + tokens = self.processor.encode(string).ids + elif self.backend == "sentencepiece": + tokens = self.processor.encode(string) + else: + raise RuntimeError + if bos or (bos is None and self.use_bos): + bos_id = self.bos_id + if bos_id is None: + raise NotImplementedError("This tokenizer does not have a defined a bos token") + tokens = [bos_id] + tokens + if eos: + tokens = tokens + [self.eos_id] + if max_length > 0: + tokens = tokens[:max_length] + return torch.tensor(tokens, dtype=torch.int, device=device) + + def decode(self, tensor: torch.Tensor) -> str: + tokens = [tensor.item()] if tensor.ndim == 0 else tensor.tolist() + return self.processor.decode(tokens) diff --git a/llm-lora-finetuning/lit_gpt/utils.py b/llm-lora-finetuning/lit_gpt/utils.py new file mode 100644 index 00000000..c9102791 --- /dev/null +++ b/llm-lora-finetuning/lit_gpt/utils.py @@ -0,0 +1,379 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +"""Utility functions for training and inference.""" + +import math +import pickle +import sys +from io import BytesIO +from pathlib import Path +from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, TypeVar, Union + +import lightning as L +import torch +import torch.nn as nn +import torch.utils._device +from lightning.fabric.strategies import FSDPStrategy +from lightning.fabric.utilities.load import _lazy_load as lazy_load +from torch.serialization import normalize_storage_type +from typing_extensions import Self + +if TYPE_CHECKING: + from lit_gpt import GPT + + +def find_multiple(n: int, k: int) -> int: + assert k > 0 + if n % k == 0: + return n + return n + k - (n % k) + + +def num_parameters(module: nn.Module, requires_grad: Optional[bool] = None) -> int: + total = 0 + for p in module.parameters(): + if requires_grad is None or p.requires_grad == requires_grad: + if hasattr(p, "quant_state"): + # bitsandbytes 4bit layer support + total += math.prod(p.quant_state[1]) + else: + total += p.numel() + return total + + +def check_valid_checkpoint_dir(checkpoint_dir: Path) -> None: + files = { + "lit_model.pth": (checkpoint_dir / "lit_model.pth").is_file(), + "lit_config.json": (checkpoint_dir / "lit_config.json").is_file(), + "tokenizer.json OR tokenizer.model": (checkpoint_dir / "tokenizer.json").is_file() + or (checkpoint_dir / "tokenizer.model").is_file(), + "tokenizer_config.json": (checkpoint_dir / "tokenizer_config.json").is_file(), + } + if checkpoint_dir.is_dir(): + if all(files.values()): + # we're good + return + problem = f" is missing the files: {[f for f, exists in files.items() if not exists]!r}" + else: + problem = " is not a checkpoint directory" + + # list locally available checkpoints + available = list(Path("checkpoints").glob("*/*")) + 
if available: + options = "\n --checkpoint_dir ".join([""] + [repr(str(p.resolve())) for p in available]) + extra = f"\nYou have downloaded locally:{options}\n" + else: + extra = "" + + error_message = ( + f"--checkpoint_dir {str(checkpoint_dir.absolute())!r}{problem}." + "\nFind download instructions at https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials\n" + f"{extra}\nSee all download options by running:\n python scripts/download.py" + ) + print(error_message, file=sys.stderr) + raise SystemExit(1) + + +class SavingProxyForStorage: + def __init__(self, obj, saver, protocol_version=5): + self.protocol_version = protocol_version + self.saver = saver + if not (isinstance(obj, torch.storage.TypedStorage) or torch.is_storage(obj)): + raise TypeError(f"expected storage, not {type(obj)}") + + # this logic is taken from PyTorch 2.0+ torch/serialization.py + if isinstance(obj, torch.storage.TypedStorage): + # PT upstream wants to deprecate this eventually... + storage = obj._untyped_storage + storage_type_str = obj._pickle_storage_type() + storage_type = getattr(torch, storage_type_str) + storage_numel = obj._size() + else: + storage = obj + storage_type = normalize_storage_type(type(obj)) + storage_numel = storage.nbytes() + + storage_key = saver._write_storage_and_return_key(storage) + location = torch.serialization.location_tag(storage) + + self.storage_info = ("storage", storage_type, storage_key, location, storage_numel) + + def __reduce_ex__(self, protocol_version): + assert False, "this should be handled with out of band" + + +class SavingProxyForTensor: + def __init__(self, tensor, saver, protocol_version=5): + self.protocol_version = protocol_version + self.reduce_ret_fn, reduce_args = tensor.__reduce_ex__(protocol_version) + if reduce_args[0] == torch._utils._rebuild_tensor_v2: + # for Tensors with Python attributes + (a0, a1, (storage, *a2_other), *other_reduce_args) = reduce_args + assert isinstance(storage, torch.storage.TypedStorage), "Please check for updates" + storage_proxy = SavingProxyForStorage(storage, saver, protocol_version=protocol_version) + self.reduce_args = (a0, a1, (storage_proxy, *a2_other), *other_reduce_args) + else: + (storage, *other_reduce_args) = reduce_args + assert isinstance(storage, torch.storage.TypedStorage), "Please check for updates" + storage_proxy = SavingProxyForStorage(storage, saver, protocol_version=protocol_version) + self.reduce_args = (storage_proxy, *other_reduce_args) + + def __reduce_ex__(self, protocol_version): + if protocol_version != self.protocol_version: + raise RuntimeError(f"Unexpected protocol version: expected {self.protocol_version}, got {protocol_version}") + return self.reduce_ret_fn, self.reduce_args + + +class IncrementalPyTorchPickler(pickle.Pickler): + def __init__(self, saver, *args, **kwargs): + super().__init__(*args, **kwargs) + self.storage_dtypes = {} + self.saver = saver + self.id_map = {} + + # this logic is taken from PyTorch 2.0+ torch/serialization.py + def persistent_id(self, obj): + # FIXME: the docs say that persistent_id should only return a string + # but torch store returns tuples. 
This works only in the binary protocol + # see + # https://docs.python.org/2/library/pickle.html#pickling-and-unpickling-external-objects + # https://github.com/python/cpython/blob/master/Lib/pickle.py#L527-L537 + if isinstance(obj, SavingProxyForStorage): + return obj.storage_info + + if isinstance(obj, torch.storage.TypedStorage) or torch.is_storage(obj): + if isinstance(obj, torch.storage.TypedStorage): + # TODO: Once we decide to break serialization FC, this case + # can be deleted + storage = obj._untyped_storage + storage_dtype = obj.dtype + storage_type_str = obj._pickle_storage_type() + storage_type = getattr(torch, storage_type_str) + storage_numel = obj._size() + + else: + storage = obj + storage_dtype = torch.uint8 + storage_type = normalize_storage_type(type(obj)) + storage_numel = storage.nbytes() + + # If storage is allocated, ensure that any other saved storages + # pointing to the same data all have the same dtype. If storage is + # not allocated, don't perform this check + if storage.data_ptr() != 0: + if storage.data_ptr() in self.storage_dtypes: + if storage_dtype != self.storage_dtypes[storage.data_ptr()]: + raise RuntimeError( + "Cannot save multiple tensors or storages that view the same data as different types" + ) + else: + self.storage_dtypes[storage.data_ptr()] = storage_dtype + + storage_key = self.id_map.get(storage._cdata) + if storage_key is None: + storage_key = self.saver._write_storage_and_return_key(storage) + self.id_map[storage._cdata] = storage_key + location = torch.serialization.location_tag(storage) + + return ("storage", storage_type, storage_key, location, storage_numel) + + return None + + +class incremental_save: + def __init__(self, name): + self.name = name + self.zipfile = torch._C.PyTorchFileWriter(str(name)) + self.has_saved = False + self.next_key = 0 + + def __enter__(self): + return self + + def store_early(self, tensor): + if isinstance(tensor, torch.Tensor): + return SavingProxyForTensor(tensor, self) + raise TypeError(f"can only store tensors early, not {type(tensor)}") + + def save(self, obj): + if self.has_saved: + raise RuntimeError("have already saved") + # Write the pickle data for `obj` + data_buf = BytesIO() + pickler = IncrementalPyTorchPickler(self, data_buf, protocol=5) + pickler.dump(obj) + data_value = data_buf.getvalue() + self.zipfile.write_record("data.pkl", data_value, len(data_value)) + self.has_saved = True + + def _write_storage_and_return_key(self, storage): + if self.has_saved: + raise RuntimeError("have already saved") + key = self.next_key + self.next_key += 1 + name = f"data/{key}" + if storage.device.type != "cpu": + storage = storage.cpu() + num_bytes = storage.nbytes() + self.zipfile.write_record(name, storage.data_ptr(), num_bytes) + return key + + def __exit__(self, type, value, traceback): + self.zipfile.write_end_of_file() + + +T = TypeVar("T") + + +def chunked_cross_entropy( + logits: Union[torch.Tensor, List[torch.Tensor]], + targets: torch.Tensor, + chunk_size: int = 128, + ignore_index: int = -1, +) -> torch.Tensor: + # with large max_sequence_lengths, the beginning of `backward` allocates a large memory chunk which can dominate + # the memory usage in fine-tuning settings with low number of parameters. 
+ # as a workaround hack, the cross entropy computation is chunked to force it to deallocate on the go, reducing + # the memory spike's magnitude + + # lm_head was chunked (we are fine-tuning) + if isinstance(logits, list): + # don't want to chunk cross entropy + if chunk_size == 0: + logits = torch.cat(logits, dim=1) + logits = logits.reshape(-1, logits.size(-1)) + targets = targets.reshape(-1) + return torch.nn.functional.cross_entropy(logits, targets, ignore_index=ignore_index) + + # chunk cross entropy + logit_chunks = [logit_chunk.reshape(-1, logit_chunk.size(-1)) for logit_chunk in logits] + target_chunks = [target_chunk.reshape(-1) for target_chunk in targets.split(logits[0].size(1), dim=1)] + loss_chunks = [ + torch.nn.functional.cross_entropy(logit_chunk, target_chunk, ignore_index=ignore_index, reduction="none") + for logit_chunk, target_chunk in zip(logit_chunks, target_chunks) + ] + non_masked_elems = (targets != ignore_index).sum() + return torch.cat(loss_chunks).sum() / max(1, non_masked_elems) + + # no chunking at all + logits = logits.reshape(-1, logits.size(-1)) + targets = targets.reshape(-1) + if chunk_size == 0: + return torch.nn.functional.cross_entropy(logits, targets, ignore_index=ignore_index) + + # lm_head wasn't chunked, chunk cross entropy + logit_chunks = logits.split(chunk_size) + target_chunks = targets.split(chunk_size) + loss_chunks = [ + torch.nn.functional.cross_entropy(logit_chunk, target_chunk, ignore_index=ignore_index, reduction="none") + for logit_chunk, target_chunk in zip(logit_chunks, target_chunks) + ] + non_masked_elems = (targets != ignore_index).sum() + return torch.cat(loss_chunks).sum() / max(1, non_masked_elems) + + +def map_old_state_dict_weights(state_dict: Dict, mapping: Mapping, prefix: str) -> Dict: + for checkpoint_name, attribute_name in mapping.items(): + full_checkpoint_name = prefix + checkpoint_name + if full_checkpoint_name in state_dict: + full_attribute_name = prefix + attribute_name + state_dict[full_attribute_name] = state_dict.pop(full_checkpoint_name) + return state_dict + + +def get_default_supported_precision(training: bool) -> str: + """Return default precision that is supported by the hardware: either `bf16` or `16`. 
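+
+    For example (the exact string depends on the hardware this runs on):
+
+        >>> get_default_supported_precision(training=True)   # e.g. on a bf16-capable GPU
+        'bf16-mixed'
+        >>> get_default_supported_precision(training=False)  # e.g. on an older GPU without bf16
+        '16-true'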
+ + Args: + training: `-mixed` or `-true` version of the precision to use + + Returns: + default precision that is suitable for the task and is supported by the hardware + """ + from lightning.fabric.accelerators import MPSAccelerator + + if MPSAccelerator.is_available() or (torch.cuda.is_available() and not torch.cuda.is_bf16_supported()): + return "16-mixed" if training else "16-true" + return "bf16-mixed" if training else "bf16-true" + + +def load_checkpoint(fabric: L.Fabric, model: nn.Module, checkpoint_path: Path, strict: bool = True) -> None: + if isinstance(fabric.strategy, FSDPStrategy): + fabric.load_raw(checkpoint_path, model, strict=strict) + else: + state_dict = lazy_load(checkpoint_path) + state_dict = state_dict.get("model", state_dict) + model.load_state_dict(state_dict, strict=strict) + + +def flops_per_param(max_seq_length: int, n_layer: int, n_embd: int, n_params: int) -> int: + flops_per_token = 2 * n_params # each parameter is used for a MAC (2 FLOPS) per network operation + # this assumes that all samples have a fixed length equal to the block size + # which is most likely false during finetuning + flops_per_seq = flops_per_token * max_seq_length + attn_flops_per_seq = n_layer * 2 * 2 * (n_embd * (max_seq_length**2)) + return flops_per_seq + attn_flops_per_seq + + +def estimate_flops(model: "GPT", training: bool) -> int: + """Measures estimated FLOPs for MFU. + + Refs: + * https://ar5iv.labs.arxiv.org/html/2205.05198#A1 + * https://ar5iv.labs.arxiv.org/html/2204.02311#A2 + """ + # using all parameters for this is a naive over estimation because not all model parameters actually contribute to + # this FLOP computation (e.g. embedding, norm). For this reason, the result will be higher by a fixed percentage + # (~10%) compared to the measured FLOPs, making those lower but more realistic. + # For a proper estimate, this needs a more fine-grained calculation as in Appendix A of the paper. + n_trainable_params = num_parameters(model, requires_grad=True) + trainable_flops = flops_per_param( + model.max_seq_length, model.config.n_layer, model.config.n_embd, n_trainable_params + ) + # forward + backward + gradients (assumes no gradient accumulation) + ops_per_step = 3 if training else 1 + n_frozen_params = num_parameters(model, requires_grad=False) + frozen_flops = flops_per_param(model.max_seq_length, model.config.n_layer, model.config.n_embd, n_frozen_params) + # forward + backward + frozen_ops_per_step = 2 if training else 1 + return ops_per_step * trainable_flops + frozen_ops_per_step * frozen_flops + + +class CycleIterator: + """An iterator that cycles through an iterable indefinitely. + + Example: + >>> iterator = CycleIterator([1, 2, 3]) + >>> [next(iterator) for _ in range(5)] + [1, 2, 3, 1, 2] + + Note: + Unlike ``itertools.cycle``, this iterator does not cache the values of the iterable. 
+ """ + + def __init__(self, iterable: Iterable) -> None: + self.iterable = iterable + self.epoch = 0 + self._iterator = None + + def __next__(self) -> Any: + if self._iterator is None: + self._iterator = iter(self.iterable) + try: + return next(self._iterator) + except StopIteration: + self._iterator = iter(self.iterable) + self.epoch += 1 + return next(self._iterator) + + def __iter__(self) -> Self: + return self + + +def CLI(*args: Any, **kwargs: Any) -> Any: + from jsonargparse import CLI, set_docstring_parse_options + + set_docstring_parse_options(attribute_docstrings=True) + + kwargs.setdefault("as_positional", False) + return CLI(*args, **kwargs) diff --git a/llm-lora-finetuning/pipelines/__init__.py b/llm-lora-finetuning/pipelines/__init__.py new file mode 100644 index 00000000..757bd841 --- /dev/null +++ b/llm-lora-finetuning/pipelines/__init__.py @@ -0,0 +1,16 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/llm-lora-finetuning/pipelines/finetuning.py b/llm-lora-finetuning/pipelines/finetuning.py new file mode 100644 index 00000000..467d2f63 --- /dev/null +++ b/llm-lora-finetuning/pipelines/finetuning.py @@ -0,0 +1,25 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from zenml import pipeline +from steps.finetune import finetune_lora +from steps.merge import merge +from zenml.config import DockerSettings + +@pipeline(settings={"docker": DockerSettings(requirements="requirements.txt")}) +def finetuning_pipeline(repo_id: str = "mistralai/Mistral-7B-Instruct-v0.1") -> None: + checkpoint_dir, output_path = finetune_lora(repo_id=repo_id) diff --git a/llm-lora-finetuning/pipelines/merge.py b/llm-lora-finetuning/pipelines/merge.py new file mode 100644 index 00000000..a5389a6d --- /dev/null +++ b/llm-lora-finetuning/pipelines/merge.py @@ -0,0 +1,23 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +from zenml import pipeline +from steps.merge import merge + +@pipeline +def merge_pipeline() -> None: + merge() \ No newline at end of file diff --git a/llm-lora-finetuning/requirements.txt b/llm-lora-finetuning/requirements.txt new file mode 100644 index 00000000..68d82b31 --- /dev/null +++ b/llm-lora-finetuning/requirements.txt @@ -0,0 +1,19 @@ +zenml[server]>=0.55.0 +torch>=2.2.0 +lightning @ git+https://github.com/Lightning-AI/lightning@ed367ca675861cdf40dbad2e4d66f7eee2ec50af +jsonargparse[signatures] # CLI +bitsandbytes==0.41.0 # quantization +scipy # required by bitsandbytes +sentencepiece # llama-based models +tokenizers # pythia, falcon, redpajama +datasets # eval +requests # scripts/prepare_* +zstandard # scripts/prepare_redpajama.py, scripts/prepare_starcoder.py +pandas # scripts/prepare_csv.py, scripts/prepare_starcoder.py +pyarrow # scripts/prepare_starcoder.py +# tensorboard # pretrain/tinyllama.py +# torchmetrics # pretrain/tinyllama.py +# eval +git+https://github.com/EleutherAI/lm-evaluation-harness.git@115206dc89dad67b8beaa90051fb52db77f0a529 +# scripts/prepare_slimpajama.py, scripts/prepare_starcoder.py, pretrain/tinyllama.py +lightning[data] @ git+https://github.com/Lightning-AI/lightning@ed367ca675861cdf40dbad2e4d66f7eee2ec50af diff --git a/llm-lora-finetuning/run.py b/llm-lora-finetuning/run.py new file mode 100644 index 00000000..cbaf351b --- /dev/null +++ b/llm-lora-finetuning/run.py @@ -0,0 +1,114 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Optional +import click +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@click.command( + help=""" +ZenML LLM Lora Finetuning project. 
+ +Examples: + + \b + # Run the feature data preparation pipeline + python run.py --data-pipeline + + \b + # Run the finetuning pipeline + python run.py --finetuning-pipeline + + \b + # Run the merging pipeline + python run.py --merging-pipeline + + \b + # Run the evaluation pipeline + python run.py --eval-pipeline + + \b + # Run the deployment pipeline + python run.py --deployment-pipeline +""" +) +@click.option( + "--config", + type=str, + default=None, + help="Path to the YAML config file.", +) +@click.option( + "--feature-pipeline", + is_flag=True, + default=False, + help="Whether to run the pipeline that creates the dataset.", +) +@click.option( + "--finetuning-pipeline", + is_flag=True, + default=False, + help="Whether to run the pipeline that finetunes the model.", +) +@click.option( + "--merging-pipeline", + is_flag=True, + default=False, + help="Whether to run the pipeline that merges the model and adapter.", +) +@click.option( + "--eval-pipeline", + is_flag=True, + default=False, + help="Whether to run the pipeline that evaluates the model.", +) +@click.option( + "--deployment-pipeline", + is_flag=True, + default=False, + help="Whether to run the pipeline that deploys the model.", +) +@click.option( + "--no-cache", + is_flag=True, + default=False, + help="Disable caching for the pipeline run.", +) +def main( + config: Optional[str] = None, + feature_pipeline: bool = False, + finetuning_pipeline: bool = False, + merging_pipeline: bool = False, + eval_pipeline: bool = False, + deployment_pipeline: bool = False, + no_cache: bool = False, +): + """Main entry point for the pipeline execution. + + Args: + no_cache: If `True` cache will be disabled. + """ + if feature_pipeline: + from pipelines.feature_engineering import feature_engineering_pipeline + + feature_engineering_pipeline() + +if __name__ == "__main__": + main() diff --git a/llm-lora-finetuning/scripts/convert_hf_checkpoint.py b/llm-lora-finetuning/scripts/convert_hf_checkpoint.py new file mode 100644 index 00000000..3839a879 --- /dev/null +++ b/llm-lora-finetuning/scripts/convert_hf_checkpoint.py @@ -0,0 +1,356 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
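+
+# Example usage (illustrative sketch, not part of the upstream script): convert a Hugging Face
+# checkpoint downloaded with `scripts/download.py` into the lit_gpt format used by the finetuning
+# scripts. The checkpoint directory below is an assumption and must point at an existing download.
+#
+#   python scripts/convert_hf_checkpoint.py \
+#       --checkpoint_dir checkpoints/mistralai/Mistral-7B-Instruct-v0.1 \
+#       --dtype bfloat16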
+ +import gc +import json +import sys +from collections import defaultdict +from dataclasses import asdict +from functools import partial +from pathlib import Path +from typing import Dict, List, Optional, Tuple, Union + +import torch +from lightning.fabric.utilities.load import _NotYetLoadedTensor as NotYetLoadedTensor + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +from lit_gpt import Config +from lit_gpt.utils import incremental_save, lazy_load + + +def copy_weights_gpt_neox( + state_dict: Dict[str, torch.Tensor], + hf_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]], + saver: Optional[incremental_save] = None, + dtype: Optional[torch.dtype] = None, +) -> None: + weight_map = { + "gpt_neox.embed_in.weight": "transformer.wte.weight", + "gpt_neox.layers.{}.input_layernorm.bias": "transformer.h.{}.norm_1.bias", + "gpt_neox.layers.{}.input_layernorm.weight": "transformer.h.{}.norm_1.weight", + "gpt_neox.layers.{}.attention.query_key_value.bias": "transformer.h.{}.attn.attn.bias", + "gpt_neox.layers.{}.attention.query_key_value.weight": "transformer.h.{}.attn.attn.weight", + "gpt_neox.layers.{}.attention.dense.bias": "transformer.h.{}.attn.proj.bias", + "gpt_neox.layers.{}.attention.dense.weight": "transformer.h.{}.attn.proj.weight", + "gpt_neox.layers.{}.attention.rotary_emb.inv_freq": None, + "gpt_neox.layers.{}.attention.bias": None, + "gpt_neox.layers.{}.attention.masked_bias": None, + "gpt_neox.layers.{}.post_attention_layernorm.bias": "transformer.h.{}.norm_2.bias", + "gpt_neox.layers.{}.post_attention_layernorm.weight": "transformer.h.{}.norm_2.weight", + "gpt_neox.layers.{}.mlp.dense_h_to_4h.bias": "transformer.h.{}.mlp.fc.bias", + "gpt_neox.layers.{}.mlp.dense_h_to_4h.weight": "transformer.h.{}.mlp.fc.weight", + "gpt_neox.layers.{}.mlp.dense_4h_to_h.bias": "transformer.h.{}.mlp.proj.bias", + "gpt_neox.layers.{}.mlp.dense_4h_to_h.weight": "transformer.h.{}.mlp.proj.weight", + "gpt_neox.final_layer_norm.bias": "transformer.ln_f.bias", + "gpt_neox.final_layer_norm.weight": "transformer.ln_f.weight", + "embed_out.weight": "lm_head.weight", + } + + for name, param in hf_weights.items(): + if "gpt_neox.layers" in name: + from_name, number = layer_template(name, 2) + to_name = weight_map[from_name] + if to_name is None: + continue + to_name = to_name.format(number) + else: + to_name = weight_map[name] + param = load_param(param, name, dtype) + if saver is not None: + param = saver.store_early(param) + state_dict[to_name] = param + + +def copy_weights_falcon( + model_name: str, + state_dict: Dict[str, torch.Tensor], + hf_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]], + saver: Optional[incremental_save] = None, + dtype: Optional[torch.dtype] = None, +) -> None: + weight_map = { + "transformer.word_embeddings.weight": "transformer.wte.weight", + "transformer.h.{}.self_attention.query_key_value.weight": "transformer.h.{}.attn.attn.weight", + "transformer.h.{}.self_attention.dense.weight": "transformer.h.{}.attn.proj.weight", + "transformer.h.{}.mlp.dense_h_to_4h.weight": "transformer.h.{}.mlp.fc.weight", + "transformer.h.{}.mlp.dense_4h_to_h.weight": "transformer.h.{}.mlp.proj.weight", + "transformer.ln_f.bias": "transformer.ln_f.bias", + "transformer.ln_f.weight": "transformer.ln_f.weight", + "lm_head.weight": "lm_head.weight", + } + # the original model definition is different for each size + if "7b" in model_name: + weight_map.update( + { + "transformer.h.{}.input_layernorm.bias": 
"transformer.h.{}.norm_1.bias", + "transformer.h.{}.input_layernorm.weight": "transformer.h.{}.norm_1.weight", + } + ) + elif "40b" in model_name or "180B" in model_name: + weight_map.update( + { + "transformer.h.{}.ln_attn.bias": "transformer.h.{}.norm_1.bias", + "transformer.h.{}.ln_attn.weight": "transformer.h.{}.norm_1.weight", + "transformer.h.{}.ln_mlp.bias": "transformer.h.{}.norm_2.bias", + "transformer.h.{}.ln_mlp.weight": "transformer.h.{}.norm_2.weight", + } + ) + else: + raise NotImplementedError + + for name, param in hf_weights.items(): + if "transformer.h" in name: + from_name, number = layer_template(name, 2) + to_name = weight_map[from_name].format(number) + else: + to_name = weight_map[name] + param = load_param(param, name, dtype) + if saver is not None: + param = saver.store_early(param) + state_dict[to_name] = param + + +def copy_weights_hf_llama( + config: Config, + qkv_weights: Dict[int, List[Optional[NotYetLoadedTensor]]], + state_dict: Dict[str, torch.Tensor], + hf_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]], + saver: Optional[incremental_save] = None, + dtype: Optional[torch.dtype] = None, +) -> None: + weight_map = { + "model.embed_tokens.weight": "transformer.wte.weight", + "model.layers.{}.input_layernorm.weight": "transformer.h.{l}.norm_1.weight", + "model.layers.{}.input_layernorm.bias": "transformer.h.{l}.norm_1.bias", + "model.layers.{}.self_attn.q_proj.weight": None, + "model.layers.{}.self_attn.k_proj.weight": None, + "model.layers.{}.self_attn.v_proj.weight": None, + "model.layers.{}.self_attn.o_proj.weight": "transformer.h.{l}.attn.proj.weight", + "model.layers.{}.self_attn.rotary_emb.inv_freq": None, + "model.layers.{}.post_attention_layernorm.weight": "transformer.h.{l}.norm_2.weight", + "model.layers.{}.post_attention_layernorm.bias": "transformer.h.{l}.norm_2.bias", + "model.norm.weight": "transformer.ln_f.weight", + "model.norm.bias": "transformer.ln_f.bias", + "lm_head.weight": "lm_head.weight", + } + if config._mlp_class == "LLaMAMoE": + weight_map.update( + { + "model.layers.{}.block_sparse_moe.gate.weight": "transformer.h.{l}.mlp.gate.weight", + "model.layers.{}.block_sparse_moe.experts.{}.w1.weight": "transformer.h.{l}.mlp.experts.{e}.fc_1.weight", + "model.layers.{}.block_sparse_moe.experts.{}.w3.weight": "transformer.h.{l}.mlp.experts.{e}.fc_2.weight", + "model.layers.{}.block_sparse_moe.experts.{}.w2.weight": "transformer.h.{l}.mlp.experts.{e}.proj.weight", + } + ) + elif config._mlp_class in ("LLaMAMLP", "GemmaMLP"): + weight_map.update( + { + "model.layers.{}.mlp.gate_proj.weight": "transformer.h.{l}.mlp.fc_1.weight", + "model.layers.{}.mlp.up_proj.weight": "transformer.h.{l}.mlp.fc_2.weight", + "model.layers.{}.mlp.down_proj.weight": "transformer.h.{l}.mlp.proj.weight", + } + ) + else: + raise NotImplementedError + + for name, param in hf_weights.items(): + if "model.layers" in name: + from_name, l = layer_template(name, 2) + e = None + if "block_sparse_moe.experts" in name: + from_name, e = layer_template(from_name, 5) + qkv = qkv_weights.setdefault(l, [None, None, None]) + if "q_proj" in name: + qkv[0] = param + elif "k_proj" in name: + qkv[1] = param + elif "v_proj" in name: + qkv[2] = param + to_name = weight_map[from_name] + if to_name is None: + continue + to_name = to_name.format(l=l, e=e) + else: + to_name = weight_map[name] + param = load_param(param, name, dtype) + if saver is not None: + param = saver.store_early(param) + state_dict[to_name] = param + + if "lm_head.weight" not in state_dict: + 
state_dict["lm_head.weight"] = state_dict["transformer.wte.weight"] + + # convert separate q, k, v matrices into an interleaved qkv + for i, (q, k, v) in list(qkv_weights.items()): + if q is None or k is None or v is None: + # split across different .bin files + continue + q = load_param(q, f"layer {i} q", dtype) + k = load_param(k, f"layer {i} k", dtype) + v = load_param(v, f"layer {i} v", dtype) + q_per_kv = config.n_head // config.n_query_groups + qs = torch.split(q, config.head_size * q_per_kv) + ks = torch.split(k, config.head_size) + vs = torch.split(v, config.head_size) + cycled = [t for group in zip(qs, ks, vs) for t in group] + qkv = torch.cat(cycled) + state_dict[f"transformer.h.{i}.attn.attn.weight"] = qkv + del qkv_weights[i] + + +def copy_weights_phi( + config: Config, + qkv_weights: dict, + state_dict: Dict[str, torch.Tensor], + hf_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]], + saver: Optional[incremental_save] = None, + dtype: Optional[torch.dtype] = None, +) -> None: + if any(layer_name.startswith(("layers.", "transformer.")) for layer_name in hf_weights): + raise ValueError( + "You are using an outdated Phi checkpoint. Please reload it as described in 'tutorials/download_phi.md'" + ) + + weight_map = { + "model.embed_tokens.weight": "transformer.wte.weight", + "model.layers.{}.input_layernorm.weight": "transformer.h.{}.norm_1.weight", + "model.layers.{}.input_layernorm.bias": "transformer.h.{}.norm_1.bias", + "model.layers.{}.self_attn.q_proj.weight": None, + "model.layers.{}.self_attn.q_proj.bias": None, + "model.layers.{}.self_attn.k_proj.weight": None, + "model.layers.{}.self_attn.k_proj.bias": None, + "model.layers.{}.self_attn.v_proj.weight": None, + "model.layers.{}.self_attn.v_proj.bias": None, + "model.layers.{}.self_attn.dense.weight": "transformer.h.{}.attn.proj.weight", + "model.layers.{}.self_attn.dense.bias": "transformer.h.{}.attn.proj.bias", + "model.layers.{}.mlp.fc1.weight": "transformer.h.{}.mlp.fc.weight", + "model.layers.{}.mlp.fc1.bias": "transformer.h.{}.mlp.fc.bias", + "model.layers.{}.mlp.fc2.weight": "transformer.h.{}.mlp.proj.weight", + "model.layers.{}.mlp.fc2.bias": "transformer.h.{}.mlp.proj.bias", + "model.final_layernorm.weight": "transformer.ln_f.weight", + "model.final_layernorm.bias": "transformer.ln_f.bias", + "lm_head.weight": "lm_head.weight", + "lm_head.bias": "lm_head.bias", + } + + for name, param in hf_weights.items(): + if name.startswith("model.layers."): + from_name, l = layer_template(name, 2) + qkv = qkv_weights.setdefault(l, defaultdict(dict)) + if any(w in from_name for w in ("q_proj", "k_proj", "v_proj")): + weight_name, weight_type = from_name.split(".")[-2:] + qkv[weight_type][weight_name] = param + to_name = weight_map[from_name] + if to_name is None: + continue + to_name = to_name.format(l) + else: + to_name = weight_map[name] + param = load_param(param, name, dtype) + if saver is not None: + param = saver.store_early(param) + state_dict[to_name] = param + + for i in list(qkv_weights): + for weight_type in list(qkv_weights[i]): + qkv = qkv_weights[i][weight_type] + if len(qkv) != 3: + # split across different .bin files + continue + q = load_param(qkv["q_proj"], f"layer {i} q {weight_type}", dtype) + k = load_param(qkv["k_proj"], f"layer {i} k {weight_type}", dtype) + v = load_param(qkv["v_proj"], f"layer {i} v {weight_type}", dtype) + q_per_kv = config.n_head // config.n_query_groups + qs = torch.split(q, config.head_size * q_per_kv) + ks = torch.split(k, config.head_size) + vs = torch.split(v, 
config.head_size) + cycled = [t for group in zip(qs, ks, vs) for t in group] + qkv = torch.cat(cycled) + state_dict[f"transformer.h.{i}.attn.attn.{weight_type}"] = qkv + del qkv_weights[i][weight_type] + + +def layer_template(layer_name: str, idx: int) -> Tuple[str, int]: + split = layer_name.split(".") + number = int(split[idx]) + split[idx] = "{}" + from_name = ".".join(split) + return from_name, number + + +def load_param(param: Union[torch.Tensor, NotYetLoadedTensor], name: str, dtype: Optional[torch.dtype]) -> torch.Tensor: + if hasattr(param, "_load_tensor"): + # support tensors loaded via `lazy_load()` + print(f"Loading {name!r} into RAM") + param = param._load_tensor() + if dtype is not None and type(dtype) is not NotYetLoadedTensor and dtype != param.dtype: + print(f"Converting {name!r} from {param.dtype} to {dtype}") + param = param.to(dtype) + return param + + +@torch.inference_mode() +def convert_hf_checkpoint( + *, + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + model_name: Optional[str] = None, + dtype: Optional[str] = None, +) -> None: + if model_name is None: + model_name = checkpoint_dir.name + if dtype is not None: + dtype = getattr(torch, dtype) + + config = Config.from_name(model_name) + config_dict = asdict(config) + print(f"Model config {config_dict}") + with open(checkpoint_dir / "lit_config.json", "w") as json_config: + json.dump(config_dict, json_config) + + if "falcon" in model_name: + copy_fn = partial(copy_weights_falcon, model_name) + elif config._mlp_class in ("LLaMAMLP", "GemmaMLP", "LLaMAMoE"): + # holder to reconstitute the split q, k, v + qkv_weights = {} + copy_fn = partial(copy_weights_hf_llama, config, qkv_weights) + elif "phi" in model_name: + # holder to reconstitute the split q, k, v + qkv_weights = {} + copy_fn = partial(copy_weights_phi, config, qkv_weights) + else: + copy_fn = copy_weights_gpt_neox + + # initialize a new empty state dict to hold our new weights + sd = {} + + # Load the json file containing weight mapping + pytorch_bin_map_json_path = checkpoint_dir / "pytorch_model.bin.index.json" + if pytorch_bin_map_json_path.is_file(): # not all checkpoints have this file + with open(pytorch_bin_map_json_path) as json_map: + bin_index = json.load(json_map) + bin_files = {checkpoint_dir / bin for bin in bin_index["weight_map"].values()} + else: + bin_files = set(checkpoint_dir.glob("*.bin")) + # some checkpoints serialize the training arguments + bin_files = {f for f in bin_files if f.name != "training_args.bin"} + if not bin_files: + raise ValueError(f"Expected {str(checkpoint_dir)!r} to contain .bin files") + + with incremental_save(checkpoint_dir / "lit_model.pth") as saver: + # for checkpoints that split the QKV across several files, we need to keep all the bin files + # open, so we use `ExitStack` to close them all together at the end + for bin_file in sorted(bin_files): + print("Processing", bin_file) + hf_weights = lazy_load(bin_file) + copy_fn(sd, hf_weights, saver=saver, dtype=dtype) + gc.collect() + print("Saving converted checkpoint") + saver.save(sd) + + +if __name__ == "__main__": + from jsonargparse import CLI + + CLI(convert_hf_checkpoint) diff --git a/llm-lora-finetuning/scripts/convert_lit_checkpoint.py b/llm-lora-finetuning/scripts/convert_lit_checkpoint.py new file mode 100644 index 00000000..8a3b101a --- /dev/null +++ b/llm-lora-finetuning/scripts/convert_lit_checkpoint.py @@ -0,0 +1,272 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
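+
+# Example usage (illustrative sketch; all three paths are assumptions and depend on where the
+# merged checkpoint was written, e.g. by `scripts/merge_lora.py`):
+#
+#   python scripts/convert_lit_checkpoint.py \
+#       --checkpoint_path out/lora/checkpoint/lit_model.pth \
+#       --output_path out/converted/pytorch_model.bin \
+#       --config_path checkpoints/mistralai/Mistral-7B-Instruct-v0.1/lit_config.json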
+ +import gc +import sys +from functools import partial +from pathlib import Path +from typing import Dict, Optional, Tuple, Union + +import torch +from lightning.fabric.utilities.load import _NotYetLoadedTensor as NotYetLoadedTensor + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +from lit_gpt import Config +from lit_gpt.utils import CLI, incremental_save, lazy_load +from scripts.convert_hf_checkpoint import layer_template, load_param + + +def copy_weights_falcon( + model_name: str, + state_dict: Dict[str, torch.Tensor], + lit_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]], + saver: Optional[incremental_save] = None, +) -> None: + weight_map = { + "transformer.wte.weight": "transformer.word_embeddings.weight", + "transformer.h.{}.attn.attn.weight": "transformer.h.{}.self_attention.query_key_value.weight", + "transformer.h.{}.attn.proj.weight": "transformer.h.{}.self_attention.dense.weight", + "transformer.h.{}.mlp.fc.weight": "transformer.h.{}.mlp.dense_h_to_4h.weight", + "transformer.h.{}.mlp.proj.weight": "transformer.h.{}.mlp.dense_4h_to_h.weight", + "transformer.ln_f.bias": "transformer.ln_f.bias", + "transformer.ln_f.weight": "transformer.ln_f.weight", + "lm_head.weight": "lm_head.weight", + } + # the original model definition is different for each size + if "7b" in model_name: + weight_map.update( + { + "transformer.h.{}.norm_1.bias": "transformer.h.{}.input_layernorm.bias", + "transformer.h.{}.norm_1.weight": "transformer.h.{}.input_layernorm.weight", + } + ) + elif "40b" in model_name or "180B" in model_name: + weight_map.update( + { + "transformer.h.{}.norm_1.bias": "transformer.h.{}.ln_attn.bias", + "transformer.h.{}.norm_1.weight": "transformer.h.{}.ln_attn.weight", + "transformer.h.{}.norm_2.bias": "transformer.h.{}.ln_mlp.bias", + "transformer.h.{}.norm_2.weight": "transformer.h.{}.ln_mlp.weight", + } + ) + else: + raise NotImplementedError + + for name, param in lit_weights.items(): + if "transformer.h" in name: + from_name, number = layer_template(name, 2) + to_name = weight_map[from_name].format(number) + else: + to_name = weight_map[name] + param = load_param(param, name, None) + if saver is not None: + param = saver.store_early(param) + state_dict[to_name] = param + + +def copy_weights_gpt_neox( + state_dict: Dict[str, torch.Tensor], + lit_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]], + saver: Optional[incremental_save] = None, +) -> None: + weight_map = { + "transformer.wte.weight": "gpt_neox.embed_in.weight", + "transformer.h.{}.norm_1.bias": "gpt_neox.layers.{}.input_layernorm.bias", + "transformer.h.{}.norm_1.weight": "gpt_neox.layers.{}.input_layernorm.weight", + "transformer.h.{}.attn.attn.bias": "gpt_neox.layers.{}.attention.query_key_value.bias", + "transformer.h.{}.attn.attn.weight": "gpt_neox.layers.{}.attention.query_key_value.weight", + "transformer.h.{}.attn.proj.bias": "gpt_neox.layers.{}.attention.dense.bias", + "transformer.h.{}.attn.proj.weight": "gpt_neox.layers.{}.attention.dense.weight", + "transformer.h.{}.norm_2.bias": "gpt_neox.layers.{}.post_attention_layernorm.bias", + "transformer.h.{}.norm_2.weight": "gpt_neox.layers.{}.post_attention_layernorm.weight", + "transformer.h.{}.mlp.fc.bias": "gpt_neox.layers.{}.mlp.dense_h_to_4h.bias", + "transformer.h.{}.mlp.fc.weight": "gpt_neox.layers.{}.mlp.dense_h_to_4h.weight", + "transformer.h.{}.mlp.proj.bias": "gpt_neox.layers.{}.mlp.dense_4h_to_h.bias", + "transformer.h.{}.mlp.proj.weight": 
"gpt_neox.layers.{}.mlp.dense_4h_to_h.weight", + "transformer.ln_f.bias": "gpt_neox.final_layer_norm.bias", + "transformer.ln_f.weight": "gpt_neox.final_layer_norm.weight", + "lm_head.weight": "embed_out.weight", + } + + for name, param in lit_weights.items(): + if "transformer.h" in name: + from_name, number = layer_template(name, 2) + to_name = weight_map[from_name].format(number) + else: + to_name = weight_map[name] + param = load_param(param, name, None) + if saver is not None: + param = saver.store_early(param) + state_dict[to_name] = param + + +def copy_weights_llama( + config: Config, + state_dict: Dict[str, torch.Tensor], + lit_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]], + untie_weights: bool = False, + saver: Optional[incremental_save] = None, +) -> None: + weight_map = { + "transformer.wte.weight": "model.embed_tokens.weight", + "transformer.h.{}.norm_1.weight": "model.layers.{l}.input_layernorm.weight", + "transformer.h.{}.norm_1.bias": "model.layers.{l}.input_layernorm.bias", + "transformer.h.{}.attn.proj.weight": "model.layers.{l}.self_attn.o_proj.weight", + "transformer.h.{}.norm_2.weight": "model.layers.{l}.post_attention_layernorm.weight", + "transformer.h.{}.norm_2.bias": "model.layers.{l}.post_attention_layernorm.bias", + "transformer.ln_f.weight": "model.norm.weight", + "transformer.ln_f.bias": "model.norm.bias", + "lm_head.weight": "lm_head.weight", + } + if config._mlp_class == "LLaMAMoE": + weight_map.update( + { + "transformer.h.{}.mlp.gate.weight": "model.layers.{l}.block_sparse_moe.gate.weight", + "transformer.h.{}.mlp.experts.{}.fc_1.weight": "model.layers.{l}.block_sparse_moe.experts.{e}.w1.weight", + "transformer.h.{}.mlp.experts.{}.fc_2.weight": "model.layers.{l}.block_sparse_moe.experts.{e}.w3.weight", + "transformer.h.{}.mlp.experts.{}.proj.weight": "model.layers.{l}.block_sparse_moe.experts.{e}.w2.weight", + } + ) + elif config._mlp_class in ("LLaMAMLP", "GemmaMLP"): + weight_map.update( + { + "transformer.h.{}.mlp.fc_1.weight": "model.layers.{l}.mlp.gate_proj.weight", + "transformer.h.{}.mlp.fc_2.weight": "model.layers.{l}.mlp.up_proj.weight", + "transformer.h.{}.mlp.proj.weight": "model.layers.{l}.mlp.down_proj.weight", + } + ) + else: + raise NotImplementedError + + for name, param in lit_weights.items(): + if name == "lm_head.weight" and untie_weights: + continue + if name.endswith(".attn.attn.weight"): + from_name, l = layer_template(name, 2) + q = "model.layers.{}.self_attn.q_proj.weight".format(l) + k = "model.layers.{}.self_attn.k_proj.weight".format(l) + v = "model.layers.{}.self_attn.v_proj.weight".format(l) + qkv = load_param(param, name, None) + qp, kp, vp = qkv_split(qkv, config) + for to_name, param in zip((q, k, v), (qp, kp, vp)): + if saver is not None: + param = saver.store_early(param) + state_dict[to_name] = param + else: + if "transformer.h" in name: + from_name, l = layer_template(name, 2) + e = None + if "mlp.experts" in name: + from_name, e = layer_template(from_name, 5) + to_name = weight_map[from_name] + to_name = to_name.format(l=l, e=e) + else: + to_name = weight_map[name] + param = load_param(param, name, None) + if saver is not None: + param = saver.store_early(param) + state_dict[to_name] = param + + +def copy_weights_phi( + config: Config, + state_dict: Dict[str, torch.Tensor], + lit_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]], + saver: Optional[incremental_save] = None, +) -> None: + weight_map = { + "transformer.wte.weight": "model.embed_tokens.weight", + "transformer.h.{}.norm_1.weight": 
"model.layers.{}.input_layernorm.weight", + "transformer.h.{}.norm_1.bias": "model.layers.{}.input_layernorm.bias", + "transformer.h.{}.attn.proj.weight": "model.layers.{}.self_attn.dense.weight", + "transformer.h.{}.attn.proj.bias": "model.layers.{}.self_attn.dense.bias", + "transformer.h.{}.mlp.fc.weight": "model.layers.{}.mlp.fc1.weight", + "transformer.h.{}.mlp.fc.bias": "model.layers.{}.mlp.fc1.bias", + "transformer.h.{}.mlp.proj.weight": "model.layers.{}.mlp.fc2.weight", + "transformer.h.{}.mlp.proj.bias": "model.layers.{}.mlp.fc2.bias", + "transformer.ln_f.weight": "model.final_layernorm.weight", + "transformer.ln_f.bias": "model.final_layernorm.bias", + "lm_head.weight": "lm_head.weight", + "lm_head.bias": "lm_head.bias", + } + + for name, param in lit_weights.items(): + if name.endswith((".attn.attn.weight", ".attn.attn.bias")): + from_name, l = layer_template(name, 2) + weight_type = name.split(".")[-1] # weight or bias + q = f"model.layers.{l}.self_attn.q_proj.{weight_type}" + k = f"model.layers.{l}.self_attn.k_proj.{weight_type}" + v = f"model.layers.{l}.self_attn.v_proj.{weight_type}" + qkv = load_param(param, name, None) + qp, kp, vp = qkv_split(qkv, config) + for to_name, param in zip((q, k, v), (qp, kp, vp)): + if saver is not None: + param = saver.store_early(param) + state_dict[to_name] = param + else: + if "transformer.h" in name: + from_name, l = layer_template(name, 2) + to_name = weight_map[from_name] + to_name = to_name.format(l) + else: + to_name = weight_map[name] + param = load_param(param, name, None) + if saver is not None: + param = saver.store_early(param) + state_dict[to_name] = param + + +def qkv_split( + param: Union[torch.Tensor, NotYetLoadedTensor], config: Config +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + q_per_kv = config.n_head // config.n_query_groups + qs = [] + ks = [] + vs = [] + for chunk in torch.chunk(param, config.n_query_groups): + split = torch.split(chunk, [config.head_size * q_per_kv, config.head_size, config.head_size]) + qs.append(split[0]) + ks.append(split[1]) + vs.append(split[2]) + q = torch.cat(qs) + k = torch.cat(ks) + v = torch.cat(vs) + return q, k, v + + +def check_conversion_supported(lit_weights: Dict[str, torch.Tensor]) -> None: + if any("lora" in wn for wn in lit_weights): + raise ValueError("Checkpoints with LoRA weights cannot be converted. 
Call `scripts/merge_lora.py` first.")
+    if any("adapter" in wn or "gating_factor" in wn for wn in lit_weights):
+        raise NotImplementedError("Converting adapter models is not supported.")
+
+
+@torch.inference_mode()
+def convert_lit_checkpoint(checkpoint_path: Path, output_path: Path, config_path: Path) -> None:
+    config = Config.from_json(config_path)
+
+    if "falcon" in config.name:
+        copy_fn = partial(copy_weights_falcon, config.name)
+    elif config._mlp_class in ("LLaMAMLP", "GemmaMLP", "LLaMAMoE"):
+        untie_weights = "Gemma" in config.name
+        copy_fn = partial(copy_weights_llama, config, untie_weights=untie_weights)
+    elif "phi" in config.name:
+        copy_fn = partial(copy_weights_phi, config)
+    else:
+        copy_fn = copy_weights_gpt_neox
+
+    # initialize a new empty state dict to hold our new weights
+    sd = {}
+    with incremental_save(output_path) as saver:
+        lit_weights = lazy_load(checkpoint_path)
+        lit_weights = lit_weights.get("model", lit_weights)
+        check_conversion_supported(lit_weights)
+        copy_fn(sd, lit_weights, saver=saver)
+        gc.collect()
+        saver.save(sd)
+
+
+if __name__ == "__main__":
+    CLI(convert_lit_checkpoint)
diff --git a/llm-lora-finetuning/scripts/convert_pretrained_checkpoint.py b/llm-lora-finetuning/scripts/convert_pretrained_checkpoint.py
new file mode 100644
index 00000000..b32103e0
--- /dev/null
+++ b/llm-lora-finetuning/scripts/convert_pretrained_checkpoint.py
@@ -0,0 +1,78 @@
+# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
+
+import json
+import shutil
+import sys
+from dataclasses import asdict
+from pathlib import Path
+
+import torch
+
+# support running without installing as a package
+wd = Path(__file__).parent.parent.resolve()
+sys.path.append(str(wd))
+
+from lit_gpt import Config
+from lit_gpt.utils import CLI, incremental_save
+
+
+@torch.inference_mode()
+def convert_checkpoint(checkpoint_file: Path, tokenizer_dir: Path, config_name: str, output_dir: Path) -> None:
+    """Convert a checkpoint after pretraining.
+
+    The pretrained checkpoint contains optimizer states and other metadata that are not needed after training
+    is finished. This script will export the state-dict of the model and place it in the chosen output folder together
+    with the tokenizer and model config, which then can be loaded by other scripts for inference, evaluation, etc.
+
+    Args:
+        checkpoint_file: Path to a checkpoint file produced by the scripts in ``lit_gpt/pretrain/``.
+        tokenizer_dir: A path to the folder that holds the tokenizer configuration files that were used to train
+            the model. All files with a name starting with 'tokenizer' will be copied to the output folder.
+        config_name: The name of the model loaded with the ``lit_gpt.Config``. The configuration will be saved as a
+            JSON file to the output folder.
+        output_dir: The output folder where the model state-dict file, the tokenizer config file, and the model config
+            file will be saved.
+    """
+
+    if output_dir.is_dir() and any(output_dir.iterdir()):
+        raise FileExistsError(
+            f"The output folder exists and is not empty: {str(output_dir)}."
+            " Please delete it first or choose a different name."
+        )
+    if not tokenizer_dir.is_dir():
+        raise FileNotFoundError(f"The tokenizer_dir must be a directory: {str(tokenizer_dir)}.")
+
+    output_dir.mkdir(parents=True)
+    output_checkpoint_file = output_dir / "lit_model.pth"
+    output_config_file = output_dir / "lit_config.json"
+
+    # Save the config to output folder
+    config = Config.from_name(config_name)
+    with open(output_config_file, "w") as json_config:
+        json.dump(asdict(config), json_config)
+
+    # Export the tokenizer configuration to output folder
+    for tokenizer_file in tokenizer_dir.glob("tokenizer*"):
+        shutil.copyfile(tokenizer_file, output_dir / tokenizer_file.name)
+
+    # Copy config for tokenization if found
+    if (tokenizer_dir / "generation_config.json").is_file():
+        shutil.copyfile(tokenizer_dir / "generation_config.json", output_dir / "generation_config.json")
+
+    # Extract the model state dict and save to output folder
+    with incremental_save(output_checkpoint_file) as saver:
+        print("Processing", checkpoint_file)
+        full_checkpoint = torch.load(str(checkpoint_file), mmap=True)
+        loaded_state_dict = full_checkpoint["model"]
+        converted_state_dict = {}
+        for param_name, param in loaded_state_dict.items():
+            saver.store_early(param)
+            # remove prefix for compiled model (if any)
+            param_name = param_name.replace("_orig_mod.", "")
+            converted_state_dict[param_name] = param
+        print(f"Saving converted checkpoint to {str(output_checkpoint_file)}.")
+        saver.save(converted_state_dict)
+
+
+if __name__ == "__main__":
+    CLI(convert_checkpoint)
diff --git a/llm-lora-finetuning/scripts/download.py b/llm-lora-finetuning/scripts/download.py
new file mode 100644
index 00000000..b1a1a78f
--- /dev/null
+++ b/llm-lora-finetuning/scripts/download.py
@@ -0,0 +1,97 @@
+# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
+
+import os
+import sys
+from pathlib import Path
+from typing import Optional
+
+import torch
+from lightning_utilities.core.imports import RequirementCache
+
+# support running without installing as a package
+wd = Path(__file__).parent.parent.resolve()
+sys.path.append(str(wd))
+
+from lit_gpt.utils import CLI
+
+_SAFETENSORS_AVAILABLE = RequirementCache("safetensors")
+_HF_TRANSFER_AVAILABLE = RequirementCache("hf_transfer")
+
+
+def download_from_hub(
+    repo_id: Optional[str] = None,
+    access_token: Optional[str] = os.getenv("HF_TOKEN"),
+    from_safetensors: bool = False,
+    tokenizer_only: bool = False,
+    checkpoint_dir: Path = Path("checkpoints"),
+) -> None:
+    if repo_id is None:
+        from lit_gpt.config import configs
+
+        options = [f"{config['hf_config']['org']}/{config['hf_config']['name']}" for config in configs]
+        print("Please specify --repo_id. Available values:")
+        print("\n".join(options))
+        return
+
+    from huggingface_hub import snapshot_download
+
+    if ("meta-llama" in repo_id or "falcon-180" in repo_id) and not access_token:
+        raise ValueError(
+            f"{repo_id} requires authentication, please set the `HF_TOKEN=your_token` environment"
+            " variable or pass --access_token=your_token.
You can find your token by visiting" + " https://huggingface.co/settings/tokens" + ) + + download_files = ["tokenizer*", "generation_config.json"] + if not tokenizer_only: + if from_safetensors: + if not _SAFETENSORS_AVAILABLE: + raise ModuleNotFoundError(str(_SAFETENSORS_AVAILABLE)) + download_files.append("*.safetensors") + else: + # covers `.bin` files and `.bin.index.json` + download_files.append("*.bin*") + elif from_safetensors: + raise ValueError("`--from_safetensors=True` won't have an effect with `--tokenizer_only=True`") + + import huggingface_hub._snapshot_download as download + import huggingface_hub.constants as constants + + previous = constants.HF_HUB_ENABLE_HF_TRANSFER + if _HF_TRANSFER_AVAILABLE and not previous: + print("Setting HF_HUB_ENABLE_HF_TRANSFER=1") + constants.HF_HUB_ENABLE_HF_TRANSFER = True + download.HF_HUB_ENABLE_HF_TRANSFER = True + + directory = checkpoint_dir / repo_id + snapshot_download( + repo_id, + local_dir=directory, + local_dir_use_symlinks=False, + resume_download=True, + allow_patterns=download_files, + token=access_token, + ) + + constants.HF_HUB_ENABLE_HF_TRANSFER = previous + download.HF_HUB_ENABLE_HF_TRANSFER = previous + + # convert safetensors to PyTorch binaries + if from_safetensors: + from safetensors import SafetensorError + from safetensors.torch import load_file as safetensors_load + + print("Converting .safetensor files to PyTorch binaries (.bin)") + for safetensor_path in directory.glob("*.safetensors"): + bin_path = safetensor_path.with_suffix(".bin") + try: + result = safetensors_load(safetensor_path) + except SafetensorError as e: + raise RuntimeError(f"{safetensor_path} is likely corrupted. Please try to re-download it.") from e + print(f"{safetensor_path} --> {bin_path}") + torch.save(result, bin_path) + os.remove(safetensor_path) + + +if __name__ == "__main__": + CLI(download_from_hub) diff --git a/llm-lora-finetuning/scripts/merge_lora.py b/llm-lora-finetuning/scripts/merge_lora.py new file mode 100644 index 00000000..c25f87f4 --- /dev/null +++ b/llm-lora-finetuning/scripts/merge_lora.py @@ -0,0 +1,83 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +"""This script merges the LoRA weights with the base model""" + +import sys +from pathlib import Path +from typing import Optional + +import lightning as L +import torch + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +from lit_gpt.lora import GPT, Config, lora_filter, merge_lora_weights +from lit_gpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision, lazy_load + + +def merge_lora( + lora_path: Path = Path("out/lora/alpaca/lit_model_lora_finetuned.pth"), + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + out_dir: Path = Path("out/lora/checkpoint"), + precision: Optional[str] = None, + lora_r: int = 8, + lora_alpha: int = 16, + lora_dropout: float = 0.05, + lora_query: bool = True, + lora_key: bool = False, + lora_value: bool = True, + lora_projection: bool = False, + lora_mlp: bool = False, + lora_head: bool = False, +) -> None: + """Generates a response based on a given instruction and an optional input. + This script will only work with checkpoints from the instruction-tuned GPT-LoRA model. + See `finetune/lora.py`. + + Args: + lora_path: Path to the checkpoint with trained adapter weights, which are the output of + `finetune/lora.py`. 
+ checkpoint_dir: The path to the checkpoint folder with pretrained GPT weights. + out_dir: The path to the merged model that is created by this script. + precision: Indicates the Fabric precision setting to use. + """ + check_valid_checkpoint_dir(checkpoint_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + precision = precision or get_default_supported_precision(training=False) + fabric = L.Fabric(devices=1, precision=precision) + + config = Config.from_json( + checkpoint_dir / "lit_config.json", + r=lora_r, + alpha=lora_alpha, + dropout=lora_dropout, + to_query=lora_query, + to_key=lora_key, + to_value=lora_value, + to_projection=lora_projection, + to_mlp=lora_mlp, + to_head=lora_head, + ) + + with fabric.init_module(empty_init=True): + model = GPT(config) + checkpoint_path = checkpoint_dir / "lit_model.pth" + checkpoint = lazy_load(checkpoint_path) + lora_checkpoint = lazy_load(lora_path) + checkpoint.update(lora_checkpoint.get("model", lora_checkpoint)) + model.load_state_dict(checkpoint) + + merge_lora_weights(model) + + save_path = out_dir / "lit_model.pth" + fabric.print(f"Saving weights to {str(save_path)!r}") + # remove lora parameters and the lora linear substring + state_dict = {k.replace("linear.", ""): v for k, v in model.state_dict().items() if not lora_filter(k, v)} + torch.save(state_dict, save_path) + + +if __name__ == "__main__": + CLI(merge_lora) diff --git a/llm-lora-finetuning/scripts/prepare_alpaca.py b/llm-lora-finetuning/scripts/prepare_alpaca.py new file mode 100644 index 00000000..61ca7bf3 --- /dev/null +++ b/llm-lora-finetuning/scripts/prepare_alpaca.py @@ -0,0 +1,151 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +"""Implementation derived from https://github.com/tloen/alpaca-lora""" + +import json +import sys +from pathlib import Path +from typing import Optional + +import torch +from lightning_utilities.core.imports import RequirementCache +from torch.utils.data import random_split +from tqdm import tqdm + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +from lit_gpt.tokenizer import Tokenizer +from lit_gpt.utils import CLI + + +def prepare( + destination_path: Path = Path("data/alpaca"), + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + test_split_fraction: float = 0.03865, # to get exactly 2000 test samples, + seed: int = 42, + mask_inputs: bool = False, # as in alpaca-lora + data_file_name: str = "alpaca_data_cleaned_archive.json", + data_file_url: str = "https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_cleaned_archive.json", + ignore_index: int = -1, + max_seq_length: Optional[int] = None, +) -> None: + """Prepare the Alpaca dataset for instruction tuning. + + The output is a training and test dataset saved as `train.pt` and `test.pt`, + which stores the preprocessed and tokenized prompts and labels. 
+ """ + if max_seq_length is None: + with open(checkpoint_dir / "lit_config.json", "r", encoding="utf-8") as file: + config = json.load(file) + max_seq_length = config["block_size"] + + destination_path.mkdir(parents=True, exist_ok=True) + data_file_path = destination_path / data_file_name + print("Loading data file...") + download_if_missing(data_file_path, data_file_url) + with open(data_file_path, "r", encoding="utf-8") as file: + data = json.load(file) + + print("Loading tokenizer...") + tokenizer = Tokenizer(checkpoint_dir) + + # Partition the dataset into train and test + train_set, test_set = random_split( + data, [1.0 - test_split_fraction, test_split_fraction], generator=torch.Generator().manual_seed(seed) + ) + train_set, test_set = list(train_set), list(test_set) + + print(f"train has {len(train_set):,} samples") + print(f"test has {len(test_set):,} samples") + + print("Processing train split ...") + train_set = [ + prepare_sample( + example=sample, + tokenizer=tokenizer, + max_length=max_seq_length, + mask_inputs=mask_inputs, + ignore_index=ignore_index, + ) + for sample in tqdm(train_set) + ] + torch.save(train_set, destination_path / "train.pt") + + print("Processing test split ...") + test_set = [ + prepare_sample( + example=sample, + tokenizer=tokenizer, + max_length=max_seq_length, + mask_inputs=mask_inputs, + ignore_index=ignore_index, + ) + for sample in tqdm(test_set) + ] + torch.save(test_set, destination_path / "test.pt") + + +def download_if_missing(file_path: Path, file_url: str) -> None: + """Downloads the raw json data file and saves it in the given destination.""" + if file_path.exists() and file_path.stat().st_size > 0: + return + requests_available = RequirementCache("requests") + if not requests_available: + raise ModuleNotFoundError(str(requests_available)) + import requests + + with open(file_path, "w", encoding="utf-8") as f: + f.write(requests.get(file_url).text) + + +def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool, ignore_index: int) -> dict: + """Processes a single sample. + + Each sample in the dataset consists of: + - instruction: A string describing the task + - input: A string holding a special input value for the instruction. + This only applies to some samples, and in others this is empty. + - output: The response string + + This function processes this data to produce a prompt text and a label for + supervised training. The prompt text is formed as a single message including both + the instruction and the input. The label/target is the same message but with the + response attached. + + Finally, both the prompt and the label get tokenized. If desired, all tokens + in the label that correspond to the original input prompt get masked out (default). 
+ """ + full_prompt = generate_prompt(example) + full_prompt_and_response = full_prompt + example["output"] + encoded_full_prompt = tokenizer.encode(full_prompt, max_length=max_length) + encoded_full_prompt_and_response = tokenizer.encode(full_prompt_and_response, eos=True, max_length=max_length) + + # The labels are the full prompt with response, but with the prompt masked out + labels = encoded_full_prompt_and_response.clone() + if mask_inputs: + labels[: len(encoded_full_prompt)] = ignore_index + + return {**example, "input_ids": encoded_full_prompt_and_response, "labels": labels} + + +def generate_prompt(example: dict) -> str: + """Generates a standardized message to prompt the model with an instruction, optional input and a + 'response' field.""" + + if example["input"]: + return ( + "Below is an instruction that describes a task, paired with an input that provides further context. " + "Write a response that appropriately completes the request.\n\n" + f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:" + ) + return ( + "Below is an instruction that describes a task. " + "Write a response that appropriately completes the request.\n\n" + f"### Instruction:\n{example['instruction']}\n\n### Response:" + ) + + +if __name__ == "__main__": + CLI(prepare) diff --git a/llm-lora-finetuning/scripts/prepare_csv.py b/llm-lora-finetuning/scripts/prepare_csv.py new file mode 100644 index 00000000..89dd43f9 --- /dev/null +++ b/llm-lora-finetuning/scripts/prepare_csv.py @@ -0,0 +1,139 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +import json +import logging +import sys +from pathlib import Path +from typing import Optional, Tuple + +import torch +from torch.utils.data import random_split +from tqdm import tqdm + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +logger = logging.getLogger(__name__) +sys.path.append(str(wd)) + +from lit_gpt.tokenizer import Tokenizer +from lit_gpt.utils import CLI + + +def prepare( + csv_path: Path, + destination_path: Path = Path("data/csv"), + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + test_split_fraction: float = 0.1, + seed: int = 42, + mask_inputs: bool = False, + ignore_index: int = -1, + max_seq_length: Optional[int] = None, + columns: Tuple[str, ...] = ("instruction", "input", "output"), +) -> None: + """Prepare a CSV dataset for instruction tuning. + + The output is a training and test dataset saved as `train.pt` and `test.pt`, + which stores the preprocessed and tokenized prompts and labels. 
+ """ + if max_seq_length is None: + with open(checkpoint_dir / "lit_config.json", "r") as file: + config = json.load(file) + max_seq_length = config["block_size"] + + destination_path.mkdir(parents=True, exist_ok=True) + logger.info("Loading data file ...") + import pandas as pd + + df = pd.read_csv(csv_path, dtype=str).fillna("") + if not (df.columns.values == columns).all(): + raise ValueError(f"CSV columns must be {columns}, found {df.columns.values}") + data = json.loads(df.to_json(orient="records", indent=4)) + + print("Loading tokenizer...") + tokenizer = Tokenizer(checkpoint_dir) + + # Partition the dataset into train and test + train_set, test_set = random_split( + data, [1.0 - test_split_fraction, test_split_fraction], generator=torch.Generator().manual_seed(seed) + ) + train_set, test_set = list(train_set), list(test_set) + + print(f"train has {len(train_set):,} samples") + print(f"test has {len(test_set):,} samples") + + print("Processing train split ...") + train_set = [ + prepare_sample( + example=sample, + tokenizer=tokenizer, + max_length=max_seq_length, + mask_inputs=mask_inputs, + ignore_index=ignore_index, + ) + for sample in tqdm(train_set) + ] + torch.save(train_set, destination_path / "train.pt") + + print("Processing test split ...") + test_set = [ + prepare_sample( + example=sample, + tokenizer=tokenizer, + max_length=max_seq_length, + mask_inputs=mask_inputs, + ignore_index=ignore_index, + ) + for sample in tqdm(test_set) + ] + torch.save(test_set, destination_path / "test.pt") + + +def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool, ignore_index: int) -> dict: + """Processes a single sample. + + Each sample in the dataset consists of: + - instruction: A string describing the task + - input: A string holding a special input value for the instruction. + This only applies to some samples, and in others this is empty. + - output: The response string + + This function processes this data to produce a prompt text and a label for + supervised training. The prompt text is formed as a single message including both + the instruction and the input. The label/target is the same message but with the + response attached. + + Finally, both the prompt and the label get tokenized. If desired, all tokens + in the label that correspond to the original input prompt get masked out (default). + """ + full_prompt = generate_prompt(example) + full_prompt_and_response = full_prompt + example["output"] + encoded_full_prompt = tokenizer.encode(full_prompt, max_length=max_length) + encoded_full_prompt_and_response = tokenizer.encode(full_prompt_and_response, eos=True, max_length=max_length) + + # The labels are the full prompt with response, but with the prompt masked out + labels = encoded_full_prompt_and_response.clone() + if mask_inputs: + labels[: len(encoded_full_prompt)] = ignore_index + + return {**example, "input_ids": encoded_full_prompt_and_response, "labels": labels} + + +def generate_prompt(example: dict) -> str: + """Generates a standardized message to prompt the model with an instruction, optional input and a + 'response' field.""" + + if example["input"]: + return ( + "Below is an instruction that describes a task, paired with an input that provides further context. " + "Write a response that appropriately completes the request.\n\n" + f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:" + ) + return ( + "Below is an instruction that describes a task. 
" + "Write a response that appropriately completes the request.\n\n" + f"### Instruction:\n{example['instruction']}\n\n### Response:" + ) + + +if __name__ == "__main__": + CLI(prepare) diff --git a/llm-lora-finetuning/scripts/prepare_dolly.py b/llm-lora-finetuning/scripts/prepare_dolly.py new file mode 100644 index 00000000..56da37ce --- /dev/null +++ b/llm-lora-finetuning/scripts/prepare_dolly.py @@ -0,0 +1,144 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +"""Implementation derived from https://github.com/tloen/alpaca-lora""" + +import json +import sys +from pathlib import Path +from typing import Optional + +import torch +from torch.utils.data import random_split +from tqdm import tqdm + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +from lit_gpt.tokenizer import Tokenizer +from lit_gpt.utils import CLI +from scripts.prepare_alpaca import download_if_missing + + +def prepare( + destination_path: Path = Path("data/dolly"), + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + test_split_fraction: float = 0.1, + seed: int = 42, + mask_inputs: bool = False, + data_file_name: str = "dolly_data_cleaned.json", + data_file_url: str = "https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl", + ignore_index: int = -1, + max_seq_length: Optional[int] = None, +) -> None: + """Prepare the Dolly 15k dataset for instruction tuning. + + The output is a training and test dataset saved as `train.pt` and `test.pt`, + which stores the preprocessed and tokenized prompts and labels. + """ + + if max_seq_length is None: + with open(checkpoint_dir / "lit_config.json", "r", encoding="utf-8") as file: + config = json.load(file) + max_seq_length = config["block_size"] + + destination_path.mkdir(parents=True, exist_ok=True) + data_file_path = destination_path / data_file_name + print("Loading data file...") + download_if_missing(data_file_path, data_file_url) + + with open(data_file_path, "r", encoding="utf-8") as file: + data = file.readlines() + data = [json.loads(line) for line in data] + for item in data: + item["input"] = item.pop("context") + item["output"] = item.pop("response") + + print("Loading tokenizer...") + tokenizer = Tokenizer(checkpoint_dir) + + # Partition the dataset into train and test + train_set, test_set = random_split( + data, [1.0 - test_split_fraction, test_split_fraction], generator=torch.Generator().manual_seed(seed) + ) + train_set, test_set = list(train_set), list(test_set) + + print(f"train has {len(train_set):,} samples") + print(f"test has {len(test_set):,} samples") + + print("Processing train split ...") + train_set = [ + prepare_sample( + example=sample, + tokenizer=tokenizer, + max_length=max_seq_length, + mask_inputs=mask_inputs, + ignore_index=ignore_index, + ) + for sample in tqdm(train_set) + ] + torch.save(train_set, destination_path / "train.pt") + + print("Processing test split ...") + test_set = [ + prepare_sample( + example=sample, + tokenizer=tokenizer, + max_length=max_seq_length, + mask_inputs=mask_inputs, + ignore_index=ignore_index, + ) + for sample in tqdm(test_set) + ] + torch.save(test_set, destination_path / "test.pt") + + +def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool, ignore_index: int) -> dict: + """Processes a single sample. 
+ + Each sample in the dataset consists of: + - instruction: A string describing the task + - input: A string holding a special input value for the instruction. + This only applies to some samples, and in others this is empty. + - output: The response string + + This function processes this data to produce a prompt text and a label for + supervised training. The prompt text is formed as a single message including both + the instruction and the input. The label/target is the same message but with the + response attached. + + Finally, both the prompt and the label get tokenized. If desired, all tokens + in the label that correspond to the original input prompt get masked out (default). + """ + full_prompt = generate_prompt(example) + full_prompt_and_response = full_prompt + example["output"] + encoded_full_prompt = tokenizer.encode(full_prompt, max_length=max_length) + encoded_full_prompt_and_response = tokenizer.encode(full_prompt_and_response, eos=True, max_length=max_length) + + # The labels are the full prompt with response, but with the prompt masked out + labels = encoded_full_prompt_and_response.clone() + if mask_inputs: + labels[: len(encoded_full_prompt)] = ignore_index + + return {**example, "input_ids": encoded_full_prompt_and_response, "labels": labels} + + +def generate_prompt(example: dict) -> str: + """Generates a standardized message to prompt the model with an instruction, optional input and a + 'response' field.""" + + if example["input"]: + return ( + "Below is an instruction that describes a task, paired with an input that provides further context. " + "Write a response that appropriately completes the request.\n\n" + f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:" + ) + return ( + "Below is an instruction that describes a task. " + "Write a response that appropriately completes the request.\n\n" + f"### Instruction:\n{example['instruction']}\n\n### Response:" + ) + + +if __name__ == "__main__": + CLI(prepare) diff --git a/llm-lora-finetuning/scripts/prepare_flan.py b/llm-lora-finetuning/scripts/prepare_flan.py new file mode 100644 index 00000000..59d3a7fa --- /dev/null +++ b/llm-lora-finetuning/scripts/prepare_flan.py @@ -0,0 +1,232 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +"""Implementation derived from https://github.com/tloen/alpaca-lora""" +import json +import sys +from pathlib import Path +from typing import Optional + +import torch +from tqdm import tqdm + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +from lit_gpt.tokenizer import Tokenizer +from lit_gpt.utils import CLI +from scripts.prepare_alpaca import download_if_missing + + +def load_jsonl(filename): + data = [] + with open(filename, "r", encoding="utf-8") as f: + for line in f: + data.append(json.loads(line)) + return data + + +def prepare( + destination_path: Path = Path("data/flan"), + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + mask_inputs: bool = False, # as in alpaca-lora + subsets: Optional[str] = None, + ignore_index: int = -1, + max_seq_length: Optional[int] = None, +) -> None: + """Prepare the FLAN-collection datasets for instruction tuning. + + The output is a training and test dataset saved as `train.pt` and `test.pt`, + which stores the preprocessed and tokenized prompts and labels. + + Since the original test set does not have responses, the validation set + is used as the test set. 
+ """ + + supported_subsets = { + "aeslc_10templates", + "ag_news_subset_10templates", + "anli_r1_10templates", + "anli_r2_10templates", + "anli_r3_10templates", + "arc_challenge_10templates", + "arc_easy_10templates", + "bool_q_10templates", + "cb_10templates", + "cnn_dailymail_10templates", + "cola_10templates", + "common_gen_10templates", + "copa_10templates", + "coqa_10templates", + "cosmos_qa_10templates", + "dart_10templates", + "definite_pronoun_resolution_10templates", + "drop_10templates", + "e2e_nlg_10templates", + "fix_punct_10templates", + "gigaword_10templates", + "glue_mrpc_10templates", + "glue_qqp_10templates", + "hellaswag_10templates", + "imdb_reviews_10templates", + "math_dataset_10templates", + "mnli_matched_10templates", + "mnli_mismatched_10templates", + "multi_news_10templates", + "multirc_10templates", + "natural_questions_10templates", + "openbookqa_10templates", + "opinion_abstracts_idebate_10templates", + "opinion_abstracts_rotten_tomatoes_10templates", + "para_crawl_enes_10templates", + "paws_wiki_10templates", + "piqa_10templates", + "qnli_10templates", + "quac_10templates", + "record_10templates", + "rte_10templates", + "samsum_10templates", + "sentiment140_10templates", + "snli_10templates", + "squad_v1_10templates", + "squad_v2_10templates", + "sst2_10templates", + "story_cloze_10templates", + "stsb_10templates", + "trec_10templates", + "trivia_qa_10templates", + "true_case_10templates", + "web_nlg_en_10templates", + "wic_10templates", + "wiki_lingua_english_en_10templates", + "wmt14_enfr_10templates", + "wmt16_translate_csen_10templates", + "wmt16_translate_deen_10templates", + "wmt16_translate_fien_10templates", + "wmt16_translate_roen_10templates", + "wmt16_translate_ruen_10templates", + "wmt16_translate_tren_10templates", + "wnli_10templates", + "word_segment_10templates", + "wsc_10templates", + "yelp_polarity_reviews_10templates", + } + + if subsets is not None: + subsets = subsets.split(",") + for sub in subsets: + if sub not in supported_subsets: + raise ValueError(f"{sub} not in {supported_subsets}") + else: + subsets = list(supported_subsets) + + if max_seq_length is None: + with open(checkpoint_dir / "lit_config.json", "r", encoding="utf-8") as file: + config = json.load(file) + max_seq_length = config["block_size"] + + destination_path.mkdir(parents=True, exist_ok=True) + print("Loading data file...") + + base_url = "https://huggingface.co/datasets/Muennighoff/flan/resolve/main/" + + train_set, test_set = [], [] + for sub in subsets: + train_sub = sub + "_train" + data_file_name = train_sub + ".jsonl" + data_file_path = destination_path / data_file_name + data_file_url = base_url + "train/" + data_file_name + + print(f"Loading training data file {sub}...") + download_if_missing(data_file_path, data_file_url) + sub_train_set = load_jsonl(data_file_path) + train_set.extend(sub_train_set) + + test_sub = sub + "_test" + data_file_name = test_sub + ".jsonl" + data_file_path = destination_path / data_file_name + data_file_url = base_url + "test/" + data_file_name + + print(f"Loading test data file {sub}...") + download_if_missing(data_file_path, data_file_url) + sub_test_set = load_jsonl(data_file_path) + test_set.extend(sub_test_set) + + print("Loading tokenizer...") + tokenizer = Tokenizer(checkpoint_dir) + + train_set, test_set = list(train_set), list(test_set) + + print(f"train has {len(train_set):,} samples") + print(f"test has {len(test_set):,} samples") + + print("Processing train split ...") + train_set = [ + prepare_sample( + example=sample, + 
tokenizer=tokenizer, + max_length=max_seq_length, + mask_inputs=mask_inputs, + ignore_index=ignore_index, + ) + for sample in tqdm(train_set) + ] + torch.save(train_set, destination_path / "train.pt") + + print("Processing test split ...") + test_set = [ + prepare_sample( + example=sample, + tokenizer=tokenizer, + max_length=max_seq_length, + mask_inputs=mask_inputs, + ignore_index=ignore_index, + ) + for sample in tqdm(test_set) + ] + torch.save(test_set, destination_path / "test.pt") + + +def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool, ignore_index: int): + """Processes a single sample. + + Each sample in the dataset consists of: + - instruction: A string describing the task + - input: A string holding a special input value for the instruction. + This only applies to some samples, and in others this is empty. + - output: The response string + + This function processes this data to produce a prompt text and a label for + supervised training. The prompt text is formed as a single message including both + the instruction and the input. The label/target is the same message but with the + response attached. + + Finally, both the prompt and the label get tokenized. If desired, all tokens + in the label that correspond to the original input prompt get masked out (default). + """ + full_prompt = generate_prompt(example) + full_prompt_and_response = full_prompt + example["targets"] + encoded_full_prompt = tokenizer.encode(full_prompt, max_length=max_length) + encoded_full_prompt_and_response = tokenizer.encode(full_prompt_and_response, eos=True, max_length=max_length) + + # The labels are the full prompt with response, but with the prompt masked out + labels = encoded_full_prompt_and_response.clone() + if mask_inputs: + labels[: len(encoded_full_prompt)] = ignore_index + + return {**example, "input_ids": encoded_full_prompt_and_response, "labels": labels} + + +def generate_prompt(example): + """Generates a standardized message to prompt the model with an instruction, optional input and a + 'response' field.""" + + return ( + "Below is an instruction that describes a task. " + "Write a response that appropriately completes the request.\n\n" + f"### Instruction:\n{example['inputs']}\n\n### Response:" + ) + + +if __name__ == "__main__": + CLI(prepare) diff --git a/llm-lora-finetuning/scripts/prepare_lima.py b/llm-lora-finetuning/scripts/prepare_lima.py new file mode 100644 index 00000000..ca35e62b --- /dev/null +++ b/llm-lora-finetuning/scripts/prepare_lima.py @@ -0,0 +1,168 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
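The prepare_sample() functions in prepare_flan.py above and in the prepare_lima.py / prepare_longform.py scripts that follow all share one labeling convention: the prompt-plus-response is tokenized once, and when mask_inputs is set the prompt portion of the labels is overwritten with ignore_index so the loss is computed on the response tokens only. A minimal sketch of that convention, using made-up token ids in place of a real tokenizer:

import torch

ignore_index = -1  # matches the scripts' default

# Hypothetical ids standing in for tokenizer.encode(...) output.
encoded_full_prompt = torch.tensor([12, 7, 99, 3])                         # "### Instruction: ... ### Response:"
encoded_full_prompt_and_response = torch.tensor([12, 7, 99, 3, 41, 8, 2])  # prompt + response + <eos>

labels = encoded_full_prompt_and_response.clone()
labels[: len(encoded_full_prompt)] = ignore_index  # mask the prompt out of the loss

assert labels.tolist() == [-1, -1, -1, -1, 41, 8, 2]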
+ +"""Implementation derived from https://github.com/tloen/alpaca-lora""" + +import json +import os +import sys +from pathlib import Path +from typing import List, Optional + +import torch +from torch.utils.data import random_split +from tqdm import tqdm + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +from lit_gpt.tokenizer import Tokenizer +from lit_gpt.utils import CLI + + +def prepare( + destination_path: Path = Path("data/lima"), + test_split_fraction: float = 0.1, + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + mask_inputs: bool = False, # as in alpaca-lora + seed: int = 42, + include_multiturn_conversations: bool = False, + data_repo_id: str = "GAIR/lima", + ignore_index: int = -1, + access_token: Optional[str] = os.getenv("HF_TOKEN"), + max_seq_length: Optional[int] = None, +) -> None: + """Prepare the LIMA dataset for instruction tuning. + + The output is a training and test dataset saved as `train.pt` and `test.pt`, + which stores the preprocessed and tokenized prompts and labels. + """ + + if access_token is None: + raise ValueError( + "LIMA requires authentication, please set the `HF_TOKEN=your_token` environment" + " variable or pass --access_token=your_token. You can find your token by visiting" + " https://huggingface.co/settings/tokens" + ) + + if max_seq_length is None: + with open(checkpoint_dir / "lit_config.json", "r", encoding="utf-8") as file: + config = json.load(file) + max_seq_length = config["block_size"] + + destination_path.mkdir(parents=True, exist_ok=True) + print("Loading data file...") + + from datasets import load_dataset + + dataset = load_dataset(data_repo_id, token=access_token) + train_data = format_dataset(dataset["train"], include_multiturn_conversations) + + # test set is present but doesn't have any solutions, so we cannot use it here + # but have to create our own + # for consistency with prepare_alpaca.py and prepare_dolly.py + # test_set = format_dataset(dataset["test"], include_multiturn_conversations) + + print("Loading tokenizer...") + tokenizer = Tokenizer(checkpoint_dir) + + # Partition the dataset into train and test + train_set, test_set = random_split( + train_data, [1.0 - test_split_fraction, test_split_fraction], generator=torch.Generator().manual_seed(seed) + ) + train_set, test_set = list(train_set), list(test_set) + + print(f"train has {len(train_set):,} samples") + print(f"test has {len(test_set):,} samples") + + print("Processing train split ...") + train_set = [ + prepare_sample( + example=sample, + tokenizer=tokenizer, + max_length=max_seq_length, + mask_inputs=mask_inputs, + ignore_index=ignore_index, + ) + for sample in tqdm(train_set) + ] + torch.save(train_set, destination_path / "train.pt") + + print("Processing test split ...") + test_set = [ + prepare_sample( + example=sample, + tokenizer=tokenizer, + max_length=max_seq_length, + mask_inputs=mask_inputs, + ignore_index=ignore_index, + ) + for sample in tqdm(test_set) + ] + torch.save(test_set, destination_path / "test.pt") + + +def format_dataset(dataset_partition: dict, include_multi_turn_conversations: bool) -> List[dict]: + formatted_ds = [] + + for entry in dataset_partition: + convo = entry["conversations"] + if include_multi_turn_conversations: + for i in range(0, len(convo) - 1, 2): + formatted_ds.append({"instruction": convo[i], "input": "", "output": convo[i + 1]}) + + else: + formatted_ds.append({"instruction": convo[0], "input": "", "output": 
convo[1]}) + + return formatted_ds + + +def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool, ignore_index: int) -> dict: + """Processes a single sample. + + Each sample in the dataset consists of: + - instruction: A string describing the task + - input: A string holding a special input value for the instruction. + This only applies to some samples, and in others this is empty. + - output: The response string + + This function processes this data to produce a prompt text and a label for + supervised training. The prompt text is formed as a single message including both + the instruction and the input. The label/target is the same message but with the + response attached. + + Finally, both the prompt and the label get tokenized. If desired, all tokens + in the label that correspond to the original input prompt get masked out (default). + """ + full_prompt = generate_prompt(example) + full_prompt_and_response = full_prompt + example["output"] + encoded_full_prompt = tokenizer.encode(full_prompt, max_length=max_length) + encoded_full_prompt_and_response = tokenizer.encode(full_prompt_and_response, eos=True, max_length=max_length) + + # The labels are the full prompt with response, but with the prompt masked out + labels = encoded_full_prompt_and_response.clone() + if mask_inputs: + labels[: len(encoded_full_prompt)] = ignore_index + + return {**example, "input_ids": encoded_full_prompt_and_response, "labels": labels} + + +def generate_prompt(example: dict) -> str: + """Generates a standardized message to prompt the model with an instruction, optional input and a + 'response' field.""" + + if example["input"]: + return ( + "Below is an instruction that describes a task, paired with an input that provides further context. " + "Write a response that appropriately completes the request.\n\n" + f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:" + ) + return ( + "Below is an instruction that describes a task. " + "Write a response that appropriately completes the request.\n\n" + f"### Instruction:\n{example['instruction']}\n\n### Response:" + ) + + +if __name__ == "__main__": + CLI(prepare) diff --git a/llm-lora-finetuning/scripts/prepare_longform.py b/llm-lora-finetuning/scripts/prepare_longform.py new file mode 100644 index 00000000..2a46e7dd --- /dev/null +++ b/llm-lora-finetuning/scripts/prepare_longform.py @@ -0,0 +1,136 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +"""Implementation derived from https://github.com/tloen/alpaca-lora""" + +import json +import sys +from pathlib import Path +from typing import Optional + +import torch +from tqdm import tqdm + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +from lit_gpt.tokenizer import Tokenizer +from lit_gpt.utils import CLI +from scripts.prepare_alpaca import download_if_missing + + +def prepare( + destination_path: Path = Path("data/longform"), + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + mask_inputs: bool = False, # as in alpaca-lora + ignore_index: int = -1, + max_seq_length: Optional[int] = None, +) -> None: + """Prepare the Alpaca dataset for instruction tuning. + + The output is a training and test dataset saved as `train.pt` and `test.pt`, + which stores the preprocessed and tokenized prompts and labels. 
+ """ + if max_seq_length is None: + with open(checkpoint_dir / "lit_config.json", "r", encoding="utf-8") as file: + config = json.load(file) + max_seq_length = config["block_size"] + + destination_path.mkdir(parents=True, exist_ok=True) + + train_file_name = "train.json" + # val_file_name = "val.json" + test_file_name = "test.json" + + train_file_url = "https://raw.githubusercontent.com/akoksal/LongForm/main/dataset/train.json" + # val_file_url = "https://raw.githubusercontent.com/akoksal/LongForm/main/dataset/val.json" + test_file_url = "https://raw.githubusercontent.com/akoksal/LongForm/main/dataset/test.json" + + train_file_path = destination_path / train_file_name + print("Loading train data file...") + download_if_missing(train_file_path, train_file_url) + with open(train_file_path, "r", encoding="utf-8") as file: + train_data = json.load(file) + + test_file_path = destination_path / test_file_name + print("Loading test data file...") + download_if_missing(test_file_path, test_file_url) + with open(test_file_path, "r", encoding="utf-8") as file: + test_data = json.load(file) + + print("Loading tokenizer...") + tokenizer = Tokenizer(checkpoint_dir) + + print(f"train has {len(train_data):,} samples") + print(f"test has {len(test_data):,} samples") + + print("Processing train set ...") + train_data = [ + prepare_sample( + example=sample, + tokenizer=tokenizer, + max_length=max_seq_length, + mask_inputs=mask_inputs, + ignore_index=ignore_index, + ) + for sample in tqdm(train_data) + ] + torch.save(train_data, destination_path / "train.pt") + + print("Processing test set ...") + test_data = [ + prepare_sample( + example=sample, + tokenizer=tokenizer, + max_length=max_seq_length, + mask_inputs=mask_inputs, + ignore_index=ignore_index, + ) + for sample in tqdm(test_data) + ] + torch.save(test_data, destination_path / "test.pt") + + +def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool, ignore_index: int) -> dict: + """Processes a single sample. + + Each sample in the dataset consists of: + - instruction: A string describing the task + - input: A string holding a special input value for the instruction. + This only applies to some samples, and in others this is empty. + - output: The response string + + This function processes this data to produce a prompt text and a label for + supervised training. The prompt text is formed as a single message including both + the instruction and the input. The label/target is the same message but with the + response attached. + + Finally, both the prompt and the label get tokenized. If desired, all tokens + in the label that correspond to the original input prompt get masked out (default). 
+ """ + full_prompt = generate_prompt(example) + full_prompt_and_response = full_prompt + example["output"] + encoded_full_prompt = tokenizer.encode(full_prompt, max_length=max_length) + encoded_full_prompt_and_response = tokenizer.encode(full_prompt_and_response, eos=True, max_length=max_length) + + # The labels are the full prompt with response, but with the prompt masked out + labels = encoded_full_prompt_and_response.clone() + if mask_inputs: + labels[: len(encoded_full_prompt)] = ignore_index + + return {**example, "input_ids": encoded_full_prompt_and_response, "labels": labels} + + +def generate_prompt(example: dict) -> str: + """Generates a standardized message to prompt the model with an instruction and a + 'response' field.""" + + return ( + "Below is an instruction that describes a task, paired with an input that provides further context. " + "Write a response that appropriately completes the request.\n\n" + f"### Instruction:\n{example['input']}\n\n### Response:" + ) + + +if __name__ == "__main__": + CLI(prepare) diff --git a/llm-lora-finetuning/scripts/prepare_openwebtext.py b/llm-lora-finetuning/scripts/prepare_openwebtext.py new file mode 100644 index 00000000..2578ab9f --- /dev/null +++ b/llm-lora-finetuning/scripts/prepare_openwebtext.py @@ -0,0 +1,81 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +# saves the openwebtext dataset to a binary file for training. following was helpful: +# https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py +import os +import sys +from pathlib import Path +from typing import Union + +import numpy as np +from tqdm import tqdm + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +from lit_gpt import Tokenizer +from lit_gpt.utils import CLI + + +def prepare( + destination_path: Path = Path("data/openwebtext"), + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + seed: int = 42, + test_size: Union[float, int, None] = 0.0005, +) -> None: + from datasets import load_dataset # huggingface datasets + + destination_path.mkdir(parents=True, exist_ok=True) + + tokenizer = Tokenizer(checkpoint_dir) + + # number of workers in .map() call + # good number to use is ~order number of cpu cores // 2 + num_proc = os.cpu_count() // 2 + + # number of workers in load_dataset() call + # best number might be different from num_proc above as it also depends on HW speed. + # it is better than 1 usually though + num_proc_load_dataset = num_proc + + # takes 54GB in huggingface .cache dir, about 8M documents (8,013,769) + dataset = load_dataset("openwebtext", num_proc=num_proc_load_dataset) + + # owt by default only contains the 'train' split, so create a test split + split_dataset = dataset["train"].train_test_split(test_size=test_size, seed=seed, shuffle=True) + split_dataset["val"] = split_dataset.pop("test") # rename the test split to val + + def process(example): + ids = tokenizer.encode(example["text"]).tolist() + ids.append(tokenizer.eos_id) + + # ids = enc.encode_ordinary(example['text']) # encode_ordinary ignores any special tokens + # ids.append(enc.eot_token) # add the end of text token, e.g. 50256 for gpt2 bpe + # note: I think eot should be prepended not appended... hmm. it's called "eot" though... 
+ return {"ids": ids, "len": len(ids)} + + # tokenize the dataset + tokenized = split_dataset.map(process, remove_columns=["text"], desc="tokenizing the splits", num_proc=num_proc) + + # concatenate all the ids in each dataset into one large file we can use for training + for split, dset in tokenized.items(): + arr_len = np.sum(dset["len"], dtype=np.uint64) + filename = destination_path / f"{split}.bin" + dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16) + arr = np.memmap(str(filename), dtype=dtype, mode="w+", shape=(arr_len,)) + total_batches = 1024 + + idx = 0 + for batch_idx in tqdm(range(total_batches), desc=f"writing {filename}"): + # Batch together samples for faster write + batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format("numpy") + arr_batch = np.concatenate(batch["ids"]) + # Write into mmap + arr[idx : idx + len(arr_batch)] = arr_batch + idx += len(arr_batch) + arr.flush() + + +if __name__ == "__main__": + CLI(prepare) diff --git a/llm-lora-finetuning/scripts/prepare_redpajama.py b/llm-lora-finetuning/scripts/prepare_redpajama.py new file mode 100644 index 00000000..f2c87a33 --- /dev/null +++ b/llm-lora-finetuning/scripts/prepare_redpajama.py @@ -0,0 +1,166 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +import glob +import json +import os +import sys +from pathlib import Path + +import numpy as np +from tqdm import tqdm + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +import lit_gpt.packed_dataset as packed_dataset +from lit_gpt import Config, Tokenizer +from lit_gpt.utils import CLI + +filenames_sample = [ + "arxiv_sample.jsonl", + "book_sample.jsonl", + "c4_sample.jsonl", + "cc_2019-30_sample.jsonl", + "cc_2020-05_sample.jsonl", + "cc_2021-04_sample.jsonl", + "cc_2022-05_sample.jsonl", + "cc_2023-06_sample.jsonl", + "github_sample.jsonl", + "stackexchange_sample.jsonl", + "wikipedia_sample.jsonl", +] + +filename_sets = { + "arxiv": "arxiv/arxiv*", + "book": "book/book*", + "c4": "c4/c4-train*", + "common_crawl": "common_crawl/*", + "github": "github/filtered*", + "stackexchange": "stackexchange/stackexchange*", + "wikipedia": "wikipedia/wiki*", +} + + +def prepare_sample( + source_path: Path, checkpoint_dir: Path, destination_path: Path, chunk_size: int, match: str = "" +) -> None: + """Prepare the "Red Pajama" dataset using the original tokenizer.""" + destination_path.mkdir(parents=True, exist_ok=True) + + tokenizer = Tokenizer(checkpoint_dir) + + for name in filenames_sample: + if match and match not in name: + continue + + filepath = source_path / name + + if not filepath.is_file(): + raise RuntimeError( + f"Input file not found at {filepath}. \nMake sure you download the data, e.g. 
wget -i" + " https://data.together.xyz/redpajama-data-1T/v1.0.0/urls.txt or through" + " \nhttps://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T" + " \nhttps://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T-Sample \n" + ) + + prefix, _ = os.path.splitext(name) + + builder = packed_dataset.PackedDatasetBuilder( + outdir=destination_path, + prefix=prefix, + chunk_size=chunk_size, + sep_token=tokenizer.eos_id, + dtype="auto", + vocab_size=tokenizer.vocab_size, + ) + + print(f"Processing {name}") + + with open(filepath, encoding="utf-8") as f: + for row in tqdm(f): + text = json.loads(row)["text"] + text_ids = tokenizer.encode(text) + builder.add_array(np.array(text_ids, dtype=builder.dtype)) + + builder.write_reminder() + + +def prepare_full( + source_path: Path, checkpoint_dir: Path, destination_path: Path, chunk_size: int, match: str = "" +) -> None: + """Prepare the "Red Pajama" dataset using the original tokenizer.""" + import zstandard as zstd + + destination_path.mkdir(parents=True, exist_ok=True) + + tokenizer = Tokenizer(checkpoint_dir) + + for set_name, pattern in filename_sets.items(): + if match and match not in set_name: + continue + + is_cc = set_name == "common_crawl" + + filenames = glob.glob(os.path.join(source_path, pattern), recursive=True) + + if not filenames: + raise RuntimeError( + f"No files matching {pattern} found at {source_path}. \nMake sure you download the data, e.g. wget -i" + " https://data.together.xyz/redpajama-data-1T/v1.0.0/urls.txt or through" + " \nhttps://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T" + " \nhttps://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T-Sample \n" + ) + + builder = packed_dataset.PackedDatasetBuilder( + outdir=destination_path, + prefix=set_name, + chunk_size=chunk_size, + sep_token=tokenizer.eos_id, + dtype="auto", + vocab_size=tokenizer.vocab_size, + ) + + for name in filenames: + filepath = source_path / name + + print(f"Processing {name}") + + if is_cc: + with zstd.open(open(filepath, "rb"), "rt", encoding="utf-8") as f: + for row in tqdm(f): + text = json.loads(row)["text"] + text_ids = tokenizer.encode(text) + builder.add_array(np.array(text_ids, dtype=builder.dtype)) + else: + with open(filepath, encoding="utf-8") as f: + for row in tqdm(f): + text = json.loads(row)["text"] + text_ids = tokenizer.encode(text) + builder.add_array(np.array(text_ids, dtype=builder.dtype)) + + builder.write_reminder() + + +def prepare( + source_path: Path = Path("data/RedPajama-Data-1T-Sample"), + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + destination_path: Path = Path("data/redpajama_sample"), + sample: bool = True, + match: str = "", +) -> None: + """Prepare the "Red Pajama" dataset. We assume tokenizer has been trained.""" + config = Config.from_checkpoint(checkpoint_dir) + + prepare_fn = prepare_sample if sample else prepare_full + prepare_fn( + source_path=source_path, + checkpoint_dir=checkpoint_dir, + destination_path=destination_path, + chunk_size=(config.block_size + 1) * 1024, # block size + 1 for causal, 1024 blocks + match=match, + ) + + +if __name__ == "__main__": + CLI(prepare) diff --git a/llm-lora-finetuning/scripts/prepare_slimpajama.py b/llm-lora-finetuning/scripts/prepare_slimpajama.py new file mode 100644 index 00000000..7a83316a --- /dev/null +++ b/llm-lora-finetuning/scripts/prepare_slimpajama.py @@ -0,0 +1,63 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
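As the inline comment notes, the chunk_size handed to prepare_fn reserves block_size + 1 tokens per packed example (the inputs plus the one-token shift needed for causal targets) and 1024 such examples per chunk file. A small arithmetic sketch with an assumed block size; the real value comes from Config.from_checkpoint(checkpoint_dir):

block_size = 2048  # assumed here for illustration only

tokens_per_example = block_size + 1   # inputs are tokens [0:block_size], targets are tokens [1:block_size + 1]
examples_per_chunk = 1024             # the "1024 blocks" from the comment above
chunk_size = tokens_per_example * examples_per_chunk

print(chunk_size)  # 2098176 tokens per packed chunk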
+ +import json +import os +import sys +import time +from pathlib import Path + +import zstandard as zstd +from lightning.data.streaming import DataChunkRecipe, DataProcessor + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +from lit_gpt import Tokenizer +from lit_gpt.utils import CLI + + +class SlimPajamaDataRecipe(DataChunkRecipe): + def __init__(self, tokenizer: Tokenizer, chunk_size: int): + super().__init__(chunk_size) + self.tokenizer = tokenizer + + def prepare_structure(self, input_dir): + files = Path(input_dir).rglob("*.zst") + return [str(file) for file in files] + + def prepare_item(self, filepath): + with zstd.open(open(filepath, "rb"), "rt", encoding="utf-8") as f: + for row in f: + text = json.loads(row)["text"] + if json.loads(row)["meta"]["redpajama_set_name"] == "RedPajamaGithub": + continue # exclude the GitHub data since it overlaps with starcoder + text_ids = self.tokenizer.encode(text, bos=False, eos=True) + yield text_ids + + +def prepare( + input_dir: Path = Path("data/SlimPajama-627B/train"), + output_dir: Path = Path("data/slimpajama/train"), + tokenizer_path: Path = Path("checkpoints/Llama-2-7b-hf/"), + chunk_size: int = (2049 * 16384), + fast_dev_run: bool = False, +) -> None: + tokenizer = Tokenizer(tokenizer_path) + data_recipe = SlimPajamaDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size) + data_processor = DataProcessor( + input_dir=str(input_dir), + output_dir=str(output_dir), + fast_dev_run=fast_dev_run, + num_workers=os.cpu_count(), + num_downloaders=1, + ) + + start_time = time.time() + data_processor.run(data_recipe) + elapsed_time = time.time() - start_time + print(f"Time taken: {elapsed_time:.2f} seconds") + + +if __name__ == "__main__": + CLI(prepare) diff --git a/llm-lora-finetuning/scripts/prepare_starcoder.py b/llm-lora-finetuning/scripts/prepare_starcoder.py new file mode 100644 index 00000000..ea260ebc --- /dev/null +++ b/llm-lora-finetuning/scripts/prepare_starcoder.py @@ -0,0 +1,74 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
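SlimPajamaDataRecipe.prepare_item above streams zstd-compressed jsonl rows and drops any row whose meta marks it as RedPajamaGithub, since the StarCoder preparation below already covers that source. A simplified mirror of that generator for plain (uncompressed) lines, parsing each row once, with hypothetical rows and a stand-in tokenizer:

import json

def iter_token_ids(lines, encode):
    for row in lines:
        record = json.loads(row)
        if record["meta"]["redpajama_set_name"] == "RedPajamaGithub":
            continue  # overlaps with the StarCoder data, so it is skipped
        yield encode(record["text"])

rows = [
    '{"text": "hello world", "meta": {"redpajama_set_name": "RedPajamaC4"}}',
    '{"text": "def f(): pass", "meta": {"redpajama_set_name": "RedPajamaGithub"}}',
]
print(list(iter_token_ids(rows, encode=str.split)))  # only the first row survives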
+ +import os +import sys +import time +import traceback +from pathlib import Path + +import pyarrow.parquet as pq +from lightning.data.streaming import DataChunkRecipe, DataProcessor + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +from lit_gpt import Tokenizer +from lit_gpt.utils import CLI + + +class StarcoderDataRecipe(DataChunkRecipe): + def __init__(self, tokenizer: Tokenizer, chunk_size: int): + super().__init__(chunk_size) + self.tokenizer = tokenizer + + def prepare_structure(self, input_dir): + files = Path(input_dir).rglob("*.parquet") + return [str(file) for file in files] + + def prepare_item(self, item_metadata): + filepath = item_metadata + start = time.time() + + try: + parquet_file = pq.ParquetFile(filepath) + # reduce RAM usage + for batch in parquet_file.iter_batches(batch_size=8192, columns=["content"]): + for text in batch.to_pandas()["content"]: + yield self.tokenizer.encode(text, bos=False, eos=True) + + except Exception: + print(traceback.format_exc()) + print(f"Error reading {filepath}") + return + + parquet_file.close() + end = time.time() + print(f"Took {end - start:.2f} seconds total", filepath) + + +def prepare( + input_dir: Path = Path("data/starcoderdata"), + output_dir: Path = Path("data/starcoder"), + tokenizer_path: Path = Path("checkpoints/Llama-2-7b-hf/"), + chunk_size: int = (2049 * 8192), + fast_dev_run: bool = False, +) -> None: + tokenizer = Tokenizer(tokenizer_path) + data_recipe = StarcoderDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size) + data_processor = DataProcessor( + input_dir=str(input_dir), + output_dir=str(output_dir), + fast_dev_run=fast_dev_run, + num_workers=os.cpu_count(), + num_downloaders=1, + ) + + start_time = time.time() + data_processor.run(data_recipe) + elapsed_time = time.time() - start_time + print(f"Time taken: {elapsed_time:.2f} seconds") + + +if __name__ == "__main__": + CLI(prepare) diff --git a/llm-lora-finetuning/steps/__init__.py b/llm-lora-finetuning/steps/__init__.py new file mode 100644 index 00000000..757bd841 --- /dev/null +++ b/llm-lora-finetuning/steps/__init__.py @@ -0,0 +1,16 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py new file mode 100644 index 00000000..35dcb22f --- /dev/null +++ b/llm-lora-finetuning/steps/finetune.py @@ -0,0 +1,43 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
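StarcoderDataRecipe.prepare_item above reads only the "content" column in 8192-row batches so a large parquet shard never has to be materialized in memory at once. A self-contained toy showing the same pyarrow access pattern on a small made-up file (the file name and contents are placeholders):

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"content": ["print('hi')", "x = 1", "def f(): ..."], "lang": ["py", "py", "py"]})
pq.write_table(table, "toy_shard.parquet")

parquet_file = pq.ParquetFile("toy_shard.parquet")
for batch in parquet_file.iter_batches(batch_size=2, columns=["content"]):
    for text in batch.to_pandas()["content"]:
        print(len(text))  # the real recipe yields tokenizer.encode(text, bos=False, eos=True) here
parquet_file.close()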
+from pathlib import Path +from typing import Tuple, Annotated + +from lit_gpt.args import IOArgs +from zenml import step + +from scripts.download import download_from_hub +from scripts.convert_hf_checkpoint import convert_hf_checkpoint +from scripts.prepare_alpaca import prepare +from finetune.lora import setup +from scripts.merge_lora import merge_lora +import shutil + +@step +def finetune_lora(repo_id: str) -> Tuple[Annotated[str, "checkpoint_dir"], Annotated[str, "output_path"]]: + checkpoint_dir = Path("checkpoints") + data_dir = Path("data/alpaca") + output_dir = Path("out/lora/alpaca") + download_from_hub(repo_id=repo_id, checkpoint_dir=checkpoint_dir) + convert_hf_checkpoint(checkpoint_dir=checkpoint_dir) + prepare(destination_path=data_dir, checkpoint_dir=checkpoint_dir) + + io_args = IOArgs( + train_data_dir=data_dir, + val_data_dir=data_dir, + checkpoint_dir=checkpoint_dir, + out_dir=output_dir, + ), + setup(precision="bf16-true", io=io_args) + + model_name = repo_id.split("/")[-1] + lora_path = output_dir / model_name / "lit_model_lora_finetuned.pth" + + merge_output_dir = Path("out/lora_merged") / model_name + merge_lora(lora_alpha=lora_path, checkpoint_dir=checkpoint_dir, out_dir=merge_output_dir) + + for path in Path(checkpoint_dir).glob('*.json'): + destination = Path(merge_output_dir) / path.name + + shutil.copy(src=path, dst=destination) + + return checkpoint_dir, lora_path diff --git a/llm-lora-finetuning/steps/merge.py b/llm-lora-finetuning/steps/merge.py new file mode 100644 index 00000000..61a86e55 --- /dev/null +++ b/llm-lora-finetuning/steps/merge.py @@ -0,0 +1,28 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. +from pathlib import Path + + + + +from pathlib import Path + + + + +from lit_gpt.args import IOArgs +from zenml import step + +from scripts.download import download_from_hub +from scripts.merge_lora import merge_lora +from scripts.prepare_alpaca import prepare +from finetune.lora import setup +import shutil + +@step +def merge(checkpoint_dir: str, lora_path: str, out_dir: str) -> None: + merge_lora(lora_alpha=Path(lora_path), checkpoint_dir=Path(checkpoint_dir), out_dir=Path(out_dir)) + + for path in Path(checkpoint_dir).glob('*.json'): + destination = Path(out_dir) / path.name + + shutil.copy(src=path, dst=destination) \ No newline at end of file From 4c3662ee3de5dcd10267b8194222896e4cac26be Mon Sep 17 00:00:00 2001 From: Michael Schuster Date: Tue, 5 Mar 2024 12:13:30 +0800 Subject: [PATCH 02/26] wip --- .../evaluate/lm_eval_harness.py | 231 ++++++++++++++++++ llm-lora-finetuning/finetune/adapter.py | 184 +++++++++++--- llm-lora-finetuning/finetune/adapter_v2.py | 184 +++++++++++--- llm-lora-finetuning/finetune/full.py | 165 ++++++++++--- llm-lora-finetuning/finetune/lora.py | 186 +++++++++++--- llm-lora-finetuning/generate/adapter.py | 64 ++++- llm-lora-finetuning/generate/adapter_v2.py | 64 ++++- llm-lora-finetuning/generate/base.py | 83 +++++-- llm-lora-finetuning/generate/full.py | 60 ++++- llm-lora-finetuning/generate/lora.py | 60 ++++- llm-lora-finetuning/generate/sequentially.py | 126 +++++++--- llm-lora-finetuning/generate/tp.py | 112 +++++++-- llm-lora-finetuning/lit_gpt/__init__.py | 12 +- llm-lora-finetuning/lit_gpt/adapter.py | 70 ++++-- llm-lora-finetuning/lit_gpt/adapter_v2.py | 89 +++++-- llm-lora-finetuning/lit_gpt/args.py | 8 +- llm-lora-finetuning/lit_gpt/config.py | 72 ++++-- llm-lora-finetuning/lit_gpt/lora.py | 145 ++++++++--- llm-lora-finetuning/lit_gpt/model.py | 191 ++++++++++++--- 
llm-lora-finetuning/lit_gpt/packed_dataset.py | 49 +++- llm-lora-finetuning/lit_gpt/rmsnorm.py | 8 +- llm-lora-finetuning/lit_gpt/tokenizer.py | 47 +++- llm-lora-finetuning/lit_gpt/utils.py | 160 +++++++++--- llm-lora-finetuning/pipelines/eval.py | 9 + .../pipelines/feature_engineering.py | 7 + llm-lora-finetuning/pipelines/finetuning.py | 19 +- llm-lora-finetuning/pipelines/merge.py | 16 +- llm-lora-finetuning/run.py | 39 ++- .../scripts/convert_hf_checkpoint.py | 37 ++- .../scripts/convert_lit_checkpoint.py | 22 +- .../scripts/convert_pretrained_checkpoint.py | 16 +- llm-lora-finetuning/scripts/download.py | 17 +- llm-lora-finetuning/scripts/merge_lora.py | 17 +- llm-lora-finetuning/scripts/prepare_alpaca.py | 30 ++- llm-lora-finetuning/scripts/prepare_csv.py | 30 ++- llm-lora-finetuning/scripts/prepare_dolly.py | 31 ++- llm-lora-finetuning/scripts/prepare_flan.py | 27 +- llm-lora-finetuning/scripts/prepare_lima.py | 50 +++- .../scripts/prepare_longform.py | 27 +- .../scripts/prepare_openwebtext.py | 35 ++- .../scripts/prepare_redpajama.py | 35 ++- .../scripts/prepare_slimpajama.py | 9 +- .../scripts/prepare_starcoder.py | 8 +- llm-lora-finetuning/steps/eval.py | 45 ++++ .../steps/feature_engineering.py | 24 ++ llm-lora-finetuning/steps/finetune.py | 71 ++++-- llm-lora-finetuning/steps/merge.py | 59 +++-- 47 files changed, 2478 insertions(+), 572 deletions(-) create mode 100644 llm-lora-finetuning/evaluate/lm_eval_harness.py create mode 100644 llm-lora-finetuning/pipelines/eval.py create mode 100644 llm-lora-finetuning/pipelines/feature_engineering.py create mode 100644 llm-lora-finetuning/steps/eval.py create mode 100644 llm-lora-finetuning/steps/feature_engineering.py diff --git a/llm-lora-finetuning/evaluate/lm_eval_harness.py b/llm-lora-finetuning/evaluate/lm_eval_harness.py new file mode 100644 index 00000000..6f90c19f --- /dev/null +++ b/llm-lora-finetuning/evaluate/lm_eval_harness.py @@ -0,0 +1,231 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
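For context on how the pieces from the first patch fit together, here is a hypothetical ZenML pipeline wiring the finetune_lora and merge steps defined in steps/finetune.py and steps/merge.py above; the repository's own pipelines/finetuning.py and pipelines/merge.py may differ in naming, defaults, and step options, and the default repo_id below is only an example:

from zenml import pipeline

from steps.finetune import finetune_lora
from steps.merge import merge


@pipeline
def llm_lora_finetuning_pipeline(repo_id: str = "stabilityai/stablelm-base-alpha-3b") -> None:
    # finetune_lora returns (checkpoint_dir, lora_path); both feed the merge step.
    checkpoint_dir, lora_path = finetune_lora(repo_id=repo_id)
    merge(checkpoint_dir=checkpoint_dir, lora_path=lora_path, out_dir="out/lora_merged")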
+ +import json +import sys +from pathlib import Path +from typing import Dict, List, Literal, Optional + +import lightning as L +import torch +from lightning.fabric.plugins import BitsandbytesPrecision +from lm_eval import base, evaluator, tasks +from lm_eval.base import BaseLM + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +from generate.base import generate +from lit_gpt import GPT, Config, Tokenizer +from lit_gpt.utils import ( + CLI, + check_valid_checkpoint_dir, + get_default_supported_precision, + load_checkpoint, +) + + +class EvalHarnessBase(BaseLM): + # Credits: + # https://github.com/EleutherAI/gpt-neox/blob/main/eval_tasks/eval_adapter.py + def __init__( + self, + fabric: L.Fabric, + model: GPT, + tokenizer: Tokenizer, + batch_size: int, + ): + super().__init__() + self.fabric = fabric + self.model = model + self.tokenizer = tokenizer + self.batch_size_per_gpu = batch_size + with fabric.init_tensor(): + model.set_kv_cache(batch_size=batch_size) + + @classmethod + def create_from_arg_string(cls, arg_string, additional_config=None): + kwargs = { + el.split("=")[0]: el.split("=")[1] for el in arg_string.split(",") + } + return cls(**kwargs, **additional_config) + + @property + def eot_token_id(self): + # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* + return self.tokenizer.eos_id + + @property + def max_length(self): + return self.model.max_seq_length + + @property + def vocab_size(self): + return self.tokenizer.vocab_size + + @property + def max_gen_toks(self): + return 256 + + @property + def batch_size(self): + return self.batch_size_per_gpu * self.fabric.world_size + + @property + def device(self): + return self.fabric.device + + def tok_encode(self, string: str) -> List[int]: + return self.tokenizer.encode(string, bos=False, eos=False).tolist() + + def tok_decode(self, tokens: List[int]) -> str: + t = torch.tensor(tokens) + return self.tokenizer.decode(t) + + @torch.inference_mode() + def _model_call(self, inps): + return self.model(inps) + + @torch.inference_mode() + def _model_generate( + self, context, max_length, eos_token_id + ) -> torch.Tensor: + # this only supports batch size 1 + assert context.shape[0] == 1 + out = generate(self.model, context[0], max_length, eos_id=eos_token_id) + for block in self.model.transformer.h: + block.attn.kv_cache.reset_parameters() + return out.unsqueeze(0) + + @torch.inference_mode() + def run_eval( + self, + eval_tasks: List[str], + num_fewshot: int, + limit: Optional[int], + bootstrap_iters: int, + no_cache: bool, + ) -> Dict: + # Returns a list containing all values of the task registry that + # match at least one of the patterns + import fnmatch + + def pattern_match(patterns, source_list): + task_names = set() + for pattern in patterns: + for matching in fnmatch.filter(source_list, pattern): + task_names.add(matching) + return list(task_names) + + eval_tasks = pattern_match(eval_tasks, tasks.ALL_TASKS) + print(f"Found tasks: {eval_tasks}") + + # **HACK INCOMING**: + # first get task dict on local main rank + # the tasks are downloaded *as they are initialized*, and the downloads don't like multithreading. + # so we download them once on the local main rank, wait, and then initialize them on all other ranks, which *should* load from the cache. 
+ if self.fabric.local_rank == 0: + tasks.get_task_dict(eval_tasks) + # torch barrier + self.fabric.barrier() + tasks.get_task_dict(eval_tasks) + + lm = self + if not no_cache: + lm = base.CachingLM(lm, "lm_cache/lit-gpt.db") + + results = evaluator.evaluate( + lm=lm, + task_dict=tasks.get_task_dict(eval_tasks), + num_fewshot=num_fewshot, + limit=limit, + bootstrap_iters=bootstrap_iters, + ) + results["config"] = dict( + model=self.model.config.name, + batch_size=self.batch_size, + device=str(self.device), + num_fewshot=num_fewshot, + limit=limit, + bootstrap_iters=bootstrap_iters, + no_cache=no_cache, + ) + return results + + +@torch.inference_mode() +def run_eval_harness( + checkpoint_dir: Path, + precision: Optional[str] = None, + quantize: Optional[ + Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"] + ] = None, + eval_tasks: List[str] = [ + "arc_challenge", + "piqa", + "hellaswag", + "hendrycksTest-*", + ], + save_filepath: Optional[Path] = None, + num_fewshot: int = 0, + limit: Optional[int] = None, + bootstrap_iters: int = 100000, + no_cache: bool = True, +): + if precision is None: + precision = get_default_supported_precision(training=False) + + plugins = None + if quantize is not None and quantize.startswith("bnb."): + if "mixed" in precision: + raise ValueError( + "Quantization and mixed precision is not supported." + ) + dtype = { + "16-true": torch.float16, + "bf16-true": torch.bfloat16, + "32-true": torch.float32, + }[precision] + plugins = BitsandbytesPrecision(quantize[4:], dtype) + precision = None + + fabric = L.Fabric(devices=1, precision=precision, plugins=plugins) + + check_valid_checkpoint_dir(checkpoint_dir) + tokenizer = Tokenizer(checkpoint_dir) + + config = Config.from_json(checkpoint_dir / "lit_config.json") + + checkpoint_path = checkpoint_dir / "lit_model.pth" + + print( + f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", + file=sys.stderr, + ) + with fabric.init_module(empty_init=True): + model = GPT(config) + + model.eval() + model = fabric.setup_module(model) + + load_checkpoint(fabric, model, checkpoint_path) + + eval_harness = EvalHarnessBase(fabric, model, tokenizer, 1) + + results = eval_harness.run_eval( + eval_tasks, num_fewshot, limit, bootstrap_iters, no_cache + ) + if save_filepath is None: + print(results) + else: + print(f"Saving results to {str(save_filepath)!r}") + save_filepath.parent.mkdir(parents=True, exist_ok=True) + data = json.dumps(results) + with open(save_filepath, "w") as fw: + fw.write(data) + + +if __name__ == "__main__": + torch.set_float32_matmul_precision("high") + + CLI(run_eval_harness) diff --git a/llm-lora-finetuning/finetune/adapter.py b/llm-lora-finetuning/finetune/adapter.py index 862e2333..acf8f6d4 100644 --- a/llm-lora-finetuning/finetune/adapter.py +++ b/llm-lora-finetuning/finetune/adapter.py @@ -18,7 +18,13 @@ sys.path.append(str(wd)) from generate.base import generate -from lit_gpt.adapter import GPT, Block, Config, adapter_filter, mark_only_adapter_as_trainable +from lit_gpt.adapter import ( + GPT, + Block, + Config, + adapter_filter, + mark_only_adapter_as_trainable, +) from lit_gpt.args import EvalArgs, IOArgs, TrainArgs from lit_gpt.tokenizer import Tokenizer from lit_gpt.utils import ( @@ -29,12 +35,21 @@ load_checkpoint, num_parameters, ) + from scripts.prepare_alpaca import generate_prompt def setup( precision: Optional[str] = None, - quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8-training"]] = None, + quantize: Optional[ + Literal[ + 
"bnb.nf4", + "bnb.nf4-dq", + "bnb.fp4", + "bnb.fp4-dq", + "bnb.int8-training", + ] + ] = None, devices: int = 1, io: IOArgs = IOArgs( train_data_dir=Path("data/alpaca"), @@ -61,8 +76,14 @@ def setup( plugins = None if quantize is not None and quantize.startswith("bnb."): if "mixed" in precision: - raise ValueError("Quantization and mixed precision is not supported.") - dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + raise ValueError( + "Quantization and mixed precision is not supported." + ) + dtype = { + "16-true": torch.float16, + "bf16-true": torch.bfloat16, + "32-true": torch.float32, + }[precision] plugins = BitsandbytesPrecision(quantize[4:], dtype) precision = None @@ -82,12 +103,36 @@ def setup( else: strategy = "auto" - logger = CSVLogger(io.out_dir.parent, io.out_dir.name, flush_logs_every_n_steps=train.log_interval) - fabric = L.Fabric(devices=devices, strategy=strategy, precision=precision, loggers=logger, plugins=plugins) - fabric.launch(main, devices, Config.from_name(name=io.checkpoint_dir.name), io, train, eval) + logger = CSVLogger( + io.out_dir.parent, + io.out_dir.name, + flush_logs_every_n_steps=train.log_interval, + ) + fabric = L.Fabric( + devices=devices, + strategy=strategy, + precision=precision, + loggers=logger, + plugins=plugins, + ) + fabric.launch( + main, + devices, + Config.from_name(name=io.checkpoint_dir.name), + io, + train, + eval, + ) -def main(fabric: L.Fabric, devices: int, config: Config, io: IOArgs, train: TrainArgs, eval: EvalArgs) -> None: +def main( + fabric: L.Fabric, + devices: int, + config: Config, + io: IOArgs, + train: TrainArgs, + eval: EvalArgs, +) -> None: validate_args(io, train, eval) steps_per_epoch = train.epoch_size // devices // train.batch_size(devices) @@ -95,7 +140,9 @@ def main(fabric: L.Fabric, devices: int, config: Config, io: IOArgs, train: Trai check_valid_checkpoint_dir(io.checkpoint_dir) - fabric.seed_everything(1337) # same seed for every process to init model (FSDP) + fabric.seed_everything( + 1337 + ) # same seed for every process to init model (FSDP) if fabric.global_rank == 0: os.makedirs(io.out_dir, exist_ok=True) @@ -104,13 +151,19 @@ def main(fabric: L.Fabric, devices: int, config: Config, io: IOArgs, train: Trai val_data = torch.load(io.val_data_dir / "test.pt") checkpoint_path = io.checkpoint_dir / "lit_model.pth" - fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}") + fabric.print( + f"Loading model {str(checkpoint_path)!r} with {config.__dict__}" + ) with fabric.init_module(empty_init=(devices > 1)): model = GPT(config) mark_only_adapter_as_trainable(model) - fabric.print(f"Number of trainable parameters: {num_parameters(model, requires_grad=True):,}") - fabric.print(f"Number of non trainable parameters: {num_parameters(model, requires_grad=False):,}") + fabric.print( + f"Number of trainable parameters: {num_parameters(model, requires_grad=True):,}" + ) + fabric.print( + f"Number of non trainable parameters: {num_parameters(model, requires_grad=False):,}" + ) model = fabric.setup_module(model) @@ -122,10 +175,15 @@ def main(fabric: L.Fabric, devices: int, config: Config, io: IOArgs, train: Trai else: optimizer_cls = torch.optim.AdamW optimizer = optimizer_cls( - trainable_params, lr=train.learning_rate, weight_decay=train.weight_decay, betas=(train.beta1, train.beta2) + trainable_params, + lr=train.learning_rate, + weight_decay=train.weight_decay, + betas=(train.beta1, train.beta2), ) optimizer = 
fabric.setup_optimizers(optimizer) - scheduler = get_lr_scheduler(optimizer, warmup_steps=train.lr_warmup_steps, max_steps=lr_max_steps) + scheduler = get_lr_scheduler( + optimizer, warmup_steps=train.lr_warmup_steps, max_steps=lr_max_steps + ) # strict=False because missing keys due to Adapter weights not contained in state dict load_checkpoint(fabric, model, checkpoint_path, strict=False) @@ -133,10 +191,23 @@ def main(fabric: L.Fabric, devices: int, config: Config, io: IOArgs, train: Trai fabric.seed_everything(1337 + fabric.global_rank) train_time = time.perf_counter() - fit(fabric, model, optimizer, scheduler, train_data, val_data, devices, io, train, eval) + fit( + fabric, + model, + optimizer, + scheduler, + train_data, + val_data, + devices, + io, + train, + eval, + ) fabric.print(f"Training time: {(time.perf_counter()-train_time):.2f}s") if fabric.device.type == "cuda": - fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") + fabric.print( + f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB" + ) # Save the final checkpoint at the end of training save_path = io.out_dir / "lit_model_adapter_finetuned.pth" @@ -157,13 +228,22 @@ def fit( ) -> None: tokenizer = Tokenizer(io.checkpoint_dir) longest_seq_length, longest_seq_ix = get_longest_seq_length(train_data) - model.max_seq_length = min(longest_seq_length, train.max_seq_length or float("inf")) + model.max_seq_length = min( + longest_seq_length, train.max_seq_length or float("inf") + ) fabric.print( f"The longest sequence length in the train data is {longest_seq_length}, the model's maximum sequence length is" f" {model.max_seq_length} and context length is {model.config.block_size}" ) - validate(fabric, model, val_data, tokenizer, dataclasses.replace(eval, max_iters=2), train) # sanity check + validate( + fabric, + model, + val_data, + tokenizer, + dataclasses.replace(eval, max_iters=2), + train, + ) # sanity check throughput = ThroughputMonitor(fabric, window_size=50) step_count = 0 @@ -174,10 +254,16 @@ def fit( iter_t0 = time.perf_counter() input_ids, targets = get_batch( - fabric, train_data, train.micro_batch_size, train.max_seq_length, longest_seq_ix if iter_num == 1 else None + fabric, + train_data, + train.micro_batch_size, + train.max_seq_length, + longest_seq_ix if iter_num == 1 else None, ) - is_accumulating = iter_num % train.gradient_accumulation_iters(devices) != 0 + is_accumulating = ( + iter_num % train.gradient_accumulation_iters(devices) != 0 + ) with fabric.no_backward_sync(model, enabled=is_accumulating): logits = model(input_ids, lm_head_chunk_size=128) # shift the targets such that output n predicts token n+1 @@ -196,7 +282,10 @@ def fit( loss_item = loss.item() # expensive device-to-host synchronization t1 = time.perf_counter() throughput.update( - time=t1 - total_t0, batches=iter_num, samples=iter_num * train.micro_batch_size, lengths=total_lengths + time=t1 - total_t0, + batches=iter_num, + samples=iter_num * train.micro_batch_size, + lengths=total_lengths, ) throughput.compute_and_log(step=iter_num) fabric.print( @@ -206,9 +295,13 @@ def fit( if not is_accumulating and step_count % eval.interval == 0: t0 = time.perf_counter() - val_loss = validate(fabric, model, val_data, tokenizer, eval, train) + val_loss = validate( + fabric, model, val_data, tokenizer, eval, train + ) t1 = time.perf_counter() - t0 - fabric.print(f"iter {iter_num}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f} ms") + fabric.print( + f"iter {iter_num}: val loss {val_loss.item():.4f}, val 
time: {t1 * 1000:.2f} ms" + ) fabric.barrier() if not is_accumulating and step_count % train.save_interval == 0: checkpoint_path = io.out_dir / f"iter-{iter_num:06d}-ckpt.pth" @@ -218,15 +311,24 @@ def fit( # the adapter "kv cache" cannot be initialized under `inference_mode` @torch.no_grad() def validate( - fabric: L.Fabric, model: GPT, val_data: List[Dict], tokenizer: Tokenizer, eval: EvalArgs, train: TrainArgs + fabric: L.Fabric, + model: GPT, + val_data: List[Dict], + tokenizer: Tokenizer, + eval: EvalArgs, + train: TrainArgs, ) -> torch.Tensor: fabric.print("Validating ...") model.eval() losses = torch.zeros(eval.max_iters) for k in range(eval.max_iters): - input_ids, targets = get_batch(fabric, val_data, train.micro_batch_size, train.max_seq_length) + input_ids, targets = get_batch( + fabric, val_data, train.micro_batch_size, train.max_seq_length + ) logits = model(input_ids) - losses[k] = chunked_cross_entropy(logits[..., :-1, :], targets[..., 1:], chunk_size=0) + losses[k] = chunked_cross_entropy( + logits[..., :-1, :], targets[..., 1:], chunk_size=0 + ) val_loss = losses.mean() # produce an example: @@ -239,7 +341,11 @@ def validate( # do not set `max_seq_length=max_returned_token` because memory is not a concern here model.set_kv_cache(batch_size=1) output = generate( - model, encoded, max_returned_tokens=len(encoded) + eval.max_new_tokens, temperature=0.8, eos_id=tokenizer.eos_id + model, + encoded, + max_returned_tokens=len(encoded) + eval.max_new_tokens, + temperature=0.8, + eos_id=tokenizer.eos_id, ) model.clear_kv_cache() output = tokenizer.decode(output) @@ -289,9 +395,15 @@ def pad_right(x, pad_id): def get_lr_scheduler(optimizer, warmup_steps: int, max_steps: int): # linear warmup followed by cosine annealing - scheduler1 = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: step / warmup_steps) - scheduler2 = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=(max_steps - warmup_steps)) - return torch.optim.lr_scheduler.SequentialLR(optimizer, [scheduler1, scheduler2], milestones=[warmup_steps]) + scheduler1 = torch.optim.lr_scheduler.LambdaLR( + optimizer, lambda step: step / warmup_steps + ) + scheduler2 = torch.optim.lr_scheduler.CosineAnnealingLR( + optimizer, T_max=(max_steps - warmup_steps) + ) + return torch.optim.lr_scheduler.SequentialLR( + optimizer, [scheduler1, scheduler2], milestones=[warmup_steps] + ) def get_longest_seq_length(data: List[Dict]) -> Tuple[int, int]: @@ -302,7 +414,9 @@ def get_longest_seq_length(data: List[Dict]) -> Tuple[int, int]: return longest_seq_length, longest_seq_ix -def save_adapter_checkpoint(fabric: L.Fabric, model: torch.nn.Module, file_path: Path) -> None: +def save_adapter_checkpoint( + fabric: L.Fabric, model: torch.nn.Module, file_path: Path +) -> None: fabric.print(f"Saving adapter weights to {str(file_path)!r}") fabric.save(file_path, {"model": model}, filter={"model": adapter_filter}) @@ -313,7 +427,9 @@ def validate_args(io: IOArgs, train: TrainArgs, eval: EvalArgs) -> None: for args, names in unsupported: for name in names: if getattr(args, name) is not None: - issues.append(f"{__file__} doesn't support the {name!r} argument. This is set in {args}") + issues.append( + f"{__file__} doesn't support the {name!r} argument. 
This is set in {args}" + ) required = [ (io, ["checkpoint_dir", "train_data_dir", "val_data_dir"]), (train, ["epoch_size", "epochs"]), @@ -322,7 +438,9 @@ def validate_args(io: IOArgs, train: TrainArgs, eval: EvalArgs) -> None: for args, names in required: for name in names: if getattr(args, name) is None: - issues.append(f"{__file__} requires the {name!r} argument. This is set in {args}") + issues.append( + f"{__file__} requires the {name!r} argument. This is set in {args}" + ) if issues: raise ValueError("\n".join(issues)) diff --git a/llm-lora-finetuning/finetune/adapter_v2.py b/llm-lora-finetuning/finetune/adapter_v2.py index 8b5d0347..ac7de327 100644 --- a/llm-lora-finetuning/finetune/adapter_v2.py +++ b/llm-lora-finetuning/finetune/adapter_v2.py @@ -18,7 +18,13 @@ sys.path.append(str(wd)) from generate.base import generate -from lit_gpt.adapter_v2 import GPT, Block, Config, adapter_filter, mark_only_adapter_v2_as_trainable +from lit_gpt.adapter_v2 import ( + GPT, + Block, + Config, + adapter_filter, + mark_only_adapter_v2_as_trainable, +) from lit_gpt.args import EvalArgs, IOArgs, TrainArgs from lit_gpt.tokenizer import Tokenizer from lit_gpt.utils import ( @@ -29,12 +35,21 @@ load_checkpoint, num_parameters, ) + from scripts.prepare_alpaca import generate_prompt def setup( precision: Optional[str] = None, - quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8-training"]] = None, + quantize: Optional[ + Literal[ + "bnb.nf4", + "bnb.nf4-dq", + "bnb.fp4", + "bnb.fp4-dq", + "bnb.int8-training", + ] + ] = None, devices: int = 1, io: IOArgs = IOArgs( train_data_dir=Path("data/alpaca"), @@ -61,8 +76,14 @@ def setup( plugins = None if quantize is not None and quantize.startswith("bnb."): if "mixed" in precision: - raise ValueError("Quantization and mixed precision is not supported.") - dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + raise ValueError( + "Quantization and mixed precision is not supported." 
+ ) + dtype = { + "16-true": torch.float16, + "bf16-true": torch.bfloat16, + "32-true": torch.float32, + }[precision] plugins = BitsandbytesPrecision(quantize[4:], dtype) precision = None @@ -82,12 +103,36 @@ def setup( else: strategy = "auto" - logger = CSVLogger(io.out_dir.parent, io.out_dir.name, flush_logs_every_n_steps=train.log_interval) - fabric = L.Fabric(devices=devices, strategy=strategy, precision=precision, loggers=logger, plugins=plugins) - fabric.launch(main, devices, Config.from_name(name=io.checkpoint_dir.name), io, train, eval) + logger = CSVLogger( + io.out_dir.parent, + io.out_dir.name, + flush_logs_every_n_steps=train.log_interval, + ) + fabric = L.Fabric( + devices=devices, + strategy=strategy, + precision=precision, + loggers=logger, + plugins=plugins, + ) + fabric.launch( + main, + devices, + Config.from_name(name=io.checkpoint_dir.name), + io, + train, + eval, + ) -def main(fabric: L.Fabric, devices: int, config: Config, io: IOArgs, train: TrainArgs, eval: EvalArgs) -> None: +def main( + fabric: L.Fabric, + devices: int, + config: Config, + io: IOArgs, + train: TrainArgs, + eval: EvalArgs, +) -> None: validate_args(io, train, eval) steps_per_epoch = train.epoch_size // devices // train.batch_size(devices) @@ -95,7 +140,9 @@ def main(fabric: L.Fabric, devices: int, config: Config, io: IOArgs, train: Trai check_valid_checkpoint_dir(io.checkpoint_dir) - fabric.seed_everything(1337) # same seed for every process to init model (FSDP) + fabric.seed_everything( + 1337 + ) # same seed for every process to init model (FSDP) if fabric.global_rank == 0: os.makedirs(io.out_dir, exist_ok=True) @@ -104,13 +151,19 @@ def main(fabric: L.Fabric, devices: int, config: Config, io: IOArgs, train: Trai val_data = torch.load(io.val_data_dir / "test.pt") checkpoint_path = io.checkpoint_dir / "lit_model.pth" - fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}") + fabric.print( + f"Loading model {str(checkpoint_path)!r} with {config.__dict__}" + ) with fabric.init_module(empty_init=(devices > 1)): model = GPT(config) mark_only_adapter_v2_as_trainable(model) - fabric.print(f"Number of trainable parameters: {num_parameters(model, requires_grad=True):,}") - fabric.print(f"Number of non trainable parameters: {num_parameters(model, requires_grad=False):,}") + fabric.print( + f"Number of trainable parameters: {num_parameters(model, requires_grad=True):,}" + ) + fabric.print( + f"Number of non trainable parameters: {num_parameters(model, requires_grad=False):,}" + ) model = fabric.setup_module(model) @@ -122,10 +175,15 @@ def main(fabric: L.Fabric, devices: int, config: Config, io: IOArgs, train: Trai else: optimizer_cls = torch.optim.AdamW optimizer = optimizer_cls( - trainable_params, lr=train.learning_rate, weight_decay=train.weight_decay, betas=(train.beta1, train.beta2) + trainable_params, + lr=train.learning_rate, + weight_decay=train.weight_decay, + betas=(train.beta1, train.beta2), ) optimizer = fabric.setup_optimizers(optimizer) - scheduler = get_lr_scheduler(optimizer, warmup_steps=train.lr_warmup_steps, max_steps=lr_max_steps) + scheduler = get_lr_scheduler( + optimizer, warmup_steps=train.lr_warmup_steps, max_steps=lr_max_steps + ) # strict=False because missing keys due to Adapter weights not contained in state dict load_checkpoint(fabric, model, checkpoint_path, strict=False) @@ -133,10 +191,23 @@ def main(fabric: L.Fabric, devices: int, config: Config, io: IOArgs, train: Trai fabric.seed_everything(1337 + fabric.global_rank) train_time = 
time.perf_counter() - fit(fabric, model, optimizer, scheduler, train_data, val_data, devices, io, train, eval) + fit( + fabric, + model, + optimizer, + scheduler, + train_data, + val_data, + devices, + io, + train, + eval, + ) fabric.print(f"Training time: {(time.perf_counter()-train_time):.2f}s") if fabric.device.type == "cuda": - fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") + fabric.print( + f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB" + ) # Save the final checkpoint at the end of training save_path = io.out_dir / "lit_model_adapter_finetuned.pth" @@ -157,13 +228,22 @@ def fit( ) -> None: tokenizer = Tokenizer(io.checkpoint_dir) longest_seq_length, longest_seq_ix = get_longest_seq_length(train_data) - model.max_seq_length = min(longest_seq_length, train.max_seq_length or float("inf")) + model.max_seq_length = min( + longest_seq_length, train.max_seq_length or float("inf") + ) fabric.print( f"The longest sequence length in the train data is {longest_seq_length}, the model's maximum sequence length is" f" {model.max_seq_length} and context length is {model.config.block_size}" ) - validate(fabric, model, val_data, tokenizer, dataclasses.replace(eval, max_iters=2), train) # sanity check + validate( + fabric, + model, + val_data, + tokenizer, + dataclasses.replace(eval, max_iters=2), + train, + ) # sanity check throughput = ThroughputMonitor(fabric, window_size=50) step_count = 0 @@ -174,10 +254,16 @@ def fit( iter_t0 = time.perf_counter() input_ids, targets = get_batch( - fabric, train_data, train.micro_batch_size, train.max_seq_length, longest_seq_ix if iter_num == 1 else None + fabric, + train_data, + train.micro_batch_size, + train.max_seq_length, + longest_seq_ix if iter_num == 1 else None, ) - is_accumulating = iter_num % train.gradient_accumulation_iters(devices) != 0 + is_accumulating = ( + iter_num % train.gradient_accumulation_iters(devices) != 0 + ) with fabric.no_backward_sync(model, enabled=is_accumulating): logits = model(input_ids, lm_head_chunk_size=128) # shift the targets such that output n predicts token n+1 @@ -196,7 +282,10 @@ def fit( loss_item = loss.item() # expensive device-to-host synchronization t1 = time.perf_counter() throughput.update( - time=t1 - total_t0, batches=iter_num, samples=iter_num * train.micro_batch_size, lengths=total_lengths + time=t1 - total_t0, + batches=iter_num, + samples=iter_num * train.micro_batch_size, + lengths=total_lengths, ) throughput.compute_and_log(step=iter_num) fabric.print( @@ -206,9 +295,13 @@ def fit( if not is_accumulating and step_count % eval.interval == 0: t0 = time.perf_counter() - val_loss = validate(fabric, model, val_data, tokenizer, eval, train) + val_loss = validate( + fabric, model, val_data, tokenizer, eval, train + ) t1 = time.perf_counter() - t0 - fabric.print(f"iter {iter_num}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f} ms") + fabric.print( + f"iter {iter_num}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f} ms" + ) fabric.barrier() if not is_accumulating and step_count % train.save_interval == 0: checkpoint_path = io.out_dir / f"iter-{iter_num:06d}-ckpt.pth" @@ -218,15 +311,24 @@ def fit( # the adapter "kv cache" cannot be initialized under `inference_mode` @torch.no_grad() def validate( - fabric: L.Fabric, model: GPT, val_data: List[Dict], tokenizer: Tokenizer, eval: EvalArgs, train: TrainArgs + fabric: L.Fabric, + model: GPT, + val_data: List[Dict], + tokenizer: Tokenizer, + eval: EvalArgs, + train: TrainArgs, ) -> torch.Tensor: 
fabric.print("Validating ...") model.eval() losses = torch.zeros(eval.max_iters) for k in range(eval.max_iters): - input_ids, targets = get_batch(fabric, val_data, train.micro_batch_size, train.max_seq_length) + input_ids, targets = get_batch( + fabric, val_data, train.micro_batch_size, train.max_seq_length + ) logits = model(input_ids) - losses[k] = chunked_cross_entropy(logits[..., :-1, :], targets[..., 1:], chunk_size=0) + losses[k] = chunked_cross_entropy( + logits[..., :-1, :], targets[..., 1:], chunk_size=0 + ) val_loss = losses.mean() # produce an example: @@ -239,7 +341,11 @@ def validate( # do not set `max_seq_length=max_returned_token` because memory is not a concern here model.set_kv_cache(batch_size=1) output = generate( - model, encoded, max_returned_tokens=len(encoded) + eval.max_new_tokens, temperature=0.8, eos_id=tokenizer.eos_id + model, + encoded, + max_returned_tokens=len(encoded) + eval.max_new_tokens, + temperature=0.8, + eos_id=tokenizer.eos_id, ) model.clear_kv_cache() output = tokenizer.decode(output) @@ -289,9 +395,15 @@ def pad_right(x, pad_id): def get_lr_scheduler(optimizer, warmup_steps: int, max_steps: int): # linear warmup followed by cosine annealing - scheduler1 = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: step / warmup_steps) - scheduler2 = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=(max_steps - warmup_steps)) - return torch.optim.lr_scheduler.SequentialLR(optimizer, [scheduler1, scheduler2], milestones=[warmup_steps]) + scheduler1 = torch.optim.lr_scheduler.LambdaLR( + optimizer, lambda step: step / warmup_steps + ) + scheduler2 = torch.optim.lr_scheduler.CosineAnnealingLR( + optimizer, T_max=(max_steps - warmup_steps) + ) + return torch.optim.lr_scheduler.SequentialLR( + optimizer, [scheduler1, scheduler2], milestones=[warmup_steps] + ) def get_longest_seq_length(data: List[Dict]) -> Tuple[int, int]: @@ -302,7 +414,9 @@ def get_longest_seq_length(data: List[Dict]) -> Tuple[int, int]: return longest_seq_length, longest_seq_ix -def save_adapter_v2_checkpoint(fabric: L.Fabric, model: torch.nn.Module, file_path: Path) -> None: +def save_adapter_v2_checkpoint( + fabric: L.Fabric, model: torch.nn.Module, file_path: Path +) -> None: fabric.print(f"Saving adapter v2 weights to {str(file_path)!r}") fabric.save(file_path, {"model": model}, filter={"model": adapter_filter}) @@ -313,7 +427,9 @@ def validate_args(io: IOArgs, train: TrainArgs, eval: EvalArgs) -> None: for args, names in unsupported: for name in names: if getattr(args, name) is not None: - issues.append(f"{__file__} doesn't support the {name!r} argument. This is set in {args}") + issues.append( + f"{__file__} doesn't support the {name!r} argument. This is set in {args}" + ) required = [ (io, ["checkpoint_dir", "train_data_dir", "val_data_dir"]), (train, ["epoch_size", "epochs"]), @@ -322,7 +438,9 @@ def validate_args(io: IOArgs, train: TrainArgs, eval: EvalArgs) -> None: for args, names in required: for name in names: if getattr(args, name) is None: - issues.append(f"{__file__} requires the {name!r} argument. This is set in {args}") + issues.append( + f"{__file__} requires the {name!r} argument. 
This is set in {args}" + ) if issues: raise ValueError("\n".join(issues)) diff --git a/llm-lora-finetuning/finetune/full.py b/llm-lora-finetuning/finetune/full.py index 52b4a47d..02e28a72 100644 --- a/llm-lora-finetuning/finetune/full.py +++ b/llm-lora-finetuning/finetune/full.py @@ -29,6 +29,7 @@ load_checkpoint, num_parameters, ) + from scripts.prepare_alpaca import generate_prompt @@ -69,9 +70,23 @@ def setup( else: strategy = "auto" - logger = CSVLogger(io.out_dir.parent, io.out_dir.name, flush_logs_every_n_steps=train.log_interval) - fabric = L.Fabric(devices=devices, strategy=strategy, precision=precision, loggers=logger) - fabric.launch(main, devices, resume, Config.from_name(name=io.checkpoint_dir.name), io, train, eval) + logger = CSVLogger( + io.out_dir.parent, + io.out_dir.name, + flush_logs_every_n_steps=train.log_interval, + ) + fabric = L.Fabric( + devices=devices, strategy=strategy, precision=precision, loggers=logger + ) + fabric.launch( + main, + devices, + resume, + Config.from_name(name=io.checkpoint_dir.name), + io, + train, + eval, + ) def main( @@ -90,7 +105,9 @@ def main( check_valid_checkpoint_dir(io.checkpoint_dir) - fabric.seed_everything(1337) # same seed for every process to init model (FSDP) + fabric.seed_everything( + 1337 + ) # same seed for every process to init model (FSDP) if fabric.global_rank == 0: os.makedirs(io.out_dir, exist_ok=True) @@ -99,22 +116,39 @@ def main( val_data = torch.load(io.val_data_dir / "test.pt") checkpoint_path = io.checkpoint_dir / "lit_model.pth" - fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}") + fabric.print( + f"Loading model {str(checkpoint_path)!r} with {config.__dict__}" + ) with fabric.init_module(empty_init=(devices > 1)): model = GPT(config) - fabric.print(f"Number of trainable parameters: {num_parameters(model, requires_grad=True):,}") + fabric.print( + f"Number of trainable parameters: {num_parameters(model, requires_grad=True):,}" + ) model = fabric.setup(model) optimizer = torch.optim.AdamW( - model.parameters(), lr=train.learning_rate, weight_decay=train.weight_decay, betas=(train.beta1, train.beta2) + model.parameters(), + lr=train.learning_rate, + weight_decay=train.weight_decay, + betas=(train.beta1, train.beta2), ) optimizer = fabric.setup_optimizers(optimizer) - scheduler = get_lr_scheduler(optimizer, warmup_steps=train.lr_warmup_steps, max_steps=lr_max_steps) - state = {"model": model, "optimizer": optimizer, "scheduler": scheduler, "iter_num": 0, "step_count": 0} + scheduler = get_lr_scheduler( + optimizer, warmup_steps=train.lr_warmup_steps, max_steps=lr_max_steps + ) + state = { + "model": model, + "optimizer": optimizer, + "scheduler": scheduler, + "iter_num": 0, + "step_count": 0, + } if resume is True: - resume = max(io.out_dir.glob("*.pth"), key=(lambda p: int(p.name.split("-")[1]))) + resume = max( + io.out_dir.glob("*.pth"), key=(lambda p: int(p.name.split("-")[1])) + ) if resume: fabric.print(f"Resuming training from {resume}") fabric.load(resume, state) @@ -127,10 +161,14 @@ def main( fit(fabric, state, train_data, val_data, devices, resume, io, train, eval) fabric.print(f"Training time: {(time.perf_counter()-train_time):.2f}s") if fabric.device.type == "cuda": - fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") + fabric.print( + f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB" + ) # Save the final checkpoint at the end of training - fabric.save(io.out_dir / "lit_model_finetuned.pth", {"model": state["model"]}) + 
fabric.save( + io.out_dir / "lit_model_finetuned.pth", {"model": state["model"]} + ) def fit( @@ -149,13 +187,22 @@ def fit( scheduler = state["scheduler"] tokenizer = Tokenizer(io.checkpoint_dir) longest_seq_length, longest_seq_ix = get_longest_seq_length(train_data) - model.max_seq_length = min(longest_seq_length, train.max_seq_length or float("inf")) + model.max_seq_length = min( + longest_seq_length, train.max_seq_length or float("inf") + ) fabric.print( f"The longest sequence length in the train data is {longest_seq_length}, the model's maximum sequence length is" f" {model.max_seq_length} and context length is {model.config.block_size}" ) - validate(fabric, model, val_data, tokenizer, dataclasses.replace(eval, max_iters=2), train) # sanity check + validate( + fabric, + model, + val_data, + tokenizer, + dataclasses.replace(eval, max_iters=2), + train, + ) # sanity check initial_iter = state["iter_num"] # resume data loader state by fast-forwarding through all seen batches @@ -164,19 +211,24 @@ def fit( for resume_iter in range(initial_iter): get_batch(fabric, train_data, None) if resume_iter % 1000 == 0: - fabric.print(f"Resuming dataset: {resume_iter} / {initial_iter}") + fabric.print( + f"Resuming dataset: {resume_iter} / {initial_iter}" + ) fabric.barrier() fabric.print( f"Resuming data loader finished. Took {time.perf_counter() - resume_t0:.1f} seconds to reach iteration" f" {initial_iter}." ) - running_loss = RunningMean(window=train.gradient_accumulation_iters(devices), sync_on_compute=False).to( - fabric.device - ) + running_loss = RunningMean( + window=train.gradient_accumulation_iters(devices), + sync_on_compute=False, + ).to(fabric.device) fabric.barrier() - for state["iter_num"] in range(state["iter_num"] + 1, train.max_iters(devices) + 1): + for state["iter_num"] in range( + state["iter_num"] + 1, train.max_iters(devices) + 1 + ): iter_t0 = time.perf_counter() input_ids, targets = get_batch( @@ -187,7 +239,9 @@ def fit( longest_seq_ix if state["iter_num"] == 1 else None, ) - is_accumulating = state["iter_num"] % train.gradient_accumulation_iters(devices) != 0 + is_accumulating = ( + state["iter_num"] % train.gradient_accumulation_iters(devices) != 0 + ) with fabric.no_backward_sync(model, enabled=is_accumulating): logits = model(input_ids) # shift the targets such that output n predicts token n+1 @@ -203,16 +257,23 @@ def fit( state["step_count"] += 1 if state["iter_num"] % train.log_interval == 0: - loss = running_loss.compute().item() # expensive device-to-host synchronization + loss = ( + running_loss.compute().item() + ) # expensive device-to-host synchronization t1 = time.perf_counter() metrics = { "loss": loss, "iter": state["iter_num"], "step": state["step_count"], "iter_time": t1 - iter_t0, - "tokens": state["iter_num"] * train.micro_batch_size * model.config.block_size, + "tokens": state["iter_num"] + * train.micro_batch_size + * model.config.block_size, "total_tokens": ( - state["iter_num"] * train.micro_batch_size * model.config.block_size * fabric.world_size + state["iter_num"] + * train.micro_batch_size + * model.config.block_size + * fabric.world_size ), # TODO: log learning rate } @@ -224,14 +285,23 @@ def fit( if not is_accumulating and state["step_count"] % eval.interval == 0: t0 = time.perf_counter() - val_loss = validate(fabric, model, val_data, tokenizer, eval, train) + val_loss = validate( + fabric, model, val_data, tokenizer, eval, train + ) t1 = time.perf_counter() - t0 - fabric.print(f"iter {state['iter_num']}: val loss {val_loss.item():.4f}, val 
time: {t1 * 1000:.2f} ms") + fabric.print( + f"iter {state['iter_num']}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f} ms" + ) metrics = {"val_loss": val_loss, "val_ppl": math.exp(val_loss)} fabric.log_dict(metrics, step=state["iter_num"]) fabric.barrier() - if not is_accumulating and state["step_count"] % train.save_interval == 0: - checkpoint_path = io.out_dir / f"step-{state['step_count']:06d}.pth" + if ( + not is_accumulating + and state["step_count"] % train.save_interval == 0 + ): + checkpoint_path = ( + io.out_dir / f"step-{state['step_count']:06d}.pth" + ) fabric.print(f"Saving checkpoint to {str(checkpoint_path)!r}") fabric.save(checkpoint_path, state) @@ -239,15 +309,24 @@ def fit( # FSDP has issues with `inference_mode` @torch.no_grad() def validate( - fabric: L.Fabric, model: GPT, val_data: List[Dict], tokenizer: Tokenizer, eval: EvalArgs, train: TrainArgs + fabric: L.Fabric, + model: GPT, + val_data: List[Dict], + tokenizer: Tokenizer, + eval: EvalArgs, + train: TrainArgs, ) -> torch.Tensor: fabric.print("Validating ...") model.eval() losses = torch.zeros(eval.max_iters) for k in range(eval.max_iters): - input_ids, targets = get_batch(fabric, val_data, train.micro_batch_size, train.max_seq_length) + input_ids, targets = get_batch( + fabric, val_data, train.micro_batch_size, train.max_seq_length + ) logits = model(input_ids) - losses[k] = chunked_cross_entropy(logits[..., :-1, :], targets[..., 1:], chunk_size=0) + losses[k] = chunked_cross_entropy( + logits[..., :-1, :], targets[..., 1:], chunk_size=0 + ) val_loss = losses.mean() # produce an example: @@ -260,7 +339,11 @@ def validate( # do not set `max_seq_length=max_returned_token` because memory is not a concern here model.set_kv_cache(batch_size=1) output = generate( - model, encoded, max_returned_tokens=len(encoded) + eval.max_new_tokens, temperature=0.8, eos_id=tokenizer.eos_id + model, + encoded, + max_returned_tokens=len(encoded) + eval.max_new_tokens, + temperature=0.8, + eos_id=tokenizer.eos_id, ) model.clear_kv_cache() output = tokenizer.decode(output) @@ -310,9 +393,15 @@ def pad_right(x, pad_id): def get_lr_scheduler(optimizer, warmup_steps: int, max_steps: int): # linear warmup followed by cosine annealing - scheduler1 = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: step / warmup_steps) - scheduler2 = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=(max_steps - warmup_steps)) - return torch.optim.lr_scheduler.SequentialLR(optimizer, [scheduler1, scheduler2], milestones=[warmup_steps]) + scheduler1 = torch.optim.lr_scheduler.LambdaLR( + optimizer, lambda step: step / warmup_steps + ) + scheduler2 = torch.optim.lr_scheduler.CosineAnnealingLR( + optimizer, T_max=(max_steps - warmup_steps) + ) + return torch.optim.lr_scheduler.SequentialLR( + optimizer, [scheduler1, scheduler2], milestones=[warmup_steps] + ) def get_longest_seq_length(data: List[Dict]) -> Tuple[int, int]: @@ -329,7 +418,9 @@ def validate_args(io: IOArgs, train: TrainArgs, eval: EvalArgs) -> None: for args, names in unsupported: for name in names: if getattr(args, name) is not None: - issues.append(f"{__file__} doesn't support the {name!r} argument. This is set in {args}") + issues.append( + f"{__file__} doesn't support the {name!r} argument. 
This is set in {args}" + ) required = [ (io, ["checkpoint_dir", "train_data_dir", "val_data_dir"]), (train, ["epoch_size", "epochs"]), @@ -338,7 +429,9 @@ def validate_args(io: IOArgs, train: TrainArgs, eval: EvalArgs) -> None: for args, names in required: for name in names: if getattr(args, name) is None: - issues.append(f"{__file__} requires the {name!r} argument. This is set in {args}") + issues.append( + f"{__file__} requires the {name!r} argument. This is set in {args}" + ) if issues: raise ValueError("\n".join(issues)) diff --git a/llm-lora-finetuning/finetune/lora.py b/llm-lora-finetuning/finetune/lora.py index 086322fb..39caa06e 100644 --- a/llm-lora-finetuning/finetune/lora.py +++ b/llm-lora-finetuning/finetune/lora.py @@ -19,7 +19,13 @@ from generate.base import generate from lit_gpt.args import EvalArgs, IOArgs, TrainArgs -from lit_gpt.lora import GPT, Block, Config, lora_filter, mark_only_lora_as_trainable +from lit_gpt.lora import ( + GPT, + Block, + Config, + lora_filter, + mark_only_lora_as_trainable, +) from lit_gpt.tokenizer import Tokenizer from lit_gpt.utils import ( CLI, @@ -29,12 +35,21 @@ load_checkpoint, num_parameters, ) + from scripts.prepare_alpaca import generate_prompt def setup( precision: Optional[str] = None, - quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8-training"]] = None, + quantize: Optional[ + Literal[ + "bnb.nf4", + "bnb.nf4-dq", + "bnb.fp4", + "bnb.fp4-dq", + "bnb.int8-training", + ] + ] = None, devices: int = 1, lora_r: int = 8, lora_alpha: int = 16, @@ -70,8 +85,14 @@ def setup( plugins = None if quantize is not None and quantize.startswith("bnb."): if "mixed" in precision: - raise ValueError("Quantization and mixed precision is not supported.") - dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + raise ValueError( + "Quantization and mixed precision is not supported." 
+ ) + dtype = { + "16-true": torch.float16, + "bf16-true": torch.bfloat16, + "32-true": torch.float32, + }[precision] plugins = BitsandbytesPrecision(quantize[4:], dtype) precision = None @@ -91,10 +112,29 @@ def setup( else: strategy = "auto" - logger = CSVLogger(io.out_dir.parent, io.out_dir.name, flush_logs_every_n_steps=train.log_interval) - fabric = L.Fabric(devices=devices, strategy=strategy, precision=precision, loggers=logger, plugins=plugins) + logger = CSVLogger( + io.out_dir.parent, + io.out_dir.name, + flush_logs_every_n_steps=train.log_interval, + ) + fabric = L.Fabric( + devices=devices, + strategy=strategy, + precision=precision, + loggers=logger, + plugins=plugins, + ) - if not any((lora_query, lora_key, lora_value, lora_projection, lora_mlp, lora_head)): + if not any( + ( + lora_query, + lora_key, + lora_value, + lora_projection, + lora_mlp, + lora_head, + ) + ): fabric.print("Warning: all LoRA layers are disabled!") fabric.launch( main, @@ -117,7 +157,14 @@ def setup( ) -def main(fabric: L.Fabric, devices: int, config: Config, io: IOArgs, train: TrainArgs, eval: EvalArgs) -> None: +def main( + fabric: L.Fabric, + devices: int, + config: Config, + io: IOArgs, + train: TrainArgs, + eval: EvalArgs, +) -> None: validate_args(io, train, eval) steps_per_epoch = train.epoch_size // devices // train.batch_size(devices) @@ -125,7 +172,9 @@ def main(fabric: L.Fabric, devices: int, config: Config, io: IOArgs, train: Trai check_valid_checkpoint_dir(io.checkpoint_dir) - fabric.seed_everything(1337) # same seed for every process to init model (FSDP) + fabric.seed_everything( + 1337 + ) # same seed for every process to init model (FSDP) if fabric.global_rank == 0: os.makedirs(io.out_dir, exist_ok=True) @@ -134,13 +183,19 @@ def main(fabric: L.Fabric, devices: int, config: Config, io: IOArgs, train: Trai val_data = torch.load(io.val_data_dir / "test.pt") checkpoint_path = io.checkpoint_dir / "lit_model.pth" - fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}") + fabric.print( + f"Loading model {str(checkpoint_path)!r} with {config.__dict__}" + ) with fabric.init_module(empty_init=(devices > 1)): model = GPT(config) mark_only_lora_as_trainable(model) - fabric.print(f"Number of trainable parameters: {num_parameters(model, requires_grad=True):,}") - fabric.print(f"Number of non trainable parameters: {num_parameters(model, requires_grad=False):,}") + fabric.print( + f"Number of trainable parameters: {num_parameters(model, requires_grad=True):,}" + ) + fabric.print( + f"Number of non trainable parameters: {num_parameters(model, requires_grad=False):,}" + ) model = fabric.setup_module(model) @@ -152,10 +207,15 @@ def main(fabric: L.Fabric, devices: int, config: Config, io: IOArgs, train: Trai else: optimizer_cls = torch.optim.AdamW optimizer = optimizer_cls( - trainable_params, lr=train.learning_rate, weight_decay=train.weight_decay, betas=(train.beta1, train.beta2) + trainable_params, + lr=train.learning_rate, + weight_decay=train.weight_decay, + betas=(train.beta1, train.beta2), ) optimizer = fabric.setup_optimizers(optimizer) - scheduler = get_lr_scheduler(optimizer, warmup_steps=train.lr_warmup_steps, max_steps=lr_max_steps) + scheduler = get_lr_scheduler( + optimizer, warmup_steps=train.lr_warmup_steps, max_steps=lr_max_steps + ) # strict=False because missing keys due to LoRA weights not contained in state dict load_checkpoint(fabric, model, checkpoint_path, strict=False) @@ -163,10 +223,23 @@ def main(fabric: L.Fabric, devices: int, config: Config, io: IOArgs, 
train: Trai fabric.seed_everything(1337 + fabric.global_rank) train_time = time.perf_counter() - fit(fabric, model, optimizer, scheduler, train_data, val_data, devices, io, train, eval) + fit( + fabric, + model, + optimizer, + scheduler, + train_data, + val_data, + devices, + io, + train, + eval, + ) fabric.print(f"Training time: {(time.perf_counter()-train_time):.2f}s") if fabric.device.type == "cuda": - fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") + fabric.print( + f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB" + ) # Save the final LoRA checkpoint at the end of training save_path = io.out_dir / "lit_model_lora_finetuned.pth" @@ -187,13 +260,22 @@ def fit( ) -> None: tokenizer = Tokenizer(io.checkpoint_dir) longest_seq_length, longest_seq_ix = get_longest_seq_length(train_data) - model.max_seq_length = min(longest_seq_length, train.max_seq_length or float("inf")) + model.max_seq_length = min( + longest_seq_length, train.max_seq_length or float("inf") + ) fabric.print( f"The longest sequence length in the train data is {longest_seq_length}, the model's maximum sequence length is" f" {model.max_seq_length} and context length is {model.config.block_size}" ) - validate(fabric, model, val_data, tokenizer, dataclasses.replace(eval, max_iters=2), train) # sanity check + validate( + fabric, + model, + val_data, + tokenizer, + dataclasses.replace(eval, max_iters=2), + train, + ) # sanity check throughput = ThroughputMonitor(fabric, window_size=50) step_count = 0 @@ -204,10 +286,16 @@ def fit( iter_t0 = time.perf_counter() input_ids, targets = get_batch( - fabric, train_data, train.micro_batch_size, train.max_seq_length, longest_seq_ix if iter_num == 1 else None + fabric, + train_data, + train.micro_batch_size, + train.max_seq_length, + longest_seq_ix if iter_num == 1 else None, ) - is_accumulating = iter_num % train.gradient_accumulation_iters(devices) != 0 + is_accumulating = ( + iter_num % train.gradient_accumulation_iters(devices) != 0 + ) with fabric.no_backward_sync(model, enabled=is_accumulating): logits = model(input_ids, lm_head_chunk_size=128) # shift the targets such that output n predicts token n+1 @@ -226,7 +314,10 @@ def fit( loss_item = loss.item() # expensive device-to-host synchronization t1 = time.perf_counter() throughput.update( - time=t1 - total_t0, batches=iter_num, samples=iter_num * train.micro_batch_size, lengths=total_lengths + time=t1 - total_t0, + batches=iter_num, + samples=iter_num * train.micro_batch_size, + lengths=total_lengths, ) throughput.compute_and_log(step=iter_num) fabric.print( @@ -236,9 +327,13 @@ def fit( if not is_accumulating and step_count % eval.interval == 0: t0 = time.perf_counter() - val_loss = validate(fabric, model, val_data, tokenizer, eval, train) + val_loss = validate( + fabric, model, val_data, tokenizer, eval, train + ) t1 = time.perf_counter() - t0 - fabric.print(f"iter {iter_num}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f} ms") + fabric.print( + f"iter {iter_num}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f} ms" + ) fabric.barrier() if not is_accumulating and step_count % train.save_interval == 0: checkpoint_path = io.out_dir / f"iter-{iter_num:06d}-ckpt.pth" @@ -248,15 +343,24 @@ def fit( # FSDP has issues with `inference_mode` @torch.no_grad() def validate( - fabric: L.Fabric, model: GPT, val_data: List[Dict], tokenizer: Tokenizer, eval: EvalArgs, train: TrainArgs + fabric: L.Fabric, + model: GPT, + val_data: List[Dict], + tokenizer: Tokenizer, + eval: 
EvalArgs, + train: TrainArgs, ) -> torch.Tensor: fabric.print("Validating ...") model.eval() losses = torch.zeros(eval.max_iters) for k in range(eval.max_iters): - input_ids, targets = get_batch(fabric, val_data, train.micro_batch_size, train.max_seq_length) + input_ids, targets = get_batch( + fabric, val_data, train.micro_batch_size, train.max_seq_length + ) logits = model(input_ids) - losses[k] = chunked_cross_entropy(logits[..., :-1, :], targets[..., 1:], chunk_size=0) + losses[k] = chunked_cross_entropy( + logits[..., :-1, :], targets[..., 1:], chunk_size=0 + ) val_loss = losses.mean() # produce an example: @@ -269,7 +373,11 @@ def validate( # do not set `max_seq_length=max_returned_token` because memory is not a concern here model.set_kv_cache(batch_size=1) output = generate( - model, encoded, max_returned_tokens=len(encoded) + eval.max_new_tokens, temperature=0.8, eos_id=tokenizer.eos_id + model, + encoded, + max_returned_tokens=len(encoded) + eval.max_new_tokens, + temperature=0.8, + eos_id=tokenizer.eos_id, ) model.clear_kv_cache() output = tokenizer.decode(output) @@ -319,9 +427,15 @@ def pad_right(x, pad_id): def get_lr_scheduler(optimizer, warmup_steps: int, max_steps: int): # linear warmup followed by cosine annealing - scheduler1 = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: step / warmup_steps) - scheduler2 = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=(max_steps - warmup_steps)) - return torch.optim.lr_scheduler.SequentialLR(optimizer, [scheduler1, scheduler2], milestones=[warmup_steps]) + scheduler1 = torch.optim.lr_scheduler.LambdaLR( + optimizer, lambda step: step / warmup_steps + ) + scheduler2 = torch.optim.lr_scheduler.CosineAnnealingLR( + optimizer, T_max=(max_steps - warmup_steps) + ) + return torch.optim.lr_scheduler.SequentialLR( + optimizer, [scheduler1, scheduler2], milestones=[warmup_steps] + ) def get_longest_seq_length(data: List[Dict]) -> Tuple[int, int]: @@ -332,7 +446,9 @@ def get_longest_seq_length(data: List[Dict]) -> Tuple[int, int]: return longest_seq_length, longest_seq_ix -def save_lora_checkpoint(fabric: L.Fabric, model: torch.nn.Module, file_path: Path) -> None: +def save_lora_checkpoint( + fabric: L.Fabric, model: torch.nn.Module, file_path: Path +) -> None: fabric.print(f"Saving LoRA weights to {str(file_path)!r}") fabric.save(file_path, {"model": model}, filter={"model": lora_filter}) @@ -343,7 +459,9 @@ def validate_args(io: IOArgs, train: TrainArgs, eval: EvalArgs) -> None: for args, names in unsupported: for name in names: if getattr(args, name) is not None: - issues.append(f"{__file__} doesn't support the {name!r} argument. This is set in {args}") + issues.append( + f"{__file__} doesn't support the {name!r} argument. This is set in {args}" + ) required = [ (io, ["checkpoint_dir", "train_data_dir", "val_data_dir"]), (train, ["epoch_size", "epochs"]), @@ -352,7 +470,9 @@ def validate_args(io: IOArgs, train: TrainArgs, eval: EvalArgs) -> None: for args, names in required: for name in names: if getattr(args, name) is None: - issues.append(f"{__file__} requires the {name!r} argument. This is set in {args}") + issues.append( + f"{__file__} requires the {name!r} argument. 
This is set in {args}" + ) if issues: raise ValueError("\n".join(issues)) diff --git a/llm-lora-finetuning/generate/adapter.py b/llm-lora-finetuning/generate/adapter.py index 15e5df51..3daa8836 100644 --- a/llm-lora-finetuning/generate/adapter.py +++ b/llm-lora-finetuning/generate/adapter.py @@ -16,16 +16,28 @@ from generate.base import generate from lit_gpt import Tokenizer from lit_gpt.adapter import GPT, Config -from lit_gpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision, lazy_load +from lit_gpt.utils import ( + CLI, + check_valid_checkpoint_dir, + get_default_supported_precision, + lazy_load, +) + from scripts.prepare_alpaca import generate_prompt def main( prompt: str = "What food do llamas eat?", input: str = "", - adapter_path: Path = Path("out/adapter/alpaca/lit_model_adapter_finetuned.pth"), - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), - quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"]] = None, + adapter_path: Path = Path( + "out/adapter/alpaca/lit_model_adapter_finetuned.pth" + ), + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), + quantize: Optional[ + Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"] + ] = None, max_new_tokens: int = 100, top_k: Optional[int] = 200, temperature: float = 0.8, @@ -56,8 +68,14 @@ def main( plugins = None if quantize is not None and quantize.startswith("bnb."): if "mixed" in precision: - raise ValueError("Quantization and mixed precision is not supported.") - dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + raise ValueError( + "Quantization and mixed precision is not supported." + ) + dtype = { + "16-true": torch.float16, + "bf16-true": torch.bfloat16, + "32-true": torch.float32, + }[precision] plugins = BitsandbytesPrecision(quantize[4:], dtype) precision = None @@ -77,11 +95,17 @@ def main( prompt_length = encoded.size(0) max_returned_tokens = prompt_length + max_new_tokens - fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr) + fabric.print( + f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", + file=sys.stderr, + ) t0 = time.perf_counter() with fabric.init_module(empty_init=True): model = GPT(config) - fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + fabric.print( + f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", + file=sys.stderr, + ) with fabric.init_tensor(): # set the max_seq_length to limit the memory usage to what we need model.max_seq_length = max_returned_tokens @@ -94,13 +118,23 @@ def main( adapter_checkpoint = lazy_load(adapter_path) checkpoint.update(adapter_checkpoint.get("model", adapter_checkpoint)) model.load_state_dict(checkpoint) - fabric.print(f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + fabric.print( + f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", + file=sys.stderr, + ) model = fabric.setup(model) L.seed_everything(1234) t0 = time.perf_counter() - y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id) + y = generate( + model, + encoded, + max_returned_tokens, + temperature=temperature, + top_k=top_k, + eos_id=tokenizer.eos_id, + ) t = time.perf_counter() - t0 output = tokenizer.decode(y) @@ -108,9 +142,15 @@ def main( fabric.print(output) 
tokens_generated = y.size(0) - prompt_length - fabric.print(f"\n\nTime for inference: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", file=sys.stderr) + fabric.print( + f"\n\nTime for inference: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", + file=sys.stderr, + ) if fabric.device.type == "cuda": - fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", file=sys.stderr) + fabric.print( + f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", + file=sys.stderr, + ) if __name__ == "__main__": diff --git a/llm-lora-finetuning/generate/adapter_v2.py b/llm-lora-finetuning/generate/adapter_v2.py index c799a0ea..6f9d76d4 100644 --- a/llm-lora-finetuning/generate/adapter_v2.py +++ b/llm-lora-finetuning/generate/adapter_v2.py @@ -16,16 +16,28 @@ from generate.base import generate from lit_gpt import Tokenizer from lit_gpt.adapter_v2 import GPT, Config -from lit_gpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision, lazy_load +from lit_gpt.utils import ( + CLI, + check_valid_checkpoint_dir, + get_default_supported_precision, + lazy_load, +) + from scripts.prepare_alpaca import generate_prompt def main( prompt: str = "What food do llamas eat?", input: str = "", - adapter_path: Path = Path("out/adapter_v2/alpaca/lit_model_adapter_finetuned.pth"), - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), - quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"]] = None, + adapter_path: Path = Path( + "out/adapter_v2/alpaca/lit_model_adapter_finetuned.pth" + ), + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), + quantize: Optional[ + Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"] + ] = None, max_new_tokens: int = 100, top_k: Optional[int] = 200, temperature: float = 0.8, @@ -56,8 +68,14 @@ def main( plugins = None if quantize is not None and quantize.startswith("bnb."): if "mixed" in precision: - raise ValueError("Quantization and mixed precision is not supported.") - dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + raise ValueError( + "Quantization and mixed precision is not supported." 
+ ) + dtype = { + "16-true": torch.float16, + "bf16-true": torch.bfloat16, + "32-true": torch.float32, + }[precision] plugins = BitsandbytesPrecision(quantize[4:], dtype) precision = None @@ -77,11 +95,17 @@ def main( prompt_length = encoded.size(0) max_returned_tokens = prompt_length + max_new_tokens - fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr) + fabric.print( + f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", + file=sys.stderr, + ) t0 = time.perf_counter() with fabric.init_module(empty_init=True): model = GPT(config) - fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + fabric.print( + f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", + file=sys.stderr, + ) with fabric.init_tensor(): # set the max_seq_length to limit the memory usage to what we need model.max_seq_length = max_returned_tokens @@ -94,13 +118,23 @@ def main( adapter_checkpoint = lazy_load(adapter_path) checkpoint.update(adapter_checkpoint.get("model", adapter_checkpoint)) model.load_state_dict(checkpoint) - fabric.print(f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + fabric.print( + f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", + file=sys.stderr, + ) model = fabric.setup(model) L.seed_everything(1234) t0 = time.perf_counter() - y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id) + y = generate( + model, + encoded, + max_returned_tokens, + temperature=temperature, + top_k=top_k, + eos_id=tokenizer.eos_id, + ) t = time.perf_counter() - t0 output = tokenizer.decode(y) @@ -108,9 +142,15 @@ def main( fabric.print(output) tokens_generated = y.size(0) - prompt_length - fabric.print(f"\n\nTime for inference: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", file=sys.stderr) + fabric.print( + f"\n\nTime for inference: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", + file=sys.stderr, + ) if fabric.device.type == "cuda": - fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", file=sys.stderr) + fabric.print( + f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", + file=sys.stderr, + ) if __name__ == "__main__": diff --git a/llm-lora-finetuning/generate/base.py b/llm-lora-finetuning/generate/base.py index 3cf75715..f8cfa7bd 100644 --- a/llm-lora-finetuning/generate/base.py +++ b/llm-lora-finetuning/generate/base.py @@ -16,7 +16,12 @@ sys.path.append(str(wd)) from lit_gpt import GPT, Config, Tokenizer -from lit_gpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision, load_checkpoint +from lit_gpt.utils import ( + CLI, + check_valid_checkpoint_dir, + get_default_supported_precision, + load_checkpoint, +) def multinomial_num_samples_1(probs: torch.Tensor) -> torch.Tensor: @@ -27,7 +32,9 @@ def multinomial_num_samples_1(probs: torch.Tensor) -> torch.Tensor: return torch.multinomial(probs, num_samples=1) -def sample(logits: torch.Tensor, temperature: float = 1.0, top_k: Optional[int] = None) -> torch.Tensor: +def sample( + logits: torch.Tensor, temperature: float = 1.0, top_k: Optional[int] = None +) -> torch.Tensor: logits = logits[0, -1] # optionally crop the logits to only the top k options if top_k is not None: @@ -41,7 +48,9 @@ def sample(logits: torch.Tensor, temperature: float = 1.0, top_k: Optional[int] return torch.argmax(logits, dim=-1, keepdim=True) -def 
next_token(model: GPT, input_pos: torch.Tensor, x: torch.Tensor, **kwargs: Any) -> torch.Tensor: +def next_token( + model: GPT, input_pos: torch.Tensor, x: torch.Tensor, **kwargs: Any +) -> torch.Tensor: logits = model(x, input_pos) next = sample(logits, **kwargs) return next.to(dtype=x.dtype) @@ -75,17 +84,29 @@ def generate( # rolling the kv cache based on the `input_pos` value would be necessary. However, doing so would introduce a # data dependency on the `input_pos` tensor and impact model compilation. Since this setting is uncommon, we do # not support it to avoid negatively impacting the overall speed - raise NotImplementedError(f"max_seq_length {model.max_seq_length} needs to be >= {max_returned_tokens - 1}") + raise NotImplementedError( + f"max_seq_length {model.max_seq_length} needs to be >= {max_returned_tokens - 1}" + ) device = prompt.device tokens = [prompt] input_pos = torch.tensor([T], device=device) token = next_token( - model, torch.arange(0, T, device=device), prompt.view(1, -1), temperature=temperature, top_k=top_k + model, + torch.arange(0, T, device=device), + prompt.view(1, -1), + temperature=temperature, + top_k=top_k, ).clone() tokens.append(token) for _ in range(2, max_returned_tokens - T + 1): - token = next_token(model, input_pos, token.view(1, -1), temperature=temperature, top_k=top_k).clone() + token = next_token( + model, + input_pos, + token.view(1, -1), + temperature=temperature, + top_k=top_k, + ).clone() tokens.append(token) if token == eos_id: break @@ -101,8 +122,12 @@ def main( max_new_tokens: int = 50, top_k: Optional[int] = 200, temperature: float = 0.8, - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), - quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"]] = None, + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), + quantize: Optional[ + Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"] + ] = None, precision: Optional[str] = None, compile: bool = False, ) -> None: @@ -128,8 +153,14 @@ def main( plugins = None if quantize is not None and quantize.startswith("bnb."): if "mixed" in precision: - raise ValueError("Quantization and mixed precision is not supported.") - dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + raise ValueError( + "Quantization and mixed precision is not supported." 
+ ) + dtype = { + "16-true": torch.float16, + "bf16-true": torch.bfloat16, + "32-true": torch.float32, + }[precision] plugins = BitsandbytesPrecision(quantize[4:], dtype) precision = None @@ -146,11 +177,17 @@ def main( prompt_length = encoded.size(0) max_returned_tokens = prompt_length + max_new_tokens - fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr) + fabric.print( + f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", + file=sys.stderr, + ) t0 = time.perf_counter() with fabric.init_module(empty_init=True): model = GPT(config) - fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + fabric.print( + f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", + file=sys.stderr, + ) with fabric.init_tensor(): # set the max_seq_length to limit the memory usage to what we need model.max_seq_length = max_returned_tokens @@ -169,22 +206,36 @@ def main( t0 = time.perf_counter() load_checkpoint(fabric, model, checkpoint_path) - fabric.print(f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + fabric.print( + f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", + file=sys.stderr, + ) L.seed_everything(1234) for i in range(num_samples): t0 = time.perf_counter() - y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id) + y = generate( + model, + encoded, + max_returned_tokens, + temperature=temperature, + top_k=top_k, + eos_id=tokenizer.eos_id, + ) t = time.perf_counter() - t0 for block in model.transformer.h: block.attn.kv_cache.reset_parameters() fabric.print(tokenizer.decode(y)) tokens_generated = y.size(0) - prompt_length fabric.print( - f"Time for inference {i + 1}: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", file=sys.stderr + f"Time for inference {i + 1}: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", + file=sys.stderr, ) if fabric.device.type == "cuda": - fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", file=sys.stderr) + fabric.print( + f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", + file=sys.stderr, + ) if __name__ == "__main__": diff --git a/llm-lora-finetuning/generate/full.py b/llm-lora-finetuning/generate/full.py index ca1554e4..cc1da495 100644 --- a/llm-lora-finetuning/generate/full.py +++ b/llm-lora-finetuning/generate/full.py @@ -15,7 +15,13 @@ from generate.base import generate from lit_gpt import GPT, Config, Tokenizer -from lit_gpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision, load_checkpoint +from lit_gpt.utils import ( + CLI, + check_valid_checkpoint_dir, + get_default_supported_precision, + load_checkpoint, +) + from scripts.prepare_alpaca import generate_prompt @@ -23,8 +29,12 @@ def main( prompt: str = "What food do llamas eat?", input: str = "", finetuned_path: Path = Path("out/full/alpaca/lit_model_finetuned.pth"), - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), - quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"]] = None, + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), + quantize: Optional[ + Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"] + ] = None, max_new_tokens: int = 100, top_k: Optional[int] = 200, temperature: float = 0.8, @@ -55,8 +65,14 @@ def main( plugins = None if quantize 
is not None and quantize.startswith("bnb."): if "mixed" in precision: - raise ValueError("Quantization and mixed precision is not supported.") - dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + raise ValueError( + "Quantization and mixed precision is not supported." + ) + dtype = { + "16-true": torch.float16, + "bf16-true": torch.bfloat16, + "32-true": torch.float32, + }[precision] plugins = BitsandbytesPrecision(quantize[4:], dtype) precision = None @@ -76,11 +92,17 @@ def main( prompt_length = encoded.size(0) max_returned_tokens = prompt_length + max_new_tokens - fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr) + fabric.print( + f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", + file=sys.stderr, + ) t0 = time.perf_counter() with fabric.init_module(empty_init=True): model = GPT(config) - fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + fabric.print( + f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", + file=sys.stderr, + ) with fabric.init_tensor(): # set the max_seq_length to limit the memory usage to what we need model.max_seq_length = max_returned_tokens @@ -92,11 +114,21 @@ def main( t0 = time.perf_counter() load_checkpoint(fabric, model, checkpoint_path) - fabric.print(f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + fabric.print( + f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", + file=sys.stderr, + ) L.seed_everything(1234) t0 = time.perf_counter() - y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id) + y = generate( + model, + encoded, + max_returned_tokens, + temperature=temperature, + top_k=top_k, + eos_id=tokenizer.eos_id, + ) t = time.perf_counter() - t0 output = tokenizer.decode(y) @@ -104,9 +136,15 @@ def main( fabric.print(output) tokens_generated = y.size(0) - prompt_length - fabric.print(f"\n\nTime for inference: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", file=sys.stderr) + fabric.print( + f"\n\nTime for inference: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", + file=sys.stderr, + ) if fabric.device.type == "cuda": - fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", file=sys.stderr) + fabric.print( + f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", + file=sys.stderr, + ) if __name__ == "__main__": diff --git a/llm-lora-finetuning/generate/lora.py b/llm-lora-finetuning/generate/lora.py index 006b75ba..0b30b701 100644 --- a/llm-lora-finetuning/generate/lora.py +++ b/llm-lora-finetuning/generate/lora.py @@ -16,7 +16,13 @@ from generate.base import generate from lit_gpt import Tokenizer from lit_gpt.lora import GPT, Config, merge_lora_weights -from lit_gpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision, lazy_load +from lit_gpt.utils import ( + CLI, + check_valid_checkpoint_dir, + get_default_supported_precision, + lazy_load, +) + from scripts.prepare_alpaca import generate_prompt @@ -24,8 +30,12 @@ def main( prompt: str = "What food do llamas eat?", input: str = "", lora_path: Path = Path("out/lora/alpaca/lit_model_lora_finetuned.pth"), - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), - quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"]] = None, + checkpoint_dir: Path = 
Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), + quantize: Optional[ + Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"] + ] = None, max_new_tokens: int = 100, top_k: Optional[int] = 200, temperature: float = 0.8, @@ -65,8 +75,14 @@ def main( plugins = None if quantize is not None and quantize.startswith("bnb."): if "mixed" in precision: - raise ValueError("Quantization and mixed precision is not supported.") - dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + raise ValueError( + "Quantization and mixed precision is not supported." + ) + dtype = { + "16-true": torch.float16, + "bf16-true": torch.bfloat16, + "32-true": torch.float32, + }[precision] plugins = BitsandbytesPrecision(quantize[4:], dtype) precision = None @@ -97,11 +113,17 @@ def main( prompt_length = encoded.size(0) max_returned_tokens = prompt_length + max_new_tokens - fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr) + fabric.print( + f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", + file=sys.stderr, + ) t0 = time.perf_counter() with fabric.init_module(empty_init=True): model = GPT(config) - fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + fabric.print( + f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", + file=sys.stderr, + ) with fabric.init_tensor(): # set the max_seq_length to limit the memory usage to what we need model.max_seq_length = max_returned_tokens @@ -114,14 +136,24 @@ def main( lora_checkpoint = lazy_load(lora_path) checkpoint.update(lora_checkpoint.get("model", lora_checkpoint)) model.load_state_dict(checkpoint) - fabric.print(f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + fabric.print( + f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", + file=sys.stderr, + ) merge_lora_weights(model) model = fabric.setup(model) L.seed_everything(1234) t0 = time.perf_counter() - y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id) + y = generate( + model, + encoded, + max_returned_tokens, + temperature=temperature, + top_k=top_k, + eos_id=tokenizer.eos_id, + ) t = time.perf_counter() - t0 output = tokenizer.decode(y) @@ -129,9 +161,15 @@ def main( fabric.print(output) tokens_generated = y.size(0) - prompt_length - fabric.print(f"\n\nTime for inference: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", file=sys.stderr) + fabric.print( + f"\n\nTime for inference: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", + file=sys.stderr, + ) if fabric.device.type == "cuda": - fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", file=sys.stderr) + fabric.print( + f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", + file=sys.stderr, + ) if __name__ == "__main__": diff --git a/llm-lora-finetuning/generate/sequentially.py b/llm-lora-finetuning/generate/sequentially.py index cd1b1942..d2dde4bb 100644 --- a/llm-lora-finetuning/generate/sequentially.py +++ b/llm-lora-finetuning/generate/sequentially.py @@ -24,11 +24,17 @@ import generate.base as generate_base from lit_gpt import GPT, Config, Tokenizer from lit_gpt.model import Block, build_mask_cache -from lit_gpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision +from lit_gpt.utils import ( + CLI, + check_valid_checkpoint_dir, + 
get_default_supported_precision, +) @torch.inference_mode() -def sequential(model: GPT, root: torch.device, max_seq_length: int, devices: int): +def sequential( + model: GPT, root: torch.device, max_seq_length: int, devices: int +): if model.config.n_layer % devices: # TODO: support smarter partitioning schemes raise NotImplementedError( @@ -36,7 +42,9 @@ def sequential(model: GPT, root: torch.device, max_seq_length: int, devices: int ) layers_per_rank = model.config.n_layer // devices # dictates where each block should be instantiated - mapping = layer_to_device(model, chunk_on=Block, chunk_size=layers_per_rank) + mapping = layer_to_device( + model, chunk_on=Block, chunk_size=layers_per_rank + ) # materialize each block on the appropriate device for path, target_index in mapping.items(): @@ -44,11 +52,15 @@ def sequential(model: GPT, root: torch.device, max_seq_length: int, devices: int target_device = torch.device(root.type, target_index) print(f"Moving {path!r} to {target_device}", file=sys.stderr) # submodules loaded by the checkpoint will be on CPU (if no quantization). move them - replace_device(submodule, replace=torch.device("cpu"), by=target_device) + replace_device( + submodule, replace=torch.device("cpu"), by=target_device + ) # in case the checkpoint was partial, materialize leftover metas _materialize_meta_tensors(submodule, target_device) # and build the kv cache - submodule.attn.kv_cache = submodule.attn.build_kv_cache(1, max_seq_length, model.cos.size(-1), target_device) + submodule.attn.kv_cache = submodule.attn.build_kv_cache( + 1, max_seq_length, model.cos.size(-1), target_device + ) # rebuild odd ends with root: model.max_seq_length = max_seq_length @@ -70,9 +82,13 @@ def sequential(model: GPT, root: torch.device, max_seq_length: int, devices: int # TODO: the second case could be optimized and then we would only need this hook for # `layer_num in [layers_per_rank * i - 1 for i in range(1, devices + 1)]` target_device = torch.device(root.type, target_index) - submodule.register_forward_pre_hook(partial(move_block_input, target_device)) + submodule.register_forward_pre_hook( + partial(move_block_input, target_device) + ) if layer_num == model.config.n_layer - 1: - submodule.register_forward_hook(partial(move_block_output, root)) + submodule.register_forward_hook( + partial(move_block_output, root) + ) return model @@ -82,7 +98,11 @@ def layer_to_device( ) -> "OrderedDict[str, int]": """Create a mapping from layer (block) to device.""" # this assumes that the definition order is the same as the execution order - hits = [name for name, submodule in module.named_modules() if isinstance(submodule, chunk_on)] + hits = [ + name + for name, submodule in module.named_modules() + if isinstance(submodule, chunk_on) + ] return OrderedDict((name, i // chunk_size) for i, name in enumerate(hits)) @@ -92,22 +112,31 @@ def move_block_input(device: torch.device, module: torch.nn.Module, ins): return tuple(t.to(device) for t in ins) -def move_block_output(device: torch.device, module: torch.nn.Module, ins, outs) -> torch.Tensor: +def move_block_output( + device: torch.device, module: torch.nn.Module, ins, outs +) -> torch.Tensor: """``forward_hook`` to move a Block's output after forward.""" return outs.to(device) -def replace_device(module: torch.nn.Module, replace: torch.device, by: torch.device) -> torch.nn.Module: +def replace_device( + module: torch.nn.Module, replace: torch.device, by: torch.device +) -> torch.nn.Module: for name, submodule in module.named_modules(): tensors = dict( 
- itertools.chain(submodule.named_parameters(recurse=False), submodule.named_buffers(recurse=False)) + itertools.chain( + submodule.named_parameters(recurse=False), + submodule.named_buffers(recurse=False), + ) ) if not tensors: continue devices = {t.device for t in tensors.values()} if len(devices) != 1: # since this is using `submodule.to`, different devices in the same submodule is a problem - path_to_device = {f"{name}.{p}": t.device for p, t in tensors.items()} + path_to_device = { + f"{name}.{p}": t.device for p, t in tensors.items() + } raise ValueError(f"Found multiple devices: {path_to_device}") if devices.pop() == replace: submodule.to(by) @@ -122,8 +151,12 @@ def main( max_new_tokens: int = 50, top_k: Optional[int] = 200, temperature: float = 0.8, - checkpoint_dir: Path = Path("checkpoints/mistralai/Mistral-7B-Instruct-v0.1"), - quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq"]] = None, + checkpoint_dir: Path = Path( + "checkpoints/mistralai/Mistral-7B-Instruct-v0.1" + ), + quantize: Optional[ + Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq"] + ] = None, precision: Optional[str] = None, compile: bool = False, ) -> None: @@ -150,12 +183,20 @@ def main( if compile: raise NotImplementedError # untested if "mixed" in precision: - raise ValueError("Quantization and mixed precision is not supported.") - dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + raise ValueError( + "Quantization and mixed precision is not supported." + ) + dtype = { + "16-true": torch.float16, + "bf16-true": torch.bfloat16, + "32-true": torch.float32, + }[precision] plugins = BitsandbytesPrecision(quantize[4:], dtype) precision = None - fabric = L.Fabric(devices=1, precision=precision, accelerator="cuda", plugins=plugins) + fabric = L.Fabric( + devices=1, precision=precision, accelerator="cuda", plugins=plugins + ) total_devices = CUDAAccelerator.auto_device_count() print(f"Using {total_devices} devices", file=sys.stderr) @@ -171,26 +212,42 @@ def main( prompt_length = encoded.size(0) max_returned_tokens = prompt_length + max_new_tokens - print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr) + print( + f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", + file=sys.stderr, + ) t0 = time.perf_counter() # cannot use `init_module` because if bitsandbytes is used, the Linear layers will be replaced # which means that the weights will get quantized on cuda:0 on checkpoint load. we need to load and then convert # still, use init_tensor for the precision with fabric.init_tensor(), torch.device("meta"): model = GPT(config) - print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + print( + f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", + file=sys.stderr, + ) t0 = time.perf_counter() - state_dict = torch.load(str(checkpoint_path), mmap=True, map_location="cpu") + state_dict = torch.load( + str(checkpoint_path), mmap=True, map_location="cpu" + ) # TODO: this assumes that the model fits on CPU. 
Use lazy_load and make the materialization checkpoint aware model.load_state_dict(state_dict, assign=True) - print(f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + print( + f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", + file=sys.stderr, + ) model = fabric.setup_module(model, move_to_device=False) t0 = time.perf_counter() - model = sequential(model, fabric.device, max_returned_tokens, total_devices) - print(f"Time to sequential-ize the model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + model = sequential( + model, fabric.device, max_returned_tokens, total_devices + ) + print( + f"Time to sequential-ize the model: {time.perf_counter() - t0:.02f} seconds.", + file=sys.stderr, + ) if compile: # TODO: raises an internal compile AssertionError caused by fabric.strategy.precision.forward_context @@ -198,7 +255,9 @@ def main( # silence developer warning on nightly builds # https://github.com/pytorch/pytorch/blob/v2.2.0-rc5/torch/_inductor/ir.py#L4166 pattern = re.compile(".*DeviceCopy in input program.*") - logging.getLogger("torch._inductor.utils").addFilter(lambda record: not pattern.search(record.getMessage())) + logging.getLogger("torch._inductor.utils").addFilter( + lambda record: not pattern.search(record.getMessage()) + ) torch._dynamo.config.automatic_dynamic_shapes = True torch._inductor.config.triton.unique_kernel_names = True torch._inductor.config.coordinate_descent_tuning = True @@ -210,7 +269,12 @@ def main( for i in range(num_samples): t0 = time.perf_counter() y = generate_base.generate( - model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id + model, + encoded, + max_returned_tokens, + temperature=temperature, + top_k=top_k, + eos_id=tokenizer.eos_id, ) t = time.perf_counter() - t0 for block in model.transformer.h: @@ -218,14 +282,20 @@ def main( print(tokenizer.decode(y)) tokens_generated = y.size(0) - prompt_length print( - f"Time for inference {i + 1}: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", file=sys.stderr + f"Time for inference {i + 1}: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", + file=sys.stderr, ) - print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", file=sys.stderr) + print( + f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", + file=sys.stderr, + ) if __name__ == "__main__": torch.set_float32_matmul_precision("high") - logging.getLogger("lightning.fabric.plugins.precision.bitsandbytes").setLevel(logging.DEBUG) + logging.getLogger( + "lightning.fabric.plugins.precision.bitsandbytes" + ).setLevel(logging.DEBUG) CLI(main) diff --git a/llm-lora-finetuning/generate/tp.py b/llm-lora-finetuning/generate/tp.py index abd93cc1..e8c7e1ef 100644 --- a/llm-lora-finetuning/generate/tp.py +++ b/llm-lora-finetuning/generate/tp.py @@ -22,19 +22,30 @@ import generate.base as generate_base from lit_gpt import GPT, Config, Tokenizer from lit_gpt.model import CausalSelfAttention, GptNeoxMLP, LLaMAMLP, LLaMAMoE -from lit_gpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision +from lit_gpt.utils import ( + CLI, + check_valid_checkpoint_dir, + get_default_supported_precision, +) -def tensor_parallel_linear(fabric: L.Fabric, linear: torch.nn.Linear, style: str) -> None: +def tensor_parallel_linear( + fabric: L.Fabric, linear: torch.nn.Linear, style: str +) -> None: world_size = fabric.world_size - dim, attr = {"colwise": (0, "out_features"), "rowwise": (1, 
"in_features")}[style] + dim, attr = { + "colwise": (0, "out_features"), + "rowwise": (1, "in_features"), + }[style] size = getattr(linear, attr) if size % world_size != 0: raise ValueError( f"This linear's {attr} value ({size}) is not evenly divisible by the world size ({world_size})" ) - shard = torch.tensor_split(linear.weight, world_size, dim=dim)[fabric.global_rank] + shard = torch.tensor_split(linear.weight, world_size, dim=dim)[ + fabric.global_rank + ] # overwrite `.data` instead of recreating the parameter for quantization (bitsandbytes) support. # the bitsandbytes linear classes use custom `torch.nn.Parameter` subclasses linear.weight.data = shard @@ -42,19 +53,27 @@ def tensor_parallel_linear(fabric: L.Fabric, linear: torch.nn.Linear, style: str if linear.bias is not None and dim == 0: shard = torch.tensor_split(linear.bias, world_size)[fabric.global_rank] - linear.bias = torch.nn.Parameter(shard, requires_grad=linear.bias.requires_grad) + linear.bias = torch.nn.Parameter( + shard, requires_grad=linear.bias.requires_grad + ) -def tensor_parallel_mlp(fabric: L.Fabric, mlp: Union[GptNeoxMLP, LLaMAMLP, LLaMAMoE]) -> None: +def tensor_parallel_mlp( + fabric: L.Fabric, mlp: Union[GptNeoxMLP, LLaMAMLP, LLaMAMoE] +) -> None: if isinstance(mlp, LLaMAMLP): tensor_parallel_linear(fabric, mlp.fc_1, "colwise") tensor_parallel_linear(fabric, mlp.fc_2, "colwise") tensor_parallel_linear(fabric, mlp.proj, "rowwise") - mlp.register_forward_hook(partial(all_reduce_output, fabric.world_size)) + mlp.register_forward_hook( + partial(all_reduce_output, fabric.world_size) + ) elif isinstance(mlp, GptNeoxMLP): tensor_parallel_linear(fabric, mlp.fc, "colwise") tensor_parallel_linear(fabric, mlp.proj, "rowwise") - mlp.register_forward_hook(partial(all_reduce_output, fabric.world_size)) + mlp.register_forward_hook( + partial(all_reduce_output, fabric.world_size) + ) elif isinstance(mlp, LLaMAMoE): # we use expert slicing across ranks, alternatively, we could create a expert parallelism group # when the number of experts is a multiple of the world size @@ -70,7 +89,9 @@ def tensor_parallel_attn(fabric: L.Fabric, attn: CausalSelfAttention) -> None: attn.register_forward_hook(partial(all_reduce_output, fabric.world_size)) -def all_reduce_output(world_size: int, module: torch.nn.Module, ins, outs) -> torch.Tensor: +def all_reduce_output( + world_size: int, module: torch.nn.Module, ins, outs +) -> torch.Tensor: return all_reduce(outs, "sum", list(range(world_size))) @@ -86,7 +107,9 @@ def tensor_parallel(fabric: L.Fabric, model: GPT) -> GPT: for attr in attrs: size = getattr(model.config, attr) if size % world_size != 0: - raise ValueError(f"This {attr} value ({size}) is not evenly divisible by the world size ({world_size})") + raise ValueError( + f"This {attr} value ({size}) is not evenly divisible by the world size ({world_size})" + ) setattr(model.config, attr, size // world_size) return model @@ -100,8 +123,12 @@ def main( max_new_tokens: int = 50, top_k: Optional[int] = 200, temperature: float = 0.8, - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), - quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq"]] = None, + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), + quantize: Optional[ + Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq"] + ] = None, precision: Optional[str] = None, compile: bool = False, ) -> None: @@ -128,13 +155,21 @@ def main( if compile: raise NotImplementedError # untested if "mixed" in 
precision: - raise ValueError("Quantization and mixed precision is not supported.") - dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + raise ValueError( + "Quantization and mixed precision is not supported." + ) + dtype = { + "16-true": torch.float16, + "bf16-true": torch.bfloat16, + "32-true": torch.float32, + }[precision] plugins = BitsandbytesPrecision(quantize[4:], dtype) precision = None # set "ddp" as the strategy for the launching functionality, but there's no data-parallelism - fabric = L.Fabric(devices="auto", strategy="ddp", precision=precision, plugins=plugins) + fabric = L.Fabric( + devices="auto", strategy="ddp", precision=precision, plugins=plugins + ) fabric.launch() check_valid_checkpoint_dir(checkpoint_dir) @@ -149,23 +184,34 @@ def main( prompt_length = encoded.size(0) max_returned_tokens = prompt_length + max_new_tokens - fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr) + fabric.print( + f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", + file=sys.stderr, + ) t0 = time.perf_counter() # cannot use `init_module` because if bitsandbytes is used, the Linear layers will be replaced # which means that the weights will get quantized on cuda:0 on checkpoint load. we need to load and then convert # still, use init_tensor for the precision with fabric.init_tensor(), torch.device("meta"): model = GPT(config) - fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + fabric.print( + f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", + file=sys.stderr, + ) # sequentially do: load the checkpoint on CPU -> quantize -> apply tp -> move to device # so that the CPU RAM doesn't OOM with larger models for rank in range(fabric.world_size): if fabric.global_rank == rank: t0 = time.perf_counter() - state_dict = torch.load(str(checkpoint_path), mmap=True, map_location="cpu") + state_dict = torch.load( + str(checkpoint_path), mmap=True, map_location="cpu" + ) model.load_state_dict(state_dict, assign=True) - print(f"[{rank}] Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + print( + f"[{rank}] Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", + file=sys.stderr, + ) # cannot use `.setup_module` because it will wrap with DDP model = fabric._precision.convert_module(model) @@ -188,20 +234,30 @@ def main( t0 = time.perf_counter() model = fabric.to_device(model) - print(f"[{rank}] Time to move the model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + print( + f"[{rank}] Time to move the model: {time.perf_counter() - t0:.02f} seconds.", + file=sys.stderr, + ) fabric.barrier() if compile: torch._dynamo.config.automatic_dynamic_shapes = True torch._inductor.config.triton.unique_kernel_names = True torch._inductor.config.coordinate_descent_tuning = True - generate_base.next_token = torch.compile(generate_base.next_token, mode="reduce-overhead") + generate_base.next_token = torch.compile( + generate_base.next_token, mode="reduce-overhead" + ) L.seed_everything(1234) for i in range(num_samples): t0 = time.perf_counter() y = generate_base.generate( - model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id + model, + encoded, + max_returned_tokens, + temperature=temperature, + top_k=top_k, + eos_id=tokenizer.eos_id, ) t = time.perf_counter() - t0 for block in model.transformer.h: @@ -209,16 +265,22 @@ 
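
The rank-staggered loading above relies on `torch.load(..., mmap=True)` together with `load_state_dict(..., assign=True)` so the checkpoint is not copied into RAM twice. A self-contained sketch of that pattern on a toy module; a reasonably recent PyTorch (roughly 2.1+) is assumed for both arguments:

```python
import tempfile
from pathlib import Path

import torch

model = torch.nn.Linear(4, 4)
ckpt = Path(tempfile.mkdtemp()) / "lit_model.pth"
torch.save(model.state_dict(), ckpt)

# A fresh copy of the module; with a real GPT this would be built on the
# "meta" device so no memory is allocated until the checkpoint arrives.
restored = torch.nn.Linear(4, 4)

# mmap=True keeps the file on disk instead of reading it all into RAM, and
# assign=True re-uses the loaded tensors directly instead of copying them
# into the existing parameters.
state_dict = torch.load(str(ckpt), mmap=True, map_location="cpu")
restored.load_state_dict(state_dict, assign=True)

assert torch.equal(restored.weight, model.weight)
```
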
def main( fabric.print(tokenizer.decode(y)) tokens_generated = y.size(0) - prompt_length fabric.print( - f"Time for inference {i + 1}: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", file=sys.stderr + f"Time for inference {i + 1}: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", + file=sys.stderr, ) if fabric.device.type == "cuda": - fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", file=sys.stderr) + fabric.print( + f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", + file=sys.stderr, + ) if __name__ == "__main__": torch.set_float32_matmul_precision("high") - bnb_logger = logging.getLogger("lightning.fabric.plugins.precision.bitsandbytes") + bnb_logger = logging.getLogger( + "lightning.fabric.plugins.precision.bitsandbytes" + ) bnb_logger.setLevel(logging.DEBUG) bnb_logger.debug = rank_zero_only(bnb_logger.debug) diff --git a/llm-lora-finetuning/lit_gpt/__init__.py b/llm-lora-finetuning/lit_gpt/__init__.py index 856e7cd6..9eac3836 100644 --- a/llm-lora-finetuning/lit_gpt/__init__.py +++ b/llm-lora-finetuning/lit_gpt/__init__.py @@ -1,14 +1,14 @@ # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. -import re import logging +import re + +from lightning_utilities.core.imports import RequirementCache -from lit_gpt.model import GPT from lit_gpt.config import Config +from lit_gpt.model import GPT from lit_gpt.tokenizer import Tokenizer -from lightning_utilities.core.imports import RequirementCache - _LIGHTNING_AVAILABLE = RequirementCache("lightning>=2.2.0.dev0") if not bool(_LIGHTNING_AVAILABLE): raise ImportError( @@ -18,7 +18,9 @@ # Suppress excessive warnings, see https://github.com/pytorch/pytorch/issues/111632 pattern = re.compile(".*Profiler function .* will be ignored") -logging.getLogger("torch._dynamo.variables.torch").addFilter(lambda record: not pattern.search(record.getMessage())) +logging.getLogger("torch._dynamo.variables.torch").addFilter( + lambda record: not pattern.search(record.getMessage()) +) # Avoid printing state-dict profiling output at the WARNING level when saving a checkpoint logging.getLogger("torch.distributed.fsdp._optim_utils").disabled = True diff --git a/llm-lora-finetuning/lit_gpt/adapter.py b/llm-lora-finetuning/lit_gpt/adapter.py index 044b75d5..61744419 100644 --- a/llm-lora-finetuning/lit_gpt/adapter.py +++ b/llm-lora-finetuning/lit_gpt/adapter.py @@ -29,18 +29,23 @@ class Config(BaseConfig): class GPT(BaseModel): """The implementation is identical to `lit_gpt.model.GPT` with the exception that - the `Block` saves the layer index and passes it down to the attention layer.""" + the `Block` saves the layer index and passes it down to the attention layer. 
+ """ def __init__(self, config: Config) -> None: nn.Module.__init__(self) assert config.padded_vocab_size is not None self.config = config - self.lm_head = nn.Linear(config.n_embd, config.padded_vocab_size, bias=config.lm_head_bias) + self.lm_head = nn.Linear( + config.n_embd, config.padded_vocab_size, bias=config.lm_head_bias + ) self.transformer = nn.ModuleDict( dict( wte=nn.Embedding(config.padded_vocab_size, config.n_embd), - h=nn.ModuleList(Block(config, i) for i in range(config.n_layer)), + h=nn.ModuleList( + Block(config, i) for i in range(config.n_layer) + ), ln_f=config.norm_class(config.n_embd, eps=config.norm_eps), ) ) @@ -48,11 +53,16 @@ def __init__(self, config: Config) -> None: self.mask_cache: Optional[torch.Tensor] = None def forward( - self, idx: torch.Tensor, input_pos: Optional[torch.Tensor] = None, lm_head_chunk_size: int = 0 + self, + idx: torch.Tensor, + input_pos: Optional[torch.Tensor] = None, + lm_head_chunk_size: int = 0, ) -> Union[torch.Tensor, List[torch.Tensor]]: T = idx.size(1) if self.max_seq_length < T: - raise ValueError(f"Cannot forward sequence of length {T}, max seq length is only {self.max_seq_length}.") + raise ValueError( + f"Cannot forward sequence of length {T}, max seq length is only {self.max_seq_length}." + ) if input_pos is not None: # use the kv cache cos = self.cos.index_select(0, input_pos) @@ -65,13 +75,17 @@ def forward( sin = self.sin[:T] mask = None - x = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd) + x = self.transformer.wte( + idx + ) # token embeddings of shape (b, t, n_embd) for block in self.transformer.h: x = block(x, cos, sin, mask, input_pos) x = self.transformer.ln_f(x) if lm_head_chunk_size > 0: # chunk the lm head logits to reduce the peak memory used by autograd - return [self.lm_head(x_i) for x_i in x.split(lm_head_chunk_size, dim=1)] + return [ + self.lm_head(x_i) for x_i in x.split(lm_head_chunk_size, dim=1) + ] return self.lm_head(x) # (b, t, vocab_size) @classmethod @@ -109,15 +123,25 @@ def __init__(self, config: Config, block_idx: int) -> None: super().__init__(config) if block_idx >= config.adapter_start_layer: # adapter embedding layer - self.adapter_wte = nn.Embedding(config.adapter_prompt_length, config.n_embd) + self.adapter_wte = nn.Embedding( + config.adapter_prompt_length, config.n_embd + ) # gate for adaption - self.gating_factor = torch.nn.Parameter(torch.zeros(1, 1, config.n_head, 1)) + self.gating_factor = torch.nn.Parameter( + torch.zeros(1, 1, config.n_head, 1) + ) # kv cache for inference - self.adapter_kv_cache: Optional[Tuple[torch.Tensor, torch.Tensor]] = None + self.adapter_kv_cache: Optional[ + Tuple[torch.Tensor, torch.Tensor] + ] = None self.block_idx = block_idx def scaled_dot_product_attention( - self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, mask: Optional[torch.Tensor] = None + self, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + mask: Optional[torch.Tensor] = None, ) -> torch.Tensor: y = super().scaled_dot_product_attention(q, k, v, mask) if self.block_idx < self.config.adapter_start_layer: @@ -132,15 +156,25 @@ def scaled_dot_product_attention( prefix = self.adapter_wte.weight.reshape(1, aT, self.config.n_embd) aqkv = self.attn(prefix) q_per_kv = self.config.n_head // self.config.n_query_groups - aqkv = aqkv.view(1, aT, self.config.n_query_groups, q_per_kv + 2, self.config.head_size) + aqkv = aqkv.view( + 1, + aT, + self.config.n_query_groups, + q_per_kv + 2, + self.config.head_size, + ) aqkv = aqkv.permute(0, 2, 3, 1, 4) _, ak, av = 
aqkv.split((q_per_kv, 1, 1), dim=2) if self.config.n_query_groups != 1: # for MHA this is a no-op ak = ak.repeat_interleave(q_per_kv, dim=2) av = av.repeat_interleave(q_per_kv, dim=2) - ak = ak.view(1, -1, aT, self.config.head_size) # (1, nh_ak, aT, hs) - av = av.view(1, -1, aT, self.config.head_size) # (1, nh_av, aT, hs) + ak = ak.view( + 1, -1, aT, self.config.head_size + ) # (1, nh_ak, aT, hs) + av = av.view( + 1, -1, aT, self.config.head_size + ) # (1, nh_av, aT, hs) self.adapter_kv_cache = (ak, av) T = q.size(2) @@ -151,9 +185,13 @@ def scaled_dot_product_attention( def reset_parameters(self) -> None: torch.nn.init.zeros_(self.gating_factor) - def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None: + def _load_from_state_dict( + self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any + ) -> None: """For compatibility with older checkpoints.""" - if (key := prefix + "gating_factor") in state_dict and state_dict[key].size(1) == self.config.n_head: + if (key := prefix + "gating_factor") in state_dict and state_dict[ + key + ].size(1) == self.config.n_head: state_dict[key] = state_dict[key].permute(0, 2, 1, 3) super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) diff --git a/llm-lora-finetuning/lit_gpt/adapter_v2.py b/llm-lora-finetuning/lit_gpt/adapter_v2.py index 51b826a2..5d389471 100644 --- a/llm-lora-finetuning/lit_gpt/adapter_v2.py +++ b/llm-lora-finetuning/lit_gpt/adapter_v2.py @@ -51,8 +51,12 @@ class AdapterV2Linear(torch.nn.Module): def __init__(self, in_features: int, out_features: int, **kwargs) -> None: super().__init__() self.linear = torch.nn.Linear(in_features, out_features, **kwargs) - self.adapter_bias = torch.nn.Parameter(torch.zeros(out_features), requires_grad=False) - self.adapter_scale = torch.nn.Parameter(torch.ones(out_features), requires_grad=False) + self.adapter_bias = torch.nn.Parameter( + torch.zeros(out_features), requires_grad=False + ) + self.adapter_scale = torch.nn.Parameter( + torch.ones(out_features), requires_grad=False + ) def forward(self, x: torch.Tensor) -> torch.Tensor: return self.adapter_scale * (self.linear(x) + self.adapter_bias) @@ -69,11 +73,15 @@ def __init__(self, config: Config) -> None: assert config.padded_vocab_size is not None self.config = config - self.lm_head = AdapterV2Linear(config.n_embd, config.padded_vocab_size, bias=config.lm_head_bias) + self.lm_head = AdapterV2Linear( + config.n_embd, config.padded_vocab_size, bias=config.lm_head_bias + ) self.transformer = nn.ModuleDict( dict( wte=nn.Embedding(config.padded_vocab_size, config.n_embd), - h=nn.ModuleList(Block(config, i) for i in range(config.n_layer)), + h=nn.ModuleList( + Block(config, i) for i in range(config.n_layer) + ), ln_f=config.norm_class(config.n_embd, eps=config.norm_eps), ) ) @@ -90,9 +98,14 @@ def _init_weights(self, module: nn.Module) -> None: if isinstance(module, AdapterV2Linear): module.reset_parameters() - def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None: + def _load_from_state_dict( + self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any + ) -> None: """For compatibility with base checkpoints.""" - mapping = {"lm_head.weight": "lm_head.linear.weight", "lm_head.bias": "lm_head.linear.bias"} + mapping = { + "lm_head.weight": "lm_head.linear.weight", + "lm_head.bias": "lm_head.linear.bias", + } state_dict = map_old_state_dict_weights(state_dict, mapping, prefix) super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) @@ 
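
`AdapterV2Linear` wraps a frozen linear layer with a learned per-feature scale and bias. A toy restatement showing that the initial values (scale 1, bias 0) leave the pretrained output unchanged:

```python
import torch

class TinyAdapterV2Linear(torch.nn.Module):
    """Toy restatement of the AdapterV2Linear wrapper in the hunk above."""

    def __init__(self, in_features: int, out_features: int) -> None:
        super().__init__()
        self.linear = torch.nn.Linear(in_features, out_features)
        self.adapter_bias = torch.nn.Parameter(torch.zeros(out_features))
        self.adapter_scale = torch.nn.Parameter(torch.ones(out_features))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.adapter_scale * (self.linear(x) + self.adapter_bias)

layer = TinyAdapterV2Linear(8, 4)
x = torch.randn(2, 8)
# With scale=1 and bias=0 (their initial values) the wrapper is an identity
# around the wrapped linear layer, so pretrained behaviour is preserved.
assert torch.allclose(layer(x), layer.linear(x))
```
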
-121,25 +134,37 @@ def __init__(self, config: Config, block_idx: int) -> None: nn.Module.__init__(self) shape = (config.n_head + 2 * config.n_query_groups) * config.head_size # key, query, value projections for all heads, but in a batch - self.attn = AdapterV2Linear(in_features=config.n_embd, out_features=shape, bias=config.bias) + self.attn = AdapterV2Linear( + in_features=config.n_embd, out_features=shape, bias=config.bias + ) # output projection # if `head_size` is explicitly specified in the config, `n_emd` might not be equal to `head_size * n_head` - self.proj = AdapterV2Linear(config.head_size * config.n_head, config.n_embd, bias=config.bias) + self.proj = AdapterV2Linear( + config.head_size * config.n_head, config.n_embd, bias=config.bias + ) # disabled by default self.kv_cache: Optional[KVCache] = None if block_idx >= config.adapter_start_layer: # adapter embedding layer - self.adapter_wte = nn.Embedding(config.adapter_prompt_length, config.n_embd) + self.adapter_wte = nn.Embedding( + config.adapter_prompt_length, config.n_embd + ) # gate for adaption - self.gating_factor = torch.nn.Parameter(torch.zeros(1, 1, config.n_head, 1)) + self.gating_factor = torch.nn.Parameter( + torch.zeros(1, 1, config.n_head, 1) + ) # kv cache for inference - self.adapter_kv_cache: Optional[Tuple[torch.Tensor, torch.Tensor]] = None + self.adapter_kv_cache: Optional[ + Tuple[torch.Tensor, torch.Tensor] + ] = None self.block_idx = block_idx self.config = config - def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None: + def _load_from_state_dict( + self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any + ) -> None: """For compatibility with base checkpoints.""" mapping = { "attn.weight": "attn.linear.weight", @@ -149,7 +174,9 @@ def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwa } state_dict = map_old_state_dict_weights(state_dict, mapping, prefix) # For compatibility with older checkpoints - if (key := prefix + "gating_factor") in state_dict and state_dict[key].size(1) == self.config.n_head: + if (key := prefix + "gating_factor") in state_dict and state_dict[ + key + ].size(1) == self.config.n_head: state_dict[key] = state_dict[key].permute(0, 2, 1, 3) super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) @@ -157,12 +184,18 @@ def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwa class GptNeoxMLP(lit_gpt.model.GptNeoxMLP): def __init__(self, config: Config) -> None: nn.Module.__init__(self) - self.fc = AdapterV2Linear(config.n_embd, config.intermediate_size, bias=config.bias) - self.proj = AdapterV2Linear(config.intermediate_size, config.n_embd, bias=config.bias) + self.fc = AdapterV2Linear( + config.n_embd, config.intermediate_size, bias=config.bias + ) + self.proj = AdapterV2Linear( + config.intermediate_size, config.n_embd, bias=config.bias + ) self.config = config - def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None: + def _load_from_state_dict( + self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any + ) -> None: """For compatibility with base checkpoints.""" mapping = { "fc.weight": "fc.linear.weight", @@ -177,11 +210,19 @@ def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwa class LLaMAMLP(lit_gpt.model.LLaMAMLP): def __init__(self, config: Config) -> None: nn.Module.__init__(self) - self.fc_1 = AdapterV2Linear(config.n_embd, config.intermediate_size, bias=config.bias) - self.fc_2 = 
AdapterV2Linear(config.n_embd, config.intermediate_size, bias=config.bias) - self.proj = AdapterV2Linear(config.intermediate_size, config.n_embd, bias=config.bias) + self.fc_1 = AdapterV2Linear( + config.n_embd, config.intermediate_size, bias=config.bias + ) + self.fc_2 = AdapterV2Linear( + config.n_embd, config.intermediate_size, bias=config.bias + ) + self.proj = AdapterV2Linear( + config.intermediate_size, config.n_embd, bias=config.bias + ) - def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None: + def _load_from_state_dict( + self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any + ) -> None: """For compatibility with base checkpoints.""" mapping = { "fc_1.weight": "fc_1.linear.weight", @@ -207,11 +248,15 @@ class LLaMAMoE(lit_gpt.model.LLaMAMoE): def __init__(self, config: Config) -> None: nn.Module.__init__(self) self.gate = AdapterV2Linear(config.n_embd, config.n_expert, bias=False) - self.experts = nn.ModuleList(LLaMAMLP(config) for _ in range(config.n_expert)) + self.experts = nn.ModuleList( + LLaMAMLP(config) for _ in range(config.n_expert) + ) self.config = config - def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None: + def _load_from_state_dict( + self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any + ) -> None: """For compatibility with base checkpoints.""" mapping = {"gate.weight": "gate.linear.weight"} state_dict = map_old_state_dict_weights(state_dict, mapping, prefix) diff --git a/llm-lora-finetuning/lit_gpt/args.py b/llm-lora-finetuning/lit_gpt/args.py index 62217076..264c8f51 100644 --- a/llm-lora-finetuning/lit_gpt/args.py +++ b/llm-lora-finetuning/lit_gpt/args.py @@ -37,13 +37,17 @@ class TrainArgs: def max_iters(self, devices: int) -> int: """Number of iterations""" - max_iters = self.epochs * self.epoch_size // devices // self.micro_batch_size + max_iters = ( + self.epochs * self.epoch_size // devices // self.micro_batch_size + ) assert max_iters > 0 return max_iters def gradient_accumulation_iters(self, devices: int) -> int: """Number of iterations between gradient synchronizations""" - gradient_accumulation_iters = self.batch_size(devices) // self.micro_batch_size + gradient_accumulation_iters = ( + self.batch_size(devices) // self.micro_batch_size + ) assert gradient_accumulation_iters > 0 return gradient_accumulation_iters diff --git a/llm-lora-finetuning/lit_gpt/config.py b/llm-lora-finetuning/lit_gpt/config.py index 4c73dc6b..dab1523b 100644 --- a/llm-lora-finetuning/lit_gpt/config.py +++ b/llm-lora-finetuning/lit_gpt/config.py @@ -54,7 +54,9 @@ class Config: shared_attention_norm: bool = False _norm_class: Literal["LayerNorm", "RMSNorm"] = "LayerNorm" norm_eps: float = 1e-5 - _mlp_class: Literal["GptNeoxMLP", "LLaMAMLP", "GemmaMLP", "LLaMAMoE"] = "GptNeoxMLP" + _mlp_class: Literal[ + "GptNeoxMLP", "LLaMAMLP", "GemmaMLP", "LLaMAMoE" + ] = "GptNeoxMLP" gelu_approximate: str = "none" intermediate_size: Optional[int] = None rope_condense_ratio: int = 1 @@ -72,7 +74,9 @@ def __post_init__(self): # vocab size should be a power of 2 to be optimal on hardware. 
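
A worked example of the `TrainArgs` helpers in the `args.py` hunk above, with made-up numbers; `batch_size_per_device` stands in for whatever `TrainArgs.batch_size(devices)` returns, since that method is not shown here:

```python
# Worked example of the TrainArgs arithmetic, with made-up numbers.
epochs, epoch_size = 3, 50_000        # passes over the data, samples per pass
devices, micro_batch_size = 4, 8      # GPUs, samples per forward/backward per GPU
batch_size_per_device = 32            # assumed result of TrainArgs.batch_size(devices)

max_iters = epochs * epoch_size // devices // micro_batch_size
gradient_accumulation_iters = batch_size_per_device // micro_batch_size

print(max_iters)                      # 4687 micro-iterations per device
print(gradient_accumulation_iters)    # 4 micro-batches between optimizer steps
assert max_iters > 0 and gradient_accumulation_iters > 0
```
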
compute the closest value if self.padded_vocab_size is None: - self.padded_vocab_size = find_multiple(self.vocab_size, self.padding_multiple) + self.padded_vocab_size = find_multiple( + self.vocab_size, self.padding_multiple + ) else: # vocab size shouldn't be larger than padded vocab size self.vocab_size = min(self.vocab_size, self.padded_vocab_size) @@ -86,7 +90,9 @@ def __post_init__(self): # compute the intermediate size for MLP if not set if self.intermediate_size is None: if self._mlp_class == "LLaMAMLP": - raise ValueError("The config needs to set the `intermediate_size`") + raise ValueError( + "The config needs to set the `intermediate_size`" + ) self.intermediate_size = 4 * self.n_embd self.rope_n_elem = int(self.rotary_percentage * self.head_size) @@ -96,7 +102,11 @@ def from_name(cls, name: str, **kwargs: Any) -> Self: if name not in name_to_config: # search through all `config['hf_config']['name']` try: - conf_dict = next(config for config in configs if name == config["hf_config"]["name"]) + conf_dict = next( + config + for config in configs + if name == config["hf_config"]["name"] + ) except StopIteration: raise ValueError(f"{name!r} is not a supported config name") else: @@ -113,13 +123,21 @@ def from_json(cls, path: Union[str, Path], **kwargs: Any) -> Self: with open(path, encoding="utf-8") as fp: json_kwargs = json.load(fp) if "condense_ratio" in json_kwargs: # legacy name - json_kwargs["rope_condense_ratio"] = json_kwargs.pop("condense_ratio") + json_kwargs["rope_condense_ratio"] = json_kwargs.pop( + "condense_ratio" + ) if "condense_ratio" in kwargs: # legacy name kwargs["rope_condense_ratio"] = kwargs.pop("condense_ratio") if "org" in json_kwargs: # legacy name - json_kwargs["hf_config"] = {"name": json_kwargs["name"], "org": json_kwargs.pop("org")} + json_kwargs["hf_config"] = { + "name": json_kwargs["name"], + "org": json_kwargs.pop("org"), + } if "org" in kwargs: # legacy name - kwargs["hf_config"] = {"name": kwargs.get("name", json_kwargs["name"]), "org": kwargs.pop("org")} + kwargs["hf_config"] = { + "name": kwargs.get("name", json_kwargs["name"]), + "org": kwargs.pop("org"), + } json_kwargs.update(kwargs) return cls(**json_kwargs) @@ -130,7 +148,9 @@ def from_checkpoint(cls, path: Path, **kwargs: Any) -> Self: return cls.from_json(config_path, **kwargs) if (model_name := path.name) in name_to_config: return cls.from_name(model_name, **kwargs) - raise FileNotFoundError(f"For {str(path)!r} neither 'lit_config.json' nor matching config exists.") + raise FileNotFoundError( + f"For {str(path)!r} neither 'lit_config.json' nor matching config exists." 
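
The padding logic above rounds the vocabulary up to a multiple of `padding_multiple` via `find_multiple`. A sketch of that rounding; the early-return guard for exact multiples is assumed, since only the tail of the helper appears elsewhere in this patch:

```python
def find_multiple(n: int, k: int) -> int:
    # Round n up to the next multiple of k (returns n if it already is one).
    if n % k == 0:
        return n
    return n + k - (n % k)

vocab_size, padding_multiple = 50_254, 512
padded_vocab_size = find_multiple(vocab_size, padding_multiple)
print(padded_vocab_size)              # 50688, the next multiple of 512
assert padded_vocab_size % padding_multiple == 0
assert padded_vocab_size >= vocab_size
```
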
+ ) @property def mlp_class(self) -> Type: @@ -154,7 +174,10 @@ def norm_class(self) -> Type: ######################## configs = [ # https://huggingface.co/stabilityai/stablelm-base-alpha-3b/blob/main/config.json - dict(name="stablelm-base-alpha-3b", hf_config=dict(org="stabilityai", name="stablelm-base-alpha-3b")), + dict( + name="stablelm-base-alpha-3b", + hf_config=dict(org="stabilityai", name="stablelm-base-alpha-3b"), + ), # https://huggingface.co/stabilityai/stablelm-base-alpha-7b/blob/main/config.json dict( name="stablelm-base-alpha-7b", @@ -164,7 +187,11 @@ def norm_class(self) -> Type: padding_multiple=256, ), # https://huggingface.co/stabilityai/stablelm-tuned-alpha-3b/blob/main/config.json - dict(name="stablelm-tuned-alpha-3b", hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-3b"), n_head=32), + dict( + name="stablelm-tuned-alpha-3b", + hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-3b"), + n_head=32, + ), # https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b/blob/main/config.json dict( name="stablelm-tuned-alpha-7b", @@ -341,7 +368,9 @@ def norm_class(self) -> Type: # https://huggingface.co/togethercomputer/RedPajama-INCITE-Base-3B-v1/blob/main/config.json dict( name="RedPajama-INCITE-{}-3B-v1", - hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-{}-3B-v1"), + hf_config=dict( + org="togethercomputer", name="RedPajama-INCITE-{}-3B-v1" + ), block_size=2048, n_layer=32, n_embd=2560, @@ -362,7 +391,9 @@ def norm_class(self) -> Type: # this redirects to the checkpoint above. kept for those who had the old weights already downloaded dict( name="RedPajama-INCITE-{}-7B-v0.1", - hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-{}-7B-v0.1"), + hf_config=dict( + org="togethercomputer", name="RedPajama-INCITE-{}-7B-v0.1" + ), block_size=2048, n_layer=32, padding_multiple=256, @@ -1235,7 +1266,9 @@ def norm_class(self) -> Type: # https://huggingface.co/stabilityai/stablecode-completion-alpha-3b/blob/main/config.json dict( name="stablecode-completion-alpha-3b", - hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b"), + hf_config=dict( + org="stabilityai", name="stablecode-completion-alpha-3b" + ), block_size=16384, vocab_size=49152, n_layer=32, @@ -1244,7 +1277,9 @@ def norm_class(self) -> Type: # https://huggingface.co/stabilityai/stablecode-completion-alpha-3b-4k/blob/main/config.json dict( name="stablecode-completion-alpha-3b-4k", - hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b-4k"), + hf_config=dict( + org="stabilityai", name="stablecode-completion-alpha-3b-4k" + ), vocab_size=49152, n_layer=32, n_embd=2560, @@ -1410,7 +1445,10 @@ def norm_class(self) -> Type: ) ] for c in tiny_llama: - for kind, hf_postfix in (("", "-intermediate-step-1431k-3T"), ("-chat", "-Chat-v1.0")): + for kind, hf_postfix in ( + ("", "-intermediate-step-1431k-3T"), + ("-chat", "-Chat-v1.0"), + ): copy = deepcopy(c) copy["name"] = c["name"].format(kind) copy["hf_config"]["name"] = c["hf_config"]["name"].format(hf_postfix) @@ -1424,7 +1462,9 @@ def norm_class(self) -> Type: # https://huggingface.co/Trelis/Llama-2-7b-chat-hf-function-calling-v2/blob/main/config.json dict( name="Llama-2-7b-chat-hf-function-calling-v2", - hf_config=dict(org="Trelis", name="Llama-2-7b-chat-hf-function-calling-v2"), + hf_config=dict( + org="Trelis", name="Llama-2-7b-chat-hf-function-calling-v2" + ), padding_multiple=64, n_layer=32, rotary_percentage=1.0, diff --git a/llm-lora-finetuning/lit_gpt/lora.py 
b/llm-lora-finetuning/lit_gpt/lora.py index bfc7adc1..84d42543 100644 --- a/llm-lora-finetuning/lit_gpt/lora.py +++ b/llm-lora-finetuning/lit_gpt/lora.py @@ -153,11 +153,15 @@ def merge(self) -> None: weight = self.linear.weight # dequantize the pretrained weights - weight_data = bnb.functional.dequantize_4bit(weight.data, weight.quant_state).to(lora_data.dtype) + weight_data = bnb.functional.dequantize_4bit( + weight.data, weight.quant_state + ).to(lora_data.dtype) # add pretrained and LoRA weights weight_data += lora_data # assign updated weights and quantize by moving to CUDA device - self.linear.weight = bnb.nn.Params4bit(weight_data, requires_grad=False, **weight.__dict__) + self.linear.weight = bnb.nn.Params4bit( + weight_data, requires_grad=False, **weight.__dict__ + ) self.linear.weight.cuda(weight.device) else: raise NotImplementedError( @@ -173,7 +177,11 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: pretrained = self.linear(x) if self.r == 0 or self.merged: return pretrained - lora = (self.lora_dropout(x) @ self.lora_A.transpose(0, 1) @ self.lora_B.transpose(0, 1)) * self.scaling + lora = ( + self.lora_dropout(x) + @ self.lora_A.transpose(0, 1) + @ self.lora_B.transpose(0, 1) + ) * self.scaling return pretrained + lora @@ -216,7 +224,9 @@ def __init__( don't want to apply LoRA we can set it as False. For example if we want to apply LoRA only to `query` and `value` but keep `key` without weight updates we should pass `[True, False, True]` """ - super(LoRALinear, self).__init__(r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout) + super(LoRALinear, self).__init__( + r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout + ) self.linear = torch.nn.Linear(in_features, out_features, **kwargs) self.n_head = n_head self.n_query_groups = n_query_groups @@ -232,9 +242,13 @@ def __init__( # ⚬ r: 2 # ⚬ enable_lora: [True, False, True] if r > 0 and any(enable_lora): - self.lora_A = nn.Parameter(torch.zeros((r * sum(enable_lora), in_features))) # (4, 128) + self.lora_A = nn.Parameter( + torch.zeros((r * sum(enable_lora), in_features)) + ) # (4, 128) enable_q, enable_k, enable_v = enable_lora - self.kv_embd_size = self.linear.in_features // (n_head // n_query_groups) + self.kv_embd_size = self.linear.in_features // ( + n_head // n_query_groups + ) # qkv_shapes will be used to split a tensor with weights correctly qkv_shapes = ( self.linear.in_features * enable_q, @@ -242,7 +256,9 @@ def __init__( self.kv_embd_size * enable_v, ) self.qkv_shapes = [s for s in qkv_shapes if s] - self.lora_B = nn.Parameter(torch.zeros(sum(self.qkv_shapes), r)) # (256, 2)) + self.lora_B = nn.Parameter( + torch.zeros(sum(self.qkv_shapes), r) + ) # (256, 2)) # Notes about shapes above # - self.lora_A has shape (4, 128): 4 because rank is 2 and LoRA is applied only to two matrices; # 128 is the input size of the x (embedding size). 
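
For orientation, the low-rank update computed in the LoRA `forward` above, restated with toy shapes: the frozen weight's output plus `(x Aᵀ Bᵀ) * scaling`, with `lora_B` zero-initialised so training starts from the pretrained behaviour. The `alpha / r` scaling is the conventional LoRA choice and is assumed here, since it is not shown in this hunk:

```python
import torch

torch.manual_seed(0)
in_features, out_features, r, alpha = 128, 256, 8, 16
scaling = alpha / r          # conventional LoRA scaling; assumed, not shown above

linear = torch.nn.Linear(in_features, out_features)        # frozen pretrained weight
lora_A = torch.nn.Parameter(torch.randn(r, in_features) * 0.01)
lora_B = torch.nn.Parameter(torch.zeros(out_features, r))  # zeros => no change at init

x = torch.randn(4, in_features)
pretrained = linear(x)
lora = (x @ lora_A.T @ lora_B.T) * scaling
out = pretrained + lora

# With lora_B at zero the adapted layer starts out identical to the
# pretrained one; only the r * (in + out) LoRA parameters are trained.
assert torch.allclose(out, pretrained)
```
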
(4, 128) and not (128, 4) because later on in @@ -268,13 +284,25 @@ def __init__( ind = range(out_features) self.lora_ind = [] if enable_q: - q_ind = [x for x in ind if (x // head_size) % total_qkv < total_qkv - 2] + q_ind = [ + x + for x in ind + if (x // head_size) % total_qkv < total_qkv - 2 + ] self.lora_ind.extend(q_ind) if enable_k: - k_ind = [x for x in ind if (x // head_size) % total_qkv == total_qkv - 2] + k_ind = [ + x + for x in ind + if (x // head_size) % total_qkv == total_qkv - 2 + ] self.lora_ind.extend(k_ind) if enable_v: - v_ind = [x for x in ind if (x // head_size) % total_qkv == total_qkv - 1] + v_ind = [ + x + for x in ind + if (x // head_size) % total_qkv == total_qkv - 1 + ] self.lora_ind.extend(v_ind) self.reset_parameters() @@ -334,14 +362,24 @@ def zero_pad(self, x: torch.Tensor) -> torch.Tensor: # Note: double transpose (in the beginning and in the end) is basically a guard for two-dimensional tensors # for example when we want to merge/unmerge LoRA weights and pretrained weights x = x.transpose(0, 1) - result = x.new_zeros((*x.shape[:-1], self.linear.out_features)) # (64, 64, 384) + result = x.new_zeros( + (*x.shape[:-1], self.linear.out_features) + ) # (64, 64, 384) result = result.view(-1, self.linear.out_features) # (4096, 384) result = result.index_copy( - 1, torch.tensor(self.lora_ind, device=result.device), x.reshape(-1, sum(self.qkv_shapes)) + 1, + torch.tensor(self.lora_ind, device=result.device), + x.reshape(-1, sum(self.qkv_shapes)), ) # (4096, 256) - return result.view((*x.shape[:-1], self.linear.out_features)).transpose(0, 1) # (64, 64, 384) + return result.view( + (*x.shape[:-1], self.linear.out_features) + ).transpose( + 0, 1 + ) # (64, 64, 384) - def conv1d(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: + def conv1d( + self, input: torch.Tensor, weight: torch.Tensor + ) -> torch.Tensor: """An extension of the `torch.nn.functional.conv1d` function with a logic specific to grouped queries. 
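
`zero_pad` scatters the LoRA output, which only covers the enabled projections, into the right columns of the fused qkv output via `index_copy`. A simplified 2-D sketch with hand-picked toy indices (the real method also handles the leading batch and sequence dimensions via the transpose/reshape shown above):

```python
import torch

torch.manual_seed(0)
out_features = 12                      # fused q+k+v output width (toy size)
lora_ind = [0, 1, 2, 3, 8, 9, 10, 11]  # columns belonging to q and v (k disabled)

x = torch.randn(5, len(lora_ind))      # LoRA output covers only the enabled projections

result = x.new_zeros(x.shape[0], out_features)
result = result.index_copy(1, torch.tensor(lora_ind), x)

# Columns of the disabled projection (k here) stay zero, so adding `result`
# to the fused qkv output leaves that projection untouched.
assert torch.equal(result[:, 4:8], torch.zeros(5, 4))
assert torch.equal(result[:, lora_ind], x)
```
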
If the number of heads is equal to the number of query groups - grouped queries are disabled @@ -363,17 +401,24 @@ def conv1d(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: """ if self.n_head == self.n_query_groups: - return F.conv1d(input, weight, groups=sum(self.enable_lora)) # (B, C_output, T) + return F.conv1d( + input, weight, groups=sum(self.enable_lora) + ) # (B, C_output, T) # Notation: # ⚬ N: number of enabled LoRA layers (self.enable_lora) # ⚬ C_output': embeddings size for each LoRA layer (not equal in size) # ⚬ r: rank of all LoRA layers (equal in size) - input_splitted = input.chunk(sum(self.enable_lora), dim=1) # N * (B, C // N, T) - weight_splitted = weight.split(self.qkv_shapes) # N * (C_output', r, 1) + input_splitted = input.chunk( + sum(self.enable_lora), dim=1 + ) # N * (B, C // N, T) + weight_splitted = weight.split( + self.qkv_shapes + ) # N * (C_output', r, 1) return torch.cat( - [F.conv1d(a, b) for a, b in zip(input_splitted, weight_splitted)], dim=1 # (B, C_output', T) + [F.conv1d(a, b) for a, b in zip(input_splitted, weight_splitted)], + dim=1, # (B, C_output', T) ) # (B, C_output, T) def get_lora_AB(self) -> torch.Tensor: @@ -388,7 +433,9 @@ def get_lora_AB(self) -> torch.Tensor: ).squeeze( 0 ) # (1, 4, 128) @ (256, 2, 1) -> (1, 256, 128) -> (256, 128) - return self.zero_pad(lora * self.scaling) # (256, 128) after zero_pad (384, 128) + return self.zero_pad( + lora * self.scaling + ) # (256, 128) after zero_pad (384, 128) def merge(self) -> None: """Merges the LoRA weights into the full-rank weights (W = W + delta_W).""" @@ -419,7 +466,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: pretrained = self.linear(x) if self.r == 0 or not any(self.enable_lora) or self.merged: return pretrained - after_A = F.linear(self.lora_dropout(x), self.lora_A) # (64, 64, 128) @ (4, 128) -> (64, 64, 4) + after_A = F.linear( + self.lora_dropout(x), self.lora_A + ) # (64, 64, 128) @ (4, 128) -> (64, 64, 4) # For F.conv1d: # ⚬ input: input tensor of shape (mini-batch, in_channels, iW) # ⚬ weight: filters of shape (out_channels, in_channels/groups, kW) @@ -429,7 +478,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: ).transpose( -2, -1 ) # (64, 4, 64) @ (256, 2, 1) -> (64, 256, 64) -> (64, 64, 256) - lora = self.zero_pad(after_B) * self.scaling # (64, 64, 256) after zero_pad (64, 64, 384) + lora = ( + self.zero_pad(after_B) * self.scaling + ) # (64, 64, 256) after zero_pad (64, 64, 384) return pretrained + lora @@ -460,7 +511,11 @@ def mark_only_lora_as_trainable(model: nn.Module, bias: str = "none") -> None: p.requires_grad = True elif bias == "lora_only": for m in model.modules(): - if isinstance(m, LoRALayer) and hasattr(m, "bias") and m.bias is not None: + if ( + isinstance(m, LoRALayer) + and hasattr(m, "bias") + and m.bias is not None + ): m.bias.requires_grad = True else: raise NotImplementedError @@ -523,11 +578,16 @@ def __init__(self, config: Config) -> None: self.mask_cache: Optional[torch.Tensor] = None def forward( - self, idx: torch.Tensor, input_pos: Optional[torch.Tensor] = None, lm_head_chunk_size: int = 0 + self, + idx: torch.Tensor, + input_pos: Optional[torch.Tensor] = None, + lm_head_chunk_size: int = 0, ) -> Union[torch.Tensor, List[torch.Tensor]]: T = idx.size(1) if self.max_seq_length < T: - raise ValueError(f"Cannot forward sequence of length {T}, max seq length is only {self.max_seq_length}.") + raise ValueError( + f"Cannot forward sequence of length {T}, max seq length is only {self.max_seq_length}." 
+ ) if input_pos is not None: # use the kv cache cos = self.cos.index_select(0, input_pos) @@ -540,13 +600,17 @@ def forward( sin = self.sin[:T] mask = None - x = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd) + x = self.transformer.wte( + idx + ) # token embeddings of shape (b, t, n_embd) for block in self.transformer.h: x = block(x, cos, sin, mask, input_pos) x = self.transformer.ln_f(x) if lm_head_chunk_size > 0: # chunk the lm head logits to reduce the peak memory used by autograd - return [self.lm_head(x_i) for x_i in x.split(lm_head_chunk_size, dim=1)] + return [ + self.lm_head(x_i) for x_i in x.split(lm_head_chunk_size, dim=1) + ] return self.lm_head(x) # (B, T, vocab_size) @classmethod @@ -559,9 +623,14 @@ def _init_weights(self, module: nn.Module) -> None: if isinstance(module, LoRALinear): module.reset_parameters() - def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None: + def _load_from_state_dict( + self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any + ) -> None: """For compatibility with base checkpoints.""" - mapping = {"lm_head.weight": "lm_head.linear.weight", "lm_head.bias": "lm_head.linear.bias"} + mapping = { + "lm_head.weight": "lm_head.linear.weight", + "lm_head.bias": "lm_head.linear.bias", + } state_dict = map_old_state_dict_weights(state_dict, mapping, prefix) super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) @@ -612,7 +681,9 @@ def __init__(self, config: Config) -> None: self.config = config - def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None: + def _load_from_state_dict( + self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any + ) -> None: """For compatibility with base checkpoints.""" mapping = { "attn.weight": "attn.linear.weight", @@ -646,7 +717,9 @@ def __init__(self, config: Config) -> None: self.config = config - def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None: + def _load_from_state_dict( + self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any + ) -> None: """For compatibility with base checkpoints.""" mapping = { "fc.weight": "fc.linear.weight", @@ -686,7 +759,9 @@ def __init__(self, config: Config) -> None: lora_dropout=config.dropout, ) - def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None: + def _load_from_state_dict( + self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any + ) -> None: """For compatibility with base checkpoints.""" mapping = { "fc_1.weight": "fc_1.linear.weight", @@ -719,11 +794,15 @@ def __init__(self, config: Config) -> None: lora_alpha=config.alpha, lora_dropout=config.dropout, ) - self.experts = nn.ModuleList(LLaMAMLP(config) for _ in range(config.n_expert)) + self.experts = nn.ModuleList( + LLaMAMLP(config) for _ in range(config.n_expert) + ) self.config = config - def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None: + def _load_from_state_dict( + self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any + ) -> None: """For compatibility with base checkpoints.""" mapping = {"gate.weight": "gate.linear.weight"} state_dict = map_old_state_dict_weights(state_dict, mapping, prefix) diff --git a/llm-lora-finetuning/lit_gpt/model.py b/llm-lora-finetuning/lit_gpt/model.py index ed33664f..1ff378fd 100644 --- a/llm-lora-finetuning/lit_gpt/model.py +++ b/llm-lora-finetuning/lit_gpt/model.py @@ -22,7 +22,9 @@ def __init__(self, 
config: Config) -> None: assert config.padded_vocab_size is not None self.config = config - self.lm_head = nn.Linear(config.n_embd, config.padded_vocab_size, bias=config.lm_head_bias) + self.lm_head = nn.Linear( + config.n_embd, config.padded_vocab_size, bias=config.lm_head_bias + ) self.transformer = nn.ModuleDict( dict( wte=nn.Embedding(config.padded_vocab_size, config.n_embd), @@ -44,7 +46,9 @@ def max_seq_length(self, value: int) -> None: This allows setting a smaller number to avoid allocating unused memory """ if value > self.config.block_size: - raise ValueError(f"Cannot attend to {value}, block size is only {self.config.block_size}") + raise ValueError( + f"Cannot attend to {value}, block size is only {self.config.block_size}" + ) self._max_seq_length = value if not hasattr(self, "cos"): # first call @@ -70,10 +74,14 @@ def _init_weights(self, module: nn.Module) -> None: elif isinstance(module, nn.Embedding): torch.nn.init.normal_(module.weight, mean=0.0, std=0.02) - def forward(self, idx: torch.Tensor, input_pos: Optional[torch.Tensor] = None) -> torch.Tensor: + def forward( + self, idx: torch.Tensor, input_pos: Optional[torch.Tensor] = None + ) -> torch.Tensor: T = idx.size(1) if self.max_seq_length < T: - raise ValueError(f"Cannot forward sequence of length {T}, max seq length is only {self.max_seq_length}.") + raise ValueError( + f"Cannot forward sequence of length {T}, max seq length is only {self.max_seq_length}." + ) if input_pos is not None: # use the kv cache cos = self.cos.index_select(0, input_pos) @@ -86,7 +94,9 @@ def forward(self, idx: torch.Tensor, input_pos: Optional[torch.Tensor] = None) - sin = self.sin[:T] mask = None - x = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd) + x = self.transformer.wte( + idx + ) # token embeddings of shape (b, t, n_embd) if self.config.scale_embeddings: x = x * (self.config.n_embd**0.5) @@ -99,7 +109,9 @@ def forward(self, idx: torch.Tensor, input_pos: Optional[torch.Tensor] = None) - def from_name(cls, name: str, **kwargs: Any) -> Self: return cls(Config.from_name(name, **kwargs)) - def rope_cache(self, device: Optional[torch.device] = None) -> Tuple[torch.Tensor, torch.Tensor]: + def rope_cache( + self, device: Optional[torch.device] = None + ) -> Tuple[torch.Tensor, torch.Tensor]: return build_rope_cache( seq_len=self.max_seq_length, n_elem=self.config.rope_n_elem, @@ -125,7 +137,10 @@ def set_kv_cache( batch_size, max_seq_length, rope_cache_length, device, dtype ) - if self.mask_cache is None or self.mask_cache.size(3) != max_seq_length: + if ( + self.mask_cache is None + or self.mask_cache.size(3) != max_seq_length + ): # passing `attn_mask` to SDPA disables the flash implementation. 
since we only need the mask # for the kv-cache support (only during inference), we only create it in that situation self.mask_cache = build_mask_cache(max_seq_length, device) @@ -141,7 +156,11 @@ def __init__(self, config: Config) -> None: super().__init__() self.norm_1 = config.norm_class(config.n_embd, eps=config.norm_eps) self.attn = CausalSelfAttention(config) - self.norm_2 = None if config.shared_attention_norm else config.norm_class(config.n_embd, eps=config.norm_eps) + self.norm_2 = ( + None + if config.shared_attention_norm + else config.norm_class(config.n_embd, eps=config.norm_eps) + ) self.mlp = config.mlp_class(config) self.config = config @@ -178,7 +197,9 @@ def __init__(self, config: Config) -> None: self.attn = nn.Linear(config.n_embd, shape, bias=config.bias) # output projection # if `head_size` is explicitly specified in the config, `n_emd` might not be equal to `head_size * n_head` - self.proj = nn.Linear(config.head_size * config.n_head, config.n_embd, bias=config.bias) + self.proj = nn.Linear( + config.head_size * config.n_head, config.n_embd, bias=config.bias + ) # disabled by default self.kv_cache: Optional[KVCache] = None @@ -192,15 +213,27 @@ def forward( mask: Optional[torch.Tensor] = None, input_pos: Optional[torch.Tensor] = None, ) -> torch.Tensor: - B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd) + ( + B, + T, + C, + ) = ( + x.size() + ) # batch size, sequence length, embedding dimensionality (n_embd) qkv = self.attn(x) # assemble into a number of query groups to support MHA, MQA and GQA together (see `config.n_query_groups`) q_per_kv = self.config.n_head // self.config.n_query_groups - total_qkv = q_per_kv + 2 # each group has 1+ queries, 1 key, and 1 value - qkv = qkv.view(B, T, self.config.n_query_groups, total_qkv, self.config.head_size) - qkv = qkv.permute(0, 2, 3, 1, 4) # (B, n_query_groups, total_qkv, T, hs) + total_qkv = ( + q_per_kv + 2 + ) # each group has 1+ queries, 1 key, and 1 value + qkv = qkv.view( + B, T, self.config.n_query_groups, total_qkv, self.config.head_size + ) + qkv = qkv.permute( + 0, 2, 3, 1, 4 + ) # (B, n_query_groups, total_qkv, T, hs) # split batched computation into three q, k, v = qkv.split((q_per_kv, 1, 1), dim=2) @@ -208,9 +241,23 @@ def forward( # maybe repeat k and v if for the non multi-head attention cases # training: flash attention requires it # inference: multi-query would require a full kv cache so avoid it to limit its memory usage - if self.config.n_query_groups != self.config.n_head and (input_pos is None or self.config.n_query_groups != 1): - k = k.expand(B, self.config.n_query_groups, q_per_kv, T, self.config.head_size) - v = v.expand(B, self.config.n_query_groups, q_per_kv, T, self.config.head_size) + if self.config.n_query_groups != self.config.n_head and ( + input_pos is None or self.config.n_query_groups != 1 + ): + k = k.expand( + B, + self.config.n_query_groups, + q_per_kv, + T, + self.config.head_size, + ) + v = v.expand( + B, + self.config.n_query_groups, + q_per_kv, + T, + self.config.head_size, + ) q = q.reshape(B, -1, T, self.config.head_size) # (B, nh_q, T, hs) k = k.reshape(B, -1, T, self.config.head_size) # (B, nh_k, T, hs) @@ -228,17 +275,29 @@ def forward( y = self.scaled_dot_product_attention(q, k, v, mask) - y = y.reshape(B, T, self.config.head_size * self.config.n_head) # re-assemble all head outputs side by side + y = y.reshape( + B, T, self.config.head_size * self.config.n_head + ) # re-assemble all head outputs side by side # output projection return 
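
A toy walk-through of the grouped-query reshape above: the fused projection is viewed as `(B, T, n_query_groups, q_per_kv + 2, head_size)` and split so that each group carries several query heads but only one key and one value head.

```python
import torch

B, T = 2, 6
n_head, n_query_groups, head_size = 8, 2, 16     # GQA: 4 query heads share each k/v
q_per_kv = n_head // n_query_groups
total_qkv = q_per_kv + 2                         # queries + one key + one value per group

attn = torch.nn.Linear(n_head * head_size, (n_head + 2 * n_query_groups) * head_size)
x = torch.randn(B, T, n_head * head_size)

qkv = attn(x)
qkv = qkv.view(B, T, n_query_groups, total_qkv, head_size)
qkv = qkv.permute(0, 2, 3, 1, 4)                 # (B, n_query_groups, total_qkv, T, hs)
q, k, v = qkv.split((q_per_kv, 1, 1), dim=2)

print(q.shape)   # torch.Size([2, 2, 4, 6, 16])  -> reshaped to (B, n_head, T, hs)
print(k.shape)   # torch.Size([2, 2, 1, 6, 16])  -> only n_query_groups key heads
print(v.shape)   # torch.Size([2, 2, 1, 6, 16])
```
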
self.proj(y) def scaled_dot_product_attention( - self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, mask: Optional[torch.Tensor] = None + self, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + mask: Optional[torch.Tensor] = None, ) -> torch.Tensor: scale = 1.0 / math.sqrt(self.config.head_size) y = torch.nn.functional.scaled_dot_product_attention( - q, k, v, attn_mask=mask, dropout_p=0.0, scale=scale, is_causal=mask is None + q, + k, + v, + attn_mask=mask, + dropout_p=0.0, + scale=scale, + is_causal=mask is None, ) return y.transpose(1, 2) @@ -254,14 +313,18 @@ def build_kv_cache( v_shape = (batch_size, heads, max_seq_length, self.config.head_size) if rope_cache_length is None: if self.config.rotary_percentage != 1.0: - raise TypeError("Please pass the `rope_cache_length=gpt.cos.size(-1)` value") + raise TypeError( + "Please pass the `rope_cache_length=gpt.cos.size(-1)` value" + ) k_shape = v_shape else: k_shape = ( batch_size, heads, max_seq_length, - rope_cache_length + self.config.head_size - self.config.rope_n_elem, + rope_cache_length + + self.config.head_size + - self.config.rope_n_elem, ) return KVCache(k_shape, v_shape, device=device, dtype=dtype) @@ -269,23 +332,35 @@ def build_kv_cache( class GptNeoxMLP(nn.Module): def __init__(self, config: Config) -> None: super().__init__() - self.fc = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias) - self.proj = nn.Linear(config.intermediate_size, config.n_embd, bias=config.bias) + self.fc = nn.Linear( + config.n_embd, config.intermediate_size, bias=config.bias + ) + self.proj = nn.Linear( + config.intermediate_size, config.n_embd, bias=config.bias + ) self.config = config def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.fc(x) - x = torch.nn.functional.gelu(x, approximate=self.config.gelu_approximate) + x = torch.nn.functional.gelu( + x, approximate=self.config.gelu_approximate + ) return self.proj(x) class LLaMAMLP(nn.Module): def __init__(self, config: Config) -> None: super().__init__() - self.fc_1 = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias) - self.fc_2 = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias) - self.proj = nn.Linear(config.intermediate_size, config.n_embd, bias=config.bias) + self.fc_1 = nn.Linear( + config.n_embd, config.intermediate_size, bias=config.bias + ) + self.fc_2 = nn.Linear( + config.n_embd, config.intermediate_size, bias=config.bias + ) + self.proj = nn.Linear( + config.intermediate_size, config.n_embd, bias=config.bias + ) def forward(self, x: torch.Tensor) -> torch.Tensor: x_fc_1 = self.fc_1(x) @@ -306,7 +381,9 @@ class LLaMAMoE(nn.Module): def __init__(self, config: Config) -> None: super().__init__() self.gate = nn.Linear(config.n_embd, config.n_expert, bias=False) - self.experts = nn.ModuleList(LLaMAMLP(config) for _ in range(config.n_expert)) + self.experts = nn.ModuleList( + LLaMAMLP(config) for _ in range(config.n_expert) + ) self.config = config @@ -315,22 +392,38 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: Derived from: https://github.com/mistralai/mistral-src/blob/b46d6/moe_one_file_ref.py#L203-L219 See also figure 1 in https://arxiv.org/abs/2211.15841 """ - B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd) + ( + B, + T, + C, + ) = ( + x.size() + ) # batch size, sequence length, embedding dimensionality (n_embd) x = x.view(-1, C) # (B*T, C) router = self.gate(x) # (B*T, n_expert) - probs, indices = torch.topk(router, self.config.n_expert_per_token) # (B*T, 
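
The call above leans on `F.scaled_dot_product_attention` with an explicit `scale` and lets `is_causal` take over when no kv-cache mask is passed. A small sketch showing the two paths agree (PyTorch 2.1+ is assumed for the `scale` argument):

```python
import math
import torch
import torch.nn.functional as F

B, n_head, T, head_size = 2, 4, 6, 16
q, k, v = (torch.randn(B, n_head, T, head_size) for _ in range(3))
scale = 1.0 / math.sqrt(head_size)

# Training / no kv-cache path: no explicit mask, the kernel applies causality.
y = F.scaled_dot_product_attention(q, k, v, dropout_p=0.0, scale=scale, is_causal=True)

# Explicit-mask path (what the kv-cache branch does with the mask cache).
mask = torch.tril(torch.ones(T, T, dtype=torch.bool))
y_masked = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, scale=scale)

assert torch.allclose(y, y_masked, atol=1e-5)
print(y.transpose(1, 2).shape)     # (B, T, n_head, head_size), ready to merge heads
```
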
n_expert_per_token) + probs, indices = torch.topk( + router, self.config.n_expert_per_token + ) # (B*T, n_expert_per_token) probs = probs.softmax(dim=1, dtype=torch.float).to(dtype=x.dtype) - masks = indices.unsqueeze(-1) == torch.arange(self.config.n_expert, device=x.device) + masks = indices.unsqueeze(-1) == torch.arange( + self.config.n_expert, device=x.device + ) masks = masks.permute(2, 0, 1) # (n_expert, B*T, n_expert_per_token) y = torch.zeros_like(x) # (B*T, C) for mask, expert in zip(masks, self.experts): token_idx, expert_idx = torch.where(mask) - y[token_idx] += probs[token_idx, expert_idx, None] * expert(x[token_idx]) + y[token_idx] += probs[token_idx, expert_idx, None] * expert( + x[token_idx] + ) return y.view(B, T, C) def build_rope_cache( - seq_len: int, n_elem: int, device: Optional[torch.device] = None, base: int = 10000, condense_ratio: int = 1 + seq_len: int, + n_elem: int, + device: Optional[torch.device] = None, + base: int = 10000, + condense_ratio: int = 1, ) -> Tuple[torch.Tensor, torch.Tensor]: """Enhanced Transformer with Rotary Position Embedding. @@ -339,7 +432,9 @@ def build_rope_cache( https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license. """ # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$ - theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, device=device).float() / n_elem)) + theta = 1.0 / ( + base ** (torch.arange(0, n_elem, 2, device=device).float() / n_elem) + ) # Create position indexes `[0, 1, ..., seq_len - 1]` seq_idx = torch.arange(seq_len, device=device) / condense_ratio @@ -350,7 +445,9 @@ def build_rope_cache( return torch.cos(idx_theta), torch.sin(idx_theta) -def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor: +def apply_rope( + x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor +) -> torch.Tensor: head_size = x.size(-1) x1 = x[..., : head_size // 2] # (B, nh, T, hs/2) x2 = x[..., head_size // 2 :] # (B, nh, T, hs/2) @@ -368,10 +465,20 @@ def __init__( dtype: Optional[torch.dtype] = None, ) -> None: super().__init__() - self.register_buffer("k", torch.zeros(k_shape, device=device, dtype=dtype), persistent=False) - self.register_buffer("v", torch.zeros(v_shape, device=device, dtype=dtype), persistent=False) + self.register_buffer( + "k", + torch.zeros(k_shape, device=device, dtype=dtype), + persistent=False, + ) + self.register_buffer( + "v", + torch.zeros(v_shape, device=device, dtype=dtype), + persistent=False, + ) - def forward(self, input_pos: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + def forward( + self, input_pos: torch.Tensor, k: torch.Tensor, v: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: # move the buffer to the activation dtype for when AMP is used self.k = self.k.to(k.dtype) self.v = self.v.to(v.dtype) @@ -385,6 +492,10 @@ def reset_parameters(self) -> None: torch.nn.init.zeros_(self.v) -def build_mask_cache(max_seq_length: int, device: Optional[torch.device] = None) -> torch.Tensor: - ones = torch.ones((max_seq_length, max_seq_length), device=device, dtype=torch.bool) +def build_mask_cache( + max_seq_length: int, device: Optional[torch.device] = None +) -> torch.Tensor: + ones = torch.ones( + (max_seq_length, max_seq_length), device=device, dtype=torch.bool + ) return torch.tril(ones).unsqueeze(0).unsqueeze(0) diff --git a/llm-lora-finetuning/lit_gpt/packed_dataset.py b/llm-lora-finetuning/lit_gpt/packed_dataset.py index 2b5b3d6d..a183d4c2 100644 --- 
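
The expert-slicing forward above can be exercised end to end with toy modules; this mirrors the same top-k / mask / accumulate steps outside the model:

```python
import torch

torch.manual_seed(0)
B, T, C = 2, 4, 8
n_expert, n_expert_per_token = 4, 2

gate = torch.nn.Linear(C, n_expert, bias=False)
experts = torch.nn.ModuleList(torch.nn.Linear(C, C) for _ in range(n_expert))

x = torch.randn(B, T, C).view(-1, C)                     # (B*T, C)
router = gate(x)                                         # (B*T, n_expert)
probs, indices = torch.topk(router, n_expert_per_token)  # route each token to 2 experts
probs = probs.softmax(dim=1, dtype=torch.float).to(x.dtype)

masks = indices.unsqueeze(-1) == torch.arange(n_expert)  # (B*T, k, n_expert)
masks = masks.permute(2, 0, 1)                           # (n_expert, B*T, k)

y = torch.zeros_like(x)
for mask, expert in zip(masks, experts):
    token_idx, expert_idx = torch.where(mask)            # tokens that picked this expert
    y[token_idx] += probs[token_idx, expert_idx, None] * expert(x[token_idx])

print(y.view(B, T, C).shape)                             # (2, 4, 8)
```
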
a/llm-lora-finetuning/lit_gpt/packed_dataset.py +++ b/llm-lora-finetuning/lit_gpt/packed_dataset.py @@ -12,7 +12,16 @@ import torch from torch.utils.data import IterableDataset, get_worker_info -dtypes = {1: np.uint8, 2: np.int8, 3: np.int16, 4: np.int32, 5: np.int64, 6: np.float32, 7: np.float64, 8: np.uint16} +dtypes = { + 1: np.uint8, + 2: np.int8, + 3: np.int16, + 4: np.int32, + 5: np.int64, + 6: np.float32, + 7: np.float64, + 8: np.uint16, +} def code(dtype): @@ -28,7 +37,15 @@ def code(dtype): class PackedDataset(IterableDataset): def __init__( - self, filenames, n_chunks, block_size, seed=12345, shuffle=True, wrap=False, num_processes=1, process_rank=0 + self, + filenames, + n_chunks, + block_size, + seed=12345, + shuffle=True, + wrap=False, + num_processes=1, + process_rank=0, ): self._filenames = filenames self._n_chunks = n_chunks @@ -60,7 +77,15 @@ def __iter__(self): class PackedDatasetBuilder(object): - def __init__(self, outdir, prefix, chunk_size, sep_token, dtype="auto", vocab_size=None): + def __init__( + self, + outdir, + prefix, + chunk_size, + sep_token, + dtype="auto", + vocab_size=None, + ): if dtype == "auto": if vocab_size is None: raise ValueError("vocab_size cannot be None when dtype='auto'") @@ -187,7 +212,11 @@ def _load_n_chunks(self): self._file_idx += self._n_chunks n_all_blocks = self._n_chunks * self._n_blocks - self._block_idxs = self._rng.permutation(n_all_blocks) if self._shuffle else range(n_all_blocks) + self._block_idxs = ( + self._rng.permutation(n_all_blocks) + if self._shuffle + else range(n_all_blocks) + ) self._curr_idx = 0 @@ -208,7 +237,9 @@ def __next__(self): buffer = self._buffers[chunk_id] elem_id = (block_idx % self._n_blocks) * self._block_size offset = np.dtype(self._dtype).itemsize * elem_id - arr = np.frombuffer(buffer, dtype=self._dtype, count=self._block_size, offset=offset) + arr = np.frombuffer( + buffer, dtype=self._dtype, count=self._block_size, offset=offset + ) self._curr_idx += 1 return torch.from_numpy(arr.astype(np.int64)) @@ -225,7 +256,9 @@ def __init__(self, datasets, seed, weights=None): self._weights = [w / sum(weights) for w in weights] def __iter__(self): - return CombinedDatasetIterator(self._datasets, self._seed, self._weights) + return CombinedDatasetIterator( + self._datasets, self._seed, self._weights + ) class CombinedDatasetIterator: @@ -235,5 +268,7 @@ def __init__(self, datasets, seed, weights): self._rng = random.Random(seed) def __next__(self): - (dataset,) = self._rng.choices(self._datasets, weights=self._weights, k=1) + (dataset,) = self._rng.choices( + self._datasets, weights=self._weights, k=1 + ) return next(dataset) diff --git a/llm-lora-finetuning/lit_gpt/rmsnorm.py b/llm-lora-finetuning/lit_gpt/rmsnorm.py index dcaab677..10828812 100644 --- a/llm-lora-finetuning/lit_gpt/rmsnorm.py +++ b/llm-lora-finetuning/lit_gpt/rmsnorm.py @@ -10,7 +10,13 @@ class RMSNorm(torch.nn.Module): https://github.com/bzhangGo/rmsnorm/blob/master/LICENSE. 
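
`__next__` above reads one block out of a chunk by computing a byte offset and calling `np.frombuffer`. A sketch of the same arithmetic using an in-memory uint16 buffer instead of an mmapped chunk file:

```python
import numpy as np
import torch

# A toy "chunk": 16 token ids packed as uint16, like the prepare_* scripts write.
block_size = 4
tokens = np.arange(16, dtype=np.uint16)
buffer = tokens.tobytes()

n_blocks = len(tokens) // block_size
for block_idx in np.random.default_rng(12345).permutation(n_blocks):
    elem_id = block_idx * block_size
    offset = np.dtype(np.uint16).itemsize * elem_id
    arr = np.frombuffer(buffer, dtype=np.uint16, count=block_size, offset=offset)
    sample = torch.from_numpy(arr.astype(np.int64))      # what __next__ yields
    print(int(block_idx), sample.tolist())
```
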
""" - def __init__(self, size: int, dim: int = -1, eps: float = 1e-6, add_unit_offset: bool = False) -> None: + def __init__( + self, + size: int, + dim: int = -1, + eps: float = 1e-6, + add_unit_offset: bool = False, + ) -> None: super().__init__() self.weight = torch.nn.Parameter(torch.ones(size)) self.eps = eps diff --git a/llm-lora-finetuning/lit_gpt/tokenizer.py b/llm-lora-finetuning/lit_gpt/tokenizer.py index 3a6758eb..f2832ce6 100644 --- a/llm-lora-finetuning/lit_gpt/tokenizer.py +++ b/llm-lora-finetuning/lit_gpt/tokenizer.py @@ -11,7 +11,9 @@ class Tokenizer: def __init__(self, checkpoint_dir: Union[Path, str]) -> None: checkpoint_dir = Path(checkpoint_dir) if not checkpoint_dir.exists(): - raise NotADirectoryError(f"The checkpoint directory does not exist: {str(checkpoint_dir)}") + raise NotADirectoryError( + f"The checkpoint directory does not exist: {str(checkpoint_dir)}" + ) self.use_bos = self.check_if_bos_token_used(checkpoint_dir) self.bos_id = None @@ -21,7 +23,9 @@ def __init__(self, checkpoint_dir: Union[Path, str]) -> None: if (vocabulary_path := checkpoint_dir / "tokenizer.model").is_file(): from sentencepiece import SentencePieceProcessor - self.processor = SentencePieceProcessor(model_file=str(vocabulary_path)) + self.processor = SentencePieceProcessor( + model_file=str(vocabulary_path) + ) self.backend = "sentencepiece" self.bos_id = self.processor.bos_id() self.eos_id = self.processor.eos_id() @@ -32,14 +36,27 @@ def __init__(self, checkpoint_dir: Union[Path, str]) -> None: self.processor = HFTokenizer.from_file(str(vocabulary_path)) self.backend = "huggingface" - if (special_tokens_path := checkpoint_dir / "tokenizer_config.json").is_file(): + if ( + special_tokens_path := checkpoint_dir / "tokenizer_config.json" + ).is_file(): with open(special_tokens_path) as fp: config = json.load(fp) bos_token = config.get("bos_token") - self.bos_id = self.token_to_id(bos_token) if bos_token is not None else None + self.bos_id = ( + self.token_to_id(bos_token) + if bos_token is not None + else None + ) eos_token = config.get("eos_token") - self.eos_id = self.token_to_id(eos_token) if eos_token is not None else None - if (special_tokens_path := checkpoint_dir / "generation_config.json").is_file(): + self.eos_id = ( + self.token_to_id(eos_token) + if eos_token is not None + else None + ) + if ( + special_tokens_path := checkpoint_dir + / "generation_config.json" + ).is_file(): with open(special_tokens_path) as fp: config = json.load(fp) if self.bos_id is None: @@ -69,15 +86,23 @@ def token_to_id(self, token: str) -> int: return id_ def check_if_bos_token_used(self, checkpoint_dir: Path) -> bool: - if not (tokenizer_config_path := checkpoint_dir / "tokenizer_config.json").is_file(): + if not ( + tokenizer_config_path := checkpoint_dir / "tokenizer_config.json" + ).is_file(): return False with open(tokenizer_config_path) as fp: config = json.load(fp) - if any(config.get(check, False) for check in ("add_bos_token", "add_prefix_space")): + if any( + config.get(check, False) + for check in ("add_bos_token", "add_prefix_space") + ): return True # for examples that also use the Llama tokenizer, but do not have or set add_bos_token to True. 
# ex: https://huggingface.co/stabilityai/StableBeluga2/blob/main/tokenizer_config.json#L2 - return config.get("add_bos_token") is None and config.get("tokenizer_class") == "LlamaTokenizer" + return ( + config.get("add_bos_token") is None + and config.get("tokenizer_class") == "LlamaTokenizer" + ) def encode( self, @@ -96,7 +121,9 @@ def encode( if bos or (bos is None and self.use_bos): bos_id = self.bos_id if bos_id is None: - raise NotImplementedError("This tokenizer does not have a defined a bos token") + raise NotImplementedError( + "This tokenizer does not have a defined a bos token" + ) tokens = [bos_id] + tokens if eos: tokens = tokens + [self.eos_id] diff --git a/llm-lora-finetuning/lit_gpt/utils.py b/llm-lora-finetuning/lit_gpt/utils.py index c9102791..ba4706ff 100644 --- a/llm-lora-finetuning/lit_gpt/utils.py +++ b/llm-lora-finetuning/lit_gpt/utils.py @@ -7,7 +7,17 @@ import sys from io import BytesIO from pathlib import Path -from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, TypeVar, Union +from typing import ( + TYPE_CHECKING, + Any, + Dict, + Iterable, + List, + Mapping, + Optional, + TypeVar, + Union, +) import lightning as L import torch @@ -29,7 +39,9 @@ def find_multiple(n: int, k: int) -> int: return n + k - (n % k) -def num_parameters(module: nn.Module, requires_grad: Optional[bool] = None) -> int: +def num_parameters( + module: nn.Module, requires_grad: Optional[bool] = None +) -> int: total = 0 for p in module.parameters(): if requires_grad is None or p.requires_grad == requires_grad: @@ -45,9 +57,13 @@ def check_valid_checkpoint_dir(checkpoint_dir: Path) -> None: files = { "lit_model.pth": (checkpoint_dir / "lit_model.pth").is_file(), "lit_config.json": (checkpoint_dir / "lit_config.json").is_file(), - "tokenizer.json OR tokenizer.model": (checkpoint_dir / "tokenizer.json").is_file() + "tokenizer.json OR tokenizer.model": ( + checkpoint_dir / "tokenizer.json" + ).is_file() or (checkpoint_dir / "tokenizer.model").is_file(), - "tokenizer_config.json": (checkpoint_dir / "tokenizer_config.json").is_file(), + "tokenizer_config.json": ( + checkpoint_dir / "tokenizer_config.json" + ).is_file(), } if checkpoint_dir.is_dir(): if all(files.values()): @@ -60,7 +76,9 @@ def check_valid_checkpoint_dir(checkpoint_dir: Path) -> None: # list locally available checkpoints available = list(Path("checkpoints").glob("*/*")) if available: - options = "\n --checkpoint_dir ".join([""] + [repr(str(p.resolve())) for p in available]) + options = "\n --checkpoint_dir ".join( + [""] + [repr(str(p.resolve())) for p in available] + ) extra = f"\nYou have downloaded locally:{options}\n" else: extra = "" @@ -78,7 +96,10 @@ class SavingProxyForStorage: def __init__(self, obj, saver, protocol_version=5): self.protocol_version = protocol_version self.saver = saver - if not (isinstance(obj, torch.storage.TypedStorage) or torch.is_storage(obj)): + if not ( + isinstance(obj, torch.storage.TypedStorage) + or torch.is_storage(obj) + ): raise TypeError(f"expected storage, not {type(obj)}") # this logic is taken from PyTorch 2.0+ torch/serialization.py @@ -96,7 +117,13 @@ def __init__(self, obj, saver, protocol_version=5): storage_key = saver._write_storage_and_return_key(storage) location = torch.serialization.location_tag(storage) - self.storage_info = ("storage", storage_type, storage_key, location, storage_numel) + self.storage_info = ( + "storage", + storage_type, + storage_key, + location, + storage_numel, + ) def __reduce_ex__(self, protocol_version): assert False, "this 
should be handled with out of band" @@ -105,22 +132,39 @@ def __reduce_ex__(self, protocol_version): class SavingProxyForTensor: def __init__(self, tensor, saver, protocol_version=5): self.protocol_version = protocol_version - self.reduce_ret_fn, reduce_args = tensor.__reduce_ex__(protocol_version) + self.reduce_ret_fn, reduce_args = tensor.__reduce_ex__( + protocol_version + ) if reduce_args[0] == torch._utils._rebuild_tensor_v2: # for Tensors with Python attributes (a0, a1, (storage, *a2_other), *other_reduce_args) = reduce_args - assert isinstance(storage, torch.storage.TypedStorage), "Please check for updates" - storage_proxy = SavingProxyForStorage(storage, saver, protocol_version=protocol_version) - self.reduce_args = (a0, a1, (storage_proxy, *a2_other), *other_reduce_args) + assert isinstance( + storage, torch.storage.TypedStorage + ), "Please check for updates" + storage_proxy = SavingProxyForStorage( + storage, saver, protocol_version=protocol_version + ) + self.reduce_args = ( + a0, + a1, + (storage_proxy, *a2_other), + *other_reduce_args, + ) else: (storage, *other_reduce_args) = reduce_args - assert isinstance(storage, torch.storage.TypedStorage), "Please check for updates" - storage_proxy = SavingProxyForStorage(storage, saver, protocol_version=protocol_version) + assert isinstance( + storage, torch.storage.TypedStorage + ), "Please check for updates" + storage_proxy = SavingProxyForStorage( + storage, saver, protocol_version=protocol_version + ) self.reduce_args = (storage_proxy, *other_reduce_args) def __reduce_ex__(self, protocol_version): if protocol_version != self.protocol_version: - raise RuntimeError(f"Unexpected protocol version: expected {self.protocol_version}, got {protocol_version}") + raise RuntimeError( + f"Unexpected protocol version: expected {self.protocol_version}, got {protocol_version}" + ) return self.reduce_ret_fn, self.reduce_args @@ -141,7 +185,9 @@ def persistent_id(self, obj): if isinstance(obj, SavingProxyForStorage): return obj.storage_info - if isinstance(obj, torch.storage.TypedStorage) or torch.is_storage(obj): + if isinstance(obj, torch.storage.TypedStorage) or torch.is_storage( + obj + ): if isinstance(obj, torch.storage.TypedStorage): # TODO: Once we decide to break serialization FC, this case # can be deleted @@ -162,7 +208,10 @@ def persistent_id(self, obj): # not allocated, don't perform this check if storage.data_ptr() != 0: if storage.data_ptr() in self.storage_dtypes: - if storage_dtype != self.storage_dtypes[storage.data_ptr()]: + if ( + storage_dtype + != self.storage_dtypes[storage.data_ptr()] + ): raise RuntimeError( "Cannot save multiple tensors or storages that view the same data as different types" ) @@ -175,7 +224,13 @@ def persistent_id(self, obj): self.id_map[storage._cdata] = storage_key location = torch.serialization.location_tag(storage) - return ("storage", storage_type, storage_key, location, storage_numel) + return ( + "storage", + storage_type, + storage_key, + location, + storage_numel, + ) return None @@ -243,13 +298,26 @@ def chunked_cross_entropy( logits = torch.cat(logits, dim=1) logits = logits.reshape(-1, logits.size(-1)) targets = targets.reshape(-1) - return torch.nn.functional.cross_entropy(logits, targets, ignore_index=ignore_index) + return torch.nn.functional.cross_entropy( + logits, targets, ignore_index=ignore_index + ) # chunk cross entropy - logit_chunks = [logit_chunk.reshape(-1, logit_chunk.size(-1)) for logit_chunk in logits] - target_chunks = [target_chunk.reshape(-1) for target_chunk in 
targets.split(logits[0].size(1), dim=1)] + logit_chunks = [ + logit_chunk.reshape(-1, logit_chunk.size(-1)) + for logit_chunk in logits + ] + target_chunks = [ + target_chunk.reshape(-1) + for target_chunk in targets.split(logits[0].size(1), dim=1) + ] loss_chunks = [ - torch.nn.functional.cross_entropy(logit_chunk, target_chunk, ignore_index=ignore_index, reduction="none") + torch.nn.functional.cross_entropy( + logit_chunk, + target_chunk, + ignore_index=ignore_index, + reduction="none", + ) for logit_chunk, target_chunk in zip(logit_chunks, target_chunks) ] non_masked_elems = (targets != ignore_index).sum() @@ -259,25 +327,36 @@ def chunked_cross_entropy( logits = logits.reshape(-1, logits.size(-1)) targets = targets.reshape(-1) if chunk_size == 0: - return torch.nn.functional.cross_entropy(logits, targets, ignore_index=ignore_index) + return torch.nn.functional.cross_entropy( + logits, targets, ignore_index=ignore_index + ) # lm_head wasn't chunked, chunk cross entropy logit_chunks = logits.split(chunk_size) target_chunks = targets.split(chunk_size) loss_chunks = [ - torch.nn.functional.cross_entropy(logit_chunk, target_chunk, ignore_index=ignore_index, reduction="none") + torch.nn.functional.cross_entropy( + logit_chunk, + target_chunk, + ignore_index=ignore_index, + reduction="none", + ) for logit_chunk, target_chunk in zip(logit_chunks, target_chunks) ] non_masked_elems = (targets != ignore_index).sum() return torch.cat(loss_chunks).sum() / max(1, non_masked_elems) -def map_old_state_dict_weights(state_dict: Dict, mapping: Mapping, prefix: str) -> Dict: +def map_old_state_dict_weights( + state_dict: Dict, mapping: Mapping, prefix: str +) -> Dict: for checkpoint_name, attribute_name in mapping.items(): full_checkpoint_name = prefix + checkpoint_name if full_checkpoint_name in state_dict: full_attribute_name = prefix + attribute_name - state_dict[full_attribute_name] = state_dict.pop(full_checkpoint_name) + state_dict[full_attribute_name] = state_dict.pop( + full_checkpoint_name + ) return state_dict @@ -292,12 +371,19 @@ def get_default_supported_precision(training: bool) -> str: """ from lightning.fabric.accelerators import MPSAccelerator - if MPSAccelerator.is_available() or (torch.cuda.is_available() and not torch.cuda.is_bf16_supported()): + if MPSAccelerator.is_available() or ( + torch.cuda.is_available() and not torch.cuda.is_bf16_supported() + ): return "16-mixed" if training else "16-true" return "bf16-mixed" if training else "bf16-true" -def load_checkpoint(fabric: L.Fabric, model: nn.Module, checkpoint_path: Path, strict: bool = True) -> None: +def load_checkpoint( + fabric: L.Fabric, + model: nn.Module, + checkpoint_path: Path, + strict: bool = True, +) -> None: if isinstance(fabric.strategy, FSDPStrategy): fabric.load_raw(checkpoint_path, model, strict=strict) else: @@ -306,8 +392,12 @@ def load_checkpoint(fabric: L.Fabric, model: nn.Module, checkpoint_path: Path, s model.load_state_dict(state_dict, strict=strict) -def flops_per_param(max_seq_length: int, n_layer: int, n_embd: int, n_params: int) -> int: - flops_per_token = 2 * n_params # each parameter is used for a MAC (2 FLOPS) per network operation +def flops_per_param( + max_seq_length: int, n_layer: int, n_embd: int, n_params: int +) -> int: + flops_per_token = ( + 2 * n_params + ) # each parameter is used for a MAC (2 FLOPS) per network operation # this assumes that all samples have a fixed length equal to the block size # which is most likely false during finetuning flops_per_seq = flops_per_token * max_seq_length 
@@ -328,12 +418,20 @@ def estimate_flops(model: "GPT", training: bool) -> int: # For a proper estimate, this needs a more fine-grained calculation as in Appendix A of the paper. n_trainable_params = num_parameters(model, requires_grad=True) trainable_flops = flops_per_param( - model.max_seq_length, model.config.n_layer, model.config.n_embd, n_trainable_params + model.max_seq_length, + model.config.n_layer, + model.config.n_embd, + n_trainable_params, ) # forward + backward + gradients (assumes no gradient accumulation) ops_per_step = 3 if training else 1 n_frozen_params = num_parameters(model, requires_grad=False) - frozen_flops = flops_per_param(model.max_seq_length, model.config.n_layer, model.config.n_embd, n_frozen_params) + frozen_flops = flops_per_param( + model.max_seq_length, + model.config.n_layer, + model.config.n_embd, + n_frozen_params, + ) # forward + backward frozen_ops_per_step = 2 if training else 1 return ops_per_step * trainable_flops + frozen_ops_per_step * frozen_flops diff --git a/llm-lora-finetuning/pipelines/eval.py b/llm-lora-finetuning/pipelines/eval.py new file mode 100644 index 00000000..88194f98 --- /dev/null +++ b/llm-lora-finetuning/pipelines/eval.py @@ -0,0 +1,9 @@ +from typing import Optional + +from steps.eval import eval +from zenml import pipeline + + +@pipeline +def eval_pipeline(model_repo: str, adapter_repo: Optional[str] = None) -> None: + eval(model_repo=model_repo, adapter_repo=adapter_repo) diff --git a/llm-lora-finetuning/pipelines/feature_engineering.py b/llm-lora-finetuning/pipelines/feature_engineering.py new file mode 100644 index 00000000..4e447421 --- /dev/null +++ b/llm-lora-finetuning/pipelines/feature_engineering.py @@ -0,0 +1,7 @@ +from steps.feature_engineering import feature_engineering +from zenml import pipeline + + +@pipeline +def feature_engineering_pipeline(model_repo: str, dataset_name: str) -> None: + feature_engineering(model_repo=model_repo, dataset_name=dataset_name) diff --git a/llm-lora-finetuning/pipelines/finetuning.py b/llm-lora-finetuning/pipelines/finetuning.py index 467d2f63..58f05744 100644 --- a/llm-lora-finetuning/pipelines/finetuning.py +++ b/llm-lora-finetuning/pipelines/finetuning.py @@ -14,12 +14,23 @@ # See the License for the specific language governing permissions and # limitations under the License. # +from typing import Optional -from zenml import pipeline from steps.finetune import finetune_lora -from steps.merge import merge +from zenml import pipeline from zenml.config import DockerSettings + @pipeline(settings={"docker": DockerSettings(requirements="requirements.txt")}) -def finetuning_pipeline(repo_id: str = "mistralai/Mistral-7B-Instruct-v0.1") -> None: - checkpoint_dir, output_path = finetune_lora(repo_id=repo_id) +def finetuning_pipeline( + repo_id: str = "mistralai/Mistral-7B-Instruct-v0.1", + adapter_output_repo: Optional[str] = None, + merged_output_repo: Optional[str] = None, + convert_to_hf: bool = False, +) -> None: + finetune_lora( + repo_id=repo_id, + adapter_output_repo=adapter_output_repo, + merged_output_repo=merged_output_repo, + convert_to_hf=convert_to_hf, + ) diff --git a/llm-lora-finetuning/pipelines/merge.py b/llm-lora-finetuning/pipelines/merge.py index a5389a6d..4ea639d8 100644 --- a/llm-lora-finetuning/pipelines/merge.py +++ b/llm-lora-finetuning/pipelines/merge.py @@ -15,9 +15,19 @@ # limitations under the License. 
# -from zenml import pipeline from steps.merge import merge +from zenml import pipeline + @pipeline -def merge_pipeline() -> None: - merge() \ No newline at end of file +def merge_pipeline( + base_model_repo: str, + adapter_repo: str, + output_repo: str, + convert_to_hf: bool = False, +) -> None: + merge( + base_model_repo=base_model_repo, + output_repo=output_repo, + convert_to_hf=convert_to_hf, + ) diff --git a/llm-lora-finetuning/run.py b/llm-lora-finetuning/run.py index cbaf351b..cd1a71a6 100644 --- a/llm-lora-finetuning/run.py +++ b/llm-lora-finetuning/run.py @@ -15,7 +15,9 @@ # limitations under the License. # +import os from typing import Optional + import click from zenml.logger import get_logger @@ -43,10 +45,6 @@ \b # Run the evaluation pipeline python run.py --eval-pipeline - - \b - # Run the deployment pipeline - python run.py --deployment-pipeline """ ) @click.option( @@ -79,12 +77,6 @@ default=False, help="Whether to run the pipeline that evaluates the model.", ) -@click.option( - "--deployment-pipeline", - is_flag=True, - default=False, - help="Whether to run the pipeline that deploys the model.", -) @click.option( "--no-cache", is_flag=True, @@ -97,7 +89,6 @@ def main( finetuning_pipeline: bool = False, merging_pipeline: bool = False, eval_pipeline: bool = False, - deployment_pipeline: bool = False, no_cache: bool = False, ): """Main entry point for the pipeline execution. @@ -105,10 +96,34 @@ def main( Args: no_cache: If `True` cache will be disabled. """ + config_folder = os.path.join( + os.path.dirname(os.path.realpath(__file__)), + "configs", + ) + pipeline_args = {"enable_cache": not no_cache} + if config: + pipeline_args["config_path"] = os.path.join(config_folder, config) + if feature_pipeline: from pipelines.feature_engineering import feature_engineering_pipeline - feature_engineering_pipeline() + feature_engineering_pipeline.with_options(**pipeline_args)() + + if finetuning_pipeline: + from pipelines.finetuning import finetuning_pipeline + + finetuning_pipeline.with_options(**pipeline_args)() + + if merging_pipeline: + from pipelines.merge import merge_pipeline + + merge_pipeline.with_options(**pipeline_args)() + + if eval_pipeline: + from pipelines.eval import eval_pipeline + + eval_pipeline.with_options(**pipeline_args)() + if __name__ == "__main__": main() diff --git a/llm-lora-finetuning/scripts/convert_hf_checkpoint.py b/llm-lora-finetuning/scripts/convert_hf_checkpoint.py index 3839a879..14d0ff6f 100644 --- a/llm-lora-finetuning/scripts/convert_hf_checkpoint.py +++ b/llm-lora-finetuning/scripts/convert_hf_checkpoint.py @@ -10,7 +10,9 @@ from typing import Dict, List, Optional, Tuple, Union import torch -from lightning.fabric.utilities.load import _NotYetLoadedTensor as NotYetLoadedTensor +from lightning.fabric.utilities.load import ( + _NotYetLoadedTensor as NotYetLoadedTensor, +) # support running without installing as a package wd = Path(__file__).parent.parent.resolve() @@ -208,7 +210,10 @@ def copy_weights_phi( saver: Optional[incremental_save] = None, dtype: Optional[torch.dtype] = None, ) -> None: - if any(layer_name.startswith(("layers.", "transformer.")) for layer_name in hf_weights): + if any( + layer_name.startswith(("layers.", "transformer.")) + for layer_name in hf_weights + ): raise ValueError( "You are using an outdated Phi checkpoint. 
Please reload it as described in 'tutorials/download_phi.md'" ) @@ -280,12 +285,20 @@ def layer_template(layer_name: str, idx: int) -> Tuple[str, int]: return from_name, number -def load_param(param: Union[torch.Tensor, NotYetLoadedTensor], name: str, dtype: Optional[torch.dtype]) -> torch.Tensor: +def load_param( + param: Union[torch.Tensor, NotYetLoadedTensor], + name: str, + dtype: Optional[torch.dtype], +) -> torch.Tensor: if hasattr(param, "_load_tensor"): # support tensors loaded via `lazy_load()` print(f"Loading {name!r} into RAM") param = param._load_tensor() - if dtype is not None and type(dtype) is not NotYetLoadedTensor and dtype != param.dtype: + if ( + dtype is not None + and type(dtype) is not NotYetLoadedTensor + and dtype != param.dtype + ): print(f"Converting {name!r} from {param.dtype} to {dtype}") param = param.to(dtype) return param @@ -294,7 +307,9 @@ def load_param(param: Union[torch.Tensor, NotYetLoadedTensor], name: str, dtype: @torch.inference_mode() def convert_hf_checkpoint( *, - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), model_name: Optional[str] = None, dtype: Optional[str] = None, ) -> None: @@ -327,16 +342,22 @@ def convert_hf_checkpoint( # Load the json file containing weight mapping pytorch_bin_map_json_path = checkpoint_dir / "pytorch_model.bin.index.json" - if pytorch_bin_map_json_path.is_file(): # not all checkpoints have this file + if ( + pytorch_bin_map_json_path.is_file() + ): # not all checkpoints have this file with open(pytorch_bin_map_json_path) as json_map: bin_index = json.load(json_map) - bin_files = {checkpoint_dir / bin for bin in bin_index["weight_map"].values()} + bin_files = { + checkpoint_dir / bin for bin in bin_index["weight_map"].values() + } else: bin_files = set(checkpoint_dir.glob("*.bin")) # some checkpoints serialize the training arguments bin_files = {f for f in bin_files if f.name != "training_args.bin"} if not bin_files: - raise ValueError(f"Expected {str(checkpoint_dir)!r} to contain .bin files") + raise ValueError( + f"Expected {str(checkpoint_dir)!r} to contain .bin files" + ) with incremental_save(checkpoint_dir / "lit_model.pth") as saver: # for checkpoints that split the QKV across several files, we need to keep all the bin files diff --git a/llm-lora-finetuning/scripts/convert_lit_checkpoint.py b/llm-lora-finetuning/scripts/convert_lit_checkpoint.py index 8a3b101a..1239e7d2 100644 --- a/llm-lora-finetuning/scripts/convert_lit_checkpoint.py +++ b/llm-lora-finetuning/scripts/convert_lit_checkpoint.py @@ -7,7 +7,9 @@ from typing import Dict, Optional, Tuple, Union import torch -from lightning.fabric.utilities.load import _NotYetLoadedTensor as NotYetLoadedTensor +from lightning.fabric.utilities.load import ( + _NotYetLoadedTensor as NotYetLoadedTensor, +) # support running without installing as a package wd = Path(__file__).parent.parent.resolve() @@ -15,6 +17,7 @@ from lit_gpt import Config from lit_gpt.utils import CLI, incremental_save, lazy_load + from scripts.convert_hf_checkpoint import layer_template, load_param @@ -226,7 +229,10 @@ def qkv_split( ks = [] vs = [] for chunk in torch.chunk(param, config.n_query_groups): - split = torch.split(chunk, [config.head_size * q_per_kv, config.head_size, config.head_size]) + split = torch.split( + chunk, + [config.head_size * q_per_kv, config.head_size, config.head_size], + ) qs.append(split[0]) ks.append(split[1]) vs.append(split[2]) @@ -238,20 +244,26 @@ 
def qkv_split( def check_conversion_supported(lit_weights: Dict[str, torch.Tensor]) -> None: if any("lora" in wn for wn in lit_weights): - raise ValueError("Checkpoints with LoRA weights cannot be converted. Call `scripts/merge_lora.py` first.") + raise ValueError( + "Checkpoints with LoRA weights cannot be converted. Call `scripts/merge_lora.py` first." + ) if any("adapter" in wn or "gating_factor" in wn for wn in lit_weights): raise NotImplementedError("Converting adapter models is supported.") @torch.inference_mode() -def convert_lit_checkpoint(checkpoint_path: Path, output_path: Path, config_path: Path) -> None: +def convert_lit_checkpoint( + checkpoint_path: Path, output_path: Path, config_path: Path +) -> None: config = Config.from_json(config_path) if "falcon" in config.name: copy_fn = partial(copy_weights_falcon, config.name) elif config._mlp_class in ("LLaMAMLP", "GemmaMLP", "LLaMAMoE"): untie_weights = "Gemma" in config.name - copy_fn = partial(copy_weights_llama, config, untie_weights=untie_weights) + copy_fn = partial( + copy_weights_llama, config, untie_weights=untie_weights + ) elif "phi" in config.name: copy_fn = partial(copy_weights_phi, config) else: diff --git a/llm-lora-finetuning/scripts/convert_pretrained_checkpoint.py b/llm-lora-finetuning/scripts/convert_pretrained_checkpoint.py index b32103e0..a6c30933 100644 --- a/llm-lora-finetuning/scripts/convert_pretrained_checkpoint.py +++ b/llm-lora-finetuning/scripts/convert_pretrained_checkpoint.py @@ -17,7 +17,12 @@ @torch.inference_mode() -def convert_checkpoint(checkpoint_file: Path, tokenizer_dir: Path, config_name: str, output_dir: Path) -> None: +def convert_checkpoint( + checkpoint_file: Path, + tokenizer_dir: Path, + config_name: str, + output_dir: Path, +) -> None: """Convert a checkpoint after pretraining. The pretrained checkpoint contains optimizer states and several other metadata that are not needed after training @@ -40,7 +45,9 @@ def convert_checkpoint(checkpoint_file: Path, tokenizer_dir: Path, config_name: " Please delete it first or choose a different name." ) if not tokenizer_dir.is_dir(): - raise FileNotFoundError(f"The tokenizer_dir must be a directory: {str(output_dir)}.") + raise FileNotFoundError( + f"The tokenizer_dir must be a directory: {str(output_dir)}." + ) output_dir.mkdir(parents=True) output_checkpoint_file = output_dir / "lit_model.pth" @@ -57,7 +64,10 @@ def convert_checkpoint(checkpoint_file: Path, tokenizer_dir: Path, config_name: # Copy config for tokenization if found if (tokenizer_dir / "generation_config.json").is_file(): - shutil.copyfile(tokenizer_dir / "generation_config.json", output_dir / "generation_config.json") + shutil.copyfile( + tokenizer_dir / "generation_config.json", + output_dir / "generation_config.json", + ) # Extract the model state dict and save to output folder with incremental_save(output_checkpoint_file) as saver: diff --git a/llm-lora-finetuning/scripts/download.py b/llm-lora-finetuning/scripts/download.py index b1a1a78f..e5a7459d 100644 --- a/llm-lora-finetuning/scripts/download.py +++ b/llm-lora-finetuning/scripts/download.py @@ -28,14 +28,19 @@ def download_from_hub( if repo_id is None: from lit_gpt.config import configs - options = [f"{config['hf_config']['org']}/{config['hf_config']['name']}" for config in configs] + options = [ + f"{config['hf_config']['org']}/{config['hf_config']['name']}" + for config in configs + ] print("Please specify --repo_id . 
Available values:") print("\n".join(options)) return from huggingface_hub import snapshot_download - if ("meta-llama" in repo_id or "falcon-180" in repo_id) and not access_token: + if ( + "meta-llama" in repo_id or "falcon-180" in repo_id + ) and not access_token: raise ValueError( f"{repo_id} requires authentication, please set the `HF_TOKEN=your_token` environment" " variable or pass --access_token=your_token. You can find your token by visiting" @@ -52,7 +57,9 @@ def download_from_hub( # covers `.bin` files and `.bin.index.json` download_files.append("*.bin*") elif from_safetensors: - raise ValueError("`--from_safetensors=True` won't have an effect with `--tokenizer_only=True`") + raise ValueError( + "`--from_safetensors=True` won't have an effect with `--tokenizer_only=True`" + ) import huggingface_hub._snapshot_download as download import huggingface_hub.constants as constants @@ -87,7 +94,9 @@ def download_from_hub( try: result = safetensors_load(safetensor_path) except SafetensorError as e: - raise RuntimeError(f"{safetensor_path} is likely corrupted. Please try to re-download it.") from e + raise RuntimeError( + f"{safetensor_path} is likely corrupted. Please try to re-download it." + ) from e print(f"{safetensor_path} --> {bin_path}") torch.save(result, bin_path) os.remove(safetensor_path) diff --git a/llm-lora-finetuning/scripts/merge_lora.py b/llm-lora-finetuning/scripts/merge_lora.py index c25f87f4..89818a99 100644 --- a/llm-lora-finetuning/scripts/merge_lora.py +++ b/llm-lora-finetuning/scripts/merge_lora.py @@ -14,12 +14,19 @@ sys.path.append(str(wd)) from lit_gpt.lora import GPT, Config, lora_filter, merge_lora_weights -from lit_gpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision, lazy_load +from lit_gpt.utils import ( + CLI, + check_valid_checkpoint_dir, + get_default_supported_precision, + lazy_load, +) def merge_lora( lora_path: Path = Path("out/lora/alpaca/lit_model_lora_finetuned.pth"), - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), out_dir: Path = Path("out/lora/checkpoint"), precision: Optional[str] = None, lora_r: int = 8, @@ -75,7 +82,11 @@ def merge_lora( save_path = out_dir / "lit_model.pth" fabric.print(f"Saving weights to {str(save_path)!r}") # remove lora parameters and the lora linear substring - state_dict = {k.replace("linear.", ""): v for k, v in model.state_dict().items() if not lora_filter(k, v)} + state_dict = { + k.replace("linear.", ""): v + for k, v in model.state_dict().items() + if not lora_filter(k, v) + } torch.save(state_dict, save_path) diff --git a/llm-lora-finetuning/scripts/prepare_alpaca.py b/llm-lora-finetuning/scripts/prepare_alpaca.py index 61ca7bf3..cde6fca1 100644 --- a/llm-lora-finetuning/scripts/prepare_alpaca.py +++ b/llm-lora-finetuning/scripts/prepare_alpaca.py @@ -22,7 +22,9 @@ def prepare( destination_path: Path = Path("data/alpaca"), - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), test_split_fraction: float = 0.03865, # to get exactly 2000 test samples, seed: int = 42, mask_inputs: bool = False, # as in alpaca-lora @@ -37,7 +39,9 @@ def prepare( which stores the preprocessed and tokenized prompts and labels. 
""" if max_seq_length is None: - with open(checkpoint_dir / "lit_config.json", "r", encoding="utf-8") as file: + with open( + checkpoint_dir / "lit_config.json", "r", encoding="utf-8" + ) as file: config = json.load(file) max_seq_length = config["block_size"] @@ -53,7 +57,9 @@ def prepare( # Partition the dataset into train and test train_set, test_set = random_split( - data, [1.0 - test_split_fraction, test_split_fraction], generator=torch.Generator().manual_seed(seed) + data, + [1.0 - test_split_fraction, test_split_fraction], + generator=torch.Generator().manual_seed(seed), ) train_set, test_set = list(train_set), list(test_set) @@ -100,7 +106,13 @@ def download_if_missing(file_path: Path, file_url: str) -> None: f.write(requests.get(file_url).text) -def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool, ignore_index: int) -> dict: +def prepare_sample( + example: dict, + tokenizer: Tokenizer, + max_length: int, + mask_inputs: bool, + ignore_index: int, +) -> dict: """Processes a single sample. Each sample in the dataset consists of: @@ -120,14 +132,20 @@ def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_in full_prompt = generate_prompt(example) full_prompt_and_response = full_prompt + example["output"] encoded_full_prompt = tokenizer.encode(full_prompt, max_length=max_length) - encoded_full_prompt_and_response = tokenizer.encode(full_prompt_and_response, eos=True, max_length=max_length) + encoded_full_prompt_and_response = tokenizer.encode( + full_prompt_and_response, eos=True, max_length=max_length + ) # The labels are the full prompt with response, but with the prompt masked out labels = encoded_full_prompt_and_response.clone() if mask_inputs: labels[: len(encoded_full_prompt)] = ignore_index - return {**example, "input_ids": encoded_full_prompt_and_response, "labels": labels} + return { + **example, + "input_ids": encoded_full_prompt_and_response, + "labels": labels, + } def generate_prompt(example: dict) -> str: diff --git a/llm-lora-finetuning/scripts/prepare_csv.py b/llm-lora-finetuning/scripts/prepare_csv.py index 89dd43f9..bbd27074 100644 --- a/llm-lora-finetuning/scripts/prepare_csv.py +++ b/llm-lora-finetuning/scripts/prepare_csv.py @@ -22,7 +22,9 @@ def prepare( csv_path: Path, destination_path: Path = Path("data/csv"), - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), test_split_fraction: float = 0.1, seed: int = 42, mask_inputs: bool = False, @@ -46,7 +48,9 @@ def prepare( df = pd.read_csv(csv_path, dtype=str).fillna("") if not (df.columns.values == columns).all(): - raise ValueError(f"CSV columns must be {columns}, found {df.columns.values}") + raise ValueError( + f"CSV columns must be {columns}, found {df.columns.values}" + ) data = json.loads(df.to_json(orient="records", indent=4)) print("Loading tokenizer...") @@ -54,7 +58,9 @@ def prepare( # Partition the dataset into train and test train_set, test_set = random_split( - data, [1.0 - test_split_fraction, test_split_fraction], generator=torch.Generator().manual_seed(seed) + data, + [1.0 - test_split_fraction, test_split_fraction], + generator=torch.Generator().manual_seed(seed), ) train_set, test_set = list(train_set), list(test_set) @@ -88,7 +94,13 @@ def prepare( torch.save(test_set, destination_path / "test.pt") -def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool, ignore_index: int) -> dict: +def 
prepare_sample( + example: dict, + tokenizer: Tokenizer, + max_length: int, + mask_inputs: bool, + ignore_index: int, +) -> dict: """Processes a single sample. Each sample in the dataset consists of: @@ -108,14 +120,20 @@ def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_in full_prompt = generate_prompt(example) full_prompt_and_response = full_prompt + example["output"] encoded_full_prompt = tokenizer.encode(full_prompt, max_length=max_length) - encoded_full_prompt_and_response = tokenizer.encode(full_prompt_and_response, eos=True, max_length=max_length) + encoded_full_prompt_and_response = tokenizer.encode( + full_prompt_and_response, eos=True, max_length=max_length + ) # The labels are the full prompt with response, but with the prompt masked out labels = encoded_full_prompt_and_response.clone() if mask_inputs: labels[: len(encoded_full_prompt)] = ignore_index - return {**example, "input_ids": encoded_full_prompt_and_response, "labels": labels} + return { + **example, + "input_ids": encoded_full_prompt_and_response, + "labels": labels, + } def generate_prompt(example: dict) -> str: diff --git a/llm-lora-finetuning/scripts/prepare_dolly.py b/llm-lora-finetuning/scripts/prepare_dolly.py index 56da37ce..8bb43439 100644 --- a/llm-lora-finetuning/scripts/prepare_dolly.py +++ b/llm-lora-finetuning/scripts/prepare_dolly.py @@ -17,12 +17,15 @@ from lit_gpt.tokenizer import Tokenizer from lit_gpt.utils import CLI + from scripts.prepare_alpaca import download_if_missing def prepare( destination_path: Path = Path("data/dolly"), - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), test_split_fraction: float = 0.1, seed: int = 42, mask_inputs: bool = False, @@ -38,7 +41,9 @@ def prepare( """ if max_seq_length is None: - with open(checkpoint_dir / "lit_config.json", "r", encoding="utf-8") as file: + with open( + checkpoint_dir / "lit_config.json", "r", encoding="utf-8" + ) as file: config = json.load(file) max_seq_length = config["block_size"] @@ -59,7 +64,9 @@ def prepare( # Partition the dataset into train and test train_set, test_set = random_split( - data, [1.0 - test_split_fraction, test_split_fraction], generator=torch.Generator().manual_seed(seed) + data, + [1.0 - test_split_fraction, test_split_fraction], + generator=torch.Generator().manual_seed(seed), ) train_set, test_set = list(train_set), list(test_set) @@ -93,7 +100,13 @@ def prepare( torch.save(test_set, destination_path / "test.pt") -def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool, ignore_index: int) -> dict: +def prepare_sample( + example: dict, + tokenizer: Tokenizer, + max_length: int, + mask_inputs: bool, + ignore_index: int, +) -> dict: """Processes a single sample. 
Each sample in the dataset consists of: @@ -113,14 +126,20 @@ def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_in full_prompt = generate_prompt(example) full_prompt_and_response = full_prompt + example["output"] encoded_full_prompt = tokenizer.encode(full_prompt, max_length=max_length) - encoded_full_prompt_and_response = tokenizer.encode(full_prompt_and_response, eos=True, max_length=max_length) + encoded_full_prompt_and_response = tokenizer.encode( + full_prompt_and_response, eos=True, max_length=max_length + ) # The labels are the full prompt with response, but with the prompt masked out labels = encoded_full_prompt_and_response.clone() if mask_inputs: labels[: len(encoded_full_prompt)] = ignore_index - return {**example, "input_ids": encoded_full_prompt_and_response, "labels": labels} + return { + **example, + "input_ids": encoded_full_prompt_and_response, + "labels": labels, + } def generate_prompt(example: dict) -> str: diff --git a/llm-lora-finetuning/scripts/prepare_flan.py b/llm-lora-finetuning/scripts/prepare_flan.py index 59d3a7fa..a34b5472 100644 --- a/llm-lora-finetuning/scripts/prepare_flan.py +++ b/llm-lora-finetuning/scripts/prepare_flan.py @@ -15,6 +15,7 @@ from lit_gpt.tokenizer import Tokenizer from lit_gpt.utils import CLI + from scripts.prepare_alpaca import download_if_missing @@ -28,7 +29,9 @@ def load_jsonl(filename): def prepare( destination_path: Path = Path("data/flan"), - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), mask_inputs: bool = False, # as in alpaca-lora subsets: Optional[str] = None, ignore_index: int = -1, @@ -121,7 +124,9 @@ def prepare( subsets = list(supported_subsets) if max_seq_length is None: - with open(checkpoint_dir / "lit_config.json", "r", encoding="utf-8") as file: + with open( + checkpoint_dir / "lit_config.json", "r", encoding="utf-8" + ) as file: config = json.load(file) max_seq_length = config["block_size"] @@ -187,7 +192,13 @@ def prepare( torch.save(test_set, destination_path / "test.pt") -def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool, ignore_index: int): +def prepare_sample( + example: dict, + tokenizer: Tokenizer, + max_length: int, + mask_inputs: bool, + ignore_index: int, +): """Processes a single sample. 
Each sample in the dataset consists of: @@ -207,14 +218,20 @@ def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_in full_prompt = generate_prompt(example) full_prompt_and_response = full_prompt + example["targets"] encoded_full_prompt = tokenizer.encode(full_prompt, max_length=max_length) - encoded_full_prompt_and_response = tokenizer.encode(full_prompt_and_response, eos=True, max_length=max_length) + encoded_full_prompt_and_response = tokenizer.encode( + full_prompt_and_response, eos=True, max_length=max_length + ) # The labels are the full prompt with response, but with the prompt masked out labels = encoded_full_prompt_and_response.clone() if mask_inputs: labels[: len(encoded_full_prompt)] = ignore_index - return {**example, "input_ids": encoded_full_prompt_and_response, "labels": labels} + return { + **example, + "input_ids": encoded_full_prompt_and_response, + "labels": labels, + } def generate_prompt(example): diff --git a/llm-lora-finetuning/scripts/prepare_lima.py b/llm-lora-finetuning/scripts/prepare_lima.py index ca35e62b..e27928ce 100644 --- a/llm-lora-finetuning/scripts/prepare_lima.py +++ b/llm-lora-finetuning/scripts/prepare_lima.py @@ -23,7 +23,9 @@ def prepare( destination_path: Path = Path("data/lima"), test_split_fraction: float = 0.1, - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), mask_inputs: bool = False, # as in alpaca-lora seed: int = 42, include_multiturn_conversations: bool = False, @@ -46,7 +48,9 @@ def prepare( ) if max_seq_length is None: - with open(checkpoint_dir / "lit_config.json", "r", encoding="utf-8") as file: + with open( + checkpoint_dir / "lit_config.json", "r", encoding="utf-8" + ) as file: config = json.load(file) max_seq_length = config["block_size"] @@ -56,7 +60,9 @@ def prepare( from datasets import load_dataset dataset = load_dataset(data_repo_id, token=access_token) - train_data = format_dataset(dataset["train"], include_multiturn_conversations) + train_data = format_dataset( + dataset["train"], include_multiturn_conversations + ) # test set is present but doesn't have any solutions, so we cannot use it here # but have to create our own @@ -68,7 +74,9 @@ def prepare( # Partition the dataset into train and test train_set, test_set = random_split( - train_data, [1.0 - test_split_fraction, test_split_fraction], generator=torch.Generator().manual_seed(seed) + train_data, + [1.0 - test_split_fraction, test_split_fraction], + generator=torch.Generator().manual_seed(seed), ) train_set, test_set = list(train_set), list(test_set) @@ -102,22 +110,38 @@ def prepare( torch.save(test_set, destination_path / "test.pt") -def format_dataset(dataset_partition: dict, include_multi_turn_conversations: bool) -> List[dict]: +def format_dataset( + dataset_partition: dict, include_multi_turn_conversations: bool +) -> List[dict]: formatted_ds = [] for entry in dataset_partition: convo = entry["conversations"] if include_multi_turn_conversations: for i in range(0, len(convo) - 1, 2): - formatted_ds.append({"instruction": convo[i], "input": "", "output": convo[i + 1]}) + formatted_ds.append( + { + "instruction": convo[i], + "input": "", + "output": convo[i + 1], + } + ) else: - formatted_ds.append({"instruction": convo[0], "input": "", "output": convo[1]}) + formatted_ds.append( + {"instruction": convo[0], "input": "", "output": convo[1]} + ) return formatted_ds -def prepare_sample(example: dict, tokenizer: Tokenizer, 
max_length: int, mask_inputs: bool, ignore_index: int) -> dict: +def prepare_sample( + example: dict, + tokenizer: Tokenizer, + max_length: int, + mask_inputs: bool, + ignore_index: int, +) -> dict: """Processes a single sample. Each sample in the dataset consists of: @@ -137,14 +161,20 @@ def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_in full_prompt = generate_prompt(example) full_prompt_and_response = full_prompt + example["output"] encoded_full_prompt = tokenizer.encode(full_prompt, max_length=max_length) - encoded_full_prompt_and_response = tokenizer.encode(full_prompt_and_response, eos=True, max_length=max_length) + encoded_full_prompt_and_response = tokenizer.encode( + full_prompt_and_response, eos=True, max_length=max_length + ) # The labels are the full prompt with response, but with the prompt masked out labels = encoded_full_prompt_and_response.clone() if mask_inputs: labels[: len(encoded_full_prompt)] = ignore_index - return {**example, "input_ids": encoded_full_prompt_and_response, "labels": labels} + return { + **example, + "input_ids": encoded_full_prompt_and_response, + "labels": labels, + } def generate_prompt(example: dict) -> str: diff --git a/llm-lora-finetuning/scripts/prepare_longform.py b/llm-lora-finetuning/scripts/prepare_longform.py index 2a46e7dd..6327bad8 100644 --- a/llm-lora-finetuning/scripts/prepare_longform.py +++ b/llm-lora-finetuning/scripts/prepare_longform.py @@ -16,12 +16,15 @@ from lit_gpt.tokenizer import Tokenizer from lit_gpt.utils import CLI + from scripts.prepare_alpaca import download_if_missing def prepare( destination_path: Path = Path("data/longform"), - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), mask_inputs: bool = False, # as in alpaca-lora ignore_index: int = -1, max_seq_length: Optional[int] = None, @@ -32,7 +35,9 @@ def prepare( which stores the preprocessed and tokenized prompts and labels. """ if max_seq_length is None: - with open(checkpoint_dir / "lit_config.json", "r", encoding="utf-8") as file: + with open( + checkpoint_dir / "lit_config.json", "r", encoding="utf-8" + ) as file: config = json.load(file) max_seq_length = config["block_size"] @@ -91,7 +96,13 @@ def prepare( torch.save(test_data, destination_path / "test.pt") -def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool, ignore_index: int) -> dict: +def prepare_sample( + example: dict, + tokenizer: Tokenizer, + max_length: int, + mask_inputs: bool, + ignore_index: int, +) -> dict: """Processes a single sample. 
Each sample in the dataset consists of: @@ -111,14 +122,20 @@ def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_in full_prompt = generate_prompt(example) full_prompt_and_response = full_prompt + example["output"] encoded_full_prompt = tokenizer.encode(full_prompt, max_length=max_length) - encoded_full_prompt_and_response = tokenizer.encode(full_prompt_and_response, eos=True, max_length=max_length) + encoded_full_prompt_and_response = tokenizer.encode( + full_prompt_and_response, eos=True, max_length=max_length + ) # The labels are the full prompt with response, but with the prompt masked out labels = encoded_full_prompt_and_response.clone() if mask_inputs: labels[: len(encoded_full_prompt)] = ignore_index - return {**example, "input_ids": encoded_full_prompt_and_response, "labels": labels} + return { + **example, + "input_ids": encoded_full_prompt_and_response, + "labels": labels, + } def generate_prompt(example: dict) -> str: diff --git a/llm-lora-finetuning/scripts/prepare_openwebtext.py b/llm-lora-finetuning/scripts/prepare_openwebtext.py index 2578ab9f..fbb4a8d9 100644 --- a/llm-lora-finetuning/scripts/prepare_openwebtext.py +++ b/llm-lora-finetuning/scripts/prepare_openwebtext.py @@ -20,7 +20,9 @@ def prepare( destination_path: Path = Path("data/openwebtext"), - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), seed: int = 42, test_size: Union[float, int, None] = 0.0005, ) -> None: @@ -43,8 +45,12 @@ def prepare( dataset = load_dataset("openwebtext", num_proc=num_proc_load_dataset) # owt by default only contains the 'train' split, so create a test split - split_dataset = dataset["train"].train_test_split(test_size=test_size, seed=seed, shuffle=True) - split_dataset["val"] = split_dataset.pop("test") # rename the test split to val + split_dataset = dataset["train"].train_test_split( + test_size=test_size, seed=seed, shuffle=True + ) + split_dataset["val"] = split_dataset.pop( + "test" + ) # rename the test split to val def process(example): ids = tokenizer.encode(example["text"]).tolist() @@ -56,20 +62,33 @@ def process(example): return {"ids": ids, "len": len(ids)} # tokenize the dataset - tokenized = split_dataset.map(process, remove_columns=["text"], desc="tokenizing the splits", num_proc=num_proc) + tokenized = split_dataset.map( + process, + remove_columns=["text"], + desc="tokenizing the splits", + num_proc=num_proc, + ) # concatenate all the ids in each dataset into one large file we can use for training for split, dset in tokenized.items(): arr_len = np.sum(dset["len"], dtype=np.uint64) filename = destination_path / f"{split}.bin" - dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16) - arr = np.memmap(str(filename), dtype=dtype, mode="w+", shape=(arr_len,)) + dtype = ( + np.uint16 + ) # (can do since enc.max_token_value == 50256 is < 2**16) + arr = np.memmap( + str(filename), dtype=dtype, mode="w+", shape=(arr_len,) + ) total_batches = 1024 idx = 0 - for batch_idx in tqdm(range(total_batches), desc=f"writing {filename}"): + for batch_idx in tqdm( + range(total_batches), desc=f"writing {filename}" + ): # Batch together samples for faster write - batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format("numpy") + batch = dset.shard( + num_shards=total_batches, index=batch_idx, contiguous=True + ).with_format("numpy") arr_batch = np.concatenate(batch["ids"]) # Write into mmap arr[idx 
: idx + len(arr_batch)] = arr_batch diff --git a/llm-lora-finetuning/scripts/prepare_redpajama.py b/llm-lora-finetuning/scripts/prepare_redpajama.py index f2c87a33..02044307 100644 --- a/llm-lora-finetuning/scripts/prepare_redpajama.py +++ b/llm-lora-finetuning/scripts/prepare_redpajama.py @@ -43,7 +43,11 @@ def prepare_sample( - source_path: Path, checkpoint_dir: Path, destination_path: Path, chunk_size: int, match: str = "" + source_path: Path, + checkpoint_dir: Path, + destination_path: Path, + chunk_size: int, + match: str = "", ) -> None: """Prepare the "Red Pajama" dataset using the original tokenizer.""" destination_path.mkdir(parents=True, exist_ok=True) @@ -87,7 +91,11 @@ def prepare_sample( def prepare_full( - source_path: Path, checkpoint_dir: Path, destination_path: Path, chunk_size: int, match: str = "" + source_path: Path, + checkpoint_dir: Path, + destination_path: Path, + chunk_size: int, + match: str = "", ) -> None: """Prepare the "Red Pajama" dataset using the original tokenizer.""" import zstandard as zstd @@ -102,7 +110,9 @@ def prepare_full( is_cc = set_name == "common_crawl" - filenames = glob.glob(os.path.join(source_path, pattern), recursive=True) + filenames = glob.glob( + os.path.join(source_path, pattern), recursive=True + ) if not filenames: raise RuntimeError( @@ -127,24 +137,32 @@ def prepare_full( print(f"Processing {name}") if is_cc: - with zstd.open(open(filepath, "rb"), "rt", encoding="utf-8") as f: + with zstd.open( + open(filepath, "rb"), "rt", encoding="utf-8" + ) as f: for row in tqdm(f): text = json.loads(row)["text"] text_ids = tokenizer.encode(text) - builder.add_array(np.array(text_ids, dtype=builder.dtype)) + builder.add_array( + np.array(text_ids, dtype=builder.dtype) + ) else: with open(filepath, encoding="utf-8") as f: for row in tqdm(f): text = json.loads(row)["text"] text_ids = tokenizer.encode(text) - builder.add_array(np.array(text_ids, dtype=builder.dtype)) + builder.add_array( + np.array(text_ids, dtype=builder.dtype) + ) builder.write_reminder() def prepare( source_path: Path = Path("data/RedPajama-Data-1T-Sample"), - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), destination_path: Path = Path("data/redpajama_sample"), sample: bool = True, match: str = "", @@ -157,7 +175,8 @@ def prepare( source_path=source_path, checkpoint_dir=checkpoint_dir, destination_path=destination_path, - chunk_size=(config.block_size + 1) * 1024, # block size + 1 for causal, 1024 blocks + chunk_size=(config.block_size + 1) + * 1024, # block size + 1 for causal, 1024 blocks match=match, ) diff --git a/llm-lora-finetuning/scripts/prepare_slimpajama.py b/llm-lora-finetuning/scripts/prepare_slimpajama.py index 7a83316a..0a80191f 100644 --- a/llm-lora-finetuning/scripts/prepare_slimpajama.py +++ b/llm-lora-finetuning/scripts/prepare_slimpajama.py @@ -30,7 +30,10 @@ def prepare_item(self, filepath): with zstd.open(open(filepath, "rb"), "rt", encoding="utf-8") as f: for row in f: text = json.loads(row)["text"] - if json.loads(row)["meta"]["redpajama_set_name"] == "RedPajamaGithub": + if ( + json.loads(row)["meta"]["redpajama_set_name"] + == "RedPajamaGithub" + ): continue # exclude the GitHub data since it overlaps with starcoder text_ids = self.tokenizer.encode(text, bos=False, eos=True) yield text_ids @@ -44,7 +47,9 @@ def prepare( fast_dev_run: bool = False, ) -> None: tokenizer = Tokenizer(tokenizer_path) - data_recipe = 
SlimPajamaDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size) + data_recipe = SlimPajamaDataRecipe( + tokenizer=tokenizer, chunk_size=chunk_size + ) data_processor = DataProcessor( input_dir=str(input_dir), output_dir=str(output_dir), diff --git a/llm-lora-finetuning/scripts/prepare_starcoder.py b/llm-lora-finetuning/scripts/prepare_starcoder.py index ea260ebc..1f67c93e 100644 --- a/llm-lora-finetuning/scripts/prepare_starcoder.py +++ b/llm-lora-finetuning/scripts/prepare_starcoder.py @@ -33,7 +33,9 @@ def prepare_item(self, item_metadata): try: parquet_file = pq.ParquetFile(filepath) # reduce RAM usage - for batch in parquet_file.iter_batches(batch_size=8192, columns=["content"]): + for batch in parquet_file.iter_batches( + batch_size=8192, columns=["content"] + ): for text in batch.to_pandas()["content"]: yield self.tokenizer.encode(text, bos=False, eos=True) @@ -55,7 +57,9 @@ def prepare( fast_dev_run: bool = False, ) -> None: tokenizer = Tokenizer(tokenizer_path) - data_recipe = StarcoderDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size) + data_recipe = StarcoderDataRecipe( + tokenizer=tokenizer, chunk_size=chunk_size + ) data_processor = DataProcessor( input_dir=str(input_dir), output_dir=str(output_dir), diff --git a/llm-lora-finetuning/steps/eval.py b/llm-lora-finetuning/steps/eval.py new file mode 100644 index 00000000..4d38bc1f --- /dev/null +++ b/llm-lora-finetuning/steps/eval.py @@ -0,0 +1,45 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. +import json +import shutil +from pathlib import Path +from typing import Annotated, Any, Dict, Optional + +from evaluate.lm_eval_harness import run_eval_harness +from zenml import step + +from scripts.download import download_from_hub +from scripts.merge_lora import merge_lora + + +@step +def eval( + model_repo: str, adapter_repo: Optional[str] = None +) -> Annotated[Dict[str, Any], "evaluation_results"]: + model_dir = Path("model") + download_from_hub(repo_id=model_repo, checkpoint_dir=model_dir) + + if adapter_repo: + adapter_dir = Path("adapter") + merged_dir = Path("merged") + + download_from_hub(repo_id=adapter_repo, checkpoint_dir=adapter_dir) + + lora_path = adapter_dir / "lit_model_lora_finetuned.pth" + merge_lora( + lora_path=Path(lora_path), + checkpoint_dir=model_dir, + out_dir=merged_dir, + ) + + for path in Path(model_dir).glob("*.json"): + destination = Path(merged_dir) / path.name + + shutil.copy(src=path, dst=destination) + + model_dir = merged_dir + + output_path = Path("output.json") + run_eval_harness(checkpoint_dir=model_dir, save_filepath=output_path) + + with open(output_path, "r") as f: + return json.load(f) diff --git a/llm-lora-finetuning/steps/feature_engineering.py b/llm-lora-finetuning/steps/feature_engineering.py new file mode 100644 index 00000000..b28d0336 --- /dev/null +++ b/llm-lora-finetuning/steps/feature_engineering.py @@ -0,0 +1,24 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
+import importlib +from pathlib import Path + +from zenml import step + +from scripts.download import download_from_hub + + +@step +def feature_engineering(model_repo: str, dataset_name: str) -> None: + checkpoint_dir = Path("checkpoints") + download_from_hub( + repo_id=model_repo, tokenizer_only=True, checkpoint_dir=checkpoint_dir + ) + + destination_dir = Path("data") / dataset_name + + helper_module = importlib.import_module(f"scripts.prepare_{dataset_name}") + prepare_function = getattr(helper_module, "prepare") + + prepare_function( + checkpoint_dir=checkpoint_dir, destination_path=destination_dir + ) diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py index 35dcb22f..5aef5c02 100644 --- a/llm-lora-finetuning/steps/finetune.py +++ b/llm-lora-finetuning/steps/finetune.py @@ -1,43 +1,74 @@ # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. +import shutil from pathlib import Path -from typing import Tuple, Annotated +from typing import Optional +from finetune.lora import setup +from huggingface_hub import upload_folder from lit_gpt.args import IOArgs from zenml import step -from scripts.download import download_from_hub from scripts.convert_hf_checkpoint import convert_hf_checkpoint -from scripts.prepare_alpaca import prepare -from finetune.lora import setup +from scripts.convert_lit_checkpoint import convert_lit_checkpoint +from scripts.download import download_from_hub from scripts.merge_lora import merge_lora -import shutil +from scripts.prepare_alpaca import prepare + @step -def finetune_lora(repo_id: str) -> Tuple[Annotated[str, "checkpoint_dir"], Annotated[str, "output_path"]]: +def finetune_lora( + repo_id: str, + adapter_output_repo: Optional[str] = None, + merged_output_repo: Optional[str] = None, + convert_to_hf: bool = False, +) -> None: checkpoint_dir = Path("checkpoints") data_dir = Path("data/alpaca") output_dir = Path("out/lora/alpaca") download_from_hub(repo_id=repo_id, checkpoint_dir=checkpoint_dir) convert_hf_checkpoint(checkpoint_dir=checkpoint_dir) prepare(destination_path=data_dir, checkpoint_dir=checkpoint_dir) - - io_args = IOArgs( - train_data_dir=data_dir, - val_data_dir=data_dir, - checkpoint_dir=checkpoint_dir, - out_dir=output_dir, - ), + + io_args = ( + IOArgs( + train_data_dir=data_dir, + val_data_dir=data_dir, + checkpoint_dir=checkpoint_dir, + out_dir=output_dir, + ), + ) setup(precision="bf16-true", io=io_args) model_name = repo_id.split("/")[-1] - lora_path = output_dir / model_name / "lit_model_lora_finetuned.pth" - merge_output_dir = Path("out/lora_merged") / model_name - merge_lora(lora_alpha=lora_path, checkpoint_dir=checkpoint_dir, out_dir=merge_output_dir) + if merged_output_repo: + lora_path = output_dir / model_name / "lit_model_lora_finetuned.pth" + + merge_output_dir = Path("out/lora_merged") / model_name + merge_lora( + lora_path=lora_path, + checkpoint_dir=checkpoint_dir, + out_dir=merge_output_dir, + ) + + for path in Path(checkpoint_dir).glob("*.json"): + destination = Path(merge_output_dir) / path.name + + shutil.copy(src=path, dst=destination) - for path in Path(checkpoint_dir).glob('*.json'): - destination = Path(merge_output_dir) / path.name + if convert_to_hf: + upload_dir = Path("hf_checkpoint_merged") + convert_lit_checkpoint( + checkpoint_path=merge_output_dir, + output_path=upload_dir, + config_path=merge_output_dir / "lit_config.json", + ) + else: + upload_dir = merge_output_dir - shutil.copy(src=path, dst=destination) +
upload_folder(repo_id=merged_output_repo, folder_path=upload_dir) - return checkpoint_dir, lora_path + if adapter_output_repo: + upload_folder( + repo_id=adapter_output_repo, folder_path=output_dir / model_name + ) diff --git a/llm-lora-finetuning/steps/merge.py b/llm-lora-finetuning/steps/merge.py index 61a86e55..6a5c2130 100644 --- a/llm-lora-finetuning/steps/merge.py +++ b/llm-lora-finetuning/steps/merge.py @@ -1,28 +1,49 @@ # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. +import shutil from pathlib import Path - - - -from pathlib import Path - - - - -from lit_gpt.args import IOArgs +from huggingface_hub import upload_folder from zenml import step +from scripts.convert_lit_checkpoint import convert_lit_checkpoint from scripts.download import download_from_hub from scripts.merge_lora import merge_lora -from scripts.prepare_alpaca import prepare -from finetune.lora import setup -import shutil -@step -def merge(checkpoint_dir: str, lora_path: str, out_dir: str) -> None: - merge_lora(lora_alpha=Path(lora_path), checkpoint_dir=Path(checkpoint_dir), out_dir=Path(out_dir)) - - for path in Path(checkpoint_dir).glob('*.json'): - destination = Path(out_dir) / path.name - shutil.copy(src=path, dst=destination) \ No newline at end of file +@step +def merge( + base_model_repo: str, + adapter_repo: str, + output_repo: str, + convert_to_hf: bool = False, +) -> None: + base_model_dir = Path("checkpoints") + adapter_dir = Path("adapter") + merged_dir = Path("merged") + + download_from_hub(repo_id=base_model_repo, checkpoint_dir=base_model_dir) + download_from_hub(repo_id=adapter_repo, checkpoint_dir=adapter_dir) + + lora_path = adapter_dir / "lit_model_lora_finetuned.pth" + merge_lora( + lora_path=Path(lora_path), + checkpoint_dir=base_model_dir, + out_dir=merged_dir, + ) + + for path in Path(base_model_dir).glob("*.json"): + destination = Path(merged_dir) / path.name + + shutil.copy(src=path, dst=destination) + + if convert_to_hf: + output_dir = Path("hf_checkpoint_merged") + convert_lit_checkpoint( + checkpoint_path=merged_dir, + output_path=output_dir, + config_path=merged_dir / "lit_config.json", + ) + else: + output_dir = merged_dir + + upload_folder(repo_id=output_repo, folder_path=output_dir) From ce2a7c7af3b9e81e2c11471a61f2a7d732c8c1c5 Mon Sep 17 00:00:00 2001 From: Michael Schuster Date: Tue, 5 Mar 2024 14:19:58 +0800 Subject: [PATCH 03/26] dir materializer --- llm-lora-finetuning/pipelines/finetuning.py | 2 ++ .../steps/feature_engineering.py | 36 +++++++++++++++++-- llm-lora-finetuning/steps/finetune.py | 9 +++-- 3 files changed, 43 insertions(+), 4 deletions(-) diff --git a/llm-lora-finetuning/pipelines/finetuning.py b/llm-lora-finetuning/pipelines/finetuning.py index 58f05744..d01ba1f9 100644 --- a/llm-lora-finetuning/pipelines/finetuning.py +++ b/llm-lora-finetuning/pipelines/finetuning.py @@ -27,10 +27,12 @@ def finetuning_pipeline( adapter_output_repo: Optional[str] = None, merged_output_repo: Optional[str] = None, convert_to_hf: bool = False, + data_dir: Optional[str] = None, ) -> None: finetune_lora( repo_id=repo_id, adapter_output_repo=adapter_output_repo, merged_output_repo=merged_output_repo, convert_to_hf=convert_to_hf, + data_dir=data_dir ) diff --git a/llm-lora-finetuning/steps/feature_engineering.py b/llm-lora-finetuning/steps/feature_engineering.py index b28d0336..c2fac735 100644 --- a/llm-lora-finetuning/steps/feature_engineering.py +++ b/llm-lora-finetuning/steps/feature_engineering.py @@ -1,14 +1,45 @@ # Copyright Lightning AI. 
Licensed under the Apache License 2.0, see LICENSE file. import importlib from pathlib import Path +from tempfile import mkdtemp +from typing import Any, ClassVar, Tuple, Type from zenml import step +from zenml.enums import ArtifactType +from zenml.io import fileio +from zenml.materializers.base_materializer import BaseMaterializer from scripts.download import download_from_hub -@step -def feature_engineering(model_repo: str, dataset_name: str) -> None: +class LocalDirectoryMaterializer(BaseMaterializer): + ASSOCIATED_TYPES: ClassVar[Tuple[Type[Any], ...]] = (str,) + ASSOCIATED_ARTIFACT_TYPE: ClassVar[ArtifactType] = ArtifactType.DATA + + def load(self, data_type: Type[Any]) -> Any: + """Write logic here to load the data of an artifact. + + Args: + data_type: What type the artifact data should be loaded as. + + Returns: + """ + directory = mkdtemp(prefix="zenml-artifact") + fileio.copy(self.uri, directory) + return directory + + def save(self, data: Any) -> None: + """Write logic here to save the data of an artifact. + + Args: + data: The data of the artifact to save. + """ + assert isinstance(data, str) + fileio.copy(data, self.uri) + + +@step(output_materializers=LocalDirectoryMaterializer) +def feature_engineering(model_repo: str, dataset_name: str) -> str: checkpoint_dir = Path("checkpoints") download_from_hub( repo_id=model_repo, tokenizer_only=True, checkpoint_dir=checkpoint_dir @@ -22,3 +53,4 @@ def feature_engineering(model_repo: str, dataset_name: str) -> None: prepare_function( checkpoint_dir=checkpoint_dir, destination_path=destination_dir ) + return destination_dir diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py index 5aef5c02..a495256b 100644 --- a/llm-lora-finetuning/steps/finetune.py +++ b/llm-lora-finetuning/steps/finetune.py @@ -21,13 +21,18 @@ def finetune_lora( adapter_output_repo: Optional[str] = None, merged_output_repo: Optional[str] = None, convert_to_hf: bool = False, + data_dir: Optional[str] = None, ) -> None: checkpoint_dir = Path("checkpoints") - data_dir = Path("data/alpaca") output_dir = Path("out/lora/alpaca") download_from_hub(repo_id=repo_id, checkpoint_dir=checkpoint_dir) convert_hf_checkpoint(checkpoint_dir=checkpoint_dir) - prepare(destination_path=data_dir, checkpoint_dir=checkpoint_dir) + + if data_dir: + data_dir = Path(data_dir) + else: + data_dir = Path("data/alpaca") + prepare(destination_path=data_dir, checkpoint_dir=checkpoint_dir) io_args = ( IOArgs( From f0739b17b2c24862ae46db2eacd8eea8768d5821 Mon Sep 17 00:00:00 2001 From: Michael Schuster Date: Tue, 5 Mar 2024 15:03:15 +0800 Subject: [PATCH 04/26] wip --- llm-lora-finetuning/configs/eval.yaml | 3 +++ llm-lora-finetuning/configs/feature.yaml | 3 +++ llm-lora-finetuning/configs/finetune.yaml | 5 +++++ llm-lora-finetuning/configs/merge.yaml | 5 +++++ llm-lora-finetuning/pipelines/eval.py | 3 ++- llm-lora-finetuning/pipelines/feature_engineering.py | 3 ++- llm-lora-finetuning/pipelines/finetuning.py | 7 ++++--- llm-lora-finetuning/pipelines/merge.py | 4 +++- llm-lora-finetuning/steps/feature_engineering.py | 10 +++++----- llm-lora-finetuning/steps/finetune.py | 8 +++----- 10 files changed, 35 insertions(+), 16 deletions(-) create mode 100644 llm-lora-finetuning/configs/eval.yaml create mode 100644 llm-lora-finetuning/configs/finetune.yaml create mode 100644 llm-lora-finetuning/configs/merge.yaml diff --git a/llm-lora-finetuning/configs/eval.yaml b/llm-lora-finetuning/configs/eval.yaml new file mode 100644 index 00000000..a03ba2c6 --- /dev/null +++ 
b/llm-lora-finetuning/configs/eval.yaml @@ -0,0 +1,3 @@ +parameters: + model_repo: mistralai/Mistral-7B-Instruct-v0.1 + adapter_repo: ... diff --git a/llm-lora-finetuning/configs/feature.yaml b/llm-lora-finetuning/configs/feature.yaml index e69de29b..5de07309 100644 --- a/llm-lora-finetuning/configs/feature.yaml +++ b/llm-lora-finetuning/configs/feature.yaml @@ -0,0 +1,3 @@ +parameters: + model_repo: mistralai/Mistral-7B-Instruct-v0.1 + dataset_name: alpaca diff --git a/llm-lora-finetuning/configs/finetune.yaml b/llm-lora-finetuning/configs/finetune.yaml new file mode 100644 index 00000000..78ea0ae9 --- /dev/null +++ b/llm-lora-finetuning/configs/finetune.yaml @@ -0,0 +1,5 @@ +parameters: + repo_id: mistralai/Mistral-7B-Instruct-v0.1 + adapter_output_repo: null + merged_output_repo: null + convert_to_hf: False diff --git a/llm-lora-finetuning/configs/merge.yaml b/llm-lora-finetuning/configs/merge.yaml new file mode 100644 index 00000000..3e9ca3ad --- /dev/null +++ b/llm-lora-finetuning/configs/merge.yaml @@ -0,0 +1,5 @@ +parameters: + base_model_repo: mistralai/Mistral-7B-Instruct-v0.1 + adapter_repo: ... + output_repo: ... + convert_to_hf: False diff --git a/llm-lora-finetuning/pipelines/eval.py b/llm-lora-finetuning/pipelines/eval.py index 88194f98..7595b599 100644 --- a/llm-lora-finetuning/pipelines/eval.py +++ b/llm-lora-finetuning/pipelines/eval.py @@ -2,8 +2,9 @@ from steps.eval import eval from zenml import pipeline +from zenml.config import DockerSettings -@pipeline +@pipeline(settings={"docker": DockerSettings(requirements="requirements.txt")}) def eval_pipeline(model_repo: str, adapter_repo: Optional[str] = None) -> None: eval(model_repo=model_repo, adapter_repo=adapter_repo) diff --git a/llm-lora-finetuning/pipelines/feature_engineering.py b/llm-lora-finetuning/pipelines/feature_engineering.py index 4e447421..610ad471 100644 --- a/llm-lora-finetuning/pipelines/feature_engineering.py +++ b/llm-lora-finetuning/pipelines/feature_engineering.py @@ -1,7 +1,8 @@ from steps.feature_engineering import feature_engineering from zenml import pipeline +from zenml.config import DockerSettings -@pipeline +@pipeline(settings={"docker": DockerSettings(requirements="requirements.txt")}) def feature_engineering_pipeline(model_repo: str, dataset_name: str) -> None: feature_engineering(model_repo=model_repo, dataset_name=dataset_name) diff --git a/llm-lora-finetuning/pipelines/finetuning.py b/llm-lora-finetuning/pipelines/finetuning.py index d01ba1f9..1edd1b84 100644 --- a/llm-lora-finetuning/pipelines/finetuning.py +++ b/llm-lora-finetuning/pipelines/finetuning.py @@ -14,9 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +from pathlib import Path from typing import Optional -from steps.finetune import finetune_lora +from steps.finetune import finetune from zenml import pipeline from zenml.config import DockerSettings @@ -29,10 +30,10 @@ def finetuning_pipeline( convert_to_hf: bool = False, data_dir: Optional[str] = None, ) -> None: - finetune_lora( + finetune( repo_id=repo_id, adapter_output_repo=adapter_output_repo, merged_output_repo=merged_output_repo, convert_to_hf=convert_to_hf, - data_dir=data_dir + data_dir=Path(data_dir) if data_dir else None, ) diff --git a/llm-lora-finetuning/pipelines/merge.py b/llm-lora-finetuning/pipelines/merge.py index 4ea639d8..e33f64d2 100644 --- a/llm-lora-finetuning/pipelines/merge.py +++ b/llm-lora-finetuning/pipelines/merge.py @@ -17,9 +17,10 @@ from steps.merge import merge from zenml import pipeline +from zenml.config import DockerSettings -@pipeline +@pipeline(settings={"docker": DockerSettings(requirements="requirements.txt")}) def merge_pipeline( base_model_repo: str, adapter_repo: str, @@ -28,6 +29,7 @@ def merge_pipeline( ) -> None: merge( base_model_repo=base_model_repo, + adapter_repo=adapter_repo, output_repo=output_repo, convert_to_hf=convert_to_hf, ) diff --git a/llm-lora-finetuning/steps/feature_engineering.py b/llm-lora-finetuning/steps/feature_engineering.py index c2fac735..f19de6c1 100644 --- a/llm-lora-finetuning/steps/feature_engineering.py +++ b/llm-lora-finetuning/steps/feature_engineering.py @@ -13,7 +13,7 @@ class LocalDirectoryMaterializer(BaseMaterializer): - ASSOCIATED_TYPES: ClassVar[Tuple[Type[Any], ...]] = (str,) + ASSOCIATED_TYPES: ClassVar[Tuple[Type[Any], ...]] = (Path,) ASSOCIATED_ARTIFACT_TYPE: ClassVar[ArtifactType] = ArtifactType.DATA def load(self, data_type: Type[Any]) -> Any: @@ -26,7 +26,7 @@ def load(self, data_type: Type[Any]) -> Any: """ directory = mkdtemp(prefix="zenml-artifact") fileio.copy(self.uri, directory) - return directory + return Path(directory) def save(self, data: Any) -> None: """Write logic here to save the data of an artifact. @@ -34,12 +34,12 @@ def save(self, data: Any) -> None: Args: data: The data of the artifact to save. 
""" - assert isinstance(data, str) - fileio.copy(data, self.uri) + assert isinstance(data, Path) + fileio.copy(str(data), self.uri) @step(output_materializers=LocalDirectoryMaterializer) -def feature_engineering(model_repo: str, dataset_name: str) -> str: +def feature_engineering(model_repo: str, dataset_name: str) -> Path: checkpoint_dir = Path("checkpoints") download_from_hub( repo_id=model_repo, tokenizer_only=True, checkpoint_dir=checkpoint_dir diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py index a495256b..1d543601 100644 --- a/llm-lora-finetuning/steps/finetune.py +++ b/llm-lora-finetuning/steps/finetune.py @@ -16,21 +16,19 @@ @step -def finetune_lora( +def finetune( repo_id: str, adapter_output_repo: Optional[str] = None, merged_output_repo: Optional[str] = None, convert_to_hf: bool = False, - data_dir: Optional[str] = None, + data_dir: Optional[Path] = None, ) -> None: checkpoint_dir = Path("checkpoints") output_dir = Path("out/lora/alpaca") download_from_hub(repo_id=repo_id, checkpoint_dir=checkpoint_dir) convert_hf_checkpoint(checkpoint_dir=checkpoint_dir) - if data_dir: - data_dir = Path(data_dir) - else: + if not data_dir: data_dir = Path("data/alpaca") prepare(destination_path=data_dir, checkpoint_dir=checkpoint_dir) From 20f751559410342b4252b6ab843d67b4033205fc Mon Sep 17 00:00:00 2001 From: Michael Schuster Date: Wed, 6 Mar 2024 12:06:30 +0800 Subject: [PATCH 05/26] format --- llm-lora-finetuning/.dockerignore | 5 +++-- llm-lora-finetuning/steps/feature_engineering.py | 4 ++-- llm-lora-finetuning/steps/finetune.py | 2 +- llm-lora-finetuning/steps/merge.py | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/llm-lora-finetuning/.dockerignore b/llm-lora-finetuning/.dockerignore index 4a37fa74..083734a0 100644 --- a/llm-lora-finetuning/.dockerignore +++ b/llm-lora-finetuning/.dockerignore @@ -1,7 +1,8 @@ * !/pipelines/** !/steps/** -!/lit_gpt/** +!/evaluate/** +!/finetune/** !/generate/** +!/lit_gpt/** !/scripts/** -!/finetune/** \ No newline at end of file diff --git a/llm-lora-finetuning/steps/feature_engineering.py b/llm-lora-finetuning/steps/feature_engineering.py index f19de6c1..81c28083 100644 --- a/llm-lora-finetuning/steps/feature_engineering.py +++ b/llm-lora-finetuning/steps/feature_engineering.py @@ -12,7 +12,7 @@ from scripts.download import download_from_hub -class LocalDirectoryMaterializer(BaseMaterializer): +class DirectoryMaterializer(BaseMaterializer): ASSOCIATED_TYPES: ClassVar[Tuple[Type[Any], ...]] = (Path,) ASSOCIATED_ARTIFACT_TYPE: ClassVar[ArtifactType] = ArtifactType.DATA @@ -38,7 +38,7 @@ def save(self, data: Any) -> None: fileio.copy(str(data), self.uri) -@step(output_materializers=LocalDirectoryMaterializer) +@step(output_materializers=DirectoryMaterializer) def feature_engineering(model_repo: str, dataset_name: str) -> Path: checkpoint_dir = Path("checkpoints") download_from_hub( diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py index 1d543601..06cead79 100644 --- a/llm-lora-finetuning/steps/finetune.py +++ b/llm-lora-finetuning/steps/finetune.py @@ -62,7 +62,7 @@ def finetune( if convert_to_hf: upload_dir = Path("hf_checkpoint_merged") convert_lit_checkpoint( - checkpoint_path=merged_output_repo, + checkpoint_path=merged_output_repo / "lit_model.pth", output_path=output_dir, config_path=merged_output_repo / "lit_config.json", ) diff --git a/llm-lora-finetuning/steps/merge.py b/llm-lora-finetuning/steps/merge.py index 6a5c2130..45b0d35f 100644 --- 
a/llm-lora-finetuning/steps/merge.py +++ b/llm-lora-finetuning/steps/merge.py @@ -39,7 +39,7 @@ def merge( if convert_to_hf: output_dir = Path("hf_checkpoint_merged") convert_lit_checkpoint( - checkpoint_path=merged_dir, + checkpoint_path=merged_dir / "lit_model.pth", output_path=output_dir, config_path=merged_dir / "lit_config.json", ) From aee20c5e53a710411f670f1bdfbe2819686ff1d1 Mon Sep 17 00:00:00 2001 From: Michael Schuster Date: Wed, 6 Mar 2024 12:15:26 +0800 Subject: [PATCH 06/26] fix imports --- llm-lora-finetuning/lit_gpt/__init__.py | 4 ++-- llm-lora-finetuning/steps/feature_engineering.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llm-lora-finetuning/lit_gpt/__init__.py b/llm-lora-finetuning/lit_gpt/__init__.py index 9eac3836..f3974ec0 100644 --- a/llm-lora-finetuning/lit_gpt/__init__.py +++ b/llm-lora-finetuning/lit_gpt/__init__.py @@ -5,8 +5,8 @@ from lightning_utilities.core.imports import RequirementCache -from lit_gpt.config import Config -from lit_gpt.model import GPT +from lit_gpt.model import GPT # isort: skip +from lit_gpt.config import Config # isort: skip from lit_gpt.tokenizer import Tokenizer _LIGHTNING_AVAILABLE = RequirementCache("lightning>=2.2.0.dev0") diff --git a/llm-lora-finetuning/steps/feature_engineering.py b/llm-lora-finetuning/steps/feature_engineering.py index 81c28083..50d43873 100644 --- a/llm-lora-finetuning/steps/feature_engineering.py +++ b/llm-lora-finetuning/steps/feature_engineering.py @@ -47,7 +47,7 @@ def feature_engineering(model_repo: str, dataset_name: str) -> Path: destination_dir = Path("data") / dataset_name - helper_module = importlib.import_module(f"scripts/prepare_{dataset_name}") + helper_module = importlib.import_module(f"scripts.prepare_{dataset_name}") prepare_function = getattr(helper_module, "prepare") prepare_function( From b1724f67dfe5831ae18f8450a49a67a7daa1256a Mon Sep 17 00:00:00 2001 From: Michael Schuster Date: Wed, 6 Mar 2024 13:21:07 +0800 Subject: [PATCH 07/26] Rewrite materializer --- .../steps/feature_engineering.py | 33 +++++++++++++++---- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/llm-lora-finetuning/steps/feature_engineering.py b/llm-lora-finetuning/steps/feature_engineering.py index 50d43873..07ca4cd7 100644 --- a/llm-lora-finetuning/steps/feature_engineering.py +++ b/llm-lora-finetuning/steps/feature_engineering.py @@ -8,9 +8,11 @@ from zenml.enums import ArtifactType from zenml.io import fileio from zenml.materializers.base_materializer import BaseMaterializer - +from lit_gpt import Config +import json +from dataclasses import asdict from scripts.download import download_from_hub - +import os class DirectoryMaterializer(BaseMaterializer): ASSOCIATED_TYPES: ClassVar[Tuple[Type[Any], ...]] = (Path,) @@ -25,7 +27,7 @@ def load(self, data_type: Type[Any]) -> Any: Returns: """ directory = mkdtemp(prefix="zenml-artifact") - fileio.copy(self.uri, directory) + self._copy_directory(src=self.uri, dst=directory) return Path(directory) def save(self, data: Any) -> None: @@ -35,16 +37,35 @@ def save(self, data: Any) -> None: data: The data of the artifact to save. 
""" assert isinstance(data, Path) - fileio.copy(str(data), self.uri) + self._copy_directory(src=str(data), dst=self.uri) + + @staticmethod + def _copy_directory(src: str, dst: str) -> None: + for src_dir, _, files in fileio.walk(src): + dst_dir = os.path.join(dst, os.path.relpath(src_dir, src)) + fileio.makedirs(dst_dir) + + for file in files: + src_file = os.path.join(src_dir, file) + dst_file = os.path.join(dst_dir, file) + fileio.copy(src_file, dst_file) @step(output_materializers=DirectoryMaterializer) def feature_engineering(model_repo: str, dataset_name: str) -> Path: - checkpoint_dir = Path("checkpoints") + checkpoint_root_dir = Path("checkpoints") download_from_hub( - repo_id=model_repo, tokenizer_only=True, checkpoint_dir=checkpoint_dir + repo_id=model_repo, tokenizer_only=True, checkpoint_dir=checkpoint_root_dir ) + checkpoint_dir = checkpoint_root_dir / model_repo + + model_name = checkpoint_dir.name + config = Config.from_name(model_name) + config_dict = asdict(config) + with open(checkpoint_dir / "lit_config.json", "w") as json_config: + json.dump(config_dict, json_config) + destination_dir = Path("data") / dataset_name helper_module = importlib.import_module(f"scripts.prepare_{dataset_name}") From 86649fcd166e8abcfbfae7f8d98219d0e4e4e842 Mon Sep 17 00:00:00 2001 From: Michael Schuster Date: Wed, 6 Mar 2024 14:09:45 +0800 Subject: [PATCH 08/26] More logging --- .../evaluate/lm_eval_harness.py | 12 +-- llm-lora-finetuning/finetune/adapter.py | 32 +++----- llm-lora-finetuning/finetune/adapter_v2.py | 32 +++----- llm-lora-finetuning/finetune/full.py | 45 ++++-------- llm-lora-finetuning/finetune/lora.py | 32 +++----- llm-lora-finetuning/generate/adapter.py | 12 +-- llm-lora-finetuning/generate/adapter_v2.py | 12 +-- llm-lora-finetuning/generate/base.py | 8 +- llm-lora-finetuning/generate/full.py | 8 +- llm-lora-finetuning/generate/lora.py | 8 +- llm-lora-finetuning/generate/sequentially.py | 42 +++-------- llm-lora-finetuning/generate/tp.py | 32 ++------ llm-lora-finetuning/lit_gpt/adapter.py | 38 +++------- llm-lora-finetuning/lit_gpt/adapter_v2.py | 26 ++----- llm-lora-finetuning/lit_gpt/args.py | 8 +- llm-lora-finetuning/lit_gpt/config.py | 36 +++------ llm-lora-finetuning/lit_gpt/lora.py | 60 ++++----------- llm-lora-finetuning/lit_gpt/model.py | 73 +++++-------------- llm-lora-finetuning/lit_gpt/packed_dataset.py | 8 +- llm-lora-finetuning/lit_gpt/tokenizer.py | 18 ++--- llm-lora-finetuning/lit_gpt/utils.py | 37 +++------- .../scripts/convert_hf_checkpoint.py | 19 ++--- .../scripts/convert_lit_checkpoint.py | 4 +- llm-lora-finetuning/scripts/download.py | 4 +- llm-lora-finetuning/scripts/merge_lora.py | 4 +- llm-lora-finetuning/scripts/prepare_alpaca.py | 8 +- llm-lora-finetuning/scripts/prepare_csv.py | 8 +- llm-lora-finetuning/scripts/prepare_dolly.py | 8 +- llm-lora-finetuning/scripts/prepare_flan.py | 8 +- llm-lora-finetuning/scripts/prepare_lima.py | 12 +-- .../scripts/prepare_longform.py | 16 ++-- .../scripts/prepare_openwebtext.py | 20 ++--- .../scripts/prepare_redpajama.py | 20 ++--- .../scripts/prepare_slimpajama.py | 9 +-- .../scripts/prepare_starcoder.py | 4 +- llm-lora-finetuning/steps/eval.py | 13 +++- .../steps/feature_engineering.py | 22 ++++-- llm-lora-finetuning/steps/finetune.py | 73 ++++++++++++++----- llm-lora-finetuning/steps/merge.py | 15 +++- llm-lora-finetuning/steps/utils.py | 11 +++ 40 files changed, 299 insertions(+), 558 deletions(-) create mode 100644 llm-lora-finetuning/steps/utils.py diff --git a/llm-lora-finetuning/evaluate/lm_eval_harness.py 
b/llm-lora-finetuning/evaluate/lm_eval_harness.py index 6f90c19f..55de2cc5 100644 --- a/llm-lora-finetuning/evaluate/lm_eval_harness.py +++ b/llm-lora-finetuning/evaluate/lm_eval_harness.py @@ -45,9 +45,7 @@ def __init__( @classmethod def create_from_arg_string(cls, arg_string, additional_config=None): - kwargs = { - el.split("=")[0]: el.split("=")[1] for el in arg_string.split(",") - } + kwargs = {el.split("=")[0]: el.split("=")[1] for el in arg_string.split(",")} return cls(**kwargs, **additional_config) @property @@ -87,9 +85,7 @@ def _model_call(self, inps): return self.model(inps) @torch.inference_mode() - def _model_generate( - self, context, max_length, eos_token_id - ) -> torch.Tensor: + def _model_generate(self, context, max_length, eos_token_id) -> torch.Tensor: # this only supports batch size 1 assert context.shape[0] == 1 out = generate(self.model, context[0], max_length, eos_id=eos_token_id) @@ -178,9 +174,7 @@ def run_eval_harness( plugins = None if quantize is not None and quantize.startswith("bnb."): if "mixed" in precision: - raise ValueError( - "Quantization and mixed precision is not supported." - ) + raise ValueError("Quantization and mixed precision is not supported.") dtype = { "16-true": torch.float16, "bf16-true": torch.bfloat16, diff --git a/llm-lora-finetuning/finetune/adapter.py b/llm-lora-finetuning/finetune/adapter.py index acf8f6d4..1826603f 100644 --- a/llm-lora-finetuning/finetune/adapter.py +++ b/llm-lora-finetuning/finetune/adapter.py @@ -76,9 +76,7 @@ def setup( plugins = None if quantize is not None and quantize.startswith("bnb."): if "mixed" in precision: - raise ValueError( - "Quantization and mixed precision is not supported." - ) + raise ValueError("Quantization and mixed precision is not supported.") dtype = { "16-true": torch.float16, "bf16-true": torch.bfloat16, @@ -140,9 +138,7 @@ def main( check_valid_checkpoint_dir(io.checkpoint_dir) - fabric.seed_everything( - 1337 - ) # same seed for every process to init model (FSDP) + fabric.seed_everything(1337) # same seed for every process to init model (FSDP) if fabric.global_rank == 0: os.makedirs(io.out_dir, exist_ok=True) @@ -151,9 +147,7 @@ def main( val_data = torch.load(io.val_data_dir / "test.pt") checkpoint_path = io.checkpoint_dir / "lit_model.pth" - fabric.print( - f"Loading model {str(checkpoint_path)!r} with {config.__dict__}" - ) + fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}") with fabric.init_module(empty_init=(devices > 1)): model = GPT(config) mark_only_adapter_as_trainable(model) @@ -205,9 +199,7 @@ def main( ) fabric.print(f"Training time: {(time.perf_counter()-train_time):.2f}s") if fabric.device.type == "cuda": - fabric.print( - f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB" - ) + fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") # Save the final checkpoint at the end of training save_path = io.out_dir / "lit_model_adapter_finetuned.pth" @@ -228,9 +220,7 @@ def fit( ) -> None: tokenizer = Tokenizer(io.checkpoint_dir) longest_seq_length, longest_seq_ix = get_longest_seq_length(train_data) - model.max_seq_length = min( - longest_seq_length, train.max_seq_length or float("inf") - ) + model.max_seq_length = min(longest_seq_length, train.max_seq_length or float("inf")) fabric.print( f"The longest sequence length in the train data is {longest_seq_length}, the model's maximum sequence length is" f" {model.max_seq_length} and context length is {model.config.block_size}" @@ -261,9 +251,7 @@ def fit( 
longest_seq_ix if iter_num == 1 else None, ) - is_accumulating = ( - iter_num % train.gradient_accumulation_iters(devices) != 0 - ) + is_accumulating = iter_num % train.gradient_accumulation_iters(devices) != 0 with fabric.no_backward_sync(model, enabled=is_accumulating): logits = model(input_ids, lm_head_chunk_size=128) # shift the targets such that output n predicts token n+1 @@ -295,9 +283,7 @@ def fit( if not is_accumulating and step_count % eval.interval == 0: t0 = time.perf_counter() - val_loss = validate( - fabric, model, val_data, tokenizer, eval, train - ) + val_loss = validate(fabric, model, val_data, tokenizer, eval, train) t1 = time.perf_counter() - t0 fabric.print( f"iter {iter_num}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f} ms" @@ -332,7 +318,9 @@ def validate( val_loss = losses.mean() # produce an example: - instruction = "Recommend a movie for me to watch during the weekend and explain the reason." + instruction = ( + "Recommend a movie for me to watch during the weekend and explain the reason." + ) fabric.print(instruction) sample = {"instruction": instruction, "input": ""} prompt = generate_prompt(sample) diff --git a/llm-lora-finetuning/finetune/adapter_v2.py b/llm-lora-finetuning/finetune/adapter_v2.py index ac7de327..d30ff127 100644 --- a/llm-lora-finetuning/finetune/adapter_v2.py +++ b/llm-lora-finetuning/finetune/adapter_v2.py @@ -76,9 +76,7 @@ def setup( plugins = None if quantize is not None and quantize.startswith("bnb."): if "mixed" in precision: - raise ValueError( - "Quantization and mixed precision is not supported." - ) + raise ValueError("Quantization and mixed precision is not supported.") dtype = { "16-true": torch.float16, "bf16-true": torch.bfloat16, @@ -140,9 +138,7 @@ def main( check_valid_checkpoint_dir(io.checkpoint_dir) - fabric.seed_everything( - 1337 - ) # same seed for every process to init model (FSDP) + fabric.seed_everything(1337) # same seed for every process to init model (FSDP) if fabric.global_rank == 0: os.makedirs(io.out_dir, exist_ok=True) @@ -151,9 +147,7 @@ def main( val_data = torch.load(io.val_data_dir / "test.pt") checkpoint_path = io.checkpoint_dir / "lit_model.pth" - fabric.print( - f"Loading model {str(checkpoint_path)!r} with {config.__dict__}" - ) + fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}") with fabric.init_module(empty_init=(devices > 1)): model = GPT(config) mark_only_adapter_v2_as_trainable(model) @@ -205,9 +199,7 @@ def main( ) fabric.print(f"Training time: {(time.perf_counter()-train_time):.2f}s") if fabric.device.type == "cuda": - fabric.print( - f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB" - ) + fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") # Save the final checkpoint at the end of training save_path = io.out_dir / "lit_model_adapter_finetuned.pth" @@ -228,9 +220,7 @@ def fit( ) -> None: tokenizer = Tokenizer(io.checkpoint_dir) longest_seq_length, longest_seq_ix = get_longest_seq_length(train_data) - model.max_seq_length = min( - longest_seq_length, train.max_seq_length or float("inf") - ) + model.max_seq_length = min(longest_seq_length, train.max_seq_length or float("inf")) fabric.print( f"The longest sequence length in the train data is {longest_seq_length}, the model's maximum sequence length is" f" {model.max_seq_length} and context length is {model.config.block_size}" @@ -261,9 +251,7 @@ def fit( longest_seq_ix if iter_num == 1 else None, ) - is_accumulating = ( - iter_num % 
train.gradient_accumulation_iters(devices) != 0 - ) + is_accumulating = iter_num % train.gradient_accumulation_iters(devices) != 0 with fabric.no_backward_sync(model, enabled=is_accumulating): logits = model(input_ids, lm_head_chunk_size=128) # shift the targets such that output n predicts token n+1 @@ -295,9 +283,7 @@ def fit( if not is_accumulating and step_count % eval.interval == 0: t0 = time.perf_counter() - val_loss = validate( - fabric, model, val_data, tokenizer, eval, train - ) + val_loss = validate(fabric, model, val_data, tokenizer, eval, train) t1 = time.perf_counter() - t0 fabric.print( f"iter {iter_num}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f} ms" @@ -332,7 +318,9 @@ def validate( val_loss = losses.mean() # produce an example: - instruction = "Recommend a movie for me to watch during the weekend and explain the reason." + instruction = ( + "Recommend a movie for me to watch during the weekend and explain the reason." + ) fabric.print(instruction) sample = {"instruction": instruction, "input": ""} prompt = generate_prompt(sample) diff --git a/llm-lora-finetuning/finetune/full.py b/llm-lora-finetuning/finetune/full.py index 02e28a72..bff8de53 100644 --- a/llm-lora-finetuning/finetune/full.py +++ b/llm-lora-finetuning/finetune/full.py @@ -105,9 +105,7 @@ def main( check_valid_checkpoint_dir(io.checkpoint_dir) - fabric.seed_everything( - 1337 - ) # same seed for every process to init model (FSDP) + fabric.seed_everything(1337) # same seed for every process to init model (FSDP) if fabric.global_rank == 0: os.makedirs(io.out_dir, exist_ok=True) @@ -116,9 +114,7 @@ def main( val_data = torch.load(io.val_data_dir / "test.pt") checkpoint_path = io.checkpoint_dir / "lit_model.pth" - fabric.print( - f"Loading model {str(checkpoint_path)!r} with {config.__dict__}" - ) + fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}") with fabric.init_module(empty_init=(devices > 1)): model = GPT(config) @@ -161,14 +157,10 @@ def main( fit(fabric, state, train_data, val_data, devices, resume, io, train, eval) fabric.print(f"Training time: {(time.perf_counter()-train_time):.2f}s") if fabric.device.type == "cuda": - fabric.print( - f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB" - ) + fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") # Save the final checkpoint at the end of training - fabric.save( - io.out_dir / "lit_model_finetuned.pth", {"model": state["model"]} - ) + fabric.save(io.out_dir / "lit_model_finetuned.pth", {"model": state["model"]}) def fit( @@ -187,9 +179,7 @@ def fit( scheduler = state["scheduler"] tokenizer = Tokenizer(io.checkpoint_dir) longest_seq_length, longest_seq_ix = get_longest_seq_length(train_data) - model.max_seq_length = min( - longest_seq_length, train.max_seq_length or float("inf") - ) + model.max_seq_length = min(longest_seq_length, train.max_seq_length or float("inf")) fabric.print( f"The longest sequence length in the train data is {longest_seq_length}, the model's maximum sequence length is" f" {model.max_seq_length} and context length is {model.config.block_size}" @@ -211,9 +201,7 @@ def fit( for resume_iter in range(initial_iter): get_batch(fabric, train_data, None) if resume_iter % 1000 == 0: - fabric.print( - f"Resuming dataset: {resume_iter} / {initial_iter}" - ) + fabric.print(f"Resuming dataset: {resume_iter} / {initial_iter}") fabric.barrier() fabric.print( f"Resuming data loader finished. 
Took {time.perf_counter() - resume_t0:.1f} seconds to reach iteration" @@ -226,9 +214,7 @@ def fit( ).to(fabric.device) fabric.barrier() - for state["iter_num"] in range( - state["iter_num"] + 1, train.max_iters(devices) + 1 - ): + for state["iter_num"] in range(state["iter_num"] + 1, train.max_iters(devices) + 1): iter_t0 = time.perf_counter() input_ids, targets = get_batch( @@ -285,9 +271,7 @@ def fit( if not is_accumulating and state["step_count"] % eval.interval == 0: t0 = time.perf_counter() - val_loss = validate( - fabric, model, val_data, tokenizer, eval, train - ) + val_loss = validate(fabric, model, val_data, tokenizer, eval, train) t1 = time.perf_counter() - t0 fabric.print( f"iter {state['iter_num']}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f} ms" @@ -295,13 +279,8 @@ def fit( metrics = {"val_loss": val_loss, "val_ppl": math.exp(val_loss)} fabric.log_dict(metrics, step=state["iter_num"]) fabric.barrier() - if ( - not is_accumulating - and state["step_count"] % train.save_interval == 0 - ): - checkpoint_path = ( - io.out_dir / f"step-{state['step_count']:06d}.pth" - ) + if not is_accumulating and state["step_count"] % train.save_interval == 0: + checkpoint_path = io.out_dir / f"step-{state['step_count']:06d}.pth" fabric.print(f"Saving checkpoint to {str(checkpoint_path)!r}") fabric.save(checkpoint_path, state) @@ -330,7 +309,9 @@ def validate( val_loss = losses.mean() # produce an example: - instruction = "Recommend a movie for me to watch during the weekend and explain the reason." + instruction = ( + "Recommend a movie for me to watch during the weekend and explain the reason." + ) fabric.print(instruction) sample = {"instruction": instruction, "input": ""} prompt = generate_prompt(sample) diff --git a/llm-lora-finetuning/finetune/lora.py b/llm-lora-finetuning/finetune/lora.py index 39caa06e..8e72a571 100644 --- a/llm-lora-finetuning/finetune/lora.py +++ b/llm-lora-finetuning/finetune/lora.py @@ -85,9 +85,7 @@ def setup( plugins = None if quantize is not None and quantize.startswith("bnb."): if "mixed" in precision: - raise ValueError( - "Quantization and mixed precision is not supported." 
- ) + raise ValueError("Quantization and mixed precision is not supported.") dtype = { "16-true": torch.float16, "bf16-true": torch.bfloat16, @@ -172,9 +170,7 @@ def main( check_valid_checkpoint_dir(io.checkpoint_dir) - fabric.seed_everything( - 1337 - ) # same seed for every process to init model (FSDP) + fabric.seed_everything(1337) # same seed for every process to init model (FSDP) if fabric.global_rank == 0: os.makedirs(io.out_dir, exist_ok=True) @@ -183,9 +179,7 @@ def main( val_data = torch.load(io.val_data_dir / "test.pt") checkpoint_path = io.checkpoint_dir / "lit_model.pth" - fabric.print( - f"Loading model {str(checkpoint_path)!r} with {config.__dict__}" - ) + fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}") with fabric.init_module(empty_init=(devices > 1)): model = GPT(config) mark_only_lora_as_trainable(model) @@ -237,9 +231,7 @@ def main( ) fabric.print(f"Training time: {(time.perf_counter()-train_time):.2f}s") if fabric.device.type == "cuda": - fabric.print( - f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB" - ) + fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") # Save the final LoRA checkpoint at the end of training save_path = io.out_dir / "lit_model_lora_finetuned.pth" @@ -260,9 +252,7 @@ def fit( ) -> None: tokenizer = Tokenizer(io.checkpoint_dir) longest_seq_length, longest_seq_ix = get_longest_seq_length(train_data) - model.max_seq_length = min( - longest_seq_length, train.max_seq_length or float("inf") - ) + model.max_seq_length = min(longest_seq_length, train.max_seq_length or float("inf")) fabric.print( f"The longest sequence length in the train data is {longest_seq_length}, the model's maximum sequence length is" f" {model.max_seq_length} and context length is {model.config.block_size}" @@ -293,9 +283,7 @@ def fit( longest_seq_ix if iter_num == 1 else None, ) - is_accumulating = ( - iter_num % train.gradient_accumulation_iters(devices) != 0 - ) + is_accumulating = iter_num % train.gradient_accumulation_iters(devices) != 0 with fabric.no_backward_sync(model, enabled=is_accumulating): logits = model(input_ids, lm_head_chunk_size=128) # shift the targets such that output n predicts token n+1 @@ -327,9 +315,7 @@ def fit( if not is_accumulating and step_count % eval.interval == 0: t0 = time.perf_counter() - val_loss = validate( - fabric, model, val_data, tokenizer, eval, train - ) + val_loss = validate(fabric, model, val_data, tokenizer, eval, train) t1 = time.perf_counter() - t0 fabric.print( f"iter {iter_num}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f} ms" @@ -364,7 +350,9 @@ def validate( val_loss = losses.mean() # produce an example: - instruction = "Recommend a movie for me to watch during the weekend and explain the reason." + instruction = ( + "Recommend a movie for me to watch during the weekend and explain the reason." 
+ ) fabric.print(instruction) sample = {"instruction": instruction, "input": ""} prompt = generate_prompt(sample) diff --git a/llm-lora-finetuning/generate/adapter.py b/llm-lora-finetuning/generate/adapter.py index 3daa8836..2d4857c4 100644 --- a/llm-lora-finetuning/generate/adapter.py +++ b/llm-lora-finetuning/generate/adapter.py @@ -29,12 +29,8 @@ def main( prompt: str = "What food do llamas eat?", input: str = "", - adapter_path: Path = Path( - "out/adapter/alpaca/lit_model_adapter_finetuned.pth" - ), - checkpoint_dir: Path = Path( - "checkpoints/stabilityai/stablelm-base-alpha-3b" - ), + adapter_path: Path = Path("out/adapter/alpaca/lit_model_adapter_finetuned.pth"), + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), quantize: Optional[ Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"] ] = None, @@ -68,9 +64,7 @@ def main( plugins = None if quantize is not None and quantize.startswith("bnb."): if "mixed" in precision: - raise ValueError( - "Quantization and mixed precision is not supported." - ) + raise ValueError("Quantization and mixed precision is not supported.") dtype = { "16-true": torch.float16, "bf16-true": torch.bfloat16, diff --git a/llm-lora-finetuning/generate/adapter_v2.py b/llm-lora-finetuning/generate/adapter_v2.py index 6f9d76d4..77780001 100644 --- a/llm-lora-finetuning/generate/adapter_v2.py +++ b/llm-lora-finetuning/generate/adapter_v2.py @@ -29,12 +29,8 @@ def main( prompt: str = "What food do llamas eat?", input: str = "", - adapter_path: Path = Path( - "out/adapter_v2/alpaca/lit_model_adapter_finetuned.pth" - ), - checkpoint_dir: Path = Path( - "checkpoints/stabilityai/stablelm-base-alpha-3b" - ), + adapter_path: Path = Path("out/adapter_v2/alpaca/lit_model_adapter_finetuned.pth"), + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), quantize: Optional[ Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"] ] = None, @@ -68,9 +64,7 @@ def main( plugins = None if quantize is not None and quantize.startswith("bnb."): if "mixed" in precision: - raise ValueError( - "Quantization and mixed precision is not supported." - ) + raise ValueError("Quantization and mixed precision is not supported.") dtype = { "16-true": torch.float16, "bf16-true": torch.bfloat16, diff --git a/llm-lora-finetuning/generate/base.py b/llm-lora-finetuning/generate/base.py index f8cfa7bd..75dce2aa 100644 --- a/llm-lora-finetuning/generate/base.py +++ b/llm-lora-finetuning/generate/base.py @@ -122,9 +122,7 @@ def main( max_new_tokens: int = 50, top_k: Optional[int] = 200, temperature: float = 0.8, - checkpoint_dir: Path = Path( - "checkpoints/stabilityai/stablelm-base-alpha-3b" - ), + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), quantize: Optional[ Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"] ] = None, @@ -153,9 +151,7 @@ def main( plugins = None if quantize is not None and quantize.startswith("bnb."): if "mixed" in precision: - raise ValueError( - "Quantization and mixed precision is not supported." 
- ) + raise ValueError("Quantization and mixed precision is not supported.") dtype = { "16-true": torch.float16, "bf16-true": torch.bfloat16, diff --git a/llm-lora-finetuning/generate/full.py b/llm-lora-finetuning/generate/full.py index cc1da495..6b2f9443 100644 --- a/llm-lora-finetuning/generate/full.py +++ b/llm-lora-finetuning/generate/full.py @@ -29,9 +29,7 @@ def main( prompt: str = "What food do llamas eat?", input: str = "", finetuned_path: Path = Path("out/full/alpaca/lit_model_finetuned.pth"), - checkpoint_dir: Path = Path( - "checkpoints/stabilityai/stablelm-base-alpha-3b" - ), + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), quantize: Optional[ Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"] ] = None, @@ -65,9 +63,7 @@ def main( plugins = None if quantize is not None and quantize.startswith("bnb."): if "mixed" in precision: - raise ValueError( - "Quantization and mixed precision is not supported." - ) + raise ValueError("Quantization and mixed precision is not supported.") dtype = { "16-true": torch.float16, "bf16-true": torch.bfloat16, diff --git a/llm-lora-finetuning/generate/lora.py b/llm-lora-finetuning/generate/lora.py index 0b30b701..1f2e5bf2 100644 --- a/llm-lora-finetuning/generate/lora.py +++ b/llm-lora-finetuning/generate/lora.py @@ -30,9 +30,7 @@ def main( prompt: str = "What food do llamas eat?", input: str = "", lora_path: Path = Path("out/lora/alpaca/lit_model_lora_finetuned.pth"), - checkpoint_dir: Path = Path( - "checkpoints/stabilityai/stablelm-base-alpha-3b" - ), + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), quantize: Optional[ Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"] ] = None, @@ -75,9 +73,7 @@ def main( plugins = None if quantize is not None and quantize.startswith("bnb."): if "mixed" in precision: - raise ValueError( - "Quantization and mixed precision is not supported." - ) + raise ValueError("Quantization and mixed precision is not supported.") dtype = { "16-true": torch.float16, "bf16-true": torch.bfloat16, diff --git a/llm-lora-finetuning/generate/sequentially.py b/llm-lora-finetuning/generate/sequentially.py index d2dde4bb..d2489602 100644 --- a/llm-lora-finetuning/generate/sequentially.py +++ b/llm-lora-finetuning/generate/sequentially.py @@ -32,9 +32,7 @@ @torch.inference_mode() -def sequential( - model: GPT, root: torch.device, max_seq_length: int, devices: int -): +def sequential(model: GPT, root: torch.device, max_seq_length: int, devices: int): if model.config.n_layer % devices: # TODO: support smarter partitioning schemes raise NotImplementedError( @@ -42,9 +40,7 @@ def sequential( ) layers_per_rank = model.config.n_layer // devices # dictates where each block should be instantiated - mapping = layer_to_device( - model, chunk_on=Block, chunk_size=layers_per_rank - ) + mapping = layer_to_device(model, chunk_on=Block, chunk_size=layers_per_rank) # materialize each block on the appropriate device for path, target_index in mapping.items(): @@ -52,9 +48,7 @@ def sequential( target_device = torch.device(root.type, target_index) print(f"Moving {path!r} to {target_device}", file=sys.stderr) # submodules loaded by the checkpoint will be on CPU (if no quantization). 
move them - replace_device( - submodule, replace=torch.device("cpu"), by=target_device - ) + replace_device(submodule, replace=torch.device("cpu"), by=target_device) # in case the checkpoint was partial, materialize leftover metas _materialize_meta_tensors(submodule, target_device) # and build the kv cache @@ -86,9 +80,7 @@ def sequential( partial(move_block_input, target_device) ) if layer_num == model.config.n_layer - 1: - submodule.register_forward_hook( - partial(move_block_output, root) - ) + submodule.register_forward_hook(partial(move_block_output, root)) return model @@ -134,9 +126,7 @@ def replace_device( devices = {t.device for t in tensors.values()} if len(devices) != 1: # since this is using `submodule.to`, different devices in the same submodule is a problem - path_to_device = { - f"{name}.{p}": t.device for p, t in tensors.items() - } + path_to_device = {f"{name}.{p}": t.device for p, t in tensors.items()} raise ValueError(f"Found multiple devices: {path_to_device}") if devices.pop() == replace: submodule.to(by) @@ -151,9 +141,7 @@ def main( max_new_tokens: int = 50, top_k: Optional[int] = 200, temperature: float = 0.8, - checkpoint_dir: Path = Path( - "checkpoints/mistralai/Mistral-7B-Instruct-v0.1" - ), + checkpoint_dir: Path = Path("checkpoints/mistralai/Mistral-7B-Instruct-v0.1"), quantize: Optional[ Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq"] ] = None, @@ -183,9 +171,7 @@ def main( if compile: raise NotImplementedError # untested if "mixed" in precision: - raise ValueError( - "Quantization and mixed precision is not supported." - ) + raise ValueError("Quantization and mixed precision is not supported.") dtype = { "16-true": torch.float16, "bf16-true": torch.bfloat16, @@ -228,9 +214,7 @@ def main( ) t0 = time.perf_counter() - state_dict = torch.load( - str(checkpoint_path), mmap=True, map_location="cpu" - ) + state_dict = torch.load(str(checkpoint_path), mmap=True, map_location="cpu") # TODO: this assumes that the model fits on CPU. Use lazy_load and make the materialization checkpoint aware model.load_state_dict(state_dict, assign=True) print( @@ -241,9 +225,7 @@ def main( model = fabric.setup_module(model, move_to_device=False) t0 = time.perf_counter() - model = sequential( - model, fabric.device, max_returned_tokens, total_devices - ) + model = sequential(model, fabric.device, max_returned_tokens, total_devices) print( f"Time to sequential-ize the model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr, @@ -294,8 +276,8 @@ def main( if __name__ == "__main__": torch.set_float32_matmul_precision("high") - logging.getLogger( - "lightning.fabric.plugins.precision.bitsandbytes" - ).setLevel(logging.DEBUG) + logging.getLogger("lightning.fabric.plugins.precision.bitsandbytes").setLevel( + logging.DEBUG + ) CLI(main) diff --git a/llm-lora-finetuning/generate/tp.py b/llm-lora-finetuning/generate/tp.py index e8c7e1ef..a4f75661 100644 --- a/llm-lora-finetuning/generate/tp.py +++ b/llm-lora-finetuning/generate/tp.py @@ -43,9 +43,7 @@ def tensor_parallel_linear( f"This linear's {attr} value ({size}) is not evenly divisible by the world size ({world_size})" ) - shard = torch.tensor_split(linear.weight, world_size, dim=dim)[ - fabric.global_rank - ] + shard = torch.tensor_split(linear.weight, world_size, dim=dim)[fabric.global_rank] # overwrite `.data` instead of recreating the parameter for quantization (bitsandbytes) support. 
# the bitsandbytes linear classes use custom `torch.nn.Parameter` subclasses linear.weight.data = shard @@ -53,9 +51,7 @@ def tensor_parallel_linear( if linear.bias is not None and dim == 0: shard = torch.tensor_split(linear.bias, world_size)[fabric.global_rank] - linear.bias = torch.nn.Parameter( - shard, requires_grad=linear.bias.requires_grad - ) + linear.bias = torch.nn.Parameter(shard, requires_grad=linear.bias.requires_grad) def tensor_parallel_mlp( @@ -65,15 +61,11 @@ def tensor_parallel_mlp( tensor_parallel_linear(fabric, mlp.fc_1, "colwise") tensor_parallel_linear(fabric, mlp.fc_2, "colwise") tensor_parallel_linear(fabric, mlp.proj, "rowwise") - mlp.register_forward_hook( - partial(all_reduce_output, fabric.world_size) - ) + mlp.register_forward_hook(partial(all_reduce_output, fabric.world_size)) elif isinstance(mlp, GptNeoxMLP): tensor_parallel_linear(fabric, mlp.fc, "colwise") tensor_parallel_linear(fabric, mlp.proj, "rowwise") - mlp.register_forward_hook( - partial(all_reduce_output, fabric.world_size) - ) + mlp.register_forward_hook(partial(all_reduce_output, fabric.world_size)) elif isinstance(mlp, LLaMAMoE): # we use expert slicing across ranks, alternatively, we could create a expert parallelism group # when the number of experts is a multiple of the world size @@ -123,9 +115,7 @@ def main( max_new_tokens: int = 50, top_k: Optional[int] = 200, temperature: float = 0.8, - checkpoint_dir: Path = Path( - "checkpoints/stabilityai/stablelm-base-alpha-3b" - ), + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), quantize: Optional[ Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq"] ] = None, @@ -155,9 +145,7 @@ def main( if compile: raise NotImplementedError # untested if "mixed" in precision: - raise ValueError( - "Quantization and mixed precision is not supported." 
- ) + raise ValueError("Quantization and mixed precision is not supported.") dtype = { "16-true": torch.float16, "bf16-true": torch.bfloat16, @@ -204,9 +192,7 @@ def main( for rank in range(fabric.world_size): if fabric.global_rank == rank: t0 = time.perf_counter() - state_dict = torch.load( - str(checkpoint_path), mmap=True, map_location="cpu" - ) + state_dict = torch.load(str(checkpoint_path), mmap=True, map_location="cpu") model.load_state_dict(state_dict, assign=True) print( f"[{rank}] Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", @@ -278,9 +264,7 @@ def main( if __name__ == "__main__": torch.set_float32_matmul_precision("high") - bnb_logger = logging.getLogger( - "lightning.fabric.plugins.precision.bitsandbytes" - ) + bnb_logger = logging.getLogger("lightning.fabric.plugins.precision.bitsandbytes") bnb_logger.setLevel(logging.DEBUG) bnb_logger.debug = rank_zero_only(bnb_logger.debug) diff --git a/llm-lora-finetuning/lit_gpt/adapter.py b/llm-lora-finetuning/lit_gpt/adapter.py index 61744419..4ad6fc0c 100644 --- a/llm-lora-finetuning/lit_gpt/adapter.py +++ b/llm-lora-finetuning/lit_gpt/adapter.py @@ -43,9 +43,7 @@ def __init__(self, config: Config) -> None: self.transformer = nn.ModuleDict( dict( wte=nn.Embedding(config.padded_vocab_size, config.n_embd), - h=nn.ModuleList( - Block(config, i) for i in range(config.n_layer) - ), + h=nn.ModuleList(Block(config, i) for i in range(config.n_layer)), ln_f=config.norm_class(config.n_embd, eps=config.norm_eps), ) ) @@ -75,17 +73,13 @@ def forward( sin = self.sin[:T] mask = None - x = self.transformer.wte( - idx - ) # token embeddings of shape (b, t, n_embd) + x = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd) for block in self.transformer.h: x = block(x, cos, sin, mask, input_pos) x = self.transformer.ln_f(x) if lm_head_chunk_size > 0: # chunk the lm head logits to reduce the peak memory used by autograd - return [ - self.lm_head(x_i) for x_i in x.split(lm_head_chunk_size, dim=1) - ] + return [self.lm_head(x_i) for x_i in x.split(lm_head_chunk_size, dim=1)] return self.lm_head(x) # (b, t, vocab_size) @classmethod @@ -123,17 +117,11 @@ def __init__(self, config: Config, block_idx: int) -> None: super().__init__(config) if block_idx >= config.adapter_start_layer: # adapter embedding layer - self.adapter_wte = nn.Embedding( - config.adapter_prompt_length, config.n_embd - ) + self.adapter_wte = nn.Embedding(config.adapter_prompt_length, config.n_embd) # gate for adaption - self.gating_factor = torch.nn.Parameter( - torch.zeros(1, 1, config.n_head, 1) - ) + self.gating_factor = torch.nn.Parameter(torch.zeros(1, 1, config.n_head, 1)) # kv cache for inference - self.adapter_kv_cache: Optional[ - Tuple[torch.Tensor, torch.Tensor] - ] = None + self.adapter_kv_cache: Optional[Tuple[torch.Tensor, torch.Tensor]] = None self.block_idx = block_idx def scaled_dot_product_attention( @@ -169,12 +157,8 @@ def scaled_dot_product_attention( # for MHA this is a no-op ak = ak.repeat_interleave(q_per_kv, dim=2) av = av.repeat_interleave(q_per_kv, dim=2) - ak = ak.view( - 1, -1, aT, self.config.head_size - ) # (1, nh_ak, aT, hs) - av = av.view( - 1, -1, aT, self.config.head_size - ) # (1, nh_av, aT, hs) + ak = ak.view(1, -1, aT, self.config.head_size) # (1, nh_ak, aT, hs) + av = av.view(1, -1, aT, self.config.head_size) # (1, nh_av, aT, hs) self.adapter_kv_cache = (ak, av) T = q.size(2) @@ -189,9 +173,9 @@ def _load_from_state_dict( self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any ) -> None: """For 
compatibility with older checkpoints.""" - if (key := prefix + "gating_factor") in state_dict and state_dict[ - key - ].size(1) == self.config.n_head: + if (key := prefix + "gating_factor") in state_dict and state_dict[key].size( + 1 + ) == self.config.n_head: state_dict[key] = state_dict[key].permute(0, 2, 1, 3) super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) diff --git a/llm-lora-finetuning/lit_gpt/adapter_v2.py b/llm-lora-finetuning/lit_gpt/adapter_v2.py index 5d389471..206d9395 100644 --- a/llm-lora-finetuning/lit_gpt/adapter_v2.py +++ b/llm-lora-finetuning/lit_gpt/adapter_v2.py @@ -79,9 +79,7 @@ def __init__(self, config: Config) -> None: self.transformer = nn.ModuleDict( dict( wte=nn.Embedding(config.padded_vocab_size, config.n_embd), - h=nn.ModuleList( - Block(config, i) for i in range(config.n_layer) - ), + h=nn.ModuleList(Block(config, i) for i in range(config.n_layer)), ln_f=config.norm_class(config.n_embd, eps=config.norm_eps), ) ) @@ -147,17 +145,11 @@ def __init__(self, config: Config, block_idx: int) -> None: if block_idx >= config.adapter_start_layer: # adapter embedding layer - self.adapter_wte = nn.Embedding( - config.adapter_prompt_length, config.n_embd - ) + self.adapter_wte = nn.Embedding(config.adapter_prompt_length, config.n_embd) # gate for adaption - self.gating_factor = torch.nn.Parameter( - torch.zeros(1, 1, config.n_head, 1) - ) + self.gating_factor = torch.nn.Parameter(torch.zeros(1, 1, config.n_head, 1)) # kv cache for inference - self.adapter_kv_cache: Optional[ - Tuple[torch.Tensor, torch.Tensor] - ] = None + self.adapter_kv_cache: Optional[Tuple[torch.Tensor, torch.Tensor]] = None self.block_idx = block_idx self.config = config @@ -174,9 +166,9 @@ def _load_from_state_dict( } state_dict = map_old_state_dict_weights(state_dict, mapping, prefix) # For compatibility with older checkpoints - if (key := prefix + "gating_factor") in state_dict and state_dict[ - key - ].size(1) == self.config.n_head: + if (key := prefix + "gating_factor") in state_dict and state_dict[key].size( + 1 + ) == self.config.n_head: state_dict[key] = state_dict[key].permute(0, 2, 1, 3) super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) @@ -248,9 +240,7 @@ class LLaMAMoE(lit_gpt.model.LLaMAMoE): def __init__(self, config: Config) -> None: nn.Module.__init__(self) self.gate = AdapterV2Linear(config.n_embd, config.n_expert, bias=False) - self.experts = nn.ModuleList( - LLaMAMLP(config) for _ in range(config.n_expert) - ) + self.experts = nn.ModuleList(LLaMAMLP(config) for _ in range(config.n_expert)) self.config = config diff --git a/llm-lora-finetuning/lit_gpt/args.py b/llm-lora-finetuning/lit_gpt/args.py index 264c8f51..62217076 100644 --- a/llm-lora-finetuning/lit_gpt/args.py +++ b/llm-lora-finetuning/lit_gpt/args.py @@ -37,17 +37,13 @@ class TrainArgs: def max_iters(self, devices: int) -> int: """Number of iterations""" - max_iters = ( - self.epochs * self.epoch_size // devices // self.micro_batch_size - ) + max_iters = self.epochs * self.epoch_size // devices // self.micro_batch_size assert max_iters > 0 return max_iters def gradient_accumulation_iters(self, devices: int) -> int: """Number of iterations between gradient synchronizations""" - gradient_accumulation_iters = ( - self.batch_size(devices) // self.micro_batch_size - ) + gradient_accumulation_iters = self.batch_size(devices) // self.micro_batch_size assert gradient_accumulation_iters > 0 return gradient_accumulation_iters diff --git a/llm-lora-finetuning/lit_gpt/config.py 
b/llm-lora-finetuning/lit_gpt/config.py index dab1523b..bca740d5 100644 --- a/llm-lora-finetuning/lit_gpt/config.py +++ b/llm-lora-finetuning/lit_gpt/config.py @@ -54,9 +54,7 @@ class Config: shared_attention_norm: bool = False _norm_class: Literal["LayerNorm", "RMSNorm"] = "LayerNorm" norm_eps: float = 1e-5 - _mlp_class: Literal[ - "GptNeoxMLP", "LLaMAMLP", "GemmaMLP", "LLaMAMoE" - ] = "GptNeoxMLP" + _mlp_class: Literal["GptNeoxMLP", "LLaMAMLP", "GemmaMLP", "LLaMAMoE"] = "GptNeoxMLP" gelu_approximate: str = "none" intermediate_size: Optional[int] = None rope_condense_ratio: int = 1 @@ -90,9 +88,7 @@ def __post_init__(self): # compute the intermediate size for MLP if not set if self.intermediate_size is None: if self._mlp_class == "LLaMAMLP": - raise ValueError( - "The config needs to set the `intermediate_size`" - ) + raise ValueError("The config needs to set the `intermediate_size`") self.intermediate_size = 4 * self.n_embd self.rope_n_elem = int(self.rotary_percentage * self.head_size) @@ -103,9 +99,7 @@ def from_name(cls, name: str, **kwargs: Any) -> Self: # search through all `config['hf_config']['name']` try: conf_dict = next( - config - for config in configs - if name == config["hf_config"]["name"] + config for config in configs if name == config["hf_config"]["name"] ) except StopIteration: raise ValueError(f"{name!r} is not a supported config name") @@ -123,9 +117,7 @@ def from_json(cls, path: Union[str, Path], **kwargs: Any) -> Self: with open(path, encoding="utf-8") as fp: json_kwargs = json.load(fp) if "condense_ratio" in json_kwargs: # legacy name - json_kwargs["rope_condense_ratio"] = json_kwargs.pop( - "condense_ratio" - ) + json_kwargs["rope_condense_ratio"] = json_kwargs.pop("condense_ratio") if "condense_ratio" in kwargs: # legacy name kwargs["rope_condense_ratio"] = kwargs.pop("condense_ratio") if "org" in json_kwargs: # legacy name @@ -368,9 +360,7 @@ def norm_class(self) -> Type: # https://huggingface.co/togethercomputer/RedPajama-INCITE-Base-3B-v1/blob/main/config.json dict( name="RedPajama-INCITE-{}-3B-v1", - hf_config=dict( - org="togethercomputer", name="RedPajama-INCITE-{}-3B-v1" - ), + hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-{}-3B-v1"), block_size=2048, n_layer=32, n_embd=2560, @@ -391,9 +381,7 @@ def norm_class(self) -> Type: # this redirects to the checkpoint above. 
kept for those who had the old weights already downloaded dict( name="RedPajama-INCITE-{}-7B-v0.1", - hf_config=dict( - org="togethercomputer", name="RedPajama-INCITE-{}-7B-v0.1" - ), + hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-{}-7B-v0.1"), block_size=2048, n_layer=32, padding_multiple=256, @@ -1266,9 +1254,7 @@ def norm_class(self) -> Type: # https://huggingface.co/stabilityai/stablecode-completion-alpha-3b/blob/main/config.json dict( name="stablecode-completion-alpha-3b", - hf_config=dict( - org="stabilityai", name="stablecode-completion-alpha-3b" - ), + hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b"), block_size=16384, vocab_size=49152, n_layer=32, @@ -1277,9 +1263,7 @@ def norm_class(self) -> Type: # https://huggingface.co/stabilityai/stablecode-completion-alpha-3b-4k/blob/main/config.json dict( name="stablecode-completion-alpha-3b-4k", - hf_config=dict( - org="stabilityai", name="stablecode-completion-alpha-3b-4k" - ), + hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b-4k"), vocab_size=49152, n_layer=32, n_embd=2560, @@ -1462,9 +1446,7 @@ def norm_class(self) -> Type: # https://huggingface.co/Trelis/Llama-2-7b-chat-hf-function-calling-v2/blob/main/config.json dict( name="Llama-2-7b-chat-hf-function-calling-v2", - hf_config=dict( - org="Trelis", name="Llama-2-7b-chat-hf-function-calling-v2" - ), + hf_config=dict(org="Trelis", name="Llama-2-7b-chat-hf-function-calling-v2"), padding_multiple=64, n_layer=32, rotary_percentage=1.0, diff --git a/llm-lora-finetuning/lit_gpt/lora.py b/llm-lora-finetuning/lit_gpt/lora.py index 84d42543..0df9ae3f 100644 --- a/llm-lora-finetuning/lit_gpt/lora.py +++ b/llm-lora-finetuning/lit_gpt/lora.py @@ -246,9 +246,7 @@ def __init__( torch.zeros((r * sum(enable_lora), in_features)) ) # (4, 128) enable_q, enable_k, enable_v = enable_lora - self.kv_embd_size = self.linear.in_features // ( - n_head // n_query_groups - ) + self.kv_embd_size = self.linear.in_features // (n_head // n_query_groups) # qkv_shapes will be used to split a tensor with weights correctly qkv_shapes = ( self.linear.in_features * enable_q, @@ -284,24 +282,16 @@ def __init__( ind = range(out_features) self.lora_ind = [] if enable_q: - q_ind = [ - x - for x in ind - if (x // head_size) % total_qkv < total_qkv - 2 - ] + q_ind = [x for x in ind if (x // head_size) % total_qkv < total_qkv - 2] self.lora_ind.extend(q_ind) if enable_k: k_ind = [ - x - for x in ind - if (x // head_size) % total_qkv == total_qkv - 2 + x for x in ind if (x // head_size) % total_qkv == total_qkv - 2 ] self.lora_ind.extend(k_ind) if enable_v: v_ind = [ - x - for x in ind - if (x // head_size) % total_qkv == total_qkv - 1 + x for x in ind if (x // head_size) % total_qkv == total_qkv - 1 ] self.lora_ind.extend(v_ind) self.reset_parameters() @@ -362,24 +352,18 @@ def zero_pad(self, x: torch.Tensor) -> torch.Tensor: # Note: double transpose (in the beginning and in the end) is basically a guard for two-dimensional tensors # for example when we want to merge/unmerge LoRA weights and pretrained weights x = x.transpose(0, 1) - result = x.new_zeros( - (*x.shape[:-1], self.linear.out_features) - ) # (64, 64, 384) + result = x.new_zeros((*x.shape[:-1], self.linear.out_features)) # (64, 64, 384) result = result.view(-1, self.linear.out_features) # (4096, 384) result = result.index_copy( 1, torch.tensor(self.lora_ind, device=result.device), x.reshape(-1, sum(self.qkv_shapes)), ) # (4096, 256) - return result.view( - (*x.shape[:-1], self.linear.out_features) - 
).transpose( + return result.view((*x.shape[:-1], self.linear.out_features)).transpose( 0, 1 ) # (64, 64, 384) - def conv1d( - self, input: torch.Tensor, weight: torch.Tensor - ) -> torch.Tensor: + def conv1d(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: """An extension of the `torch.nn.functional.conv1d` function with a logic specific to grouped queries. If the number of heads is equal to the number of query groups - grouped queries are disabled @@ -410,12 +394,8 @@ def conv1d( # ⚬ C_output': embeddings size for each LoRA layer (not equal in size) # ⚬ r: rank of all LoRA layers (equal in size) - input_splitted = input.chunk( - sum(self.enable_lora), dim=1 - ) # N * (B, C // N, T) - weight_splitted = weight.split( - self.qkv_shapes - ) # N * (C_output', r, 1) + input_splitted = input.chunk(sum(self.enable_lora), dim=1) # N * (B, C // N, T) + weight_splitted = weight.split(self.qkv_shapes) # N * (C_output', r, 1) return torch.cat( [F.conv1d(a, b) for a, b in zip(input_splitted, weight_splitted)], dim=1, # (B, C_output', T) @@ -430,9 +410,7 @@ def get_lora_AB(self) -> torch.Tensor: lora = self.conv1d( self.lora_A.data.unsqueeze(0), # (4, 128) -> (1, 4, 128) self.lora_B.data.unsqueeze(-1), # (256, 2) -> (256, 2, 1) - ).squeeze( - 0 - ) # (1, 4, 128) @ (256, 2, 1) -> (1, 256, 128) -> (256, 128) + ).squeeze(0) # (1, 4, 128) @ (256, 2, 1) -> (1, 256, 128) -> (256, 128) return self.zero_pad( lora * self.scaling ) # (256, 128) after zero_pad (384, 128) @@ -511,11 +489,7 @@ def mark_only_lora_as_trainable(model: nn.Module, bias: str = "none") -> None: p.requires_grad = True elif bias == "lora_only": for m in model.modules(): - if ( - isinstance(m, LoRALayer) - and hasattr(m, "bias") - and m.bias is not None - ): + if isinstance(m, LoRALayer) and hasattr(m, "bias") and m.bias is not None: m.bias.requires_grad = True else: raise NotImplementedError @@ -600,17 +574,13 @@ def forward( sin = self.sin[:T] mask = None - x = self.transformer.wte( - idx - ) # token embeddings of shape (b, t, n_embd) + x = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd) for block in self.transformer.h: x = block(x, cos, sin, mask, input_pos) x = self.transformer.ln_f(x) if lm_head_chunk_size > 0: # chunk the lm head logits to reduce the peak memory used by autograd - return [ - self.lm_head(x_i) for x_i in x.split(lm_head_chunk_size, dim=1) - ] + return [self.lm_head(x_i) for x_i in x.split(lm_head_chunk_size, dim=1)] return self.lm_head(x) # (B, T, vocab_size) @classmethod @@ -794,9 +764,7 @@ def __init__(self, config: Config) -> None: lora_alpha=config.alpha, lora_dropout=config.dropout, ) - self.experts = nn.ModuleList( - LLaMAMLP(config) for _ in range(config.n_expert) - ) + self.experts = nn.ModuleList(LLaMAMLP(config) for _ in range(config.n_expert)) self.config = config diff --git a/llm-lora-finetuning/lit_gpt/model.py b/llm-lora-finetuning/lit_gpt/model.py index 1ff378fd..6413634b 100644 --- a/llm-lora-finetuning/lit_gpt/model.py +++ b/llm-lora-finetuning/lit_gpt/model.py @@ -94,9 +94,7 @@ def forward( sin = self.sin[:T] mask = None - x = self.transformer.wte( - idx - ) # token embeddings of shape (b, t, n_embd) + x = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd) if self.config.scale_embeddings: x = x * (self.config.n_embd**0.5) @@ -137,10 +135,7 @@ def set_kv_cache( batch_size, max_seq_length, rope_cache_length, device, dtype ) - if ( - self.mask_cache is None - or self.mask_cache.size(3) != max_seq_length - ): + if self.mask_cache is None or 
self.mask_cache.size(3) != max_seq_length: # passing `attn_mask` to SDPA disables the flash implementation. since we only need the mask # for the kv-cache support (only during inference), we only create it in that situation self.mask_cache = build_mask_cache(max_seq_length, device) @@ -217,23 +212,17 @@ def forward( B, T, C, - ) = ( - x.size() - ) # batch size, sequence length, embedding dimensionality (n_embd) + ) = x.size() # batch size, sequence length, embedding dimensionality (n_embd) qkv = self.attn(x) # assemble into a number of query groups to support MHA, MQA and GQA together (see `config.n_query_groups`) q_per_kv = self.config.n_head // self.config.n_query_groups - total_qkv = ( - q_per_kv + 2 - ) # each group has 1+ queries, 1 key, and 1 value + total_qkv = q_per_kv + 2 # each group has 1+ queries, 1 key, and 1 value qkv = qkv.view( B, T, self.config.n_query_groups, total_qkv, self.config.head_size ) - qkv = qkv.permute( - 0, 2, 3, 1, 4 - ) # (B, n_query_groups, total_qkv, T, hs) + qkv = qkv.permute(0, 2, 3, 1, 4) # (B, n_query_groups, total_qkv, T, hs) # split batched computation into three q, k, v = qkv.split((q_per_kv, 1, 1), dim=2) @@ -322,9 +311,7 @@ def build_kv_cache( batch_size, heads, max_seq_length, - rope_cache_length - + self.config.head_size - - self.config.rope_n_elem, + rope_cache_length + self.config.head_size - self.config.rope_n_elem, ) return KVCache(k_shape, v_shape, device=device, dtype=dtype) @@ -332,35 +319,23 @@ def build_kv_cache( class GptNeoxMLP(nn.Module): def __init__(self, config: Config) -> None: super().__init__() - self.fc = nn.Linear( - config.n_embd, config.intermediate_size, bias=config.bias - ) - self.proj = nn.Linear( - config.intermediate_size, config.n_embd, bias=config.bias - ) + self.fc = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias) + self.proj = nn.Linear(config.intermediate_size, config.n_embd, bias=config.bias) self.config = config def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.fc(x) - x = torch.nn.functional.gelu( - x, approximate=self.config.gelu_approximate - ) + x = torch.nn.functional.gelu(x, approximate=self.config.gelu_approximate) return self.proj(x) class LLaMAMLP(nn.Module): def __init__(self, config: Config) -> None: super().__init__() - self.fc_1 = nn.Linear( - config.n_embd, config.intermediate_size, bias=config.bias - ) - self.fc_2 = nn.Linear( - config.n_embd, config.intermediate_size, bias=config.bias - ) - self.proj = nn.Linear( - config.intermediate_size, config.n_embd, bias=config.bias - ) + self.fc_1 = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias) + self.fc_2 = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias) + self.proj = nn.Linear(config.intermediate_size, config.n_embd, bias=config.bias) def forward(self, x: torch.Tensor) -> torch.Tensor: x_fc_1 = self.fc_1(x) @@ -381,9 +356,7 @@ class LLaMAMoE(nn.Module): def __init__(self, config: Config) -> None: super().__init__() self.gate = nn.Linear(config.n_embd, config.n_expert, bias=False) - self.experts = nn.ModuleList( - LLaMAMLP(config) for _ in range(config.n_expert) - ) + self.experts = nn.ModuleList(LLaMAMLP(config) for _ in range(config.n_expert)) self.config = config @@ -396,9 +369,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: B, T, C, - ) = ( - x.size() - ) # batch size, sequence length, embedding dimensionality (n_embd) + ) = x.size() # batch size, sequence length, embedding dimensionality (n_embd) x = x.view(-1, C) # (B*T, C) router = self.gate(x) # (B*T, n_expert) 
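# A minimal, runnable sketch of the top-k routing step that follows. It is standalone and
# illustrative only; the expert count (4) and k (2) are made-up numbers, used just to show
# what `torch.topk` over the router logits produces per token.
import torch

router = torch.randn(6, 4)                       # (B*T = 6 tokens, n_expert = 4) router logits
probs, indices = torch.topk(router, 2, dim=-1)   # keep the 2 highest-scoring experts per token
probs = probs.softmax(dim=-1)                    # normalize the kept scores into mixture weights
# indices[t] lists the experts token t is dispatched to; probs[t] are the weights applied
# when the selected experts' outputs are summed back into y.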
probs, indices = torch.topk( @@ -412,9 +383,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: y = torch.zeros_like(x) # (B*T, C) for mask, expert in zip(masks, self.experts): token_idx, expert_idx = torch.where(mask) - y[token_idx] += probs[token_idx, expert_idx, None] * expert( - x[token_idx] - ) + y[token_idx] += probs[token_idx, expert_idx, None] * expert(x[token_idx]) return y.view(B, T, C) @@ -432,9 +401,7 @@ def build_rope_cache( https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license. """ # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$ - theta = 1.0 / ( - base ** (torch.arange(0, n_elem, 2, device=device).float() / n_elem) - ) + theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, device=device).float() / n_elem)) # Create position indexes `[0, 1, ..., seq_len - 1]` seq_idx = torch.arange(seq_len, device=device) / condense_ratio @@ -445,9 +412,7 @@ def build_rope_cache( return torch.cos(idx_theta), torch.sin(idx_theta) -def apply_rope( - x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor -) -> torch.Tensor: +def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor: head_size = x.size(-1) x1 = x[..., : head_size // 2] # (B, nh, T, hs/2) x2 = x[..., head_size // 2 :] # (B, nh, T, hs/2) @@ -495,7 +460,5 @@ def reset_parameters(self) -> None: def build_mask_cache( max_seq_length: int, device: Optional[torch.device] = None ) -> torch.Tensor: - ones = torch.ones( - (max_seq_length, max_seq_length), device=device, dtype=torch.bool - ) + ones = torch.ones((max_seq_length, max_seq_length), device=device, dtype=torch.bool) return torch.tril(ones).unsqueeze(0).unsqueeze(0) diff --git a/llm-lora-finetuning/lit_gpt/packed_dataset.py b/llm-lora-finetuning/lit_gpt/packed_dataset.py index a183d4c2..afe9c126 100644 --- a/llm-lora-finetuning/lit_gpt/packed_dataset.py +++ b/llm-lora-finetuning/lit_gpt/packed_dataset.py @@ -256,9 +256,7 @@ def __init__(self, datasets, seed, weights=None): self._weights = [w / sum(weights) for w in weights] def __iter__(self): - return CombinedDatasetIterator( - self._datasets, self._seed, self._weights - ) + return CombinedDatasetIterator(self._datasets, self._seed, self._weights) class CombinedDatasetIterator: @@ -268,7 +266,5 @@ def __init__(self, datasets, seed, weights): self._rng = random.Random(seed) def __next__(self): - (dataset,) = self._rng.choices( - self._datasets, weights=self._weights, k=1 - ) + (dataset,) = self._rng.choices(self._datasets, weights=self._weights, k=1) return next(dataset) diff --git a/llm-lora-finetuning/lit_gpt/tokenizer.py b/llm-lora-finetuning/lit_gpt/tokenizer.py index f2832ce6..43331c5a 100644 --- a/llm-lora-finetuning/lit_gpt/tokenizer.py +++ b/llm-lora-finetuning/lit_gpt/tokenizer.py @@ -23,9 +23,7 @@ def __init__(self, checkpoint_dir: Union[Path, str]) -> None: if (vocabulary_path := checkpoint_dir / "tokenizer.model").is_file(): from sentencepiece import SentencePieceProcessor - self.processor = SentencePieceProcessor( - model_file=str(vocabulary_path) - ) + self.processor = SentencePieceProcessor(model_file=str(vocabulary_path)) self.backend = "sentencepiece" self.bos_id = self.processor.bos_id() self.eos_id = self.processor.eos_id() @@ -43,19 +41,14 @@ def __init__(self, checkpoint_dir: Union[Path, str]) -> None: config = json.load(fp) bos_token = config.get("bos_token") self.bos_id = ( - self.token_to_id(bos_token) - if bos_token is not None - else None + self.token_to_id(bos_token) if bos_token is not None else None ) 
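# A standalone illustration of the lookup pattern used here: a special-token id is resolved
# only when the token is actually declared in tokenizer_config.json. The sample config and
# vocabulary below are invented for the example.
import json

config = json.loads('{"bos_token": "<s>", "eos_token": "</s>"}')
vocab = {"<s>": 1, "</s>": 2}
token_to_id = vocab.get  # stand-in for Tokenizer.token_to_id
bos_token = config.get("bos_token")
bos_id = token_to_id(bos_token) if bos_token is not None else None  # 1 here, None if absent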
eos_token = config.get("eos_token") self.eos_id = ( - self.token_to_id(eos_token) - if eos_token is not None - else None + self.token_to_id(eos_token) if eos_token is not None else None ) if ( - special_tokens_path := checkpoint_dir - / "generation_config.json" + special_tokens_path := checkpoint_dir / "generation_config.json" ).is_file(): with open(special_tokens_path) as fp: config = json.load(fp) @@ -93,8 +86,7 @@ def check_if_bos_token_used(self, checkpoint_dir: Path) -> bool: with open(tokenizer_config_path) as fp: config = json.load(fp) if any( - config.get(check, False) - for check in ("add_bos_token", "add_prefix_space") + config.get(check, False) for check in ("add_bos_token", "add_prefix_space") ): return True # for examples that also use the Llama tokenizer, but do not have or set add_bos_token to True. diff --git a/llm-lora-finetuning/lit_gpt/utils.py b/llm-lora-finetuning/lit_gpt/utils.py index ba4706ff..8fbcc028 100644 --- a/llm-lora-finetuning/lit_gpt/utils.py +++ b/llm-lora-finetuning/lit_gpt/utils.py @@ -39,9 +39,7 @@ def find_multiple(n: int, k: int) -> int: return n + k - (n % k) -def num_parameters( - module: nn.Module, requires_grad: Optional[bool] = None -) -> int: +def num_parameters(module: nn.Module, requires_grad: Optional[bool] = None) -> int: total = 0 for p in module.parameters(): if requires_grad is None or p.requires_grad == requires_grad: @@ -61,9 +59,7 @@ def check_valid_checkpoint_dir(checkpoint_dir: Path) -> None: checkpoint_dir / "tokenizer.json" ).is_file() or (checkpoint_dir / "tokenizer.model").is_file(), - "tokenizer_config.json": ( - checkpoint_dir / "tokenizer_config.json" - ).is_file(), + "tokenizer_config.json": (checkpoint_dir / "tokenizer_config.json").is_file(), } if checkpoint_dir.is_dir(): if all(files.values()): @@ -96,10 +92,7 @@ class SavingProxyForStorage: def __init__(self, obj, saver, protocol_version=5): self.protocol_version = protocol_version self.saver = saver - if not ( - isinstance(obj, torch.storage.TypedStorage) - or torch.is_storage(obj) - ): + if not (isinstance(obj, torch.storage.TypedStorage) or torch.is_storage(obj)): raise TypeError(f"expected storage, not {type(obj)}") # this logic is taken from PyTorch 2.0+ torch/serialization.py @@ -132,9 +125,7 @@ def __reduce_ex__(self, protocol_version): class SavingProxyForTensor: def __init__(self, tensor, saver, protocol_version=5): self.protocol_version = protocol_version - self.reduce_ret_fn, reduce_args = tensor.__reduce_ex__( - protocol_version - ) + self.reduce_ret_fn, reduce_args = tensor.__reduce_ex__(protocol_version) if reduce_args[0] == torch._utils._rebuild_tensor_v2: # for Tensors with Python attributes (a0, a1, (storage, *a2_other), *other_reduce_args) = reduce_args @@ -185,9 +176,7 @@ def persistent_id(self, obj): if isinstance(obj, SavingProxyForStorage): return obj.storage_info - if isinstance(obj, torch.storage.TypedStorage) or torch.is_storage( - obj - ): + if isinstance(obj, torch.storage.TypedStorage) or torch.is_storage(obj): if isinstance(obj, torch.storage.TypedStorage): # TODO: Once we decide to break serialization FC, this case # can be deleted @@ -208,10 +197,7 @@ def persistent_id(self, obj): # not allocated, don't perform this check if storage.data_ptr() != 0: if storage.data_ptr() in self.storage_dtypes: - if ( - storage_dtype - != self.storage_dtypes[storage.data_ptr()] - ): + if storage_dtype != self.storage_dtypes[storage.data_ptr()]: raise RuntimeError( "Cannot save multiple tensors or storages that view the same data as different types" ) @@ 
-304,8 +290,7 @@ def chunked_cross_entropy( # chunk cross entropy logit_chunks = [ - logit_chunk.reshape(-1, logit_chunk.size(-1)) - for logit_chunk in logits + logit_chunk.reshape(-1, logit_chunk.size(-1)) for logit_chunk in logits ] target_chunks = [ target_chunk.reshape(-1) @@ -347,16 +332,12 @@ def chunked_cross_entropy( return torch.cat(loss_chunks).sum() / max(1, non_masked_elems) -def map_old_state_dict_weights( - state_dict: Dict, mapping: Mapping, prefix: str -) -> Dict: +def map_old_state_dict_weights(state_dict: Dict, mapping: Mapping, prefix: str) -> Dict: for checkpoint_name, attribute_name in mapping.items(): full_checkpoint_name = prefix + checkpoint_name if full_checkpoint_name in state_dict: full_attribute_name = prefix + attribute_name - state_dict[full_attribute_name] = state_dict.pop( - full_checkpoint_name - ) + state_dict[full_attribute_name] = state_dict.pop(full_checkpoint_name) return state_dict diff --git a/llm-lora-finetuning/scripts/convert_hf_checkpoint.py b/llm-lora-finetuning/scripts/convert_hf_checkpoint.py index 14d0ff6f..86a5bff9 100644 --- a/llm-lora-finetuning/scripts/convert_hf_checkpoint.py +++ b/llm-lora-finetuning/scripts/convert_hf_checkpoint.py @@ -211,8 +211,7 @@ def copy_weights_phi( dtype: Optional[torch.dtype] = None, ) -> None: if any( - layer_name.startswith(("layers.", "transformer.")) - for layer_name in hf_weights + layer_name.startswith(("layers.", "transformer.")) for layer_name in hf_weights ): raise ValueError( "You are using an outdated Phi checkpoint. Please reload it as described in 'tutorials/download_phi.md'" @@ -307,9 +306,7 @@ def load_param( @torch.inference_mode() def convert_hf_checkpoint( *, - checkpoint_dir: Path = Path( - "checkpoints/stabilityai/stablelm-base-alpha-3b" - ), + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), model_name: Optional[str] = None, dtype: Optional[str] = None, ) -> None: @@ -342,22 +339,16 @@ def convert_hf_checkpoint( # Load the json file containing weight mapping pytorch_bin_map_json_path = checkpoint_dir / "pytorch_model.bin.index.json" - if ( - pytorch_bin_map_json_path.is_file() - ): # not all checkpoints have this file + if pytorch_bin_map_json_path.is_file(): # not all checkpoints have this file with open(pytorch_bin_map_json_path) as json_map: bin_index = json.load(json_map) - bin_files = { - checkpoint_dir / bin for bin in bin_index["weight_map"].values() - } + bin_files = {checkpoint_dir / bin for bin in bin_index["weight_map"].values()} else: bin_files = set(checkpoint_dir.glob("*.bin")) # some checkpoints serialize the training arguments bin_files = {f for f in bin_files if f.name != "training_args.bin"} if not bin_files: - raise ValueError( - f"Expected {str(checkpoint_dir)!r} to contain .bin files" - ) + raise ValueError(f"Expected {str(checkpoint_dir)!r} to contain .bin files") with incremental_save(checkpoint_dir / "lit_model.pth") as saver: # for checkpoints that split the QKV across several files, we need to keep all the bin files diff --git a/llm-lora-finetuning/scripts/convert_lit_checkpoint.py b/llm-lora-finetuning/scripts/convert_lit_checkpoint.py index 1239e7d2..6b06e888 100644 --- a/llm-lora-finetuning/scripts/convert_lit_checkpoint.py +++ b/llm-lora-finetuning/scripts/convert_lit_checkpoint.py @@ -261,9 +261,7 @@ def convert_lit_checkpoint( copy_fn = partial(copy_weights_falcon, config.name) elif config._mlp_class in ("LLaMAMLP", "GemmaMLP", "LLaMAMoE"): untie_weights = "Gemma" in config.name - copy_fn = partial( - copy_weights_llama, config, 
untie_weights=untie_weights - ) + copy_fn = partial(copy_weights_llama, config, untie_weights=untie_weights) elif "phi" in config.name: copy_fn = partial(copy_weights_phi, config) else: diff --git a/llm-lora-finetuning/scripts/download.py b/llm-lora-finetuning/scripts/download.py index e5a7459d..594ae9dc 100644 --- a/llm-lora-finetuning/scripts/download.py +++ b/llm-lora-finetuning/scripts/download.py @@ -38,9 +38,7 @@ def download_from_hub( from huggingface_hub import snapshot_download - if ( - "meta-llama" in repo_id or "falcon-180" in repo_id - ) and not access_token: + if ("meta-llama" in repo_id or "falcon-180" in repo_id) and not access_token: raise ValueError( f"{repo_id} requires authentication, please set the `HF_TOKEN=your_token` environment" " variable or pass --access_token=your_token. You can find your token by visiting" diff --git a/llm-lora-finetuning/scripts/merge_lora.py b/llm-lora-finetuning/scripts/merge_lora.py index 89818a99..b9a2baa3 100644 --- a/llm-lora-finetuning/scripts/merge_lora.py +++ b/llm-lora-finetuning/scripts/merge_lora.py @@ -24,9 +24,7 @@ def merge_lora( lora_path: Path = Path("out/lora/alpaca/lit_model_lora_finetuned.pth"), - checkpoint_dir: Path = Path( - "checkpoints/stabilityai/stablelm-base-alpha-3b" - ), + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), out_dir: Path = Path("out/lora/checkpoint"), precision: Optional[str] = None, lora_r: int = 8, diff --git a/llm-lora-finetuning/scripts/prepare_alpaca.py b/llm-lora-finetuning/scripts/prepare_alpaca.py index cde6fca1..77c62691 100644 --- a/llm-lora-finetuning/scripts/prepare_alpaca.py +++ b/llm-lora-finetuning/scripts/prepare_alpaca.py @@ -22,9 +22,7 @@ def prepare( destination_path: Path = Path("data/alpaca"), - checkpoint_dir: Path = Path( - "checkpoints/stabilityai/stablelm-base-alpha-3b" - ), + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), test_split_fraction: float = 0.03865, # to get exactly 2000 test samples, seed: int = 42, mask_inputs: bool = False, # as in alpaca-lora @@ -39,9 +37,7 @@ def prepare( which stores the preprocessed and tokenized prompts and labels. 
""" if max_seq_length is None: - with open( - checkpoint_dir / "lit_config.json", "r", encoding="utf-8" - ) as file: + with open(checkpoint_dir / "lit_config.json", "r", encoding="utf-8") as file: config = json.load(file) max_seq_length = config["block_size"] diff --git a/llm-lora-finetuning/scripts/prepare_csv.py b/llm-lora-finetuning/scripts/prepare_csv.py index bbd27074..16b45cd4 100644 --- a/llm-lora-finetuning/scripts/prepare_csv.py +++ b/llm-lora-finetuning/scripts/prepare_csv.py @@ -22,9 +22,7 @@ def prepare( csv_path: Path, destination_path: Path = Path("data/csv"), - checkpoint_dir: Path = Path( - "checkpoints/stabilityai/stablelm-base-alpha-3b" - ), + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), test_split_fraction: float = 0.1, seed: int = 42, mask_inputs: bool = False, @@ -48,9 +46,7 @@ def prepare( df = pd.read_csv(csv_path, dtype=str).fillna("") if not (df.columns.values == columns).all(): - raise ValueError( - f"CSV columns must be {columns}, found {df.columns.values}" - ) + raise ValueError(f"CSV columns must be {columns}, found {df.columns.values}") data = json.loads(df.to_json(orient="records", indent=4)) print("Loading tokenizer...") diff --git a/llm-lora-finetuning/scripts/prepare_dolly.py b/llm-lora-finetuning/scripts/prepare_dolly.py index 8bb43439..3ecf973f 100644 --- a/llm-lora-finetuning/scripts/prepare_dolly.py +++ b/llm-lora-finetuning/scripts/prepare_dolly.py @@ -23,9 +23,7 @@ def prepare( destination_path: Path = Path("data/dolly"), - checkpoint_dir: Path = Path( - "checkpoints/stabilityai/stablelm-base-alpha-3b" - ), + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), test_split_fraction: float = 0.1, seed: int = 42, mask_inputs: bool = False, @@ -41,9 +39,7 @@ def prepare( """ if max_seq_length is None: - with open( - checkpoint_dir / "lit_config.json", "r", encoding="utf-8" - ) as file: + with open(checkpoint_dir / "lit_config.json", "r", encoding="utf-8") as file: config = json.load(file) max_seq_length = config["block_size"] diff --git a/llm-lora-finetuning/scripts/prepare_flan.py b/llm-lora-finetuning/scripts/prepare_flan.py index a34b5472..90707853 100644 --- a/llm-lora-finetuning/scripts/prepare_flan.py +++ b/llm-lora-finetuning/scripts/prepare_flan.py @@ -29,9 +29,7 @@ def load_jsonl(filename): def prepare( destination_path: Path = Path("data/flan"), - checkpoint_dir: Path = Path( - "checkpoints/stabilityai/stablelm-base-alpha-3b" - ), + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), mask_inputs: bool = False, # as in alpaca-lora subsets: Optional[str] = None, ignore_index: int = -1, @@ -124,9 +122,7 @@ def prepare( subsets = list(supported_subsets) if max_seq_length is None: - with open( - checkpoint_dir / "lit_config.json", "r", encoding="utf-8" - ) as file: + with open(checkpoint_dir / "lit_config.json", "r", encoding="utf-8") as file: config = json.load(file) max_seq_length = config["block_size"] diff --git a/llm-lora-finetuning/scripts/prepare_lima.py b/llm-lora-finetuning/scripts/prepare_lima.py index e27928ce..75b57e20 100644 --- a/llm-lora-finetuning/scripts/prepare_lima.py +++ b/llm-lora-finetuning/scripts/prepare_lima.py @@ -23,9 +23,7 @@ def prepare( destination_path: Path = Path("data/lima"), test_split_fraction: float = 0.1, - checkpoint_dir: Path = Path( - "checkpoints/stabilityai/stablelm-base-alpha-3b" - ), + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), mask_inputs: bool = False, # as in alpaca-lora seed: int 
= 42, include_multiturn_conversations: bool = False, @@ -48,9 +46,7 @@ def prepare( ) if max_seq_length is None: - with open( - checkpoint_dir / "lit_config.json", "r", encoding="utf-8" - ) as file: + with open(checkpoint_dir / "lit_config.json", "r", encoding="utf-8") as file: config = json.load(file) max_seq_length = config["block_size"] @@ -60,9 +56,7 @@ def prepare( from datasets import load_dataset dataset = load_dataset(data_repo_id, token=access_token) - train_data = format_dataset( - dataset["train"], include_multiturn_conversations - ) + train_data = format_dataset(dataset["train"], include_multiturn_conversations) # test set is present but doesn't have any solutions, so we cannot use it here # but have to create our own diff --git a/llm-lora-finetuning/scripts/prepare_longform.py b/llm-lora-finetuning/scripts/prepare_longform.py index 6327bad8..6cea71a5 100644 --- a/llm-lora-finetuning/scripts/prepare_longform.py +++ b/llm-lora-finetuning/scripts/prepare_longform.py @@ -22,9 +22,7 @@ def prepare( destination_path: Path = Path("data/longform"), - checkpoint_dir: Path = Path( - "checkpoints/stabilityai/stablelm-base-alpha-3b" - ), + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), mask_inputs: bool = False, # as in alpaca-lora ignore_index: int = -1, max_seq_length: Optional[int] = None, @@ -35,9 +33,7 @@ def prepare( which stores the preprocessed and tokenized prompts and labels. """ if max_seq_length is None: - with open( - checkpoint_dir / "lit_config.json", "r", encoding="utf-8" - ) as file: + with open(checkpoint_dir / "lit_config.json", "r", encoding="utf-8") as file: config = json.load(file) max_seq_length = config["block_size"] @@ -47,9 +43,13 @@ def prepare( # val_file_name = "val.json" test_file_name = "test.json" - train_file_url = "https://raw.githubusercontent.com/akoksal/LongForm/main/dataset/train.json" + train_file_url = ( + "https://raw.githubusercontent.com/akoksal/LongForm/main/dataset/train.json" + ) # val_file_url = "https://raw.githubusercontent.com/akoksal/LongForm/main/dataset/val.json" - test_file_url = "https://raw.githubusercontent.com/akoksal/LongForm/main/dataset/test.json" + test_file_url = ( + "https://raw.githubusercontent.com/akoksal/LongForm/main/dataset/test.json" + ) train_file_path = destination_path / train_file_name print("Loading train data file...") diff --git a/llm-lora-finetuning/scripts/prepare_openwebtext.py b/llm-lora-finetuning/scripts/prepare_openwebtext.py index fbb4a8d9..4f1d255e 100644 --- a/llm-lora-finetuning/scripts/prepare_openwebtext.py +++ b/llm-lora-finetuning/scripts/prepare_openwebtext.py @@ -20,9 +20,7 @@ def prepare( destination_path: Path = Path("data/openwebtext"), - checkpoint_dir: Path = Path( - "checkpoints/stabilityai/stablelm-base-alpha-3b" - ), + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), seed: int = 42, test_size: Union[float, int, None] = 0.0005, ) -> None: @@ -48,9 +46,7 @@ def prepare( split_dataset = dataset["train"].train_test_split( test_size=test_size, seed=seed, shuffle=True ) - split_dataset["val"] = split_dataset.pop( - "test" - ) # rename the test split to val + split_dataset["val"] = split_dataset.pop("test") # rename the test split to val def process(example): ids = tokenizer.encode(example["text"]).tolist() @@ -73,18 +69,12 @@ def process(example): for split, dset in tokenized.items(): arr_len = np.sum(dset["len"], dtype=np.uint64) filename = destination_path / f"{split}.bin" - dtype = ( - np.uint16 - ) # (can do since 
enc.max_token_value == 50256 is < 2**16) - arr = np.memmap( - str(filename), dtype=dtype, mode="w+", shape=(arr_len,) - ) + dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16) + arr = np.memmap(str(filename), dtype=dtype, mode="w+", shape=(arr_len,)) total_batches = 1024 idx = 0 - for batch_idx in tqdm( - range(total_batches), desc=f"writing {filename}" - ): + for batch_idx in tqdm(range(total_batches), desc=f"writing {filename}"): # Batch together samples for faster write batch = dset.shard( num_shards=total_batches, index=batch_idx, contiguous=True diff --git a/llm-lora-finetuning/scripts/prepare_redpajama.py b/llm-lora-finetuning/scripts/prepare_redpajama.py index 02044307..23224c1f 100644 --- a/llm-lora-finetuning/scripts/prepare_redpajama.py +++ b/llm-lora-finetuning/scripts/prepare_redpajama.py @@ -110,9 +110,7 @@ def prepare_full( is_cc = set_name == "common_crawl" - filenames = glob.glob( - os.path.join(source_path, pattern), recursive=True - ) + filenames = glob.glob(os.path.join(source_path, pattern), recursive=True) if not filenames: raise RuntimeError( @@ -137,32 +135,24 @@ def prepare_full( print(f"Processing {name}") if is_cc: - with zstd.open( - open(filepath, "rb"), "rt", encoding="utf-8" - ) as f: + with zstd.open(open(filepath, "rb"), "rt", encoding="utf-8") as f: for row in tqdm(f): text = json.loads(row)["text"] text_ids = tokenizer.encode(text) - builder.add_array( - np.array(text_ids, dtype=builder.dtype) - ) + builder.add_array(np.array(text_ids, dtype=builder.dtype)) else: with open(filepath, encoding="utf-8") as f: for row in tqdm(f): text = json.loads(row)["text"] text_ids = tokenizer.encode(text) - builder.add_array( - np.array(text_ids, dtype=builder.dtype) - ) + builder.add_array(np.array(text_ids, dtype=builder.dtype)) builder.write_reminder() def prepare( source_path: Path = Path("data/RedPajama-Data-1T-Sample"), - checkpoint_dir: Path = Path( - "checkpoints/stabilityai/stablelm-base-alpha-3b" - ), + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), destination_path: Path = Path("data/redpajama_sample"), sample: bool = True, match: str = "", diff --git a/llm-lora-finetuning/scripts/prepare_slimpajama.py b/llm-lora-finetuning/scripts/prepare_slimpajama.py index 0a80191f..7a83316a 100644 --- a/llm-lora-finetuning/scripts/prepare_slimpajama.py +++ b/llm-lora-finetuning/scripts/prepare_slimpajama.py @@ -30,10 +30,7 @@ def prepare_item(self, filepath): with zstd.open(open(filepath, "rb"), "rt", encoding="utf-8") as f: for row in f: text = json.loads(row)["text"] - if ( - json.loads(row)["meta"]["redpajama_set_name"] - == "RedPajamaGithub" - ): + if json.loads(row)["meta"]["redpajama_set_name"] == "RedPajamaGithub": continue # exclude the GitHub data since it overlaps with starcoder text_ids = self.tokenizer.encode(text, bos=False, eos=True) yield text_ids @@ -47,9 +44,7 @@ def prepare( fast_dev_run: bool = False, ) -> None: tokenizer = Tokenizer(tokenizer_path) - data_recipe = SlimPajamaDataRecipe( - tokenizer=tokenizer, chunk_size=chunk_size - ) + data_recipe = SlimPajamaDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size) data_processor = DataProcessor( input_dir=str(input_dir), output_dir=str(output_dir), diff --git a/llm-lora-finetuning/scripts/prepare_starcoder.py b/llm-lora-finetuning/scripts/prepare_starcoder.py index 1f67c93e..f9104e97 100644 --- a/llm-lora-finetuning/scripts/prepare_starcoder.py +++ b/llm-lora-finetuning/scripts/prepare_starcoder.py @@ -57,9 +57,7 @@ def prepare( fast_dev_run: bool = 
False, ) -> None: tokenizer = Tokenizer(tokenizer_path) - data_recipe = StarcoderDataRecipe( - tokenizer=tokenizer, chunk_size=chunk_size - ) + data_recipe = StarcoderDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size) data_processor = DataProcessor( input_dir=str(input_dir), output_dir=str(output_dir), diff --git a/llm-lora-finetuning/steps/eval.py b/llm-lora-finetuning/steps/eval.py index 4d38bc1f..6a51615b 100644 --- a/llm-lora-finetuning/steps/eval.py +++ b/llm-lora-finetuning/steps/eval.py @@ -5,24 +5,31 @@ from typing import Annotated, Any, Dict, Optional from evaluate.lm_eval_harness import run_eval_harness -from zenml import step +from zenml import step, log_artifact_metadata, log_model_metadata from scripts.download import download_from_hub from scripts.merge_lora import merge_lora +from steps.utils import get_huggingface_access_token @step def eval( model_repo: str, adapter_repo: Optional[str] = None ) -> Annotated[Dict[str, Any], "evaluation_results"]: + access_token = get_huggingface_access_token() + model_dir = Path("model") - download_from_hub(repo_id=model_repo, checkpoint_dir=model_dir) + download_from_hub( + repo_id=model_repo, checkpoint_dir=model_dir, access_token=access_token + ) if adapter_repo: adapter_dir = Path("adapter") merged_dir = Path("merged") - download_from_hub(repo_id=adapter_repo, checkpoint_dir=adapter_dir) + download_from_hub( + repo_id=adapter_repo, checkpoint_dir=adapter_dir, access_token=access_token + ) lora_path = adapter_dir / "lit_model_lora_finetuned.pth" merge_lora( diff --git a/llm-lora-finetuning/steps/feature_engineering.py b/llm-lora-finetuning/steps/feature_engineering.py index 07ca4cd7..faf4ce1d 100644 --- a/llm-lora-finetuning/steps/feature_engineering.py +++ b/llm-lora-finetuning/steps/feature_engineering.py @@ -1,18 +1,21 @@ # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
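# A condensed, standalone sketch of the evaluation flow that the steps/eval.py hunk above sets
# up: resolve an optional Hugging Face token, download the base checkpoint and the LoRA
# adapter, then merge them before run_eval_harness is invoked. Repo ids and paths are
# placeholders; the keyword arguments follow the calls shown in this diff.
from pathlib import Path

from scripts.download import download_from_hub
from scripts.merge_lora import merge_lora
from steps.utils import get_huggingface_access_token

token = get_huggingface_access_token()  # ZenML secret first, HF_TOKEN env var as fallback
download_from_hub(repo_id="some-org/base-model", checkpoint_dir=Path("model"), access_token=token)
download_from_hub(repo_id="some-org/adapter", checkpoint_dir=Path("adapter"), access_token=token)
merge_lora(
    lora_path=Path("adapter") / "lit_model_lora_finetuned.pth",
    checkpoint_dir=Path("model"),
    out_dir=Path("merged"),
)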
import importlib +import json +import os +from dataclasses import asdict from pathlib import Path from tempfile import mkdtemp from typing import Any, ClassVar, Tuple, Type +from lit_gpt import Config from zenml import step from zenml.enums import ArtifactType from zenml.io import fileio from zenml.materializers.base_materializer import BaseMaterializer -from lit_gpt import Config -import json -from dataclasses import asdict + from scripts.download import download_from_hub -import os +from steps.utils import get_huggingface_access_token + class DirectoryMaterializer(BaseMaterializer): ASSOCIATED_TYPES: ClassVar[Tuple[Type[Any], ...]] = (Path,) @@ -53,9 +56,14 @@ def _copy_directory(src: str, dst: str) -> None: @step(output_materializers=DirectoryMaterializer) def feature_engineering(model_repo: str, dataset_name: str) -> Path: + access_token = get_huggingface_access_token() + checkpoint_root_dir = Path("checkpoints") download_from_hub( - repo_id=model_repo, tokenizer_only=True, checkpoint_dir=checkpoint_root_dir + repo_id=model_repo, + tokenizer_only=True, + checkpoint_dir=checkpoint_root_dir, + access_token=access_token, ) checkpoint_dir = checkpoint_root_dir / model_repo @@ -71,7 +79,5 @@ def feature_engineering(model_repo: str, dataset_name: str) -> Path: helper_module = importlib.import_module(f"scripts.prepare_{dataset_name}") prepare_function = getattr(helper_module, "prepare") - prepare_function( - checkpoint_dir=checkpoint_dir, destination_path=destination_dir - ) + prepare_function(checkpoint_dir=checkpoint_dir, destination_path=destination_dir) return destination_dir diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py index 06cead79..ca4a4de5 100644 --- a/llm-lora-finetuning/steps/finetune.py +++ b/llm-lora-finetuning/steps/finetune.py @@ -6,13 +6,17 @@ from finetune.lora import setup from huggingface_hub import upload_folder from lit_gpt.args import IOArgs -from zenml import step +from zenml import log_model_metadata, step +from zenml.logger import get_logger from scripts.convert_hf_checkpoint import convert_hf_checkpoint from scripts.convert_lit_checkpoint import convert_lit_checkpoint from scripts.download import download_from_hub from scripts.merge_lora import merge_lora from scripts.prepare_alpaca import prepare +from steps.utils import get_huggingface_access_token + +logger = get_logger(__file__) @step @@ -23,31 +27,46 @@ def finetune( convert_to_hf: bool = False, data_dir: Optional[Path] = None, ) -> None: - checkpoint_dir = Path("checkpoints") - output_dir = Path("out/lora/alpaca") - download_from_hub(repo_id=repo_id, checkpoint_dir=checkpoint_dir) + access_token = get_huggingface_access_token() + + checkpoint_root_dir = Path("checkpoints") + checkpoint_dir = checkpoint_root_dir / repo_id + + if checkpoint_dir.exists(): + logger.info("Checkpoint directory already exists, skipping download...") + else: + download_from_hub( + repo_id=repo_id, + checkpoint_dir=checkpoint_root_dir, + access_token=access_token, + ) + convert_hf_checkpoint(checkpoint_dir=checkpoint_dir) if not data_dir: data_dir = Path("data/alpaca") prepare(destination_path=data_dir, checkpoint_dir=checkpoint_dir) - io_args = ( - IOArgs( - train_data_dir=data_dir, - val_data_dir=data_dir, - checkpoint_dir=checkpoint_dir, - out_dir=output_dir, - ), + model_name = checkpoint_dir.name + dataset_name = data_dir.name + + log_model_metadata( + metadata={"model_name": model_name, "dataset_name": dataset_name} ) - setup(precision="bf16-true", io=io_args) + output_dir = 
Path("output/lora") / dataset_name - model_name = repo_id.split("/")[-1] + io_args = IOArgs( + train_data_dir=data_dir, + val_data_dir=data_dir, + checkpoint_dir=checkpoint_dir, + out_dir=output_dir, + ) + setup(precision="bf16-true", io=io_args) if merged_output_repo: lora_path = output_dir / model_name / "lit_model_lora_finetuned.pth" - merge_output_dir = Path("out/lora_merged") / model_name + merge_output_dir = Path("output/lora_merged") / dataset_name / model_name merge_lora( lora_alpha=lora_path, checkpoint_dir=checkpoint_dir, @@ -60,7 +79,7 @@ def finetune( shutil.copy(src=path, dst=destination) if convert_to_hf: - upload_dir = Path("hf_checkpoint_merged") + upload_dir = Path("output/lora_merged_hf") / dataset_name / model_name convert_lit_checkpoint( checkpoint_path=merged_output_repo / "lit_model.pth", output_path=output_dir, @@ -69,9 +88,27 @@ def finetune( else: upload_dir = merge_output_dir - upload_folder(repo_id=merged_output_repo, folder_path=upload_dir) + commit = upload_folder( + repo_id=merged_output_repo, + folder_path=upload_dir, + token=access_token, + ) + log_model_metadata( + metadata={ + "merged_model_huggingface_commit_hash": commit.oid, + "merged_model_huggingface_commit_url": commit.commit_url, + } + ) if adapter_output_repo: - upload_folder( - repo_id=adapter_output_repo, folder_path=output_dir / model_name + commit = upload_folder( + repo_id=adapter_output_repo, + folder_path=output_dir / model_name, + token=access_token, + ) + log_model_metadata( + metadata={ + "adapter_huggingface_commit_hash": commit.oid, + "adapter_huggingface_commit_url": commit.commit_url, + } ) diff --git a/llm-lora-finetuning/steps/merge.py b/llm-lora-finetuning/steps/merge.py index 45b0d35f..60d17561 100644 --- a/llm-lora-finetuning/steps/merge.py +++ b/llm-lora-finetuning/steps/merge.py @@ -8,6 +8,7 @@ from scripts.convert_lit_checkpoint import convert_lit_checkpoint from scripts.download import download_from_hub from scripts.merge_lora import merge_lora +from steps.utils import get_huggingface_access_token @step @@ -17,12 +18,22 @@ def merge( output_repo: str, convert_to_hf: bool = False, ) -> None: + access_token = get_huggingface_access_token() + base_model_dir = Path("checkpoints") adapter_dir = Path("adapter") merged_dir = Path("merged") - download_from_hub(repo_id=base_model_repo, checkpoint_dir=base_model_dir) - download_from_hub(repo_id=adapter_repo, checkpoint_dir=adapter_dir) + download_from_hub( + repo_id=base_model_repo, + checkpoint_dir=base_model_dir, + access_token=access_token, + ) + download_from_hub( + repo_id=adapter_repo, + checkpoint_dir=adapter_dir, + access_token=access_token, + ) lora_path = adapter_dir / "lit_model_lora_finetuned.pth" merge_lora( diff --git a/llm-lora-finetuning/steps/utils.py b/llm-lora-finetuning/steps/utils.py new file mode 100644 index 00000000..0d4fa90d --- /dev/null +++ b/llm-lora-finetuning/steps/utils.py @@ -0,0 +1,11 @@ +import os +from typing import Optional + +from zenml.client import Client + + +def get_huggingface_access_token() -> Optional[str]: + try: + return Client().get_secret("huggingface_credentials").secret_values["token"] + except KeyError: + return os.getenv("HF_TOKEN") From 9ccd8b7ec1ecb6fabdd15a93448d49b825e1d54b Mon Sep 17 00:00:00 2001 From: Michael Schuster Date: Wed, 6 Mar 2024 15:30:08 +0800 Subject: [PATCH 09/26] Set precision --- .../evaluate/lm_eval_harness.py | 12 ++- llm-lora-finetuning/finetune/adapter.py | 32 +++++--- llm-lora-finetuning/finetune/adapter_v2.py | 32 +++++--- 
llm-lora-finetuning/finetune/full.py | 45 ++++++++---- llm-lora-finetuning/finetune/lora.py | 32 +++++--- llm-lora-finetuning/generate/adapter.py | 12 ++- llm-lora-finetuning/generate/adapter_v2.py | 12 ++- llm-lora-finetuning/generate/base.py | 8 +- llm-lora-finetuning/generate/full.py | 8 +- llm-lora-finetuning/generate/lora.py | 8 +- llm-lora-finetuning/generate/sequentially.py | 42 ++++++++--- llm-lora-finetuning/generate/tp.py | 32 ++++++-- llm-lora-finetuning/lit_gpt/adapter.py | 38 +++++++--- llm-lora-finetuning/lit_gpt/adapter_v2.py | 26 +++++-- llm-lora-finetuning/lit_gpt/args.py | 8 +- llm-lora-finetuning/lit_gpt/config.py | 36 ++++++--- llm-lora-finetuning/lit_gpt/lora.py | 60 +++++++++++---- llm-lora-finetuning/lit_gpt/model.py | 73 ++++++++++++++----- llm-lora-finetuning/lit_gpt/packed_dataset.py | 8 +- llm-lora-finetuning/lit_gpt/tokenizer.py | 18 +++-- llm-lora-finetuning/lit_gpt/utils.py | 37 +++++++--- .../scripts/convert_hf_checkpoint.py | 19 +++-- .../scripts/convert_lit_checkpoint.py | 4 +- llm-lora-finetuning/scripts/download.py | 4 +- llm-lora-finetuning/scripts/merge_lora.py | 4 +- llm-lora-finetuning/scripts/prepare_alpaca.py | 8 +- llm-lora-finetuning/scripts/prepare_csv.py | 8 +- llm-lora-finetuning/scripts/prepare_dolly.py | 8 +- llm-lora-finetuning/scripts/prepare_flan.py | 8 +- llm-lora-finetuning/scripts/prepare_lima.py | 12 ++- .../scripts/prepare_longform.py | 16 ++-- .../scripts/prepare_openwebtext.py | 20 +++-- .../scripts/prepare_redpajama.py | 20 +++-- .../scripts/prepare_slimpajama.py | 9 ++- .../scripts/prepare_starcoder.py | 4 +- llm-lora-finetuning/steps/eval.py | 9 ++- .../steps/feature_engineering.py | 4 +- llm-lora-finetuning/steps/finetune.py | 15 +++- llm-lora-finetuning/steps/utils.py | 6 +- 39 files changed, 554 insertions(+), 203 deletions(-) diff --git a/llm-lora-finetuning/evaluate/lm_eval_harness.py b/llm-lora-finetuning/evaluate/lm_eval_harness.py index 55de2cc5..6f90c19f 100644 --- a/llm-lora-finetuning/evaluate/lm_eval_harness.py +++ b/llm-lora-finetuning/evaluate/lm_eval_harness.py @@ -45,7 +45,9 @@ def __init__( @classmethod def create_from_arg_string(cls, arg_string, additional_config=None): - kwargs = {el.split("=")[0]: el.split("=")[1] for el in arg_string.split(",")} + kwargs = { + el.split("=")[0]: el.split("=")[1] for el in arg_string.split(",") + } return cls(**kwargs, **additional_config) @property @@ -85,7 +87,9 @@ def _model_call(self, inps): return self.model(inps) @torch.inference_mode() - def _model_generate(self, context, max_length, eos_token_id) -> torch.Tensor: + def _model_generate( + self, context, max_length, eos_token_id + ) -> torch.Tensor: # this only supports batch size 1 assert context.shape[0] == 1 out = generate(self.model, context[0], max_length, eos_id=eos_token_id) @@ -174,7 +178,9 @@ def run_eval_harness( plugins = None if quantize is not None and quantize.startswith("bnb."): if "mixed" in precision: - raise ValueError("Quantization and mixed precision is not supported.") + raise ValueError( + "Quantization and mixed precision is not supported." 
+ ) dtype = { "16-true": torch.float16, "bf16-true": torch.bfloat16, diff --git a/llm-lora-finetuning/finetune/adapter.py b/llm-lora-finetuning/finetune/adapter.py index 1826603f..acf8f6d4 100644 --- a/llm-lora-finetuning/finetune/adapter.py +++ b/llm-lora-finetuning/finetune/adapter.py @@ -76,7 +76,9 @@ def setup( plugins = None if quantize is not None and quantize.startswith("bnb."): if "mixed" in precision: - raise ValueError("Quantization and mixed precision is not supported.") + raise ValueError( + "Quantization and mixed precision is not supported." + ) dtype = { "16-true": torch.float16, "bf16-true": torch.bfloat16, @@ -138,7 +140,9 @@ def main( check_valid_checkpoint_dir(io.checkpoint_dir) - fabric.seed_everything(1337) # same seed for every process to init model (FSDP) + fabric.seed_everything( + 1337 + ) # same seed for every process to init model (FSDP) if fabric.global_rank == 0: os.makedirs(io.out_dir, exist_ok=True) @@ -147,7 +151,9 @@ def main( val_data = torch.load(io.val_data_dir / "test.pt") checkpoint_path = io.checkpoint_dir / "lit_model.pth" - fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}") + fabric.print( + f"Loading model {str(checkpoint_path)!r} with {config.__dict__}" + ) with fabric.init_module(empty_init=(devices > 1)): model = GPT(config) mark_only_adapter_as_trainable(model) @@ -199,7 +205,9 @@ def main( ) fabric.print(f"Training time: {(time.perf_counter()-train_time):.2f}s") if fabric.device.type == "cuda": - fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") + fabric.print( + f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB" + ) # Save the final checkpoint at the end of training save_path = io.out_dir / "lit_model_adapter_finetuned.pth" @@ -220,7 +228,9 @@ def fit( ) -> None: tokenizer = Tokenizer(io.checkpoint_dir) longest_seq_length, longest_seq_ix = get_longest_seq_length(train_data) - model.max_seq_length = min(longest_seq_length, train.max_seq_length or float("inf")) + model.max_seq_length = min( + longest_seq_length, train.max_seq_length or float("inf") + ) fabric.print( f"The longest sequence length in the train data is {longest_seq_length}, the model's maximum sequence length is" f" {model.max_seq_length} and context length is {model.config.block_size}" @@ -251,7 +261,9 @@ def fit( longest_seq_ix if iter_num == 1 else None, ) - is_accumulating = iter_num % train.gradient_accumulation_iters(devices) != 0 + is_accumulating = ( + iter_num % train.gradient_accumulation_iters(devices) != 0 + ) with fabric.no_backward_sync(model, enabled=is_accumulating): logits = model(input_ids, lm_head_chunk_size=128) # shift the targets such that output n predicts token n+1 @@ -283,7 +295,9 @@ def fit( if not is_accumulating and step_count % eval.interval == 0: t0 = time.perf_counter() - val_loss = validate(fabric, model, val_data, tokenizer, eval, train) + val_loss = validate( + fabric, model, val_data, tokenizer, eval, train + ) t1 = time.perf_counter() - t0 fabric.print( f"iter {iter_num}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f} ms" @@ -318,9 +332,7 @@ def validate( val_loss = losses.mean() # produce an example: - instruction = ( - "Recommend a movie for me to watch during the weekend and explain the reason." - ) + instruction = "Recommend a movie for me to watch during the weekend and explain the reason." 
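# A small, self-contained illustration of the accumulation bookkeeping used in fit() above,
# with made-up sizes. It assumes TrainArgs.batch_size(devices) spreads a global batch size
# across devices, so that gradient_accumulation_iters(devices) equals
# batch_size(devices) // micro_batch_size; the optimizer only steps on iterations where
# is_accumulating is False.
global_batch_size, micro_batch_size, devices = 128, 4, 2
batch_size_per_device = global_batch_size // devices          # 64
grad_accum_iters = batch_size_per_device // micro_batch_size  # 16
for iter_num in range(1, 49):
    is_accumulating = iter_num % grad_accum_iters != 0
    if not is_accumulating:
        print(f"optimizer.step() at iter {iter_num}")          # fires at 16, 32 and 48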
fabric.print(instruction) sample = {"instruction": instruction, "input": ""} prompt = generate_prompt(sample) diff --git a/llm-lora-finetuning/finetune/adapter_v2.py b/llm-lora-finetuning/finetune/adapter_v2.py index d30ff127..ac7de327 100644 --- a/llm-lora-finetuning/finetune/adapter_v2.py +++ b/llm-lora-finetuning/finetune/adapter_v2.py @@ -76,7 +76,9 @@ def setup( plugins = None if quantize is not None and quantize.startswith("bnb."): if "mixed" in precision: - raise ValueError("Quantization and mixed precision is not supported.") + raise ValueError( + "Quantization and mixed precision is not supported." + ) dtype = { "16-true": torch.float16, "bf16-true": torch.bfloat16, @@ -138,7 +140,9 @@ def main( check_valid_checkpoint_dir(io.checkpoint_dir) - fabric.seed_everything(1337) # same seed for every process to init model (FSDP) + fabric.seed_everything( + 1337 + ) # same seed for every process to init model (FSDP) if fabric.global_rank == 0: os.makedirs(io.out_dir, exist_ok=True) @@ -147,7 +151,9 @@ def main( val_data = torch.load(io.val_data_dir / "test.pt") checkpoint_path = io.checkpoint_dir / "lit_model.pth" - fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}") + fabric.print( + f"Loading model {str(checkpoint_path)!r} with {config.__dict__}" + ) with fabric.init_module(empty_init=(devices > 1)): model = GPT(config) mark_only_adapter_v2_as_trainable(model) @@ -199,7 +205,9 @@ def main( ) fabric.print(f"Training time: {(time.perf_counter()-train_time):.2f}s") if fabric.device.type == "cuda": - fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") + fabric.print( + f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB" + ) # Save the final checkpoint at the end of training save_path = io.out_dir / "lit_model_adapter_finetuned.pth" @@ -220,7 +228,9 @@ def fit( ) -> None: tokenizer = Tokenizer(io.checkpoint_dir) longest_seq_length, longest_seq_ix = get_longest_seq_length(train_data) - model.max_seq_length = min(longest_seq_length, train.max_seq_length or float("inf")) + model.max_seq_length = min( + longest_seq_length, train.max_seq_length or float("inf") + ) fabric.print( f"The longest sequence length in the train data is {longest_seq_length}, the model's maximum sequence length is" f" {model.max_seq_length} and context length is {model.config.block_size}" @@ -251,7 +261,9 @@ def fit( longest_seq_ix if iter_num == 1 else None, ) - is_accumulating = iter_num % train.gradient_accumulation_iters(devices) != 0 + is_accumulating = ( + iter_num % train.gradient_accumulation_iters(devices) != 0 + ) with fabric.no_backward_sync(model, enabled=is_accumulating): logits = model(input_ids, lm_head_chunk_size=128) # shift the targets such that output n predicts token n+1 @@ -283,7 +295,9 @@ def fit( if not is_accumulating and step_count % eval.interval == 0: t0 = time.perf_counter() - val_loss = validate(fabric, model, val_data, tokenizer, eval, train) + val_loss = validate( + fabric, model, val_data, tokenizer, eval, train + ) t1 = time.perf_counter() - t0 fabric.print( f"iter {iter_num}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f} ms" @@ -318,9 +332,7 @@ def validate( val_loss = losses.mean() # produce an example: - instruction = ( - "Recommend a movie for me to watch during the weekend and explain the reason." - ) + instruction = "Recommend a movie for me to watch during the weekend and explain the reason." 
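# validate() passes this instruction through generate_prompt() from scripts/prepare_alpaca.py
# (see the lines that follow; the function itself is not reproduced in this diff). The
# rendering below is an illustrative Alpaca-style template, not a verbatim copy of it.
sample = {
    "instruction": "Recommend a movie for me to watch during the weekend and explain the reason.",
    "input": "",
}
prompt = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    f"### Instruction:\n{sample['instruction']}\n\n### Response:\n"
)
print(prompt)  # this string is then tokenized and fed to generate() for a sample completion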
fabric.print(instruction) sample = {"instruction": instruction, "input": ""} prompt = generate_prompt(sample) diff --git a/llm-lora-finetuning/finetune/full.py b/llm-lora-finetuning/finetune/full.py index bff8de53..02e28a72 100644 --- a/llm-lora-finetuning/finetune/full.py +++ b/llm-lora-finetuning/finetune/full.py @@ -105,7 +105,9 @@ def main( check_valid_checkpoint_dir(io.checkpoint_dir) - fabric.seed_everything(1337) # same seed for every process to init model (FSDP) + fabric.seed_everything( + 1337 + ) # same seed for every process to init model (FSDP) if fabric.global_rank == 0: os.makedirs(io.out_dir, exist_ok=True) @@ -114,7 +116,9 @@ def main( val_data = torch.load(io.val_data_dir / "test.pt") checkpoint_path = io.checkpoint_dir / "lit_model.pth" - fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}") + fabric.print( + f"Loading model {str(checkpoint_path)!r} with {config.__dict__}" + ) with fabric.init_module(empty_init=(devices > 1)): model = GPT(config) @@ -157,10 +161,14 @@ def main( fit(fabric, state, train_data, val_data, devices, resume, io, train, eval) fabric.print(f"Training time: {(time.perf_counter()-train_time):.2f}s") if fabric.device.type == "cuda": - fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") + fabric.print( + f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB" + ) # Save the final checkpoint at the end of training - fabric.save(io.out_dir / "lit_model_finetuned.pth", {"model": state["model"]}) + fabric.save( + io.out_dir / "lit_model_finetuned.pth", {"model": state["model"]} + ) def fit( @@ -179,7 +187,9 @@ def fit( scheduler = state["scheduler"] tokenizer = Tokenizer(io.checkpoint_dir) longest_seq_length, longest_seq_ix = get_longest_seq_length(train_data) - model.max_seq_length = min(longest_seq_length, train.max_seq_length or float("inf")) + model.max_seq_length = min( + longest_seq_length, train.max_seq_length or float("inf") + ) fabric.print( f"The longest sequence length in the train data is {longest_seq_length}, the model's maximum sequence length is" f" {model.max_seq_length} and context length is {model.config.block_size}" @@ -201,7 +211,9 @@ def fit( for resume_iter in range(initial_iter): get_batch(fabric, train_data, None) if resume_iter % 1000 == 0: - fabric.print(f"Resuming dataset: {resume_iter} / {initial_iter}") + fabric.print( + f"Resuming dataset: {resume_iter} / {initial_iter}" + ) fabric.barrier() fabric.print( f"Resuming data loader finished. 
Took {time.perf_counter() - resume_t0:.1f} seconds to reach iteration" @@ -214,7 +226,9 @@ def fit( ).to(fabric.device) fabric.barrier() - for state["iter_num"] in range(state["iter_num"] + 1, train.max_iters(devices) + 1): + for state["iter_num"] in range( + state["iter_num"] + 1, train.max_iters(devices) + 1 + ): iter_t0 = time.perf_counter() input_ids, targets = get_batch( @@ -271,7 +285,9 @@ def fit( if not is_accumulating and state["step_count"] % eval.interval == 0: t0 = time.perf_counter() - val_loss = validate(fabric, model, val_data, tokenizer, eval, train) + val_loss = validate( + fabric, model, val_data, tokenizer, eval, train + ) t1 = time.perf_counter() - t0 fabric.print( f"iter {state['iter_num']}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f} ms" @@ -279,8 +295,13 @@ def fit( metrics = {"val_loss": val_loss, "val_ppl": math.exp(val_loss)} fabric.log_dict(metrics, step=state["iter_num"]) fabric.barrier() - if not is_accumulating and state["step_count"] % train.save_interval == 0: - checkpoint_path = io.out_dir / f"step-{state['step_count']:06d}.pth" + if ( + not is_accumulating + and state["step_count"] % train.save_interval == 0 + ): + checkpoint_path = ( + io.out_dir / f"step-{state['step_count']:06d}.pth" + ) fabric.print(f"Saving checkpoint to {str(checkpoint_path)!r}") fabric.save(checkpoint_path, state) @@ -309,9 +330,7 @@ def validate( val_loss = losses.mean() # produce an example: - instruction = ( - "Recommend a movie for me to watch during the weekend and explain the reason." - ) + instruction = "Recommend a movie for me to watch during the weekend and explain the reason." fabric.print(instruction) sample = {"instruction": instruction, "input": ""} prompt = generate_prompt(sample) diff --git a/llm-lora-finetuning/finetune/lora.py b/llm-lora-finetuning/finetune/lora.py index 8e72a571..39caa06e 100644 --- a/llm-lora-finetuning/finetune/lora.py +++ b/llm-lora-finetuning/finetune/lora.py @@ -85,7 +85,9 @@ def setup( plugins = None if quantize is not None and quantize.startswith("bnb."): if "mixed" in precision: - raise ValueError("Quantization and mixed precision is not supported.") + raise ValueError( + "Quantization and mixed precision is not supported." 
+ ) dtype = { "16-true": torch.float16, "bf16-true": torch.bfloat16, @@ -170,7 +172,9 @@ def main( check_valid_checkpoint_dir(io.checkpoint_dir) - fabric.seed_everything(1337) # same seed for every process to init model (FSDP) + fabric.seed_everything( + 1337 + ) # same seed for every process to init model (FSDP) if fabric.global_rank == 0: os.makedirs(io.out_dir, exist_ok=True) @@ -179,7 +183,9 @@ def main( val_data = torch.load(io.val_data_dir / "test.pt") checkpoint_path = io.checkpoint_dir / "lit_model.pth" - fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}") + fabric.print( + f"Loading model {str(checkpoint_path)!r} with {config.__dict__}" + ) with fabric.init_module(empty_init=(devices > 1)): model = GPT(config) mark_only_lora_as_trainable(model) @@ -231,7 +237,9 @@ def main( ) fabric.print(f"Training time: {(time.perf_counter()-train_time):.2f}s") if fabric.device.type == "cuda": - fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") + fabric.print( + f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB" + ) # Save the final LoRA checkpoint at the end of training save_path = io.out_dir / "lit_model_lora_finetuned.pth" @@ -252,7 +260,9 @@ def fit( ) -> None: tokenizer = Tokenizer(io.checkpoint_dir) longest_seq_length, longest_seq_ix = get_longest_seq_length(train_data) - model.max_seq_length = min(longest_seq_length, train.max_seq_length or float("inf")) + model.max_seq_length = min( + longest_seq_length, train.max_seq_length or float("inf") + ) fabric.print( f"The longest sequence length in the train data is {longest_seq_length}, the model's maximum sequence length is" f" {model.max_seq_length} and context length is {model.config.block_size}" @@ -283,7 +293,9 @@ def fit( longest_seq_ix if iter_num == 1 else None, ) - is_accumulating = iter_num % train.gradient_accumulation_iters(devices) != 0 + is_accumulating = ( + iter_num % train.gradient_accumulation_iters(devices) != 0 + ) with fabric.no_backward_sync(model, enabled=is_accumulating): logits = model(input_ids, lm_head_chunk_size=128) # shift the targets such that output n predicts token n+1 @@ -315,7 +327,9 @@ def fit( if not is_accumulating and step_count % eval.interval == 0: t0 = time.perf_counter() - val_loss = validate(fabric, model, val_data, tokenizer, eval, train) + val_loss = validate( + fabric, model, val_data, tokenizer, eval, train + ) t1 = time.perf_counter() - t0 fabric.print( f"iter {iter_num}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f} ms" @@ -350,9 +364,7 @@ def validate( val_loss = losses.mean() # produce an example: - instruction = ( - "Recommend a movie for me to watch during the weekend and explain the reason." - ) + instruction = "Recommend a movie for me to watch during the weekend and explain the reason." 
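# A minimal, self-contained sketch of the gradient-accumulation pattern used in the
# fit() loops above (fabric.no_backward_sync gated by gradient_accumulation_iters).
# A toy model, synthetic batches and a hard-coded accumulation factor stand in for the
# repo's TrainArgs/GPT objects; treat it as an illustration, not the training entry point.
import torch
import lightning as L

fabric = L.Fabric(accelerator="cpu", devices=1)
fabric.launch()

model = torch.nn.Linear(8, 8)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
model, optimizer = fabric.setup(model, optimizer)

grad_accum_iters = 4  # stand-in for train.gradient_accumulation_iters(devices)
for iter_num in range(1, 17):
    x, y = torch.randn(2, 8), torch.randn(2, 8)
    is_accumulating = iter_num % grad_accum_iters != 0
    # while accumulating, skip the inter-process gradient sync (a no-op on one device)
    with fabric.no_backward_sync(model, enabled=is_accumulating):
        loss = torch.nn.functional.mse_loss(model(x), y)
        fabric.backward(loss / grad_accum_iters)
    if not is_accumulating:
        # an optimizer step (and, in the real code, LR scheduling, logging and
        # checkpointing) happens only once every grad_accum_iters micro-batches
        optimizer.step()
        optimizer.zero_grad()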
fabric.print(instruction) sample = {"instruction": instruction, "input": ""} prompt = generate_prompt(sample) diff --git a/llm-lora-finetuning/generate/adapter.py b/llm-lora-finetuning/generate/adapter.py index 2d4857c4..3daa8836 100644 --- a/llm-lora-finetuning/generate/adapter.py +++ b/llm-lora-finetuning/generate/adapter.py @@ -29,8 +29,12 @@ def main( prompt: str = "What food do llamas eat?", input: str = "", - adapter_path: Path = Path("out/adapter/alpaca/lit_model_adapter_finetuned.pth"), - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + adapter_path: Path = Path( + "out/adapter/alpaca/lit_model_adapter_finetuned.pth" + ), + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), quantize: Optional[ Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"] ] = None, @@ -64,7 +68,9 @@ def main( plugins = None if quantize is not None and quantize.startswith("bnb."): if "mixed" in precision: - raise ValueError("Quantization and mixed precision is not supported.") + raise ValueError( + "Quantization and mixed precision is not supported." + ) dtype = { "16-true": torch.float16, "bf16-true": torch.bfloat16, diff --git a/llm-lora-finetuning/generate/adapter_v2.py b/llm-lora-finetuning/generate/adapter_v2.py index 77780001..6f9d76d4 100644 --- a/llm-lora-finetuning/generate/adapter_v2.py +++ b/llm-lora-finetuning/generate/adapter_v2.py @@ -29,8 +29,12 @@ def main( prompt: str = "What food do llamas eat?", input: str = "", - adapter_path: Path = Path("out/adapter_v2/alpaca/lit_model_adapter_finetuned.pth"), - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + adapter_path: Path = Path( + "out/adapter_v2/alpaca/lit_model_adapter_finetuned.pth" + ), + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), quantize: Optional[ Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"] ] = None, @@ -64,7 +68,9 @@ def main( plugins = None if quantize is not None and quantize.startswith("bnb."): if "mixed" in precision: - raise ValueError("Quantization and mixed precision is not supported.") + raise ValueError( + "Quantization and mixed precision is not supported." + ) dtype = { "16-true": torch.float16, "bf16-true": torch.bfloat16, diff --git a/llm-lora-finetuning/generate/base.py b/llm-lora-finetuning/generate/base.py index 75dce2aa..f8cfa7bd 100644 --- a/llm-lora-finetuning/generate/base.py +++ b/llm-lora-finetuning/generate/base.py @@ -122,7 +122,9 @@ def main( max_new_tokens: int = 50, top_k: Optional[int] = 200, temperature: float = 0.8, - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), quantize: Optional[ Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"] ] = None, @@ -151,7 +153,9 @@ def main( plugins = None if quantize is not None and quantize.startswith("bnb."): if "mixed" in precision: - raise ValueError("Quantization and mixed precision is not supported.") + raise ValueError( + "Quantization and mixed precision is not supported." 
+ ) dtype = { "16-true": torch.float16, "bf16-true": torch.bfloat16, diff --git a/llm-lora-finetuning/generate/full.py b/llm-lora-finetuning/generate/full.py index 6b2f9443..cc1da495 100644 --- a/llm-lora-finetuning/generate/full.py +++ b/llm-lora-finetuning/generate/full.py @@ -29,7 +29,9 @@ def main( prompt: str = "What food do llamas eat?", input: str = "", finetuned_path: Path = Path("out/full/alpaca/lit_model_finetuned.pth"), - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), quantize: Optional[ Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"] ] = None, @@ -63,7 +65,9 @@ def main( plugins = None if quantize is not None and quantize.startswith("bnb."): if "mixed" in precision: - raise ValueError("Quantization and mixed precision is not supported.") + raise ValueError( + "Quantization and mixed precision is not supported." + ) dtype = { "16-true": torch.float16, "bf16-true": torch.bfloat16, diff --git a/llm-lora-finetuning/generate/lora.py b/llm-lora-finetuning/generate/lora.py index 1f2e5bf2..0b30b701 100644 --- a/llm-lora-finetuning/generate/lora.py +++ b/llm-lora-finetuning/generate/lora.py @@ -30,7 +30,9 @@ def main( prompt: str = "What food do llamas eat?", input: str = "", lora_path: Path = Path("out/lora/alpaca/lit_model_lora_finetuned.pth"), - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), quantize: Optional[ Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"] ] = None, @@ -73,7 +75,9 @@ def main( plugins = None if quantize is not None and quantize.startswith("bnb."): if "mixed" in precision: - raise ValueError("Quantization and mixed precision is not supported.") + raise ValueError( + "Quantization and mixed precision is not supported." + ) dtype = { "16-true": torch.float16, "bf16-true": torch.bfloat16, diff --git a/llm-lora-finetuning/generate/sequentially.py b/llm-lora-finetuning/generate/sequentially.py index d2489602..d2dde4bb 100644 --- a/llm-lora-finetuning/generate/sequentially.py +++ b/llm-lora-finetuning/generate/sequentially.py @@ -32,7 +32,9 @@ @torch.inference_mode() -def sequential(model: GPT, root: torch.device, max_seq_length: int, devices: int): +def sequential( + model: GPT, root: torch.device, max_seq_length: int, devices: int +): if model.config.n_layer % devices: # TODO: support smarter partitioning schemes raise NotImplementedError( @@ -40,7 +42,9 @@ def sequential(model: GPT, root: torch.device, max_seq_length: int, devices: int ) layers_per_rank = model.config.n_layer // devices # dictates where each block should be instantiated - mapping = layer_to_device(model, chunk_on=Block, chunk_size=layers_per_rank) + mapping = layer_to_device( + model, chunk_on=Block, chunk_size=layers_per_rank + ) # materialize each block on the appropriate device for path, target_index in mapping.items(): @@ -48,7 +52,9 @@ def sequential(model: GPT, root: torch.device, max_seq_length: int, devices: int target_device = torch.device(root.type, target_index) print(f"Moving {path!r} to {target_device}", file=sys.stderr) # submodules loaded by the checkpoint will be on CPU (if no quantization). 
move them - replace_device(submodule, replace=torch.device("cpu"), by=target_device) + replace_device( + submodule, replace=torch.device("cpu"), by=target_device + ) # in case the checkpoint was partial, materialize leftover metas _materialize_meta_tensors(submodule, target_device) # and build the kv cache @@ -80,7 +86,9 @@ def sequential(model: GPT, root: torch.device, max_seq_length: int, devices: int partial(move_block_input, target_device) ) if layer_num == model.config.n_layer - 1: - submodule.register_forward_hook(partial(move_block_output, root)) + submodule.register_forward_hook( + partial(move_block_output, root) + ) return model @@ -126,7 +134,9 @@ def replace_device( devices = {t.device for t in tensors.values()} if len(devices) != 1: # since this is using `submodule.to`, different devices in the same submodule is a problem - path_to_device = {f"{name}.{p}": t.device for p, t in tensors.items()} + path_to_device = { + f"{name}.{p}": t.device for p, t in tensors.items() + } raise ValueError(f"Found multiple devices: {path_to_device}") if devices.pop() == replace: submodule.to(by) @@ -141,7 +151,9 @@ def main( max_new_tokens: int = 50, top_k: Optional[int] = 200, temperature: float = 0.8, - checkpoint_dir: Path = Path("checkpoints/mistralai/Mistral-7B-Instruct-v0.1"), + checkpoint_dir: Path = Path( + "checkpoints/mistralai/Mistral-7B-Instruct-v0.1" + ), quantize: Optional[ Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq"] ] = None, @@ -171,7 +183,9 @@ def main( if compile: raise NotImplementedError # untested if "mixed" in precision: - raise ValueError("Quantization and mixed precision is not supported.") + raise ValueError( + "Quantization and mixed precision is not supported." + ) dtype = { "16-true": torch.float16, "bf16-true": torch.bfloat16, @@ -214,7 +228,9 @@ def main( ) t0 = time.perf_counter() - state_dict = torch.load(str(checkpoint_path), mmap=True, map_location="cpu") + state_dict = torch.load( + str(checkpoint_path), mmap=True, map_location="cpu" + ) # TODO: this assumes that the model fits on CPU. Use lazy_load and make the materialization checkpoint aware model.load_state_dict(state_dict, assign=True) print( @@ -225,7 +241,9 @@ def main( model = fabric.setup_module(model, move_to_device=False) t0 = time.perf_counter() - model = sequential(model, fabric.device, max_returned_tokens, total_devices) + model = sequential( + model, fabric.device, max_returned_tokens, total_devices + ) print( f"Time to sequential-ize the model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr, @@ -276,8 +294,8 @@ def main( if __name__ == "__main__": torch.set_float32_matmul_precision("high") - logging.getLogger("lightning.fabric.plugins.precision.bitsandbytes").setLevel( - logging.DEBUG - ) + logging.getLogger( + "lightning.fabric.plugins.precision.bitsandbytes" + ).setLevel(logging.DEBUG) CLI(main) diff --git a/llm-lora-finetuning/generate/tp.py b/llm-lora-finetuning/generate/tp.py index a4f75661..e8c7e1ef 100644 --- a/llm-lora-finetuning/generate/tp.py +++ b/llm-lora-finetuning/generate/tp.py @@ -43,7 +43,9 @@ def tensor_parallel_linear( f"This linear's {attr} value ({size}) is not evenly divisible by the world size ({world_size})" ) - shard = torch.tensor_split(linear.weight, world_size, dim=dim)[fabric.global_rank] + shard = torch.tensor_split(linear.weight, world_size, dim=dim)[ + fabric.global_rank + ] # overwrite `.data` instead of recreating the parameter for quantization (bitsandbytes) support. 
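# A self-contained sketch of why the column-wise / row-wise sharding done by
# tensor_parallel_linear, combined with the all-reduce forward hook on the MLP,
# reproduces the unsharded output. It emulates world_size=2 inside a single process
# by summing the per-"rank" partial results; the shard dimensions (0 for "colwise",
# 1 for "rowwise") follow the convention used above, and the MLP is a simplified
# fc -> gelu -> proj stack rather than the repo's actual GptNeoxMLP module.
import torch
import torch.nn.functional as F

torch.manual_seed(0)
world_size = 2
x = torch.randn(3, 8)                       # (batch, n_embd)
fc = torch.nn.Linear(8, 16, bias=False)     # to be sharded column-wise (weight dim 0)
proj = torch.nn.Linear(16, 8, bias=False)   # to be sharded row-wise (weight dim 1)

reference = proj(F.gelu(fc(x)))             # unsharded forward pass

fc_shards = torch.tensor_split(fc.weight, world_size, dim=0)
proj_shards = torch.tensor_split(proj.weight, world_size, dim=1)

# each "rank" sees only its slice of the hidden features; summing the partial
# projections plays the role of the all_reduce_output hook
partials = [
    F.linear(F.gelu(F.linear(x, fc_shards[rank])), proj_shards[rank])
    for rank in range(world_size)
]
torch.testing.assert_close(sum(partials), reference)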
# the bitsandbytes linear classes use custom `torch.nn.Parameter` subclasses linear.weight.data = shard @@ -51,7 +53,9 @@ def tensor_parallel_linear( if linear.bias is not None and dim == 0: shard = torch.tensor_split(linear.bias, world_size)[fabric.global_rank] - linear.bias = torch.nn.Parameter(shard, requires_grad=linear.bias.requires_grad) + linear.bias = torch.nn.Parameter( + shard, requires_grad=linear.bias.requires_grad + ) def tensor_parallel_mlp( @@ -61,11 +65,15 @@ def tensor_parallel_mlp( tensor_parallel_linear(fabric, mlp.fc_1, "colwise") tensor_parallel_linear(fabric, mlp.fc_2, "colwise") tensor_parallel_linear(fabric, mlp.proj, "rowwise") - mlp.register_forward_hook(partial(all_reduce_output, fabric.world_size)) + mlp.register_forward_hook( + partial(all_reduce_output, fabric.world_size) + ) elif isinstance(mlp, GptNeoxMLP): tensor_parallel_linear(fabric, mlp.fc, "colwise") tensor_parallel_linear(fabric, mlp.proj, "rowwise") - mlp.register_forward_hook(partial(all_reduce_output, fabric.world_size)) + mlp.register_forward_hook( + partial(all_reduce_output, fabric.world_size) + ) elif isinstance(mlp, LLaMAMoE): # we use expert slicing across ranks, alternatively, we could create a expert parallelism group # when the number of experts is a multiple of the world size @@ -115,7 +123,9 @@ def main( max_new_tokens: int = 50, top_k: Optional[int] = 200, temperature: float = 0.8, - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), quantize: Optional[ Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq"] ] = None, @@ -145,7 +155,9 @@ def main( if compile: raise NotImplementedError # untested if "mixed" in precision: - raise ValueError("Quantization and mixed precision is not supported.") + raise ValueError( + "Quantization and mixed precision is not supported." 
+ ) dtype = { "16-true": torch.float16, "bf16-true": torch.bfloat16, @@ -192,7 +204,9 @@ def main( for rank in range(fabric.world_size): if fabric.global_rank == rank: t0 = time.perf_counter() - state_dict = torch.load(str(checkpoint_path), mmap=True, map_location="cpu") + state_dict = torch.load( + str(checkpoint_path), mmap=True, map_location="cpu" + ) model.load_state_dict(state_dict, assign=True) print( f"[{rank}] Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", @@ -264,7 +278,9 @@ def main( if __name__ == "__main__": torch.set_float32_matmul_precision("high") - bnb_logger = logging.getLogger("lightning.fabric.plugins.precision.bitsandbytes") + bnb_logger = logging.getLogger( + "lightning.fabric.plugins.precision.bitsandbytes" + ) bnb_logger.setLevel(logging.DEBUG) bnb_logger.debug = rank_zero_only(bnb_logger.debug) diff --git a/llm-lora-finetuning/lit_gpt/adapter.py b/llm-lora-finetuning/lit_gpt/adapter.py index 4ad6fc0c..61744419 100644 --- a/llm-lora-finetuning/lit_gpt/adapter.py +++ b/llm-lora-finetuning/lit_gpt/adapter.py @@ -43,7 +43,9 @@ def __init__(self, config: Config) -> None: self.transformer = nn.ModuleDict( dict( wte=nn.Embedding(config.padded_vocab_size, config.n_embd), - h=nn.ModuleList(Block(config, i) for i in range(config.n_layer)), + h=nn.ModuleList( + Block(config, i) for i in range(config.n_layer) + ), ln_f=config.norm_class(config.n_embd, eps=config.norm_eps), ) ) @@ -73,13 +75,17 @@ def forward( sin = self.sin[:T] mask = None - x = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd) + x = self.transformer.wte( + idx + ) # token embeddings of shape (b, t, n_embd) for block in self.transformer.h: x = block(x, cos, sin, mask, input_pos) x = self.transformer.ln_f(x) if lm_head_chunk_size > 0: # chunk the lm head logits to reduce the peak memory used by autograd - return [self.lm_head(x_i) for x_i in x.split(lm_head_chunk_size, dim=1)] + return [ + self.lm_head(x_i) for x_i in x.split(lm_head_chunk_size, dim=1) + ] return self.lm_head(x) # (b, t, vocab_size) @classmethod @@ -117,11 +123,17 @@ def __init__(self, config: Config, block_idx: int) -> None: super().__init__(config) if block_idx >= config.adapter_start_layer: # adapter embedding layer - self.adapter_wte = nn.Embedding(config.adapter_prompt_length, config.n_embd) + self.adapter_wte = nn.Embedding( + config.adapter_prompt_length, config.n_embd + ) # gate for adaption - self.gating_factor = torch.nn.Parameter(torch.zeros(1, 1, config.n_head, 1)) + self.gating_factor = torch.nn.Parameter( + torch.zeros(1, 1, config.n_head, 1) + ) # kv cache for inference - self.adapter_kv_cache: Optional[Tuple[torch.Tensor, torch.Tensor]] = None + self.adapter_kv_cache: Optional[ + Tuple[torch.Tensor, torch.Tensor] + ] = None self.block_idx = block_idx def scaled_dot_product_attention( @@ -157,8 +169,12 @@ def scaled_dot_product_attention( # for MHA this is a no-op ak = ak.repeat_interleave(q_per_kv, dim=2) av = av.repeat_interleave(q_per_kv, dim=2) - ak = ak.view(1, -1, aT, self.config.head_size) # (1, nh_ak, aT, hs) - av = av.view(1, -1, aT, self.config.head_size) # (1, nh_av, aT, hs) + ak = ak.view( + 1, -1, aT, self.config.head_size + ) # (1, nh_ak, aT, hs) + av = av.view( + 1, -1, aT, self.config.head_size + ) # (1, nh_av, aT, hs) self.adapter_kv_cache = (ak, av) T = q.size(2) @@ -173,9 +189,9 @@ def _load_from_state_dict( self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any ) -> None: """For compatibility with older checkpoints.""" - if (key := prefix + "gating_factor") 
in state_dict and state_dict[key].size( - 1 - ) == self.config.n_head: + if (key := prefix + "gating_factor") in state_dict and state_dict[ + key + ].size(1) == self.config.n_head: state_dict[key] = state_dict[key].permute(0, 2, 1, 3) super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) diff --git a/llm-lora-finetuning/lit_gpt/adapter_v2.py b/llm-lora-finetuning/lit_gpt/adapter_v2.py index 206d9395..5d389471 100644 --- a/llm-lora-finetuning/lit_gpt/adapter_v2.py +++ b/llm-lora-finetuning/lit_gpt/adapter_v2.py @@ -79,7 +79,9 @@ def __init__(self, config: Config) -> None: self.transformer = nn.ModuleDict( dict( wte=nn.Embedding(config.padded_vocab_size, config.n_embd), - h=nn.ModuleList(Block(config, i) for i in range(config.n_layer)), + h=nn.ModuleList( + Block(config, i) for i in range(config.n_layer) + ), ln_f=config.norm_class(config.n_embd, eps=config.norm_eps), ) ) @@ -145,11 +147,17 @@ def __init__(self, config: Config, block_idx: int) -> None: if block_idx >= config.adapter_start_layer: # adapter embedding layer - self.adapter_wte = nn.Embedding(config.adapter_prompt_length, config.n_embd) + self.adapter_wte = nn.Embedding( + config.adapter_prompt_length, config.n_embd + ) # gate for adaption - self.gating_factor = torch.nn.Parameter(torch.zeros(1, 1, config.n_head, 1)) + self.gating_factor = torch.nn.Parameter( + torch.zeros(1, 1, config.n_head, 1) + ) # kv cache for inference - self.adapter_kv_cache: Optional[Tuple[torch.Tensor, torch.Tensor]] = None + self.adapter_kv_cache: Optional[ + Tuple[torch.Tensor, torch.Tensor] + ] = None self.block_idx = block_idx self.config = config @@ -166,9 +174,9 @@ def _load_from_state_dict( } state_dict = map_old_state_dict_weights(state_dict, mapping, prefix) # For compatibility with older checkpoints - if (key := prefix + "gating_factor") in state_dict and state_dict[key].size( - 1 - ) == self.config.n_head: + if (key := prefix + "gating_factor") in state_dict and state_dict[ + key + ].size(1) == self.config.n_head: state_dict[key] = state_dict[key].permute(0, 2, 1, 3) super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) @@ -240,7 +248,9 @@ class LLaMAMoE(lit_gpt.model.LLaMAMoE): def __init__(self, config: Config) -> None: nn.Module.__init__(self) self.gate = AdapterV2Linear(config.n_embd, config.n_expert, bias=False) - self.experts = nn.ModuleList(LLaMAMLP(config) for _ in range(config.n_expert)) + self.experts = nn.ModuleList( + LLaMAMLP(config) for _ in range(config.n_expert) + ) self.config = config diff --git a/llm-lora-finetuning/lit_gpt/args.py b/llm-lora-finetuning/lit_gpt/args.py index 62217076..264c8f51 100644 --- a/llm-lora-finetuning/lit_gpt/args.py +++ b/llm-lora-finetuning/lit_gpt/args.py @@ -37,13 +37,17 @@ class TrainArgs: def max_iters(self, devices: int) -> int: """Number of iterations""" - max_iters = self.epochs * self.epoch_size // devices // self.micro_batch_size + max_iters = ( + self.epochs * self.epoch_size // devices // self.micro_batch_size + ) assert max_iters > 0 return max_iters def gradient_accumulation_iters(self, devices: int) -> int: """Number of iterations between gradient synchronizations""" - gradient_accumulation_iters = self.batch_size(devices) // self.micro_batch_size + gradient_accumulation_iters = ( + self.batch_size(devices) // self.micro_batch_size + ) assert gradient_accumulation_iters > 0 return gradient_accumulation_iters diff --git a/llm-lora-finetuning/lit_gpt/config.py b/llm-lora-finetuning/lit_gpt/config.py index bca740d5..dab1523b 100644 --- 
a/llm-lora-finetuning/lit_gpt/config.py +++ b/llm-lora-finetuning/lit_gpt/config.py @@ -54,7 +54,9 @@ class Config: shared_attention_norm: bool = False _norm_class: Literal["LayerNorm", "RMSNorm"] = "LayerNorm" norm_eps: float = 1e-5 - _mlp_class: Literal["GptNeoxMLP", "LLaMAMLP", "GemmaMLP", "LLaMAMoE"] = "GptNeoxMLP" + _mlp_class: Literal[ + "GptNeoxMLP", "LLaMAMLP", "GemmaMLP", "LLaMAMoE" + ] = "GptNeoxMLP" gelu_approximate: str = "none" intermediate_size: Optional[int] = None rope_condense_ratio: int = 1 @@ -88,7 +90,9 @@ def __post_init__(self): # compute the intermediate size for MLP if not set if self.intermediate_size is None: if self._mlp_class == "LLaMAMLP": - raise ValueError("The config needs to set the `intermediate_size`") + raise ValueError( + "The config needs to set the `intermediate_size`" + ) self.intermediate_size = 4 * self.n_embd self.rope_n_elem = int(self.rotary_percentage * self.head_size) @@ -99,7 +103,9 @@ def from_name(cls, name: str, **kwargs: Any) -> Self: # search through all `config['hf_config']['name']` try: conf_dict = next( - config for config in configs if name == config["hf_config"]["name"] + config + for config in configs + if name == config["hf_config"]["name"] ) except StopIteration: raise ValueError(f"{name!r} is not a supported config name") @@ -117,7 +123,9 @@ def from_json(cls, path: Union[str, Path], **kwargs: Any) -> Self: with open(path, encoding="utf-8") as fp: json_kwargs = json.load(fp) if "condense_ratio" in json_kwargs: # legacy name - json_kwargs["rope_condense_ratio"] = json_kwargs.pop("condense_ratio") + json_kwargs["rope_condense_ratio"] = json_kwargs.pop( + "condense_ratio" + ) if "condense_ratio" in kwargs: # legacy name kwargs["rope_condense_ratio"] = kwargs.pop("condense_ratio") if "org" in json_kwargs: # legacy name @@ -360,7 +368,9 @@ def norm_class(self) -> Type: # https://huggingface.co/togethercomputer/RedPajama-INCITE-Base-3B-v1/blob/main/config.json dict( name="RedPajama-INCITE-{}-3B-v1", - hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-{}-3B-v1"), + hf_config=dict( + org="togethercomputer", name="RedPajama-INCITE-{}-3B-v1" + ), block_size=2048, n_layer=32, n_embd=2560, @@ -381,7 +391,9 @@ def norm_class(self) -> Type: # this redirects to the checkpoint above. 
kept for those who had the old weights already downloaded dict( name="RedPajama-INCITE-{}-7B-v0.1", - hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-{}-7B-v0.1"), + hf_config=dict( + org="togethercomputer", name="RedPajama-INCITE-{}-7B-v0.1" + ), block_size=2048, n_layer=32, padding_multiple=256, @@ -1254,7 +1266,9 @@ def norm_class(self) -> Type: # https://huggingface.co/stabilityai/stablecode-completion-alpha-3b/blob/main/config.json dict( name="stablecode-completion-alpha-3b", - hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b"), + hf_config=dict( + org="stabilityai", name="stablecode-completion-alpha-3b" + ), block_size=16384, vocab_size=49152, n_layer=32, @@ -1263,7 +1277,9 @@ def norm_class(self) -> Type: # https://huggingface.co/stabilityai/stablecode-completion-alpha-3b-4k/blob/main/config.json dict( name="stablecode-completion-alpha-3b-4k", - hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b-4k"), + hf_config=dict( + org="stabilityai", name="stablecode-completion-alpha-3b-4k" + ), vocab_size=49152, n_layer=32, n_embd=2560, @@ -1446,7 +1462,9 @@ def norm_class(self) -> Type: # https://huggingface.co/Trelis/Llama-2-7b-chat-hf-function-calling-v2/blob/main/config.json dict( name="Llama-2-7b-chat-hf-function-calling-v2", - hf_config=dict(org="Trelis", name="Llama-2-7b-chat-hf-function-calling-v2"), + hf_config=dict( + org="Trelis", name="Llama-2-7b-chat-hf-function-calling-v2" + ), padding_multiple=64, n_layer=32, rotary_percentage=1.0, diff --git a/llm-lora-finetuning/lit_gpt/lora.py b/llm-lora-finetuning/lit_gpt/lora.py index 0df9ae3f..84d42543 100644 --- a/llm-lora-finetuning/lit_gpt/lora.py +++ b/llm-lora-finetuning/lit_gpt/lora.py @@ -246,7 +246,9 @@ def __init__( torch.zeros((r * sum(enable_lora), in_features)) ) # (4, 128) enable_q, enable_k, enable_v = enable_lora - self.kv_embd_size = self.linear.in_features // (n_head // n_query_groups) + self.kv_embd_size = self.linear.in_features // ( + n_head // n_query_groups + ) # qkv_shapes will be used to split a tensor with weights correctly qkv_shapes = ( self.linear.in_features * enable_q, @@ -282,16 +284,24 @@ def __init__( ind = range(out_features) self.lora_ind = [] if enable_q: - q_ind = [x for x in ind if (x // head_size) % total_qkv < total_qkv - 2] + q_ind = [ + x + for x in ind + if (x // head_size) % total_qkv < total_qkv - 2 + ] self.lora_ind.extend(q_ind) if enable_k: k_ind = [ - x for x in ind if (x // head_size) % total_qkv == total_qkv - 2 + x + for x in ind + if (x // head_size) % total_qkv == total_qkv - 2 ] self.lora_ind.extend(k_ind) if enable_v: v_ind = [ - x for x in ind if (x // head_size) % total_qkv == total_qkv - 1 + x + for x in ind + if (x // head_size) % total_qkv == total_qkv - 1 ] self.lora_ind.extend(v_ind) self.reset_parameters() @@ -352,18 +362,24 @@ def zero_pad(self, x: torch.Tensor) -> torch.Tensor: # Note: double transpose (in the beginning and in the end) is basically a guard for two-dimensional tensors # for example when we want to merge/unmerge LoRA weights and pretrained weights x = x.transpose(0, 1) - result = x.new_zeros((*x.shape[:-1], self.linear.out_features)) # (64, 64, 384) + result = x.new_zeros( + (*x.shape[:-1], self.linear.out_features) + ) # (64, 64, 384) result = result.view(-1, self.linear.out_features) # (4096, 384) result = result.index_copy( 1, torch.tensor(self.lora_ind, device=result.device), x.reshape(-1, sum(self.qkv_shapes)), ) # (4096, 256) - return result.view((*x.shape[:-1], 
self.linear.out_features)).transpose( + return result.view( + (*x.shape[:-1], self.linear.out_features) + ).transpose( 0, 1 ) # (64, 64, 384) - def conv1d(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: + def conv1d( + self, input: torch.Tensor, weight: torch.Tensor + ) -> torch.Tensor: """An extension of the `torch.nn.functional.conv1d` function with a logic specific to grouped queries. If the number of heads is equal to the number of query groups - grouped queries are disabled @@ -394,8 +410,12 @@ def conv1d(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: # ⚬ C_output': embeddings size for each LoRA layer (not equal in size) # ⚬ r: rank of all LoRA layers (equal in size) - input_splitted = input.chunk(sum(self.enable_lora), dim=1) # N * (B, C // N, T) - weight_splitted = weight.split(self.qkv_shapes) # N * (C_output', r, 1) + input_splitted = input.chunk( + sum(self.enable_lora), dim=1 + ) # N * (B, C // N, T) + weight_splitted = weight.split( + self.qkv_shapes + ) # N * (C_output', r, 1) return torch.cat( [F.conv1d(a, b) for a, b in zip(input_splitted, weight_splitted)], dim=1, # (B, C_output', T) @@ -410,7 +430,9 @@ def get_lora_AB(self) -> torch.Tensor: lora = self.conv1d( self.lora_A.data.unsqueeze(0), # (4, 128) -> (1, 4, 128) self.lora_B.data.unsqueeze(-1), # (256, 2) -> (256, 2, 1) - ).squeeze(0) # (1, 4, 128) @ (256, 2, 1) -> (1, 256, 128) -> (256, 128) + ).squeeze( + 0 + ) # (1, 4, 128) @ (256, 2, 1) -> (1, 256, 128) -> (256, 128) return self.zero_pad( lora * self.scaling ) # (256, 128) after zero_pad (384, 128) @@ -489,7 +511,11 @@ def mark_only_lora_as_trainable(model: nn.Module, bias: str = "none") -> None: p.requires_grad = True elif bias == "lora_only": for m in model.modules(): - if isinstance(m, LoRALayer) and hasattr(m, "bias") and m.bias is not None: + if ( + isinstance(m, LoRALayer) + and hasattr(m, "bias") + and m.bias is not None + ): m.bias.requires_grad = True else: raise NotImplementedError @@ -574,13 +600,17 @@ def forward( sin = self.sin[:T] mask = None - x = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd) + x = self.transformer.wte( + idx + ) # token embeddings of shape (b, t, n_embd) for block in self.transformer.h: x = block(x, cos, sin, mask, input_pos) x = self.transformer.ln_f(x) if lm_head_chunk_size > 0: # chunk the lm head logits to reduce the peak memory used by autograd - return [self.lm_head(x_i) for x_i in x.split(lm_head_chunk_size, dim=1)] + return [ + self.lm_head(x_i) for x_i in x.split(lm_head_chunk_size, dim=1) + ] return self.lm_head(x) # (B, T, vocab_size) @classmethod @@ -764,7 +794,9 @@ def __init__(self, config: Config) -> None: lora_alpha=config.alpha, lora_dropout=config.dropout, ) - self.experts = nn.ModuleList(LLaMAMLP(config) for _ in range(config.n_expert)) + self.experts = nn.ModuleList( + LLaMAMLP(config) for _ in range(config.n_expert) + ) self.config = config diff --git a/llm-lora-finetuning/lit_gpt/model.py b/llm-lora-finetuning/lit_gpt/model.py index 6413634b..1ff378fd 100644 --- a/llm-lora-finetuning/lit_gpt/model.py +++ b/llm-lora-finetuning/lit_gpt/model.py @@ -94,7 +94,9 @@ def forward( sin = self.sin[:T] mask = None - x = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd) + x = self.transformer.wte( + idx + ) # token embeddings of shape (b, t, n_embd) if self.config.scale_embeddings: x = x * (self.config.n_embd**0.5) @@ -135,7 +137,10 @@ def set_kv_cache( batch_size, max_seq_length, rope_cache_length, device, dtype ) - if self.mask_cache is None or 
self.mask_cache.size(3) != max_seq_length: + if ( + self.mask_cache is None + or self.mask_cache.size(3) != max_seq_length + ): # passing `attn_mask` to SDPA disables the flash implementation. since we only need the mask # for the kv-cache support (only during inference), we only create it in that situation self.mask_cache = build_mask_cache(max_seq_length, device) @@ -212,17 +217,23 @@ def forward( B, T, C, - ) = x.size() # batch size, sequence length, embedding dimensionality (n_embd) + ) = ( + x.size() + ) # batch size, sequence length, embedding dimensionality (n_embd) qkv = self.attn(x) # assemble into a number of query groups to support MHA, MQA and GQA together (see `config.n_query_groups`) q_per_kv = self.config.n_head // self.config.n_query_groups - total_qkv = q_per_kv + 2 # each group has 1+ queries, 1 key, and 1 value + total_qkv = ( + q_per_kv + 2 + ) # each group has 1+ queries, 1 key, and 1 value qkv = qkv.view( B, T, self.config.n_query_groups, total_qkv, self.config.head_size ) - qkv = qkv.permute(0, 2, 3, 1, 4) # (B, n_query_groups, total_qkv, T, hs) + qkv = qkv.permute( + 0, 2, 3, 1, 4 + ) # (B, n_query_groups, total_qkv, T, hs) # split batched computation into three q, k, v = qkv.split((q_per_kv, 1, 1), dim=2) @@ -311,7 +322,9 @@ def build_kv_cache( batch_size, heads, max_seq_length, - rope_cache_length + self.config.head_size - self.config.rope_n_elem, + rope_cache_length + + self.config.head_size + - self.config.rope_n_elem, ) return KVCache(k_shape, v_shape, device=device, dtype=dtype) @@ -319,23 +332,35 @@ def build_kv_cache( class GptNeoxMLP(nn.Module): def __init__(self, config: Config) -> None: super().__init__() - self.fc = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias) - self.proj = nn.Linear(config.intermediate_size, config.n_embd, bias=config.bias) + self.fc = nn.Linear( + config.n_embd, config.intermediate_size, bias=config.bias + ) + self.proj = nn.Linear( + config.intermediate_size, config.n_embd, bias=config.bias + ) self.config = config def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.fc(x) - x = torch.nn.functional.gelu(x, approximate=self.config.gelu_approximate) + x = torch.nn.functional.gelu( + x, approximate=self.config.gelu_approximate + ) return self.proj(x) class LLaMAMLP(nn.Module): def __init__(self, config: Config) -> None: super().__init__() - self.fc_1 = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias) - self.fc_2 = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias) - self.proj = nn.Linear(config.intermediate_size, config.n_embd, bias=config.bias) + self.fc_1 = nn.Linear( + config.n_embd, config.intermediate_size, bias=config.bias + ) + self.fc_2 = nn.Linear( + config.n_embd, config.intermediate_size, bias=config.bias + ) + self.proj = nn.Linear( + config.intermediate_size, config.n_embd, bias=config.bias + ) def forward(self, x: torch.Tensor) -> torch.Tensor: x_fc_1 = self.fc_1(x) @@ -356,7 +381,9 @@ class LLaMAMoE(nn.Module): def __init__(self, config: Config) -> None: super().__init__() self.gate = nn.Linear(config.n_embd, config.n_expert, bias=False) - self.experts = nn.ModuleList(LLaMAMLP(config) for _ in range(config.n_expert)) + self.experts = nn.ModuleList( + LLaMAMLP(config) for _ in range(config.n_expert) + ) self.config = config @@ -369,7 +396,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: B, T, C, - ) = x.size() # batch size, sequence length, embedding dimensionality (n_embd) + ) = ( + x.size() + ) # batch size, sequence length, embedding 
dimensionality (n_embd) x = x.view(-1, C) # (B*T, C) router = self.gate(x) # (B*T, n_expert) probs, indices = torch.topk( @@ -383,7 +412,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: y = torch.zeros_like(x) # (B*T, C) for mask, expert in zip(masks, self.experts): token_idx, expert_idx = torch.where(mask) - y[token_idx] += probs[token_idx, expert_idx, None] * expert(x[token_idx]) + y[token_idx] += probs[token_idx, expert_idx, None] * expert( + x[token_idx] + ) return y.view(B, T, C) @@ -401,7 +432,9 @@ def build_rope_cache( https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license. """ # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$ - theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, device=device).float() / n_elem)) + theta = 1.0 / ( + base ** (torch.arange(0, n_elem, 2, device=device).float() / n_elem) + ) # Create position indexes `[0, 1, ..., seq_len - 1]` seq_idx = torch.arange(seq_len, device=device) / condense_ratio @@ -412,7 +445,9 @@ def build_rope_cache( return torch.cos(idx_theta), torch.sin(idx_theta) -def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor: +def apply_rope( + x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor +) -> torch.Tensor: head_size = x.size(-1) x1 = x[..., : head_size // 2] # (B, nh, T, hs/2) x2 = x[..., head_size // 2 :] # (B, nh, T, hs/2) @@ -460,5 +495,7 @@ def reset_parameters(self) -> None: def build_mask_cache( max_seq_length: int, device: Optional[torch.device] = None ) -> torch.Tensor: - ones = torch.ones((max_seq_length, max_seq_length), device=device, dtype=torch.bool) + ones = torch.ones( + (max_seq_length, max_seq_length), device=device, dtype=torch.bool + ) return torch.tril(ones).unsqueeze(0).unsqueeze(0) diff --git a/llm-lora-finetuning/lit_gpt/packed_dataset.py b/llm-lora-finetuning/lit_gpt/packed_dataset.py index afe9c126..a183d4c2 100644 --- a/llm-lora-finetuning/lit_gpt/packed_dataset.py +++ b/llm-lora-finetuning/lit_gpt/packed_dataset.py @@ -256,7 +256,9 @@ def __init__(self, datasets, seed, weights=None): self._weights = [w / sum(weights) for w in weights] def __iter__(self): - return CombinedDatasetIterator(self._datasets, self._seed, self._weights) + return CombinedDatasetIterator( + self._datasets, self._seed, self._weights + ) class CombinedDatasetIterator: @@ -266,5 +268,7 @@ def __init__(self, datasets, seed, weights): self._rng = random.Random(seed) def __next__(self): - (dataset,) = self._rng.choices(self._datasets, weights=self._weights, k=1) + (dataset,) = self._rng.choices( + self._datasets, weights=self._weights, k=1 + ) return next(dataset) diff --git a/llm-lora-finetuning/lit_gpt/tokenizer.py b/llm-lora-finetuning/lit_gpt/tokenizer.py index 43331c5a..f2832ce6 100644 --- a/llm-lora-finetuning/lit_gpt/tokenizer.py +++ b/llm-lora-finetuning/lit_gpt/tokenizer.py @@ -23,7 +23,9 @@ def __init__(self, checkpoint_dir: Union[Path, str]) -> None: if (vocabulary_path := checkpoint_dir / "tokenizer.model").is_file(): from sentencepiece import SentencePieceProcessor - self.processor = SentencePieceProcessor(model_file=str(vocabulary_path)) + self.processor = SentencePieceProcessor( + model_file=str(vocabulary_path) + ) self.backend = "sentencepiece" self.bos_id = self.processor.bos_id() self.eos_id = self.processor.eos_id() @@ -41,14 +43,19 @@ def __init__(self, checkpoint_dir: Union[Path, str]) -> None: config = json.load(fp) bos_token = config.get("bos_token") self.bos_id = ( - self.token_to_id(bos_token) if bos_token 
is not None else None + self.token_to_id(bos_token) + if bos_token is not None + else None ) eos_token = config.get("eos_token") self.eos_id = ( - self.token_to_id(eos_token) if eos_token is not None else None + self.token_to_id(eos_token) + if eos_token is not None + else None ) if ( - special_tokens_path := checkpoint_dir / "generation_config.json" + special_tokens_path := checkpoint_dir + / "generation_config.json" ).is_file(): with open(special_tokens_path) as fp: config = json.load(fp) @@ -86,7 +93,8 @@ def check_if_bos_token_used(self, checkpoint_dir: Path) -> bool: with open(tokenizer_config_path) as fp: config = json.load(fp) if any( - config.get(check, False) for check in ("add_bos_token", "add_prefix_space") + config.get(check, False) + for check in ("add_bos_token", "add_prefix_space") ): return True # for examples that also use the Llama tokenizer, but do not have or set add_bos_token to True. diff --git a/llm-lora-finetuning/lit_gpt/utils.py b/llm-lora-finetuning/lit_gpt/utils.py index 8fbcc028..ba4706ff 100644 --- a/llm-lora-finetuning/lit_gpt/utils.py +++ b/llm-lora-finetuning/lit_gpt/utils.py @@ -39,7 +39,9 @@ def find_multiple(n: int, k: int) -> int: return n + k - (n % k) -def num_parameters(module: nn.Module, requires_grad: Optional[bool] = None) -> int: +def num_parameters( + module: nn.Module, requires_grad: Optional[bool] = None +) -> int: total = 0 for p in module.parameters(): if requires_grad is None or p.requires_grad == requires_grad: @@ -59,7 +61,9 @@ def check_valid_checkpoint_dir(checkpoint_dir: Path) -> None: checkpoint_dir / "tokenizer.json" ).is_file() or (checkpoint_dir / "tokenizer.model").is_file(), - "tokenizer_config.json": (checkpoint_dir / "tokenizer_config.json").is_file(), + "tokenizer_config.json": ( + checkpoint_dir / "tokenizer_config.json" + ).is_file(), } if checkpoint_dir.is_dir(): if all(files.values()): @@ -92,7 +96,10 @@ class SavingProxyForStorage: def __init__(self, obj, saver, protocol_version=5): self.protocol_version = protocol_version self.saver = saver - if not (isinstance(obj, torch.storage.TypedStorage) or torch.is_storage(obj)): + if not ( + isinstance(obj, torch.storage.TypedStorage) + or torch.is_storage(obj) + ): raise TypeError(f"expected storage, not {type(obj)}") # this logic is taken from PyTorch 2.0+ torch/serialization.py @@ -125,7 +132,9 @@ def __reduce_ex__(self, protocol_version): class SavingProxyForTensor: def __init__(self, tensor, saver, protocol_version=5): self.protocol_version = protocol_version - self.reduce_ret_fn, reduce_args = tensor.__reduce_ex__(protocol_version) + self.reduce_ret_fn, reduce_args = tensor.__reduce_ex__( + protocol_version + ) if reduce_args[0] == torch._utils._rebuild_tensor_v2: # for Tensors with Python attributes (a0, a1, (storage, *a2_other), *other_reduce_args) = reduce_args @@ -176,7 +185,9 @@ def persistent_id(self, obj): if isinstance(obj, SavingProxyForStorage): return obj.storage_info - if isinstance(obj, torch.storage.TypedStorage) or torch.is_storage(obj): + if isinstance(obj, torch.storage.TypedStorage) or torch.is_storage( + obj + ): if isinstance(obj, torch.storage.TypedStorage): # TODO: Once we decide to break serialization FC, this case # can be deleted @@ -197,7 +208,10 @@ def persistent_id(self, obj): # not allocated, don't perform this check if storage.data_ptr() != 0: if storage.data_ptr() in self.storage_dtypes: - if storage_dtype != self.storage_dtypes[storage.data_ptr()]: + if ( + storage_dtype + != self.storage_dtypes[storage.data_ptr()] + ): raise RuntimeError( 
"Cannot save multiple tensors or storages that view the same data as different types" ) @@ -290,7 +304,8 @@ def chunked_cross_entropy( # chunk cross entropy logit_chunks = [ - logit_chunk.reshape(-1, logit_chunk.size(-1)) for logit_chunk in logits + logit_chunk.reshape(-1, logit_chunk.size(-1)) + for logit_chunk in logits ] target_chunks = [ target_chunk.reshape(-1) @@ -332,12 +347,16 @@ def chunked_cross_entropy( return torch.cat(loss_chunks).sum() / max(1, non_masked_elems) -def map_old_state_dict_weights(state_dict: Dict, mapping: Mapping, prefix: str) -> Dict: +def map_old_state_dict_weights( + state_dict: Dict, mapping: Mapping, prefix: str +) -> Dict: for checkpoint_name, attribute_name in mapping.items(): full_checkpoint_name = prefix + checkpoint_name if full_checkpoint_name in state_dict: full_attribute_name = prefix + attribute_name - state_dict[full_attribute_name] = state_dict.pop(full_checkpoint_name) + state_dict[full_attribute_name] = state_dict.pop( + full_checkpoint_name + ) return state_dict diff --git a/llm-lora-finetuning/scripts/convert_hf_checkpoint.py b/llm-lora-finetuning/scripts/convert_hf_checkpoint.py index 86a5bff9..14d0ff6f 100644 --- a/llm-lora-finetuning/scripts/convert_hf_checkpoint.py +++ b/llm-lora-finetuning/scripts/convert_hf_checkpoint.py @@ -211,7 +211,8 @@ def copy_weights_phi( dtype: Optional[torch.dtype] = None, ) -> None: if any( - layer_name.startswith(("layers.", "transformer.")) for layer_name in hf_weights + layer_name.startswith(("layers.", "transformer.")) + for layer_name in hf_weights ): raise ValueError( "You are using an outdated Phi checkpoint. Please reload it as described in 'tutorials/download_phi.md'" @@ -306,7 +307,9 @@ def load_param( @torch.inference_mode() def convert_hf_checkpoint( *, - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), model_name: Optional[str] = None, dtype: Optional[str] = None, ) -> None: @@ -339,16 +342,22 @@ def convert_hf_checkpoint( # Load the json file containing weight mapping pytorch_bin_map_json_path = checkpoint_dir / "pytorch_model.bin.index.json" - if pytorch_bin_map_json_path.is_file(): # not all checkpoints have this file + if ( + pytorch_bin_map_json_path.is_file() + ): # not all checkpoints have this file with open(pytorch_bin_map_json_path) as json_map: bin_index = json.load(json_map) - bin_files = {checkpoint_dir / bin for bin in bin_index["weight_map"].values()} + bin_files = { + checkpoint_dir / bin for bin in bin_index["weight_map"].values() + } else: bin_files = set(checkpoint_dir.glob("*.bin")) # some checkpoints serialize the training arguments bin_files = {f for f in bin_files if f.name != "training_args.bin"} if not bin_files: - raise ValueError(f"Expected {str(checkpoint_dir)!r} to contain .bin files") + raise ValueError( + f"Expected {str(checkpoint_dir)!r} to contain .bin files" + ) with incremental_save(checkpoint_dir / "lit_model.pth") as saver: # for checkpoints that split the QKV across several files, we need to keep all the bin files diff --git a/llm-lora-finetuning/scripts/convert_lit_checkpoint.py b/llm-lora-finetuning/scripts/convert_lit_checkpoint.py index 6b06e888..1239e7d2 100644 --- a/llm-lora-finetuning/scripts/convert_lit_checkpoint.py +++ b/llm-lora-finetuning/scripts/convert_lit_checkpoint.py @@ -261,7 +261,9 @@ def convert_lit_checkpoint( copy_fn = partial(copy_weights_falcon, config.name) elif config._mlp_class in ("LLaMAMLP", "GemmaMLP", 
"LLaMAMoE"): untie_weights = "Gemma" in config.name - copy_fn = partial(copy_weights_llama, config, untie_weights=untie_weights) + copy_fn = partial( + copy_weights_llama, config, untie_weights=untie_weights + ) elif "phi" in config.name: copy_fn = partial(copy_weights_phi, config) else: diff --git a/llm-lora-finetuning/scripts/download.py b/llm-lora-finetuning/scripts/download.py index 594ae9dc..e5a7459d 100644 --- a/llm-lora-finetuning/scripts/download.py +++ b/llm-lora-finetuning/scripts/download.py @@ -38,7 +38,9 @@ def download_from_hub( from huggingface_hub import snapshot_download - if ("meta-llama" in repo_id or "falcon-180" in repo_id) and not access_token: + if ( + "meta-llama" in repo_id or "falcon-180" in repo_id + ) and not access_token: raise ValueError( f"{repo_id} requires authentication, please set the `HF_TOKEN=your_token` environment" " variable or pass --access_token=your_token. You can find your token by visiting" diff --git a/llm-lora-finetuning/scripts/merge_lora.py b/llm-lora-finetuning/scripts/merge_lora.py index b9a2baa3..89818a99 100644 --- a/llm-lora-finetuning/scripts/merge_lora.py +++ b/llm-lora-finetuning/scripts/merge_lora.py @@ -24,7 +24,9 @@ def merge_lora( lora_path: Path = Path("out/lora/alpaca/lit_model_lora_finetuned.pth"), - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), out_dir: Path = Path("out/lora/checkpoint"), precision: Optional[str] = None, lora_r: int = 8, diff --git a/llm-lora-finetuning/scripts/prepare_alpaca.py b/llm-lora-finetuning/scripts/prepare_alpaca.py index 77c62691..cde6fca1 100644 --- a/llm-lora-finetuning/scripts/prepare_alpaca.py +++ b/llm-lora-finetuning/scripts/prepare_alpaca.py @@ -22,7 +22,9 @@ def prepare( destination_path: Path = Path("data/alpaca"), - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), test_split_fraction: float = 0.03865, # to get exactly 2000 test samples, seed: int = 42, mask_inputs: bool = False, # as in alpaca-lora @@ -37,7 +39,9 @@ def prepare( which stores the preprocessed and tokenized prompts and labels. 
""" if max_seq_length is None: - with open(checkpoint_dir / "lit_config.json", "r", encoding="utf-8") as file: + with open( + checkpoint_dir / "lit_config.json", "r", encoding="utf-8" + ) as file: config = json.load(file) max_seq_length = config["block_size"] diff --git a/llm-lora-finetuning/scripts/prepare_csv.py b/llm-lora-finetuning/scripts/prepare_csv.py index 16b45cd4..bbd27074 100644 --- a/llm-lora-finetuning/scripts/prepare_csv.py +++ b/llm-lora-finetuning/scripts/prepare_csv.py @@ -22,7 +22,9 @@ def prepare( csv_path: Path, destination_path: Path = Path("data/csv"), - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), test_split_fraction: float = 0.1, seed: int = 42, mask_inputs: bool = False, @@ -46,7 +48,9 @@ def prepare( df = pd.read_csv(csv_path, dtype=str).fillna("") if not (df.columns.values == columns).all(): - raise ValueError(f"CSV columns must be {columns}, found {df.columns.values}") + raise ValueError( + f"CSV columns must be {columns}, found {df.columns.values}" + ) data = json.loads(df.to_json(orient="records", indent=4)) print("Loading tokenizer...") diff --git a/llm-lora-finetuning/scripts/prepare_dolly.py b/llm-lora-finetuning/scripts/prepare_dolly.py index 3ecf973f..8bb43439 100644 --- a/llm-lora-finetuning/scripts/prepare_dolly.py +++ b/llm-lora-finetuning/scripts/prepare_dolly.py @@ -23,7 +23,9 @@ def prepare( destination_path: Path = Path("data/dolly"), - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), test_split_fraction: float = 0.1, seed: int = 42, mask_inputs: bool = False, @@ -39,7 +41,9 @@ def prepare( """ if max_seq_length is None: - with open(checkpoint_dir / "lit_config.json", "r", encoding="utf-8") as file: + with open( + checkpoint_dir / "lit_config.json", "r", encoding="utf-8" + ) as file: config = json.load(file) max_seq_length = config["block_size"] diff --git a/llm-lora-finetuning/scripts/prepare_flan.py b/llm-lora-finetuning/scripts/prepare_flan.py index 90707853..a34b5472 100644 --- a/llm-lora-finetuning/scripts/prepare_flan.py +++ b/llm-lora-finetuning/scripts/prepare_flan.py @@ -29,7 +29,9 @@ def load_jsonl(filename): def prepare( destination_path: Path = Path("data/flan"), - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), mask_inputs: bool = False, # as in alpaca-lora subsets: Optional[str] = None, ignore_index: int = -1, @@ -122,7 +124,9 @@ def prepare( subsets = list(supported_subsets) if max_seq_length is None: - with open(checkpoint_dir / "lit_config.json", "r", encoding="utf-8") as file: + with open( + checkpoint_dir / "lit_config.json", "r", encoding="utf-8" + ) as file: config = json.load(file) max_seq_length = config["block_size"] diff --git a/llm-lora-finetuning/scripts/prepare_lima.py b/llm-lora-finetuning/scripts/prepare_lima.py index 75b57e20..e27928ce 100644 --- a/llm-lora-finetuning/scripts/prepare_lima.py +++ b/llm-lora-finetuning/scripts/prepare_lima.py @@ -23,7 +23,9 @@ def prepare( destination_path: Path = Path("data/lima"), test_split_fraction: float = 0.1, - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), mask_inputs: bool = False, # as in alpaca-lora seed: int 
= 42, include_multiturn_conversations: bool = False, @@ -46,7 +48,9 @@ def prepare( ) if max_seq_length is None: - with open(checkpoint_dir / "lit_config.json", "r", encoding="utf-8") as file: + with open( + checkpoint_dir / "lit_config.json", "r", encoding="utf-8" + ) as file: config = json.load(file) max_seq_length = config["block_size"] @@ -56,7 +60,9 @@ def prepare( from datasets import load_dataset dataset = load_dataset(data_repo_id, token=access_token) - train_data = format_dataset(dataset["train"], include_multiturn_conversations) + train_data = format_dataset( + dataset["train"], include_multiturn_conversations + ) # test set is present but doesn't have any solutions, so we cannot use it here # but have to create our own diff --git a/llm-lora-finetuning/scripts/prepare_longform.py b/llm-lora-finetuning/scripts/prepare_longform.py index 6cea71a5..6327bad8 100644 --- a/llm-lora-finetuning/scripts/prepare_longform.py +++ b/llm-lora-finetuning/scripts/prepare_longform.py @@ -22,7 +22,9 @@ def prepare( destination_path: Path = Path("data/longform"), - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), mask_inputs: bool = False, # as in alpaca-lora ignore_index: int = -1, max_seq_length: Optional[int] = None, @@ -33,7 +35,9 @@ def prepare( which stores the preprocessed and tokenized prompts and labels. """ if max_seq_length is None: - with open(checkpoint_dir / "lit_config.json", "r", encoding="utf-8") as file: + with open( + checkpoint_dir / "lit_config.json", "r", encoding="utf-8" + ) as file: config = json.load(file) max_seq_length = config["block_size"] @@ -43,13 +47,9 @@ def prepare( # val_file_name = "val.json" test_file_name = "test.json" - train_file_url = ( - "https://raw.githubusercontent.com/akoksal/LongForm/main/dataset/train.json" - ) + train_file_url = "https://raw.githubusercontent.com/akoksal/LongForm/main/dataset/train.json" # val_file_url = "https://raw.githubusercontent.com/akoksal/LongForm/main/dataset/val.json" - test_file_url = ( - "https://raw.githubusercontent.com/akoksal/LongForm/main/dataset/test.json" - ) + test_file_url = "https://raw.githubusercontent.com/akoksal/LongForm/main/dataset/test.json" train_file_path = destination_path / train_file_name print("Loading train data file...") diff --git a/llm-lora-finetuning/scripts/prepare_openwebtext.py b/llm-lora-finetuning/scripts/prepare_openwebtext.py index 4f1d255e..fbb4a8d9 100644 --- a/llm-lora-finetuning/scripts/prepare_openwebtext.py +++ b/llm-lora-finetuning/scripts/prepare_openwebtext.py @@ -20,7 +20,9 @@ def prepare( destination_path: Path = Path("data/openwebtext"), - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), seed: int = 42, test_size: Union[float, int, None] = 0.0005, ) -> None: @@ -46,7 +48,9 @@ def prepare( split_dataset = dataset["train"].train_test_split( test_size=test_size, seed=seed, shuffle=True ) - split_dataset["val"] = split_dataset.pop("test") # rename the test split to val + split_dataset["val"] = split_dataset.pop( + "test" + ) # rename the test split to val def process(example): ids = tokenizer.encode(example["text"]).tolist() @@ -69,12 +73,18 @@ def process(example): for split, dset in tokenized.items(): arr_len = np.sum(dset["len"], dtype=np.uint64) filename = destination_path / f"{split}.bin" - dtype = np.uint16 # (can do since enc.max_token_value == 
50256 is < 2**16) - arr = np.memmap(str(filename), dtype=dtype, mode="w+", shape=(arr_len,)) + dtype = ( + np.uint16 + ) # (can do since enc.max_token_value == 50256 is < 2**16) + arr = np.memmap( + str(filename), dtype=dtype, mode="w+", shape=(arr_len,) + ) total_batches = 1024 idx = 0 - for batch_idx in tqdm(range(total_batches), desc=f"writing {filename}"): + for batch_idx in tqdm( + range(total_batches), desc=f"writing {filename}" + ): # Batch together samples for faster write batch = dset.shard( num_shards=total_batches, index=batch_idx, contiguous=True diff --git a/llm-lora-finetuning/scripts/prepare_redpajama.py b/llm-lora-finetuning/scripts/prepare_redpajama.py index 23224c1f..02044307 100644 --- a/llm-lora-finetuning/scripts/prepare_redpajama.py +++ b/llm-lora-finetuning/scripts/prepare_redpajama.py @@ -110,7 +110,9 @@ def prepare_full( is_cc = set_name == "common_crawl" - filenames = glob.glob(os.path.join(source_path, pattern), recursive=True) + filenames = glob.glob( + os.path.join(source_path, pattern), recursive=True + ) if not filenames: raise RuntimeError( @@ -135,24 +137,32 @@ def prepare_full( print(f"Processing {name}") if is_cc: - with zstd.open(open(filepath, "rb"), "rt", encoding="utf-8") as f: + with zstd.open( + open(filepath, "rb"), "rt", encoding="utf-8" + ) as f: for row in tqdm(f): text = json.loads(row)["text"] text_ids = tokenizer.encode(text) - builder.add_array(np.array(text_ids, dtype=builder.dtype)) + builder.add_array( + np.array(text_ids, dtype=builder.dtype) + ) else: with open(filepath, encoding="utf-8") as f: for row in tqdm(f): text = json.loads(row)["text"] text_ids = tokenizer.encode(text) - builder.add_array(np.array(text_ids, dtype=builder.dtype)) + builder.add_array( + np.array(text_ids, dtype=builder.dtype) + ) builder.write_reminder() def prepare( source_path: Path = Path("data/RedPajama-Data-1T-Sample"), - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + checkpoint_dir: Path = Path( + "checkpoints/stabilityai/stablelm-base-alpha-3b" + ), destination_path: Path = Path("data/redpajama_sample"), sample: bool = True, match: str = "", diff --git a/llm-lora-finetuning/scripts/prepare_slimpajama.py b/llm-lora-finetuning/scripts/prepare_slimpajama.py index 7a83316a..0a80191f 100644 --- a/llm-lora-finetuning/scripts/prepare_slimpajama.py +++ b/llm-lora-finetuning/scripts/prepare_slimpajama.py @@ -30,7 +30,10 @@ def prepare_item(self, filepath): with zstd.open(open(filepath, "rb"), "rt", encoding="utf-8") as f: for row in f: text = json.loads(row)["text"] - if json.loads(row)["meta"]["redpajama_set_name"] == "RedPajamaGithub": + if ( + json.loads(row)["meta"]["redpajama_set_name"] + == "RedPajamaGithub" + ): continue # exclude the GitHub data since it overlaps with starcoder text_ids = self.tokenizer.encode(text, bos=False, eos=True) yield text_ids @@ -44,7 +47,9 @@ def prepare( fast_dev_run: bool = False, ) -> None: tokenizer = Tokenizer(tokenizer_path) - data_recipe = SlimPajamaDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size) + data_recipe = SlimPajamaDataRecipe( + tokenizer=tokenizer, chunk_size=chunk_size + ) data_processor = DataProcessor( input_dir=str(input_dir), output_dir=str(output_dir), diff --git a/llm-lora-finetuning/scripts/prepare_starcoder.py b/llm-lora-finetuning/scripts/prepare_starcoder.py index f9104e97..1f67c93e 100644 --- a/llm-lora-finetuning/scripts/prepare_starcoder.py +++ b/llm-lora-finetuning/scripts/prepare_starcoder.py @@ -57,7 +57,9 @@ def prepare( fast_dev_run: bool = False, ) -> 
None: tokenizer = Tokenizer(tokenizer_path) - data_recipe = StarcoderDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size) + data_recipe = StarcoderDataRecipe( + tokenizer=tokenizer, chunk_size=chunk_size + ) data_processor = DataProcessor( input_dir=str(input_dir), output_dir=str(output_dir), diff --git a/llm-lora-finetuning/steps/eval.py b/llm-lora-finetuning/steps/eval.py index 6a51615b..25ee727d 100644 --- a/llm-lora-finetuning/steps/eval.py +++ b/llm-lora-finetuning/steps/eval.py @@ -4,8 +4,9 @@ from pathlib import Path from typing import Annotated, Any, Dict, Optional +import torch from evaluate.lm_eval_harness import run_eval_harness -from zenml import step, log_artifact_metadata, log_model_metadata +from zenml import step from scripts.download import download_from_hub from scripts.merge_lora import merge_lora @@ -16,6 +17,8 @@ def eval( model_repo: str, adapter_repo: Optional[str] = None ) -> Annotated[Dict[str, Any], "evaluation_results"]: + torch.set_float32_matmul_precision("high") + access_token = get_huggingface_access_token() model_dir = Path("model") @@ -28,7 +31,9 @@ def eval( merged_dir = Path("merged") download_from_hub( - repo_id=adapter_repo, checkpoint_dir=adapter_dir, access_token=access_token + repo_id=adapter_repo, + checkpoint_dir=adapter_dir, + access_token=access_token, ) lora_path = adapter_dir / "lit_model_lora_finetuned.pth" diff --git a/llm-lora-finetuning/steps/feature_engineering.py b/llm-lora-finetuning/steps/feature_engineering.py index faf4ce1d..52bdf6fa 100644 --- a/llm-lora-finetuning/steps/feature_engineering.py +++ b/llm-lora-finetuning/steps/feature_engineering.py @@ -79,5 +79,7 @@ def feature_engineering(model_repo: str, dataset_name: str) -> Path: helper_module = importlib.import_module(f"scripts.prepare_{dataset_name}") prepare_function = getattr(helper_module, "prepare") - prepare_function(checkpoint_dir=checkpoint_dir, destination_path=destination_dir) + prepare_function( + checkpoint_dir=checkpoint_dir, destination_path=destination_dir + ) return destination_dir diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py index ca4a4de5..54a36ead 100644 --- a/llm-lora-finetuning/steps/finetune.py +++ b/llm-lora-finetuning/steps/finetune.py @@ -3,6 +3,7 @@ from pathlib import Path from typing import Optional +import torch from finetune.lora import setup from huggingface_hub import upload_folder from lit_gpt.args import IOArgs @@ -27,13 +28,17 @@ def finetune( convert_to_hf: bool = False, data_dir: Optional[Path] = None, ) -> None: + torch.set_float32_matmul_precision("high") + access_token = get_huggingface_access_token() checkpoint_root_dir = Path("checkpoints") checkpoint_dir = checkpoint_root_dir / repo_id if checkpoint_dir.exists(): - logger.info("Checkpoint directory already exists, skipping download...") + logger.info( + "Checkpoint directory already exists, skipping download..." 
+ ) else: download_from_hub( repo_id=repo_id, @@ -66,7 +71,9 @@ def finetune( if merged_output_repo: lora_path = output_dir / model_name / "lit_model_lora_finetuned.pth" - merge_output_dir = Path("output/lora_merged") / dataset_name / model_name + merge_output_dir = ( + Path("output/lora_merged") / dataset_name / model_name + ) merge_lora( lora_alpha=lora_path, checkpoint_dir=checkpoint_dir, @@ -79,7 +86,9 @@ def finetune( shutil.copy(src=path, dst=destination) if convert_to_hf: - upload_dir = Path("output/lora_merged_hf") / dataset_name / model_name + upload_dir = ( + Path("output/lora_merged_hf") / dataset_name / model_name + ) convert_lit_checkpoint( checkpoint_path=merged_output_repo / "lit_model.pth", output_path=output_dir, diff --git a/llm-lora-finetuning/steps/utils.py b/llm-lora-finetuning/steps/utils.py index 0d4fa90d..51a5f0b2 100644 --- a/llm-lora-finetuning/steps/utils.py +++ b/llm-lora-finetuning/steps/utils.py @@ -6,6 +6,10 @@ def get_huggingface_access_token() -> Optional[str]: try: - return Client().get_secret("huggingface_credentials").secret_values["token"] + return ( + Client() + .get_secret("huggingface_credentials") + .secret_values["token"] + ) except KeyError: return os.getenv("HF_TOKEN") From e48ef5ffbc9253f215e1e33dba3bb5fe63a6ac2b Mon Sep 17 00:00:00 2001 From: Michael Schuster Date: Wed, 6 Mar 2024 18:07:08 +0800 Subject: [PATCH 10/26] More improvements --- .../steps/feature_engineering.py | 15 +++++++++++--- llm-lora-finetuning/steps/merge.py | 20 +++++++++++++------ 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/llm-lora-finetuning/steps/feature_engineering.py b/llm-lora-finetuning/steps/feature_engineering.py index 52bdf6fa..a596754a 100644 --- a/llm-lora-finetuning/steps/feature_engineering.py +++ b/llm-lora-finetuning/steps/feature_engineering.py @@ -5,10 +5,10 @@ from dataclasses import asdict from pathlib import Path from tempfile import mkdtemp -from typing import Any, ClassVar, Tuple, Type +from typing import Annotated, Any, ClassVar, Tuple, Type from lit_gpt import Config -from zenml import step +from zenml import log_artifact_metadata, step from zenml.enums import ArtifactType from zenml.io import fileio from zenml.materializers.base_materializer import BaseMaterializer @@ -55,7 +55,9 @@ def _copy_directory(src: str, dst: str) -> None: @step(output_materializers=DirectoryMaterializer) -def feature_engineering(model_repo: str, dataset_name: str) -> Path: +def feature_engineering( + model_repo: str, dataset_name: str +) -> Annotated[Path, "data"]: access_token = get_huggingface_access_token() checkpoint_root_dir = Path("checkpoints") @@ -74,6 +76,13 @@ def feature_engineering(model_repo: str, dataset_name: str) -> Path: with open(checkpoint_dir / "lit_config.json", "w") as json_config: json.dump(config_dict, json_config) + log_artifact_metadata( + metadata={ + "model_name": model_name, + "model_config": config_dict, + "dataset_name": dataset_name, + } + ) destination_dir = Path("data") / dataset_name helper_module = importlib.import_module(f"scripts.prepare_{dataset_name}") diff --git a/llm-lora-finetuning/steps/merge.py b/llm-lora-finetuning/steps/merge.py index 60d17561..84fb94b2 100644 --- a/llm-lora-finetuning/steps/merge.py +++ b/llm-lora-finetuning/steps/merge.py @@ -3,7 +3,7 @@ from pathlib import Path from huggingface_hub import upload_folder -from zenml import step +from zenml import log_model_metadata, step from scripts.convert_lit_checkpoint import convert_lit_checkpoint from scripts.download import download_from_hub @@ -35,10 
+35,10 @@ def merge( access_token=access_token, ) - lora_path = adapter_dir / "lit_model_lora_finetuned.pth" + lora_path = adapter_dir / adapter_repo / "lit_model_lora_finetuned.pth" merge_lora( lora_path=Path(lora_path), - checkpoint_dir=base_model_dir, + checkpoint_dir=base_model_dir / base_model_repo, out_dir=merged_dir, ) @@ -48,13 +48,21 @@ def merge( shutil.copy(src=path, dst=destination) if convert_to_hf: - output_dir = Path("hf_checkpoint_merged") + output_dir = Path("lora_merged_hf") convert_lit_checkpoint( checkpoint_path=merged_dir / "lit_model.pth", - output_path=output_dir, config_path=merged_dir / "lit_config.json", + output_path=output_dir, ) else: output_dir = merged_dir - upload_folder(repo_id=output_repo, folder_path=output_dir) + commit = upload_folder( + repo_id=output_repo, folder_path=output_dir, token=access_token + ) + log_model_metadata( + metadata={ + "merged_model_huggingface_commit_hash": commit.oid, + "merged_model_huggingface_commit_url": commit.commit_url, + } + ) From 1f850260fecee925827525f8645bc310fffbe8c3 Mon Sep 17 00:00:00 2001 From: Michael Schuster Date: Thu, 7 Mar 2024 12:53:29 +0800 Subject: [PATCH 11/26] More config --- llm-lora-finetuning/configs/eval.yaml | 7 + llm-lora-finetuning/configs/feature.yaml | 7 + .../configs/finetune-mistral-alpaca.yaml | 19 +++ llm-lora-finetuning/configs/finetune.yaml | 5 - llm-lora-finetuning/configs/merge.yaml | 7 + llm-lora-finetuning/pipelines/__init__.py | 1 - llm-lora-finetuning/pipelines/eval.py | 15 ++ .../pipelines/feature_engineering.py | 15 ++ llm-lora-finetuning/pipelines/finetuning.py | 19 +-- llm-lora-finetuning/pipelines/merge.py | 2 - llm-lora-finetuning/steps/__init__.py | 1 - llm-lora-finetuning/steps/eval.py | 16 +- .../steps/feature_engineering.py | 16 +- llm-lora-finetuning/steps/finetune.py | 146 +++++++++++++++--- llm-lora-finetuning/steps/merge.py | 16 +- llm-lora-finetuning/steps/utils.py | 20 +++ 16 files changed, 262 insertions(+), 50 deletions(-) create mode 100644 llm-lora-finetuning/configs/finetune-mistral-alpaca.yaml delete mode 100644 llm-lora-finetuning/configs/finetune.yaml diff --git a/llm-lora-finetuning/configs/eval.yaml b/llm-lora-finetuning/configs/eval.yaml index a03ba2c6..10550ed0 100644 --- a/llm-lora-finetuning/configs/eval.yaml +++ b/llm-lora-finetuning/configs/eval.yaml @@ -1,3 +1,10 @@ parameters: model_repo: mistralai/Mistral-7B-Instruct-v0.1 adapter_repo: ... +model: + name: mistral-7b-lora + description: "Fine-tune `mistralai/Mistral-7B-Instruct-v0.1`." + tags: + - llm + - lora + - mistral \ No newline at end of file diff --git a/llm-lora-finetuning/configs/feature.yaml b/llm-lora-finetuning/configs/feature.yaml index 5de07309..c6ddc812 100644 --- a/llm-lora-finetuning/configs/feature.yaml +++ b/llm-lora-finetuning/configs/feature.yaml @@ -1,3 +1,10 @@ parameters: model_repo: mistralai/Mistral-7B-Instruct-v0.1 dataset_name: alpaca +model: + name: mistral-7b-lora + description: "Fine-tune `mistralai/Mistral-7B-Instruct-v0.1`." + tags: + - llm + - lora + - mistral \ No newline at end of file diff --git a/llm-lora-finetuning/configs/finetune-mistral-alpaca.yaml b/llm-lora-finetuning/configs/finetune-mistral-alpaca.yaml new file mode 100644 index 00000000..d7ae3244 --- /dev/null +++ b/llm-lora-finetuning/configs/finetune-mistral-alpaca.yaml @@ -0,0 +1,19 @@ +model: + name: mistral-7b-lora + description: "Fine-tune `mistralai/Mistral-7B-Instruct-v0.1`." 
+ tags: + - llm + - lora + - mistral + +steps: + finetune: + parameters: + config: + base_model_repo: mistralai/Mistral-7B-Instruct-v0.1 + training: + save_interval: 1 + epochs: 1 # 5 + epoch_size: 1 # 50000 + global_batch_size: 128 + learning_rate: 3e-4 diff --git a/llm-lora-finetuning/configs/finetune.yaml b/llm-lora-finetuning/configs/finetune.yaml deleted file mode 100644 index 78ea0ae9..00000000 --- a/llm-lora-finetuning/configs/finetune.yaml +++ /dev/null @@ -1,5 +0,0 @@ -parameters: - repo_id: mistralai/Mistral-7B-Instruct-v0.1 - adapter_output_repo: null - merged_output_repo: null - convert_to_hf: False diff --git a/llm-lora-finetuning/configs/merge.yaml b/llm-lora-finetuning/configs/merge.yaml index 3e9ca3ad..0349cedc 100644 --- a/llm-lora-finetuning/configs/merge.yaml +++ b/llm-lora-finetuning/configs/merge.yaml @@ -3,3 +3,10 @@ parameters: adapter_repo: ... output_repo: ... convert_to_hf: False +model: + name: mistral-7b-lora + description: "Fine-tune `mistralai/Mistral-7B-Instruct-v0.1`." + tags: + - llm + - lora + - mistral \ No newline at end of file diff --git a/llm-lora-finetuning/pipelines/__init__.py b/llm-lora-finetuning/pipelines/__init__.py index 757bd841..fea81644 100644 --- a/llm-lora-finetuning/pipelines/__init__.py +++ b/llm-lora-finetuning/pipelines/__init__.py @@ -13,4 +13,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# diff --git a/llm-lora-finetuning/pipelines/eval.py b/llm-lora-finetuning/pipelines/eval.py index 7595b599..e397b33e 100644 --- a/llm-lora-finetuning/pipelines/eval.py +++ b/llm-lora-finetuning/pipelines/eval.py @@ -1,3 +1,18 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from typing import Optional from steps.eval import eval diff --git a/llm-lora-finetuning/pipelines/feature_engineering.py b/llm-lora-finetuning/pipelines/feature_engineering.py index 610ad471..de459eaf 100644 --- a/llm-lora-finetuning/pipelines/feature_engineering.py +++ b/llm-lora-finetuning/pipelines/feature_engineering.py @@ -1,3 +1,18 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from steps.feature_engineering import feature_engineering from zenml import pipeline from zenml.config import DockerSettings diff --git a/llm-lora-finetuning/pipelines/finetuning.py b/llm-lora-finetuning/pipelines/finetuning.py index 1edd1b84..3a0b5adb 100644 --- a/llm-lora-finetuning/pipelines/finetuning.py +++ b/llm-lora-finetuning/pipelines/finetuning.py @@ -13,9 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# -from pathlib import Path -from typing import Optional from steps.finetune import finetune from zenml import pipeline @@ -23,17 +20,5 @@ @pipeline(settings={"docker": DockerSettings(requirements="requirements.txt")}) -def finetuning_pipeline( - repo_id: str = "mistralai/Mistral-7B-Instruct-v0.1", - adapter_output_repo: Optional[str] = None, - merged_output_repo: Optional[str] = None, - convert_to_hf: bool = False, - data_dir: Optional[str] = None, -) -> None: - finetune( - repo_id=repo_id, - adapter_output_repo=adapter_output_repo, - merged_output_repo=merged_output_repo, - convert_to_hf=convert_to_hf, - data_dir=Path(data_dir) if data_dir else None, - ) +def finetuning_pipeline() -> None: + finetune() diff --git a/llm-lora-finetuning/pipelines/merge.py b/llm-lora-finetuning/pipelines/merge.py index e33f64d2..e3e3373c 100644 --- a/llm-lora-finetuning/pipelines/merge.py +++ b/llm-lora-finetuning/pipelines/merge.py @@ -13,8 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# - from steps.merge import merge from zenml import pipeline from zenml.config import DockerSettings diff --git a/llm-lora-finetuning/steps/__init__.py b/llm-lora-finetuning/steps/__init__.py index 757bd841..fea81644 100644 --- a/llm-lora-finetuning/steps/__init__.py +++ b/llm-lora-finetuning/steps/__init__.py @@ -13,4 +13,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# diff --git a/llm-lora-finetuning/steps/eval.py b/llm-lora-finetuning/steps/eval.py index 25ee727d..38c4820a 100644 --- a/llm-lora-finetuning/steps/eval.py +++ b/llm-lora-finetuning/steps/eval.py @@ -1,4 +1,18 @@ -# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import json import shutil from pathlib import Path diff --git a/llm-lora-finetuning/steps/feature_engineering.py b/llm-lora-finetuning/steps/feature_engineering.py index a596754a..f3ccf158 100644 --- a/llm-lora-finetuning/steps/feature_engineering.py +++ b/llm-lora-finetuning/steps/feature_engineering.py @@ -1,4 +1,18 @@ -# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
+# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import importlib import json import os diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py index 54a36ead..44e73003 100644 --- a/llm-lora-finetuning/steps/finetune.py +++ b/llm-lora-finetuning/steps/finetune.py @@ -1,12 +1,27 @@ -# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import shutil from pathlib import Path -from typing import Optional +from typing import Literal, Optional import torch from finetune.lora import setup from huggingface_hub import upload_folder -from lit_gpt.args import IOArgs +from lit_gpt.args import EvalArgs, IOArgs, TrainArgs +from pydantic import BaseModel from zenml import log_model_metadata, step from zenml.logger import get_logger @@ -20,20 +35,89 @@ logger = get_logger(__file__) +class DataParameters(BaseModel): + seed: int = 42 + test_split_fraction: float = 0.03865 + mask_inputs: bool = False + ignore_index: int = -1 + max_seq_length: Optional[int] = None + + +class TrainingParameters(BaseModel): + save_interval: int = 1000 + log_interval: int = 1 + global_batch_size: int = 64 + micro_batch_size: int = 4 + lr_warmup_steps: int = 100 + epochs: Optional[int] = None + epoch_size: Optional[int] = None + max_tokens: Optional[int] = None + max_seq_length: Optional[int] = None + + learning_rate: float = 1e-3 + weight_decay: float = 0.02 + beta1: float = 0.9 + beta2: float = 0.95 + max_norm: Optional[float] = None + min_lr: float = 6e-5 + + +class EvalParameters(BaseModel): + interval: int = 100 + max_new_tokens: int = 100 + max_iters: int = 100 + + +class LoraParameters(BaseModel): + lora_r: int = 8 + lora_alpha: int = 16 + lora_dropout: float = 0.05 + lora_query: bool = True + lora_key: bool = False + lora_value: bool = True + lora_projection: bool = False + lora_mlp: bool = False + lora_head: bool = False + + +class FinetuningParameters(BaseModel): + base_model_repo: str + data_dir: Optional[Path] = None + + adapter_output_repo: Optional[str] = None + merged_output_repo: Optional[str] = None + convert_to_hf_checkpoint: bool = False + + precision: Optional[str] = None + quantize: Optional[ + Literal[ + "bnb.nf4", + "bnb.nf4-dq", + "bnb.fp4", + "bnb.fp4-dq", + "bnb.int8-training", + ] + ] = None + + data: DataParameters = 
DataParameters() + training: TrainingParameters = TrainingParameters() + eval: EvalParameters = EvalParameters() + lora: LoraParameters = LoraParameters() + + @step -def finetune( - repo_id: str, - adapter_output_repo: Optional[str] = None, - merged_output_repo: Optional[str] = None, - convert_to_hf: bool = False, - data_dir: Optional[Path] = None, -) -> None: +def finetune(config: FinetuningParameters) -> None: + """Finetune model using LoRA. + + Args: + config: Configuration for this step. + """ torch.set_float32_matmul_precision("high") access_token = get_huggingface_access_token() checkpoint_root_dir = Path("checkpoints") - checkpoint_dir = checkpoint_root_dir / repo_id + checkpoint_dir = checkpoint_root_dir / config.base_model_repo if checkpoint_dir.exists(): logger.info( @@ -41,7 +125,7 @@ def finetune( ) else: download_from_hub( - repo_id=repo_id, + repo_id=config.base_model_repo, checkpoint_dir=checkpoint_root_dir, access_token=access_token, ) @@ -50,7 +134,15 @@ def finetune( if not data_dir: data_dir = Path("data/alpaca") - prepare(destination_path=data_dir, checkpoint_dir=checkpoint_dir) + prepare( + destination_path=data_dir, + checkpoint_dir=checkpoint_dir, + test_split_fraction=config.data.test_split_fraction, + seed=config.data.seed, + mask_inputs=config.data.mask_inputs, + ignore_index=config.data.ignore_index, + max_seq_length=config.data.max_seq_length, + ) model_name = checkpoint_dir.name dataset_name = data_dir.name @@ -66,9 +158,19 @@ def finetune( checkpoint_dir=checkpoint_dir, out_dir=output_dir, ) - setup(precision="bf16-true", io=io_args) + train_args = TrainArgs(**config.training.dict()) + eval_args = EvalArgs(**config.eval.dict()) + setup( + devices=1, + io=io_args, + train=train_args, + eval=eval_args, + precision=config.precision, + quantize=config.quantize, + **config.lora.dict(), + ) - if merged_output_repo: + if config.merged_output_repo: lora_path = output_dir / model_name / "lit_model_lora_finetuned.pth" merge_output_dir = ( @@ -78,6 +180,8 @@ def finetune( lora_alpha=lora_path, checkpoint_dir=checkpoint_dir, out_dir=merge_output_dir, + precision=config.precision, + **config.lora.dict(), ) for path in Path(checkpoint_dir).glob("*.json"): @@ -85,20 +189,20 @@ def finetune( shutil.copy(src=path, dst=destination) - if convert_to_hf: + if config.convert_to_hf_checkpoint: upload_dir = ( Path("output/lora_merged_hf") / dataset_name / model_name ) convert_lit_checkpoint( - checkpoint_path=merged_output_repo / "lit_model.pth", + checkpoint_path=config.merged_output_repo / "lit_model.pth", output_path=output_dir, - config_path=merged_output_repo / "lit_config.json", + config_path=config.merged_output_repo / "lit_config.json", ) else: upload_dir = merge_output_dir commit = upload_folder( - repo_id=merged_output_repo, + repo_id=config.merged_output_repo, folder_path=upload_dir, token=access_token, ) @@ -109,9 +213,9 @@ def finetune( } ) - if adapter_output_repo: + if config.adapter_output_repo: commit = upload_folder( - repo_id=adapter_output_repo, + repo_id=config.adapter_output_repo, folder_path=output_dir / model_name, token=access_token, ) diff --git a/llm-lora-finetuning/steps/merge.py b/llm-lora-finetuning/steps/merge.py index 84fb94b2..7c18911e 100644 --- a/llm-lora-finetuning/steps/merge.py +++ b/llm-lora-finetuning/steps/merge.py @@ -1,4 +1,18 @@ -# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import shutil from pathlib import Path diff --git a/llm-lora-finetuning/steps/utils.py b/llm-lora-finetuning/steps/utils.py index 51a5f0b2..3188de88 100644 --- a/llm-lora-finetuning/steps/utils.py +++ b/llm-lora-finetuning/steps/utils.py @@ -1,3 +1,18 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import os from typing import Optional @@ -5,6 +20,11 @@ def get_huggingface_access_token() -> Optional[str]: + """Get access token for huggingface. + + Returns: + The access token if one was found. + """ try: return ( Client() From 131223269cf071558833105006be2913c0f68be9 Mon Sep 17 00:00:00 2001 From: Michael Schuster Date: Thu, 7 Mar 2024 13:07:38 +0800 Subject: [PATCH 12/26] Fix unassigned var --- llm-lora-finetuning/steps/finetune.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py index 44e73003..f328ad98 100644 --- a/llm-lora-finetuning/steps/finetune.py +++ b/llm-lora-finetuning/steps/finetune.py @@ -132,7 +132,9 @@ def finetune(config: FinetuningParameters) -> None: convert_hf_checkpoint(checkpoint_dir=checkpoint_dir) - if not data_dir: + if config.data_dir: + data_dir = config.data_dir + else: data_dir = Path("data/alpaca") prepare( destination_path=data_dir, From c9b50f0ea101cdb57e7bc4170d562e54d1bce223 Mon Sep 17 00:00:00 2001 From: Michael Schuster Date: Thu, 7 Mar 2024 13:46:52 +0800 Subject: [PATCH 13/26] Cleanup --- llm-lora-finetuning/steps/eval.py | 69 +++++++++++++++++-- .../steps/feature_engineering.py | 33 ++++++--- llm-lora-finetuning/steps/finetune.py | 13 +--- llm-lora-finetuning/steps/merge.py | 50 ++++++++++---- llm-lora-finetuning/steps/params.py | 29 ++++++++ llm-lora-finetuning/steps/utils.py | 17 +++++ 6 files changed, 171 insertions(+), 40 deletions(-) create mode 100644 llm-lora-finetuning/steps/params.py diff --git a/llm-lora-finetuning/steps/eval.py b/llm-lora-finetuning/steps/eval.py index 38c4820a..cac942d9 100644 --- a/llm-lora-finetuning/steps/eval.py +++ b/llm-lora-finetuning/steps/eval.py @@ -16,36 +16,85 @@ import json import shutil from pathlib import Path -from typing import Annotated, Any, Dict, Optional +from typing import Annotated, Any, Dict, List, Literal, Optional import torch from evaluate.lm_eval_harness import run_eval_harness +from pydantic import BaseModel from zenml import step from scripts.download import 
download_from_hub from scripts.merge_lora import merge_lora -from steps.utils import get_huggingface_access_token +from steps.params import LoraParameters +from steps.utils import ( + convert_to_lit_checkpoint_if_necessary, + get_huggingface_access_token, +) + + +class EvaluationParameters(BaseModel): + """If `adapter_repo` is set, it will be merged with the model. Otherwise + the model itself will be evaluated. + """ + + model_repo: str + adapter_repo: Optional[str] = None + + precision: Optional[str] = None + quantize: Optional[ + Literal[ + "bnb.nf4", + "bnb.nf4-dq", + "bnb.fp4", + "bnb.fp4-dq", + "bnb.int8-training", + ] + ] = None + + lora: LoraParameters = LoraParameters() + + eval_tasks: List[str] = [ + "arc_challenge", + "piqa", + "hellaswag", + "hendrycksTest-*", + ] + num_fewshot: int = 0 + limit: Optional[int] = None + bootstrap_iters: int = 100000 + no_cache: bool = True @step def eval( - model_repo: str, adapter_repo: Optional[str] = None + config: EvaluationParameters, ) -> Annotated[Dict[str, Any], "evaluation_results"]: + """Evaluate model. + + Args: + config: Configuration for this step. + """ torch.set_float32_matmul_precision("high") access_token = get_huggingface_access_token() model_dir = Path("model") download_from_hub( - repo_id=model_repo, checkpoint_dir=model_dir, access_token=access_token + repo_id=config.model_repo, + checkpoint_dir=model_dir, + access_token=access_token, ) - if adapter_repo: + convert_to_lit_checkpoint_if_necessary( + checkpoint_dir=model_dir / config.model_repo + ) + + if config.adapter_repo: adapter_dir = Path("adapter") merged_dir = Path("merged") download_from_hub( - repo_id=adapter_repo, + repo_id=config.adapter_repo, checkpoint_dir=adapter_dir, access_token=access_token, ) @@ -55,6 +104,8 @@ def eval( lora_path=Path(lora_path), checkpoint_dir=model_dir, out_dir=merged_dir, + precision=config.precision, + **config.lora.dict() ) for path in Path(model_dir).glob("*.json"): @@ -65,7 +116,11 @@ def eval( model_dir = merged_dir output_path = Path("output.json") - run_eval_harness(checkpoint_dir=model_dir, save_filepath=output_path) + run_eval_harness( + checkpoint_dir=model_dir, + save_filepath=output_path, + **config.dict(exclude={"model_repo", "adapter_repo", "lora"}) + ) with open(output_path, "r") as f: return json.load(f) diff --git a/llm-lora-finetuning/steps/feature_engineering.py b/llm-lora-finetuning/steps/feature_engineering.py index f3ccf158..dfc1a890 100644 --- a/llm-lora-finetuning/steps/feature_engineering.py +++ b/llm-lora-finetuning/steps/feature_engineering.py @@ -19,9 +19,10 @@ from dataclasses import asdict from pathlib import Path from tempfile import mkdtemp -from typing import Annotated, Any, ClassVar, Tuple, Type +from typing import Annotated, Any, ClassVar, Dict, Tuple, Type from lit_gpt import Config +from pydantic import BaseModel from zenml import log_artifact_metadata, step from zenml.enums import ArtifactType from zenml.io import fileio @@ -68,21 +69,33 @@ def _copy_directory(src: str, dst: str) -> None: fileio.copy(src_file, dst_file) +class FeatureEngineeringParameters(BaseModel): + model_repo: str + dataset_name: str + + prepare_kwargs: Dict[str, Any] = {} + + @step(output_materializers=DirectoryMaterializer) def feature_engineering( - model_repo: str, dataset_name: str + config: FeatureEngineeringParameters, ) -> Annotated[Path, "data"]: + """Prepare the dataset. + + Args: + config: Configuration for this step. 
+ """ access_token = get_huggingface_access_token() checkpoint_root_dir = Path("checkpoints") download_from_hub( - repo_id=model_repo, + repo_id=config.model_repo, tokenizer_only=True, checkpoint_dir=checkpoint_root_dir, access_token=access_token, ) - checkpoint_dir = checkpoint_root_dir / model_repo + checkpoint_dir = checkpoint_root_dir / config.model_repo model_name = checkpoint_dir.name config = Config.from_name(model_name) @@ -94,15 +107,19 @@ def feature_engineering( metadata={ "model_name": model_name, "model_config": config_dict, - "dataset_name": dataset_name, + "dataset_name": config.dataset_name, } ) - destination_dir = Path("data") / dataset_name + destination_dir = Path("data") / config.dataset_name - helper_module = importlib.import_module(f"scripts.prepare_{dataset_name}") + helper_module = importlib.import_module( + f"scripts.prepare_{config.dataset_name}" + ) prepare_function = getattr(helper_module, "prepare") prepare_function( - checkpoint_dir=checkpoint_dir, destination_path=destination_dir + checkpoint_dir=checkpoint_dir, + destination_path=destination_dir, + **config.prepare_kwargs, ) return destination_dir diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py index f328ad98..a800c161 100644 --- a/llm-lora-finetuning/steps/finetune.py +++ b/llm-lora-finetuning/steps/finetune.py @@ -30,6 +30,7 @@ from scripts.download import download_from_hub from scripts.merge_lora import merge_lora from scripts.prepare_alpaca import prepare +from steps.params import DataParameters, LoraParameters from steps.utils import get_huggingface_access_token logger = get_logger(__file__) @@ -68,18 +69,6 @@ class EvalParameters(BaseModel): max_iters: int = 100 -class LoraParameters(BaseModel): - lora_r: int = 8 - lora_alpha: int = 16 - lora_dropout: float = 0.05 - lora_query: bool = True - lora_key: bool = False - lora_value: bool = True - lora_projection: bool = False - lora_mlp: bool = False - lora_head: bool = False - - class FinetuningParameters(BaseModel): base_model_repo: str data_dir: Optional[Path] = None diff --git a/llm-lora-finetuning/steps/merge.py b/llm-lora-finetuning/steps/merge.py index 7c18911e..7bf568a2 100644 --- a/llm-lora-finetuning/steps/merge.py +++ b/llm-lora-finetuning/steps/merge.py @@ -15,23 +15,39 @@ # limitations under the License. import shutil from pathlib import Path +from typing import Optional from huggingface_hub import upload_folder +from pydantic import BaseModel from zenml import log_model_metadata, step from scripts.convert_lit_checkpoint import convert_lit_checkpoint from scripts.download import download_from_hub from scripts.merge_lora import merge_lora -from steps.utils import get_huggingface_access_token +from steps.params import LoraParameters +from steps.utils import ( + convert_to_lit_checkpoint_if_necessary, + get_huggingface_access_token, +) + + +class MergeParameters(BaseModel): + base_model_repo: str + adapter_repo: str + output_repo: str + convert_to_hf_checkpoint: bool = False + + precision: Optional[str] = None + lora: LoraParameters = LoraParameters() @step -def merge( - base_model_repo: str, - adapter_repo: str, - output_repo: str, - convert_to_hf: bool = False, -) -> None: +def merge(config: MergeParameters) -> None: + """Merge base model and LoRA adapter. + + Args: + config: Configuration for this step. 
+ """ access_token = get_huggingface_access_token() base_model_dir = Path("checkpoints") @@ -39,21 +55,29 @@ def merge( merged_dir = Path("merged") download_from_hub( - repo_id=base_model_repo, + repo_id=config.base_model_repo, checkpoint_dir=base_model_dir, access_token=access_token, ) download_from_hub( - repo_id=adapter_repo, + repo_id=config.adapter_repo, checkpoint_dir=adapter_dir, access_token=access_token, ) - lora_path = adapter_dir / adapter_repo / "lit_model_lora_finetuned.pth" + convert_to_lit_checkpoint_if_necessary( + checkpoint_dir=base_model_dir / config.model_repo + ) + + lora_path = ( + adapter_dir / config.adapter_repo / "lit_model_lora_finetuned.pth" + ) merge_lora( lora_path=Path(lora_path), - checkpoint_dir=base_model_dir / base_model_repo, + checkpoint_dir=base_model_dir / config.base_model_repo, out_dir=merged_dir, + precision=config.precision, + **config.lora.dict() ) for path in Path(base_model_dir).glob("*.json"): @@ -61,7 +85,7 @@ def merge( shutil.copy(src=path, dst=destination) - if convert_to_hf: + if config.convert_to_hf_checkpoint: output_dir = Path("lora_merged_hf") convert_lit_checkpoint( checkpoint_path=merged_dir / "lit_model.pth", @@ -72,7 +96,7 @@ def merge( output_dir = merged_dir commit = upload_folder( - repo_id=output_repo, folder_path=output_dir, token=access_token + repo_id=config.output_repo, folder_path=output_dir, token=access_token ) log_model_metadata( metadata={ diff --git a/llm-lora-finetuning/steps/params.py b/llm-lora-finetuning/steps/params.py new file mode 100644 index 00000000..a04e950c --- /dev/null +++ b/llm-lora-finetuning/steps/params.py @@ -0,0 +1,29 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pydantic import BaseModel + + +class LoraParameters(BaseModel): + lora_r: int = 8 + lora_alpha: int = 16 + lora_dropout: float = 0.05 + lora_query: bool = True + lora_key: bool = False + lora_value: bool = True + lora_projection: bool = False + lora_mlp: bool = False + lora_head: bool = False diff --git a/llm-lora-finetuning/steps/utils.py b/llm-lora-finetuning/steps/utils.py index 3188de88..6e8f63f9 100644 --- a/llm-lora-finetuning/steps/utils.py +++ b/llm-lora-finetuning/steps/utils.py @@ -14,10 +14,13 @@ # See the License for the specific language governing permissions and # limitations under the License. import os +from pathlib import Path from typing import Optional from zenml.client import Client +from scripts.convert_hf_checkpoint import convert_hf_checkpoint + def get_huggingface_access_token() -> Optional[str]: """Get access token for huggingface. @@ -33,3 +36,17 @@ def get_huggingface_access_token() -> Optional[str]: ) except KeyError: return os.getenv("HF_TOKEN") + + +def convert_to_lit_checkpoint_if_necessary(checkpoint_dir: Path) -> None: + """Convert an HF checkpoint to a lit checkpoint if necessary. + + Args: + checkpoint_dir: The directory of the HF checkpoint. 
+ """ + lit_model_path = checkpoint_dir / "lit_model.pth" + + if lit_model_path.is_file(): + return + + convert_hf_checkpoint(checkpoint_dir=checkpoint_dir) From da29018761e405b66c662cf0b9990cccc1200d91 Mon Sep 17 00:00:00 2001 From: Michael Schuster Date: Thu, 7 Mar 2024 13:48:44 +0800 Subject: [PATCH 14/26] only convert if necessary --- llm-lora-finetuning/steps/finetune.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py index a800c161..577ee70d 100644 --- a/llm-lora-finetuning/steps/finetune.py +++ b/llm-lora-finetuning/steps/finetune.py @@ -31,7 +31,7 @@ from scripts.merge_lora import merge_lora from scripts.prepare_alpaca import prepare from steps.params import DataParameters, LoraParameters -from steps.utils import get_huggingface_access_token +from steps.utils import get_huggingface_access_token, convert_to_lit_checkpoint_if_necessary logger = get_logger(__file__) @@ -119,7 +119,7 @@ def finetune(config: FinetuningParameters) -> None: access_token=access_token, ) - convert_hf_checkpoint(checkpoint_dir=checkpoint_dir) + convert_to_lit_checkpoint_if_necessary(checkpoint_dir=checkpoint_dir) if config.data_dir: data_dir = config.data_dir From a48fe08916f4ba9de4a4d3e0476019865d83de5a Mon Sep 17 00:00:00 2001 From: Michael Schuster Date: Thu, 7 Mar 2024 15:10:51 +0800 Subject: [PATCH 15/26] Imports --- llm-lora-finetuning/configs/eval.yaml | 3 --- llm-lora-finetuning/configs/feature.yaml | 3 --- llm-lora-finetuning/configs/merge.yaml | 5 ----- llm-lora-finetuning/pipelines/__init__.py | 4 ++++ .../pipelines/{eval.py => evaluate.py} | 7 +++---- .../pipelines/feature_engineering.py | 6 +++--- llm-lora-finetuning/pipelines/finetuning.py | 2 +- llm-lora-finetuning/pipelines/merge.py | 16 +++------------- llm-lora-finetuning/steps/__init__.py | 4 ++++ .../steps/{eval.py => evaluate.py} | 2 +- llm-lora-finetuning/steps/finetune.py | 6 ++++-- 11 files changed, 23 insertions(+), 35 deletions(-) rename llm-lora-finetuning/pipelines/{eval.py => evaluate.py} (79%) rename llm-lora-finetuning/steps/{eval.py => evaluate.py} (99%) diff --git a/llm-lora-finetuning/configs/eval.yaml b/llm-lora-finetuning/configs/eval.yaml index 10550ed0..f34ed8fb 100644 --- a/llm-lora-finetuning/configs/eval.yaml +++ b/llm-lora-finetuning/configs/eval.yaml @@ -1,6 +1,3 @@ -parameters: - model_repo: mistralai/Mistral-7B-Instruct-v0.1 - adapter_repo: ... model: name: mistral-7b-lora description: "Fine-tune `mistralai/Mistral-7B-Instruct-v0.1`." diff --git a/llm-lora-finetuning/configs/feature.yaml b/llm-lora-finetuning/configs/feature.yaml index c6ddc812..f34ed8fb 100644 --- a/llm-lora-finetuning/configs/feature.yaml +++ b/llm-lora-finetuning/configs/feature.yaml @@ -1,6 +1,3 @@ -parameters: - model_repo: mistralai/Mistral-7B-Instruct-v0.1 - dataset_name: alpaca model: name: mistral-7b-lora description: "Fine-tune `mistralai/Mistral-7B-Instruct-v0.1`." diff --git a/llm-lora-finetuning/configs/merge.yaml b/llm-lora-finetuning/configs/merge.yaml index 0349cedc..f34ed8fb 100644 --- a/llm-lora-finetuning/configs/merge.yaml +++ b/llm-lora-finetuning/configs/merge.yaml @@ -1,8 +1,3 @@ -parameters: - base_model_repo: mistralai/Mistral-7B-Instruct-v0.1 - adapter_repo: ... - output_repo: ... - convert_to_hf: False model: name: mistral-7b-lora description: "Fine-tune `mistralai/Mistral-7B-Instruct-v0.1`." 
diff --git a/llm-lora-finetuning/pipelines/__init__.py b/llm-lora-finetuning/pipelines/__init__.py index fea81644..2c50e45e 100644 --- a/llm-lora-finetuning/pipelines/__init__.py +++ b/llm-lora-finetuning/pipelines/__init__.py @@ -13,3 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from pipelines.evaluate import eval_pipeline +from pipelines.feature_engineering import feature_engineering_pipeline +from pipelines.finetuning import finetuning_pipeline +from pipelines.merge import merge_pipeline diff --git a/llm-lora-finetuning/pipelines/eval.py b/llm-lora-finetuning/pipelines/evaluate.py similarity index 79% rename from llm-lora-finetuning/pipelines/eval.py rename to llm-lora-finetuning/pipelines/evaluate.py index e397b33e..25cdbdfc 100644 --- a/llm-lora-finetuning/pipelines/eval.py +++ b/llm-lora-finetuning/pipelines/evaluate.py @@ -13,13 +13,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional -from steps.eval import eval +from steps import evaluate from zenml import pipeline from zenml.config import DockerSettings @pipeline(settings={"docker": DockerSettings(requirements="requirements.txt")}) -def eval_pipeline(model_repo: str, adapter_repo: Optional[str] = None) -> None: - eval(model_repo=model_repo, adapter_repo=adapter_repo) +def eval_pipeline() -> None: + evaluate() diff --git a/llm-lora-finetuning/pipelines/feature_engineering.py b/llm-lora-finetuning/pipelines/feature_engineering.py index de459eaf..65c32347 100644 --- a/llm-lora-finetuning/pipelines/feature_engineering.py +++ b/llm-lora-finetuning/pipelines/feature_engineering.py @@ -13,11 +13,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from steps.feature_engineering import feature_engineering +from steps import feature_engineering from zenml import pipeline from zenml.config import DockerSettings @pipeline(settings={"docker": DockerSettings(requirements="requirements.txt")}) -def feature_engineering_pipeline(model_repo: str, dataset_name: str) -> None: - feature_engineering(model_repo=model_repo, dataset_name=dataset_name) +def feature_engineering_pipeline() -> None: + feature_engineering() diff --git a/llm-lora-finetuning/pipelines/finetuning.py b/llm-lora-finetuning/pipelines/finetuning.py index 3a0b5adb..24c48af2 100644 --- a/llm-lora-finetuning/pipelines/finetuning.py +++ b/llm-lora-finetuning/pipelines/finetuning.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from steps.finetune import finetune +from steps import finetune from zenml import pipeline from zenml.config import DockerSettings diff --git a/llm-lora-finetuning/pipelines/merge.py b/llm-lora-finetuning/pipelines/merge.py index e3e3373c..05daccc3 100644 --- a/llm-lora-finetuning/pipelines/merge.py +++ b/llm-lora-finetuning/pipelines/merge.py @@ -13,21 +13,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from steps.merge import merge +from steps import merge from zenml import pipeline from zenml.config import DockerSettings @pipeline(settings={"docker": DockerSettings(requirements="requirements.txt")}) -def merge_pipeline( - base_model_repo: str, - adapter_repo: str, - output_repo: str, - convert_to_hf: bool = False, -) -> None: - merge( - base_model_repo=base_model_repo, - adapter_repo=adapter_repo, - output_repo=output_repo, - convert_to_hf=convert_to_hf, - ) +def merge_pipeline() -> None: + merge() diff --git a/llm-lora-finetuning/steps/__init__.py b/llm-lora-finetuning/steps/__init__.py index fea81644..ea459261 100644 --- a/llm-lora-finetuning/steps/__init__.py +++ b/llm-lora-finetuning/steps/__init__.py @@ -13,3 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from steps.evaluate import evaluate +from steps.feature_engineering import feature_engineering +from steps.finetune import finetune +from steps.merge import merge diff --git a/llm-lora-finetuning/steps/eval.py b/llm-lora-finetuning/steps/evaluate.py similarity index 99% rename from llm-lora-finetuning/steps/eval.py rename to llm-lora-finetuning/steps/evaluate.py index cac942d9..e1ee3348 100644 --- a/llm-lora-finetuning/steps/eval.py +++ b/llm-lora-finetuning/steps/evaluate.py @@ -66,7 +66,7 @@ class EvaluationParameters(BaseModel): @step -def eval( +def evaluate( config: EvaluationParameters, ) -> Annotated[Dict[str, Any], "evaluation_results"]: """Evaluate model. diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py index 577ee70d..7e0ab88f 100644 --- a/llm-lora-finetuning/steps/finetune.py +++ b/llm-lora-finetuning/steps/finetune.py @@ -25,13 +25,15 @@ from zenml import log_model_metadata, step from zenml.logger import get_logger -from scripts.convert_hf_checkpoint import convert_hf_checkpoint from scripts.convert_lit_checkpoint import convert_lit_checkpoint from scripts.download import download_from_hub from scripts.merge_lora import merge_lora from scripts.prepare_alpaca import prepare from steps.params import DataParameters, LoraParameters -from steps.utils import get_huggingface_access_token, convert_to_lit_checkpoint_if_necessary +from steps.utils import ( + convert_to_lit_checkpoint_if_necessary, + get_huggingface_access_token, +) logger = get_logger(__file__) From ddf2d51ce3c1bc7bcb1900e31c63a0aaebb01c27 Mon Sep 17 00:00:00 2001 From: Michael Schuster Date: Thu, 7 Mar 2024 15:39:08 +0800 Subject: [PATCH 16/26] fix wrong import --- llm-lora-finetuning/steps/finetune.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py index 7e0ab88f..fccde8b2 100644 --- a/llm-lora-finetuning/steps/finetune.py +++ b/llm-lora-finetuning/steps/finetune.py @@ -29,7 +29,7 @@ from scripts.download import download_from_hub from scripts.merge_lora import merge_lora from scripts.prepare_alpaca import prepare -from steps.params import DataParameters, LoraParameters +from steps.params import LoraParameters from steps.utils import ( convert_to_lit_checkpoint_if_necessary, get_huggingface_access_token, From 86ace8c03c9f5f15e06ec1da2dbb90e79e55a356 Mon Sep 17 00:00:00 2001 From: Michael Schuster Date: Thu, 7 Mar 2024 16:56:01 +0800 Subject: [PATCH 17/26] Move materializer --- llm-lora-finetuning/.dockerignore | 1 + llm-lora-finetuning/materializers/__init__.py | 15 ++++ 
.../materializers/directory_materializer.py | 69 +++++++++++++++++++ llm-lora-finetuning/pipelines/evaluate.py | 1 + .../pipelines/feature_engineering.py | 1 + llm-lora-finetuning/pipelines/finetuning.py | 1 + llm-lora-finetuning/pipelines/merge.py | 1 + llm-lora-finetuning/steps/evaluate.py | 4 +- .../steps/feature_engineering.py | 47 ++----------- llm-lora-finetuning/steps/finetune.py | 8 +++ llm-lora-finetuning/steps/merge.py | 2 + llm-lora-finetuning/steps/params.py | 2 + 12 files changed, 108 insertions(+), 44 deletions(-) create mode 100644 llm-lora-finetuning/materializers/__init__.py create mode 100644 llm-lora-finetuning/materializers/directory_materializer.py diff --git a/llm-lora-finetuning/.dockerignore b/llm-lora-finetuning/.dockerignore index 083734a0..496552c8 100644 --- a/llm-lora-finetuning/.dockerignore +++ b/llm-lora-finetuning/.dockerignore @@ -1,6 +1,7 @@ * !/pipelines/** !/steps/** +!/materializers/** !/evaluate/** !/finetune/** !/generate/** diff --git a/llm-lora-finetuning/materializers/__init__.py b/llm-lora-finetuning/materializers/__init__.py new file mode 100644 index 00000000..fea81644 --- /dev/null +++ b/llm-lora-finetuning/materializers/__init__.py @@ -0,0 +1,15 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/llm-lora-finetuning/materializers/directory_materializer.py b/llm-lora-finetuning/materializers/directory_materializer.py new file mode 100644 index 00000000..f78a23a5 --- /dev/null +++ b/llm-lora-finetuning/materializers/directory_materializer.py @@ -0,0 +1,69 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from pathlib import Path +from tempfile import mkdtemp +from typing import Any, ClassVar, Tuple, Type + +from zenml.enums import ArtifactType +from zenml.io import fileio +from zenml.materializers.base_materializer import BaseMaterializer + + +class DirectoryMaterializer(BaseMaterializer): + """Materializer to store local directories in the artifact store.""" + + ASSOCIATED_TYPES: ClassVar[Tuple[Type[Any], ...]] = (Path,) + ASSOCIATED_ARTIFACT_TYPE: ClassVar[ArtifactType] = ArtifactType.DATA + + def load(self, data_type: Type[Any]) -> Any: + """Copy the artifact files to a local temp directory. + + Args: + data_type: Unused. + + Returns: + Path to the local directory that contains the artifact files. 
+ """ + directory = mkdtemp(prefix="zenml-artifact") + self._copy_directory(src=self.uri, dst=directory) + return Path(directory) + + def save(self, data: Any) -> None: + """Store the directory in the artifact store. + + Args: + data: Path to a local directory to store. + """ + assert isinstance(data, Path) + self._copy_directory(src=str(data), dst=self.uri) + + @staticmethod + def _copy_directory(src: str, dst: str) -> None: + """Recursively copy a directory. + + Args: + src: The directory to copy. + dst: Where to copy the directory to. + """ + for src_dir, _, files in fileio.walk(src): + dst_dir = os.path.join(dst, os.path.relpath(src_dir, src)) + fileio.makedirs(dst_dir) + + for file in files: + src_file = os.path.join(src_dir, file) + dst_file = os.path.join(dst_dir, file) + fileio.copy(src_file, dst_file) diff --git a/llm-lora-finetuning/pipelines/evaluate.py b/llm-lora-finetuning/pipelines/evaluate.py index 25cdbdfc..c1439f1a 100644 --- a/llm-lora-finetuning/pipelines/evaluate.py +++ b/llm-lora-finetuning/pipelines/evaluate.py @@ -21,4 +21,5 @@ @pipeline(settings={"docker": DockerSettings(requirements="requirements.txt")}) def eval_pipeline() -> None: + """Pipeline to evaluate a LoRA fine-tuned LLM.""" evaluate() diff --git a/llm-lora-finetuning/pipelines/feature_engineering.py b/llm-lora-finetuning/pipelines/feature_engineering.py index 65c32347..f8565230 100644 --- a/llm-lora-finetuning/pipelines/feature_engineering.py +++ b/llm-lora-finetuning/pipelines/feature_engineering.py @@ -20,4 +20,5 @@ @pipeline(settings={"docker": DockerSettings(requirements="requirements.txt")}) def feature_engineering_pipeline() -> None: + """Data preprocessing pipeline.""" feature_engineering() diff --git a/llm-lora-finetuning/pipelines/finetuning.py b/llm-lora-finetuning/pipelines/finetuning.py index 24c48af2..e27c55e5 100644 --- a/llm-lora-finetuning/pipelines/finetuning.py +++ b/llm-lora-finetuning/pipelines/finetuning.py @@ -21,4 +21,5 @@ @pipeline(settings={"docker": DockerSettings(requirements="requirements.txt")}) def finetuning_pipeline() -> None: + """Pipeline to finetune LLMs using LoRA.""" finetune() diff --git a/llm-lora-finetuning/pipelines/merge.py b/llm-lora-finetuning/pipelines/merge.py index 05daccc3..66370f32 100644 --- a/llm-lora-finetuning/pipelines/merge.py +++ b/llm-lora-finetuning/pipelines/merge.py @@ -20,4 +20,5 @@ @pipeline(settings={"docker": DockerSettings(requirements="requirements.txt")}) def merge_pipeline() -> None: + """Pipeline to merge LLMs with adapters.""" merge() diff --git a/llm-lora-finetuning/steps/evaluate.py b/llm-lora-finetuning/steps/evaluate.py index e1ee3348..15072007 100644 --- a/llm-lora-finetuning/steps/evaluate.py +++ b/llm-lora-finetuning/steps/evaluate.py @@ -33,7 +33,9 @@ class EvaluationParameters(BaseModel): - """If `adapter_repo` is set, it will be merged with the model. Otherwise + """Parameters for the evaluation step. + + If `adapter_repo` is set, it will be merged with the model. Otherwise the model itself will be evaluated. """ diff --git a/llm-lora-finetuning/steps/feature_engineering.py b/llm-lora-finetuning/steps/feature_engineering.py index dfc1a890..6a6cdad4 100644 --- a/llm-lora-finetuning/steps/feature_engineering.py +++ b/llm-lora-finetuning/steps/feature_engineering.py @@ -15,61 +15,22 @@ # limitations under the License. 
import importlib import json -import os from dataclasses import asdict from pathlib import Path -from tempfile import mkdtemp -from typing import Annotated, Any, ClassVar, Dict, Tuple, Type +from typing import Annotated, Any, Dict from lit_gpt import Config +from materializers.directory_materializer import DirectoryMaterializer from pydantic import BaseModel from zenml import log_artifact_metadata, step -from zenml.enums import ArtifactType -from zenml.io import fileio -from zenml.materializers.base_materializer import BaseMaterializer from scripts.download import download_from_hub from steps.utils import get_huggingface_access_token -class DirectoryMaterializer(BaseMaterializer): - ASSOCIATED_TYPES: ClassVar[Tuple[Type[Any], ...]] = (Path,) - ASSOCIATED_ARTIFACT_TYPE: ClassVar[ArtifactType] = ArtifactType.DATA - - def load(self, data_type: Type[Any]) -> Any: - """Write logic here to load the data of an artifact. - - Args: - data_type: What type the artifact data should be loaded as. - - Returns: - """ - directory = mkdtemp(prefix="zenml-artifact") - self._copy_directory(src=self.uri, dst=directory) - return Path(directory) - - def save(self, data: Any) -> None: - """Write logic here to save the data of an artifact. - - Args: - data: The data of the artifact to save. - """ - assert isinstance(data, Path) - self._copy_directory(src=str(data), dst=self.uri) - - @staticmethod - def _copy_directory(src: str, dst: str) -> None: - for src_dir, _, files in fileio.walk(src): - dst_dir = os.path.join(dst, os.path.relpath(src_dir, src)) - fileio.makedirs(dst_dir) - - for file in files: - src_file = os.path.join(src_dir, file) - dst_file = os.path.join(dst_dir, file) - fileio.copy(src_file, dst_file) - - class FeatureEngineeringParameters(BaseModel): + """Parameters for the feature engineering step.""" + model_repo: str dataset_name: str diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py index fccde8b2..84ea749f 100644 --- a/llm-lora-finetuning/steps/finetune.py +++ b/llm-lora-finetuning/steps/finetune.py @@ -39,6 +39,8 @@ class DataParameters(BaseModel): + """Data preprocessing parameters.""" + seed: int = 42 test_split_fraction: float = 0.03865 mask_inputs: bool = False @@ -47,6 +49,8 @@ class DataParameters(BaseModel): class TrainingParameters(BaseModel): + """Training parameters.""" + save_interval: int = 1000 log_interval: int = 1 global_batch_size: int = 64 @@ -66,12 +70,16 @@ class TrainingParameters(BaseModel): class EvalParameters(BaseModel): + """Mid-training evaluation parameters.""" + interval: int = 100 max_new_tokens: int = 100 max_iters: int = 100 class FinetuningParameters(BaseModel): + """Parameters for the finetuning step.""" + base_model_repo: str data_dir: Optional[Path] = None diff --git a/llm-lora-finetuning/steps/merge.py b/llm-lora-finetuning/steps/merge.py index 7bf568a2..8f454fef 100644 --- a/llm-lora-finetuning/steps/merge.py +++ b/llm-lora-finetuning/steps/merge.py @@ -32,6 +32,8 @@ class MergeParameters(BaseModel): + """Parameters for the merging step.""" + base_model_repo: str adapter_repo: str output_repo: str diff --git a/llm-lora-finetuning/steps/params.py b/llm-lora-finetuning/steps/params.py index a04e950c..39d09139 100644 --- a/llm-lora-finetuning/steps/params.py +++ b/llm-lora-finetuning/steps/params.py @@ -18,6 +18,8 @@ class LoraParameters(BaseModel): + """Lora specific parameters.""" + lora_r: int = 8 lora_alpha: int = 16 lora_dropout: float = 0.05 From 5c959895fa76eb871f8575249e526f67f9b96ae3 Mon Sep 17 00:00:00 2001 
From: Michael Schuster Date: Fri, 8 Mar 2024 13:16:11 +0800 Subject: [PATCH 18/26] Small fixes --- .../configs/finetune-mistral-alpaca.yaml | 1 + llm-lora-finetuning/steps/finetune.py | 9 ++--- llm-lora-finetuning/steps/merge.py | 36 ++++++++++--------- 3 files changed, 25 insertions(+), 21 deletions(-) diff --git a/llm-lora-finetuning/configs/finetune-mistral-alpaca.yaml b/llm-lora-finetuning/configs/finetune-mistral-alpaca.yaml index d7ae3244..3cd7516f 100644 --- a/llm-lora-finetuning/configs/finetune-mistral-alpaca.yaml +++ b/llm-lora-finetuning/configs/finetune-mistral-alpaca.yaml @@ -11,6 +11,7 @@ steps: parameters: config: base_model_repo: mistralai/Mistral-7B-Instruct-v0.1 + precision: bf16-true training: save_interval: 1 epochs: 1 # 5 diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py index 84ea749f..668fbc9c 100644 --- a/llm-lora-finetuning/steps/finetune.py +++ b/llm-lora-finetuning/steps/finetune.py @@ -135,6 +135,7 @@ def finetune(config: FinetuningParameters) -> None: data_dir = config.data_dir else: data_dir = Path("data/alpaca") + dataset_name = data_dir.name prepare( destination_path=data_dir, checkpoint_dir=checkpoint_dir, @@ -146,12 +147,11 @@ def finetune(config: FinetuningParameters) -> None: ) model_name = checkpoint_dir.name - dataset_name = data_dir.name log_model_metadata( metadata={"model_name": model_name, "dataset_name": dataset_name} ) - output_dir = Path("output/lora") / dataset_name + output_dir = Path("output/lora") / dataset_name / model_name io_args = IOArgs( train_data_dir=data_dir, @@ -172,7 +172,7 @@ def finetune(config: FinetuningParameters) -> None: ) if config.merged_output_repo: - lora_path = output_dir / model_name / "lit_model_lora_finetuned.pth" + lora_path = output_dir / "lit_model_lora_finetuned.pth" merge_output_dir = ( Path("output/lora_merged") / dataset_name / model_name @@ -194,6 +194,7 @@ def finetune(config: FinetuningParameters) -> None: upload_dir = ( Path("output/lora_merged_hf") / dataset_name / model_name ) + upload_dir.mkdir(exist_ok=True) convert_lit_checkpoint( checkpoint_path=config.merged_output_repo / "lit_model.pth", output_path=output_dir, @@ -217,7 +218,7 @@ def finetune(config: FinetuningParameters) -> None: if config.adapter_output_repo: commit = upload_folder( repo_id=config.adapter_output_repo, - folder_path=output_dir / model_name, + folder_path=output_dir, token=access_token, ) log_model_metadata( diff --git a/llm-lora-finetuning/steps/merge.py b/llm-lora-finetuning/steps/merge.py index 8f454fef..6d5c5707 100644 --- a/llm-lora-finetuning/steps/merge.py +++ b/llm-lora-finetuning/steps/merge.py @@ -17,7 +17,7 @@ from pathlib import Path from typing import Optional -from huggingface_hub import upload_folder +from huggingface_hub import snapshot_download, upload_folder from pydantic import BaseModel from zenml import log_model_metadata, step @@ -52,31 +52,31 @@ def merge(config: MergeParameters) -> None: """ access_token = get_huggingface_access_token() - base_model_dir = Path("checkpoints") - adapter_dir = Path("adapter") + checkpoint_root_dir = Path("checkpoints") merged_dir = Path("merged") + base_model_dir = checkpoint_root_dir / config.base_model_repo + adapter_dir = Path("adapters") / config.adapter_repo download_from_hub( repo_id=config.base_model_repo, - checkpoint_dir=base_model_dir, - access_token=access_token, - ) - download_from_hub( - repo_id=config.adapter_repo, - checkpoint_dir=adapter_dir, + checkpoint_dir=checkpoint_root_dir, access_token=access_token, ) - 
convert_to_lit_checkpoint_if_necessary( - checkpoint_dir=base_model_dir / config.model_repo + snapshot_download( + config.adapter_repo, + local_dir=adapter_dir, + local_dir_use_symlinks=False, + resume_download=True, + token=access_token, ) - lora_path = ( - adapter_dir / config.adapter_repo / "lit_model_lora_finetuned.pth" - ) + convert_to_lit_checkpoint_if_necessary(checkpoint_dir=base_model_dir) + + lora_path = adapter_dir / "lit_model_lora_finetuned.pth" merge_lora( lora_path=Path(lora_path), - checkpoint_dir=base_model_dir / config.base_model_repo, + checkpoint_dir=base_model_dir, out_dir=merged_dir, precision=config.precision, **config.lora.dict() @@ -84,11 +84,13 @@ def merge(config: MergeParameters) -> None: for path in Path(base_model_dir).glob("*.json"): destination = Path(merged_dir) / path.name - shutil.copy(src=path, dst=destination) if config.convert_to_hf_checkpoint: - output_dir = Path("lora_merged_hf") + model_name = base_model_dir.name + + output_dir = Path("output/lora_merged_hf") / model_name + output_dir.mkdir(exist_ok=True) convert_lit_checkpoint( checkpoint_path=merged_dir / "lit_model.pth", config_path=merged_dir / "lit_config.json", From 420775e153b5f81c60505f6b81cd3dcbfee218fa Mon Sep 17 00:00:00 2001 From: Michael Schuster Date: Fri, 8 Mar 2024 14:36:14 +0800 Subject: [PATCH 19/26] More small changes --- llm-lora-finetuning/steps/evaluate.py | 23 ++++++++++++----------- llm-lora-finetuning/steps/finetune.py | 5 ++--- llm-lora-finetuning/steps/merge.py | 4 ++-- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/llm-lora-finetuning/steps/evaluate.py b/llm-lora-finetuning/steps/evaluate.py index 15072007..c46a8017 100644 --- a/llm-lora-finetuning/steps/evaluate.py +++ b/llm-lora-finetuning/steps/evaluate.py @@ -20,6 +20,7 @@ import torch from evaluate.lm_eval_harness import run_eval_harness +from huggingface_hub import snapshot_download from pydantic import BaseModel from zenml import step @@ -80,25 +81,26 @@ def evaluate( access_token = get_huggingface_access_token() - model_dir = Path("model") + model_root_dir = Path("model") download_from_hub( repo_id=config.model_repo, - checkpoint_dir=model_dir, + checkpoint_dir=model_root_dir, access_token=access_token, ) + model_dir = model_root_dir / config.model_repo - convert_to_lit_checkpoint_if_necessary( - checkpoint_dir=model_dir / config.model_repo - ) + convert_to_lit_checkpoint_if_necessary(checkpoint_dir=model_dir) if config.adapter_repo: - adapter_dir = Path("adapter") + adapter_dir = Path("adapters") / config.adapter_repo merged_dir = Path("merged") - download_from_hub( - repo_id=config.adapter_repo, - checkpoint_dir=adapter_dir, - access_token=access_token, + snapshot_download( + config.adapter_repo, + local_dir=adapter_dir, + local_dir_use_symlinks=False, + resume_download=True, + token=access_token, ) lora_path = adapter_dir / "lit_model_lora_finetuned.pth" @@ -112,7 +114,6 @@ def evaluate( for path in Path(model_dir).glob("*.json"): destination = Path(merged_dir) / path.name - shutil.copy(src=path, dst=destination) model_dir = merged_dir diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py index 668fbc9c..c9b695e6 100644 --- a/llm-lora-finetuning/steps/finetune.py +++ b/llm-lora-finetuning/steps/finetune.py @@ -187,18 +187,17 @@ def finetune(config: FinetuningParameters) -> None: for path in Path(checkpoint_dir).glob("*.json"): destination = Path(merge_output_dir) / path.name - shutil.copy(src=path, dst=destination) if config.convert_to_hf_checkpoint: 
upload_dir = ( Path("output/lora_merged_hf") / dataset_name / model_name ) - upload_dir.mkdir(exist_ok=True) + upload_dir.mkdir(parents=True, exist_ok=True) convert_lit_checkpoint( checkpoint_path=config.merged_output_repo / "lit_model.pth", - output_path=output_dir, config_path=config.merged_output_repo / "lit_config.json", + output_path=upload_dir / "pytorch_model", ) else: upload_dir = merge_output_dir diff --git a/llm-lora-finetuning/steps/merge.py b/llm-lora-finetuning/steps/merge.py index 6d5c5707..24d8dcf7 100644 --- a/llm-lora-finetuning/steps/merge.py +++ b/llm-lora-finetuning/steps/merge.py @@ -90,11 +90,11 @@ def merge(config: MergeParameters) -> None: model_name = base_model_dir.name output_dir = Path("output/lora_merged_hf") / model_name - output_dir.mkdir(exist_ok=True) + output_dir.mkdir(parents=True, exist_ok=True) convert_lit_checkpoint( checkpoint_path=merged_dir / "lit_model.pth", config_path=merged_dir / "lit_config.json", - output_path=output_dir, + output_path=output_dir / "pytorch_model", ) else: output_dir = merged_dir From f9135b6d61e51dc5a6688550ad8bb5b7d01e61ad Mon Sep 17 00:00:00 2001 From: Michael Schuster Date: Fri, 8 Mar 2024 14:38:38 +0800 Subject: [PATCH 20/26] Fix import --- llm-lora-finetuning/run.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llm-lora-finetuning/run.py b/llm-lora-finetuning/run.py index cd1a71a6..297d2c0c 100644 --- a/llm-lora-finetuning/run.py +++ b/llm-lora-finetuning/run.py @@ -105,22 +105,22 @@ def main( pipeline_args["config_path"] = os.path.join(config_folder, config) if feature_pipeline: - from pipelines.feature_engineering import feature_engineering_pipeline + from pipelines import feature_engineering_pipeline feature_engineering_pipeline.with_options(**pipeline_args)() if finetuning_pipeline: - from pipelines.finetuning import finetuning_pipeline + from pipelines import finetuning_pipeline finetuning_pipeline.with_options(**pipeline_args)() if merging_pipeline: - from pipelines.merge import merge_pipeline + from pipelines import merge_pipeline merge_pipeline.with_options(**pipeline_args)() if eval_pipeline: - from pipelines.eval import eval_pipeline + from pipelines import eval_pipeline eval_pipeline.with_options(**pipeline_args)() From bc43519e7e6dba364cc2460f7acf3057fba8ce30 Mon Sep 17 00:00:00 2001 From: Michael Schuster Date: Fri, 8 Mar 2024 16:07:16 +0800 Subject: [PATCH 21/26] Update configs --- llm-lora-finetuning/configs/eval-mistral.yaml | 15 ++++++ llm-lora-finetuning/configs/eval.yaml | 7 --- .../configs/feature-mistral-alpaca.yaml | 15 ++++++ llm-lora-finetuning/configs/feature.yaml | 7 --- .../configs/finetune-mistral-alpaca.yaml | 1 + .../configs/merge-mistral.yaml | 16 +++++++ llm-lora-finetuning/configs/merge.yaml | 7 --- llm-lora-finetuning/pipelines/finetuning.py | 17 +++++-- llm-lora-finetuning/steps/finetune.py | 47 ++++++++++++------- 9 files changed, 92 insertions(+), 40 deletions(-) create mode 100644 llm-lora-finetuning/configs/eval-mistral.yaml delete mode 100644 llm-lora-finetuning/configs/eval.yaml create mode 100644 llm-lora-finetuning/configs/feature-mistral-alpaca.yaml delete mode 100644 llm-lora-finetuning/configs/feature.yaml create mode 100644 llm-lora-finetuning/configs/merge-mistral.yaml delete mode 100644 llm-lora-finetuning/configs/merge.yaml diff --git a/llm-lora-finetuning/configs/eval-mistral.yaml b/llm-lora-finetuning/configs/eval-mistral.yaml new file mode 100644 index 00000000..ecd022f3 --- /dev/null +++ b/llm-lora-finetuning/configs/eval-mistral.yaml @@ 
-0,0 +1,15 @@ +model: + name: mistral-7b-lora + description: "Fine-tune `mistralai/Mistral-7B-Instruct-v0.1`." + tags: + - llm + - lora + - mistral + +steps: + evaluate: + parameters: + config: + model_repo: mistralai/Mistral-7B-Instruct-v0.1 + adapter_repo: ... + precision: bf16-true \ No newline at end of file diff --git a/llm-lora-finetuning/configs/eval.yaml b/llm-lora-finetuning/configs/eval.yaml deleted file mode 100644 index f34ed8fb..00000000 --- a/llm-lora-finetuning/configs/eval.yaml +++ /dev/null @@ -1,7 +0,0 @@ -model: - name: mistral-7b-lora - description: "Fine-tune `mistralai/Mistral-7B-Instruct-v0.1`." - tags: - - llm - - lora - - mistral \ No newline at end of file diff --git a/llm-lora-finetuning/configs/feature-mistral-alpaca.yaml b/llm-lora-finetuning/configs/feature-mistral-alpaca.yaml new file mode 100644 index 00000000..d7b5e99b --- /dev/null +++ b/llm-lora-finetuning/configs/feature-mistral-alpaca.yaml @@ -0,0 +1,15 @@ +model: + name: mistral-7b-lora + description: "Fine-tune `mistralai/Mistral-7B-Instruct-v0.1`." + tags: + - llm + - lora + - mistral + - alpaca + +steps: + feature_engineering: + parameters: + config: + model_repo: mistralai/Mistral-7B-Instruct-v0.1 + dataset_name: alpaca diff --git a/llm-lora-finetuning/configs/feature.yaml b/llm-lora-finetuning/configs/feature.yaml deleted file mode 100644 index f34ed8fb..00000000 --- a/llm-lora-finetuning/configs/feature.yaml +++ /dev/null @@ -1,7 +0,0 @@ -model: - name: mistral-7b-lora - description: "Fine-tune `mistralai/Mistral-7B-Instruct-v0.1`." - tags: - - llm - - lora - - mistral \ No newline at end of file diff --git a/llm-lora-finetuning/configs/finetune-mistral-alpaca.yaml b/llm-lora-finetuning/configs/finetune-mistral-alpaca.yaml index 3cd7516f..5d020ec5 100644 --- a/llm-lora-finetuning/configs/finetune-mistral-alpaca.yaml +++ b/llm-lora-finetuning/configs/finetune-mistral-alpaca.yaml @@ -5,6 +5,7 @@ model: - llm - lora - mistral + - alpaca steps: finetune: diff --git a/llm-lora-finetuning/configs/merge-mistral.yaml b/llm-lora-finetuning/configs/merge-mistral.yaml new file mode 100644 index 00000000..136f681f --- /dev/null +++ b/llm-lora-finetuning/configs/merge-mistral.yaml @@ -0,0 +1,16 @@ +model: + name: mistral-7b-lora + description: "Fine-tune `mistralai/Mistral-7B-Instruct-v0.1`." + tags: + - llm + - lora + - mistral + +steps: + merge: + parameters: + config: + base_model_repo: mistralai/Mistral-7B-Instruct-v0.1 + adapter_repo: ... + output_repo: ... + precision: bf16-true \ No newline at end of file diff --git a/llm-lora-finetuning/configs/merge.yaml b/llm-lora-finetuning/configs/merge.yaml deleted file mode 100644 index f34ed8fb..00000000 --- a/llm-lora-finetuning/configs/merge.yaml +++ /dev/null @@ -1,7 +0,0 @@ -model: - name: mistral-7b-lora - description: "Fine-tune `mistralai/Mistral-7B-Instruct-v0.1`." - tags: - - llm - - lora - - mistral \ No newline at end of file diff --git a/llm-lora-finetuning/pipelines/finetuning.py b/llm-lora-finetuning/pipelines/finetuning.py index e27c55e5..23595e53 100644 --- a/llm-lora-finetuning/pipelines/finetuning.py +++ b/llm-lora-finetuning/pipelines/finetuning.py @@ -14,12 +14,23 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from typing import Optional + from steps import finetune -from zenml import pipeline +from zenml import get_pipeline_context, pipeline from zenml.config import DockerSettings @pipeline(settings={"docker": DockerSettings(requirements="requirements.txt")}) -def finetuning_pipeline() -> None: +def finetuning_pipeline( + dataset_artifact_name: Optional[str] = None, + dataset_artifact_version: Optional[str] = None, +) -> None: """Pipeline to finetune LLMs using LoRA.""" - finetune() + dataset_directory = None + if dataset_artifact_name: + dataset_directory = get_pipeline_context().model.get_artifact( + name=dataset_artifact_name, version=dataset_artifact_version + ) + + finetune(dataset_directory=dataset_directory) diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py index c9b695e6..c95b3d4b 100644 --- a/llm-lora-finetuning/steps/finetune.py +++ b/llm-lora-finetuning/steps/finetune.py @@ -15,14 +15,15 @@ # limitations under the License. import shutil from pathlib import Path -from typing import Literal, Optional +from typing import Annotated, Literal, Optional import torch from finetune.lora import setup from huggingface_hub import upload_folder from lit_gpt.args import EvalArgs, IOArgs, TrainArgs +from materializers.directory_materializer import DirectoryMaterializer from pydantic import BaseModel -from zenml import log_model_metadata, step +from zenml import get_step_context, log_model_metadata, step from zenml.logger import get_logger from scripts.convert_lit_checkpoint import convert_lit_checkpoint @@ -81,7 +82,6 @@ class FinetuningParameters(BaseModel): """Parameters for the finetuning step.""" base_model_repo: str - data_dir: Optional[Path] = None adapter_output_repo: Optional[str] = None merged_output_repo: Optional[str] = None @@ -104,8 +104,10 @@ class FinetuningParameters(BaseModel): lora: LoraParameters = LoraParameters() -@step -def finetune(config: FinetuningParameters) -> None: +@step(output_materializers=DirectoryMaterializer) +def finetune( + config: FinetuningParameters, dataset_directory: Optional[Path] = None +) -> Annotated[Optional[Path], "adapter"]: """Finetune model using LoRA. 
Args: @@ -131,13 +133,21 @@ def finetune(config: FinetuningParameters) -> None: convert_to_lit_checkpoint_if_necessary(checkpoint_dir=checkpoint_dir) - if config.data_dir: - data_dir = config.data_dir + if dataset_directory: + try: + dataset_name = ( + get_step_context() + .inputs["data_dir"] + .run_metadata["dataset_name"] + .value + ) + except KeyError: + dataset_name = "unknown_dataset" else: - data_dir = Path("data/alpaca") - dataset_name = data_dir.name + dataset_directory = Path("data/alpaca") + dataset_name = dataset_directory.name prepare( - destination_path=data_dir, + destination_path=dataset_directory, checkpoint_dir=checkpoint_dir, test_split_fraction=config.data.test_split_fraction, seed=config.data.seed, @@ -151,13 +161,13 @@ def finetune(config: FinetuningParameters) -> None: log_model_metadata( metadata={"model_name": model_name, "dataset_name": dataset_name} ) - output_dir = Path("output/lora") / dataset_name / model_name + adapter_output_dir = Path("output/lora") / dataset_name / model_name io_args = IOArgs( - train_data_dir=data_dir, - val_data_dir=data_dir, + train_data_dir=dataset_directory, + val_data_dir=dataset_directory, checkpoint_dir=checkpoint_dir, - out_dir=output_dir, + out_dir=adapter_output_dir, ) train_args = TrainArgs(**config.training.dict()) eval_args = EvalArgs(**config.eval.dict()) @@ -172,7 +182,7 @@ def finetune(config: FinetuningParameters) -> None: ) if config.merged_output_repo: - lora_path = output_dir / "lit_model_lora_finetuned.pth" + lora_path = adapter_output_dir / "lit_model_lora_finetuned.pth" merge_output_dir = ( Path("output/lora_merged") / dataset_name / model_name @@ -217,7 +227,7 @@ def finetune(config: FinetuningParameters) -> None: if config.adapter_output_repo: commit = upload_folder( repo_id=config.adapter_output_repo, - folder_path=output_dir, + folder_path=adapter_output_dir, token=access_token, ) log_model_metadata( @@ -226,3 +236,8 @@ def finetune(config: FinetuningParameters) -> None: "adapter_huggingface_commit_url": commit.commit_url, } ) + return None + else: + # If the adapter should not be uploaded to the HF Hub, we store it + # in the artifact store + return adapter_output_dir From 0393acb6c76865c5ae48aeac58fca7bdc53cdf89 Mon Sep 17 00:00:00 2001 From: Michael Schuster Date: Fri, 8 Mar 2024 17:28:58 +0800 Subject: [PATCH 22/26] Fix overwrite --- llm-lora-finetuning/steps/feature_engineering.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llm-lora-finetuning/steps/feature_engineering.py b/llm-lora-finetuning/steps/feature_engineering.py index 6a6cdad4..c1eee9eb 100644 --- a/llm-lora-finetuning/steps/feature_engineering.py +++ b/llm-lora-finetuning/steps/feature_engineering.py @@ -59,15 +59,15 @@ def feature_engineering( checkpoint_dir = checkpoint_root_dir / config.model_repo model_name = checkpoint_dir.name - config = Config.from_name(model_name) - config_dict = asdict(config) + lit_config = Config.from_name(model_name) + lit_config_dict = asdict(lit_config) with open(checkpoint_dir / "lit_config.json", "w") as json_config: - json.dump(config_dict, json_config) + json.dump(lit_config_dict, json_config) log_artifact_metadata( metadata={ "model_name": model_name, - "model_config": config_dict, + "model_config": lit_config_dict, "dataset_name": config.dataset_name, } ) From b07796c15c0286f36a14df326b7adcbcdd529d72 Mon Sep 17 00:00:00 2001 From: Michael Schuster Date: Fri, 8 Mar 2024 18:17:28 +0800 Subject: [PATCH 23/26] More local caching --- .../finetune-mistral-from-dataset.yaml | 23 
++++++++++++ llm-lora-finetuning/steps/evaluate.py | 35 ++++++++++++------- .../steps/feature_engineering.py | 2 +- llm-lora-finetuning/steps/merge.py | 25 ++++++++----- 4 files changed, 63 insertions(+), 22 deletions(-) create mode 100644 llm-lora-finetuning/configs/finetune-mistral-from-dataset.yaml diff --git a/llm-lora-finetuning/configs/finetune-mistral-from-dataset.yaml b/llm-lora-finetuning/configs/finetune-mistral-from-dataset.yaml new file mode 100644 index 00000000..cbe7b470 --- /dev/null +++ b/llm-lora-finetuning/configs/finetune-mistral-from-dataset.yaml @@ -0,0 +1,23 @@ +model: + name: mistral-7b-lora + description: "Fine-tune `mistralai/Mistral-7B-Instruct-v0.1`." + tags: + - llm + - lora + - mistral + +steps: + finetune: + parameters: + config: + base_model_repo: mistralai/Mistral-7B-Instruct-v0.1 + precision: bf16-true + training: + save_interval: 1 + epochs: 1 # 5 + epoch_size: 1 # 50000 + global_batch_size: 128 + learning_rate: 3e-4 + +parameters: + dataset_artifact_name: dataset \ No newline at end of file diff --git a/llm-lora-finetuning/steps/evaluate.py b/llm-lora-finetuning/steps/evaluate.py index c46a8017..299bc779 100644 --- a/llm-lora-finetuning/steps/evaluate.py +++ b/llm-lora-finetuning/steps/evaluate.py @@ -23,6 +23,7 @@ from huggingface_hub import snapshot_download from pydantic import BaseModel from zenml import step +from zenml.logger import get_logger from scripts.download import download_from_hub from scripts.merge_lora import merge_lora @@ -32,6 +33,8 @@ get_huggingface_access_token, ) +logger = get_logger(__file__) + class EvaluationParameters(BaseModel): """Parameters for the evaluation step. @@ -81,19 +84,25 @@ def evaluate( access_token = get_huggingface_access_token() - model_root_dir = Path("model") - download_from_hub( - repo_id=config.model_repo, - checkpoint_dir=model_root_dir, - access_token=access_token, - ) - model_dir = model_root_dir / config.model_repo + checkpoint_root_dir = Path("checkpoints") + checkpoint_dir = checkpoint_root_dir / config.model_repo + + if checkpoint_dir.exists(): + logger.info( + "Checkpoint directory already exists, skipping download..." 
+ ) + else: + download_from_hub( + repo_id=config.model_repo, + checkpoint_dir=checkpoint_root_dir, + access_token=access_token, + ) - convert_to_lit_checkpoint_if_necessary(checkpoint_dir=model_dir) + convert_to_lit_checkpoint_if_necessary(checkpoint_dir=checkpoint_dir) if config.adapter_repo: adapter_dir = Path("adapters") / config.adapter_repo - merged_dir = Path("merged") + merged_dir = Path("output/merged") snapshot_download( config.adapter_repo, @@ -106,21 +115,21 @@ def evaluate( lora_path = adapter_dir / "lit_model_lora_finetuned.pth" merge_lora( lora_path=Path(lora_path), - checkpoint_dir=model_dir, + checkpoint_dir=checkpoint_dir, out_dir=merged_dir, precision=config.precision, **config.lora.dict() ) - for path in Path(model_dir).glob("*.json"): + for path in Path(checkpoint_dir).glob("*.json"): destination = Path(merged_dir) / path.name shutil.copy(src=path, dst=destination) - model_dir = merged_dir + checkpoint_dir = merged_dir output_path = Path("output.json") run_eval_harness( - checkpoint_dir=model_dir, + checkpoint_dir=checkpoint_dir, save_filepath=output_path, **config.dict(exclude={"model_repo", "adapter_repo", "lora"}) ) diff --git a/llm-lora-finetuning/steps/feature_engineering.py b/llm-lora-finetuning/steps/feature_engineering.py index c1eee9eb..ff5d6a09 100644 --- a/llm-lora-finetuning/steps/feature_engineering.py +++ b/llm-lora-finetuning/steps/feature_engineering.py @@ -40,7 +40,7 @@ class FeatureEngineeringParameters(BaseModel): @step(output_materializers=DirectoryMaterializer) def feature_engineering( config: FeatureEngineeringParameters, -) -> Annotated[Path, "data"]: +) -> Annotated[Path, "dataset"]: """Prepare the dataset. Args: diff --git a/llm-lora-finetuning/steps/merge.py b/llm-lora-finetuning/steps/merge.py index 24d8dcf7..389c0276 100644 --- a/llm-lora-finetuning/steps/merge.py +++ b/llm-lora-finetuning/steps/merge.py @@ -20,6 +20,7 @@ from huggingface_hub import snapshot_download, upload_folder from pydantic import BaseModel from zenml import log_model_metadata, step +from zenml.logger import get_logger from scripts.convert_lit_checkpoint import convert_lit_checkpoint from scripts.download import download_from_hub @@ -30,6 +31,8 @@ get_huggingface_access_token, ) +logger = get_logger(__file__) + class MergeParameters(BaseModel): """Parameters for the merging step.""" @@ -53,15 +56,21 @@ def merge(config: MergeParameters) -> None: access_token = get_huggingface_access_token() checkpoint_root_dir = Path("checkpoints") - merged_dir = Path("merged") base_model_dir = checkpoint_root_dir / config.base_model_repo adapter_dir = Path("adapters") / config.adapter_repo - download_from_hub( - repo_id=config.base_model_repo, - checkpoint_dir=checkpoint_root_dir, - access_token=access_token, - ) + if base_model_dir.exists(): + logger.info( + "Checkpoint directory already exists, skipping download..." 
+ ) + else: + download_from_hub( + repo_id=config.base_model_repo, + checkpoint_dir=checkpoint_root_dir, + access_token=access_token, + ) + + convert_to_lit_checkpoint_if_necessary(checkpoint_dir=base_model_dir) snapshot_download( config.adapter_repo, @@ -71,9 +80,9 @@ def merge(config: MergeParameters) -> None: token=access_token, ) - convert_to_lit_checkpoint_if_necessary(checkpoint_dir=base_model_dir) - lora_path = adapter_dir / "lit_model_lora_finetuned.pth" + merged_dir = Path("output/merged") + merge_lora( lora_path=Path(lora_path), checkpoint_dir=base_model_dir, From 920a14fd7f64b7f396d77db0a8559a6beb96e3ac Mon Sep 17 00:00:00 2001 From: Michael Schuster Date: Fri, 8 Mar 2024 18:33:17 +0800 Subject: [PATCH 24/26] Add additional materializer --- llm-lora-finetuning/steps/finetune.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py index c95b3d4b..0ee08d92 100644 --- a/llm-lora-finetuning/steps/finetune.py +++ b/llm-lora-finetuning/steps/finetune.py @@ -25,6 +25,7 @@ from pydantic import BaseModel from zenml import get_step_context, log_model_metadata, step from zenml.logger import get_logger +from zenml.materializers import BuiltInMaterializer from scripts.convert_lit_checkpoint import convert_lit_checkpoint from scripts.download import download_from_hub @@ -104,7 +105,7 @@ class FinetuningParameters(BaseModel): lora: LoraParameters = LoraParameters() -@step(output_materializers=DirectoryMaterializer) +@step(output_materializers=[DirectoryMaterializer, BuiltInMaterializer]) def finetune( config: FinetuningParameters, dataset_directory: Optional[Path] = None ) -> Annotated[Optional[Path], "adapter"]: From 5222cf406053b8fd2d251f46d6cae998e7cb9039 Mon Sep 17 00:00:00 2001 From: Michael Schuster Date: Fri, 8 Mar 2024 18:45:22 +0800 Subject: [PATCH 25/26] Config updates --- .../configs/finetune-mistral-alpaca.yaml | 4 ++-- .../configs/finetune-mistral-from-dataset.yaml | 16 ++++++---------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/llm-lora-finetuning/configs/finetune-mistral-alpaca.yaml b/llm-lora-finetuning/configs/finetune-mistral-alpaca.yaml index 5d020ec5..4ac1d1f1 100644 --- a/llm-lora-finetuning/configs/finetune-mistral-alpaca.yaml +++ b/llm-lora-finetuning/configs/finetune-mistral-alpaca.yaml @@ -15,7 +15,7 @@ steps: precision: bf16-true training: save_interval: 1 - epochs: 1 # 5 - epoch_size: 1 # 50000 + epochs: 5 + epoch_size: 50000 global_batch_size: 128 learning_rate: 3e-4 diff --git a/llm-lora-finetuning/configs/finetune-mistral-from-dataset.yaml b/llm-lora-finetuning/configs/finetune-mistral-from-dataset.yaml index cbe7b470..6ca4ab3a 100644 --- a/llm-lora-finetuning/configs/finetune-mistral-from-dataset.yaml +++ b/llm-lora-finetuning/configs/finetune-mistral-from-dataset.yaml @@ -1,10 +1,9 @@ +parameters: + dataset_artifact_name: dataset + model: name: mistral-7b-lora - description: "Fine-tune `mistralai/Mistral-7B-Instruct-v0.1`." 
- tags: - - llm - - lora - - mistral + version: latest  steps: finetune: @@ -14,10 +13,7 @@ steps: precision: bf16-true training: save_interval: 1 - epochs: 1 # 5 - epoch_size: 1 # 50000 + epochs: 5 + epoch_size: 50000 global_batch_size: 128 learning_rate: 3e-4 - -parameters: - dataset_artifact_name: dataset \ No newline at end of file From f6643bc45b6383eceb49b87013e8c175483ffccc Mon Sep 17 00:00:00 2001 From: Michael Schuster Date: Mon, 11 Mar 2024 13:41:11 +0800 Subject: [PATCH 26/26] Finish readme --- llm-lora-finetuning/README.md | 54 +++++++++++++++++++ .../configs/finetune-mistral-alpaca.yaml | 2 + .../finetune-mistral-from-dataset.yaml | 2 + llm-lora-finetuning/run.py | 6 ++- 4 files changed, 62 insertions(+), 2 deletions(-) diff --git a/llm-lora-finetuning/README.md b/llm-lora-finetuning/README.md index e69de29b..03b71ee1 100644 --- a/llm-lora-finetuning/README.md +++ b/llm-lora-finetuning/README.md @@ -0,0 +1,54 @@ +# ☮️ Fine-tuning open source LLMs using MLOps pipelines + +The goal of this project is to use [ZenML](https://github.com/zenml-io/zenml) to write reusable MLOps pipelines to fine-tune various open source LLMs. + +Using these pipelines, we can run data preparation and model finetuning with a single command while using YAML files for [configuration](https://docs.zenml.io/user-guide/production-guide/configure-pipeline) and letting ZenML take care of tracking our metadata and [containerizing our pipelines](https://docs.zenml.io/user-guide/advanced-guide/infrastructure-management/containerize-your-pipeline). + +## :earth_americas: Inspiration and Credit + +This project heavily relies on the [Lit-GPT project](https://github.com/Lightning-AI/litgpt) of the amazing people at Lightning AI. We used [this blogpost](https://lightning.ai/pages/community/lora-insights/#toc14) to get started with LoRA and QLoRA and modified the commands they recommend to make them work using ZenML. + +## 🏃 How to run + +In this repository we provide a few predefined configuration files for finetuning the [Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) model on the [Alpaca](https://huggingface.co/datasets/tatsu-lab/alpaca) dataset. You can change both the base model and dataset by modifying the configuration files. + +If you want to push any of your finetuned adapters or merged models to the Hugging Face Hub, you will need to register a secret with your Hugging Face access token as follows: +```shell +zenml secret create huggingface_credentials --token= +``` + +### Combined feature engineering and finetuning pipeline + +The easiest way to get started with just a single command is to run the finetuning pipeline with the `finetune-mistral-alpaca.yaml` configuration file, which will do both feature engineering and finetuning: + +```shell +python run.py --finetuning-pipeline --config finetune-mistral-alpaca.yaml +``` + +When running the pipeline like this, the trained adapter will be stored in the ZenML artifact store. You can optionally upload the adapter, the merged model, or both by specifying the `adapter_output_repo` and `merged_output_repo` parameters in the configuration file. + + +### Evaluation pipeline + +Before running this pipeline, you will need to fill in the `adapter_repo` parameter in the `eval-mistral.yaml` configuration file. This should point to a Hugging Face repository that contains the finetuned adapter you got by running the finetuning pipeline.
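For illustration, the relevant section of a filled-in `eval-mistral.yaml` might look roughly like the sketch below, following the structure of the config file introduced earlier in this patch series; the repository under `adapter_repo` is a hypothetical placeholder, not a real repository:

```yaml
steps:
  evaluate:
    parameters:
      config:
        model_repo: mistralai/Mistral-7B-Instruct-v0.1
        # hypothetical placeholder: use the repo your adapter was uploaded to
        adapter_repo: my-username/mistral-7b-lora-alpaca-adapter
        precision: bf16-true
```

With the adapter repository filled in, the evaluation pipeline can then be run as shown below: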
+ +```shell +python run.py --eval-pipeline --config eval-mistral.yaml +``` + +### Merging pipeline + +If you have trained an adapter using the finetuning pipeline, you can merge it with the base model by filling in the `adapter_repo` and `output_repo` parameters in the `merge-mistral.yaml` file, and then running: + +```shell +python run.py --merge-pipeline --config merge-mistral.yaml +``` + +### Feature engineering followed by finetuning + +If you want to finetune your model on a different dataset, you can do so by running the feature engineering pipeline followed by the finetuning pipeline. To define your dataset, take a look at the `scripts/prepare_*` scripts and set the dataset name in the `feature-mistral-alpaca.yaml` config file. + +```shell +python run.py --feature-pipeline --config feature-mistral-alpaca.yaml +python run.py --finetuning-pipeline --config finetune-mistral-from-dataset.yaml +``` diff --git a/llm-lora-finetuning/configs/finetune-mistral-alpaca.yaml b/llm-lora-finetuning/configs/finetune-mistral-alpaca.yaml index 4ac1d1f1..d643c52b 100644 --- a/llm-lora-finetuning/configs/finetune-mistral-alpaca.yaml +++ b/llm-lora-finetuning/configs/finetune-mistral-alpaca.yaml @@ -13,6 +13,8 @@ steps: config: base_model_repo: mistralai/Mistral-7B-Instruct-v0.1 precision: bf16-true + # merged_output_repo: + # adapter_output_repo: training: save_interval: 1 epochs: 5 diff --git a/llm-lora-finetuning/configs/finetune-mistral-from-dataset.yaml b/llm-lora-finetuning/configs/finetune-mistral-from-dataset.yaml index 6ca4ab3a..9cb8da93 100644 --- a/llm-lora-finetuning/configs/finetune-mistral-from-dataset.yaml +++ b/llm-lora-finetuning/configs/finetune-mistral-from-dataset.yaml @@ -11,6 +11,8 @@ steps: config: base_model_repo: mistralai/Mistral-7B-Instruct-v0.1 precision: bf16-true + # merged_output_repo: + # adapter_output_repo: training: save_interval: 1 epochs: 5 diff --git a/llm-lora-finetuning/run.py b/llm-lora-finetuning/run.py index 297d2c0c..e6c62870 100644 --- a/llm-lora-finetuning/run.py +++ b/llm-lora-finetuning/run.py @@ -101,8 +101,10 @@ def main( "configs", ) pipeline_args = {"enable_cache": not no_cache} - if config: - pipeline_args["config_path"] = os.path.join(config_folder, config) + if not config: + raise RuntimeError("Config file is required to run a pipeline.") + + pipeline_args["config_path"] = os.path.join(config_folder, config) if feature_pipeline: from pipelines import feature_engineering_pipeline
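As a usage sketch of the `run.py` change above: every pipeline invocation now requires a `--config` argument, and the file name is resolved relative to the `configs` folder. The error message below is taken verbatim from the new `RuntimeError`; the exact invocation is illustrative:

```shell
# Fails fast instead of running the pipeline without any configuration:
python run.py --finetuning-pipeline
# RuntimeError: Config file is required to run a pipeline.

# Succeeds, reading configs/finetune-mistral-alpaca.yaml:
python run.py --finetuning-pipeline --config finetune-mistral-alpaca.yaml
```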