From 2a2ef8fb0be481ac563339eb5cc60fc60524a70c Mon Sep 17 00:00:01 2001
From: turboderp
Date: Fri, 8 Sep 2023 17:00:01 +0200
Subject: [PATCH] Support for sharded models

---
 README.md        |  11 ++++-
 example_basic.py |   2 +-
 example_batch.py |   2 +-
 example_cfg.py   |   2 +-
 example_flask.py |   2 +-
 example_lora.py  |   2 +-
 example_ws.py    |   2 +-
 model.py         | 103 +++++++++++++++++++++++++----------------------
 model_init.py    |  20 ++++++---
 requirements.txt |   2 +-
 util/shard.py    |  84 ++++++++++++++++++++++++++++++++++++++
 11 files changed, 169 insertions(+), 63 deletions(-)
 create mode 100644 util/shard.py

diff --git a/README.md b/README.md
index 5a4f6307..a1cc713c 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@ have no AMD devices to test or optimize on.
 
 * Python 3.9 or newer
 * `torch` tested on 2.0.1 and 2.1.0 (nightly) with cu118
-* `safetensors` 0.3.1
+* `safetensors` 0.3.2
 * `sentencepiece`
 * `ninja`
 
@@ -190,4 +190,11 @@ Moved the todo list [here](doc/TODO.md).
 
 **2023-01-09**: Added rope_theta parameter for (at least partial) CodeLlama support. If you were using alpha = 97 or
 similar, you would no longer need that for CodeLlama models. Still stuff to sort out regarding the extended
-vocabulary.
\ No newline at end of file
+vocabulary.
+
+**2023-08-09**: Added support for sharded models. `config.model_path` now accepts either a filename or a list of
+filenames. `model_init()` will detect multiple .safetensors files if given a model directory. Note the change in the
+various examples: `model_path = glob.glob(st_pattern)[0]` becomes simply `model_path = glob.glob(st_pattern)`. There is
+also a small script in `util/shard.py` to split large .safetensors files. It produces an index.json file for the
+sharded model as well, just for completeness, although ExLlama doesn't need it to read the shards. Note that the
+**safetensors dependency was bumped to version 0.3.2**.
\ No newline at end of file
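
For context, a minimal loading sketch in the spirit of the updated `example_basic.py` below: only the `model_path` line changes, everything else stays as before. The model directory is hypothetical; imports and calls mirror the existing example scripts.

```python
from model import ExLlama, ExLlamaCache, ExLlamaConfig
from tokenizer import ExLlamaTokenizer
import os, glob

# Hypothetical directory containing config.json, tokenizer.model and one or more .safetensors shards
model_directory = "/mnt/str/models/llama-70b-4bit-sharded/"

tokenizer_path = os.path.join(model_directory, "tokenizer.model")
model_config_path = os.path.join(model_directory, "config.json")
st_pattern = os.path.join(model_directory, "*.safetensors")
model_path = glob.glob(st_pattern)            # now a list of shard files; a single filename still works

config = ExLlamaConfig(model_config_path)     # create config from config.json
config.model_path = model_path                # supply path(s) to the model weights

model = ExLlama(config)                       # weights are read shard by shard
tokenizer = ExLlamaTokenizer(tokenizer_path)
cache = ExLlamaCache(model)
```
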
diff --git a/example_basic.py b/example_basic.py
index e95d0adb..f49ba4f9 100644
--- a/example_basic.py
+++ b/example_basic.py
@@ -12,7 +12,7 @@
 tokenizer_path = os.path.join(model_directory, "tokenizer.model")
 model_config_path = os.path.join(model_directory, "config.json")
 st_pattern = os.path.join(model_directory, "*.safetensors")
-model_path = glob.glob(st_pattern)[0]
+model_path = glob.glob(st_pattern)
 
 # Create config, model, tokenizer and generator
 
diff --git a/example_batch.py b/example_batch.py
index 179cf2f4..66986267 100644
--- a/example_batch.py
+++ b/example_batch.py
@@ -12,7 +12,7 @@
 tokenizer_path = os.path.join(model_directory, "tokenizer.model")
 model_config_path = os.path.join(model_directory, "config.json")
 st_pattern = os.path.join(model_directory, "*.safetensors")
-model_path = glob.glob(st_pattern)[0]
+model_path = glob.glob(st_pattern)
 
 # Batched prompts
 
diff --git a/example_cfg.py b/example_cfg.py
index 53fcbbb6..de5750b4 100644
--- a/example_cfg.py
+++ b/example_cfg.py
@@ -15,7 +15,7 @@
 tokenizer_path = os.path.join(model_directory, "tokenizer.model")
 model_config_path = os.path.join(model_directory, "config.json")
 st_pattern = os.path.join(model_directory, "*.safetensors")
-model_path = glob.glob(st_pattern)[0]
+model_path = glob.glob(st_pattern)
 
 # Create config, model, tokenizer and generator
 
diff --git a/example_flask.py b/example_flask.py
index d83a45b4..b0d79d4a 100644
--- a/example_flask.py
+++ b/example_flask.py
@@ -10,7 +10,7 @@
 tokenizer_path = os.path.join(model_directory, "tokenizer.model")
 model_config_path = os.path.join(model_directory, "config.json")
 st_pattern = os.path.join(model_directory, "*.safetensors")
-model_path = glob.glob(st_pattern)[0]
+model_path = glob.glob(st_pattern)
 
 config = ExLlamaConfig(model_config_path)               # create config from config.json
 config.model_path = model_path                          # supply path to model weights file
diff --git a/example_lora.py b/example_lora.py
index e47c34f5..53bfa61e 100644
--- a/example_lora.py
+++ b/example_lora.py
@@ -18,7 +18,7 @@
 tokenizer_path = os.path.join(model_directory, "tokenizer.model")
 model_config_path = os.path.join(model_directory, "config.json")
 st_pattern = os.path.join(model_directory, "*.safetensors")
-model_path = glob.glob(st_pattern)[0]
+model_path = glob.glob(st_pattern)
 
 lora_config_path = os.path.join(lora_directory, "adapter_config.json")
 lora_path = os.path.join(lora_directory, "adapter_model.bin")
diff --git a/example_ws.py b/example_ws.py
index 0c504ab0..a0a3922e 100644
--- a/example_ws.py
+++ b/example_ws.py
@@ -260,7 +260,7 @@ async def main(websocket, path):
 tokenizer_path = os.path.join(model_directory, "tokenizer.model")
 model_config_path = os.path.join(model_directory, "config.json")
 st_pattern = os.path.join(model_directory, "*.safetensors")
-model_path = glob.glob(st_pattern)[0]
+model_path = glob.glob(st_pattern)
 esTokenizer = SentencePieceProcessor(model_file = tokenizer_path)
 config = ExLlamaConfig(model_config_path)               # create config from config.json
 config.set_auto_map('17.615,18.8897')
diff --git a/model.py b/model.py
index 0decc8ab..51961664 100644
--- a/model.py
+++ b/model.py
@@ -75,7 +75,7 @@ def __init__(self, model_config_path):
 
         # Required settings
 
-        self.model_path = None
+        self.model_path = None                          # str or list[str]
         self.device_map = ExLlamaDeviceMap(self.num_hidden_layers)
 
         # Optional settings
@@ -726,27 +726,27 @@ def __init__(self, config):
 
         self.config.set_tuning_params()
 
-        # Load model weights
+        # Read tensor list from file(s)
 
-        tensors = {}
-        with safe_open(self.config.model_path, framework = "pt", device = "cpu") as f:
-
-            # Begin auto mapping if enabled
+        if isinstance(self.config.model_path, str): model_path = [self.config.model_path]
+        else: model_path = self.config.model_path
 
-            decoder_size = 0
-            norm_size = 0
-            head_size = 0
-            half_element_size = torch.tensor([], dtype = torch.float16).element_size()
+        # Read tensor list from file(s), and measure layer sizes
 
-            if self.config.auto_map is not None:
+        load_keys = {}
 
-                self.config.device_map.embed_tokens = "cpu"
-                self.config.device_map.layers = ["cuda:0"] + ["?"] * (self.config.num_hidden_layers - 1)
+        decoder_size = 0
+        norm_size = 0
+        head_size = 0
+        for path in model_path:
+            with safe_open(path, framework = "pt", device = "cpu") as f:
 
                 for key in f.keys():
 
                     if _skip_key(key): continue
 
+                    load_keys[key] = path
+
                     if key.startswith("model.layers.0."):
                         tensor_slice = f.get_slice(key)
                         shape = tensor_slice.get_shape()
@@ -765,56 +765,58 @@ def __init__(self, config):
                         head_size += math.prod(shape) * _layer_dtype_size(key)
                         del tensor_slice
 
-                # Assign layers automatically
+        # Begin auto mapping if enabled
 
-                device_usage = 0
-                device_index = 0
-                layer_index_device = 0
-                max_usage = self.config.auto_map[device_index] * (1024 ** 3)
+        if self.config.auto_map is not None:
 
-                for layer in range(self.config.num_hidden_layers + 2):
+            self.config.device_map.embed_tokens = "cpu"
+            self.config.device_map.layers = ["cuda:0"] + ["?"] * (self.config.num_hidden_layers - 1)
 
-                    this_layer_size = decoder_size
-                    if layer == self.config.num_hidden_layers + 0: this_layer_size = norm_size
-                    elif layer == self.config.num_hidden_layers + 1: this_layer_size = head_size
+            # Assign layers automatically
 
-                    while device_usage + this_layer_size > max_usage:
-                        device_index += 1
-                        device_usage = 0
-                        layer_index_device = 0
-                        max_usage = self.config.auto_map[device_index] * (1024 ** 3)
-                        if device_index >= len(self.config.auto_map): raise ValueError("Model too large for device allocation scheme.")
+            device_usage = 0
+            device_index = 0
+            layer_index_device = 0
+            max_usage = self.config.auto_map[device_index] * (1024 ** 3)
 
-                    target = f"cuda:{device_index}"
-                    if layer == self.config.num_hidden_layers + 0: self.config.device_map.norm = target
-                    elif layer == self.config.num_hidden_layers + 1: self.config.device_map.lm_head = target
-                    else: self.config.device_map.layers[layer] = f"cuda:{device_index}"
+            for layer in range(self.config.num_hidden_layers + 2):
 
-                    device_usage += this_layer_size
-                    layer_index_device += 1
+                this_layer_size = decoder_size
+                if layer == self.config.num_hidden_layers + 0: this_layer_size = norm_size
+                elif layer == self.config.num_hidden_layers + 1: this_layer_size = head_size
 
-        # Read tensor list from file
+                while device_usage + this_layer_size > max_usage:
+                    device_index += 1
+                    device_usage = 0
+                    layer_index_device = 0
+                    max_usage = self.config.auto_map[device_index] * (1024 ** 3)
+                    if device_index >= len(self.config.auto_map): raise ValueError("Model too large for device allocation scheme.")
 
-        load_keys = []
-        with safe_open(self.config.model_path, framework = "pt", device = "cpu") as f:
-            for key in f.keys():
-                load_keys.append(key)
+                target = f"cuda:{device_index}"
+                if layer == self.config.num_hidden_layers + 0: self.config.device_map.norm = target
+                elif layer == self.config.num_hidden_layers + 1: self.config.device_map.lm_head = target
+                else: self.config.device_map.layers[layer] = f"cuda:{device_index}"
 
-        # Load up to 1 GB of tensors at a time, closing and reopening the file in between each chunk
+                device_usage += this_layer_size
+                layer_index_device += 1
+
+        # Load up to 1 GB of tensors at a time, closing and reopening the file in between each chunk
 
         max_dq_buffer_size = 0
 
-        f = None
+        tensors = {}
+        st_mem = 0
         MAX_ST_MEM = 1024**3
+        f = None
+        prev_path = ""
+        for key, path in load_keys.items():
 
-        for key in load_keys:
-
-            if _skip_key(key): continue
             device = self.config.device_map.map(key)
 
-            if f is None or st_mem > MAX_ST_MEM:
+            if f is None or st_mem > MAX_ST_MEM or path != prev_path:
                 if f is not None: del f
-                f = safe_open(self.config.model_path, framework = "pt", device = "cpu")
+                f = safe_open(path, framework = "pt", device = "cpu")
+                prev_path = path
                 st_mem = 0
 
             tensor = f.get_tensor(key)
@@ -828,10 +830,13 @@ def __init__(self, config):
             if key.endswith(".input_layernorm.weight"): tensor = tensor.half()
             if key.endswith(".post_attention_layernorm.weight"): tensor = tensor.half()
 
-            tensor = tensor.to(device, non_blocking = True)
-            if key.endswith(".qweight"): max_dq_buffer_size = max(max_dq_buffer_size, tensor.numel() * 8)
+            if device == "cpu": keep_tensor = tensor.clone()
+            else: keep_tensor = tensor.to(device)
+            del tensor
+
+            if key.endswith(".qweight"): max_dq_buffer_size = max(max_dq_buffer_size, keep_tensor.numel() * 8)
 
-            tensors[key] = tensor
+            tensors[key] = keep_tensor
 
         del f
 
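
The loading strategy in the model.py changes above can be read as a two-pass pattern: scan every shard once to map tensor keys to files, then stream the tensors in bounded chunks, reopening the safetensors handle whenever the shard changes or roughly 1 GB has been read through the current handle. A simplified, self-contained sketch of that pattern (not the exact model.py code; device mapping, key skipping and dtype fixups are omitted):

```python
# Standalone sketch of the chunked, multi-shard loading pattern (illustrative only)
from safetensors import safe_open

MAX_ST_MEM = 1024 ** 3                      # reopen the file after roughly 1 GB of reads

def load_sharded(shard_paths, device = "cpu"):
    # First pass: map every tensor key to the shard that contains it
    load_keys = {}
    for path in shard_paths:
        with safe_open(path, framework = "pt", device = "cpu") as f:
            for key in f.keys():
                load_keys[key] = path

    # Second pass: read tensors, reopening whenever the shard changes or the
    # budget for the current file handle is exhausted
    tensors = {}
    f, prev_path, st_mem = None, None, 0
    for key, path in load_keys.items():
        if f is None or path != prev_path or st_mem > MAX_ST_MEM:
            if f is not None: del f
            f = safe_open(path, framework = "pt", device = "cpu")
            prev_path, st_mem = path, 0
        tensor = f.get_tensor(key)
        st_mem += tensor.numel() * tensor.element_size()
        tensors[key] = tensor.clone() if device == "cpu" else tensor.to(device)
    if f is not None: del f
    return tensors
```
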
diff --git a/model_init.py b/model_init.py
index 7703f07c..e3a4c347 100644
--- a/model_init.py
+++ b/model_init.py
@@ -59,10 +59,10 @@ def get_model_files(args):
         if len(st) == 0:
             print(f" !! No files matching {st_pattern}")
             sys.exit()
-        if len(st) > 1:
-            print(f" !! Multiple files matching {st_pattern}")
-            sys.exit()
-        args.model = st[0]
+        # if len(st) > 1:
+        #     print(f" !! Multiple files matching {st_pattern}")
+        #     sys.exit()
+        args.model = st
     else:
         if args.tokenizer is None or args.config is None or args.model is None:
             print(" !! Please specify either -d or all of -t, -c and -m")
@@ -71,6 +71,13 @@ def get_model_files(args):
 
 # Feedback
 
+def _common_chars(names):
+    cname = max(names, key = len)
+    for x in names:
+        for p, c in enumerate(x):
+            if c != cname[p] and cname[p] != "*": cname = cname[:p] + "*" + cname[p+1:]
+    return cname
+
 def print_options(args, extra_options = None):
 
     print_opts = []
@@ -82,7 +89,10 @@ def print_options(args, extra_options = None):
 
     print(f" -- Tokenizer: {args.tokenizer}")
     print(f" -- Model config: {args.config}")
-    print(f" -- Model: {args.model}")
+
+    if isinstance(args.model, str): print(f" -- Model: {args.model}")
+    else: print(f" -- Model: {_common_chars(args.model)}")
+
     print(f" -- Sequence length: {args.length}")
     if args.compress_pos_emb != 1.0: print(f" -- RoPE compression factor: {args.compress_pos_emb}")
 
diff --git a/requirements.txt b/requirements.txt
index af86b688..94a3127b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
 torch>=2.0.1
-safetensors==0.3.1
+safetensors==0.3.2
 sentencepiece>=0.1.97
 ninja==1.11.1
diff --git a/util/shard.py b/util/shard.py
new file mode 100644
index 00000000..5b29db0f
--- /dev/null
+++ b/util/shard.py
@@ -0,0 +1,84 @@
+import argparse, json, math, os
+from safetensors import safe_open
+from safetensors.torch import save_file
+
+parser = argparse.ArgumentParser(description = "Split .safetensors file into shards")
+parser.add_argument("input_file", type = str, help = "Path to input file")
+parser.add_argument("shard_size", type = int, help = "Shard size in megabytes")
+args = parser.parse_args()
+
+input_file = args.input_file
+input_base, _ = os.path.splitext(input_file)
+shard_size = args.shard_size * 1024**2
+
+# Create tensor map
+
+def _tsize(st, key):
+
+    tslice = st.get_slice(key)
+    shape = tslice.get_shape()
+    numel = 1
+    for x in shape: numel *= x
+    dtype = tslice.get_dtype()
+    del tslice
+    if dtype == "I32": return numel * 4
+    elif dtype == "I16": return numel * 2
+    elif dtype == "F16": return numel * 2
+    elif dtype == "F32": return numel * 4
+    else: raise ValueError("Unexpected datatype: " + key)
+
+num_files = 0
+current_size = shard_size + 1
+total_size = 0
+tensor_map = []
+
+print(f" -- Scanning tensors in {input_file}")
+
+with safe_open(input_file, framework = "pt", device = "cpu") as f:
+
+    for key in f.keys():
+
+        tensor_size = _tsize(f, key)
+        total_size += tensor_size
+
+        if current_size + tensor_size > shard_size:
+
+            num_files += 1
+            current_size = 0
+            current_list = []
+            tensor_map.append(current_list)
+
+        current_size += tensor_size
+        current_list.append(key)
+
+# Split into output files
+
+weight_map = {}
+
+for file_index, keys in enumerate(tensor_map):
+
+    shard = {}
+    shard_filename = f"{input_base}-{file_index + 1:05}-of-{num_files:05}.safetensors"
+
+    with safe_open(input_file, framework = "pt", device = "cpu") as f:
+        for key in keys:
+            print(f" -- Reading: {key}")
+            shard[key] = f.get_tensor(key)
+            weight_map[key] = shard_filename
+
+    print(f" -- Writing: {shard_filename}")
+    save_file(shard, shard_filename)
+
+# Compile index
+
+index = { "metadata": { "total_size": total_size }, "weight_map": weight_map }
+index_filename = f"{input_file}.index.json"
+
+print(f" -- Writing: {index_filename}")
+
+with open(index_filename, 'w') as f:
+    json.dump(index, f, indent = 2)
+
+# Done
+
+print(f" -- Done")
\ No newline at end of file
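
util/shard.py takes the path to a .safetensors file and a target shard size in megabytes, then writes numbered shard files next to the input along with an <input>.index.json. A hypothetical invocation and a peek at the resulting index follow; the paths, shard count and tensor name below are made up for illustration.

```python
# Hypothetical example: split a large file into ~10 GB shards
#
#   python util/shard.py /mnt/str/models/llama-70b-4bit/model.safetensors 10240
#
# which would write, next to the input file:
#
#   model-00001-of-00004.safetensors ... model-00004-of-00004.safetensors
#   model.safetensors.index.json
#
# The index file uses the usual {"metadata": ..., "weight_map": ...} layout:
import json

with open("/mnt/str/models/llama-70b-4bit/model.safetensors.index.json") as f:
    index = json.load(f)

print(index["metadata"]["total_size"])       # summed tensor bytes across all shards
print(list(index["weight_map"].items())[0])  # e.g. ('model.embed_tokens.weight', 'model-00001-of-00004.safetensors')
```

ExLlama itself ignores this index and simply globs the shards, so the file is only there for tools that expect the Hugging Face sharding convention.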