From 2a2ef8fb0be481ac563339eb5cc60fc60524a70c Mon Sep 17 00:00:01 2001
From: turboderp
Date: Fri, 8 Sep 2023 17:00:01 +0200
Subject: [PATCH] Support for sharded models

---
 README.md        |  11 ++++-
 example_basic.py |   2 +-
 example_batch.py |   2 +-
 example_cfg.py   |   2 +-
 example_flask.py |   2 +-
 example_lora.py  |   2 +-
 example_ws.py    |   2 +-
 model.py         | 103 +++++++++++++++++++++++++----------------------
 model_init.py    |  20 ++++++---
 requirements.txt |   2 +-
 util/shard.py    |  84 ++++++++++++++++++++++++++++++++++++++
 11 files changed, 169 insertions(+), 63 deletions(-)
 create mode 100644 util/shard.py

diff --git a/README.md b/README.md
index 5a4f6307..a1cc713c 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@ have no AMD devices to test or optimize on.
 
 * Python 3.9 or newer
 * `torch` tested on 2.0.1 and 2.1.0 (nightly) with cu118
-* `safetensors` 0.3.1
+* `safetensors` 0.3.2
 * `sentencepiece`
 * `ninja`
 
@@ -190,4 +190,11 @@ Moved the todo list [here](doc/TODO.md).
 
 **2023-01-09**: Added rope_theta parameter for (at least partial) CodeLlama support. If you were using alpha = 97 or
 similar, you would no longer need that for CodeLlama models. Still stuff to sort out regarding the extended
-vocabulary.
\ No newline at end of file
+vocabulary.
+
+**2023-08-09**: Added support for sharded models. `config.model_path` now accepts either a filename or a list of
+filenames. `model_init()` will detect multiple .safetensors files if given a model directory. Note the change in the
+various examples: `model_path = glob.glob(st_pattern)[0]` becomes simply `model_path = glob.glob(st_pattern)`. There is
+also a small script in `util/shard.py` to split large .safetensors files. It produces an index.json file for the
+sharded model as well, just for completeness, although ExLlama doesn't need it to read the shards. Note that the
+**safetensors dependency was bumped to version 0.3.2**.
\ No newline at end of file
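
For context, a minimal loading sketch in the spirit of the updated `example_basic.py` below: only the `model_path` line changes, everything else stays as before. The model directory is hypothetical; imports and calls mirror the existing example scripts.

```python
from model import ExLlama, ExLlamaCache, ExLlamaConfig
from tokenizer import ExLlamaTokenizer
import os, glob

# Hypothetical directory containing config.json, tokenizer.model and one or more .safetensors shards
model_directory = "/mnt/str/models/llama-70b-4bit-sharded/"

tokenizer_path = os.path.join(model_directory, "tokenizer.model")
model_config_path = os.path.join(model_directory, "config.json")
st_pattern = os.path.join(model_directory, "*.safetensors")
model_path = glob.glob(st_pattern)            # now a list of shard files; a single filename still works

config = ExLlamaConfig(model_config_path)     # create config from config.json
config.model_path = model_path                # supply path(s) to the model weights

model = ExLlama(config)                       # weights are read shard by shard
tokenizer = ExLlamaTokenizer(tokenizer_path)
cache = ExLlamaCache(model)
```
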
diff --git a/example_basic.py b/example_basic.py
index e95d0adb..f49ba4f9 100644
--- a/example_basic.py
+++ b/example_basic.py
@@ -12,7 +12,7 @@
 tokenizer_path = os.path.join(model_directory, "tokenizer.model")
 model_config_path = os.path.join(model_directory, "config.json")
 st_pattern = os.path.join(model_directory, "*.safetensors")
-model_path = glob.glob(st_pattern)[0]
+model_path = glob.glob(st_pattern)
 
 # Create config, model, tokenizer and generator
 
diff --git a/example_batch.py b/example_batch.py
index 179cf2f4..66986267 100644
--- a/example_batch.py
+++ b/example_batch.py
@@ -12,7 +12,7 @@
 tokenizer_path = os.path.join(model_directory, "tokenizer.model")
 model_config_path = os.path.join(model_directory, "config.json")
 st_pattern = os.path.join(model_directory, "*.safetensors")
-model_path = glob.glob(st_pattern)[0]
+model_path = glob.glob(st_pattern)
 
 # Batched prompts
 
diff --git a/example_cfg.py b/example_cfg.py
index 53fcbbb6..de5750b4 100644
--- a/example_cfg.py
+++ b/example_cfg.py
@@ -15,7 +15,7 @@
 tokenizer_path = os.path.join(model_directory, "tokenizer.model")
 model_config_path = os.path.join(model_directory, "config.json")
 st_pattern = os.path.join(model_directory, "*.safetensors")
-model_path = glob.glob(st_pattern)[0]
+model_path = glob.glob(st_pattern)
 
 # Create config, model, tokenizer and generator
 
diff --git a/example_flask.py b/example_flask.py
index d83a45b4..b0d79d4a 100644
--- a/example_flask.py
+++ b/example_flask.py
@@ -10,7 +10,7 @@
 tokenizer_path = os.path.join(model_directory, "tokenizer.model")
 model_config_path = os.path.join(model_directory, "config.json")
 st_pattern = os.path.join(model_directory, "*.safetensors")
-model_path = glob.glob(st_pattern)[0]
+model_path = glob.glob(st_pattern)
 
 config = ExLlamaConfig(model_config_path)               # create config from config.json
 config.model_path = model_path                          # supply path to model weights file
diff --git a/example_lora.py b/example_lora.py
index e47c34f5..53bfa61e 100644
--- a/example_lora.py
+++ b/example_lora.py
@@ -18,7 +18,7 @@
 tokenizer_path = os.path.join(model_directory, "tokenizer.model")
 model_config_path = os.path.join(model_directory, "config.json")
 st_pattern = os.path.join(model_directory, "*.safetensors")
-model_path = glob.glob(st_pattern)[0]
+model_path = glob.glob(st_pattern)
 
 lora_config_path = os.path.join(lora_directory, "adapter_config.json")
 lora_path = os.path.join(lora_directory, "adapter_model.bin")
diff --git a/example_ws.py b/example_ws.py
index 0c504ab0..a0a3922e 100644
--- a/example_ws.py
+++ b/example_ws.py
@@ -260,7 +260,7 @@ async def main(websocket, path):
 tokenizer_path = os.path.join(model_directory, "tokenizer.model")
 model_config_path = os.path.join(model_directory, "config.json")
 st_pattern = os.path.join(model_directory, "*.safetensors")
-model_path = glob.glob(st_pattern)[0]
+model_path = glob.glob(st_pattern)
 esTokenizer = SentencePieceProcessor(model_file = tokenizer_path)
 config = ExLlamaConfig(model_config_path)               # create config from config.json
 config.set_auto_map('17.615,18.8897')
diff --git a/model.py b/model.py
index 0decc8ab..51961664 100644
--- a/model.py
+++ b/model.py
@@ -75,7 +75,7 @@ def __init__(self, model_config_path):
 
         # Required settings
 
-        self.model_path = None
+        self.model_path = None                          # str or list[str]
         self.device_map = ExLlamaDeviceMap(self.num_hidden_layers)
 
         # Optional settings
@@ -726,27 +726,27 @@ def __init__(self, config):
 
         self.config.set_tuning_params()
 
-        # Load model weights
+        # Read tensor list from file(s)
 
-        tensors = {}
-        with safe_open(self.config.model_path, framework = "pt", device = "cpu") as f:
-
-            # Begin auto mapping if enabled
+        if isinstance(self.config.model_path, str): model_path = [self.config.model_path]
+        else: model_path = self.config.model_path
 
-            decoder_size = 0
-            norm_size = 0
-            head_size = 0
-            half_element_size = torch.tensor([], dtype = torch.float16).element_size()
+        # Read tensor list from file(s), and measure layer sizes
 
-            if self.config.auto_map is not None:
+        load_keys = {}
 
-                self.config.device_map.embed_tokens = "cpu"
-                self.config.device_map.layers = ["cuda:0"] + ["?"] * (self.config.num_hidden_layers - 1)
+        decoder_size = 0
+        norm_size = 0
+        head_size = 0
+        for path in model_path:
+            with safe_open(path, framework = "pt", device = "cpu") as f:
 
                 for key in f.keys():
 
                     if _skip_key(key): continue
 
+                    load_keys[key] = path
+
                     if key.startswith("model.layers.0."):
                         tensor_slice = f.get_slice(key)
                         shape = tensor_slice.get_shape()
@@ -765,56 +765,58 @@ def __init__(self, config):
                         head_size += math.prod(shape) * _layer_dtype_size(key)
                         del tensor_slice
 
-                # Assign layers automatically
+        # Begin auto mapping if enabled
 
-                device_usage = 0
-                device_index = 0
-                layer_index_device = 0
-                max_usage = self.config.auto_map[device_index] * (1024 ** 3)
+        if self.config.auto_map is not None:
 
-                for layer in range(self.config.num_hidden_layers + 2):
+            self.config.device_map.embed_tokens = "cpu"
+            self.config.device_map.layers = ["cuda:0"] + ["?"] * (self.config.num_hidden_layers - 1)
 
-                    this_layer_size = decoder_size
-                    if layer == self.config.num_hidden_layers + 0: this_layer_size = norm_size
-                    elif layer == self.config.num_hidden_layers + 1: this_layer_size = head_size
+            # Assign layers automatically
 
-                    while device_usage + this_layer_size > max_usage:
-                        device_index += 1
-                        device_usage = 0
-                        layer_index_device = 0
-                        max_usage = self.config.auto_map[device_index] * (1024 ** 3)
-                        if device_index >= len(self.config.auto_map): raise ValueError("Model too large for device allocation scheme.")
+            device_usage = 0
+            device_index = 0
+            layer_index_device = 0
+            max_usage = self.config.auto_map[device_index] * (1024 ** 3)
 
-                    target = f"cuda:{device_index}"
-                    if layer == self.config.num_hidden_layers + 0: self.config.device_map.norm = target
-                    elif layer == self.config.num_hidden_layers + 1: self.config.device_map.lm_head = target
-                    else: self.config.device_map.layers[layer] = f"cuda:{device_index}"
+            for layer in range(self.config.num_hidden_layers + 2):
 
-                    device_usage += this_layer_size
-                    layer_index_device += 1
+                this_layer_size = decoder_size
+                if layer == self.config.num_hidden_layers + 0: this_layer_size = norm_size
+                elif layer == self.config.num_hidden_layers + 1: this_layer_size = head_size
 
-        # Read tensor list from file
+                while device_usage + this_layer_size > max_usage:
+                    device_index += 1
+                    device_usage = 0
+                    layer_index_device = 0
+                    max_usage = self.config.auto_map[device_index] * (1024 ** 3)
+                    if device_index >= len(self.config.auto_map): raise ValueError("Model too large for device allocation scheme.")
 
-        load_keys = []
-        with safe_open(self.config.model_path, framework = "pt", device = "cpu") as f:
-            for key in f.keys():
-                load_keys.append(key)
+                target = f"cuda:{device_index}"
+                if layer == self.config.num_hidden_layers + 0: self.config.device_map.norm = target
+                elif layer == self.config.num_hidden_layers + 1: self.config.device_map.lm_head = target
+                else: self.config.device_map.layers[layer] = f"cuda:{device_index}"
 
-        # Load up to 1 GB of tensors at a time, closing and reopening the file in between each chunk
+                device_usage += this_layer_size
+                layer_index_device += 1
+
+        # Load up to 1 GB of tensors at a time, closing and reopening the file in between each chunk
 
         max_dq_buffer_size = 0
 
-        f = None
+        tensors = {}
+        st_mem = 0
         MAX_ST_MEM = 1024**3
+        f = None
+        prev_path = ""
+        for key, path in load_keys.items():
 
-        for key in load_keys:
-
-            if _skip_key(key): continue
             device = self.config.device_map.map(key)
 
-            if f is None or st_mem > MAX_ST_MEM:
+            if f is None or st_mem > MAX_ST_MEM or path != prev_path:
                 if f is not None: del f
-                f = safe_open(self.config.model_path, framework = "pt", device = "cpu")
+                f = safe_open(path, framework = "pt", device = "cpu")
+                prev_path = path
                 st_mem = 0
 
             tensor = f.get_tensor(key)
@@ -828,10 +830,13 @@ def __init__(self, config):
             if key.endswith(".input_layernorm.weight"): tensor = tensor.half()
             if key.endswith(".post_attention_layernorm.weight"): tensor = tensor.half()
 
-            tensor = tensor.to(device, non_blocking = True)
-            if key.endswith(".qweight"): max_dq_buffer_size = max(max_dq_buffer_size, tensor.numel() * 8)
+            if device == "cpu": keep_tensor = tensor.clone()
+            else: keep_tensor = tensor.to(device)
+            del tensor
+
+            if key.endswith(".qweight"): max_dq_buffer_size = max(max_dq_buffer_size, keep_tensor.numel() * 8)
 
-            tensors[key] = tensor
+            tensors[key] = keep_tensor
 
         del f
 
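
The loading strategy in the model.py changes above can be read as a two-pass pattern: scan every shard once to map tensor keys to files, then stream the tensors in bounded chunks, reopening the safetensors handle whenever the shard changes or roughly 1 GB has been read through the current handle. A simplified, self-contained sketch of that pattern (not the exact model.py code; device mapping, key skipping and dtype fixups are omitted):

```python
# Standalone sketch of the chunked, multi-shard loading pattern (illustrative only)
from safetensors import safe_open

MAX_ST_MEM = 1024 ** 3                      # reopen the file after roughly 1 GB of reads

def load_sharded(shard_paths, device = "cpu"):
    # First pass: map every tensor key to the shard that contains it
    load_keys = {}
    for path in shard_paths:
        with safe_open(path, framework = "pt", device = "cpu") as f:
            for key in f.keys():
                load_keys[key] = path

    # Second pass: read tensors, reopening whenever the shard changes or the
    # budget for the current file handle is exhausted
    tensors = {}
    f, prev_path, st_mem = None, None, 0
    for key, path in load_keys.items():
        if f is None or path != prev_path or st_mem > MAX_ST_MEM:
            if f is not None: del f
            f = safe_open(path, framework = "pt", device = "cpu")
            prev_path, st_mem = path, 0
        tensor = f.get_tensor(key)
        st_mem += tensor.numel() * tensor.element_size()
        tensors[key] = tensor.clone() if device == "cpu" else tensor.to(device)
    if f is not None: del f
    return tensors
```
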
diff --git a/model_init.py b/model_init.py
index 7703f07c..e3a4c347 100644
--- a/model_init.py
+++ b/model_init.py
@@ -59,10 +59,10 @@ def get_model_files(args):
         if len(st) == 0:
             print(f" !! No files matching {st_pattern}")
             sys.exit()
-        if len(st) > 1:
-            print(f" !! Multiple files matching {st_pattern}")
-            sys.exit()
-        args.model = st[0]
+        # if len(st) > 1:
+        #     print(f" !! Multiple files matching {st_pattern}")
+        #     sys.exit()
+        args.model = st
     else:
         if args.tokenizer is None or args.config is None or args.model is None:
             print(" !! Please specify either -d or all of -t, -c and -m")
@@ -71,6 +71,13 @@ def get_model_files(args):
 
 # Feedback
 
+def _common_chars(names):
+    cname = max(names, key = len)
+    for x in names:
+        for p, c in enumerate(x):
+            if c != cname[p] and cname[p] != "*": cname = cname[:p] + "*" + cname[p+1:]
+    return cname
+
 def print_options(args, extra_options = None):
 
     print_opts = []
@@ -82,7 +89,10 @@ def print_options(args, extra_options = None):
 
     print(f" -- Tokenizer: {args.tokenizer}")
     print(f" -- Model config: {args.config}")
-    print(f" -- Model: {args.model}")
+
+    if isinstance(args.model, str): print(f" -- Model: {args.model}")
+    else: print(f" -- Model: {_common_chars(args.model)}")
+
     print(f" -- Sequence length: {args.length}")
     if args.compress_pos_emb != 1.0: print(f" -- RoPE compression factor: {args.compress_pos_emb}")
 
diff --git a/requirements.txt b/requirements.txt
index af86b688..94a3127b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
 torch>=2.0.1
-safetensors==0.3.1
+safetensors==0.3.2
 sentencepiece>=0.1.97
 ninja==1.11.1
diff --git a/util/shard.py b/util/shard.py
new file mode 100644
index 00000000..5b29db0f
--- /dev/null
+++ b/util/shard.py
@@ -0,0 +1,84 @@
+import argparse, json, math, os
+from safetensors import safe_open
+from safetensors.torch import save_file
+
+parser = argparse.ArgumentParser(description = "Split .safetensors file into shards")
+parser.add_argument("input_file", type = str, help = "Path to input file")
+parser.add_argument("shard_size", type = int, help = "Shard size in megabytes")
+args = parser.parse_args()
+
+input_file = args.input_file
+input_base, _ = os.path.splitext(input_file)
+shard_size = args.shard_size * 1024**2
+
+# Create tensor map
+
+def _tsize(st, key):
+
+    tslice = st.get_slice(key)
+    shape = tslice.get_shape()
+    numel = 1
+    for x in shape: numel *= x
+    dtype = tslice.get_dtype()
+    del tslice
+    if dtype == "I32": return numel * 4
+    elif dtype == "I16": return numel * 2
+    elif dtype == "F16": return numel * 2
+    elif dtype == "F32": return numel * 4
+    else: raise ValueError("Unexpected datatype: " + key)
+
+num_files = 0
+current_size = shard_size + 1
+total_size = 0
+tensor_map = []
+
+print(f" -- Scanning tensors in {input_file}")
+
+with safe_open(input_file, framework = "pt", device = "cpu") as f:
+
+    for key in f.keys():
+
+        tensor_size = _tsize(f, key)
+        total_size += tensor_size
+
+        if current_size + tensor_size > shard_size:
+
+            num_files += 1
+            current_size = 0
+            current_list = []
+            tensor_map.append(current_list)
+
+        current_size += tensor_size
+        current_list.append(key)
+
+# Split into output files
+
+weight_map = {}
+
+for file_index, keys in enumerate(tensor_map):
+
+    shard = {}
+    shard_filename = f"{input_base}-{file_index + 1:05}-of-{num_files:05}.safetensors"
+
+    with safe_open(input_file, framework = "pt", device = "cpu") as f:
+        for key in keys:
+            print(f" -- Reading: {key}")
+            shard[key] = f.get_tensor(key)
+            weight_map[key] = shard_filename
+
+    print(f" -- Writing: {shard_filename}")
+    save_file(shard, shard_filename)
+
+# Compile index
+
+index = { "metadata": { "total_size": total_size }, "weight_map": weight_map }
+index_filename = f"{input_file}.index.json"
+
+print(f" -- Writing: {index_filename}")
+
+with open(index_filename, 'w') as f:
+    json.dump(index, f, indent = 2)
+
+# Done
+
+print(f" -- Done")
\ No newline at end of file
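
util/shard.py takes the path to a .safetensors file and a target shard size in megabytes, then writes numbered shard files next to the input along with an <input>.index.json. A hypothetical invocation and a peek at the resulting index follow; the paths, shard count and tensor name below are made up for illustration.

```python
# Hypothetical example: split a large file into ~10 GB shards
#
#   python util/shard.py /mnt/str/models/llama-70b-4bit/model.safetensors 10240
#
# which would write, next to the input file:
#
#   model-00001-of-00004.safetensors ... model-00004-of-00004.safetensors
#   model.safetensors.index.json
#
# The index file uses the usual {"metadata": ..., "weight_map": ...} layout:
import json

with open("/mnt/str/models/llama-70b-4bit/model.safetensors.index.json") as f:
    index = json.load(f)

print(index["metadata"]["total_size"])       # summed tensor bytes across all shards
print(list(index["weight_map"].items())[0])  # e.g. ('model.embed_tokens.weight', 'model-00001-of-00004.safetensors')
```

ExLlama itself ignores this index and simply globs the shards, so the file is only there for tools that expect the Hugging Face sharding convention.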