Commit: Support for sharded models
turboderp committed Sep 8, 2023
1 parent a544085 commit 2a2ef8f
Showing 11 changed files with 169 additions and 63 deletions.
11 changes: 9 additions & 2 deletions README.md
@@ -17,7 +17,7 @@ have no AMD devices to test or optimize on.

 * Python 3.9 or newer
 * `torch` tested on 2.0.1 and 2.1.0 (nightly) with cu118
-* `safetensors` 0.3.1
+* `safetensors` 0.3.2
 * `sentencepiece`
 * `ninja`

@@ -190,4 +190,11 @@ Moved the todo list [here](doc/TODO.md).

 **2023-09-01**: Added rope_theta parameter for (at least partial) CodeLlama support. If you were using alpha = 97
 or similar, you would no longer need that for CodeLlama models. Still stuff to sort out regarding the extended
-vocabulary.
+vocabulary.
+
+**2023-09-08**: Added support for sharded models. `config.model_path` now accepts either a filename or a list of
+filenames. `model_init()` will detect multiple .safetensors files if given a model directory. Note the change in the
+various examples: `model_path = glob.glob(st_pattern)[0]` becomes simply `model_path = glob.glob(st_pattern)`. Also
+there's a little script in `util/shard.py` to split large .safetensors files. It also produces an index.json file for
+the sharded model, just for completeness, although ExLlama doesn't need it to read the shards. Note that the
+**safetensors dependency was bumped to version 0.3.2**.
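To illustrate the new interface, here is a minimal sketch of loading a sharded model (the directory name is hypothetical; the pattern follows the updated examples in this commit):

```python
import os, glob
from model import ExLlama, ExLlamaConfig

model_directory = "./models/llama-13b-4bit/"   # hypothetical path holding config.json and several .safetensors shards

model_config_path = os.path.join(model_directory, "config.json")
st_pattern = os.path.join(model_directory, "*.safetensors")

config = ExLlamaConfig(model_config_path)      # create config from config.json
config.model_path = glob.glob(st_pattern)      # a single filename or a list of filenames both work now
model = ExLlama(config)                        # load the (possibly sharded) weights
```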
2 changes: 1 addition & 1 deletion example_basic.py
@@ -12,7 +12,7 @@
 tokenizer_path = os.path.join(model_directory, "tokenizer.model")
 model_config_path = os.path.join(model_directory, "config.json")
 st_pattern = os.path.join(model_directory, "*.safetensors")
-model_path = glob.glob(st_pattern)[0]
+model_path = glob.glob(st_pattern)

 # Create config, model, tokenizer and generator

2 changes: 1 addition & 1 deletion example_batch.py
@@ -12,7 +12,7 @@
 tokenizer_path = os.path.join(model_directory, "tokenizer.model")
 model_config_path = os.path.join(model_directory, "config.json")
 st_pattern = os.path.join(model_directory, "*.safetensors")
-model_path = glob.glob(st_pattern)[0]
+model_path = glob.glob(st_pattern)

 # Batched prompts

2 changes: 1 addition & 1 deletion example_cfg.py
@@ -15,7 +15,7 @@
 tokenizer_path = os.path.join(model_directory, "tokenizer.model")
 model_config_path = os.path.join(model_directory, "config.json")
 st_pattern = os.path.join(model_directory, "*.safetensors")
-model_path = glob.glob(st_pattern)[0]
+model_path = glob.glob(st_pattern)

 # Create config, model, tokenizer and generator

2 changes: 1 addition & 1 deletion example_flask.py
@@ -10,7 +10,7 @@
 tokenizer_path = os.path.join(model_directory, "tokenizer.model")
 model_config_path = os.path.join(model_directory, "config.json")
 st_pattern = os.path.join(model_directory, "*.safetensors")
-model_path = glob.glob(st_pattern)[0]
+model_path = glob.glob(st_pattern)

 config = ExLlamaConfig(model_config_path)  # create config from config.json
 config.model_path = model_path             # supply path to model weights file
2 changes: 1 addition & 1 deletion example_lora.py
@@ -18,7 +18,7 @@
 tokenizer_path = os.path.join(model_directory, "tokenizer.model")
 model_config_path = os.path.join(model_directory, "config.json")
 st_pattern = os.path.join(model_directory, "*.safetensors")
-model_path = glob.glob(st_pattern)[0]
+model_path = glob.glob(st_pattern)

 lora_config_path = os.path.join(lora_directory, "adapter_config.json")
 lora_path = os.path.join(lora_directory, "adapter_model.bin")
2 changes: 1 addition & 1 deletion example_ws.py
@@ -260,7 +260,7 @@ async def main(websocket, path):
 tokenizer_path = os.path.join(model_directory, "tokenizer.model")
 model_config_path = os.path.join(model_directory, "config.json")
 st_pattern = os.path.join(model_directory, "*.safetensors")
-model_path = glob.glob(st_pattern)[0]
+model_path = glob.glob(st_pattern)
 esTokenizer = SentencePieceProcessor(model_file = tokenizer_path)
 config = ExLlamaConfig(model_config_path)  # create config from config.json
 config.set_auto_map('17.615,18.8897')
103 changes: 54 additions & 49 deletions model.py
@@ -75,7 +75,7 @@ def __init__(self, model_config_path):

         # Required settings

-        self.model_path = None
+        self.model_path = None     # str or list[str]
         self.device_map = ExLlamaDeviceMap(self.num_hidden_layers)

         # Optional settings
@@ -726,27 +726,27 @@ def __init__(self, config):

         self.config.set_tuning_params()

-        # Load model weights
+        # Read tensor list from file(s)

-        tensors = {}
-        with safe_open(self.config.model_path, framework = "pt", device = "cpu") as f:
+        if isinstance(self.config.model_path, str): model_path = [self.config.model_path]
+        else: model_path = self.config.model_path

-            # Begin auto mapping if enabled
+        # Read tensor list from file(s), and measure layer sizes

-            decoder_size = 0
-            norm_size = 0
-            head_size = 0
-            half_element_size = torch.tensor([], dtype = torch.float16).element_size()
+        load_keys = {}

-            if self.config.auto_map is not None:
+        decoder_size = 0
+        norm_size = 0
+        head_size = 0

-                self.config.device_map.embed_tokens = "cpu"
-                self.config.device_map.layers = ["cuda:0"] + ["?"] * (self.config.num_hidden_layers - 1)
+        for path in model_path:
+            with safe_open(path, framework = "pt", device = "cpu") as f:
                 for key in f.keys():

                     if _skip_key(key): continue

+                    load_keys[key] = path
+
                     if key.startswith("model.layers.0."):
                         tensor_slice = f.get_slice(key)
                         shape = tensor_slice.get_shape()
@@ -765,56 +765,58 @@ def __init__(self, config):
                         head_size += math.prod(shape) * _layer_dtype_size(key)
                         del tensor_slice

-                # Assign layers automatically
+        # Begin auto mapping if enabled

-                device_usage = 0
-                device_index = 0
-                layer_index_device = 0
-                max_usage = self.config.auto_map[device_index] * (1024 ** 3)
+        if self.config.auto_map is not None:

-                for layer in range(self.config.num_hidden_layers + 2):
+            self.config.device_map.embed_tokens = "cpu"
+            self.config.device_map.layers = ["cuda:0"] + ["?"] * (self.config.num_hidden_layers - 1)

-                    this_layer_size = decoder_size
-                    if layer == self.config.num_hidden_layers + 0: this_layer_size = norm_size
-                    elif layer == self.config.num_hidden_layers + 1: this_layer_size = head_size
+            # Assign layers automatically

-                    while device_usage + this_layer_size > max_usage:
-                        device_index += 1
-                        device_usage = 0
-                        layer_index_device = 0
-                        max_usage = self.config.auto_map[device_index] * (1024 ** 3)
-                        if device_index >= len(self.config.auto_map): raise ValueError("Model too large for device allocation scheme.")
+            device_usage = 0
+            device_index = 0
+            layer_index_device = 0
+            max_usage = self.config.auto_map[device_index] * (1024 ** 3)

-                    target = f"cuda:{device_index}"
-                    if layer == self.config.num_hidden_layers + 0: self.config.device_map.norm = target
-                    elif layer == self.config.num_hidden_layers + 1: self.config.device_map.lm_head = target
-                    else: self.config.device_map.layers[layer] = f"cuda:{device_index}"
+            for layer in range(self.config.num_hidden_layers + 2):

-                    device_usage += this_layer_size
-                    layer_index_device += 1
+                this_layer_size = decoder_size
+                if layer == self.config.num_hidden_layers + 0: this_layer_size = norm_size
+                elif layer == self.config.num_hidden_layers + 1: this_layer_size = head_size

-        # Read tensor list from file
+                while device_usage + this_layer_size > max_usage:
+                    device_index += 1
+                    device_usage = 0
+                    layer_index_device = 0
+                    max_usage = self.config.auto_map[device_index] * (1024 ** 3)
+                    if device_index >= len(self.config.auto_map): raise ValueError("Model too large for device allocation scheme.")

-        load_keys = []
-        with safe_open(self.config.model_path, framework = "pt", device = "cpu") as f:
-            for key in f.keys():
-                load_keys.append(key)
+                target = f"cuda:{device_index}"
+                if layer == self.config.num_hidden_layers + 0: self.config.device_map.norm = target
+                elif layer == self.config.num_hidden_layers + 1: self.config.device_map.lm_head = target
+                else: self.config.device_map.layers[layer] = f"cuda:{device_index}"

-        # Load up to 1 GB of tensors at a time, closing and reopening the file in between each chunk
+                device_usage += this_layer_size
+                layer_index_device += 1
+
+        # Load up to 1 GB of tensors at a time, closing and reopening the file in between each chunk

         max_dq_buffer_size = 0
-        f = None
+        tensors = {}

         st_mem = 0
         MAX_ST_MEM = 1024**3
-        for key in load_keys:
+        f = None
+        prev_path = ""
+        for key, path in load_keys.items():

             if _skip_key(key): continue
             device = self.config.device_map.map(key)

-            if f is None or st_mem > MAX_ST_MEM:
+            if f is None or st_mem > MAX_ST_MEM or path != prev_path:
                 if f is not None: del f
-                f = safe_open(self.config.model_path, framework = "pt", device = "cpu")
+                f = safe_open(path, framework = "pt", device = "cpu")
+                prev_path = path
                 st_mem = 0

             tensor = f.get_tensor(key)
@@ -828,10 +830,13 @@ def __init__(self, config):
if key.endswith(".input_layernorm.weight"): tensor = tensor.half()
if key.endswith(".post_attention_layernorm.weight"): tensor = tensor.half()

tensor = tensor.to(device, non_blocking = True)
if key.endswith(".qweight"): max_dq_buffer_size = max(max_dq_buffer_size, tensor.numel() * 8)
if device == "cpu": keep_tensor = tensor.clone()
else: keep_tensor = tensor.to(device)
del tensor

if key.endswith(".qweight"): max_dq_buffer_size = max(max_dq_buffer_size, keep_tensor.numel() * 8)

tensors[key] = tensor
tensors[key] = keep_tensor

del f

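In short, the new loading path makes two passes: one over all shard files to record which file holds each tensor key (and to measure layer sizes for auto mapping), then one over the recorded keys to load tensors in bounded chunks, reopening a file whenever the chunk budget is exceeded or the source shard changes. Here is that strategy reduced to a standalone sketch; the names are illustrative and not part of the ExLlama API:

```python
from safetensors import safe_open

def load_sharded(paths, max_chunk = 1024**3):
    # Pass 1: map every tensor key to the shard file that contains it
    key_to_path = {}
    for path in paths:
        with safe_open(path, framework = "pt", device = "cpu") as f:
            for key in f.keys():
                key_to_path[key] = path

    # Pass 2: load tensors, closing and reopening files so only a bounded
    # amount of safetensors buffer memory stays alive at a time
    tensors = {}
    f, prev_path, chunk = None, None, 0
    for key, path in key_to_path.items():
        if f is None or chunk > max_chunk or path != prev_path:
            if f is not None: del f
            f = safe_open(path, framework = "pt", device = "cpu")
            prev_path, chunk = path, 0
        tensor = f.get_tensor(key)
        chunk += tensor.numel() * tensor.element_size()
        tensors[key] = tensor
    del f
    return tensors
```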
20 changes: 15 additions & 5 deletions model_init.py
@@ -59,10 +59,10 @@ def get_model_files(args):
         if len(st) == 0:
             print(f" !! No files matching {st_pattern}")
             sys.exit()
-        if len(st) > 1:
-            print(f" !! Multiple files matching {st_pattern}")
-            sys.exit()
-        args.model = st[0]
+        # if len(st) > 1:
+        #     print(f" !! Multiple files matching {st_pattern}")
+        #     sys.exit()
+        args.model = st
     else:
         if args.tokenizer is None or args.config is None or args.model is None:
             print(" !! Please specify either -d or all of -t, -c and -m")
@@ -71,6 +71,13 @@

 # Feedback

+def _common_chars(names):
+    cname = max(names, key = len)
+    for x in names:
+        for p, c in enumerate(x):
+            if c != cname[p] and cname[p] != "*": cname = cname[:p] + "*" + cname[p+1:]
+    return cname
+
 def print_options(args, extra_options = None):

     print_opts = []
@@ -82,7 +89,10 @@

print(f" -- Tokenizer: {args.tokenizer}")
print(f" -- Model config: {args.config}")
print(f" -- Model: {args.model}")

if isinstance(args.model, str): print(f" -- Model: {args.model}")
else: print(f" -- Model: {_common_chars(args.model)}")

print(f" -- Sequence length: {args.length}")
if args.compress_pos_emb != 1.0:
print(f" -- RoPE compression factor: {args.compress_pos_emb}")
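The new `_common_chars` helper keeps the feedback line compact by masking the characters that differ between shard filenames. For example (hypothetical filenames):

```python
>>> _common_chars(["model-00001-of-00003.safetensors",
...                "model-00002-of-00003.safetensors",
...                "model-00003-of-00003.safetensors"])
'model-0000*-of-00003.safetensors'
```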
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,4 +1,4 @@
 torch>=2.0.1
-safetensors==0.3.1
+safetensors==0.3.2
 sentencepiece>=0.1.97
 ninja==1.11.1
84 changes: 84 additions & 0 deletions util/shard.py
@@ -0,0 +1,84 @@
+import argparse, json, math, os
+from safetensors import safe_open
+from safetensors.torch import save_file
+
+parser = argparse.ArgumentParser(description = "Split .safetensors file into shards")
+parser.add_argument("input_file", type = str, help = "Path to input file")
+parser.add_argument("shard_size", type = int, help = "Shard size in megabytes")
+args = parser.parse_args()
+
+input_file = args.input_file
+input_base, _ = os.path.splitext(input_file)
+shard_size = args.shard_size * 1024**2
+
+# Create tensor map
+
+def _tsize(st, key):
+
+    tslice = st.get_slice(key)
+    shape = tslice.get_shape()
+    numel = 1
+    for x in shape: numel *= x
+    dtype = tslice.get_dtype()
+    del tslice
+    if dtype == "I32": return numel * 4
+    elif dtype == "I16": return numel * 2
+    elif dtype == "F16": return numel * 2
+    elif dtype == "F32": return numel * 4
+    else: raise ValueError("Unexpected datatype: " + key)
+
+num_files = 0
+current_size = shard_size + 1
+total_size = 0
+tensor_map = []
+
+print(f" -- Scanning tensors in {input_file}")
+
+with safe_open(input_file, framework = "pt", device = "cpu") as f:
+
+    for key in f.keys():
+
+        tensor_size = _tsize(f, key)
+        total_size += tensor_size
+
+        if current_size + tensor_size > shard_size:
+
+            num_files += 1
+            current_size = 0
+            current_list = []
+            tensor_map.append(current_list)
+
+        current_size += tensor_size
+        current_list.append(key)
+
+# Split into output files
+
+weight_map = {}
+
+for file_index, keys in enumerate(tensor_map):
+
+    shard = {}
+    shard_filename = f"{input_base}-{file_index + 1:05}-of-{num_files:05}.safetensors"
+
+    with safe_open(input_file, framework = "pt", device = "cpu") as f:
+        for key in keys:
+            print(f" -- Reading: {key}")
+            shard[key] = f.get_tensor(key)
+            weight_map[key] = shard_filename
+
+    print(f" -- Writing: {shard_filename}")
+    save_file(shard, shard_filename)
+
+# Compile index
+
+index = { "metadata": { "total_size": total_size }, "weight_map": weight_map }
+index_filename = f"{input_file}.index.json"
+
+print(f" -- Writing: {index_filename}")
+
+with open(index_filename, 'w') as f:
+    json.dump(index, f, indent = 2)
+
+# Done
+
+print(f" -- Done")

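A hypothetical invocation, per the argparse setup above (the positional arguments are the input file and the shard size in megabytes; the model filename is made up):

```
python util/shard.py ./llama-13b-4bit.safetensors 4000
```

This would write shards named like `llama-13b-4bit-00001-of-00004.safetensors` next to the input file, plus `llama-13b-4bit.safetensors.index.json`; the shard count depends on the model size.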