diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..ef1f3072 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +# ignore __pycache__ folder +__pycache__/ \ No newline at end of file diff --git a/README.md b/README.md index 574210a6..a1cc713c 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ have no AMD devices to test or optimize on. * Python 3.9 or newer * `torch` tested on 2.0.1 and 2.1.0 (nightly) with cu118 -* `safetensors` 0.3.1 +* `safetensors` 0.3.2 * `sentencepiece` * `ninja` @@ -188,6 +188,13 @@ Moved the todo list [here](doc/TODO.md). ## Recent updates -**2023-07-19**: Added support for grouped-query attention and Llama-2 70b. There's still a bit of optimization to do, -since it slows down considerably on very long sequences despite GQA having the potential to be faster. Also could use -some more thorough testing. \ No newline at end of file +**2023-01-09**: Added rope_theta parameter for (at least partial) CodeLlama support. If you were using alpha = 97 +or similar, you would no longer need that for CodeLlama models. Still stuff to sort out regarding the extended +vocabulary. + +**2023-08-09**: Added support for sharded models. `config.model_path` now accepts either a filename or a list of +filenames. `model_init()` will detect multiple .safetensors files if given a model directory. Note the change in the +various examples: `model_path = glob.glob(st_pattern)[0]` becomes simply `model_path = glob.glob(st_pattern)`. Also +there's a little script in `util/shard.py` to split large .safetensors files. It also produces an index.json file for +the sharded model, just for completeness, although ExLlama doesn't need it to read the shards. Note that the +**safetensors dependency was bumped to version 0.3.2**. \ No newline at end of file diff --git a/alt_generator.py b/alt_generator.py new file mode 100644 index 00000000..3b85deb4 --- /dev/null +++ b/alt_generator.py @@ -0,0 +1,329 @@ +import cuda_ext +from model import ExLlama, ExLlamaCache +from tokenizer import ExLlamaTokenizer +from lora import ExLlamaLora +import torch +import torch.nn.functional as F + +MAX_CACHED_STRINGS = 100 + +class ExLlamaAltGenerator: + + class Settings: + + temperature = 0.95 + top_k = 40 # consider the most probable top_k samples, 0 to disable top_k sampling + top_p = 0.65 # consider tokens up to a cumulative probabiltiy of top_p, 0.0 to disable top_p sampling + min_p = 0.0 # Do not consider tokens with probability less than this + typical = 0.0 # Locally typical sampling threshold, 0.0 to disable typical sampling + + token_repetition_penalty_max = 1.15 # Repetition penalty for most recent tokens + token_repetition_penalty_sustain = -1 # No. most recent tokens to repeat penalty for, -1 to apply to whole context + token_repetition_penalty_decay = 0 # Gradually decrease penalty over this many tokens + + disallowed_tokens: list[int] = None # List of tokens to inhibit, e.g. 
tokenizer.eos_token_id + lora: ExLlamaLora = None # LoRA to apply when generating + + # Internal state + + model: ExLlama + cache: ExLlamaCache + tokenizer: ExLlamaTokenizer + tokenizer_cache = {} + + settings: Settings + stop_strings: list = [] + stop_tokens: list = [] + held_text: str = "" + max_stop_tokens: int = 2 + sequence_ids: torch.Tensor = None + sequence_str: str = None + remaining_tokens: int = 0 + + + def __init__(self, model: ExLlama, tokenizer: ExLlamaTokenizer, cache: ExLlamaCache): + + self.model = model + self.tokenizer = tokenizer + self.cache = cache + self.settings = ExLlamaAltGenerator.Settings() + + + def cached_tokenize(self, text: str, encode_special_characters = False): + + if text in self.tokenizer_cache: + return self.tokenizer_cache[text] + + while len(self.tokenizer_cache) >= MAX_CACHED_STRINGS: + del self.tokenizer_cache[next(iter(self.tokenizer_cache))] # Always removes oldest entry, as of Python 3.7 + + new_enc = self.tokenizer.encode(text, encode_special_characters = encode_special_characters) + self.tokenizer_cache[text] = new_enc + return new_enc + + + def get_num_tokens(self, text: str, encode_special_characters = False): + + return self.cached_tokenize(text, encode_special_characters = encode_special_characters).shape[-1] + + + # Begin generating + # + # prompt: The input prompt. Will be tokenized and then truncated to the models max sequence length minus max_new_tokens + # stop_conditions: List of strings or integer token IDs that will end the sequence + # settings: ExLlamaAltGeneratorSettings + # encode_special_characters: Set to true to tokenize "" etc. + + def begin_stream(self, prompt: str, stop_conditions: list, max_new_tokens: int, gen_settings: Settings, encode_special_characters = False): + + assert isinstance(prompt, str), "ExLlamaAltGenerator does not support batched generation" + + # Tokenize prompt and limit length to allow prompt and (max) new tokens within max sequence length + + max_input_tokens = self.model.config.max_seq_len - max_new_tokens + self.remaining_tokens = max_new_tokens + + input_ids = self.cached_tokenize(prompt, encode_special_characters) + applied_input_ids = input_ids[:, -max_input_tokens:] + self.sequence_str = self.tokenizer.decode(applied_input_ids)[0] if applied_input_ids.shape[0] < input_ids.shape[0] else prompt + + # Settings + + self.stop_strings = [] + self.stop_tokens = [] + for t in stop_conditions: + if isinstance(t, int): self.stop_tokens += [t] + elif isinstance(t, str): self.stop_strings += [t] + else: raise ValueError("Unsupported type in stop_conditions") + + self.held_text = "" + + self.max_stop_tokens = 2 + for ss in self.stop_strings: + self.max_stop_tokens = max(self.max_stop_tokens, self.get_num_tokens(ss) + 2) + + self.settings = gen_settings + + # Start generation + + self.gen_begin_reuse(applied_input_ids, gen_settings) + + + # Get the next chunk of text in the stream + # + # Returns stream_chunk: str, EOS: bool + + def stream(self): + + # Check total response length + + if self.remaining_tokens == 0: + self.sequence_str += self.held_text + return self.held_text, True + + self.remaining_tokens -= 1 + + # Decode the current tail end of the sequence + + old_tail = self.tokenizer.decode(self.sequence_ids[:, -self.max_stop_tokens:])[0] + + # Generate a single token and append to the sequence + + next_token = self.gen_single_token(self.settings) + + # End immediately if it was a stop token + + if next_token in self.stop_tokens: + self.sequence_str += self.held_text + return self.held_text, True + + # 
Decode the tail end of the sequence with the added token to get (actual) characters added + + new_tail = self.tokenizer.decode(self.sequence_ids[:, -(self.max_stop_tokens + 1):])[0] + self.held_text += new_tail[len(old_tail):] + + # Hold text as long as it contains part of a stop string + + partial_ss = False + for ss in self.stop_strings: + + # Check if held_text fully contains stop string + + position = self.held_text.find(ss) + if position != -1: + self.sequence_str += self.held_text[:position] + return self.held_text[:position], True + + # Check for overlap between end of held_text and start of stop string + + overlap = 0 + for j in range(1, min(len(self.held_text), len(ss)) + 1): + if self.held_text[-j:] == ss[:j]: overlap = j + if overlap > 0: partial_ss = True + + # If holding text because of a partial stop condition, return nothing but also EOS = False + + if partial_ss: + return "", False + + # No stop condition, so return whatever has been generated + + stream_text = self.held_text + self.held_text = "" + self.sequence_str += stream_text + return stream_text, False + + + # Generate and return a full prompt completion. See begin_stream() above + + def generate(self, prompt: str, stop_conditions: list, max_new_tokens: int, gen_settings: Settings, encode_special_characters = False): + + self.begin_stream(prompt, stop_conditions, max_new_tokens, gen_settings, encode_special_characters) + response = "" + while True: + chunk, eos = self.stream() + response += chunk + if eos: break + + return response + + + # Begin generation + + def gen_begin(self, in_tokens, gen_settings): + + self.sequence_ids = in_tokens.clone() + self.cache.current_seq_len = 0 + self.model.forward(self.sequence_ids[:, :-1], self.cache, preprocess_only = True, lora = gen_settings.lora) + + + def gen_begin_reuse(self, in_tokens, gen_settings): + + if self.sequence_ids is None or self.cache.current_seq_len == 0: + self.gen_begin(in_tokens, gen_settings) + return + + reuse = 0 + while reuse < self.sequence_ids.shape[-1] and reuse < in_tokens.shape[-1] and self.sequence_ids[0, reuse] == in_tokens[0, reuse]: + reuse += 1 + + if reuse < 2: + self.gen_begin(in_tokens, gen_settings) + return + + self.cache.current_seq_len = reuse - 1 + self.sequence_ids = in_tokens[:, :reuse] + + if reuse < in_tokens.shape[-1]: self.gen_feed_tokens(in_tokens[:, reuse:], gen_settings) + + + def gen_feed_tokens(self, in_tokens, gen_settings): + + if self.sequence_ids is None: + self.gen_begin(in_tokens, gen_settings) + return + + start = self.cache.current_seq_len + self.sequence_ids = torch.cat((self.sequence_ids, in_tokens), dim = 1) + + self.model.forward(self.sequence_ids[:, start : -1], self.cache, preprocess_only = True, lora = gen_settings.lora) + + + # Generate one token in current sequence + + def gen_single_token(self, gen_settings): + + # Simple sampling case: + + logits = self.model.forward(self.sequence_ids[:, -1:], self.cache, lora = gen_settings.lora) + token, _ = self.sample(logits, gen_settings) + self.sequence_ids = torch.cat([self.sequence_ids, token], dim = 1) + return token + + + def sample(self, logits, gen_settings): + + cuda_ext.ext_apply_rep_penalty_mask_cpu(self.sequence_ids, + self.settings.token_repetition_penalty_max, + self.settings.token_repetition_penalty_sustain, + self.settings.token_repetition_penalty_decay, + logits) + + logits[:, :, self.tokenizer.bos_token_id] = -10000.0 + + if logits.dim() == 3: logits = logits[0, -1, :] + elif logits.dim() == 2: logits = logits[-1, :] + else: raise ValueError("Bad 
logits dimension") + + # Disallow tokens + + if gen_settings.disallowed_tokens is not None: + logits[gen_settings.disallowed_tokens] = float("-inf") + + # Base probabilities + + logits /= gen_settings.temperature + logits += 1e-8 + probs = torch.softmax(logits, dim = -1) + + # Top K + + if gen_settings.top_k == 0: + top_probs, top_indices = torch.sort(probs, descending = True) + else: + top_probs, top_indices = torch.topk(probs, gen_settings.top_k) + top_probs = F.normalize(top_probs, p = 1, dim = -1) + + # Top P + + if gen_settings.top_p > 0.0: + + num_top_p_probs = 0 + cum_prob = top_probs[0].item() + while True: + num_top_p_probs += 1 + if num_top_p_probs == top_probs.shape[-1]: break + if top_probs[num_top_p_probs].item() < gen_settings.min_p: break + cum_prob += top_probs[num_top_p_probs].item() + if cum_prob > gen_settings.top_p: break + + top_probs = top_probs[:num_top_p_probs] + top_probs = F.normalize(top_probs, p = 1, dim = -1) + top_indices = top_indices[:num_top_p_probs] + + # Locally typical sampling + + if gen_settings.typical > 0.0: + + epsilon = 1e-10 + log_probs = (top_probs + epsilon).log() + neg_entropy = (top_probs * log_probs).sum() + entropy_dev = (neg_entropy - log_probs).abs() + _, entropy_dev_order = torch.sort(entropy_dev) + + top_probs = top_probs.gather(-1, entropy_dev_order) + top_indices = top_indices.gather(-1, entropy_dev_order) + + num_typical_probs = 0 + cum_prob = top_probs[0].item() + while True: + num_typical_probs += 1 + if num_typical_probs == top_probs.shape[-1]: break + cum_prob += top_probs[num_typical_probs].item() + if cum_prob > gen_settings.typical: break + + top_probs = top_probs[:num_typical_probs] + top_probs = F.normalize(top_probs, p = 1, dim = -1) + top_indices = top_indices[:num_typical_probs] + + # Multinomial sampling from top_probs, kept in same order as top_indices + + sampled_ind = torch.multinomial(top_probs, 1) + sampled_tokens = top_indices[sampled_ind] + sampled_probs = top_probs[sampled_ind] # Return probs before second norm + + # if sampled_tokens.shape[0] > 1: + # sampled_tokens, ind = sampled_tokens.sort() + # sampled_probs = sampled_probs[ind] + + return sampled_tokens.unsqueeze(0), sampled_probs.unsqueeze(0) diff --git a/exllama/generator.py b/exllama/generator.py index 24627daf..14bba67f 100644 --- a/exllama/generator.py +++ b/exllama/generator.py @@ -16,9 +16,9 @@ class Settings: min_p = 0.0 # Do not consider tokens with probability less than this typical = 0.0 # Locally typical sampling threshold, 0.0 to disable typical sampling - token_repetition_penalty_max = 1.15 # Repetition penalty for most recent tokens + token_repetition_penalty_max = 1.15 # Repetition penalty for most recent tokens token_repetition_penalty_sustain = 256 # No. 
most recent tokens to repeat penalty for, -1 to apply to whole context - token_repetition_penalty_decay = 128 # Gradually decrease penalty over this many tokens + token_repetition_penalty_decay = 128 # Gradually decrease penalty over this many tokens beams = 1 beam_length = 1 @@ -34,6 +34,7 @@ class Settings: disallowed_tokens: list[int] or None lora: ExLlamaLora or None + def __init__(self, model, tokenizer, cache): self.model = model @@ -75,6 +76,20 @@ def batched_sample(self, logits, temperature, top_k, top_p, min_p, typical, num return torch.cat(samples, dim = 0), torch.cat(scores, dim = 0) + # Sample one token from logits with current settings + + def sample_current(self, logits, num = 1): + + return self.sample(logits, + self.settings.temperature, + self.settings.top_k, + self.settings.top_p, + self.settings.min_p, + self.settings.typical) + + + # Sample one token from logits + def sample(self, logits, temperature, top_k, top_p, min_p, typical, num = 1): # torch.manual_seed(42) @@ -162,7 +177,7 @@ def disallow_tokens(self, tokens): self.disallowed_tokens = tokens - def gen_begin(self, in_tokens): + def gen_begin(self, in_tokens, mask = None): self.end_beam_search() @@ -170,7 +185,7 @@ def gen_begin(self, in_tokens): self.sequence_actual = in_tokens.clone() self.cache.current_seq_len = 0 - self.model.forward(self.sequence[:, :-1], self.cache, preprocess_only = True, lora = self.lora) + self.model.forward(self.sequence[:, :-1], self.cache, preprocess_only = True, lora = self.lora, input_mask = mask) def gen_begin_empty(self): @@ -181,11 +196,11 @@ def gen_begin_empty(self): self.cache.current_seq_len = 0 - def gen_begin_reuse(self, in_tokens): + def gen_begin_reuse(self, in_tokens, mask = None): self.end_beam_search() if self.sequence is None or self.cache.current_seq_len == 0: - self.gen_begin(in_tokens) + self.gen_begin(in_tokens, mask = mask) return 0 # if in_tokens.shape[-1] < self.sequence.shape[-1]: @@ -196,7 +211,7 @@ def gen_begin_reuse(self, in_tokens): reuse += 1 if reuse < 2: - self.gen_begin(in_tokens) + self.gen_begin(in_tokens, mask = mask) return 0 # print (f"Reusing cache: {reuse} tokens") @@ -205,14 +220,14 @@ def gen_begin_reuse(self, in_tokens): self.sequence = self.sequence[:, :reuse] self.sequence_actual = self.sequence.clone() - if reuse < in_tokens.shape[-1]: self.gen_feed_tokens(in_tokens[:, reuse:]) + if reuse < in_tokens.shape[-1]: self.gen_feed_tokens(in_tokens[:, reuse:], mask = mask) return reuse - def gen_feed_tokens(self, in_tokens): + def gen_feed_tokens(self, in_tokens, mask = None): if self.sequence is None: - self.gen_begin(in_tokens) + self.gen_begin(in_tokens, mask = mask) return self.end_beam_search() @@ -225,7 +240,7 @@ def gen_feed_tokens(self, in_tokens): self.sequence = torch.cat((self.sequence, in_tokens), dim = 1) if start < self.sequence.shape[-1] - 1: - self.model.forward(self.sequence[:, start : -1], self.cache, preprocess_only = True, lora = self.lora) + self.model.forward(self.sequence[:, start : -1], self.cache, preprocess_only = True, lora = self.lora, input_mask = mask) self.sequence_actual = self.sequence @@ -247,15 +262,15 @@ def gen_rewind(self, num_tokens): self.sequence_actual = self.sequence - def gen_prune_right(self, tokens): + def gen_prune_right(self, tokens, mask = None): self.end_beam_search() if tokens > self.sequence.shape[-1] - 1: return - self.gen_begin(self.sequence[:, tokens:]) + self.gen_begin(self.sequence[:, tokens:], mask = mask) self.sequence_actual = self.sequence - def gen_prune_to(self, min_tokens_to_keep, 
token_id): + def gen_prune_to(self, min_tokens_to_keep, token_id, mask = None): self.end_beam_search() @@ -272,10 +287,10 @@ def gen_prune_to(self, min_tokens_to_keep, token_id): if not pruned: return - self.gen_begin(self.sequence) + self.gen_begin(self.sequence, mask = mask) - def gen_prune_left(self, num_tokens): + def gen_prune_left(self, num_tokens, mask = None): num_tokens = min(num_tokens, self.sequence_actual.shape[-1] - 1) @@ -285,7 +300,7 @@ def gen_prune_left(self, num_tokens): self.begin_beam_search() else: self.sequence = self.sequence[:, num_tokens:] - self.gen_begin(self.sequence) + self.gen_begin(self.sequence, mask = mask) def gen_num_tokens(self): @@ -299,14 +314,14 @@ def generate_simple(self, prompt, max_new_tokens = 128): self.end_beam_search() - ids = self.tokenizer.encode(prompt) - self.gen_begin(ids) + ids, mask = self.tokenizer.encode(prompt, return_mask = True, max_seq_len = self.model.config.max_seq_len) + self.gen_begin(ids, mask = mask) max_new_tokens = min(max_new_tokens, self.model.config.max_seq_len - ids.shape[1]) eos = torch.zeros((ids.shape[0],), dtype = torch.bool) for i in range(max_new_tokens): - token = self.gen_single_token() + token = self.gen_single_token(mask = mask) for j in range(token.shape[0]): if token[j, 0].item() == self.tokenizer.eos_token_id: eos[j] = True if eos.all(): break @@ -315,9 +330,20 @@ def generate_simple(self, prompt, max_new_tokens = 128): return text + # Apply repetition penalty with current settings + + def apply_rep_penalty(self, logits): + + cuda_ext.ext_apply_rep_penalty_mask_cpu(self.sequence, + self.settings.token_repetition_penalty_max, + self.settings.token_repetition_penalty_sustain, + self.settings.token_repetition_penalty_decay, + logits) + + # Generate a single token with the current settings, append to sequence - def gen_single_token(self, constraints = None): + def gen_single_token(self, constraints = None, mask = None): self.end_beam_search() @@ -325,13 +351,8 @@ def gen_single_token(self, constraints = None): if self.sequence is not None: - logits = self.model.forward(self.sequence[:, -1:], self.cache, lora = self.lora) - - cuda_ext.ext_apply_rep_penalty_mask_cpu(self.sequence, - self.settings.token_repetition_penalty_max, - self.settings.token_repetition_penalty_sustain, - self.settings.token_repetition_penalty_decay, - logits) + logits = self.model.forward(self.sequence[:, -1:], self.cache, lora = self.lora, input_mask = mask) + self.apply_rep_penalty(logits) logits[:, :, self.tokenizer.bos_token_id] = -10000.0 diff --git a/exllama/lora.py b/exllama/lora.py index 01820aa7..c3019e42 100644 --- a/exllama/lora.py +++ b/exllama/lora.py @@ -106,12 +106,15 @@ def __init__(self, model, lora_config_path, lora_path): # Check that dtype is compatible, or convert - if tensor.dtype == torch.float16: - pass + if tensor.dtype == torch.bfloat16: + tensor = tensor.to(torch.float16) elif tensor.dtype == torch.float32: tensor = tensor.to(torch.float16) + elif tensor.dtype == torch.float16: + pass + else: raise ValueError(f" ## Error: unsupported tensor dtype in {self.lora_path}") # Move to target device diff --git a/exllama/model.py b/exllama/model.py index 68a6e7b4..5f4c1e7c 100644 --- a/exllama/model.py +++ b/exllama/model.py @@ -9,6 +9,10 @@ from . 
import cuda_ext +try: + from flash_attn import flash_attn_func +except: + pass class ParsedEnum(Enum): @@ -37,9 +41,9 @@ def __init__(self, model_config_path): # Loaded/automatic settings - self.bos_token_id = read_config["bos_token_id"] # Note that the HF LlamaTokenizer doesn't seem to recognize these automatically - self.eos_token_id = read_config["eos_token_id"] - self.pad_token_id = read_config["pad_token_id"] + self.bos_token_id = read_config["bos_token_id"] if "bos_token_id" in read_config else 1 + self.eos_token_id = read_config["eos_token_id"] if "eos_token_id" in read_config else 2 + self.pad_token_id = read_config["pad_token_id"] if "pad_token_id" in read_config else 0 self.hidden_size = read_config["hidden_size"] self.initializer_range = read_config["initializer_range"] @@ -56,7 +60,7 @@ def __init__(self, model_config_path): self.num_key_value_heads = self.num_attention_heads self.num_key_value_groups = 1 - self.rotary_embedding_base = 10000 # Constant used for pretrained models, leave as is unless retraining + self.rotary_embedding_base = read_config["rope_theta"] if "rope_theta" in read_config else 10000.0 self.head_dim = self.hidden_size // self.num_attention_heads self.groupsize = None # Autodetected @@ -65,7 +69,7 @@ def __init__(self, model_config_path): # Required settings - self.model_path = None + self.model_path = None # str or list[str] self.device_map = ExLlamaDeviceMap(self.num_hidden_layers) # Optional settings @@ -80,6 +84,7 @@ def __init__(self, model_config_path): # Tuning + self.use_flash_attn_2 = False self.matmul_recons_thd = 8 self.fused_mlp_thd = 2 self.sdp_thd = 8 @@ -115,6 +120,7 @@ def set_auto_map(self, map_string): def calculate_rotary_embedding_base(self): self.rotary_embedding_base = self.rotary_embedding_base * self.alpha_value ** (self.head_dim / (self.head_dim-2)) + # 4-bit linear layer implementation class Ex4bitLinear: @@ -351,7 +357,7 @@ def fused(self, hidden_states, cache, buffer, input_layernorm, lora): self.config.head_dim, cache.key_states[self.index], cache.value_states[self.index], - self.config.max_seq_len, + cache.max_seq_len, q_a, q_b, k_a, k_b, v_a, v_b, @@ -361,8 +367,8 @@ def fused(self, hidden_states, cache, buffer, input_layernorm, lora): # Get k, v with past - key_states = cache.key_states[self.index].narrow(2, 0, past_len + q_len) - value_states = cache.value_states[self.index].narrow(2, 0, past_len + q_len) + key_states = cache.key_states[self.index].narrow(2, 0, past_len + q_len).narrow(0, 0, bsz) + value_states = cache.value_states[self.index].narrow(2, 0, past_len + q_len).narrow(0, 0, bsz) # Repeat K/V heads if num_key_value_headsn_kv_heads < n_heads @@ -374,12 +380,26 @@ def fused(self, hidden_states, cache, buffer, input_layernorm, lora): # TODO: Figure out if we can use cublasHgemmStridedBatched() to do this matmul without reshaping. Torch uses # gemmStridedBatchedEx() internally, so it should be possible. 
- key_states.transpose_(2, 3) - attn_weights = torch.matmul(query_states, key_states) - attn_weights /= math.sqrt(self.config.head_dim) - attn_weights = nn.functional.softmax(attn_weights, dim = -1, dtype = torch.float16) - attn_output = torch.matmul(attn_weights, value_states) - attn_output = attn_output.transpose(1, 2) + # -- Flash Attention 2.0 + + if self.config.use_flash_attn_2 and (past_len == 0 or q_len == 1): + + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + query_states = query_states.transpose(1, 2) + attn_output = flash_attn_func(query_states, key_states, value_states, causal = (past_len == 0)) + + # -- HF Transformers regular attention, faster on shorter sequences, same VRAM usage + + else: + + key_states.transpose_(2, 3) + attn_weights = torch.matmul(query_states, key_states) + attn_weights /= math.sqrt(self.config.head_dim) + attn_weights = nn.functional.softmax(attn_weights, dim = -1, dtype = torch.float16) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, q_len, self.config.hidden_size) # Output projection @@ -411,26 +431,33 @@ def forward(self, hidden_states, cache, buffer, lora): # Add keys and values to cache - new_keys = cache.key_states[self.index].narrow(2, past_len, q_len) - new_values = cache.value_states[self.index].narrow(2, past_len, q_len) + new_keys = cache.key_states[self.index].narrow(2, past_len, q_len).narrow(0, 0, bsz) + new_values = cache.value_states[self.index].narrow(2, past_len, q_len).narrow(0, 0, bsz) new_keys.copy_(key_states) new_values.copy_(value_states) # Key/value tensors with past - key_states = cache.key_states[self.index].narrow(2, 0, past_len + q_len) - value_states = cache.value_states[self.index].narrow(2, 0, past_len + q_len) + key_states = cache.key_states[self.index].narrow(2, 0, past_len + q_len).narrow(0, 0, bsz) + value_states = cache.value_states[self.index].narrow(2, 0, past_len + q_len).narrow(0, 0, bsz) - # Repeat K/V heads if num_key_value_headsn_kv_heads < n_heads + # Attention - key_states = self.repeat_kv(key_states, self.config.num_key_value_groups) - value_states = self.repeat_kv(value_states, self.config.num_key_value_groups) + # -- Flash Attention 2.0 - # Attention + if self.config.use_flash_attn_2 and (past_len == 0 or q_len == 1): + + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + query_states = query_states.transpose(1, 2) + attn_output = flash_attn_func(query_states, key_states, value_states, causal = (past_len == 0)) # -- HF Transformers regular attention, faster on shorter sequences, same VRAM usage - if self.config.sdp_thd == 0 or q_len < self.config.sdp_thd: + elif self.config.sdp_thd == 0 or q_len < self.config.sdp_thd: + + key_states = self.repeat_kv(key_states, self.config.num_key_value_groups) + value_states = self.repeat_kv(value_states, self.config.num_key_value_groups) attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) attn_weights /= math.sqrt(self.config.head_dim) @@ -446,6 +473,13 @@ def forward(self, hidden_states, cache, buffer, lora): # Torch's SDP attention has a built-in causal mask feature which we can use only when there is no past, i.e. # it can only apply a square attention mask. It saves quite a bit of VRAM but in practice Torch seems to use # the same amount of memory at peak anyway. + # + # TODO: Apparently flash attention is disabled when supplying an attention mask tensor. 
Figure out if this + # is true and maybe drop SDP altogether. If causal masking in flash-attn is updated eventually there should + # be no need for this anyway. + + key_states = self.repeat_kv(key_states, self.config.num_key_value_groups) + value_states = self.repeat_kv(value_states, self.config.num_key_value_groups) if past_len > 0 or (bsz > 1 and buffer.attn_mask is not None): attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, attn_mask = buffer.attn_mask, is_causal = False) @@ -686,27 +720,27 @@ def __init__(self, config): self.config.set_tuning_params() - # Load model weights - - tensors = {} - with safe_open(self.config.model_path, framework = "pt", device = "cpu") as f: + # Read tensor list from file(s) - # Begin auto mapping if enabled + if isinstance(self.config.model_path, str): model_path = [self.config.model_path] + else: model_path = self.config.model_path - decoder_size = 0 - norm_size = 0 - head_size = 0 - half_element_size = torch.tensor([], dtype = torch.float16).element_size() + # Read tensor list from file(s), and measure layer sizes - if self.config.auto_map is not None: + load_keys = {} - self.config.device_map.embed_tokens = "cpu" - self.config.device_map.layers = ["cuda:0"] + ["?"] * (self.config.num_hidden_layers - 1) + decoder_size = 0 + norm_size = 0 + head_size = 0 + for path in model_path: + with safe_open(path, framework = "pt", device = "cpu") as f: for key in f.keys(): if _skip_key(key): continue + load_keys[key] = path + if key.startswith("model.layers.0."): tensor_slice = f.get_slice(key) shape = tensor_slice.get_shape() @@ -725,56 +759,58 @@ def __init__(self, config): head_size += math.prod(shape) * _layer_dtype_size(key) del tensor_slice - # Assign layers automatically + # Begin auto mapping if enabled + + if self.config.auto_map is not None: - device_usage = 0 - device_index = 0 - layer_index_device = 0 - max_usage = self.config.auto_map[device_index] * (1024 ** 3) + self.config.device_map.embed_tokens = "cpu" + self.config.device_map.layers = ["cuda:0"] + ["?"] * (self.config.num_hidden_layers - 1) - for layer in range(self.config.num_hidden_layers + 2): + # Assign layers automatically - this_layer_size = decoder_size - if layer == self.config.num_hidden_layers + 0: this_layer_size = norm_size - elif layer == self.config.num_hidden_layers + 1: this_layer_size = head_size + device_usage = 0 + device_index = 0 + layer_index_device = 0 + max_usage = self.config.auto_map[device_index] * (1024 ** 3) - while device_usage + this_layer_size > max_usage: - device_index += 1 - device_usage = 0 - layer_index_device = 0 - max_usage = self.config.auto_map[device_index] * (1024 ** 3) - if device_index >= len(self.config.auto_map): raise ValueError("Model too large for device allocation scheme.") + for layer in range(self.config.num_hidden_layers + 2): - target = f"cuda:{device_index}" - if layer == self.config.num_hidden_layers + 0: self.config.device_map.norm = target - elif layer == self.config.num_hidden_layers + 1: self.config.device_map.lm_head = target - else: self.config.device_map.layers[layer] = f"cuda:{device_index}" + this_layer_size = decoder_size + if layer == self.config.num_hidden_layers + 0: this_layer_size = norm_size + elif layer == self.config.num_hidden_layers + 1: this_layer_size = head_size - device_usage += this_layer_size - layer_index_device += 1 + while device_usage + this_layer_size > max_usage: + device_index += 1 + device_usage = 0 + layer_index_device = 0 + max_usage = self.config.auto_map[device_index] 
* (1024 ** 3) + if device_index >= len(self.config.auto_map): raise ValueError("Model too large for device allocation scheme.") - # Read tensor list from file + target = f"cuda:{device_index}" + if layer == self.config.num_hidden_layers + 0: self.config.device_map.norm = target + elif layer == self.config.num_hidden_layers + 1: self.config.device_map.lm_head = target + else: self.config.device_map.layers[layer] = f"cuda:{device_index}" - load_keys = [] - with safe_open(self.config.model_path, framework = "pt", device = "cpu") as f: - for key in f.keys(): - load_keys.append(key) + device_usage += this_layer_size + layer_index_device += 1 - # Load up to 1 GB of tensors at a time, closing and reopening the file in between each chunk + # Load up to 1 GB of tensors at a time, closing and reopening the file in between each chunk max_dq_buffer_size = 0 - f = None + tensors = {} + st_mem = 0 MAX_ST_MEM = 1024**3 + f = None + prev_path = "" + for key, path in load_keys.items(): - for key in load_keys: - - if _skip_key(key): continue device = self.config.device_map.map(key) - if f is None or st_mem > MAX_ST_MEM: + if f is None or st_mem > MAX_ST_MEM or path != prev_path: if f is not None: del f - f = safe_open(self.config.model_path, framework = "pt", device = "cpu") + f = safe_open(path, framework = "pt", device = "cpu") + prev_path = path st_mem = 0 tensor = f.get_tensor(key) @@ -788,10 +824,13 @@ def __init__(self, config): if key.endswith(".input_layernorm.weight"): tensor = tensor.half() if key.endswith(".post_attention_layernorm.weight"): tensor = tensor.half() - tensor = tensor.to(device, non_blocking = True) - if key.endswith(".qweight"): max_dq_buffer_size = max(max_dq_buffer_size, tensor.numel() * 8) + if device == "cpu": keep_tensor = tensor.clone() + else: keep_tensor = tensor.to(device) + del tensor - tensors[key] = tensor + if key.endswith(".qweight"): max_dq_buffer_size = max(max_dq_buffer_size, keep_tensor.numel() * 8) + + tensors[key] = keep_tensor del f @@ -805,6 +844,8 @@ def __init__(self, config): self.embed_tokens = nn.Embedding(self.config.vocab_size, self.config.hidden_size, self.config.pad_token_id, device = "meta") self.embed_tokens.weight = nn.Parameter(tensors["model.embed_tokens.weight"]) + with torch.no_grad(): + self.embed_tokens.weight[self.config.pad_token_id] = 0 # Norm @@ -886,10 +927,14 @@ def forward(self, q_len = input_ids.shape[-1] remaining_q_len = q_len bsz = input_ids.shape[0] + + assert input_mask is None or (input_mask.shape[-1] >= input_ids.shape[-1] and input_mask.shape[-2] == input_ids.shape[-2]) + # The buffers can only fit max_input_len tokens, so with larger batch sizes we reduce our work size correspondingly. 
+ effective_max_input_len = self.config.max_input_len // bsz - # Split forward pass + # Split sequence result = None @@ -900,14 +945,16 @@ def forward(self, chunk_size = min(remaining_q_len, effective_max_input_len) - # Limit chunk_size to keep size of attention operation <= max_attention_size + # Limit chunk_size to keep size of attention operation <= max_attention_size, unless using flash-attn - past_len = cache.current_seq_len - attn_size = (past_len + remaining_q_len) * remaining_q_len - max_a = self.config.max_attention_size - if attn_size > max_a: - cs = (math.sqrt(past_len ** 2 + 4 * max_a) - past_len) / 2 - chunk_size = math.floor(cs) + if not self.config.use_flash_attn_2 or chunk_begin > 0: + + past_len = cache.current_seq_len + attn_size = (past_len + remaining_q_len) * remaining_q_len + max_a = self.config.max_attention_size + if attn_size > max_a: + cs = (math.sqrt(past_len ** 2 + 4 * max_a) - past_len) / 2 + chunk_size = min(chunk_size, math.floor(cs)) # Process chunk @@ -942,8 +989,6 @@ def _forward(self, output_device = None, input_mask = None): - assert input_mask is None or input_mask.shape == input_ids.shape - # if torch.is_grad_enabled(): # raise ValueError("Forward pass called with gradients enabled. Back propagation is not supported yet.") with torch.no_grad(): @@ -958,7 +1003,9 @@ def _forward(self, devs = self.config.device_map.get_layers_devs() - if seq_len > 1: + # if not self.config.use_flash_attn_2: + + if seq_len > 1 or input_mask is not None: attn_mask = torch.zeros(batch_size, 1, seq_len, past_len + seq_len, dtype = torch.float16, device = devs[0]) attn_mask_triu = torch.triu(torch.full((seq_len - 1, seq_len - 1), -65504.)) @@ -966,6 +1013,7 @@ def _forward(self, if input_mask is not None: + input_mask = input_mask[:, :past_len + seq_len] input_mask = _move_tensor(input_mask, devs[0], "input_mask", self.config) input_mask = torch.where(input_mask, 0, -65504.).half() input_mask = input_mask.unsqueeze(1).unsqueeze(2) @@ -978,6 +1026,10 @@ def _forward(self, buffer.attn_mask = attn_mask + # else: + # + # buffer.attn_mask = None + # Embeddings # TODO: Allow passing input embeddings instead of IDs diff --git a/exllama/tokenizer.py b/exllama/tokenizer.py index 596b4a39..ec9114b7 100644 --- a/exllama/tokenizer.py +++ b/exllama/tokenizer.py @@ -12,58 +12,178 @@ def __init__(self, tokenizer_model_path): self.unk_token = "" self.bos_token = "" self.eos_token = "" - self.unk_token_id = self.tokenizer.unk_id() + self.unk_token_id = self.tokenizer.unk_id() # is the same as pad token id... 
self.eos_token_id = self.tokenizer.eos_id() self.bos_token_id = self.tokenizer.bos_id() self.pad_token_id = 0 # self.tokenizer.pad_id() self.newline_token_id = 13 + self.special_characters = [(self.bos_token, self.bos_token_id), (self.eos_token, self.eos_token_id), (self.unk_token, self.unk_token_id)] # for tokenzier encoding # Encode string - def encode(self, text): + def encode(self, text, return_mask = False, max_seq_len = 2048, add_bos = False, add_eos = False, encode_special_characters = False): if isinstance(text, list): # text is a list of strings list_ids = self.tokenizer.EncodeAsIds(text) + + # pad bos and eos + + if add_bos: + for ids in list_ids: ids.insert(0, self.bos_token_id) + if add_eos: + for ids in list_ids: ids.append(self.eos_token_id) + max_length = max([len(ids) for ids in list_ids]) + needs_mask = False padded_ids = [] for ids in list_ids: + if len(ids) != len(list_ids[0]): needs_mask = True padding = torch.full((max_length - len(ids),), self.pad_token_id) sequence = torch.tensor(ids) padded_ids.append(torch.cat((padding, sequence), dim = 0).long()) - return torch.stack(padded_ids, dim = 0) + stacked_ids = torch.stack(padded_ids, dim = 0) + + if return_mask: + if needs_mask: + mask_padding = torch.full((stacked_ids.shape[0], max_seq_len - stacked_ids.shape[1]), True, dtype = torch.bool, device = "cpu") + mask = stacked_ids != 0 + mask = torch.cat((mask, mask_padding), dim = 1) + return stacked_ids, mask + else: + return stacked_ids, None + else: + return stacked_ids else: # text is a single string - - ids = self.tokenizer.EncodeAsIds(text) - return torch.tensor(ids).unsqueeze(0) - - def decode(self, ids): + split_text = [text] + + # look for special characters + if encode_special_characters: + for special_character, special_token_id in self.special_characters: + temp_text = [] + for segment in split_text: + if isinstance(segment, str) and special_character in segment: + # for each special character, append the text before the special character, then append the special character ID, then the rest of the text + parts = segment.split(special_character) + new_parts = [] + for i, part in enumerate(parts): + new_parts.append(part) + if i < len(parts) - 1: # add the special token id between parts, but not after the last part + new_parts.append(special_token_id) + temp_text.extend(new_parts) + else: + temp_text.append(segment) + split_text = temp_text + + ids = [] + + for text_chunk in split_text: + if isinstance(text_chunk, str): + ids += self.tokenizer.EncodeAsIds(text_chunk) + else: + ids.append(text_chunk) + + # pad bos and eos + + if add_bos: + ids = [self.bos_token_id] + ids + if add_eos: + ids = ids + [self.eos_token_id] + + stacked_ids = torch.tensor(ids).unsqueeze(0) + + if return_mask: + return stacked_ids, None + else: + return stacked_ids + + def decode(self, ids, decode_special_characters=False): + + special_ids = {id_: char for char, id_ in self.special_characters} # create a lookup dictionary if ids.dim() > 1: - + texts = [] for i in range(ids.shape[0]): seq = ids[i].tolist() seq = [t for t in seq if t != self.pad_token_id] - if self.eos_token_id in seq: seq = seq[:seq.index(self.eos_token_id)] - texts.append(self.tokenizer.Decode(seq)) + + if decode_special_characters: + text_parts = [] + normal_ids = [] # list of lists + current_normal_ids = [] # current list of normal IDs + for idx, id_ in enumerate(seq): + if id_ in special_ids: + # Save the current list of normal IDs, then start a new one + normal_ids.append(current_normal_ids) + current_normal_ids = [] + 
# Store special token as a string + text_parts.append(special_ids[id_]) + else: + current_normal_ids.append(id_) + normal_ids.append(current_normal_ids) # save the last segment of normal IDs + + decoded_segments = [self.tokenizer.Decode(segment) for segment in normal_ids] + for idx, decoded_segment in enumerate(decoded_segments): + text_parts.insert(2*idx, decoded_segment) + + texts.append("".join(text_parts)) + else: + if self.eos_token_id in seq: # to not mess up special char decoding + seq = seq[:seq.index(self.eos_token_id)] + texts.append(self.tokenizer.Decode(seq)) + return texts else: - + ids = ids.tolist() - text = self.tokenizer.Decode(ids) + + if decode_special_characters: + + text_parts = [] + normal_ids = [] # list of lists + current_normal_ids = [] # current list of normal IDs + for idx, id_ in enumerate(ids): + if id_ in special_ids: + # Save the current list of normal IDs, then start a new one + normal_ids.append(current_normal_ids) + current_normal_ids = [] + # Store special token as a string + text_parts.append(special_ids[id_]) + else: + current_normal_ids.append(id_) + normal_ids.append(current_normal_ids) # save the last segment of normal IDs + + decoded_segments = [self.tokenizer.Decode(segment) for segment in normal_ids] + for idx, decoded_segment in enumerate(decoded_segments): + text_parts.insert(2*idx, decoded_segment) + + text = "".join(text_parts) + + else: + + text = self.tokenizer.Decode(ids) + return text - def num_tokens(self, text): - ids = self.tokenizer.Encode(text) - return len(ids) + def num_tokens(self, text, encode_special_characters = False): + + if encode_special_characters: + + ids = self.encode(text, encode_special_characters = True) + return ids.size(1) + + else: + + ids = self.tokenizer.Encode(text) + return len(ids) diff --git a/exllama_ext/cpu_func/rep_penalty.cpp b/exllama_ext/cpu_func/rep_penalty.cpp index 26c9dc16..e83849ad 100644 --- a/exllama_ext/cpu_func/rep_penalty.cpp +++ b/exllama_ext/cpu_func/rep_penalty.cpp @@ -17,7 +17,7 @@ void rep_penalty_cpu float dv = decay ? (1.0f - penalty_max) / (float) decay : 0.0f; int s = sustain == -1 ? seq_len : sustain; - int beg = seq_len - sustain - decay; + int beg = seq_len - s - decay; if (beg < 0) beg = 0; for (int i = 0; i < vocab_size; i++) rep_mask[i] = 1.0f; @@ -57,7 +57,7 @@ void apply_rep_penalty_cpu float dv = decay ? (1.0f - penalty_max) / (float) decay : 0.0f; int s = sustain == -1 ? 
seq_len : sustain; - int beg = seq_len - sustain - decay; + int beg = seq_len - s - decay; if (beg < 0) beg = 0; for (int i = seq_len; i > beg;) diff --git a/exllama_ext/cuda_func/q4_matmul.cu b/exllama_ext/cuda_func/q4_matmul.cu index 4bbe5883..a1b7be70 100644 --- a/exllama_ext/cuda_func/q4_matmul.cu +++ b/exllama_ext/cuda_func/q4_matmul.cu @@ -11,6 +11,10 @@ const int THREADS_X = 32; // Block size and thread count along columns in w and out const int THREADS_Y = 1; // Block size and thread count along rows in x and out +#if defined(USE_SMEM) +const int GROUP_STEP = 32; // Assumed group size when block_size_z % groupsize != 0 +#endif + typedef void (*fp_q4_matmul_kernel) ( const half*, @@ -44,12 +48,19 @@ __global__ void q4_matmul_kernel bool no_zero ) { + #if defined(USE_SMEM) + + extern __shared__ half2 x_cache[]; + half* x_cache_h = (half*)x_cache; + + #endif + // Start of block int x_column = block_size_z * blockIdx.z; int x_column_end = min(dim, block_size_z * (blockIdx.z + 1)); - int w_column = THREADS_X * blockIdx.x + threadIdx.x; // assume width of weight matrix divisible by THREADS_X (32) + int w_column = THREADS_X * blockIdx.x + threadIdx.x; // assume width of weight matrix divisible by THREADS_X int x_row = THREADS_Y * blockIdx.y + threadIdx.y; int iterations = (x_column_end - x_column) / 8; @@ -67,8 +78,8 @@ __global__ void q4_matmul_kernel if (!no_zero && blockIdx.z == 0 && (threadIdx.x & 1) == 0) { *((uint32_t*) out_.item_ptr(x_row, w_column)) = 0; - __syncthreads(); } + __syncthreads(); // Loop over part of x row (and w column) @@ -82,49 +93,109 @@ __global__ void q4_matmul_kernel for (int k = x_column, group = x_column / groupsize; k < x_column + iterations * 8; group++, k += groupsize) { - if constexpr (use_half2) - { - half2 w_scale = w_scales_.item_half2half2(group, w_column); - uint32_t w_zero = w_zeros_.item(group, w_column) + 1; - - if constexpr (use_x_map) acc = dot_product_8_x_map(acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8, x_map); - else acc = dot_product_8 (acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8); - } - else - { - half w_scale = w_scales_.item(group, w_column); - uint32_t w_zero = w_zeros_.item(group, w_column) + 1; - - if constexpr (use_x_map) acc_h = dot_product_8_x_map_h(acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8, x_map); - else acc_h = dot_product_8_h (acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8); - } + #if defined(USE_SMEM) + + for (int i = threadIdx.x; i < groupsize; i += THREADS_X) + { + if constexpr (use_x_map) x_cache_h[i] = *x_.item_ptr(x_row, x_map[k + i]); + else x_cache_h[i] = *x_.item_ptr(x_row, k + i); + } + __syncthreads(); + + if constexpr (use_half2) + { + half2 w_scale = w_scales_.item_half2half2(group, w_column); + uint32_t w_zero = w_zeros_.item(group, w_column) + 1; + acc = dot_product_8(acc, x_cache, w_, k, w_column, w_scale, w_zero, groupsize / 8); + } + else + { + half w_scale = w_scales_.item(group, w_column); + uint32_t w_zero = w_zeros_.item(group, w_column) + 1; + acc_h = dot_product_8_h(acc_h, x_cache_h, w_, k, w_column, w_scale, w_zero, groupsize / 8); + } + __syncthreads(); + + #else + + if constexpr (use_half2) + { + half2 w_scale = w_scales_.item_half2half2(group, w_column); + uint32_t w_zero = w_zeros_.item(group, w_column) + 1; + + if constexpr (use_x_map) acc = dot_product_8_x_map(acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8, x_map); + else acc = dot_product_8 (acc, (const half2*) x_.item_ptr(x_row, k), 
w_, k, w_column, w_scale, w_zero, groupsize / 8); + } + else + { + half w_scale = w_scales_.item(group, w_column); + uint32_t w_zero = w_zeros_.item(group, w_column) + 1; + + if constexpr (use_x_map) acc_h = dot_product_8_x_map_h(acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8, x_map); + else acc_h = dot_product_8_h (acc_h, x_.item_ptr(x_row, k), w_, k, w_column, w_scale, w_zero, groupsize / 8); + } + + #endif } } else { - // Otherwise assume groupsize is a multiple of 8, do 8 columns per iteration and trust the cache + // Otherwise assume groupsize is a multiple of GROUP_STEP, do GROUP_STEP columns per iteration and trust the cache - for (int k = x_column; k < x_column + iterations * 8; k += 8) - { - if constexpr (use_half2) - { - int group = k / groupsize; - half2 w_scale = w_scales_.item_half2half2(group, w_column); - uint32_t w_zero = w_zeros_.item(group, w_column) + 1; + #if defined(USE_SMEM) - if constexpr (use_x_map) acc = dot_product_8_x_map(acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1, x_map); - else acc = dot_product_8 (acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1); - } - else + for (int k = x_column; k < x_column + iterations * 8; k += GROUP_STEP) { - int group = k / groupsize; - half w_scale = w_scales_.item(group, w_column); - uint32_t w_zero = w_zeros_.item(group, w_column) + 1; + for (int i = threadIdx.x; i < GROUP_STEP; i += THREADS_X) + { + if constexpr (use_x_map) x_cache_h[i] = *x_.item_ptr(x_row, x_map[k + i]); + else x_cache_h[i] = *x_.item_ptr(x_row, k + i); + } + __syncthreads(); + + if constexpr (use_half2) + { + int group = k / groupsize; + half2 w_scale = w_scales_.item_half2half2(group, w_column); + uint32_t w_zero = w_zeros_.item(group, w_column) + 1; + acc = dot_product_8(acc, x_cache, w_, k, w_column, w_scale, w_zero, GROUP_STEP / 8); + } + else + { + int group = k / groupsize; + half w_scale = w_scales_.item(group, w_column); + uint32_t w_zero = w_zeros_.item(group, w_column) + 1; + acc_h = dot_product_8_h(acc_h, x_cache_h, w_, k, w_column, w_scale, w_zero, GROUP_STEP / 8); + } + __syncthreads(); + } - if constexpr (use_x_map) acc_h = dot_product_8_x_map_h(acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1, x_map); - else acc_h = dot_product_8_h (acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1); + #else + + for (int k = x_column; k < x_column + iterations * 8; k += 8) + { + if constexpr (use_half2) + { + int group = k / groupsize; + half2 w_scale = w_scales_.item_half2half2(group, w_column); + uint32_t w_zero = w_zeros_.item(group, w_column) + 1; + + if constexpr (use_x_map) acc = dot_product_8_x_map(acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1, x_map); + else acc = dot_product_8 (acc, (const half2*) x_.item_ptr(x_row, k), w_, k, w_column, w_scale, w_zero, 1); + } + else + { + int group = k / groupsize; + half w_scale = w_scales_.item(group, w_column); + uint32_t w_zero = w_zeros_.item(group, w_column) + 1; + + if constexpr (use_x_map) acc_h = dot_product_8_x_map_h(acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1, x_map); + else acc_h = dot_product_8_h (acc_h, x_.item_ptr(x_row, k), w_, k, w_column, w_scale, w_zero, 1); + } } - } + + #endif + } // Add to block result @@ -213,7 +284,18 @@ void q4_matmul_cuda ); fp_q4_matmul_kernel kernel = q4_matmul_kernel_pick(tuningParams, block_size_z, w->groupsize, x_map); - kernel<<>> (x_mapped, w->cuda_qweight, out, w->cuda_scales, w->cuda_qzeros, height, dim, width, w->groupsize, block_size_z, x_map, no_zero); + + #if defined(USE_SMEM) + + int 
shared_mem = (block_size_z % w->groupsize == 0 ? w->groupsize : GROUP_STEP) * sizeof(half); + + # else + + int shared_mem = 0; + + #endif + + kernel<<>> (x_mapped, w->cuda_qweight, out, w->cuda_scales, w->cuda_qzeros, height, dim, width, w->groupsize, block_size_z, x_map, no_zero); } void q4_matmul_recons_cuda diff --git a/exllama_ext/cuda_func/q4_matmul.cuh b/exllama_ext/cuda_func/q4_matmul.cuh index 8c90e821..1bd91250 100644 --- a/exllama_ext/cuda_func/q4_matmul.cuh +++ b/exllama_ext/cuda_func/q4_matmul.cuh @@ -16,6 +16,10 @@ #define rocblas_handle hipblasHandle_t #endif +#if !defined(USE_ROCM) && (!defined(__CUDA_ARCH__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700)) +#define USE_SMEM +#endif + void q4_matmul_cuda ( ExLlamaTuning* tuningParams, diff --git a/exllama_ext/hip_compat.cuh b/exllama_ext/hip_compat.cuh index e650cd4a..1301ca08 100644 --- a/exllama_ext/hip_compat.cuh +++ b/exllama_ext/hip_compat.cuh @@ -1,7 +1,7 @@ #ifndef _hip_compat_cuh #define _hip_compat_cuh -// Workaround for a bug in hipamd, backported from upstream. +// Workaround for a bug in hipamd, backported from upstream, this is fixed in ROCm 5.6. __device__ __forceinline__ __half __compat_hrcp(__half x) { return __half_raw{ static_cast<_Float16>(__builtin_amdgcn_rcph(static_cast<__half_raw>(x).data))}; @@ -15,7 +15,7 @@ __device__ __forceinline__ __half2 __compat_h2rcp(__half2 x) { #define hrcp __compat_hrcp #define h2rcp __compat_h2rcp -// Workaround for hipify_python using rocblas instead of hipblas. +// Automatic conversion of hipblasHgemm doesn't convert half to hipblasHalf. __host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(hipblasHandle_t handle, hipblasOperation_t transA, hipblasOperation_t transB, @@ -37,7 +37,9 @@ __host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(hipblasHandle_t reinterpret_cast(beta), reinterpret_cast(CP), ldc); } +#define hipblasHgemm __compat_hipblasHgemm +// Previous version of PyTorch were converting to rocBLAS instead of hipBLAS. 
#define rocblas_handle hipblasHandle_t #define rocblas_operation_none HIPBLAS_OP_N #define rocblas_get_stream hipblasGetStream diff --git a/exllama_ext/matrix.cuh b/exllama_ext/matrix.cuh index 1b9ad165..0c421c60 100644 --- a/exllama_ext/matrix.cuh +++ b/exllama_ext/matrix.cuh @@ -87,9 +87,7 @@ public: __device__ __forceinline__ half2 dot_product_8 ( const half2 acc, - MatrixView_half& h_, - const int h_row, - const int h_column, // divisible by 8 + const half2* h_ptr, MatrixView_q4_column& v_, const int v_row, // divisible by 8 const int v_column, @@ -98,7 +96,6 @@ __device__ __forceinline__ half2 dot_product_8 const int count ) { - const half2* h_ptr = (const half2*) h_.item_ptr(h_row, h_column); const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column); half2 result = acc; @@ -138,9 +135,7 @@ __device__ __forceinline__ half2 dot_product_8 __device__ __forceinline__ half dot_product_8_h ( const half acc, - MatrixView_half& h_, - const int h_row, - const int h_column, // divisible by 8 + const half* h_ptr, MatrixView_q4_column& v_, const int v_row, // divisible by 8 const int v_column, @@ -149,7 +144,6 @@ __device__ __forceinline__ half dot_product_8_h const int count ) { - const half* h_ptr = h_.item_ptr(h_row, h_column); const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column); half result = acc; @@ -291,4 +285,4 @@ __device__ __forceinline__ half dot_product_8_x_map_h return result; } -#endif \ No newline at end of file +#endif diff --git a/globals.py b/globals.py new file mode 100644 index 00000000..f2895221 --- /dev/null +++ b/globals.py @@ -0,0 +1,22 @@ +import os + +def set_affinity_mask(affinity_mask = None): + + if affinity_mask is None: + cpu_count = os.cpu_count() + affinity_mask = set(range(cpu_count)) + + os.sched_setaffinity(0, affinity_mask) + + +def set_affinity_list(affinity_list = None): + + if affinity_list is None: set_affinity_mask(None) + else: set_affinity_mask(set(affinity_list)) + + +def set_affinity_str(affinity_str = None): + + if affinity_str is None or affinity_str.isspace(): set_affinity_mask(None) + aff = [int(alloc) for alloc in affinity_str.split(",")] + set_affinity_list(aff) diff --git a/model_init.py b/model_init.py index 68901992..e3a4c347 100644 --- a/model_init.py +++ b/model_init.py @@ -2,6 +2,7 @@ from tokenizer import ExLlamaTokenizer import argparse, sys, os, glob from torch import version as torch_version +from globals import set_affinity_str def add_args(parser): @@ -14,9 +15,12 @@ def add_args(parser): parser.add_argument("-l", "--length", type = int, help = "Maximum sequence length", default = 2048) parser.add_argument("-cpe", "--compress_pos_emb", type = float, help = "Compression factor for positional embeddings", default = 1.0) parser.add_argument("-a", "--alpha", type = float, help = "alpha for context size extension via embedding extension", default = 1.0) + parser.add_argument("-theta", "--theta", type = float, help = "theta (base) for RoPE embeddings") parser.add_argument("-gpfix", "--gpu_peer_fix", action = "store_true", help = "Prevent direct copies of data between GPUs") + parser.add_argument("-flash", "--flash_attn", nargs = '?', const = 'default', metavar = "METHOD", help = "Use Flash Attention with specified input length (must have Flash Attention 2.0 installed)") + parser.add_argument("-mmrt", "--matmul_recons_thd", type = int, help = "No. rows at which to use reconstruction and cuBLAS for quant matmul. 
0 = never, 1 = always", default = 8) parser.add_argument("-fmt", "--fused_mlp_thd", type = int, help = "Maximum no. of rows for which to use fused MLP. 0 = never", default = 2) parser.add_argument("-sdpt", "--sdp_thd", type = int, help = "No. rows at which to switch to scaled_dot_product_attention. 0 = never, 1 = always", default = 8) @@ -31,6 +35,8 @@ def add_args(parser): parser.add_argument("-fh2", "--force_half2", action = "store_true", help = "Force enable half2 even if unsupported") parser.add_argument("-cs", "--concurrent_streams", action = "store_true", help = "Use concurrent CUDA streams") + parser.add_argument("-aff", "--affinity", type = str, help = "Comma-separated list, sets processor core affinity. E.g.: -aff 0,1,2,3") + def post_parse(args): @@ -53,10 +59,10 @@ def get_model_files(args): if len(st) == 0: print(f" !! No files matching {st_pattern}") sys.exit() - if len(st) > 1: - print(f" !! Multiple files matching {st_pattern}") - sys.exit() - args.model = st[0] + # if len(st) > 1: + # print(f" !! Multiple files matching {st_pattern}") + # sys.exit() + args.model = st else: if args.tokenizer is None or args.config is None or args.model is None: print(" !! Please specify either -d or all of -t, -c and -m") @@ -65,17 +71,28 @@ def get_model_files(args): # Feedback +def _common_chars(names): + cname = max(names, key = len) + for x in names: + for p, c in enumerate(x): + if c != cname[p] and cname[p] != "*": cname = cname[:p] + "*" + cname[p+1:] + return cname + def print_options(args, extra_options = None): print_opts = [] if args.gpu_split is not None: print_opts.append(f"gpu_split: {args.gpu_split}") if args.gpu_peer_fix: print_opts.append("gpu_peer_fix") + if args.affinity: print_opts.append(f" --affinity: {args.affinity}") if extra_options is not None: print_opts += extra_options print(f" -- Tokenizer: {args.tokenizer}") print(f" -- Model config: {args.config}") - print(f" -- Model: {args.model}") + + if isinstance(args.model, str): print(f" -- Model: {args.model}") + else: print(f" -- Model: {_common_chars(args.model)}") + print(f" -- Sequence length: {args.length}") if args.compress_pos_emb != 1.0: print(f" -- RoPE compression factor: {args.compress_pos_emb}") @@ -84,9 +101,12 @@ def print_options(args, extra_options = None): print(f" -- RoPE alpha factor: {args.alpha}") print(f" -- Tuning:") + + if args.flash_attn: print(f" -- --flash_attn") + else: print(f" -- --sdp_thd: {args.sdp_thd}" + (" (disabled)" if args.sdp_thd == 0 else "")) + print(f" -- --matmul_recons_thd: {args.matmul_recons_thd}" + (" (disabled)" if args.matmul_recons_thd == 0 else "")) print(f" -- --fused_mlp_thd: {args.fused_mlp_thd}" + (" (disabled)" if args.fused_mlp_thd == 0 else "")) - print(f" -- --sdp_thd: {args.sdp_thd}" + (" (disabled)" if args.sdp_thd == 0 else "")) if args.matmul_fused_remap: print(f" -- --matmul_fused_remap") if args.no_fused_attn: print(f" -- --no_fused_attn") if args.rmsnorm_no_half2: print(f" -- --rmsnorm_no_half2") @@ -112,6 +132,13 @@ def make_config(args): config.alpha_value = args.alpha config.calculate_rotary_embedding_base() + if args.flash_attn: + config.use_flash_attn_2 = True + try: + config.max_input_len = int(args.flash_attn) + except ValueError: + pass + config.matmul_recons_thd = args.matmul_recons_thd config.fused_mlp_thd = args.fused_mlp_thd config.sdp_thd = args.sdp_thd @@ -124,9 +151,19 @@ def make_config(args): config.silu_no_half2 = args.silu_no_half2 config.concurrent_streams = args.concurrent_streams + if args.theta: + config.rotary_embedding_base = 
args.theta + return config +# Global state + +def set_globals(args): + + if args.affinity: set_affinity_str(args.affinity) + + # Print stats after loading model def print_stats(model): diff --git a/requirements.txt b/requirements.txt index af86b688..94a3127b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ torch>=2.0.1 -safetensors==0.3.1 +safetensors==0.3.2 sentencepiece>=0.1.97 ninja==1.11.1 diff --git a/setup.py b/setup.py index 7753b1d7..415147fa 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ setup( name="exllama", - version="0.0.6", + version="0.0.7", install_requires=[ "torch", ], diff --git a/test_benchmark_inference.py b/test_benchmark_inference.py index 7ec5d48f..9811db7f 100644 --- a/test_benchmark_inference.py +++ b/test_benchmark_inference.py @@ -34,13 +34,14 @@ def begin(): def next_logits(input_ids, apply_lora, last_id_only = True, input_mask = None): global model, cache - n_logits = None - a = 0 - while a < input_ids.shape[-1]: - b = min(input_ids.shape[-1], a + 2048) - n_logits = model.forward(input_ids[:, a:b], cache, last_id_only, lora = apply_lora, input_mask = input_mask) - a = b - + # n_logits = None + # a = 0 + # while a < input_ids.shape[-1]: + # b = min(input_ids.shape[-1], a + 2048) + # n_logits = model.forward(input_ids[:, a:b], cache, last_id_only, lora = apply_lora, input_mask = input_mask) + # a = b + + n_logits = model.forward(input_ids, cache, last_id_only, lora=apply_lora, input_mask=input_mask) return n_logits @@ -117,6 +118,10 @@ def mem(name, total = False): model_init.print_options(args, print_opts) +# Globals + +model_init.set_globals(args) + # Instantiate model config = model_init.make_config(args) diff --git a/util/shard.py b/util/shard.py new file mode 100644 index 00000000..5b29db0f --- /dev/null +++ b/util/shard.py @@ -0,0 +1,84 @@ +import argparse, json, math, os +from safetensors import safe_open +from safetensors.torch import save_file + +parser = argparse.ArgumentParser(description = "Split .safetensors file into shards") +parser.add_argument("input_file", type = str, help = "Path to input file") +parser.add_argument("shard_size", type = int, help = "Shard size in megabytes") +args = parser.parse_args() + +input_file = args.input_file +input_base, _ = os.path.splitext(input_file) +shard_size = args.shard_size * 1024**2 + +# Create tensor map + +def _tsize(st, key): + + tslice = st.get_slice(key) + shape = tslice.get_shape() + numel = 1 + for x in shape: numel *= x + dtype = tslice.get_dtype() + del tslice + if dtype == "I32": return numel * 4 + elif dtype == "I16": return numel * 2 + elif dtype == "F16": return numel * 2 + elif dtype == "F32": return numel * 4 + else: raise ValueError("Unexpected datatype: " + key) + +num_files = 0 +current_size = shard_size + 1 +total_size = 0 +tensor_map = [] + +print(f" -- Scanning tensors in {input_file}") + +with safe_open(input_file, framework = "pt", device = "cpu") as f: + + for key in f.keys(): + + tensor_size = _tsize(f, key) + total_size += tensor_size + + if current_size + tensor_size > shard_size: + + num_files += 1 + current_size = 0 + current_list = [] + tensor_map.append(current_list) + + current_size += tensor_size + current_list.append(key) + +# Split into output files + +weight_map = {} + +for file_index, keys in enumerate(tensor_map): + + shard = {} + shard_filename = f"{input_base}-{file_index + 1:05}-of-{num_files:05}.safetensors" + + with safe_open(input_file, framework = "pt", device = "cpu") as f: + for key in keys: + print(f" -- Reading: {key}") + shard[key] 
= f.get_tensor(key) + weight_map[key] = shard_filename + + print(f" -- Writing: {shard_filename}") + save_file(shard, shard_filename) + +# Compile index + +index = { "metadata": { "total_size": total_size }, "weight_map": weight_map } +index_filename = f"{input_file}.index.json" + +print(f" -- Writing: {index_filename}") + +with open(index_filename, 'w') as f: + json.dump(index, f, indent = 2) + +# Done + +print(f" -- Done") \ No newline at end of file diff --git a/webui/app.py b/webui/app.py index c2fa909f..808e67d1 100644 --- a/webui/app.py +++ b/webui/app.py @@ -138,6 +138,8 @@ def api_append_block(): model_init.print_options(args) config = model_init.make_config(args) +model_init.set_globals(args) + print(f" -- Loading model...") model = ExLlama(config) diff --git a/webui/session.py b/webui/session.py index 60a979a7..403b1626 100644 --- a/webui/session.py +++ b/webui/session.py @@ -310,7 +310,16 @@ def api_populate(self): # Add model info - model_str = os.path.splitext(os.path.basename(model.config.model_path))[0] + "\n" + def _common_chars(names): + cname = max(names, key=len) + for x in names: + for p, c in enumerate(x): + if c != cname[p] and cname[p] != "*": cname = cname[:p] + "*" + cname[p + 1:] + return cname + + mp = model.config.model_path if isinstance(model.config.model_path, str) else _common_chars(model.config.model_path) + + model_str = os.path.splitext(os.path.basename(mp))[0] + "\n" model_str += f"Sequence length: {model.config.max_seq_len}\n" dic["model_info"] = model_str.strip()
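
Taken together, the changes above slightly alter how models are loaded and how text is generated. The sketch below is not part of the patch; it illustrates the new list-valued `config.model_path` together with the streaming ExLlamaAltGenerator added in alt_generator.py. The model directory, file names and the ExLlamaConfig class name are assumptions carried over from the repo's existing examples; the import paths mirror those used in alt_generator.py itself.

import os, glob
from model import ExLlama, ExLlamaCache, ExLlamaConfig
from tokenizer import ExLlamaTokenizer
from alt_generator import ExLlamaAltGenerator

model_directory = "/models/llama-13b-4bit"                  # hypothetical model directory
st_pattern = os.path.join(model_directory, "*.safetensors")

config = ExLlamaConfig(os.path.join(model_directory, "config.json"))
config.model_path = glob.glob(st_pattern)                   # list of shards, no [0] index anymore

model = ExLlama(config)
tokenizer = ExLlamaTokenizer(os.path.join(model_directory, "tokenizer.model"))
cache = ExLlamaCache(model)

generator = ExLlamaAltGenerator(model, tokenizer, cache)
settings = ExLlamaAltGenerator.Settings()

# Stream a completion until EOS, a stop string, or 200 new tokens
generator.begin_stream("Once upon a time,",
                       stop_conditions = [tokenizer.eos_token_id, "\n\n"],
                       max_new_tokens = 200,
                       gen_settings = settings)
while True:
    chunk, eos = generator.stream()
    print(chunk, end = "", flush = True)
    if eos: break

To produce shards in the first place, util/shard.py takes an input .safetensors file and a shard size in megabytes (for example `python util/shard.py model.safetensors 4000`) and also writes a `<input file>.index.json` alongside the shards, which ExLlama itself does not need.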
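
The new return_mask / max_seq_len arguments to ExLlamaTokenizer.encode() and the input_mask argument threaded through gen_begin(), gen_feed_tokens() and model.forward() let the regular generator batch prompts of unequal length: shorter prompts are left-padded and the padding is masked out. A sketch under the assumption that the class in exllama/generator.py is still named ExLlamaGenerator (the name sits outside this hunk), reusing model, tokenizer and cache from the sketch above:

from exllama.generator import ExLlamaGenerator   # class name and import path assumed, not shown in this diff

generator = ExLlamaGenerator(model, tokenizer, cache)
prompts = ["A short prompt",
           "A different, somewhat longer prompt that forces padding of the first one"]   # hypothetical inputs

# generate_simple() now calls encode(..., return_mask = True) and forwards the resulting mask
# to model.forward(), so a ragged batch works without extra steps on the caller's side
outputs = generator.generate_simple(prompts, max_new_tokens = 64)
print(outputs)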
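
Finally, the Flash Attention 2 path added to model.py is opt-in: it is used only when there is no past (prefill) or when q_len == 1, and otherwise falls back to the regular matmul / SDP attention. The lines below sketch what the new -flash option in model_init.py effectively does; they would be set on the config before the model is instantiated, and they assume the flash-attn package is installed so the import at the top of model.py succeeds.

# Equivalent of passing "-flash 4096" on the command line (attribute names taken from this diff);
# set on the config before constructing ExLlama(config)
config.use_flash_attn_2 = True
config.max_input_len = 4096        # optional input length, parsed from the -flash argument in make_config()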