
Commit

v3.5 hotfix
vince62s committed Feb 26, 2024
1 parent b9a60d6 commit 95b345c
Showing 5 changed files with 78 additions and 17 deletions.
54 changes: 54 additions & 0 deletions eval_llm/WIKITEXT2/readme.md
@@ -0,0 +1,54 @@
These are perplexities computed on wikitext2.

These numbers are not comparable to lm-evaluation-harness, which instead reports word / byte / bit perplexity, like this (a sketch after the two tables shows how those metrics relate):

hf-auto (pretrained=mistralai/Mistral-7B-Instruct-v0.2), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 8
| Tasks |Version|Filter|n-shot| Metric |Value | |Stderr|
|--------|------:|------|------|---------------|-----:|---|------|
|wikitext| 2|none |None |word_perplexity|9.8183|± |N/A |
| | |none |None |byte_perplexity|1.5329|± |N/A |
| | |none |None |bits_per_byte |0.6163|± |N/A |


hf-auto (pretrained=meta-llama/Llama-2-7b-hf), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 1
| Tasks |Version|Filter|n-shot| Metric |Value | |Stderr|
|--------|------:|------|------|---------------|-----:|---|------|
|wikitext| 2|none |None |word_perplexity|8.7921|± |N/A |
| | |none |None |byte_perplexity|1.5016|± |N/A |
| | |none |None |bits_per_byte |0.5865|± |N/A |
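
For reference, the three harness metrics are different normalizations of the same total negative log-likelihood, which is why none of them lines up with a tokenizer-level perplexity. A minimal sketch of how they relate (assuming a single summed NLL in nats over the corpus; the helper below is illustrative, not harness code):

```python
import math

def harness_metrics(total_nll_nats: float, num_words: int, num_bytes: int):
    # word/byte perplexity normalize the same NLL by different unit counts;
    # bits_per_byte is just byte perplexity expressed in log2 units.
    word_ppl = math.exp(total_nll_nats / num_words)
    byte_ppl = math.exp(total_nll_nats / num_bytes)
    bits_per_byte = total_nll_nats / (num_bytes * math.log(2))  # == log2(byte_ppl)
    return word_ppl, byte_ppl, bits_per_byte

# Sanity check against the Mistral row above: log2(1.5329) ~= 0.6163
assert abs(math.log2(1.5329) - 0.6163) < 1e-3
```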


Numbers are not comparable to the perplexity reported by llama.cpp either, because we use a smaller context window and we also detokenize the raw corpus (which they should do, but don't).
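
For illustration, the detokenization in question undoes wikitext's space-separated formatting before scoring. A rough sketch of the kind of cleanup meant here (approximate rules, not necessarily the exact ones used in this repo):

```python
import re

def detokenize_wikitext(text: str) -> str:
    # Collapse wikitext's escaped hyphens/commas/periods and the extra spaces
    # around punctuation (approximate rules, for illustration only).
    text = text.replace(" @-@ ", "-").replace(" @,@ ", ",").replace(" @.@ ", ".")
    text = re.sub(r" ([.,;:!?%)\]}'])", r"\1", text)  # drop space before closing punctuation
    text = re.sub(r"([(\[{$]) ", r"\1", text)         # drop space after opening punctuation
    text = text.replace(" n't", "n't")
    return text

print(detokenize_wikitext("the 2 @-@ year @-@ old boy 's dog ( a terrier ) ran ."))
# the 2-year-old boy's dog (a terrier) ran.
```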

| 7B Family | | PPL | Time (sec) |
| ---------------- | --------------------- | ----- | ---------- |
| Base | llama2 | 5.78 | 152 |
| | mistral v0.1 | 5.70 | 162 |
| | awq | 5.81 | 165 |
| | Yi-6B-200K | 7.76 | 133 |
| | xgen-7B | 8.64 | 129 |
| | mpt-7B | 8.43 | 147 |
| | | | |
| Instruct / Tuned | llama2-chat | 7.37 | 148 |
| | mistral-instr-v0.2 | 6.98 | 160 |
| | gemm-awq | 7.07 | 164 |
| | gemv-awq | 7.07 | 237 |
| | | | |
| | Alma-7B-R | 6.82 | 156 |
| | TowerInstruct-7B | 6.45 | 157 |
| | codellama-7B | 8.56 | 154 |
| | | | |
| 3B Family | Phi-2 | 9.74 | 52 |
| | Phi-2-psy | 10.44 | 53 |
| | | | |
| 13B Family | llama2 (4-bit) | 5.31 | 296 |
| | llama2-chat (4-bit) | 6.59 | 292 |
| | | | |
| 34B Family | codellama-34B (4-bit) | 6.00 | 706 |


We note that llama2 and Mistral are in fact very close for their base models. However, there is a clear gap between their chat models.

All the others lag quite a bit behind, which is surprising for Yi given its results on the Open LLM Leaderboard.

I need to check why Mistral seems a little slower than llama2; it should be the opposite.
8 changes: 6 additions & 2 deletions onmt/models/model.py
@@ -157,7 +157,9 @@ def load_state_dict(
)
param.data = checkpoint["generator"][keyname]
del checkpoint["generator"][keyname]
elif strict and "lora" not in param_name:
elif strict and (
"lora" not in param_name and "slopes" not in param_name
):
raise ValueError(
"Missing key in checkpoint: %s" % name + "." + param_name
)
@@ -234,7 +236,9 @@ def load_safe_state_dict(
name, module, param_name, param, buf_list, ckpt_t, offset
)
keyfound[name + "." + param_name] = True
elif strict and "lora" not in param_name:
elif strict and (
"lora" not in param_name and "slopes" not in param_name
):
raise ValueError(
"Missing key in safetensors checkpoint: %s" % name
+ "."
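
In plain terms, the hotfix widens the set of parameters a strict load is allowed to miss: besides LoRA adapter weights, parameters whose name contains "slopes" may now be absent from a checkpoint without raising. A condensed sketch of the check (the helper is hypothetical, not the repo's API):

```python
def may_be_missing(param_name: str) -> bool:
    # LoRA adapter weights and "slopes" buffers are allowed to be absent from
    # a checkpoint, so a strict load only raises for other missing parameters.
    return "lora" in param_name or "slopes" in param_name

# The condition from the diff then reads roughly:
#   elif strict and not may_be_missing(param_name):
#       raise ValueError("Missing key in checkpoint: %s.%s" % (name, param_name))
```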
2 changes: 1 addition & 1 deletion onmt/modules/multi_headed_attn.py
@@ -599,7 +599,7 @@ def forward(
base=self.rotary_theta,
device=query.device,
)
- rope = self.rope[start_pos : start_pos + seqlen]
+ rope = self.rope[start_pos : start_pos + seqlen].to(query.device)
query, key = apply_rotary_emb(
query, key, rope, interleave=self.rotary_interleave
)
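
The one-line change fixes a device mismatch: the cached rotary table can live on a different device than the query (for instance, built on CPU while the query is on CUDA), so the slice is moved explicitly. A self-contained sketch of the pattern (the cache builder and tensor shapes below are assumptions, not the module's code):

```python
import torch

def build_rope_cache(dim: int, max_len: int, theta: float = 10000.0) -> torch.Tensor:
    # Precompute complex rotary factors exp(i * m * theta_k) for positions m.
    inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
    angles = torch.outer(torch.arange(max_len).float(), inv_freq)  # (max_len, dim/2)
    return torch.polar(torch.ones_like(angles), angles)            # complex64

device = "cuda" if torch.cuda.is_available() else "cpu"
rope_cache = build_rope_cache(dim=128, max_len=4096)               # lives on CPU
query = torch.randn(2, 16, 8, 128, device=device)                  # (batch, len, heads, dim)
start_pos, seqlen = 0, query.size(1)
# Without .to(query.device), mixing the CPU cache with a CUDA query would fail
# in the subsequent rotary multiplication.
rope = rope_cache[start_pos : start_pos + seqlen].to(query.device)
```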
28 changes: 14 additions & 14 deletions onmt/train_single.py
@@ -27,20 +27,6 @@

def prepare_transforms_vocabs(opt, transforms_cls):
"""Prepare or dump transforms before training."""
- # if transform + options set in 'valid' we need to copy in main
- # transform / options for scoring considered as inference
- validset_transforms = opt.data.get("valid", {}).get("transforms", None)
- if validset_transforms:
-     opt.transforms = validset_transforms
- if opt.data.get("valid", {}).get("tgt_prefix", None):
-     opt.tgt_prefix = opt.data.get("valid", {}).get("tgt_prefix", None)
-     opt.tgt_file_prefix = True
- if opt.data.get("valid", {}).get("src_prefix", None):
-     opt.src_prefix = opt.data.get("valid", {}).get("src_prefix", None)
- if opt.data.get("valid", {}).get("tgt_suffix", None):
-     opt.tgt_suffix = opt.data.get("valid", {}).get("tgt_suffix", None)
- if opt.data.get("valid", {}).get("src_suffix", None):
-     opt.src_suffix = opt.data.get("valid", {}).get("src_suffix", None)
specials = get_specials(opt, transforms_cls)

vocabs = build_vocab(opt, specials)
@@ -77,6 +63,20 @@ def _init_train(opt):
"""
ArgumentParser.validate_prepare_opts(opt)
transforms_cls = get_transforms_cls(opt._all_transform)
+ # if transform + options set in 'valid' we need to copy in main
+ # transform / options for scoring considered as inference
+ validset_transforms = opt.data.get("valid", {}).get("transforms", None)
+ if validset_transforms:
+     opt.transforms = validset_transforms
+ if opt.data.get("valid", {}).get("tgt_prefix", None):
+     opt.tgt_prefix = opt.data.get("valid", {}).get("tgt_prefix", None)
+     opt.tgt_file_prefix = True
+ if opt.data.get("valid", {}).get("src_prefix", None):
+     opt.src_prefix = opt.data.get("valid", {}).get("src_prefix", None)
+ if opt.data.get("valid", {}).get("tgt_suffix", None):
+     opt.tgt_suffix = opt.data.get("valid", {}).get("tgt_suffix", None)
+ if opt.data.get("valid", {}).get("src_suffix", None):
+     opt.src_suffix = opt.data.get("valid", {}).get("src_suffix", None)
if opt.train_from:
# Load checkpoint if we resume from a previous training.
checkpoint = load_checkpoint(ckpt_path=opt.train_from)
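
The block itself is unchanged; it is moved from prepare_transforms_vocabs into _init_train, presumably so the 'valid'-specific options are copied into the main options before the rest of the training setup runs. A condensed, equivalent sketch (attribute names come from the diff; the loop is just a compact rewrite, not the repo's code):

```python
def copy_valid_options(opt):
    # Options declared under the 'valid' dataset override the top-level ones
    # used for scoring, which is treated as inference.
    valid = opt.data.get("valid", {})
    if valid.get("transforms"):
        opt.transforms = valid["transforms"]
    for key in ("tgt_prefix", "src_prefix", "tgt_suffix", "src_suffix"):
        if valid.get(key):
            setattr(opt, key, valid[key])
            if key == "tgt_prefix":
                opt.tgt_file_prefix = True
```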
3 changes: 3 additions & 0 deletions tools/convert_llama.py
@@ -428,6 +428,9 @@ def __init__(self, model_path: str):
global_attention_function="softmax",
self_attn_type="scaled-dot",
max_relative_positions=-1,
+ rotary_interleave=True,
+ rotary_theta=10000,
+ rotary_dim=0,
heads=heads,
sliding_window=sliding_window,
transformer_ff=transformer_ff,
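
These options make the rotary-embedding configuration explicit in converted checkpoints: rotary_theta is the RoPE frequency base, rotary_dim=0 presumably means "use the full head dimension", and rotary_interleave=True selects the interleaved pairing convention rather than the half-slicing one. For illustration, the two common pairing conventions differ as sketched below (generic RoPE math, not this converter's code):

```python
import torch

def rotate_interleaved(x, cos, sin):
    # Interleaved convention: consecutive pairs (x0, x1), (x2, x3), ... rotate together.
    x1, x2 = x[..., 0::2], x[..., 1::2]
    return torch.stack((x1 * cos - x2 * sin, x1 * sin + x2 * cos), dim=-1).flatten(-2)

def rotate_half(x, cos, sin):
    # "Rotate-half" convention: pairs are (x_i, x_{i + dim/2}).
    half = x.shape[-1] // 2
    x1, x2 = x[..., :half], x[..., half:]
    return torch.cat((x1 * cos - x2 * sin, x1 * sin + x2 * cos), dim=-1)

dim, theta = 8, 10000.0
inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))  # what rotary_theta controls
cos, sin = inv_freq.cos(), inv_freq.sin()  # angles for position m=1, shape (dim/2,)
x = torch.randn(2, dim)
print(rotate_interleaved(x, cos, sin).shape, rotate_half(x, cos, sin).shape)
```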
