This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Enable tiny_llama #270

Merged: 6 commits, May 30, 2024
3 changes: 2 additions & 1 deletion docs/supported_models.md
```diff
@@ -38,7 +38,8 @@ Neural Speed supports the following models:
         <td>8192</td>
     </tr>
     <tr>
-        <td><a href="https://huggingface.co/meta-llama/Llama-2-7b-chat-hf" target="_blank" rel="noopener noreferrer">LLaMA2-7B</a>,
+        <td><a href="https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0" target="_blank" rel="noopener noreferrer">TinyLlama-1.1B</a>,
+        <a href="https://huggingface.co/meta-llama/Llama-2-7b-chat-hf" target="_blank" rel="noopener noreferrer">LLaMA2-7B</a>,
         <a href="https://huggingface.co/meta-llama/Llama-2-13b-chat-hf" target="_blank" rel="noopener noreferrer">LLaMA2-13B</a>,
         <a href="https://huggingface.co/meta-llama/Llama-2-70b-chat-hf" target="_blank" rel="noopener noreferrer">LLaMA2-70B</a></td>
         <td>✅</td>
```
1 change: 1 addition & 0 deletions neural_speed/__init__.py
```diff
@@ -223,6 +223,7 @@ def init(self,
 
     def init_from_bin(self, model_type, model_path, **generate_kwargs):
         if self.module is None:
+            model_type = model_maps.get(model_type, model_type)
             self.module = _import_package(model_type)
         self.model = self.module.Model()
         if model_type=="whisper":
```
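
The added line normalizes model aliases through `model_maps` before the native package is imported, which is presumably what lets a name like `tiny_llama` reuse the existing llama module. A minimal sketch of the lookup, with an illustrative mapping rather than the repo's actual table:

```python
# Illustrative alias table; the entries in neural_speed's real model_maps may differ.
model_maps = {"tiny_llama": "llama"}

def resolve_model_type(model_type: str) -> str:
    # dict.get falls back to the name itself when no alias is registered.
    return model_maps.get(model_type, model_type)

assert resolve_model_type("tiny_llama") == "llama"  # aliased to the llama package
assert resolve_model_type("whisper") == "whisper"   # unknown names pass through
```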
10 changes: 8 additions & 2 deletions neural_speed/convert/convert_baichuan.py
```diff
@@ -144,7 +144,10 @@ def baichuan13B_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
     fout.write(struct.pack("i", hparams["num_hidden_layers"]))
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", ftype))
-    fout.write(struct.pack("i", hparams["model_max_length"]))
+    if "max_position_embeddings" in hparams:
+        fout.write(struct.pack("i", hparams["max_position_embeddings"]))
+    else:
+        fout.write(struct.pack("i", hparams["model_max_length"]))
     fout.write(struct.pack("f", 0))
     fout.write(struct.pack("f", 0))
     fout.write(struct.pack("i", 0))
```
```diff
@@ -248,7 +251,10 @@ def baichuan7B_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
     fout.write(struct.pack("i", hparams["num_hidden_layers"]))
     fout.write(struct.pack("i", 128))
     fout.write(struct.pack("i", ftype))
-    fout.write(struct.pack("i", hparams["model_max_length"]))
+    if "max_position_embeddings" in hparams:
+        fout.write(struct.pack("i", hparams["max_position_embeddings"]))
+    else:
+        fout.write(struct.pack("i", hparams["model_max_length"]))
     fout.write(struct.pack("f", 0))
     fout.write(struct.pack("f", 0))
     fout.write(struct.pack("i", 0))
```
2 changes: 2 additions & 0 deletions neural_speed/convert/convert_llama.py
```diff
@@ -1357,6 +1357,8 @@ def load_some_model(path: Path) -> ModelPlus:
     if path.is_dir():
         # Check if it's a set of safetensors files first
         files = list(path.glob("model-00001-of-*.safetensors"))
+        if not files:
+            files = list(path.glob("model*.safetensors"))  # for only one safetensor
         if not files:
             # Try the PyTorch patterns too, with lower priority
             globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
```
7 changes: 7 additions & 0 deletions neural_speed/models/llama/llama.h
```diff
@@ -20,6 +20,7 @@
 
 enum llama_model {
   LLAMA_UNKNOWN,
+  TINY_LLAMA,
   LLAMA_7B,
   LLAMA_13B,
   LLAMA_30B,
```
```diff
@@ -28,6 +29,12 @@ enum llama_model {
 
 static const model_scratch llama_mem_req(int n_layers, float scratch_size_ratio = 1.0f) {
   switch (n_layers) {
+    case 22:
+      return {
+          static_cast<unsigned long long>(scratch_size_ratio * 4096) * MB,
+          static_cast<unsigned long long>(scratch_size_ratio * 2048) * MB,
+          static_cast<unsigned long long>(scratch_size_ratio * 4096) * MB,
+      };
     case 32:
       return {
           static_cast<unsigned long long>(scratch_size_ratio * 4096) * MB,
```
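
TinyLlama-1.1B has 22 decoder layers, so the new `case 22` gives it its own entry in the layer-count dispatch for scratch-buffer sizing. A Python rendering of the same lookup for illustration only (just the 22-layer row is taken from the diff; the other rows are elided here):

```python
MB = 1024 * 1024

def llama_mem_req(n_layers: int, scratch_size_ratio: float = 1.0):
    # Scratch sizes in MB per layer count; only the 22-layer entry mirrors the diff.
    table = {22: (4096, 2048, 4096)}
    s0, s1, s2 = table[n_layers]
    return tuple(int(scratch_size_ratio * s) * MB for s in (s0, s1, s2))

print(llama_mem_req(22))  # (4294967296, 2147483648, 4294967296)
```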
4 changes: 4 additions & 0 deletions tests/model-test/calculate_percentiles.py
```diff
@@ -37,6 +37,10 @@ def parse_output_file_acc(file_path):
     with open(file_path, 'r', encoding='UTF-8', errors='ignore') as file:
         for line in file:
             accuracy_match = re.search(r"\|\s+\|\s+\|none\s+\|\s+0\|acc\s+\|\d\.\d+\|\±\s+\|\d\.\d+\|", line)
+            if accuracy_match:
+                accuracy[0]=float(re.search(r"\d+\.\d+", accuracy_match.group()).group())*100
+                continue
+            accuracy_match = re.search(r"\|\s+\|\s+\|none\s+\|\s+0\|acc\s+\|\s+\d\.\d+\|\±\s+\|\d\.\d+\|", line)
             if accuracy_match:
                 accuracy[0]=float(re.search(r"\d+\.\d+", accuracy_match.group()).group())*100
                 continue
```
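
The second pattern differs from the first only by an extra `\s+` before the accuracy value, matching lm-eval table rows where the number is padded with leading spaces. A quick check of both patterns against a made-up table row (the real log format may differ slightly):

```python
import re

# Made-up lm-eval style row; the accuracy value here has a leading space.
line = "|      |       |none  |     0|acc     | 0.5921|±  |0.0069|"

unpadded = r"\|\s+\|\s+\|none\s+\|\s+0\|acc\s+\|\d\.\d+\|\±\s+\|\d\.\d+\|"
padded = r"\|\s+\|\s+\|none\s+\|\s+0\|acc\s+\|\s+\d\.\d+\|\±\s+\|\d\.\d+\|"

assert re.search(unpadded, line) is None  # fails on the padded value
match = re.search(padded, line)
assert match is not None
acc = float(re.search(r"\d+\.\d+", match.group()).group()) * 100
print(round(acc, 2))  # 59.21
```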
7 changes: 5 additions & 2 deletions tests/model-test/cpp_graph_inference.sh
```diff
@@ -146,7 +146,7 @@ model_name_map["starcoder-3b"]="bigcode/starcoder"
 model_name_map["bloom-7b"]="bigscience/bloom-7b1"
 model_name_map["opt-1.3b"]="facebook/opt-1.3b"
 model_name_map["dolly-v2-3b"]="databricks/dolly-v2-3b"
-model_name_map["chatglm3"]="THUDM/chatglm3-6b"
+model_name_map["chatglm3-6b"]="THUDM/chatglm3-6b"
 model_name_map["chatglm2"]="THUDM/chatglm2-6b"
 model_name_map["chatglm-6b"]="THUDM/chatglm-6b"
 model_name_map["baichuan2-13b"]="baichuan-inc/Baichuan2-13B-Chat"
```
```diff
@@ -363,6 +363,7 @@ function main() {
     ninja
     cd ..
     pip install -r $working_dir/requirements.txt
+    pip install lm_eval
     python $working_dir/setup.py install
     ## prepare example requirement
     if [[ -f $requirements_file ]]; then
```
```diff
@@ -468,8 +469,10 @@ function main() {
         chmod 777 ${WORKSPACE}/${logs_file}
         if [[ ${input} == "1024" && ${cores_per_instance} == "32" ]]; then
             echo "-------- Accuracy start--------"
-            if [[ "${model}" == "llama"* || "${model}" == "gptj-6b" ]]; then
+            if [[ "${model}" == "llama"* || "${model}" == "gptj-6b" || "${model}" == "mistral-7b" ]]; then
                 OMP_NUM_THREADS=56 numactl -l -C 0-55 python ./scripts/cal_acc.py --model_name ${model_path} --init_from_bin ${model}-${precision}.bin --batch_size 8 --tasks lambada_openai 2>&1 | tee -a ${WORKSPACE}/${logs_file}
+            elif [[ "${model}" == *"gptq" ]]; then
+                OMP_NUM_THREADS=56 numactl -l -C 0-55 python ./scripts/cal_acc.py --model_name ${model_path} --use_gptq --tasks lambada_openai 2>&1 | tee -a ${WORKSPACE}/${logs_file}
             else
                 OMP_NUM_THREADS=56 numactl -l -C 0-55 python ./scripts/cal_acc.py --model_name ${model_path} --init_from_bin ${model}-${precision}.bin --tasks lambada_openai --batch_size 1 2>&1 | tee -a ${WORKSPACE}/${logs_file}
             fi
```
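
The accuracy stage now also covers `mistral-7b` (batch size 8, alongside the llama and gptj models) and adds a dedicated `--use_gptq` path for GPTQ checkpoints, which load the original quantized weights instead of a converted `.bin`. A Python sketch of the same dispatch, written as a hypothetical helper rather than the script's actual code:

```python
def accuracy_cmd(model: str, model_path: str, precision: str) -> str:
    # Mirrors the branch above: llama*/gptj-6b/mistral-7b use batch size 8,
    # *gptq models load original quantized weights, everything else batch size 1.
    base = f"python ./scripts/cal_acc.py --model_name {model_path} --tasks lambada_openai"
    if model.startswith("llama") or model in ("gptj-6b", "mistral-7b"):
        return f"{base} --init_from_bin {model}-{precision}.bin --batch_size 8"
    if model.endswith("gptq"):
        return f"{base} --use_gptq"
    return f"{base} --init_from_bin {model}-{precision}.bin --batch_size 1"

print(accuracy_cmd("llama-2-7b-chat", "meta-llama/Llama-2-7b-chat-hf", "q4_j"))
```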