diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py
index f5a5876a5..7c96cb73f 100644
--- a/src/python/py/models/builder.py
+++ b/src/python/py/models/builder.py
@@ -2344,7 +2344,7 @@ def make_mlp_proj(self, layer_id, mlp, root_input):
 
         # Make input MatMul and Add nodes
         up_matmul_name = f"/model/layers.{layer_id}/mlp/up_proj/MatMul"
-        self.make_matmul(mlp.up_proj.weight.detach().numpy(), up_matmul_name, root_input)
+        self.make_matmul(mlp.up_proj, up_matmul_name, root_input)
         up_add_name = f"/model/layers.{layer_id}/mlp/up_proj/Add"
         self.make_add_bias(mlp.up_proj.bias.detach().numpy(), up_add_name, f"{up_matmul_name}/output_0")
 
@@ -2390,7 +2390,7 @@ def make_mlp_proj(self, layer_id, mlp, root_input):
 
         # Make output MatMul and Add nodes
         down_matmul_name = f"/model/layers.{layer_id}/mlp/down_proj/MatMul"
-        self.make_matmul(mlp.down_proj.weight.detach().numpy(), down_matmul_name, f"{mul_name}/output_0")
+        self.make_matmul(mlp.down_proj, down_matmul_name, f"{mul_name}/output_0")
         down_add_name = f"/model/layers.{layer_id}/mlp/down_proj/Add"
         self.make_add_bias(mlp.down_proj.bias.detach().numpy(), down_add_name, f"{down_matmul_name}/output_0")
 
@@ -2454,12 +2454,8 @@ def create_model(model_name, input_path, output_dir, precision, execution_provid
     elif config.architectures[0] == "Phi3ForCausalLM" and config.max_position_embeddings == 131072:
         onnx_model = Phi3Mini128KModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options)
     elif config.architectures[0] == "Phi3SmallForCausalLM" and config.max_position_embeddings == 8192:
-        print("WARNING: This model only works for CUDA currently because `SparseAttention` is only supported for CUDA in ONNX Runtime. Setting `--execution_provider cuda` by default.")
-        execution_provider = "cuda"
         onnx_model = Phi3Small8KModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options)
     elif config.architectures[0] == "Phi3SmallForCausalLM" and config.max_position_embeddings == 131072:
-        print("WARNING: This model only works for CUDA currently because `SparseAttention` is only supported for CUDA in ONNX Runtime. Setting `--execution_provider cuda` by default.")
-        execution_provider = "cuda"
         onnx_model = Phi3Small128KModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options)
     elif config.architectures[0] == "Phi3VForCausalLM":
         print("WARNING: This is only generating the text component of the model. Setting `--extra_options exclude_embeds=true` by default.")
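
Note (not part of the patch): a minimal sketch of the call convention the first two hunks assume. Passing the torch module (mlp.up_proj / mlp.down_proj) instead of a pre-extracted numpy array lets make_matmul decide how to lower the projection, e.g. keeping packed quantized weights together with their metadata rather than flattening everything to a float array at the call site. The qweight attribute check and the helpers make_matmul_quantized / make_matmul_float below are hypothetical, for illustration only; they are not the builder's actual implementation.

# Illustrative sketch only -- not the real make_matmul in builder.py.
# Assumption: quantized modules carry packed weights (e.g. a `qweight`
# attribute), while float modules expose a plain `.weight` tensor.
def make_matmul(self, matmul, name, root_input):
    if hasattr(matmul, "qweight"):
        # Quantized path: hand the whole module over so packed weights,
        # scales, and zero points stay together (hypothetical helper).
        self.make_matmul_quantized(matmul, name, root_input)
    else:
        # Float path: identical to what the old call sites did inline
        # via mlp.up_proj.weight.detach().numpy().
        weight = matmul.weight.detach().numpy()
        self.make_matmul_float(weight, name, root_input)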