From 95297c5c85439fc54d8c77786940c4f20772bdb6 Mon Sep 17 00:00:00 2001
From: "Chendi.Xue"
Date: Wed, 11 Dec 2024 03:01:57 +0000
Subject: [PATCH 1/4] model_runner_cls now uses WorkerWrapperBase as the init class after upstream PR #10555

Signed-off-by: Chendi.Xue
---
 vllm/worker/hpu_worker.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py
index cc88070fff56e..b73b8d5190b30 100644
--- a/vllm/worker/hpu_worker.py
+++ b/vllm/worker/hpu_worker.py
@@ -78,9 +78,7 @@ def __init__(

         is_encoder_decoder_model = self._is_encoder_decoder_model()
         ModelRunnerClass: Type[HPUModelRunnerBase] = HPUModelRunner
-        if model_runner_cls is not None:
-            ModelRunnerClass = model_runner_cls
-        elif is_encoder_decoder_model:
+        if is_encoder_decoder_model:
             ModelRunnerClass = HPUEncoderDecoderModelRunner
         self.model_runner: HPUModelRunnerBase = ModelRunnerClass(
             vllm_config=vllm_config,

From 54d29cbfde3c932a957ee74c40ccf87446933562 Mon Sep 17 00:00:00 2001
From: "Chendi.Xue"
Date: Sat, 14 Dec 2024 04:22:04 +0000
Subject: [PATCH 2/4] Fix shape error detected by benchmark

Signed-off-by: Chendi.Xue
---
 vllm/sequence.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/sequence.py b/vllm/sequence.py
index 669124319c4f4..53c8a4b73b4e3 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -1182,7 +1182,8 @@ def update(self,
                second_last_token_hidden_states: Optional[torch.Tensor] = None):
         """Update hidden states from target model invocation. Only used
         for decode steps"""
-        assert len(seq_group_metadata_list) == len(hidden_states)
+        if len(seq_group_metadata_list) < len(hidden_states):
+            hidden_states = hidden_states[:len(seq_group_metadata_list)]
         self._seq_ids.extend(get_all_seq_ids(seq_group_metadata_list))
         self.hidden_states = torch.cat([self.hidden_states, hidden_states])

From 902f3a3033988feea6ce20f7f92e8b0dfc1e5ad8 Mon Sep 17 00:00:00 2001
From: "Chendi.Xue"
Date: Wed, 11 Dec 2024 05:15:52 +0000
Subject: [PATCH 3/4] Fix incorrect cos-sin pre-compute issue

For spec decode eagle mode, VLLM_COS_SIN_RECOMPUTE=true needs to be set.

Signed-off-by: Chendi.Xue
---
 .jenkins/test_config.yaml                      | 2 +-
 vllm/model_executor/layers/rotary_embedding.py | 7 +++++--
 vllm/worker/hpu_model_runner.py                | 5 ++++-
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/.jenkins/test_config.yaml b/.jenkins/test_config.yaml
index 3d8b2416506c7..0b9a2231d59a8 100644
--- a/.jenkins/test_config.yaml
+++ b/.jenkins/test_config.yaml
@@ -57,4 +57,4 @@ stages:
     command: TORCH_COMPILE_DISABLE=true VLLM_CONTIGUOUS_PA=false VLLM_SKIP_WARMUP=True pytest -v tests/spec_decode/e2e/test_medusa_correctness.py::test_medusa_e2e_greedy_correctness
   - name: gsm8k_small_g2_tp1_eagle_spec_decode
     flavor: g2
-    command: TORCH_COMPILE_DISABLE=true VLLM_CONTIGUOUS_PA=false VLLM_SKIP_WARMUP=True pytest -v tests/spec_decode/e2e/test_eagle_correctness.py::test_eagle_e2e_greedy_correctness
+    command: VLLM_COS_SIN_RECOMPUTE=true TORCH_COMPILE_DISABLE=true VLLM_CONTIGUOUS_PA=false VLLM_SKIP_WARMUP=True pytest -v tests/spec_decode/e2e/test_eagle_correctness.py::test_eagle_e2e_greedy_correctness
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index a601189788441..4889c0f0c4cce 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -102,7 +102,9 @@ def __init__(

     def prepare_cos_sin(self,
                         positions: torch.Tensor,
-                        offsets: Optional[torch.Tensor] = None):
+                        offsets: Optional[torch.Tensor] = None,
+                        recompute_cos_sin: bool = False):
+        self.recompute_cos_sin = recompute_cos_sin
         if offsets is not None:
             offsets = offsets.view(positions.shape[0], -1)
             positions = positions + offsets
@@ -232,11 +234,12 @@ def forward_hpu(
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         from habana_frameworks.torch.hpex.kernels import (
             RotaryPosEmbeddingMode, apply_rotary_pos_emb)
-
         # Prepare cos-sin caches for long-context + LoRA with offsets for every
         # forward, since the offset information wasn't available previously
         if hasattr(self, "scaling_factors") or self.sin is None:
             self.prepare_cos_sin(positions, offsets)
+        if self.recompute_cos_sin:
+            self.prepare_cos_sin(positions, offsets, recompute_cos_sin=True)
         num_tokens = positions.shape[0] * positions.shape[1]
         # HPU RoPE kernel requires hidden dimension for cos and sin to be equal
         # to query hidden dimension, so the original tensors need to be
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index d3090d313d155..b80463195ced0 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -209,6 +209,8 @@ def __init__(self, model, block_size, dtype, enforce_eager, layer_names):
         self.prefill_use_fusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA',
                                                '1').lower() in ['1', 'true'] \
                                                 and not is_fake_hpu()
+        self.recompute_cos_sin = os.getenv('VLLM_COS_SIN_RECOMPUTE',
+                                           'false').lower() in ['1', 'true']
         self.block_size = block_size
         self.dtype = dtype
         self.layer_names = layer_names
@@ -370,7 +372,8 @@ def _prepare_cos_sin(self, positions):

         # At the end, we should be at the RotaryEmbedding layer.
         if hasattr(current_module, 'prepare_cos_sin'):
-            current_module.prepare_cos_sin(positions)
+            current_module.prepare_cos_sin(
+                positions, recompute_cos_sin=self.recompute_cos_sin)
         else:
             raise AttributeError(
                 "The module at the end of the path does not have \

From fb7d558b9f64e565df7ba14893697f83e92f1507 Mon Sep 17 00:00:00 2001
From: "Chendi.Xue"
Date: Mon, 6 Jan 2025 17:50:05 +0000
Subject: [PATCH 4/4] Fix format error

Signed-off-by: Chendi.Xue
---
 vllm/model_executor/layers/rotary_embedding.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index 4889c0f0c4cce..90a5f80cf5755 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -234,6 +234,7 @@ def forward_hpu(
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         from habana_frameworks.torch.hpex.kernels import (
             RotaryPosEmbeddingMode, apply_rotary_pos_emb)
+
         # Prepare cos-sin caches for long-context + LoRA with offsets for every
         # forward, since the offset information wasn't available previously
         if hasattr(self, "scaling_factors") or self.sin is None:
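As background for patch 3 above: the VLLM_COS_SIN_RECOMPUTE flag makes the HPU model runner ask RotaryEmbedding.prepare_cos_sin to rebuild its cos/sin cache on every forward pass instead of reusing a cache prepared once, which the eagle spec-decode path needs because draft-token positions change each step. Below is a minimal standalone sketch of that idea, not vLLM's actual implementation; the class name RotaryEmbeddingSketch and its internals are illustrative assumptions only.

```python
# Minimal sketch of a recompute_cos_sin flag; not vLLM's RotaryEmbedding.
from typing import Optional

import torch


class RotaryEmbeddingSketch:

    def __init__(self, rotary_dim: int = 64, base: float = 10000.0):
        self.rotary_dim = rotary_dim
        self.base = base
        self.cos: Optional[torch.Tensor] = None
        self.sin: Optional[torch.Tensor] = None
        self.recompute_cos_sin = False

    def prepare_cos_sin(self,
                        positions: torch.Tensor,
                        recompute_cos_sin: bool = False) -> None:
        # Remember whether the cache should be rebuilt on every step.
        self.recompute_cos_sin = recompute_cos_sin
        inv_freq = 1.0 / (self.base**(
            torch.arange(0, self.rotary_dim, 2, dtype=torch.float32) /
            self.rotary_dim))
        freqs = torch.outer(positions.flatten().float(), inv_freq)
        self.cos, self.sin = freqs.cos(), freqs.sin()

    def forward(self, positions: torch.Tensor) -> None:
        # Without the flag, cos/sin from the first call are reused even when
        # positions change; with the flag, the cache follows the positions of
        # the current step.
        if self.sin is None or self.recompute_cos_sin:
            self.prepare_cos_sin(positions,
                                 recompute_cos_sin=self.recompute_cos_sin)


rope = RotaryEmbeddingSketch()
rope.prepare_cos_sin(torch.arange(4), recompute_cos_sin=True)
rope.forward(torch.arange(4, 8))  # cos/sin rebuilt for the new positions
```

In the patches themselves, the flag travels from the VLLM_COS_SIN_RECOMPUTE environment variable read in hpu_model_runner.py, through _prepare_cos_sin, into prepare_cos_sin and forward_hpu in rotary_embedding.py.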