Merged · Changes from 1 commit
7 changes: 6 additions & 1 deletion vllm_ascend/worker/model_runner_v1.py
@@ -2665,7 +2665,12 @@ def _build_dummy_attn_metadata(

attn_metadata = {}

-        seq_lens = self.model_config.max_model_len
+        # When force_attention == True, the model is being captured, so
+        # seq_lens must be max_model_len to allocate the maximum workspace
+        # for the attention op. When force_attention == False, the model may
+        # be running normal inference; if dp_size > 1, dummy_run only needs
+        # to execute a short attention pass with seq_lens == 1.
+        seq_lens = self.model_config.max_model_len if force_attention else 1
Collaborator:
When force_attention == False, the dummy attn metadata is None meaning skipping attention part, so I don't quite understand why we need this.

Collaborator (Author):
The condition is force_attention or aclgraph_runtime_mode == CUDAGraphMode.FULL. When capturing, force_attention = (aclgraph_runtime_mode == CUDAGraphMode.FULL). However, when not capturing, force_attention in _dummy_run is always False. So during actual inference, _dummy_run still executes dummy attention when full graph mode is used, which is why seq_lens needs a value here.

self.seq_lens_np[:num_reqs] = seq_lens
self.seq_lens_np[num_reqs:] = 0
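The selection logic the diff and comment thread discuss can be sketched in isolation. This is a minimal illustration, not the actual vllm-ascend code; the helper name `choose_dummy_seq_lens` is hypothetical, and only the `force_attention` branch from the diff is modeled:

```python
def choose_dummy_seq_lens(force_attention: bool, max_model_len: int) -> int:
    """Pick the per-request sequence length for a dummy attention run.

    Capturing a full graph needs the worst-case attention workspace, so the
    capture path uses the maximum model length. Outside capture (e.g. a
    dummy_run issued to keep dp ranks in sync when dp_size > 1), a length
    of 1 is enough for a short attention pass.
    """
    return max_model_len if force_attention else 1


# Capture path: worst-case workspace.
assert choose_dummy_seq_lens(True, 4096) == 4096
# Non-capture dummy run: minimal work.
assert choose_dummy_seq_lens(False, 4096) == 1
```

The value returned here is what gets broadcast into `self.seq_lens_np[:num_reqs]` in the diff above.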
