[Model Runner V2] Fix Triton warning on tl.where (#30355)

WoosukKwon · web-flow · commit 9e6562a3f625 · 2025-12-09T09:59:54.000-08:00
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
diff --git a/vllm/v1/worker/gpu/sample/penalties.py b/vllm/v1/worker/gpu/sample/penalties.py
@@ -62,6 +62,7 @@ def _penalties_and_temperature_kernel(
                 mask=packed_block < tl.cdiv(vocab_size, 32),
             )
             prompt_bin_mask = (packed_mask[:, None] >> (tl.arange(0, 32)[None, :])) & 1
+            prompt_bin_mask = prompt_bin_mask.to(tl.int1)
             prompt_bin_mask = prompt_bin_mask.reshape(BLOCK_SIZE)
 
             # If token appears in prompt or output, apply, otherwise use 1.0 for no-op.

Original file line number	Diff line number	Diff line change
`@@ -62,6 +62,7 @@ def _penalties_and_temperature_kernel(`
`62`	`62`	`mask=packed_block < tl.cdiv(vocab_size, 32),`
`63`	`63`	`)`
`64`	`64`	`prompt_bin_mask = (packed_mask[:, None] >> (tl.arange(0, 32)[None, :])) & 1`
	`65`	`+ prompt_bin_mask = prompt_bin_mask.to(tl.int1)`
`65`	`66`	`prompt_bin_mask = prompt_bin_mask.reshape(BLOCK_SIZE)`
`66`	`67`
`67`	`68`	`# If token appears in prompt or output, apply, otherwise use 1.0 for no-op.`