Commit a076ec1

Revert "fix llama4 kv cache layout" (#12437)
1 parent 72b5f3d commit a076ec1

2 files changed: +1 −8 lines changed

docs/advanced_features/attention_backend.md

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ The support matrix is split into two parts: MHA (standard attention) and MLA (mu
 | **Triton** |||||||
 | **Torch Native (SDPA)** |||||||
 | **FlexAttention (PyTorch)** |||||||
-| **TRTLLM MHA** | 16, 32 or 64 | |||||
+| **TRTLLM MHA** | 16, 32 or 64 | |||||
 | **Dual Chunk FlashAttention** |||||||
 | **AITER (ROCm)** |||||||
 | **Wave (ROCm)** |||||||

python/sglang/srt/server_args.py

Lines changed: 0 additions & 7 deletions
@@ -980,13 +980,6 @@ def _handle_model_specific_adjustments(self):
             logger.warning(
                 "Use trtllm_mha as attention backend on sm100 for Llama4 model"
             )
-            if is_sm100_supported() and self.attention_backend == "trtllm_mha":
-                # TODO(brayden): remove this once TRTLLM MHA kernel for FP8 w/ tileSizeKv=128 is available.
-                # This is a Llama 4 specific issue only.
-                self.kv_cache_dtype = "bfloat16"
-                logger.warning(
-                    "Setting kv_cache_dtype to bfloat16 for Llama4 with trtllm_mha backend, due to a missing FlashInfer TRTLLM MHA kernel for FP8 KV Cache"
-                )
             if is_sm100_supported() and self.moe_runner_backend == "auto":
                 if self.quantization in {"fp8", "modelopt_fp8"}:
                     self.moe_runner_backend = "flashinfer_trtllm"
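With this revert, running Llama 4 on SM100 with the trtllm_mha attention backend no longer forces kv_cache_dtype to bfloat16 automatically. The snippet below is not part of the commit; it is a minimal sketch of how a deployment could pin the dtype explicitly instead, assuming the ServerArgs fields referenced in the diff (attention_backend, kv_cache_dtype) and a placeholder Llama 4 model id.

from sglang.srt.server_args import ServerArgs

# Hypothetical example: pin the KV cache dtype by hand if the FP8 TRTLLM MHA
# kernel limitation mentioned in the removed TODO still applies to your setup.
server_args = ServerArgs(
    model_path="meta-llama/Llama-4-Scout-17B-16E-Instruct",  # placeholder model id
    attention_backend="trtllm_mha",
    kv_cache_dtype="bfloat16",
)

In typical use these fields are set through the corresponding launch_server command-line flags; ServerArgs is constructed directly here only to keep the sketch self-contained.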
