Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,7 @@
url = https://gitcode.com/xLLM-AI/spdlog.git
[submodule "third_party/Mooncake"]
path = third_party/Mooncake
url = https://gitcode.com/xLLM-AI/Mooncake.git
url = https://gitcode.com/xLLM-AI/Mooncake.git
[submodule "third_party/torch_npu_ops"]
path = third_party/torch_npu_ops
url = https://gitcode.com/xLLM-AI/torch_npu_ops.git
1 change: 1 addition & 0 deletions third_party/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ add_subdirectory(etcd_cpp_apiv3)
if(USE_NPU)
add_subdirectory(spdlog)
add_subdirectory(hccl_transfer/hccl_transfer)
add_subdirectory(torch_npu_ops)
endif()
add_subdirectory(Mooncake)

Expand Down
1 change: 1 addition & 0 deletions third_party/torch_npu_ops
Submodule torch_npu_ops added at 2bc8f5
2 changes: 1 addition & 1 deletion xllm/core/kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ cc_library(
ops_api.cpp
DEPS
torch
$<$<BOOL:${USE_NPU}>:npu_kernels>
$<$<BOOL:${USE_NPU}>:torch_npu_kernels>
$<$<BOOL:${USE_MLU}>:mlu_kernels>
$<$<BOOL:${USE_CUDA}>:cuda_kernels>
)
13 changes: 1 addition & 12 deletions xllm/core/kernels/npu/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,3 @@
include(cc_library)

add_subdirectory(xllm_ops)

cc_library(
NAME
npu_kernels
HDRS
linear.h
split.h
rope.h
DEPS
# spdlog::spdlog
)
add_subdirectory(xllm_ops)
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,18 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#pragma once
#include "impl/npu_rms_norm_impl.h"
#include <glog/logging.h>
#include <torch_npu/csrc/aten/CustomFunctions.h>

namespace xllm {
namespace kernel {
#include "npu_ops_api.h"
#include "ops_npu/npu_ops.h"

class RmsNorm : public torch::nn::ModuleHolder<NpuRmsNormImpl> {
public:
using torch::nn::ModuleHolder<NpuRmsNormImpl>::ModuleHolder;
using Impl __attribute__((__unused__)) = NpuRmsNormImpl;
namespace xllm::kernel::npu {

RmsNorm(const ModelContext& context)
: ModuleHolder(std::make_shared<NpuRmsNormImpl>(context)) {}
};

} // namespace kernel
} // namespace xllm
// Applies an activation function on NPU.
//
// Only "silu" and "swiglu" are accepted; both are routed to the fused NPU
// swiglu kernel. NOTE(review): "silu" being mapped to npu_swiglu assumes the
// custom op handles the non-gated case — confirm against the kernel docs.
// Any other mode aborts via LOG(FATAL).
torch::Tensor active(const torch::Tensor& input, const std::string& act_mode) {
  if (act_mode != "silu" && act_mode != "swiglu") {
    // Fixed: the old message claimed only swiglu was supported, but the
    // guard above also accepts "silu".
    LOG(FATAL) << "Only silu and swiglu activations are supported in NPU active";
  }
  return at_npu::native::custom_ops::npu_swiglu(input);
}
} // namespace xllm::kernel::npu
65 changes: 65 additions & 0 deletions xllm/core/kernels/npu/attention.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
/* Copyright 2025 The xLLM Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://github.com/jd-opensource/xllm/blob/main/LICENSE
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "npu_ops_api.h"
#include "ops_npu/npu_ops.h"
namespace xllm::kernel::npu {

// Scatters `key`/`value` rows into the paged KV cache at positions given by
// `slot_mapping`, via the ATB reshape-and-cache kernel.
//
// `value` and `v_cache` are optional in the signature but mandatory for this
// NPU path. Fixed: they were previously dereferenced unconditionally with
// .value(), which raises an opaque std::bad_optional_access when absent; fail
// with an explicit, actionable check instead.
void reshape_paged_cache(torch::Tensor& key,
                         std::optional<torch::Tensor>& value,
                         torch::Tensor& k_cache,
                         std::optional<torch::Tensor>& v_cache,
                         const torch::Tensor& slot_mapping) {
  TORCH_CHECK(value.has_value() && v_cache.has_value(),
              "reshape_paged_cache on NPU requires both value and v_cache");
  atb::npu_reshape_and_cache(key, *value, k_cache, *v_cache, slot_mapping);
}

// Runs the prefill-phase flash attention over a full batch.
//
// Head counts are not passed in; they are derived from the second-to-last
// (head) dimension of the query and key tensors. Results are written into
// `output` by the ATB kernel.
void batch_prefill(const torch::Tensor& query,
                   const torch::Tensor& key,
                   const torch::Tensor& value,
                   const torch::Tensor& mask,
                   const torch::Tensor& seq_len,
                   float scale,
                   torch::Tensor& output) {
  const int64_t q_head_count = query.size(-2);
  const int64_t kv_head_count = key.size(-2);
  atb::npu_flash_attention(query,
                           key,
                           value,
                           mask,
                           seq_len,
                           scale,
                           q_head_count,
                           kv_head_count,
                           output);
}

// Runs decode-phase paged attention: one new query token per sequence,
// attending over the paged KV cache addressed through `block_table`.
//
// Query and output are reshaped to flat [tokens, heads, head_dim] views
// before invoking the ATB kernel; the kernel writes through the output view.
void batch_decode(const torch::Tensor& query,
                  const torch::Tensor& k_cache,
                  const torch::Tensor& v_cache,
                  float scale,
                  const torch::Tensor& block_table,
                  const torch::Tensor& seq_lens,
                  torch::Tensor& output) {
  const int64_t head_dim = query.size(-1);
  const int64_t q_head_count = query.size(-2);
  const int64_t kv_head_count = k_cache.size(-2);
  // Views, not copies: writes to `out_view` land in `output`.
  auto q_view = query.view({-1, q_head_count, head_dim});
  auto out_view = output.view({-1, q_head_count, head_dim});
  atb::npu_paged_attention(q_view,
                           k_cache,
                           v_cache,
                           kv_head_count,
                           q_head_count,
                           scale,
                           block_table,
                           seq_lens,
                           out_view);
}

} // namespace xllm::kernel::npu
44 changes: 44 additions & 0 deletions xllm/core/kernels/npu/fused_layernorm.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
/* Copyright 2025 The xLLM Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://github.com/jd-opensource/xllm/blob/main/LICENSE
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <glog/logging.h>
#include <torch_npu/csrc/aten/CustomFunctions.h>

#include "npu_ops_api.h"
#include "ops_npu/npu_ops.h"

namespace xllm::kernel::npu {

// RMS-normalizes `input` with learned `weight` using the NPU custom op.
//
// Only mode == "rmsnorm" is supported; any other mode aborts via LOG(FATAL).
// Returns just the normalized tensor (the op's second output is discarded).
torch::Tensor rms_norm(const torch::Tensor& input,
                       const torch::Tensor& weight,
                       double eps,
                       const std::string& mode) {
  if (mode != "rmsnorm") {
    LOG(FATAL) << "Only rmsnorm mode is supported in NPU rms_norm";
  }
  // npu_rms_norm yields a pair; element 0 is the normalized result.
  return std::get<0>(
      at_npu::native::custom_ops::npu_rms_norm(input, weight, eps));
}

// Fused residual-add followed by RMS norm, delegated to the NPU custom op.
//
// Returns the op's full three-tensor tuple unchanged; callers pick the
// outputs they need.
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> add_rms_norm(
    const torch::Tensor& x1,
    const torch::Tensor& x2,
    const torch::Tensor& gamma,
    double epsilon) {
  auto fused_result =
      at_npu::native::custom_ops::npu_add_rms_norm(x1, x2, gamma, epsilon);
  return fused_result;
}

} // namespace xllm::kernel::npu
31 changes: 16 additions & 15 deletions xllm/core/kernels/npu/linear.h → xllm/core/kernels/npu/matmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,19 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#pragma once
#include "impl/npu_linear_impl.h"

namespace xllm::kernel {

class Linear : public torch::nn::ModuleHolder<NpuLinearImpl> {
public:
using torch::nn::ModuleHolder<NpuLinearImpl>::ModuleHolder;
using Impl __attribute__((__unused__)) = NpuLinearImpl;

Linear(const ModelContext& context)
: ModuleHolder(std::make_shared<NpuLinearImpl>(context)) {}
};

} // namespace xllm::kernel
#include "npu_ops_api.h"
#include "ops_npu/npu_ops.h"

namespace xllm::kernel::npu {

// Computes a linear transform a @ b^T (+ bias when provided), matching
// torch::nn::functional::linear semantics.
torch::Tensor matmul(const torch::Tensor& a,
                     const torch::Tensor& b,
                     const std::optional<torch::Tensor>& bias) {
  // Select the biased overload only when a bias tensor was supplied.
  return bias.has_value() ? torch::nn::functional::linear(a, b, *bias)
                          : torch::nn::functional::linear(a, b);
}

} // namespace xllm::kernel::npu
69 changes: 69 additions & 0 deletions xllm/core/kernels/npu/npu_ops_api.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/* Copyright 2025 The xLLM Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://github.com/jd-opensource/xllm/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#pragma once
#include <torch/torch.h>

#include <optional>
#include <tuple>

#include "custom_functions_npu/atb_common.h"

namespace xllm::kernel::npu {

// Scatters key/value rows into the paged KV cache at the positions given by
// `slot_mapping`. NOTE(review): the optional value/v_cache appear to be
// mandatory on this NPU path — confirm against the .cpp implementation.
void reshape_paged_cache(torch::Tensor& key,
                         std::optional<torch::Tensor>& value,
                         torch::Tensor& k_cache,
                         std::optional<torch::Tensor>& v_cache,
                         const torch::Tensor& slot_mapping);

// Prefill-phase flash attention over a batch; writes results into `output`.
// Head counts are derived from the [-2] dimension of query/key.
void batch_prefill(const torch::Tensor& query,
                   const torch::Tensor& key,
                   const torch::Tensor& value,
                   const torch::Tensor& mask,
                   const torch::Tensor& seq_len,
                   float scale,
                   torch::Tensor& output);

// Decode-phase paged attention over the KV cache addressed via `block_table`;
// writes results into `output`.
void batch_decode(const torch::Tensor& query,
                  const torch::Tensor& k_cache,
                  const torch::Tensor& v_cache,
                  float scale,
                  const torch::Tensor& block_table,
                  const torch::Tensor& seq_lens,
                  torch::Tensor& output);

// Linear transform a @ b^T with optional bias
// (torch::nn::functional::linear semantics).
torch::Tensor matmul(const torch::Tensor& a,
                     const torch::Tensor& b,
                     const std::optional<torch::Tensor>& bias);

// Activation on NPU; accepts "silu"/"swiglu", fatal on any other mode.
torch::Tensor active(const torch::Tensor& input, const std::string& act_mode);

// RMS normalization; only mode == "rmsnorm" is supported.
torch::Tensor rms_norm(const torch::Tensor& input,
                       const torch::Tensor& weight,
                       double eps,
                       const std::string& mode);

// Fused residual-add + RMS norm; returns the NPU op's three-tensor tuple.
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> add_rms_norm(
    const torch::Tensor& x1,
    const torch::Tensor& x2,
    const torch::Tensor& gamma,
    double epsilon);

// Applies rotary position embedding to q and k in place; also reassigns the
// callers' q/k references to reshaped 4-D views (see rope.cpp).
void apply_rotary(torch::Tensor& q,
                  torch::Tensor& k,
                  const torch::Tensor& cos_sin_cache,
                  const torch::Tensor& positions);
}  // namespace xllm::kernel::npu
42 changes: 42 additions & 0 deletions xllm/core/kernels/npu/rope.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
/* Copyright 2025 The xLLM Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://github.com/jd-opensource/xllm/blob/main/LICENSE
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <torch_npu/csrc/aten/CustomFunctions.h>

#include "npu_ops_api.h"
#include "ops_npu/npu_ops.h"

namespace xllm::kernel::npu {

// Applies rotary position embedding (RoPE) to q and k in place via the NPU
// custom op. Gathers the per-token cos/sin rows from `cos_sin_cache` using
// `positions`, expands them to full rotary width, and reshapes q/k to the
// 4-D layout the op expects.
//
// NOTE(review): reassigning `q` and `k` (taken by non-const reference) means
// the caller's tensors are left as the reshaped {1, tokens, heads, rotary_dim}
// views after this call — confirm callers expect that shape change.
// Assumes cos_sin_cache rows are [cos-half | sin-half] concatenated along the
// last dimension — TODO confirm against the cache builder.
void apply_rotary(torch::Tensor& q,
                  torch::Tensor& k,
                  const torch::Tensor& cos_sin_cache,
                  const torch::Tensor& positions) {
  // One cache row per token in this step.
  auto cos_sin = cos_sin_cache.index_select(0, positions);
  int64_t last_dim = cos_sin.size(-1);
  // Split each row into its two halves, duplicate each half to full width,
  // then separate into the cos part (chunk 0) and sin part (chunk 1).
  auto cos_sin_vec = cos_sin.view({-1, 2, last_dim / 2})
                         .repeat({1, 1, 2})
                         .chunk(2, /*dim=*/-2);
  auto cos = cos_sin_vec[0].view({1, -1, 1, last_dim});
  auto sin = cos_sin_vec[1].view({1, -1, 1, last_dim});

  const int64_t rotary_dim = sin.size(-1);
  // Reshape q/k to {batch=1, tokens, heads, rotary_dim} for the custom op.
  q = q.view({1, q.size(0), -1, rotary_dim});
  k = k.view({1, k.size(0), -1, rotary_dim});

  // In-place rotary application on both q and k.
  at_npu::native::custom_ops::npu_apply_rotary_pos_emb(q, k, cos, sin);
}

} // namespace xllm::kernel::npu
30 changes: 0 additions & 30 deletions xllm/core/kernels/npu/rope.h

This file was deleted.

Loading
Loading