Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,7 @@
url = https://gitcode.com/xLLM-AI/spdlog.git
[submodule "third_party/Mooncake"]
path = third_party/Mooncake
url = https://gitcode.com/xLLM-AI/Mooncake.git
url = https://gitcode.com/xLLM-AI/Mooncake.git
[submodule "third_party/torch_npu_ops"]
path = third_party/torch_npu_ops
url = https://gitcode.com/xLLM-AI/torch_npu_ops.git
1 change: 1 addition & 0 deletions third_party/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ add_subdirectory(etcd_cpp_apiv3)
if(USE_NPU)
add_subdirectory(spdlog)
add_subdirectory(hccl_transfer/hccl_transfer)
add_subdirectory(torch_npu_ops)
endif()
add_subdirectory(Mooncake)

Expand Down
1 change: 1 addition & 0 deletions third_party/torch_npu_ops
Submodule torch_npu_ops added at 2bc8f5
2 changes: 1 addition & 1 deletion xllm/core/kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ cc_library(
ops_api.cpp
DEPS
torch
$<$<BOOL:${USE_NPU}>:npu_kernels>
$<$<BOOL:${USE_NPU}>:torch_npu_kernels>
$<$<BOOL:${USE_MLU}>:mlu_kernels>
$<$<BOOL:${USE_CUDA}>:cuda_kernels>
)
13 changes: 1 addition & 12 deletions xllm/core/kernels/npu/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,3 @@
include(cc_library)

add_subdirectory(xllm_ops)

cc_library(
NAME
npu_kernels
HDRS
linear.h
split.h
rope.h
DEPS
# spdlog::spdlog
)
add_subdirectory(xllm_ops)
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,18 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#pragma once
#include "impl/npu_rms_norm_impl.h"
#include <glog/logging.h>
#include <torch_npu/csrc/aten/CustomFunctions.h>

namespace xllm {
namespace kernel {
#include "npu_ops_api.h"
#include "ops_npu/npu_ops.h"

class RmsNorm : public torch::nn::ModuleHolder<NpuRmsNormImpl> {
public:
using torch::nn::ModuleHolder<NpuRmsNormImpl>::ModuleHolder;
using Impl __attribute__((__unused__)) = NpuRmsNormImpl;
namespace xllm::kernel::npu {

RmsNorm(const ModelContext& context)
: ModuleHolder(std::make_shared<NpuRmsNormImpl>(context)) {}
};

} // namespace kernel
} // namespace xllm
// Applies an activation function on NPU.
//
// Only "silu" and "swiglu" are accepted; both are routed to the fused NPU
// swiglu kernel. NOTE(review): "silu" being mapped to npu_swiglu assumes the
// custom op handles the non-gated case — confirm against the kernel docs.
// Any other mode aborts via LOG(FATAL).
torch::Tensor active(const torch::Tensor& input, const std::string& act_mode) {
  if (act_mode != "silu" && act_mode != "swiglu") {
    // Fixed: the old message claimed only swiglu was supported, but the
    // guard above also accepts "silu".
    LOG(FATAL) << "Only silu and swiglu activations are supported in NPU active";
  }
  return at_npu::native::custom_ops::npu_swiglu(input);
}
} // namespace xllm::kernel::npu
65 changes: 65 additions & 0 deletions xllm/core/kernels/npu/attention.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
/* Copyright 2025 The xLLM Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://github.com/jd-opensource/xllm/blob/main/LICENSE
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "npu_ops_api.h"
#include "ops_npu/npu_ops.h"
namespace xllm::kernel::npu {

// Scatters `key`/`value` rows into the paged KV cache at positions given by
// `slot_mapping`, via the ATB reshape-and-cache kernel.
//
// `value` and `v_cache` are optional in the signature but mandatory for this
// NPU path. Fixed: they were previously dereferenced unconditionally with
// .value(), which raises an opaque std::bad_optional_access when absent; fail
// with an explicit, actionable check instead.
void reshape_paged_cache(torch::Tensor& key,
                         std::optional<torch::Tensor>& value,
                         torch::Tensor& k_cache,
                         std::optional<torch::Tensor>& v_cache,
                         const torch::Tensor& slot_mapping) {
  TORCH_CHECK(value.has_value() && v_cache.has_value(),
              "reshape_paged_cache on NPU requires both value and v_cache");
  atb::npu_reshape_and_cache(key, *value, k_cache, *v_cache, slot_mapping);
}

// Runs the prefill-phase flash attention over a full batch.
//
// Head counts are not passed in; they are derived from the second-to-last
// (head) dimension of the query and key tensors. Results are written into
// `output` by the ATB kernel.
void batch_prefill(const torch::Tensor& query,
                   const torch::Tensor& key,
                   const torch::Tensor& value,
                   const torch::Tensor& mask,
                   const torch::Tensor& seq_len,
                   float scale,
                   torch::Tensor& output) {
  const int64_t q_head_count = query.size(-2);
  const int64_t kv_head_count = key.size(-2);
  atb::npu_flash_attention(query,
                           key,
                           value,
                           mask,
                           seq_len,
                           scale,
                           q_head_count,
                           kv_head_count,
                           output);
}

// Runs decode-phase paged attention: one new query token per sequence,
// attending over the paged KV cache addressed through `block_table`.
//
// Query and output are reshaped to flat [tokens, heads, head_dim] views
// before invoking the ATB kernel; the kernel writes through the output view.
void batch_decode(const torch::Tensor& query,
                  const torch::Tensor& k_cache,
                  const torch::Tensor& v_cache,
                  float scale,
                  const torch::Tensor& block_table,
                  const torch::Tensor& seq_lens,
                  torch::Tensor& output) {
  const int64_t head_dim = query.size(-1);
  const int64_t q_head_count = query.size(-2);
  const int64_t kv_head_count = k_cache.size(-2);
  // Views, not copies: writes to `out_view` land in `output`.
  auto q_view = query.view({-1, q_head_count, head_dim});
  auto out_view = output.view({-1, q_head_count, head_dim});
  atb::npu_paged_attention(q_view,
                           k_cache,
                           v_cache,
                           kv_head_count,
                           q_head_count,
                           scale,
                           block_table,
                           seq_lens,
                           out_view);
}

} // namespace xllm::kernel::npu
44 changes: 44 additions & 0 deletions xllm/core/kernels/npu/fused_layernorm.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
/* Copyright 2025 The xLLM Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://github.com/jd-opensource/xllm/blob/main/LICENSE
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <glog/logging.h>
#include <torch_npu/csrc/aten/CustomFunctions.h>

#include "npu_ops_api.h"
#include "ops_npu/npu_ops.h"

namespace xllm::kernel::npu {

// RMS-normalizes `input` with learned `weight` using the NPU custom op.
//
// Only mode == "rmsnorm" is supported; any other mode aborts via LOG(FATAL).
// Returns just the normalized tensor (the op's second output is discarded).
torch::Tensor rms_norm(const torch::Tensor& input,
                       const torch::Tensor& weight,
                       double eps,
                       const std::string& mode) {
  if (mode != "rmsnorm") {
    LOG(FATAL) << "Only rmsnorm mode is supported in NPU rms_norm";
  }
  // npu_rms_norm yields a pair; element 0 is the normalized result.
  return std::get<0>(
      at_npu::native::custom_ops::npu_rms_norm(input, weight, eps));
}

// Fused residual-add followed by RMS norm, delegated to the NPU custom op.
//
// Returns the op's full three-tensor tuple unchanged; callers pick the
// outputs they need.
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> add_rms_norm(
    const torch::Tensor& x1,
    const torch::Tensor& x2,
    const torch::Tensor& gamma,
    double epsilon) {
  auto fused_result =
      at_npu::native::custom_ops::npu_add_rms_norm(x1, x2, gamma, epsilon);
  return fused_result;
}

} // namespace xllm::kernel::npu
31 changes: 16 additions & 15 deletions xllm/core/kernels/npu/linear.h → xllm/core/kernels/npu/matmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,19 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#pragma once
#include "impl/npu_linear_impl.h"

namespace xllm::kernel {

class Linear : public torch::nn::ModuleHolder<NpuLinearImpl> {
public:
using torch::nn::ModuleHolder<NpuLinearImpl>::ModuleHolder;
using Impl __attribute__((__unused__)) = NpuLinearImpl;

Linear(const ModelContext& context)
: ModuleHolder(std::make_shared<NpuLinearImpl>(context)) {}
};

} // namespace xllm::kernel
#include "npu_ops_api.h"
#include "ops_npu/npu_ops.h"

namespace xllm::kernel::npu {

// Computes a linear transform a @ b^T (+ bias when provided), matching
// torch::nn::functional::linear semantics.
torch::Tensor matmul(const torch::Tensor& a,
                     const torch::Tensor& b,
                     const std::optional<torch::Tensor>& bias) {
  // Select the biased overload only when a bias tensor was supplied.
  return bias.has_value() ? torch::nn::functional::linear(a, b, *bias)
                          : torch::nn::functional::linear(a, b);
}

} // namespace xllm::kernel::npu
69 changes: 69 additions & 0 deletions xllm/core/kernels/npu/npu_ops_api.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/* Copyright 2025 The xLLM Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://github.com/jd-opensource/xllm/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#pragma once
#include <torch/torch.h>

#include <optional>
#include <tuple>

#include "custom_functions_npu/atb_common.h"

namespace xllm::kernel::npu {

// Scatters key/value rows into the paged KV cache at the positions given by
// `slot_mapping`. NOTE(review): the optional value/v_cache appear to be
// mandatory on this NPU path — confirm against the .cpp implementation.
void reshape_paged_cache(torch::Tensor& key,
                         std::optional<torch::Tensor>& value,
                         torch::Tensor& k_cache,
                         std::optional<torch::Tensor>& v_cache,
                         const torch::Tensor& slot_mapping);

// Prefill-phase flash attention over a batch; writes results into `output`.
// Head counts are derived from the [-2] dimension of query/key.
void batch_prefill(const torch::Tensor& query,
                   const torch::Tensor& key,
                   const torch::Tensor& value,
                   const torch::Tensor& mask,
                   const torch::Tensor& seq_len,
                   float scale,
                   torch::Tensor& output);

// Decode-phase paged attention over the KV cache addressed via `block_table`;
// writes results into `output`.
void batch_decode(const torch::Tensor& query,
                  const torch::Tensor& k_cache,
                  const torch::Tensor& v_cache,
                  float scale,
                  const torch::Tensor& block_table,
                  const torch::Tensor& seq_lens,
                  torch::Tensor& output);

// Linear transform a @ b^T with optional bias
// (torch::nn::functional::linear semantics).
torch::Tensor matmul(const torch::Tensor& a,
                     const torch::Tensor& b,
                     const std::optional<torch::Tensor>& bias);

// Activation on NPU; accepts "silu"/"swiglu", fatal on any other mode.
torch::Tensor active(const torch::Tensor& input, const std::string& act_mode);

// RMS normalization; only mode == "rmsnorm" is supported.
torch::Tensor rms_norm(const torch::Tensor& input,
                       const torch::Tensor& weight,
                       double eps,
                       const std::string& mode);

// Fused residual-add + RMS norm; returns the NPU op's three-tensor tuple.
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> add_rms_norm(
    const torch::Tensor& x1,
    const torch::Tensor& x2,
    const torch::Tensor& gamma,
    double epsilon);

// Applies rotary position embedding to q and k in place; also reassigns the
// callers' q/k references to reshaped 4-D views (see rope.cpp).
void apply_rotary(torch::Tensor& q,
                  torch::Tensor& k,
                  const torch::Tensor& cos_sin_cache,
                  const torch::Tensor& positions);
}  // namespace xllm::kernel::npu
42 changes: 42 additions & 0 deletions xllm/core/kernels/npu/rope.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
/* Copyright 2025 The xLLM Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://github.com/jd-opensource/xllm/blob/main/LICENSE
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <torch_npu/csrc/aten/CustomFunctions.h>

#include "npu_ops_api.h"
#include "ops_npu/npu_ops.h"

namespace xllm::kernel::npu {

// Applies rotary position embedding (RoPE) to q and k in place via the NPU
// custom op. Gathers the per-token cos/sin rows from `cos_sin_cache` using
// `positions`, expands them to full rotary width, and reshapes q/k to the
// 4-D layout the op expects.
//
// NOTE(review): reassigning `q` and `k` (taken by non-const reference) means
// the caller's tensors are left as the reshaped {1, tokens, heads, rotary_dim}
// views after this call — confirm callers expect that shape change.
// Assumes cos_sin_cache rows are [cos-half | sin-half] concatenated along the
// last dimension — TODO confirm against the cache builder.
void apply_rotary(torch::Tensor& q,
                  torch::Tensor& k,
                  const torch::Tensor& cos_sin_cache,
                  const torch::Tensor& positions) {
  // One cache row per token in this step.
  auto cos_sin = cos_sin_cache.index_select(0, positions);
  int64_t last_dim = cos_sin.size(-1);
  // Split each row into its two halves, duplicate each half to full width,
  // then separate into the cos part (chunk 0) and sin part (chunk 1).
  auto cos_sin_vec = cos_sin.view({-1, 2, last_dim / 2})
                         .repeat({1, 1, 2})
                         .chunk(2, /*dim=*/-2);
  auto cos = cos_sin_vec[0].view({1, -1, 1, last_dim});
  auto sin = cos_sin_vec[1].view({1, -1, 1, last_dim});

  const int64_t rotary_dim = sin.size(-1);
  // Reshape q/k to {batch=1, tokens, heads, rotary_dim} for the custom op.
  q = q.view({1, q.size(0), -1, rotary_dim});
  k = k.view({1, k.size(0), -1, rotary_dim});

  // In-place rotary application on both q and k.
  at_npu::native::custom_ops::npu_apply_rotary_pos_emb(q, k, cos, sin);
}

} // namespace xllm::kernel::npu
30 changes: 0 additions & 30 deletions xllm/core/kernels/npu/rope.h

This file was deleted.

Loading
Loading