
Commit 78a754c

Merge branch 'main' into mengfeil/test-triton
2 parents: 9303efd + 035049f

File tree: 11 files changed (+124, −76 lines)

.github/scripts/build.sh
Lines changed: 20 additions & 20 deletions

@@ -51,26 +51,26 @@ python -m pip install mkl-static==2025.2.0 mkl-include==2025.2.0
 export USE_STATIC_MKL=1
 if [ "${XPU_ONEAPI_PATH}" == "" ];then
     export PYTORCH_EXTRA_INSTALL_REQUIREMENTS=" \
-        intel-cmplr-lib-rt==2025.2.1 | \
-        intel-cmplr-lib-ur==2025.2.1 | \
-        intel-cmplr-lic-rt==2025.2.1 | \
-        intel-sycl-rt==2025.2.1 | \
-        oneccl-devel==2021.16.1 | \
-        oneccl==2021.16.1 | \
-        impi-rt==2021.16.1 | \
-        onemkl-sycl-blas==2025.2.0 | \
-        onemkl-sycl-dft==2025.2.0 | \
-        onemkl-sycl-lapack==2025.2.0 | \
-        onemkl-sycl-rng==2025.2.0 | \
-        onemkl-sycl-sparse==2025.2.0 | \
-        dpcpp-cpp-rt==2025.2.1 | \
-        intel-opencl-rt==2025.2.1 | \
-        mkl==2025.2.0 | \
-        intel-openmp==2025.2.1 | \
-        tbb==2022.2.0 | \
-        tcmlib==1.4.0 | \
-        umf==0.11.0 | \
-        intel-pti==0.13.1
+        intel-cmplr-lib-rt==2025.3.1 | \
+        intel-cmplr-lib-ur==2025.3.1 | \
+        intel-cmplr-lic-rt==2025.3.1 | \
+        intel-sycl-rt==2025.3.1 | \
+        oneccl-devel==2021.17.1 | \
+        oneccl==2021.17.1 | \
+        impi-rt==2021.17.0 | \
+        onemkl-sycl-blas==2025.3.0 | \
+        onemkl-sycl-dft==2025.3.0 | \
+        onemkl-sycl-lapack==2025.3.0 | \
+        onemkl-sycl-rng==2025.3.0 | \
+        onemkl-sycl-sparse==2025.3.0 | \
+        dpcpp-cpp-rt==2025.3.1 | \
+        intel-opencl-rt==2025.3.1 | \
+        mkl==2025.3.0 | \
+        intel-openmp==2025.3.1 | \
+        tbb==2022.3.0 | \
+        tcmlib==1.4.1 | \
+        umf==1.0.2 | \
+        intel-pti==0.15.0
     "
 fi

.github/scripts/install_xpu.bat
Lines changed: 5 additions & 7 deletions

@@ -2,12 +2,10 @@
 REM Description: Install Intel Support Packages on Windows
 REM BKM reference: https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html

-:xpu_bundle_install_start
-
 set XPU_BUNDLE_PARENT_DIR=C:\Program Files (x86)\Intel\oneAPI
-set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe
+set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
 set XPU_BUNDLE_PRODUCT_NAME=intel.oneapi.win.deep-learning-essentials.product
-set XPU_BUNDLE_VERSION=2025.1.3+5
+set XPU_BUNDLE_VERSION=2025.2.1+20
 set XPU_BUNDLE_INSTALLED=0
 set XPU_BUNDLE_UNINSTALL=0
 set XPU_EXTRA_URL=NULL
@@ -16,9 +14,9 @@ set XPU_EXTRA_VERSION=2025.0.1+1226
 set XPU_EXTRA_INSTALLED=0
 set XPU_EXTRA_UNINSTALL=0

-if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.2] (
-    set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
-    set XPU_BUNDLE_VERSION=2025.2.1+20
+if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.3] (
+    set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/0909c8b0-1475-414f-a9a9-489ee3822dbf/intel-deep-learning-essentials-2025.3.1.11_offline.exe
+    set XPU_BUNDLE_VERSION=2025.3.1+8
 )

 :: Check if XPU bundle is target version or already installed

.github/scripts/ut_result_check.sh
Lines changed: 1 addition & 1 deletion

@@ -79,7 +79,7 @@ check_passed_known_issues() {
     fi
     # Mark passed items in GitHub issues with strikethrough
     if [ "$GITHUB_EVENT_NAME" == "schedule" ] && [ "$inputs_pytorch" != "nightly_wheel" ];then
-        mark_passed_issue "$output_file" "$known_file"
+        mark_passed_issue "$output_file" "issues.log"
     fi
     rm -f "$output_file" # Clean up temporary file
 }

.github/workflows/_windows_ut.yml
Lines changed: 19 additions & 21 deletions

@@ -31,7 +31,7 @@ on:
       xpu_version:
         required: false
         type: string
-        default: '2025.2'
+        default: '2025.3'
         description: Python version
       src_changed:
         required: true
@@ -49,26 +49,24 @@ permissions: read-all
 env:
   USE_XPU: 1
   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: >-
-    intel-cmplr-lib-rt==2025.2.1 |
-    intel-cmplr-lib-ur==2025.2.1 |
-    intel-cmplr-lic-rt==2025.2.1 |
-    intel-sycl-rt==2025.2.1 |
-    oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' |
-    oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' |
-    impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' |
-    onemkl-sycl-blas==2025.2.0 |
-    onemkl-sycl-dft==2025.2.0 |
-    onemkl-sycl-lapack==2025.2.0 |
-    onemkl-sycl-rng==2025.2.0 |
-    onemkl-sycl-sparse==2025.2.0 |
-    dpcpp-cpp-rt==2025.2.1 |
-    intel-opencl-rt==2025.2.1 |
-    mkl==2025.2.0 |
-    intel-openmp==2025.2.1 |
-    tbb==2022.2.0 |
-    tcmlib==1.4.0 |
-    umf==0.11.0 |
-    intel-pti==0.13.1
+    intel-cmplr-lib-rt==2025.3.1 |
+    intel-cmplr-lib-ur==2025.3.1 |
+    intel-cmplr-lic-rt==2025.3.1 |
+    intel-sycl-rt==2025.3.1 |
+    onemkl-license==2025.3.0 |
+    onemkl-sycl-blas==2025.3.0 |
+    onemkl-sycl-dft==2025.3.0 |
+    onemkl-sycl-lapack==2025.3.0 |
+    onemkl-sycl-rng==2025.3.0 |
+    onemkl-sycl-sparse==2025.3.0 |
+    dpcpp-cpp-rt==2025.3.1 |
+    intel-opencl-rt==2025.3.1 |
+    mkl==2025.3.0 |
+    intel-openmp==2025.3.1 |
+    tbb==2022.3.0 |
+    tcmlib==1.4.1 |
+    umf==1.0.2 |
+    intel-pti==0.15.0

 jobs:
   ut_test:

cmake/BuildFlags.cmake
Lines changed: 5 additions & 3 deletions

@@ -55,9 +55,11 @@ macro(set_build_flags)
   # SYCL headers, such as deprecated warnings, even if warned API is not actually used in the program.
   # We expect that this issue will be addressed in the later version of DPC++ compiler. To workaround
   # the issue we wrap paths to SYCL headers in `-isystem`.
-  foreach(FLAGS IN LISTS SYCL_INCLUDE_DIR)
-    list(APPEND SYCL_HOST_FLAGS "-isystem ${FLAGS}")
-  endforeach()
+  if(SYCL_COMPILER_VERSION VERSION_LESS 20250300)
+    foreach(FLAGS IN LISTS SYCL_INCLUDE_DIR)
+      list(APPEND SYCL_HOST_FLAGS "-isystem ${FLAGS}")
+    endforeach()
+  endif()
   # Excluding warnings which flood the compilation output
   # TODO: fix warnings in the source code and then reenable them in compilation
   list(APPEND SYCL_HOST_FLAGS -Wno-sign-compare)

src/ATen/native/transformers/xpu/flash_attn/sycltla/kernel/xe_sdpa_fwd_bshd.h
Lines changed: 2 additions & 2 deletions

@@ -191,7 +191,7 @@ class FMHAPrefill {
   }

   // Find the length of the longest non masked sequence within that subgroup
-  int calculate_longest_non_masked_length(
+  CUTLASS_DEVICE int calculate_longest_non_masked_length(
       const int& seq_len_kv,
       const int& seq_len_qo,
       const int& last_seq_coord,
@@ -222,7 +222,7 @@
   }

   template <class Tensor>
-  void handle_corner_cases(
+  CUTLASS_DEVICE void handle_corner_cases(
       Tensor& tSr,
       const int& thread_idx,
       const int& SubgroupSize,
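
Both helpers are called from the kernel's device-side body, so this change gives them the same CUTLASS device-function qualifier as the rest of the class. Below is a minimal sketch of that convention, assuming cutlass/cutlass.h is on the include path and defines CUTLASS_DEVICE as it does upstream (forced inlining plus __device__ in CUDA builds; the SYCL port maps it to an equivalent inline qualifier); ExampleKernel and its members are hypothetical stand-ins, not the FMHAPrefill code:

#include <cutlass/cutlass.h>  // provides the CUTLASS_DEVICE macro

// Hypothetical kernel functor: a member helper invoked from operator()
// carries the same device-function qualifier as its caller.
struct ExampleKernel {
  // CUTLASS_DEVICE marks the helper for device compilation and forced
  // inlining (the exact expansion depends on the backend).
  CUTLASS_DEVICE int clamp_to_length(int idx, int len) const {
    return idx < len ? idx : len - 1;
  }

  CUTLASS_DEVICE void operator()(int idx, int len) const {
    int safe = clamp_to_length(idx, len);  // device-to-device call
    (void)safe;
  }
};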

src/ATen/native/transformers/xpu/flash_attn/sycltla/mha_bwd.cpp
Lines changed: 3 additions & 4 deletions

@@ -1461,18 +1461,17 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> flash_attention_backward_sycltla(
          .get_info<
              sycl::ext::oneapi::experimental::info::device::architecture>();
   constexpr auto supported_architectures =
-      std::array<sycl::ext::oneapi::experimental::architecture, 4>{
+      std::array<sycl::ext::oneapi::experimental::architecture, 3>{
           sycl::ext::oneapi::experimental::architecture::intel_gpu_pvc,
           sycl::ext::oneapi::experimental::architecture::intel_gpu_pvc_vg,
-          sycl::ext::oneapi::experimental::architecture::intel_gpu_bmg_g21,
-          sycl::ext::oneapi::experimental::architecture::intel_gpu_bmg_g31};
+          sycl::ext::oneapi::experimental::architecture::intel_gpu_bmg_g21};
   if (std::find(
           supported_architectures.begin(),
           supported_architectures.end(),
           device_architecture) == supported_architectures.end()) {
     TORCH_CHECK(
         false,
-        "XPU device architecture does not support flash attention backward. Supported architectures are: intel_gpu_pvc, intel_gpu_pvc_vg, intel_gpu_bmg_g21, intel_gpu_bmg_g31.");
+        "XPU device architecture does not support flash attention backward. Supported architectures are: intel_gpu_pvc, intel_gpu_pvc_vg, intel_gpu_bmg_g21.");
   }

   auto grad_query = at::empty_like(query);
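
For reference, a standalone sketch of the same architecture gate, runnable outside ATen, assuming a DPC++ compiler that implements the sycl_ext_oneapi_device_architecture extension (the get_info query and enum values are the ones used in the hunk above; the printed labels are illustrative only):

#include <sycl/sycl.hpp>
#include <algorithm>
#include <array>
#include <iostream>

int main() {
  namespace syclex = sycl::ext::oneapi::experimental;
  sycl::queue q{sycl::gpu_selector_v};

  // Query the architecture of the selected GPU device.
  auto arch = q.get_device().get_info<syclex::info::device::architecture>();

  // After this commit, intel_gpu_bmg_g31 is no longer in the supported set.
  constexpr std::array<syclex::architecture, 3> supported{
      syclex::architecture::intel_gpu_pvc,
      syclex::architecture::intel_gpu_pvc_vg,
      syclex::architecture::intel_gpu_bmg_g21};

  bool ok =
      std::find(supported.begin(), supported.end(), arch) != supported.end();
  std::cout << (ok ? "flash attention backward: supported"
                   : "flash attention backward: unsupported")
            << std::endl;
}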

src/ATen/native/transformers/xpu/flash_attn/sycltla/mha_fwd.cpp
Lines changed: 15 additions & 16 deletions

@@ -333,19 +333,19 @@ void run_mha_fwd_(
          TileShapeOutPut,
          SubgroupLayout,
          PipelineStages);
+    } else {
+      constexpr int PipelineStages = 2;
+      using TileShapeQK = Shape<_256, _32, _64>;
+      using TileShapePV = Shape<_256, _32, _32>;
+      using TileShapeOutPut = Shape<_256, _128, _32>;
+      using SubgroupLayout = Layout<Shape<_16, _1, _1>, Stride<_1, _1, _1>>;
+      run_mha_fwd_specialized(
+          TileShapeQK,
+          TileShapePV,
+          TileShapeOutPut,
+          SubgroupLayout,
+          PipelineStages);
     }
-
-    constexpr int PipelineStages = 2;
-    using TileShapeQK = Shape<_256, _32, _64>;
-    using TileShapePV = Shape<_256, _32, _32>;
-    using TileShapeOutPut = Shape<_256, _128, _32>;
-    using SubgroupLayout = Layout<Shape<_16, _1, _1>, Stride<_1, _1, _1>>;
-    run_mha_fwd_specialized(
-        TileShapeQK,
-        TileShapePV,
-        TileShapeOutPut,
-        SubgroupLayout,
-        PipelineStages);
   } else if (headdim == 192) {
     constexpr int PipelineStages = 2;
     using TileShapeQK = Shape<_256, _64, _64>;
@@ -537,18 +537,17 @@ flash_attention_forward_sycltla(
          .get_info<
              sycl::ext::oneapi::experimental::info::device::architecture>();
   constexpr auto supported_architectures =
-      std::array<sycl::ext::oneapi::experimental::architecture, 4>{
+      std::array<sycl::ext::oneapi::experimental::architecture, 3>{
           sycl::ext::oneapi::experimental::architecture::intel_gpu_pvc,
           sycl::ext::oneapi::experimental::architecture::intel_gpu_pvc_vg,
-          sycl::ext::oneapi::experimental::architecture::intel_gpu_bmg_g21,
-          sycl::ext::oneapi::experimental::architecture::intel_gpu_bmg_g31};
+          sycl::ext::oneapi::experimental::architecture::intel_gpu_bmg_g21};
   if (std::find(
           supported_architectures.begin(),
           supported_architectures.end(),
           device_architecture) == supported_architectures.end()) {
     TORCH_CHECK(
         false,
-        "XPU device architecture does not support flash attention. Supported architectures are: intel_gpu_pvc, intel_gpu_pvc_vg, intel_gpu_bmg_g21, intel_gpu_bmg_g31.");
+        "XPU device architecture does not support flash attention. Supported architectures are: intel_gpu_pvc, intel_gpu_pvc_vg, intel_gpu_bmg_g21.");
   }

   auto problem_shape = ProblemShapeRegular(
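
The first hunk moves the fallback tile configuration into the else branch of the preceding if, so the fallback run_mha_fwd_specialized call no longer runs in addition to the branch above it. A minimal sketch of that control-flow change, with a hypothetical condition and launcher name standing in for the real head-dim/tile-shape dispatch:

#include <cstdio>

// Hypothetical stand-in for run_mha_fwd_specialized.
void launch(const char* config) { std::printf("launching %s kernel\n", config); }

void dispatch(bool prefers_wide_tiles) {
  if (prefers_wide_tiles) {
    launch("wide-tile");
  } else {
    // Previously this configuration ran after the if-block regardless of
    // which path was taken; as the else branch it is now the exclusive
    // alternative, so each call launches exactly one kernel.
    launch("default-tile");
  }
}

int main() {
  dispatch(true);   // one launch
  dispatch(false);  // one launch
}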

src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp
Lines changed: 2 additions & 2 deletions

@@ -695,15 +695,15 @@ struct AdaptiveAvgPool2dKernelFunctor_cl {
         numel_(numel) {}

  private:
+  vec_t* output_;
+  const vec_t* input_;
   int ih_;
   int iw_;
   int ob_;
   int oc_;
   int oh_;
   int ow_;
   int64_t numel_;
-  const vec_t* input_;
-  vec_t* output_;
 };

 #define LAUNCH_AVGPOOL_CHANNEL_LAST_VEC( \
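
Moving output_ and input_ ahead of the scalar members likely keeps the declaration order in line with the constructor's initializer list: C++ always constructs non-static data members in declaration order, not in initializer-list order, and a mismatch triggers -Wreorder and can let an initializer read a not-yet-initialized member. A small self-contained illustration with a hypothetical struct (not the ATen functor):

#include <iostream>

struct Reordered {
  // The initializer list is written b_(b), a_(a), but members are still
  // constructed in declaration order: a_ first, then b_.
  Reordered(int a, int b) : b_(b), a_(a) {}  // typically warns: -Wreorder

  int a_;
  int b_;
};

int main() {
  Reordered r{1, 2};
  std::cout << r.a_ << " " << r.b_ << std::endl;  // prints "1 2"
}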

src/ATen/native/xpu/sycl/CopyKernel.cpp
Lines changed: 11 additions & 0 deletions

@@ -25,6 +25,17 @@ struct CastScalarFunc {
   }
 };

+// TODO: Avoid using sycl::half to prevent the fp16->fp32->fp8 fusion
+// from incorrectly converting -0.0 to NaN. This temporary fix should
+// be removed once the compiler/driver error is resolved.
+template <typename Float8DataType>
+struct CastScalarFunc<Half, Float8DataType> {
+  Float8DataType operator()(Half src_val) const {
+    Half val = src_val == Half(-0.0) ? Half(0.0) : src_val;
+    return Float8DataType(val);
+  }
+};
+
 void float8_copy_kernel_xpu(TensorIteratorBase& iter) {
   ScalarType dtype = iter.dtype(0);
   ScalarType other_dtype = iter.dtype(1);