
Commit 78a754c

Merge branch 'main' into mengfeil/test-triton
2 parents: 9303efd + 035049f

File tree: 11 files changed (+124, −76 lines)

.github/scripts/build.sh
Lines changed: 20 additions & 20 deletions

@@ -51,26 +51,26 @@ python -m pip install mkl-static==2025.2.0 mkl-include==2025.2.0
 export USE_STATIC_MKL=1
 if [ "${XPU_ONEAPI_PATH}" == "" ];then
     export PYTORCH_EXTRA_INSTALL_REQUIREMENTS=" \
-        intel-cmplr-lib-rt==2025.2.1 | \
-        intel-cmplr-lib-ur==2025.2.1 | \
-        intel-cmplr-lic-rt==2025.2.1 | \
-        intel-sycl-rt==2025.2.1 | \
-        oneccl-devel==2021.16.1 | \
-        oneccl==2021.16.1 | \
-        impi-rt==2021.16.1 | \
-        onemkl-sycl-blas==2025.2.0 | \
-        onemkl-sycl-dft==2025.2.0 | \
-        onemkl-sycl-lapack==2025.2.0 | \
-        onemkl-sycl-rng==2025.2.0 | \
-        onemkl-sycl-sparse==2025.2.0 | \
-        dpcpp-cpp-rt==2025.2.1 | \
-        intel-opencl-rt==2025.2.1 | \
-        mkl==2025.2.0 | \
-        intel-openmp==2025.2.1 | \
-        tbb==2022.2.0 | \
-        tcmlib==1.4.0 | \
-        umf==0.11.0 | \
-        intel-pti==0.13.1
+        intel-cmplr-lib-rt==2025.3.1 | \
+        intel-cmplr-lib-ur==2025.3.1 | \
+        intel-cmplr-lic-rt==2025.3.1 | \
+        intel-sycl-rt==2025.3.1 | \
+        oneccl-devel==2021.17.1 | \
+        oneccl==2021.17.1 | \
+        impi-rt==2021.17.0 | \
+        onemkl-sycl-blas==2025.3.0 | \
+        onemkl-sycl-dft==2025.3.0 | \
+        onemkl-sycl-lapack==2025.3.0 | \
+        onemkl-sycl-rng==2025.3.0 | \
+        onemkl-sycl-sparse==2025.3.0 | \
+        dpcpp-cpp-rt==2025.3.1 | \
+        intel-opencl-rt==2025.3.1 | \
+        mkl==2025.3.0 | \
+        intel-openmp==2025.3.1 | \
+        tbb==2022.3.0 | \
+        tcmlib==1.4.1 | \
+        umf==1.0.2 | \
+        intel-pti==0.15.0
     "
 fi

.github/scripts/install_xpu.bat
Lines changed: 5 additions & 7 deletions

@@ -2,12 +2,10 @@
 REM Description: Install Intel Support Packages on Windows
 REM BKM reference: https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html

-:xpu_bundle_install_start
-
 set XPU_BUNDLE_PARENT_DIR=C:\Program Files (x86)\Intel\oneAPI
-set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe
+set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
 set XPU_BUNDLE_PRODUCT_NAME=intel.oneapi.win.deep-learning-essentials.product
-set XPU_BUNDLE_VERSION=2025.1.3+5
+set XPU_BUNDLE_VERSION=2025.2.1+20
 set XPU_BUNDLE_INSTALLED=0
 set XPU_BUNDLE_UNINSTALL=0
 set XPU_EXTRA_URL=NULL
@@ -16,9 +14,9 @@ set XPU_EXTRA_VERSION=2025.0.1+1226
 set XPU_EXTRA_INSTALLED=0
 set XPU_EXTRA_UNINSTALL=0

-if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.2] (
-    set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
-    set XPU_BUNDLE_VERSION=2025.2.1+20
+if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.3] (
+    set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/0909c8b0-1475-414f-a9a9-489ee3822dbf/intel-deep-learning-essentials-2025.3.1.11_offline.exe
+    set XPU_BUNDLE_VERSION=2025.3.1+8
 )

 :: Check if XPU bundle is target version or already installed

.github/scripts/ut_result_check.sh
Lines changed: 1 addition & 1 deletion

@@ -79,7 +79,7 @@ check_passed_known_issues() {
     fi
     # Mark passed items in GitHub issues with strikethrough
     if [ "$GITHUB_EVENT_NAME" == "schedule" ] && [ "$inputs_pytorch" != "nightly_wheel" ];then
-        mark_passed_issue "$output_file" "$known_file"
+        mark_passed_issue "$output_file" "issues.log"
     fi
     rm -f "$output_file" # Clean up temporary file
 }

.github/workflows/_windows_ut.yml
Lines changed: 19 additions & 21 deletions

@@ -31,7 +31,7 @@ on:
       xpu_version:
         required: false
         type: string
-        default: '2025.2'
+        default: '2025.3'
         description: Python version
       src_changed:
         required: true
@@ -49,26 +49,24 @@ permissions: read-all
 env:
   USE_XPU: 1
   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: >-
-    intel-cmplr-lib-rt==2025.2.1 |
-    intel-cmplr-lib-ur==2025.2.1 |
-    intel-cmplr-lic-rt==2025.2.1 |
-    intel-sycl-rt==2025.2.1 |
-    oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' |
-    oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' |
-    impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' |
-    onemkl-sycl-blas==2025.2.0 |
-    onemkl-sycl-dft==2025.2.0 |
-    onemkl-sycl-lapack==2025.2.0 |
-    onemkl-sycl-rng==2025.2.0 |
-    onemkl-sycl-sparse==2025.2.0 |
-    dpcpp-cpp-rt==2025.2.1 |
-    intel-opencl-rt==2025.2.1 |
-    mkl==2025.2.0 |
-    intel-openmp==2025.2.1 |
-    tbb==2022.2.0 |
-    tcmlib==1.4.0 |
-    umf==0.11.0 |
-    intel-pti==0.13.1
+    intel-cmplr-lib-rt==2025.3.1 |
+    intel-cmplr-lib-ur==2025.3.1 |
+    intel-cmplr-lic-rt==2025.3.1 |
+    intel-sycl-rt==2025.3.1 |
+    onemkl-license==2025.3.0 |
+    onemkl-sycl-blas==2025.3.0 |
+    onemkl-sycl-dft==2025.3.0 |
+    onemkl-sycl-lapack==2025.3.0 |
+    onemkl-sycl-rng==2025.3.0 |
+    onemkl-sycl-sparse==2025.3.0 |
+    dpcpp-cpp-rt==2025.3.1 |
+    intel-opencl-rt==2025.3.1 |
+    mkl==2025.3.0 |
+    intel-openmp==2025.3.1 |
+    tbb==2022.3.0 |
+    tcmlib==1.4.1 |
+    umf==1.0.2 |
+    intel-pti==0.15.0

 jobs:
   ut_test:

cmake/BuildFlags.cmake
Lines changed: 5 additions & 3 deletions

@@ -55,9 +55,11 @@ macro(set_build_flags)
   # SYCL headers, such as deprecated warnings, even if warned API is not actually used in the program.
   # We expect that this issue will be addressed in the later version of DPC++ compiler. To workaround
   # the issue we wrap paths to SYCL headers in `-isystem`.
-  foreach(FLAGS IN LISTS SYCL_INCLUDE_DIR)
-    list(APPEND SYCL_HOST_FLAGS "-isystem ${FLAGS}")
-  endforeach()
+  if(SYCL_COMPILER_VERSION VERSION_LESS 20250300)
+    foreach(FLAGS IN LISTS SYCL_INCLUDE_DIR)
+      list(APPEND SYCL_HOST_FLAGS "-isystem ${FLAGS}")
+    endforeach()
+  endif()
   # Excluding warnings which flood the compilation output
   # TODO: fix warnings in the source code and then reenable them in compilation
   list(APPEND SYCL_HOST_FLAGS -Wno-sign-compare)

src/ATen/native/transformers/xpu/flash_attn/sycltla/kernel/xe_sdpa_fwd_bshd.h
Lines changed: 2 additions & 2 deletions

@@ -191,7 +191,7 @@ class FMHAPrefill {
   }

   // Find the length of the longest non masked sequence within that subgroup
-  int calculate_longest_non_masked_length(
+  CUTLASS_DEVICE int calculate_longest_non_masked_length(
       const int& seq_len_kv,
       const int& seq_len_qo,
       const int& last_seq_coord,
@@ -222,7 +222,7 @@
   }

   template <class Tensor>
-  void handle_corner_cases(
+  CUTLASS_DEVICE void handle_corner_cases(
       Tensor& tSr,
       const int& thread_idx,
       const int& SubgroupSize,
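
Both helpers are called from the kernel's device-side body, so this change gives them the same CUTLASS device-function qualifier as the rest of the class. Below is a minimal sketch of that convention, assuming cutlass/cutlass.h is on the include path and defines CUTLASS_DEVICE as it does upstream (forced inlining plus __device__ in CUDA builds; the SYCL port maps it to an equivalent inline qualifier); ExampleKernel and its members are hypothetical stand-ins, not the FMHAPrefill code:

#include <cutlass/cutlass.h>  // provides the CUTLASS_DEVICE macro

// Hypothetical kernel functor: a member helper invoked from operator()
// carries the same device-function qualifier as its caller.
struct ExampleKernel {
  // CUTLASS_DEVICE marks the helper for device compilation and forced
  // inlining (the exact expansion depends on the backend).
  CUTLASS_DEVICE int clamp_to_length(int idx, int len) const {
    return idx < len ? idx : len - 1;
  }

  CUTLASS_DEVICE void operator()(int idx, int len) const {
    int safe = clamp_to_length(idx, len);  // device-to-device call
    (void)safe;
  }
};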

src/ATen/native/transformers/xpu/flash_attn/sycltla/mha_bwd.cpp
Lines changed: 3 additions & 4 deletions

@@ -1461,18 +1461,17 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> flash_attention_backward_sycltla(
          .get_info<
              sycl::ext::oneapi::experimental::info::device::architecture>();
   constexpr auto supported_architectures =
-      std::array<sycl::ext::oneapi::experimental::architecture, 4>{
+      std::array<sycl::ext::oneapi::experimental::architecture, 3>{
           sycl::ext::oneapi::experimental::architecture::intel_gpu_pvc,
           sycl::ext::oneapi::experimental::architecture::intel_gpu_pvc_vg,
-          sycl::ext::oneapi::experimental::architecture::intel_gpu_bmg_g21,
-          sycl::ext::oneapi::experimental::architecture::intel_gpu_bmg_g31};
+          sycl::ext::oneapi::experimental::architecture::intel_gpu_bmg_g21};
   if (std::find(
           supported_architectures.begin(),
           supported_architectures.end(),
           device_architecture) == supported_architectures.end()) {
     TORCH_CHECK(
         false,
-        "XPU device architecture does not support flash attention backward. Supported architectures are: intel_gpu_pvc, intel_gpu_pvc_vg, intel_gpu_bmg_g21, intel_gpu_bmg_g31.");
+        "XPU device architecture does not support flash attention backward. Supported architectures are: intel_gpu_pvc, intel_gpu_pvc_vg, intel_gpu_bmg_g21.");
   }

   auto grad_query = at::empty_like(query);
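
For reference, a standalone sketch of the same architecture gate, runnable outside ATen, assuming a DPC++ compiler that implements the sycl_ext_oneapi_device_architecture extension (the get_info query and enum values are the ones used in the hunk above; the printed labels are illustrative only):

#include <sycl/sycl.hpp>
#include <algorithm>
#include <array>
#include <iostream>

int main() {
  namespace syclex = sycl::ext::oneapi::experimental;
  sycl::queue q{sycl::gpu_selector_v};

  // Query the architecture of the selected GPU device.
  auto arch = q.get_device().get_info<syclex::info::device::architecture>();

  // After this commit, intel_gpu_bmg_g31 is no longer in the supported set.
  constexpr std::array<syclex::architecture, 3> supported{
      syclex::architecture::intel_gpu_pvc,
      syclex::architecture::intel_gpu_pvc_vg,
      syclex::architecture::intel_gpu_bmg_g21};

  bool ok =
      std::find(supported.begin(), supported.end(), arch) != supported.end();
  std::cout << (ok ? "flash attention backward: supported"
                   : "flash attention backward: unsupported")
            << std::endl;
}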

src/ATen/native/transformers/xpu/flash_attn/sycltla/mha_fwd.cpp
Lines changed: 15 additions & 16 deletions

@@ -333,19 +333,19 @@ void run_mha_fwd_(
          TileShapeOutPut,
          SubgroupLayout,
          PipelineStages);
+    } else {
+      constexpr int PipelineStages = 2;
+      using TileShapeQK = Shape<_256, _32, _64>;
+      using TileShapePV = Shape<_256, _32, _32>;
+      using TileShapeOutPut = Shape<_256, _128, _32>;
+      using SubgroupLayout = Layout<Shape<_16, _1, _1>, Stride<_1, _1, _1>>;
+      run_mha_fwd_specialized(
+          TileShapeQK,
+          TileShapePV,
+          TileShapeOutPut,
+          SubgroupLayout,
+          PipelineStages);
     }
-
-    constexpr int PipelineStages = 2;
-    using TileShapeQK = Shape<_256, _32, _64>;
-    using TileShapePV = Shape<_256, _32, _32>;
-    using TileShapeOutPut = Shape<_256, _128, _32>;
-    using SubgroupLayout = Layout<Shape<_16, _1, _1>, Stride<_1, _1, _1>>;
-    run_mha_fwd_specialized(
-        TileShapeQK,
-        TileShapePV,
-        TileShapeOutPut,
-        SubgroupLayout,
-        PipelineStages);
   } else if (headdim == 192) {
     constexpr int PipelineStages = 2;
     using TileShapeQK = Shape<_256, _64, _64>;
@@ -537,18 +537,17 @@ flash_attention_forward_sycltla(
          .get_info<
              sycl::ext::oneapi::experimental::info::device::architecture>();
   constexpr auto supported_architectures =
-      std::array<sycl::ext::oneapi::experimental::architecture, 4>{
+      std::array<sycl::ext::oneapi::experimental::architecture, 3>{
           sycl::ext::oneapi::experimental::architecture::intel_gpu_pvc,
           sycl::ext::oneapi::experimental::architecture::intel_gpu_pvc_vg,
-          sycl::ext::oneapi::experimental::architecture::intel_gpu_bmg_g21,
-          sycl::ext::oneapi::experimental::architecture::intel_gpu_bmg_g31};
+          sycl::ext::oneapi::experimental::architecture::intel_gpu_bmg_g21};
   if (std::find(
           supported_architectures.begin(),
           supported_architectures.end(),
           device_architecture) == supported_architectures.end()) {
     TORCH_CHECK(
         false,
-        "XPU device architecture does not support flash attention. Supported architectures are: intel_gpu_pvc, intel_gpu_pvc_vg, intel_gpu_bmg_g21, intel_gpu_bmg_g31.");
+        "XPU device architecture does not support flash attention. Supported architectures are: intel_gpu_pvc, intel_gpu_pvc_vg, intel_gpu_bmg_g21.");
   }

   auto problem_shape = ProblemShapeRegular(
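
The first hunk moves the fallback tile configuration into the else branch of the preceding if, so the fallback run_mha_fwd_specialized call no longer runs in addition to the branch above it. A minimal sketch of that control-flow change, with a hypothetical condition and launcher name standing in for the real head-dim/tile-shape dispatch:

#include <cstdio>

// Hypothetical stand-in for run_mha_fwd_specialized.
void launch(const char* config) { std::printf("launching %s kernel\n", config); }

void dispatch(bool prefers_wide_tiles) {
  if (prefers_wide_tiles) {
    launch("wide-tile");
  } else {
    // Previously this configuration ran after the if-block regardless of
    // which path was taken; as the else branch it is now the exclusive
    // alternative, so each call launches exactly one kernel.
    launch("default-tile");
  }
}

int main() {
  dispatch(true);   // one launch
  dispatch(false);  // one launch
}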

src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp
Lines changed: 2 additions & 2 deletions

@@ -695,15 +695,15 @@ struct AdaptiveAvgPool2dKernelFunctor_cl {
         numel_(numel) {}

  private:
+  vec_t* output_;
+  const vec_t* input_;
   int ih_;
   int iw_;
   int ob_;
   int oc_;
   int oh_;
   int ow_;
   int64_t numel_;
-  const vec_t* input_;
-  vec_t* output_;
 };

 #define LAUNCH_AVGPOOL_CHANNEL_LAST_VEC( \
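
Moving output_ and input_ ahead of the scalar members likely keeps the declaration order in line with the constructor's initializer list: C++ always constructs non-static data members in declaration order, not in initializer-list order, and a mismatch triggers -Wreorder and can let an initializer read a not-yet-initialized member. A small self-contained illustration with a hypothetical struct (not the ATen functor):

#include <iostream>

struct Reordered {
  // The initializer list is written b_(b), a_(a), but members are still
  // constructed in declaration order: a_ first, then b_.
  Reordered(int a, int b) : b_(b), a_(a) {}  // typically warns: -Wreorder

  int a_;
  int b_;
};

int main() {
  Reordered r{1, 2};
  std::cout << r.a_ << " " << r.b_ << std::endl;  // prints "1 2"
}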

src/ATen/native/xpu/sycl/CopyKernel.cpp
Lines changed: 11 additions & 0 deletions

@@ -25,6 +25,17 @@ struct CastScalarFunc {
   }
 };

+// TODO: Avoid using sycl::half to prevent the fp16->fp32->fp8 fusion
+// from incorrectly converting -0.0 to NaN. This temporary fix should
+// be removed once the compiler/driver error is resolved.
+template <typename Float8DataType>
+struct CastScalarFunc<Half, Float8DataType> {
+  Float8DataType operator()(Half src_val) const {
+    Half val = src_val == Half(-0.0) ? Half(0.0) : src_val;
+    return Float8DataType(val);
+  }
+};
+
 void float8_copy_kernel_xpu(TensorIteratorBase& iter) {
   ScalarType dtype = iter.dtype(0);
   ScalarType other_dtype = iter.dtype(1);