Commit d9acf0f

Update the syntax of value retrieval for float2; skip the check for __CUDA_ARCH__ < 1000
1 parent 6e7ab36 commit d9acf0f

3 files changed: +17, -21 lines
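
For context on the float2 change below: the old code declared a Clang extended vector (typedef float __attribute__((ext_vector_type(2))) float2_t;) whose lanes are read with operator[], while HIP's built-in float2 is a plain struct whose lanes are the named members .x and .y. A minimal standalone sketch of the two access styles; the sum_lanes_* helpers are hypothetical and only illustrate the syntax swap:

```cpp
#include <hip/hip_runtime.h>

// Old style: Clang extended vector type, lanes indexed like an array.
typedef float __attribute__((ext_vector_type(2))) float2_t;

__device__ float sum_lanes_old(float2_t v) {
  return v[0] + v[1];  // lane access via operator[]
}

// New style: HIP's built-in float2 struct, lanes are named members.
__device__ float sum_lanes_new(float2 v) {
  return v.x + v.y;  // lane access via .x / .y
}
```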

apps/nccl/src/allreduce.cu

Lines changed: 2 additions & 2 deletions
@@ -71,7 +71,7 @@ struct NvlsAdapter {
                     mscclpp::DeviceHandle<mscclpp::SwitchChannel>* nvlsOutChannels, size_t channelInOffset,
                     size_t channelOutOffset, size_t, int rank, int nRanksPerNode, int, size_t nelems,
                     cudaStream_t stream, uint32_t*, uint32_t*, uint32_t*, uint32_t) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 1000
+#if defined(__CUDA_ARCH__)  // Skip the __CUDA_ARCH__ < 1000 check since FP8 is not supported for NVLS
     if constexpr (std::is_same_v<T, __fp8_e4m3> || std::is_same_v<T, __fp8_e5m2>) {
       return cudaErrorNotSupported;
     } else
@@ -95,7 +95,7 @@ struct NvlsWithCopyAdapter {
                     mscclpp::DeviceHandle<mscclpp::SwitchChannel>*, size_t, size_t, size_t scratchBufferSize,
                     int rank, int nRanksPerNode, int, size_t nelems, cudaStream_t stream, uint32_t*, uint32_t*,
                     uint32_t*, uint32_t) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 1000
+#if defined(__CUDA_ARCH__)  // Skip the __CUDA_ARCH__ < 1000 check since FP8 is not supported for NVLS
     if constexpr (std::is_same_v<T, __fp8_e4m3> || std::is_same_v<T, __fp8_e5m2>) {
       return cudaErrorNotSupported;
     } else
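
The guard above now rejects FP8 element types in every device compilation pass rather than only on pre-sm_100 architectures, since NVLS does not support FP8 at all. A standalone sketch of this compile-time dispatch pattern, with illustrative type and function names (not the commit's own):

```cpp
#include <cuda_runtime.h>
#include <type_traits>

struct fp8_e4m3;  // stand-ins for the real FP8 element types
struct fp8_e5m2;

template <typename T>
cudaError_t launchNvlsAllreduce(cudaStream_t stream, size_t nelems) {
#if defined(__CUDA_ARCH__)
  // Any device pass: FP8 is rejected unconditionally, because NVLS
  // cannot reduce FP8 elements (hence the old "&& __CUDA_ARCH__ < 1000"
  // qualifier was dropped).
  if constexpr (std::is_same_v<T, fp8_e4m3> || std::is_same_v<T, fp8_e5m2>) {
    return cudaErrorNotSupported;
  } else
#endif
  {
    // ... enqueue the NVLS reduction on `stream` for supported types ...
    return cudaSuccess;
  }
}
```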

apps/nccl/src/allreduce.hpp

Lines changed: 11 additions & 15 deletions
@@ -149,13 +149,12 @@ template <bool UseClip = true>
 __forceinline__ __device__ __fp8_e4m3 add_elements(__fp8_e4m3 a, __fp8_e4m3 b) {
 #if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__)
   // Optimized assembly for gfx942
-  typedef float __attribute__((ext_vector_type(2))) float2_t;
-  float2_t v;
+  float2 v;
   uint32_t ival = 0;
   asm volatile("v_pk_add_f32 %0, %1, %2"
                : "=v"(v)
                : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b.__x, 0)));
-  return __builtin_amdgcn_cvt_pk_fp8_f32(v[0], v[0], ival, false);
+  return __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.x, ival, false);
 #elif !defined(__HIP_PLATFORM_AMD__)
   // NVIDIA CUDA FP8 addition (CUDA 11.8+)
   __fp8_e4m3 result = __fp8_e4m3(__hadd(__half(a), __half(b)));
@@ -171,13 +170,12 @@ __forceinline__ __device__ __fp8_e4m3 add_elements(__fp8_e4m3 a, __fp8_e4m3 b) {
 template <bool UseClip = true>
 __forceinline__ __device__ __fp8x2_e4m3 add_elements(__fp8x2_e4m3 a, __fp8x2_e4m3 b) {
 #if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__)
-  typedef float __attribute__((ext_vector_type(2))) float2_t;
-  float2_t v;
+  float2 v;
   uint32_t ival = 0;
   asm volatile("v_pk_add_f32 %0, %1, %2"
                : "=v"(v)
                : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b, 0)));
-  return __builtin_amdgcn_cvt_pk_fp8_f32(v[0], v[1], ival, false);
+  return __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.y, ival, false);
 #elif !defined(__HIP_PLATFORM_AMD__)
   // CUDA: Convert to half2, add using optimized __hadd2, convert back
   __fp8x2_e4m3 result = __fp8x2_e4m3(__hadd2(__half2(a), __half2(b)));
@@ -215,13 +213,12 @@ template <bool UseClip = true>
 __forceinline__ __device__ __fp8_e5m2 add_elements(__fp8_e5m2 a, __fp8_e5m2 b) {
 #if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__)
   // Optimized assembly for gfx942 (bfloat8)
-  typedef float __attribute__((ext_vector_type(2))) float2_t;
-  float2_t v;
+  float2 v;
   uint32_t ival = 0;
   asm volatile("v_pk_add_f32 %0, %1, %2"
                : "=v"(v)
                : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b.__x, 0)));
-  return __builtin_amdgcn_cvt_pk_bf8_f32(v[0], v[0], ival, false);
+  return __builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.x, ival, false);
 #elif !defined(__HIP_PLATFORM_AMD__)
   // NVIDIA CUDA FP8 addition
   __fp8_e5m2 result = __fp8_e5m2(__hadd(__half(a), __half(b)));
@@ -374,21 +371,20 @@ __forceinline__ __device__ int add_fp8x4_hip(int a, int b) {
   uint32_t a32 = static_cast<uint32_t>(a);
   uint32_t b32 = static_cast<uint32_t>(b);
 
-  typedef float __attribute__((ext_vector_type(2))) float2_t;
-  float2_t v_low, v_high;
+  float2 v_low, v_high;
   uint32_t ival = 0;
 
   if constexpr (std::is_same_v<ScalarT, __fp8_e4m3>) {
     // E4M3 using fp8 conversion - process low word (false) and high word (true)
     asm volatile("v_pk_add_f32 %0, %1, %2"
                  : "=v"(v_low)
                  : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a32, false)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b32, false)));
-    uint16_t result_low = __builtin_amdgcn_cvt_pk_fp8_f32(v_low[0], v_low[1], ival, false);
+    uint16_t result_low = __builtin_amdgcn_cvt_pk_fp8_f32(v_low.x, v_low.y, ival, false);
 
     asm volatile("v_pk_add_f32 %0, %1, %2"
                  : "=v"(v_high)
                  : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a32, true)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b32, true)));
-    uint16_t result_high = __builtin_amdgcn_cvt_pk_fp8_f32(v_high[0], v_high[1], ival, false);
+    uint16_t result_high = __builtin_amdgcn_cvt_pk_fp8_f32(v_high.x, v_high.y, ival, false);
 
     uint32_t result = (static_cast<uint32_t>(result_high) << 16) | result_low;
     return static_cast<int>(result);
@@ -397,12 +393,12 @@ __forceinline__ __device__ int add_fp8x4_hip(int a, int b) {
     asm volatile("v_pk_add_f32 %0, %1, %2"
                  : "=v"(v_low)
                  : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a32, false)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b32, false)));
-    uint16_t result_low = __builtin_amdgcn_cvt_pk_bf8_f32(v_low[0], v_low[1], ival, false);
+    uint16_t result_low = __builtin_amdgcn_cvt_pk_bf8_f32(v_low.x, v_low.y, ival, false);
 
     asm volatile("v_pk_add_f32 %0, %1, %2"
                  : "=v"(v_high)
                  : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a32, true)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b32, true)));
-    uint16_t result_high = __builtin_amdgcn_cvt_pk_bf8_f32(v_high[0], v_high[1], ival, false);
+    uint16_t result_high = __builtin_amdgcn_cvt_pk_bf8_f32(v_high.x, v_high.y, ival, false);
 
     uint32_t result = (static_cast<uint32_t>(result_high) << 16) | result_low;
     return static_cast<int>(result);
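
In add_fp8x4_hip above, each v_pk_add_f32 yields two float lanes (now read as .x and .y) that are converted back into a 16-bit pair of FP8 bytes, and the low and high pairs are merged into one 32-bit word. A hypothetical host-side illustration of just that final pack/merge arithmetic, with no HIP builtins involved:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Pretend these are the two 16-bit results from the low-word and
  // high-word conversions (each holds two packed FP8 bytes).
  uint16_t result_low = 0x3A2B;
  uint16_t result_high = 0x4C5D;

  // Merge back into one 32-bit word holding four FP8 values,
  // mirroring the return path of add_fp8x4_hip.
  uint32_t result = (static_cast<uint32_t>(result_high) << 16) | result_low;
  printf("packed: 0x%08X\n", static_cast<unsigned>(result));  // packed: 0x4C5D3A2B
  return 0;
}
```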

src/include/execution_kernel.hpp

Lines changed: 4 additions & 4 deletions
@@ -62,7 +62,7 @@ MSCCLPP_DEVICE_INLINE __fp8_e4m3 add_elements(__fp8_e4m3 a, __fp8_e4m3 b) {
   asm volatile("v_pk_add_f32 %0, %1, %2"
                : "=v"(v)
                : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b.__x, 0)));
-  return __builtin_amdgcn_cvt_pk_fp8_f32(v[0], v[0], ival, false);
+  return __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.x, ival, false);
 #else
   return __fp8_e4m3(__hadd(__half(a), __half(b)));
 #endif
@@ -78,7 +78,7 @@ MSCCLPP_DEVICE_INLINE __fp8_e5m2 add_elements(__fp8_e5m2 a, __fp8_e5m2 b) {
   asm volatile("v_pk_add_f32 %0, %1, %2"
                : "=v"(v)
                : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b.__x, 0)));
-  return __builtin_amdgcn_cvt_pk_bf8_f32(v[0], v[0], ival, false);
+  return __builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.x, ival, false);
 #else
   return __fp8_e5m2(__hadd(__half(a), __half(b)));
 #endif
@@ -95,7 +95,7 @@ MSCCLPP_DEVICE_INLINE uint16_t add_fp8x2_e4m3(uint16_t a, uint16_t b) {
   asm volatile("v_pk_add_f32 %0, %1, %2"
                : "=v"(v)
                : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b, 0)));
-  return __builtin_amdgcn_cvt_pk_fp8_f32(v[0], v[1], ival, false);
+  return __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.y, ival, false);
 }
 
 // E4M3 vectorized addition for 4 elements
@@ -116,7 +116,7 @@ MSCCLPP_DEVICE_INLINE uint16_t add_fp8x2_e5m2(uint16_t a, uint16_t b) {
   asm volatile("v_pk_add_f32 %0, %1, %2"
                : "=v"(v)
                : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b, 0)));
-  return __builtin_amdgcn_cvt_pk_bf8_f32(v[0], v[1], ival, false);
+  return __builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.y, ival, false);
 }
 
 // E5M2 vectorized addition for 4 elements
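
On the NVIDIA side (the #else branches above), the scalar fallback widens each FP8 operand to half precision, adds with __hadd, and narrows the sum back to FP8. A minimal sketch of that round trip, assuming CUDA 11.8+ and the cuda_fp8.h types; __fp8_e4m3 in this repo is presumably an alias of __nv_fp8_e4m3:

```cpp
#include <cuda_fp8.h>
#include <cuda_fp16.h>

// Widen both FP8 operands to __half, add in half precision,
// then narrow back to FP8 (the narrowing conversion rounds).
__device__ __nv_fp8_e4m3 add_fp8(__nv_fp8_e4m3 a, __nv_fp8_e4m3 b) {
  return __nv_fp8_e4m3(__hadd(__half(a), __half(b)));
}
```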
