Commit d9acf0f

Update the syntax of value retrieval for float2; skip the check for __CUDA_ARCH__ < 1000
1 parent 6e7ab36 commit d9acf0f

3 files changed: +17, -21 lines
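
For context on the float2 change below: the old code declared a Clang extended vector (typedef float __attribute__((ext_vector_type(2))) float2_t;) whose lanes are read with operator[], while HIP's built-in float2 is a plain struct whose lanes are the named members .x and .y. A minimal standalone sketch of the two access styles; the sum_lanes_* helpers are hypothetical and only illustrate the syntax swap:

```cpp
#include <hip/hip_runtime.h>

// Old style: Clang extended vector type, lanes indexed like an array.
typedef float __attribute__((ext_vector_type(2))) float2_t;

__device__ float sum_lanes_old(float2_t v) {
  return v[0] + v[1];  // lane access via operator[]
}

// New style: HIP's built-in float2 struct, lanes are named members.
__device__ float sum_lanes_new(float2 v) {
  return v.x + v.y;  // lane access via .x / .y
}
```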

apps/nccl/src/allreduce.cu

Lines changed: 2 additions & 2 deletions
@@ -71,7 +71,7 @@ struct NvlsAdapter {
                     mscclpp::DeviceHandle<mscclpp::SwitchChannel>* nvlsOutChannels, size_t channelInOffset,
                     size_t channelOutOffset, size_t, int rank, int nRanksPerNode, int, size_t nelems,
                     cudaStream_t stream, uint32_t*, uint32_t*, uint32_t*, uint32_t) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 1000
+#if defined(__CUDA_ARCH__)  // Skip the __CUDA_ARCH__ < 1000 check since FP8 is not supported for NVLS
     if constexpr (std::is_same_v<T, __fp8_e4m3> || std::is_same_v<T, __fp8_e5m2>) {
       return cudaErrorNotSupported;
     } else
@@ -95,7 +95,7 @@ struct NvlsWithCopyAdapter {
                     mscclpp::DeviceHandle<mscclpp::SwitchChannel>*, size_t, size_t, size_t scratchBufferSize,
                     int rank, int nRanksPerNode, int, size_t nelems, cudaStream_t stream, uint32_t*, uint32_t*,
                     uint32_t*, uint32_t) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 1000
+#if defined(__CUDA_ARCH__)  // Skip the __CUDA_ARCH__ < 1000 check since FP8 is not supported for NVLS
     if constexpr (std::is_same_v<T, __fp8_e4m3> || std::is_same_v<T, __fp8_e5m2>) {
       return cudaErrorNotSupported;
     } else
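
The guard above now rejects FP8 element types in every device compilation pass rather than only on pre-sm_100 architectures, since NVLS does not support FP8 at all. A standalone sketch of this compile-time dispatch pattern, with illustrative type and function names (not the commit's own):

```cpp
#include <cuda_runtime.h>
#include <type_traits>

struct fp8_e4m3;  // stand-ins for the real FP8 element types
struct fp8_e5m2;

template <typename T>
cudaError_t launchNvlsAllreduce(cudaStream_t stream, size_t nelems) {
#if defined(__CUDA_ARCH__)
  // Any device pass: FP8 is rejected unconditionally, because NVLS
  // cannot reduce FP8 elements (hence the old "&& __CUDA_ARCH__ < 1000"
  // qualifier was dropped).
  if constexpr (std::is_same_v<T, fp8_e4m3> || std::is_same_v<T, fp8_e5m2>) {
    return cudaErrorNotSupported;
  } else
#endif
  {
    // ... enqueue the NVLS reduction on `stream` for supported types ...
    return cudaSuccess;
  }
}
```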

apps/nccl/src/allreduce.hpp

Lines changed: 11 additions & 15 deletions
@@ -149,13 +149,12 @@ template <bool UseClip = true>
 __forceinline__ __device__ __fp8_e4m3 add_elements(__fp8_e4m3 a, __fp8_e4m3 b) {
 #if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__)
   // Optimized assembly for gfx942
-  typedef float __attribute__((ext_vector_type(2))) float2_t;
-  float2_t v;
+  float2 v;
   uint32_t ival = 0;
   asm volatile("v_pk_add_f32 %0, %1, %2"
                : "=v"(v)
                : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b.__x, 0)));
-  return __builtin_amdgcn_cvt_pk_fp8_f32(v[0], v[0], ival, false);
+  return __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.x, ival, false);
 #elif !defined(__HIP_PLATFORM_AMD__)
   // NVIDIA CUDA FP8 addition (CUDA 11.8+)
   __fp8_e4m3 result = __fp8_e4m3(__hadd(__half(a), __half(b)));
@@ -171,13 +170,12 @@ __forceinline__ __device__ __fp8_e4m3 add_elements(__fp8_e4m3 a, __fp8_e4m3 b) {
 template <bool UseClip = true>
 __forceinline__ __device__ __fp8x2_e4m3 add_elements(__fp8x2_e4m3 a, __fp8x2_e4m3 b) {
 #if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__)
-  typedef float __attribute__((ext_vector_type(2))) float2_t;
-  float2_t v;
+  float2 v;
   uint32_t ival = 0;
   asm volatile("v_pk_add_f32 %0, %1, %2"
                : "=v"(v)
                : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b, 0)));
-  return __builtin_amdgcn_cvt_pk_fp8_f32(v[0], v[1], ival, false);
+  return __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.y, ival, false);
 #elif !defined(__HIP_PLATFORM_AMD__)
   // CUDA: Convert to half2, add using optimized __hadd2, convert back
   __fp8x2_e4m3 result = __fp8x2_e4m3(__hadd2(__half2(a), __half2(b)));
@@ -215,13 +213,12 @@ template <bool UseClip = true>
 __forceinline__ __device__ __fp8_e5m2 add_elements(__fp8_e5m2 a, __fp8_e5m2 b) {
 #if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__)
   // Optimized assembly for gfx942 (bfloat8)
-  typedef float __attribute__((ext_vector_type(2))) float2_t;
-  float2_t v;
+  float2 v;
   uint32_t ival = 0;
   asm volatile("v_pk_add_f32 %0, %1, %2"
                : "=v"(v)
                : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b.__x, 0)));
-  return __builtin_amdgcn_cvt_pk_bf8_f32(v[0], v[0], ival, false);
+  return __builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.x, ival, false);
 #elif !defined(__HIP_PLATFORM_AMD__)
   // NVIDIA CUDA FP8 addition
   __fp8_e5m2 result = __fp8_e5m2(__hadd(__half(a), __half(b)));
@@ -374,21 +371,20 @@ __forceinline__ __device__ int add_fp8x4_hip(int a, int b) {
   uint32_t a32 = static_cast<uint32_t>(a);
   uint32_t b32 = static_cast<uint32_t>(b);
 
-  typedef float __attribute__((ext_vector_type(2))) float2_t;
-  float2_t v_low, v_high;
+  float2 v_low, v_high;
   uint32_t ival = 0;
 
   if constexpr (std::is_same_v<ScalarT, __fp8_e4m3>) {
     // E4M3 using fp8 conversion - process low word (false) and high word (true)
     asm volatile("v_pk_add_f32 %0, %1, %2"
                  : "=v"(v_low)
                  : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a32, false)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b32, false)));
-    uint16_t result_low = __builtin_amdgcn_cvt_pk_fp8_f32(v_low[0], v_low[1], ival, false);
+    uint16_t result_low = __builtin_amdgcn_cvt_pk_fp8_f32(v_low.x, v_low.y, ival, false);
 
     asm volatile("v_pk_add_f32 %0, %1, %2"
                  : "=v"(v_high)
                  : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a32, true)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b32, true)));
-    uint16_t result_high = __builtin_amdgcn_cvt_pk_fp8_f32(v_high[0], v_high[1], ival, false);
+    uint16_t result_high = __builtin_amdgcn_cvt_pk_fp8_f32(v_high.x, v_high.y, ival, false);
 
     uint32_t result = (static_cast<uint32_t>(result_high) << 16) | result_low;
     return static_cast<int>(result);
@@ -397,12 +393,12 @@ __forceinline__ __device__ int add_fp8x4_hip(int a, int b) {
     asm volatile("v_pk_add_f32 %0, %1, %2"
                  : "=v"(v_low)
                  : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a32, false)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b32, false)));
-    uint16_t result_low = __builtin_amdgcn_cvt_pk_bf8_f32(v_low[0], v_low[1], ival, false);
+    uint16_t result_low = __builtin_amdgcn_cvt_pk_bf8_f32(v_low.x, v_low.y, ival, false);
 
     asm volatile("v_pk_add_f32 %0, %1, %2"
                  : "=v"(v_high)
                  : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a32, true)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b32, true)));
-    uint16_t result_high = __builtin_amdgcn_cvt_pk_bf8_f32(v_high[0], v_high[1], ival, false);
+    uint16_t result_high = __builtin_amdgcn_cvt_pk_bf8_f32(v_high.x, v_high.y, ival, false);
 
     uint32_t result = (static_cast<uint32_t>(result_high) << 16) | result_low;
     return static_cast<int>(result);
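
In add_fp8x4_hip above, each v_pk_add_f32 yields two float lanes (now read as .x and .y) that are converted back into a 16-bit pair of FP8 bytes, and the low and high pairs are merged into one 32-bit word. A hypothetical host-side illustration of just that final pack/merge arithmetic, with no HIP builtins involved:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Pretend these are the two 16-bit results from the low-word and
  // high-word conversions (each holds two packed FP8 bytes).
  uint16_t result_low = 0x3A2B;
  uint16_t result_high = 0x4C5D;

  // Merge back into one 32-bit word holding four FP8 values,
  // mirroring the return path of add_fp8x4_hip.
  uint32_t result = (static_cast<uint32_t>(result_high) << 16) | result_low;
  printf("packed: 0x%08X\n", static_cast<unsigned>(result));  // packed: 0x4C5D3A2B
  return 0;
}
```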

src/include/execution_kernel.hpp

Lines changed: 4 additions & 4 deletions
@@ -62,7 +62,7 @@ MSCCLPP_DEVICE_INLINE __fp8_e4m3 add_elements(__fp8_e4m3 a, __fp8_e4m3 b) {
   asm volatile("v_pk_add_f32 %0, %1, %2"
                : "=v"(v)
                : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b.__x, 0)));
-  return __builtin_amdgcn_cvt_pk_fp8_f32(v[0], v[0], ival, false);
+  return __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.x, ival, false);
 #else
   return __fp8_e4m3(__hadd(__half(a), __half(b)));
 #endif
@@ -78,7 +78,7 @@ MSCCLPP_DEVICE_INLINE __fp8_e5m2 add_elements(__fp8_e5m2 a, __fp8_e5m2 b) {
   asm volatile("v_pk_add_f32 %0, %1, %2"
                : "=v"(v)
                : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b.__x, 0)));
-  return __builtin_amdgcn_cvt_pk_bf8_f32(v[0], v[0], ival, false);
+  return __builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.x, ival, false);
 #else
   return __fp8_e5m2(__hadd(__half(a), __half(b)));
 #endif
@@ -95,7 +95,7 @@ MSCCLPP_DEVICE_INLINE uint16_t add_fp8x2_e4m3(uint16_t a, uint16_t b) {
   asm volatile("v_pk_add_f32 %0, %1, %2"
                : "=v"(v)
                : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b, 0)));
-  return __builtin_amdgcn_cvt_pk_fp8_f32(v[0], v[1], ival, false);
+  return __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.y, ival, false);
 }
 
 // E4M3 vectorized addition for 4 elements
@@ -116,7 +116,7 @@ MSCCLPP_DEVICE_INLINE uint16_t add_fp8x2_e5m2(uint16_t a, uint16_t b) {
   asm volatile("v_pk_add_f32 %0, %1, %2"
                : "=v"(v)
                : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b, 0)));
-  return __builtin_amdgcn_cvt_pk_bf8_f32(v[0], v[1], ival, false);
+  return __builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.y, ival, false);
 }
 
 // E5M2 vectorized addition for 4 elements
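
On the NVIDIA side (the #else branches above), the scalar fallback widens each FP8 operand to half precision, adds with __hadd, and narrows the sum back to FP8. A minimal sketch of that round trip, assuming CUDA 11.8+ and the cuda_fp8.h types; __fp8_e4m3 in this repo is presumably an alias of __nv_fp8_e4m3:

```cpp
#include <cuda_fp8.h>
#include <cuda_fp16.h>

// Widen both FP8 operands to __half, add in half precision,
// then narrow back to FP8 (the narrowing conversion rounds).
__device__ __nv_fp8_e4m3 add_fp8(__nv_fp8_e4m3 a, __nv_fp8_e4m3 b) {
  return __nv_fp8_e4m3(__hadd(__half(a), __half(b)));
}
```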
