@@ -149,13 +149,12 @@ template <bool UseClip = true>
149149__forceinline__ __device__ __fp8_e4m3 add_elements (__fp8_e4m3 a, __fp8_e4m3 b) {
150150#if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__)
151151 // Optimized assembly for gfx942
152- typedef float __attribute__ ((ext_vector_type (2 ))) float2_t ;
153- float2_t v;
152+ float2 v;
154153 uint32_t ival = 0 ;
155154 asm volatile (" v_pk_add_f32 %0, %1, %2"
156155 : " =v" (v)
157156 : " v" (__builtin_amdgcn_cvt_pk_f32_fp8 (a.__x , 0 )), " v" (__builtin_amdgcn_cvt_pk_f32_fp8 (b.__x , 0 )));
158- return __builtin_amdgcn_cvt_pk_fp8_f32 (v[ 0 ] , v[ 0 ] , ival, false );
157+ return __builtin_amdgcn_cvt_pk_fp8_f32 (v. x , v. x , ival, false );
159158#elif !defined(__HIP_PLATFORM_AMD__)
160159 // NVIDIA CUDA FP8 addition (CUDA 11.8+)
161160 __fp8_e4m3 result = __fp8_e4m3 (__hadd (__half (a), __half (b)));
@@ -171,13 +170,12 @@ __forceinline__ __device__ __fp8_e4m3 add_elements(__fp8_e4m3 a, __fp8_e4m3 b) {
171170template <bool UseClip = true >
172171__forceinline__ __device__ __fp8x2_e4m3 add_elements (__fp8x2_e4m3 a, __fp8x2_e4m3 b) {
173172#if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__)
174- typedef float __attribute__ ((ext_vector_type (2 ))) float2_t ;
175- float2_t v;
173+ float2 v;
176174 uint32_t ival = 0 ;
177175 asm volatile (" v_pk_add_f32 %0, %1, %2"
178176 : " =v" (v)
179177 : " v" (__builtin_amdgcn_cvt_pk_f32_fp8 (a, 0 )), " v" (__builtin_amdgcn_cvt_pk_f32_fp8 (b, 0 )));
180- return __builtin_amdgcn_cvt_pk_fp8_f32 (v[ 0 ] , v[ 1 ] , ival, false );
178+ return __builtin_amdgcn_cvt_pk_fp8_f32 (v. x , v. y , ival, false );
181179#elif !defined(__HIP_PLATFORM_AMD__)
182180 // CUDA: Convert to half2, add using optimized __hadd2, convert back
183181 __fp8x2_e4m3 result = __fp8x2_e4m3 (__hadd2 (__half2 (a), __half2 (b)));
@@ -215,13 +213,12 @@ template <bool UseClip = true>
215213__forceinline__ __device__ __fp8_e5m2 add_elements (__fp8_e5m2 a, __fp8_e5m2 b) {
216214#if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__)
217215 // Optimized assembly for gfx942 (bfloat8)
218- typedef float __attribute__ ((ext_vector_type (2 ))) float2_t ;
219- float2_t v;
216+ float2 v;
220217 uint32_t ival = 0 ;
221218 asm volatile (" v_pk_add_f32 %0, %1, %2"
222219 : " =v" (v)
223220 : " v" (__builtin_amdgcn_cvt_pk_f32_bf8 (a.__x , 0 )), " v" (__builtin_amdgcn_cvt_pk_f32_bf8 (b.__x , 0 )));
224- return __builtin_amdgcn_cvt_pk_bf8_f32 (v[ 0 ] , v[ 0 ] , ival, false );
221+ return __builtin_amdgcn_cvt_pk_bf8_f32 (v. x , v. x , ival, false );
225222#elif !defined(__HIP_PLATFORM_AMD__)
226223 // NVIDIA CUDA FP8 addition
227224 __fp8_e5m2 result = __fp8_e5m2 (__hadd (__half (a), __half (b)));
@@ -374,21 +371,20 @@ __forceinline__ __device__ int add_fp8x4_hip(int a, int b) {
374371 uint32_t a32 = static_cast <uint32_t >(a);
375372 uint32_t b32 = static_cast <uint32_t >(b);
376373
377- typedef float __attribute__ ((ext_vector_type (2 ))) float2_t ;
378- float2_t v_low, v_high;
374+ float2 v_low, v_high;
379375 uint32_t ival = 0 ;
380376
381377 if constexpr (std::is_same_v<ScalarT, __fp8_e4m3>) {
382378 // E4M3 using fp8 conversion - process low word (false) and high word (true)
383379 asm volatile (" v_pk_add_f32 %0, %1, %2"
384380 : " =v" (v_low)
385381 : " v" (__builtin_amdgcn_cvt_pk_f32_fp8 (a32, false )), " v" (__builtin_amdgcn_cvt_pk_f32_fp8 (b32, false )));
386- uint16_t result_low = __builtin_amdgcn_cvt_pk_fp8_f32 (v_low[ 0 ] , v_low[ 1 ] , ival, false );
382+ uint16_t result_low = __builtin_amdgcn_cvt_pk_fp8_f32 (v_low. x , v_low. y , ival, false );
387383
388384 asm volatile (" v_pk_add_f32 %0, %1, %2"
389385 : " =v" (v_high)
390386 : " v" (__builtin_amdgcn_cvt_pk_f32_fp8 (a32, true )), " v" (__builtin_amdgcn_cvt_pk_f32_fp8 (b32, true )));
391- uint16_t result_high = __builtin_amdgcn_cvt_pk_fp8_f32 (v_high[ 0 ] , v_high[ 1 ] , ival, false );
387+ uint16_t result_high = __builtin_amdgcn_cvt_pk_fp8_f32 (v_high. x , v_high. y , ival, false );
392388
393389 uint32_t result = (static_cast <uint32_t >(result_high) << 16 ) | result_low;
394390 return static_cast <int >(result);
@@ -397,12 +393,12 @@ __forceinline__ __device__ int add_fp8x4_hip(int a, int b) {
397393 asm volatile (" v_pk_add_f32 %0, %1, %2"
398394 : " =v" (v_low)
399395 : " v" (__builtin_amdgcn_cvt_pk_f32_bf8 (a32, false )), " v" (__builtin_amdgcn_cvt_pk_f32_bf8 (b32, false )));
400- uint16_t result_low = __builtin_amdgcn_cvt_pk_bf8_f32 (v_low[ 0 ] , v_low[ 1 ] , ival, false );
396+ uint16_t result_low = __builtin_amdgcn_cvt_pk_bf8_f32 (v_low. x , v_low. y , ival, false );
401397
402398 asm volatile (" v_pk_add_f32 %0, %1, %2"
403399 : " =v" (v_high)
404400 : " v" (__builtin_amdgcn_cvt_pk_f32_bf8 (a32, true )), " v" (__builtin_amdgcn_cvt_pk_f32_bf8 (b32, true )));
405- uint16_t result_high = __builtin_amdgcn_cvt_pk_bf8_f32 (v_high[ 0 ] , v_high[ 1 ] , ival, false );
401+ uint16_t result_high = __builtin_amdgcn_cvt_pk_bf8_f32 (v_high. x , v_high. y , ival, false );
406402
407403 uint32_t result = (static_cast <uint32_t >(result_high) << 16 ) | result_low;
408404 return static_cast <int >(result);
0 commit comments