-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[X86] Lower mathlib call ldexp into scalef when avx512 is enabled #166839
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
|
@llvm/pr-subscribers-backend-x86 Author: Kavin Gnanapandithan (KavinTheG) Changes: Resolves issue #165694. Patch is 49.73 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/166839.diff 3 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 133406bd8e0d7..f9e9bb26638d4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2590,6 +2590,26 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
}
+ if (Subtarget.hasAVX512()) {
+ for (MVT VT : { MVT::f32, MVT::f64, MVT::v16f32, MVT::v8f64})
+ setOperationAction(ISD::FLDEXP, VT, Custom);
+
+ if (Subtarget.hasVLX()) {
+ for (MVT VT : { MVT::v4f32, MVT::v2f64, MVT::v8f32, MVT::v4f64 })
+ setOperationAction(ISD::FLDEXP, VT, Custom);
+
+ if (Subtarget.hasFP16()) {
+ for (MVT VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16 })
+ setOperationAction(ISD::FLDEXP, VT, Custom);
+ }
+ }
+
+ if (Subtarget.hasFP16()) {
+ for (MVT VT : { MVT::f16, MVT::v32f16 })
+ setOperationAction(ISD::FLDEXP, VT, Custom);
+ }
+ }
+
// On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
// is. We should promote the value to 64-bits to solve this.
// This is what the CRT headers do - `fmodf` is an inline header
@@ -19142,6 +19162,58 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
return SDValue();
}
+static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ SDValue X = Op.getOperand(0);
+ MVT XTy = X.getSimpleValueType();
+ SDValue Exp = Op.getOperand(1);
+ MVT XVT, ExpVT;
+
+ switch (XTy.SimpleTy) {
+ default:
+ return SDValue();
+ case MVT::f16:
+ if (Subtarget.hasFP16()) {
+ XVT = Subtarget.hasVLX() ? MVT::v8f16 : MVT::v32f16;
+ ExpVT = XVT;
+ break;
+ }
+ X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
+ [[fallthrough]];
+ case MVT::f32:
+ XVT = MVT::v4f32;
+ ExpVT = MVT::v4f32;
+ break;
+ case MVT::f64:
+ XVT = MVT::v2f64;
+ ExpVT = MVT::v2f64;
+ break;
+ case MVT::v4f32:
+ case MVT::v2f64:
+ case MVT::v8f32:
+ case MVT::v4f64:
+ case MVT::v16f32:
+ case MVT::v8f64:
+ Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
+ return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp, X);
+ }
+
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+ Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp);
+ SDValue VX =
+ DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
+ SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
+ DAG.getUNDEF(ExpVT), Exp, Zero);
+ SDValue Scalef = DAG.getNode(X86ISD::SCALEFS, DL, XVT, VX, VExp, VX);
+ SDValue Final =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), Scalef, Zero);
+ if (X.getValueType() != XTy)
+ Final = DAG.getNode(ISD::FP_ROUND, DL, XTy, Final,
+ DAG.getIntPtrConstant(1, SDLoc(Op)));
+ return Final;
+}
+
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
@@ -33672,6 +33744,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
+ case ISD::FLDEXP: return LowerFLDEXP(Op, Subtarget, DAG);
// clang-format on
}
}
diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
index 81529aff39ff1..499695f408396 100644
--- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -79,38 +79,64 @@ define <4 x float> @fmul_pow2_ldexp_4xfloat(<4 x i32> %i) {
; CHECK-SSE-NEXT: .cfi_def_cfa_offset 8
; CHECK-SSE-NEXT: retq
;
-; CHECK-AVX-LABEL: fmul_pow2_ldexp_4xfloat:
-; CHECK-AVX: # %bb.0:
-; CHECK-AVX-NEXT: subq $40, %rsp
-; CHECK-AVX-NEXT: .cfi_def_cfa_offset 48
-; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX-NEXT: vextractps $1, %xmm0, %edi
-; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX-NEXT: callq ldexpf@PLT
-; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-AVX-NEXT: vmovd %xmm0, %edi
-; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX-NEXT: callq ldexpf@PLT
-; CHECK-AVX-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-AVX-NEXT: vextractps $2, %xmm0, %edi
-; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX-NEXT: callq ldexpf@PLT
-; CHECK-AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
-; CHECK-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
-; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-AVX-NEXT: vextractps $3, %xmm0, %edi
-; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX-NEXT: callq ldexpf@PLT
-; CHECK-AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
-; CHECK-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; CHECK-AVX-NEXT: addq $40, %rsp
-; CHECK-AVX-NEXT: .cfi_def_cfa_offset 8
-; CHECK-AVX-NEXT: retq
+; CHECK-AVX2-LABEL: fmul_pow2_ldexp_4xfloat:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: subq $40, %rsp
+; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 48
+; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-AVX2-NEXT: vextractps $1, %xmm0, %edi
+; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-AVX2-NEXT: callq ldexpf@PLT
+; CHECK-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-AVX2-NEXT: vmovd %xmm0, %edi
+; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-AVX2-NEXT: callq ldexpf@PLT
+; CHECK-AVX2-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; CHECK-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-AVX2-NEXT: vextractps $2, %xmm0, %edi
+; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-AVX2-NEXT: callq ldexpf@PLT
+; CHECK-AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-AVX2-NEXT: vextractps $3, %xmm0, %edi
+; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-AVX2-NEXT: callq ldexpf@PLT
+; CHECK-AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-AVX2-NEXT: addq $40, %rsp
+; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-ONLY-AVX512F-LABEL: fmul_pow2_ldexp_4xfloat:
+; CHECK-ONLY-AVX512F: # %bb.0:
+; CHECK-ONLY-AVX512F-NEXT: vcvtdq2ps %xmm0, %xmm1
+; CHECK-ONLY-AVX512F-NEXT: vmovss {{.*#+}} xmm2 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm1, %xmm2, %xmm1
+; CHECK-ONLY-AVX512F-NEXT: vshufps {{.*#+}} xmm3 = xmm0[1,1,1,1]
+; CHECK-ONLY-AVX512F-NEXT: vcvtdq2ps %xmm3, %xmm3
+; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm3, %xmm2, %xmm3
+; CHECK-ONLY-AVX512F-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; CHECK-ONLY-AVX512F-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; CHECK-ONLY-AVX512F-NEXT: vcvtdq2ps %xmm3, %xmm3
+; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm3, %xmm2, %xmm3
+; CHECK-ONLY-AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; CHECK-ONLY-AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; CHECK-ONLY-AVX512F-NEXT: vcvtdq2ps %xmm0, %xmm0
+; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm0, %xmm2, %xmm0
+; CHECK-ONLY-AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-ONLY-AVX512F-NEXT: retq
+;
+; CHECK-SKX-LABEL: fmul_pow2_ldexp_4xfloat:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: vcvtdq2ps %xmm0, %xmm0
+; CHECK-SKX-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.0E+0,9.0E+0,9.0E+0,9.0E+0]
+; CHECK-SKX-NEXT: vscalefps %xmm0, %xmm1, %xmm0
+; CHECK-SKX-NEXT: retq
%r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, <4 x i32> %i)
ret <4 x float> %r
}
@@ -560,82 +586,109 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8
; CHECK-AVX2-NEXT: retq
;
-; CHECK-AVX512F-LABEL: fmul_pow2_ldexp_8xhalf:
-; CHECK-AVX512F: # %bb.0:
-; CHECK-AVX512F-NEXT: subq $72, %rsp
-; CHECK-AVX512F-NEXT: .cfi_def_cfa_offset 80
-; CHECK-AVX512F-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vpextrw $7, %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf@PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT: vpextrw $6, %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf@PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT: vpextrw $5, %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf@PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT: vpextrw $4, %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf@PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-AVX512F-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT: vpextrw $3, %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf@PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT: vpextrw $2, %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf@PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT: vpextrw $1, %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf@PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf@PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-AVX512F-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0]
-; CHECK-AVX512F-NEXT: addq $72, %rsp
-; CHECK-AVX512F-NEXT: .cfi_def_cfa_offset 8
-; CHECK-AVX512F-NEXT: retq
+; CHECK-ONLY-AVX512F-LABEL: fmul_pow2_ldexp_8xhalf:
+; CHECK-ONLY-AVX512F: # %bb.0:
+; CHECK-ONLY-AVX512F-NEXT: vpextrw $7, %xmm0, %eax
+; CHECK-ONLY-AVX512F-NEXT: cwtl
+; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm2
+; CHECK-ONLY-AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm2, %xmm1, %xmm2
+; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; CHECK-ONLY-AVX512F-NEXT: vpextrw $6, %xmm0, %eax
+; CHECK-ONLY-AVX512F-NEXT: cwtl
+; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm3
+; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm3, %xmm1, %xmm3
+; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; CHECK-ONLY-AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; CHECK-ONLY-AVX512F-NEXT: vpextrw $5, %xmm0, %eax
+; CHECK-ONLY-AVX512F-NEXT: cwtl
+; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm3
+; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm3, %xmm1, %xmm3
+; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; CHECK-ONLY-AVX512F-NEXT: vpextrw $4, %xmm0, %eax
+; CHECK-ONLY-AVX512F-NEXT: cwtl
+; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm4
+; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm4, %xmm1, %xmm4
+; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; CHECK-ONLY-AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; CHECK-ONLY-AVX512F-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-ONLY-AVX512F-NEXT: vpextrw $3, %xmm0, %eax
+; CHECK-ONLY-AVX512F-NEXT: cwtl
+; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm3
+; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm3, %xmm1, %xmm3
+; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; CHECK-ONLY-AVX512F-NEXT: vpextrw $2, %xmm0, %eax
+; CHECK-ONLY-AVX512F-NEXT: cwtl
+; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm4
+; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm4, %xmm1, %xmm4
+; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; CHECK-ONLY-AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; CHECK-ONLY-AVX512F-NEXT: vpextrw $1, %xmm0, %eax
+; CHECK-ONLY-AVX512F-NEXT: cwtl
+; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm4
+; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm4, %xmm1, %xmm4
+; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; CHECK-ONLY-AVX512F-NEXT: vmovd %xmm0, %eax
+; CHECK-ONLY-AVX512F-NEXT: cwtl
+; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0
+; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm0, %xmm1, %xmm0
+; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-ONLY-AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; CHECK-ONLY-AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; CHECK-ONLY-AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; CHECK-ONLY-AVX512F-NEXT: retq
+;
+; CHECK-SKX-LABEL: fmul_pow2_ldexp_8xhalf:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: vpextrw $7, %xmm0, %eax
+; CHECK-SKX-NEXT: cwtl
+; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm1
+; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm2 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-SKX-NEXT: vscalefss %xmm1, %xmm2, %xmm1
+; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; CHECK-SKX-NEXT: vpextrw $6, %xmm0, %eax
+; CHECK-SKX-NEXT: cwtl
+; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm3
+; CHECK-SKX-NEXT: vscalefss %xmm3, %xmm2, %xmm3
+; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; CHECK-SKX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; CHECK-SKX-NEXT: vpextrw $5, %xmm0, %eax
+; CHECK-SKX-NEXT: cwtl
+; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm3
+; CHECK-SKX-NEXT: vscalefss %xmm3, %xmm2, %xmm3
+; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; CHECK-SKX-NEXT: vpextrw $4, %xmm0, %eax
+; CHECK-SKX-NEXT: cwtl
+; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm4
+; CHECK-SKX-NEXT: vscalefss %xmm4, %xmm2, %xmm4
+; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; CHECK-SKX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; CHECK-SKX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; CHECK-SKX-NEXT: vpextrw $3, %xmm0, %eax
+; CHECK-SKX-NEXT: cwtl
+; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm3
+; CHECK-SKX-NEXT: vscalefss %xmm3, %xmm2, %xmm3
+; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; CHECK-SKX-NEXT: vpextrw $2, %xmm0, %eax
+; CHECK-SKX-NEXT: cwtl
+; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm4
+; CHECK-SKX-NEXT: vscalefss %xmm4, %xmm2, %xmm4
+; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; CHECK-SKX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; CHECK-SKX-NEXT: vpextrw $1, %xmm0, %eax
+; CHECK-SKX-NEXT: cwtl
+; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm4
+; CHECK-SKX-NEXT: vscalefss %xmm4, %xmm2, %xmm4
+; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; CHECK-SKX-NEXT: vmovd %xmm0, %eax
+; CHECK-SKX-NEXT: cwtl
+; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0
+; CHECK-SKX-NEXT: vscalefss %xmm0, %xmm2, %xmm0
+; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-SKX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; CHECK-SKX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; CHECK-SKX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-SKX-NEXT: retq
%r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, <8 x i16> %i)
ret <8 x half> %r
}
@@ -1769,3 +1822,5 @@ define x86_fp80 @pr128528(i1 %cond) {
%mul = fmul x86_fp80 %conv, 0xK4007D055555555555800
ret x86_fp80 %mul
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below...
[truncated]
|
You can test this locally with the following command: git-clang-format --diff origin/main HEAD --extensions cpp -- llvm/lib/Target/X86/X86ISelLowering.cpp --diff_from_common_commit
View the diff from clang-format here.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a846b43d4..7b5bb9280 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2591,21 +2591,21 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
if (Subtarget.hasAVX512()) {
- for (MVT VT : { MVT::f32, MVT::f64, MVT::v16f32, MVT::v8f64})
+ for (MVT VT : {MVT::f32, MVT::f64, MVT::v16f32, MVT::v8f64})
setOperationAction(ISD::FLDEXP, VT, Custom);
-
+
if (Subtarget.hasVLX()) {
- for (MVT VT : { MVT::v4f32, MVT::v2f64, MVT::v8f32, MVT::v4f64 })
+ for (MVT VT : {MVT::v4f32, MVT::v2f64, MVT::v8f32, MVT::v4f64})
setOperationAction(ISD::FLDEXP, VT, Custom);
if (Subtarget.hasFP16()) {
- for (MVT VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16 })
+ for (MVT VT : {MVT::v8f16, MVT::v16f16, MVT::v32f16})
setOperationAction(ISD::FLDEXP, VT, Custom);
}
}
-
+
if (Subtarget.hasFP16()) {
- for (MVT VT : { MVT::f16, MVT::v32f16 })
+ for (MVT VT : {MVT::f16, MVT::v32f16})
setOperationAction(ISD::FLDEXP, VT, Custom);
}
}
@@ -19163,40 +19163,40 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
}
static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+ SelectionDAG &DAG) {
SDLoc DL(Op);
SDValue X = Op.getOperand(0);
MVT XTy = X.getSimpleValueType();
SDValue Exp = Op.getOperand(1);
- MVT XVT, ExpVT;
+ MVT XVT, ExpVT;
- switch (XTy.SimpleTy) {
- default:
- return SDValue();
- case MVT::f16:
- if (Subtarget.hasFP16()) {
- XVT = Subtarget.hasVLX() ? MVT::v8f16 : MVT::v32f16;
- ExpVT = XVT;
- break;
- }
- X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
- [[fallthrough]];
- case MVT::f32:
- XVT = MVT::v4f32;
- ExpVT = MVT::v4f32;
- break;
- case MVT::f64:
- XVT = MVT::v2f64;
- ExpVT = MVT::v2f64;
+ switch (XTy.SimpleTy) {
+ default:
+ return SDValue();
+ case MVT::f16:
+ if (Subtarget.hasFP16()) {
+ XVT = Subtarget.hasVLX() ? MVT::v8f16 : MVT::v32f16;
+ ExpVT = XVT;
break;
- case MVT::v4f32:
- case MVT::v2f64:
- case MVT::v8f32:
- case MVT::v4f64:
- case MVT::v16f32:
- case MVT::v8f64:
- Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
- return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp, X);
+ }
+ X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
+ [[fallthrough]];
+ case MVT::f32:
+ XVT = MVT::v4f32;
+ ExpVT = MVT::v4f32;
+ break;
+ case MVT::f64:
+ XVT = MVT::v2f64;
+ ExpVT = MVT::v2f64;
+ break;
+ case MVT::v4f32:
+ case MVT::v2f64:
+ case MVT::v8f32:
+ case MVT::v4f64:
+ case MVT::v16f32:
+ case MVT::v8f64:
+ Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
+ return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp, X);
}
SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
@@ -33747,7 +33747,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
case ISD::FLDEXP: return LowerFLDEXP(Op, Subtarget, DAG);
- // clang-format on
+ // clang-format on
}
}
|
RKSimon
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice - a few minors
|
|
||
| if (Subtarget.hasVLX()) { | ||
| for (MVT VT : { MVT::v4f32, MVT::v2f64, MVT::v8f32, MVT::v4f64 }) | ||
| setOperationAction(ISD::FLDEXP, VT, Custom); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We might be better off allowing these on non-VLX targets and just widening (with zero) in the upper elements to 512-bits
| } | ||
|
|
||
| static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget, | ||
| SelectionDAG &DAG) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
clang-format?
| SDValue Zero = DAG.getConstant(0, DL, MVT::i64); | ||
| Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp); | ||
| SDValue VX = | ||
| DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use DAG.getInsertVectorElt
| DAG.getUNDEF(ExpVT), Exp, Zero); | ||
| SDValue Scalef = DAG.getNode(X86ISD::SCALEFS, DL, XVT, VX, VExp, VX); | ||
| SDValue Final = | ||
| DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), Scalef, Zero); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
getExtractVectorElt
| setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom); | ||
| } | ||
|
|
||
| if (Subtarget.hasAVX512()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This should probably be in the main hasAVX512 block below
| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 | ||
| ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512 | ||
| ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512VL | ||
| ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512fp16 | FileCheck %s --check-prefixes=CHECK,AVX512VL |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512fp16 | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512FP16
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512VL,AVX512VLF
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512fp16 | FileCheck %s --check-prefixes=CHECK,AVX512VLFP16
Resolves #165694