[llvm] 6c79cc7 - [X86] Lower mathlib call ldexp into scalef when avx512 is enabled (#166839)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 20 08:39:50 PST 2025
Author: Kavin Gnanapandithan
Date: 2025-11-20T16:39:45Z
New Revision: 6c79cc7ff7901e9c8a3a4e924abd0e9dbfae039c
URL: https://github.com/llvm/llvm-project/commit/6c79cc7ff7901e9c8a3a4e924abd0e9dbfae039c
DIFF: https://github.com/llvm/llvm-project/commit/6c79cc7ff7901e9c8a3a4e924abd0e9dbfae039c.diff
LOG: [X86] Lower mathlib call ldexp into scalef when avx512 is enabled (#166839)
Resolves #165694
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
llvm/test/CodeGen/X86/ldexp-avx512.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 81da77b88bfa0..dc84025c166a3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2098,9 +2098,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// pre-AVX512 equivalents. Without VLX we use 512-bit operations for
// narrower widths.
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
+ for (MVT VT : {MVT::f16, MVT::f32, MVT::f64, MVT::v8f16, MVT::v4f32,
+ MVT::v2f64, MVT::v16f16, MVT::v8f32, MVT::v4f64, MVT::v32f16,
+ MVT::v16f32, MVT::v8f64})
+ setOperationAction(ISD::FLDEXP, VT, Custom);
+
// These operations are handled on non-VLX by artificially widening in
// isel patterns.
-
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
@@ -19220,6 +19224,72 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
return SDValue();
}
+static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ SDValue X = Op.getOperand(0);
+ MVT XTy = X.getSimpleValueType();
+ SDValue Exp = Op.getOperand(1);
+
+ switch (XTy.SimpleTy) {
+ default:
+ return SDValue();
+ case MVT::f16:
+ if (!Subtarget.hasFP16())
+ X = DAG.getFPExtendOrRound(X, DL, MVT::f32);
+ [[fallthrough]];
+ case MVT::f32:
+ case MVT::f64: {
+ MVT VT = MVT::getVectorVT(X.getSimpleValueType(),
+ 128 / X.getSimpleValueType().getSizeInBits());
+ Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp);
+ SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, X);
+ SDValue VExp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Exp);
+ SDValue Scalefs = DAG.getNode(X86ISD::SCALEFS, DL, VT, VX, VExp);
+ SDValue Final = DAG.getExtractVectorElt(DL, X.getValueType(), Scalefs, 0);
+ return DAG.getFPExtendOrRound(Final, DL, XTy);
+ }
+ case MVT::v4f32:
+ case MVT::v2f64:
+ case MVT::v8f32:
+ case MVT::v4f64:
+ case MVT::v16f32:
+ case MVT::v8f64:
+ if (XTy.getSizeInBits() == 512 || Subtarget.hasVLX()) {
+ Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
+ return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
+ }
+ break;
+ case MVT::v8f16:
+ case MVT::v16f16:
+ if (Subtarget.hasFP16()) {
+ if (Subtarget.hasVLX()) {
+ Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
+ return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
+ }
+ break;
+ }
+ X = DAG.getFPExtendOrRound(X, DL, XTy.changeVectorElementType(MVT::f32));
+ Exp = DAG.getSExtOrTrunc(Exp, DL,
+ X.getSimpleValueType().changeTypeToInteger());
+ break;
+ case MVT::v32f16:
+ if (Subtarget.hasFP16()) {
+ Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
+ return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
+ }
+ return splitVectorOp(Op, DAG, DL);
+ }
+ SDValue WideX = widenSubVector(X, true, Subtarget, DAG, DL, 512);
+ SDValue WideExp = widenSubVector(Exp, true, Subtarget, DAG, DL, 512);
+ Exp = DAG.getNode(ISD::SINT_TO_FP, DL, WideExp.getSimpleValueType(), Exp);
+ SDValue Scalef =
+ DAG.getNode(X86ISD::SCALEF, DL, WideX.getValueType(), WideX, WideExp);
+ SDValue Final =
+ DAG.getExtractSubvector(DL, X.getSimpleValueType(), Scalef, 0);
+ return DAG.getFPExtendOrRound(Final, DL, XTy);
+}
+
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
@@ -33734,7 +33804,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
- // clang-format on
+ case ISD::FLDEXP: return LowerFLDEXP(Op, Subtarget, DAG);
+ // clang-format on
}
}
diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
index 19c84d42a7ea6..b655bda68f906 100644
--- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -79,38 +79,54 @@ define <4 x float> @fmul_pow2_ldexp_4xfloat(<4 x i32> %i) {
; CHECK-SSE-NEXT: .cfi_def_cfa_offset 8
; CHECK-SSE-NEXT: retq
;
-; CHECK-AVX-LABEL: fmul_pow2_ldexp_4xfloat:
-; CHECK-AVX: # %bb.0:
-; CHECK-AVX-NEXT: subq $40, %rsp
-; CHECK-AVX-NEXT: .cfi_def_cfa_offset 48
-; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX-NEXT: vextractps $1, %xmm0, %edi
-; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX-NEXT: callq ldexpf at PLT
-; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-AVX-NEXT: vmovd %xmm0, %edi
-; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX-NEXT: callq ldexpf at PLT
-; CHECK-AVX-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-AVX-NEXT: vextractps $2, %xmm0, %edi
-; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX-NEXT: callq ldexpf at PLT
-; CHECK-AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
-; CHECK-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
-; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-AVX-NEXT: vextractps $3, %xmm0, %edi
-; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX-NEXT: callq ldexpf at PLT
-; CHECK-AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
-; CHECK-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; CHECK-AVX-NEXT: addq $40, %rsp
-; CHECK-AVX-NEXT: .cfi_def_cfa_offset 8
-; CHECK-AVX-NEXT: retq
+; CHECK-AVX2-LABEL: fmul_pow2_ldexp_4xfloat:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: subq $40, %rsp
+; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 48
+; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-AVX2-NEXT: vextractps $1, %xmm0, %edi
+; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-AVX2-NEXT: callq ldexpf at PLT
+; CHECK-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-AVX2-NEXT: vmovd %xmm0, %edi
+; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-AVX2-NEXT: callq ldexpf at PLT
+; CHECK-AVX2-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; CHECK-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-AVX2-NEXT: vextractps $2, %xmm0, %edi
+; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-AVX2-NEXT: callq ldexpf at PLT
+; CHECK-AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-AVX2-NEXT: vextractps $3, %xmm0, %edi
+; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-AVX2-NEXT: callq ldexpf at PLT
+; CHECK-AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-AVX2-NEXT: addq $40, %rsp
+; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-ONLY-AVX512F-LABEL: fmul_pow2_ldexp_4xfloat:
+; CHECK-ONLY-AVX512F: # %bb.0:
+; CHECK-ONLY-AVX512F-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.0E+0,9.0E+0,9.0E+0,9.0E+0]
+; CHECK-ONLY-AVX512F-NEXT: vmovaps %xmm0, %xmm0
+; CHECK-ONLY-AVX512F-NEXT: vscalefps %zmm0, %zmm1, %zmm0
+; CHECK-ONLY-AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-ONLY-AVX512F-NEXT: vzeroupper
+; CHECK-ONLY-AVX512F-NEXT: retq
+;
+; CHECK-SKX-LABEL: fmul_pow2_ldexp_4xfloat:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: vcvtdq2ps %xmm0, %xmm0
+; CHECK-SKX-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.0E+0,9.0E+0,9.0E+0,9.0E+0]
+; CHECK-SKX-NEXT: vscalefps %xmm0, %xmm1, %xmm0
+; CHECK-SKX-NEXT: retq
%r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, <4 x i32> %i)
ret <4 x float> %r
}
@@ -562,79 +578,11 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
;
; CHECK-AVX512F-LABEL: fmul_pow2_ldexp_8xhalf:
; CHECK-AVX512F: # %bb.0:
-; CHECK-AVX512F-NEXT: subq $72, %rsp
-; CHECK-AVX512F-NEXT: .cfi_def_cfa_offset 80
-; CHECK-AVX512F-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vpextrw $7, %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf at PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT: vpextrw $6, %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf at PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT: vpextrw $5, %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf at PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT: vpextrw $4, %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf at PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-AVX512F-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT: vpextrw $3, %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf at PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT: vpextrw $2, %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf at PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT: vpextrw $1, %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf at PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf at PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-AVX512F-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0]
-; CHECK-AVX512F-NEXT: addq $72, %rsp
-; CHECK-AVX512F-NEXT: .cfi_def_cfa_offset 8
+; CHECK-AVX512F-NEXT: vbroadcastss {{.*#+}} ymm1 = [8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3]
+; CHECK-AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
+; CHECK-AVX512F-NEXT: vscalefps %zmm0, %zmm1, %zmm0
+; CHECK-AVX512F-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; CHECK-AVX512F-NEXT: vzeroupper
; CHECK-AVX512F-NEXT: retq
%r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, <8 x i16> %i)
ret <8 x half> %r
diff --git a/llvm/test/CodeGen/X86/ldexp-avx512.ll b/llvm/test/CodeGen/X86/ldexp-avx512.ll
index 21491bc2cc8f5..bb6dc3162eb1f 100644
--- a/llvm/test/CodeGen/X86/ldexp-avx512.ll
+++ b/llvm/test/CodeGen/X86/ldexp-avx512.ll
@@ -1,16 +1,37 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512fp16 | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512FP16
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512VL,AVX512VLF
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512fp16 | FileCheck %s --check-prefixes=CHECK,AVX512VLFP16
define half @test_half(half %x, i32 %exp) nounwind {
-; CHECK-LABEL: test_half:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0
-; CHECK-NEXT: callq ldexpf at PLT
-; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: retq
+; AVX512F-LABEL: test_half:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vcvtsi2ss %edi, %xmm15, %xmm1
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512F-NEXT: vscalefss %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512FP16-LABEL: test_half:
+; AVX512FP16: # %bb.0: # %entry
+; AVX512FP16-NEXT: vcvtsi2sh %edi, %xmm31, %xmm1
+; AVX512FP16-NEXT: vscalefsh %xmm1, %xmm0, %xmm0
+; AVX512FP16-NEXT: retq
+;
+; AVX512VL-LABEL: test_half:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vcvtsi2ss %edi, %xmm15, %xmm1
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vscalefss %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
+; AVX512VLFP16-LABEL: test_half:
+; AVX512VLFP16: # %bb.0: # %entry
+; AVX512VLFP16-NEXT: vcvtsi2sh %edi, %xmm31, %xmm1
+; AVX512VLFP16-NEXT: vscalefsh %xmm1, %xmm0, %xmm0
+; AVX512VLFP16-NEXT: retq
entry:
%r = tail call fast half @llvm.ldexp.f16.i32(half %x, i32 %exp)
ret half %r
@@ -20,7 +41,9 @@ declare half @llvm.ldexp.f16.i32(half, i32) memory(none)
define float @test_float(float %x, i32 %exp) nounwind {
; CHECK-LABEL: test_float:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: jmp ldexpf at PLT # TAILCALL
+; CHECK-NEXT: vcvtsi2ss %edi, %xmm15, %xmm1
+; CHECK-NEXT: vscalefss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
entry:
%r = tail call fast float @ldexpf(float %x, i32 %exp)
ret float %r
@@ -30,7 +53,9 @@ declare float @ldexpf(float, i32) memory(none)
define double @test_double(double %x, i32 %exp) nounwind {
; CHECK-LABEL: test_double:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: jmp ldexp at PLT # TAILCALL
+; CHECK-NEXT: vcvtsi2sd %edi, %xmm15, %xmm1
+; CHECK-NEXT: vscalefsd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
entry:
%r = tail call fast double @ldexp(double %x, i32 %exp)
ret double %r
@@ -48,220 +73,65 @@ entry:
declare fp128 @ldexpl(fp128, i32) memory(none)
define <8 x half> @test_ldexp_8xhalf(<8 x half> %x, <8 x i16> %exp) nounwind {
-; AVX512-LABEL: test_ldexp_8xhalf:
-; AVX512: # %bb.0:
-; AVX512-NEXT: subq $88, %rsp
-; AVX512-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm0
-; AVX512-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = mem[3,3,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $6, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $5, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = mem[1,0]
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $4, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $3, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = mem[1,1,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $2, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $1, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vmovd %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0]
-; AVX512-NEXT: addq $88, %rsp
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_ldexp_8xhalf:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpmovsxwd %xmm1, %ymm1
+; AVX512F-NEXT: vcvtph2ps %xmm0, %ymm0
+; AVX512F-NEXT: vscalefps %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512FP16-LABEL: test_ldexp_8xhalf:
+; AVX512FP16: # %bb.0:
+; AVX512FP16-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX512FP16-NEXT: vinsertf32x4 $0, %xmm0, %zmm2, %zmm0
+; AVX512FP16-NEXT: vmovaps %xmm1, %xmm1
+; AVX512FP16-NEXT: vscalefph %zmm1, %zmm0, %zmm0
+; AVX512FP16-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512FP16-NEXT: vzeroupper
+; AVX512FP16-NEXT: retq
;
; AVX512VL-LABEL: test_ldexp_8xhalf:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: subq $88, %rsp
-; AVX512VL-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
-; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm0
-; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3]
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = mem[1,0]
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3]
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512VL-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0]
-; AVX512VL-NEXT: addq $88, %rsp
+; AVX512VL-NEXT: vpmovsxwd %xmm1, %ymm1
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %ymm0
+; AVX512VL-NEXT: vscalefps %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
+;
+; AVX512VLFP16-LABEL: test_ldexp_8xhalf:
+; AVX512VLFP16: # %bb.0:
+; AVX512VLFP16-NEXT: vcvtw2ph %xmm1, %xmm1
+; AVX512VLFP16-NEXT: vscalefph %xmm1, %xmm0, %xmm0
+; AVX512VLFP16-NEXT: retq
%r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> %x, <8 x i16> %exp)
ret <8 x half> %r
}
declare <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half>, <8 x i16>)
define <4 x float> @test_ldexp_4xfloat(<4 x float> %x, <4 x i32> %exp) nounwind {
-; CHECK-LABEL: test_ldexp_4xfloat:
-; CHECK: # %bb.0:
-; CHECK-NEXT: subq $56, %rsp
-; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovd %xmm1, %edi
-; CHECK-NEXT: callq ldexpf at PLT
-; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vextractps $1, %xmm0, %edi
-; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
-; CHECK-NEXT: callq ldexpf at PLT
-; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vextractps $2, %xmm0, %edi
-; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,0]
-; CHECK-NEXT: callq ldexpf at PLT
-; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
-; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vextractps $3, %xmm0, %edi
-; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
-; CHECK-NEXT: callq ldexpf at PLT
-; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; CHECK-NEXT: addq $56, %rsp
-; CHECK-NEXT: retq
+; AVX512-LABEL: test_ldexp_4xfloat:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovaps %xmm1, %xmm1
+; AVX512-NEXT: vmovaps %xmm0, %xmm0
+; AVX512-NEXT: vscalefps %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: test_ldexp_4xfloat:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvtdq2ps %xmm1, %xmm1
+; AVX512VL-NEXT: vscalefps %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
+; AVX512VLFP16-LABEL: test_ldexp_4xfloat:
+; AVX512VLFP16: # %bb.0:
+; AVX512VLFP16-NEXT: vcvtdq2ps %xmm1, %xmm1
+; AVX512VLFP16-NEXT: vscalefps %xmm1, %xmm0, %xmm0
+; AVX512VLFP16-NEXT: retq
%r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> %x, <4 x i32> %exp)
ret <4 x float> %r
}
@@ -270,20 +140,13 @@ declare <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float>, <4 x i32>)
define <2 x double> @test_ldexp_2xdouble(<2 x double> %x, <2 x i32> %exp) nounwind {
; CHECK-LABEL: test_ldexp_2xdouble:
; CHECK: # %bb.0:
-; CHECK-NEXT: subq $56, %rsp
-; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vmovd %xmm1, %edi
-; CHECK-NEXT: callq ldexp at PLT
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vextractps $1, %xmm0, %edi
-; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,0]
-; CHECK-NEXT: callq ldexp at PLT
-; CHECK-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: vcvtdq2pd %xmm1, %xmm2
+; CHECK-NEXT: vscalefsd %xmm2, %xmm0, %xmm2
+; CHECK-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; CHECK-NEXT: vcvtdq2pd %xmm1, %xmm1
+; CHECK-NEXT: vscalefsd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; CHECK-NEXT: retq
%r = call <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> %x, <2 x i32> %exp)
ret <2 x double> %r
@@ -291,1225 +154,133 @@ define <2 x double> @test_ldexp_2xdouble(<2 x double> %x, <2 x i32> %exp) nounwi
declare <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double>, <2 x i32>)
define <16 x half> @test_ldexp_16xhalf(<16 x half> %x, <16 x i16> %exp) nounwind {
-; AVX512-LABEL: test_ldexp_16xhalf:
-; AVX512: # %bb.0:
-; AVX512-NEXT: subq $168, %rsp
-; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = mem[3,3,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $6, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $5, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = mem[1,0]
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $4, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = mem[3,3,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512-NEXT: vpextrw $6, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512-NEXT: vpextrw $5, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = mem[1,0]
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512-NEXT: vpextrw $4, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $3, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = mem[1,1,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $2, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $1, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vmovd %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512-NEXT: vpextrw $3, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = mem[1,1,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512-NEXT: vpextrw $2, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512-NEXT: vpextrw $1, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512-NEXT: vmovd %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX512-NEXT: addq $168, %rsp
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_ldexp_16xhalf:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vscalefps %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512FP16-LABEL: test_ldexp_16xhalf:
+; AVX512FP16: # %bb.0:
+; AVX512FP16-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX512FP16-NEXT: vinsertf64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512FP16-NEXT: vmovaps %ymm1, %ymm1
+; AVX512FP16-NEXT: vscalefph %zmm1, %zmm0, %zmm0
+; AVX512FP16-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512FP16-NEXT: retq
;
; AVX512VL-LABEL: test_ldexp_16xhalf:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: subq $168, %rsp
-; AVX512VL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512VL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512VL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3]
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = mem[1,0]
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3]
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = mem[1,0]
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3]
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3]
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512VL-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX512VL-NEXT: addq $168, %rsp
+; AVX512VL-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512VL-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512VL-NEXT: vscalefps %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vcvtps2ph $4, %zmm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; AVX512VLFP16-LABEL: test_ldexp_16xhalf:
+; AVX512VLFP16: # %bb.0:
+; AVX512VLFP16-NEXT: vcvtw2ph %ymm1, %ymm1
+; AVX512VLFP16-NEXT: vscalefph %ymm1, %ymm0, %ymm0
+; AVX512VLFP16-NEXT: retq
%r = call <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half> %x, <16 x i16> %exp)
ret <16 x half> %r
}
declare <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half>, <16 x i16>)
define <8 x float> @test_ldexp_8xfloat(<8 x float> %x, <8 x i32> %exp) nounwind {
-; CHECK-LABEL: test_ldexp_8xfloat:
-; CHECK: # %bb.0:
-; CHECK-NEXT: subq $120, %rsp
-; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
-; CHECK-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vmovd %xmm1, %edi
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq ldexpf at PLT
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vextractps $1, %xmm0, %edi
-; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
-; CHECK-NEXT: callq ldexpf at PLT
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vextractps $2, %xmm0, %edi
-; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,0]
-; CHECK-NEXT: callq ldexpf at PLT
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vextractps $3, %xmm0, %edi
-; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
-; CHECK-NEXT: callq ldexpf at PLT
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-NEXT: vmovd %xmm0, %edi
-; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq ldexpf at PLT
-; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-NEXT: vextractps $1, %xmm0, %edi
-; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq ldexpf at PLT
-; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-NEXT: vextractps $2, %xmm0, %edi
-; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,0]
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq ldexpf at PLT
-; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
-; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-NEXT: vextractps $3, %xmm0, %edi
-; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq ldexpf at PLT
-; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; CHECK-NEXT: addq $120, %rsp
-; CHECK-NEXT: retq
+; AVX512-LABEL: test_ldexp_8xfloat:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovaps %ymm1, %ymm1
+; AVX512-NEXT: vmovaps %ymm0, %ymm0
+; AVX512-NEXT: vscalefps %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: test_ldexp_8xfloat:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvtdq2ps %ymm1, %ymm1
+; AVX512VL-NEXT: vscalefps %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512VLFP16-LABEL: test_ldexp_8xfloat:
+; AVX512VLFP16: # %bb.0:
+; AVX512VLFP16-NEXT: vcvtdq2ps %ymm1, %ymm1
+; AVX512VLFP16-NEXT: vscalefps %ymm1, %ymm0, %ymm0
+; AVX512VLFP16-NEXT: retq
%r = call <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float> %x, <8 x i32> %exp)
ret <8 x float> %r
}
declare <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float>, <8 x i32>)
define <4 x double> @test_ldexp_4xdouble(<4 x double> %x, <4 x i32> %exp) nounwind {
-; CHECK-LABEL: test_ldexp_4xdouble:
-; CHECK: # %bb.0:
-; CHECK-NEXT: subq $88, %rsp
-; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-NEXT: vextractps $2, %xmm1, %edi
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq ldexp at PLT
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vextractps $3, %xmm0, %edi
-; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,0]
-; CHECK-NEXT: callq ldexp at PLT
-; CHECK-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; CHECK-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vmovd %xmm0, %edi
-; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq ldexp at PLT
-; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vextractps $1, %xmm0, %edi
-; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,0]
-; CHECK-NEXT: callq ldexp at PLT
-; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; CHECK-NEXT: addq $88, %rsp
-; CHECK-NEXT: retq
+; AVX512-LABEL: test_ldexp_4xdouble:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovapd %xmm1, %xmm1
+; AVX512-NEXT: vmovapd %ymm0, %ymm0
+; AVX512-NEXT: vscalefpd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: test_ldexp_4xdouble:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvtdq2pd %xmm1, %ymm1
+; AVX512VL-NEXT: vscalefpd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512VLFP16-LABEL: test_ldexp_4xdouble:
+; AVX512VLFP16: # %bb.0:
+; AVX512VLFP16-NEXT: vcvtdq2pd %xmm1, %ymm1
+; AVX512VLFP16-NEXT: vscalefpd %ymm1, %ymm0, %ymm0
+; AVX512VLFP16-NEXT: retq
%r = call <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double> %x, <4 x i32> %exp)
ret <4 x double> %r
}
declare <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double>, <4 x i32>)
define <32 x half> @test_ldexp_32xhalf(<32 x half> %x, <32 x i16> %exp) nounwind {
-; AVX512-LABEL: test_ldexp_32xhalf:
-; AVX512: # %bb.0:
-; AVX512-NEXT: subq $360, %rsp # imm = 0x168
-; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1
-; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = mem[3,3,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $6, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $5, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = mem[1,0]
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $4, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0
-; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm1
-; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = mem[3,3,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $6, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $5, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = mem[1,0]
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $4, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = mem[3,3,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $6, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $5, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = mem[1,0]
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $4, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = mem[3,3,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT: vpextrw $6, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT: vpextrw $5, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = mem[1,0]
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT: vpextrw $4, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $3, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = mem[1,1,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $2, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $1, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vmovd %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $3, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = mem[1,1,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $2, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $1, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vmovd %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $3, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = mem[1,1,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $2, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpextrw $1, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vmovd %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT: vpextrw $3, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = mem[1,1,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT: vpextrw $2, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT: vpextrw $1, %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT: vmovd %xmm1, %eax
-; AVX512-NEXT: movswl %ax, %edi
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: callq ldexpf at PLT
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
-; AVX512-NEXT: # zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
-; AVX512-NEXT: addq $360, %rsp # imm = 0x168
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_ldexp_32xhalf:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm2
+; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm3
+; AVX512F-NEXT: vscalefps %zmm2, %zmm3, %zmm2
+; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vscalefps %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512FP16-LABEL: test_ldexp_32xhalf:
+; AVX512FP16: # %bb.0:
+; AVX512FP16-NEXT: vcvtw2ph %zmm1, %zmm1
+; AVX512FP16-NEXT: vscalefph %zmm1, %zmm0, %zmm0
+; AVX512FP16-NEXT: retq
;
; AVX512VL-LABEL: test_ldexp_32xhalf:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: subq $360, %rsp # imm = 0x168
-; AVX512VL-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512VL-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vextracti32x4 $3, %zmm1, %xmm1
-; AVX512VL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3]
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = mem[1,0]
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512VL-NEXT: vextracti32x4 $2, %zmm1, %xmm1
-; AVX512VL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3]
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = mem[1,0]
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512VL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3]
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = mem[1,0]
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3]
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = mem[1,0]
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512VL-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3]
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vpsrlq $48, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3]
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vpsrld $16, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vcvtph2ps (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3]
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3]
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: movswl %ax, %edi
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq ldexpf at PLT
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512VL-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
-; AVX512VL-NEXT: # zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
-; AVX512VL-NEXT: addq $360, %rsp # imm = 0x168
+; AVX512VL-NEXT: vpmovsxwd %ymm1, %zmm2
+; AVX512VL-NEXT: vcvtph2ps %ymm0, %zmm3
+; AVX512VL-NEXT: vscalefps %zmm2, %zmm3, %zmm2
+; AVX512VL-NEXT: vcvtps2ph $4, %zmm2, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512VL-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; AVX512VL-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512VL-NEXT: vscalefps %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512VL-NEXT: retq
+;
+; AVX512VLFP16-LABEL: test_ldexp_32xhalf:
+; AVX512VLFP16: # %bb.0:
+; AVX512VLFP16-NEXT: vcvtw2ph %zmm1, %zmm1
+; AVX512VLFP16-NEXT: vscalefph %zmm1, %zmm0, %zmm0
+; AVX512VLFP16-NEXT: retq
%r = call <32 x half> @llvm.ldexp.v32f16.v32i16(<32 x half> %x, <32 x i16> %exp)
ret <32 x half> %r
}
@@ -1518,148 +289,8 @@ declare <32 x half> @llvm.ldexp.v32f16.v32i16(<32 x half>, <32 x i16>)
define <16 x float> @test_ldexp_16xfloat(<16 x float> %x, <16 x i32> %exp) nounwind {
; CHECK-LABEL: test_ldexp_16xfloat:
; CHECK: # %bb.0:
-; CHECK-NEXT: subq $216, %rsp
-; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vextracti32x4 $3, %zmm1, %xmm1
-; CHECK-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vmovd %xmm1, %edi
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq ldexpf at PLT
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vextractps $1, %xmm0, %edi
-; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
-; CHECK-NEXT: callq ldexpf at PLT
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vextractps $2, %xmm0, %edi
-; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,0]
-; CHECK-NEXT: callq ldexpf at PLT
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vextractps $3, %xmm0, %edi
-; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
-; CHECK-NEXT: callq ldexpf at PLT
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm1
-; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm0
-; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vmovd %xmm0, %edi
-; CHECK-NEXT: vmovaps %xmm1, %xmm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq ldexpf at PLT
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vextractps $1, %xmm0, %edi
-; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
-; CHECK-NEXT: callq ldexpf at PLT
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vextractps $2, %xmm0, %edi
-; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,0]
-; CHECK-NEXT: callq ldexpf at PLT
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vextractps $3, %xmm0, %edi
-; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
-; CHECK-NEXT: callq ldexpf at PLT
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vmovd %xmm0, %edi
-; CHECK-NEXT: vmovaps %xmm1, %xmm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq ldexpf at PLT
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vextractps $1, %xmm0, %edi
-; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
-; CHECK-NEXT: callq ldexpf at PLT
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vextractps $2, %xmm0, %edi
-; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,0]
-; CHECK-NEXT: callq ldexpf at PLT
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vextractps $3, %xmm0, %edi
-; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
-; CHECK-NEXT: callq ldexpf at PLT
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; CHECK-NEXT: vmovd %xmm0, %edi
-; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq ldexpf at PLT
-; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; CHECK-NEXT: vextractps $1, %xmm0, %edi
-; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq ldexpf at PLT
-; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; CHECK-NEXT: vextractps $2, %xmm0, %edi
-; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,0]
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq ldexpf at PLT
-; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
-; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; CHECK-NEXT: vextractps $3, %xmm0, %edi
-; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq ldexpf at PLT
-; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; CHECK-NEXT: vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; CHECK-NEXT: addq $216, %rsp
+; CHECK-NEXT: vcvtdq2ps %zmm1, %zmm1
+; CHECK-NEXT: vscalefps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%r = call <16 x float> @llvm.ldexp.v16f32.v16i32(<16 x float> %x, <16 x i32> %exp)
ret <16 x float> %r
@@ -1669,81 +300,13 @@ declare <16 x float> @llvm.ldexp.v16f32.v16i32(<16 x float>, <16 x i32>)
define <8 x double> @test_ldexp_8xdouble(<8 x double> %x, <8 x i32> %exp) nounwind {
; CHECK-LABEL: test_ldexp_8xdouble:
; CHECK: # %bb.0:
-; CHECK-NEXT: subq $184, %rsp
-; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0
-; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1
-; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vextractps $2, %xmm1, %edi
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq ldexp at PLT
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vextractps $3, %xmm0, %edi
-; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,0]
-; CHECK-NEXT: callq ldexp at PLT
-; CHECK-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; CHECK-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm1
-; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vmovd %xmm0, %edi
-; CHECK-NEXT: vmovaps %xmm1, %xmm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq ldexp at PLT
-; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vextractps $1, %xmm0, %edi
-; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,0]
-; CHECK-NEXT: callq ldexp at PLT
-; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; CHECK-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-NEXT: vextractps $2, %xmm0, %edi
-; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq ldexp at PLT
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-NEXT: vextractps $3, %xmm0, %edi
-; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,0]
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq ldexp at PLT
-; CHECK-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; CHECK-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-NEXT: vmovd %xmm0, %edi
-; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq ldexp at PLT
-; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-NEXT: vextractps $1, %xmm0, %edi
-; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,0]
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq ldexp at PLT
-; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; CHECK-NEXT: vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; CHECK-NEXT: addq $184, %rsp
+; CHECK-NEXT: vcvtdq2pd %ymm1, %zmm1
+; CHECK-NEXT: vscalefpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%r = call <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double> %x, <8 x i32> %exp)
ret <8 x double> %r
}
declare <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double>, <8 x i32>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX512VLF: {{.*}}
More information about the llvm-commits
mailing list