[llvm] 4c5f10a - Revert rGe6ccb57bb3f6b761f2310e97fd6ca99eff42f73e "[SLP] Add cost model for `llvm.powi.*` intrinsics"
    Philip Reames via llvm-commits 
    llvm-commits at lists.llvm.org
       
    Tue Jun 21 09:44:29 PDT 2022
    
    
  
As a reminder, when reverting you should provide context on why a change 
was reverted: a link to the broken buildbot, a test case, etc.
Philip
On 6/21/22 08:06, Nabeel Omer via llvm-commits wrote:
> Author: Nabeel Omer
> Date: 2022-06-21T15:05:55Z
> New Revision: 4c5f10aeebd76fcc29155000ccac26514197d3d8
>
> URL: https://github.com/llvm/llvm-project/commit/4c5f10aeebd76fcc29155000ccac26514197d3d8
> DIFF: https://github.com/llvm/llvm-project/commit/4c5f10aeebd76fcc29155000ccac26514197d3d8.diff
>
> LOG: Revert rGe6ccb57bb3f6b761f2310e97fd6ca99eff42f73e "[SLP] Add cost model for `llvm.powi.*` intrinsics"
>
> This reverts commit e6ccb57bb3f6b761f2310e97fd6ca99eff42f73e.
>
> Added:
>      
>
> Modified:
>      llvm/include/llvm/CodeGen/BasicTTIImpl.h
>      llvm/include/llvm/CodeGen/TargetLowering.h
>      llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
>      llvm/test/Analysis/CostModel/X86/powi.ll
>      llvm/test/Transforms/SLPVectorizer/X86/powi-regression.ll
>      llvm/test/Transforms/SLPVectorizer/X86/powi.ll
>
> Removed:
>      
>
>
> ################################################################################
> diff  --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
> index 01050e66d03e5..6fb3e1a0e5c5c 100644
> --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
> +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
> @@ -1414,24 +1414,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
>       default:
>         break;
>   
> -    case Intrinsic::powi:
> -      if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) {
> -        bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize();
> -        if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(),
> -                                               ShouldOptForSize)) {
> -          // The cost is modeled on the expansion performed by ExpandPowI in
> -          // SelectionDAGBuilder.
> -          unsigned ActiveBits = RHSC->getValue().getActiveBits();
> -          InstructionCost Cost =
> -              ActiveBits * thisT()->getArithmeticInstrCost(Instruction::FMul,
> -                                                           RetTy, CostKind);
> -          if (RHSC->getSExtValue() < 0)
> -            Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy,
> -                                                    CostKind);
> -          return Cost;
> -        }
> -      }
> -      break;
>       case Intrinsic::cttz:
>         // FIXME: If necessary, this should go in target-specific overrides.
>         if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz())
>
> diff  --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
> index 98b9a416ea59a..484fd2dec4a10 100644
> --- a/llvm/include/llvm/CodeGen/TargetLowering.h
> +++ b/llvm/include/llvm/CodeGen/TargetLowering.h
> @@ -2196,18 +2196,6 @@ class TargetLoweringBase {
>       return false;
>     }
>   
> -  /// Return true if it is beneficial to expand an @llvm.powi.* intrinsic.
> -  /// If not optimizing for size, expanding @llvm.powi.* intrinsics is always
> -  /// considered beneficial.
> -  /// If optimizing for size, expansion is only considered beneficial for upto
> -  /// 5 multiplies and a divide (if the exponent is negative).
> -  bool isBeneficialToExpandPowI(int Exponent, bool OptForSize) const {
> -    if (Exponent < 0)
> -      Exponent = -Exponent;
> -    return !OptForSize ||
> -           (countPopulation((unsigned int)Exponent) + Log2_32(Exponent) < 7);
> -  }
> -
>     //===--------------------------------------------------------------------===//
>     // TargetLowering Configuration Methods - These methods should be invoked by
>     // the derived class constructor to configure this object for the target.
>
> diff  --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
> index 15455ebbfee89..fc031ce824d26 100644
> --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
> +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
> @@ -5346,36 +5346,38 @@ static SDValue expandPow(const SDLoc &dl, SDValue LHS, SDValue RHS,
>   /// ExpandPowI - Expand a llvm.powi intrinsic.
>   static SDValue ExpandPowI(const SDLoc &DL, SDValue LHS, SDValue RHS,
>                             SelectionDAG &DAG) {
> -  // If RHS is a constant, we can expand this out to a multiplication tree if
> -  // it's beneficial on the target, otherwise we end up lowering to a call to
> -  // __powidf2 (for example).
> +  // If RHS is a constant, we can expand this out to a multiplication tree,
> +  // otherwise we end up lowering to a call to __powidf2 (for example).  When
> +  // optimizing for size, we only want to do this if the expansion would produce
> +  // a small number of multiplies, otherwise we do the full expansion.
>     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
> +    // Get the exponent as a positive value.
>       unsigned Val = RHSC->getSExtValue();
> +    if ((int)Val < 0) Val = -Val;
>   
>       // powi(x, 0) -> 1.0
>       if (Val == 0)
>         return DAG.getConstantFP(1.0, DL, LHS.getValueType());
>   
> -    if (DAG.getTargetLoweringInfo().isBeneficialToExpandPowI(
> -            Val, DAG.shouldOptForSize())) {
> -      // Get the exponent as a positive value.
> -      if ((int)Val < 0)
> -        Val = -Val;
> +    bool OptForSize = DAG.shouldOptForSize();
> +    if (!OptForSize ||
> +        // If optimizing for size, don't insert too many multiplies.
> +        // This inserts up to 5 multiplies.
> +        countPopulation(Val) + Log2_32(Val) < 7) {
>         // We use the simple binary decomposition method to generate the multiply
>         // sequence.  There are more optimal ways to do this (for example,
>         // powi(x,15) generates one more multiply than it should), but this has
>         // the benefit of being both really simple and much better than a libcall.
> -      SDValue Res; // Logically starts equal to 1.0
> +      SDValue Res;  // Logically starts equal to 1.0
>         SDValue CurSquare = LHS;
>         // TODO: Intrinsics should have fast-math-flags that propagate to these
>         // nodes.
>         while (Val) {
>           if (Val & 1) {
>             if (Res.getNode())
> -            Res =
> -                DAG.getNode(ISD::FMUL, DL, Res.getValueType(), Res, CurSquare);
> +            Res = DAG.getNode(ISD::FMUL, DL,Res.getValueType(), Res, CurSquare);
>             else
> -            Res = CurSquare; // 1.0*CurSquare.
> +            Res = CurSquare;  // 1.0*CurSquare.
>           }
>   
>           CurSquare = DAG.getNode(ISD::FMUL, DL, CurSquare.getValueType(),
>
> diff  --git a/llvm/test/Analysis/CostModel/X86/powi.ll b/llvm/test/Analysis/CostModel/X86/powi.ll
> index 311df615380da..f9d007be71cd7 100644
> --- a/llvm/test/Analysis/CostModel/X86/powi.ll
> +++ b/llvm/test/Analysis/CostModel/X86/powi.ll
> @@ -74,55 +74,55 @@ define i32 @powi_var(i32 %arg) {
>   
>   define i32 @powi_3() {
>   ; SSE-LABEL: 'powi_3'
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
>   ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
>   ;
>   ; AVX1-LABEL: 'powi_3'
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
>   ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
>   ;
>   ; AVX2-LABEL: 'powi_3'
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
>   ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
>   ;
>   ; AVX512-LABEL: 'powi_3'
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 3)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 3)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 3)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 3)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 3)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 3)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 3)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 3)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 3)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 3)
>   ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
>   ;
>     %F32 = call float @llvm.powi.f32(float poison, i32 3)
> @@ -142,55 +142,55 @@ define i32 @powi_3() {
>   
>   define i32 @powi_n3() {
>   ; SSE-LABEL: 'powi_n3'
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 206 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 412 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 266 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 532 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 1064 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
>   ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
>   ;
>   ; AVX1-LABEL: 'powi_n3'
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 432 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
>   ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
>   ;
>   ; AVX2-LABEL: 'powi_n3'
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 240 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
>   ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
>   ;
>   ; AVX512-LABEL: 'powi_n3'
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 -3)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 -3)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 -3)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 -3)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 -3)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 -3)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 -3)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 -3)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 -3)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 -3)
>   ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
>   ;
>     %F32 = call float @llvm.powi.f32(float poison, i32 -3)
> @@ -210,25 +210,25 @@ define i32 @powi_n3() {
>   
>   define i32 @powi_6() {
>   ; SSE-LABEL: 'powi_6'
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 6)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 6)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 6)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 6)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 6)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 6)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 6)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 6)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 6)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 6)
>   ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
>   ;
>   ; AVX1-LABEL: 'powi_6'
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
>   ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 6)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
>   ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 6)
>   ; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 6)
>   ; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 6)
> @@ -236,29 +236,29 @@ define i32 @powi_6() {
>   ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
>   ;
>   ; AVX2-LABEL: 'powi_6'
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
>   ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 6)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
>   ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 6)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 6)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 6)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 6)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 6)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 6)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 6)
>   ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
>   ;
>   ; AVX512-LABEL: 'powi_6'
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 6)
>   ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 6)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 6)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 6)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 6)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
>   ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 6)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 6)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 6)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 6)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 6)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 6)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 6)
>   ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
>   ;
>     %F32 = call float @llvm.powi.f32(float poison, i32 6)
> @@ -278,55 +278,55 @@ define i32 @powi_6() {
>   
>   define i32 @powi_16() {
>   ; SSE-LABEL: 'powi_16'
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
> -; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
> +; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
>   ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
>   ;
>   ; AVX1-LABEL: 'powi_16'
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
> -; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
> +; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
>   ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
>   ;
>   ; AVX2-LABEL: 'powi_16'
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
> -; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
> +; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
>   ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
>   ;
>   ; AVX512-LABEL: 'powi_16'
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
> -; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.powi.f32.i32(float poison, i32 16)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> poison, i32 16)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 16)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> poison, i32 16)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> poison, i32 16)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.powi.f64.i32(double poison, i32 6)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> poison, i32 16)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> poison, i32 16)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> poison, i32 16)
> +; AVX512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16F64 = call <16 x double> @llvm.powi.v16f64.i32(<16 x double> poison, i32 16)
>   ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 poison
>   ;
>     %F32 = call float @llvm.powi.f32(float poison, i32 16)
>
> diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/powi-regression.ll b/llvm/test/Transforms/SLPVectorizer/X86/powi-regression.ll
> index c3f6127a604fa..39f00840fcd1a 100644
> --- a/llvm/test/Transforms/SLPVectorizer/X86/powi-regression.ll
> +++ b/llvm/test/Transforms/SLPVectorizer/X86/powi-regression.ll
> @@ -6,8 +6,13 @@
>   define <2 x double> @PR53887_v2f64(<2 x double> noundef %x) {
>   ; CHECK-LABEL: @PR53887_v2f64(
>   ; CHECK-NEXT:  entry:
> -; CHECK-NEXT:    [[TMP0:%.*]] = call fast <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[X:%.*]], i32 6)
> -; CHECK-NEXT:    ret <2 x double> [[TMP0]]
> +; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <2 x double> [[X:%.*]], i64 0
> +; CHECK-NEXT:    [[TMP0:%.*]] = tail call fast double @llvm.powi.f64.i32(double [[VECEXT]], i32 6)
> +; CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x double> undef, double [[TMP0]], i64 0
> +; CHECK-NEXT:    [[VECEXT1:%.*]] = extractelement <2 x double> [[X]], i64 1
> +; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast double @llvm.powi.f64.i32(double [[VECEXT1]], i32 6)
> +; CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <2 x double> [[VECINIT]], double [[TMP1]], i64 1
> +; CHECK-NEXT:    ret <2 x double> [[VECINIT3]]
>   ;
>   entry:
>     %vecext = extractelement <2 x double> %x, i64 0
> @@ -22,8 +27,20 @@ entry:
>   define <4 x double> @PR53887_v4f64(<4 x double> noundef %x) {
>   ; CHECK-LABEL: @PR53887_v4f64(
>   ; CHECK-NEXT:  entry:
> -; CHECK-NEXT:    [[TMP0:%.*]] = call fast <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[X:%.*]], i32 6)
> -; CHECK-NEXT:    ret <4 x double> [[TMP0]]
> +; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x double> [[X:%.*]], i64 0
> +; CHECK-NEXT:    [[VECEXT1:%.*]] = extractelement <4 x double> [[X]], i64 1
> +; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[VECEXT]], i32 0
> +; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[VECEXT1]], i32 1
> +; CHECK-NEXT:    [[TMP2:%.*]] = call fast <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[TMP1]], i32 6)
> +; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
> +; CHECK-NEXT:    [[VECEXT4:%.*]] = extractelement <4 x double> [[X]], i64 2
> +; CHECK-NEXT:    [[VECEXT7:%.*]] = extractelement <4 x double> [[X]], i64 3
> +; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[VECEXT4]], i32 0
> +; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[VECEXT7]], i32 1
> +; CHECK-NEXT:    [[TMP6:%.*]] = call fast <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[TMP5]], i32 6)
> +; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
> +; CHECK-NEXT:    [[VECINIT91:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
> +; CHECK-NEXT:    ret <4 x double> [[VECINIT91]]
>   ;
>   entry:
>     %vecext = extractelement <4 x double> %x, i64 0
>
> diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/powi.ll b/llvm/test/Transforms/SLPVectorizer/X86/powi.ll
> index 732256d2301a4..c623dbbebfa33 100644
> --- a/llvm/test/Transforms/SLPVectorizer/X86/powi.ll
> +++ b/llvm/test/Transforms/SLPVectorizer/X86/powi.ll
> @@ -1,13 +1,18 @@
>   ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
> -; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64 -basic-aa -slp-vectorizer -S | FileCheck %s
> -; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64-v2 -basic-aa -slp-vectorizer -S | FileCheck %s
> -; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64-v3 -basic-aa -slp-vectorizer -S | FileCheck %s
> -; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64-v4 -basic-aa -slp-vectorizer -S | FileCheck %s
> +; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
> +; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64-v2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX1
> +; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64-v3 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2
> +; RUN: opt < %s -mtriple=x86_64-linux-gnu -mcpu=x86-64-v4 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512
>   
>   define <2 x double> @buildvector_powi_2f64_6(<2 x double> %a) {
>   ; CHECK-LABEL: @buildvector_powi_2f64_6(
> -; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[A:%.*]], i32 6)
> -; CHECK-NEXT:    ret <2 x double> [[TMP1]]
> +; CHECK-NEXT:    [[A0:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
> +; CHECK-NEXT:    [[A1:%.*]] = extractelement <2 x double> [[A]], i32 1
> +; CHECK-NEXT:    [[C0:%.*]] = call double @llvm.powi.f64.i32(double [[A0]], i32 6)
> +; CHECK-NEXT:    [[C1:%.*]] = call double @llvm.powi.f64.i32(double [[A1]], i32 6)
> +; CHECK-NEXT:    [[R0:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
> +; CHECK-NEXT:    [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[C1]], i32 1
> +; CHECK-NEXT:    ret <2 x double> [[R1]]
>   ;
>     %a0 = extractelement <2 x double> %a, i32 0
>     %a1 = extractelement <2 x double> %a, i32 1
> @@ -38,9 +43,67 @@ define <2 x double> @buildvector_powi_2f64_var(<2 x double> %a, i32 %b) {
>   }
>   
>   define <4 x float> @buildvector_powi_4f32_3(<4 x float> %a) {
> -; CHECK-LABEL: @buildvector_powi_4f32_3(
> -; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[A:%.*]], i32 3)
> -; CHECK-NEXT:    ret <4 x float> [[TMP1]]
> +; SSE-LABEL: @buildvector_powi_4f32_3(
> +; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
> +; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
> +; SSE-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
> +; SSE-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
> +; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0
> +; SSE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[A1]], i32 1
> +; SSE-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP2]], i32 3)
> +; SSE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0
> +; SSE-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A3]], i32 1
> +; SSE-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP5]], i32 3)
> +; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
> +; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
> +; SSE-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
> +; SSE-NEXT:    ret <4 x float> [[R31]]
> +;
> +; AVX1-LABEL: @buildvector_powi_4f32_3(
> +; AVX1-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
> +; AVX1-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
> +; AVX1-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
> +; AVX1-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
> +; AVX1-NEXT:    [[C0:%.*]] = call float @llvm.powi.f32.i32(float [[A0]], i32 3)
> +; AVX1-NEXT:    [[C1:%.*]] = call float @llvm.powi.f32.i32(float [[A1]], i32 3)
> +; AVX1-NEXT:    [[C2:%.*]] = call float @llvm.powi.f32.i32(float [[A2]], i32 3)
> +; AVX1-NEXT:    [[C3:%.*]] = call float @llvm.powi.f32.i32(float [[A3]], i32 3)
> +; AVX1-NEXT:    [[R0:%.*]] = insertelement <4 x float> poison, float [[C0]], i32 0
> +; AVX1-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[C1]], i32 1
> +; AVX1-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[C2]], i32 2
> +; AVX1-NEXT:    [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[C3]], i32 3
> +; AVX1-NEXT:    ret <4 x float> [[R3]]
> +;
> +; AVX2-LABEL: @buildvector_powi_4f32_3(
> +; AVX2-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
> +; AVX2-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
> +; AVX2-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
> +; AVX2-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
> +; AVX2-NEXT:    [[C0:%.*]] = call float @llvm.powi.f32.i32(float [[A0]], i32 3)
> +; AVX2-NEXT:    [[C1:%.*]] = call float @llvm.powi.f32.i32(float [[A1]], i32 3)
> +; AVX2-NEXT:    [[C2:%.*]] = call float @llvm.powi.f32.i32(float [[A2]], i32 3)
> +; AVX2-NEXT:    [[C3:%.*]] = call float @llvm.powi.f32.i32(float [[A3]], i32 3)
> +; AVX2-NEXT:    [[R0:%.*]] = insertelement <4 x float> poison, float [[C0]], i32 0
> +; AVX2-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[C1]], i32 1
> +; AVX2-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[C2]], i32 2
> +; AVX2-NEXT:    [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[C3]], i32 3
> +; AVX2-NEXT:    ret <4 x float> [[R3]]
> +;
> +; AVX512-LABEL: @buildvector_powi_4f32_3(
> +; AVX512-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
> +; AVX512-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
> +; AVX512-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
> +; AVX512-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
> +; AVX512-NEXT:    [[C0:%.*]] = call float @llvm.powi.f32.i32(float [[A0]], i32 3)
> +; AVX512-NEXT:    [[C1:%.*]] = call float @llvm.powi.f32.i32(float [[A1]], i32 3)
> +; AVX512-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0
> +; AVX512-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[A3]], i32 1
> +; AVX512-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[TMP2]], i32 3)
> +; AVX512-NEXT:    [[R0:%.*]] = insertelement <4 x float> poison, float [[C0]], i32 0
> +; AVX512-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[C1]], i32 1
> +; AVX512-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
> +; AVX512-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[R1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
> +; AVX512-NEXT:    ret <4 x float> [[R31]]
>   ;
>     %a0 = extractelement <4 x float> %a, i32 0
>     %a1 = extractelement <4 x float> %a, i32 1
> @@ -62,9 +125,45 @@ define <4 x float> @buildvector_powi_4f32_3(<4 x float> %a) {
>   ;
>   
>   define <4 x double> @buildvector_powi_4f64_16(<4 x double> %a) {
> -; CHECK-LABEL: @buildvector_powi_4f64_16(
> -; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[A:%.*]], i32 16)
> -; CHECK-NEXT:    ret <4 x double> [[TMP1]]
> +; SSE-LABEL: @buildvector_powi_4f64_16(
> +; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
> +; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1
> +; SSE-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2
> +; SSE-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3
> +; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
> +; SSE-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A1]], i32 1
> +; SSE-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[TMP2]], i32 16)
> +; SSE-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0
> +; SSE-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A3]], i32 1
> +; SSE-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[TMP5]], i32 16)
> +; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
> +; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
> +; SSE-NEXT:    [[R31:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
> +; SSE-NEXT:    ret <4 x double> [[R31]]
> +;
> +; AVX1-LABEL: @buildvector_powi_4f64_16(
> +; AVX1-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
> +; AVX1-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1
> +; AVX1-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2
> +; AVX1-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3
> +; AVX1-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
> +; AVX1-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A1]], i32 1
> +; AVX1-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[TMP2]], i32 16)
> +; AVX1-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0
> +; AVX1-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A3]], i32 1
> +; AVX1-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[TMP5]], i32 16)
> +; AVX1-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
> +; AVX1-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
> +; AVX1-NEXT:    [[R31:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
> +; AVX1-NEXT:    ret <4 x double> [[R31]]
> +;
> +; AVX2-LABEL: @buildvector_powi_4f64_16(
> +; AVX2-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[A:%.*]], i32 16)
> +; AVX2-NEXT:    ret <4 x double> [[TMP1]]
> +;
> +; AVX512-LABEL: @buildvector_powi_4f64_16(
> +; AVX512-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[A:%.*]], i32 16)
> +; AVX512-NEXT:    ret <4 x double> [[TMP1]]
>   ;
>     %a0 = extractelement <4 x double> %a, i32 0
>     %a1 = extractelement <4 x double> %a, i32 1
> @@ -82,9 +181,64 @@ define <4 x double> @buildvector_powi_4f64_16(<4 x double> %a) {
>   }
>   
>   define <8 x float> @buildvector_powi_8f32_4(<8 x float> %a) {
> -; CHECK-LABEL: @buildvector_powi_8f32_4(
> -; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> [[A:%.*]], i32 4)
> -; CHECK-NEXT:    ret <8 x float> [[TMP1]]
> +; SSE-LABEL: @buildvector_powi_8f32_4(
> +; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
> +; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
> +; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
> +; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
> +; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
> +; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
> +; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
> +; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
> +; SSE-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> poison, float [[A0]], i32 0
> +; SSE-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[A1]], i32 1
> +; SSE-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[A2]], i32 2
> +; SSE-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[A3]], i32 3
> +; SSE-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP4]], i32 4)
> +; SSE-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> poison, float [[A4]], i32 0
> +; SSE-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[A5]], i32 1
> +; SSE-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[A6]], i32 2
> +; SSE-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[A7]], i32 3
> +; SSE-NEXT:    [[TMP10:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP9]], i32 4)
> +; SSE-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
> +; SSE-NEXT:    [[TMP12:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
> +; SSE-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[TMP11]], <8 x float> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
> +; SSE-NEXT:    ret <8 x float> [[R71]]
> +;
> +; AVX1-LABEL: @buildvector_powi_8f32_4(
> +; AVX1-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
> +; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
> +; AVX1-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
> +; AVX1-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
> +; AVX1-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
> +; AVX1-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
> +; AVX1-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
> +; AVX1-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
> +; AVX1-NEXT:    [[C0:%.*]] = call float @llvm.powi.f32.i32(float [[A0]], i32 4)
> +; AVX1-NEXT:    [[C1:%.*]] = call float @llvm.powi.f32.i32(float [[A1]], i32 4)
> +; AVX1-NEXT:    [[C2:%.*]] = call float @llvm.powi.f32.i32(float [[A2]], i32 4)
> +; AVX1-NEXT:    [[C3:%.*]] = call float @llvm.powi.f32.i32(float [[A3]], i32 4)
> +; AVX1-NEXT:    [[C4:%.*]] = call float @llvm.powi.f32.i32(float [[A4]], i32 4)
> +; AVX1-NEXT:    [[C5:%.*]] = call float @llvm.powi.f32.i32(float [[A5]], i32 4)
> +; AVX1-NEXT:    [[C6:%.*]] = call float @llvm.powi.f32.i32(float [[A6]], i32 4)
> +; AVX1-NEXT:    [[C7:%.*]] = call float @llvm.powi.f32.i32(float [[A7]], i32 4)
> +; AVX1-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[C0]], i32 0
> +; AVX1-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[C1]], i32 1
> +; AVX1-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[C2]], i32 2
> +; AVX1-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[C3]], i32 3
> +; AVX1-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[C4]], i32 4
> +; AVX1-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[C5]], i32 5
> +; AVX1-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[C6]], i32 6
> +; AVX1-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[C7]], i32 7
> +; AVX1-NEXT:    ret <8 x float> [[R7]]
> +;
> +; AVX2-LABEL: @buildvector_powi_8f32_4(
> +; AVX2-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> [[A:%.*]], i32 4)
> +; AVX2-NEXT:    ret <8 x float> [[TMP1]]
> +;
> +; AVX512-LABEL: @buildvector_powi_8f32_4(
> +; AVX512-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> [[A:%.*]], i32 4)
> +; AVX512-NEXT:    ret <8 x float> [[TMP1]]
>   ;
>     %a0 = extractelement <8 x float> %a, i32 0
>     %a1 = extractelement <8 x float> %a, i32 1
> @@ -118,9 +272,61 @@ define <8 x float> @buildvector_powi_8f32_4(<8 x float> %a) {
>   ;
>   
>   define <8 x double> @buildvector_powi_8f64_5(<8 x double> %a) {
> -; CHECK-LABEL: @buildvector_powi_8f64_5(
> -; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> [[A:%.*]], i32 5)
> -; CHECK-NEXT:    ret <8 x double> [[TMP1]]
> +; SSE-LABEL: @buildvector_powi_8f64_5(
> +; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x double> [[A:%.*]], i32 0
> +; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x double> [[A]], i32 1
> +; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x double> [[A]], i32 2
> +; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x double> [[A]], i32 3
> +; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x double> [[A]], i32 4
> +; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x double> [[A]], i32 5
> +; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x double> [[A]], i32 6
> +; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x double> [[A]], i32 7
> +; SSE-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> poison, double [[A0]], i32 0
> +; SSE-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[A1]], i32 1
> +; SSE-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[A2]], i32 2
> +; SSE-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[A3]], i32 3
> +; SSE-NEXT:    [[TMP5:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[TMP4]], i32 5)
> +; SSE-NEXT:    [[TMP6:%.*]] = insertelement <4 x double> poison, double [[A4]], i32 0
> +; SSE-NEXT:    [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double [[A5]], i32 1
> +; SSE-NEXT:    [[TMP8:%.*]] = insertelement <4 x double> [[TMP7]], double [[A6]], i32 2
> +; SSE-NEXT:    [[TMP9:%.*]] = insertelement <4 x double> [[TMP8]], double [[A7]], i32 3
> +; SSE-NEXT:    [[TMP10:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[TMP9]], i32 5)
> +; SSE-NEXT:    [[TMP11:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
> +; SSE-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
> +; SSE-NEXT:    [[R71:%.*]] = shufflevector <8 x double> [[TMP11]], <8 x double> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
> +; SSE-NEXT:    ret <8 x double> [[R71]]
> +;
> +; AVX1-LABEL: @buildvector_powi_8f64_5(
> +; AVX1-NEXT:    [[A0:%.*]] = extractelement <8 x double> [[A:%.*]], i32 0
> +; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x double> [[A]], i32 1
> +; AVX1-NEXT:    [[A2:%.*]] = extractelement <8 x double> [[A]], i32 2
> +; AVX1-NEXT:    [[A3:%.*]] = extractelement <8 x double> [[A]], i32 3
> +; AVX1-NEXT:    [[A4:%.*]] = extractelement <8 x double> [[A]], i32 4
> +; AVX1-NEXT:    [[A5:%.*]] = extractelement <8 x double> [[A]], i32 5
> +; AVX1-NEXT:    [[A6:%.*]] = extractelement <8 x double> [[A]], i32 6
> +; AVX1-NEXT:    [[A7:%.*]] = extractelement <8 x double> [[A]], i32 7
> +; AVX1-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> poison, double [[A0]], i32 0
> +; AVX1-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[A1]], i32 1
> +; AVX1-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[A2]], i32 2
> +; AVX1-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[A3]], i32 3
> +; AVX1-NEXT:    [[TMP5:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[TMP4]], i32 5)
> +; AVX1-NEXT:    [[TMP6:%.*]] = insertelement <4 x double> poison, double [[A4]], i32 0
> +; AVX1-NEXT:    [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double [[A5]], i32 1
> +; AVX1-NEXT:    [[TMP8:%.*]] = insertelement <4 x double> [[TMP7]], double [[A6]], i32 2
> +; AVX1-NEXT:    [[TMP9:%.*]] = insertelement <4 x double> [[TMP8]], double [[A7]], i32 3
> +; AVX1-NEXT:    [[TMP10:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[TMP9]], i32 5)
> +; AVX1-NEXT:    [[TMP11:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
> +; AVX1-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
> +; AVX1-NEXT:    [[R71:%.*]] = shufflevector <8 x double> [[TMP11]], <8 x double> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
> +; AVX1-NEXT:    ret <8 x double> [[R71]]
> +;
> +; AVX2-LABEL: @buildvector_powi_8f64_5(
> +; AVX2-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> [[A:%.*]], i32 5)
> +; AVX2-NEXT:    ret <8 x double> [[TMP1]]
> +;
> +; AVX512-LABEL: @buildvector_powi_8f64_5(
> +; AVX512-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> [[A:%.*]], i32 5)
> +; AVX512-NEXT:    ret <8 x double> [[TMP1]]
>   ;
>     %a0 = extractelement <8 x double> %a, i32 0
>     %a1 = extractelement <8 x double> %a, i32 1
> @@ -205,9 +411,104 @@ define <8 x double> @buildvector_powi_8f64_mismatch(<8 x double> %a) {
>   }
>   
>   define <16 x float> @buildvector_powi_16f32_n13(<16 x float> %a) {
> -; CHECK-LABEL: @buildvector_powi_16f32_n13(
> -; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> [[A:%.*]], i32 -13)
> -; CHECK-NEXT:    ret <16 x float> [[TMP1]]
> +; SSE-LABEL: @buildvector_powi_16f32_n13(
> +; SSE-NEXT:    [[A0:%.*]] = extractelement <16 x float> [[A:%.*]], i32 0
> +; SSE-NEXT:    [[A1:%.*]] = extractelement <16 x float> [[A]], i32 1
> +; SSE-NEXT:    [[A2:%.*]] = extractelement <16 x float> [[A]], i32 2
> +; SSE-NEXT:    [[A3:%.*]] = extractelement <16 x float> [[A]], i32 3
> +; SSE-NEXT:    [[A4:%.*]] = extractelement <16 x float> [[A]], i32 4
> +; SSE-NEXT:    [[A5:%.*]] = extractelement <16 x float> [[A]], i32 5
> +; SSE-NEXT:    [[A6:%.*]] = extractelement <16 x float> [[A]], i32 6
> +; SSE-NEXT:    [[A7:%.*]] = extractelement <16 x float> [[A]], i32 7
> +; SSE-NEXT:    [[A8:%.*]] = extractelement <16 x float> [[A]], i32 8
> +; SSE-NEXT:    [[A9:%.*]] = extractelement <16 x float> [[A]], i32 9
> +; SSE-NEXT:    [[A10:%.*]] = extractelement <16 x float> [[A]], i32 10
> +; SSE-NEXT:    [[A11:%.*]] = extractelement <16 x float> [[A]], i32 11
> +; SSE-NEXT:    [[A12:%.*]] = extractelement <16 x float> [[A]], i32 12
> +; SSE-NEXT:    [[A13:%.*]] = extractelement <16 x float> [[A]], i32 13
> +; SSE-NEXT:    [[A14:%.*]] = extractelement <16 x float> [[A]], i32 14
> +; SSE-NEXT:    [[A15:%.*]] = extractelement <16 x float> [[A]], i32 15
> +; SSE-NEXT:    [[TMP1:%.*]] = insertelement <8 x float> poison, float [[A0]], i32 0
> +; SSE-NEXT:    [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[A1]], i32 1
> +; SSE-NEXT:    [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[A2]], i32 2
> +; SSE-NEXT:    [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[A3]], i32 3
> +; SSE-NEXT:    [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[A4]], i32 4
> +; SSE-NEXT:    [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[A5]], i32 5
> +; SSE-NEXT:    [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[A6]], i32 6
> +; SSE-NEXT:    [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[A7]], i32 7
> +; SSE-NEXT:    [[TMP9:%.*]] = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> [[TMP8]], i32 -13)
> +; SSE-NEXT:    [[TMP10:%.*]] = insertelement <8 x float> poison, float [[A8]], i32 0
> +; SSE-NEXT:    [[TMP11:%.*]] = insertelement <8 x float> [[TMP10]], float [[A9]], i32 1
> +; SSE-NEXT:    [[TMP12:%.*]] = insertelement <8 x float> [[TMP11]], float [[A10]], i32 2
> +; SSE-NEXT:    [[TMP13:%.*]] = insertelement <8 x float> [[TMP12]], float [[A11]], i32 3
> +; SSE-NEXT:    [[TMP14:%.*]] = insertelement <8 x float> [[TMP13]], float [[A12]], i32 4
> +; SSE-NEXT:    [[TMP15:%.*]] = insertelement <8 x float> [[TMP14]], float [[A13]], i32 5
> +; SSE-NEXT:    [[TMP16:%.*]] = insertelement <8 x float> [[TMP15]], float [[A14]], i32 6
> +; SSE-NEXT:    [[TMP17:%.*]] = insertelement <8 x float> [[TMP16]], float [[A15]], i32 7
> +; SSE-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> [[TMP17]], i32 -13)
> +; SSE-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
> +; SSE-NEXT:    [[TMP20:%.*]] = shufflevector <8 x float> [[TMP18]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
> +; SSE-NEXT:    [[R151:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP20]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
> +; SSE-NEXT:    ret <16 x float> [[R151]]
> +;
> +; AVX1-LABEL: @buildvector_powi_16f32_n13(
> +; AVX1-NEXT:    [[A0:%.*]] = extractelement <16 x float> [[A:%.*]], i32 0
> +; AVX1-NEXT:    [[A1:%.*]] = extractelement <16 x float> [[A]], i32 1
> +; AVX1-NEXT:    [[A2:%.*]] = extractelement <16 x float> [[A]], i32 2
> +; AVX1-NEXT:    [[A3:%.*]] = extractelement <16 x float> [[A]], i32 3
> +; AVX1-NEXT:    [[A4:%.*]] = extractelement <16 x float> [[A]], i32 4
> +; AVX1-NEXT:    [[A5:%.*]] = extractelement <16 x float> [[A]], i32 5
> +; AVX1-NEXT:    [[A6:%.*]] = extractelement <16 x float> [[A]], i32 6
> +; AVX1-NEXT:    [[A7:%.*]] = extractelement <16 x float> [[A]], i32 7
> +; AVX1-NEXT:    [[A8:%.*]] = extractelement <16 x float> [[A]], i32 8
> +; AVX1-NEXT:    [[A9:%.*]] = extractelement <16 x float> [[A]], i32 9
> +; AVX1-NEXT:    [[A10:%.*]] = extractelement <16 x float> [[A]], i32 10
> +; AVX1-NEXT:    [[A11:%.*]] = extractelement <16 x float> [[A]], i32 11
> +; AVX1-NEXT:    [[A12:%.*]] = extractelement <16 x float> [[A]], i32 12
> +; AVX1-NEXT:    [[A13:%.*]] = extractelement <16 x float> [[A]], i32 13
> +; AVX1-NEXT:    [[A14:%.*]] = extractelement <16 x float> [[A]], i32 14
> +; AVX1-NEXT:    [[A15:%.*]] = extractelement <16 x float> [[A]], i32 15
> +; AVX1-NEXT:    [[C0:%.*]] = call float @llvm.powi.f32.i32(float [[A0]], i32 -13)
> +; AVX1-NEXT:    [[C1:%.*]] = call float @llvm.powi.f32.i32(float [[A1]], i32 -13)
> +; AVX1-NEXT:    [[C2:%.*]] = call float @llvm.powi.f32.i32(float [[A2]], i32 -13)
> +; AVX1-NEXT:    [[C3:%.*]] = call float @llvm.powi.f32.i32(float [[A3]], i32 -13)
> +; AVX1-NEXT:    [[C4:%.*]] = call float @llvm.powi.f32.i32(float [[A4]], i32 -13)
> +; AVX1-NEXT:    [[C5:%.*]] = call float @llvm.powi.f32.i32(float [[A5]], i32 -13)
> +; AVX1-NEXT:    [[C6:%.*]] = call float @llvm.powi.f32.i32(float [[A6]], i32 -13)
> +; AVX1-NEXT:    [[C7:%.*]] = call float @llvm.powi.f32.i32(float [[A7]], i32 -13)
> +; AVX1-NEXT:    [[C8:%.*]] = call float @llvm.powi.f32.i32(float [[A8]], i32 -13)
> +; AVX1-NEXT:    [[C9:%.*]] = call float @llvm.powi.f32.i32(float [[A9]], i32 -13)
> +; AVX1-NEXT:    [[C10:%.*]] = call float @llvm.powi.f32.i32(float [[A10]], i32 -13)
> +; AVX1-NEXT:    [[C11:%.*]] = call float @llvm.powi.f32.i32(float [[A11]], i32 -13)
> +; AVX1-NEXT:    [[C12:%.*]] = call float @llvm.powi.f32.i32(float [[A12]], i32 -13)
> +; AVX1-NEXT:    [[C13:%.*]] = call float @llvm.powi.f32.i32(float [[A13]], i32 -13)
> +; AVX1-NEXT:    [[C14:%.*]] = call float @llvm.powi.f32.i32(float [[A14]], i32 -13)
> +; AVX1-NEXT:    [[C15:%.*]] = call float @llvm.powi.f32.i32(float [[A15]], i32 -13)
> +; AVX1-NEXT:    [[R0:%.*]] = insertelement <16 x float> poison, float [[C0]], i32 0
> +; AVX1-NEXT:    [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[C1]], i32 1
> +; AVX1-NEXT:    [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[C2]], i32 2
> +; AVX1-NEXT:    [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[C3]], i32 3
> +; AVX1-NEXT:    [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[C4]], i32 4
> +; AVX1-NEXT:    [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[C5]], i32 5
> +; AVX1-NEXT:    [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[C6]], i32 6
> +; AVX1-NEXT:    [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[C7]], i32 7
> +; AVX1-NEXT:    [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[C8]], i32 8
> +; AVX1-NEXT:    [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[C9]], i32 9
> +; AVX1-NEXT:    [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[C10]], i32 10
> +; AVX1-NEXT:    [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[C11]], i32 11
> +; AVX1-NEXT:    [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[C12]], i32 12
> +; AVX1-NEXT:    [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[C13]], i32 13
> +; AVX1-NEXT:    [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[C14]], i32 14
> +; AVX1-NEXT:    [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[C15]], i32 15
> +; AVX1-NEXT:    ret <16 x float> [[R15]]
> +;
> +; AVX2-LABEL: @buildvector_powi_16f32_n13(
> +; AVX2-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> [[A:%.*]], i32 -13)
> +; AVX2-NEXT:    ret <16 x float> [[TMP1]]
> +;
> +; AVX512-LABEL: @buildvector_powi_16f32_n13(
> +; AVX512-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> [[A:%.*]], i32 -13)
> +; AVX512-NEXT:    ret <16 x float> [[TMP1]]
>   ;
>     %a0  = extractelement <16 x float> %a, i32 0
>     %a1  = extractelement <16 x float> %a, i32 1
>
>
>          
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits
    
    
More information about the llvm-commits
mailing list