[llvm] [SLP]Gather scalarized calls (PR #125070)

Thu Jan 30 07:16:11 PST 2025

llvmbot wrote:



@llvm/pr-subscribers-backend-nvptx

@llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)

<details>
<summary>Changes</summary>

If the calls won't be vectorized, but will be scalarized after
vectorization, they should be build as buildvector nodes, not vector
nodes. Vectorization of such calls leads to incorrect cost estimation,
does not allow to calculate correctly spills costs.


---

Patch is 113.24 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/125070.diff


13 Files Affected:

- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+66-52) 
- (modified) llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll (+130-86) 
- (modified) llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll (+130-86) 
- (modified) llvm/test/Transforms/SLPVectorizer/NVPTX/vectorizable-intrinsic.ll (+13-4) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll (+18-12) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll (+18-12) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/call.ll (+44-19) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll (+33-7) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/intrinsic.ll (+29-5) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/intrinsic_with_scalar_param.ll (+18-3) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/powi-regression.ll (+18-2) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/powi.ll (+126-6) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll (+16-7) 


``````````diff

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 5c02bc7bfa90aa..0b28cec8fcd2f1 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -7634,6 +7634,66 @@ bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
            NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
 }
 
+/// Builds the arguments types vector for the given call instruction with the
+/// given \p ID for the specified vector factor.
+static SmallVector<Type *>
+buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
+                       const unsigned VF, unsigned MinBW,
+                       const TargetTransformInfo *TTI) {
+  SmallVector<Type *> ArgTys;
+  for (auto [Idx, Arg] : enumerate(CI->args())) {
+    if (ID != Intrinsic::not_intrinsic) {
+      if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
+        ArgTys.push_back(Arg->getType());
+        continue;
+      }
+      if (MinBW > 0) {
+        ArgTys.push_back(
+            getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
+        continue;
+      }
+    }
+    ArgTys.push_back(getWidenedType(Arg->getType(), VF));
+  }
+  return ArgTys;
+}
+
+/// Calculates the costs of vectorized intrinsic (if possible) and vectorized
+/// function (if possible) calls. Returns invalid cost for the corresponding
+/// calls, if they cannot be vectorized/will be scalarized.
+static std::pair<InstructionCost, InstructionCost>
+getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
+                   TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
+                   ArrayRef<Type *> ArgTys) {
+  auto Shape = VFShape::get(CI->getFunctionType(),
+                            ElementCount::getFixed(VecTy->getNumElements()),
+                            false /*HasGlobalPred*/);
+  Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+  auto LibCost = InstructionCost::getInvalid();
+  if (!CI->isNoBuiltin() && VecFunc) {
+    // Calculate the cost of the vector library call.
+    // If the corresponding vector call is cheaper, return its cost.
+    LibCost =
+        TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
+  }
+  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+  // Calculate the cost of the vector intrinsic call.
+  FastMathFlags FMF;
+  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
+    FMF = FPCI->getFastMathFlags();
+  const InstructionCost ScalarLimit = 10000;
+  IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
+                                    LibCost.isValid() ? LibCost : ScalarLimit);
+  auto IntrinsicCost =
+      TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
+  if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
+      (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
+    IntrinsicCost = InstructionCost::getInvalid();
+
+  return {IntrinsicCost, LibCost};
+}
+
 BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     const InstructionsState &S, ArrayRef<Value *> VL,
     bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
@@ -7974,6 +8034,12 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
         return TreeEntry::NeedToGather;
       }
     }
+    SmallVector<Type *> ArgTys =
+        buildIntrinsicArgTypes(CI, ID, VL.size(), 0, TTI);
+    auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
+    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
+    if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
+      return TreeEntry::NeedToGather;
 
     return TreeEntry::Vectorize;
   }
@@ -9017,34 +9083,6 @@ bool BoUpSLP::areAllUsersVectorized(
          });
 }
 
-static std::pair<InstructionCost, InstructionCost>
-getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
-                   TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
-                   ArrayRef<Type *> ArgTys) {
-  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-
-  // Calculate the cost of the scalar and vector calls.
-  FastMathFlags FMF;
-  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
-    FMF = FPCI->getFastMathFlags();
-  IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF);
-  auto IntrinsicCost =
-    TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
-
-  auto Shape = VFShape::get(CI->getFunctionType(),
-                            ElementCount::getFixed(VecTy->getNumElements()),
-                            false /*HasGlobalPred*/);
-  Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
-  auto LibCost = IntrinsicCost;
-  if (!CI->isNoBuiltin() && VecFunc) {
-    // Calculate the cost of the vector library call.
-    // If the corresponding vector call is cheaper, return its cost.
-    LibCost =
-        TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
-  }
-  return {IntrinsicCost, LibCost};
-}
-
 void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
     const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
     SmallVectorImpl<Value *> *OpScalars,
@@ -11045,30 +11083,6 @@ TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
   return TTI::CastContextHint::None;
 }
 
-/// Builds the arguments types vector for the given call instruction with the
-/// given \p ID for the specified vector factor.
-static SmallVector<Type *>
-buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
-                       const unsigned VF, unsigned MinBW,
-                       const TargetTransformInfo *TTI) {
-  SmallVector<Type *> ArgTys;
-  for (auto [Idx, Arg] : enumerate(CI->args())) {
-    if (ID != Intrinsic::not_intrinsic) {
-      if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
-        ArgTys.push_back(Arg->getType());
-        continue;
-      }
-      if (MinBW > 0) {
-        ArgTys.push_back(
-            getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
-        continue;
-      }
-    }
-    ArgTys.push_back(getWidenedType(Arg->getType(), VF));
-  }
-  return ArgTys;
-}
-
 InstructionCost
 BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                       SmallPtrSetImpl<Value *> &CheckedExtracts) {
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll
index e1fc7e056e0978..0d80affcbb4f4d 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll
@@ -24,10 +24,12 @@ define <4 x float> @int_sin_4x(ptr %a) {
 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])
+; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
+; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
@@ -217,10 +219,12 @@ define <4 x float> @exp_4x(ptr %a) {
 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]])
+; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
@@ -297,10 +301,12 @@ define <4 x float> @log_4x(ptr %a) {
 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]])
+; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
@@ -470,10 +476,12 @@ define <4 x float> @sin_4x(ptr %a) {
 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]])
+; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
@@ -509,10 +517,12 @@ define <4 x float> @cos_4x(ptr %a) {
 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]])
 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @cosf(float [[VECEXT_2]])
+; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @cosf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
@@ -548,10 +558,12 @@ define <4 x float> @tan_4x(ptr %a) {
 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @tanf(float [[VECEXT_1]])
 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.tan.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @tanf(float [[VECEXT_2]])
+; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @tanf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
@@ -587,10 +599,12 @@ define <4 x float> @asin_4x(ptr %a) {
 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @asinf(float [[VECEXT_1]])
 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @asinf(float [[VECEXT_2]])
+; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @asinf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
@@ -625,10 +639,12 @@ define <4 x float> @int_asin_4x(ptr %a) {
 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_1]])
 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_2]])
+; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_3]])
+; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
@@ -664,10 +680,12 @@ define <4 x float> @acos_4x(ptr %a) {
 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @acosf(float [[VECEXT_1]])
 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @acosf(float [[VECEXT_2]])
+; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @acosf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
@@ -702,10 +720,12 @@ define <4 x float> @int_acos_4x(ptr %a) {
 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_1]])
 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/125070