[llvm] [SLP]Gather scalarized calls (PR #125070)
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 4 16:09:39 PST 2025
https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/125070
>From 0971c40727e543e71fe744e37c17564af17fb03c Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Thu, 30 Jan 2025 15:13:43 +0000
Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?=
=?UTF-8?q?l=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.5
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 118 +++++-----
...ccelerate-vector-functions-inseltpoison.ll | 216 +++++++++++-------
.../AArch64/accelerate-vector-functions.ll | 216 +++++++++++-------
.../NVPTX/vectorizable-intrinsic.ll | 17 +-
.../X86/alternate-calls-inseltpoison.ll | 30 ++-
.../SLPVectorizer/X86/alternate-calls.ll | 30 ++-
.../test/Transforms/SLPVectorizer/X86/call.ll | 63 +++--
.../SLPVectorizer/X86/extract_in_tree_user.ll | 40 +++-
.../Transforms/SLPVectorizer/X86/intrinsic.ll | 34 ++-
.../X86/intrinsic_with_scalar_param.ll | 21 +-
.../SLPVectorizer/X86/powi-regression.ll | 20 +-
.../test/Transforms/SLPVectorizer/X86/powi.ll | 132 ++++++++++-
.../Transforms/SLPVectorizer/X86/sin-sqrt.ll | 23 +-
13 files changed, 659 insertions(+), 301 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 5c02bc7bfa90aa..0b28cec8fcd2f1 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -7634,6 +7634,66 @@ bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
}
+/// Builds the arguments types vector for the given call instruction with the
+/// given \p ID for the specified vector factor.
+static SmallVector<Type *>
+buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
+ const unsigned VF, unsigned MinBW,
+ const TargetTransformInfo *TTI) {
+ SmallVector<Type *> ArgTys;
+ for (auto [Idx, Arg] : enumerate(CI->args())) {
+ if (ID != Intrinsic::not_intrinsic) {
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
+ ArgTys.push_back(Arg->getType());
+ continue;
+ }
+ if (MinBW > 0) {
+ ArgTys.push_back(
+ getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
+ continue;
+ }
+ }
+ ArgTys.push_back(getWidenedType(Arg->getType(), VF));
+ }
+ return ArgTys;
+}
+
+/// Calculates the costs of vectorized intrinsic (if possible) and vectorized
+/// function (if possible) calls. Returns invalid cost for the corresponding
+/// calls, if they cannot be vectorized/will be scalarized.
+static std::pair<InstructionCost, InstructionCost>
+getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
+ TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
+ ArrayRef<Type *> ArgTys) {
+ auto Shape = VFShape::get(CI->getFunctionType(),
+ ElementCount::getFixed(VecTy->getNumElements()),
+ false /*HasGlobalPred*/);
+ Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+ auto LibCost = InstructionCost::getInvalid();
+ if (!CI->isNoBuiltin() && VecFunc) {
+ // Calculate the cost of the vector library call.
+ // If the corresponding vector call is cheaper, return its cost.
+ LibCost =
+ TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
+ }
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+ // Calculate the cost of the vector intrinsic call.
+ FastMathFlags FMF;
+ if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
+ FMF = FPCI->getFastMathFlags();
+ const InstructionCost ScalarLimit = 10000;
+ IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
+ LibCost.isValid() ? LibCost : ScalarLimit);
+ auto IntrinsicCost =
+ TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
+ if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
+ (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
+ IntrinsicCost = InstructionCost::getInvalid();
+
+ return {IntrinsicCost, LibCost};
+}
+
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
const InstructionsState &S, ArrayRef<Value *> VL,
bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
@@ -7974,6 +8034,12 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
return TreeEntry::NeedToGather;
}
}
+ SmallVector<Type *> ArgTys =
+ buildIntrinsicArgTypes(CI, ID, VL.size(), 0, TTI);
+ auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
+ auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
+ if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
+ return TreeEntry::NeedToGather;
return TreeEntry::Vectorize;
}
@@ -9017,34 +9083,6 @@ bool BoUpSLP::areAllUsersVectorized(
});
}
-static std::pair<InstructionCost, InstructionCost>
-getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
- TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
- ArrayRef<Type *> ArgTys) {
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-
- // Calculate the cost of the scalar and vector calls.
- FastMathFlags FMF;
- if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
- FMF = FPCI->getFastMathFlags();
- IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF);
- auto IntrinsicCost =
- TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
-
- auto Shape = VFShape::get(CI->getFunctionType(),
- ElementCount::getFixed(VecTy->getNumElements()),
- false /*HasGlobalPred*/);
- Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
- auto LibCost = IntrinsicCost;
- if (!CI->isNoBuiltin() && VecFunc) {
- // Calculate the cost of the vector library call.
- // If the corresponding vector call is cheaper, return its cost.
- LibCost =
- TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
- }
- return {IntrinsicCost, LibCost};
-}
-
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
SmallVectorImpl<Value *> *OpScalars,
@@ -11045,30 +11083,6 @@ TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
return TTI::CastContextHint::None;
}
-/// Builds the arguments types vector for the given call instruction with the
-/// given \p ID for the specified vector factor.
-static SmallVector<Type *>
-buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
- const unsigned VF, unsigned MinBW,
- const TargetTransformInfo *TTI) {
- SmallVector<Type *> ArgTys;
- for (auto [Idx, Arg] : enumerate(CI->args())) {
- if (ID != Intrinsic::not_intrinsic) {
- if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
- ArgTys.push_back(Arg->getType());
- continue;
- }
- if (MinBW > 0) {
- ArgTys.push_back(
- getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
- continue;
- }
- }
- ArgTys.push_back(getWidenedType(Arg->getType(), VF));
- }
- return ArgTys;
-}
-
InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
SmallPtrSetImpl<Value *> &CheckedExtracts) {
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll
index e1fc7e056e0978..0d80affcbb4f4d 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll
@@ -24,10 +24,12 @@ define <4 x float> @int_sin_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -217,10 +219,12 @@ define <4 x float> @exp_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -297,10 +301,12 @@ define <4 x float> @log_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -470,10 +476,12 @@ define <4 x float> @sin_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -509,10 +517,12 @@ define <4 x float> @cos_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @cosf(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @cosf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -548,10 +558,12 @@ define <4 x float> @tan_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @tanf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.tan.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @tanf(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @tanf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -587,10 +599,12 @@ define <4 x float> @asin_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @asinf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @asinf(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @asinf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -625,10 +639,12 @@ define <4 x float> @int_asin_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -664,10 +680,12 @@ define <4 x float> @acos_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @acosf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @acosf(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @acosf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -702,10 +720,12 @@ define <4 x float> @int_acos_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -741,10 +761,12 @@ define <4 x float> @atan_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @atanf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @atanf(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @atanf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -779,10 +801,12 @@ define <4 x float> @int_atan_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -822,11 +846,14 @@ define <4 x float> @atan2_4x(ptr %a, ptr %b) {
; NOACCELERATE-NEXT: [[VECEXTB_1:%.*]] = extractelement <4 x float> [[BB]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @atan2f(float [[VECEXT_1]], float [[VECEXTB_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[BB]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.atan2.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]])
-; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[VECEXTB_2:%.*]] = extractelement <4 x float> [[BB]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @atan2f(float [[VECEXT_2]], float [[VECEXTB_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[VECEXTB_3:%.*]] = extractelement <4 x float> [[BB]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @atan2f(float [[VECEXT_3]], float [[VECEXTB_3]])
+; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
@@ -870,11 +897,14 @@ define <4 x float> @int_atan2_4x(ptr %a, ptr %b) {
; NOACCELERATE-NEXT: [[VECEXTB_1:%.*]] = extractelement <4 x float> [[BB]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.atan2.f32(float [[VECEXT_1]], float [[VECEXTB_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[BB]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.atan2.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]])
-; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[VECEXTB_2:%.*]] = extractelement <4 x float> [[BB]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.atan2.f32(float [[VECEXT_2]], float [[VECEXTB_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[VECEXTB_3:%.*]] = extractelement <4 x float> [[BB]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.atan2.f32(float [[VECEXT_3]], float [[VECEXTB_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -915,10 +945,12 @@ define <4 x float> @sinh_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @sinhf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sinh.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @sinhf(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @sinhf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -953,10 +985,12 @@ define <4 x float> @int_sinh_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sinh.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -992,10 +1026,12 @@ define <4 x float> @cosh_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @coshf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @coshf(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @coshf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -1030,10 +1066,12 @@ define <4 x float> @int_cosh_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -1069,10 +1107,12 @@ define <4 x float> @tanh_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @tanhf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @tanhf(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @tanhf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -1107,10 +1147,12 @@ define <4 x float> @int_tanh_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -1308,10 +1350,12 @@ define <4 x float> @int_cos_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll
index 666514a2127bba..34d65a307016fa 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll
@@ -24,10 +24,12 @@ define <4 x float> @int_sin_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -217,10 +219,12 @@ define <4 x float> @exp_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -297,10 +301,12 @@ define <4 x float> @log_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -470,10 +476,12 @@ define <4 x float> @sin_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -509,10 +517,12 @@ define <4 x float> @cos_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @cosf(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @cosf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -548,10 +558,12 @@ define <4 x float> @tan_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @tanf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.tan.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @tanf(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @tanf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -587,10 +599,12 @@ define <4 x float> @asin_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @asinf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @asinf(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @asinf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -625,10 +639,12 @@ define <4 x float> @int_asin_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -664,10 +680,12 @@ define <4 x float> @acos_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @acosf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @acosf(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @acosf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -702,10 +720,12 @@ define <4 x float> @int_acos_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -741,10 +761,12 @@ define <4 x float> @atan_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @atanf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @atanf(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @atanf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -779,10 +801,12 @@ define <4 x float> @int_atan_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -822,11 +846,14 @@ define <4 x float> @atan2_4x(ptr %a, ptr %b) {
; NOACCELERATE-NEXT: [[VECEXTB_1:%.*]] = extractelement <4 x float> [[BB]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @atan2f(float [[VECEXT_1]], float [[VECEXTB_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[BB]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.atan2.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]])
-; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[VECEXTB_2:%.*]] = extractelement <4 x float> [[BB]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @atan2f(float [[VECEXT_2]], float [[VECEXTB_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[VECEXTB_3:%.*]] = extractelement <4 x float> [[BB]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @atan2f(float [[VECEXT_3]], float [[VECEXTB_3]])
+; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
@@ -870,11 +897,14 @@ define <4 x float> @int_atan2_4x(ptr %a, ptr %b) {
; NOACCELERATE-NEXT: [[VECEXTB_1:%.*]] = extractelement <4 x float> [[BB]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.atan2.f32(float [[VECEXT_1]], float [[VECEXTB_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[BB]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.atan2.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]])
-; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[VECEXTB_2:%.*]] = extractelement <4 x float> [[BB]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.atan2.f32(float [[VECEXT_2]], float [[VECEXTB_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[VECEXTB_3:%.*]] = extractelement <4 x float> [[BB]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.atan2.f32(float [[VECEXT_3]], float [[VECEXTB_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -915,10 +945,12 @@ define <4 x float> @sinh_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @sinhf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sinh.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @sinhf(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @sinhf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -953,10 +985,12 @@ define <4 x float> @int_sinh_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sinh.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -992,10 +1026,12 @@ define <4 x float> @cosh_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @coshf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @coshf(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @coshf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -1030,10 +1066,12 @@ define <4 x float> @int_cosh_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -1069,10 +1107,12 @@ define <4 x float> @tanh_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @tanhf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @tanhf(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @tanhf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -1107,10 +1147,12 @@ define <4 x float> @int_tanh_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
@@ -1308,10 +1350,12 @@ define <4 x float> @int_cos_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]])
+; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]])
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/NVPTX/vectorizable-intrinsic.ll b/llvm/test/Transforms/SLPVectorizer/NVPTX/vectorizable-intrinsic.ll
index 114bf585b01635..3a562586d69d80 100644
--- a/llvm/test/Transforms/SLPVectorizer/NVPTX/vectorizable-intrinsic.ll
+++ b/llvm/test/Transforms/SLPVectorizer/NVPTX/vectorizable-intrinsic.ll
@@ -4,13 +4,17 @@
target datalayout = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64"
target triple = "nvptx--nvidiacl"
-; Test that CTLZ can be vectorized currently even though the second argument is a scalar
-
+; Vector versions of the intrinsics are scalarized, so keep them scalar
define <2 x i8> @cltz_test(<2 x i8> %x) #0 {
; CHECK-LABEL: define <2 x i8> @cltz_test(
; CHECK-SAME: <2 x i8> [[X:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VEC:%.*]] = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> [[X]], i1 false)
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i8> [[X]], i32 0
+; CHECK-NEXT: [[CALL_I:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP0]], i1 false)
+; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <2 x i8> zeroinitializer, i8 [[CALL_I]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i8> [[X]], i32 1
+; CHECK-NEXT: [[CALL_I4:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP1]], i1 false)
+; CHECK-NEXT: [[VEC:%.*]] = insertelement <2 x i8> [[VECINIT]], i8 [[CALL_I4]], i32 1
; CHECK-NEXT: ret <2 x i8> [[VEC]]
;
entry:
@@ -28,7 +32,12 @@ define <2 x i8> @cltz_test_poison(<2 x i8> %x) #0 {
; CHECK-LABEL: define <2 x i8> @cltz_test_poison(
; CHECK-SAME: <2 x i8> [[X:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VEC:%.*]] = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> [[X]], i1 false)
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i8> [[X]], i32 0
+; CHECK-NEXT: [[CALL_I:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP0]], i1 false)
+; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <2 x i8> poison, i8 [[CALL_I]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i8> [[X]], i32 1
+; CHECK-NEXT: [[CALL_I4:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP1]], i1 false)
+; CHECK-NEXT: [[VEC:%.*]] = insertelement <2 x i8> [[VECINIT]], i8 [[CALL_I4]], i32 1
; CHECK-NEXT: ret <2 x i8> [[VEC]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
index 78dd39be777cda..b790e6f3c99c6e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
@@ -9,23 +9,29 @@
define <8 x float> @ceil_floor(<8 x float> %a) {
; SSE-LABEL: @ceil_floor(
; SSE-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
+; SSE-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i64 1
+; SSE-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i64 2
; SSE-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
+; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i64 4
+; SSE-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i64 5
+; SSE-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i64 6
+; SSE-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
; SSE-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
-; SSE-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]])
+; SSE-NEXT: [[AB1:%.*]] = call float @llvm.floor.f32(float [[A1]])
+; SSE-NEXT: [[AB2:%.*]] = call float @llvm.floor.f32(float [[A2]])
; SSE-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; SSE-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
-; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; SSE-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
+; SSE-NEXT: [[AB4:%.*]] = call float @llvm.ceil.f32(float [[A4]])
+; SSE-NEXT: [[AB5:%.*]] = call float @llvm.ceil.f32(float [[A5]])
+; SSE-NEXT: [[AB6:%.*]] = call float @llvm.floor.f32(float [[A6]])
+; SSE-NEXT: [[AB7:%.*]] = call float @llvm.floor.f32(float [[A7]])
; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
-; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i64 1
+; SSE-NEXT: [[R23:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i64 2
; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3
-; SSE-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
-; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; SSE-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i64 4
+; SSE-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i64 5
+; SSE-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i64 6
+; SSE-NEXT: [[R71:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i64 7
; SSE-NEXT: ret <8 x float> [[R71]]
;
; SLM-LABEL: @ceil_floor(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
index 27554e6c46d055..ef1a67032c2372 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
@@ -9,23 +9,29 @@
define <8 x float> @ceil_floor(<8 x float> %a) {
; SSE-LABEL: @ceil_floor(
; SSE-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
+; SSE-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i64 1
+; SSE-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i64 2
; SSE-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
+; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i64 4
+; SSE-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i64 5
+; SSE-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i64 6
+; SSE-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
; SSE-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
-; SSE-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]])
+; SSE-NEXT: [[AB1:%.*]] = call float @llvm.floor.f32(float [[A1]])
+; SSE-NEXT: [[AB2:%.*]] = call float @llvm.floor.f32(float [[A2]])
; SSE-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; SSE-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
-; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; SSE-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
+; SSE-NEXT: [[AB4:%.*]] = call float @llvm.ceil.f32(float [[A4]])
+; SSE-NEXT: [[AB5:%.*]] = call float @llvm.ceil.f32(float [[A5]])
+; SSE-NEXT: [[AB6:%.*]] = call float @llvm.floor.f32(float [[A6]])
+; SSE-NEXT: [[AB7:%.*]] = call float @llvm.floor.f32(float [[A7]])
; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
-; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i64 1
+; SSE-NEXT: [[R23:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i64 2
; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3
-; SSE-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
-; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; SSE-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i64 4
+; SSE-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i64 5
+; SSE-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i64 6
+; SSE-NEXT: [[R71:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i64 7
; SSE-NEXT: ret <8 x float> [[R71]]
;
; SLM-LABEL: @ceil_floor(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/call.ll b/llvm/test/Transforms/SLPVectorizer/X86/call.ll
index 8835e3b144be65..548a63c3a8cb11 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/call.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/call.ll
@@ -15,9 +15,14 @@ declare i64 @round(i64) nounwind willreturn
define void @sin_libm(ptr %a, ptr %b) {
; CHECK-LABEL: @sin_libm(
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP2]])
-; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[B:%.*]], align 8
+; CHECK-NEXT: [[A0:%.*]] = load double, ptr [[A:%.*]], align 8
+; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, ptr [[A]], i64 1
+; CHECK-NEXT: [[A1:%.*]] = load double, ptr [[IDX1]], align 8
+; CHECK-NEXT: [[SIN1:%.*]] = tail call double @sin(double [[A0]]) #[[ATTR3:[0-9]+]]
+; CHECK-NEXT: [[SIN2:%.*]] = tail call double @sin(double [[A1]]) #[[ATTR3]]
+; CHECK-NEXT: store double [[SIN1]], ptr [[B:%.*]], align 8
+; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds double, ptr [[B]], i64 1
+; CHECK-NEXT: store double [[SIN2]], ptr [[IDX2]], align 8
; CHECK-NEXT: ret void
;
%a0 = load double, ptr %a, align 8
@@ -33,9 +38,14 @@ define void @sin_libm(ptr %a, ptr %b) {
define void @cos_libm(ptr %a, ptr %b) {
; CHECK-LABEL: @cos_libm(
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.cos.v2f64(<2 x double> [[TMP2]])
-; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[B:%.*]], align 8
+; CHECK-NEXT: [[A0:%.*]] = load double, ptr [[A:%.*]], align 8
+; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, ptr [[A]], i64 1
+; CHECK-NEXT: [[A1:%.*]] = load double, ptr [[IDX1]], align 8
+; CHECK-NEXT: [[COS1:%.*]] = tail call double @cos(double [[A0]]) #[[ATTR3]]
+; CHECK-NEXT: [[COS2:%.*]] = tail call double @cos(double [[A1]]) #[[ATTR3]]
+; CHECK-NEXT: store double [[COS1]], ptr [[B:%.*]], align 8
+; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds double, ptr [[B]], i64 1
+; CHECK-NEXT: store double [[COS2]], ptr [[IDX2]], align 8
; CHECK-NEXT: ret void
;
%a0 = load double, ptr %a, align 8
@@ -51,9 +61,14 @@ define void @cos_libm(ptr %a, ptr %b) {
define void @tan_libm(ptr %a, ptr %b) {
; CHECK-LABEL: @tan_libm(
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.tan.v2f64(<2 x double> [[TMP2]])
-; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[B:%.*]], align 8
+; CHECK-NEXT: [[A0:%.*]] = load double, ptr [[A:%.*]], align 8
+; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, ptr [[A]], i64 1
+; CHECK-NEXT: [[A1:%.*]] = load double, ptr [[IDX1]], align 8
+; CHECK-NEXT: [[TAN1:%.*]] = tail call double @tan(double [[A0]]) #[[ATTR3]]
+; CHECK-NEXT: [[TAN2:%.*]] = tail call double @tan(double [[A1]]) #[[ATTR3]]
+; CHECK-NEXT: store double [[TAN1]], ptr [[B:%.*]], align 8
+; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds double, ptr [[B]], i64 1
+; CHECK-NEXT: store double [[TAN2]], ptr [[IDX2]], align 8
; CHECK-NEXT: ret void
;
%a0 = load double, ptr %a, align 8
@@ -69,9 +84,14 @@ define void @tan_libm(ptr %a, ptr %b) {
define void @pow_libm(ptr %a, ptr %b) {
; CHECK-LABEL: @pow_libm(
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.pow.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP2]])
-; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[B:%.*]], align 8
+; CHECK-NEXT: [[A0:%.*]] = load double, ptr [[A:%.*]], align 8
+; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, ptr [[A]], i64 1
+; CHECK-NEXT: [[A1:%.*]] = load double, ptr [[IDX1]], align 8
+; CHECK-NEXT: [[POW1:%.*]] = tail call double @pow(double [[A0]], double [[A0]]) #[[ATTR3]]
+; CHECK-NEXT: [[POW2:%.*]] = tail call double @pow(double [[A1]], double [[A1]]) #[[ATTR3]]
+; CHECK-NEXT: store double [[POW1]], ptr [[B:%.*]], align 8
+; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds double, ptr [[B]], i64 1
+; CHECK-NEXT: store double [[POW2]], ptr [[IDX2]], align 8
; CHECK-NEXT: ret void
;
%a0 = load double, ptr %a, align 8
@@ -87,9 +107,14 @@ define void @pow_libm(ptr %a, ptr %b) {
define void @exp_libm(ptr %a, ptr %b) {
; CHECK-LABEL: @exp_libm(
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.exp2.v2f64(<2 x double> [[TMP2]])
-; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[B:%.*]], align 8
+; CHECK-NEXT: [[A0:%.*]] = load double, ptr [[A:%.*]], align 8
+; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, ptr [[A]], i64 1
+; CHECK-NEXT: [[A1:%.*]] = load double, ptr [[IDX1]], align 8
+; CHECK-NEXT: [[EXP1:%.*]] = tail call double @exp2(double [[A0]]) #[[ATTR3]]
+; CHECK-NEXT: [[EXP2:%.*]] = tail call double @exp2(double [[A1]]) #[[ATTR3]]
+; CHECK-NEXT: store double [[EXP1]], ptr [[B:%.*]], align 8
+; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds double, ptr [[B]], i64 1
+; CHECK-NEXT: store double [[EXP2]], ptr [[IDX2]], align 8
; CHECK-NEXT: ret void
;
%a0 = load double, ptr %a, align 8
@@ -134,8 +159,8 @@ define void @sqrt_libm_errno(ptr %a, ptr %b) {
; CHECK-NEXT: [[A0:%.*]] = load double, ptr [[A:%.*]], align 8
; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, ptr [[A]], i64 1
; CHECK-NEXT: [[A1:%.*]] = load double, ptr [[IDX1]], align 8
-; CHECK-NEXT: [[SQRT1:%.*]] = tail call nnan double @sqrt(double [[A0]]) #[[ATTR3:[0-9]+]]
-; CHECK-NEXT: [[SQRT2:%.*]] = tail call nnan double @sqrt(double [[A1]]) #[[ATTR3]]
+; CHECK-NEXT: [[SQRT1:%.*]] = tail call nnan double @sqrt(double [[A0]]) #[[ATTR4:[0-9]+]]
+; CHECK-NEXT: [[SQRT2:%.*]] = tail call nnan double @sqrt(double [[A1]]) #[[ATTR4]]
; CHECK-NEXT: store double [[SQRT1]], ptr [[B:%.*]], align 8
; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds double, ptr [[B]], i64 1
; CHECK-NEXT: store double [[SQRT2]], ptr [[IDX2]], align 8
@@ -158,8 +183,8 @@ define void @round_custom(ptr %a, ptr %b) {
; CHECK-NEXT: [[A0:%.*]] = load i64, ptr [[A:%.*]], align 8
; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 1
; CHECK-NEXT: [[A1:%.*]] = load i64, ptr [[IDX1]], align 8
-; CHECK-NEXT: [[ROUND1:%.*]] = tail call i64 @round(i64 [[A0]]) #[[ATTR4:[0-9]+]]
-; CHECK-NEXT: [[ROUND2:%.*]] = tail call i64 @round(i64 [[A1]]) #[[ATTR4]]
+; CHECK-NEXT: [[ROUND1:%.*]] = tail call i64 @round(i64 [[A0]]) #[[ATTR3]]
+; CHECK-NEXT: [[ROUND2:%.*]] = tail call i64 @round(i64 [[A1]]) #[[ATTR3]]
; CHECK-NEXT: store i64 [[ROUND1]], ptr [[B:%.*]], align 8
; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 1
; CHECK-NEXT: store i64 [[ROUND2]], ptr [[IDX2]], align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll
index fbde1e839cfcfb..e7d7ce8ee4a3fb 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll
@@ -34,13 +34,39 @@ declare float @llvm.powi.f32.i32(float, i32)
define void @fn2(ptr %a, ptr %b, ptr %c) {
; CHECK-LABEL: @fn2(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[A:%.*]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[TMP0]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x float>
-; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP4]], i32 [[TMP3]])
-; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[C:%.*]], align 4
+; CHECK-NEXT: [[I0:%.*]] = load i32, ptr [[A:%.*]], align 4
+; CHECK-NEXT: [[I1:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[I0]], [[I1]]
+; CHECK-NEXT: [[FP1:%.*]] = sitofp i32 [[ADD1]] to float
+; CHECK-NEXT: [[CALL1:%.*]] = tail call float @llvm.powi.f32.i32(float [[FP1]], i32 [[ADD1]]) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1
+; CHECK-NEXT: [[I2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1
+; CHECK-NEXT: [[I3:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[I2]], [[I3]]
+; CHECK-NEXT: [[FP2:%.*]] = sitofp i32 [[ADD2]] to float
+; CHECK-NEXT: [[CALL2:%.*]] = tail call float @llvm.powi.f32.i32(float [[FP2]], i32 [[ADD1]]) #[[ATTR2]]
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2
+; CHECK-NEXT: [[I4:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2
+; CHECK-NEXT: [[I5:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[I4]], [[I5]]
+; CHECK-NEXT: [[FP3:%.*]] = sitofp i32 [[ADD3]] to float
+; CHECK-NEXT: [[CALL3:%.*]] = tail call float @llvm.powi.f32.i32(float [[FP3]], i32 [[ADD1]]) #[[ATTR2]]
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 3
+; CHECK-NEXT: [[I6:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 3
+; CHECK-NEXT: [[I7:%.*]] = load i32, ptr [[ARRAYIDX7]], align 4
+; CHECK-NEXT: [[ADD4:%.*]] = add i32 [[I6]], [[I7]]
+; CHECK-NEXT: [[FP4:%.*]] = sitofp i32 [[ADD4]] to float
+; CHECK-NEXT: [[CALL4:%.*]] = tail call float @llvm.powi.f32.i32(float [[FP4]], i32 [[ADD1]]) #[[ATTR2]]
+; CHECK-NEXT: store float [[CALL1]], ptr [[C:%.*]], align 4
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[C]], i32 1
+; CHECK-NEXT: store float [[CALL2]], ptr [[ARRAYIDX8]], align 4
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[C]], i32 2
+; CHECK-NEXT: store float [[CALL3]], ptr [[ARRAYIDX9]], align 4
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[C]], i32 3
+; CHECK-NEXT: store float [[CALL4]], ptr [[ARRAYIDX10]], align 4
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/intrinsic.ll b/llvm/test/Transforms/SLPVectorizer/X86/intrinsic.ll
index 0eaf89da366011..bb7d54c8a78c3f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/intrinsic.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/intrinsic.ll
@@ -378,11 +378,35 @@ declare float @llvm.powi.f32.i32(float, i32)
define void @vec_powi_f32(ptr %a, ptr %b, ptr %c, i32 %P) {
; CHECK-LABEL: @vec_powi_f32(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP4]], i32 [[P:%.*]])
-; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[C:%.*]], align 4
+; CHECK-NEXT: [[I0:%.*]] = load float, ptr [[A:%.*]], align 4
+; CHECK-NEXT: [[I1:%.*]] = load float, ptr [[B:%.*]], align 4
+; CHECK-NEXT: [[ADD1:%.*]] = fadd float [[I0]], [[I1]]
+; CHECK-NEXT: [[CALL1:%.*]] = tail call float @llvm.powi.f32.i32(float [[ADD1]], i32 [[P:%.*]]) #[[ATTR3]]
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 1
+; CHECK-NEXT: [[I2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[B]], i32 1
+; CHECK-NEXT: [[I3:%.*]] = load float, ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT: [[ADD2:%.*]] = fadd float [[I2]], [[I3]]
+; CHECK-NEXT: [[CALL2:%.*]] = tail call float @llvm.powi.f32.i32(float [[ADD2]], i32 [[P]]) #[[ATTR3]]
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
+; CHECK-NEXT: [[I4:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
+; CHECK-NEXT: [[I5:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT: [[ADD3:%.*]] = fadd float [[I4]], [[I5]]
+; CHECK-NEXT: [[CALL3:%.*]] = tail call float @llvm.powi.f32.i32(float [[ADD3]], i32 [[P]]) #[[ATTR3]]
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[A]], i32 3
+; CHECK-NEXT: [[I6:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[B]], i32 3
+; CHECK-NEXT: [[I7:%.*]] = load float, ptr [[ARRAYIDX7]], align 4
+; CHECK-NEXT: [[ADD4:%.*]] = fadd float [[I6]], [[I7]]
+; CHECK-NEXT: [[CALL4:%.*]] = tail call float @llvm.powi.f32.i32(float [[ADD4]], i32 [[P]]) #[[ATTR3]]
+; CHECK-NEXT: store float [[CALL1]], ptr [[C:%.*]], align 4
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[C]], i32 1
+; CHECK-NEXT: store float [[CALL2]], ptr [[ARRAYIDX8]], align 4
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[C]], i32 2
+; CHECK-NEXT: store float [[CALL3]], ptr [[ARRAYIDX9]], align 4
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[C]], i32 3
+; CHECK-NEXT: store float [[CALL4]], ptr [[ARRAYIDX10]], align 4
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/intrinsic_with_scalar_param.ll b/llvm/test/Transforms/SLPVectorizer/X86/intrinsic_with_scalar_param.ll
index f87a713faf5f54..f8f2459fd9c430 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/intrinsic_with_scalar_param.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/intrinsic_with_scalar_param.ll
@@ -5,9 +5,24 @@ declare float @llvm.powi.f32.i32(float, i32)
define void @vec_powi_f32(ptr %a, ptr %c, i32 %P) {
; CHECK-LABEL: @vec_powi_f32(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP1]], i32 [[P:%.*]])
-; CHECK-NEXT: store <4 x float> [[TMP2]], ptr [[C:%.*]], align 4
+; CHECK-NEXT: [[I0:%.*]] = load float, ptr [[A:%.*]], align 4
+; CHECK-NEXT: [[CALL1:%.*]] = tail call float @llvm.powi.f32.i32(float [[I0]], i32 [[P:%.*]])
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 1
+; CHECK-NEXT: [[I2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[CALL2:%.*]] = tail call float @llvm.powi.f32.i32(float [[I2]], i32 [[P]])
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
+; CHECK-NEXT: [[I4:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[CALL3:%.*]] = tail call float @llvm.powi.f32.i32(float [[I4]], i32 [[P]])
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[A]], i32 3
+; CHECK-NEXT: [[I6:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[CALL4:%.*]] = tail call float @llvm.powi.f32.i32(float [[I6]], i32 [[P]])
+; CHECK-NEXT: store float [[CALL1]], ptr [[C:%.*]], align 4
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[C]], i32 1
+; CHECK-NEXT: store float [[CALL2]], ptr [[ARRAYIDX8]], align 4
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[C]], i32 2
+; CHECK-NEXT: store float [[CALL3]], ptr [[ARRAYIDX9]], align 4
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[C]], i32 3
+; CHECK-NEXT: store float [[CALL4]], ptr [[ARRAYIDX10]], align 4
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/powi-regression.ll b/llvm/test/Transforms/SLPVectorizer/X86/powi-regression.ll
index 9ae378fae13837..0cf0d77e3f29b1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/powi-regression.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/powi-regression.ll
@@ -6,7 +6,12 @@
define <2 x double> @PR53887_v2f64(<2 x double> noundef %x) {
; CHECK-LABEL: @PR53887_v2f64(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call fast <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[X:%.*]], i32 6)
+; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x double> [[X:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = tail call fast double @llvm.powi.f64.i32(double [[VECEXT]], i32 6)
+; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <2 x double> zeroinitializer, double [[TMP2]], i64 0
+; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <2 x double> [[X]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = tail call fast double @llvm.powi.f64.i32(double [[VECEXT1]], i32 6)
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> [[VECINIT]], double [[TMP1]], i64 1
; CHECK-NEXT: ret <2 x double> [[TMP0]]
;
entry:
@@ -22,7 +27,18 @@ entry:
define <4 x double> @PR53887_v4f64(<4 x double> noundef %x) {
; CHECK-LABEL: @PR53887_v4f64(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call fast <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[X:%.*]], i32 6)
+; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x double> [[X:%.*]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = tail call fast double @llvm.powi.f64.i32(double [[VECEXT]], i32 6)
+; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x double> zeroinitializer, double [[TMP4]], i64 0
+; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <4 x double> [[X]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = tail call fast double @llvm.powi.f64.i32(double [[VECEXT1]], i32 6)
+; CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x double> [[VECINIT]], double [[TMP1]], i64 1
+; CHECK-NEXT: [[VECEXT4:%.*]] = extractelement <4 x double> [[X]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = tail call fast double @llvm.powi.f64.i32(double [[VECEXT4]], i32 6)
+; CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <4 x double> [[VECINIT3]], double [[TMP2]], i64 2
+; CHECK-NEXT: [[VECEXT7:%.*]] = extractelement <4 x double> [[X]], i64 3
+; CHECK-NEXT: [[TMP3:%.*]] = tail call fast double @llvm.powi.f64.i32(double [[VECEXT7]], i32 6)
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> [[VECINIT6]], double [[TMP3]], i64 3
; CHECK-NEXT: ret <4 x double> [[TMP0]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/powi.ll b/llvm/test/Transforms/SLPVectorizer/X86/powi.ll
index ce5f912f140b46..a177b46ffde14e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/powi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/powi.ll
@@ -6,7 +6,12 @@
define <2 x double> @buildvector_powi_2f64_6(<2 x double> %a) {
; CHECK-LABEL: @buildvector_powi_2f64_6(
-; CHECK-NEXT: [[TMP1:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[A:%.*]], i32 6)
+; CHECK-NEXT: [[A0:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
+; CHECK-NEXT: [[A1:%.*]] = extractelement <2 x double> [[A]], i32 1
+; CHECK-NEXT: [[C0:%.*]] = call double @llvm.powi.f64.i32(double [[A0]], i32 6)
+; CHECK-NEXT: [[C1:%.*]] = call double @llvm.powi.f64.i32(double [[A1]], i32 6)
+; CHECK-NEXT: [[R0:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[R0]], double [[C1]], i32 1
; CHECK-NEXT: ret <2 x double> [[TMP1]]
;
%a0 = extractelement <2 x double> %a, i32 0
@@ -39,7 +44,18 @@ define <2 x double> @buildvector_powi_2f64_var(<2 x double> %a, i32 %b) {
define <4 x float> @buildvector_powi_4f32_3(<4 x float> %a) {
; CHECK-LABEL: @buildvector_powi_4f32_3(
-; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[A:%.*]], i32 3)
+; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; CHECK-NEXT: [[C0:%.*]] = call float @llvm.powi.f32.i32(float [[A0]], i32 3)
+; CHECK-NEXT: [[C1:%.*]] = call float @llvm.powi.f32.i32(float [[A1]], i32 3)
+; CHECK-NEXT: [[C2:%.*]] = call float @llvm.powi.f32.i32(float [[A2]], i32 3)
+; CHECK-NEXT: [[C3:%.*]] = call float @llvm.powi.f32.i32(float [[A3]], i32 3)
+; CHECK-NEXT: [[R0:%.*]] = insertelement <4 x float> poison, float [[C0]], i32 0
+; CHECK-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[C1]], i32 1
+; CHECK-NEXT: [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[C2]], i32 2
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[R2]], float [[C3]], i32 3
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
%a0 = extractelement <4 x float> %a, i32 0
@@ -63,7 +79,18 @@ define <4 x float> @buildvector_powi_4f32_3(<4 x float> %a) {
define <4 x double> @buildvector_powi_4f64_16(<4 x double> %a) {
; CHECK-LABEL: @buildvector_powi_4f64_16(
-; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> [[A:%.*]], i32 16)
+; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
+; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1
+; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2
+; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3
+; CHECK-NEXT: [[C0:%.*]] = call double @llvm.powi.f64.i32(double [[A0]], i32 16)
+; CHECK-NEXT: [[C1:%.*]] = call double @llvm.powi.f64.i32(double [[A1]], i32 16)
+; CHECK-NEXT: [[C2:%.*]] = call double @llvm.powi.f64.i32(double [[A2]], i32 16)
+; CHECK-NEXT: [[C3:%.*]] = call double @llvm.powi.f64.i32(double [[A3]], i32 16)
+; CHECK-NEXT: [[R0:%.*]] = insertelement <4 x double> poison, double [[C0]], i32 0
+; CHECK-NEXT: [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[C1]], i32 1
+; CHECK-NEXT: [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[C2]], i32 2
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[R2]], double [[C3]], i32 3
; CHECK-NEXT: ret <4 x double> [[TMP1]]
;
%a0 = extractelement <4 x double> %a, i32 0
@@ -83,7 +110,30 @@ define <4 x double> @buildvector_powi_4f64_16(<4 x double> %a) {
define <8 x float> @buildvector_powi_8f32_4(<8 x float> %a) {
; CHECK-LABEL: @buildvector_powi_8f32_4(
-; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> [[A:%.*]], i32 4)
+; CHECK-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
+; CHECK-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
+; CHECK-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
+; CHECK-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
+; CHECK-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
+; CHECK-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
+; CHECK-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
+; CHECK-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
+; CHECK-NEXT: [[C0:%.*]] = call float @llvm.powi.f32.i32(float [[A0]], i32 4)
+; CHECK-NEXT: [[C1:%.*]] = call float @llvm.powi.f32.i32(float [[A1]], i32 4)
+; CHECK-NEXT: [[C2:%.*]] = call float @llvm.powi.f32.i32(float [[A2]], i32 4)
+; CHECK-NEXT: [[C3:%.*]] = call float @llvm.powi.f32.i32(float [[A3]], i32 4)
+; CHECK-NEXT: [[C4:%.*]] = call float @llvm.powi.f32.i32(float [[A4]], i32 4)
+; CHECK-NEXT: [[C5:%.*]] = call float @llvm.powi.f32.i32(float [[A5]], i32 4)
+; CHECK-NEXT: [[C6:%.*]] = call float @llvm.powi.f32.i32(float [[A6]], i32 4)
+; CHECK-NEXT: [[C7:%.*]] = call float @llvm.powi.f32.i32(float [[A7]], i32 4)
+; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[C0]], i32 0
+; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[C1]], i32 1
+; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[C2]], i32 2
+; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[C3]], i32 3
+; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[C4]], i32 4
+; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[C5]], i32 5
+; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[C6]], i32 6
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x float> [[R6]], float [[C7]], i32 7
; CHECK-NEXT: ret <8 x float> [[TMP1]]
;
%a0 = extractelement <8 x float> %a, i32 0
@@ -119,7 +169,30 @@ define <8 x float> @buildvector_powi_8f32_4(<8 x float> %a) {
define <8 x double> @buildvector_powi_8f64_5(<8 x double> %a) {
; CHECK-LABEL: @buildvector_powi_8f64_5(
-; CHECK-NEXT: [[TMP1:%.*]] = call <8 x double> @llvm.powi.v8f64.i32(<8 x double> [[A:%.*]], i32 5)
+; CHECK-NEXT: [[A0:%.*]] = extractelement <8 x double> [[A:%.*]], i32 0
+; CHECK-NEXT: [[A1:%.*]] = extractelement <8 x double> [[A]], i32 1
+; CHECK-NEXT: [[A2:%.*]] = extractelement <8 x double> [[A]], i32 2
+; CHECK-NEXT: [[A3:%.*]] = extractelement <8 x double> [[A]], i32 3
+; CHECK-NEXT: [[A4:%.*]] = extractelement <8 x double> [[A]], i32 4
+; CHECK-NEXT: [[A5:%.*]] = extractelement <8 x double> [[A]], i32 5
+; CHECK-NEXT: [[A6:%.*]] = extractelement <8 x double> [[A]], i32 6
+; CHECK-NEXT: [[A7:%.*]] = extractelement <8 x double> [[A]], i32 7
+; CHECK-NEXT: [[C0:%.*]] = call double @llvm.powi.f64.i32(double [[A0]], i32 5)
+; CHECK-NEXT: [[C1:%.*]] = call double @llvm.powi.f64.i32(double [[A1]], i32 5)
+; CHECK-NEXT: [[C2:%.*]] = call double @llvm.powi.f64.i32(double [[A2]], i32 5)
+; CHECK-NEXT: [[C3:%.*]] = call double @llvm.powi.f64.i32(double [[A3]], i32 5)
+; CHECK-NEXT: [[C4:%.*]] = call double @llvm.powi.f64.i32(double [[A4]], i32 5)
+; CHECK-NEXT: [[C5:%.*]] = call double @llvm.powi.f64.i32(double [[A5]], i32 5)
+; CHECK-NEXT: [[C6:%.*]] = call double @llvm.powi.f64.i32(double [[A6]], i32 5)
+; CHECK-NEXT: [[C7:%.*]] = call double @llvm.powi.f64.i32(double [[A7]], i32 5)
+; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x double> poison, double [[C0]], i32 0
+; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[C1]], i32 1
+; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[C2]], i32 2
+; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[C3]], i32 3
+; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[C4]], i32 4
+; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[C5]], i32 5
+; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[C6]], i32 6
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x double> [[R6]], double [[C7]], i32 7
; CHECK-NEXT: ret <8 x double> [[TMP1]]
;
%a0 = extractelement <8 x double> %a, i32 0
@@ -206,7 +279,54 @@ define <8 x double> @buildvector_powi_8f64_mismatch(<8 x double> %a) {
define <16 x float> @buildvector_powi_16f32_n13(<16 x float> %a) {
; CHECK-LABEL: @buildvector_powi_16f32_n13(
-; CHECK-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.powi.v16f32.i32(<16 x float> [[A:%.*]], i32 -13)
+; CHECK-NEXT: [[A0:%.*]] = extractelement <16 x float> [[A:%.*]], i32 0
+; CHECK-NEXT: [[A1:%.*]] = extractelement <16 x float> [[A]], i32 1
+; CHECK-NEXT: [[A2:%.*]] = extractelement <16 x float> [[A]], i32 2
+; CHECK-NEXT: [[A3:%.*]] = extractelement <16 x float> [[A]], i32 3
+; CHECK-NEXT: [[A4:%.*]] = extractelement <16 x float> [[A]], i32 4
+; CHECK-NEXT: [[A5:%.*]] = extractelement <16 x float> [[A]], i32 5
+; CHECK-NEXT: [[A6:%.*]] = extractelement <16 x float> [[A]], i32 6
+; CHECK-NEXT: [[A7:%.*]] = extractelement <16 x float> [[A]], i32 7
+; CHECK-NEXT: [[A8:%.*]] = extractelement <16 x float> [[A]], i32 8
+; CHECK-NEXT: [[A9:%.*]] = extractelement <16 x float> [[A]], i32 9
+; CHECK-NEXT: [[A10:%.*]] = extractelement <16 x float> [[A]], i32 10
+; CHECK-NEXT: [[A11:%.*]] = extractelement <16 x float> [[A]], i32 11
+; CHECK-NEXT: [[A12:%.*]] = extractelement <16 x float> [[A]], i32 12
+; CHECK-NEXT: [[A13:%.*]] = extractelement <16 x float> [[A]], i32 13
+; CHECK-NEXT: [[A14:%.*]] = extractelement <16 x float> [[A]], i32 14
+; CHECK-NEXT: [[A15:%.*]] = extractelement <16 x float> [[A]], i32 15
+; CHECK-NEXT: [[C0:%.*]] = call float @llvm.powi.f32.i32(float [[A0]], i32 -13)
+; CHECK-NEXT: [[C1:%.*]] = call float @llvm.powi.f32.i32(float [[A1]], i32 -13)
+; CHECK-NEXT: [[C2:%.*]] = call float @llvm.powi.f32.i32(float [[A2]], i32 -13)
+; CHECK-NEXT: [[C3:%.*]] = call float @llvm.powi.f32.i32(float [[A3]], i32 -13)
+; CHECK-NEXT: [[C4:%.*]] = call float @llvm.powi.f32.i32(float [[A4]], i32 -13)
+; CHECK-NEXT: [[C5:%.*]] = call float @llvm.powi.f32.i32(float [[A5]], i32 -13)
+; CHECK-NEXT: [[C6:%.*]] = call float @llvm.powi.f32.i32(float [[A6]], i32 -13)
+; CHECK-NEXT: [[C7:%.*]] = call float @llvm.powi.f32.i32(float [[A7]], i32 -13)
+; CHECK-NEXT: [[C8:%.*]] = call float @llvm.powi.f32.i32(float [[A8]], i32 -13)
+; CHECK-NEXT: [[C9:%.*]] = call float @llvm.powi.f32.i32(float [[A9]], i32 -13)
+; CHECK-NEXT: [[C10:%.*]] = call float @llvm.powi.f32.i32(float [[A10]], i32 -13)
+; CHECK-NEXT: [[C11:%.*]] = call float @llvm.powi.f32.i32(float [[A11]], i32 -13)
+; CHECK-NEXT: [[C12:%.*]] = call float @llvm.powi.f32.i32(float [[A12]], i32 -13)
+; CHECK-NEXT: [[C13:%.*]] = call float @llvm.powi.f32.i32(float [[A13]], i32 -13)
+; CHECK-NEXT: [[C14:%.*]] = call float @llvm.powi.f32.i32(float [[A14]], i32 -13)
+; CHECK-NEXT: [[C15:%.*]] = call float @llvm.powi.f32.i32(float [[A15]], i32 -13)
+; CHECK-NEXT: [[R0:%.*]] = insertelement <16 x float> poison, float [[C0]], i32 0
+; CHECK-NEXT: [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[C1]], i32 1
+; CHECK-NEXT: [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[C2]], i32 2
+; CHECK-NEXT: [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[C3]], i32 3
+; CHECK-NEXT: [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[C4]], i32 4
+; CHECK-NEXT: [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[C5]], i32 5
+; CHECK-NEXT: [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[C6]], i32 6
+; CHECK-NEXT: [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[C7]], i32 7
+; CHECK-NEXT: [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[C8]], i32 8
+; CHECK-NEXT: [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[C9]], i32 9
+; CHECK-NEXT: [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[C10]], i32 10
+; CHECK-NEXT: [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[C11]], i32 11
+; CHECK-NEXT: [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[C12]], i32 12
+; CHECK-NEXT: [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[C13]], i32 13
+; CHECK-NEXT: [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[C14]], i32 14
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x float> [[R14]], float [[C15]], i32 15
; CHECK-NEXT: ret <16 x float> [[TMP1]]
;
%a0 = extractelement <16 x float> %a, i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll
index 68fb6c6202ce7f..e1e80d96d416de 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll
@@ -11,16 +11,25 @@ declare double @llvm.sin.f64(double)
define void @test() {
; CHECK-LABEL: @test(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, ptr @src, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> <i32 2, i32 6>
-; CHECK-NEXT: [[TMP3:%.*]] = call fast <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP2]])
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> <i32 3, i32 7>
-; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP4]])
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> <i32 0, i32 4>
+; CHECK-NEXT: [[A2:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8
+; CHECK-NEXT: [[A3:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8
+; CHECK-NEXT: [[A6:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8
+; CHECK-NEXT: [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8
+; CHECK-NEXT: [[SIN0:%.*]] = call fast double @llvm.sin.f64(double [[A2]])
+; CHECK-NEXT: [[SIN1:%.*]] = call fast double @llvm.sin.f64(double [[A3]])
+; CHECK-NEXT: [[SIN2:%.*]] = call fast double @llvm.sin.f64(double [[A6]])
+; CHECK-NEXT: [[SIN3:%.*]] = call fast double @llvm.sin.f64(double [[A7]])
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr @src, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 2>
; CHECK-NEXT: [[TMP7:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP6]])
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> <i32 1, i32 5>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x i32> <i32 1, i32 3>
; CHECK-NEXT: [[TMP9:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP8]])
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[SIN1]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP13]], double [[SIN3]], i32 1
; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP7]], [[TMP5]]
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x double> poison, double [[SIN0]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP14]], double [[SIN2]], i32 1
; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP9]]
; CHECK-NEXT: [[TMP12:%.*]] = fadd fast <2 x double> [[TMP10]], [[TMP11]]
; CHECK-NEXT: store <2 x double> [[TMP12]], ptr @dst, align 8
More information about the llvm-commits
mailing list