[llvm] 466217e - [SLP]Fix graph traversal in getSpillCost
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 31 03:27:51 PST 2025
Author: Alexey Bataev
Date: 2025-01-31T06:27:47-05:00
New Revision: 466217eb0334d467ec8e9b96c52ee988aaadc389
URL: https://github.com/llvm/llvm-project/commit/466217eb0334d467ec8e9b96c52ee988aaadc389
DIFF: https://github.com/llvm/llvm-project/commit/466217eb0334d467ec8e9b96c52ee988aaadc389.diff
LOG: [SLP]Fix graph traversal in getSpillCost
getSpill cost relies on def-use order when performs the analysis for the
vectorized instructions live-over-calls spills.
Patch fixes it to check the dependencies based on TreeEntries and
performs actual vectorized type analysis.
Reviewers: RKSimon, preames
Reviewed By: preames
Pull Request: https://github.com/llvm/llvm-project/pull/124984
Added:
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f89944f5a0bfc7..beff0d8409787a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1395,7 +1395,7 @@ class BoUpSLP {
/// \returns the cost incurred by unwanted spills and fills, caused by
/// holding live values over call sites.
- InstructionCost getSpillCost() const;
+ InstructionCost getSpillCost();
/// \returns the vectorization cost of the subtree that starts at \p VL.
/// A negative number means that this is profitable.
@@ -2958,7 +2958,7 @@ class BoUpSLP {
}
/// Check if the value is vectorized in the tree.
- bool isVectorized(Value *V) const {
+ bool isVectorized(const Value *V) const {
assert(V && "V cannot be nullptr.");
return ScalarToTreeEntries.contains(V);
}
@@ -12160,16 +12160,15 @@ bool BoUpSLP::isTreeNotExtendable() const {
return Res;
}
-InstructionCost BoUpSLP::getSpillCost() const {
+InstructionCost BoUpSLP::getSpillCost() {
// Walk from the bottom of the tree to the top, tracking which values are
// live. When we see a call instruction that is not part of our tree,
// query TTI to see if there is a cost to keeping values live over it
// (for example, if spills and fills are required).
- unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
InstructionCost Cost = 0;
- SmallPtrSet<Instruction *, 4> LiveValues;
- Instruction *PrevInst = nullptr;
+ SmallPtrSet<const TreeEntry *, 4> LiveEntries;
+ const TreeEntry *Prev = nullptr;
// The entries in VectorizableTree are not necessarily ordered by their
// position in basic blocks. Collect them and order them by dominance so later
@@ -12177,61 +12176,64 @@ InstructionCost BoUpSLP::getSpillCost() const {
//
diff erent basic blocks, we only scan to the beginning of the block, so
// their order does not matter, as long as all instructions in a basic block
// are grouped together. Using dominance ensures a deterministic order.
- SmallVector<Instruction *, 16> OrderedScalars;
+ SmallVector<TreeEntry *, 16> OrderedEntries;
for (const auto &TEPtr : VectorizableTree) {
- if (TEPtr->State != TreeEntry::Vectorize)
+ if (TEPtr->isGather())
continue;
- Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
- if (!Inst)
- continue;
- OrderedScalars.push_back(Inst);
- }
- llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
- auto *NodeA = DT->getNode(A->getParent());
- auto *NodeB = DT->getNode(B->getParent());
+ OrderedEntries.push_back(TEPtr.get());
+ }
+ llvm::stable_sort(OrderedEntries, [&](const TreeEntry *TA,
+ const TreeEntry *TB) {
+ Instruction &A = getLastInstructionInBundle(TA);
+ Instruction &B = getLastInstructionInBundle(TB);
+ auto *NodeA = DT->getNode(A.getParent());
+ auto *NodeB = DT->getNode(B.getParent());
assert(NodeA && "Should only process reachable instructions");
assert(NodeB && "Should only process reachable instructions");
assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
"Different nodes should have
diff erent DFS numbers");
if (NodeA != NodeB)
return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
- return B->comesBefore(A);
+ return B.comesBefore(&A);
});
- for (Instruction *Inst : OrderedScalars) {
- if (!PrevInst) {
- PrevInst = Inst;
+ for (const TreeEntry *TE : OrderedEntries) {
+ if (!Prev) {
+ Prev = TE;
continue;
}
- // Update LiveValues.
- LiveValues.erase(PrevInst);
- for (auto &J : PrevInst->operands()) {
- if (isa<Instruction>(&*J) && isVectorized(&*J))
- LiveValues.insert(cast<Instruction>(&*J));
+ LiveEntries.erase(Prev);
+ for (unsigned I : seq<unsigned>(Prev->getNumOperands())) {
+ const TreeEntry *Op = getVectorizedOperand(Prev, I);
+ if (!Op)
+ continue;
+ assert(!Op->isGather() && "Expected vectorized operand.");
+ LiveEntries.insert(Op);
}
LLVM_DEBUG({
- dbgs() << "SLP: #LV: " << LiveValues.size();
- for (auto *X : LiveValues)
- dbgs() << " " << X->getName();
+ dbgs() << "SLP: #LV: " << LiveEntries.size();
+ for (auto *X : LiveEntries)
+ X->dump();
dbgs() << ", Looking at ";
- Inst->dump();
+ TE->dump();
});
// Now find the sequence of instructions between PrevInst and Inst.
unsigned NumCalls = 0;
- BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
- PrevInstIt =
- PrevInst->getIterator().getReverse();
+ const Instruction *PrevInst = &getLastInstructionInBundle(Prev);
+ BasicBlock::const_reverse_iterator
+ InstIt = ++getLastInstructionInBundle(TE).getIterator().getReverse(),
+ PrevInstIt = PrevInst->getIterator().getReverse();
while (InstIt != PrevInstIt) {
if (PrevInstIt == PrevInst->getParent()->rend()) {
- PrevInstIt = Inst->getParent()->rbegin();
+ PrevInstIt = getLastInstructionInBundle(TE).getParent()->rbegin();
continue;
}
- auto NoCallIntrinsic = [this](Instruction *I) {
- auto *II = dyn_cast<IntrinsicInst>(I);
+ auto NoCallIntrinsic = [this](const Instruction *I) {
+ const auto *II = dyn_cast<IntrinsicInst>(I);
if (!II)
return false;
if (II->isAssumeLikeIntrinsic())
@@ -12252,25 +12254,28 @@ InstructionCost BoUpSLP::getSpillCost() const {
};
// Debug information does not impact spill cost.
- if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
- &*PrevInstIt != PrevInst)
+ // Vectorized calls, represented as vector intrinsics, do not impact spill
+ // cost.
+ if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
+ CB && !NoCallIntrinsic(CB) && !isVectorized(CB))
NumCalls++;
++PrevInstIt;
}
if (NumCalls) {
- SmallVector<Type *, 4> V;
- for (auto *II : LiveValues) {
- auto *ScalarTy = II->getType();
- if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
- ScalarTy = VectorTy->getElementType();
- V.push_back(getWidenedType(ScalarTy, BundleWidth));
+ SmallVector<Type *, 4> EntriesTypes;
+ for (const TreeEntry *TE : LiveEntries) {
+ auto *ScalarTy = TE->getMainOp()->getType();
+ auto It = MinBWs.find(TE);
+ if (It != MinBWs.end())
+ ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
+ EntriesTypes.push_back(getWidenedType(ScalarTy, TE->getVectorFactor()));
}
- Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
+ Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(EntriesTypes);
}
- PrevInst = Inst;
+ Prev = TE;
}
return Cost;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
index 9ce79e5ea356b9..5ad676537f9c45 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
@@ -684,27 +684,27 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM5]]
; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[STRIDE]], 1
; CHECK-NEXT: [[IDXPROM11:%.*]] = sext i32 [[MUL]] to i64
-; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM11]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4
-; CHECK-NEXT: [[ADD14:%.*]] = or disjoint i32 [[MUL]], 1
+; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM11]]
+; CHECK-NEXT: [[ADD14:%.*]] = add nsw i32 [[MUL]], 2
; CHECK-NEXT: [[IDXPROM15:%.*]] = sext i32 [[ADD14]] to i64
; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM15]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX16]], align 4
; CHECK-NEXT: [[MUL21:%.*]] = mul nsw i32 [[STRIDE]], 3
; CHECK-NEXT: [[IDXPROM23:%.*]] = sext i32 [[MUL21]] to i64
; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM23]]
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX24]], align 4
; CHECK-NEXT: [[ADD26:%.*]] = add nsw i32 [[MUL21]], 1
; CHECK-NEXT: [[IDXPROM27:%.*]] = sext i32 [[ADD26]] to i64
-; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM27]]
+; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM27]]
; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds nuw i8, ptr [[Y:%.*]], i64 8
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX35]], align 4
; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM5]]
-; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM11]]
+; CHECK-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM11]]
+; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM15]]
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX48]], align 4
-; CHECK-NEXT: [[ARRAYIDX52:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM15]]
; CHECK-NEXT: [[ARRAYIDX60:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM23]]
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX60]], align 4
-; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM27]]
+; CHECK-NEXT: [[ARRAYIDX65:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM27]]
; CHECK-NEXT: [[ARRAYIDX72:%.*]] = getelementptr inbounds nuw i8, ptr [[Z:%.*]], i64 4
; CHECK-NEXT: [[MUL73:%.*]] = mul nsw i32 [[TMP3]], [[TMP0]]
; CHECK-NEXT: [[ARRAYIDX76:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 24
@@ -715,25 +715,22 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <2 x i32> [[TMP8]], [[TMP6]]
; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <2 x i32> [[TMP9]], [[TMP7]]
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT: [[ARRAYIDX84:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 28
; CHECK-NEXT: [[MUL81:%.*]] = mul nsw i32 [[TMP4]], [[TMP1]]
-; CHECK-NEXT: [[ARRAYIDX82:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 32
-; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX16]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = load <2 x i32>, ptr [[ARRAYIDX52]], align 4
-; CHECK-NEXT: [[TMP15:%.*]] = mul nsw <2 x i32> [[TMP14]], [[TMP13]]
-; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: [[MUL87:%.*]] = mul nsw i32 [[TMP5]], [[TMP2]]
; CHECK-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 44
-; CHECK-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 36
; CHECK-NEXT: [[TMP17:%.*]] = load <2 x i32>, ptr [[ARRAYIDX28]], align 4
; CHECK-NEXT: [[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX64]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = load <2 x i32>, ptr [[ARRAYIDX49]], align 4
+; CHECK-NEXT: [[TMP16:%.*]] = load <2 x i32>, ptr [[ARRAYIDX65]], align 4
; CHECK-NEXT: store i32 [[MUL73]], ptr [[Z]], align 4
; CHECK-NEXT: store <4 x i32> [[TMP12]], ptr [[ARRAYIDX72]], align 4
-; CHECK-NEXT: store i32 [[MUL81]], ptr [[ARRAYIDX82]], align 4
-; CHECK-NEXT: store <2 x i32> [[TMP16]], ptr [[ARRAYIDX76]], align 4
+; CHECK-NEXT: store i32 [[MUL81]], ptr [[ARRAYIDX76]], align 4
; CHECK-NEXT: store i32 [[MUL87]], ptr [[ARRAYIDX88]], align 4
-; CHECK-NEXT: [[TMP19:%.*]] = mul nsw <2 x i32> [[TMP18]], [[TMP17]]
-; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x i32> [[TMP19]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: store <2 x i32> [[TMP20]], ptr [[ARRAYIDX92]], align 4
+; CHECK-NEXT: [[TMP20:%.*]] = mul nsw <2 x i32> [[TMP15]], [[TMP17]]
+; CHECK-NEXT: [[TMP21:%.*]] = mul nsw <2 x i32> [[TMP16]], [[TMP18]]
+; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x i32> [[TMP20]], <2 x i32> [[TMP21]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT: store <4 x i32> [[TMP19]], ptr [[ARRAYIDX84]], align 4
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
index 6fbd05aaedfe5b..5bfd776512711f 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
@@ -149,37 +149,27 @@ define <4 x float> @exp_4x(ptr %a) {
; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @expf(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
-; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @exp_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; DEFAULT-NEXT: entry:
; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @expf(float [[VECEXT]])
-; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
-; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; DEFAULT-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP1]])
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP4]])
+; DEFAULT-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -206,37 +196,27 @@ define <4 x float> @int_exp_4x(ptr %a) {
; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_1]])
-; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @int_exp_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; DEFAULT-NEXT: entry:
; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT]])
-; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_1]])
-; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; DEFAULT-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP1]])
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP4]])
+; DEFAULT-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -263,37 +243,27 @@ define <4 x float> @log_4x(ptr %a) {
; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @logf(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
-; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @log_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; DEFAULT-NEXT: entry:
; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @logf(float [[VECEXT]])
-; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
-; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; DEFAULT-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP1]])
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP4]])
+; DEFAULT-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -320,37 +290,27 @@ define <4 x float> @int_log_4x(ptr %a) {
; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_1]])
-; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @int_log_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; DEFAULT-NEXT: entry:
; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT]])
-; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_1]])
-; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; DEFAULT-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP1]])
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP4]])
+; DEFAULT-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -377,37 +337,27 @@ define <4 x float> @sin_4x(ptr %a) {
; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @sinf(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
-; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @sin_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; DEFAULT-NEXT: entry:
; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @sinf(float [[VECEXT]])
-; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
-; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; DEFAULT-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP1]])
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4]])
+; DEFAULT-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -434,37 +384,27 @@ define <4 x float> @int_sin_4x(ptr %a) {
; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
-; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @int_sin_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; DEFAULT-NEXT: entry:
; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
-; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
-; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; DEFAULT-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP1]])
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4]])
+; DEFAULT-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -491,37 +431,27 @@ define <4 x float> @asin_4x(ptr %a) {
; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @asinf(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @asinf(float [[VECEXT_1]])
-; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @asinf(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @asinf(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @asin_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; DEFAULT-NEXT: entry:
; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @asinf(float [[VECEXT]])
-; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @asinf(float [[VECEXT_1]])
-; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @asinf(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @asinf(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; DEFAULT-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP1]])
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP4]])
+; DEFAULT-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -548,37 +478,27 @@ define <4 x float> @int_asin_4x(ptr %a) {
; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_1]])
-; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @int_asin_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; DEFAULT-NEXT: entry:
; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT]])
-; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_1]])
-; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; DEFAULT-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP1]])
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP4]])
+; DEFAULT-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -605,37 +525,27 @@ define <4 x float> @cos_4x(ptr %a) {
; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]])
-; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @cosf(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @cosf(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @cos_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; DEFAULT-NEXT: entry:
; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]])
-; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]])
-; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @cosf(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @cosf(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; DEFAULT-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP1]])
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4]])
+; DEFAULT-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -662,37 +572,27 @@ define <4 x float> @int_cos_4x(ptr %a) {
; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
-; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @int_cos_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; DEFAULT-NEXT: entry:
; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
-; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
-; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; DEFAULT-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP1]])
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4]])
+; DEFAULT-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -719,37 +619,27 @@ define <4 x float> @acos_4x(ptr %a) {
; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @acosf(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @acosf(float [[VECEXT_1]])
-; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @acosf(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @acosf(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @acos_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; DEFAULT-NEXT: entry:
; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @acosf(float [[VECEXT]])
-; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @acosf(float [[VECEXT_1]])
-; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @acosf(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @acosf(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; DEFAULT-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP1]])
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP4]])
+; DEFAULT-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -776,37 +666,27 @@ define <4 x float> @int_acos_4x(ptr %a) {
; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_1]])
-; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @int_acos_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; DEFAULT-NEXT: entry:
; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT]])
-; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_1]])
-; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; DEFAULT-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP1]])
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP4]])
+; DEFAULT-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -833,37 +713,27 @@ define <4 x float> @tan_4x(ptr %a) {
; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @tanf(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @tanf(float [[VECEXT_1]])
-; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @tanf(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @tanf(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.tan.v2f32(<2 x float> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.tan.v2f32(<2 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @tan_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; DEFAULT-NEXT: entry:
; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @tanf(float [[VECEXT]])
-; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @tanf(float [[VECEXT_1]])
-; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @tanf(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @tanf(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; DEFAULT-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.tan.v2f32(<2 x float> [[TMP1]])
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.tan.v2f32(<2 x float> [[TMP4]])
+; DEFAULT-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -890,37 +760,27 @@ define <4 x float> @int_tan_4x(ptr %a) {
; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.tan.f32(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.tan.f32(float [[VECEXT_1]])
-; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.tan.f32(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.tan.f32(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.tan.v2f32(<2 x float> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.tan.v2f32(<2 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @int_tan_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; DEFAULT-NEXT: entry:
; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.tan.f32(float [[VECEXT]])
-; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.tan.f32(float [[VECEXT_1]])
-; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.tan.f32(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.tan.f32(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; DEFAULT-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.tan.v2f32(<2 x float> [[TMP1]])
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.tan.v2f32(<2 x float> [[TMP4]])
+; DEFAULT-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -947,37 +807,27 @@ define <4 x float> @atan_4x(ptr %a) {
; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @atanf(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @atanf(float [[VECEXT_1]])
-; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @atanf(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @atanf(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @atan_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; DEFAULT-NEXT: entry:
; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @atanf(float [[VECEXT]])
-; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @atanf(float [[VECEXT_1]])
-; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @atanf(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @atanf(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; DEFAULT-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP1]])
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP4]])
+; DEFAULT-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -1004,37 +854,27 @@ define <4 x float> @int_atan_4x(ptr %a) {
; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_1]])
-; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @int_atan_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; DEFAULT-NEXT: entry:
; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT]])
-; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_1]])
-; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; DEFAULT-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP1]])
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP4]])
+; DEFAULT-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -1061,37 +901,27 @@ define <4 x float> @sinh_4x(ptr %a) {
; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @sinhf(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @sinhf(float [[VECEXT_1]])
-; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @sinhf(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @sinhf(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.sinh.v2f32(<2 x float> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.sinh.v2f32(<2 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @sinh_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; DEFAULT-NEXT: entry:
; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @sinhf(float [[VECEXT]])
-; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @sinhf(float [[VECEXT_1]])
-; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @sinhf(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @sinhf(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; DEFAULT-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.sinh.v2f32(<2 x float> [[TMP1]])
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.sinh.v2f32(<2 x float> [[TMP4]])
+; DEFAULT-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -1118,37 +948,27 @@ define <4 x float> @int_sinh_4x(ptr %a) {
; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_1]])
-; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.sinh.v2f32(<2 x float> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.sinh.v2f32(<2 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @int_sinh_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; DEFAULT-NEXT: entry:
; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT]])
-; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_1]])
-; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; DEFAULT-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.sinh.v2f32(<2 x float> [[TMP1]])
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.sinh.v2f32(<2 x float> [[TMP4]])
+; DEFAULT-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -1289,37 +1109,27 @@ define <4 x float> @cosh_4x(ptr %a) {
; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @coshf(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @coshf(float [[VECEXT_1]])
-; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @coshf(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @coshf(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @cosh_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; DEFAULT-NEXT: entry:
; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @coshf(float [[VECEXT]])
-; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @coshf(float [[VECEXT_1]])
-; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @coshf(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @coshf(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; DEFAULT-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP1]])
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP4]])
+; DEFAULT-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -1346,37 +1156,27 @@ define <4 x float> @int_cosh_4x(ptr %a) {
; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_1]])
-; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @int_cosh_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; DEFAULT-NEXT: entry:
; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT]])
-; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_1]])
-; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; DEFAULT-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP1]])
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP4]])
+; DEFAULT-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -1517,37 +1317,27 @@ define <4 x float> @tanh_4x(ptr %a) {
; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @tanhf(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @tanhf(float [[VECEXT_1]])
-; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @tanhf(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @tanhf(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @tanh_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; DEFAULT-NEXT: entry:
; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @tanhf(float [[VECEXT]])
-; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @tanhf(float [[VECEXT_1]])
-; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @tanhf(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @tanhf(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; DEFAULT-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP1]])
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP4]])
+; DEFAULT-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -1574,37 +1364,27 @@ define <4 x float> @int_tanh_4x(ptr %a) {
; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_1]])
-; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @int_tanh_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
; DEFAULT-NEXT: entry:
; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
-; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT]])
-; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_1]])
-; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; DEFAULT-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP1]])
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP4]])
+; DEFAULT-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
More information about the llvm-commits
mailing list