[llvm] 4675a16 - Revert "[SLP]Improve analysis/emission of vector operands for alternate nodes."
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 25 05:27:43 PST 2021
Author: Alexey Bataev
Date: 2021-11-25T05:19:49-08:00
New Revision: 4675a1654cb3a3a528ceb4021d94c6cbe143b64a
URL: https://github.com/llvm/llvm-project/commit/4675a1654cb3a3a528ceb4021d94c6cbe143b64a
DIFF: https://github.com/llvm/llvm-project/commit/4675a1654cb3a3a528ceb4021d94c6cbe143b64a.diff
LOG: Revert "[SLP]Improve analysis/emission of vector operands for alternate nodes."
This reverts commit 496254cf802a21e1967b61dec48017b8ec831574 to fix
compiler crashes reported in D114101#3152982.
Added:
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
llvm/test/Transforms/SLPVectorizer/X86/remark_alternate.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 281a3a40a6a5..d64eb05700c9 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1680,28 +1680,6 @@ class BoUpSLP {
return IsSame(Scalars, ReuseShuffleIndices);
}
- /// \returns true if current entry has same operands as \p TE.
- bool hasEqualOperands(const TreeEntry &TE) const {
- if (TE.getNumOperands() != getNumOperands())
- return false;
- SmallBitVector Used(getNumOperands());
- for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
- unsigned PrevCount = Used.count();
- for (unsigned K = 0; K < E; ++K) {
- if (Used.test(K))
- continue;
- if (getOperand(K) == TE.getOperand(I)) {
- Used.set(K);
- break;
- }
- }
- // Check if we actually found the matching operand.
- if (PrevCount == Used.count())
- return false;
- }
- return true;
- }
-
/// \return Final vectorization factor for the node. Defined by the total
/// number of vectorized scalars, including those, used several times in the
/// entry and counted in the \a ReuseShuffleIndices, if any.
@@ -1795,12 +1773,6 @@ class BoUpSLP {
return Operands[OpIdx];
}
- /// \returns the \p OpIdx operand of this TreeEntry.
- ArrayRef<Value *> getOperand(unsigned OpIdx) const {
- assert(OpIdx < Operands.size() && "Off bounds");
- return Operands[OpIdx];
- }
-
/// \returns the number of operands.
unsigned getNumOperands() const { return Operands.size(); }
@@ -2106,7 +2078,7 @@ class BoUpSLP {
SmallPtrSet<const Value *, 32> EphValues;
/// Holds all of the instructions that we gathered.
- SetVector<Instruction *> GatherShuffleSeq;
+ SetVector<Instruction *> GatherSeq;
/// A list of blocks that we are going to CSE.
SetVector<BasicBlock *> CSEBlocks;
@@ -5044,30 +5016,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// VecCost is equal to sum of the cost of creating 2 vectors
// and the cost of creating shuffle.
InstructionCost VecCost = 0;
- // Try to find the previous shuffle node with the same operands and same
- // main/alternate ops.
- auto &&TryFindNodeWithEqualOperands = [this, E]() {
- for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
- if (TE.get() == E)
- break;
- if (TE->isAltShuffle() &&
- ((TE->getOpcode() == E->getOpcode() &&
- TE->getAltOpcode() == E->getAltOpcode()) ||
- (TE->getOpcode() == E->getAltOpcode() &&
- TE->getAltOpcode() == E->getOpcode())) &&
- TE->hasEqualOperands(*E))
- return true;
- }
- return false;
- };
- if (TryFindNodeWithEqualOperands()) {
- LLVM_DEBUG({
- dbgs() << "SLP: diamond match for alternate node found.\n";
- E->dump();
- });
- // No need to add new vector costs here since we're going to reuse
- // same main/alternate vector ops, just do
diff erent shuffling.
- } else if (Instruction::isBinaryOp(E->getOpcode())) {
+ if (Instruction::isBinaryOp(E->getOpcode())) {
VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
CostKind);
@@ -5779,7 +5728,7 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
auto *InsElt = dyn_cast<InsertElementInst>(Vec);
if (!InsElt)
return Vec;
- GatherShuffleSeq.insert(InsElt);
+ GatherSeq.insert(InsElt);
CSEBlocks.insert(InsElt->getParent());
// Add to our 'need-to-extract' list.
if (TreeEntry *Entry = getTreeEntry(V)) {
@@ -5966,7 +5915,7 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
ShuffleBuilder.addMask(ReuseShuffleIndicies);
Vec = ShuffleBuilder.finalize(Vec);
if (auto *I = dyn_cast<Instruction>(Vec)) {
- GatherShuffleSeq.insert(I);
+ GatherSeq.insert(I);
CSEBlocks.insert(I->getParent());
}
}
@@ -6004,7 +5953,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
ShuffleBuilder.addMask(E->ReuseShuffleIndices);
Vec = ShuffleBuilder.finalize(Vec);
if (auto *I = dyn_cast<Instruction>(Vec)) {
- GatherShuffleSeq.insert(I);
+ GatherSeq.insert(I);
CSEBlocks.insert(I->getParent());
}
}
@@ -6495,14 +6444,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
V1 = Builder.CreateCast(
static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
}
- // Add V0 and V1 to later analysis to try to find and remove matching
- // instruction, if any.
- for (Value *V : {V0, V1}) {
- if (auto *I = dyn_cast<Instruction>(V)) {
- GatherShuffleSeq.insert(I);
- CSEBlocks.insert(I->getParent());
- }
- }
// Create shuffle to take alternate operations from the vector.
// Also, gather up main and alt scalar ops to propagate IR flags to
@@ -6716,10 +6657,10 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
}
void BoUpSLP::optimizeGatherSequence() {
- LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleSeq.size()
+ LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
<< " gather sequences instructions.\n");
// LICM InsertElementInst sequences.
- for (Instruction *I : GatherShuffleSeq) {
+ for (Instruction *I : GatherSeq) {
if (isDeleted(I))
continue;
@@ -6778,7 +6719,7 @@ void BoUpSLP::optimizeGatherSequence() {
if (isDeleted(&In))
continue;
if (!isa<InsertElementInst>(&In) && !isa<ExtractElementInst>(&In) &&
- !isa<ShuffleVectorInst>(&In) && !GatherShuffleSeq.contains(&In))
+ !isa<ShuffleVectorInst>(&In))
continue;
// Check if we can replace this instruction with any of the
@@ -6800,7 +6741,7 @@ void BoUpSLP::optimizeGatherSequence() {
}
}
CSEBlocks.clear();
- GatherShuffleSeq.clear();
+ GatherSeq.clear();
}
// Groups the instructions to a bundle (which is then a single scheduling entity)
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
index 26aeadac2968..16fd83f4b2ec 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
@@ -9,9 +9,11 @@ define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP3]]
-; CHECK-NEXT: ret <2 x i64> [[TMP5]]
+; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[V0]], [[V1]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[V0]], [[V1]]
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP6]], [[TMP3]]
+; CHECK-NEXT: ret <2 x i64> [[TMP7]]
;
%v0.0 = extractelement <2 x i64> %v0, i32 0
%v0.1 = extractelement <2 x i64> %v0, i32 1
@@ -37,10 +39,12 @@ define void @store_chain_v2i64(i64* %a, i64* %b, i64* %c) {
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP7]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>*
-; CHECK-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* [[TMP10]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP9]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i64> [[TMP10]], [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>*
+; CHECK-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* [[TMP12]], align 8
; CHECK-NEXT: ret void
;
%a.0 = getelementptr i64, i64* %a, i64 0
@@ -69,9 +73,11 @@ define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 3, i32 6>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
-; CHECK-NEXT: ret <4 x i32> [[TMP5]]
+; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[V0]], [[V1]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[V0]], [[V1]]
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]]
+; CHECK-NEXT: ret <4 x i32> [[TMP7]]
;
%v0.0 = extractelement <4 x i32> %v0, i32 0
%v0.1 = extractelement <4 x i32> %v0, i32 1
@@ -105,9 +111,11 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]]
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i32> [[TMP6]], [[TMP3]]
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: ret <4 x i32> [[SHUFFLE]]
;
%v0.0 = extractelement <2 x i32> %v0, i32 0
@@ -216,15 +224,17 @@ define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 7, i32 2>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]], <i32 15, i32 15, i32 15, i32 15>
-; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i32> [[TMP6]], <i32 65537, i32 65537, i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP8:%.*]] = mul nuw <4 x i32> [[TMP7]], <i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]]
-; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i32> [[TMP9]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
-; CHECK-NEXT: ret i32 [[TMP11]]
+; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[V0]], [[V1]]
+; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[V0]], [[V1]]
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]], <i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i32> [[TMP8]], <i32 65537, i32 65537, i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP10:%.*]] = mul nuw <4 x i32> [[TMP9]], <i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP10]], [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i32> [[TMP11]], [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]])
+; CHECK-NEXT: ret i32 [[TMP13]]
;
%v0.0 = extractelement <4 x i32> %v0, i32 0
%v0.1 = extractelement <4 x i32> %v0, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
index 0ab9d032729f..b4ed8604e2f8 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -9,9 +9,11 @@ define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP3]]
-; CHECK-NEXT: ret <2 x i64> [[TMP5]]
+; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[V0]], [[V1]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[V0]], [[V1]]
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP6]], [[TMP3]]
+; CHECK-NEXT: ret <2 x i64> [[TMP7]]
;
%v0.0 = extractelement <2 x i64> %v0, i32 0
%v0.1 = extractelement <2 x i64> %v0, i32 1
@@ -37,10 +39,12 @@ define void @store_chain_v2i64(i64* %a, i64* %b, i64* %c) {
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP7]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>*
-; CHECK-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* [[TMP10]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP9]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i64> [[TMP10]], [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>*
+; CHECK-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* [[TMP12]], align 8
; CHECK-NEXT: ret void
;
%a.0 = getelementptr i64, i64* %a, i64 0
@@ -69,9 +73,11 @@ define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 3, i32 6>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
-; CHECK-NEXT: ret <4 x i32> [[TMP5]]
+; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[V0]], [[V1]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[V0]], [[V1]]
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]]
+; CHECK-NEXT: ret <4 x i32> [[TMP7]]
;
%v0.0 = extractelement <4 x i32> %v0, i32 0
%v0.1 = extractelement <4 x i32> %v0, i32 1
@@ -105,9 +111,11 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]]
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i32> [[TMP6]], [[TMP3]]
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: ret <4 x i32> [[SHUFFLE]]
;
%v0.0 = extractelement <2 x i32> %v0, i32 0
@@ -216,15 +224,17 @@ define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 7, i32 2>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]], <i32 15, i32 15, i32 15, i32 15>
-; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i32> [[TMP6]], <i32 65537, i32 65537, i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP8:%.*]] = mul nuw <4 x i32> [[TMP7]], <i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]]
-; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i32> [[TMP9]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
-; CHECK-NEXT: ret i32 [[TMP11]]
+; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[V0]], [[V1]]
+; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[V0]], [[V1]]
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]], <i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i32> [[TMP8]], <i32 65537, i32 65537, i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP10:%.*]] = mul nuw <4 x i32> [[TMP9]], <i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP10]], [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i32> [[TMP11]], [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]])
+; CHECK-NEXT: ret i32 [[TMP13]]
;
%v0.0 = extractelement <4 x i32> %v0, i32 0
%v0.1 = extractelement <4 x i32> %v0, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_alternate.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_alternate.ll
index b140c567a7c2..697767a07a5c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_alternate.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_alternate.ll
@@ -8,7 +8,7 @@
; YAML-NEXT: Function: build_vec_v2i64
; YAML-NEXT: Args:
; YAML-NEXT: - String: 'SLP vectorized with cost '
-; YAML-NEXT: - Cost: '-10'
+; YAML-NEXT: - Cost: '-8'
; YAML-NEXT: - String: ' and with tree size '
; YAML-NEXT: - TreeSize: '8'
@@ -17,9 +17,11 @@ define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP3]]
-; CHECK-NEXT: ret <2 x i64> [[TMP5]]
+; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[V0]], [[V1]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[V0]], [[V1]]
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP6]], [[TMP3]]
+; CHECK-NEXT: ret <2 x i64> [[TMP7]]
;
%v0.0 = extractelement <2 x i64> %v0, i32 0
%v0.1 = extractelement <2 x i64> %v0, i32 1
More information about the llvm-commits
mailing list