[llvm] 92fbd76 - [SLP]Improve registering and merging of compatible shuffles.
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 2 08:49:11 PST 2021
Author: Alexey Bataev
Date: 2021-12-02T08:48:29-08:00
New Revision: 92fbd76af525095197188c862db2c4f5b96a8a73
URL: https://github.com/llvm/llvm-project/commit/92fbd76af525095197188c862db2c4f5b96a8a73
DIFF: https://github.com/llvm/llvm-project/commit/92fbd76af525095197188c862db2c4f5b96a8a73.diff
LOG: [SLP]Improve registering and merging of compatible shuffles.
If several shuffle instructions are emitted, some of them might
same/compatible (less defined) with the previously emitted ones. Such
shuffles can be removed safely, improving the total cost of the
vectorized code.
Differential Revision: https://reviews.llvm.org/D114087
Added:
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll
llvm/test/Transforms/SLPVectorizer/X86/gather-move-out-of-loop.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index fb18da0ddc694..a6231f8b6e7f9 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5922,10 +5922,17 @@ class ShuffleInstructionBuilder {
const unsigned VF = 0;
bool IsFinalized = false;
SmallVector<int, 4> Mask;
+ /// Holds all of the instructions that we gathered.
+ SetVector<Instruction *> &GatherShuffleSeq;
+ /// A list of blocks that we are going to CSE.
+ SetVector<BasicBlock *> &CSEBlocks;
public:
- ShuffleInstructionBuilder(IRBuilderBase &Builder, unsigned VF)
- : Builder(Builder), VF(VF) {}
+ ShuffleInstructionBuilder(IRBuilderBase &Builder, unsigned VF,
+ SetVector<Instruction *> &GatherShuffleSeq,
+ SetVector<BasicBlock *> &CSEBlocks)
+ : Builder(Builder), VF(VF), GatherShuffleSeq(GatherShuffleSeq),
+ CSEBlocks(CSEBlocks) {}
/// Adds a mask, inverting it before applying.
void addInversedMask(ArrayRef<unsigned> SubMask) {
@@ -5955,7 +5962,12 @@ class ShuffleInstructionBuilder {
if (VF == ValueVF && ShuffleVectorInst::isIdentityMask(Mask))
return V;
- return Builder.CreateShuffleVector(V, Mask, "shuffle");
+ Value *Vec = Builder.CreateShuffleVector(V, Mask, "shuffle");
+ if (auto *I = dyn_cast<Instruction>(Vec)) {
+ GatherShuffleSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ return Vec;
}
~ShuffleInstructionBuilder() {
@@ -6013,6 +6025,10 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
std::iota(UniformMask.begin(), UniformMask.end(), 0);
V = Builder.CreateShuffleVector(V, UniformMask, "shrink.shuffle");
}
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherShuffleSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
}
return V;
}
@@ -6060,15 +6076,12 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
VL = UniqueValues;
}
- ShuffleInstructionBuilder ShuffleBuilder(Builder, VF);
+ ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq,
+ CSEBlocks);
Value *Vec = gather(VL);
if (!ReuseShuffleIndicies.empty()) {
ShuffleBuilder.addMask(ReuseShuffleIndicies);
Vec = ShuffleBuilder.finalize(Vec);
- if (auto *I = dyn_cast<Instruction>(Vec)) {
- GatherShuffleSeq.insert(I);
- CSEBlocks.insert(I->getParent());
- }
}
return Vec;
}
@@ -6083,7 +6096,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
unsigned VF = E->getVectorFactor();
- ShuffleInstructionBuilder ShuffleBuilder(Builder, VF);
+ ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq,
+ CSEBlocks);
if (E->State == TreeEntry::NeedToGather) {
if (E->getMainOp())
setInsertPointAfterBundle(E);
@@ -6097,16 +6111,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
"Expected shuffle of 1 or 2 entries.");
Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue,
Entries.back()->VectorizedValue, Mask);
+ if (auto *I = dyn_cast<Instruction>(Vec)) {
+ GatherShuffleSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
} else {
Vec = gather(E->Scalars);
}
if (NeedToShuffleReuses) {
ShuffleBuilder.addMask(E->ReuseShuffleIndices);
Vec = ShuffleBuilder.finalize(Vec);
- if (auto *I = dyn_cast<Instruction>(Vec)) {
- GatherShuffleSeq.insert(I);
- CSEBlocks.insert(I->getParent());
- }
}
E->VectorizedValue = Vec;
return Vec;
@@ -6223,8 +6237,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
IsIdentity &= *InsertIdx - Offset == I;
Mask[*InsertIdx - Offset] = I;
}
- if (!IsIdentity || NumElts != NumScalars)
+ if (!IsIdentity || NumElts != NumScalars) {
V = Builder.CreateShuffleVector(V, Mask);
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherShuffleSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
if ((!IsIdentity || Offset != 0 ||
!isUndefVector(FirstInsert->getOperand(0))) &&
@@ -6239,6 +6258,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
V = Builder.CreateShuffleVector(
FirstInsert->getOperand(0), V, InsertMask,
cast<Instruction>(E->Scalars.back())->getName());
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherShuffleSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
}
++NumVectorInstructions;
@@ -6621,8 +6644,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
propagateIRFlags(V1, AltScalars);
Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
- if (Instruction *I = dyn_cast<Instruction>(V))
+ if (auto *I = dyn_cast<Instruction>(V)) {
V = propagateMetadata(I, E->Scalars);
+ GatherShuffleSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
V = ShuffleBuilder.finalize(V);
E->VectorizedValue = V;
@@ -6863,7 +6889,50 @@ void BoUpSLP::optimizeGatherSequence() {
return A->getDFSNumIn() < B->getDFSNumIn();
});
- // Perform O(N^2) search over the gather sequences and merge identical
+ // Less defined shuffles can be replaced by the more defined copies.
+ // Between two shuffles one is less defined if it has the same vector operands
+ // and its mask indeces are the same as in the first one or undefs. E.g.
+ // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
+ // poison, <0, 0, 0, 0>.
+ auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
+ SmallVectorImpl<int> &NewMask) {
+ if (I1->getType() != I2->getType())
+ return false;
+ auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
+ auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
+ if (!SI1 || !SI2)
+ return I1->isIdenticalTo(I2);
+ if (SI1->isIdenticalTo(SI2))
+ return true;
+ for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
+ if (SI1->getOperand(I) != SI2->getOperand(I))
+ return false;
+ // Check if the second instruction is more defined than the first one.
+ NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
+ ArrayRef<int> SM1 = SI1->getShuffleMask();
+ // Count trailing undefs in the mask to check the final number of used
+ // registers.
+ unsigned LastUndefsCnt = 0;
+ for (int I = 0, E = NewMask.size(); I < E; ++I) {
+ if (SM1[I] == UndefMaskElem)
+ ++LastUndefsCnt;
+ else
+ LastUndefsCnt = 0;
+ if (NewMask[I] != UndefMaskElem && SM1[I] != UndefMaskElem &&
+ NewMask[I] != SM1[I])
+ return false;
+ if (NewMask[I] == UndefMaskElem)
+ NewMask[I] = SM1[I];
+ }
+ // Check if the last undefs actually change the final number of used vector
+ // registers.
+ return SM1.size() - LastUndefsCnt > 1 &&
+ TTI->getNumberOfParts(SI1->getType()) ==
+ TTI->getNumberOfParts(
+ FixedVectorType::get(SI1->getType()->getElementType(),
+ SM1.size() - LastUndefsCnt));
+ };
+ // Perform O(N^2) search over the gather/shuffle sequences and merge identical
// instructions. TODO: We can further optimize this scan if we split the
// instructions into
diff erent buckets based on the insert lane.
SmallVector<Instruction *, 16> Visited;
@@ -6883,11 +6952,29 @@ void BoUpSLP::optimizeGatherSequence() {
// Check if we can replace this instruction with any of the
// visited instructions.
bool Replaced = false;
- for (Instruction *v : Visited) {
- if (In.isIdenticalTo(v) &&
- DT->dominates(v->getParent(), In.getParent())) {
- In.replaceAllUsesWith(v);
+ for (Instruction *&V : Visited) {
+ SmallVector<int> NewMask;
+ if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
+ DT->dominates(V->getParent(), In.getParent())) {
+ In.replaceAllUsesWith(V);
eraseInstruction(&In);
+ if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
+ if (!NewMask.empty())
+ SI->setShuffleMask(NewMask);
+ Replaced = true;
+ break;
+ }
+ if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
+ GatherShuffleSeq.contains(V) &&
+ IsIdenticalOrLessDefined(V, &In, NewMask) &&
+ DT->dominates(In.getParent(), V->getParent())) {
+ In.moveAfter(V);
+ V->replaceAllUsesWith(&In);
+ eraseInstruction(V);
+ if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
+ if (!NewMask.empty())
+ SI->setShuffleMask(NewMask);
+ V = &In;
Replaced = true;
break;
}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll
index 3e23a4762777d..01b75f4f9806e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll
@@ -9,8 +9,7 @@ define i32 @diamond_broadcast(i32* noalias nocapture %B, i32* noalias nocapture
; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 undef>
-; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE]]
; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[B]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
@@ -39,9 +38,8 @@ define i32 @diamond_broadcast2(i32* noalias nocapture %B, i32* noalias nocapture
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 undef>
-; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE]]
; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[B]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
@@ -70,9 +68,8 @@ define i32 @diamond_broadcast3(i32* noalias nocapture %B, i32* noalias nocapture
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 undef>
-; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 undef, i32 0>
-; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE]]
; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[B]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-move-out-of-loop.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-move-out-of-loop.ll
index 88d43e64894f2..27d6db4b48950 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gather-move-out-of-loop.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-move-out-of-loop.ll
@@ -7,15 +7,15 @@ define void @test(i16 %0) {
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i16> <i16 0, i16 poison>, i16 [[TMP0:%.*]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP1]] to <2 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> <i32 poison, i32 0, i32 poison, i32 0>, <4 x i32> [[TMP5]], <4 x i32> <i32 4, i32 1, i32 6, i32 3>
; CHECK-NEXT: br label [[FOR_BODY92:%.*]]
; CHECK: for.body92:
; CHECK-NEXT: [[SUM_MVR_I:%.*]] = getelementptr i32, i32* undef, i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[SUM_MVR_ABS_I:%.*]] = getelementptr i32, i32* undef, i32 2
; CHECK-NEXT: [[SUM_MVC_I:%.*]] = getelementptr i32, i32* undef, i32 1
; CHECK-NEXT: [[SUM_MVC_ABS_I:%.*]] = getelementptr i32, i32* undef, i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> <i32 poison, i32 0, i32 poison, i32 0>, <4 x i32> [[TMP5]], <4 x i32> <i32 4, i32 1, i32 6, i32 3>
; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> zeroinitializer, [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[SUM_MVR_I]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 8
More information about the llvm-commits
mailing list