[llvm] bebe702 - [SLP]Detect reused scalars in all possible gathers for better vectorization cost.
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 5 09:43:36 PDT 2021
Author: Alexey Bataev
Date: 2021-10-05T09:43:03-07:00
New Revision: bebe702dbe8c883fd534d718288ed18319dea1a1
URL: https://github.com/llvm/llvm-project/commit/bebe702dbe8c883fd534d718288ed18319dea1a1
DIFF: https://github.com/llvm/llvm-project/commit/bebe702dbe8c883fd534d718288ed18319dea1a1.diff
LOG: [SLP]Detect reused scalars in all possible gathers for better vectorization cost.
Some initially gathered nodes missed the check for the reused scalars,
which leads to high gather cost. Such nodes still can be represented as
m gathers + shuffle instead of n gathers, where m < n.
Differential Revision: https://reviews.llvm.org/D111153
Added:
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 84ee6512173b3..61a4cbec6e038 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3198,10 +3198,42 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
const EdgeInfo &UserTreeIdx) {
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
+ SmallVector<int> ReuseShuffleIndicies;
+ SmallVector<Value *> UniqueValues;
+ auto &&TryToFindDuplicates = [&VL, &ReuseShuffleIndicies, &UniqueValues,
+ &UserTreeIdx,
+ this](const InstructionsState &S) {
+ // Check that every instruction appears once in this bundle.
+ DenseMap<Value *, unsigned> UniquePositions;
+ for (Value *V : VL) {
+ auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
+ ReuseShuffleIndicies.emplace_back(isa<UndefValue>(V) ? -1
+ : Res.first->second);
+ if (Res.second)
+ UniqueValues.emplace_back(V);
+ }
+ size_t NumUniqueScalarValues = UniqueValues.size();
+ if (NumUniqueScalarValues == VL.size()) {
+ ReuseShuffleIndicies.clear();
+ } else {
+ LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
+ if (NumUniqueScalarValues <= 1 ||
+ !llvm::isPowerOf2_32(NumUniqueScalarValues)) {
+ LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return false;
+ }
+ VL = UniqueValues;
+ }
+ return true;
+ };
+
InstructionsState S = getSameOpcode(VL);
if (Depth == RecursionMaxDepth) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ if (TryToFindDuplicates(S))
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
return;
}
@@ -3210,7 +3242,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
isa<ScalableVectorType>(
cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ if (TryToFindDuplicates(S))
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
return;
}
@@ -3236,7 +3270,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
(isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(S.MainOp) &&
!all_of(VL, isVectorLikeInstWithConstOps))) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ if (TryToFindDuplicates(S))
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
return;
}
@@ -3258,7 +3294,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
if (!E->isSame(VL)) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ if (TryToFindDuplicates(S))
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
return;
}
// Record the reuse of the tree node. FIXME, currently this is only used to
@@ -3277,7 +3315,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (getTreeEntry(I)) {
LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
<< ") is already in tree.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ if (TryToFindDuplicates(S))
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
return;
}
}
@@ -3288,7 +3328,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (Value *V : VL) {
if (MustGather.count(V) || is_contained(UserIgnoreList, V)) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ if (TryToFindDuplicates(S))
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
return;
}
}
@@ -3307,28 +3349,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
// Check that every instruction appears once in this bundle.
- SmallVector<int> ReuseShuffleIndicies;
- SmallVector<Value *, 4> UniqueValues;
- DenseMap<Value *, unsigned> UniquePositions;
- for (Value *V : VL) {
- auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
- ReuseShuffleIndicies.emplace_back(Res.first->second);
- if (Res.second)
- UniqueValues.emplace_back(V);
- }
- size_t NumUniqueScalarValues = UniqueValues.size();
- if (NumUniqueScalarValues == VL.size()) {
- ReuseShuffleIndicies.clear();
- } else {
- LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
- if (NumUniqueScalarValues <= 1 ||
- !llvm::isPowerOf2_32(NumUniqueScalarValues)) {
- LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
- return;
- }
- VL = UniqueValues;
- }
+ if (!TryToFindDuplicates(S))
+ return;
auto &BSRef = BlocksSchedules[BB];
if (!BSRef)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
index f7c0b92d10caf..d23dc9b1d822a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
@@ -16,38 +16,13 @@
define void @splat(i8 %a, i8 %b, i8 %c) {
; SSE-LABEL: @splat(
-; SSE-NEXT: [[TMP1:%.*]] = xor i8 [[C:%.*]], [[A:%.*]]
-; SSE-NEXT: store i8 [[TMP1]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 0), align 16
-; SSE-NEXT: [[TMP2:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT: store i8 [[TMP2]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 1), align 1
-; SSE-NEXT: [[TMP3:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT: store i8 [[TMP3]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 2), align 1
-; SSE-NEXT: [[TMP4:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT: store i8 [[TMP4]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 3), align 1
-; SSE-NEXT: [[TMP5:%.*]] = xor i8 [[C]], [[A]]
-; SSE-NEXT: store i8 [[TMP5]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 4), align 1
-; SSE-NEXT: [[TMP6:%.*]] = xor i8 [[C]], [[B:%.*]]
-; SSE-NEXT: store i8 [[TMP6]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 5), align 1
-; SSE-NEXT: [[TMP7:%.*]] = xor i8 [[C]], [[A]]
-; SSE-NEXT: store i8 [[TMP7]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 6), align 1
-; SSE-NEXT: [[TMP8:%.*]] = xor i8 [[C]], [[B]]
-; SSE-NEXT: store i8 [[TMP8]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 7), align 1
-; SSE-NEXT: [[TMP9:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT: store i8 [[TMP9]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 8), align 1
-; SSE-NEXT: [[TMP10:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT: store i8 [[TMP10]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 9), align 1
-; SSE-NEXT: [[TMP11:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT: store i8 [[TMP11]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 10), align 1
-; SSE-NEXT: [[TMP12:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT: store i8 [[TMP12]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 11), align 1
-; SSE-NEXT: [[TMP13:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT: store i8 [[TMP13]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 12), align 1
-; SSE-NEXT: [[TMP14:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT: store i8 [[TMP14]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 13), align 1
-; SSE-NEXT: [[TMP15:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT: store i8 [[TMP15]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 14), align 1
-; SSE-NEXT: [[TMP16:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT: store i8 [[TMP16]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 15), align 1
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0
+; SSE-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> zeroinitializer
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i32 0
+; SSE-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[TMP2]], i8 [[B:%.*]], i32 1
+; SSE-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+; SSE-NEXT: [[TMP4:%.*]] = xor <16 x i8> [[SHUFFLE]], [[SHUFFLE1]]
+; SSE-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast ([32 x i8]* @cle to <16 x i8>*), align 16
; SSE-NEXT: ret void
;
; AVX-LABEL: @splat(
More information about the llvm-commits
mailing list