[llvm] d7dd31e - [SLP]Better analysis of the repeated instructions during operands reordering
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 24 14:04:48 PDT 2024
Author: Alexey Bataev
Date: 2024-09-24T14:03:10-07:00
New Revision: d7dd31e41791d71ad81af9cc4e7a26b26d4cb27e
URL: https://github.com/llvm/llvm-project/commit/d7dd31e41791d71ad81af9cc4e7a26b26d4cb27e
DIFF: https://github.com/llvm/llvm-project/commit/d7dd31e41791d71ad81af9cc4e7a26b26d4cb27e.diff
LOG: [SLP]Better analysis of the repeated instructions during operands reordering
When analyzing repeated instructions during operand reordering, it is
better to treat the reordering as non-profitable if the resulting number
of unique instructions is not a power of 2. In that case it is better to
keep a power-of-2 number of elements, as this allows better vectorization.
Fixes https://github.com/llvm/llvm-project/issues/109725
Added:
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b79e964cdb1b6b..414c6388c777b3 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1930,30 +1930,38 @@ class BoUpSLP {
/// elements in the lane, it will be vectorized with higher probability
/// after removing duplicates. Currently the SLP vectorizer supports only
/// vectorization of the power-of-2 number of unique scalars.
- int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
+ int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
+ const SmallBitVector &UsedLanes) const {
Value *IdxLaneV = getData(Idx, Lane).V;
- if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
+ if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
+ isa<ExtractElementInst>(IdxLaneV))
return 0;
- SmallPtrSet<Value *, 4> Uniques;
- for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
+ SmallDenseMap<Value *, unsigned, 4> Uniques;
+ for (unsigned Ln : seq<unsigned>(getNumLanes())) {
if (Ln == Lane)
continue;
Value *OpIdxLnV = getData(OpIdx, Ln).V;
if (!isa<Instruction>(OpIdxLnV))
return 0;
- Uniques.insert(OpIdxLnV);
+ Uniques.try_emplace(OpIdxLnV, Ln);
}
- int UniquesCount = Uniques.size();
- int UniquesCntWithIdxLaneV =
- Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
+ unsigned UniquesCount = Uniques.size();
+ auto IdxIt = Uniques.find(IdxLaneV);
+ unsigned UniquesCntWithIdxLaneV =
+ IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
Value *OpIdxLaneV = getData(OpIdx, Lane).V;
- int UniquesCntWithOpIdxLaneV =
- Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
+ auto OpIdxIt = Uniques.find(OpIdxLaneV);
+ unsigned UniquesCntWithOpIdxLaneV =
+ OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
return 0;
- return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
- UniquesCntWithOpIdxLaneV) -
- (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
+ return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
+ UniquesCntWithOpIdxLaneV,
+ UniquesCntWithOpIdxLaneV -
+ bit_floor(UniquesCntWithOpIdxLaneV)) -
+ ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
+ ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
+ : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
}
/// \param Lane lane of the operands under analysis.
@@ -1993,7 +2001,7 @@ class BoUpSLP {
/// predecessors.
int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
int Lane, unsigned OpIdx, unsigned Idx,
- bool &IsUsed) {
+ bool &IsUsed, const SmallBitVector &UsedLanes) {
LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
LookAheadMaxDepth);
// Keep track of the instruction stack as we recurse into the operands
@@ -2002,11 +2010,10 @@ class BoUpSLP {
LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
/*CurrLevel=*/1, MainAltOps);
if (Score) {
- int SplatScore = getSplatScore(Lane, OpIdx, Idx);
+ int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
if (Score <= -SplatScore) {
- // Set the minimum score for splat-like sequence to avoid setting
- // failed state.
- Score = 1;
+ // Failed score.
+ Score = 0;
} else {
Score += SplatScore;
// Scale score to see the difference between different operands
@@ -2036,7 +2043,8 @@ class BoUpSLP {
std::optional<unsigned>
getBestOperand(unsigned OpIdx, int Lane, int LastLane,
ArrayRef<ReorderingMode> ReorderingModes,
- ArrayRef<Value *> MainAltOps) {
+ ArrayRef<Value *> MainAltOps,
+ const SmallBitVector &UsedLanes) {
unsigned NumOperands = getNumOperands();
// The operand of the previous lane at OpIdx.
@@ -2092,7 +2100,7 @@ class BoUpSLP {
Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
Value *OpRight = (LeftToRight) ? Op : OpLastLane;
int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
- OpIdx, Idx, IsUsed);
+ OpIdx, Idx, IsUsed, UsedLanes);
if (Score > static_cast<int>(BestOp.Score) ||
(Score > 0 && Score == static_cast<int>(BestOp.Score) &&
Idx == OpIdx)) {
@@ -2507,20 +2515,24 @@ class BoUpSLP {
for (unsigned I = 0; I < NumOperands; ++I)
MainAltOps[I].push_back(getData(I, FirstLane).V);
+ SmallBitVector UsedLanes(NumLanes);
+ UsedLanes.set(FirstLane);
for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
// Visit the lane on the right and then the lane on the left.
for (int Direction : {+1, -1}) {
int Lane = FirstLane + Direction * Distance;
if (Lane < 0 || Lane >= (int)NumLanes)
continue;
+ UsedLanes.set(Lane);
int LastLane = Lane - Direction;
assert(LastLane >= 0 && LastLane < (int)NumLanes &&
"Out of bounds");
// Look for a good match for each operand.
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
// Search for the operand that matches SortedOps[OpIdx][Lane-1].
- std::optional<unsigned> BestIdx = getBestOperand(
- OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
+ std::optional<unsigned> BestIdx =
+ getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
+ MainAltOps[OpIdx], UsedLanes);
// By not selecting a value, we allow the operands that follow to
// select a better matching value. We will get a non-null value in
// the next run of getBestOperand().
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll b/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll
index 9691cb7537a702..33fa00c1881da3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll
@@ -6,29 +6,23 @@ define i32 @a() {
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: br label %[[BB1:.*]]
; CHECK: [[BB1]]:
-; CHECK-NEXT: [[TMP2:%.*]] = phi i8 [ 0, [[TMP0:%.*]] ], [ [[TMP7:%.*]], %[[BB1]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = phi i8 [ 0, [[TMP0]] ], [ [[TMP8:%.*]], %[[BB1]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ zeroinitializer, [[TMP0]] ], [ [[TMP6:%.*]], %[[BB1]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ zeroinitializer, [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[BB1]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i8> [ zeroinitializer, [[TMP0]] ], [ [[TMP17:%.*]], %[[BB1]] ]
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
; CHECK-NEXT: [[TMP6]] = load <4 x i8>, ptr null, align 4
-; CHECK-NEXT: [[TMP7]] = extractelement <4 x i8> [[TMP6]], i32 3
-; CHECK-NEXT: [[TMP8]] = extractelement <4 x i8> [[TMP6]], i32 2
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i8> [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = xor i8 [[TMP9]], [[TMP3]]
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i8> [[TMP6]], i32 1
-; CHECK-NEXT: [[TMP12:%.*]] = xor i8 [[TMP11]], [[TMP2]]
-; CHECK-NEXT: [[TMP13:%.*]] = xor i8 [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor i8 [[TMP7]], [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <8 x i32> <i32 poison, i32 0, i32 poison, i32 1, i32 poison, i32 2, i32 0, i32 3>
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x i8> [[TMP15]], i8 [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x i8> [[TMP16]], i8 [[TMP12]], i32 2
-; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x i8> [[TMP17]], i8 [[TMP13]], i32 4
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i8> [[TMP3]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP7]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i8> [[TMP6]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <8 x i32> <i32 poison, i32 0, i32 poison, i32 1, i32 poison, i32 2, i32 poison, i32 3>
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <8 x i32> <i32 0, i32 poison, i32 1, i32 poison, i32 2, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i8> [[TMP10]], <8 x i8> [[TMP11]], <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i8> [[TMP19]], <8 x i8> [[TMP18]], <8 x i32> <i32 1, i32 3, i32 2, i32 9, i32 3, i32 11, i32 poison, i32 13>
-; CHECK-NEXT: [[TMP21:%.*]] = insertelement <8 x i8> [[TMP20]], i8 [[TMP14]], i32 6
+; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <8 x i8> [[TMP19]], <8 x i8> [[TMP18]], <8 x i32> <i32 1, i32 3, i32 2, i32 9, i32 3, i32 11, i32 9, i32 13>
; CHECK-NEXT: [[TMP22:%.*]] = xor <8 x i8> [[TMP18]], [[TMP21]]
; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i8> [[TMP22]], [[TMP5]]
; CHECK-NEXT: store <8 x i8> [[TMP23]], ptr null, align 4
+; CHECK-NEXT: [[TMP17]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <2 x i32> <i32 2, i32 3>
; CHECK-NEXT: br label %[[BB1]]
;
br label %1
More information about the llvm-commits mailing list