[llvm] [LV]Fix/improve max safe distance analysis (PR #121156)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 26 08:25:52 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-analysis
Author: Alexey Bataev (alexey-bataev)
<details>
<summary>Changes</summary>
The patch improves/fixes the existing max safe distance analysis. It fixes
the power-of-2 analysis for non-power-of-2 safe distances (by using gcd
and countr_zero instead of bit_floor) and improves detection of
potentially tail-folded loops with safe distances.
Part of #<!-- -->100755
---
Full diff: https://github.com/llvm/llvm-project/pull/121156.diff
7 Files Affected:
- (modified) llvm/include/llvm/Analysis/LoopAccessAnalysis.h (+10)
- (modified) llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h (+6)
- (modified) llvm/lib/Analysis/LoopAccessAnalysis.cpp (+13-10)
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+38-15)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll (+2-2)
- (modified) llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll (+2-50)
- (modified) llvm/test/Transforms/LoopVectorize/memdep.ll (+2-2)
``````````diff
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index a35bc7402d1a89..d5cf959fb04ec2 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -216,6 +216,12 @@ class MemoryDepChecker {
return MaxSafeVectorWidthInBits;
}
+ /// Return safe power-of-2 number of elements, which do not prevent store-load
+ /// forwarding and safe to operate simultaneously.
+ std::optional<uint64_t> getStoreLoadForwardSafeVF() const {
+ return MaxStoreLoadForwardSafeVF;
+ }
+
/// In same cases when the dependency check fails we can still
/// vectorize the loop with a dynamic array access check.
bool shouldRetryWithRuntimeCheck() const {
@@ -304,6 +310,10 @@ class MemoryDepChecker {
/// restrictive.
uint64_t MaxSafeVectorWidthInBits = -1U;
+ /// Maximum number of elements (power-of-2 and non-power-of-2), which do not
+ /// prevent store-load forwarding and safe to operate simultaneously.
+ std::optional<uint64_t> MaxStoreLoadForwardSafeVF;
+
/// If we see a non-constant dependence distance we can still try to
/// vectorize this loop with runtime checks.
bool FoundNonConstantDistanceDependence = false;
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index fbe80eddbae07a..462c11d841b841 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -412,6 +412,12 @@ class LoopVectorizationLegality {
return getUncountableExitBlocks()[0];
}
+ /// Return safe power-of-2 number of elements, which do not prevent store-load
+ /// forwarding and safe to operate simultaneously.
+ std::optional<unsigned> getMaxStoreLoadForwardSafeVFPowerOf2() const {
+ return LAI->getDepChecker().getStoreLoadForwardSafeVF();
+ }
+
/// Returns true if vector representation of the instruction \p I
/// requires mask.
bool isMaskRequired(const Instruction *I) const {
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 2c75d5625cb66d..764600c3adae7a 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1752,31 +1752,34 @@ bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
// cause any slowdowns.
const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize;
// Maximum vector factor.
- uint64_t MaxVFWithoutSLForwardIssues = std::min(
- VectorizerParams::MaxVectorWidth * TypeByteSize, MinDepDistBytes);
+ uint64_t MaxVFWithoutSLForwardIssuesPowerOf2 = std::min(
+ VectorizerParams::MaxVectorWidth * TypeByteSize,
+ MaxStoreLoadForwardSafeVF.value_or(std::numeric_limits<uint64_t>::max()));
// Compute the smallest VF at which the store and load would be misaligned.
- for (uint64_t VF = 2 * TypeByteSize; VF <= MaxVFWithoutSLForwardIssues;
- VF *= 2) {
+ for (uint64_t VF = 2 * TypeByteSize;
+ VF <= MaxVFWithoutSLForwardIssuesPowerOf2; VF *= 2) {
// If the number of vector iteration between the store and the load are
// small we could incur conflicts.
if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) {
- MaxVFWithoutSLForwardIssues = (VF >> 1);
+ MaxVFWithoutSLForwardIssuesPowerOf2 = (VF >> 1);
break;
}
}
- if (MaxVFWithoutSLForwardIssues < 2 * TypeByteSize) {
+ if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize) {
LLVM_DEBUG(
dbgs() << "LAA: Distance " << Distance
<< " that could cause a store-load forwarding conflict\n");
return true;
}
- if (MaxVFWithoutSLForwardIssues < MinDepDistBytes &&
- MaxVFWithoutSLForwardIssues !=
- VectorizerParams::MaxVectorWidth * TypeByteSize)
- MinDepDistBytes = MaxVFWithoutSLForwardIssues;
+ if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize)
+ MaxStoreLoadForwardSafeVF = 1;
+ else if (MaxVFWithoutSLForwardIssuesPowerOf2 < MaxStoreLoadForwardSafeVF &&
+ MaxVFWithoutSLForwardIssuesPowerOf2 !=
+ VectorizerParams::MaxVectorWidth * TypeByteSize)
+ MaxStoreLoadForwardSafeVF = MaxVFWithoutSLForwardIssuesPowerOf2;
return false;
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index cb828b738d310f..367a011323b51b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1436,8 +1436,10 @@ class LoopVectorizationCostModel {
/// Selects and saves TailFoldingStyle for 2 options - if IV update may
/// overflow or not.
/// \param IsScalableVF true if scalable vector factors enabled.
+ /// \param TailFoldPowOf2 true if tail folding with power-of-2
+ /// safe distance can be enabled.
/// \param UserIC User specific interleave count.
- void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
+ void setTailFoldingStyles(bool IsScalableVF, bool TailFoldPowOf2, unsigned UserIC) {
assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
if (!Legal->canFoldTailByMasking()) {
ChosenTailFoldingStyle =
@@ -1446,24 +1448,37 @@ class LoopVectorizationCostModel {
}
if (!ForceTailFoldingStyle.getNumOccurrences()) {
- ChosenTailFoldingStyle = std::make_pair(
- TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
- TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
+ if (!TailFoldPowOf2)
+ ChosenTailFoldingStyle =
+ std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
+ else
+ ChosenTailFoldingStyle = std::make_pair(
+ TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
+ TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
return;
}
// Set styles when forced.
ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
ForceTailFoldingStyle.getValue());
- if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
+ if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL) {
+ if (!TailFoldPowOf2)
+ ChosenTailFoldingStyle =
+ std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
return;
+ }
// Override forced styles if needed.
// FIXME: use actual opcode/data type for analysis here.
// FIXME: Investigate opportunity for fixed vector factor.
- bool EVLIsLegal = UserIC <= 1 &&
+ bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
TTI.hasActiveVectorLength(0, nullptr, Align()) &&
!EnableVPlanNativePath;
if (!EVLIsLegal) {
+ if (!TailFoldPowOf2) {
+ ChosenTailFoldingStyle =
+ std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
+ return;
+ }
// If for some reason EVL mode is unsupported, fallback to
// DataWithoutLaneMask to try to vectorize the loop with folded tail
// in a generic way.
@@ -4016,11 +4031,15 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
// It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
// the memory accesses that is most restrictive (involved in the smallest
// dependence distance).
- unsigned MaxSafeElements =
- llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
+ unsigned MaxSafeElements = Legal->getMaxSafeVectorWidthInBits() / WidestType;
+ if (Legal->isSafeForAnyVectorWidth())
+ MaxSafeElements = bit_ceil(MaxSafeElements);
+ unsigned MaxSafeElementsPowerOf2 = bit_floor(std::gcd(
+ MaxSafeElements, Legal->getMaxStoreLoadForwardSafeVFPowerOf2().value_or(
+ 1ULL << countr_zero(MaxSafeElements))));
+ auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElementsPowerOf2);
+ auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElementsPowerOf2);
- auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
- auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
if (!Legal->isSafeForAnyVectorWidth())
this->MaxSafeElements = MaxSafeElements;
@@ -4233,13 +4252,11 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
return MaxFactors;
}
+ MaxPowerOf2RuntimeVF.reset();
}
- // If we don't know the precise trip count, or if the trip count that we
- // found modulo the vectorization factor is not zero, try to fold the tail
- // by masking.
- // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
- setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
+ setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(),
+ !MaxPowerOf2RuntimeVF.has_value(), UserIC);
if (foldTailByMasking()) {
if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
LLVM_DEBUG(
@@ -4258,6 +4275,12 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return MaxFactors;
}
+ if (MaxPowerOf2RuntimeVF) {
+ // Accept MaxFixedVF if we do not have a tail.
+ LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
+ return MaxFactors;
+ }
+
// If there was a tail-folding hint/switch, but we can't fold the tail by
// masking, fallback to a vectorization with a scalar epilogue.
if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index eb60c24393df99..cbdd9a06497655 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -21,7 +21,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Found trip count: 0
; CHECK-NEXT: LV: Found maximum trip count: 4294967295
; CHECK-NEXT: LV: Scalable vectorization is available
-; CHECK-NEXT: LV: The max safe fixed VF is: 67108864.
+; CHECK-NEXT: LV: The max safe fixed VF is: 134217728.
; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295.
; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
@@ -268,7 +268,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Found trip count: 0
; CHECK-NEXT: LV: Found maximum trip count: 4294967295
; CHECK-NEXT: LV: Scalable vectorization is available
-; CHECK-NEXT: LV: The max safe fixed VF is: 67108864.
+; CHECK-NEXT: LV: The max safe fixed VF is: 134217728.
; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295.
; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
diff --git a/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll b/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll
index d1ad7e3f4fc0d8..ea592c1e1063ac 100644
--- a/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll
+++ b/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll
@@ -24,57 +24,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
define void @maxvf3() {
; CHECK-LABEL: @maxvf3(
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <2 x i32> [[VEC_IND]], splat (i32 14)
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
-; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; CHECK: pred.store.if:
-; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [18 x i8], ptr @a, i32 0, i32 [[TMP2]]
-; CHECK-NEXT: store i8 69, ptr [[TMP3]], align 8
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
-; CHECK: pred.store.continue:
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
-; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
-; CHECK: pred.store.if1:
-; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [18 x i8], ptr @a, i32 0, i32 [[TMP5]]
-; CHECK-NEXT: store i8 69, ptr [[TMP6]], align 8
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]]
-; CHECK: pred.store.continue2:
-; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw <2 x i32> splat (i32 3), [[VEC_IND]]
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
-; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
-; CHECK: pred.store.if3:
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [18 x i8], ptr @a, i32 0, i32 [[TMP9]]
-; CHECK-NEXT: store i8 7, ptr [[TMP10]], align 8
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]]
-; CHECK: pred.store.continue4:
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
-; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
-; CHECK: pred.store.if5:
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [18 x i8], ptr @a, i32 0, i32 [[TMP12]]
-; CHECK-NEXT: store i8 7, ptr [[TMP13]], align 8
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
-; CHECK: pred.store.continue6:
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
-; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: [[J:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[J_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[J_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[AJ:%.*]] = getelementptr inbounds [18 x i8], ptr @a, i32 0, i32 [[J]]
; CHECK-NEXT: store i8 69, ptr [[AJ]], align 8
; CHECK-NEXT: [[JP3:%.*]] = add nuw nsw i32 3, [[J]]
@@ -82,7 +34,7 @@ define void @maxvf3() {
; CHECK-NEXT: store i8 7, ptr [[AJP3]], align 8
; CHECK-NEXT: [[J_NEXT]] = add nuw nsw i32 [[J]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[J_NEXT]], 15
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/memdep.ll b/llvm/test/Transforms/LoopVectorize/memdep.ll
index b891b4312f18d3..28cf3b61b2554a 100644
--- a/llvm/test/Transforms/LoopVectorize/memdep.ll
+++ b/llvm/test/Transforms/LoopVectorize/memdep.ll
@@ -226,7 +226,7 @@ for.end:
;Check the new calculation of the maximum safe distance in bits which can be vectorized.
;The previous behavior did not take account that the stride was 2.
-;Therefore the maxVF was computed as 8 instead of 4, as the dependence distance here is 6 iterations, given by |N-(N-12)|/2.
+;Therefore the maxVF was computed as 8 instead of 2, as the dependence distance here is 6 iterations, given by |N-(N-12)|/2.
;#define M 32
;#define N 2 * M
@@ -242,7 +242,7 @@ for.end:
;}
; RIGHTVF-LABEL: @pr34283
-; RIGHTVF: <4 x i64>
+; RIGHTVF: <2 x i64>
; WRONGVF-LABLE: @pr34283
; WRONGVF-NOT: <8 x i64>
``````````
</details>
https://github.com/llvm/llvm-project/pull/121156
More information about the llvm-commits
mailing list