[llvm] 9181a7e - [LV] Fix branch weights in epilogue min iteration check block (#152534)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 11 01:52:56 PDT 2025
Author: David Sherwood
Date: 2025-08-11T09:52:54+01:00
New Revision: 9181a7e294507515160cdd9ad3a814bb8e471bd7
URL: https://github.com/llvm/llvm-project/commit/9181a7e294507515160cdd9ad3a814bb8e471bd7
DIFF: https://github.com/llvm/llvm-project/commit/9181a7e294507515160cdd9ad3a814bb8e471bd7.diff
LOG: [LV] Fix branch weights in epilogue min iteration check block (#152534)
I've changed how we construct the EpilogueVectorizerEpilogueLoop and
EpilogueVectorizerMainLoop classes so that we construct the parent class
with an additional boolean parameter indicating whether we're
vectorising the main or epilogue loop. The
InnerLoopAndEpilogueVectorizer class uses this new argument in
combination with the EpilogueLoopVectorizationInfo struct to set the
right UF and VF values. This then allows EpilogueVectorizerEpilogueLoop
to access the correct values of VF and UF for the main loop, which are
required when setting branch weights in the minimum iteration check
block.
Added:
Modified:
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
llvm/test/Transforms/LoopVectorize/branch-weights.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d25add7c4be1f..284592133890e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -687,10 +687,12 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
const TargetTransformInfo *TTI, AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
- ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
- : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
- EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, CM,
- BFI, PSI, Checks, Plan),
+ ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan,
+ ElementCount VecWidth, ElementCount MinProfitableTripCount,
+ unsigned UnrollFactor)
+ : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, VecWidth,
+ MinProfitableTripCount, UnrollFactor, CM, BFI, PSI,
+ Checks, Plan),
EPI(EPI) {}
// Override this function to handle the more complex control flow around the
@@ -725,8 +727,9 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
ProfileSummaryInfo *PSI, GeneratedRTChecks &Check, VPlan &Plan)
- : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
- EPI, CM, BFI, PSI, Check, Plan) {}
+ : InnerLoopAndEpilogueVectorizer(
+ OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, CM, BFI, PSI, Check,
+ Plan, EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF) {}
/// Implements the interface for creating a vectorized skeleton using the
/// *main loop* strategy (ie the first pass of vplan execution).
BasicBlock *createEpilogueVectorizedLoopSkeleton() final;
@@ -752,8 +755,9 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
- : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
- EPI, CM, BFI, PSI, Checks, Plan) {
+ : InnerLoopAndEpilogueVectorizer(
+ OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, CM, BFI, PSI, Checks,
+ Plan, EPI.EpilogueVF, EPI.EpilogueVF, EPI.EpilogueUF) {
TripCount = EPI.TripCount;
}
/// Implements the interface for creating a vectorized skeleton using the
@@ -7508,8 +7512,9 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
assert(Bypass && "Expected valid bypass basic block.");
Value *Count = getTripCount();
MinProfitableTripCount = ElementCount::getFixed(0);
- Value *CheckMinIters = createIterationCountCheck(
- ForEpilogue ? EPI.EpilogueVF : VF, ForEpilogue ? EPI.EpilogueUF : UF);
+ Value *CheckMinIters =
+ createIterationCountCheck(ForEpilogue ? EPI.EpilogueVF : EPI.MainLoopVF,
+ ForEpilogue ? EPI.EpilogueUF : EPI.MainLoopUF);
BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
if (!ForEpilogue)
@@ -7641,9 +7646,7 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
BranchInst &BI =
*BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
- // FIXME: See test Transforms/LoopVectorize/branch-weights.ll. I don't
- // think the MainLoopStep is correct.
- unsigned MainLoopStep = UF * VF.getKnownMinValue();
+ unsigned MainLoopStep = EPI.MainLoopUF * EPI.MainLoopVF.getKnownMinValue();
unsigned EpilogueLoopStep =
EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
// We assume the remaining `Count` is equally distributed in
@@ -10288,8 +10291,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Second pass vectorizes the epilogue and adjusts the control flow
// edges from the first pass.
- EPI.MainLoopVF = EPI.EpilogueVF;
- EPI.MainLoopUF = EPI.EpilogueUF;
EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
ORE, EPI, &CM, BFI, PSI,
Checks, BestEpiPlan);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
index 027a88de14153..9987701e44cde 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
@@ -18,7 +18,7 @@ define void @_Z3foov(i64 %n) {
; CHECK-V1-IC1: [[VECTOR_PH]]:
; CHECK-V1-IC1: br label %[[VECTOR_BODY:.*]]
; CHECK-V1-IC1: [[VECTOR_BODY]]:
-; CHECK-V1-IC1: br i1 [[TMP10:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF0]], !llvm.loop [[LOOP1:![0-9]+]]
+; CHECK-V1-IC1: br i1 [[TMP8:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF0]], !llvm.loop [[LOOP1:![0-9]+]]
; CHECK-V1-IC1: [[MIDDLE_BLOCK]]:
; CHECK-V1-IC1: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]], !prof [[PROF4:![0-9]+]]
; CHECK-V1-IC1: [[SCALAR_PH]]:
@@ -34,13 +34,13 @@ define void @_Z3foov(i64 %n) {
; CHECK-V2-IC1: [[VECTOR_PH]]:
; CHECK-V2-IC1: br label %[[VECTOR_BODY:.*]]
; CHECK-V2-IC1: [[VECTOR_BODY]]:
-; CHECK-V2-IC1: br i1 [[TMP4:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-V2-IC1: br i1 [[TMP2:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK-V2-IC1: [[MIDDLE_BLOCK]]:
; CHECK-V2-IC1: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]], !prof [[PROF5:![0-9]+]]
; CHECK-V2-IC1: [[SCALAR_PH]]:
; CHECK-V2-IC1: br label %[[FOR_BODY:.*]]
; CHECK-V2-IC1: [[FOR_BODY]]:
-; CHECK-V2-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF5:![0-9]+]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-V2-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK-V2-IC1: [[FOR_COND_CLEANUP]]:
;
; CHECK-V2-IC4-LABEL: define void @_Z3foov(
@@ -52,7 +52,7 @@ define void @_Z3foov(i64 %n) {
; CHECK-V2-IC4: [[VECTOR_PH]]:
; CHECK-V2-IC4: br label %[[VECTOR_BODY:.*]]
; CHECK-V2-IC4: [[VECTOR_BODY]]:
-; CHECK-V2-IC4: br i1 [[TMP10:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-V2-IC4: br i1 [[TMP8:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK-V2-IC4: [[MIDDLE_BLOCK]]:
; CHECK-V2-IC4: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF5:![0-9]+]]
; CHECK-V2-IC4: [[VEC_EPILOG_ITER_CHECK]]:
@@ -60,7 +60,7 @@ define void @_Z3foov(i64 %n) {
; CHECK-V2-IC4: [[VEC_EPILOG_PH]]:
; CHECK-V2-IC4: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
; CHECK-V2-IC4: [[VEC_EPILOG_VECTOR_BODY]]:
-; CHECK-V2-IC4: br i1 [[TMP15:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-V2-IC4: br i1 [[TMP11:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK-V2-IC4: [[VEC_EPILOG_MIDDLE_BLOCK]]:
; CHECK-V2-IC4: br i1 [[CMP_N10:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF8:![0-9]+]]
; CHECK-V2-IC4: [[VEC_EPILOG_SCALAR_PH]]:
@@ -101,8 +101,9 @@ for.cond.cleanup: ; preds = %for.body
; CHECK-V2-IC1: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META4:![0-9]+]]}
; CHECK-V2-IC1: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK-V2-IC1: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-V2-IC1: [[PROF5]] = !{!"branch_weights", i32 0, i32 0}
-; CHECK-V2-IC1: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]], [[META3]]}
+; CHECK-V2-IC1: [[PROF5]] = !{!"branch_weights", i32 1, i32 3}
+; CHECK-V2-IC1: [[PROF6]] = !{!"branch_weights", i32 0, i32 0}
+; CHECK-V2-IC1: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]], [[META3]]}
;.
; CHECK-V2-IC4: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
; CHECK-V2-IC4: [[PROF1]] = !{!"branch_weights", i32 1, i32 63}
@@ -110,7 +111,7 @@ for.cond.cleanup: ; preds = %for.body
; CHECK-V2-IC4: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK-V2-IC4: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK-V2-IC4: [[PROF5]] = !{!"branch_weights", i32 1, i32 15}
-; CHECK-V2-IC4: [[PROF6]] = !{!"branch_weights", i32 4, i32 0}
+; CHECK-V2-IC4: [[PROF6]] = !{!"branch_weights", i32 4, i32 12}
; CHECK-V2-IC4: [[LOOP7]] = distinct !{[[LOOP7]], [[META3]], [[META4]]}
; CHECK-V2-IC4: [[PROF8]] = !{!"branch_weights", i32 1, i32 3}
; CHECK-V2-IC4: [[PROF9]] = !{!"branch_weights", i32 0, i32 0}
diff --git a/llvm/test/Transforms/LoopVectorize/branch-weights.ll b/llvm/test/Transforms/LoopVectorize/branch-weights.ll
index 6892709f085f7..7ae06953c5544 100644
--- a/llvm/test/Transforms/LoopVectorize/branch-weights.ll
+++ b/llvm/test/Transforms/LoopVectorize/branch-weights.ll
@@ -4,13 +4,6 @@
; RUN: opt < %s -S -passes=loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -enable-epilogue-vectorization \
; RUN: -epilogue-vectorization-force-VF=4 | FileCheck %s --check-prefix=MAINVF4IC2_EPI4
-; FIXME: For MAINVF4IC2_EPI4 the branch weights in the terminator of
-; the VEC_EPILOG_ITER_CHECK block should be [4,4] since we process 8
-; scalar iterations in the main loop, leaving the remaining count to
-; be in the range [0,7]. That gives a 4:4 chance of skipping the
-; vector epilogue. I believe the problem lies in
-; EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck
-; where the main loop VF is set to the same value as the epilogue VF.
define void @f0(i8 %n, i32 %len, ptr %p) !prof !0 {
; MAINVF4IC1_EPI4-LABEL: define void @f0(
; MAINVF4IC1_EPI4-SAME: i8 [[N:%.*]], i32 [[LEN:%.*]], ptr [[P:%.*]]) !prof [[PROF0:![0-9]+]] {
@@ -145,7 +138,7 @@ exit:
; MAINVF4IC2_EPI4: [[META5]] = !{!"llvm.loop.isvectorized", i32 1}
; MAINVF4IC2_EPI4: [[META6]] = !{!"llvm.loop.unroll.runtime.disable"}
; MAINVF4IC2_EPI4: [[PROF7]] = !{!"branch_weights", i32 1, i32 7}
-; MAINVF4IC2_EPI4: [[PROF8]] = !{!"branch_weights", i32 4, i32 0}
+; MAINVF4IC2_EPI4: [[PROF8]] = !{!"branch_weights", i32 4, i32 4}
; MAINVF4IC2_EPI4: [[PROF9]] = !{!"branch_weights", i32 0, i32 0}
; MAINVF4IC2_EPI4: [[LOOP10]] = distinct !{[[LOOP10]], [[META5]], [[META6]]}
; MAINVF4IC2_EPI4: [[PROF11]] = !{!"branch_weights", i32 1, i32 3}
More information about the llvm-commits
mailing list