[llvm] [LV] Improve accuracy of branch weights in epilogue iteration check block (PR #152980)

David Sherwood via llvm-commits llvm-commits at lists.llvm.org
Mon Aug 11 02:18:30 PDT 2025


https://github.com/david-arm created https://github.com/llvm/llvm-project/pull/152980

When one of the vector loops (main or epilogue) is scalable and the
other isn't, we can use the estimated value of vscale to improve the
accuracy.

>From 0d78afb023744b802fcc0122e9c2494dea9d64ee Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Mon, 11 Aug 2025 09:15:08 +0000
Subject: [PATCH 1/2] Add tests

---
 .../LoopVectorize/AArch64/check-prof-info.ll  | 193 +++++++++++++++++-
 1 file changed, 188 insertions(+), 5 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
index 9987701e44cde..7d7704ee77c5b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
@@ -1,5 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "br" --filter "^.*:" --version 5
 ; RUN: opt -passes="print<block-freq>,loop-vectorize" -mcpu=neoverse-v1 -force-vector-interleave=1 -S < %s |  FileCheck %s -check-prefix=CHECK-V1-IC1
+; RUN: opt -passes="print<block-freq>,loop-vectorize" -mcpu=neoverse-v1 -force-vector-interleave=1 \
+; RUN:   -epilogue-vectorization-force-VF=4 -S < %s |  FileCheck %s -check-prefix=CHECK-V1-IC1-FORCE-EPI4
 ; RUN: opt -passes="print<block-freq>,loop-vectorize" -mcpu=neoverse-v2 -force-vector-interleave=1 -S < %s |  FileCheck %s -check-prefix=CHECK-V2-IC1
 ; RUN: opt -passes="print<block-freq>,loop-vectorize" -mcpu=neoverse-v2 -force-vector-interleave=4 -S < %s |  FileCheck %s -check-prefix=CHECK-V2-IC4
 
@@ -10,8 +12,8 @@ target triple = "aarch64-unknown-linux-gnu"
 
 ; We expect the branch weight computations after vectorisation to use
 ; vscale=2 for neoverse-v1 and vscale=1 for neoverse-v2.
-define void @_Z3foov(i64 %n) {
-; CHECK-V1-IC1-LABEL: define void @_Z3foov(
+define void @foo_i32(i64 %n) vscale_range(1,16) {
+; CHECK-V1-IC1-LABEL: define void @foo_i32(
 ; CHECK-V1-IC1-SAME: i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-V1-IC1:  [[ENTRY:.*:]]
 ; CHECK-V1-IC1:    br i1 [[MIN_ITERS_CHECK:%.*]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0:![0-9]+]]
@@ -27,7 +29,33 @@ define void @_Z3foov(i64 %n) {
 ; CHECK-V1-IC1:    br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF5:![0-9]+]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK-V1-IC1:  [[FOR_COND_CLEANUP]]:
 ;
-; CHECK-V2-IC1-LABEL: define void @_Z3foov(
+; CHECK-V1-IC1-FORCE-EPI4-LABEL: define void @foo_i32(
+; CHECK-V1-IC1-FORCE-EPI4-SAME: i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-V1-IC1-FORCE-EPI4:  [[ITER_CHECK:.*:]]
+; CHECK-V1-IC1-FORCE-EPI4:    br i1 [[MIN_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]], !prof [[PROF0:![0-9]+]]
+; CHECK-V1-IC1-FORCE-EPI4:  [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-V1-IC1-FORCE-EPI4:    br i1 [[MIN_ITERS_CHECK1:%.*]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0]]
+; CHECK-V1-IC1-FORCE-EPI4:  [[VECTOR_PH]]:
+; CHECK-V1-IC1-FORCE-EPI4:    br label %[[VECTOR_BODY:.*]]
+; CHECK-V1-IC1-FORCE-EPI4:  [[VECTOR_BODY]]:
+; CHECK-V1-IC1-FORCE-EPI4:    br i1 [[TMP8:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF0]], !llvm.loop [[LOOP1:![0-9]+]]
+; CHECK-V1-IC1-FORCE-EPI4:  [[MIDDLE_BLOCK]]:
+; CHECK-V1-IC1-FORCE-EPI4:    br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF4:![0-9]+]]
+; CHECK-V1-IC1-FORCE-EPI4:  [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-V1-IC1-FORCE-EPI4:    br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF5:![0-9]+]]
+; CHECK-V1-IC1-FORCE-EPI4:  [[VEC_EPILOG_PH]]:
+; CHECK-V1-IC1-FORCE-EPI4:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK-V1-IC1-FORCE-EPI4:  [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-V1-IC1-FORCE-EPI4:    br i1 [[TMP11:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-V1-IC1-FORCE-EPI4:  [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-V1-IC1-FORCE-EPI4:    br i1 [[CMP_N7:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF7:![0-9]+]]
+; CHECK-V1-IC1-FORCE-EPI4:  [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-V1-IC1-FORCE-EPI4:    br label %[[FOR_BODY:.*]]
+; CHECK-V1-IC1-FORCE-EPI4:  [[FOR_BODY]]:
+; CHECK-V1-IC1-FORCE-EPI4:    br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF8:![0-9]+]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-V1-IC1-FORCE-EPI4:  [[FOR_COND_CLEANUP]]:
+;
+; CHECK-V2-IC1-LABEL: define void @foo_i32(
 ; CHECK-V2-IC1-SAME: i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-V2-IC1:  [[ENTRY:.*:]]
 ; CHECK-V2-IC1:    br i1 [[MIN_ITERS_CHECK:%.*]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0:![0-9]+]]
@@ -43,9 +71,9 @@ define void @_Z3foov(i64 %n) {
 ; CHECK-V2-IC1:    br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK-V2-IC1:  [[FOR_COND_CLEANUP]]:
 ;
-; CHECK-V2-IC4-LABEL: define void @_Z3foov(
+; CHECK-V2-IC4-LABEL: define void @foo_i32(
 ; CHECK-V2-IC4-SAME: i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-V2-IC4:  [[VEC_EPILOG_VECTOR_BODY1:.*:]]
+; CHECK-V2-IC4:  [[ITER_CHECK:.*:]]
 ; CHECK-V2-IC4:    br i1 [[MIN_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]], !prof [[PROF0:![0-9]+]]
 ; CHECK-V2-IC4:  [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
 ; CHECK-V2-IC4:    br i1 [[MIN_ITERS_CHECK1:%.*]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0]]
@@ -86,6 +114,128 @@ for.cond.cleanup:                                 ; preds = %for.body
   ret void
 }
 
+define void @foo_i8(i64 %n) vscale_range(1,16) {
+; CHECK-V1-IC1-LABEL: define void @foo_i8(
+; CHECK-V1-IC1-SAME: i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-V1-IC1:  [[ITER_CHECK:.*:]]
+; CHECK-V1-IC1:    br i1 [[MIN_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]], !prof [[PROF0]]
+; CHECK-V1-IC1:  [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-V1-IC1:    br i1 [[MIN_ITERS_CHECK1:%.*]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0]]
+; CHECK-V1-IC1:  [[VECTOR_PH]]:
+; CHECK-V1-IC1:    br label %[[VECTOR_BODY:.*]]
+; CHECK-V1-IC1:  [[VECTOR_BODY]]:
+; CHECK-V1-IC1:    br i1 [[TMP10:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF7:![0-9]+]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-V1-IC1:  [[MIDDLE_BLOCK]]:
+; CHECK-V1-IC1:    br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF7]]
+; CHECK-V1-IC1:  [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-V1-IC1:    br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF9:![0-9]+]]
+; CHECK-V1-IC1:  [[VEC_EPILOG_PH]]:
+; CHECK-V1-IC1:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK-V1-IC1:  [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-V1-IC1:    br i1 [[TMP19:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-V1-IC1:  [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-V1-IC1:    br i1 [[CMP_N7:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF11:![0-9]+]]
+; CHECK-V1-IC1:  [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-V1-IC1:    br label %[[FOR_BODY:.*]]
+; CHECK-V1-IC1:  [[FOR_BODY]]:
+; CHECK-V1-IC1:    br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF5]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-V1-IC1:  [[FOR_COND_CLEANUP]]:
+;
+; CHECK-V1-IC1-FORCE-EPI4-LABEL: define void @foo_i8(
+; CHECK-V1-IC1-FORCE-EPI4-SAME: i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-V1-IC1-FORCE-EPI4:  [[ITER_CHECK:.*:]]
+; CHECK-V1-IC1-FORCE-EPI4:    br i1 [[MIN_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]], !prof [[PROF0]]
+; CHECK-V1-IC1-FORCE-EPI4:  [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-V1-IC1-FORCE-EPI4:    br i1 [[MIN_ITERS_CHECK1:%.*]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0]]
+; CHECK-V1-IC1-FORCE-EPI4:  [[VECTOR_PH]]:
+; CHECK-V1-IC1-FORCE-EPI4:    br label %[[VECTOR_BODY:.*]]
+; CHECK-V1-IC1-FORCE-EPI4:  [[VECTOR_BODY]]:
+; CHECK-V1-IC1-FORCE-EPI4:    br i1 [[TMP8:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF10:![0-9]+]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-V1-IC1-FORCE-EPI4:  [[MIDDLE_BLOCK]]:
+; CHECK-V1-IC1-FORCE-EPI4:    br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF10]]
+; CHECK-V1-IC1-FORCE-EPI4:  [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-V1-IC1-FORCE-EPI4:    br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF12:![0-9]+]]
+; CHECK-V1-IC1-FORCE-EPI4:  [[VEC_EPILOG_PH]]:
+; CHECK-V1-IC1-FORCE-EPI4:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK-V1-IC1-FORCE-EPI4:  [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-V1-IC1-FORCE-EPI4:    br i1 [[TMP11:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-V1-IC1-FORCE-EPI4:  [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-V1-IC1-FORCE-EPI4:    br i1 [[CMP_N7:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF7]]
+; CHECK-V1-IC1-FORCE-EPI4:  [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-V1-IC1-FORCE-EPI4:    br label %[[FOR_BODY:.*]]
+; CHECK-V1-IC1-FORCE-EPI4:  [[FOR_BODY]]:
+; CHECK-V1-IC1-FORCE-EPI4:    br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF8]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-V1-IC1-FORCE-EPI4:  [[FOR_COND_CLEANUP]]:
+;
+; CHECK-V2-IC1-LABEL: define void @foo_i8(
+; CHECK-V2-IC1-SAME: i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-V2-IC1:  [[ITER_CHECK:.*:]]
+; CHECK-V2-IC1:    br i1 [[MIN_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]], !prof [[PROF0]]
+; CHECK-V2-IC1:  [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-V2-IC1:    br i1 [[MIN_ITERS_CHECK1:%.*]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0]]
+; CHECK-V2-IC1:  [[VECTOR_PH]]:
+; CHECK-V2-IC1:    br label %[[VECTOR_BODY:.*]]
+; CHECK-V2-IC1:  [[VECTOR_BODY]]:
+; CHECK-V2-IC1:    br i1 [[TMP4:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF8:![0-9]+]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-V2-IC1:  [[MIDDLE_BLOCK]]:
+; CHECK-V2-IC1:    br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF10:![0-9]+]]
+; CHECK-V2-IC1:  [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-V2-IC1:    br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF11:![0-9]+]]
+; CHECK-V2-IC1:  [[VEC_EPILOG_PH]]:
+; CHECK-V2-IC1:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK-V2-IC1:  [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-V2-IC1:    br i1 [[TMP13:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-V2-IC1:  [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-V2-IC1:    br i1 [[CMP_N7:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF5]]
+; CHECK-V2-IC1:  [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-V2-IC1:    br label %[[FOR_BODY:.*]]
+; CHECK-V2-IC1:  [[FOR_BODY]]:
+; CHECK-V2-IC1:    br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF6]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-V2-IC1:  [[FOR_COND_CLEANUP]]:
+;
+; CHECK-V2-IC4-LABEL: define void @foo_i8(
+; CHECK-V2-IC4-SAME: i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-V2-IC4:  [[ITER_CHECK:.*:]]
+; CHECK-V2-IC4:    br i1 [[MIN_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]], !prof [[PROF0]]
+; CHECK-V2-IC4:  [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-V2-IC4:    br i1 [[MIN_ITERS_CHECK1:%.*]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0]]
+; CHECK-V2-IC4:  [[VECTOR_PH]]:
+; CHECK-V2-IC4:    br label %[[VECTOR_BODY:.*]]
+; CHECK-V2-IC4:  [[VECTOR_BODY]]:
+; CHECK-V2-IC4:    br i1 [[TMP8:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF5]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-V2-IC4:  [[MIDDLE_BLOCK]]:
+; CHECK-V2-IC4:    br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF1]]
+; CHECK-V2-IC4:  [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-V2-IC4:    br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF12:![0-9]+]]
+; CHECK-V2-IC4:  [[VEC_EPILOG_PH]]:
+; CHECK-V2-IC4:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK-V2-IC4:  [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-V2-IC4:    br i1 [[TMP11:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-V2-IC4:  [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-V2-IC4:    br i1 [[CMP_N10:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF14:![0-9]+]]
+; CHECK-V2-IC4:  [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-V2-IC4:    br label %[[FOR_BODY:.*]]
+; CHECK-V2-IC4:  [[FOR_BODY]]:
+; CHECK-V2-IC4:    br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF9]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-V2-IC4:  [[FOR_COND_CLEANUP]]:
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i8], ptr @b, i64 0, i64 %iv
+  %load = load i8, ptr %arrayidx, align 1
+  %arrayidx2 = getelementptr inbounds [1024 x i8], ptr @a, i64 0, i64 %iv
+  store i8 %load, ptr %arrayidx2, align 1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !0
+
+for.cond.cleanup:
+  ret void
+}
+
 !0 = !{!"branch_weights", i32 1, i32 1023}
 ;.
 ; CHECK-V1-IC1: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
@@ -95,6 +245,28 @@ for.cond.cleanup:                                 ; preds = %for.body
 ; CHECK-V1-IC1: [[PROF4]] = !{!"branch_weights", i32 1, i32 7}
 ; CHECK-V1-IC1: [[PROF5]] = !{!"branch_weights", i32 0, i32 0}
 ; CHECK-V1-IC1: [[LOOP6]] = distinct !{[[LOOP6]], [[META3]], [[META2]]}
+; CHECK-V1-IC1: [[PROF7]] = !{!"branch_weights", i32 1, i32 31}
+; CHECK-V1-IC1: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META3]]}
+; CHECK-V1-IC1: [[PROF9]] = !{!"branch_weights", i32 8, i32 8}
+; CHECK-V1-IC1: [[LOOP10]] = distinct !{[[LOOP10]], [[META2]], [[META3]]}
+; CHECK-V1-IC1: [[PROF11]] = !{!"branch_weights", i32 1, i32 15}
+; CHECK-V1-IC1: [[LOOP12]] = distinct !{[[LOOP12]], [[META3]], [[META2]]}
+;.
+; CHECK-V1-IC1-FORCE-EPI4: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
+; CHECK-V1-IC1-FORCE-EPI4: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
+; CHECK-V1-IC1-FORCE-EPI4: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-V1-IC1-FORCE-EPI4: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-V1-IC1-FORCE-EPI4: [[PROF4]] = !{!"branch_weights", i32 1, i32 7}
+; CHECK-V1-IC1-FORCE-EPI4: [[PROF5]] = !{!"branch_weights", i32 4, i32 0}
+; CHECK-V1-IC1-FORCE-EPI4: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META3]]}
+; CHECK-V1-IC1-FORCE-EPI4: [[PROF7]] = !{!"branch_weights", i32 1, i32 3}
+; CHECK-V1-IC1-FORCE-EPI4: [[PROF8]] = !{!"branch_weights", i32 0, i32 0}
+; CHECK-V1-IC1-FORCE-EPI4: [[LOOP9]] = distinct !{[[LOOP9]], [[META3]], [[META2]]}
+; CHECK-V1-IC1-FORCE-EPI4: [[PROF10]] = !{!"branch_weights", i32 1, i32 31}
+; CHECK-V1-IC1-FORCE-EPI4: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META3]]}
+; CHECK-V1-IC1-FORCE-EPI4: [[PROF12]] = !{!"branch_weights", i32 4, i32 12}
+; CHECK-V1-IC1-FORCE-EPI4: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META3]]}
+; CHECK-V1-IC1-FORCE-EPI4: [[LOOP14]] = distinct !{[[LOOP14]], [[META3]], [[META2]]}
 ;.
 ; CHECK-V2-IC1: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
 ; CHECK-V2-IC1: [[PROF1]] = !{!"branch_weights", i32 1, i32 255}
@@ -104,6 +276,12 @@ for.cond.cleanup:                                 ; preds = %for.body
 ; CHECK-V2-IC1: [[PROF5]] = !{!"branch_weights", i32 1, i32 3}
 ; CHECK-V2-IC1: [[PROF6]] = !{!"branch_weights", i32 0, i32 0}
 ; CHECK-V2-IC1: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]], [[META3]]}
+; CHECK-V2-IC1: [[PROF8]] = !{!"branch_weights", i32 1, i32 63}
+; CHECK-V2-IC1: [[LOOP9]] = distinct !{[[LOOP9]], [[META3]], [[META4]]}
+; CHECK-V2-IC1: [[PROF10]] = !{!"branch_weights", i32 1, i32 15}
+; CHECK-V2-IC1: [[PROF11]] = !{!"branch_weights", i32 4, i32 12}
+; CHECK-V2-IC1: [[LOOP12]] = distinct !{[[LOOP12]], [[META3]], [[META4]]}
+; CHECK-V2-IC1: [[LOOP13]] = distinct !{[[LOOP13]], [[META4]], [[META3]]}
 ;.
 ; CHECK-V2-IC4: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
 ; CHECK-V2-IC4: [[PROF1]] = !{!"branch_weights", i32 1, i32 63}
@@ -116,4 +294,9 @@ for.cond.cleanup:                                 ; preds = %for.body
 ; CHECK-V2-IC4: [[PROF8]] = !{!"branch_weights", i32 1, i32 3}
 ; CHECK-V2-IC4: [[PROF9]] = !{!"branch_weights", i32 0, i32 0}
 ; CHECK-V2-IC4: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]], [[META3]]}
+; CHECK-V2-IC4: [[LOOP11]] = distinct !{[[LOOP11]], [[META3]], [[META4]]}
+; CHECK-V2-IC4: [[PROF12]] = !{!"branch_weights", i32 8, i32 56}
+; CHECK-V2-IC4: [[LOOP13]] = distinct !{[[LOOP13]], [[META3]], [[META4]]}
+; CHECK-V2-IC4: [[PROF14]] = !{!"branch_weights", i32 1, i32 7}
+; CHECK-V2-IC4: [[LOOP15]] = distinct !{[[LOOP15]], [[META4]], [[META3]]}
 ;.

>From 6b7ac6026204fcba2f3e728f0e2335a24cb223c2 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Mon, 11 Aug 2025 09:15:18 +0000
Subject: [PATCH 2/2] [LV] Improve accuracy of branch weights in epilogue
 iteration check block

When one of the vector loops (main or epilogue) is scalable and the
other isn't, we can use the estimated value of vscale to improve the
accuracy.
---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp        | 10 ++++++++++
 .../LoopVectorize/AArch64/check-prof-info.ll           |  4 ++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e5f223e5082b1..d19b8362c074b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7649,6 +7649,16 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
     unsigned MainLoopStep = EPI.MainLoopUF * EPI.MainLoopVF.getKnownMinValue();
     unsigned EpilogueLoopStep =
         EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
+    // When one of the vector loops is scalable and the other isn't, we can use
+    // the estimated value of vscale to improve the accuracy.
+    if (EPI.MainLoopVF.isScalable() != EPI.EpilogueVF.isScalable()) {
+      if (auto VScale = Cost->getVScaleForTuning()) {
+        if (EPI.MainLoopVF.isScalable())
+          MainLoopStep *= *VScale;
+        else
+          EpilogueLoopStep *= *VScale;
+      }
+    }
     // We assume the remaining `Count` is equally distributed in
     // [0, MainLoopStep)
     // So the probability for `Count < EpilogueLoopStep` should be
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
index 7d7704ee77c5b..a65bf60803cd6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
@@ -257,14 +257,14 @@ for.cond.cleanup:
 ; CHECK-V1-IC1-FORCE-EPI4: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
 ; CHECK-V1-IC1-FORCE-EPI4: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
 ; CHECK-V1-IC1-FORCE-EPI4: [[PROF4]] = !{!"branch_weights", i32 1, i32 7}
-; CHECK-V1-IC1-FORCE-EPI4: [[PROF5]] = !{!"branch_weights", i32 4, i32 0}
+; CHECK-V1-IC1-FORCE-EPI4: [[PROF5]] = !{!"branch_weights", i32 4, i32 4}
 ; CHECK-V1-IC1-FORCE-EPI4: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META3]]}
 ; CHECK-V1-IC1-FORCE-EPI4: [[PROF7]] = !{!"branch_weights", i32 1, i32 3}
 ; CHECK-V1-IC1-FORCE-EPI4: [[PROF8]] = !{!"branch_weights", i32 0, i32 0}
 ; CHECK-V1-IC1-FORCE-EPI4: [[LOOP9]] = distinct !{[[LOOP9]], [[META3]], [[META2]]}
 ; CHECK-V1-IC1-FORCE-EPI4: [[PROF10]] = !{!"branch_weights", i32 1, i32 31}
 ; CHECK-V1-IC1-FORCE-EPI4: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META3]]}
-; CHECK-V1-IC1-FORCE-EPI4: [[PROF12]] = !{!"branch_weights", i32 4, i32 12}
+; CHECK-V1-IC1-FORCE-EPI4: [[PROF12]] = !{!"branch_weights", i32 4, i32 28}
 ; CHECK-V1-IC1-FORCE-EPI4: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META3]]}
 ; CHECK-V1-IC1-FORCE-EPI4: [[LOOP14]] = distinct !{[[LOOP14]], [[META3]], [[META2]]}
 ;.



More information about the llvm-commits mailing list