[llvm] [LV] Improve accuracy of branch weights in epilogue iteration check block (PR #152980)
David Sherwood via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 11 02:18:30 PDT 2025
https://github.com/david-arm created https://github.com/llvm/llvm-project/pull/152980
When one of the vector loops (main or epilogue) is scalable and the
other isn't, we can use the estimated value of vscale to improve the
accuracy of the branch weights in the epilogue iteration check block.
From 0d78afb023744b802fcc0122e9c2494dea9d64ee Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Mon, 11 Aug 2025 09:15:08 +0000
Subject: [PATCH 1/2] Add tests
---
.../LoopVectorize/AArch64/check-prof-info.ll | 193 +++++++++++++++++-
1 file changed, 188 insertions(+), 5 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
index 9987701e44cde..7d7704ee77c5b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
@@ -1,5 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "br" --filter "^.*:" --version 5
; RUN: opt -passes="print<block-freq>,loop-vectorize" -mcpu=neoverse-v1 -force-vector-interleave=1 -S < %s | FileCheck %s -check-prefix=CHECK-V1-IC1
+; RUN: opt -passes="print<block-freq>,loop-vectorize" -mcpu=neoverse-v1 -force-vector-interleave=1 \
+; RUN: -epilogue-vectorization-force-VF=4 -S < %s | FileCheck %s -check-prefix=CHECK-V1-IC1-FORCE-EPI4
; RUN: opt -passes="print<block-freq>,loop-vectorize" -mcpu=neoverse-v2 -force-vector-interleave=1 -S < %s | FileCheck %s -check-prefix=CHECK-V2-IC1
; RUN: opt -passes="print<block-freq>,loop-vectorize" -mcpu=neoverse-v2 -force-vector-interleave=4 -S < %s | FileCheck %s -check-prefix=CHECK-V2-IC4
@@ -10,8 +12,8 @@ target triple = "aarch64-unknown-linux-gnu"
; We expect the branch weight computations after vectorisation to use
; vscale=2 for neoverse-v1 and vscale=1 for neoverse-v2.
-define void @_Z3foov(i64 %n) {
-; CHECK-V1-IC1-LABEL: define void @_Z3foov(
+define void @foo_i32(i64 %n) vscale_range(1,16) {
+; CHECK-V1-IC1-LABEL: define void @foo_i32(
; CHECK-V1-IC1-SAME: i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-V1-IC1: [[ENTRY:.*:]]
; CHECK-V1-IC1: br i1 [[MIN_ITERS_CHECK:%.*]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0:![0-9]+]]
@@ -27,7 +29,33 @@ define void @_Z3foov(i64 %n) {
; CHECK-V1-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF5:![0-9]+]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-V1-IC1: [[FOR_COND_CLEANUP]]:
;
-; CHECK-V2-IC1-LABEL: define void @_Z3foov(
+; CHECK-V1-IC1-FORCE-EPI4-LABEL: define void @foo_i32(
+; CHECK-V1-IC1-FORCE-EPI4-SAME: i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-V1-IC1-FORCE-EPI4: [[ITER_CHECK:.*:]]
+; CHECK-V1-IC1-FORCE-EPI4: br i1 [[MIN_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]], !prof [[PROF0:![0-9]+]]
+; CHECK-V1-IC1-FORCE-EPI4: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-V1-IC1-FORCE-EPI4: br i1 [[MIN_ITERS_CHECK1:%.*]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0]]
+; CHECK-V1-IC1-FORCE-EPI4: [[VECTOR_PH]]:
+; CHECK-V1-IC1-FORCE-EPI4: br label %[[VECTOR_BODY:.*]]
+; CHECK-V1-IC1-FORCE-EPI4: [[VECTOR_BODY]]:
+; CHECK-V1-IC1-FORCE-EPI4: br i1 [[TMP8:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF0]], !llvm.loop [[LOOP1:![0-9]+]]
+; CHECK-V1-IC1-FORCE-EPI4: [[MIDDLE_BLOCK]]:
+; CHECK-V1-IC1-FORCE-EPI4: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF4:![0-9]+]]
+; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-V1-IC1-FORCE-EPI4: br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF5:![0-9]+]]
+; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_PH]]:
+; CHECK-V1-IC1-FORCE-EPI4: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-V1-IC1-FORCE-EPI4: br i1 [[TMP11:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-V1-IC1-FORCE-EPI4: br i1 [[CMP_N7:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF7:![0-9]+]]
+; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-V1-IC1-FORCE-EPI4: br label %[[FOR_BODY:.*]]
+; CHECK-V1-IC1-FORCE-EPI4: [[FOR_BODY]]:
+; CHECK-V1-IC1-FORCE-EPI4: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF8:![0-9]+]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-V1-IC1-FORCE-EPI4: [[FOR_COND_CLEANUP]]:
+;
+; CHECK-V2-IC1-LABEL: define void @foo_i32(
; CHECK-V2-IC1-SAME: i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-V2-IC1: [[ENTRY:.*:]]
; CHECK-V2-IC1: br i1 [[MIN_ITERS_CHECK:%.*]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0:![0-9]+]]
@@ -43,9 +71,9 @@ define void @_Z3foov(i64 %n) {
; CHECK-V2-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK-V2-IC1: [[FOR_COND_CLEANUP]]:
;
-; CHECK-V2-IC4-LABEL: define void @_Z3foov(
+; CHECK-V2-IC4-LABEL: define void @foo_i32(
; CHECK-V2-IC4-SAME: i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-V2-IC4: [[VEC_EPILOG_VECTOR_BODY1:.*:]]
+; CHECK-V2-IC4: [[ITER_CHECK:.*:]]
; CHECK-V2-IC4: br i1 [[MIN_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]], !prof [[PROF0:![0-9]+]]
; CHECK-V2-IC4: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
; CHECK-V2-IC4: br i1 [[MIN_ITERS_CHECK1:%.*]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0]]
@@ -86,6 +114,128 @@ for.cond.cleanup: ; preds = %for.body
ret void
}
+define void @foo_i8(i64 %n) vscale_range(1,16) {
+; CHECK-V1-IC1-LABEL: define void @foo_i8(
+; CHECK-V1-IC1-SAME: i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-V1-IC1: [[ITER_CHECK:.*:]]
+; CHECK-V1-IC1: br i1 [[MIN_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]], !prof [[PROF0]]
+; CHECK-V1-IC1: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-V1-IC1: br i1 [[MIN_ITERS_CHECK1:%.*]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0]]
+; CHECK-V1-IC1: [[VECTOR_PH]]:
+; CHECK-V1-IC1: br label %[[VECTOR_BODY:.*]]
+; CHECK-V1-IC1: [[VECTOR_BODY]]:
+; CHECK-V1-IC1: br i1 [[TMP10:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF7:![0-9]+]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-V1-IC1: [[MIDDLE_BLOCK]]:
+; CHECK-V1-IC1: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF7]]
+; CHECK-V1-IC1: [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-V1-IC1: br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF9:![0-9]+]]
+; CHECK-V1-IC1: [[VEC_EPILOG_PH]]:
+; CHECK-V1-IC1: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK-V1-IC1: [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-V1-IC1: br i1 [[TMP19:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-V1-IC1: [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-V1-IC1: br i1 [[CMP_N7:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF11:![0-9]+]]
+; CHECK-V1-IC1: [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-V1-IC1: br label %[[FOR_BODY:.*]]
+; CHECK-V1-IC1: [[FOR_BODY]]:
+; CHECK-V1-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF5]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-V1-IC1: [[FOR_COND_CLEANUP]]:
+;
+; CHECK-V1-IC1-FORCE-EPI4-LABEL: define void @foo_i8(
+; CHECK-V1-IC1-FORCE-EPI4-SAME: i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-V1-IC1-FORCE-EPI4: [[ITER_CHECK:.*:]]
+; CHECK-V1-IC1-FORCE-EPI4: br i1 [[MIN_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]], !prof [[PROF0]]
+; CHECK-V1-IC1-FORCE-EPI4: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-V1-IC1-FORCE-EPI4: br i1 [[MIN_ITERS_CHECK1:%.*]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0]]
+; CHECK-V1-IC1-FORCE-EPI4: [[VECTOR_PH]]:
+; CHECK-V1-IC1-FORCE-EPI4: br label %[[VECTOR_BODY:.*]]
+; CHECK-V1-IC1-FORCE-EPI4: [[VECTOR_BODY]]:
+; CHECK-V1-IC1-FORCE-EPI4: br i1 [[TMP8:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF10:![0-9]+]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-V1-IC1-FORCE-EPI4: [[MIDDLE_BLOCK]]:
+; CHECK-V1-IC1-FORCE-EPI4: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF10]]
+; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-V1-IC1-FORCE-EPI4: br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF12:![0-9]+]]
+; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_PH]]:
+; CHECK-V1-IC1-FORCE-EPI4: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-V1-IC1-FORCE-EPI4: br i1 [[TMP11:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-V1-IC1-FORCE-EPI4: br i1 [[CMP_N7:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF7]]
+; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-V1-IC1-FORCE-EPI4: br label %[[FOR_BODY:.*]]
+; CHECK-V1-IC1-FORCE-EPI4: [[FOR_BODY]]:
+; CHECK-V1-IC1-FORCE-EPI4: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF8]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-V1-IC1-FORCE-EPI4: [[FOR_COND_CLEANUP]]:
+;
+; CHECK-V2-IC1-LABEL: define void @foo_i8(
+; CHECK-V2-IC1-SAME: i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-V2-IC1: [[ITER_CHECK:.*:]]
+; CHECK-V2-IC1: br i1 [[MIN_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]], !prof [[PROF0]]
+; CHECK-V2-IC1: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-V2-IC1: br i1 [[MIN_ITERS_CHECK1:%.*]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0]]
+; CHECK-V2-IC1: [[VECTOR_PH]]:
+; CHECK-V2-IC1: br label %[[VECTOR_BODY:.*]]
+; CHECK-V2-IC1: [[VECTOR_BODY]]:
+; CHECK-V2-IC1: br i1 [[TMP4:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF8:![0-9]+]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-V2-IC1: [[MIDDLE_BLOCK]]:
+; CHECK-V2-IC1: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF10:![0-9]+]]
+; CHECK-V2-IC1: [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-V2-IC1: br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF11:![0-9]+]]
+; CHECK-V2-IC1: [[VEC_EPILOG_PH]]:
+; CHECK-V2-IC1: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK-V2-IC1: [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-V2-IC1: br i1 [[TMP13:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-V2-IC1: [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-V2-IC1: br i1 [[CMP_N7:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF5]]
+; CHECK-V2-IC1: [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-V2-IC1: br label %[[FOR_BODY:.*]]
+; CHECK-V2-IC1: [[FOR_BODY]]:
+; CHECK-V2-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF6]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-V2-IC1: [[FOR_COND_CLEANUP]]:
+;
+; CHECK-V2-IC4-LABEL: define void @foo_i8(
+; CHECK-V2-IC4-SAME: i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-V2-IC4: [[ITER_CHECK:.*:]]
+; CHECK-V2-IC4: br i1 [[MIN_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]], !prof [[PROF0]]
+; CHECK-V2-IC4: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-V2-IC4: br i1 [[MIN_ITERS_CHECK1:%.*]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0]]
+; CHECK-V2-IC4: [[VECTOR_PH]]:
+; CHECK-V2-IC4: br label %[[VECTOR_BODY:.*]]
+; CHECK-V2-IC4: [[VECTOR_BODY]]:
+; CHECK-V2-IC4: br i1 [[TMP8:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF5]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-V2-IC4: [[MIDDLE_BLOCK]]:
+; CHECK-V2-IC4: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF1]]
+; CHECK-V2-IC4: [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-V2-IC4: br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF12:![0-9]+]]
+; CHECK-V2-IC4: [[VEC_EPILOG_PH]]:
+; CHECK-V2-IC4: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK-V2-IC4: [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-V2-IC4: br i1 [[TMP11:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-V2-IC4: [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-V2-IC4: br i1 [[CMP_N10:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF14:![0-9]+]]
+; CHECK-V2-IC4: [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-V2-IC4: br label %[[FOR_BODY:.*]]
+; CHECK-V2-IC4: [[FOR_BODY]]:
+; CHECK-V2-IC4: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF9]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-V2-IC4: [[FOR_COND_CLEANUP]]:
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x i8], ptr @b, i64 0, i64 %iv
+ %load = load i8, ptr %arrayidx, align 1
+ %arrayidx2 = getelementptr inbounds [1024 x i8], ptr @a, i64 0, i64 %iv
+ store i8 %load, ptr %arrayidx2, align 1
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, %n
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !0
+
+for.cond.cleanup:
+ ret void
+}
+
!0 = !{!"branch_weights", i32 1, i32 1023}
;.
; CHECK-V1-IC1: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
@@ -95,6 +245,28 @@ for.cond.cleanup: ; preds = %for.body
; CHECK-V1-IC1: [[PROF4]] = !{!"branch_weights", i32 1, i32 7}
; CHECK-V1-IC1: [[PROF5]] = !{!"branch_weights", i32 0, i32 0}
; CHECK-V1-IC1: [[LOOP6]] = distinct !{[[LOOP6]], [[META3]], [[META2]]}
+; CHECK-V1-IC1: [[PROF7]] = !{!"branch_weights", i32 1, i32 31}
+; CHECK-V1-IC1: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META3]]}
+; CHECK-V1-IC1: [[PROF9]] = !{!"branch_weights", i32 8, i32 8}
+; CHECK-V1-IC1: [[LOOP10]] = distinct !{[[LOOP10]], [[META2]], [[META3]]}
+; CHECK-V1-IC1: [[PROF11]] = !{!"branch_weights", i32 1, i32 15}
+; CHECK-V1-IC1: [[LOOP12]] = distinct !{[[LOOP12]], [[META3]], [[META2]]}
+;.
+; CHECK-V1-IC1-FORCE-EPI4: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
+; CHECK-V1-IC1-FORCE-EPI4: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
+; CHECK-V1-IC1-FORCE-EPI4: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-V1-IC1-FORCE-EPI4: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-V1-IC1-FORCE-EPI4: [[PROF4]] = !{!"branch_weights", i32 1, i32 7}
+; CHECK-V1-IC1-FORCE-EPI4: [[PROF5]] = !{!"branch_weights", i32 4, i32 0}
+; CHECK-V1-IC1-FORCE-EPI4: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META3]]}
+; CHECK-V1-IC1-FORCE-EPI4: [[PROF7]] = !{!"branch_weights", i32 1, i32 3}
+; CHECK-V1-IC1-FORCE-EPI4: [[PROF8]] = !{!"branch_weights", i32 0, i32 0}
+; CHECK-V1-IC1-FORCE-EPI4: [[LOOP9]] = distinct !{[[LOOP9]], [[META3]], [[META2]]}
+; CHECK-V1-IC1-FORCE-EPI4: [[PROF10]] = !{!"branch_weights", i32 1, i32 31}
+; CHECK-V1-IC1-FORCE-EPI4: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META3]]}
+; CHECK-V1-IC1-FORCE-EPI4: [[PROF12]] = !{!"branch_weights", i32 4, i32 12}
+; CHECK-V1-IC1-FORCE-EPI4: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META3]]}
+; CHECK-V1-IC1-FORCE-EPI4: [[LOOP14]] = distinct !{[[LOOP14]], [[META3]], [[META2]]}
;.
; CHECK-V2-IC1: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
; CHECK-V2-IC1: [[PROF1]] = !{!"branch_weights", i32 1, i32 255}
@@ -104,6 +276,12 @@ for.cond.cleanup: ; preds = %for.body
; CHECK-V2-IC1: [[PROF5]] = !{!"branch_weights", i32 1, i32 3}
; CHECK-V2-IC1: [[PROF6]] = !{!"branch_weights", i32 0, i32 0}
; CHECK-V2-IC1: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]], [[META3]]}
+; CHECK-V2-IC1: [[PROF8]] = !{!"branch_weights", i32 1, i32 63}
+; CHECK-V2-IC1: [[LOOP9]] = distinct !{[[LOOP9]], [[META3]], [[META4]]}
+; CHECK-V2-IC1: [[PROF10]] = !{!"branch_weights", i32 1, i32 15}
+; CHECK-V2-IC1: [[PROF11]] = !{!"branch_weights", i32 4, i32 12}
+; CHECK-V2-IC1: [[LOOP12]] = distinct !{[[LOOP12]], [[META3]], [[META4]]}
+; CHECK-V2-IC1: [[LOOP13]] = distinct !{[[LOOP13]], [[META4]], [[META3]]}
;.
; CHECK-V2-IC4: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
; CHECK-V2-IC4: [[PROF1]] = !{!"branch_weights", i32 1, i32 63}
@@ -116,4 +294,9 @@ for.cond.cleanup: ; preds = %for.body
; CHECK-V2-IC4: [[PROF8]] = !{!"branch_weights", i32 1, i32 3}
; CHECK-V2-IC4: [[PROF9]] = !{!"branch_weights", i32 0, i32 0}
; CHECK-V2-IC4: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]], [[META3]]}
+; CHECK-V2-IC4: [[LOOP11]] = distinct !{[[LOOP11]], [[META3]], [[META4]]}
+; CHECK-V2-IC4: [[PROF12]] = !{!"branch_weights", i32 8, i32 56}
+; CHECK-V2-IC4: [[LOOP13]] = distinct !{[[LOOP13]], [[META3]], [[META4]]}
+; CHECK-V2-IC4: [[PROF14]] = !{!"branch_weights", i32 1, i32 7}
+; CHECK-V2-IC4: [[LOOP15]] = distinct !{[[LOOP15]], [[META4]], [[META3]]}
;.
From 6b7ac6026204fcba2f3e728f0e2335a24cb223c2 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Mon, 11 Aug 2025 09:15:18 +0000
Subject: [PATCH 2/2] [LV] Improve accuracy of branch weights in epilogue
iteration check block
When one of the vector loops (main or epilogue) is scalable and the
other isn't, we can use the estimated value of vscale to improve the
accuracy.
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 10 ++++++++++
.../LoopVectorize/AArch64/check-prof-info.ll | 4 ++--
2 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e5f223e5082b1..d19b8362c074b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7649,6 +7649,16 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
unsigned MainLoopStep = EPI.MainLoopUF * EPI.MainLoopVF.getKnownMinValue();
unsigned EpilogueLoopStep =
EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
+ // When one of the vector loops is scalable and the other isn't, we can use
+ // the estimated value of vscale to improve the accuracy.
+ if (EPI.MainLoopVF.isScalable() != EPI.EpilogueVF.isScalable()) {
+ if (auto VScale = Cost->getVScaleForTuning()) {
+ if (EPI.MainLoopVF.isScalable())
+ MainLoopStep *= *VScale;
+ else
+ EpilogueLoopStep *= *VScale;
+ }
+ }
// We assume the remaining `Count` is equally distributed in
// [0, MainLoopStep)
// So the probability for `Count < EpilogueLoopStep` should be
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
index 7d7704ee77c5b..a65bf60803cd6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
@@ -257,14 +257,14 @@ for.cond.cleanup:
; CHECK-V1-IC1-FORCE-EPI4: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK-V1-IC1-FORCE-EPI4: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK-V1-IC1-FORCE-EPI4: [[PROF4]] = !{!"branch_weights", i32 1, i32 7}
-; CHECK-V1-IC1-FORCE-EPI4: [[PROF5]] = !{!"branch_weights", i32 4, i32 0}
+; CHECK-V1-IC1-FORCE-EPI4: [[PROF5]] = !{!"branch_weights", i32 4, i32 4}
; CHECK-V1-IC1-FORCE-EPI4: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META3]]}
; CHECK-V1-IC1-FORCE-EPI4: [[PROF7]] = !{!"branch_weights", i32 1, i32 3}
; CHECK-V1-IC1-FORCE-EPI4: [[PROF8]] = !{!"branch_weights", i32 0, i32 0}
; CHECK-V1-IC1-FORCE-EPI4: [[LOOP9]] = distinct !{[[LOOP9]], [[META3]], [[META2]]}
; CHECK-V1-IC1-FORCE-EPI4: [[PROF10]] = !{!"branch_weights", i32 1, i32 31}
; CHECK-V1-IC1-FORCE-EPI4: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META3]]}
-; CHECK-V1-IC1-FORCE-EPI4: [[PROF12]] = !{!"branch_weights", i32 4, i32 12}
+; CHECK-V1-IC1-FORCE-EPI4: [[PROF12]] = !{!"branch_weights", i32 4, i32 28}
; CHECK-V1-IC1-FORCE-EPI4: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META3]]}
; CHECK-V1-IC1-FORCE-EPI4: [[LOOP14]] = distinct !{[[LOOP14]], [[META3]], [[META2]]}
;.
More information about the llvm-commits
mailing list