[llvm] a75e062 - [LV] Use vscale for tuning when updating profile information (#143690)

Mon Jun 16 02:02:41 PDT 2025

Author: David Sherwood
Date: 2025-06-16T10:02:38+01:00
New Revision: a75e0627f97ccc36ec222a53c6a1106157a380ac

URL: https://github.com/llvm/llvm-project/commit/a75e0627f97ccc36ec222a53c6a1106157a380ac
DIFF: https://github.com/llvm/llvm-project/commit/a75e0627f97ccc36ec222a53c6a1106157a380ac.diff

LOG: [LV] Use vscale for tuning when updating profile information (#143690)

In fixVectorizedLoop we call setProfileInfoAfterUnrolling to update the
profile information after vectorising, however for scalable VFs we
pessimistically assume vscale=1. We can improve upon this by using the
value of vscale used for tuning, e.g. when targeting neoverse-v1 the
expected value is 2.

Added: 
    llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/test/Transforms/LoopVectorize/check-prof-info.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index bdbfecd962443..bd0a2ec3986d3 100644

--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2691,6 +2691,20 @@ static void cse(BasicBlock *BB) {
   }
 }
 
+/// This function attempts to return a value that represents the vectorization
+/// factor at runtime. For fixed-width VFs we know this precisely at compile
+/// time, but for scalable VFs we calculate it based on an estimate of the
+/// vscale value.
+static unsigned getEstimatedRuntimeVF(ElementCount VF,
+                                      std::optional<unsigned> VScale) {
+  unsigned EstimatedVF = VF.getKnownMinValue();
+  if (VF.isScalable())
+    if (VScale)
+      EstimatedVF *= *VScale;
+  assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
+  return EstimatedVF;
+}
+
 InstructionCost
 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
                                               ElementCount VF) const {
@@ -2790,10 +2804,11 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
   //
   // For scalable vectorization we can't know at compile time how many
   // iterations of the loop are handled in one vector iteration, so instead
-  // assume a pessimistic vscale of '1'.
+  // use the value of vscale used for tuning.
   Loop *VectorLoop = LI->getLoopFor(HeaderBB);
-  setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop,
-                               VF.getKnownMinValue() * UF);
+  unsigned EstimatedVFxUF =
+      getEstimatedRuntimeVF(VF * UF, Cost->getVScaleForTuning());
+  setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF);
 }
 
 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
@@ -4031,20 +4046,6 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
   return MaxVF;
 }
 
-/// This function attempts to return a value that represents the vectorization
-/// factor at runtime. For fixed-width VFs we know this precisely at compile
-/// time, but for scalable VFs we calculate it based on an estimate of the
-/// vscale value.
-static unsigned getEstimatedRuntimeVF(ElementCount VF,
-                                      std::optional<unsigned> VScale) {
-  unsigned EstimatedVF = VF.getKnownMinValue();
-  if (VF.isScalable())
-    if (VScale)
-      EstimatedVF *= *VScale;
-  assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
-  return EstimatedVF;
-}
-
 bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
                                                 const VectorizationFactor &B,
                                                 const unsigned MaxTripCount,

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
new file mode 100644
index 0000000000000..9435c544fc812
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "br" --filter "^.*:" --version 5
+; RUN: opt -passes="print<block-freq>,loop-vectorize" -mcpu=neoverse-v1 -force-vector-interleave=1 -S < %s |  FileCheck %s -check-prefix=CHECK-V1-IC1
+; RUN: opt -passes="print<block-freq>,loop-vectorize" -mcpu=neoverse-v2 -force-vector-interleave=1 -S < %s |  FileCheck %s -check-prefix=CHECK-V2-IC1
+; RUN: opt -passes="print<block-freq>,loop-vectorize" -mcpu=neoverse-v2 -force-vector-interleave=4 -S < %s |  FileCheck %s -check-prefix=CHECK-V2-IC4
+
+target triple = "aarch64-unknown-linux-gnu"
+
+@a = global [1024 x i32] zeroinitializer, align 16
+@b = global [1024 x i32] zeroinitializer, align 16
+
+; We expect the branch weight computations after vectorisation to use
+; vscale=2 for neoverse-v1 and vscale=1 for neoverse-v2.
+define void @_Z3foov() {
+; CHECK-V1-IC1-LABEL: define void @_Z3foov(
+; CHECK-V1-IC1-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-V1-IC1:  [[ENTRY:.*:]]
+; CHECK-V1-IC1:    br i1 [[MIN_ITERS_CHECK:%.*]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0:![0-9]+]]
+; CHECK-V1-IC1:  [[VECTOR_PH]]:
+; CHECK-V1-IC1:    br label %[[VECTOR_BODY:.*]]
+; CHECK-V1-IC1:  [[VECTOR_BODY]]:
+; CHECK-V1-IC1:    br i1 [[TMP10:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF0]], !llvm.loop [[LOOP1:![0-9]+]]
+; CHECK-V1-IC1:  [[MIDDLE_BLOCK]]:
+; CHECK-V1-IC1:    br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]], !prof [[PROF4:![0-9]+]]
+; CHECK-V1-IC1:  [[SCALAR_PH]]:
+; CHECK-V1-IC1:    br label %[[FOR_BODY:.*]]
+; CHECK-V1-IC1:  [[FOR_BODY]]:
+; CHECK-V1-IC1:    br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF5:![0-9]+]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-V1-IC1:  [[FOR_COND_CLEANUP]]:
+;
+; CHECK-V2-IC1-LABEL: define void @_Z3foov(
+; CHECK-V2-IC1-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-V2-IC1:  [[ENTRY:.*:]]
+; CHECK-V2-IC1:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0:![0-9]+]]
+; CHECK-V2-IC1:  [[VECTOR_PH]]:
+; CHECK-V2-IC1:    br label %[[VECTOR_BODY:.*]]
+; CHECK-V2-IC1:  [[VECTOR_BODY]]:
+; CHECK-V2-IC1:    br i1 [[TMP4:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-V2-IC1:  [[MIDDLE_BLOCK]]:
+; CHECK-V2-IC1:    br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]], !prof [[PROF5:![0-9]+]]
+; CHECK-V2-IC1:  [[SCALAR_PH]]:
+; CHECK-V2-IC1:    br label %[[FOR_BODY:.*]]
+; CHECK-V2-IC1:  [[FOR_BODY]]:
+; CHECK-V2-IC1:    br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-V2-IC1:  [[FOR_COND_CLEANUP]]:
+;
+; CHECK-V2-IC4-LABEL: define void @_Z3foov(
+; CHECK-V2-IC4-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-V2-IC4:  [[VEC_EPILOG_VECTOR_BODY1:.*:]]
+; CHECK-V2-IC4:    br i1 [[MIN_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]], !prof [[PROF0:![0-9]+]]
+; CHECK-V2-IC4:  [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-V2-IC4:    br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0]]
+; CHECK-V2-IC4:  [[VECTOR_PH]]:
+; CHECK-V2-IC4:    br label %[[VECTOR_BODY:.*]]
+; CHECK-V2-IC4:  [[VECTOR_BODY]]:
+; CHECK-V2-IC4:    br i1 [[TMP12:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-V2-IC4:  [[MIDDLE_BLOCK]]:
+; CHECK-V2-IC4:    br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF5:![0-9]+]]
+; CHECK-V2-IC4:  [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-V2-IC4:    br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF6:![0-9]+]]
+; CHECK-V2-IC4:  [[VEC_EPILOG_PH]]:
+; CHECK-V2-IC4:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK-V2-IC4:  [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-V2-IC4:    br i1 [[TMP23:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-V2-IC4:  [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-V2-IC4:    br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF8:![0-9]+]]
+; CHECK-V2-IC4:  [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-V2-IC4:    br label %[[FOR_BODY:.*]]
+; CHECK-V2-IC4:  [[FOR_BODY]]:
+; CHECK-V2-IC4:    br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-V2-IC4:  [[FOR_COND_CLEANUP]]:
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], ptr @b, i64 0, i64 %iv
+  %load = load i32, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds [1024 x i32], ptr @a, i64 0, i64 %iv
+  store i32 %load, ptr %arrayidx2, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !0
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+}
+
+!0 = !{!"branch_weights", i32 1, i32 1023}
+;.
+; CHECK-V1-IC1: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
+; CHECK-V1-IC1: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
+; CHECK-V1-IC1: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-V1-IC1: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-V1-IC1: [[PROF4]] = !{!"branch_weights", i32 1, i32 3}
+; CHECK-V1-IC1: [[PROF5]] = !{!"branch_weights", i32 0, i32 0}
+; CHECK-V1-IC1: [[LOOP6]] = distinct !{[[LOOP6]], [[META3]], [[META2]]}
+;.
+; CHECK-V2-IC1: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
+; CHECK-V2-IC1: [[PROF1]] = !{!"branch_weights", i32 1, i32 255}
+; CHECK-V2-IC1: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META4:![0-9]+]]}
+; CHECK-V2-IC1: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-V2-IC1: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-V2-IC1: [[PROF5]] = !{!"branch_weights", i32 1, i32 3}
+; CHECK-V2-IC1: [[PROF6]] = !{!"branch_weights", i32 0, i32 0}
+; CHECK-V2-IC1: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]], [[META3]]}
+;.
+; CHECK-V2-IC4: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
+; CHECK-V2-IC4: [[PROF1]] = !{!"branch_weights", i32 1, i32 63}
+; CHECK-V2-IC4: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META4:![0-9]+]]}
+; CHECK-V2-IC4: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-V2-IC4: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-V2-IC4: [[PROF5]] = !{!"branch_weights", i32 1, i32 15}
+; CHECK-V2-IC4: [[PROF6]] = !{!"branch_weights", i32 2, i32 0}
+; CHECK-V2-IC4: [[LOOP7]] = distinct !{[[LOOP7]], [[META3]], [[META4]]}
+; CHECK-V2-IC4: [[PROF8]] = !{!"branch_weights", i32 1, i32 1}
+; CHECK-V2-IC4: [[PROF9]] = !{!"branch_weights", i32 0, i32 0}
+; CHECK-V2-IC4: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]], [[META3]]}
+;.

diff  --git a/llvm/test/Transforms/LoopVectorize/check-prof-info.ll b/llvm/test/Transforms/LoopVectorize/check-prof-info.ll
index 87c1ccb702277..40741941d4b02 100644
--- a/llvm/test/Transforms/LoopVectorize/check-prof-info.ll
+++ b/llvm/test/Transforms/LoopVectorize/check-prof-info.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "br" --filter "^.*:"
 ; RUN: opt -passes="print<block-freq>,loop-vectorize" -force-vector-width=4 -force-vector-interleave=1 -S < %s |  FileCheck %s
 ; RUN: opt -passes="print<block-freq>,loop-vectorize" -force-vector-width=4 -force-vector-interleave=4 -S < %s |  FileCheck %s -check-prefix=CHECK-MASKED
+; RUN: opt -passes="print<block-freq>,loop-vectorize" -force-vector-width=4 -force-vector-interleave=1 \
+; RUN:   -scalable-vectorization=on -force-target-supports-scalable-vectors -S < %s |  FileCheck %s -check-prefix=CHECK-SCALABLE
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -39,6 +41,21 @@ define void @_Z3foov() {
 ; CHECK-MASKED:  for.body:
 ; CHECK-MASKED:    br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
 ;
+; CHECK-SCALABLE-LABEL: @_Z3foov(
+; CHECK-SCALABLE:  entry:
+; CHECK-SCALABLE:    br i1 [[MIN_ITERS_CHECK:%.*]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK-SCALABLE:  vector.ph:
+; CHECK-SCALABLE:    br label [[VECTOR_BODY:%.*]]
+; CHECK-SCALABLE:  vector.body:
+; CHECK-SCALABLE:    br i1 [[TMP16:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-SCALABLE:  middle.block:
+; CHECK-SCALABLE:    br i1 [[CMP_N:%.*]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]], !prof [[PROF5:![0-9]+]]
+; CHECK-SCALABLE:  scalar.ph:
+; CHECK-SCALABLE:    br label [[FOR_BODY:%.*]]
+; CHECK-SCALABLE:  for.cond.cleanup:
+; CHECK-SCALABLE:  for.body:
+; CHECK-SCALABLE:    br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
+;
 entry:
   br label %for.body
 
@@ -92,6 +109,21 @@ define void @_Z3foo2v() {
 ; CHECK-MASKED:  for.body:
 ; CHECK-MASKED:    br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]]
 ;
+; CHECK-SCALABLE-LABEL: @_Z3foo2v(
+; CHECK-SCALABLE:  entry:
+; CHECK-SCALABLE:    br i1 [[MIN_ITERS_CHECK:%.*]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF0]]
+; CHECK-SCALABLE:  vector.ph:
+; CHECK-SCALABLE:    br label [[VECTOR_BODY:%.*]]
+; CHECK-SCALABLE:  vector.body:
+; CHECK-SCALABLE:    br i1 [[TMP16:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-SCALABLE:  middle.block:
+; CHECK-SCALABLE:    br i1 [[CMP_N:%.*]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]], !prof [[PROF5]]
+; CHECK-SCALABLE:  scalar.ph:
+; CHECK-SCALABLE:    br label [[FOR_BODY:%.*]]
+; CHECK-SCALABLE:  for.cond.cleanup:
+; CHECK-SCALABLE:  for.body:
+; CHECK-SCALABLE:    br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]]
+;
 entry:
   br label %for.body