[llvm] [LV] strip TailFoldingStyle::DataWithoutLaneMask (PR #93303)
via llvm-commits
llvm-commits at lists.llvm.org
Fri May 24 07:18:33 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-powerpc
Author: Ramkumar Ramachandra (artagnon)
Changes:
There is just one use of TailFoldingStyle::DataWithoutLaneMask in LoopVectorize, introduced by 413a66f ([LV][VP] VP intrinsics support for the Loop Vectorizer + adding new tail-folding mode using EVL). This use is unnecessary: @llvm.get.active.lane.mask is unrelated to EVL, and SelectionDAG automatically detects whether a target supports the @llvm.get.active.lane.mask intrinsic and, since 243a532 ([SelectionDAG] Lower @llvm.get.active.lane.mask to setcc), lowers it to equivalent instructions on targets where it is not preferred.
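
For reference, a minimal IR sketch (illustrative value names, simplified to a fixed VF of 4; distilled from the DATA_NO_LANEMASK check lines deleted below) contrasting the lane mask the two styles build:

```llvm
; Not part of the patch: how the two tail-folding styles build the lane
; mask for induction %index and trip count %N (with %btc = %N - 1).

; TailFoldingStyle::Data -- a single intrinsic call. Since 243a532,
; SelectionDAG lowers this to a stepvector-plus-compare sequence on
; targets that do not prefer the intrinsic.
%mask.data = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %index, i64 %N)

; TailFoldingStyle::DataWithoutLaneMask -- the same mask built by hand
; with stepvector and an unsigned compare, as in the deleted checks.
%idx.ins   = insertelement <4 x i64> poison, i64 %index, i64 0
%idx.splat = shufflevector <4 x i64> %idx.ins, <4 x i64> poison, <4 x i32> zeroinitializer
%step      = call <4 x i64> @llvm.experimental.stepvector.v4i64()
%vec.iv    = add <4 x i64> %idx.splat, %step
%btc.ins   = insertelement <4 x i64> poison, i64 %btc, i64 0
%btc.splat = shufflevector <4 x i64> %btc.ins, <4 x i64> poison, <4 x i32> zeroinitializer
%mask.nolm = icmp ule <4 x i64> %vec.iv, %btc.splat
```

In other words, with the SelectionDAG lowering in place, forcing DataWithoutLaneMask buys nothing over Data.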
---
Patch is 75.73 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/93303.diff
7 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+3-8)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll (-56)
- (modified) llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll (+320-64)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll (+15-23)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll (+49-52)
- (modified) llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll (+16-22)
- (modified) llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll (+243-50)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 52cb8c9f88f94..44905bee96519 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -243,9 +243,6 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
clEnumValN(
TailFoldingStyle::Data, "data",
"Create lane mask for data only, using active.lane.mask intrinsic"),
- clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
- "data-without-lane-mask",
- "Create lane mask with compare/stepvector"),
clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
"Create lane mask using active.lane.mask intrinsic, and use "
"it for both data and control flow"),
@@ -1538,12 +1535,10 @@ class LoopVectorizationCostModel {
// FIXME: remove this once reductions are supported.
Legal->getReductionVars().empty();
if (!EVLIsLegal) {
- // If for some reason EVL mode is unsupported, fallback to
- // DataWithoutLaneMask to try to vectorize the loop with folded tail
- // in a generic way.
+ // If for some reason EVL mode is unsupported, fallback to Data to try to
+ // vectorize the loop with folded tail in a generic way.
ChosenTailFoldingStyle =
- std::make_pair(TailFoldingStyle::DataWithoutLaneMask,
- TailFoldingStyle::DataWithoutLaneMask);
+ std::make_pair(TailFoldingStyle::Data, TailFoldingStyle::Data);
LLVM_DEBUG(
dbgs()
<< "LV: Preference for VP intrinsics indicated. Will "
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
index 13fc0eaafb808..666c03565dc60 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
@@ -1,7 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=none < %s | FileCheck %s --check-prefix=NONE
; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=data < %s | FileCheck %s --check-prefix=DATA
-; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=data-without-lane-mask < %s | FileCheck %s --check-prefix=DATA_NO_LANEMASK
; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=data-and-control < %s | FileCheck %s --check-prefix=DATA_AND_CONTROL
; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=data-and-control-without-rt-check < %s | FileCheck %s --check-prefix=DATA_AND_CONTROL_NO_RT_CHECK
@@ -99,61 +98,6 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
; DATA: while.end.loopexit:
; DATA-NEXT: ret void
;
-; DATA_NO_LANEMASK-LABEL: @simple_memset_tailfold(
-; DATA_NO_LANEMASK-NEXT: entry:
-; DATA_NO_LANEMASK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; DATA_NO_LANEMASK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[UMAX]]
-; DATA_NO_LANEMASK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DATA_NO_LANEMASK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
-; DATA_NO_LANEMASK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; DATA_NO_LANEMASK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA_NO_LANEMASK: vector.ph:
-; DATA_NO_LANEMASK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DATA_NO_LANEMASK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; DATA_NO_LANEMASK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; DATA_NO_LANEMASK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
-; DATA_NO_LANEMASK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
-; DATA_NO_LANEMASK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]]
-; DATA_NO_LANEMASK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
-; DATA_NO_LANEMASK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; DATA_NO_LANEMASK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX]], 1
-; DATA_NO_LANEMASK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; DATA_NO_LANEMASK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
-; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
-; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT4]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; DATA_NO_LANEMASK-NEXT: br label [[VECTOR_BODY:%.*]]
-; DATA_NO_LANEMASK: vector.body:
-; DATA_NO_LANEMASK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VECTOR_BODY]] ]
-; DATA_NO_LANEMASK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0
-; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX1]], i64 0
-; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT2]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; DATA_NO_LANEMASK-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; DATA_NO_LANEMASK-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
-; DATA_NO_LANEMASK-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT3]], [[TMP11]]
-; DATA_NO_LANEMASK-NEXT: [[TMP12:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
-; DATA_NO_LANEMASK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP9]]
-; DATA_NO_LANEMASK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
-; DATA_NO_LANEMASK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT5]], ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[TMP12]])
-; DATA_NO_LANEMASK-NEXT: [[INDEX_NEXT6]] = add i64 [[INDEX1]], [[TMP16]]
-; DATA_NO_LANEMASK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC]]
-; DATA_NO_LANEMASK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; DATA_NO_LANEMASK: middle.block:
-; DATA_NO_LANEMASK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA_NO_LANEMASK: scalar.ph:
-; DATA_NO_LANEMASK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; DATA_NO_LANEMASK-NEXT: br label [[WHILE_BODY:%.*]]
-; DATA_NO_LANEMASK: while.body:
-; DATA_NO_LANEMASK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; DATA_NO_LANEMASK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]]
-; DATA_NO_LANEMASK-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4
-; DATA_NO_LANEMASK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
-; DATA_NO_LANEMASK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
-; DATA_NO_LANEMASK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
-; DATA_NO_LANEMASK: while.end.loopexit:
-; DATA_NO_LANEMASK-NEXT: ret void
-;
; DATA_AND_CONTROL-LABEL: @simple_memset_tailfold(
; DATA_AND_CONTROL-NEXT: entry:
; DATA_AND_CONTROL-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll
index bd52c2a8f0645..2b50b44a3d289 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
; REQUIRES: asserts
; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
@@ -7,50 +8,183 @@
; RUN: -mcpu=pwr10 -disable-output < %s 2>&1 | FileCheck %s
define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
-; CHECK-LABEL: VPlan 'Initial VPlan for VF={2,4},UF>=1' {
-; CHECK-NEXT: Live-in vp<%0> = VF * UF
-; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
-; CHECK-NEXT: Live-in vp<%2> = backedge-taken count
-; CHECK-NEXT: Live-in ir<%N> = original trip-count
-; CHECK-EMPTY:
-; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: Successor(s): vector loop
-; CHECK-EMPTY:
-; CHECK-NEXT: <x1> vector loop: {
-; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%8>
-; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next, ir<1>
-; CHECK-NEXT: EMIT vp<%4> = icmp ule ir<%iv>, vp<%2>
-; CHECK-NEXT: Successor(s): pred.store
-; CHECK-EMPTY:
-; CHECK-NEXT: <xVFxUF> pred.store: {
-; CHECK-NEXT: pred.store.entry:
-; CHECK-NEXT: BRANCH-ON-MASK vp<%4>
-; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue
-; CHECK-EMPTY:
-; CHECK-NEXT: pred.store.if:
-; CHECK-NEXT: vp<%5> = SCALAR-STEPS vp<%3>, ir<1>
-; CHECK-NEXT: REPLICATE ir<%arrayidx> = getelementptr inbounds ir<%b>, vp<%5>
-; CHECK-NEXT: REPLICATE ir<%0> = load ir<%arrayidx>
-; CHECK-NEXT: REPLICATE ir<%arrayidx2> = getelementptr inbounds ir<%c>, vp<%5>
-; CHECK-NEXT: REPLICATE ir<%1> = load ir<%arrayidx2>
-; CHECK-NEXT: REPLICATE ir<%arrayidx4> = getelementptr inbounds ir<%a>, vp<%5>
-; CHECK-NEXT: REPLICATE ir<%add> = add nsw ir<%1>, ir<%0>
-; CHECK-NEXT: REPLICATE store ir<%add>, ir<%arrayidx4>
-; CHECK-NEXT: Successor(s): pred.store.continue
-; CHECK-EMPTY:
-; CHECK-NEXT: pred.store.continue:
-; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%6> = ir<%0>
-; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%7> = ir<%1>
+; CHECK-LABEL: 'foo'
+; CHECK-NEXT: LV: Loop hints: force=? width=0 interleave=0
+; CHECK-NEXT: LV: Found a loop: for.body
+; CHECK-NEXT: LV: Found an induction variable.
+; CHECK-NEXT: LV: We can vectorize this loop!
+; CHECK-NEXT: LV: Found trip count: 0
+; CHECK-NEXT: LV: vector predicate hint/switch found.
+; CHECK-NEXT: LV: Not allowing scalar epilogue, creating predicated vector loop.
+; CHECK-NEXT: LV: The max safe fixed VF is: 67108864.
+; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 0.
+; CHECK-NEXT: LV: The Smallest and Widest types: 32 / 32 bits.
+; CHECK-NEXT: LV: The Widest register safe to use is: 128 bits.
+; CHECK-NEXT: LV: The Widest register safe to use is: vscale x 0 bits.
+; CHECK-NEXT: LV: The target has no scalable vector registers.
+; CHECK-NEXT: LV: checking if tail can be folded by masking.
+; CHECK-NEXT: LV: can fold tail by masking.
+; CHECK-NEXT: LV: Preference for VP intrinsics indicated. Will not try to generate VP Intrinsics due to non-interleaving reasons.
+; CHECK-NEXT: LV: Invalidate all interleaved groups due to fold-tail by masking which requires masked-interleaved support.
+; CHECK-NEXT: LV: Found uniform instruction: %exitcond.not = icmp eq i64 %iv.next, %N
+; CHECK-NEXT: LV: Found scalar instruction: %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+; CHECK-NEXT: LV: Found scalar instruction: %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+; CHECK-NEXT: LV: Found scalar instruction: %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+; CHECK-NEXT: LV: Found uniform instruction: %exitcond.not = icmp eq i64 %iv.next, %N
+; CHECK-NEXT: LV: Found scalar instruction: %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+; CHECK-NEXT: LV: Found scalar instruction: %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+; CHECK-NEXT: LV: Found scalar instruction: %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+; CHECK-NEXT: LV: Scalarizing and predicating: %0 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT: LV: Scalarizing: %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+; CHECK-NEXT: LV: Scalarizing and predicating: %1 = load i32, ptr %arrayidx2, align 4
+; CHECK-NEXT: LV: Scalarizing: %add = add nsw i32 %1, %0
+; CHECK-NEXT: LV: Scalarizing: %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+; CHECK-NEXT: LV: Scalarizing and predicating: store i32 %add, ptr %arrayidx4, align 4
+; CHECK-NEXT: LV: Scalarizing: %iv.next = add nuw nsw i64 %iv, 1
+; CHECK-NEXT: LV: Scalarizing: %exitcond.not = icmp eq i64 %iv.next, %N
+; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+; CHECK-NEXT: LV: Scalarizing and predicating: %0 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT: LV: Scalarizing: %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+; CHECK-NEXT: LV: Scalarizing and predicating: %1 = load i32, ptr %arrayidx2, align 4
+; CHECK-NEXT: LV: Scalarizing: %add = add nsw i32 %1, %0
+; CHECK-NEXT: LV: Scalarizing: %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+; CHECK-NEXT: LV: Scalarizing and predicating: store i32 %add, ptr %arrayidx4, align 4
+; CHECK-NEXT: LV: Scalarizing: %exitcond.not = icmp eq i64 %iv.next, %N
+; CHECK-NEXT: VPlan 'Initial VPlan for VF={1},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF * UF
+; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
+; CHECK-NEXT: Live-in ir<%N> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%8>
+; CHECK-NEXT: vp<%4> = SCALAR-STEPS vp<%3>, ir<1>
+; CHECK-NEXT: EMIT vp<%5> = active lane mask vp<%4>, ir<%N>
+; CHECK-NEXT: Successor(s): pred.store
+; CHECK-EMPTY:
+; CHECK-NEXT: <xVFxUF> pred.store: {
+; CHECK-NEXT: pred.store.entry:
+; CHECK-NEXT: BRANCH-ON-MASK vp<%5>
+; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.store.if:
+; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%b>, vp<%4>
+; CHECK-NEXT: CLONE ir<%0> = load ir<%arrayidx>
+; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%c>, vp<%4>
+; CHECK-NEXT: CLONE ir<%1> = load ir<%arrayidx2>
+; CHECK-NEXT: CLONE ir<%arrayidx4> = getelementptr inbounds ir<%a>, vp<%4>
+; CHECK-NEXT: CLONE ir<%add> = add nsw ir<%1>, ir<%0>
+; CHECK-NEXT: CLONE store ir<%add>, ir<%arrayidx4>
+; CHECK-NEXT: Successor(s): pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.store.continue:
+; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%6> = ir<%0>
+; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%7> = ir<%1>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): for.body.2
+; CHECK-EMPTY:
+; CHECK-NEXT: for.body.2:
+; CHECK-NEXT: EMIT vp<%8> = add vp<%3>, vp<%0>
+; CHECK-NEXT: EMIT branch-on-count vp<%8>, vp<%1>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: VPlan 'Initial VPlan for VF={2,4},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF * UF
+; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
+; CHECK-NEXT: Live-in ir<%N> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%8>
+; CHECK-NEXT: vp<%4> = SCALAR-STEPS vp<%3>, ir<1>
+; CHECK-NEXT: EMIT vp<%5> = active lane mask vp<%4>, ir<%N>
+; CHECK-NEXT: Successor(s): pred.store
+; CHECK-EMPTY:
+; CHECK-NEXT: <xVFxUF> pred.store: {
+; CHECK-NEXT: pred.store.entry:
+; CHECK-NEXT: BRANCH-ON-MASK vp<%5>
+; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.store.if:
+; CHECK-NEXT: REPLICATE ir<%arrayidx> = getelementptr inbounds ir<%b>, vp<%4>
+; CHECK-NEXT: REPLICATE ir<%0> = load ir<%arrayidx>
+; CHECK-NEXT: REPLICATE ir<%arrayidx2> = getelementptr inbounds ir<%c>, vp<%4>
+; CHECK-NEXT: REPLICATE ir<%1> = load ir<%arrayidx2>
+; CHECK-NEXT: REPLICATE ir<%arrayidx4> = getelementptr inbounds ir<%a>, vp<%4>
+; CHECK-NEXT: REPLICATE ir<%add> = add nsw ir<%1>, ir<%0>
+; CHECK-NEXT: REPLICATE store ir<%add>, ir<%arrayidx4>
+; CHECK-NEXT: Successor(s): pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.store.continue:
+; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%6> = ir<%0>
+; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%7> = ir<%1>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): for.body.2
+; CHECK-EMPTY:
+; CHECK-NEXT: for.body.2:
+; CHECK-NEXT: EMIT vp<%8> = add vp<%3>, vp<%0>
+; CHECK-NEXT: EMIT branch-on-count vp<%8>, vp<%1>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): for.body.2
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %1 = load i32, ptr %arrayidx2, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %add = add nsw i32 %1, %0
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %add, ptr %arrayidx4, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %iv.next = add nuw nsw i64 %iv, 1
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %exitcond.not = icmp eq i64 %iv.next, %N
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+; CHECK-NEXT: LV: Scalar loop costs: 6.
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+; CHECK-NEXT: LV: Found an estimated cost of 3000000 for VF 2 For instruction: %0 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+; CHECK-NEXT: LV: Found an estimated cost of 3000000 for VF 2 For instruction: %1 = load i32, ptr %arrayidx2, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 3 for VF 2 For instruction: %add = add nsw i32 %1, %0
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: store i32 %add, ptr %arrayidx4, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: %iv.next = add nuw nsw i64 %iv, 1
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: %exitcond.not = icmp eq i64 %iv.next, %N
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+; CHECK-NEXT: LV: Vector loop of width 2 costs: 3000003.
+; CHECK-NEXT: LV: Found an estimated cost of 0 for ...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/93303
More information about the llvm-commits mailing list