[llvm] [LV] strip TailFoldingStyle::DataWithoutLaneMask (PR #93303)
Ramkumar Ramachandra via llvm-commits
llvm-commits at lists.llvm.org
Fri May 24 07:18:02 PDT 2024
https://github.com/artagnon created https://github.com/llvm/llvm-project/pull/93303
There is just one use of TailFoldingStyle::DataWithoutLaneMask in LoopVectorize, introduced by 413a66f ([LV, VP]VP intrinsics support for the Loop Vectorizer + adding new tail-folding mode using EVL.), and it is unnecessary: @llvm.get.active.lane.mask is unrelated to EVL. Moreover, since 243a532 ([SelectionDAG] Lower @llvm.get.active.lane.mask to setcc), SelectionDAG automatically detects whether a target supports the @llvm.get.active.lane.mask intrinsic and lowers it to equivalent instructions on targets where it is not preferred.
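For context, a minimal illustrative sketch (not part of the patch, drawn from the test diffs below) contrasting the two ways the lane mask is built; %index and %N stand in for the loop's induction variable and trip count, and %index.splat / %btc.splat are shorthand for broadcasts of the induction value and the backedge-taken count:

  ; TailFoldingStyle::DataWithoutLaneMask: build the mask with compare/stepvector
  %step   = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
  %vec.iv = add <vscale x 4 x i64> %index.splat, %step
  %mask   = icmp ule <vscale x 4 x i64> %vec.iv, %btc.splat   ; btc = trip count - 1

  ; TailFoldingStyle::Data: the intrinsic computes the same predicate directly
  %mask   = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index, i64 %N)

On targets where the intrinsic is not preferred, SelectionDAG expands it back into an equivalent icmp-based sequence (since 243a532), so falling back to Data instead of DataWithoutLaneMask loses no codegen capability.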
>From 1592baba3b6553a906d3ffbbb3dd186b47ff906a Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <r at artagnon.com>
Date: Fri, 24 May 2024 14:44:10 +0100
Subject: [PATCH] [LV] strip TailFoldingStyle::DataWithoutLaneMask
There is just one usage of TailFoldingStyle::DataWithoutLaneMask in
LoopVectorize, introduced by 413a66f ([LV, VP]VP intrinsics support for
the Loop Vectorizer + adding new tail-folding mode using EVL.), but this
usage is completely unnecessary, as @llvm.get.active.lane.mask is
unrelated to EVL. Moreover, SelectionDAG automatically detects if a
target supports the @llvm.get.active.lane.mask intrinsic, and lowers it
to equivalent instructions on targets where it is not preferred, since
243a532 ([SelectionDAG] Lower @llvm.get.active.lane.mask to setcc).
---
.../Transforms/Vectorize/LoopVectorize.cpp | 11 +-
.../AArch64/tail-folding-styles.ll | 56 ---
.../PowerPC/vplan-force-tail-with-evl.ll | 384 +++++++++++++++---
.../LoopVectorize/RISCV/inloop-reduction.ll | 38 +-
...ectorize-force-tail-with-evl-interleave.ll | 101 +++--
.../X86/vectorize-force-tail-with-evl.ll | 38 +-
.../LoopVectorize/X86/vplan-vp-intrinsics.ll | 293 ++++++++++---
7 files changed, 646 insertions(+), 275 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 52cb8c9f88f94..44905bee96519 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -243,9 +243,6 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
clEnumValN(
TailFoldingStyle::Data, "data",
"Create lane mask for data only, using active.lane.mask intrinsic"),
- clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
- "data-without-lane-mask",
- "Create lane mask with compare/stepvector"),
clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
"Create lane mask using active.lane.mask intrinsic, and use "
"it for both data and control flow"),
@@ -1538,12 +1535,10 @@ class LoopVectorizationCostModel {
// FIXME: remove this once reductions are supported.
Legal->getReductionVars().empty();
if (!EVLIsLegal) {
- // If for some reason EVL mode is unsupported, fallback to
- // DataWithoutLaneMask to try to vectorize the loop with folded tail
- // in a generic way.
+ // If for some reason EVL mode is unsupported, fallback to Data to try to
+ // vectorize the loop with folded tail in a generic way.
ChosenTailFoldingStyle =
- std::make_pair(TailFoldingStyle::DataWithoutLaneMask,
- TailFoldingStyle::DataWithoutLaneMask);
+ std::make_pair(TailFoldingStyle::Data, TailFoldingStyle::Data);
LLVM_DEBUG(
dbgs()
<< "LV: Preference for VP intrinsics indicated. Will "
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
index 13fc0eaafb808..666c03565dc60 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
@@ -1,7 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=none < %s | FileCheck %s --check-prefix=NONE
; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=data < %s | FileCheck %s --check-prefix=DATA
-; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=data-without-lane-mask < %s | FileCheck %s --check-prefix=DATA_NO_LANEMASK
; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=data-and-control < %s | FileCheck %s --check-prefix=DATA_AND_CONTROL
; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=data-and-control-without-rt-check < %s | FileCheck %s --check-prefix=DATA_AND_CONTROL_NO_RT_CHECK
@@ -99,61 +98,6 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
; DATA: while.end.loopexit:
; DATA-NEXT: ret void
;
-; DATA_NO_LANEMASK-LABEL: @simple_memset_tailfold(
-; DATA_NO_LANEMASK-NEXT: entry:
-; DATA_NO_LANEMASK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; DATA_NO_LANEMASK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[UMAX]]
-; DATA_NO_LANEMASK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DATA_NO_LANEMASK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
-; DATA_NO_LANEMASK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; DATA_NO_LANEMASK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA_NO_LANEMASK: vector.ph:
-; DATA_NO_LANEMASK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DATA_NO_LANEMASK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; DATA_NO_LANEMASK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; DATA_NO_LANEMASK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
-; DATA_NO_LANEMASK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
-; DATA_NO_LANEMASK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]]
-; DATA_NO_LANEMASK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
-; DATA_NO_LANEMASK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; DATA_NO_LANEMASK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX]], 1
-; DATA_NO_LANEMASK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; DATA_NO_LANEMASK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
-; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
-; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT4]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; DATA_NO_LANEMASK-NEXT: br label [[VECTOR_BODY:%.*]]
-; DATA_NO_LANEMASK: vector.body:
-; DATA_NO_LANEMASK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VECTOR_BODY]] ]
-; DATA_NO_LANEMASK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0
-; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX1]], i64 0
-; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT2]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; DATA_NO_LANEMASK-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; DATA_NO_LANEMASK-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
-; DATA_NO_LANEMASK-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT3]], [[TMP11]]
-; DATA_NO_LANEMASK-NEXT: [[TMP12:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
-; DATA_NO_LANEMASK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP9]]
-; DATA_NO_LANEMASK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
-; DATA_NO_LANEMASK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT5]], ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[TMP12]])
-; DATA_NO_LANEMASK-NEXT: [[INDEX_NEXT6]] = add i64 [[INDEX1]], [[TMP16]]
-; DATA_NO_LANEMASK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC]]
-; DATA_NO_LANEMASK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; DATA_NO_LANEMASK: middle.block:
-; DATA_NO_LANEMASK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA_NO_LANEMASK: scalar.ph:
-; DATA_NO_LANEMASK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; DATA_NO_LANEMASK-NEXT: br label [[WHILE_BODY:%.*]]
-; DATA_NO_LANEMASK: while.body:
-; DATA_NO_LANEMASK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; DATA_NO_LANEMASK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]]
-; DATA_NO_LANEMASK-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4
-; DATA_NO_LANEMASK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
-; DATA_NO_LANEMASK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
-; DATA_NO_LANEMASK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
-; DATA_NO_LANEMASK: while.end.loopexit:
-; DATA_NO_LANEMASK-NEXT: ret void
-;
; DATA_AND_CONTROL-LABEL: @simple_memset_tailfold(
; DATA_AND_CONTROL-NEXT: entry:
; DATA_AND_CONTROL-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll
index bd52c2a8f0645..2b50b44a3d289 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
; REQUIRES: asserts
; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
@@ -7,50 +8,183 @@
; RUN: -mcpu=pwr10 -disable-output < %s 2>&1 | FileCheck %s
define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
-; CHECK-LABEL: VPlan 'Initial VPlan for VF={2,4},UF>=1' {
-; CHECK-NEXT: Live-in vp<%0> = VF * UF
-; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
-; CHECK-NEXT: Live-in vp<%2> = backedge-taken count
-; CHECK-NEXT: Live-in ir<%N> = original trip-count
-; CHECK-EMPTY:
-; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: Successor(s): vector loop
-; CHECK-EMPTY:
-; CHECK-NEXT: <x1> vector loop: {
-; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%8>
-; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next, ir<1>
-; CHECK-NEXT: EMIT vp<%4> = icmp ule ir<%iv>, vp<%2>
-; CHECK-NEXT: Successor(s): pred.store
-; CHECK-EMPTY:
-; CHECK-NEXT: <xVFxUF> pred.store: {
-; CHECK-NEXT: pred.store.entry:
-; CHECK-NEXT: BRANCH-ON-MASK vp<%4>
-; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue
-; CHECK-EMPTY:
-; CHECK-NEXT: pred.store.if:
-; CHECK-NEXT: vp<%5> = SCALAR-STEPS vp<%3>, ir<1>
-; CHECK-NEXT: REPLICATE ir<%arrayidx> = getelementptr inbounds ir<%b>, vp<%5>
-; CHECK-NEXT: REPLICATE ir<%0> = load ir<%arrayidx>
-; CHECK-NEXT: REPLICATE ir<%arrayidx2> = getelementptr inbounds ir<%c>, vp<%5>
-; CHECK-NEXT: REPLICATE ir<%1> = load ir<%arrayidx2>
-; CHECK-NEXT: REPLICATE ir<%arrayidx4> = getelementptr inbounds ir<%a>, vp<%5>
-; CHECK-NEXT: REPLICATE ir<%add> = add nsw ir<%1>, ir<%0>
-; CHECK-NEXT: REPLICATE store ir<%add>, ir<%arrayidx4>
-; CHECK-NEXT: Successor(s): pred.store.continue
-; CHECK-EMPTY:
-; CHECK-NEXT: pred.store.continue:
-; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%6> = ir<%0>
-; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%7> = ir<%1>
+; CHECK-LABEL: 'foo'
+; CHECK-NEXT: LV: Loop hints: force=? width=0 interleave=0
+; CHECK-NEXT: LV: Found a loop: for.body
+; CHECK-NEXT: LV: Found an induction variable.
+; CHECK-NEXT: LV: We can vectorize this loop!
+; CHECK-NEXT: LV: Found trip count: 0
+; CHECK-NEXT: LV: vector predicate hint/switch found.
+; CHECK-NEXT: LV: Not allowing scalar epilogue, creating predicated vector loop.
+; CHECK-NEXT: LV: The max safe fixed VF is: 67108864.
+; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 0.
+; CHECK-NEXT: LV: The Smallest and Widest types: 32 / 32 bits.
+; CHECK-NEXT: LV: The Widest register safe to use is: 128 bits.
+; CHECK-NEXT: LV: The Widest register safe to use is: vscale x 0 bits.
+; CHECK-NEXT: LV: The target has no scalable vector registers.
+; CHECK-NEXT: LV: checking if tail can be folded by masking.
+; CHECK-NEXT: LV: can fold tail by masking.
+; CHECK-NEXT: LV: Preference for VP intrinsics indicated. Will not try to generate VP Intrinsics due to non-interleaving reasons.
+; CHECK-NEXT: LV: Invalidate all interleaved groups due to fold-tail by masking which requires masked-interleaved support.
+; CHECK-NEXT: LV: Found uniform instruction: %exitcond.not = icmp eq i64 %iv.next, %N
+; CHECK-NEXT: LV: Found scalar instruction: %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+; CHECK-NEXT: LV: Found scalar instruction: %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+; CHECK-NEXT: LV: Found scalar instruction: %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+; CHECK-NEXT: LV: Found uniform instruction: %exitcond.not = icmp eq i64 %iv.next, %N
+; CHECK-NEXT: LV: Found scalar instruction: %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+; CHECK-NEXT: LV: Found scalar instruction: %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+; CHECK-NEXT: LV: Found scalar instruction: %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+; CHECK-NEXT: LV: Scalarizing and predicating: %0 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT: LV: Scalarizing: %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+; CHECK-NEXT: LV: Scalarizing and predicating: %1 = load i32, ptr %arrayidx2, align 4
+; CHECK-NEXT: LV: Scalarizing: %add = add nsw i32 %1, %0
+; CHECK-NEXT: LV: Scalarizing: %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+; CHECK-NEXT: LV: Scalarizing and predicating: store i32 %add, ptr %arrayidx4, align 4
+; CHECK-NEXT: LV: Scalarizing: %iv.next = add nuw nsw i64 %iv, 1
+; CHECK-NEXT: LV: Scalarizing: %exitcond.not = icmp eq i64 %iv.next, %N
+; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+; CHECK-NEXT: LV: Scalarizing and predicating: %0 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT: LV: Scalarizing: %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+; CHECK-NEXT: LV: Scalarizing and predicating: %1 = load i32, ptr %arrayidx2, align 4
+; CHECK-NEXT: LV: Scalarizing: %add = add nsw i32 %1, %0
+; CHECK-NEXT: LV: Scalarizing: %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+; CHECK-NEXT: LV: Scalarizing and predicating: store i32 %add, ptr %arrayidx4, align 4
+; CHECK-NEXT: LV: Scalarizing: %exitcond.not = icmp eq i64 %iv.next, %N
+; CHECK-NEXT: VPlan 'Initial VPlan for VF={1},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF * UF
+; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
+; CHECK-NEXT: Live-in ir<%N> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%8>
+; CHECK-NEXT: vp<%4> = SCALAR-STEPS vp<%3>, ir<1>
+; CHECK-NEXT: EMIT vp<%5> = active lane mask vp<%4>, ir<%N>
+; CHECK-NEXT: Successor(s): pred.store
+; CHECK-EMPTY:
+; CHECK-NEXT: <xVFxUF> pred.store: {
+; CHECK-NEXT: pred.store.entry:
+; CHECK-NEXT: BRANCH-ON-MASK vp<%5>
+; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.store.if:
+; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%b>, vp<%4>
+; CHECK-NEXT: CLONE ir<%0> = load ir<%arrayidx>
+; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%c>, vp<%4>
+; CHECK-NEXT: CLONE ir<%1> = load ir<%arrayidx2>
+; CHECK-NEXT: CLONE ir<%arrayidx4> = getelementptr inbounds ir<%a>, vp<%4>
+; CHECK-NEXT: CLONE ir<%add> = add nsw ir<%1>, ir<%0>
+; CHECK-NEXT: CLONE store ir<%add>, ir<%arrayidx4>
+; CHECK-NEXT: Successor(s): pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.store.continue:
+; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%6> = ir<%0>
+; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%7> = ir<%1>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): for.body.2
+; CHECK-EMPTY:
+; CHECK-NEXT: for.body.2:
+; CHECK-NEXT: EMIT vp<%8> = add vp<%3>, vp<%0>
+; CHECK-NEXT: EMIT branch-on-count vp<%8>, vp<%1>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: VPlan 'Initial VPlan for VF={2,4},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF * UF
+; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
+; CHECK-NEXT: Live-in ir<%N> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%8>
+; CHECK-NEXT: vp<%4> = SCALAR-STEPS vp<%3>, ir<1>
+; CHECK-NEXT: EMIT vp<%5> = active lane mask vp<%4>, ir<%N>
+; CHECK-NEXT: Successor(s): pred.store
+; CHECK-EMPTY:
+; CHECK-NEXT: <xVFxUF> pred.store: {
+; CHECK-NEXT: pred.store.entry:
+; CHECK-NEXT: BRANCH-ON-MASK vp<%5>
+; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.store.if:
+; CHECK-NEXT: REPLICATE ir<%arrayidx> = getelementptr inbounds ir<%b>, vp<%4>
+; CHECK-NEXT: REPLICATE ir<%0> = load ir<%arrayidx>
+; CHECK-NEXT: REPLICATE ir<%arrayidx2> = getelementptr inbounds ir<%c>, vp<%4>
+; CHECK-NEXT: REPLICATE ir<%1> = load ir<%arrayidx2>
+; CHECK-NEXT: REPLICATE ir<%arrayidx4> = getelementptr inbounds ir<%a>, vp<%4>
+; CHECK-NEXT: REPLICATE ir<%add> = add nsw ir<%1>, ir<%0>
+; CHECK-NEXT: REPLICATE store ir<%add>, ir<%arrayidx4>
+; CHECK-NEXT: Successor(s): pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.store.continue:
+; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%6> = ir<%0>
+; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%7> = ir<%1>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): for.body.2
+; CHECK-EMPTY:
+; CHECK-NEXT: for.body.2:
+; CHECK-NEXT: EMIT vp<%8> = add vp<%3>, vp<%0>
+; CHECK-NEXT: EMIT branch-on-count vp<%8>, vp<%1>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): for.body.2
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %1 = load i32, ptr %arrayidx2, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %add = add nsw i32 %1, %0
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %add, ptr %arrayidx4, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %iv.next = add nuw nsw i64 %iv, 1
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %exitcond.not = icmp eq i64 %iv.next, %N
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+; CHECK-NEXT: LV: Scalar loop costs: 6.
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+; CHECK-NEXT: LV: Found an estimated cost of 3000000 for VF 2 For instruction: %0 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+; CHECK-NEXT: LV: Found an estimated cost of 3000000 for VF 2 For instruction: %1 = load i32, ptr %arrayidx2, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 3 for VF 2 For instruction: %add = add nsw i32 %1, %0
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: store i32 %add, ptr %arrayidx4, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: %iv.next = add nuw nsw i64 %iv, 1
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: %exitcond.not = icmp eq i64 %iv.next, %N
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+; CHECK-NEXT: LV: Vector loop of width 2 costs: 3000003.
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 4 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+; CHECK-NEXT: LV: Found an estimated cost of 3000000 for VF 4 For instruction: %0 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 4 For instruction: %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+; CHECK-NEXT: LV: Found an estimated cost of 3000000 for VF 4 For instruction: %1 = load i32, ptr %arrayidx2, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 6 for VF 4 For instruction: %add = add nsw i32 %1, %0
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 4 For instruction: %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+; CHECK-NEXT: LV: Found an estimated cost of 2 for VF 4 For instruction: store i32 %add, ptr %arrayidx4, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 2 for VF 4 For instruction: %iv.next = add nuw nsw i64 %iv, 1
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: %exitcond.not = icmp eq i64 %iv.next, %N
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 4 For instruction: br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+; CHECK-NEXT: LV: Vector loop of width 4 costs: 1500002.
+; CHECK-NEXT: LV: Selecting VF: 1.
+; CHECK-NEXT: LV: Vectorization is possible but not beneficial.
+; CHECK-NEXT: LV: Interleaving is not beneficial.
; CHECK-EMPTY:
-; CHECK-NEXT: for.body.2:
-; CHECK-NEXT: EMIT vp<%8> = add vp<%3>, vp<%0>
-; CHECK-NEXT: EMIT branch-on-count vp<%8>, vp<%1>
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
;
entry:
br label %for.body
@@ -73,29 +207,151 @@ for.cond.cleanup:
}
define void @safe_dep(ptr %p) {
-; CHECK-LABEL: VPlan 'Initial VPlan for VF={2},UF>=1' {
-; CHECK-NEXT: Live-in vp<%0> = VF * UF
-; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
-; CHECK-NEXT: Live-in ir<512> = original trip-count
-; CHECK-EMPTY:
-; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: Successor(s): vector loop
-; CHECK-EMPTY:
-; CHECK-NEXT: <x1> vector loop: {
-; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%6>
-; CHECK-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
-; CHECK-NEXT: CLONE ir<%a1> = getelementptr ir<%p>, vp<%3>
-; CHECK-NEXT: vp<%4> = vector-pointer ir<%a1>
-; CHECK-NEXT: WIDEN ir<%v> = load vp<%4>
-; CHECK-NEXT: CLONE ir<%offset> = add vp<%3>, ir<100>
-; CHECK-NEXT: CLONE ir<%a2> = getelementptr ir<%p>, ir<%offset>
-; CHECK-NEXT: vp<%5> = vector-pointer ir<%a2>
-; CHECK-NEXT: WIDEN store vp<%5>, ir<%v>
-; CHECK-NEXT: EMIT vp<%6> = add nuw vp<%2>, vp<%0>
-; CHECK-NEXT: EMIT branch-on-count vp<%6>, vp<%1>
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
+; CHECK-LABEL: 'safe_dep'
+; CHECK-NEXT: LV: Loop hints: force=? width=0 interleave=0
+; CHECK-NEXT: LV: Found a loop: loop
+; CHECK-NEXT: LV: Found an induction variable.
+; CHECK-NEXT: LV: We can vectorize this loop!
+; CHECK-NEXT: LV: Found trip count: 512
+; CHECK-NEXT: LV: vector predicate hint/switch found.
+; CHECK-NEXT: LV: Not allowing scalar epilogue, creating predicated vector loop.
+; CHECK-NEXT: LV: The max safe fixed VF is: 4.
+; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 0.
+; CHECK-NEXT: LV: The Smallest and Widest types: 64 / 64 bits.
+; CHECK-NEXT: LV: The Widest register safe to use is: 128 bits.
+; CHECK-NEXT: LV: The Widest register safe to use is: vscale x 0 bits.
+; CHECK-NEXT: LV: The target has no scalable vector registers.
+; CHECK-NEXT: LV: No tail will remain for any chosen VF.
+; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ne i64 %iv, 511
+; CHECK-NEXT: LV: Found uniform instruction: %a1 = getelementptr i64, ptr %p, i64 %iv
+; CHECK-NEXT: LV: Found uniform instruction: %a2 = getelementptr i64, ptr %p, i64 %offset
+; CHECK-NEXT: LV: Found uniform instruction: %offset = add i64 %iv, 100
+; CHECK-NEXT: LV: Found uniform instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT: LV: Found uniform instruction: %iv.next = add i64 %iv, 1
+; CHECK-NEXT: LV: Found scalar instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT: LV: Found scalar instruction: %iv.next = add i64 %iv, 1
+; CHECK-NEXT: LV: Scalarizing: %a1 = getelementptr i64, ptr %p, i64 %iv
+; CHECK-NEXT: LV: Scalarizing: %v = load i64, ptr %a1, align 32
+; CHECK-NEXT: LV: Scalarizing: %offset = add i64 %iv, 100
+; CHECK-NEXT: LV: Scalarizing: %a2 = getelementptr i64, ptr %p, i64 %offset
+; CHECK-NEXT: LV: Scalarizing: store i64 %v, ptr %a2, align 32
+; CHECK-NEXT: LV: Scalarizing: %iv.next = add i64 %iv, 1
+; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ne i64 %iv, 511
+; CHECK-NEXT: LV: Scalarizing: %a1 = getelementptr i64, ptr %p, i64 %iv
+; CHECK-NEXT: LV: Scalarizing: %offset = add i64 %iv, 100
+; CHECK-NEXT: LV: Scalarizing: %a2 = getelementptr i64, ptr %p, i64 %offset
+; CHECK-NEXT: LV: Scalarizing: %iv.next = add i64 %iv, 1
+; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ne i64 %iv, 511
+; CHECK-NEXT: VPlan 'Initial VPlan for VF={1},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF * UF
+; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
+; CHECK-NEXT: Live-in ir<512> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%4>
+; CHECK-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK-NEXT: CLONE ir<%a1> = getelementptr ir<%p>, vp<%3>
+; CHECK-NEXT: CLONE ir<%v> = load ir<%a1>
+; CHECK-NEXT: CLONE ir<%offset> = add vp<%3>, ir<100>
+; CHECK-NEXT: CLONE ir<%a2> = getelementptr ir<%p>, ir<%offset>
+; CHECK-NEXT: CLONE store ir<%v>, ir<%a2>
+; CHECK-NEXT: EMIT vp<%4> = add nuw vp<%2>, vp<%0>
+; CHECK-NEXT: EMIT branch-on-count vp<%4>, vp<%1>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: VPlan 'Initial VPlan for VF={2},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF * UF
+; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
+; CHECK-NEXT: Live-in ir<512> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%6>
+; CHECK-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK-NEXT: CLONE ir<%a1> = getelementptr ir<%p>, vp<%3>
+; CHECK-NEXT: vp<%4> = vector-pointer ir<%a1>
+; CHECK-NEXT: WIDEN ir<%v> = load vp<%4>
+; CHECK-NEXT: CLONE ir<%offset> = add vp<%3>, ir<100>
+; CHECK-NEXT: CLONE ir<%a2> = getelementptr ir<%p>, ir<%offset>
+; CHECK-NEXT: vp<%5> = vector-pointer ir<%a2>
+; CHECK-NEXT: WIDEN store vp<%5>, ir<%v>
+; CHECK-NEXT: EMIT vp<%6> = add nuw vp<%2>, vp<%0>
+; CHECK-NEXT: EMIT branch-on-count vp<%6>, vp<%1>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: %a1 = getelementptr i64, ptr %p, i64 %iv
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %v = load i64, ptr %a1, align 32
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %offset = add i64 %iv, 100
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: %a2 = getelementptr i64, ptr %p, i64 %offset
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v, ptr %a2, align 32
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %iv.next = add i64 %iv, 1
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %cmp = icmp ne i64 %iv, 511
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %cmp, label %loop, label %exit
+; CHECK-NEXT: LV: Scalar loop costs: 5.
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: %a1 = getelementptr i64, ptr %p, i64 %iv
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: %v = load i64, ptr %a1, align 32
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: %offset = add i64 %iv, 100
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: %a2 = getelementptr i64, ptr %p, i64 %offset
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: store i64 %v, ptr %a2, align 32
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: %iv.next = add i64 %iv, 1
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: %cmp = icmp ne i64 %iv, 511
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: br i1 %cmp, label %loop, label %exit
+; CHECK-NEXT: LV: Vector loop of width 2 costs: 2.
+; CHECK-NEXT: LV: Selecting VF: 2.
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:0
+; CHECK-NEXT: LV: Interleaving is not beneficial.
+; CHECK-NEXT: LV: Found a vectorizable loop (2) in <stdin>
+; CHECK-NEXT: LEV: Unable to vectorize epilogue because no epilogue is allowed.
+; CHECK-NEXT: Executing best plan with VF=2, UF=1
+; CHECK-NEXT: VPlan 'Final VPlan for VF={2},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF * UF
+; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
+; CHECK-NEXT: Live-in ir<512> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%6>
+; CHECK-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK-NEXT: CLONE ir<%a1> = getelementptr ir<%p>, vp<%3>
+; CHECK-NEXT: vp<%4> = vector-pointer ir<%a1>
+; CHECK-NEXT: WIDEN ir<%v> = load vp<%4>
+; CHECK-NEXT: CLONE ir<%offset> = add vp<%3>, ir<100>
+; CHECK-NEXT: CLONE ir<%a2> = getelementptr ir<%p>, ir<%offset>
+; CHECK-NEXT: vp<%5> = vector-pointer ir<%a2>
+; CHECK-NEXT: WIDEN store vp<%5>, ir<%v>
+; CHECK-NEXT: EMIT vp<%6> = add nuw vp<%2>, vp<%0>
+; CHECK-NEXT: EMIT branch-on-count vp<%6>, vp<%1>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: LV: Interleaving disabled by the pass manager
+; CHECK-NEXT: LV: Vectorizing: innermost loop.
;
entry:
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
index b876e9d2c1a5c..1932d78e59814 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
@@ -132,50 +132,42 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], [[TMP4]]
; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[N]], 1
; IF-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
; IF-EVL-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], 4
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 0
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[INDEX]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
-; IF-EVL-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP8]]
-; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[TMP9]]
-; IF-EVL-NEXT: [[TMP10:%.*]] = icmp ule <vscale x 4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT2]]
-; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP7]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0
-; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP12]], i32 2, <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i16> poison)
-; IF-EVL-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
-; IF-EVL-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP13]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i32> [[TMP14]], <vscale x 4 x i32> [[VEC_PHI]]
+; IF-EVL-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 [[TMP7]], i32 [[N]])
+; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP7]]
+; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP9]], i32 2, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i16> poison)
+; IF-EVL-NEXT: [[TMP10:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
+; IF-EVL-NEXT: [[TMP11]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP11]], <vscale x 4 x i32> [[VEC_PHI]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP6]]
-; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; IF-EVL: middle.block:
-; IF-EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP15]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP12]])
; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL: for.body:
; IF-EVL-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; IF-EVL-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
-; IF-EVL-NEXT: [[CONV:%.*]] = sext i16 [[TMP18]] to i32
+; IF-EVL-NEXT: [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; IF-EVL-NEXT: [[CONV:%.*]] = sext i16 [[TMP15]] to i32
; IF-EVL-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
; IF-EVL-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
; IF-EVL-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
; IF-EVL-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; IF-EVL: for.cond.cleanup.loopexit:
-; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
; IF-EVL: for.cond.cleanup:
; IF-EVL-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
index 0b495bc680f0c..abdc6a91b9257 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
@@ -13,21 +13,20 @@
define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
; IF-EVL-LABEL: @interleave(
; IF-EVL-NEXT: entry:
-; IF-EVL-NEXT: [[TMP17:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP31]], 8
-; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP17]], [[TMP2]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; IF-EVL: vector.ph:
; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8
-; IF-EVL-NEXT: [[TMP32:%.*]] = sub i64 [[TMP7]], 1
-; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP32]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 8
; IF-EVL-NEXT: [[TMP11:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
@@ -36,46 +35,44 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
; IF-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP13]]
; IF-EVL-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
-; IF-EVL-NEXT: [[TMP37:%.*]] = mul i64 1, [[TMP15]]
-; IF-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP37]], i64 0
+; IF-EVL-NEXT: [[TMP16:%.*]] = mul i64 1, [[TMP15]]
+; IF-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP16]], i64 0
; IF-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[STEP_ADD:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; IF-EVL-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 0
; IF-EVL-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4
-; IF-EVL-NEXT: [[TMP38:%.*]] = add i64 [[TMP19]], 0
-; IF-EVL-NEXT: [[TMP39:%.*]] = mul i64 [[TMP38]], 1
-; IF-EVL-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], [[TMP39]]
-; IF-EVL-NEXT: [[TMP23:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; IF-EVL-NEXT: [[TMP24:%.*]] = icmp ule <vscale x 4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]]
-; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0
-; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[STEP_ADD]], i32 0
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP25]], i32 4, <vscale x 4 x i1> [[TMP23]], <vscale x 4 x i32> poison)
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP26]], i32 4, <vscale x 4 x i1> [[TMP24]], <vscale x 4 x i32> poison)
-; IF-EVL-NEXT: [[TMP27:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[VEC_IND]], i32 1
-; IF-EVL-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[STEP_ADD]], i32 1
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP27]], i32 4, <vscale x 4 x i1> [[TMP23]], <vscale x 4 x i32> poison)
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP28]], i32 4, <vscale x 4 x i1> [[TMP24]], <vscale x 4 x i32> poison)
-; IF-EVL-NEXT: [[TMP29:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER3]], [[WIDE_MASKED_GATHER]]
-; IF-EVL-NEXT: [[TMP30:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER4]], [[WIDE_MASKED_GATHER2]]
-; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
-; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
-; IF-EVL-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; IF-EVL-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 4
-; IF-EVL-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 [[TMP35]]
-; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP29]], ptr [[TMP33]], i32 4, <vscale x 4 x i1> [[TMP23]])
-; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP30]], ptr [[TMP36]], i32 4, <vscale x 4 x i1> [[TMP24]])
+; IF-EVL-NEXT: [[TMP20:%.*]] = add i64 [[TMP19]], 0
+; IF-EVL-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 1
+; IF-EVL-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], [[TMP21]]
+; IF-EVL-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP17]], i64 [[N]])
+; IF-EVL-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP22]], i64 [[N]])
+; IF-EVL-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0
+; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[STEP_ADD]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP23]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP24]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[VEC_IND]], i32 1
+; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[STEP_ADD]], i32 1
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP25]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP26]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT: [[TMP27:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER4]], [[WIDE_MASKED_GATHER]]
+; IF-EVL-NEXT: [[TMP28:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER5]], [[WIDE_MASKED_GATHER3]]
+; IF-EVL-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP17]]
+; IF-EVL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP22]]
+; IF-EVL-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP29]], i32 0
+; IF-EVL-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 4
+; IF-EVL-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP29]], i64 [[TMP33]]
+; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP27]], ptr [[TMP31]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP28]], ptr [[TMP34]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]])
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[STEP_ADD]], [[DOTSPLAT]]
-; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
@@ -84,10 +81,10 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
; IF-EVL: for.body:
; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 0
-; IF-EVL-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[TMP36:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 1
-; IF-EVL-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP21]]
+; IF-EVL-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP37]], [[TMP36]]
; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
@@ -106,9 +103,9 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
; NO-VP: vector.body:
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-VP-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; NO-VP-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[TMP10]], i32 0
+; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[TMP0]], i32 0
; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[TMP1]], i32 0
; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
@@ -120,15 +117,15 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
; NO-VP-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; NO-VP-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC]]
; NO-VP-NEXT: [[TMP7:%.*]] = add nsw <8 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC2]]
-; NO-VP-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]]
-; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
-; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 0
-; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 8
-; NO-VP-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP12]], align 4
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
+; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 8
+; NO-VP-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP10]], align 4
; NO-VP-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP11]], align 4
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; NO-VP-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-VP-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; NO-VP: middle.block:
; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
@@ -138,10 +135,10 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
; NO-VP: for.body:
; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 0
-; NO-VP-NEXT: [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 1
-; NO-VP-NEXT: [[TMP30:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP30]], [[TMP29]]
+; NO-VP-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP13]]
; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll
index 1cf71360adf72..1a9d351ca33a1 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll
@@ -17,30 +17,24 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 15
; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[INDEX]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
-; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
-; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <16 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
-; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]]
-; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
-; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP3]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison)
-; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]]
-; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; IF-EVL-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP5]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison)
-; IF-EVL-NEXT: [[TMP6:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]]
-; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
-; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; IF-EVL-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP6]], ptr [[TMP8]], i32 4, <16 x i1> [[TMP1]])
+; IF-EVL-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[TMP0]], i64 [[N]])
+; IF-EVL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP2]], i32 4, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> poison)
+; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP4]], i32 4, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> poison)
+; IF-EVL-NEXT: [[TMP5:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; IF-EVL-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP5]], ptr [[TMP7]], i32 4, <16 x i1> [[ACTIVE_LANE_MASK]])
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
-; IF-EVL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
@@ -49,10 +43,10 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; IF-EVL: for.body:
; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; IF-EVL-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
-; IF-EVL-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], [[TMP9]]
; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll
index 9b49d44141db3..b2902c799124a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
; REQUIRES: asserts
; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \
@@ -11,62 +12,254 @@
; RUN: -mtriple=x86_64 -mattr=+avx512f -disable-output < %s 2>&1 | FileCheck --check-prefix=NO-VP %s
define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
-; IF-EVL: VPlan 'Initial VPlan for VF={4},UF>=1' {
-; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
-; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
-; IF-EVL-NEXT: Live-in vp<[[BETC:%[0-9]+]]> = backedge-taken count
-; IF-EVL-NEXT: Live-in ir<%N> = original trip-count
+; IF-EVL-LABEL: 'foo'
+; IF-EVL-NEXT: LV: Loop hints: force=? width=4 interleave=0
+; IF-EVL-NEXT: LV: Found a loop: for.body
+; IF-EVL-NEXT: LV: Found an induction variable.
+; IF-EVL-NEXT: LV: We can vectorize this loop!
+; IF-EVL-NEXT: LV: Found trip count: 0
+; IF-EVL-NEXT: LV: vector predicate hint/switch found.
+; IF-EVL-NEXT: LV: Not allowing scalar epilogue, creating predicated vector loop.
+; IF-EVL-NEXT: LV: The max safe fixed VF is: 67108864.
+; IF-EVL-NEXT: LV: The max safe scalable VF is: vscale x 0.
+; IF-EVL-NEXT: LV: checking if tail can be folded by masking.
+; IF-EVL-NEXT: LV: can fold tail by masking.
+; IF-EVL-NEXT: LV: Preference for VP intrinsics indicated. Will not try to generate VP Intrinsics due to non-interleaving reasons.
+; IF-EVL-NEXT: LV: Invalidate all interleaved groups due to fold-tail by masking which requires masked-interleaved support.
+; IF-EVL-NEXT: LV: Found uniform instruction: %exitcond.not = icmp eq i64 %iv.next, %N
+; IF-EVL-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+; IF-EVL-NEXT: LV: Found uniform instruction: %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+; IF-EVL-NEXT: LV: Found uniform instruction: %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+; IF-EVL-NEXT: LV: Found uniform instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; IF-EVL-NEXT: LV: Found uniform instruction: %iv.next = add nuw nsw i64 %iv, 1
+; IF-EVL-NEXT: LV: Found an estimated cost of 0 for VF 4 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; IF-EVL-NEXT: LV: Found an estimated cost of 0 for VF 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+; IF-EVL-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: %0 = load i32, ptr %arrayidx, align 4
+; IF-EVL-NEXT: LV: Found an estimated cost of 0 for VF 4 For instruction: %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+; IF-EVL-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: %1 = load i32, ptr %arrayidx2, align 4
+; IF-EVL-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: %add = add nsw i32 %1, %0
+; IF-EVL-NEXT: LV: Found an estimated cost of 0 for VF 4 For instruction: %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+; IF-EVL-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: store i32 %add, ptr %arrayidx4, align 4
+; IF-EVL-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: %iv.next = add nuw nsw i64 %iv, 1
+; IF-EVL-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: %exitcond.not = icmp eq i64 %iv.next, %N
+; IF-EVL-NEXT: LV: Found an estimated cost of 0 for VF 4 For instruction: br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+; IF-EVL-NEXT: LV: Using user VF 4.
+; IF-EVL-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+; IF-EVL-NEXT: LV: Scalarizing: %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+; IF-EVL-NEXT: LV: Scalarizing: %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+; IF-EVL-NEXT: LV: Scalarizing: %iv.next = add nuw nsw i64 %iv, 1
+; IF-EVL-NEXT: LV: Scalarizing: %exitcond.not = icmp eq i64 %iv.next, %N
+; IF-EVL-NEXT: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; IF-EVL-NEXT: Live-in vp<%0> = VF * UF
+; IF-EVL-NEXT: Live-in vp<%1> = vector-trip-count
+; IF-EVL-NEXT: Live-in ir<%N> = original trip-count
; IF-EVL-EMPTY:
-; IF-EVL: vector.ph:
-; IF-EVL-NEXT: Successor(s): vector loop
+; IF-EVL-NEXT: vector.ph:
+; IF-EVL-NEXT: Successor(s): vector loop
; IF-EVL-EMPTY:
-; IF-EVL-NEXT: <x1> vector loop: {
-; IF-EVL-NEXT: vector.body:
-; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
-; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
-; IF-EVL-NEXT: EMIT vp<[[VIV:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[IV]]>
-; IF-EVL-NEXT: EMIT vp<[[MASK:%[0-9]+]]> = icmp ule vp<[[VIV]]>, vp<[[BETC]]>
-; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
-; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
-; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]>, vp<[[MASK]]>
-; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
-; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
-; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = load vp<[[PTR2]]>, vp<[[MASK]]>
-; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]>
-; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
-; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
-; IF-EVL-NEXT: WIDEN store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[MASK]]>
-; IF-EVL-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
-; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; IF-EVL-NEXT: <x1> vector loop: {
+; IF-EVL-NEXT: vector.body:
+; IF-EVL-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%9>
+; IF-EVL-NEXT: vp<%4> = SCALAR-STEPS vp<%3>, ir<1>
+; IF-EVL-NEXT: EMIT vp<%5> = active lane mask vp<%4>, ir<%N>
+; IF-EVL-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%b>, vp<%4>
+; IF-EVL-NEXT: vp<%6> = vector-pointer ir<%arrayidx>
+; IF-EVL-NEXT: WIDEN ir<%0> = load vp<%6>, vp<%5>
+; IF-EVL-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%c>, vp<%4>
+; IF-EVL-NEXT: vp<%7> = vector-pointer ir<%arrayidx2>
+; IF-EVL-NEXT: WIDEN ir<%1> = load vp<%7>, vp<%5>
+; IF-EVL-NEXT: WIDEN ir<%add> = add nsw ir<%1>, ir<%0>
+; IF-EVL-NEXT: CLONE ir<%arrayidx4> = getelementptr inbounds ir<%a>, vp<%4>
+; IF-EVL-NEXT: vp<%8> = vector-pointer ir<%arrayidx4>
+; IF-EVL-NEXT: WIDEN store vp<%8>, ir<%add>, vp<%5>
+; IF-EVL-NEXT: EMIT vp<%9> = add vp<%3>, vp<%0>
+; IF-EVL-NEXT: EMIT branch-on-count vp<%9>, vp<%1>
+; IF-EVL-NEXT: No successors
+; IF-EVL-NEXT: }
+; IF-EVL-NEXT: Successor(s): middle.block
+; IF-EVL-EMPTY:
+; IF-EVL-NEXT: middle.block:
; IF-EVL-NEXT: No successors
-; IF-EVL-NEXT: }
-
-; NO-VP: VPlan 'Initial VPlan for VF={4},UF>=1' {
-; NO-VP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
-; NO-VP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
-; NO-VP-NEXT: Live-in ir<%N> = original trip-count
+; IF-EVL-NEXT: }
+; IF-EVL-NEXT: LV: Interleaving is not beneficial.
+; IF-EVL-NEXT: LV: Found a vectorizable loop (4) in <stdin>
+; IF-EVL-NEXT: LEV: Unable to vectorize epilogue because no epilogue is allowed.
+; IF-EVL-NEXT: Executing best plan with VF=4, UF=1
+; IF-EVL-NEXT: VPlan 'Final VPlan for VF={4},UF>=1' {
+; IF-EVL-NEXT: Live-in vp<%0> = VF * UF
+; IF-EVL-NEXT: Live-in vp<%1> = vector-trip-count
+; IF-EVL-NEXT: Live-in ir<%N> = original trip-count
+; IF-EVL-EMPTY:
+; IF-EVL-NEXT: vector.ph:
+; IF-EVL-NEXT: Successor(s): vector loop
+; IF-EVL-EMPTY:
+; IF-EVL-NEXT: <x1> vector loop: {
+; IF-EVL-NEXT: vector.body:
+; IF-EVL-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%9>
+; IF-EVL-NEXT: vp<%4> = SCALAR-STEPS vp<%3>, ir<1>
+; IF-EVL-NEXT: EMIT vp<%5> = active lane mask vp<%4>, ir<%N>
+; IF-EVL-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%b>, vp<%4>
+; IF-EVL-NEXT: vp<%6> = vector-pointer ir<%arrayidx>
+; IF-EVL-NEXT: WIDEN ir<%0> = load vp<%6>, vp<%5>
+; IF-EVL-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%c>, vp<%4>
+; IF-EVL-NEXT: vp<%7> = vector-pointer ir<%arrayidx2>
+; IF-EVL-NEXT: WIDEN ir<%1> = load vp<%7>, vp<%5>
+; IF-EVL-NEXT: WIDEN ir<%add> = add nsw ir<%1>, ir<%0>
+; IF-EVL-NEXT: CLONE ir<%arrayidx4> = getelementptr inbounds ir<%a>, vp<%4>
+; IF-EVL-NEXT: vp<%8> = vector-pointer ir<%arrayidx4>
+; IF-EVL-NEXT: WIDEN store vp<%8>, ir<%add>, vp<%5>
+; IF-EVL-NEXT: EMIT vp<%9> = add vp<%3>, vp<%0>
+; IF-EVL-NEXT: EMIT branch-on-count vp<%9>, vp<%1>
+; IF-EVL-NEXT: No successors
+; IF-EVL-NEXT: }
+; IF-EVL-NEXT: Successor(s): middle.block
+; IF-EVL-EMPTY:
+; IF-EVL-NEXT: middle.block:
+; IF-EVL-NEXT: No successors
+; IF-EVL-NEXT: }
+; IF-EVL-NEXT: LV: Interleaving disabled by the pass manager
+; IF-EVL-NEXT: LV: Vectorizing: innermost loop.
+;
+; NO-VP-LABEL: 'foo'
+; NO-VP-NEXT: LV: Loop hints: force=? width=4 interleave=0
+; NO-VP-NEXT: LV: Found a loop: for.body
+; NO-VP-NEXT: LV: Found an induction variable.
+; NO-VP-NEXT: LV: We can vectorize this loop!
+; NO-VP-NEXT: LV: Found trip count: 0
+; NO-VP-NEXT: LV: vector predicate hint/switch found.
+; NO-VP-NEXT: LV: Not allowing scalar epilogue, creating predicated vector loop.
+; NO-VP-NEXT: LV: The max safe fixed VF is: 67108864.
+; NO-VP-NEXT: LV: The max safe scalable VF is: vscale x 0.
+; NO-VP-NEXT: LV: checking if tail can be folded by masking.
+; NO-VP-NEXT: LV: can fold tail by masking.
+; NO-VP-NEXT: LV: Cannot fold tail by masking: vectorize with a scalar epilogue instead.
+; NO-VP-NEXT: LV: Found uniform instruction: %exitcond.not = icmp eq i64 %iv.next, %N
+; NO-VP-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+; NO-VP-NEXT: LV: Found uniform instruction: %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+; NO-VP-NEXT: LV: Found uniform instruction: %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+; NO-VP-NEXT: LV: Found uniform instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; NO-VP-NEXT: LV: Found uniform instruction: %iv.next = add nuw nsw i64 %iv, 1
+; NO-VP-NEXT: LV: Found scalar instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; NO-VP-NEXT: LV: Found scalar instruction: %iv.next = add nuw nsw i64 %iv, 1
+; NO-VP-NEXT: LV: Found an estimated cost of 0 for VF 4 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; NO-VP-NEXT: LV: Found an estimated cost of 0 for VF 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+; NO-VP-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: %0 = load i32, ptr %arrayidx, align 4
+; NO-VP-NEXT: LV: Found an estimated cost of 0 for VF 4 For instruction: %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+; NO-VP-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: %1 = load i32, ptr %arrayidx2, align 4
+; NO-VP-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: %add = add nsw i32 %1, %0
+; NO-VP-NEXT: LV: Found an estimated cost of 0 for VF 4 For instruction: %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+; NO-VP-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: store i32 %add, ptr %arrayidx4, align 4
+; NO-VP-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: %iv.next = add nuw nsw i64 %iv, 1
+; NO-VP-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: %exitcond.not = icmp eq i64 %iv.next, %N
+; NO-VP-NEXT: LV: Found an estimated cost of 0 for VF 4 For instruction: br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+; NO-VP-NEXT: LV: Using user VF 4.
+; NO-VP-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+; NO-VP-NEXT: LV: Scalarizing: %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+; NO-VP-NEXT: LV: Scalarizing: %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+; NO-VP-NEXT: LV: Scalarizing: %iv.next = add nuw nsw i64 %iv, 1
+; NO-VP-NEXT: LV: Scalarizing: %exitcond.not = icmp eq i64 %iv.next, %N
+; NO-VP-NEXT: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; NO-VP-NEXT: Live-in vp<%0> = VF * UF
+; NO-VP-NEXT: Live-in vp<%1> = vector-trip-count
+; NO-VP-NEXT: Live-in ir<%N> = original trip-count
; NO-VP-EMPTY:
-; NO-VP: vector.ph:
-; NO-VP-NEXT: Successor(s): vector loop
+; NO-VP-NEXT: vector.ph:
+; NO-VP-NEXT: Successor(s): vector loop
; NO-VP-EMPTY:
-; NO-VP-NEXT: <x1> vector loop: {
-; NO-VP-NEXT: vector.body:
-; NO-VP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
-; NO-VP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
-; NO-VP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
-; NO-VP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
-; NO-VP-NEXT: WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]>
-; NO-VP-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
-; NO-VP-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
-; NO-VP-NEXT: WIDEN ir<[[LD2:%.+]]> = load vp<[[PTR2]]>
-; NO-VP-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]>
-; NO-VP-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
-; NO-VP-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
-; NO-VP-NEXT: WIDEN store vp<[[PTR3]]>, ir<[[ADD]]>
-; NO-VP-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]>
-; NO-VP-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; NO-VP-NEXT: <x1> vector loop: {
+; NO-VP-NEXT: vector.body:
+; NO-VP-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%7>
+; NO-VP-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; NO-VP-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%b>, vp<%3>
+; NO-VP-NEXT: vp<%4> = vector-pointer ir<%arrayidx>
+; NO-VP-NEXT: WIDEN ir<%0> = load vp<%4>
+; NO-VP-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%c>, vp<%3>
+; NO-VP-NEXT: vp<%5> = vector-pointer ir<%arrayidx2>
+; NO-VP-NEXT: WIDEN ir<%1> = load vp<%5>
+; NO-VP-NEXT: WIDEN ir<%add> = add nsw ir<%1>, ir<%0>
+; NO-VP-NEXT: CLONE ir<%arrayidx4> = getelementptr inbounds ir<%a>, vp<%3>
+; NO-VP-NEXT: vp<%6> = vector-pointer ir<%arrayidx4>
+; NO-VP-NEXT: WIDEN store vp<%6>, ir<%add>
+; NO-VP-NEXT: EMIT vp<%7> = add nuw vp<%2>, vp<%0>
+; NO-VP-NEXT: EMIT branch-on-count vp<%7>, vp<%1>
+; NO-VP-NEXT: No successors
+; NO-VP-NEXT: }
+; NO-VP-NEXT: Successor(s): middle.block
+; NO-VP-EMPTY:
+; NO-VP-NEXT: middle.block:
+; NO-VP-NEXT: No successors
+; NO-VP-NEXT: }
+; NO-VP-NEXT: LV: Found an estimated cost of 0 for VF 4 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; NO-VP-NEXT: LV: Found an estimated cost of 0 for VF 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+; NO-VP-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: %0 = load i32, ptr %arrayidx, align 4
+; NO-VP-NEXT: LV: Found an estimated cost of 0 for VF 4 For instruction: %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+; NO-VP-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: %1 = load i32, ptr %arrayidx2, align 4
+; NO-VP-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: %add = add nsw i32 %1, %0
+; NO-VP-NEXT: LV: Found an estimated cost of 0 for VF 4 For instruction: %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+; NO-VP-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: store i32 %add, ptr %arrayidx4, align 4
+; NO-VP-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: %iv.next = add nuw nsw i64 %iv, 1
+; NO-VP-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: %exitcond.not = icmp eq i64 %iv.next, %N
+; NO-VP-NEXT: LV: Found an estimated cost of 0 for VF 4 For instruction: br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+; NO-VP-NEXT: LV(REG): Calculating max register usage:
+; NO-VP-NEXT: LV(REG): At #0 Interval # 0
+; NO-VP-NEXT: LV(REG): At #1 Interval # 1
+; NO-VP-NEXT: LV(REG): At #2 Interval # 2
+; NO-VP-NEXT: LV(REG): At #3 Interval # 2
+; NO-VP-NEXT: LV(REG): At #4 Interval # 3
+; NO-VP-NEXT: LV(REG): At #5 Interval # 3
+; NO-VP-NEXT: LV(REG): At #6 Interval # 2
+; NO-VP-NEXT: LV(REG): At #8 Interval # 1
+; NO-VP-NEXT: LV(REG): At #9 Interval # 1
+; NO-VP-NEXT: LV(REG): VF = 4
+; NO-VP-NEXT: LV(REG): Found max usage: 2 item
+; NO-VP-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
+; NO-VP-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 2 registers
+; NO-VP-NEXT: LV(REG): Found invariant usage: 0 item
+; NO-VP-NEXT: LV: The target has 16 registers of Generic::ScalarRC register class
+; NO-VP-NEXT: LV: The target has 32 registers of Generic::VectorRC register class
+; NO-VP-NEXT: LV: Loop cost is 6
+; NO-VP-NEXT: LV: IC is 4
+; NO-VP-NEXT: LV: VF is 4
+; NO-VP-NEXT: LV: Interleaving to saturate store or load ports.
+; NO-VP-NEXT: LV: Found a vectorizable loop (4) in <stdin>
+; NO-VP-NEXT: LV: Interleave Count is 4
+; NO-VP-NEXT: LEV: Epilogue vectorization is not profitable for this loop
+; NO-VP-NEXT: Executing best plan with VF=4, UF=4
+; NO-VP-NEXT: VPlan 'Final VPlan for VF={4},UF>=1' {
+; NO-VP-NEXT: Live-in vp<%0> = VF * UF
+; NO-VP-NEXT: Live-in vp<%1> = vector-trip-count
+; NO-VP-NEXT: Live-in ir<%N> = original trip-count
+; NO-VP-EMPTY:
+; NO-VP-NEXT: vector.ph:
+; NO-VP-NEXT: Successor(s): vector loop
+; NO-VP-EMPTY:
+; NO-VP-NEXT: <x1> vector loop: {
+; NO-VP-NEXT: vector.body:
+; NO-VP-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%7>
+; NO-VP-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; NO-VP-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%b>, vp<%3>
+; NO-VP-NEXT: vp<%4> = vector-pointer ir<%arrayidx>
+; NO-VP-NEXT: WIDEN ir<%0> = load vp<%4>
+; NO-VP-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%c>, vp<%3>
+; NO-VP-NEXT: vp<%5> = vector-pointer ir<%arrayidx2>
+; NO-VP-NEXT: WIDEN ir<%1> = load vp<%5>
+; NO-VP-NEXT: WIDEN ir<%add> = add nsw ir<%1>, ir<%0>
+; NO-VP-NEXT: CLONE ir<%arrayidx4> = getelementptr inbounds ir<%a>, vp<%3>
+; NO-VP-NEXT: vp<%6> = vector-pointer ir<%arrayidx4>
+; NO-VP-NEXT: WIDEN store vp<%6>, ir<%add>
+; NO-VP-NEXT: EMIT vp<%7> = add nuw vp<%2>, vp<%0>
+; NO-VP-NEXT: EMIT branch-on-count vp<%7>, vp<%1>
+; NO-VP-NEXT: No successors
+; NO-VP-NEXT: }
+; NO-VP-NEXT: Successor(s): middle.block
+; NO-VP-EMPTY:
+; NO-VP-NEXT: middle.block:
; NO-VP-NEXT: No successors
-; NO-VP-NEXT: }
+; NO-VP-NEXT: }
+; NO-VP-NEXT: LV: Interleaving disabled by the pass manager
+; NO-VP-NEXT: LV: Vectorizing: innermost loop.
+;
+
entry:
br label %for.body