[llvm] [LV] Use ExtractLane(LastActiveLane, V) for live outs when tail-folding. (WIP) (PR #149042)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 28 08:21:04 PDT 2025
https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/149042
From 6e286a4d73448a6d6f70e45dd2402849ffad6662 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 28 Aug 2025 12:07:45 +0100
Subject: [PATCH 1/3] [LV] Modernize reduction-order.ll test.
---
.../LoopVectorize/reduction-order.ll | 128 ++++++++++++------
1 file changed, 86 insertions(+), 42 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-order.ll b/llvm/test/Transforms/LoopVectorize/reduction-order.ll
index b07c3833ca235..b1773acb6d3f8 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-order.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-order.ll
@@ -1,63 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
; RUN: opt -passes='loop-vectorize' -force-vector-width=4 -force-vector-interleave=1 -S < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
; Make sure the selects generated from reduction are always emitted
; in deterministic order.
-; CHECK-LABEL: @foo(
-; CHECK: vector.body:
-; CHECK: [[VEC_PHI_1:%.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[ADD_5:%.+]], %vector.body ]
-; CHECK: [[VEC_PHI_2:%.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[ADD_3:%.+]], %vector.body ]
-; CHECK: icmp ule <4 x i64>
-; CHECK-NEXT: [[ADD_3]] = add <4 x i32> splat (i32 3), [[VEC_PHI_2]]
-; CHECK-NEXT: [[ADD_5]] = add <4 x i32> [[VEC_PHI_1]], splat (i32 5)
-; CHECK: select <4 x i1> {{.*}}, <4 x i32> [[ADD_5]], <4 x i32>
-; CHECK-NEXT: select <4 x i1> {{.*}}, <4 x i32> [[ADD_3]], <4 x i32>
-; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
;
-define internal i64 @foo(ptr %t0) !prof !1 {
-t16:
- br label %t20
-
-t17: ; preds = %t20
- %t18 = phi i32 [ %t24, %t20 ]
- %t19 = phi i32 [ %t28, %t20 ]
- br label %t31
+define i32 @foo() !prof !1 {
+; CHECK-LABEL: define i32 @foo() {{.*}}{
+; CHECK-NEXT: [[T16:.*:]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF1:![0-9]+]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI_1:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[ADD_5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI_2:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[ADD_3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IV]], splat (i64 9)
+; CHECK-NEXT: [[ADD_3]] = add <4 x i32> splat (i32 3), [[VEC_PHI_2]]
+; CHECK-NEXT: [[ADD_5]] = add <4 x i32> [[VEC_PHI_1]], splat (i32 5)
+; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[ADD_5]], <4 x i32> [[VEC_PHI_1]]
+; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[ADD_3]], <4 x i32> [[VEC_PHI_2]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF2:![0-9]+]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[T21:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[T29:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[T22:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[T28:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[T23:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[T24:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[T24]] = add i32 3, [[T23]]
+; CHECK-NEXT: [[T28]] = add i32 [[T22]], 5
+; CHECK-NEXT: [[T29]] = add nuw nsw i64 [[T21]], 1
+; CHECK-NEXT: [[T30:%.*]] = icmp eq i64 [[T29]], 10
+; CHECK-NEXT: br i1 [[T30]], label %[[EXIT]], label %[[LOOP]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[R_2:%.*]] = phi i32 [ [[T24]], %[[LOOP]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[R_1:%.*]] = phi i32 [ [[T28]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[R_2]], [[R_1]]
+; CHECK-NEXT: ret i32 [[ADD]]
+;
+entry:
+ br label %loop
-t20: ; preds = %t20, %t16
- %t21 = phi i64 [ 0, %t16 ], [ %t29, %t20 ]
- %t22 = phi i32 [ 0, %t16 ], [ %t28, %t20 ]
- %t23 = phi i32 [ 0, %t16 ], [ %t24, %t20 ]
- %t24 = add i32 3, %t23
- %t28 = add i32 %t22, 5
- %t29 = add nuw nsw i64 %t21, 1
- %t30 = icmp eq i64 %t29, 10
- br i1 %t30, label %t17, label %t20, !prof !2
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %red.1 = phi i32 [ 0, %entry ], [ %red.1.next, %loop ]
+ %red.2 = phi i32 [ 0, %entry ], [ %red.2.next, %loop ]
+ %red.2.next = add i32 3, %red.2
+ %red.1.next = add i32 %red.1, 5
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, 10
+ br i1 %ec, label %exit, label %loop, !prof !2
-t31:
- ret i64 undef
+exit:
+ %r.2 = phi i32 [ %red.2.next, %loop ]
+ %r.1 = phi i32 [ %red.1.next, %loop ]
+ %add = add i32 %r.2, %r.1
+ ret i32 %add
}
; Make sure we do not fail when checking for ordered reduction. This test just
; exercises the path and bails out without performing vectorization.
-; CHECK-LABEL: quux
-; CHECK-NOT: fadd <4 x
-define void @quux(i1 %arg) {
-bb:
+define double @quux(i1 %arg) {
+; CHECK-LABEL: define double @quux(
+; CHECK-SAME: i1 [[ARG:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[HEADER:.*]]
+; CHECK: [[HEADER]]:
+; CHECK-NEXT: [[TMP5:%.*]] = phi double [ 1.300000e+01, %[[ENTRY]] ], [ [[TMP:%.*]], %[[LATCH:.*]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = fadd double [[TMP5]], 1.000000e+00
+; CHECK-NEXT: br label %[[LATCH]]
+; CHECK: [[LATCH]]:
+; CHECK-NEXT: [[TMP]] = phi double [ [[TMP6]], %[[HEADER]] ]
+; CHECK-NEXT: br i1 [[ARG]], label %[[HEADER]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[R:%.*]] = phi double [ [[TMP]], %[[LATCH]] ]
+; CHECK-NEXT: ret double [[R]]
+;
+entry:
br label %header
-latch: ; preds = %header
+header:
+ %tmp5 = phi double [ 1.300000e+01, %entry ], [ %tmp, %latch ]
+ %tmp6 = fadd double %tmp5, 1.000000e+00
+ br label %latch
+
+latch:
%tmp = phi double [ %tmp6, %header ]
- br i1 %arg, label %header, label %bb2
+ br i1 %arg, label %header, label %exit
-bb2: ; preds = %latch
- %tmp3 = phi double [ %tmp, %latch ]
- ret void
+exit:
+ %r = phi double [ %tmp, %latch ]
+ ret double %r
-header: ; preds = %latch, %bb
- %tmp5 = phi double [ 1.300000e+01, %bb ], [ %tmp, %latch ]
- %tmp6 = fadd double %tmp5, 1.000000e+00
- br label %latch
}
!1 = !{!"function_entry_count", i64 801}
From ab18b3f4a78fcde90898f984e25e03ef89853db9 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 15 Jul 2025 19:05:55 +0100
Subject: [PATCH 2/3] [LV] Use ExtractLane(LastActiveLane, V) for live outs when
 tail-folding. (WIP)
Building on top of https://github.com/llvm/llvm-project/pull/148817, use
ExtractLane together with FirstActiveLane to support vectorizing loops with
external (live-out) users when tail-folding.
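Roughly, the live-out extraction emitted in the middle block follows the
pattern below (a simplified fixed-width sketch distilled from the test
updates in this patch; value names are illustrative):
  middle.block:
    ; Negate the header mask: active lanes become 0, inactive lanes become 1.
    %not.mask = xor <4 x i1> %header.mask, splat (i1 true)
    ; The first set lane of the negated mask is the first inactive lane of the
    ; original header mask.
    %first.inactive = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> %not.mask, i1 true)
    ; The last active lane is the one just before it.
    %last.active = sub i64 %first.inactive, 1
    ; Extract the live-out value from that lane to feed the LCSSA phi in the
    ; exit block.
    %live.out = extractelement <4 x i64> %vec.val, i64 %last.active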
Currently marked as WIP because there is a regression when
-prefer-predicate-over-epilogue=predicate-else-scalar-epilogue is used: we
bail out while building VPlans, so we cannot recover and switch to
non-tail-folding.
Ideally we would have built both VPlans
(https://github.com/llvm/llvm-project/issues/148882).
See also https://github.com/llvm/llvm-project/issues/148603
Depends on https://github.com/llvm/llvm-project/pull/148817 (included in
the PR).
---
.../Vectorize/LoopVectorizationLegality.cpp | 18 ----
.../Transforms/Vectorize/LoopVectorize.cpp | 25 +++++-
.../Transforms/Vectorize/VPlanPredicator.cpp | 54 ++++++++++--
.../LoopVectorize/RISCV/dead-ops-cost.ll | 66 +++++++-------
.../first-order-recurrence-scalable-vf1.ll | 48 +----------
.../LoopVectorize/RISCV/scalable-tailfold.ll | 39 +++++----
.../tail-folding-fixed-order-recurrence.ll | 52 ++---------
.../LoopVectorize/RISCV/uniform-load-store.ll | 86 +++++++++++--------
...ctor-loop-backedge-elimination-with-evl.ll | 2 +-
.../LoopVectorize/X86/small-size.ll | 81 ++++++++++++++---
.../pr43166-fold-tail-by-masking.ll | 79 +++++++++++++----
.../LoopVectorize/reduction-order.ll | 6 +-
.../LoopVectorize/select-reduction.ll | 4 +-
.../use-scalar-epilogue-if-tp-fails.ll | 84 +-----------------
14 files changed, 327 insertions(+), 317 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 789047a2a28e7..41b84e5502ec5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1937,24 +1937,6 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const {
for (const auto &Reduction : getReductionVars())
ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr());
- // TODO: handle non-reduction outside users when tail is folded by masking.
- for (auto *AE : AllowedExit) {
- // Check that all users of allowed exit values are inside the loop or
- // are the live-out of a reduction.
- if (ReductionLiveOuts.count(AE))
- continue;
- for (User *U : AE->users()) {
- Instruction *UI = cast<Instruction>(U);
- if (TheLoop->contains(UI))
- continue;
- LLVM_DEBUG(
- dbgs()
- << "LV: Cannot fold tail by masking, loop has an outside user for "
- << *UI << "\n");
- return false;
- }
- }
-
for (const auto &Entry : getInductionVars()) {
PHINode *OrigPhi = Entry.first;
for (User *U : OrigPhi->users()) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6317bc3c20e25..8f1998e99c3e5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8435,7 +8435,9 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
/// exit block. The penultimate value of recurrences is fed to their LCSSA phi
/// users in the original exit block using the VPIRInstruction wrapping to the
/// LCSSA phi.
-static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range) {
+static bool addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range) {
+ using namespace llvm::VPlanPatternMatch;
+
VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
auto *ScalarPHVPBB = Plan.getScalarPreheader();
auto *MiddleVPBB = Plan.getMiddleBlock();
@@ -8454,6 +8456,15 @@ static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range) {
assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
"Cannot handle loops with uncountable early exits");
+ // TODO: Support ExtractLane of last-active-lane with first-order
+ // recurrences.
+
+ if (any_of(FOR->users(), [FOR](VPUser *U) {
+ return match(U, m_VPInstruction<VPInstruction::ExtractLane>(
+ m_VPValue(), m_Specific(FOR)));
+ }))
+ return false;
+
// This is the second phase of vectorizing first-order recurrences, creating
// extract for users outside the loop. An overview of the transformation is
// described below. Suppose we have the following loop with some use after
@@ -8528,6 +8539,7 @@ static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range) {
using namespace llvm::VPlanPatternMatch;
if (!match(U, m_ExtractLastElement(m_Specific(FOR))))
continue;
+
// For VF vscale x 1, if vscale = 1, we are unable to extract the
// penultimate value of the recurrence. Instead we rely on the existing
// extract of the last element from the result of
@@ -8535,13 +8547,14 @@ static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range) {
// TODO: Consider vscale_range info and UF.
if (LoopVectorizationPlanner::getDecisionAndClampRange(IsScalableOne,
Range))
- return;
+ return true;
VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()},
{}, "vector.recur.extract.for.phi");
cast<VPInstruction>(U)->replaceAllUsesWith(PenultimateElement);
}
}
+ return true;
}
VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
@@ -8738,7 +8751,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
R->setOperand(1, WideIV->getStepValue());
}
- addExitUsersForFirstOrderRecurrences(*Plan, Range);
+ if (!addExitUsersForFirstOrderRecurrences(*Plan, Range))
+ return nullptr;
DenseMap<VPValue *, VPValue *> IVEndValues;
addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
@@ -9167,7 +9181,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
if (FinalReductionResult == U || Parent->getParent())
continue;
U->replaceUsesOfWith(OrigExitingVPV, FinalReductionResult);
- if (match(U, m_ExtractLastElement(m_VPValue())))
+ if (match(U, m_VPInstruction<VPInstruction::ExtractLastElement>(
+ m_VPValue())) ||
+ match(U, m_VPInstruction<VPInstruction::ExtractLane>(m_VPValue(),
+ m_VPValue())))
cast<VPInstruction>(U)->replaceAllUsesWith(FinalReductionResult);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index cdadc33e3088e..3cd71ff45bcd9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -14,11 +14,13 @@
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanCFG.h"
+#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
#include "VPlanUtils.h"
#include "llvm/ADT/PostOrderIterator.h"
using namespace llvm;
+using namespace VPlanPatternMatch;
namespace {
class VPPredicator {
@@ -42,11 +44,6 @@ class VPPredicator {
/// possibly inserting new recipes at \p Dst (using Builder's insertion point)
VPValue *createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst);
- /// Returns the *entry* mask for \p VPBB.
- VPValue *getBlockInMask(VPBasicBlock *VPBB) const {
- return BlockMaskCache.lookup(VPBB);
- }
-
/// Record \p Mask as the *entry* mask of \p VPBB, which is expected to not
/// already have a mask.
void setBlockInMask(VPBasicBlock *VPBB, VPValue *Mask) {
@@ -66,6 +63,11 @@ class VPPredicator {
}
public:
+ /// Returns the *entry* mask for \p VPBB.
+ VPValue *getBlockInMask(VPBasicBlock *VPBB) const {
+ return BlockMaskCache.lookup(VPBB);
+ }
+
/// Returns the precomputed predicate of the edge from \p Src to \p Dst.
VPValue *getEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst) const {
return EdgeMaskCache.lookup({Src, Dst});
@@ -297,5 +299,47 @@ VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) {
PrevVPBB = VPBB;
}
+
+ // If we folded the tail and introduced a header mask, any extract of the
+ // last element must be updated to instead extract the last active lane, as
+ // indicated by the header mask.
+ if (FoldTail) {
+ assert(Plan.getExitBlocks().size() == 1 &&
+ "only a single-exit block is supported currently");
+ VPBasicBlock *EB = Plan.getExitBlocks().front();
+ assert(EB->getSinglePredecessor() == Plan.getMiddleBlock() &&
+ "the exit block must have middle block as single predecessor");
+
+ VPValue *LastActiveLane = nullptr;
+ VPBuilder B(Plan.getMiddleBlock()->getTerminator());
+ for (auto &P : EB->phis()) {
+ auto *ExitIRI = cast<VPIRPhi>(&P);
+ VPValue *Inc = ExitIRI->getIncomingValue(0);
+ VPValue *Op;
+ if (!match(Inc, m_VPInstruction<VPInstruction::ExtractLastElement>(
+ m_VPValue(Op))))
+ continue;
+
+ if (!LastActiveLane) {
+ // Compute the index of the last active lane by taking the
+ // first-active-lane of the negated header mask (i.e. the first lane where
+ // the original header mask was false) and subtracting 1.
+ VPValue *HeaderMask = Predicator.getBlockInMask(
+ Plan.getVectorLoopRegion()->getEntryBasicBlock());
+ LastActiveLane = B.createNaryOp(
+ Instruction::Sub,
+ {B.createNaryOp(VPInstruction::FirstActiveLane,
+ {B.createNot(HeaderMask)}),
+ Plan.getOrAddLiveIn(ConstantInt::get(
+ IntegerType::get(
+ Plan.getScalarHeader()->getIRBasicBlock()->getContext(),
+ 64),
+ 1))});
+ }
+ auto *Ext =
+ B.createNaryOp(VPInstruction::ExtractLane, {LastActiveLane, Op});
+ Inc->replaceAllUsesWith(Ext);
+ }
+ }
return Predicator.getBlockMaskCache();
}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
index 7e8f4adb0c4cd..635ead8b406fa 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
@@ -81,11 +81,7 @@ define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) {
; CHECK-LABEL: define i8 @dead_live_out_due_to_scalar_epilogue_required(
; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP1]], i32 6)
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 252, [[TMP2]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; CHECK: [[VECTOR_MEMCHECK]]:
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 1005
; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 1005
@@ -94,34 +90,40 @@ define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) {
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 252, [[TMP4]]
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 [[N_MOD_VF]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 252, [[TMP6]]
-; CHECK-NEXT: [[IND_END:%.*]] = mul i32 [[N_VEC]], 4
-; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; CHECK-NEXT: [[TMP11:%.*]] = mul <vscale x 4 x i32> [[TMP9]], splat (i32 4)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP11]]
-; CHECK-NEXT: [[TMP14:%.*]] = mul i32 4, [[TMP4]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP14]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; CHECK-NEXT: [[TMP1:%.*]] = mul <vscale x 16 x i32> [[TMP0]], splat (i32 4)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 16 x i32> zeroinitializer, [[TMP1]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i32> [[VEC_IND]] to <vscale x 4 x i64>
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[DST]], <vscale x 4 x i64> [[TMP15]]
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i8.nxv4p0(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x ptr> [[TMP16]], i32 1, <vscale x 4 x i1> splat (i1 true)), !alias.scope [[META5:![0-9]+]], !noalias [[META8:![0-9]+]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i32 [ 252, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP4]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT2]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = mul i32 4, [[TMP4]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP14]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; CHECK-NEXT: [[TMP5:%.*]] = icmp uge <vscale x 16 x i32> [[TMP16]], [[BROADCAST_SPLAT3]]
+; CHECK-NEXT: [[TMP6:%.*]] = sext <vscale x 16 x i32> [[VEC_IND]] to <vscale x 16 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[SRC]], <vscale x 16 x i64> [[TMP6]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP7]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP4]]), !alias.scope [[META5:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[DST]], <vscale x 16 x i64> [[TMP6]]
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x ptr> align 1 [[TMP8]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP4]]), !alias.scope [[META8:![0-9]+]], !noalias [[META5]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP4]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP5]], i1 true)
+; CHECK-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], 1
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 16
+; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP13]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 16 x i8> [[WIDE_MASKED_GATHER]], i64 [[TMP11]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -132,9 +134,9 @@ define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) {
; CHECK-NEXT: store i8 0, ptr [[GEP_DST]], align 1
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 4
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[IV]], 1001
-; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[R:%.*]] = phi i8 [ [[L]], %[[LOOP]] ]
+; CHECK-NEXT: [[R:%.*]] = phi i8 [ [[L]], %[[LOOP]] ], [ [[TMP15]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i8 [[R]]
;
entry:
@@ -475,7 +477,7 @@ attributes #0 = { "target-features"="+64bit,+v" }
; CHECK: [[META7]] = distinct !{[[META7]], !"LVerDomain"}
; CHECK: [[META8]] = !{[[META9:![0-9]+]]}
; CHECK: [[META9]] = distinct !{[[META9]], [[META7]]}
-; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META3]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]], [[META3]]}
; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]}
; CHECK: [[META12]] = !{[[META13:![0-9]+]]}
; CHECK: [[META13]] = distinct !{[[META13]], [[META14:![0-9]+]]}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
index 7eb3d7fc5a36d..dca09decef0c4 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
@@ -8,53 +8,19 @@ define i64 @pr97452_scalable_vf1_for(ptr %src, ptr noalias %dst) #0 {
; CHECK-LABEL: define i64 @pr97452_scalable_vf1_for(
; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 23, [[TMP0]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 23, [[TMP1]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 23, [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP3]], 1
-; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 0, i32 [[TMP4]]
-; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 1 x i64> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD]] = load <vscale x 1 x i64>, ptr [[TMP5]], align 8
-; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> [[VECTOR_RECUR]], <vscale x 1 x i64> [[WIDE_LOAD]], i32 -1)
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: store <vscale x 1 x i64> [[TMP7]], ptr [[TMP8]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP12:%.*]] = sub i32 [[TMP11]], 1
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 1 x i64> [[WIDE_LOAD]], i32 [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = sub i32 [[TMP14]], 1
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 1 x i64> [[TMP7]], i32 [[TMP15]]
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 23, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[FOR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[FOR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[L:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]]
; CHECK-NEXT: [[L]] = load i64, ptr [[GEP_SRC]], align 8
; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[IV]]
; CHECK-NEXT: store i64 [[FOR]], ptr [[GEP_DST]], align 8
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 22
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[FOR]], %[[LOOP]] ], [ [[TMP16]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[FOR]], %[[LOOP]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
entry:
@@ -77,9 +43,3 @@ exit:
}
attributes #0 = { "target-features"="+64bit,+v,+zvl128b,+zvl256b" }
-;.
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
index be043745cd4bb..a5b248a480735 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
@@ -279,34 +279,39 @@ for.end:
define i64 @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %n) {
; CHECK-LABEL: @uniform_load(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1025, [[ENTRY]] ], [ [[AVL_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP0]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; CHECK-NEXT: [[TMP2:%.*]] = icmp uge <vscale x 2 x i32> [[TMP1]], [[BROADCAST_SPLAT1]]
; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[B:%.*]], align 8
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]]
-; CHECK-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[ARRAYIDX]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP0]])
+; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[TMP5]], [[IV]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP2]], i1 true)
+; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 2
+; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <vscale x 2 x i64> [[BROADCAST_SPLAT]], i64 [[TMP8]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY1:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[B]], align 8
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV1]]
; CHECK-NEXT: store i64 [[V1]], ptr [[ARRAYIDX1]], align 8
@@ -314,7 +319,7 @@ define i64 @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], 1025
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: for.end:
-; CHECK-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V1]], [[FOR_BODY1]] ], [ [[V]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V1]], [[FOR_BODY1]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[V_LCSSA]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll
index ec67c15357eab..6a7518f08fe6b 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll
@@ -444,50 +444,10 @@ define i32 @FOR_reduction(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-LABEL: define i32 @FOR_reduction(
; IF-EVL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0]] {
; IF-EVL-NEXT: [[ENTRY:.*]]:
-; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP9]], 2
-; IF-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TC]], [[TMP1]]
-; IF-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; IF-EVL: [[VECTOR_PH]]:
-; IF-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TC]], [[TMP3]]
-; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[TC]], [[N_MOD_VF]]
-; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; IF-EVL-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 4
-; IF-EVL-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; IF-EVL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 33, i32 [[TMP8]]
-; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
-; IF-EVL: [[VECTOR_BODY]]:
-; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]]
-; IF-EVL-NEXT: [[WIDE_LOAD]] = load <vscale x 4 x i32>, ptr [[ARRAYIDX]], align 4
-; IF-EVL-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[WIDE_LOAD]], i32 -1)
-; IF-EVL-NEXT: [[TMP11:%.*]] = add nsw <vscale x 4 x i32> [[TMP10]], [[WIDE_LOAD]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS]]
-; IF-EVL-NEXT: store <vscale x 4 x i32> [[TMP11]], ptr [[TMP12]], align 4
-; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS]], [[TMP3]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; IF-EVL: [[MIDDLE_BLOCK]]:
-; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; IF-EVL-NEXT: [[TMP15:%.*]] = mul nuw i32 [[TMP14]], 4
-; IF-EVL-NEXT: [[TMP16:%.*]] = sub i32 [[TMP15]], 1
-; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD]], i32 [[TMP16]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
-; IF-EVL-NEXT: [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 4
-; IF-EVL-NEXT: [[TMP19:%.*]] = sub i32 [[TMP18]], 2
-; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD]], i32 [[TMP19]]
-; IF-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TC]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
-; IF-EVL: [[SCALAR_PH]]:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
; IF-EVL-NEXT: br label %[[FOR_BODY:.*]]
; IF-EVL: [[FOR_BODY]]:
-; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ]
-; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP0:%.*]], %[[FOR_BODY]] ]
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ]
+; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP0:%.*]], %[[FOR_BODY]] ]
; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
; IF-EVL-NEXT: [[TMP0]] = load i32, ptr [[ARRAYIDX1]], align 4
; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR1]], [[TMP0]]
@@ -495,9 +455,9 @@ define i32 @FOR_reduction(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4
; IF-EVL-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[IV]], 1
; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; IF-EVL: [[FOR_END]]:
-; IF-EVL-NEXT: [[FOR1_LCSSA:%.*]] = phi i32 [ [[FOR1]], %[[FOR_BODY]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: [[FOR1_LCSSA:%.*]] = phi i32 [ [[FOR1]], %[[FOR_BODY]] ]
; IF-EVL-NEXT: ret i32 [[FOR1_LCSSA]]
;
; NO-VP-LABEL: define i32 @FOR_reduction(
@@ -720,8 +680,8 @@ for.end:
; IF-EVL: [[LOOP6]] = distinct !{[[LOOP6]], [[META3]], [[META1]]}
; IF-EVL: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]], [[META3]]}
; IF-EVL: [[LOOP8]] = distinct !{[[LOOP8]], [[META3]], [[META1]]}
-; IF-EVL: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META3]]}
-; IF-EVL: [[LOOP10]] = distinct !{[[LOOP10]], [[META3]], [[META1]]}
+; IF-EVL: [[LOOP9]] = distinct !{[[LOOP9]], [[META10:![0-9]+]]}
+; IF-EVL: [[META10]] = !{!"llvm.loop.vectorize.enable", i1 true}
; IF-EVL: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]], [[META3]]}
; IF-EVL: [[LOOP12]] = distinct !{[[LOOP12]], [[META3]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index 1adb6b9a887cd..8f6c27c97344d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -130,35 +130,40 @@ for.end:
define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %n) {
; SCALABLE-LABEL: define i64 @uniform_load_outside_use(
; SCALABLE-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SCALABLE-NEXT: [[ENTRY:.*]]:
-; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1
-; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
-; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; SCALABLE-NEXT: [[ENTRY:.*:]]
+; SCALABLE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; SCALABLE: [[VECTOR_PH]]:
-; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
-; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
; SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]]
; SCALABLE: [[VECTOR_BODY]]:
-; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1025, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP0]], i64 0
+; SCALABLE-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; SCALABLE-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; SCALABLE-NEXT: [[TMP2:%.*]] = icmp uge <vscale x 2 x i32> [[TMP1]], [[BROADCAST_SPLAT1]]
; SCALABLE-NEXT: [[TMP6:%.*]] = load i64, ptr [[B]], align 8
; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP6]], i64 0
; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP8]], align 8
-; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SCALABLE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP0]])
+; SCALABLE-NEXT: [[TMP5:%.*]] = zext i32 [[TMP0]] to i64
+; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP5]], [[INDEX]]
+; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
+; SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; SCALABLE-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; SCALABLE: [[MIDDLE_BLOCK]]:
-; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
-; SCALABLE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP2]], i1 true)
+; SCALABLE-NEXT: [[TMP14:%.*]] = sub i64 [[TMP7]], 1
+; SCALABLE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 2
+; SCALABLE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 0
+; SCALABLE-NEXT: [[TMP12:%.*]] = extractelement <vscale x 2 x i64> [[BROADCAST_SPLAT]], i64 [[TMP14]]
+; SCALABLE-NEXT: br label %[[FOR_END:.*]]
; SCALABLE: [[SCALAR_PH]]:
-; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
; SCALABLE: [[FOR_BODY]]:
-; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
; SCALABLE-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 8
; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
@@ -166,7 +171,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; SCALABLE: [[FOR_END]]:
-; SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V]], %[[FOR_BODY]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ]
+; SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V]], %[[FOR_BODY]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ]
; SCALABLE-NEXT: ret i64 [[V_LCSSA]]
;
; FIXEDLEN-LABEL: define i64 @uniform_load_outside_use(
@@ -206,35 +211,40 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
;
; TF-SCALABLE-LABEL: define i64 @uniform_load_outside_use(
; TF-SCALABLE-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; TF-SCALABLE-NEXT: [[ENTRY:.*]]:
-; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1
-; TF-SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
-; TF-SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; TF-SCALABLE-NEXT: [[ENTRY:.*:]]
+; TF-SCALABLE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; TF-SCALABLE: [[VECTOR_PH]]:
-; TF-SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
-; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
; TF-SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]]
; TF-SCALABLE: [[VECTOR_BODY]]:
-; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1025, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP0]], i64 0
+; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; TF-SCALABLE-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; TF-SCALABLE-NEXT: [[TMP2:%.*]] = icmp uge <vscale x 2 x i32> [[TMP1]], [[BROADCAST_SPLAT1]]
; TF-SCALABLE-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 8
; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; TF-SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; TF-SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[ARRAYIDX]], align 8
-; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP3]]
-; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[ARRAYIDX]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP0]])
+; TF-SCALABLE-NEXT: [[TMP5:%.*]] = zext i32 [[TMP0]] to i64
+; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP5]], [[IV]]
+; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
+; TF-SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; TF-SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
-; TF-SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
-; TF-SCALABLE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; TF-SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP2]], i1 true)
+; TF-SCALABLE-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; TF-SCALABLE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 2
+; TF-SCALABLE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 0
+; TF-SCALABLE-NEXT: [[TMP12:%.*]] = extractelement <vscale x 2 x i64> [[BROADCAST_SPLAT]], i64 [[TMP8]]
+; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[SCALAR_PH]]:
-; TF-SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; TF-SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
; TF-SCALABLE: [[FOR_BODY]]:
-; TF-SCALABLE-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; TF-SCALABLE-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
; TF-SCALABLE-NEXT: [[V1:%.*]] = load i64, ptr [[B]], align 8
; TF-SCALABLE-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV1]]
; TF-SCALABLE-NEXT: store i64 [[V1]], ptr [[ARRAYIDX1]], align 8
@@ -242,7 +252,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; TF-SCALABLE: [[FOR_END]]:
-; TF-SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V1]], %[[FOR_BODY]] ], [ [[V]], %[[MIDDLE_BLOCK]] ]
+; TF-SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V1]], %[[FOR_BODY]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ]
; TF-SCALABLE-NEXT: ret i64 [[V_LCSSA]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vector-loop-backedge-elimination-with-evl.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vector-loop-backedge-elimination-with-evl.ll
index 0e3507e11c230..cef085468ea5f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vector-loop-backedge-elimination-with-evl.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vector-loop-backedge-elimination-with-evl.ll
@@ -55,7 +55,7 @@ define i32 @test_remove_iv(i32 %start) #0 {
; CHECK-NEXT: [[AVL:%.*]] = phi i32 [ 6, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 4, i1 true)
; CHECK-NEXT: [[TMP4:%.*]] = xor <vscale x 4 x i32> [[VEC_PHI]], splat (i32 3)
-; CHECK-NEXT: [[TMP5]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP3]])
+; CHECK-NEXT: [[TMP5]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP4]], i32 [[TMP3]])
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP3]]
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
index c67b02b0f39f4..f63d68b8e001b 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
@@ -548,21 +548,82 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst)
; induction is used outside the loop.
define i64 @example23d(ptr noalias nocapture %src, ptr noalias nocapture %dst) optsize {
; CHECK-LABEL: @example23d(
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
; CHECK-NEXT: br label [[TMP1:%.*]]
-; CHECK: 1:
-; CHECK-NEXT: [[DOT04:%.*]] = phi ptr [ [[SRC:%.*]], [[TMP0:%.*]] ], [ [[TMP2:%.*]], [[TMP1]] ]
-; CHECK-NEXT: [[DOT013:%.*]] = phi ptr [ [[DST:%.*]], [[TMP0]] ], [ [[TMP6:%.*]], [[TMP1]] ]
-; CHECK-NEXT: [[I_02:%.*]] = phi i64 [ 0, [[TMP0]] ], [ [[TMP7:%.*]], [[TMP1]] ]
-; CHECK-NEXT: [[TMP2]] = getelementptr inbounds nuw i8, ptr [[DOT04]], i64 2
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE14:%.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE14]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[TMP9]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[TMP2]], i64 4
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[TMP10]], i64 6
+; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = shl i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 [[OFFSET_IDX4]]
+; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[TMP11]], i64 4
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX4]]
+; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[TMP32]], i64 8
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX4]]
+; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[TMP6]], i64 12
+; CHECK-NEXT: [[TMP33:%.*]] = icmp ult <4 x i64> [[VEC_IND]], splat (i64 257)
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP33]], i64 0
+; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK: pred.store.if:
+; CHECK-NEXT: [[DOT013:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX4]]
+; CHECK-NEXT: [[DOT04:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[DOT04]], align 2
; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 7
-; CHECK-NEXT: [[TMP6]] = getelementptr inbounds nuw i8, ptr [[DOT013]], i64 4
; CHECK-NEXT: store i32 [[TMP5]], ptr [[DOT013]], align 4
-; CHECK-NEXT: [[TMP7]] = add nuw nsw i64 [[I_02]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[TMP7]], 257
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[TMP8:%.*]], label [[TMP1]]
-; CHECK: 8:
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
+; CHECK: pred.store.continue:
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP33]], i64 1
+; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
+; CHECK: pred.store.if9:
+; CHECK-NEXT: [[TMP13:%.*]] = load i16, ptr [[NEXT_GEP1]], align 2
+; CHECK-NEXT: [[TMP14:%.*]] = zext i16 [[TMP13]] to i32
+; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i32 [[TMP14]], 7
+; CHECK-NEXT: store i32 [[TMP15]], ptr [[NEXT_GEP6]], align 4
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]]
+; CHECK: pred.store.continue10:
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP33]], i64 2
+; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
+; CHECK: pred.store.if11:
+; CHECK-NEXT: [[TMP17:%.*]] = load i16, ptr [[NEXT_GEP2]], align 2
+; CHECK-NEXT: [[TMP18:%.*]] = zext i16 [[TMP17]] to i32
+; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i32 [[TMP18]], 7
+; CHECK-NEXT: store i32 [[TMP19]], ptr [[NEXT_GEP7]], align 4
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]]
+; CHECK: pred.store.continue12:
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP33]], i64 3
+; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14]]
+; CHECK: pred.store.if13:
+; CHECK-NEXT: [[TMP21:%.*]] = load i16, ptr [[NEXT_GEP3]], align 2
+; CHECK-NEXT: [[TMP22:%.*]] = zext i16 [[TMP21]] to i32
+; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 7
+; CHECK-NEXT: store i32 [[TMP23]], ptr [[NEXT_GEP8]], align 4
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE14]]
+; CHECK: pred.store.continue14:
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
+; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[TMP1]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP25:%.*]] = xor <4 x i1> [[TMP33]], splat (i1 true)
+; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP25]], i1 true)
+; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP26]], -1
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP27]]
+; CHECK-NEXT: [[TMP29:%.*]] = add nsw i64 [[TMP28]], 1
+; CHECK-NEXT: br label [[TMP31:%.*]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: br label [[TMP30:%.*]]
+; CHECK: 30:
+; CHECK-NEXT: br i1 poison, label [[TMP31]], label [[TMP30]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK: 31:
+; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ poison, [[TMP30]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[TMP7]]
;
br label %1
diff --git a/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll
index cbc9fccebb881..6fb4e4409a6a3 100644
--- a/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll
@@ -39,21 +39,38 @@
define i64 @test1(i64 %y) {
; CHECK-LABEL: @test1(
; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[ENTRY:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[Y:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i64> splat (i64 3), [[BROADCAST_SPLAT]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i1> [[TMP0]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> splat (i64 77), <4 x i64> [[TMP1]]
+; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> <i1 false, i1 false, i1 false, i1 true>, i1 true)
+; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[PREDPHI]], i64 [[TMP4]]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: br label [[FOR_BODY1:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[COND_END:%.*]] ]
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y:%.*]], 0
+; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[COND_END:%.*]] ]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y]], 0
; CHECK-NEXT: br i1 [[CMP]], label [[COND_END]], label [[COND_FALSE:%.*]]
; CHECK: cond.false:
; CHECK-NEXT: [[DIV:%.*]] = xor i64 3, [[Y]]
; CHECK-NEXT: br label [[COND_END]]
; CHECK: cond.end:
-; CHECK-NEXT: [[COND:%.*]] = phi i64 [ [[DIV]], [[COND_FALSE]] ], [ 77, [[FOR_BODY]] ]
+; CHECK-NEXT: [[COND:%.*]] = phi i64 [ [[DIV]], [[COND_FALSE]] ], [ 77, [[FOR_BODY1]] ]
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 3
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], [[COND_END]] ]
+; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], [[COND_END]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[COND_LCSSA]]
;
entry:
@@ -84,20 +101,36 @@ for.cond.cleanup:
define i64 @test2(i64 %y) {
; CHECK-LABEL: @test2(
; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[ENTRY:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[Y:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[TMP1:%.*]] = select <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i1> [[TMP0]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> splat (i64 77), <4 x i64> splat (i64 55)
+; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> <i1 false, i1 false, i1 false, i1 true>, i1 true)
+; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[PREDPHI]], i64 [[TMP3]]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: br label [[FOR_BODY1:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[COND_END:%.*]] ]
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y:%.*]], 0
+; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[COND_END:%.*]] ]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y]], 0
; CHECK-NEXT: br i1 [[CMP]], label [[COND_END]], label [[COND_FALSE:%.*]]
; CHECK: cond.false:
; CHECK-NEXT: br label [[COND_END]]
; CHECK: cond.end:
-; CHECK-NEXT: [[COND:%.*]] = phi i64 [ 55, [[COND_FALSE]] ], [ 77, [[FOR_BODY]] ]
+; CHECK-NEXT: [[COND:%.*]] = phi i64 [ 55, [[COND_FALSE]] ], [ 77, [[FOR_BODY1]] ]
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 3
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY1]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], [[COND_END]] ]
+; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], [[COND_END]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[COND_LCSSA]]
;
entry:
@@ -127,20 +160,36 @@ for.cond.cleanup:
define i32 @test3(i64 %y) {
; CHECK-LABEL: @test3(
; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[ENTRY:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[Y:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[TMP1:%.*]] = select <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i1> [[TMP0]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> splat (i32 55)
+; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> <i1 false, i1 false, i1 false, i1 true>, i1 true)
+; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[PREDPHI]], i64 [[TMP3]]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: br label [[FOR_BODY1:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[COND_END:%.*]] ]
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y:%.*]], 0
+; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[COND_END:%.*]] ]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y]], 0
; CHECK-NEXT: br i1 [[CMP]], label [[COND_END]], label [[COND_FALSE:%.*]]
; CHECK: cond.false:
; CHECK-NEXT: br label [[COND_END]]
; CHECK: cond.end:
-; CHECK-NEXT: [[COND:%.*]] = phi i32 [ 55, [[COND_FALSE]] ], [ [[I]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[COND:%.*]] = phi i32 [ 55, [[COND_FALSE]] ], [ [[I]], [[FOR_BODY1]] ]
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 3
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY1]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], [[COND_END]] ]
+; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], [[COND_END]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[COND_LCSSA]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-order.ll b/llvm/test/Transforms/LoopVectorize/reduction-order.ll
index b1773acb6d3f8..a11301ac9ad12 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-order.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-order.ll
@@ -19,11 +19,11 @@ define i32 @foo() !prof !1 {
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IV]], splat (i64 9)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i64> [[VEC_IV]], splat (i64 9)
; CHECK-NEXT: [[ADD_3]] = add <4 x i32> splat (i32 3), [[VEC_PHI_2]]
; CHECK-NEXT: [[ADD_5]] = add <4 x i32> [[VEC_PHI_1]], splat (i32 5)
-; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[ADD_5]], <4 x i32> [[VEC_PHI_1]]
-; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[ADD_3]], <4 x i32> [[VEC_PHI_2]]
+; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[VEC_PHI_1]], <4 x i32> [[ADD_5]]
+; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[VEC_PHI_2]], <4 x i32> [[ADD_3]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12
; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF2:![0-9]+]], !llvm.loop [[LOOP3:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/select-reduction.ll b/llvm/test/Transforms/LoopVectorize/select-reduction.ll
index cc2ec68d07687..83575bd403b93 100644
--- a/llvm/test/Transforms/LoopVectorize/select-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-reduction.ll
@@ -27,10 +27,10 @@ define i32 @test(i64 %N, i32 %x) {
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[VEC_PHI]], splat (i32 10)
; CHECK-NEXT: [[TMP2]] = select <4 x i1> [[TMP1]], <4 x i32> [[VEC_PHI]], <4 x i32> splat (i32 10)
-; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP2]], <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[VEC_PHI]], <4 x i32> [[TMP2]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll b/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll
index 3b34b75a4c511..7f6e741b25a4b 100644
--- a/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll
+++ b/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll
@@ -13,47 +13,6 @@
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
define void @basic_loop(ptr nocapture readonly %ptr, i32 %size, ptr %pos) {
-; CHECK-LABEL: @basic_loop(
-; CHECK-NEXT: header:
-; CHECK-NEXT: [[PTR0:%.*]] = load ptr, ptr [[POS:%.*]], align 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[SIZE]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[SIZE]], [[N_MOD_VF]]
-; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[SIZE]], [[N_VEC]]
-; CHECK-NEXT: [[IND_END1:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i32 [[N_VEC]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
-; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], ptr [[NEXT_GEP]], align 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SIZE]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SIZE]], [[HEADER:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[HEADER]] ]
-; CHECK-NEXT: br label [[BODY:%.*]]
-; CHECK: body:
-; CHECK-NEXT: [[DEC66:%.*]] = phi i32 [ [[DEC:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[BUFF:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[BUFF]], i32 1
-; CHECK-NEXT: [[DEC]] = add nsw i32 [[DEC66]], -1
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[INCDEC_PTR]], align 1
-; CHECK-NEXT: store i8 [[TMP5]], ptr [[BUFF]], align 1
-; CHECK-NEXT: [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0
-; CHECK-NEXT: br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK: end:
-; CHECK-NEXT: [[INCDEC_PTR_LCSSA:%.*]] = phi ptr [ [[INCDEC_PTR]], [[BODY]] ], [ [[IND_END1]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: store ptr [[INCDEC_PTR_LCSSA]], ptr [[POS]], align 4
-; CHECK-NEXT: ret void
-;
header:
%ptr0 = load ptr, ptr %pos, align 4
br label %body
@@ -74,47 +33,6 @@ end:
}
define void @metadata(ptr nocapture readonly %ptr, i32 %size, ptr %pos) {
-; CHECK-LABEL: @metadata(
-; CHECK-NEXT: header:
-; CHECK-NEXT: [[PTR0:%.*]] = load ptr, ptr [[POS:%.*]], align 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[SIZE]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[SIZE]], [[N_MOD_VF]]
-; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[SIZE]], [[N_VEC]]
-; CHECK-NEXT: [[IND_END1:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i32 [[N_VEC]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
-; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], ptr [[NEXT_GEP]], align 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SIZE]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SIZE]], [[HEADER:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[HEADER]] ]
-; CHECK-NEXT: br label [[BODY:%.*]]
-; CHECK: body:
-; CHECK-NEXT: [[DEC66:%.*]] = phi i32 [ [[DEC:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[BUFF:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[BUFF]], i32 1
-; CHECK-NEXT: [[DEC]] = add nsw i32 [[DEC66]], -1
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[INCDEC_PTR]], align 1
-; CHECK-NEXT: store i8 [[TMP5]], ptr [[BUFF]], align 1
-; CHECK-NEXT: [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0
-; CHECK-NEXT: br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: end:
-; CHECK-NEXT: [[INCDEC_PTR_LCSSA:%.*]] = phi ptr [ [[INCDEC_PTR]], [[BODY]] ], [ [[IND_END1]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: store ptr [[INCDEC_PTR_LCSSA]], ptr [[POS]], align 4
-; CHECK-NEXT: ret void
-;
header:
%ptr0 = load ptr, ptr %pos, align 4
br label %body
@@ -137,3 +55,5 @@ end:
!1 = distinct !{!1, !2, !3}
!2 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
!3 = !{!"llvm.loop.vectorize.enable", i1 true}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
>From 51369a967e394e2e52900dc9122895cab9f18590 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 28 Aug 2025 16:20:02 +0100
Subject: [PATCH 3/3] Support first-order-recurrences
---
.../Transforms/Vectorize/LoopVectorize.cpp | 38 +++-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 7 +-
llvm/lib/Transforms/Vectorize/VPlanUtils.h | 1 +
.../first-order-recurrence-scalable-vf1.ll | 63 +++++-
.../tail-folding-fixed-order-recurrence.ll | 69 +++++-
llvm/test/Transforms/LoopVectorize/optsize.ll | 214 ++++++++++++++----
6 files changed, 324 insertions(+), 68 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8f1998e99c3e5..a6635fb6600fe 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8456,15 +8456,6 @@ static bool addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range) {
assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
"Cannot handle loops with uncountable early exits");
- // TODO: Support ExtractLane of last-active-lane with first-order
- // recurrences.
-
- if (any_of(FOR->users(), [FOR](VPUser *U) {
- return match(U, m_VPInstruction<VPInstruction::ExtractLane>(
- m_VPValue(), m_Specific(FOR)));
- }))
- return false;
-
// This is the second phase of vectorizing first-order recurrences, creating
// extract for users outside the loop. An overview of the transformation is
// described below. Suppose we have the following loop with some use after
@@ -8537,6 +8528,35 @@ static bool addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range) {
// the VPIRInstruction modeling the phi.
for (VPUser *U : FOR->users()) {
using namespace llvm::VPlanPatternMatch;
+ if (match(U, m_VPInstruction<VPInstruction::ExtractLane>(
+ m_Sub(m_VPInstruction<VPInstruction::FirstActiveLane>(
+ m_VPValue()),
+ m_VPValue()),
+ m_Specific(FOR)))) {
+
+ VPBuilder MiddleBuilder(cast<VPInstruction>(U));
+ VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand(0);
+ VPValue *PenultimateLastIter = MiddleBuilder.createNaryOp(
+ VPInstruction::ExtractLane,
+ {MiddleBuilder.createNaryOp(
+ Instruction::Sub,
+ {LastActiveLane,
+ Plan.getOrAddLiveIn(ConstantInt::get(
+ IntegerType::get(Plan.getContext(), 64), 1))}),
+ FOR->getBackedgeValue()});
+
+ VPValue *LastPrevIter = MiddleBuilder.createNaryOp(
+ VPInstruction::ExtractLastElement, {FOR});
+ VPValue *Cmp = MiddleBuilder.createICmp(
+ CmpInst::ICMP_EQ, LastActiveLane,
+ Plan.getOrAddLiveIn(
+ ConstantInt::get(IntegerType::get(Plan.getContext(), 64), 0)));
+ VPValue *Sel =
+ MiddleBuilder.createSelect(Cmp, LastPrevIter, PenultimateLastIter);
+ cast<VPInstruction>(U)->replaceAllUsesWith(Sel);
+ continue;
+ }
+
if (!match(U, m_ExtractLastElement(m_Specific(FOR))))
continue;
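In IR terms, the rewrite above lowers to roughly the following middle-block sequence; this is a sketch with illustrative value names, assuming a fixed VF of 4 and an i32 recurrence (the concrete output is visible in the updated optsize.ll checks later in the patch):

  ; first inactive lane of the tail-folding mask; one less is the last active lane
  %first.inactive = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> %inactive.mask, i1 true)
  %last.active = sub i64 %first.inactive, 1
  %prev.lane = sub i64 %last.active, 1
  ; value the recurrence produced one position before the last active lane ...
  %cur = extractelement <4 x i32> %backedge.val, i64 %prev.lane
  ; ... unless the last active lane is lane 0, in which case the last element of
  ; the recurrence vector is used instead
  %prev = extractelement <4 x i32> %recur, i32 3
  %lane0 = icmp eq i64 %last.active, 0
  %live.out = select i1 %lane0, i32 %prev, i32 %cur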
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index bd9a93ed57b8a..eadb8018aa366 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -601,7 +601,12 @@ Value *VPInstruction::generate(VPTransformState &State) {
llvm_unreachable("should be handled by VPPhi::execute");
}
case Instruction::Select: {
- bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
+ bool OnlyFirstLaneUsed =
+ vputils::onlyFirstLaneUsed(this) ||
+ (isa<VPInstruction>(getOperand(1)) &&
+ cast<VPInstruction>(getOperand(1))->isVectorToScalar() &&
+ isa<VPInstruction>(getOperand(2)) &&
+ cast<VPInstruction>(getOperand(2))->isVectorToScalar());
Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed);
Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 9e1d325a4d8d6..2959e9440e753 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -49,6 +49,7 @@ inline bool isSingleScalar(const VPValue *VPV) {
case Instruction::GetElementPtr:
case Instruction::ICmp:
case Instruction::FCmp:
+ case Instruction::Select:
case VPInstruction::Broadcast:
case VPInstruction::PtrAdd:
return true;
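The two supporting changes above keep that live-out computation scalar: VPInstruction::generate for Select now requests scalar operands whenever both value operands are vector-to-scalar VPInstructions (as the two extracts are here), and isSingleScalar now treats Select like ICmp/GEP, so a select of single scalars is itself single scalar. A sketch of the resulting form (illustrative names):

  ; both arms are scalars produced by vector-to-scalar extracts, so the
  ; select itself is emitted as a single scalar select
  %prev = extractelement <4 x i32> %recur, i32 3
  %cur = extractelement <4 x i32> %backedge.val, i64 %prev.lane
  %live.out = select i1 %lane0, i32 %prev, i32 %cur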
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
index dca09decef0c4..4e7d9798b2223 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
@@ -7,20 +7,66 @@ target triple = "riscv64-unknown-linux-gnu"
define i64 @pr97452_scalable_vf1_for(ptr %src, ptr noalias %dst) #0 {
; CHECK-LABEL: define i64 @pr97452_scalable_vf1_for(
; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 2
+; CHECK-NEXT: [[TMP5:%.*]] = sub i32 [[TMP4]], 1
+; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 0, i32 [[TMP5]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 2 x i64> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 23, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[PREV_EVL:%.*]] = phi i32 [ [[TMP2]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP6]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP6]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; CHECK-NEXT: [[TMP8:%.*]] = icmp uge <vscale x 2 x i32> [[TMP7]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[EVL_BASED_IV]]
+; CHECK-NEXT: [[VP_OP_LOAD]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP6]])
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.splice.nxv2i64(<vscale x 2 x i64> [[VECTOR_RECUR]], <vscale x 2 x i64> [[VP_OP_LOAD]], i32 -1, <vscale x 2 x i1> splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP6]])
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[EVL_BASED_IV]]
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP10]], ptr align 8 [[TMP11]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP6]])
+; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP12]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]]
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP8]], i1 true)
+; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[TMP14]], 1
+; CHECK-NEXT: [[TMP16:%.*]] = sub i64 [[TMP15]], 1
+; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 2
+; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <vscale x 2 x i64> [[VP_OP_LOAD]], i64 [[TMP16]]
+; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP22:%.*]] = mul nuw i32 [[TMP21]], 2
+; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP22]], 1
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <vscale x 2 x i64> [[TMP10]], i32 [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[TMP15]], 0
+; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i64 [[TMP24]], i64 [[TMP20]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[FOR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[L:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[FOR:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]]
; CHECK-NEXT: [[L]] = load i64, ptr [[GEP_SRC]], align 8
; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[IV]]
; CHECK-NEXT: store i64 [[FOR]], ptr [[GEP_DST]], align 8
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 22
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[FOR]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[FOR]], %[[LOOP]] ], [ [[TMP26]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
entry:
@@ -43,3 +89,10 @@ exit:
}
attributes #0 = { "target-features"="+64bit,+v,+zvl128b,+zvl256b" }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"}
+; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META3]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll
index 6a7518f08fe6b..f1ee2d5c06685 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll
@@ -443,21 +443,68 @@ for.end:
define i32 @FOR_reduction(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-LABEL: define i32 @FOR_reduction(
; IF-EVL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0]] {
-; IF-EVL-NEXT: [[ENTRY:.*]]:
+; IF-EVL-NEXT: [[ENTRY:.*:]]
+; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; IF-EVL: [[VECTOR_PH]]:
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP9]], 4
+; IF-EVL-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
+; IF-EVL-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 4
+; IF-EVL-NEXT: [[TMP5:%.*]] = sub i32 [[TMP4]], 1
+; IF-EVL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 33, i32 [[TMP5]]
+; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
+; IF-EVL: [[VECTOR_BODY]]:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[TC]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[PREV_EVL:%.*]] = phi i32 [ [[TMP2]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP6]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP6]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; IF-EVL-NEXT: [[TMP8:%.*]] = icmp uge <vscale x 4 x i32> [[TMP7]], [[BROADCAST_SPLAT]]
+; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[VP_OP_LOAD]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[ARRAYIDX1]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[VP_OP_LOAD]], i32 -1, <vscale x 4 x i1> splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP6]])
+; IF-EVL-NEXT: [[TMP11:%.*]] = add nsw <vscale x 4 x i32> [[TMP10]], [[VP_OP_LOAD]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[IV]]
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP11]], ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP6]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP13]], [[IV]]
+; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; IF-EVL-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; IF-EVL: [[MIDDLE_BLOCK]]:
+; IF-EVL-NEXT: [[TMP15:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> [[TMP8]], i1 true)
+; IF-EVL-NEXT: [[TMP16:%.*]] = sub i64 [[TMP15]], 1
+; IF-EVL-NEXT: [[TMP20:%.*]] = sub i64 [[TMP16]], 1
+; IF-EVL-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 4
+; IF-EVL-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 0
+; IF-EVL-NEXT: [[TMP21:%.*]] = extractelement <vscale x 4 x i32> [[VP_OP_LOAD]], i64 [[TMP20]]
+; IF-EVL-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT: [[TMP23:%.*]] = mul nuw i32 [[TMP22]], 4
+; IF-EVL-NEXT: [[TMP24:%.*]] = sub i32 [[TMP23]], 1
+; IF-EVL-NEXT: [[TMP25:%.*]] = extractelement <vscale x 4 x i32> [[TMP10]], i32 [[TMP24]]
+; IF-EVL-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP16]], 0
+; IF-EVL-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[TMP25]], i32 [[TMP21]]
+; IF-EVL-NEXT: br label %[[FOR_END:.*]]
+; IF-EVL: [[SCALAR_PH]]:
; IF-EVL-NEXT: br label %[[FOR_BODY:.*]]
; IF-EVL: [[FOR_BODY]]:
-; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ]
-; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP0:%.*]], %[[FOR_BODY]] ]
-; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
-; IF-EVL-NEXT: [[TMP0]] = load i32, ptr [[ARRAYIDX1]], align 4
+; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ]
+; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ 33, %[[SCALAR_PH]] ], [ [[TMP0:%.*]], %[[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV1]]
+; IF-EVL-NEXT: [[TMP0]] = load i32, ptr [[ARRAYIDX]], align 4
; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR1]], [[TMP0]]
-; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[IV]]
+; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[IV1]]
; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4
-; IF-EVL-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[IV1]], 1
; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; IF-EVL: [[FOR_END]]:
-; IF-EVL-NEXT: [[FOR1_LCSSA:%.*]] = phi i32 [ [[FOR1]], %[[FOR_BODY]] ]
+; IF-EVL-NEXT: [[FOR1_LCSSA:%.*]] = phi i32 [ [[FOR1]], %[[FOR_BODY]] ], [ [[TMP27]], %[[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: ret i32 [[FOR1_LCSSA]]
;
; NO-VP-LABEL: define i32 @FOR_reduction(
@@ -680,8 +727,8 @@ for.end:
; IF-EVL: [[LOOP6]] = distinct !{[[LOOP6]], [[META3]], [[META1]]}
; IF-EVL: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]], [[META3]]}
; IF-EVL: [[LOOP8]] = distinct !{[[LOOP8]], [[META3]], [[META1]]}
-; IF-EVL: [[LOOP9]] = distinct !{[[LOOP9]], [[META10:![0-9]+]]}
-; IF-EVL: [[META10]] = !{!"llvm.loop.vectorize.enable", i1 true}
+; IF-EVL: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]], [[META3]]}
+; IF-EVL: [[LOOP10]] = distinct !{[[LOOP10]], [[META3]], [[META1]]}
; IF-EVL: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]], [[META3]]}
; IF-EVL: [[LOOP12]] = distinct !{[[LOOP12]], [[META3]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll
index a843aeb1ee8a2..52902c19bf648 100644
--- a/llvm/test/Transforms/LoopVectorize/optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/optsize.ll
@@ -521,44 +521,116 @@ define i32 @pr45526() optsize {
;
; CHECK-LABEL: define i32 @pr45526(
; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 5>, %[[VECTOR_PH]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510)
+; CHECK-NEXT: [[TMP1]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 512
+; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP0]], i1 true)
+; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP5]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[PIV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[PIVPLUS1:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[FOR:%.*]] = phi i32 [ 5, %[[ENTRY]] ], [ [[PIVPLUS1]], %[[LOOP]] ]
+; CHECK-NEXT: [[PIV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[PIVPLUS1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[FOR:%.*]] = phi i32 [ 5, %[[SCALAR_PH]] ], [ [[PIVPLUS1]], %[[LOOP]] ]
; CHECK-NEXT: [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1
; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[PIV]], 510
-; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ]
+; CHECK-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[FOR_LCSSA]]
;
; PGSO-LABEL: define i32 @pr45526(
; PGSO-SAME: ) #[[ATTR0]] {
-; PGSO-NEXT: [[ENTRY:.*]]:
+; PGSO-NEXT: [[ENTRY:.*:]]
+; PGSO-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; PGSO: [[VECTOR_PH]]:
+; PGSO-NEXT: br label %[[VECTOR_BODY:.*]]
+; PGSO: [[VECTOR_BODY]]:
+; PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; PGSO-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; PGSO-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 5>, %[[VECTOR_PH]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ]
+; PGSO-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510)
+; PGSO-NEXT: [[TMP1]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1)
+; PGSO-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; PGSO-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; PGSO-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 512
+; PGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; PGSO: [[MIDDLE_BLOCK]]:
+; PGSO-NEXT: [[TMP4:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP0]], i1 true)
+; PGSO-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], 1
+; PGSO-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; PGSO-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 [[TMP6]]
+; PGSO-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; PGSO-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP5]], 0
+; PGSO-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]]
+; PGSO-NEXT: br label %[[EXIT:.*]]
+; PGSO: [[SCALAR_PH]]:
; PGSO-NEXT: br label %[[LOOP:.*]]
; PGSO: [[LOOP]]:
-; PGSO-NEXT: [[PIV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[PIVPLUS1:%.*]], %[[LOOP]] ]
-; PGSO-NEXT: [[FOR:%.*]] = phi i32 [ 5, %[[ENTRY]] ], [ [[PIVPLUS1]], %[[LOOP]] ]
+; PGSO-NEXT: [[PIV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[PIVPLUS1:%.*]], %[[LOOP]] ]
+; PGSO-NEXT: [[FOR:%.*]] = phi i32 [ 5, %[[SCALAR_PH]] ], [ [[PIVPLUS1]], %[[LOOP]] ]
; PGSO-NEXT: [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1
; PGSO-NEXT: [[COND:%.*]] = icmp ult i32 [[PIV]], 510
-; PGSO-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; PGSO-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP22:![0-9]+]]
; PGSO: [[EXIT]]:
-; PGSO-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ]
+; PGSO-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
; PGSO-NEXT: ret i32 [[FOR_LCSSA]]
;
; NPGSO-LABEL: define i32 @pr45526(
; NPGSO-SAME: ) #[[ATTR0]] {
-; NPGSO-NEXT: [[ENTRY:.*]]:
+; NPGSO-NEXT: [[ENTRY:.*:]]
+; NPGSO-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; NPGSO: [[VECTOR_PH]]:
+; NPGSO-NEXT: br label %[[VECTOR_BODY:.*]]
+; NPGSO: [[VECTOR_BODY]]:
+; NPGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NPGSO-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NPGSO-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 5>, %[[VECTOR_PH]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ]
+; NPGSO-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510)
+; NPGSO-NEXT: [[TMP1]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1)
+; NPGSO-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; NPGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; NPGSO-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; NPGSO-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 512
+; NPGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; NPGSO: [[MIDDLE_BLOCK]]:
+; NPGSO-NEXT: [[TMP4:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP0]], i1 true)
+; NPGSO-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], 1
+; NPGSO-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; NPGSO-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 [[TMP6]]
+; NPGSO-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; NPGSO-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP5]], 0
+; NPGSO-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]]
+; NPGSO-NEXT: br label %[[EXIT:.*]]
+; NPGSO: [[SCALAR_PH]]:
; NPGSO-NEXT: br label %[[LOOP:.*]]
; NPGSO: [[LOOP]]:
-; NPGSO-NEXT: [[PIV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[PIVPLUS1:%.*]], %[[LOOP]] ]
-; NPGSO-NEXT: [[FOR:%.*]] = phi i32 [ 5, %[[ENTRY]] ], [ [[PIVPLUS1]], %[[LOOP]] ]
+; NPGSO-NEXT: [[PIV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[PIVPLUS1:%.*]], %[[LOOP]] ]
+; NPGSO-NEXT: [[FOR:%.*]] = phi i32 [ 5, %[[SCALAR_PH]] ], [ [[PIVPLUS1]], %[[LOOP]] ]
; NPGSO-NEXT: [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1
; NPGSO-NEXT: [[COND:%.*]] = icmp ult i32 [[PIV]], 510
-; NPGSO-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; NPGSO-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP24:![0-9]+]]
; NPGSO: [[EXIT]]:
-; NPGSO-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ]
+; NPGSO-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
; NPGSO-NEXT: ret i32 [[FOR_LCSSA]]
;
entry:
@@ -579,30 +651,78 @@ define i32 @pr45526_pgso() !prof !14 {
;
; CHECK-LABEL: define i32 @pr45526_pgso(
; CHECK-SAME: ) !prof [[PROF14]] {
-; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 5>, %[[VECTOR_PH]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510)
+; CHECK-NEXT: [[TMP1]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 512
+; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP0]], i1 true)
+; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP5]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[PIV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[PIVPLUS1:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[FOR:%.*]] = phi i32 [ 5, %[[ENTRY]] ], [ [[PIVPLUS1]], %[[LOOP]] ]
+; CHECK-NEXT: [[PIV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[PIVPLUS1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[FOR:%.*]] = phi i32 [ 5, %[[SCALAR_PH]] ], [ [[PIVPLUS1]], %[[LOOP]] ]
; CHECK-NEXT: [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1
; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[PIV]], 510
-; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ]
+; CHECK-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[FOR_LCSSA]]
;
; PGSO-LABEL: define i32 @pr45526_pgso(
; PGSO-SAME: ) !prof [[PROF14]] {
-; PGSO-NEXT: [[ENTRY:.*]]:
+; PGSO-NEXT: [[ENTRY:.*:]]
+; PGSO-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; PGSO: [[VECTOR_PH]]:
+; PGSO-NEXT: br label %[[VECTOR_BODY:.*]]
+; PGSO: [[VECTOR_BODY]]:
+; PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; PGSO-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; PGSO-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 5>, %[[VECTOR_PH]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ]
+; PGSO-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510)
+; PGSO-NEXT: [[TMP1]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1)
+; PGSO-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; PGSO-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; PGSO-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 512
+; PGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; PGSO: [[MIDDLE_BLOCK]]:
+; PGSO-NEXT: [[TMP4:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP0]], i1 true)
+; PGSO-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], 1
+; PGSO-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; PGSO-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 [[TMP6]]
+; PGSO-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; PGSO-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP5]], 0
+; PGSO-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]]
+; PGSO-NEXT: br label %[[EXIT:.*]]
+; PGSO: [[SCALAR_PH]]:
; PGSO-NEXT: br label %[[LOOP:.*]]
; PGSO: [[LOOP]]:
-; PGSO-NEXT: [[PIV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[PIVPLUS1:%.*]], %[[LOOP]] ]
-; PGSO-NEXT: [[FOR:%.*]] = phi i32 [ 5, %[[ENTRY]] ], [ [[PIVPLUS1]], %[[LOOP]] ]
+; PGSO-NEXT: [[PIV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[PIVPLUS1:%.*]], %[[LOOP]] ]
+; PGSO-NEXT: [[FOR:%.*]] = phi i32 [ 5, %[[SCALAR_PH]] ], [ [[PIVPLUS1]], %[[LOOP]] ]
; PGSO-NEXT: [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1
; PGSO-NEXT: [[COND:%.*]] = icmp ult i32 [[PIV]], 510
-; PGSO-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; PGSO-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP24:![0-9]+]]
; PGSO: [[EXIT]]:
-; PGSO-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ]
+; PGSO-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
; PGSO-NEXT: ret i32 [[FOR_LCSSA]]
;
; NPGSO-LABEL: define i32 @pr45526_pgso(
@@ -618,7 +738,7 @@ define i32 @pr45526_pgso() !prof !14 {
; NPGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; NPGSO-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; NPGSO-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 508
-; NPGSO-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; NPGSO-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
; NPGSO: [[MIDDLE_BLOCK]]:
; NPGSO-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
; NPGSO-NEXT: br label %[[SCALAR_PH]]
@@ -631,7 +751,7 @@ define i32 @pr45526_pgso() !prof !14 {
; NPGSO-NEXT: [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[PIVPLUS1]], %[[LOOP]] ]
; NPGSO-NEXT: [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1
; NPGSO-NEXT: [[COND:%.*]] = icmp ult i32 [[PIV]], 510
-; NPGSO-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP24:![0-9]+]]
+; NPGSO-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP26:![0-9]+]]
; NPGSO: [[EXIT]]:
; NPGSO-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ]
; NPGSO-NEXT: ret i32 [[FOR_LCSSA]]
@@ -687,7 +807,7 @@ define void @stride1(ptr noalias %B, i32 %BStride) optsize {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1026
-; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[FOR_END:.*]]
; CHECK: [[SCALAR_PH]]:
@@ -699,7 +819,7 @@ define void @stride1(ptr noalias %B, i32 %BStride) optsize {
; CHECK-NEXT: store i16 42, ptr [[GEPOFB]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; CHECK: [[FOR_END]]:
; CHECK-NEXT: ret void
;
@@ -735,7 +855,7 @@ define void @stride1(ptr noalias %B, i32 %BStride) optsize {
; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; PGSO-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
; PGSO-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1026
-; PGSO-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; PGSO-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
; PGSO: [[MIDDLE_BLOCK]]:
; PGSO-NEXT: br label %[[FOR_END:.*]]
; PGSO: [[SCALAR_PH]]:
@@ -747,7 +867,7 @@ define void @stride1(ptr noalias %B, i32 %BStride) optsize {
; PGSO-NEXT: store i16 42, ptr [[GEPOFB]], align 4
; PGSO-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
; PGSO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025
-; PGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; PGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; PGSO: [[FOR_END]]:
; PGSO-NEXT: ret void
;
@@ -783,7 +903,7 @@ define void @stride1(ptr noalias %B, i32 %BStride) optsize {
; NPGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; NPGSO-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
; NPGSO-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1026
-; NPGSO-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; NPGSO-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
; NPGSO: [[MIDDLE_BLOCK]]:
; NPGSO-NEXT: br label %[[FOR_END:.*]]
; NPGSO: [[SCALAR_PH]]:
@@ -795,7 +915,7 @@ define void @stride1(ptr noalias %B, i32 %BStride) optsize {
; NPGSO-NEXT: store i16 42, ptr [[GEPOFB]], align 4
; NPGSO-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
; NPGSO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025
-; NPGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; NPGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; NPGSO: [[FOR_END]]:
; NPGSO-NEXT: ret void
;
@@ -834,7 +954,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 {
; CHECK-NEXT: store <2 x i16> splat (i16 42), ptr [[TMP1]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
@@ -847,7 +967,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 {
; CHECK-NEXT: store i16 42, ptr [[GEPOFB]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; CHECK: [[FOR_END]]:
; CHECK-NEXT: ret void
;
@@ -866,7 +986,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 {
; PGSO-NEXT: store <2 x i16> splat (i16 42), ptr [[TMP1]], align 4
; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[TMP0]], 2
; PGSO-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
-; PGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; PGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
; PGSO: [[MIDDLE_BLOCK]]:
; PGSO-NEXT: br label %[[SCALAR_PH]]
; PGSO: [[SCALAR_PH]]:
@@ -879,7 +999,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 {
; PGSO-NEXT: store i16 42, ptr [[GEPOFB]], align 4
; PGSO-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
; PGSO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025
-; PGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; PGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; PGSO: [[FOR_END]]:
; PGSO-NEXT: ret void
;
@@ -898,7 +1018,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 {
; NPGSO-NEXT: store <2 x i16> splat (i16 42), ptr [[TMP1]], align 4
; NPGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[TMP0]], 2
; NPGSO-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
-; NPGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
+; NPGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
; NPGSO: [[MIDDLE_BLOCK]]:
; NPGSO-NEXT: br label %[[SCALAR_PH]]
; NPGSO: [[SCALAR_PH]]:
@@ -911,7 +1031,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 {
; NPGSO-NEXT: store i16 42, ptr [[GEPOFB]], align 4
; NPGSO-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
; NPGSO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025
-; NPGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; NPGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
; NPGSO: [[FOR_END]]:
; NPGSO-NEXT: ret void
;
@@ -1100,7 +1220,11 @@ exit:
; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[META16]], [[META17]]}
; CHECK: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], [[META16]]}
; CHECK: [[LOOP23]] = distinct !{[[LOOP23]], [[META16]], [[META17]]}
-; CHECK: [[LOOP24]] = distinct !{[[LOOP24]], [[META16]]}
+; CHECK: [[LOOP24]] = distinct !{[[LOOP24]], [[META17]], [[META16]]}
+; CHECK: [[LOOP25]] = distinct !{[[LOOP25]], [[META16]], [[META17]]}
+; CHECK: [[LOOP26]] = distinct !{[[LOOP26]], [[META17]], [[META16]]}
+; CHECK: [[LOOP27]] = distinct !{[[LOOP27]], [[META16]], [[META17]]}
+; CHECK: [[LOOP28]] = distinct !{[[LOOP28]], [[META16]]}
;.
; PGSO: [[PROF14]] = !{!"function_entry_count", i64 0}
; PGSO: [[LOOP15]] = distinct !{[[LOOP15]], [[META16:![0-9]+]], [[META17:![0-9]+]]}
@@ -1112,7 +1236,11 @@ exit:
; PGSO: [[LOOP21]] = distinct !{[[LOOP21]], [[META16]], [[META17]]}
; PGSO: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], [[META16]]}
; PGSO: [[LOOP23]] = distinct !{[[LOOP23]], [[META16]], [[META17]]}
-; PGSO: [[LOOP24]] = distinct !{[[LOOP24]], [[META16]]}
+; PGSO: [[LOOP24]] = distinct !{[[LOOP24]], [[META17]], [[META16]]}
+; PGSO: [[LOOP25]] = distinct !{[[LOOP25]], [[META16]], [[META17]]}
+; PGSO: [[LOOP26]] = distinct !{[[LOOP26]], [[META17]], [[META16]]}
+; PGSO: [[LOOP27]] = distinct !{[[LOOP27]], [[META16]], [[META17]]}
+; PGSO: [[LOOP28]] = distinct !{[[LOOP28]], [[META16]]}
;.
; NPGSO: [[PROF14]] = !{!"function_entry_count", i64 0}
; NPGSO: [[LOOP15]] = distinct !{[[LOOP15]], [[META16:![0-9]+]], [[META17:![0-9]+]]}
@@ -1128,5 +1256,7 @@ exit:
; NPGSO: [[LOOP25]] = distinct !{[[LOOP25]], [[META16]], [[META17]]}
; NPGSO: [[LOOP26]] = distinct !{[[LOOP26]], [[META17]], [[META16]]}
; NPGSO: [[LOOP27]] = distinct !{[[LOOP27]], [[META16]], [[META17]]}
-; NPGSO: [[LOOP28]] = distinct !{[[LOOP28]], [[META16]]}
+; NPGSO: [[LOOP28]] = distinct !{[[LOOP28]], [[META17]], [[META16]]}
+; NPGSO: [[LOOP29]] = distinct !{[[LOOP29]], [[META16]], [[META17]]}
+; NPGSO: [[LOOP30]] = distinct !{[[LOOP30]], [[META16]]}
;.