[llvm] [LV] Don't require scalar epilogue for unsupported IAG with tail (PR #96544)
Kolya Panchenko via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 25 11:12:47 PDT 2024
https://github.com/nikolaypanchenko updated https://github.com/llvm/llvm-project/pull/96544
From 5bf2739dbc0097109dce915635921fe276ae402f Mon Sep 17 00:00:00 2001
From: Kolya Panchenko <kolya.panchenko at sifive.com>
Date: Mon, 24 Jun 2024 11:16:13 -0700
Subject: [PATCH 1/2] [LV] Don't require scalar epilogue for unsupported IAG
with tail
LV should check that every interleaved access group which requires a scalar
epilogue can actually be widened. Otherwise, when an InterleavedAccessGroup
cannot be widened but does have a tail element, the current logic in LV still
forces a scalar epilogue to be emitted, which leads to inefficient vector
code.
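
As a minimal sketch of the kind of source loop affected (the C function below
is hypothetical; the authoritative cases are the .ll tests updated in this
patch), only every third element of `a` is read, so the loads form an
interleaved group of factor 3 with gaps, and widening that group into a single
wide load could read past the end of `a` in the last vector iteration:

  /* Hypothetical example: the loads of a[3*i] form an interleaved access
     group with gaps (the members at 3*i+1 and 3*i+2 are never read).
     A widened load covering the whole group can over-read, so LV used to
     demand a scalar epilogue whenever such a group was present. */
  void add_with_gaps(int *restrict a, int *restrict b, int y, int n) {
    for (int i = 0; i < n; ++i)
      b[i] = a[3 * i] + y;
  }

On targets where the group is not widened, e.g. when it is lowered as a masked
gather as in the pointer_phi_v4i32_add3 ARM test below, no over-read can
happen, so with this change the vector body runs to the full trip count
instead of peeling the final iterations into a scalar loop.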
---
.../Transforms/Vectorize/LoopVectorize.cpp | 83 +++--
.../LoopVectorize/ARM/pointer_iv.ll | 338 ++++++++----------
.../RISCV/riscv-vector-reverse.ll | 8 +
.../LoopVectorize/RISCV/strided-accesses.ll | 47 ++-
...nsupported-interleaved-access-with-gaps.ll | 126 +++++++
.../Transforms/LoopVectorize/RISCV/zvl32b.ll | 8 +-
.../LoopVectorize/X86/cost-model.ll | 85 +++--
.../X86/fixed-order-recurrence.ll | 137 ++++++-
.../vplan-printing-before-execute.ll | 2 +-
9 files changed, 518 insertions(+), 316 deletions(-)
create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/unsupported-interleaved-access-with-gaps.ll
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 771fb247f201f..c7a74283b5258 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1448,29 +1448,58 @@ class LoopVectorizationCostModel {
/// Returns true if \p I is a memory instruction in an interleaved-group
/// of memory accesses that can be vectorized with wide vector loads/stores
/// and shuffles.
- bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF);
+ bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
/// Check if \p Instr belongs to any interleaved access group.
- bool isAccessInterleaved(Instruction *Instr) {
+ bool isAccessInterleaved(Instruction *Instr) const {
return InterleaveInfo.isInterleaved(Instr);
}
/// Get the interleaved access group that \p Instr belongs to.
const InterleaveGroup<Instruction> *
- getInterleavedAccessGroup(Instruction *Instr) {
+ getInterleavedAccessGroup(Instruction *Instr) const {
return InterleaveInfo.getInterleaveGroup(Instr);
}
/// Returns true if we're required to use a scalar epilogue for at least
/// the final iteration of the original loop.
- bool requiresScalarEpilogue(bool IsVectorizing) const {
- if (!isScalarEpilogueAllowed())
+ bool requiresScalarEpilogue(ElementCount VF) const {
+ if (!isScalarEpilogueAllowed()) {
+ LLVM_DEBUG(dbgs() << "LV: Loop with VF = " << VF
+ << " does not require scalar epilogue\n");
return false;
+ }
// If we might exit from anywhere but the latch, must run the exiting
// iteration in scalar form.
- if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
+ if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
+ LLVM_DEBUG(dbgs() << "LV: Loop with VF = " << VF
+                        << " requires scalar epilogue: multiple exits\n");
return true;
- return IsVectorizing && InterleaveInfo.requiresScalarEpilogue();
+ }
+ if (VF.isVector()) {
+ if (InterleaveInfo.requiresScalarEpilogue()) {
+ // Make sure interleaved groups that require scalar epilogue will be
+ // widened.
+ for (auto *G : InterleaveInfo.getInterleaveGroups()) {
+ if (!G->requiresScalarEpilogue())
+ continue;
+
+ Instruction *I = G->getMember(0);
+ InstWidening Decision = getWideningDecision(I, VF);
+ if (Decision == CM_Interleave ||
+ (Decision == CM_Unknown &&
+ interleavedAccessCanBeWidened(G->getMember(0), VF))) {
+ LLVM_DEBUG(dbgs() << "LV: Loop with VF = " << VF
+ << " requires scalar epilogue: vectorizable "
+ "interleaved group\n");
+ return true;
+ }
+ }
+ }
+ }
+ LLVM_DEBUG(dbgs() << "LV: Loop with VF = " << VF
+ << " does not require scalar epilogue\n");
+ return false;
}
/// Returns true if we're required to use a scalar epilogue for at least
@@ -1479,7 +1508,7 @@ class LoopVectorizationCostModel {
/// none.
bool requiresScalarEpilogue(VFRange Range) const {
auto RequiresScalarEpilogue = [this](ElementCount VF) {
- return requiresScalarEpilogue(VF.isVector());
+ return requiresScalarEpilogue(VF);
};
bool IsRequired = all_of(Range, RequiresScalarEpilogue);
assert(
@@ -2776,7 +2805,7 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
// the step does not evenly divide the trip count, no adjustment is necessary
// since there will already be scalar iterations. Note that the minimum
// iterations check ensures that N >= Step.
- if (Cost->requiresScalarEpilogue(VF.isVector())) {
+ if (Cost->requiresScalarEpilogue(VF)) {
auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
R = Builder.CreateSelect(IsZero, Step, R);
}
@@ -2829,8 +2858,8 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
// vector trip count is zero. This check also covers the case where adding one
// to the backedge-taken count overflowed leading to an incorrect trip count
// of zero. In this case we will also jump to the scalar loop.
- auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
- : ICmpInst::ICMP_ULT;
+ auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
+ : ICmpInst::ICMP_ULT;
// If tail is to be folded, vector loop takes care of all iterations.
Type *CountTy = Count->getType();
@@ -2879,7 +2908,7 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
// Update dominator for Bypass & LoopExit (if needed).
DT->changeImmediateDominator(Bypass, TCCheckBlock);
- if (!Cost->requiresScalarEpilogue(VF.isVector()))
+ if (!Cost->requiresScalarEpilogue(VF))
// If there is an epilogue which must run, there's no edge from the
// middle block to exit blocks and thus no need to update the immediate
// dominator of the exit blocks.
@@ -2908,7 +2937,7 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
// Update dominator only if this is first RT check.
if (LoopBypassBlocks.empty()) {
DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
- if (!Cost->requiresScalarEpilogue(VF.isVector()))
+ if (!Cost->requiresScalarEpilogue(VF))
// If there is an epilogue which must run, there's no edge from the
// middle block to exit blocks and thus no need to update the immediate
// dominator of the exit blocks.
@@ -2961,7 +2990,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
LoopVectorPreHeader = OrigLoop->getLoopPreheader();
assert(LoopVectorPreHeader && "Invalid loop structure");
LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
- assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
+ assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
"multiple exit loop without required epilogue?");
LoopMiddleBlock =
@@ -2976,7 +3005,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
// unconditional branch from the middle block to the scalar preheader. In that
// case, there's no edge from the middle block to exit blocks and thus no
// need to update the immediate dominator of the exit blocks.
- if (Cost->requiresScalarEpilogue(VF.isVector())) {
+ if (Cost->requiresScalarEpilogue(VF)) {
assert(
LoopMiddleBlock->getSingleSuccessor() == LoopScalarPreHeader &&
" middle block should have the scalar preheader as single successor");
@@ -3109,7 +3138,7 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
// Thus if tail is to be folded, we know we don't need to run the
// remainder and we can use the previous value for the condition (true).
// 3) Otherwise, construct a runtime check.
- if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
+ if (!Cost->requiresScalarEpilogue(VF) &&
!Cost->foldTailByMasking()) {
// Here we use the same DebugLoc as the scalar loop latch terminator instead
// of the corresponding compare because they may have ended up with
@@ -3419,7 +3448,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
- if (Cost->requiresScalarEpilogue(VF.isVector())) {
+ if (Cost->requiresScalarEpilogue(VF)) {
// No edge from the middle block to the unique exit block has been inserted
// and there is nothing to fix from vector loop; phis should have incoming
// from scalar loop only.
@@ -3936,7 +3965,7 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
}
bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
- Instruction *I, ElementCount VF) {
+ Instruction *I, ElementCount VF) const {
assert(isAccessInterleaved(I) && "Expecting interleaved access.");
assert(getWideningDecision(I, VF) == CM_Unknown &&
"Decision should not be set yet.");
@@ -4670,7 +4699,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
// When a scalar epilogue is required, at least one iteration of the scalar
// loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
// max VF that results in a dead vector loop.
- if (MaxTripCount > 0 && requiresScalarEpilogue(true))
+ if (MaxTripCount > 0 && requiresScalarEpilogue(MaxVectorElementCount))
MaxTripCount -= 1;
if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
@@ -5302,7 +5331,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
// At least one iteration must be scalar when this constraint holds. So the
// maximum available iterations for interleaving is one less.
unsigned AvailableTC =
- requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
+ requiresScalarEpilogue(VF) ? KnownTC - 1 : KnownTC;
// If trip count is known we select between two prospective ICs, where
// 1) the aggressive IC is capped by the trip count divided by VF
@@ -5331,7 +5360,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
} else if (BestKnownTC && *BestKnownTC > 0) {
// At least one iteration must be scalar when this constraint holds. So the
// maximum available iterations for interleaving is one less.
- unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
+ unsigned AvailableTC = requiresScalarEpilogue(VF)
? (*BestKnownTC) - 1
: *BestKnownTC;
@@ -7638,8 +7667,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
// Generate code to check if the loop's trip count is less than VF * UF of the
// main vector loop.
- auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
- : VF.isVector())
+ auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF)
? ICmpInst::ICMP_ULE
: ICmpInst::ICMP_ULT;
@@ -7661,7 +7689,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
// Update dominator for Bypass & LoopExit.
DT->changeImmediateDominator(Bypass, TCCheckBlock);
- if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
+ if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
// For loops with multiple exits, there's no edge from the middle block
// to exit blocks (as the epilogue must run) and thus no need to update
// the immediate dominator of the exit blocks.
@@ -7730,7 +7758,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
DT->changeImmediateDominator(LoopScalarPreHeader,
EPI.EpilogueIterationCountCheck);
- if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
+ if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
// If there is an epilogue which must run, there's no edge from the
// middle block to exit blocks and thus no need to update the immediate
// dominator of the exit blocks.
@@ -7812,9 +7840,8 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
// Generate code to check if the loop's trip count is less than VF * UF of the
// vector epilogue loop.
- auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
- ? ICmpInst::ICMP_ULE
- : ICmpInst::ICMP_ULT;
+ auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ? ICmpInst::ICMP_ULE
+ : ICmpInst::ICMP_ULT;
Value *CheckMinIters =
Builder.CreateICmp(P, Count,
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll b/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll
index 2269b774d9f31..1592ddb26f67d 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll
@@ -12,16 +12,16 @@ define hidden void @pointer_phi_v4i32_add1(ptr noalias nocapture readonly %A, pt
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[INDEX]], 2
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 2
-; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 [[TMP1]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 2
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 [[OFFSET_IDX]]
+; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = shl i32 [[INDEX]], 2
+; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 [[OFFSET_IDX4]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[NEXT_GEP]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr [[NEXT_GEP4]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: store <4 x i32> [[TMP0]], ptr [[NEXT_GEP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP3]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: end:
; CHECK-NEXT: ret void
;
@@ -53,24 +53,24 @@ define hidden void @pointer_phi_v4i32_add2(ptr noalias nocapture readonly %A, pt
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[INDEX]], 3
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 2
-; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[B]], i32 [[TMP1]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 3
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[OFFSET_IDX]]
+; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = shl i32 [[INDEX]], 2
+; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[B]], i32 [[OFFSET_IDX4]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[NEXT_GEP]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr [[NEXT_GEP4]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: store <4 x i32> [[TMP0]], ptr [[NEXT_GEP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996
-; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996
+; CHECK-NEXT: br i1 [[TMP1]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: for.body:
; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 996, [[VECTOR_BODY]] ]
; CHECK-NEXT: [[B_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END2]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[A_ADDR_09]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_09]], align 4
; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_09]], i32 8
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP4]], [[Y]]
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[Y]]
; CHECK-NEXT: store i32 [[ADD]], ptr [[B_ADDR_07]], align 4
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_07]], i32 4
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
@@ -100,36 +100,22 @@ end:
define hidden void @pointer_phi_v4i32_add3(ptr noalias nocapture readonly %A, ptr noalias nocapture %B, i32 %y) {
; CHECK-LABEL: @pointer_phi_v4i32_add3(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 11952
-; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 3984
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[Y:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[A:%.*]], [[ENTRY:%.*]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> <i32 0, i32 12, i32 24, i32 36>
-; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 2
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B]], i32 [[TMP1]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 2
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP0]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
-; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr [[NEXT_GEP]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[NEXT_GEP]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 48
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996
-; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: for.body:
-; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 996, [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[B_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END2]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[A_ADDR_09]], align 4
-; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_09]], i32 12
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP4]], [[Y]]
-; CHECK-NEXT: store i32 [[ADD]], ptr [[B_ADDR_07]], align 4
-; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_07]], i32 4
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP2]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: end:
; CHECK-NEXT: ret void
;
@@ -160,16 +146,16 @@ define hidden void @pointer_phi_v8i16_add1(ptr noalias nocapture readonly %A, pt
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 1
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 [[TMP1]]
-; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[INDEX]], 1
-; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 [[TMP2]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 1
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 [[OFFSET_IDX]]
+; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = shl i32 [[INDEX]], 1
+; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 [[OFFSET_IDX4]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2
-; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i16> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store <8 x i16> [[TMP3]], ptr [[NEXT_GEP4]], align 2
+; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i16> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: store <8 x i16> [[TMP1]], ptr [[NEXT_GEP5]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP4]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP2]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: end:
; CHECK-NEXT: ret void
;
@@ -203,17 +189,17 @@ define hidden void @pointer_phi_v8i16_add2(ptr noalias nocapture readonly %A, pt
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 2
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[TMP1]]
-; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[INDEX]], 1
-; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[B]], i32 [[TMP2]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 2
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[OFFSET_IDX]]
+; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = shl i32 [[INDEX]], 1
+; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[B]], i32 [[OFFSET_IDX4]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i16>, ptr [[NEXT_GEP]], align 2
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i16> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store <8 x i16> [[TMP3]], ptr [[NEXT_GEP4]], align 2
+; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i16> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: store <8 x i16> [[TMP1]], ptr [[NEXT_GEP5]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
-; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
+; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: for.body:
; CHECK-NEXT: [[A_ADDR_011:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[I_010:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 992, [[VECTOR_BODY]] ]
@@ -225,7 +211,7 @@ define hidden void @pointer_phi_v8i16_add2(ptr noalias nocapture readonly %A, pt
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_09]], i32 2
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_010]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: end:
; CHECK-NEXT: ret void
;
@@ -305,7 +291,7 @@ define hidden void @pointer_phi_v16i8_add1(ptr noalias nocapture readonly %A, pt
; CHECK-NEXT: store <16 x i8> [[TMP1]], ptr [[NEXT_GEP4]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
-; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: for.body:
; CHECK-NEXT: [[A_ADDR_010:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[I_09:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 992, [[VECTOR_BODY]] ]
@@ -317,7 +303,7 @@ define hidden void @pointer_phi_v16i8_add1(ptr noalias nocapture readonly %A, pt
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_08]], i32 1
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_09]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: end:
; CHECK-NEXT: ret void
;
@@ -352,28 +338,28 @@ define hidden void @pointer_phi_v16i8_add2(ptr noalias nocapture readonly %A, pt
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 1
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[TMP1]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 1
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[OFFSET_IDX]]
; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[B]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <32 x i8>, ptr [[NEXT_GEP]], align 1
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-; CHECK-NEXT: [[TMP2:%.*]] = add <16 x i8> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[NEXT_GEP4]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = add <16 x i8> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: store <16 x i8> [[TMP1]], ptr [[NEXT_GEP4]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
-; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
+; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: for.body:
; CHECK-NEXT: [[A_ADDR_010:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[I_09:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 992, [[VECTOR_BODY]] ]
; CHECK-NEXT: [[B_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END2]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[A_ADDR_010]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[A_ADDR_010]], align 1
; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_010]], i32 2
-; CHECK-NEXT: [[CONV1:%.*]] = add i8 [[TMP4]], [[TMP0]]
+; CHECK-NEXT: [[CONV1:%.*]] = add i8 [[TMP3]], [[TMP0]]
; CHECK-NEXT: store i8 [[CONV1]], ptr [[B_ADDR_08]], align 1
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_08]], i32 1
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_09]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: end:
; CHECK-NEXT: ret void
;
@@ -445,16 +431,16 @@ define hidden void @pointer_phi_v4f32_add1(ptr noalias nocapture readonly %A, pt
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[INDEX]], 2
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 2
-; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 [[TMP1]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 2
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 [[OFFSET_IDX]]
+; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = shl i32 [[INDEX]], 2
+; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 [[OFFSET_IDX4]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[NEXT_GEP]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store <4 x float> [[TMP2]], ptr [[NEXT_GEP4]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: store <4 x float> [[TMP0]], ptr [[NEXT_GEP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP3]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: end:
; CHECK-NEXT: ret void
;
@@ -486,29 +472,29 @@ define hidden void @pointer_phi_v4f32_add2(ptr noalias nocapture readonly %A, pt
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[INDEX]], 3
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 2
-; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[B]], i32 [[TMP1]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 3
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[OFFSET_IDX]]
+; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = shl i32 [[INDEX]], 2
+; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[B]], i32 [[OFFSET_IDX4]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x float>, ptr [[NEXT_GEP]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store <4 x float> [[TMP2]], ptr [[NEXT_GEP4]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = fadd fast <4 x float> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: store <4 x float> [[TMP0]], ptr [[NEXT_GEP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996
-; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996
+; CHECK-NEXT: br i1 [[TMP1]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: for.body:
; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 996, [[VECTOR_BODY]] ]
; CHECK-NEXT: [[B_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END2]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[A_ADDR_09]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[A_ADDR_09]], align 4
; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_09]], i32 8
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[Y]]
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP2]], [[Y]]
; CHECK-NEXT: store float [[ADD]], ptr [[B_ADDR_07]], align 4
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_07]], i32 4
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: end:
; CHECK-NEXT: ret void
;
@@ -533,36 +519,22 @@ end:
define hidden void @pointer_phi_v4f32_add3(ptr noalias nocapture readonly %A, ptr noalias nocapture %B, float %y) {
; CHECK-LABEL: @pointer_phi_v4f32_add3(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 11952
-; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 3984
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[Y:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[A:%.*]], [[ENTRY:%.*]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> <i32 0, i32 12, i32 24, i32 36>
-; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 2
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B]], i32 [[TMP1]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 2
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP0]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> poison)
-; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store <4 x float> [[TMP2]], ptr [[NEXT_GEP]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = fadd fast <4 x float> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[NEXT_GEP]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 48
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996
-; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK: for.body:
-; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 996, [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[B_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END2]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[A_ADDR_09]], align 4
-; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_09]], i32 12
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[Y]]
-; CHECK-NEXT: store float [[ADD]], ptr [[B_ADDR_07]], align 4
-; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_07]], i32 4
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP2]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: end:
; CHECK-NEXT: ret void
;
@@ -592,16 +564,16 @@ define hidden void @pointer_phi_v4half_add1(ptr noalias nocapture readonly %A, p
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[INDEX]], 1
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 1
-; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 [[TMP1]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 1
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 [[OFFSET_IDX]]
+; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = shl i32 [[INDEX]], 1
+; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 [[OFFSET_IDX4]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[NEXT_GEP]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <8 x half> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store <8 x half> [[TMP2]], ptr [[NEXT_GEP4]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = fadd fast <8 x half> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: store <8 x half> [[TMP0]], ptr [[NEXT_GEP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP3]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK: end:
; CHECK-NEXT: ret void
;
@@ -633,29 +605,29 @@ define hidden void @pointer_phi_v4half_add2(ptr noalias nocapture readonly %A, p
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[INDEX]], 2
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 1
-; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[B]], i32 [[TMP1]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 2
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[OFFSET_IDX]]
+; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = shl i32 [[INDEX]], 1
+; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[B]], i32 [[OFFSET_IDX4]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x half>, ptr [[NEXT_GEP]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x half> [[WIDE_VEC]], <16 x half> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <8 x half> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store <8 x half> [[TMP2]], ptr [[NEXT_GEP4]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = fadd fast <8 x half> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: store <8 x half> [[TMP0]], ptr [[NEXT_GEP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
-; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
+; CHECK-NEXT: br i1 [[TMP1]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: for.body:
; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 992, [[VECTOR_BODY]] ]
; CHECK-NEXT: [[B_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END2]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = load half, ptr [[A_ADDR_09]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load half, ptr [[A_ADDR_09]], align 4
; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_09]], i32 4
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast half [[TMP4]], [[Y]]
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast half [[TMP2]], [[Y]]
; CHECK-NEXT: store half [[ADD]], ptr [[B_ADDR_07]], align 4
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_07]], i32 2
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK: end:
; CHECK-NEXT: ret void
;
@@ -687,29 +659,29 @@ define hidden void @pointer_phi_v4half_add3(ptr noalias nocapture readonly %A, p
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[INDEX]], 6
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 1
-; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[B]], i32 [[TMP1]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i32 [[INDEX]], 6
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[OFFSET_IDX]]
+; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = shl i32 [[INDEX]], 1
+; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[B]], i32 [[OFFSET_IDX4]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <24 x half>, ptr [[NEXT_GEP]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x half> [[WIDE_VEC]], <24 x half> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
-; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <8 x half> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store <8 x half> [[TMP2]], ptr [[NEXT_GEP4]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = fadd fast <8 x half> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: store <8 x half> [[TMP0]], ptr [[NEXT_GEP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
-; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
+; CHECK-NEXT: br i1 [[TMP1]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: for.body:
; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 992, [[VECTOR_BODY]] ]
; CHECK-NEXT: [[B_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END2]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = load half, ptr [[A_ADDR_09]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load half, ptr [[A_ADDR_09]], align 4
; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_09]], i32 6
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast half [[TMP4]], [[Y]]
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast half [[TMP2]], [[Y]]
; CHECK-NEXT: store half [[ADD]], ptr [[B_ADDR_07]], align 4
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_07]], i32 2
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK: end:
; CHECK-NEXT: ret void
;
@@ -737,43 +709,29 @@ end:
define hidden void @pointer_phi_v4i32_uf2(ptr noalias nocapture readonly %A, ptr noalias nocapture %B, i32 %n, i32 %y) {
; CHECK-LABEL: @pointer_phi_v4i32_uf2(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 239808
-; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 39968
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[Y:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[A:%.*]], [[ENTRY:%.*]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> <i32 0, i32 24, i32 48, i32 72>
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> <i32 96, i32 120, i32 144, i32 168>
-; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[INDEX]], 2
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B]], i32 [[TMP2]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 2
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP0]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP1]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
-; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER5]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 16
-; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[NEXT_GEP]], align 4
-; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP5]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER5]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 16
+; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr [[NEXT_GEP]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP4]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 192
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 9992
-; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 10000
+; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
-; CHECK: for.body:
-; CHECK-NEXT: [[A_ADDR_08:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 9992, [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[B_ADDR_06:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END2]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[A_ADDR_08]], align 4
-; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_08]], i32 24
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP7]], [[Y]]
-; CHECK-NEXT: store i32 [[ADD]], ptr [[B_ADDR_06]], align 4
-; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_06]], i32 4
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_07]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 10000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
;
entry:
@@ -802,53 +760,39 @@ for.body:
define hidden void @pointer_phi_v4i32_uf4(ptr noalias nocapture readonly %A, ptr noalias nocapture %B, i32 %n, i32 %y) {
; CHECK-LABEL: @pointer_phi_v4i32_uf4(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 239616
-; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 39936
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[Y:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[A:%.*]], [[ENTRY:%.*]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> <i32 0, i32 24, i32 48, i32 72>
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> <i32 96, i32 120, i32 144, i32 168>
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> <i32 192, i32 216, i32 240, i32 264>
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> <i32 288, i32 312, i32 336, i32 360>
-; CHECK-NEXT: [[TMP4:%.*]] = shl i32 [[INDEX]], 2
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B]], i32 [[TMP4]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 2
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP0]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP1]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
-; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER7]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER8]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER9]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 16
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 32
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 48
-; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[NEXT_GEP]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER7]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER8]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER9]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 16
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 32
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 48
+; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[NEXT_GEP]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP8]], align 4
; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP9]], align 4
; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP10]], align 4
-; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP11]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 384
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 9984
-; CHECK-NEXT: br i1 [[TMP12]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 10000
+; CHECK-NEXT: br i1 [[TMP11]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
-; CHECK: for.body:
-; CHECK-NEXT: [[A_ADDR_08:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 9984, [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[B_ADDR_06:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END2]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[A_ADDR_08]], align 4
-; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_08]], i32 24
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[Y]]
-; CHECK-NEXT: store i32 [[ADD]], ptr [[B_ADDR_06]], align 4
-; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_06]], i32 4
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_07]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 10000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
;
entry:
br label %for.body
@@ -886,23 +830,23 @@ define hidden void @mult_ptr_iv(ptr noalias nocapture readonly %x, ptr noalias n
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> <i32 0, i32 3, i32 6, i32 9>
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[POINTER_PHI5]], <4 x i32> <i32 0, i32 3, i32 6, i32 9>
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, <4 x ptr> [[TMP0]], i32 1
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[TMP0]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> poison), !alias.scope [[META28:![0-9]+]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[TMP0]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> poison), !alias.scope [[META24:![0-9]+]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, <4 x ptr> [[TMP0]], i32 2
-; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[TMP2]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> poison), !alias.scope [[META28]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[TMP3]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> poison), !alias.scope [[META28]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[TMP2]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> poison), !alias.scope [[META24]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[TMP3]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> poison), !alias.scope [[META24]]
; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i8> [[WIDE_MASKED_GATHER]], <i8 10, i8 10, i8 10, i8 10>
; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER7]]
; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER8]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, <4 x ptr> [[TMP1]], i32 1
-; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP4]], <4 x ptr> [[TMP1]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !alias.scope [[META31:![0-9]+]], !noalias [[META28]]
+; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP4]], <4 x ptr> [[TMP1]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !alias.scope [[META27:![0-9]+]], !noalias [[META24]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, <4 x ptr> [[TMP1]], i32 2
-; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP5]], <4 x ptr> [[TMP7]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !alias.scope [[META31]], !noalias [[META28]]
-; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP6]], <4 x ptr> [[TMP8]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !alias.scope [[META31]], !noalias [[META28]]
+; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP5]], <4 x ptr> [[TMP7]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !alias.scope [[META27]], !noalias [[META24]]
+; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP6]], <4 x ptr> [[TMP8]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !alias.scope [[META27]], !noalias [[META24]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 12
; CHECK-NEXT: [[PTR_IND6]] = getelementptr i8, ptr [[POINTER_PHI5]], i32 12
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP9]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
; CHECK: for.body:
; CHECK-NEXT: [[X_ADDR_050:%.*]] = phi ptr [ [[INCDEC_PTR2:%.*]], [[FOR_BODY]] ], [ [[X]], [[ENTRY]] ]
; CHECK-NEXT: [[Z_ADDR_049:%.*]] = phi ptr [ [[INCDEC_PTR34:%.*]], [[FOR_BODY]] ], [ [[Z]], [[ENTRY]] ]
@@ -924,7 +868,7 @@ define hidden void @mult_ptr_iv(ptr noalias nocapture readonly %x, ptr noalias n
; CHECK-NEXT: store i8 [[MUL2]], ptr [[INCDEC_PTR33]], align 1
; CHECK-NEXT: [[INC]] = add nuw i32 [[I_048]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[END]], label [[FOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[END]], label [[FOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
; CHECK: end:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index b5aa96eb23f5e..ba070878df4a5 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -50,6 +50,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1
+; CHECK-NEXT: LV: Loop with VF = vscale x 4 does not require scalar epilogue
+; CHECK-NEXT: LV: Loop with VF = vscale x 4 does not require scalar epilogue
; CHECK-NEXT: VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' {
; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
@@ -112,6 +114,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
+; CHECK-NEXT: LV: Loop with VF = vscale x 4 does not require scalar epilogue
; CHECK-NEXT: LV: Loop cost is 32
; CHECK-NEXT: LV: IC is 1
; CHECK-NEXT: LV: VF is vscale x 4
@@ -121,6 +124,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1
; CHECK: LV: Interleaving disabled by the pass manager
+; CHECK-NEXT: LV: Loop with VF = vscale x 4 does not require scalar epilogue
; CHECK-NEXT: LV: Vectorizing: innermost loop.
; CHECK-EMPTY:
;
@@ -191,6 +195,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1
+; CHECK-NEXT: LV: Loop with VF = vscale x 4 does not require scalar epilogue
+; CHECK-NEXT: LV: Loop with VF = vscale x 4 does not require scalar epilogue
; CHECK-NEXT: VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' {
; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
@@ -253,6 +259,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
+; CHECK-NEXT: LV: Loop with VF = vscale x 4 does not require scalar epilogue
; CHECK-NEXT: LV: Loop cost is 32
; CHECK-NEXT: LV: IC is 1
; CHECK-NEXT: LV: VF is vscale x 4
@@ -262,6 +269,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1
; CHECK: LV: Interleaving disabled by the pass manager
+; CHECK-NEXT: LV: Loop with VF = vscale x 4 does not require scalar epilogue
; CHECK-NEXT: LV: Vectorizing: innermost loop.
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
index 6936887cd166c..d00dd93c615f9 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
@@ -8,41 +8,40 @@ define void @single_constant_stride_int_scaled(ptr %p) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 1024, [[TMP1]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[TMP5]]
-; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i64> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = mul <vscale x 4 x i64> [[TMP9]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4
-; CHECK-NEXT: [[TMP13:%.*]] = mul i64 1, [[TMP12]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP13]], i64 0
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP14:%.*]] = mul nuw nsw <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 8, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[P:%.*]], <vscale x 4 x i64> [[TMP14]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP15]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> poison)
-; CHECK-NEXT: [[TMP16:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP16]], <vscale x 4 x ptr> [[TMP15]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = mul nuw nsw <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 8, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[P:%.*]], <vscale x 4 x i64> [[TMP12]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP14:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP14]], <vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br label [[SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
@@ -55,7 +54,7 @@ define void @single_constant_stride_int_scaled(ptr %p) {
; CHECK-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4
; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1
; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/unsupported-interleaved-access-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/RISCV/unsupported-interleaved-access-with-gaps.ll
new file mode 100644
index 0000000000000..1d479c4b6d2ba
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/unsupported-interleaved-access-with-gaps.ll
@@ -0,0 +1,126 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -S -scalable-vectorization=on -force-vector-width=4 -debug-only=loop-vectorize %s 2> %t | FileCheck %s
+; RUN: cat %t | FileCheck %s --check-prefix=DEBUG
+
+; DEBUG: LV: Loop with VF = vscale x 4 does not require scalar epilogue
+
+define i32 @test(ptr %px) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: ptr [[PX:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]]
+; CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[N_VEC]], 5
+; CHECK-NEXT: [[IND_END1:%.*]] = getelementptr i8, ptr [[PX]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[PX]], %[[VECTOR_PH]] ], [ [[PTR_IND:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP27:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 1
+; CHECK-NEXT: [[TMP12:%.*]] = mul i64 5, [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP10]], 0
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP13]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i64> [[DOTSPLAT]], [[TMP14]]
+; CHECK-NEXT: [[VECTOR_GEP:%.*]] = mul <vscale x 4 x i64> [[TMP15]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 5, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 4 x i64> [[VECTOR_GEP]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8.nxv4p0(<vscale x 4 x ptr> [[TMP16]], i32 1, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i8> poison)
+; CHECK-NEXT: [[TMP17:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32>
+; CHECK-NEXT: [[TMP18:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP17]]
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, <vscale x 4 x ptr> [[TMP16]], i64 1
+; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8.nxv4p0(<vscale x 4 x ptr> [[TMP19]], i32 1, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i8> poison)
+; CHECK-NEXT: [[TMP20:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_GATHER3]] to <vscale x 4 x i32>
+; CHECK-NEXT: [[TMP21:%.*]] = add <vscale x 4 x i32> [[TMP18]], [[TMP20]]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, <vscale x 4 x ptr> [[TMP16]], i64 2
+; CHECK-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8.nxv4p0(<vscale x 4 x ptr> [[TMP22]], i32 1, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i8> poison)
+; CHECK-NEXT: [[TMP23:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_GATHER4]] to <vscale x 4 x i32>
+; CHECK-NEXT: [[TMP24:%.*]] = add <vscale x 4 x i32> [[TMP21]], [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, <vscale x 4 x ptr> [[TMP16]], i64 3
+; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8.nxv4p0(<vscale x 4 x ptr> [[TMP25]], i32 1, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i8> poison)
+; CHECK-NEXT: [[TMP26:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_GATHER5]] to <vscale x 4 x i32>
+; CHECK-NEXT: [[TMP27]] = add <vscale x 4 x i32> [[TMP24]], [[TMP26]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]]
+; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP27]])
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[IND_END1]], %[[MIDDLE_BLOCK]] ], [ [[PX]], %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP29]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label %[[FOR_COND1_PREHEADER:.*]]
+; CHECK: [[FOR_COND1_PREHEADER]]:
+; CHECK-NEXT: [[Y_017:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC6:%.*]], %[[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT: [[R_016:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD_3:%.*]], %[[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT: [[PX_ADDR_015:%.*]] = phi ptr [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[ADD_PTR:%.*]], %[[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT: [[TMP30:%.*]] = load i8, ptr [[PX_ADDR_015]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP30]] to i32
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[R_016]], [[CONV]]
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[PX_ADDR_015]], i64 1
+; CHECK-NEXT: [[TMP31:%.*]] = load i8, ptr [[ARRAYIDX_1]], align 1
+; CHECK-NEXT: [[CONV_1:%.*]] = sext i8 [[TMP31]] to i32
+; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[ADD]], [[CONV_1]]
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, ptr [[PX_ADDR_015]], i64 2
+; CHECK-NEXT: [[TMP32:%.*]] = load i8, ptr [[ARRAYIDX_2]], align 1
+; CHECK-NEXT: [[CONV_2:%.*]] = sext i8 [[TMP32]] to i32
+; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[ADD_1]], [[CONV_2]]
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, ptr [[PX_ADDR_015]], i64 3
+; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX_3]], align 1
+; CHECK-NEXT: [[CONV_3:%.*]] = sext i8 [[TMP33]] to i32
+; CHECK-NEXT: [[ADD_3]] = add nsw i32 [[ADD_2]], [[CONV_3]]
+; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PX_ADDR_015]], i64 5
+; CHECK-NEXT: [[INC6]] = add nuw nsw i32 [[Y_017]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC6]], 16
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: [[ADD_3_LCSSA:%.*]] = phi i32 [ [[ADD_3]], %[[FOR_COND1_PREHEADER]] ], [ [[TMP29]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[ADD_3_LCSSA]]
+;
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader:
+ %y.017 = phi i32 [ 0, %entry ], [ %inc6, %for.cond1.preheader ]
+ %r.016 = phi i32 [ 0, %entry ], [ %add.3, %for.cond1.preheader ]
+ %px.addr.015 = phi ptr [ %px, %entry ], [ %add.ptr, %for.cond1.preheader ]
+ %0 = load i8, ptr %px.addr.015, align 1
+ %conv = sext i8 %0 to i32
+ %add = add nsw i32 %r.016, %conv
+ %arrayidx.1 = getelementptr inbounds i8, ptr %px.addr.015, i64 1
+ %1 = load i8, ptr %arrayidx.1, align 1
+ %conv.1 = sext i8 %1 to i32
+ %add.1 = add nsw i32 %add, %conv.1
+ %arrayidx.2 = getelementptr inbounds i8, ptr %px.addr.015, i64 2
+ %2 = load i8, ptr %arrayidx.2, align 1
+ %conv.2 = sext i8 %2 to i32
+ %add.2 = add nsw i32 %add.1, %conv.2
+ %arrayidx.3 = getelementptr inbounds i8, ptr %px.addr.015, i64 3
+ %3 = load i8, ptr %arrayidx.3, align 1
+ %conv.3 = sext i8 %3 to i32
+ %add.3 = add nsw i32 %add.2, %conv.3
+ %add.ptr = getelementptr inbounds i8, ptr %px.addr.015, i64 5
+ %inc6 = add nuw nsw i32 %y.017, 1
+ %exitcond.not = icmp eq i32 %inc6, 16
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.cond.cleanup:
+ %add.3.lcssa = phi i32 [ %add.3, %for.cond1.preheader ]
+ ret i32 %add.3.lcssa
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/zvl32b.ll b/llvm/test/Transforms/LoopVectorize/RISCV/zvl32b.ll
index ba78216100598..0dbedd368321b 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/zvl32b.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/zvl32b.ll
@@ -24,12 +24,12 @@ define void @vector_add_i16(ptr noalias nocapture %a, i16 %v, i64 %n) {
; CHECK-NEXT: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> [[TMP1]], <4 x ptr> [[TMP0]], i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br label [[SCALAR_PH]]
+; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1020, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -39,7 +39,7 @@ define void @vector_add_i16(ptr noalias nocapture %a, i16 %v, i64 %n) {
; CHECK-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX]], align 2
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index b067766b54357..40cdfa58808bd 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -393,56 +393,55 @@ define i1 @any_of_cost(ptr %start, ptr %end) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[END1]], [[START2]]
; CHECK-NEXT: [[TMP1:%.*]] = udiv i64 [[TMP0]], 40
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 4, i64 [[N_MOD_VF]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP4]]
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[N_VEC]], 40
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[N_VEC]], 40
+; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP3]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 40
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 40
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 80
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 120
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]]
-; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP7]]
-; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP8]]
-; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 8
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[NEXT_GEP4]], i64 8
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 8
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP6]], i64 8
-; CHECK-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP10]], align 8
-; CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TMP11]], align 8
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP14]], i32 0
-; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x ptr> [[TMP16]], ptr [[TMP15]], i32 1
-; CHECK-NEXT: [[TMP18:%.*]] = load ptr, ptr [[TMP12]], align 8
-; CHECK-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP13]], align 8
-; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP18]], i32 0
-; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x ptr> [[TMP20]], ptr [[TMP19]], i32 1
-; CHECK-NEXT: [[TMP22:%.*]] = icmp eq <2 x ptr> [[TMP17]], zeroinitializer
-; CHECK-NEXT: [[TMP23:%.*]] = icmp eq <2 x ptr> [[TMP21]], zeroinitializer
-; CHECK-NEXT: [[TMP24:%.*]] = xor <2 x i1> [[TMP22]], <i1 true, i1 true>
-; CHECK-NEXT: [[TMP25:%.*]] = xor <2 x i1> [[TMP23]], <i1 true, i1 true>
-; CHECK-NEXT: [[TMP26]] = or <2 x i1> [[VEC_PHI]], [[TMP24]]
-; CHECK-NEXT: [[TMP27]] = or <2 x i1> [[VEC_PHI3]], [[TMP25]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 40
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 80
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 120
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]]
+; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]]
+; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]]
+; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 8
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[NEXT_GEP4]], i64 8
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 8
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[NEXT_GEP6]], i64 8
+; CHECK-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP12]], i32 0
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x ptr> [[TMP14]], ptr [[TMP13]], i32 1
+; CHECK-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP10]], align 8
+; CHECK-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP11]], align 8
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP16]], i32 0
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x ptr> [[TMP18]], ptr [[TMP17]], i32 1
+; CHECK-NEXT: [[TMP20:%.*]] = icmp eq <2 x ptr> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = icmp eq <2 x ptr> [[TMP19]], zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = xor <2 x i1> [[TMP20]], <i1 true, i1 true>
+; CHECK-NEXT: [[TMP23:%.*]] = xor <2 x i1> [[TMP21]], <i1 true, i1 true>
+; CHECK-NEXT: [[TMP24]] = or <2 x i1> [[VEC_PHI]], [[TMP22]]
+; CHECK-NEXT: [[TMP25]] = or <2 x i1> [[VEC_PHI3]], [[TMP23]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = or <2 x i1> [[TMP27]], [[TMP26]]
-; CHECK-NEXT: [[TMP29:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[BIN_RDX]])
-; CHECK-NEXT: [[TMP30:%.*]] = freeze i1 [[TMP29]]
-; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP30]], i1 false, i1 false
-; CHECK-NEXT: br label [[SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = or <2 x i1> [[TMP25]], [[TMP24]]
+; CHECK-NEXT: [[TMP27:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[BIN_RDX]])
+; CHECK-NEXT: [[TMP28:%.*]] = freeze i1 [[TMP27]]
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP28]], i1 false, i1 false
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i1 [ false, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
@@ -456,9 +455,9 @@ define i1 @any_of_cost(ptr %start, ptr %end) #0 {
; CHECK-NEXT: [[ANY_OF_NEXT]] = select i1 [[CMP13_NOT_NOT]], i1 [[ANY_OF]], i1 false
; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i8, ptr [[PTR_IV]], i64 40
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[PTR_IV]], [[END]]
-; CHECK-NEXT: br i1 [[CMP_NOT]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: exit:
-; CHECK-NEXT: [[ANY_OF_NEXT_LCSSA:%.*]] = phi i1 [ [[ANY_OF_NEXT]], [[LOOP]] ]
+; CHECK-NEXT: [[ANY_OF_NEXT_LCSSA:%.*]] = phi i1 [ [[ANY_OF_NEXT]], [[LOOP]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i1 [[ANY_OF_NEXT_LCSSA]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
index 51e10521a7a0e..cbe74638866ab 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
@@ -157,11 +157,11 @@ define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i8> [[WIDE_LOAD5]], i32 15
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement <16 x i8> [[TMP8]], i32 15
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT9:%.*]] = extractelement <16 x i8> [[TMP10]], i32 15
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <16 x i8> [[TMP10]], i32 15
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[SCALAR_RECUR_INIT10:%.*]] = phi i8 [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT9]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT7:%.*]] = phi i8 [ [[DOTPRE44]], [[FOR_BODY_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT10:%.*]] = phi i8 [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT8:%.*]] = phi i8 [ [[DOTPRE44]], [[FOR_BODY_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT6]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[DOTPRE45]], [[FOR_BODY_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
@@ -171,10 +171,10 @@ define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
; CHECK-NEXT: ret void
; CHECK: for.body:
; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i8 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP24:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR8:%.*]] = phi i8 [ [[SCALAR_RECUR_INIT7]], [[SCALAR_PH]] ], [ [[SCALAR_RECUR]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR11:%.*]] = phi i8 [ [[SCALAR_RECUR_INIT10]], [[SCALAR_PH]] ], [ [[SCALAR_RECUR8]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR9:%.*]] = phi i8 [ [[SCALAR_RECUR_INIT8]], [[SCALAR_PH]] ], [ [[SCALAR_RECUR]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR11:%.*]] = phi i8 [ [[SCALAR_RECUR_INIT10]], [[SCALAR_PH]] ], [ [[SCALAR_RECUR9]], [[FOR_BODY]] ]
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ADD8:%.*]] = add i8 [[SCALAR_RECUR8]], [[SCALAR_RECUR11]]
+; CHECK-NEXT: [[ADD8:%.*]] = add i8 [[SCALAR_RECUR9]], [[SCALAR_RECUR11]]
; CHECK-NEXT: [[ADD15:%.*]] = add i8 [[ADD8]], [[SCALAR_RECUR]]
; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP24]] = load i8, ptr [[ARRAYIDX18]], align 1
@@ -230,19 +230,118 @@ define i64 @test_pr62954_scalar_epilogue_required(ptr %A, ptr noalias %B, ptr %C
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 1, i64 3, i64 5, i64 7, i64 9, i64 11, i64 13, i64 15, i64 17, i64 19, i64 21, i64 23, i64 25, i64 27, i64 29, i64 31>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i64> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i64> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP98:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[STEP_ADD:%.*]] = add <16 x i64> [[VEC_IND]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
-; CHECK-NEXT: [[TMP0:%.*]] = sub nsw <16 x i64> zeroinitializer, [[VEC_IND]]
-; CHECK-NEXT: [[TMP1]] = sub nsw <16 x i64> zeroinitializer, [[STEP_ADD]]
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x i64> [[TMP1]], i32 15
-; CHECK-NEXT: store i64 [[TMP4]], ptr [[GEP]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 2
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[TMP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 10
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 14
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 16
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 18
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 20
+; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 22
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 24
+; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 26
+; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 28
+; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 30
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 32
+; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 34
+; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 36
+; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 38
+; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 40
+; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 42
+; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 44
+; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 46
+; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 48
+; CHECK-NEXT: [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 50
+; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 52
+; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[OFFSET_IDX]], 54
+; CHECK-NEXT: [[TMP29:%.*]] = add i64 [[OFFSET_IDX]], 56
+; CHECK-NEXT: [[TMP30:%.*]] = add i64 [[OFFSET_IDX]], 58
+; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[OFFSET_IDX]], 60
+; CHECK-NEXT: [[TMP32:%.*]] = add i64 [[OFFSET_IDX]], 62
+; CHECK-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP40:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP41:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP42:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP10]]
+; CHECK-NEXT: [[TMP43:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP45:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP46:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP14]]
+; CHECK-NEXT: [[TMP47:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP15]]
+; CHECK-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP16]]
+; CHECK-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP17]]
+; CHECK-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP18]]
+; CHECK-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP19]]
+; CHECK-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP20]]
+; CHECK-NEXT: [[TMP53:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP21]]
+; CHECK-NEXT: [[TMP54:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP22]]
+; CHECK-NEXT: [[TMP55:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP23]]
+; CHECK-NEXT: [[TMP56:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP24]]
+; CHECK-NEXT: [[TMP57:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP25]]
+; CHECK-NEXT: [[TMP58:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP26]]
+; CHECK-NEXT: [[TMP59:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP27]]
+; CHECK-NEXT: [[TMP60:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP28]]
+; CHECK-NEXT: [[TMP61:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP29]]
+; CHECK-NEXT: [[TMP62:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP30]]
+; CHECK-NEXT: [[TMP63:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP31]]
+; CHECK-NEXT: [[TMP64:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP32]]
+; CHECK-NEXT: [[TMP65:%.*]] = load double, ptr [[TMP33]], align 8
+; CHECK-NEXT: [[TMP66:%.*]] = load double, ptr [[TMP34]], align 8
+; CHECK-NEXT: [[TMP67:%.*]] = load double, ptr [[TMP35]], align 8
+; CHECK-NEXT: [[TMP68:%.*]] = load double, ptr [[TMP36]], align 8
+; CHECK-NEXT: [[TMP69:%.*]] = load double, ptr [[TMP37]], align 8
+; CHECK-NEXT: [[TMP70:%.*]] = load double, ptr [[TMP38]], align 8
+; CHECK-NEXT: [[TMP71:%.*]] = load double, ptr [[TMP39]], align 8
+; CHECK-NEXT: [[TMP72:%.*]] = load double, ptr [[TMP40]], align 8
+; CHECK-NEXT: [[TMP73:%.*]] = load double, ptr [[TMP41]], align 8
+; CHECK-NEXT: [[TMP74:%.*]] = load double, ptr [[TMP42]], align 8
+; CHECK-NEXT: [[TMP75:%.*]] = load double, ptr [[TMP43]], align 8
+; CHECK-NEXT: [[TMP76:%.*]] = load double, ptr [[TMP44]], align 8
+; CHECK-NEXT: [[TMP77:%.*]] = load double, ptr [[TMP45]], align 8
+; CHECK-NEXT: [[TMP78:%.*]] = load double, ptr [[TMP46]], align 8
+; CHECK-NEXT: [[TMP79:%.*]] = load double, ptr [[TMP47]], align 8
+; CHECK-NEXT: [[TMP80:%.*]] = load double, ptr [[TMP48]], align 8
+; CHECK-NEXT: [[TMP81:%.*]] = load double, ptr [[TMP49]], align 8
+; CHECK-NEXT: [[TMP82:%.*]] = load double, ptr [[TMP50]], align 8
+; CHECK-NEXT: [[TMP83:%.*]] = load double, ptr [[TMP51]], align 8
+; CHECK-NEXT: [[TMP84:%.*]] = load double, ptr [[TMP52]], align 8
+; CHECK-NEXT: [[TMP85:%.*]] = load double, ptr [[TMP53]], align 8
+; CHECK-NEXT: [[TMP86:%.*]] = load double, ptr [[TMP54]], align 8
+; CHECK-NEXT: [[TMP87:%.*]] = load double, ptr [[TMP55]], align 8
+; CHECK-NEXT: [[TMP88:%.*]] = load double, ptr [[TMP56]], align 8
+; CHECK-NEXT: [[TMP89:%.*]] = load double, ptr [[TMP57]], align 8
+; CHECK-NEXT: [[TMP90:%.*]] = load double, ptr [[TMP58]], align 8
+; CHECK-NEXT: [[TMP91:%.*]] = load double, ptr [[TMP59]], align 8
+; CHECK-NEXT: [[TMP92:%.*]] = load double, ptr [[TMP60]], align 8
+; CHECK-NEXT: [[TMP93:%.*]] = load double, ptr [[TMP61]], align 8
+; CHECK-NEXT: [[TMP94:%.*]] = load double, ptr [[TMP62]], align 8
+; CHECK-NEXT: [[TMP95:%.*]] = load double, ptr [[TMP63]], align 8
+; CHECK-NEXT: [[TMP96:%.*]] = load double, ptr [[TMP64]], align 8
+; CHECK-NEXT: [[TMP97:%.*]] = sub nsw <16 x i64> zeroinitializer, [[VEC_IND]]
+; CHECK-NEXT: [[TMP98]] = sub nsw <16 x i64> zeroinitializer, [[STEP_ADD]]
+; CHECK-NEXT: [[TMP99:%.*]] = extractelement <16 x i64> [[TMP98]], i32 15
+; CHECK-NEXT: store i64 [[TMP99]], ptr [[GEP]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[STEP_ADD]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
-; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: [[TMP100:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
+; CHECK-NEXT: br i1 [[TMP100]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i64> [[TMP1]], i32 15
-; CHECK-NEXT: br label [[SCALAR_PH]]
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <16 x i64> [[TMP98]], i32 14
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i64> [[TMP98]], i32 15
+; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[REC_START]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 65, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
@@ -250,16 +349,16 @@ define i64 @test_pr62954_scalar_epilogue_required(ptr %A, ptr noalias %B, ptr %C
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[NEG_IV:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[IV]]
+; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr double, ptr [[B]], i64 [[IV]]
; CHECK-NEXT: [[L_B:%.*]] = load double, ptr [[GEP_B]], align 8
; CHECK-NEXT: [[NEG_IV]] = sub nsw i64 0, [[IV]]
; CHECK-NEXT: store i64 [[NEG_IV]], ptr [[GEP]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2
; CHECK-NEXT: [[EC:%.*]] = icmp ugt i64 [[IV]], 74
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: exit:
-; CHECK-NEXT: [[DOTIN_LCSSA:%.*]] = phi i64 [ [[SCALAR_RECUR]], [[LOOP]] ]
-; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi double [ [[L_B]], [[LOOP]] ]
+; CHECK-NEXT: [[DOTIN_LCSSA:%.*]] = phi i64 [ [[SCALAR_RECUR]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi double [ [[L_B]], [[LOOP]] ], [ [[TMP96]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: store double [[DOTLCSSA]], ptr [[C:%.*]], align 8
; CHECK-NEXT: ret i64 [[DOTIN_LCSSA]]
;
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
index 2bb3c898c7cda..0fbaafe0ccb63 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
@@ -8,7 +8,7 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
; VF/IC combination.
define void @test_tc_less_than_16(ptr %A, i64 %N) {
; CHECK: LV: Scalarizing: %cmp =
-; CHECK-NEXT: VPlan 'Initial VPlan for VF={8},UF>=1' {
+; CHECK: VPlan 'Initial VPlan for VF={8},UF>=1' {
; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
>From 1d2c4fe89095b20c0ca1be3970db20d59a1e0312 Mon Sep 17 00:00:00 2001
From: Kolya Panchenko <kolya.panchenko at sifive.com>
Date: Tue, 25 Jun 2024 11:12:22 -0700
Subject: [PATCH 2/2] Addressed comments
---
.../Transforms/Vectorize/LoopVectorize.cpp | 30 +++++++++----------
1 file changed, 15 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index c7a74283b5258..b4547ccd2960e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1473,27 +1473,27 @@ class LoopVectorizationCostModel {
// iteration in scalar form.
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
LLVM_DEBUG(dbgs() << "LV: Loop with VF = " << VF
- << " requires scalar epilogue: multiple exists\n");
+ << " requires scalar epilogue: multiple exits\n");
return true;
}
if (VF.isVector()) {
if (InterleaveInfo.requiresScalarEpilogue()) {
// Make sure interleaved groups that require scalar epilogue will be
// widened.
- for (auto *G : InterleaveInfo.getInterleaveGroups()) {
- if (!G->requiresScalarEpilogue())
- continue;
-
- Instruction *I = G->getMember(0);
- InstWidening Decision = getWideningDecision(I, VF);
- if (Decision == CM_Interleave ||
- (Decision == CM_Unknown &&
- interleavedAccessCanBeWidened(G->getMember(0), VF))) {
- LLVM_DEBUG(dbgs() << "LV: Loop with VF = " << VF
- << " requires scalar epilogue: vectorizable "
- "interleaved group\n");
- return true;
- }
+ if (any_of(InterleaveInfo.getInterleaveGroups(), [&](auto *Group) {
+ if (!Group->requiresScalarEpilogue())
+ return false;
+
+ Instruction *I = Group->getMember(0);
+ InstWidening Decision = getWideningDecision(I, VF);
+ return Decision == CM_Interleave ||
+ (Decision == CM_Unknown &&
+ interleavedAccessCanBeWidened(I, VF));
+ })) {
+ LLVM_DEBUG(dbgs() << "LV: Loop with VF = " << VF
+ << " requires scalar epilogue: vectorizable "
+ "interleaved group\n");
+ return true;
}
}
}
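
For readers skimming the patch, a minimal standalone sketch of the predicate that the refactored check in requiresScalarEpilogue now implements: the loop only needs a scalar epilogue when some interleave group that requires one will actually be widened at the chosen VF. The types and names below are simplified stand-ins, not the real LLVM classes, and the cost-model decision (CM_Interleave, or CM_Unknown plus interleavedAccessCanBeWidened) is collapsed into a single flag.

    // Sketch only: assumed simplified types, not the actual
    // LoopVectorizationCostModel / InterleaveGroup classes.
    #include <algorithm>
    #include <vector>

    struct Group {
      // Group reads/writes past the last full vector iteration (has a tail).
      bool NeedsScalarEpilogue;
      // Stand-in for: Decision == CM_Interleave, or CM_Unknown and
      // interleavedAccessCanBeWidened(Member, VF) holds.
      bool WillBeWidened;
    };

    // Mirrors the any_of in the patch: a group that cannot be widened no
    // longer forces a scalar epilogue on the whole loop.
    bool requiresScalarEpilogue(const std::vector<Group> &Groups) {
      return std::any_of(Groups.begin(), Groups.end(), [](const Group &G) {
        return G.NeedsScalarEpilogue && G.WillBeWidened;
      });
    }

This matches the new RISC-V test above: its interleaved group with gaps is not widenable (it is scalarized into gathers), so the predicate returns false and the vector loop keeps a normal remainder check instead of an unconditional scalar epilogue.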