[llvm] [VPlan] Model FOR resume value extraction in VPlan. (PR #93396)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Sat May 25 21:32:00 PDT 2024
https://github.com/fhahn created https://github.com/llvm/llvm-project/pull/93396
This patch introduces a new ExtractRecurrenceResume VPInstruction opcode
to extract the value of a FOR to be used as resume value for the ph in
the scalar loop.
Note that it takes a VPFirstOrderRecurrencePHIRecipe as operand. This is
needed at the moment to conveniently let fixFixedOrderRecurrence still
handle creating and patching up the phis in the scalar pre-header and
header. As ExtractRecurrenceResume won't have any users yet, it is
marked as having side-effects until the phi in the scalar pre-header is
also created and managed by VPlan.
Depends on https://github.com/llvm/llvm-project/pull/93395
>From 5a26ba22b90743ec36c68b5efe7bba0535efef12 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 25 May 2024 22:45:52 +0100
Subject: [PATCH 1/2] [VPlan] Model FOR extract of exit value in VPlan.
This patch introduces a new ExtractRecurrenceResult VPInstruction opcode
to extract the value of a FOR for users outside the loop (i.e. in the
scalar loop's exits). This moves the first part of fixing first order
recurrences to VPlan, and removes some additional code to patch up
live-outs, which is now handled automatically.
The majority of test changes is due to changes in the order of which the
extracts are generated now. As we are now using VPTransformState to
generate the extracts, we may be able to re-use existing extracts in the
loop body in some cases. For scalable vectors, in some cases we now have
to compute the runtime VF twice, as each extract is now independent, but
those should be trivial to clean up for later passes (and in line with
other places in the code that also liberally re-compute runtime VFs).
---
.../Transforms/Vectorize/LoopVectorize.cpp | 38 -------------
llvm/lib/Transforms/Vectorize/VPlan.h | 8 ++-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 27 +++++++++
.../Transforms/Vectorize/VPlanTransforms.cpp | 8 +++
.../LoopVectorize/AArch64/induction-costs.ll | 6 +-
.../AArch64/loop-vectorization-factors.ll | 2 +-
.../first-order-recurrence-chains.ll | 56 +++++++++----------
.../LoopVectorize/first-order-recurrence.ll | 24 ++++----
.../scalable-first-order-recurrence.ll | 12 ++--
.../LoopVectorize/vplan-printing.ll | 3 +-
10 files changed, 93 insertions(+), 91 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 48981a6bd39e3..71ee690a8fabd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3536,44 +3536,6 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence(
Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract");
}
- auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin());
- assert(PhiR->getNumUsers() == 1 &&
- RecurSplice->getOpcode() ==
- VPInstruction::FirstOrderRecurrenceSplice &&
- "recurrence phi must have a single user: FirstOrderRecurrenceSplice");
- SmallVector<VPLiveOut *> LiveOuts;
- for (VPUser *U : RecurSplice->users())
- if (auto *LiveOut = dyn_cast<VPLiveOut>(U))
- LiveOuts.push_back(LiveOut);
-
- if (!LiveOuts.empty()) {
- // Extract the second last element in the middle block if the
- // Phi is used outside the loop. We need to extract the phi itself
- // and not the last element (the phi update in the current iteration). This
- // will be the value when jumping to the exit block from the
- // LoopMiddleBlock, when the scalar loop is not run at all.
- Value *ExtractForPhiUsedOutsideLoop = nullptr;
- if (VF.isVector()) {
- auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
- ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
- Incoming, Idx, "vector.recur.extract.for.phi");
- } else {
- assert(UF > 1 && "VF and UF cannot both be 1");
- // When loop is unrolled without vectorizing, initialize
- // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled
- // value of `Incoming`. This is analogous to the vectorized case above:
- // extracting the second last element when VF > 1.
- ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
- }
-
- for (VPLiveOut *LiveOut : LiveOuts) {
- assert(!Cost->requiresScalarEpilogue(VF.isVector()));
- PHINode *LCSSAPhi = LiveOut->getPhi();
- LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
- State.Plan->removeLiveOut(LCSSAPhi);
- }
- }
-
// Fix the initial value of the original recurrence in the scalar loop.
Builder.SetInsertPoint(LoopScalarPreHeader, LoopScalarPreHeader->begin());
PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 3aee17921086d..91cbd459675d9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -167,8 +167,8 @@ class VPLane {
static VPLane getFirstLane() { return VPLane(0, VPLane::Kind::First); }
- static VPLane getLastLaneForVF(const ElementCount &VF) {
- unsigned LaneOffset = VF.getKnownMinValue() - 1;
+ static VPLane getLastLaneForVF(const ElementCount &VF, unsigned Offset = 1) {
+ unsigned LaneOffset = VF.getKnownMinValue() - Offset;
Kind LaneKind;
if (VF.isScalable())
// In this case 'LaneOffset' refers to the offset from the start of the
@@ -1179,6 +1179,7 @@ class VPInstruction : public VPRecipeWithIRFlags {
BranchOnCount,
BranchOnCond,
ComputeReductionResult,
+ ExtractRecurrenceResult,
LogicalAnd, // Non-poison propagating logical And.
// Add an offset in bytes (second operand) to a base pointer (first
// operand). Only generates scalar values (either for the first lane only or
@@ -3612,7 +3613,8 @@ inline bool isUniformAfterVectorization(VPValue *VPV) {
if (auto *GEP = dyn_cast<VPWidenGEPRecipe>(Def))
return all_of(GEP->operands(), isUniformAfterVectorization);
if (auto *VPI = dyn_cast<VPInstruction>(Def))
- return VPI->getOpcode() == VPInstruction::ComputeReductionResult;
+ return VPI->getOpcode() == VPInstruction::ComputeReductionResult ||
+ VPI->getOpcode() == VPInstruction::ExtractRecurrenceResult;
return false;
}
} // end namespace vputils
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 5eb99ffd1e10e..82e3bd530a8d6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -137,6 +137,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPInstruction::Not:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
+ case VPInstruction::ExtractRecurrenceResult:
case VPInstruction::LogicalAnd:
case VPInstruction::PtrAdd:
return false;
@@ -300,6 +301,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ComputeReductionResult:
+ case VPInstruction::ExtractRecurrenceResult:
case VPInstruction::PtrAdd:
case VPInstruction::ExplicitVectorLength:
return true;
@@ -558,6 +560,27 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
return ReducedPartRdx;
}
+ case VPInstruction::ExtractRecurrenceResult: {
+ if (Part != 0)
+ return State.get(this, 0, /*IsScalar*/ true);
+
+ // Extract the second last element in the middle block for users outside the
+ // loop.
+ Value *Res;
+ if (State.VF.isVector()) {
+ Res = State.get(
+ getOperand(0),
+ VPIteration(State.UF - 1, VPLane::getLastLaneForVF(State.VF, 2)));
+ } else {
+ assert(State.UF > 1 && "VF and UF cannot both be 1");
+ // When loop is unrolled without vectorizing, retrieve the value just
+ // prior to the final unrolled value. This is analogous to the vectorized
+ // case above: extracting the second last element when VF > 1.
+ Res = State.get(getOperand(0), State.UF - 2);
+ }
+ Res->setName(Name);
+ return Res;
+ }
case VPInstruction::LogicalAnd: {
Value *A = State.get(getOperand(0), Part);
Value *B = State.get(getOperand(1), Part);
@@ -598,6 +621,7 @@ void VPInstruction::execute(VPTransformState &State) {
bool GeneratesPerFirstLaneOnly =
canGenerateScalarForFirstLane() &&
(vputils::onlyFirstLaneUsed(this) ||
+ getOpcode() == VPInstruction::ExtractRecurrenceResult ||
getOpcode() == VPInstruction::ComputeReductionResult);
bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
for (unsigned Part = 0; Part < State.UF; ++Part) {
@@ -692,6 +716,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::BranchOnCount:
O << "branch-on-count";
break;
+ case VPInstruction::ExtractRecurrenceResult:
+ O << "extract-recurrence-result";
+ break;
case VPInstruction::ComputeReductionResult:
O << "compute-reduction-result";
break;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 422579ea8b84f..f4f43ed15615e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -812,6 +812,8 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
RecurrencePhis.push_back(FOR);
+ VPBuilder MiddleBuilder(
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor()));
for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
@@ -843,6 +845,12 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
// Set the first operand of RecurSplice to FOR again, after replacing
// all users.
RecurSplice->setOperand(0, FOR);
+
+ auto *Result = cast<VPInstruction>(MiddleBuilder.createNaryOp(
+ VPInstruction::ExtractRecurrenceResult, {FOR->getBackedgeValue()}, {},
+ "vector.recur.extract.for.phi"));
+ RecurSplice->replaceUsesWithIf(
+ Result, [](VPUser &U, unsigned) { return isa<VPLiveOut>(&U); });
}
return true;
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
index 4d9c850abdf3d..162fb9c802dd2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
@@ -126,9 +126,9 @@ define i64 @pointer_induction_only(ptr %start, ptr %end) {
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -191,8 +191,8 @@ define i64 @int_and_pointer_iv(ptr %start, i32 %N) {
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -332,9 +332,9 @@ define i64 @test_ptr_ivs_and_widened_ivs(ptr %src, i32 %N) {
; DEFAULT-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; DEFAULT-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; DEFAULT: middle.block:
+; DEFAULT-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP15]], i32 2
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; DEFAULT-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP15]], i32 3
-; DEFAULT-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP15]], i32 2
; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; DEFAULT: scalar.ph:
; DEFAULT-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
index f2f58f89eb95d..718148a67fcc8 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -870,9 +870,9 @@ define i8 @add_phifail2(ptr noalias nocapture readonly %p, ptr noalias nocapture
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK: middle.block:
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <16 x i32> [[TMP6]], i32 14
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <16 x i32> [[TMP6]], i32 14
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
index c663d2b15b587..a1173c6b46a2c 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
@@ -18,10 +18,10 @@ define i16 @test_chained_first_order_recurrences_1(ptr %ptr) {
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP8]], label %middle.block, label %vector.body
; CHECK: middle.block:
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
;
entry:
br label %loop
@@ -61,10 +61,10 @@ define i16 @test_chained_first_order_recurrences_2(ptr %ptr) {
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP8]], label %middle.block, label %vector.body, !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
;
entry:
br label %loop
@@ -107,12 +107,12 @@ define i16 @test_chained_first_order_recurrences_3(ptr %ptr) {
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
;
entry:
br label %loop
@@ -219,12 +219,12 @@ define i16 @test_chained_first_order_recurrences_3_reordered_1(ptr %ptr) {
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
;
entry:
br label %loop
@@ -270,12 +270,12 @@ define i16 @test_chained_first_order_recurrences_3_reordered_2(ptr %ptr) {
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
;
entry:
br label %loop
@@ -321,12 +321,12 @@ define i16 @test_chained_first_order_recurrences_3_for2_no_other_uses(ptr %ptr)
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
;
entry:
br label %loop
@@ -371,12 +371,12 @@ define i16 @test_chained_first_order_recurrences_3_for1_for2_no_other_uses(ptr %
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
;
entry:
br label %loop
@@ -420,10 +420,10 @@ define double @test_chained_first_order_recurrence_sink_users_1(ptr %ptr) {
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996
; CHECK-NEXT: br i1 [[TMP9]], label %middle.block, label %vector.body, !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x double> [[TMP4]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x double> [[TMP4]], i32 2
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x double> [[TMP4]], i32 3
;
entry:
br label %loop
@@ -488,8 +488,8 @@ define i64 @test_first_order_recurrences_and_induction(ptr %ptr) {
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP5]], label %middle.block, label %vector.body
; CHECK: middle.block:
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 2
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3
; CHECK-NEXT: br i1 true
entry:
@@ -528,8 +528,8 @@ define i64 @test_first_order_recurrences_and_induction2(ptr %ptr) {
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP5]], label %middle.block, label %vector.body
; CHECK: middle.block:
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 2
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3
; CHECK-NEXT: br i1 true
;
entry:
@@ -569,8 +569,8 @@ define ptr @test_first_order_recurrences_and_pointer_induction1(ptr %ptr) {
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP5]], label %middle.block, label %vector.body
; CHECK: middle.block:
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 2
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 3
; CHECK-NEXT: br i1 true
;
entry:
@@ -613,8 +613,8 @@ define ptr @test_first_order_recurrences_and_pointer_induction2(ptr %ptr) {
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP5]], label %middle.block, label %vector.body
; CHECK: middle.block:
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 2
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 3
; CHECK-NEXT: br i1 true
;
entry:
@@ -660,12 +660,12 @@ define double @test_resinking_required(ptr %p, ptr noalias %a, ptr noalias %b) {
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[BROADCAST_SPLAT]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> [[BROADCAST_SPLAT]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT5:%.*]] = extractelement <4 x double> [[BROADCAST_SPLAT4]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI6:%.*]] = extractelement <4 x double> [[BROADCAST_SPLAT4]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT9:%.*]] = extractelement <4 x double> [[TMP4]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI10:%.*]] = extractelement <4 x double> [[TMP4]], i32 2
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[BROADCAST_SPLAT]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT5:%.*]] = extractelement <4 x double> [[BROADCAST_SPLAT4]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT9:%.*]] = extractelement <4 x double> [[TMP4]], i32 3
; CHECK-NEXT: br i1 true, label %End, label %scalar.ph
;
Entry:
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index 0a37e5ea0ca00..a588d71a2c76e 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -918,9 +918,9 @@ define i32 @PR27246() {
; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; UNROLL-NO-IC: middle.block:
+; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 2
; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 3
-; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 2
; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
; UNROLL-NO-IC: scalar.ph:
; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[E_015]], [[FOR_COND1_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -1012,9 +1012,9 @@ define i32 @PR27246() {
; SINK-AFTER-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; SINK-AFTER-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; SINK-AFTER: middle.block:
+; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 2
; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]]
; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 3
-; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 2
; SINK-AFTER-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
; SINK-AFTER: scalar.ph:
; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[E_015]], [[FOR_COND1_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -1114,12 +1114,12 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
; UNROLL-NO-IC-NEXT: [[TMP34:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP30]], i32 3
; UNROLL-NO-IC-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP23]], align 4
; UNROLL-NO-IC-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP24]], align 4
-; UNROLL-NO-IC-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP25]], align 4
+; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = load i32, ptr [[TMP25]], align 4
; UNROLL-NO-IC-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP26]], align 4
; UNROLL-NO-IC-NEXT: [[TMP39:%.*]] = insertelement <4 x i32> poison, i32 [[TMP35]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP40:%.*]] = insertelement <4 x i32> [[TMP39]], i32 [[TMP36]], i32 1
-; UNROLL-NO-IC-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> [[TMP40]], i32 [[TMP37]], i32 2
-; UNROLL-NO-IC-NEXT: [[TMP42]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP38]], i32 3
+; UNROLL-NO-IC-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> [[TMP40]], i32 [[VECTOR_RECUR_EXTRACT_FOR_PHI]], i32 2
+; UNROLL-NO-IC-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP38]], i32 3
; UNROLL-NO-IC-NEXT: [[TMP43:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP34]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP44:%.*]] = shufflevector <4 x i32> [[TMP34]], <4 x i32> [[TMP42]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
@@ -1128,7 +1128,6 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
; UNROLL-NO-IC: middle.block:
; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP42]], i32 3
-; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP42]], i32 2
; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-IC: scalar.ph:
; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[PRE_LOAD]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -1223,11 +1222,11 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
; SINK-AFTER-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]]
; SINK-AFTER-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 4
; SINK-AFTER-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP12]], align 4
-; SINK-AFTER-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP13]], align 4
+; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = load i32, ptr [[TMP13]], align 4
; SINK-AFTER-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP14]], align 4
; SINK-AFTER-NEXT: [[TMP19:%.*]] = insertelement <4 x i32> poison, i32 [[TMP15]], i32 0
; SINK-AFTER-NEXT: [[TMP20:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP16]], i32 1
-; SINK-AFTER-NEXT: [[TMP21:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP17]], i32 2
+; SINK-AFTER-NEXT: [[TMP21:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[VECTOR_RECUR_EXTRACT_FOR_PHI]], i32 2
; SINK-AFTER-NEXT: [[TMP22]] = insertelement <4 x i32> [[TMP21]], i32 [[TMP18]], i32 3
; SINK-AFTER-NEXT: [[TMP23:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP22]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -1236,7 +1235,6 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
; SINK-AFTER: middle.block:
; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP22]], i32 3
-; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP22]], i32 2
; SINK-AFTER-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; SINK-AFTER: scalar.ph:
; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[PRE_LOAD]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -1403,8 +1401,8 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x) {
; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
; UNROLL-NO-IC-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
; UNROLL-NO-IC-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-IC: scalar.ph:
; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -1473,8 +1471,8 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x) {
; SINK-AFTER-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
; SINK-AFTER-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; SINK-AFTER: middle.block:
-; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
+; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
; SINK-AFTER-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; SINK-AFTER: scalar.ph:
; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -3511,8 +3509,8 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) {
; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
; UNROLL-NO-IC-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP7]], i32 3
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP7]], i32 2
+; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP7]], i32 3
; UNROLL-NO-IC-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-IC: scalar.ph:
; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -3607,8 +3605,8 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) {
; SINK-AFTER-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
; SINK-AFTER-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
; SINK-AFTER: middle.block:
-; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
; SINK-AFTER-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; SINK-AFTER: scalar.ph:
; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
index d64755999635c..b959894b671e1 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
@@ -25,10 +25,12 @@ define i32 @recurrence_1(ptr nocapture readonly %a, ptr nocapture %b, i32 %n) {
; CHECK-VF4UF1: middle.block:
; CHECK-VF4UF1: %[[VSCALE2:.*]] = call i32 @llvm.vscale.i32()
; CHECK-VF4UF1: %[[MUL2:.*]] = mul i32 %[[VSCALE2]], 4
-; CHECK-VF4UF1: %[[SUB2:.*]] = sub i32 %[[MUL2]], 1
-; CHECK-VF4UF1: %[[VEC_RECUR_EXT:.*]] = extractelement <vscale x 4 x i32> %[[LOAD]], i32 %[[SUB2]]
; CHECK-VF4UF1: %[[SUB3:.*]] = sub i32 %[[MUL2]], 2
; CHECK-VF4UF1: %[[VEC_RECUR_FOR_PHI:.*]] = extractelement <vscale x 4 x i32> %[[LOAD]], i32 %[[SUB3]]
+; CHECK-VF4UF1: %[[VSCALE3:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1: %[[MUL3:.*]] = mul i32 %[[VSCALE3]], 4
+; CHECK-VF4UF1: %[[SUB3:.*]] = sub i32 %[[MUL3]], 1
+; CHECK-VF4UF1: %[[VEC_RECUR_EXT:.*]] = extractelement <vscale x 4 x i32> %[[LOAD]], i32 %[[SUB3]]
entry:
br label %for.preheader
@@ -211,10 +213,12 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x) {
; CHECK-VF4UF2: middle.block
; CHECK-VF4UF2: %[[VSCALE2:.*]] = call i32 @llvm.vscale.i32()
; CHECK-VF4UF2: %[[MUL2:.*]] = mul i32 %[[VSCALE2]], 4
-; CHECK-VF4UF2: %[[SUB2:.*]] = sub i32 %[[MUL2]], 1
-; CHECK-VF4UF2: %vector.recur.extract = extractelement <vscale x 4 x i32> %[[ADD2]], i32 %[[SUB2]]
; CHECK-VF4UF2: %[[SUB3:.*]] = sub i32 %[[MUL2]], 2
; CHECK-VF4UF2: %vector.recur.extract.for.phi = extractelement <vscale x 4 x i32> %[[ADD2]], i32 %[[SUB3]]
+; CHECK-VF4UF2: %[[VSCALE3:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2: %[[MUL3:.*]] = mul i32 %[[VSCALE3]], 4
+; CHECK-VF4UF2: %[[SUB2:.*]] = sub i32 %[[MUL3]], 1
+; CHECK-VF4UF2: %vector.recur.extract = extractelement <vscale x 4 x i32> %[[ADD2]], i32 %[[SUB2]]
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index c84191665d949..aca3608cb1ede 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -911,9 +911,10 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) {
; CHECK-NEXT: Successor(s): middle.block
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<[[FOR_RESULT:%.+]]> = extract-recurrence-result ir<%for.1.next>
; CHECK-NEXT: No successors
; CHECK-EMPTY:
-; CHECK-NEXT: Live-out i16 %for.1.lcssa = vp<[[FOR1_SPLICE]]>
+; CHECK-NEXT: Live-out i16 %for.1.lcssa = vp<[[FOR_RESULT]]>
; CHECK-NEXT: }
;
entry:
>From 9094aa145913a2120465b365b9f2a37f0c601054 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 25 May 2024 23:45:11 +0100
Subject: [PATCH 2/2] [VPlan] Model FOR resume value extraction in VPlan.
This patch introduces a new ExtractRecurrenceResume VPInstruction opcode
to extract the value of a FOR to be used as resume value for the ph in
the scalar loop.
Note that it takes a VPFirstOrderRecurrencePHIRecipe as operand. This is
needed at the moment to conveniently let fixFixedOrderRecurrence still
handle creating and patching up the phis in the scalar pre-header and
header. As ExtractRecurrenceResume won't have any users yet, it is
marked as having side-effects until the phi in the scalar pre-header is
also created and managed by VPlan.
Depends on https://github.com/llvm/llvm-project/pull/93395
---
.../Transforms/Vectorize/LoopVectorize.cpp | 91 ++++---------------
llvm/lib/Transforms/Vectorize/VPlan.h | 4 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 16 ++++
.../Transforms/Vectorize/VPlanTransforms.cpp | 51 +++++++++++
.../AArch64/fixed-order-recurrence.ll | 4 +-
.../LoopVectorize/AArch64/induction-costs.ll | 2 +-
.../AArch64/loop-vectorization-factors.ll | 4 +-
.../AArch64/reduction-recurrence-costs-sve.ll | 2 +-
.../AArch64/sve-interleaved-accesses.ll | 2 +-
.../X86/fixed-order-recurrence.ll | 4 +-
.../Transforms/LoopVectorize/X86/pr72969.ll | 2 +-
.../first-order-recurrence-chains-vplan.ll | 5 +
.../first-order-recurrence-chains.ll | 30 +++---
.../first-order-recurrence-complex.ll | 4 +-
...-order-recurrence-sink-replicate-region.ll | 6 ++
.../LoopVectorize/first-order-recurrence.ll | 54 +++++------
.../Transforms/LoopVectorize/induction.ll | 10 +-
.../interleave-and-scalarize-only.ll | 1 +
.../LoopVectorize/interleaved-accesses.ll | 2 +-
.../LoopVectorize/vplan-printing.ll | 1 +
20 files changed, 156 insertions(+), 139 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 71ee690a8fabd..6e8bd83473148 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -608,8 +608,7 @@ class InnerLoopVectorizer {
/// Create the exit value of first order recurrences in the middle block and
/// update their users.
- void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
- VPTransformState &State);
+ void fixFixedOrderRecurrence(VPInstruction *VPI, VPTransformState &State);
/// Iteratively sink the scalarized operands of a predicated instruction into
/// the block that was created for it.
@@ -3391,16 +3390,18 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
fixNonInductionPHIs(Plan, State);
// At this point every instruction in the original loop is widened to a
- // vector form. Now we need to fix the recurrences in the loop. These PHI
- // nodes are currently empty because we did not want to introduce cycles.
- // This is the second stage of vectorizing recurrences. Note that fixing
- // reduction phis are already modeled in VPlan.
- // TODO: Also model fixing fixed-order recurrence phis in VPlan.
+ // vector form. Note that fixing reduction phis, as well as extracting the
+ // exit and resume values for fixed-order recurrences are already modeled in
+ // VPlan. All that remains to do here is creating a phi in the scalar
+ // pre-header for each fixed-rder recurrence resume value.
+ // TODO: Also model creating phis in the scalar pre-header in VPlan.
VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
- VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
- for (VPRecipeBase &R : HeaderVPBB->phis()) {
- if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
- fixFixedOrderRecurrence(FOR, State);
+ VPBasicBlock *ExitVPBB =
+ cast<VPBasicBlock>(VectorRegion->getSingleSuccessor());
+ for (VPRecipeBase &R : *ExitVPBB) {
+ if (auto *VPI = dyn_cast<VPInstruction>(&R))
+ if (VPI->getOpcode() == VPInstruction::ExtractRecurrenceResume)
+ fixFixedOrderRecurrence(VPI, State);
}
// Forget the original basic block.
@@ -3469,73 +3470,13 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
VF.getKnownMinValue() * UF);
}
-void InnerLoopVectorizer::fixFixedOrderRecurrence(
- VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
- // This is the second phase of vectorizing first-order recurrences. An
- // overview of the transformation is described below. Suppose we have the
- // following loop.
- //
- // for (int i = 0; i < n; ++i)
- // b[i] = a[i] - a[i - 1];
- //
- // There is a first-order recurrence on "a". For this loop, the shorthand
- // scalar IR looks like:
- //
- // scalar.ph:
- // s_init = a[-1]
- // br scalar.body
- //
- // scalar.body:
- // i = phi [0, scalar.ph], [i+1, scalar.body]
- // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
- // s2 = a[i]
- // b[i] = s2 - s1
- // br cond, scalar.body, ...
- //
- // In this example, s1 is a recurrence because it's value depends on the
- // previous iteration. In the first phase of vectorization, we created a
- // vector phi v1 for s1. We now complete the vectorization and produce the
- // shorthand vector IR shown below (for VF = 4, UF = 1).
- //
- // vector.ph:
- // v_init = vector(..., ..., ..., a[-1])
- // br vector.body
- //
- // vector.body
- // i = phi [0, vector.ph], [i+4, vector.body]
- // v1 = phi [v_init, vector.ph], [v2, vector.body]
- // v2 = a[i, i+1, i+2, i+3];
- // v3 = vector(v1(3), v2(0, 1, 2))
- // b[i, i+1, i+2, i+3] = v2 - v3
- // br cond, vector.body, middle.block
- //
- // middle.block:
- // x = v2(3)
- // br scalar.ph
- //
- // scalar.ph:
- // s_init = phi [x, middle.block], [a[-1], otherwise]
- // br scalar.body
- //
- // After execution completes the vector loop, we extract the next value of
- // the recurrence (x) to use as the initial value in the scalar loop.
-
+void InnerLoopVectorizer::fixFixedOrderRecurrence(VPInstruction *VPI,
+ VPTransformState &State) {
// Extract the last vector element in the middle block. This will be the
// initial value for the recurrence when jumping to the scalar loop.
- VPValue *PreviousDef = PhiR->getBackedgeValue();
- Value *Incoming = State.get(PreviousDef, UF - 1);
- auto *ExtractForScalar = Incoming;
- auto *IdxTy = Builder.getInt32Ty();
- Value *RuntimeVF = nullptr;
- if (VF.isVector()) {
- auto *One = ConstantInt::get(IdxTy, 1);
- Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
- RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
- auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
- ExtractForScalar =
- Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract");
- }
+ Value *ExtractForScalar = State.get(VPI, UF - 1, true);
+ auto *PhiR = cast<VPFirstOrderRecurrencePHIRecipe>(VPI->getOperand(0));
// Fix the initial value of the original recurrence in the scalar loop.
Builder.SetInsertPoint(LoopScalarPreHeader, LoopScalarPreHeader->begin());
PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 91cbd459675d9..6c63c4715f697 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1180,6 +1180,7 @@ class VPInstruction : public VPRecipeWithIRFlags {
BranchOnCond,
ComputeReductionResult,
ExtractRecurrenceResult,
+ ExtractRecurrenceResume,
LogicalAnd, // Non-poison propagating logical And.
// Add an offset in bytes (second operand) to a base pointer (first
// operand). Only generates scalar values (either for the first lane only or
@@ -3614,7 +3615,8 @@ inline bool isUniformAfterVectorization(VPValue *VPV) {
return all_of(GEP->operands(), isUniformAfterVectorization);
if (auto *VPI = dyn_cast<VPInstruction>(Def))
return VPI->getOpcode() == VPInstruction::ComputeReductionResult ||
- VPI->getOpcode() == VPInstruction::ExtractRecurrenceResult;
+ VPI->getOpcode() == VPInstruction::ExtractRecurrenceResult ||
+ VPI->getOpcode() == VPInstruction::ExtractRecurrenceResume;
return false;
}
} // end namespace vputils
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 82e3bd530a8d6..8fe9c049ca161 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -302,6 +302,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ComputeReductionResult:
case VPInstruction::ExtractRecurrenceResult:
+ case VPInstruction::ExtractRecurrenceResume:
case VPInstruction::PtrAdd:
case VPInstruction::ExplicitVectorLength:
return true;
@@ -581,6 +582,18 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
Res->setName(Name);
return Res;
}
+ case VPInstruction::ExtractRecurrenceResume: {
+ if (Part != 0)
+ return State.get(this, 0, /*IsScalar*/ true);
+
+ auto *PhiR = cast<VPFirstOrderRecurrencePHIRecipe>(getOperand(0));
+ VPValue *PreviousDef = PhiR->getBackedgeValue();
+ Value *Res =
+ State.get(PreviousDef, VPIteration(State.UF - 1,
+ VPLane::getLastLaneForVF(State.VF)));
+ Res->setName(Name);
+ return Res;
+ }
case VPInstruction::LogicalAnd: {
Value *A = State.get(getOperand(0), Part);
Value *B = State.get(getOperand(1), Part);
@@ -716,6 +729,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::BranchOnCount:
O << "branch-on-count";
break;
+ case VPInstruction::ExtractRecurrenceResume:
+ O << "extract-recurrence-resume";
+ break;
case VPInstruction::ExtractRecurrenceResult:
O << "extract-recurrence-result";
break;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index f4f43ed15615e..3099bd8c620de 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -846,11 +846,62 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
// all users.
RecurSplice->setOperand(0, FOR);
+ // This is the second phase of vectorizing first-order recurrences. An
+ // overview of the transformation is described below. Suppose we have the
+ // following loop.
+ //
+ // for (int i = 0; i < n; ++i)
+ // b[i] = a[i] - a[i - 1];
+ //
+ // There is a first-order recurrence on "a". For this loop, the shorthand
+ // scalar IR looks like:
+ //
+ // scalar.ph:
+ // s_init = a[-1]
+ // br scalar.body
+ //
+ // scalar.body:
+ // i = phi [0, scalar.ph], [i+1, scalar.body]
+ // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
+ // s2 = a[i]
+ // b[i] = s2 - s1
+ // br cond, scalar.body, ...
+ //
+ // In this example, s1 is a recurrence because it's value depends on the
+ // previous iteration. In the first phase of vectorization, we created a
+ // vector phi v1 for s1. We now complete the vectorization and produce the
+ // shorthand vector IR shown below (for VF = 4, UF = 1).
+ //
+ // vector.ph:
+ // v_init = vector(..., ..., ..., a[-1])
+ // br vector.body
+ //
+ // vector.body
+ // i = phi [0, vector.ph], [i+4, vector.body]
+ // v1 = phi [v_init, vector.ph], [v2, vector.body]
+ // v2 = a[i, i+1, i+2, i+3];
+ // v3 = vector(v1(3), v2(0, 1, 2))
+ // b[i, i+1, i+2, i+3] = v2 - v3
+ // br cond, vector.body, middle.block
+ //
+ // middle.block:
+ // x = v2(3)
+ // br scalar.ph
+ //
+ // scalar.ph:
+ // s_init = phi [x, middle.block], [a[-1], otherwise]
+ // br scalar.body
+ //
+ // After execution completes the vector loop, we extract the next value of
+ // the recurrence (x) to use as the initial value in the scalar loop. This
+ // is modeled by ExtractRecurrenceResume.
auto *Result = cast<VPInstruction>(MiddleBuilder.createNaryOp(
VPInstruction::ExtractRecurrenceResult, {FOR->getBackedgeValue()}, {},
"vector.recur.extract.for.phi"));
RecurSplice->replaceUsesWithIf(
Result, [](VPUser &U, unsigned) { return isa<VPLiveOut>(&U); });
+ MiddleBuilder.createNaryOp(VPInstruction::ExtractRecurrenceResume, {FOR},
+ {}, "vector.recur.extract");
}
return true;
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll
index 33d7a3a3c8ac0..50ab61cac5b1f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll
@@ -47,8 +47,8 @@ define void @firstorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i8> [[WIDE_LOAD1]], i32 15
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -154,10 +154,10 @@ define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i8> [[WIDE_LOAD5]], i32 15
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement <16 x i8> [[TMP8]], i32 15
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT9:%.*]] = extractelement <16 x i8> [[TMP10]], i32 15
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT10:%.*]] = phi i8 [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT9]], [[MIDDLE_BLOCK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
index 162fb9c802dd2..32dc2ec1d50a8 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
@@ -127,8 +127,8 @@ define i64 @pointer_induction_only(ptr %start, ptr %end) {
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
index 718148a67fcc8..b7463032cada8 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -786,8 +786,8 @@ define void @add_phifail(ptr noalias nocapture readonly %p, ptr noalias nocaptur
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP4]], i32 15
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -871,8 +871,8 @@ define i8 @add_phifail2(ptr noalias nocapture readonly %p, ptr noalias nocapture
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <16 x i32> [[TMP6]], i32 14
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
index c24c1a38177d5..1353290fa2e28 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
@@ -100,7 +100,6 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
; DEFAULT: middle.block:
; DEFAULT-NEXT: [[BIN_RDX:%.*]] = or <vscale x 4 x i32> [[TMP58]], [[TMP57]]
; DEFAULT-NEXT: [[TMP60:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
-; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; DEFAULT-NEXT: [[TMP61:%.*]] = call i32 @llvm.vscale.i32()
; DEFAULT-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], 4
; DEFAULT-NEXT: [[TMP63:%.*]] = sub i32 [[TMP62]], 1
@@ -109,6 +108,7 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
; DEFAULT-NEXT: [[TMP65:%.*]] = mul i32 [[TMP64]], 4
; DEFAULT-NEXT: [[TMP66:%.*]] = sub i32 [[TMP65]], 1
; DEFAULT-NEXT: [[VECTOR_RECUR_EXTRACT13:%.*]] = extractelement <vscale x 4 x i32> [[TMP20]], i32 [[TMP66]]
+; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; DEFAULT: scalar.ph:
; DEFAULT-NEXT: [[SCALAR_RECUR_INIT14:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT13]], [[MIDDLE_BLOCK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 1853e551806bc..3a25ffe26cc03 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -1509,11 +1509,11 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) #1 {
; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
; CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP31:%.*]] = shl nuw nsw i32 [[TMP30]], 2
; CHECK-NEXT: [[TMP32:%.*]] = add nsw i32 [[TMP31]], -1
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_MASKED_GATHER4]], i32 [[TMP32]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
index 8004563f38165..94575004e364b 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
@@ -47,8 +47,8 @@ define void @firstorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i8> [[WIDE_LOAD1]], i32 15
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -154,10 +154,10 @@ define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i8> [[WIDE_LOAD5]], i32 15
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement <16 x i8> [[TMP8]], i32 15
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT9:%.*]] = extractelement <16 x i8> [[TMP10]], i32 15
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT10:%.*]] = phi i8 [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT9]], [[MIDDLE_BLOCK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll
index 45af544f0304d..829a91f18e5e2 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll
@@ -83,8 +83,8 @@ define void @test(ptr %p) {
; VEC-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; VEC-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VEC: middle.block:
-; VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
; VEC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP28]], i32 3
+; VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
; VEC-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; VEC: scalar.ph:
; VEC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 1, [[VECTOR_SCEVCHECK]] ], [ 1, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
index c04178a1c13e2..d5a33e9360529 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
@@ -33,6 +33,8 @@ define void @test_chained_first_order_recurrences_1(ptr %ptr) {
; CHECK-NEXT: Successor(s): middle.block
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<{{.+}}> = extract-recurrence-resume ir<%for.1>
+; CHECK-NEXT: EMIT vp<{{.+}}> = extract-recurrence-resume ir<%for.2>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
;
@@ -89,6 +91,9 @@ define void @test_chained_first_order_recurrences_3(ptr %ptr) {
; CHECK-NEXT: Successor(s): middle.block
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<{{.+}}> = extract-recurrence-resume ir<%for.1>
+; CHECK-NEXT: EMIT vp<{{.+}}> = extract-recurrence-resume ir<%for.2>
+; CHECK-NEXT: EMIT vp<{{.+}}> = extract-recurrence-resume ir<%for.3>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
;
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
index a1173c6b46a2c..447f0b0bfee20 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
@@ -19,8 +19,8 @@ define i16 @test_chained_first_order_recurrences_1(ptr %ptr) {
; CHECK-NEXT: br i1 [[TMP8]], label %middle.block, label %vector.body
; CHECK: middle.block:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
;
entry:
@@ -62,8 +62,8 @@ define i16 @test_chained_first_order_recurrences_2(ptr %ptr) {
; CHECK-NEXT: br i1 [[TMP8]], label %middle.block, label %vector.body, !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
;
entry:
@@ -108,10 +108,10 @@ define i16 @test_chained_first_order_recurrences_3(ptr %ptr) {
; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
;
entry:
@@ -220,10 +220,10 @@ define i16 @test_chained_first_order_recurrences_3_reordered_1(ptr %ptr) {
; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
;
entry:
@@ -271,10 +271,10 @@ define i16 @test_chained_first_order_recurrences_3_reordered_2(ptr %ptr) {
; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
;
entry:
@@ -322,10 +322,10 @@ define i16 @test_chained_first_order_recurrences_3_for2_no_other_uses(ptr %ptr)
; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
;
entry:
@@ -372,10 +372,10 @@ define i16 @test_chained_first_order_recurrences_3_for1_for2_no_other_uses(ptr %
; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
;
entry:
@@ -421,8 +421,8 @@ define double @test_chained_first_order_recurrence_sink_users_1(ptr %ptr) {
; CHECK-NEXT: br i1 [[TMP9]], label %middle.block, label %vector.body, !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x double> [[TMP4]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x double> [[TMP4]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x double> [[TMP4]], i32 3
;
entry:
@@ -661,10 +661,10 @@ define double @test_resinking_required(ptr %p, ptr noalias %a, ptr noalias %b) {
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> [[BROADCAST_SPLAT]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI6:%.*]] = extractelement <4 x double> [[BROADCAST_SPLAT4]], i32 2
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI10:%.*]] = extractelement <4 x double> [[TMP4]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[BROADCAST_SPLAT]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI6:%.*]] = extractelement <4 x double> [[BROADCAST_SPLAT4]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT5:%.*]] = extractelement <4 x double> [[BROADCAST_SPLAT4]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI10:%.*]] = extractelement <4 x double> [[TMP4]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT9:%.*]] = extractelement <4 x double> [[TMP4]], i32 3
; CHECK-NEXT: br i1 true, label %End, label %scalar.ph
;
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
index 149e4705885b1..6b1d38142b9ec 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
@@ -854,8 +854,8 @@ define void @sink_dominance(ptr %ptr, i32 %N) {
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[UMAX1]], [[N_VEC]]
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[UMAX1]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -937,8 +937,8 @@ define void @sink_dominance_2(ptr %ptr, i32 %N) {
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[UMAX1]], [[N_VEC]]
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[UMAX1]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
index 833d55f09294e..58f14cfb9680c 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
@@ -73,6 +73,7 @@ define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize
; CHECK-NEXT: Successor(s): middle.block
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<{{.+}}> = extract-recurrence-resume ir<%0>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
;
@@ -145,6 +146,7 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr) optsize {
; CHECK-NEXT: Successor(s): middle.block
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<{{.+}}> = extract-recurrence-resume ir<%recur>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
;
@@ -216,6 +218,7 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
; CHECK-NEXT: EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%and.red>, vp<[[SEL]]>
+; CHECK-NEXT: EMIT vp<{{.+}}> = extract-recurrence-resume ir<%recur>
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: Live-out i32 %res = vp<[[RED_RES]]>
@@ -314,6 +317,7 @@ define void @sink_replicate_region_4_requires_split_at_end_of_block(i32 %x, ptr
; CHECK-NEXT: Successor(s): middle.block
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<{{.+}}> = extract-recurrence-resume ir<%0>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
;
@@ -397,6 +401,7 @@ define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias
; CHECK-NEXT: Successor(s): middle.block
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<{{.+}}> = extract-recurrence-resume ir<%recur>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
;
@@ -469,6 +474,7 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias
; CHECK-NEXT: Successor(s): middle.block
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<{{.+}}> = extract-recurrence-resume ir<%.pn>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
;
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index a588d71a2c76e..6d3654d3b97e4 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -55,8 +55,8 @@ define void @recurrence_1(ptr readonly noalias %a, ptr noalias %b, i32 %n) {
; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[WIDE_LOAD1]], i32 3
+; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-IC: scalar.ph:
; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[PRE_LOAD]], [[FOR_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -165,8 +165,8 @@ define void @recurrence_1(ptr readonly noalias %a, ptr noalias %b, i32 %n) {
; SINK-AFTER-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SINK-AFTER-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; SINK-AFTER: middle.block:
-; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
+; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; SINK-AFTER-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; SINK-AFTER: scalar.ph:
; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[PRE_LOAD]], [[FOR_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -267,8 +267,8 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
; UNROLL-NO-IC: middle.block:
; UNROLL-NO-IC-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP17]], <4 x i32> [[TMP18]])
; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[RDX_MINMAX]])
-; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i32 3
+; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-IC: scalar.ph:
; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[DOTPRE]], [[FOR_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -400,8 +400,8 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
; SINK-AFTER-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; SINK-AFTER: middle.block:
; SINK-AFTER-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP9]])
-; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
+; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; SINK-AFTER-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; SINK-AFTER: scalar.ph:
; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[DOTPRE]], [[FOR_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -530,8 +530,8 @@ define void @recurrence_3(ptr readonly noalias %a, ptr noalias %b, i32 %n, float
; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i32 3
+; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-IC: scalar.ph:
; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP0]], [[FOR_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -674,8 +674,8 @@ define void @recurrence_3(ptr readonly noalias %a, ptr noalias %b, i32 %n, float
; SINK-AFTER-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SINK-AFTER-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; SINK-AFTER: middle.block:
-; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
+; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; SINK-AFTER-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; SINK-AFTER: scalar.ph:
; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP0]], [[FOR_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -919,8 +919,8 @@ define i32 @PR27246() {
; UNROLL-NO-IC-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; UNROLL-NO-IC: middle.block:
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 2
-; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 3
+; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
; UNROLL-NO-IC: scalar.ph:
; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[E_015]], [[FOR_COND1_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -1013,8 +1013,8 @@ define i32 @PR27246() {
; SINK-AFTER-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; SINK-AFTER: middle.block:
; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 2
-; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]]
; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 3
+; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]]
; SINK-AFTER-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
; SINK-AFTER: scalar.ph:
; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[E_015]], [[FOR_COND1_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -1115,11 +1115,11 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
; UNROLL-NO-IC-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP23]], align 4
; UNROLL-NO-IC-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP24]], align 4
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = load i32, ptr [[TMP25]], align 4
-; UNROLL-NO-IC-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP26]], align 4
+; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = load i32, ptr [[TMP26]], align 4
; UNROLL-NO-IC-NEXT: [[TMP39:%.*]] = insertelement <4 x i32> poison, i32 [[TMP35]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP40:%.*]] = insertelement <4 x i32> [[TMP39]], i32 [[TMP36]], i32 1
; UNROLL-NO-IC-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> [[TMP40]], i32 [[VECTOR_RECUR_EXTRACT_FOR_PHI]], i32 2
-; UNROLL-NO-IC-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP38]], i32 3
+; UNROLL-NO-IC-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[VECTOR_RECUR_EXTRACT]], i32 3
; UNROLL-NO-IC-NEXT: [[TMP43:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP34]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP44:%.*]] = shufflevector <4 x i32> [[TMP34]], <4 x i32> [[TMP42]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
@@ -1127,7 +1127,6 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
; UNROLL-NO-IC-NEXT: br i1 [[TMP45]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; UNROLL-NO-IC: middle.block:
; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP42]], i32 3
; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-IC: scalar.ph:
; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[PRE_LOAD]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -1223,18 +1222,17 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
; SINK-AFTER-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 4
; SINK-AFTER-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP12]], align 4
; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = load i32, ptr [[TMP13]], align 4
-; SINK-AFTER-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP14]], align 4
+; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = load i32, ptr [[TMP14]], align 4
; SINK-AFTER-NEXT: [[TMP19:%.*]] = insertelement <4 x i32> poison, i32 [[TMP15]], i32 0
; SINK-AFTER-NEXT: [[TMP20:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP16]], i32 1
; SINK-AFTER-NEXT: [[TMP21:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[VECTOR_RECUR_EXTRACT_FOR_PHI]], i32 2
-; SINK-AFTER-NEXT: [[TMP22]] = insertelement <4 x i32> [[TMP21]], i32 [[TMP18]], i32 3
+; SINK-AFTER-NEXT: [[TMP22]] = insertelement <4 x i32> [[TMP21]], i32 [[VECTOR_RECUR_EXTRACT]], i32 3
; SINK-AFTER-NEXT: [[TMP23:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP22]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; SINK-AFTER-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SINK-AFTER-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; SINK-AFTER: middle.block:
; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP22]], i32 3
; SINK-AFTER-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; SINK-AFTER: scalar.ph:
; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[PRE_LOAD]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -1574,11 +1572,11 @@ define i32 @PR33613(ptr %b, double %j, i32 %d) {
; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = load double, ptr [[TMP12]], align 8
; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = load double, ptr [[TMP13]], align 8
; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = load double, ptr [[TMP14]], align 8
-; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = load double, ptr [[TMP15]], align 8
+; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = load double, ptr [[TMP15]], align 8
; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = insertelement <4 x double> poison, double [[TMP24]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = insertelement <4 x double> [[TMP28]], double [[TMP25]], i32 1
; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = insertelement <4 x double> [[TMP29]], double [[TMP26]], i32 2
-; UNROLL-NO-IC-NEXT: [[TMP31]] = insertelement <4 x double> [[TMP30]], double [[TMP27]], i32 3
+; UNROLL-NO-IC-NEXT: [[TMP31]] = insertelement <4 x double> [[TMP30]], double [[VECTOR_RECUR_EXTRACT]], i32 3
; UNROLL-NO-IC-NEXT: [[TMP32:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[TMP23]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP33:%.*]] = shufflevector <4 x double> [[TMP23]], <4 x double> [[TMP31]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP34:%.*]] = fmul <4 x double> [[TMP32]], [[TMP23]]
@@ -1595,7 +1593,6 @@ define i32 @PR33613(ptr %b, double %j, i32 %d) {
; UNROLL-NO-IC: middle.block:
; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP41]], [[TMP40]]
; UNROLL-NO-IC-NEXT: [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
-; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[TMP31]], i32 3
; UNROLL-NO-IC-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-IC: scalar.ph:
; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[J]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -1710,11 +1707,11 @@ define i32 @PR33613(ptr %b, double %j, i32 %d) {
; SINK-AFTER-NEXT: [[TMP8:%.*]] = load double, ptr [[TMP4]], align 8
; SINK-AFTER-NEXT: [[TMP9:%.*]] = load double, ptr [[TMP5]], align 8
; SINK-AFTER-NEXT: [[TMP10:%.*]] = load double, ptr [[TMP6]], align 8
-; SINK-AFTER-NEXT: [[TMP11:%.*]] = load double, ptr [[TMP7]], align 8
+; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = load double, ptr [[TMP7]], align 8
; SINK-AFTER-NEXT: [[TMP12:%.*]] = insertelement <4 x double> poison, double [[TMP8]], i32 0
; SINK-AFTER-NEXT: [[TMP13:%.*]] = insertelement <4 x double> [[TMP12]], double [[TMP9]], i32 1
; SINK-AFTER-NEXT: [[TMP14:%.*]] = insertelement <4 x double> [[TMP13]], double [[TMP10]], i32 2
-; SINK-AFTER-NEXT: [[TMP15]] = insertelement <4 x double> [[TMP14]], double [[TMP11]], i32 3
+; SINK-AFTER-NEXT: [[TMP15]] = insertelement <4 x double> [[TMP14]], double [[VECTOR_RECUR_EXTRACT]], i32 3
; SINK-AFTER-NEXT: [[TMP16:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[TMP15]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; SINK-AFTER-NEXT: [[TMP17:%.*]] = fmul <4 x double> [[TMP16]], [[TMP15]]
; SINK-AFTER-NEXT: [[TMP18:%.*]] = fcmp une <4 x double> [[TMP17]], zeroinitializer
@@ -1725,7 +1722,6 @@ define i32 @PR33613(ptr %b, double %j, i32 %d) {
; SINK-AFTER-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; SINK-AFTER: middle.block:
; SINK-AFTER-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP20]])
-; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[TMP15]], i32 3
; SINK-AFTER-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; SINK-AFTER: scalar.ph:
; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[J]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -1826,8 +1822,8 @@ define void @sink_after(ptr noalias %a, ptr noalias %b, i64 %n) {
; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i32 3
+; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-IC: scalar.ph:
; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -1934,8 +1930,8 @@ define void @sink_after(ptr noalias %a, ptr noalias %b, i64 %n) {
; SINK-AFTER-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SINK-AFTER-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; SINK-AFTER: middle.block:
-; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
+; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; SINK-AFTER-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; SINK-AFTER: scalar.ph:
; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -2042,11 +2038,11 @@ define void @PR34711(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) {
; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = load i16, ptr [[TMP14]], align 2
; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = load i16, ptr [[TMP15]], align 2
; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = load i16, ptr [[TMP16]], align 2
-; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = load i16, ptr [[TMP17]], align 2
+; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = load i16, ptr [[TMP17]], align 2
; UNROLL-NO-IC-NEXT: [[TMP32:%.*]] = insertelement <4 x i16> poison, i16 [[TMP28]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP33:%.*]] = insertelement <4 x i16> [[TMP32]], i16 [[TMP29]], i32 1
; UNROLL-NO-IC-NEXT: [[TMP34:%.*]] = insertelement <4 x i16> [[TMP33]], i16 [[TMP30]], i32 2
-; UNROLL-NO-IC-NEXT: [[TMP35]] = insertelement <4 x i16> [[TMP34]], i16 [[TMP31]], i32 3
+; UNROLL-NO-IC-NEXT: [[TMP35]] = insertelement <4 x i16> [[TMP34]], i16 [[VECTOR_RECUR_EXTRACT]], i32 3
; UNROLL-NO-IC-NEXT: [[TMP36:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP27]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP37:%.*]] = shufflevector <4 x i16> [[TMP27]], <4 x i16> [[TMP35]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP38:%.*]] = sext <4 x i16> [[TMP36]] to <4 x i32>
@@ -2066,7 +2062,6 @@ define void @PR34711(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) {
; UNROLL-NO-IC-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; UNROLL-NO-IC: middle.block:
; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP35]], i32 3
; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-IC: scalar.ph:
; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -2177,11 +2172,11 @@ define void @PR34711(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) {
; SINK-AFTER-NEXT: [[TMP10:%.*]] = load i16, ptr [[TMP5]], align 2
; SINK-AFTER-NEXT: [[TMP11:%.*]] = load i16, ptr [[TMP6]], align 2
; SINK-AFTER-NEXT: [[TMP12:%.*]] = load i16, ptr [[TMP7]], align 2
-; SINK-AFTER-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP8]], align 2
+; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = load i16, ptr [[TMP8]], align 2
; SINK-AFTER-NEXT: [[TMP14:%.*]] = insertelement <4 x i16> poison, i16 [[TMP10]], i32 0
; SINK-AFTER-NEXT: [[TMP15:%.*]] = insertelement <4 x i16> [[TMP14]], i16 [[TMP11]], i32 1
; SINK-AFTER-NEXT: [[TMP16:%.*]] = insertelement <4 x i16> [[TMP15]], i16 [[TMP12]], i32 2
-; SINK-AFTER-NEXT: [[TMP17]] = insertelement <4 x i16> [[TMP16]], i16 [[TMP13]], i32 3
+; SINK-AFTER-NEXT: [[TMP17]] = insertelement <4 x i16> [[TMP16]], i16 [[VECTOR_RECUR_EXTRACT]], i32 3
; SINK-AFTER-NEXT: [[TMP18:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP17]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; SINK-AFTER-NEXT: [[TMP19:%.*]] = sext <4 x i16> [[TMP18]] to <4 x i32>
; SINK-AFTER-NEXT: [[TMP20:%.*]] = sext <4 x i16> [[TMP17]] to <4 x i32>
@@ -2194,7 +2189,6 @@ define void @PR34711(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) {
; SINK-AFTER-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; SINK-AFTER: middle.block:
; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP17]], i32 3
; SINK-AFTER-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; SINK-AFTER: scalar.ph:
; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -2294,8 +2288,8 @@ define void @sink_after_with_multiple_users(ptr noalias %a, ptr noalias %b, i64
; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i32 3
+; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-IC: scalar.ph:
; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -2407,8 +2401,8 @@ define void @sink_after_with_multiple_users(ptr noalias %a, ptr noalias %b, i64
; SINK-AFTER-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SINK-AFTER-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; SINK-AFTER: middle.block:
-; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
+; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; SINK-AFTER-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; SINK-AFTER: scalar.ph:
; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
index 50a5cc6774c5c..df61798853ef6 100644
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -6256,8 +6256,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[VEC_IND]], i32 1
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -6327,8 +6327,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
; IND-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; IND-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]]
; IND: middle.block:
-; IND-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
; IND-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[VEC_IND]], i64 1
+; IND-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
; IND-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; IND: scalar.ph:
; IND-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -6402,8 +6402,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
; UNROLL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]]
; UNROLL: middle.block:
-; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
; UNROLL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[STEP_ADD]], i64 1
+; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
; UNROLL-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; UNROLL: scalar.ph:
; UNROLL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -6485,8 +6485,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]]
; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[STEP_ADD]], i32 1
+; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-IC: scalar.ph:
; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -6560,8 +6560,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
; INTERLEAVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; INTERLEAVE-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]]
; INTERLEAVE: middle.block:
-; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
; INTERLEAVE-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i64 3
+; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; INTERLEAVE: scalar.ph:
; INTERLEAVE-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
index 81cc2024bb31a..909a3c23dd117 100644
--- a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
@@ -202,6 +202,7 @@ exit:
; DBG-NEXT: Successor(s): middle.block
; DBG-EMPTY:
; DBG-NEXT: middle.block:
+; DBG-NEXT: EMIT vp<{{.+}}> = extract-recurrence-resume ir<%for>
; DBG-NEXT: No successors
; DBG-NEXT: }
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
index 4c3377255b21a..0f3e7e6ac4017 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -1511,8 +1511,8 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) {
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i16> [[WIDE_VEC]], i64 7
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index aca3608cb1ede..a857ae3de4f57 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -912,6 +912,7 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) {
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
; CHECK-NEXT: EMIT vp<[[FOR_RESULT:%.+]]> = extract-recurrence-result ir<%for.1.next>
+; CHECK-NEXT: EMIT vp<{{.+}}> = extract-recurrence-resume ir<%for.1>
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: Live-out i16 %for.1.lcssa = vp<[[FOR_RESULT]]>
More information about the llvm-commits
mailing list