[llvm] [VPlan] Iterate over header phis to determine FORs that need EVL fixup. NFCI (PR #147032)
Luke Lau via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 8 01:05:36 PDT 2025
https://github.com/lukel97 updated https://github.com/llvm/llvm-project/pull/147032
>From 26ed857d733edf81999c1546acb66b04755a18cb Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Fri, 4 Jul 2025 11:03:26 +0100
Subject: [PATCH 1/4] [VPlan] Iterate over header phis to determine FORs that
need EVL fixup. NFCI
This is a follow-up to https://github.com/llvm/llvm-project/pull/146672#discussion_r2183176231
We can avoid iterating over every recipe to pick out the splices that need to be fixed up, given that for now all splices must use a VPFirstOrderRecurrencePHIRecipe.
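In rough outline, the new iteration looks like this (condensed from the VPlanTransforms.cpp hunk below; the prev.evl phi creation and the intrinsic construction are elided, so this is a sketch rather than standalone-compilable code):

  VPValue *PrevEVL = nullptr;
  for (VPRecipeBase &PhiR : Header->phis()) {
    auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&PhiR);
    if (!FOR)
      continue;
    if (!PrevEVL) {
      // Lazily create the scalar prev.evl phi the first time a FOR phi is
      // seen: MaxEVL incoming from the preheader, EVL from the backedge.
    }
    // Any FirstOrderRecurrenceSplice must show up as a user of the FOR phi,
    // so there is no need to walk every recipe in the plan.
    for (VPUser *User : FOR->users()) {
      auto *R = cast<VPRecipeBase>(User);
      VPValue *V1, *V2;
      if (!match(R, m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
                        m_VPValue(V1), m_VPValue(V2))))
        continue;
      // Replace R with a widened experimental_vp_splice call taking
      // {V1, V2, -1, AllOneMask, PrevEVL, EVL}.
    }
  }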
An assertion was added since this doesn't hold for unrolled loops, where a splice may not use the phi at all:
vector.body:
EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ]
FIRST-ORDER-RECURRENCE-PHI ir<%10> = phi ir<%pre_load>, ir<%11>.1
CLONE ir<%indvars.iv.next> = add nuw nsw vp<%index>, ir<1>
CLONE ir<%arrayidx32> = getelementptr inbounds ir<%a>, ir<%indvars.iv.next>
vp<%3> = vector-pointer ir<%arrayidx32>
vp<%4> = vector-pointer ir<%arrayidx32>, ir<1>
WIDEN ir<%11> = load vp<%3>
WIDEN ir<%11>.1 = load vp<%4>
EMIT vp<%5> = first-order splice ir<%10>, ir<%11>
EMIT vp<%6> = first-order splice ir<%11>, ir<%11>.1 <-- doesn't use phi
Or sometimes there are splices in loops without a FOR phi at all:
vector.body:
EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ]
CLONE ir<%gep.a> = getelementptr ir<%a>, vp<%index>
vp<%3> = vector-pointer ir<%gep.a>
vp<%4> = vector-pointer ir<%gep.a>, ir<1>
WIDEN ir<%load.a> = load vp<%3>
WIDEN ir<%load.a>.1 = load vp<%4>
WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32
WIDEN-CAST ir<%ext.a>.1 = zext ir<%load.a>.1 to i32
CLONE ir<%gep.b> = getelementptr ir<%b>, vp<%index>
vp<%5> = vector-pointer ir<%gep.b>
vp<%6> = vector-pointer ir<%gep.b>, ir<1>
WIDEN ir<%load.b> = load vp<%5>
WIDEN ir<%load.b>.1 = load vp<%6>
WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32
WIDEN-CAST ir<%ext.b>.1 = zext ir<%load.b>.1 to i32
WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a>
WIDEN ir<%mul>.1 = mul ir<%ext.b>.1, ir<%ext.a>.1
EMIT vp<%7> = first-order splice ir<%mul>, ir<%mul>.1
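For the splices that do use a FOR phi, the fix-up emits a vp.splice carrying both the previous and the current EVL. Comparing the check lines in the test below (operand names shortened for readability), the NO-VP run keeps the plain splice

  %s = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %recur, <vscale x 2 x i64> %cur, i32 -1)

while the IF-EVL run produces

  %s = call <vscale x 2 x i64> @llvm.experimental.vp.splice.nxv2i64(<vscale x 2 x i64> %recur, <vscale x 2 x i64> %cur, i32 -1, <vscale x 2 x i1> splat (i1 true), i32 %prev.evl, i32 %evl)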
A test was added for second-order recurrences just to double-check that they indeed also have their own FOR phi.
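For reference, the scalar loop in that test (second_order_recurrence_indvar, added in the diff below) corresponds to roughly the following, sketched here in C++ for illustration; each of the two chained recurrences becomes its own FIRST-ORDER-RECURRENCE-PHI in the VPlan:

  #include <cstdint>

  void second_order_recurrence_indvar(int64_t *A, int64_t TC) {
    int64_t for1 = 33, for2 = 33; // the two chained recurrences
    for (int64_t i = 0; i < TC; ++i) {
      int64_t x = i + 42;
      A[i] = for2; // x from two iterations ago (33 for the first two)
      for2 = for1;
      for1 = x;
    }
  }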
---
.../Transforms/Vectorize/VPlanTransforms.cpp | 78 ++++----
...ce-tail-with-evl-fixed-order-recurrence.ll | 171 ++++++++++++++++++
2 files changed, 213 insertions(+), 36 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 0bceb70d8661f..c4ff941cdfded 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2187,42 +2187,48 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
// VPTypeAnalysis cache.
SmallVector<VPRecipeBase *> ToErase;
- // Create a scalar phi to track the previous EVL if fixed-order recurrence is
- // contained.
- bool ContainsFORs =
- any_of(Header->phis(), IsaPred<VPFirstOrderRecurrencePHIRecipe>);
- if (ContainsFORs) {
- // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
- VPValue *MaxEVL = &Plan.getVF();
- // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.
- VPBuilder Builder(LoopRegion->getPreheaderVPBB());
- MaxEVL = Builder.createScalarZExtOrTrunc(MaxEVL, Type::getInt32Ty(Ctx),
- TypeInfo.inferScalarType(MaxEVL),
- DebugLoc());
-
- Builder.setInsertPoint(Header, Header->getFirstNonPhi());
- VPValue *PrevEVL =
- Builder.createScalarPhi({MaxEVL, &EVL}, DebugLoc(), "prev.evl");
-
- for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
- vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) {
- for (VPRecipeBase &R : *VPBB) {
- using namespace VPlanPatternMatch;
- VPValue *V1, *V2;
- if (!match(&R,
- m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
- m_VPValue(V1), m_VPValue(V2))))
- continue;
- VPValue *Imm = Plan.getOrAddLiveIn(
- ConstantInt::getSigned(Type::getInt32Ty(Ctx), -1));
- VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe(
- Intrinsic::experimental_vp_splice,
- {V1, V2, Imm, AllOneMask, PrevEVL, &EVL},
- TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc());
- VPSplice->insertBefore(&R);
- R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
- ToErase.push_back(&R);
- }
+ // Fix-up first-order recurrences
+ VPValue *PrevEVL = nullptr;
+ for (VPRecipeBase &PhiR : Header->phis()) {
+ auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&PhiR);
+ if (!FOR)
+ continue;
+
+ // Create a scalar phi to track the previous EVL if fixed-order recurrence
+ // is contained.
+ if (!PrevEVL) {
+ // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
+ VPValue *MaxEVL = &Plan.getVF();
+ // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.
+ VPBuilder Builder(LoopRegion->getPreheaderVPBB());
+ MaxEVL = Builder.createScalarZExtOrTrunc(MaxEVL, Type::getInt32Ty(Ctx),
+ TypeInfo.inferScalarType(MaxEVL),
+ DebugLoc());
+
+ Builder.setInsertPoint(Header, Header->getFirstNonPhi());
+ PrevEVL = Builder.createScalarPhi({MaxEVL, &EVL}, DebugLoc(), "prev.evl");
+ }
+
+ assert(!Plan.isUnrolled() && "When unrolled splices might not use "
+ "VPFirstOrederRecurrencePHIRecipe!");
+
+ for (VPUser *User : PhiR.getVPSingleValue()->users()) {
+ auto *R = cast<VPRecipeBase>(User);
+ using namespace VPlanPatternMatch;
+ VPValue *V1, *V2;
+ if (!match(R, m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
+ m_VPValue(V1), m_VPValue(V2))))
+ continue;
+ VPValue *Imm = Plan.getOrAddLiveIn(
+ ConstantInt::getSigned(Type::getInt32Ty(Ctx), -1));
+ VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe(
+ Intrinsic::experimental_vp_splice,
+ {V1, V2, Imm, AllOneMask, PrevEVL, &EVL},
+ TypeInfo.inferScalarType(R->getVPSingleValue()), R->getDebugLoc());
+
+ VPSplice->insertBefore(R);
+ R->getVPSingleValue()->replaceAllUsesWith(VPSplice);
+ ToErase.push_back(R);
}
}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll
index 0490d63f67d4e..c4e9b3fefc98a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll
@@ -737,6 +737,173 @@ for.end:
ret void
}
+
+define void @second_order_recurrence_indvar(ptr noalias %A, i64 %TC) {
+; IF-EVL-LABEL: define void @second_order_recurrence_indvar(
+; IF-EVL-SAME: ptr noalias [[A:%.*]], i64 [[TC:%.*]]) #[[ATTR0]] {
+; IF-EVL-NEXT: [[ENTRY:.*]]:
+; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; IF-EVL: [[VECTOR_PH]]:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP2]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
+; IF-EVL-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32
+; IF-EVL-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul <vscale x 2 x i64> [[TMP6]], splat (i64 1)
+; IF-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP7]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT: [[TMP9:%.*]] = mul nuw i32 [[TMP8]], 2
+; IF-EVL-NEXT: [[TMP10:%.*]] = sub i32 [[TMP9]], 1
+; IF-EVL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 33, i32 [[TMP10]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT: [[TMP12:%.*]] = mul nuw i32 [[TMP11]], 2
+; IF-EVL-NEXT: [[TMP13:%.*]] = sub i32 [[TMP12]], 1
+; IF-EVL-NEXT: [[VECTOR_RECUR_INIT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 33, i32 [[TMP13]]
+; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
+; IF-EVL: [[VECTOR_BODY]]:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 2 x i64> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VECTOR_RECUR2:%.*]] = phi <vscale x 2 x i64> [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[PREV_EVL:%.*]] = phi i32 [ [[TMP5]], %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP14]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+; IF-EVL-NEXT: [[TMP16:%.*]] = mul i64 1, [[TMP15]]
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP16]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP17]] = add <vscale x 2 x i64> [[VEC_IND]], splat (i64 42)
+; IF-EVL-NEXT: [[TMP18]] = call <vscale x 2 x i64> @llvm.experimental.vp.splice.nxv2i64(<vscale x 2 x i64> [[VECTOR_RECUR]], <vscale x 2 x i64> [[TMP17]], i32 -1, <vscale x 2 x i1> splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP14]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.splice.nxv2i64(<vscale x 2 x i64> [[VECTOR_RECUR2]], <vscale x 2 x i64> [[TMP18]], i32 -1, <vscale x 2 x i1> splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP14]])
+; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i64, ptr [[A]], i64 [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i64, ptr [[TMP20]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP19]], ptr align 8 [[TMP21]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP14]])
+; IF-EVL-NEXT: [[TMP22:%.*]] = zext i32 [[TMP14]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP22]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; IF-EVL-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; IF-EVL: [[MIDDLE_BLOCK]]:
+; IF-EVL-NEXT: br label %[[FOR_END:.*]]
+; IF-EVL: [[SCALAR_PH]]:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 33, %[[ENTRY]] ]
+; IF-EVL-NEXT: [[SCALAR_RECUR_INIT3:%.*]] = phi i64 [ 33, %[[ENTRY]] ]
+; IF-EVL-NEXT: br label %[[FOR_BODY:.*]]
+; IF-EVL: [[FOR_BODY]]:
+; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ]
+; IF-EVL-NEXT: [[FOR1:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[X:%.*]], %[[FOR_BODY]] ]
+; IF-EVL-NEXT: [[FOR2:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT3]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ]
+; IF-EVL-NEXT: [[X]] = add i64 [[INDVARS]], 42
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i64, ptr [[A]], i64 [[INDVARS]]
+; IF-EVL-NEXT: store i64 [[FOR2]], ptr [[ARRAYIDX]], align 8
+; IF-EVL-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; IF-EVL: [[FOR_END]]:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: define void @second_order_recurrence_indvar(
+; NO-VP-SAME: ptr noalias [[A:%.*]], i64 [[TC:%.*]]) #[[ATTR0]] {
+; NO-VP-NEXT: [[ENTRY:.*]]:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TC]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TC]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[TC]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
+; NO-VP-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; NO-VP-NEXT: [[TMP7:%.*]] = mul <vscale x 2 x i64> [[TMP6]], splat (i64 1)
+; NO-VP-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP7]]
+; NO-VP-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP5]]
+; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP8]], i64 0
+; NO-VP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
+; NO-VP-NEXT: [[TMP10:%.*]] = mul nuw i32 [[TMP9]], 2
+; NO-VP-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], 1
+; NO-VP-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 33, i32 [[TMP11]]
+; NO-VP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32()
+; NO-VP-NEXT: [[TMP13:%.*]] = mul nuw i32 [[TMP12]], 2
+; NO-VP-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], 1
+; NO-VP-NEXT: [[VECTOR_RECUR_INIT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 33, i32 [[TMP14]]
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 2 x i64> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VECTOR_RECUR2:%.*]] = phi <vscale x 2 x i64> [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP15]] = add <vscale x 2 x i64> [[VEC_IND]], splat (i64 42)
+; NO-VP-NEXT: [[TMP16]] = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> [[VECTOR_RECUR]], <vscale x 2 x i64> [[TMP15]], i32 -1)
+; NO-VP-NEXT: [[TMP17:%.*]] = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> [[VECTOR_RECUR2]], <vscale x 2 x i64> [[TMP16]], i32 -1)
+; NO-VP-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i64, ptr [[A]], i64 [[INDEX]]
+; NO-VP-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i64, ptr [[TMP18]], i32 0
+; NO-VP-NEXT: store <vscale x 2 x i64> [[TMP17]], ptr [[TMP19]], align 8
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; NO-VP-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[TMP21:%.*]] = call i32 @llvm.vscale.i32()
+; NO-VP-NEXT: [[TMP22:%.*]] = mul nuw i32 [[TMP21]], 2
+; NO-VP-NEXT: [[TMP23:%.*]] = sub i32 [[TMP22]], 1
+; NO-VP-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[TMP15]], i32 [[TMP23]]
+; NO-VP-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32()
+; NO-VP-NEXT: [[TMP25:%.*]] = mul nuw i32 [[TMP24]], 2
+; NO-VP-NEXT: [[TMP26:%.*]] = sub i32 [[TMP25]], 1
+; NO-VP-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <vscale x 2 x i64> [[TMP16]], i32 [[TMP26]]
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TC]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; NO-VP-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; NO-VP-NEXT: [[SCALAR_RECUR_INIT4:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; NO-VP-NEXT: br label %[[FOR_BODY:.*]]
+; NO-VP: [[FOR_BODY]]:
+; NO-VP-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ]
+; NO-VP-NEXT: [[FOR1:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[X:%.*]], %[[FOR_BODY]] ]
+; NO-VP-NEXT: [[FOR2:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT4]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ]
+; NO-VP-NEXT: [[X]] = add i64 [[INDVARS]], 42
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i64, ptr [[A]], i64 [[INDVARS]]
+; NO-VP-NEXT: store i64 [[FOR2]], ptr [[ARRAYIDX]], align 8
+; NO-VP-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; NO-VP: [[FOR_END]]:
+; NO-VP-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %indvars = phi i64 [ 0, %entry ], [ %indvars.next, %for.body ]
+ %for1 = phi i64 [ 33, %entry ], [ %x, %for.body ]
+ %for2 = phi i64 [ 33, %entry ], [ %for1, %for.body ]
+
+ %x = add i64 %indvars, 42
+
+ %arrayidx = getelementptr inbounds nuw i64, ptr %A, i64 %indvars
+ store i64 %for2, ptr %arrayidx
+
+ %indvars.next = add nuw nsw i64 %indvars, 1
+ %exitcond.not = icmp eq i64 %indvars.next, %TC
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret void
+}
+
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.vectorize.enable", i1 true}
;.
@@ -753,6 +920,8 @@ for.end:
; IF-EVL: [[META10]] = !{!"llvm.loop.vectorize.enable", i1 true}
; IF-EVL: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]], [[META3]]}
; IF-EVL: [[LOOP12]] = distinct !{[[LOOP12]], [[META3]], [[META1]]}
+; IF-EVL: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]], [[META2]], [[META3]]}
+; IF-EVL: [[LOOP14]] = distinct !{[[LOOP14]], [[META3]], [[META1]]}
;.
; NO-VP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; NO-VP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -766,4 +935,6 @@ for.end:
; NO-VP: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
; NO-VP: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
; NO-VP: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; NO-VP: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
;.
>From 50d68b2bb4c8ad4a3ca124c7e18f5bf8095877de Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Mon, 7 Jul 2025 22:04:42 +0800
Subject: [PATCH 2/4] Avoid redundant getVPSingleValue()
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c4ff941cdfded..3c78d4006a30f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2212,7 +2212,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
assert(!Plan.isUnrolled() && "When unrolled splices might not use "
"VPFirstOrederRecurrencePHIRecipe!");
- for (VPUser *User : PhiR.getVPSingleValue()->users()) {
+ for (VPUser *User : FOR->users()) {
auto *R = cast<VPRecipeBase>(User);
using namespace VPlanPatternMatch;
VPValue *V1, *V2;
>From 0f752fe9be3899bab11f591fdc91ecacf352950f Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 8 Jul 2025 15:59:37 +0800
Subject: [PATCH 3/4] Move assert, fix typos and adjust comments
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 3c78d4006a30f..b3880f6e0aee2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2187,7 +2187,11 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
// VPTypeAnalysis cache.
SmallVector<VPRecipeBase *> ToErase;
- // Fix-up first-order recurrences
+ // When unrolling splices may not use VPFirstOrderRecurrencePHIRecipes, so the
+ // below transformation will need to be fixed.
+ assert(!Plan.isUnrolled() && "Unrolling not supported with EVL tail folding.");
+
+ // Replace FirstOrderRecurrenceSplice with experimental_vp_splice intrinsics.
VPValue *PrevEVL = nullptr;
for (VPRecipeBase &PhiR : Header->phis()) {
auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&PhiR);
@@ -2209,9 +2213,6 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
PrevEVL = Builder.createScalarPhi({MaxEVL, &EVL}, DebugLoc(), "prev.evl");
}
- assert(!Plan.isUnrolled() && "When unrolled splices might not use "
- "VPFirstOrederRecurrencePHIRecipe!");
-
for (VPUser *User : FOR->users()) {
auto *R = cast<VPRecipeBase>(User);
using namespace VPlanPatternMatch;
>From 5d4bac97a9954f7f0121f5549e60db5270e4a29d Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 8 Jul 2025 16:05:14 +0800
Subject: [PATCH 4/4] clang-format
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b3880f6e0aee2..68d3c4f03f1ae 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2189,7 +2189,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
// When unrolling splices may not use VPFirstOrderRecurrencePHIRecipes, so the
// below transformation will need to be fixed.
- assert(!Plan.isUnrolled() && "Unrolling not supported with EVL tail folding.");
+ assert(!Plan.isUnrolled() &&
+ "Unrolling not supported with EVL tail folding.");
// Replace FirstOrderRecurrenceSplice with experimental_vp_splice intrinsics.
VPValue *PrevEVL = nullptr;