[llvm] [VPlan] Handle FirstActiveLane when unrolling. (PR #145394)

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 26 06:50:42 PDT 2025


https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/145394

>From badd9226b3fe7c213efbe683729c30af9c0f61b9 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Mon, 23 Jun 2025 16:01:11 +0100
Subject: [PATCH 1/5] [VPlan] Handle FirstActiveLane when unrolling.

Currently FirstActiveLane is not handled correctly during
unrolling. This causes miscompiles when vectorizing
early-exit loops with interleaving forced.

This patch updates the handling of FirstActiveLane to be analogous
to computing final reduction results: during unrolling, the created
copies of its original operand are added as additional operands, and
FirstActiveLane will always produce the index of the first active
lane across all unrolled iterations.

Note that some of the generated code is still incorrect, as we
also need to handle ExtractElement with FirstActiveLane operands;
I will share patches for those cases soon.
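
To illustrate, here is a minimal sketch of the lowering for a fixed
VF=4, IC=4 mask split (value names are hypothetical, not taken from
the generated tests): each part's trailing-zero count is offset by
the part's start lane, and a select chain, built from the last part
to the first, picks the earliest part with an active lane. The
active-lane test compares the raw trailing-zero count against VF,
the form the follow-up fixup in this series settles on:

  declare i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1>, i1)

  define i64 @first_active_lane_sketch(<4 x i1> %m0, <4 x i1> %m1,
                                       <4 x i1> %m2, <4 x i1> %m3) {
    ; Trailing-zero count per part; a count of VF (4) means the part
    ; has no active lane.
    %tz3 = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> %m3, i1 true)
    %idx3 = add i64 12, %tz3                    ; part 3 starts at lane 12
    %tz2 = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> %m2, i1 true)
    %idx2 = add i64 8, %tz2                     ; part 2 starts at lane 8
    %p2.active = icmp ne i64 %tz2, 4
    %sel2 = select i1 %p2.active, i64 %idx2, i64 %idx3
    %tz1 = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> %m1, i1 true)
    %idx1 = add i64 4, %tz1                     ; part 1 starts at lane 4
    %p1.active = icmp ne i64 %tz1, 4
    %sel1 = select i1 %p1.active, i64 %idx1, i64 %sel2
    %tz0 = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> %m0, i1 true)
    %p0.active = icmp ne i64 %tz0, 4
    %res = select i1 %p0.active, i64 %tz0, i64 %sel1
    ret i64 %res
  }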
---
 llvm/lib/Transforms/Vectorize/VPlan.h         |   4 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  32 ++-
 llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp |   8 +-
 .../AArch64/single-early-exit-interleave.ll   |  49 +++-
 .../single-early-exit-interleave.ll           | 258 +++++++++++++++++-
 ...or-loop-backedge-elimination-early-exit.ll |  10 +-
 6 files changed, 344 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 972eca1fe8376..bcf3658dd7880 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -955,7 +955,9 @@ class VPInstruction : public VPRecipeWithIRFlags,
     // Returns a scalar boolean value, which is true if any lane of its (only
     // boolean) vector operand is true.
     AnyOf,
-    // Calculates the first active lane index of the vector predicate operand.
+    // Calculates the first active lane index of the vector predicate operands.
+    // It produces the lane index across all unrolled iterations. Unrolling will
+    // add all copies of its original operand as additional operands.
     FirstActiveLane,
 
     // The opcodes below are used for VPInstructionWithType.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 4b0b74c0ea7d8..d5058e3edba22 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -821,9 +821,35 @@ Value *VPInstruction::generate(VPTransformState &State) {
     return Builder.CreateOrReduce(A);
   }
   case VPInstruction::FirstActiveLane: {
-    Value *Mask = State.get(getOperand(0));
-    return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask,
-                                                true, Name);
+    if (getNumOperands() == 1) {
+      Value *Mask = State.get(getOperand(0));
+      return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask,
+                                                  true, Name);
+    }
+    // If there are multiple operands, create a chain of selects to pick the
+    // first operand with an active lane, offsetting its lane index by the
+    // total number of lanes of the preceding operands.
+    Value *RuntimeVF =
+        getRuntimeVF(State.Builder, State.Builder.getInt64Ty(), State.VF);
+    Type *ElemTy = State.TypeAnalysis.inferScalarType(getOperand(0));
+    Value *RuntimeBitwidth = Builder.CreateMul(
+        Builder.getInt64(ElemTy->getScalarSizeInBits()), RuntimeVF);
+    unsigned LastOpIdx = getNumOperands() - 1;
+    Value *Res = nullptr;
+    for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
+      Value *Current = Builder.CreateCountTrailingZeroElems(
+          Builder.getInt64Ty(), State.get(getOperand(Idx)), true, Name);
+      Current = Builder.CreateAdd(
+          Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), Current);
+      if (Res) {
+        Value *Cmp = Builder.CreateICmpNE(Current, RuntimeBitwidth);
+        Res = Builder.CreateSelect(Cmp, Current, Res);
+      } else {
+        Res = Current;
+      }
+    }
+
+    return Res;
   }
   default:
     llvm_unreachable("Unsupported opcode for instruction");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 0bc683e557e70..532539ff5cb00 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -344,10 +344,12 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
     if (ToSkip.contains(&R) || isa<VPIRInstruction>(&R))
       continue;
 
-    // Add all VPValues for all parts to ComputeReductionResult which combines
-    // the parts to compute the final reduction value.
+    // Add all VPValues for all parts to Compute*Result and FirstActiveLane,
+    // which combine the parts to compute the final value.
     VPValue *Op1;
-    if (match(&R, m_VPInstruction<VPInstruction::ComputeAnyOfResult>(
+    if (match(&R, m_VPInstruction<VPInstruction::FirstActiveLane>(
+                      m_VPValue(Op1))) ||
+        match(&R, m_VPInstruction<VPInstruction::ComputeAnyOfResult>(
                       m_VPValue(), m_VPValue(), m_VPValue(Op1))) ||
         match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
                       m_VPValue(), m_VPValue(Op1))) ||
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll
index 9dfe70ddf1b05..a0d00b7d4b438 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll
@@ -31,11 +31,38 @@ define i64 @same_exit_block_pre_inc_use1() #0 {
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 16
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP19]]
+; CHECK-NEXT:    [[TMP33:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP34:%.*]] = mul nuw i64 [[TMP33]], 32
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP34]]
+; CHECK-NEXT:    [[TMP36:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP37:%.*]] = mul nuw i64 [[TMP36]], 48
+; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP37]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 16 x i8>, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP35]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP38]], align 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 16
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP21]]
+; CHECK-NEXT:    [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP24:%.*]] = mul nuw i64 [[TMP23]], 32
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP24]]
+; CHECK-NEXT:    [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP27:%.*]] = mul nuw i64 [[TMP26]], 48
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP27]]
 ; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 16 x i8>, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <vscale x 16 x i8>, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <vscale x 16 x i8>, ptr [[TMP28]], align 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP30:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD5]], [[WIDE_LOAD6]]
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD8]]
 ; CHECK-NEXT:    [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP11]])
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]]
@@ -47,8 +74,28 @@ define i64 @same_exit_block_pre_inc_use1() #0 {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 510, [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP63:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP42:%.*]] = mul nuw i64 [[TMP63]], 16
+; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 1, [[TMP42]]
+; CHECK-NEXT:    [[TMP44:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; CHECK-NEXT:    [[TMP62:%.*]] = mul i64 [[TMP42]], 3
+; CHECK-NEXT:    [[TMP45:%.*]] = add i64 [[TMP62]], [[TMP44]]
+; CHECK-NEXT:    [[TMP46:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP31]], i1 true)
+; CHECK-NEXT:    [[TMP58:%.*]] = mul i64 [[TMP42]], 2
+; CHECK-NEXT:    [[TMP50:%.*]] = add i64 [[TMP58]], [[TMP46]]
+; CHECK-NEXT:    [[TMP47:%.*]] = icmp ne i64 [[TMP50]], [[TMP43]]
+; CHECK-NEXT:    [[TMP51:%.*]] = select i1 [[TMP47]], i64 [[TMP50]], i64 [[TMP45]]
+; CHECK-NEXT:    [[TMP52:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP30]], i1 true)
+; CHECK-NEXT:    [[TMP64:%.*]] = mul i64 [[TMP42]], 1
+; CHECK-NEXT:    [[TMP56:%.*]] = add i64 [[TMP64]], [[TMP52]]
+; CHECK-NEXT:    [[TMP53:%.*]] = icmp ne i64 [[TMP56]], [[TMP43]]
+; CHECK-NEXT:    [[TMP57:%.*]] = select i1 [[TMP53]], i64 [[TMP56]], i64 [[TMP51]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP11]], i1 true)
-; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX1]], [[TMP15]]
+; CHECK-NEXT:    [[TMP65:%.*]] = mul i64 [[TMP42]], 0
+; CHECK-NEXT:    [[TMP60:%.*]] = add i64 [[TMP65]], [[TMP15]]
+; CHECK-NEXT:    [[TMP59:%.*]] = icmp ne i64 [[TMP60]], [[TMP43]]
+; CHECK-NEXT:    [[TMP61:%.*]] = select i1 [[TMP59]], i64 [[TMP60]], i64 [[TMP57]]
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX1]], [[TMP61]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = add i64 3, [[TMP16]]
 ; CHECK-NEXT:    br label [[LOOP_END]]
 ; CHECK:       scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll
index 1f8cfa1bfd11c..68f25e92af866 100644
--- a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll
@@ -91,11 +91,26 @@ define i64 @same_exit_block_pre_inc_use1() {
 ; VF4IC4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
 ; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; VF4IC4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
+; VF4IC4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
+; VF4IC4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
 ; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP14]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP16]], align 1
 ; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 4
+; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 8
+; VF4IC4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 12
 ; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
 ; VF4IC4-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; VF4IC4-NEXT:    [[TMP11:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD5]]
+; VF4IC4-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; VF4IC4-NEXT:    [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
 ; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; VF4IC4-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
 ; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
@@ -106,7 +121,20 @@ define i64 @same_exit_block_pre_inc_use1() {
 ; VF4IC4:       middle.block:
 ; VF4IC4-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; VF4IC4:       vector.early.exit:
-; VF4IC4-NEXT:    [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
+; VF4IC4-NEXT:    [[TMP33:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true)
+; VF4IC4-NEXT:    [[TMP34:%.*]] = add i64 12, [[TMP33]]
+; VF4IC4-NEXT:    [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true)
+; VF4IC4-NEXT:    [[TMP24:%.*]] = add i64 8, [[TMP35]]
+; VF4IC4-NEXT:    [[TMP23:%.*]] = icmp ne i64 [[TMP24]], 4
+; VF4IC4-NEXT:    [[TMP25:%.*]] = select i1 [[TMP23]], i64 [[TMP24]], i64 [[TMP34]]
+; VF4IC4-NEXT:    [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true)
+; VF4IC4-NEXT:    [[TMP28:%.*]] = add i64 4, [[TMP26]]
+; VF4IC4-NEXT:    [[TMP27:%.*]] = icmp ne i64 [[TMP28]], 4
+; VF4IC4-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i64 [[TMP28]], i64 [[TMP25]]
+; VF4IC4-NEXT:    [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
+; VF4IC4-NEXT:    [[TMP32:%.*]] = add i64 0, [[TMP30]]
+; VF4IC4-NEXT:    [[TMP31:%.*]] = icmp ne i64 [[TMP32]], 4
+; VF4IC4-NEXT:    [[TMP8:%.*]] = select i1 [[TMP31]], i64 [[TMP32]], i64 [[TMP29]]
 ; VF4IC4-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
 ; VF4IC4-NEXT:    [[TMP10:%.*]] = add i64 3, [[TMP9]]
 ; VF4IC4-NEXT:    br label [[LOOP_END]]
@@ -170,8 +198,17 @@ define ptr @same_exit_block_pre_inc_use1_ivptr() {
 ; VF4IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF4IC4-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[P1]], i64 [[INDEX]]
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF4IC4-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 4
+; VF4IC4-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 8
+; VF4IC4-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 12
 ; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1
 ; VF4IC4-NEXT:    [[TMP2:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], splat (i8 72)
+; VF4IC4-NEXT:    [[TMP15:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD1]], splat (i8 72)
+; VF4IC4-NEXT:    [[TMP16:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], splat (i8 72)
+; VF4IC4-NEXT:    [[TMP17:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], splat (i8 72)
 ; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; VF4IC4-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
 ; VF4IC4-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -182,7 +219,20 @@ define ptr @same_exit_block_pre_inc_use1_ivptr() {
 ; VF4IC4:       middle.block:
 ; VF4IC4-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; VF4IC4:       vector.early.exit:
-; VF4IC4-NEXT:    [[TMP6:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 true)
+; VF4IC4-NEXT:    [[TMP28:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 true)
+; VF4IC4-NEXT:    [[TMP29:%.*]] = add i64 12, [[TMP28]]
+; VF4IC4-NEXT:    [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP16]], i1 true)
+; VF4IC4-NEXT:    [[TMP19:%.*]] = add i64 8, [[TMP30]]
+; VF4IC4-NEXT:    [[TMP18:%.*]] = icmp ne i64 [[TMP19]], 4
+; VF4IC4-NEXT:    [[TMP20:%.*]] = select i1 [[TMP18]], i64 [[TMP19]], i64 [[TMP29]]
+; VF4IC4-NEXT:    [[TMP21:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP15]], i1 true)
+; VF4IC4-NEXT:    [[TMP23:%.*]] = add i64 4, [[TMP21]]
+; VF4IC4-NEXT:    [[TMP22:%.*]] = icmp ne i64 [[TMP23]], 4
+; VF4IC4-NEXT:    [[TMP24:%.*]] = select i1 [[TMP22]], i64 [[TMP23]], i64 [[TMP20]]
+; VF4IC4-NEXT:    [[TMP25:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 true)
+; VF4IC4-NEXT:    [[TMP27:%.*]] = add i64 0, [[TMP25]]
+; VF4IC4-NEXT:    [[TMP26:%.*]] = icmp ne i64 [[TMP27]], 4
+; VF4IC4-NEXT:    [[TMP6:%.*]] = select i1 [[TMP26]], i64 [[TMP27]], i64 [[TMP24]]
 ; VF4IC4-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], [[TMP6]]
 ; VF4IC4-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[P1]], i64 [[TMP7]]
 ; VF4IC4-NEXT:    br label [[LOOP_END]]
@@ -240,11 +290,26 @@ define i64 @same_exit_block_post_inc_use() {
 ; VF4IC4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
 ; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; VF4IC4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
+; VF4IC4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
+; VF4IC4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
 ; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP14]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP16]], align 1
 ; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 4
+; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 8
+; VF4IC4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 12
 ; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
 ; VF4IC4-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; VF4IC4-NEXT:    [[TMP11:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD5]]
+; VF4IC4-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; VF4IC4-NEXT:    [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
 ; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; VF4IC4-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
 ; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
@@ -255,7 +320,20 @@ define i64 @same_exit_block_post_inc_use() {
 ; VF4IC4:       middle.block:
 ; VF4IC4-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; VF4IC4:       vector.early.exit:
-; VF4IC4-NEXT:    [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
+; VF4IC4-NEXT:    [[TMP33:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true)
+; VF4IC4-NEXT:    [[TMP34:%.*]] = add i64 12, [[TMP33]]
+; VF4IC4-NEXT:    [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true)
+; VF4IC4-NEXT:    [[TMP24:%.*]] = add i64 8, [[TMP35]]
+; VF4IC4-NEXT:    [[TMP23:%.*]] = icmp ne i64 [[TMP24]], 4
+; VF4IC4-NEXT:    [[TMP25:%.*]] = select i1 [[TMP23]], i64 [[TMP24]], i64 [[TMP34]]
+; VF4IC4-NEXT:    [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true)
+; VF4IC4-NEXT:    [[TMP28:%.*]] = add i64 4, [[TMP26]]
+; VF4IC4-NEXT:    [[TMP27:%.*]] = icmp ne i64 [[TMP28]], 4
+; VF4IC4-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i64 [[TMP28]], i64 [[TMP25]]
+; VF4IC4-NEXT:    [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
+; VF4IC4-NEXT:    [[TMP32:%.*]] = add i64 0, [[TMP30]]
+; VF4IC4-NEXT:    [[TMP31:%.*]] = icmp ne i64 [[TMP32]], 4
+; VF4IC4-NEXT:    [[TMP8:%.*]] = select i1 [[TMP31]], i64 [[TMP32]], i64 [[TMP29]]
 ; VF4IC4-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
 ; VF4IC4-NEXT:    [[TMP10:%.*]] = add i64 3, [[TMP9]]
 ; VF4IC4-NEXT:    br label [[LOOP_END]]
@@ -320,11 +398,26 @@ define i64 @diff_exit_block_pre_inc_use1() {
 ; VF4IC4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
 ; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; VF4IC4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
+; VF4IC4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
+; VF4IC4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
 ; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP14]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP16]], align 1
 ; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 4
+; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 8
+; VF4IC4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 12
 ; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
 ; VF4IC4-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; VF4IC4-NEXT:    [[TMP11:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD5]]
+; VF4IC4-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; VF4IC4-NEXT:    [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
 ; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; VF4IC4-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
 ; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
@@ -335,7 +428,20 @@ define i64 @diff_exit_block_pre_inc_use1() {
 ; VF4IC4:       middle.block:
 ; VF4IC4-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; VF4IC4:       vector.early.exit:
-; VF4IC4-NEXT:    [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
+; VF4IC4-NEXT:    [[TMP33:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true)
+; VF4IC4-NEXT:    [[TMP34:%.*]] = add i64 12, [[TMP33]]
+; VF4IC4-NEXT:    [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true)
+; VF4IC4-NEXT:    [[TMP24:%.*]] = add i64 8, [[TMP35]]
+; VF4IC4-NEXT:    [[TMP23:%.*]] = icmp ne i64 [[TMP24]], 4
+; VF4IC4-NEXT:    [[TMP25:%.*]] = select i1 [[TMP23]], i64 [[TMP24]], i64 [[TMP34]]
+; VF4IC4-NEXT:    [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true)
+; VF4IC4-NEXT:    [[TMP28:%.*]] = add i64 4, [[TMP26]]
+; VF4IC4-NEXT:    [[TMP27:%.*]] = icmp ne i64 [[TMP28]], 4
+; VF4IC4-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i64 [[TMP28]], i64 [[TMP25]]
+; VF4IC4-NEXT:    [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
+; VF4IC4-NEXT:    [[TMP32:%.*]] = add i64 0, [[TMP30]]
+; VF4IC4-NEXT:    [[TMP31:%.*]] = icmp ne i64 [[TMP32]], 4
+; VF4IC4-NEXT:    [[TMP8:%.*]] = select i1 [[TMP31]], i64 [[TMP32]], i64 [[TMP29]]
 ; VF4IC4-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
 ; VF4IC4-NEXT:    [[TMP10:%.*]] = add i64 3, [[TMP9]]
 ; VF4IC4-NEXT:    br label [[LOOP_EARLY_EXIT:%.*]]
@@ -407,11 +513,26 @@ define i64 @diff_exit_block_post_inc_use1() {
 ; VF4IC4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
 ; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; VF4IC4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
+; VF4IC4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
+; VF4IC4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
 ; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP14]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP16]], align 1
 ; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 4
+; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 8
+; VF4IC4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 12
 ; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
 ; VF4IC4-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; VF4IC4-NEXT:    [[TMP11:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD5]]
+; VF4IC4-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; VF4IC4-NEXT:    [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
 ; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; VF4IC4-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
 ; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
@@ -422,7 +543,20 @@ define i64 @diff_exit_block_post_inc_use1() {
 ; VF4IC4:       middle.block:
 ; VF4IC4-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; VF4IC4:       vector.early.exit:
-; VF4IC4-NEXT:    [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
+; VF4IC4-NEXT:    [[TMP33:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true)
+; VF4IC4-NEXT:    [[TMP34:%.*]] = add i64 12, [[TMP33]]
+; VF4IC4-NEXT:    [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true)
+; VF4IC4-NEXT:    [[TMP24:%.*]] = add i64 8, [[TMP35]]
+; VF4IC4-NEXT:    [[TMP23:%.*]] = icmp ne i64 [[TMP24]], 4
+; VF4IC4-NEXT:    [[TMP25:%.*]] = select i1 [[TMP23]], i64 [[TMP24]], i64 [[TMP34]]
+; VF4IC4-NEXT:    [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true)
+; VF4IC4-NEXT:    [[TMP28:%.*]] = add i64 4, [[TMP26]]
+; VF4IC4-NEXT:    [[TMP27:%.*]] = icmp ne i64 [[TMP28]], 4
+; VF4IC4-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i64 [[TMP28]], i64 [[TMP25]]
+; VF4IC4-NEXT:    [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
+; VF4IC4-NEXT:    [[TMP32:%.*]] = add i64 0, [[TMP30]]
+; VF4IC4-NEXT:    [[TMP31:%.*]] = icmp ne i64 [[TMP32]], 4
+; VF4IC4-NEXT:    [[TMP8:%.*]] = select i1 [[TMP31]], i64 [[TMP32]], i64 [[TMP29]]
 ; VF4IC4-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
 ; VF4IC4-NEXT:    [[TMP10:%.*]] = add i64 3, [[TMP9]]
 ; VF4IC4-NEXT:    br label [[LOOP_EARLY_EXIT:%.*]]
@@ -495,14 +629,41 @@ define i64 @same_exit_block_pre_inc_use1_reverse() {
 ; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
 ; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 -3
+; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -4
+; VF4IC4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 -3
+; VF4IC4-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -8
+; VF4IC4-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP23]], i32 -3
+; VF4IC4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -12
+; VF4IC4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 -3
 ; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
 ; VF4IC4-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP22]], align 1
+; VF4IC4-NEXT:    [[REVERSE3:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD2]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP24]], align 1
+; VF4IC4-NEXT:    [[REVERSE4:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD3]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP26]], align 1
+; VF4IC4-NEXT:    [[REVERSE6:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD5]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
 ; VF4IC4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 -3
+; VF4IC4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -4
+; VF4IC4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i32 -3
+; VF4IC4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -8
+; VF4IC4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 -3
+; VF4IC4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -12
+; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 -3
 ; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
 ; VF4IC4-NEXT:    [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD9:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1
+; VF4IC4-NEXT:    [[REVERSE10:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD9]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
+; VF4IC4-NEXT:    [[REVERSE12:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD11]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD13:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; VF4IC4-NEXT:    [[REVERSE14:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD13]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp ne <4 x i8> [[REVERSE]], [[REVERSE2]]
+; VF4IC4-NEXT:    [[TMP19:%.*]] = icmp ne <4 x i8> [[REVERSE3]], [[REVERSE10]]
+; VF4IC4-NEXT:    [[TMP20:%.*]] = icmp ne <4 x i8> [[REVERSE4]], [[REVERSE12]]
+; VF4IC4-NEXT:    [[TMP21:%.*]] = icmp ne <4 x i8> [[REVERSE6]], [[REVERSE14]]
 ; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; VF4IC4-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
 ; VF4IC4-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1008
@@ -513,7 +674,20 @@ define i64 @same_exit_block_pre_inc_use1_reverse() {
 ; VF4IC4:       middle.block:
 ; VF4IC4-NEXT:    br i1 false, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; VF4IC4:       vector.early.exit:
-; VF4IC4-NEXT:    [[TMP10:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; VF4IC4-NEXT:    [[TMP41:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP21]], i1 true)
+; VF4IC4-NEXT:    [[TMP42:%.*]] = add i64 12, [[TMP41]]
+; VF4IC4-NEXT:    [[TMP43:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP20]], i1 true)
+; VF4IC4-NEXT:    [[TMP32:%.*]] = add i64 8, [[TMP43]]
+; VF4IC4-NEXT:    [[TMP31:%.*]] = icmp ne i64 [[TMP32]], 4
+; VF4IC4-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i64 [[TMP32]], i64 [[TMP42]]
+; VF4IC4-NEXT:    [[TMP34:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP19]], i1 true)
+; VF4IC4-NEXT:    [[TMP36:%.*]] = add i64 4, [[TMP34]]
+; VF4IC4-NEXT:    [[TMP35:%.*]] = icmp ne i64 [[TMP36]], 4
+; VF4IC4-NEXT:    [[TMP37:%.*]] = select i1 [[TMP35]], i64 [[TMP36]], i64 [[TMP33]]
+; VF4IC4-NEXT:    [[TMP38:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; VF4IC4-NEXT:    [[TMP40:%.*]] = add i64 0, [[TMP38]]
+; VF4IC4-NEXT:    [[TMP39:%.*]] = icmp ne i64 [[TMP40]], 4
+; VF4IC4-NEXT:    [[TMP10:%.*]] = select i1 [[TMP39]], i64 [[TMP40]], i64 [[TMP37]]
 ; VF4IC4-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]]
 ; VF4IC4-NEXT:    [[TMP12:%.*]] = sub i64 1023, [[TMP11]]
 ; VF4IC4-NEXT:    br label [[LOOP_END]]
@@ -577,11 +751,26 @@ define i8 @same_exit_block_use_loaded_value() {
 ; VF4IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; VF4IC4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
+; VF4IC4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
+; VF4IC4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
 ; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP14]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
 ; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
 ; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; VF4IC4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 4
+; VF4IC4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 8
+; VF4IC4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 12
 ; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP16]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1
 ; VF4IC4-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; VF4IC4-NEXT:    [[TMP11:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD5]]
+; VF4IC4-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; VF4IC4-NEXT:    [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
 ; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; VF4IC4-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
 ; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -592,7 +781,20 @@ define i8 @same_exit_block_use_loaded_value() {
 ; VF4IC4:       middle.block:
 ; VF4IC4-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; VF4IC4:       vector.early.exit:
-; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
+; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE1:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true)
+; VF4IC4-NEXT:    [[TMP20:%.*]] = add i64 12, [[FIRST_ACTIVE_LANE1]]
+; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true)
+; VF4IC4-NEXT:    [[TMP22:%.*]] = add i64 8, [[FIRST_ACTIVE_LANE8]]
+; VF4IC4-NEXT:    [[TMP21:%.*]] = icmp ne i64 [[TMP22]], 4
+; VF4IC4-NEXT:    [[TMP23:%.*]] = select i1 [[TMP21]], i64 [[TMP22]], i64 [[TMP20]]
+; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true)
+; VF4IC4-NEXT:    [[TMP25:%.*]] = add i64 4, [[FIRST_ACTIVE_LANE9]]
+; VF4IC4-NEXT:    [[TMP24:%.*]] = icmp ne i64 [[TMP25]], 4
+; VF4IC4-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i64 [[TMP25]], i64 [[TMP23]]
+; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE10:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
+; VF4IC4-NEXT:    [[TMP28:%.*]] = add i64 0, [[FIRST_ACTIVE_LANE10]]
+; VF4IC4-NEXT:    [[TMP27:%.*]] = icmp ne i64 [[TMP28]], 4
+; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = select i1 [[TMP27]], i64 [[TMP28]], i64 [[TMP26]]
 ; VF4IC4-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i8> [[WIDE_LOAD]], i64 [[FIRST_ACTIVE_LANE]]
 ; VF4IC4-NEXT:    br label [[LOOP_END]]
 ; VF4IC4:       scalar.ph:
@@ -657,14 +859,41 @@ define i8 @same_exit_block_reverse_use_loaded_value() {
 ; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
 ; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 -3
+; VF4IC4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -4
+; VF4IC4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 -3
+; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -8
+; VF4IC4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 -3
+; VF4IC4-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -12
+; VF4IC4-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP23]], i32 -3
 ; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
 ; VF4IC4-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1
+; VF4IC4-NEXT:    [[REVERSE3:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD2]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP22]], align 1
+; VF4IC4-NEXT:    [[REVERSE4:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD3]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP24]], align 1
+; VF4IC4-NEXT:    [[REVERSE6:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD5]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
 ; VF4IC4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 -3
+; VF4IC4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -4
+; VF4IC4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 -3
+; VF4IC4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -8
+; VF4IC4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 -3
+; VF4IC4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -12
+; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 -3
 ; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
 ; VF4IC4-NEXT:    [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD9:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1
+; VF4IC4-NEXT:    [[REVERSE10:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD9]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
+; VF4IC4-NEXT:    [[REVERSE12:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD11]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD13:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; VF4IC4-NEXT:    [[REVERSE14:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD13]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp ne <4 x i8> [[REVERSE]], [[REVERSE2]]
+; VF4IC4-NEXT:    [[TMP19:%.*]] = icmp ne <4 x i8> [[REVERSE3]], [[REVERSE10]]
+; VF4IC4-NEXT:    [[TMP20:%.*]] = icmp ne <4 x i8> [[REVERSE4]], [[REVERSE12]]
+; VF4IC4-NEXT:    [[TMP21:%.*]] = icmp ne <4 x i8> [[REVERSE6]], [[REVERSE14]]
 ; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; VF4IC4-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
 ; VF4IC4-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1008
@@ -675,7 +904,20 @@ define i8 @same_exit_block_reverse_use_loaded_value() {
 ; VF4IC4:       middle.block:
 ; VF4IC4-NEXT:    br i1 false, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; VF4IC4:       vector.early.exit:
-; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE1:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP21]], i1 true)
+; VF4IC4-NEXT:    [[TMP28:%.*]] = add i64 12, [[FIRST_ACTIVE_LANE1]]
+; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE15:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP20]], i1 true)
+; VF4IC4-NEXT:    [[TMP30:%.*]] = add i64 8, [[FIRST_ACTIVE_LANE15]]
+; VF4IC4-NEXT:    [[TMP29:%.*]] = icmp ne i64 [[TMP30]], 4
+; VF4IC4-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i64 [[TMP30]], i64 [[TMP28]]
+; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE16:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP19]], i1 true)
+; VF4IC4-NEXT:    [[TMP33:%.*]] = add i64 4, [[FIRST_ACTIVE_LANE16]]
+; VF4IC4-NEXT:    [[TMP32:%.*]] = icmp ne i64 [[TMP33]], 4
+; VF4IC4-NEXT:    [[TMP34:%.*]] = select i1 [[TMP32]], i64 [[TMP33]], i64 [[TMP31]]
+; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE17:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; VF4IC4-NEXT:    [[TMP36:%.*]] = add i64 0, [[FIRST_ACTIVE_LANE17]]
+; VF4IC4-NEXT:    [[TMP35:%.*]] = icmp ne i64 [[TMP36]], 4
+; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = select i1 [[TMP35]], i64 [[TMP36]], i64 [[TMP34]]
 ; VF4IC4-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i8> [[REVERSE]], i64 [[FIRST_ACTIVE_LANE]]
 ; VF4IC4-NEXT:    br label [[LOOP_END]]
 ; VF4IC4:       scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll
index 1a8361ce2bac3..fe0e9a594745c 100644
--- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll
@@ -189,8 +189,11 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer
 ; VF8UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; VF8UF2:       [[VECTOR_BODY]]:
 ; VF8UF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 0
+; VF8UF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 8
 ; VF8UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; VF8UF2-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
 ; VF8UF2-NEXT:    [[TMP3:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
+; VF8UF2-NEXT:    [[TMP6:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD1]], zeroinitializer
 ; VF8UF2-NEXT:    [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP3]])
 ; VF8UF2-NEXT:    br label %[[MIDDLE_SPLIT:.*]]
 ; VF8UF2:       [[MIDDLE_SPLIT]]:
@@ -198,8 +201,13 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer
 ; VF8UF2:       [[MIDDLE_BLOCK]]:
 ; VF8UF2-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; VF8UF2:       [[VECTOR_EARLY_EXIT]]:
+; VF8UF2-NEXT:    [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP6]], i1 true)
+; VF8UF2-NEXT:    [[TMP10:%.*]] = add i64 8, [[TMP8]]
 ; VF8UF2-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP3]], i1 true)
 ; VF8UF2-NEXT:    [[TMP5:%.*]] = add i64 0, [[FIRST_ACTIVE_LANE]]
+; VF8UF2-NEXT:    [[TMP9:%.*]] = icmp ne i64 [[TMP5]], 8
+; VF8UF2-NEXT:    [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP5]], i64 [[TMP10]]
+; VF8UF2-NEXT:    [[TMP12:%.*]] = add i64 0, [[TMP11]]
 ; VF8UF2-NEXT:    br label %[[EXIT]]
 ; VF8UF2:       [[SCALAR_PH]]:
 ; VF8UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
@@ -215,7 +223,7 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer
 ; VF8UF2-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16
 ; VF8UF2-NEXT:    br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
 ; VF8UF2:       [[EXIT]]:
-; VF8UF2-NEXT:    [[RES:%.*]] = phi i64 [ [[IV1]], %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ [[TMP5]], %[[VECTOR_EARLY_EXIT]] ]
+; VF8UF2-NEXT:    [[RES:%.*]] = phi i64 [ [[IV1]], %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ [[TMP12]], %[[VECTOR_EARLY_EXIT]] ]
 ; VF8UF2-NEXT:    ret i64 [[RES]]
 ;
 ; VF16UF1-LABEL: define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(

>From e80224dbf892821c330d579eab0e0e4473cbbe91 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 24 Jun 2025 14:16:11 +0100
Subject: [PATCH 2/5] !fixup check trailing zeros again
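
The initial version compared the offset-adjusted index against the
lane count, which gives the wrong answer once a non-zero part offset
has been added in. A part has an active lane iff its own
trailing-zero count differs from VF, so compare the raw count
instead. Using the hypothetical names from the sketch in the first
patch's message:

  ; before (wrong): %idx2 = 8 + %tz2 can never equal 4
  %p2.active = icmp ne i64 %idx2, 4
  ; after (correct): test the raw trailing-zero count against VF
  %p2.active = icmp ne i64 %tz2, 4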

---
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  8 ++--
 .../AArch64/single-early-exit-interleave.ll   |  6 +--
 .../single-early-exit-interleave.ll           | 48 +++++++++----------
 ...or-loop-backedge-elimination-early-exit.ll |  2 +-
 4 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index d5058e3edba22..194777b312269 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -837,12 +837,12 @@ Value *VPInstruction::generate(VPTransformState &State) {
     unsigned LastOpIdx = getNumOperands() - 1;
     Value *Res = nullptr;
     for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
-      Value *Current = Builder.CreateCountTrailingZeroElems(
+      Value *TrailingZeros = Builder.CreateCountTrailingZeroElems(
           Builder.getInt64Ty(), State.get(getOperand(Idx)), true, Name);
-      Current = Builder.CreateAdd(
-          Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), Current);
+      Value *Current = Builder.CreateAdd(
+          Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros);
       if (Res) {
-        Value *Cmp = Builder.CreateICmpNE(Current, RuntimeBitwidth);
+        Value *Cmp = Builder.CreateICmpNE(TrailingZeros, RuntimeBitwidth);
         Res = Builder.CreateSelect(Cmp, Current, Res);
       } else {
         Res = Current;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll
index a0d00b7d4b438..e05ad66537fa3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll
@@ -83,17 +83,17 @@ define i64 @same_exit_block_pre_inc_use1() #0 {
 ; CHECK-NEXT:    [[TMP46:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP31]], i1 true)
 ; CHECK-NEXT:    [[TMP58:%.*]] = mul i64 [[TMP42]], 2
 ; CHECK-NEXT:    [[TMP50:%.*]] = add i64 [[TMP58]], [[TMP46]]
-; CHECK-NEXT:    [[TMP47:%.*]] = icmp ne i64 [[TMP50]], [[TMP43]]
+; CHECK-NEXT:    [[TMP47:%.*]] = icmp ne i64 [[TMP46]], [[TMP43]]
 ; CHECK-NEXT:    [[TMP51:%.*]] = select i1 [[TMP47]], i64 [[TMP50]], i64 [[TMP45]]
 ; CHECK-NEXT:    [[TMP52:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP30]], i1 true)
 ; CHECK-NEXT:    [[TMP64:%.*]] = mul i64 [[TMP42]], 1
 ; CHECK-NEXT:    [[TMP56:%.*]] = add i64 [[TMP64]], [[TMP52]]
-; CHECK-NEXT:    [[TMP53:%.*]] = icmp ne i64 [[TMP56]], [[TMP43]]
+; CHECK-NEXT:    [[TMP53:%.*]] = icmp ne i64 [[TMP52]], [[TMP43]]
 ; CHECK-NEXT:    [[TMP57:%.*]] = select i1 [[TMP53]], i64 [[TMP56]], i64 [[TMP51]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP11]], i1 true)
 ; CHECK-NEXT:    [[TMP65:%.*]] = mul i64 [[TMP42]], 0
 ; CHECK-NEXT:    [[TMP60:%.*]] = add i64 [[TMP65]], [[TMP15]]
-; CHECK-NEXT:    [[TMP59:%.*]] = icmp ne i64 [[TMP60]], [[TMP43]]
+; CHECK-NEXT:    [[TMP59:%.*]] = icmp ne i64 [[TMP15]], [[TMP43]]
 ; CHECK-NEXT:    [[TMP61:%.*]] = select i1 [[TMP59]], i64 [[TMP60]], i64 [[TMP57]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX1]], [[TMP61]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = add i64 3, [[TMP16]]
diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll
index 68f25e92af866..854c8327cf801 100644
--- a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll
@@ -125,15 +125,15 @@ define i64 @same_exit_block_pre_inc_use1() {
 ; VF4IC4-NEXT:    [[TMP34:%.*]] = add i64 12, [[TMP33]]
 ; VF4IC4-NEXT:    [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true)
 ; VF4IC4-NEXT:    [[TMP24:%.*]] = add i64 8, [[TMP35]]
-; VF4IC4-NEXT:    [[TMP23:%.*]] = icmp ne i64 [[TMP24]], 4
+; VF4IC4-NEXT:    [[TMP23:%.*]] = icmp ne i64 [[TMP35]], 4
 ; VF4IC4-NEXT:    [[TMP25:%.*]] = select i1 [[TMP23]], i64 [[TMP24]], i64 [[TMP34]]
 ; VF4IC4-NEXT:    [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true)
 ; VF4IC4-NEXT:    [[TMP28:%.*]] = add i64 4, [[TMP26]]
-; VF4IC4-NEXT:    [[TMP27:%.*]] = icmp ne i64 [[TMP28]], 4
+; VF4IC4-NEXT:    [[TMP27:%.*]] = icmp ne i64 [[TMP26]], 4
 ; VF4IC4-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i64 [[TMP28]], i64 [[TMP25]]
 ; VF4IC4-NEXT:    [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
 ; VF4IC4-NEXT:    [[TMP32:%.*]] = add i64 0, [[TMP30]]
-; VF4IC4-NEXT:    [[TMP31:%.*]] = icmp ne i64 [[TMP32]], 4
+; VF4IC4-NEXT:    [[TMP31:%.*]] = icmp ne i64 [[TMP30]], 4
 ; VF4IC4-NEXT:    [[TMP8:%.*]] = select i1 [[TMP31]], i64 [[TMP32]], i64 [[TMP29]]
 ; VF4IC4-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
 ; VF4IC4-NEXT:    [[TMP10:%.*]] = add i64 3, [[TMP9]]
@@ -223,15 +223,15 @@ define ptr @same_exit_block_pre_inc_use1_ivptr() {
 ; VF4IC4-NEXT:    [[TMP29:%.*]] = add i64 12, [[TMP28]]
 ; VF4IC4-NEXT:    [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP16]], i1 true)
 ; VF4IC4-NEXT:    [[TMP19:%.*]] = add i64 8, [[TMP30]]
-; VF4IC4-NEXT:    [[TMP18:%.*]] = icmp ne i64 [[TMP19]], 4
+; VF4IC4-NEXT:    [[TMP18:%.*]] = icmp ne i64 [[TMP30]], 4
 ; VF4IC4-NEXT:    [[TMP20:%.*]] = select i1 [[TMP18]], i64 [[TMP19]], i64 [[TMP29]]
 ; VF4IC4-NEXT:    [[TMP21:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP15]], i1 true)
 ; VF4IC4-NEXT:    [[TMP23:%.*]] = add i64 4, [[TMP21]]
-; VF4IC4-NEXT:    [[TMP22:%.*]] = icmp ne i64 [[TMP23]], 4
+; VF4IC4-NEXT:    [[TMP22:%.*]] = icmp ne i64 [[TMP21]], 4
 ; VF4IC4-NEXT:    [[TMP24:%.*]] = select i1 [[TMP22]], i64 [[TMP23]], i64 [[TMP20]]
 ; VF4IC4-NEXT:    [[TMP25:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 true)
 ; VF4IC4-NEXT:    [[TMP27:%.*]] = add i64 0, [[TMP25]]
-; VF4IC4-NEXT:    [[TMP26:%.*]] = icmp ne i64 [[TMP27]], 4
+; VF4IC4-NEXT:    [[TMP26:%.*]] = icmp ne i64 [[TMP25]], 4
 ; VF4IC4-NEXT:    [[TMP6:%.*]] = select i1 [[TMP26]], i64 [[TMP27]], i64 [[TMP24]]
 ; VF4IC4-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], [[TMP6]]
 ; VF4IC4-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[P1]], i64 [[TMP7]]
@@ -324,15 +324,15 @@ define i64 @same_exit_block_post_inc_use() {
 ; VF4IC4-NEXT:    [[TMP34:%.*]] = add i64 12, [[TMP33]]
 ; VF4IC4-NEXT:    [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true)
 ; VF4IC4-NEXT:    [[TMP24:%.*]] = add i64 8, [[TMP35]]
-; VF4IC4-NEXT:    [[TMP23:%.*]] = icmp ne i64 [[TMP24]], 4
+; VF4IC4-NEXT:    [[TMP23:%.*]] = icmp ne i64 [[TMP35]], 4
 ; VF4IC4-NEXT:    [[TMP25:%.*]] = select i1 [[TMP23]], i64 [[TMP24]], i64 [[TMP34]]
 ; VF4IC4-NEXT:    [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true)
 ; VF4IC4-NEXT:    [[TMP28:%.*]] = add i64 4, [[TMP26]]
-; VF4IC4-NEXT:    [[TMP27:%.*]] = icmp ne i64 [[TMP28]], 4
+; VF4IC4-NEXT:    [[TMP27:%.*]] = icmp ne i64 [[TMP26]], 4
 ; VF4IC4-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i64 [[TMP28]], i64 [[TMP25]]
 ; VF4IC4-NEXT:    [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
 ; VF4IC4-NEXT:    [[TMP32:%.*]] = add i64 0, [[TMP30]]
-; VF4IC4-NEXT:    [[TMP31:%.*]] = icmp ne i64 [[TMP32]], 4
+; VF4IC4-NEXT:    [[TMP31:%.*]] = icmp ne i64 [[TMP30]], 4
 ; VF4IC4-NEXT:    [[TMP8:%.*]] = select i1 [[TMP31]], i64 [[TMP32]], i64 [[TMP29]]
 ; VF4IC4-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
 ; VF4IC4-NEXT:    [[TMP10:%.*]] = add i64 3, [[TMP9]]
@@ -432,15 +432,15 @@ define i64 @diff_exit_block_pre_inc_use1() {
 ; VF4IC4-NEXT:    [[TMP34:%.*]] = add i64 12, [[TMP33]]
 ; VF4IC4-NEXT:    [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true)
 ; VF4IC4-NEXT:    [[TMP24:%.*]] = add i64 8, [[TMP35]]
-; VF4IC4-NEXT:    [[TMP23:%.*]] = icmp ne i64 [[TMP24]], 4
+; VF4IC4-NEXT:    [[TMP23:%.*]] = icmp ne i64 [[TMP35]], 4
 ; VF4IC4-NEXT:    [[TMP25:%.*]] = select i1 [[TMP23]], i64 [[TMP24]], i64 [[TMP34]]
 ; VF4IC4-NEXT:    [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true)
 ; VF4IC4-NEXT:    [[TMP28:%.*]] = add i64 4, [[TMP26]]
-; VF4IC4-NEXT:    [[TMP27:%.*]] = icmp ne i64 [[TMP28]], 4
+; VF4IC4-NEXT:    [[TMP27:%.*]] = icmp ne i64 [[TMP26]], 4
 ; VF4IC4-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i64 [[TMP28]], i64 [[TMP25]]
 ; VF4IC4-NEXT:    [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
 ; VF4IC4-NEXT:    [[TMP32:%.*]] = add i64 0, [[TMP30]]
-; VF4IC4-NEXT:    [[TMP31:%.*]] = icmp ne i64 [[TMP32]], 4
+; VF4IC4-NEXT:    [[TMP31:%.*]] = icmp ne i64 [[TMP30]], 4
 ; VF4IC4-NEXT:    [[TMP8:%.*]] = select i1 [[TMP31]], i64 [[TMP32]], i64 [[TMP29]]
 ; VF4IC4-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
 ; VF4IC4-NEXT:    [[TMP10:%.*]] = add i64 3, [[TMP9]]
@@ -547,15 +547,15 @@ define i64 @diff_exit_block_post_inc_use1() {
 ; VF4IC4-NEXT:    [[TMP34:%.*]] = add i64 12, [[TMP33]]
 ; VF4IC4-NEXT:    [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true)
 ; VF4IC4-NEXT:    [[TMP24:%.*]] = add i64 8, [[TMP35]]
-; VF4IC4-NEXT:    [[TMP23:%.*]] = icmp ne i64 [[TMP24]], 4
+; VF4IC4-NEXT:    [[TMP23:%.*]] = icmp ne i64 [[TMP35]], 4
 ; VF4IC4-NEXT:    [[TMP25:%.*]] = select i1 [[TMP23]], i64 [[TMP24]], i64 [[TMP34]]
 ; VF4IC4-NEXT:    [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true)
 ; VF4IC4-NEXT:    [[TMP28:%.*]] = add i64 4, [[TMP26]]
-; VF4IC4-NEXT:    [[TMP27:%.*]] = icmp ne i64 [[TMP28]], 4
+; VF4IC4-NEXT:    [[TMP27:%.*]] = icmp ne i64 [[TMP26]], 4
 ; VF4IC4-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i64 [[TMP28]], i64 [[TMP25]]
 ; VF4IC4-NEXT:    [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
 ; VF4IC4-NEXT:    [[TMP32:%.*]] = add i64 0, [[TMP30]]
-; VF4IC4-NEXT:    [[TMP31:%.*]] = icmp ne i64 [[TMP32]], 4
+; VF4IC4-NEXT:    [[TMP31:%.*]] = icmp ne i64 [[TMP30]], 4
 ; VF4IC4-NEXT:    [[TMP8:%.*]] = select i1 [[TMP31]], i64 [[TMP32]], i64 [[TMP29]]
 ; VF4IC4-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
 ; VF4IC4-NEXT:    [[TMP10:%.*]] = add i64 3, [[TMP9]]
@@ -678,15 +678,15 @@ define i64 @same_exit_block_pre_inc_use1_reverse() {
 ; VF4IC4-NEXT:    [[TMP42:%.*]] = add i64 12, [[TMP41]]
 ; VF4IC4-NEXT:    [[TMP43:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP20]], i1 true)
 ; VF4IC4-NEXT:    [[TMP32:%.*]] = add i64 8, [[TMP43]]
-; VF4IC4-NEXT:    [[TMP31:%.*]] = icmp ne i64 [[TMP32]], 4
+; VF4IC4-NEXT:    [[TMP31:%.*]] = icmp ne i64 [[TMP43]], 4
 ; VF4IC4-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i64 [[TMP32]], i64 [[TMP42]]
 ; VF4IC4-NEXT:    [[TMP34:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP19]], i1 true)
 ; VF4IC4-NEXT:    [[TMP36:%.*]] = add i64 4, [[TMP34]]
-; VF4IC4-NEXT:    [[TMP35:%.*]] = icmp ne i64 [[TMP36]], 4
+; VF4IC4-NEXT:    [[TMP35:%.*]] = icmp ne i64 [[TMP34]], 4
 ; VF4IC4-NEXT:    [[TMP37:%.*]] = select i1 [[TMP35]], i64 [[TMP36]], i64 [[TMP33]]
 ; VF4IC4-NEXT:    [[TMP38:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
 ; VF4IC4-NEXT:    [[TMP40:%.*]] = add i64 0, [[TMP38]]
-; VF4IC4-NEXT:    [[TMP39:%.*]] = icmp ne i64 [[TMP40]], 4
+; VF4IC4-NEXT:    [[TMP39:%.*]] = icmp ne i64 [[TMP38]], 4
 ; VF4IC4-NEXT:    [[TMP10:%.*]] = select i1 [[TMP39]], i64 [[TMP40]], i64 [[TMP37]]
 ; VF4IC4-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]]
 ; VF4IC4-NEXT:    [[TMP12:%.*]] = sub i64 1023, [[TMP11]]
@@ -785,15 +785,15 @@ define i8 @same_exit_block_use_loaded_value() {
 ; VF4IC4-NEXT:    [[TMP20:%.*]] = add i64 12, [[FIRST_ACTIVE_LANE1]]
 ; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true)
 ; VF4IC4-NEXT:    [[TMP22:%.*]] = add i64 8, [[FIRST_ACTIVE_LANE8]]
-; VF4IC4-NEXT:    [[TMP21:%.*]] = icmp ne i64 [[TMP22]], 4
+; VF4IC4-NEXT:    [[TMP21:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE8]], 4
 ; VF4IC4-NEXT:    [[TMP23:%.*]] = select i1 [[TMP21]], i64 [[TMP22]], i64 [[TMP20]]
 ; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true)
 ; VF4IC4-NEXT:    [[TMP25:%.*]] = add i64 4, [[FIRST_ACTIVE_LANE9]]
-; VF4IC4-NEXT:    [[TMP24:%.*]] = icmp ne i64 [[TMP25]], 4
+; VF4IC4-NEXT:    [[TMP24:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE9]], 4
 ; VF4IC4-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i64 [[TMP25]], i64 [[TMP23]]
 ; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE10:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
 ; VF4IC4-NEXT:    [[TMP28:%.*]] = add i64 0, [[FIRST_ACTIVE_LANE10]]
-; VF4IC4-NEXT:    [[TMP27:%.*]] = icmp ne i64 [[TMP28]], 4
+; VF4IC4-NEXT:    [[TMP27:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE10]], 4
 ; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = select i1 [[TMP27]], i64 [[TMP28]], i64 [[TMP26]]
 ; VF4IC4-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i8> [[WIDE_LOAD]], i64 [[FIRST_ACTIVE_LANE]]
 ; VF4IC4-NEXT:    br label [[LOOP_END]]
@@ -908,15 +908,15 @@ define i8 @same_exit_block_reverse_use_loaded_value() {
 ; VF4IC4-NEXT:    [[TMP28:%.*]] = add i64 12, [[FIRST_ACTIVE_LANE1]]
 ; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE15:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP20]], i1 true)
 ; VF4IC4-NEXT:    [[TMP30:%.*]] = add i64 8, [[FIRST_ACTIVE_LANE15]]
-; VF4IC4-NEXT:    [[TMP29:%.*]] = icmp ne i64 [[TMP30]], 4
+; VF4IC4-NEXT:    [[TMP29:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE15]], 4
 ; VF4IC4-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i64 [[TMP30]], i64 [[TMP28]]
 ; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE16:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP19]], i1 true)
 ; VF4IC4-NEXT:    [[TMP33:%.*]] = add i64 4, [[FIRST_ACTIVE_LANE16]]
-; VF4IC4-NEXT:    [[TMP32:%.*]] = icmp ne i64 [[TMP33]], 4
+; VF4IC4-NEXT:    [[TMP32:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE16]], 4
 ; VF4IC4-NEXT:    [[TMP34:%.*]] = select i1 [[TMP32]], i64 [[TMP33]], i64 [[TMP31]]
 ; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE17:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
 ; VF4IC4-NEXT:    [[TMP36:%.*]] = add i64 0, [[FIRST_ACTIVE_LANE17]]
-; VF4IC4-NEXT:    [[TMP35:%.*]] = icmp ne i64 [[TMP36]], 4
+; VF4IC4-NEXT:    [[TMP35:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE17]], 4
 ; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = select i1 [[TMP35]], i64 [[TMP36]], i64 [[TMP34]]
 ; VF4IC4-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i8> [[REVERSE]], i64 [[FIRST_ACTIVE_LANE]]
 ; VF4IC4-NEXT:    br label [[LOOP_END]]
diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll
index fe0e9a594745c..a58bdf2dcd02f 100644
--- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll
@@ -205,7 +205,7 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer
 ; VF8UF2-NEXT:    [[TMP10:%.*]] = add i64 8, [[TMP8]]
 ; VF8UF2-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP3]], i1 true)
 ; VF8UF2-NEXT:    [[TMP5:%.*]] = add i64 0, [[FIRST_ACTIVE_LANE]]
-; VF8UF2-NEXT:    [[TMP9:%.*]] = icmp ne i64 [[TMP5]], 8
+; VF8UF2-NEXT:    [[TMP9:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE]], 8
 ; VF8UF2-NEXT:    [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP5]], i64 [[TMP10]]
 ; VF8UF2-NEXT:    [[TMP12:%.*]] = add i64 0, [[TMP11]]
 ; VF8UF2-NEXT:    br label %[[EXIT]]
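
For context: with interleaving, FirstActiveLane lowers to a chain of selects
that scans the parts from last to first. Each part's trailing-zero count is
compared against the part width to decide whether that part contains an
active lane; the hunks above switch that comparison to the raw per-part
count instead of the count plus the part's lane offset. A minimal
hand-written sketch for VF=4, IC=2 (the %mask*/%tz*/%idx* names are
illustrative, not taken from the tests):

  %tz1  = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> %mask1, i1 true)
  %idx1 = add i64 4, %tz1                    ; candidate index if part 1 hits
  %tz0  = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> %mask0, i1 true)
  %idx0 = add i64 0, %tz0                    ; candidate index if part 0 hits
  %hit0 = icmp ne i64 %tz0, 4                ; does part 0 have an active lane?
  %res  = select i1 %hit0, i64 %idx0, i64 %idx1

With the offset folded in, as before these hunks, the compare against 4 was
always true for any part with a nonzero lane offset, so an earlier part
could override a later hit even when it had no active lane itself.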

>From f4e7f8a2402c5c13cc84f2fbce9c3592f98762f9 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 25 Jun 2025 15:44:05 +0100
Subject: [PATCH 3/5] !fixup don't multiply by type size.

---
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp             | 5 +----
 .../LoopVectorize/AArch64/single-early-exit-interleave.ll  | 7 +++----
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 194777b312269..ba6e224f05c9a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -831,9 +831,6 @@ Value *VPInstruction::generate(VPTransformState &State) {
     // preceding operands.
     Value *RuntimeVF =
         getRuntimeVF(State.Builder, State.Builder.getInt64Ty(), State.VF);
-    Type *ElemTy = State.TypeAnalysis.inferScalarType(getOperand(0));
-    Value *RuntimeBitwidth = Builder.CreateMul(
-        Builder.getInt64(ElemTy->getScalarSizeInBits()), RuntimeVF);
     unsigned LastOpIdx = getNumOperands() - 1;
     Value *Res = nullptr;
     for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
@@ -842,7 +839,7 @@ Value *VPInstruction::generate(VPTransformState &State) {
       Value *Current = Builder.CreateAdd(
           Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros);
       if (Res) {
-        Value *Cmp = Builder.CreateICmpNE(TrailingZeros, RuntimeBitwidth);
+        Value *Cmp = Builder.CreateICmpNE(TrailingZeros, RuntimeVF);
         Res = Builder.CreateSelect(Cmp, Current, Res);
       } else {
         Res = Current;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll
index e05ad66537fa3..5f88ed1bfc4c0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll
@@ -76,24 +76,23 @@ define i64 @same_exit_block_pre_inc_use1() #0 {
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[TMP63:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP42:%.*]] = mul nuw i64 [[TMP63]], 16
-; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 1, [[TMP42]]
 ; CHECK-NEXT:    [[TMP44:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
 ; CHECK-NEXT:    [[TMP62:%.*]] = mul i64 [[TMP42]], 3
 ; CHECK-NEXT:    [[TMP45:%.*]] = add i64 [[TMP62]], [[TMP44]]
 ; CHECK-NEXT:    [[TMP46:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP31]], i1 true)
 ; CHECK-NEXT:    [[TMP58:%.*]] = mul i64 [[TMP42]], 2
 ; CHECK-NEXT:    [[TMP50:%.*]] = add i64 [[TMP58]], [[TMP46]]
-; CHECK-NEXT:    [[TMP47:%.*]] = icmp ne i64 [[TMP46]], [[TMP43]]
+; CHECK-NEXT:    [[TMP47:%.*]] = icmp ne i64 [[TMP46]], [[TMP42]]
 ; CHECK-NEXT:    [[TMP51:%.*]] = select i1 [[TMP47]], i64 [[TMP50]], i64 [[TMP45]]
 ; CHECK-NEXT:    [[TMP52:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP30]], i1 true)
 ; CHECK-NEXT:    [[TMP64:%.*]] = mul i64 [[TMP42]], 1
 ; CHECK-NEXT:    [[TMP56:%.*]] = add i64 [[TMP64]], [[TMP52]]
-; CHECK-NEXT:    [[TMP53:%.*]] = icmp ne i64 [[TMP52]], [[TMP43]]
+; CHECK-NEXT:    [[TMP53:%.*]] = icmp ne i64 [[TMP52]], [[TMP42]]
 ; CHECK-NEXT:    [[TMP57:%.*]] = select i1 [[TMP53]], i64 [[TMP56]], i64 [[TMP51]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP11]], i1 true)
 ; CHECK-NEXT:    [[TMP65:%.*]] = mul i64 [[TMP42]], 0
 ; CHECK-NEXT:    [[TMP60:%.*]] = add i64 [[TMP65]], [[TMP15]]
-; CHECK-NEXT:    [[TMP59:%.*]] = icmp ne i64 [[TMP15]], [[TMP43]]
+; CHECK-NEXT:    [[TMP59:%.*]] = icmp ne i64 [[TMP15]], [[TMP42]]
 ; CHECK-NEXT:    [[TMP61:%.*]] = select i1 [[TMP59]], i64 [[TMP60]], i64 [[TMP57]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX1]], [[TMP61]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = add i64 3, [[TMP16]]
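
For scalable vectors the same per-part check compares against the runtime
part width rather than a constant; the multiply removed above had scaled
that width by the element bit-width, so a lane count was being compared
against a bit count. A sketch of the check now emitted for
<vscale x 16 x i1> masks (value names illustrative):

  %vscale     = call i64 @llvm.vscale.i64()
  %part.width = mul nuw i64 %vscale, 16      ; runtime VF of one part
  %tz         = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> %mask, i1 true)
  %hit        = icmp ne i64 %tz, %part.width ; this part has an active lane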

>From e7a1d64f6730f8056857199b97b96f1de29af320 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 26 Jun 2025 14:43:03 +0100
Subject: [PATCH 4/5] !fixup updates after merge

---
 llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp |   2 +-
 .../AArch64/single-early-exit-interleave.ll   |  10 +
 .../single-early-exit-interleave.ll           | 178 ++++++++++++++++--
 ...or-loop-backedge-elimination-early-exit.ll |   2 +
 4 files changed, 177 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 42057eab83303..4d3c16a93176c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -349,7 +349,7 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
     VPValue *Op1;
     if (match(&R, m_VPInstruction<VPInstruction::AnyOf>(m_VPValue(Op1))) ||
         match(&R, m_VPInstruction<VPInstruction::FirstActiveLane>(
-                              m_VPValue(Op1))) || 
+                      m_VPValue(Op1))) ||
         match(&R, m_VPInstruction<VPInstruction::ComputeAnyOfResult>(
                       m_VPValue(), m_VPValue(), m_VPValue(Op1))) ||
         match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll
index 29b5a44d8aaf0..551fcd2567f0b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll
@@ -34,6 +34,16 @@ define i64 @same_exit_block_pre_inc_use1() #0 {
 ; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 16
 ; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP19]]
+; CHECK-NEXT:    [[TMP36:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP37:%.*]] = mul nuw i64 [[TMP36]], 32
+; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP37]]
+; CHECK-NEXT:    [[TMP39:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP40:%.*]] = mul nuw i64 [[TMP39]], 48
+; CHECK-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP40]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 16 x i8>, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP38]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP41]], align 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
 ; CHECK-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll
index 3b15c63a2f3e2..6b652a006aa44 100644
--- a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll
@@ -3,7 +3,6 @@
 
 declare void @init_mem(ptr, i64);
 
-; FIXME: Computes incorrect final IV when early exit is taken.
 define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
 ; VF4IC4-LABEL: define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
 ; VF4IC4-NEXT:  entry:
@@ -87,7 +86,6 @@ e2:
   ret i64 %p2
 }
 
-; FIXME: Computes incorrect final IV when early exit is taken.
 define i64 @same_exit_block_pre_inc_use1() {
 ; VF4IC4-LABEL: define i64 @same_exit_block_pre_inc_use1() {
 ; VF4IC4-NEXT:  entry:
@@ -103,6 +101,22 @@ define i64 @same_exit_block_pre_inc_use1() {
 ; VF4IC4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
 ; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
+; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
+; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
+; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
+; VF4IC4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
+; VF4IC4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 4
+; VF4IC4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 8
+; VF4IC4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 12
+; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP20]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP21]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP22]], align 1
 ; VF4IC4-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
 ; VF4IC4-NEXT:    [[TMP11:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD5]]
 ; VF4IC4-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
@@ -182,7 +196,6 @@ loop.end:
   ret i64 %retval
 }
 
-; FIXME: Computes incorrect final IV when early exit is taken.
 define ptr @same_exit_block_pre_inc_use1_ivptr() {
 ; VF4IC4-LABEL: define ptr @same_exit_block_pre_inc_use1_ivptr() {
 ; VF4IC4-NEXT:  entry:
@@ -197,6 +210,13 @@ define ptr @same_exit_block_pre_inc_use1_ivptr() {
 ; VF4IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF4IC4-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[P1]], i64 [[INDEX]]
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF4IC4-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 4
+; VF4IC4-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 8
+; VF4IC4-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 12
+; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP12]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP14]], align 1
 ; VF4IC4-NEXT:    [[TMP2:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], splat (i8 72)
 ; VF4IC4-NEXT:    [[TMP15:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD1]], splat (i8 72)
 ; VF4IC4-NEXT:    [[TMP16:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], splat (i8 72)
@@ -269,7 +289,6 @@ loop.end:
   ret ptr %retval
 }
 
-; FIXME: Computes incorrect final IV when early exit is taken.
 define i64 @same_exit_block_post_inc_use() {
 ; VF4IC4-LABEL: define i64 @same_exit_block_post_inc_use() {
 ; VF4IC4-NEXT:  entry:
@@ -285,6 +304,22 @@ define i64 @same_exit_block_post_inc_use() {
 ; VF4IC4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
 ; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
+; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
+; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
+; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
+; VF4IC4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
+; VF4IC4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 4
+; VF4IC4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 8
+; VF4IC4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 12
+; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP20]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP21]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP22]], align 1
 ; VF4IC4-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
 ; VF4IC4-NEXT:    [[TMP11:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD5]]
 ; VF4IC4-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
@@ -364,7 +399,6 @@ loop.end:
   ret i64 %retval
 }
 
-; FIXME: Computes incorrect final IV when early exit is taken.
 define i64 @diff_exit_block_pre_inc_use1() {
 ; VF4IC4-LABEL: define i64 @diff_exit_block_pre_inc_use1() {
 ; VF4IC4-NEXT:  entry:
@@ -380,6 +414,22 @@ define i64 @diff_exit_block_pre_inc_use1() {
 ; VF4IC4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
 ; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
+; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
+; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
+; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
+; VF4IC4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
+; VF4IC4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 4
+; VF4IC4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 8
+; VF4IC4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 12
+; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP20]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP21]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP22]], align 1
 ; VF4IC4-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
 ; VF4IC4-NEXT:    [[TMP11:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD5]]
 ; VF4IC4-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
@@ -466,7 +516,6 @@ loop.end:
   ret i64 %retval2
 }
 
-; FIXME: Computes incorrect final IV when early exit is taken.
 define i64 @diff_exit_block_post_inc_use1() {
 ; VF4IC4-LABEL: define i64 @diff_exit_block_post_inc_use1() {
 ; VF4IC4-NEXT:  entry:
@@ -482,6 +531,22 @@ define i64 @diff_exit_block_post_inc_use1() {
 ; VF4IC4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
 ; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
+; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
+; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
+; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
+; VF4IC4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
+; VF4IC4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 4
+; VF4IC4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 8
+; VF4IC4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 12
+; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP20]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP21]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP22]], align 1
 ; VF4IC4-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
 ; VF4IC4-NEXT:    [[TMP11:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD5]]
 ; VF4IC4-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
@@ -568,7 +633,6 @@ loop.end:
   ret i64 %retval2
 }
 
-; FIXME: Computes incorrect final IV when early exit is taken.
 define i64 @same_exit_block_pre_inc_use1_reverse() {
 ; VF4IC4-LABEL: define i64 @same_exit_block_pre_inc_use1_reverse() {
 ; VF4IC4-NEXT:  entry:
@@ -586,11 +650,12 @@ define i64 @same_exit_block_pre_inc_use1_reverse() {
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
 ; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 -3
 ; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -4
-; VF4IC4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -8
+; VF4IC4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 -3
+; VF4IC4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -8
 ; VF4IC4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 -3
-; VF4IC4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -12
+; VF4IC4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -12
 ; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 -3
-; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
 ; VF4IC4-NEXT:    [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; VF4IC4-NEXT:    [[WIDE_LOAD9:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1
 ; VF4IC4-NEXT:    [[REVERSE10:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD9]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -598,10 +663,27 @@ define i64 @same_exit_block_pre_inc_use1_reverse() {
 ; VF4IC4-NEXT:    [[REVERSE12:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD11]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; VF4IC4-NEXT:    [[WIDE_LOAD13:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
 ; VF4IC4-NEXT:    [[REVERSE14:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD13]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp ne <4 x i8> [[REVERSE]], [[REVERSE2]]
-; VF4IC4-NEXT:    [[TMP19:%.*]] = icmp ne <4 x i8> [[REVERSE3]], [[REVERSE10]]
-; VF4IC4-NEXT:    [[TMP20:%.*]] = icmp ne <4 x i8> [[REVERSE4]], [[REVERSE12]]
-; VF4IC4-NEXT:    [[TMP21:%.*]] = icmp ne <4 x i8> [[REVERSE6]], [[REVERSE14]]
+; VF4IC4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
+; VF4IC4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 0
+; VF4IC4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 -3
+; VF4IC4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 -4
+; VF4IC4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP28]], i32 -3
+; VF4IC4-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 -8
+; VF4IC4-NEXT:    [[TMP44:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 -3
+; VF4IC4-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 -12
+; VF4IC4-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[TMP45]], i32 -3
+; VF4IC4-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP27]], align 1
+; VF4IC4-NEXT:    [[REVERSE8:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD7]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD10:%.*]] = load <4 x i8>, ptr [[TMP29]], align 1
+; VF4IC4-NEXT:    [[REVERSE11:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD10]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x i8>, ptr [[TMP44]], align 1
+; VF4IC4-NEXT:    [[REVERSE13:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD12]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD14:%.*]] = load <4 x i8>, ptr [[TMP46]], align 1
+; VF4IC4-NEXT:    [[REVERSE15:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD14]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp ne <4 x i8> [[REVERSE2]], [[REVERSE8]]
+; VF4IC4-NEXT:    [[TMP19:%.*]] = icmp ne <4 x i8> [[REVERSE10]], [[REVERSE11]]
+; VF4IC4-NEXT:    [[TMP20:%.*]] = icmp ne <4 x i8> [[REVERSE12]], [[REVERSE13]]
+; VF4IC4-NEXT:    [[TMP21:%.*]] = icmp ne <4 x i8> [[REVERSE14]], [[REVERSE15]]
 ; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; VF4IC4-NEXT:    [[TMP22:%.*]] = or <4 x i1> [[TMP6]], [[TMP19]]
 ; VF4IC4-NEXT:    [[TMP23:%.*]] = or <4 x i1> [[TMP22]], [[TMP20]]
@@ -693,6 +775,30 @@ define i8 @same_exit_block_use_loaded_value() {
 ; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
 ; VF4IC4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
+; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
+; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
+; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; VF4IC4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
+; VF4IC4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 4
+; VF4IC4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 8
+; VF4IC4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 12
+; VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP29]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1
+; VF4IC4-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD4]]
+; VF4IC4-NEXT:    [[TMP11:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+; VF4IC4-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; VF4IC4-NEXT:    [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF4IC4-NEXT:    [[TMP14:%.*]] = or <4 x i1> [[TMP4]], [[TMP11]]
+; VF4IC4-NEXT:    [[TMP15:%.*]] = or <4 x i1> [[TMP14]], [[TMP12]]
+; VF4IC4-NEXT:    [[TMP16:%.*]] = or <4 x i1> [[TMP15]], [[TMP13]]
+; VF4IC4-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]])
 ; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; VF4IC4-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
 ; VF4IC4-NEXT:    br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
@@ -782,6 +888,50 @@ define i8 @same_exit_block_reverse_use_loaded_value() {
 ; VF4IC4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -4
 ; VF4IC4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 -3
 ; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -8
+; VF4IC4-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 -3
+; VF4IC4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -12
+; VF4IC4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 -3
+; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; VF4IC4-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1
+; VF4IC4-NEXT:    [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP37]], align 1
+; VF4IC4-NEXT:    [[REVERSE4:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD3]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1
+; VF4IC4-NEXT:    [[REVERSE6:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD5]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
+; VF4IC4-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; VF4IC4-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[TMP38]], i32 -3
+; VF4IC4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 -4
+; VF4IC4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 -3
+; VF4IC4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 -8
+; VF4IC4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 -3
+; VF4IC4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 -12
+; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 -3
+; VF4IC4-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP39]], align 1
+; VF4IC4-NEXT:    [[REVERSE8:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD7]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD9:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1
+; VF4IC4-NEXT:    [[REVERSE10:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD9]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
+; VF4IC4-NEXT:    [[REVERSE12:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD11]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD13:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; VF4IC4-NEXT:    [[REVERSE14:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD13]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp ne <4 x i8> [[REVERSE]], [[REVERSE8]]
+; VF4IC4-NEXT:    [[TMP19:%.*]] = icmp ne <4 x i8> [[REVERSE2]], [[REVERSE10]]
+; VF4IC4-NEXT:    [[TMP20:%.*]] = icmp ne <4 x i8> [[REVERSE4]], [[REVERSE12]]
+; VF4IC4-NEXT:    [[TMP21:%.*]] = icmp ne <4 x i8> [[REVERSE6]], [[REVERSE14]]
+; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF4IC4-NEXT:    [[TMP22:%.*]] = or <4 x i1> [[TMP6]], [[TMP19]]
+; VF4IC4-NEXT:    [[TMP23:%.*]] = or <4 x i1> [[TMP22]], [[TMP20]]
+; VF4IC4-NEXT:    [[TMP24:%.*]] = or <4 x i1> [[TMP23]], [[TMP21]]
+; VF4IC4-NEXT:    [[TMP25:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP24]])
+; VF4IC4-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1008
+; VF4IC4-NEXT:    [[TMP27:%.*]] = or i1 [[TMP25]], [[TMP26]]
+; VF4IC4-NEXT:    br i1 [[TMP27]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; VF4IC4:       middle.split:
+; VF4IC4-NEXT:    br i1 [[TMP25]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; VF4IC4:       middle.block:
+; VF4IC4-NEXT:    br i1 false, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; VF4IC4:       vector.early.exit:
 ; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE1:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP21]], i1 true)
 ; VF4IC4-NEXT:    [[TMP28:%.*]] = add i64 12, [[FIRST_ACTIVE_LANE1]]
diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll
index a9faed3499435..a782274c96123 100644
--- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll
@@ -198,6 +198,8 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer
 ; VF8UF2-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
 ; VF8UF2-NEXT:    [[TMP3:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
 ; VF8UF2-NEXT:    [[TMP6:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD1]], zeroinitializer
+; VF8UF2-NEXT:    [[TMP4:%.*]] = or <8 x i1> [[TMP3]], [[TMP6]]
+; VF8UF2-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP4]])
 ; VF8UF2-NEXT:    br label %[[MIDDLE_SPLIT:.*]]
 ; VF8UF2:       [[MIDDLE_SPLIT]]:
 ; VF8UF2-NEXT:    br i1 [[TMP7]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
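
The VPlanUnroll.cpp change above appends the per-part masks as extra
operands to the original FirstActiveLane recipe instead of cloning it per
part, the same scheme already used for AnyOf and the Compute*Result
opcodes. Roughly, in VPlan-printing style for UF=4 (recipe and operand
names assumed for illustration):

  before unrolling: EMIT vp<%fal> = first-active-lane ir<%cmp>
  after unrolling:  EMIT vp<%fal> = first-active-lane ir<%cmp>, vp<%cmp.1>, vp<%cmp.2>, vp<%cmp.3>

A single recipe thus sees every part and can produce the lane index across
the whole unrolled iteration, which is what the select chain in the
generated IR implements.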

>From 1bdc736ec2e7f3bd0c015d4634652baf8617373f Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 26 Jun 2025 14:50:15 +0100
Subject: [PATCH 5/5] !fixup fix formatting

---
 llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 4d3c16a93176c..4bc9e409069d1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -345,7 +345,8 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
     if (ToSkip.contains(&R) || isa<VPIRInstruction>(&R))
       continue;
 
-    // Add all VPValues for all parts to AnyOf, FirstActiveLaneMask and Compute*Result which combine all parts to compute the final value.
+    // Add all VPValues for all parts to AnyOf, FirstActiveLaneMask and
+    // Compute*Result which combine all parts to compute the final value.
     VPValue *Op1;
     if (match(&R, m_VPInstruction<VPInstruction::AnyOf>(m_VPValue(Op1))) ||
         match(&R, m_VPInstruction<VPInstruction::FirstActiveLane>(


